├── README.md
├── SCR2e-errata.html
├── SCR2e-errata.pdf
├── examples-R
│   ├── SCR2e-Intro.R
│   ├── SCR2e-Programming.R
│   ├── SCR2e-ResampAppl.R
│   ├── SCR2e-chBoot.R
│   ├── SCR2e-chDensity.R
│   ├── SCR2e-chMCMC.R
│   ├── SCR2e-chMCint.R
│   ├── SCR2e-chMonteCarlo.R
│   ├── SCR2e-chNumerical.R
│   ├── SCR2e-chOptim.R
│   ├── SCR2e-chPerm.R
│   ├── SCR2e-chRandomProcess.R
│   ├── SCR2e-chRgen.R
│   ├── SCR2e-chVis.R
│   ├── SCR2e-examples-functions.R
│   └── readme.md
├── examples-Rmd
│   ├── FOREARM.DAT
│   ├── Lahman.Rmd
│   ├── SCR2e-Intro.Rmd
│   ├── SCR2e-Programming.Rmd
│   ├── SCR2e-ResampAppl.Rmd
│   ├── SCR2e-chBoot.Rmd
│   ├── SCR2e-chDensity.Rmd
│   ├── SCR2e-chMCMC.Rmd
│   ├── SCR2e-chMCint.Rmd
│   ├── SCR2e-chMonteCarlo.Rmd
│   ├── SCR2e-chNumerical.Rmd
│   ├── SCR2e-chOptim.Rmd
│   ├── SCR2e-chPerm.Rmd
│   ├── SCR2e-chRandomProcess.Rmd
│   ├── SCR2e-chRgen.Rmd
│   ├── SCR2e-chVis.Rmd
│   ├── SCR2e-examples-functions.R
│   └── readme.md
└── examples-cpp
    ├── printme.cpp
    └── readme.md

/README.md:
--------------------------------------------------------------------------------
 1 | # SCR2e
 2 | Resources and files for "Statistical Computing with R, 2nd edition"
 3 | by Maria L. Rizzo, Chapman & Hall/CRC (2019)
 4 | 
 5 | URL: https://www.crcpress.com/Statistical-Computing-with-R-Second-Edition/Rizzo/p/book/9781466553323
 6 | 
 7 | Bibtex entry:
 8 | 
 9 | @book{SCR2,
10 |   title = "Statistical Computing with {R}",
11 |   edition = "second",
12 |   author = "Maria L. Rizzo",
13 |   year = "2019",
14 |   publisher = "Chapman & Hall/CRC",
15 |   series = "The {R} Series",
16 |   isbn = "9781466553323",
17 | }

/SCR2e-errata.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mariarizzo/SCR2e/5c72bc153c6b931f88f5703e3c088bd0b263eeca/SCR2e-errata.pdf

/examples-R/SCR2e-Intro.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | ### Statistical Computing with R 2e ###
3 | ### Maria L.
Rizzo ### 4 | ### Chapman & Hall/CRC The R Series ### 5 | ### ISBN 9781466553323 - CAT# K15269 ### 6 | ### January 2019 ### 7 | ### ### 8 | ### R code for Chapter 1 ### 9 | ### Introduction ### 10 | ####################################################### 11 | 12 | 13 | 14 | ### Example 1.1 15 | 16 | sumdice <- function(n) { 17 | k <- sample(1:6, size=n, replace=TRUE) 18 | return(sum(k)) 19 | } 20 | 21 | sumdice(2) 22 | 23 | #to store the result rather than print it 24 | a <- sumdice(100) 25 | 26 | #we expect the mean for 100 dice to be close to 3.5 27 | a / 100 28 | 29 | sumdice <- function(n) 30 | sum(sample(1:6, size=n, replace=TRUE)) 31 | 32 | sumdice <- function(n, sides = 6) { 33 | if (sides < 1) return (0) 34 | k <- sample(1:sides, size=n, replace=TRUE) 35 | return(sum(k)) 36 | } 37 | 38 | sumdice(5) #default 6 sides 39 | sumdice(n=5, sides=4) #4 sides 40 | 41 | ### Example 1.2 (iris data) 42 | 43 | names(iris) 44 | table(iris$Species) 45 | w <- iris[[2]] #Sepal.Width 46 | mean(w) 47 | 48 | attach(iris) 49 | summary(Petal.Length[51:100]) #versicolor petal length 50 | 51 | with(iris, summary(Petal.Length[51:100])) 52 | 53 | out <- with(iris, summary(Petal.Length[51:100])) 54 | 55 | by(iris[,1:4], Species, colMeans) 56 | detach(iris) 57 | 58 | ### Example 1.3 (Arrays) 59 | 60 | x <- 1:24 # vector 61 | dim(x) <- length(x) # 1 dimensional array 62 | matrix(1:24, nrow=4, ncol=6) # 4 by 6 matrix 63 | x <- array(1:24, c(3, 4, 2)) # 3 by 4 by 2 array 64 | 65 | ### Example 1.4 (Matrices) 66 | 67 | A <- matrix(0, nrow=2, ncol=2) 68 | A <- matrix(c(0, 0, 0, 0), nrow=2, ncol=2) 69 | A <- matrix(0, 2, 2) 70 | 71 | A <- matrix(1:8, nrow=2, ncol=4) 72 | 73 | ### Example 1.5 (Iris data cont.) 74 | 75 | x <- as.matrix(iris[,1:4]) #all rows of columns 1 to 4 76 | 77 | mean(x[,2]) #mean of sepal width, all species 78 | mean(x[51:100,3]) #mean of petal length, versicolor 79 | 80 | y <- array(x, dim=c(50, 3, 4)) 81 | mean(y[,,2]) #mean of sepal width, all species 82 | mean(y[,2,3]) #mean of petal length, versicolor 83 | 84 | y <- array(c(x[1:50,], x[51:100,], x[101:150,]), 85 | dim=c(50, 4, 3)) 86 | mean(y[,2,]) #mean of sepal width, all species 87 | mean(y[,3,2]) #mean of petal length, versicolor 88 | 89 | ### Example 1.6 (Run length encoding) 90 | 91 | 92 | n <- 1000 93 | x <- rbinom(n, size = 1, prob = .5) 94 | table(x) 95 | head(x, 30) 96 | 97 | r <- rle(x) 98 | str(r) 99 | 100 | head(r$lengths) 101 | head(r[[1]]) 102 | 103 | max(r$lengths) 104 | log2(length(x)) 105 | 106 | ### Example 1.7 (Named list) 107 | 108 | w <- wilcox.test(rnorm(10), rnorm(10, 2)) 109 | w #print the summary 110 | 111 | w$statistic #stored in object w 112 | w$p.value 113 | unlist(w) 114 | unclass(w) 115 | 116 | ### Example 1.8 (A list of names) 117 | 118 | a <- matrix(runif(8), 4, 2) #a 4x2 matrix 119 | dimnames(a) <- list(NULL, c("x", "y")) 120 | 121 | # if we want row names 122 | dimnames(a) <- list(letters[1:4], c("x", "y")) 123 | a 124 | 125 | # another way to assign row names 126 | row.names(a) <- list("NE", "NW", "SW", "SE") 127 | a 128 | 129 | ### Example 1.9 (Parallel boxplots) 130 | 131 | boxplot(iris$Sepal.Length ~ iris$Species) 132 | 133 | boxplot(iris$Sepal.Length ~ iris$Species, 134 | ylab = "Sepal Length", boxwex = .4) 135 | ### Example 1.10 (Plotting characters and colors) 136 | 137 | plot(0:25, rep(1, 26), pch = 0:25) 138 | text(0:25, 0.9, 0:25) 139 | 140 | 141 | ### Example 1.11 (Barplot for run lengths) 142 | 143 | barplot(table(r$lengths)) #R graphics version 144 | 145 | ## ggplot version 146 | 
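# bar plot of the run length distribution; r is the rle object
# computed in Example 1.6 above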
library(ggplot2) 147 | df <- data.frame(lengths = factor(r$lengths)) 148 | ggplot(df, aes(lengths)) + geom_bar() 149 | 150 | ### Example 1.12 (Scatterplots) 151 | 152 | ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) + 153 | geom_point() 154 | 155 | ggplot(iris, aes(Sepal.Length, Sepal.Width, 156 | color = Species, shape = Species)) + geom_point(size = 2) 157 | 158 | ### Example 1.13 (ggplot: parallel boxplots and violin plots) 159 | 160 | ggplot(iris, aes(Species, Sepal.Length)) + geom_boxplot() 161 | ggplot(iris, aes(Species, Sepal.Length)) + geom_violin() 162 | 163 | ggplot(iris, aes(Species, Sepal.Length)) + 164 | geom_boxplot() + coord_flip() 165 | ggplot(iris, aes(Species, Sepal.Length)) + 166 | geom_violin() + coord_flip() 167 | 168 | ### Example 1.14 (MPG by engine displacement) 169 | 170 | ggplot(mpg, aes(displ, hwy)) + 171 | geom_point() + 172 | facet_wrap(~ class) 173 | 174 | ### Example 1.15 (Import data from a local text file) 175 | 176 | # copy "FOREARM.DAT" into working directory 177 | forearm <- scan(file = "FOREARM.DAT") #a vector 178 | 179 | # use pathname if file is not in working directory 180 | # forearm <- scan(file = "./DATASETS/FOREARM.DAT") #a vector 181 | 182 | head(forearm) 183 | 184 | ### Example 1.16 (Importing data from a web page) 185 | 186 | fileloc <- "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data" 187 | 188 | df <- read.table(file = fileloc, na.strings = "?", as.is = TRUE) 189 | str(df) 190 | names(df) <- c("mpg", "cyl", "displ", "hp", "wt", "accel", 191 | "year", "origin", "name") 192 | summary(df) 193 | 194 | ### Example 1.17 (Importing/exporting .csv files) 195 | 196 | #create a data frame 197 | dates <- c("3/27/1995", "4/3/1995", 198 | "4/10/1995", "4/18/1995") 199 | prices <- c(11.1, 7.9, 1.9, 7.3) 200 | d <- data.frame(dates=dates, prices=prices) 201 | 202 | #create the .csv file 203 | filename <- "temp.csv" 204 | write.table(d, file = filename, sep = ",", 205 | row.names = FALSE) 206 | 207 | #read the .csv file 208 | read.table(file = filename, sep = ",", header = TRUE) 209 | read.csv(file = filename) #same thing 210 | 211 | -------------------------------------------------------------------------------- /examples-R/SCR2e-Programming.R: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | ### Statistical Computing with R 2e ### 3 | ### Maria L. 
Rizzo ### 4 | ### Chapman & Hall/CRC The R Series ### 5 | ### ISBN 9781466553323 - CAT# K15269 ### 6 | ### January 2019 ### 7 | ### ### 8 | ### R code for Chapter 15 ### 9 | ### Programming Topics ### 10 | ####################################################### 11 | 12 | # packages to install: 13 | # energy, ggplot2, profvis, pryr, microbenchmark, mvtnorm, rbenchmark 14 | # for the last section: dplyr, Lahman, Rcpp 15 | 16 | ### Example 15.1 (Benchmarking methods to generate a sequence) 17 | 18 | s1 <- 1:10 19 | s2 <- seq(1, 10, 1) 20 | s3 <- seq.int(1, 10, 1) 21 | df <- data.frame(s1=s1, s2=s2, s3=s3) 22 | str(df) 23 | 24 | library(microbenchmark) 25 | library(ggplot2) 26 | 27 | n <- 1000 28 | mb <- microbenchmark( 29 | seq(1, n, 1), 30 | seq.int(1, n, 1), 31 | 1:n 32 | ) 33 | 34 | print(mb) 35 | autoplot(mb) # display a violin plot 36 | 37 | ### Example 15.2 (Benchmarking methods to initialize a vector) 38 | 39 | n <- 100 40 | mb2 <- microbenchmark( 41 | numeric = numeric(n) + 1, 42 | rep = rep(1, n), 43 | seq = seq(from=1, to=1, length=n), 44 | ones = matrix(1, nrow=n, ncol=1), 45 | as.ones = as.matrix(rep(1, n)) 46 | ) 47 | 48 | print(mb2) 49 | 50 | ### Example 15.3 (Timings of two multivariate normal generators) 51 | 52 | library(rbenchmark) 53 | library(MASS) 54 | library(mvtnorm) 55 | n <- 100 #sample size 56 | d <- 30 #dimension 57 | N <- 2000 #iterations 58 | mu <- numeric(d) 59 | 60 | benchmark( 61 | columns = c("test", "replications", "elapsed", "relative"), 62 | replications = 2000, 63 | cov = {S <- cov(matrix(rnorm(n*d), n, d))}, 64 | mvrnorm = mvrnorm(n, mu, S), 65 | rmvnorm = rmvnorm(n, mu, S) 66 | ) 67 | 68 | ### Example 15.4 (Profiling with Rprof) 69 | 70 | x <- rnorm(1000) 71 | y <- rnorm(1000) 72 | 73 | Rprof("pr.out", line.profiling = TRUE) 74 | energy::dcor(x, y) 75 | Rprof(NULL) 76 | summaryRprof("pr.out") 77 | 78 | ### Example 15.5 (profvis interactive visualization) 79 | 80 | library(profvis) 81 | profvis(energy::dcor(x, y)) 82 | 83 | ### Example 15.6 (Object size) 84 | 85 | x <- matrix(rnorm(5000), 1000, 5) #1000 obs in R^5 86 | DF <- as.data.frame(x) 87 | object.size(x) 88 | object.size(DF) 89 | pryr::object_size(x) 90 | pryr::object_size(DF) 91 | pryr::compare_size(x) 92 | 93 | listTwo <- list(x, x) 94 | pryr::compare_size(listTwo) 95 | 96 | ### Example 15.7 (Comparing objects and attributes) 97 | 98 | str(x) 99 | str(DF) 100 | all.equal(x, DF) 101 | names(attributes(x)) 102 | names(attributes(DF)) 103 | all.equal(x, DF, check.attributes = FALSE) 104 | 105 | ### Example 15.8 (Comparing objects for equality) 106 | 107 | try(ifelse(all.equal(x, DF), "T", "F")) # error 108 | ifelse(isTRUE(all.equal(x, DF)), "T", "F") # correct 109 | 110 | x <- 1 - 10e-4 111 | y <- x + 2 112 | x == (y - 2) # equal mathematically but 113 | isTRUE(all.equal(x, y - 2)) #gives expected result 114 | 115 | ## does not necessarily evaluate to TRUE or FALSE 116 | try(ifelse(all.equal(x, y), "T", "F")) 117 | ## returns TRUE or FALSE 118 | ifelse(isTRUE(all.equal(x, y)), "T", "F") 119 | 120 | ### Example 15.9 (Display R function code) 121 | 122 | nclass.scott 123 | 124 | ### Example 15.10 (RSiteSearch) 125 | 126 | if (interactive()) 127 | RSiteSearch("ggcorr") 128 | 129 | ### Example 15.11 (UseMethod) 130 | 131 | body(density) 132 | args(density.default) 133 | body(density.default) 134 | 135 | ### Example 15.12 (Show methods) 136 | 137 | methods(t.test) 138 | getAnywhere(t.test.formula) 139 | body(stats:::t.test.formula) 140 | 141 | ### Example 15.13 (Object not found or not an exported 
object) 142 | 143 | try(perc.ci) 144 | try(boot::perc.ci) 145 | 146 | getAnywhere(perc.ci) 147 | args(boot:::perc.ci) 148 | body(boot:::perc.ci) 149 | boot:::perc.ci 150 | getFromNamespace("perc.ci", "boot") 151 | 152 | ### Example 15.14 (getS3method) 153 | 154 | library(microbenchmark) 155 | library(ggplot2) 156 | getAnywhere(autoplot) 157 | getS3method("autoplot", class = "microbenchmark") 158 | getAnywhere(autoplot.microbenchmark) 159 | 160 | ### Example 15.15 (.Primitive or .Internal) 161 | 162 | if (interactive()) 163 | pryr::show_c_source(.Primitive(cumsum(x))) 164 | 165 | ### Example 15.16 (.Call, .External, .C or .Fortran) 166 | 167 | #dist is implemented by a .Call to C_Cdist 168 | 169 | body(dist) 170 | 171 | ### Example 15.17 (A first Rcpp experiment) 172 | 173 | # In RStudio use the menu 174 | # File > New File > C++ file to display this code 175 | 176 | ### Example 15.18 (cppFunction) 177 | 178 | library(Rcpp) 179 | 180 | set.seed(1) 181 | x <- matrix(rnorm(20), nrow = 5, ncol = 4) 182 | 183 | cppFunction('double vecnorm(NumericVector v) { 184 | // compute the Euclidean norm of vector v 185 | int d = v.size(); 186 | double s = 0.0; 187 | for (int i = 0; i < d; i++) 188 | s += v(i) * v(i); 189 | return sqrt(s); 190 | }') 191 | 192 | print(vecnorm(x[, 1])) 193 | print(apply(x, MARGIN = 2, FUN = "vecnorm")) 194 | 195 | 196 | ### Example 15.19 (sourceCpp) 197 | 198 | # Create the C++ source file "printme.cpp" by editing the template 199 | # in RStudio menu: File > New File > C++ 200 | 201 | library(Rcpp) 202 | sourceCpp("..//examples-cpp//printme.cpp") 203 | x <- sample(1:5) 204 | print_me(x) 205 | 206 | ### Example 15.20 (Lahman baseball data) 207 | 208 | library(Lahman) 209 | str(Batting) 210 | # method 1 211 | S <- subset(Batting, Batting$yearID == 1999, 212 | select = c("playerID", "AB", "H")) 213 | 214 | # method 2 (dplyr) 215 | library(dplyr) 216 | Batting %>% filter(yearID == 1999) -> b 217 | 218 | # method 1 219 | AB <- as.vector(by(S$AB, S$playerID, FUN = sum)) 220 | H <- as.vector(by(S$H, S$playerID, FUN = sum)) 221 | S <- data.frame(playerID = unique(S$playerID), 222 | AB = AB, H = H, AVG = round(H / AB, 3), 223 | stringsAsFactors = FALSE) 224 | S400 <- S[S$AB >= 400, ] 225 | 226 | # method 2 (dplyr) 227 | b %>% group_by(playerID) %>% 228 | summarize(AB = sum(AB), H = sum(H)) -> S 229 | S %>% mutate(AVG = round(H / AB, 3)) -> S 230 | S %>% filter(AB >= 400) -> S400 231 | 232 | # method 1 233 | o <- order(S400$AVG, decreasing = TRUE) 234 | S400 <- S400[o, ] 235 | top <- S400[1:10, ] 236 | 237 | # method 2 (dplyr) 238 | S400 %>% arrange(desc(AVG)) -> S400 239 | slice(S400, 1:10) -> top 240 | top 241 | 242 | People %>% select(playerID, nameFirst, nameLast) -> m 243 | top %>% inner_join(m) %>% 244 | select(nameFirst, nameLast, AVG) 245 | 246 | ### Example 15.21 (Comparison with microbenchmark) 247 | 248 | # The source for this example is in R Markdown format 249 | # Open "Lahman.Rmd" in RStudio and knit to see report 250 | -------------------------------------------------------------------------------- /examples-R/SCR2e-ResampAppl.R: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | ### Statistical Computing with R 2e ### 3 | ### Maria L. 
Rizzo ### 4 | ### Chapman & Hall/CRC The R Series ### 5 | ### ISBN 9781466553323 - CAT# K15269 ### 6 | ### January 2019 ### 7 | ### ### 8 | ### R code for Chapter 9 ### 9 | ### Resampling Applications ### 10 | ####################################################### 11 | 12 | 13 | # packages to install: bootstrap, DAAG, ggplot2 14 | 15 | 16 | ### Example 9.1 (Jackknife-after-bootstrap) 17 | 18 | library(boot) 19 | library(bootstrap) 20 | set.seed(1111) 21 | 22 | theta.boot <- function(patch, i) { 23 | # function to compute the patch ratio statistic 24 | y <- patch[i, "y"] 25 | z <- patch[i, "z"] 26 | mean(y) / mean(z) 27 | } 28 | 29 | boot.out <- boot(bootstrap::patch, 30 | statistic = theta.boot, R=2000) 31 | A <- boot.array(boot.out) 32 | head(A, 3) 33 | mean(A[, 1] == 0) 34 | 35 | # jackknife-after-bootstrap to est. se(se) 36 | A <- boot.array(boot.out) 37 | theta.b <- boot.out$t 38 | n <- NROW(patch) 39 | jack.se <- numeric(n) 40 | 41 | for (i in 1:n) { 42 | #in i-th replicate omit all samples with x[i] 43 | keep <- which(A[, i] == 0) 44 | jack.se[i] <- sd(theta.b[keep]) 45 | } 46 | 47 | print(boot.out) #for se_boot 48 | se.bar <- mean(jack.se) 49 | se.se <- sqrt((n-1) * mean((jack.se - se.bar)^2)) 50 | print(paste("Jackknife-after-bootstrap est. se(se)=", se.se)) 51 | 52 | 53 | ### Example 9.2 (Jackknife-after-bootstrap) 54 | 55 | # initialize 56 | data(patch, package = "bootstrap") 57 | y <- patch$y 58 | z <- patch$z 59 | dat <- cbind(y, z) 60 | n <- NROW(dat) 61 | B <- 2000 62 | 63 | # jackknife-after-bootstrap step 1: run the bootstrap 64 | theta_boot <- function(dat, ind) { 65 | # function to compute the statistic 66 | y <- dat[ind, 1] 67 | z <- dat[ind, 2] 68 | mean(y) / mean(z) 69 | } 70 | 71 | boot.obj <- boot(dat, statistic = theta_boot, R=2000) 72 | theta.hat <- boot.obj$t0 73 | theta.b <- boot.obj$t 74 | se.boot <- sd(theta.b) 75 | 76 | # jackknife-after-bootstrap to est. 
se(se) 77 | sample.freq <- boot.array(boot.obj) 78 | se.se.reps <- numeric(n) 79 | N <- 1:n 80 | 81 | for (i in N) { 82 | # jackknife-after-bootstrap 83 | # omit all bootstrap samples that contain obs i 84 | keep <- which(sample.freq[ ,i] == 0) 85 | se.se.reps[i] <- sd(theta.b[keep]) 86 | } 87 | 88 | print(boot.obj) 89 | se.bar <- mean(se.se.reps) 90 | se.se <- sqrt((n-1) * mean((se.se.reps - se.bar)^2)) 91 | se.se 92 | 93 | ### Example 9.3 (ironslag linear model) 94 | 95 | library(ggplot2) 96 | library(DAAG) 97 | L1 <- lm(magnetic ~ chemical, data=ironslag) 98 | cf3 <- round(L1$coeff, 3) 99 | cap <- paste("Fit: magnetic =", cf3[1], "+", cf3[2], "chemical") 100 | 101 | ggplot(data=ironslag, aes(chemical, magnetic)) + 102 | geom_point() + geom_smooth(method="lm") + 103 | ggtitle(cap) 104 | 105 | plot(L1, which=1:2, ask=FALSE) #residual plots 106 | 107 | ### Example 9.4 (mammals data) 108 | 109 | library(MASS) 110 | cor(log(mammals$body), log(mammals$brain)) 111 | summary(mammals) 112 | 113 | y <- log(mammals$brain) 114 | x <- log(mammals$body) 115 | L <- lm(y ~ x) 116 | L 117 | 118 | cap <- paste("Fit: log(brain) =", round(L$coeff[1],3), 119 | "+", round(L$coeff[2],3), "log(body)") 120 | ggplot(data=mammals, aes(x, y)) + 121 | geom_point() + geom_smooth(method="lm") + 122 | labs(x = "log(body)", y = "log(brain)", title = cap) 123 | 124 | summary(L)$r.squared 125 | 126 | ### Example 9.5 (ironslag data, resampling cases) 127 | 128 | x <- ironslag$chemical 129 | y <- ironslag$magnetic 130 | m <- 2000 131 | n <- NROW(x) 132 | L1 <- lm(y ~ x) #estimate the model 133 | b0 <- L1$coeff[1]; b1 <- L1$coeff[2] 134 | 135 | ## run bootstrap of cases 136 | out <- replicate(m, expr={ 137 | i <- sample(1:n, replace=TRUE, size=n) 138 | xstar <- x[i] 139 | ystar <- y[i] 140 | Lb <- lm(ystar ~ xstar) 141 | s <- summary(Lb)$sigma 142 | c(Lb$coeff[1], slope=Lb$coeff[2], s=s) 143 | }) 144 | 145 | bootCases <- t(out) 146 | meanCases <- colMeans(bootCases) 147 | sdCases <- apply(bootCases, 2, "sd") 148 | meanCases 149 | sdCases 150 | 151 | biasInt <- mean(bootCases[,1] - b0) #bias for intercept 152 | biasSlope <- mean(bootCases[,2] - b1) #bias for slope 153 | 154 | rbind(estimate=c(b0, b1), bias=c(biasInt, biasSlope), 155 | se=sdCases[1:2], cv=c(biasInt, cv=biasSlope)/sdCases[1:2]) 156 | 157 | ### Example 9.6 (Resampling cases using the boot function) 158 | 159 | # set.seed(1104) 160 | library(boot) 161 | m <- 2000 162 | stats <- function(dat, i) { 163 | x <- dat$chemical[i] 164 | y <- dat$magnetic[i] 165 | Lb <- lm(y ~ x) 166 | s <- summary(Lb)$sigma 167 | c(Lb$coeff[1], slope=Lb$coeff[2], s=s) 168 | } 169 | 170 | boot.out <- boot(ironslag, statistic=stats, R=2000) 171 | boot.out 172 | boot.out$t0 173 | 174 | sd(boot.out$t[,2]) 175 | boottbl <- broom::tidy(boot.out) 176 | boottbl$std.error[2] 177 | 178 | MASS::truehist(boot.out$t[ ,2], main="", xlab="slopes") 179 | abline(v = boot.out$t0[2], lwd=2) 180 | 181 | boot.ci(boot.out, index=2, type=c("norm","perc","basic","bca")) 182 | 183 | ### Example 9.7 (Resampling errors: mammals data) 184 | 185 | x <- log(mammals$brain) 186 | y <- log(mammals$body) 187 | L <- lm(y ~ x) 188 | 189 | m.resid <- rstandard(L, sd = 1) 190 | r <- m.resid - mean(m.resid) 191 | m <- 1000; n <- NROW(x) 192 | estsErr <- replicate(m, expr={ 193 | estar <- sample(r, replace=TRUE, size=n) 194 | ystar <- L$fitted.values + estar 195 | Lb <- lm(ystar ~ x) 196 | s <- summary(Lb)$sigma 197 | c(b0=Lb$coeff[1], b1=Lb$coeff[2], s=s) 198 | }) 199 | ests <- t(estsErr) 200 | summary(ests) 201 | 202 | ### 
Example 9.8 (Resampling errors, continued) 203 | sd(ests[,2]) 204 | s <- summary(L)$sigma 205 | SSx <- (n - 1) * var(x) 206 | se.beta1 <- sqrt(s^2 / SSx) 207 | se.beta1 208 | s * sqrt(1/n + mean(x)^2 / SSx) 209 | sd(ests[,1]) 210 | 211 | betas <- summary(L)$coeff 212 | betas 213 | betas[, "Std. Error"] 214 | 215 | broom::tidy(summary(L)) 216 | broom::tidy(summary(L))$std.error 217 | 218 | ### Example 9.9 (Model based resampling with the boot function) 219 | regstats <- function(dat, i) { 220 | #dat is a data frame (r, x, yhat) 221 | #r are the modified centered residuals, yhat are the fits 222 | ystar <- dat$yhat + dat$r[i] 223 | xstar <- dat$x 224 | Lnew <- lm(ystar ~ xstar) 225 | Lnew$coefficients 226 | } 227 | 228 | y <- log(mammals$brain) 229 | x <- log(mammals$body) 230 | L <- lm(y ~ x) 231 | r <- rstandard(L, sd=1) 232 | r <- r - mean(r) 233 | df <- data.frame(r=r, x=x, yhat=L$fitted) 234 | head(df) 235 | boot.obj <- boot(data=df, statistic=regstats, R=2000) 236 | broom::tidy(boot.obj) 237 | 238 | 239 | ### Example 9.10 (Empirical influence values for the patch ratio statistic) 240 | 241 | library(boot) 242 | library(bootstrap) 243 | theta_boot <- function(dat, ind) { 244 | # function to compute the patch ratio statistic 245 | mean(dat[ind, ]$y) / mean(dat[ind, ]$z) 246 | } 247 | boot.out <- boot(patch, theta_boot, R = 2000) 248 | infl <- empinf(boot.out, type = "jack") 249 | theta.hat <- boot.out$t0 250 | jack <- theta.hat - infl / (nrow(patch) - 1) 251 | rbind(infl, jack) 252 | 253 | ### Example 9.11 (Jackknife-after-bootstrap plot) 254 | 255 | jack.after.boot(boot.out, useJ=TRUE, stinf=FALSE) 256 | 257 | n <- NROW(patch) 258 | J <- numeric(n) 259 | b.freq <- boot.array(boot.out) 260 | theta.b <- boot.out$t 261 | 262 | for (i in 1:n) { 263 | keep <- which(b.freq[ ,i] == 0) 264 | J[i] <- mean(theta.b[keep]) 265 | } 266 | 267 | # the jackknife influence values 268 | (n - 1) * (mean(J) - J) 269 | 270 | jack.after.boot(boot.out, useJ=TRUE, stinf=TRUE) 271 | 272 | -------------------------------------------------------------------------------- /examples-R/SCR2e-chBoot.R: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | ### Statistical Computing with R 2e ### 3 | ### Maria L. 
Rizzo ### 4 | ### Chapman & Hall/CRC The R Series ### 5 | ### ISBN 9781466553323 - CAT# K15269 ### 6 | ### March 6, 2019 ### 7 | ### ### 8 | ### R code for Chapter 8 ### 9 | ### Bootstrap and Jackknife ### 10 | ####################################################### 11 | 12 | 13 | # packages to install: bootstrap, DAAG 14 | 15 | 16 | ### Example 8.2 (Bootstrap estimate of standard error) 17 | 18 | library(bootstrap) #for the law data 19 | print(cor(law$LSAT, law$GPA)) 20 | print(cor(law82$LSAT, law82$GPA)) 21 | 22 | #set up the bootstrap 23 | B <- 200 #number of replicates 24 | n <- nrow(law) #sample size 25 | R <- numeric(B) #storage for replicates 26 | 27 | #bootstrap estimate of standard error of R 28 | for (b in 1:B) { 29 | #randomly select the indices 30 | i <- sample(1:n, size = n, replace = TRUE) 31 | LSAT <- law$LSAT[i] #i is a vector of indices 32 | GPA <- law$GPA[i] 33 | R[b] <- cor(LSAT, GPA) 34 | } 35 | #output 36 | print(se.R <- sd(R)) 37 | hist(R, prob = TRUE) 38 | 39 | 40 | ### Example 8.3 (Bootstrap estimate of standard error: boot function) 41 | 42 | r <- function(x, i) { 43 | #want correlation of columns 1 and 2 44 | cor(x[i,1], x[i,2]) 45 | } 46 | 47 | library(boot) #for boot function 48 | obj <- boot(data = law, statistic = r, R = 2000) 49 | obj 50 | y <- obj$t 51 | sd(y) 52 | 53 | 54 | ### Example 8.4 (Bootstrap estimate of bias) 55 | 56 | #sample estimate for n=15 57 | theta.hat <- cor(law$LSAT, law$GPA) 58 | 59 | #bootstrap estimate of bias 60 | B <- 2000 #larger for estimating bias 61 | n <- nrow(law) 62 | theta.b <- numeric(B) 63 | 64 | for (b in 1:B) { 65 | i <- sample(1:n, size = n, replace = TRUE) 66 | LSAT <- law$LSAT[i] 67 | GPA <- law$GPA[i] 68 | theta.b[b] <- cor(LSAT, GPA) 69 | } 70 | bias <- mean(theta.b - theta.hat) 71 | bias 72 | 73 | 74 | ### Example 8.5 (Bootstrap estimate of bias of a ratio estimate) 75 | 76 | 77 | data(patch, package = "bootstrap") 78 | patch 79 | 80 | n <- nrow(patch) #in bootstrap package 81 | B <- 2000 82 | theta.b <- numeric(B) 83 | theta.hat <- mean(patch$y) / mean(patch$z) 84 | 85 | #bootstrap 86 | for (b in 1:B) { 87 | i <- sample(1:n, size = n, replace = TRUE) 88 | y <- patch$y[i] 89 | z <- patch$z[i] 90 | theta.b[b] <- mean(y) / mean(z) 91 | } 92 | bias <- mean(theta.b) - theta.hat 93 | se <- sd(theta.b) 94 | print(list(est=theta.hat, bias = bias, 95 | se = se, cv = bias/se)) 96 | 97 | 98 | ### Example 8.6 (Jackknife estimate of bias) 99 | 100 | data(patch, package = "bootstrap") 101 | n <- nrow(patch) 102 | y <- patch$y 103 | z <- patch$z 104 | theta.hat <- mean(y) / mean(z) 105 | print (theta.hat) 106 | 107 | #compute the jackknife replicates, leave-one-out estimates 108 | theta.jack <- numeric(n) 109 | for (i in 1:n) 110 | theta.jack[i] <- mean(y[-i]) / mean(z[-i]) 111 | bias <- (n - 1) * (mean(theta.jack) - theta.hat) 112 | 113 | print(bias) #jackknife estimate of bias 114 | 115 | 116 | ### Example 8.7 (Jackknife estimate of standard error) 117 | 118 | se <- sqrt((n-1) * 119 | mean((theta.jack - mean(theta.jack))^2)) 120 | print(se) 121 | 122 | 123 | ### Example 8.8 (Failure of jackknife) 124 | 125 | set.seed(123) #for the specific example given 126 | #change the seed to see other examples 127 | 128 | n <- 10 129 | x <- sample(1:100, size = n) 130 | 131 | #jackknife estimate of se 132 | M <- numeric(n) 133 | for (i in 1:n) { #leave one out 134 | y <- x[-i] 135 | M[i] <- median(y) 136 | } 137 | Mbar <- mean(M) 138 | print(sqrt((n-1)/n * sum((M - Mbar)^2))) 139 | 140 | #bootstrap estimate of se 141 | Mb <- replicate(1000, 
expr = { 142 | y <- sample(x, size = n, replace = TRUE) 143 | median(y) }) 144 | print(sd(Mb)) 145 | print(x) 146 | print(M) 147 | print(Mb) 148 | 149 | 150 | ### Example 8.9 (Bootstrap confidence intervals for patch ratio statistic) 151 | 152 | library(boot) #for boot and boot.ci 153 | data(patch, package = "bootstrap") 154 | 155 | theta.boot <- function(dat, ind) { 156 | #function to compute the statistic 157 | y <- dat[ind, 1] 158 | z <- dat[ind, 2] 159 | mean(y) / mean(z) 160 | } 161 | 162 | y <- patch$y 163 | z <- patch$z 164 | dat <- cbind(y, z) 165 | boot.obj <- boot(dat, statistic = theta.boot, R = 2000) 166 | 167 | print(boot.obj) 168 | print(boot.ci(boot.obj, 169 | type = c("basic", "norm", "perc"))) 170 | 171 | 172 | #calculations for bootstrap confidence intervals 173 | alpha <- c(.025, .975) 174 | 175 | #normal 176 | print(boot.obj$t0 + qnorm(alpha) * sd(boot.obj$t)) 177 | 178 | #basic 179 | print(2*boot.obj$t0 - 180 | quantile(boot.obj$t, rev(alpha), type=1)) 181 | 182 | #percentile 183 | print(quantile(boot.obj$t, alpha, type=6)) 184 | 185 | 186 | ### Example 8.10 (Bootstrap confidence intervals for the correlation statistic) 187 | 188 | library(boot) 189 | data(law, package = "bootstrap") 190 | boot.obj <- boot(law, R = 2000, 191 | statistic = function(x, i){cor(x[i,1], x[i,2])}) 192 | print(boot.ci(boot.obj, type=c("basic","norm","perc"))) 193 | 194 | 195 | ### Example 8.11 (Bootstrap t confidence interval) 196 | 197 | boot.t.ci <- 198 | function(x, B = 500, R = 100, level = .95, statistic){ 199 | #compute the bootstrap t CI 200 | x <- as.matrix(x); n <- nrow(x) 201 | stat <- numeric(B); se <- numeric(B) 202 | 203 | boot.se <- function(x, R, f) { 204 | #local function to compute the bootstrap 205 | #estimate of standard error for statistic f(x) 206 | x <- as.matrix(x); m <- nrow(x) 207 | th <- replicate(R, expr = { 208 | i <- sample(1:m, size = m, replace = TRUE) 209 | f(x[i, ]) 210 | }) 211 | return(sd(th)) 212 | } 213 | 214 | for (b in 1:B) { 215 | j <- sample(1:n, size = n, replace = TRUE) 216 | y <- x[j, ] 217 | stat[b] <- statistic(y) 218 | se[b] <- boot.se(y, R = R, f = statistic) 219 | } 220 | stat0 <- statistic(x) 221 | t.stats <- (stat - stat0) / se 222 | se0 <- sd(stat) 223 | alpha <- 1 - level 224 | Qt <- quantile(t.stats, c(alpha/2, 1-alpha/2), type = 1) 225 | names(Qt) <- rev(names(Qt)) 226 | CI <- rev(stat0 - Qt * se0) 227 | } 228 | 229 | 230 | ### Example 8.12 (Bootstrap t confidence interval for patch ratio statistic) 231 | 232 | #boot package and patch data were loaded in Example 8.10 233 | #library(boot) #for boot and boot.ci 234 | #data(patch, package = "bootstrap") 235 | 236 | dat <- cbind(patch$y, patch$z) 237 | stat <- function(dat) { 238 | mean(dat[, 1]) / mean(dat[, 2]) } 239 | ci <- boot.t.ci(dat, statistic = stat, B=2000, R=200) 240 | print(ci) 241 | 242 | 243 | ### Example 8.13 (BCa bootstrap confidence interval) 244 | 245 | boot.BCa <- 246 | function(x, th0, th, stat, conf = .95) { 247 | # bootstrap with BCa bootstrap confidence interval 248 | # th0 is the observed statistic 249 | # th is the vector of bootstrap replicates 250 | # stat is the function to compute the statistic 251 | 252 | x <- as.matrix(x) 253 | n <- nrow(x) #observations in rows 254 | N <- 1:n 255 | alpha <- (1 + c(-conf, conf))/2 256 | zalpha <- qnorm(alpha) 257 | 258 | # the bias correction factor 259 | z0 <- qnorm(sum(th < th0) / length(th)) 260 | 261 | # the acceleration factor (jackknife est.) 
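# leave out one observation at a time to get jackknife replicates
# th.jack; with influence values L = mean(th.jack) - th.jack, the
# acceleration is estimated by a = sum(L^3) / (6 * sum(L^2)^1.5)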
262 | th.jack <- numeric(n) 263 | for (i in 1:n) { 264 | J <- N[1:(n-1)] 265 | th.jack[i] <- stat(x[-i, ], J) 266 | } 267 | L <- mean(th.jack) - th.jack 268 | a <- sum(L^3)/(6 * sum(L^2)^1.5) 269 | 270 | # BCa conf. limits 271 | adj.alpha <- pnorm(z0 + (z0+zalpha)/(1-a*(z0+zalpha))) 272 | limits <- quantile(th, adj.alpha, type=6) 273 | return(list("est"=th0, "BCa"=limits)) 274 | } 275 | 276 | 277 | ### Example 8.14 (BCa bootstrap confidence interval) 278 | 279 | #boot package and patch data were loaded in Example 8.10 280 | #library(boot) #for boot and boot.ci 281 | #data(patch, package = "bootstrap") 282 | 283 | n <- nrow(patch) 284 | B <- 2000 285 | y <- patch$y 286 | z <- patch$z 287 | x <- cbind(y, z) 288 | theta.b <- numeric(B) 289 | theta.hat <- mean(y) / mean(z) 290 | 291 | #bootstrap 292 | for (b in 1:B) { 293 | i <- sample(1:n, size = n, replace = TRUE) 294 | y <- patch$y[i] 295 | z <- patch$z[i] 296 | theta.b[b] <- mean(y) / mean(z) 297 | } 298 | #compute the BCa interval 299 | stat <- function(dat, index) { 300 | mean(dat[index, 1]) / mean(dat[index, 2]) } 301 | 302 | boot.BCa(x, th0 = theta.hat, th = theta.b, stat = stat) 303 | 304 | 305 | 306 | ### Example 8.15 (BCa bootstrap confidence interval using boot.ci) 307 | 308 | #using x from Example 8.15 309 | boot.obj <- boot(x, statistic = stat, R=2000) 310 | boot.ci(boot.obj, type=c("perc", "bca")) 311 | 312 | ### Example 8.16 (Model selection) 313 | 314 | #to prompt for next graph, uncomment line below 315 | #par(ask = TRUE) 316 | 317 | library(DAAG); attach(ironslag) 318 | a <- seq(10, 40, .1) #sequence for plotting fits 319 | 320 | L1 <- lm(magnetic ~ chemical) 321 | plot(chemical, magnetic, main="Linear", pch=16) 322 | yhat1 <- L1$coef[1] + L1$coef[2] * a 323 | lines(a, yhat1, lwd=2) 324 | 325 | L2 <- lm(magnetic ~ chemical + I(chemical^2)) 326 | plot(chemical, magnetic, main="Quadratic", pch=16) 327 | yhat2 <- L2$coef[1] + L2$coef[2] * a + L2$coef[3] * a^2 328 | lines(a, yhat2, lwd=2) 329 | 330 | L3 <- lm(log(magnetic) ~ chemical) 331 | plot(chemical, magnetic, main="Exponential", pch=16) 332 | logyhat3 <- L3$coef[1] + L3$coef[2] * a 333 | yhat3 <- exp(logyhat3) 334 | lines(a, yhat3, lwd=2) 335 | 336 | L4 <- lm(log(magnetic) ~ log(chemical)) 337 | plot(log(chemical), log(magnetic), main="Log-Log", pch=16) 338 | logyhat4 <- L4$coef[1] + L4$coef[2] * log(a) 339 | lines(log(a), logyhat4, lwd=2) 340 | 341 | ### Example 8.17 (Model selection: Cross validation) 342 | 343 | # Example 8.16, cont. 
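# n-fold (leave-one-out) cross validation: for each k, fit the four
# candidate models without observation k and record the prediction
# error at the held-out point (chemical[k], magnetic[k])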
344 | n <- length(magnetic) #in DAAG ironslag 345 | e1 <- e2 <- e3 <- e4 <- numeric(n) 346 | 347 | # for n-fold cross validation 348 | # fit models on leave-one-out samples 349 | for (k in 1:n) { 350 | y <- magnetic[-k] 351 | x <- chemical[-k] 352 | 353 | J1 <- lm(y ~ x) 354 | yhat1 <- J1$coef[1] + J1$coef[2] * chemical[k] 355 | e1[k] <- magnetic[k] - yhat1 356 | 357 | J2 <- lm(y ~ x + I(x^2)) 358 | yhat2 <- J2$coef[1] + J2$coef[2] * chemical[k] + 359 | J2$coef[3] * chemical[k]^2 360 | e2[k] <- magnetic[k] - yhat2 361 | 362 | J3 <- lm(log(y) ~ x) 363 | logyhat3 <- J3$coef[1] + J3$coef[2] * chemical[k] 364 | yhat3 <- exp(logyhat3) 365 | e3[k] <- magnetic[k] - yhat3 366 | 367 | J4 <- lm(log(y) ~ log(x)) 368 | logyhat4 <- J4$coef[1] + J4$coef[2] * log(chemical[k]) 369 | yhat4 <- exp(logyhat4) 370 | e4[k] <- magnetic[k] - yhat4 371 | } 372 | 373 | 374 | c(mean(e1^2), mean(e2^2), mean(e3^2), mean(e4^2)) 375 | 376 | #selected model, fitted in Example 8.16 377 | L2 378 | 379 | par(mfrow = c(2, 2)) #layout for graphs 380 | plot(L2$fit, L2$res) #residuals vs fitted values 381 | abline(0, 0) #reference line 382 | qqnorm(L2$res) #normal probability plot 383 | qqline(L2$res) #reference line 384 | par(mfrow = c(1, 1)) #restore display 385 | 386 | -------------------------------------------------------------------------------- /examples-R/SCR2e-chMCint.R: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | ### Statistical Computing with R 2e ### 3 | ### Maria L. Rizzo ### 4 | ### Chapman & Hall/CRC The R Series ### 5 | ### ISBN 9781466553323 - CAT# K15269 ### 6 | ### January 2019 ### 7 | ### ### 8 | ### R code for Chapter 6 ### 9 | ### Monte Carlo Integration and Variance Reduction ### 10 | ####################################################### 11 | 12 | 13 | ### Example 6.1 (Simple Monte Carlo integration) 14 | 15 | m <- 10000 16 | x <- runif(m) 17 | theta.hat <- mean(exp(-x)) 18 | print(theta.hat) 19 | print(1 - exp(-1)) 20 | 21 | 22 | ### Example 6.2 (Simple Monte Carlo integration, cont.) 23 | 24 | m <- 10000 25 | x <- runif(m, min=2, max=4) 26 | theta.hat <- mean(exp(-x)) * 2 27 | print(theta.hat) 28 | print(exp(-2) - exp(-4)) 29 | 30 | 31 | ### Example 6.3 (Monte Carlo integration, unbounded interval) 32 | 33 | x <- seq(.1, 2.5, length = 10) 34 | m <- 10000 35 | u <- runif(m) 36 | cdf <- numeric(length(x)) 37 | for (i in 1:length(x)) { 38 | g <- x[i] * exp(-(u * x[i])^2 / 2) 39 | cdf[i] <- mean(g) / sqrt(2 * pi) + 0.5 40 | } 41 | 42 | Phi <- pnorm(x) 43 | print(round(rbind(x, cdf, Phi), 3)) 44 | 45 | 46 | ### Example 6.4 (Example 6.3, cont.) 
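# estimate the standard normal cdf Phi(x) directly, by the sample
# proportion of generated N(0,1) variates that fall below x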
47 | 48 | x <- seq(.1, 2.5, length = 10) 49 | m <- 10000 50 | z <- rnorm(m) 51 | dim(x) <- length(x) 52 | p <- apply(x, MARGIN = 1, 53 | FUN = function(x, z) {mean(z < x)}, z = z) 54 | 55 | Phi <- pnorm(x) 56 | print(round(rbind(x, p, Phi), 3)) 57 | 58 | 59 | ### Example 6.5 (Error bounds for MC integration) 60 | 61 | x <- 2 62 | m <- 10000 63 | z <- rnorm(m) 64 | g <- (z < x) #the indicator function 65 | v <- mean((g - mean(g))^2) / m 66 | cdf <- mean(g) 67 | c(cdf, v) 68 | c(cdf - 1.96 * sqrt(v), cdf + 1.96 * sqrt(v)) 69 | 70 | 71 | ### Example 6.6 (Antithetic variables) 72 | 73 | MC.Phi <- function(x, R = 10000, antithetic = TRUE) { 74 | u <- runif(R/2) 75 | if (!antithetic) v <- runif(R/2) else 76 | v <- 1 - u 77 | u <- c(u, v) 78 | cdf <- numeric(length(x)) 79 | for (i in 1:length(x)) { 80 | g <- x[i] * exp(-(u * x[i])^2 / 2) 81 | cdf[i] <- mean(g) / sqrt(2 * pi) + 0.5 82 | } 83 | cdf 84 | } 85 | 86 | 87 | x <- seq(.1, 2.5, length=5) 88 | Phi <- pnorm(x) 89 | set.seed(123) 90 | MC1 <- MC.Phi(x, anti = FALSE) 91 | set.seed(123) 92 | MC2 <- MC.Phi(x) 93 | print(round(rbind(x, MC1, MC2, Phi), 5)) 94 | 95 | 96 | m <- 1000 97 | MC1 <- MC2 <- numeric(m) 98 | x <- 1.95 99 | for (i in 1:m) { 100 | MC1[i] <- MC.Phi(x, R = 1000, anti = FALSE) 101 | MC2[i] <- MC.Phi(x, R = 1000) 102 | } 103 | 104 | print(sd(MC1)) 105 | print(sd(MC2)) 106 | print((var(MC1) - var(MC2))/var(MC1)) 107 | 108 | 109 | ### Example 6.7 (Control variate) 110 | 111 | m <- 10000 112 | a <- - 12 + 6 * (exp(1) - 1) 113 | U <- runif(m) 114 | T1 <- exp(U) #simple MC 115 | T2 <- exp(U) + a * (U - 1/2) #controlled 116 | 117 | mean(T1) 118 | mean(T2) 119 | (var(T1) - var(T2)) / var(T1) 120 | 121 | 122 | ### Example 6.8 (MC integration using control variates) 123 | 124 | f <- function(u) 125 | exp(-.5)/(1+u^2) 126 | 127 | g <- function(u) 128 | exp(-u)/(1+u^2) 129 | 130 | set.seed(510) #needed later 131 | u <- runif(10000) 132 | B <- f(u) 133 | A <- g(u) 134 | 135 | cor(A, B) 136 | a <- -cov(A,B) / var(B) #est of c* 137 | a 138 | 139 | m <- 100000 140 | u <- runif(m) 141 | T1 <- g(u) 142 | T2 <- T1 + a * (f(u) - exp(-.5)*pi/4) 143 | 144 | c(mean(T1), mean(T2)) 145 | c(var(T1), var(T2)) 146 | (var(T1) - var(T2)) / var(T1) 147 | 148 | 149 | ### Example 6.9 (Control variate and regression) 150 | 151 | 152 | set.seed(510) 153 | mu <- exp(-.5)*pi/4 154 | u <- runif(10000) 155 | f <- exp(-.5)/(1+u^2) 156 | g <- exp(-u)/(1+u^2) 157 | L <- lm(g ~ f) 158 | L 159 | c.star <- - L$coeff[2] 160 | c.star 161 | 162 | theta.hat <- sum(L$coeff * c(1, mu)) #pred. value at mu 163 | theta.hat 164 | summary(L)$sigma^2 165 | summary(L)$r.squared 166 | 167 | ### Example 6.10 (Control variates and multiple regression) 168 | 169 | # Example 6.9 continued with a second control variate 170 | # and multiple regression to estimate vector c* 171 | 172 | u <- runif(10000) 173 | f1 <- exp(-.5) / (1+u^2) 174 | f2 <- exp(-u) / (1-exp(-1)) 175 | g <- exp(-u) / (1+u^2) 176 | 177 | L2 <- lm(g ~ f1 + f2) 178 | 179 | L2$coeff 180 | c.star <- - L2$coeff[2:3] 181 | c.star 182 | mu1 <- exp(-.5)*pi/4 183 | mu2 <- 1 184 | mu <- c(mu1, mu2) 185 | 186 | # theta.hat is the predicted response at mu 187 | # alternately can use predict.lm method 188 | 189 | theta.hat <- sum(L2$coeff * c(1, mu)) #pred. value at mu 190 | theta.hat 191 | 192 | ## alternately 193 | df <- data.frame(f1=mu1, f2=mu2) 194 | theta.hat <-predict(L2, df) 195 | 196 | # MSE / n is the est. 
variance of the control estimator 197 | MSE <- summary(L2)$sigma^2 198 | MSE 199 | sqrt(MSE / 10000) 200 | 201 | 202 | # compare with the previous estimates using 203 | # naive MC and control variate f1(u) 204 | # var1=.060231423 var2=.003124814 205 | var0 <- 0.060231423 #naive MC 206 | var1 <- 0.003117644 #controlled estimator with f1 207 | var2 <- MSE #new estimator 208 | 209 | # percent reduction in variance 210 | # it is a weighted average of R^2 values 211 | # so easier to compute directly 212 | 213 | 100 * (var0 - var1) / var0 214 | 100 * (var1 - var2) / var1 215 | 100 * (var0 - var2) / var0 216 | 217 | ### Example 6.11 (Choice of the importance function) 218 | #code for plot is at the end of the file 219 | 220 | m <- 10000 221 | theta.hat <- se <- numeric(5) 222 | g <- function(x) { 223 | exp(-x - log(1+x^2)) * (x > 0) * (x < 1) 224 | } 225 | 226 | x <- runif(m) #using f0 227 | fg <- g(x) 228 | theta.hat[1] <- mean(fg) 229 | se[1] <- sd(fg) 230 | 231 | x <- rexp(m, 1) #using f1 232 | fg <- g(x) / exp(-x) 233 | theta.hat[2] <- mean(fg) 234 | se[2] <- sd(fg) 235 | 236 | x <- rcauchy(m) #using f2 237 | i <- c(which(x > 1), which(x < 0)) 238 | x[i] <- 2 #to catch overflow errors in g(x) 239 | fg <- g(x) / dcauchy(x) 240 | theta.hat[3] <- mean(fg) 241 | se[3] <- sd(fg) 242 | 243 | u <- runif(m) #f3, inverse transform method 244 | x <- - log(1 - u * (1 - exp(-1))) 245 | fg <- g(x) / (exp(-x) / (1 - exp(-1))) 246 | theta.hat[4] <- mean(fg) 247 | se[4] <- sd(fg) 248 | 249 | u <- runif(m) #f4, inverse transform method 250 | x <- tan(pi * u / 4) 251 | fg <- g(x) / (4 / ((1 + x^2) * pi)) 252 | theta.hat[5] <- mean(fg) 253 | se[5] <- sd(fg) 254 | 255 | rbind(theta.hat, se / sqrt(m)) 256 | 257 | 258 | ### Example 6.12 (Example 6.11, cont.) 259 | 260 | M <- 20 #number of replicates 261 | T2 <- numeric(4) 262 | estimates <- matrix(0, 10, 2) 263 | 264 | g <- function(x) { 265 | exp(-x - log(1+x^2)) * (x > 0) * (x < 1) } 266 | 267 | for (i in 1:10) { 268 | estimates[i, 1] <- mean(g(runif(M))) 269 | T2[1] <- mean(g(runif(M/4, 0, .25))) 270 | T2[2] <- mean(g(runif(M/4, .25, .5))) 271 | T2[3] <- mean(g(runif(M/4, .5, .75))) 272 | T2[4] <- mean(g(runif(M/4, .75, 1))) 273 | estimates[i, 2] <- mean(T2) 274 | } 275 | 276 | estimates 277 | apply(estimates, 2, mean) 278 | apply(estimates, 2, var) 279 | 280 | 281 | ### Example 6.13 (Examples 6.11-6.12, cont.) 
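# stratified sampling: split (0,1) into k equal-width strata, use
# M/k replicates in each stratum, and average the k estimates;
# repeat N times to compare variances with standard Monte Carlo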
282 | 283 | M <- 10000 #number of replicates 284 | k <- 10 #number of strata 285 | r <- M / k #replicates per stratum 286 | N <- 50 #number of times to repeat the estimation 287 | T2 <- numeric(k) 288 | estimates <- matrix(0, N, 2) 289 | 290 | g <- function(x) { 291 | exp(-x - log(1+x^2)) * (x > 0) * (x < 1) 292 | } 293 | 294 | for (i in 1:N) { 295 | estimates[i, 1] <- mean(g(runif(M))) 296 | for (j in 1:k) 297 | T2[j] <- mean(g(runif(M/k, (j-1)/k, j/k))) 298 | estimates[i, 2] <- mean(T2) 299 | } 300 | 301 | apply(estimates, 2, mean) 302 | apply(estimates, 2, var) 303 | 304 | 305 | 306 | ### Plot importance functions in Figures 6.1(a) and 6.1.(b) 307 | 308 | #par(ask = TRUE) #uncomment to pause between graphs 309 | 310 | x <- seq(0, 1, .01) 311 | w <- 2 312 | f1 <- exp(-x) 313 | f2 <- (1 / pi) / (1 + x^2) 314 | f3 <- exp(-x) / (1 - exp(-1)) 315 | f4 <- 4 / ((1 + x^2) * pi) 316 | g <- exp(-x) / (1 + x^2) 317 | 318 | #for color change lty to col 319 | 320 | #figure (a) 321 | plot(x, g, type = "l", main = "", ylab = "", 322 | ylim = c(0,2), lwd = w) 323 | lines(x, g/g, lty = 2, lwd = w) 324 | lines(x, f1, lty = 3, lwd = w) 325 | lines(x, f2, lty = 4, lwd = w) 326 | lines(x, f3, lty = 5, lwd = w) 327 | lines(x, f4, lty = 6, lwd = w) 328 | legend("topright", legend = c("g", 0:4), 329 | lty = 1:6, lwd = w, inset = 0.02) 330 | 331 | #figure (b) 332 | plot(x, g, type = "l", main = "", ylab = "", 333 | ylim = c(0,3.2), lwd = w, lty = 2) 334 | lines(x, g/f1, lty = 3, lwd = w) 335 | lines(x, g/f2, lty = 4, lwd = w) 336 | lines(x, g/f3, lty = 5, lwd = w) 337 | lines(x, g/f4, lty = 6, lwd = w) 338 | legend("topright", legend = c(0:4), 339 | lty = 2:6, lwd = w, inset = 0.02) 340 | -------------------------------------------------------------------------------- /examples-R/SCR2e-chMonteCarlo.R: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | ### Statistical Computing with R 2e ### 3 | ### Maria L. Rizzo ### 4 | ### Chapman & Hall/CRC The R Series ### 5 | ### ISBN 9781466553323 - CAT# K15269 ### 6 | ### January 2019 ### 7 | ### ### 8 | ### R code for Chapter 7 ### 9 | ### Monte Carlo Methods in Inference ### 10 | ####################################################### 11 | 12 | # packages to install: energy, ggplot2 13 | 14 | ### Example 7.1 (Basic Monte Carlo estimation) 15 | 16 | m <- 1000 17 | g <- numeric(m) 18 | for (i in 1:m) { 19 | x <- rnorm(2) 20 | g[i] <- abs(x[1] - x[2]) 21 | } 22 | est <- mean(g) 23 | est 24 | 25 | 26 | ### Example 7.2 (Estimating the MSE of a trimmed mean) 27 | 28 | n <- 20 29 | m <- 1000 30 | tmean <- numeric(m) 31 | for (i in 1:m) { 32 | x <- sort(rnorm(n)) 33 | tmean[i] <- sum(x[2:(n-1)]) / (n-2) 34 | } 35 | mse <- mean(tmean^2) 36 | mse 37 | sqrt(sum((tmean - mean(tmean))^2)) / m #se 38 | 39 | n <- 20 40 | m <- 1000 41 | tmean <- numeric(m) 42 | for (i in 1:m) { 43 | x <- sort(rnorm(n)) 44 | tmean[i] <- median(x) 45 | } 46 | mse <- mean(tmean^2) 47 | mse 48 | sqrt(sum((tmean - mean(tmean))^2)) / m #se 49 | 50 | 51 | ### Example 7.3 (MSE of a trimmed mean, cont.) 
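# MSE of k-level trimmed means for samples from the contaminated
# normal mixture p*N(0,1) + (1-p)*N(0,100); each row of mse stores
# (estimate, se) pairs for p = 1, .95, .90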
52 | 53 | set.seed(522) 54 | n <- 20 55 | K <- n/2 - 1 56 | m <- 1000 57 | mse <- matrix(0, n/2, 6) 58 | 59 | trimmed.mse <- function(n, m, k, p) { 60 | #MC est of mse for k-level trimmed mean of 61 | #contaminated normal pN(0,1) + (1-p)N(0,100) 62 | tmean <- numeric(m) 63 | for (i in 1:m) { 64 | sigma <- sample(c(1, 10), size = n, 65 | replace = TRUE, prob = c(p, 1-p)) 66 | x <- sort(rnorm(n, 0, sigma)) 67 | tmean[i] <- sum(x[(k+1):(n-k)]) / (n-2*k) 68 | } 69 | mse.est <- mean(tmean^2) 70 | se.mse <- sqrt(mean((tmean-mean(tmean))^2)) / sqrt(m) 71 | return(c(mse.est, se.mse)) 72 | } 73 | 74 | for (k in 0:K) { 75 | mse[k+1, 1:2] <- trimmed.mse(n=n, m=m, k=k, p=1.0) 76 | mse[k+1, 3:4] <- trimmed.mse(n=n, m=m, k=k, p=.95) 77 | mse[k+1, 5:6] <- trimmed.mse(n=n, m=m, k=k, p=.9) 78 | } 79 | 80 | ### Example 7.4 (Confidence interval for variance) 81 | 82 | n <- 20 83 | alpha <- .05 84 | x <- rnorm(n, mean=0, sd=2) 85 | UCL <- (n-1) * var(x) / qchisq(alpha, df=n-1) 86 | 87 | 88 | ### Example 7.5 (MC estimate of confidence level) 89 | 90 | n <- 20 91 | alpha <- .05 92 | UCL <- replicate(1000, expr = { 93 | x <- rnorm(n, mean = 0, sd = 2) 94 | (n-1) * var(x) / qchisq(alpha, df = n-1) 95 | } ) 96 | #count the number of intervals that contain sigma^2=4 97 | sum(UCL > 4) 98 | #or compute the mean to get the confidence level 99 | mean(UCL > 4) 100 | 101 | 102 | ### Example 7.6 (Empirical confidence level) 103 | 104 | n <- 20 105 | alpha <- .05 106 | UCL <- replicate(1000, expr = { 107 | x <- rchisq(n, df = 2) 108 | (n-1) * var(x) / qchisq(alpha, df = n-1) 109 | } ) 110 | sum(UCL > 4) 111 | mean(UCL > 4) 112 | 113 | 114 | ### Example 7.7 (Empirical Type I error rate) 115 | 116 | n <- 20 117 | alpha <- .05 118 | mu0 <- 500 119 | sigma <- 100 120 | 121 | m <- 10000 #number of replicates 122 | p <- numeric(m) #storage for p-values 123 | for (j in 1:m) { 124 | x <- rnorm(n, mu0, sigma) 125 | ttest <- t.test(x, alternative = "greater", mu = mu0) 126 | p[j] <- ttest$p.value 127 | } 128 | 129 | p.hat <- mean(p < alpha) 130 | se.hat <- sqrt(p.hat * (1 - p.hat) / m) 131 | print(c(p.hat, se.hat)) 132 | 133 | 134 | ### Example 7.8 (Skewness test of normality) 135 | 136 | n <- c(10, 20, 30, 50, 100, 500) #sample sizes 137 | cv <- qnorm(.975, 0, sqrt(6/n)) #crit. values for each n 138 | 139 | sk <- function(x) { 140 | #computes the sample skewness coeff. 141 | xbar <- mean(x) 142 | m3 <- mean((x - xbar)^3) 143 | m2 <- mean((x - xbar)^2) 144 | return( m3 / m2^1.5 ) 145 | } 146 | 147 | #n is a vector of sample sizes 148 | #we are doing length(n) different simulations 149 | 150 | p.reject <- numeric(length(n)) #to store sim. results 151 | m <- 10000 #num. repl. each sim. 
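# for each sample size n[i], estimate the empirical Type I error
# rate of the skewness test as the proportion of m normal samples
# for which |sk(x)| exceeds the critical value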
152 | 153 | for (i in 1:length(n)) { 154 | sktests <- numeric(m) #test decisions 155 | for (j in 1:m) { 156 | x <- rnorm(n[i]) 157 | #test decision is 1 (reject) or 0 158 | sktests[j] <- as.integer(abs(sk(x)) >= cv[i] ) 159 | } 160 | p.reject[i] <- mean(sktests) #proportion rejected 161 | } 162 | 163 | p.reject 164 | 165 | 166 | ### Example 7.9 (Empirical power) 167 | 168 | #set.seed(521) 169 | n <- 20 170 | m <- 1000 171 | mu0 <- 500 172 | sigma <- 100 173 | mu <- c(seq(450, 650, 10)) #alternatives 174 | M <- length(mu) 175 | power <- numeric(M) 176 | for (i in 1:M) { 177 | mu1 <- mu[i] 178 | pvalues <- replicate(m, expr = { 179 | #simulate under alternative mu1 180 | x <- rnorm(n, mean = mu1, sd = sigma) 181 | ttest <- t.test(x, 182 | alternative = "greater", mu = mu0) 183 | ttest$p.value } ) 184 | power[i] <- mean(pvalues <= .05) 185 | } 186 | 187 | se <- sqrt(power * (1-power) / m) 188 | df <- data.frame(mean=mu, power=power, upper=power+2*se, lower=power-2*se) 189 | 190 | library(ggplot2) 191 | ggplot(df, aes(x=mean, y=power)) + 192 | geom_line() + 193 | geom_vline(xintercept=500, lty=2) + 194 | geom_hline(yintercept=c(0,.05), lty=1:2) + 195 | geom_errorbar(aes(ymin=lower, ymax=upper), width = 0.2, lwd=1.5) 196 | 197 | 198 | ### Example 7.10 (Power of the skewness test of normality) 199 | 200 | #set.seed(111) 201 | alpha <- .1 202 | n <- 30 203 | m <- 2500 204 | epsilon <- c(seq(0, .15, .01), seq(.15, 1, .05)) 205 | N <- length(epsilon) 206 | pwr <- numeric(N) 207 | #critical value for the skewness test 208 | cv <- qnorm(1-alpha/2, 0, sqrt(6*(n-2) / ((n+1)*(n+3)))) 209 | 210 | for (j in 1:N) { #for each epsilon 211 | e <- epsilon[j] 212 | sktests <- numeric(m) 213 | for (i in 1:m) { #for each replicate 214 | sigma <- sample(c(1, 10), replace = TRUE, 215 | size = n, prob = c(1-e, e)) 216 | x <- rnorm(n, 0, sigma) 217 | sktests[i] <- as.integer(abs(sk(x)) >= cv) 218 | } 219 | pwr[j] <- mean(sktests) 220 | } 221 | 222 | se <- sqrt(pwr * (1-pwr) / m) 223 | df <- data.frame(epsilon=epsilon, power=pwr, upper=pwr+2*se, lower=pwr-2*se) 224 | 225 | #plot power vs epsilon 226 | library(ggplot2) 227 | ggplot(df, aes(x=epsilon, y=power)) + 228 | geom_line() + labs(x=bquote(epsilon)) + 229 | geom_hline(yintercept=.1, lty=2) + 230 | geom_pointrange(aes(ymin=lower, ymax=upper)) 231 | 232 | 233 | ### Example 7.11 (Power comparison of tests of normality) 234 | 235 | #only one loop, for epsilon=0.1, was shown in the text 236 | #the simulation below takes several minutes to run 237 | 238 | # initialize input and output 239 | library(energy) 240 | alpha <- .1 241 | n <- 30 242 | m <- 500 #try small m for a trial run 243 | test1 <- test2 <- test3 <- numeric(m) 244 | 245 | #critical value for the skewness test 246 | cv <- qnorm(1-alpha/2, 0, sqrt(6*(n-2) / ((n+1)*(n+3)))) 247 | sim <- matrix(0, 11, 4) 248 | 249 | # estimate power 250 | for (i in 0:10) { 251 | epsilon <- i * .1 252 | for (j in 1:m) { 253 | e <- epsilon 254 | sigma <- sample(c(1, 10), replace = TRUE, 255 | size = n, prob = c(1-e, e)) 256 | x <- rnorm(n, 0, sigma) 257 | test1[j] <- as.integer(abs(sk(x)) >= cv) 258 | test2[j] <- as.integer( 259 | shapiro.test(x)$p.value <= alpha) 260 | test3[j] <- as.integer( 261 | mvnorm.etest(x, R=200)$p.value <= alpha) 262 | } 263 | print(c(epsilon, mean(test1), mean(test2), mean(test3))) 264 | sim[i+1, ] <- c(epsilon, mean(test1), mean(test2), mean(test3)) 265 | } 266 | detach(package:energy) 267 | 268 | # plot the empirical estimates of power 269 | plot(sim[,1], sim[,2], ylim = c(0, 1), type = "l", 270 | 
xlab = bquote(epsilon), ylab = "power") 271 | lines(sim[,1], sim[,3], lty = 2) 272 | lines(sim[,1], sim[,4], lty = 4) 273 | abline(h = alpha, lty = 3) 274 | legend("topright", 1, c("skewness", "S-W", "energy"), 275 | lty = c(1,2,4), inset = .02) 276 | 277 | 278 | ### Example 7.12 (Count Five test statistic) 279 | 280 | x1 <- rnorm(20, 0, sd = 1) 281 | x2 <- rnorm(20, 0, sd = 1.5) 282 | y <- c(x1, x2) 283 | 284 | group <- rep(1:2, each = length(x1)) 285 | boxplot(y ~ group, boxwex = .3, xlim = c(.5, 2.5), main = "") 286 | points(group, y) 287 | 288 | # now identify the extreme points 289 | range(x1) 290 | range(x2) 291 | 292 | i <- which(x1 < min(x2)) 293 | j <- which(x2 > max(x1)) 294 | 295 | x1[i] 296 | x2[j] 297 | 298 | out1 <- sum(x1 > max(x2)) + sum(x1 < min(x2)) 299 | out2 <- sum(x2 > max(x1)) + sum(x2 < min(x1)) 300 | max(c(out1, out2)) 301 | 302 | 303 | ### Example 7.13 (Count Five test statistic, cont.) 304 | 305 | maxout <- function(x, y) { 306 | X <- x - mean(x) 307 | Y <- y - mean(y) 308 | outx <- sum(X > max(Y)) + sum(X < min(Y)) 309 | outy <- sum(Y > max(X)) + sum(Y < min(X)) 310 | return(max(c(outx, outy))) 311 | } 312 | 313 | n1 <- n2 <- 20 314 | mu1 <- mu2 <- 0 315 | sigma1 <- sigma2 <- 1 316 | m <- 1000 317 | 318 | # generate samples under H0 319 | stat <- replicate(m, expr={ 320 | x <- rnorm(n1, mu1, sigma1) 321 | y <- rnorm(n2, mu2, sigma2) 322 | maxout(x, y) 323 | }) 324 | print(cumsum(table(stat)) / m) 325 | print(quantile(stat, c(.8, .9, .95))) 326 | 327 | 328 | ### Example 7.14 (Count Five test) 329 | 330 | count5test <- function(x, y) { 331 | X <- x - mean(x) 332 | Y <- y - mean(y) 333 | outx <- sum(X > max(Y)) + sum(X < min(Y)) 334 | outy <- sum(Y > max(X)) + sum(Y < min(X)) 335 | # return 1 (reject) or 0 (do not reject H0) 336 | return(as.integer(max(c(outx, outy)) > 5)) 337 | } 338 | 339 | n1 <- n2 <- 20 340 | mu1 <- mu2 <- 0 341 | sigma1 <- sigma2 <- 1 342 | m <- 10000 343 | tests <- replicate(m, expr = { 344 | x <- rnorm(n1, mu1, sigma1) 345 | y <- rnorm(n2, mu2, sigma2) 346 | x <- x - mean(x) #centered by sample mean 347 | y <- y - mean(y) 348 | count5test(x, y) 349 | } ) 350 | 351 | alphahat <- mean(tests) 352 | print(alphahat) 353 | 354 | 355 | ### Example 7.15 (Count Five test, cont.) 356 | 357 | n1 <- 20 358 | n2 <- 30 359 | mu1 <- mu2 <- 0 360 | sigma1 <- sigma2 <- 1 361 | m <- 10000 362 | 363 | alphahat <- mean(replicate(m, expr={ 364 | x <- rnorm(n1, mu1, sigma1) 365 | y <- rnorm(n2, mu2, sigma2) 366 | x <- x - mean(x) #centered by sample mean 367 | y <- y - mean(y) 368 | count5test(x, y) 369 | })) 370 | 371 | print(alphahat) 372 | 373 | 374 | ### Example 7.16 (Count Five, cont.) 375 | 376 | # generate samples under H1 to estimate power 377 | sigma1 <- 1 378 | sigma2 <- 1.5 379 | 380 | power <- mean(replicate(m, expr={ 381 | x <- rnorm(20, 0, sigma1) 382 | y <- rnorm(20, 0, sigma2) 383 | count5test(x, y) 384 | })) 385 | 386 | print(power) 387 | -------------------------------------------------------------------------------- /examples-R/SCR2e-chNumerical.R: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | ### Statistical Computing with R 2e ### 3 | ### Maria L. 
Rizzo ### 4 | ### Chapman & Hall/CRC The R Series ### 5 | ### ISBN 9781466553323 - CAT# K15269 ### 6 | ### January 2019 ### 7 | ### ### 8 | ### R code for Chapter 13 ### 9 | ### Introduction to Numerical Methods in R ### 10 | ####################################################### 11 | 12 | # packages to install: gsl 13 | 14 | ### Example 13.1 (Identical and nearly equal) 15 | 16 | isTRUE(all.equal(.2, .3 - .1)) 17 | all.equal(.2, .3) #not a logical value 18 | isTRUE(all.equal(.2, .3)) #always a logical value 19 | 20 | x <- 1:4 21 | y <- 2 22 | y == 2 23 | x == y #not necessarily a single logical value 24 | identical(x, y) #always a single logical value 25 | identical(y, 2) 26 | 27 | 28 | ### Example 13.2 (Ratio of two large numbers) 29 | 30 | n <- 400 31 | (gamma((n-1)/2) / (sqrt(pi) * gamma((n-2)/2))) 32 | exp(lgamma((n-1)/2) - lgamma((n-2)/2)) / sqrt(pi) 33 | 34 | 35 | ### Example 13.3 (Taylor expansion) 36 | 37 | system.time({ 38 | for (i in 1:1000) { 39 | a <- rep(0, 24) 40 | a0 <- pi / 6 41 | a2 <- a0 * a0 42 | a[1] <- -a0^3 / 6 43 | for (i in 2:24) 44 | a[i] <- - a2 * a[i-1] / ((2*i+1)*(2*i)) 45 | a0 + sum(a)} 46 | }) 47 | 48 | system.time({ 49 | for (i in 1:1000) { 50 | K <- 2 * (0:24) + 1 51 | i <- rep(c(1, -1), length=25) 52 | sum(i * (pi/6)^K / factorial(K))} 53 | }) 54 | 55 | 56 | ### Example 13.4 (Derivative of zeta function) 57 | 58 | zeta.deriv <- function(a) { 59 | z <- a - 1 60 | # Stieltjes constants gamma_k for k=1:5 61 | g <- c( 62 | -.7281584548367672e-1, 63 | -.9690363192872318e-2, 64 | .2053834420303346e-2, 65 | .2325370065467300e-2, 66 | .7933238173010627e-3) 67 | i <- c(-1, 1, -1, 1, -1) 68 | n <- 0:4 69 | -1/z^2 + sum(i * g * z^n / factorial(n)) 70 | } 71 | 72 | 73 | ### Example 13.5 (Derivative of zeta function, cont.) 74 | 75 | library(gsl) #for zeta function 76 | z <- c(1.001, 1.01, 1.5, 2, 3, 5) 77 | h <- .Machine$double.eps^0.5 78 | dz <- dq <- rep(0, length(z)) 79 | for (i in 1:length(z)) { 80 | v <- z[i] + h 81 | h <- v - z[i] 82 | a0 <- z[i] - h 83 | if (a0 < 1) a0 <- (1 + z[i])/2 84 | a1 <- z[i] + h 85 | dq[i] <- (zeta(a1) - zeta(a0)) / (a1 - a0) 86 | dz[i] <- zeta.deriv(z[i]) 87 | } 88 | 89 | h 90 | 91 | cbind(z, dz, dq) 92 | 93 | 94 | ### Example 13.6 (Solving f(x)=0) 95 | 96 | f <- function(y, a, n) { 97 | a^2 + y^2 + 2*a*y/(n-1) - (n-2) 98 | } 99 | 100 | a <- 0.5 101 | n <- 20 102 | b0 <- 0 103 | b1 <- 5*n 104 | 105 | #solve using bisection 106 | it <- 0 107 | eps <- .Machine$double.eps^0.25 108 | r <- seq(b0, b1, length=3) 109 | y <- c(f(r[1], a, n), f(r[2], a, n), f(r[3], a, n)) 110 | if (y[1] * y[3] > 0) 111 | stop("f does not have opposite sign at endpoints") 112 | 113 | while(it < 1000 && abs(y[2]) > eps) { 114 | it <- it + 1 115 | if (y[1]*y[2] < 0) { 116 | r[3] <- r[2] 117 | y[3] <- y[2] 118 | } else { 119 | r[1] <- r[2] 120 | y[1] <- y[2] 121 | } 122 | r[2] <- (r[1] + r[3]) / 2 123 | y[2] <- f(r[2], a=a, n=n) 124 | print(c(r[1], y[1], y[3]-y[2])) 125 | } 126 | 127 | 128 | ### Example 13.7 (Solving f(x)=0 with Brent's method: uniroot) 129 | 130 | a <- 0.5 131 | n <- 20 132 | out <- uniroot(function(y) { 133 | a^2 + y^2 + 2*a*y/(n-1) - (n-2) }, 134 | lower = 0, upper = n*5) 135 | unlist(out) 136 | uniroot(function(y) {a^2 + y^2 + 2*a*y/(n-1) - (n-2)}, 137 | interval = c(-n*5, 0))$root 138 | 139 | 140 | ### Example 13.8 (Numerical integration with integrate) 141 | 142 | f <- function(y, N, r, rho) { 143 | (cosh(y) - rho * r)^(1 - N) 144 | } 145 | integrate(f, lower=0, upper=Inf, 146 | rel.tol=.Machine$double.eps^0.25, 147 | N=10, r=0.5, rho=0.2) 
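# evaluate the same integral over a grid of rho values in (-1, 1)
# and plot the integral value as a function of rho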
148 | 
149 | ro <- seq(-.99, .99, .01)
150 | v <- rep(0, length(ro))
151 | for (i in 1:length(ro)) {
152 | v[i] <- integrate(f, lower=0, upper=Inf,
153 | rel.tol=.Machine$double.eps^0.25,
154 | N=10, r=0.5, rho=ro[i])$value
155 | }
156 | plot(ro, v, type="l", xlab=expression(rho),
157 | ylab="Integral Value (n=10, r=0.5)")
158 | 
159 | 
160 | ### Example 13.9 (Density of sample correlation coefficient)
161 | 
162 | .dcorr <- function(r, N, rho=0) {
163 | # compute the density function of sample correlation
164 | if (abs(r) > 1 || abs(rho) > 1) return (0)
165 | if (N < 4) return (NA)
166 | 
167 | if (isTRUE(all.equal(rho, 0.0))) {
168 | a <- exp(lgamma((N - 1)/2) - lgamma((N - 2)/2)) /
169 | sqrt(pi)
170 | return (a * (1 - r^2)^((N - 4)/2))
171 | }
172 | 
173 | # if rho not 0, need to integrate
174 | f <- function(w, R, N, rho)
175 | (cosh(w) - rho * R)^(1 - N)
176 | 
177 | #need to insert some error checking here
178 | i <- integrate(f, lower=0, upper=Inf,
179 | R=r, N=N, rho=rho)$value
180 | c1 <- (N - 2) * (1 - rho^2)^((N - 1)/2)
181 | c2 <- (1 - r^2)^((N - 4) / 2) / pi
182 | return(c1 * c2 * i)
183 | }
184 | 
185 | r <- as.matrix(seq(-1, 1, .01))
186 | d1 <- apply(r, 1, .dcorr, N=10, rho=.0)
187 | d2 <- apply(r, 1, .dcorr, N=10, rho=.5)
188 | d3 <- apply(r, 1, .dcorr, N=10, rho=-.5)
189 | plot(r, d2, type="l", lty=2, lwd=2, ylab="density")
190 | lines(r, d1, lwd=2)
191 | lines(r, d3, lty=4, lwd=2)
192 | legend("top", inset=.02,
193 | c("rho = 0", "rho = 0.5", "rho = -0.5"), lty=c(1,2,4), lwd=2)
194 | 
195 | 
196 | ### Example 13.10 (MLE using mle)
197 | 
198 | #the observed sample
199 | y <- c(0.04304550, 0.50263474)
200 | 
201 | mlogL <- function(theta=1) {
202 | #minus log-likelihood of exp. density, rate theta
203 | return( - (length(y) * log(theta) - theta * sum(y)))
204 | }
205 | 
206 | library(stats4)
207 | fit <- mle(mlogL)
208 | summary(fit)
209 | 
210 | # Alternatively, the initial value for the optimizer could
211 | # be supplied in the call to mle; two examples are
212 | 
213 | mle(mlogL, start=list(theta=1))
214 | mle(mlogL, start=list(theta=mean(y)))
215 | 
216 | ### Application: Evaluating an Expected Value
217 | 
218 | y <- VGAM::rpareto(1000, scale=1, shape=3)
219 | hist(y, prob=TRUE, breaks="scott", main="", ylim=c(0,3))
220 | curve(VGAM::dpareto(x, scale=1, shape=3), add=TRUE)
221 | 
222 | a <- 1
223 | s <- 1
224 | b <- 0.5
225 | 
226 | ### Example 13.11 (Numerical integration)
227 | 
228 | f1 <- function(x, y, s, a, b) {
229 | # the integrand function: |y-x|^b f(x)
230 | (abs(y-x))^b * a * s^a / x^(a+1)
231 | }
232 | 
233 | integrate(f1, lower=s, upper=Inf, y=2, s=1, a=1, b=b)
234 | 
235 | ### Example 13.12 (Direct evaluation)
236 | 
237 | g2 <- function(y, s, b) {
238 | # Compute E|y-X|^b for Pareto(I) with a=1
239 | y0 <- (y - s)/y
240 | c0 <- gsl::hyperg_2F1(b+1, 1, b+2, y0)
241 | ((y - s)^b) - s*b*(y^(b-1)) *
242 | ((y0^b)/b + c0*(y0^(b+1))/(b+1)) +
243 | s*y^(b-1) * beta(b+1,1-b)
244 | }
245 | 
246 | g2(2, s, b)
247 | 
248 | ### Example 13.13 (Plot of expected distance function)
249 | 
250 | p <- c(ppoints(50)/100, ppoints(50))
251 | x <- VGAM::qpareto(p, scale=s, shape=1)
252 | ex <- g2(x, s=s, b=b)
253 | plot(log(x), log(ex), cex=.25, type="l")
254 | for (i in 1:length(x)) {
255 | y <- x[i]
256 | zi <- integrate(f1, lower=s, upper=Inf,
257 | subdivisions=200, rel.tol=.Machine$double.eps^.5,
258 | stop.on.error=FALSE, y=y, s=s, a=1, b=b)
259 | if (zi$message == "OK")
260 | points(log(y), log(zi$value), col=2, cex=.25)
261 | else print(paste("y=", y, zi$message))
262 | }
263 | 
264 | 
integrate(f1, lower=s, upper=Inf, y=s, s=s, a=1, b=b) 265 | g2(s, s, b) 266 | -------------------------------------------------------------------------------- /examples-R/SCR2e-chOptim.R: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | ### Statistical Computing with R 2e ### 3 | ### Maria L. Rizzo ### 4 | ### Chapman & Hall/CRC The R Series ### 5 | ### ISBN 9781466553323 - CAT# K15269 ### 6 | ### January 2019 ### 7 | ### ### 8 | ### R code for Chapter 14 ### 9 | ### Optimization ### 10 | ####################################################### 11 | 12 | 13 | ### Example 14.1 (One-dimensional optimization with optimize) 14 | 15 | f <- function(x) 16 | log(x + log(x))/log(1+x) 17 | 18 | curve(f(x), from = 2, to = 15, ylab = "f(x)") 19 | 20 | optimize(f, lower = 4, upper = 8, maximum = TRUE) 21 | 22 | 23 | ### Example 14.2 (MLE: Gamma distribution) 24 | 25 | m <- 20000 26 | est <- matrix(0, m, 2) 27 | n <- 200 28 | r <- 5 29 | lambda <- 2 30 | 31 | obj <- function(lambda, xbar, logx.bar) { 32 | digamma(lambda * xbar) - logx.bar - log(lambda) 33 | } 34 | 35 | for (i in 1:m) { 36 | x <- rgamma(n, shape=r, rate=lambda) 37 | xbar <- mean(x) 38 | u <- uniroot(obj, lower = .001, upper = 10e5, 39 | xbar = xbar, logx.bar = mean(log(x))) 40 | lambda.hat <- u$root 41 | r.hat <- xbar * lambda.hat 42 | est[i, ] <- c(r.hat, lambda.hat) 43 | } 44 | 45 | ML <- colMeans(est) 46 | 47 | hist(est[, 1], breaks="scott", freq=FALSE, 48 | xlab="r", main="") 49 | points(ML[1], 0, cex=1.5, pch=20) 50 | hist(est[, 2], breaks="scott", freq=FALSE, 51 | xlab=bquote(lambda), main="") 52 | points(ML[2], 0, cex=1.5, pch=20) 53 | 54 | 55 | ### Example 14.3 (Two-dimensional optimization with optim) 56 | 57 | LL <- function(theta, sx, slogx, n) { 58 | r <- theta[1] 59 | lambda <- theta[2] 60 | loglik <- n * r * log(lambda) + (r - 1) * slogx - 61 | lambda * sx - n * log(gamma(r)) 62 | - loglik 63 | } 64 | 65 | n <- 200 66 | r <- 5; lambda <- 2 67 | x <- rgamma(n, shape=r, rate=lambda) 68 | 69 | optim(c(1,1), LL, sx=sum(x), slogx=sum(log(x)), n=n) 70 | 71 | mlests <- replicate(20000, expr = { 72 | x <- rgamma(200, shape = 5, rate = 2) 73 | optim(c(1,1), LL, sx=sum(x), slogx=sum(log(x)), n=n)$par 74 | }) 75 | colMeans(t(mlests)) 76 | 77 | 78 | ### Example 14.4 (MLE for a quadratic form) 79 | 80 | LL <- function(lambda, y) { 81 | lambda3 <- 1 - sum(lambda) 82 | f1 <- dgamma(y, shape=1/2, rate=1/(2*lambda[1])) 83 | f2 <- dgamma(y, shape=1/2, rate=1/(2*lambda[2])) 84 | f3 <- dgamma(y, shape=1/2, rate=1/(2*lambda3)) 85 | f <- f1/3 + f2/3 + f3/3 #density of mixture 86 | #returning -loglikelihood 87 | return( -sum(log(f))) 88 | } 89 | 90 | set.seed(543) 91 | m <- 2000 92 | lambda <- c(.6, .25, .15) #rate is 1/(2lambda) 93 | lam <- sample(lambda, size = 2000, replace = TRUE) 94 | y <- rgamma(m, shape = .5, rate = 1/(2*lam)) 95 | 96 | opt <- optim(c(.5,.3), LL, y=y) 97 | theta <- c(opt$par, 1 - sum(opt$par)) 98 | 99 | as.data.frame(unlist(opt)) 100 | 101 | theta 102 | 103 | 104 | ### Example 14.5 (EM algorithm for a mixture model) 105 | 106 | set.seed(543) 107 | lambda <- c(.6, .25, .15) #rate is 1/(2lambda) 108 | lam <- sample(lambda, size = 2000, replace = TRUE) 109 | y <- rgamma(m, shape = .5, rate = 1/(2*lam)) 110 | 111 | N <- 10000 #max. number of iterations 112 | L <- c(.5, .4, .1) #initial est. 
for lambdas 113 | tol <- .Machine$double.eps^0.5 114 | L.old <- L + 1 115 | 116 | for (j in 1:N) { 117 | f1 <- dgamma(y, shape=1/2, rate=1/(2*L[1])) 118 | f2 <- dgamma(y, shape=1/2, rate=1/(2*L[2])) 119 | f3 <- dgamma(y, shape=1/2, rate=1/(2*L[3])) 120 | py <- f1 / (f1 + f2 + f3) #posterior prob y from 1 121 | qy <- f2 / (f1 + f2 + f3) #posterior prob y from 2 122 | ry <- f3 / (f1 + f2 + f3) #posterior prob y from 3 123 | 124 | mu1 <- sum(y * py) / sum(py) #update means 125 | mu2 <- sum(y * qy) / sum(qy) 126 | mu3 <- sum(y * ry) / sum(ry) 127 | L <- c(mu1, mu2, mu3) #update lambdas 128 | L <- L / sum(L) 129 | 130 | if (sum(abs(L - L.old)/L.old) < tol) break 131 | L.old <- L 132 | } 133 | 134 | print(list(lambda = L/sum(L), iter = j, tol = tol)) 135 | 136 | 137 | ### Example 14.6 (Simplex algorithm) 138 | 139 | library(boot) #for simplex function 140 | A1 <- rbind(c(-2, 1, 1), c(4, -1, 3)) 141 | b1 <- c(1, 3) 142 | a <- c(2, 2, 3) 143 | simplex(a = a, A1 = A1, b1 = b1, maxi = TRUE) 144 | detach(package:boot) 145 | 146 | 147 | ### Example 14.7 (Solving the Morra game) 148 | 149 | solve.game <- function(A) { 150 | #solve the two player zero-sum game by simplex method 151 | #optimize for player 1, then player 2 152 | #maximize v subject to ... 153 | #let x strategies 1:m, and put v as extra variable 154 | #A1, the <= constraints 155 | # 156 | min.A <- min(A) 157 | A <- A - min.A #so that v >= 0 158 | max.A <- max(A) 159 | A <- A / max(A) 160 | m <- nrow(A) 161 | n <- ncol(A) 162 | it <- n^3 163 | a <- c(rep(0, m), 1) #objective function 164 | A1 <- -cbind(t(A), rep(-1, n)) #constraints <= 165 | b1 <- rep(0, n) 166 | A3 <- t(as.matrix(c(rep(1, m), 0))) #constraints sum(x)=1 167 | b3 <- 1 168 | sx <- simplex(a=a, A1=A1, b1=b1, A3=A3, b3=b3, 169 | maxi=TRUE, n.iter=it) 170 | #the 'solution' is [x1,x2,...,xm | value of game] 171 | # 172 | #minimize v subject to ... 173 | #let y strategies 1:n, with v as extra variable 174 | a <- c(rep(0, n), 1) #objective function 175 | A1 <- cbind(A, rep(-1, m)) #constraints <= 176 | b1 <- rep(0, m) 177 | A3 <- t(as.matrix(c(rep(1, n), 0))) #constraints sum(y)=1 178 | b3 <- 1 179 | sy <- simplex(a=a, A1=A1, b1=b1, A3=A3, b3=b3, 180 | maxi=FALSE, n.iter=it) 181 | 182 | soln <- list("A" = A * max.A + min.A, 183 | "x" = sx$soln[1:m], 184 | "y" = sy$soln[1:n], 185 | "v" = sx$soln[m+1] * max.A + min.A) 186 | soln 187 | } 188 | 189 | 190 | #enter the payoff matrix 191 | A <- matrix(c( 0,-2,-2,3,0,0,4,0,0, 192 | 2,0,0,0,-3,-3,4,0,0, 193 | 2,0,0,3,0,0,0,-4,-4, 194 | -3,0,-3,0,4,0,0,5,0, 195 | 0,3,0,-4,0,-4,0,5,0, 196 | 0,3,0,0,4,0,-5,0,-5, 197 | -4,-4,0,0,0,5,0,0,6, 198 | 0,0,4,-5,-5,0,0,0,6, 199 | 0,0,4,0,0,5,-6,-6,0), 9, 9) 200 | 201 | library(boot) #needed for simplex function 202 | 203 | s <- solve.game(A) 204 | round(cbind(s$x, s$y), 7) 205 | -------------------------------------------------------------------------------- /examples-R/SCR2e-chPerm.R: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | ### Statistical Computing with R 2e ### 3 | ### Maria L. 
Rizzo ### 4 | ### Chapman & Hall/CRC The R Series ### 5 | ### ISBN 9781466553323 - CAT# K15269 ### 6 | ### January 2019 ### 7 | ### ### 8 | ### R code for Chapter 10 ### 9 | ### Permutation Tests ### 10 | ####################################################### 11 | 12 | # packages to install: energy, yaImpute 13 | 14 | ### Example 10.1 (Permutation distribution of a statistic) 15 | 16 | attach(chickwts) 17 | x <- sort(weight[feed == "soybean"]) 18 | y <- sort(weight[feed == "linseed"]) 19 | detach(chickwts) 20 | 21 | R <- 999 #number of replicates 22 | z <- c(x, y) #pooled sample 23 | K <- 1:26 24 | reps <- numeric(R) #storage for replicates 25 | t0 <- t.test(x, y)$statistic 26 | 27 | for (i in 1:R) { 28 | #generate indices k for the first sample 29 | k <- sample(K, size = 14, replace = FALSE) 30 | x1 <- z[k] 31 | y1 <- z[-k] #complement of x1 32 | reps[i] <- t.test(x1, y1)$statistic 33 | } 34 | p <- mean(c(t0, reps) >= t0) 35 | p 36 | 37 | hist(reps, main = "", freq = FALSE, xlab = "T (p = 0.202)", 38 | breaks = "scott") 39 | points(t0, 0, cex = 1, pch = 16) #observed T 40 | 41 | 42 | ### Example 10.2 (Permutation distribution of the K-S statistic) 43 | 44 | # continues Example 10.1 45 | R <- 999 #number of replicates 46 | z <- c(x, y) #pooled sample 47 | K <- 1:26 48 | D <- numeric(R) #storage for replicates 49 | options(warn = -1) 50 | D0 <- ks.test(x, y, exact = FALSE)$statistic 51 | for (i in 1:R) { 52 | #generate indices k for the first sample 53 | k <- sample(K, size = 14, replace = FALSE) 54 | x1 <- z[k] 55 | y1 <- z[-k] #complement of x1 56 | D[i] <- ks.test(x1, y1, exact = FALSE)$statistic 57 | } 58 | p <- mean(c(D0, D) >= D0) 59 | options(warn = 0) 60 | p 61 | 62 | hist(D, main = "", freq = FALSE, xlab = "D (p = 0.46)", 63 | breaks = "scott") 64 | points(D0, 0, cex = 1, pch = 16) #observed D 65 | 66 | 67 | ### Example 10.3 (Example 10.2, cont.) 
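## Note: the samples change in this example to sunflower and linseed
## (12 observations each), so the pooled sample z and index set K from
## Example 10.1 no longer apply and are re-formed below.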
68 | 
69 | attach(chickwts)
70 | x <- sort(weight[feed == "sunflower"])
71 | y <- sort(weight[feed == "linseed"])
72 | detach(chickwts)
73 | 
74 | summary(cbind(x, y))
z <- c(x, y)  #re-form the pooled sample (12 sunflower + 12 linseed)
K <- 1:24     #index set for the new pooled sample
75 | options(warn = -1)
76 | D0 <- ks.test(x, y, exact = FALSE)$statistic
77 | for (i in 1:R) {
78 | #generate indices k for the first sample
79 | k <- sample(K, size = 12, replace = FALSE)
80 | x1 <- z[k]
81 | y1 <- z[-k] #complement of x1
82 | D[i] <- ks.test(x1, y1, exact = FALSE)$statistic
83 | }
84 | p <- mean(c(D0, D) >= D0)
85 | options(warn = 0)
86 | p
87 | 
88 | 
89 | ### Example 10.4 (Finding nearest neighbors)
90 | ### using yaImpute::ann
91 | 
92 | set.seed(439)
93 | library(yaImpute) #for ann function
94 | 
95 | #generate a small multivariate data set
96 | x <- matrix(rnorm(12), 3, 4)
97 | y <- matrix(rnorm(12), 3, 4)
98 | z <- rbind(x, y)
99 | k <- nrow(z) #number of nearest neighbors desired
100 | 
101 | ## Do an exact kd-tree search
102 | kd.exact <- ann(ref=z, target=z,
103 | tree.type="kd", k=k, verbose=FALSE)
104 | kd.exact$knnIndexDist[,1:k] #NN indices
105 | round(sqrt(kd.exact$knnIndexDist[,-(1:k)]),2) #Euclidean distances
106 | 
107 | ## Do an approximate kd-tree search
108 | kd.approx <- ann(ref=z, target=z,
109 | tree.type="kd", k=k, eps=100, verbose=FALSE)
110 | kd.approx$knnIndexDist[,1:k] #NN indices
111 | detach(package:yaImpute)
112 | 
113 | 
114 | ### Example 10.5 (Nearest neighbor statistic)
115 | ### using yaImpute::ann
116 | 
117 | library(yaImpute)
118 | attach(chickwts)
119 | x <- weight[feed == "sunflower"]
120 | y <- weight[feed == "linseed"]
121 | z <- as.matrix(c(x, y))
122 | detach(chickwts)
123 | 
124 | k <- 4 #want first 3 nearest neighbors
125 | NN <- ann(ref=z, target=z, tree.type="kd", k=k, verbose=FALSE)
126 | idx <- NN$knnIndexDist[,1:k]
127 | nn.idx <- idx[,-1] #first NN is in column 2
128 | 
129 | block1 <- nn.idx[1:12, ]
130 | block2 <- nn.idx[13:24, ]
131 | i1 <- sum(block1 < 12.5)
132 | i2 <- sum(block2 > 12.5)
133 | 
134 | c(i1, i2)
135 | 
136 | detach(package:yaImpute)
137 | 
138 | 
139 | ### Example 10.6 (Nearest neighbor test)
140 | ### using yaImpute::ann
141 | 
142 | library(boot)
143 | #continues the previous example
144 | 
145 | ## function to return the matrix of indices NN_j of nearest neighbors
146 | NN.idx <- function(x, tree.type="kd", k=NROW(x)) {
147 | x <- as.matrix(x)
148 | k <- min(c(k+1, NROW(x)))
149 | NN <- yaImpute::ann(ref=x, target=x,
150 | tree.type="kd", k=k, verbose=FALSE)
151 | idx <- NN$knnIndexDist[,1:k]
152 | nn.idx <- idx[,-1] #first NN is in column 2
153 | row.names(nn.idx) <- idx[,1]
154 | nn.idx
155 | }
156 | 
157 | ## function to compute the NN statistic T(n,3)
158 | Tn3 <- function(z, ix=1:NROW(z), sizes) {
159 | z <- as.matrix(z)
160 | n1 <- sizes[1]
161 | n2 <- sizes[2]
162 | n <- n1 + n2
163 | z <- as.matrix(z[ix, ])
164 | nn.idx <- NN.idx(z, k=3)
165 | block1 <- nn.idx[1:n1, ]
166 | block2 <- nn.idx[(n1+1):n, ]
167 | i1 <- sum(block1 < n1 + .5)
168 | i2 <- sum(block2 > n1 + .5)
169 | return((i1 + i2) / (3 * n))
170 | }
171 | 
172 | attach(chickwts)
173 | x <- weight[feed == "sunflower"]
174 | y <- weight[feed == "linseed"]
175 | z <- c(x, y)
176 | detach(chickwts)
177 | 
178 | N <- c(NROW(x), NROW(y))
179 | 
180 | boot.obj <- boot(data = z, statistic = Tn3,
181 | sim = "permutation", R = 999, sizes = N)
182 | boot.obj
183 | 
184 | tb <- c(boot.obj$t, boot.obj$t0)
185 | mean(tb >= boot.obj$t0)
186 | 
187 | hist(tb, freq=FALSE, main="",
188 | xlab="replicates of T(n,3) statistic")
189 | points(boot.obj$t0, 0, cex=1, pch=16)
190 | 
191 | 
192 | 
193 | 
194 | ### 
Example 10.7 (Two-sample energy statistic) 195 | 196 | edist.2 <- function(x, ix, sizes) { 197 | # computes the e-statistic between 2 samples 198 | # x: Euclidean distances of pooled sample 199 | # sizes: vector of sample sizes 200 | # ix: a permutation of row indices of x 201 | 202 | dst <- x 203 | n1 <- sizes[1] 204 | n2 <- sizes[2] 205 | ii <- ix[1:n1] 206 | jj <- ix[(n1+1):(n1+n2)] 207 | w <- n1 * n2 / (n1 + n2) 208 | 209 | # permutation applied to rows & cols of dist. matrix 210 | m11 <- sum(dst[ii, ii]) / (n1 * n1) 211 | m22 <- sum(dst[jj, jj]) / (n2 * n2) 212 | m12 <- sum(dst[ii, jj]) / (n1 * n2) 213 | e <- w * ((m12 + m12) - (m11 + m22)) 214 | return (e) 215 | } 216 | 217 | d <- 3 218 | a <- 2 / sqrt(d) 219 | x <- matrix(rnorm(20 * d), nrow = 20, ncol = d) 220 | y <- matrix(rnorm(10 * d, a, 1), nrow = 10, ncol = d) 221 | z <- rbind(x, y) 222 | dst <- as.matrix(dist(z)) 223 | 224 | edist.2(dst, 1:30, sizes = c(20, 10)) 225 | 226 | 227 | ### Example 10.8 (Two-sample energy test) 228 | 229 | library(boot) #for boot function 230 | dst <- as.matrix(dist(z)) 231 | N <- c(20, 10) 232 | 233 | boot.obj <- boot(data = dst, statistic = edist.2, 234 | sim = "permutation", R = 999, sizes = N) 235 | boot.obj 236 | 237 | #calculate the ASL 238 | e <- boot.obj$t0 239 | tb <- c(e, boot.obj$t) 240 | mean(tb >= e) 241 | 242 | hist(tb, main = "", breaks="scott", freq=FALSE, 243 | xlab="Replicates of e") 244 | points(e, 0, cex=1, pch=16) 245 | 246 | 247 | #energy test applied under F=G 248 | d <- 3 249 | a <- 0 250 | x <- matrix(rnorm(20 * d), nrow = 20, ncol = d) 251 | y <- matrix(rnorm(10 * d, a, 1), nrow = 10, ncol = d) 252 | z <- rbind(x, y) 253 | dst <- as.matrix(dist(z)) 254 | 255 | N <- c(20, 10) 256 | dst <- as.matrix(dist(z)) 257 | boot.obj <- boot(data = dst, statistic = edist.2, 258 | sim="permutation", R=999, sizes=N) 259 | boot.obj 260 | 261 | #calculate the ASL 262 | e <- boot.obj$t0 263 | E <- c(boot.obj$t, e) 264 | mean(E >= e) 265 | 266 | hist(E, main = "", breaks="scott", 267 | xlab="Replicates of e", freq=FALSE) 268 | points(e, 0, cex=1, pch=16) 269 | 270 | ### Example 10.9 (k-sample energy distances) 271 | 272 | z <- iris[ , 1:4] 273 | dst <- dist(z) 274 | energy::edist(dst, sizes = c(50, 50, 50), distance = TRUE) 275 | 276 | ### Example 10.10 (Distance Components (disco)) 277 | 278 | set.seed(413) 279 | energy::disco(iris[ , 1:4], factors = iris$Species, R = 999) 280 | 281 | ### Example 10.11 (Power Comparison) 282 | 283 | # results of several simulations summarized in Table 284 | 285 | ### Example 10.12 (Distance covariance statistic) 286 | 287 | dCov <- function(x, y) { 288 | x <- as.matrix(x) 289 | y <- as.matrix(y) 290 | n <- nrow(x) 291 | m <- nrow(y) 292 | if (n != m || n < 2) stop("Sample sizes must agree") 293 | if (! (all(is.finite(c(x, y))))) 294 | stop("Data contains missing or infinite values") 295 | 296 | Akl <- function(x) { 297 | d <- as.matrix(dist(x)) 298 | m <- rowMeans(d) 299 | M <- mean(d) 300 | a <- sweep(d, 1, m) 301 | b <- sweep(a, 2, m) 302 | return(b + M) 303 | } 304 | A <- Akl(x) 305 | B <- Akl(y) 306 | dCov <- sqrt(mean(A * B)) 307 | dCov 308 | } 309 | 310 | z <- as.matrix(iris[1:50, 1:4]) 311 | x <- z[ , 1:2] 312 | y <- z[ , 3:4] 313 | # compute the observed statistic 314 | dCov(x, y) 315 | 316 | ### Example 10.13 (Distance correlation statistic) 317 | 318 | DCOR <- function(x, y) { 319 | x <- as.matrix(x) 320 | y <- as.matrix(y) 321 | n <- nrow(x) 322 | m <- nrow(y) 323 | if (n != m || n < 2) stop("Sample sizes must agree") 324 | if (! 
(all(is.finite(c(x, y))))) 325 | stop("Data contains missing or infinite values") 326 | Akl <- function(x) { 327 | d <- as.matrix(dist(x)) 328 | m <- rowMeans(d) 329 | M <- mean(d) 330 | a <- sweep(d, 1, m) 331 | b <- sweep(a, 2, m) 332 | return(b + M) 333 | } 334 | A <- Akl(x) 335 | B <- Akl(y) 336 | dCov <- sqrt(mean(A * B)) 337 | dVarX <- sqrt(mean(A * A)) 338 | dVarY <- sqrt(mean(B * B)) 339 | dCor <- sqrt(dCov / sqrt(dVarX * dVarY)) 340 | list(dCov=dCov, dCor=dCor, dVarX=dVarX, dVarY=dVarY) 341 | } 342 | 343 | z <- as.matrix(iris[1:50, 1:4]) 344 | x <- z[ , 1:2] 345 | y <- z[ , 3:4] 346 | 347 | DCOR(x, y) 348 | 349 | 350 | ### Example 10.14 (Distance covariance test) 351 | 352 | ndCov2 <- function(z, ix, dims) { 353 | p <- dims[1] 354 | q1 <- p + 1 355 | d <- p + dims[2] 356 | x <- z[ , 1:p] #leave x as is 357 | y <- z[ix, q1:d] #permute rows of y 358 | return(nrow(z) * dCov(x, y)^2) 359 | } 360 | 361 | library(boot) 362 | z <- as.matrix(iris[1:50, 1:4]) 363 | boot.obj <- boot(data = z, statistic = ndCov2, R = 999, 364 | sim = "permutation", dims = c(2, 2)) 365 | 366 | tb <- c(boot.obj$t0, boot.obj$t) 367 | hist(tb, nclass="scott", xlab="", main="", 368 | freq=FALSE) 369 | points(boot.obj$t0, 0, cex=1, pch=16) 370 | 371 | mean(tb >= boot.obj$t0) 372 | boot.obj 373 | -------------------------------------------------------------------------------- /examples-R/SCR2e-chRandomProcess.R: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | ### Statistical Computing with R 2e ### 3 | ### Maria L. Rizzo ### 4 | ### Chapman & Hall/CRC The R Series ### 5 | ### ISBN 9781466553323 - CAT# K15269 ### 6 | ### January 2019 ### 7 | ### ### 8 | ### R code for Chapter 4 ### 9 | ### Generating Random Processes ### 10 | ####################################################### 11 | 12 | 13 | ### Example 4.1 (Poisson process) 14 | 15 | lambda <- 2 16 | t0 <- 3 17 | Tn <- rexp(100, lambda) #interarrival times 18 | Sn <- cumsum(Tn) #arrival times 19 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0] 20 | 21 | 22 | ### Example 4.2 (Poisson process, cont.) 
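## Note: this algorithm relies on a conditional property of the Poisson
## process: given N(upper) = n, the n unordered arrival times are iid
## Uniform(0, upper), so sorting simulated uniforms recovers the
## arrival times Sn.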
23 | 24 | lambda <- 2 25 | t0 <- 3 26 | upper <- 100 27 | pp <- numeric(10000) 28 | for (i in 1:10000) { 29 | N <- rpois(1, lambda * upper) 30 | Un <- runif(N, 0, upper) #unordered arrival times 31 | Sn <- sort(Un) #arrival times 32 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0] 33 | pp[i] <- n - 1 #arrivals in [0, t0] 34 | } 35 | 36 | #alternately, the loop can be replaced by replicate function 37 | pp <- replicate(10000, expr = { 38 | N <- rpois(1, lambda * upper) 39 | Un <- runif(N, 0, upper) #unordered arrival times 40 | Sn <- sort(Un) #arrival times 41 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0] 42 | n - 1 }) #arrivals in [0, t0] 43 | 44 | c(mean(pp), var(pp)) 45 | 46 | 47 | ### Example 4.3 (Nonhomogeneous Poisson process) 48 | 49 | lambda <- 3 50 | upper <- 100 51 | N <- rpois(1, lambda * upper) 52 | Tn <- rexp(N, lambda) 53 | Sn <- cumsum(Tn) 54 | Un <- runif(N) 55 | keep <- (Un <= cos(Sn)^2) #indicator, as logical vector 56 | Sn[keep] 57 | 58 | round(Sn[keep], 4) 59 | 60 | 61 | ### Example 4.4 (Renewal process) 62 | 63 | t0 <- 5 64 | Tn <- rgeom(100, prob = .2) #interarrival times 65 | Sn <- cumsum(Tn) #arrival times 66 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0] 67 | 68 | Nt0 <- replicate(1000, expr = { 69 | Sn <- cumsum(rgeom(100, prob = .2)) 70 | min(which(Sn > t0)) - 1 71 | }) 72 | table(Nt0)/1000 73 | Nt0 74 | 75 | t0 <- seq(0.1, 30, .1) 76 | mt <- numeric(length(t0)) 77 | 78 | for (i in 1:length(t0)) { 79 | mt[i] <- mean(replicate(1000, 80 | { 81 | Sn <- cumsum(rgeom(100, prob = .2)) 82 | min(which(Sn > t0[i])) - 1 83 | })) 84 | } 85 | plot(t0, mt, type = "l", xlab = "t", ylab = "mean") 86 | abline(0, .25) 87 | 88 | 89 | ### Example 4.5 (Symmetric random walk) 90 | 91 | n <- 400 92 | incr <- sample(c(-1, 1), size = n, replace = TRUE) 93 | S <- as.integer(c(0, cumsum(incr))) 94 | plot(0:n, S, type = "l", main = "", xlab = "i") 95 | 96 | ### Example 4.6 (Generator for the time until return to origin) 97 | 98 | set.seed(12345) 99 | 100 | #compute the probabilities directly 101 | n <- 1:10000 102 | p2n <- exp(lgamma(2*n-1) 103 | - log(n) - (2*n-1)*log(2) - 2*lgamma(n)) 104 | #or compute using dbinom 105 | P2n <- (.5/n) * dbinom(n-1, size = 2*n-2, prob = 0.5) 106 | pP2n <- cumsum(P2n) 107 | 108 | #given n compute the time of the last return to 0 in (0,n] 109 | n <- 200 110 | sumT <- 0 111 | while (sumT <= n) { 112 | u <- runif(1) 113 | s <- sum(u > pP2n) 114 | if (s == length(pP2n)) 115 | warning("T is truncated") 116 | Tj <- 2 * (1 + s) 117 | #print(c(Tj, sumT)) 118 | sumT <- sumT + Tj 119 | } 120 | sumT - Tj 121 | 122 | ### Example 4.7 (Brownian motion) 123 | 124 | 125 | simBM <- function(n, T) { 126 | times <- seq(0, T, length = n+1) 127 | z <- rnorm(n) 128 | w <- rep(0, n) 129 | s <- sqrt(diff(times)) 130 | for (k in 2:n) { 131 | w[k] <- w[k-1] + s[k] * z[k] 132 | } 133 | return (list(w=w, t=times)) 134 | } 135 | 136 | set.seed(1) 137 | n <- 200 138 | x1 <- simBM(n, 1) 139 | x2 <- simBM(n, 1) 140 | x3 <- simBM(n, 1) 141 | r <- range(c(x1$w, x2$w, x3$w)) 142 | plot(x1$w, type="l", main="", xlab="t", ylab="W", ylim=r) 143 | lines(x2$w, lty=2) 144 | lines(x3$w, lty=3) 145 | 146 | interpBM <- function(w, t0, times) { 147 | k1 <- sum(times < t0) 148 | k <- k1 + 1 149 | b <- (t0 - times[k1]) / (times[k] - times[k1]) 150 | return (w[k1] + b * (w[k] - w[k1])) 151 | } 152 | 153 | plot(x1$t[1:10], x1$w[1:10], type="b", main="", xlab="t", ylab="W") 154 | tmids <- x1$t + 0.0025 155 | for (i in 1:10) { 156 | w <- interpBM(x1$w, tmids[i], x1$t) 157 | points(tmids[i], w, 
pch=2) 158 | } 159 | 160 | legend("topleft", c("Generated W", "Interpolated W"), pch=c(1,2), lty=1, bty="n") 161 | -------------------------------------------------------------------------------- /examples-R/SCR2e-chRgen.R: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | ### Statistical Computing with R 2e ### 3 | ### Maria L. Rizzo ### 4 | ### Chapman & Hall/CRC The R Series ### 5 | ### ISBN 9781466553323 - CAT# K15269 ### 6 | ### January 2019 ### 7 | ### ### 8 | ### R code for Chapter 3 ### 9 | ### Methods for Generating Random Variables ### 10 | ####################################################### 11 | 12 | 13 | # packages to install: mvtnorm 14 | 15 | 16 | ### Example 3.1 (Sampling from a finite population) 17 | 18 | #toss some coins 19 | sample(0:1, size = 10, replace = TRUE) 20 | 21 | #choose some lottery numbers 22 | sample(1:100, size = 6, replace = FALSE) 23 | 24 | #permuation of letters a-z 25 | sample(letters) 26 | 27 | #sample from a multinomial distribution 28 | x <- sample(1:3, size = 100, replace = TRUE, 29 | prob = c(.2, .3, .5)) 30 | table(x) 31 | 32 | ### Example 3.2 (Inverse transform method, continuous case) 33 | 34 | n <- 1000 35 | u <- runif(n) 36 | x <- u^(1/3) 37 | hist(x, prob = TRUE, main = bquote(f(x)==3*x^2)) #density histogram of sample 38 | y <- seq(0, 1, .01) 39 | lines(y, 3*y^2) #density curve f(x) 40 | 41 | 42 | ### Example 3.4 (Two point distribution) 43 | 44 | n <- 1000 45 | p <- 0.4 46 | u <- runif(n) 47 | x <- as.integer(u > 0.6) #(u > 0.6) is a logical vector 48 | 49 | mean(x) 50 | var(x) 51 | 52 | 53 | ### Example 3.5 (Geometric distribution) 54 | 55 | n <- 1000 56 | p <- 0.25 57 | u <- runif(n) 58 | k <- ceiling(log(1-u) / log(1-p)) - 1 59 | 60 | # more efficient 61 | k <- floor(log(u) / log(1-p)) 62 | 63 | 64 | ### Example 3.6 (Logarithmic distribution) 65 | 66 | rlogarithmic <- function(n, theta) { 67 | #returns a random logarithmic(theta) sample size n 68 | u <- runif(n) 69 | #set the initial length of cdf vector 70 | N <- ceiling(-16 / log10(theta)) 71 | k <- 1:N 72 | a <- -1/log(1-theta) 73 | fk <- exp(log(a) + k * log(theta) - log(k)) 74 | Fk <- cumsum(fk) 75 | x <- integer(n) 76 | for (i in 1:n) { 77 | x[i] <- as.integer(sum(u[i] > Fk)) #F^{-1}(u)-1 78 | while (x[i] == N) { 79 | #if x==N we need to extend the cdf 80 | #very unlikely because N is large 81 | logf <- log(a) + (N+1)*log(theta) - log(N+1) 82 | fk <- c(fk, exp(logf)) 83 | Fk <- c(Fk, Fk[N] + fk[N+1]) 84 | N <- N + 1 85 | x[i] <- as.integer(sum(u[i] > Fk)) 86 | } 87 | } 88 | x + 1 89 | } 90 | 91 | n <- 1000 92 | theta <- 0.5 93 | x <- rlogarithmic(n, theta) 94 | #compute density of logarithmic(theta) for comparison 95 | k <- sort(unique(x)) 96 | p <- -1 / log(1 - theta) * theta^k / k 97 | se <- sqrt(p*(1-p)/n) #standard error 98 | 99 | round(rbind(table(x)/n, p, se),3) 100 | 101 | 102 | ### Example 3.7 (Acceptance-rejection method) 103 | 104 | n <- 1000 105 | k <- 0 #counter for accepted 106 | j <- 0 #iterations 107 | y <- numeric(n) 108 | 109 | while (k < n) { 110 | u <- runif(1) 111 | j <- j + 1 112 | x <- runif(1) #random variate from g 113 | if (x * (1-x) > u) { 114 | #we accept x 115 | k <- k + 1 116 | y[k] <- x 117 | } 118 | } 119 | 120 | j 121 | 122 | #compare empirical and theoretical percentiles 123 | p <- seq(.1, .9, .1) 124 | Qhat <- quantile(y, p) #quantiles of sample 125 | Q <- qbeta(p, 2, 2) #theoretical quantiles 126 | se <- sqrt(p * (1-p) / (n * dbeta(Q, 2, 2)^2)) #see 
Ch. 2 127 | round(rbind(Qhat, Q, se), 3) 128 | 129 | 130 | ### Example 3.8 (Beta distribution) 131 | 132 | n <- 1000 133 | a <- 3 134 | b <- 2 135 | u <- rgamma(n, shape=a, rate=1) 136 | v <- rgamma(n, shape=b, rate=1) 137 | x <- u / (u + v) 138 | 139 | q <- qbeta(ppoints(n), a, b) 140 | qqplot(q, x, cex=0.25, xlab="Beta(3, 2)", ylab="Sample") 141 | abline(0, 1) 142 | 143 | 144 | ### Example 3.9 (Logarithmic distribution, version 2) 145 | 146 | n <- 1000 147 | theta <- 0.5 148 | u <- runif(n) #generate logarithmic sample 149 | v <- runif(n) 150 | x <- floor(1 + log(v) / log(1 - (1 - theta)^u)) 151 | k <- 1:max(x) #calc. logarithmic probs. 152 | p <- -1 / log(1 - theta) * theta^k / k 153 | se <- sqrt(p*(1-p)/n) 154 | p.hat <- tabulate(x)/n 155 | 156 | print(round(rbind(p.hat, p, se), 3)) 157 | 158 | # The following function is a simple replacement for 159 | # rlogarithmic in Example 3.6 160 | 161 | rlogarithmic <- function(n, theta) { 162 | stopifnot(all(theta > 0 & theta < 1)) 163 | th <- rep(theta, length=n) 164 | u <- runif(n) 165 | v <- runif(n) 166 | x <- floor(1 + log(v) / log(1 - (1 - th)^u)) 167 | return(x) 168 | } 169 | 170 | 171 | ### Example 3.10 (Chisquare) 172 | 173 | n <- 1000 174 | nu <- 2 175 | X <- matrix(rnorm(n*nu), n, nu)^2 #matrix of sq. normals 176 | #sum the squared normals across each row: method 1 177 | y <- rowSums(X) 178 | #method 2 179 | y <- apply(X, MARGIN=1, FUN=sum) #a vector length n 180 | mean(y) 181 | mean(y^2) 182 | 183 | 184 | ### Example 3.11 (Convolutions and mixtures) 185 | 186 | n <- 1000 187 | x1 <- rgamma(n, 2, 2) 188 | x2 <- rgamma(n, 2, 4) 189 | s <- x1 + x2 #the convolution 190 | u <- runif(n) 191 | k <- as.integer(u > 0.5) #vector of 0's and 1's 192 | x <- k * x1 + (1-k) * x2 #the mixture 193 | 194 | par(mfcol=c(1,2)) #two graphs per page 195 | hist(s, prob=TRUE, xlim=c(0,5), ylim=c(0,1)) 196 | hist(x, prob=TRUE, xlim=c(0,5), ylim=c(0,1)) 197 | par(mfcol=c(1,1)) #restore display 198 | 199 | 200 | ### Example 3.12 (Mixture of several gamma distributions) 201 | # density estimates are plotted 202 | 203 | n <- 5000 204 | k <- sample(1:5, size=n, replace=TRUE, prob=(1:5)/15) 205 | rate <- 1/k 206 | x <- rgamma(n, shape=3, rate=rate) 207 | 208 | #plot the density of the mixture 209 | #with the densities of the components 210 | plot(density(x), xlim=c(0,40), ylim=c(0,.3), 211 | lwd=3, xlab="x", main="") 212 | for (i in 1:5) 213 | lines(density(rgamma(n, 3, 1/i))) 214 | 215 | 216 | ### Example 3.13 (Mixture of several gamma distributions) 217 | 218 | n <- 5000 219 | p <- c(.1,.2,.2,.3,.2) 220 | lambda <- c(1,1.5,2,2.5,3) 221 | k <- sample(1:5, size=n, replace=TRUE, prob=p) 222 | rate <- lambda[k] 223 | x <- rgamma(n, shape=3, rate=rate) 224 | 225 | 226 | ### Example 3.14 (Plot density of mixture) 227 | 228 | f <- function(x, lambda, theta) { 229 | #density of the mixture at the point x 230 | sum(dgamma(x, 3, lambda) * theta) 231 | } 232 | 233 | p <- c(.1,.2,.2,.3,.2) 234 | lambda <- c(1,1.5,2,2.5,3) 235 | 236 | x <- seq(0, 8, length=200) 237 | dim(x) <- length(x) #need for apply 238 | 239 | #compute density of the mixture f(x) along x 240 | y <- apply(x, 1, f, lambda=lambda, theta=p) 241 | 242 | #plot the density of the mixture 243 | plot(x, y, type="l", ylim=c(0,.85), lwd=3, ylab="Density") 244 | 245 | for (j in 1:5) { 246 | #add the j-th gamma density to the plot 247 | y <- apply(x, 1, dgamma, shape=3, rate=lambda[j]) 248 | lines(x, y) 249 | } 250 | 251 | 252 | ### Example 3.15 (Poisson-Gamma mixture) 253 | 254 | #generate a Poisson-Gamma mixture 
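# Note: this is the standard conjugate setup; if lambda ~ Gamma(r, beta)
# and X | lambda ~ Poisson(lambda), then marginally X is negative binomial
# with size r and success probability beta/(1+beta), which is what the
# comparison with dnbinom below checks.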
255 | n <- 1000 256 | r <- 4 257 | beta <- 3 258 | lambda <- rgamma(n, r, beta) #lambda is random 259 | 260 | #now supply the sample of lambda's as the Poisson mean 261 | x <- rpois(n, lambda) #the mixture 262 | 263 | #compare with negative binomial 264 | mix <- tabulate(x+1) / n 265 | negbin <- round(dnbinom(0:max(x), r, beta/(1+beta)), 3) 266 | se <- sqrt(negbin * (1 - negbin) / n) 267 | 268 | round(rbind(mix, negbin, se), 3) 269 | 270 | 271 | ### Example 3.16 (Spectral decomposition method) 272 | 273 | # mean and covariance parameters 274 | mu <- c(0, 0) 275 | Sigma <- matrix(c(1, .9, .9, 1), nrow = 2, ncol = 2) 276 | 277 | rmvn.eigen <- 278 | function(n, mu, Sigma) { 279 | # generate n random vectors from MVN(mu, Sigma) 280 | # dimension is inferred from mu and Sigma 281 | d <- length(mu) 282 | ev <- eigen(Sigma, symmetric = TRUE) 283 | lambda <- ev$values 284 | V <- ev$vectors 285 | R <- V %*% diag(sqrt(lambda)) %*% t(V) 286 | Z <- matrix(rnorm(n*d), nrow = n, ncol = d) 287 | X <- Z %*% R + matrix(mu, n, d, byrow = TRUE) 288 | X 289 | } 290 | 291 | # generate the sample 292 | X <- rmvn.eigen(1000, mu, Sigma) 293 | 294 | plot(X, xlab = "x", ylab = "y", pch = 20) 295 | print(colMeans(X)) 296 | print(cor(X)) 297 | 298 | 299 | ### Example 3.17 (SVD method) 300 | 301 | rmvn.svd <- 302 | function(n, mu, Sigma) { 303 | # generate n random vectors from MVN(mu, Sigma) 304 | # dimension is inferred from mu and Sigma 305 | d <- length(mu) 306 | S <- svd(Sigma) 307 | R <- S$u %*% diag(sqrt(S$d)) %*% t(S$v) #sq. root Sigma 308 | Z <- matrix(rnorm(n*d), nrow=n, ncol=d) 309 | X <- Z %*% R + matrix(mu, n, d, byrow=TRUE) 310 | X 311 | } 312 | 313 | 314 | ### Example 3.18 (Choleski factorization method) 315 | 316 | rmvn.Choleski <- 317 | function(n, mu, Sigma) { 318 | # generate n random vectors from MVN(mu, Sigma) 319 | # dimension is inferred from mu and Sigma 320 | d <- length(mu) 321 | Q <- chol(Sigma) # Choleski factorization of Sigma 322 | Z <- matrix(rnorm(n*d), nrow=n, ncol=d) 323 | X <- Z %*% Q + matrix(mu, n, d, byrow=TRUE) 324 | X 325 | } 326 | 327 | #generating the samples according to the mean and covariance 328 | #structure as the four-dimensional iris virginica data 329 | y <- subset(x=iris, Species=="virginica")[, 1:4] 330 | mu <- colMeans(y) 331 | Sigma <- cov(y) 332 | mu 333 | Sigma 334 | 335 | #now generate MVN data with this mean and covariance 336 | X <- rmvn.Choleski(200, mu, Sigma) 337 | pairs(X) 338 | 339 | ### Example 3.19 (Comparing performance of MVN generators) 340 | 341 | library(MASS) 342 | library(mvtnorm) 343 | n <- 100 #sample size 344 | d <- 30 #dimension 345 | N <- 2000 #iterations 346 | mu <- numeric(d) 347 | 348 | set.seed(100) 349 | system.time(for (i in 1:N) 350 | rmvn.eigen(n, mu, cov(matrix(rnorm(n*d), n, d)))) 351 | set.seed(100) 352 | system.time(for (i in 1:N) 353 | rmvn.svd(n, mu, cov(matrix(rnorm(n*d), n, d)))) 354 | set.seed(100) 355 | system.time(for (i in 1:N) 356 | rmvn.Choleski(n, mu, cov(matrix(rnorm(n*d), n, d)))) 357 | set.seed(100) 358 | system.time(for (i in 1:N) 359 | mvrnorm(n, mu, cov(matrix(rnorm(n*d), n, d)))) 360 | set.seed(100) 361 | system.time(for (i in 1:N) 362 | rmvnorm(n, mu, cov(matrix(rnorm(n*d), n, d)))) 363 | set.seed(100) 364 | system.time(for (i in 1:N) 365 | cov(matrix(rnorm(n*d), n, d))) 366 | 367 | detach(package:MASS) 368 | detach(package:mvtnorm) 369 | 370 | ### Example 3.20 (Multivariate normal mixture) 371 | 372 | library(MASS) #for mvrnorm 373 | #inefficient version loc.mix.0 with loops 374 | 375 | loc.mix.0 <- 
function(n, p, mu1, mu2, Sigma) { 376 | #generate sample from BVN location mixture 377 | X <- matrix(0, n, 2) 378 | 379 | for (i in 1:n) { 380 | k <- rbinom(1, size = 1, prob = p) 381 | if (k) 382 | X[i,] <- mvrnorm(1, mu = mu1, Sigma) else 383 | X[i,] <- mvrnorm(1, mu = mu2, Sigma) 384 | } 385 | return(X) 386 | } 387 | 388 | #more efficient version 389 | loc.mix <- function(n, p, mu1, mu2, Sigma) { 390 | #generate sample from BVN location mixture 391 | n1 <- rbinom(1, size = n, prob = p) 392 | n2 <- n - n1 393 | x1 <- mvrnorm(n1, mu = mu1, Sigma) 394 | x2 <- mvrnorm(n2, mu = mu2, Sigma) 395 | X <- rbind(x1, x2) #combine the samples 396 | return(X[sample(1:n), ]) #mix them 397 | } 398 | 399 | x <- loc.mix(1000, .5, rep(0, 4), 2:5, Sigma = diag(4)) 400 | r <- range(x) * 1.2 401 | par(mfrow = c(2, 2)) 402 | for (i in 1:4) 403 | hist(x[ , i], xlim = r, ylim = c(0, .3), freq = FALSE, 404 | main = "", breaks = seq(-5, 10, .5)) 405 | 406 | detach(package:MASS) 407 | par(mfrow = c(1, 1)) 408 | 409 | 410 | ### Example 3.21 (Generating variates on a sphere) 411 | 412 | runif.sphere <- function(n, d) { 413 | # return a random sample uniformly distributed 414 | # on the unit sphere in R ^d 415 | M <- matrix(rnorm(n*d), nrow = n, ncol = d) 416 | L <- apply(M, MARGIN = 1, 417 | FUN = function(x){sqrt(sum(x*x))}) 418 | D <- diag(1 / L) 419 | U <- D %*% M 420 | U 421 | } 422 | 423 | #generate a sample in d=2 and plot 424 | X <- runif.sphere(200, 2) 425 | par(pty = "s") 426 | plot(X, xlab = bquote(x[1]), ylab = bquote(x[2])) 427 | par(pty = "m") 428 | -------------------------------------------------------------------------------- /examples-R/SCR2e-chVis.R: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | ### Statistical Computing with R 2e ### 3 | ### Maria L. Rizzo ### 4 | ### Chapman & Hall/CRC The R Series ### 5 | ### ISBN 9781466553323 - CAT# K15269 ### 6 | ### January 2019 ### 7 | ### ### 8 | ### R code for Chapter 5 ### 9 | ### Visualization of Multivariate Data ### 10 | ####################################################### 11 | 12 | 13 | # packages to install: 14 | # bootstrap, corrplot, DAAG, FactoMineR, ggplot2, hexbin 15 | 16 | 17 | ### Example 5.1 (Scatterplot matrix) 18 | 19 | data(iris) 20 | #virginica data in first 4 columns of the last 50 obs. 21 | 22 | # not shown in text 23 | pairs(iris[101:150, 1:4]) 24 | 25 | panel.d <- function(x, ...) { 26 | usr <- par("usr") 27 | on.exit(par(usr=usr)) 28 | par(usr = c(usr[1:2], 0, .5)) 29 | lines(density(x)) 30 | } 31 | 32 | # Fig. 5.1 33 | x <- scale(iris[101:150, 1:4]) 34 | r <- range(x) 35 | pairs(x, diag.panel = panel.d, xlim = r, ylim = r) 36 | 37 | library(lattice) 38 | splom(iris[101:150, 1:4]) #plot 1 39 | 40 | #for all 3 at once, in color, plot 2 41 | splom(iris[,1:4], groups = iris$Species) 42 | 43 | # Fig. 
5.2 44 | #for all 3 at once, black and white, plot 3 45 | splom(~iris[1:4], groups = Species, data = iris, 46 | col = 1, pch = c(1, 2, 3), cex = c(.5,.5,.5)) 47 | 48 | ### Example 5.2 (Correlation plots: Decathlon data) 49 | 50 | library(FactoMineR) #decathlon data 51 | library(corrplot) 52 | data("decathlon") 53 | str(decathlon) 54 | 55 | corrMat <- cor(decathlon[, 1:10]) 56 | corrplot(corrMat, type="upper", tl.col="black", tl.srt=45) 57 | corrplot(corrMat, type = "upper", method = "square", 58 | addCoef.col = "black", diag=FALSE) 59 | 60 | ### Example 5.3 (Plot bivariate normal density) 61 | 62 | #the standard BVN density 63 | f <- function(x,y) { 64 | z <- (1/(2*pi)) * exp(-.5 * (x^2 + y^2)) 65 | } 66 | 67 | y <- x <- seq(-3, 3, length= 50) 68 | z <- outer(x, y, f) #compute density for all (x,y) 69 | 70 | persp(x, y, z) #the default plot 71 | 72 | persp(x, y, z, theta = 45, phi = 30, expand = 0.6, 73 | ltheta = 120, shade = 0.75, ticktype = "detailed", 74 | xlab = "X", ylab = "Y", zlab = "f(x, y)") 75 | 76 | 77 | ### Example 5.4 (Add elements to perspective plot) 78 | 79 | #store viewing transformation in M 80 | persp(x, y, z, theta = 45, phi = 30, 81 | expand = .4, box = FALSE) -> M 82 | 83 | #add some points along a circle 84 | a <- seq(-pi, pi, pi/16) 85 | newpts <- cbind(cos(a), sin(a)) * 2 86 | newpts <- cbind(newpts, 0, 1) #z=0, t=1 87 | N <- newpts %*% M 88 | points(N[,1]/N[,4], N[,2]/N[,4], col=2) 89 | 90 | #add lines 91 | x2 <- seq(-3, 3, .1) 92 | y2 <- -x2^2 / 3 93 | z2 <- dnorm(x2) * dnorm(y2) 94 | N <- cbind(x2, y2, z2, 1) %*% M 95 | lines(N[,1]/N[,4], N[,2]/N[,4], col=4) 96 | 97 | #add text 98 | x3 <- c(0, 3.1) 99 | y3 <- c(0, -3.1) 100 | z3 <- dnorm(x3) * dnorm(y3) * 1.1 101 | N <- cbind(x3, y3, z3, 1) %*% M 102 | text(N[1,1]/N[1,4], N[1,2]/N[1,4], "f(x,y)") 103 | text(N[2,1]/N[2,4], N[2,2]/N[2,4], bquote(y==-x^2/3)) 104 | 105 | 106 | ### Example 5.5 (Surface plot using wireframe(lattice)) 107 | 108 | library(lattice) 109 | x <- y <- seq(-3, 3, length= 50) 110 | 111 | xy <- expand.grid(x, y) 112 | z <- (1/(2*pi)) * exp(-.5 * (xy[,1]^2 + xy[,2]^2)) 113 | wireframe(z ~ xy[,1] * xy[,2]) 114 | 115 | 116 | ### Example 5.6 (3D scatterplot) 117 | 118 | library(lattice) 119 | attach(iris) 120 | #basic 3 color plot with arrows along axes 121 | print(cloud(Petal.Length ~ Sepal.Length * Sepal.Width, 122 | data=iris, groups=Species)) 123 | 124 | print(cloud(Sepal.Length ~ Petal.Length * Petal.Width, 125 | data = iris, groups = Species, main = "1", pch=1:3, 126 | scales = list(draw = FALSE), zlab = "SL", 127 | screen = list(z = 30, x = -75, y = 0)), 128 | split = c(1, 1, 2, 2), more = TRUE) 129 | 130 | print(cloud(Sepal.Width ~ Petal.Length * Petal.Width, 131 | data = iris, groups = Species, main = "2", pch=1:3, 132 | scales = list(draw = FALSE), zlab = "SW", 133 | screen = list(z = 30, x = -75, y = 0)), 134 | split = c(2, 1, 2, 2), more = TRUE) 135 | 136 | print(cloud(Petal.Length ~ Sepal.Length * Sepal.Width, 137 | data = iris, groups = Species, main = "3", pch=1:3, 138 | scales = list(draw = FALSE), zlab = "PL", 139 | screen = list(z = 30, x = -55, y = 0)), 140 | split = c(1, 2, 2, 2), more = TRUE) 141 | 142 | print(cloud(Petal.Width ~ Sepal.Length * Sepal.Width, 143 | data = iris, groups = Species, main = "4", pch=1:3, 144 | scales = list(draw = FALSE), zlab = "PW", 145 | screen = list(z = 30, x = -55, y = 0)), 146 | split = c(2, 2, 2, 2)) 147 | detach(iris) 148 | 149 | 150 | ### Example 5.7 (Contour plot) 151 | 152 | #contour plot with labels 153 | contour(volcano, asp = 1, labcex 
= 1) 154 | 155 | #another version from lattice package 156 | library(lattice) 157 | contourplot(volcano) #similar to above 158 | 159 | 160 | ### Example 5.8 (Filled contour plots) 161 | 162 | image(volcano, col = terrain.colors(100), axes = FALSE) 163 | contour(volcano, levels = seq(100,200,by = 10), add = TRUE) 164 | 165 | filled.contour(volcano, color = terrain.colors, asp = 1) 166 | levelplot(volcano, scales = list(draw = FALSE), 167 | xlab = "", ylab = "") 168 | 169 | 170 | ### Example 5.9 (2D histogram) 171 | 172 | library(hexbin) 173 | x <- matrix(rnorm(4000), 2000, 2) 174 | plot(hexbin(x[,1], x[,2])) 175 | 176 | # ggplot version 177 | library(ggplot2) 178 | x <- data.frame(x) 179 | ggplot(x, aes(x[,1], x[,2])) + geom_hex() 180 | 181 | ### Example 5.10 (Andrews curves) 182 | 183 | library(DAAG) 184 | attach(leafshape17) 185 | 186 | f <- function(a, v) { 187 | #Andrews curve f(a) for a data vector v in R^3 188 | v[1]/sqrt(2) + v[2]*sin(a) + v[3]*cos(a) 189 | } 190 | 191 | 192 | #scale data to range [-1, 1] 193 | x <- cbind(bladelen, petiole, bladewid) 194 | n <- nrow(x) 195 | mins <- apply(x, 2, min) #column minimums 196 | maxs <- apply(x, 2, max) #column maximums 197 | r <- maxs - mins #column ranges 198 | y <- sweep(x, 2, mins) #subtract column mins 199 | y <- sweep(y, 2, r, "/") #divide by range 200 | x <- 2 * y - 1 #now has range [-1, 1] 201 | 202 | #set up plot window, but plot nothing yet 203 | plot(0, 0, xlim = c(-pi, pi), ylim = c(-3,3), 204 | xlab = "t", ylab = "Andrews Curves", 205 | main = "", type = "n") 206 | 207 | #now add the Andrews curves for each observation 208 | #line type corresponds to leaf architecture 209 | #0=orthotropic, 1=plagiotropic 210 | a <- seq(-pi, pi, len=101) 211 | dim(a) <- length(a) 212 | for (i in 1:n) { 213 | g <- arch[i] + 1 214 | y <- apply(a, MARGIN = 1, FUN = f, v = x[i,]) 215 | lines(a, y, lty = g) 216 | } 217 | legend(3, c("Orthotropic", "Plagiotropic"), lty = 1:2) 218 | detach(leafshape17) 219 | 220 | 221 | ### Example 5.11 (Parallel coordinates) 222 | 223 | library(MASS) 224 | library(lattice) 225 | #trellis.device(color = FALSE) #black and white display 226 | x <- crabs[seq(5, 200, 5), ] #get every fifth obs. 227 | parallelplot(~x[4:8] | sp*sex, x) 228 | 229 | #trellis.device(color = FALSE) #black and white display 230 | x <- crabs[seq(5, 200, 5), ] #get every fifth obs. 231 | a <- x$CW * x$CL #area of carapace 232 | x[4:8] <- x[4:8] / sqrt(a) #adjust for size 233 | parallelplot(~x[4:8] | sp*sex, x) 234 | 235 | ### Example 5.12 (Segment plot) 236 | 237 | #segment plot 238 | x <- MASS::crabs[seq(5, 200, 5), ] #get every fifth obs. 
239 | x <- subset(x, sex == "M") #keep just the males 240 | a <- x$CW * x$CL #area of carapace 241 | x[4:8] <- x[4:8] / sqrt(a) #adjust for size 242 | 243 | #use default color palette or other colors 244 | #palette(gray(seq(.4, .95, len = 5))) #use gray scale 245 | palette(rainbow(6)) #or use color 246 | stars(x[4:8], draw.segments = TRUE, 247 | labels =levels(x$sp), nrow = 4, 248 | ylim = c(-2,10), key.loc = c(3,-1)) 249 | 250 | #after viewing, restore the default colors 251 | palette("default") 252 | 253 | 254 | ### Example 5.13 (PCA for open and closed book exams) 255 | 256 | library(bootstrap) 257 | str(scor) 258 | pairs(scor) 259 | cor(scor) 260 | 261 | n <- nrow(scor) 262 | x <- scale(scor) #center and scale 263 | s <- cov(x) 264 | e <- eigen(s) 265 | lam <- e$values #vector of eigenvalues 266 | P <- e$vectors #matrix of eigenvectors 267 | 268 | plot(lam, type = "b", xlab = "eigenvalues", main = "") 269 | barplot(lam, xlab = "eigenvalues") 270 | 271 | tab <- rbind(lam / sum(lam), cumsum(lam) / sum(lam)) 272 | tab 273 | 274 | z <- x %*% P 275 | dim(z) 276 | head(z) 277 | 278 | pc <- prcomp(scor, center = TRUE, scale = TRUE) 279 | summary(pc) 280 | 281 | df <- scor[1:5, ] 282 | predict(pc, newdata = df) #same as z above 283 | 284 | head(x %*% pc$rotation, 5) 285 | head(pc$rotation) 286 | head(P) 287 | 288 | 289 | ### Example 5.14 (PC Biplot) 290 | 291 | ## plot scor data in the (PC1, PC2) coordinate system 292 | biplot(pc, pc.biplot = TRUE) 293 | round(cor(x, z), 3) 294 | 295 | ### Example 5.15 (PCA: Decathlon data) 296 | 297 | library(FactoMineR) 298 | data(decathlon) 299 | pc <- princomp(decathlon[,1:10], cor = TRUE, scores = TRUE) 300 | plot(pc) # screeplot 301 | biplot(pc) 302 | summary(pc) 303 | 304 | -------------------------------------------------------------------------------- /examples-R/SCR2e-examples-functions.R: -------------------------------------------------------------------------------- 1 | rlogarithmic <- function(n, theta) { 2 | # generate random sample from Logarithmic(theta) 3 | stopifnot(all(theta > 0 & theta < 1)) 4 | th <- rep(theta, length=n) 5 | u <- runif(n) 6 | v <- runif(n) 7 | x <- floor(1 + log(v) / log(1 - (1 - th)^u)) 8 | return(x) 9 | } 10 | 11 | # generate MVN by spectral decomposition method 12 | 13 | rmvn.eigen <- 14 | function(n, mu, Sigma) { 15 | # generate n random vectors from MVN(mu, Sigma) 16 | # dimension is inferred from mu and Sigma 17 | d <- length(mu) 18 | ev <- eigen(Sigma, symmetric = TRUE) 19 | lambda <- ev$values 20 | V <- ev$vectors 21 | R <- V %*% diag(sqrt(lambda)) %*% t(V) 22 | Z <- matrix(rnorm(n*d), nrow = n, ncol = d) 23 | X <- Z %*% R + matrix(mu, n, d, byrow = TRUE) 24 | X 25 | } 26 | 27 | 28 | # generate MVN by singular value decomposition method 29 | 30 | rmvn.svd <- 31 | function(n, mu, Sigma) { 32 | # generate n random vectors from MVN(mu, Sigma) 33 | # dimension is inferred from mu and Sigma 34 | d <- length(mu) 35 | S <- svd(Sigma) 36 | R <- S$u %*% diag(sqrt(S$d)) %*% t(S$v) #sq. 
root Sigma 37 | Z <- matrix(rnorm(n*d), nrow=n, ncol=d) 38 | X <- Z %*% R + matrix(mu, n, d, byrow=TRUE) 39 | X 40 | } 41 | 42 | 43 | # generate MVN by Choleski factorization method 44 | 45 | rmvn.Choleski <- 46 | function(n, mu, Sigma) { 47 | # generate n random vectors from MVN(mu, Sigma) 48 | # dimension is inferred from mu and Sigma 49 | d <- length(mu) 50 | Q <- chol(Sigma) # Choleski factorization of Sigma 51 | Z <- matrix(rnorm(n*d), nrow=n, ncol=d) 52 | X <- Z %*% Q + matrix(mu, n, d, byrow=TRUE) 53 | X 54 | } 55 | 56 | # Bootstrap t confidence interval 57 | 58 | boot.t.ci <- 59 | function(x, B = 500, R = 100, level = .95, statistic){ 60 | #compute the bootstrap t CI 61 | x <- as.matrix(x); n <- nrow(x) 62 | stat <- numeric(B); se <- numeric(B) 63 | 64 | boot.se <- function(x, R, f) { 65 | #local function to compute the bootstrap 66 | #estimate of standard error for statistic f(x) 67 | x <- as.matrix(x); m <- nrow(x) 68 | th <- replicate(R, expr = { 69 | i <- sample(1:m, size = m, replace = TRUE) 70 | f(x[i, ]) 71 | }) 72 | return(sd(th)) 73 | } 74 | 75 | for (b in 1:B) { 76 | j <- sample(1:n, size = n, replace = TRUE) 77 | y <- x[j, ] 78 | stat[b] <- statistic(y) 79 | se[b] <- boot.se(y, R = R, f = statistic) 80 | } 81 | stat0 <- statistic(x) 82 | t.stats <- (stat - stat0) / se 83 | se0 <- sd(stat) 84 | alpha <- 1 - level 85 | Qt <- quantile(t.stats, c(alpha/2, 1-alpha/2), type = 1) 86 | names(Qt) <- rev(names(Qt)) 87 | CI <- rev(stat0 - Qt * se0) 88 | } 89 | 90 | 91 | # BCa bootstrap confidence interval 92 | 93 | boot.BCa <- 94 | function(x, th0, th, stat, conf = .95) { 95 | # bootstrap with BCa bootstrap confidence interval 96 | # th0 is the observed statistic 97 | # th is the vector of bootstrap replicates 98 | # stat is the function to compute the statistic 99 | 100 | x <- as.matrix(x) 101 | n <- nrow(x) #observations in rows 102 | N <- 1:n 103 | alpha <- (1 + c(-conf, conf))/2 104 | zalpha <- qnorm(alpha) 105 | 106 | # the bias correction factor 107 | z0 <- qnorm(sum(th < th0) / length(th)) 108 | 109 | # the acceleration factor (jackknife est.) 110 | th.jack <- numeric(n) 111 | for (i in 1:n) { 112 | J <- N[1:(n-1)] 113 | th.jack[i] <- stat(x[-i, ], J) 114 | } 115 | L <- mean(th.jack) - th.jack 116 | a <- sum(L^3)/(6 * sum(L^2)^1.5) 117 | 118 | # BCa conf. limits 119 | adj.alpha <- pnorm(z0 + (z0+zalpha)/(1-a*(z0+zalpha))) 120 | limits <- quantile(th, adj.alpha, type=6) 121 | return(list("est"=th0, "BCa"=limits)) 122 | } 123 | 124 | # Gelman-Rubin statistic for MCMC convergence 125 | 126 | Gelman.Rubin <- function(psi) { 127 | # psi[i,j] is the statistic psi(X[i,1:j]) 128 | # for chain in i-th row of X 129 | psi <- as.matrix(psi) 130 | n <- ncol(psi) 131 | k <- nrow(psi) 132 | 133 | psi.means <- rowMeans(psi) #row means 134 | B <- n * var(psi.means) #between variance est. 135 | psi.w <- apply(psi, 1, "var") #within variances 136 | W <- mean(psi.w) #within est. 137 | v.hat <- W*(n-1)/n + (B/n) #upper variance est. 
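  # Note: the ratio below compares the pooled estimate v.hat with the
  # mean within-chain variance W; values of r.hat near 1 indicate
  # approximate convergence (r.hat < 1.2 is a common rule of thumb)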
138 | r.hat <- v.hat / W #G-R statistic 139 | return(r.hat) 140 | } 141 | 142 | # binning bivariate data 143 | 144 | bin2d <- 145 | function(x, breaks1 = "Sturges", breaks2 = "Sturges"){ 146 | # Data matrix x is n by 2 147 | # breaks1, breaks2: any valid breaks for hist function 148 | # using same defaults as hist 149 | histg1 <- hist(x[,1], breaks = breaks1, plot = FALSE) 150 | histg2 <- hist(x[,2], breaks = breaks2, plot = FALSE) 151 | brx <- histg1$breaks 152 | bry <- histg2$breaks 153 | 154 | # bin frequencies 155 | freq <- table(cut(x[,1], brx), cut(x[,2], bry)) 156 | 157 | return(list(call = match.call(), freq = freq, 158 | breaks1 = brx, breaks2 = bry, 159 | mids1 = histg1$mids, mids2 = histg2$mids)) 160 | } 161 | -------------------------------------------------------------------------------- /examples-R/readme.md: -------------------------------------------------------------------------------- 1 | Code for the examples in "Statistical Computing with R, 2nd ed.". 2 | -------------------------------------------------------------------------------- /examples-Rmd/FOREARM.DAT: -------------------------------------------------------------------------------- 1 | 17.3 18.4 20.9 16.8 18.7 20.5 17.9 20.4 18.3 20.5 2 | 19.0 17.5 18.1 17.1 18.8 20.0 19.1 19.1 17.9 18.3 3 | 18.2 18.9 19.4 18.9 19.4 20.8 17.3 18.5 18.3 19.4 4 | 19.0 19.0 20.5 19.7 18.5 17.7 19.4 18.3 19.6 21.4 5 | 19.0 20.5 20.4 19.7 18.6 19.9 18.3 19.8 19.6 19.0 6 | 20.4 17.3 16.1 19.2 19.6 18.8 19.3 19.1 21.0 18.6 7 | 18.3 18.3 18.7 20.6 18.5 16.4 17.2 17.5 18.0 19.5 8 | 19.9 18.4 18.8 20.1 20.0 18.5 17.5 18.5 17.9 17.4 9 | 18.7 18.6 17.3 18.8 17.8 19.0 19.6 19.3 18.1 18.5 10 | 20.9 19.8 18.1 17.1 19.8 20.6 17.6 19.1 19.5 18.4 11 | 17.7 20.2 19.9 18.6 16.6 19.2 20.0 17.4 17.1 18.3 12 | 19.1 18.5 19.6 18.0 19.4 17.1 19.9 16.3 18.9 20.7 13 | 19.7 18.5 18.4 18.7 19.3 16.3 16.9 18.2 18.5 19.3 14 | 18.1 18.0 19.5 20.3 20.1 17.2 19.5 18.8 19.2 17.7 15 | -------------------------------------------------------------------------------- /examples-Rmd/Lahman.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Benchmarks on Lahman Batting data" 3 | author: "Maria Rizzo" 4 | date: "September 1, 2018" 5 | output: html_document 6 | --- 7 | 8 | ```{r} 9 | library(Lahman) 10 | library(dplyr) 11 | library(ggplot2) 12 | library(microbenchmark) 13 | ``` 14 | 15 | ## Define two functions to compare the two methods 16 | 17 | ```{r} 18 | use_dplyr <- function(yr) { 19 | Batting %>% filter(yearID == yr) -> b 20 | b %>% group_by(playerID) %>% 21 | summarize(AB = sum(AB), H = sum(H)) -> S 22 | S %>% mutate(AVG = round(H / AB, 3)) -> S 23 | S %>% filter(AB >= 400) -> S400 24 | S400 %>% arrange(desc(AVG)) -> S400 25 | slice(S400, 1:10) -> top 26 | top 27 | } 28 | 29 | no_dplyr <- function(yr) { 30 | S <- subset(Batting, Batting$yearID == yr, 31 | select = c("playerID", "AB", "H")) 32 | AB <- as.vector(by(S$AB, S$playerID, FUN = sum)) 33 | H <- as.vector(by(S$H, S$playerID, FUN = sum)) 34 | S <- data.frame(playerID = unique(S$playerID), 35 | AB = AB, H = H, AVG = round(H / AB, 3), 36 | stringsAsFactors = FALSE) 37 | S400 <- S[S$AB >= 400, ] 38 | o <- order(S400$AVG, decreasing = TRUE) 39 | S400 <- S400[o, ] 40 | top <- S400[1:10, ] 41 | top 42 | } 43 | ``` 44 | 45 | Check that these functions obtain the same results. 
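Note that when two objects differ, `all.equal` describes the differences in a character vector rather than returning `FALSE`, so a result other than `TRUE` lists what disagrees.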
46 | 47 | ```{r} 48 | all.equal(use_dplyr(1999), no_dplyr(1999)) 49 | ``` 50 | 51 | ## Benchmark comparison for dplyr method vs basic method 52 | 53 | ```{r} 54 | mb <- microbenchmark( 55 | dplyr = use_dplyr(1999), 56 | base = no_dplyr(1999) 57 | ) 58 | 59 | mb 60 | autoplot(mb) 61 | ``` 62 | 63 | Another version of the summary table: 64 | 65 | ```{r} 66 | knitr::kable(summary(mb)) 67 | ``` -------------------------------------------------------------------------------- /examples-Rmd/SCR2e-Intro.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Code for SCR 2e Examples" 3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e" 4 | output: 5 | html_document: 6 | code_folding: show 7 | theme: lumen 8 | toc: yes 9 | toc_float: yes 10 | df_print: paged 11 | --- 12 | 13 | # Chapter 1: Introduction 14 | 15 | ## Example 1.1 16 | 17 | ```{r } 18 | sumdice <- function(n) { 19 | k <- sample(1:6, size=n, replace=TRUE) 20 | return(sum(k)) 21 | } 22 | 23 | sumdice(2) 24 | 25 | #to store the result rather than print it 26 | a <- sumdice(100) 27 | 28 | #we expect the mean for 100 dice to be close to 3.5 29 | a / 100 30 | 31 | sumdice <- function(n) 32 | sum(sample(1:6, size=n, replace=TRUE)) 33 | 34 | sumdice <- function(n, sides = 6) { 35 | if (sides < 1) return (0) 36 | k <- sample(1:sides, size=n, replace=TRUE) 37 | return(sum(k)) 38 | } 39 | 40 | sumdice(5) #default 6 sides 41 | sumdice(n=5, sides=4) #4 sides 42 | ``` 43 | 44 | ## Example 1.2 (iris data) 45 | 46 | ```{r } 47 | names(iris) 48 | table(iris$Species) 49 | w <- iris[[2]] #Sepal.Width 50 | mean(w) 51 | 52 | attach(iris) 53 | summary(Petal.Length[51:100]) #versicolor petal length 54 | 55 | with(iris, summary(Petal.Length[51:100])) 56 | 57 | out <- with(iris, summary(Petal.Length[51:100])) 58 | 59 | by(iris[,1:4], Species, colMeans) 60 | detach(iris) 61 | ``` 62 | 63 | ## Example 1.3 (Arrays) 64 | 65 | ```{r } 66 | x <- 1:24 # vector 67 | dim(x) <- length(x) # 1 dimensional array 68 | matrix(1:24, nrow=4, ncol=6) # 4 by 6 matrix 69 | x <- array(1:24, c(3, 4, 2)) # 3 by 4 by 2 array 70 | ``` 71 | 72 | ## Example 1.4 (Matrices) 73 | 74 | ```{r } 75 | A <- matrix(0, nrow=2, ncol=2) 76 | A <- matrix(c(0, 0, 0, 0), nrow=2, ncol=2) 77 | A <- matrix(0, 2, 2) 78 | 79 | A <- matrix(1:8, nrow=2, ncol=4) 80 | ``` 81 | 82 | ## Example 1.5 (Iris data cont.) 
83 | 84 | ```{r } 85 | x <- as.matrix(iris[,1:4]) #all rows of columns 1 to 4 86 | 87 | mean(x[,2]) #mean of sepal width, all species 88 | mean(x[51:100,3]) #mean of petal length, versicolor 89 | 90 | y <- array(x, dim=c(50, 3, 4)) 91 | mean(y[,,2]) #mean of sepal width, all species 92 | mean(y[,2,3]) #mean of petal length, versicolor 93 | 94 | y <- array(c(x[1:50,], x[51:100,], x[101:150,]), 95 | dim=c(50, 4, 3)) 96 | mean(y[,2,]) #mean of sepal width, all species 97 | mean(y[,3,2]) #mean of petal length, versicolor 98 | ``` 99 | 100 | ## Example 1.6 (Run length encoding) 101 | 102 | ```{r } 103 | n <- 1000 104 | x <- rbinom(n, size = 1, prob = .5) 105 | table(x) 106 | head(x, 30) 107 | 108 | r <- rle(x) 109 | str(r) 110 | 111 | head(r$lengths) 112 | head(r[[1]]) 113 | 114 | max(r$lengths) 115 | log2(length(x)) 116 | ``` 117 | 118 | ## Example 1.7 (Named list) 119 | 120 | ```{r } 121 | w <- wilcox.test(rnorm(10), rnorm(10, 2)) 122 | w #print the summary 123 | 124 | w$statistic #stored in object w 125 | w$p.value 126 | unlist(w) 127 | unclass(w) 128 | ``` 129 | 130 | ## Example 1.8 (A list of names) 131 | 132 | ```{r } 133 | a <- matrix(runif(8), 4, 2) #a 4x2 matrix 134 | dimnames(a) <- list(NULL, c("x", "y")) 135 | 136 | # if we want row names 137 | dimnames(a) <- list(letters[1:4], c("x", "y")) 138 | a 139 | 140 | # another way to assign row names 141 | row.names(a) <- list("NE", "NW", "SW", "SE") 142 | a 143 | ``` 144 | 145 | ## Example 1.9 (Parallel boxplots) 146 | 147 | ```{r } 148 | boxplot(iris$Sepal.Length ~ iris$Species) 149 | 150 | boxplot(iris$Sepal.Length ~ iris$Species, 151 | ylab = "Sepal Length", boxwex = .4) 152 | ``` 153 | 154 | ## Example 1.10 (Plotting characters and colors) 155 | 156 | ```{r } 157 | plot(0:25, rep(1, 26), pch = 0:25) 158 | text(0:25, 0.9, 0:25) 159 | ``` 160 | 161 | ## Example 1.11 (Barplot for run lengths) 162 | 163 | ```{r } 164 | barplot(table(r$lengths)) #R graphics version 165 | 166 | ## ggplot version 167 | library(ggplot2) 168 | df <- data.frame(lengths = factor(r$lengths)) 169 | ggplot(df, aes(lengths)) + geom_bar() 170 | ``` 171 | 172 | ## Example 1.12 (Scatterplots) 173 | 174 | ```{r } 175 | ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) + 176 | geom_point() 177 | 178 | ggplot(iris, aes(Sepal.Length, Sepal.Width, 179 | color = Species, shape = Species)) + geom_point(size = 2) 180 | ``` 181 | 182 | ## Example 1.13 (ggplot: parallel boxplots and violin plots) 183 | 184 | ```{r } 185 | ggplot(iris, aes(Species, Sepal.Length)) + geom_boxplot() 186 | ggplot(iris, aes(Species, Sepal.Length)) + geom_violin() 187 | 188 | ggplot(iris, aes(Species, Sepal.Length)) + 189 | geom_boxplot() + coord_flip() 190 | ggplot(iris, aes(Species, Sepal.Length)) + 191 | geom_violin() + coord_flip() 192 | ``` 193 | 194 | ## Example 1.14 (MPG by engine displacement) 195 | 196 | ```{r } 197 | ggplot(mpg, aes(displ, hwy)) + 198 | geom_point() + 199 | facet_wrap(~ class) 200 | ``` 201 | 202 | ## Example 1.15 (Import data from a local text file) 203 | 204 | ```{r } 205 | # copy "FOREARM.DAT" into working directory 206 | forearm <- scan(file = "FOREARM.DAT") #a vector 207 | 208 | # use pathname if file is not in working directory 209 | # forearm <- scan(file = "./DATASETS/FOREARM.DAT") #a vector 210 | 211 | head(forearm) 212 | ``` 213 | 214 | ## Example 1.16 (Importing data from a web page) 215 | 216 | ```{r } 217 | fileloc <- "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data" 218 | 219 | df <- read.table(file = fileloc, na.strings 
= "?", as.is = TRUE) 220 | str(df) 221 | names(df) <- c("mpg", "cyl", "displ", "hp", "wt", "accel", 222 | "year", "origin", "name") 223 | summary(df) 224 | ``` 225 | 226 | ## Example 1.17 (Importing/exporting .csv files) 227 | 228 | ```{r } 229 | #create a data frame 230 | dates <- c("3/27/1995", "4/3/1995", 231 | "4/10/1995", "4/18/1995") 232 | prices <- c(11.1, 7.9, 1.9, 7.3) 233 | d <- data.frame(dates=dates, prices=prices) 234 | 235 | #create the .csv file 236 | filename <- "temp.csv" 237 | write.table(d, file = filename, sep = ",", 238 | row.names = FALSE) 239 | 240 | #read the .csv file 241 | read.table(file = filename, sep = ",", header = TRUE) 242 | read.csv(file = filename) #same thing 243 | ``` 244 | 245 | -------------------------------------------------------------------------------- /examples-Rmd/SCR2e-Programming.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Code for SCR 2e Examples" 3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e" 4 | output: 5 | html_document: 6 | code_folding: show 7 | theme: lumen 8 | toc: yes 9 | toc_float: yes 10 | --- 11 | 12 | # Chapter 15: Programming Topics 13 | 14 | packages to install: 15 | energy, ggplot2, profvis, pryr, microbenchmark, mvtnorm, rbenchmark 16 | 17 | for the last section: dplyr, Lahman, Rcpp 18 | 19 | ## Example 15.1 (Benchmarking methods to generate a sequence) 20 | 21 | ```{r } 22 | s1 <- 1:10 23 | s2 <- seq(1, 10, 1) 24 | s3 <- seq.int(1, 10, 1) 25 | df <- data.frame(s1=s1, s2=s2, s3=s3) 26 | str(df) 27 | 28 | library(microbenchmark) 29 | library(ggplot2) 30 | 31 | n <- 1000 32 | mb <- microbenchmark( 33 | seq(1, n, 1), 34 | seq.int(1, n, 1), 35 | 1:n 36 | ) 37 | 38 | print(mb) 39 | autoplot(mb) # display a violin plot 40 | ``` 41 | 42 | ## Example 15.2 (Benchmarking methods to initialize a vector) 43 | 44 | ```{r } 45 | n <- 100 46 | mb2 <- microbenchmark( 47 | numeric = numeric(n) + 1, 48 | rep = rep(1, n), 49 | seq = seq(from=1, to=1, length=n), 50 | ones = matrix(1, nrow=n, ncol=1), 51 | as.ones = as.matrix(rep(1, n)) 52 | ) 53 | 54 | print(mb2) 55 | ``` 56 | 57 | ## Example 15.3 (Timings of two multivariate normal generators) 58 | 59 | ```{r } 60 | library(rbenchmark) 61 | library(MASS) 62 | library(mvtnorm) 63 | n <- 100 #sample size 64 | d <- 30 #dimension 65 | N <- 2000 #iterations 66 | mu <- numeric(d) 67 | 68 | benchmark( 69 | columns = c("test", "replications", "elapsed", "relative"), 70 | replications = 2000, 71 | cov = {S <- cov(matrix(rnorm(n*d), n, d))}, 72 | mvrnorm = mvrnorm(n, mu, S), 73 | rmvnorm = rmvnorm(n, mu, S) 74 | ) 75 | ``` 76 | 77 | ## Example 15.4 (Profiling with Rprof) 78 | 79 | ```{r } 80 | x <- rnorm(1000) 81 | y <- rnorm(1000) 82 | 83 | Rprof("pr.out", line.profiling = TRUE) 84 | energy::dcor(x, y) 85 | Rprof(NULL) 86 | summaryRprof("pr.out") 87 | ``` 88 | 89 | ## Example 15.5 (profvis interactive visualization) 90 | 91 | ```{r } 92 | library(profvis) 93 | profvis(energy::dcor(x, y)) 94 | ``` 95 | 96 | ## Example 15.6 (Object size) 97 | 98 | ```{r } 99 | x <- matrix(rnorm(5000), 1000, 5) #1000 obs in R^5 100 | DF <- as.data.frame(x) 101 | object.size(x) 102 | object.size(DF) 103 | pryr::object_size(x) 104 | pryr::object_size(DF) 105 | pryr::compare_size(x) 106 | 107 | listTwo <- list(x, x) 108 | pryr::compare_size(listTwo) 109 | ``` 110 | 111 | ## Example 15.7 (Comparing objects and attributes) 112 | 113 | ```{r } 114 | str(x) 115 | str(DF) 116 | 
all.equal(x, DF) 117 | names(attributes(x)) 118 | names(attributes(DF)) 119 | all.equal(x, DF, check.attributes = FALSE) 120 | ``` 121 | 122 | ## Example 15.8 (Comparing objects for equality) 123 | 124 | ```{r } 125 | try(ifelse(all.equal(x, DF), "T", "F")) # error 126 | ifelse(isTRUE(all.equal(x, DF)), "T", "F") # correct 127 | 128 | x <- 1 - 10e-4 129 | y <- x + 2 130 | x == (y - 2) # equal mathematically but 131 | isTRUE(all.equal(x, y - 2)) #gives expected result 132 | 133 | ## does not necessarily evaluate to TRUE or FALSE 134 | try(ifelse(all.equal(x, y), "T", "F")) 135 | ## returns TRUE or FALSE 136 | ifelse(isTRUE(all.equal(x, y)), "T", "F") 137 | ``` 138 | 139 | ## Example 15.9 (Display R function code) 140 | 141 | ```{r } 142 | nclass.scott 143 | ``` 144 | 145 | ## Example 15.10 (RSiteSearch) 146 | 147 | ```{r } 148 | if (interactive()) 149 | RSiteSearch("ggcorr") 150 | ``` 151 | 152 | ## Example 15.11 (UseMethod) 153 | 154 | ```{r } 155 | body(density) 156 | args(density.default) 157 | body(density.default) 158 | ``` 159 | 160 | ## Example 15.12 (Show methods) 161 | 162 | ```{r } 163 | methods(t.test) 164 | getAnywhere(t.test.formula) 165 | body(stats:::t.test.formula) 166 | ``` 167 | 168 | ## Example 15.13 (Object not found or not an exported object) 169 | 170 | ```{r } 171 | try(perc.ci) 172 | try(boot::perc.ci) 173 | getAnywhere(perc.ci) 174 | args(boot:::perc.ci) 175 | body(boot:::perc.ci) 176 | boot:::perc.ci 177 | getFromNamespace("perc.ci", "boot") 178 | ``` 179 | 180 | ## Example 15.14 (getS3method) 181 | 182 | ```{r } 183 | library(microbenchmark) 184 | library(ggplot2) 185 | getAnywhere(autoplot) 186 | getS3method("autoplot", class = "microbenchmark") 187 | getAnywhere(autoplot.microbenchmark) 188 | ``` 189 | 190 | ## Example 15.15 (.Primitive or .Internal) 191 | 192 | ```{r } 193 | if (interactive()) 194 | pryr::show_c_source(.Primitive(cumsum(x))) 195 | ``` 196 | 197 | ## Example 15.16 (.Call, .External, .C or .Fortran) 198 | 199 | ```{r } 200 | #### dist is implemented by a .Call to C_Cdist 201 | 202 | body(dist) 203 | ``` 204 | 205 | ## Example 15.17 (A first Rcpp experiment) 206 | ### In RStudio use the menu 207 | ### File > New File > C++ file to display this code 208 | ## Example 15.18 (cppFunction) 209 | 210 | ```{r } 211 | library(Rcpp) 212 | 213 | set.seed(1) 214 | x <- matrix(rnorm(20), nrow = 5, ncol = 4) 215 | 216 | cppFunction('double vecnorm(NumericVector v) { 217 | // compute the Euclidean norm of vector v 218 | int d = v.size(); 219 | double s = 0.0; 220 | for (int i = 0; i < d; i++) 221 | s += v(i) * v(i); 222 | return sqrt(s); 223 | }') 224 | 225 | print(vecnorm(x[, 1])) 226 | print(apply(x, MARGIN = 2, FUN = "vecnorm")) 227 | ``` 228 | 229 | ## Example 15.19 (sourceCpp) 230 | ### Create the C++ source file "printme.cpp" by editing the template 231 | ### in RStudio menu: File > New File > C++ 232 | 233 | ```{r } 234 | library(Rcpp) 235 | sourceCpp("..//examples-cpp//printme.cpp") 236 | x <- sample(1:5) 237 | print_me(x) 238 | ``` 239 | 240 | ## Example 15.20 (Lahman baseball data) 241 | 242 | ```{r } 243 | library(Lahman) 244 | str(Batting) 245 | # method 1 246 | S <- subset(Batting, Batting$yearID == 1999, 247 | select = c("playerID", "AB", "H")) 248 | 249 | # method 2 (dplyr) 250 | library(dplyr) 251 | Batting %>% filter(yearID == 1999) -> b 252 | 253 | # method 1 254 | AB <- as.vector(by(S$AB, S$playerID, FUN = sum)) 255 | H <- as.vector(by(S$H, S$playerID, FUN = sum)) 256 | S <- data.frame(playerID = unique(S$playerID), 257 | AB = AB, H = H, AVG 
= round(H / AB, 3), 258 | stringsAsFactors = FALSE) 259 | S400 <- S[S$AB >= 400, ] 260 | 261 | # method 2 (dplyr) 262 | b %>% group_by(playerID) %>% 263 | summarize(AB = sum(AB), H = sum(H)) -> S 264 | S %>% mutate(AVG = round(H / AB, 3)) -> S 265 | S %>% filter(AB >= 400) -> S400 266 | 267 | # method 1 268 | o <- order(S400$AVG, decreasing = TRUE) 269 | S400 <- S400[o, ] 270 | top <- S400[1:10, ] 271 | 272 | # method 2 (dplyr) 273 | S400 %>% arrange(desc(AVG)) -> S400 274 | slice(S400, 1:10) -> top 275 | top 276 | 277 | People %>% select(playerID, nameFirst, nameLast) -> m 278 | top %>% inner_join(m) %>% 279 | select(nameFirst, nameLast, AVG) 280 | ``` 281 | 282 | ## Example 15.21 (Comparison with microbenchmark) 283 | ### The source for this example is in R Markdown format 284 | ### Open "Lahman.Rmd" in RStudio and knit to see report 285 | -------------------------------------------------------------------------------- /examples-Rmd/SCR2e-ResampAppl.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Code for SCR 2e Examples" 3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e" 4 | output: 5 | html_document: 6 | code_folding: show 7 | theme: lumen 8 | toc: yes 9 | toc_float: yes 10 | df_print: paged 11 | --- 12 | 13 | # Chapter 9: Resampling Applications 14 | 15 | ## Example 9.1 (Jackknife-after-bootstrap) 16 | 17 | ```{r } 18 | library(boot) 19 | library(bootstrap) 20 | set.seed(1111) 21 | 22 | theta.boot <- function(patch, i) { 23 | # function to compute the patch ratio statistic 24 | y <- patch[i, "y"] 25 | z <- patch[i, "z"] 26 | mean(y) / mean(z) 27 | } 28 | 29 | boot.out <- boot(bootstrap::patch, 30 | statistic = theta.boot, R=2000) 31 | A <- boot.array(boot.out) 32 | head(A, 3) 33 | mean(A[, 1] == 0) 34 | ``` 35 | 36 | jackknife-after-bootstrap to est. se(se) 37 | 38 | ```{r } 39 | A <- boot.array(boot.out) 40 | theta.b <- boot.out$t 41 | n <- NROW(patch) 42 | jack.se <- numeric(n) 43 | 44 | for (i in 1:n) { 45 | #in i-th replicate omit all samples with x[i] 46 | keep <- which(A[, i] == 0) 47 | jack.se[i] <- sd(theta.b[keep]) 48 | } 49 | 50 | print(boot.out) #for se_boot 51 | se.bar <- mean(jack.se) 52 | se.se <- sqrt((n-1) * mean((jack.se - se.bar)^2)) 53 | print(paste("Jackknife-after-bootstrap est. se(se)=", se.se)) 54 | ``` 55 | 56 | ## Example 9.2 (Jackknife-after-bootstrap) 57 | initialize 58 | 59 | ```{r } 60 | data(patch, package = "bootstrap") 61 | y <- patch$y 62 | z <- patch$z 63 | dat <- cbind(y, z) 64 | n <- NROW(dat) 65 | B <- 2000 66 | ``` 67 | 68 | jackknife-after-bootstrap step 1: run the bootstrap 69 | 70 | ```{r } 71 | theta_boot <- function(dat, ind) { 72 | # function to compute the statistic 73 | y <- dat[ind, 1] 74 | z <- dat[ind, 2] 75 | mean(y) / mean(z) 76 | } 77 | 78 | boot.obj <- boot(dat, statistic = theta_boot, R=2000) 79 | theta.hat <- boot.obj$t0 80 | theta.b <- boot.obj$t 81 | se.boot <- sd(theta.b) 82 | ``` 83 | 84 | jackknife-after-bootstrap to est. 
se(se) 85 | 86 | ```{r } 87 | sample.freq <- boot.array(boot.obj) 88 | se.se.reps <- numeric(n) 89 | N <- 1:n 90 | 91 | for (i in N) { 92 | # jackknife-after-bootstrap 93 | # omit all bootstrap samples that contain obs i 94 | keep <- which(sample.freq[ ,i] == 0) 95 | se.se.reps[i] <- sd(theta.b[keep]) 96 | } 97 | 98 | print(boot.obj) 99 | se.bar <- mean(se.se.reps) 100 | se.se <- sqrt((n-1) * mean((se.se.reps - se.bar)^2)) 101 | se.se 102 | ``` 103 | 104 | ## Example 9.3 (ironslag linear model) 105 | 106 | ```{r } 107 | library(ggplot2) 108 | library(DAAG) 109 | L1 <- lm(magnetic ~ chemical, data=ironslag) 110 | cf3 <- round(L1$coeff, 3) 111 | cap <- paste("Fit: magnetic =", cf3[1], "+", cf3[2], "chemical") 112 | 113 | ggplot(data=ironslag, aes(chemical, magnetic)) + 114 | geom_point() + geom_smooth(method="lm") + 115 | ggtitle(cap) 116 | 117 | plot(L1, which=1:2, ask=FALSE) #residual plots 118 | ``` 119 | 120 | ## Example 9.4 (mammals data) 121 | 122 | ```{r, mammals1} 123 | library(MASS) 124 | cor(log(mammals$body), log(mammals$brain)) 125 | summary(mammals) 126 | 127 | y <- log(mammals$brain) 128 | x <- log(mammals$body) 129 | L <- lm(y ~ x) 130 | L 131 | 132 | cap <- paste("Fit: log(brain) =", round(L$coeff[1],3), 133 | "+", round(L$coeff[2],3), "log(body)") 134 | ggplot(data=mammals, aes(x, y)) + 135 | geom_point() + geom_smooth(method="lm") + 136 | labs(x = "log(body)", y = "log(brain)", title = cap) 137 | 138 | summary(L)$r.squared 139 | ``` 140 | 141 | ## Example 9.5 (ironslag data, resampling cases) 142 | 143 | ```{r } 144 | x <- ironslag$chemical 145 | y <- ironslag$magnetic 146 | m <- 2000 147 | n <- NROW(x) 148 | L1 <- lm(y ~ x) #estimate the model 149 | b0 <- L1$coeff[1]; b1 <- L1$coeff[2] 150 | ``` 151 | 152 | #### run bootstrap of cases 153 | 154 | ```{r } 155 | out <- replicate(m, expr={ 156 | i <- sample(1:n, replace=TRUE, size=n) 157 | xstar <- x[i] 158 | ystar <- y[i] 159 | Lb <- lm(ystar ~ xstar) 160 | s <- summary(Lb)$sigma 161 | c(Lb$coeff[1], slope=Lb$coeff[2], s=s) 162 | }) 163 | 164 | bootCases <- t(out) 165 | meanCases <- colMeans(bootCases) 166 | sdCases <- apply(bootCases, 2, "sd") 167 | meanCases 168 | sdCases 169 | 170 | biasInt <- mean(bootCases[,1] - b0) #bias for intercept 171 | biasSlope <- mean(bootCases[,2] - b1) #bias for slope 172 | 173 | rbind(estimate=c(b0, b1), bias=c(biasInt, biasSlope), 174 | se=sdCases[1:2], cv=c(biasInt, cv=biasSlope)/sdCases[1:2]) 175 | ``` 176 | 177 | ## Example 9.6 (Resampling cases using the boot function) 178 | 179 | 180 | ```{r } 181 | set.seed(1104) 182 | library(boot) 183 | m <- 2000 184 | stats <- function(dat, i) { 185 | x <- dat$chemical[i] 186 | y <- dat$magnetic[i] 187 | Lb <- lm(y ~ x) 188 | s <- summary(Lb)$sigma 189 | c(Lb$coeff[1], slope=Lb$coeff[2], s=s) 190 | } 191 | 192 | boot.out <- boot(ironslag, statistic=stats, R=2000) 193 | boot.out 194 | boot.out$t0 195 | 196 | sd(boot.out$t[,2]) 197 | boottbl <- broom::tidy(boot.out) 198 | boottbl$std.error[2] 199 | 200 | MASS::truehist(boot.out$t[ ,2], main="", xlab="slopes") 201 | abline(v = boot.out$t0[2], lwd=2) 202 | 203 | boot.ci(boot.out, index=2, type=c("norm","perc","basic","bca")) 204 | ``` 205 | 206 | ## Example 9.7 (Resampling errors: mammals data) 207 | 208 | Resampling errors: 209 | 210 | ```{r} 211 | x <- log(mammals$brain) 212 | y <- log(mammals$body) 213 | L <- lm(y ~ x) 214 | ``` 215 | 216 | ```{r} 217 | m.resid <- rstandard(L, sd = 1) 218 | r <- m.resid - mean(m.resid) 219 | m <- 1000; n <- NROW(x) 220 | estsErr <- replicate(m, expr={ 221 | estar <- 
sample(r, replace=TRUE, size=n) 222 | ystar <- L$fitted.values + estar 223 | Lb <- lm(ystar ~ x) 224 | s <- summary(Lb)$sigma 225 | c(b0=Lb$coeff[1], b1=Lb$coeff[2], s=s) 226 | }) 227 | ests <- t(estsErr) 228 | summary(ests) 229 | ``` 230 | 231 | ## Example 9.8 (Resampling errors, continued) 232 | 233 | ```{r } 234 | sd(ests[,2]) 235 | s <- summary(L)$sigma 236 | SSx <- (n - 1) * var(x) 237 | se.beta1 <- sqrt(s^2 / SSx) 238 | se.beta1 239 | s * sqrt(1/n + mean(x)^2 / SSx) 240 | sd(ests[,1]) 241 | 242 | betas <- summary(L)$coeff 243 | betas 244 | betas[, "Std. Error"] 245 | 246 | broom::tidy(summary(L)) 247 | broom::tidy(summary(L))$std.error 248 | ``` 249 | 250 | ## Example 9.9 (Model based resampling with the boot function) 251 | 252 | ```{r } 253 | regstats <- function(dat, i) { 254 | #dat is a data frame (r, x, yhat) 255 | #r are the modified centered residuals, yhat are the fits 256 | ystar <- dat$yhat + dat$r[i] 257 | xstar <- dat$x 258 | Lnew <- lm(ystar ~ xstar) 259 | Lnew$coefficients 260 | } 261 | 262 | y <- log(mammals$brain) 263 | x <- log(mammals$body) 264 | L <- lm(y ~ x) 265 | r <- rstandard(L, sd=1) 266 | r <- r - mean(r) 267 | df <- data.frame(r=r, x=x, yhat=L$fitted) 268 | head(df) 269 | boot.obj <- boot(data=df, statistic=regstats, R=2000) 270 | broom::tidy(boot.obj) 271 | ``` 272 | 273 | ## Example 9.10 (Empirical influence values for the patch ratio statistic) 274 | 275 | ```{r } 276 | library(boot) 277 | library(bootstrap) 278 | theta_boot <- function(dat, ind) { 279 | # function to compute the patch ratio statistic 280 | mean(dat[ind, ]$y) / mean(dat[ind, ]$z) 281 | } 282 | boot.out <- boot(patch, theta_boot, R = 2000) 283 | infl <- empinf(boot.out, type = "jack") 284 | theta.hat <- boot.out$t0 285 | jack <- theta.hat - infl / (nrow(patch) - 1) 286 | rbind(infl, jack) 287 | ``` 288 | 289 | ## Example 9.11 (Jackknife-after-bootstrap plot) 290 | 291 | ```{r } 292 | jack.after.boot(boot.out, useJ=TRUE, stinf=FALSE) 293 | 294 | n <- NROW(patch) 295 | J <- numeric(n) 296 | b.freq <- boot.array(boot.out) 297 | theta.b <- boot.out$t 298 | 299 | for (i in 1:n) { 300 | keep <- which(b.freq[ ,i] == 0) 301 | J[i] <- mean(theta.b[keep]) 302 | } 303 | ``` 304 | 305 | the jackknife influence values 306 | 307 | ```{r } 308 | (n - 1) * (mean(J) - J) 309 | 310 | jack.after.boot(boot.out, useJ=TRUE, stinf=TRUE) 311 | ``` 312 | 313 | -------------------------------------------------------------------------------- /examples-Rmd/SCR2e-chBoot.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Code for SCR 2e Examples" 3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. 
https://github.com/mariarizzo/SCR2e" 4 | output: 5 | html_document: 6 | code_folding: show 7 | theme: lumen 8 | toc: yes 9 | toc_float: yes 10 | df_print: paged 11 | --- 12 | 13 | # Chapter 8: Bootstrap and Jackknife 14 | 15 | ## Example 8.2 (Bootstrap estimate of standard error) 16 | 17 | ```{r } 18 | library(bootstrap) #for the law data 19 | print(cor(law$LSAT, law$GPA)) 20 | print(cor(law82$LSAT, law82$GPA)) 21 | 22 | #set up the bootstrap 23 | B <- 200 #number of replicates 24 | n <- nrow(law) #sample size 25 | R <- numeric(B) #storage for replicates 26 | 27 | #bootstrap estimate of standard error of R 28 | for (b in 1:B) { 29 | #randomly select the indices 30 | i <- sample(1:n, size = n, replace = TRUE) 31 | LSAT <- law$LSAT[i] #i is a vector of indices 32 | GPA <- law$GPA[i] 33 | R[b] <- cor(LSAT, GPA) 34 | } 35 | #output 36 | print(se.R <- sd(R)) 37 | hist(R, prob = TRUE) 38 | ``` 39 | 40 | ## Example 8.3 (Bootstrap estimate of standard error: boot function) 41 | 42 | ```{r } 43 | r <- function(x, i) { 44 | #want correlation of columns 1 and 2 45 | cor(x[i,1], x[i,2]) 46 | } 47 | 48 | library(boot) #for boot function 49 | obj <- boot(data = law, statistic = r, R = 2000) 50 | obj 51 | y <- obj$t 52 | sd(y) 53 | ``` 54 | 55 | ## Example 8.4 (Bootstrap estimate of bias) 56 | 57 | ```{r } 58 | #sample estimate for n=15 59 | theta.hat <- cor(law$LSAT, law$GPA) 60 | 61 | #bootstrap estimate of bias 62 | B <- 2000 #larger for estimating bias 63 | n <- nrow(law) 64 | theta.b <- numeric(B) 65 | 66 | for (b in 1:B) { 67 | i <- sample(1:n, size = n, replace = TRUE) 68 | LSAT <- law$LSAT[i] 69 | GPA <- law$GPA[i] 70 | theta.b[b] <- cor(LSAT, GPA) 71 | } 72 | bias <- mean(theta.b - theta.hat) 73 | bias 74 | ``` 75 | 76 | ## Example 8.5 (Bootstrap estimate of bias of a ratio estimate) 77 | 78 | ```{r } 79 | data(patch, package = "bootstrap") 80 | patch 81 | 82 | n <- nrow(patch) #in bootstrap package 83 | B <- 2000 84 | theta.b <- numeric(B) 85 | theta.hat <- mean(patch$y) / mean(patch$z) 86 | 87 | #bootstrap 88 | for (b in 1:B) { 89 | i <- sample(1:n, size = n, replace = TRUE) 90 | y <- patch$y[i] 91 | z <- patch$z[i] 92 | theta.b[b] <- mean(y) / mean(z) 93 | } 94 | bias <- mean(theta.b) - theta.hat 95 | se <- sd(theta.b) 96 | print(list(est=theta.hat, bias = bias, 97 | se = se, cv = bias/se)) 98 | ``` 99 | 100 | ## Example 8.6 (Jackknife estimate of bias) 101 | 102 | ```{r } 103 | data(patch, package = "bootstrap") 104 | n <- nrow(patch) 105 | y <- patch$y 106 | z <- patch$z 107 | theta.hat <- mean(y) / mean(z) 108 | print (theta.hat) 109 | 110 | #compute the jackknife replicates, leave-one-out estimates 111 | theta.jack <- numeric(n) 112 | for (i in 1:n) 113 | theta.jack[i] <- mean(y[-i]) / mean(z[-i]) 114 | bias <- (n - 1) * (mean(theta.jack) - theta.hat) 115 | 116 | print(bias) #jackknife estimate of bias 117 | ``` 118 | 119 | ## Example 8.7 (Jackknife estimate of standard error) 120 | 121 | ```{r } 122 | se <- sqrt((n-1) * 123 | mean((theta.jack - mean(theta.jack))^2)) 124 | print(se) 125 | ``` 126 | 127 | ## Example 8.8 (Failure of jackknife) 128 | 129 | ```{r } 130 | set.seed(123) #for the specific example given 131 | #change the seed to see other examples 132 | 133 | n <- 10 134 | x <- sample(1:100, size = n) 135 | 136 | #jackknife estimate of se 137 | M <- numeric(n) 138 | for (i in 1:n) { #leave one out 139 | y <- x[-i] 140 | M[i] <- median(y) 141 | } 142 | Mbar <- mean(M) 143 | print(sqrt((n-1)/n * sum((M - Mbar)^2))) 144 | 145 | #bootstrap estimate of se 146 | Mb <- replicate(1000, 
expr = { 147 | y <- sample(x, size = n, replace = TRUE) 148 | median(y) }) 149 | print(sd(Mb)) 150 | print(x) 151 | print(M) 152 | print(Mb) 153 | ``` 154 | 155 | ## Example 8.9 (Bootstrap confidence intervals for patch ratio statistic) 156 | 157 | ```{r } 158 | library(boot) #for boot and boot.ci 159 | data(patch, package = "bootstrap") 160 | 161 | theta.boot <- function(dat, ind) { 162 | #function to compute the statistic 163 | y <- dat[ind, 1] 164 | z <- dat[ind, 2] 165 | mean(y) / mean(z) 166 | } 167 | 168 | y <- patch$y 169 | z <- patch$z 170 | dat <- cbind(y, z) 171 | boot.obj <- boot(dat, statistic = theta.boot, R = 2000) 172 | 173 | print(boot.obj) 174 | print(boot.ci(boot.obj, 175 | type = c("basic", "norm", "perc"))) 176 | 177 | 178 | #calculations for bootstrap confidence intervals 179 | alpha <- c(.025, .975) 180 | 181 | #normal 182 | print(boot.obj$t0 + qnorm(alpha) * sd(boot.obj$t)) 183 | 184 | #basic 185 | print(2*boot.obj$t0 - 186 | quantile(boot.obj$t, rev(alpha), type=1)) 187 | 188 | #percentile 189 | print(quantile(boot.obj$t, alpha, type=6)) 190 | ``` 191 | 192 | ## Example 8.10 (Bootstrap confidence intervals for the correlation statistic) 193 | 194 | ```{r } 195 | library(boot) 196 | data(law, package = "bootstrap") 197 | boot.obj <- boot(law, R = 2000, 198 | statistic = function(x, i){cor(x[i,1], x[i,2])}) 199 | print(boot.ci(boot.obj, type=c("basic","norm","perc"))) 200 | ``` 201 | 202 | ## Example 8.11 (Bootstrap t confidence interval) 203 | 204 | ```{r } 205 | boot.t.ci <- 206 | function(x, B = 500, R = 100, level = .95, statistic){ 207 | #compute the bootstrap t CI 208 | x <- as.matrix(x); n <- nrow(x) 209 | stat <- numeric(B); se <- numeric(B) 210 | 211 | boot.se <- function(x, R, f) { 212 | #local function to compute the bootstrap 213 | #estimate of standard error for statistic f(x) 214 | x <- as.matrix(x); m <- nrow(x) 215 | th <- replicate(R, expr = { 216 | i <- sample(1:m, size = m, replace = TRUE) 217 | f(x[i, ]) 218 | }) 219 | return(sd(th)) 220 | } 221 | 222 | for (b in 1:B) { 223 | j <- sample(1:n, size = n, replace = TRUE) 224 | y <- x[j, ] 225 | stat[b] <- statistic(y) 226 | se[b] <- boot.se(y, R = R, f = statistic) 227 | } 228 | stat0 <- statistic(x) 229 | t.stats <- (stat - stat0) / se 230 | se0 <- sd(stat) 231 | alpha <- 1 - level 232 | Qt <- quantile(t.stats, c(alpha/2, 1-alpha/2), type = 1) 233 | names(Qt) <- rev(names(Qt)) 234 | CI <- rev(stat0 - Qt * se0) 235 | } 236 | ``` 237 | 238 | ## Example 8.12 (Bootstrap t confidence interval for patch ratio statistic) 239 | 240 | ```{r } 241 | #boot package and patch data were loaded in Example 8.10 242 | #library(boot) #for boot and boot.ci 243 | #data(patch, package = "bootstrap") 244 | 245 | dat <- cbind(patch$y, patch$z) 246 | stat <- function(dat) { 247 | mean(dat[, 1]) / mean(dat[, 2]) } 248 | ci <- boot.t.ci(dat, statistic = stat, B=2000, R=200) 249 | print(ci) 250 | ``` 251 | 252 | ## Example 8.13 (BCa bootstrap confidence interval) 253 | 254 | ```{r } 255 | boot.BCa <- 256 | function(x, th0, th, stat, conf = .95) { 257 | # bootstrap with BCa bootstrap confidence interval 258 | # th0 is the observed statistic 259 | # th is the vector of bootstrap replicates 260 | # stat is the function to compute the statistic 261 | 262 | x <- as.matrix(x) 263 | n <- nrow(x) #observations in rows 264 | N <- 1:n 265 | alpha <- (1 + c(-conf, conf))/2 266 | zalpha <- qnorm(alpha) 267 | 268 | # the bias correction factor 269 | z0 <- qnorm(sum(th < th0) / length(th)) 270 | 271 | # the acceleration factor (jackknife 
est.) 272 | th.jack <- numeric(n) 273 | for (i in 1:n) { 274 | J <- N[1:(n-1)] 275 | th.jack[i] <- stat(x[-i, ], J) 276 | } 277 | L <- mean(th.jack) - th.jack 278 | a <- sum(L^3)/(6 * sum(L^2)^1.5) 279 | 280 | # BCa conf. limits 281 | adj.alpha <- pnorm(z0 + (z0+zalpha)/(1-a*(z0+zalpha))) 282 | limits <- quantile(th, adj.alpha, type=6) 283 | return(list("est"=th0, "BCa"=limits)) 284 | } 285 | ``` 286 | 287 | ## Example 8.14 (BCa bootstrap confidence interval) 288 | 289 | ```{r } 290 | #boot package and patch data were loaded in Example 8.10 291 | #library(boot) #for boot and boot.ci 292 | #data(patch, package = "bootstrap") 293 | 294 | n <- nrow(patch) 295 | B <- 2000 296 | y <- patch$y 297 | z <- patch$z 298 | x <- cbind(y, z) 299 | theta.b <- numeric(B) 300 | theta.hat <- mean(y) / mean(z) 301 | 302 | #bootstrap 303 | for (b in 1:B) { 304 | i <- sample(1:n, size = n, replace = TRUE) 305 | y <- patch$y[i] 306 | z <- patch$z[i] 307 | theta.b[b] <- mean(y) / mean(z) 308 | } 309 | #compute the BCa interval 310 | stat <- function(dat, index) { 311 | mean(dat[index, 1]) / mean(dat[index, 2]) } 312 | 313 | boot.BCa(x, th0 = theta.hat, th = theta.b, stat = stat) 314 | ``` 315 | 316 | ## Example 8.15 (BCa bootstrap confidence interval using boot.ci) 317 | 318 | ```{r } 319 | #using x from Example 8.15 320 | boot.obj <- boot(x, statistic = stat, R=2000) 321 | boot.ci(boot.obj, type=c("perc", "bca")) 322 | ``` 323 | 324 | ## Example 8.16 (Model selection) 325 | 326 | ```{r } 327 | #to prompt for next graph, uncomment line below 328 | #par(ask = TRUE) 329 | 330 | library(DAAG); attach(ironslag) 331 | a <- seq(10, 40, .1) #sequence for plotting fits 332 | 333 | L1 <- lm(magnetic ~ chemical) 334 | plot(chemical, magnetic, main="Linear", pch=16) 335 | yhat1 <- L1$coef[1] + L1$coef[2] * a 336 | lines(a, yhat1, lwd=2) 337 | 338 | L2 <- lm(magnetic ~ chemical + I(chemical^2)) 339 | plot(chemical, magnetic, main="Quadratic", pch=16) 340 | yhat2 <- L2$coef[1] + L2$coef[2] * a + L2$coef[3] * a^2 341 | lines(a, yhat2, lwd=2) 342 | 343 | L3 <- lm(log(magnetic) ~ chemical) 344 | plot(chemical, magnetic, main="Exponential", pch=16) 345 | logyhat3 <- L3$coef[1] + L3$coef[2] * a 346 | yhat3 <- exp(logyhat3) 347 | lines(a, yhat3, lwd=2) 348 | 349 | L4 <- lm(log(magnetic) ~ log(chemical)) 350 | plot(log(chemical), log(magnetic), main="Log-Log", pch=16) 351 | logyhat4 <- L4$coef[1] + L4$coef[2] * log(a) 352 | lines(log(a), logyhat4, lwd=2) 353 | ``` 354 | 355 | ## Example 8.17 (Model selection: Cross validation) 356 | 357 | ```{r } 358 | # Example 8.16, cont. 
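# leave-one-out ("n-fold") cross validation: each observation k is
# held out once, the four candidate models are refit on the remaining
# n - 1 points, and the held-out prediction errors e1[k], ..., e4[k]
# are recorded; the model minimizing the mean squared prediction
# error is selected (here L2, the quadratic model).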
359 | n <- length(magnetic) #in DAAG ironslag 360 | e1 <- e2 <- e3 <- e4 <- numeric(n) 361 | 362 | # for n-fold cross validation 363 | # fit models on leave-one-out samples 364 | for (k in 1:n) { 365 | y <- magnetic[-k] 366 | x <- chemical[-k] 367 | 368 | J1 <- lm(y ~ x) 369 | yhat1 <- J1$coef[1] + J1$coef[2] * chemical[k] 370 | e1[k] <- magnetic[k] - yhat1 371 | 372 | J2 <- lm(y ~ x + I(x^2)) 373 | yhat2 <- J2$coef[1] + J2$coef[2] * chemical[k] + 374 | J2$coef[3] * chemical[k]^2 375 | e2[k] <- magnetic[k] - yhat2 376 | 377 | J3 <- lm(log(y) ~ x) 378 | logyhat3 <- J3$coef[1] + J3$coef[2] * chemical[k] 379 | yhat3 <- exp(logyhat3) 380 | e3[k] <- magnetic[k] - yhat3 381 | 382 | J4 <- lm(log(y) ~ log(x)) 383 | logyhat4 <- J4$coef[1] + J4$coef[2] * log(chemical[k]) 384 | yhat4 <- exp(logyhat4) 385 | e4[k] <- magnetic[k] - yhat4 386 | } 387 | 388 | 389 | c(mean(e1^2), mean(e2^2), mean(e3^2), mean(e4^2)) 390 | 391 | #selected model, fitted in Example 8.16 392 | L2 393 | 394 | par(mfrow = c(2, 2)) #layout for graphs 395 | plot(L2$fit, L2$res) #residuals vs fitted values 396 | abline(0, 0) #reference line 397 | qqnorm(L2$res) #normal probability plot 398 | qqline(L2$res) #reference line 399 | par(mfrow = c(1, 1)) #restore display 400 | ``` 401 | 402 | -------------------------------------------------------------------------------- /examples-Rmd/SCR2e-chMCint.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Code for SCR 2e Examples" 3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e" 4 | output: 5 | html_document: 6 | code_folding: show 7 | theme: lumen 8 | toc: yes 9 | toc_float: yes 10 | df_print: paged 11 | --- 12 | 13 | # Chapter 6: Monte Carlo Integration and Variance Reduction 14 | 15 | ## Example 6.1 (Simple Monte Carlo integration) 16 | 17 | ```{r } 18 | m <- 10000 19 | x <- runif(m) 20 | theta.hat <- mean(exp(-x)) 21 | print(theta.hat) 22 | print(1 - exp(-1)) 23 | ``` 24 | 25 | ## Example 6.2 (Simple Monte Carlo integration, cont.) 26 | 27 | ```{r } 28 | m <- 10000 29 | x <- runif(m, min=2, max=4) 30 | theta.hat <- mean(exp(-x)) * 2 31 | print(theta.hat) 32 | print(exp(-2) - exp(-4)) 33 | ``` 34 | 35 | ## Example 6.3 (Monte Carlo integration, unbounded interval) 36 | 37 | ```{r } 38 | x <- seq(.1, 2.5, length = 10) 39 | m <- 10000 40 | u <- runif(m) 41 | cdf <- numeric(length(x)) 42 | for (i in 1:length(x)) { 43 | g <- x[i] * exp(-(u * x[i])^2 / 2) 44 | cdf[i] <- mean(g) / sqrt(2 * pi) + 0.5 45 | } 46 | 47 | Phi <- pnorm(x) 48 | print(round(rbind(x, cdf, Phi), 3)) 49 | ``` 50 | 51 | ## Example 6.4 (Example 6.3, cont.) 
52 | 53 | ```{r } 54 | x <- seq(.1, 2.5, length = 10) 55 | m <- 10000 56 | z <- rnorm(m) 57 | dim(x) <- length(x) 58 | p <- apply(x, MARGIN = 1, 59 | FUN = function(x, z) {mean(z < x)}, z = z) 60 | 61 | Phi <- pnorm(x) 62 | print(round(rbind(x, p, Phi), 3)) 63 | ``` 64 | 65 | ## Example 6.5 (Error bounds for MC integration) 66 | 67 | ```{r } 68 | x <- 2 69 | m <- 10000 70 | z <- rnorm(m) 71 | g <- (z < x) #the indicator function 72 | v <- mean((g - mean(g))^2) / m 73 | cdf <- mean(g) 74 | c(cdf, v) 75 | c(cdf - 1.96 * sqrt(v), cdf + 1.96 * sqrt(v)) 76 | ``` 77 | 78 | ## Example 6.6 (Antithetic variables) 79 | 80 | ```{r } 81 | MC.Phi <- function(x, R = 10000, antithetic = TRUE) { 82 | u <- runif(R/2) 83 | if (!antithetic) v <- runif(R/2) else 84 | v <- 1 - u 85 | u <- c(u, v) 86 | cdf <- numeric(length(x)) 87 | for (i in 1:length(x)) { 88 | g <- x[i] * exp(-(u * x[i])^2 / 2) 89 | cdf[i] <- mean(g) / sqrt(2 * pi) + 0.5 90 | } 91 | cdf 92 | } 93 | 94 | 95 | x <- seq(.1, 2.5, length=5) 96 | Phi <- pnorm(x) 97 | set.seed(123) 98 | MC1 <- MC.Phi(x, anti = FALSE) 99 | set.seed(123) 100 | MC2 <- MC.Phi(x) 101 | print(round(rbind(x, MC1, MC2, Phi), 5)) 102 | 103 | 104 | m <- 1000 105 | MC1 <- MC2 <- numeric(m) 106 | x <- 1.95 107 | for (i in 1:m) { 108 | MC1[i] <- MC.Phi(x, R = 1000, anti = FALSE) 109 | MC2[i] <- MC.Phi(x, R = 1000) 110 | } 111 | 112 | print(sd(MC1)) 113 | print(sd(MC2)) 114 | print((var(MC1) - var(MC2))/var(MC1)) 115 | ``` 116 | 117 | ## Example 6.7 (Control variate) 118 | 119 | ```{r } 120 | m <- 10000 121 | a <- - 12 + 6 * (exp(1) - 1) 122 | U <- runif(m) 123 | T1 <- exp(U) #simple MC 124 | T2 <- exp(U) + a * (U - 1/2) #controlled 125 | 126 | mean(T1) 127 | mean(T2) 128 | (var(T1) - var(T2)) / var(T1) 129 | ``` 130 | 131 | ## Example 6.8 (MC integration using control variates) 132 | 133 | ```{r } 134 | f <- function(u) 135 | exp(-.5)/(1+u^2) 136 | 137 | g <- function(u) 138 | exp(-u)/(1+u^2) 139 | 140 | set.seed(510) #needed later 141 | u <- runif(10000) 142 | B <- f(u) 143 | A <- g(u) 144 | 145 | cor(A, B) 146 | a <- -cov(A,B) / var(B) #est of c* 147 | a 148 | 149 | m <- 100000 150 | u <- runif(m) 151 | T1 <- g(u) 152 | T2 <- T1 + a * (f(u) - exp(-.5)*pi/4) 153 | 154 | c(mean(T1), mean(T2)) 155 | c(var(T1), var(T2)) 156 | (var(T1) - var(T2)) / var(T1) 157 | ``` 158 | 159 | ## Example 6.9 (Control variate and regression) 160 | 161 | ```{r } 162 | set.seed(510) 163 | mu <- exp(-.5)*pi/4 164 | u <- runif(10000) 165 | f <- exp(-.5)/(1+u^2) 166 | g <- exp(-u)/(1+u^2) 167 | L <- lm(g ~ f) 168 | L 169 | c.star <- - L$coeff[2] 170 | c.star 171 | 172 | theta.hat <- sum(L$coeff * c(1, mu)) #pred. value at mu 173 | theta.hat 174 | summary(L)$sigma^2 175 | summary(L)$r.squared 176 | ``` 177 | 178 | ## Example 6.10 (Control variates and multiple regression) 179 | 180 | ```{r } 181 | # Example 6.9 continued with a second control variate 182 | # and multiple regression to estimate vector c* 183 | 184 | u <- runif(10000) 185 | f1 <- exp(-.5) / (1+u^2) 186 | f2 <- exp(-u) / (1-exp(-1)) 187 | g <- exp(-u) / (1+u^2) 188 | 189 | L2 <- lm(g ~ f1 + f2) 190 | 191 | L2$coeff 192 | c.star <- - L2$coeff[2:3] 193 | c.star 194 | mu1 <- exp(-.5)*pi/4 195 | mu2 <- 1 196 | mu <- c(mu1, mu2) 197 | 198 | # theta.hat is the predicted response at mu 199 | # alternately can use predict.lm method 200 | 201 | theta.hat <- sum(L2$coeff * c(1, mu)) #pred. value at mu 202 | theta.hat 203 | 204 | ## alternately 205 | df <- data.frame(f1=mu1, f2=mu2) 206 | theta.hat <-predict(L2, df) 207 | 208 | # MSE / n is the est. 
variance of the control estimator 209 | MSE <- summary(L2)$sigma^2 210 | MSE 211 | sqrt(MSE / 10000) 212 | 213 | 214 | # compare with the previous estimates using 215 | # naive MC and control variate f1(u) 216 | # var1=.060231423 var2=.003124814 217 | var0 <- 0.060231423 #naive MC 218 | var1 <- 0.003117644 #controlled estimator with f1 219 | var2 <- MSE #new estimator 220 | 221 | # percent reduction in variance 222 | # it is a weighted average of R^2 values 223 | # so easier to compute directly 224 | 225 | 100 * (var0 - var1) / var0 226 | 100 * (var1 - var2) / var1 227 | 100 * (var0 - var2) / var0 228 | ``` 229 | 230 | ## Example 6.11 (Choice of the importance function) 231 | 232 | ```{r } 233 | #code for plot is at the end of the file 234 | 235 | m <- 10000 236 | theta.hat <- se <- numeric(5) 237 | g <- function(x) { 238 | exp(-x - log(1+x^2)) * (x > 0) * (x < 1) 239 | } 240 | 241 | x <- runif(m) #using f0 242 | fg <- g(x) 243 | theta.hat[1] <- mean(fg) 244 | se[1] <- sd(fg) 245 | 246 | x <- rexp(m, 1) #using f1 247 | fg <- g(x) / exp(-x) 248 | theta.hat[2] <- mean(fg) 249 | se[2] <- sd(fg) 250 | 251 | x <- rcauchy(m) #using f2 252 | i <- c(which(x > 1), which(x < 0)) 253 | x[i] <- 2 #to catch overflow errors in g(x) 254 | fg <- g(x) / dcauchy(x) 255 | theta.hat[3] <- mean(fg) 256 | se[3] <- sd(fg) 257 | 258 | u <- runif(m) #f3, inverse transform method 259 | x <- - log(1 - u * (1 - exp(-1))) 260 | fg <- g(x) / (exp(-x) / (1 - exp(-1))) 261 | theta.hat[4] <- mean(fg) 262 | se[4] <- sd(fg) 263 | 264 | u <- runif(m) #f4, inverse transform method 265 | x <- tan(pi * u / 4) 266 | fg <- g(x) / (4 / ((1 + x^2) * pi)) 267 | theta.hat[5] <- mean(fg) 268 | se[5] <- sd(fg) 269 | 270 | rbind(theta.hat, se / sqrt(m)) 271 | ``` 272 | 273 | ## Example 6.12 (Example 6.11, cont.) 274 | 275 | ```{r } 276 | M <- 20 #number of replicates 277 | T2 <- numeric(4) 278 | estimates <- matrix(0, 10, 2) 279 | 280 | g <- function(x) { 281 | exp(-x - log(1+x^2)) * (x > 0) * (x < 1) } 282 | 283 | for (i in 1:10) { 284 | estimates[i, 1] <- mean(g(runif(M))) 285 | T2[1] <- mean(g(runif(M/4, 0, .25))) 286 | T2[2] <- mean(g(runif(M/4, .25, .5))) 287 | T2[3] <- mean(g(runif(M/4, .5, .75))) 288 | T2[4] <- mean(g(runif(M/4, .75, 1))) 289 | estimates[i, 2] <- mean(T2) 290 | } 291 | 292 | estimates 293 | apply(estimates, 2, mean) 294 | apply(estimates, 2, var) 295 | ``` 296 | 297 | ## Example 6.13 (Examples 6.11-6.12, cont.) 
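Example 6.13 scales Example 6.12 up to M = 10000 replicates split across k = 10 strata, repeating the whole estimation N = 50 times to compare variances. Because the strata ((j-1)/k, j/k) have equal width, the stratified estimate is simply the average of the per-stratum sample means. The pattern generalizes to a small helper; a sketch (the function name strat_mc is mine, not the book's):

```{r }
# stratified Monte Carlo estimate of the mean of g over (0,1),
# using k equal-width strata with M/k replicates each
strat_mc <- function(g, M = 10000, k = 10) {
  r <- M / k                 # replicates per stratum
  t <- numeric(k)
  for (j in 1:k)
    t[j] <- mean(g(runif(r, (j - 1) / k, j / k)))
  mean(t)                    # strata carry equal weight 1/k
}
```

With g as defined in the example below, strat_mc(g) reproduces the second column of estimates up to Monte Carlo error.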
298 | 299 | ```{r } 300 | M <- 10000 #number of replicates 301 | k <- 10 #number of strata 302 | r <- M / k #replicates per stratum 303 | N <- 50 #number of times to repeat the estimation 304 | T2 <- numeric(k) 305 | estimates <- matrix(0, N, 2) 306 | 307 | g <- function(x) { 308 | exp(-x - log(1+x^2)) * (x > 0) * (x < 1) 309 | } 310 | 311 | for (i in 1:N) { 312 | estimates[i, 1] <- mean(g(runif(M))) 313 | for (j in 1:k) 314 | T2[j] <- mean(g(runif(M/k, (j-1)/k, j/k))) 315 | estimates[i, 2] <- mean(T2) 316 | } 317 | 318 | apply(estimates, 2, mean) 319 | apply(estimates, 2, var) 320 | ``` 321 | 322 | ## Plot importance functions in Figures 6.1(a) and 6.1.(b) 323 | 324 | ```{r } 325 | #par(ask = TRUE) #uncomment to pause between graphs 326 | 327 | x <- seq(0, 1, .01) 328 | w <- 2 329 | f1 <- exp(-x) 330 | f2 <- (1 / pi) / (1 + x^2) 331 | f3 <- exp(-x) / (1 - exp(-1)) 332 | f4 <- 4 / ((1 + x^2) * pi) 333 | g <- exp(-x) / (1 + x^2) 334 | 335 | #for color change lty to col 336 | 337 | #figure (a) 338 | plot(x, g, type = "l", main = "", ylab = "", 339 | ylim = c(0,2), lwd = w) 340 | lines(x, g/g, lty = 2, lwd = w) 341 | lines(x, f1, lty = 3, lwd = w) 342 | lines(x, f2, lty = 4, lwd = w) 343 | lines(x, f3, lty = 5, lwd = w) 344 | lines(x, f4, lty = 6, lwd = w) 345 | legend("topright", legend = c("g", 0:4), 346 | lty = 1:6, lwd = w, inset = 0.02) 347 | 348 | #figure (b) 349 | plot(x, g, type = "l", main = "", ylab = "", 350 | ylim = c(0,3.2), lwd = w, lty = 2) 351 | lines(x, g/f1, lty = 3, lwd = w) 352 | lines(x, g/f2, lty = 4, lwd = w) 353 | lines(x, g/f3, lty = 5, lwd = w) 354 | lines(x, g/f4, lty = 6, lwd = w) 355 | legend("topright", legend = c(0:4), 356 | lty = 2:6, lwd = w, inset = 0.02) 357 | ``` 358 | 359 | -------------------------------------------------------------------------------- /examples-Rmd/SCR2e-chMonteCarlo.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Code for SCR 2e Examples" 3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e" 4 | output: 5 | html_document: 6 | code_folding: show 7 | theme: lumen 8 | toc: yes 9 | toc_float: yes 10 | df_print: paged 11 | --- 12 | 13 | # Chapter 7: Monte Carlo Methods in Inference 14 | 15 | Note: Example 7.11 may take a few minutes. 16 | 17 | 18 | ## Example 7.1 (Basic Monte Carlo estimation) 19 | 20 | ```{r 7.1} 21 | m <- 1000 22 | g <- numeric(m) 23 | for (i in 1:m) { 24 | x <- rnorm(2) 25 | g[i] <- abs(x[1] - x[2]) 26 | } 27 | est <- mean(g) 28 | est 29 | ``` 30 | 31 | ## Example 7.2 (Estimating the MSE of a trimmed mean) 32 | 33 | ```{r 7.2} 34 | n <- 20 35 | m <- 1000 36 | tmean <- numeric(m) 37 | for (i in 1:m) { 38 | x <- sort(rnorm(n)) 39 | tmean[i] <- sum(x[2:(n-1)]) / (n-2) 40 | } 41 | mse <- mean(tmean^2) 42 | mse 43 | sqrt(sum((tmean - mean(tmean))^2)) / m #se 44 | 45 | n <- 20 46 | m <- 1000 47 | tmean <- numeric(m) 48 | for (i in 1:m) { 49 | x <- sort(rnorm(n)) 50 | tmean[i] <- median(x) 51 | } 52 | mse <- mean(tmean^2) 53 | mse 54 | sqrt(sum((tmean - mean(tmean))^2)) / m #se 55 | ``` 56 | 57 | ## Example 7.3 (MSE of a trimmed mean, cont.) 
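Example 7.3 repeats the trimmed-mean study for samples from the contaminated normal mixture pN(0,1) + (1-p)N(0,100): each observation is drawn with standard deviation 1 with probability p and 10 with probability 1 - p, which inflates the tails. A quick sketch of the two densities for p = 0.9 (an added illustration, not book code):

```{r }
# contaminated normal 0.9 N(0,1) + 0.1 N(0,100) vs the standard normal;
# dnorm takes the standard deviation, so N(0,100) is dnorm(., 0, 10)
u <- seq(-6, 6, .01)
fc <- .9 * dnorm(u) + .1 * dnorm(u, 0, 10)
plot(u, dnorm(u), type = "l", lty = 2, ylab = "density")
lines(u, fc)
```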
58 | 59 | ```{r 7.3} 60 | set.seed(522) 61 | n <- 20 62 | K <- n/2 - 1 63 | m <- 1000 64 | mse <- matrix(0, n/2, 6) 65 | 66 | trimmed.mse <- function(n, m, k, p) { 67 | #MC est of mse for k-level trimmed mean of 68 | #contaminated normal pN(0,1) + (1-p)N(0,100) 69 | tmean <- numeric(m) 70 | for (i in 1:m) { 71 | sigma <- sample(c(1, 10), size = n, 72 | replace = TRUE, prob = c(p, 1-p)) 73 | x <- sort(rnorm(n, 0, sigma)) 74 | tmean[i] <- sum(x[(k+1):(n-k)]) / (n-2*k) 75 | } 76 | mse.est <- mean(tmean^2) 77 | se.mse <- sqrt(mean((tmean-mean(tmean))^2)) / sqrt(m) 78 | return(c(mse.est, se.mse)) 79 | } 80 | 81 | for (k in 0:K) { 82 | mse[k+1, 1:2] <- trimmed.mse(n=n, m=m, k=k, p=1.0) 83 | mse[k+1, 3:4] <- trimmed.mse(n=n, m=m, k=k, p=.95) 84 | mse[k+1, 5:6] <- trimmed.mse(n=n, m=m, k=k, p=.9) 85 | } 86 | ``` 87 | 88 | ## Example 7.4 (Confidence interval for variance) 89 | 90 | ```{r 7.4} 91 | n <- 20 92 | alpha <- .05 93 | x <- rnorm(n, mean=0, sd=2) 94 | UCL <- (n-1) * var(x) / qchisq(alpha, df=n-1) 95 | ``` 96 | 97 | ## Example 7.5 (MC estimate of confidence level) 98 | 99 | ```{r 7.5} 100 | n <- 20 101 | alpha <- .05 102 | UCL <- replicate(1000, expr = { 103 | x <- rnorm(n, mean = 0, sd = 2) 104 | (n-1) * var(x) / qchisq(alpha, df = n-1) 105 | } ) 106 | #count the number of intervals that contain sigma^2=4 107 | sum(UCL > 4) 108 | #or compute the mean to get the confidence level 109 | mean(UCL > 4) 110 | ``` 111 | 112 | ## Example 7.6 (Empirical confidence level) 113 | 114 | ```{r 7.6} 115 | n <- 20 116 | alpha <- .05 117 | UCL <- replicate(1000, expr = { 118 | x <- rchisq(n, df = 2) 119 | (n-1) * var(x) / qchisq(alpha, df = n-1) 120 | } ) 121 | sum(UCL > 4) 122 | mean(UCL > 4) 123 | ``` 124 | 125 | ## Example 7.7 (Empirical Type I error rate) 126 | 127 | ```{r 7.7} 128 | n <- 20 129 | alpha <- .05 130 | mu0 <- 500 131 | sigma <- 100 132 | 133 | m <- 10000 #number of replicates 134 | p <- numeric(m) #storage for p-values 135 | for (j in 1:m) { 136 | x <- rnorm(n, mu0, sigma) 137 | ttest <- t.test(x, alternative = "greater", mu = mu0) 138 | p[j] <- ttest$p.value 139 | } 140 | 141 | p.hat <- mean(p < alpha) 142 | se.hat <- sqrt(p.hat * (1 - p.hat) / m) 143 | print(c(p.hat, se.hat)) 144 | ``` 145 | 146 | ## Example 7.8 (Skewness test of normality) 147 | 148 | ```{r 7.8} 149 | n <- c(10, 20, 30, 50, 100, 500) #sample sizes 150 | cv <- qnorm(.975, 0, sqrt(6/n)) #crit. values for each n 151 | 152 | sk <- function(x) { 153 | #computes the sample skewness coeff. 154 | xbar <- mean(x) 155 | m3 <- mean((x - xbar)^3) 156 | m2 <- mean((x - xbar)^2) 157 | return( m3 / m2^1.5 ) 158 | } 159 | 160 | #n is a vector of sample sizes 161 | #we are doing length(n) different simulations 162 | 163 | p.reject <- numeric(length(n)) #to store sim. results 164 | m <- 10000 #num. repl. each sim. 
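# outer loop: one simulation for each sample size n[i]; inner loop:
# m samples from N(0,1), rejecting normality when |sk(x)| exceeds
# cv[i], the critical value from the asymptotic N(0, 6/n) null
# distribution of the skewness statistic. p.reject estimates the
# empirical Type I error rate, which should approach the nominal
# 0.05 as n grows.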
165 | 166 | for (i in 1:length(n)) { 167 | sktests <- numeric(m) #test decisions 168 | for (j in 1:m) { 169 | x <- rnorm(n[i]) 170 | #test decision is 1 (reject) or 0 171 | sktests[j] <- as.integer(abs(sk(x)) >= cv[i] ) 172 | } 173 | p.reject[i] <- mean(sktests) #proportion rejected 174 | } 175 | 176 | p.reject 177 | ``` 178 | 179 | ## Example 7.9 (Empirical power) 180 | 181 | ```{r 7.9} 182 | #set.seed(521) 183 | n <- 20 184 | m <- 1000 185 | mu0 <- 500 186 | sigma <- 100 187 | mu <- c(seq(450, 650, 10)) #alternatives 188 | M <- length(mu) 189 | power <- numeric(M) 190 | for (i in 1:M) { 191 | mu1 <- mu[i] 192 | pvalues <- replicate(m, expr = { 193 | #simulate under alternative mu1 194 | x <- rnorm(n, mean = mu1, sd = sigma) 195 | ttest <- t.test(x, 196 | alternative = "greater", mu = mu0) 197 | ttest$p.value } ) 198 | power[i] <- mean(pvalues <= .05) 199 | } 200 | 201 | se <- sqrt(power * (1-power) / m) 202 | df <- data.frame(mean=mu, power=power, upper=power+2*se, lower=power-2*se) 203 | 204 | library(ggplot2) 205 | ggplot(df, aes(x=mean, y=power)) + 206 | geom_line() + 207 | geom_vline(xintercept=500, lty=2) + 208 | geom_hline(yintercept=c(0,.05), lty=1:2) + 209 | geom_errorbar(aes(ymin=lower, ymax=upper), width = 0.2, lwd=1.5) 210 | ``` 211 | 212 | ## Example 7.10 (Power of the skewness test of normality) 213 | 214 | ```{r 7.10} 215 | #set.seed(111) 216 | alpha <- .1 217 | n <- 30 218 | m <- 2500 219 | epsilon <- c(seq(0, .15, .01), seq(.15, 1, .05)) 220 | N <- length(epsilon) 221 | pwr <- numeric(N) 222 | #critical value for the skewness test 223 | cv <- qnorm(1-alpha/2, 0, sqrt(6*(n-2) / ((n+1)*(n+3)))) 224 | 225 | for (j in 1:N) { #for each epsilon 226 | e <- epsilon[j] 227 | sktests <- numeric(m) 228 | for (i in 1:m) { #for each replicate 229 | sigma <- sample(c(1, 10), replace = TRUE, 230 | size = n, prob = c(1-e, e)) 231 | x <- rnorm(n, 0, sigma) 232 | sktests[i] <- as.integer(abs(sk(x)) >= cv) 233 | } 234 | pwr[j] <- mean(sktests) 235 | } 236 | 237 | se <- sqrt(pwr * (1-pwr) / m) 238 | df <- data.frame(epsilon=epsilon, power=pwr, upper=pwr+2*se, lower=pwr-2*se) 239 | 240 | #plot power vs epsilon 241 | library(ggplot2) 242 | ggplot(df, aes(x=epsilon, y=power)) + 243 | geom_line() + labs(x=bquote(epsilon)) + 244 | geom_hline(yintercept=.1, lty=2) + 245 | geom_pointrange(aes(ymin=lower, ymax=upper)) 246 | ``` 247 | 248 | ## Example 7.11 (Power comparison of tests of normality) 249 | 250 | ```{r 7.11} 251 | #only one loop, for epsilon=0.1, was shown in the text 252 | #the simulation below takes several minutes to run 253 | 254 | # initialize input and output 255 | library(energy) 256 | alpha <- .1 257 | n <- 30 258 | m <- 500 #try small m for a trial run 259 | test1 <- test2 <- test3 <- numeric(m) 260 | 261 | #critical value for the skewness test 262 | cv <- qnorm(1-alpha/2, 0, sqrt(6*(n-2) / ((n+1)*(n+3)))) 263 | sim <- matrix(0, 11, 4) 264 | 265 | # estimate power 266 | for (i in 0:10) { 267 | epsilon <- i * .1 268 | for (j in 1:m) { 269 | e <- epsilon 270 | sigma <- sample(c(1, 10), replace = TRUE, 271 | size = n, prob = c(1-e, e)) 272 | x <- rnorm(n, 0, sigma) 273 | test1[j] <- as.integer(abs(sk(x)) >= cv) 274 | test2[j] <- as.integer( 275 | shapiro.test(x)$p.value <= alpha) 276 | test3[j] <- as.integer( 277 | mvnorm.etest(x, R=200)$p.value <= alpha) 278 | } 279 | print(c(epsilon, mean(test1), mean(test2), mean(test3))) 280 | sim[i+1, ] <- c(epsilon, mean(test1), mean(test2), mean(test3)) 281 | } 282 | detach(package:energy) 283 | 284 | # plot the empirical estimates of power 
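# columns of sim: epsilon, then the empirical power of the skewness,
# Shapiro-Wilk, and energy tests; power is near alpha at both
# endpoints because epsilon = 0 and epsilon = 1 give exactly normal
# samples (sd 1 and sd 10, respectively).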
285 | plot(sim[,1], sim[,2], ylim = c(0, 1), type = "l", 286 | xlab = bquote(epsilon), ylab = "power") 287 | lines(sim[,1], sim[,3], lty = 2) 288 | lines(sim[,1], sim[,4], lty = 4) 289 | abline(h = alpha, lty = 3) 290 | legend("topright", 1, c("skewness", "S-W", "energy"), 291 | lty = c(1,2,4), inset = .02) 292 | ``` 293 | 294 | ## Example 7.12 (Count Five test statistic) 295 | 296 | ```{r 7.12} 297 | x1 <- rnorm(20, 0, sd = 1) 298 | x2 <- rnorm(20, 0, sd = 1.5) 299 | y <- c(x1, x2) 300 | 301 | group <- rep(1:2, each = length(x1)) 302 | boxplot(y ~ group, boxwex = .3, xlim = c(.5, 2.5), main = "") 303 | points(group, y) 304 | 305 | # now identify the extreme points 306 | range(x1) 307 | range(x2) 308 | 309 | i <- which(x1 < min(x2)) 310 | j <- which(x2 > max(x1)) 311 | 312 | x1[i] 313 | x2[j] 314 | 315 | out1 <- sum(x1 > max(x2)) + sum(x1 < min(x2)) 316 | out2 <- sum(x2 > max(x1)) + sum(x2 < min(x1)) 317 | max(c(out1, out2)) 318 | ``` 319 | 320 | ## Example 7.13 (Count Five test statistic, cont.) 321 | 322 | ```{r 7.13} 323 | maxout <- function(x, y) { 324 | X <- x - mean(x) 325 | Y <- y - mean(y) 326 | outx <- sum(X > max(Y)) + sum(X < min(Y)) 327 | outy <- sum(Y > max(X)) + sum(Y < min(X)) 328 | return(max(c(outx, outy))) 329 | } 330 | 331 | n1 <- n2 <- 20 332 | mu1 <- mu2 <- 0 333 | sigma1 <- sigma2 <- 1 334 | m <- 1000 335 | 336 | # generate samples under H0 337 | stat <- replicate(m, expr={ 338 | x <- rnorm(n1, mu1, sigma1) 339 | y <- rnorm(n2, mu2, sigma2) 340 | maxout(x, y) 341 | }) 342 | print(cumsum(table(stat)) / m) 343 | print(quantile(stat, c(.8, .9, .95))) 344 | ``` 345 | 346 | ## Example 7.14 (Count Five test) 347 | 348 | ```{r 7.14} 349 | count5test <- function(x, y) { 350 | X <- x - mean(x) 351 | Y <- y - mean(y) 352 | outx <- sum(X > max(Y)) + sum(X < min(Y)) 353 | outy <- sum(Y > max(X)) + sum(Y < min(X)) 354 | # return 1 (reject) or 0 (do not reject H0) 355 | return(as.integer(max(c(outx, outy)) > 5)) 356 | } 357 | 358 | n1 <- n2 <- 20 359 | mu1 <- mu2 <- 0 360 | sigma1 <- sigma2 <- 1 361 | m <- 10000 362 | tests <- replicate(m, expr = { 363 | x <- rnorm(n1, mu1, sigma1) 364 | y <- rnorm(n2, mu2, sigma2) 365 | x <- x - mean(x) #centered by sample mean 366 | y <- y - mean(y) 367 | count5test(x, y) 368 | } ) 369 | 370 | alphahat <- mean(tests) 371 | print(alphahat) 372 | ``` 373 | 374 | ## Example 7.15 (Count Five test, cont.) 375 | 376 | ```{r 7.15} 377 | n1 <- 20 378 | n2 <- 30 379 | mu1 <- mu2 <- 0 380 | sigma1 <- sigma2 <- 1 381 | m <- 10000 382 | 383 | alphahat <- mean(replicate(m, expr={ 384 | x <- rnorm(n1, mu1, sigma1) 385 | y <- rnorm(n2, mu2, sigma2) 386 | x <- x - mean(x) #centered by sample mean 387 | y <- y - mean(y) 388 | count5test(x, y) 389 | })) 390 | 391 | print(alphahat) 392 | ``` 393 | 394 | ## Example 7.16 (Count Five, cont.) 395 | 396 | ```{r 7.16} 397 | # generate samples under H1 to estimate power 398 | sigma1 <- 1 399 | sigma2 <- 1.5 400 | 401 | power <- mean(replicate(m, expr={ 402 | x <- rnorm(20, 0, sigma1) 403 | y <- rnorm(20, 0, sigma2) 404 | count5test(x, y) 405 | })) 406 | 407 | print(power) 408 | ``` 409 | 410 | -------------------------------------------------------------------------------- /examples-Rmd/SCR2e-chNumerical.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Code for SCR 2e Examples" 3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. 
https://github.com/mariarizzo/SCR2e" 4 | output: 5 | html_document: 6 | code_folding: show 7 | theme: lumen 8 | toc: yes 9 | toc_float: yes 10 | df_print: paged 11 | --- 12 | 13 | # Chapter 13: Introduction to Numerical Methods in R 14 | 15 | ## Example 13.1 (Identical and nearly equal) 16 | 17 | ```{r } 18 | isTRUE(all.equal(.2, .3 - .1)) 19 | all.equal(.2, .3) #not a logical value 20 | isTRUE(all.equal(.2, .3)) #always a logical value 21 | 22 | x <- 1:4 23 | y <- 2 24 | y == 2 25 | x == y #not necessarily a single logical value 26 | identical(x, y) #always a single logical value 27 | identical(y, 2) 28 | ``` 29 | 30 | ## Example 13.2 (Ratio of two large numbers) 31 | 32 | ```{r } 33 | n <- 400 34 | (gamma((n-1)/2) / (sqrt(pi) * gamma((n-2)/2))) 35 | exp(lgamma((n-1)/2) - lgamma((n-2)/2)) / sqrt(pi) 36 | ``` 37 | 38 | ## Example 13.3 (Taylor expansion) 39 | 40 | ```{r } 41 | system.time({ 42 | for (i in 1:1000) { 43 | a <- rep(0, 24) 44 | a0 <- pi / 6 45 | a2 <- a0 * a0 46 | a[1] <- -a0^3 / 6 47 | for (i in 2:24) 48 | a[i] <- - a2 * a[i-1] / ((2*i+1)*(2*i)) 49 | a0 + sum(a)} 50 | }) 51 | 52 | system.time({ 53 | for (i in 1:1000) { 54 | K <- 2 * (0:24) + 1 55 | i <- rep(c(1, -1), length=25) 56 | sum(i * (pi/6)^K / factorial(K))} 57 | }) 58 | ``` 59 | 60 | ## Example 13.4 (Derivative of zeta function) 61 | 62 | ```{r } 63 | zeta.deriv <- function(a) { 64 | z <- a - 1 65 | # Stieltjes constants gamma_k for k=1:5 66 | g <- c( 67 | -.7281584548367672e-1, 68 | -.9690363192872318e-2, 69 | .2053834420303346e-2, 70 | .2325370065467300e-2, 71 | .7933238173010627e-3) 72 | i <- c(-1, 1, -1, 1, -1) 73 | n <- 0:4 74 | -1/z^2 + sum(i * g * z^n / factorial(n)) 75 | } 76 | ``` 77 | 78 | ## Example 13.5 (Derivative of zeta function, cont.) 79 | 80 | ```{r } 81 | library(gsl) #for zeta function 82 | z <- c(1.001, 1.01, 1.5, 2, 3, 5) 83 | h <- .Machine$double.eps^0.5 84 | dz <- dq <- rep(0, length(z)) 85 | for (i in 1:length(z)) { 86 | v <- z[i] + h 87 | h <- v - z[i] 88 | a0 <- z[i] - h 89 | if (a0 < 1) a0 <- (1 + z[i])/2 90 | a1 <- z[i] + h 91 | dq[i] <- (zeta(a1) - zeta(a0)) / (a1 - a0) 92 | dz[i] <- zeta.deriv(z[i]) 93 | } 94 | 95 | h 96 | 97 | cbind(z, dz, dq) 98 | ``` 99 | 100 | ## Example 13.6 (Solving f(x)=0) 101 | 102 | ```{r } 103 | f <- function(y, a, n) { 104 | a^2 + y^2 + 2*a*y/(n-1) - (n-2) 105 | } 106 | 107 | a <- 0.5 108 | n <- 20 109 | b0 <- 0 110 | b1 <- 5*n 111 | 112 | #solve using bisection 113 | it <- 0 114 | eps <- .Machine$double.eps^0.25 115 | r <- seq(b0, b1, length=3) 116 | y <- c(f(r[1], a, n), f(r[2], a, n), f(r[3], a, n)) 117 | if (y[1] * y[3] > 0) 118 | stop("f does not have opposite sign at endpoints") 119 | 120 | while(it < 1000 && abs(y[2]) > eps) { 121 | it <- it + 1 122 | if (y[1]*y[2] < 0) { 123 | r[3] <- r[2] 124 | y[3] <- y[2] 125 | } else { 126 | r[1] <- r[2] 127 | y[1] <- y[2] 128 | } 129 | r[2] <- (r[1] + r[3]) / 2 130 | y[2] <- f(r[2], a=a, n=n) 131 | print(c(r[1], y[1], y[3]-y[2])) 132 | } 133 | ``` 134 | 135 | ## Example 13.7 (Solving f(x)=0 with Brent's method: uniroot) 136 | 137 | ```{r } 138 | a <- 0.5 139 | n <- 20 140 | out <- uniroot(function(y) { 141 | a^2 + y^2 + 2*a*y/(n-1) - (n-2) }, 142 | lower = 0, upper = n*5) 143 | unlist(out) 144 | uniroot(function(y) {a^2 + y^2 + 2*a*y/(n-1) - (n-2)}, 145 | interval = c(-n*5, 0))$root 146 | ``` 147 | 148 | ## Example 13.8 (Numerical integration with integrate) 149 | 150 | ```{r } 151 | f <- function(y, N, r, rho) { 152 | (cosh(y) - rho * r)^(1 - N) 153 | } 154 | integrate(f, lower=0, upper=Inf, 155 | 
rel.tol=.Machine$double.eps^0.25, 156 | N=10, r=0.5, rho=0.2) 157 | 158 | ro <- seq(-.99, .99, .01) 159 | v <- rep(0, length(ro)) 160 | for (i in 1:length(ro)) { 161 | v[i] <- integrate(f, lower=0, upper=Inf, 162 | rel.tol=.Machine$double.eps^0.25, 163 | N=10, r=0.5, rho=ro[i])$value 164 | } 165 | plot(ro, v, type="l", xlab=expression(rho), 166 | ylab="Integral Value (n=10, r=0.5)") 167 | ``` 168 | 169 | ## Example 13.9 (Density of sample correlation coefficient) 170 | 171 | ```{r } 172 | .dcorr <- function(r, N, rho=0) { 173 | # compute the density function of sample correlation 174 | if (abs(r) > 1 || abs(rho) > 1) return (0) 175 | if (N < 4) return (NA) 176 | 177 | if (isTRUE(all.equal(rho, 0.0))) { 178 | a <- exp(lgamma((N - 1)/2) - lgamma((N - 2)/2)) / 179 | sqrt(pi) 180 | return (a * (1 - r^2)^((N - 4)/2)) 181 | } 182 | 183 | # if rho not 0, need to integrate 184 | f <- function(w, R, N, rho) 185 | (cosh(w) - rho * R)^(1 - N) 186 | 187 | #need to insert some error checking here 188 | i <- integrate(f, lower=0, upper=Inf, 189 | R=r, N=N, rho=rho)$value 190 | c1 <- (N - 2) * (1 - rho^2)^((N - 1)/2) 191 | c2 <- (1 - r^2)^((N - 4) / 2) / pi 192 | return(c1 * c2 * i) 193 | } 194 | 195 | r <- as.matrix(seq(-1, 1, .01)) 196 | d1 <- apply(r, 1, .dcorr, N=10, rho=.0) 197 | d2 <- apply(r, 1, .dcorr, N=10, rho=.5) 198 | d3 <- apply(r, 1, .dcorr, N=10, rho=-.5) 199 | plot(r, d2, type="l", lty=2, lwd=2, ylab="density") 200 | lines(r, d1, lwd=2) 201 | lines(r, d3, lty=4, lwd=2) 202 | legend("top", inset=.02, 203 | c("rho = 0", "rho = 0.5", "rho = -0.5"), lty=c(1,2,4), lwd=2) 204 | ``` 205 | 206 | ## Example 13.10 (MLE using mle) 207 | 208 | ```{r } 209 | #the observed sample 210 | y <- c(0.04304550, 0.50263474) 211 | 212 | mlogL <- function(theta=1) { 213 | #minus log-likelihood of exp. 
density, rate theta 214 | return( - (length(y) * log(theta) - theta * sum(y))) 215 | } 216 | 217 | library(stats4) 218 | fit <- mle(mlogL) 219 | summary(fit) 220 | 221 | # Alternatively, the initial value for the optimizer could 222 | # be supplied in the call to mle; two examples are 223 | 224 | mle(mlogL, start=list(theta=1)) 225 | mle(mlogL, start=list(theta=mean(y))) 226 | ``` 227 | 228 | ## Application: Evaluating an Expected Value 229 | 230 | ```{r } 231 | y <- VGAM::rpareto(1000, scale=1, shape=3) 232 | hist(y, prob=TRUE, breaks="scott", main="", ylim=c(0,3)) 233 | curve(VGAM::dpareto(x, scale=1, shape=3), add=TRUE) 234 | 235 | a <- 1 236 | s <- 1 237 | b <- 0.5 238 | ``` 239 | 240 | ## Example 13.11 (Numerical integration) 241 | 242 | ```{r } 243 | f1 <- function(x, y, s, a, b) { 244 | # the integrand function: |y-x|^b f(x) 245 | (abs(y-x))^b * a * s^a / x^(a+1) 246 | } 247 | 248 | integrate(f1, lower=s, upper=Inf, y=2, s=1, a=1, b=b) 249 | ``` 250 | 251 | ## Example 13.12 (Direct evaluation) 252 | 253 | ```{r } 254 | g2 <- function(y, s, b) { 255 | # Compute E|y-X|^b for Pareto(I) with a=1 256 | y0 <- (y - s)/y 257 | c0 <- gsl::hyperg_2F1(b+1, 1, b+2, y0) 258 | ((y - s)^b) - s*b*(y^(b-1)) * 259 | ((y0^b)/b + c0*(y0^(b+1))/(b+1)) + 260 | s*y^(b-1) * beta(b+1,1-b) 261 | } 262 | 263 | g2(2, s, b) 264 | ``` 265 | 266 | ## Example 13.13 (Plot of expected distance function) 267 | 268 | ```{r } 269 | p <- c(ppoints(50)/100, ppoints(50)) 270 | x <- VGAM::qpareto(p, scale=s, shape=1) 271 | ex <- g2(x, s=s, b=b) 272 | plot(log(x), log(ex), cex=.25, type="l") 273 | for (i in 1:length(x)) { 274 | y <- x[i] 275 | zi <- integrate(f1, lower=s, upper=Inf, 276 | subdivisions=200, rel.tol=.Machine$double.eps^.5, 277 | stop.on.error=FALSE, y=y, s=s, a=1, b=b) 278 | if (zi$message == "OK") 279 | points(log(y), log(zi$value), col=2, cex=.25) 280 | else print(paste("y=", y, zi$message)) 281 | } 282 | 283 | integrate(f1, lower=s, upper=Inf, y=s, s=s, a=1, b=b) 284 | g2(s, s, b) 285 | ``` 286 | 287 | -------------------------------------------------------------------------------- /examples-Rmd/SCR2e-chOptim.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Code for SCR 2e Examples" 3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. 
https://github.com/mariarizzo/SCR2e" 4 | output: 5 | html_document: 6 | code_folding: show 7 | theme: lumen 8 | toc: yes 9 | toc_float: yes 10 | df_print: paged 11 | --- 12 | 13 | # Chapter 14: Optimization 14 | 15 | ## Example 14.1 (One-dimensional optimization with optimize) 16 | 17 | ```{r } 18 | f <- function(x) 19 | log(x + log(x))/log(1+x) 20 | 21 | curve(f(x), from = 2, to = 15, ylab = "f(x)") 22 | 23 | optimize(f, lower = 4, upper = 8, maximum = TRUE) 24 | ``` 25 | 26 | ## Example 14.2 (MLE: Gamma distribution) 27 | 28 | ```{r } 29 | m <- 20000 30 | est <- matrix(0, m, 2) 31 | n <- 200 32 | r <- 5 33 | lambda <- 2 34 | 35 | obj <- function(lambda, xbar, logx.bar) { 36 | digamma(lambda * xbar) - logx.bar - log(lambda) 37 | } 38 | 39 | for (i in 1:m) { 40 | x <- rgamma(n, shape=r, rate=lambda) 41 | xbar <- mean(x) 42 | u <- uniroot(obj, lower = .001, upper = 10e5, 43 | xbar = xbar, logx.bar = mean(log(x))) 44 | lambda.hat <- u$root 45 | r.hat <- xbar * lambda.hat 46 | est[i, ] <- c(r.hat, lambda.hat) 47 | } 48 | 49 | ML <- colMeans(est) 50 | 51 | hist(est[, 1], breaks="scott", freq=FALSE, 52 | xlab="r", main="") 53 | points(ML[1], 0, cex=1.5, pch=20) 54 | hist(est[, 2], breaks="scott", freq=FALSE, 55 | xlab=bquote(lambda), main="") 56 | points(ML[2], 0, cex=1.5, pch=20) 57 | ``` 58 | 59 | ## Example 14.3 (Two-dimensional optimization with optim) 60 | 61 | ```{r } 62 | LL <- function(theta, sx, slogx, n) { 63 | r <- theta[1] 64 | lambda <- theta[2] 65 | loglik <- n * r * log(lambda) + (r - 1) * slogx - 66 | lambda * sx - n * log(gamma(r)) 67 | - loglik 68 | } 69 | 70 | n <- 200 71 | r <- 5; lambda <- 2 72 | x <- rgamma(n, shape=r, rate=lambda) 73 | 74 | optim(c(1,1), LL, sx=sum(x), slogx=sum(log(x)), n=n) 75 | 76 | mlests <- replicate(20000, expr = { 77 | x <- rgamma(200, shape = 5, rate = 2) 78 | optim(c(1,1), LL, sx=sum(x), slogx=sum(log(x)), n=n)$par 79 | }) 80 | colMeans(t(mlests)) 81 | ``` 82 | 83 | ## Example 14.4 (MLE for a quadratic form) 84 | 85 | ```{r } 86 | LL <- function(lambda, y) { 87 | lambda3 <- 1 - sum(lambda) 88 | f1 <- dgamma(y, shape=1/2, rate=1/(2*lambda[1])) 89 | f2 <- dgamma(y, shape=1/2, rate=1/(2*lambda[2])) 90 | f3 <- dgamma(y, shape=1/2, rate=1/(2*lambda3)) 91 | f <- f1/3 + f2/3 + f3/3 #density of mixture 92 | #returning -loglikelihood 93 | return( -sum(log(f))) 94 | } 95 | 96 | set.seed(543) 97 | m <- 2000 98 | lambda <- c(.6, .25, .15) #rate is 1/(2lambda) 99 | lam <- sample(lambda, size = 2000, replace = TRUE) 100 | y <- rgamma(m, shape = .5, rate = 1/(2*lam)) 101 | 102 | opt <- optim(c(.5,.3), LL, y=y) 103 | theta <- c(opt$par, 1 - sum(opt$par)) 104 | 105 | as.data.frame(unlist(opt)) 106 | 107 | theta 108 | ``` 109 | 110 | ## Example 14.5 (EM algorithm for a mixture model) 111 | 112 | ```{r } 113 | set.seed(543) 114 | lambda <- c(.6, .25, .15) #rate is 1/(2lambda) 115 | lam <- sample(lambda, size = 2000, replace = TRUE) 116 | y <- rgamma(m, shape = .5, rate = 1/(2*lam)) 117 | 118 | N <- 10000 #max. number of iterations 119 | L <- c(.5, .4, .1) #initial est. 
for lambdas 120 | tol <- .Machine$double.eps^0.5 121 | L.old <- L + 1 122 | 123 | for (j in 1:N) { 124 | f1 <- dgamma(y, shape=1/2, rate=1/(2*L[1])) 125 | f2 <- dgamma(y, shape=1/2, rate=1/(2*L[2])) 126 | f3 <- dgamma(y, shape=1/2, rate=1/(2*L[3])) 127 | py <- f1 / (f1 + f2 + f3) #posterior prob y from 1 128 | qy <- f2 / (f1 + f2 + f3) #posterior prob y from 2 129 | ry <- f3 / (f1 + f2 + f3) #posterior prob y from 3 130 | 131 | mu1 <- sum(y * py) / sum(py) #update means 132 | mu2 <- sum(y * qy) / sum(qy) 133 | mu3 <- sum(y * ry) / sum(ry) 134 | L <- c(mu1, mu2, mu3) #update lambdas 135 | L <- L / sum(L) 136 | 137 | if (sum(abs(L - L.old)/L.old) < tol) break 138 | L.old <- L 139 | } 140 | 141 | print(list(lambda = L/sum(L), iter = j, tol = tol)) 142 | ``` 143 | 144 | ## Example 14.6 (Simplex algorithm) 145 | 146 | ```{r } 147 | library(boot) #for simplex function 148 | A1 <- rbind(c(-2, 1, 1), c(4, -1, 3)) 149 | b1 <- c(1, 3) 150 | a <- c(2, 2, 3) 151 | simplex(a = a, A1 = A1, b1 = b1, maxi = TRUE) 152 | detach(package:boot) 153 | ``` 154 | 155 | ## Example 14.7 (Solving the Morra game) 156 | 157 | ```{r } 158 | solve.game <- function(A) { 159 | #solve the two player zero-sum game by simplex method 160 | #optimize for player 1, then player 2 161 | #maximize v subject to ... 162 | #let x strategies 1:m, and put v as extra variable 163 | #A1, the <= constraints 164 | # 165 | min.A <- min(A) 166 | A <- A - min.A #so that v >= 0 167 | max.A <- max(A) 168 | A <- A / max(A) 169 | m <- nrow(A) 170 | n <- ncol(A) 171 | it <- n^3 172 | a <- c(rep(0, m), 1) #objective function 173 | A1 <- -cbind(t(A), rep(-1, n)) #constraints <= 174 | b1 <- rep(0, n) 175 | A3 <- t(as.matrix(c(rep(1, m), 0))) #constraints sum(x)=1 176 | b3 <- 1 177 | sx <- simplex(a=a, A1=A1, b1=b1, A3=A3, b3=b3, 178 | maxi=TRUE, n.iter=it) 179 | #the 'solution' is [x1,x2,...,xm | value of game] 180 | # 181 | #minimize v subject to ... 182 | #let y strategies 1:n, with v as extra variable 183 | a <- c(rep(0, n), 1) #objective function 184 | A1 <- cbind(A, rep(-1, m)) #constraints <= 185 | b1 <- rep(0, m) 186 | A3 <- t(as.matrix(c(rep(1, n), 0))) #constraints sum(y)=1 187 | b3 <- 1 188 | sy <- simplex(a=a, A1=A1, b1=b1, A3=A3, b3=b3, 189 | maxi=FALSE, n.iter=it) 190 | 191 | soln <- list("A" = A * max.A + min.A, 192 | "x" = sx$soln[1:m], 193 | "y" = sy$soln[1:n], 194 | "v" = sx$soln[m+1] * max.A + min.A) 195 | soln 196 | } 197 | 198 | 199 | #enter the payoff matrix 200 | A <- matrix(c( 0,-2,-2,3,0,0,4,0,0, 201 | 2,0,0,0,-3,-3,4,0,0, 202 | 2,0,0,3,0,0,0,-4,-4, 203 | -3,0,-3,0,4,0,0,5,0, 204 | 0,3,0,-4,0,-4,0,5,0, 205 | 0,3,0,0,4,0,-5,0,-5, 206 | -4,-4,0,0,0,5,0,0,6, 207 | 0,0,4,-5,-5,0,0,0,6, 208 | 0,0,4,0,0,5,-6,-6,0), 9, 9) 209 | 210 | library(boot) #needed for simplex function 211 | 212 | s <- solve.game(A) 213 | round(cbind(s$x, s$y), 7) 214 | ``` 215 | 216 | -------------------------------------------------------------------------------- /examples-Rmd/SCR2e-chPerm.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Code for SCR 2e Examples" 3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. 
https://github.com/mariarizzo/SCR2e" 4 | output: 5 | html_document: 6 | code_folding: show 7 | theme: lumen 8 | toc: yes 9 | toc_float: yes 10 | df_print: paged 11 | --- 12 | 13 | # Chapter 10: Permutation Tests 14 | 15 | ## Example 10.1 (Permutation distribution of a statistic) 16 | 17 | ```{r } 18 | attach(chickwts) 19 | x <- sort(weight[feed == "soybean"]) 20 | y <- sort(weight[feed == "linseed"]) 21 | detach(chickwts) 22 | 23 | R <- 999 #number of replicates 24 | z <- c(x, y) #pooled sample 25 | K <- 1:26 26 | reps <- numeric(R) #storage for replicates 27 | t0 <- t.test(x, y)$statistic 28 | 29 | for (i in 1:R) { 30 | #generate indices k for the first sample 31 | k <- sample(K, size = 14, replace = FALSE) 32 | x1 <- z[k] 33 | y1 <- z[-k] #complement of x1 34 | reps[i] <- t.test(x1, y1)$statistic 35 | } 36 | p <- mean(c(t0, reps) >= t0) 37 | p 38 | 39 | hist(reps, main = "", freq = FALSE, xlab = "T (p = 0.202)", 40 | breaks = "scott") 41 | points(t0, 0, cex = 1, pch = 16) #observed T 42 | ``` 43 | 44 | ## Example 10.2 (Permutation distribution of the K-S statistic) 45 | 46 | ```{r } 47 | # continues Example 10.1 48 | R <- 999 #number of replicates 49 | z <- c(x, y) #pooled sample 50 | K <- 1:26 51 | D <- numeric(R) #storage for replicates 52 | options(warn = -1) 53 | D0 <- ks.test(x, y, exact = FALSE)$statistic 54 | for (i in 1:R) { 55 | #generate indices k for the first sample 56 | k <- sample(K, size = 14, replace = FALSE) 57 | x1 <- z[k] 58 | y1 <- z[-k] #complement of x1 59 | D[i] <- ks.test(x1, y1, exact = FALSE)$statistic 60 | } 61 | p <- mean(c(D0, D) >= D0) 62 | options(warn = 0) 63 | p 64 | 65 | hist(D, main = "", freq = FALSE, xlab = "D (p = 0.46)", 66 | breaks = "scott") 67 | points(D0, 0, cex = 1, pch = 16) #observed D 68 | ``` 69 | 70 | ## Example 10.3 (Example 10.2, cont.) 
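This chunk reuses `R`, `K`, `D`, and the pooled vector `z` from Example 10.2, while `x` and `y` are redrawn below as the sunflower and linseed groups (12 observations each in `chickwts`). For the permutation replicates to be computed from the newly drawn samples, the pooled vector should be re-formed after the new `x` and `y` are created; a minimal sketch of that setup (not run here; the sizes come from the `chickwts` group counts):

```{r eval=FALSE}
z <- c(x, y)   #re-pool the new samples: 12 sunflower + 12 linseed
K <- 1:24      #indices of the pooled sample
k <- sample(K, size = 12, replace = FALSE)   #first sample has 12 obs.
```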
71 | 72 | ```{r } 73 | attach(chickwts) 74 | x <- sort(weight[feed == "sunflower"]) 75 | y <- sort(weight[feed == "linseed"]) 76 | detach(chickwts) 77 | 78 | summary(cbind(x, y)) 79 | options(warn = -1) 80 | D0 <- ks.test(x, y, exact = FALSE)$statistic 81 | for (i in 1:R) { 82 | #generate indices k for the first sample 83 | k <- sample(K, size = 14, replace = FALSE) 84 | x1 <- z[k] 85 | y1 <- z[-k] #complement of x1 86 | D[i] <- ks.test(x1, y1, exact = FALSE)$statistic 87 | } 88 | p <- mean(c(D0, D) >= D0) 89 | options(warn = 0) 90 | p 91 | ``` 92 | 93 | ## Example 10.4 (Finding nearest neighbors) 94 | ## using yaImpute::ann 95 | 96 | ```{r } 97 | set.seed(439) 98 | library(yaImpute) #for ann function 99 | 100 | #generate a small multivariate data set 101 | x <- matrix(rnorm(12), 3, 4) 102 | y <- matrix(rnorm(12), 3, 4) 103 | z <- rbind(x, y) 104 | k <- nrow(z) #number of nearest neighbors desired 105 | 106 | ## Do an exact kd-tree search 107 | kd.exact <- ann(ref=z, target=z, 108 | tree.type="kd", k=k, verbose=FALSE) 109 | kd.exact$knnIndexDist[,1:k] #NN indices 110 | round(sqrt(kd.exact$knnIndexDist[,-(1:k)]),2) #Euclidean distances 111 | 112 | ## Do an approximate kd-tree search 113 | kd.approx <- ann(ref=z, target=z, 114 | tree.type="kd", k=k, eps=100, verbose=FALSE) 115 | kd.approx$knnIndexDist[,1:k] #NN indices 116 | detach(package:yaImpute) 117 | 118 | 119 | ### Example 10.5 (Nearest neighbor statistic) 120 | ### using yaImpute::ann 121 | 122 | library(yaImpute) 123 | attach(chickwts) 124 | x <- weight[feed == "sunflower"] 125 | y <- weight[feed == "linseed"] 126 | z <- as.matrix(c(x, y)) 127 | detach(chickwts) 128 | 129 | k <- 4 #want first 3 nearest neighbors 130 | NN <- ann(ref=z, target=z, tree.type="kd", k=k, verbose=FALSE) 131 | idx <- NN$knnIndexDist[,1:k] 132 | nn.idx <- idx[,-1] #first NN is in column 2 133 | 134 | block1 <- nn.idx[1:12, ] 135 | block2 <- nn.idx[13:24, ] 136 | i1 <- sum(block1 < 12.5) 137 | i2 <- sum(block2 > 12.5) 138 | 139 | c(i1, i2) 140 | 141 | detach(package:yaImpute) 142 | ``` 143 | 144 | ## Example 10.6 (Nearest neighbor test) 145 | ## using yaImpute::ann 146 | 147 | ```{r } 148 | library(boot) 149 | #continues the previous example 150 | 151 | ## function to return the matrix of indices NN_j of nearest neighbors 152 | NN.idx <- function(x, tree.type="kd", k=NROW(x)) { 153 | x <- as.matrix(x) 154 | k <- min(c(k+1, NROW(x))) 155 | NN <- yaImpute::ann(ref=x, target=x, 156 | tree.type="kd", k=k, verbose=FALSE) 157 | idx <- NN$knnIndexDist[,1:k] 158 | nn.idx <- idx[,-1] #first NN is in column 2 159 | row.names(nn.idx) <- idx[,1] 160 | nn.idx 161 | } 162 | 163 | ## function to compute the NN statistic T(n,3) 164 | Tn3 <- function(z, ix=1:NROW(z), sizes) { 165 | z <- as.matrix(z) 166 | n1 <- sizes[1] 167 | n2 <- sizes[2] 168 | n <- n1 + n2 169 | z <- as.matrix(z[ix, ]) 170 | nn.idx <- NN.idx(z, k=3) 171 | block1 <- nn.idx[1:n1, ] 172 | block2 <- nn.idx[(n1+1):n, ] 173 | i1 <- sum(block1 < n1 + .5) 174 | i2 <- sum(block2 > n1 + .5) 175 | return((i1 + i2) / (3 * n)) 176 | } 177 | 178 | attach(chickwts) 179 | x <- weight[feed == "sunflower"] 180 | y <- weight[feed == "linseed"] 181 | z <- c(x, y) 182 | detach(chickwts) 183 | 184 | N <- c(NROW(x), NROW(y)) 185 | 186 | boot.obj <- boot(data = z, statistic = Tn3, 187 | sim = "permutation", R = 999, sizes = N) 188 | boot.obj 189 | 190 | tb <- c(boot.obj$t, boot.obj$t0) 191 | mean(tb >= boot.obj$t0) 192 | 193 | hist(tb, freq=FALSE, main="", 194 | xlab="replicates of T(n,3) statistic") 195 | points(boot.obj$t0, 0, 
cex=1, pch=16) 196 | ``` 197 | 198 | ## Example 10.7 (Two-sample energy statistic) 199 | 200 | ```{r } 201 | edist.2 <- function(x, ix, sizes) { 202 | # computes the e-statistic between 2 samples 203 | # x: Euclidean distances of pooled sample 204 | # sizes: vector of sample sizes 205 | # ix: a permutation of row indices of x 206 | 207 | dst <- x 208 | n1 <- sizes[1] 209 | n2 <- sizes[2] 210 | ii <- ix[1:n1] 211 | jj <- ix[(n1+1):(n1+n2)] 212 | w <- n1 * n2 / (n1 + n2) 213 | 214 | # permutation applied to rows & cols of dist. matrix 215 | m11 <- sum(dst[ii, ii]) / (n1 * n1) 216 | m22 <- sum(dst[jj, jj]) / (n2 * n2) 217 | m12 <- sum(dst[ii, jj]) / (n1 * n2) 218 | e <- w * ((m12 + m12) - (m11 + m22)) 219 | return (e) 220 | } 221 | 222 | d <- 3 223 | a <- 2 / sqrt(d) 224 | x <- matrix(rnorm(20 * d), nrow = 20, ncol = d) 225 | y <- matrix(rnorm(10 * d, a, 1), nrow = 10, ncol = d) 226 | z <- rbind(x, y) 227 | dst <- as.matrix(dist(z)) 228 | 229 | edist.2(dst, 1:30, sizes = c(20, 10)) 230 | ``` 231 | 232 | ## Example 10.8 (Two-sample energy test) 233 | 234 | ```{r } 235 | library(boot) #for boot function 236 | dst <- as.matrix(dist(z)) 237 | N <- c(20, 10) 238 | 239 | boot.obj <- boot(data = dst, statistic = edist.2, 240 | sim = "permutation", R = 999, sizes = N) 241 | boot.obj 242 | 243 | #calculate the ASL 244 | e <- boot.obj$t0 245 | tb <- c(e, boot.obj$t) 246 | mean(tb >= e) 247 | 248 | hist(tb, main = "", breaks="scott", freq=FALSE, 249 | xlab="Replicates of e") 250 | points(e, 0, cex=1, pch=16) 251 | 252 | 253 | #energy test applied under F=G 254 | d <- 3 255 | a <- 0 256 | x <- matrix(rnorm(20 * d), nrow = 20, ncol = d) 257 | y <- matrix(rnorm(10 * d, a, 1), nrow = 10, ncol = d) 258 | z <- rbind(x, y) 259 | dst <- as.matrix(dist(z)) 260 | 261 | N <- c(20, 10) 262 | dst <- as.matrix(dist(z)) 263 | boot.obj <- boot(data = dst, statistic = edist.2, 264 | sim="permutation", R=999, sizes=N) 265 | boot.obj 266 | 267 | #calculate the ASL 268 | e <- boot.obj$t0 269 | E <- c(boot.obj$t, e) 270 | mean(E >= e) 271 | 272 | hist(E, main = "", breaks="scott", 273 | xlab="Replicates of e", freq=FALSE) 274 | points(e, 0, cex=1, pch=16) 275 | ``` 276 | 277 | ## Example 10.9 (k-sample energy distances) 278 | 279 | ```{r } 280 | z <- iris[ , 1:4] 281 | dst <- dist(z) 282 | energy::edist(dst, sizes = c(50, 50, 50), distance = TRUE) 283 | ``` 284 | 285 | ## Example 10.10 (Distance Components (disco)) 286 | 287 | ```{r } 288 | set.seed(413) 289 | energy::disco(iris[ , 1:4], factors = iris$Species, R = 999) 290 | ``` 291 | 292 | ## Example 10.11 (Power Comparison) 293 | 294 | ```{r } 295 | # results of several simulations summarized in Table 296 | ``` 297 | 298 | ## Example 10.12 (Distance covariance statistic) 299 | 300 | ```{r } 301 | dCov <- function(x, y) { 302 | x <- as.matrix(x) 303 | y <- as.matrix(y) 304 | n <- nrow(x) 305 | m <- nrow(y) 306 | if (n != m || n < 2) stop("Sample sizes must agree") 307 | if (! 
(all(is.finite(c(x, y))))) 308 | stop("Data contains missing or infinite values") 309 | 310 | Akl <- function(x) { 311 | d <- as.matrix(dist(x)) 312 | m <- rowMeans(d) 313 | M <- mean(d) 314 | a <- sweep(d, 1, m) 315 | b <- sweep(a, 2, m) 316 | return(b + M) 317 | } 318 | A <- Akl(x) 319 | B <- Akl(y) 320 | dCov <- sqrt(mean(A * B)) 321 | dCov 322 | } 323 | 324 | z <- as.matrix(iris[1:50, 1:4]) 325 | x <- z[ , 1:2] 326 | y <- z[ , 3:4] 327 | # compute the observed statistic 328 | dCov(x, y) 329 | ``` 330 | 331 | ## Example 10.13 (Distance correlation statistic) 332 | 333 | ```{r } 334 | DCOR <- function(x, y) { 335 | x <- as.matrix(x) 336 | y <- as.matrix(y) 337 | n <- nrow(x) 338 | m <- nrow(y) 339 | if (n != m || n < 2) stop("Sample sizes must agree") 340 | if (! (all(is.finite(c(x, y))))) 341 | stop("Data contains missing or infinite values") 342 | Akl <- function(x) { 343 | d <- as.matrix(dist(x)) 344 | m <- rowMeans(d) 345 | M <- mean(d) 346 | a <- sweep(d, 1, m) 347 | b <- sweep(a, 2, m) 348 | return(b + M) 349 | } 350 | A <- Akl(x) 351 | B <- Akl(y) 352 | dCov <- sqrt(mean(A * B)) 353 | dVarX <- sqrt(mean(A * A)) 354 | dVarY <- sqrt(mean(B * B)) 355 | dCor <- sqrt(dCov / sqrt(dVarX * dVarY)) 356 | list(dCov=dCov, dCor=dCor, dVarX=dVarX, dVarY=dVarY) 357 | } 358 | 359 | z <- as.matrix(iris[1:50, 1:4]) 360 | x <- z[ , 1:2] 361 | y <- z[ , 3:4] 362 | 363 | DCOR(x, y) 364 | ``` 365 | 366 | ## Example 10.14 (Distance covariance test) 367 | 368 | ```{r } 369 | ndCov2 <- function(z, ix, dims) { 370 | p <- dims[1] 371 | q1 <- p + 1 372 | d <- p + dims[2] 373 | x <- z[ , 1:p] #leave x as is 374 | y <- z[ix, q1:d] #permute rows of y 375 | return(nrow(z) * dCov(x, y)^2) 376 | } 377 | 378 | library(boot) 379 | z <- as.matrix(iris[1:50, 1:4]) 380 | boot.obj <- boot(data = z, statistic = ndCov2, R = 999, 381 | sim = "permutation", dims = c(2, 2)) 382 | 383 | tb <- c(boot.obj$t0, boot.obj$t) 384 | hist(tb, nclass="scott", xlab="", main="", 385 | freq=FALSE) 386 | points(boot.obj$t0, 0, cex=1, pch=16) 387 | 388 | mean(tb >= boot.obj$t0) 389 | boot.obj 390 | ``` 391 | 392 | -------------------------------------------------------------------------------- /examples-Rmd/SCR2e-chRandomProcess.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Code for SCR 2e Examples" 3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e" 4 | output: 5 | html_document: 6 | code_folding: show 7 | theme: lumen 8 | toc: yes 9 | toc_float: yes 10 | df_print: paged 11 | --- 12 | 13 | 14 | # Chapter 4: Generating Random Processes 15 | 16 | ## Example 4.1 (Poisson process) 17 | 18 | ```{r } 19 | lambda <- 2 20 | t0 <- 3 21 | Tn <- rexp(100, lambda) #interarrival times 22 | Sn <- cumsum(Tn) #arrival times 23 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0] 24 | ``` 25 | 26 | ## Example 4.2 (Poisson process, cont.) 
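This simulation uses the fact that, conditional on the number of arrivals N in [0, upper], the arrival times of a homogeneous Poisson process are distributed as the order statistics of N i.i.d. Uniform(0, upper) variables. As a quick reference point (this check is not part of the book's code), N(t0) has a Poisson(lambda*t0) distribution, so the simulated mean and variance below should both be close to lambda*t0 = 6:

```{r}
# sanity check: moments of N(t0) when lambda = 2, t0 = 3
sim <- rpois(10000, 2 * 3)
c(mean(sim), var(sim))   #both approximately 6
```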
27 | 28 | ```{r } 29 | lambda <- 2 30 | t0 <- 3 31 | upper <- 100 32 | pp <- numeric(10000) 33 | for (i in 1:10000) { 34 | N <- rpois(1, lambda * upper) 35 | Un <- runif(N, 0, upper) #unordered arrival times 36 | Sn <- sort(Un) #arrival times 37 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0] 38 | pp[i] <- n - 1 #arrivals in [0, t0] 39 | } 40 | 41 | #alternately, the loop can be replaced by replicate function 42 | pp <- replicate(10000, expr = { 43 | N <- rpois(1, lambda * upper) 44 | Un <- runif(N, 0, upper) #unordered arrival times 45 | Sn <- sort(Un) #arrival times 46 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0] 47 | n - 1 }) #arrivals in [0, t0] 48 | 49 | c(mean(pp), var(pp)) 50 | ``` 51 | 52 | ## Example 4.3 (Nonhomogeneous Poisson process) 53 | 54 | ```{r } 55 | lambda <- 3 56 | upper <- 100 57 | N <- rpois(1, lambda * upper) 58 | Tn <- rexp(N, lambda) 59 | Sn <- cumsum(Tn) 60 | Un <- runif(N) 61 | keep <- (Un <= cos(Sn)^2) #indicator, as logical vector 62 | Sn[keep] 63 | 64 | round(Sn[keep], 4) 65 | ``` 66 | 67 | ## Example 4.4 (Renewal process) 68 | 69 | ```{r } 70 | t0 <- 5 71 | Tn <- rgeom(100, prob = .2) #interarrival times 72 | Sn <- cumsum(Tn) #arrival times 73 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0] 74 | 75 | Nt0 <- replicate(1000, expr = { 76 | Sn <- cumsum(rgeom(100, prob = .2)) 77 | min(which(Sn > t0)) - 1 78 | }) 79 | table(Nt0)/1000 80 | Nt0 81 | 82 | t0 <- seq(0.1, 30, .1) 83 | mt <- numeric(length(t0)) 84 | 85 | for (i in 1:length(t0)) { 86 | mt[i] <- mean(replicate(1000, 87 | { 88 | Sn <- cumsum(rgeom(100, prob = .2)) 89 | min(which(Sn > t0[i])) - 1 90 | })) 91 | } 92 | plot(t0, mt, type = "l", xlab = "t", ylab = "mean") 93 | abline(0, .25) 94 | ``` 95 | 96 | ## Example 4.5 (Symmetric random walk) 97 | 98 | ```{r } 99 | n <- 400 100 | incr <- sample(c(-1, 1), size = n, replace = TRUE) 101 | S <- as.integer(c(0, cumsum(incr))) 102 | plot(0:n, S, type = "l", main = "", xlab = "i") 103 | ``` 104 | 105 | ## Example 4.6 (Generator for the time until return to origin) 106 | 107 | ```{r } 108 | set.seed(12345) 109 | 110 | #compute the probabilities directly 111 | n <- 1:10000 112 | p2n <- exp(lgamma(2*n-1) 113 | - log(n) - (2*n-1)*log(2) - 2*lgamma(n)) 114 | #or compute using dbinom 115 | P2n <- (.5/n) * dbinom(n-1, size = 2*n-2, prob = 0.5) 116 | pP2n <- cumsum(P2n) 117 | 118 | #given n compute the time of the last return to 0 in (0,n] 119 | n <- 200 120 | sumT <- 0 121 | while (sumT <= n) { 122 | u <- runif(1) 123 | s <- sum(u > pP2n) 124 | if (s == length(pP2n)) 125 | warning("T is truncated") 126 | Tj <- 2 * (1 + s) 127 | #print(c(Tj, sumT)) 128 | sumT <- sumT + Tj 129 | } 130 | sumT - Tj 131 | ``` 132 | 133 | ## Example 4.7 (Brownian motion) 134 | 135 | ```{r } 136 | simBM <- function(n, T) { 137 | times <- seq(0, T, length = n+1) 138 | z <- rnorm(n) 139 | w <- rep(0, n) 140 | s <- sqrt(diff(times)) 141 | for (k in 2:n) { 142 | w[k] <- w[k-1] + s[k] * z[k] 143 | } 144 | return (list(w=w, t=times)) 145 | } 146 | 147 | set.seed(1) 148 | n <- 200 149 | x1 <- simBM(n, 1) 150 | x2 <- simBM(n, 1) 151 | x3 <- simBM(n, 1) 152 | r <- range(c(x1$w, x2$w, x3$w)) 153 | plot(x1$w, type="l", main="", xlab="t", ylab="W", ylim=r) 154 | lines(x2$w, lty=2) 155 | lines(x3$w, lty=3) 156 | 157 | interpBM <- function(w, t0, times) { 158 | k1 <- sum(times < t0) 159 | k <- k1 + 1 160 | b <- (t0 - times[k1]) / (times[k] - times[k1]) 161 | return (w[k1] + b * (w[k] - w[k1])) 162 | } 163 | 164 | plot(x1$t[1:10], x1$w[1:10], type="b", main="", xlab="t", ylab="W") 165 | tmids <- 
x1$t + 0.0025 166 | for (i in 1:10) { 167 | w <- interpBM(x1$w, tmids[i], x1$t) 168 | points(tmids[i], w, pch=2) 169 | } 170 | 171 | legend("topleft", c("Generated W", "Interpolated W"), pch=c(1,2), lty=1, bty="n") 172 | ``` 173 | 174 | -------------------------------------------------------------------------------- /examples-Rmd/SCR2e-chVis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Code for SCR 2e Examples" 3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e" 4 | output: 5 | html_document: 6 | code_folding: show 7 | theme: lumen 8 | toc: yes 9 | toc_float: yes 10 | df_print: paged 11 | --- 12 | 13 | # Chapter 5: Visualization of Multivariate Data 14 | 15 | ## Example 5.1 (Scatterplot matrix) 16 | 17 | ```{r } 18 | data(iris) 19 | #virginica data in first 4 columns of the last 50 obs. 20 | 21 | # not shown in text 22 | pairs(iris[101:150, 1:4]) 23 | 24 | panel.d <- function(x, ...) { 25 | usr <- par("usr") 26 | on.exit(par(usr=usr)) 27 | par(usr = c(usr[1:2], 0, .5)) 28 | lines(density(x)) 29 | } 30 | 31 | # Fig. 5.1 32 | x <- scale(iris[101:150, 1:4]) 33 | r <- range(x) 34 | pairs(x, diag.panel = panel.d, xlim = r, ylim = r) 35 | 36 | library(lattice) 37 | splom(iris[101:150, 1:4]) #plot 1 38 | 39 | #for all 3 at once, in color, plot 2 40 | splom(iris[,1:4], groups = iris$Species) 41 | 42 | # Fig. 5.2 43 | #for all 3 at once, black and white, plot 3 44 | splom(~iris[1:4], groups = Species, data = iris, 45 | col = 1, pch = c(1, 2, 3), cex = c(.5,.5,.5)) 46 | ``` 47 | 48 | ## Example 5.2 (Correlation plots: Decathlon data) 49 | 50 | ```{r } 51 | library(FactoMineR) #decathlon data 52 | library(corrplot) 53 | data("decathlon") 54 | str(decathlon) 55 | 56 | corrMat <- cor(decathlon[, 1:10]) 57 | corrplot(corrMat, type="upper", tl.col="black", tl.srt=45) 58 | corrplot(corrMat, type = "upper", method = "square", 59 | addCoef.col = "black", diag=FALSE) 60 | ``` 61 | 62 | ## Example 5.3 (Plot bivariate normal density) 63 | 64 | ```{r } 65 | #the standard BVN density 66 | f <- function(x,y) { 67 | z <- (1/(2*pi)) * exp(-.5 * (x^2 + y^2)) 68 | } 69 | 70 | y <- x <- seq(-3, 3, length= 50) 71 | z <- outer(x, y, f) #compute density for all (x,y) 72 | 73 | persp(x, y, z) #the default plot 74 | 75 | persp(x, y, z, theta = 45, phi = 30, expand = 0.6, 76 | ltheta = 120, shade = 0.75, ticktype = "detailed", 77 | xlab = "X", ylab = "Y", zlab = "f(x, y)") 78 | ``` 79 | 80 | ## Example 5.4 (Add elements to perspective plot) 81 | 82 | ```{r } 83 | #store viewing transformation in M 84 | persp(x, y, z, theta = 45, phi = 30, 85 | expand = .4, box = FALSE) -> M 86 | 87 | #add some points along a circle 88 | a <- seq(-pi, pi, pi/16) 89 | newpts <- cbind(cos(a), sin(a)) * 2 90 | newpts <- cbind(newpts, 0, 1) #z=0, t=1 91 | N <- newpts %*% M 92 | points(N[,1]/N[,4], N[,2]/N[,4], col=2) 93 | 94 | #add lines 95 | x2 <- seq(-3, 3, .1) 96 | y2 <- -x2^2 / 3 97 | z2 <- dnorm(x2) * dnorm(y2) 98 | N <- cbind(x2, y2, z2, 1) %*% M 99 | lines(N[,1]/N[,4], N[,2]/N[,4], col=4) 100 | 101 | #add text 102 | x3 <- c(0, 3.1) 103 | y3 <- c(0, -3.1) 104 | z3 <- dnorm(x3) * dnorm(y3) * 1.1 105 | N <- cbind(x3, y3, z3, 1) %*% M 106 | text(N[1,1]/N[1,4], N[1,2]/N[1,4], "f(x,y)") 107 | text(N[2,1]/N[2,4], N[2,2]/N[2,4], bquote(y==-x^2/3)) 108 | ``` 109 | 110 | ## Example 5.5 (Surface plot using wireframe(lattice)) 111 | 112 | ```{r } 113 | library(lattice) 114 | x <- y <- seq(-3, 3, length= 50) 115 
| 116 | xy <- expand.grid(x, y) 117 | z <- (1/(2*pi)) * exp(-.5 * (xy[,1]^2 + xy[,2]^2)) 118 | wireframe(z ~ xy[,1] * xy[,2]) 119 | ``` 120 | 121 | ## Example 5.6 (3D scatterplot) 122 | 123 | ```{r } 124 | library(lattice) 125 | attach(iris) 126 | #basic 3 color plot with arrows along axes 127 | print(cloud(Petal.Length ~ Sepal.Length * Sepal.Width, 128 | data=iris, groups=Species)) 129 | 130 | print(cloud(Sepal.Length ~ Petal.Length * Petal.Width, 131 | data = iris, groups = Species, main = "1", pch=1:3, 132 | scales = list(draw = FALSE), zlab = "SL", 133 | screen = list(z = 30, x = -75, y = 0)), 134 | split = c(1, 1, 2, 2), more = TRUE) 135 | 136 | print(cloud(Sepal.Width ~ Petal.Length * Petal.Width, 137 | data = iris, groups = Species, main = "2", pch=1:3, 138 | scales = list(draw = FALSE), zlab = "SW", 139 | screen = list(z = 30, x = -75, y = 0)), 140 | split = c(2, 1, 2, 2), more = TRUE) 141 | 142 | print(cloud(Petal.Length ~ Sepal.Length * Sepal.Width, 143 | data = iris, groups = Species, main = "3", pch=1:3, 144 | scales = list(draw = FALSE), zlab = "PL", 145 | screen = list(z = 30, x = -55, y = 0)), 146 | split = c(1, 2, 2, 2), more = TRUE) 147 | 148 | print(cloud(Petal.Width ~ Sepal.Length * Sepal.Width, 149 | data = iris, groups = Species, main = "4", pch=1:3, 150 | scales = list(draw = FALSE), zlab = "PW", 151 | screen = list(z = 30, x = -55, y = 0)), 152 | split = c(2, 2, 2, 2)) 153 | detach(iris) 154 | ``` 155 | 156 | ## Example 5.7 (Contour plot) 157 | 158 | ```{r } 159 | #contour plot with labels 160 | contour(volcano, asp = 1, labcex = 1) 161 | 162 | #another version from lattice package 163 | library(lattice) 164 | contourplot(volcano) #similar to above 165 | ``` 166 | 167 | ## Example 5.8 (Filled contour plots) 168 | 169 | ```{r } 170 | image(volcano, col = terrain.colors(100), axes = FALSE) 171 | contour(volcano, levels = seq(100,200,by = 10), add = TRUE) 172 | 173 | filled.contour(volcano, color = terrain.colors, asp = 1) 174 | levelplot(volcano, scales = list(draw = FALSE), 175 | xlab = "", ylab = "") 176 | ``` 177 | 178 | ## Example 5.9 (2D histogram) 179 | 180 | ```{r } 181 | library(hexbin) 182 | x <- matrix(rnorm(4000), 2000, 2) 183 | plot(hexbin(x[,1], x[,2])) 184 | 185 | # ggplot version 186 | library(ggplot2) 187 | x <- data.frame(x) 188 | ggplot(x, aes(x[,1], x[,2])) + geom_hex() 189 | ``` 190 | 191 | ## Example 5.10 (Andrews curves) 192 | 193 | ```{r } 194 | library(DAAG) 195 | attach(leafshape17) 196 | 197 | f <- function(a, v) { 198 | #Andrews curve f(a) for a data vector v in R^3 199 | v[1]/sqrt(2) + v[2]*sin(a) + v[3]*cos(a) 200 | } 201 | 202 | 203 | #scale data to range [-1, 1] 204 | x <- cbind(bladelen, petiole, bladewid) 205 | n <- nrow(x) 206 | mins <- apply(x, 2, min) #column minimums 207 | maxs <- apply(x, 2, max) #column maximums 208 | r <- maxs - mins #column ranges 209 | y <- sweep(x, 2, mins) #subtract column mins 210 | y <- sweep(y, 2, r, "/") #divide by range 211 | x <- 2 * y - 1 #now has range [-1, 1] 212 | 213 | #set up plot window, but plot nothing yet 214 | plot(0, 0, xlim = c(-pi, pi), ylim = c(-3,3), 215 | xlab = "t", ylab = "Andrews Curves", 216 | main = "", type = "n") 217 | 218 | #now add the Andrews curves for each observation 219 | #line type corresponds to leaf architecture 220 | #0=orthotropic, 1=plagiotropic 221 | a <- seq(-pi, pi, len=101) 222 | dim(a) <- length(a) 223 | for (i in 1:n) { 224 | g <- arch[i] + 1 225 | y <- apply(a, MARGIN = 1, FUN = f, v = x[i,]) 226 | lines(a, y, lty = g) 227 | } 228 | legend(3, c("Orthotropic", 
"Plagiotropic"), lty = 1:2) 229 | detach(leafshape17) 230 | ``` 231 | 232 | ## Example 5.11 (Parallel coordinates) 233 | 234 | ```{r } 235 | library(MASS) 236 | library(lattice) 237 | #trellis.device(color = FALSE) #black and white display 238 | x <- crabs[seq(5, 200, 5), ] #get every fifth obs. 239 | parallelplot(~x[4:8] | sp*sex, x) 240 | 241 | #trellis.device(color = FALSE) #black and white display 242 | x <- crabs[seq(5, 200, 5), ] #get every fifth obs. 243 | a <- x$CW * x$CL #area of carapace 244 | x[4:8] <- x[4:8] / sqrt(a) #adjust for size 245 | parallelplot(~x[4:8] | sp*sex, x) 246 | ``` 247 | 248 | ## Example 5.12 (Segment plot) 249 | 250 | ```{r } 251 | #segment plot 252 | x <- MASS::crabs[seq(5, 200, 5), ] #get every fifth obs. 253 | x <- subset(x, sex == "M") #keep just the males 254 | a <- x$CW * x$CL #area of carapace 255 | x[4:8] <- x[4:8] / sqrt(a) #adjust for size 256 | 257 | #use default color palette or other colors 258 | #palette(gray(seq(.4, .95, len = 5))) #use gray scale 259 | palette(rainbow(6)) #or use color 260 | stars(x[4:8], draw.segments = TRUE, 261 | labels =levels(x$sp), nrow = 4, 262 | ylim = c(-2,10), key.loc = c(3,-1)) 263 | 264 | #after viewing, restore the default colors 265 | palette("default") 266 | ``` 267 | 268 | ## Example 5.13 (PCA for open and closed book exams) 269 | 270 | ```{r } 271 | library(bootstrap) 272 | str(scor) 273 | pairs(scor) 274 | cor(scor) 275 | 276 | n <- nrow(scor) 277 | x <- scale(scor) #center and scale 278 | s <- cov(x) 279 | e <- eigen(s) 280 | lam <- e$values #vector of eigenvalues 281 | P <- e$vectors #matrix of eigenvectors 282 | 283 | plot(lam, type = "b", xlab = "eigenvalues", main = "") 284 | barplot(lam, xlab = "eigenvalues") 285 | 286 | tab <- rbind(lam / sum(lam), cumsum(lam) / sum(lam)) 287 | tab 288 | 289 | z <- x %*% P 290 | dim(z) 291 | head(z) 292 | 293 | pc <- prcomp(scor, center = TRUE, scale = TRUE) 294 | summary(pc) 295 | 296 | df <- scor[1:5, ] 297 | predict(pc, newdata = df) #same as z above 298 | 299 | head(x %*% pc$rotation, 5) 300 | head(pc$rotation) 301 | head(P) 302 | ``` 303 | 304 | ## Example 5.14 (PC Biplot) 305 | 306 | ```{r } 307 | ## plot scor data in the (PC1, PC2) coordinate system 308 | biplot(pc, pc.biplot = TRUE) 309 | round(cor(x, z), 3) 310 | ``` 311 | 312 | ## Example 5.15 (PCA: Decathlon data) 313 | 314 | ```{r } 315 | library(FactoMineR) 316 | data(decathlon) 317 | pc <- princomp(decathlon[,1:10], cor = TRUE, scores = TRUE) 318 | plot(pc) # screeplot 319 | biplot(pc) 320 | summary(pc) 321 | ``` 322 | 323 | -------------------------------------------------------------------------------- /examples-Rmd/SCR2e-examples-functions.R: -------------------------------------------------------------------------------- 1 | rlogarithmic <- function(n, theta) { 2 | # generate random sample from Logarithmic(theta) 3 | stopifnot(all(theta > 0 & theta < 1)) 4 | th <- rep(theta, length=n) 5 | u <- runif(n) 6 | v <- runif(n) 7 | x <- floor(1 + log(v) / log(1 - (1 - th)^u)) 8 | return(x) 9 | } 10 | 11 | # generate MVN by spectral decomposition method 12 | 13 | rmvn.eigen <- 14 | function(n, mu, Sigma) { 15 | # generate n random vectors from MVN(mu, Sigma) 16 | # dimension is inferred from mu and Sigma 17 | d <- length(mu) 18 | ev <- eigen(Sigma, symmetric = TRUE) 19 | lambda <- ev$values 20 | V <- ev$vectors 21 | R <- V %*% diag(sqrt(lambda)) %*% t(V) 22 | Z <- matrix(rnorm(n*d), nrow = n, ncol = d) 23 | X <- Z %*% R + matrix(mu, n, d, byrow = TRUE) 24 | X 25 | } 26 | 27 | 28 | # generate MVN by singular 
value decomposition method 29 | 30 | rmvn.svd <- 31 | function(n, mu, Sigma) { 32 | # generate n random vectors from MVN(mu, Sigma) 33 | # dimension is inferred from mu and Sigma 34 | d <- length(mu) 35 | S <- svd(Sigma) 36 | R <- S$u %*% diag(sqrt(S$d)) %*% t(S$v) #sq. root Sigma 37 | Z <- matrix(rnorm(n*d), nrow=n, ncol=d) 38 | X <- Z %*% R + matrix(mu, n, d, byrow=TRUE) 39 | X 40 | } 41 | 42 | 43 | # generate MVN by Choleski factorization method 44 | 45 | rmvn.Choleski <- 46 | function(n, mu, Sigma) { 47 | # generate n random vectors from MVN(mu, Sigma) 48 | # dimension is inferred from mu and Sigma 49 | d <- length(mu) 50 | Q <- chol(Sigma) # Choleski factorization of Sigma 51 | Z <- matrix(rnorm(n*d), nrow=n, ncol=d) 52 | X <- Z %*% Q + matrix(mu, n, d, byrow=TRUE) 53 | X 54 | } 55 | 56 | # Bootstrap t confidence interval 57 | 58 | boot.t.ci <- 59 | function(x, B = 500, R = 100, level = .95, statistic){ 60 | #compute the bootstrap t CI 61 | x <- as.matrix(x); n <- nrow(x) 62 | stat <- numeric(B); se <- numeric(B) 63 | 64 | boot.se <- function(x, R, f) { 65 | #local function to compute the bootstrap 66 | #estimate of standard error for statistic f(x) 67 | x <- as.matrix(x); m <- nrow(x) 68 | th <- replicate(R, expr = { 69 | i <- sample(1:m, size = m, replace = TRUE) 70 | f(x[i, ]) 71 | }) 72 | return(sd(th)) 73 | } 74 | 75 | for (b in 1:B) { 76 | j <- sample(1:n, size = n, replace = TRUE) 77 | y <- x[j, ] 78 | stat[b] <- statistic(y) 79 | se[b] <- boot.se(y, R = R, f = statistic) 80 | } 81 | stat0 <- statistic(x) 82 | t.stats <- (stat - stat0) / se 83 | se0 <- sd(stat) 84 | alpha <- 1 - level 85 | Qt <- quantile(t.stats, c(alpha/2, 1-alpha/2), type = 1) 86 | names(Qt) <- rev(names(Qt)) 87 | CI <- rev(stat0 - Qt * se0) 88 | } 89 | 90 | 91 | # BCa bootstrap confidence interval 92 | 93 | boot.BCa <- 94 | function(x, th0, th, stat, conf = .95) { 95 | # bootstrap with BCa bootstrap confidence interval 96 | # th0 is the observed statistic 97 | # th is the vector of bootstrap replicates 98 | # stat is the function to compute the statistic 99 | 100 | x <- as.matrix(x) 101 | n <- nrow(x) #observations in rows 102 | N <- 1:n 103 | alpha <- (1 + c(-conf, conf))/2 104 | zalpha <- qnorm(alpha) 105 | 106 | # the bias correction factor 107 | z0 <- qnorm(sum(th < th0) / length(th)) 108 | 109 | # the acceleration factor (jackknife est.) 110 | th.jack <- numeric(n) 111 | for (i in 1:n) { 112 | J <- N[1:(n-1)] 113 | th.jack[i] <- stat(x[-i, ], J) 114 | } 115 | L <- mean(th.jack) - th.jack 116 | a <- sum(L^3)/(6 * sum(L^2)^1.5) 117 | 118 | # BCa conf. limits 119 | adj.alpha <- pnorm(z0 + (z0+zalpha)/(1-a*(z0+zalpha))) 120 | limits <- quantile(th, adj.alpha, type=6) 121 | return(list("est"=th0, "BCa"=limits)) 122 | } 123 | 124 | # Gelman-Rubin statistic for MCMC convergence 125 | 126 | Gelman.Rubin <- function(psi) { 127 | # psi[i,j] is the statistic psi(X[i,1:j]) 128 | # for chain in i-th row of X 129 | psi <- as.matrix(psi) 130 | n <- ncol(psi) 131 | k <- nrow(psi) 132 | 133 | psi.means <- rowMeans(psi) #row means 134 | B <- n * var(psi.means) #between variance est. 135 | psi.w <- apply(psi, 1, "var") #within variances 136 | W <- mean(psi.w) #within est. 137 | v.hat <- W*(n-1)/n + (B/n) #upper variance est. 
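  # note: B/n estimates the between-chain variance of psi and W the
  # within-chain variance; v.hat pools them into an (upper) estimate of
  # var(psi), so the ratio r.hat below shrinks toward 1 as the chains
  # converge to the target distribution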
138 | r.hat <- v.hat / W #G-R statistic 139 | return(r.hat) 140 | } 141 | 142 | # binning bivariate data 143 | 144 | bin2d <- 145 | function(x, breaks1 = "Sturges", breaks2 = "Sturges"){ 146 | # Data matrix x is n by 2 147 | # breaks1, breaks2: any valid breaks for hist function 148 | # using same defaults as hist 149 | histg1 <- hist(x[,1], breaks = breaks1, plot = FALSE) 150 | histg2 <- hist(x[,2], breaks = breaks2, plot = FALSE) 151 | brx <- histg1$breaks 152 | bry <- histg2$breaks 153 | 154 | # bin frequencies 155 | freq <- table(cut(x[,1], brx), cut(x[,2], bry)) 156 | 157 | return(list(call = match.call(), freq = freq, 158 | breaks1 = brx, breaks2 = bry, 159 | mids1 = histg1$mids, mids2 = histg2$mids)) 160 | } 161 | -------------------------------------------------------------------------------- /examples-Rmd/readme.md: -------------------------------------------------------------------------------- 1 | Code for the textbook examples in "Statistical Computing with R, 2nd ed." in R Markdown format. 2 | -------------------------------------------------------------------------------- /examples-cpp/printme.cpp: -------------------------------------------------------------------------------- 1 | #include <Rcpp.h> 2 | using namespace Rcpp; 3 | 4 | // [[Rcpp::export]] 5 | void print_me(IntegerVector x) { 6 | // several ways to print in Rcpp 7 | Rprintf("my vector is (%d, %d, %d)\n", 8 | x[0], x[1], x[2]); 9 | Rcpp::Rcout << "my vector is " << x << std::endl; 10 | Rf_PrintValue(x); 11 | } 12 | -------------------------------------------------------------------------------- /examples-cpp/readme.md: -------------------------------------------------------------------------------- 1 | C++ example code 2 | --------------------------------------------------------------------------------
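For reference, a minimal way to compile and call `print_me()` from an R session (assuming the `Rcpp` package is installed and the repository root is the working directory; the path is illustrative):

```r
# compile printme.cpp and expose print_me() to the R session
Rcpp::sourceCpp("examples-cpp/printme.cpp")
print_me(c(1L, 2L, 3L))   #prints the vector in three ways
```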