├── .gitattributes ├── .gitignore ├── Chapter01 └── Chapter01.txt ├── Chapter02 └── Chapter02.txt ├── Chapter03 └── Chapter03.txt ├── Chapter04 └── Chapter04.txt ├── Chapter05 └── Chapter05.txt ├── Chapter06 └── Chapter06.txt ├── Chapter07 └── Chapter07.txt ├── Chapter08 └── Chapter08.txt ├── Chapter09 └── Chapter09.txt ├── Chapter10 └── Chapter10.txt ├── Chapter11 └── Chapter11.txt ├── Chapter12 ├── Chapter12.txt └── music-recommendations.R ├── Chapter13 └── Chapter13.txt ├── Chapter14 ├── Chapter14.txt └── Chapter15.txt ├── Chapter15 ├── chapter12.R └── our_cpp_functions.cpp ├── Chapter16 └── Chapter16.txt ├── Chapter17 ├── Chapter17.txt ├── nyc-sat-scores.R └── nyc-sat-scores.Rmd ├── LICENSE └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /Chapter01/Chapter01.txt: -------------------------------------------------------------------------------- 1 | Arithmetic and assignment 2 | 3 | 4 | > 2 + 2 5 | [1] 4 6 | > 9 / 3 7 | [1] 3 8 | > 5 %% 2 # modulus operator (remainder of 5 divided by 2) 9 | [1] 1 10 | 11 | 12 | 13 | 14 | 15 | 16 | > 3 + 2 - 10 ^ 2 # ^ is the exponent operator 17 | [1] -95 18 | > 3 + (2 - 10) ^ 2 19 | [1] 67 20 | 21 | 22 | 23 | 24 | 25 | > # assignments follow the form VARIABLE <- VALUE 26 | > var <- 10 27 | > var 28 | [1] 10 29 | > var ^ 2 30 | [1] 100 31 | > VAR / 2 # variable names are case-sensitive 32 | Error: object 'VAR' not found 33 | 34 | 35 | 36 | 37 | 38 | > var # var is 10 39 | [1] 10 40 | > var ^ 2 41 | [1] 100 42 | > var # var is still 10 43 | [1] 10 44 | > var <- var ^ 2 # no return value 45 | > var # var is now 100 46 | [1] 100 47 | 48 | 49 | 50 | 51 | > cos(3.14159) # cosine function 52 | [1] -1 53 | > cos(pi) # pi is a constant that R provides 54 | [1] -1 55 | > acos(-1) # arccosine function 56 | [1] 3.141593 57 | > acos(cos(pi)) + 10 58 | [1] 13.14159 59 | > # functions can be used as arguments to other functions 60 | 61 | 62 | 63 | 64 | > 1 / 0 65 | 
[1] Inf 66 | > 0 / 0 67 | [1] NaN 68 | 69 | 70 | 71 | 72 | 73 | Logicals and characters 74 | 75 | > foo <- TRUE # foo is of the logical data type 76 | > class(foo) # class() tells us the type 77 | [1] "logical" 78 | > bar <- "hi!" # bar is of the character data type 79 | > class(bar) 80 | [1] "character" 81 | 82 | 83 | 84 | 85 | 86 | > foo 87 | [1] TRUE 88 | > foo && TRUE # boolean and 89 | [1] TRUE 90 | > foo && FALSE 91 | [1] FALSE 92 | > foo || FALSE # boolean or 93 | [1] TRUE 94 | > !foo # negation operator 95 | [1] FALSE 96 | 97 | 98 | 99 | 100 | 101 | > foo && 1 102 | [1] TRUE 103 | > foo && 2 104 | [1] TRUE 105 | > foo && 0 106 | [1] FALSE 107 | 108 | 109 | 110 | 111 | 112 | > 4 < 2 # less than operator 113 | [1] FALSE 114 | > 4 >= 4 # greater than or equal to 115 | [1] TRUE 116 | > 3 == 3 # equality operator 117 | [1] TRUE 118 | > 3 != 2 # inequality operator 119 | [1] TRUE 120 | 121 | 122 | 123 | 124 | 125 | > lang.domain <- "statistics" 126 | > lang.domain <- toupper(lang.domain) 127 | > print(lang.domain) 128 | [1] "STATISTICS" 129 | > # retrieves substring from first character to fourth character 130 | > substr(lang.domain, 1, 4) 131 | [1] "STAT" 132 | > gsub("I", "1", lang.domain) # substitutes every "I" for "1" 133 | [1] "STAT1ST1CS" 134 | > # combines character strings 135 | > paste("R does", lang.domain, "!!!") 136 | [1] "R does STATISTICS !!!" 137 | 138 | 139 | 140 | 141 | 142 | Flow of control 143 | 144 | > if(2 + 2 == 4) 145 | + print("very good") 146 | [1] "very good" 147 | > if(2 + 2 == 5) 148 | + print("all hail to the thief") 149 | 150 | 151 | 152 | 153 | 154 | > if((4/2==2) && (2*2==4)){ 155 | + print("four divided by two is two...") 156 | + print("and two times two is four") 157 | + } 158 | [1] "four divided by two is two..." 159 | [1] "and two times two is four" 160 | 161 | 162 | 163 | 164 | 165 | > closing.time <- TRUE 166 | > if(closing.time){ 167 | + print("you don't have to go home") 168 | + print("but you can't stay here") 169 | + } else{ 170 | + print("you can stay here!") 171 | + } 172 | [1] "you don't have to go home" 173 | [1] "but you can't stay here" 174 | > if(!closing.time){ 175 | + print("you don't have to go home") 176 | + print("but you can't stay here") 177 | + } else{ 178 | + print("you can stay here!") 179 | + } 180 | [1] "you can stay here!" 
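A supplementary example, not part of the original transcript: conditions can also be chained with else if, and only the first branch whose condition evaluates to TRUE is run. The grade value below is a made-up number used purely for illustration.

> grade <- 85   # hypothetical value, for illustration only
> if(grade >= 90){
+   print("A")
+ } else if(grade >= 80){
+   print("B")
+ } else{
+   print("C or below")
+ }
[1] "B"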
181 | 182 | 183 | 184 | 185 | 186 | Vectors 187 | 188 | > our.vect <- c(8, 6, 7, 5, 3, 0, 9) 189 | > our.vect 190 | [1] 8 6 7 5 3 0 9 191 | 192 | 193 | 194 | 195 | 196 | > another.vect <- c("8", 6, 7, "-", 3, "0", 9) 197 | > another.vect 198 | [1] "8" "6" "7" "-" "3" "0" "9" 199 | 200 | 201 | 202 | 203 | 204 | Subsetting 205 | 206 | > our.vect[1] # to get the first value 207 | [1] 8 208 | > # the function length() returns the length of a vector 209 | > length(our.vect) 210 | [1] 7 211 | > our.vect[length(our.vect)] # get the last element of a vector 212 | [1] 9 213 | 214 | 215 | 216 | 217 | 218 | > our.vect[10] 219 | [1] NA 220 | 221 | 222 | 223 | 224 | 225 | 226 | > # extract the first, third, fifth, and 227 | > # seventh element from our vector 228 | > our.vect[c(1, 3, 5, 7)] 229 | [1] 8 7 3 9 230 | 231 | 232 | 233 | 234 | 235 | > other.vector <- 1:10 236 | > other.vector 237 | [1] 1 2 3 4 5 6 7 8 9 10 238 | > another.vector <- seq(50, 30, by=-2) 239 | > another.vector 240 | [1] 50 48 46 44 42 40 38 36 34 32 30 241 | 242 | 243 | 244 | 245 | 246 | > our.vect[1:5] 247 | [1] 8 6 7 5 3 248 | 249 | 250 | 251 | 252 | 253 | Vectorized functions 254 | 255 | > # takes the mean of a vector 256 | > mean(our.vect) 257 | [1] 5.428571 258 | > sd(our.vect) # standard deviation 259 | [1] 3.101459 260 | > min(our.vect) 261 | [1] 0 262 | > max(1:10) 263 | [1] 10 264 | > sum(c(1, 2, 3)) 265 | [1] 6 266 | 267 | 268 | 269 | 270 | 271 | > messy.vector <- c(8, 6, NA, 7, 5, NA, 3, 0, 9) 272 | > messy.vector 273 | [1] 8 6 NA 7 5 NA 3 0 9 274 | > length(messy.vector) 275 | [1] 9 276 | 277 | 278 | 279 | 280 | 281 | > mean(messy.vector) 282 | [1] NA 283 | > mean(messy.vector, na.rm=TRUE) 284 | [1] 5.428571 285 | > sum(messy.vector, na.rm=FALSE)[1] NA 286 | > sum(messy.vector, na.rm=TRUE) 287 | [1] 38 288 | 289 | 290 | 291 | 292 | 293 | > log.vector <- c(TRUE, TRUE, FALSE) 294 | > log.vector 295 | [1] TRUE TRUE FALSE 296 | 297 | 298 | 299 | 300 | 301 | > sum(log.vector) 302 | [1] 2 303 | 304 | 305 | 306 | 307 | 308 | > messy.vector 309 | [1] 8 6 NA 7 5 NA 3 0 9 310 | > is.na(messy.vector) 311 | [1] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE 312 | > # 8 6 NA 7 5 NA 3 0 9 313 | 314 | 315 | 316 | 317 | 318 | 319 | > sum(is.na(messy.vector)) 320 | [1] 2 321 | 322 | 323 | 324 | 325 | 326 | 327 | > our.vect > 5 328 | [1] TRUE TRUE TRUE FALSE FALSE FALSE TRUE 329 | 330 | 331 | 332 | 333 | 334 | > sum(our.vect > 5) 335 | [1] 4 336 | 337 | 338 | 339 | 340 | 341 | Advanced subsetting 342 | 343 | > messy.vector[!is.na(messy.vector)] 344 | [1] 8 6 7 5 3 0 9 345 | 346 | 347 | 348 | 349 | 350 | > our.vect[our.vect > 5] 351 | [1] 8 6 7 9 352 | 353 | 354 | 355 | 356 | 357 | > our.vect 358 | [1] 8 6 7 5 3 0 9 359 | > our.vect[1] <- 9 360 | > our.vect 361 | [1] 9 6 7 5 3 0 9 362 | 363 | 364 | 365 | 366 | 367 | 368 | > messy.vector[is.na(messy.vector)] <- 0 369 | > messy.vector 370 | [1] 8 6 0 7 5 0 3 0 9 371 | 372 | 373 | 374 | 375 | 376 | 377 | > ifelse(is.na(messy.vector), 0, messy.vector) 378 | [1] 8 6 0 7 5 0 3 0 9 379 | 380 | 381 | 382 | 383 | 384 | Recycling 385 | 386 | > our.vect + 3 387 | [1] 12 9 10 8 6 3 12 388 | 389 | 390 | 391 | 392 | 393 | 394 | > our.vect + 3 395 | [1] 12 9 10 8 6 3 12 396 | 397 | 398 | 399 | 400 | 401 | > our.vect + c(3, 3, 3, 3, 3, 3, 3) 402 | [1] 12 9 10 8 6 3 12 403 | 404 | 405 | 406 | 407 | 408 | > our.vect[c(TRUE, FALSE)] 409 | [1] 9 7 3 9 410 | 411 | 412 | 413 | 414 | 415 | > our.vect[c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE)] 416 | [1] 9 7 3 9 417 | 418 | 419 | 420 | 421 | 422 | > 
our.vect + c(3, 0) 423 | [1] 12 6 10 5 6 0 12 424 | Warning message: 425 | In our.vect + c(3, 0) : 426 | longer object length is not a multiple of shorter object length 427 | 428 | 429 | 430 | 431 | 432 | 433 | Functions 434 | 435 | > function.name <- function(argument1, argument2, ...){ 436 | + # some functionality 437 | + } 438 | 439 | 440 | 441 | 442 | > is.even <- function(a.number){ 443 | + remainder <- a.number %% 2 444 | + if(remainder==0) 445 | + return(TRUE) 446 | + return(FALSE) 447 | + } 448 | > # testing it 449 | > is.even(10) 450 | [1] TRUE 451 | > is.even(9) 452 | [1] FALSE 453 | 454 | 455 | 456 | 457 | 458 | 459 | > is.divisible.by <- function(large.number, smaller.number){ 460 | + if(large.number %% smaller.number != 0) 461 | + return(FALSE) 462 | + return(TRUE) 463 | + } 464 | > # testing it 465 | > is.divisible.by(10, 2) 466 | [1] TRUE 467 | > is.divisible.by(10, 3) 468 | [1] FALSE 469 | > is.divisible.by(9, 3) 470 | [1] TRUE 471 | 472 | 473 | 474 | 475 | 476 | > is.even <- function(num){ 477 | + is.divisible.by(num, 2) 478 | + } 479 | 480 | 481 | 482 | 483 | 484 | > sapply(our.vect, is.even) 485 | [1] FALSE TRUE FALSE FALSE FALSE TRUE FALSE 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | > sapply(our.vect, function(num){is.divisible.by(num, 3)}) 494 | [1] TRUE TRUE FALSE FALSE TRUE TRUE TRUE 495 | 496 | 497 | 498 | 499 | 500 | 501 | > where.even <- sapply(our.vect, is.even) 502 | > where.div.3 <- sapply(our.vect, function(num){ 503 | + is.divisible.by(num, 3)}) 504 | > # "&" is like the "&&" and operator but for vectors 505 | > our.vect[where.even & where.div.3] 506 | [1] 6 0 507 | 508 | 509 | 510 | 511 | 512 | 513 | Matrices 514 | 515 | > a.matrix <- matrix(c(1, 2, 3, 4, 5, 6)) 516 | > a.matrix 517 | [,1] 518 | [1,] 1 519 | [2,] 2 520 | [3,] 3 521 | [4,] 4 522 | [5,] 5 523 | [6,] 6 524 | 525 | 526 | 527 | 528 | 529 | 530 | > a.matrix <- matrix(c(1, 2, 3, 4, 5, 6), ncol=2) 531 | > a.matrix 532 | [,1] [,2] 533 | [1,] 1 4 534 | [2,] 2 5 535 | [3,] 3 6 536 | 537 | 538 | 539 | 540 | 541 | 542 | > a2.matrix <- cbind(c(1, 2, 3), c(4, 5, 6)) 543 | 544 | 545 | 546 | 547 | 548 | > a3.matrix <- rbind(c(1, 2, 3), c(4, 5, 6)) 549 | > a3.matrix 550 | [,1] [,2] [,3] 551 | [1,] 1 2 3 552 | [2,] 4 5 6 553 | 554 | 555 | 556 | 557 | 558 | > t(a2.matrix) 559 | 560 | 561 | 562 | 563 | 564 | > a2.matrix 565 | [,1] [,2] 566 | [1,] 1 4 567 | [2,] 2 5 568 | [3,] 3 6 569 | > colSums(a2.matrix) 570 | [1] 6 15 571 | > rowMeans(a2.matrix) 572 | [1] 2.5 3.5 4.5 573 | 574 | 575 | 576 | 577 | 578 | > apply(a2.matrix, 2, sum) 579 | [1] 6 15 580 | > apply(a2.matrix, 1, mean) 581 | [1] 2.5 3.5 4.5 582 | 583 | 584 | 585 | 586 | 587 | 588 | > a2.matrix %*% a2.matrix 589 | Error in a2.matrix %*% a2.matrix : non-conformable arguments 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | > a2.matrix 598 | [,1] [,2] 599 | [1,] 1 4 600 | [2,] 2 5 601 | [3,] 3 6 602 | > a3.matrix 603 | [,1] [,2] [,3] 604 | [1,] 1 2 3 605 | [2,] 4 5 6 606 | > a2.matrix %*% a3.matrix 607 | [,1] [,2] [,3] 608 | [1,] 17 22 27 609 | [2,] 22 29 36 610 | [3,] 27 36 45 611 | > # dim() tells us how many rows and columns 612 | > # (respectively) there are in the given matrix 613 | > dim(a2.matrix) 614 | [1] 3 2 615 | 616 | 617 | 618 | 619 | 620 | 621 | > a2.matrix[2,1] 622 | [1] 2 623 | 624 | 625 | 626 | 627 | 628 | 629 | > # returns the whole second column 630 | a2.matrix[,2] 631 | [1] 4 5 6 632 | > # returns the first row 633 | > a2.matrix[1,] 634 | [1] 1 4 635 | 636 | 637 | 638 | 639 | 640 | 641 | > # give me element in column 2 at the first 
and third row 642 | > a2.matrix[c(1, 3), 2] 643 | [1] 4 6 644 | 645 | 646 | 647 | 648 | 649 | 650 | Loading data into R 651 | 652 | flavor,number 653 | pistachio,6 654 | mint chocolate chip,7 655 | vanilla,5 656 | chocolate,10 657 | strawberry,2 658 | neopolitan,4 659 | 660 | 661 | 662 | 663 | 664 | 665 | > favs <- read.table("favorites.txt", sep=",", header=TRUE) 666 | 667 | 668 | 669 | 670 | 671 | 672 | > favs <- read.table(file.choose(), sep=",", header=TRUE) 673 | 674 | 675 | 676 | 677 | 678 | 679 | > favs <- read.csv("favorites.txt") 680 | 681 | 682 | 683 | 684 | 685 | > head(favs) 686 | flavor number 687 | 1 pistachio 6 688 | 2 mint chocolate chip 7 689 | 3 vanilla 5 690 | 4 chocolate 10 691 | 5 strawberry 2 692 | 6 neopolitan 4 693 | > class(favs) 694 | [1] "data.frame" 695 | > class(favs$flavor) 696 | [1] "factor" 697 | > class(favs$number) 698 | [1] "numeric" 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | > favs <- read.csv("favorites.txt", stringsAsFactors=FALSE) 707 | > class(favs$flavor) 708 | [1] "character" 709 | 710 | 711 | 712 | 713 | 714 | > favs$flavor 715 | [1] "pistachio" "mint chocolate chip" "vanilla" 716 | [4] "chocolate" "strawberry" "neopolitan" 717 | > favs[["flavor"]] 718 | [1] "pistachio" "mint chocolate chip" "vanilla" 719 | [4] "chocolate" "strawberry" "neopolitan" 720 | > favs[,1] 721 | [1] "pistachio" "mint chocolate chip" "vanilla" 722 | [4] "chocolate" "strawberry" "neopolitan" 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | > names(favs) 732 | [1] "flavor" "number" 733 | > names(favs)[1] <- "flav" 734 | > names(favs) 735 | [1] "flav" "number" 736 | 737 | 738 | 739 | 740 | 741 | 742 | > str(favs) 743 | 'data.frame': 6 obs. of 2 variables: 744 | $ flav : chr "pistachio" "mint chocolate chip" "vanilla" 745 | "chocolate" ... 
746 | $ number: num 6 7 5 10 2 4 747 | 748 | 749 | 750 | 751 | 752 | Working with packages 753 | 754 | > # downloads and installs from CRAN 755 | > install.packages("ggplot2") 756 | 757 | 758 | 759 | 760 | 761 | > library(ggplot2) 762 | > ggplot(favs, aes(x=flav, y=number)) + 763 | + geom_bar(stat="identity") + 764 | + ggtitle("Soy ice cream flavor preferences") 765 | 766 | 767 | 768 | 769 | 770 | 771 | 772 | 773 | -------------------------------------------------------------------------------- /Chapter02/Chapter02.txt: -------------------------------------------------------------------------------- 1 | Univariate data 2 | 3 | categorical.data <- c("heads", "tails", "tails", "heads") 4 | 5 | 6 | 7 | 8 | 9 | contin.data <- c(198.41, 178.46, 165.20, 141.71, 138.77) 10 | 11 | 12 | 13 | 14 | 15 | Frequency distributions 16 | 17 | head(mtcars) 18 | mpg cyl disp hp drat wt qsec vs am gear carb 19 | Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 20 | Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 21 | Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 22 | Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 23 | Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 24 | Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 25 | 26 | 27 | 28 | 29 | 30 | unique(mtcars$carb) 31 | [1] 4 1 2 3 6 8 32 | 33 | 34 | 35 | 36 | table(mtcars$carb) 37 | 1 2 3 4 6 8 38 | 7 10 3 10 1 1 39 | 40 | 41 | 42 | 43 | 44 | cut(airquality$Temp, 9) 45 | 46 | 47 | 48 | 49 | 50 | table(cut(airquality$Temp, 9)) 51 | 52 | (56,60.6] (60.6,65.1] (65.1,69.7] (69.7,74.2] (74.2,78.8] 53 | 8 10 14 16 26 54 | (78.8,83.3] (83.3,87.9] (87.9,92.4] (92.4,97] 55 | 35 22 15 7 56 | 57 | 58 | 59 | 60 | Central tendency 61 | 62 | sum(nums)/length(nums) # nums would be a vector of numerics 63 | 64 | 65 | 66 | 67 | 68 | mean(c(1,2,3,4,5)) 69 | [1] 3 70 | 71 | 72 | 73 | 74 | 75 | median(c(3, 7, 6, 10, 3, 7)) 76 | [1] 6.5 77 | 78 | 79 | 80 | 81 | 82 | 83 | Spread 84 | 85 | sum(abs(x - mean(x))) / length(x) 86 | 87 | 88 | 89 | 90 | Probability distributions 91 | 92 | table(mtcars$carb) / length(mtcars$carb) 93 | 1 2 3 4 6 8 94 | 0.21875 0.31250 0.09375 0.31250 0.03125 0.03125 95 | 96 | 97 | 98 | 99 | 100 | 101 | # don't worry about memorizing this 102 | temp.density <- density(airquality$Temp) 103 | pdf <- approxfun(temp.density$x, temp.density$y, rule=2) 104 | integrate(pdf, 80, 90) 105 | 0.3422287 with absolute error < 7.5e-06 106 | 107 | 108 | 109 | 110 | 111 | 112 | Visualization methods 113 | 114 | qplot(column, data=dataframe, geom=...) 
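A supplementary note, not part of the original file: qplot() is a convenience wrapper around ggplot2's fuller ggplot() interface, so the bar chart built with qplot(factor(carb), data=mtcars, geom="bar") in the next snippet could equivalently be written with an explicit aesthetic mapping and geom layer.

# equivalent ggplot() form of the qplot() bar chart shown below
library(ggplot2)
ggplot(mtcars, aes(x=factor(carb))) +
  geom_bar()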
115 | 116 | 117 | 118 | 119 | library(ggplot2) 120 | qplot(factor(carb), data=mtcars, geom="bar") 121 | 122 | 123 | 124 | 125 | 126 | qplot(factor(carb), 127 | data=mtcars, 128 | geom="bar", 129 | fill=factor(carb), 130 | xlab="number of carburetors") 131 | 132 | 133 | 134 | 135 | 136 | 137 | qplot(Temp, data=airquality, geom="histogram") 138 | 139 | 140 | 141 | 142 | 143 | 144 | qplot(Temp, data=airquality, geom="histogram", 145 | binwidth=5, color=I("white")) 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | qplot(Temp, data=airquality, geom="density") 154 | 155 | 156 | 157 | 158 | 159 | qplot(Temp, data=airquality, geom="density", 160 | adjust=.5, # changes bandwidth 161 | fill=I("pink"), 162 | alpha=I(.5), # adds transparency 163 | main="density plot of temperature data") 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /Chapter03/Chapter03.txt: -------------------------------------------------------------------------------- 1 | Multivariate data 2 | 3 | head(airquality) 4 | Ozone Solar.R Wind Temp Month Day 5 | 1 41 190 7.4 67 5 1 6 | 2 36 118 8.0 72 5 2 7 | 3 12 149 12.6 74 5 3 8 | 4 18 313 11.5 62 5 4 9 | 5 NA NA 14.3 56 5 5 10 | 6 28 NA 14.9 66 5 6 11 | 12 | 13 | 14 | 15 | 16 | 17 | Relationships between a categorical and continuous variable 18 | 19 | head(iris) 20 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 21 | 1 5.1 3.5 1.4 0.2 setosa 22 | 2 4.9 3.0 1.4 0.2 setosa 23 | 3 4.7 3.2 1.3 0.2 setosa 24 | 4 4.6 3.1 1.5 0.2 setosa 25 | 5 5.0 3.6 1.4 0.2 setosa 26 | 6 5.4 3.9 1.7 0.4 setosa 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | mean(iris$Petal.Length) 39 | [1] 3.758 40 | 41 | 42 | 43 | 44 | 45 | 46 | mean(iris$Petal.Length[iris$Species=="setosa"]) 47 | [1] 1.462 48 | mean(iris$Petal.Length[iris$Species=="versicolor"]) 49 | [1] 4.26 50 | mean(iris$Petal.Length[iris$Species=="virginica"]) 51 | [1] 5.552 52 | 53 | 54 | 55 | 56 | 57 | 58 | by(iris$Petal.Length, iris$Species, mean) 59 | iris$Species: setosa 60 | [1] 1.462 61 | -------------------------------------------- 62 | iris$Species: versicolor 63 | [1] 4.26 64 | -------------------------------------------- 65 | iris$Species: virginica 66 | [1] 5.552 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | by(iris$Petal.Length, iris$Species, sd) 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | by(iris$Petal.Length, iris$Species, summary) 85 | iris$Species: setosa 86 | Min. 1st Qu. Median Mean 3rd Qu. Max. 87 | 1.000 1.400 1.500 1.462 1.575 1.900 88 | ------------------------------------------------ 89 | iris$Species: versicolor 90 | Min. 1st Qu. Median Mean 3rd Qu. Max. 91 | 3.00 4.00 4.35 4.26 4.60 5.10 92 | ------------------------------------------------ 93 | iris$Species: virginica 94 | Min. 1st Qu. Median Mean 3rd Qu. Max. 
95 | 4.500 5.100 5.550 5.552 5.875 6.900 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | by(airquality$Temp, airquality$Month, mean) 104 | airquality$Month: 5 105 | [1] 65.54839 106 | --------------------------------------------- 107 | airquality$Month: 6 108 | [1] 79.1 109 | --------------------------------------------- 110 | airquality$Month: 7 111 | [1] 83.90323 112 | --------------------------------------------- 113 | airquality$Month: 8 114 | [1] 83.96774 115 | --------------------------------------------- 116 | airquality$Month: 9 117 | [1] 76.9 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | Relationships between two categorical variables 127 | 128 | ucba <- data.frame(UCBAdmissions) 129 | head(ucba) 130 | Admit Gender Dept Freq 131 | 1 Admitted Male A 512 132 | 2 Rejected Male A 313 133 | 3 Admitted Female A 89 134 | 4 Rejected Female A 19 135 | 5 Admitted Male B 353 136 | 6 Rejected Male B 207 137 | 138 | 139 | 140 | 141 | 142 | # the first argument to xtabs (the formula) should 143 | # be read as: frequency *by* Gender and Admission 144 | cross <- xtabs(Freq ~ Gender+Admit, data=ucba) 145 | cross 146 | Admit 147 | Gender Admitted Rejected 148 | Male 1198 1493 149 | Female 557 1278 150 | 151 | 152 | 153 | 154 | 155 | 156 | prop.table(cross, 1) 157 | Admit 158 | Gender Admitted Rejected 159 | Male 0.4451877 0.5548123 160 | Female 0.3035422 0.6964578 161 | 162 | 163 | 164 | 165 | 166 | cross2 <- xtabs(Freq ~ Gender + Admit, data=ucba[ucba$Dept=="A",]) 167 | prop.table(cross2, 1) 168 | Admit 169 | Gender Admitted Rejected 170 | Male 0.6206061 0.3793939 171 | Female 0.8240741 0.1759259 172 | 173 | 174 | 175 | 176 | 177 | 178 | The relationship between two continuous variables 179 | 180 | head(women) 181 | height weight 182 | 1 58 115 183 | 2 59 117 184 | 3 60 120 185 | 4 61 123 186 | 5 62 126 187 | 6 63 129 188 | nrow(women) 189 | [1] 15 190 | 191 | 192 | 193 | 194 | 195 | 196 | Covariance 197 | 198 | cov(women$weight, women$height) 199 | [1] 69 200 | # the order we put the two columns in 201 | # the arguments doesn't matter 202 | cov(women$height, women$weight) 203 | [1] 69 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | # there are 2.54 centimeters in each inch 213 | # changing the units to centimeters increases 214 | # the variability within the height variable 215 | cov(women$height*2.54, women$weight) 216 | [1] 175.26 217 | 218 | 219 | 220 | 221 | 222 | 223 | Correlation coefficients 224 | 225 | cor(women$height, women$weight) 226 | [1] 0.9954948 227 | cor(women$height*2.54, women$weight) 228 | [1] 0.9954948 229 | 230 | 231 | 232 | 233 | 234 | 235 | xs <- 1:100 236 | cor(xs, xs+100) 237 | [1] 1 238 | cor(xs, xs^3) 239 | [1] 0.917552 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | xs <- 1:100 248 | cor(xs, xs+100, method="spearman") 249 | [1] 1 250 | cor(xs, xs^3, method="spearman") 251 | [1] 1 252 | 253 | 254 | 255 | 256 | 257 | cor(mtcars$wt, mtcars$mpg) 258 | [1] -0.8676594 259 | 260 | 261 | 262 | 263 | 264 | 265 | cor(airquality$Temp, airquality$Wind) 266 | [1] -0.4579879 267 | cor(airquality$Temp, airquality$Wind, method="spearman") 268 | [1] -0.4465408 269 | 270 | 271 | 272 | 273 | 274 | # have to drop 5th column (species is not numeric) 275 | iris.nospecies <- iris[, -5] 276 | cor(iris.nospecies) 277 | Sepal.Length Sepal.Width Petal.Length Petal.Width 278 | Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411 279 | Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259 280 | Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654 281 | Petal.Width 0.8179411 
-0.3661259 0.9628654 1.0000000 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | Categorical and continuous variables 290 | 291 | library(ggplot) 292 | qplot(Species, Petal.Length, data=iris, geom="boxplot", 293 | fill=Species) 294 | 295 | 296 | 297 | 298 | 299 | qplot(Petal.Length, data=iris, geom="density", alpha=I(.7), 300 | fill=Species) 301 | 302 | 303 | 304 | 305 | 306 | Two categorical variables 307 | 308 | 309 | install.packages("vcd") 310 | library(vcd) 311 | ucba <- data.frame(UCBAdmissions) 312 | mosaic(Freq ~ Gender + Admit, data=ucba, 313 | shade=TRUE, legend=FALSE) 314 | 315 | 316 | 317 | 318 | 319 | mosaic(Freq ~ Gender+Admit, data=ucba[ucba$Dept=="A",], 320 | shade=TRUE, legend=FALSE) 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | Two continuous variables 329 | 330 | 331 | qplot(height, weight, data=women, geom="point") 332 | 333 | 334 | 335 | 336 | qplot(wt, mpg, data=mtcars, geom=c("point", "smooth"), 337 | method="lm", se=FALSE) 338 | 339 | 340 | 341 | 342 | 343 | qplot(wt, mpg, data=mtcars, geom=c("point", "smooth"), se=FALSE) 344 | 345 | 346 | 347 | 348 | 349 | More than two continuous variables 350 | 351 | install.packages("corrgram") 352 | library(corrgram) 353 | corrgram(iris, lower.panel=panel.conf, upper.panel=panel.pts) 354 | 355 | 356 | 357 | 358 | corrgram(iris, lower.panel=panel.pie, upper.panel=panel.pts, 359 | diag.panel=panel.density, 360 | main=paste0("corrgram of petal and sepal ", 361 | "measurements in iris data set")) 362 | 363 | 364 | 365 | 366 | -------------------------------------------------------------------------------- /Chapter04/Chapter04.txt: -------------------------------------------------------------------------------- 1 | The binomial distribution 2 | 3 | pbinom(10, size=30, prob=.5) 4 | [1] 0.04936857 5 | 6 | 7 | 8 | The normal distribution 9 | 10 | 11 | > f <- function(x){ dnorm(x, mean=65, sd=3.5) } 12 | > integrate(f, 70, Inf) 13 | 0.07656373 with absolute error < 2.2e-06 14 | 15 | 16 | 17 | > pnorm(70, mean=65, sd=3.5) 18 | [1] 0.9234363 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /Chapter05/Chapter05.txt: -------------------------------------------------------------------------------- 1 | Estimating means 2 | 3 | 4 | > # setting seed will make random number generation reproducible 5 | > set.seed(1) 6 | > all.us.women <- rnorm(10000, mean=65, sd=3.5) 7 | 8 | 9 | 10 | 11 | 12 | > our.sample <- sample(all.us.women, 10) 13 | > mean(our.sample) 14 | [1] 64.51365 15 | 16 | 17 | 18 | 19 | 20 | > population.mean <- mean(all.us.women) 21 | > for(sample.size in seq(5, 30, by=5)){ 22 | + # create empty vector with 1000 elements 23 | + sample.means <- numeric(1000) 24 | + for(i in 1:1000){ 25 | + sample.means[i] <- mean(sample(all.us.women, sample.size)) 26 | + } 27 | + distances.from.true.mean <- abs(sample.means - population.mean) 28 | + mean.distance.from.true.mean <- mean(distances.from.true.mean) 29 | + print(mean.distance.from.true.mean) 30 | + } 31 | [1] 1.245492 32 | [1] 0.8653313 33 | [1] 0.7386099 34 | [1] 0.6355692 35 | [1] 0.5458136 36 | [1] 0.5090788 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | mean(our.new.sample) 45 | [1] 65.19704 46 | 47 | 48 | 49 | 50 | 51 | 52 | The sampling distribution 53 | 54 | > means.of.our.samples <- numeric(10000) 55 | > for(i in 1:10000){ 56 | + a.sample <- sample(all.us.women, 40) 57 | + means.of.our.samples[i] <- mean(a.sample) 58 | + } 59 | 60 | 61 | 62 | 63 | 64 | 65 | > mean(our.new.sample) 66 | [1] 65.19704 67 | > sd(our.new.sample) 68 | [1] 
3.588447 69 | > sd(our.new.sample) / sqrt(length(our.new.sample)) 70 | [1] 0.5673833 71 | 72 | 73 | 74 | 75 | 76 | Interval estimation 77 | 78 | > err <- sd(our.new.sample) / sqrt(length(our.new.sample)) 79 | > mean(our.new.sample) - (1.96*err) 80 | [1] 64.08497 81 | > mean(our.new.sample) + (1.96*err) 82 | [1] 66.30912 83 | 84 | 85 | 86 | 87 | 88 | 89 | How did we get 1.96? 90 | 91 | > qnorm(.025) 92 | [1] -1.959964 93 | > pnorm(-1.959964) 94 | [1] 0.025 95 | 96 | 97 | 98 | 99 | 100 | > qnorm(.05) 101 | [1] -1.644854 102 | > qnorm(.95) 103 | [1] 1.644854 104 | > # notice the symmetry? 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | Smaller samples 113 | 114 | > small.sample <- sample(all.us.women, 15) 115 | > mean(small.sample) 116 | [1] 65.51277 117 | > qt(.025, df=14) 118 | [1] -2.144787 119 | > # notice the difference 120 | > qnorm(.025) 121 | [1] -1.959964 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | > err <- sd(small.sample) / sqrt(length(small.sample)) 130 | > mean(small.sample) - (2.145 * err) 131 | [1] 64.09551 132 | > mean(small.sample) + (2.145 * err) 133 | [1] 66.93003 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /Chapter06/Chapter06.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Analysis-with-R-Second-Edition/ece3101ca2f43f725c13a67d27810297bd31142a/Chapter06/Chapter06.txt -------------------------------------------------------------------------------- /Chapter07/Chapter07.txt: -------------------------------------------------------------------------------- 1 | The big idea behind Bayesian analysis 2 | 3 | 4 | > curve(dbeta(x, 70, 60), # plot a beta distribution 5 | + xlab="?", # name x-axis 6 | + ylab="posterior belief", # name y-axis 7 | + type="l", # make smooth line 8 | + yaxt='n') # remove y axis labels 9 | > abline(v=.5, lty=2) # make line at theta = 0.5 10 | 11 | 12 | 13 | 14 | Who cares about coin flips 15 | 16 | 17 | > curve(dbeta(x, 37, 5), xlab="?", 18 | + ylab="posterior belief", 19 | + type="l", yaxt='n') 20 | 21 | 22 | 23 | 24 | > samp <- rbeta(10000, 37, 5) 25 | > quantile(samp, c(.025, .975)) 26 | 2.5% 97.5% 27 | 0.7674591 0.9597010 28 | 29 | 30 | 31 | 32 | > # horizontal line 33 | > lines(c(.767, .96), c(0.1, 0.1)) 34 | > # tiny vertical left boundary 35 | > lines(c(.767, .769), c(0.15, 0.05)) 36 | > # tiny vertical right boundary 37 | > lines(c(.96, .96), c(0.15, 0.05)) 38 | 39 | 40 | 41 | 42 | 43 | 44 | Using JAGS and runjags 45 | 46 | > install.packages(c("rjags", "runjags", "modeest")) 47 | 48 | 49 | 50 | 51 | > library(runjags) 52 | > testjags() 53 | You are using R version 3.2.1 (2015-06-18) on a unix machine, 54 | with the RStudio GUI 55 | The rjags package is installed 56 | JAGS version 3.4.0 found successfully using the command 57 | '/usr/local/bin/jags' 58 | 59 | 60 | 61 | 62 | 63 | 64 | our.model <- "model { 65 | # likelihood function 66 | numSuccesses ~ dbinom(successProb, numTrials) 67 | # prior 68 | successProb ~ dbeta(1, 1) 69 | # parameter of interest 70 | theta <- numSuccesses / numTrials 71 | }" 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | > numSuccesses ~ dbinom(successProb, numTrials) 80 | 81 | 82 | 83 | 84 | 85 | our.data <- list( 86 | numTrials = 40, 87 | successProb = 36/40 88 | ) 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | > results <- autorun.jags(our.model, 98 | + data=our.data, 99 | + n.chains = 3, 100 | + monitor = c('theta')) 101 | 102 | 103 | 104 | 
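A supplementary sketch, not part of the original file: the modeest package installed at the top of this chapter can turn the MCMC samples into a single point estimate (the posterior mode). The variable name theta.draws and the "shorth" estimator are illustrative choices; mlv() supports several other mode estimators.

> library(modeest)
> theta.draws <- as.matrix(results$mcmc)[,'theta']
> mlv(theta.draws, method="shorth")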
105 | 106 | > plot(results, 107 | + plot.type=c("histogram", "trace"), 108 | + layout=c(2,1)) 109 | 110 | 111 | 112 | 113 | 114 | > # mcmc samples are stored in mcmc attribute 115 | > # of results variable 116 | > results.matrix <- as.matrix(results$mcmc) 117 | > 118 | > # extract the samples for 'theta' 119 | > # the only column, in this case 120 | > theta.samples <- results.matrix[,'theta'] 121 | > 122 | > plot(density(theta.samples, adjust=5)) 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | > quantile(theta.samples, c(.025, .975)) 132 | 2.5% 97.5% 133 | 0.800 0.975 134 | > lines(c(.8, .975), c(0.1, 0.1)) 135 | > lines(c(.8, .8), c(0.15, 0.05)) 136 | > lines(c(.975, .975), c(0.15, 0.05)) 137 | 138 | 139 | 140 | 141 | 142 | 143 | Fitting distributions the Bayesian way 144 | 145 | the.model <- " 146 | model { 147 | mu ~ dunif(0, 60) # prior 148 | stddev ~ dunif(0, 30) # prior 149 | tau <- pow(stddev, -2) 150 | for(i in 1:theLength){ 151 | samp[i] ~ dnorm(mu, tau) # likelihood function 152 | } 153 | }" 154 | 155 | 156 | 157 | 158 | 159 | 160 | the.data <- list( 161 | samp = precip, 162 | theLength = length(precip) 163 | ) 164 | 165 | 166 | 167 | 168 | > results <- autorun.jags(the.model, 169 | + data=the.data, 170 | + n.chains = 3, 171 | + # now we care about two parameters 172 | + monitor = c('mu', 'stddev')) 173 | 174 | 175 | 176 | 177 | > plot(results, 178 | + plot.type=c("histogram", "trace"), 179 | + layout=c(2,2)) 180 | 181 | 182 | 183 | 184 | 185 | 186 | > results.matrix <- as.matrix(results$mcmc) 187 | > 188 | > library(MASS) 189 | > # we need to make a kernel density 190 | > # estimate of the 3-d surface 191 | 192 | > z <- kde2d(results.matrix[,'mu'], 193 | + results.matrix[,'stddev'], 194 | + n=50) 195 | > 196 | > plot(results.matrix) 197 | > contour(z, drawlabels=FALSE, 198 | + nlevels=11, col=rainbow(11), 199 | + lwd=3, add=TRUE) 200 | 201 | 202 | 203 | 204 | 205 | > print(results) 206 | 207 | JAGS model summary statistics from 30000 samples (chains = 3; adapt+burnin 208 | = 5000): 209 | Lower95 Median Upper95 Mean SD Mode 210 | mu 31.645 34.862 38.181 34.866 1.6639 34.895 211 | stddev 11.669 13.886 16.376 13.967 1.2122 13.773 212 | MCerr MC%ofSD SSeff AC.10 psrf 213 | mu 0.012238 0.7 18484 0.002684 1.0001 214 | stddev 0.0093951 0.8 16649 -0.0053588 1.0001 215 | Total time taken: 5 seconds 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | The Bayesian independent samples t-test 224 | 225 | 226 | the.model <- " 227 | model { 228 | # each group will have a separate mu 229 | # and standard deviation 230 | for(j in 1:2){ 231 | mu[j] ~ dunif(0, 60) # prior 232 | stddev[j] ~ dunif(0, 20) # prior 233 | tau[j] <- pow(stddev[j], -2) 234 | } 235 | for(i in 1:theLength){ 236 | # likelihood function 237 | y[i] ~ dnorm(mu[x[i]], tau[x[i]]) 238 | } 239 | }" 240 | 241 | 242 | 243 | 244 | 245 | 246 | the.data <- list( 247 | y = mtcars$mpg, 248 | # 'x' needs to start at 1 so 249 | # 1 is now automatic and 2 is manual 250 | x = ifelse(mtcars$am==1, 1, 2), 251 | theLength = nrow(mtcars) 252 | ) 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | > results <- autorun.jags(the.model, 261 | + data=the.data, 262 | + n.chains = 3, 263 | + monitor = c('mu', 'stddev')) 264 | 265 | 266 | 267 | 268 | 269 | 270 | > results.matrix <- as.matrix(results$mcmc) 271 | > difference.in.means <- (results.matrix[,1] - 272 | + results.matrix[,2]) 273 | 274 | 275 | 276 | 277 | 278 | 279 | -------------------------------------------------------------------------------- /Chapter08/Chapter08.txt: 
-------------------------------------------------------------------------------- 1 | What's... uhhh... the deal with the bootstrap? 2 | 3 | 4 | > # setting seed will make random number generation reproducible 5 | > set.seed(1) 6 | > all.us.women <- rnorm(10000, mean=65, sd=3.5) 7 | > our.sample <- sample(all.us.women, 40) 8 | > bootstrap.replicates <- numeric(10000) 9 | > for(i in 1:10000){ 10 | + a.sample <- sample(our.sample, 40, replace=TRUE) 11 | + bootstrap.replicates[i] <- mean(a.sample) 12 | > } 13 | > hist(bootstrap.replicates 14 | 15 | 16 | 17 | 18 | 19 | 20 | Performing the bootstrap in R (more elegantly) 21 | 22 | btobj <- boot(our.sample, function(x, i){mean(x[i])}, 10000, 23 | parallel="multicore", ncpus=3) 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | plot(btobj) 32 | 33 | 34 | 35 | str(btobj) 36 | 37 | 38 | 39 | 40 | hist(btobj$t) 41 | 42 | 43 | 44 | 45 | Confidence intervals 46 | 47 | > quantile(btobj$t, prob=0.025) 48 | > quantile(btobj$t, prob=0.975) 49 | 50 | 51 | 52 | 53 | 54 | > boot.ci(btobj, conf=0.95, type="perc") 55 | BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS 56 | Based on 10000 bootstrap replicates 57 | CALL : 58 | boot.ci(boot.out = btobj, conf = 0.95, type = "perc") 59 | Intervals : 60 | Level Percentile 61 | 95% (63.7, 65.8 ) 62 | Calculations and Intervals on Original Scale 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | > boot.ci(btobj, type="bca") 71 | BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS 72 | Based on 10000 bootstrap replicates 73 | CALL : 74 | boot.ci(boot.out = btobj, type = "bca") 75 | Intervals : 76 | Level BCa 77 | 95% (63.67, 65.77 ) 78 | Calculations and Intervals on Original Scale 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | A one-sample test of means 88 | 89 | > t.test(precip, mu=38) 90 | One Sample t-test 91 | data: precip 92 | t = -1.901, df = 69, p-value = 0.06148 93 | alternative hypothesis: true mean is not equal to 38 94 | 95 percent confidence interval: 95 | 31.61748 38.15395 96 | sample estimates: 97 | mean of x 98 | 34.88571 99 | 100 | 101 | 102 | 103 | 104 | 105 | > set.seed(1) 106 | > btobj <- boot(precip, function(x, i){mean(x[i])}, 300000) 107 | > boot.ci(btobj, type="bca") 108 | BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS 109 | Based on 300000 bootstrap replicates 110 | CALL : 111 | boot.ci(boot.out = btobj, type = "bca") 112 | Intervals : 113 | Level BCa 114 | 95% (31.60, 37.98 ) 115 | Calculations and Intervals on Original Scale 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | Bootstrapping statistics other than the mean 125 | 126 | > set.seed(1) 127 | > law.salaries <- c(c(rnorm(70, mean=40000, sd=7000)), 500000, 400000, 128 | 350000) 129 | 130 | 131 | 132 | 133 | 134 | 135 | > btobj.mean <- boot(law.salaries, function(x, i){mean(x[i])}, 300000) 136 | > btobj.median <- boot(law.salaries, function(x, i){median(x[i])}, 300000) 137 | > btobj.tmean <- boot(law.salaries, function(x, i){mean(x[i], trim=0.05)}, 138 | 300000) 139 | > boot.ci(btobj.mean, type="bca") # 44989 - 84084 140 | > boot.ci(btobj.median, type="bca") # 39607 - 42926 141 | > boot.ci(btobj.tmean, type="bca") # 39993 - 57715 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /Chapter09/Chapter09.txt: -------------------------------------------------------------------------------- 1 | Simple linear regression 2 | 3 | 4 | plot(mpg ~ wt, data=mtcars) 5 | 6 | 7 | 8 | 9 | model <- lm(mpg ~ wt, data=mtcars) 10 | 11 | 12 | 13 | 14 | abline(model) 15 | 16 | 17 | 18 | 19 | 20 | 
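A small supplementary sketch, not part of the original file: the fitted lm object can be inspected directly with a few base R accessors, which is a quick way to see the intercept and slope that abline() just drew, and the residuals that the diagnostic plots later in this chapter are built from.

coef(model)             # intercept and slope of the fitted line
head(fitted(model))     # the model's predicted mpg for each car
head(residuals(model))  # observed mpg minus predicted mpg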
summary(model) 21 | Call: 22 | lm(formula = mpg ~ wt, data = mtcars) 23 | Residuals: 24 | Min 1Q Median 3Q Max 25 | -4.5432 -2.3647 -0.1252 1.4096 6.8727 26 | Coefficients: 27 | Estimate Std. Error t value Pr(>|t|) 28 | (Intercept) 37.2851 1.8776 19.858 < 2e-16 *** 29 | wt -5.3445 0.5591 -9.559 1.29e-10 *** 30 | --- 31 | Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 32 | Residual standard error: 3.046 on 30 degrees of freedom 33 | Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446 34 | F-statistic: 91.38 on 1 and 30 DF, p-value: 1.294e-10 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | predict(model, newdata=data.frame(wt=6)) 43 | 1 44 | 5.218297 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | Simple linear regression with a binary predictor 53 | 54 | model <- lm(mpg ~ am, data=mtcars) 55 | summary(model) 56 | Call: 57 | lm(formula = mpg ~ am, data = mtcars) 58 | Residuals: 59 | Min 1Q Median 3Q Max 60 | -9.3923 -3.0923 -0.2974 3.2439 9.5077 61 | Coefficients: 62 | Estimate Std. Error t value Pr(>|t|) 63 | (Intercept) 17.147 1.125 15.247 1.13e-15 *** 64 | am 7.245 1.764 4.106 0.000285 *** 65 | --- 66 | Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 67 | Residual standard error: 4.902 on 30 degrees of freedom 68 | Multiple R-squared: 0.3598, Adjusted R-squared: 0.3385 69 | F-statistic: 16.86 on 1 and 30 DF, p-value: 0.000285 70 | mean(mtcars$mpg[mtcars$am==0]) 71 | [1] 17.14737 72 | (mean(mtcars$mpg[mtcars$am==1]) - 73 | mean(mtcars$mpg[mtcars$am==0])) 74 | [1] 7.244939 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | # use var.equal to choose Students t-test 83 | # over Welch's t-test 84 | t.test(mpg ~ am, data=mtcars, var.equal=TRUE) 85 | Two Sample t-test 86 | data: mpg by am 87 | t = -4.1061, df = 30, p-value = 0.000285 88 | alternative hypothesis: true difference in means is not equal to 0 89 | 95 percent confidence interval: 90 | -10.84837 -3.64151 91 | sample estimates: 92 | mean in group 0 mean in group 1 93 | 17.14737 24.39231 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | mtcars$automatic <- ifelse(mtcars$am==0, "yes", "no") 104 | model <- lm(mpg ~ factor(automatic), data=mtcars) 105 | model 106 | Call: 107 | lm(formula = mpg ~ factor(automatic), data = mtcars) 108 | Coefficients: 109 | (Intercept) factor(automatic)yes 110 | 24.392 -7.245 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | A word of warning 124 | 125 | 126 | library(MASS) 127 | data(anscombe) 128 | plot(y3 ~ x3, data=anscombe) 129 | abline(lm(y3 ~ x3, data=anscombe), 130 | col="blue", lty=2, lwd=2) 131 | abline(rlm(y3 ~ x3, data=anscombe), 132 | col="red", lty=1, lwd=2) 133 | 134 | 135 | 136 | 137 | 138 | 139 | Multiple regression 140 | 141 | model <- lm(mpg ~ wt + hp, data=mtcars) 142 | summary(model) 143 | 144 | Call: 145 | lm(formula = mpg ~ wt + hp, data = mtcars) 146 | Residuals: 147 | Min 1Q Median 3Q Max 148 | -3.941 -1.600 -0.182 1.050 5.854 149 | Coefficients: 150 | Estimate Std. Error t value Pr(>|t|) 151 | (Intercept) 37.22727 1.59879 23.285 < 2e-16 *** 152 | wt -3.87783 0.63273 -6.129 1.12e-06 *** 153 | hp -0.03177 0.00903 -3.519 0.00145 ** 154 | --- 155 | Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 156 | Residual standard error: 2.593 on 29 degrees of freedom 157 | Multiple R-squared: 0.8268, Adjusted R-squared: 0.8148 158 | F-statistic: 69.21 on 2 and 29 DF, p-value: 9.109e-12 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | coef(lm(mpg ~ wt + hp, data=mtcars)) 168 | (Intercept) wt hp 169 | 37.22727012 -3.87783074 -0.03177295 170 | coef(lm(mpg ~ wt, data=mtcars)) 171 | (Intercept) wt 172 | 37.285126 -5.344472 173 | coef(lm(mpg ~ hp, data=mtcars)) 174 | (Intercept) hp 175 | 30.09886054 -0.06822828 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | predict(model, newdata = data.frame(wt=2.5, hp=275)) 185 | 1 186 | 18.79513 187 | 188 | 189 | 190 | 191 | 192 | 193 | Regression with a non-binary predictor 194 | 195 | # the dataset is in the car package 196 | library(car) 197 | model <- lm(wl2 ~ factor(group), data=WeightLoss) 198 | summary(model) 199 | Call: 200 | lm(formula = wl2 ~ factor(group), data = WeightLoss) 201 | Residuals: 202 | Min 1Q Median 3Q Max 203 | -2.100 -1.054 -0.100 0.900 2.900 204 | Coefficients: 205 | Estimate Std. Error t value Pr(>|t|) 206 | (Intercept) 3.3333 0.3756 8.874 5.12e-10 *** 207 | factor(group)Diet 0.5833 0.5312 1.098 0.281 208 | factor(group)DietEx 2.7667 0.5571 4.966 2.37e-05 *** 209 | --- 210 | Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 211 | Residual standard error: 1.301 on 31 degrees of freedom 212 | Multiple R-squared: 0.4632, Adjusted R-squared: 0.4285 213 | F-statistic: 13.37 on 2 and 31 DF, p-value: 6.494e-05 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | Kitchen sink regression 222 | 223 | # the period after the squiggly denotes all other variables 224 | model <- lm(mpg ~ ., data=mtcars) 225 | summary(model) 226 | Call: 227 | lm(formula = mpg ~ ., data = mtcars) 228 | Residuals: 229 | Min 1Q Median 3Q Max 230 | -3.4506 -1.6044 -0.1196 1.2193 4.6271 231 | Coefficients: 232 | Estimate Std. Error t value Pr(>|t|) 233 | (Intercept) 12.30337 18.71788 0.657 0.5181 234 | cyl -0.11144 1.04502 -0.107 0.9161 235 | disp 0.01334 0.01786 0.747 0.4635 236 | hp -0.02148 0.02177 -0.987 0.3350 237 | drat 0.78711 1.63537 0.481 0.6353 238 | wt -3.71530 1.89441 -1.961 0.0633 . 239 | qsec 0.82104 0.73084 1.123 0.2739 240 | vs 0.31776 2.10451 0.151 0.8814 241 | am 2.52023 2.05665 1.225 0.2340 242 | gear 0.65541 1.49326 0.439 0.6652 243 | carb -0.19942 0.82875 -0.241 0.8122 244 | --- 245 | Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 246 | Residual standard error: 2.65 on 21 degrees of freedom 247 | Multiple R-squared: 0.869, Adjusted R-squared: 0.8066 248 | F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07 249 | 250 | 251 | 252 | 253 | 254 | 255 | Cross-validation 256 | 257 | set.seed(1) 258 | train.indices <- sample(1:nrow(mtcars), nrow(mtcars)/2) 259 | training <- mtcars[train.indices,] 260 | testing <- mtcars[-train.indices,] 261 | model <- lm(mpg ~ ., data=training) 262 | summary(model) 263 | ..... (output truncated) 264 | Residual standard error: 1.188 on 5 degrees of freedom 265 | Multiple R-squared: 0.988, Adjusted R-squared: 0.9639 266 | F-statistic: 41.06 on 10 and 5 DF, p-value: 0.0003599 267 | 268 | 269 | 270 | 271 | mean((predict(model) - training$mpg) ^ 2) 272 | [1] 0.4408109 273 | # Cool, but how does it perform on the validation set? 
274 | mean((predict(model, newdata=testing) - testing$mpg) ^ 2) 275 | [1] 337.9995 276 | 277 | 278 | 279 | 280 | 281 | simpler.model <- lm(mpg ~ am + wt, data=training) 282 | mean((predict(simpler.model) - training$mpg) ^ 2) 283 | [1] 9.396091 284 | mean((predict(simpler.model, newdata=testing) - testing$mpg) ^ 2) 285 | [1] 12.70338 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | library(boot) 294 | bad.model <- glm(mpg ~ ., data=mtcars) 295 | better.model <- glm(mpg ~ am + wt + qsec, data=mtcars) 296 | bad.cv.err <- cv.glm(mtcars, bad.model, K=5) 297 | # the cross-validated MSE estimate we will be using 298 | # is a bias-corrected one stored as the second element 299 | # in the 'delta' vector of the cv.err object 300 | bad.cv.err$delta[2] 301 | [1] 14.92426 302 | 303 | better.cv.err <- cv.glm(mtcars, better.model, K=5) 304 | better.cv.err$delta[2] 305 | [1] 7.944148 306 | 307 | 308 | 309 | 310 | 311 | 312 | Linear regression diagnostics 313 | 314 | my.model <- lm(mpg ~ wt, data=mtcars) 315 | plot(my.model) 316 | 317 | 318 | 319 | 320 | 321 | Fourth Anscombe relationship 322 | 323 | model <- lm(mpg ~ am + wt + qsec, data=mtcars) 324 | library(car) 325 | vif(model) 326 | 327 | am wt qsec 328 | 329 | 330 | 331 | 332 | 333 | 334 | Advanced topics 335 | 336 | install.packages("glmnet") 337 | library(glmnet) 338 | vignette("glmnet_beta") 339 | 340 | 341 | 342 | 343 | my.model <- lm(relief ~ soma*juice, data=my.data) 344 | 345 | 346 | 347 | 348 | 349 | 350 | 2.541437 2.482952 1.364339 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | -------------------------------------------------------------------------------- /Chapter10/Chapter10.txt: -------------------------------------------------------------------------------- 1 | Using k-NN in R 2 | 3 | # "class" is one of the packages that implement k-NN 4 | # "chemometrics" contains a function we need 5 | # "mlbench" holds the dataset 6 | install.packages(c("class", "mlbench", "chemometrics")) 7 | library(class) 8 | library(mlbench) 9 | data(PimaIndiansDiabetes) 10 | PID <- PimaIndiansDiabetes 11 | 12 | 13 | # we set the seed so that our splits are the same 14 | set.seed(3) 15 | ntrain <- round(nrow(PID)*4/5) 16 | train <- sample(1:nrow(PID), ntrain) 17 | training <- PID[train,] 18 | testing <- PID[-train,] 19 | 20 | 21 | 22 | resknn <- knnEval(scale(PID[,-9]), PID[,9], train, kfold=10, 23 | knnvec=seq(1,50,by=1), 24 | legpos="bottomright") 25 | 26 | 27 | 28 | 29 | predictions <- knn(scale(training[,-9]), 30 | scale(testing[,-9]), 31 | training[,9], k=27) 32 | # function to give correct classification rate 33 | accuracy <- function(predictions, answers){ 34 | sum((predictions==answers)/(length(answers))) 35 | } 36 | accuracy(predictions, testing[,9]) 37 | [1] 0.7597403 38 | 39 | 40 | 41 | 42 | 43 | table(test[,9], preds) 44 | preds 45 | neg pos 46 | neg 86 9 47 | pos 28 31 48 | 49 | 50 | 51 | 52 | 53 | 54 | Using logistic regression in R 55 | 56 | 57 | model <- glm(diabetes ~ ., data=PID, family=binomial(logit)) 58 | 59 | 60 | 61 | 62 | 63 | summary(model) 64 | Call: 65 | glm(formula = diabetes ~ ., family = binomial(logit), data = PID) 66 | Deviance Residuals: 67 | Min 1Q Median 3Q Max 68 | -2.5566 -0.7274 -0.4159 0.7267 2.9297 69 | Coefficients: 70 | Estimate Std. 
Error z value Pr(>|z|) 71 | (Intercept) -8.4046964 0.7166359 -11.728 < 2e-16 *** 72 | pregnant 0.1231823 0.0320776 3.840 0.000123 *** 73 | glucose 0.0351637 0.0037087 9.481 < 2e-16 *** 74 | pressure -0.0132955 0.0052336 -2.540 0.011072 * 75 | 76 | 77 | 78 | 79 | 80 | predictions <- round(predict(model, type="response")) 81 | predictions <- ifelse(predictions == 1, "pos", "neg") 82 | accuracy(predictions, PID$diabetes) 83 | [1] 0.7825521 84 | 85 | 86 | 87 | 88 | 89 | set.seed(3) 90 | library(boot) 91 | cv.err <- cv.glm(PID, model, K=5) 92 | cv.err$delta[2] 93 | [1] 0.154716 94 | 1 - cv.err$delta[2] 95 | [1] 0.845284 96 | 97 | 98 | 99 | 100 | 101 | 102 | predictions <- round(predict(model, type="response", 103 | newdata=test)) 104 | predictions <- ifelse(predictions == 1, "pos", "neg") 105 | accuracy(predictions, test[,9]) # 78% 106 | [1] 0.7792208 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | Decision trees 116 | 117 | library(tree) 118 | our.big.tree <- tree(diabetes ~ ., data=training) 119 | summary(our.big.tree) 120 | Classification tree: 121 | tree(formula = diabetes ~ ., data = training) 122 | Variables actually used in tree construction: 123 | [1] "glucose" "age" "mass" "pedigree" "triceps" "pregnant" 124 | [7] "insulin" 125 | Number of terminal nodes: 16 126 | 127 | 128 | Residual mean deviance: 0.7488 = 447.8 / 598 129 | Misclassification error rate: 0.184 = 113 / 614 130 | plot(our.big.tree) 131 | text(our.big.tree) 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | set.seed(3) 141 | cv.results <- cv.tree(our.big.tree, FUN=prune.misclass) 142 | plot(cv.results$size, cv.results$dev, type="b") 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | pruned.tree <- prune.misclass(our.big.tree, best=3) 151 | plot(pruned.tree) 152 | text(pruned.tree) 153 | # let's test its accuracy 154 | pruned.preds <- predict(pruned.tree, newdata=test, type="class") 155 | accuracy(pruned.preds, test[,9]) # 71% 156 | [1] 0.7077922 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | Random forests 165 | 166 | library(randomForest) 167 | forest <- randomForest(diabetes ~ ., data=training, 168 | importance=TRUE, 169 | ntree=2000, 170 | mtry=5) 171 | accuracy(predict(forest), training[,9]) 172 | [1] 0.7654723 173 | predictions <- predict(forest, newdata = test) 174 | accuracy(predictions, test[,9]) 175 | [1] 0.7727273 176 | 177 | 178 | 179 | The circular decision boundary 180 | 181 | model <- glm(factor(dep.var) ~ ind.var1 + 182 | I(ind.var1^2) + ind.var2 + I(ind.var2^2), 183 | data=this, family=binomial(logit)) 184 | 185 | 186 | 187 | 188 | -------------------------------------------------------------------------------- /Chapter11/Chapter11.txt: -------------------------------------------------------------------------------- 1 | 2 | Creating and plotting time series 3 | 4 | 5 | > 6 | download.file("https://raw.githubusercontent.com/tonyfischetti/dawr/master/ 7 | forecasting/school-supplies.csv", "./school-supplies.csv") 8 | > school <- read.csv("./school-supplies.csv") 9 | > head(school) 10 | thedate hits 11 | 1 2004-01 20 12 | 2 2004-02 24 13 | 3 2004-03 19 14 | 4 2004-04 26 15 | 5 2004-05 25 16 | 6 2004-06 24 17 | > start(schoolts) 18 | [1] 2004 1 19 | > end(schoolts) 20 | [1] 2018 3 21 | 22 | > schoolts <- ts(school$hits, start=c(2004, 1), frequency=12) 23 | > schoolts 24 | Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec 25 | 2004 20 24 19 26 25 24 55 88 32 26 20 19 26 | 2005 27 24 22 19 24 22 57 82 34 21 19 19 27 | 2006 22 21 19 17 21 24 61 77 29 18 16 14 28 | 2007 22 16 17 18 21 25 56 86 27 18 16 14 29 
| 2008 19 16 16 16 19 20 55 79 24 16 14 12 30 | 2009 16 16 15 13 15 19 54 85 24 14 15 11 31 | 2010 17 15 13 13 15 18 49 78 23 13 11 10 32 | 2011 15 12 11 11 12 17 53 79 22 12 12 10 33 | 34 | 35 | 36 | > library(ggplot2) 37 | > library(forecast) 38 | > autoplot(schoolts) 39 | 40 | 41 | 42 | 43 | 44 | > 45 | download.file("https://raw.githubusercontent.com/tonyfischetti/dawr/master/ 46 | forecasting/yearly-land-air-temp.csv", "yearly-temp.csv") 47 | > yearlytemp <- read.csv("yearly-temp.csv") 48 | > head(yearlytemp) 49 | year val 50 | 1 1880 -0.1900000 51 | 2 1881 -0.1008333 52 | 3 1882 -0.1108333 53 | 4 1883 -0.1941667 54 | 5 1884 -0.2966667 55 | 6 1885 -0.3175000 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | > tempts <- ts(yearlytemp$val, start=1880, frequency=1) 66 | > autoplot(tempts) 67 | 68 | 69 | 70 | 71 | 72 | > # set the seed for deterministic "randomness" 73 | > set.seed(2) 74 | > gausnoise <- rnorm(100, mean=100, sd=20) 75 | > head(gausnoise) 76 | [1] 82.06171 103.69698 131.75691 77.39249 98.39496 102.64841 77 | > gausts <- ts(gausnoise, start=1900, frequency=1) 78 | > autoplot(gausts) 79 | 80 | 81 | 82 | 83 | 84 | > autoplot(AirPassengers) 85 | 86 | 87 | 88 | 89 | 90 | 91 | > autoplot(schoolts) + 92 | + ylim(0, 200) + 93 | + ggtitle('google hits for "school supplies" since 2004') + 94 | + ggsave("school-supplies-fancy.png") 95 | Saving 7 x 6.43 in image 96 | 97 | 98 | 99 | 100 | 101 | Time series decomposition 102 | 103 | > schoolcomps <- decompose(schoolts) 104 | > autoplot(schoolcomps) 105 | 106 | 107 | 108 | 109 | 110 | > aircomps <- decompose(AirPassengers, type="multiplicative") 111 | > autoplot(aircomps) 112 | 113 | 114 | 115 | 116 | 117 | 118 | Autocorrelation 119 | 120 | Lag 0 8 6 7 5 3 0 9 121 | Lag 1 NA 8 6 7 5 3 0 122 | Lag 2 NA NA 8 6 7 5 3 123 | 124 | 125 | 126 | 127 | 128 | > ggAcf(schoolts) 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | > Box.test(schoolts, type="Ljung-Box") 137 | Box-Ljung test 138 | data: schoolts 139 | X-squared = 39.472, df = 1, p-value = 0.0000000003328 140 | 141 | > Box.test(tempts, type="Ljung-Box") 142 | Box-Ljung test 143 | data: tempts 144 | X-squared = 121.71, df = 1, p-value < 2.2e-16 145 | > 146 | > Box.test(gausts, type="Ljung-Box") 147 | Box-Ljung test 148 | data: gausts 149 | X-squared = 0.36027, df = 1, p-value = 0.5484 150 | 151 | 152 | > 153 | > Box.test(AirPassengers, type="Ljung-Box") 154 | Box-Ljung test 155 | data: AirPassengers 156 | X-squared = 132.14, df = 1, p-value < 2.2e-16 157 | 158 | 159 | 160 | 161 | 162 | 163 | Smoothing 164 | 165 | > library(TTR) 166 | > sm5 <- SMA(gausnoise, n=5) 167 | > sm10 <- SMA(gausnoise, n=10) 168 | > sm15 <- SMA(gausnoise, n=15) 169 | > head(sm5, n=10) 170 | [1] NA NA NA NA 98.66061 102.77795 104.87037 171 | [8] 97.56020 110.01960 109.78546 172 | 173 | 174 | 175 | > se1 <- ses(gausts, alpha=.999) 176 | > se.66 <- ses(gausts, alpha=.666) 177 | > se.33 <- ses(gausts, alpha=.333) 178 | > se.1 <- ses(gausts, alpha=0.1) 179 | > se0 <- ses(gausts, alpha=0.001) 180 | > head(fitted(se.1)) 181 | Time Series: 182 | Start = 1900 183 | End = 1905 184 | Frequency = 1 185 | [1] 102.9813 100.8894 101.1701 104.2288 101.5452 101.2301 186 | 187 | 188 | 189 | 190 | 191 | 192 | Simple exponential smoothing for forecasting 193 | 194 | > mfore <- meanf(gausts, h=20) 195 | > autoplot(mfore) 196 | 197 | 198 | 199 | 200 | 201 | > nfore <- naive(gausts, h=20) 202 | > sfore <- ses(gausts, h=20) 203 | > # we are setting PI to false to suppress 204 | > # plotting the prediction intervals 205 | > autoplot(gausts) + 
206 | + ylim(0, 200) + xlim(1900, 2010) + 207 | + autolayer(mfore, PI=FALSE, series="mean") + 208 | + autolayer(nfore, PI=FALSE, series="naive") + 209 | + autolayer(sfore, PI=FALSE, series="ses") 210 | 211 | 212 | 213 | 214 | 215 | 216 | > summary(sfore) 217 | Forecast method: Simple exponential smoothing 218 | Model Information:Simple exponential smoothing 219 | Call: 220 | ses(y = gausts, h = 20) 221 | Smoothing parameters: 222 | alpha = 0.0001 223 | Initial states: 224 | l = 99.3055 225 | sigma: 23.0888 226 | AIC AICc BIC 227 | 1094.386 1094.636 1102.202 228 | Error measures: 229 | ME RMSE MAE MPE MAPE MASE 230 | Training set 0.07145194 23.08877 19.04151 -5.942264 20.89301 0.7154352 231 | ACF1 232 | Training set -0.05912586 233 | Forecasts: 234 | Point Forecast Lo 80 Hi 80 Lo 95 Hi 95 235 | 2000 99.3062 69.71675 128.8957 54.05305 144.5594 236 | 2001 99.3062 69.71675 128.8957 54.05305 144.5594 237 | 2002 99.3062 69.71675 128.8957 54.05305 144.5594 238 | .... 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | Accuracy assessment 247 | 248 | 249 | > checkresiduals(mfore) 250 | Ljung-Box test 251 | data: Residuals from Mean 252 | Q* = 12.702, df = 9, p-value = 0.1766 253 | Model df: 1. Total lags used: 10 254 | > 255 | > checkresiduals(nfore) 256 | Ljung-Box test 257 | data: Residuals from Naive method 258 | Q* = 58.18, df = 10, p-value = 0.000000007996 259 | Model df: 0. Total lags used: 10 260 | > 261 | > checkresiduals(sfore) 262 | Ljung-Box test 263 | data: Residuals from Simple exponential smoothing 264 | Q* = 12.701, df = 8, p-value = 0.1226 265 | Model df: 2. Total lags used: 10 266 | 267 | 268 | 269 | 270 | 271 | > train <- window(gausts, start=1900, end=1980) 272 | > nforetr <- naive(train, h=20) 273 | > mforetr <- meanf(train, h=20) 274 | > sforetr <- ses(train, h=20) 275 | 276 | 277 | 278 | 279 | 280 | > accuracy(nforetr, gausts) 281 | RMSE MAE 282 | Training set 33.41918 26.86576 283 | Test set 36.44798 33.16893 284 | > 285 | accuracy(mforetr, gausts) 286 | RMSE MAE 287 | Training set 22.41742 18.33581 288 | Test set 25.96117 22.84341 289 | > 290 | > accuracy(sforetr, gausts) 291 | RMSE MAE 292 | Training set 22.41900 18.34584 293 | Test set 26.00306 22.91065 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | > tsCV(gausts, meanf, h=5) 302 | Time Series: 303 | Start = 1900 304 | End = 1999 305 | Frequency = 1 306 | [1] NA NA NA NA NA 20.5866966 307 | [7] 21.2797482 -10.6324936 40.9624573 -1.4363504 9.0277723 18.1906911 308 | 309 | 310 | 311 | 312 | 313 | 314 | ts_cv_rmse <- function(thets, thefun, h=5){ 315 | resids <- tsCV(thets, thefun, h=h) 316 | return(sqrt(mean(resids^2, na.rm=TRUE))) 317 | } 318 | 319 | 320 | 321 | 322 | 323 | 324 | > ts_cv_rmse(gausts, naive) 325 | [1] 33.82626 326 | > ts_cv_rmse(gausts, meanf) 327 | [1] 23.75519 328 | > ts_cv_rmse(gausts, ses) 329 | [1] 24.00897 330 | 331 | 332 | 333 | 334 | 335 | Double exponential smoothing 336 | 337 | hwfr <- holt(tempts, h=50) 338 | autoplot(hwfr) 339 | 340 | 341 | 342 | 343 | 344 | > hwfrd <- holt(tempts, h=50, damped=TRUE) 345 | > autoplot(hwfrd) 346 | 347 | 348 | 349 | 350 | > checkresiduals(hwfr) 351 | Ljung-Box test 352 | data: Residuals from Holt's method 353 | Q* = 16.666, df = 6, p-value = 0.01059 354 | Model df: 4. Total lags used: 10 355 | > 356 | > checkresiduals(hwfrd) 357 | Ljung-Box test 358 | data: Residuals from Damped Holt's method 359 | Q* = 17.769, df = 5, p-value = 0.00325 360 | Model df: 5. 
Total lags used: 10 361 | 362 | 363 | 364 | 365 | 366 | 367 | > ts_cv_rmse(tempts, holt) 368 | [1] 0.1780076 369 | > # have to use an anonymous function in order to specify a damped trend 370 | > ts_cv_rmse(tempts, function(x, h) holt(x, damped=TRUE, h=h)) 371 | [1] 0.1892179 372 | > ts_cv_rmse(tempts, meanf) 373 | [1] 0.3555613 374 | > ts_cv_rmse(tempts, naive) 375 | [1] 0.1631819 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | Triple exponential smoothing 384 | 385 | 386 | > hwfore <- hw(schoolts, h=48, seasonal="additive") 387 | > autoplot(hwfore) 388 | 389 | 390 | ts_cv_rmse(schoolts, function(x, h) hw(x, seasonal="additive")) 391 | [1] 9.34646 392 | ts_cv_rmse(schoolts, snaive) 393 | [1] 3 394 | 395 | 396 | 397 | 398 | > hwfore <- hw(AirPassengers, h=48, seasonal="multiplicative") 399 | > # now let's compare the simpler approaches 400 | > snfore <- snaive(AirPassengers, h=48) 401 | > nfore <- naive(AirPassengers, h=48) 402 | > mfore <- meanf(AirPassengers, h=48) 403 | > ts_cv_rmse(AirPassengers, function(x, h) hw(x, 404 | seasonal="multiplicative")) 405 | [1] 22.33878 406 | > ts_cv_rmse(AirPassengers, snaive) 407 | [1] 35.9892 408 | > ts_cv_rmse(AirPassengers, naive) 409 | [1] 77.16156 410 | > ts_cv_rmse(AirPassengers, meanf) 411 | [1] 126.4193 412 | 413 | 414 | 415 | 416 | 417 | 418 | ETS and the state space model 419 | 420 | 421 | > etsobj <- ets(schoolts) 422 | > summary(etsobj) 423 | ETS(M,A,A) 424 | Call: 425 | ets(y = schoolts) 426 | Smoothing parameters: 427 | alpha = 0.0464 428 | beta = 0.0017 429 | gamma = 0.3957 430 | ... 431 | AIC AICc BIC 432 | 1107.799 1111.799 1161.207 433 | Training set error measures: 434 | ME RMSE MAE MPE MAPE MASE ACF1 435 | Training set 0.3560261 3.080813 1.879578 -0.146184 7.017015 0.9028789 436 | 0.2787678 437 | > 438 | > etsfore <- forecast(etsobj, h=48) 439 | 440 | 441 | 442 | 443 | 444 | 445 | Interventions for improvement 446 | 447 | > etsobj <- ets(tempts) 448 | > summary(etsobj) 449 | ETS(A,N,N) 450 | Call: 451 | ets(y = tempts) 452 | ... 453 | > autoplot(forecast(etsobj, h=50)) 454 | 455 | 456 | 457 | 458 | 459 | > etsobj2 <- ets(window(tempts, start=1970)) 460 | > summary(etsobj2) 461 | ETS(A,A,N) 462 | Call: 463 | ets(y = window(tempts, start = 1970)) 464 | ... 465 | > autoplot(forecast(etsobj2, h=50)) 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | -------------------------------------------------------------------------------- /Chapter12/Chapter12.txt: -------------------------------------------------------------------------------- 1 | Relational databases 2 | 3 | 4 | CREATE TABLE artists( 5 | artist_id INTEGER PRIMARY KEY, 6 | name TEXT, 7 | born_on INTEGER 8 | ); 9 | CREATE TABLE paintings( 10 | painting_id INTEGER PRIMARY KEY, 11 | painting_artist INTEGER, 12 | painting_name TEXT, 13 | year_completed INTEGER, 14 | FOREIGN KEY(painting_artist) REFERENCES artists(artist_id) 15 | ); 16 | INSERT INTO artists(name, born_on) 17 | VALUES ("Kay Sage", 1898), 18 | ("Piet Mondrian", 1872), 19 | ("Rene Magritte", 1898), 20 | ("Man Ray", 1890), 21 | ("Jean-Michel Basquiat", 1960); 22 | INSERT INTO paintings(painting_artist, painting_name, year_completed) 23 | VALUES (4, "Orquesta Sinfonica", 1916), 24 | (4, "La Fortune", 1938), 25 | (1, "Tommorow is Never", 1955), 26 | (1, "The Answer is No", 1958), 27 | (1, "No Passing", 1954), 28 | (5, "Bird on Money", 1981), 29 | (2, "Place de la Concorde", 1943), 30 | (2, "Composition No. 
10", 1942), 31 | (3, "The Human Condition", 1935), 32 | (3, "The Treachery of Images", 1948), 33 | (3, "The Son of Man", 1964); 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | SELECT * FROM artists; 42 | -------------------------------- 43 | 1 | Kay Sage | 1898 44 | 2 | Piet Mondrian | 1872 45 | 3 | Rene Magritte | 1898 46 | 4 | Man Ray | 1890 47 | 5 | Jean-Michel Basquiat | 1960 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | SELECT * FROM paintings; 56 | -------------------------------------- 57 | 1 | 4 | Orquesta Sinfonica | 1916 58 | 2 | 4 | La Fortune | 1938 59 | 3 | 1 | Tommorow is Never | 1955 60 | 4 | 1 | The Answer is No | 1958 61 | 5 | 1 | No Passing | 1954 62 | 6 | 5 | Bird on Money | 1981 63 | 7 | 2 | Place de la Concorde | 1943 64 | 8 | 2 | Composition No. 10 | 1942 65 | 9 | 3 | The Human Condition | 1935 66 | 10 | 3 | The Treachery of Images | 1948 67 | 11 | 3 | The Son of Man | 1964 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | library(DBI) 76 | library(RSQLite) 77 | # we read the art sqlite db from the current 78 | # working directory which can be get and set 79 | # with getwd() and setwd(), respectively 80 | art_db <- dbConnect(sqlite, "./art.db") 81 | 82 | 83 | 84 | 85 | 86 | result <- dbSendQuery(art_db, 87 | "SELECT paintings.painting_name, artists.name 88 | FROM paintings INNER JOIN artists 89 | ON paintings.painting_artist=artists.artist_id;") 90 | response <- fetch(result) 91 | head(response) 92 | dbClearResult(result) 93 | ---------------------------------------------- 94 | painting_name name 95 | 1 Orquesta Sinfonica Man Ray 96 | 2 La Fortune Man Ray 97 | 3 Tommorow is Never Kay Sage 98 | 4 The Answer is No Kay Sage 99 | 5 No Passing Kay Sage 100 | 101 | 102 | 103 | 104 | 105 | 106 | result <- dbSendQuery(art_db, 107 | "SELECT paintings.year_completed, artists.born_on 108 | FROM paintings INNER JOIN artists 109 | ON paintings.painting_artist=artists.artist_id;") 110 | response <- fetch(result) 111 | head(response) 112 | dbClearResult(result) 113 | ---------------------------- 114 | year_completed born_on 115 | 1 1916 1890 116 | 2 1938 1890 117 | 3 1955 1898 118 | 4 1958 1898 119 | 5 1954 1898 120 | 6 1981 1960 121 | At this time, row-wise subtraction and averaging can be performed simply: 122 | mean(response$year_completed - response$born_on) 123 | ----------- 124 | [1] 51.091 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | dbDisconnect(art_db) 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | Using JSON 143 | 144 | library(jsonlite) 145 | example.json <- ' 146 | { 147 | "thebeatles": { 148 | "formed": 1960, 149 | "members": [ 150 | { 151 | "firstname": "George", 152 | "lastname": "Harrison" 153 | }, 154 | { 155 | "firstname": "Ringo", 156 | "lastname": "Starr" 157 | }, 158 | { 159 | "firstname": "Paul", 160 | "lastname": "McCartney" 161 | }, 162 | { 163 | "firstname": "John", 164 | "lastname": "Lennon" 165 | } 166 | ] 167 | } 168 | }' 169 | the_beatles <- fromJSON(example.json) 170 | 171 | 172 | 173 | print(the_beatles) 174 | --------------------- 175 | $thebeatles 176 | $thebeatles$formed 177 | [1] 1960 178 | $thebeatles$members 179 | firstname lastname 180 | 1 George Harrison 181 | 2 Ringo Starr 182 | 3 Paul McCartney 183 | 4 John Lennon 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | the_beatles$thebeatles$formed 192 | the_beatles[["thebeatles"]][["formed"]] 193 | --------- 194 | [1] 1960 195 | [1] 1960 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | { 204 | "toptags": { 205 | "tag": [ 206 | { 207 | "count": 100, 208 | "name": "female vocalists", 209 | "url": 
"http://www.last.fm/tag/female+vocalists" 210 | }, 211 | { 212 | "count": 71, 213 | "name": "singer-songwriter", 214 | "url": "http://www.last.fm/tag/singer-songwriter" 215 | }, 216 | { 217 | "count": 65, 218 | "name": "pop", 219 | "url": "http://www.last.fm/tag/pop" 220 | } 221 | ] 222 | } 223 | } 224 | 225 | 226 | 227 | 228 | 229 | URLencode("The Beatles") 230 | ------- 231 | [1] "The%20Beatles" 232 | 233 | 234 | 235 | 236 | create_artist_query_url_lfm <- function(artist_name){ 237 | prefix <- 238 | "http://ws.audioscrobbler.com/2.0/?method=artist.gettoptags&artist=" 239 | postfix <- "&api_key=c2e57923a25c03f3d8b317b3c8622b43&format=json" 240 | encoded_artist <- URLencode(artist_name) 241 | return(paste0(prefix, encoded_artist, postfix)) 242 | } 243 | create_artist_query_url_lfm("Depeche Mode") 244 | -------------------- 245 | [1] 246 | "http://ws.audioscrobbler.com/2.0/?method=artist.gettoptags&artist=Depeche% 247 | 20Mode&api_key=c2e57923a25c03f3d8b317b3c8622b43&format=json" 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | fromJSON(create_artist_query_url_lfm("Depeche Mode")) 258 | ----------------------------------------- 259 | $toptags 260 | $toptags$tag 261 | count name url 262 | 1 100 electronic http://www.last.fm/tag/electronic 263 | 2 87 new wave http://www.last.fm/tag/new+wave 264 | 3 59 80s http://www.last.fm/tag/80s 265 | 4 56 synth pop http://www.last.fm/tag/synth+pop 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | get_tag_vector_lfm <- function(an_artist){ 277 | artist_url <- create_artist_query_url_lfm(an_artist) 278 | json <- fromJSON(artist_url) 279 | return(json$toptags$tag$name) 280 | } 281 | get_tag_vector_lfm("Depeche Mode") 282 | ------------------------------------------ 283 | [1] "electronic" "new wave" "80s" 284 | [4] "synth pop" "synthpop" "seen live" 285 | [7] "alternative" "rock" "british 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | our_artists <- list("Kate Bush", "Peter Tosh", "Radiohead", 300 | "The Smiths", "The Cure", "Black Uhuru") 301 | our_artists_tags <- lapply(our_artists, get_tag_vector_lfm) 302 | names(our_artists_tags) <- our_artists 303 | print(our_artists_tags) 304 | -------------------------------------- 305 | $`Kate Bush` 306 | [1] "female vocalists" "singer-songwriter" "pop" 307 | [4] "alternative" "80s" "british" 308 | ........ 309 | $`Peter Tosh` 310 | [1] "reggae" "roots reggae" "Rasta" 311 | [4] "roots" "ska" "jamaican" 312 | ........ 313 | $Radiohead 314 | [1] "alternative" "alternative rock" 315 | [3] "rock" "indie" 316 | 317 | ........ 318 | $`The Smiths` 319 | [1] "indie" "80s" "post-punk" 320 | [4] "new wave" "alternative" "rock" 321 | ........ 322 | $`The Cure` 323 | [1] "post-punk" "new wave" "alternative" 324 | [4] "80s" "rock" "seen live" 325 | ........ 
326 | $`Black Uhuru` 327 | [1] "reggae" "roots reggae" "dub" 328 | [4] "jamaica" "roots" "jamaican" 329 | 330 | 331 | 332 | 333 | 334 | jaccard_index <- function(one, two){ 335 | length(intersect(one, two))/length(union(one, two)) 336 | } 337 | 338 | 339 | 340 | 341 | 342 | jaccard_index(our_artists_tags[["Radiohead"]], 343 | our_artists_tags[["The Cure"]]) 344 | 345 | --------------- 346 | [1] 0.3333 347 | 348 | 349 | 350 | 351 | 352 | 353 | similarity_matrix <- function(artist_list, similarity_fn) { 354 | num <- length(artist_list) 355 | # initialize a num by num matrix of zeroes 356 | sim_matrix <- matrix(0, ncol = num, nrow = num) 357 | # name the rows and columns for easy lookup 358 | rownames(sim_matrix) <- names(artist_list) 359 | colnames(sim_matrix) <- names(artist_list) 360 | # for each row in the matrix 361 | for(i in 1:nrow(sim_matrix)) { 362 | # and each column 363 | for(j in 1:ncol(sim_matrix)) { 364 | # calculate that pair's similarity 365 | the_index <- similarity_fn(artist_list[[i]], 366 | artist_list[[j]]) 367 | # and store it in the right place in the matrix 368 | sim_matrix[i,j] <- round(the_index, 2) 369 | } 370 | } 371 | return(sim_matrix) 372 | } 373 | sim_matrix <- similarity_matrix(our_artists_tags, jaccard_index) 374 | print(sim_matrix) 375 | 376 | 377 | 378 | 379 | 380 | 381 | -------------------------------------------------------------- 382 | Kate Bush Peter Tosh Radiohead The Smiths The Cure Black Uhuru 383 | Kate Bush 1.00 0.05 0.31 0.25 0.21 0.04 384 | Peter Tosh 0.05 1.00 0.02 0.03 0.03 0.33 385 | Radiohead 0.31 0.02 1.00 0.31 0.33 0.04 386 | The Smiths 0.25 0.03 0.31 1.00 0.44 0.05 387 | The Cure 0.21 0.03 0.33 0.44 1.00 0.05 388 | Black Uhuru 0.04 0.33 0.04 0.05 0.05 1.00 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | # The Smiths are the fourth column 402 | sim_matrix[order(sim_matrix[,4], decreasing=TRUE), 4] 403 | ---------------------------------------------- 404 | The Smiths The Cure Radiohead Kate Bush Black Uhuru 405 | 1.00 0.44 0.31 0.25 0.05 406 | Peter Tosh 407 | 0.03 408 | 409 | 410 | 411 | 412 | 413 | 414 | XML 415 | 416 | 417 | example_xml1 <- ' 418 | 419 | 1960 420 | 421 | 422 | George 423 | Harrison 424 | 425 | 426 | Ringo 427 | Starr 428 | 429 | 430 | Paul 431 | McCartney 432 | 433 | 434 | John 435 | Lennon 436 | 437 | 438 | ' 439 | 440 | 441 | 442 | 443 | 444 | 445 | library(XML) 446 | the_beatles <- xmlTreeParse(example_xml1) 447 | print(names(the_beatles)) 448 | ------------------- 449 | [1] "doc" "dtd" 450 | print(the_beatles$doc) 451 | --------------------- 452 | $file 453 | [1] "" 454 | $version 455 | [1] "1.0" 456 | $children 457 | $children$the_beatles 458 | 459 | 1960 460 | 461 | 462 | George 463 | Harrison 464 | 465 | .......... 
466 | 467 | 468 | attr(,"class") 469 | [1] "XMLDocumentContent" 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | print(xmlValue(the_beatles$doc$children$the_beatles[["formed"]])) 483 | ---------------------- 484 | [1] "1960" 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | root <- xmlRoot(the_beatles) 495 | sapply(xmlChildren(root[["members"]]), function(x){ 496 | xmlValue(x[["first_name"]]) 497 | }) 498 | ------------------------------------------- 499 | member member member member 500 | "George" "Ringo" "Paul" "John" 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | all_first_names <- "//member/first_name" 510 | 511 | 512 | 513 | 514 | 515 | the_beatles <- xmlParse(example_xml1) 516 | getNodeSet(the_beatles, all_first_names) 517 | -------- 518 | [[1]] 519 | George 520 | [[2]] 521 | Ringo 522 | [[3]] 523 | Paul 524 | [[4]] 525 | John 526 | attr(,"class") 527 | [1] "XMLNodeSet" 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | getNodeSet(the_beatles, "//first_name") 538 | getNodeSet(the_beatles, "/the_beatles/members/member/first_name") 539 | 540 | 541 | 542 | 543 | 544 | sapply(getNodeSet(the_beatles, all_first_names), xmlValue) 545 | ------------------------------- 546 | [1] "George" "Ringo" "Paul" "John" 547 | 548 | 549 | example_xml2 <- ' 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | ' 558 | 559 | 560 | 561 | 562 | 563 | 564 | sapply(getNodeSet(the_beatles, "//member[@first_name]"), 565 | function(x){ xmlAttrs(x)[["first_name"]] }) 566 | ----------- 567 | [1] "George" "Richard" "Paul" "John" 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | Kate Bush 578 | 579 | 580 | kent 581 | 582 | 583 | english 584 | 585 | 586 | british 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | create_artist_query_url_mb <- function(artist){ 602 | encoded_artist <- URLencode(artist) 603 | return(paste0("http://musicbrainz.org/ws/2/artist/?query=artist:", 604 | encoded_artist)) 605 | } 606 | create_artist_query_url_mb("Depeche Mode") 607 | ------- 608 | [1] "http://musicbrainz.org/ws/2/artist/?query=artist:Depeche%20Mode" 609 | 610 | 611 | 612 | 613 | 614 | ns <- "http://musicbrainz.org/ns/mmd-2.0#" 615 | names(ns)[1] <- "ns" 616 | 617 | 618 | 619 | 620 | 621 | 622 | get_tag_vector_mb <- function(an_artist, ns){ 623 | artist_url <- create_artist_query_url_mb(an_artist) 624 | the_xml <- xmlParse(artist_url) 625 | xpath <- "//ns:artist[1]/ns:tag-list/ns:tag/ns:name" 626 | the_nodes <- getNodeSet(the_xml, xpath, ns) 627 | return(unlist(lapply(the_nodes, xmlValue))) 628 | } 629 | get_tag_vector_mb("Depeche Mode", ns) 630 | ------------------------------------- 631 | [1] "electronica" "post punk" "alternative dance" 632 | [4] "electronic" "dark wave" "britannique" 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | our_artists <- list("Kate Bush", "Peter Tosh", "Radiohead", 643 | "The Smiths", "The Cure", "Black Uhuru") 644 | our_artists_tags_mb <- lapply(our_artists, get_tag_vector_mb, ns) 645 | names(our_artists_tags_mb) <- our_artists 646 | sim_matrix <- similarity_matrix(our_artists_tags_mb, jaccard_index) 647 | print(sim_matrix) 648 | ------- 649 | Kate Bush Peter Tosh Radiohead The Smiths The Cure Black Uhuru 650 | Kate Bush 1.00 0.00 0.24 0.27 0.24 0.00 651 | Peter Tosh 0.00 1.00 0.00 0.00 0.00 0.17 652 | Radiohead 0.24 0.00 1.00 0.23 0.23 0.00 653 | The Smiths 0.27 0.00 0.23 1.00 0.38 0.00 654 | The Cure 0.24 0.00 0.23 0.38 1.00 0.00 655 | Black Uhuru 0.00 0.17 0.00 0.00 0.00 1.00 656 | 657 | 
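The next command ranks every artist by similarity to The Smiths by indexing the matrix with its column number (4). As an aside, the same lookup can be done by artist name; the helper below is not part of the original code, just a sketch that assumes the sim_matrix built above, but it is less error-prone once the artist list changes:

# hypothetical helper, not from the chapter: rank all artists by
# their similarity to one named artist using the matrix built above
most_similar_to <- function(artist, sim_matrix){
  # the matrix has artist names on both dimensions, so we can
  # index the column by name and sort it in decreasing order
  sort(sim_matrix[, artist], decreasing=TRUE)
}

most_similar_to("The Smiths", sim_matrix)
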
658 | > sim_matrix[order(sim_matrix[,4], decreasing=TRUE), 4] 659 | ------------------------------- 660 | The Smiths The Cure Kate Bush Radiohead Peter Tosh Black Uhuru 661 | 1.00 0.38 0.27 0.23 0.00 0.00 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | for(i in 1:length(our_artists_tags)){ 670 | the_artist <- names(our_artists_tags)[i] 671 | # the_artist now holds the current artist's name 672 | combined_tags <- union(our_artists_tags[[the_artist]], 673 | our_artists_tags_mb[[the_artist]]) 674 | our_artists_tags[[the_artist]] <- combined_tags 675 | } 676 | sim_matrix <- similarity_matrix(our_artists_tags, jaccard_index) 677 | print(sim_matrix) 678 | -------- 679 | Kate Bush Peter Tosh Radiohead The Smiths The Cure Black Uhuru 680 | Kate Bush 1.00 0.04 0.29 0.24 0.19 0.03 681 | Peter Tosh 0.04 1.00 0.01 0.03 0.03 0.29 682 | Radiohead 0.29 0.01 1.00 0.29 0.30 0.03 683 | The Smiths 0.24 0.03 0.29 1.00 0.40 0.05 684 | The Cure 0.19 0.03 0.30 0.40 1.00 0.05 685 | Black Uhuru 0.03 0.29 0.03 0.05 0.05 1.00 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | -------------------------------------------------------------------------------- /Chapter12/music-recommendations.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript --vanilla 2 | 3 | ########################################################### 4 | ## ## 5 | ## music-recommendations.R ## 6 | ## ## 7 | ## Author: Tony Fischetti ## 8 | ## tony.fischetti@gmail.com ## 9 | ## ## 10 | ########################################################### 11 | 12 | # workspace cleanup 13 | rm(list=ls()) 14 | 15 | # options 16 | options(echo=TRUE) 17 | options(stringsAsFactors=FALSE) 18 | 19 | # cli args 20 | args <- commandArgs(trailingOnly=TRUE) 21 | 22 | # libraries 23 | library(magrittr) 24 | library(assertr) 25 | library(XML) 26 | 27 | 28 | 29 | # takes a URL that produces XML http result 30 | # and returns parsed XML object 31 | get_xml_response <- function(a_url){ 32 | return(xmlParse(a_url)) 33 | } 34 | 35 | 36 | # takes a artist name and creates a url that 37 | # can be used to query music brainz for the artist 38 | create_artist_query_url <- function(artist){ 39 | encoded_artist <- URLencode(artist) 40 | return(paste0("http://musicbrainz.org/ws/2/artist/?query=artist:", 41 | encoded_artist)) 42 | } 43 | 44 | 45 | # takes the xml response from a music brainz query, and 46 | # the place of the artist in the artist list (usually 1) 47 | # and xml namespace 48 | # returns a vector of the artist tags 49 | get_tag_vector <- function(xml, place, ns){ 50 | xpath <- paste0("//ns:artist[", place, "]/ns:tag-list/ns:tag/ns:name") 51 | the_nodes <- getNodeSet(xml, xpath, ns) 52 | return(unlist(lapply(the_nodes, xmlValue))) 53 | } 54 | 55 | 56 | # combines all steps 57 | get_artists_tags <- function(artist, ns){ 58 | the_query <- create_artist_query_url(artist) 59 | xml_resp <- get_xml_response(the_query) 60 | tag_vector <- get_tag_vector(xml_resp, 1, ns) 61 | return(tag_vector) 62 | } 63 | 64 | 65 | jaccard <- function(one, two){ 66 | length(intersect(one, two))/length(union(one, two)) 67 | } 68 | 69 | make_similarity_matrix <- function(artist_list, similarity_fn) { 70 | num <- length(artist_list) 71 | sim_matrix <- matrix(0, ncol = num, nrow = num) 72 | rownames(sim_matrix) <- names(artist_list) 73 | colnames(sim_matrix) 
<- names(artist_list) 74 | for(i in 1:nrow(sim_matrix)) { 75 | for(j in 1:ncol(sim_matrix)) { 76 | sim_matrix[i,j] <- round( 77 | similarity_fn(artist_list[[i]], artist_list[[j]]), 2) 78 | } 79 | } 80 | return(sim_matrix) 81 | } 82 | 83 | 84 | 85 | 86 | 87 | ns <- "http://musicbrainz.org/ns/mmd-2.0#" 88 | names(ns)[1] <- "ns" 89 | 90 | 91 | 92 | kate.bush <- get_artists_tags("Kate Bush", ns) 93 | peter.tosh <- get_artists_tags("Peter Tosh", ns) 94 | radiohead <- get_artists_tags("Radiohead", ns) 95 | the.smiths <- get_artists_tags("The Smiths", ns) 96 | the.cure <- get_artists_tags("The Cure", ns) 97 | black.uhuru <- get_artists_tags("Black Uhuru", ns) 98 | 99 | 100 | the.list <- list(kate.bush, peter.tosh, radiohead, 101 | the.smiths, the.cure, black.uhuru) 102 | names(the.list) <- c("kate bush", "peter tosh", "radiohead", 103 | "the smiths", "the cure", "black uhuru") 104 | 105 | (make_similarity_matrix(the.list, jaccard)) 106 | 107 | 108 | #--------------------------------------------------# 109 | #--------------------------------------------------# 110 | #--------------------------------------------------# 111 | 112 | 113 | library(jsonlite) 114 | 115 | 116 | create_artist_query_url_lfm <- function(artist){ 117 | encoded_artist <- URLencode(artist) 118 | return(paste0("http://ws.audioscrobbler.com/2.0/?method=", 119 | "artist.gettoptags&artist=", 120 | encoded_artist, "&api_key=c2e57923a25c03f3d8b31", 121 | "7b3c8622b43&format=json")) 122 | } 123 | 124 | get_json_response <- function(a_url){ 125 | return(fromJSON(a_url)) 126 | } 127 | 128 | get_tag_vector_lfm <- function(json){ 129 | return(json$toptags$tag$name) 130 | } 131 | 132 | get_artists_tags_lfm <- function(artist){ 133 | the_query <- create_artist_query_url_lfm(artist) 134 | json_resp <- get_json_response(the_query) 135 | print(json_resp) 136 | tag_vector <- get_tag_vector_lfm(json_resp) 137 | return(tag_vector) 138 | } 139 | 140 | 141 | get_artists_tags_lfm("Kate Bush") 142 | 143 | 144 | 145 | our_artists <- list("Kate Bush", "Peter Tosh", "Radiohead", 146 | "The Smiths", "The Cure", "Black Uhuru") 147 | our_artists_tags <- lapply(our_artists, get_artists_tags_lfm) 148 | names(our_artists_tags) <- our_artists 149 | 150 | 151 | jaccard <- function(one, two){ 152 | length(intersect(one, two))/length(union(one, two)) 153 | } 154 | 155 | make_similarity_matrix <- function(artist_list, similarity_fn) { 156 | num <- length(artist_list) 157 | sim_matrix <- matrix(0, ncol = num, nrow = num) 158 | rownames(sim_matrix) <- names(artist_list) 159 | colnames(sim_matrix) <- names(artist_list) 160 | for(i in 1:nrow(sim_matrix)) { 161 | for(j in 1:ncol(sim_matrix)) { 162 | sim_matrix[i,j] <- round( 163 | similarity_fn(artist_list[[i]], artist_list[[j]]), 2) 164 | } 165 | } 166 | return(sim_matrix) 167 | } 168 | 169 | 170 | 171 | (make_similarity_matrix(our_artists_tags, jaccard)) 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /Chapter13/Chapter13.txt: -------------------------------------------------------------------------------- 1 | Visualizing missing data 2 | 3 | set.seed(2) 4 | miss_mtcars <- mtcars 5 | 6 | 7 | 8 | some_rows <- sample(1:nrow(miss_mtcars), 7) 9 | miss_mtcars$drat[some_rows] <- NA 10 | some_rows <- sample(1:nrow(miss_mtcars), 5) 11 | miss_mtcars$mpg[some_rows] <- NA 12 | some_rows <- sample(1:nrow(miss_mtcars), 5) 13 | miss_mtcars$cyl[some_rows] <- NA 14 | some_rows <- sample(1:nrow(miss_mtcars), 3) 15 | miss_mtcars$wt[some_rows] <- NA 16 | some_rows <- 
sample(1:nrow(miss_mtcars), 3) 17 | miss_mtcars$vs[some_rows] <- NA 18 | 19 | 20 | 21 | only_automatic <- which(miss_mtcars$am==0) 22 | some_rows <- sample(only_automatic, 4) 23 | miss_mtcars$qsec[some_rows] <- NA 24 | 25 | 26 | 27 | 28 | 29 | miss_mtcars 30 | mpg cyl disp hp drat wt qsec vs am gear carb 31 | Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 32 | Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 33 | Datsun 710 22.8 4 108.0 93 3.85 NA 18.61 1 1 4 1 34 | Hornet 4 Drive 21.4 6 258.0 110 NA 3.215 19.44 1 0 3 1 35 | Hornet Sportabout 18.7 8 360.0 175 NA 3.440 17.02 0 0 3 2 36 | Valiant 18.1 NA 225.0 105 NA 3.460 NA 1 0 3 1 37 | 38 | 39 | 40 | 41 | 42 | 43 | library(mice) 44 | md.pattern(miss_mtcars) 45 | 46 | disp hp am gear carb wt vs qsec mpg cyl drat 47 | 12 1 1 1 1 1 1 1 1 1 1 1 0 48 | 4 1 1 1 1 1 1 1 1 0 1 1 1 49 | 2 1 1 1 1 1 1 1 1 1 0 1 1 50 | 3 1 1 1 1 1 1 1 1 1 1 0 1 51 | 3 1 1 1 1 1 0 1 1 1 1 1 1 52 | 2 1 1 1 1 1 1 1 0 1 1 1 1 53 | 1 1 1 1 1 1 1 1 1 0 1 0 2 54 | 1 1 1 1 1 1 1 1 0 1 0 1 2 55 | 1 1 1 1 1 1 1 0 1 1 0 1 2 56 | 2 1 1 1 1 1 1 0 1 1 1 0 2 57 | 1 1 1 1 1 1 1 1 0 1 0 0 3 58 | 0 0 0 0 0 3 3 4 5 5 7 27 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | library(VIM) 69 | aggr(miss_mtcars, numbers=TRUE) 70 | 71 | 72 | 73 | 74 | 75 | 76 | Complete case analysis 77 | 78 | mean(miss_mtcars$drat) 79 | [1] NA 80 | mean(miss_mtcars$drat, na.rm=TRUE) 81 | [1] 3.63 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | listwise_model <- lm(mpg ~ am + wt + qsec, 92 | data=miss_mtcars, 93 | na.action = na.omit) 94 | ## OR 95 | # complete.cases returns a boolean vector 96 | comp <- complete.cases(cbind(miss_mtcars$mpg, 97 | miss_mtcars$am, 98 | miss_mtcars$wt, 99 | miss_mtcars$qsec)) 100 | comp_mtcars <- mtcars[comp,] 101 | listwise_model <- lm(mpg ~ am + wt + qsec, 102 | data=comp_mtcars) 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | Mean substitution 111 | 112 | mean_sub <- miss_mtcars 113 | mean_sub$qsec[is.na(mean_sub$qsec)] <- mean(mean_sub$qsec, 114 | na.rm=TRUE) 115 | # etc... 116 | 117 | 118 | 119 | 120 | 121 | Multiple imputation in practice 122 | 123 | 124 | # we are going to set the seed and printFlag to FALSE, but 125 | # everything else will the default argument 126 | imp <- mice(miss_mtcars, seed=3, printFlag=FALSE) 127 | print(imp) 128 | ------------------------------ 129 | Multiply imputed data set 130 | Call: 131 | mice(data = miss_mtcars, printFlag = FALSE, seed = 3) 132 | Number of multiple imputations: 5 133 | Missing cells per column: 134 | mpg cyl disp hp drat wt qsec vs am gear carb 135 | 5 5 0 0 7 3 4 3 0 0 0 136 | 137 | 138 | Imputation methods: 139 | mpg cyl disp hp drat wt qsec vs am gear carb 140 | "pmm" "pmm" "" "" "pmm" "pmm" "pmm" "pmm" "" "" "" 141 | VisitSequence: 142 | mpg cyl drat wt qsec vs 143 | 1 2 5 6 7 8 144 | 145 | 146 | PredictorMatrix: 147 | mpg cyl disp hp drat wt qsec vs am gear carb 148 | mpg 0 1 1 1 1 1 1 1 1 1 1 149 | cyl 1 0 1 1 1 1 1 1 1 1 1 150 | disp 0 0 0 0 0 0 0 0 0 0 0 151 | ... 
152 | Random generator seed value: 3 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | # convert categorical variables into factors 163 | miss_mtcars$vs <- factor(miss_mtcars$vs) 164 | miss_mtcars$cyl <- factor(miss_mtcars$cyl) 165 | imp <- mice(miss_mtcars, m=20, seed=3, printFlag=FALSE) 166 | imp$method 167 | ------------------------------------- 168 | mpg cyl disp hp drat 169 | "pmm" "polyreg" "" "" "pmm" 170 | wt qsec vs am gear 171 | "pmm" "pmm" "logreg" "" "" 172 | carb 173 | "" 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | imp$imp$mpg[,1:6] 183 | ------------------------------------ 184 | 1 2 3 4 5 6 185 | Duster 360 19.2 16.4 17.3 15.5 15.0 19.2 186 | Cadillac Fleetwood 15.2 13.3 15.0 13.3 10.4 17.3 187 | Chrysler Imperial 10.4 15.0 15.0 16.4 10.4 10.4 188 | Porsche 914-2 27.3 22.8 21.4 22.8 21.4 15.5 189 | Ferrari Dino 19.2 21.4 19.2 15.2 18.1 19.2 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | densityplot(imp) 198 | 199 | 200 | 201 | 202 | 203 | imp_models <- with(imp, lm(mpg ~ am + wt + qsec)) 204 | 205 | 206 | 207 | 208 | lapply(imp_models$analyses, coef) 209 | --------------------------------- 210 | [[1]] 211 | (Intercept) am wt qsec 212 | 18.1534095 2.0284014 -4.4054825 0.8637856 213 | [[2]] 214 | (Intercept) am wt qsec 215 | 8.375455 3.336896 -3.520882 1.219775 216 | [[3]] 217 | (Intercept) am wt qsec 218 | 5.254578 3.277198 -3.233096 1.337469 219 | ......... 220 | 221 | 222 | 223 | 224 | 225 | 226 | pooled_model <- pool(imp_models) 227 | summary(pooled_model) 228 | ---------------------------------- 229 | est se t df Pr(>|t|) 230 | (Intercept) 7.049781 9.2254581 0.764166 17.63319 0.454873254 231 | am 3.182049 1.7445444 1.824000 21.36600 0.082171407 232 | wt -3.413534 0.9983207 -3.419276 14.99816 0.003804876 233 | qsec 1.270712 0.3660131 3.471765 19.93296 0.002416595 234 | lo 95 hi 95 nmis fmi lambda 235 | (Intercept) -12.3611281 26.460690 NA 0.3459197 0.2757138 236 | am -0.4421495 6.806247 0 0.2290359 0.1600952 237 | wt -5.5414268 -1.285641 3 0.4324828 0.3615349 238 | qsec 0.5070570 2.034366 4 0.2736026 0.2042003 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | -------------------------------------------------------------------------------- /Chapter14/Chapter14.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Analysis-with-R-Second-Edition/ece3101ca2f43f725c13a67d27810297bd31142a/Chapter14/Chapter14.txt -------------------------------------------------------------------------------- /Chapter14/Chapter15.txt: -------------------------------------------------------------------------------- 1 | Allocation of memory 2 | 3 | 4 | set.seed(1) 5 | all.us.women <- rnorm(10000, mean=65, sd=3.5) 6 | means.of.our.samples.bad <- c(1) 7 | # I'm increasing the number of 8 | # samples to 30,000 to prove a point 9 | for(i in 1:30000){ 10 | a.sample <- sample(all.us.women, 40) 11 | means.of.our.samples.bad[i] <- mean(a.sample) 12 | } 13 | 14 | 15 | 16 | means.of.our.samples.bad <- c(1) 17 | system.time( 18 | for(i in 1:30000){ 19 | a.sample <- sample(all.us.women, 40) 20 | means.of.our.samples.bad[i] <- mean(a.sample) 21 | } 22 | ) 23 | means.of.our.samples.good <- numeric(30000) 24 | system.time( 25 | for(i in 1:30000){ 26 | a.sample <- sample(all.us.women, 40) 27 | means.of.our.samples[i] <- mean(a.sample) 28 | } 29 | ) 30 | ------------------------------------- 31 | user system elapsed 32 | 2.024 0.431 2.465 33 | user system elapsed 34 | 0.678 0.004 0.684 35 | 36 | 37 
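Preallocating the result vector is the habit to build, but the same simulation can also be written so that R allocates the result for us. The snippet below is not part of the chapter's code; it is a sketch that assumes the all.us.women vector created above and uses base R's replicate(), which evaluates an expression a fixed number of times and returns the collected results, so there is no vector to grow at all:

# a sketch, not from the chapter: replicate() evaluates the expression
# 30,000 times and returns the collected means as a numeric vector,
# so no manual preallocation (or growing) is needed
set.seed(1)
means.of.our.samples.replicate <- replicate(30000,
                                            mean(sample(all.us.women, 40)))
head(means.of.our.samples.replicate)
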
| 38 | 39 | 40 | 41 | 42 | 43 | Vectorization 44 | 45 | system.time( 46 | for(i in 1:length(all.us.women)) 47 | all.us.women[i] ^ 2 48 | ) 49 | -------------------------- 50 | user system elapsed 51 | 0.003 0.000 0.003 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | system.time( 60 | sapply(all.us.women, function(x) x^2) 61 | ) 62 | ----------------------- 63 | user system elapsed 64 | 0.006 0.000 0.006 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | > system.time( 76 | + vapply(all.us.women, function(x) x^2, numeric(1)) 77 | + ) 78 | ------------------------- 79 | user system elapsed 80 | 0.006 0.000 0.005 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | system.time( 92 | all.us.women ^ 2 93 | ) 94 | ---------------------- 95 | user system elapsed 96 | 0 0 0 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | Getting started with parallel R 112 | 113 | 114 | # You don't have to install this if your copy of R is new 115 | library(parallel) 116 | 117 | 118 | library(parallel) 119 | cl <- makeCluster(4) 120 | 121 | 122 | detectCores() 123 | ------------------------ 124 | [1] 4 125 | 126 | 127 | 128 | 129 | for(i in 1:4){ 130 | Sys.sleep(5) 131 | } 132 | Or, equivalently, using lapply: 133 | # lapply will pass each element of the 134 | # vector c(1, 2, 3, 4) to the function 135 | # we write but we'll ignore it 136 | lapply(1:4, function(i) Sys.sleep(5)) 137 | 138 | 139 | 140 | 141 | 142 | system.time( 143 | lapply(1:4, function(i) Sys.sleep(5)) 144 | ) 145 | ---------------------------------------- 146 | user system elapsed 147 | 0.059 0.074 20.005 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | ####################### 157 | # NON-WINDOWS VERSION # 158 | ####################### 159 | system.time( 160 | mclapply(1:4, function(i) Sys.sleep(5), mc.cores=4) 161 | ) 162 | ################### 163 | # WINDOWS VERSION # 164 | ################### 165 | system.time( 166 | parLapply(cl, 1:4, function(i) Sys.sleep(5)) 167 | ) 168 | ---------------------------------------- 169 | user system elapsed 170 | 0.021 0.042 5.013 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | An example of (some) substance 182 | 183 | 184 | 185 | 186 | haversine <- function(lat1, long1, lat2, long2, unit="km"){ 187 | radius <- 6378 # radius of Earth in kilometers 188 | delta.phi <- to.radians(lat2 - lat1) 189 | delta.lambda <- to.radians(long2 - long1) 190 | phi1 <- to.radians(lat1) 191 | phi2 <- to.radians(lat2) 192 | term1 <- sin(delta.phi/2) ^ 2 193 | term2 <- cos(phi1) * cos(phi2) * sin(delta.lambda/2) ^ 2 194 | the.terms <- term1 + term2 195 | delta.sigma <- 2 * atan2(sqrt(the.terms), sqrt(1-the.terms)) 196 | distance <- radius * delta.sigma 197 | if(unit=="km") return(distance) 198 | if(unit=="miles") return(0.621371*distance) 199 | 200 | 201 | 202 | 203 | 204 | to.radians <- function(degrees){ 205 | degrees * pi / 180 206 | } 207 | 208 | 209 | 210 | 211 | 212 | 213 | set.seed(1) 214 | the.url <- 215 | "http://opendata.socrata.com/api/views/rxrh-4cxm/rows.csv?accessType=DOWNLO 216 | AD" 217 | all.airport.locs <- read.csv(the.url, stringsAsFactors=FALSE) 218 | library(magrittr) 219 | library(assertr) 220 | CHECKS <- . %>% 221 | verify(nrow(.) == 13429) %>% 222 | verify(names(.) 
%in% c("locationID", "Latitude", "Longitude")) %>% 223 | assert(within_bounds(0, 90), Latitude) %>% 224 | assert(within_bounds(0,180), Longitude) 225 | all.airport.locs <- CHECKS(all.airport.locs) 226 | # Let's start off with 400 airports 227 | smp.size <- 400 228 | # choose a random sample of airports 229 | random.sample <- sample((1:nrow(all.airport.locs)), smp.size) 230 | airport.locs <- all.airport.locs[random.sample, ] 231 | row.names(airport.locs) <- NULL 232 | head(airport.locs) 233 | ------------------------------------- 234 | locationID Latitude Longitude 235 | 1 LWV 38.7642 87.6056 236 | 2 LS77 30.7272 91.1486 237 | 3 2N2 43.5919 71.7514 238 | 4 VG00 37.3697 75.9469 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | single.core <- function(airport.locs){ 247 | running.sum <- 0 248 | for(i in 1:(nrow(airport.locs)-1)){ 249 | for(j in (i+1):nrow(airport.locs)){ 250 | # i is the row of the first lat/long pair 251 | # j is the row of the second lat/long pair 252 | this.dist <- haversine(airport.locs[i, 2], 253 | airport.locs[i, 3], 254 | airport.locs[j, 2], 255 | airport.locs[j, 3]) 256 | running.sum <- running.sum + this.dist 257 | } 258 | } 259 | # Now we have to divide by the number of 260 | # distances we took. This is given by 261 | return(running.sum / 262 | ((nrow(airport.locs)*(nrow(airport.locs)-1))/2)) 263 | } 264 | # Now, let's time it! 265 | 266 | ---------------------------- 267 | user system elapsed 268 | 5.400 0.034 5.466 269 | [1] 1667.186 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | # We'll have to limit the output to the 278 | # first 11 columns 279 | combn(1:10, 2)[,1:11] 280 | ---------------------------------------- 281 | [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] 282 | [1,] 1 1 1 1 1 1 1 1 1 283 | [2,] 2 3 4 5 6 7 8 9 10 284 | [,10] [,11] 285 | [1,] 2 2 286 | [2,] 3 4 287 | 288 | 289 | 290 | 291 | small.world <- c("LAX", "ALB", "OLM", "JFK") 292 | all.combs <- combn(1:length(small.world), 2) 293 | for(i in 1:ncol(all.combs)){ 294 | from <- small.world[all.combs[1, i]] 295 | to <- small.world[all.combs[2, i]] 296 | print(paste(from, " <-> ", to)) 297 | } 298 | ---------------------------------------- 299 | [1] "LAX <-> ALB" 300 | [1] "LAX <-> OLM" 301 | [1] "LAX <-> JFK" 302 | [1] "ALB <-> OLM" # back to olympia 303 | [1] "ALB <-> JFK" 304 | [1] "OLM <-> JFK" 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | small.world <- c("LAX", "ALB", "OLM", "JFK") 313 | all.combs <- combn(1:length(small.world), 2) 314 | # instead of printing each airport pair in a string, 315 | # we'll return the string 316 | results <- lapply(1:ncol(all.combs), function(x){ 317 | from <- small.world[all.combs[1, x]] 318 | to <- small.world[all.combs[2, x]] 319 | return(paste(from, " <-> ", to)) 320 | }) 321 | print(results) 322 | ------------------------- 323 | [[1]] 324 | [1] "LAX <-> ALB" 325 | [[2]] 326 | [1] "LAX <-> OLM" 327 | [[3]] 328 | [1] "LAX <-> JFK" 329 | ........ 
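Before flattening this list in the next step, note, as an aside that the chapter does not use, that combn() can also apply a function to each combination directly through its FUN argument, which removes the need for a separate lapply() over column indices:

# an aside, not from the chapter: combn() hands each pair (a length-2
# vector) to FUN and simplifies the results, so this returns the same
# six "<->" labels as a plain character vector
combn(small.world, 2,
      FUN = function(pair) paste(pair[1], " <-> ", pair[2]))
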
330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | unlist(results) 341 | --------------------- 342 | [1] "LAX <-> ALB" "LAX <-> OLM" "LAX <-> JFK" 343 | [4] "ALB <-> OLM" "ALB <-> JFK" "OLM <-> JFK" 344 | 345 | 346 | 347 | 348 | single.core.lapply <- function(airport.locs){ 349 | all.combs <- combn(1:nrow(airport.locs), 2) 350 | numcombs <- ncol(all.combs) 351 | results <- lapply(1:numcombs, function(x){ 352 | lat1 <- airport.locs[all.combs[1, x], 2] 353 | long1 <- airport.locs[all.combs[1, x], 3] 354 | lat2 <- airport.locs[all.combs[2, x], 2] 355 | long2 <- airport.locs[all.combs[2, x], 3] 356 | return(haversine(lat1, long1, lat2, long2)) 357 | }) 358 | return(sum(unlist(results)) / numcombs) 359 | } 360 | system.time(ave.dist <- single.core.lapply(airport.locs)) 361 | print(ave.dist) 362 | --------------------------------------- 363 | user system elapsed 364 | 5.890 0.042 5.968 365 | [1] 1667.186 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | ####################### 377 | # NON-WINDOWS VERSION # 378 | ####################### 379 | multi.core <- function(airport.locs){ 380 | all.combs <- combn(1:nrow(airport.locs), 2) 381 | numcombs <- ncol(all.combs) 382 | results <- mclapply(1:numcombs, function(x){ 383 | lat1 <- airport.locs[all.combs[1, x], 2] 384 | long1 <- airport.locs[all.combs[1, x], 3] 385 | lat2 <- airport.locs[all.combs[2, x], 2] 386 | long2 <- airport.locs[all.combs[2, x], 3] 387 | return(haversine(lat1, long1, lat2, long2)) 388 | }, mc.cores=4) 389 | return(sum(unlist(results)) / numcombs) 390 | } 391 | ################### 392 | # WINDOWS VERSION # 393 | ################### 394 | clusterExport(cl, c("haversine", "to.radians")) 395 | 396 | multi.core <- function(airport.locs){ 397 | all.combs <- combn(1:nrow(airport.locs), 2) 398 | numcombs <- ncol(all.combs) 399 | results <- parLapply(cl, 1:numcombs, function(x){ 400 | lat1 <- airport.locs[all.combs[1, x], 2] 401 | long1 <- airport.locs[all.combs[1, x], 3] 402 | lat2 <- airport.locs[all.combs[2, x], 2] 403 | long2 <- airport.locs[all.combs[2, x], 3] 404 | return(haversine(lat1, long1, lat2, long2)) 405 | }) 406 | return(sum(unlist(results)) / numcombs) 407 | } 408 | system.time(ave.dist <- multi.core(airport.locs)) 409 | print(ave.dist) 410 | ------------------------------- 411 | user system elapsed 412 | 7.363 0.240 2.743 413 | [1] 1667.186 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | Using Rcpp 422 | 423 | 424 | #include 425 | // [[Rcpp::export]] 426 | double square(double number){ 427 | return(pow(number, 2)); 428 | } 429 | 430 | 431 | 432 | library(Rcpp) 433 | sourceCpp("our_cpp_functions.cpp") 434 | square(3) 435 | -------------------------------- 436 | [1] 9 437 | 438 | 439 | 440 | 441 | square <- function(number){ 442 | return(number^2) 443 | } 444 | 445 | 446 | 447 | 448 | /* Add this (and all other snippets that 449 | start with "// [[Rcpp::export]]") 450 | to the C++ file, not the R code. 
*/ 451 | // [[Rcpp::export]] 452 | double to_radians_cpp(double degrees){ 453 | return(degrees * 3.141593 / 180); 454 | } 455 | # with goes with our R code 456 | sourceCpp("our_cpp_functions.cpp") 457 | to_radians_cpp(10) 458 | ------------------------- 459 | [1] 0.174533 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | // [[Rcpp::export]] 472 | double haversine_cpp(double lat1, double long1, 473 | double lat2, double long2, 474 | std::string unit="km"){ 475 | int radius = 6378; 476 | double delta_phi = to_radians_cpp(lat2 - lat1); 477 | double delta_lambda = to_radians_cpp(long2 - long1); 478 | double phi1 = to_radians_cpp(lat1); 479 | double phi2 = to_radians_cpp(lat2); 480 | double term1 = pow(sin(delta_phi / 2), 2); 481 | double term2 = cos(phi1) * cos(phi2) 482 | term2 = term2 * pow(sin(delta_lambda/2), 2); 483 | double the_terms = term1 + term2; 484 | double delta_sigma = 2 * atan2(sqrt(the_terms), 485 | sqrt(1-the_terms)); 486 | double distance = radius * delta_sigma; 487 | /* if it is anything *but* km it is miles */ 488 | if(unit != "km"){ 489 | return(distance*0.621371); 490 | } 491 | return(distance); 492 | } 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | sourceCpp("our_cpp_functions.cpp") 503 | haversine(51.88, 176.65, 56.94, 154.18) 504 | haversine_cpp(51.88, 176.65, 56.94, 154.18) 505 | ---------------------------------------------- 506 | [1] 1552.079 507 | [1] 1552.079 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | // [[Rcpp::export]] 517 | double sum2(Rcpp::NumericVector a_vector){ 518 | double running_sum = 0; 519 | int length = a_vector.size(); 520 | for( int i = 0; i < length; i++ ){ 521 | running_sum = running_sum + a_vector(i); 522 | } 523 | return(running_sum); 524 | } 525 | 526 | 527 | 528 | 529 | 530 | 531 | // [[Rcpp::export]] 532 | double single_core_cpp(Rcpp::NumericMatrix mat){ 533 | int nrows = mat.nrow(); 534 | int numcomps = nrows*(nrows-1)/2; 535 | double running_sum = 0; 536 | for( int i = 0; i < nrows; i++ ){ 537 | for( int j = i+1; j < nrows; j++){ 538 | double this_dist = haversine_cpp(mat(i,0), mat(i,1), 539 | mat(j,0), mat(j,1)); 540 | running_sum = running_sum + this_dist; 541 | } 542 | } 543 | return running_sum / numcomps; 544 | } 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | sourceCpp("our_cpp_functions.cpp") 558 | the.matrix <- as.matrix(all.airport.locs[,-1]) 559 | system.time(ave.dist <- single_core_cpp(the.matrix)) 560 | print(ave.dist) 561 | ---------------------------------------- 562 | user system elapsed 563 | 0.012 0.000 0.012 564 | [1] 1667.186 565 | 566 | 567 | the.matrix <- as.matrix(all.airport.locs[,-1]) 568 | system.time(ave.dist <- single_core_cpp(the.matrix)) 569 | print(ave.dist) 570 | ------------------------------- 571 | user system elapsed 572 | 12.310 0.080 12.505 573 | [1] 1869.744 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | Being smarter about your code 584 | 585 | 586 | single.core.improved <- function(airport.locs){ 587 | numrows <- nrow(airport.locs) 588 | running.sum <- 0 589 | 590 | for (i in 1:(numrows-1)) { 591 | this.dist <- sum(haversine(airport.locs[i,2], 592 | airport.locs[i, 3], 593 | airport.locs[(i+1):numrows, 2], 594 | airport.locs[(i+1):numrows, 3])) 595 | running.sum <- running.sum + this.dist 596 | } 597 | return(running.sum / (numrows*(numrows-1)/2)) 598 | } 599 | system.time(ave.dist <- single.core.improved(all.airport.locs)) 600 | print(ave.dist) 601 | 
------------------------------------------------------------------ 602 | user system elapsed 603 | 15.537 0.173 15.866 604 | [1] 1869.744 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | -------------------------------------------------------------------------------- /Chapter15/chapter12.R: -------------------------------------------------------------------------------- 1 | # You don't have to install this if you copy of R is new 2 | library(parallel) 3 | 4 | detectCores() 5 | 6 | system.time( 7 | lapply(1:4, function(i) Sys.sleep(5)) 8 | ) 9 | 10 | ####################### 11 | # NON-WINDOWS VERSION # 12 | ####################### 13 | system.time( 14 | mclapply(1:4, function(i) Sys.sleep(5), mc.cores=4) 15 | ) 16 | 17 | ################### 18 | # WINDOWS VERSION # 19 | ################### 20 | #system.time( 21 | # parLapply(cl, 1:4, function(i) Sys.sleep(5)) 22 | #) 23 | 24 | # ALL OUTPUT SHOWN WILL BE FOR A 25 | # NON-WINDOWS COMPUTER WITH 4 CORES 26 | 27 | 28 | haversine <- function(lat1, long1, lat2, long2, unit="km"){ 29 | radius <- 6378 # radius of Earth in kilometers 30 | delta.phi <- to.radians(lat2 - lat1) 31 | delta.lambda <- to.radians(long2 - long1) 32 | phi1 <- to.radians(lat1) 33 | phi2 <- to.radians(lat2) 34 | term1 <- sin(delta.phi/2) ^ 2 35 | term2 <- cos(phi1) * cos(phi2) * sin(delta.lambda/2) ^ 2 36 | the.terms <- term1 + term2 37 | delta.sigma <- 2 * atan2(sqrt(the.terms), sqrt(1-the.terms)) 38 | distance <- radius * delta.sigma 39 | if(unit=="km") return(distance) 40 | if(unit=="miles") return(0.621371*distance) 41 | } 42 | 43 | to.radians <- function(degrees){ 44 | degrees * pi / 180 45 | } 46 | 47 | 48 | 49 | 50 | set.seed(1) 51 | 52 | all.airport.locs <- read.csv("http://opendata.socrata.com/api/views/rxrh-4cxm/rows.csv?accessType=DOWNLOAD", 53 | stringsAsFactors=FALSE) 54 | 55 | library(magrittr) 56 | library(assertr) 57 | CHECKS <- . %>% 58 | verify(nrow(.) == 13429) %>% 59 | verify(names(.) %in% c("locationID", "Latitude", "Longitude")) %>% 60 | assert(within_bounds(0, 90), Latitude) %>% 61 | assert(within_bounds(0,180), Longitude) 62 | 63 | all.airport.locs <- CHECKS(all.airport.locs) 64 | 65 | # Let's start off with 400 airports 66 | smp.size <- 400 67 | 68 | # choose a random sample of airports 69 | random.sample <- sample((1:nrow(all.airport.locs)), smp.size) 70 | airport.locs <- all.airport.locs[random.sample, ] 71 | row.names(airport.locs) <- NULL 72 | 73 | head(airport.locs) 74 | 75 | 76 | single.core <- function(airport.locs){ 77 | running.sum <- 0 78 | for(i in 1:(nrow(airport.locs)-1)){ 79 | for(j in (i+1):nrow(airport.locs)){ 80 | # i is the row of the first lat/long pair 81 | # j is the row of the second lat/long pair 82 | this.dist <- haversine(airport.locs[i, 2], 83 | airport.locs[i, 3], 84 | airport.locs[j, 2], 85 | airport.locs[j, 3]) 86 | running.sum <- running.sum + this.dist 87 | } 88 | } 89 | # Now we have to divide by the number of 90 | # distances we took. 
This is given by 91 | return(running.sum / 92 | ((nrow(airport.locs)*(nrow(airport.locs)-1))/2)) 93 | } 94 | 95 | system.time(ave.dist <- single.core(airport.locs)) 96 | print(ave.dist) 97 | 98 | 99 | combn(1:10, 2)[,1:11] 100 | 101 | 102 | small.world <- c("LAX", "ALB", "OLM", "JFK") 103 | all.combs <- combn(1:length(small.world), 2) 104 | 105 | for(i in 1:ncol(all.combs)){ 106 | from <- small.world[all.combs[1, i]] 107 | to <- small.world[all.combs[2, i]] 108 | print(paste(from, " <-> ", to)) 109 | } 110 | 111 | 112 | small.world <- c("LAX", "ALB", "OLM", "JFK") 113 | all.combs <- combn(1:length(small.world), 2) 114 | 115 | # instead of printing each airport pair in a string, 116 | # we'll return the string 117 | results <- lapply(1:ncol(all.combs), function(x){ 118 | from <- small.world[all.combs[1, x]] 119 | to <- small.world[all.combs[2, x]] 120 | #from <- small.world[all.combs[i, 1]] 121 | #to <- small.world[all.combs[i, 2]] 122 | return(paste(from, " <-> ", to)) 123 | }) 124 | 125 | print(results) 126 | 127 | unlist(results) 128 | 129 | single.core.lapply <- function(airport.locs){ 130 | all.combs <- combn(1:nrow(airport.locs), 2) 131 | numcombs <- ncol(all.combs) 132 | results <- lapply(1:numcombs, function(x){ 133 | lat1 <- airport.locs[all.combs[1, x], 2] 134 | long1 <- airport.locs[all.combs[1, x], 3] 135 | lat2 <- airport.locs[all.combs[2, x], 2] 136 | long2 <- airport.locs[all.combs[2, x], 3] 137 | return(haversine(lat1, long1, lat2, long2)) 138 | }) 139 | return(sum(unlist(results)) / numcombs) 140 | } 141 | 142 | system.time(ave.dist <- single.core.lapply(airport.locs)) 143 | print(ave.dist) 144 | 145 | 146 | ####################### 147 | # NON-WINDOWS VERSION # 148 | ####################### 149 | multi.core <- function(airport.locs){ 150 | all.combs <- combn(1:nrow(airport.locs), 2) 151 | numcombs <- ncol(all.combs) 152 | results <- mclapply(1:numcombs, function(x){ 153 | lat1 <- airport.locs[all.combs[1, x], 2] 154 | long1 <- airport.locs[all.combs[1, x], 3] 155 | lat2 <- airport.locs[all.combs[2, x], 2] 156 | long2 <- airport.locs[all.combs[2, x], 3] 157 | return(haversine(lat1, long1, lat2, long2)) 158 | }, mc.cores=4) 159 | return(sum(unlist(results)) / numcombs) 160 | } 161 | 162 | ################### 163 | # WINDOWS VERSION # 164 | ################### 165 | #clusterExport(cl, c("haversine", "to.radians")) 166 | # 167 | #multi.core <- function(airport.locs){ 168 | # all.combs <- combn(1:nrow(airport.locs), 2) 169 | # numcombs <- ncol(all.combs) 170 | # results <- parLapply(cl, 1:numcombs, function(x){ 171 | # lat1 <- airport.locs[all.combs[1, x], 2] 172 | # long1 <- airport.locs[all.combs[1, x], 3] 173 | # lat2 <- airport.locs[all.combs[2, x], 2] 174 | # long2 <- airport.locs[all.combs[2, x], 3] 175 | # return(haversine(lat1, long1, lat2, long2)) 176 | # }) 177 | # return(sum(unlist(results)) / numcombs) 178 | #} 179 | 180 | system.time(ave.dist <- multi.core(airport.locs)) 181 | print(ave.dist) 182 | 183 | library(Rcpp) 184 | 185 | sourceCpp("our_cpp_functions.cpp") 186 | 187 | square(3) 188 | 189 | 190 | sourceCpp("our_cpp_functions.cpp") 191 | to_radians_cpp(10) 192 | 193 | 194 | sourceCpp("our_cpp_functions.cpp") 195 | 196 | haversine(51.88, 176.65, 56.94, 154.18) 197 | haversine_cpp(51.88, 176.65, 56.94, 154.18) 198 | 199 | 200 | sourceCpp("our_cpp_functions.cpp") 201 | 202 | system.time(ave.dist <- single_core_cpp(as.matrix(airport.locs[,-1]))) 203 | print(ave.dist) 204 | 205 | 206 | 207 | the.matrix <- as.matrix(all.airport.locs[,-1]) 208 | system.time(ave.dist 
<- single_core_cpp(the.matrix)) 209 | print(ave.dist) 210 | 211 | 212 | 213 | 214 | single.core.improved <- function(airport.locs){ 215 | 216 | numrows <- nrow(airport.locs) 217 | 218 | running.sum <- 0 219 | for (i in 1:(numrows-1)) { 220 | this.dist <- sum(haversine(airport.locs[i,2], 221 | airport.locs[i, 3], 222 | airport.locs[(i+1):numrows, 2], 223 | airport.locs[(i+1):numrows, 3])) 224 | running.sum <- running.sum + this.dist 225 | } 226 | return(running.sum / (numrows*(numrows-1)/2)) 227 | } 228 | 229 | system.time(ave.dist <- single.core.improved(all.airport.locs)) 230 | print(ave.dist) 231 | 232 | -------------------------------------------------------------------------------- /Chapter15/our_cpp_functions.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // [[Rcpp::export]] 4 | double square(double number){ 5 | return(pow(number, 2)); 6 | } 7 | 8 | // [[Rcpp::export]] 9 | double to_radians_cpp(double degrees){ 10 | return(degrees * 3.141593 / 180); 11 | } 12 | 13 | 14 | // [[Rcpp::export]] 15 | double sum2(Rcpp::NumericVector a_vector){ 16 | double running_sum = 0; 17 | int length = a_vector.size(); 18 | for( int i = 0; i < length; i++ ){ 19 | running_sum = running_sum + a_vector(i); 20 | } 21 | return(running_sum); 22 | } 23 | 24 | 25 | 26 | // [[Rcpp::export]] 27 | double haversine_cpp(double lat1, double long1, 28 | double lat2, double long2, 29 | std::string unit="km"){ 30 | int radius = 6378; 31 | double delta_phi = to_radians_cpp(lat2 - lat1); 32 | double delta_lambda = to_radians_cpp(long2 - long1); 33 | double phi1 = to_radians_cpp(lat1); 34 | double phi2 = to_radians_cpp(lat2); 35 | double term1 = pow(sin(delta_phi / 2), 2); 36 | double term2 = cos(phi1) * cos(phi2) * pow(sin(delta_lambda/2), 2); 37 | double the_terms = term1 + term2; 38 | double delta_sigma = 2 * atan2(sqrt(the_terms), sqrt(1-the_terms)); 39 | double distance = radius * delta_sigma; 40 | 41 | /* if it is anything *but* km it is miles */ 42 | if(unit != "km"){ 43 | return(distance*0.621371); 44 | } 45 | 46 | return(distance); 47 | } 48 | 49 | 50 | 51 | // [[Rcpp::export]] 52 | double single_core_cpp(Rcpp::NumericMatrix mat){ 53 | int nrows = mat.nrow(); 54 | int numcomps = nrows*(nrows-1)/2; 55 | double running_sum = 0; 56 | for( int i = 0; i < nrows; i++ ){ 57 | for( int j = i+1; j < nrows; j++){ 58 | double this_dist = haversine_cpp(mat(i,0), mat(i,1), 59 | mat(j,0), mat(j,1)); 60 | 61 | running_sum = running_sum + this_dist; 62 | } 63 | } 64 | return running_sum / numcomps; 65 | } 66 | 67 | 68 | -------------------------------------------------------------------------------- /Chapter16/Chapter16.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Analysis-with-R-Second-Edition/ece3101ca2f43f725c13a67d27810297bd31142a/Chapter16/Chapter16.txt -------------------------------------------------------------------------------- /Chapter17/Chapter17.txt: -------------------------------------------------------------------------------- 1 | RStudio 2 | 3 | library(ggplot2) 4 | nothing <- data.frame(a=rbinom(1000, 20, .5), 5 | b=c("red", "white"), 6 | c=rnorm(1000, mean=100, sd=10)) 7 | qplot(c, data=nothing, geom="histogram") 8 | write.csv(nothing, "nothing.csv") 9 | Execute the statements one by one. 
Notice that 10 | 11 | 12 | 13 | Running R scripts 14 | 15 | 16 | R CMD BATCH nothing.R 17 | 18 | R --vanilla CMD BATCH nothing.R 19 | 20 | Rscript nothing.R 21 | 22 | Rscript --vanilla nothing.R 23 | 24 | 25 | 26 | 27 | 28 | An example script 29 | 30 | #!/usr/bin/Rscript --vanilla 31 | ########################################################### 32 | ## ## 33 | ## nyc-sat-scores.R ## 34 | ## ## 35 | ## Author: Tony Fischetti ## 36 | ## tony.fischetti@gmail.com ## 37 | ## ## 38 | ########################################################### 39 | ## 40 | ## Aim: to use Bayesian analysis to compare NYC's 2010 41 | ## combined SAT scores against the average of the 42 | ## rest of the country, which, according to 43 | ## FairTest.com, is 1509 44 | ## 45 | # workspace cleanup 46 | rm(list=ls()) 47 | # options 48 | options(echo=TRUE) 49 | options(stringsAsFactors=FALSE) 50 | # libraries 51 | library(assertr) # for data checking 52 | library(runjags) # for MCMC 53 | # make sure everything is all set with JAGS 54 | testjags() 55 | # yep! 56 | ## read data file 57 | # data was retrieved from NYC Open Data portal 58 | # direct link: 59 | https://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.csv?accessType=DOWNL 60 | OAD 61 | nyc.sats <- read.csv("./data/SAT_Scores_NYC_2010.csv") 62 | # let's give the columns easier names 63 | better.names <- c("id", "school.name", "n", "read.mean", 64 | "math.mean", "write.mean") 65 | names(nyc.sats) <- better.names 66 | # there are 460 rows but almost 700 NYC schools 67 | # we will *assume*, then, that this is a random 68 | # sample of NYC schools 69 | # let's first check the veracity of this data... 70 | #nyc.sats <- assert(nyc.sats, is.numeric, 71 | # n, read.mean, math.mean, write.mean) 72 | # It looks like check failed because there are "s"s for some 73 | # rows. (??) A look at the data set descriptions indicates 74 | # that the "s" is for schools # with 5 or fewer students. 75 | # For our purposes, let's just exclude them. 76 | # This is a function that takes a vector, replaces all "s"s 77 | # with NAs and make coverts all non-"s"s into numerics 78 | remove.s <- function(vec){ 79 | ifelse(vec=="s", NA, vec) 80 | } 81 | nyc.sats$n <- as.numeric(remove.s(nyc.sats$n)) 82 | nyc.sats$read.mean <- as.numeric(remove.s(nyc.sats$read.mean)) 83 | nyc.sats$math.mean <- as.numeric(remove.s(nyc.sats$math.mean)) 84 | nyc.sats$write.mean <- as.numeric(remove.s(nyc.sats$write.mean)) 85 | # Remove schools with fewer than 5 test takers 86 | nyc.sats <- nyc.sats[complete.cases(nyc.sats), ] 87 | # Calculate a total combined SAT score 88 | nyc.sats$combined.mean <- (nyc.sats$read.mean + 89 | nyc.sats$math.mean + 90 | nyc.sats$write.mean) 91 | # Let's build a posterior distribution of the true mean 92 | # of NYC high schools' combined SAT scores. 
93 | # We're not going to look at the summary statistics, because 94 | # we don't want to bias our priors 95 | # Specify a standard gaussian model 96 | the.model <- " 97 | model { 98 | # priors 99 | mu ~ dunif(0, 2400) 100 | stddev ~ dunif(0, 500) 101 | tau <- pow(stddev, -2) 102 | # likelihood 103 | for(i in 1:theLength){ 104 | samp[i] ~ dnorm(mu, tau) 105 | } 106 | }" 107 | the.data <- list( 108 | samp = nyc.sats$combined.mean, 109 | theLength = length(nyc.sats$combined.mean) 110 | ) 111 | results <- autorun.jags(the.model, data=the.data, 112 | n.chains = 3, 113 | monitor = c('mu', 'stddev')) 114 | # View the results of the MCMC 115 | print(results) 116 | # Plot the MCMC diagnostics 117 | plot(results, plot.type=c("histogram", "trace"), layout=c(2,1)) 118 | # Looks good! 119 | # Let's extract the MCMC samples of the mean and get the 120 | # bounds of the middle 95% 121 | results.matrix <- as.matrix(results$mcmc) 122 | mu.samples <- results.matrix[,'mu'] 123 | bounds <- quantile(mu.samples, c(.025, .975)) 124 | # We are 95% sure that the true mean is between 1197 and 1232 125 | # Now let's plot the marginal posterior distribution for the mean 126 | # of the NYC high schools' combined SAT grades and draw the 95% 127 | # percent credible interval. 128 | plot(density(mu.samples), 129 | main=paste("Posterior distribution of mean combined SAT", 130 | "score in NYC high schools (2010)", sep="\n")) 131 | lines(c(bounds[1], bounds[2]), c(0, 0), lwd=3, col="red") 132 | # Given the results, the SAT scores for NYC high schools in 2010 133 | # are *incontrovertibly* not at par with the average SAT scores of 134 | # the nation. 135 | 136 | 137 | 138 | 139 | 140 | Scripting and reproducibility 141 | 142 | > devtools::session_info() 143 | Session info --------------------------------- 144 | setting value 145 | version R version 3.2.1 (2015-06-18) 146 | system x86_64, darwin13.4.0 147 | ui RStudio (0.99.486) 148 | language (EN) 149 | collate en_US.UTF-8 150 | tz America/New_York 151 | date 1969-07-20 152 | Packages ------------------------------------- 153 | package * version date source 154 | assertr * 1.0.0 2015-06-26 CRAN (R 3.2.1) 155 | coda 0.17-1 2015-03-03 CRAN (R 3.2.0) 156 | devtools 1.9.1 2015-09-11 CRAN (R 3.2.0) 157 | digest 0.6.8 2014-12-31 CRAN (R 3.2.0) 158 | lattice 0.20-33 2015-07-14 CRAN (R 3.2.0) 159 | memoise 0.2.1 2014-04-22 CRAN (R 3.2.0) 160 | modeest 2.1 2012-10-15 CRAN (R 3.2.0) 161 | 162 | 163 | rjags 3-15 2015-04-15 CRAN (R 3.2.0) 164 | runjags * 2.0.2-8 2015-09-14 CRAN (R 3.2.0) 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | R projects 173 | 174 | read.csv("/Users/bensisko/Desktop/SAT_Scores_NYC_2010.csv") 175 | 176 | 177 | #!/usr/bin/Rscript --vanilla 178 | source("./code/load-and-clean-sat-data.R") 179 | source("./code/analyze-sat-data.R") 180 | 181 | 182 | 183 | 184 | Communicating results 185 | 186 | 187 | --- 188 | title: "NYC SAT Scores Analysis" 189 | author: "Tony Fischetti" 190 | date: "November 1, 2015" 191 | output: html_document 192 | --- 193 | #### Aim: 194 | To use Bayesian analysis to compare NYC's 2010 195 | combined SAT scores against the average of the 196 | rest of the country, which, according to 197 | FairTest.com, is 1509 198 | ```{r, echo=FALSE} 199 | # options 200 | options(echo=TRUE) 201 | options(stringsAsFactors=FALSE) 202 | ``` 203 | We are going to use the `assertr` and `runjags` 204 | packages for data checking and MCMC, respectively. 
205 | ```{r}
206 | # libraries
207 | library(assertr)   # for data checking
208 | library(runjags)   # for MCMC
209 | ```
210 | Let's make sure everything is all set with JAGS!
211 | ```{r}
212 | testjags()
213 | ...
214 | ```
215 | Great!
216 | This data was found in the NYC Open Data Portal:
217 | https://nycopendata.socrata.com
218 | ```{r}
219 | link.to.data <-
220 |   "http://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.csv?accessType=DOWNLOAD"
221 | 
222 | download.file(link.to.data, "./data/SAT_Scores_NYC_2010.csv")
223 | nyc.sats <- read.csv("./data/SAT_Scores_NYC_2010.csv")
224 | ```
225 | Let's give the columns easier names
226 | ```{r}
227 | better.names <- c("id", "school.name", "n", "read.mean",
228 |                   "math.mean", "write.mean")
229 | names(nyc.sats) <- better.names
230 | ```
231 | There are `r nrow(nyc.sats)` rows but almost 700 NYC schools. We will,
232 | therefore, *assume* that this is a random sample of NYC schools.
233 | Let's first check the veracity of this data...
234 | ```{r, error=TRUE}
235 | nyc.sats <- assert(nyc.sats, is.numeric,
236 |                    n, read.mean, math.mean, write.mean)
237 | ```
238 | It looks like the check failed because there are "s"s for some rows. (??)
239 | A look at the data set descriptions indicates that the "s" is for schools
240 | with 5 or fewer students. For our purposes, let's just exclude them.
241 | This is a function that takes a vector, replaces all "s"s
242 | with NAs and converts all non-"s"s into numerics
243 | ```{r}
244 | remove.s <- function(vec){
245 |   ifelse(vec=="s", NA, vec)
246 | }
247 | nyc.sats$n <- as.numeric(remove.s(nyc.sats$n))
248 | nyc.sats$read.mean <- as.numeric(remove.s(nyc.sats$read.mean))
249 | nyc.sats$math.mean <- as.numeric(remove.s(nyc.sats$math.mean))
250 | nyc.sats$write.mean <- as.numeric(remove.s(nyc.sats$write.mean))
251 | ```
252 | 
253 | 
254 | Now we are going to remove schools with fewer than 5 test takers
255 | and calculate a combined SAT score
256 | ```{r}
257 | nyc.sats <- nyc.sats[complete.cases(nyc.sats), ]
258 | # Calculate a total combined SAT score
259 | nyc.sats$combined.mean <- (nyc.sats$read.mean +
260 |                            nyc.sats$math.mean +
261 |                            nyc.sats$write.mean)
262 | ```
263 | Let's now build a posterior distribution of the true mean of NYC high
264 | schools' combined SAT scores. We're not going to look at the summary
265 | statistics, because we don't want to bias our priors.
266 | We will use a standard gaussian model.
267 | ```{r, cache=TRUE, results="hide", warning=FALSE, message=FALSE}
268 | the.model <- "
269 | model {
270 |   # priors
271 |   mu ~ dunif(0, 2400)
272 |   stddev ~ dunif(0, 500)
273 |   tau <- pow(stddev, -2)
274 |   # likelihood
275 |   for(i in 1:theLength){
276 |     samp[i] ~ dnorm(mu, tau)
277 |   }
278 | }"
279 | the.data <- list(
280 |   samp = nyc.sats$combined.mean,
281 |   theLength = length(nyc.sats$combined.mean)
282 | )
283 | results <- autorun.jags(the.model, data=the.data,
284 |                         n.chains = 3,
285 |                         monitor = c('mu'))
286 | ```
287 | Let's view the results of the MCMC.
288 | ```{r}
289 | print(results)
290 | ```
291 | Now let's plot the MCMC diagnostics
292 | ```{r, message=FALSE}
293 | plot(results, plot.type=c("histogram", "trace"), layout=c(2,1))
294 | ```
295 | Looks good!
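(Editorial aside, not part of the original chapter: if you also want a numeric convergence check to go with the trace plots, the Gelman-Rubin diagnostic from the `coda` package, which `runjags` already pulls in as a dependency, can be run on the chains. This is a minimal sketch and assumes `results$mcmc` is the usual `mcmc.list` holding the three chains.)

```{r}
# hedged sketch: potential scale reduction factors close to 1
# indicate the three chains have converged on the same posterior
coda::gelman.diag(results$mcmc)
```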
296 | Let's extract the MCMC samples of the mean, and get the
297 | bounds of the middle 95%
298 | ```{r}
299 | results.matrix <- as.matrix(results$mcmc)
300 | mu.samples <- results.matrix[,'mu']
301 | bounds <- quantile(mu.samples, c(.025, .975))
302 | ```
303 | We are 95% sure that the true mean is between
304 | `r round(bounds[1], 2)` and `r round(bounds[2], 2)`.
305 | Now let's plot the marginal posterior distribution for the mean
306 | of the NYC high schools' combined SAT grades, and draw the 95
307 | percent credible interval.
308 | ```{r}
309 | plot(density(mu.samples),
310 |      main=paste("Posterior distribution of mean combined SAT",
311 |                 "score in NYC high schools (2010)", sep="\n"))
312 | lines(c(bounds[1], bounds[2]), c(0, 0), lwd=3, col="red")
313 | ```
314 | Given the results, the SAT scores for NYC high schools in 2010
315 | are **incontrovertibly** not on par with the average SAT scores of
316 | the nation.
317 | ------------------------------------
318 | This is some session information for reproducibility:
319 | ```{r}
320 | devtools::session_info()
--------------------------------------------------------------------------------
/Chapter17/nyc-sat-scores.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/Rscript --vanilla
2 | 
3 | ###########################################################
4 | ##                                                       ##
5 | ##                  nyc-sat-scores.R                     ##
6 | ##                                                       ##
7 | ##              Author: Tony Fischetti                   ##
8 | ##              tony.fischetti@gmail.com                 ##
9 | ##                                                       ##
10 | ###########################################################
11 | 
12 | ##
13 | ## Aim: to use Bayesian analysis to compare NYC's 2010
14 | ##      combined SAT scores against the average of the
15 | ##      rest of the country, which, according to
16 | ##      FairTest.com, is 1509
17 | ##
18 | 
19 | # workspace cleanup
20 | rm(list=ls())
21 | 
22 | # options
23 | options(echo=TRUE)
24 | options(stringsAsFactors=FALSE)
25 | 
26 | # libraries
27 | library(assertr)   # for data checking
28 | library(runjags)   # for MCMC
29 | 
30 | # make sure everything is all set with JAGS
31 | testjags()
32 | # yep!
33 | 
34 | 
35 | ## read data file
36 | # data was retrieved from NYC Open Data portal
37 | # direct link: https://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.csv?accessType=DOWNLOAD
38 | nyc.sats <- read.csv("./data/SAT_Scores_NYC_2010.csv")
39 | 
40 | # let's give the columns easier names
41 | better.names <- c("id", "school.name", "n", "read.mean",
42 |                   "math.mean", "write.mean")
43 | names(nyc.sats) <- better.names
44 | 
45 | 
46 | # there are 460 rows but almost 700 NYC schools
47 | # we will *assume*, then, that this is a random
48 | # sample of NYC schools
49 | 
50 | # let's first check the veracity of this data...
51 | #nyc.sats <- assert(nyc.sats, is.numeric,
52 | #                   n, read.mean, math.mean, write.mean)
53 | 
54 | # It looks like the check failed because there are "s"s for some rows. (??)
55 | # A look at the data set descriptions indicates that the "s" is for schools
56 | # with 5 or fewer students. For our purposes, let's just exclude them.
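## ---------------------------------------------------------------------
## Editorial aside (not part of the original script): a more compact way
## to do the same cleanup is to coerce the four columns directly, since
## as.numeric() turns any non-numeric string such as "s" into NA (with a
## warning, which we suppress here). Shown on a copy, using the new,
## illustrative names "score.cols" and "nyc.sats.alt", so the original
## step-by-step version below is left untouched.
## ---------------------------------------------------------------------
score.cols <- c("n", "read.mean", "math.mean", "write.mean")
nyc.sats.alt <- nyc.sats
nyc.sats.alt[score.cols] <- suppressWarnings(
  lapply(nyc.sats.alt[score.cols], as.numeric))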
57 | 
58 | 
59 | # This is a function that takes a vector, replaces all "s"s
60 | # with NAs and converts all non-"s"s into numerics
61 | remove.s <- function(vec){
62 |   ifelse(vec=="s", NA, vec)
63 | }
64 | 
65 | nyc.sats$n <- as.numeric(remove.s(nyc.sats$n))
66 | nyc.sats$read.mean <- as.numeric(remove.s(nyc.sats$read.mean))
67 | nyc.sats$math.mean <- as.numeric(remove.s(nyc.sats$math.mean))
68 | nyc.sats$write.mean <- as.numeric(remove.s(nyc.sats$write.mean))
69 | 
70 | # Remove schools with fewer than 5 test takers
71 | nyc.sats <- nyc.sats[complete.cases(nyc.sats), ]
72 | 
73 | # Calculate a total combined SAT score
74 | nyc.sats$combined.mean <- (nyc.sats$read.mean +
75 |                            nyc.sats$math.mean +
76 |                            nyc.sats$write.mean)
77 | 
78 | # Let's build a posterior distribution of the true mean
79 | # of NYC high schools' combined SAT scores.
80 | 
81 | # We're not going to look at the summary statistics because
82 | # we don't want to bias our priors
83 | 
84 | # Specify a standard gaussian model
85 | the.model <- "
86 | model {
87 |   # priors
88 |   mu ~ dunif(0, 2400)
89 |   stddev ~ dunif(0, 500)
90 |   tau <- pow(stddev, -2)
91 | 
92 |   # likelihood
93 |   for(i in 1:theLength){
94 |     samp[i] ~ dnorm(mu, tau)
95 |   }
96 | }"
97 | 
98 | the.data <- list(
99 |   samp = nyc.sats$combined.mean,
100 |   theLength = length(nyc.sats$combined.mean)
101 | )
102 | 
103 | results <- autorun.jags(the.model, data=the.data,
104 |                         n.chains = 3,
105 |                         monitor = c('mu', 'stddev'))
106 | 
107 | # View the results of the MCMC
108 | print(results)
109 | 
110 | # Plot the MCMC diagnostics
111 | plot(results, plot.type=c("histogram", "trace"), layout=c(2,1))
112 | # Looks good!
113 | 
114 | # Let's extract the MCMC samples of the mean and get the
115 | # bounds of the middle 95%
116 | results.matrix <- as.matrix(results$mcmc)
117 | mu.samples <- results.matrix[,'mu']
118 | bounds <- quantile(mu.samples, c(.025, .975))
119 | 
120 | # We are 95% sure that the true mean is between 1197 and 1232
121 | 
122 | # Now let's plot the marginal posterior distribution for the mean
123 | # of the NYC high schools' combined SAT grades and draw the 95
124 | # percent credible interval.
125 | plot(density(mu.samples), main=paste("Posterior distribution of mean combined SAT",
126 |                                      "score in NYC high schools (2010)", sep="\n"))
127 | lines(c(bounds[1], bounds[2]), c(0, 0), lwd=3, col="red")
128 | 
129 | 
130 | # Given the results, the SAT scores for NYC high schools in 2010
131 | # are *incontrovertibly* not on par with the average SAT scores of
132 | # the nation.
133 | 
--------------------------------------------------------------------------------
/Chapter17/nyc-sat-scores.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "NYC SAT Scores Analysis"
3 | author: "Tony Fischetti"
4 | date: "November 1, 2015"
5 | output: html_document
6 | ---
7 | 
8 | #### Aim:
9 | To use Bayesian analysis to compare NYC's 2010
10 | combined SAT scores against the average of the
11 | rest of the country, which, according to
12 | FairTest.com, is 1509
13 | 
14 | 
15 | ```{r, echo=FALSE}
16 | # options
17 | options(echo=TRUE)
18 | options(stringsAsFactors=FALSE)
19 | ```
20 | 
21 | We are going to use the `assertr` and `runjags`
22 | packages for data checking and MCMC, respectively.
23 | ```{r}
24 | # libraries
25 | library(assertr)   # for data checking
26 | library(runjags)   # for MCMC
27 | ```
28 | 
29 | Let's make sure everything is all set with JAGS!
30 | ```{r}
31 | testjags()
32 | ```
33 | Great!
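(Editorial note, not in the original document: the next chunk writes the CSV into a `./data/` directory and assumes that directory already exists. A small defensive sketch, using only base R, that creates it first:)

```{r}
# hedged sketch: make sure ./data exists before downloading into it
if (!dir.exists("./data")) dir.create("./data")
```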
34 | 
35 | This data was found in the NYC Open Data Portal:
36 | https://nycopendata.socrata.com
37 | ```{r}
38 | link.to.data <- "http://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.csv?accessType=DOWNLOAD"
39 | download.file(link.to.data, "./data/SAT_Scores_NYC_2010.csv")
40 | 
41 | nyc.sats <- read.csv("./data/SAT_Scores_NYC_2010.csv")
42 | ```
43 | 
44 | Let's give the columns easier names
45 | ```{r}
46 | better.names <- c("id", "school.name", "n", "read.mean",
47 |                   "math.mean", "write.mean")
48 | names(nyc.sats) <- better.names
49 | ```
50 | 
51 | There are `r nrow(nyc.sats)` rows but almost 700 NYC schools. We will,
52 | therefore, *assume* that this is a random sample of NYC schools.
53 | 
54 | 
55 | Let's first check the veracity of this data...
56 | ```{r, error=TRUE}
57 | nyc.sats <- assert(nyc.sats, is.numeric,
58 |                    n, read.mean, math.mean, write.mean)
59 | ```
60 | 
61 | It looks like the check failed because there are "s"s for some rows. (??)
62 | A look at the data set descriptions indicates that the "s" is for schools
63 | with 5 or fewer students. For our purposes, let's just exclude them.
64 | 
65 | 
66 | This is a function that takes a vector, replaces all "s"s
67 | with NAs and converts all non-"s"s into numerics
68 | ```{r}
69 | remove.s <- function(vec){
70 |   ifelse(vec=="s", NA, vec)
71 | }
72 | 
73 | nyc.sats$n <- as.numeric(remove.s(nyc.sats$n))
74 | nyc.sats$read.mean <- as.numeric(remove.s(nyc.sats$read.mean))
75 | nyc.sats$math.mean <- as.numeric(remove.s(nyc.sats$math.mean))
76 | nyc.sats$write.mean <- as.numeric(remove.s(nyc.sats$write.mean))
77 | ```
78 | 
79 | Now we are going to remove schools with fewer than 5 test takers
80 | and calculate a combined SAT score
81 | ```{r}
82 | nyc.sats <- nyc.sats[complete.cases(nyc.sats), ]
83 | 
84 | # Calculate a total combined SAT score
85 | nyc.sats$combined.mean <- (nyc.sats$read.mean +
86 |                            nyc.sats$math.mean +
87 |                            nyc.sats$write.mean)
88 | ```
89 | 
90 | Let's now build a posterior distribution of the true mean
91 | of NYC high schools' combined SAT scores. We're not going to look
92 | at the summary statistics because we don't want to bias our priors.
93 | We will use a standard gaussian model.
94 | 
95 | ```{r, cache=TRUE, results="hide", warning=FALSE, message=FALSE}
96 | the.model <- "
97 | model {
98 |   # priors
99 |   mu ~ dunif(0, 2400)
100 |   stddev ~ dunif(0, 500)
101 |   tau <- pow(stddev, -2)
102 | 
103 |   # likelihood
104 |   for(i in 1:theLength){
105 |     samp[i] ~ dnorm(mu, tau)
106 |   }
107 | }"
108 | 
109 | the.data <- list(
110 |   samp = nyc.sats$combined.mean,
111 |   theLength = length(nyc.sats$combined.mean)
112 | )
113 | 
114 | results <- autorun.jags(the.model, data=the.data,
115 |                         n.chains = 3,
116 |                         monitor = c('mu'))
117 | ```
118 | 
119 | Let's view the results of the MCMC.
120 | ```{r}
121 | print(results)
122 | ```
123 | 
124 | Now let's plot the MCMC diagnostics
125 | ```{r, message=FALSE}
126 | plot(results, plot.type=c("histogram", "trace"), layout=c(2,1))
127 | ```
128 | 
129 | Looks good!
130 | 
131 | 
132 | Let's extract the MCMC samples of the mean and get the
133 | bounds of the middle 95%
134 | ```{r}
135 | results.matrix <- as.matrix(results$mcmc)
136 | mu.samples <- results.matrix[,'mu']
137 | bounds <- quantile(mu.samples, c(.025, .975))
138 | ```
139 | 
140 | We are 95% sure that the true mean is between `r bounds[1]` and
141 | `r bounds[2]`.
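(Editorial aside, not part of the original analysis: since the stated aim compares NYC against the national average of 1509, the posterior draws already stored in `mu.samples` can also be summarized as the posterior probability that the true NYC mean falls below that figure. A minimal sketch:)

```{r}
# hedged sketch: share of posterior draws for mu that fall below the
# national average combined SAT score of 1509
mean(mu.samples < 1509)
```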
142 | 
143 | Now let's plot the marginal posterior distribution for the mean
144 | of the NYC high schools' combined SAT grades and draw the 95
145 | percent credible interval.
146 | ```{r}
147 | plot(density(mu.samples),
148 |      main=paste("Posterior distribution of mean combined SAT",
149 |                 "score in NYC high schools (2010)", sep="\n"))
150 | lines(c(bounds[1], bounds[2]), c(0, 0), lwd=3, col="red")
151 | ```
152 | 
153 | Given the results, the SAT scores for NYC high schools in 2010
154 | are **incontrovertibly** not on par with the average SAT scores of
155 | the nation.
156 | 
157 | ------------------------------------
158 | 
159 | This is some session information for reproducibility:
160 | ```{r}
161 | devtools::session_info()
162 | ```
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 Packt
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | # Data Analysis with R - Second Edition
5 | This is the code repository for [Data Analysis with R - Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/data-analysis-r-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781788393720), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish.
6 | ## About the Book
7 | Frequently the tool of choice for academics, R has spread deep into the private sector and can be found in the production pipelines at some of the most advanced and successful enterprises. The power and domain-specificity of R allow the user to express complex analytics easily, quickly, and succinctly.
8 | 
9 | Starting with the basics of R and statistical reasoning, this book dives into advanced predictive analytics, showing how to apply those techniques to real-world data with real-world examples.
10 | 
11 | 
12 | ## Instructions and Navigation
13 | All of the code is organized into folders. Each folder starts with a number followed by the application name. For example, Chapter02.
14 | 
15 | 
16 | 
17 | The code will look like the following:
18 | ```
19 | # don't worry about memorizing this
20 | temp.density <- density(airquality$Temp)
21 | pdf <- approxfun(temp.density$x, temp.density$y, rule=2)
22 | integrate(pdf, 80, 90)
23 | ```
24 | 
25 | All code in this book has been written against the latest version of R—3.4.3 at the time of
26 | writing. As a matter of good practice, you should keep your R version up to date but most,
27 | if not all, code should work with any reasonably recent version of R. Some of the R
28 | packages we will be installing will require more recent versions though. For the other
29 | software that this book uses, instructions will be furnished pro re nata. If you want to get a
30 | head start, however, install RStudio, JAGS, and a C++ compiler (or Rtools if you use
31 | Windows).
32 | 
33 | ## Related Products
34 | * [Data Analysis with R](https://www.packtpub.com/big-data-and-business-intelligence/data-analysis-r?utm_source=github&utm_medium=repository&utm_campaign=9781785288142)
35 | 
36 | * [Mastering Data Analysis with R](https://www.packtpub.com/big-data-and-business-intelligence/mastering-data-analysis-r?utm_source=github&utm_medium=repository&utm_campaign=9781783982028)
37 | 
38 | * [Hands-On Geospatial Analysis with R and QGIS](https://www.packtpub.com/application-development/hands-geospatial-analysis-r-and-qgis?utm_source=github&utm_medium=repository&utm_campaign=9781788991674)
39 | 
40 | ### Download a free PDF
41 | 
42 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost. Simply click on the link to claim your free PDF.
43 | 
44 | https://packt.link/free-ebook/9781788393720

--------------------------------------------------------------------------------