├── .gitattributes ├── .gitignore ├── Chapter01 └── Chapter01.txt ├── Chapter02 └── Chapter02.txt ├── Chapter03 └── Chapter03.txt ├── Chapter04 └── Chapter04.txt ├── Chapter05 └── Chapter05.txt ├── Chapter06 └── Chapter06.txt ├── Chapter07 └── Chapter07.txt ├── Chapter08 └── Chapter08.txt ├── Chapter09 └── Chapter09.txt ├── Chapter10 └── Chapter10.txt ├── Chapter11 └── Chapter11.txt ├── Chapter12 ├── Chapter12.txt └── music-recommendations.R ├── Chapter13 └── Chapter13.txt ├── Chapter14 ├── Chapter14.txt └── Chapter15.txt ├── Chapter15 ├── chapter12.R └── our_cpp_functions.cpp ├── Chapter16 └── Chapter16.txt ├── Chapter17 ├── Chapter17.txt ├── nyc-sat-scores.R └── nyc-sat-scores.Rmd ├── LICENSE └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /Chapter01/Chapter01.txt: -------------------------------------------------------------------------------- 1 | Arithmetic and assignment 2 | 3 | 4 | > 2 + 2 5 | [1] 4 6 | > 9 / 3 7 | [1] 3 8 | > 5 %% 2 # modulus operator (remainder of 5 divided by 2) 9 | [1] 1 10 | 11 | 12 | 13 | 14 | 15 | 16 | > 3 + 2 - 10 ^ 2 # ^ is the exponent operator 17 | [1] -95 18 | > 3 + (2 - 10) ^ 2 19 | [1] 67 20 | 21 | 22 | 23 | 24 | 25 | > # assignments follow the form VARIABLE <- VALUE 26 | > var <- 10 27 | > var 28 | [1] 10 29 | > var ^ 2 30 | [1] 100 31 | > VAR / 2 # variable names are case-sensitive 32 | Error: object 'VAR' not found 33 | 34 | 35 | 36 | 37 | 38 | > var # var is 10 39 | [1] 10 40 | > var ^ 2 41 | [1] 100 42 | > var # var is still 10 43 | [1] 10 44 | > var <- var ^ 2 # no return value 45 | > var # var is now 100 46 | [1] 100 47 | 48 | 49 | 50 | 51 | > cos(3.14159) # cosine function 52 | [1] -1 53 | > cos(pi) # pi is a constant that R provides 54 | [1] -1 55 | > acos(-1) # arccosine function 56 | [1] 3.141593 57 | > acos(cos(pi)) + 10 58 | [1] 13.14159 59 | > # functions can be used as arguments to other functions 60 | 61 | 62 | 63 | 64 | > 1 / 0 65 | 
[1] Inf 66 | > 0 / 0 67 | [1] NaN 68 | 69 | 70 | 71 | 72 | 73 | Logicals and characters 74 | 75 | > foo <- TRUE # foo is of the logical data type 76 | > class(foo) # class() tells us the type 77 | [1] "logical" 78 | > bar <- "hi!" # bar is of the character data type 79 | > class(bar) 80 | [1] "character" 81 | 82 | 83 | 84 | 85 | 86 | > foo 87 | [1] TRUE 88 | > foo && TRUE # boolean and 89 | [1] TRUE 90 | > foo && FALSE 91 | [1] FALSE 92 | > foo || FALSE # boolean or 93 | [1] TRUE 94 | > !foo # negation operator 95 | [1] FALSE 96 | 97 | 98 | 99 | 100 | 101 | > foo && 1 102 | [1] TRUE 103 | > foo && 2 104 | [1] TRUE 105 | > foo && 0 106 | [1] FALSE 107 | 108 | 109 | 110 | 111 | 112 | > 4 < 2 # less than operator 113 | [1] FALSE 114 | > 4 >= 4 # greater than or equal to 115 | [1] TRUE 116 | > 3 == 3 # equality operator 117 | [1] TRUE 118 | > 3 != 2 # inequality operator 119 | [1] TRUE 120 | 121 | 122 | 123 | 124 | 125 | > lang.domain <- "statistics" 126 | > lang.domain <- toupper(lang.domain) 127 | > print(lang.domain) 128 | [1] "STATISTICS" 129 | > # retrieves substring from first character to fourth character 130 | > substr(lang.domain, 1, 4) 131 | [1] "STAT" 132 | > gsub("I", "1", lang.domain) # substitutes every "I" for "1" 133 | [1] "STAT1ST1CS" 134 | > # combines character strings 135 | > paste("R does", lang.domain, "!!!") 136 | [1] "R does STATISTICS !!!" 137 | 138 | 139 | 140 | 141 | 142 | Flow of control 143 | 144 | > if(2 + 2 == 4) 145 | + print("very good") 146 | [1] "very good" 147 | > if(2 + 2 == 5) 148 | + print("all hail to the thief") 149 | 150 | 151 | 152 | 153 | 154 | > if((4/2==2) && (2*2==4)){ 155 | + print("four divided by two is two...") 156 | + print("and two times two is four") 157 | + } 158 | [1] "four divided by two is two..." 159 | [1] "and two times two is four" 160 | 161 | 162 | 163 | 164 | 165 | > closing.time <- TRUE 166 | > if(closing.time){ 167 | + print("you don't have to go home") 168 | + print("but you can't stay here") 169 | + } else{ 170 | + print("you can stay here!") 171 | + } 172 | [1] "you don't have to go home" 173 | [1] "but you can't stay here" 174 | > if(!closing.time){ 175 | + print("you don't have to go home") 176 | + print("but you can't stay here") 177 | + } else{ 178 | + print("you can stay here!") 179 | + } 180 | [1] "you can stay here!" 
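A supplementary example, not part of the original transcript: conditions can also be chained with else if, and only the first branch whose condition evaluates to TRUE is run. The grade value below is a made-up number used purely for illustration.

> grade <- 85   # hypothetical value, for illustration only
> if(grade >= 90){
+   print("A")
+ } else if(grade >= 80){
+   print("B")
+ } else{
+   print("C or below")
+ }
[1] "B"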
181 | 182 | 183 | 184 | 185 | 186 | Vectors 187 | 188 | > our.vect <- c(8, 6, 7, 5, 3, 0, 9) 189 | > our.vect 190 | [1] 8 6 7 5 3 0 9 191 | 192 | 193 | 194 | 195 | 196 | > another.vect <- c("8", 6, 7, "-", 3, "0", 9) 197 | > another.vect 198 | [1] "8" "6" "7" "-" "3" "0" "9" 199 | 200 | 201 | 202 | 203 | 204 | Subsetting 205 | 206 | > our.vect[1] # to get the first value 207 | [1] 8 208 | > # the function length() returns the length of a vector 209 | > length(our.vect) 210 | [1] 7 211 | > our.vect[length(our.vect)] # get the last element of a vector 212 | [1] 9 213 | 214 | 215 | 216 | 217 | 218 | > our.vect[10] 219 | [1] NA 220 | 221 | 222 | 223 | 224 | 225 | 226 | > # extract the first, third, fifth, and 227 | > # seventh element from our vector 228 | > our.vect[c(1, 3, 5, 7)] 229 | [1] 8 7 3 9 230 | 231 | 232 | 233 | 234 | 235 | > other.vector <- 1:10 236 | > other.vector 237 | [1] 1 2 3 4 5 6 7 8 9 10 238 | > another.vector <- seq(50, 30, by=-2) 239 | > another.vector 240 | [1] 50 48 46 44 42 40 38 36 34 32 30 241 | 242 | 243 | 244 | 245 | 246 | > our.vect[1:5] 247 | [1] 8 6 7 5 3 248 | 249 | 250 | 251 | 252 | 253 | Vectorized functions 254 | 255 | > # takes the mean of a vector 256 | > mean(our.vect) 257 | [1] 5.428571 258 | > sd(our.vect) # standard deviation 259 | [1] 3.101459 260 | > min(our.vect) 261 | [1] 0 262 | > max(1:10) 263 | [1] 10 264 | > sum(c(1, 2, 3)) 265 | [1] 6 266 | 267 | 268 | 269 | 270 | 271 | > messy.vector <- c(8, 6, NA, 7, 5, NA, 3, 0, 9) 272 | > messy.vector 273 | [1] 8 6 NA 7 5 NA 3 0 9 274 | > length(messy.vector) 275 | [1] 9 276 | 277 | 278 | 279 | 280 | 281 | > mean(messy.vector) 282 | [1] NA 283 | > mean(messy.vector, na.rm=TRUE) 284 | [1] 5.428571 285 | > sum(messy.vector, na.rm=FALSE)[1] NA 286 | > sum(messy.vector, na.rm=TRUE) 287 | [1] 38 288 | 289 | 290 | 291 | 292 | 293 | > log.vector <- c(TRUE, TRUE, FALSE) 294 | > log.vector 295 | [1] TRUE TRUE FALSE 296 | 297 | 298 | 299 | 300 | 301 | > sum(log.vector) 302 | [1] 2 303 | 304 | 305 | 306 | 307 | 308 | > messy.vector 309 | [1] 8 6 NA 7 5 NA 3 0 9 310 | > is.na(messy.vector) 311 | [1] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE 312 | > # 8 6 NA 7 5 NA 3 0 9 313 | 314 | 315 | 316 | 317 | 318 | 319 | > sum(is.na(messy.vector)) 320 | [1] 2 321 | 322 | 323 | 324 | 325 | 326 | 327 | > our.vect > 5 328 | [1] TRUE TRUE TRUE FALSE FALSE FALSE TRUE 329 | 330 | 331 | 332 | 333 | 334 | > sum(our.vect > 5) 335 | [1] 4 336 | 337 | 338 | 339 | 340 | 341 | Advanced subsetting 342 | 343 | > messy.vector[!is.na(messy.vector)] 344 | [1] 8 6 7 5 3 0 9 345 | 346 | 347 | 348 | 349 | 350 | > our.vect[our.vect > 5] 351 | [1] 8 6 7 9 352 | 353 | 354 | 355 | 356 | 357 | > our.vect 358 | [1] 8 6 7 5 3 0 9 359 | > our.vect[1] <- 9 360 | > our.vect 361 | [1] 9 6 7 5 3 0 9 362 | 363 | 364 | 365 | 366 | 367 | 368 | > messy.vector[is.na(messy.vector)] <- 0 369 | > messy.vector 370 | [1] 8 6 0 7 5 0 3 0 9 371 | 372 | 373 | 374 | 375 | 376 | 377 | > ifelse(is.na(messy.vector), 0, messy.vector) 378 | [1] 8 6 0 7 5 0 3 0 9 379 | 380 | 381 | 382 | 383 | 384 | Recycling 385 | 386 | > our.vect + 3 387 | [1] 12 9 10 8 6 3 12 388 | 389 | 390 | 391 | 392 | 393 | 394 | > our.vect + 3 395 | [1] 12 9 10 8 6 3 12 396 | 397 | 398 | 399 | 400 | 401 | > our.vect + c(3, 3, 3, 3, 3, 3, 3) 402 | [1] 12 9 10 8 6 3 12 403 | 404 | 405 | 406 | 407 | 408 | > our.vect[c(TRUE, FALSE)] 409 | [1] 9 7 3 9 410 | 411 | 412 | 413 | 414 | 415 | > our.vect[c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE)] 416 | [1] 9 7 3 9 417 | 418 | 419 | 420 | 421 | 422 | > 
our.vect + c(3, 0) 423 | [1] 12 6 10 5 6 0 12 424 | Warning message: 425 | In our.vect + c(3, 0) : 426 | longer object length is not a multiple of shorter object length 427 | 428 | 429 | 430 | 431 | 432 | 433 | Functions 434 | 435 | > function.name <- function(argument1, argument2, ...){ 436 | + # some functionality 437 | + } 438 | 439 | 440 | 441 | 442 | > is.even <- function(a.number){ 443 | + remainder <- a.number %% 2 444 | + if(remainder==0) 445 | + return(TRUE) 446 | + return(FALSE) 447 | + } 448 | > # testing it 449 | > is.even(10) 450 | [1] TRUE 451 | > is.even(9) 452 | [1] FALSE 453 | 454 | 455 | 456 | 457 | 458 | 459 | > is.divisible.by <- function(large.number, smaller.number){ 460 | + if(large.number %% smaller.number != 0) 461 | + return(FALSE) 462 | + return(TRUE) 463 | + } 464 | > # testing it 465 | > is.divisible.by(10, 2) 466 | [1] TRUE 467 | > is.divisible.by(10, 3) 468 | [1] FALSE 469 | > is.divisible.by(9, 3) 470 | [1] TRUE 471 | 472 | 473 | 474 | 475 | 476 | > is.even <- function(num){ 477 | + is.divisible.by(num, 2) 478 | + } 479 | 480 | 481 | 482 | 483 | 484 | > sapply(our.vect, is.even) 485 | [1] FALSE TRUE FALSE FALSE FALSE TRUE FALSE 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | > sapply(our.vect, function(num){is.divisible.by(num, 3)}) 494 | [1] TRUE TRUE FALSE FALSE TRUE TRUE TRUE 495 | 496 | 497 | 498 | 499 | 500 | 501 | > where.even <- sapply(our.vect, is.even) 502 | > where.div.3 <- sapply(our.vect, function(num){ 503 | + is.divisible.by(num, 3)}) 504 | > # "&" is like the "&&" and operator but for vectors 505 | > our.vect[where.even & where.div.3] 506 | [1] 6 0 507 | 508 | 509 | 510 | 511 | 512 | 513 | Matrices 514 | 515 | > a.matrix <- matrix(c(1, 2, 3, 4, 5, 6)) 516 | > a.matrix 517 | [,1] 518 | [1,] 1 519 | [2,] 2 520 | [3,] 3 521 | [4,] 4 522 | [5,] 5 523 | [6,] 6 524 | 525 | 526 | 527 | 528 | 529 | 530 | > a.matrix <- matrix(c(1, 2, 3, 4, 5, 6), ncol=2) 531 | > a.matrix 532 | [,1] [,2] 533 | [1,] 1 4 534 | [2,] 2 5 535 | [3,] 3 6 536 | 537 | 538 | 539 | 540 | 541 | 542 | > a2.matrix <- cbind(c(1, 2, 3), c(4, 5, 6)) 543 | 544 | 545 | 546 | 547 | 548 | > a3.matrix <- rbind(c(1, 2, 3), c(4, 5, 6)) 549 | > a3.matrix 550 | [,1] [,2] [,3] 551 | [1,] 1 2 3 552 | [2,] 4 5 6 553 | 554 | 555 | 556 | 557 | 558 | > t(a2.matrix) 559 | 560 | 561 | 562 | 563 | 564 | > a2.matrix 565 | [,1] [,2] 566 | [1,] 1 4 567 | [2,] 2 5 568 | [3,] 3 6 569 | > colSums(a2.matrix) 570 | [1] 6 15 571 | > rowMeans(a2.matrix) 572 | [1] 2.5 3.5 4.5 573 | 574 | 575 | 576 | 577 | 578 | > apply(a2.matrix, 2, sum) 579 | [1] 6 15 580 | > apply(a2.matrix, 1, mean) 581 | [1] 2.5 3.5 4.5 582 | 583 | 584 | 585 | 586 | 587 | 588 | > a2.matrix %*% a2.matrix 589 | Error in a2.matrix %*% a2.matrix : non-conformable arguments 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | > a2.matrix 598 | [,1] [,2] 599 | [1,] 1 4 600 | [2,] 2 5 601 | [3,] 3 6 602 | > a3.matrix 603 | [,1] [,2] [,3] 604 | [1,] 1 2 3 605 | [2,] 4 5 6 606 | > a2.matrix %*% a3.matrix 607 | [,1] [,2] [,3] 608 | [1,] 17 22 27 609 | [2,] 22 29 36 610 | [3,] 27 36 45 611 | > # dim() tells us how many rows and columns 612 | > # (respectively) there are in the given matrix 613 | > dim(a2.matrix) 614 | [1] 3 2 615 | 616 | 617 | 618 | 619 | 620 | 621 | > a2.matrix[2,1] 622 | [1] 2 623 | 624 | 625 | 626 | 627 | 628 | 629 | > # returns the whole second column 630 | a2.matrix[,2] 631 | [1] 4 5 6 632 | > # returns the first row 633 | > a2.matrix[1,] 634 | [1] 1 4 635 | 636 | 637 | 638 | 639 | 640 | 641 | > # give me element in column 2 at the first 
and third row 642 | > a2.matrix[c(1, 3), 2] 643 | [1] 4 6 644 | 645 | 646 | 647 | 648 | 649 | 650 | Loading data into R 651 | 652 | flavor,number 653 | pistachio,6 654 | mint chocolate chip,7 655 | vanilla,5 656 | chocolate,10 657 | strawberry,2 658 | neopolitan,4 659 | 660 | 661 | 662 | 663 | 664 | 665 | > favs <- read.table("favorites.txt", sep=",", header=TRUE) 666 | 667 | 668 | 669 | 670 | 671 | 672 | > favs <- read.table(file.choose(), sep=",", header=TRUE) 673 | 674 | 675 | 676 | 677 | 678 | 679 | > favs <- read.csv("favorites.txt") 680 | 681 | 682 | 683 | 684 | 685 | > head(favs) 686 | flavor number 687 | 1 pistachio 6 688 | 2 mint chocolate chip 7 689 | 3 vanilla 5 690 | 4 chocolate 10 691 | 5 strawberry 2 692 | 6 neopolitan 4 693 | > class(favs) 694 | [1] "data.frame" 695 | > class(favs$flavor) 696 | [1] "factor" 697 | > class(favs$number) 698 | [1] "numeric" 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | > favs <- read.csv("favorites.txt", stringsAsFactors=FALSE) 707 | > class(favs$flavor) 708 | [1] "character" 709 | 710 | 711 | 712 | 713 | 714 | > favs$flavor 715 | [1] "pistachio" "mint chocolate chip" "vanilla" 716 | [4] "chocolate" "strawberry" "neopolitan" 717 | > favs[["flavor"]] 718 | [1] "pistachio" "mint chocolate chip" "vanilla" 719 | [4] "chocolate" "strawberry" "neopolitan" 720 | > favs[,1] 721 | [1] "pistachio" "mint chocolate chip" "vanilla" 722 | [4] "chocolate" "strawberry" "neopolitan" 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | > names(favs) 732 | [1] "flavor" "number" 733 | > names(favs)[1] <- "flav" 734 | > names(favs) 735 | [1] "flav" "number" 736 | 737 | 738 | 739 | 740 | 741 | 742 | > str(favs) 743 | 'data.frame': 6 obs. of 2 variables: 744 | $ flav : chr "pistachio" "mint chocolate chip" "vanilla" 745 | "chocolate" ... 
746 | $ number: num 6 7 5 10 2 4 747 | 748 | 749 | 750 | 751 | 752 | Working with packages 753 | 754 | > # downloads and installs from CRAN 755 | > install.packages("ggplot2") 756 | 757 | 758 | 759 | 760 | 761 | > library(ggplot2) 762 | > ggplot(favs, aes(x=flav, y=number)) + 763 | + geom_bar(stat="identity") + 764 | + ggtitle("Soy ice cream flavor preferences") 765 | 766 | 767 | 768 | 769 | 770 | 771 | 772 | 773 | -------------------------------------------------------------------------------- /Chapter02/Chapter02.txt: -------------------------------------------------------------------------------- 1 | Univariate data 2 | 3 | categorical.data <- c("heads", "tails", "tails", "heads") 4 | 5 | 6 | 7 | 8 | 9 | contin.data <- c(198.41, 178.46, 165.20, 141.71, 138.77) 10 | 11 | 12 | 13 | 14 | 15 | Frequency distributions 16 | 17 | head(mtcars) 18 | mpg cyl disp hp drat wt qsec vs am gear carb 19 | Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 20 | Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 21 | Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 22 | Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 23 | Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 24 | Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 25 | 26 | 27 | 28 | 29 | 30 | unique(mtcars$carb) 31 | [1] 4 1 2 3 6 8 32 | 33 | 34 | 35 | 36 | table(mtcars$carb) 37 | 1 2 3 4 6 8 38 | 7 10 3 10 1 1 39 | 40 | 41 | 42 | 43 | 44 | cut(airquality$Temp, 9) 45 | 46 | 47 | 48 | 49 | 50 | table(cut(airquality$Temp, 9)) 51 | 52 | (56,60.6] (60.6,65.1] (65.1,69.7] (69.7,74.2] (74.2,78.8] 53 | 8 10 14 16 26 54 | (78.8,83.3] (83.3,87.9] (87.9,92.4] (92.4,97] 55 | 35 22 15 7 56 | 57 | 58 | 59 | 60 | Central tendency 61 | 62 | sum(nums)/length(nums) # nums would be a vector of numerics 63 | 64 | 65 | 66 | 67 | 68 | mean(c(1,2,3,4,5)) 69 | [1] 3 70 | 71 | 72 | 73 | 74 | 75 | median(c(3, 7, 6, 10, 3, 7)) 76 | [1] 6.5 77 | 78 | 79 | 80 | 81 | 82 | 83 | Spread 84 | 85 | sum(abs(x - mean(x))) / length(x) 86 | 87 | 88 | 89 | 90 | Probability distributions 91 | 92 | table(mtcars$carb) / length(mtcars$carb) 93 | 1 2 3 4 6 8 94 | 0.21875 0.31250 0.09375 0.31250 0.03125 0.03125 95 | 96 | 97 | 98 | 99 | 100 | 101 | # don't worry about memorizing this 102 | temp.density <- density(airquality$Temp) 103 | pdf <- approxfun(temp.density$x, temp.density$y, rule=2) 104 | integrate(pdf, 80, 90) 105 | 0.3422287 with absolute error < 7.5e-06 106 | 107 | 108 | 109 | 110 | 111 | 112 | Visualization methods 113 | 114 | qplot(column, data=dataframe, geom=...) 
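A supplementary note, not part of the original file: qplot() is a convenience wrapper around ggplot2's fuller ggplot() interface, so the bar chart built with qplot(factor(carb), data=mtcars, geom="bar") in the next snippet could equivalently be written with an explicit aesthetic mapping and geom layer.

# equivalent ggplot() form of the qplot() bar chart shown below
library(ggplot2)
ggplot(mtcars, aes(x=factor(carb))) +
  geom_bar()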
115 | 116 | 117 | 118 | 119 | library(ggplot2) 120 | qplot(factor(carb), data=mtcars, geom="bar") 121 | 122 | 123 | 124 | 125 | 126 | qplot(factor(carb), 127 | data=mtcars, 128 | geom="bar", 129 | fill=factor(carb), 130 | xlab="number of carburetors") 131 | 132 | 133 | 134 | 135 | 136 | 137 | qplot(Temp, data=airquality, geom="histogram") 138 | 139 | 140 | 141 | 142 | 143 | 144 | qplot(Temp, data=airquality, geom="histogram", 145 | binwidth=5, color=I("white")) 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | qplot(Temp, data=airquality, geom="density") 154 | 155 | 156 | 157 | 158 | 159 | qplot(Temp, data=airquality, geom="density", 160 | adjust=.5, # changes bandwidth 161 | fill=I("pink"), 162 | alpha=I(.5), # adds transparency 163 | main="density plot of temperature data") 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /Chapter03/Chapter03.txt: -------------------------------------------------------------------------------- 1 | Multivariate data 2 | 3 | head(airquality) 4 | Ozone Solar.R Wind Temp Month Day 5 | 1 41 190 7.4 67 5 1 6 | 2 36 118 8.0 72 5 2 7 | 3 12 149 12.6 74 5 3 8 | 4 18 313 11.5 62 5 4 9 | 5 NA NA 14.3 56 5 5 10 | 6 28 NA 14.9 66 5 6 11 | 12 | 13 | 14 | 15 | 16 | 17 | Relationships between a categorical and continuous variable 18 | 19 | head(iris) 20 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 21 | 1 5.1 3.5 1.4 0.2 setosa 22 | 2 4.9 3.0 1.4 0.2 setosa 23 | 3 4.7 3.2 1.3 0.2 setosa 24 | 4 4.6 3.1 1.5 0.2 setosa 25 | 5 5.0 3.6 1.4 0.2 setosa 26 | 6 5.4 3.9 1.7 0.4 setosa 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | mean(iris$Petal.Length) 39 | [1] 3.758 40 | 41 | 42 | 43 | 44 | 45 | 46 | mean(iris$Petal.Length[iris$Species=="setosa"]) 47 | [1] 1.462 48 | mean(iris$Petal.Length[iris$Species=="versicolor"]) 49 | [1] 4.26 50 | mean(iris$Petal.Length[iris$Species=="virginica"]) 51 | [1] 5.552 52 | 53 | 54 | 55 | 56 | 57 | 58 | by(iris$Petal.Length, iris$Species, mean) 59 | iris$Species: setosa 60 | [1] 1.462 61 | -------------------------------------------- 62 | iris$Species: versicolor 63 | [1] 4.26 64 | -------------------------------------------- 65 | iris$Species: virginica 66 | [1] 5.552 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | by(iris$Petal.Length, iris$Species, sd) 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | by(iris$Petal.Length, iris$Species, summary) 85 | iris$Species: setosa 86 | Min. 1st Qu. Median Mean 3rd Qu. Max. 87 | 1.000 1.400 1.500 1.462 1.575 1.900 88 | ------------------------------------------------ 89 | iris$Species: versicolor 90 | Min. 1st Qu. Median Mean 3rd Qu. Max. 91 | 3.00 4.00 4.35 4.26 4.60 5.10 92 | ------------------------------------------------ 93 | iris$Species: virginica 94 | Min. 1st Qu. Median Mean 3rd Qu. Max. 
95 | 4.500 5.100 5.550 5.552 5.875 6.900 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | by(airquality$Temp, airquality$Month, mean) 104 | airquality$Month: 5 105 | [1] 65.54839 106 | --------------------------------------------- 107 | airquality$Month: 6 108 | [1] 79.1 109 | --------------------------------------------- 110 | airquality$Month: 7 111 | [1] 83.90323 112 | --------------------------------------------- 113 | airquality$Month: 8 114 | [1] 83.96774 115 | --------------------------------------------- 116 | airquality$Month: 9 117 | [1] 76.9 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | Relationships between two categorical variables 127 | 128 | ucba <- data.frame(UCBAdmissions) 129 | head(ucba) 130 | Admit Gender Dept Freq 131 | 1 Admitted Male A 512 132 | 2 Rejected Male A 313 133 | 3 Admitted Female A 89 134 | 4 Rejected Female A 19 135 | 5 Admitted Male B 353 136 | 6 Rejected Male B 207 137 | 138 | 139 | 140 | 141 | 142 | # the first argument to xtabs (the formula) should 143 | # be read as: frequency *by* Gender and Admission 144 | cross <- xtabs(Freq ~ Gender+Admit, data=ucba) 145 | cross 146 | Admit 147 | Gender Admitted Rejected 148 | Male 1198 1493 149 | Female 557 1278 150 | 151 | 152 | 153 | 154 | 155 | 156 | prop.table(cross, 1) 157 | Admit 158 | Gender Admitted Rejected 159 | Male 0.4451877 0.5548123 160 | Female 0.3035422 0.6964578 161 | 162 | 163 | 164 | 165 | 166 | cross2 <- xtabs(Freq ~ Gender + Admit, data=ucba[ucba$Dept=="A",]) 167 | prop.table(cross2, 1) 168 | Admit 169 | Gender Admitted Rejected 170 | Male 0.6206061 0.3793939 171 | Female 0.8240741 0.1759259 172 | 173 | 174 | 175 | 176 | 177 | 178 | The relationship between two continuous variables 179 | 180 | head(women) 181 | height weight 182 | 1 58 115 183 | 2 59 117 184 | 3 60 120 185 | 4 61 123 186 | 5 62 126 187 | 6 63 129 188 | nrow(women) 189 | [1] 15 190 | 191 | 192 | 193 | 194 | 195 | 196 | Covariance 197 | 198 | cov(women$weight, women$height) 199 | [1] 69 200 | # the order we put the two columns in 201 | # the arguments doesn't matter 202 | cov(women$height, women$weight) 203 | [1] 69 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | # there are 2.54 centimeters in each inch 213 | # changing the units to centimeters increases 214 | # the variability within the height variable 215 | cov(women$height*2.54, women$weight) 216 | [1] 175.26 217 | 218 | 219 | 220 | 221 | 222 | 223 | Correlation coefficients 224 | 225 | cor(women$height, women$weight) 226 | [1] 0.9954948 227 | cor(women$height*2.54, women$weight) 228 | [1] 0.9954948 229 | 230 | 231 | 232 | 233 | 234 | 235 | xs <- 1:100 236 | cor(xs, xs+100) 237 | [1] 1 238 | cor(xs, xs^3) 239 | [1] 0.917552 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | xs <- 1:100 248 | cor(xs, xs+100, method="spearman") 249 | [1] 1 250 | cor(xs, xs^3, method="spearman") 251 | [1] 1 252 | 253 | 254 | 255 | 256 | 257 | cor(mtcars$wt, mtcars$mpg) 258 | [1] -0.8676594 259 | 260 | 261 | 262 | 263 | 264 | 265 | cor(airquality$Temp, airquality$Wind) 266 | [1] -0.4579879 267 | cor(airquality$Temp, airquality$Wind, method="spearman") 268 | [1] -0.4465408 269 | 270 | 271 | 272 | 273 | 274 | # have to drop 5th column (species is not numeric) 275 | iris.nospecies <- iris[, -5] 276 | cor(iris.nospecies) 277 | Sepal.Length Sepal.Width Petal.Length Petal.Width 278 | Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411 279 | Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259 280 | Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654 281 | Petal.Width 0.8179411 
-0.3661259 0.9628654 1.0000000 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | Categorical and continuous variables 290 | 291 | library(ggplot) 292 | qplot(Species, Petal.Length, data=iris, geom="boxplot", 293 | fill=Species) 294 | 295 | 296 | 297 | 298 | 299 | qplot(Petal.Length, data=iris, geom="density", alpha=I(.7), 300 | fill=Species) 301 | 302 | 303 | 304 | 305 | 306 | Two categorical variables 307 | 308 | 309 | install.packages("vcd") 310 | library(vcd) 311 | ucba <- data.frame(UCBAdmissions) 312 | mosaic(Freq ~ Gender + Admit, data=ucba, 313 | shade=TRUE, legend=FALSE) 314 | 315 | 316 | 317 | 318 | 319 | mosaic(Freq ~ Gender+Admit, data=ucba[ucba$Dept=="A",], 320 | shade=TRUE, legend=FALSE) 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | Two continuous variables 329 | 330 | 331 | qplot(height, weight, data=women, geom="point") 332 | 333 | 334 | 335 | 336 | qplot(wt, mpg, data=mtcars, geom=c("point", "smooth"), 337 | method="lm", se=FALSE) 338 | 339 | 340 | 341 | 342 | 343 | qplot(wt, mpg, data=mtcars, geom=c("point", "smooth"), se=FALSE) 344 | 345 | 346 | 347 | 348 | 349 | More than two continuous variables 350 | 351 | install.packages("corrgram") 352 | library(corrgram) 353 | corrgram(iris, lower.panel=panel.conf, upper.panel=panel.pts) 354 | 355 | 356 | 357 | 358 | corrgram(iris, lower.panel=panel.pie, upper.panel=panel.pts, 359 | diag.panel=panel.density, 360 | main=paste0("corrgram of petal and sepal ", 361 | "measurements in iris data set")) 362 | 363 | 364 | 365 | 366 | -------------------------------------------------------------------------------- /Chapter04/Chapter04.txt: -------------------------------------------------------------------------------- 1 | The binomial distribution 2 | 3 | pbinom(10, size=30, prob=.5) 4 | [1] 0.04936857 5 | 6 | 7 | 8 | The normal distribution 9 | 10 | 11 | > f <- function(x){ dnorm(x, mean=65, sd=3.5) } 12 | > integrate(f, 70, Inf) 13 | 0.07656373 with absolute error < 2.2e-06 14 | 15 | 16 | 17 | > pnorm(70, mean=65, sd=3.5) 18 | [1] 0.9234363 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /Chapter05/Chapter05.txt: -------------------------------------------------------------------------------- 1 | Estimating means 2 | 3 | 4 | > # setting seed will make random number generation reproducible 5 | > set.seed(1) 6 | > all.us.women <- rnorm(10000, mean=65, sd=3.5) 7 | 8 | 9 | 10 | 11 | 12 | > our.sample <- sample(all.us.women, 10) 13 | > mean(our.sample) 14 | [1] 64.51365 15 | 16 | 17 | 18 | 19 | 20 | > population.mean <- mean(all.us.women) 21 | > for(sample.size in seq(5, 30, by=5)){ 22 | + # create empty vector with 1000 elements 23 | + sample.means <- numeric(1000) 24 | + for(i in 1:1000){ 25 | + sample.means[i] <- mean(sample(all.us.women, sample.size)) 26 | + } 27 | + distances.from.true.mean <- abs(sample.means - population.mean) 28 | + mean.distance.from.true.mean <- mean(distances.from.true.mean) 29 | + print(mean.distance.from.true.mean) 30 | + } 31 | [1] 1.245492 32 | [1] 0.8653313 33 | [1] 0.7386099 34 | [1] 0.6355692 35 | [1] 0.5458136 36 | [1] 0.5090788 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | mean(our.new.sample) 45 | [1] 65.19704 46 | 47 | 48 | 49 | 50 | 51 | 52 | The sampling distribution 53 | 54 | > means.of.our.samples <- numeric(10000) 55 | > for(i in 1:10000){ 56 | + a.sample <- sample(all.us.women, 40) 57 | + means.of.our.samples[i] <- mean(a.sample) 58 | + } 59 | 60 | 61 | 62 | 63 | 64 | 65 | > mean(our.new.sample) 66 | [1] 65.19704 67 | > sd(our.new.sample) 68 | [1] 
3.588447 69 | > sd(our.new.sample) / sqrt(length(our.new.sample)) 70 | [1] 0.5673833 71 | 72 | 73 | 74 | 75 | 76 | Interval estimation 77 | 78 | > err <- sd(our.new.sample) / sqrt(length(our.new.sample)) 79 | > mean(our.new.sample) - (1.96*err) 80 | [1] 64.08497 81 | > mean(our.new.sample) + (1.96*err) 82 | [1] 66.30912 83 | 84 | 85 | 86 | 87 | 88 | 89 | How did we get 1.96? 90 | 91 | > qnorm(.025) 92 | [1] -1.959964 93 | > pnorm(-1.959964) 94 | [1] 0.025 95 | 96 | 97 | 98 | 99 | 100 | > qnorm(.05) 101 | [1] -1.644854 102 | > qnorm(.95) 103 | [1] 1.644854 104 | > # notice the symmetry? 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | Smaller samples 113 | 114 | > small.sample <- sample(all.us.women, 15) 115 | > mean(small.sample) 116 | [1] 65.51277 117 | > qt(.025, df=14) 118 | [1] -2.144787 119 | > # notice the difference 120 | > qnorm(.025) 121 | [1] -1.959964 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | > err <- sd(small.sample) / sqrt(length(small.sample)) 130 | > mean(small.sample) - (2.145 * err) 131 | [1] 64.09551 132 | > mean(small.sample) + (2.145 * err) 133 | [1] 66.93003 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /Chapter06/Chapter06.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Analysis-with-R-Second-Edition/ece3101ca2f43f725c13a67d27810297bd31142a/Chapter06/Chapter06.txt -------------------------------------------------------------------------------- /Chapter07/Chapter07.txt: -------------------------------------------------------------------------------- 1 | The big idea behind Bayesian analysis 2 | 3 | 4 | > curve(dbeta(x, 70, 60), # plot a beta distribution 5 | + xlab="?", # name x-axis 6 | + ylab="posterior belief", # name y-axis 7 | + type="l", # make smooth line 8 | + yaxt='n') # remove y axis labels 9 | > abline(v=.5, lty=2) # make line at theta = 0.5 10 | 11 | 12 | 13 | 14 | Who cares about coin flips 15 | 16 | 17 | > curve(dbeta(x, 37, 5), xlab="?", 18 | + ylab="posterior belief", 19 | + type="l", yaxt='n') 20 | 21 | 22 | 23 | 24 | > samp <- rbeta(10000, 37, 5) 25 | > quantile(samp, c(.025, .975)) 26 | 2.5% 97.5% 27 | 0.7674591 0.9597010 28 | 29 | 30 | 31 | 32 | > # horizontal line 33 | > lines(c(.767, .96), c(0.1, 0.1)) 34 | > # tiny vertical left boundary 35 | > lines(c(.767, .769), c(0.15, 0.05)) 36 | > # tiny vertical right boundary 37 | > lines(c(.96, .96), c(0.15, 0.05)) 38 | 39 | 40 | 41 | 42 | 43 | 44 | Using JAGS and runjags 45 | 46 | > install.packages(c("rjags", "runjags", "modeest")) 47 | 48 | 49 | 50 | 51 | > library(runjags) 52 | > testjags() 53 | You are using R version 3.2.1 (2015-06-18) on a unix machine, 54 | with the RStudio GUI 55 | The rjags package is installed 56 | JAGS version 3.4.0 found successfully using the command 57 | '/usr/local/bin/jags' 58 | 59 | 60 | 61 | 62 | 63 | 64 | our.model <- "model { 65 | # likelihood function 66 | numSuccesses ~ dbinom(successProb, numTrials) 67 | # prior 68 | successProb ~ dbeta(1, 1) 69 | # parameter of interest 70 | theta <- numSuccesses / numTrials 71 | }" 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | > numSuccesses ~ dbinom(successProb, numTrials) 80 | 81 | 82 | 83 | 84 | 85 | our.data <- list( 86 | numTrials = 40, 87 | successProb = 36/40 88 | ) 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | > results <- autorun.jags(our.model, 98 | + data=our.data, 99 | + n.chains = 3, 100 | + monitor = c('theta')) 101 | 102 | 103 | 104 | 
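A supplementary sketch, not part of the original file: the modeest package installed at the top of this chapter can turn the MCMC samples into a single point estimate (the posterior mode). The variable name theta.draws and the "shorth" estimator are illustrative choices; mlv() supports several other mode estimators.

> library(modeest)
> theta.draws <- as.matrix(results$mcmc)[,'theta']
> mlv(theta.draws, method="shorth")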
105 | 106 | > plot(results, 107 | + plot.type=c("histogram", "trace"), 108 | + layout=c(2,1)) 109 | 110 | 111 | 112 | 113 | 114 | > # mcmc samples are stored in mcmc attribute 115 | > # of results variable 116 | > results.matrix <- as.matrix(results$mcmc) 117 | > 118 | > # extract the samples for 'theta' 119 | > # the only column, in this case 120 | > theta.samples <- results.matrix[,'theta'] 121 | > 122 | > plot(density(theta.samples, adjust=5)) 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | > quantile(theta.samples, c(.025, .975)) 132 | 2.5% 97.5% 133 | 0.800 0.975 134 | > lines(c(.8, .975), c(0.1, 0.1)) 135 | > lines(c(.8, .8), c(0.15, 0.05)) 136 | > lines(c(.975, .975), c(0.15, 0.05)) 137 | 138 | 139 | 140 | 141 | 142 | 143 | Fitting distributions the Bayesian way 144 | 145 | the.model <- " 146 | model { 147 | mu ~ dunif(0, 60) # prior 148 | stddev ~ dunif(0, 30) # prior 149 | tau <- pow(stddev, -2) 150 | for(i in 1:theLength){ 151 | samp[i] ~ dnorm(mu, tau) # likelihood function 152 | } 153 | }" 154 | 155 | 156 | 157 | 158 | 159 | 160 | the.data <- list( 161 | samp = precip, 162 | theLength = length(precip) 163 | ) 164 | 165 | 166 | 167 | 168 | > results <- autorun.jags(the.model, 169 | + data=the.data, 170 | + n.chains = 3, 171 | + # now we care about two parameters 172 | + monitor = c('mu', 'stddev')) 173 | 174 | 175 | 176 | 177 | > plot(results, 178 | + plot.type=c("histogram", "trace"), 179 | + layout=c(2,2)) 180 | 181 | 182 | 183 | 184 | 185 | 186 | > results.matrix <- as.matrix(results$mcmc) 187 | > 188 | > library(MASS) 189 | > # we need to make a kernel density 190 | > # estimate of the 3-d surface 191 | 192 | > z <- kde2d(results.matrix[,'mu'], 193 | + results.matrix[,'stddev'], 194 | + n=50) 195 | > 196 | > plot(results.matrix) 197 | > contour(z, drawlabels=FALSE, 198 | + nlevels=11, col=rainbow(11), 199 | + lwd=3, add=TRUE) 200 | 201 | 202 | 203 | 204 | 205 | > print(results) 206 | 207 | JAGS model summary statistics from 30000 samples (chains = 3; adapt+burnin 208 | = 5000): 209 | Lower95 Median Upper95 Mean SD Mode 210 | mu 31.645 34.862 38.181 34.866 1.6639 34.895 211 | stddev 11.669 13.886 16.376 13.967 1.2122 13.773 212 | MCerr MC%ofSD SSeff AC.10 psrf 213 | mu 0.012238 0.7 18484 0.002684 1.0001 214 | stddev 0.0093951 0.8 16649 -0.0053588 1.0001 215 | Total time taken: 5 seconds 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | The Bayesian independent samples t-test 224 | 225 | 226 | the.model <- " 227 | model { 228 | # each group will have a separate mu 229 | # and standard deviation 230 | for(j in 1:2){ 231 | mu[j] ~ dunif(0, 60) # prior 232 | stddev[j] ~ dunif(0, 20) # prior 233 | tau[j] <- pow(stddev[j], -2) 234 | } 235 | for(i in 1:theLength){ 236 | # likelihood function 237 | y[i] ~ dnorm(mu[x[i]], tau[x[i]]) 238 | } 239 | }" 240 | 241 | 242 | 243 | 244 | 245 | 246 | the.data <- list( 247 | y = mtcars$mpg, 248 | # 'x' needs to start at 1 so 249 | # 1 is now automatic and 2 is manual 250 | x = ifelse(mtcars$am==1, 1, 2), 251 | theLength = nrow(mtcars) 252 | ) 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | > results <- autorun.jags(the.model, 261 | + data=the.data, 262 | + n.chains = 3, 263 | + monitor = c('mu', 'stddev')) 264 | 265 | 266 | 267 | 268 | 269 | 270 | > results.matrix <- as.matrix(results$mcmc) 271 | > difference.in.means <- (results.matrix[,1] - 272 | + results.matrix[,2]) 273 | 274 | 275 | 276 | 277 | 278 | 279 | -------------------------------------------------------------------------------- /Chapter08/Chapter08.txt: 
-------------------------------------------------------------------------------- 1 | What's... uhhh... the deal with the bootstrap? 2 | 3 | 4 | > # setting seed will make random number generation reproducible 5 | > set.seed(1) 6 | > all.us.women <- rnorm(10000, mean=65, sd=3.5) 7 | > our.sample <- sample(all.us.women, 40) 8 | > bootstrap.replicates <- numeric(10000) 9 | > for(i in 1:10000){ 10 | + a.sample <- sample(our.sample, 40, replace=TRUE) 11 | + bootstrap.replicates[i] <- mean(a.sample) 12 | > } 13 | > hist(bootstrap.replicates 14 | 15 | 16 | 17 | 18 | 19 | 20 | Performing the bootstrap in R (more elegantly) 21 | 22 | btobj <- boot(our.sample, function(x, i){mean(x[i])}, 10000, 23 | parallel="multicore", ncpus=3) 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | plot(btobj) 32 | 33 | 34 | 35 | str(btobj) 36 | 37 | 38 | 39 | 40 | hist(btobj$t) 41 | 42 | 43 | 44 | 45 | Confidence intervals 46 | 47 | > quantile(btobj$t, prob=0.025) 48 | > quantile(btobj$t, prob=0.975) 49 | 50 | 51 | 52 | 53 | 54 | > boot.ci(btobj, conf=0.95, type="perc") 55 | BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS 56 | Based on 10000 bootstrap replicates 57 | CALL : 58 | boot.ci(boot.out = btobj, conf = 0.95, type = "perc") 59 | Intervals : 60 | Level Percentile 61 | 95% (63.7, 65.8 ) 62 | Calculations and Intervals on Original Scale 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | > boot.ci(btobj, type="bca") 71 | BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS 72 | Based on 10000 bootstrap replicates 73 | CALL : 74 | boot.ci(boot.out = btobj, type = "bca") 75 | Intervals : 76 | Level BCa 77 | 95% (63.67, 65.77 ) 78 | Calculations and Intervals on Original Scale 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | A one-sample test of means 88 | 89 | > t.test(precip, mu=38) 90 | One Sample t-test 91 | data: precip 92 | t = -1.901, df = 69, p-value = 0.06148 93 | alternative hypothesis: true mean is not equal to 38 94 | 95 percent confidence interval: 95 | 31.61748 38.15395 96 | sample estimates: 97 | mean of x 98 | 34.88571 99 | 100 | 101 | 102 | 103 | 104 | 105 | > set.seed(1) 106 | > btobj <- boot(precip, function(x, i){mean(x[i])}, 300000) 107 | > boot.ci(btobj, type="bca") 108 | BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS 109 | Based on 300000 bootstrap replicates 110 | CALL : 111 | boot.ci(boot.out = btobj, type = "bca") 112 | Intervals : 113 | Level BCa 114 | 95% (31.60, 37.98 ) 115 | Calculations and Intervals on Original Scale 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | Bootstrapping statistics other than the mean 125 | 126 | > set.seed(1) 127 | > law.salaries <- c(c(rnorm(70, mean=40000, sd=7000)), 500000, 400000, 128 | 350000) 129 | 130 | 131 | 132 | 133 | 134 | 135 | > btobj.mean <- boot(law.salaries, function(x, i){mean(x[i])}, 300000) 136 | > btobj.median <- boot(law.salaries, function(x, i){median(x[i])}, 300000) 137 | > btobj.tmean <- boot(law.salaries, function(x, i){mean(x[i], trim=0.05)}, 138 | 300000) 139 | > boot.ci(btobj.mean, type="bca") # 44989 - 84084 140 | > boot.ci(btobj.median, type="bca") # 39607 - 42926 141 | > boot.ci(btobj.tmean, type="bca") # 39993 - 57715 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /Chapter09/Chapter09.txt: -------------------------------------------------------------------------------- 1 | Simple linear regression 2 | 3 | 4 | plot(mpg ~ wt, data=mtcars) 5 | 6 | 7 | 8 | 9 | model <- lm(mpg ~ wt, data=mtcars) 10 | 11 | 12 | 13 | 14 | abline(model) 15 | 16 | 17 | 18 | 19 | 20 | 
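A small supplementary sketch, not part of the original file: the fitted lm object can be inspected directly with a few base R accessors, which is a quick way to see the intercept and slope that abline() just drew, and the residuals that the diagnostic plots later in this chapter are built from.

coef(model)             # intercept and slope of the fitted line
head(fitted(model))     # the model's predicted mpg for each car
head(residuals(model))  # observed mpg minus predicted mpg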
summary(model) 21 | Call: 22 | lm(formula = mpg ~ wt, data = mtcars) 23 | Residuals: 24 | Min 1Q Median 3Q Max 25 | -4.5432 -2.3647 -0.1252 1.4096 6.8727 26 | Coefficients: 27 | Estimate Std. Error t value Pr(>|t|) 28 | (Intercept) 37.2851 1.8776 19.858 < 2e-16 *** 29 | wt -5.3445 0.5591 -9.559 1.29e-10 *** 30 | --- 31 | Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 32 | Residual standard error: 3.046 on 30 degrees of freedom 33 | Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446 34 | F-statistic: 91.38 on 1 and 30 DF, p-value: 1.294e-10 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | predict(model, newdata=data.frame(wt=6)) 43 | 1 44 | 5.218297 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | Simple linear regression with a binary predictor 53 | 54 | model <- lm(mpg ~ am, data=mtcars) 55 | summary(model) 56 | Call: 57 | lm(formula = mpg ~ am, data = mtcars) 58 | Residuals: 59 | Min 1Q Median 3Q Max 60 | -9.3923 -3.0923 -0.2974 3.2439 9.5077 61 | Coefficients: 62 | Estimate Std. Error t value Pr(>|t|) 63 | (Intercept) 17.147 1.125 15.247 1.13e-15 *** 64 | am 7.245 1.764 4.106 0.000285 *** 65 | --- 66 | Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 67 | Residual standard error: 4.902 on 30 degrees of freedom 68 | Multiple R-squared: 0.3598, Adjusted R-squared: 0.3385 69 | F-statistic: 16.86 on 1 and 30 DF, p-value: 0.000285 70 | mean(mtcars$mpg[mtcars$am==0]) 71 | [1] 17.14737 72 | (mean(mtcars$mpg[mtcars$am==1]) - 73 | mean(mtcars$mpg[mtcars$am==0])) 74 | [1] 7.244939 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | # use var.equal to choose Students t-test 83 | # over Welch's t-test 84 | t.test(mpg ~ am, data=mtcars, var.equal=TRUE) 85 | Two Sample t-test 86 | data: mpg by am 87 | t = -4.1061, df = 30, p-value = 0.000285 88 | alternative hypothesis: true difference in means is not equal to 0 89 | 95 percent confidence interval: 90 | -10.84837 -3.64151 91 | sample estimates: 92 | mean in group 0 mean in group 1 93 | 17.14737 24.39231 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | mtcars$automatic <- ifelse(mtcars$am==0, "yes", "no") 104 | model <- lm(mpg ~ factor(automatic), data=mtcars) 105 | model 106 | Call: 107 | lm(formula = mpg ~ factor(automatic), data = mtcars) 108 | Coefficients: 109 | (Intercept) factor(automatic)yes 110 | 24.392 -7.245 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | A word of warning 124 | 125 | 126 | library(MASS) 127 | data(anscombe) 128 | plot(y3 ~ x3, data=anscombe) 129 | abline(lm(y3 ~ x3, data=anscombe), 130 | col="blue", lty=2, lwd=2) 131 | abline(rlm(y3 ~ x3, data=anscombe), 132 | col="red", lty=1, lwd=2) 133 | 134 | 135 | 136 | 137 | 138 | 139 | Multiple regression 140 | 141 | model <- lm(mpg ~ wt + hp, data=mtcars) 142 | summary(model) 143 | 144 | Call: 145 | lm(formula = mpg ~ wt + hp, data = mtcars) 146 | Residuals: 147 | Min 1Q Median 3Q Max 148 | -3.941 -1.600 -0.182 1.050 5.854 149 | Coefficients: 150 | Estimate Std. Error t value Pr(>|t|) 151 | (Intercept) 37.22727 1.59879 23.285 < 2e-16 *** 152 | wt -3.87783 0.63273 -6.129 1.12e-06 *** 153 | hp -0.03177 0.00903 -3.519 0.00145 ** 154 | --- 155 | Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 156 | Residual standard error: 2.593 on 29 degrees of freedom 157 | Multiple R-squared: 0.8268, Adjusted R-squared: 0.8148 158 | F-statistic: 69.21 on 2 and 29 DF, p-value: 9.109e-12 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | coef(lm(mpg ~ wt + hp, data=mtcars)) 168 | (Intercept) wt hp 169 | 37.22727012 -3.87783074 -0.03177295 170 | coef(lm(mpg ~ wt, data=mtcars)) 171 | (Intercept) wt 172 | 37.285126 -5.344472 173 | coef(lm(mpg ~ hp, data=mtcars)) 174 | (Intercept) hp 175 | 30.09886054 -0.06822828 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | predict(model, newdata = data.frame(wt=2.5, hp=275)) 185 | 1 186 | 18.79513 187 | 188 | 189 | 190 | 191 | 192 | 193 | Regression with a non-binary predictor 194 | 195 | # the dataset is in the car package 196 | library(car) 197 | model <- lm(wl2 ~ factor(group), data=WeightLoss) 198 | summary(model) 199 | Call: 200 | lm(formula = wl2 ~ factor(group), data = WeightLoss) 201 | Residuals: 202 | Min 1Q Median 3Q Max 203 | -2.100 -1.054 -0.100 0.900 2.900 204 | Coefficients: 205 | Estimate Std. Error t value Pr(>|t|) 206 | (Intercept) 3.3333 0.3756 8.874 5.12e-10 *** 207 | factor(group)Diet 0.5833 0.5312 1.098 0.281 208 | factor(group)DietEx 2.7667 0.5571 4.966 2.37e-05 *** 209 | --- 210 | Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 211 | Residual standard error: 1.301 on 31 degrees of freedom 212 | Multiple R-squared: 0.4632, Adjusted R-squared: 0.4285 213 | F-statistic: 13.37 on 2 and 31 DF, p-value: 6.494e-05 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | Kitchen sink regression 222 | 223 | # the period after the squiggly denotes all other variables 224 | model <- lm(mpg ~ ., data=mtcars) 225 | summary(model) 226 | Call: 227 | lm(formula = mpg ~ ., data = mtcars) 228 | Residuals: 229 | Min 1Q Median 3Q Max 230 | -3.4506 -1.6044 -0.1196 1.2193 4.6271 231 | Coefficients: 232 | Estimate Std. Error t value Pr(>|t|) 233 | (Intercept) 12.30337 18.71788 0.657 0.5181 234 | cyl -0.11144 1.04502 -0.107 0.9161 235 | disp 0.01334 0.01786 0.747 0.4635 236 | hp -0.02148 0.02177 -0.987 0.3350 237 | drat 0.78711 1.63537 0.481 0.6353 238 | wt -3.71530 1.89441 -1.961 0.0633 . 239 | qsec 0.82104 0.73084 1.123 0.2739 240 | vs 0.31776 2.10451 0.151 0.8814 241 | am 2.52023 2.05665 1.225 0.2340 242 | gear 0.65541 1.49326 0.439 0.6652 243 | carb -0.19942 0.82875 -0.241 0.8122 244 | --- 245 | Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 246 | Residual standard error: 2.65 on 21 degrees of freedom 247 | Multiple R-squared: 0.869, Adjusted R-squared: 0.8066 248 | F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07 249 | 250 | 251 | 252 | 253 | 254 | 255 | Cross-validation 256 | 257 | set.seed(1) 258 | train.indices <- sample(1:nrow(mtcars), nrow(mtcars)/2) 259 | training <- mtcars[train.indices,] 260 | testing <- mtcars[-train.indices,] 261 | model <- lm(mpg ~ ., data=training) 262 | summary(model) 263 | ..... (output truncated) 264 | Residual standard error: 1.188 on 5 degrees of freedom 265 | Multiple R-squared: 0.988, Adjusted R-squared: 0.9639 266 | F-statistic: 41.06 on 10 and 5 DF, p-value: 0.0003599 267 | 268 | 269 | 270 | 271 | mean((predict(model) - training$mpg) ^ 2) 272 | [1] 0.4408109 273 | # Cool, but how does it perform on the validation set? 
274 | mean((predict(model, newdata=testing) - testing$mpg) ^ 2) 275 | [1] 337.9995 276 | 277 | 278 | 279 | 280 | 281 | simpler.model <- lm(mpg ~ am + wt, data=training) 282 | mean((predict(simpler.model) - training$mpg) ^ 2) 283 | [1] 9.396091 284 | mean((predict(simpler.model, newdata=testing) - testing$mpg) ^ 2) 285 | [1] 12.70338 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | library(boot) 294 | bad.model <- glm(mpg ~ ., data=mtcars) 295 | better.model <- glm(mpg ~ am + wt + qsec, data=mtcars) 296 | bad.cv.err <- cv.glm(mtcars, bad.model, K=5) 297 | # the cross-validated MSE estimate we will be using 298 | # is a bias-corrected one stored as the second element 299 | # in the 'delta' vector of the cv.err object 300 | bad.cv.err$delta[2] 301 | [1] 14.92426 302 | 303 | better.cv.err <- cv.glm(mtcars, better.model, K=5) 304 | better.cv.err$delta[2] 305 | [1] 7.944148 306 | 307 | 308 | 309 | 310 | 311 | 312 | Linear regression diagnostics 313 | 314 | my.model <- lm(mpg ~ wt, data=mtcars) 315 | plot(my.model) 316 | 317 | 318 | 319 | 320 | 321 | Fourth Anscombe relationship 322 | 323 | model <- lm(mpg ~ am + wt + qsec, data=mtcars) 324 | library(car) 325 | vif(model) 326 | 327 | am wt qsec 328 | 329 | 330 | 331 | 332 | 333 | 334 | Advanced topics 335 | 336 | install.packages("glmnet") 337 | library(glmnet) 338 | vignette("glmnet_beta") 339 | 340 | 341 | 342 | 343 | my.model <- lm(relief ~ soma*juice, data=my.data) 344 | 345 | 346 | 347 | 348 | 349 | 350 | 2.541437 2.482952 1.364339 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | -------------------------------------------------------------------------------- /Chapter10/Chapter10.txt: -------------------------------------------------------------------------------- 1 | Using k-NN in R 2 | 3 | # "class" is one of the packages that implement k-NN 4 | # "chemometrics" contains a function we need 5 | # "mlbench" holds the dataset 6 | install.packages(c("class", "mlbench", "chemometrics")) 7 | library(class) 8 | library(mlbench) 9 | data(PimaIndiansDiabetes) 10 | PID <- PimaIndiansDiabetes 11 | 12 | 13 | # we set the seed so that our splits are the same 14 | set.seed(3) 15 | ntrain <- round(nrow(PID)*4/5) 16 | train <- sample(1:nrow(PID), ntrain) 17 | training <- PID[train,] 18 | testing <- PID[-train,] 19 | 20 | 21 | 22 | resknn <- knnEval(scale(PID[,-9]), PID[,9], train, kfold=10, 23 | knnvec=seq(1,50,by=1), 24 | legpos="bottomright") 25 | 26 | 27 | 28 | 29 | predictions <- knn(scale(training[,-9]), 30 | scale(testing[,-9]), 31 | training[,9], k=27) 32 | # function to give correct classification rate 33 | accuracy <- function(predictions, answers){ 34 | sum((predictions==answers)/(length(answers))) 35 | } 36 | accuracy(predictions, testing[,9]) 37 | [1] 0.7597403 38 | 39 | 40 | 41 | 42 | 43 | table(test[,9], preds) 44 | preds 45 | neg pos 46 | neg 86 9 47 | pos 28 31 48 | 49 | 50 | 51 | 52 | 53 | 54 | Using logistic regression in R 55 | 56 | 57 | model <- glm(diabetes ~ ., data=PID, family=binomial(logit)) 58 | 59 | 60 | 61 | 62 | 63 | summary(model) 64 | Call: 65 | glm(formula = diabetes ~ ., family = binomial(logit), data = PID) 66 | Deviance Residuals: 67 | Min 1Q Median 3Q Max 68 | -2.5566 -0.7274 -0.4159 0.7267 2.9297 69 | Coefficients: 70 | Estimate Std. 
Error z value Pr(>|z|) 71 | (Intercept) -8.4046964 0.7166359 -11.728 < 2e-16 *** 72 | pregnant 0.1231823 0.0320776 3.840 0.000123 *** 73 | glucose 0.0351637 0.0037087 9.481 < 2e-16 *** 74 | pressure -0.0132955 0.0052336 -2.540 0.011072 * 75 | 76 | 77 | 78 | 79 | 80 | predictions <- round(predict(model, type="response")) 81 | predictions <- ifelse(predictions == 1, "pos", "neg") 82 | accuracy(predictions, PID$diabetes) 83 | [1] 0.7825521 84 | 85 | 86 | 87 | 88 | 89 | set.seed(3) 90 | library(boot) 91 | cv.err <- cv.glm(PID, model, K=5) 92 | cv.err$delta[2] 93 | [1] 0.154716 94 | 1 - cv.err$delta[2] 95 | [1] 0.845284 96 | 97 | 98 | 99 | 100 | 101 | 102 | predictions <- round(predict(model, type="response", 103 | newdata=test)) 104 | predictions <- ifelse(predictions == 1, "pos", "neg") 105 | accuracy(predictions, test[,9]) # 78% 106 | [1] 0.7792208 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | Decision trees 116 | 117 | library(tree) 118 | our.big.tree <- tree(diabetes ~ ., data=training) 119 | summary(our.big.tree) 120 | Classification tree: 121 | tree(formula = diabetes ~ ., data = training) 122 | Variables actually used in tree construction: 123 | [1] "glucose" "age" "mass" "pedigree" "triceps" "pregnant" 124 | [7] "insulin" 125 | Number of terminal nodes: 16 126 | 127 | 128 | Residual mean deviance: 0.7488 = 447.8 / 598 129 | Misclassification error rate: 0.184 = 113 / 614 130 | plot(our.big.tree) 131 | text(our.big.tree) 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | set.seed(3) 141 | cv.results <- cv.tree(our.big.tree, FUN=prune.misclass) 142 | plot(cv.results$size, cv.results$dev, type="b") 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | pruned.tree <- prune.misclass(our.big.tree, best=3) 151 | plot(pruned.tree) 152 | text(pruned.tree) 153 | # let's test its accuracy 154 | pruned.preds <- predict(pruned.tree, newdata=test, type="class") 155 | accuracy(pruned.preds, test[,9]) # 71% 156 | [1] 0.7077922 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | Random forests 165 | 166 | library(randomForest) 167 | forest <- randomForest(diabetes ~ ., data=training, 168 | importance=TRUE, 169 | ntree=2000, 170 | mtry=5) 171 | accuracy(predict(forest), training[,9]) 172 | [1] 0.7654723 173 | predictions <- predict(forest, newdata = test) 174 | accuracy(predictions, test[,9]) 175 | [1] 0.7727273 176 | 177 | 178 | 179 | The circular decision boundary 180 | 181 | model <- glm(factor(dep.var) ~ ind.var1 + 182 | I(ind.var1^2) + ind.var2 + I(ind.var2^2), 183 | data=this, family=binomial(logit)) 184 | 185 | 186 | 187 | 188 | -------------------------------------------------------------------------------- /Chapter11/Chapter11.txt: -------------------------------------------------------------------------------- 1 | 2 | Creating and plotting time series 3 | 4 | 5 | > 6 | download.file("https://raw.githubusercontent.com/tonyfischetti/dawr/master/ 7 | forecasting/school-supplies.csv", "./school-supplies.csv") 8 | > school <- read.csv("./school-supplies.csv") 9 | > head(school) 10 | thedate hits 11 | 1 2004-01 20 12 | 2 2004-02 24 13 | 3 2004-03 19 14 | 4 2004-04 26 15 | 5 2004-05 25 16 | 6 2004-06 24 17 | > start(schoolts) 18 | [1] 2004 1 19 | > end(schoolts) 20 | [1] 2018 3 21 | 22 | > schoolts <- ts(school$hits, start=c(2004, 1), frequency=12) 23 | > schoolts 24 | Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec 25 | 2004 20 24 19 26 25 24 55 88 32 26 20 19 26 | 2005 27 24 22 19 24 22 57 82 34 21 19 19 27 | 2006 22 21 19 17 21 24 61 77 29 18 16 14 28 | 2007 22 16 17 18 21 25 56 86 27 18 16 14 29 
| 2008 19 16 16 16 19 20 55 79 24 16 14 12 30 | 2009 16 16 15 13 15 19 54 85 24 14 15 11 31 | 2010 17 15 13 13 15 18 49 78 23 13 11 10 32 | 2011 15 12 11 11 12 17 53 79 22 12 12 10 33 | 34 | 35 | 36 | > library(ggplot2) 37 | > library(forecast) 38 | > autoplot(schoolts) 39 | 40 | 41 | 42 | 43 | 44 | > 45 | download.file("https://raw.githubusercontent.com/tonyfischetti/dawr/master/ 46 | forecasting/yearly-land-air-temp.csv", "yearly-temp.csv") 47 | > yearlytemp <- read.csv("yearly-temp.csv") 48 | > head(yearlytemp) 49 | year val 50 | 1 1880 -0.1900000 51 | 2 1881 -0.1008333 52 | 3 1882 -0.1108333 53 | 4 1883 -0.1941667 54 | 5 1884 -0.2966667 55 | 6 1885 -0.3175000 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | > tempts <- ts(yearlytemp$val, start=1880, frequency=1) 66 | > autoplot(tempts) 67 | 68 | 69 | 70 | 71 | 72 | > # set the seed for deterministic "randomness" 73 | > set.seed(2) 74 | > gausnoise <- rnorm(100, mean=100, sd=20) 75 | > head(gausnoise) 76 | [1] 82.06171 103.69698 131.75691 77.39249 98.39496 102.64841 77 | > gausts <- ts(gausnoise, start=1900, frequency=1) 78 | > autoplot(gausts) 79 | 80 | 81 | 82 | 83 | 84 | > autoplot(AirPassengers) 85 | 86 | 87 | 88 | 89 | 90 | 91 | > autoplot(schoolts) + 92 | + ylim(0, 200) + 93 | + ggtitle('google hits for "school supplies" since 2004') + 94 | + ggsave("school-supplies-fancy.png") 95 | Saving 7 x 6.43 in image 96 | 97 | 98 | 99 | 100 | 101 | Time series decomposition 102 | 103 | > schoolcomps <- decompose(schoolts) 104 | > autoplot(schoolcomps) 105 | 106 | 107 | 108 | 109 | 110 | > aircomps <- decompose(AirPassengers, type="multiplicative") 111 | > autoplot(aircomps) 112 | 113 | 114 | 115 | 116 | 117 | 118 | Autocorrelation 119 | 120 | Lag 0 8 6 7 5 3 0 9 121 | Lag 1 NA 8 6 7 5 3 0 122 | Lag 2 NA NA 8 6 7 5 3 123 | 124 | 125 | 126 | 127 | 128 | > ggAcf(schoolts) 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | > Box.test(schoolts, type="Ljung-Box") 137 | Box-Ljung test 138 | data: schoolts 139 | X-squared = 39.472, df = 1, p-value = 0.0000000003328 140 | 141 | > Box.test(tempts, type="Ljung-Box") 142 | Box-Ljung test 143 | data: tempts 144 | X-squared = 121.71, df = 1, p-value < 2.2e-16 145 | > 146 | > Box.test(gausts, type="Ljung-Box") 147 | Box-Ljung test 148 | data: gausts 149 | X-squared = 0.36027, df = 1, p-value = 0.5484 150 | 151 | 152 | > 153 | > Box.test(AirPassengers, type="Ljung-Box") 154 | Box-Ljung test 155 | data: AirPassengers 156 | X-squared = 132.14, df = 1, p-value < 2.2e-16 157 | 158 | 159 | 160 | 161 | 162 | 163 | Smoothing 164 | 165 | > library(TTR) 166 | > sm5 <- SMA(gausnoise, n=5) 167 | > sm10 <- SMA(gausnoise, n=10) 168 | > sm15 <- SMA(gausnoise, n=15) 169 | > head(sm5, n=10) 170 | [1] NA NA NA NA 98.66061 102.77795 104.87037 171 | [8] 97.56020 110.01960 109.78546 172 | 173 | 174 | 175 | > se1 <- ses(gausts, alpha=.999) 176 | > se.66 <- ses(gausts, alpha=.666) 177 | > se.33 <- ses(gausts, alpha=.333) 178 | > se.1 <- ses(gausts, alpha=0.1) 179 | > se0 <- ses(gausts, alpha=0.001) 180 | > head(fitted(se.1)) 181 | Time Series: 182 | Start = 1900 183 | End = 1905 184 | Frequency = 1 185 | [1] 102.9813 100.8894 101.1701 104.2288 101.5452 101.2301 186 | 187 | 188 | 189 | 190 | 191 | 192 | Simple exponential smoothing for forecasting 193 | 194 | > mfore <- meanf(gausts, h=20) 195 | > autoplot(mfore) 196 | 197 | 198 | 199 | 200 | 201 | > nfore <- naive(gausts, h=20) 202 | > sfore <- ses(gausts, h=20) 203 | > # we are setting PI to false to suppress 204 | > # plotting the prediction intervals 205 | > autoplot(gausts) + 
206 | + ylim(0, 200) + xlim(1900, 2010) + 207 | + autolayer(mfore, PI=FALSE, series="mean") + 208 | + autolayer(nfore, PI=FALSE, series="naive") + 209 | + autolayer(sfore, PI=FALSE, series="ses") 210 | 211 | 212 | 213 | 214 | 215 | 216 | > summary(sfore) 217 | Forecast method: Simple exponential smoothing 218 | Model Information:Simple exponential smoothing 219 | Call: 220 | ses(y = gausts, h = 20) 221 | Smoothing parameters: 222 | alpha = 0.0001 223 | Initial states: 224 | l = 99.3055 225 | sigma: 23.0888 226 | AIC AICc BIC 227 | 1094.386 1094.636 1102.202 228 | Error measures: 229 | ME RMSE MAE MPE MAPE MASE 230 | Training set 0.07145194 23.08877 19.04151 -5.942264 20.89301 0.7154352 231 | ACF1 232 | Training set -0.05912586 233 | Forecasts: 234 | Point Forecast Lo 80 Hi 80 Lo 95 Hi 95 235 | 2000 99.3062 69.71675 128.8957 54.05305 144.5594 236 | 2001 99.3062 69.71675 128.8957 54.05305 144.5594 237 | 2002 99.3062 69.71675 128.8957 54.05305 144.5594 238 | .... 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | Accuracy assessment 247 | 248 | 249 | > checkresiduals(mfore) 250 | Ljung-Box test 251 | data: Residuals from Mean 252 | Q* = 12.702, df = 9, p-value = 0.1766 253 | Model df: 1. Total lags used: 10 254 | > 255 | > checkresiduals(nfore) 256 | Ljung-Box test 257 | data: Residuals from Naive method 258 | Q* = 58.18, df = 10, p-value = 0.000000007996 259 | Model df: 0. Total lags used: 10 260 | > 261 | > checkresiduals(sfore) 262 | Ljung-Box test 263 | data: Residuals from Simple exponential smoothing 264 | Q* = 12.701, df = 8, p-value = 0.1226 265 | Model df: 2. Total lags used: 10 266 | 267 | 268 | 269 | 270 | 271 | > train <- window(gausts, start=1900, end=1980) 272 | > nforetr <- naive(train, h=20) 273 | > mforetr <- meanf(train, h=20) 274 | > sforetr <- ses(train, h=20) 275 | 276 | 277 | 278 | 279 | 280 | > accuracy(nforetr, gausts) 281 | RMSE MAE 282 | Training set 33.41918 26.86576 283 | Test set 36.44798 33.16893 284 | > 285 | accuracy(mforetr, gausts) 286 | RMSE MAE 287 | Training set 22.41742 18.33581 288 | Test set 25.96117 22.84341 289 | > 290 | > accuracy(sforetr, gausts) 291 | RMSE MAE 292 | Training set 22.41900 18.34584 293 | Test set 26.00306 22.91065 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | > tsCV(gausts, meanf, h=5) 302 | Time Series: 303 | Start = 1900 304 | End = 1999 305 | Frequency = 1 306 | [1] NA NA NA NA NA 20.5866966 307 | [7] 21.2797482 -10.6324936 40.9624573 -1.4363504 9.0277723 18.1906911 308 | 309 | 310 | 311 | 312 | 313 | 314 | ts_cv_rmse <- function(thets, thefun, h=5){ 315 | resids <- tsCV(thets, thefun, h=h) 316 | return(sqrt(mean(resids^2, na.rm=TRUE))) 317 | } 318 | 319 | 320 | 321 | 322 | 323 | 324 | > ts_cv_rmse(gausts, naive) 325 | [1] 33.82626 326 | > ts_cv_rmse(gausts, meanf) 327 | [1] 23.75519 328 | > ts_cv_rmse(gausts, ses) 329 | [1] 24.00897 330 | 331 | 332 | 333 | 334 | 335 | Double exponential smoothing 336 | 337 | hwfr <- holt(tempts, h=50) 338 | autoplot(hwfr) 339 | 340 | 341 | 342 | 343 | 344 | > hwfrd <- holt(tempts, h=50, damped=TRUE) 345 | > autoplot(hwfrd) 346 | 347 | 348 | 349 | 350 | > checkresiduals(hwfr) 351 | Ljung-Box test 352 | data: Residuals from Holt's method 353 | Q* = 16.666, df = 6, p-value = 0.01059 354 | Model df: 4. Total lags used: 10 355 | > 356 | > checkresiduals(hwfrd) 357 | Ljung-Box test 358 | data: Residuals from Damped Holt's method 359 | Q* = 17.769, df = 5, p-value = 0.00325 360 | Model df: 5. 
Total lags used: 10 361 | 362 | 363 | 364 | 365 | 366 | 367 | > ts_cv_rmse(tempts, holt) 368 | [1] 0.1780076 369 | > # have to use an anonymous function in order to specify a damped trend 370 | > ts_cv_rmse(tempts, function(x, h) holt(x, damped=TRUE, h=h)) 371 | [1] 0.1892179 372 | > ts_cv_rmse(tempts, meanf) 373 | [1] 0.3555613 374 | > ts_cv_rmse(tempts, naive) 375 | [1] 0.1631819 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | Triple exponential smoothing 384 | 385 | 386 | > hwfore <- hw(schoolts, h=48, seasonal="additive") 387 | > autoplot(hwfore) 388 | 389 | 390 | ts_cv_rmse(schoolts, function(x, h) hw(x, seasonal="additive")) 391 | [1] 9.34646 392 | ts_cv_rmse(schoolts, snaive) 393 | [1] 3 394 | 395 | 396 | 397 | 398 | > hwfore <- hw(AirPassengers, h=48, seasonal="multiplicative") 399 | > # now let's compare the simpler approaches 400 | > snfore <- snaive(AirPassengers, h=48) 401 | > nfore <- naive(AirPassengers, h=48) 402 | > mfore <- meanf(AirPassengers, h=48) 403 | > ts_cv_rmse(AirPassengers, function(x, h) hw(x, 404 | seasonal="multiplicative")) 405 | [1] 22.33878 406 | > ts_cv_rmse(AirPassengers, snaive) 407 | [1] 35.9892 408 | > ts_cv_rmse(AirPassengers, naive) 409 | [1] 77.16156 410 | > ts_cv_rmse(AirPassengers, meanf) 411 | [1] 126.4193 412 | 413 | 414 | 415 | 416 | 417 | 418 | ETS and the state space model 419 | 420 | 421 | > etsobj <- ets(schoolts) 422 | > summary(etsobj) 423 | ETS(M,A,A) 424 | Call: 425 | ets(y = schoolts) 426 | Smoothing parameters: 427 | alpha = 0.0464 428 | beta = 0.0017 429 | gamma = 0.3957 430 | ... 431 | AIC AICc BIC 432 | 1107.799 1111.799 1161.207 433 | Training set error measures: 434 | ME RMSE MAE MPE MAPE MASE ACF1 435 | Training set 0.3560261 3.080813 1.879578 -0.146184 7.017015 0.9028789 436 | 0.2787678 437 | > 438 | > etsfore <- forecast(etsobj, h=48) 439 | 440 | 441 | 442 | 443 | 444 | 445 | Interventions for improvement 446 | 447 | > etsobj <- ets(tempts) 448 | > summary(etsobj) 449 | ETS(A,N,N) 450 | Call: 451 | ets(y = tempts) 452 | ... 453 | > autoplot(forecast(etsobj, h=50)) 454 | 455 | 456 | 457 | 458 | 459 | > etsobj2 <- ets(window(tempts, start=1970)) 460 | > summary(etsobj2) 461 | ETS(A,A,N) 462 | Call: 463 | ets(y = window(tempts, start = 1970)) 464 | ... 465 | > autoplot(forecast(etsobj2, h=50)) 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | -------------------------------------------------------------------------------- /Chapter12/Chapter12.txt: -------------------------------------------------------------------------------- 1 | Relational databases 2 | 3 | 4 | CREATE TABLE artists( 5 | artist_id INTEGER PRIMARY KEY, 6 | name TEXT, 7 | born_on INTEGER 8 | ); 9 | CREATE TABLE paintings( 10 | painting_id INTEGER PRIMARY KEY, 11 | painting_artist INTEGER, 12 | painting_name TEXT, 13 | year_completed INTEGER, 14 | FOREIGN KEY(painting_artist) REFERENCES artists(artist_id) 15 | ); 16 | INSERT INTO artists(name, born_on) 17 | VALUES ("Kay Sage", 1898), 18 | ("Piet Mondrian", 1872), 19 | ("Rene Magritte", 1898), 20 | ("Man Ray", 1890), 21 | ("Jean-Michel Basquiat", 1960); 22 | INSERT INTO paintings(painting_artist, painting_name, year_completed) 23 | VALUES (4, "Orquesta Sinfonica", 1916), 24 | (4, "La Fortune", 1938), 25 | (1, "Tommorow is Never", 1955), 26 | (1, "The Answer is No", 1958), 27 | (1, "No Passing", 1954), 28 | (5, "Bird on Money", 1981), 29 | (2, "Place de la Concorde", 1943), 30 | (2, "Composition No. 
10", 1942), 31 | (3, "The Human Condition", 1935), 32 | (3, "The Treachery of Images", 1948), 33 | (3, "The Son of Man", 1964); 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | SELECT * FROM artists; 42 | -------------------------------- 43 | 1 | Kay Sage | 1898 44 | 2 | Piet Mondrian | 1872 45 | 3 | Rene Magritte | 1898 46 | 4 | Man Ray | 1890 47 | 5 | Jean-Michel Basquiat | 1960 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | SELECT * FROM paintings; 56 | -------------------------------------- 57 | 1 | 4 | Orquesta Sinfonica | 1916 58 | 2 | 4 | La Fortune | 1938 59 | 3 | 1 | Tommorow is Never | 1955 60 | 4 | 1 | The Answer is No | 1958 61 | 5 | 1 | No Passing | 1954 62 | 6 | 5 | Bird on Money | 1981 63 | 7 | 2 | Place de la Concorde | 1943 64 | 8 | 2 | Composition No. 10 | 1942 65 | 9 | 3 | The Human Condition | 1935 66 | 10 | 3 | The Treachery of Images | 1948 67 | 11 | 3 | The Son of Man | 1964 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | library(DBI) 76 | library(RSQLite) 77 | # we read the art sqlite db from the current 78 | # working directory which can be get and set 79 | # with getwd() and setwd(), respectively 80 | art_db <- dbConnect(sqlite, "./art.db") 81 | 82 | 83 | 84 | 85 | 86 | result <- dbSendQuery(art_db, 87 | "SELECT paintings.painting_name, artists.name 88 | FROM paintings INNER JOIN artists 89 | ON paintings.painting_artist=artists.artist_id;") 90 | response <- fetch(result) 91 | head(response) 92 | dbClearResult(result) 93 | ---------------------------------------------- 94 | painting_name name 95 | 1 Orquesta Sinfonica Man Ray 96 | 2 La Fortune Man Ray 97 | 3 Tommorow is Never Kay Sage 98 | 4 The Answer is No Kay Sage 99 | 5 No Passing Kay Sage 100 | 101 | 102 | 103 | 104 | 105 | 106 | result <- dbSendQuery(art_db, 107 | "SELECT paintings.year_completed, artists.born_on 108 | FROM paintings INNER JOIN artists 109 | ON paintings.painting_artist=artists.artist_id;") 110 | response <- fetch(result) 111 | head(response) 112 | dbClearResult(result) 113 | ---------------------------- 114 | year_completed born_on 115 | 1 1916 1890 116 | 2 1938 1890 117 | 3 1955 1898 118 | 4 1958 1898 119 | 5 1954 1898 120 | 6 1981 1960 121 | At this time, row-wise subtraction and averaging can be performed simply: 122 | mean(response$year_completed - response$born_on) 123 | ----------- 124 | [1] 51.091 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | dbDisconnect(art_db) 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | Using JSON 143 | 144 | library(jsonlite) 145 | example.json <- ' 146 | { 147 | "thebeatles": { 148 | "formed": 1960, 149 | "members": [ 150 | { 151 | "firstname": "George", 152 | "lastname": "Harrison" 153 | }, 154 | { 155 | "firstname": "Ringo", 156 | "lastname": "Starr" 157 | }, 158 | { 159 | "firstname": "Paul", 160 | "lastname": "McCartney" 161 | }, 162 | { 163 | "firstname": "John", 164 | "lastname": "Lennon" 165 | } 166 | ] 167 | } 168 | }' 169 | the_beatles <- fromJSON(example.json) 170 | 171 | 172 | 173 | print(the_beatles) 174 | --------------------- 175 | $thebeatles 176 | $thebeatles$formed 177 | [1] 1960 178 | $thebeatles$members 179 | firstname lastname 180 | 1 George Harrison 181 | 2 Ringo Starr 182 | 3 Paul McCartney 183 | 4 John Lennon 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | the_beatles$thebeatles$formed 192 | the_beatles[["thebeatles"]][["formed"]] 193 | --------- 194 | [1] 1960 195 | [1] 1960 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | { 204 | "toptags": { 205 | "tag": [ 206 | { 207 | "count": 100, 208 | "name": "female vocalists", 209 | "url": 
"http://www.last.fm/tag/female+vocalists" 210 | }, 211 | { 212 | "count": 71, 213 | "name": "singer-songwriter", 214 | "url": "http://www.last.fm/tag/singer-songwriter" 215 | }, 216 | { 217 | "count": 65, 218 | "name": "pop", 219 | "url": "http://www.last.fm/tag/pop" 220 | } 221 | ] 222 | } 223 | } 224 | 225 | 226 | 227 | 228 | 229 | URLencode("The Beatles") 230 | ------- 231 | [1] "The%20Beatles" 232 | 233 | 234 | 235 | 236 | create_artist_query_url_lfm <- function(artist_name){ 237 | prefix <- 238 | "http://ws.audioscrobbler.com/2.0/?method=artist.gettoptags&artist=" 239 | postfix <- "&api_key=c2e57923a25c03f3d8b317b3c8622b43&format=json" 240 | encoded_artist <- URLencode(artist_name) 241 | return(paste0(prefix, encoded_artist, postfix)) 242 | } 243 | create_artist_query_url_lfm("Depeche Mode") 244 | -------------------- 245 | [1] 246 | "http://ws.audioscrobbler.com/2.0/?method=artist.gettoptags&artist=Depeche% 247 | 20Mode&api_key=c2e57923a25c03f3d8b317b3c8622b43&format=json" 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | fromJSON(create_artist_query_url_lfm("Depeche Mode")) 258 | ----------------------------------------- 259 | $toptags 260 | $toptags$tag 261 | count name url 262 | 1 100 electronic http://www.last.fm/tag/electronic 263 | 2 87 new wave http://www.last.fm/tag/new+wave 264 | 3 59 80s http://www.last.fm/tag/80s 265 | 4 56 synth pop http://www.last.fm/tag/synth+pop 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | get_tag_vector_lfm <- function(an_artist){ 277 | artist_url <- create_artist_query_url_lfm(an_artist) 278 | json <- fromJSON(artist_url) 279 | return(json$toptags$tag$name) 280 | } 281 | get_tag_vector_lfm("Depeche Mode") 282 | ------------------------------------------ 283 | [1] "electronic" "new wave" "80s" 284 | [4] "synth pop" "synthpop" "seen live" 285 | [7] "alternative" "rock" "british 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | our_artists <- list("Kate Bush", "Peter Tosh", "Radiohead", 300 | "The Smiths", "The Cure", "Black Uhuru") 301 | our_artists_tags <- lapply(our_artists, get_tag_vector_lfm) 302 | names(our_artists_tags) <- our_artists 303 | print(our_artists_tags) 304 | -------------------------------------- 305 | $`Kate Bush` 306 | [1] "female vocalists" "singer-songwriter" "pop" 307 | [4] "alternative" "80s" "british" 308 | ........ 309 | $`Peter Tosh` 310 | [1] "reggae" "roots reggae" "Rasta" 311 | [4] "roots" "ska" "jamaican" 312 | ........ 313 | $Radiohead 314 | [1] "alternative" "alternative rock" 315 | [3] "rock" "indie" 316 | 317 | ........ 318 | $`The Smiths` 319 | [1] "indie" "80s" "post-punk" 320 | [4] "new wave" "alternative" "rock" 321 | ........ 322 | $`The Cure` 323 | [1] "post-punk" "new wave" "alternative" 324 | [4] "80s" "rock" "seen live" 325 | ........ 
326 | $`Black Uhuru` 327 | [1] "reggae" "roots reggae" "dub" 328 | [4] "jamaica" "roots" "jamaican" 329 | 330 | 331 | 332 | 333 | 334 | jaccard_index <- function(one, two){ 335 | length(intersect(one, two))/length(union(one, two)) 336 | } 337 | 338 | 339 | 340 | 341 | 342 | jaccard_index(our_artists_tags[["Radiohead"]], 343 | our_artists_tags[["The Cure"]]) 344 | 345 | --------------- 346 | [1] 0.3333 347 | 348 | 349 | 350 | 351 | 352 | 353 | similarity_matrix <- function(artist_list, similarity_fn) { 354 | num <- length(artist_list) 355 | # initialize a num by num matrix of zeroes 356 | sim_matrix <- matrix(0, ncol = num, nrow = num) 357 | # name the rows and columns for easy lookup 358 | rownames(sim_matrix) <- names(artist_list) 359 | colnames(sim_matrix) <- names(artist_list) 360 | # for each row in the matrix 361 | for(i in 1:nrow(sim_matrix)) { 362 | # and each column 363 | for(j in 1:ncol(sim_matrix)) { 364 | # calculate that pair's similarity 365 | the_index <- similarity_fn(artist_list[[i]], 366 | artist_list[[j]]) 367 | # and store it in the right place in the matrix 368 | sim_matrix[i,j] <- round(the_index, 2) 369 | } 370 | } 371 | return(sim_matrix) 372 | } 373 | sim_matrix <- similarity_matrix(our_artists_tags, jaccard_index) 374 | print(sim_matrix) 375 | 376 | 377 | 378 | 379 | 380 | 381 | -------------------------------------------------------------- 382 | Kate Bush Peter Tosh Radiohead The Smiths The Cure Black Uhuru 383 | Kate Bush 1.00 0.05 0.31 0.25 0.21 0.04 384 | Peter Tosh 0.05 1.00 0.02 0.03 0.03 0.33 385 | Radiohead 0.31 0.02 1.00 0.31 0.33 0.04 386 | The Smiths 0.25 0.03 0.31 1.00 0.44 0.05 387 | The Cure 0.21 0.03 0.33 0.44 1.00 0.05 388 | Black Uhuru 0.04 0.33 0.04 0.05 0.05 1.00 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | # The Smiths are the fourth column 402 | sim_matrix[order(sim_matrix[,4], decreasing=TRUE), 4] 403 | ---------------------------------------------- 404 | The Smiths The Cure Radiohead Kate Bush Black Uhuru 405 | 1.00 0.44 0.31 0.25 0.05 406 | Peter Tosh 407 | 0.03 408 | 409 | 410 | 411 | 412 | 413 | 414 | XML 415 | 416 | 417 | example_xml1 <- ' 418 | 419 | 1960 420 | 421 | 422 | George 423 | Harrison 424 | 425 | 426 | Ringo 427 | Starr 428 | 429 | 430 | Paul 431 | McCartney 432 | 433 | 434 | John 435 | Lennon 436 | 437 | 438 | ' 439 | 440 | 441 | 442 | 443 | 444 | 445 | library(XML) 446 | the_beatles <- xmlTreeParse(example_xml1) 447 | print(names(the_beatles)) 448 | ------------------- 449 | [1] "doc" "dtd" 450 | print(the_beatles$doc) 451 | --------------------- 452 | $file 453 | [1] "" 454 | $version 455 | [1] "1.0" 456 | $children 457 | $children$the_beatles 458 | 459 | 1960 460 | 461 | 462 | George 463 | Harrison 464 | 465 | .......... 
466 | 467 | 468 | attr(,"class") 469 | [1] "XMLDocumentContent" 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | print(xmlValue(the_beatles$doc$children$the_beatles[["formed"]])) 483 | ---------------------- 484 | [1] "1960" 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | root <- xmlRoot(the_beatles) 495 | sapply(xmlChildren(root[["members"]]), function(x){ 496 | xmlValue(x[["first_name"]]) 497 | }) 498 | ------------------------------------------- 499 | member member member member 500 | "George" "Ringo" "Paul" "John" 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | all_first_names <- "//member/first_name" 510 | 511 | 512 | 513 | 514 | 515 | the_beatles <- xmlParse(example_xml1) 516 | getNodeSet(the_beatles, all_first_names) 517 | -------- 518 | [[1]] 519 | George 520 | [[2]] 521 | Ringo 522 | [[3]] 523 | Paul 524 | [[4]] 525 | John 526 | attr(,"class") 527 | [1] "XMLNodeSet" 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | getNodeSet(the_beatles, "//first_name") 538 | getNodeSet(the_beatles, "/the_beatles/members/member/first_name") 539 | 540 | 541 | 542 | 543 | 544 | sapply(getNodeSet(the_beatles, all_first_names), xmlValue) 545 | ------------------------------- 546 | [1] "George" "Ringo" "Paul" "John" 547 | 548 | 549 | example_xml2 <- ' 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | ' 558 | 559 | 560 | 561 | 562 | 563 | 564 | sapply(getNodeSet(the_beatles, "//member[@first_name]"), 565 | function(x){ xmlAttrs(x)[["first_name"]] }) 566 | ----------- 567 | [1] "George" "Richard" "Paul" "John" 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | Kate Bush 578 | 579 | 580 | kent 581 | 582 | 583 | english 584 | 585 | 586 | british 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | create_artist_query_url_mb <- function(artist){ 602 | encoded_artist <- URLencode(artist) 603 | return(paste0("http://musicbrainz.org/ws/2/artist/?query=artist:", 604 | encoded_artist)) 605 | } 606 | create_artist_query_url_mb("Depeche Mode") 607 | ------- 608 | [1] "http://musicbrainz.org/ws/2/artist/?query=artist:Depeche%20Mode" 609 | 610 | 611 | 612 | 613 | 614 | ns <- "http://musicbrainz.org/ns/mmd-2.0#" 615 | names(ns)[1] <- "ns" 616 | 617 | 618 | 619 | 620 | 621 | 622 | get_tag_vector_mb <- function(an_artist, ns){ 623 | artist_url <- create_artist_query_url_mb(an_artist) 624 | the_xml <- xmlParse(artist_url) 625 | xpath <- "//ns:artist[1]/ns:tag-list/ns:tag/ns:name" 626 | the_nodes <- getNodeSet(the_xml, xpath, ns) 627 | return(unlist(lapply(the_nodes, xmlValue))) 628 | } 629 | get_tag_vector_mb("Depeche Mode", ns) 630 | ------------------------------------- 631 | [1] "electronica" "post punk" "alternative dance" 632 | [4] "electronic" "dark wave" "britannique" 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | our_artists <- list("Kate Bush", "Peter Tosh", "Radiohead", 643 | "The Smiths", "The Cure", "Black Uhuru") 644 | our_artists_tags_mb <- lapply(our_artists, get_tag_vector_mb, ns) 645 | names(our_artists_tags_mb) <- our_artists 646 | sim_matrix <- similarity_matrix(our_artists_tags_mb, jaccard_index) 647 | print(sim_matrix) 648 | ------- 649 | Kate Bush Peter Tosh Radiohead The Smiths The Cure Black Uhuru 650 | Kate Bush 1.00 0.00 0.24 0.27 0.24 0.00 651 | Peter Tosh 0.00 1.00 0.00 0.00 0.00 0.17 652 | Radiohead 0.24 0.00 1.00 0.23 0.23 0.00 653 | The Smiths 0.27 0.00 0.23 1.00 0.38 0.00 654 | The Cure 0.24 0.00 0.23 0.38 1.00 0.00 655 | Black Uhuru 0.00 0.17 0.00 0.00 0.00 1.00 656 | 657 | 
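The next command ranks every artist by similarity to The Smiths by indexing the matrix with its column number (4). As an aside, the same lookup can be done by artist name; the helper below is not part of the original code, just a sketch that assumes the sim_matrix built above, but it is less error-prone once the artist list changes:

# hypothetical helper, not from the chapter: rank all artists by
# their similarity to one named artist using the matrix built above
most_similar_to <- function(artist, sim_matrix){
  # the matrix has artist names on both dimensions, so we can
  # index the column by name and sort it in decreasing order
  sort(sim_matrix[, artist], decreasing=TRUE)
}

most_similar_to("The Smiths", sim_matrix)
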
658 | > sim_matrix[order(sim_matrix[,4], decreasing=TRUE), 4] 659 | ------------------------------- 660 | The Smiths The Cure Kate Bush Radiohead Peter Tosh Black Uhuru 661 | 1.00 0.38 0.27 0.23 0.00 0.00 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | for(i in 1:length(our_artists_tags)){ 670 | the_artist <- names(our_artists_tags)[i] 671 | # the_artist now holds the current artist's name 672 | combined_tags <- union(our_artists_tags[[the_artist]], 673 | our_artists_tags_mb[[the_artist]]) 674 | our_artists_tags[[the_artist]] <- combined_tags 675 | } 676 | sim_matrix <- similarity_matrix(our_artists_tags, jaccard_index) 677 | print(sim_matrix) 678 | -------- 679 | Kate Bush Peter Tosh Radiohead The Smiths The Cure Black Uhuru 680 | Kate Bush 1.00 0.04 0.29 0.24 0.19 0.03 681 | Peter Tosh 0.04 1.00 0.01 0.03 0.03 0.29 682 | Radiohead 0.29 0.01 1.00 0.29 0.30 0.03 683 | The Smiths 0.24 0.03 0.29 1.00 0.40 0.05 684 | The Cure 0.19 0.03 0.30 0.40 1.00 0.05 685 | Black Uhuru 0.03 0.29 0.03 0.05 0.05 1.00 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | -------------------------------------------------------------------------------- /Chapter12/music-recommendations.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript --vanilla 2 | 3 | ########################################################### 4 | ## ## 5 | ## music-recommendations.R ## 6 | ## ## 7 | ## Author: Tony Fischetti ## 8 | ## tony.fischetti@gmail.com ## 9 | ## ## 10 | ########################################################### 11 | 12 | # workspace cleanup 13 | rm(list=ls()) 14 | 15 | # options 16 | options(echo=TRUE) 17 | options(stringsAsFactors=FALSE) 18 | 19 | # cli args 20 | args <- commandArgs(trailingOnly=TRUE) 21 | 22 | # libraries 23 | library(magrittr) 24 | library(assertr) 25 | library(XML) 26 | 27 | 28 | 29 | # takes a URL that produces XML http result 30 | # and returns parsed XML object 31 | get_xml_response <- function(a_url){ 32 | return(xmlParse(a_url)) 33 | } 34 | 35 | 36 | # takes a artist name and creates a url that 37 | # can be used to query music brainz for the artist 38 | create_artist_query_url <- function(artist){ 39 | encoded_artist <- URLencode(artist) 40 | return(paste0("http://musicbrainz.org/ws/2/artist/?query=artist:", 41 | encoded_artist)) 42 | } 43 | 44 | 45 | # takes the xml response from a music brainz query, and 46 | # the place of the artist in the artist list (usually 1) 47 | # and xml namespace 48 | # returns a vector of the artist tags 49 | get_tag_vector <- function(xml, place, ns){ 50 | xpath <- paste0("//ns:artist[", place, "]/ns:tag-list/ns:tag/ns:name") 51 | the_nodes <- getNodeSet(xml, xpath, ns) 52 | return(unlist(lapply(the_nodes, xmlValue))) 53 | } 54 | 55 | 56 | # combines all steps 57 | get_artists_tags <- function(artist, ns){ 58 | the_query <- create_artist_query_url(artist) 59 | xml_resp <- get_xml_response(the_query) 60 | tag_vector <- get_tag_vector(xml_resp, 1, ns) 61 | return(tag_vector) 62 | } 63 | 64 | 65 | jaccard <- function(one, two){ 66 | length(intersect(one, two))/length(union(one, two)) 67 | } 68 | 69 | make_similarity_matrix <- function(artist_list, similarity_fn) { 70 | num <- length(artist_list) 71 | sim_matrix <- matrix(0, ncol = num, nrow = num) 72 | rownames(sim_matrix) <- names(artist_list) 73 | colnames(sim_matrix) 
<- names(artist_list) 74 | for(i in 1:nrow(sim_matrix)) { 75 | for(j in 1:ncol(sim_matrix)) { 76 | sim_matrix[i,j] <- round( 77 | similarity_fn(artist_list[[i]], artist_list[[j]]), 2) 78 | } 79 | } 80 | return(sim_matrix) 81 | } 82 | 83 | 84 | 85 | 86 | 87 | ns <- "http://musicbrainz.org/ns/mmd-2.0#" 88 | names(ns)[1] <- "ns" 89 | 90 | 91 | 92 | kate.bush <- get_artists_tags("Kate Bush", ns) 93 | peter.tosh <- get_artists_tags("Peter Tosh", ns) 94 | radiohead <- get_artists_tags("Radiohead", ns) 95 | the.smiths <- get_artists_tags("The Smiths", ns) 96 | the.cure <- get_artists_tags("The Cure", ns) 97 | black.uhuru <- get_artists_tags("Black Uhuru", ns) 98 | 99 | 100 | the.list <- list(kate.bush, peter.tosh, radiohead, 101 | the.smiths, the.cure, black.uhuru) 102 | names(the.list) <- c("kate bush", "peter tosh", "radiohead", 103 | "the smiths", "the cure", "black uhuru") 104 | 105 | (make_similarity_matrix(the.list, jaccard)) 106 | 107 | 108 | #--------------------------------------------------# 109 | #--------------------------------------------------# 110 | #--------------------------------------------------# 111 | 112 | 113 | library(jsonlite) 114 | 115 | 116 | create_artist_query_url_lfm <- function(artist){ 117 | encoded_artist <- URLencode(artist) 118 | return(paste0("http://ws.audioscrobbler.com/2.0/?method=", 119 | "artist.gettoptags&artist=", 120 | encoded_artist, "&api_key=c2e57923a25c03f3d8b31", 121 | "7b3c8622b43&format=json")) 122 | } 123 | 124 | get_json_response <- function(a_url){ 125 | return(fromJSON(a_url)) 126 | } 127 | 128 | get_tag_vector_lfm <- function(json){ 129 | return(json$toptags$tag$name) 130 | } 131 | 132 | get_artists_tags_lfm <- function(artist){ 133 | the_query <- create_artist_query_url_lfm(artist) 134 | json_resp <- get_json_response(the_query) 135 | print(json_resp) 136 | tag_vector <- get_tag_vector_lfm(json_resp) 137 | return(tag_vector) 138 | } 139 | 140 | 141 | get_artists_tags_lfm("Kate Bush") 142 | 143 | 144 | 145 | our_artists <- list("Kate Bush", "Peter Tosh", "Radiohead", 146 | "The Smiths", "The Cure", "Black Uhuru") 147 | our_artists_tags <- lapply(our_artists, get_artists_tags_lfm) 148 | names(our_artists_tags) <- our_artists 149 | 150 | 151 | jaccard <- function(one, two){ 152 | length(intersect(one, two))/length(union(one, two)) 153 | } 154 | 155 | make_similarity_matrix <- function(artist_list, similarity_fn) { 156 | num <- length(artist_list) 157 | sim_matrix <- matrix(0, ncol = num, nrow = num) 158 | rownames(sim_matrix) <- names(artist_list) 159 | colnames(sim_matrix) <- names(artist_list) 160 | for(i in 1:nrow(sim_matrix)) { 161 | for(j in 1:ncol(sim_matrix)) { 162 | sim_matrix[i,j] <- round( 163 | similarity_fn(artist_list[[i]], artist_list[[j]]), 2) 164 | } 165 | } 166 | return(sim_matrix) 167 | } 168 | 169 | 170 | 171 | (make_similarity_matrix(our_artists_tags, jaccard)) 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /Chapter13/Chapter13.txt: -------------------------------------------------------------------------------- 1 | Visualizing missing data 2 | 3 | set.seed(2) 4 | miss_mtcars <- mtcars 5 | 6 | 7 | 8 | some_rows <- sample(1:nrow(miss_mtcars), 7) 9 | miss_mtcars$drat[some_rows] <- NA 10 | some_rows <- sample(1:nrow(miss_mtcars), 5) 11 | miss_mtcars$mpg[some_rows] <- NA 12 | some_rows <- sample(1:nrow(miss_mtcars), 5) 13 | miss_mtcars$cyl[some_rows] <- NA 14 | some_rows <- sample(1:nrow(miss_mtcars), 3) 15 | miss_mtcars$wt[some_rows] <- NA 16 | some_rows <- 
sample(1:nrow(miss_mtcars), 3) 17 | miss_mtcars$vs[some_rows] <- NA 18 | 19 | 20 | 21 | only_automatic <- which(miss_mtcars$am==0) 22 | some_rows <- sample(only_automatic, 4) 23 | miss_mtcars$qsec[some_rows] <- NA 24 | 25 | 26 | 27 | 28 | 29 | miss_mtcars 30 | mpg cyl disp hp drat wt qsec vs am gear carb 31 | Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 32 | Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 33 | Datsun 710 22.8 4 108.0 93 3.85 NA 18.61 1 1 4 1 34 | Hornet 4 Drive 21.4 6 258.0 110 NA 3.215 19.44 1 0 3 1 35 | Hornet Sportabout 18.7 8 360.0 175 NA 3.440 17.02 0 0 3 2 36 | Valiant 18.1 NA 225.0 105 NA 3.460 NA 1 0 3 1 37 | 38 | 39 | 40 | 41 | 42 | 43 | library(mice) 44 | md.pattern(miss_mtcars) 45 | 46 | disp hp am gear carb wt vs qsec mpg cyl drat 47 | 12 1 1 1 1 1 1 1 1 1 1 1 0 48 | 4 1 1 1 1 1 1 1 1 0 1 1 1 49 | 2 1 1 1 1 1 1 1 1 1 0 1 1 50 | 3 1 1 1 1 1 1 1 1 1 1 0 1 51 | 3 1 1 1 1 1 0 1 1 1 1 1 1 52 | 2 1 1 1 1 1 1 1 0 1 1 1 1 53 | 1 1 1 1 1 1 1 1 1 0 1 0 2 54 | 1 1 1 1 1 1 1 1 0 1 0 1 2 55 | 1 1 1 1 1 1 1 0 1 1 0 1 2 56 | 2 1 1 1 1 1 1 0 1 1 1 0 2 57 | 1 1 1 1 1 1 1 1 0 1 0 0 3 58 | 0 0 0 0 0 3 3 4 5 5 7 27 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | library(VIM) 69 | aggr(miss_mtcars, numbers=TRUE) 70 | 71 | 72 | 73 | 74 | 75 | 76 | Complete case analysis 77 | 78 | mean(miss_mtcars$drat) 79 | [1] NA 80 | mean(miss_mtcars$drat, na.rm=TRUE) 81 | [1] 3.63 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | listwise_model <- lm(mpg ~ am + wt + qsec, 92 | data=miss_mtcars, 93 | na.action = na.omit) 94 | ## OR 95 | # complete.cases returns a boolean vector 96 | comp <- complete.cases(cbind(miss_mtcars$mpg, 97 | miss_mtcars$am, 98 | miss_mtcars$wt, 99 | miss_mtcars$qsec)) 100 | comp_mtcars <- mtcars[comp,] 101 | listwise_model <- lm(mpg ~ am + wt + qsec, 102 | data=comp_mtcars) 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | Mean substitution 111 | 112 | mean_sub <- miss_mtcars 113 | mean_sub$qsec[is.na(mean_sub$qsec)] <- mean(mean_sub$qsec, 114 | na.rm=TRUE) 115 | # etc... 116 | 117 | 118 | 119 | 120 | 121 | Multiple imputation in practice 122 | 123 | 124 | # we are going to set the seed and printFlag to FALSE, but 125 | # everything else will the default argument 126 | imp <- mice(miss_mtcars, seed=3, printFlag=FALSE) 127 | print(imp) 128 | ------------------------------ 129 | Multiply imputed data set 130 | Call: 131 | mice(data = miss_mtcars, printFlag = FALSE, seed = 3) 132 | Number of multiple imputations: 5 133 | Missing cells per column: 134 | mpg cyl disp hp drat wt qsec vs am gear carb 135 | 5 5 0 0 7 3 4 3 0 0 0 136 | 137 | 138 | Imputation methods: 139 | mpg cyl disp hp drat wt qsec vs am gear carb 140 | "pmm" "pmm" "" "" "pmm" "pmm" "pmm" "pmm" "" "" "" 141 | VisitSequence: 142 | mpg cyl drat wt qsec vs 143 | 1 2 5 6 7 8 144 | 145 | 146 | PredictorMatrix: 147 | mpg cyl disp hp drat wt qsec vs am gear carb 148 | mpg 0 1 1 1 1 1 1 1 1 1 1 149 | cyl 1 0 1 1 1 1 1 1 1 1 1 150 | disp 0 0 0 0 0 0 0 0 0 0 0 151 | ... 
152 | Random generator seed value: 3 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | # convert categorical variables into factors 163 | miss_mtcars$vs <- factor(miss_mtcars$vs) 164 | miss_mtcars$cyl <- factor(miss_mtcars$cyl) 165 | imp <- mice(miss_mtcars, m=20, seed=3, printFlag=FALSE) 166 | imp$method 167 | ------------------------------------- 168 | mpg cyl disp hp drat 169 | "pmm" "polyreg" "" "" "pmm" 170 | wt qsec vs am gear 171 | "pmm" "pmm" "logreg" "" "" 172 | carb 173 | "" 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | imp$imp$mpg[,1:6] 183 | ------------------------------------ 184 | 1 2 3 4 5 6 185 | Duster 360 19.2 16.4 17.3 15.5 15.0 19.2 186 | Cadillac Fleetwood 15.2 13.3 15.0 13.3 10.4 17.3 187 | Chrysler Imperial 10.4 15.0 15.0 16.4 10.4 10.4 188 | Porsche 914-2 27.3 22.8 21.4 22.8 21.4 15.5 189 | Ferrari Dino 19.2 21.4 19.2 15.2 18.1 19.2 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | densityplot(imp) 198 | 199 | 200 | 201 | 202 | 203 | imp_models <- with(imp, lm(mpg ~ am + wt + qsec)) 204 | 205 | 206 | 207 | 208 | lapply(imp_models$analyses, coef) 209 | --------------------------------- 210 | [[1]] 211 | (Intercept) am wt qsec 212 | 18.1534095 2.0284014 -4.4054825 0.8637856 213 | [[2]] 214 | (Intercept) am wt qsec 215 | 8.375455 3.336896 -3.520882 1.219775 216 | [[3]] 217 | (Intercept) am wt qsec 218 | 5.254578 3.277198 -3.233096 1.337469 219 | ......... 220 | 221 | 222 | 223 | 224 | 225 | 226 | pooled_model <- pool(imp_models) 227 | summary(pooled_model) 228 | ---------------------------------- 229 | est se t df Pr(>|t|) 230 | (Intercept) 7.049781 9.2254581 0.764166 17.63319 0.454873254 231 | am 3.182049 1.7445444 1.824000 21.36600 0.082171407 232 | wt -3.413534 0.9983207 -3.419276 14.99816 0.003804876 233 | qsec 1.270712 0.3660131 3.471765 19.93296 0.002416595 234 | lo 95 hi 95 nmis fmi lambda 235 | (Intercept) -12.3611281 26.460690 NA 0.3459197 0.2757138 236 | am -0.4421495 6.806247 0 0.2290359 0.1600952 237 | wt -5.5414268 -1.285641 3 0.4324828 0.3615349 238 | qsec 0.5070570 2.034366 4 0.2736026 0.2042003 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | -------------------------------------------------------------------------------- /Chapter14/Chapter14.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Analysis-with-R-Second-Edition/ece3101ca2f43f725c13a67d27810297bd31142a/Chapter14/Chapter14.txt -------------------------------------------------------------------------------- /Chapter14/Chapter15.txt: -------------------------------------------------------------------------------- 1 | Allocation of memory 2 | 3 | 4 | set.seed(1) 5 | all.us.women <- rnorm(10000, mean=65, sd=3.5) 6 | means.of.our.samples.bad <- c(1) 7 | # I'm increasing the number of 8 | # samples to 30,000 to prove a point 9 | for(i in 1:30000){ 10 | a.sample <- sample(all.us.women, 40) 11 | means.of.our.samples.bad[i] <- mean(a.sample) 12 | } 13 | 14 | 15 | 16 | means.of.our.samples.bad <- c(1) 17 | system.time( 18 | for(i in 1:30000){ 19 | a.sample <- sample(all.us.women, 40) 20 | means.of.our.samples.bad[i] <- mean(a.sample) 21 | } 22 | ) 23 | means.of.our.samples.good <- numeric(30000) 24 | system.time( 25 | for(i in 1:30000){ 26 | a.sample <- sample(all.us.women, 40) 27 | means.of.our.samples[i] <- mean(a.sample) 28 | } 29 | ) 30 | ------------------------------------- 31 | user system elapsed 32 | 2.024 0.431 2.465 33 | user system elapsed 34 | 0.678 0.004 0.684 35 | 36 | 37 
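Preallocating the result vector is the habit to build, but the same simulation can also be written so that R allocates the result for us. The snippet below is not part of the chapter's code; it is a sketch that assumes the all.us.women vector created above and uses base R's replicate(), which evaluates an expression a fixed number of times and returns the collected results, so there is no vector to grow at all:

# a sketch, not from the chapter: replicate() evaluates the expression
# 30,000 times and returns the collected means as a numeric vector,
# so no manual preallocation (or growing) is needed
set.seed(1)
means.of.our.samples.replicate <- replicate(30000,
                                            mean(sample(all.us.women, 40)))
head(means.of.our.samples.replicate)
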
| 38 | 39 | 40 | 41 | 42 | 43 | Vectorization 44 | 45 | system.time( 46 | for(i in 1:length(all.us.women)) 47 | all.us.women[i] ^ 2 48 | ) 49 | -------------------------- 50 | user system elapsed 51 | 0.003 0.000 0.003 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | system.time( 60 | sapply(all.us.women, function(x) x^2) 61 | ) 62 | ----------------------- 63 | user system elapsed 64 | 0.006 0.000 0.006 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | > system.time( 76 | + vapply(all.us.women, function(x) x^2, numeric(1)) 77 | + ) 78 | ------------------------- 79 | user system elapsed 80 | 0.006 0.000 0.005 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | system.time( 92 | all.us.women ^ 2 93 | ) 94 | ---------------------- 95 | user system elapsed 96 | 0 0 0 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | Getting started with parallel R 112 | 113 | 114 | # You don't have to install this if your copy of R is new 115 | library(parallel) 116 | 117 | 118 | library(parallel) 119 | cl <- makeCluster(4) 120 | 121 | 122 | detectCores() 123 | ------------------------ 124 | [1] 4 125 | 126 | 127 | 128 | 129 | for(i in 1:4){ 130 | Sys.sleep(5) 131 | } 132 | Or, equivalently, using lapply: 133 | # lapply will pass each element of the 134 | # vector c(1, 2, 3, 4) to the function 135 | # we write but we'll ignore it 136 | lapply(1:4, function(i) Sys.sleep(5)) 137 | 138 | 139 | 140 | 141 | 142 | system.time( 143 | lapply(1:4, function(i) Sys.sleep(5)) 144 | ) 145 | ---------------------------------------- 146 | user system elapsed 147 | 0.059 0.074 20.005 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | ####################### 157 | # NON-WINDOWS VERSION # 158 | ####################### 159 | system.time( 160 | mclapply(1:4, function(i) Sys.sleep(5), mc.cores=4) 161 | ) 162 | ################### 163 | # WINDOWS VERSION # 164 | ################### 165 | system.time( 166 | parLapply(cl, 1:4, function(i) Sys.sleep(5)) 167 | ) 168 | ---------------------------------------- 169 | user system elapsed 170 | 0.021 0.042 5.013 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | An example of (some) substance 182 | 183 | 184 | 185 | 186 | haversine <- function(lat1, long1, lat2, long2, unit="km"){ 187 | radius <- 6378 # radius of Earth in kilometers 188 | delta.phi <- to.radians(lat2 - lat1) 189 | delta.lambda <- to.radians(long2 - long1) 190 | phi1 <- to.radians(lat1) 191 | phi2 <- to.radians(lat2) 192 | term1 <- sin(delta.phi/2) ^ 2 193 | term2 <- cos(phi1) * cos(phi2) * sin(delta.lambda/2) ^ 2 194 | the.terms <- term1 + term2 195 | delta.sigma <- 2 * atan2(sqrt(the.terms), sqrt(1-the.terms)) 196 | distance <- radius * delta.sigma 197 | if(unit=="km") return(distance) 198 | if(unit=="miles") return(0.621371*distance) 199 | 200 | 201 | 202 | 203 | 204 | to.radians <- function(degrees){ 205 | degrees * pi / 180 206 | } 207 | 208 | 209 | 210 | 211 | 212 | 213 | set.seed(1) 214 | the.url <- 215 | "http://opendata.socrata.com/api/views/rxrh-4cxm/rows.csv?accessType=DOWNLO 216 | AD" 217 | all.airport.locs <- read.csv(the.url, stringsAsFactors=FALSE) 218 | library(magrittr) 219 | library(assertr) 220 | CHECKS <- . %>% 221 | verify(nrow(.) == 13429) %>% 222 | verify(names(.) 
%in% c("locationID", "Latitude", "Longitude")) %>% 223 | assert(within_bounds(0, 90), Latitude) %>% 224 | assert(within_bounds(0,180), Longitude) 225 | all.airport.locs <- CHECKS(all.airport.locs) 226 | # Let's start off with 400 airports 227 | smp.size <- 400 228 | # choose a random sample of airports 229 | random.sample <- sample((1:nrow(all.airport.locs)), smp.size) 230 | airport.locs <- all.airport.locs[random.sample, ] 231 | row.names(airport.locs) <- NULL 232 | head(airport.locs) 233 | ------------------------------------- 234 | locationID Latitude Longitude 235 | 1 LWV 38.7642 87.6056 236 | 2 LS77 30.7272 91.1486 237 | 3 2N2 43.5919 71.7514 238 | 4 VG00 37.3697 75.9469 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | single.core <- function(airport.locs){ 247 | running.sum <- 0 248 | for(i in 1:(nrow(airport.locs)-1)){ 249 | for(j in (i+1):nrow(airport.locs)){ 250 | # i is the row of the first lat/long pair 251 | # j is the row of the second lat/long pair 252 | this.dist <- haversine(airport.locs[i, 2], 253 | airport.locs[i, 3], 254 | airport.locs[j, 2], 255 | airport.locs[j, 3]) 256 | running.sum <- running.sum + this.dist 257 | } 258 | } 259 | # Now we have to divide by the number of 260 | # distances we took. This is given by 261 | return(running.sum / 262 | ((nrow(airport.locs)*(nrow(airport.locs)-1))/2)) 263 | } 264 | # Now, let's time it! 265 | 266 | ---------------------------- 267 | user system elapsed 268 | 5.400 0.034 5.466 269 | [1] 1667.186 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | # We'll have to limit the output to the 278 | # first 11 columns 279 | combn(1:10, 2)[,1:11] 280 | ---------------------------------------- 281 | [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] 282 | [1,] 1 1 1 1 1 1 1 1 1 283 | [2,] 2 3 4 5 6 7 8 9 10 284 | [,10] [,11] 285 | [1,] 2 2 286 | [2,] 3 4 287 | 288 | 289 | 290 | 291 | small.world <- c("LAX", "ALB", "OLM", "JFK") 292 | all.combs <- combn(1:length(small.world), 2) 293 | for(i in 1:ncol(all.combs)){ 294 | from <- small.world[all.combs[1, i]] 295 | to <- small.world[all.combs[2, i]] 296 | print(paste(from, " <-> ", to)) 297 | } 298 | ---------------------------------------- 299 | [1] "LAX <-> ALB" 300 | [1] "LAX <-> OLM" 301 | [1] "LAX <-> JFK" 302 | [1] "ALB <-> OLM" # back to olympia 303 | [1] "ALB <-> JFK" 304 | [1] "OLM <-> JFK" 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | small.world <- c("LAX", "ALB", "OLM", "JFK") 313 | all.combs <- combn(1:length(small.world), 2) 314 | # instead of printing each airport pair in a string, 315 | # we'll return the string 316 | results <- lapply(1:ncol(all.combs), function(x){ 317 | from <- small.world[all.combs[1, x]] 318 | to <- small.world[all.combs[2, x]] 319 | return(paste(from, " <-> ", to)) 320 | }) 321 | print(results) 322 | ------------------------- 323 | [[1]] 324 | [1] "LAX <-> ALB" 325 | [[2]] 326 | [1] "LAX <-> OLM" 327 | [[3]] 328 | [1] "LAX <-> JFK" 329 | ........ 
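Before flattening this list in the next step, note, as an aside that the chapter does not use, that combn() can also apply a function to each combination directly through its FUN argument, which removes the need for a separate lapply() over column indices:

# an aside, not from the chapter: combn() hands each pair (a length-2
# vector) to FUN and simplifies the results, so this returns the same
# six "<->" labels as a plain character vector
combn(small.world, 2,
      FUN = function(pair) paste(pair[1], " <-> ", pair[2]))
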
330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | unlist(results) 341 | --------------------- 342 | [1] "LAX <-> ALB" "LAX <-> OLM" "LAX <-> JFK" 343 | [4] "ALB <-> OLM" "ALB <-> JFK" "OLM <-> JFK" 344 | 345 | 346 | 347 | 348 | single.core.lapply <- function(airport.locs){ 349 | all.combs <- combn(1:nrow(airport.locs), 2) 350 | numcombs <- ncol(all.combs) 351 | results <- lapply(1:numcombs, function(x){ 352 | lat1 <- airport.locs[all.combs[1, x], 2] 353 | long1 <- airport.locs[all.combs[1, x], 3] 354 | lat2 <- airport.locs[all.combs[2, x], 2] 355 | long2 <- airport.locs[all.combs[2, x], 3] 356 | return(haversine(lat1, long1, lat2, long2)) 357 | }) 358 | return(sum(unlist(results)) / numcombs) 359 | } 360 | system.time(ave.dist <- single.core.lapply(airport.locs)) 361 | print(ave.dist) 362 | --------------------------------------- 363 | user system elapsed 364 | 5.890 0.042 5.968 365 | [1] 1667.186 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | ####################### 377 | # NON-WINDOWS VERSION # 378 | ####################### 379 | multi.core <- function(airport.locs){ 380 | all.combs <- combn(1:nrow(airport.locs), 2) 381 | numcombs <- ncol(all.combs) 382 | results <- mclapply(1:numcombs, function(x){ 383 | lat1 <- airport.locs[all.combs[1, x], 2] 384 | long1 <- airport.locs[all.combs[1, x], 3] 385 | lat2 <- airport.locs[all.combs[2, x], 2] 386 | long2 <- airport.locs[all.combs[2, x], 3] 387 | return(haversine(lat1, long1, lat2, long2)) 388 | }, mc.cores=4) 389 | return(sum(unlist(results)) / numcombs) 390 | } 391 | ################### 392 | # WINDOWS VERSION # 393 | ################### 394 | clusterExport(cl, c("haversine", "to.radians")) 395 | 396 | multi.core <- function(airport.locs){ 397 | all.combs <- combn(1:nrow(airport.locs), 2) 398 | numcombs <- ncol(all.combs) 399 | results <- parLapply(cl, 1:numcombs, function(x){ 400 | lat1 <- airport.locs[all.combs[1, x], 2] 401 | long1 <- airport.locs[all.combs[1, x], 3] 402 | lat2 <- airport.locs[all.combs[2, x], 2] 403 | long2 <- airport.locs[all.combs[2, x], 3] 404 | return(haversine(lat1, long1, lat2, long2)) 405 | }) 406 | return(sum(unlist(results)) / numcombs) 407 | } 408 | system.time(ave.dist <- multi.core(airport.locs)) 409 | print(ave.dist) 410 | ------------------------------- 411 | user system elapsed 412 | 7.363 0.240 2.743 413 | [1] 1667.186 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | Using Rcpp 422 | 423 | 424 | #include 425 | // [[Rcpp::export]] 426 | double square(double number){ 427 | return(pow(number, 2)); 428 | } 429 | 430 | 431 | 432 | library(Rcpp) 433 | sourceCpp("our_cpp_functions.cpp") 434 | square(3) 435 | -------------------------------- 436 | [1] 9 437 | 438 | 439 | 440 | 441 | square <- function(number){ 442 | return(number^2) 443 | } 444 | 445 | 446 | 447 | 448 | /* Add this (and all other snippets that 449 | start with "// [[Rcpp::export]]") 450 | to the C++ file, not the R code. 
*/ 451 | // [[Rcpp::export]] 452 | double to_radians_cpp(double degrees){ 453 | return(degrees * 3.141593 / 180); 454 | } 455 | # with goes with our R code 456 | sourceCpp("our_cpp_functions.cpp") 457 | to_radians_cpp(10) 458 | ------------------------- 459 | [1] 0.174533 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | // [[Rcpp::export]] 472 | double haversine_cpp(double lat1, double long1, 473 | double lat2, double long2, 474 | std::string unit="km"){ 475 | int radius = 6378; 476 | double delta_phi = to_radians_cpp(lat2 - lat1); 477 | double delta_lambda = to_radians_cpp(long2 - long1); 478 | double phi1 = to_radians_cpp(lat1); 479 | double phi2 = to_radians_cpp(lat2); 480 | double term1 = pow(sin(delta_phi / 2), 2); 481 | double term2 = cos(phi1) * cos(phi2) 482 | term2 = term2 * pow(sin(delta_lambda/2), 2); 483 | double the_terms = term1 + term2; 484 | double delta_sigma = 2 * atan2(sqrt(the_terms), 485 | sqrt(1-the_terms)); 486 | double distance = radius * delta_sigma; 487 | /* if it is anything *but* km it is miles */ 488 | if(unit != "km"){ 489 | return(distance*0.621371); 490 | } 491 | return(distance); 492 | } 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | sourceCpp("our_cpp_functions.cpp") 503 | haversine(51.88, 176.65, 56.94, 154.18) 504 | haversine_cpp(51.88, 176.65, 56.94, 154.18) 505 | ---------------------------------------------- 506 | [1] 1552.079 507 | [1] 1552.079 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | // [[Rcpp::export]] 517 | double sum2(Rcpp::NumericVector a_vector){ 518 | double running_sum = 0; 519 | int length = a_vector.size(); 520 | for( int i = 0; i < length; i++ ){ 521 | running_sum = running_sum + a_vector(i); 522 | } 523 | return(running_sum); 524 | } 525 | 526 | 527 | 528 | 529 | 530 | 531 | // [[Rcpp::export]] 532 | double single_core_cpp(Rcpp::NumericMatrix mat){ 533 | int nrows = mat.nrow(); 534 | int numcomps = nrows*(nrows-1)/2; 535 | double running_sum = 0; 536 | for( int i = 0; i < nrows; i++ ){ 537 | for( int j = i+1; j < nrows; j++){ 538 | double this_dist = haversine_cpp(mat(i,0), mat(i,1), 539 | mat(j,0), mat(j,1)); 540 | running_sum = running_sum + this_dist; 541 | } 542 | } 543 | return running_sum / numcomps; 544 | } 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | sourceCpp("our_cpp_functions.cpp") 558 | the.matrix <- as.matrix(all.airport.locs[,-1]) 559 | system.time(ave.dist <- single_core_cpp(the.matrix)) 560 | print(ave.dist) 561 | ---------------------------------------- 562 | user system elapsed 563 | 0.012 0.000 0.012 564 | [1] 1667.186 565 | 566 | 567 | the.matrix <- as.matrix(all.airport.locs[,-1]) 568 | system.time(ave.dist <- single_core_cpp(the.matrix)) 569 | print(ave.dist) 570 | ------------------------------- 571 | user system elapsed 572 | 12.310 0.080 12.505 573 | [1] 1869.744 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | Being smarter about your code 584 | 585 | 586 | single.core.improved <- function(airport.locs){ 587 | numrows <- nrow(airport.locs) 588 | running.sum <- 0 589 | 590 | for (i in 1:(numrows-1)) { 591 | this.dist <- sum(haversine(airport.locs[i,2], 592 | airport.locs[i, 3], 593 | airport.locs[(i+1):numrows, 2], 594 | airport.locs[(i+1):numrows, 3])) 595 | running.sum <- running.sum + this.dist 596 | } 597 | return(running.sum / (numrows*(numrows-1)/2)) 598 | } 599 | system.time(ave.dist <- single.core.improved(all.airport.locs)) 600 | print(ave.dist) 601 | 
------------------------------------------------------------------ 602 | user system elapsed 603 | 15.537 0.173 15.866 604 | [1] 1869.744 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | -------------------------------------------------------------------------------- /Chapter15/chapter12.R: -------------------------------------------------------------------------------- 1 | # You don't have to install this if you copy of R is new 2 | library(parallel) 3 | 4 | detectCores() 5 | 6 | system.time( 7 | lapply(1:4, function(i) Sys.sleep(5)) 8 | ) 9 | 10 | ####################### 11 | # NON-WINDOWS VERSION # 12 | ####################### 13 | system.time( 14 | mclapply(1:4, function(i) Sys.sleep(5), mc.cores=4) 15 | ) 16 | 17 | ################### 18 | # WINDOWS VERSION # 19 | ################### 20 | #system.time( 21 | # parLapply(cl, 1:4, function(i) Sys.sleep(5)) 22 | #) 23 | 24 | # ALL OUTPUT SHOWN WILL BE FOR A 25 | # NON-WINDOWS COMPUTER WITH 4 CORES 26 | 27 | 28 | haversine <- function(lat1, long1, lat2, long2, unit="km"){ 29 | radius <- 6378 # radius of Earth in kilometers 30 | delta.phi <- to.radians(lat2 - lat1) 31 | delta.lambda <- to.radians(long2 - long1) 32 | phi1 <- to.radians(lat1) 33 | phi2 <- to.radians(lat2) 34 | term1 <- sin(delta.phi/2) ^ 2 35 | term2 <- cos(phi1) * cos(phi2) * sin(delta.lambda/2) ^ 2 36 | the.terms <- term1 + term2 37 | delta.sigma <- 2 * atan2(sqrt(the.terms), sqrt(1-the.terms)) 38 | distance <- radius * delta.sigma 39 | if(unit=="km") return(distance) 40 | if(unit=="miles") return(0.621371*distance) 41 | } 42 | 43 | to.radians <- function(degrees){ 44 | degrees * pi / 180 45 | } 46 | 47 | 48 | 49 | 50 | set.seed(1) 51 | 52 | all.airport.locs <- read.csv("http://opendata.socrata.com/api/views/rxrh-4cxm/rows.csv?accessType=DOWNLOAD", 53 | stringsAsFactors=FALSE) 54 | 55 | library(magrittr) 56 | library(assertr) 57 | CHECKS <- . %>% 58 | verify(nrow(.) == 13429) %>% 59 | verify(names(.) %in% c("locationID", "Latitude", "Longitude")) %>% 60 | assert(within_bounds(0, 90), Latitude) %>% 61 | assert(within_bounds(0,180), Longitude) 62 | 63 | all.airport.locs <- CHECKS(all.airport.locs) 64 | 65 | # Let's start off with 400 airports 66 | smp.size <- 400 67 | 68 | # choose a random sample of airports 69 | random.sample <- sample((1:nrow(all.airport.locs)), smp.size) 70 | airport.locs <- all.airport.locs[random.sample, ] 71 | row.names(airport.locs) <- NULL 72 | 73 | head(airport.locs) 74 | 75 | 76 | single.core <- function(airport.locs){ 77 | running.sum <- 0 78 | for(i in 1:(nrow(airport.locs)-1)){ 79 | for(j in (i+1):nrow(airport.locs)){ 80 | # i is the row of the first lat/long pair 81 | # j is the row of the second lat/long pair 82 | this.dist <- haversine(airport.locs[i, 2], 83 | airport.locs[i, 3], 84 | airport.locs[j, 2], 85 | airport.locs[j, 3]) 86 | running.sum <- running.sum + this.dist 87 | } 88 | } 89 | # Now we have to divide by the number of 90 | # distances we took. 
This is given by 91 | return(running.sum / 92 | ((nrow(airport.locs)*(nrow(airport.locs)-1))/2)) 93 | } 94 | 95 | system.time(ave.dist <- single.core(airport.locs)) 96 | print(ave.dist) 97 | 98 | 99 | combn(1:10, 2)[,1:11] 100 | 101 | 102 | small.world <- c("LAX", "ALB", "OLM", "JFK") 103 | all.combs <- combn(1:length(small.world), 2) 104 | 105 | for(i in 1:ncol(all.combs)){ 106 | from <- small.world[all.combs[1, i]] 107 | to <- small.world[all.combs[2, i]] 108 | print(paste(from, " <-> ", to)) 109 | } 110 | 111 | 112 | small.world <- c("LAX", "ALB", "OLM", "JFK") 113 | all.combs <- combn(1:length(small.world), 2) 114 | 115 | # instead of printing each airport pair in a string, 116 | # we'll return the string 117 | results <- lapply(1:ncol(all.combs), function(x){ 118 | from <- small.world[all.combs[1, x]] 119 | to <- small.world[all.combs[2, x]] 120 | #from <- small.world[all.combs[i, 1]] 121 | #to <- small.world[all.combs[i, 2]] 122 | return(paste(from, " <-> ", to)) 123 | }) 124 | 125 | print(results) 126 | 127 | unlist(results) 128 | 129 | single.core.lapply <- function(airport.locs){ 130 | all.combs <- combn(1:nrow(airport.locs), 2) 131 | numcombs <- ncol(all.combs) 132 | results <- lapply(1:numcombs, function(x){ 133 | lat1 <- airport.locs[all.combs[1, x], 2] 134 | long1 <- airport.locs[all.combs[1, x], 3] 135 | lat2 <- airport.locs[all.combs[2, x], 2] 136 | long2 <- airport.locs[all.combs[2, x], 3] 137 | return(haversine(lat1, long1, lat2, long2)) 138 | }) 139 | return(sum(unlist(results)) / numcombs) 140 | } 141 | 142 | system.time(ave.dist <- single.core.lapply(airport.locs)) 143 | print(ave.dist) 144 | 145 | 146 | ####################### 147 | # NON-WINDOWS VERSION # 148 | ####################### 149 | multi.core <- function(airport.locs){ 150 | all.combs <- combn(1:nrow(airport.locs), 2) 151 | numcombs <- ncol(all.combs) 152 | results <- mclapply(1:numcombs, function(x){ 153 | lat1 <- airport.locs[all.combs[1, x], 2] 154 | long1 <- airport.locs[all.combs[1, x], 3] 155 | lat2 <- airport.locs[all.combs[2, x], 2] 156 | long2 <- airport.locs[all.combs[2, x], 3] 157 | return(haversine(lat1, long1, lat2, long2)) 158 | }, mc.cores=4) 159 | return(sum(unlist(results)) / numcombs) 160 | } 161 | 162 | ################### 163 | # WINDOWS VERSION # 164 | ################### 165 | #clusterExport(cl, c("haversine", "to.radians")) 166 | # 167 | #multi.core <- function(airport.locs){ 168 | # all.combs <- combn(1:nrow(airport.locs), 2) 169 | # numcombs <- ncol(all.combs) 170 | # results <- parLapply(cl, 1:numcombs, function(x){ 171 | # lat1 <- airport.locs[all.combs[1, x], 2] 172 | # long1 <- airport.locs[all.combs[1, x], 3] 173 | # lat2 <- airport.locs[all.combs[2, x], 2] 174 | # long2 <- airport.locs[all.combs[2, x], 3] 175 | # return(haversine(lat1, long1, lat2, long2)) 176 | # }) 177 | # return(sum(unlist(results)) / numcombs) 178 | #} 179 | 180 | system.time(ave.dist <- multi.core(airport.locs)) 181 | print(ave.dist) 182 | 183 | library(Rcpp) 184 | 185 | sourceCpp("our_cpp_functions.cpp") 186 | 187 | square(3) 188 | 189 | 190 | sourceCpp("our_cpp_functions.cpp") 191 | to_radians_cpp(10) 192 | 193 | 194 | sourceCpp("our_cpp_functions.cpp") 195 | 196 | haversine(51.88, 176.65, 56.94, 154.18) 197 | haversine_cpp(51.88, 176.65, 56.94, 154.18) 198 | 199 | 200 | sourceCpp("our_cpp_functions.cpp") 201 | 202 | system.time(ave.dist <- single_core_cpp(as.matrix(airport.locs[,-1]))) 203 | print(ave.dist) 204 | 205 | 206 | 207 | the.matrix <- as.matrix(all.airport.locs[,-1]) 208 | system.time(ave.dist 
<- single_core_cpp(the.matrix)) 209 | print(ave.dist) 210 | 211 | 212 | 213 | 214 | single.core.improved <- function(airport.locs){ 215 | 216 | numrows <- nrow(airport.locs) 217 | 218 | running.sum <- 0 219 | for (i in 1:(numrows-1)) { 220 | this.dist <- sum(haversine(airport.locs[i,2], 221 | airport.locs[i, 3], 222 | airport.locs[(i+1):numrows, 2], 223 | airport.locs[(i+1):numrows, 3])) 224 | running.sum <- running.sum + this.dist 225 | } 226 | return(running.sum / (numrows*(numrows-1)/2)) 227 | } 228 | 229 | system.time(ave.dist <- single.core.improved(all.airport.locs)) 230 | print(ave.dist) 231 | 232 | -------------------------------------------------------------------------------- /Chapter15/our_cpp_functions.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // [[Rcpp::export]] 4 | double square(double number){ 5 | return(pow(number, 2)); 6 | } 7 | 8 | // [[Rcpp::export]] 9 | double to_radians_cpp(double degrees){ 10 | return(degrees * 3.141593 / 180); 11 | } 12 | 13 | 14 | // [[Rcpp::export]] 15 | double sum2(Rcpp::NumericVector a_vector){ 16 | double running_sum = 0; 17 | int length = a_vector.size(); 18 | for( int i = 0; i < length; i++ ){ 19 | running_sum = running_sum + a_vector(i); 20 | } 21 | return(running_sum); 22 | } 23 | 24 | 25 | 26 | // [[Rcpp::export]] 27 | double haversine_cpp(double lat1, double long1, 28 | double lat2, double long2, 29 | std::string unit="km"){ 30 | int radius = 6378; 31 | double delta_phi = to_radians_cpp(lat2 - lat1); 32 | double delta_lambda = to_radians_cpp(long2 - long1); 33 | double phi1 = to_radians_cpp(lat1); 34 | double phi2 = to_radians_cpp(lat2); 35 | double term1 = pow(sin(delta_phi / 2), 2); 36 | double term2 = cos(phi1) * cos(phi2) * pow(sin(delta_lambda/2), 2); 37 | double the_terms = term1 + term2; 38 | double delta_sigma = 2 * atan2(sqrt(the_terms), sqrt(1-the_terms)); 39 | double distance = radius * delta_sigma; 40 | 41 | /* if it is anything *but* km it is miles */ 42 | if(unit != "km"){ 43 | return(distance*0.621371); 44 | } 45 | 46 | return(distance); 47 | } 48 | 49 | 50 | 51 | // [[Rcpp::export]] 52 | double single_core_cpp(Rcpp::NumericMatrix mat){ 53 | int nrows = mat.nrow(); 54 | int numcomps = nrows*(nrows-1)/2; 55 | double running_sum = 0; 56 | for( int i = 0; i < nrows; i++ ){ 57 | for( int j = i+1; j < nrows; j++){ 58 | double this_dist = haversine_cpp(mat(i,0), mat(i,1), 59 | mat(j,0), mat(j,1)); 60 | 61 | running_sum = running_sum + this_dist; 62 | } 63 | } 64 | return running_sum / numcomps; 65 | } 66 | 67 | 68 | -------------------------------------------------------------------------------- /Chapter16/Chapter16.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Analysis-with-R-Second-Edition/ece3101ca2f43f725c13a67d27810297bd31142a/Chapter16/Chapter16.txt -------------------------------------------------------------------------------- /Chapter17/Chapter17.txt: -------------------------------------------------------------------------------- 1 | RStudio 2 | 3 | library(ggplot2) 4 | nothing <- data.frame(a=rbinom(1000, 20, .5), 5 | b=c("red", "white"), 6 | c=rnorm(1000, mean=100, sd=10)) 7 | qplot(c, data=nothing, geom="histogram") 8 | write.csv(nothing, "nothing.csv") 9 | Execute the statements one by one. 
Notice that 10 | 11 | 12 | 13 | Running R scripts 14 | 15 | 16 | R CMD BATCH nothing.R 17 | 18 | R --vanilla CMD BATCH nothing.R 19 | 20 | Rscript nothing.R 21 | 22 | Rscript --vanilla nothing.R 23 | 24 | 25 | 26 | 27 | 28 | An example script 29 | 30 | #!/usr/bin/Rscript --vanilla 31 | ########################################################### 32 | ## ## 33 | ## nyc-sat-scores.R ## 34 | ## ## 35 | ## Author: Tony Fischetti ## 36 | ## tony.fischetti@gmail.com ## 37 | ## ## 38 | ########################################################### 39 | ## 40 | ## Aim: to use Bayesian analysis to compare NYC's 2010 41 | ## combined SAT scores against the average of the 42 | ## rest of the country, which, according to 43 | ## FairTest.com, is 1509 44 | ## 45 | # workspace cleanup 46 | rm(list=ls()) 47 | # options 48 | options(echo=TRUE) 49 | options(stringsAsFactors=FALSE) 50 | # libraries 51 | library(assertr) # for data checking 52 | library(runjags) # for MCMC 53 | # make sure everything is all set with JAGS 54 | testjags() 55 | # yep! 56 | ## read data file 57 | # data was retrieved from NYC Open Data portal 58 | # direct link: 59 | https://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.csv?accessType=DOWNL 60 | OAD 61 | nyc.sats <- read.csv("./data/SAT_Scores_NYC_2010.csv") 62 | # let's give the columns easier names 63 | better.names <- c("id", "school.name", "n", "read.mean", 64 | "math.mean", "write.mean") 65 | names(nyc.sats) <- better.names 66 | # there are 460 rows but almost 700 NYC schools 67 | # we will *assume*, then, that this is a random 68 | # sample of NYC schools 69 | # let's first check the veracity of this data... 70 | #nyc.sats <- assert(nyc.sats, is.numeric, 71 | # n, read.mean, math.mean, write.mean) 72 | # It looks like check failed because there are "s"s for some 73 | # rows. (??) A look at the data set descriptions indicates 74 | # that the "s" is for schools # with 5 or fewer students. 75 | # For our purposes, let's just exclude them. 76 | # This is a function that takes a vector, replaces all "s"s 77 | # with NAs and make coverts all non-"s"s into numerics 78 | remove.s <- function(vec){ 79 | ifelse(vec=="s", NA, vec) 80 | } 81 | nyc.sats$n <- as.numeric(remove.s(nyc.sats$n)) 82 | nyc.sats$read.mean <- as.numeric(remove.s(nyc.sats$read.mean)) 83 | nyc.sats$math.mean <- as.numeric(remove.s(nyc.sats$math.mean)) 84 | nyc.sats$write.mean <- as.numeric(remove.s(nyc.sats$write.mean)) 85 | # Remove schools with fewer than 5 test takers 86 | nyc.sats <- nyc.sats[complete.cases(nyc.sats), ] 87 | # Calculate a total combined SAT score 88 | nyc.sats$combined.mean <- (nyc.sats$read.mean + 89 | nyc.sats$math.mean + 90 | nyc.sats$write.mean) 91 | # Let's build a posterior distribution of the true mean 92 | # of NYC high schools' combined SAT scores. 
93 | # We're not going to look at the summary statistics, because 94 | # we don't want to bias our priors 95 | # Specify a standard gaussian model 96 | the.model <- " 97 | model { 98 | # priors 99 | mu ~ dunif(0, 2400) 100 | stddev ~ dunif(0, 500) 101 | tau <- pow(stddev, -2) 102 | # likelihood 103 | for(i in 1:theLength){ 104 | samp[i] ~ dnorm(mu, tau) 105 | } 106 | }" 107 | the.data <- list( 108 | samp = nyc.sats$combined.mean, 109 | theLength = length(nyc.sats$combined.mean) 110 | ) 111 | results <- autorun.jags(the.model, data=the.data, 112 | n.chains = 3, 113 | monitor = c('mu', 'stddev')) 114 | # View the results of the MCMC 115 | print(results) 116 | # Plot the MCMC diagnostics 117 | plot(results, plot.type=c("histogram", "trace"), layout=c(2,1)) 118 | # Looks good! 119 | # Let's extract the MCMC samples of the mean and get the 120 | # bounds of the middle 95% 121 | results.matrix <- as.matrix(results$mcmc) 122 | mu.samples <- results.matrix[,'mu'] 123 | bounds <- quantile(mu.samples, c(.025, .975)) 124 | # We are 95% sure that the true mean is between 1197 and 1232 125 | # Now let's plot the marginal posterior distribution for the mean 126 | # of the NYC high schools' combined SAT grades and draw the 95% 127 | # percent credible interval. 128 | plot(density(mu.samples), 129 | main=paste("Posterior distribution of mean combined SAT", 130 | "score in NYC high schools (2010)", sep="\n")) 131 | lines(c(bounds[1], bounds[2]), c(0, 0), lwd=3, col="red") 132 | # Given the results, the SAT scores for NYC high schools in 2010 133 | # are *incontrovertibly* not at par with the average SAT scores of 134 | # the nation. 135 | 136 | 137 | 138 | 139 | 140 | Scripting and reproducibility 141 | 142 | > devtools::session_info() 143 | Session info --------------------------------- 144 | setting value 145 | version R version 3.2.1 (2015-06-18) 146 | system x86_64, darwin13.4.0 147 | ui RStudio (0.99.486) 148 | language (EN) 149 | collate en_US.UTF-8 150 | tz America/New_York 151 | date 1969-07-20 152 | Packages ------------------------------------- 153 | package * version date source 154 | assertr * 1.0.0 2015-06-26 CRAN (R 3.2.1) 155 | coda 0.17-1 2015-03-03 CRAN (R 3.2.0) 156 | devtools 1.9.1 2015-09-11 CRAN (R 3.2.0) 157 | digest 0.6.8 2014-12-31 CRAN (R 3.2.0) 158 | lattice 0.20-33 2015-07-14 CRAN (R 3.2.0) 159 | memoise 0.2.1 2014-04-22 CRAN (R 3.2.0) 160 | modeest 2.1 2012-10-15 CRAN (R 3.2.0) 161 | 162 | 163 | rjags 3-15 2015-04-15 CRAN (R 3.2.0) 164 | runjags * 2.0.2-8 2015-09-14 CRAN (R 3.2.0) 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | R projects 173 | 174 | read.csv("/Users/bensisko/Desktop/SAT_Scores_NYC_2010.csv") 175 | 176 | 177 | #!/usr/bin/Rscript --vanilla 178 | source("./code/load-and-clean-sat-data.R") 179 | source("./code/analyze-sat-data.R") 180 | 181 | 182 | 183 | 184 | Communicating results 185 | 186 | 187 | --- 188 | title: "NYC SAT Scores Analysis" 189 | author: "Tony Fischetti" 190 | date: "November 1, 2015" 191 | output: html_document 192 | --- 193 | #### Aim: 194 | To use Bayesian analysis to compare NYC's 2010 195 | combined SAT scores against the average of the 196 | rest of the country, which, according to 197 | FairTest.com, is 1509 198 | ```{r, echo=FALSE} 199 | # options 200 | options(echo=TRUE) 201 | options(stringsAsFactors=FALSE) 202 | ``` 203 | We are going to use the `assertr` and `runjags` 204 | packages for data checking and MCMC, respectively. 
205 | ```{r}
206 | # libraries
207 | library(assertr)   # for data checking
208 | library(runjags)   # for MCMC
209 | ```
210 | Let's make sure everything is all set with JAGS!
211 | ```{r}
212 | testjags()
213 | ...
214 | ```
215 | Great!
216 | This data was found in the NYC Open Data Portal:
217 | https://nycopendata.socrata.com
218 | ```{r}
219 | link.to.data <-
220 |   "http://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.csv?accessType=DOWNLOAD"
221 | 
222 | download.file(link.to.data, "./data/SAT_Scores_NYC_2010.csv")
223 | nyc.sats <- read.csv("./data/SAT_Scores_NYC_2010.csv")
224 | ```
225 | Let's give the columns easier names
226 | ```{r}
227 | better.names <- c("id", "school.name", "n", "read.mean",
228 |                   "math.mean", "write.mean")
229 | names(nyc.sats) <- better.names
230 | ```
231 | There are `r nrow(nyc.sats)` rows but almost 700 NYC schools. We will,
232 | therefore, *assume* that this is a random sample of NYC schools.
233 | Let's first check the veracity of this data...
234 | ```{r, error=TRUE}
235 | nyc.sats <- assert(nyc.sats, is.numeric,
236 |                    n, read.mean, math.mean, write.mean)
237 | ```
238 | It looks like the check failed because there are "s"s for some rows. (??)
239 | A look at the data set descriptions indicates that the "s" is for schools
240 | with 5 or fewer students. For our purposes, let's just exclude them.
241 | This is a function that takes a vector, replaces all "s"s
242 | with NAs and converts all non-"s"s into numerics
243 | ```{r}
244 | remove.s <- function(vec){
245 |   ifelse(vec=="s", NA, vec)
246 | }
247 | nyc.sats$n <- as.numeric(remove.s(nyc.sats$n))
248 | nyc.sats$read.mean <- as.numeric(remove.s(nyc.sats$read.mean))
249 | nyc.sats$math.mean <- as.numeric(remove.s(nyc.sats$math.mean))
250 | nyc.sats$write.mean <- as.numeric(remove.s(nyc.sats$write.mean))
251 | ```
252 | 
253 | 
254 | Now we are going to remove schools with fewer than 5 test takers
255 | and calculate a combined SAT score
256 | ```{r}
257 | nyc.sats <- nyc.sats[complete.cases(nyc.sats), ]
258 | # Calculate a total combined SAT score
259 | nyc.sats$combined.mean <- (nyc.sats$read.mean +
260 |                            nyc.sats$math.mean +
261 |                            nyc.sats$write.mean)
262 | ```
263 | Let's now build a posterior distribution of the true mean of NYC high
264 | schools' combined SAT scores. We're not going to look at the summary
265 | statistics, because we don't want to bias our priors.
266 | We will use a standard gaussian model.
267 | ```{r, cache=TRUE, results="hide", warning=FALSE, message=FALSE}
268 | the.model <- "
269 | model {
270 |   # priors
271 |   mu ~ dunif(0, 2400)
272 |   stddev ~ dunif(0, 500)
273 |   tau <- pow(stddev, -2)
274 |   # likelihood
275 |   for(i in 1:theLength){
276 |     samp[i] ~ dnorm(mu, tau)
277 |   }
278 | }"
279 | the.data <- list(
280 |   samp = nyc.sats$combined.mean,
281 |   theLength = length(nyc.sats$combined.mean)
282 | )
283 | results <- autorun.jags(the.model, data=the.data,
284 |                         n.chains = 3,
285 |                         monitor = c('mu'))
286 | ```
287 | Let's view the results of the MCMC.
288 | ```{r}
289 | print(results)
290 | ```
291 | Now let's plot the MCMC diagnostics
292 | ```{r, message=FALSE}
293 | plot(results, plot.type=c("histogram", "trace"), layout=c(2,1))
294 | ```
295 | Looks good!
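(Editorial aside, not part of the original chapter: if you also want a numeric convergence check to go with the trace plots, the Gelman-Rubin diagnostic from the `coda` package, which `runjags` already pulls in as a dependency, can be run on the chains. This is a minimal sketch and assumes `results$mcmc` is the usual `mcmc.list` holding the three chains.)

```{r}
# hedged sketch: potential scale reduction factors close to 1
# indicate the three chains have converged on the same posterior
coda::gelman.diag(results$mcmc)
```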
296 | Let's extract the MCMC samples of the mean, and get the
297 | bounds of the middle 95%
298 | ```{r}
299 | results.matrix <- as.matrix(results$mcmc)
300 | mu.samples <- results.matrix[,'mu']
301 | bounds <- quantile(mu.samples, c(.025, .975))
302 | ```
303 | We are 95% sure that the true mean is between
304 | `r round(bounds[1], 2)` and `r round(bounds[2], 2)`.
305 | Now let's plot the marginal posterior distribution for the mean
306 | of the NYC high schools' combined SAT grades, and draw the 95
307 | percent credible interval.
308 | ```{r}
309 | plot(density(mu.samples),
310 |      main=paste("Posterior distribution of mean combined SAT",
311 |                 "score in NYC high schools (2010)", sep="\n"))
312 | lines(c(bounds[1], bounds[2]), c(0, 0), lwd=3, col="red")
313 | ```
314 | Given the results, the SAT scores for NYC high schools in 2010
315 | are **incontrovertibly** not on par with the average SAT scores of
316 | the nation.
317 | ------------------------------------
318 | This is some session information for reproducibility:
319 | ```{r}
320 | devtools::session_info()
--------------------------------------------------------------------------------
/Chapter17/nyc-sat-scores.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/Rscript --vanilla
2 | 
3 | ###########################################################
4 | ##                                                       ##
5 | ##                  nyc-sat-scores.R                     ##
6 | ##                                                       ##
7 | ##              Author: Tony Fischetti                   ##
8 | ##              tony.fischetti@gmail.com                 ##
9 | ##                                                       ##
10 | ###########################################################
11 | 
12 | ##
13 | ## Aim: to use Bayesian analysis to compare NYC's 2010
14 | ##      combined SAT scores against the average of the
15 | ##      rest of the country, which, according to
16 | ##      FairTest.com, is 1509
17 | ##
18 | 
19 | # workspace cleanup
20 | rm(list=ls())
21 | 
22 | # options
23 | options(echo=TRUE)
24 | options(stringsAsFactors=FALSE)
25 | 
26 | # libraries
27 | library(assertr)   # for data checking
28 | library(runjags)   # for MCMC
29 | 
30 | # make sure everything is all set with JAGS
31 | testjags()
32 | # yep!
33 | 
34 | 
35 | ## read data file
36 | # data was retrieved from NYC Open Data portal
37 | # direct link: https://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.csv?accessType=DOWNLOAD
38 | nyc.sats <- read.csv("./data/SAT_Scores_NYC_2010.csv")
39 | 
40 | # let's give the columns easier names
41 | better.names <- c("id", "school.name", "n", "read.mean",
42 |                   "math.mean", "write.mean")
43 | names(nyc.sats) <- better.names
44 | 
45 | 
46 | # there are 460 rows but almost 700 NYC schools
47 | # we will *assume*, then, that this is a random
48 | # sample of NYC schools
49 | 
50 | # let's first check the veracity of this data...
51 | #nyc.sats <- assert(nyc.sats, is.numeric,
52 | #                   n, read.mean, math.mean, write.mean)
53 | 
54 | # It looks like the check failed because there are "s"s for some rows. (??)
55 | # A look at the data set descriptions indicates that the "s" is for schools
56 | # with 5 or fewer students. For our purposes, let's just exclude them.
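## ---------------------------------------------------------------------
## Editorial aside (not part of the original script): a more compact way
## to do the same cleanup is to coerce the four columns directly, since
## as.numeric() turns any non-numeric string such as "s" into NA (with a
## warning, which we suppress here). Shown on a copy, using the new,
## illustrative names "score.cols" and "nyc.sats.alt", so the original
## step-by-step version below is left untouched.
## ---------------------------------------------------------------------
score.cols <- c("n", "read.mean", "math.mean", "write.mean")
nyc.sats.alt <- nyc.sats
nyc.sats.alt[score.cols] <- suppressWarnings(
  lapply(nyc.sats.alt[score.cols], as.numeric))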
57 | 
58 | 
59 | # This is a function that takes a vector, replaces all "s"s
60 | # with NAs and converts all non-"s"s into numerics
61 | remove.s <- function(vec){
62 |   ifelse(vec=="s", NA, vec)
63 | }
64 | 
65 | nyc.sats$n <- as.numeric(remove.s(nyc.sats$n))
66 | nyc.sats$read.mean <- as.numeric(remove.s(nyc.sats$read.mean))
67 | nyc.sats$math.mean <- as.numeric(remove.s(nyc.sats$math.mean))
68 | nyc.sats$write.mean <- as.numeric(remove.s(nyc.sats$write.mean))
69 | 
70 | # Remove schools with fewer than 5 test takers
71 | nyc.sats <- nyc.sats[complete.cases(nyc.sats), ]
72 | 
73 | # Calculate a total combined SAT score
74 | nyc.sats$combined.mean <- (nyc.sats$read.mean +
75 |                            nyc.sats$math.mean +
76 |                            nyc.sats$write.mean)
77 | 
78 | # Let's build a posterior distribution of the true mean
79 | # of NYC high schools' combined SAT scores.
80 | 
81 | # We're not going to look at the summary statistics because
82 | # we don't want to bias our priors
83 | 
84 | # Specify a standard gaussian model
85 | the.model <- "
86 | model {
87 |   # priors
88 |   mu ~ dunif(0, 2400)
89 |   stddev ~ dunif(0, 500)
90 |   tau <- pow(stddev, -2)
91 | 
92 |   # likelihood
93 |   for(i in 1:theLength){
94 |     samp[i] ~ dnorm(mu, tau)
95 |   }
96 | }"
97 | 
98 | the.data <- list(
99 |   samp = nyc.sats$combined.mean,
100 |   theLength = length(nyc.sats$combined.mean)
101 | )
102 | 
103 | results <- autorun.jags(the.model, data=the.data,
104 |                         n.chains = 3,
105 |                         monitor = c('mu', 'stddev'))
106 | 
107 | # View the results of the MCMC
108 | print(results)
109 | 
110 | # Plot the MCMC diagnostics
111 | plot(results, plot.type=c("histogram", "trace"), layout=c(2,1))
112 | # Looks good!
113 | 
114 | # Let's extract the MCMC samples of the mean and get the
115 | # bounds of the middle 95%
116 | results.matrix <- as.matrix(results$mcmc)
117 | mu.samples <- results.matrix[,'mu']
118 | bounds <- quantile(mu.samples, c(.025, .975))
119 | 
120 | # We are 95% sure that the true mean is between 1197 and 1232
121 | 
122 | # Now let's plot the marginal posterior distribution for the mean
123 | # of the NYC high schools' combined SAT grades and draw the 95
124 | # percent credible interval.
125 | plot(density(mu.samples), main=paste("Posterior distribution of mean combined SAT",
126 |                                      "score in NYC high schools (2010)", sep="\n"))
127 | lines(c(bounds[1], bounds[2]), c(0, 0), lwd=3, col="red")
128 | 
129 | 
130 | # Given the results, the SAT scores for NYC high schools in 2010
131 | # are *incontrovertibly* not on par with the average SAT scores of
132 | # the nation.
133 | 
--------------------------------------------------------------------------------
/Chapter17/nyc-sat-scores.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "NYC SAT Scores Analysis"
3 | author: "Tony Fischetti"
4 | date: "November 1, 2015"
5 | output: html_document
6 | ---
7 | 
8 | #### Aim:
9 | To use Bayesian analysis to compare NYC's 2010
10 | combined SAT scores against the average of the
11 | rest of the country, which, according to
12 | FairTest.com, is 1509
13 | 
14 | 
15 | ```{r, echo=FALSE}
16 | # options
17 | options(echo=TRUE)
18 | options(stringsAsFactors=FALSE)
19 | ```
20 | 
21 | We are going to use the `assertr` and `runjags`
22 | packages for data checking and MCMC, respectively.
23 | ```{r}
24 | # libraries
25 | library(assertr)   # for data checking
26 | library(runjags)   # for MCMC
27 | ```
28 | 
29 | Let's make sure everything is all set with JAGS!
30 | ```{r}
31 | testjags()
32 | ```
33 | Great!
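(Editorial note, not in the original document: the next chunk writes the CSV into a `./data/` directory and assumes that directory already exists. A small defensive sketch, using only base R, that creates it first:)

```{r}
# hedged sketch: make sure ./data exists before downloading into it
if (!dir.exists("./data")) dir.create("./data")
```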
34 | 
35 | This data was found in the NYC Open Data Portal:
36 | https://nycopendata.socrata.com
37 | ```{r}
38 | link.to.data <- "http://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.csv?accessType=DOWNLOAD"
39 | download.file(link.to.data, "./data/SAT_Scores_NYC_2010.csv")
40 | 
41 | nyc.sats <- read.csv("./data/SAT_Scores_NYC_2010.csv")
42 | ```
43 | 
44 | Let's give the columns easier names
45 | ```{r}
46 | better.names <- c("id", "school.name", "n", "read.mean",
47 |                   "math.mean", "write.mean")
48 | names(nyc.sats) <- better.names
49 | ```
50 | 
51 | There are `r nrow(nyc.sats)` rows but almost 700 NYC schools. We will,
52 | therefore, *assume* that this is a random sample of NYC schools.
53 | 
54 | 
55 | Let's first check the veracity of this data...
56 | ```{r, error=TRUE}
57 | nyc.sats <- assert(nyc.sats, is.numeric,
58 |                    n, read.mean, math.mean, write.mean)
59 | ```
60 | 
61 | It looks like the check failed because there are "s"s for some rows. (??)
62 | A look at the data set descriptions indicates that the "s" is for schools
63 | with 5 or fewer students. For our purposes, let's just exclude them.
64 | 
65 | 
66 | This is a function that takes a vector, replaces all "s"s
67 | with NAs and converts all non-"s"s into numerics
68 | ```{r}
69 | remove.s <- function(vec){
70 |   ifelse(vec=="s", NA, vec)
71 | }
72 | 
73 | nyc.sats$n <- as.numeric(remove.s(nyc.sats$n))
74 | nyc.sats$read.mean <- as.numeric(remove.s(nyc.sats$read.mean))
75 | nyc.sats$math.mean <- as.numeric(remove.s(nyc.sats$math.mean))
76 | nyc.sats$write.mean <- as.numeric(remove.s(nyc.sats$write.mean))
77 | ```
78 | 
79 | Now we are going to remove schools with fewer than 5 test takers
80 | and calculate a combined SAT score
81 | ```{r}
82 | nyc.sats <- nyc.sats[complete.cases(nyc.sats), ]
83 | 
84 | # Calculate a total combined SAT score
85 | nyc.sats$combined.mean <- (nyc.sats$read.mean +
86 |                            nyc.sats$math.mean +
87 |                            nyc.sats$write.mean)
88 | ```
89 | 
90 | Let's now build a posterior distribution of the true mean
91 | of NYC high schools' combined SAT scores. We're not going to look
92 | at the summary statistics because we don't want to bias our priors.
93 | We will use a standard gaussian model.
94 | 
95 | ```{r, cache=TRUE, results="hide", warning=FALSE, message=FALSE}
96 | the.model <- "
97 | model {
98 |   # priors
99 |   mu ~ dunif(0, 2400)
100 |   stddev ~ dunif(0, 500)
101 |   tau <- pow(stddev, -2)
102 | 
103 |   # likelihood
104 |   for(i in 1:theLength){
105 |     samp[i] ~ dnorm(mu, tau)
106 |   }
107 | }"
108 | 
109 | the.data <- list(
110 |   samp = nyc.sats$combined.mean,
111 |   theLength = length(nyc.sats$combined.mean)
112 | )
113 | 
114 | results <- autorun.jags(the.model, data=the.data,
115 |                         n.chains = 3,
116 |                         monitor = c('mu'))
117 | ```
118 | 
119 | Let's view the results of the MCMC.
120 | ```{r}
121 | print(results)
122 | ```
123 | 
124 | Now let's plot the MCMC diagnostics
125 | ```{r, message=FALSE}
126 | plot(results, plot.type=c("histogram", "trace"), layout=c(2,1))
127 | ```
128 | 
129 | Looks good!
130 | 
131 | 
132 | Let's extract the MCMC samples of the mean and get the
133 | bounds of the middle 95%
134 | ```{r}
135 | results.matrix <- as.matrix(results$mcmc)
136 | mu.samples <- results.matrix[,'mu']
137 | bounds <- quantile(mu.samples, c(.025, .975))
138 | ```
139 | 
140 | We are 95% sure that the true mean is between `r bounds[1]` and
141 | `r bounds[2]`.
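(Editorial aside, not part of the original analysis: since the stated aim compares NYC against the national average of 1509, the posterior draws already stored in `mu.samples` can also be summarized as the posterior probability that the true NYC mean falls below that figure. A minimal sketch:)

```{r}
# hedged sketch: share of posterior draws for mu that fall below the
# national average combined SAT score of 1509
mean(mu.samples < 1509)
```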
142 | 
143 | Now let's plot the marginal posterior distribution for the mean
144 | of the NYC high schools' combined SAT grades and draw the 95
145 | percent credible interval.
146 | ```{r}
147 | plot(density(mu.samples),
148 |      main=paste("Posterior distribution of mean combined SAT",
149 |                 "score in NYC high schools (2010)", sep="\n"))
150 | lines(c(bounds[1], bounds[2]), c(0, 0), lwd=3, col="red")
151 | ```
152 | 
153 | Given the results, the SAT scores for NYC high schools in 2010
154 | are **incontrovertibly** not on par with the average SAT scores of
155 | the nation.
156 | 
157 | ------------------------------------
158 | 
159 | This is some session information for reproducibility:
160 | ```{r}
161 | devtools::session_info()
162 | ```
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 Packt
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | # Data Analysis with R - Second Edition
5 | This is the code repository for [Data Analysis with R - Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/data-analysis-r-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781788393720), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish.
6 | ## About the Book
7 | Frequently the tool of choice for academics, R has spread deep into the private sector and can be found in the production pipelines at some of the most advanced and successful enterprises. The power and domain-specificity of R allow the user to express complex analytics easily, quickly, and succinctly.
8 | 
9 | Starting with the basics of R and statistical reasoning, this book dives into advanced predictive analytics, showing how to apply those techniques to real-world data with real-world examples.
10 | 
11 | 
12 | ## Instructions and Navigation
13 | All of the code is organized into folders. Each folder starts with a number followed by the application name. For example, Chapter02.
14 | 
15 | 
16 | 
17 | The code will look like the following:
18 | ```
19 | # don't worry about memorizing this
20 | temp.density <- density(airquality$Temp)
21 | pdf <- approxfun(temp.density$x, temp.density$y, rule=2)
22 | integrate(pdf, 80, 90)
23 | ```
24 | 
25 | All code in this book has been written against the latest version of R—3.4.3 at the time of
26 | writing. As a matter of good practice, you should keep your R version up to date but most,
27 | if not all, code should work with any reasonably recent version of R. Some of the R
28 | packages we will be installing will require more recent versions though. For the other
29 | software that this book uses, instructions will be furnished pro re nata. If you want to get a
30 | head start, however, install RStudio, JAGS, and a C++ compiler (or Rtools if you use
31 | Windows).
32 | 
33 | ## Related Products
34 | * [Data Analysis with R](https://www.packtpub.com/big-data-and-business-intelligence/data-analysis-r?utm_source=github&utm_medium=repository&utm_campaign=9781785288142)
35 | 
36 | * [Mastering Data Analysis with R](https://www.packtpub.com/big-data-and-business-intelligence/mastering-data-analysis-r?utm_source=github&utm_medium=repository&utm_campaign=9781783982028)
37 | 
38 | * [Hands-On Geospatial Analysis with R and QGIS](https://www.packtpub.com/application-development/hands-geospatial-analysis-r-and-qgis?utm_source=github&utm_medium=repository&utm_campaign=9781788991674)
39 | 
40 | ### Download a free PDF
41 | 
42 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost. Simply click on the link to claim your free PDF.
43 | 
44 | https://packt.link/free-ebook/9781788393720

--------------------------------------------------------------------------------