├── .gitignore ├── LICENSE ├── README.md ├── code_snippets ├── gra2mgcv.R ├── quantile_resid.R └── vis.concurvity.R ├── course_outline.jpg ├── data ├── bbs_data │ ├── bbs_florida.csv │ ├── bbs_florida_richness.csv │ └── routes.csv ├── bbs_florida.csv ├── bbs_florida_richness.csv ├── beta-regression │ └── ZooData.txt ├── drake_griffen │ ├── README │ ├── drake_griffen.RData │ └── make_dat.R ├── forest-health │ ├── beech.raw │ ├── foresthealth.bnd │ ├── foresthealth.dat │ └── foresthealth.gra ├── mexdolphins │ ├── README.md │ ├── mexdolphins.RData │ ├── soap_pred.R │ └── soapy.RData ├── routes.csv ├── time_series │ └── Florida_climate.csv └── yukon_seeds │ ├── seed_data.csv │ └── seed_source_locations.csv ├── example-bivariate-timeseries-and-ti.Rmd ├── example-bivariate-timeseries-and-ti.html ├── example-forest-health.Rmd ├── example-forest-health.html ├── example-linear-functionals.Rmd ├── example-nonlinear-timeseries.Rmd ├── example-nonlinear-timeseries.html ├── example-spatial-mexdolphins-solutions.Rmd ├── example-spatial-mexdolphins-solutions.html ├── example-spatial-mexdolphins.Rmd ├── example-spatial-mexdolphins.html ├── example-spatio-temporal data.Rmd ├── example-spatio-temporal_data.html ├── pre_course_questionnaire.txt ├── slides ├── 00-preamble.Rpres ├── 01-intro.Rpres ├── 02-intro-mgcv.Rpres ├── 03-model_checking.Rpres ├── 04-model_selection.Rpres ├── 05-inference.Rpres ├── 06-fancy.Rpres ├── correct_mathjax_PDF_and_move.sh ├── custom.css ├── images │ ├── Daphnia_magna.png │ ├── addbasis.png │ ├── animal_choice.png │ ├── animal_codes.png │ ├── animal_functions.png │ ├── by.png │ ├── concurvity.png │ ├── count.jpg │ ├── cyclic.png │ ├── jenny_models.png │ ├── mathematical_sobbing.jpg │ ├── mgcv-inside.png │ ├── remlgcv.png │ ├── rummy.gif │ ├── slope-aspect.png │ ├── soap-soap.png │ ├── soap-tprs.png │ ├── soap-truth.png │ ├── spermcovars.png │ ├── spermwhalecovs.R │ ├── spotteddolphin_swfsc.jpg │ ├── thanks_for_all_the_fish.gif │ └── tina-modelling.png └── wiggly.gif └── vis.concurvity.R /.gitignore: -------------------------------------------------------------------------------- 1 | *.Rhistory 2 | .Rproj.user/ 3 | .Rproj.user -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Code in this repository is licensed under the GPL version >= 2. 2 | 3 | All other material is Creative Commons BY licensed (except where it does not belong to us, we do not claim copyright over anyone else's material). -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mgcv-workshop 2 | 3 | Dave Miller's fork of the course we (Dave, Eric Pedersen and Gavin Simpson) taught at the ESA 2016 annual meeting, adapted for other situations. 4 | 5 | Original repo (with further materials) can be found at [Eric Pedersen's github page](http://github.com/eric-pedersen/mgcv-esa-workshop/).
6 | -------------------------------------------------------------------------------- /code_snippets/gra2mgcv.R: -------------------------------------------------------------------------------- 1 | ## Read in a BayesX graph file and output list required by mgcv's mrf smooth 2 | gra2mgcv <- function(file) { 3 | ## first row contains number of objects in file 4 | ## graph info is stored in triplets of rows after first row 5 | ## - 1st row of triplet is ID 6 | ## - 2nd row of triplet is number of neighbours 7 | ## - 3rd row of triplet is vector of neighbours 8 | gra <- readLines(file) 9 | N <- as.numeric(gra[1]) # readLines() gives characters, so coerce to a number 10 | gra <- gra[-1] # pop the first row which contains N 11 | ids <- seq.int(1, by = 3, length.out = N) 12 | nbs <- seq.int(3, by = 3, length.out = N) 13 | node2id <- function(nodes, ids) { 14 | as.numeric(ids[as.numeric(nodes) + 1]) 15 | } 16 | l <- lapply(strsplit(gra[nbs], " "), node2id, ids = gra[ids]) 17 | names(l) <- gra[ids] 18 | l 19 | } 20 | -------------------------------------------------------------------------------- /code_snippets/quantile_resid.R: -------------------------------------------------------------------------------- 1 | #This code is modified from the rqgam.check function in D.L. Miller's dsm package 2 | #for distance sampling. The code is designed to extract randomized quantile 3 | #residuals from GAMs, using the family definitions in mgcv. Note statmod only 4 | #supports RQ residuals for the families checked below (Tweedie, negative binomial, Poisson, binomial, Gaussian, Gamma, inverse Gaussian). Any errors are due to Eric Pedersen 5 | library(statmod) #This has functions for randomized quantile residuals 6 | rqresiduals = function (gam.obj) { 7 | if(!"gam" %in% attr(gam.obj,"class")){ 8 | stop('gam.obj has to be of class "gam"') 9 | } 10 | if (!grepl("^Tweedie|^Negative Binomial|^poisson|^binomial|^gaussian|^Gamma|^inverse.gaussian", 11 | gam.obj$family$family)){ 12 | stop(paste("family " , gam.obj$family$family, 13 | " is not currently supported by the statmod library, 14 | and any randomized quantile residuals would be inaccurate.", 15 | sep="")) 16 | } 17 | if (grepl("^Tweedie", gam.obj$family$family)) { 18 | if (is.null(environment(gam.obj$family$variance)$p)) { 19 | p.val <- gam.obj$family$getTheta(TRUE) 20 | environment(gam.obj$family$variance)$p <- p.val 21 | } 22 | qres <- qres.tweedie(gam.obj) 23 | } 24 | else if (grepl("^Negative Binomial", gam.obj$family$family)) { 25 | if ("extended.family" %in% class(gam.obj$family)) { 26 | gam.obj$theta <- gam.obj$family$getTheta(TRUE) 27 | } 28 | else { 29 | gam.obj$theta <- gam.obj$family$getTheta() 30 | } 31 | qres <- qres.nbinom(gam.obj) 32 | } 33 | else { 34 | qres <- qresid(gam.obj) 35 | } 36 | return(qres) 37 | } -------------------------------------------------------------------------------- /code_snippets/vis.concurvity.R: -------------------------------------------------------------------------------- 1 | # visualise concurvity between terms in a GAM 2 | # David L Miller 2015, MIT license 3 | 4 | # arguments: 5 | # b -- a fitted gam 6 | # type -- concurvity measure to plot, see ?concurvity 7 | 8 | vis.concurvity <- function(b, type="estimate"){ 9 | cc <- concurvity(b, full=FALSE)[[type]] 10 | 11 | diag(cc) <- NA 12 | cc[lower.tri(cc)]<-NA 13 | 14 | layout(matrix(1:2, ncol=2), widths=c(5,1)) 15 | opar <- par(mar=c(5, 6, 5, 0) + 0.1) 16 | # main plot 17 | image(z=cc, x=1:ncol(cc), y=1:nrow(cc), ylab="", xlab="", 18 | axes=FALSE, asp=1, zlim=c(0,1)) 19 | axis(1, at=1:ncol(cc), labels = colnames(cc), las=2) 20 | axis(2, at=1:nrow(cc), labels = rownames(cc), las=2) 21 | # 
legend 22 | opar <- par(mar=c(5, 0, 4, 3) + 0.1) 23 | image(t(matrix(rep(seq(0, 1, len=100), 2), ncol=2)), 24 | x=1:3, y=1:101, zlim=c(0,1), axes=FALSE, xlab="", ylab="") 25 | axis(4, at=seq(1,101,len=5), labels = round(seq(0,1,len=5),1), las=2) 26 | par(opar) 27 | } 28 | -------------------------------------------------------------------------------- /course_outline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/course_outline.jpg -------------------------------------------------------------------------------- /data/bbs_data/routes.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/data/bbs_data/routes.csv -------------------------------------------------------------------------------- /data/beta-regression/ZooData.txt: -------------------------------------------------------------------------------- 1 | Number Scans Proportion Size Visual Raised Visitors Feeding Oc Other Enrichment Group Sex Enclosure Vehicle Diet Age Zoo Eps 41 300 0.136666667 650 2 2 6418 1 1 2 2 2 1 0 59 3 2 1 1 47 150 0.313333333 2405 2 1 13607 1 2 2 2 1 1 0 263 1 2 2 2 28 300 0.093333333 1781 4 2 13607 1 2 2 2 2 1 0 263 1 3 2 3 26 150 0.173333333 390 5 1 0 1 2 2 1 1 2 0 0 1 3 2 4 13 148 0.087837838 390 2 1 0 1 2 2 1 1 2 0 0 1 3 2 5 25 152 0.164473684 390 3 1 0 1 2 2 1 1 2 0 0 1 2 2 6 15 301 0.049833887 1200 4 2 11713 1 2 2 2 2 2 20 215 3 8 3 7 78 299 0.260869565 1200 4 2 11713 1 2 2 2 2 2 12 215 3 2 3 8 77 300 0.256666667 1200 4 2 11713 1 2 2 2 2 1 16 215 3 2 3 9 64 300 0.213333333 70 1 1 11713 1 2 2 2 2 1 20 215 3 8 3 10 80 300 0.266666667 40 1 1 11713 1 2 2 2 2 2 12 215 3 2 3 11 28 240 0.116666667 1200 4 2 4225 1 2 2 2 2 1 16 28 3 8 3 12 2 150 0.013333333 650 5 2 96513 1 1 1 2 1 2 20 3 3 7 4 13 12 150 0.08 650 5 2 96513 1 1 1 2 1 2 20 3 3 1 4 14 2 150 0.013333333 650 5 2 7294 1 1 1 1 1 2 0 7 3 7 4 15 16 150 0.106666667 100 1 1 0 1 1 1 1 1 2 20 6 3 7 4 16 12 150 0.08 100 1 1 0 1 1 1 1 1 2 20 6 3 1 4 17 6 356 0.016853933 2030 8 2 76000 1 2 2 1 2 1 6 0 1 2 5 18 45 120 0.375 715 6 2 76000 1 2 2 1 1 2 6 0 1 2 5 19 27 270 0.1 715 6 2 14195 2 2 2 1 2 1 0 0 1 2 5 20 28 90 0.311111111 2030 8 2 14195 2 2 2 1 1 2 0 0 1 2 5 21 14 100 0.14 420 2 1 8971 1 1 2 1 1 1 0 112 3 1 6 22 9 100 0.09 434 2 2 39120 1 1 2 1 1 1 0 116 3 1 6 23 10 100 0.1 434 2 2 27589 1 1 2 1 1 1 0 157 3 1 6 24 13 497 0.026156942 434 2 2 8971 1 1 2 1 2 2 0 112 3 3 6 25 10 495 0.02020202 1700 4 1 39120 1 1 2 1 2 2 1 116 3 3 6 26 9 499 0.018036072 1700 4 1 27589 1 1 2 1 2 2 0 157 3 3 6 27 20 199 0.100502513 3800 8 2 8971 1 1 2 1 2 2 0 112 3 5 6 28 26 200 0.13 3000 8 2 39120 1 1 2 2 2 2 0 63 3 5 6 29 21 200 0.105 3000 8 2 27589 1 1 2 2 2 2 0 84 3 5 6 30 0 296 0 3000 8 2 8971 1 1 2 2 2 1 0 51 3 10 6 31 6 300 0.02 504 4 2 27589 1 1 2 1 2 1 1 157 3 10 6 32 63 299 0.210702341 1700 4 1 8971 1 1 2 1 2 1 1 112 3 10 6 33 21 100 0.21 504 4 2 8971 1 1 2 1 1 2 0 112 3 8 6 34 18 100 0.18 504 4 2 39120 1 1 2 1 1 2 0 116 3 3 6 35 13 192 0.067708333 434 2 1 22000 2 1 2 2 1 2 0 128 3 5 6 36 34 150 0.226666667 1000 2 1 627 1 2 1 2 1 1 0 83 2 7 7 37 21 150 0.14 600 2 1 627 1 2 1 2 1 2 0 83 2 4 7 38 35 150 0.233333333 578 4 1 627 1 2 1 2 1 2 0 83 2 3 7 39 16 150 0.106666667 630 4 1 627 1 2 1 2 1 2 0 83 2 8 7 40 18 150 0.12 587 2 1 627 1 2 1 2 1 2 0 83 2 7 7 41 22 150 0.146666667 203 1 1 627 1 2 1 2 1 1 0 83 2 2 7 42 27 150 
0.18 1100 3 1 627 1 2 1 2 1 1 0 54 1 8 7 43 16 300 0.053333333 1400 8 1 627 1 2 1 2 2 1 1 27 1 8 7 44 4 302 0.013245033 600 5 1 0 2 2 1 1 2 1 0 22 1 11 7 45 16 147 0.108843537 1600 9 1 0 2 2 1 1 1 1 0 26 1 9 7 46 4 150 0.026666667 400 4 1 0 2 2 1 1 1 2 0 26 1 8 7 47 35 300 0.116666667 400 4 1 0 2 2 1 1 2 1 0 26 1 5 7 48 11 150 0.073333333 500 9 1 0 2 2 1 1 1 2 0 26 1 11 7 49 8 149 0.053691275 500 9 1 0 2 2 1 1 1 2 0 26 1 12 7 50 18 150 0.12 500 9 1 0 2 2 1 1 1 2 0 26 1 11 7 51 19 299 0.063545151 500 9 1 0 2 2 1 1 2 1 0 26 1 10 7 52 3 150 0.02 500 9 1 0 2 2 1 1 1 1 0 26 1 10 7 53 15 495 0.03030303 1800 8 1 0 2 2 2 1 2 2 0 26 1 1 7 54 36 150 0.24 431 2 1 0 2 2 1 1 1 1 2 32 1 4 7 55 10 150 0.066666667 336 2 1 0 2 2 1 1 1 2 0 32 1 9 7 56 3 149 0.020134228 67.5 1 1 0 2 2 1 1 1 1 0 32 1 7 7 57 1 150 0.006666667 195 2 1 0 2 2 1 1 1 1 2 32 1 8 7 58 10 150 0.066666667 270 3 1 0 2 2 1 1 1 1 0 32 1 12 7 59 12 298 0.040268456 1900 9 1 0 2 2 1 1 2 1 1 19 1 9 7 60 14 150 0.093333333 360 4 1 0 2 2 1 1 1 1 0 11 1 7 7 61 33 150 0.22 360 3 1 0 2 2 1 1 1 2 0 11 1 3 7 62 9 149 0.060402685 266 5 1 0 2 2 1 1 1 2 0 11 1 9 7 63 19 150 0.126666667 360 7 1 0 2 2 1 1 1 2 0 11 1 6 7 64 39 600 0.065 15000 4 1 718 1 2 1 2 2 1 8 51 3 3 8 65 17 300 0.056666667 25000 4 2 718 1 2 1 2 2 2 10 49 3 5 8 66 9 150 0.06 10000 3 1 718 1 2 1 1 1 1 0 27 3 15 8 67 11 300 0.036666667 25000 4 2 718 1 2 1 2 2 2 8 40 3 1 8 68 10 591 0.016920474 10000 9 1 0 2 2 2 2 2 2 0 12 3 9 8 69 2 749 0.002670227 40000 9 2 0 2 1 2 1 2 1 0 16 3 5 8 70 20 448 0.044642857 10000 4 2 0 1 2 1 1 2 1 3 18 3 1 8 71 0 150 0 25000 10 2 0 2 2 2 1 1 1 0 22 3 10 8 72 1 300 0.003333333 25000 10 2 0 2 2 2 1 2 1 0 22 3 10 8 73 13 150 0.086666667 80000 9 2 718 2 2 2 2 1 2 0 47 3 10 8 74 10 150 0.066666667 80000 9 2 718 2 2 2 2 1 2 0 60 3 7 8 75 0 300 0 80000 9 2 0 2 2 2 2 2 2 0 44 3 2 8 76 19 150 0.126666667 3000 5 2 33172 1 2 2 1 1 2 16 11 2 4 9 77 12 150 0.08 62 1 1 0 1 1 2 1 1 2 16 21 2 4 9 78 10 452 0.022123894 3000 5 2 33172 1 2 2 1 2 2 16 1 2 1 9 79 10 450 0.022222222 62 1 1 0 1 1 2 1 2 2 16 17 2 1 9 80 15 149 0.100671141 69 1 1 0 1 2 1 1 1 1 16 10 2 2 9 81 37 150 0.246666667 69 1 1 0 1 2 1 1 1 2 14 17 2 5 9 82 44 300 0.146666667 625 1 2 0 1 2 2 1 2 1 0 34 2 8 9 83 21 150 0.14 625 1 1 0 1 2 2 1 1 2 6 33 2 5 9 84 14 150 0.093333333 10000 2 1 0 1 2 2 1 1 1 2 29 2 9 9 85 55 149 0.369127517 600 2 2 0 1 2 2 1 1 1 2 34 2 9 9 86 28 150 0.186666667 450 1 1 0 1 2 1 1 1 2 2 35 2 2 9 87 17 150 0.113333333 18.6 1 1 0 1 2 1 1 1 2 0 35 2 8 9 88 -------------------------------------------------------------------------------- /data/drake_griffen/README: -------------------------------------------------------------------------------- 1 | Data adapted from: 2 | 3 | Drake, J.M. & B.D. Griffen. 2010. Early warning signals of extinction in deteriorating environments. Nature 467:456-459. 
doi:10.1038/nature09389 4 | 5 | Downloaded at: http://datadryad.org/resource/doi:10.5061/dryad.q3p64 6 | 7 | 8 | Data adapted from the below files, using the script make_dat.R: 9 | 10 | timeseries.csv - Contains census records (triplicate counting) for all experimental populations 11 | ID: Unique identifier for each experimental population 12 | Subpop: Unique identifier for each chamber of a population 13 | Date: Census date 14 | sample1: First census 15 | sample2: Second census 16 | sample3: Third census 17 | HoleNum: Experimental treatment - number of connecting passages (mm) 18 | HoleSize: Experimental treatment - diameter of connecting passages (mm) 19 | Deteriorating: Experimental treatment - indicator for whether or not population was in the deteriorating food treatment group 20 | FoodLevel: Volume of food suspension supplied on the census date 21 | 22 | preprocess.R - Auxiliary script to format the data for analysis 23 | -------------------------------------------------------------------------------- /data/drake_griffen/drake_griffen.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/data/drake_griffen/drake_griffen.RData -------------------------------------------------------------------------------- /data/drake_griffen/make_dat.R: -------------------------------------------------------------------------------- 1 | # run this in the data-and-code directory of the zip file downloaded 2 | # from http://datadryad.org/resource/doi:10.5061/dryad.q3p64 3 | 4 | #Preprocess raw time series data for analysis 5 | source('preprocess.R') 6 | 7 | library(lubridate) 8 | 9 | # data munging 10 | timeseries$date <- mdy(as.character(timeseries$Date.)) 11 | populations$Nhat <- populations$x 12 | populations$x <- NULL 13 | 14 | # table of which groups were deteriorating or not 15 | groups <- unique(timeseries[,c("ID", "Deteriorating")]) 16 | 17 | populations$deteriorating <- populations$ID %in% groups$ID[groups$Deteriorating==1] 18 | 19 | # smooth of population size over time 20 | pop_happy <- subset(populations, !deteriorating) 21 | pop_unhappy <- subset(populations, deteriorating) 22 | 23 | 24 | save.image(file="drake_griffen.RData") 25 | 26 | -------------------------------------------------------------------------------- /data/forest-health/foresthealth.bnd: -------------------------------------------------------------------------------- 1 | "82",5 2 | 15.9882,1.8882 3 | 15.9882,2.1118 4 | 16.2118,2.1118 5 | 16.2118,1.8882 6 | 15.9882,1.8882 7 | "83",5 8 | 15.9882,2.8882 9 | 15.9882,3.1118 10 | 16.2118,3.1118 11 | 16.2118,2.8882 12 | 15.9882,2.8882 13 | "79",5 14 | 14.9882,1.8882 15 | 14.9882,2.1118 16 | 15.2118,2.1118 17 | 15.2118,1.8882 18 | 14.9882,1.8882 19 | "81",5 20 | 14.9882,3.8882 21 | 14.9882,4.1118 22 | 15.2118,4.1118 23 | 15.2118,3.8882 24 | 14.9882,3.8882 25 | "78",5 26 | 13.9882,4.8882 27 | 13.9882,5.1118 28 | 14.2118,5.1118 29 | 14.2118,4.8882 30 | 13.9882,4.8882 31 | "62",5 32 | 10.1882,8.8882 33 | 10.1882,9.1118 34 | 10.4118,9.1118 35 | 10.4118,8.8882 36 | 10.1882,8.8882 37 | "55",5 38 | 9.6882,8.3882 39 | 9.6882,8.6118 40 | 9.9118,8.6118 41 | 9.9118,8.3882 42 | 9.6882,8.3882 43 | "80",5 44 | 14.9882,2.8882 45 | 14.9882,3.1118 46 | 15.2118,3.1118 47 | 15.2118,2.8882 48 | 14.9882,2.8882 49 | "77",5 50 | 13.9882,3.8882 51 | 13.9882,4.1118 52 | 14.2118,4.1118 53 | 14.2118,3.8882 54 | 13.9882,3.8882 55 | "75",5 56 | 12.9882,4.9882 57 | 12.9882,5.2118 58 |
13.2118,5.2118 59 | 13.2118,4.9882 60 | 12.9882,4.9882 61 | "72",5 62 | 12.0882,5.9882 63 | 12.0882,6.2118 64 | 12.3118,6.2118 65 | 12.3118,5.9882 66 | 12.0882,5.9882 67 | "61",5 68 | 10.0882,7.8882 69 | 10.0882,8.1118 70 | 10.3118,8.1118 71 | 10.3118,7.8882 72 | 10.0882,7.8882 73 | "76",5 74 | 13.9882,2.8882 75 | 13.9882,3.1118 76 | 14.2118,3.1118 77 | 14.2118,2.8882 78 | 13.9882,2.8882 79 | "74",5 80 | 12.9882,3.8882 81 | 12.9882,4.1118 82 | 13.2118,4.1118 83 | 13.2118,3.8882 84 | 12.9882,3.8882 85 | "71",5 86 | 12.0882,4.9882 87 | 12.0882,5.2118 88 | 12.3118,5.2118 89 | 12.3118,4.9882 90 | 12.0882,4.9882 91 | "60",5 92 | 10.0882,6.9882 93 | 10.0882,7.2118 94 | 10.3118,7.2118 95 | 10.3118,6.9882 96 | 10.0882,6.9882 97 | "73",5 98 | 12.9882,2.8882 99 | 12.9882,3.1118 100 | 13.2118,3.1118 101 | 13.2118,2.8882 102 | 12.9882,2.8882 103 | "70",5 104 | 11.9882,3.8882 105 | 11.9882,4.1118 106 | 12.2118,4.1118 107 | 12.2118,3.8882 108 | 11.9882,3.8882 109 | "67",5 110 | 11.0882,4.8882 111 | 11.0882,5.1118 112 | 11.3118,5.1118 113 | 11.3118,4.8882 114 | 11.0882,4.8882 115 | "59",5 116 | 10.0882,5.8882 117 | 10.0882,6.1118 118 | 10.3118,6.1118 119 | 10.3118,5.8882 120 | 10.0882,5.8882 121 | "54",5 122 | 9.0882,6.9882 123 | 9.0882,7.2118 124 | 9.3118,7.2118 125 | 9.3118,6.9882 126 | 9.0882,6.9882 127 | "69",5 128 | 11.9882,2.8882 129 | 11.9882,3.1118 130 | 12.2118,3.1118 131 | 12.2118,2.8882 132 | 11.9882,2.8882 133 | "66",5 134 | 10.9882,3.8882 135 | 10.9882,4.1118 136 | 11.2118,4.1118 137 | 11.2118,3.8882 138 | 10.9882,3.8882 139 | "68",5 140 | 11.9882,1.9882 141 | 11.9882,2.2118 142 | 12.2118,2.2118 143 | 12.2118,1.9882 144 | 11.9882,1.9882 145 | "63",5 146 | 10.9882,0.988197 147 | 10.9882,1.2118 148 | 11.2118,1.2118 149 | 11.2118,0.988197 150 | 10.9882,0.988197 151 | "53",5 152 | 9.0882,5.8882 153 | 9.0882,6.1118 154 | 9.3118,6.1118 155 | 9.3118,5.8882 156 | 9.0882,5.8882 157 | "48",5 158 | 8.0882,6.9882 159 | 8.0882,7.2118 160 | 8.3118,7.2118 161 | 8.3118,6.9882 162 | 8.0882,6.9882 163 | "65",5 164 | 10.9882,2.8882 165 | 10.9882,3.1118 166 | 11.2118,3.1118 167 | 11.2118,2.8882 168 | 10.9882,2.8882 169 | "64",5 170 | 10.9882,1.8882 171 | 10.9882,2.1118 172 | 11.2118,2.1118 173 | 11.2118,1.8882 174 | 10.9882,1.8882 175 | "56",5 176 | 9.9882,0.988197 177 | 9.9882,1.2118 178 | 10.2118,1.2118 179 | 10.2118,0.988197 180 | 9.9882,0.988197 181 | "47",5 182 | 8.0882,5.9882 183 | 8.0882,6.2118 184 | 8.3118,6.2118 185 | 8.3118,5.9882 186 | 8.0882,5.9882 187 | "58",5 188 | 9.9882,2.9882 189 | 9.9882,3.2118 190 | 10.2118,3.2118 191 | 10.2118,2.9882 192 | 9.9882,2.9882 193 | "57",5 194 | 9.9882,1.9882 195 | 9.9882,2.2118 196 | 10.2118,2.2118 197 | 10.2118,1.9882 198 | 9.9882,1.9882 199 | "50",5 200 | 8.9882,0.988197 201 | 8.9882,1.2118 202 | 9.2118,1.2118 203 | 9.2118,0.988197 204 | 8.9882,0.988197 205 | "49",5 206 | 8.9882,0.388197 207 | 8.9882,0.611803 208 | 9.2118,0.611803 209 | 9.2118,0.388197 210 | 8.9882,0.388197 211 | "46",5 212 | 8.0882,4.8882 213 | 8.0882,5.1118 214 | 8.3118,5.1118 215 | 8.3118,4.8882 216 | 8.0882,4.8882 217 | "52",5 218 | 8.9882,2.9882 219 | 8.9882,3.2118 220 | 9.2118,3.2118 221 | 9.2118,2.9882 222 | 8.9882,2.9882 223 | "51",5 224 | 8.9882,1.9882 225 | 8.9882,2.2118 226 | 9.2118,2.2118 227 | 9.2118,1.9882 228 | 8.9882,1.9882 229 | "42",5 230 | 7.9882,0.988197 231 | 7.9882,1.2118 232 | 8.2118,1.2118 233 | 8.2118,0.988197 234 | 7.9882,0.988197 235 | "45",5 236 | 7.9882,3.8882 237 | 7.9882,4.1118 238 | 8.2118,4.1118 239 | 8.2118,3.8882 240 | 7.9882,3.8882 241 | "41",5 242 | 
6.9882,4.9882 243 | 6.9882,5.2118 244 | 7.2118,5.2118 245 | 7.2118,4.9882 246 | 6.9882,4.9882 247 | "44",5 248 | 7.9882,2.9882 249 | 7.9882,3.2118 250 | 8.2118,3.2118 251 | 8.2118,2.9882 252 | 7.9882,2.9882 253 | "43",5 254 | 7.9882,1.9882 255 | 7.9882,2.2118 256 | 8.2118,2.2118 257 | 8.2118,1.9882 258 | 7.9882,1.9882 259 | "40",5 260 | 6.9882,3.9882 261 | 6.9882,4.2118 262 | 7.2118,4.2118 263 | 7.2118,3.9882 264 | 6.9882,3.9882 265 | "39",5 266 | 6.9882,2.9882 267 | 6.9882,3.2118 268 | 7.2118,3.2118 269 | 7.2118,2.9882 270 | 6.9882,2.9882 271 | "38",5 272 | 6.9882,1.9882 273 | 6.9882,2.2118 274 | 7.2118,2.2118 275 | 7.2118,1.9882 276 | 6.9882,1.9882 277 | "37",5 278 | 6.0882,3.2882 279 | 6.0882,3.5118 280 | 6.3118,3.5118 281 | 6.3118,3.2882 282 | 6.0882,3.2882 283 | "36",5 284 | 6.0882,2.2882 285 | 6.0882,2.5118 286 | 6.3118,2.5118 287 | 6.3118,2.2882 288 | 6.0882,2.2882 289 | "33",5 290 | 5.0882,3.2882 291 | 5.0882,3.5118 292 | 5.3118,3.5118 293 | 5.3118,3.2882 294 | 5.0882,3.2882 295 | "35",5 296 | 5.2882,1.7882 297 | 5.2882,2.0118 298 | 5.5118,2.0118 299 | 5.5118,1.7882 300 | 5.2882,1.7882 301 | "31",5 302 | 5.0882,1.2882 303 | 5.0882,1.5118 304 | 5.3118,1.5118 305 | 5.3118,1.2882 306 | 5.0882,1.2882 307 | "25",5 308 | 4.0882,0.288197 309 | 4.0882,0.511803 310 | 4.3118,0.511803 311 | 4.3118,0.288197 312 | 4.0882,0.288197 313 | "34",5 314 | 5.1882,1.9882 315 | 5.1882,2.2118 316 | 5.4118,2.2118 317 | 5.4118,1.9882 318 | 5.1882,1.9882 319 | "32",5 320 | 5.0882,2.2882 321 | 5.0882,2.5118 322 | 5.3118,2.5118 323 | 5.3118,2.2882 324 | 5.0882,2.2882 325 | "21",5 326 | 3.5882,0.788197 327 | 3.5882,1.0118 328 | 3.8118,1.0118 329 | 3.8118,0.788197 330 | 3.5882,0.788197 331 | "18",5 332 | 3.1882,0.688197 333 | 3.1882,0.911803 334 | 3.4118,0.911803 335 | 3.4118,0.688197 336 | 3.1882,0.688197 337 | "15",5 338 | 3.0882,0.288197 339 | 3.0882,0.511803 340 | 3.3118,0.511803 341 | 3.3118,0.288197 342 | 3.0882,0.288197 343 | "28",5 344 | 4.4882,2.8882 345 | 4.4882,3.1118 346 | 4.7118,3.1118 347 | 4.7118,2.8882 348 | 4.4882,2.8882 349 | "30",5 350 | 4.5882,2.3882 351 | 4.5882,2.6118 352 | 4.8118,2.6118 353 | 4.8118,2.3882 354 | 4.5882,2.3882 355 | "29",5 356 | 4.5882,1.7882 357 | 4.5882,2.0118 358 | 4.8118,2.0118 359 | 4.8118,1.7882 360 | 4.5882,1.7882 361 | "26",5 362 | 4.0882,1.2882 363 | 4.0882,1.5118 364 | 4.3118,1.5118 365 | 4.3118,1.2882 366 | 4.0882,1.2882 367 | "27",5 368 | 4.0882,2.2882 369 | 4.0882,2.5118 370 | 4.3118,2.5118 371 | 4.3118,2.2882 372 | 4.0882,2.2882 373 | "22",5 374 | 3.5882,1.7882 375 | 3.5882,2.0118 376 | 3.8118,2.0118 377 | 3.8118,1.7882 378 | 3.5882,1.7882 379 | "16",5 380 | 3.0882,1.2882 381 | 3.0882,1.5118 382 | 3.3118,1.5118 383 | 3.3118,1.2882 384 | 3.0882,1.2882 385 | "23",5 386 | 3.5882,2.7882 387 | 3.5882,3.0118 388 | 3.8118,3.0118 389 | 3.8118,2.7882 390 | 3.5882,2.7882 391 | "20",5 392 | 3.4882,2.0882 393 | 3.4882,2.3118 394 | 3.7118,2.3118 395 | 3.7118,2.0882 396 | 3.4882,2.0882 397 | "19",5 398 | 3.1882,2.5882 399 | 3.1882,2.8118 400 | 3.4118,2.8118 401 | 3.4118,2.5882 402 | 3.1882,2.5882 403 | "17",5 404 | 3.0882,2.2882 405 | 3.0882,2.5118 406 | 3.3118,2.5118 407 | 3.3118,2.2882 408 | 3.0882,2.2882 409 | "12",5 410 | 2.5882,1.7882 411 | 2.5882,2.0118 412 | 2.8118,2.0118 413 | 2.8118,1.7882 414 | 2.5882,1.7882 415 | "2",5 416 | 1.0882,2.2882 417 | 1.0882,2.5118 418 | 1.3118,2.5118 419 | 1.3118,2.2882 420 | 1.0882,2.2882 421 | "1",5 422 | 0.588197,2.7882 423 | 0.588197,3.0118 424 | 0.811803,3.0118 425 | 0.811803,2.7882 426 | 0.588197,2.7882 427 | "13",5 428 | 
2.5882,2.7882 429 | 2.5882,3.0118 430 | 2.8118,3.0118 431 | 2.8118,2.7882 432 | 2.5882,2.7882 433 | "3",5 434 | 1.0882,3.2882 435 | 1.0882,3.5118 436 | 1.3118,3.5118 437 | 1.3118,3.2882 438 | 1.0882,3.2882 439 | "6",5 440 | 1.4882,2.4882 441 | 1.4882,2.7118 442 | 1.7118,2.7118 443 | 1.7118,2.4882 444 | 1.4882,2.4882 445 | "7",5 446 | 1.5882,2.7882 447 | 1.5882,3.0118 448 | 1.8118,3.0118 449 | 1.8118,2.7882 450 | 1.5882,2.7882 451 | "9",5 452 | 2.0882,2.2882 453 | 2.0882,2.5118 454 | 2.3118,2.5118 455 | 2.3118,2.2882 456 | 2.0882,2.2882 457 | "4",5 458 | 1.3882,3.4882 459 | 1.3882,3.7118 460 | 1.6118,3.7118 461 | 1.6118,3.4882 462 | 1.3882,3.4882 463 | "5",5 464 | 1.3882,4.8882 465 | 1.3882,5.1118 466 | 1.6118,5.1118 467 | 1.6118,4.8882 468 | 1.3882,4.8882 469 | "8",5 470 | 1.5882,3.7882 471 | 1.5882,4.0118 472 | 1.8118,4.0118 473 | 1.8118,3.7882 474 | 1.5882,3.7882 475 | "10",5 476 | 2.0882,3.2882 477 | 2.0882,3.5118 478 | 2.3118,3.5118 479 | 2.3118,3.2882 480 | 2.0882,3.2882 481 | "11",5 482 | 2.0882,4.2882 483 | 2.0882,4.5118 484 | 2.3118,4.5118 485 | 2.3118,4.2882 486 | 2.0882,4.2882 487 | "14",5 488 | 2.7882,3.9882 489 | 2.7882,4.2118 490 | 3.0118,4.2118 491 | 3.0118,3.9882 492 | 2.7882,3.9882 493 | "24",5 494 | 3.6882,3.9882 495 | 3.6882,4.2118 496 | 3.9118,4.2118 497 | 3.9118,3.9882 498 | 3.6882,3.9882 499 | -------------------------------------------------------------------------------- /data/forest-health/foresthealth.gra: -------------------------------------------------------------------------------- 1 | 83 2 | 82 3 | 2 4 | 2 1 5 | 83 6 | 2 7 | 0 7 8 | 79 9 | 2 10 | 7 0 11 | 81 12 | 2 13 | 7 8 14 | 78 15 | 2 16 | 8 9 17 | 62 18 | 2 19 | 11 6 20 | 55 21 | 2 22 | 11 5 23 | 80 24 | 4 25 | 2 12 3 1 26 | 77 27 | 4 28 | 12 13 4 3 29 | 75 30 | 3 31 | 13 14 4 32 | 72 33 | 1 34 | 14 35 | 61 36 | 3 37 | 15 6 5 38 | 76 39 | 3 40 | 16 8 7 41 | 74 42 | 4 43 | 16 17 9 8 44 | 71 45 | 4 46 | 17 18 10 9 47 | 60 48 | 3 49 | 19 20 11 50 | 73 51 | 3 52 | 21 13 12 53 | 70 54 | 4 55 | 21 22 14 13 56 | 67 57 | 2 58 | 22 14 59 | 59 60 | 2 61 | 25 15 62 | 54 63 | 3 64 | 25 26 15 65 | 69 66 | 4 67 | 23 27 17 16 68 | 66 69 | 3 70 | 27 18 17 71 | 68 72 | 2 73 | 28 21 74 | 63 75 | 2 76 | 29 28 77 | 53 78 | 3 79 | 30 20 19 80 | 48 81 | 2 82 | 30 20 83 | 65 84 | 4 85 | 28 31 22 21 86 | 64 87 | 4 88 | 24 32 27 23 89 | 56 90 | 4 91 | 33 34 32 24 92 | 47 93 | 3 94 | 35 26 25 95 | 58 96 | 3 97 | 32 36 27 98 | 57 99 | 4 100 | 29 37 31 28 101 | 50 102 | 4 103 | 34 38 37 29 104 | 49 105 | 3 106 | 38 33 29 107 | 46 108 | 3 109 | 39 40 30 110 | 52 111 | 3 112 | 37 41 31 113 | 51 114 | 4 115 | 33 42 36 32 116 | 42 117 | 3 118 | 42 34 33 119 | 45 120 | 3 121 | 41 43 35 122 | 41 123 | 2 124 | 43 35 125 | 44 126 | 4 127 | 42 44 39 36 128 | 43 129 | 4 130 | 38 45 41 37 131 | 40 132 | 4 133 | 44 46 40 39 134 | 39 135 | 5 136 | 45 46 47 43 41 137 | 38 138 | 3 139 | 47 44 42 140 | 37 141 | 4 142 | 47 48 44 43 143 | 36 144 | 6 145 | 49 52 53 46 45 44 146 | 33 147 | 4 148 | 53 58 57 46 149 | 35 150 | 6 151 | 52 53 50 58 59 47 152 | 31 153 | 5 154 | 59 60 53 52 49 155 | 25 156 | 4 157 | 54 55 56 60 158 | 34 159 | 8 160 | 53 50 58 59 57 61 49 47 161 | 32 162 | 9 163 | 50 58 59 57 61 48 52 49 47 164 | 21 165 | 6 166 | 55 63 56 62 51 60 167 | 18 168 | 6 169 | 63 56 54 62 51 60 170 | 15 171 | 4 172 | 63 55 54 51 173 | 28 174 | 7 175 | 61 64 59 58 53 48 52 176 | 30 177 | 10 178 | 59 57 61 64 62 65 53 48 52 49 179 | 29 180 | 10 181 | 57 61 60 62 65 58 50 53 52 49 182 | 26 183 | 9 184 | 51 62 54 65 55 63 61 59 50 185 | 27 186 | 11 187 | 
60 64 62 65 66 67 57 59 58 53 52 188 | 22 189 | 12 190 | 54 65 66 55 67 63 68 64 60 61 59 58 191 | 16 192 | 8 193 | 56 68 67 55 65 54 62 60 194 | 23 195 | 8 196 | 62 65 66 67 71 61 57 58 197 | 20 198 | 11 199 | 66 67 63 71 68 62 64 60 61 59 58 200 | 19 201 | 8 202 | 67 71 68 75 65 62 64 61 203 | 17 204 | 9 205 | 63 71 68 75 66 65 62 64 61 206 | 12 207 | 7 208 | 75 71 63 67 66 65 62 209 | 2 210 | 5 211 | 70 72 73 74 75 212 | 1 213 | 5 214 | 69 72 76 73 74 215 | 13 216 | 9 217 | 68 79 75 74 73 67 66 65 64 218 | 3 219 | 7 220 | 69 70 76 73 74 78 79 221 | 6 222 | 8 223 | 76 72 69 70 74 75 79 71 224 | 7 225 | 9 226 | 73 76 72 69 70 78 75 79 71 227 | 9 228 | 8 229 | 74 73 69 79 68 71 67 66 230 | 4 231 | 7 232 | 72 70 73 74 78 79 80 233 | 5 234 | 2 235 | 78 80 236 | 8 237 | 6 238 | 74 77 76 72 79 80 239 | 10 240 | 9 241 | 75 78 74 73 76 72 80 71 81 242 | 11 243 | 5 244 | 79 78 77 76 81 245 | 14 246 | 3 247 | 80 79 82 248 | 24 249 | 1 250 | 81 251 | -------------------------------------------------------------------------------- /data/mexdolphins/README.md: -------------------------------------------------------------------------------- 1 | Pantropical spotted dolphins in the Gulf of Mexico 2 | ================================================== 3 | 4 | Data copied from [this example DSM analysis](http://distancesampling.org/R/vignettes/mexico-analysis.html), after correcting for detectability. 5 | 6 | Data was collected by the NOAA South East Fisheries Science Center (sic). The OBIS-SEAMAP page for the data may be found at the SEFSC [GoMex Oceanic 1996 survey page](http://seamap.env.duke.edu/dataset/25). 7 | 8 | ## Data citation 9 | 10 | Garrison, L. 2013. SEFSC GoMex Oceanic 1996. Data downloaded from OBIS-SEAMAP (http://seamap.env.duke.edu/dataset/25). 11 | -------------------------------------------------------------------------------- /data/mexdolphins/mexdolphins.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/data/mexdolphins/mexdolphins.RData -------------------------------------------------------------------------------- /data/mexdolphins/soap_pred.R: -------------------------------------------------------------------------------- 1 | # make soap pred grid 2 | 3 | library(mapdata) 4 | library(maptools) 5 | library(rgeos) 6 | library(rgdal) 7 | library(mgcv) 8 | 9 | load("mexdolphins.RData") 10 | 11 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 12 | 13 | usmap <- map('usa', plot=FALSE, xlim=c(-100,-80), ylim=c(22,33)) 14 | uspoly <- map2SpatialPolygons(usmap, IDs=usmap$names, 15 | proj4string=CRS("+proj=longlat +datum=WGS84")) 16 | 17 | # simplify the polygon 18 | uspoly <- gSimplify(uspoly, 0.08) 19 | 20 | 21 | # bounding box 22 | c1 = cbind(c(-97.5, -82, -82, -97.5), 23 | c(22.5, 22.5, 32, 32)) 24 | r1 = rbind(c1, c1[1, ]) # join 25 | P1 = Polygon(r1) 26 | Ps1 = Polygons(list(P1), ID = "a") 27 | 28 | SPs = SpatialPolygons(list(Ps1),proj4string=CRS("+proj=longlat +datum=WGS84")) 29 | 30 | # plot to check we got the right area 31 | #plot(uspoly)#, xlim=c(-100,-80), ylim=c(22,33)) 32 | #plot(SPs, add=TRUE) 33 | #points(mexdolphins[,c("longitude","latitude")]) 34 | 35 | # take the polygon difference 36 | soap_area <- gDifference(SPs,uspoly) 37 | # again check 38 | #plot(soap_area) 39 | #points(mexdolphins[,c("longitude","latitude")]) 40 | 41 | 42 | xy_bnd <- 
spTransform(soap_area, CRSobj=lcc_proj4) 43 | xy_bnd <- xy_bnd@polygons[[1]]@Polygons[[1]]@coords 44 | xy_bnd <- list(x=xy_bnd[,1], y=xy_bnd[,2]) 45 | 46 | gr <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), by=16469.27), 47 | y=seq(min(xy_bnd$y), max(xy_bnd$y), by=16469.27)) 48 | x <- gr[,1] 49 | y <- gr[,2] 50 | ind <- inSide(xy_bnd, x, y) 51 | 52 | soap_preddata <- as.data.frame(gr[ind,]) 53 | soap_preddata$area <- preddata$area[1] 54 | soap_preddata$off.set <- log(soap_preddata$area) 55 | 56 | 57 | save(xy_bnd, soap_preddata, file="soapy.RData") 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /data/mexdolphins/soapy.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/data/mexdolphins/soapy.RData -------------------------------------------------------------------------------- /data/routes.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/data/routes.csv -------------------------------------------------------------------------------- /data/yukon_seeds/seed_source_locations.csv: -------------------------------------------------------------------------------- 1 | Who,Site,Subsite,source_type,polygon,order,Y,X 2 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796723.731,469919.3107 3 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796742.626,469910.4723 4 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796729.702,469974.1999 5 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796725.89,470025.6419 6 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796719.851,470077.7116 7 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796725.935,470126.3351 8 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796718.948,470174.2659 9 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796702.393,470236.7358 10 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796694.535,470294.0962 11 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796742.568,470287.086 12 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796724.452,470303.7841 13 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796785.653,470258.4785 14 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796888.804,470191.593 15 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796964.678,470142.5823 16 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797042.968,470050.4508 17 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797046.439,470002.8427 18 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797040.834,469950.9495 19 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796996.21,469909.3531 20 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796975.594,469904.4163 21 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796888.762,469895.5375 22 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796850.814,469871.8622 23 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797168.951,469864.6928 24 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797133.442,469889.7501 25 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797067.112,470003.8641 26 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797045.311,469933.9624 27 | Eliot,FoxLake,SW_Facing,SSSample,NA,NA,6803025.398,465082.76 28 | Eliot,FoxLake,SW_Facing,SSSample,NA,NA,6803076.228,465133.385 29 | Eliot,FoxLake,Lowland,SSPolygon,38,1,6796715.73,470304.9674 30 | Eliot,FoxLake,Lowland,SSPolygon,38,2,6796728.455,470293.1938 31 | Eliot,FoxLake,Lowland,SSPolygon,38,3,6796753.325,470273.6796 32 | Eliot,FoxLake,Lowland,SSPolygon,38,4,6796788.911,470252.55 33 | Eliot,FoxLake,Lowland,SSPolygon,38,5,6796847.188,470216.8999 34 | 
Eliot,FoxLake,Lowland,SSPolygon,38,6,6796887.915,470191.8324 35 | Eliot,FoxLake,Lowland,SSPolygon,38,7,6796932.971,470156.1209 36 | Eliot,FoxLake,Lowland,SSPolygon,38,8,6796962.203,470143.2102 37 | Eliot,FoxLake,Lowland,SSPolygon,38,9,6796979.017,470128.2042 38 | Eliot,FoxLake,Lowland,SSPolygon,38,10,6797017.817,470096.0466 39 | Eliot,FoxLake,Lowland,SSPolygon,38,11,6797024.449,470083.6527 40 | Eliot,FoxLake,Lowland,SSPolygon,38,32,6797050.904,470081.0951 41 | Eliot,FoxLake,Lowland,SSPolygon,38,33,6796981.673,470133.1883 42 | Eliot,FoxLake,Lowland,SSPolygon,38,34,6796839.563,470223.1343 43 | Eliot,FoxLake,Lowland,SSPolygon,38,35,6796790.373,470257.3939 44 | Eliot,FoxLake,Lowland,SSPolygon,38,12,6797041.589,470051.2157 45 | Eliot,FoxLake,Lowland,SSPolygon,38,13,6797044.77,470038.4604 46 | Eliot,FoxLake,Lowland,SSPolygon,38,14,6797049.136,470021.4898 47 | Eliot,FoxLake,Lowland,SSPolygon,38,15,6797043.093,470009.7016 48 | Eliot,FoxLake,Lowland,SSPolygon,38,16,6797045.841,469991.8912 49 | Eliot,FoxLake,Lowland,SSPolygon,38,17,6797050.612,469984.3664 50 | Eliot,FoxLake,Lowland,SSPolygon,38,18,6797061.129,469968.7692 51 | Eliot,FoxLake,Lowland,SSPolygon,38,19,6797037.335,469950.6634 52 | Eliot,FoxLake,Lowland,SSPolygon,38,20,6797009.75,469923.8775 53 | Eliot,FoxLake,Lowland,SSPolygon,38,21,6797014.931,469917.5015 54 | Eliot,FoxLake,Lowland,SSPolygon,89,1,6796994.101,469909.7297 55 | Eliot,FoxLake,Lowland,SSPolygon,89,2,6796985.436,469906.6158 56 | Eliot,FoxLake,Lowland,SSPolygon,89,3,6796973.54,469906.5494 57 | Eliot,FoxLake,Lowland,SSPolygon,89,4,6796960.759,469898.896 58 | Eliot,FoxLake,Lowland,SSPolygon,89,5,6796919.57,469895.9869 59 | Eliot,FoxLake,Lowland,SSPolygon,89,6,6796906.707,469885.8498 60 | Eliot,FoxLake,Lowland,SSPolygon,89,7,6796855.959,469882.6563 61 | Eliot,FoxLake,Lowland,SSPolygon,89,8,6796848.594,469877.0291 62 | Eliot,FoxLake,Lowland,SSPolygon,89,15,6796843.691,469842.534 63 | Eliot,FoxLake,Lowland,SSPolygon,89,9,6796830.91,469827.1575 64 | Eliot,FoxLake,Lowland,SSPolygon,89,14,6796853.289,469799.318 65 | Eliot,FoxLake,Lowland,SSPolygon,89,12,6796817.835,469784.2797 66 | Eliot,FoxLake,Lowland,SSPolygon,89,10,6796812.723,469793.5348 67 | Eliot,FoxLake,Lowland,SSPolygon,89,13,6796821.899,469785.0872 68 | Eliot,FoxLake,Lowland,SSPolygon,89,11,6796814.147,469781.0331 69 | Eliot,FoxLake,Lowland,SSPolygon,159,1,6797215.359,469905.572 70 | Eliot,FoxLake,Lowland,SSPolygon,159,2,6797206.191,469912.9905 71 | Eliot,FoxLake,Lowland,SSPolygon,159,3,6797203.94,469943.5954 72 | Eliot,FoxLake,Lowland,SSPolygon,159,4,6797202.858,469895.6316 73 | Eliot,FoxLake,Lowland,SSPolygon,159,5,6797194.902,469889.1922 74 | Eliot,FoxLake,Lowland,SSPolygon,159,6,6797204.779,469878.4889 75 | Eliot,FoxLake,Lowland,SSPolygon,159,7,6797205.31,469870.0078 76 | Eliot,FoxLake,Lowland,SSPolygon,159,8,6797186.831,469856.0113 77 | Eliot,FoxLake,Lowland,SSPolygon,159,9,6797169.234,469850.342 78 | Eliot,FoxLake,Lowland,SSPolygon,159,10,6797150.315,469847.3502 79 | Eliot,FoxLake,Lowland,SSPolygon,159,11,6797135.268,469840.288 80 | Eliot,FoxLake,Lowland,SSPolygon,159,12,6797125.43,469837.8281 81 | Eliot,FoxLake,Lowland,SSPolygon,159,13,6797121.712,469844.4725 82 | Eliot,FoxLake,Lowland,SSPolygon,159,14,6797136.985,469852.2816 83 | Eliot,FoxLake,Lowland,SSPolygon,159,15,6797162.709,469858.8117 84 | Eliot,FoxLake,Lowland,SSPolygon,159,16,6797178.135,469866.1774 85 | Eliot,FoxLake,Lowland,SSPolygon,159,17,6797187.21,469872.7701 86 | Eliot,FoxLake,Lowland,SSPolygon,159,18,6797176.944,469888.4768 87 | 
Eliot,FoxLake,Lowland,SSPolygon,159,19,6797171.087,469910.6188 88 | Eliot,FoxLake,Lowland,SSPolygon,159,20,6797182.537,469924.3886 89 | Eliot,FoxLake,Lowland,SSPolygon,159,21,6797174.278,469956.3611 90 | Eliot,FoxLake,Lowland,SSPolygon,159,22,6797169.264,469966.2896 91 | Eliot,FoxLake,Lowland,SSPolygon,185,1,6797141.76,469950.5774 92 | Eliot,FoxLake,Lowland,SSPolygon,185,2,6797134.16,469928.8803 93 | Eliot,FoxLake,Lowland,SSPolygon,185,3,6797133.061,469886.2447 94 | Eliot,FoxLake,Lowland,SSPolygon,185,4,6797112.249,469847.2647 95 | Eliot,FoxLake,Lowland,SSPolygon,185,5,6797112.324,469865.5345 96 | Eliot,FoxLake,Lowland,SSPolygon,185,6,6797121.118,469891.7095 97 | Eliot,FoxLake,Lowland,SSPolygon,185,7,6797127.573,469922.8565 98 | Eliot,FoxLake,Lowland,SSPolygon,185,8,6797126.939,469938.0401 99 | Eliot,FoxLake,Lowland,SSPolygon,38,31,6797091.933,470016.4598 100 | Eliot,FoxLake,Lowland,SSPolygon,38,30,6797079.967,470007.3238 101 | Eliot,FoxLake,Lowland,SSPolygon,38,29,6797067.498,469997.9275 102 | Eliot,FoxLake,Lowland,SSPolygon,38,28,6797063.273,469988.7003 103 | Eliot,FoxLake,Lowland,SSPolygon,38,27,6797070.821,469974.1372 104 | Eliot,FoxLake,Lowland,SSPolygon,38,26,6797063.943,469959.6072 105 | Eliot,FoxLake,Lowland,SSPolygon,38,25,6797049.503,469942.8299 106 | Eliot,FoxLake,Lowland,SSPolygon,38,24,6797039.73,469931.6067 107 | Eliot,FoxLake,Lowland,SSPolygon,38,23,6797030.65,469922.4277 108 | Eliot,FoxLake,Lowland,SSPolygon,38,22,6797020.646,469910.829 109 | Eliot,FoxLake,Lowland,SSPolygon,233,1,6797526.926,469768.1144 110 | Eliot,FoxLake,Lowland,SSPolygon,233,2,6797486.016,469794.4904 111 | Eliot,FoxLake,Lowland,SSPolygon,233,3,6797400.062,469863.5996 112 | Eliot,FoxLake,SW_Facing,SSPolygon,380,1,6803013.014,465080.2516 113 | Eliot,FoxLake,SW_Facing,SSPolygon,380,2,6803089.116,465135.0063 114 | Eliot,FoxLake,SW_Facing,SSPolygon,389,1,6803262.384,464081.8498 115 | Eliot,FoxLake,SW_Facing,SSPolygon,389,2,6803618.396,464040.2266 116 | Eliot,FoxLake,SW_Facing,SSPolygon,391,1,6803587.361,464040.0052 117 | Eliot,FoxLake,SW_Facing,SSPolygon,391,2,6803174.081,464080.9813 118 | Eliot,FoxLake,Lowland,SSIsland,56,NA,6796740.729,470183.8429 119 | Eliot,FoxLake,Lowland,SSIsland,58,NA,6796737.425,470200.11 120 | Eliot,FoxLake,Lowland,SSIsland,65,NA,6796729.556,470238.3273 121 | Eliot,FoxLake,Lowland,SSIsland,67,NA,6796738.312,470243.0132 122 | Eliot,FoxLake,Lowland,SSIsland,68,NA,6796734.045,470246.1296 123 | Eliot,FoxLake,Lowland,SSIsland,97,NA,6796940.921,469902.9191 124 | Eliot,FoxLake,Lowland,SSIsland,98,NA,6796929.918,469896.6909 125 | Eliot,FoxLake,Lowland,SSIsland,99,NA,6796923.639,469901.6075 126 | Eliot,FoxLake,Lowland,SSIsland,114,NA,6796836.526,469792.1375 127 | Eliot,FoxLake,Lowland,SSIsland,156,NA,6796980.95,470060.4577 128 | Eliot,FoxLake,Lowland,SSIsland,182,NA,6797146.336,469972.2485 129 | Eliot,FoxLake,Lowland,SSIsland,183,NA,6797152.1,469950.1815 130 | Eliot,FoxLake,Lowland,SSIsland,184,NA,6797146.21,469953.2912 131 | Eliot,FoxLake,Lowland,SSIsland,212,NA,6796949.689,469802.0181 132 | Eliot,FoxLake,Lowland,SSIsland,213,NA,6796948.007,469797.7336 133 | Eliot,FoxLake,Lowland,SSIsland,214,NA,6796936.248,469789.6354 134 | Eliot,FoxLake,Lowland,SSIsland,231,NA,6797483.691,469755.6377 135 | Eliot,FoxLake,Lowland,SSIsland,232,NA,6797514.164,469726.8687 136 | Eliot,FoxLake,Lowland,SSIndividual,60,NA,6796747.059,470205.7828 137 | Eliot,FoxLake,Lowland,SSIndividual,61,NA,6796737.175,470215.1502 138 | Eliot,FoxLake,Lowland,SSIndividual,62,NA,6796734.953,470215.1313 139 | 
Eliot,FoxLake,Lowland,SSIndividual,63,NA,6796725.697,470212.2823 140 | Eliot,FoxLake,Lowland,SSIndividual,64,NA,6796727.524,470235.7011 141 | Eliot,FoxLake,Lowland,SSIndividual,66,NA,6796736.703,470238.8686 142 | Eliot,FoxLake,Lowland,SSIndividual,96,NA,6796948.488,469901.2824 143 | Eliot,FoxLake,Lowland,SSIndividual,203,NA,6797043.34,469928.7998 144 | Eliot,FoxLake,Lowland,SSIndividual,204,NA,6797039.158,469928.6471 145 | Eliot,FoxLake,Lowland,SSIndividual,208,NA,6797019.777,469904.491 146 | -------------------------------------------------------------------------------- /example-bivariate-timeseries-and-ti.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "More time series; bivariate smooths and ti()" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | fig_width: 10 10 | fig_height: 4 11 | --- 12 | 13 | ## Background 14 | 15 | This example is an extension of the Fort Lauderdale time series model. Here we look at two additional advanced features, namely 16 | 17 | i. allowing the seasonal and trend terms to interact, and 18 | ii. setting up an ANOVA-like decomposition for the interaction smooth using `ti()` terms 19 | 20 | The model in the earlier example estimated a constant non-linear seasonal (within-year) effect and a constant non-linear trend (between-year) effect. Such a model is unable to capture any potential change in the seasonal effect (the within-year distribution of temperatures) over time. Or, put another way, it is unable to capture potentially different trends in temperature at particular times of the year. 21 | 22 | This example extends the simpler model to model changes in temperature more generally. 23 | 24 | ### Loading and viewing the data 25 | 26 | The time series we'll look at is a long-term temperature and precipitation 27 | data set from the convention center here at Fort Lauderdale, consisting of 28 | monthly mean temperature (in degrees C), the number of days it rained that 29 | month, and the amount of precipitation that month (in mm). The data range from 30 | 1950 to 2015, with columns for year and month (with Jan. = 1). The data are from 31 | a great web-app called FetchClim2, and the link for the data [I used is 32 | here](http://fetchclimate2.cloudapp.net/#page=geography&dm=values&t=years&v=airt(13,6,2,1),prate(12,7,2,1),wet(1)&y=1950:1:2015&dc=1,32,60,91,121,152,182,213,244,274,305,335,366&hc=0,24&p=26.099,-80.123,Point%201&ts=2016-07-17T21:57:11.213Z). 33 | 34 | Here we'll load it and plot the air temperature data. 35 | 36 | ```{r, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE, highlight = TRUE} 37 | library("mgcv") 38 | library("ggplot2") 39 | florida <- read.csv("./data/time_series/Florida_climate.csv", 40 | stringsAsFactors = FALSE) 41 | head(florida) 42 | 43 | ggplot(data = florida, aes(x = month, y = air_temp, color = year, group = year)) + 44 | geom_point() + 45 | geom_line() 46 | ``` 47 | 48 | ## Fitting the smooth interaction model 49 | 50 | Following from the earlier example, we fit a smooth interaction model using a tensor product spline of `year` and `month`. Notice how we specify the marginal basis types as a character vector. Here I restrict the model to AR(1) errors as this was the best model in the earlier example and it is unlikely that additional unmodelled temporal structure will turn up when we use a more general model!
51 | 52 | ```{r fit-smooth-interaction, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE} 53 | m.full <- gamm(air_temp ~ te(year, month, bs = c("tp", "cc")), data = florida, 54 | correlation = corAR1(form = ~ 1 | year), 55 | knots = list(month = c(0.5, 12.5))) 56 | layout(matrix(1:3, nrow = 1)) 57 | plot(m.full$gam, scheme = 1, theta = 40) 58 | plot(m.full$gam, scheme = 1, theta = 80) 59 | plot(m.full$gam, scheme = 1, theta = 120) 60 | layout(1) 61 | summary(m.full$gam) 62 | ``` 63 | 64 | The plot of the tensor product smooth is quite illustrative of what the model is attempting to do. The earlier model would be forced to retain the same seasonal curve throughout the time series. Now, with the more complex model, the seasonal curve is allowed to change shape smoothly through time. 65 | 66 | Our task now is to test whether this additional complexity is worth the extra effort. Are some months warming more than others in Fort Lauderdale? 67 | 68 | ## Fitting the ANOVA-decomposed model 69 | 70 | The logical thing to do to test whether some months are warming more than others would be to drop the interaction term from the model and compare the simpler and more complex models with AIC or a likelihood ratio test. But if you look back at the model we just fitted, there is no term in that model that relates just to the interaction! The tensor product contains both the main and interactive effects. What's worse is that we can't (always) compare this model with the one we fitted in the earlier example because we need to be careful to ensure the models are properly nested. This is where the `ti()` smoother comes in. 71 | 72 | A `ti()` smoother is like a standard tensor product (`te()`) smooth, but without the main effects smooths. Now we can build a model with separate main effects smooths for `year` and `month` and add in the interaction smooth using `ti()`. This will allow us to fit two properly nested models that can be compared to test our null hypothesis that the months are warming at the same rate. 73 | 74 | Let's fit those models 75 | 76 | ```{r fit-anova-decomp, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE} 77 | m.main <- gam(air_temp ~ s(year, bs = "tp") + s(month, bs = "cc"), data = florida, 78 | knots = list(month = c(0.5, 12.5))) 79 | m.int <- gam(air_temp ~ ti(year, bs = "tp") + ti(month, bs = "cc") + 80 | ti(year, month, bs = c("tp", "cc"), k = 11), data = florida, 81 | knots = list(month = c(0.5, 12.5))) 82 | ``` 83 | 84 | If you do this with `gamm()` the model fitting will fail, and for good reason. At least the two `gam()` models converged so we can see what is going on. Compare the two models using `AIC()` 85 | 86 | ```{r aic-anova-decomp, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE} 87 | AIC(m.main, m.int) 88 | ``` 89 | 90 | AIC tells us that there is some support for the interaction model, but only when we force a more complex surface by specifying `k`. 91 | 92 | Regardless, the interaction term affords only a little over a 1% improvement in deviance explained; whilst the interaction may be favoured by AIC, any variation in the seasonal effect over time must be small. 93 | 94 | ## Exercises 95 | 96 | 1.
For an example where there is a stronger change in the seasonal effect, see my two blog posts on applying these models to the Central England Temperature series: 97 | - [Post 1](http://www.fromthebottomoftheheap.net/2015/11/21/climate-change-and-spline-interactions/) 98 | - [Post 2](http://www.fromthebottomoftheheap.net/2015/11/23/are-some-seasons-warming-more-than-others/) 99 | 100 | These two posts take you through a full analysis of this data series. 101 | -------------------------------------------------------------------------------- /example-forest-health.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Markov random field smooths & German forest health data" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | --- 10 | 11 | # Introduction 12 | 13 | In this example, we look at a spatial data set of forest health data from a stand in Germany. We can treat the individual trees as discrete spatial units and fit a spatial term using a special type of spline basis, a Markov random field. 14 | 15 | The data come from a project in the forest of Rothenbuch, Germany, which has been ongoing since 1982. The project studies five species but here we just consider the beech data. Each year the condition of each of 83 trees is categorized into one of nine ordinal classes in terms of the degree (percentage) of defoliation. A defoliation value of 0% indicates a healthy beech tree, whilst a value of 100% indicates a tree is dead. For the purposes of this example, several of the nine ordinal classes are merged to form a three-class response variable of interest 16 | 17 | 1. 0%, healthy trees that I'll refer to as the `"low"` defoliation group 18 | 2. 12.5% -- 37.5% defoliation, which I refer to as the `"med"` group, and 19 | 3. 50% -- 100% defoliation, which is the `"high"` group. 20 | 21 | The original classes were `r paste0(c(0,12.5,37.5,50,62.5,75, 87.5, 100), "%")` and are in variable `defol`. 22 | 23 | Alongside the response variable, a number of continuous and categorical covariates are recorded 24 | 25 | - `id` --- location ID number 26 | - `year` --- year of recording 27 | - `x` and `y` --- the x- and y-coordinates of each location 28 | - `age` --- average age of trees at a location 29 | - `canopyd` --- canopy density at the location, in % 30 | - `gradient` --- slope, in % 31 | - `alt` --- altitude above sea level in m 32 | - `depth` --- soil depth in cm 33 | - `ph` --- soil pH at 0--2cm depth 34 | - `watermoisture` --- soil moisture in three categories 35 | 1. `1` moderately dry 36 | 2. `2` moderately moist 37 | 3. `3` moist or temporarily wet 38 | - `alkali` --- fraction of alkali ions in four categories 39 | 1. `1` very low 40 | 2. `2` low 41 | 3. `3` moderate 42 | 4. `4` high 43 | - `humus` --- thickness of the soil humus layer in five categories 44 | 1. `0` 0cm 45 | 2. `1` 1cm 46 | 3. `2` 2cm 47 | 4. `3` 3cm 48 | 5. `4` $>$3cm 49 | - `type` --- type of forest 50 | - `0` deciduous forest 51 | - `1` mixed forest 52 | - `fert` --- fertilization 53 | - `0` not fertilized 54 | - `1` fertilized 55 | 56 | The aim of the example is to investigate the effect of the measured covariates on the degree of defoliation and to quantify any temporal trend and spatial effect of geographic location in the data set, while adjusting for the effects of the other covariates.
57 | 58 | The data are extensively analysed in Fahrmeir, Kneib, Lang, and Marx (2013) *Regression: Models, Methods and Applications*. Springer. 59 | 60 | # Getting started 61 | 62 | Begin by loading the packages we'll use in this example 63 | 64 | ```{r load-packages} 65 | ## Load packages 66 | library("mgcv") 67 | library("ggplot2") 68 | ``` 69 | 70 | Next, load the forest health data set 71 | 72 | ```{r read-data} 73 | ## Read in the forest health data 74 | forest <- read.table("./data/forest-health/beech.raw", header = TRUE, na.strings = ".") 75 | head(forest) 76 | ``` 77 | 78 | The data require a little processing to ensure they are correctly coded. The chunk below does the following: 79 | 80 | 1. Makes a nicer stand label so that we can sort stands numerically even though the id is character, 81 | 2. Aggregates the defoliation data into an ordered categorical variable for low, medium, or high levels of defoliation. This is converted to a numeric variable for modelling, 82 | 3. Converts the categorical variables to factors. `humus` is converted to a 4-level factor, 83 | 4. Removes some `NA`s, and 84 | 5. Summarises the response variable in a table. 85 | 86 | ```{r process-forest-data} 87 | forest <- transform(forest, id = factor(formatC(id, width = 2, flag = "0"))) 88 | 89 | ## Aggregate defoliation & convert categorical vars to factors 90 | levs <- c("low","med","high") 91 | forest <- transform(forest, 92 | aggDefol = as.numeric(cut(defol, breaks = c(-1,10,45,101), 93 | labels = levs)), 94 | watermoisture = factor(watermoisture), 95 | alkali = factor(alkali), 96 | humus = cut(humus, breaks = c(-0.5, 0.5, 1.5, 2.5, 3.5), 97 | labels = 1:4), 98 | type = factor(type), 99 | fert = factor(fert)) 100 | forest <- droplevels(na.omit(forest)) 101 | head(forest) 102 | with(forest, table(aggDefol)) 103 | ``` 104 | 105 | To view the spatial arrangement of the forest stand, plot the unique `x` and `y` coordinate pairs 106 | 107 | ```{r plot-forest-stand} 108 | ## Plot data 109 | ggplot(unique(forest[, c("x","y")]), aes(x = x, y = y)) + 110 | geom_point() 111 | ``` 112 | 113 | # Non-spatial ordered categorical GAM 114 | 115 | Our first model will ignore the spatial component of the data. All continuous variables except `ph` and `depth` are modelled using splines, and all categorical variables are included in the model. Note that we can speed up fitting by turning on some multi-threaded parallel processing in parts of the fitting algorithm via the `nthreads` control argument. The `ocat` family is used and we specify that there are three classes. Smoothness selection is via REML. 116 | 117 | ```{r fit-naive-model} 118 | ## Model 119 | ctrl <- gam.control(nthreads = 3) 120 | forest.m1 <- gam(aggDefol ~ ph + depth + watermoisture + alkali + humus + type + fert + 121 | s(age) + s(gradient, k = 20) + s(canopyd) + s(year) + s(alt), 122 | data = forest, family = ocat(R = 3), method = "REML", 123 | control = ctrl) 124 | ``` 125 | 126 | The model summary 127 | 128 | ```{r summary-naive} 129 | summary(forest.m1) 130 | ``` 131 | 132 | indicates all model terms are significant at the 95% level, especially the smooth terms. Don't pay too much attention to this though as we have not yet accounted for spatial structure in the data and several of the variables are likely to be spatially autocorrelated and hence the identified effects may be spurious. 133 | 134 | This is somewhat confirmed by the form of the fitted functions; rather than smooth, monotonic functions, many terms are highly non-linear and difficult to interpret.
135 | 136 | ```{r plot-naive-smooths} 137 | plot(forest.m1, pages = 1, shade = 2, scale = 0) 138 | ``` 139 | We might expect that older trees are more susceptible to damage yet confusingly there is a decreasing effect for the very oldest trees. The smooth of `gradient` is uninterpretable. Trees in low and high altitude areas are less damaged than those in areas of intermediate elevation, which is also counterintuitive. 140 | 141 | Running `gam.check()` 142 | 143 | ```{r gam-check-naive} 144 | gam.check(forest.m1) 145 | ``` 146 | 147 | suggests no major problems, but the residual plot is difficult to interpret owing to the categorical nature of the response variable. The printed output suggests some smooth terms may need their basis dimension increasing. Before we do this, however, we should add a spatial effect to the model. 148 | 149 | # Spatial GAM via an MRF smooth 150 | 151 | For these data, it would be more natural to fit a spatial effect via a 2-d smoother as we've considered in other examples. However, we can consider the trees as being discrete spatial units and fit a spatial effect via a Markov random field (MRF) smooth. To fit the MRF smoother, we need information on the neighbours of each tree. In this example, any tree within 1.8km of a focal tree was considered a neighbour of that focal tree. This neighbourhood information is stored in a BayesX graph file, which we can convert into the format needed for **mgcv**'s MRF basis function. To facilitate reading the graph file, load the utility function `gra2mgcv()` 152 | 153 | ```{r load-graph-fun} 154 | ## source graph reading function 155 | source("./code_snippets/gra2mgcv.R") 156 | ``` 157 | 158 | Next we load the `.gra` file and do some manipulations to match the `forest` environmental data file 159 | 160 | ```{r load-graph} 161 | ## Read in graph file and output list required by mgcv 162 | nb <- gra2mgcv("./data/forest-health/foresthealth.gra") 163 | nb <- nb[order(as.numeric(names(nb)))] 164 | names(nb) <- formatC(as.numeric(names(nb)), width = 2, flag = "0") 165 | ``` 166 | 167 | Look at the structure of `nb`: 168 | 169 | ```{r head-nb} 170 | head(nb) 171 | ``` 172 | 173 | In **mgcv** the MRF basis can be specified by one or more of 174 | 175 | 1. `polys`; coordinates of vertices defining spatial polygons for each discrete spatial unit. Any two spatial units that share one or more vertices are considered neighbours, 176 | 2. `nb`; a list with one component per spatial unit, where each component contains indices of the neighbouring components of the current spatial unit, and/or 177 | 3. `penalty`; the actual penalty matrix of the MRF basis. This is an $N$ by $N$ matrix with the number of neighbours of each unit on the diagonal, and the $j$th column of the $i$th row set to `-1` if the $j$th spatial unit is a neighbour of the $i$th unit. Elsewhere the penalty matrix is all `0`s; a small sketch of this construction is given below. 178 | 179 | **mgcv** will create the penalty matrix for you if you supply `polys` or `nb`. As we don't have polygons here, we'll use the information from the `.gra` file converted to the format needed by `gam()` to indicate the neighbourhood. 180 | 181 | The `nb` list is passed along using the `xt` argument of the `s()` function. For the MRF basis, `xt` is a named list with one or more of the components listed above. In the model call below we use `xt = list(nb = nb)` to pass on the neighbourhood list.
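To make the penalty in point 3 concrete, here is a minimal sketch (an illustration only, and not needed for the model fit; **mgcv** constructs the penalty itself when given `nb`) of building such a matrix from a neighbourhood list whose components are assumed to hold the positional indices of each unit's neighbours:

```{r penalty-sketch}
## sketch: build an MRF penalty matrix from a neighbourhood list `nbl`,
## assuming the i-th component holds the indices of unit i's neighbours
nb2penalty <- function(nbl) {
    n <- length(nbl)
    P <- matrix(0, n, n, dimnames = list(names(nbl), names(nbl)))
    for (i in seq_len(n)) {
        P[i, nbl[[i]]] <- -1          # -1 marks each neighbour of unit i
        P[i, i] <- length(nbl[[i]])   # number of neighbours on the diagonal
    }
    P
}
```

A matrix like this could be supplied directly via `xt = list(penalty = ...)`; here we instead let **mgcv** build it from the neighbourhood list.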
To indicate that an MRF smooth is needed, we use `bs = "mrf"` and the covariate of the smooth is the factor indicating to which spatial unit each observation belongs. In the call below we use the `id` variable. Note that this doesn't need to be a factor, just something that can be coerced to one. 182 | 183 | All other aspects of the model fit remain the same hence we use `update()` to add the MRF smooth without repeating everything from the original call. 184 | 185 | ```{r fit-mrf-model} 186 | ## Fit model with MRF 187 | ## forest.m2 <- gam(aggDefol ~ ph + depth + watermoisture + alkali + humus + type + fert + 188 | ## s(age) + s(gradient, k = 20) + s(canopyd) + s(year) + s(alt) + 189 | ## s(id, bs = "mrf", xt = list(nb = nb)), 190 | ## data = forest, family = ocat(R = 3), method = "REML", 191 | ## control = ctrl) 192 | forest.m2 <- update(forest.m1, . ~ . + s(id, bs = "mrf", xt = list(nb = nb))) 193 | ``` 194 | 195 | Look at the model summary: 196 | 197 | ```{r summary-mrf-model} 198 | summary(forest.m2) 199 | ``` 200 | 201 | What differences do you see between the model with the MRF spatial effect and the first model that we fitted? 202 | 203 | Quickly create plots of the fitted smooth functions 204 | 205 | ```{r plot-mrf-smooths} 206 | plot(forest.m2, pages = 1, shade = 2, scale = 0, seWithMean = TRUE) 207 | ``` 208 | 209 | Notice that here we use `seWithMean = TRUE` as one of the terms has been shrunk back to a linear function and would otherwise have a *bow tie* confidence interval. 210 | 211 | Compare these fitted functions with those from the original model. Are these more in keeping with expectations? 212 | 213 | Model diagnostics are difficult with models for discrete responses such as these. Instead we can interrogate the model to derive quantities of interest that illustrate or summarise the model fit. First we start by generating posterior probabilities for the three ordinal defoliation/damage categories using `predict()`. This is similar to the code Eric showed this morning for the ordered categorical family. 214 | 215 | ```{r fitted-values} 216 | fit <- predict(forest.m2, type = "response") 217 | colnames(fit) <- levs 218 | head(fit) 219 | ``` 220 | 221 | The `predict()` method returns a 3-column matrix, one column per category. The entries in the matrix are the posterior probabilities of class membership, with rows summing to 1. As we'll see later, we can take a *majority wins* approach and assign each observation a fitted class membership and compare these to the known classes. 222 | 223 | For easier visualisation it would be nicer to have these fitted probabilities in a tidy form, which we now do via a few manipulations 224 | 225 | ```{r process-fitted-values} 226 | fit <- setNames(stack(as.data.frame(fit)), c("Prob", "Defoliation")) 227 | fit <- transform(fit, Defoliation = factor(Defoliation, labels = levs)) 228 | fit <- with(forest, cbind(fit, x, y, year)) 229 | head(fit) 230 | ``` 231 | 232 | The fitted values are now ready for plotting. 
Here we plot just the years 2000--2004, showing the spatial arrangement of trees, faceted by defoliation/damage category: 233 | 234 | ```{r plot-fitted} 235 | ggplot(subset(fit, year >= 2000), aes(x = x, y = y, col = Prob)) + 236 | geom_point(size = 1.2) + 237 | facet_grid(Defoliation ~ year) + 238 | coord_fixed() + 239 | theme(legend.position = "top") 240 | ``` 241 | 242 | A more complex use of the model involves predicting change in class posterior probability over time whilst holding all other variables at the observed mean or category mode 243 | 244 | ```{r timeseries-predictions} 245 | N <- 200 246 | pdat <- with(forest, expand.grid(year = seq(min(year), max(year), length = N), 247 | age = mean(age, na.rm = TRUE), 248 | gradient = mean(gradient, na.rm = TRUE), 249 | canopyd = mean(canopyd, na.rm = TRUE), 250 | alt = mean(alt, na.rm = TRUE), 251 | depth = mean(depth, na.rm = TRUE), 252 | ph = mean(ph, na.rm = TRUE), 253 | humus = factor(2, levels = levels(humus)), 254 | watermoisture = factor(2, levels = levels(watermoisture)), 255 | alkali = factor(2, levels = levels(alkali)), 256 | type = factor(0, levels = c(0,1)), 257 | fert = factor(0, levels = c(0,1)), 258 | id = levels(id))) 259 | pred <- predict(forest.m2, newdata = pdat, type = "response", exclude = "s(id)") 260 | colnames(pred) <- levs 261 | pred <- setNames(stack(as.data.frame(pred)), c("Prob","Defoliation")) 262 | pred <- cbind(pred, pdat) 263 | pred <- transform(pred, Defoliation = factor(Defoliation, levels = levs)) 264 | ``` 265 | 266 | A plot of the predictions is produced using 267 | 268 | ```{r plot-timeseries-predictions} 269 | ggplot(pred, aes(x = year, y = Prob, colour = Defoliation)) + 270 | geom_line() + 271 | theme(legend.position = "top") 272 | ``` 273 | 274 | Using similar code, produce a plot for the effect of `age` holding other values at the mean or mode. 275 | 276 | The final summary of the model that we'll produce is a confusion matrix of observed and predicted class membership. Technically, this is over-optimistic as we are using the same data to both fit and test the model, but it is an illustration of how well the model does at fitting the observed classes. 277 | 278 | ```{r confusion-matrix} 279 | fit <- predict(forest.m2, type = "response") 280 | fitClass <- factor(levs[apply(fit, 1, which.max)], levels = levs) 281 | obsClass <- with(forest, factor(aggDefol, labels = levs)) 282 | sum(fitClass == obsClass) / length(fitClass) 283 | table(fitClass, obsClass) 284 | ``` 285 | 286 | Just don't take it as any indication of how well the model will do at predicting the defoliation class for a new year or tree. 287 | 288 | How much better does the model with the MRF spatial effect fit the data than the original model? We can use AIC as one means of answering that question 289 | 290 | ```{r aic} 291 | AIC(forest.m1, forest.m2) 292 | ``` 293 | 294 | Which model does AIC indicate as fitting the data best? 295 | 296 | ## Exercise 297 | 298 | As an additional exercise, replace the MRF smooth with a 2-d spline of the `x` and `y` coordinates and produce maps of the class probabilities over the spatial domain. You'll need to use ideas and techniques from some of the other spatial examples in order to complete this task. A possible starting point is sketched below.
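One (hedged) way to begin, assuming the same covariates as before, is to swap in the 2-d smooth via `update()` — the mapping of predicted class probabilities is left to you:

```{r xy-exercise-sketch}
## One possible starting point (not a full solution): swap the MRF for a 2-d spline
forest.m3 <- update(forest.m1, . ~ . + s(x, y))
summary(forest.m3)
```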
299 | -------------------------------------------------------------------------------- /example-linear-functionals.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Linear functionals; or: what to do when you've got a bunch of x-values and only one y" 3 | author: Eric Pedersen 4 | output: 5 | html_document: 6 | toc: true 7 | toc_float: true 8 | theme: readable 9 | highlight: haddock 10 | --- 11 | 12 | ##1. Background 13 | 14 | Up to now, we've been looking at data where every predictor we're using to 15 | predict our y-values consists of a single value for each y. However, not all 16 | predictor data looks like this. In many cases, a single predictor may have 17 | several values associated with the same outcome variable. For instance, let's 18 | say we've measured total ecosystem production in several different lakes, as 19 | well as temperature along a gradient away from shoreline in the lake. We want to know: 20 | 21 | 1. how well temperature predicts production 22 | 2. whether temperatures at different distances from the shore have different effects on 23 | production (as production varies a lot between the littoral and pelagic zone). 24 | 25 | Now, it looks like there might be a few ways to answer this in a standard gam 26 | setting. First we could just average temperatures across locations and use that 27 | as a predictor to answer question 1. However, that doesn't give us any insight 28 | into question 2, and it's pretty easy to imagine a case where warmer temperatures 29 | at the shore increase production but warmer temperatures in the middle of the 30 | lake have very little effect, or where a single warm area could substantially 31 | increase production even in a cold lake. We could also try to fit a separate 32 | smooth of production on temperature at each shore distance. However, if we have 33 | a lot of distances, this ends up fitting a bunch of separate smooths, and we'd 34 | rapidly run out of degrees of freedom. Also, it would mean throwing away 35 | information; we'd expect that the effect of temperature a meter from the 36 | shore should be very similar to the effect of temperature right at the 37 | shoreline, but by fitting a separate smooth for each, we're ignoring that (and 38 | likely going to suffer from concurvity issues to boot!). 39 | 40 | What we need is a method that can account for the fact that we have multiple x 41 | values for a given predictor. Fortunately, `mgcv` can handle this pretty easily. 42 | It does this by allowing you to pass matrix-valued predictors to `s()` and `te()` 43 | terms. When you pass `mgcv` a matrix, it will fit the same smooth function to 44 | each of the columns of the matrix, then sum the transformed values across the 45 | columns of each row to give the estimated mean value for a given `y`. Let's say 46 | `y` is a one-dimensional outcome, with n measured values, and `x` is a matrix 47 | with n rows and k columns. The predicted values for the ith value of `y` would 48 | be: $y_i \sim \sum_{j=1}^k f(x_{i,j})$, where $f(x_{i,j})$ is itself a sum of 49 | basis functions multiplied by model coefficients as before. 50 | 51 | In mathematical terms, this makes our smooth term a *functional*, where our 52 | outcome is a function of a vector of terms rather than a single term, and a 53 | *linear* functional, as it's a linear sum of smooth terms for each column; but you 54 | really don't need to know much about functionals to use these.
I only mention 55 | the functional terminology because, if you want to dig into this approach further or if 56 | you want to read the help files on these, you have to search 57 | `?linear.functional.terms`. 58 | 59 | There are a few major uses I've found for these. There are likely more than this, 60 | but these are all cases I've encountered. Also, each of these cases requires you 61 | to set up your predictor matrices carefully, so make sure you know what kind of 62 | functional you'll be fitting, and what predictor matrix is going where in the `s()` 63 | function. 64 | 65 | ### Case 1. Nonlinear averaging. 66 | The first case, and the simplest to fit, is to estimate 67 | a nonlinear average. Let's look at the lake example from before. Say we think 68 | that distance from shore shouldn't matter, but that production increases 69 | non-linearly with temperature. In that case, a cold lake with a few hot spots 70 | may actually be more productive than a lake that's consistently lukewarm. 71 | Therefore, if you average over temperatures before estimating a function, you'll 72 | miss this effect (as we know from statistics that the average of a non-linear 73 | function applied to x does not, in general, equal the function applied to the 74 | average of x). In this case, we can just provide a matrix of values (the predictor matrix) to `s()`: here the predictor matrix is a matrix of temperatures where each column is the temperature measured at one of the sampled distances from shore. 75 | We do this in the code below with the variable `temp_matrix`. 76 | 77 | 78 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE} 79 | library(dplyr) 80 | library(mgcv) 81 | library(ggplot2) 82 | library(tidyr) 83 | n_lakes = 200 84 | lake_mean_temps = rnorm(n_lakes, 25,6) 85 | lake_data = as.data.frame(expand.grid(lake = 1:n_lakes, 86 | shore_dist_m = c(0,1,5,10,20,40))) 87 | lake_data$temp = rnorm(n_lakes*6, lake_mean_temps[lake_data$lake], 8) 88 | lake_data$production = 5*dnorm(lake_data$temp, 30, sd = 5)/dnorm(1,0,5)+rnorm(n_lakes*6, 0, 1) 89 | lake_data = lake_data %>% group_by(lake)%>% 90 | mutate(production = mean(production), 91 | mean_temp = mean(temp))%>% 92 | spread(shore_dist_m, temp) 93 | 94 | 95 | lake_data$temp_matrix = lake_data %>% 96 | select(`0`:`40`) %>% 97 | as.matrix(.) 98 | head(lake_data) 99 | 100 | 101 | mean_temp_model = gam(production~s(mean_temp),data=lake_data) 102 | nonlin_temp_model = gam(production~s(temp_matrix), 103 | data=lake_data) 104 | 105 | layout(matrix(1:2, nrow=1)) 106 | plot(mean_temp_model) 107 | plot(nonlin_temp_model) 108 | layout(1) 109 | temp_predict_data = data.frame(temp = seq(10,40,length=100), 110 | mean_temp = seq(10,40,length=100)) 111 | 112 | temp_matrix = matrix(seq(10,40,length=100),nrow= 100,ncol=6, 113 | byrow = F) 114 | 115 | temp_predict_data = temp_predict_data %>% 116 | mutate(linear_average = as.numeric(predict(mean_temp_model,.)), 117 | nonlinear_average = as.numeric(predict(nonlin_temp_model, .)), 118 | true_function = 5*dnorm(temp, 30, sd = 5)/dnorm(1,0,5))%>% 119 | gather(model, value,linear_average:true_function) 120 | 121 | #This plots the two fitted models vs. the true model. 122 | ggplot(aes(temp, value,color=model), data=temp_predict_data)+ 123 | geom_line()+ 124 | scale_color_brewer(palette="Set1")+ 125 | theme_bw(20) 126 | ``` 127 | 128 | The nonlinear model is substantially more uncertain at the ends, 129 | but it fits the true function much more effectively.
The function from 130 | the average data substantially underestimates the effect of temperature on 131 | production. 132 | 133 | ### Case 2: Weighted averaging 134 | The next major case where linear functionals are useful is for weighted 135 | averages. Here, you have some predictor variable measured at various points at 136 | different distances or lags from a given point, or at different locations along 137 | some gradient. This could be a variable that's been measured at various distances 138 | away from each observed site, and you want to understand at what scale that 139 | variable will affect your parameter of interest. It could also be a predictor 140 | variable measured at several time points before the observation, and you want to 141 | know at what lags the two variables interact. 142 | 143 | In this case, we'll assume that the relationship between our variable of 144 | interest (x) and the outcome is linear at any given lag, but that linear 145 | relationship changes smoothly with the lag. We'll look at the case where both 146 | relationships are nonlinear next. Here we have to create two matrices. The first 147 | is the lag matrix, and it will have one column for each lag we're interested in. 148 | All the values in a given column will be equal to the lag value. 149 | The second matrix is the predictor matrix, the same as in Case 1 (see [the section below](#by_var) on `by=` terms for how this works). 150 | 151 | 152 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE} 153 | library(dplyr) 154 | library(mgcv) 155 | library(ggplot2) 156 | library(tidyr) 157 | n_lakes = 200 158 | lake_mean_temps = rnorm(n_lakes, 25,6) 159 | lake_data = as.data.frame(expand.grid(lake = 1:n_lakes, 160 | shore_dist_m = c(0,1,5,10,20,40))) 161 | lake_data$temp = rnorm(n_lakes*6, lake_mean_temps[lake_data$lake], 8) 162 | lake_data$production = with(lake_data, rnorm(n_lakes*6, 163 | temp*3*exp(-shore_dist_m/10),1)) 164 | lake_data = lake_data %>% group_by(lake)%>% 165 | mutate(production = mean(production), 166 | mean_temp = mean(temp))%>% 167 | spread(shore_dist_m, temp) 168 | 169 | #This is our lag matrix for this example 170 | shore_dist_matrix = matrix(c(0,1,5,10,20,40), nrow=n_lakes,ncol=6, byrow = T) 171 | head(shore_dist_matrix) 172 | 173 | #This is our predictor matrix 174 | lake_data$temp_matrix = lake_data %>% 175 | select(`0`:`40`) %>% 176 | as.matrix(.) 177 | head(lake_data) 178 | 179 | #Note: we need to set k=6 here, as we really only have 6 degrees of freedom (one 180 | #for each distance we've measured from the shore). 181 | 182 | nonlin_temp_model = gam(production~s(shore_dist_matrix, by= temp_matrix,k=6), 183 | data=lake_data) 184 | plot(nonlin_temp_model) 185 | 186 | #The fitted smooth shows how the (linear) effect of temperature on production 187 | #changes with distance from the shore; in this simulation the true weight 188 | #function declines exponentially with distance. 189 | 190 | 191 | ``` 192 | 193 | ## 2. Key concepts and functions 194 | 195 | ### using `by=` terms in smoothers {#by_var} 196 | One of the most useful features in `mgcv` is the `by=` argument for smooths. 197 | This has two uses. 198 | 199 | 1. For a given smooth (say, `y~s(x)`), if you set `s(x, by=group)`, where group 200 | is some factor predictor, `mgcv` will fit a separate smooth of x for 201 | each level of that factor. The model will produce a different smooth $s_k(x)$ 202 | for each kth level, allowing you to test if a given relationship varies between 203 | groups. 204 | 2.
If instead you set `s(x, by=z)` where z is a numerical value, `mgcv` will 205 | fit a varying slopes regression. Instead of modeling y as a smooth function of 206 | x, this will model y as a *linear* function of z, where the slope of the 207 | relationship between z and y changes smoothly as a function of x, so the 208 | predicted value for the ith value of y would be: $y_i \sim f(x_i) \cdot z_i$. 209 | If you're using matrix predictors as discussed above, a numeric 210 | `by` variable `z` also has to be a matrix with the same dimensions as `x`. 211 | The predicted value in this case will be: $y_i \sim \sum_{j=1}^k f(x_{i,j})\cdot z_{i,j}$, where $j$ indexes the columns of the matrices $x$ and $z$. 212 | 213 | 214 | ##Calculating dispersal kernels 215 | As a final example, we can use a linear functional of the log distances from each seedling plot to every seed source to model seedling counts, and compare it with a model that only uses the distance to the nearest source. 216 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE} 217 | yukon_seedling_data = read.csv("data/yukon_seeds/seed_data.csv") 218 | yukon_source_data =read.csv("data/yukon_seeds/seed_source_locations.csv") 219 | seed_dist = matrix(0, nrow = nrow(yukon_seedling_data), 220 | ncol= nrow(yukon_source_data)) 221 | for(i in 1:nrow(yukon_seedling_data)){ #distance from each plot to every source 222 | seed_dist[i,] = sqrt((yukon_source_data$X- yukon_seedling_data$X[i])^2 + (yukon_source_data$Y- yukon_seedling_data$Y[i])^2) 223 | } 224 | seed_dist_l = log(seed_dist) 225 | yukon_seedling_data$min_dist_l = apply(seed_dist_l, MARGIN = 1,min) #log distance to the nearest source 226 | 227 | basic_dispersal_model = gam(n_spruce~s(min_dist_l)+offset(log(plot_area_m2)), #nearest source only 228 | data=yukon_seedling_data, family=nb) 229 | full_dispersal_model = gam(n_spruce~s(seed_dist_l)+offset(log(plot_area_m2)), #linear functional over all sources 230 | data=yukon_seedling_data, family=nb) 231 | ``` 232 | 233 | -------------------------------------------------------------------------------- /example-spatial-mexdolphins-solutions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Further analysis of pantropical spotted dolphins in the Gulf of Mexico" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | 10 | --- 11 | 12 | # Preamble 13 | 14 | This exercise is based on the [Appendix of Miller et al 2013](http://distancesampling.org/R/vignettes/mexico-analysis.html). In this example we're ignoring all kinds of important things like detectability and availability. This should not be treated as a serious analysis of these data! For a more complete treatment of detection-corrected abundance estimation via distance sampling and generalized additive models, see Miller et al. (2013). 15 | 16 | From that appendix: 17 | 18 | *The analysis is based on a dataset of observations of pantropical dolphins in the Gulf of Mexico (shipped with Distance 6.0 and later). For convenience the data are bundled in an `R`-friendly format, although all of the code necessary for creating the data from the Distance project files is available [on github](http://github.com/dill/mexico-data). The OBIS-SEAMAP page for the data may be found at the [SEFSC GoMex Oceanic 1996](http://seamap.env.duke.edu/dataset/25) survey page.* 19 | 20 | 21 | ## Doing these exercises 22 | 23 | Probably the easiest way to do these exercises is to open this document in RStudio and go through the code blocks one by one (hitting the "play" button in the editor window), filling in the code where necessary and executing the commands one-by-one. You can then compile the document once you're done to check that everything works.
24 | 25 | # Data format 26 | 27 | The data are provided in the `data/mexdolphins` folder as the file `mexdolphins.RData`. Loading this we can see what is provided: 28 | 29 | 30 | ```{r loaddata} 31 | load("data/mexdolphins/mexdolphins.RData") 32 | ls() 33 | ``` 34 | 35 | - `mexdolphins` the `data.frame` containing the observations and covariates, used to fit the model. 36 | - `pred_latlong` an `sp` object that has the shapefile for the prediction grid, used for fancy graphs 37 | - `preddata` prediction grid without any fancy spatial stuff 38 | 39 | Looking further into the `mexdolphins` frame we see: 40 | 41 | ```{r frameinspect} 42 | str(mexdolphins) 43 | ``` 44 | 45 | A brief explanation of each entry: 46 | 47 | - `Sample.Label` identifier for the effort "segment" (approximately square sampling area) 48 | - `Transect.Label` identifier for the transect that this segment belongs to 49 | - `longitude`, `latitude` location in lat/long of this segment 50 | - `x`, `y` location in projected coordinates (projected using the [North American Lambert Conformal Conic projection](https://en.wikipedia.org/wiki/Lambert_conformal_conic_projection)) 51 | - `Effort` the length of the current segment 52 | - `depth` the bathymetry at the segment's position 53 | - `count` number of dolphins observed in this segment 54 | - `segment.area` the area of the segment (`Effort` multiplied by the width of the segment) 55 | - `off.set` the logarithm of the `segment.area` multiplied by a correction for detectability (see link to appendix above for more information on this) 56 | 57 | 58 | # Modelling 59 | 60 | Our objective here is to build a spatially explicit model of abundance of the dolphins. In some sense this is a kind of species distribution model. 61 | 62 | Our possible covariates to model abundance are location and depth. These are fairly good predictors of the abundance (SPOILER ALERT), though we could probably improve the model further by including things like sea surface temperature and chlorophyll *a*. 63 | 64 | ## A simple model to start with 65 | 66 | We can begin with the model we showed in the first lecture: 67 | 68 | ```{r simplemodel} 69 | library(mgcv) 70 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set), 71 | data = mexdolphins, 72 | family = quasipoisson(), 73 | method = "REML") 74 | ``` 75 | 76 | That is, we fit the counts as a function of depth, using the offset to take into account effort. We use a quasi-Poisson response (i.e., modelling just the mean-variance relationship such that the variance is proportional to the mean) and use REML for smoothness selection. 77 | 78 | We can check the assumptions of this model by using `gam.check`: 79 | 80 | ```{r simplemodel-check} 81 | gam.check(dolphins_depth) 82 | ``` 83 | 84 | As is usual for count data, these plots are a bit tricky to interpret. For example the residuals vs linear predictor plot in the top right has that nasty line through it that makes looking for pattern tricky. We can see easily that the line equates to the zero count observations: 85 | 86 | ```{r zeroresids, fig.width=7, fig.height=7} 87 | # code from the insides of mgcv::gam.check 88 | resid <- residuals(dolphins_depth, type="deviance") 89 | linpred <- napredict(dolphins_depth$na.action, dolphins_depth$linear.predictors) 90 | plot(linpred, resid, main = "Resids vs.
linear pred.", 91 | xlab = "linear predictor", ylab = "residuals") 92 | 93 | # now add red dots corresponding to the zero counts 94 | points(linpred[mexdolphins$count==0],resid[mexdolphins$count==0], 95 | pch=19, col="red", cex=0.5) 96 | ``` 97 | 98 | We can use randomised quantile residuals instead of deviance residuals to get around this in some cases (though not quasi-Poisson, as we don't have a proper likelihood!). 99 | 100 | Ignoring the plots for now (as we'll address them in the next section), let's look at the text output. It seems that the `k` value we set (or rather the default of 10) has been adequate. 101 | 102 | We could increase the value of `k` by replacing the `s(...)` with, for example, `s(depth, k=25)` (for a possibly very wiggly function) or `s(depth, k=3)` (for a much less wiggly function). Making `k` big will create a bigger design matrix and penalty matrix. 103 | 104 | 105 | **Exercise** 106 | 107 | Look at the differences in the size of the design and penalty matrices by using `dim(model.matrix(...))` and `dim(model$smooth[[1]]$S[[1]])`, replacing `...` and `model` appropriately for models with `k=3` and `k=30`. 108 | 109 | ```{r simplemodel-bigsmall} 110 | dolphins_depth_bigk <- gam(count ~ s(depth, k=30) + offset(off.set), 111 | data = mexdolphins, 112 | family = quasipoisson(), 113 | method = "REML") 114 | dim(model.matrix(dolphins_depth_bigk)) 115 | dim(dolphins_depth_bigk$smooth[[1]]$S[[1]]) 116 | dolphins_depth_smallk <- gam(count ~ s(depth, k=3) + offset(off.set), 117 | data = mexdolphins, 118 | family = quasipoisson(), 119 | method = "REML") 120 | dim(model.matrix(dolphins_depth_smallk)) 121 | dim(dolphins_depth_smallk$smooth[[1]]$S[[1]]) 122 | ``` 123 | 124 | (Don't worry about the many square brackets etc to get the penalty matrix!) 125 | 126 | ### Plotting 127 | 128 | We can plot the smooth we fitted using `plot`. 129 | 130 | **Exercise** 131 | 132 | Compare the first model we fitted with the two using different `k` values above. Use `par(mfrow=c(1,3))` to put them all in one graphic. Look at `?plot.gam` and plot the confidence intervals as a filled "confidence band". Title the plots appropriately so you can check which is which. 133 | 134 | ```{r plotk, } 135 | par(mfrow=c(1,3)) 136 | plot(dolphins_depth, shade=TRUE, main="Default k") 137 | plot(dolphins_depth_bigk, shade=TRUE, main="big k") 138 | plot(dolphins_depth_smallk, shade=TRUE, main="small k") 139 | ``` 140 | 141 | ## Count distributions 142 | 143 | In general quasi-Poisson doesn't seem to do too great a job at modelling data with many zeros. Luckily we have a few tricks up our sleeves... 144 | 145 | 146 | ### Tweedie 147 | 148 | Adding a smooth of `x` and `y` to our model with `s(x,y)`, we can then switch the `family=` argument to use `tw()` for a Tweedie distribution. 149 | 150 | ```{r tw} 151 | dolphins_xy_tw <- gam(count ~ s(x,y) + s(depth) + offset(off.set), 152 | data = mexdolphins, 153 | family = tw(), 154 | method = "REML") 155 | ``` 156 | 157 | More information on Tweedie distributions can be found in Foster & Bravington (2012) and Shono (2008). 158 | 159 | 160 | 161 | ### Negative binomial 162 | 163 | **Exercise** 164 | 165 | Now do the same using the negative binomial distribution (`nb()`).
166 | 167 | ```{r nb} 168 | dolphins_xy_nb <- gam(count ~ s(x,y) + s(depth) + offset(off.set), 169 | data = mexdolphins, 170 | family = nb(), 171 | method = "REML") 172 | ``` 173 | 174 | Looking at the quantile-quantile plots only in the `gam.check` output for these two models, which do you prefer? Why? 175 | 176 | *Looks like Tweedie is better here as the points are closer to the x=y line in the Q-Q plot. Also the histogram of residuals looks more (though not very) normal.* 177 | 178 | Look at the results of calling `summary` on both models and note that there are differences in the resulting models, due to the differing mean-variance relationships. 179 | 180 | ## Smoothers 181 | 182 | Now let's move on to using different bases for the smoothers. We have a couple of different options here. 183 | 184 | 185 | ### Thin plate splines with shrinkage 186 | 187 | By default we use the `"tp"` basis. This is just plain thin plate regression splines (as defined in Wood, 2003). We can also use the `"ts"` basis, which is the same but with extra shrinkage on the usually unpenalised parts of the model. In the univariate case this is the linear slope term of the smooth. 188 | 189 | **Exercise** 190 | 191 | Compare the results from one of the models above with a version using the thin plate with shrinkage using the `bs="ts"` argument to `s()` for both terms. 192 | 193 | ```{r tw-ts} 194 | dolphins_xy_tw_ts <- gam(count ~ s(x,y, bs="ts") + s(depth, bs="ts") + 195 | offset(off.set), 196 | data = mexdolphins, 197 | family = tw(), 198 | method = "REML") 199 | ``` 200 | 201 | What are the differences (use `summary`)? 202 | 203 | *EDFs are different for both terms* 204 | 205 | What are the visual differences (use `plot`)? 206 | 207 | *Not really much difference here!* 208 | 209 | 210 | ### Soap film smoother 211 | 212 | We can use a soap film smoother (Wood et al., 2008) to take into account a complex boundary, such as a coastline or islands. 213 | 214 | Here I've built a simple coastline of the US states bordering the Gulf of Mexico (see the `soap_pred.R` file for how this was constructed). We can load up this boundary and the prediction grid from the following `RData` file: 215 | 216 | ```{r soapy} 217 | load("data/mexdolphins/soapy.RData") 218 | ``` 219 | 220 | Now we need to build knots for the soap film; for this we simply create a grid, then find the grid points inside the boundary. We don't need too many of them. 221 | 222 | ```{r soapknots} 223 | soap_knots <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), length.out=10), 224 | y=seq(min(xy_bnd$y), max(xy_bnd$y), length.out=10)) 225 | x <- soap_knots$x; y <- soap_knots$y 226 | ind <- inSide(xy_bnd, x, y) 227 | rm(x,y) 228 | soap_knots <- soap_knots[ind, ] 229 | ## inSide doesn't work perfectly, so if you get an error like: 230 | ## Error in crunch.knots(ret$G, knots, x0, y0, dx, dy) : 231 | ## knot 54 is on or outside boundary 232 | ## just remove that knot as follows: 233 | soap_knots <- soap_knots[-8, ] 234 | soap_knots <- soap_knots[-54, ] 235 | ``` 236 | 237 | We can now fit our model. Note that we specify a basis via `bs=` and the boundary via `xt` (for e`xt`ra information) in the `s()` term. We also include the knots as a `knots=` argument to `gam`.
238 | 239 | ```{r soapmodel} 240 | dolphins_soap <- gam(count ~ s(x,y, bs="so", xt=list(bnd=list(xy_bnd))) + 241 | offset(off.set), 242 | data = mexdolphins, 243 | family = tw(), 244 | knots = soap_knots, 245 | method = "REML") 246 | 247 | ``` 248 | 249 | **Exercise** 250 | 251 | Look at the `summary` output for this model and compare it to the other models. 252 | 253 | ```{r soap-summary} 254 | summary(dolphins_soap) 255 | ``` 256 | 257 | The plotting function for soap film smooths looks much nicer by default than for other 2D smooths -- try it out. 258 | 259 | 260 | ```{r soap-plot} 261 | plot(dolphins_soap) 262 | ``` 263 | 264 | # Predictions 265 | 266 | As we saw in the intro to GAMs slides, `predict` is your friend when it comes to making predictions for the GAM. 267 | 268 | We can do this very simply, calling predict as one would with a `glm`. For example: 269 | 270 | ```{r pred-qp} 271 | pred_qp <- predict(dolphins_depth, preddata, type="response") 272 | ``` 273 | 274 | Now, this just gives a long vector of numbers for the predicted number of animals per cell. We can find the total abundance using `sum`. 275 | 276 | **Exercise** 277 | 278 | How many dolphins are there in the total area? What is the maximum in a given cell? What is the minimum? 279 | 280 | ```{r total-max-min} 281 | sum(pred_qp) 282 | range(pred_qp) 283 | ``` 284 | 285 | ## Plotting predictions 286 | 287 | *Note that this section requires quite a few additional packages to run the examples, so may not run the first time. You can use* `install.packages` *to grab the packages you need.* 288 | 289 | Plotting predictions in projected coordinate systems is tricky. I'll show two methods here but not go into too much detail, as that's not the aim of this workshop. 290 | 291 | For the non-soap models, we'll use the below helper function to put the predictions into a bunch of squares and then return an appropriate `ggplot2` object for us to plot: 292 | 293 | ```{r gridplotfn} 294 | library(plyr) 295 | # fill must be in the same order as the polygon data 296 | grid_plot_obj <- function(fill, name, sp){ 297 | 298 | # what was the data supplied?
299 | names(fill) <- NULL 300 | row.names(fill) <- NULL 301 | data <- data.frame(fill) 302 | names(data) <- name 303 | 304 | spdf <- SpatialPolygonsDataFrame(sp, data) 305 | spdf@data$id <- rownames(spdf@data) 306 | spdf.points <- fortify(spdf, region="id") 307 | spdf.df <- join(spdf.points, spdf@data, by="id") 308 | 309 | # the x/y seem to be stored as "long" and "lat" even when 310 | # the data are projected 311 | spdf.df$x <- spdf.df$long 312 | spdf.df$y <- spdf.df$lat 313 | 314 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df) 315 | } 316 | ``` 317 | 318 | Then we can plot the predicted abundance: 319 | 320 | ```{r plotpred} 321 | library(ggplot2) 322 | library(viridis) 323 | library(sp) 324 | library(rgeos) 325 | library(rgdal) 326 | library(maptools) 327 | 328 | # projection string 329 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 330 | # need sp to transform the data 331 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4) 332 | 333 | p <- ggplot() + 334 | # abundance 335 | grid_plot_obj(pred_qp, "N", pred.polys) + 336 | # survey lines 337 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 338 | # observations 339 | geom_point(aes(x, y, size=count), 340 | data=mexdolphins[mexdolphins$count>0,], 341 | colour="red", alpha=I(0.7)) + 342 | # make the coordinate system fixed 343 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 344 | xlim = range(mexdolphins$x)) + 345 | # use the viridis colourscheme, which is more colourblind-friendly 346 | scale_fill_viridis() + 347 | # labels 348 | labs(fill="Predicted\ndensity",x="x",y="y",size="Count") + 349 | # keep things simple 350 | theme_minimal() 351 | print(p) 352 | ``` 353 | 354 | If you have the `maps` package installed you can also try the following to plot the coastline too: 355 | 356 | ```{r maptoo} 357 | library(maps) 358 | map_dat <- map_data("usa") 359 | map_sp <- SpatialPoints(map_dat[,c("long","lat")]) 360 | 361 | # give the sp object a projection 362 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84") 363 | # re-project 364 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4) 365 | map_dat$x <- map_sp.t$long 366 | map_dat$y <- map_sp.t$lat 367 | 368 | p <- p + geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 369 | 370 | print(p) 371 | ``` 372 | 373 | ## Comparing predictions from soap and thin plate splines 374 | 375 | The `soap_preddata` frame has a much larger prediction grid that covers a much wider area. 376 | 377 | Predicting using the soap film smoother is exactly the same as for the other smoothers: 378 | 379 | ```{r soap-pred} 380 | soap_pred_N <- predict(dolphins_soap, soap_preddata, type="response") 381 | soap_pred_N <- cbind.data.frame(soap_preddata, N=soap_pred_N, model="soap") 382 | ``` 383 | 384 | Use the above as a template (along with `ggplot2::facet_wrap()`) to build a comparison plot with the predictions of the soap film and another model.
385 | 386 | ```{r xy-soap-pred} 387 | dolphins_xy_q <- gam(count ~ s(x,y, bs="ts") + offset(off.set), 388 | data = mexdolphins, 389 | family = tw(), 390 | method = "REML") 391 | xy_pred_N <- predict(dolphins_xy_q, soap_preddata, type="response") 392 | xy_pred_N <- cbind.data.frame(soap_preddata, N=xy_pred_N, model="xy") 393 | all_pred_N <- rbind(soap_pred_N, xy_pred_N) 394 | 395 | p <- ggplot() + 396 | # abundance 397 | geom_tile(aes(x=x, y=y, width=sqrt(area), height=sqrt(area), fill=N), data=all_pred_N) + 398 | # facet it! 399 | facet_wrap(~model) + 400 | # make the coordinate system fixed 401 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 402 | xlim = range(mexdolphins$x)) + 403 | # use the viridis colourscheme, which is more colourblind-friendly 404 | scale_fill_viridis() + 405 | # labels 406 | labs(fill="Predicted\ndensity", x="x", y="y") + 407 | # keep things simple 408 | theme_minimal() + 409 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 410 | print(p) 411 | ``` 412 | 413 | ## Plotting uncertainty 414 | 415 | **Exercise** 416 | 417 | We can use the `se.fit=TRUE` argument to get the per-cell standard errors, then divide these through by the abundances to get a coefficient of variation (CV) per cell. We can then plot that using the same technique as above. Try this out below. 418 | 419 | Note that the resulting object from setting `se.fit=TRUE` is a list with two elements. 420 | 421 | ```{r CVmap} 422 | pred_se_qp <- predict(dolphins_depth, preddata, type="response", se.fit=TRUE) 423 | cv <- pred_se_qp$se.fit/pred_se_qp$fit 424 | 425 | p <- ggplot() + 426 | # abundance 427 | grid_plot_obj(cv, "CV", pred.polys) + 428 | # survey lines 429 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 430 | # make the coordinate system fixed 431 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 432 | xlim = range(mexdolphins$x)) + 433 | # use the viridis colourscheme, which is more colourblind-friendly 434 | scale_fill_viridis() + 435 | # labels 436 | labs(fill="CV",x="x",y="y",size="Count") + 437 | # keep things simple 438 | theme_minimal() 439 | print(p) 440 | ``` 441 | 442 | Compare the uncertainties of the different models you've fitted so far. 443 | 444 | ## `lpmatrix` magic 445 | 446 | Now for some `lpmatrix` magic. We said before that the `lpmatrix` maps the parameters onto the predictions. Let's show that's true: 447 | 448 | ```{r lppred} 449 | # make the Lp matrix 450 | lp <- predict(dolphins_depth, preddata, type="lpmatrix") 451 | # get the linear predictor 452 | lin_pred <- lp %*% coef(dolphins_depth) 453 | # apply the link and multiply by the offset as lpmatrix ignores this 454 | pred <- preddata$area * exp(lin_pred) 455 | # all the same? 456 | all.equal(pred[,1], as.numeric(pred_qp), check.attributes=FALSE) 457 | ``` 458 | 459 | What else can we do? We can also grab the uncertainty for the sum of the predictions: 460 | 461 | ```{r lpNvar} 462 | # extract the variance-covariance matrix 463 | vc <- vcov(dolphins_depth) 464 | 465 | # reproject the var-covar matrix to be on the linear predictor scale 466 | lin_pred_var <- tcrossprod(lp %*% vc, lp) 467 | 468 | # pre and post multiply by the derivatives of the link, evaluated at the 469 | # predictions -- since the link is exp, we just use the predictions 470 | pred_var <- matrix(pred,nrow=1) %*% lin_pred_var %*% matrix(pred,ncol=1) 471 | 472 | # we can then calculate a total CV 473 | sqrt(pred_var)/sum(pred) 474 | ``` 475 | 476 | As you can see, the `lpmatrix` can be very useful! 
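One further illustration (a sketch, not part of the original analysis): because the `lpmatrix` maps coefficients to predictions, we can combine it with `mgcv::rmvn()` to simulate coefficient vectors from their approximate posterior and get an interval for the total abundance, reusing `lp` and `vc` from above:

```{r lp-posterior-sim}
# simulate coefficient vectors from their approximate posterior
set.seed(42)
betas <- rmvn(1000, coef(dolphins_depth), vc)
# total abundance implied by each simulated coefficient vector
total_N <- apply(betas, 1, function(b) sum(preddata$area * exp(lp %*% b)))
# an approximate 95% interval for the total abundance
quantile(total_N, c(0.025, 0.975))
```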
477 | 478 | # Extra credit 479 | 480 | - Experiment with `vis.gam` (watch out for the `view=` option) and plot the 2D smooths you've fitted. Check out the `too.far=` argument. 481 | - Use the randomized quantile residuals plot to inspect the models you fitted above. What are the differences you see between them and the deviance residuals you see above? Which model would you choose? 482 | - Redo the side-by-side soap film and another smoother plot, but showing the coefficient of variation. What difference does the soap film make? 483 | 484 | # References 485 | 486 | - Foster, S. D., & Bravington, M. V. (2012). A Poisson–Gamma model for analysis of ecological non-negative continuous data. Environmental and Ecological Statistics, 20(4), 533–552. http://doi.org/10.1007/s10651-012-0233-0 487 | - Miller, D. L., Burt, M. L., Rexstad, E. A., & Thomas, L. (2013). Spatial models for distance sampling data: recent developments and future directions. Methods in Ecology and Evolution, 4(11), 1001–1010. http://doi.org/10.1111/2041-210X.12105 488 | - Shono, H. (2008). Application of the Tweedie distribution to zero-catch data in CPUE analysis. Fisheries Research, 93(1-2), 154–162. http://doi.org/10.1016/j.fishres.2008.03.006 489 | - Wood, S. N. (2003). Thin plate regression splines. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 65(1), 95–114. 490 | - Wood, S. N., Bravington, M. V., & Hedley, S. L. (2008). Soap film smoothing. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 70(5), 931–955. http://doi.org/10.1111/j.1467-9868.2008.00665.x 491 | 492 | -------------------------------------------------------------------------------- /example-spatial-mexdolphins.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Further analysis of pantropical spotted dolphins in the Gulf of Mexico" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | 10 | --- 11 | 12 | # Preamble 13 | 14 | This exercise is based on the [Appendix of Miller et al 2013](http://distancesampling.org/R/vignettes/mexico-analysis.html). In this example we're ignoring all kinds of important things like detectability and availability. This should not be treated as a serious analysis of these data! For a more complete treatment of detection-corrected abundance estimation via distance sampling and generalized additive models, see Miller et al. (2013). 15 | 16 | From that appendix: 17 | 18 | *The analysis is based on a dataset of observations of pantropical dolphins in the Gulf of Mexico (shipped with Distance 6.0 and later). For convenience the data are bundled in an `R`-friendly format, although all of the code necessary for creating the data from the Distance project files is available [on github](http://github.com/dill/mexico-data). The OBIS-SEAMAP page for the data may be found at the [SEFSC GoMex Oceanic 1996](http://seamap.env.duke.edu/dataset/25) survey page.* 19 | 20 | 21 | ## Doing these exercises 22 | 23 | Probably the easiest way to do these exercises is to open this document in RStudio and go through the code blocks one by one (hitting the "play" button in the editor window), filling in the code where necessary and executing the commands one-by-one. You can then compile the document once you're done to check that everything works. 24 | 25 | # Data format 26 | 27 | The data are provided in the `data/mexdolphins` folder as the file `mexdolphins.RData`.
Loading this we can see what is provided: 28 | 29 | 30 | ```{r loaddata} 31 | load("data/mexdolphins/mexdolphins.RData") 32 | ls() 33 | ``` 34 | 35 | - `mexdolphins` the `data.frame` containing the observations and covariates, used to fit the model. 36 | - `pred_latlong` an `sp` object that has the shapefile for the prediction grid, used for fancy graphs 37 | - `preddata` prediction grid without any fancy spatial stuff 38 | 39 | Looking further into the `mexdolphins` frame we see: 40 | 41 | ```{r frameinspect} 42 | str(mexdolphins) 43 | ``` 44 | 45 | A brief explanation of each entry: 46 | 47 | - `Sample.Label` identifier for the effort "segment" (approximately square sampling area) 48 | - `Transect.Label` identifier for the transect that this segment belongs to 49 | - `longitude`, `latitude` location in lat/long of this segment 50 | - `x`, `y` location in projected coordinates (projected using the [North American Lambert Conformal Conic projection](https://en.wikipedia.org/wiki/Lambert_conformal_conic_projection)) 51 | - `Effort` the length of the current segment 52 | - `depth` the bathymetry at the segment's position 53 | - `count` number of dolphins observed in this segment 54 | - `segment.area` the area of the segment (`Effort` multiplied by the width of the segment) 55 | - `off.set` the logarithm of the `segment.area` multiplied by a correction for detectability (see link to appendix above for more information on this) 56 | 57 | 58 | # Modelling 59 | 60 | Our objective here is to build a spatially explicit model of abundance of the dolphins. In some sense this is a kind of species distribution model. 61 | 62 | Our possible covariates to model abundance are location and depth. These are fairly good predictors of the abundance (SPOILER ALERT), though we could probably improve the model further by including things like sea surface temperature and chlorophyll *a*. 63 | 64 | ## A simple model to start with 65 | 66 | We can begin with the model we showed in the first lecture: 67 | 68 | ```{r simplemodel} 69 | library(mgcv) 70 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set), 71 | data = mexdolphins, 72 | family = quasipoisson(), 73 | method = "REML") 74 | ``` 75 | 76 | That is, we fit the counts as a function of depth, using the offset to take into account effort. We use a quasi-Poisson response (i.e., modelling just the mean-variance relationship such that the variance is proportional to the mean) and use REML for smoothness selection. 77 | 78 | We can check the assumptions of this model by using `gam.check`: 79 | 80 | ```{r simplemodel-check} 81 | gam.check(dolphins_depth) 82 | ``` 83 | 84 | As is usual for count data, these plots are a bit tricky to interpret. For example the residuals vs linear predictor plot in the top right has that nasty line through it that makes looking for pattern tricky. We can see easily that the line equates to the zero count observations: 85 | 86 | ```{r zeroresids, fig.width=7, fig.height=7} 87 | # code from the insides of mgcv::gam.check 88 | resid <- residuals(dolphins_depth, type="deviance") 89 | linpred <- napredict(dolphins_depth$na.action, dolphins_depth$linear.predictors) 90 | plot(linpred, resid, main = "Resids vs.
linear pred.", 91 | xlab = "linear predictor", ylab = "residuals") 92 | 93 | # now add red dots corresponding to the zero counts 94 | points(linpred[mexdolphins$count==0],resid[mexdolphins$count==0], 95 | pch=19, col="red", cex=0.5) 96 | ``` 97 | 98 | We can use randomised quantile residuals instead of deviance residuals to get around this in some cases (though not quasi-Poisson, as we don't have a proper likelihood!). 99 | 100 | Ignoring the plots for now (as we'll address them in the next section), let's look at the text output. It seems that the `k` value we set (or rather the default of 10) has been adequate. 101 | 102 | We could increase the value of `k` by replacing the `s(...)` with, for example, `s(depth, k=25)` (for a possibly very wiggly function) or `s(depth, k=3)` (for a much less wiggly function). Making `k` big will create a bigger design matrix and penalty matrix. 103 | 104 | 105 | **Exercise** 106 | 107 | Look at the differences in the size of the design and penalty matrices by using `dim(model.matrix(...))` and `dim(model$smooth[[1]]$S[[1]])`, replacing `...` and `model` appropriately for models with `k=3` and `k=30`. 108 | 109 | ```{r simplemodel-bigsmall} 110 | 111 | ``` 112 | 113 | (Don't worry about the many square brackets etc to get the penalty matrix!) 114 | 115 | ### Plotting 116 | 117 | We can plot the smooth we fitted using `plot`. 118 | 119 | **Exercise** 120 | 121 | Compare the first model we fitted with the two using different `k` values above. Use `par(mfrow=c(1,3))` to put them all in one graphic. Look at `?plot.gam` and plot the confidence intervals as a filled "confidence band". Title the plots appropriately so you can check which is which. 122 | 123 | ```{r plotk, } 124 | par(mfrow=c(1,3)) 125 | 126 | ``` 127 | 128 | ## Count distributions 129 | 130 | In general quasi-Poisson doesn't seem to do too great a job at modelling data with many zeros. Luckily we have a few tricks up our sleeves... 131 | 132 | 133 | ### Tweedie 134 | 135 | Adding a smooth of `x` and `y` to our model with `s(x,y)`, we can then switch the `family=` argument to use `tw()` for a Tweedie distribution. 136 | 137 | ```{r tw} 138 | dolphins_xy_tw <- gam(count ~ s(x,y) + s(depth) + offset(off.set), 139 | data = mexdolphins, 140 | family = tw(), 141 | method = "REML") 142 | ``` 143 | 144 | More information on Tweedie distributions can be found in Foster & Bravington (2012) and Shono (2008). 145 | 146 | 147 | 148 | ### Negative binomial 149 | 150 | **Exercise** 151 | 152 | Now do the same using the negative binomial distribution (`nb()`). 153 | 154 | ```{r nb} 155 | 156 | ``` 157 | 158 | Looking at the quantile-quantile plots only in the `gam.check` output for these two models, which do you prefer? Why? 159 | 160 | *Looks like Tweedie is better here as the points are closer to the x=y line in the Q-Q plot. Also the histogram of residuals looks more (though not very) normal.* 161 | 162 | Look at the results of calling `summary` on both models and note that there are differences in the resulting models, due to the differing mean-variance relationships. 163 | 164 | ## Smoothers 165 | 166 | Now let's move on to using different bases for the smoothers. We have a couple of different options here. 167 | 168 | 169 | ### Thin plate splines with shrinkage 170 | 171 | By default we use the `"tp"` basis. This is just plain thin plate regression splines (as defined in Wood, 2003).
We can also use the `"ts"` basis, which is the same but with extra shrinkage on the usually unpenalised parts of the model. In the univariate case this is the linear slope term of the smooth. 172 | 173 | **Exercise** 174 | 175 | Compare the results from one of the models above with a version using the thin plate with shrinkage using the `bs="ts"` argument to `s()` for both terms. 176 | 177 | ```{r tw-ts} 178 | ``` 179 | 180 | What are the differences (use `summary`)? 181 | 182 | What are the visual differences (use `plot`)? 183 | 184 | 185 | ### Soap film smoother 186 | 187 | We can use a soap film smoother (Wood et al., 2008) to take into account a complex boundary, such as a coastline or islands. 188 | 189 | Here I've built a simple coastline of the US states bordering the Gulf of Mexico (see the `soap_pred.R` file for how this was constructed). We can load up this boundary and the prediction grid from the following `RData` file: 190 | 191 | ```{r soapy} 192 | load("data/mexdolphins/soapy.RData") 193 | ``` 194 | 195 | Now we need to build knots for the soap film; for this we simply create a grid, then find the grid points inside the boundary. We don't need too many of them. 196 | 197 | ```{r soapknots} 198 | soap_knots <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), length.out=10), 199 | y=seq(min(xy_bnd$y), max(xy_bnd$y), length.out=10)) 200 | x <- soap_knots$x; y <- soap_knots$y 201 | ind <- inSide(xy_bnd, x, y) 202 | rm(x,y) 203 | soap_knots <- soap_knots[ind, ] 204 | ## inSide doesn't work perfectly, so if you get an error like: 205 | ## Error in crunch.knots(ret$G, knots, x0, y0, dx, dy) : 206 | ## knot 54 is on or outside boundary 207 | ## just remove that knot as follows: 208 | soap_knots <- soap_knots[-8, ] 209 | soap_knots <- soap_knots[-54, ] 210 | ``` 211 | 212 | We can now fit our model. Note that we specify a basis via `bs=` and the boundary via `xt` (for e`xt`ra information) in the `s()` term. We also include the knots as a `knots=` argument to `gam`. 213 | 214 | ```{r soapmodel} 215 | dolphins_soap <- gam(count ~ s(x,y, bs="so", xt=list(bnd=list(xy_bnd))) + 216 | offset(off.set), 217 | data = mexdolphins, 218 | family = tw(), 219 | knots = soap_knots, 220 | method = "REML") 221 | 222 | ``` 223 | 224 | **Exercise** 225 | 226 | Look at the `summary` output for this model and compare it to the other models. 227 | 228 | ```{r soap-summary} 229 | ``` 230 | 231 | The plotting function for soap film smooths looks much nicer by default than for other 2D smooths -- try it out. 232 | 233 | 234 | ```{r soap-plot} 235 | ``` 236 | 237 | # Predictions 238 | 239 | As we saw in the intro to GAMs slides, `predict` is your friend when it comes to making predictions for the GAM. 240 | 241 | We can do this very simply, calling predict as one would with a `glm`. For example: 242 | 243 | ```{r pred-qp} 244 | pred_qp <- predict(dolphins_depth, preddata, type="response") 245 | ``` 246 | 247 | Now, this just gives a long vector of numbers for the predicted number of animals per cell. We can find the total abundance using `sum`. 248 | 249 | **Exercise** 250 | 251 | How many dolphins are there in the total area? What is the maximum in a given cell? What is the minimum? 252 | 253 | ## Plotting predictions 254 | 255 | *Note that this section requires quite a few additional packages to run the examples, so may not run the first time. You can use* `install.packages` *to grab the packages you need.* 256 | 257 | Plotting predictions in projected coordinate systems is tricky.
I'll show two methods here but not go into too much detail, as that's not the aim of this workshop. 258 | 259 | For the non-soap models, we'll use the below helper function to put the predictions into a bunch of squares and then return an appropriate `ggplot2` object for us to plot: 260 | 261 | ```{r gridplotfn} 262 | library(plyr) 263 | # fill must be in the same order as the polygon data 264 | grid_plot_obj <- function(fill, name, sp){ 265 | 266 | # what was the data supplied? 267 | names(fill) <- NULL 268 | row.names(fill) <- NULL 269 | data <- data.frame(fill) 270 | names(data) <- name 271 | 272 | spdf <- SpatialPolygonsDataFrame(sp, data) 273 | spdf@data$id <- rownames(spdf@data) 274 | spdf.points <- fortify(spdf, region="id") 275 | spdf.df <- join(spdf.points, spdf@data, by="id") 276 | 277 | # the x/y seem to be stored as "long" and "lat" even when 278 | # the data are projected 279 | spdf.df$x <- spdf.df$long 280 | spdf.df$y <- spdf.df$lat 281 | 282 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df) 283 | } 284 | ``` 285 | 286 | Then we can plot the predicted abundance: 287 | 288 | ```{r plotpred} 289 | library(ggplot2) 290 | library(viridis) 291 | library(sp) 292 | library(rgeos) 293 | library(rgdal) 294 | library(maptools) 295 | 296 | # projection string 297 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 298 | # need sp to transform the data 299 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4) 300 | 301 | p <- ggplot() + 302 | # abundance 303 | grid_plot_obj(pred_qp, "N", pred.polys) + 304 | # survey lines 305 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 306 | # observations 307 | geom_point(aes(x, y, size=count), 308 | data=mexdolphins[mexdolphins$count>0,], 309 | colour="red", alpha=I(0.7)) + 310 | # make the coordinate system fixed 311 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 312 | xlim = range(mexdolphins$x)) + 313 | # use the viridis colourscheme, which is more colourblind-friendly 314 | scale_fill_viridis() + 315 | # labels 316 | labs(fill="Predicted\ndensity",x="x",y="y",size="Count") + 317 | # keep things simple 318 | theme_minimal() 319 | print(p) 320 | ``` 321 | 322 | If you have the `maps` package installed you can also try the following to plot the coastline too: 323 | 324 | ```{r maptoo} 325 | library(maps) 326 | map_dat <- map_data("usa") 327 | map_sp <- SpatialPoints(map_dat[,c("long","lat")]) 328 | 329 | # give the sp object a projection 330 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84") 331 | # re-project 332 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4) 333 | map_dat$x <- map_sp.t$long 334 | map_dat$y <- map_sp.t$lat 335 | 336 | p <- p + geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 337 | 338 | print(p) 339 | ``` 340 | 341 | ## Comparing predictions from soap and thin plate splines 342 | 343 | The `soap_preddata` frame has a much larger prediction grid that covers a much wider area. 344 | 345 | Predicting using the soap film smoother is exactly the same as for the other smoothers: 346 | 347 | ```{r soap-pred} 348 | soap_pred_N <- predict(dolphins_soap, soap_preddata, type="response") 349 | soap_pred_N <- cbind.data.frame(soap_preddata, N=soap_pred_N, model="soap") 350 | ``` 351 | 352 | Use the above as a template (along with `ggplot2::facet_wrap()`) to build a comparison plot with the predictions of the soap film and another model.
353 | 354 | ```{r xy-soap-pred} 355 | dolphins_xy_q <- gam(count ~ s(x,y, bs="ts") + offset(off.set), 356 | data = mexdolphins, 357 | family = tw(), 358 | method = "REML") 359 | xy_pred_N <- predict(dolphins_xy_q, soap_preddata, type="response") 360 | xy_pred_N <- cbind.data.frame(soap_preddata, N=xy_pred_N, model="xy") 361 | all_pred_N <- rbind(soap_pred_N, xy_pred_N) 362 | 363 | p <- ggplot() + 364 | # abundance 365 | geom_tile(aes(x=x, y=y, width=sqrt(area), height=sqrt(area), fill=N), data=all_pred_N) + 366 | # facet it! 367 | facet_wrap(~model) + 368 | # make the coordinate system fixed 369 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 370 | xlim = range(mexdolphins$x)) + 371 | # use the viridis colourscheme, which is more colourblind-friendly 372 | scale_fill_viridis() + 373 | # labels 374 | labs(fill="Predicted\ndensity", x="x", y="y") + 375 | # keep things simple 376 | theme_minimal() + 377 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 378 | print(p) 379 | ``` 380 | 381 | ## Plotting uncertainty 382 | 383 | **Exercise** 384 | 385 | We can use the `se.fit=TRUE` argument to get the per-cell standard errors, then divide these through by the abundances to get a coefficient of variation (CV) per cell. We can then plot that using the same technique as above. Try this out below. 386 | 387 | Note that the resulting object from setting `se.fit=TRUE` is a list with two elements. 388 | 389 | ```{r CVmap} 390 | 391 | ``` 392 | 393 | Compare the uncertainties of the different models you've fitted so far. 394 | 395 | ## `lpmatrix` magic 396 | 397 | Now for some `lpmatrix` magic. We said before that the `lpmatrix` maps the parameters onto the predictions. Let's show that's true: 398 | 399 | ```{r lppred} 400 | # make the Lp matrix 401 | lp <- predict(dolphins_depth, preddata, type="lpmatrix") 402 | # get the linear predictor 403 | lin_pred <- lp %*% coef(dolphins_depth) 404 | # apply the link and multiply by the offset as lpmatrix ignores this 405 | pred <- preddata$area * exp(lin_pred) 406 | # all the same? 407 | all.equal(pred[,1], as.numeric(pred_qp), check.attributes=FALSE) 408 | ``` 409 | 410 | What else can we do? We can also grab the uncertainty for the sum of the predictions: 411 | 412 | ```{r lpNvar} 413 | # extract the variance-covariance matrix 414 | vc <- vcov(dolphins_depth) 415 | 416 | # reproject the var-covar matrix to be on the linear predictor scale 417 | lin_pred_var <- tcrossprod(lp %*% vc, lp) 418 | 419 | # pre and post multiply by the derivatives of the link, evaluated at the 420 | # predictions -- since the link is exp, we just use the predictions 421 | pred_var <- matrix(pred,nrow=1) %*% lin_pred_var %*% matrix(pred,ncol=1) 422 | 423 | # we can then calculate a total CV 424 | sqrt(pred_var)/sum(pred) 425 | ``` 426 | 427 | As you can see, the `lpmatrix` can be very useful! 428 | 429 | # Extra credit 430 | 431 | - Experiment with `vis.gam` (watch out for the `view=` option) and plot the 2D smooths you've fitted. Check out the `too.far=` argument. 432 | - Use the randomized quantile residuals plot to inspect the models you fitted above. What are the differences you see between them and the deviance residuals you see above? Which model would you choose? 433 | - Redo the side-by-side soap film and another smoother plot, but showing the coefficient of variation. What difference does the soap film make? 434 | 435 | # References 436 | 437 | - Foster, S. D., & Bravington, M. V. (2012).
# Extra credit

- Experiment with `vis.gam` (watch out for the `view=` option) and plot the 2D smooths you've fitted. Check out the `too.far=` argument.
- Use the randomized quantile residuals plot to inspect the models you fitted above. What differences do you see between them and the deviance residuals above? Which model would you choose?
- Redo the side-by-side plot of the soap film and another smoother, but showing the coefficient of variation. What difference does the soap film make?

# References

- Foster, S. D., & Bravington, M. V. (2012). A Poisson–Gamma model for analysis of ecological non-negative continuous data. Environmental and Ecological Statistics, 20(4), 533–552. http://doi.org/10.1007/s10651-012-0233-0
- Miller, D. L., Burt, M. L., Rexstad, E. A., & Thomas, L. (2013). Spatial models for distance sampling data: recent developments and future directions. Methods in Ecology and Evolution, 4(11), 1001–1010. http://doi.org/10.1111/2041-210X.12105
- Shono, H. (2008). Application of the Tweedie distribution to zero-catch data in CPUE analysis. Fisheries Research, 93(1-2), 154–162. http://doi.org/10.1016/j.fishres.2008.03.006
- Wood, S. N. (2003). Thin plate regression splines. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 65(1), 95–114.
- Wood, S. N., Bravington, M. V., & Hedley, S. L. (2008). Soap film smoothing. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 70(5), 931–955. http://doi.org/10.1111/j.1467-9868.2008.00665.x

--------------------------------------------------------------------------------
/example-spatio-temporal data.Rmd:
--------------------------------------------------------------------------------
---
title: "Spatio-temporal dynamics is for the birds"
output:
  html_document:
    toc: true
    toc_float: true
    theme: readable
    highlight: haddock
---

## 1. Background
We've talked about modelling spatial trends in the main workshop and in an extended example, and about the trials and tribulations of time series data in another extended example. Here, we're going to combine these, to look at one of the most challenging types of data for linear models to deal with: spatiotemporal data. A spatiotemporal dataset is one where the outcome of interest has been observed at multiple locations and at multiple points in time, and we want to know how temporal trends change across the landscape (or, from the other view, how spatial patterns are changing over time).

We don't have time to really get into the nuts-and-bolts issues around fitting complex spatio-temporal models. This exercise is instead designed to give you a feel for how to use `mgcv` to model complex interactions between variables with different natural scales, and how to include simple random effects in GAMs.

## 2. Key concepts and functions:
Here are a few key ideas and R functions you should familiarize yourself with if you haven't already encountered them. For the R functions (which will be highlighted `like this`), use `?function_name` to look them up.

### Tensor product (`te`) smooths
This tutorial will rely on tensor product smooths. These are used to model nonlinear interactions between variables that have different natural scales. The interaction smooths we saw at the start of the workshop were thin plate splines (`y~s(x1,x2, bs="tp")`). These splines assume that the relationship between y and x1 should be roughly as "wiggly" as the relationship between y and x2, and have only one smoothing parameter. This makes sense if x1 and x2 are similar variables (say latitude and longitude, or arm length and leg length), but if we're looking at the interacting effects of, say, temperature and body weight, a single smoothing parameter doesn't make sense.

This is where tensor product smooths come in.
These assume that each variable of interest should have its own smoothness. They create multi-dimensional smooth terms by multiplying the values of each basis function for one variable by the values of each basis function for the second. This is a pretty complicated idea, and hard to describe effectively without more math than we want to go into here. But you can get the idea from a simple example. Let's say we have two basis functions, one for variable x1 and one for variable x2. The function for x1 is linear, and the one for x2 is bell-curve shaped (this matches the code below). The basis functions will look like this:

```{r, echo=T,tidy=F,results="hide", include=T, message=FALSE,highlight=TRUE}
library(mgcv)
library(ggplot2)
library(dplyr)
library(viridis)
set.seed(1)
n = 25
dat = as.data.frame(expand.grid(x1 = seq(-1, 1, length.out = n),
                                x2 = seq(-1, 1, length = n)))
dat$x1_margin = -dat$x1
dat$x2_margin = dnorm(dat$x2, 0, 0.5)
dat$x2_margin = dat$x2_margin - mean(dat$x2_margin)

layout(matrix(1:2, ncol=2))
plot(x1_margin~x1, type="l", data = dat %>% filter(!duplicated(x1)))
plot(x2_margin~x2, type="l", data = dat %>% filter(!duplicated(x2)))
layout(1)
```

However, the tensor product of the two variables is more complicated. Note that when x1 is below zero, the effect of x2 is positive at middle values and negative at low values, and vice versa when x1 is above zero.

```{r, echo=T,tidy=F,results="hide", include=T, message=FALSE,highlight=TRUE}
dat$x1x2_tensor = dat$x1_margin*dat$x2_margin
ggplot(aes(x=x1, y=x2, fill=x1x2_tensor), data=dat) +
  geom_tile() +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  scale_fill_viridis() +
  theme_bw()
```

A full tensor product smooth will consist of several basis functions constructed like this. The resulting smooth is then fitted using the standard GAM machinery, but with multiple penalty terms (one for each marginal basis in the tensor product). Each penalty penalizes how much the function deviates from a smooth function in the direction of one of the variables, averaged over all the other variables. This allows the function to be more or less wiggly in one direction than another.

They are built using the `te()` function, like this:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# We'll recycle our fake marginal and tensor products as the expected value for
# a simulation. Waste not, want not!
dat$y = rnorm(n^2, dat$x1x2_tensor + dat$x1_margin + dat$x2_margin, 0.5)

# a single k is applied to each marginal basis; see ?te for the details
te_demo = gam(y~te(x1, x2, k = 4^2), data=dat, method="REML")
plot.gam(te_demo, scheme = 2)
summary(te_demo)
print(te_demo$sp)
# These are the estimated smoothing parameters for the two dimensions. Note
# that the smooth for x1 (the linear basis) is way smoother than the one for x2
```
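To eyeball the fitted surface against the true tensor product we plotted above, `vis.gam()` gives a quick contour view (a small sketch added here, not part of the original text):

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# contour plot of the fitted te() surface; compare with the x1x2_tensor
# tile plot above (plus the marginal effects)
vis.gam(te_demo, view = c("x1", "x2"), plot.type = "contour")
```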
### Tensor interaction smooths
There are many cases when we want an estimate of both the average effect of each variable as well as the interaction. This is tough to extract from just using `te()` terms, but if we try to combine the two (say, with `y~s(x1)+s(x2)+te(x1,x2)`) the result is often numerically unstable; it won't give reliable, consistent separation into the terms we want. Instead, we use `ti` terms for the interaction. The `ti` terms are set up so the average effect of each variable in the smooth is already excluded from the interaction (and they'll have fewer basis functions than the equivalent `te` term). Search `?te` if you're curious about how `mgcv` sets these up; for now, we'll just use them as a black box.

This is how you use them in practice:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# We'll recycle our fake tensor product as the expected value for a simulation.
# Waste not, want not!

ti_demo = gam(y~s(x1, k=4) + s(x2, k=4) + ti(x1, x2, k = 4^2),
              data=dat, method="REML")
layout(matrix(1:3, nrow=1))
plot.gam(ti_demo, scheme = 2)
layout(1)
summary(ti_demo)
print(ti_demo$sp)
# These are the estimated smoothing parameters. Note that the smooth for
# x1 (the linear basis) is way smoother than the one for x2
```

Note that it does a pretty good job of extracting our assumed marginal basis functions (linear and Gaussian) and the tensor product into three clearly separate functions.

NOTE: Never use `ti` on its own without including the marginal smooths! This is for the same reason you shouldn't exclude individual linear terms but include interactions in `glm` models. It virtually never makes sense to talk about a function with a strong interaction but no marginal effects.

### Random effects smooths
The last smooth type we'll use in this exercise is the random effects smooth. This does just what it says on the box: it's a smooth for factor terms, with one coefficient for each level of the group (these are the basis functions in this case), and a penalty term on the squared values of the coefficients to draw them towards the global mean. This is exactly the same mechanism that packages like `lme4` and `nlme` use to calculate random effects. In fact, there's a very deep connection between GAMs and mixed effects models, but we won't go into that here.

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# note: this overwrites the simulated data from the tensor product examples
n = 10
n_groups = 10
group_means = rnorm(n_groups, 0, 2)
dat = data_frame(group = rep(1:n_groups, each=n),
                 group_f = factor(group))
dat$y = rnorm(n*n_groups, 3 + group_means[dat$group], 2)
re_demo = gam(y~s(group_f, bs="re"),
              data=dat, method="REML")
plot.gam(re_demo, scheme = 2)
summary(re_demo)
print(re_demo$sp)
```
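To make that connection concrete, here's a hedged sketch (an addition, assuming you have `lme4` installed; it isn't run as part of this document) fitting the same random intercept model with `lmer()`; the estimated group effects should agree closely:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE, eval=FALSE}
library(lme4)
re_lmer = lmer(y ~ 1 + (1 | group_f), data = dat)
# the s(group_f) coefficients from gam() play the same role as
# lmer()'s predicted random intercepts
plot(coef(re_demo)[grep("group_f", names(coef(re_demo)))],
     ranef(re_lmer)$group_f[, 1],
     xlab = "gam() bs=\"re\" coefficients",
     ylab = "lmer() random intercepts")
abline(0, 1)
```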
It's important to know when using `bs="re"` terms whether the variable you're giving it is numeric or a factor. `mgcv` will not raise an error if you give numeric variables to `bs="re"`. Instead, it'll just fit a linear model with a penalty on the slope of the line between x and y. Try it with the data above to see, by switching `s(group_f, bs="re")` to `s(group, bs="re")`. `bs="re"` is set up like this to allow you to use random effects smooths to fit things like varying-slope random effects models, but make sure you know what you're doing when venturing into this.

## 3. Spatiotemporal models
Now that we've covered the key smoothers, let's take a look at how we'd use them to model spatiotemporal data. For this section, we'll use data from the Florida routes of the Breeding Bird Survey. If you haven't heard of it before, this is a long-term monitoring program tracking bird abundance and diversity across the US and Canada. It consists of many (over 4000) routes, each 24.5 miles long. Trained observers run these routes each year during peak breeding season, and every 0.5 miles they stop to count and identify all the birds they can see or hear within a 0.25 mile radius of the stop. This is one of the best long-term data sets we have on bird diversity and abundance, and it has been used in countless papers.

Here, we're going to use these data to answer a couple of relatively simple questions: how does bird species richness vary across the state of Florida, has average richness changed over time, and has the spatial pattern of richness varied?  We obtained data from [this site](https://www.pwrc.usgs.gov/bbs/RawData/), and summarized it into the total number of different species and the total number of individual birds observed on each route in each year in the state of Florida.

First, we'll load the data, and quickly plot it:
```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
library(maps)
florida_map = map_data("state", "florida") %>%
  transmute(Longitude = long, Latitude = lat, order = order)
florida_birds = read.csv("data/bbs_data/bbs_florida_richness.csv")
head(florida_birds)

ggplot(aes(x=Year, y=Richness), data=florida_birds) + geom_point() +
  theme_bw()
ggplot(aes(x=Longitude, y=Latitude, col=Richness), data=florida_birds) +
  geom_polygon(data=florida_map, fill=NA, col="black") +
  geom_point() + theme_bw()
```

It looks like richness may have declined toward the end of the time series, and there seems to be somewhat of a north-south richness gradient, but we need to model it to really tease these issues out.

### Modelling space and time separately
We'll first take a stab at this to determine whether these mean trends actually come out of the model. As richness is a count variable, we'll use a Poisson GAM (with its default log link) to model it. This means that we're assuming mean richness is the product of a smooth function of location and a smooth function of year, with no interaction: E(richness) = exp(f(year))*exp(g(location)).

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
richness_base_model = gam(Richness ~ s(Longitude, Latitude) + s(Year),
                          data=florida_birds, family=poisson, method="REML")
layout(matrix(1:2, ncol=2))
plot(richness_base_model, scheme=2)
layout(1)
summary(richness_base_model)
```

Note that we do have a thin-plate interaction between longitude and latitude. This assumes that the gradient should be equally smooth latitudinally and longitudinally. That may not be totally accurate, as the function seems to change faster latitudinally, but we'll ignore this until the exercises at the bottom. The model seems to be doing pretty well; it captured the original patterns we saw, and explains a not-inconsiderable 45% of the deviance.
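As an extra check (a rough sketch added here, not in the original text): since we've assumed a Poisson response, the Pearson statistic divided by the residual degrees of freedom should be close to 1; much larger values suggest overdispersion.

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# rough overdispersion check: values much greater than 1 suggest the
# Poisson assumption is too restrictive
pearson_stat = sum(residuals(richness_base_model, type = "pearson")^2)
resid_df = nrow(florida_birds) - sum(richness_base_model$edf)
pearson_stat / resid_df
```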
Let's see if there are any remaining spatiotemporal patterns, though:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
source("code_snippets/quantile_resid.R")

florida_birds$base_resid = rqresiduals(richness_base_model)

ggplot(aes(Longitude, Latitude),
       data = florida_birds %>% filter(Year %% 6 == 0)) + # only look at every 6th year
  geom_point(aes(color=base_resid)) +
  geom_polygon(data=florida_map, fill=NA, col="black") +
  scale_color_viridis() +
  facet_wrap(~Year) +
  theme_bw()
```

There's definitely still a pattern in the residuals, such as higher-than-expected diversity in the mid-latitudes during the 1990s.

### Joint models of space and time
The next step is to determine if there is a significant interaction. This is where tensor product splines come in. Here, we'll use the `ti` formulation, as it allows us to directly test whether the interaction significantly improves our model.

Something important to note: the `d=c(2,1)` argument to `ti`. This tells the function that the smooth should consist of a tensor product of a 2-dimensional smooth (latitude-longitude) and a 1-dimensional term (Year).

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
richness_ti_model = gam(Richness ~ s(Longitude, Latitude) + s(Year) +
                          ti(Longitude, Latitude, Year, d=c(2,1)),
                        data=florida_birds, family=poisson, method="REML")
layout(matrix(1:2, ncol=2))
plot(richness_ti_model, scheme=2)
layout(1)
summary(richness_ti_model)
```

This improves the model fit substantially, which you can see by directly comparing the models with `anova`:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
anova(richness_base_model, richness_ti_model, test = "Chisq")
```
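AIC points the same way (a quick extra comparison, not in the original text; see the model selection slides for caveats about AIC with smooths):

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
AIC(richness_base_model, richness_ti_model)
```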
The question then becomes: what does this actually mean? Where are we seeing the fastest and slowest rates of change in richness? It's often difficult to effectively show spatiotemporal changes, partly because it would require a 4-dimensional graph to really show what's going on. Still, we can look at a couple of different views by using `predict` effectively:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# First we'll create gridded data
predict_richess = expand.grid(
  Latitude = seq(min(florida_birds$Latitude),
                 max(florida_birds$Latitude),
                 length=50),
  Longitude = seq(min(florida_birds$Longitude),
                  max(florida_birds$Longitude),
                  length=50),
  Year = seq(1970, 2015, by=5)
)
# This now selects only the data that fall within Florida's border
predict_richess = predict_richess[with(predict_richess,
                                       inSide(florida_map, Latitude, Longitude)),]
predict_richess$model_fit = predict(richness_ti_model,
                                    predict_richess, type = "response")
ggplot(aes(Longitude, Latitude, fill = model_fit),
       data=predict_richess) +
  geom_tile() +
  facet_wrap(~Year, nrow=2) +
  scale_fill_viridis("# of species") +
  theme_bw(10)
```

Another way of looking at this is by estimating the rate of change at each location at each time point, and figuring out which locations are changing most quickly:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# rate of change: difference between the prediction one year ahead and
# the current prediction
predict_richess$model_change = predict(richness_ti_model,
                                       predict_richess %>% mutate(Year=Year+1),
                                       type = "response") -
  predict_richess$model_fit
ggplot(aes(Longitude, Latitude, fill = model_change),
       data=predict_richess) +
  geom_tile() +
  facet_wrap(~Year, nrow=2) +
  scale_fill_gradient2("Rate of change\n(species per year)") +
  theme_bw(10)
```

### Accounting for local heterogeneity with `bs="re"`
Examining this, it seems that richness fluctuated across the state throughout the study period, but then began sharply declining state-wide in the 2000s. (Disclaimer: I don't know enough about either the BBS or Florida to say how much of this decline is real! Please don't quote me in newspapers saying biodiversity is collapsing in Florida.)

There is still one persistent issue: viewing the maps of richness, there appear to be small hotspots and coldspots of richness across the state. This may be due to a factor I've ignored so far: the data are collected on specific, repeated routes. If it's just easier to spot birds on one route, or there's a lot of local nesting habitat there, it will have higher richness than expected over the whole study. Forcing the spatial smooth to absorb these kinds of hot and cold spots will make our model overly wiggly. Further, not all routes are sampled every year, so we want to make sure that our trends don't just result from less diverse routes getting sampled later in the survey.

To try to account for this effect (and other grouped local factors), we can use random effects smooths.
That would make our model look like this:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
florida_birds$Route_f = factor(florida_birds$Route)
richness_ti_re_model = gam(Richness ~ s(Longitude, Latitude) + s(Year) +
                             ti(Longitude, Latitude, Year, d=c(2,1)) +
                             s(Route_f, bs="re"),
                           data=florida_birds, family=poisson, method="REML")
layout(matrix(1:3, nrow=1))
plot(richness_ti_re_model, scheme=2)
layout(1)

summary(richness_ti_re_model)
anova(richness_ti_model, richness_ti_re_model)
```

We can view the same plots as before with our new model:
```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# predict() needs some value of Route_f; here every prediction includes the
# effect of a single route (see ?predict.gam's exclude argument for a way to
# drop the route term instead)
predict_richess$Route_f = 1
predict_richess$model_fit = predict(richness_ti_re_model,
                                    predict_richess, type = "response")
predict_richess$model_change = predict(richness_ti_re_model,
                                       predict_richess %>% mutate(Year=Year+1),
                                       type = "response") -
  predict_richess$model_fit

ggplot(aes(Longitude, Latitude, fill = model_fit),
       data=predict_richess) +
  geom_tile() +
  facet_wrap(~Year, nrow=2) +
  scale_fill_viridis("# of species") +
  theme_bw(10)

ggplot(aes(Longitude, Latitude, fill = model_change),
       data=predict_richess) +
  geom_tile() +
  facet_wrap(~Year, nrow=2) +
  scale_fill_gradient2("Rate of change\n(species per year)") +
  theme_bw(10)
```

This definitely seems to have levelled off some of the hot spots from before, implying that bird richness is more evenly distributed than our prior model suggested. Still, the large-scale trends remain the same as before.

## 4. Exercises

1. Given that most of the variability appears to be latitudinal, try to adapt the tensor smoothers so that latitude, longitude, and year all have their own penalty terms. How does this change model fit?

2. Following up on the prior exercise: build a model that allows you to explicitly calculate the marginal latitudinal gradient in diversity, and to determine how the latitudinal gradient has changed over time.

3. We know from basic sampling theory that if you sample fewer individuals over time, you also expect to find lower richness, even if actual richness stays unchanged. What does the spatiotemporal pattern of abundance look like? Does adding bird abundance as a predictor explain any of the patterns in richness?
--------------------------------------------------------------------------------
/pre_course_questionnaire.txt:
--------------------------------------------------------------------------------
What to ask?
============


Experience with R
- how long have you used it?
- how often do you use it?
Experience with mgcv
- how long have you used it?
- how often do you use it?

Mathematical background
- I hate math -> I actively enjoy matrix algebra

What do you do?

What do you want to do?


--------------------------------------------------------------------------------
/slides/00-preamble.Rpres:
--------------------------------------------------------------------------------
The mgcv package as a one-stop-shop for fitting non-linear ecological models
============================
author: David L Miller
css: custom.css
transition: none


Good morning!
==============
type: section

Who is this guy?
================

- (mostly) Cetacean distribution modelling
- Spatial modelling (esp. model checking)
- Distance sampling [distancesampling.org](http://distancesampling.org)
- Statistical software (`Distance`, `mrds`, `dsm`)
- Not a biologist/ecologist (not even really a statistician)


Who are you?
============
type:section


What is the structure of the day?
=================================
type:section

Timing
======

- 9am to 5pm
- Morning session: 9am-12pm
- LUNCH?!
- Afternoon session: 1pm-5pm

Today: me talking a lot

Tomorrow: less of that?

Content
=======

- Broad overview of what one can do in `mgcv`
- Deeper on simple stuff
- Focus on 2 data sets
  - species distribution modelling (dolphins)
  - "time series ish" data (zooplankton)

Overview
========

1. Preamble
1. Generalized additive models
1. Fitting GAMs in practice
1. Model checking
1. Model selection
1. Inference
1. "Fancy" stuff

Please interrupt!
=================

- I have a weird accent
- I sometimes mumble
- I will probably say something that is unclear

Credits
=======

- Course based on one given at ESA 2016
  - Eric Pedersen, UWisc, now DFO
  - Gavin Simpson, URegina, SK
- Previous course on density models
  - Jason Roberts, Duke
- Using many of the materials from these courses
- All these materials are **open**, reuse as you like!

![Creative Commons BY](images/by.png)

Course website
==============
[converged.yt/mgcv-workshop](http://converged.yt/mgcv-workshop)
89 | 90 | 91 | -------------------------------------------------------------------------------- /slides/01-intro.Rpres: -------------------------------------------------------------------------------- 1 | Generalized Additive Models 2 | ============================ 3 | author: David L Miller 4 | css: custom.css 5 | transition: none 6 | 7 | 8 | Overview 9 | ========= 10 | 11 | - What is a GAM? 12 | - What is smoothing? 13 | - How do GAMs work? (*Roughly*) 14 | 15 | ```{r setup, include=FALSE} 16 | library(knitr) 17 | library(viridis) 18 | library(ggplot2) 19 | library(reshape2) 20 | library(animation) 21 | library(mgcv) 22 | opts_chunk$set(cache=TRUE, echo=FALSE) 23 | theme_set(theme_minimal()+theme(text=element_text(size=20))) 24 | ``` 25 | 26 | 27 | From GAMs to GLMs and LMs 28 | ============================= 29 | type:section 30 | 31 | 32 | (Generalized) Linear Models 33 | ============================= 34 | 35 | Models that look like: 36 | 37 | $$ 38 | y_i = \beta_0 + x_{1i}\beta_1 + x_{2i}\beta_2 + \ldots + \epsilon_i 39 | $$ 40 | 41 | (describe the response, $y_i$, as linear combination of the covariates, $x_{ji}$, with an offset) 42 | 43 | We can make $y_i\sim$ any exponential family distribution (Normal, Poisson, etc). 44 | 45 | Error term $\epsilon_i$ is normally distributed (usually). 46 | 47 | Why bother with anything more complicated?! 48 | ============================= 49 | type:section 50 | 51 | Is this relationship linear? 52 | ============================= 53 | 54 | ```{r islinear, fig.width=12, fig.height=7} 55 | set.seed(2) ## simulate some data... 56 | dat <- gamSim(1, n=400, dist="normal", scale=1, verbose=FALSE) 57 | dat <- dat[,c("y", "x0", "x1", "x2", "x3")] 58 | p <- ggplot(dat,aes(y=y,x=x1)) + 59 | geom_point() 60 | print(p) 61 | ``` 62 | 63 | A linear model... 64 | ============================= 65 | type:section 66 | 67 | ```{r eval=FALSE, echo=TRUE} 68 | lm(y ~ x1, data=dat) 69 | ``` 70 | 71 | Is this relationship linear? Maybe? 72 | =================================== 73 | 74 | ```{r maybe, fig.width=12, fig.height=7} 75 | print(p + geom_smooth(method="lm")) 76 | ``` 77 | 78 | 79 | What can we do? 80 | ============================= 81 | type:section 82 | 83 | ```{r eval=FALSE, echo=TRUE} 84 | lm(y ~ x1 + poly(x1, 2), data=dat) 85 | ``` 86 | Adding a quadratic term? 87 | ============================= 88 | 89 | 90 | ```{r quadratic, fig.width=12, fig.height=7} 91 | p <- ggplot(dat, aes(y=y, x=x1)) + geom_point() + 92 | theme_minimal() 93 | print(p + geom_smooth(method="lm", formula=y~x+poly(x, 2))) 94 | ``` 95 | 96 | 97 | 98 | 99 | Is this sustainable? 100 | ============================= 101 | 102 | - Adding in quadratic (and higher terms) *can* make sense 103 | - This feels a bit *ad hoc* 104 | - Better if we had a **framework** to deal with these issues? 105 | 106 | ```{r ruhroh, fig.width=12, fig.height=6} 107 | p <- ggplot(dat, aes(y=y, x=x2)) + geom_point() + 108 | theme_minimal() 109 | print(p + geom_smooth(method="lm", formula=y~x+poly(x, 2))) 110 | ``` 111 | 112 | 113 | [drumroll] 114 | ============================= 115 | type:section 116 | 117 | Generalized Additive Models 118 | ============================ 119 | type:section 120 | 121 | "gam" 122 | ==================== 123 | 124 | 1. *Collective noun used to refer to a group of whales, or rarely also of porpoises; a pod.* 125 | 2. *(by extension) A social gathering of whalers (whaling ships).* 126 | 127 |
128 | 129 | (via Nat Kelly, Australian Antarctic Division) 130 | 131 | 132 | Generalized Additive Models 133 | ============================ 134 | 135 | - Generalized: many response distributions 136 | - Additive: terms **add** together 137 | - Models: well, it's a model... 138 | 139 | What does a model look like? 140 | ============================= 141 | 142 | $$ 143 | y_i = \beta_0 + \sum_j s_j(x_{ji}) + \epsilon_i 144 | $$ 145 | 146 | where $\epsilon_i \sim N(0, \sigma^2)$, $y_i \sim \text{Normal}$ (for now) 147 | 148 | Remember that we're modelling the **mean** of this distribution! 149 | 150 | Call the above equation the **linear predictor** 151 | 152 | Okay, but what about these "s" things? 153 | ==================================== 154 | right:55% 155 | 156 | ```{r smoothdat, fig.width=8, fig.height=8} 157 | 158 | spdat <- melt(dat, id.vars = c("y")) 159 | p <- ggplot(spdat,aes(y=y,x=value)) + 160 | geom_point() + 161 | facet_wrap(~variable, nrow=2) 162 | print(p) 163 | ``` 164 | *** 165 | - Think $s$=**smooth** 166 | - Want to model the covariates flexibly 167 | - Covariates and response not necessarily linearly related! 168 | - Want some "wiggles" 169 | 170 | Okay, but what about these "s" things? 171 | ==================================== 172 | right:55% 173 | 174 | ```{r wsmooths, fig.width=8, fig.height=8} 175 | print(p + geom_smooth()) 176 | ``` 177 | *** 178 | - Think $s$=**smooth** 179 | - Want to model the covariates flexibly 180 | - Covariates and response not necessarily linearly related! 181 | - Want some "wiggles" 182 | 183 | What is smoothing? 184 | =============== 185 | type:section 186 | 187 | 188 | Straight lines vs. interpolation 189 | ================================= 190 | right:55% 191 | 192 | ```{r wiggles, fig.height=8, fig.width=8} 193 | library(mgcv) 194 | # hacked from the example in ?gam 195 | set.seed(2) ## simulate some data... 196 | dat <- gamSim(1,n=50,dist="normal",scale=0.5, verbose=FALSE) 197 | dat$y <- dat$f2 + rnorm(length(dat$f2), sd = sqrt(0.5)) 198 | f2 <- function(x) 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10-mean(dat$y) 199 | ylim <- c(-4,6) 200 | 201 | # fit some models 202 | b.justright <- gam(y~s(x2),data=dat) 203 | b.sp0 <- gam(y~s(x2, sp=0, k=50),data=dat) 204 | b.spinf <- gam(y~s(x2),data=dat, sp=1e10) 205 | 206 | curve(f2,0,1, col="blue", ylim=ylim) 207 | points(dat$x2, dat$y-mean(dat$y)) 208 | 209 | ``` 210 | *** 211 | - Want a line that is "close" to all the data 212 | - Don't want interpolation -- we know there is "error" 213 | - Balance between interpolation and "fit" 214 | 215 | Splines 216 | ======== 217 | 218 | - Functions made of other, simpler functions 219 | - **Basis functions** $b_k(x)$, estimate $\beta_k$ 220 | - $s(x) = \sum_{k=1}^K \beta_k b_k(x)$ 221 | 222 |
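(An aside added here, not in the original deck: you can peek at the basis functions themselves with `mgcv::smoothCon()`; a minimal sketch.)

```{r basis-peek, echo=TRUE, fig.width=10, fig.height=5}
# evaluate a 5-function thin plate basis on a grid and plot the functions
xx <- data.frame(x = seq(0, 1, length.out = 200))
sm <- smoothCon(s(x, k = 5), data = xx, knots = NULL)[[1]]
matplot(xx$x, sm$X, type = "l", xlab = "x", ylab = "basis function value")
```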
223 | 224 | 225 | 226 | Design matrices 227 | =============== 228 | 229 | - We often write models as $X\boldsymbol{\beta}$ 230 | - $X$ is our data 231 | - $\boldsymbol{\beta}$ are parameters we need to estimate 232 | - For a GAM it's the same 233 | - $X$ has columns for each basis, evaluated at each observation (row) 234 | - again, this is the linear predictor 235 | 236 | Measuring wigglyness 237 | ====================== 238 | 239 | - Visually: 240 | - Lots of wiggles == NOT SMOOTH 241 | - Straight line == VERY SMOOTH 242 | - How do we do this mathematically? 243 | - Derivatives! 244 | - (Calculus *was* a useful class afterall!) 245 | 246 | 247 | 248 | Wigglyness by derivatives 249 | ========================== 250 | 251 | ```{r wigglyanim, results="hide"} 252 | library(numDeriv) 253 | f2 <- function(x) 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10 - mean(dat$y) 254 | 255 | xvals <- seq(0,1,len=100) 256 | 257 | plot_wiggly <- function(f2, xvals){ 258 | 259 | # pre-calculate 260 | f2v <- f2(xvals) 261 | f2vg <- grad(f2,xvals) 262 | f2vg2 <- unlist(lapply(xvals, hessian, func=f2)) 263 | f2vg2min <- min(f2vg2) -2 264 | 265 | # now plot 266 | for(i in 1:length(xvals)){ 267 | par(mfrow=c(1,3)) 268 | plot(xvals, f2v, type="l", main="function", ylab="f") 269 | points(xvals[i], f2v[i], pch=19, col="red") 270 | 271 | plot(xvals, f2vg, type="l", main="derivative", ylab="df/dx") 272 | points(xvals[i], f2vg[i], pch=19, col="red") 273 | 274 | plot(xvals, f2vg2, type="l", main="2nd derivative", ylab="d2f/dx2") 275 | points(xvals[i], f2vg2[i], pch=19, col="red") 276 | polygon(x=c(0,xvals[1:i], xvals[i],f2vg2min), 277 | y=c(f2vg2min,f2vg2[1:i],f2vg2min,f2vg2min), col = "grey") 278 | 279 | ani.pause() 280 | } 281 | } 282 | 283 | saveGIF(plot_wiggly(f2, xvals), "wiggly.gif", interval = 0.2, ani.width = 800, ani.height = 400) 284 | ``` 285 | 286 | ![Animation of derivatives](wiggly.gif) 287 | 288 | What was that grey bit? 289 | ========================= 290 | 291 | $$ 292 | \int_\mathbb{R} \left( \frac{\partial^2 f(x)}{\partial x^2}\right)^2 \text{d}x\\ 293 | $$ 294 | 295 | - *Turns out* we can always write this as $\boldsymbol{\beta}^\text{T}S\boldsymbol{\beta}$, so the $\boldsymbol{\beta}$ is separate from the derivatives 296 | - Call $S$ the **penalty matrix** 297 | - Different penalties lead to difference $f$ s $\Rightarrow$ different $b_k(x)$ s 298 | 299 | Making wigglyness matter 300 | ========================= 301 | 302 | - $\boldsymbol{\beta}^\text{T}S\boldsymbol{\beta}$ measures wigglyness 303 | - "Likelihood" measures closeness to the data 304 | - Penalise closeness to the data... 305 | - Use a **smoothing parameter** to decide on that trade-off... 306 | - $\lambda \boldsymbol{\beta}^\text{T}S\boldsymbol{\beta}$ 307 | - Estimate the $\beta_k$ terms but penalise objective 308 | - "closeness to data" + penalty 309 | 310 | Smoothing parameter 311 | ======================= 312 | 313 | 314 | ```{r wiggles-plot, fig.width=15} 315 | # make three plots, w. 
estimated smooth, truth and data on each 316 | par(mfrow=c(1,3), cex.main=3.5) 317 | 318 | plot(b.justright, se=FALSE, ylim=ylim, main=expression(lambda*plain("= just right"))) 319 | points(dat$x2, dat$y-mean(dat$y)) 320 | curve(f2,0,1, col="blue", add=TRUE) 321 | 322 | plot(b.sp0, se=FALSE, ylim=ylim, main=expression(lambda*plain("=")*0)) 323 | points(dat$x2, dat$y-mean(dat$y)) 324 | curve(f2,0,1, col="blue", add=TRUE) 325 | 326 | plot(b.spinf, se=FALSE, ylim=ylim, main=expression(lambda*plain("=")*infinity)) 327 | points(dat$x2, dat$y-mean(dat$y)) 328 | curve(f2,0,1, col="blue", add=TRUE) 329 | 330 | ``` 331 | 332 | Smoothing parameter selection 333 | ============================== 334 | 335 | - Many methods: AIC, Mallow's $C_p$, GCV, ML, REML 336 | - Recommendation, based on simulation and practice: 337 | - Use REML or ML 338 | - Reiss \& Ogden (2009), Wood (2011) 339 | 340 | 341 | 342 | 343 | Maximum wiggliness 344 | ======================== 345 | 346 | - We can set **basis complexity** or "size" ($k$) 347 | - Maximum wigglyness 348 | - Smooths have **effective degrees of freedom** (EDF) 349 | - EDF < $k$ 350 | - Set $k$ "large enough" 351 | - Penalty does the rest 352 | 353 | 354 | More on this in a bit... 355 | 356 | Response distributions 357 | ====================== 358 | 359 | - Exponential family distributions are available 360 | - Normal, Poisson, binomial, gamma, quasi etc (`?family`) 361 | - Tweedie and negative binomial 362 | - Plus more! (More on that in a bit) 363 | 364 | 365 | uhoh 366 | ====== 367 | title: none 368 | type:section 369 | 370 |

![spock sobbing mathematically](images/mathematical_sobbing.jpg)

371 | 372 | GAM summary 373 | =========== 374 | 375 | - Straight lines suck --- we want **wiggles** 376 | - Use little functions (**basis functions**) to make big functions (**smooths**) 377 | - Need to make sure your smooths are **wiggly enough** 378 | - Use a **penalty** to trade off wiggliness/generality 379 | 380 | 381 | -------------------------------------------------------------------------------- /slides/02-intro-mgcv.Rpres: -------------------------------------------------------------------------------- 1 | Fitting GAMs in practice 2 | ======================== 3 | author: David L Miller 4 | css: custom.css 5 | transition: none 6 | 7 | 8 | Overview 9 | ========= 10 | 11 | - Introduction to some data 12 | - Fitting simple models 13 | - Plotting simple models 14 | 15 | ```{r setup, include=FALSE} 16 | library(knitr) 17 | library(viridis) 18 | library(ggplot2) 19 | library(reshape2) 20 | library(animation) 21 | library(mgcv) 22 | opts_chunk$set(cache=TRUE, echo=FALSE) 23 | theme_set(theme_minimal()+theme(text=element_text(size=20))) 24 | ``` 25 | 26 | 27 | Data 28 | ==== 29 | type:section 30 | 31 | 32 | Extinction experiment 33 | ===================== 34 | 35 | - 60 populations of *Daphnia magna* in lab 36 | - 1/2 in constant conditions, 1/2 in "deteriorating" 37 | - Data from Drake & Griffen (Nature [doi:10.1038/nature09389](http://doi.org/10.1038/nature09389)) 38 | 39 | ![D. magna doi:10.1371/journal.pbio.0030253](images/Daphnia_magna.png) 40 | 41 | Photo credit [doi:10.1371/journal.pbio.0030253](http://doi.org/10.1371/journal.pbio.0030253) 42 | 43 | 44 | Extinction experiment 45 | ===================== 46 | 47 | ```{r data-extinct, fig.width=12, fig.height=8} 48 | load("../data/drake_griffen/drake_griffen.RData") 49 | populations$Treatment <- "Control" 50 | populations$Treatment[populations$deteriorating] <- "Deteriorating" 51 | p <- ggplot(populations) + 52 | geom_point(aes(x=day, y=Nhat, colour=Treatment, group=ID)) + 53 | geom_vline(xintercept=154) + 54 | geom_text(aes(x=x,y=y, label=label), size=9, 55 | data=data.frame(x=230, y=300, label="Start deterioration")) + 56 | labs(y="Count") 57 | print(p) 58 | ``` 59 | 60 | Inferential goals 61 | ================= 62 | 63 | - Make sense of all these dots! 64 | - What are the "average" trends? 65 | - (What are non-average trends?) 66 | - When do the deteriorating populations go extinct on average? 67 | 68 | 69 | Pantropical spotted dolphins 70 | ============================== 71 | 72 | - Example taken from Miller et al (2013) 73 | - [Paper appendix](http://distancesampling.org/R/vignettes/mexico-analysis.html) has a better analysis 74 | - Simple example here, ignoring all kinds of important stuff! 75 | 76 | ![a pantropical spotted dolphin doing its thing](images/spotteddolphin_swfsc.jpg) 77 | 78 | 79 | Inferential aims 80 | ================= 81 | 82 | ```{r loaddat} 83 | load("../data/mexdolphins/mexdolphins.RData") 84 | ``` 85 | 86 | ```{r gridplotfn} 87 | library(rgdal) 88 | library(rgeos) 89 | library(maptools) 90 | library(plyr) 91 | # fill must be in the same order as the polygon data 92 | grid_plot_obj <- function(fill, name, sp){ 93 | 94 | # what was the data supplied? 
95 | names(fill) <- NULL 96 | row.names(fill) <- NULL 97 | data <- data.frame(fill) 98 | names(data) <- name 99 | 100 | spdf <- SpatialPolygonsDataFrame(sp, data) 101 | spdf@data$id <- rownames(spdf@data) 102 | spdf.points <- fortify(spdf, region="id") 103 | spdf.df <- join(spdf.points, spdf@data, by="id") 104 | 105 | # seems to store the x/y even when projected as labelled as 106 | # "long" and "lat" 107 | spdf.df$x <- spdf.df$long 108 | spdf.df$y <- spdf.df$lat 109 | 110 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df) 111 | } 112 | ``` 113 | 114 | - How many dolphins are there? 115 | - Where are the dolphins? 116 | - What are they interested in? 117 | 118 | ```{r spatialEDA, fig.cap="", fig.width=15} 119 | # some nearby states, transformed 120 | library(mapdata) 121 | map_dat <- map_data("worldHires",c("usa","mexico")) 122 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 123 | map_sp <- SpatialPoints(map_dat[,c("long","lat")]) 124 | 125 | # give the sp object a projection 126 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84") 127 | # re-project 128 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4) 129 | map_dat$x <- map_sp.t$long 130 | map_dat$y <- map_sp.t$lat 131 | 132 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4) 133 | p <- ggplot() + 134 | grid_plot_obj(preddata$depth, "Depth", pred.polys) + 135 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 136 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) + 137 | geom_point(aes(x, y, size=count), 138 | data=mexdolphins[mexdolphins$count>0,], 139 | colour="red", alpha=I(0.7)) + 140 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), xlim = range(mexdolphins$x)) + 141 | scale_fill_viridis(direction=-1) + 142 | labs(fill="Depth",x="x",y="y",size="Count") + 143 | theme_minimal() 144 | #p <- p + gg.opts 145 | print(p) 146 | ``` 147 | 148 | 149 | Translating maths into R 150 | ========================== 151 | 152 | A simple example: 153 | 154 | $$ 155 | y_i = \beta_0 + s(x) + s(w) + \epsilon_i 156 | $$ 157 | 158 | where $\epsilon_i \sim N(0, \sigma^2)$ 159 | 160 | Let's pretend that $y_i \sim \text{Normal}$ 161 | 162 | - linear predictor: `formula = y ~ s(x) + s(w)` 163 | - response distribution: `family=gaussian()` 164 | - data: `data=some_data_frame` 165 | 166 | Putting that together 167 | ====================== 168 | 169 | ```{r echo=TRUE, eval=FALSE} 170 | my_model <- gam(y ~ s(x) + s(w), 171 | family = gaussian(), 172 | data = some_data_frame, 173 | method = "REML") 174 | ``` 175 | 176 | - `method="REML"` uses REML for smoothness selection (default is `"GCV.Cp"`) 177 | 178 | 179 | What about a practical example? 180 | ================================ 181 | type:section 182 | 183 | Simple extinction example 184 | ========================= 185 | 186 | ```{r ext-ex, echo=TRUE} 187 | dmagna_happy <- gam(Nhat~s(day, k=50), data=pop_happy, method="REML") 188 | dmagna_unhappy <- gam(Nhat~s(day, k=50), data=pop_unhappy, method="REML") 189 | ``` 190 | 191 | - Fit 2 models, one to control group, one to experimental group 192 | - `Nhat` is the count of *D. magna* (estimated from 3 censuses) 193 | - Model as a function of `day` 194 | 195 | What did that do? 196 | ================= 197 | 198 | ```{r ext-ex-summary, echo=TRUE} 199 | summary(dmagna_unhappy) 200 | ``` 201 | 202 | Cool, but what about a plot? 
203 | ============================ 204 | 205 | ```{r ext-plot, fig.show="hold", fig.width=12, fig.height=7} 206 | par(mfrow=c(1,2)) 207 | plot(dmagna_happy, main="happy zooplankton", shift=mean(pop_happy$Nhat), scale=0, shade=TRUE) 208 | points(pop_happy[,c("day", "Nhat")], pch=19, cex=0.3, alpha=0.3, col="darkgrey") 209 | abline(h=0) 210 | plot(dmagna_unhappy, main="unhappy zooplankton", shift=mean(pop_unhappy$Nhat), scale=0, shade=TRUE) 211 | abline(h=0) 212 | points(pop_unhappy[,c("day", "Nhat")], pch=19, cex=0.3, col="darkgrey") 213 | ``` 214 | 215 | How did we do that? 216 | =================== 217 | 218 | ```{r ext-plot-t, echo=TRUE, eval=FALSE} 219 | plot(dmagna_unhappy, main="unhappy zooplankton", shift=mean(pop_unhappy$Nhat), scale=0, shade=TRUE) 220 | abline(h=0) 221 | ``` 222 | 223 | - `scale=0` puts each term on it's own $y$ axis 224 | - Terms are centred, need `shift=` to re-centre (don't in models w/ > 1 term!) 225 | - `shade=TRUE` makes +/-2 se into shaded area 226 | - `main=` sets title, as usual 227 | 228 | A simple dolphin model 229 | ====================== 230 | 231 | ```{r firstdsm, echo=TRUE} 232 | library(mgcv) 233 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set), 234 | data = mexdolphins, 235 | family = quasipoisson(), 236 | method = "REML") 237 | ``` 238 | 239 | - count is a function of depth 240 | - `off.set` is the effort expended 241 | - we have count data, try quasi-Poisson distribution 242 | 243 | What did that do? 244 | =================== 245 | 246 | ```{r echo=TRUE} 247 | summary(dolphins_depth) 248 | ``` 249 | 250 | Plotting 251 | ================ 252 | 253 | ```{r plotsmooth} 254 | plot(dolphins_depth) 255 | ``` 256 | *** 257 | - `plot(dolphins_depth)` 258 | - Dashed lines indicate +/- 2 standard errors 259 | - Rug plot 260 | - On the link scale 261 | - EDF on $y$ axis 262 | 263 | Plotting (shaded se) 264 | ==================== 265 | 266 | ```{r plotsmooth-shade} 267 | plot(dolphins_depth, shade=TRUE) 268 | ``` 269 | *** 270 | - `plot(dolphins_depth, shade=TRUE)` 271 | - `?plot` has a **lot** of options 272 | - `plotdat <- plot(model)` gives you the data that generates the plot 273 | 274 | Thin plate regression splines 275 | ================================ 276 | 277 | - Default basis 278 | - One basis function per data point 279 | - Reduce # basis functions (eigendecomposition) 280 | - Fitting on reduced problem 281 | - Multidimensional 282 | - Wood (2003) 283 | 284 | 285 | Bivariate terms 286 | ================ 287 | 288 | - Assumed an additive structure 289 | - No interaction 290 | - We can specify `s(x,y)` (and `s(x,y,z,...)`) 291 | - (Assuming *isotropy* here...) 
292 | 293 | ```{r xydsmplot, fig.width=15, fig.height=7} 294 | dolphins_depth_xy <- gam(count ~ s(x, y) + offset(off.set), 295 | data = mexdolphins, 296 | family=quasipoisson(), method="REML") 297 | par(mfrow=c(1,3)) 298 | vis.gam(dolphins_depth_xy, view=c("x","y"), phi=45, theta=20, asp=1) 299 | vis.gam(dolphins_depth_xy, view=c("x","y"), phi=45, theta=60, asp=1) 300 | vis.gam(dolphins_depth_xy, view=c("x","y"), phi=45, theta=160, asp=1) 301 | ``` 302 | 303 | Adding a term 304 | =============== 305 | 306 | - Add a **surface** for location ($x$ and $y$) 307 | - Just use `+` for an extra term 308 | 309 | ```{r xydsm, echo=TRUE} 310 | dolphins_depth_xy <- gam(count ~ s(depth) + s(x, y) + offset(off.set), 311 | data = mexdolphins, 312 | family=quasipoisson(), method="REML") 313 | ``` 314 | 315 | 316 | Summary 317 | =================== 318 | 319 | ```{r echo=TRUE} 320 | summary(dolphins_depth_xy) 321 | ``` 322 | 323 | 324 | Plot x,y term 325 | ============= 326 | 327 | ```{r plot-scheme2, echo=TRUE, fig.width=10, eval=FALSE} 328 | plot(dolphins_depth_xy, select=2, cex=3, asp=1, lwd=2, scheme=2) 329 | ``` 330 | - `scale=0`: each plot on different scale 331 | - `select=` picks which smooth to plot 332 | - `scheme=2` much better for bivariate terms 333 | - `vis.gam()` also useful 334 | 335 | Plot x,y term 336 | ============== 337 | ```{r plot-scheme2p, echo=TRUE, fig.width=12} 338 | plot(dolphins_depth_xy, select=2, cex=3, asp=1, lwd=2, scheme=2) 339 | ``` 340 | 341 | Fitting/plotting GAMs summary 342 | ============================= 343 | 344 | - `gam` does all the work 345 | - very similar to `glm` 346 | - `s` indicates a smooth term 347 | - `summary`, `plot` methods 348 | 349 | -------------------------------------------------------------------------------- /slides/03-model_checking.Rpres: -------------------------------------------------------------------------------- 1 | Model checking 2 | ============================ 3 | author: David L Miller 4 | css: custom.css 5 | transition: none 6 | 7 | 8 | Outline 9 | ================= 10 | 11 | You fitted a GAM, everything is fine, right? *Right?* 12 | 13 | But what about? 14 | 15 | - Smooth terms flexibility? 16 | - Non-constant variance? 17 | - Response distribution selection? 18 | - Correlated covariates? 19 | 20 | 21 | ```{r setup, echo=FALSE} 22 | library(mgcv) 23 | library(magrittr) 24 | library(ggplot2) 25 | library(dplyr) 26 | library(statmod) 27 | ``` 28 | 29 | ```{r pres_setup, include=FALSE} 30 | library(knitr) 31 | opts_chunk$set(cache=TRUE, echo=FALSE, fig.align="center") 32 | ``` 33 | 34 | blah 35 | ==== 36 | type:section 37 | title:none 38 | 39 |
"perhaps the most important part of applied statistical modelling"
40 | 41 | Simon Wood, Generalized Additive Models: An Introduction in R 42 | 43 | Basis size (k) 44 | ============== 45 | 46 | - `k` $\approx$ number of basis functions 47 | - Set `k` per term 48 | - e.g. `s(x, k=10)` or `s(x, y, k=100)` 49 | - Penalty removes "extra" wigglyness 50 | - *up to a point!* 51 | - (But computation is slower with bigger `k`) 52 | 53 | ```{r loaddat} 54 | load("../data/drake_griffen/drake_griffen.RData") 55 | ``` 56 | 57 | Default basis size 58 | ==================== 59 | 60 | ```{r gam_check, fig.keep="none", include=TRUE,echo=TRUE, fig.width=15} 61 | b <- gam(Nhat ~ s(day), data=pop_unhappy, method="REML") 62 | gam.check(b) 63 | ``` 64 | 65 | Increasing basis size 66 | ==================== 67 | 68 | ```{r gam_check2, fig.keep="none", include=TRUE,echo=TRUE, fig.width=15} 69 | b_50 <- gam(Nhat ~ s(day, k=50), data=pop_unhappy, method="REML") 70 | gam.check(b_50) 71 | ``` 72 | 73 | Does it make a difference?! 74 | =========================== 75 | 76 | ```{r ext-flex, fig.width=12} 77 | par(mfrow=c(1,2)) 78 | plot(b, main="k=10", shift=mean(pop_unhappy$Nhat), scale=0, shade=TRUE) 79 | abline(h=0) 80 | points(pop_unhappy[,c("day", "Nhat")], pch=19, cex=0.3, col="darkgrey") 81 | plot(b_50, main="k=50", shift=mean(pop_unhappy$Nhat), scale=0, shade=TRUE) 82 | abline(h=0) 83 | points(pop_unhappy[,c("day", "Nhat")], pch=19, cex=0.3, col="darkgrey") 84 | ``` 85 | 86 | General strategy 87 | ================ 88 | 89 | - leave `k` at default, see what happens 90 | - double `k` depending on results from `gam.check` 91 | - repeat 92 | 93 |
94 | `?choose.k` has some good advice 95 | 96 | Historical/philosophical note 97 | ============================= 98 | 99 | - "Keep relationships simple and interpretable" 100 | - What does this mean? 101 | - Bias confirmation? 102 | - Limit model to get "clean" relationships? 103 | - Some literature suggests "limit `k=5`" or somesuch 104 | - Original `gam` package for S+ had a default `k=5` 105 | - Coincidence? 106 | - (Simon Wood, pers. comm.) 107 | 108 | 109 | 110 | Residual checks 111 | =============== 112 | type:section 113 | 114 | Residuals 115 | ========= 116 | 117 | - Deal with 2 types of residuals 118 | - Deviance 119 | - Randomized quantile 120 | - Raw residuals are just (observed - fitted) 121 | - Analog to R^2 122 | - Difficult to assess mean-variance relationship graphically 123 | - Need to rescale so mean-variance is constant 124 | 125 | Deviance residuals 126 | ================== 127 | 128 | - Deviance $\approx$ "R^2 for GAMs" 129 | - Per-observation deviance $\approx$ raw resids? 130 | - Multiply by sign of (observed-fitted) 131 | - Should be Normal(0, 1) distributed 132 | 133 | gam.check() plots 134 | ============================= 135 | 136 | `gam.check()` creates 4 plots: 137 | 138 | 1. Quantile-quantile plots of residuals 139 | 2. Histogram of residuals 140 | 3. Residuals vs. linear predictor 141 | 4. Observed vs. fitted values 142 | 143 | 144 | Checking response distribution 145 | ============================== 146 | 147 | - Left side of `gam.check` plots 148 | - Examples from the Drake & Griffen data 149 | - Looking at the "deteriorating" populations 150 | 151 | ```{r, fig.width=12} 152 | plot(b_50, main="unhappy zooplankton", shift=mean(pop_unhappy$Nhat), scale=0, shade=TRUE) 153 | abline(h=0) 154 | points(pop_unhappy[,c("day", "Nhat")], pch=19, cex=0.3, col="darkgrey") 155 | ``` 156 | 157 | Normal response with count data 158 | =================================== 159 | 160 | ```{r gam_gauss, echo=TRUE, results="hide"} 161 | b <- gam(Nhat ~ s(day, k=50), data=pop_unhappy, method="REML") 162 | ``` 163 | ```{r gam_gauss_p, results="hide", fig.width=10, fig.height=6} 164 | gam.check(b) 165 | ``` 166 | 167 | What about a count distribution? 168 | ================================ 169 | 170 | ```{r gam_quasi, echo=TRUE, results="hide"} 171 | b_quasi <- gam(Nhat ~ s(day, k=50), data=pop_unhappy, method="REML", family=quasipoisson()) 172 | ``` 173 | ```{r gam_quasi_p, results="hide", fig.width=10, fig.height=6} 174 | gam.check(b_quasi) 175 | ``` 176 | 177 | 178 | What about a fancier count distribution? 179 | ================================ 180 | 181 | ```{r gam_nb, echo=TRUE, results="hide"} 182 | b_nb <- gam(Nhat ~ s(day, k=50), data=pop_happy, method="REML", family=nb()) 183 | ``` 184 | ```{r gam_nb_p, results="hide", fig.width=10, fig.height=8} 185 | gam.check(b_nb) 186 | ``` 187 | 188 | 189 | Variance relationships 190 | ====================== 191 | 192 | - Heteroskedasticity 193 | - Do we know that the mean-variance relationship is right? 194 | - Deviance resids should give us constant variance if model correct? 195 | - Right column of `gam.check`: 196 | - residuals vs. linear prediction == cloud 197 | - Response vs. 
fitted == line-ish 198 | 199 | Mean-variance incorrect 200 | ======================= 201 | 202 | ```{r gam_nb_p2, results="hide", fig.width=10, fig.height=8} 203 | gam.check(b_nb) 204 | ``` 205 | 206 | Close up 207 | ======== 208 | 209 | ```{r res-linear-nb, results="hide", fig.width=10, fig.height=8} 210 | resid <- residuals(b_nb, type = "deviance") 211 | linpred <- predict(b_nb) 212 | plot(linpred, resid, main = "Resids vs. linear pred.", xlab = "linear predictor", 213 | ylab = "residuals", cex=1.5) 214 | ``` 215 | 216 | Randomized quantile residuals 217 | ============================= 218 | 219 | ```{r qres-linear-nb, results="hide", fig.width=10, fig.height=8} 220 | library(statmod) 221 | b_nb$theta <- b_nb$family$getTheta(TRUE) 222 | resid <- qresid(b_nb) 223 | plot(linpred, resid, main = "RQ-Resids vs. linear pred.", xlab = "linear predictor", 224 | ylab = "randomized quantile residuals", cex=1.5) 225 | ``` 226 | 227 | 228 | Shortcomings 229 | ============= 230 | 231 | - Deviance resids vs. linear predictor is victim of artifacts 232 | - Need an alternative 233 | - "Randomised quantile residuals" (*experimental*) 234 | - `rqresiduals` in `statmod` 235 | - Exactly normal residuals ... if the model is right! 236 | - `rqgam.check` in `dsm` (ignore left side plots!) 237 | 238 | These plots are just the start 239 | ============================== 240 | 241 | - Need to go further 242 | - Look at aggregations of residuals by other variables 243 | 244 | ```{r resids-by-pop, results="hide", fig.width=12, fig.height=7} 245 | pop_happy$resids <- residuals(b_nb) 246 | boxplot(resids~ID, pop_happy, main="Residuals per experimental population") 247 | ``` 248 | 249 | Residual checking as art form 250 | ============================= 251 | 252 | - Residuals can tell you a **lot** about your model 253 | - No general method 254 | - Depends on data 255 | - Depends on inferential goals 256 | - Highlight model deficiencies 257 | - Inform what to do next; which other questions are interesting 258 | 259 | 260 | Tobler's first law of geography 261 | ================================== 262 | type:section 263 | 264 | "Everything is related to everything else, but near things are more related than distant things" 265 | 266 | Tobler (1970) 267 | 268 | 269 | Concurvity 270 | ========== 271 | 272 | - We all know correlated covariates are bad 273 | - What about non-linear correlations? 274 | - Can we describe covariates as functions of each other? 
275 | - Important for model selection --- sensitivity analysis 276 | 277 | Example - US East coast 278 | ======================= 279 | 280 | ![us east coast with covariates](images/spermcovars.png) 281 | 282 | Checking for concurvity 283 | ======================= 284 | 285 | - Measures, for each smooth term, how well this term could be approximated by 286 | - `concurvity(model, full=TRUE)`: some combination of all other smooth terms 287 | - `concurvity(model, full=FALSE)`: each of the other smooth terms in the model 288 | (useful for identifying which terms are causing issues) 289 | 290 | Plotting concurvity 291 | =================== 292 | 293 | ![concurvity vis](images/concurvity.png) 294 | *** 295 | - We can visualise 296 | - `vis.concurvity` on course site 297 | 298 | Concurvity: things to remember 299 | ============================== 300 | - Can make your model unstable to small changes 301 | - `cor(data)` not sufficient: use the `concurvity(model)` function 302 | - Not always obvious from plots of covariates or smooths 303 | 304 | 305 | 306 | -------------------------------------------------------------------------------- /slides/04-model_selection.Rpres: -------------------------------------------------------------------------------- 1 | Model selection 2 | =============== 3 | author: David L Miller 4 | css: custom.css 5 | transition: none 6 | 7 | 8 | What do we mean by "model selection"? 9 | ===================================== 10 | 11 | - Term "selection" 12 | - Path dependence 13 | - Shrinkage 14 | - Selection between models 15 | - Term formulation 16 | - Selection between structurally different models 17 | 18 | ```{r setup, include=FALSE} 19 | library("knitr") 20 | library("viridis") 21 | library("ggplot2") 22 | library("mgcv") 23 | theme_set(theme_minimal()) 24 | opts_chunk$set(cache=TRUE, echo=FALSE) 25 | ``` 26 | 27 | Term selection 28 | ============== 29 | type:section 30 | 31 | Term selection via p-values 32 | =========================== 33 | 34 | - Old paradigm -- select terms using $p$-values 35 | - $p$-values are **approximate** 36 | 37 | 1. treat smoothing parameters as *known* 38 | 2. rely on asymptotic behaviour 39 | 40 | ($p$-values in `summary.gam()` have changed a lot over time --- all options except current default are deprecated as of `v1.18-13` (*i.e.,* ignore what's in the book!).) 41 | 42 | Technical stuff 43 | =============== 44 | 45 | Test of **zero-effect** of a smooth term 46 | 47 | Default $p$-values rely on theory of Nychka (1988) and Marra & Wood (2012) for confidence interval coverage. 48 | 49 | If the Bayesian CI have good across-the-function properties, Wood (2013a) showed that the $p$-values have: 50 | 51 | - almost the correct null distribution 52 | - reasonable power 53 | 54 | Test statistic is a form of $\chi^2$ statistic, but with complicated degrees of freedom. 55 | 56 | (RE)ML rant again... 57 | ==================== 58 | 59 | Best behaviour when smoothness selection is by **ML** (**REML** also good) 60 | 61 | Neither of these are the default, so remember to use `method = "ML"` or `method = "REML"` as appropriate 62 | 63 | 64 | Term selection by shrinkage 65 | ============================ 66 | type:section 67 | 68 | Shrinkage & additional penalties 69 | ================================ 70 | 71 | - Usually can't remove a whole function when smoothing 72 | - Penalties used act only on *range space* 73 | - *null space* of the basis is unpenalised. 
74 |
75 | Parts of $f$ whose 2nd derivative is zero (the constant and linear parts) aren't penalised by
76 |
77 | $$
78 | \int_\mathbb{R} \left( \frac{\partial^2 f(x)}{\partial x^2}\right)^2 \text{d}x
79 | $$
80 |
81 | (Note that penalty form depends on the basis!)
82 |
83 | Double-penalty shrinkage
84 | ========================
85 |
86 | Take $\mathbf{S}_j$, the penalty matrix for term $j$, and eigendecompose:
87 |
88 | $$
89 | \mathbf{S}_j = \mathbf{U}_j\mathbf{\Lambda}_j\mathbf{U}_j^{T}
90 | $$
91 |
92 | where $\mathbf{U}_j$ is a matrix of eigenvectors and $\mathbf{\Lambda}_j$ a diagonal matrix of eigenvalues (i.e., this is an eigen decomposition of $\mathbf{S}_j$).
93 |
94 | $\mathbf{\Lambda}_j$ contains some **0**s due to the spline basis null space --- no matter how large the penalty $\lambda_j$ gets, there is no guarantee that a smooth term will be suppressed completely.
95 |
96 |
97 | Shrinkage & additional penalties
98 | ================================
99 |
100 | `mgcv` has two ways to penalize the null space $\Rightarrow$ term selection
101 |
102 | - *double penalty approach* via `select = TRUE`
103 | - *shrinkage approach* via special bases `"ts"` and `"cs"`
104 |
105 | Marra & Wood (2011) review other options.
106 |
107 | Double-penalty shrinkage
108 | ========================
109 |
110 | Create a second penalty matrix from $\mathbf{U}_j$, considering only the matrix of eigenvectors associated with the zero eigenvalues
111 |
112 | $$
113 | \mathbf{S}_j^{*} = \mathbf{U}_j^{*}\mathbf{U}_j^{*T}
114 | $$
115 |
116 | Now we can fit a GAM with two penalties of the form
117 |
118 | $$
119 | \lambda_j \mathbf{\beta}^T \mathbf{S}_j \mathbf{\beta} + \lambda_j^{*} \mathbf{\beta}^T \mathbf{S}_j^{*} \mathbf{\beta}
120 | $$
121 |
122 | In practice, add `select = TRUE` to your `gam()` call
123 |
124 | Shrinkage
125 | =========
126 |
127 | - Double penalty $\Rightarrow$ twice as many smoothing parameters
128 | - Alternative is shrinkage: add a small value to the zero eigenvalues
129 | - Null space terms are then shrunk at the same time as the range space
130 |
131 | Use `s(..., bs = "ts")` or `s(..., bs = "cs")` in **mgcv**
132 |
133 | Selecting between models
134 | ========================
135 | type:section
136 |
137 | Model selection by REML/ML
138 | ==========================
139 |
140 | - If you **don't** use shrinkage/double penalty you can't use REML scores to select models
141 | - Need the model to be fully penalized for REML score selection
142 | - You can use ML though
143 | - *but* there are other options...
144 |
145 | AIC
146 | ===
147 | type:section
148 |
149 | AIC
150 | ============
151 |
152 | - The full likelihood of $\boldsymbol{\beta}$ *conditional* upon $\lambda_j$ is used, with the EDF replacing $k$, the number of model parameters
153 | - This *conditional* AIC tends to select complex models, especially those with random effects, as the EDF ignores that $\lambda_j$ are estimated
154 | - Wood et al (2015) suggest a correction that accounts for uncertainty in $\lambda_j$ (`AIC`)
155 |
156 | Okay, fine, but...
157 | ==================
158 | type:section
159 |
160 | GAMs are Bayesian models
161 | ========================
162 | type:section
163 |
164 | Bayesian models
165 | ===============
166 |
167 | - *duh*
168 | - we can build Bayesian GLMs
169 | - see also INLA and BayesX
170 | - `mgcv` fits Bayesian models
171 | - penalties are prior precision matrices
172 | - (improper) Gaussian prior on $\boldsymbol{\beta}$
173 |
174 | Empirical Bayes...?
175 | ===================
176 |
177 | - Improper prior derives from $\mathbf{S}_j$ not being of full rank
178 | - zeroes in $\mathbf{\Lambda}_j$.
179 | - Double penalty and shrinkage smooths make prior proper
180 | - **Double penalty**: no assumption as to how much to shrink the null space
181 | - **Shrinkage smooths**: assume null space should be shrunk less than the wiggles
182 |
183 | Practical Bayes
184 | ===============
185 |
186 | Marra & Wood (2011) show that the double penalty and the shrinkage smooth approaches:
187 |
188 | - performed significantly better than alternatives in terms of *predictive ability*, and
189 | - performed as well as alternatives in terms of variable selection
190 |
191 |
192 |
193 | Examples
194 | ========
195 | type:section
196 |
197 | Back to the dolphins...
198 | =======================
199 | type:section
200 |
201 | ```{r}
202 | load("../data/mexdolphins/mexdolphins.RData")
203 | ```
204 |
205 | Fitting some models
206 | =======================
207 |
208 | ```{r echo=TRUE}
209 | dolphins_depth_xy <- gam(count ~ s(x, y) + s(depth) + offset(off.set),
210 | data = mexdolphins,
211 | family=nb(), method="REML")
212 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set),
213 | data = mexdolphins,
214 | family=nb(), method="REML")
215 | dolphins_xy <- gam(count ~ s(x, y) + offset(off.set),
216 | data = mexdolphins,
217 | family=nb(), method="REML")
218 | ```
219 |
220 | Comparing terms by p-value
221 | ==========================
222 |
223 | ```{r echo=TRUE}
224 | summary(dolphins_depth_xy)
225 | ```
226 |
227 | Comparing models by AIC
228 | =======================
229 |
230 | ```{r echo=TRUE}
231 | AIC(dolphins_xy, dolphins_depth, dolphins_depth_xy)
232 | ```
233 |
234 |
235 | Shrinkage (basis)
236 | =================
237 |
238 | ```{r echo=TRUE}
239 | dolphins_depth_xy_s <- gam(count ~ s(x, y, bs="ts") + s(depth, bs="ts") + offset(off.set),
240 | data = mexdolphins,
241 | family=nb(), method="REML")
242 | ```
243 |
244 |
245 | Shrinkage (basis)
246 | =================
247 | ```{r echo=TRUE}
248 | summary(dolphins_depth_xy_s)
249 | ```
250 |
251 |
252 | Shrinkage (extra penalty)
253 | =========================
254 |
255 | ```{r echo=TRUE}
256 | dolphins_depth_xy_e <- gam(count ~ s(x, y) + s(depth) + offset(off.set),
257 | data = mexdolphins, select=TRUE,
258 | family=nb(), method="REML")
259 | ```
260 |
261 |
262 | Shrinkage (extra penalty)
263 | =========================
264 | ```{r echo=TRUE}
265 | summary(dolphins_depth_xy_e)
266 | ```
267 |
268 | These last two models were empirical Bayes models
269 | =================================================
270 | type:section
271 |
272 | That's all from the dolphins for now...
273 | =======================================
274 | type:section
275 |
276 |
277 | ![dolphins from HHG2G](images/thanks_for_all_the_fish.gif)
278 |
279 |
280 |
281 |
282 |
--------------------------------------------------------------------------------
/slides/05-inference.Rpres:
--------------------------------------------------------------------------------
1 | Inference
2 | =========
3 | author: David L Miller
4 | css: custom.css
5 | transition: none
6 |
7 |
8 | ```{r setup, include=FALSE}
9 | library(knitr)
10 | library(viridis)
11 | library(ggplot2)
12 | library(reshape2)
13 | library(animation)
14 | library(mgcv)
15 | opts_chunk$set(cache=TRUE, echo=FALSE)
16 | theme_set(theme_minimal()+theme(text=element_text(size=20)))
17 | ```
18 |
19 |
20 | What do we want to know?
21 | ========================
22 |
23 | - Don't just fit models for the sake of it!
24 | - What are our questions?
25 | - Relationship to covariates
26 | - Abundance
27 | - Distribution
28 | - Response to disturbance
29 | - Temporal changes
30 | - Other stuff?
31 |
32 |
33 |
34 | Prediction
35 | ===========
36 | type:section
37 |
38 | What is a prediction?
39 | =====================
40 |
41 | - Evaluate the model, at a particular covariate combination
42 | - Answering (e.g.) the question "at a given depth, how many dolphins?"
43 | - Steps:
44 | 1. evaluate the $s(\ldots)$ terms
45 | 2. move to the response scale (exponentiate? Do nothing?)
46 | 3. (multiply by any offset etc)
47 |
48 | Example of prediction
49 | ======================
50 |
51 | - in maths:
52 | - Model: $\text{count}_i = A_i \exp \left( \beta_0 + s(x_i, y_i) + s(\text{Depth}_i)\right)$
53 | - Drop in the values of $x, y, \text{Depth}$ (and $A$)
54 | - in R:
55 | - build a `data.frame` with $x, y, \text{Depth}, A$
56 | - use `predict()`
57 |
58 | ```{r echo=TRUE, eval=FALSE}
59 | preds <- predict(my_model, newdata=my_data, type="response")
60 | ```
61 |
62 | (`se.fit=TRUE` gives a standard error for each prediction)
63 |
64 | Back to the dolphins...
65 | =======================
66 | type:section
67 |
68 | Where are the dolphins?
69 | =======================
70 |
71 | ```{r echo=FALSE}
72 |
73 | ```
74 |
75 | ```{r loaddat}
76 | load("../data/mexdolphins/mexdolphins.RData")
77 | ```
78 |
79 | ```{r gridplotfn}
80 | library(rgdal)
81 | library(rgeos)
82 | library(maptools)
83 | library(plyr)
84 | # fill must be in the same order as the polygon data
85 | grid_plot_obj <- function(fill, name, sp){
86 |
87 | # what was the data supplied?
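# strip names so the fill values pair with the polygons by position,
# then build a data.frame to attach to the SpatialPolygonsDataFrame below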
88 | names(fill) <- NULL 89 | row.names(fill) <- NULL 90 | data <- data.frame(fill) 91 | names(data) <- name 92 | 93 | spdf <- SpatialPolygonsDataFrame(sp, data) 94 | spdf@data$id <- rownames(spdf@data) 95 | spdf.points <- fortify(spdf, region="id") 96 | spdf.df <- join(spdf.points, spdf@data, by="id") 97 | 98 | # seems to store the x/y even when projected as labelled as 99 | # "long" and "lat" 100 | spdf.df$x <- spdf.df$long 101 | spdf.df$y <- spdf.df$lat 102 | 103 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df) 104 | } 105 | # some nearby states, transformed 106 | library(mapdata) 107 | map_dat <- map_data("worldHires",c("usa","mexico")) 108 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 109 | map_sp <- SpatialPoints(map_dat[,c("long","lat")]) 110 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4) 111 | 112 | # give the sp object a projection 113 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84") 114 | # re-project 115 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4) 116 | map_dat$x <- map_sp.t$long 117 | map_dat$y <- map_sp.t$lat 118 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set), 119 | data = mexdolphins, 120 | family=nb(), method="REML") 121 | ``` 122 | 123 | ```{r echo=TRUE} 124 | dolphin_preds <- predict(dolphins_depth, newdata=preddata, 125 | type="response") 126 | ``` 127 | 128 | ```{r fig.width=20} 129 | p <- ggplot() + 130 | grid_plot_obj(dolphin_preds, "N", pred.polys) + 131 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 132 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) + 133 | geom_point(aes(x, y, size=count), 134 | data=mexdolphins[mexdolphins$count>0,], 135 | colour="red", alpha=I(0.7)) + 136 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), xlim = range(mexdolphins$x)) + 137 | scale_fill_viridis() + 138 | labs(fill="Predicted\ndensity", x="x", y="y", size="Count") 139 | print(p) 140 | ``` 141 | 142 | (`ggplot2` code included in the slide source) 143 | 144 | Prediction summary 145 | ================== 146 | 147 | - Evaluate the fitted model at a given point 148 | - Can evaluate many at once (`data.frame`) 149 | - Don't forget the `type=...` argument! 150 | - Obtain per-prediction standard error with `se.fit` 151 | 152 | Without uncertainty, we're not doing statistics 153 | =============================================== 154 | type:section 155 | 156 | 157 |
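As a first look at uncertainty, a minimal sketch of the `se.fit` route mentioned in the prediction summary (reusing `dolphins_depth` and `preddata` from the previous slides):

```{r sefit-sketch, echo=TRUE, eval=FALSE}
# per-prediction standard errors to go with the predictions
pr <- predict(dolphins_depth, newdata=preddata, type="response", se.fit=TRUE)
head(pr$fit)    # predicted counts
head(pr$se.fit) # their standard errors
```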
161 |
162 |
163 | Where does uncertainty come from?
164 | =================================
165 |
166 | - $\boldsymbol{\beta}$: uncertainty in the spline parameters
167 | - $\boldsymbol{\lambda}$: uncertainty in the smoothing parameter
168 |
169 | - (Traditionally we've only addressed the former)
170 | - (New tools let us address the latter...)
171 |
172 |
173 | Parameter uncertainty
174 | =======================
175 |
176 | From theory:
177 |
178 | $$
179 | \boldsymbol{\beta} \sim N(\hat{\boldsymbol{\beta}}, \mathbf{V}_\boldsymbol{\beta})
180 | $$
181 |
182 | (*caveat: the normality is only* **approximate** *for non-normal response*)
183 |
184 |
185 | **What does this mean?** A variance for each parameter (and covariances between them).
186 |
187 | In `mgcv`: `vcov(model)` returns $\mathbf{V}_\boldsymbol{\beta}$.
188 |
189 |
190 | What can we do with this?
191 | =========================
192 |
193 | - confidence intervals in `plot`
194 | - standard errors using `se.fit`
195 | - derived quantities? (see bibliography)
196 |
197 | blah
198 | ====
199 | title:none
200 | type:section
201 |
202 |
203 |
204 |
205 |
206 | The lpmatrix, magic, etc
207 | ==============================
208 |
209 | For regular predictions:
210 |
211 | $$
212 | \hat{\boldsymbol{\eta}}_p = L_p \hat{\boldsymbol{\beta}}
213 | $$
214 |
215 | form $L_p$ using the prediction data, evaluating basis functions as we go.
216 |
217 | (Need to apply the *inverse* link function to $\hat{\boldsymbol{\eta}}_p$ to get response-scale predictions)
218 |
219 | But the $L_p$ fun doesn't stop there...
220 |
221 | [[mathematics intensifies]]
222 | ============================
223 | type:section
224 |
225 | Variance and lpmatrix
226 | ======================
227 |
228 | To get variance on the scale of the linear predictor:
229 |
230 | $$
231 | V_{\hat{\boldsymbol{\eta}}} = L_p V_{\hat{\boldsymbol{\beta}}} L_p^\text{T}
232 | $$
233 |
234 | pre-/post-multiplication shifts the variance matrix from parameter space to linear predictor space.
235 |
236 | (Can then pre-/post-multiply by derivatives of the link to put variance on the response scale)
237 |
238 | Simulating parameters
239 | ======================
240 |
241 | - $\boldsymbol{\beta}$ has a distribution, we can simulate
242 |
243 | ```{r paramsim, results="hide"}
244 | library(mvtnorm)
245 |
246 | # get the Lp matrix
247 | Lp <- predict(dolphins_depth, newdata=preddata, type="lpmatrix")
248 |
249 | # how many realisations do we want?
250 | frames <- 100
251 |
252 | # generate the betas from the GAM "posterior"
253 | betas <- rmvnorm(frames, coef(dolphins_depth), vcov(dolphins_depth))
254 |
255 |
256 | # use a function to get animation to play nice...
257 | anim_map <- function(frames){
258 | # loop to make plots
259 | for(frame in 1:frames){
260 |
261 | # make the prediction
262 | preddata$preds <- preddata$area * exp(Lp%*%betas[frame,])
263 |
264 | # plot it (using viridis)
265 | p <- ggplot() +
266 | grid_plot_obj(preddata$preds, "N", pred.polys) +
267 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) +
268 | coord_fixed(ratio=1, ylim = range(mexdolphins$y),
269 | xlim = range(mexdolphins$x)) +
270 | scale_fill_viridis(limits=c(0,400)) +
271 | labs(fill="Predicted\ndensity",x="x",y="y",size="Count")
272 |
273 | print(p)
274 | }
275 | }
276 |
277 | # make the animation!
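# (saveGIF is from the animation package loaded in the setup chunk; it
# assumes an external tool such as ImageMagick is installed to stitch frames)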
278 | saveGIF(anim_map(frames), "uncertainty.gif", interval = 0.15, ani.width = 800, ani.height = 400)
279 | ```
280 |
281 | ![Animation of uncertainty](uncertainty.gif)
282 |
283 | Uncertainty in smoothing parameter
284 | ==================================
285 |
286 | - Recent work by Simon Wood
287 | - "smoothing parameter uncertainty corrected" version of $V_\hat{\boldsymbol{\beta}}$
288 | - In a fitted model, we have:
289 | - `$Vp` what we got with `vcov`
290 | - `$Vc` the corrected version
291 |
292 | Variance summary
293 | ================
294 |
295 | - Everything comes from variance of parameters
296 | - Need to re-project/scale them to get the quantities we need
297 | - `mgcv` does most of the hard work for us
298 | - Fancy stuff possible with a little maths
299 | - Can include uncertainty in the smoothing parameter too
300 |
301 | Summary
302 | =======
303 |
304 | - `predict` is your friend
305 | - Most stuff comes down to matrix algebra that `mgcv` shields you from
306 | - To do fancy stuff, get inside the matrices
307 |
--------------------------------------------------------------------------------
/slides/06-fancy.Rpres:
--------------------------------------------------------------------------------
1 | Fancy stuff
2 | ============================
3 | author: David L Miller
4 | css: custom.css
5 |
6 | A word of warning
7 | =================
8 |
9 |
12 | ![jenny is the best](images/jenny_models.png)
13 |
14 |
15 | Away from the exponential family
16 | ===================================
17 | type:section
18 |
19 |
20 | ```{r setup, include=F}
21 | library(mgcv)
22 | library(ggplot2)
23 | ```
24 |
25 | ```{r pres_setup, include =F}
26 | library(knitr)
27 | opts_chunk$set(cache=TRUE, echo=FALSE,fig.align="center")
28 | ```
29 |
30 |
31 | Modelling "counts"
32 | ================
33 | type:section
34 |
35 | Counts and count-like things
36 | ============================
37 |
38 | - Response is a count (not always integer)
39 | - Often, it's mostly zero (that's complicated)
40 | - Could also be catch per unit effort, biomass etc
41 | - Flexible mean-variance relationship
42 |
43 | ![The Count from Sesame Street](images/count.jpg)
44 |
45 | Tweedie distribution
46 | =====================
47 |
48 | ```{r tweedie}
49 | library(tweedie)
50 | library(RColorBrewer)
51 |
52 | # tweedie
53 | y<-seq(0.01,5,by=0.01)
54 | pows <- seq(1.2, 1.9, by=0.1)
55 |
56 | fymat <- matrix(NA, length(y), length(pows))
57 |
58 | i <- 1
59 | for(pow in pows){
60 | fymat[,i] <- dtweedie( y=y, power=pow, mu=2, phi=1)
61 | i <- i+1
62 | }
63 |
64 | plot(range(y), range(fymat), type="n", ylab="Density", xlab="x", cex.lab=1.5,
65 | main="")
66 |
67 | rr <- brewer.pal(8,"Dark2")
68 |
69 | for(i in 1:ncol(fymat)){
70 | lines(y, fymat[,i], type="l", col=rr[i], lwd=2)
71 | }
72 | ```
73 | ***
74 | - $\text{Var}\left(\text{count}\right) = \phi\mathbb{E}(\text{count})^q$
75 | - Common distributions are sub-cases:
76 | - $q=0 \Rightarrow$ Normal
77 | - $q=1 \Rightarrow$ Poisson
78 | - $q=2 \Rightarrow$ Gamma
79 | - We are interested in $1 < q < 2$
80 | - (here $q = 1.2, 1.3, \ldots, 1.9$)
81 | - `tw()`
82 |
83 |
84 | Negative binomial
85 | ==================
86 |
87 | ```{r negbin}
88 | y<-seq(1,12,by=1)
89 | disps <- seq(0.001, 1, len=10)
90 |
91 | fymat <- matrix(NA, length(y), length(disps))
92 |
93 | i <- 1
94 | for(disp in disps){
95 | fymat[,i] <- dnbinom(y, size=disp, mu=5)
96 | i <- i+1
97 | }
98 |
99 | plot(range(y), range(fymat), type="n", ylab="Density", xlab="x", cex.lab=1.5,
100 | main="")
101 |
102 | rr <- brewer.pal(8,"Dark2")
103 |
104 | for(i in 1:ncol(fymat)){
105 | lines(y, fymat[,i], type="l", col=rr[i], lwd=2)
106 | }
107 | ```
108 | ***
109 | - $\text{Var}\left(\text{count}\right) =$ $\mathbb{E}(\text{count}) + \kappa \mathbb{E}(\text{count})^2$
110 | - Estimate $\kappa$
111 | - Is quadratic relationship a "strong" assumption?
112 | - Similar to Poisson: $\text{Var}\left(\text{count}\right) =\mathbb{E}(\text{count})$
113 | - `nb()`
114 |
115 |
116 | Zero-inflated distributions
117 | ===========================
118 |
119 | - Models the probability of zero separately from the mean count, given that you've observed more than zero
120 | at a location.
121 | - `ziP` and `ziplss` (for location-scale models)
122 | - zero inflation is assessed *conditional* on the model
123 | - is what you have zero inflation or just lots of zeros?
124 | - don't just jump straight to zero inflation 125 | 126 | 127 | 128 | Other distributions 129 | ====================== 130 | type:section 131 | 132 | The Beta distribution 133 | ====================== 134 | 135 | ```{r beta-dist} 136 | shape1 <- c(0.2, 1, 5, 1, 3, 1.5) 137 | shape2 <- c(0.2, 3, 1, 1, 1.5, 3) 138 | x <- seq(0.01, 0.99, length = 200) 139 | rr <- brewer.pal(length(shape1), "Dark2") 140 | fymat <- mapply(dbeta, shape1, shape2, MoreArgs = list(x = x)) 141 | matplot(x, fymat, type = "l", col = rr, lwd = 2, lty = "solid") 142 | legend("top", bty = "n", 143 | legend = expression(alpha == 0.2 ~~ beta == 0.2, 144 | alpha == 1.0 ~~ beta == 3.0, 145 | alpha == 5.0 ~~ beta == 1.0, 146 | alpha == 1.0 ~~ beta == 1.0, 147 | alpha == 3.0 ~~ beta == 1.5, 148 | alpha == 1.5 ~~ beta == 3.0), 149 | col = rr, cex = 1.25, lty = "solid", lwd = 2) 150 | ``` 151 | 152 | *** 153 | 154 | - Proportions; continuous, bounded at 0 & 1 155 | - Beta distribution is convenient choice 156 | - Two strictly positive shape parameters, $\alpha$ & $\beta$ 157 | - Has support on $x \in (0,1)$ 158 | - Density at $x = 0$ & $x = 1$ is $\infty$, fudge 159 | - `betar()` family in **mgcv** 160 | 161 | t-distribution 162 | ============================ 163 | - Models continuous data w/ longer tails than normal 164 | - Far less sensitive to outliers 165 | - Has one extra parameter: df. 166 | - bigger df: t dist approaches normal 167 | 168 | 169 | ```{r texample2, include =T,echo=F,results= "hide", fig.width=15, fig.height=8} 170 | set.seed(4) 171 | n=300 172 | dat = data.frame(x=seq(0,10,length=n)) 173 | dat$f = 20*exp(-dat$x)*dat$x 174 | dat$y = 1*rt(n,df = 3) + dat$f 175 | norm_mod = gam(y~s(x,k=20), data=dat, family=gaussian(link="identity")) 176 | t_mod = gam(y~s(x,k=20), data=dat, family=scat(link="identity")) 177 | predict_norm = predict(norm_mod,se.fit=T) 178 | predict_t = predict(t_mod,se.fit=T) 179 | fit_vals = data.frame(x = c(dat$x,dat$x), 180 | fit =c(predict_norm[[1]],predict_t[[1]]), 181 | se_min = c(predict_norm[[1]] - 2*predict_norm[[2]], 182 | predict_t[[1]] - 2*predict_t[[2]]), 183 | se_max = c(predict_norm[[1]] + 2*predict_norm[[2]], 184 | predict_t[[1]] + 2*predict_t[[2]]), 185 | model = rep(c("normal errors","t-errors"),each=n)) 186 | ggplot(aes(x=x,y=fit),data=fit_vals)+ 187 | facet_grid(.~model)+ 188 | geom_line(col="red")+ 189 | geom_ribbon(aes(ymin =se_min,ymax = se_max),alpha=0.5,fill="red")+ 190 | annotate(x = dat$x,y=dat$y,size=2,geom="point")+ 191 | annotate(x = dat$x,y=dat$f,size=2,geom="line")+ 192 | theme_bw(20) 193 | ``` 194 | 195 | 196 | 197 | Ordered categorical data 198 | =========================== 199 | ```{r ocat_ex2, include =T,echo=F,results= "hide", fig.width=6} 200 | set.seed(4) 201 | n= 100 202 | dat = data.frame(body_size = seq(-2,2, length=200)) 203 | dat$linear_predictor = 6*exp(dat$body_size*2)/(1+exp(dat$body_size*2))-2 204 | 205 | ggplot(aes(x=body_size, y=linear_predictor),data=dat) + 206 | geom_line()+ 207 | geom_ribbon(aes(ymin=linear_predictor-1,ymax=linear_predictor+1),alpha=0.25)+ 208 | annotate(x= dat$body_size, ymin=-3,ymax=-1, alpha=0.25,geom="ribbon")+ 209 | annotate(x= 0, y=-2, label = "least concern",geom="text",size=10)+ 210 | annotate(x= dat$body_size, ymin=-1,ymax=1.5, alpha=0.25,geom="ribbon",fill="red")+ 211 | annotate(x= 0, y=0.25, label = "vulnerable",geom="text",size=10)+ 212 | annotate(x= dat$body_size, ymin=1.5,ymax=2, alpha=0.25,geom="ribbon",fill="blue")+ 213 | annotate(x= 0, y=1.75, label = "endangered",geom="text",size=10)+ 214 | annotate(x= 
0, y=3.5, label = "extinct",geom="text",size=10)+
215 | scale_x_continuous("relative body size", expand=c(0,0))+
216 | scale_y_continuous("linear predictor",limits=c(-3,5), expand=c(0,0))+
217 | theme_bw(30)+
218 | theme(panel.grid = element_blank())
219 |
220 | ```
221 |
222 | ***
223 |
224 | - Data are categories, have order
225 | - e.g.: conservation status: "least concern", "vulnerable", "endangered", "extinct"
226 | - fits a linear latent model using covariates, w/ a threshold for each level
227 | - see `?ocat`
228 | - for unordered categories, see `?multinom`
229 |
230 |
231 |
232 | Other distributions (quickly)
233 | ===================================
234 |
235 | - Multivariate normal (`family = mvn(d = 2)`)
236 | - Multivariate response, each has a different smooth, allow correlation
237 | - Cox proportional hazards (`family = cox.ph()`)
238 | - Censored data: time until an event occurs, or the study was stopped
239 | - Gaussian location-scale models (`family = gaulss()`)
240 | - mean ("location") and variance ("scale") as smooths
241 |
242 | All of these distributions have quirks! Read the manual!
243 |
244 | `?family` and `?family.mgcv`
245 |
246 | The end of the distribution zoo
247 | ==============
248 | type:section
249 |
250 | Fancy smoothers
251 | ===============
252 | type:section
253 |
254 | Cyclic smooths
255 | ==============
256 |
257 | ![cyclic smooths](images/cyclic.png)
[image: tensor of cyclic smooths]
262 |
263 |
264 | ***
265 |
266 | - cyclic smooths (`bs="cc"`)
267 | - what if smooths need to "match up"?
268 | - ensure up to 2nd derivs match
269 | - need to be careful with end points
270 | - `?smooth.construct.cc.smooth.spec`
271 |
272 |
273 | "Simple" random effects
274 | ==============
275 |
276 | - Earlier: "penalties can be thought of as variance components"
277 | - We can think of random effects as splines too!
278 | - in `mgcv` we can set `bs="re"`
279 | - these are **simple**, non-nested random effects
280 |
281 | ```{r ext-re-get}
282 | load("../data/drake_griffen/drake_griffen.RData")
283 | ```
284 |
285 | ```{r ext-re-model, fig.width=12, fig.height=4}
286 | ext_re <- gam(Nhat ~ s(day, k=50) + s(ID, bs="re"), data=pop_unhappy)
287 | plot(ext_re, shade=TRUE, pages=1, scale=0)
288 | ```
289 |
290 |
291 | Complicated random effects
292 | ==========================
293 |
294 | - `gamm` --- uses spline-random effects equiv.
295 | - cast splines as random effects, fit using `nlme`
296 | - random effects are sparse, splines are dense
297 | - often convergence problems with complex models
298 | - `random=...` argument for nesting etc
299 | - model has `$gam` and `$lme` parts
300 |
301 |
302 |
303 | Correlation structures
304 | =====================
305 |
306 | - again, need to use `gamm`
307 | - `correlation=...` gives structure
308 | - `corAR1`, `corARMA`, `corCAR1` etc
309 | - tend to be hard to fit for SDMs
310 |
311 |
312 | Fancy 2D smoothing
313 | ==================
314 | type:section
315 |
316 | Funny-shaped regions
317 | ====================
318 |
319 |
320 | ![fs plot truth](images/soap-truth.png)
322 | ![fs plot trprs](images/soap-tprs.png)
324 | ![fs plot soap](images/soap-soap.png)
326 |
327 | ***
328 |
329 | - Soap film smoother (`bs="so"`)
330 | - Model takes boundary into account by construction
331 | - Need to specify a boundary and internal knots
332 | - see `?soap`
333 |
334 |
335 | Spatial models using areas
336 | ==========================
337 |
338 | ```{r mrf}
339 | data(columb) ## data frame
340 | data(columb.polys) ## district shapes list
341 | xt <- list(polys=columb.polys) ## neighbourhood structure info for MRF
342 | par(mfrow=c(2,2))
343 | polys.plot(columb.polys, columb$crime, main="raw crime data")
344 |
345 |
346 | ## An important covariate added...
347 | b <- gam(crime ~ s(district,bs="mrf",k=20,xt=xt)+s(income),
348 | data=columb,method="REML")
349 |
350 | ## plot fitted values by district
351 | fv <- fitted(b)
352 | names(fv) <- as.character(columb$district)
353 | polys.plot(columb.polys,fv, main="fitted values")
354 |
355 | # each "smooth"
356 | plot(b,scheme=c(0,1))
357 | ```
358 |
359 |
360 | ***
361 |
362 | - Markov random fields (`bs="mrf"`)
363 | - Need to specify polygons or adjacency matrix
364 | - Not necessarily that useful for marine work?
365 | - see `?mrf`
366 |
367 |
368 | Very general modelling
369 | ======================
370 |
371 | `mgcv` can fit *anything* you can write as (on the link scale):
372 |
373 | $$
374 | \mathbf{y} = \mathbf{X}\boldsymbol{\beta} \qquad \text{subject to penalties} \qquad \sum_j \lambda_j \boldsymbol{\beta}^\text{T} \mathbf{S}_j \boldsymbol{\beta}
375 | $$
376 |
377 | if you can write your penalty as a quadratic form in the parameters, it can be part of a model in `mgcv`
378 |
379 | `?paraPen`
380 |
381 |
382 | Models for large datasets
383 | =========================
384 |
385 | - `bam` for big additive models
386 | - can handle simple correlation structures
387 | - parallel (block QR decompositions)
388 | - fast! (still experimental)
389 | - Wood, Goude, Shaw (2015)
390 | - (a small sketch follows the summary below)
391 |
392 | Fancy summary
393 | =============
394 |
395 | - You can do *a lot* of things in `mgcv`
396 | - Start small, work up to complex models
397 | - Sometimes convergence is against you
398 | - There is *a lot* of information in the manual
399 |
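Picking up the `bam` slide above, a minimal sketch on simulated data (`gamSim()` ships with `mgcv`; everything here is made up for illustration):

```{r bam-sketch, echo=TRUE, eval=FALSE}
# simulate a biggish dataset and fit it with bam()
set.seed(1)
dat <- gamSim(1, n=20000, dist="normal")
big <- bam(y ~ s(x0) + s(x1) + s(x2) + s(x3), data=dat, method="fREML")
summary(big)
```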
402 | 403 | Okay, that's enough 404 | =================== 405 | type:section 406 | 407 |
409 |
410 | converged.yt/mgcv-workshop
411 |
412 |
413 |
414 |
--------------------------------------------------------------------------------
/slides/correct_mathjax_PDF_and_move.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # correct the paths to the mathjax files for HTML versions of the slides
4 |
5 | # changes the link to be to a folder called mathjax-23 in the current directory
6 | # this avoids having lots of copies of the same mathjax stuff
7 |
8 | # also use decktape to make a PDF of the slides (probably only works on
9 | # Dave's laptop)
10 |
11 | for i in $( ls *.html); do
12 | echo processing: $i
13 | FN=${i/.html/}_files\\/
14 | sed "s/$FN//" "$i" > tmp.html
15 | mv tmp.html $i
16 | PDFFN=${i/.html/}.pdf
17 | ~/sources/decktape/bin/phantomjs ~/sources/decktape/decktape.js automatic -s 1024x768 $i $PDFFN
18 | mv $i ~/current/webwebweb/mgcv-workshop/slides/
19 | mv $PDFFN ~/current/webwebweb/mgcv-workshop/slides/
20 | done
--------------------------------------------------------------------------------
/slides/custom.css:
--------------------------------------------------------------------------------
1 | .reveal section img{
2 | box-shadow: 0px 0px 0px rgba(0, 0, 0, 0);
3 | margin: 0px;
4 | }
5 |
6 | .reveal h1, .reveal h2, .reveal h3 {
7 | word-wrap: normal;
8 | -moz-hyphens: none;
9 | }
10 |
11 | .reveal pre{
12 | width: 100%;
13 | font-size: 0.51em;
14 | }
15 |
16 | .reveal pre code {
17 | font-size: 1.2em;
18 | }
19 |
20 | div.bigq {
21 | font-size: 200%;
22 | line-height: normal;
23 | }
24 | div.bigqw {
25 | font-size: 200%;
26 | color: #fff;
27 | line-height: normal;
28 | }
29 | div.bigqmono {
30 | font-size: 200%;
31 | color: #fff;
32 | line-height: normal;
33 | font-family: monospace;
34 | }
35 |
36 | div.medq {
37 | font-size: 150%;
38 | line-height: normal;
39 | }
40 |
41 | .section .reveal .state-background {
42 | background: #000 none repeat scroll 0% 0%;
43 | background-color: #000;
44 | }
45 |
46 | .reveal code.r {
47 | background-color: #F8F8F8;
48 | font-size: 1.2em;
49 | }
50 |
51 | .reveal pre {
52 | width: 100%;
53 | }
--------------------------------------------------------------------------------
/slides/images/Daphnia_magna.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/Daphnia_magna.png
--------------------------------------------------------------------------------
/slides/images/addbasis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/addbasis.png
--------------------------------------------------------------------------------
/slides/images/animal_choice.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/animal_choice.png
--------------------------------------------------------------------------------
/slides/images/animal_codes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/animal_codes.png
--------------------------------------------------------------------------------
/slides/images/animal_functions.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/animal_functions.png -------------------------------------------------------------------------------- /slides/images/by.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/by.png -------------------------------------------------------------------------------- /slides/images/concurvity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/concurvity.png -------------------------------------------------------------------------------- /slides/images/count.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/count.jpg -------------------------------------------------------------------------------- /slides/images/cyclic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/cyclic.png -------------------------------------------------------------------------------- /slides/images/jenny_models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/jenny_models.png -------------------------------------------------------------------------------- /slides/images/mathematical_sobbing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/mathematical_sobbing.jpg -------------------------------------------------------------------------------- /slides/images/mgcv-inside.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/mgcv-inside.png -------------------------------------------------------------------------------- /slides/images/remlgcv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/remlgcv.png -------------------------------------------------------------------------------- /slides/images/rummy.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/rummy.gif -------------------------------------------------------------------------------- /slides/images/slope-aspect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/slope-aspect.png -------------------------------------------------------------------------------- /slides/images/soap-soap.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/soap-soap.png -------------------------------------------------------------------------------- /slides/images/soap-tprs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/soap-tprs.png -------------------------------------------------------------------------------- /slides/images/soap-truth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/soap-truth.png -------------------------------------------------------------------------------- /slides/images/spermcovars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/spermcovars.png -------------------------------------------------------------------------------- /slides/images/spermwhalecovs.R: -------------------------------------------------------------------------------- 1 | # grab sperm whale covariates and make a figure 2 | 3 | # only works on my computer, sorry! 4 | load("~/current/spatial-workshops/spermwhale-analysis/predgrid.RData") 5 | 6 | library(ggplot2) 7 | library(gridExtra) 8 | library(viridis) 9 | library(ggalt) 10 | library(sp) 11 | 12 | pp <- list() 13 | 14 | map_dat2 <- map_data("world", c("usa","canada")) 15 | 16 | 17 | 18 | locs <- map_dat2[,c("long","lat")] 19 | coordinates(locs) <- c("long", "lat") # set spatial coordinates 20 | crs.geo <- CRS("+proj=longlat +ellps=WGS84 +datum=WGS84") 21 | proj4string(locs) <- crs.geo 22 | crs.aea <- CRS("+proj=aea +lat_1=38 +lat_2=30 +lat_0=34 +lon_0=-73 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs +ellps=WGS84 +towgs84=0,0,0") 23 | locs.aea <- spTransform(locs, crs.aea) 24 | 25 | map_dat2$y <- locs.aea$lat 26 | map_dat2$x <- locs.aea$long 27 | 28 | for(value in c("Depth", "SST", "NPP")){ 29 | p <- ggplot(predgrid) + 30 | geom_tile(aes_string(x="x", y="y", fill=value)) + 31 | geom_polygon(aes(x=x, y=y, group = group), 32 | fill = "#1A9850", data=map_dat2) + 33 | scale_fill_viridis() + 34 | coord_equal(xlim=c(-1e6,0.75e6), ylim = c(-0.75e6,0.75e6)) + 35 | theme_minimal() 36 | pp[[value]] <- p 37 | } 38 | 39 | png(file="spermcovars.png", width=800, height=500) 40 | grid.arrange(pp[["Depth"]], pp[["SST"]], pp[["NPP"]], ncol=2) 41 | dev.off() 42 | 43 | -------------------------------------------------------------------------------- /slides/images/spotteddolphin_swfsc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/spotteddolphin_swfsc.jpg -------------------------------------------------------------------------------- /slides/images/thanks_for_all_the_fish.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/thanks_for_all_the_fish.gif -------------------------------------------------------------------------------- /slides/images/tina-modelling.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/tina-modelling.png -------------------------------------------------------------------------------- /slides/wiggly.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/wiggly.gif -------------------------------------------------------------------------------- /vis.concurvity.R: -------------------------------------------------------------------------------- 1 | # visualise concurvity between terms in a GAM 2 | # David L Miller 2015, MIT license 3 | 4 | # arguments: 5 | # b -- a fitted gam 6 | # type -- concurvity measure to plot, see ?concurvity 7 | 8 | vis.concurvity <- function(b, type="estimate"){ 9 | cc <- concurvity(b, full=FALSE)[[type]] 10 | 11 | diag(cc) <- NA 12 | cc[lower.tri(cc)]<-NA 13 | 14 | layout(matrix(1:2, ncol=2), widths=c(5,1)) 15 | opar <- par(mar=c(5, 6, 5, 0) + 0.1) 16 | # main plot 17 | image(z=cc, x=1:ncol(cc), y=1:nrow(cc), ylab="", xlab="", 18 | axes=FALSE, asp=1, zlim=c(0,1)) 19 | axis(1, at=1:ncol(cc), labels = colnames(cc), las=2) 20 | axis(2, at=1:nrow(cc), labels = rownames(cc), las=2) 21 | # legend 22 | opar <- par(mar=c(5, 0, 4, 3) + 0.1) 23 | image(t(matrix(rep(seq(0, 1, len=100), 2), ncol=2)), 24 | x=1:3, y=1:101, zlim=c(0,1), axes=FALSE, xlab="", ylab="") 25 | axis(4, at=seq(1,101,len=5), labels = round(seq(0,1,len=5),1), las=2) 26 | par(opar) 27 | } 28 | --------------------------------------------------------------------------------