├── .gitignore ├── LICENSE ├── README.md ├── code_snippets ├── gra2mgcv.R ├── quantile_resid.R └── vis.concurvity.R ├── course_outline.md ├── data ├── bbs_data │ ├── bbs_florida.csv │ ├── bbs_florida_richness.csv │ └── routes.csv ├── bbs_florida.csv ├── bbs_florida_richness.csv ├── beta-regression │ └── ZooData.txt ├── forest-health │ ├── beech.raw │ ├── foresthealth.bnd │ ├── foresthealth.dat │ └── foresthealth.gra ├── mexdolphins │ ├── README.md │ ├── mexdolphins.RData │ ├── soap_pred.R │ └── soapy.RData ├── routes.csv ├── time_series │ └── Florida_climate.csv └── yukon_seeds │ ├── seed_data.csv │ └── seed_source_locations.csv ├── example-bivariate-timeseries-and-ti.Rmd ├── example-bivariate-timeseries-and-ti.html ├── example-forest-health.Rmd ├── example-forest-health.html ├── example-linear-functionals.Rmd ├── example-nonlinear-timeseries.Rmd ├── example-nonlinear-timeseries.html ├── example-spatial-mexdolphins-solutions.Rmd ├── example-spatial-mexdolphins-solutions.html ├── example-spatial-mexdolphins.Rmd ├── example-spatial-mexdolphins.html ├── example-spatio-temporal data.Rmd ├── example-spatio-temporal_data.html ├── mgcv-esa-workshop.Rproj ├── project-requirements.md ├── project_description.md └── slides ├── 00-preamble.Rpres ├── 01-intro.Rpres ├── 02-model_checking.Rpres ├── 03-model-selection.Rpres ├── 04-Beyond_the_exponential_family.Rpres ├── correct_mathjax.sh ├── custom.css ├── images ├── addbasis.png ├── animal_choice.png ├── animal_codes.png ├── animal_functions.png ├── count.jpg ├── mathematical_sobbing.jpg ├── remlgcv.png ├── spotteddolphin_swfsc.jpg └── tina-modelling.png ├── quantile_resid.R ├── uncertainty.gif └── wiggly.gif /.gitignore: -------------------------------------------------------------------------------- 1 | *.Rhistory 2 | .Rproj.user/ 3 | .Rproj.user 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Code in this respository is licensed under the GPL version >= 2. 2 | 3 | All other material is Creative Commons BY licensed (except where it does not belong to us, we do not claim copyright over anyone else's material). 
4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mgcv-esa-workshop 2 | -------------------------------------------------------------------------------- /code_snippets/gra2mgcv.R: -------------------------------------------------------------------------------- 1 | ## Read in a BayesX graph file and output list required by mgcv's mrf smooth 2 | gra2mgcv <- function(file) { 3 | ## first row contains number of objects in file 4 | ## graph info is stored in triplets or rows after first row 5 | ## - 1st row of triplet is ID 6 | ## - 2nd row of triplet is number of neighbours 7 | ## - 3rd row of triplet is vector of neighbour 8 | gra <- readLines(file) 9 | N <- gra[1] 10 | gra <- gra[-1] # pop the first row which contains N 11 | ids <- seq.int(1, by = 3, length.out = N) 12 | nbs <- seq.int(3, by = 3, length.out = N) 13 | node2id <- function(nodes, ids) { 14 | as.numeric(ids[as.numeric(nodes) + 1]) 15 | } 16 | l <- lapply(strsplit(gra[nbs], " "), node2id, ids = gra[ids]) 17 | names(l) <- gra[ids] 18 | l 19 | } 20 | -------------------------------------------------------------------------------- /code_snippets/quantile_resid.R: -------------------------------------------------------------------------------- 1 | #This code is modified for D.L. Miller's dsm package for distance sampling, from 2 | #the rqgam.check function. The code is designed to extract randomized quantile 3 | #residuals from GAMs, using the family definitions in mgcv. Note statmod only 4 | #supports RQ residuals for the following families: Tweedie, Poisson, Gaussian, Any errors are due to Eric Pedersen 5 | library(statmod) #This has functions for randomized quantile residuals 6 | rqresiduals = function (gam.obj) { 7 | if(!"gam" %in% attr(gam.obj,"class")){ 8 | stop('"gam.obj has to be of class "gam"') 9 | } 10 | if (!grepl("^Tweedie|^Negative Binomial|^poisson|^binomial|^gaussian|^Gamma|^inverse.gaussian", 11 | gam.obj$family$family)){ 12 | stop(paste("family " , gam.obj$family$family, 13 | " is not currently supported by the statmod library, 14 | and any randomized quantile residuals would be inaccurate.", 15 | sep="")) 16 | } 17 | if (grepl("^Tweedie", gam.obj$family$family)) { 18 | if (is.null(environment(gam.obj$family$variance)$p)) { 19 | p.val <- gam.obj$family$getTheta(TRUE) 20 | environment(gam.obj$family$variance)$p <- p.val 21 | } 22 | qres <- qres.tweedie(gam.obj) 23 | } 24 | else if (grepl("^Negative Binomial", gam.obj$family$family)) { 25 | if ("extended.family" %in% class(gam.obj$family)) { 26 | gam.obj$theta <- gam.obj$family$getTheta(TRUE) 27 | } 28 | else { 29 | gam.obj$theta <- gam.obj$family$getTheta() 30 | } 31 | qres <- qres.nbinom(gam.obj) 32 | } 33 | else { 34 | qres <- qresid(gam.obj) 35 | } 36 | return(qres) 37 | } -------------------------------------------------------------------------------- /code_snippets/vis.concurvity.R: -------------------------------------------------------------------------------- 1 | # visualise concurvity between terms in a GAM 2 | # David L Miller 2015, MIT license 3 | 4 | # arguments: 5 | # b -- a fitted gam 6 | # type -- concurvity measure to plot, see ?concurvity 7 | 8 | vis.concurvity <- function(b, type="estimate"){ 9 | cc <- concurvity(b, full=FALSE)[[type]] 10 | 11 | diag(cc) <- NA 12 | cc[lower.tri(cc)]<-NA 13 | 14 | layout(matrix(1:2, ncol=2), widths=c(5,1)) 15 | opar <- par(mar=c(5, 6, 5, 0) + 0.1) 16 | # main plot 17 | image(z=cc, 
x=1:ncol(cc), y=1:nrow(cc), ylab="", xlab="", 18 | axes=FALSE, asp=1, zlim=c(0,1)) 19 | axis(1, at=1:ncol(cc), labels = colnames(cc), las=2) 20 | axis(2, at=1:nrow(cc), labels = rownames(cc), las=2) 21 | # legend 22 | opar <- par(mar=c(5, 0, 4, 3) + 0.1) 23 | image(t(matrix(rep(seq(0, 1, len=100), 2), ncol=2)), 24 | x=1:3, y=1:101, zlim=c(0,1), axes=FALSE, xlab="", ylab="") 25 | axis(4, at=seq(1,101,len=5), labels = round(seq(0,1,len=5),1), las=2) 26 | par(opar) 27 | } 28 | -------------------------------------------------------------------------------- /course_outline.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Course outline 3 | 4 | --- 5 | 6 | - *Start time:* 8am 7 | - *End time:* 5pm 8 | 9 | 2x 4 hour blocks: 10 | 11 | - Morning: 8am-12pm 12 | - Aternoon: 1pm-5pm 13 | 14 | 15 | # Morning 16 | 17 | ## Intro (Dave M) 18 | 19 | - GLM refresher 20 | - model the *mean* 21 | - here are some distributions (plot + mean-variance) 22 | - GLMs -> GAMs 23 | - Spline 24 | - why are they good? 25 | - how do they work? 26 | - measuring wigglyness 27 | - What is uncertainty? 28 | * "Hook" for "model expansion" 29 | 30 | 31 | ### Practical 32 | 33 | - `gam()` 34 | - `s()` 35 | - `gam.check()` 36 | - `summary()` 37 | - `plot()` 38 | * `predict()` 39 | 40 | 41 | ## Model checking (Eric) 42 | 43 | *(1/2 slot?)* 44 | 45 | - `gam.check()` 46 | - `concurvity()` 47 | - (randomised quantile residuals) 48 | * autocorrelation (check?) 49 | 50 | ### Practical 51 | 52 | - `gam.check()` 53 | 54 | ## Model selection (Gavin) 55 | 56 | *(1/2 slot?)* 57 | 58 | - shrinkage via shrinkage smooths 59 | - shrinkage via select = TRUE 60 | - AIC corrected for estimated smoothness params 61 | - approximate *p* values (mention at least) 62 | 63 | ### Practical 64 | 65 | - `summary()` 66 | 67 | ## Beyond the exponential family (all) 68 | 69 | *(1/2 slot?)* 70 | 71 | - Count data: Tweedie, negative binomial & ZIP (Dave) 72 | - Beta regression (Gavin) 73 | - Other stuff: t, `cox.ph`, ordered categorical, mvnorm (Eric) 74 | 75 | # Afternoon 76 | 77 | ## Extended examples/demos (all) 78 | 79 | Introduce a set of examples, let people pick what they want. Workbook-type situation. 80 | 81 | - Spatial modelling 82 | - Dolphins (Dave) (Example) 83 | - Spatio-temporal models 84 | - Trawl data from ERDDAP (Eric) (Example) 85 | - Lake cores (Gavin) (Demo) 86 | - Time series modelling 87 | - `lynx` (Eric) (Example) 88 | - Ft Lauderdale (Eric) (Example) 89 | - Ft Lauderdale `ti()` (Gavin) (Example) 90 | - Markov random fields 91 | - forest health data (Gavin) (Example) 92 | - `"lpmatrix"` 93 | - dolphins revisit (Dave) (Example) 94 | - random effects 95 | - cheetahs (Gavin) 96 | - ship effect trawl (Eric) 97 | - Location-scale models? 98 | - Quick example (Gavin) (Demo) 99 | - Linear functionals 100 | - dispersal kernels black spruce (Eric) (Example) 101 | 102 | 103 | ## Smoother zoo checklist 104 | 105 | - [ ] 2-D smoother `s()` (Dave intro example) 106 | - [ ] `te()` 107 | - [ ] soap film (Dave spatial dolphins) 108 | - [ ] `"ts"` (Gavin model selection) 109 | - [ ] Cyclic splines 110 | - [ ] `fs` basis/`by=` 111 | - [ ] random effects 112 | - [ ] Gaussian process smoothers? 113 | - [ ] multiple formulae specification? 
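
For quick reference, here is a rough sketch of how each smoother in the checklist above is specified in mgcv. The data frame `dat`, the objects `bnd` and `soap_knots`, and all variable names are hypothetical placeholders, so the calls are shown commented out rather than as runnable course code.

```r
## Rough mgcv syntax for the smoother zoo -- a sketch only.
## `dat`, `bnd`, `soap_knots`, and the variable names are hypothetical placeholders.
library(mgcv)
# gam(z ~ s(x, y), data = dat)                                # 2-D thin plate smooth
# gam(z ~ te(x, t), data = dat)                               # tensor product smooth
# gam(z ~ s(x, y, bs = "so", xt = list(bnd = bnd)),           # soap film smoother
#     knots = soap_knots, data = dat)
# gam(z ~ s(x, bs = "ts"), data = dat)                        # shrinkage ("ts") smooth
# gam(z ~ s(month, bs = "cc"), data = dat)                    # cyclic spline
# gam(z ~ s(x, fac, bs = "fs"), data = dat)                   # factor-smooth ("fs") basis
# gam(z ~ s(x, by = fac) + fac, data = dat)                   # by= factor interaction
# gam(z ~ s(fac, bs = "re"), data = dat)                      # random effect smooth
# gam(z ~ s(x, bs = "gp"), data = dat)                        # Gaussian process smooth
# gam(list(z ~ s(x), ~ s(x)), data = dat, family = gaulss())  # multiple formulae (location-scale)
```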
114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /data/bbs_data/routes.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/data/bbs_data/routes.csv -------------------------------------------------------------------------------- /data/beta-regression/ZooData.txt: -------------------------------------------------------------------------------- 1 | Number Scans Proportion Size Visual Raised Visitors Feeding Oc Other Enrichment Group Sex Enclosure Vehicle Diet Age Zoo Eps 41 300 0.136666667 650 2 2 6418 1 1 2 2 2 1 0 59 3 2 1 1 47 150 0.313333333 2405 2 1 13607 1 2 2 2 1 1 0 263 1 2 2 2 28 300 0.093333333 1781 4 2 13607 1 2 2 2 2 1 0 263 1 3 2 3 26 150 0.173333333 390 5 1 0 1 2 2 1 1 2 0 0 1 3 2 4 13 148 0.087837838 390 2 1 0 1 2 2 1 1 2 0 0 1 3 2 5 25 152 0.164473684 390 3 1 0 1 2 2 1 1 2 0 0 1 2 2 6 15 301 0.049833887 1200 4 2 11713 1 2 2 2 2 2 20 215 3 8 3 7 78 299 0.260869565 1200 4 2 11713 1 2 2 2 2 2 12 215 3 2 3 8 77 300 0.256666667 1200 4 2 11713 1 2 2 2 2 1 16 215 3 2 3 9 64 300 0.213333333 70 1 1 11713 1 2 2 2 2 1 20 215 3 8 3 10 80 300 0.266666667 40 1 1 11713 1 2 2 2 2 2 12 215 3 2 3 11 28 240 0.116666667 1200 4 2 4225 1 2 2 2 2 1 16 28 3 8 3 12 2 150 0.013333333 650 5 2 96513 1 1 1 2 1 2 20 3 3 7 4 13 12 150 0.08 650 5 2 96513 1 1 1 2 1 2 20 3 3 1 4 14 2 150 0.013333333 650 5 2 7294 1 1 1 1 1 2 0 7 3 7 4 15 16 150 0.106666667 100 1 1 0 1 1 1 1 1 2 20 6 3 7 4 16 12 150 0.08 100 1 1 0 1 1 1 1 1 2 20 6 3 1 4 17 6 356 0.016853933 2030 8 2 76000 1 2 2 1 2 1 6 0 1 2 5 18 45 120 0.375 715 6 2 76000 1 2 2 1 1 2 6 0 1 2 5 19 27 270 0.1 715 6 2 14195 2 2 2 1 2 1 0 0 1 2 5 20 28 90 0.311111111 2030 8 2 14195 2 2 2 1 1 2 0 0 1 2 5 21 14 100 0.14 420 2 1 8971 1 1 2 1 1 1 0 112 3 1 6 22 9 100 0.09 434 2 2 39120 1 1 2 1 1 1 0 116 3 1 6 23 10 100 0.1 434 2 2 27589 1 1 2 1 1 1 0 157 3 1 6 24 13 497 0.026156942 434 2 2 8971 1 1 2 1 2 2 0 112 3 3 6 25 10 495 0.02020202 1700 4 1 39120 1 1 2 1 2 2 1 116 3 3 6 26 9 499 0.018036072 1700 4 1 27589 1 1 2 1 2 2 0 157 3 3 6 27 20 199 0.100502513 3800 8 2 8971 1 1 2 1 2 2 0 112 3 5 6 28 26 200 0.13 3000 8 2 39120 1 1 2 2 2 2 0 63 3 5 6 29 21 200 0.105 3000 8 2 27589 1 1 2 2 2 2 0 84 3 5 6 30 0 296 0 3000 8 2 8971 1 1 2 2 2 1 0 51 3 10 6 31 6 300 0.02 504 4 2 27589 1 1 2 1 2 1 1 157 3 10 6 32 63 299 0.210702341 1700 4 1 8971 1 1 2 1 2 1 1 112 3 10 6 33 21 100 0.21 504 4 2 8971 1 1 2 1 1 2 0 112 3 8 6 34 18 100 0.18 504 4 2 39120 1 1 2 1 1 2 0 116 3 3 6 35 13 192 0.067708333 434 2 1 22000 2 1 2 2 1 2 0 128 3 5 6 36 34 150 0.226666667 1000 2 1 627 1 2 1 2 1 1 0 83 2 7 7 37 21 150 0.14 600 2 1 627 1 2 1 2 1 2 0 83 2 4 7 38 35 150 0.233333333 578 4 1 627 1 2 1 2 1 2 0 83 2 3 7 39 16 150 0.106666667 630 4 1 627 1 2 1 2 1 2 0 83 2 8 7 40 18 150 0.12 587 2 1 627 1 2 1 2 1 2 0 83 2 7 7 41 22 150 0.146666667 203 1 1 627 1 2 1 2 1 1 0 83 2 2 7 42 27 150 0.18 1100 3 1 627 1 2 1 2 1 1 0 54 1 8 7 43 16 300 0.053333333 1400 8 1 627 1 2 1 2 2 1 1 27 1 8 7 44 4 302 0.013245033 600 5 1 0 2 2 1 1 2 1 0 22 1 11 7 45 16 147 0.108843537 1600 9 1 0 2 2 1 1 1 1 0 26 1 9 7 46 4 150 0.026666667 400 4 1 0 2 2 1 1 1 2 0 26 1 8 7 47 35 300 0.116666667 400 4 1 0 2 2 1 1 2 1 0 26 1 5 7 48 11 150 0.073333333 500 9 1 0 2 2 1 1 1 2 0 26 1 11 7 49 8 149 0.053691275 500 9 1 0 2 2 1 1 1 2 0 26 1 12 7 50 18 150 0.12 500 9 1 0 2 2 1 1 1 2 0 26 1 11 7 51 19 299 0.063545151 500 9 1 0 2 2 1 1 2 1 0 26 1 
10 7 52 3 150 0.02 500 9 1 0 2 2 1 1 1 1 0 26 1 10 7 53 15 495 0.03030303 1800 8 1 0 2 2 2 1 2 2 0 26 1 1 7 54 36 150 0.24 431 2 1 0 2 2 1 1 1 1 2 32 1 4 7 55 10 150 0.066666667 336 2 1 0 2 2 1 1 1 2 0 32 1 9 7 56 3 149 0.020134228 67.5 1 1 0 2 2 1 1 1 1 0 32 1 7 7 57 1 150 0.006666667 195 2 1 0 2 2 1 1 1 1 2 32 1 8 7 58 10 150 0.066666667 270 3 1 0 2 2 1 1 1 1 0 32 1 12 7 59 12 298 0.040268456 1900 9 1 0 2 2 1 1 2 1 1 19 1 9 7 60 14 150 0.093333333 360 4 1 0 2 2 1 1 1 1 0 11 1 7 7 61 33 150 0.22 360 3 1 0 2 2 1 1 1 2 0 11 1 3 7 62 9 149 0.060402685 266 5 1 0 2 2 1 1 1 2 0 11 1 9 7 63 19 150 0.126666667 360 7 1 0 2 2 1 1 1 2 0 11 1 6 7 64 39 600 0.065 15000 4 1 718 1 2 1 2 2 1 8 51 3 3 8 65 17 300 0.056666667 25000 4 2 718 1 2 1 2 2 2 10 49 3 5 8 66 9 150 0.06 10000 3 1 718 1 2 1 1 1 1 0 27 3 15 8 67 11 300 0.036666667 25000 4 2 718 1 2 1 2 2 2 8 40 3 1 8 68 10 591 0.016920474 10000 9 1 0 2 2 2 2 2 2 0 12 3 9 8 69 2 749 0.002670227 40000 9 2 0 2 1 2 1 2 1 0 16 3 5 8 70 20 448 0.044642857 10000 4 2 0 1 2 1 1 2 1 3 18 3 1 8 71 0 150 0 25000 10 2 0 2 2 2 1 1 1 0 22 3 10 8 72 1 300 0.003333333 25000 10 2 0 2 2 2 1 2 1 0 22 3 10 8 73 13 150 0.086666667 80000 9 2 718 2 2 2 2 1 2 0 47 3 10 8 74 10 150 0.066666667 80000 9 2 718 2 2 2 2 1 2 0 60 3 7 8 75 0 300 0 80000 9 2 0 2 2 2 2 2 2 0 44 3 2 8 76 19 150 0.126666667 3000 5 2 33172 1 2 2 1 1 2 16 11 2 4 9 77 12 150 0.08 62 1 1 0 1 1 2 1 1 2 16 21 2 4 9 78 10 452 0.022123894 3000 5 2 33172 1 2 2 1 2 2 16 1 2 1 9 79 10 450 0.022222222 62 1 1 0 1 1 2 1 2 2 16 17 2 1 9 80 15 149 0.100671141 69 1 1 0 1 2 1 1 1 1 16 10 2 2 9 81 37 150 0.246666667 69 1 1 0 1 2 1 1 1 2 14 17 2 5 9 82 44 300 0.146666667 625 1 2 0 1 2 2 1 2 1 0 34 2 8 9 83 21 150 0.14 625 1 1 0 1 2 2 1 1 2 6 33 2 5 9 84 14 150 0.093333333 10000 2 1 0 1 2 2 1 1 1 2 29 2 9 9 85 55 149 0.369127517 600 2 2 0 1 2 2 1 1 1 2 34 2 9 9 86 28 150 0.186666667 450 1 1 0 1 2 1 1 1 2 2 35 2 2 9 87 17 150 0.113333333 18.6 1 1 0 1 2 1 1 1 2 0 35 2 8 9 88 -------------------------------------------------------------------------------- /data/forest-health/foresthealth.bnd: -------------------------------------------------------------------------------- 1 | "82",5 2 | 15.9882,1.8882 3 | 15.9882,2.1118 4 | 16.2118,2.1118 5 | 16.2118,1.8882 6 | 15.9882,1.8882 7 | "83",5 8 | 15.9882,2.8882 9 | 15.9882,3.1118 10 | 16.2118,3.1118 11 | 16.2118,2.8882 12 | 15.9882,2.8882 13 | "79",5 14 | 14.9882,1.8882 15 | 14.9882,2.1118 16 | 15.2118,2.1118 17 | 15.2118,1.8882 18 | 14.9882,1.8882 19 | "81",5 20 | 14.9882,3.8882 21 | 14.9882,4.1118 22 | 15.2118,4.1118 23 | 15.2118,3.8882 24 | 14.9882,3.8882 25 | "78",5 26 | 13.9882,4.8882 27 | 13.9882,5.1118 28 | 14.2118,5.1118 29 | 14.2118,4.8882 30 | 13.9882,4.8882 31 | "62",5 32 | 10.1882,8.8882 33 | 10.1882,9.1118 34 | 10.4118,9.1118 35 | 10.4118,8.8882 36 | 10.1882,8.8882 37 | "55",5 38 | 9.6882,8.3882 39 | 9.6882,8.6118 40 | 9.9118,8.6118 41 | 9.9118,8.3882 42 | 9.6882,8.3882 43 | "80",5 44 | 14.9882,2.8882 45 | 14.9882,3.1118 46 | 15.2118,3.1118 47 | 15.2118,2.8882 48 | 14.9882,2.8882 49 | "77",5 50 | 13.9882,3.8882 51 | 13.9882,4.1118 52 | 14.2118,4.1118 53 | 14.2118,3.8882 54 | 13.9882,3.8882 55 | "75",5 56 | 12.9882,4.9882 57 | 12.9882,5.2118 58 | 13.2118,5.2118 59 | 13.2118,4.9882 60 | 12.9882,4.9882 61 | "72",5 62 | 12.0882,5.9882 63 | 12.0882,6.2118 64 | 12.3118,6.2118 65 | 12.3118,5.9882 66 | 12.0882,5.9882 67 | "61",5 68 | 10.0882,7.8882 69 | 10.0882,8.1118 70 | 10.3118,8.1118 71 | 10.3118,7.8882 72 | 10.0882,7.8882 73 | "76",5 74 | 13.9882,2.8882 75 | 13.9882,3.1118 
76 | 14.2118,3.1118 77 | 14.2118,2.8882 78 | 13.9882,2.8882 79 | "74",5 80 | 12.9882,3.8882 81 | 12.9882,4.1118 82 | 13.2118,4.1118 83 | 13.2118,3.8882 84 | 12.9882,3.8882 85 | "71",5 86 | 12.0882,4.9882 87 | 12.0882,5.2118 88 | 12.3118,5.2118 89 | 12.3118,4.9882 90 | 12.0882,4.9882 91 | "60",5 92 | 10.0882,6.9882 93 | 10.0882,7.2118 94 | 10.3118,7.2118 95 | 10.3118,6.9882 96 | 10.0882,6.9882 97 | "73",5 98 | 12.9882,2.8882 99 | 12.9882,3.1118 100 | 13.2118,3.1118 101 | 13.2118,2.8882 102 | 12.9882,2.8882 103 | "70",5 104 | 11.9882,3.8882 105 | 11.9882,4.1118 106 | 12.2118,4.1118 107 | 12.2118,3.8882 108 | 11.9882,3.8882 109 | "67",5 110 | 11.0882,4.8882 111 | 11.0882,5.1118 112 | 11.3118,5.1118 113 | 11.3118,4.8882 114 | 11.0882,4.8882 115 | "59",5 116 | 10.0882,5.8882 117 | 10.0882,6.1118 118 | 10.3118,6.1118 119 | 10.3118,5.8882 120 | 10.0882,5.8882 121 | "54",5 122 | 9.0882,6.9882 123 | 9.0882,7.2118 124 | 9.3118,7.2118 125 | 9.3118,6.9882 126 | 9.0882,6.9882 127 | "69",5 128 | 11.9882,2.8882 129 | 11.9882,3.1118 130 | 12.2118,3.1118 131 | 12.2118,2.8882 132 | 11.9882,2.8882 133 | "66",5 134 | 10.9882,3.8882 135 | 10.9882,4.1118 136 | 11.2118,4.1118 137 | 11.2118,3.8882 138 | 10.9882,3.8882 139 | "68",5 140 | 11.9882,1.9882 141 | 11.9882,2.2118 142 | 12.2118,2.2118 143 | 12.2118,1.9882 144 | 11.9882,1.9882 145 | "63",5 146 | 10.9882,0.988197 147 | 10.9882,1.2118 148 | 11.2118,1.2118 149 | 11.2118,0.988197 150 | 10.9882,0.988197 151 | "53",5 152 | 9.0882,5.8882 153 | 9.0882,6.1118 154 | 9.3118,6.1118 155 | 9.3118,5.8882 156 | 9.0882,5.8882 157 | "48",5 158 | 8.0882,6.9882 159 | 8.0882,7.2118 160 | 8.3118,7.2118 161 | 8.3118,6.9882 162 | 8.0882,6.9882 163 | "65",5 164 | 10.9882,2.8882 165 | 10.9882,3.1118 166 | 11.2118,3.1118 167 | 11.2118,2.8882 168 | 10.9882,2.8882 169 | "64",5 170 | 10.9882,1.8882 171 | 10.9882,2.1118 172 | 11.2118,2.1118 173 | 11.2118,1.8882 174 | 10.9882,1.8882 175 | "56",5 176 | 9.9882,0.988197 177 | 9.9882,1.2118 178 | 10.2118,1.2118 179 | 10.2118,0.988197 180 | 9.9882,0.988197 181 | "47",5 182 | 8.0882,5.9882 183 | 8.0882,6.2118 184 | 8.3118,6.2118 185 | 8.3118,5.9882 186 | 8.0882,5.9882 187 | "58",5 188 | 9.9882,2.9882 189 | 9.9882,3.2118 190 | 10.2118,3.2118 191 | 10.2118,2.9882 192 | 9.9882,2.9882 193 | "57",5 194 | 9.9882,1.9882 195 | 9.9882,2.2118 196 | 10.2118,2.2118 197 | 10.2118,1.9882 198 | 9.9882,1.9882 199 | "50",5 200 | 8.9882,0.988197 201 | 8.9882,1.2118 202 | 9.2118,1.2118 203 | 9.2118,0.988197 204 | 8.9882,0.988197 205 | "49",5 206 | 8.9882,0.388197 207 | 8.9882,0.611803 208 | 9.2118,0.611803 209 | 9.2118,0.388197 210 | 8.9882,0.388197 211 | "46",5 212 | 8.0882,4.8882 213 | 8.0882,5.1118 214 | 8.3118,5.1118 215 | 8.3118,4.8882 216 | 8.0882,4.8882 217 | "52",5 218 | 8.9882,2.9882 219 | 8.9882,3.2118 220 | 9.2118,3.2118 221 | 9.2118,2.9882 222 | 8.9882,2.9882 223 | "51",5 224 | 8.9882,1.9882 225 | 8.9882,2.2118 226 | 9.2118,2.2118 227 | 9.2118,1.9882 228 | 8.9882,1.9882 229 | "42",5 230 | 7.9882,0.988197 231 | 7.9882,1.2118 232 | 8.2118,1.2118 233 | 8.2118,0.988197 234 | 7.9882,0.988197 235 | "45",5 236 | 7.9882,3.8882 237 | 7.9882,4.1118 238 | 8.2118,4.1118 239 | 8.2118,3.8882 240 | 7.9882,3.8882 241 | "41",5 242 | 6.9882,4.9882 243 | 6.9882,5.2118 244 | 7.2118,5.2118 245 | 7.2118,4.9882 246 | 6.9882,4.9882 247 | "44",5 248 | 7.9882,2.9882 249 | 7.9882,3.2118 250 | 8.2118,3.2118 251 | 8.2118,2.9882 252 | 7.9882,2.9882 253 | "43",5 254 | 7.9882,1.9882 255 | 7.9882,2.2118 256 | 8.2118,2.2118 257 | 8.2118,1.9882 258 | 7.9882,1.9882 259 | "40",5 260 | 
6.9882,3.9882 261 | 6.9882,4.2118 262 | 7.2118,4.2118 263 | 7.2118,3.9882 264 | 6.9882,3.9882 265 | "39",5 266 | 6.9882,2.9882 267 | 6.9882,3.2118 268 | 7.2118,3.2118 269 | 7.2118,2.9882 270 | 6.9882,2.9882 271 | "38",5 272 | 6.9882,1.9882 273 | 6.9882,2.2118 274 | 7.2118,2.2118 275 | 7.2118,1.9882 276 | 6.9882,1.9882 277 | "37",5 278 | 6.0882,3.2882 279 | 6.0882,3.5118 280 | 6.3118,3.5118 281 | 6.3118,3.2882 282 | 6.0882,3.2882 283 | "36",5 284 | 6.0882,2.2882 285 | 6.0882,2.5118 286 | 6.3118,2.5118 287 | 6.3118,2.2882 288 | 6.0882,2.2882 289 | "33",5 290 | 5.0882,3.2882 291 | 5.0882,3.5118 292 | 5.3118,3.5118 293 | 5.3118,3.2882 294 | 5.0882,3.2882 295 | "35",5 296 | 5.2882,1.7882 297 | 5.2882,2.0118 298 | 5.5118,2.0118 299 | 5.5118,1.7882 300 | 5.2882,1.7882 301 | "31",5 302 | 5.0882,1.2882 303 | 5.0882,1.5118 304 | 5.3118,1.5118 305 | 5.3118,1.2882 306 | 5.0882,1.2882 307 | "25",5 308 | 4.0882,0.288197 309 | 4.0882,0.511803 310 | 4.3118,0.511803 311 | 4.3118,0.288197 312 | 4.0882,0.288197 313 | "34",5 314 | 5.1882,1.9882 315 | 5.1882,2.2118 316 | 5.4118,2.2118 317 | 5.4118,1.9882 318 | 5.1882,1.9882 319 | "32",5 320 | 5.0882,2.2882 321 | 5.0882,2.5118 322 | 5.3118,2.5118 323 | 5.3118,2.2882 324 | 5.0882,2.2882 325 | "21",5 326 | 3.5882,0.788197 327 | 3.5882,1.0118 328 | 3.8118,1.0118 329 | 3.8118,0.788197 330 | 3.5882,0.788197 331 | "18",5 332 | 3.1882,0.688197 333 | 3.1882,0.911803 334 | 3.4118,0.911803 335 | 3.4118,0.688197 336 | 3.1882,0.688197 337 | "15",5 338 | 3.0882,0.288197 339 | 3.0882,0.511803 340 | 3.3118,0.511803 341 | 3.3118,0.288197 342 | 3.0882,0.288197 343 | "28",5 344 | 4.4882,2.8882 345 | 4.4882,3.1118 346 | 4.7118,3.1118 347 | 4.7118,2.8882 348 | 4.4882,2.8882 349 | "30",5 350 | 4.5882,2.3882 351 | 4.5882,2.6118 352 | 4.8118,2.6118 353 | 4.8118,2.3882 354 | 4.5882,2.3882 355 | "29",5 356 | 4.5882,1.7882 357 | 4.5882,2.0118 358 | 4.8118,2.0118 359 | 4.8118,1.7882 360 | 4.5882,1.7882 361 | "26",5 362 | 4.0882,1.2882 363 | 4.0882,1.5118 364 | 4.3118,1.5118 365 | 4.3118,1.2882 366 | 4.0882,1.2882 367 | "27",5 368 | 4.0882,2.2882 369 | 4.0882,2.5118 370 | 4.3118,2.5118 371 | 4.3118,2.2882 372 | 4.0882,2.2882 373 | "22",5 374 | 3.5882,1.7882 375 | 3.5882,2.0118 376 | 3.8118,2.0118 377 | 3.8118,1.7882 378 | 3.5882,1.7882 379 | "16",5 380 | 3.0882,1.2882 381 | 3.0882,1.5118 382 | 3.3118,1.5118 383 | 3.3118,1.2882 384 | 3.0882,1.2882 385 | "23",5 386 | 3.5882,2.7882 387 | 3.5882,3.0118 388 | 3.8118,3.0118 389 | 3.8118,2.7882 390 | 3.5882,2.7882 391 | "20",5 392 | 3.4882,2.0882 393 | 3.4882,2.3118 394 | 3.7118,2.3118 395 | 3.7118,2.0882 396 | 3.4882,2.0882 397 | "19",5 398 | 3.1882,2.5882 399 | 3.1882,2.8118 400 | 3.4118,2.8118 401 | 3.4118,2.5882 402 | 3.1882,2.5882 403 | "17",5 404 | 3.0882,2.2882 405 | 3.0882,2.5118 406 | 3.3118,2.5118 407 | 3.3118,2.2882 408 | 3.0882,2.2882 409 | "12",5 410 | 2.5882,1.7882 411 | 2.5882,2.0118 412 | 2.8118,2.0118 413 | 2.8118,1.7882 414 | 2.5882,1.7882 415 | "2",5 416 | 1.0882,2.2882 417 | 1.0882,2.5118 418 | 1.3118,2.5118 419 | 1.3118,2.2882 420 | 1.0882,2.2882 421 | "1",5 422 | 0.588197,2.7882 423 | 0.588197,3.0118 424 | 0.811803,3.0118 425 | 0.811803,2.7882 426 | 0.588197,2.7882 427 | "13",5 428 | 2.5882,2.7882 429 | 2.5882,3.0118 430 | 2.8118,3.0118 431 | 2.8118,2.7882 432 | 2.5882,2.7882 433 | "3",5 434 | 1.0882,3.2882 435 | 1.0882,3.5118 436 | 1.3118,3.5118 437 | 1.3118,3.2882 438 | 1.0882,3.2882 439 | "6",5 440 | 1.4882,2.4882 441 | 1.4882,2.7118 442 | 1.7118,2.7118 443 | 1.7118,2.4882 444 | 1.4882,2.4882 445 | "7",5 446 | 
1.5882,2.7882 447 | 1.5882,3.0118 448 | 1.8118,3.0118 449 | 1.8118,2.7882 450 | 1.5882,2.7882 451 | "9",5 452 | 2.0882,2.2882 453 | 2.0882,2.5118 454 | 2.3118,2.5118 455 | 2.3118,2.2882 456 | 2.0882,2.2882 457 | "4",5 458 | 1.3882,3.4882 459 | 1.3882,3.7118 460 | 1.6118,3.7118 461 | 1.6118,3.4882 462 | 1.3882,3.4882 463 | "5",5 464 | 1.3882,4.8882 465 | 1.3882,5.1118 466 | 1.6118,5.1118 467 | 1.6118,4.8882 468 | 1.3882,4.8882 469 | "8",5 470 | 1.5882,3.7882 471 | 1.5882,4.0118 472 | 1.8118,4.0118 473 | 1.8118,3.7882 474 | 1.5882,3.7882 475 | "10",5 476 | 2.0882,3.2882 477 | 2.0882,3.5118 478 | 2.3118,3.5118 479 | 2.3118,3.2882 480 | 2.0882,3.2882 481 | "11",5 482 | 2.0882,4.2882 483 | 2.0882,4.5118 484 | 2.3118,4.5118 485 | 2.3118,4.2882 486 | 2.0882,4.2882 487 | "14",5 488 | 2.7882,3.9882 489 | 2.7882,4.2118 490 | 3.0118,4.2118 491 | 3.0118,3.9882 492 | 2.7882,3.9882 493 | "24",5 494 | 3.6882,3.9882 495 | 3.6882,4.2118 496 | 3.9118,4.2118 497 | 3.9118,3.9882 498 | 3.6882,3.9882 499 | -------------------------------------------------------------------------------- /data/forest-health/foresthealth.gra: -------------------------------------------------------------------------------- 1 | 83 2 | 82 3 | 2 4 | 2 1 5 | 83 6 | 2 7 | 0 7 8 | 79 9 | 2 10 | 7 0 11 | 81 12 | 2 13 | 7 8 14 | 78 15 | 2 16 | 8 9 17 | 62 18 | 2 19 | 11 6 20 | 55 21 | 2 22 | 11 5 23 | 80 24 | 4 25 | 2 12 3 1 26 | 77 27 | 4 28 | 12 13 4 3 29 | 75 30 | 3 31 | 13 14 4 32 | 72 33 | 1 34 | 14 35 | 61 36 | 3 37 | 15 6 5 38 | 76 39 | 3 40 | 16 8 7 41 | 74 42 | 4 43 | 16 17 9 8 44 | 71 45 | 4 46 | 17 18 10 9 47 | 60 48 | 3 49 | 19 20 11 50 | 73 51 | 3 52 | 21 13 12 53 | 70 54 | 4 55 | 21 22 14 13 56 | 67 57 | 2 58 | 22 14 59 | 59 60 | 2 61 | 25 15 62 | 54 63 | 3 64 | 25 26 15 65 | 69 66 | 4 67 | 23 27 17 16 68 | 66 69 | 3 70 | 27 18 17 71 | 68 72 | 2 73 | 28 21 74 | 63 75 | 2 76 | 29 28 77 | 53 78 | 3 79 | 30 20 19 80 | 48 81 | 2 82 | 30 20 83 | 65 84 | 4 85 | 28 31 22 21 86 | 64 87 | 4 88 | 24 32 27 23 89 | 56 90 | 4 91 | 33 34 32 24 92 | 47 93 | 3 94 | 35 26 25 95 | 58 96 | 3 97 | 32 36 27 98 | 57 99 | 4 100 | 29 37 31 28 101 | 50 102 | 4 103 | 34 38 37 29 104 | 49 105 | 3 106 | 38 33 29 107 | 46 108 | 3 109 | 39 40 30 110 | 52 111 | 3 112 | 37 41 31 113 | 51 114 | 4 115 | 33 42 36 32 116 | 42 117 | 3 118 | 42 34 33 119 | 45 120 | 3 121 | 41 43 35 122 | 41 123 | 2 124 | 43 35 125 | 44 126 | 4 127 | 42 44 39 36 128 | 43 129 | 4 130 | 38 45 41 37 131 | 40 132 | 4 133 | 44 46 40 39 134 | 39 135 | 5 136 | 45 46 47 43 41 137 | 38 138 | 3 139 | 47 44 42 140 | 37 141 | 4 142 | 47 48 44 43 143 | 36 144 | 6 145 | 49 52 53 46 45 44 146 | 33 147 | 4 148 | 53 58 57 46 149 | 35 150 | 6 151 | 52 53 50 58 59 47 152 | 31 153 | 5 154 | 59 60 53 52 49 155 | 25 156 | 4 157 | 54 55 56 60 158 | 34 159 | 8 160 | 53 50 58 59 57 61 49 47 161 | 32 162 | 9 163 | 50 58 59 57 61 48 52 49 47 164 | 21 165 | 6 166 | 55 63 56 62 51 60 167 | 18 168 | 6 169 | 63 56 54 62 51 60 170 | 15 171 | 4 172 | 63 55 54 51 173 | 28 174 | 7 175 | 61 64 59 58 53 48 52 176 | 30 177 | 10 178 | 59 57 61 64 62 65 53 48 52 49 179 | 29 180 | 10 181 | 57 61 60 62 65 58 50 53 52 49 182 | 26 183 | 9 184 | 51 62 54 65 55 63 61 59 50 185 | 27 186 | 11 187 | 60 64 62 65 66 67 57 59 58 53 52 188 | 22 189 | 12 190 | 54 65 66 55 67 63 68 64 60 61 59 58 191 | 16 192 | 8 193 | 56 68 67 55 65 54 62 60 194 | 23 195 | 8 196 | 62 65 66 67 71 61 57 58 197 | 20 198 | 11 199 | 66 67 63 71 68 62 64 60 61 59 58 200 | 19 201 | 8 202 | 67 71 68 75 65 62 64 61 203 | 17 204 | 9 205 | 63 71 68 75 66 65 62 
64 61 206 | 12 207 | 7 208 | 75 71 63 67 66 65 62 209 | 2 210 | 5 211 | 70 72 73 74 75 212 | 1 213 | 5 214 | 69 72 76 73 74 215 | 13 216 | 9 217 | 68 79 75 74 73 67 66 65 64 218 | 3 219 | 7 220 | 69 70 76 73 74 78 79 221 | 6 222 | 8 223 | 76 72 69 70 74 75 79 71 224 | 7 225 | 9 226 | 73 76 72 69 70 78 75 79 71 227 | 9 228 | 8 229 | 74 73 69 79 68 71 67 66 230 | 4 231 | 7 232 | 72 70 73 74 78 79 80 233 | 5 234 | 2 235 | 78 80 236 | 8 237 | 6 238 | 74 77 76 72 79 80 239 | 10 240 | 9 241 | 75 78 74 73 76 72 80 71 81 242 | 11 243 | 5 244 | 79 78 77 76 81 245 | 14 246 | 3 247 | 80 79 82 248 | 24 249 | 1 250 | 81 251 | -------------------------------------------------------------------------------- /data/mexdolphins/README.md: -------------------------------------------------------------------------------- 1 | Pantropical spotted dolphins in the Gulf of Mexico 2 | ================================================== 3 | 4 | Data copied from [this example DSM analysis](http://distancesampling.org/R/vignettes/mexico-analysis.html), after correcting for detectability. 5 | 6 | Data was collected by teh NOAA South East Fisheries Science Center (sic). The OBIS-SEAMAP page for the data may be found at the SEFSC [GoMex Oceanic 1996 survey page](http://seamap.env.duke.edu/dataset/25). 7 | 8 | ## Data citation 9 | 10 | Garrison, L. 2013. SEFSC GoMex Oceanic 1996. Data downloaded from OBIS-SEAMAP (http://seamap.env.duke.edu/dataset/25). 11 | -------------------------------------------------------------------------------- /data/mexdolphins/mexdolphins.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/data/mexdolphins/mexdolphins.RData -------------------------------------------------------------------------------- /data/mexdolphins/soap_pred.R: -------------------------------------------------------------------------------- 1 | # make soap pred grid 2 | 3 | library(mapdata) 4 | library(maptools) 5 | library(rgeos) 6 | library(rgdal) 7 | library(mgcv) 8 | 9 | load("mexdolphins.RData") 10 | 11 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 12 | 13 | usmap <- map('usa', plot=FALSE, xlim=c(-100,-80), ylim=c(22,33)) 14 | uspoly <- map2SpatialPolygons(usmap, IDs=usmap$names, 15 | proj4string=CRS("+proj=longlat +datum=WGS84")) 16 | 17 | # simplify the polygon 18 | uspoly <- gSimplify(uspoly, 0.08) 19 | 20 | 21 | # bounding box 22 | c1 = cbind(c(-97.5, -82, -82, -97.5), 23 | c(22.5, 22.5, 32, 32)) 24 | r1 = rbind(c1, c1[1, ]) # join 25 | P1 = Polygon(r1) 26 | Ps1 = Polygons(list(P1), ID = "a") 27 | 28 | SPs = SpatialPolygons(list(Ps1),proj4string=CRS("+proj=longlat +datum=WGS84")) 29 | 30 | # plot to check we got the right area 31 | #plot(uspoly)#, xlim=c(-100,-80), ylim=c(22,33)) 32 | #plot(SPs, add=TRUE) 33 | #points(mexdolphins[,c("longitude","latitude")]) 34 | 35 | # take the polygon difference 36 | soap_area <- gDifference(SPs,uspoly) 37 | # again check 38 | #plot(soap_area) 39 | #points(mexdolphins[,c("longitude","latitude")]) 40 | 41 | 42 | xy_bnd <- spTransform(soap_area, CRSobj=lcc_proj4) 43 | xy_bnd <- xy_bnd@polygons[[1]]@Polygons[[1]]@coords 44 | xy_bnd <- list(x=xy_bnd[,1], y=xy_bnd[,2]) 45 | 46 | gr <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), by=16469.27), 47 | y=seq(min(xy_bnd$y), max(xy_bnd$y), by=16469.27)) 48 | x <- gr[,1] 49 | y <- gr[,2] 50 | ind <- inSide(xy_bnd, 
x, y) 51 | 52 | soap_preddata <- as.data.frame(gr[ind,]) 53 | soap_preddata$area <- preddata$area[1] 54 | soap_preddata$off.set <- log(soap_preddata$area) 55 | 56 | 57 | save(xy_bnd, soap_preddata, file="soapy.RData") 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /data/mexdolphins/soapy.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/data/mexdolphins/soapy.RData -------------------------------------------------------------------------------- /data/routes.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/data/routes.csv -------------------------------------------------------------------------------- /data/yukon_seeds/seed_source_locations.csv: -------------------------------------------------------------------------------- 1 | Who,Site,Subsite,source_type,polygon,order,Y,X 2 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796723.731,469919.3107 3 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796742.626,469910.4723 4 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796729.702,469974.1999 5 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796725.89,470025.6419 6 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796719.851,470077.7116 7 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796725.935,470126.3351 8 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796718.948,470174.2659 9 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796702.393,470236.7358 10 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796694.535,470294.0962 11 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796742.568,470287.086 12 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796724.452,470303.7841 13 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796785.653,470258.4785 14 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796888.804,470191.593 15 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796964.678,470142.5823 16 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797042.968,470050.4508 17 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797046.439,470002.8427 18 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797040.834,469950.9495 19 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796996.21,469909.3531 20 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796975.594,469904.4163 21 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796888.762,469895.5375 22 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796850.814,469871.8622 23 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797168.951,469864.6928 24 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797133.442,469889.7501 25 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797067.112,470003.8641 26 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797045.311,469933.9624 27 | Eliot,FoxLake,SW_Facing,SSSample,NA,NA,6803025.398,465082.76 28 | Eliot,FoxLake,SW_Facing,SSSample,NA,NA,6803076.228,465133.385 29 | Eliot,FoxLake,Lowland,SSPolygon,38,1,6796715.73,470304.9674 30 | Eliot,FoxLake,Lowland,SSPolygon,38,2,6796728.455,470293.1938 31 | Eliot,FoxLake,Lowland,SSPolygon,38,3,6796753.325,470273.6796 32 | Eliot,FoxLake,Lowland,SSPolygon,38,4,6796788.911,470252.55 33 | Eliot,FoxLake,Lowland,SSPolygon,38,5,6796847.188,470216.8999 34 | Eliot,FoxLake,Lowland,SSPolygon,38,6,6796887.915,470191.8324 35 | Eliot,FoxLake,Lowland,SSPolygon,38,7,6796932.971,470156.1209 36 | Eliot,FoxLake,Lowland,SSPolygon,38,8,6796962.203,470143.2102 37 | Eliot,FoxLake,Lowland,SSPolygon,38,9,6796979.017,470128.2042 38 | Eliot,FoxLake,Lowland,SSPolygon,38,10,6797017.817,470096.0466 39 | 
Eliot,FoxLake,Lowland,SSPolygon,38,11,6797024.449,470083.6527 40 | Eliot,FoxLake,Lowland,SSPolygon,38,32,6797050.904,470081.0951 41 | Eliot,FoxLake,Lowland,SSPolygon,38,33,6796981.673,470133.1883 42 | Eliot,FoxLake,Lowland,SSPolygon,38,34,6796839.563,470223.1343 43 | Eliot,FoxLake,Lowland,SSPolygon,38,35,6796790.373,470257.3939 44 | Eliot,FoxLake,Lowland,SSPolygon,38,12,6797041.589,470051.2157 45 | Eliot,FoxLake,Lowland,SSPolygon,38,13,6797044.77,470038.4604 46 | Eliot,FoxLake,Lowland,SSPolygon,38,14,6797049.136,470021.4898 47 | Eliot,FoxLake,Lowland,SSPolygon,38,15,6797043.093,470009.7016 48 | Eliot,FoxLake,Lowland,SSPolygon,38,16,6797045.841,469991.8912 49 | Eliot,FoxLake,Lowland,SSPolygon,38,17,6797050.612,469984.3664 50 | Eliot,FoxLake,Lowland,SSPolygon,38,18,6797061.129,469968.7692 51 | Eliot,FoxLake,Lowland,SSPolygon,38,19,6797037.335,469950.6634 52 | Eliot,FoxLake,Lowland,SSPolygon,38,20,6797009.75,469923.8775 53 | Eliot,FoxLake,Lowland,SSPolygon,38,21,6797014.931,469917.5015 54 | Eliot,FoxLake,Lowland,SSPolygon,89,1,6796994.101,469909.7297 55 | Eliot,FoxLake,Lowland,SSPolygon,89,2,6796985.436,469906.6158 56 | Eliot,FoxLake,Lowland,SSPolygon,89,3,6796973.54,469906.5494 57 | Eliot,FoxLake,Lowland,SSPolygon,89,4,6796960.759,469898.896 58 | Eliot,FoxLake,Lowland,SSPolygon,89,5,6796919.57,469895.9869 59 | Eliot,FoxLake,Lowland,SSPolygon,89,6,6796906.707,469885.8498 60 | Eliot,FoxLake,Lowland,SSPolygon,89,7,6796855.959,469882.6563 61 | Eliot,FoxLake,Lowland,SSPolygon,89,8,6796848.594,469877.0291 62 | Eliot,FoxLake,Lowland,SSPolygon,89,15,6796843.691,469842.534 63 | Eliot,FoxLake,Lowland,SSPolygon,89,9,6796830.91,469827.1575 64 | Eliot,FoxLake,Lowland,SSPolygon,89,14,6796853.289,469799.318 65 | Eliot,FoxLake,Lowland,SSPolygon,89,12,6796817.835,469784.2797 66 | Eliot,FoxLake,Lowland,SSPolygon,89,10,6796812.723,469793.5348 67 | Eliot,FoxLake,Lowland,SSPolygon,89,13,6796821.899,469785.0872 68 | Eliot,FoxLake,Lowland,SSPolygon,89,11,6796814.147,469781.0331 69 | Eliot,FoxLake,Lowland,SSPolygon,159,1,6797215.359,469905.572 70 | Eliot,FoxLake,Lowland,SSPolygon,159,2,6797206.191,469912.9905 71 | Eliot,FoxLake,Lowland,SSPolygon,159,3,6797203.94,469943.5954 72 | Eliot,FoxLake,Lowland,SSPolygon,159,4,6797202.858,469895.6316 73 | Eliot,FoxLake,Lowland,SSPolygon,159,5,6797194.902,469889.1922 74 | Eliot,FoxLake,Lowland,SSPolygon,159,6,6797204.779,469878.4889 75 | Eliot,FoxLake,Lowland,SSPolygon,159,7,6797205.31,469870.0078 76 | Eliot,FoxLake,Lowland,SSPolygon,159,8,6797186.831,469856.0113 77 | Eliot,FoxLake,Lowland,SSPolygon,159,9,6797169.234,469850.342 78 | Eliot,FoxLake,Lowland,SSPolygon,159,10,6797150.315,469847.3502 79 | Eliot,FoxLake,Lowland,SSPolygon,159,11,6797135.268,469840.288 80 | Eliot,FoxLake,Lowland,SSPolygon,159,12,6797125.43,469837.8281 81 | Eliot,FoxLake,Lowland,SSPolygon,159,13,6797121.712,469844.4725 82 | Eliot,FoxLake,Lowland,SSPolygon,159,14,6797136.985,469852.2816 83 | Eliot,FoxLake,Lowland,SSPolygon,159,15,6797162.709,469858.8117 84 | Eliot,FoxLake,Lowland,SSPolygon,159,16,6797178.135,469866.1774 85 | Eliot,FoxLake,Lowland,SSPolygon,159,17,6797187.21,469872.7701 86 | Eliot,FoxLake,Lowland,SSPolygon,159,18,6797176.944,469888.4768 87 | Eliot,FoxLake,Lowland,SSPolygon,159,19,6797171.087,469910.6188 88 | Eliot,FoxLake,Lowland,SSPolygon,159,20,6797182.537,469924.3886 89 | Eliot,FoxLake,Lowland,SSPolygon,159,21,6797174.278,469956.3611 90 | Eliot,FoxLake,Lowland,SSPolygon,159,22,6797169.264,469966.2896 91 | Eliot,FoxLake,Lowland,SSPolygon,185,1,6797141.76,469950.5774 92 | 
Eliot,FoxLake,Lowland,SSPolygon,185,2,6797134.16,469928.8803 93 | Eliot,FoxLake,Lowland,SSPolygon,185,3,6797133.061,469886.2447 94 | Eliot,FoxLake,Lowland,SSPolygon,185,4,6797112.249,469847.2647 95 | Eliot,FoxLake,Lowland,SSPolygon,185,5,6797112.324,469865.5345 96 | Eliot,FoxLake,Lowland,SSPolygon,185,6,6797121.118,469891.7095 97 | Eliot,FoxLake,Lowland,SSPolygon,185,7,6797127.573,469922.8565 98 | Eliot,FoxLake,Lowland,SSPolygon,185,8,6797126.939,469938.0401 99 | Eliot,FoxLake,Lowland,SSPolygon,38,31,6797091.933,470016.4598 100 | Eliot,FoxLake,Lowland,SSPolygon,38,30,6797079.967,470007.3238 101 | Eliot,FoxLake,Lowland,SSPolygon,38,29,6797067.498,469997.9275 102 | Eliot,FoxLake,Lowland,SSPolygon,38,28,6797063.273,469988.7003 103 | Eliot,FoxLake,Lowland,SSPolygon,38,27,6797070.821,469974.1372 104 | Eliot,FoxLake,Lowland,SSPolygon,38,26,6797063.943,469959.6072 105 | Eliot,FoxLake,Lowland,SSPolygon,38,25,6797049.503,469942.8299 106 | Eliot,FoxLake,Lowland,SSPolygon,38,24,6797039.73,469931.6067 107 | Eliot,FoxLake,Lowland,SSPolygon,38,23,6797030.65,469922.4277 108 | Eliot,FoxLake,Lowland,SSPolygon,38,22,6797020.646,469910.829 109 | Eliot,FoxLake,Lowland,SSPolygon,233,1,6797526.926,469768.1144 110 | Eliot,FoxLake,Lowland,SSPolygon,233,2,6797486.016,469794.4904 111 | Eliot,FoxLake,Lowland,SSPolygon,233,3,6797400.062,469863.5996 112 | Eliot,FoxLake,SW_Facing,SSPolygon,380,1,6803013.014,465080.2516 113 | Eliot,FoxLake,SW_Facing,SSPolygon,380,2,6803089.116,465135.0063 114 | Eliot,FoxLake,SW_Facing,SSPolygon,389,1,6803262.384,464081.8498 115 | Eliot,FoxLake,SW_Facing,SSPolygon,389,2,6803618.396,464040.2266 116 | Eliot,FoxLake,SW_Facing,SSPolygon,391,1,6803587.361,464040.0052 117 | Eliot,FoxLake,SW_Facing,SSPolygon,391,2,6803174.081,464080.9813 118 | Eliot,FoxLake,Lowland,SSIsland,56,NA,6796740.729,470183.8429 119 | Eliot,FoxLake,Lowland,SSIsland,58,NA,6796737.425,470200.11 120 | Eliot,FoxLake,Lowland,SSIsland,65,NA,6796729.556,470238.3273 121 | Eliot,FoxLake,Lowland,SSIsland,67,NA,6796738.312,470243.0132 122 | Eliot,FoxLake,Lowland,SSIsland,68,NA,6796734.045,470246.1296 123 | Eliot,FoxLake,Lowland,SSIsland,97,NA,6796940.921,469902.9191 124 | Eliot,FoxLake,Lowland,SSIsland,98,NA,6796929.918,469896.6909 125 | Eliot,FoxLake,Lowland,SSIsland,99,NA,6796923.639,469901.6075 126 | Eliot,FoxLake,Lowland,SSIsland,114,NA,6796836.526,469792.1375 127 | Eliot,FoxLake,Lowland,SSIsland,156,NA,6796980.95,470060.4577 128 | Eliot,FoxLake,Lowland,SSIsland,182,NA,6797146.336,469972.2485 129 | Eliot,FoxLake,Lowland,SSIsland,183,NA,6797152.1,469950.1815 130 | Eliot,FoxLake,Lowland,SSIsland,184,NA,6797146.21,469953.2912 131 | Eliot,FoxLake,Lowland,SSIsland,212,NA,6796949.689,469802.0181 132 | Eliot,FoxLake,Lowland,SSIsland,213,NA,6796948.007,469797.7336 133 | Eliot,FoxLake,Lowland,SSIsland,214,NA,6796936.248,469789.6354 134 | Eliot,FoxLake,Lowland,SSIsland,231,NA,6797483.691,469755.6377 135 | Eliot,FoxLake,Lowland,SSIsland,232,NA,6797514.164,469726.8687 136 | Eliot,FoxLake,Lowland,SSIndividual,60,NA,6796747.059,470205.7828 137 | Eliot,FoxLake,Lowland,SSIndividual,61,NA,6796737.175,470215.1502 138 | Eliot,FoxLake,Lowland,SSIndividual,62,NA,6796734.953,470215.1313 139 | Eliot,FoxLake,Lowland,SSIndividual,63,NA,6796725.697,470212.2823 140 | Eliot,FoxLake,Lowland,SSIndividual,64,NA,6796727.524,470235.7011 141 | Eliot,FoxLake,Lowland,SSIndividual,66,NA,6796736.703,470238.8686 142 | Eliot,FoxLake,Lowland,SSIndividual,96,NA,6796948.488,469901.2824 143 | Eliot,FoxLake,Lowland,SSIndividual,203,NA,6797043.34,469928.7998 144 | 
Eliot,FoxLake,Lowland,SSIndividual,204,NA,6797039.158,469928.6471 145 | Eliot,FoxLake,Lowland,SSIndividual,208,NA,6797019.777,469904.491 146 | -------------------------------------------------------------------------------- /example-bivariate-timeseries-and-ti.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "More time series; bivariate smooths and ti()" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | fig_width: 10 10 | fig_height: 4 11 | --- 12 | 13 | ## Background 14 | 15 | This example is an extension of the Fort Lauderdale time series model. Here we look at two additional advanced features, namely 16 | 17 | i. allowing the seasonal and trend terms to interact, and 18 | ii. setting up an ANOVA-like decomposition for interaction smooth using `ti()` terms 19 | 20 | The model in the earlier example estimated a constant non-linear seasonal (within-year) effect and a constant non-linear trend (between year) effect. Such a model is unable to capture any potential change in the seasonal effect (the within-year distribution of temperatures) over time. Or, put another way, is unable to capture potentially different trends in temperature are particular times of the year. 21 | 22 | This example extends the simpler model to more generally model changes in temperature. 23 | 24 | ### Loading and viewing the data 25 | 26 | The time series we'll look at is a long-term temperature and precipitation 27 | data set from the convention center here at Fort Lauderdale, consisting of 28 | monthly mean temperature (in degrees c), the number of days it rained that 29 | month, and the amount of precipitation that month (in mm). The data range from 30 | 1950 to 2015, with a column for year and month (with Jan. = 1). The data is from 31 | a great web-app called FetchClim2, and the link for the data [I used is 32 | here](http://fetchclimate2.cloudapp.net/#page=geography&dm=values&t=years&v=airt(13,6,2,1),prate(12,7,2,1),wet(1)&y=1950:1:2015&dc=1,32,60,91,121,152,182,213,244,274,305,335,366&hc=0,24&p=26.099,-80.123,Point%201&ts=2016-07-17T21:57:11.213Z). 33 | 34 | Here we'll load it and plot the air temperature data. 35 | 36 | ```{r, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE, highlight = TRUE} 37 | library("mgcv") 38 | library("ggplot2") 39 | florida <- read.csv("./data/time_series/Florida_climate.csv", 40 | stringsAsFactors = FALSE) 41 | head(florida) 42 | 43 | ggplot(data = florida, aes(x = month, y = air_temp, color = year, group = year)) + 44 | geom_point() + 45 | geom_line() 46 | ``` 47 | 48 | ## Fitting the smooth interaction model 49 | 50 | Following from the earlier example, we fit a smooth interaction model using a tensor product spline of `year` and `month`. Notice how we specify the marginal basis types as a character vector. Here I restrict the model to AR(1) errors as this was the best model in the earlier example and it is unlikely that additional unmodelled temporal structure will turn up when we use a more general model! 
51 | 52 | ```{r fit-smooth-interaction, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE} 53 | m.full <- gamm(air_temp ~ te(year, month, bs = c("tp", "cc")), data = florida, 54 | correlation = corAR1(form = ~ 1 | year), 55 | knots = list(month = c(0.5, 12.5))) 56 | layout(matrix(1:3, nrow = 1)) 57 | plot(m.full$gam, scheme = 1, theta = 40) 58 | plot(m.full$gam, scheme = 1, theta = 80) 59 | plot(m.full$gam, scheme = 1, theta = 120) 60 | layout(1) 61 | summary(m.full$gam) 62 | ``` 63 | 64 | The plot of the tensor product smooth is quite illustrative of what the model is attempting to do. The earlier model would be forced to retain the same seasonal curve throughout the time series. Now, with the more complex model, the seasonal curve is allowed to change shape smoothly through time. 65 | 66 | Our task now is to test whether this additional complexity is worth the extra effort. Are some months warming more than others in Fort Lauderdale? 67 | 68 | ## Fitting the ANOVA-decomposed model 69 | 70 | The logical thing to do to test whether some months are warming more than others would be to drop the interaction term from the model and compare the simpler and more complex models with AIC or a likelihood ratio test. But if you look back at the model we just fitted, there is no term in that model that relates just to the interaction! The tensor product contains both the main and interactive effects. What's worse is that we can't (always) compare this model with the one we fitted in the earlier example because we need to be careful to ensure the models are properly nested. This is where the `ti()` smoother comes in. 71 | 72 | A `ti()` smoother is like a standard tensor product (`te()`) smooth, but without the main effects smooths. Now we can build a model with separate main effects smooths for `year` and `month` and add in the interaction smooth using `ti()`. This will allow us to fit two properly nested models that can be compared to test our null hypothesis that the months are warming at the same rate. 73 | 74 | Let's fit those models 75 | 76 | ```{r fit-anova-decomp, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE} 77 | m.main <- gam(air_temp ~ s(year, bs = "tp") + s(month, bs = "cc"), data = florida, 78 | knots = list(month = c(0.5, 12.5))) 79 | m.int <- gam(air_temp ~ ti(year, bs = "tp") + ti(month, bs = "cc") + 80 | ti(year, month, bs = c("tp", "cc"), k = 11), data = florida, 81 | knots = list(month = c(0.5, 12.5))) 82 | ``` 83 | 84 | If you do this with `gamm()`, the model fitting will fail, and for good reason. At least the two `gam()` models converged, so we can see what is going on. Compare the two models using `AIC()` 85 | 86 | ```{r aic-anova-decomp, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE} 87 | AIC(m.main, m.int) 88 | ``` 89 | 90 | AIC tells us that there is some support for the interaction model, but only when we force a more complex surface by specifying `k`. 91 | 92 | Regardless, the interaction term only affords a >1% improvement in deviance explained; whilst the interaction may be favoured by AIC, any variation in the seasonal effect over time must be small. 93 | 94 | ## Exercises 95 | 96 | 1. 
For an example where there is a stronger change in the seasonal effect, see my two blog posts on applying these models to the Central England Temperature series: 97 | - [Post 1](http://www.fromthebottomoftheheap.net/2015/11/21/climate-change-and-spline-interactions/) 98 | - [Post 2](http://www.fromthebottomoftheheap.net/2015/11/23/are-some-seasons-warming-more-than-others/) 99 | 100 | These two posts take you through a full analysis of this data series. 101 | -------------------------------------------------------------------------------- /example-forest-health.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Markov random field smooths & German forest health data" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | --- 10 | 11 | # Introduction 12 | 13 | In this example, we look at a spatial data set of forest health data from a stand in Germany. We can treat the individual trees as discrete spatial units and fit a spatial term using a special type of spline basis, a Markov random field. 14 | 15 | The data come from a project n the forest of Rothenbuch, Germany, which has been ongoing since 1982. The project studies five species but here we just consider the beech data. Each year the condition of each of 83 trees is categorized into one of nine ordinal classes in terms of the degree (percentage) of defoliation. A defoliation value of 0% indicates a healthy beech tree, whilst a value of 100% indicates a tree is dead. For the purposes of this example, several of the nine ordinal classes are merged to form a three class response variable of interest 16 | 17 | 1. 0%, healthy trees that I'll refer to as the `"low"` defoliation group 18 | 2. 12.5% -- 37.5% defoliation, which I refer to as the `"med"` group, and 19 | 3. 50% -- 100% defoliation, which is the `"high"` group. 20 | 21 | The original classes were `r paste0(c(0,12.5,37.5,50,62.5,75, 87.5, 100), "%")` and are in variable `defol`. 22 | 23 | Alongside the response variable, a number of continuous and categorical covariates are recorded 24 | 25 | - `id` --- location ID number 26 | - `year` --- year of recording 27 | - `x` and `y` --- the x- and y-coordinates of each location 28 | - `age` --- average age of trees at a location 29 | - `canopyd` --- canopy density at the location, in % 30 | - `gradient` --- slope, in % 31 | - `alt` --- altitude above sea level in m 32 | - `depth` --- soil depth in cm 33 | - `ph` --- soil pH at 0--2cm depth 34 | - `watermoisture` --- soil moisture in three categories 35 | 1. `1` moderately dry 36 | 2. `2` moderately moist 37 | 3. `3` moist or temporarily wet 38 | - `alkali` --- fraction of alkali ions in four categories 39 | 1. `1` very low 40 | 2. `2` low 41 | 3. `3` moderate 42 | 4. `4` high 43 | - `humus` thickness of the soil humus layer in five categories 44 | 1. `0` 0cm 45 | 2. `1` 1cm 46 | 3. `2` 2cm 47 | 4. `3` 3cm 48 | 5. `4` $>$3cm 49 | - `type` --- type of forest 50 | - `0` deciduous forest 51 | - `1` mixed forest 52 | - `fert` --- fertilization 53 | - `0` not fertilized 54 | - `1` fertilized 55 | 56 | The aim of the example is to investigate the effect of the measured covariates on the degree of defoliation and to quantify any temporal trend and spatial effect of geographic location in the data set, while adjusting for the effects of the other covariates. 
57 | 58 | The data are extensively analysed in Fahrmeir, Kneib, Lang, and Marx (2013) *Regression: Models, Methods and Applications*. Springer. 59 | 60 | # Getting started 61 | 62 | Begin by loading the packages we'll use in this example 63 | 64 | ```{r load-packages} 65 | ## Load packages 66 | library("mgcv") 67 | library("ggplot2") 68 | ``` 69 | 70 | Next, load the forest health data set 71 | 72 | ```{r read-data} 73 | ## Read in the forest health data 74 | forest <- read.table("./data/forest-health/beech.raw", header = TRUE, na.strings = ".") 75 | head(forest) 76 | ``` 77 | 78 | The data require a little processing to ensure they are correctly coded. The chunk below does 79 | 80 | 1. Makes a nicer stand label so that we can sort stands numerically even though the id is character, 81 | 2. Aggregates the defoliation data into an ordered categorical variable for low, medium, or high levels of defoliation. This is converted to a numeric variable for modelling, 82 | 3. Convert the categorical variables to factors. `humus` is converted to a 4-level factor, 83 | 4. Remove some `NA`s, and 84 | 5. Summarises the response variable in a table. 85 | 86 | ```{r process-forest-data} 87 | forest <- transform(forest, id = factor(formatC(id, width = 2, flag = "0"))) 88 | 89 | ## Aggregate defoliation & convert categorical vars to factors 90 | levs <- c("low","med","high") 91 | forest <- transform(forest, 92 | aggDefol = as.numeric(cut(defol, breaks = c(-1,10,45,101), 93 | labels = levs)), 94 | watermoisture = factor(watermoisture), 95 | alkali = factor(alkali), 96 | humus = cut(humus, breaks = c(-0.5, 0.5, 1.5, 2.5, 3.5), 97 | labels = 1:4), 98 | type = factor(type), 99 | fert = factor(fert)) 100 | forest <- droplevels(na.omit(forest)) 101 | head(forest) 102 | with(forest, table(aggDefol)) 103 | ``` 104 | 105 | To view the spatial arrangement of the forest stand, plot the unique `x` and `y` coordinate pairs 106 | 107 | ```{r plot-forest-stand} 108 | ## Plot data 109 | ggplot(unique(forest[, c("x","y")]), aes(x = x, y = y)) + 110 | geom_point() 111 | ``` 112 | 113 | # Non-spatial ordered categorical GAM 114 | 115 | Our first model will ignore the spatial component of the data. All continuous variables except `ph` and `depth` are modelled using splines, and all categorical variables are included in the model. Note that we can speed up fitting by turning on some multi-threaded parallel processing in parts of the fitting algorithm via the `nthreads` control argument. The `ocat` family is used and we specify that there are three classes. Smoothness selection is via REML. 116 | 117 | ```{r fit-naive-model} 118 | ## Model 119 | ctrl <- gam.control(nthreads = 3) 120 | forest.m1 <- gam(aggDefol ~ ph + depth + watermoisture + alkali + humus + type + fert + 121 | s(age) + s(gradient, k = 20) + s(canopyd) + s(year) + s(alt), 122 | data = forest, family = ocat(R = 3), method = "REML", 123 | control = ctrl) 124 | ``` 125 | 126 | The model summary 127 | 128 | ```{r summary-naive} 129 | summary(forest.m1) 130 | ``` 131 | 132 | indicates all model terms are significant at the 95% level, especially the smooth terms. Don't pay too much attention to this though as we have not yet accounted for spatial structure in the data and several of the variables are likely to be spatially autocorrelated and hence the identified effects may be spurious. 133 | 134 | This is somewhat confirmed by the form of the fitted functions; rather than smooth, monotonic functions man terms are highly non-linear and difficult to interpret. 
135 | 136 | ```{r plot-naive-smooths} 137 | plot(forest.m1, pages = 1, shade = 2, scale = 0) 138 | ``` 139 | We might expect that older trees are more susceptible to damage, yet confusingly there is a decreasing effect for the very oldest trees. The smooth of `gradient` is uninterpretable. Trees in low and high altitude areas are less damaged than those in areas of intermediate elevation, which is also counterintuitive. 140 | 141 | Running `gam.check()` 142 | 143 | ```{r gam-check-naive} 144 | gam.check(forest.m1) 145 | ``` 146 | 147 | suggests no major problems, but the residual plot is difficult to interpret owing to the categorical nature of the response variable. The printed output suggests some smooth terms may need their basis dimension increasing. Before we do this, however, we should add a spatial effect to the model. 148 | 149 | # Spatial GAM via an MRF smooth 150 | 151 | For these data, it would be more natural to fit a spatial effect via a 2-d smoother as we've considered in other examples. However, we can consider the trees as being discrete spatial units and fit a spatial effect via a Markov random field (MRF) smooth. To fit the MRF smoother, we need information on the neighbours of each tree. In this example, any tree within 1.8km of a focal tree was considered a neighbour of that focal tree. This neighbourhood information is stored in a BayesX graph file, which we can convert into the format needed for **mgcv**'s MRF basis function. To facilitate reading the graph file, load the utility function `gra2mgcv()` 152 | 153 | ```{r load-graph-fun} 154 | ## source graph reading function 155 | source("./code_snippets/gra2mgcv.R") 156 | ``` 157 | 158 | Next we load the `.gra` file and do some manipulations to match the `forest` environmental data file 159 | 160 | ```{r load-graph} 161 | ## Read in graph file and output list required by mgcv 162 | nb <- gra2mgcv("./data/forest-health/foresthealth.gra") 163 | nb <- nb[order(as.numeric(names(nb)))] 164 | names(nb) <- formatC(as.numeric(names(nb)), width = 2, flag = "0") 165 | ``` 166 | 167 | Look at the structure of `nb`: 168 | 169 | ```{r head-nb} 170 | head(nb) 171 | ``` 172 | 173 | In **mgcv** the MRF basis can be specified by one or more of 174 | 175 | 1. `polys`; coordinates of vertices defining spatial polygons for each discrete spatial unit. Any two spatial units that share one or more vertices are considered neighbours, 176 | 2. `nb`; a list with one component per spatial unit, where each component contains indices of the neighbouring components of the current spatial unit, and/or 177 | 3. `penalty`; the actual penalty matrix of the MRF basis. This is an $N$ by $N$ matrix with the number of neighbours of each unit on the diagonal, and the $j$th column of the $i$th row set to `-1` if the $j$th spatial unit is a neighbour of the $i$th unit. Elsewhere the penalty matrix is all `0`s. 178 | 179 | **mgcv** will create the penalty matrix for you if you supply `polys` or `nb`. As we don't have polygons here, we'll use the information from the `.gra` file converted to the format needed by `gam()` to indicate the neighbourhood. 180 | 181 | The `nb` list is passed along using the `xt` argument of the `s()` function. For the MRF basis, `xt` is a named list with one or more of the components listed above. In the model call below we use `xt = list(nb = nb)` to pass on the neighbourhood list. 
To indicate that an MRF smooth is needed, we use `bs = "mrf"` and the covariate of the smooth is the factor indicating to which spatial unit each observation belongs. In the call below we use the `id` variable. Note that this doesn't need to be a factor, just something that can be coerced to one. 182 | 183 | All other aspects of the model fit remain the same hence we use `update()` to add the MRF smooth without repeating everything from the original call. 184 | 185 | ```{r fit-mrf-model} 186 | ## Fit model with MRF 187 | ## forest.m2 <- gam(aggDefol ~ ph + depth + watermoisture + alkali + humus + type + fert + 188 | ## s(age) + s(gradient, k = 20) + s(canopyd) + s(year) + s(alt) + 189 | ## s(id, bs = "mrf", xt = list(nb = nb)), 190 | ## data = forest, family = ocat(R = 3), method = "REML", 191 | ## control = ctrl) 192 | forest.m2 <- update(forest.m1, . ~ . + s(id, bs = "mrf", xt = list(nb = nb))) 193 | ``` 194 | 195 | Look at the model summary: 196 | 197 | ```{r summary-mrf-model} 198 | summary(forest.m2) 199 | ``` 200 | 201 | What differences do you see between the model with the MRF spatial effect and the first model that we fitted? 202 | 203 | Quickly create plots of the fitted smooth functions 204 | 205 | ```{r plot-mrf-smooths} 206 | plot(forest.m2, pages = 1, shade = 2, scale = 0, seWithMean = TRUE) 207 | ``` 208 | 209 | Notice that here we use `seWithMean = TRUE` as one of the terms has been shrunk back to a linear function and would otherwise have a *bow tie* confidence interval. 210 | 211 | Compare these fitted functions with those from the original model. Are these more in keeping with expectations? 212 | 213 | Model diagnostics are difficult with models for discrete responses such as these. Instead we can interrogate the model to derive quantities of interest that illustrate or summarise the model fit. First we start by generating posterior probabilities for the three ordinal defoliation/damage categories using `predict()`. This is similar to the code Eric showed this morning for the ordered categorical family. 214 | 215 | ```{r fitted-values} 216 | fit <- predict(forest.m2, type = "response") 217 | colnames(fit) <- levs 218 | head(fit) 219 | ``` 220 | 221 | The `predict()` method returns a 3-column matrix, one column per category. The entries in the matrix are the posterior probabilities of class membership, with rows summing to 1. As we'll see later, we can take a *majority wins* approach and assign each observation a fitted class membership and compare these to the known classes. 222 | 223 | For easier visualisation it would be nicer to have these fitted probabilities in a tidy form, which we now do via a few manipulations 224 | 225 | ```{r process-fitted-values} 226 | fit <- setNames(stack(as.data.frame(fit)), c("Prob", "Defoliation")) 227 | fit <- transform(fit, Defoliation = factor(Defoliation, labels = levs)) 228 | fit <- with(forest, cbind(fit, x, y, year)) 229 | head(fit) 230 | ``` 231 | 232 | The fitted values are now ready for plotting. 
Here we plot just the years 2000--2004, showing the spatial arrangement of trees, faceted by defoliation/damage category: 233 | 234 | ```{r plot-fitted} 235 | ggplot(subset(fit, year >= 2000), aes(x = x, y = y, col = Prob)) + 236 | geom_point(size = 1.2) + 237 | facet_grid(Defoliation ~ year) + 238 | coord_fixed() + 239 | theme(legend.position = "top") 240 | ``` 241 | 242 | A more complex use of the model involves predicting change in class posterior probability over time whilst holding all other variables at the observed mean or category mode 243 | 244 | ```{r timeseries-predictions} 245 | N <- 200 246 | pdat <- with(forest, expand.grid(year = seq(min(year), max(year), length = N), 247 | age = mean(age, na.rm = TRUE), 248 | gradient = mean(gradient, na.rm = TRUE), 249 | canopyd = mean(canopyd, na.rm = TRUE), 250 | alt = mean(alt, na.rm = TRUE), 251 | depth = mean(depth, na.rm = TRUE), 252 | ph = mean(ph, na.rm = TRUE), 253 | humus = factor(2, levels = levels(humus)), 254 | watermoisture = factor(2, levels = levels(watermoisture)), 255 | alkali = factor(2, levels = levels(alkali)), 256 | type = factor(0, levels = c(0,1)), 257 | fert = factor(0, levels = c(0,1)), 258 | id = levels(id))) 259 | pred <- predict(forest.m2, newdata = pdat, type = "response", exclude = "s(id)") 260 | colnames(pred) <- levs 261 | pred <- setNames(stack(as.data.frame(pred)), c("Prob","Defoliation")) 262 | pred <- cbind(pred, pdat) 263 | pred <- transform(pred, Defoliation = factor(Defoliation, levels = levs)) 264 | ``` 265 | 266 | A plot of the predictions is produced using 267 | 268 | ```{r plot-timeseries-predictions} 269 | ggplot(pred, aes(x = year, y = Prob, colour = Defoliation)) + 270 | geom_line() + 271 | theme(legend.position = "top") 272 | ``` 273 | 274 | Using similar code, produce a plot for the effect of `age` holding other values at the mean or mode. 275 | 276 | The final summary of the model that we'll produce is a confusion matrix of observed and predicted class membership. Technically, this is over-optimistic as we are using the same data to both fit and test the model, but it is an illustration of how well the model does at fitting the observed classes. 277 | 278 | ```{r confusion-matrix} 279 | fit <- predict(forest.m2, type = "response") 280 | fitClass <- factor(levs[apply(fit, 1, which.max)], levels = levs) 281 | obsClass <- with(forest, factor(aggDefol, labels = levs)) 282 | sum(fitClass == obsClass) / length(fitClass) 283 | table(fitClass, obsClass) 284 | ``` 285 | 286 | Just don't take it as any indication of how well the model will do at predicting the defoliation class for a new year or tree. 287 | 288 | How much better does the model with the MRF spatial effect fit the data compared to the original model that we fitted? We can use AIC as one means of answering that question 289 | 290 | ```{r aic} 291 | AIC(forest.m1, forest.m2) 292 | ``` 293 | 294 | Which model does AIC indicate as fitting the data best? 295 | 296 | ## Exercise 297 | 298 | As an additional exercise, replace the MRF smooth with a 2-d spline of the `x` and `y` coordinates and produce maps of the class probabilities over the spatial domain. You'll need to use ideas and techniques from some of the other spatial examples in order to complete this task. One possible starting point is sketched below.
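One possible starting point is the sketch below (not evaluated here). It simply swaps the MRF term for a 2-d thin plate spline of the coordinates and reuses the tidying and plotting steps shown above; the object names `forest.m3` and `fit3` are placeholders, and this is only one of several reasonable ways to approach the exercise.

```{r xy-spline-sketch, eval=FALSE}
## Sketch: replace the MRF smooth with a 2-d spline of the coordinates
forest.m3 <- update(forest.m1, . ~ . + s(x, y))

## Posterior class probabilities, tidied exactly as for the MRF model above
fit3 <- predict(forest.m3, type = "response")
colnames(fit3) <- levs
fit3 <- setNames(stack(as.data.frame(fit3)), c("Prob", "Defoliation"))
fit3 <- transform(fit3, Defoliation = factor(Defoliation, labels = levs))
fit3 <- with(forest, cbind(fit3, x, y, year))

## Map the class probabilities over the spatial domain, faceted by class
ggplot(subset(fit3, year >= 2000), aes(x = x, y = y, col = Prob)) +
    geom_point(size = 1.2) +
    facet_grid(Defoliation ~ year) +
    coord_fixed() +
    theme(legend.position = "top")
```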
299 | -------------------------------------------------------------------------------- /example-linear-functionals.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Linear functionals; or: what to do when you've got a bunch of x-values and only one y" 3 | author: Eric Pedersen 4 | output: 5 | html_document: 6 | toc: true 7 | toc_float: true 8 | theme: readable 9 | highlight: haddock 10 | --- 11 | 12 | ##1. Background 13 | 14 | Up to now, we've been looking at data where every predictor we're using to 15 | predict our y-values consists of a single value for each y. However, not all 16 | predictor data looks like this. In many cases, a single predictor may have 17 | several values associated with the same outcome variable. For instance, let's 18 | say we've measured total ecosystem production in several different lakes, as 19 | well as temperature along a gradient away from the shoreline in each lake. We want to know: 20 | 21 | 1. how well temperature predicts production 22 | 2. whether temperatures at different distances from the shore have different effects on 23 | production (as production varies a lot between the littoral and pelagic zones). 24 | 25 | Now, it looks like there might be a few ways to answer this in a standard gam 26 | setting. First we could just average temperatures across locations and use that 27 | as a predictor to answer question 1. However, that doesn't give us any insight 28 | into question 2, and it's pretty easy to imagine a case where warmer temperatures 29 | at the shore increase production, but warmer temperatures in the middle of the 30 | lake have very little effect, or where a single warm area could substantially 31 | increase production even in a cold lake. We could also try to fit a separate 32 | smooth of production on temperature at each shore distance. However, if we have 33 | a lot of distances, this ends up fitting a bunch of separate smooths, and we'd 34 | rapidly run out of degrees of freedom. Also, it would mean throwing away 35 | information; we'd expect that the effect of temperature a meter away from the 36 | shore should be very similar to the effect of temperature right at the 37 | shoreline, but by fitting a separate smooth for each, we're ignoring that (and 38 | likely going to suffer from concurvity issues to boot!). 39 | 40 | What we need is a method that can account for the fact that we have multiple x 41 | values for a given predictor. Fortunately, `mgcv` can handle this pretty easily. 42 | It does this by allowing you to pass matrix-valued predictors to `s()` and `te()` 43 | terms. When you pass `mgcv` a matrix, it will fit the same smooth function to 44 | each of the columns of the matrix, then sum across the rows of the matrix of 45 | transformed values to give the estimated mean value for a given `y`. Let's say 46 | `y` is a one-dimensional outcome, with n measured values, and `x` is a matrix 47 | with n rows and k columns. The predicted values for the ith value of `y` would 48 | be: $y_i \sim \sum_{j=1}^k f(x_{i,j})$, where $f(x_{i,j})$ is itself a sum of 49 | basis functions multiplied by model coefficients as before. 50 | 51 | In mathematical terms, this makes our smooth term a *functional*, where our 52 | outcome is a function of a vector of terms rather than a single term, and a 53 | linear functional, as it's a linear sum of smooth terms for each column; but you 54 | really don't need to know much about functionals to use these.
I only mention 55 | the functional thing because, if you want to dig into this approach further or if 56 | you want to read the help files on these, you have to search 57 | `?linear.functional.terms`. 58 | 59 | There are a few major uses I've found for these. There are likely more than this, 60 | but these are all cases I've encountered. Also, each of these cases requires you 61 | to set up your predictor matrices carefully, so make sure you know what kind of 62 | functional you'll be fitting, and what predictor matrix is going where in the `s()` 63 | function. 64 | 65 | ### Case 1. Nonlinear averaging. 66 | The first case, and the simplest to fit, is to estimate 67 | a nonlinear average. Let's look at the lake example from before. Say we think 68 | that distance from shore shouldn't matter, but that production increases 69 | non-linearly with temperature. In that case, a cold lake with a few hot spots 70 | may actually be more productive than a lake that's consistently lukewarm. 71 | Therefore, if you average over temperatures before estimating a function, you'll 72 | miss this effect (as we know from statistics that the average of a non-linear 73 | function applied to x does not, in general, equal the function applied to the 74 | average of x). In this case, we can just provide a matrix of values (the predictor matrix) to `s()`: here, the predictor matrix is a matrix of temperatures where each column is the temperature measured at one of our sites. 75 | We do this in the code below with the variable `temp_matrix`. 76 | 77 | 78 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE} 79 | library(dplyr) 80 | library(mgcv) 81 | library(ggplot2) 82 | library(tidyr) 83 | n_lakes = 200 84 | lake_mean_temps = rnorm(n_lakes, 25,6) 85 | lake_data = as.data.frame(expand.grid(lake = 1:n_lakes, 86 | shore_dist_m = c(0,1,5,10,20,40))) 87 | lake_data$temp = rnorm(n_lakes*6, lake_mean_temps[lake_data$lake], 8) 88 | lake_data$production = 5*dnorm(lake_data$temp, 30, sd = 5)/dnorm(1,0,5)+rnorm(n_lakes*6, 0, 1) 89 | lake_data = lake_data %>% group_by(lake)%>% 90 | mutate(production = mean(production), 91 | mean_temp = mean(temp))%>% 92 | spread(shore_dist_m, temp) 93 | 94 | 95 | lake_data$temp_matrix = lake_data %>% 96 | select(`0`:`40`) %>% 97 | as.matrix(.) 98 | head(lake_data) 99 | 100 | 101 | mean_temp_model = gam(production~s(mean_temp),data=lake_data) 102 | nonlin_temp_model = gam(production~s(temp_matrix), 103 | data=lake_data) 104 | 105 | layout(matrix(1:2, nrow=1)) 106 | plot(mean_temp_model) 107 | plot(nonlin_temp_model) 108 | layout(1) 109 | temp_predict_data = data.frame(temp = seq(10,40,length=100), 110 | mean_temp = seq(10,40,length=100)) 111 | 112 | temp_matrix = matrix(seq(10,40,length=100),nrow= 100,ncol=6, 113 | byrow = F) 114 | 115 | temp_predict_data = temp_predict_data %>% 116 | mutate(linear_average = as.numeric(predict(mean_temp_model,.)), 117 | nonlinear_average = as.numeric(predict(nonlin_temp_model, .)), 118 | true_function = 5*dnorm(temp, 30, sd = 5)/dnorm(1,0,5))%>% 119 | gather(model, value,linear_average:true_function) 120 | 121 | #This plots the two fitted models vs. the true model. 122 | ggplot(aes(temp, value,color=model), data=temp_predict_data)+ 123 | geom_line()+ 124 | scale_color_brewer(palette="Set1")+ 125 | theme_bw(20) 126 | ``` 127 | 128 | The nonlinear model is substantially more uncertain at the ends, 129 | but it fits the true function much more effectively.
The function from 130 | the average data substantially underestimates the effect of temperature on 131 | production. 132 | 133 | ### Case 2: Weighted averaging 134 | The next major case where linear functionals are useful is for weighted 135 | averages. Here, you have some predictor variable measured at various points at 136 | different distances or lags from a given point, or at different locations along 137 | some gradient. This could be a variable that's been measured at various distances 138 | away from each observed site, and you want to understand at what scale that 139 | variable will affect your parameter of interest. It could also be a predictor 140 | variable measured at several time points before the observation, and you want to 141 | know at what lags the two variables interact. 142 | 143 | In this case, we'll assume that the relationship between our variable of 144 | interest (x) and the outcome is linear at any given lag, but that linear 145 | relationship changes smoothly with the lag. We'll look at the case where both 146 | relationships are nonlinear next. Here we have to create two matrices. The first 147 | is the lag matrix, and it will have one column for each lag we're interested in. 148 | All the values in a given column will be equal to the lag value. 149 | The second matrix is the predictor matrix, and it is the same as in Case 1 (see [the section below](#by_var) on `by=` terms for how this works). 150 | 151 | 152 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE} 153 | library(dplyr) 154 | library(mgcv) 155 | library(ggplot2) 156 | library(tidyr) 157 | n_lakes = 200 158 | lake_mean_temps = rnorm(n_lakes, 25,6) 159 | lake_data = as.data.frame(expand.grid(lake = 1:n_lakes, 160 | shore_dist_m = c(0,1,5,10,20,40))) 161 | lake_data$temp = rnorm(n_lakes*6, lake_mean_temps[lake_data$lake], 8) 162 | lake_data$production = with(lake_data, rnorm(n_lakes*6, 163 | temp*3*exp(-shore_dist_m/10),1)) 164 | lake_data = lake_data %>% group_by(lake)%>% 165 | mutate(production = mean(production), 166 | mean_temp = mean(temp))%>% 167 | spread(shore_dist_m, temp) 168 | 169 | #This is our lag matrix for this example 170 | shore_dist_matrix = matrix(c(0,1,5,10,20,40), nrow=n_lakes,ncol=6, byrow = T) 171 | head(shore_dist_matrix) 172 | 173 | #This is our predictor matrix 174 | lake_data$temp_matrix = lake_data %>% 175 | select(`0`:`40`) %>% 176 | as.matrix(.) 177 | head(lake_data) 178 | 179 | #Note: we need to set k=6 here, as we really only have 6 degrees of freedom (one 180 | #for each distance we've measured from the shore). 181 | 182 | nonlin_temp_model = gam(production~s(shore_dist_matrix, by= temp_matrix,k=6), 183 | data=lake_data) 184 | plot(nonlin_temp_model) 185 | 186 | #For comparison, this re-plots the two Case 1 fitted models vs. the true function. 187 | ggplot(aes(temp, value,color=model), data=temp_predict_data)+ 188 | geom_line()+ 189 | scale_color_brewer(palette="Set1")+ 190 | theme_bw(20) 191 | ``` 192 | 193 | ## 2. Key concepts and functions 194 | 195 | ### Using `by=` terms in smoothers {#by_var} 196 | One of the most useful features in `mgcv` is the `by=` argument for smooths. 197 | This has two uses. 198 | 199 | 1. For a given smooth (say, `y~s(x)`), if you set `s(x, by=group)`, where group 200 | is some factor predictor, `mgcv` will fit a separate smooth of x for 201 | each level of that factor. The model will produce a different smooth $s_k(x)$ 202 | for each level k, allowing you to test if a given relationship varies between 203 | groups. 204 | 2. 
If you instead you set `s(x, by=z)` where z is a numerical value, `mgcv` will 205 | fit a varying slopes regression. Instead of modeling y as a smooth function of 206 | x, this will model y as a *linear* function of z, where the slope of the 207 | relationship between z and y changes smoothly as a function of x, so the 208 | predicted value for the ith value of y would be: $y_i \sim f(x_i) \cdot z_i$. 209 | If you're using matrix predictors as discussed above, for numerical predictors 210 | the `by` variable `z` also has to be a matrix with the same dimensions as `x`. 211 | The predicted value in this case will be: $y_i \sim \sum_{j=1}^k f(x_{i,j})\cdot z_{i,j}$, where $j$ are the columns of the matrices $x$ and $z$. 212 | 213 | 214 | ##Calculating dispersal kernels 215 | 216 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE} 217 | yukon_seedling_data = read.csv("data/yukon_seeds/seed_data.csv") 218 | yukon_source_data =read.csv("data/yukon_seeds/seed_source_locations.csv") 219 | seed_dist = matrix(0, nrow = nrow(yukon_seedling_data), 220 | ncol= nrow(yukon_source_data)) 221 | for(i in 1:nrow(yukon_seedling_data)){ 222 | seed_dist[i,] = sqrt((yukon_source_data$X- yukon_seedling_data$X[i])^2 + (yukon_source_data$Y- yukon_seedling_data$Y[i])^2) 223 | } 224 | seed_dist_l = log(seed_dist) 225 | yukon_seedling_data$min_dist_l = apply(seed_dist_l, MARGIN = 1,min) 226 | 227 | basic_dispersal_model = gam(n_spruce~s(min_dist_l)+offset(log(plot_area_m2)), 228 | data=yukon_seedling_data, family=nb) 229 | full_dispersal_model = gam(n_spruce~s(seed_dist_l)+offset(log(plot_area_m2)), 230 | data=yukon_seedling_data, family=nb) 231 | ``` 232 | 233 | -------------------------------------------------------------------------------- /example-spatial-mexdolphins-solutions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Further analysis of pantropical spotted dolphins in the Gulf of Mexico" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | 10 | --- 11 | 12 | # Preamble 13 | 14 | This exercise is based on the [Appendix of Miller et al 2013](http://distancesampling.org/R/vignettes/mexico-analysis.html). In this example we're ignoring all kinds of important things like detectability and availability. This should not be treated as a serious analysis of these data! For a more complete treatment of detection-corrected abundance estimation via distance sampling and generalized additive models, see Miller et al. (2013). 15 | 16 | From that appendix: 17 | 18 | *The analysis is based on a dataset of observations of pantropical dolphins in the Gulf of Mexico (shipped with Distance 6.0 and later). For convenience the data are bundled in an `R`-friendly format, although all of the code necessary for creating the data from the Distance project files is available [on github](http://github.com/dill/mexico-data). The OBIS-SEAMAP page for the data may be found at the [SEFSC GoMex Oceanic 1996](http://seamap.env.duke.edu/dataset/25) survey page.* 19 | 20 | 21 | ## Doing these exercises 22 | 23 | Probably the easiest way to do these exercises is to open this document in RStudio and go through the code blocks one by one (hitting the "play" button in the editor window), filling in the code where necessary and executing the commands one-by-one. You can then compile the document once you're done to check that everything works. 
24 | 25 | # Data format 26 | 27 | The data are provided in the `data/mexdolphins` folder as the file `mexdolphins.RData`. Loading this we can see what is provided: 28 | 29 | 30 | ```{r loaddata} 31 | load("data/mexdolphins/mexdolphins.RData") 32 | ls() 33 | ``` 34 | 35 | - `mexdolphins` the `data.frame` containing the observations and covariates, used to fit the model. 36 | - `pred_latlong` an `sp` object that has the shapefile for the prediction grid, used for fancy graphs 37 | - `preddata` prediction grid without any fancy spatial stuff 38 | 39 | Looking further into the `mexdolphins` frame we see: 40 | 41 | ```{r frameinspect} 42 | str(mexdolphins) 43 | ``` 44 | 45 | A brief explanation of each entry: 46 | 47 | - `Sample.Label` identifier for the effort "segment" (approximately square sampling area) 48 | - `Transect.Label` identifier for the transect that this segment belongs to 49 | - `longitude`, `latitude` location in lat/long of this segment 50 | - `x`, `y` location in projected coordinates (projected using the [North American Lambert Conformal Conic projection](https://en.wikipedia.org/wiki/Lambert_conformal_conic_projection)) 51 | - `Effort` the length of the current segment 52 | - `depth` the bathymetry at the segment's position 53 | - `count` number of dolphins observed in this segment 54 | - `segment.area` the area of the segment (`Effort` multiplied by the width of the segment 55 | - `off.set` the logarithm of the `segment.area` multiplied by a correction for detectability (see link to appendix above for more information on this) 56 | 57 | 58 | # Modelling 59 | 60 | Our objective here is to build a spatially explicit model of abundance of the dolphins. In some sense this is a kind of species distribution model. 61 | 62 | Our possible covariates to model abundance are location and depth. These are fairly good predictors of the abundance (SPOILER ALERT), though we could probably improve the model further by including things like sea surface temperature and chlorophyll *a*. 63 | 64 | ## A simple model to start with 65 | 66 | We can begin with the model we showed in the first lecture: 67 | 68 | ```{r simplemodel} 69 | library(mgcv) 70 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set), 71 | data = mexdolphins, 72 | family = quasipoisson(), 73 | method = "REML") 74 | ``` 75 | 76 | That is we fit the counts as a function of depth, using the offset to take into account effort. We use a quasi-Poisson response (i.e., modelling just the mean-variance relationship such that the variance is proportional to the mean) and use REML for smoothness selection. 77 | 78 | We can check the assumptions of this model by using `gam.check`: 79 | 80 | ```{r simplemodel-check} 81 | gam.check(dolphins_depth) 82 | ``` 83 | 84 | As is usual for count data, these plots are a bit tricky to interpret. For example the residuals vs linear predictor plot in the top right has that nasty line through it that makes looking for pattern tricky. We can see easily that the line equates to the zero count observations: 85 | 86 | ```{r zeroresids, fig.width=7, fig.height=7} 87 | # code from the insides of mgcv::gam.check 88 | resid <- residuals(dolphins_depth, type="deviance") 89 | linpred <- napredict(dolphins_depth$na.action, dolphins_depth$linear.predictors) 90 | plot(linpred, resid, main = "Resids vs. 
linear pred.", 91 | xlab = "linear predictor", ylab = "residuals") 92 | 93 | # now add red dots corresponding to the zero counts 94 | points(linpred[mexdolphins$count==0],resid[mexdolphins$count==0], 95 | pch=19, col="red", cex=0.5) 96 | ``` 97 | 98 | We can use randomised quantile residuals instead of deviance residuals to get around this in some cases (though not quasi-Poisson, as we don't have a proper likelihood!). 99 | 100 | Ignoring the plots for now (as we'll address them in the next section), let's look at the text output. It seems that the `k` value we set (or rather the default of 10) has been adequate. 101 | 102 | We could increase the value of `k` by replacing the `s(...)` with, for example, `s(depth, k=25)` (for a possibly very wiggly function) or `s(depth, k=3)` (for a much less wiggly function). Making `k` big will create a bigger design matrix and penalty matrix. 103 | 104 | 105 | **Exercise** 106 | 107 | Look at the differences in the size of the design and penalty matrices by using `dim(model.matrix(...))` and `dim(model$smooth[[1]]$S[[1]])`, replacing `...` and `model` appropriately for models with `k=3` and `k=30`. 108 | 109 | ```{r simplemodel-bigsmall} 110 | dolphins_depth_bigk <- gam(count ~ s(depth, k=30) + offset(off.set), 111 | data = mexdolphins, 112 | family = quasipoisson(), 113 | method = "REML") 114 | dim(model.matrix(dolphins_depth_bigk)) 115 | dim(dolphins_depth_bigk$smooth[[1]]$S[[1]]) 116 | dolphins_depth_smallk <- gam(count ~ s(depth, k=3) + offset(off.set), 117 | data = mexdolphins, 118 | family = quasipoisson(), 119 | method = "REML") 120 | dim(model.matrix(dolphins_depth_smallk)) 121 | dim(dolphins_depth_smallk$smooth[[1]]$S[[1]]) 122 | ``` 123 | 124 | (Don't worry about the many square brackets etc to get the penalty matrix!) 125 | 126 | ### Plotting 127 | 128 | We can plot the smooth we fitted using `plot`. 129 | 130 | **Exercise** 131 | 132 | Compare the first model we fitted with the two using different `k` values above. Use `par(mfrow=c(1,3))` to put them all in one graphic. Look at `?plot.gam` and plot the confidence intervals as a filled "confidence band". Title the plots appropriately so you can check which is which. 133 | 134 | ```{r plotk, } 135 | par(mfrow=c(1,3)) 136 | plot(dolphins_depth, shade=TRUE, main="Default k") 137 | plot(dolphins_depth_bigk, shade=TRUE, main="big k") 138 | plot(dolphins_depth_smallk, shade=TRUE, main="small k") 139 | ``` 140 | 141 | ## Count distributions 142 | 143 | In general quasi-Poisson doesn't seem to do too great a job at modelling data with many zeros. Luckily we have a few tricks up our sleeves... 144 | 145 | 146 | ### Tweedie 147 | 148 | Adding a smooth of `x` and `y` to our model with `s(x,y)`, we can then switch the `family=` argument to use `tw()` for a Tweedie distribution. 149 | 150 | ```{r tw} 151 | dolphins_xy_tw <- gam(count ~ s(x,y) + s(depth) + offset(off.set), 152 | data = mexdolphins, 153 | family = tw(), 154 | method = "REML") 155 | ``` 156 | 157 | More information on Tweedie distributions can be found in Foster & Bravington (2012) and Shono (2008). 158 | 159 | 160 | 161 | ### Negative binomial 162 | 163 | **Exercise** 164 | 165 | Now do the same using the negative binomial distribution (`nb()`).
166 | 167 | ```{r nb} 168 | dolphins_xy_nb <- gam(count ~ s(x,y) + s(depth) + offset(off.set), 169 | data = mexdolphins, 170 | family = nb(), 171 | method = "REML") 172 | ``` 173 | 174 | Looking at the quantile-quantile plots only in the `gam.check` output for these two models, which do you prefer? Why? 175 | 176 | *Looks like Tweedie is better here as the points are closer to the x=y line in the Q-Q plot. Also the histogram of residuals looks more (though not very) normal.* 177 | 178 | Look at the results of calling `summary` on both models and note that there are differences in the resulting models, due to the differing mean-variance relationships. 179 | 180 | ## Smoothers 181 | 182 | Now let's move onto using different bases for the smoothers. We have a couple of different options here. 183 | 184 | 185 | ### Thin plate splines with shrinkage 186 | 187 | By default we use the `"tp"` basis. This is just plain thin plate regression splines (as defined in Wood, 2003). We can also use the `"ts"` basis, which is the same but with extra shrinkage on the usually unpenalised parts of model. In the univariate case this is the linear slope term of the smooth. 188 | 189 | **Exercise** 190 | 191 | Compare the results from one of the models above with a version using the thin plate with shrinkage using the `bs="ts"` argument to `s()` for both terms. 192 | 193 | ```{r tw-ts} 194 | dolphins_xy_tw_ts <- gam(count ~ s(x,y, bs="ts") + s(depth, bs="ts") + 195 | offset(off.set), 196 | data = mexdolphins, 197 | family = tw(), 198 | method = "REML") 199 | ``` 200 | 201 | What are the differences (use `summary`)? 202 | 203 | *EDFs are different for both terms* 204 | 205 | What are the visual differences (use `plot`)? 206 | 207 | *Not really much difference here!* 208 | 209 | 210 | ### Soap film smoother 211 | 212 | We can use a soap film smoother (Wood, 2008) to take into account a complex boundary, such as a coastline or islands. 213 | 214 | Here I've built a simple coastline of the US states bordering the Gulf of Mexico (see the `soap_pred.R` file for how this was constructed). We can load up this boundary and the prediction grid from the following `RData` file: 215 | 216 | ```{r soapy} 217 | load("data/mexdolphins/soapy.RData") 218 | ``` 219 | 220 | Now we need to build knots for the soap film, for this we simply create a grid, then find the grid points inside the boundary. We don't need too many of them. 221 | 222 | ```{r soapknots} 223 | soap_knots <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), length.out=10), 224 | y=seq(min(xy_bnd$y), max(xy_bnd$y), length.out=10)) 225 | x <- soap_knots$x; y <- soap_knots$y 226 | ind <- inSide(xy_bnd, x, y) 227 | rm(x,y) 228 | soap_knots <- soap_knots[ind, ] 229 | ## inSide doesn't work perfectly, so if you get an error like: 230 | ## Error in crunch.knots(ret$G, knots, x0, y0, dx, dy) : 231 | ## knot 54 is on or outside boundary 232 | ## just remove that knot as follows: 233 | soap_knots <- soap_knots[-8, ] 234 | soap_knots <- soap_knots[-54, ] 235 | ``` 236 | 237 | We can now fit our model. Note that we specify a basis via `bs=` and the boundary via `xt` (for e`xt`ra information) in the `s()` term. We also include the knots as a `knots=` argument to `gam`. 
238 | 239 | ```{r soapmodel} 240 | dolphins_soap <- gam(count ~ s(x,y, bs="so", xt=list(bnd=list(xy_bnd))) + 241 | offset(off.set), 242 | data = mexdolphins, 243 | family = tw(), 244 | knots = soap_knots, 245 | method = "REML") 246 | 247 | ``` 248 | 249 | **Exercise** 250 | 251 | Look at the `summary` output for this model and compare it to the other models. 252 | 253 | ```{r soap-summary} 254 | summary(dolphins_soap) 255 | ``` 256 | 257 | The plotting function for soap film smooths looks much nicer by default than for other 2D smooths -- try it out. 258 | 259 | 260 | ```{r soap-plot} 261 | plot(dolphins_soap) 262 | ``` 263 | 264 | # Predictions 265 | 266 | As we saw in the intro to GAMs slides, `predict` is your friend when it comes to making predictions for the GAM. 267 | 268 | We can do this very simply, calling predict as one would with a `glm`. For example: 269 | 270 | ```{r pred-qp} 271 | pred_qp <- predict(dolphins_depth, preddata, type="response") 272 | ``` 273 | 274 | Now, this just gives a long vector of numbers for the predicted number of animals per cell. We can find the total abundance using `sum`. 275 | 276 | **Exercise** 277 | 278 | How many dolphins are there in the total area? What is the maximum in a given cell? What is the minimum? 279 | 280 | ```{r total-max-min} 281 | sum(pred_qp) 282 | range(pred_qp) 283 | ``` 284 | 285 | ## Plotting predictions 286 | 287 | *Note that this section requires quite a few additional packages to run the examples, so may not run the first time. You can use* `install.packages` *to grab the packages you need.* 288 | 289 | Plotting predictions in projected coordinate systems is tricky. I'll show to methods here but not go into too much detail, as that's not the aim of this workshop. 290 | 291 | For the non-soap models, we'll use the below helper function to put the predictions into a bunch of squares and then return an appropriate `ggplot2` object for us to plot: 292 | 293 | ```{r gridplotfn} 294 | library(plyr) 295 | # fill must be in the same order as the polygon data 296 | grid_plot_obj <- function(fill, name, sp){ 297 | 298 | # what was the data supplied? 
299 | names(fill) <- NULL 300 | row.names(fill) <- NULL 301 | data <- data.frame(fill) 302 | names(data) <- name 303 | 304 | spdf <- SpatialPolygonsDataFrame(sp, data) 305 | spdf@data$id <- rownames(spdf@data) 306 | spdf.points <- fortify(spdf, region="id") 307 | spdf.df <- join(spdf.points, spdf@data, by="id") 308 | 309 | # seems to store the x/y even when projected as labelled as 310 | # "long" and "lat" 311 | spdf.df$x <- spdf.df$long 312 | spdf.df$y <- spdf.df$lat 313 | 314 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df) 315 | } 316 | ``` 317 | 318 | Then we can plot the predicted abundance: 319 | 320 | ```{r plotpred} 321 | library(ggplot2) 322 | library(viridis) 323 | library(sp) 324 | library(rgeos) 325 | library(rgdal) 326 | library(maptools) 327 | 328 | # projection string 329 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 330 | # need sp to transform the data 331 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4) 332 | 333 | p <- ggplot() + 334 | # abundance 335 | grid_plot_obj(pred_qp, "N", pred.polys) + 336 | # survey lines 337 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 338 | # observations 339 | geom_point(aes(x, y, size=count), 340 | data=mexdolphins[mexdolphins$count>0,], 341 | colour="red", alpha=I(0.7)) + 342 | # make the coordinate system fixed 343 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 344 | xlim = range(mexdolphins$x)) + 345 | # use the viridis colourscheme, which is more colourblind-friendly 346 | scale_fill_viridis() + 347 | # labels 348 | labs(fill="Predicted\ndensity",x="x",y="y",size="Count") + 349 | # keep things simple 350 | theme_minimal() 351 | print(p) 352 | ``` 353 | 354 | If you have the `maps` package installed you can also try the following plot the coastline too: 355 | 356 | ```{r maptoo} 357 | library(maps) 358 | map_dat <- map_data("usa") 359 | map_sp <- SpatialPoints(map_dat[,c("long","lat")]) 360 | 361 | # give the sp object a projection 362 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84") 363 | # re-project 364 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4) 365 | map_dat$x <- map_sp.t$long 366 | map_dat$y <- map_sp.t$lat 367 | 368 | p <- p + geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 369 | 370 | print(p) 371 | ``` 372 | 373 | ## Comparing predictions from soap and thin plate splines 374 | 375 | The `soap_preddata` frame has a much larger prediction grid that covers a much wider area. 376 | 377 | Predicting using the soap film smoother is exactly the same as for the other smoothers: 378 | 379 | ```{r soap-pred} 380 | soap_pred_N <- predict(dolphins_soap, soap_preddata, type="response") 381 | soap_pred_N <- cbind.data.frame(soap_preddata, N=soap_pred_N, model="soap") 382 | ``` 383 | 384 | Use the above as a template (along with `ggplot2::facet_wrap()`) to build a comparison plot with the predictions of the soap film and another model. 
385 | 386 | ```{r xy-soap-pred} 387 | dolphins_xy_q <- gam(count ~ s(x,y, bs="ts") + offset(off.set), 388 | data = mexdolphins, 389 | family = tw(), 390 | method = "REML") 391 | xy_pred_N <- predict(dolphins_xy_q, soap_preddata, type="response") 392 | xy_pred_N <- cbind.data.frame(soap_preddata, N=xy_pred_N, model="xy") 393 | all_pred_N <- rbind(soap_pred_N, xy_pred_N) 394 | 395 | p <- ggplot() + 396 | # abundance 397 | geom_tile(aes(x=x, y=y, width=sqrt(area), height=sqrt(area), fill=N), data=all_pred_N) + 398 | # facet it! 399 | facet_wrap(~model) + 400 | # make the coordinate system fixed 401 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 402 | xlim = range(mexdolphins$x)) + 403 | # use the viridis colourscheme, which is more colourblind-friendly 404 | scale_fill_viridis() + 405 | # labels 406 | labs(fill="Predicted\ndensity", x="x", y="y") + 407 | # keep things simple 408 | theme_minimal() + 409 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 410 | print(p) 411 | ``` 412 | 413 | ## Plotting uncertainty 414 | 415 | **Exercise** 416 | 417 | We can use the `se.fit=TRUE` argument to get the per-cell standard errors, then divide these through by the abundances to get a coefficient of variation (CV) per cell. We can then plot that using the same technique as above. Try this out below. 418 | 419 | Note that the resulting object from setting `se.fit=TRUE` is a list with two elements. 420 | 421 | ```{r CVmap} 422 | pred_se_qp <- predict(dolphins_depth, preddata, type="response", se.fit=TRUE) 423 | cv <- pred_se_qp$se.fit/pred_se_qp$fit 424 | 425 | p <- ggplot() + 426 | # abundance 427 | grid_plot_obj(cv, "CV", pred.polys) + 428 | # survey lines 429 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 430 | # make the coordinate system fixed 431 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 432 | xlim = range(mexdolphins$x)) + 433 | # use the viridis colourscheme, which is more colourblind-friendly 434 | scale_fill_viridis() + 435 | # labels 436 | labs(fill="CV",x="x",y="y",size="Count") + 437 | # keep things simple 438 | theme_minimal() 439 | print(p) 440 | ``` 441 | 442 | Compare the uncertainties of the different models you've fitted so far. 443 | 444 | ## `lpmatrix` magic 445 | 446 | Now for some `lpmatrix` magic. We said before that the `lpmatrix` maps the parameters onto the predictions. Let's show that's true: 447 | 448 | ```{r lppred} 449 | # make the Lp matrix 450 | lp <- predict(dolphins_depth, preddata, type="lpmatrix") 451 | # get the linear predictor 452 | lin_pred <- lp %*% coef(dolphins_depth) 453 | # apply the link and multiply by the offset as lpmatrix ignores this 454 | pred <- preddata$area * exp(lin_pred) 455 | # all the same? 456 | all.equal(pred[,1], as.numeric(pred_qp), check.attributes=FALSE) 457 | ``` 458 | 459 | What else can we do? We can also grab the uncertainty for the sum of the predictions: 460 | 461 | ```{r lpNvar} 462 | # extract the variance-covariance matrix 463 | vc <- vcov(dolphins_depth) 464 | 465 | # reproject the var-covar matrix to be on the linear predictor scale 466 | lin_pred_var <- tcrossprod(lp %*% vc, lp) 467 | 468 | # pre and post multiply by the derivatives of the link, evaluated at the 469 | # predictions -- since the link is exp, we just use the predictions 470 | pred_var <- matrix(pred,nrow=1) %*% lin_pred_var %*% matrix(pred,ncol=1) 471 | 472 | # we can then calculate a total CV 473 | sqrt(pred_var)/sum(pred) 474 | ``` 475 | 476 | As you can see, the `lpmatrix` can be very useful! 
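As a further (optional) illustration of what the `lpmatrix` makes easy, here is a small sketch of simulating from the approximate posterior of the model coefficients to get an uncertainty interval for the total abundance. It reuses the `lp` matrix and `preddata` from the chunks above and uses `mgcv::rmvn()` to draw the coefficient vectors; the number of draws (1000), the seed, and the object names are arbitrary choices for illustration.

```{r lp-posterior-sim, eval=FALSE}
## Sketch: posterior simulation for total abundance via the lpmatrix
set.seed(1)
# draw coefficient vectors from their approximate posterior
betas <- rmvn(1000, coef(dolphins_depth), vcov(dolphins_depth))
# each draw gives a predicted surface; sum over cells for a total abundance
Nhat <- apply(betas, 1, function(b) sum(preddata$area * exp(lp %*% b)))
# an approximate 95% interval for the total abundance
quantile(Nhat, c(0.025, 0.975))
```

For a well-behaved model this should give a spread similar to the delta-method style calculation above, but without the linearisation step.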
477 | 478 | # Extra credit 479 | 480 | - Experiment with `vis.gam` (watch out for the `view=` option) and plot the 2D smooths you've fitted. Check out the `too.far=` agument. 481 | - Use the randomized quantile residuals plot to inspect the models you fitted above. What are the differences you see between them and the deviance residuals you see above? Which model would you choose? 482 | - Redo the side-by-side soap film and another smoother plot, but showing the coefficient of variation. What difference does the soap film make? 483 | 484 | # References 485 | 486 | - Foster, S. D., & Bravington, M. V. (2012). A Poisson–Gamma model for analysis of ecological non-negative continuous data. Environmental and Ecological Statistics, 20(4), 533–552. http://doi.org/10.1007/s10651-012-0233-0 487 | - Miller, D. L., Burt, M. L., Rexstad, E. A., & Thomas, L. (2013). Spatial models for distance sampling data: recent developments and future directions. Methods in Ecology and Evolution, 4(11), 1001–1010. http://doi.org/10.1111/2041-210X.12105 488 | - Shono, H. (2008). Application of the Tweedie distribution to zero-catch data in CPUE analysis. Fisheries Research, 93(1-2), 154–162. http://doi.org/10.1016/j.fishres.2008.03.006 489 | - Wood, S. N. (2003). Thin plate regression splines. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 65(1), 95–114. 490 | - Wood, S. N., Bravington, M. V., & Hedley, S. L. (2008). Soap film smoothing. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 70(5), 931–955. http://doi.org/10.1111/j.1467-9868.2008.00665.x 491 | 492 | -------------------------------------------------------------------------------- /example-spatial-mexdolphins.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Further analysis of pantropical spotted dolphins in the Gulf of Mexico" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | 10 | --- 11 | 12 | # Preamble 13 | 14 | This exercise is based on the [Appendix of Miller et al 2013](http://distancesampling.org/R/vignettes/mexico-analysis.html). In this example we're ignoring all kinds of important things like detectability and availability. This should not be treated as a serious analysis of these data! For a more complete treatment of detection-corrected abundance estimation via distance sampling and generalized additive models, see Miller et al. (2013). 15 | 16 | From that appendix: 17 | 18 | *The analysis is based on a dataset of observations of pantropical dolphins in the Gulf of Mexico (shipped with Distance 6.0 and later). For convenience the data are bundled in an `R`-friendly format, although all of the code necessary for creating the data from the Distance project files is available [on github](http://github.com/dill/mexico-data). The OBIS-SEAMAP page for the data may be found at the [SEFSC GoMex Oceanic 1996](http://seamap.env.duke.edu/dataset/25) survey page.* 19 | 20 | 21 | ## Doing these exercises 22 | 23 | Probably the easiest way to do these exercises is to open this document in RStudio and go through the code blocks one by one (hitting the "play" button in the editor window), filling in the code where necessary and executing the commands one-by-one. You can then compile the document once you're done to check that everything works. 24 | 25 | # Data format 26 | 27 | The data are provided in the `data/mexdolphins` folder as the file `mexdolphins.RData`. 
Loading this we can see what is provided: 28 | 29 | 30 | ```{r loaddata} 31 | load("data/mexdolphins/mexdolphins.RData") 32 | ls() 33 | ``` 34 | 35 | - `mexdolphins` the `data.frame` containing the observations and covariates, used to fit the model. 36 | - `pred_latlong` an `sp` object that has the shapefile for the prediction grid, used for fancy graphs 37 | - `preddata` prediction grid without any fancy spatial stuff 38 | 39 | Looking further into the `mexdolphins` frame we see: 40 | 41 | ```{r frameinspect} 42 | str(mexdolphins) 43 | ``` 44 | 45 | A brief explanation of each entry: 46 | 47 | - `Sample.Label` identifier for the effort "segment" (approximately square sampling area) 48 | - `Transect.Label` identifier for the transect that this segment belongs to 49 | - `longitude`, `latitude` location in lat/long of this segment 50 | - `x`, `y` location in projected coordinates (projected using the [North American Lambert Conformal Conic projection](https://en.wikipedia.org/wiki/Lambert_conformal_conic_projection)) 51 | - `Effort` the length of the current segment 52 | - `depth` the bathymetry at the segment's position 53 | - `count` number of dolphins observed in this segment 54 | - `segment.area` the area of the segment (`Effort` multiplied by the width of the segment 55 | - `off.set` the logarithm of the `segment.area` multiplied by a correction for detectability (see link to appendix above for more information on this) 56 | 57 | 58 | # Modelling 59 | 60 | Our objective here is to build a spatially explicit model of abundance of the dolphins. In some sense this is a kind of species distribution model. 61 | 62 | Our possible covariates to model abundance are location and depth. These are fairly good predictors of the abundance (SPOILER ALERT), though we could probably improve the model further by including things like sea surface temperature and chlorophyll *a*. 63 | 64 | ## A simple model to start with 65 | 66 | We can begin with the model we showed in the first lecture: 67 | 68 | ```{r simplemodel} 69 | library(mgcv) 70 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set), 71 | data = mexdolphins, 72 | family = quasipoisson(), 73 | method = "REML") 74 | ``` 75 | 76 | That is we fit the counts as a function of depth, using the offset to take into account effort. We use a quasi-Poisson response (i.e., modelling just the mean-variance relationship such that the variance is proportional to the mean) and use REML for smoothness selection. 77 | 78 | We can check the assumptions of this model by using `gam.check`: 79 | 80 | ```{r simplemodel-check} 81 | gam.check(dolphins_depth) 82 | ``` 83 | 84 | As is usual for count data, these plots are a bit tricky to interpret. For example the residuals vs linear predictor plot in the top right has that nasty line through it that makes looking for pattern tricky. We can see easily that the line equates to the zero count observations: 85 | 86 | ```{r zeroresids, fig.width=7, fig.height=7} 87 | # code from the insides of mgcv::gam.check 88 | resid <- residuals(dolphins_depth, type="deviance") 89 | linpred <- napredict(dolphins_depth$na.action, dolphins_depth$linear.predictors) 90 | plot(linpred, resid, main = "Resids vs. 
linear pred.", 91 | xlab = "linear predictor", ylab = "residuals") 92 | 93 | # now add red dots corresponding to the zero counts 94 | points(linpred[mexdolphins$count==0],resid[mexdolphins$count==0], 95 | pch=19, col="red", cex=0.5) 96 | ``` 97 | 98 | We can use randomised quantile residuals instead of deviance residuals to get around this in some cases (though not quasi-Poisson, as we don't have a proper likelihood!). 99 | 100 | Ignoring the plots for now (as we'll address them in the next section), let's look at the text output. It seems that the `k` value we set (or rather the default of 10) has been adequate. 101 | 102 | We could increase the value of `k` by replacing the `s(...)` with, for example, `s(depth, k=25)` (for a possibly very wiggly function) or `s(depth, k=3)` (for a much less wiggly function). Making `k` big will create a bigger design matrix and penalty matrix. 103 | 104 | 105 | **Exercise** 106 | 107 | Look at the differences in the size of the design and penalty matrices by using `dim(model.matrix(...))` and `dim(model$smooth[[1]]$S[[1]])`, replacing `...` and `model` appropriately for models with `k=3` and `k=30`. 108 | 109 | ```{r simplemodel-bigsmall} 110 | 111 | ``` 112 | 113 | (Don't worry about the many square brackets etc to get the penalty matrix!) 114 | 115 | ### Plotting 116 | 117 | We can plot the smooth we fitted using `plot`. 118 | 119 | **Exercise** 120 | 121 | Compare the first model we fitted with the two using different `k` values above. Use `par(mfrow=c(1,3))` to put them all in one graphic. Look at `?plot.gam` and plot the confidence intervals as a filled "confidence band". Title the plots appropriately so you can check which is which. 122 | 123 | ```{r plotk, } 124 | par(mfrow=c(1,3)) 125 | 126 | ``` 127 | 128 | ## Count distributions 129 | 130 | In general quasi-Poisson doesn't seem to do too great a job at modelling data with many zeros. Luckily we have a few tricks up our sleeves... 131 | 132 | 133 | ### Tweedie 134 | 135 | Adding a smooth of `x` and `y` to our model with `s(x,y)`, we can then switch the `family=` argument to use `tw()` for a Tweedie distribution. 136 | 137 | ```{r tw} 138 | dolphins_xy_tw <- gam(count ~ s(x,y) + s(depth) + offset(off.set), 139 | data = mexdolphins, 140 | family = tw(), 141 | method = "REML") 142 | ``` 143 | 144 | More information on Tweedie distributions can be found in Foster & Bravington (2012) and Shono (2008). 145 | 146 | 147 | 148 | ### Negative binomial 149 | 150 | **Exercise** 151 | 152 | Now do the same using the negative binomial distribution (`nb()`). 153 | 154 | ```{r nb} 155 | 156 | ``` 157 | 158 | Looking at the quantile-quantile plots only in the `gam.check` output for these two models, which do you prefer? Why? 159 | 160 | *Looks like Tweedie is better here as the points are closer to the x=y line in the Q-Q plot. Also the histogram of residuals looks more (though not very) normal.* 161 | 162 | Look at the results of calling `summary` on both models and note that there are differences in the resulting models, due to the differing mean-variance relationships. 163 | 164 | ## Smoothers 165 | 166 | Now let's move onto using different bases for the smoothers. We have a couple of different options here. 167 | 168 | 169 | ### Thin plate splines with shrinkage 170 | 171 | By default we use the `"tp"` basis. This is just plain thin plate regression splines (as defined in Wood, 2003).
We can also use the `"ts"` basis, which is the same but with extra shrinkage on the usually unpenalised parts of model. In the univariate case this is the linear slope term of the smooth. 172 | 173 | **Exercise** 174 | 175 | Compare the results from one of the models above with a version using the thin plate with shrinkage using the `bs="ts"` argument to `s()` for both terms. 176 | 177 | ```{r tw-ts} 178 | ``` 179 | 180 | What are the differences (use `summary`)? 181 | 182 | What are the visual differences (use `plot`)? 183 | 184 | 185 | ### Soap film smoother 186 | 187 | We can use a soap film smoother (Wood, 2008) to take into account a complex boundary, such as a coastline or islands. 188 | 189 | Here I've built a simple coastline of the US states bordering the Gulf of Mexico (see the `soap_pred.R` file for how this was constructed). We can load up this boundary and the prediction grid from the following `RData` file: 190 | 191 | ```{r soapy} 192 | load("data/mexdolphins/soapy.RData") 193 | ``` 194 | 195 | Now we need to build knots for the soap film, for this we simply create a grid, then find the grid points inside the boundary. We don't need too many of them. 196 | 197 | ```{r soapknots} 198 | soap_knots <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), length.out=10), 199 | y=seq(min(xy_bnd$y), max(xy_bnd$y), length.out=10)) 200 | x <- soap_knots$x; y <- soap_knots$y 201 | ind <- inSide(xy_bnd, x, y) 202 | rm(x,y) 203 | soap_knots <- soap_knots[ind, ] 204 | ## inSide doesn't work perfectly, so if you get an error like: 205 | ## Error in crunch.knots(ret$G, knots, x0, y0, dx, dy) : 206 | ## knot 54 is on or outside boundary 207 | ## just remove that knot as follows: 208 | soap_knots <- soap_knots[-8, ] 209 | soap_knots <- soap_knots[-54, ] 210 | ``` 211 | 212 | We can now fit our model. Note that we specify a basis via `bs=` and the boundary via `xt` (for e`xt`ra information) in the `s()` term. We also include the knots as a `knots=` argument to `gam`. 213 | 214 | ```{r soapmodel} 215 | dolphins_soap <- gam(count ~ s(x,y, bs="so", xt=list(bnd=list(xy_bnd))) + 216 | offset(off.set), 217 | data = mexdolphins, 218 | family = tw(), 219 | knots = soap_knots, 220 | method = "REML") 221 | 222 | ``` 223 | 224 | **Exercise** 225 | 226 | Look at the `summary` output for this model and compare it to the other models. 227 | 228 | ```{r soap-summary} 229 | ``` 230 | 231 | The plotting function for soap film smooths looks much nicer by default than for other 2D smooths -- try it out. 232 | 233 | 234 | ```{r soap-plot} 235 | ``` 236 | 237 | # Predictions 238 | 239 | As we saw in the intro to GAMs slides, `predict` is your friend when it comes to making predictions for the GAM. 240 | 241 | We can do this very simply, calling predict as one would with a `glm`. For example: 242 | 243 | ```{r pred-qp} 244 | pred_qp <- predict(dolphins_depth, preddata, type="response") 245 | ``` 246 | 247 | Now, this just gives a long vector of numbers for the predicted number of animals per cell. We can find the total abundance using `sum`. 248 | 249 | **Exercise** 250 | 251 | How many dolphins are there in the total area? What is the maximum in a given cell? What is the minimum? 252 | 253 | ## Plotting predictions 254 | 255 | *Note that this section requires quite a few additional packages to run the examples, so may not run the first time. You can use* `install.packages` *to grab the packages you need.* 256 | 257 | Plotting predictions in projected coordinate systems is tricky. 
I'll show to methods here but not go into too much detail, as that's not the aim of this workshop. 258 | 259 | For the non-soap models, we'll use the below helper function to put the predictions into a bunch of squares and then return an appropriate `ggplot2` object for us to plot: 260 | 261 | ```{r gridplotfn} 262 | library(plyr) 263 | # fill must be in the same order as the polygon data 264 | grid_plot_obj <- function(fill, name, sp){ 265 | 266 | # what was the data supplied? 267 | names(fill) <- NULL 268 | row.names(fill) <- NULL 269 | data <- data.frame(fill) 270 | names(data) <- name 271 | 272 | spdf <- SpatialPolygonsDataFrame(sp, data) 273 | spdf@data$id <- rownames(spdf@data) 274 | spdf.points <- fortify(spdf, region="id") 275 | spdf.df <- join(spdf.points, spdf@data, by="id") 276 | 277 | # seems to store the x/y even when projected as labelled as 278 | # "long" and "lat" 279 | spdf.df$x <- spdf.df$long 280 | spdf.df$y <- spdf.df$lat 281 | 282 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df) 283 | } 284 | ``` 285 | 286 | Then we can plot the predicted abundance: 287 | 288 | ```{r plotpred} 289 | library(ggplot2) 290 | library(viridis) 291 | library(sp) 292 | library(rgeos) 293 | library(rgdal) 294 | library(maptools) 295 | 296 | # projection string 297 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 298 | # need sp to transform the data 299 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4) 300 | 301 | p <- ggplot() + 302 | # abundance 303 | grid_plot_obj(pred_qp, "N", pred.polys) + 304 | # survey lines 305 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 306 | # observations 307 | geom_point(aes(x, y, size=count), 308 | data=mexdolphins[mexdolphins$count>0,], 309 | colour="red", alpha=I(0.7)) + 310 | # make the coordinate system fixed 311 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 312 | xlim = range(mexdolphins$x)) + 313 | # use the viridis colourscheme, which is more colourblind-friendly 314 | scale_fill_viridis() + 315 | # labels 316 | labs(fill="Predicted\ndensity",x="x",y="y",size="Count") + 317 | # keep things simple 318 | theme_minimal() 319 | print(p) 320 | ``` 321 | 322 | If you have the `maps` package installed you can also try the following plot the coastline too: 323 | 324 | ```{r maptoo} 325 | library(maps) 326 | map_dat <- map_data("usa") 327 | map_sp <- SpatialPoints(map_dat[,c("long","lat")]) 328 | 329 | # give the sp object a projection 330 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84") 331 | # re-project 332 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4) 333 | map_dat$x <- map_sp.t$long 334 | map_dat$y <- map_sp.t$lat 335 | 336 | p <- p + geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 337 | 338 | print(p) 339 | ``` 340 | 341 | ## Comparing predictions from soap and thin plate splines 342 | 343 | The `soap_preddata` frame has a much larger prediction grid that covers a much wider area. 344 | 345 | Predicting using the soap film smoother is exactly the same as for the other smoothers: 346 | 347 | ```{r soap-pred} 348 | soap_pred_N <- predict(dolphins_soap, soap_preddata, type="response") 349 | soap_pred_N <- cbind.data.frame(soap_preddata, N=soap_pred_N, model="soap") 350 | ``` 351 | 352 | Use the above as a template (along with `ggplot2::facet_wrap()`) to build a comparison plot with the predictions of the soap film and another model. 
353 | 354 | ```{r xy-soap-pred} 355 | dolphins_xy_q <- gam(count ~ s(x,y, bs="ts") + offset(off.set), 356 | data = mexdolphins, 357 | family = tw(), 358 | method = "REML") 359 | xy_pred_N <- predict(dolphins_xy_q, soap_preddata, type="response") 360 | xy_pred_N <- cbind.data.frame(soap_preddata, N=xy_pred_N, model="xy") 361 | all_pred_N <- rbind(soap_pred_N, xy_pred_N) 362 | 363 | p <- ggplot() + 364 | # abundance 365 | geom_tile(aes(x=x, y=y, width=sqrt(area), height=sqrt(area), fill=N), data=all_pred_N) + 366 | # facet it! 367 | facet_wrap(~model) + 368 | # make the coordinate system fixed 369 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 370 | xlim = range(mexdolphins$x)) + 371 | # use the viridis colourscheme, which is more colourblind-friendly 372 | scale_fill_viridis() + 373 | # labels 374 | labs(fill="Predicted\ndensity", x="x", y="y") + 375 | # keep things simple 376 | theme_minimal() + 377 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 378 | print(p) 379 | ``` 380 | 381 | ## Plotting uncertainty 382 | 383 | **Exercise** 384 | 385 | We can use the `se.fit=TRUE` argument to get the per-cell standard errors, then divide these through by the abundances to get a coefficient of variation (CV) per cell. We can then plot that using the same technique as above. Try this out below. 386 | 387 | Note that the resulting object from setting `se.fit=TRUE` is a list with two elements. 388 | 389 | ```{r CVmap} 390 | 391 | ``` 392 | 393 | Compare the uncertainties of the different models you've fitted so far. 394 | 395 | ## `lpmatrix` magic 396 | 397 | Now for some `lpmatrix` magic. We said before that the `lpmatrix` maps the parameters onto the predictions. Let's show that's true: 398 | 399 | ```{r lppred} 400 | # make the Lp matrix 401 | lp <- predict(dolphins_depth, preddata, type="lpmatrix") 402 | # get the linear predictor 403 | lin_pred <- lp %*% coef(dolphins_depth) 404 | # apply the link and multiply by the offset as lpmatrix ignores this 405 | pred <- preddata$area * exp(lin_pred) 406 | # all the same? 407 | all.equal(pred[,1], as.numeric(pred_qp), check.attributes=FALSE) 408 | ``` 409 | 410 | What else can we do? We can also grab the uncertainty for the sum of the predictions: 411 | 412 | ```{r lpNvar} 413 | # extract the variance-covariance matrix 414 | vc <- vcov(dolphins_depth) 415 | 416 | # reproject the var-covar matrix to be on the linear predictor scale 417 | lin_pred_var <- tcrossprod(lp %*% vc, lp) 418 | 419 | # pre and post multiply by the derivatives of the link, evaluated at the 420 | # predictions -- since the link is exp, we just use the predictions 421 | pred_var <- matrix(pred,nrow=1) %*% lin_pred_var %*% matrix(pred,ncol=1) 422 | 423 | # we can then calculate a total CV 424 | sqrt(pred_var)/sum(pred) 425 | ``` 426 | 427 | As you can see, the `lpmatrix` can be very useful! 428 | 429 | # Extra credit 430 | 431 | - Experiment with `vis.gam` (watch out for the `view=` option) and plot the 2D smooths you've fitted. Check out the `too.far=` agument. 432 | - Use the randomized quantile residuals plot to inspect the models you fitted above. What are the differences you see between them and the deviance residuals you see above? Which model would you choose? 433 | - Redo the side-by-side soap film and another smoother plot, but showing the coefficient of variation. What difference does the soap film make? 434 | 435 | # References 436 | 437 | - Foster, S. D., & Bravington, M. V. (2012). 
A Poisson–Gamma model for analysis of ecological non-negative continuous data. Environmental and Ecological Statistics, 20(4), 533–552. http://doi.org/10.1007/s10651-012-0233-0 438 | - Miller, D. L., Burt, M. L., Rexstad, E. A., & Thomas, L. (2013). Spatial models for distance sampling data: recent developments and future directions. Methods in Ecology and Evolution, 4(11), 1001–1010. http://doi.org/10.1111/2041-210X.12105 439 | - Shono, H. (2008). Application of the Tweedie distribution to zero-catch data in CPUE analysis. Fisheries Research, 93(1-2), 154–162. http://doi.org/10.1016/j.fishres.2008.03.006 440 | - Wood, S. N. (2003). Thin plate regression splines. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 65(1), 95–114. 441 | - Wood, S. N., Bravington, M. V., & Hedley, S. L. (2008). Soap film smoothing. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 70(5), 931–955. http://doi.org/10.1111/j.1467-9868.2008.00665.x 442 | 443 | -------------------------------------------------------------------------------- /example-spatio-temporal data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Spatio-temporal dynamics is for the birds" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | --- 10 | 11 | ## 1. Background 12 | We've talked about modelling spatial trends in the main workshop and in an 13 | extended example, and about the trials and tribulations of time series data 14 | in another extended example. Here, we're going to combine these, to look at 15 | one of the most challenging types of data for linear models to deal with: 16 | spatiotemporal data. A spatiotemporal dataset is one where the outcome of 17 | interest has been observed at multiple locations and at multiple points in time, 18 | and we want to know how temporal trends change across the landscape (or, from 19 | the other view, how spatial patterns are changing over time). 20 | 21 | We don't have time to really get into the nuts-and-bolts issues around fitting 22 | complex spatio-temporal models. This exercise is instead designed to give you 23 | a feel for how to use `mgcv` to model complex interactions between variables 24 | with different natural scales, and how to include simple random effects in 25 | gam models. 26 | 27 | ## 2. Key concepts and functions: 28 | Here are a few key ideas and R functions you should familiarize yourself with if 29 | you haven't already encountered them before. The R functions will be 30 | highlighted `like this`; use `?function_name` to look them up. 31 | 32 | ### Tensor product (`te`) smooths 33 | This tutorial will rely on tensor product smooths. These are used to model 34 | nonlinear interactions between variables that have different natural scales. The 35 | interaction smooths we saw at the start of the workshop were thin plate splines 36 | (`y~s(x1,x2, bs="tp")`). These splines assume that the relationship between y and 37 | x1 should be roughly as "wiggly" as the relationship between y and x2, and have only 38 | one smoothing parameter. This makes sense if x1 and x2 are similar variables 39 | (say latitude and longitude, or arm length and leg length), but if we're looking 40 | at the interacting effects of, say, temperature and body weight, a single 41 | smoothing term doesn't make sense. 42 | 43 | This is where tensor product smooths come in. 
These assume that each variable of 44 | interest should have its own smooth term. They create multi-dimensional smooth 45 | terms by multiplying the values of each one-dimensional basis by each value of 46 | the second. This is a pretty complicated idea, and hard to describe effectively 47 | without more math than we want to go into here. But you can get the idea from a 48 | simple example. Let's say we have two basis functions, one for variable x1 and 49 | one for variable x2. The function for x1 is linear, and the one for 50 | x2 is bell-curve shaped. The basis functions will look like this: 51 | 52 | ```{r, echo=T,tidy=F,results="hide", include=T, message=FALSE,highlight=TRUE} 53 | library(mgcv) 54 | library(ggplot2) 55 | library(dplyr) 56 | library(viridis) 57 | set.seed(1) 58 | n=25 59 | dat = as.data.frame(expand.grid(x1 = seq(-1,1,length.out = n), 60 | x2= seq(-1,1,length=n))) 61 | dat$x1_margin = -dat$x1 62 | dat$x2_margin = dnorm(dat$x2,0,0.5) 63 | dat$x2_margin = dat$x2_margin - mean(dat$x2_margin) 64 | 65 | layout(matrix(1:2,ncol=2)) 66 | plot(x1_margin~x1,type="l", data= dat%>%filter(!duplicated(x1))) 67 | plot(x2_margin~x2,type="l", data= dat%>%filter(!duplicated(x2))) 68 | layout(1) 69 | ``` 70 | 71 | However, the tensor product of the two variables is more complicated. Note that 72 | when x1 is below zero, the effect of x2 is positive at middle values and negative 73 | at extreme values, and vice versa when x1 is above zero. 74 | ```{r, echo=T,tidy=F,results="hide", include=T, message=FALSE,highlight=TRUE} 75 | dat$x1x2_tensor = dat$x1_margin*dat$x2_margin 76 | ggplot(aes(x=x1,y=x2,fill=x1x2_tensor),data=dat)+ 77 | geom_tile()+ 78 | scale_x_continuous(expand=c(0,0))+ 79 | scale_y_continuous(expand=c(0,0))+ 80 | scale_fill_viridis()+ 81 | theme_bw() 82 | ``` 83 | 84 | A full tensor product smooth will consist of several basis functions constructed 85 | like this. The resulting smooth is then fitted using the standard gam model, but 86 | with multiple penalty terms (one for each term in the tensor product). Each 87 | penalty penalizes how much the function deviates from a smooth function in the 88 | direction of one of the variables, averaged over all the other variables. This 89 | allows us to have the function be more or less wiggly in one direction than 90 | another. 91 | 92 | They are built using the `te()` function, like this: 93 | 94 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 95 | # We'll recycle our fake marginal and tensor products as the expected value for 96 | # a simulation. Waste not, want not! 97 | dat$y = rnorm(n^2, dat$x1x2_tensor+dat$x1_margin+dat$x2_margin,0.5) 98 | 99 | te_demo = gam(y~te(x1, x2, k = 4^2),data=dat,method="REML") # k sets the basis dimension for the tensor product smooth (see ?te) 100 | plot.gam(te_demo,scheme = 2) 101 | summary(te_demo) 102 | print(te_demo$sp) 103 | #These are the estimated smoothing parameters for the two dimensions. Note that the smooth for 104 | #x1 (the linear margin) is way smoother than the one for x2 105 | ``` 106 | 107 | 108 | 109 | ### Tensor interaction smooths 110 | There are many cases when we want an estimate of both the average effect of each 111 | variable as well as the interaction. This is tough to extract from just using 112 | `te()` terms, but if we try to combine the two (say, with 113 | `y~s(x1)+s(x2)+te(x1,x2)`) the result is often numerically unstable; it won't 114 | give reliable, consistent separation into the terms we want. Instead, we use `ti` 115 | terms for the interaction. 
The `ti` terms are set up so the average effect of 116 | each variable in the smooth is already excluded from the interaction (and they'll 117 | have fewer basis functions than the equivalent `te` term). See `?te` if you're 118 | curious about how `mgcv` sets these up; for now, we'll just use them as a black 119 | box. 120 | 121 | This is how you use them in practice: 122 | 123 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 124 | # We'll recycle our fake tensor product as the expected value for a simulation 125 | # Waste not, want not! 126 | 127 | ti_demo = gam(y~s(x1,k=4)+s(x2,k=4) +ti(x1, x2, k = 4^2), 128 | data=dat,method="REML") 129 | layout(matrix(1:3,nrow=1)) 130 | plot.gam(ti_demo,scheme = 2) 131 | layout(1) 132 | summary(ti_demo) 133 | print(ti_demo$sp) 134 | #These are the estimated smoothing parameters. Note that the smooth for 135 | #x1 (the linear basis) is way smoother than the one for x2 136 | ``` 137 | 138 | Note that it does a pretty good job of extracting our assumed marginal 139 | basis functions (linear and Gaussian) and the tensor product into three clearly 140 | separate functions. 141 | 142 | NOTE: Never use `ti` on its own without including the marginal smooths! This 143 | is for the same reason you shouldn't exclude individual linear terms but include 144 | interactions in `glm` models. It virtually never makes sense to talk about a function with a strong interaction but no marginal effects. 145 | 146 | ### Random effects smooths 147 | The last smooth type we'll use in this exercise is the random effects smooth. 148 | This is just what it sounds like: it's a smooth for factor terms, with one 149 | coefficient for each level of the group (these act as the basis functions here), and 150 | a penalty term on the squared value of the coefficients to draw them towards the 151 | global mean. This is exactly the same mechanism that packages like `lme4` and 152 | `nlme` use to calculate random effects. In fact, there's a very deep connection 153 | between gams and mixed effects models, but we won't go into that here. 154 | 155 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 156 | n= 10 157 | n_groups= 10 158 | group_means = rnorm(n_groups, 0, 2) 159 | dat = data_frame(group = rep(1:n_groups, each=n), 160 | group_f = factor(group)) 161 | dat$y = rnorm(n*n_groups, 3+group_means[dat$group],2) 162 | re_demo = gam(y~s(group_f, bs="re"), 163 | data=dat,method="REML") 164 | plot.gam(re_demo,scheme = 2) 165 | summary(re_demo) 166 | print(re_demo$sp) 167 | ``` 168 | 169 | It's important to know when using `bs="re"` terms whether the variable you're 170 | giving it is numeric or a factor. `mgcv` will not raise an error if you give 171 | numeric variables to `bs="re"` terms. Instead, it'll just fit a linear model with a 172 | penalty on the slope of the line between x and y. Try it with the data above to 173 | see this, by replacing `s(group_f, bs="re")` with `s(group, bs="re")`. `bs="re"` is 174 | set up like this to allow you to use random effects smooths to fit things like 175 | varying-slope random effects models, but make sure you know what you're doing 176 | when venturing into this. 177 | 178 | ## 3. Spatiotemporal models 179 | Now that we've covered the key smoothers, let's take a look at how we'd use them 180 | to model spatiotemporal data. For this section, we'll use data from the Florida 181 | routes of the Breeding Bird Survey. 
If you haven't heard of it before, this is a 182 | long-term monitoring program tracking bird abundance and diversity across the US 183 | and Canada. It consists of many (over 4000) 24.5-mile-long routes. Trained 184 | observers survey these routes each year during peak breeding season, and every 0.5 185 | miles, they stop and count and identify all the birds they can see or hear within a 186 | 0.25 mile radius of the stop. This is one of the best long-term data sets we 187 | have on bird diversity and abundance, and has been used in countless papers. 188 | 189 | Here, we're going to use this data to answer a couple of relatively simple 190 | questions: how does bird species richness vary across the state of Florida, has 191 | average richness changed over time, and has the spatial pattern of richness 192 | varied? We obtained data from [this 193 | site](https://www.pwrc.usgs.gov/bbs/RawData/), and summarized it into the total 194 | number of different species and the total number of individual birds observed on 195 | each route in each year in the state of Florida. 196 | 197 | First, we'll load the data, and quickly plot it: 198 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 199 | library(maps) 200 | florida_map = map_data("state","florida")%>% 201 | transmute(Longitude= long, Latitude = lat, order=order) 202 | florida_birds = read.csv("data/bbs_data/bbs_florida_richness.csv") 203 | head(florida_birds) 204 | 205 | ggplot(aes(x=Year,y=Richness),data=florida_birds)+geom_point()+ 206 | theme_bw() 207 | ggplot(aes(x=Longitude,y=Latitude,col=Richness), data=florida_birds) + 208 | geom_polygon(data=florida_map,fill=NA, col="black")+ 209 | geom_point()+theme_bw() 210 | ``` 211 | 212 | It looks like richness may have declined toward the end of the time series, and 213 | there seems to be somewhat of a north-south richness gradient, but we need to 214 | model it to really tease these issues out. 215 | 216 | ### Modelling space and time separately 217 | We'll first take a stab at this to determine if these mean trends actually come 218 | out of the model. As richness is a count variable 219 | we'll use a Poisson gam to model it. This means 220 | that we're assuming mean richness is the product of a smooth 221 | function of location and of date, with no interactions; 222 | E(richness) = exp(f(year))*exp(g(location)). 223 | 224 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 225 | richness_base_model = gam(Richness~ s(Longitude,Latitude)+s(Year), 226 | data=florida_birds,family=poisson,method="REML") 227 | layout(matrix(1:2,ncol=2)) 228 | plot(richness_base_model,scheme=2) 229 | layout(1) 230 | summary(richness_base_model) 231 | ``` 232 | 233 | Note that we do have a thin-plate interaction between longitude and latitude. 234 | This assumes that the gradient should be equally smooth latitudinally and 235 | longitudinally. This may not be totally accurate, as the function seems to 236 | change faster latitudinally, but we'll ignore that until the exercises at the 237 | bottom. The model seems to be doing pretty well; it captures the original patterns 238 | we saw, and explains a not-inconsiderable 45% of the deviance. 
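Before adding more structure, it's worth a quick check that the default basis dimensions (`k`) are large enough here. A minimal check, using `gam.check()` from the model checking section (low k-index values and p-values suggest a smooth may need a larger `k`):

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# quick basis-size check for the base model: prints k', edf, k-index and
# p-values for each smooth, plus the standard residual plots
gam.check(richness_base_model)
```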
239 | 240 | Let's see if there are any remaining spatiotemporal patterns though: 241 | 242 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 243 | source("code_snippets/quantile_resid.R") 244 | 245 | florida_birds$base_resid = rqresiduals(richness_base_model) 246 | 247 | ggplot(aes(Longitude, Latitude), 248 | data= florida_birds %>% filter(Year%%6==0))+ #only look at every 6th year 249 | geom_point(aes(color=base_resid))+ 250 | geom_polygon(data=florida_map,fill=NA, col="black")+ 251 | scale_color_viridis()+ 252 | facet_wrap(~Year)+ 253 | theme_bw() 254 | 255 | ``` 256 | There's definitely still a pattern in the residuals, such as higher than 257 | expected diversity in the mid-latitudes during the 1990s. 258 | 259 | ### Joint models of space and time 260 | The next step is to determine if there is a significant interaction. This is 261 | where tensor product splines come in. Here, we'll use the `ti` formulation, as 262 | it allows us to directly test whether the interaction significantly improves our 263 | model. 264 | 265 | Something important to note is the `d=c(2,1)` argument to `ti`. This tells the 266 | function that the smooth should consist of a tensor product between a 267 | 2-dimensional smooth (lat-long) and a 1-dimensional term (Year). 268 | 269 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 270 | richness_ti_model = gam(Richness~ s(Longitude,Latitude)+s(Year) + 271 | ti(Longitude, Latitude, Year, d=c(2,1)), 272 | data=florida_birds,family=poisson,method="REML") 273 | layout(matrix(1:2,ncol=2)) 274 | plot(richness_ti_model,scheme=2) 275 | layout(1) 276 | summary(richness_ti_model) 277 | 278 | ``` 279 | This improves the model fit substantially, which you can see by directly 280 | comparing the models with `anova`: 281 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 282 | anova(richness_base_model,richness_ti_model,test = "Chisq") 283 | 284 | ``` 285 | 286 | 287 | The question then becomes: what does this actually mean? Where are we seeing the 288 | fastest and slowest rates of change in richness? It's often difficult to 289 | effectively show spatiotemporal changes, partly because it would require a 290 | 4-dimensional graph to really show what's going on. 
Still, we can look at a 291 | couple different views by using `predict` effectively: 292 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 293 | #First we'll create gridded data 294 | predict_richess = expand.grid( 295 | Latitude= seq(min(florida_birds$Latitude), 296 | max(florida_birds$Latitude), 297 | length=50), 298 | Longitude = seq(min(florida_birds$Longitude), 299 | max(florida_birds$Longitude), 300 | length=50), 301 | Year = seq(1970,2015,by=5) 302 | ) 303 | # This now selects only that data that falls within Florida's border 304 | predict_richess = predict_richess[with(predict_richess, 305 | inSide(florida_map, Latitude,Longitude)),] 306 | predict_richess$model_fit = predict(richness_ti_model, 307 | predict_richess,type = "response") 308 | ggplot(aes(Longitude, Latitude, fill= model_fit), 309 | data=predict_richess)+ 310 | geom_tile()+ 311 | facet_wrap(~Year,nrow=2)+ 312 | scale_fill_viridis("# of species")+ 313 | theme_bw(10) 314 | ``` 315 | 316 | 317 | Another way of looking at this is by estimating the rate of change 318 | at each location at each time point, and figuring out which locations 319 | are changing most quickly: 320 | 321 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 322 | 323 | predict_richess$model_change =predict(richness_ti_model, 324 | predict_richess%>%mutate(Year=Year+1), 325 | type = "response") - 326 | predict_richess$model_fit 327 | ggplot(aes(Longitude, Latitude, fill= model_change), 328 | data=predict_richess)+ 329 | geom_tile()+ 330 | facet_wrap(~Year,nrow=2)+ 331 | scale_fill_gradient2("Rate of change\n(species per year)")+ 332 | theme_bw(10) 333 | ``` 334 | 335 | ### Accounting for local heterogeneity with `bs="re"` 336 | Examining this, it seems to indicate that richness fluctuated across the state 337 | throughout the study period, but then began sharply declining state-wide in the 338 | 2000's. (Disclaimer: I don't know enough about either the BBS or Florida to say 339 | how much of this decline is real! Please don't quote me in newspapers saying 340 | biodiversity is collapsing in Florida). 341 | 342 | There is still one persistent issue: viewing the maps of richness, there appear 343 | to be small hotspots and coldspots of richness across the state. This may be due 344 | to a factor I've ignored so far: the data is collected in specific, repeated 345 | routes. If it's just easier to spot birds on one route, or there's a lot of 346 | local nesting habitat there, it will have higher richness than expected over the 347 | whole study. Adjusting for these kinds of hot and cold spots will force our 348 | model to be overly wiggly. Further, not all routes are sampled every year, so we 349 | want to make sure that our trends don't just result from less diverse routes 350 | getting sampled later in the survey. 351 | 352 | To try and account for this effect (and other grouped local factors), we 353 | can use random effects smooths. 
That would make our model look like this: 354 | 355 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 356 | florida_birds$Route_f = factor(florida_birds$Route) 357 | richness_ti_re_model = gam(Richness~ s(Longitude,Latitude)+s(Year) + 358 | ti(Longitude, Latitude, Year, d=c(2,1))+ 359 | s(Route_f, bs="re"), 360 | data=florida_birds,family=poisson,method="REML") 361 | layout(matrix(1:3,nrow=1)) 362 | plot(richness_ti_re_model,scheme=2) 363 | layout(1) 364 | 365 | summary(richness_ti_re_model) 366 | anova(richness_ti_model,richness_ti_re_model) 367 | ``` 368 | 369 | We can view the same plots as before with our new model: 370 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 371 | predict_richess$Route_f = 1 372 | predict_richess$model_fit = predict(richness_ti_re_model, 373 | predict_richess,type = "response") 374 | predict_richess$model_change =predict(richness_ti_re_model, 375 | predict_richess%>%mutate(Year=Year+1), 376 | type = "response") - 377 | predict_richess$model_fit 378 | 379 | ggplot(aes(Longitude, Latitude, fill= model_fit), 380 | data=predict_richess)+ 381 | geom_tile()+ 382 | facet_wrap(~Year,nrow=2)+ 383 | scale_fill_viridis("# of species")+ 384 | theme_bw(10) 385 | 386 | ggplot(aes(Longitude, Latitude, fill= model_change), 387 | data=predict_richess)+ 388 | geom_tile()+ 389 | facet_wrap(~Year,nrow=2)+ 390 | scale_fill_gradient2("Rate of change\n(species per year)")+ 391 | theme_bw(10) 392 | ``` 393 | This definitely seems to have levelled off some of the hot spots from before, 394 | implying that bird richness is more evenly distributed than our prior model 395 | suggested. Still, the large-scale trends remain the same as before. 396 | 397 | ## 4. Exercises 398 | 399 | 1. Given that most of the variability appears to be latitudinal, try to adapt 400 | the tensor smoothers so that latitude, longitude, and year all have their own 401 | penalty terms. How does this change model fit? 402 | 403 | 2. Following up on the prior 404 | exercise: build a model that allows you to explicitly calculate the marginal 405 | latitudinal gradient in diversity, and to determine how the latitudinal gradient 406 | has changed over time. 407 | 408 | 3. We know from basic sampling theory that if you sample 409 | fewer individuals over time, you also expect to find lower richness, even if 410 | actual richness stays unchanged. What does the spatiotemporal pattern of 411 | abundance look like? Does adding bird abundance as a predictor explain any of the 412 | patterns in richness? 413 | -------------------------------------------------------------------------------- /mgcv-esa-workshop.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /project-requirements.md: -------------------------------------------------------------------------------- 1 | ### Title of the session 2 | 3 | The mgcv package as a one-stop-shop for fitting non-linear ecological models 4 | 5 | ### Description of the session (appears online only; 250 words max.)
6 | 7 | To address the increase in both quantity and complexity of available data, ecologists require flexible, robust statistical models, as well as software to perform such analyses. This workshop will focus on how a single tool, the mgcv package for the R language, can be used to fit models to data from a wide range of sources. 8 | 9 | mgcv is one of the most popular packages for modelling non-linear relationships. However, many users do not know how versatile and powerful a tool it can be. This workshop will focus on teaching participants how to use mgcv in a wide variety of situations (including spatio-temporal, zero-inflated, heavy-tailed, time series, and survival data) and advanced use of mgcv (fitting smooth interactions, seasonal effects, spatial effects, Markov random fields and varying-coefficient models). 10 | 11 | The workshop will give paricipants an understanding of: (a) practical elements of smoothing theory, with a focus on why they would choose to use different types of smoothers (b) model checking and selection (c) the range of modelling possibilities using mgcv. Participants will be assumed to be familiar with the basics of R (loading/manipulating data, functions, and plotting) and regression in R (lm() and glm()). The organizers have extensive practical experience with ecological statistics and modelling using mgcv. 12 | 13 | ### Summary sentence (appears in print only; 50 word max.) 14 | 15 | mgcv is one of the most popular R packages for non-linear modelling, but many users do not know how versatile and powerful it is. This workshop will focus on using mgcv in a wide variety of situations common in ecology, and advanced use of mgcv for model fitting. 16 | 17 | ### Name and contact information (affiliation, email) for the lead organizer and any co-organizers 18 | 19 | Eric Pedersen 20 | Center for Limnology 21 | 680 North Park Street 22 | University of Wisconsin-Madison 23 | Madison, WI, USA 53704 24 | eric.pedersen@wisc.edu 25 | 26 | David L Miller 27 | Northeast Fisheries Science Center 28 | National Oceanic and Atmospheric Administration 29 | 166 Water Street 30 | Woods Hole, MA 02543 31 | 32 | Gavin L. Simpson 33 | Institute of Environmental Change and Society 34 | University of Regina 35 | 3737 Wascana Parkway 36 | Regina 37 | Saskatchewan 38 | Canada 39 | S4S 0A2 40 | 41 | ### Minimum and maximum number of participants (to assist in room assignment). 42 | 10 minimum, 30 maximum. 43 | 44 | ### Requested scheduling 45 | Saturday (August 6th), all day. 46 | 47 | ### Any additional A/V equipment (standard A/V setup is a screen, LCD projector, and laptop) 48 | We will need the standard setup, as well as wifi access for all participants. 49 | 50 | ### Room set-up desired: theater, conference, hollow square, rounds, other 51 | A conference room. 52 | 53 | ### Food and beverage requests 54 | No food or beverages requested. 55 | 56 | ### Underwriting of workshop costs by a group or agency 57 | None of the costs will be underwritten. 58 | 59 | ### Is the session intended to be linked to a scientific session? 60 | No. 61 | 62 | ### Is the session intended to be linked to a business meeting or mixer? 63 | No. 64 | 65 | ### Describe any known (workshop/event) scheduling conflicts (what should it follow/precede/not conflict with?) 66 | We would require a Saturday time slot, as one of the organizers (Gavin Simpson) is also organizing 67 | the Advanced Vegan Workshop Proposal on Sunday AM and attending the ESA Council meeting on Sunday PM. 
68 | -------------------------------------------------------------------------------- /project_description.md: -------------------------------------------------------------------------------- 1 | # The mgcv package as a one-stop-shop for fitting non-linear ecological models 2 | 3 | To address the increase in both quantity and complexity of available data, ecologists require flexible, robust statistical models, as well as software to perform such analyses. This workshop will focus on how a single tool, the mgcv package for the R language, can be used to fit models to data from a wide range of sources. 4 | 5 | mgcv is one of the most popular packages for modelling non-linear relationships. However, many users do not know how versatile and powerful a tool it can be. This workshop will focus on teaching participants how to use mgcv in a wide variety of situations (including spatio-temporal, zero-inflated, heavy-tailed, time series, and survival data) and advanced use of mgcv (fitting smooth interactions, seasonal effects, spatial effects, Markov random fields and varying-coefficient models). 6 | 7 | The workshop will give paricipants an understanding of: (a) practical elements of smoothing theory, with a focus on why they would choose to use different types of smoothers (b) model checking and selection (c) the range of modelling possibilities using mgcv. Participants will be assumed to be familiar with the basics of R (loading/manipulating data, functions, and plotting) and regression in R (lm() and glm()). The organizers have extensive practical experience with ecological statistics and modelling using mgcv. 8 | -------------------------------------------------------------------------------- /slides/00-preamble.Rpres: -------------------------------------------------------------------------------- 1 | The mgcv package as a one-stop-shop for fitting non-linear ecological models 2 | ============================ 3 | author: David L Miller, Eric Pedersen, and Gavin L Simpson 4 | css: custom.css 5 | transition: none 6 | 7 | 8 | Good morning! 9 | ============== 10 | type: section 11 | 12 | 13 | 14 | Logistics 15 | ========== 16 | - where is the coffee? 17 | - where is the bathroom? 18 | - what if there is a fire? 19 | 20 | Who are we? 21 | =========== 22 | type:section 23 | 24 | 25 | Eric Pedersen 26 | ============= 27 | - Populuation, community, and movement ecologist 28 | - (sort of) Fisheries (sort of) ecologist 29 | - Movement and life-history modelling 30 | - Connecting theoretical models to data with statistics 31 | - Unreasonably obsessed with smoothing and penalization as a solution to all of life's problems 32 | 33 | Gavin Simpson 34 | ============= 35 | 36 | - (Palaeo)ecologist interested in community dynamics & environmental change 37 | - (Palaeo)limnologist interested in environmental change 38 | - Self-taught not-statistician 39 | - Ecological stats software (vegan plus some palaeo stuff) 40 | - To me most things are a model I can (will!) fit with *mgcv* 41 | 42 | David L Miller 43 | =============== 44 | 45 | - Cetacean distribution modelling 46 | - Spatial modelling (esp. model checking) 47 | - Distance sampling [distancesampling.org](http://distancesampling.org) 48 | - Statistical software (`Distance`, `mrds`, `dsm`) 49 | - Trying to shoehorn all projects into `mgcv` 50 | 51 | 52 | Who are you? 53 | ============ 54 | type:section 55 | 56 | 57 | What is the structure of the day? 
58 | ================================= 59 | type:section 60 | 61 | Timing 62 | ====== 63 | 64 | - 8am to 5pm 65 | - Morning session: 8am-12pm 66 | - Aternoon session: 1pm-5pm 67 | 68 | (Approximate) schedule 69 | ====================== 70 | 71 | Time | Thing 72 | -----------|-------------------- 73 | 0800-0815 | Intro 74 | 0815-0945 | Generalized Additive Models 75 | 0945-1000 | Coffee 76 | 1000-1045 | Model checking 77 | 1045-1130 | Model selection 78 | 1130-1200 | Beyond the exponential family 79 | 1200-1300 | Lunch 80 | 1300-1700 | Extended examples/demos 81 | 82 | 83 | Foraging 84 | ======== 85 | 86 | 87 | Wifi 88 | ==== 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /slides/01-intro.Rpres: -------------------------------------------------------------------------------- 1 | Generalized Additive Models 2 | ============================ 3 | author: David L Miller 4 | css: custom.css 5 | transition: none 6 | 7 | 8 | Overview 9 | ========= 10 | 11 | - What is a GAM? 12 | - What is smoothing? 13 | - How do GAMs work? (*Roughly*) 14 | - Fitting and plotting simple models 15 | 16 | ```{r setup, include=FALSE} 17 | library(knitr) 18 | library(viridis) 19 | library(ggplot2) 20 | library(reshape2) 21 | library(animation) 22 | library(mgcv) 23 | opts_chunk$set(cache=TRUE, echo=FALSE) 24 | ``` 25 | 26 | What is a GAM? 27 | =============== 28 | type:section 29 | 30 | Generalized Additive Models 31 | ============================ 32 | 33 | - Generalized: many response distributions 34 | - Additive: terms **add** together 35 | - Models: well, it's a model... 36 | 37 | To GAMs from GLMs and LMs 38 | ============================= 39 | type:section 40 | 41 | 42 | (Generalized) Linear Models 43 | ============================= 44 | 45 | Models that look like: 46 | 47 | $$ 48 | y_i = \beta_0 + x_{1i}\beta_1 + x_{2i}\beta_2 + \ldots + \epsilon_i 49 | $$ 50 | 51 | (describe the response, $y_i$, as linear combination of the covariates, $x_{ji}$, with an offset) 52 | 53 | We can make $y_i\sim$ any exponential family distribution (Normal, Poisson, etc). 54 | 55 | Error term $\epsilon_i$ is normally distributed (usually). 56 | 57 | Why bother with anything more complicated?! 58 | ============================= 59 | type:section 60 | 61 | Is this linear? 62 | ============================= 63 | 64 | ```{r islinear, fig.width=12, fig.height=6} 65 | set.seed(2) ## simulate some data... 66 | dat <- gamSim(1, n=400, dist="normal", scale=1, verbose=FALSE) 67 | dat <- dat[,c("y", "x0", "x1", "x2", "x3")] 68 | p <- ggplot(dat,aes(y=y,x=x1)) + 69 | geom_point() + 70 | theme_minimal() 71 | print(p) 72 | ``` 73 | 74 | Is this linear? Maybe? 75 | ============================= 76 | 77 | ```{r eval=FALSE, echo=TRUE} 78 | lm(y ~ x1, data=dat) 79 | ``` 80 | 81 | 82 | ```{r maybe, fig.width=12, fig.height=6} 83 | p <- ggplot(dat, aes(y=y, x=x1)) + geom_point() + 84 | theme_minimal() 85 | print(p + geom_smooth(method="lm")) 86 | ``` 87 | 88 | 89 | 90 | What can we do? 91 | ============================= 92 | type:section 93 | 94 | Adding a quadratic term? 95 | ============================= 96 | 97 | ```{r eval=FALSE, echo=TRUE} 98 | lm(y ~ x1 + poly(x1, 2), data=dat) 99 | ``` 100 | 101 | ```{r quadratic, fig.width=12, fig.height=6} 102 | p <- ggplot(dat, aes(y=y, x=x1)) + geom_point() + 103 | theme_minimal() 104 | print(p + geom_smooth(method="lm", formula=y~x+poly(x, 2))) 105 | ``` 106 | 107 | 108 | 109 | 110 | Is this sustainable? 
111 | ============================= 112 | 113 | - Adding in quadratic (and higher terms) *can* make sense 114 | - This feels a bit *ad hoc* 115 | - Better if we had a **framework** to deal with these issues? 116 | 117 | ```{r ruhroh, fig.width=12, fig.height=6} 118 | p <- ggplot(dat, aes(y=y, x=x2)) + geom_point() + 119 | theme_minimal() 120 | print(p + geom_smooth(method="lm", formula=y~x+poly(x, 2))) 121 | ``` 122 | 123 | 124 | [drumroll] 125 | ============================= 126 | type:section 127 | 128 | 129 | What does a model look like? 130 | ============================= 131 | 132 | $$ 133 | y_i = \beta_0 + \sum_j s_j(x_{ji}) + \epsilon_i 134 | $$ 135 | 136 | where $\epsilon_i \sim N(0, \sigma^2)$, $y_i \sim \text{Normal}$ (for now) 137 | 138 | Remember that we're modelling the mean of this distribution! 139 | 140 | Call the above equation the **linear predictor** 141 | 142 | Okay, but what about these "s" things? 143 | ==================================== 144 | right:55% 145 | 146 | ```{r smoothdat, fig.width=8, fig.height=8} 147 | 148 | spdat <- melt(dat, id.vars = c("y")) 149 | p <- ggplot(spdat,aes(y=y,x=value)) + 150 | geom_point() + 151 | theme_minimal() + 152 | facet_wrap(~variable, nrow=2) 153 | print(p) 154 | ``` 155 | *** 156 | - Think $s$=**smooth** 157 | - Want to model the covariates flexibly 158 | - Covariates and response not necessarily linearly related! 159 | - Want some "wiggles" 160 | 161 | Okay, but what about these "s" things? 162 | ==================================== 163 | right:55% 164 | 165 | ```{r wsmooths, fig.width=8, fig.height=8} 166 | p <- p + geom_smooth() 167 | print(p) 168 | ``` 169 | *** 170 | - Think $s$=**smooth** 171 | - Want to model the covariates flexibly 172 | - Covariates and response not necessarily linearly related! 173 | - Want some "wiggles" 174 | 175 | What is smoothing? 176 | =============== 177 | type:section 178 | 179 | 180 | Straight lines vs. interpolation 181 | ================================= 182 | right:55% 183 | 184 | ```{r wiggles, fig.height=8, fig.width=8} 185 | library(mgcv) 186 | # hacked from the example in ?gam 187 | set.seed(2) ## simulate some data... 
188 | dat <- gamSim(1,n=50,dist="normal",scale=0.5, verbose=FALSE) 189 | dat$y <- dat$f2 + rnorm(length(dat$f2), sd = sqrt(0.5)) 190 | f2 <- function(x) 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10-mean(dat$y) 191 | ylim <- c(-4,6) 192 | 193 | # fit some models 194 | b.justright <- gam(y~s(x2),data=dat) 195 | b.sp0 <- gam(y~s(x2, sp=0, k=50),data=dat) 196 | b.spinf <- gam(y~s(x2),data=dat, sp=1e10) 197 | 198 | curve(f2,0,1, col="blue", ylim=ylim) 199 | points(dat$x2, dat$y-mean(dat$y)) 200 | 201 | ``` 202 | *** 203 | - Want a line that is "close" to all the data 204 | - Don't want interpolation -- we know there is "error" 205 | - Balance between interpolation and "fit" 206 | 207 | Splines 208 | ======== 209 | 210 | - Functions made of other, simpler functions 211 | - **Basis functions** $b_k$, estimate $\beta_k$ 212 | - $s(x) = \sum_{k=1}^K \beta_k b_k(x)$ 213 | - Makes the math(s) much easier 214 | 215 | 216 | 217 | Design matrices 218 | =============== 219 | 220 | - We often write models as $X\boldsymbol{\beta}$ 221 | - $X$ is our data 222 | - $\boldsymbol{\beta}$ are parameters we need to estimate 223 | - For a GAM it's the same 224 | - $X$ has columns for each basis, evaluated at each observation 225 | - again, this is the linear predictor 226 | 227 | Measuring wigglyness 228 | ====================== 229 | 230 | - Visually: 231 | - Lots of wiggles == NOT SMOOTH 232 | - Straight line == VERY SMOOTH 233 | - How do we do this mathematically? 234 | - Derivatives! 235 | - (Calculus *was* a useful class afterall!) 236 | 237 | 238 | 239 | Wigglyness by derivatives 240 | ========================== 241 | 242 | ```{r wigglyanim, results="hide"} 243 | library(numDeriv) 244 | f2 <- function(x) 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10 - mean(dat$y) 245 | 246 | xvals <- seq(0,1,len=100) 247 | 248 | plot_wiggly <- function(f2, xvals){ 249 | 250 | # pre-calculate 251 | f2v <- f2(xvals) 252 | f2vg <- grad(f2,xvals) 253 | f2vg2 <- unlist(lapply(xvals, hessian, func=f2)) 254 | f2vg2min <- min(f2vg2) -2 255 | 256 | # now plot 257 | for(i in 1:length(xvals)){ 258 | par(mfrow=c(1,3)) 259 | plot(xvals, f2v, type="l", main="function", ylab="f") 260 | points(xvals[i], f2v[i], pch=19, col="red") 261 | 262 | plot(xvals, f2vg, type="l", main="derivative", ylab="df/dx") 263 | points(xvals[i], f2vg[i], pch=19, col="red") 264 | 265 | plot(xvals, f2vg2, type="l", main="2nd derivative", ylab="d2f/dx2") 266 | points(xvals[i], f2vg2[i], pch=19, col="red") 267 | polygon(x=c(0,xvals[1:i], xvals[i],f2vg2min), 268 | y=c(f2vg2min,f2vg2[1:i],f2vg2min,f2vg2min), col = "grey") 269 | 270 | ani.pause() 271 | } 272 | } 273 | 274 | saveGIF(plot_wiggly(f2, xvals), "wiggly.gif", interval = 0.2, ani.width = 800, ani.height = 400) 275 | ``` 276 | 277 | ![Animation of derivatives](wiggly.gif) 278 | 279 | What was that grey bit? 280 | ========================= 281 | 282 | $$ 283 | \int_\mathbb{R} \left( \frac{\partial^2 f(x)}{\partial^2 x}\right)^2 \text{d}x\\ 284 | $$ 285 | 286 | (Take some derivatives of the smooth and integrate them over $x$) 287 | 288 | (*Turns out* we can always write this as $\boldsymbol{\beta}^\text{T}S\boldsymbol{\beta}$, so the $\boldsymbol{\beta}$ is separate from the derivatives) 289 | 290 | (Call $S$ the **penalty matrix**) 291 | 292 | Making wigglyness matter 293 | ========================= 294 | 295 | - $\boldsymbol{\beta}^\text{T}S\boldsymbol{\beta}$ measures wigglyness 296 | - "Likelihood" measures closeness to the data 297 | - Penalise closeness to the data... 
298 | - Use a **smoothing parameter** to decide on that trade-off... 299 | - $\lambda \beta^\text{T}S\beta$ 300 | - Estimate the $\beta_k$ terms but penalise objective 301 | - "closeness to data" + penalty 302 | 303 | Smoothing parameter 304 | ======================= 305 | 306 | 307 | ```{r wiggles-plot, fig.width=15} 308 | # make three plots, w. estimated smooth, truth and data on each 309 | par(mfrow=c(1,3), cex.main=3.5) 310 | 311 | plot(b.justright, se=FALSE, ylim=ylim, main=expression(lambda*plain("= just right"))) 312 | points(dat$x2, dat$y-mean(dat$y)) 313 | curve(f2,0,1, col="blue", add=TRUE) 314 | 315 | plot(b.sp0, se=FALSE, ylim=ylim, main=expression(lambda*plain("=")*0)) 316 | points(dat$x2, dat$y-mean(dat$y)) 317 | curve(f2,0,1, col="blue", add=TRUE) 318 | 319 | plot(b.spinf, se=FALSE, ylim=ylim, main=expression(lambda*plain("=")*infinity)) 320 | points(dat$x2, dat$y-mean(dat$y)) 321 | curve(f2,0,1, col="blue", add=TRUE) 322 | 323 | ``` 324 | 325 | Smoothing parameter selection 326 | ============================== 327 | 328 | - Many methods: AIC, Mallow's $C_p$, GCV, ML, REML 329 | - Recommendation, based on simulation and practice: 330 | - Use REML or ML 331 | - Reiss \& Ogden (2009), Wood (2011) 332 | 333 | 334 | 335 | 336 | Maximum wiggliness 337 | ======================== 338 | 339 | - We can set **basis complexity** or "size" ($k$) 340 | - Maximum wigglyness 341 | - Smooths have **effective degrees of freedom** (EDF) 342 | - EDF < $k$ 343 | - Set $k$ "large enough" 344 | - Penalty does the rest 345 | 346 | 347 | More on this in a bit... 348 | 349 | uhoh 350 | ====== 351 | title: none 352 | type:section 353 | 354 |

![spock sobbing mathematically](images/mathematical_sobbing.jpg)

355 | 356 | GAM summary 357 | =========== 358 | 359 | - Straight lines suck --- we want **wiggles** 360 | - Use little functions (**basis functions**) to make big functions (**smooths**) 361 | - Need to make sure your smooths are **wiggly enough** 362 | - Use a **penalty** to trade off wiggliness/generality 363 | 364 | 365 | Fitting GAMs in practice 366 | ========================= 367 | type:section 368 | 369 | Translating maths into R 370 | ========================== 371 | 372 | A simple example: 373 | 374 | $$ 375 | y_i = \beta_0 + s(x) + s(w) + \epsilon_i 376 | $$ 377 | 378 | where $\epsilon_i \sim N(0, \sigma^2)$ 379 | 380 | Let's pretend that $y_i \sim \text{Normal}$ 381 | 382 | - linear predictor: `formula = y ~ s(x) + s(w)` 383 | - response distribution: `family=gaussian()` 384 | - data: `data=some_data_frame` 385 | 386 | Putting that together 387 | ====================== 388 | 389 | ```{r echo=TRUE, eval=FALSE} 390 | my_model <- gam(y ~ s(x) + s(w), 391 | family = gaussian(), 392 | data = some_data_frame, 393 | method = "REML") 394 | ``` 395 | 396 | - `method="REML"` uses REML for smoothness selection (default is `"GCV.Cp"`) 397 | 398 | What about a practical example? 399 | ================================ 400 | type:section 401 | 402 | 403 | Pantropical spotted dolphins 404 | ============================== 405 | 406 | - Example taken from Miller et al (2013) 407 | - [Paper appendix](http://distancesampling.org/R/vignettes/mexico-analysis.html) has a better analysis 408 | - Simple example here, ignoring all kinds of important stuff! 409 | 410 | ![a pantropical spotted dolphin doing its thing](images/spotteddolphin_swfsc.jpg) 411 | 412 | 413 | Inferential aims 414 | ================= 415 | 416 | ```{r loaddat} 417 | load("../data/mexdolphins/mexdolphins.RData") 418 | ``` 419 | 420 | ```{r gridplotfn} 421 | library(rgdal) 422 | library(rgeos) 423 | library(maptools) 424 | library(plyr) 425 | # fill must be in the same order as the polygon data 426 | grid_plot_obj <- function(fill, name, sp){ 427 | 428 | # what was the data supplied? 429 | names(fill) <- NULL 430 | row.names(fill) <- NULL 431 | data <- data.frame(fill) 432 | names(data) <- name 433 | 434 | spdf <- SpatialPolygonsDataFrame(sp, data) 435 | spdf@data$id <- rownames(spdf@data) 436 | spdf.points <- fortify(spdf, region="id") 437 | spdf.df <- join(spdf.points, spdf@data, by="id") 438 | 439 | # seems to store the x/y even when projected as labelled as 440 | # "long" and "lat" 441 | spdf.df$x <- spdf.df$long 442 | spdf.df$y <- spdf.df$lat 443 | 444 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df) 445 | } 446 | ``` 447 | 448 | - How many dolphins are there? 449 | - Where are the dolphins? 450 | - What are they interested in? 
451 | 452 | ```{r spatialEDA, fig.cap="", fig.width=15} 453 | 454 | # some nearby states, transformed 455 | library(mapdata) 456 | map_dat <- map_data("worldHires",c("usa","mexico")) 457 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 458 | map_sp <- SpatialPoints(map_dat[,c("long","lat")]) 459 | 460 | # give the sp object a projection 461 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84") 462 | # re-project 463 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4) 464 | map_dat$x <- map_sp.t$long 465 | map_dat$y <- map_sp.t$lat 466 | 467 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4) 468 | p <- ggplot() + 469 | grid_plot_obj(preddata$depth, "Depth", pred.polys) + 470 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 471 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) + 472 | geom_point(aes(x, y, size=count), 473 | data=mexdolphins[mexdolphins$count>0,], 474 | colour="red", alpha=I(0.7)) + 475 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), xlim = range(mexdolphins$x)) + 476 | scale_fill_viridis() + 477 | labs(fill="Depth",x="x",y="y",size="Count") + 478 | theme_minimal() 479 | #p <- p + gg.opts 480 | print(p) 481 | ``` 482 | 483 | A simple dolphin model 484 | =============== 485 | 486 | ```{r firstdsm, echo=TRUE} 487 | library(mgcv) 488 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set), 489 | data = mexdolphins, 490 | family = quasipoisson(), 491 | method = "REML") 492 | ``` 493 | 494 | - count is a function of depth 495 | - `off.set` is the effort expended 496 | - we have count data, try quasi-Poisson distribution 497 | 498 | What did that do? 499 | =================== 500 | 501 | ```{r echo=TRUE} 502 | summary(dolphins_depth) 503 | ``` 504 | 505 | Plotting 506 | ================ 507 | 508 | ```{r plotsmooth} 509 | plot(dolphins_depth) 510 | ``` 511 | *** 512 | - `plot(dolphins_depth)` 513 | - Dashed lines indicate +/- 2 standard errors 514 | - Rug plot 515 | - On the link scale 516 | - EDF on $y$ axis 517 | 518 | Thin plate regression splines 519 | ================================ 520 | 521 | - Default basis 522 | - One basis function per data point 523 | - Reduce # basis functions (eigendecomposition) 524 | - Fitting on reduced problem 525 | - Multidimensional 526 | - Wood (2003) 527 | 528 | 529 | Bivariate terms 530 | ================ 531 | 532 | - Assumed an additive structure 533 | - No interaction 534 | - We can specify `s(x,y)` (and `s(x,y,z,...)`) 535 | - (Assuming *isotropy* here...) 
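(An aside: if isotropy is not reasonable, for example when the covariates are on very different scales, a tensor product smooth gives each covariate its own smoothing parameter. A minimal sketch, reusing the dolphin data from above:)

```{r eval=FALSE, echo=TRUE}
# anisotropic alternative to s(x, y): te() penalises each margin separately
gam(count ~ te(x, y) + offset(off.set),
    data = mexdolphins,
    family = quasipoisson(), method = "REML")
```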
536 | 537 | ```{r xydsmplot, fig.width=15, fig.height=7} 538 | dolphins_depth_xy <- gam(count ~ s(x, y) + offset(off.set), 539 | data = mexdolphins, 540 | family=quasipoisson(), method="REML") 541 | par(mfrow=c(1,3)) 542 | vis.gam(dolphins_depth_xy, view=c("x","y"), phi=45, theta=20, asp=1) 543 | vis.gam(dolphins_depth_xy, view=c("x","y"), phi=45, theta=60, asp=1) 544 | vis.gam(dolphins_depth_xy, view=c("x","y"), phi=45, theta=160, asp=1) 545 | ``` 546 | 547 | Adding a term 548 | =============== 549 | 550 | - Add a **surface** for location ($x$ and $y$) 551 | - Just use `+` for an extra term 552 | 553 | ```{r xydsm, echo=TRUE} 554 | dolphins_depth_xy <- gam(count ~ s(depth) + s(x, y) + offset(off.set), 555 | data = mexdolphins, 556 | family=quasipoisson(), method="REML") 557 | ``` 558 | 559 | 560 | Summary 561 | =================== 562 | 563 | ```{r echo=TRUE} 564 | summary(dolphins_depth_xy) 565 | ``` 566 | 567 | Plotting 568 | ================ 569 | 570 | ```{r plotsmooth-xy, fig.width=12, echo=TRUE} 571 | plot(dolphins_depth_xy, scale=0, pages=1) 572 | ``` 573 | - `scale=0`: each plot on different scale 574 | - `pages=1`: plot together 575 | 576 | 577 | Plotting 2d terms... erm... 578 | ================ 579 | 580 | ```{r plotsmooth-xy-biv, fig.width=15, fig.height=7, echo=TRUE} 581 | plot(dolphins_depth_xy, select=2, cex=2, asp=1, lwd=2) 582 | ``` 583 | 584 | - `select=` picks which smooth to plot 585 | 586 | Let's try something different 587 | =============================== 588 | 589 | ```{r plot-scheme2, echo=TRUE, fig.width=10} 590 | plot(dolphins_depth_xy, select=2, cex=2, asp=1, lwd=2, scheme=2) 591 | ``` 592 | - `scheme=2` much better for bivariate terms 593 | - `vis.gam()` is much more general 594 | 595 | More complex plots 596 | =================== 597 | 598 | ```{r visgam, fig.width=15, echo=TRUE} 599 | par(mfrow=c(1,2)) 600 | vis.gam(dolphins_depth_xy, view=c("depth","x"), too.far=0.1, phi=30, theta=45) 601 | vis.gam(dolphins_depth_xy, view=c("depth","x"), plot.type="contour", too.far=0.1,asp=1/1000) 602 | ``` 603 | 604 | 605 | Fitting/plotting GAMs summary 606 | ============================= 607 | 608 | - `gam` does all the work 609 | - very similar to `glm` 610 | - `s` indicates a smooth term 611 | - `plot` can give simple plots 612 | - `vis.gam` for more advanced stuff 613 | 614 | 615 | Prediction 616 | =========== 617 | type:section 618 | 619 | What is a prediction? 620 | ===================== 621 | 622 | - Evaluate the model, at a particular covariate combination 623 | - Answering (e.g.) the question "at a given depth, how many dolphins?" 624 | - Steps: 625 | 1. evaluate the $s(\ldots)$ terms 626 | 2. move to the response scale (exponentiate? Do nothing?) 627 | 3. (multiply any offset etc) 628 | 629 | Example of prediction 630 | ====================== 631 | 632 | - in maths: 633 | - Model: $\text{count}_i = A_i \exp \left( \beta_0 + s(x_i, y_i) + s(\text{Depth}_i)\right)$ 634 | - Drop in the values of $x, y, \text{Depth}$ (and $A$) 635 | - in R: 636 | - build a `data.frame` with $x, y, \text{Depth}, A$ 637 | - use `predict()` 638 | 639 | ```{r echo=TRUE, eval=FALSE} 640 | preds <- predict(my_model, newdat=my_data, type="response") 641 | ``` 642 | 643 | (`se.fit=TRUE` gives a standard error for each prediction) 644 | 645 | Back to the dolphins... 646 | ======================= 647 | type:section 648 | 649 | Where are the dolphins? 
650 | ======================= 651 | 652 | ```{r echo=TRUE} 653 | dolphin_preds <- predict(dolphins_depth_xy, newdata=preddata, 654 | type="response") 655 | ``` 656 | 657 | ```{r fig.width=20} 658 | p <- ggplot() + 659 | grid_plot_obj(dolphin_preds, "N", pred.polys) + 660 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 661 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) + 662 | geom_point(aes(x, y, size=count), 663 | data=mexdolphins[mexdolphins$count>0,], 664 | colour="red", alpha=I(0.7)) + 665 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), xlim = range(mexdolphins$x)) + 666 | scale_fill_viridis() + 667 | labs(fill="Predicted\ndensity", x="x", y="y", size="Count") + 668 | theme_minimal() 669 | print(p) 670 | ``` 671 | 672 | (`ggplot2` code included in the slide source) 673 | 674 | Prediction summary 675 | ================== 676 | 677 | - Evaluate the fitted model at a given point 678 | - Can evaluate many at once (`data.frame`) 679 | - Don't forget the `type=...` argument! 680 | - Obtain per-prediction standard error with `se.fit` 681 | 682 | What about uncertainty? 683 | ======================== 684 | type:section 685 | 686 | Without uncertainty, we're not doing statistics 687 | ======================== 688 | type:section 689 | 690 | Where does uncertainty come from? 691 | ================================= 692 | 693 | - $\boldsymbol{\beta}$: uncertainty in the spline parameters 694 | - $\boldsymbol{\lambda}$: uncertainty in the smoothing parameter 695 | 696 | - (Traditionally we've only addressed the former) 697 | - (New tools let us address the latter...) 698 | 699 | 700 | Parameter uncertainty 701 | ======================= 702 | 703 | From theory: 704 | 705 | $$ 706 | \boldsymbol{\beta} \sim N(\hat{\boldsymbol{\beta}}, \mathbf{V}_\boldsymbol{\beta}) 707 | $$ 708 | 709 | (*caveat: the normality is only* **approximate** *for non-normal response*) 710 | 711 | 712 | **What does this mean?** Variance for each parameter. 713 | 714 | In `mgcv`: `vcov(model)` returns $\mathbf{V}_\boldsymbol{\beta}$. 715 | 716 | 717 | What can we do this this? 718 | =========================== 719 | 720 | - confidence intervals in `plot` 721 | - standard errors using `se.fit` 722 | - derived quantities? (see bibliography) 723 | 724 | blah 725 | ==== 726 | title:none 727 | type:section 728 | 729 | 730 | 731 | 732 | 733 | The lpmatrix, magic, etc 734 | ============================== 735 | 736 | For regular predictions: 737 | 738 | $$ 739 | \hat{\boldsymbol{\eta}}_p = L_p \hat{\boldsymbol{\beta}} 740 | $$ 741 | 742 | form $L_p$ using the prediction data, evaluating basis functions as we go. 743 | 744 | (Need to apply the link function to $\hat{\boldsymbol{\eta}}_p$) 745 | 746 | But the $L_p$ fun doesn't stop there... 747 | 748 | [[mathematics intensifies]] 749 | ============================ 750 | type:section 751 | 752 | Variance and lpmatrix 753 | ====================== 754 | 755 | To get variance on the scale of the linear predictor: 756 | 757 | $$ 758 | V_{\hat{\boldsymbol{\eta}}} = L_p^\text{T} V_\hat{\boldsymbol{\beta}} L_p 759 | $$ 760 | 761 | pre-/post-multiplication shifts the variance matrix from parameter space to linear predictor-space. 
762 | 763 | (Can then pre-/post-multiply by derivatives of the link to put variance on response scale) 764 | 765 | Simulating parameters 766 | ====================== 767 | 768 | - $\boldsymbol{\beta}$ has a distribution, we can simulate 769 | 770 | ```{r paramsim, results="hide"} 771 | library(mvtnorm) 772 | 773 | # get the Lp matrix 774 | Lp <- predict(dolphins_depth_xy, newdata=preddata, type="lpmatrix") 775 | 776 | # how many realisations do we want? 777 | frames <- 100 778 | 779 | # generate the betas from the GAM "posterior" 780 | betas <- rmvnorm(frames, coef(dolphins_depth_xy), vcov(dolphins_depth_xy)) 781 | 782 | 783 | # use a function to get animation to play nice... 784 | anim_map <- function(){ 785 | # loop to make plots 786 | for(frame in 1:frames){ 787 | 788 | # make the prediction 789 | preddata$preds <- preddata$area * exp(Lp%*%betas[frame,]) 790 | 791 | # plot it (using viridis) 792 | p <- ggplot() + 793 | grid_plot_obj(preddata$preds, "N", pred.polys) + 794 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) + 795 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 796 | xlim = range(mexdolphins$x)) + 797 | scale_fill_viridis(limits=c(0,200)) + 798 | labs(fill="Predicted\ndensity",x="x",y="y",size="Count") + 799 | theme_minimal() 800 | 801 | print(p) 802 | } 803 | } 804 | 805 | # make the animation! 806 | saveGIF(anim_map(), "uncertainty.gif", outdir = "new", interval = 0.15, ani.width = 800, ani.height = 400) 807 | ``` 808 | 809 | ![Animation of uncertainty](uncertainty.gif) 810 | 811 | Uncertainty in smoothing parameter 812 | ================================== 813 | 814 | - Recent work by Simon Wood 815 | - "smoothing parameter uncertainty corrected" version of $V_\hat{\boldsymbol{\beta}}$ 816 | - In a fitted model, we have: 817 | - `$Vp` what we got with `vcov` 818 | - `$Vc` the corrected version 819 | - Still experimental 820 | 821 | Variance summary 822 | ================ 823 | 824 | - Everything comes from variance of parameters 825 | - Need to re-project/scale them to get the quantities we need 826 | - `mgcv` does most of the hard work for us 827 | - Fancy stuff possible with a little maths 828 | - Can include uncertainty in the smoothing parameter too 829 | 830 | Okay, that was a lot of information 831 | =================================== 832 | type:section 833 | 834 | Summary 835 | ======= 836 | 837 | - GAMs are GLMs plus some extra wiggles 838 | - Need to make sure things are *just wiggly enough* 839 | - Basis + penalty is the way to do this 840 | - Fitting looks like `glm` with extra `s()` terms 841 | - Most stuff comes down to matrix algebra, that `mgcv` sheilds you from 842 | - To do fancy stuff, get inside the matrices 843 | 844 | COFFEE 845 | ====== 846 | type:section -------------------------------------------------------------------------------- /slides/02-model_checking.Rpres: -------------------------------------------------------------------------------- 1 | Model checking 2 | ============================ 3 | author: Eric J Pedersen 4 | date: August 6th, 2016 5 | css: custom.css 6 | transition: none 7 | 8 | 9 | Outline 10 | ================= 11 | So you have a GAM: 12 | - How do you know you have the right degrees of freedom? `gam.check()` 13 | - Diagnosing model issues: `gam.check()` part 2 14 | - Do you have the right error model? 
Randomized quantile residuals 15 | - When covariates aren't independent: estimating concurvity 16 | 17 | 18 | Packages you'll need to follow along: 19 | ====================================== 20 | ```{r setup, include=TRUE} 21 | 22 | library(mgcv) 23 | library(magrittr) 24 | library(ggplot2) 25 | library(dplyr) 26 | library(statmod) 27 | source("quantile_resid.R") 28 | 29 | ``` 30 | 31 | ```{r pres_setup, include =F} 32 | library(knitr) 33 | opts_chunk$set(cache=TRUE, echo=FALSE,fig.align="center") 34 | ``` 35 | 36 | 37 | GAMs are models too 38 | ==================== 39 | With all models, how accurate your predictions will be depends on how good the model is 40 | 41 | ```{r misspecify,fig.width=15, fig.height=7} 42 | set.seed(15) 43 | model_list = c("right model", 44 | "wrong distribution", 45 | "heteroskedasticity", 46 | "dependent data", 47 | "wrong functional form") 48 | n = 60 49 | sigma=1 50 | x = seq(-1,1, length=n) 51 | model_data = as.data.frame(expand.grid( x=x,model=model_list)) 52 | model_data$y = 5*model_data$x^2 + 2*model_data$x 53 | for(i in model_list){ 54 | if(i == "right model"){ 55 | model_data[model_data$model==i, "y"] = model_data[model_data$model==i, "y"]+ 56 | rnorm(n,0, sigma) 57 | } else if(i == "wrong distribution"){ 58 | model_data[model_data$model==i, "y"] = model_data[model_data$model==i, "y"]+ 59 | rt(n,df = 3)*sigma 60 | } else if(i == "heteroskedasticity"){ 61 | model_data[model_data$model==i, "y"] = model_data[model_data$model==i, "y"]+ 62 | rnorm(n,0, sigma*10^(model_data[model_data$model==i, "x"])) 63 | } else if(i == "dependent data"){ 64 | model_data[model_data$model==i, "y"] = model_data[model_data$model==i, "y"]+ 65 | arima.sim(model = list(ar=c(.7)),n = n,sd=sigma) 66 | } else if(i=="wrong functional form") { 67 | model_data[model_data$model==i, "y"] = model_data[model_data$model==i, "y"]+ 68 | rnorm(n,0, sigma) + ifelse(model_data[model_data$model==i, "x"]>0, 5,-5) 69 | } 70 | } 71 | ggplot(aes(x,y), data= model_data)+ 72 | geom_point()+ 73 | geom_line(color=ifelse(model_data$model=="dependent data", "black",NA))+ 74 | facet_wrap(~model)+ 75 | geom_smooth(method=gam, formula = y~s(x,k=12),method.args = list(method="REML"))+ 76 | theme_bw()+ 77 | theme(strip.text = element_text(size=20)) 78 | ``` 79 | 80 | 81 | So how do we test how well our model fits? 82 | =========================================== 83 | type:section 84 | 85 | 86 | Examples: 87 | ============================ 88 | 89 | ```{r sims, include=TRUE,echo=TRUE} 90 | set.seed(2) 91 | n = 400 92 | x1 = rnorm(n) 93 | x2 = rnorm(n) 94 | y_val =1 + 2*cos(pi*x1) + 2/(1+exp(-5*(x2))) 95 | y_norm = y_val + rnorm(n, 0, 0.5) 96 | y_negbinom = rnbinom(n, mu = exp(y_val),size=10) 97 | y_binom = rbinom(n,1,prob = exp(y_val)/(1+exp(y_val))) 98 | ``` 99 | 100 | ```{r sims_plot,fig.width=15,fig.align="center"} 101 | layout(matrix(1:6, ncol=3)) 102 | plot(x1,y_norm,cex.lab=2,cex.axis=2) 103 | plot(x2,y_norm,cex.lab=2,cex.axis=2) 104 | plot(x1,y_negbinom,cex.lab=2,cex.axis=2) 105 | plot(x2,y_negbinom,cex.lab=2,cex.axis=2) 106 | plot(x1,y_binom,cex.lab=2,cex.axis=2) 107 | plot(x2,y_binom,cex.lab=2,cex.axis=2) 108 | layout(1) 109 | ``` 110 | 111 | 112 | gam.check() part 1: do you have the right functional form? 113 | ============================= 114 | type:section 115 | 116 | 117 | How well does the model fit? 118 | ============================= 119 | - Many choices: k, family, type of smoother, ... 120 | - How do we assess how well our model fits? 
121 | 122 | 123 | 124 | Basis size (k) 125 | ============== 126 | 127 | - Set `k` per term 128 | - e.g. `s(x, k=10)` or `s(x, y, k=100)` 129 | - Penalty removes "extra" wigglyness 130 | - *up to a point!* 131 | - (But computation is slower with bigger `k`) 132 | 133 | 134 | Checking basis size 135 | ==================== 136 | 137 | ```{r gam_check_norm1, fig.keep="none", include=TRUE,echo=TRUE, fig.width=15,fig.align="center"} 138 | norm_model_1 = gam(y_norm~s(x1,k=4)+s(x2,k=4),method= "REML") 139 | gam.check(norm_model_1) 140 | ``` 141 | 142 | Checking basis size 143 | ==================== 144 | 145 | ```{r gam_check_norm2, fig.keep="none", include=TRUE,echo=TRUE, fig.width=15,fig.align="center"} 146 | norm_model_2 = gam(y_norm~s(x1,k=12)+s(x2,k=4),method= "REML") 147 | gam.check(norm_model_2) 148 | ``` 149 | 150 | Checking basis size 151 | ==================== 152 | 153 | ```{r gam_check_norm3, fig.keep="none", include=TRUE,echo=TRUE, fig.width=15,fig.align="center"} 154 | norm_model_3 = gam(y_norm~s(x1,k=12)+s(x2,k=12),method= "REML") 155 | gam.check(norm_model_3) 156 | ``` 157 | 158 | Checking basis size 159 | ==================== 160 | 161 | ```{r gam_check_norm4, include=TRUE,echo=TRUE, fig.width=12, fig.height=6,fig.align="center"} 162 | layout(matrix(1:6,ncol=2,byrow = T)) 163 | plot(norm_model_1);plot(norm_model_2);plot(norm_model_3) 164 | layout(1) 165 | ``` 166 | 167 | 168 | 169 | Using gam.check() part 2: visual checks 170 | ============================= 171 | type:section 172 | 173 | 174 | gam.check() plots 175 | ============================= 176 | 177 | `gam.check()` creates 4 plots: 178 | 179 | 1. Quantile-quantile plots of residuals. If the model is right, should follow 1-1 line 180 | 181 | 2. Histogram of residuals 182 | 183 | 3. Residuals vs. linear predictor 184 | 185 | 4. Observed vs. fitted values 186 | 187 | `gam.check()` uses deviance residuals by default 188 | 189 | 190 | gam.check() plots: Gaussian data, Gaussian model 191 | ============================= 192 | 193 | 194 | ```{r gam_check_plots1, include=T,echo=TRUE, results="hide", fig.width=12, fig.height=6,fig.align="center"} 195 | norm_model = gam(y_norm~s(x1,k=12)+s(x2,k=12),method= "REML") 196 | gam.check(norm_model) 197 | ``` 198 | 199 | 200 | gam.check() plots: negative binomial data, Poisson model 201 | ============================= 202 | 203 | 204 | ```{r gam_check_plots2, include=T,echo=TRUE, results="hide", fig.width=12, fig.height=6,fig.align="center"} 205 | pois_model = gam(y_negbinom~s(x1,k=12)+s(x2,k=12),family=poisson,method= "REML") 206 | gam.check(pois_model) 207 | ``` 208 | 209 | gam.check() plots: negative binomial data, negative binomial model 210 | ============================= 211 | 212 | 213 | ```{r gam_check_plots3, include=T,echo=TRUE, results="hide", fig.width=12, fig.height=6,fig.align="center"} 214 | negbin_model = gam(y_negbinom~s(x1,k=12)+s(x2,k=12),family=nb,method= "REML") 215 | gam.check(negbin_model) 216 | ``` 217 | 218 | 219 | 220 | 221 | Exercises 222 | ============= 223 | 1. Work with the `y_negbinom` data. Try fitting both Poisson and negative 224 | binomial model, with different degrees of freedom. Using model summaries and 225 | `gam.check`, determine which model requires more degrees of freedom to fit well. 226 | How do the fitted functions vary between the two? 227 | 228 | 2. Using the `y_binomial` data, fit a smooth model with `family=binomial`, and 229 | use `gam.check()` to determine the appropriate degrees of freedom. 
What do the 230 | `gam.check()` plots tell you about model fit? 231 | 232 | Fixing Residuals 233 | ================= 234 | type:section 235 | 236 | What are residuals? 237 | ==================== 238 | 239 | - Generally residuals = observed value - fitted value 240 | - BUT hard to see patterns in these "raw" residuals 241 | - Need to standardise -- **deviance residuals** 242 | - Residual sum of squares $\Rightarrow$ linear model 243 | - deviance $\Rightarrow$ GAM 244 | - Expect these residuals $\sim N(0,1)$ 245 | 246 | 247 | 248 | Shortcomings 249 | ============= 250 | 251 | - `gam.check` left side can be helpful 252 | - Right side is victim of artifacts 253 | - Need an alternative 254 | - "Randomised quantile residuals" (*experimental*) 255 | - `rqresiduals` 256 | - Exactly normal residuals ... if the model is right! 257 | 258 | 259 | Using randomized quantile residuals 260 | ============= 261 | 262 | 263 | ```{r rqresid1, include=T,echo=TRUE, results="hide", fig.width=12, fig.height=6,fig.align="center"} 264 | pois_model = gam(y_negbinom~s(x1,k=12)+s(x2,k=12),family=poisson,method= "REML") 265 | pois_resid = residuals(pois_model, type="deviance") 266 | pois_rqresid = rqresiduals(pois_model) 267 | layout(matrix(1:2, nrow=1)) 268 | plot(pois_model$linear.predictors,pois_resid) 269 | plot(pois_model$linear.predictors,pois_rqresid) 270 | ``` 271 | 272 | Using randomized quantile residuals 273 | ============= 274 | 275 | 276 | ```{r rqresid2, include=T,echo=TRUE, results="hide", fig.width=12, fig.height=6,fig.align="center"} 277 | negbin_model = gam(y_negbinom~s(x1,k=12)+s(x2,k=12),family=nb,method= "REML") 278 | negbin_rqresid = rqresiduals(negbin_model) 279 | layout(matrix(1:2, nrow=1)) 280 | plot(pois_model$linear.predictors,pois_rqresid) 281 | plot(negbin_model$linear.predictors,negbin_rqresid) 282 | ``` 283 | 284 | 285 | Exercise 286 | ========= 287 | Use the `rqresiduals` function to test the model residuals for patterns 288 | from your binomial model from the previous exercise 289 | 290 | Concurvity 291 | ============================= 292 | type:section 293 | 294 | What is concurvity? 
295 | ====================== 296 | 297 | - Nonlinear measure, similar to co-linearity 298 | 299 | - Measures, for each smooth term, how well this term could be approximated by 300 | - `concurvity(model, full=TRUE)`: some combination of all other smooth terms 301 | - `concurvity(model, full=FALSE)`: Each of the other smooth terms in the model 302 | (useful for identifying which terms are causing issues) 303 | 304 | A demonstration 305 | ============================= 306 | 307 | 308 | ```{r concurve1,fig.width=12, fig.height=5} 309 | library(mgcv) 310 | set.seed(1) 311 | n=200 312 | alpha = 0 313 | x1_cc = rnorm(n) 314 | mean_constant = alpha 315 | var_constant = alpha^2 316 | x2_cc = alpha*x1_cc^2 - mean_constant + rnorm(n,0,1-var_constant) 317 | par(mfrow=c(1,3)) 318 | plot(x1_cc,x2_cc) 319 | y = 3 + cos(pi*x1_cc) + 1/(1+exp(-5*(x2_cc))) 320 | m1 = gam(y~s(x1_cc)+s(x2_cc),method= "REML") 321 | plot(m1,scale=0) 322 | print("concurvity(m1)") 323 | print(round(concurvity(m1),2)) 324 | ``` 325 | 326 | 327 | 328 | A demonstration 329 | ============================= 330 | 331 | 332 | ```{r concurve2,fig.width=12, fig.height=5} 333 | set.seed(1) 334 | n=200 335 | alpha = 0.33 336 | mean_constant = alpha 337 | var_constant = alpha^2 338 | x1_cc = rnorm(n) 339 | x2_cc = alpha*x1_cc^2-mean_constant + rnorm(n,0,1-var_constant) 340 | par(mfrow=c(1,3)) 341 | plot(x1_cc,x2_cc) 342 | y = 3 + cos(pi*x1_cc) + 1/(1+exp(-5*(x2_cc))) 343 | m1 = gam(y~s(x1_cc)+s(x2_cc),method= "REML") 344 | plot(m1,scale=0) 345 | print("concurvity(m1, full=TRUE)") 346 | print(round(concurvity(m1),2)) 347 | ``` 348 | 349 | 350 | A demonstration 351 | ============================= 352 | 353 | ```{r concurve3,fig.width=12, fig.height=5} 354 | library(mgcv) 355 | set.seed(1) 356 | n=200 357 | max_val = sqrt(pi/(pi-2)) 358 | alpha = 0.66 359 | x1_cc = rnorm(n) 360 | mean_constant = alpha 361 | var_constant = alpha^2 362 | x2_cc = alpha*x1_cc^2-mean_constant + rnorm(n,0,1-var_constant) 363 | par(mfrow=c(1,3)) 364 | plot(x1_cc,x2_cc) 365 | y = 3 + cos(pi*x1_cc) + 1/(1+exp(-5*(x2_cc))) 366 | m1 = gam(y~s(x1_cc)+s(x2_cc),method= "REML") 367 | plot(m1,scale=0) 368 | print("concurvity(m1, full=TRUE)") 369 | print(round(concurvity(m1),2)) 370 | ``` 371 | 372 | 373 | A demonstration 374 | ============================= 375 | 376 | 377 | 378 | ```{r concurve4,fig.width=12, fig.height=5} 379 | set.seed(1) 380 | alpha = 1 381 | mean_constant = alpha 382 | var_constant = alpha^2 383 | x2_cc = alpha*x1_cc^2-mean_constant + rnorm(n,0,1-var_constant) 384 | par(mfrow=c(1,3)) 385 | plot(x1_cc,x2_cc) 386 | y = 3 + cos(pi*x1_cc) + 1/(1+exp(-5*(x2_cc))) 387 | m1 = gam(y~s(x1_cc)+s(x2_cc),method= "REML") 388 | plot(m1,scale=0) 389 | print("concurvity(m1, full=TRUE)") 390 | print(round(concurvity(m1),2)) 391 | par(mfrow=c(1,1)) 392 | ``` 393 | 394 | Concurvity: things to remember 395 | ============================== 396 | - Can make your model unstable to small changes 397 | - `cor(data)` not sufficient: use the `concurvity(model)` function 398 | - Not always obvious from plots of smooths!! 399 | 400 | 401 | Overall 402 | ========= 403 | Make sure to test your model! GAMs are powerful, but with great power... 404 | 405 | You should at least: 406 | 407 | 1. Check if your smooths are sufficiently smooth 408 | 409 | 2. Test if you have the right distribution 410 | 411 | 3. Make sure there's no patterns left in your data 412 | 413 | 4. If you have time series, grouped, spatial, etc. 
data, check for dependencies 414 | 415 | 416 | We'll get into testing for dependence later on in the extended examples. 417 | 418 | -------------------------------------------------------------------------------- /slides/03-model-selection.Rpres: -------------------------------------------------------------------------------- 1 | GAMs: Model Selection 2 | ============================ 3 | author: David L Miller, Eric Pedersen, and Gavin L Simpson 4 | date: August 6th, 2016 5 | css: custom.css 6 | transition: none 7 | 8 | Overview 9 | ========= 10 | 11 | - Model selection 12 | - Shrinkage smooths 13 | - Shrinkage via double penalty (`select = TRUE`) 14 | - Confidence intervals for smooths 15 | - *p* values 16 | - `anova()` 17 | - AIC 18 | 19 | ```{r setup, include=FALSE} 20 | library("knitr") 21 | library("viridis") 22 | library("ggplot2") 23 | library("mgcv") 24 | library("cowplot") 25 | theme_set(theme_bw()) 26 | opts_chunk$set(cache=TRUE, echo=FALSE) 27 | ``` 28 | 29 | Model selection 30 | =============== 31 | type:section 32 | 33 | Model selection 34 | =============== 35 | 36 | Model (or variable) selection --- and important area of theoretical and applied interest 37 | 38 | - In statistics we aim for a balance between *fit* and *parsimony* 39 | - In applied research we seek the set of covariates with strongest effects on $y$ 40 | 41 | We seek a subset of covariates that improves *interpretability* and *prediction accuracy* 42 | 43 | Shrinkage & additional penalties 44 | ================================ 45 | type:section 46 | 47 | Shrinkage & additional penalties 48 | ================================ 49 | 50 | Smoothing parameter estimation allows selection of a wide range of potentially complex functions for smooths... 51 | 52 | But, cannot remove a term entirely from the model because the penalties used act only on the *range space* of a spline basis. The *null space* of the basis is unpenalised. 53 | 54 | - **Null space** --- the basis functions that are smooth (constant, linear) 55 | - **Range space** --- the basis functions that are wiggly 56 | 57 | Shrinkage & additional penalties 58 | ================================ 59 | 60 | **mgcv** has two ways to penalize the null space, i.e. to do selection 61 | 62 | - *double penalty approach* via `select = TRUE` 63 | - *shrinkage approach* via special bases for thin plate and cubic splines 64 | 65 | Other shrinkage/selection approaches are available 66 | 67 | Double-penalty shrinkage 68 | ======================== 69 | 70 | $\mathbf{S}_j$ is the smoothing penalty matrix & can be decomposed as 71 | 72 | $$ 73 | \mathbf{S}_j = \mathbf{U}_j\mathbf{\Lambda}_j\mathbf{U}_j^{T} 74 | $$ 75 | 76 | where $\mathbf{U}_j$ is a matrix of eigenvectors and $\mathbf{\Lambda}_j$ a diagonal matrix of eigenvalues (i.e. this is an eigen decomposition of $\mathbf{S}_j$). 77 | 78 | $\mathbf{\Lambda}_j$ contains some **0**s due to the spline basis null space --- no matter how large the penalty $\lambda_j$ might get no guarantee a smooth term will be suppressed completely. 79 | 80 | To solve this we need an extra penalty... 
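
Not from the slides: a minimal sketch of what those zero eigenvalues look like in practice. It uses `mgcv::smoothCon()` to build a default thin plate basis for a made-up covariate `x`, then counts the numerically-zero eigenvalues of its penalty matrix; these span the unpenalised null space (constant and linear functions for a one-dimensional thin plate spline).

```{r null-space-sketch, echo = TRUE, eval = FALSE}
library(mgcv)

# made-up covariate, default thin plate basis with k = 10
set.seed(1)
df <- data.frame(x = runif(100))
sm <- smoothCon(s(x, k = 10), data = df)[[1]]

# eigendecomposition of the penalty matrix S_j
ev <- eigen(sm$S[[1]], symmetric = TRUE)$values
round(ev, 4)

# numerically-zero eigenvalues span the null space
sum(ev < max(ev) * 1e-8)  # should be 2: constant + linear
```

However large $\lambda_j$ gets, those two basis directions are never penalised, which is exactly why an extra penalty (or a shrinkage basis) is needed to remove a term from the model entirely.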
81 | 82 | Double-penalty shrinkage 83 | ======================== 84 | 85 | Create a second penalty matrix from $\mathbf{U}_j$, considering only the matrix of eigenvectors associated with the zero eigenvalues 86 | 87 | $$ 88 | \mathbf{S}_j^{*} = \mathbf{U}_j^{*}\mathbf{U}_j^{*T} 89 | $$ 90 | 91 | Now we can fit a GAM with two penalties of the form 92 | 93 | $$ 94 | \lambda_j \mathbf{\beta}^T \mathbf{S}_j \mathbf{\beta} + \lambda_j^{*} \mathbf{\beta}^T \mathbf{S}_j^{*} \mathbf{\beta} 95 | $$ 96 | 97 | Which implies two sets of penalties need to be estimated. 98 | 99 | In practice, add `select = TRUE` to your `gam()` call 100 | 101 | Shrinkage 102 | ========= 103 | 104 | The double penalty approach requires twice as many smoothness parameters to be estimated. An alternative is the shrinkage approach, where $\mathbf{S}_j$ is replaced by 105 | 106 | 107 | $$ 108 | \tilde{\mathbf{S}}_j = \mathbf{U}_j\tilde{\mathbf{\Lambda}}_j\mathbf{U}_j^{T} 109 | $$ 110 | 111 | where $\tilde{\mathbf{\Lambda}}_j$ is as before except the zero eigenvalues are set to some small value $\epsilon$. 112 | 113 | This allows the null space terms to be shrunk by the standard smoothing parameters. 114 | 115 | Use `s(..., bs = "ts")` or `s(..., bs = "cs")` in **mgcv** 116 | 117 | Empirical Bayes...? 118 | =================== 119 | 120 | $\mathbf{S}_j$ can be viewed as prior precision matrices and $\lambda_j$ as improper Gaussian priors on the spline coefficients. 121 | 122 | The impropriety derives from $\mathbf{S}_j$ not being of full rank (zeroes in $\mathbf{\Lambda}_j$). 123 | 124 | Both the double penalty and shrinkage smooths remove the impropriety from the Gaussian prior 125 | 126 | Empirical Bayes...? 127 | =================== 128 | 129 | - **Double penalty** --- makes no assumption as to how much to shrink the null space. 
This is determined from the data via estimation of $\lambda_j^{*}$ 130 | - **Shrinkage smooths** --- assumes null space should be shrunk less than the wiggly part 131 | 132 | Marra & Wood (2011) show that the double penalty and the shrinkage smooth approaches 133 | 134 | - performed significantly better than alternatives in terms of *predictive ability*, and 135 | - performed as well as alternatives in terms of variable selection 136 | 137 | Example 138 | ======= 139 | left: 60% 140 | ```{r setup-shrinkage-example, echo = FALSE, include = FALSE} 141 | ## an example of automatic model selection via null space penalization 142 | set.seed(3) 143 | n <- 200 144 | dat <- gamSim(1, n=n, scale=.15, dist="poisson") ## simulate data 145 | dat <- transform(dat, x4 = runif(n, 0, 1), x5 = runif(n, 0, 1)) ## spurious 146 | b <- gam(y ~ s(x0) + s(x1) + s(x2) + s(x3) + s(x4) + s(x5), data = dat, 147 | family=poisson, select = TRUE, method = "REML") 148 | #summary(b) 149 | #plot(b, pages = 1) 150 | ``` 151 | ```{r shrinkage-example-truth} 152 | p1 <- ggplot(dat, aes(x = x0, y = f0)) + geom_line() 153 | p2 <- ggplot(dat, aes(x = x1, y = f1)) + geom_line() 154 | p3 <- ggplot(dat, aes(x = x2, y = f2)) + geom_line() 155 | p4 <- ggplot(dat, aes(x = x3, y = f3)) + geom_line() 156 | plot_grid(p1, p2, p3, p4, ncol = 2, align = "vh", labels = paste0("x", 1:4)) 157 | ``` 158 | 159 | *** 160 | 161 | - Simulate Poisson counts 162 | - 4 known functions 163 | - 2 spurious covariates 164 | 165 | Example 166 | ======= 167 | ```{r shrinkage-example-summary} 168 | summary(b) 169 | ``` 170 | 171 | Example 172 | ======= 173 | ```{r shrinkage-example-plot, fig.width = 16, fig.height = 9} 174 | plot(b, scheme = 1, pages = 1) 175 | ``` 176 | 177 | Confidence intervals for smooths 178 | ================================ 179 | type:section 180 | 181 | Confidence intervals for smooths 182 | ================================ 183 | 184 | `plot.gam()` produces approximate 95% intervals (at +/- 2 SEs) 185 | 186 | What do these intervals represent? 187 | 188 | Nychka (1988) showed that standard Wahba/Silverman type Bayesian confidence intervals on smooths had good **across-the-function** frequentist coverage properties. 189 | 190 | Confidence intervals for smooths 191 | ================================ 192 | 193 | Marra & Wood (2012) extended this theory to the generalised case and explain where the coverage properties failed: 194 | 195 | *Musn't over-smooth too much, which happens when $\lambda_j$ are over-estimated* 196 | 197 | Two situations where this might occur 198 | 199 | 1. where true effect is almost in the penalty null space, $\hat{\lambda}_j \rightarrow \infty$ 200 | 2. where $\hat{\lambda}_j$ difficult to estimate due to highly correlated covariates 201 | - if 2 correlated covariates have different amounts of wiggliness, estimated effects can have degree of smoothness *reversed* 202 | 203 | Don't over-smooth 204 | ================= 205 | 206 | > In summary, we have shown that Bayesian componentwise variable width intervals... for the smooth components of an additive model **should achieve close to nominal *across-the-function* coverage probability**, provided only that we do not over-smooth so heavily... Beyond this requirement not to oversmooth too heavily, the results appear to have rather weak dependence on smoothing parameter values, suggesting that the neglect of smoothing parameter variability should not significantly degrade interval performance. 
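
Tying back to the shrinkage slides above: a minimal sketch (assuming `dat` is the Poisson data simulated with `gamSim()` in the earlier example, with spurious covariates `x4` and `x5`) comparing the double-penalty approach against shrinkage bases. Both should drive the effective degrees of freedom of the spurious smooths towards zero.

```{r shrinkage-comparison-sketch, echo = TRUE, eval = FALSE}
# double penalty: an extra smoothing parameter per term
b_dp <- gam(y ~ s(x0) + s(x1) + s(x2) + s(x3) + s(x4) + s(x5),
            data = dat, family = poisson, method = "REML", select = TRUE)

# shrinkage thin plate bases: null space shrunk by the usual lambda_j
b_ts <- gam(y ~ s(x0, bs = "ts") + s(x1, bs = "ts") + s(x2, bs = "ts") +
                s(x3, bs = "ts") + s(x4, bs = "ts") + s(x5, bs = "ts"),
            data = dat, family = poisson, method = "REML")

# EDFs of the spurious terms should be close to zero under both
round(summary(b_dp)$edf, 2)
round(summary(b_ts)$edf, 2)
```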
207 | 208 | Confidence intervals for smooths 209 | ================================ 210 | 211 | Marra & Wood (2012) suggested a solution to situation 1., namely true functions close to the penalty null space. 212 | 213 | Smooths are normally subject to *identifiability* constraints (centred), which leads to zero variance where the estimated function crosses the zero line. 214 | 215 | Instead, compute intervals for $j$ th smooth as if it alone had the intercept; identifiability constraints go on the other smooth terms. 216 | 217 | Use `seWithMean = TRUE` in call to `plot.gam()` 218 | 219 | Example 220 | ======= 221 | 222 | ```{r setup-confint-example, fig = TRUE, fig.width = 11, fig.height = 10, results = "hide"} 223 | library(mgcv) 224 | set.seed(0) 225 | ## fake some data... 226 | f1 <- function(x) {exp(2 * x)} 227 | f2 <- function(x) { 228 | 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10 229 | } 230 | f3 <- function(x) {x*0} 231 | 232 | n<-200 233 | sig2 <- 12 234 | x0 <- rep(1:4,50) 235 | x1 <- runif(n, 0, 1) 236 | x2 <- runif(n, 0, 1) 237 | x3 <- runif(n, 0, 1) 238 | e <- rnorm(n, 0, sqrt(sig2)) 239 | y <- 2*x0 + f1(x1) + f2(x2) + f3(x3) + e 240 | x0 <- factor(x0) 241 | 242 | ## fit and plot... 243 | b <- gam(y ~ x0 + s(x1) + s(x2) + s(x3)) 244 | 245 | op <- par(mar = c(4,4,1,1) + 0.1) 246 | layout(matrix(1:9, ncol = 3, byrow = TRUE)) 247 | curve(f1) 248 | curve(f2) 249 | curve(f3) 250 | plot(b, shade=TRUE) 251 | plot(b, shade = TRUE, seWithMean = TRUE) ## better coverage intervals 252 | layout(1) 253 | par(op) 254 | ``` 255 | 256 | p values for smooths 257 | ==================== 258 | type:section 259 | 260 | p values for smooths 261 | ==================== 262 | 263 | ...are approximate: 264 | 265 | 1. they don't really account for the estimation of $\lambda_j$ --- treated as known 266 | 2. rely on asymptotic behaviour --- they tend towards being right as sample size tends to $\infty$ 267 | 268 | Also, *p* values in `summary.gam()` have changed a lot over time --- all options except current default are deprecated as of `v1.18-13`. 269 | 270 | The approach described in Wood (2006) is "*no longer recommended*"! 271 | 272 | p values for smooths 273 | ==================== 274 | 275 | ...are a test of **zero-effect** of a smooth term 276 | 277 | Default *p* values rely on theory of Nychka (1988) and Marra & Wood (2012) for confidence interval coverage. 278 | 279 | If the Bayesian CI have good across-the-function properties, Wood (2013a) showed that the *p* values have 280 | 281 | - almost the correct null distribution 282 | - reasonable power 283 | 284 | Test statistic is a form of $\chi^2$ statistic, but with complicated degrees of freedom. 285 | 286 | p values for unpenalized smooths 287 | ================================ 288 | 289 | The results of Nychka (1988) and Marra & Wood (2012) break down if smooth terms are unpenalized. 290 | 291 | This include i.i.d. Gaussian random effects, (e.g. `bs = "re"`.) 292 | 293 | Wood (2013b) proposed instead a test based on a likelihood ratio statistic: 294 | 295 | - the reference distribution used is appropriate for testing a $\mathrm{H}_0$ on the boundary of the allowed parameter space... 296 | - ...in other words, it corrects for a $\mathrm{H}_0$ that a variance term is zero. 297 | 298 | p values for smooths 299 | ==================== 300 | 301 | have the best behaviour when smoothness selection is done using **ML**, then **REML**. 
302 | 303 | Neither of these are the default, so remember to use `method = "ML"` or `method = "REML"` as appropriate 304 | 305 | p values for parametric terms 306 | ============================= 307 | 308 | ...are based on Wald statistics using the Bayesian covariance matrix for the coefficients. 309 | 310 | This is the "right thing to do" when there are random effects terms present and doesn't really affect performance if there aren't. 311 | 312 | Hence in most instances you won't need to change the default `freq = FALSE` in `summary.gam()` 313 | 314 | anova() 315 | =========== 316 | type:section 317 | 318 | anova() 319 | =========== 320 | 321 | **mgcv** provides an `anova()` method for `"gam"` objects: 322 | 323 | 1. Single model form: `anova(m1)` 324 | 2. Multi model form: `anova(m1, m2, m3)` 325 | 326 | anova() --- single model form 327 | ============================= 328 | 329 | This differs from `anova()` methods for `"lm"` or `"glm"` objects: 330 | 331 | - the tests are Wald-like tests as described for `summary.gam()` of a $\mathrm{H}_0$ of zero-effect of a smooth term 332 | - these are not *sequential* tests! 333 | 334 | anova() 335 | =========== 336 | 337 | ```{r anova-example-single, echo = TRUE} 338 | b1 <- gam(y ~ x0 + s(x1) + s(x2) + s(x3), method = "REML") 339 | anova(b1) 340 | ``` 341 | 342 | anova() --- multi model form 343 | ============================= 344 | 345 | The multi-model form should really be used with care --- the *p* values are really *approximate* 346 | 347 | ```{r anova-example-multi, echo = TRUE} 348 | b1 <- gam(y ~ s(x0) + s(x1) + s(x2) + s(x3) + s(x4) + s(x5), data = dat, 349 | family=poisson, method = "ML") 350 | b2 <- update(b1, . ~ . - s(x3) - s(x4) - s(x5)) 351 | anova(b2, b1, test = "LRT") 352 | ``` 353 | 354 | For *general smooths* deviance is replaced by $-2\mathcal{L}(\hat{\beta})$ 355 | 356 | AIC for GAMs 357 | ============ 358 | type:section 359 | 360 | AIC for GAMs 361 | ============ 362 | 363 | - Comparison of GAMs by a form of AIC is an alternative frequentist approach to model selection 364 | - Rather than using the marginal likelihood, the likelihood of the $\mathbf{\beta}_j$ *conditional* upon $\lambda_j$ is used, with the EDF replacing $k$, the number of model parameters 365 | - This *conditional* AIC tends to select complex models, especially those with random effects, as the EDF ignores that $\lambda_j$ are estimated 366 | - Wood et al (2015) suggests a correction that accounts for uncertainty in $\lambda_j$ 367 | 368 | $$ 369 | AIC = -2l(\hat{\beta}) + 2\mathrm{tr}(\widehat{\mathcal{I}}V^{'}_{\beta}) 370 | $$ 371 | 372 | AIC 373 | ==================== 374 | 375 | In this example, $x_3$, $x_4$, and $x_5$ have no effects on $y$ 376 | 377 | ```{r aic-example, echo = TRUE} 378 | AIC(b1, b2) 379 | ``` 380 | 381 | References 382 | ========== 383 | - [Marra & Wood (2011) *Computational Statistics and Data Analysis* **55** 2372--2387.](http://doi.org/10.1016/j.csda.2011.02.004) 384 | - [Marra & Wood (2012) *Scandinavian journal of statistics, theory and applications* **39**(1), 53--74.](http://doi.org/10.1111/j.1467-9469.2011.00760.x.) 385 | - [Nychka (1988) *Journal of the American Statistical Association* **83**(404) 1134--1143.](http://doi.org/10.1080/01621459.1988.10478711) 386 | - Wood (2006) *Generalized Additive Models: An Introduction with R*. Chapman and Hall/CRC. 
387 | - [Wood (2013a) *Biometrika* **100**(1) 221--228.](http://doi.org/10.1093/biomet/ass048) 388 | - [Wood (2013b) *Biometrika* **100**(4) 1005--1010.](http://doi.org/10.1093/biomet/ast038) -------------------------------------------------------------------------------- /slides/04-Beyond_the_exponential_family.Rpres: -------------------------------------------------------------------------------- 1 | Beyond the exponential family 2 | ============================ 3 | author: Eric Pedersen, Gavin Simpson, David Miller 4 | date: August 6th, 2016 5 | css: custom.css 6 | transition: none 7 | 8 | 9 | Away from the exponential family 10 | =================================== 11 | incremental: true 12 | 13 | Most glm families (Poisson, Gamma, Gaussian, Binomial) are *exponential families* 14 | 15 | $$ f(x|\theta) \sim exp(\sum_i \eta_i(\theta)T_i(x) - A(\theta))$$ 16 | 17 | - Computationally easy 18 | - Has sufficient statistics: easier to estimate parameter variance 19 | - ... but it doesn't describe everything 20 | - `mgcv` has expanded to cover many new families 21 | - Lets you model a much wider range of scenarios with smooths 22 | 23 | 24 | ```{r setup, include=F} 25 | 26 | library(mgcv) 27 | library(magrittr) 28 | library(ggplot2) 29 | library(dplyr) 30 | library(tidyr) 31 | 32 | ``` 33 | 34 | ```{r pres_setup, include =F} 35 | library(knitr) 36 | opts_chunk$set(cache=TRUE, echo=FALSE,fig.align="center") 37 | ``` 38 | 39 | 40 | What we'll cover 41 | =================================== 42 | 43 | - "Counts": Negative binomial and Tweedie distributions 44 | - Modelling proportions with the Beta distribution 45 | - Robust regression with the Student's t distribution 46 | - Ordered and unorderd categorical data 47 | - Multivariate normal data 48 | - Modelling exta zeros with zero-inflated and adjusted families 49 | 50 | - _NOTE_: All the distributions we're covering here have their own quirks. Read 51 | the help files carefully before using them! 
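
Not from the slides, but worth seeing up front: a minimal sketch (using made-up negative binomial counts) showing that these extended families slot into `gam()` exactly like the standard exponential-family ones; only the `family` argument changes.

```{r extended-family-sketch, echo = TRUE, eval = FALSE}
set.seed(1)
n <- 200
sim <- data.frame(x = runif(n))
sim$y <- rnbinom(n, mu = exp(1 + sin(2 * pi * sim$x)), size = 2)

m_pois <- gam(y ~ s(x), data = sim, family = poisson, method = "REML")
m_nb   <- gam(y ~ s(x), data = sim, family = nb(),    method = "REML")  # negative binomial
m_tw   <- gam(y ~ s(x), data = sim, family = tw(),    method = "REML")  # Tweedie

AIC(m_pois, m_nb, m_tw)  # all three are ordinary "gam" objects
```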
52 | 53 | Modelling "counts" 54 | ================ 55 | type:section 56 | 57 | Counts and count-like things 58 | ============================ 59 | 60 | - Response is a count (not always integer) 61 | - Often, it's mostly zero (that's complicated) 62 | - Could also be catch per unit effort, biomass etc 63 | - Flexible mean-variance relationship 64 | 65 | ![The Count from Sesame Street](images/count.jpg) 66 | 67 | Tweedie distribution 68 | ===================== 69 | 70 | ```{r tweedie} 71 | library(tweedie) 72 | library(RColorBrewer) 73 | 74 | # tweedie 75 | y<-seq(0.01,5,by=0.01) 76 | pows <- seq(1.2, 1.9, by=0.1) 77 | 78 | fymat <- matrix(NA, length(y), length(pows)) 79 | 80 | i <- 1 81 | for(pow in pows){ 82 | fymat[,i] <- dtweedie( y=y, power=pow, mu=2, phi=1) 83 | i <- i+1 84 | } 85 | 86 | plot(range(y), range(fymat), type="n", ylab="Density", xlab="x", cex.lab=1.5, 87 | main="") 88 | 89 | rr <- brewer.pal(8,"Dark2") 90 | 91 | for(i in 1:ncol(fymat)){ 92 | lines(y, fymat[,i], type="l", col=rr[i], lwd=2) 93 | } 94 | ``` 95 | *** 96 | - $\text{Var}\left(\text{count}\right) = \phi\mathbb{E}(\text{count})^q$ 97 | - Common distributions are sub-cases: 98 | - $q=1 \Rightarrow$ Poisson 99 | - $q=2 \Rightarrow$ Gamma 100 | - $q=3 \Rightarrow$ Normal 101 | - We are interested in $1 < q < 2$ 102 | - (here $q = 1.2, 1.3, \ldots, 1.9$) 103 | - `tw()` 104 | 105 | 106 | Negative binomial 107 | ================== 108 | 109 | ```{r negbin} 110 | y<-seq(1,12,by=1) 111 | disps <- seq(0.001, 1, len=10) 112 | 113 | fymat <- matrix(NA, length(y), length(disps)) 114 | 115 | i <- 1 116 | for(disp in disps){ 117 | fymat[,i] <- dnbinom(y, size=disp, mu=5) 118 | i <- i+1 119 | } 120 | 121 | plot(range(y), range(fymat), type="n", ylab="Density", xlab="x", cex.lab=1.5, 122 | main="") 123 | 124 | rr <- brewer.pal(8,"Dark2") 125 | 126 | for(i in 1:ncol(fymat)){ 127 | lines(y, fymat[,i], type="l", col=rr[i], lwd=2) 128 | } 129 | ``` 130 | *** 131 | - $\text{Var}\left(\text{count}\right) =$ $\mathbb{E}(\text{count}) + \kappa \mathbb{E}(\text{count})^2$ 132 | - Estimate $\kappa$ 133 | - Is quadratic relationship a "strong" assumption? 134 | - Similar to Poisson: $\text{Var}\left(\text{count}\right) =\mathbb{E}(\text{count})$ 135 | - `nb()` 136 | 137 | 138 | 139 | 140 | Modelling proportions 141 | ====================== 142 | type:section 143 | 144 | The Beta distribution 145 | ====================== 146 | 147 | ```{r beta-dist} 148 | shape1 <- c(0.2, 1, 5, 1, 3, 1.5) 149 | shape2 <- c(0.2, 3, 1, 1, 1.5, 3) 150 | x <- seq(0.01, 0.99, length = 200) 151 | rr <- brewer.pal(length(shape1), "Dark2") 152 | fymat <- mapply(dbeta, shape1, shape2, MoreArgs = list(x = x)) 153 | matplot(x, fymat, type = "l", col = rr, lwd = 2, lty = "solid") 154 | legend("top", bty = "n", 155 | legend = expression(alpha == 0.2 ~~ beta == 0.2, 156 | alpha == 1.0 ~~ beta == 3.0, 157 | alpha == 5.0 ~~ beta == 1.0, 158 | alpha == 1.0 ~~ beta == 1.0, 159 | alpha == 3.0 ~~ beta == 1.5, 160 | alpha == 1.5 ~~ beta == 3.0), 161 | col = rr, cex = 1.25, lty = "solid", lwd = 2) 162 | ``` 163 | 164 | *** 165 | 166 | - Proportions; continuous, bounded at 0 & 1 167 | - Beta distribution is convenient choice 168 | - Two strictly positive shape parameters, $\alpha$ & $\beta$ 169 | - Has support on $x \in (0,1)$ 170 | - Density at $x = 0$ & $x = 1$ is $\infty$, fudge 171 | - **betareg** package 172 | - `betar()` family in **mgcv** 173 | 174 | Beta or Binomial? 
175 | ================= 176 | 177 | The binomial model also model's proportions --- more specifically it models the number of successes in $m$ trials. If you have data of this form then model the binomial counts as this can yield predicted *counts* if required. 178 | 179 | If you have true percentage or proportion data, say estimated prpotional plant cover in a quadrat, then the beta model is appropriate. 180 | 181 | Also, if all you have is the percentages, the beta model is unlikely to be terribly bad. 182 | 183 | Stereotypic behaviour in captive cheetahs 184 | ========================================= 185 | 186 | To illustrate the use of the `betar()` family in **mgcv** we use a behavioural data set of observations on captive cheetahs. These data are prvided and extensively analysed in Zuur et al () and originate from Quirke et al (2012). 187 | 188 | Stereotypic behaviour in captive cheetahs 189 | ========================================= 190 | 191 | - data collected from nine zoos 192 | - at randomised times of day a random number of scans (videos) of captive cheetah behaviour were recorded and analysed over a period of several months 193 | - presence of stereotypical behaviour was recorded 194 | - all individuals in an enclosure were assessed; where more than 1 individual data were aggregated over individuals to achieve 1 data point per enclosure per sampling occasion 195 | - a number of covariates were also recorded 196 | - data technically a binomial counts but we'll ignore count data and model the proportion of scans showing stereotypical behaviour 197 | 198 | Cheetah: data processing 199 | ======================== 200 | 201 | ```{r cheetah-load-data, echo = TRUE} 202 | cheetah <- read.table("../data/beta-regression/ZooData.txt", header = TRUE) 203 | names(cheetah) 204 | cheetah <- transform(cheetah, Raised = factor(Raised), 205 | Feeding = factor(Feeding), 206 | Oc = factor(Oc), 207 | Other = factor(Other), 208 | Enrichment = factor(Enrichment), 209 | Group = factor(Group), 210 | Sex = factor(Sex, labels = c("Male","Female")), 211 | Zoo = factor(Zoo)) 212 | ``` 213 | 214 | Cheetah: model fitting 215 | ====================== 216 | 217 | ```{r cheetah-model, echo = TRUE} 218 | m <- gam(Proportion ~ s(log(Size)) + s(Visitors) + s(Enclosure) + 219 | s(Vehicle) + s(Age) + s(Zoo, bs = "re") + 220 | Feeding + Oc + Other + Enrichment + Group + Sex, 221 | data = cheetah, family = betar(), method = "REML") 222 | ``` 223 | 224 | Cheetah: model summary 225 | ====================== 226 | title: false 227 | 228 | ```{R cheetah-summary} 229 | summary(m) 230 | ``` 231 | 232 | Cheetah: model smooths 233 | ====================== 234 | 235 | ```{R cheetah-plot-smooths, fig.width = 15, fig.height = 8} 236 | layout(matrix(1:6, ncol = 3, byrow = TRUE)) 237 | plot(m, shade = TRUE, scale = 0, seWithMean = TRUE) 238 | layout(1) 239 | ``` 240 | 241 | Modelling outliers 242 | ==================== 243 | type:section 244 | 245 | The student-t distribution 246 | ============================ 247 | - Models continuous data w/ longer tails than normal 248 | - Far less sensitive to outliers 249 | - Has one extra parameter: df. 
250 | - bigger df: t dist approaches normal 251 | 252 | 253 | 254 | 255 | ```{r tplot,fig.width=15, fig.height=6} 256 | set.seed(2) 257 | dat = data.frame(df = rep(c(2,4,50),each=500)) 258 | dat$x = rt(1500,df=dat$df) 259 | dat$df_val = paste("df = ",dat$df, sep ="") 260 | x_val = seq(min(dat$x),max(dat$x), length=200) 261 | ggplot(aes(x=x),data=dat)+ 262 | geom_density(col="red")+ 263 | facet_grid(.~df_val)+ 264 | annotate(x=x_val,y=dnorm(x_val), geom = "line")+ 265 | theme_bw(base_size = 20) 266 | ``` 267 | 268 | 269 | 270 | 271 | The student-t distribution: Usage 272 | ============================ 273 | ```{r texample, eval=FALSE,echo=T} 274 | set.seed(4) 275 | n=300 276 | dat = data.frame(x=seq(0,10,length=n)) 277 | dat$f = 20*exp(-dat$x)*dat$x 278 | dat$y = 1*rt(n,df = 3) + dat$f 279 | norm_mod = gam(y~s(x,k=20), data=dat, family=gaussian(link="identity")) 280 | t_mod = gam(y~s(x,k=20), data=dat, family=scat(link="identity")) 281 | ``` 282 | 283 | The student-t distribution: Usage 284 | ============================ 285 | ```{r texample2, include =T,echo=F,results= "hide", fig.width=15, fig.height=8} 286 | set.seed(4) 287 | n=300 288 | dat = data.frame(x=seq(0,10,length=n)) 289 | dat$f = 20*exp(-dat$x)*dat$x 290 | dat$y = 1*rt(n,df = 3) + dat$f 291 | norm_mod = gam(y~s(x,k=20), data=dat, family=gaussian(link="identity")) 292 | t_mod = gam(y~s(x,k=20), data=dat, family=scat(link="identity")) 293 | predict_norm = predict(norm_mod,se.fit=T) 294 | predict_t = predict(t_mod,se.fit=T) 295 | fit_vals = data.frame(x = c(dat$x,dat$x), 296 | fit =c(predict_norm[[1]],predict_t[[1]]), 297 | se_min = c(predict_norm[[1]] - 2*predict_norm[[2]], 298 | predict_t[[1]] - 2*predict_t[[2]]), 299 | se_max = c(predict_norm[[1]] + 2*predict_norm[[2]], 300 | predict_t[[1]] + 2*predict_t[[2]]), 301 | model = rep(c("normal errors","t-errors"),each=n)) 302 | ggplot(aes(x=x,y=fit),data=fit_vals)+ 303 | facet_grid(.~model)+ 304 | geom_line(col="red")+ 305 | geom_ribbon(aes(ymin =se_min,ymax = se_max),alpha=0.5,fill="red")+ 306 | annotate(x = dat$x,y=dat$y,size=2,geom="point")+ 307 | annotate(x = dat$x,y=dat$f,size=2,geom="line")+ 308 | theme_bw(20) 309 | ``` 310 | 311 | 312 | 313 | The student-t distribution: Usage 314 | ============================ 315 | ```{r texample3, include =T} 316 | summary(t_mod) 317 | ``` 318 | 319 | 320 | 321 | 322 | Modelling multi-dimensional data 323 | =========================== 324 | type:section 325 | 326 | Ordered categorical data 327 | =========================== 328 | - Assumes data are in discrete categories, and categories fall in order 329 | - e.g.: conservation status: "least concern", "vulnerable", "endangered", "extinct" 330 | - fits a linear latent model using covariates, w/ threshold for each level 331 | - First cut-off always occurs at -1 332 | 333 | 334 | Ordered categorical data 335 | =========================== 336 | ```{r ocat_ex1, include =T,echo=F,results= "hide", fig.width=15, fig.height=8} 337 | set.seed(4) 338 | n= 100 339 | dat = data.frame(body_size = seq(-2,2, length=200)) 340 | dat$linear_predictor = 6*exp(dat$body_size*2)/(1+exp(dat$body_size*2))-2 341 | 342 | ggplot(aes(x=body_size, y=linear_predictor),data=dat) + 343 | annotate(x= dat$body_size, ymin=-3,ymax=-1, alpha=0.25,geom="ribbon")+ 344 | annotate(x= 0, y=-2, label = "least concern",geom="text",size=10)+ 345 | annotate(x= dat$body_size, ymin=-1,ymax=1.5, alpha=0.25,geom="ribbon",fill="red")+ 346 | annotate(x= 0, y=0.25, label = "vulnerable",geom="text",size=10)+ 347 | annotate(x= dat$body_size, 
ymin=1.5,ymax=2, alpha=0.25,geom="ribbon",fill="blue")+ 348 | annotate(x= 0, y=1.75, label = "endangered",geom="text",size=10)+ 349 | annotate(x= 0, y=3.5, label = "extinct",geom="text",size=10)+ 350 | scale_y_continuous("linear predictor", expand=c(0,0),limits=c(-3,5))+ 351 | scale_x_continuous("relative body size", expand=c(0,0))+ 352 | theme_bw(30)+ 353 | theme(panel.grid = element_blank()) 354 | 355 | ``` 356 | 357 | 358 | Ordered categorical data 359 | =========================== 360 | ```{r ocat_ex2, include =T,echo=F,results= "hide", fig.width=15, fig.height=8} 361 | set.seed(4) 362 | n= 100 363 | dat = data.frame(body_size = seq(-2,2, length=200)) 364 | dat$linear_predictor = 6*exp(dat$body_size*2)/(1+exp(dat$body_size*2))-2 365 | 366 | ggplot(aes(x=body_size, y=linear_predictor),data=dat) + 367 | geom_line()+ 368 | geom_ribbon(aes(ymin=linear_predictor-1,ymax=linear_predictor+1),alpha=0.25)+ 369 | annotate(x= dat$body_size, ymin=-3,ymax=-1, alpha=0.25,geom="ribbon")+ 370 | annotate(x= 0, y=-2, label = "least concern",geom="text",size=10)+ 371 | annotate(x= dat$body_size, ymin=-1,ymax=1.5, alpha=0.25,geom="ribbon",fill="red")+ 372 | annotate(x= 0, y=0.25, label = "vulnerable",geom="text",size=10)+ 373 | annotate(x= dat$body_size, ymin=1.5,ymax=2, alpha=0.25,geom="ribbon",fill="blue")+ 374 | annotate(x= 0, y=1.75, label = "endangered",geom="text",size=10)+ 375 | annotate(x= 0, y=3.5, label = "extinct",geom="text",size=10)+ 376 | scale_x_continuous("relative body size", expand=c(0,0))+ 377 | scale_y_continuous("linear predictor",limits=c(-3,5), expand=c(0,0))+ 378 | theme_bw(30)+ 379 | theme(panel.grid = element_blank()) 380 | 381 | ``` 382 | 383 | 384 | Using ocat 385 | =========================== 386 | ```{r ocat_ex3, include =T,echo=T,results= "hide", fig.width=15, fig.height=5} 387 | n= 200 388 | dat = data.frame(x1 = runif(n,-1,1),x2=2*pi*runif(n)) 389 | dat$f = dat$x1^2 + sin(dat$x2) 390 | dat$y_latent = dat$f + rnorm(n,dat$f) 391 | dat$y = ifelse(dat$y_latent<0,1, ifelse(dat$y_latent<0.5,2,3)) 392 | ocat_model = gam(y~s(x1)+s(x2), family=ocat(R=3),data=dat) 393 | plot(ocat_model,page=1) 394 | ``` 395 | 396 | Using ocat 397 | =========================== 398 | ```{r ocat_ex4, include =T,echo=T, fig.width=15, fig.height=8} 399 | summary(ocat_model) 400 | ``` 401 | 402 | Using ocat 403 | =========================== 404 | ```{r ocat_ex5, include =T,echo=F,results= "hide", fig.width=15, fig.height=8} 405 | ocat_predict = predict(ocat_model,type = "response") 406 | colnames(ocat_predict) = c("1","2","3") 407 | ocat_predict = as.data.frame(ocat_predict)%>% 408 | mutate(x= fitted(ocat_model),y=as.numeric(dat$y))%>% 409 | gather(pred_level,prediction,`1`:`3`)%>% 410 | mutate(pred_level = as.numeric(pred_level), 411 | obs_val = as.numeric(y==pred_level)) 412 | 413 | ggplot(aes(x= x, y= obs_val),data=ocat_predict)+ 414 | facet_wrap(~pred_level)+ 415 | geom_point()+ 416 | geom_line(aes(y= prediction))+ 417 | theme_bw(30) 418 | ``` 419 | 420 | 421 | 422 | Unordered categorical data 423 | =========================== 424 | - What do you do if categorical data doesn't fall in a nice order? 425 | 426 | Unordered categorical data 427 | =========================== 428 | - What do you do if categorical data doesn't fall in a nice order? 
429 | ![](images/animal_choice.png) 430 | 431 | Unordered categorical data 432 | =========================== 433 | incremental: true 434 | - Model probability of a category occuring relative to an (arbitrary) reference level 435 | - one linear equation for each category except the reference class 436 | - $p(y=i|\mathbf{x}) = exp(\mu_i(\mathbf{x}))/(1+\sum_j exp(\mu_j(\mathbf{x}))$ 437 | - $\mu_i(\mathbf{x}) = s_{1,j}(x_1) + s_{2,j}(x_2)$ 438 | - $p(y=0|\mathbf{x})= 1/(1+\sum_j exp(\mu_j(\mathbf{x}))$ 439 | 440 | 441 | 442 | Using the multinom function 443 | =========================== 444 | ![](images/animal_codes.png) 445 | 446 | *** 447 | ![](images/animal_functions.png) 448 | 449 | 450 | 451 | Using the multinom function 452 | =========================== 453 | ```{r multinom1, include =T,echo=F,results= "hide", fig.width=15, fig.height=8} 454 | set.seed(10) 455 | n= 500 456 | dat = data.frame(tree_cover = round(runif(n,0,1),2),road_dist=round(10*runif(n),1)) 457 | dat$f_deer = with(dat, 4*exp(-road_dist/2)*road_dist - 2*(tree_cover)^2) 458 | dat$f_pig = with(dat, log(exp(-road_dist)+0.1)+1) 459 | prob_matrix = cbind(1, exp(dat$f_deer),exp(dat$f_pig)) 460 | dat$y = apply(prob_matrix,MARGIN = 1,function(x) sample(0:2,size = 1,prob = x)) 461 | model_dat = dat%>% select(tree_cover, road_dist, y) 462 | ``` 463 | 464 | ```{r multinom2, include =T,echo=T, fig.width=15, fig.height=8} 465 | head(model_dat) 466 | ``` 467 | 468 | *** 469 | ```{r multinom3, include =T,echo=T, fig.width=6, fig.height=6} 470 | pairs(model_dat) 471 | ``` 472 | 473 | 474 | Using the multinom function 475 | =========================== 476 | ```{r multinom4, include =T,echo=F,results= "hide", fig.width=15, fig.height=8} 477 | multinom_model = gam(list(y~s(tree_cover)+s(road_dist), ~s(tree_cover)+s(road_dist)), 478 | data= model_dat, family=multinom(K=2)) 479 | plot(multinom_model,page=1,cex.axis=2,cex.lab=2) 480 | ``` 481 | 482 | 483 | 484 | Understanding the results 485 | =========================== 486 | ```{r multinom5, include =T,echo=T,results= "hide", fig.width=15, fig.height=4} 487 | multinom_pred_data = as.data.frame(expand.grid(road_dist =seq(0,10,length=50), 488 | tree_cover =c(0,0.33,0.66,1))) 489 | multinom_pred = predict(multinom_model, multinom_pred_data,type = "response") 490 | colnames(multinom_pred) = c("monkey","deer","pig") 491 | multinom_pred_data = cbind(multinom_pred_data,multinom_pred) 492 | multinom_pred_data_long = multinom_pred_data %>% 493 | gather(species, probability, monkey, deer,pig)%>% 494 | mutate(tree_cover =paste("tree cover = ", tree_cover,sep="")) 495 | ggplot(aes(road_dist, probability,color=species),data=multinom_pred_data_long)+ 496 | geom_line()+ 497 | facet_grid(.~tree_cover)+ 498 | theme_bw(20) 499 | 500 | ``` 501 | 502 | 503 | Other multivariate distributions to check out 504 | =================================== 505 | type: section 506 | 507 | 508 | Multivariate normal (family = mvn) 509 | =========================== 510 | incremental: true 511 | 512 | - Fit a different smooth model for multiple y-variables, but allowing 513 | correlation between y's 514 | - Example uses: multi-species distribution models, measuring latent correlations 515 | between environmental predictors 516 | - mgcv code: formula=list(y1~s(x1)+s(x2), y2 = s(x1)+s(x3)), family = mvn(d=2) 517 | 518 | Cox Proportional hazards (family = cox.ph) 519 | =========================== 520 | incremental: true 521 | 522 | - Censored data: y measures time until an event occurs, or the study was stopped (censoring) 523 | 
- Measures relative rates, rather than absolute rates (no intercepts) 524 | - Example uses: time until an individual is infected, time until a subpopulation goes 525 | extinct, time until lake is invaded 526 | - mgcv code: `formula = y~s(x1)+s(x2), weights= censor.var,family=cox.ph` 527 | - censor.var = 0 if censored, 1 if not 528 | 529 | 530 | Gaussian location-scale models (family = gaulss) 531 | =========================== 532 | incremental: true 533 | 534 | - Model both the mean ("location") and variance ("scale") as smooth functions of predictors 535 | - Example uses: detecting early warning signs in time series, finding factors driving 536 | population variability 537 | - mgcv code: `formula = list(y~s(x1)+s(x2), ~s(x2)+s(x3)), family=gaulss` 538 | - censor.var = 0 if censored, 1 if not 539 | 540 | 541 | 542 | Zero-inflated Poisson location-scale models (family = ziplss) 543 | =========================== 544 | incremental: true 545 | 546 | - Models the probability of zeros seperately from mean counts given that you've observed more than zero 547 | at a location. 548 | - Example uses: Counts of prey caught when a predator might switch between not hunting at all (zeros) 549 | and active hunting 550 | - mgcv code: `formula = list(y~s(x1)+s(x2), ~s(x2)+s(x3)), family=ziplss` 551 | 552 | 553 | The end of the distribution zoo 554 | ============== 555 | type:section 556 | 557 | That's the end of this section! We convene after lunch (1:00 PM). You'll 558 | get to work through a few more advanced examples of your choice. 559 | 560 | 561 | -------------------------------------------------------------------------------- /slides/correct_mathjax.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # correct the paths to the mathjax files for HTML versions of the slides 4 | 5 | # changes the link to be to a folder called mathjax-23 in the current directory 6 | # this avoids having lots of copes of the same mathjax stuff 7 | 8 | # also use decktape to make a PDF of the slides (probably only works on 9 | # Dave's laptop) 10 | 11 | for i in $( ls *.html); do 12 | echo processing: $i 13 | FN=${i/.html/}_files\\/ 14 | sed "s/$FN//" "$i" > tmp.html 15 | mv tmp.html $i 16 | PDFFN=${i/.html/}.pdf 17 | ~/sources/decktape/bin/phantomjs ~/sources/decktape/decktape.js automatic -s 1024x768 $i $PDFFN 18 | done 19 | -------------------------------------------------------------------------------- /slides/custom.css: -------------------------------------------------------------------------------- 1 | .reveal section img{ 2 | box-shadow: 0px 0px 0px rgba(0, 0, 0, 0); 3 | margin: 0px; 4 | } 5 | 6 | .reveal h1, .reveal h2, .reveal h3 { 7 | word-wrap: normal; 8 | -moz-hyphens: none; 9 | } 10 | 11 | .reveal pre{ 12 | width: 100%; 13 | font-size: 0.51em; 14 | } 15 | 16 | .reveal pre code { 17 | font-size: 1.2em; 18 | } 19 | 20 | div.bigq { 21 | font-size: 200%; 22 | line-height: normal; 23 | } 24 | 25 | div.medq { 26 | font-size: 150%; 27 | line-height: normal; 28 | } 29 | 30 | .section .reveal .state-background { 31 | background: #000 none repeat scroll 0% 0%; 32 | background-color: #000; 33 | } 34 | -------------------------------------------------------------------------------- /slides/images/addbasis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/addbasis.png 
-------------------------------------------------------------------------------- /slides/images/animal_choice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/animal_choice.png -------------------------------------------------------------------------------- /slides/images/animal_codes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/animal_codes.png -------------------------------------------------------------------------------- /slides/images/animal_functions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/animal_functions.png -------------------------------------------------------------------------------- /slides/images/count.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/count.jpg -------------------------------------------------------------------------------- /slides/images/mathematical_sobbing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/mathematical_sobbing.jpg -------------------------------------------------------------------------------- /slides/images/remlgcv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/remlgcv.png -------------------------------------------------------------------------------- /slides/images/spotteddolphin_swfsc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/spotteddolphin_swfsc.jpg -------------------------------------------------------------------------------- /slides/images/tina-modelling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/tina-modelling.png -------------------------------------------------------------------------------- /slides/quantile_resid.R: -------------------------------------------------------------------------------- 1 | #This code is modified for D.L. Miller's dsm package for distance sampling, from 2 | #the rqgam.check function. The code is designed to extract randomized quantile 3 | #residuals from GAMs, using the family definitions in mgcv. 
Note statmod only 4 | #supports RQ residuals for the following families: Tweedie, Poisson, Gaussian, Any errors are due to Eric Pedersen 5 | library(statmod) #This has functions for randomized quantile residuals 6 | rqresiduals = function (gam.obj) { 7 | if(!"gam" %in% attr(gam.obj,"class")){ 8 | stop('"gam.obj has to be of class "gam"') 9 | } 10 | if (!grepl("^Tweedie|^Negative Binomial|^poisson|^binomial|^gaussian|^Gamma|^inverse.gaussian", 11 | gam.obj$family$family)){ 12 | stop(paste("family " , gam.obj$family$family, 13 | " is not currently supported by the statmod library, 14 | and any randomized quantile residuals would be inaccurate.", 15 | sep="")) 16 | } 17 | if (grepl("^Tweedie", gam.obj$family$family)) { 18 | if (is.null(environment(gam.obj$family$variance)$p)) { 19 | p.val <- gam.obj$family$getTheta(TRUE) 20 | environment(gam.obj$family$variance)$p <- p.val 21 | } 22 | qres <- qres.tweedie(gam.obj) 23 | } 24 | else if (grepl("^Negative Binomial", gam.obj$family$family)) { 25 | if ("extended.family" %in% class(gam.obj$family)) { 26 | gam.obj$theta <- gam.obj$family$getTheta(TRUE) 27 | } 28 | else { 29 | gam.obj$theta <- gam.obj$family$getTheta() 30 | } 31 | qres <- qres.nbinom(gam.obj) 32 | } 33 | else { 34 | qres <- qresid(gam.obj) 35 | } 36 | return(qres) 37 | } -------------------------------------------------------------------------------- /slides/uncertainty.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/uncertainty.gif -------------------------------------------------------------------------------- /slides/wiggly.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/wiggly.gif --------------------------------------------------------------------------------