├── .gitignore
├── LICENSE
├── README.md
├── code_snippets
│   ├── gra2mgcv.R
│   ├── quantile_resid.R
│   └── vis.concurvity.R
├── course_outline.md
├── data
│   ├── bbs_data
│   │   ├── bbs_florida.csv
│   │   ├── bbs_florida_richness.csv
│   │   └── routes.csv
│   ├── bbs_florida.csv
│   ├── bbs_florida_richness.csv
│   ├── beta-regression
│   │   └── ZooData.txt
│   ├── forest-health
│   │   ├── beech.raw
│   │   ├── foresthealth.bnd
│   │   ├── foresthealth.dat
│   │   └── foresthealth.gra
│   ├── mexdolphins
│   │   ├── README.md
│   │   ├── mexdolphins.RData
│   │   ├── soap_pred.R
│   │   └── soapy.RData
│   ├── routes.csv
│   ├── time_series
│   │   └── Florida_climate.csv
│   └── yukon_seeds
│       ├── seed_data.csv
│       └── seed_source_locations.csv
├── example-bivariate-timeseries-and-ti.Rmd
├── example-bivariate-timeseries-and-ti.html
├── example-forest-health.Rmd
├── example-forest-health.html
├── example-linear-functionals.Rmd
├── example-nonlinear-timeseries.Rmd
├── example-nonlinear-timeseries.html
├── example-spatial-mexdolphins-solutions.Rmd
├── example-spatial-mexdolphins-solutions.html
├── example-spatial-mexdolphins.Rmd
├── example-spatial-mexdolphins.html
├── example-spatio-temporal data.Rmd
├── example-spatio-temporal_data.html
├── mgcv-esa-workshop.Rproj
├── project-requirements.md
├── project_description.md
└── slides
    ├── 00-preamble.Rpres
    ├── 01-intro.Rpres
    ├── 02-model_checking.Rpres
    ├── 03-model-selection.Rpres
    ├── 04-Beyond_the_exponential_family.Rpres
    ├── correct_mathjax.sh
    ├── custom.css
    ├── images
    │   ├── addbasis.png
    │   ├── animal_choice.png
    │   ├── animal_codes.png
    │   ├── animal_functions.png
    │   ├── count.jpg
    │   ├── mathematical_sobbing.jpg
    │   ├── remlgcv.png
    │   ├── spotteddolphin_swfsc.jpg
    │   └── tina-modelling.png
    ├── quantile_resid.R
    ├── uncertainty.gif
    └── wiggly.gif
/.gitignore:
--------------------------------------------------------------------------------
1 | *.Rhistory
2 | .Rproj.user/
3 | .Rproj.user
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Code in this repository is licensed under the GPL version >= 2.
2 |
3 | All other material is Creative Commons BY licensed (except where it does not belong to us; we do not claim copyright over anyone else's material).
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # mgcv-esa-workshop
2 |
--------------------------------------------------------------------------------
/code_snippets/gra2mgcv.R:
--------------------------------------------------------------------------------
1 | ## Read in a BayesX graph file and output list required by mgcv's mrf smooth
2 | gra2mgcv <- function(file) {
3 | ## first row contains number of objects in file
4 | ## graph info is stored in triplets of rows after the first row
5 | ## - 1st row of triplet is the node ID
6 | ## - 2nd row of triplet is the number of neighbours
7 | ## - 3rd row of triplet is the vector of neighbours
8 | gra <- readLines(file)
9 | N <- as.integer(gra[1]) # readLines() returns character; coerce for seq.int()
10 | gra <- gra[-1] # pop the first row which contains N
11 | ids <- seq.int(1, by = 3, length.out = N)
12 | nbs <- seq.int(3, by = 3, length.out = N)
13 | node2id <- function(nodes, ids) {
14 | as.numeric(ids[as.numeric(nodes) + 1])
15 | }
16 | l <- lapply(strsplit(gra[nbs], " "), node2id, ids = gra[ids])
17 | names(l) <- gra[ids]
18 | l
19 | }
20 |
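21 | ## Example usage (a sketch; the forest-health example in this repo reads the
22 | ## BayesX graph file and passes the result to an MRF smooth via
23 | ## s(id, bs = "mrf", xt = list(nb = nb))):
24 | ## nb <- gra2mgcv("data/forest-health/foresthealth.gra")
25 | ## str(head(nb))  # named list; one vector of neighbouring node IDs per node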
--------------------------------------------------------------------------------
/code_snippets/quantile_resid.R:
--------------------------------------------------------------------------------
1 | # This code is modified from the rqgam.check function in D.L. Miller's dsm
2 | # package for distance sampling. It extracts randomized quantile residuals
3 | # from GAMs, using the family definitions in mgcv. Note that statmod only
4 | # supports RQ residuals for the Tweedie, negative binomial, Poisson, binomial, Gaussian, Gamma, and inverse Gaussian families. Any errors are due to Eric Pedersen.
5 | library(statmod) #This has functions for randomized quantile residuals
6 | rqresiduals = function (gam.obj) {
7 | if(!"gam" %in% attr(gam.obj,"class")){
8 | stop('gam.obj has to be of class "gam"')
9 | }
10 | if (!grepl("^Tweedie|^Negative Binomial|^poisson|^binomial|^gaussian|^Gamma|^inverse.gaussian",
11 | gam.obj$family$family)){
12 | stop(paste("family " , gam.obj$family$family,
13 | " is not currently supported by the statmod library,
14 | and any randomized quantile residuals would be inaccurate.",
15 | sep=""))
16 | }
17 | if (grepl("^Tweedie", gam.obj$family$family)) {
18 | if (is.null(environment(gam.obj$family$variance)$p)) {
19 | p.val <- gam.obj$family$getTheta(TRUE)
20 | environment(gam.obj$family$variance)$p <- p.val
21 | }
22 | qres <- qres.tweedie(gam.obj)
23 | }
24 | else if (grepl("^Negative Binomial", gam.obj$family$family)) {
25 | if ("extended.family" %in% class(gam.obj$family)) {
26 | gam.obj$theta <- gam.obj$family$getTheta(TRUE)
27 | }
28 | else {
29 | gam.obj$theta <- gam.obj$family$getTheta()
30 | }
31 | qres <- qres.nbinom(gam.obj)
32 | }
33 | else {
34 | qres <- qresid(gam.obj)
35 | }
36 | return(qres)
37 | }
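38 | 
39 | ## Example usage (a sketch; `dat`, `y` and `x` are placeholders, not objects
40 | ## from this repo):
41 | ## m <- mgcv::gam(y ~ s(x), family = poisson, data = dat)
42 | ## qr <- rqresiduals(m)
43 | ## qqnorm(qr); qqline(qr)  # RQ residuals should look standard normal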
--------------------------------------------------------------------------------
/code_snippets/vis.concurvity.R:
--------------------------------------------------------------------------------
1 | # visualise concurvity between terms in a GAM
2 | # David L Miller 2015, MIT license
3 |
4 | # arguments:
5 | # b -- a fitted gam
6 | # type -- concurvity measure to plot, see ?concurvity
7 | library(mgcv) # provides concurvity()
8 | vis.concurvity <- function(b, type="estimate"){
9 | cc <- concurvity(b, full=FALSE)[[type]]
10 |
11 | diag(cc) <- NA
12 | cc[lower.tri(cc)] <- NA
13 |
14 | layout(matrix(1:2, ncol=2), widths=c(5,1))
15 | opar <- par(mar=c(5, 6, 5, 0) + 0.1)
16 | # main plot
17 | image(z=cc, x=1:ncol(cc), y=1:nrow(cc), ylab="", xlab="",
18 | axes=FALSE, asp=1, zlim=c(0,1))
19 | axis(1, at=1:ncol(cc), labels = colnames(cc), las=2)
20 | axis(2, at=1:nrow(cc), labels = rownames(cc), las=2)
21 | # legend
22 | opar <- par(mar=c(5, 0, 4, 3) + 0.1)
23 | image(t(matrix(rep(seq(0, 1, len=100), 2), ncol=2)),
24 | x=1:3, y=1:101, zlim=c(0,1), axes=FALSE, xlab="", ylab="")
25 | axis(4, at=seq(1,101,len=5), labels = round(seq(0,1,len=5),1), las=2)
26 | par(opar)
27 | }
28 |
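29 | ## Example usage (a sketch; `dat` and the covariates are placeholders):
30 | ## b <- mgcv::gam(y ~ s(x1) + s(x2) + s(x3), data = dat)
31 | ## vis.concurvity(b)  # upper triangle shows pairwise concurvity, from 0 to 1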
--------------------------------------------------------------------------------
/course_outline.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Course outline
3 |
4 | ---
5 |
6 | - *Start time:* 8am
7 | - *End time:* 5pm
8 |
9 | Two 4-hour blocks:
10 |
11 | - Morning: 8am-12pm
12 | - Afternoon: 1pm-5pm
13 |
14 |
15 | # Morning
16 |
17 | ## Intro (Dave M)
18 |
19 | - GLM refresher
20 | - model the *mean*
21 | - here are some distributions (plot + mean-variance)
22 | - GLMs -> GAMs
23 | - Spline
24 | - why are they good?
25 | - how do they work?
26 | - measuring wigglyness
27 | - What is uncertainty?
28 | - "Hook" for "model expansion"
29 |
30 |
31 | ### Practical
32 |
33 | - `gam()`
34 | - `s()`
35 | - `gam.check()`
36 | - `summary()`
37 | - `plot()`
38 | - `predict()`
39 |
40 |
41 | ## Model checking (Eric)
42 |
43 | *(1/2 slot?)*
44 |
45 | - `gam.check()`
46 | - `concurvity()`
47 | - (randomised quantile residuals)
48 | - autocorrelation (check?)
49 |
50 | ### Practical
51 |
52 | - `gam.check()`
53 |
54 | ## Model selection (Gavin)
55 |
56 | *(1/2 slot?)*
57 |
58 | - shrinkage via shrinkage smooths
59 | - shrinkage via select = TRUE
60 | - AIC corrected for estimated smoothness params
61 | - approximate *p* values (mention at least)
62 |
63 | ### Practical
64 |
65 | - `summary()`
66 |
67 | ## Beyond the exponential family (all)
68 |
69 | *(1/2 slot?)*
70 |
71 | - Count data: Tweedie, negative binomial & ZIP (Dave)
72 | - Beta regression (Gavin)
73 | - Other stuff: t, `cox.ph`, ordered categorical, mvnorm (Eric)
74 |
75 | # Afternoon
76 |
77 | ## Extended examples/demos (all)
78 |
79 | Introduce a set of examples, let people pick what they want. Workbook-type situation.
80 |
81 | - Spatial modelling
82 | - Dolphins (Dave) (Example)
83 | - Spatio-temporal models
84 | - Trawl data from ERDDAP (Eric) (Example)
85 | - Lake cores (Gavin) (Demo)
86 | - Time series modelling
87 | - `lynx` (Eric) (Example)
88 | - Ft Lauderdale (Eric) (Example)
89 | - Ft Lauderdale `ti()` (Gavin) (Example)
90 | - Markov random fields
91 | - forest health data (Gavin) (Example)
92 | - `"lpmatrix"`
93 | - dolphins revisit (Dave) (Example)
94 | - random effects
95 | - cheetahs (Gavin)
96 | - ship effect trawl (Eric)
97 | - Location-scale models?
98 | - Quick example (Gavin) (Demo)
99 | - Linear functionals
100 | - dispersal kernels black spruce (Eric) (Example)
101 |
102 |
103 | ## Smoother zoo checklist
104 |
105 | - [ ] 2-D smoother `s()` (Dave intro example)
106 | - [ ] `te()`
107 | - [ ] soap film (Dave spatial dolphins)
108 | - [ ] `"ts"` (Gavin model selection)
109 | - [ ] Cyclic splines
110 | - [ ] `fs` basis/`by=`
111 | - [ ] random effects
112 | - [ ] Gaussian process smoothers?
113 | - [ ] multiple formulae specification?
114 |
115 |
116 |
117 |
--------------------------------------------------------------------------------
/data/bbs_data/routes.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/data/bbs_data/routes.csv
--------------------------------------------------------------------------------
/data/beta-regression/ZooData.txt:
--------------------------------------------------------------------------------
1 | Number Scans Proportion Size Visual Raised Visitors Feeding Oc Other Enrichment Group Sex Enclosure Vehicle Diet Age Zoo Eps
41 300 0.136666667 650 2 2 6418 1 1 2 2 2 1 0 59 3 2 1 1
47 150 0.313333333 2405 2 1 13607 1 2 2 2 1 1 0 263 1 2 2 2
28 300 0.093333333 1781 4 2 13607 1 2 2 2 2 1 0 263 1 3 2 3
26 150 0.173333333 390 5 1 0 1 2 2 1 1 2 0 0 1 3 2 4
13 148 0.087837838 390 2 1 0 1 2 2 1 1 2 0 0 1 3 2 5
25 152 0.164473684 390 3 1 0 1 2 2 1 1 2 0 0 1 2 2 6
15 301 0.049833887 1200 4 2 11713 1 2 2 2 2 2 20 215 3 8 3 7
78 299 0.260869565 1200 4 2 11713 1 2 2 2 2 2 12 215 3 2 3 8
77 300 0.256666667 1200 4 2 11713 1 2 2 2 2 1 16 215 3 2 3 9
64 300 0.213333333 70 1 1 11713 1 2 2 2 2 1 20 215 3 8 3 10
80 300 0.266666667 40 1 1 11713 1 2 2 2 2 2 12 215 3 2 3 11
28 240 0.116666667 1200 4 2 4225 1 2 2 2 2 1 16 28 3 8 3 12
2 150 0.013333333 650 5 2 96513 1 1 1 2 1 2 20 3 3 7 4 13
12 150 0.08 650 5 2 96513 1 1 1 2 1 2 20 3 3 1 4 14
2 150 0.013333333 650 5 2 7294 1 1 1 1 1 2 0 7 3 7 4 15
16 150 0.106666667 100 1 1 0 1 1 1 1 1 2 20 6 3 7 4 16
12 150 0.08 100 1 1 0 1 1 1 1 1 2 20 6 3 1 4 17
6 356 0.016853933 2030 8 2 76000 1 2 2 1 2 1 6 0 1 2 5 18
45 120 0.375 715 6 2 76000 1 2 2 1 1 2 6 0 1 2 5 19
27 270 0.1 715 6 2 14195 2 2 2 1 2 1 0 0 1 2 5 20
28 90 0.311111111 2030 8 2 14195 2 2 2 1 1 2 0 0 1 2 5 21
14 100 0.14 420 2 1 8971 1 1 2 1 1 1 0 112 3 1 6 22
9 100 0.09 434 2 2 39120 1 1 2 1 1 1 0 116 3 1 6 23
10 100 0.1 434 2 2 27589 1 1 2 1 1 1 0 157 3 1 6 24
13 497 0.026156942 434 2 2 8971 1 1 2 1 2 2 0 112 3 3 6 25
10 495 0.02020202 1700 4 1 39120 1 1 2 1 2 2 1 116 3 3 6 26
9 499 0.018036072 1700 4 1 27589 1 1 2 1 2 2 0 157 3 3 6 27
20 199 0.100502513 3800 8 2 8971 1 1 2 1 2 2 0 112 3 5 6 28
26 200 0.13 3000 8 2 39120 1 1 2 2 2 2 0 63 3 5 6 29
21 200 0.105 3000 8 2 27589 1 1 2 2 2 2 0 84 3 5 6 30
0 296 0 3000 8 2 8971 1 1 2 2 2 1 0 51 3 10 6 31
6 300 0.02 504 4 2 27589 1 1 2 1 2 1 1 157 3 10 6 32
63 299 0.210702341 1700 4 1 8971 1 1 2 1 2 1 1 112 3 10 6 33
21 100 0.21 504 4 2 8971 1 1 2 1 1 2 0 112 3 8 6 34
18 100 0.18 504 4 2 39120 1 1 2 1 1 2 0 116 3 3 6 35
13 192 0.067708333 434 2 1 22000 2 1 2 2 1 2 0 128 3 5 6 36
34 150 0.226666667 1000 2 1 627 1 2 1 2 1 1 0 83 2 7 7 37
21 150 0.14 600 2 1 627 1 2 1 2 1 2 0 83 2 4 7 38
35 150 0.233333333 578 4 1 627 1 2 1 2 1 2 0 83 2 3 7 39
16 150 0.106666667 630 4 1 627 1 2 1 2 1 2 0 83 2 8 7 40
18 150 0.12 587 2 1 627 1 2 1 2 1 2 0 83 2 7 7 41
22 150 0.146666667 203 1 1 627 1 2 1 2 1 1 0 83 2 2 7 42
27 150 0.18 1100 3 1 627 1 2 1 2 1 1 0 54 1 8 7 43
16 300 0.053333333 1400 8 1 627 1 2 1 2 2 1 1 27 1 8 7 44
4 302 0.013245033 600 5 1 0 2 2 1 1 2 1 0 22 1 11 7 45
16 147 0.108843537 1600 9 1 0 2 2 1 1 1 1 0 26 1 9 7 46
4 150 0.026666667 400 4 1 0 2 2 1 1 1 2 0 26 1 8 7 47
35 300 0.116666667 400 4 1 0 2 2 1 1 2 1 0 26 1 5 7 48
11 150 0.073333333 500 9 1 0 2 2 1 1 1 2 0 26 1 11 7 49
8 149 0.053691275 500 9 1 0 2 2 1 1 1 2 0 26 1 12 7 50
18 150 0.12 500 9 1 0 2 2 1 1 1 2 0 26 1 11 7 51
19 299 0.063545151 500 9 1 0 2 2 1 1 2 1 0 26 1 10 7 52
3 150 0.02 500 9 1 0 2 2 1 1 1 1 0 26 1 10 7 53
15 495 0.03030303 1800 8 1 0 2 2 2 1 2 2 0 26 1 1 7 54
36 150 0.24 431 2 1 0 2 2 1 1 1 1 2 32 1 4 7 55
10 150 0.066666667 336 2 1 0 2 2 1 1 1 2 0 32 1 9 7 56
3 149 0.020134228 67.5 1 1 0 2 2 1 1 1 1 0 32 1 7 7 57
1 150 0.006666667 195 2 1 0 2 2 1 1 1 1 2 32 1 8 7 58
10 150 0.066666667 270 3 1 0 2 2 1 1 1 1 0 32 1 12 7 59
12 298 0.040268456 1900 9 1 0 2 2 1 1 2 1 1 19 1 9 7 60
14 150 0.093333333 360 4 1 0 2 2 1 1 1 1 0 11 1 7 7 61
33 150 0.22 360 3 1 0 2 2 1 1 1 2 0 11 1 3 7 62
9 149 0.060402685 266 5 1 0 2 2 1 1 1 2 0 11 1 9 7 63
19 150 0.126666667 360 7 1 0 2 2 1 1 1 2 0 11 1 6 7 64
39 600 0.065 15000 4 1 718 1 2 1 2 2 1 8 51 3 3 8 65
17 300 0.056666667 25000 4 2 718 1 2 1 2 2 2 10 49 3 5 8 66
9 150 0.06 10000 3 1 718 1 2 1 1 1 1 0 27 3 15 8 67
11 300 0.036666667 25000 4 2 718 1 2 1 2 2 2 8 40 3 1 8 68
10 591 0.016920474 10000 9 1 0 2 2 2 2 2 2 0 12 3 9 8 69
2 749 0.002670227 40000 9 2 0 2 1 2 1 2 1 0 16 3 5 8 70
20 448 0.044642857 10000 4 2 0 1 2 1 1 2 1 3 18 3 1 8 71
0 150 0 25000 10 2 0 2 2 2 1 1 1 0 22 3 10 8 72
1 300 0.003333333 25000 10 2 0 2 2 2 1 2 1 0 22 3 10 8 73
13 150 0.086666667 80000 9 2 718 2 2 2 2 1 2 0 47 3 10 8 74
10 150 0.066666667 80000 9 2 718 2 2 2 2 1 2 0 60 3 7 8 75
0 300 0 80000 9 2 0 2 2 2 2 2 2 0 44 3 2 8 76
19 150 0.126666667 3000 5 2 33172 1 2 2 1 1 2 16 11 2 4 9 77
12 150 0.08 62 1 1 0 1 1 2 1 1 2 16 21 2 4 9 78
10 452 0.022123894 3000 5 2 33172 1 2 2 1 2 2 16 1 2 1 9 79
10 450 0.022222222 62 1 1 0 1 1 2 1 2 2 16 17 2 1 9 80
15 149 0.100671141 69 1 1 0 1 2 1 1 1 1 16 10 2 2 9 81
37 150 0.246666667 69 1 1 0 1 2 1 1 1 2 14 17 2 5 9 82
44 300 0.146666667 625 1 2 0 1 2 2 1 2 1 0 34 2 8 9 83
21 150 0.14 625 1 1 0 1 2 2 1 1 2 6 33 2 5 9 84
14 150 0.093333333 10000 2 1 0 1 2 2 1 1 1 2 29 2 9 9 85
55 149 0.369127517 600 2 2 0 1 2 2 1 1 1 2 34 2 9 9 86
28 150 0.186666667 450 1 1 0 1 2 1 1 1 2 2 35 2 2 9 87
17 150 0.113333333 18.6 1 1 0 1 2 1 1 1 2 0 35 2 8 9 88
--------------------------------------------------------------------------------
/data/forest-health/foresthealth.bnd:
--------------------------------------------------------------------------------
1 | "82",5
2 | 15.9882,1.8882
3 | 15.9882,2.1118
4 | 16.2118,2.1118
5 | 16.2118,1.8882
6 | 15.9882,1.8882
7 | "83",5
8 | 15.9882,2.8882
9 | 15.9882,3.1118
10 | 16.2118,3.1118
11 | 16.2118,2.8882
12 | 15.9882,2.8882
13 | "79",5
14 | 14.9882,1.8882
15 | 14.9882,2.1118
16 | 15.2118,2.1118
17 | 15.2118,1.8882
18 | 14.9882,1.8882
19 | "81",5
20 | 14.9882,3.8882
21 | 14.9882,4.1118
22 | 15.2118,4.1118
23 | 15.2118,3.8882
24 | 14.9882,3.8882
25 | "78",5
26 | 13.9882,4.8882
27 | 13.9882,5.1118
28 | 14.2118,5.1118
29 | 14.2118,4.8882
30 | 13.9882,4.8882
31 | "62",5
32 | 10.1882,8.8882
33 | 10.1882,9.1118
34 | 10.4118,9.1118
35 | 10.4118,8.8882
36 | 10.1882,8.8882
37 | "55",5
38 | 9.6882,8.3882
39 | 9.6882,8.6118
40 | 9.9118,8.6118
41 | 9.9118,8.3882
42 | 9.6882,8.3882
43 | "80",5
44 | 14.9882,2.8882
45 | 14.9882,3.1118
46 | 15.2118,3.1118
47 | 15.2118,2.8882
48 | 14.9882,2.8882
49 | "77",5
50 | 13.9882,3.8882
51 | 13.9882,4.1118
52 | 14.2118,4.1118
53 | 14.2118,3.8882
54 | 13.9882,3.8882
55 | "75",5
56 | 12.9882,4.9882
57 | 12.9882,5.2118
58 | 13.2118,5.2118
59 | 13.2118,4.9882
60 | 12.9882,4.9882
61 | "72",5
62 | 12.0882,5.9882
63 | 12.0882,6.2118
64 | 12.3118,6.2118
65 | 12.3118,5.9882
66 | 12.0882,5.9882
67 | "61",5
68 | 10.0882,7.8882
69 | 10.0882,8.1118
70 | 10.3118,8.1118
71 | 10.3118,7.8882
72 | 10.0882,7.8882
73 | "76",5
74 | 13.9882,2.8882
75 | 13.9882,3.1118
76 | 14.2118,3.1118
77 | 14.2118,2.8882
78 | 13.9882,2.8882
79 | "74",5
80 | 12.9882,3.8882
81 | 12.9882,4.1118
82 | 13.2118,4.1118
83 | 13.2118,3.8882
84 | 12.9882,3.8882
85 | "71",5
86 | 12.0882,4.9882
87 | 12.0882,5.2118
88 | 12.3118,5.2118
89 | 12.3118,4.9882
90 | 12.0882,4.9882
91 | "60",5
92 | 10.0882,6.9882
93 | 10.0882,7.2118
94 | 10.3118,7.2118
95 | 10.3118,6.9882
96 | 10.0882,6.9882
97 | "73",5
98 | 12.9882,2.8882
99 | 12.9882,3.1118
100 | 13.2118,3.1118
101 | 13.2118,2.8882
102 | 12.9882,2.8882
103 | "70",5
104 | 11.9882,3.8882
105 | 11.9882,4.1118
106 | 12.2118,4.1118
107 | 12.2118,3.8882
108 | 11.9882,3.8882
109 | "67",5
110 | 11.0882,4.8882
111 | 11.0882,5.1118
112 | 11.3118,5.1118
113 | 11.3118,4.8882
114 | 11.0882,4.8882
115 | "59",5
116 | 10.0882,5.8882
117 | 10.0882,6.1118
118 | 10.3118,6.1118
119 | 10.3118,5.8882
120 | 10.0882,5.8882
121 | "54",5
122 | 9.0882,6.9882
123 | 9.0882,7.2118
124 | 9.3118,7.2118
125 | 9.3118,6.9882
126 | 9.0882,6.9882
127 | "69",5
128 | 11.9882,2.8882
129 | 11.9882,3.1118
130 | 12.2118,3.1118
131 | 12.2118,2.8882
132 | 11.9882,2.8882
133 | "66",5
134 | 10.9882,3.8882
135 | 10.9882,4.1118
136 | 11.2118,4.1118
137 | 11.2118,3.8882
138 | 10.9882,3.8882
139 | "68",5
140 | 11.9882,1.9882
141 | 11.9882,2.2118
142 | 12.2118,2.2118
143 | 12.2118,1.9882
144 | 11.9882,1.9882
145 | "63",5
146 | 10.9882,0.988197
147 | 10.9882,1.2118
148 | 11.2118,1.2118
149 | 11.2118,0.988197
150 | 10.9882,0.988197
151 | "53",5
152 | 9.0882,5.8882
153 | 9.0882,6.1118
154 | 9.3118,6.1118
155 | 9.3118,5.8882
156 | 9.0882,5.8882
157 | "48",5
158 | 8.0882,6.9882
159 | 8.0882,7.2118
160 | 8.3118,7.2118
161 | 8.3118,6.9882
162 | 8.0882,6.9882
163 | "65",5
164 | 10.9882,2.8882
165 | 10.9882,3.1118
166 | 11.2118,3.1118
167 | 11.2118,2.8882
168 | 10.9882,2.8882
169 | "64",5
170 | 10.9882,1.8882
171 | 10.9882,2.1118
172 | 11.2118,2.1118
173 | 11.2118,1.8882
174 | 10.9882,1.8882
175 | "56",5
176 | 9.9882,0.988197
177 | 9.9882,1.2118
178 | 10.2118,1.2118
179 | 10.2118,0.988197
180 | 9.9882,0.988197
181 | "47",5
182 | 8.0882,5.9882
183 | 8.0882,6.2118
184 | 8.3118,6.2118
185 | 8.3118,5.9882
186 | 8.0882,5.9882
187 | "58",5
188 | 9.9882,2.9882
189 | 9.9882,3.2118
190 | 10.2118,3.2118
191 | 10.2118,2.9882
192 | 9.9882,2.9882
193 | "57",5
194 | 9.9882,1.9882
195 | 9.9882,2.2118
196 | 10.2118,2.2118
197 | 10.2118,1.9882
198 | 9.9882,1.9882
199 | "50",5
200 | 8.9882,0.988197
201 | 8.9882,1.2118
202 | 9.2118,1.2118
203 | 9.2118,0.988197
204 | 8.9882,0.988197
205 | "49",5
206 | 8.9882,0.388197
207 | 8.9882,0.611803
208 | 9.2118,0.611803
209 | 9.2118,0.388197
210 | 8.9882,0.388197
211 | "46",5
212 | 8.0882,4.8882
213 | 8.0882,5.1118
214 | 8.3118,5.1118
215 | 8.3118,4.8882
216 | 8.0882,4.8882
217 | "52",5
218 | 8.9882,2.9882
219 | 8.9882,3.2118
220 | 9.2118,3.2118
221 | 9.2118,2.9882
222 | 8.9882,2.9882
223 | "51",5
224 | 8.9882,1.9882
225 | 8.9882,2.2118
226 | 9.2118,2.2118
227 | 9.2118,1.9882
228 | 8.9882,1.9882
229 | "42",5
230 | 7.9882,0.988197
231 | 7.9882,1.2118
232 | 8.2118,1.2118
233 | 8.2118,0.988197
234 | 7.9882,0.988197
235 | "45",5
236 | 7.9882,3.8882
237 | 7.9882,4.1118
238 | 8.2118,4.1118
239 | 8.2118,3.8882
240 | 7.9882,3.8882
241 | "41",5
242 | 6.9882,4.9882
243 | 6.9882,5.2118
244 | 7.2118,5.2118
245 | 7.2118,4.9882
246 | 6.9882,4.9882
247 | "44",5
248 | 7.9882,2.9882
249 | 7.9882,3.2118
250 | 8.2118,3.2118
251 | 8.2118,2.9882
252 | 7.9882,2.9882
253 | "43",5
254 | 7.9882,1.9882
255 | 7.9882,2.2118
256 | 8.2118,2.2118
257 | 8.2118,1.9882
258 | 7.9882,1.9882
259 | "40",5
260 | 6.9882,3.9882
261 | 6.9882,4.2118
262 | 7.2118,4.2118
263 | 7.2118,3.9882
264 | 6.9882,3.9882
265 | "39",5
266 | 6.9882,2.9882
267 | 6.9882,3.2118
268 | 7.2118,3.2118
269 | 7.2118,2.9882
270 | 6.9882,2.9882
271 | "38",5
272 | 6.9882,1.9882
273 | 6.9882,2.2118
274 | 7.2118,2.2118
275 | 7.2118,1.9882
276 | 6.9882,1.9882
277 | "37",5
278 | 6.0882,3.2882
279 | 6.0882,3.5118
280 | 6.3118,3.5118
281 | 6.3118,3.2882
282 | 6.0882,3.2882
283 | "36",5
284 | 6.0882,2.2882
285 | 6.0882,2.5118
286 | 6.3118,2.5118
287 | 6.3118,2.2882
288 | 6.0882,2.2882
289 | "33",5
290 | 5.0882,3.2882
291 | 5.0882,3.5118
292 | 5.3118,3.5118
293 | 5.3118,3.2882
294 | 5.0882,3.2882
295 | "35",5
296 | 5.2882,1.7882
297 | 5.2882,2.0118
298 | 5.5118,2.0118
299 | 5.5118,1.7882
300 | 5.2882,1.7882
301 | "31",5
302 | 5.0882,1.2882
303 | 5.0882,1.5118
304 | 5.3118,1.5118
305 | 5.3118,1.2882
306 | 5.0882,1.2882
307 | "25",5
308 | 4.0882,0.288197
309 | 4.0882,0.511803
310 | 4.3118,0.511803
311 | 4.3118,0.288197
312 | 4.0882,0.288197
313 | "34",5
314 | 5.1882,1.9882
315 | 5.1882,2.2118
316 | 5.4118,2.2118
317 | 5.4118,1.9882
318 | 5.1882,1.9882
319 | "32",5
320 | 5.0882,2.2882
321 | 5.0882,2.5118
322 | 5.3118,2.5118
323 | 5.3118,2.2882
324 | 5.0882,2.2882
325 | "21",5
326 | 3.5882,0.788197
327 | 3.5882,1.0118
328 | 3.8118,1.0118
329 | 3.8118,0.788197
330 | 3.5882,0.788197
331 | "18",5
332 | 3.1882,0.688197
333 | 3.1882,0.911803
334 | 3.4118,0.911803
335 | 3.4118,0.688197
336 | 3.1882,0.688197
337 | "15",5
338 | 3.0882,0.288197
339 | 3.0882,0.511803
340 | 3.3118,0.511803
341 | 3.3118,0.288197
342 | 3.0882,0.288197
343 | "28",5
344 | 4.4882,2.8882
345 | 4.4882,3.1118
346 | 4.7118,3.1118
347 | 4.7118,2.8882
348 | 4.4882,2.8882
349 | "30",5
350 | 4.5882,2.3882
351 | 4.5882,2.6118
352 | 4.8118,2.6118
353 | 4.8118,2.3882
354 | 4.5882,2.3882
355 | "29",5
356 | 4.5882,1.7882
357 | 4.5882,2.0118
358 | 4.8118,2.0118
359 | 4.8118,1.7882
360 | 4.5882,1.7882
361 | "26",5
362 | 4.0882,1.2882
363 | 4.0882,1.5118
364 | 4.3118,1.5118
365 | 4.3118,1.2882
366 | 4.0882,1.2882
367 | "27",5
368 | 4.0882,2.2882
369 | 4.0882,2.5118
370 | 4.3118,2.5118
371 | 4.3118,2.2882
372 | 4.0882,2.2882
373 | "22",5
374 | 3.5882,1.7882
375 | 3.5882,2.0118
376 | 3.8118,2.0118
377 | 3.8118,1.7882
378 | 3.5882,1.7882
379 | "16",5
380 | 3.0882,1.2882
381 | 3.0882,1.5118
382 | 3.3118,1.5118
383 | 3.3118,1.2882
384 | 3.0882,1.2882
385 | "23",5
386 | 3.5882,2.7882
387 | 3.5882,3.0118
388 | 3.8118,3.0118
389 | 3.8118,2.7882
390 | 3.5882,2.7882
391 | "20",5
392 | 3.4882,2.0882
393 | 3.4882,2.3118
394 | 3.7118,2.3118
395 | 3.7118,2.0882
396 | 3.4882,2.0882
397 | "19",5
398 | 3.1882,2.5882
399 | 3.1882,2.8118
400 | 3.4118,2.8118
401 | 3.4118,2.5882
402 | 3.1882,2.5882
403 | "17",5
404 | 3.0882,2.2882
405 | 3.0882,2.5118
406 | 3.3118,2.5118
407 | 3.3118,2.2882
408 | 3.0882,2.2882
409 | "12",5
410 | 2.5882,1.7882
411 | 2.5882,2.0118
412 | 2.8118,2.0118
413 | 2.8118,1.7882
414 | 2.5882,1.7882
415 | "2",5
416 | 1.0882,2.2882
417 | 1.0882,2.5118
418 | 1.3118,2.5118
419 | 1.3118,2.2882
420 | 1.0882,2.2882
421 | "1",5
422 | 0.588197,2.7882
423 | 0.588197,3.0118
424 | 0.811803,3.0118
425 | 0.811803,2.7882
426 | 0.588197,2.7882
427 | "13",5
428 | 2.5882,2.7882
429 | 2.5882,3.0118
430 | 2.8118,3.0118
431 | 2.8118,2.7882
432 | 2.5882,2.7882
433 | "3",5
434 | 1.0882,3.2882
435 | 1.0882,3.5118
436 | 1.3118,3.5118
437 | 1.3118,3.2882
438 | 1.0882,3.2882
439 | "6",5
440 | 1.4882,2.4882
441 | 1.4882,2.7118
442 | 1.7118,2.7118
443 | 1.7118,2.4882
444 | 1.4882,2.4882
445 | "7",5
446 | 1.5882,2.7882
447 | 1.5882,3.0118
448 | 1.8118,3.0118
449 | 1.8118,2.7882
450 | 1.5882,2.7882
451 | "9",5
452 | 2.0882,2.2882
453 | 2.0882,2.5118
454 | 2.3118,2.5118
455 | 2.3118,2.2882
456 | 2.0882,2.2882
457 | "4",5
458 | 1.3882,3.4882
459 | 1.3882,3.7118
460 | 1.6118,3.7118
461 | 1.6118,3.4882
462 | 1.3882,3.4882
463 | "5",5
464 | 1.3882,4.8882
465 | 1.3882,5.1118
466 | 1.6118,5.1118
467 | 1.6118,4.8882
468 | 1.3882,4.8882
469 | "8",5
470 | 1.5882,3.7882
471 | 1.5882,4.0118
472 | 1.8118,4.0118
473 | 1.8118,3.7882
474 | 1.5882,3.7882
475 | "10",5
476 | 2.0882,3.2882
477 | 2.0882,3.5118
478 | 2.3118,3.5118
479 | 2.3118,3.2882
480 | 2.0882,3.2882
481 | "11",5
482 | 2.0882,4.2882
483 | 2.0882,4.5118
484 | 2.3118,4.5118
485 | 2.3118,4.2882
486 | 2.0882,4.2882
487 | "14",5
488 | 2.7882,3.9882
489 | 2.7882,4.2118
490 | 3.0118,4.2118
491 | 3.0118,3.9882
492 | 2.7882,3.9882
493 | "24",5
494 | 3.6882,3.9882
495 | 3.6882,4.2118
496 | 3.9118,4.2118
497 | 3.9118,3.9882
498 | 3.6882,3.9882
499 |
--------------------------------------------------------------------------------
/data/forest-health/foresthealth.gra:
--------------------------------------------------------------------------------
1 | 83
2 | 82
3 | 2
4 | 2 1
5 | 83
6 | 2
7 | 0 7
8 | 79
9 | 2
10 | 7 0
11 | 81
12 | 2
13 | 7 8
14 | 78
15 | 2
16 | 8 9
17 | 62
18 | 2
19 | 11 6
20 | 55
21 | 2
22 | 11 5
23 | 80
24 | 4
25 | 2 12 3 1
26 | 77
27 | 4
28 | 12 13 4 3
29 | 75
30 | 3
31 | 13 14 4
32 | 72
33 | 1
34 | 14
35 | 61
36 | 3
37 | 15 6 5
38 | 76
39 | 3
40 | 16 8 7
41 | 74
42 | 4
43 | 16 17 9 8
44 | 71
45 | 4
46 | 17 18 10 9
47 | 60
48 | 3
49 | 19 20 11
50 | 73
51 | 3
52 | 21 13 12
53 | 70
54 | 4
55 | 21 22 14 13
56 | 67
57 | 2
58 | 22 14
59 | 59
60 | 2
61 | 25 15
62 | 54
63 | 3
64 | 25 26 15
65 | 69
66 | 4
67 | 23 27 17 16
68 | 66
69 | 3
70 | 27 18 17
71 | 68
72 | 2
73 | 28 21
74 | 63
75 | 2
76 | 29 28
77 | 53
78 | 3
79 | 30 20 19
80 | 48
81 | 2
82 | 30 20
83 | 65
84 | 4
85 | 28 31 22 21
86 | 64
87 | 4
88 | 24 32 27 23
89 | 56
90 | 4
91 | 33 34 32 24
92 | 47
93 | 3
94 | 35 26 25
95 | 58
96 | 3
97 | 32 36 27
98 | 57
99 | 4
100 | 29 37 31 28
101 | 50
102 | 4
103 | 34 38 37 29
104 | 49
105 | 3
106 | 38 33 29
107 | 46
108 | 3
109 | 39 40 30
110 | 52
111 | 3
112 | 37 41 31
113 | 51
114 | 4
115 | 33 42 36 32
116 | 42
117 | 3
118 | 42 34 33
119 | 45
120 | 3
121 | 41 43 35
122 | 41
123 | 2
124 | 43 35
125 | 44
126 | 4
127 | 42 44 39 36
128 | 43
129 | 4
130 | 38 45 41 37
131 | 40
132 | 4
133 | 44 46 40 39
134 | 39
135 | 5
136 | 45 46 47 43 41
137 | 38
138 | 3
139 | 47 44 42
140 | 37
141 | 4
142 | 47 48 44 43
143 | 36
144 | 6
145 | 49 52 53 46 45 44
146 | 33
147 | 4
148 | 53 58 57 46
149 | 35
150 | 6
151 | 52 53 50 58 59 47
152 | 31
153 | 5
154 | 59 60 53 52 49
155 | 25
156 | 4
157 | 54 55 56 60
158 | 34
159 | 8
160 | 53 50 58 59 57 61 49 47
161 | 32
162 | 9
163 | 50 58 59 57 61 48 52 49 47
164 | 21
165 | 6
166 | 55 63 56 62 51 60
167 | 18
168 | 6
169 | 63 56 54 62 51 60
170 | 15
171 | 4
172 | 63 55 54 51
173 | 28
174 | 7
175 | 61 64 59 58 53 48 52
176 | 30
177 | 10
178 | 59 57 61 64 62 65 53 48 52 49
179 | 29
180 | 10
181 | 57 61 60 62 65 58 50 53 52 49
182 | 26
183 | 9
184 | 51 62 54 65 55 63 61 59 50
185 | 27
186 | 11
187 | 60 64 62 65 66 67 57 59 58 53 52
188 | 22
189 | 12
190 | 54 65 66 55 67 63 68 64 60 61 59 58
191 | 16
192 | 8
193 | 56 68 67 55 65 54 62 60
194 | 23
195 | 8
196 | 62 65 66 67 71 61 57 58
197 | 20
198 | 11
199 | 66 67 63 71 68 62 64 60 61 59 58
200 | 19
201 | 8
202 | 67 71 68 75 65 62 64 61
203 | 17
204 | 9
205 | 63 71 68 75 66 65 62 64 61
206 | 12
207 | 7
208 | 75 71 63 67 66 65 62
209 | 2
210 | 5
211 | 70 72 73 74 75
212 | 1
213 | 5
214 | 69 72 76 73 74
215 | 13
216 | 9
217 | 68 79 75 74 73 67 66 65 64
218 | 3
219 | 7
220 | 69 70 76 73 74 78 79
221 | 6
222 | 8
223 | 76 72 69 70 74 75 79 71
224 | 7
225 | 9
226 | 73 76 72 69 70 78 75 79 71
227 | 9
228 | 8
229 | 74 73 69 79 68 71 67 66
230 | 4
231 | 7
232 | 72 70 73 74 78 79 80
233 | 5
234 | 2
235 | 78 80
236 | 8
237 | 6
238 | 74 77 76 72 79 80
239 | 10
240 | 9
241 | 75 78 74 73 76 72 80 71 81
242 | 11
243 | 5
244 | 79 78 77 76 81
245 | 14
246 | 3
247 | 80 79 82
248 | 24
249 | 1
250 | 81
251 |
--------------------------------------------------------------------------------
/data/mexdolphins/README.md:
--------------------------------------------------------------------------------
1 | Pantropical spotted dolphins in the Gulf of Mexico
2 | ==================================================
3 |
4 | Data copied from [this example DSM analysis](http://distancesampling.org/R/vignettes/mexico-analysis.html), after correcting for detectability.
5 |
6 | Data were collected by the NOAA South East Fisheries Science Center (sic). The OBIS-SEAMAP page for the data may be found at the SEFSC [GoMex Oceanic 1996 survey page](http://seamap.env.duke.edu/dataset/25).
7 |
8 | ## Data citation
9 |
10 | Garrison, L. 2013. SEFSC GoMex Oceanic 1996. Data downloaded from OBIS-SEAMAP (http://seamap.env.duke.edu/dataset/25).
11 |
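12 | To work with the data in R, load the `.RData` file in this directory (it supplies, among other objects, the `preddata` prediction grid used by `soap_pred.R`):
13 | 
14 | ```r
15 | load("mexdolphins.RData")
16 | ls()  # list the objects the file provides
17 | ```
18 | 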
--------------------------------------------------------------------------------
/data/mexdolphins/mexdolphins.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/data/mexdolphins/mexdolphins.RData
--------------------------------------------------------------------------------
/data/mexdolphins/soap_pred.R:
--------------------------------------------------------------------------------
1 | # make soap pred grid
2 |
3 | library(mapdata)
4 | library(maptools)
5 | library(rgeos)
6 | library(rgdal)
7 | library(mgcv)
8 |
9 | load("mexdolphins.RData")
10 |
11 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ")
12 |
13 | usmap <- map('usa', plot=FALSE, xlim=c(-100,-80), ylim=c(22,33))
14 | uspoly <- map2SpatialPolygons(usmap, IDs=usmap$names,
15 | proj4string=CRS("+proj=longlat +datum=WGS84"))
16 |
17 | # simplify the polygon
18 | uspoly <- gSimplify(uspoly, 0.08)
19 |
20 |
21 | # bounding box
22 | c1 = cbind(c(-97.5, -82, -82, -97.5),
23 | c(22.5, 22.5, 32, 32))
24 | r1 = rbind(c1, c1[1, ]) # join
25 | P1 = Polygon(r1)
26 | Ps1 = Polygons(list(P1), ID = "a")
27 |
28 | SPs = SpatialPolygons(list(Ps1),proj4string=CRS("+proj=longlat +datum=WGS84"))
29 |
30 | # plot to check we got the right area
31 | #plot(uspoly)#, xlim=c(-100,-80), ylim=c(22,33))
32 | #plot(SPs, add=TRUE)
33 | #points(mexdolphins[,c("longitude","latitude")])
34 |
35 | # take the polygon difference
36 | soap_area <- gDifference(SPs,uspoly)
37 | # again check
38 | #plot(soap_area)
39 | #points(mexdolphins[,c("longitude","latitude")])
40 |
41 |
42 | xy_bnd <- spTransform(soap_area, CRSobj=lcc_proj4)
43 | xy_bnd <- xy_bnd@polygons[[1]]@Polygons[[1]]@coords
44 | xy_bnd <- list(x=xy_bnd[,1], y=xy_bnd[,2])
45 |
46 | gr <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), by=16469.27),
47 | y=seq(min(xy_bnd$y), max(xy_bnd$y), by=16469.27))
48 | x <- gr[,1]
49 | y <- gr[,2]
50 | ind <- inSide(xy_bnd, x, y)
51 |
52 | soap_preddata <- as.data.frame(gr[ind,])
53 | soap_preddata$area <- preddata$area[1]
54 | soap_preddata$off.set <- log(soap_preddata$area)
55 |
56 |
57 | save(xy_bnd, soap_preddata, file="soapy.RData")
58 |
59 |
60 |
61 |
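62 | ## The saved boundary and prediction grid can then feed a soap film smooth.
63 | ## A sketch (here `dolphins` and `soap_knots` are hypothetical stand-ins for
64 | ## the observation data and interior knots):
65 | ## load("soapy.RData")
66 | ## m <- gam(count ~ s(x, y, bs = "so", xt = list(bnd = list(xy_bnd))),
67 | ##          knots = soap_knots, family = tw(), data = dolphins)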
--------------------------------------------------------------------------------
/data/mexdolphins/soapy.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/data/mexdolphins/soapy.RData
--------------------------------------------------------------------------------
/data/routes.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/data/routes.csv
--------------------------------------------------------------------------------
/data/yukon_seeds/seed_source_locations.csv:
--------------------------------------------------------------------------------
1 | Who,Site,Subsite,source_type,polygon,order,Y,X
2 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796723.731,469919.3107
3 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796742.626,469910.4723
4 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796729.702,469974.1999
5 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796725.89,470025.6419
6 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796719.851,470077.7116
7 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796725.935,470126.3351
8 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796718.948,470174.2659
9 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796702.393,470236.7358
10 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796694.535,470294.0962
11 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796742.568,470287.086
12 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796724.452,470303.7841
13 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796785.653,470258.4785
14 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796888.804,470191.593
15 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796964.678,470142.5823
16 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797042.968,470050.4508
17 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797046.439,470002.8427
18 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797040.834,469950.9495
19 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796996.21,469909.3531
20 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796975.594,469904.4163
21 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796888.762,469895.5375
22 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796850.814,469871.8622
23 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797168.951,469864.6928
24 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797133.442,469889.7501
25 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797067.112,470003.8641
26 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797045.311,469933.9624
27 | Eliot,FoxLake,SW_Facing,SSSample,NA,NA,6803025.398,465082.76
28 | Eliot,FoxLake,SW_Facing,SSSample,NA,NA,6803076.228,465133.385
29 | Eliot,FoxLake,Lowland,SSPolygon,38,1,6796715.73,470304.9674
30 | Eliot,FoxLake,Lowland,SSPolygon,38,2,6796728.455,470293.1938
31 | Eliot,FoxLake,Lowland,SSPolygon,38,3,6796753.325,470273.6796
32 | Eliot,FoxLake,Lowland,SSPolygon,38,4,6796788.911,470252.55
33 | Eliot,FoxLake,Lowland,SSPolygon,38,5,6796847.188,470216.8999
34 | Eliot,FoxLake,Lowland,SSPolygon,38,6,6796887.915,470191.8324
35 | Eliot,FoxLake,Lowland,SSPolygon,38,7,6796932.971,470156.1209
36 | Eliot,FoxLake,Lowland,SSPolygon,38,8,6796962.203,470143.2102
37 | Eliot,FoxLake,Lowland,SSPolygon,38,9,6796979.017,470128.2042
38 | Eliot,FoxLake,Lowland,SSPolygon,38,10,6797017.817,470096.0466
39 | Eliot,FoxLake,Lowland,SSPolygon,38,11,6797024.449,470083.6527
40 | Eliot,FoxLake,Lowland,SSPolygon,38,32,6797050.904,470081.0951
41 | Eliot,FoxLake,Lowland,SSPolygon,38,33,6796981.673,470133.1883
42 | Eliot,FoxLake,Lowland,SSPolygon,38,34,6796839.563,470223.1343
43 | Eliot,FoxLake,Lowland,SSPolygon,38,35,6796790.373,470257.3939
44 | Eliot,FoxLake,Lowland,SSPolygon,38,12,6797041.589,470051.2157
45 | Eliot,FoxLake,Lowland,SSPolygon,38,13,6797044.77,470038.4604
46 | Eliot,FoxLake,Lowland,SSPolygon,38,14,6797049.136,470021.4898
47 | Eliot,FoxLake,Lowland,SSPolygon,38,15,6797043.093,470009.7016
48 | Eliot,FoxLake,Lowland,SSPolygon,38,16,6797045.841,469991.8912
49 | Eliot,FoxLake,Lowland,SSPolygon,38,17,6797050.612,469984.3664
50 | Eliot,FoxLake,Lowland,SSPolygon,38,18,6797061.129,469968.7692
51 | Eliot,FoxLake,Lowland,SSPolygon,38,19,6797037.335,469950.6634
52 | Eliot,FoxLake,Lowland,SSPolygon,38,20,6797009.75,469923.8775
53 | Eliot,FoxLake,Lowland,SSPolygon,38,21,6797014.931,469917.5015
54 | Eliot,FoxLake,Lowland,SSPolygon,89,1,6796994.101,469909.7297
55 | Eliot,FoxLake,Lowland,SSPolygon,89,2,6796985.436,469906.6158
56 | Eliot,FoxLake,Lowland,SSPolygon,89,3,6796973.54,469906.5494
57 | Eliot,FoxLake,Lowland,SSPolygon,89,4,6796960.759,469898.896
58 | Eliot,FoxLake,Lowland,SSPolygon,89,5,6796919.57,469895.9869
59 | Eliot,FoxLake,Lowland,SSPolygon,89,6,6796906.707,469885.8498
60 | Eliot,FoxLake,Lowland,SSPolygon,89,7,6796855.959,469882.6563
61 | Eliot,FoxLake,Lowland,SSPolygon,89,8,6796848.594,469877.0291
62 | Eliot,FoxLake,Lowland,SSPolygon,89,15,6796843.691,469842.534
63 | Eliot,FoxLake,Lowland,SSPolygon,89,9,6796830.91,469827.1575
64 | Eliot,FoxLake,Lowland,SSPolygon,89,14,6796853.289,469799.318
65 | Eliot,FoxLake,Lowland,SSPolygon,89,12,6796817.835,469784.2797
66 | Eliot,FoxLake,Lowland,SSPolygon,89,10,6796812.723,469793.5348
67 | Eliot,FoxLake,Lowland,SSPolygon,89,13,6796821.899,469785.0872
68 | Eliot,FoxLake,Lowland,SSPolygon,89,11,6796814.147,469781.0331
69 | Eliot,FoxLake,Lowland,SSPolygon,159,1,6797215.359,469905.572
70 | Eliot,FoxLake,Lowland,SSPolygon,159,2,6797206.191,469912.9905
71 | Eliot,FoxLake,Lowland,SSPolygon,159,3,6797203.94,469943.5954
72 | Eliot,FoxLake,Lowland,SSPolygon,159,4,6797202.858,469895.6316
73 | Eliot,FoxLake,Lowland,SSPolygon,159,5,6797194.902,469889.1922
74 | Eliot,FoxLake,Lowland,SSPolygon,159,6,6797204.779,469878.4889
75 | Eliot,FoxLake,Lowland,SSPolygon,159,7,6797205.31,469870.0078
76 | Eliot,FoxLake,Lowland,SSPolygon,159,8,6797186.831,469856.0113
77 | Eliot,FoxLake,Lowland,SSPolygon,159,9,6797169.234,469850.342
78 | Eliot,FoxLake,Lowland,SSPolygon,159,10,6797150.315,469847.3502
79 | Eliot,FoxLake,Lowland,SSPolygon,159,11,6797135.268,469840.288
80 | Eliot,FoxLake,Lowland,SSPolygon,159,12,6797125.43,469837.8281
81 | Eliot,FoxLake,Lowland,SSPolygon,159,13,6797121.712,469844.4725
82 | Eliot,FoxLake,Lowland,SSPolygon,159,14,6797136.985,469852.2816
83 | Eliot,FoxLake,Lowland,SSPolygon,159,15,6797162.709,469858.8117
84 | Eliot,FoxLake,Lowland,SSPolygon,159,16,6797178.135,469866.1774
85 | Eliot,FoxLake,Lowland,SSPolygon,159,17,6797187.21,469872.7701
86 | Eliot,FoxLake,Lowland,SSPolygon,159,18,6797176.944,469888.4768
87 | Eliot,FoxLake,Lowland,SSPolygon,159,19,6797171.087,469910.6188
88 | Eliot,FoxLake,Lowland,SSPolygon,159,20,6797182.537,469924.3886
89 | Eliot,FoxLake,Lowland,SSPolygon,159,21,6797174.278,469956.3611
90 | Eliot,FoxLake,Lowland,SSPolygon,159,22,6797169.264,469966.2896
91 | Eliot,FoxLake,Lowland,SSPolygon,185,1,6797141.76,469950.5774
92 | Eliot,FoxLake,Lowland,SSPolygon,185,2,6797134.16,469928.8803
93 | Eliot,FoxLake,Lowland,SSPolygon,185,3,6797133.061,469886.2447
94 | Eliot,FoxLake,Lowland,SSPolygon,185,4,6797112.249,469847.2647
95 | Eliot,FoxLake,Lowland,SSPolygon,185,5,6797112.324,469865.5345
96 | Eliot,FoxLake,Lowland,SSPolygon,185,6,6797121.118,469891.7095
97 | Eliot,FoxLake,Lowland,SSPolygon,185,7,6797127.573,469922.8565
98 | Eliot,FoxLake,Lowland,SSPolygon,185,8,6797126.939,469938.0401
99 | Eliot,FoxLake,Lowland,SSPolygon,38,31,6797091.933,470016.4598
100 | Eliot,FoxLake,Lowland,SSPolygon,38,30,6797079.967,470007.3238
101 | Eliot,FoxLake,Lowland,SSPolygon,38,29,6797067.498,469997.9275
102 | Eliot,FoxLake,Lowland,SSPolygon,38,28,6797063.273,469988.7003
103 | Eliot,FoxLake,Lowland,SSPolygon,38,27,6797070.821,469974.1372
104 | Eliot,FoxLake,Lowland,SSPolygon,38,26,6797063.943,469959.6072
105 | Eliot,FoxLake,Lowland,SSPolygon,38,25,6797049.503,469942.8299
106 | Eliot,FoxLake,Lowland,SSPolygon,38,24,6797039.73,469931.6067
107 | Eliot,FoxLake,Lowland,SSPolygon,38,23,6797030.65,469922.4277
108 | Eliot,FoxLake,Lowland,SSPolygon,38,22,6797020.646,469910.829
109 | Eliot,FoxLake,Lowland,SSPolygon,233,1,6797526.926,469768.1144
110 | Eliot,FoxLake,Lowland,SSPolygon,233,2,6797486.016,469794.4904
111 | Eliot,FoxLake,Lowland,SSPolygon,233,3,6797400.062,469863.5996
112 | Eliot,FoxLake,SW_Facing,SSPolygon,380,1,6803013.014,465080.2516
113 | Eliot,FoxLake,SW_Facing,SSPolygon,380,2,6803089.116,465135.0063
114 | Eliot,FoxLake,SW_Facing,SSPolygon,389,1,6803262.384,464081.8498
115 | Eliot,FoxLake,SW_Facing,SSPolygon,389,2,6803618.396,464040.2266
116 | Eliot,FoxLake,SW_Facing,SSPolygon,391,1,6803587.361,464040.0052
117 | Eliot,FoxLake,SW_Facing,SSPolygon,391,2,6803174.081,464080.9813
118 | Eliot,FoxLake,Lowland,SSIsland,56,NA,6796740.729,470183.8429
119 | Eliot,FoxLake,Lowland,SSIsland,58,NA,6796737.425,470200.11
120 | Eliot,FoxLake,Lowland,SSIsland,65,NA,6796729.556,470238.3273
121 | Eliot,FoxLake,Lowland,SSIsland,67,NA,6796738.312,470243.0132
122 | Eliot,FoxLake,Lowland,SSIsland,68,NA,6796734.045,470246.1296
123 | Eliot,FoxLake,Lowland,SSIsland,97,NA,6796940.921,469902.9191
124 | Eliot,FoxLake,Lowland,SSIsland,98,NA,6796929.918,469896.6909
125 | Eliot,FoxLake,Lowland,SSIsland,99,NA,6796923.639,469901.6075
126 | Eliot,FoxLake,Lowland,SSIsland,114,NA,6796836.526,469792.1375
127 | Eliot,FoxLake,Lowland,SSIsland,156,NA,6796980.95,470060.4577
128 | Eliot,FoxLake,Lowland,SSIsland,182,NA,6797146.336,469972.2485
129 | Eliot,FoxLake,Lowland,SSIsland,183,NA,6797152.1,469950.1815
130 | Eliot,FoxLake,Lowland,SSIsland,184,NA,6797146.21,469953.2912
131 | Eliot,FoxLake,Lowland,SSIsland,212,NA,6796949.689,469802.0181
132 | Eliot,FoxLake,Lowland,SSIsland,213,NA,6796948.007,469797.7336
133 | Eliot,FoxLake,Lowland,SSIsland,214,NA,6796936.248,469789.6354
134 | Eliot,FoxLake,Lowland,SSIsland,231,NA,6797483.691,469755.6377
135 | Eliot,FoxLake,Lowland,SSIsland,232,NA,6797514.164,469726.8687
136 | Eliot,FoxLake,Lowland,SSIndividual,60,NA,6796747.059,470205.7828
137 | Eliot,FoxLake,Lowland,SSIndividual,61,NA,6796737.175,470215.1502
138 | Eliot,FoxLake,Lowland,SSIndividual,62,NA,6796734.953,470215.1313
139 | Eliot,FoxLake,Lowland,SSIndividual,63,NA,6796725.697,470212.2823
140 | Eliot,FoxLake,Lowland,SSIndividual,64,NA,6796727.524,470235.7011
141 | Eliot,FoxLake,Lowland,SSIndividual,66,NA,6796736.703,470238.8686
142 | Eliot,FoxLake,Lowland,SSIndividual,96,NA,6796948.488,469901.2824
143 | Eliot,FoxLake,Lowland,SSIndividual,203,NA,6797043.34,469928.7998
144 | Eliot,FoxLake,Lowland,SSIndividual,204,NA,6797039.158,469928.6471
145 | Eliot,FoxLake,Lowland,SSIndividual,208,NA,6797019.777,469904.491
146 |
--------------------------------------------------------------------------------
/example-bivariate-timeseries-and-ti.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "More time series; bivariate smooths and ti()"
3 | output:
4 | html_document:
5 | toc: true
6 | toc_float: true
7 | theme: readable
8 | highlight: haddock
9 | fig_width: 10
10 | fig_height: 4
11 | ---
12 |
13 | ## Background
14 |
15 | This example is an extension of the Fort Lauderdale time series model. Here we look at two additional advanced features, namely
16 |
17 | i. allowing the seasonal and trend terms to interact, and
18 | ii. setting up an ANOVA-like decomposition of an interaction smooth using `ti()` terms.
19 |
20 | The model in the earlier example estimated a constant non-linear seasonal (within-year) effect and a constant non-linear trend (between-year) effect. Such a model is unable to capture any potential change in the seasonal effect (the within-year distribution of temperatures) over time. Or, put another way, it is unable to capture potentially different trends in temperature at particular times of the year.
21 |
22 | This example extends the simpler model to more generally model changes in temperature.
23 |
24 | ### Loading and viewing the data
25 |
26 | The time series we'll look at is a long-term temperature and precipitation
27 | data set from the convention center here at Fort Lauderdale, consisting of
28 | monthly mean temperature (in degrees C), the number of days it rained that
29 | month, and the amount of precipitation that month (in mm). The data range from
30 | 1950 to 2015, with a column for year and month (with Jan. = 1). The data is from
31 | a great web-app called FetchClim2, and the link for the data [I used is
32 | here](http://fetchclimate2.cloudapp.net/#page=geography&dm=values&t=years&v=airt(13,6,2,1),prate(12,7,2,1),wet(1)&y=1950:1:2015&dc=1,32,60,91,121,152,182,213,244,274,305,335,366&hc=0,24&p=26.099,-80.123,Point%201&ts=2016-07-17T21:57:11.213Z).
33 |
34 | Here we'll load it and plot the air temperature data.
35 |
36 | ```{r, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE, highlight = TRUE}
37 | library("mgcv")
38 | library("ggplot2")
39 | florida <- read.csv("./data/time_series/Florida_climate.csv",
40 | stringsAsFactors = FALSE)
41 | head(florida)
42 |
43 | ggplot(data = florida, aes(x = month, y = air_temp, color = year, group = year)) +
44 | geom_point() +
45 | geom_line()
46 | ```
47 |
48 | ## Fitting the smooth interaction model
49 |
50 | Following from the earlier example, we fit a smooth interaction model using a tensor product spline of `year` and `month`. Notice how we specify the marginal basis types as a character vector. Here I restrict the model to AR(1) errors as this was the best model in the earlier example and it is unlikely that additional unmodelled temporal structure will turn up when we use a more general model!
51 |
52 | ```{r fit-smooth-interaction, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE}
53 | m.full <- gamm(air_temp ~ te(year, month, bs = c("tp", "cc")), data = florida,
54 | correlation = corAR1(form = ~ 1 | year),
55 | knots = list(month = c(0.5, 12.5)))
56 | layout(matrix(1:3, nrow = 1))
57 | plot(m.full$gam, scheme = 1, theta = 40)
58 | plot(m.full$gam, scheme = 1, theta = 80)
59 | plot(m.full$gam, scheme = 1, theta = 120)
60 | layout(1)
61 | summary(m.full$gam)
62 | ```
63 |
64 | The plot of the tensor product smooth nicely illustrates what the model is attempting to do. The earlier model would be forced to retain the same seasonal curve throughout the time series. Now, with the more complex model, the seasonal curve is allowed to change shape smoothly through time.
65 |
66 | Our task now is to test whether this additional complexity is worth the extra effort. Are some months warming more than others in Fort Lauderdale?
67 |
68 | ## Fitting the ANOVA-decomposed model
69 |
70 | The logical thing to do to test whether some months are warming more than others would be to drop the interaction term from the model and compare the simpler and more complex models with AIC or a likelihood ratio test. But if you look back at the model we just fitted, there is no term in that model that relates just to the interaction! The tensor product contains both the main and interactive effects. What's worse is that we can't (always) compare this model with the one we fitted in the earlier example because we need to be careful to ensure the models are properly nested. This is where the `ti()` smoother comes in.
71 |
72 | A `ti()` smoother is like a standard tensor product (`te()`) smooth, but without the main effects smooths. Now we can build a model with separate main effects smooths for `year` and `month` and add in the interaction smooth using `ti()`. This will allow us to fit two properly nested models that can be compared to test our null hypothesis that the months are warming at the same rate.
73 |
74 | Let's fit those models
75 |
76 | ```{r fit-anova-decomp, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE}
77 | m.main <- gam(air_temp ~ s(year, bs = "tp") + s(month, bs = "cc"), data = florida,
78 | knots = list(month = c(0.5, 12.5)))
79 | m.int <- gam(air_temp ~ ti(year, bs = "tp") + ti(month, bs = "cc") +
80 | ti(year, month, bs = c("tp", "cc"), k = 11), data = florida,
81 | knots = list(month = c(0.5, 12.5)))
82 | ```
83 |
84 | If you do this with `gamm()` the model fitting will fail, and for good reason. At least the two `gam()` models converged so we can see what is going on. Compare the two models using `AIC()`
85 |
86 | ```{r aic-anova-decomp, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE}
87 | AIC(m.main, m.int)
88 | ```
89 |
90 | AIC tells us that there is some support for the interaction model, but only when we force a more complex surface by specifying `k`.
91 |
92 | Regardless, the interaction term affords only a small improvement (just over 1%) in deviance explained; whilst the interaction may be favoured by AIC, any variation in the seasonal effect over time must be small.
93 |
94 | ## Exercises
95 |
96 | 1. For an example where there is a stronger change in the seasonal effect, see my two blog posts on applying these models to the Central England Temperature series:
97 | - [Post 1](http://www.fromthebottomoftheheap.net/2015/11/21/climate-change-and-spline-interactions/)
98 | - [Post 2](http://www.fromthebottomoftheheap.net/2015/11/23/are-some-seasons-warming-more-than-others/)
99 |
100 | These two posts take you through a full analysis of this data series.
101 |
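102 | 2. As a quick check of the size of the interaction effect, compare the proportion of deviance explained by the two models fitted above; `dev.expl` is a component of the object returned by `summary()` for a fitted `gam`:
103 | 
104 | ```{r dev-expl-check, eval = FALSE}
105 | ## a minimal sketch using m.main and m.int from this example
106 | c(main = summary(m.main)$dev.expl, int = summary(m.int)$dev.expl)
107 | ```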
--------------------------------------------------------------------------------
/example-forest-health.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Markov random field smooths & German forest health data"
3 | output:
4 | html_document:
5 | toc: true
6 | toc_float: true
7 | theme: readable
8 | highlight: haddock
9 | ---
10 |
11 | # Introduction
12 |
13 | In this example, we look at a spatial data set of forest health data from a stand in Germany. We can treat the individual trees as discrete spatial units and fit a spatial term using a special type of spline basis, a Markov random field.
14 |
15 | The data come from a project in the forest of Rothenbuch, Germany, which has been ongoing since 1982. The project studies five species but here we just consider the beech data. Each year the condition of each of 83 trees is categorized into one of nine ordinal classes in terms of the degree (percentage) of defoliation. A defoliation value of 0% indicates a healthy beech tree, whilst a value of 100% indicates a tree is dead. For the purposes of this example, several of the nine ordinal classes are merged to form a three-class response variable of interest:
16 |
17 | 1. 0%, healthy trees that I'll refer to as the `"low"` defoliation group
18 | 2. 12.5% -- 37.5% defoliation, which I refer to as the `"med"` group, and
19 | 3. 50% -- 100% defoliation, which is the `"high"` group.
20 |
21 | The original classes were `r paste0(c(0, 12.5, 25, 37.5, 50, 62.5, 75, 87.5, 100), "%")` and are in variable `defol`.
22 |
23 | Alongside the response variable, a number of continuous and categorical covariates are recorded
24 |
25 | - `id` --- location ID number
26 | - `year` --- year of recording
27 | - `x` and `y` --- the x- and y-coordinates of each location
28 | - `age` --- average age of trees at a location
29 | - `canopyd` --- canopy density at the location, in %
30 | - `gradient` --- slope, in %
31 | - `alt` --- altitude above sea level in m
32 | - `depth` --- soil depth in cm
33 | - `ph` --- soil pH at 0--2cm depth
34 | - `watermoisture` --- soil moisture in three categories
35 | 1. `1` moderately dry
36 | 2. `2` moderately moist
37 | 3. `3` moist or temporarily wet
38 | - `alkali` --- fraction of alkali ions in four categories
39 | 1. `1` very low
40 | 2. `2` low
41 | 3. `3` moderate
42 | 4. `4` high
43 | - `humus` thickness of the soil humus layer in five categories
44 | 1. `0` 0cm
45 | 2. `1` 1cm
46 | 3. `2` 2cm
47 | 4. `3` 3cm
48 | 5. `4` $>$3cm
49 | - `type` --- type of forest
50 | - `0` deciduous forest
51 | - `1` mixed forest
52 | - `fert` --- fertilization
53 | - `0` not fertilized
54 | - `1` fertilized
55 |
56 | The aim of the example is to investigate the effect of the measured covariates on the degree of defoliation and to quantify any temporal trend and spatial effect of geographic location in the data set, while adjusting for the effects of the other covariates.
57 |
58 | The data are extensively analysed in Fahrmeir, Kneib, Lang, and Marx (2013) *Regression: Models, Methods and Applications*. Springer.
59 |
60 | # Getting started
61 |
62 | Begin by loading the packages we'll use in this example
63 |
64 | ```{r load-packages}
65 | ## Load packages
66 | library("mgcv")
67 | library("ggplot2")
68 | ```
69 |
70 | Next, load the forest health data set
71 |
72 | ```{r read-data}
73 | ## Read in the forest health data
74 | forest <- read.table("./data/forest-health/beech.raw", header = TRUE, na.strings = ".")
75 | head(forest)
76 | ```
77 |
78 | The data require a little processing to ensure they are correctly coded. The chunk below does
79 |
80 | 1. Makes a nicer stand label so that we can sort stands numerically even though the id is character,
81 | 2. Aggregates the defoliation data into an ordered categorical variable for low, medium, or high levels of defoliation. This is converted to a numeric variable for modelling,
82 | 3. Converts the categorical variables to factors; `humus` is converted to a 4-level factor,
83 | 4. Removes some `NA`s, and
84 | 5. Summarises the response variable in a table.
85 |
86 | ```{r process-forest-data}
87 | forest <- transform(forest, id = factor(formatC(id, width = 2, flag = "0")))
88 |
89 | ## Aggregate defoliation & convert categorical vars to factors
90 | levs <- c("low","med","high")
91 | forest <- transform(forest,
92 | aggDefol = as.numeric(cut(defol, breaks = c(-1,10,45,101),
93 | labels = levs)),
94 | watermoisture = factor(watermoisture),
95 | alkali = factor(alkali),
96 | humus = cut(humus, breaks = c(-0.5, 0.5, 1.5, 2.5, 3.5),
97 | labels = 1:4),
98 | type = factor(type),
99 | fert = factor(fert))
100 | forest <- droplevels(na.omit(forest))
101 | head(forest)
102 | with(forest, table(aggDefol))
103 | ```
104 |
105 | To view the spatial arrangement of the forest stand, plot the unique `x` and `y` coordinate pairs
106 |
107 | ```{r plot-forest-stand}
108 | ## Plot data
109 | ggplot(unique(forest[, c("x","y")]), aes(x = x, y = y)) +
110 | geom_point()
111 | ```
112 |
113 | # Non-spatial ordered categorical GAM
114 |
115 | Our first model will ignore the spatial component of the data. All continuous variables except `ph` and `depth` are modelled using splines, and all categorical variables are included in the model. Note that we can speed up fitting by turning on some multi-threaded parallel processing in parts of the fitting algorithm via the `nthreads` control argument. The `ocat` family is used and we specify that there are three classes. Smoothness selection is via REML.
116 |
117 | ```{r fit-naive-model}
118 | ## Model
119 | ctrl <- gam.control(nthreads = 3)
120 | forest.m1 <- gam(aggDefol ~ ph + depth + watermoisture + alkali + humus + type + fert +
121 | s(age) + s(gradient, k = 20) + s(canopyd) + s(year) + s(alt),
122 | data = forest, family = ocat(R = 3), method = "REML",
123 | control = ctrl)
124 | ```
125 |
126 | The model summary
127 |
128 | ```{r summary-naive}
129 | summary(forest.m1)
130 | ```
131 |
132 | indicates all model terms are statistically significant at the 5% level, especially the smooth terms. Don't pay too much attention to this, though, as we have not yet accounted for spatial structure in the data; several of the variables are likely to be spatially autocorrelated, and hence the identified effects may be spurious.
133 |
134 | This is somewhat confirmed by the form of the fitted functions; rather than smooth, monotonic functions, many terms are highly non-linear and difficult to interpret.
135 |
136 | ```{r plot-naive-smooths}
137 | plot(forest.m1, pages = 1, shade = 2, scale = 0)
138 | ```
139 | We might expect that older trees are more susceptible to damage, yet, confusingly, there is a decreasing effect for the very oldest trees. The smooth of `gradient` is uninterpretable. Trees in low and high altitude areas are less damaged than those in areas of intermediate elevation, which is also counterintuitive.
140 |
141 | Running `gam.check()`
142 |
143 | ```{r gam-check-naive}
144 | gam.check(forest.m1)
145 | ```
146 |
147 | suggests no major problems, but the residual plot is difficult to interpret owing to the categorical nature of the response variable. The printed output suggests some smooth terms may need their basis dimension increasing. Before we do this however, we should add a spatial effect to the model.
148 |
149 | # Spatial GAM via a MRF smooth
150 |
151 | For these data, it would be more natural to fit a spatial effect via a 2-d smoother as we've considered in other examples. However, we can consider the trees as being discrete spatial units and fit a spatial effect via a Markov random field (MRF) smooth. To fit the MRF smoother, we need information on the neighbours of each tree. In this example, any tree within 1.8km of a focal tree was considered a neighbour of that focal tree. This neighbourhood information is stored in a BayesX graph file, which we can convert into the format needed for **mgcv**'s MRF basis function. To facilitate reading the graph file, load the utility function `gra2mgcv()`
152 |
153 | ```{r load-graph-fun}
154 | ## source graph-reading function
155 | source("./code_snippets/gra2mgcv.R")
156 | ```
157 |
158 | Next we load the `.gra` file and do some manipulations to match the `forest` environmental data file
159 |
160 | ```{r load-graph}
161 | ## Read in graph file and output list required by mgcv
162 | nb <- gra2mgcv("./data/forest-health/foresthealth.gra")
163 | nb <- nb[order(as.numeric(names(nb)))]
164 | names(nb) <- formatC(as.numeric(names(nb)), width = 2, flag = "0")
165 | ```
166 |
167 | Look at the structure of `nb`:
168 |
169 | ```{r head-nb}
170 | head(nb)
171 | ```
172 |
173 | In **mgcv** the MRF basis can be specified by one or more of
174 |
175 | 1. `polys`; coordinates of vertices defining spatial polygons for each discrete spatial unit. Any two spatial units that share one or more vertices are considered neighbours,
176 | 2. `nb`; a list with one component per spatial unit, where each component contains indices of the neighbouring components of the current spatial unit, and/or
177 | 3. `penalty`; the actual penalty matrix of the MRF basis. This is an $N$ by $N$ matrix with the number of neighbours of each unit on the diagonal, and the $j$th column of the $i$th row set to `-1` if the $j$th spatial unit is a neighbour of the $i$th unit. Elsewhere the penalty matrix is all `0`s.
178 |
179 | **mgcv** will create the penalty matrix for you if you supply `polys` or `nb`; a sketch of that construction is shown below for illustration. As we don't have polygons here, we'll use the information from the `.gra` file, converted to the format needed by `gam()`, to indicate the neighbourhood.
180 |
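Here is that sketch; it assumes each component of `nb` holds the integer positions of that unit's neighbours within the list (in practice we simply let **mgcv** do this for us):

```{r mrf-penalty-sketch, eval = FALSE}
## build the MRF penalty S from the neighbour list `nb` -- illustration only
N <- length(nb)
S <- matrix(0, N, N, dimnames = list(names(nb), names(nb)))
diag(S) <- lengths(nb)        # number of neighbours on the diagonal
for (i in seq_len(N)) {
    S[i, nb[[i]]] <- -1       # -1 wherever unit j is a neighbour of unit i
}
## S could then be supplied via s(id, bs = "mrf", xt = list(penalty = S))
```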
181 | The `nb` list is passed along using the `xt` argument of the `s()` function. For the MRF basis, `xt` is a named list with one or more of the components listed above. In the model call below we use `xt = list(nb = nb)` to pass on the neighbourhood list. To indicate that an MRF smooth is needed, we use `bs = "mrf"` and the covariate of the smooth is the factor indicating to which spatial unit each observation belongs. In the call below we use the `id` variable. Note that this doesn't need to be a factor, just something that can be coerced to one.
182 |
183 | All other aspects of the model fit remain the same hence we use `update()` to add the MRF smooth without repeating everything from the original call.
184 |
185 | ```{r fit-mrf-model}
186 | ## Fit model with MRF
187 | ## forest.m2 <- gam(aggDefol ~ ph + depth + watermoisture + alkali + humus + type + fert +
188 | ## s(age) + s(gradient, k = 20) + s(canopyd) + s(year) + s(alt) +
189 | ## s(id, bs = "mrf", xt = list(nb = nb)),
190 | ## data = forest, family = ocat(R = 3), method = "REML",
191 | ## control = ctrl)
192 | forest.m2 <- update(forest.m1, . ~ . + s(id, bs = "mrf", xt = list(nb = nb)))
193 | ```
194 |
195 | Look at the model summary:
196 |
197 | ```{r summary-mrf-model}
198 | summary(forest.m2)
199 | ```
200 |
201 | What differences do you see between the model with the MRF spatial effect and the first model that we fitted?
202 |
203 | Quickly create plots of the fitted smooth functions
204 |
205 | ```{r plot-mrf-smooths}
206 | plot(forest.m2, pages = 1, shade = 2, scale = 0, seWithMean = TRUE)
207 | ```
208 |
209 | Notice that here we use `seWithMean = TRUE` as one of the terms has been shrunk back to a linear function and would otherwise have a *bow tie* confidence interval.
210 |
211 | Compare these fitted functions with those from the original model. Are these more in keeping with expectations?
212 |
213 | Model diagnostics are difficult with models for discrete responses such as these. Instead we can interrogate the model to derive quantities of interest that illustrate or summarise the model fit. First we start by generating posterior probabilities for the three ordinal defoliation/damage categories using `predict()`. This is similar to the code Eric showed this morning for the ordered categorical family.
214 |
215 | ```{r fitted-values}
216 | fit <- predict(forest.m2, type = "response")
217 | colnames(fit) <- levs
218 | head(fit)
219 | ```
220 |
221 | The `predict()` method returns a 3-column matrix, one column per category. The entries in the matrix are the posterior probabilities of class membership, with rows summing to 1. As we'll see later, we can take a *majority wins* approach and assign each observation a fitted class membership and compare these to the known classes.
222 |
223 | For easier visualisation it would be nicer to have these fitted probabilities in a tidy form, which we now do via a few manipulations
224 |
225 | ```{r process-fitted-values}
226 | fit <- setNames(stack(as.data.frame(fit)), c("Prob", "Defoliation"))
227 | fit <- transform(fit, Defoliation = factor(Defoliation, labels = levs))
228 | fit <- with(forest, cbind(fit, x, y, year))
229 | head(fit)
230 | ```
231 |
232 | The fitted values are now ready for plotting. Here we plot just the years 2000--2004, showing the spatial arrangement of trees, faceted by defoliation/damage category:
233 |
234 | ```{r plot-fitted}
235 | ggplot(subset(fit, year >= 2000), aes(x = x, y = y, col = Prob)) +
236 | geom_point(size = 1.2) +
237 | facet_grid(Defoliation ~ year) +
238 | coord_fixed() +
239 | theme(legend.position = "top")
240 | ```
241 |
242 | A more complex use of the model involves predicting change in class posterior probability over time whilst holding all other variables at the observed mean or category mode
243 |
244 | ```{r timeseries-predictions}
245 | N <- 200
246 | pdat <- with(forest, expand.grid(year = seq(min(year), max(year), length = N),
247 | age = mean(age, na.rm = TRUE),
248 | gradient = mean(gradient, na.rm = TRUE),
249 | canopyd = mean(canopyd, na.rm = TRUE),
250 | alt = mean(alt, na.rm = TRUE),
251 | depth = mean(depth, na.rm = TRUE),
252 | ph = mean(ph, na.rm = TRUE),
253 | humus = factor(2, levels = levels(humus)),
254 | watermoisture = factor(2, levels = levels(watermoisture)),
255 | alkali = factor(2, levels = levels(alkali)),
256 | type = factor(0, levels = c(0,1)),
257 | fert = factor(0, levels = c(0,1)),
258 | id = levels(id)))
259 | pred <- predict(forest.m2, newdata = pdat, type = "response", exclude = "s(id)")
260 | colnames(pred) <- levs
261 | pred <- setNames(stack(as.data.frame(pred)), c("Prob","Defoliation"))
262 | pred <- cbind(pred, pdat)
263 | pred <- transform(pred, Defoliation = factor(Defoliation, levels = levs))
264 | ```
265 |
266 | A plot of the predictions is produced using
267 |
268 | ```{r plot-timeseries-predictions}
269 | ggplot(pred, aes(x = year, y = Prob, colour = Defoliation)) +
270 | geom_line() +
271 | theme(legend.position = "top")
272 | ```
273 |
274 | Using similar code, produce a plot for the effect of `age` holding other values at the mean or mode.
275 |
276 | The final summary of the model that we'll produce is a confusion matrix of observed and predicted class membership. Technically, this is over-optimistic as we are using the same data to both fit and test the model, but it is an illustration of how well the model does at fitting the observed classes.
277 |
278 | ```{r confusion-matrix}
279 | fit <- predict(forest.m2, type = "response")
280 | fitClass <- factor(levs[apply(fit, 1, which.max)], levels = levs)
281 | obsClass <- with(forest, factor(aggDefol, labels = levs))
282 | sum(fitClass == obsClass) / length(fitClass)
283 | table(fitClass, obsClass)
284 | ```
285 |
286 | Just don't take it as any indication of how well the model will do at predicting the defoliation class for a new year or tree.
287 |
288 | How much better does the model with the MRF spatial effect fit the data compared to the original model? We can use AIC as one means of answering that question
289 |
290 | ```{r aic}
291 | AIC(forest.m1, forest.m2)
292 | ```
293 |
294 | Which model does AIC indicate as fitting the data best?
295 |
296 | ## Exercise
297 |
298 | As an additional exercise, replace the MRF smooth with a 2-d spline of the `x` and `y` coordinates and produce maps of the class probabilities over the spatial domain. You'll need to use ideas and techniques from some of the other spatial examples in order to complete this task.
299 |
--------------------------------------------------------------------------------
/example-linear-functionals.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Linear functionals; or: what to do when you've got a bunch of x-values and only one y"
3 | author: Eric Pedersen
4 | output:
5 | html_document:
6 | toc: true
7 | toc_float: true
8 | theme: readable
9 | highlight: haddock
10 | ---
11 |
12 | ## 1. Background
13 |
14 | Up to now, we've been looking at data where every covariate we're using to
15 | predict our y-values consists of a single value for each y. However, not all
16 | predictor data looks like this. In many cases, a single covariate may have
17 | several values associated with the same outcome variable. For instance, let's
18 | say we've measured total ecosystem production in several different lakes, as
19 | well as temperature along a gradient away from the shoreline in each lake. We want to know:
20 |
21 | 1. how well temperature predicts production
22 | 2. whether temperatures at different distances from the shore have different effects on
23 | production (as production varies a lot between the littoral and pelagic zones).
24 |
25 | Now, it looks like there might be a few ways to answer this in a standard gam
26 | setting. First we could just average temperatures across locations and use that
27 | as a predictor to answer question 1. However, that doesn't give us any insight
28 | into question 2, and it's pretty easy to imagine a case where warmer temperatures
29 | at the shore increase production, but warmer temperatures in the middle of the
30 | lake have very little effect, or where a single warm area could substantially
31 | increase production even in a cold lake. We could also try to fit a separate
32 | smooth of production on temperature at each shore distance. However, if we have
33 | a lot of distances, this ends up fitting a bunch of separate smooths, and we'd
34 | rapidly run out of degrees of freedom. Also, it would mean throwing away
35 | information; we'd expect that the effect of temperature a meter from the
36 | shore should be very similar to the effect of temperature right at the
37 | shoreline, but by fitting a separate smooth for each, we're ignoring that (and
38 | likely going to suffer from concurvity issues to boot!).
39 |
40 | What we need is a method that can account for the fact that we have multiple x
41 | values for a given predictor. Fortunately, `mgcv` can handle this pretty easily.
42 | It does this by allowing you to pass matrix-valued predictors to `s()` and `te()`
43 | terms. When you pass `mgcv` a matrix, it will fit the same smooth function to
44 | each of the columns of the matrix, then sum across the rows of the matrix of
45 | transformed values to give the estimated mean value for a given `y`. Let's say
46 | `y` is a one-dimensional outcome, with n measured values, and `x` is a matrix
47 | with n rows and k columns. The predicted values for the ith value of `y` would
48 | be: $y_i \sim\sum_{j=1}^k f(x_{i,j})$, where $f(x_{i,j})$ is itself a sum of
49 | basis functions multiplied by model coefficients as before.
50 |
51 | In mathematical terms, this makes our smooth term a *functional*, where the
52 | outcome is a function of a vector of inputs rather than a single value, and a
53 | *linear* functional, as it's a linear sum of smooth terms over the columns; but
54 | you really don't need to know much about functionals to use these. I only
55 | mention the term because you'll need it if you want to dig into this approach
56 | further or to find the relevant help files:
57 | `?linear.functional.terms`.
58 |
59 | There are a few major uses I've found for these. There are likely more than this,
60 | but these are all cases I've encountered. Also, each of these cases requires you
61 | to set up your predictor matrices carefully, so make sure you know what kind of
62 | functional you'll be fitting, and which predictor matrix goes where in the `s()`
63 | call.
64 |
65 | ### Case 1. Nonlinear averaging.
66 | The first case, and the simplest to fit, is to estimate
67 | a nonlinear average. Let's look at the lake example from before. Say we think
68 | that distance from shore shouldn't matter, but that production increases
69 | non-linearly with temperature. In that case, a cold lake with a few hot spots
70 | may actually be more productive than a lake that's consistently lukewarm.
71 | Therefore, if you average over temperatures before estimating a function, you'll
72 | miss this effect (as we know from statistics that the average of a non-linear
73 | function applied to x does not, in general, equal the function applied to the
74 | average of x). Instead, we can just provide a matrix of values (the predictor matrix) to `s()`: here, the predictor matrix is a matrix of temperatures, where each column holds the temperatures measured at one of our distances from shore.
75 | We do this in the code below with the variable `temp_matrix`.
76 |
77 |
78 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE}
79 | library(dplyr)
80 | library(mgcv)
81 | library(ggplot2)
82 | library(tidyr)
83 | n_lakes = 200
84 | lake_mean_temps = rnorm(n_lakes, 25,6)
85 | lake_data = as.data.frame(expand.grid(lake = 1:n_lakes,
86 | shore_dist_m = c(0,1,5,10,20,40)))
87 | lake_data$temp = rnorm(n_lakes*6, lake_mean_temps[lake_data$lake], 8)
88 | lake_data$production = 5*dnorm(lake_data$temp, 30, sd = 5)/dnorm(1,0,5)+rnorm(n_lakes*6, 0, 1)
89 | lake_data = lake_data %>% group_by(lake)%>%
90 | mutate(production = mean(production),
91 | mean_temp = mean(temp))%>%
92 | spread(shore_dist_m, temp)
93 |
94 |
95 | lake_data$temp_matrix = lake_data %>%
96 | select(`0`:`40`) %>%
97 | as.matrix(.)
98 | head(lake_data)
99 |
100 |
101 | mean_temp_model = gam(production~s(mean_temp),data=lake_data)
102 | nonlin_temp_model = gam(production~s(temp_matrix),
103 | data=lake_data)
104 |
105 | layout(matrix(1:2, nrow=1))
106 | plot(mean_temp_model)
107 | plot(nonlin_temp_model)
108 | layout(1)
109 | temp_predict_data = data.frame(temp = seq(10,40,length=100),
110 | mean_temp = seq(10,40,length=100))
111 |
112 | temp_matrix = matrix(seq(10,40,length=100),nrow= 100,ncol=6,
113 | byrow = F)
114 |
115 | temp_predict_data = temp_predict_data %>%
116 | mutate(linear_average = as.numeric(predict(mean_temp_model,.)),
117 | nonlinear_average = as.numeric(predict(nonlin_temp_model, .)),
118 | true_function = 5*dnorm(temp, 30, sd = 5)/dnorm(1,0,5))%>%
119 | gather(model, value,linear_average:true_function)
120 |
121 | #This plots the two fitted models vs. the true model.
122 | ggplot(aes(temp, value,color=model), data=temp_predict_data)+
123 | geom_line()+
124 | scale_color_brewer(palette="Set1")+
125 | theme_bw(20)
126 | ```
127 |
128 | The nonlinear model is substantially more uncertain at the ends,
129 | but it fits the true function much more effectively. The function from
130 | the average data substantially underestimates the effect of temperature on
131 | production.
132 |
133 | ### Case 2: Weighted averaging
134 | The next major case where linear functionals are useful is for weighted
135 | averages. Here, you have some predictor variable measured at various points at
136 | different distances or lags from a given point, or at different locations along
137 | some gradient. This could be a variable that's been measured at various distances
138 | away from each observed site, and you want to understand at what scale that
139 | variable will affect your parameter of interest. It could also be a predictor
140 | variable measured at several time points before the observation, and you want to
141 | know at what lags the two variables interact.
142 |
143 | In this case, we'll assume that the relationship between our variable of
144 | interest (x) and the outcome is linear at any given lag, but that linear
145 | relationship changes smoothly with the lag. We'll look at the case where both
146 | relationships are nonlinear next. Here we have to create two matrices. The first
147 | is the lag matrix, and it will have one column for each lag we're interested in.
148 | All the values in a given column will be equal to the lag value.
149 | The second matrix is the predictor matrix, and it is the same as in Case 1 (see [the section below](#by_var) on `by=` terms for how the two matrices combine).
150 |
151 |
152 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE}
153 | library(dplyr)
154 | library(mgcv)
155 | library(ggplot2)
156 | library(tidyr)
157 | n_lakes = 200
158 | lake_mean_temps = rnorm(n_lakes, 25,6)
159 | lake_data = as.data.frame(expand.grid(lake = 1:n_lakes,
160 | shore_dist_m = c(0,1,5,10,20,40)))
161 | lake_data$temp = rnorm(n_lakes*6, lake_mean_temps[lake_data$lake], 8)
162 | lake_data$production = with(lake_data, rnorm(n_lakes*6,
163 | temp*3*exp(-shore_dist_m/10),1))
164 | lake_data = lake_data %>% group_by(lake)%>%
165 | mutate(production = mean(production),
166 | mean_temp = mean(temp))%>%
167 | spread(shore_dist_m, temp)
168 |
169 | #This is our lag matrix for this example
170 | shore_dist_matrix = matrix(c(0,1,5,10,20,40), nrow=n_lakes,ncol=6, byrow = T)
171 | head(shore_dist_matrix)
172 |
173 | #This is our predictor matrix
174 | lake_data$temp_matrix = lake_data %>%
175 | select(`0`:`40`) %>%
176 | as.matrix(.)
177 | head(lake_data)
178 |
179 | #Note: we need to set k=6 here, as we really only have 6 degrees of freedom (one
180 | #for each distance we've measured from the shore).
181 |
182 | nonlin_temp_model = gam(production~s(shore_dist_matrix, by= temp_matrix,k=6),
183 | data=lake_data)
184 | plot(nonlin_temp_model)
185 |
186 | #Overlay the true lag-weight function from the simulation for comparison.
187 | #The simulated production is the *mean* over the 6 distances while the model
188 | #fits a *sum* over them, so the fitted weights should track 3*exp(-d/10)/6.
189 | dist_seq = seq(0, 40, length = 100)
190 | lines(dist_seq, 3*exp(-dist_seq/10)/6, col = "red")
191 | ```
192 |
193 | ## 2. Key concepts and functions
194 |
195 | ### using `by=` terms in smoothers {#by_var}
196 | One of the most useful features in `mgcv` is the `by=` argument for smooths.
197 | This has two uses.
198 |
199 | 1. For a given smooth (say, `y~s(x)`), if you set `s(x, by=group)`, where group
200 | is some factor predictor, `mgcv` will fit a separate smooth of x for
201 | each level of that factor. The model will produce a different smooth $s_k(x)$
202 | for each kth level, allowing you to test if a given relationship varies between
203 | groups.
204 | 2. If instead you set `s(x, by=z)` where z is a numerical value, `mgcv` will
205 | fit a varying slopes regression. Instead of modeling y as a smooth function of
206 | x, this will model y as a *linear* function of z, where the slope of the
207 | relationship between z and y changes smoothly as a function of x, so the
208 | predicted value for the ith value of y would be: $y_i \sim f(x_i) \cdot z_i$.
209 | If you're using matrix predictors as discussed above, for numerical predictors
210 | the `by` variable `z` also has to be a matrix with the same dimensions as `x`.
211 | The predicted value in this case will be: $y_i \sim \sum_{j=1}^k f(x_{i,j})\cdot z_{i,j}$, where $j$ indexes the columns of the matrices $x$ and $z$ (a short sketch of both uses follows this list).
212 |
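All of the names in this sketch (`dat`, `y`, `x`, `z`, `group`) are hypothetical stand-ins:

```{r by-sketch, eval = FALSE}
library(mgcv)
## 1. factor by: one smooth of x per level of the factor `group`.
##    Include `group` as a main effect too, since each by-smooth is centred.
m_factor  <- gam(y ~ group + s(x, by = group), data = dat)

## 2. numeric by: a varying-coefficient model, y_i ~ f(x_i) * z_i
m_varcoef <- gam(y ~ s(x, by = z), data = dat)
```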
213 |
214 | ## 3. Calculating dispersal kernels
215 | 
As a worked example of matrix predictors, we'll estimate a dispersal kernel for spruce seedlings in the Yukon. Below we compute the log distance from each seedling plot to every seed source, then compare a model that uses only the distance to the nearest source against one that sums a smooth contribution over *all* sources (a linear functional), with plot area as an offset.

216 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE}
217 | yukon_seedling_data = read.csv("data/yukon_seeds/seed_data.csv")
218 | yukon_source_data =read.csv("data/yukon_seeds/seed_source_locations.csv")
219 | seed_dist = matrix(0, nrow = nrow(yukon_seedling_data),
220 | ncol= nrow(yukon_source_data))
221 | for(i in 1:nrow(yukon_seedling_data)){
222 | seed_dist[i,] = sqrt((yukon_source_data$X- yukon_seedling_data$X[i])^2 + (yukon_source_data$Y- yukon_seedling_data$Y[i])^2)
223 | }
224 | seed_dist_l = log(seed_dist)
225 | yukon_seedling_data$min_dist_l = apply(seed_dist_l, MARGIN = 1,min)
226 |
227 | basic_dispersal_model = gam(n_spruce~s(min_dist_l)+offset(log(plot_area_m2)),
228 | data=yukon_seedling_data, family=nb)
229 | full_dispersal_model = gam(n_spruce~s(seed_dist_l)+offset(log(plot_area_m2)),
230 | data=yukon_seedling_data, family=nb)
231 | ```
232 |
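One way the two fits might be compared (a sketch, assuming the chunk above ran): both models use a full negative binomial likelihood, so AIC applies, and plotting the full model shows the fitted log-distance kernel.

```{r dispersal-compare, eval = FALSE}
## compare the nearest-source and full-kernel models, then inspect the kernel
AIC(basic_dispersal_model, full_dispersal_model)
plot(full_dispersal_model, pages = 1)
```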
233 |
--------------------------------------------------------------------------------
/example-spatial-mexdolphins-solutions.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Further analysis of pantropical spotted dolphins in the Gulf of Mexico"
3 | output:
4 | html_document:
5 | toc: true
6 | toc_float: true
7 | theme: readable
8 | highlight: haddock
9 |
10 | ---
11 |
12 | # Preamble
13 |
14 | This exercise is based on the [Appendix of Miller et al 2013](http://distancesampling.org/R/vignettes/mexico-analysis.html). In this example we're ignoring all kinds of important things like detectability and availability. This should not be treated as a serious analysis of these data! For a more complete treatment of detection-corrected abundance estimation via distance sampling and generalized additive models, see Miller et al. (2013).
15 |
16 | From that appendix:
17 |
18 | *The analysis is based on a dataset of observations of pantropical dolphins in the Gulf of Mexico (shipped with Distance 6.0 and later). For convenience the data are bundled in an `R`-friendly format, although all of the code necessary for creating the data from the Distance project files is available [on github](http://github.com/dill/mexico-data). The OBIS-SEAMAP page for the data may be found at the [SEFSC GoMex Oceanic 1996](http://seamap.env.duke.edu/dataset/25) survey page.*
19 |
20 |
21 | ## Doing these exercises
22 |
23 | Probably the easiest way to do these exercises is to open this document in RStudio and go through the code blocks one by one (hitting the "play" button in the editor window), filling in the code where necessary and executing the commands one-by-one. You can then compile the document once you're done to check that everything works.
24 |
25 | # Data format
26 |
27 | The data are provided in the `data/mexdolphins` folder as the file `mexdolphins.RData`. Loading this we can see what is provided:
28 |
29 |
30 | ```{r loaddata}
31 | load("data/mexdolphins/mexdolphins.RData")
32 | ls()
33 | ```
34 |
35 | - `mexdolphins` the `data.frame` containing the observations and covariates, used to fit the model.
36 | - `pred_latlong` an `sp` object that has the shapefile for the prediction grid, used for fancy graphs
37 | - `preddata` prediction grid without any fancy spatial stuff
38 |
39 | Looking further into the `mexdolphins` frame we see:
40 |
41 | ```{r frameinspect}
42 | str(mexdolphins)
43 | ```
44 |
45 | A brief explanation of each entry:
46 |
47 | - `Sample.Label` identifier for the effort "segment" (approximately square sampling area)
48 | - `Transect.Label` identifier for the transect that this segment belongs to
49 | - `longitude`, `latitude` location in lat/long of this segment
50 | - `x`, `y` location in projected coordinates (projected using the [North American Lambert Conformal Conic projection](https://en.wikipedia.org/wiki/Lambert_conformal_conic_projection))
51 | - `Effort` the length of the current segment
52 | - `depth` the bathymetry at the segment's position
53 | - `count` number of dolphins observed in this segment
54 | - `segment.area` the area of the segment (`Effort` multiplied by the width of the segment)
55 | - `off.set` the logarithm of the `segment.area` multiplied by a correction for detectability (see link to appendix above for more information on this)
56 |
57 |
58 | # Modelling
59 |
60 | Our objective here is to build a spatially explicit model of abundance of the dolphins. In some sense this is a kind of species distribution model.
61 |
62 | Our possible covariates to model abundance are location and depth. These are fairly good predictors of the abundance (SPOILER ALERT), though we could probably improve the model further by including things like sea surface temperature and chlorophyll *a*.
63 |
64 | ## A simple model to start with
65 |
66 | We can begin with the model we showed in the first lecture:
67 |
68 | ```{r simplemodel}
69 | library(mgcv)
70 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set),
71 | data = mexdolphins,
72 | family = quasipoisson(),
73 | method = "REML")
74 | ```
75 |
76 | That is, we fit the counts as a function of depth, using the offset to take into account effort. We use a quasi-Poisson response (i.e., modelling just the mean-variance relationship such that the variance is proportional to the mean) and use REML for smoothness selection.
77 |
78 | We can check the assumptions of this model by using `gam.check`:
79 |
80 | ```{r simplemodel-check}
81 | gam.check(dolphins_depth)
82 | ```
83 |
84 | As is usual for count data, these plots are a bit tricky to interpret. For example the residuals vs linear predictor plot in the top right has that nasty line through it that makes looking for pattern tricky. We can see easily that the line equates to the zero count observations:
85 |
86 | ```{r zeroresids, fig.width=7, fig.height=7}
87 | # code from the insides of mgcv::gam.check
88 | resid <- residuals(dolphins_depth, type="deviance")
89 | linpred <- napredict(dolphins_depth$na.action, dolphins_depth$linear.predictors)
90 | plot(linpred, resid, main = "Resids vs. linear pred.",
91 | xlab = "linear predictor", ylab = "residuals")
92 |
93 | # now add red dots corresponding to the zero counts
94 | points(linpred[mexdolphins$count==0],resid[mexdolphins$count==0],
95 | pch=19, col="red", cex=0.5)
96 | ```
97 |
98 | We can use randomised quantile residuals instead of deviance residuals to get around this in some cases (though not for quasi-Poisson, as we don't have a proper likelihood!).
99 |
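As a minimal sketch of what that looks like, here is a Poisson refit of the model above (quasi-Poisson itself has no likelihood to invert); see also `code_snippets/quantile_resid.R` in this repository:

```{r rqr-sketch, eval = FALSE}
## randomised quantile residuals (Dunn & Smyth, 1996) for a Poisson refit
dolphins_pois <- gam(count ~ s(depth) + offset(off.set),
                     data = mexdolphins,
                     family = poisson(),
                     method = "REML")
mu <- fitted(dolphins_pois)
y  <- mexdolphins$count
## randomise within each discrete step of the CDF, then map to N(0, 1)
u <- runif(length(y), ppois(y - 1, mu), ppois(y, mu))
qqnorm(qnorm(u)); qqline(qnorm(u))
```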
100 | Ignoring the plots for now (as we'll address them in the next section), let's look at the text output. It seems that the `k` value we set (or rather the default of 10) has been adequate.
101 |
102 | We could increase the value of `k` by replacing the `s(...)` with, for example, `s(depth, k=25)` (for a possibly very wiggly function) or `s(depth, k=3)` (for a much less wiggly function). Making `k` big will create a bigger design matrix and penalty matrix.
103 |
104 |
105 | **Exercise**
106 |
107 | Look at the differences in the size of the design and penalty matrices by using `dim(model.matrix(...))` and `dim(model$smooth[[1]]$S[[1]])`, replacing `...` and `model` appropriately for models with `k=3` and `k=30`.
108 |
109 | ```{r simplemodel-bigsmall}
110 | dolphins_depth_bigk <- gam(count ~ s(depth, k=30) + offset(off.set),
111 | data = mexdolphins,
112 | family = quasipoisson(),
113 | method = "REML")
114 | dim(model.matrix(dolphins_depth_bigk))
115 | dim(dolphins_depth_bigk$smooth[[1]]$S[[1]])
116 | dolphins_depth_smallk <- gam(count ~ s(depth, k=3) + offset(off.set),
117 | data = mexdolphins,
118 | family = quasipoisson(),
119 | method = "REML")
120 | dim(model.matrix(dolphins_depth_smallk))
121 | dim(dolphins_depth_smallk$smooth[[1]]$S[[1]])
122 | ```
123 |
124 | (Don't worry about the many square brackets etc to get the penalty matrix!)
125 |
126 | ### Plotting
127 |
128 | We can plot the smooth we fitted using `plot`.
129 |
130 | **Exercise**
131 |
132 | Compare the first model we fitted with the two using different `k` values above. Use `par(mfrow=c(1,3))` to put them all in one graphic. Look at `?plot.gam` and plot the confidence intervals as a filled "confidence band". Title the plots appropriately so you can check which is which.
133 |
134 | ```{r plotk}
135 | par(mfrow=c(1,3))
136 | plot(dolphins_depth, shade=TRUE, main="Default k")
137 | plot(dolphins_depth_bigk, shade=TRUE, main="big k")
138 | plot(dolphins_depth_smallk, shade=TRUE, main="small k")
139 | ```
140 |
141 | ## Count distributions
142 |
143 | In general quasi-Poisson doesn't seem to do too great a job at modelling data with many zeros. Luckily we have a few tricks up our sleeves...
144 |
145 |
146 | ### Tweedie
147 |
148 | Adding a smooth of `x` and `y` to our model with `s(x,y)`, we can then switch the `family=` argument to use `tw()` for a Tweedie distribution.
149 |
150 | ```{r tw}
151 | dolphins_xy_tw <- gam(count ~ s(x,y) + s(depth) + offset(off.set),
152 | data = mexdolphins,
153 | family = tw(),
154 | method = "REML")
155 | ```
156 |
157 | More information on Tweedie distributions can be found in Foster & Bravington (2012) and Shono (2008).
158 |
159 |
160 |
161 | ### Negative binomial
162 |
163 | **Exercise**
164 |
165 | Now do the same using the negative binomial distribution (`nb()`).
166 |
167 | ```{r nb}
168 | dolphins_xy_nb <- gam(count ~ s(x,y) + s(depth) + offset(off.set),
169 | data = mexdolphins,
170 | family = nb(),
171 | method = "REML")
172 | ```
173 |
174 | Looking at the quantile-quantile plots only in the `gam.check` output for these two models, which do you prefer? Why?
175 |
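One convenient way to look at just the Q-Q plots side by side is `mgcv`'s `qq.gam()`:

```{r qq-compare}
par(mfrow = c(1, 2))
qq.gam(dolphins_xy_tw)   # Tweedie
qq.gam(dolphins_xy_nb)   # negative binomial
par(mfrow = c(1, 1))
```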
176 | *Looks like Tweedie is better here as the points are closer to the x=y line in the Q-Q plot. Also the histogram of residuals looks more (though not very) normal.*
177 |
178 | Look at the results of calling `summary` on both models and note that there are differences in the resulting models, due to the differing mean-variance relationships.
179 |
180 | ## Smoothers
181 |
182 | Now let's move onto using different bases for the smoothers. We have a couple of different options here.
183 |
184 |
185 | ### Thin plate splines with shrinkage
186 |
187 | By default we use the `"tp"` basis. This is just plain thin plate regression splines (as defined in Wood, 2003). We can also use the `"ts"` basis, which is the same but with extra shrinkage on the usually unpenalised parts of the model. In the univariate case this is the linear slope term of the smooth.
188 |
189 | **Exercise**
190 |
191 | Compare the results from one of the models above with a version using the thin plate with shrinkage using the `bs="ts"` argument to `s()` for both terms.
192 |
193 | ```{r tw-ts}
194 | dolphins_xy_tw_ts <- gam(count ~ s(x,y, bs="ts") + s(depth, bs="ts") +
195 | offset(off.set),
196 | data = mexdolphins,
197 | family = tw(),
198 | method = "REML")
199 | ```
200 |
201 | What are the differences (use `summary`)?
202 |
203 | *EDFs are different for both terms*
204 |
205 | What are the visual differences (use `plot`)?
206 |
207 | *Not really much difference here!*
208 |
209 |
210 | ### Soap film smoother
211 |
212 | We can use a soap film smoother (Wood et al., 2008) to take into account a complex boundary, such as a coastline or islands.
213 |
214 | Here I've built a simple coastline of the US states bordering the Gulf of Mexico (see the `soap_pred.R` file for how this was constructed). We can load up this boundary and the prediction grid from the following `RData` file:
215 |
216 | ```{r soapy}
217 | load("data/mexdolphins/soapy.RData")
218 | ```
219 |
220 | Now we need to build knots for the soap film; for this we simply create a grid, then find the grid points inside the boundary. We don't need too many of them.
221 |
222 | ```{r soapknots}
223 | soap_knots <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), length.out=10),
224 | y=seq(min(xy_bnd$y), max(xy_bnd$y), length.out=10))
225 | x <- soap_knots$x; y <- soap_knots$y
226 | ind <- inSide(xy_bnd, x, y)
227 | rm(x,y)
228 | soap_knots <- soap_knots[ind, ]
229 | ## inSide doesn't work perfectly, so if you get an error like:
230 | ## Error in crunch.knots(ret$G, knots, x0, y0, dx, dy) :
231 | ## knot 54 is on or outside boundary
232 | ## just remove that knot as follows:
233 | soap_knots <- soap_knots[-8, ]
234 | soap_knots <- soap_knots[-54, ]
235 | ```
236 |
237 | We can now fit our model. Note that we specify a basis via `bs=` and the boundary via `xt` (for e`xt`ra information) in the `s()` term. We also include the knots as a `knots=` argument to `gam`.
238 |
239 | ```{r soapmodel}
240 | dolphins_soap <- gam(count ~ s(x,y, bs="so", xt=list(bnd=list(xy_bnd))) +
241 | offset(off.set),
242 | data = mexdolphins,
243 | family = tw(),
244 | knots = soap_knots,
245 | method = "REML")
246 |
247 | ```
248 |
249 | **Exercise**
250 |
251 | Look at the `summary` output for this model and compare it to the other models.
252 |
253 | ```{r soap-summary}
254 | summary(dolphins_soap)
255 | ```
256 |
257 | The plotting function for soap film smooths looks much nicer by default than for other 2D smooths -- try it out.
258 |
259 |
260 | ```{r soap-plot}
261 | plot(dolphins_soap)
262 | ```
263 |
264 | # Predictions
265 |
266 | As we saw in the intro to GAMs slides, `predict` is your friend when it comes to making predictions for the GAM.
267 |
268 | We can do this very simply, calling predict as one would with a `glm`. For example:
269 |
270 | ```{r pred-qp}
271 | pred_qp <- predict(dolphins_depth, preddata, type="response")
272 | ```
273 |
274 | Now, this just gives a long vector of numbers for the predicted number of animals per cell. We can find the total abundance using `sum`.
275 |
276 | **Exercise**
277 |
278 | How many dolphins are there in the total area? What is the maximum in a given cell? What is the minimum?
279 |
280 | ```{r total-max-min}
281 | sum(pred_qp)
282 | range(pred_qp)
283 | ```
284 |
285 | ## Plotting predictions
286 |
287 | *Note that this section requires quite a few additional packages to run the examples, so may not run the first time. You can use* `install.packages` *to grab the packages you need.*
288 |
289 | Plotting predictions in projected coordinate systems is tricky. I'll show two methods here but not go into too much detail, as that's not the aim of this workshop.
290 |
291 | For the non-soap models, we'll use the below helper function to put the predictions into a bunch of squares and then return an appropriate `ggplot2` object for us to plot:
292 |
293 | ```{r gridplotfn}
294 | library(plyr)
295 | # fill must be in the same order as the polygon data
296 | grid_plot_obj <- function(fill, name, sp){
297 |
298 | # strip names from the supplied fill values and wrap them in a data.frame
299 | names(fill) <- NULL
300 | row.names(fill) <- NULL
301 | data <- data.frame(fill)
302 | names(data) <- name
303 |
304 | spdf <- SpatialPolygonsDataFrame(sp, data)
305 | spdf@data$id <- rownames(spdf@data)
306 | spdf.points <- fortify(spdf, region="id")
307 | spdf.df <- join(spdf.points, spdf@data, by="id")
308 |
309 | # fortify seems to label the x/y as "long" and "lat" even when the
310 | # coordinates are projected
311 | spdf.df$x <- spdf.df$long
312 | spdf.df$y <- spdf.df$lat
313 |
314 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df)
315 | }
316 | ```
317 |
318 | Then we can plot the predicted abundance:
319 |
320 | ```{r plotpred}
321 | library(ggplot2)
322 | library(viridis)
323 | library(sp)
324 | library(rgeos)
325 | library(rgdal)
326 | library(maptools)
327 |
328 | # projection string
329 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ")
330 | # need sp to transform the data
331 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4)
332 |
333 | p <- ggplot() +
334 | # abundance
335 | grid_plot_obj(pred_qp, "N", pred.polys) +
336 | # survey lines
337 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) +
338 | # observations
339 | geom_point(aes(x, y, size=count),
340 | data=mexdolphins[mexdolphins$count>0,],
341 | colour="red", alpha=I(0.7)) +
342 | # make the coordinate system fixed
343 | coord_fixed(ratio=1, ylim = range(mexdolphins$y),
344 | xlim = range(mexdolphins$x)) +
345 | # use the viridis colourscheme, which is more colourblind-friendly
346 | scale_fill_viridis() +
347 | # labels
348 | labs(fill="Predicted\ndensity",x="x",y="y",size="Count") +
349 | # keep things simple
350 | theme_minimal()
351 | print(p)
352 | ```
353 |
354 | If you have the `maps` package installed you can also try the following to plot the coastline too:
355 |
356 | ```{r maptoo}
357 | library(maps)
358 | map_dat <- map_data("usa")
359 | map_sp <- SpatialPoints(map_dat[,c("long","lat")])
360 |
361 | # give the sp object a projection
362 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84")
363 | # re-project
364 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4)
365 | map_dat$x <- map_sp.t$long
366 | map_dat$y <- map_sp.t$lat
367 |
368 | p <- p + geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat)
369 |
370 | print(p)
371 | ```
372 |
373 | ## Comparing predictions from soap and thin plate splines
374 |
375 | The `soap_preddata` frame has a much larger prediction grid that covers a much wider area.
376 |
377 | Predicting using the soap film smoother is exactly the same as for the other smoothers:
378 |
379 | ```{r soap-pred}
380 | soap_pred_N <- predict(dolphins_soap, soap_preddata, type="response")
381 | soap_pred_N <- cbind.data.frame(soap_preddata, N=soap_pred_N, model="soap")
382 | ```
383 |
384 | Use the above as a template (along with `ggplot2::facet_wrap()`) to build a comparison plot with the predictions of the soap film and another model.
385 |
386 | ```{r xy-soap-pred}
387 | dolphins_xy_q <- gam(count ~ s(x,y, bs="ts") + offset(off.set),
388 | data = mexdolphins,
389 | family = tw(),
390 | method = "REML")
391 | xy_pred_N <- predict(dolphins_xy_q, soap_preddata, type="response")
392 | xy_pred_N <- cbind.data.frame(soap_preddata, N=xy_pred_N, model="xy")
393 | all_pred_N <- rbind(soap_pred_N, xy_pred_N)
394 |
395 | p <- ggplot() +
396 | # abundance
397 | geom_tile(aes(x=x, y=y, width=sqrt(area), height=sqrt(area), fill=N), data=all_pred_N) +
398 | # facet it!
399 | facet_wrap(~model) +
400 | # make the coordinate system fixed
401 | coord_fixed(ratio=1, ylim = range(mexdolphins$y),
402 | xlim = range(mexdolphins$x)) +
403 | # use the viridis colourscheme, which is more colourblind-friendly
404 | scale_fill_viridis() +
405 | # labels
406 | labs(fill="Predicted\ndensity", x="x", y="y") +
407 | # keep things simple
408 | theme_minimal() +
409 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat)
410 | print(p)
411 | ```
412 |
413 | ## Plotting uncertainty
414 |
415 | **Exercise**
416 |
417 | We can use the `se.fit=TRUE` argument to get the per-cell standard errors, then divide these through by the abundances to get a coefficient of variation (CV) per cell. We can then plot that using the same technique as above. Try this out below.
418 |
419 | Note that the resulting object from setting `se.fit=TRUE` is a list with two elements.
420 |
421 | ```{r CVmap}
422 | pred_se_qp <- predict(dolphins_depth, preddata, type="response", se.fit=TRUE)
423 | cv <- pred_se_qp$se.fit/pred_se_qp$fit
424 |
425 | p <- ggplot() +
426 | # abundance
427 | grid_plot_obj(cv, "CV", pred.polys) +
428 | # survey lines
429 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) +
430 | # make the coordinate system fixed
431 | coord_fixed(ratio=1, ylim = range(mexdolphins$y),
432 | xlim = range(mexdolphins$x)) +
433 | # use the viridis colourscheme, which is more colourblind-friendly
434 | scale_fill_viridis() +
435 | # labels
436 | labs(fill="CV",x="x",y="y",size="Count") +
437 | # keep things simple
438 | theme_minimal()
439 | print(p)
440 | ```
441 |
442 | Compare the uncertainties of the different models you've fitted so far.
443 |
444 | ## `lpmatrix` magic
445 |
446 | Now for some `lpmatrix` magic. We said before that the `lpmatrix` maps the parameters onto the predictions. Let's show that's true:
447 |
448 | ```{r lppred}
449 | # make the Lp matrix
450 | lp <- predict(dolphins_depth, preddata, type="lpmatrix")
451 | # get the linear predictor
452 | lin_pred <- lp %*% coef(dolphins_depth)
453 | # apply the inverse link and multiply by the offset (area), which lpmatrix ignores
454 | pred <- preddata$area * exp(lin_pred)
455 | # all the same?
456 | all.equal(pred[,1], as.numeric(pred_qp), check.attributes=FALSE)
457 | ```
458 |
459 | What else can we do? We can also grab the uncertainty for the sum of the predictions:
460 |
461 | ```{r lpNvar}
462 | # extract the variance-covariance matrix
463 | vc <- vcov(dolphins_depth)
464 |
465 | # reproject the var-covar matrix to be on the linear predictor scale
466 | lin_pred_var <- tcrossprod(lp %*% vc, lp)
467 |
468 | # pre and post multiply by the derivatives of the inverse link, evaluated at
469 | # the predictions -- since the inverse link is exp, we just use the predictions
470 | pred_var <- matrix(pred,nrow=1) %*% lin_pred_var %*% matrix(pred,ncol=1)
471 |
472 | # we can then calculate a total CV
473 | sqrt(pred_var)/sum(pred)
474 | ```
475 |
476 | As you can see, the `lpmatrix` can be very useful!
477 |
478 | # Extra credit
479 |
480 | - Experiment with `vis.gam` (watch out for the `view=` option) and plot the 2D smooths you've fitted. Check out the `too.far=` argument.
481 | - Use the randomized quantile residuals plot to inspect the models you fitted above. What are the differences you see between them and the deviance residuals you see above? Which model would you choose?
482 | - Redo the side-by-side soap film and another smoother plot, but showing the coefficient of variation. What difference does the soap film make?
483 |
484 | # References
485 |
486 | - Foster, S. D., & Bravington, M. V. (2012). A Poisson–Gamma model for analysis of ecological non-negative continuous data. Environmental and Ecological Statistics, 20(4), 533–552. http://doi.org/10.1007/s10651-012-0233-0
487 | - Miller, D. L., Burt, M. L., Rexstad, E. A., & Thomas, L. (2013). Spatial models for distance sampling data: recent developments and future directions. Methods in Ecology and Evolution, 4(11), 1001–1010. http://doi.org/10.1111/2041-210X.12105
488 | - Shono, H. (2008). Application of the Tweedie distribution to zero-catch data in CPUE analysis. Fisheries Research, 93(1-2), 154–162. http://doi.org/10.1016/j.fishres.2008.03.006
489 | - Wood, S. N. (2003). Thin plate regression splines. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 65(1), 95–114.
490 | - Wood, S. N., Bravington, M. V., & Hedley, S. L. (2008). Soap film smoothing. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 70(5), 931–955. http://doi.org/10.1111/j.1467-9868.2008.00665.x
491 |
492 |
--------------------------------------------------------------------------------
/example-spatial-mexdolphins.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Further analysis of pantropical spotted dolphins in the Gulf of Mexico"
3 | output:
4 | html_document:
5 | toc: true
6 | toc_float: true
7 | theme: readable
8 | highlight: haddock
9 |
10 | ---
11 |
12 | # Preamble
13 |
14 | This exercise is based on the [Appendix of Miller et al 2013](http://distancesampling.org/R/vignettes/mexico-analysis.html). In this example we're ignoring all kinds of important things like detectability and availability. This should not be treated as a serious analysis of these data! For a more complete treatment of detection-corrected abundance estimation via distance sampling and generalized additive models, see Miller et al. (2013).
15 |
16 | From that appendix:
17 |
18 | *The analysis is based on a dataset of observations of pantropical dolphins in the Gulf of Mexico (shipped with Distance 6.0 and later). For convenience the data are bundled in an `R`-friendly format, although all of the code necessary for creating the data from the Distance project files is available [on github](http://github.com/dill/mexico-data). The OBIS-SEAMAP page for the data may be found at the [SEFSC GoMex Oceanic 1996](http://seamap.env.duke.edu/dataset/25) survey page.*
19 |
20 |
21 | ## Doing these exercises
22 |
23 | Probably the easiest way to do these exercises is to open this document in RStudio and go through the code blocks one by one (hitting the "play" button in the editor window), filling in the code where necessary and executing the commands one-by-one. You can then compile the document once you're done to check that everything works.
24 |
25 | # Data format
26 |
27 | The data are provided in the `data/mexdolphins` folder as the file `mexdolphins.RData`. Loading this we can see what is provided:
28 |
29 |
30 | ```{r loaddata}
31 | load("data/mexdolphins/mexdolphins.RData")
32 | ls()
33 | ```
34 |
35 | - `mexdolphins` the `data.frame` containing the observations and covariates, used to fit the model.
36 | - `pred_latlong` an `sp` object that has the shapefile for the prediction grid, used for fancy graphs
37 | - `preddata` prediction grid without any fancy spatial stuff
38 |
39 | Looking further into the `mexdolphins` frame we see:
40 |
41 | ```{r frameinspect}
42 | str(mexdolphins)
43 | ```
44 |
45 | A brief explanation of each entry:
46 |
47 | - `Sample.Label` identifier for the effort "segment" (approximately square sampling area)
48 | - `Transect.Label` identifier for the transect that this segment belongs to
49 | - `longitude`, `latitude` location in lat/long of this segment
50 | - `x`, `y` location in projected coordinates (projected using the [North American Lambert Conformal Conic projection](https://en.wikipedia.org/wiki/Lambert_conformal_conic_projection))
51 | - `Effort` the length of the current segment
52 | - `depth` the bathymetry at the segment's position
53 | - `count` number of dolphins observed in this segment
54 | - `segment.area` the area of the segment (`Effort` multiplied by the width of the segment)
55 | - `off.set` the logarithm of the `segment.area` multiplied by a correction for detectability (see link to appendix above for more information on this)
56 |
57 |
58 | # Modelling
59 |
60 | Our objective here is to build a spatially explicit model of abundance of the dolphins. In some sense this is a kind of species distribution model.
61 |
62 | Our possible covariates to model abundance are location and depth. These are fairly good predictors of the abundance (SPOILER ALERT), though we could probably improve the model further by including things like sea surface temperature and chlorophyll *a*.
63 |
64 | ## A simple model to start with
65 |
66 | We can begin with the model we showed in the first lecture:
67 |
68 | ```{r simplemodel}
69 | library(mgcv)
70 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set),
71 | data = mexdolphins,
72 | family = quasipoisson(),
73 | method = "REML")
74 | ```
75 |
76 | That is, we fit the counts as a function of depth, using the offset to take into account effort. We use a quasi-Poisson response (i.e., modelling just the mean-variance relationship such that the variance is proportional to the mean) and use REML for smoothness selection.
77 |
78 | We can check the assumptions of this model by using `gam.check`:
79 |
80 | ```{r simplemodel-check}
81 | gam.check(dolphins_depth)
82 | ```
83 |
84 | As is usual for count data, these plots are a bit tricky to interpret. For example the residuals vs linear predictor plot in the top right has that nasty line through it that makes looking for pattern tricky. We can see easily that the line equates to the zero count observations:
85 |
86 | ```{r zeroresids, fig.width=7, fig.height=7}
87 | # code from the insides of mgcv::gam.check
88 | resid <- residuals(dolphins_depth, type="deviance")
89 | linpred <- napredict(dolphins_depth$na.action, dolphins_depth$linear.predictors)
90 | plot(linpred, resid, main = "Resids vs. linear pred.",
91 | xlab = "linear predictor", ylab = "residuals")
92 |
93 | # now add red dots corresponding to the zero counts
94 | points(linpred[mexdolphins$count==0],resid[mexdolphins$count==0],
95 | pch=19, col="red", cex=0.5)
96 | ```
97 |
98 | We can use randomised quantile residuals instead of deviance residuals to get around this in some cases (though not for quasi-Poisson, as we don't have a proper likelihood!).
99 |
100 | Ignoring the plots for now (as we'll address them in the next section), let's look at the text output. It seems that the `k` value we set (or rather the default of 10) has been adequate.
101 |
102 | We could increase the value of `k` by replacing the `s(...)` with, for example, `s(depth, k=25)` (for a possibly very wiggly function) or `s(depth, k=3)` (for a much less wiggly function). Making `k` big will create a bigger design matrix and penalty matrix.
103 |
104 |
105 | **Exercise**
106 |
107 | Look at the differences in the size of the design and penalty matrices by using `dim(model.matrix(...))` and `dim(model$smooth[[1]]$S[[1]])`, replacing `...` and `model` appropriately for models with `k=3` and `k=30`.
108 |
109 | ```{r simplemodel-bigsmall}
110 |
111 | ```
112 |
113 | (Don't worry about the many square brackets etc to get the penalty matrix!)
114 |
115 | ### Plotting
116 |
117 | We can plot the smooth we fitted using `plot`.
118 |
119 | **Exercise**
120 |
121 | Compare the first model we fitted with the two using different `k` values above. Use `par(mfrow=c(1,3))` to put them all in one graphic. Look at `?plot.gam` and plot the confidence intervals as a filled "confidence band". Title the plots appropriately so you can check which is which.
122 |
123 | ```{r plotk}
124 | par(mfrow=c(1,3))
125 |
126 | ```
127 |
128 | ## Count distributions
129 |
130 | In general quasi-Poisson doesn't seem to do too great a job at modelling data with many zeros. Luckily we have a few tricks up our sleeves...
131 |
132 |
133 | ### Tweedie
134 |
135 | Adding a smooth of `x` and `y` to our model with `s(x,y)`, we can then switch the `family=` argument to use `tw()` for a Tweedie distribution.
136 |
137 | ```{r tw}
138 | dolphins_xy_tw <- gam(count ~ s(x,y) + s(depth) + offset(off.set),
139 | data = mexdolphins,
140 | family = tw(),
141 | method = "REML")
142 | ```
143 |
144 | More information on Tweedie distributions can be found in Foster & Bravington (2012) and Shono (2008).
145 |
146 |
147 |
148 | ### Negative binomial
149 |
150 | **Exercise**
151 |
152 | Now do the same using the negative binomial distribution (`nb()`).
153 |
154 | ```{r nb}
155 |
156 | ```
157 |
158 | Looking at the quantile-quantile plots only in the `gam.check` output for these two models, which do you prefer? Why?
159 |
160 | *Looks like Tweedie is better here as the points are closer to the x=y line in the Q-Q plot. Also the histogram of residuals looks more (though not very) normal.*
161 |
162 | Look at the results of calling `summary` on both models and note that there are differences in the resulting models, due to the differing mean-variance relationships.
163 |
164 | ## Smoothers
165 |
166 | Now let's move onto using different bases for the smoothers. We have a couple of different options here.
167 |
168 |
169 | ### Thin plate splines with shrinkage
170 |
171 | By default we use the `"tp"` basis. This is just plain thin plate regression splines (as defined in Wood, 2003). We can also use the `"ts"` basis, which is the same but with extra shrinkage on the usually unpenalised parts of the model. In the univariate case this is the linear slope term of the smooth.
172 |
173 | **Exercise**
174 |
175 | Compare the results from one of the models above with a version using the thin plate with shrinkage using the `bs="ts"` argument to `s()` for both terms.
176 |
177 | ```{r tw-ts}
178 | ```
179 |
180 | What are the differences (use `summary`)?
181 |
182 | What are the visual differences (use `plot`)?
183 |
184 |
185 | ### Soap film smoother
186 |
187 | We can use a soap film smoother (Wood et al., 2008) to take into account a complex boundary, such as a coastline or islands.
188 |
189 | Here I've built a simple coastline of the US states bordering the Gulf of Mexico (see the `soap_pred.R` file for how this was constructed). We can load up this boundary and the prediction grid from the following `RData` file:
190 |
191 | ```{r soapy}
192 | load("data/mexdolphins/soapy.RData")
193 | ```
194 |
195 | Now we need to build knots for the soap film; for this we simply create a grid, then find the grid points inside the boundary. We don't need too many of them.
196 |
197 | ```{r soapknots}
198 | soap_knots <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), length.out=10),
199 | y=seq(min(xy_bnd$y), max(xy_bnd$y), length.out=10))
200 | x <- soap_knots$x; y <- soap_knots$y
201 | ind <- inSide(xy_bnd, x, y)
202 | rm(x,y)
203 | soap_knots <- soap_knots[ind, ]
204 | ## inSide doesn't work perfectly, so if you get an error like:
205 | ## Error in crunch.knots(ret$G, knots, x0, y0, dx, dy) :
206 | ## knot 54 is on or outside boundary
207 | ## just remove that knot as follows:
208 | soap_knots <- soap_knots[-8, ]
209 | soap_knots <- soap_knots[-54, ]
210 | ```
211 |
212 | We can now fit our model. Note that we specify a basis via `bs=` and the boundary via `xt` (for e`xt`ra information) in the `s()` term. We also include the knots as a `knots=` argument to `gam`.
213 |
214 | ```{r soapmodel}
215 | dolphins_soap <- gam(count ~ s(x,y, bs="so", xt=list(bnd=list(xy_bnd))) +
216 | offset(off.set),
217 | data = mexdolphins,
218 | family = tw(),
219 | knots = soap_knots,
220 | method = "REML")
221 |
222 | ```
223 |
224 | **Exercise**
225 |
226 | Look at the `summary` output for this model and compare it to the other models.
227 |
228 | ```{r soap-summary}
229 | ```
230 |
231 | The plotting function for soap film smooths looks much nicer by default than for other 2D smooths -- try it out.
232 |
233 |
234 | ```{r soap-plot}
235 | ```
236 |
237 | # Predictions
238 |
239 | As we saw in the intro to GAMs slides, `predict` is your friend when it comes to making predictions for the GAM.
240 |
241 | We can do this very simply, calling predict as one would with a `glm`. For example:
242 |
243 | ```{r pred-qp}
244 | pred_qp <- predict(dolphins_depth, preddata, type="response")
245 | ```
246 |
247 | Now, this just gives a long vector of numbers for the predicted number of animals per cell. We can find the total abundance using `sum`.
248 |
249 | **Exercise**
250 |
251 | How many dolphins are there in the total area? What is the maximum in a given cell? What is the minimum?
252 |
253 | ## Plotting predictions
254 |
255 | *Note that this section requires quite a few additional packages to run the examples, so may not run the first time. You can use* `install.packages` *to grab the packages you need.*
256 |
257 | Plotting predictions in projected coordinate systems is tricky. I'll show two methods here but not go into too much detail, as that's not the aim of this workshop.
258 |
259 | For the non-soap models, we'll use the below helper function to put the predictions into a bunch of squares and then return an appropriate `ggplot2` object for us to plot:
260 |
261 | ```{r gridplotfn}
262 | library(plyr)
263 | # fill must be in the same order as the polygon data
264 | grid_plot_obj <- function(fill, name, sp){
265 |
266 | # strip names from the supplied fill values and wrap them in a data.frame
267 | names(fill) <- NULL
268 | row.names(fill) <- NULL
269 | data <- data.frame(fill)
270 | names(data) <- name
271 |
272 | spdf <- SpatialPolygonsDataFrame(sp, data)
273 | spdf@data$id <- rownames(spdf@data)
274 | spdf.points <- fortify(spdf, region="id")
275 | spdf.df <- join(spdf.points, spdf@data, by="id")
276 |
277 | # fortify seems to label the x/y as "long" and "lat" even when the
278 | # coordinates are projected
279 | spdf.df$x <- spdf.df$long
280 | spdf.df$y <- spdf.df$lat
281 |
282 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df)
283 | }
284 | ```
285 |
286 | Then we can plot the predicted abundance:
287 |
288 | ```{r plotpred}
289 | library(ggplot2)
290 | library(viridis)
291 | library(sp)
292 | library(rgeos)
293 | library(rgdal)
294 | library(maptools)
295 |
296 | # projection string
297 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ")
298 | # need sp to transform the data
299 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4)
300 |
301 | p <- ggplot() +
302 | # abundance
303 | grid_plot_obj(pred_qp, "N", pred.polys) +
304 | # survey lines
305 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) +
306 | # observations
307 | geom_point(aes(x, y, size=count),
308 | data=mexdolphins[mexdolphins$count>0,],
309 | colour="red", alpha=I(0.7)) +
310 | # make the coordinate system fixed
311 | coord_fixed(ratio=1, ylim = range(mexdolphins$y),
312 | xlim = range(mexdolphins$x)) +
313 | # use the viridis colourscheme, which is more colourblind-friendly
314 | scale_fill_viridis() +
315 | # labels
316 | labs(fill="Predicted\ndensity",x="x",y="y",size="Count") +
317 | # keep things simple
318 | theme_minimal()
319 | print(p)
320 | ```
321 |
322 | If you have the `maps` package installed you can also try the following to plot the coastline too:
323 |
324 | ```{r maptoo}
325 | library(maps)
326 | map_dat <- map_data("usa")
327 | map_sp <- SpatialPoints(map_dat[,c("long","lat")])
328 |
329 | # give the sp object a projection
330 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84")
331 | # re-project
332 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4)
333 | map_dat$x <- map_sp.t$long
334 | map_dat$y <- map_sp.t$lat
335 |
336 | p <- p + geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat)
337 |
338 | print(p)
339 | ```
340 |
341 | ## Comparing predictions from soap and thin plate splines
342 |
343 | The `soap_preddata` frame holds a larger prediction grid that covers a much wider area.
344 |
345 | Predicting using the soap film smoother is exactly the same as for the other smoothers:
346 |
347 | ```{r soap-pred}
348 | soap_pred_N <- predict(dolphins_soap, soap_preddata, type="response")
349 | soap_pred_N <- cbind.data.frame(soap_preddata, N=soap_pred_N, model="soap")
350 | ```
351 |
352 | Use the above as a template (along with `ggplot2::facet_wrap()`) to build a comparison plot with the predictions of the soap film and another model.
353 |
354 | ```{r xy-soap-pred}
355 | dolphins_xy_q <- gam(count ~ s(x,y, bs="ts") + offset(off.set),
356 | data = mexdolphins,
357 | family = tw(),
358 | method = "REML")
359 | xy_pred_N <- predict(dolphins_xy_q, soap_preddata, type="response")
360 | xy_pred_N <- cbind.data.frame(soap_preddata, N=xy_pred_N, model="xy")
361 | all_pred_N <- rbind(soap_pred_N, xy_pred_N)
362 |
363 | p <- ggplot() +
364 | # abundance
365 | geom_tile(aes(x=x, y=y, width=sqrt(area), height=sqrt(area), fill=N), data=all_pred_N) +
366 | # facet it!
367 | facet_wrap(~model) +
368 | # make the coordinate system fixed
369 | coord_fixed(ratio=1, ylim = range(mexdolphins$y),
370 | xlim = range(mexdolphins$x)) +
371 | # use the viridis colourscheme, which is more colourblind-friendly
372 | scale_fill_viridis() +
373 | # labels
374 | labs(fill="Predicted\ndensity", x="x", y="y") +
375 | # keep things simple
376 | theme_minimal() +
377 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat)
378 | print(p)
379 | ```
380 |
381 | ## Plotting uncertainty
382 |
383 | **Exercise**
384 |
385 | We can use the `se.fit=TRUE` argument to get the per-cell standard errors, then divide these through by the abundances to get a coefficient of variation (CV) per cell. We can then plot that using the same technique as above. Try this out below.
386 |
387 | Note that the resulting object from setting `se.fit=TRUE` is a list with two elements.
388 |
389 | ```{r CVmap}
390 |
391 | ```
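If you get stuck, here is one possible sketch (try it yourself first!). It assumes the `dolphins_depth` model and the `preddata`, `pred.polys`, and `grid_plot_obj` objects defined earlier in this document:

```{r CVmap-sketch, eval=FALSE}
# fit and standard error per prediction cell, both on the response scale
pred_se <- predict(dolphins_depth, preddata, type="response", se.fit=TRUE)
# coefficient of variation: standard error divided by the point prediction
pred_cv <- pred_se$se.fit / pred_se$fit
# map it with the same helper used for the abundance map
p_cv <- ggplot() +
  grid_plot_obj(pred_cv, "CV", pred.polys) +
  coord_fixed(ratio=1, ylim=range(mexdolphins$y), xlim=range(mexdolphins$x)) +
  scale_fill_viridis() +
  labs(fill="CV", x="x", y="y") +
  theme_minimal()
print(p_cv)
```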
392 |
393 | Compare the uncertainties of the different models you've fitted so far.
394 |
395 | ## `lpmatrix` magic
396 |
397 | Now for some `lpmatrix` magic. We said before that the `lpmatrix` maps the parameters onto the predictions. Let's show that's true:
398 |
399 | ```{r lppred}
400 | # make the Lp matrix
401 | lp <- predict(dolphins_depth, preddata, type="lpmatrix")
402 | # get the linear predictor
403 | lin_pred <- lp %*% coef(dolphins_depth)
404 | # apply the inverse link and multiply by the offset (the lpmatrix ignores the offset)
405 | pred <- preddata$area * exp(lin_pred)
406 | # all the same?
407 | all.equal(pred[,1], as.numeric(pred_qp), check.attributes=FALSE)
408 | ```
409 |
410 | What else can we do? We can also grab the uncertainty for the sum of the predictions:
411 |
412 | ```{r lpNvar}
413 | # extract the variance-covariance matrix
414 | vc <- vcov(dolphins_depth)
415 |
416 | # reproject the var-covar matrix to be on the linear predictor scale
417 | lin_pred_var <- tcrossprod(lp %*% vc, lp)
418 |
419 | # pre- and post-multiply by the derivative of the inverse link, evaluated at
420 | # the linear predictor -- since the inverse link is exp, that's just the predictions
421 | pred_var <- matrix(pred,nrow=1) %*% lin_pred_var %*% matrix(pred,ncol=1)
422 |
423 | # we can then calculate a total CV
424 | sqrt(pred_var)/sum(pred)
425 | ```
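One more trick, sketched below: simulate coefficient vectors from their approximate posterior and push each through the `lpmatrix`, giving a simple interval for the total abundance. This sketch reuses `lp` and `preddata` from above and assumes the `MASS` package is available for `mvrnorm`:

```{r lpsim, eval=FALSE}
library(MASS)  # for mvrnorm
# 1000 draws from the approximate posterior of the coefficients
betas <- mvrnorm(1000, coef(dolphins_depth), vcov(dolphins_depth))
# for each draw: linear predictor -> response scale -> total abundance
Nsim <- apply(betas, 1, function(b) sum(preddata$area * exp(lp %*% b)))
# an approximate 95% interval for the total abundance
quantile(Nsim, c(0.025, 0.975))
```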
426 |
427 | As you can see, the `lpmatrix` can be very useful!
428 |
429 | # Extra credit
430 |
431 | - Experiment with `vis.gam` (watch out for the `view=` option) and plot the 2D smooths you've fitted. Check out the `too.far=` argument (see the sketch after this list).
432 | - Use the randomized quantile residuals plot to inspect the models you fitted above. What are the differences you see between them and the deviance residuals you see above? Which model would you choose?
433 | - Redo the side-by-side soap film and another smoother plot, but showing the coefficient of variation. What difference does the soap film make?
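To get started on the first item, here's a minimal sketch using the `dolphins_xy_q` model fitted above:

```{r visgam-sketch, eval=FALSE}
# contour plot of the bivariate smooth of x and y; too.far greys out
# regions far (in scaled covariate space) from any observed data
vis.gam(dolphins_xy_q, view=c("x", "y"), plot.type="contour", too.far=0.1)
```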
434 |
435 | # References
436 |
437 | - Foster, S. D., & Bravington, M. V. (2012). A Poisson–Gamma model for analysis of ecological non-negative continuous data. Environmental and Ecological Statistics, 20(4), 533–552. http://doi.org/10.1007/s10651-012-0233-0
438 | - Miller, D. L., Burt, M. L., Rexstad, E. A., & Thomas, L. (2013). Spatial models for distance sampling data: recent developments and future directions. Methods in Ecology and Evolution, 4(11), 1001–1010. http://doi.org/10.1111/2041-210X.12105
439 | - Shono, H. (2008). Application of the Tweedie distribution to zero-catch data in CPUE analysis. Fisheries Research, 93(1-2), 154–162. http://doi.org/10.1016/j.fishres.2008.03.006
440 | - Wood, S. N. (2003). Thin plate regression splines. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 65(1), 95–114.
441 | - Wood, S. N., Bravington, M. V., & Hedley, S. L. (2008). Soap film smoothing. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 70(5), 931–955. http://doi.org/10.1111/j.1467-9868.2008.00665.x
442 |
443 |
--------------------------------------------------------------------------------
/example-spatio-temporal data.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Spatio-temporal dynamics is for the birds"
3 | output:
4 | html_document:
5 | toc: true
6 | toc_float: true
7 | theme: readable
8 | highlight: haddock
9 | ---
10 |
11 | ## 1. Background
12 | We've talked about modelling spatial trends in the main workshop and in an
13 | extended example, and about the trials and tribulations of time series data
14 | in another extended example. Here, we're going to combine these, to look at
15 | one of the most challenging types of data for linear models to deal with:
16 | spatiotemporal data. A spatiotemporal dataset is one where the outcome of
17 | interest has been observed at multiple locations and at multiple points in time,
18 | and we want to know how temporal trends change across the landscape (or, from
19 | the other view, how spatial patterns are changing over time).
20 |
21 | We don't have time to really get into the nuts-and-bolts issues around fitting
22 | complex spatio-temporal models. This exercise is instead designed to give you
23 | a feel for how to use `mgcv` to model complex interactions between variables
24 | with different natural scales, and how to include simple random effects in
25 | GAMs.
26 |
27 | ## 2. Key concepts and functions:
28 | Here are a few key ideas and R functions you should familiarize yourself with
29 | if you haven't already encountered them. The R functions will be highlighted
30 | `like this`; use `?function_name` to look them up.
31 |
32 | ### Tensor product (`te`) smooths
33 | This tutorial will rely on tensor product smooths. These are used to model
34 | nonlinear interactions between variables that have different natural scales. The
35 | interaction smooths we saw at the start of the workshop were thin plate splines
36 | (`y~s(x1,x2, bs="tp")`). These splines assume that the relationship between y and
37 | x1 should be roughly as "wiggly" as the relationship between y and x2, and have
38 | only one smoothing parameter. This makes sense if x1 and x2 are similar variables
39 | (say latitude and longitude, or arm length and leg length), but if we're looking
40 | at the interacting effects of, say, temperature and body weight, a single
41 | smoothing parameter doesn't make sense.
42 |
43 | This is where tensor product smooths come in. These give each variable of
44 | interest its own marginal smooth, and create a multi-dimensional smooth term
45 | by multiplying each basis function of one marginal smooth by each basis
46 | function of the other. This is a pretty complicated idea, and hard to describe
47 | effectively without more math than we want to go into here. But you can get the
48 | idea from a simple example. Let's say we have two basis functions, one for
49 | variable x1 and one for variable x2. The function for x1 is linear, and the one
50 | for x2 is bell-curve shaped. The basis functions will look like this:
51 |
52 | ```{r, echo=T,tidy=F,results="hide", include=T, message=FALSE,highlight=TRUE}
53 | library(mgcv)
54 | library(ggplot2)
55 | library(dplyr)
56 | library(viridis)
57 | set.seed(1)
58 | n=25
59 | dat = as.data.frame(expand.grid(x1 = seq(-1,1,length.out = n),
60 | x2= seq(-1,1,length=n)))
61 | dat$x1_margin = -dat$x1
62 | dat$x2_margin = dnorm(dat$x2,0,0.5)
63 | dat$x2_margin = dat$x2_margin - mean(dat$x2_margin)
64 |
65 | layout(matrix(1:2,ncol=2))
66 | plot(x1_margin~x1,type="l", data= dat%>%filter(!duplicated(x1)))
67 | plot(x2_margin~x2,type="l", data= dat%>%filter(!duplicated(x2)))
68 | layout(1)
69 | ```
70 |
71 | However, the tensor product of the two variables is more complicated. Note that
72 | when x1 is below zero, the effect of x2 is positive at middle values and negative
73 | at extreme values, and vice versa when x1 is above zero.
74 | ```{r, echo=T,tidy=F,results="hide", include=T, message=FALSE,highlight=TRUE}
75 | dat$x1x2_tensor = dat$x1_margin*dat$x2_margin
76 | ggplot(aes(x=x1,y=x2,fill=x1x2_tensor),data=dat)+
77 | geom_tile()+
78 | scale_x_continuous(expand=c(0,0))+
79 | scale_y_continuous(expand=c(0,0))+
80 | scale_fill_viridis()+
81 | theme_bw()
82 | ```
83 |
84 | A full tensor product smooth will consist of several basis functions constructed
85 | like this. The resulting smooth is then fitted using the standard GAM machinery, but
86 | with multiple penalty terms (one for each term in the tensor product). Each
87 | penalty penalizes how much the function deviates from a smooth function in the
88 | direction of one of the variables, averaged over all the other variables. This
89 | allows us to have the function be more or less wiggly in one direction than
90 | another.
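
Roughly, in symbols (a sketch; the notation $b_i$, $c_j$ is ours, not `mgcv`'s): if the marginal bases for x1 and x2 are $b_i(x_1)$ and $c_j(x_2)$, the tensor product smooth is

$$
f(x_1, x_2) = \sum_{i=1}^{K_1} \sum_{j=1}^{K_2} \beta_{ij} \, b_i(x_1) \, c_j(x_2),
$$

with one penalty (and one smoothing parameter) measuring wiggliness along $x_1$ and another along $x_2$.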
91 |
92 | They are built using the `te()` function, like this:
93 |
94 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
95 | # We'll recycle our fake marginal and tensor products as the expected value for
96 | # a simulation. Waste not, want not!
97 | dat$y = rnorm(n^2, dat$x1x2_tensor+dat$x1_margin+dat$x2_margin,0.5)
98 |
99 | te_demo = gam(y~te(x1, x2, k = 4), data=dat, method="REML") # k=4: four basis functions for each marginal smooth (4^2 = 16 coefficients in total)
100 | plot.gam(te_demo,scheme = 2)
101 | summary(te_demo)
102 | print(te_demo$sp)
103 | # These are the estimated smoothing parameters for the two marginal bases. Note
104 | # that the one for x1 (the linear margin) is much larger: the fit is way smoother in the x1 direction than in x2
105 | ```
106 |
107 |
108 |
109 | ### Tensor interaction smooths
110 | There are many cases when we want an estimate of both the average effect of each
111 | variable as well as the interaction. This is tough to extract from just using
112 | `te()` terms, but if we try to combine the two (say, with
113 | `y~s(x1)+s(x2)+te(x1,x2)`) the result is often numerically unstable; it won't
114 | give a reliable, consistent separation into the terms we want. Instead, we use `ti`
115 | terms for the interaction. The `ti` terms are set up so the average effect of
116 | each variable in the smooth is already excluded from the interaction (and they'll
117 | have fewer basis functions than the equivalent `te` term). See `?te` if you're
118 | curious about how `mgcv` sets these up; for now, we'll just use them as a black
119 | box.
120 |
121 | This is how you use them in practice:
122 |
123 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
124 | # We'll recycle our fake tensor product as the expected value for a simulation
125 | # Waste not, want not!
126 |
127 | ti_demo = gam(y~s(x1,k=4)+s(x2,k=4)+ti(x1, x2, k = 4), # k=4 basis functions per marginal smooth of the interaction
128 | data=dat,method="REML")
129 | layout(matrix(1:3,nrow=1))
130 | plot.gam(ti_demo,scheme = 2)
131 | layout(1)
132 | summary(ti_demo)
133 | print(ti_demo$sp)
134 | # These are the estimated smoothing parameters. As before, the one for x1 (the
135 | # linear margin) is much larger, so that direction is fitted as way smoother than x2
136 | ```
137 |
138 | Note that it does a pretty good job of extracting our assumed marginal
139 | basis functions (linear and Gaussian) and the tensor product into three clearly
140 | separate functions.
141 |
142 | NOTE: Never use `ti` on its own without including the marginal smooths! This
143 | is for the same reason you shouldn't exclude individual linear terms but include
144 | interactions in `glm` models. It virtually never makes sense to talk about a function with a strong interaction but no marginal effects.
145 |
146 | ### Random effects smooths
147 | The last smooth type we'll use in this exercise is the random effects smooth.
148 | This is just what it says on the tin: it's a smooth for factor terms, with one
149 | coefficient for each level of the group (these play the role of basis functions),
150 | and a penalty on the squared values of the coefficients to draw them towards the
151 | global mean. This is exactly the same mechanism that packages like `lme4` and
152 | `nlme` use to calculate random effects. In fact, there's a very deep connection
153 | between GAMs and mixed effects models, but we won't go into that here.
154 |
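For the curious, a one-line sketch of that connection (using $\sigma^2_b$ for the between-group variance, notation not used elsewhere here): the `re` penalty $\lambda \sum_g \beta_g^2$ corresponds to assuming group effects $\beta_g \sim N(0, \sigma^2_b)$, with $\lambda$ inversely proportional to $\sigma^2_b$, so heavier penalization means less between-group variation.
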
155 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
156 | n= 10
157 | n_groups= 10
158 | group_means = rnorm(n_groups, 0, 2)
159 | dat = data_frame(group = rep(1:n_groups, each=n),
160 | group_f = factor(group))
161 | dat$y = rnorm(n*n_groups, 3+group_means[dat$group],2)
162 | re_demo = gam(y~s(group_f, bs="re"),
163 | data=dat,method="REML")
164 | plot.gam(re_demo,scheme = 2)
165 | summary(re_demo)
166 | print(re_demo$sp)
167 | ```
168 |
169 | It's important to know, when using `bs="re"` terms, whether the variable you're
170 | giving it is numeric or a factor. `mgcv` will not raise an error if you give a
171 | numeric variable to `bs="re"`. Instead, it'll just fit a linear model with a
172 | penalty on the slope of the line between x and y. Try it with the data above to
173 | see this, by switching `s(group_f, bs="re")` to `s(group, bs="re")` (a sketch
174 | follows below). `bs="re"` is set up like this to allow you to use random effects
175 | smooths to fit things like random-slopes models, but make sure you know what
176 | you're doing when venturing into this.
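Here's what that looks like, as a minimal sketch using the simulated data above:

```{r, eval=FALSE}
# with *numeric* group, bs="re" silently fits a ridge-penalized slope for
# y ~ group rather than one intercept per group
re_numeric = gam(y ~ s(group, bs="re"), data=dat, method="REML")
summary(re_numeric)
```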
177 |
178 | ## 3. Spatiotemporal models
179 | Now that we've covered the key smoothers, let's take a look at how we'd use them
180 | to model spatiotemporal data. For this section, we'll use data from the Florida
181 | routes of the Breeding Bird Survey. If you haven't heard of it before, this is a
182 | long-term monitoring program tracking bird abundance and diversity across the US
183 | and Canada. It consists of many (over 4000) 24.5-mile-long routes. Trained
184 | observers survey these routes each year during peak breeding season; every 0.5
185 | miles, they stop to count and identify all the birds they can see or hear within a
186 | 0.25 mile radius of the stop. This is one of the best long-term data sets we
187 | have on bird diversity and abundance, and has been used in countless papers.
188 |
189 | Here, we're going to use these data to answer a couple of relatively simple
190 | questions: how does bird species richness vary across the state of Florida, has
191 | average richness changed over time, and has the spatial pattern of richness
192 | varied? We obtained data from [this
193 | site](https://www.pwrc.usgs.gov/bbs/RawData/), and summarized it into the total
194 | number of different species and the total number of individual birds observed on
195 | each route in each year in the state of Florida.
196 |
197 | First, we'll load the data, and quickly plot it:
198 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
199 | library(maps)
200 | florida_map = map_data("state","florida")%>%
201 | transmute(Longitude= long, Latitude = lat, order=order)
202 | florida_birds = read.csv("data/bbs_data/bbs_florida_richness.csv")
203 | head(florida_birds)
204 |
205 | ggplot(aes(x=Year,y=Richness),data=florida_birds)+geom_point()+
206 | theme_bw()
207 | ggplot(aes(x=Longitude,y=Latitude,col=Richness), data=florida_birds) +
208 | geom_polygon(data=florida_map,fill=NA, col="black")+
209 | geom_point()+theme_bw()
210 | ```
211 |
212 | It looks like richness may have declined towards the end of the time series, and
213 | there seems to be somewhat of a north-south richness gradient, but we need to
214 | model it to really tease these issues out.
215 |
216 | ### Modelling space and time separately
217 | We'll first take a stab at this to determine if these mean trends actually come
218 | out of the model. As richness is a count variable
219 | we'll use a Poisson GAM to model it. This means
220 | that we're assuming mean richness is the product of a smooth
221 | function of location and a smooth function of year, with no interaction:
222 | E(richness) = exp(f(year)) * exp(g(location)).
223 |
224 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
225 | richness_base_model = gam(Richness~ s(Longitude,Latitude)+s(Year),
226 | data=florida_birds,family=poisson,method="REML")
227 | layout(matrix(1:2,ncol=2))
228 | plot(richness_base_model,scheme=2)
229 | layout(1)
230 | summary(richness_base_model)
231 | ```
232 |
233 | Note that we do have a thin-plate interaction between longitude and latitude.
234 | This is assuming that the gradient should be equally smooth latitudinally and
235 | longitudinally. This may not be totally accurate, as the function seems to
236 | change faster latitudinally, but we'll ignore that until the exercises at the
237 | bottom. The model seems to be doing pretty well; it captured the original patterns
238 | we saw, and explains a not-inconsiderable 45% of the deviance.
239 |
240 | Let's see if there are any remaining spatiotemporal patterns, though:
241 |
242 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
243 | source("code_snippets/quantile_resid.R")
244 |
245 | florida_birds$base_resid = rqresiduals(richness_base_model)
246 |
247 | ggplot(aes(Longitude, Latitude),
248 | data= florida_birds %>% filter(Year%%6==0))+ #only look at every 6th year
249 | geom_point(aes(color=base_resid))+
250 | geom_polygon(data=florida_map,fill=NA, col="black")+
251 | scale_color_viridis()+
252 | facet_wrap(~Year)+
253 | theme_bw()
254 |
255 | ```
256 | There's definitely still a pattern in the residuals, such as higher than
257 | expected diversity in the mid-latitudes during the 1990s.
258 |
259 | ### Joint models of space and time
260 | The next step is to determine if there is a significant interaction. This is
261 | where tensor product splines come in. Here, we'll use the `ti` formulation, as
262 | it allows us to directly test whether the interaction significantly improves our
263 | model.
264 |
265 | Something important to note: the `d=c(2,1)` argument to `ti`. This tells the
266 | function that the smooth should consist of a tensor product of a
267 | 2-dimensional smooth (lat-long) and a 1-dimensional term (Year).
268 |
269 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
270 | richness_ti_model = gam(Richness~ s(Longitude,Latitude)+s(Year) +
271 | ti(Longitude, Latitude, Year, d=c(2,1)),
272 | data=florida_birds,family=poisson,method="REML")
273 | layout(matrix(1:2,ncol=2))
274 | plot(richness_ti_model,scheme=2)
275 | layout(1)
276 | summary(richness_ti_model)
277 |
278 | ```
279 | This improves the model fit substantially, which you can see by directly
280 | comparing models with `anova`:
281 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
282 | anova(richness_base_model,richness_ti_model,test = "Chisq")
283 |
284 | ```
285 |
286 |
287 | The question then becomes: what does this actually mean? Where are we seeing the
288 | fastest and slowest rates of change in richness? It's often difficult to
289 | effectively show spatiotemporal changes, partly because it would require a
290 | 4-dimensional graph to really show what's going on. Still, we can look at a
291 | couple of different views by using `predict` effectively:
292 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
293 | #First we'll create gridded data
294 | predict_richess = expand.grid(
295 | Latitude= seq(min(florida_birds$Latitude),
296 | max(florida_birds$Latitude),
297 | length=50),
298 | Longitude = seq(min(florida_birds$Longitude),
299 | max(florida_birds$Longitude),
300 | length=50),
301 | Year = seq(1970,2015,by=5)
302 | )
304 | # This selects only the prediction points that fall within Florida's border
304 | predict_richess = predict_richess[with(predict_richess,
305 | inSide(florida_map, Latitude,Longitude)),]
306 | predict_richess$model_fit = predict(richness_ti_model,
307 | predict_richess,type = "response")
308 | ggplot(aes(Longitude, Latitude, fill= model_fit),
309 | data=predict_richess)+
310 | geom_tile()+
311 | facet_wrap(~Year,nrow=2)+
312 | scale_fill_viridis("# of species")+
313 | theme_bw(10)
314 | ```
315 |
316 |
317 | Another way of looking at this is by estimating the rate of change
318 | at each location at each time point, and figuring out which locations
319 | are changing most quickly:
320 |
321 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
322 |
323 | predict_richess$model_change = predict(richness_ti_model, # one-year finite difference:
324 |                                        predict_richess %>% mutate(Year = Year + 1),
325 |                                        type = "response") -
326 |   predict_richess$model_fit # (prediction at Year+1) minus (prediction at Year)
327 | ggplot(aes(Longitude, Latitude, fill= model_change),
328 | data=predict_richess)+
329 | geom_tile()+
330 | facet_wrap(~Year,nrow=2)+
331 | scale_fill_gradient2("Rate of change\n(species per year)")+
332 | theme_bw(10)
333 | ```
334 |
335 | ### Accounting for local heterogeneity with `bs="re"`
336 | These maps suggest that richness fluctuated across the state
337 | throughout the study period, but then began declining sharply state-wide in the
338 | 2000s. (Disclaimer: I don't know enough about either the BBS or Florida to say
339 | how much of this decline is real! Please don't quote me in newspapers saying
340 | biodiversity is collapsing in Florida).
341 |
342 | There is still one persistent issue: viewing the maps of richness, there appear
343 | to be small hotspots and coldspots of richness across the state. This may be due
344 | to a factor I've ignored so far: the data are collected on specific, repeated
345 | routes. If it's just easier to spot birds on one route, or there's a lot of
346 | local nesting habitat there, it will have higher richness than expected over the
347 | whole study. Making the spatial smooth absorb these kinds of hot and cold spots would force our
348 | model to be overly wiggly. Further, not all routes are sampled every year, so we
349 | want to make sure that our trends don't just result from less diverse routes
350 | getting sampled later in the survey.
351 |
352 | To try and account for this effect (and other grouped local factors), we
353 | can use random effects smooths. That would make our model look like this:
354 |
355 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
356 | florida_birds$Route_f = factor(florida_birds$Route)
357 | richness_ti_re_model = gam(Richness~ s(Longitude,Latitude)+s(Year) +
358 | ti(Longitude, Latitude, Year, d=c(2,1))+
359 | s(Route_f, bs="re"),
360 | data=florida_birds,family=poisson,method="REML")
361 | layout(matrix(1:3,nrow=1))
362 | plot(richness_ti_re_model,scheme=2)
363 | layout(1)
364 |
365 | summary(richness_ti_re_model)
366 | anova(richness_ti_model, richness_ti_re_model, test = "Chisq")
367 | ```
368 |
369 | We can view the same plots as before with our new model:
370 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
371 | predict_richess$Route_f = florida_birds$Route_f[1] # predict() needs a valid route level
372 | predict_richess$model_fit = predict(richness_ti_re_model, predict_richess,
373 |                                     type = "response", exclude = "s(Route_f)") # exclude the route effect: predictions are for an "average" route
374 | predict_richess$model_change = predict(richness_ti_re_model,
375 |                                        predict_richess %>% mutate(Year = Year + 1),
376 |                                        type = "response", exclude = "s(Route_f)") -
377 |   predict_richess$model_fit
378 |
379 | ggplot(aes(Longitude, Latitude, fill= model_fit),
380 | data=predict_richess)+
381 | geom_tile()+
382 | facet_wrap(~Year,nrow=2)+
383 | scale_fill_viridis("# of species")+
384 | theme_bw(10)
385 |
386 | ggplot(aes(Longitude, Latitude, fill= model_change),
387 | data=predict_richess)+
388 | geom_tile()+
389 | facet_wrap(~Year,nrow=2)+
390 | scale_fill_gradient2("Rate of change\n(species per year)")+
391 | theme_bw(10)
392 | ```
393 | This definitely seems to have levelled off some of the hot spots from before,
394 | implying that bird richness is more evenly distributed than our prior model
395 | suggested. Still, the large-scale trends remain the same as before.
396 |
397 | ## 4. Exercises
398 |
399 | 1. Given that most of the variability appears to be latitudinal, try to adapt
400 | the tensor smoothers so that latitude, longitude, and year all have their own
401 | penalty terms. How does this change model fit? (A hint sketch follows this list.)
402 |
403 | 2. Following up on the prior
404 | exercise: build a model that allows you to explicitly calculate the marginal
405 | latitudinal gradient in diversity, and to determine how the latitudinal gradient
406 | has changed over time.
407 |
408 | 3. We know from basic sampling theory that if you sample
409 | fewer individuals over time, you also expect to find lower richness, even if
410 | actual richness stays unchanged. What does the spatiotemporal pattern of
411 | abundance look like? Does adding bird abundance as a predictor explain any of the
412 | patterns in richness?
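
For exercise 1, here's one possible starting point (a sketch, not a full answer): by default `te()` gives each argument its own marginal basis and penalty, so longitude, latitude, and year each get their own smoothing parameter.

```{r, eval=FALSE}
richness_te_model = gam(Richness ~ te(Longitude, Latitude, Year),
                        data=florida_birds, family=poisson, method="REML")
summary(richness_te_model)
```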
413 |
--------------------------------------------------------------------------------
/mgcv-esa-workshop.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
--------------------------------------------------------------------------------
/project-requirements.md:
--------------------------------------------------------------------------------
1 | ### Title of the session
2 |
3 | The mgcv package as a one-stop-shop for fitting non-linear ecological models
4 |
5 | ### Description of the session (appears online only; 250 words max.)
6 |
7 | To address the increase in both quantity and complexity of available data, ecologists require flexible, robust statistical models, as well as software to perform such analyses. This workshop will focus on how a single tool, the mgcv package for the R language, can be used to fit models to data from a wide range of sources.
8 |
9 | mgcv is one of the most popular packages for modelling non-linear relationships. However, many users do not know how versatile and powerful a tool it can be. This workshop will focus on teaching participants how to use mgcv in a wide variety of situations (including spatio-temporal, zero-inflated, heavy-tailed, time series, and survival data) and advanced use of mgcv (fitting smooth interactions, seasonal effects, spatial effects, Markov random fields and varying-coefficient models).
10 |
11 | The workshop will give participants an understanding of: (a) practical elements of smoothing theory, with a focus on why they would choose to use different types of smoothers, (b) model checking and selection, and (c) the range of modelling possibilities using mgcv. Participants will be assumed to be familiar with the basics of R (loading/manipulating data, functions, and plotting) and regression in R (lm() and glm()). The organizers have extensive practical experience with ecological statistics and modelling using mgcv.
12 |
13 | ### Summary sentence (appears in print only; 50 word max.)
14 |
15 | mgcv is one of the most popular R packages for non-linear modelling, but many users do not know how versatile and powerful it is. This workshop will focus on using mgcv in a wide variety of situations common in ecology, and advanced use of mgcv for model fitting.
16 |
17 | ### Name and contact information (affiliation, email) for the lead organizer and any co-organizers
18 |
19 | Eric Pedersen
20 | Center for Limnology
21 | 680 North Park Street
22 | University of Wisconsin-Madison
23 | Madison, WI, USA 53704
24 | eric.pedersen@wisc.edu
25 |
26 | David L Miller
27 | Northeast Fisheries Science Center
28 | National Oceanic and Atmospheric Administration
29 | 166 Water Street
30 | Woods Hole, MA 02543
31 |
32 | Gavin L. Simpson
33 | Institute of Environmental Change and Society
34 | University of Regina
35 | 3737 Wascana Parkway
36 | Regina
37 | Saskatchewan
38 | Canada
39 | S4S 0A2
40 |
41 | ### Minimum and maximum number of participants (to assist in room assignment).
42 | 10 minimum, 30 maximum.
43 |
44 | ### Requested scheduling
45 | Saturday (August 6th), all day.
46 |
47 | ### Any additional A/V equipment (standard A/V setup is a screen, LCD projector, and laptop)
48 | We will need the standard setup, as well as wifi access for all participants.
49 |
50 | ### Room set-up desired: theater, conference, hollow square, rounds, other
51 | A conference room.
52 |
53 | ### Food and beverage requests
54 | No food or beverages requested.
55 |
56 | ### Underwriting of workshop costs by a group or agency
57 | None of the costs will be underwritten.
58 |
59 | ### Is the session intended to be linked to a scientific session?
60 | No.
61 |
62 | ### Is the session intended to be linked to a business meeting or mixer?
63 | No.
64 |
65 | ### Describe any known (workshop/event) scheduling conflicts (what should it follow/precede/not conflict with?)
66 | We would require a Saturday time slot, as one of the organizers (Gavin Simpson) is also organizing
67 | the Advanced Vegan Workshop Proposal on Sunday AM and attending the ESA Council meeting on Sunday PM.
68 |
--------------------------------------------------------------------------------
/project_description.md:
--------------------------------------------------------------------------------
1 | # The mgcv package as a one-stop-shop for fitting non-linear ecological models
2 |
3 | To address the increase in both quantity and complexity of available data, ecologists require flexible, robust statistical models, as well as software to perform such analyses. This workshop will focus on how a single tool, the mgcv package for the R language, can be used to fit models to data from a wide range of sources.
4 |
5 | mgcv is one of the most popular packages for modelling non-linear relationships. However, many users do not know how versatile and powerful a tool it can be. This workshop will focus on teaching participants how to use mgcv in a wide variety of situations (including spatio-temporal, zero-inflated, heavy-tailed, time series, and survival data) and advanced use of mgcv (fitting smooth interactions, seasonal effects, spatial effects, Markov random fields and varying-coefficient models).
6 |
7 | The workshop will give participants an understanding of: (a) practical elements of smoothing theory, with a focus on why they would choose to use different types of smoothers, (b) model checking and selection, and (c) the range of modelling possibilities using mgcv. Participants will be assumed to be familiar with the basics of R (loading/manipulating data, functions, and plotting) and regression in R (lm() and glm()). The organizers have extensive practical experience with ecological statistics and modelling using mgcv.
8 |
--------------------------------------------------------------------------------
/slides/00-preamble.Rpres:
--------------------------------------------------------------------------------
1 | The mgcv package as a one-stop-shop for fitting non-linear ecological models
2 | ============================
3 | author: David L Miller, Eric Pedersen, and Gavin L Simpson
4 | css: custom.css
5 | transition: none
6 |
7 |
8 | Good morning!
9 | ==============
10 | type: section
11 |
12 |
13 |
14 | Logistics
15 | ==========
16 | - where is the coffee?
17 | - where is the bathroom?
18 | - what if there is a fire?
19 |
20 | Who are we?
21 | ===========
22 | type:section
23 |
24 |
25 | Eric Pedersen
26 | =============
27 | - Population, community, and movement ecologist
28 | - (sort of) Fisheries (sort of) ecologist
29 | - Movement and life-history modelling
30 | - Connecting theoretical models to data with statistics
31 | - Unreasonably obsessed with smoothing and penalization as a solution to all of life's problems
32 |
33 | Gavin Simpson
34 | =============
35 |
36 | - (Palaeo)ecologist interested in community dynamics & environmental change
37 | - (Palaeo)limnologist interested in environmental change
38 | - Self-taught not-statistician
39 | - Ecological stats software (vegan plus some palaeo stuff)
40 | - To me most things are a model I can (will!) fit with *mgcv*
41 |
42 | David L Miller
43 | ===============
44 |
45 | - Cetacean distribution modelling
46 | - Spatial modelling (esp. model checking)
47 | - Distance sampling [distancesampling.org](http://distancesampling.org)
48 | - Statistical software (`Distance`, `mrds`, `dsm`)
49 | - Trying to shoehorn all projects into `mgcv`
50 |
51 |
52 | Who are you?
53 | ============
54 | type:section
55 |
56 |
57 | What is the structure of the day?
58 | =================================
59 | type:section
60 |
61 | Timing
62 | ======
63 |
64 | - 8am to 5pm
65 | - Morning session: 8am-12pm
66 | - Afternoon session: 1pm-5pm
67 |
68 | (Approximate) schedule
69 | ======================
70 |
71 | Time | Thing
72 | -----------|--------------------
73 | 0800-0815 | Intro
74 | 0815-0945 | Generalized Additive Models
75 | 0945-1000 | Coffee
76 | 1000-1045 | Model checking
77 | 1045-1130 | Model selection
78 | 1130-1200 | Beyond the exponential family
79 | 1200-1300 | Lunch
80 | 1300-1700 | Extended examples/demos
81 |
82 |
83 | Foraging
84 | ========
85 |
86 |
87 | Wifi
88 | ====
89 |
90 |
91 |
92 |
93 |
--------------------------------------------------------------------------------
/slides/01-intro.Rpres:
--------------------------------------------------------------------------------
1 | Generalized Additive Models
2 | ============================
3 | author: David L Miller
4 | css: custom.css
5 | transition: none
6 |
7 |
8 | Overview
9 | =========
10 |
11 | - What is a GAM?
12 | - What is smoothing?
13 | - How do GAMs work? (*Roughly*)
14 | - Fitting and plotting simple models
15 |
16 | ```{r setup, include=FALSE}
17 | library(knitr)
18 | library(viridis)
19 | library(ggplot2)
20 | library(reshape2)
21 | library(animation)
22 | library(mgcv)
23 | opts_chunk$set(cache=TRUE, echo=FALSE)
24 | ```
25 |
26 | What is a GAM?
27 | ===============
28 | type:section
29 |
30 | Generalized Additive Models
31 | ============================
32 |
33 | - Generalized: many response distributions
34 | - Additive: terms **add** together
35 | - Models: well, it's a model...
36 |
37 | To GAMs from GLMs and LMs
38 | =============================
39 | type:section
40 |
41 |
42 | (Generalized) Linear Models
43 | =============================
44 |
45 | Models that look like:
46 |
47 | $$
48 | y_i = \beta_0 + x_{1i}\beta_1 + x_{2i}\beta_2 + \ldots + \epsilon_i
49 | $$
50 |
51 | (describe the response, $y_i$, as a linear combination of the covariates, $x_{ji}$, plus an intercept)
52 |
53 | We can make $y_i\sim$ any exponential family distribution (Normal, Poisson, etc).
54 |
55 | Error term $\epsilon_i$ is normally distributed (usually).
56 |
57 | Why bother with anything more complicated?!
58 | =============================
59 | type:section
60 |
61 | Is this linear?
62 | =============================
63 |
64 | ```{r islinear, fig.width=12, fig.height=6}
65 | set.seed(2) ## simulate some data...
66 | dat <- gamSim(1, n=400, dist="normal", scale=1, verbose=FALSE)
67 | dat <- dat[,c("y", "x0", "x1", "x2", "x3")]
68 | p <- ggplot(dat,aes(y=y,x=x1)) +
69 | geom_point() +
70 | theme_minimal()
71 | print(p)
72 | ```
73 |
74 | Is this linear? Maybe?
75 | =============================
76 |
77 | ```{r eval=FALSE, echo=TRUE}
78 | lm(y ~ x1, data=dat)
79 | ```
80 |
81 |
82 | ```{r maybe, fig.width=12, fig.height=6}
83 | p <- ggplot(dat, aes(y=y, x=x1)) + geom_point() +
84 | theme_minimal()
85 | print(p + geom_smooth(method="lm"))
86 | ```
87 |
88 |
89 |
90 | What can we do?
91 | =============================
92 | type:section
93 |
94 | Adding a quadratic term?
95 | =============================
96 |
97 | ```{r eval=FALSE, echo=TRUE}
98 | lm(y ~ poly(x1, 2), data=dat)  # poly() already includes the linear term
99 | ```
100 |
101 | ```{r quadratic, fig.width=12, fig.height=6}
102 | p <- ggplot(dat, aes(y=y, x=x1)) + geom_point() +
103 | theme_minimal()
104 | print(p + geom_smooth(method="lm", formula=y~poly(x, 2)))
105 | ```
106 |
107 |
108 |
109 |
110 | Is this sustainable?
111 | =============================
112 |
113 | - Adding in quadratic (and higher terms) *can* make sense
114 | - This feels a bit *ad hoc*
115 | - Better if we had a **framework** to deal with these issues?
116 |
117 | ```{r ruhroh, fig.width=12, fig.height=6}
118 | p <- ggplot(dat, aes(y=y, x=x2)) + geom_point() +
119 | theme_minimal()
120 | print(p + geom_smooth(method="lm", formula=y~poly(x, 2)))
121 | ```
122 |
123 |
124 | [drumroll]
125 | =============================
126 | type:section
127 |
128 |
129 | What does a model look like?
130 | =============================
131 |
132 | $$
133 | y_i = \beta_0 + \sum_j s_j(x_{ji}) + \epsilon_i
134 | $$
135 |
136 | where $\epsilon_i \sim N(0, \sigma^2)$, $y_i \sim \text{Normal}$ (for now)
137 |
138 | Remember that we're modelling the mean of this distribution!
139 |
140 | Call the above equation the **linear predictor**
141 |
142 | Okay, but what about these "s" things?
143 | ====================================
144 | right:55%
145 |
146 | ```{r smoothdat, fig.width=8, fig.height=8}
147 |
148 | spdat <- melt(dat, id.vars = c("y"))
149 | p <- ggplot(spdat,aes(y=y,x=value)) +
150 | geom_point() +
151 | theme_minimal() +
152 | facet_wrap(~variable, nrow=2)
153 | print(p)
154 | ```
155 | ***
156 | - Think $s$=**smooth**
157 | - Want to model the covariates flexibly
158 | - Covariates and response not necessarily linearly related!
159 | - Want some "wiggles"
160 |
161 | Okay, but what about these "s" things?
162 | ====================================
163 | right:55%
164 |
165 | ```{r wsmooths, fig.width=8, fig.height=8}
166 | p <- p + geom_smooth()
167 | print(p)
168 | ```
169 | ***
170 | - Think $s$=**smooth**
171 | - Want to model the covariates flexibly
172 | - Covariates and response not necessarily linearly related!
173 | - Want some "wiggles"
174 |
175 | What is smoothing?
176 | ===============
177 | type:section
178 |
179 |
180 | Straight lines vs. interpolation
181 | =================================
182 | right:55%
183 |
184 | ```{r wiggles, fig.height=8, fig.width=8}
185 | library(mgcv)
186 | # hacked from the example in ?gam
187 | set.seed(2) ## simulate some data...
188 | dat <- gamSim(1,n=50,dist="normal",scale=0.5, verbose=FALSE)
189 | dat$y <- dat$f2 + rnorm(length(dat$f2), sd = sqrt(0.5))
190 | f2 <- function(x) 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10-mean(dat$y)
191 | ylim <- c(-4,6)
192 |
193 | # fit some models
194 | b.justright <- gam(y~s(x2),data=dat)
195 | b.sp0 <- gam(y~s(x2, sp=0, k=50),data=dat)
196 | b.spinf <- gam(y~s(x2),data=dat, sp=1e10)
197 |
198 | curve(f2,0,1, col="blue", ylim=ylim)
199 | points(dat$x2, dat$y-mean(dat$y))
200 |
201 | ```
202 | ***
203 | - Want a line that is "close" to all the data
204 | - Don't want interpolation -- we know there is "error"
205 | - Balance between interpolation and "fit"
206 |
207 | Splines
208 | ========
209 |
210 | - Functions made of other, simpler functions
211 | - **Basis functions** $b_k$, estimate $\beta_k$
212 | - $s(x) = \sum_{k=1}^K \beta_k b_k(x)$
213 | - Makes the math(s) much easier
214 |
215 |
216 |
217 | Design matrices
218 | ===============
219 |
220 | - We often write models as $X\boldsymbol{\beta}$
221 | - $X$ is our data
222 | - $\boldsymbol{\beta}$ are parameters we need to estimate
223 | - For a GAM it's the same
224 | - $X$ has columns for each basis, evaluated at each observation
225 | - again, this is the linear predictor
226 |
227 | Measuring wigglyness
228 | ======================
229 |
230 | - Visually:
231 | - Lots of wiggles == NOT SMOOTH
232 | - Straight line == VERY SMOOTH
233 | - How do we do this mathematically?
234 | - Derivatives!
235 |   - (Calculus *was* a useful class after all!)
236 |
237 |
238 |
239 | Wigglyness by derivatives
240 | ==========================
241 |
242 | ```{r wigglyanim, results="hide"}
243 | library(numDeriv)
244 | f2 <- function(x) 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10 - mean(dat$y)
245 |
246 | xvals <- seq(0,1,len=100)
247 |
248 | plot_wiggly <- function(f2, xvals){
249 |
250 | # pre-calculate
251 | f2v <- f2(xvals)
252 | f2vg <- grad(f2,xvals)
253 | f2vg2 <- unlist(lapply(xvals, hessian, func=f2))
254 | f2vg2min <- min(f2vg2) -2
255 |
256 | # now plot
257 | for(i in 1:length(xvals)){
258 | par(mfrow=c(1,3))
259 | plot(xvals, f2v, type="l", main="function", ylab="f")
260 | points(xvals[i], f2v[i], pch=19, col="red")
261 |
262 | plot(xvals, f2vg, type="l", main="derivative", ylab="df/dx")
263 | points(xvals[i], f2vg[i], pch=19, col="red")
264 |
265 | plot(xvals, f2vg2, type="l", main="2nd derivative", ylab="d2f/dx2")
266 | points(xvals[i], f2vg2[i], pch=19, col="red")
267 | polygon(x=c(0,xvals[1:i], xvals[i],f2vg2min),
268 | y=c(f2vg2min,f2vg2[1:i],f2vg2min,f2vg2min), col = "grey")
269 |
270 | ani.pause()
271 | }
272 | }
273 |
274 | saveGIF(plot_wiggly(f2, xvals), "wiggly.gif", interval = 0.2, ani.width = 800, ani.height = 400)
275 | ```
276 |
277 | 
278 |
279 | What was that grey bit?
280 | =========================
281 |
282 | $$
283 | \int_\mathbb{R} \left( \frac{\partial^2 f(x)}{\partial x^2}\right)^2 \text{d}x
284 | $$
285 |
286 | (Take some derivatives of the smooth and integrate them over $x$)
287 |
288 | (*Turns out* we can always write this as $\boldsymbol{\beta}^\text{T}S\boldsymbol{\beta}$, so the $\boldsymbol{\beta}$ is separate from the derivatives)
289 |
290 | (Call $S$ the **penalty matrix**)
291 |
292 | Making wigglyness matter
293 | =========================
294 |
295 | - $\boldsymbol{\beta}^\text{T}S\boldsymbol{\beta}$ measures wigglyness
296 | - "Likelihood" measures closeness to the data
297 | - Penalise closeness to the data...
298 | - Use a **smoothing parameter** to decide on that trade-off...
299 | - $\lambda \beta^\text{T}S\beta$
300 | - Estimate the $\beta_k$ terms but penalise objective
301 | - "closeness to data" + penalty
302 |
303 | Smoothing parameter
304 | =======================
305 |
306 |
307 | ```{r wiggles-plot, fig.width=15}
308 | # make three plots, w. estimated smooth, truth and data on each
309 | par(mfrow=c(1,3), cex.main=3.5)
310 |
311 | plot(b.justright, se=FALSE, ylim=ylim, main=expression(lambda*plain("= just right")))
312 | points(dat$x2, dat$y-mean(dat$y))
313 | curve(f2,0,1, col="blue", add=TRUE)
314 |
315 | plot(b.sp0, se=FALSE, ylim=ylim, main=expression(lambda*plain("=")*0))
316 | points(dat$x2, dat$y-mean(dat$y))
317 | curve(f2,0,1, col="blue", add=TRUE)
318 |
319 | plot(b.spinf, se=FALSE, ylim=ylim, main=expression(lambda*plain("=")*infinity))
320 | points(dat$x2, dat$y-mean(dat$y))
321 | curve(f2,0,1, col="blue", add=TRUE)
322 |
323 | ```
324 |
325 | Smoothing parameter selection
326 | ==============================
327 |
328 | - Many methods: AIC, Mallows' $C_p$, GCV, ML, REML
329 | - Recommendation, based on simulation and practice:
330 | - Use REML or ML
331 | - Reiss \& Ogden (2009), Wood (2011)
332 |
333 |
334 |
335 |
336 | Maximum wiggliness
337 | ========================
338 |
339 | - We can set **basis complexity** or "size" ($k$)
340 | - Maximum wigglyness
341 | - Smooths have **effective degrees of freedom** (EDF)
342 | - EDF < $k$
343 | - Set $k$ "large enough"
344 | - Penalty does the rest
345 |
346 |
347 | More on this in a bit...
348 |
349 | uhoh
350 | ======
351 | title: none
352 | type:section
353 |
354 |