├── .gitignore ├── LICENSE ├── README.md ├── code_snippets ├── gra2mgcv.R ├── quantile_resid.R └── vis.concurvity.R ├── course_outline.md ├── data ├── bbs_data │ ├── bbs_florida.csv │ ├── bbs_florida_richness.csv │ └── routes.csv ├── bbs_florida.csv ├── bbs_florida_richness.csv ├── beta-regression │ └── ZooData.txt ├── forest-health │ ├── beech.raw │ ├── foresthealth.bnd │ ├── foresthealth.dat │ └── foresthealth.gra ├── mexdolphins │ ├── README.md │ ├── mexdolphins.RData │ ├── soap_pred.R │ └── soapy.RData ├── routes.csv ├── time_series │ └── Florida_climate.csv └── yukon_seeds │ ├── seed_data.csv │ └── seed_source_locations.csv ├── example-bivariate-timeseries-and-ti.Rmd ├── example-bivariate-timeseries-and-ti.html ├── example-forest-health.Rmd ├── example-forest-health.html ├── example-linear-functionals.Rmd ├── example-nonlinear-timeseries.Rmd ├── example-nonlinear-timeseries.html ├── example-spatial-mexdolphins-solutions.Rmd ├── example-spatial-mexdolphins-solutions.html ├── example-spatial-mexdolphins.Rmd ├── example-spatial-mexdolphins.html ├── example-spatio-temporal data.Rmd ├── example-spatio-temporal_data.html ├── mgcv-esa-workshop.Rproj ├── project-requirements.md ├── project_description.md └── slides ├── 00-preamble.Rpres ├── 01-intro.Rpres ├── 02-model_checking.Rpres ├── 03-model-selection.Rpres ├── 04-Beyond_the_exponential_family.Rpres ├── correct_mathjax.sh ├── custom.css ├── images ├── addbasis.png ├── animal_choice.png ├── animal_codes.png ├── animal_functions.png ├── count.jpg ├── mathematical_sobbing.jpg ├── remlgcv.png ├── spotteddolphin_swfsc.jpg └── tina-modelling.png ├── quantile_resid.R ├── uncertainty.gif └── wiggly.gif /.gitignore: -------------------------------------------------------------------------------- 1 | *.Rhistory 2 | .Rproj.user/ 3 | .Rproj.user 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Code in this respository is licensed under the GPL version >= 2. 2 | 3 | All other material is Creative Commons BY licensed (except where it does not belong to us, we do not claim copyright over anyone else's material). 
4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mgcv-esa-workshop 2 | -------------------------------------------------------------------------------- /code_snippets/gra2mgcv.R: -------------------------------------------------------------------------------- 1 | ## Read in a BayesX graph file and output list required by mgcv's mrf smooth 2 | gra2mgcv <- function(file) { 3 | ## first row contains number of objects in file 4 | ## graph info is stored in triplets or rows after first row 5 | ## - 1st row of triplet is ID 6 | ## - 2nd row of triplet is number of neighbours 7 | ## - 3rd row of triplet is vector of neighbour 8 | gra <- readLines(file) 9 | N <- gra[1] 10 | gra <- gra[-1] # pop the first row which contains N 11 | ids <- seq.int(1, by = 3, length.out = N) 12 | nbs <- seq.int(3, by = 3, length.out = N) 13 | node2id <- function(nodes, ids) { 14 | as.numeric(ids[as.numeric(nodes) + 1]) 15 | } 16 | l <- lapply(strsplit(gra[nbs], " "), node2id, ids = gra[ids]) 17 | names(l) <- gra[ids] 18 | l 19 | } 20 | -------------------------------------------------------------------------------- /code_snippets/quantile_resid.R: -------------------------------------------------------------------------------- 1 | #This code is modified for D.L. Miller's dsm package for distance sampling, from 2 | #the rqgam.check function. The code is designed to extract randomized quantile 3 | #residuals from GAMs, using the family definitions in mgcv. Note statmod only 4 | #supports RQ residuals for the following families: Tweedie, Poisson, Gaussian, Any errors are due to Eric Pedersen 5 | library(statmod) #This has functions for randomized quantile residuals 6 | rqresiduals = function (gam.obj) { 7 | if(!"gam" %in% attr(gam.obj,"class")){ 8 | stop('"gam.obj has to be of class "gam"') 9 | } 10 | if (!grepl("^Tweedie|^Negative Binomial|^poisson|^binomial|^gaussian|^Gamma|^inverse.gaussian", 11 | gam.obj$family$family)){ 12 | stop(paste("family " , gam.obj$family$family, 13 | " is not currently supported by the statmod library, 14 | and any randomized quantile residuals would be inaccurate.", 15 | sep="")) 16 | } 17 | if (grepl("^Tweedie", gam.obj$family$family)) { 18 | if (is.null(environment(gam.obj$family$variance)$p)) { 19 | p.val <- gam.obj$family$getTheta(TRUE) 20 | environment(gam.obj$family$variance)$p <- p.val 21 | } 22 | qres <- qres.tweedie(gam.obj) 23 | } 24 | else if (grepl("^Negative Binomial", gam.obj$family$family)) { 25 | if ("extended.family" %in% class(gam.obj$family)) { 26 | gam.obj$theta <- gam.obj$family$getTheta(TRUE) 27 | } 28 | else { 29 | gam.obj$theta <- gam.obj$family$getTheta() 30 | } 31 | qres <- qres.nbinom(gam.obj) 32 | } 33 | else { 34 | qres <- qresid(gam.obj) 35 | } 36 | return(qres) 37 | } -------------------------------------------------------------------------------- /code_snippets/vis.concurvity.R: -------------------------------------------------------------------------------- 1 | # visualise concurvity between terms in a GAM 2 | # David L Miller 2015, MIT license 3 | 4 | # arguments: 5 | # b -- a fitted gam 6 | # type -- concurvity measure to plot, see ?concurvity 7 | 8 | vis.concurvity <- function(b, type="estimate"){ 9 | cc <- concurvity(b, full=FALSE)[[type]] 10 | 11 | diag(cc) <- NA 12 | cc[lower.tri(cc)]<-NA 13 | 14 | layout(matrix(1:2, ncol=2), widths=c(5,1)) 15 | opar <- par(mar=c(5, 6, 5, 0) + 0.1) 16 | # main plot 17 | image(z=cc, 
x=1:ncol(cc), y=1:nrow(cc), ylab="", xlab="", 18 | axes=FALSE, asp=1, zlim=c(0,1)) 19 | axis(1, at=1:ncol(cc), labels = colnames(cc), las=2) 20 | axis(2, at=1:nrow(cc), labels = rownames(cc), las=2) 21 | # legend 22 | opar <- par(mar=c(5, 0, 4, 3) + 0.1) 23 | image(t(matrix(rep(seq(0, 1, len=100), 2), ncol=2)), 24 | x=1:3, y=1:101, zlim=c(0,1), axes=FALSE, xlab="", ylab="") 25 | axis(4, at=seq(1,101,len=5), labels = round(seq(0,1,len=5),1), las=2) 26 | par(opar) 27 | } 28 | -------------------------------------------------------------------------------- /course_outline.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Course outline 3 | 4 | --- 5 | 6 | - *Start time:* 8am 7 | - *End time:* 5pm 8 | 9 | 2x 4 hour blocks: 10 | 11 | - Morning: 8am-12pm 12 | - Aternoon: 1pm-5pm 13 | 14 | 15 | # Morning 16 | 17 | ## Intro (Dave M) 18 | 19 | - GLM refresher 20 | - model the *mean* 21 | - here are some distributions (plot + mean-variance) 22 | - GLMs -> GAMs 23 | - Spline 24 | - why are they good? 25 | - how do they work? 26 | - measuring wigglyness 27 | - What is uncertainty? 28 | * "Hook" for "model expansion" 29 | 30 | 31 | ### Practical 32 | 33 | - `gam()` 34 | - `s()` 35 | - `gam.check()` 36 | - `summary()` 37 | - `plot()` 38 | * `predict()` 39 | 40 | 41 | ## Model checking (Eric) 42 | 43 | *(1/2 slot?)* 44 | 45 | - `gam.check()` 46 | - `concurvity()` 47 | - (randomised quantile residuals) 48 | * autocorrelation (check?) 49 | 50 | ### Practical 51 | 52 | - `gam.check()` 53 | 54 | ## Model selection (Gavin) 55 | 56 | *(1/2 slot?)* 57 | 58 | - shrinkage via shrinkage smooths 59 | - shrinkage via select = TRUE 60 | - AIC corrected for estimated smoothness params 61 | - approximate *p* values (mention at least) 62 | 63 | ### Practical 64 | 65 | - `summary()` 66 | 67 | ## Beyond the exponential family (all) 68 | 69 | *(1/2 slot?)* 70 | 71 | - Count data: Tweedie, negative binomial & ZIP (Dave) 72 | - Beta regression (Gavin) 73 | - Other stuff: t, `cox.ph`, ordered categorical, mvnorm (Eric) 74 | 75 | # Afternoon 76 | 77 | ## Extended examples/demos (all) 78 | 79 | Introduce a set of examples, let people pick what they want. Workbook-type situation. 80 | 81 | - Spatial modelling 82 | - Dolphins (Dave) (Example) 83 | - Spatio-temporal models 84 | - Trawl data from ERDDAP (Eric) (Example) 85 | - Lake cores (Gavin) (Demo) 86 | - Time series modelling 87 | - `lynx` (Eric) (Example) 88 | - Ft Lauderdale (Eric) (Example) 89 | - Ft Lauderdale `ti()` (Gavin) (Example) 90 | - Markov random fields 91 | - forest health data (Gavin) (Example) 92 | - `"lpmatrix"` 93 | - dolphins revisit (Dave) (Example) 94 | - random effects 95 | - cheetahs (Gavin) 96 | - ship effect trawl (Eric) 97 | - Location-scale models? 98 | - Quick example (Gavin) (Demo) 99 | - Linear functionals 100 | - dispersal kernels black spruce (Eric) (Example) 101 | 102 | 103 | ## Smoother zoo checklist 104 | 105 | - [ ] 2-D smoother `s()` (Dave intro example) 106 | - [ ] `te()` 107 | - [ ] soap film (Dave spatial dolphins) 108 | - [ ] `"ts"` (Gavin model selection) 109 | - [ ] Cyclic splines 110 | - [ ] `fs` basis/`by=` 111 | - [ ] random effects 112 | - [ ] Gaussian process smoothers? 113 | - [ ] multiple formulae specification? 
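
For quick reference, here is a rough sketch of how each smoother in the checklist above is specified in mgcv. The data frame `dat`, the objects `bnd` and `soap_knots`, and all variable names are hypothetical placeholders, so the calls are shown commented out rather than as runnable course code.

```r
## Rough mgcv syntax for the smoother zoo -- a sketch only.
## `dat`, `bnd`, `soap_knots`, and the variable names are hypothetical placeholders.
library(mgcv)
# gam(z ~ s(x, y), data = dat)                                # 2-D thin plate smooth
# gam(z ~ te(x, t), data = dat)                               # tensor product smooth
# gam(z ~ s(x, y, bs = "so", xt = list(bnd = bnd)),           # soap film smoother
#     knots = soap_knots, data = dat)
# gam(z ~ s(x, bs = "ts"), data = dat)                        # shrinkage ("ts") smooth
# gam(z ~ s(month, bs = "cc"), data = dat)                    # cyclic spline
# gam(z ~ s(x, fac, bs = "fs"), data = dat)                   # factor-smooth ("fs") basis
# gam(z ~ s(x, by = fac) + fac, data = dat)                   # by= factor interaction
# gam(z ~ s(fac, bs = "re"), data = dat)                      # random effect smooth
# gam(z ~ s(x, bs = "gp"), data = dat)                        # Gaussian process smooth
# gam(list(z ~ s(x), ~ s(x)), data = dat, family = gaulss())  # multiple formulae (location-scale)
```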
114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /data/bbs_data/routes.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/data/bbs_data/routes.csv -------------------------------------------------------------------------------- /data/beta-regression/ZooData.txt: -------------------------------------------------------------------------------- 1 | Number Scans Proportion Size Visual Raised Visitors Feeding Oc Other Enrichment Group Sex Enclosure Vehicle Diet Age Zoo Eps 41 300 0.136666667 650 2 2 6418 1 1 2 2 2 1 0 59 3 2 1 1 47 150 0.313333333 2405 2 1 13607 1 2 2 2 1 1 0 263 1 2 2 2 28 300 0.093333333 1781 4 2 13607 1 2 2 2 2 1 0 263 1 3 2 3 26 150 0.173333333 390 5 1 0 1 2 2 1 1 2 0 0 1 3 2 4 13 148 0.087837838 390 2 1 0 1 2 2 1 1 2 0 0 1 3 2 5 25 152 0.164473684 390 3 1 0 1 2 2 1 1 2 0 0 1 2 2 6 15 301 0.049833887 1200 4 2 11713 1 2 2 2 2 2 20 215 3 8 3 7 78 299 0.260869565 1200 4 2 11713 1 2 2 2 2 2 12 215 3 2 3 8 77 300 0.256666667 1200 4 2 11713 1 2 2 2 2 1 16 215 3 2 3 9 64 300 0.213333333 70 1 1 11713 1 2 2 2 2 1 20 215 3 8 3 10 80 300 0.266666667 40 1 1 11713 1 2 2 2 2 2 12 215 3 2 3 11 28 240 0.116666667 1200 4 2 4225 1 2 2 2 2 1 16 28 3 8 3 12 2 150 0.013333333 650 5 2 96513 1 1 1 2 1 2 20 3 3 7 4 13 12 150 0.08 650 5 2 96513 1 1 1 2 1 2 20 3 3 1 4 14 2 150 0.013333333 650 5 2 7294 1 1 1 1 1 2 0 7 3 7 4 15 16 150 0.106666667 100 1 1 0 1 1 1 1 1 2 20 6 3 7 4 16 12 150 0.08 100 1 1 0 1 1 1 1 1 2 20 6 3 1 4 17 6 356 0.016853933 2030 8 2 76000 1 2 2 1 2 1 6 0 1 2 5 18 45 120 0.375 715 6 2 76000 1 2 2 1 1 2 6 0 1 2 5 19 27 270 0.1 715 6 2 14195 2 2 2 1 2 1 0 0 1 2 5 20 28 90 0.311111111 2030 8 2 14195 2 2 2 1 1 2 0 0 1 2 5 21 14 100 0.14 420 2 1 8971 1 1 2 1 1 1 0 112 3 1 6 22 9 100 0.09 434 2 2 39120 1 1 2 1 1 1 0 116 3 1 6 23 10 100 0.1 434 2 2 27589 1 1 2 1 1 1 0 157 3 1 6 24 13 497 0.026156942 434 2 2 8971 1 1 2 1 2 2 0 112 3 3 6 25 10 495 0.02020202 1700 4 1 39120 1 1 2 1 2 2 1 116 3 3 6 26 9 499 0.018036072 1700 4 1 27589 1 1 2 1 2 2 0 157 3 3 6 27 20 199 0.100502513 3800 8 2 8971 1 1 2 1 2 2 0 112 3 5 6 28 26 200 0.13 3000 8 2 39120 1 1 2 2 2 2 0 63 3 5 6 29 21 200 0.105 3000 8 2 27589 1 1 2 2 2 2 0 84 3 5 6 30 0 296 0 3000 8 2 8971 1 1 2 2 2 1 0 51 3 10 6 31 6 300 0.02 504 4 2 27589 1 1 2 1 2 1 1 157 3 10 6 32 63 299 0.210702341 1700 4 1 8971 1 1 2 1 2 1 1 112 3 10 6 33 21 100 0.21 504 4 2 8971 1 1 2 1 1 2 0 112 3 8 6 34 18 100 0.18 504 4 2 39120 1 1 2 1 1 2 0 116 3 3 6 35 13 192 0.067708333 434 2 1 22000 2 1 2 2 1 2 0 128 3 5 6 36 34 150 0.226666667 1000 2 1 627 1 2 1 2 1 1 0 83 2 7 7 37 21 150 0.14 600 2 1 627 1 2 1 2 1 2 0 83 2 4 7 38 35 150 0.233333333 578 4 1 627 1 2 1 2 1 2 0 83 2 3 7 39 16 150 0.106666667 630 4 1 627 1 2 1 2 1 2 0 83 2 8 7 40 18 150 0.12 587 2 1 627 1 2 1 2 1 2 0 83 2 7 7 41 22 150 0.146666667 203 1 1 627 1 2 1 2 1 1 0 83 2 2 7 42 27 150 0.18 1100 3 1 627 1 2 1 2 1 1 0 54 1 8 7 43 16 300 0.053333333 1400 8 1 627 1 2 1 2 2 1 1 27 1 8 7 44 4 302 0.013245033 600 5 1 0 2 2 1 1 2 1 0 22 1 11 7 45 16 147 0.108843537 1600 9 1 0 2 2 1 1 1 1 0 26 1 9 7 46 4 150 0.026666667 400 4 1 0 2 2 1 1 1 2 0 26 1 8 7 47 35 300 0.116666667 400 4 1 0 2 2 1 1 2 1 0 26 1 5 7 48 11 150 0.073333333 500 9 1 0 2 2 1 1 1 2 0 26 1 11 7 49 8 149 0.053691275 500 9 1 0 2 2 1 1 1 2 0 26 1 12 7 50 18 150 0.12 500 9 1 0 2 2 1 1 1 2 0 26 1 11 7 51 19 299 0.063545151 500 9 1 0 2 2 1 1 2 1 0 26 1 
10 7 52 3 150 0.02 500 9 1 0 2 2 1 1 1 1 0 26 1 10 7 53 15 495 0.03030303 1800 8 1 0 2 2 2 1 2 2 0 26 1 1 7 54 36 150 0.24 431 2 1 0 2 2 1 1 1 1 2 32 1 4 7 55 10 150 0.066666667 336 2 1 0 2 2 1 1 1 2 0 32 1 9 7 56 3 149 0.020134228 67.5 1 1 0 2 2 1 1 1 1 0 32 1 7 7 57 1 150 0.006666667 195 2 1 0 2 2 1 1 1 1 2 32 1 8 7 58 10 150 0.066666667 270 3 1 0 2 2 1 1 1 1 0 32 1 12 7 59 12 298 0.040268456 1900 9 1 0 2 2 1 1 2 1 1 19 1 9 7 60 14 150 0.093333333 360 4 1 0 2 2 1 1 1 1 0 11 1 7 7 61 33 150 0.22 360 3 1 0 2 2 1 1 1 2 0 11 1 3 7 62 9 149 0.060402685 266 5 1 0 2 2 1 1 1 2 0 11 1 9 7 63 19 150 0.126666667 360 7 1 0 2 2 1 1 1 2 0 11 1 6 7 64 39 600 0.065 15000 4 1 718 1 2 1 2 2 1 8 51 3 3 8 65 17 300 0.056666667 25000 4 2 718 1 2 1 2 2 2 10 49 3 5 8 66 9 150 0.06 10000 3 1 718 1 2 1 1 1 1 0 27 3 15 8 67 11 300 0.036666667 25000 4 2 718 1 2 1 2 2 2 8 40 3 1 8 68 10 591 0.016920474 10000 9 1 0 2 2 2 2 2 2 0 12 3 9 8 69 2 749 0.002670227 40000 9 2 0 2 1 2 1 2 1 0 16 3 5 8 70 20 448 0.044642857 10000 4 2 0 1 2 1 1 2 1 3 18 3 1 8 71 0 150 0 25000 10 2 0 2 2 2 1 1 1 0 22 3 10 8 72 1 300 0.003333333 25000 10 2 0 2 2 2 1 2 1 0 22 3 10 8 73 13 150 0.086666667 80000 9 2 718 2 2 2 2 1 2 0 47 3 10 8 74 10 150 0.066666667 80000 9 2 718 2 2 2 2 1 2 0 60 3 7 8 75 0 300 0 80000 9 2 0 2 2 2 2 2 2 0 44 3 2 8 76 19 150 0.126666667 3000 5 2 33172 1 2 2 1 1 2 16 11 2 4 9 77 12 150 0.08 62 1 1 0 1 1 2 1 1 2 16 21 2 4 9 78 10 452 0.022123894 3000 5 2 33172 1 2 2 1 2 2 16 1 2 1 9 79 10 450 0.022222222 62 1 1 0 1 1 2 1 2 2 16 17 2 1 9 80 15 149 0.100671141 69 1 1 0 1 2 1 1 1 1 16 10 2 2 9 81 37 150 0.246666667 69 1 1 0 1 2 1 1 1 2 14 17 2 5 9 82 44 300 0.146666667 625 1 2 0 1 2 2 1 2 1 0 34 2 8 9 83 21 150 0.14 625 1 1 0 1 2 2 1 1 2 6 33 2 5 9 84 14 150 0.093333333 10000 2 1 0 1 2 2 1 1 1 2 29 2 9 9 85 55 149 0.369127517 600 2 2 0 1 2 2 1 1 1 2 34 2 9 9 86 28 150 0.186666667 450 1 1 0 1 2 1 1 1 2 2 35 2 2 9 87 17 150 0.113333333 18.6 1 1 0 1 2 1 1 1 2 0 35 2 8 9 88 -------------------------------------------------------------------------------- /data/forest-health/foresthealth.bnd: -------------------------------------------------------------------------------- 1 | "82",5 2 | 15.9882,1.8882 3 | 15.9882,2.1118 4 | 16.2118,2.1118 5 | 16.2118,1.8882 6 | 15.9882,1.8882 7 | "83",5 8 | 15.9882,2.8882 9 | 15.9882,3.1118 10 | 16.2118,3.1118 11 | 16.2118,2.8882 12 | 15.9882,2.8882 13 | "79",5 14 | 14.9882,1.8882 15 | 14.9882,2.1118 16 | 15.2118,2.1118 17 | 15.2118,1.8882 18 | 14.9882,1.8882 19 | "81",5 20 | 14.9882,3.8882 21 | 14.9882,4.1118 22 | 15.2118,4.1118 23 | 15.2118,3.8882 24 | 14.9882,3.8882 25 | "78",5 26 | 13.9882,4.8882 27 | 13.9882,5.1118 28 | 14.2118,5.1118 29 | 14.2118,4.8882 30 | 13.9882,4.8882 31 | "62",5 32 | 10.1882,8.8882 33 | 10.1882,9.1118 34 | 10.4118,9.1118 35 | 10.4118,8.8882 36 | 10.1882,8.8882 37 | "55",5 38 | 9.6882,8.3882 39 | 9.6882,8.6118 40 | 9.9118,8.6118 41 | 9.9118,8.3882 42 | 9.6882,8.3882 43 | "80",5 44 | 14.9882,2.8882 45 | 14.9882,3.1118 46 | 15.2118,3.1118 47 | 15.2118,2.8882 48 | 14.9882,2.8882 49 | "77",5 50 | 13.9882,3.8882 51 | 13.9882,4.1118 52 | 14.2118,4.1118 53 | 14.2118,3.8882 54 | 13.9882,3.8882 55 | "75",5 56 | 12.9882,4.9882 57 | 12.9882,5.2118 58 | 13.2118,5.2118 59 | 13.2118,4.9882 60 | 12.9882,4.9882 61 | "72",5 62 | 12.0882,5.9882 63 | 12.0882,6.2118 64 | 12.3118,6.2118 65 | 12.3118,5.9882 66 | 12.0882,5.9882 67 | "61",5 68 | 10.0882,7.8882 69 | 10.0882,8.1118 70 | 10.3118,8.1118 71 | 10.3118,7.8882 72 | 10.0882,7.8882 73 | "76",5 74 | 13.9882,2.8882 75 | 13.9882,3.1118 
76 | 14.2118,3.1118 77 | 14.2118,2.8882 78 | 13.9882,2.8882 79 | "74",5 80 | 12.9882,3.8882 81 | 12.9882,4.1118 82 | 13.2118,4.1118 83 | 13.2118,3.8882 84 | 12.9882,3.8882 85 | "71",5 86 | 12.0882,4.9882 87 | 12.0882,5.2118 88 | 12.3118,5.2118 89 | 12.3118,4.9882 90 | 12.0882,4.9882 91 | "60",5 92 | 10.0882,6.9882 93 | 10.0882,7.2118 94 | 10.3118,7.2118 95 | 10.3118,6.9882 96 | 10.0882,6.9882 97 | "73",5 98 | 12.9882,2.8882 99 | 12.9882,3.1118 100 | 13.2118,3.1118 101 | 13.2118,2.8882 102 | 12.9882,2.8882 103 | "70",5 104 | 11.9882,3.8882 105 | 11.9882,4.1118 106 | 12.2118,4.1118 107 | 12.2118,3.8882 108 | 11.9882,3.8882 109 | "67",5 110 | 11.0882,4.8882 111 | 11.0882,5.1118 112 | 11.3118,5.1118 113 | 11.3118,4.8882 114 | 11.0882,4.8882 115 | "59",5 116 | 10.0882,5.8882 117 | 10.0882,6.1118 118 | 10.3118,6.1118 119 | 10.3118,5.8882 120 | 10.0882,5.8882 121 | "54",5 122 | 9.0882,6.9882 123 | 9.0882,7.2118 124 | 9.3118,7.2118 125 | 9.3118,6.9882 126 | 9.0882,6.9882 127 | "69",5 128 | 11.9882,2.8882 129 | 11.9882,3.1118 130 | 12.2118,3.1118 131 | 12.2118,2.8882 132 | 11.9882,2.8882 133 | "66",5 134 | 10.9882,3.8882 135 | 10.9882,4.1118 136 | 11.2118,4.1118 137 | 11.2118,3.8882 138 | 10.9882,3.8882 139 | "68",5 140 | 11.9882,1.9882 141 | 11.9882,2.2118 142 | 12.2118,2.2118 143 | 12.2118,1.9882 144 | 11.9882,1.9882 145 | "63",5 146 | 10.9882,0.988197 147 | 10.9882,1.2118 148 | 11.2118,1.2118 149 | 11.2118,0.988197 150 | 10.9882,0.988197 151 | "53",5 152 | 9.0882,5.8882 153 | 9.0882,6.1118 154 | 9.3118,6.1118 155 | 9.3118,5.8882 156 | 9.0882,5.8882 157 | "48",5 158 | 8.0882,6.9882 159 | 8.0882,7.2118 160 | 8.3118,7.2118 161 | 8.3118,6.9882 162 | 8.0882,6.9882 163 | "65",5 164 | 10.9882,2.8882 165 | 10.9882,3.1118 166 | 11.2118,3.1118 167 | 11.2118,2.8882 168 | 10.9882,2.8882 169 | "64",5 170 | 10.9882,1.8882 171 | 10.9882,2.1118 172 | 11.2118,2.1118 173 | 11.2118,1.8882 174 | 10.9882,1.8882 175 | "56",5 176 | 9.9882,0.988197 177 | 9.9882,1.2118 178 | 10.2118,1.2118 179 | 10.2118,0.988197 180 | 9.9882,0.988197 181 | "47",5 182 | 8.0882,5.9882 183 | 8.0882,6.2118 184 | 8.3118,6.2118 185 | 8.3118,5.9882 186 | 8.0882,5.9882 187 | "58",5 188 | 9.9882,2.9882 189 | 9.9882,3.2118 190 | 10.2118,3.2118 191 | 10.2118,2.9882 192 | 9.9882,2.9882 193 | "57",5 194 | 9.9882,1.9882 195 | 9.9882,2.2118 196 | 10.2118,2.2118 197 | 10.2118,1.9882 198 | 9.9882,1.9882 199 | "50",5 200 | 8.9882,0.988197 201 | 8.9882,1.2118 202 | 9.2118,1.2118 203 | 9.2118,0.988197 204 | 8.9882,0.988197 205 | "49",5 206 | 8.9882,0.388197 207 | 8.9882,0.611803 208 | 9.2118,0.611803 209 | 9.2118,0.388197 210 | 8.9882,0.388197 211 | "46",5 212 | 8.0882,4.8882 213 | 8.0882,5.1118 214 | 8.3118,5.1118 215 | 8.3118,4.8882 216 | 8.0882,4.8882 217 | "52",5 218 | 8.9882,2.9882 219 | 8.9882,3.2118 220 | 9.2118,3.2118 221 | 9.2118,2.9882 222 | 8.9882,2.9882 223 | "51",5 224 | 8.9882,1.9882 225 | 8.9882,2.2118 226 | 9.2118,2.2118 227 | 9.2118,1.9882 228 | 8.9882,1.9882 229 | "42",5 230 | 7.9882,0.988197 231 | 7.9882,1.2118 232 | 8.2118,1.2118 233 | 8.2118,0.988197 234 | 7.9882,0.988197 235 | "45",5 236 | 7.9882,3.8882 237 | 7.9882,4.1118 238 | 8.2118,4.1118 239 | 8.2118,3.8882 240 | 7.9882,3.8882 241 | "41",5 242 | 6.9882,4.9882 243 | 6.9882,5.2118 244 | 7.2118,5.2118 245 | 7.2118,4.9882 246 | 6.9882,4.9882 247 | "44",5 248 | 7.9882,2.9882 249 | 7.9882,3.2118 250 | 8.2118,3.2118 251 | 8.2118,2.9882 252 | 7.9882,2.9882 253 | "43",5 254 | 7.9882,1.9882 255 | 7.9882,2.2118 256 | 8.2118,2.2118 257 | 8.2118,1.9882 258 | 7.9882,1.9882 259 | "40",5 260 | 
6.9882,3.9882 261 | 6.9882,4.2118 262 | 7.2118,4.2118 263 | 7.2118,3.9882 264 | 6.9882,3.9882 265 | "39",5 266 | 6.9882,2.9882 267 | 6.9882,3.2118 268 | 7.2118,3.2118 269 | 7.2118,2.9882 270 | 6.9882,2.9882 271 | "38",5 272 | 6.9882,1.9882 273 | 6.9882,2.2118 274 | 7.2118,2.2118 275 | 7.2118,1.9882 276 | 6.9882,1.9882 277 | "37",5 278 | 6.0882,3.2882 279 | 6.0882,3.5118 280 | 6.3118,3.5118 281 | 6.3118,3.2882 282 | 6.0882,3.2882 283 | "36",5 284 | 6.0882,2.2882 285 | 6.0882,2.5118 286 | 6.3118,2.5118 287 | 6.3118,2.2882 288 | 6.0882,2.2882 289 | "33",5 290 | 5.0882,3.2882 291 | 5.0882,3.5118 292 | 5.3118,3.5118 293 | 5.3118,3.2882 294 | 5.0882,3.2882 295 | "35",5 296 | 5.2882,1.7882 297 | 5.2882,2.0118 298 | 5.5118,2.0118 299 | 5.5118,1.7882 300 | 5.2882,1.7882 301 | "31",5 302 | 5.0882,1.2882 303 | 5.0882,1.5118 304 | 5.3118,1.5118 305 | 5.3118,1.2882 306 | 5.0882,1.2882 307 | "25",5 308 | 4.0882,0.288197 309 | 4.0882,0.511803 310 | 4.3118,0.511803 311 | 4.3118,0.288197 312 | 4.0882,0.288197 313 | "34",5 314 | 5.1882,1.9882 315 | 5.1882,2.2118 316 | 5.4118,2.2118 317 | 5.4118,1.9882 318 | 5.1882,1.9882 319 | "32",5 320 | 5.0882,2.2882 321 | 5.0882,2.5118 322 | 5.3118,2.5118 323 | 5.3118,2.2882 324 | 5.0882,2.2882 325 | "21",5 326 | 3.5882,0.788197 327 | 3.5882,1.0118 328 | 3.8118,1.0118 329 | 3.8118,0.788197 330 | 3.5882,0.788197 331 | "18",5 332 | 3.1882,0.688197 333 | 3.1882,0.911803 334 | 3.4118,0.911803 335 | 3.4118,0.688197 336 | 3.1882,0.688197 337 | "15",5 338 | 3.0882,0.288197 339 | 3.0882,0.511803 340 | 3.3118,0.511803 341 | 3.3118,0.288197 342 | 3.0882,0.288197 343 | "28",5 344 | 4.4882,2.8882 345 | 4.4882,3.1118 346 | 4.7118,3.1118 347 | 4.7118,2.8882 348 | 4.4882,2.8882 349 | "30",5 350 | 4.5882,2.3882 351 | 4.5882,2.6118 352 | 4.8118,2.6118 353 | 4.8118,2.3882 354 | 4.5882,2.3882 355 | "29",5 356 | 4.5882,1.7882 357 | 4.5882,2.0118 358 | 4.8118,2.0118 359 | 4.8118,1.7882 360 | 4.5882,1.7882 361 | "26",5 362 | 4.0882,1.2882 363 | 4.0882,1.5118 364 | 4.3118,1.5118 365 | 4.3118,1.2882 366 | 4.0882,1.2882 367 | "27",5 368 | 4.0882,2.2882 369 | 4.0882,2.5118 370 | 4.3118,2.5118 371 | 4.3118,2.2882 372 | 4.0882,2.2882 373 | "22",5 374 | 3.5882,1.7882 375 | 3.5882,2.0118 376 | 3.8118,2.0118 377 | 3.8118,1.7882 378 | 3.5882,1.7882 379 | "16",5 380 | 3.0882,1.2882 381 | 3.0882,1.5118 382 | 3.3118,1.5118 383 | 3.3118,1.2882 384 | 3.0882,1.2882 385 | "23",5 386 | 3.5882,2.7882 387 | 3.5882,3.0118 388 | 3.8118,3.0118 389 | 3.8118,2.7882 390 | 3.5882,2.7882 391 | "20",5 392 | 3.4882,2.0882 393 | 3.4882,2.3118 394 | 3.7118,2.3118 395 | 3.7118,2.0882 396 | 3.4882,2.0882 397 | "19",5 398 | 3.1882,2.5882 399 | 3.1882,2.8118 400 | 3.4118,2.8118 401 | 3.4118,2.5882 402 | 3.1882,2.5882 403 | "17",5 404 | 3.0882,2.2882 405 | 3.0882,2.5118 406 | 3.3118,2.5118 407 | 3.3118,2.2882 408 | 3.0882,2.2882 409 | "12",5 410 | 2.5882,1.7882 411 | 2.5882,2.0118 412 | 2.8118,2.0118 413 | 2.8118,1.7882 414 | 2.5882,1.7882 415 | "2",5 416 | 1.0882,2.2882 417 | 1.0882,2.5118 418 | 1.3118,2.5118 419 | 1.3118,2.2882 420 | 1.0882,2.2882 421 | "1",5 422 | 0.588197,2.7882 423 | 0.588197,3.0118 424 | 0.811803,3.0118 425 | 0.811803,2.7882 426 | 0.588197,2.7882 427 | "13",5 428 | 2.5882,2.7882 429 | 2.5882,3.0118 430 | 2.8118,3.0118 431 | 2.8118,2.7882 432 | 2.5882,2.7882 433 | "3",5 434 | 1.0882,3.2882 435 | 1.0882,3.5118 436 | 1.3118,3.5118 437 | 1.3118,3.2882 438 | 1.0882,3.2882 439 | "6",5 440 | 1.4882,2.4882 441 | 1.4882,2.7118 442 | 1.7118,2.7118 443 | 1.7118,2.4882 444 | 1.4882,2.4882 445 | "7",5 446 | 
1.5882,2.7882 447 | 1.5882,3.0118 448 | 1.8118,3.0118 449 | 1.8118,2.7882 450 | 1.5882,2.7882 451 | "9",5 452 | 2.0882,2.2882 453 | 2.0882,2.5118 454 | 2.3118,2.5118 455 | 2.3118,2.2882 456 | 2.0882,2.2882 457 | "4",5 458 | 1.3882,3.4882 459 | 1.3882,3.7118 460 | 1.6118,3.7118 461 | 1.6118,3.4882 462 | 1.3882,3.4882 463 | "5",5 464 | 1.3882,4.8882 465 | 1.3882,5.1118 466 | 1.6118,5.1118 467 | 1.6118,4.8882 468 | 1.3882,4.8882 469 | "8",5 470 | 1.5882,3.7882 471 | 1.5882,4.0118 472 | 1.8118,4.0118 473 | 1.8118,3.7882 474 | 1.5882,3.7882 475 | "10",5 476 | 2.0882,3.2882 477 | 2.0882,3.5118 478 | 2.3118,3.5118 479 | 2.3118,3.2882 480 | 2.0882,3.2882 481 | "11",5 482 | 2.0882,4.2882 483 | 2.0882,4.5118 484 | 2.3118,4.5118 485 | 2.3118,4.2882 486 | 2.0882,4.2882 487 | "14",5 488 | 2.7882,3.9882 489 | 2.7882,4.2118 490 | 3.0118,4.2118 491 | 3.0118,3.9882 492 | 2.7882,3.9882 493 | "24",5 494 | 3.6882,3.9882 495 | 3.6882,4.2118 496 | 3.9118,4.2118 497 | 3.9118,3.9882 498 | 3.6882,3.9882 499 | -------------------------------------------------------------------------------- /data/forest-health/foresthealth.gra: -------------------------------------------------------------------------------- 1 | 83 2 | 82 3 | 2 4 | 2 1 5 | 83 6 | 2 7 | 0 7 8 | 79 9 | 2 10 | 7 0 11 | 81 12 | 2 13 | 7 8 14 | 78 15 | 2 16 | 8 9 17 | 62 18 | 2 19 | 11 6 20 | 55 21 | 2 22 | 11 5 23 | 80 24 | 4 25 | 2 12 3 1 26 | 77 27 | 4 28 | 12 13 4 3 29 | 75 30 | 3 31 | 13 14 4 32 | 72 33 | 1 34 | 14 35 | 61 36 | 3 37 | 15 6 5 38 | 76 39 | 3 40 | 16 8 7 41 | 74 42 | 4 43 | 16 17 9 8 44 | 71 45 | 4 46 | 17 18 10 9 47 | 60 48 | 3 49 | 19 20 11 50 | 73 51 | 3 52 | 21 13 12 53 | 70 54 | 4 55 | 21 22 14 13 56 | 67 57 | 2 58 | 22 14 59 | 59 60 | 2 61 | 25 15 62 | 54 63 | 3 64 | 25 26 15 65 | 69 66 | 4 67 | 23 27 17 16 68 | 66 69 | 3 70 | 27 18 17 71 | 68 72 | 2 73 | 28 21 74 | 63 75 | 2 76 | 29 28 77 | 53 78 | 3 79 | 30 20 19 80 | 48 81 | 2 82 | 30 20 83 | 65 84 | 4 85 | 28 31 22 21 86 | 64 87 | 4 88 | 24 32 27 23 89 | 56 90 | 4 91 | 33 34 32 24 92 | 47 93 | 3 94 | 35 26 25 95 | 58 96 | 3 97 | 32 36 27 98 | 57 99 | 4 100 | 29 37 31 28 101 | 50 102 | 4 103 | 34 38 37 29 104 | 49 105 | 3 106 | 38 33 29 107 | 46 108 | 3 109 | 39 40 30 110 | 52 111 | 3 112 | 37 41 31 113 | 51 114 | 4 115 | 33 42 36 32 116 | 42 117 | 3 118 | 42 34 33 119 | 45 120 | 3 121 | 41 43 35 122 | 41 123 | 2 124 | 43 35 125 | 44 126 | 4 127 | 42 44 39 36 128 | 43 129 | 4 130 | 38 45 41 37 131 | 40 132 | 4 133 | 44 46 40 39 134 | 39 135 | 5 136 | 45 46 47 43 41 137 | 38 138 | 3 139 | 47 44 42 140 | 37 141 | 4 142 | 47 48 44 43 143 | 36 144 | 6 145 | 49 52 53 46 45 44 146 | 33 147 | 4 148 | 53 58 57 46 149 | 35 150 | 6 151 | 52 53 50 58 59 47 152 | 31 153 | 5 154 | 59 60 53 52 49 155 | 25 156 | 4 157 | 54 55 56 60 158 | 34 159 | 8 160 | 53 50 58 59 57 61 49 47 161 | 32 162 | 9 163 | 50 58 59 57 61 48 52 49 47 164 | 21 165 | 6 166 | 55 63 56 62 51 60 167 | 18 168 | 6 169 | 63 56 54 62 51 60 170 | 15 171 | 4 172 | 63 55 54 51 173 | 28 174 | 7 175 | 61 64 59 58 53 48 52 176 | 30 177 | 10 178 | 59 57 61 64 62 65 53 48 52 49 179 | 29 180 | 10 181 | 57 61 60 62 65 58 50 53 52 49 182 | 26 183 | 9 184 | 51 62 54 65 55 63 61 59 50 185 | 27 186 | 11 187 | 60 64 62 65 66 67 57 59 58 53 52 188 | 22 189 | 12 190 | 54 65 66 55 67 63 68 64 60 61 59 58 191 | 16 192 | 8 193 | 56 68 67 55 65 54 62 60 194 | 23 195 | 8 196 | 62 65 66 67 71 61 57 58 197 | 20 198 | 11 199 | 66 67 63 71 68 62 64 60 61 59 58 200 | 19 201 | 8 202 | 67 71 68 75 65 62 64 61 203 | 17 204 | 9 205 | 63 71 68 75 66 65 62 
64 61 206 | 12 207 | 7 208 | 75 71 63 67 66 65 62 209 | 2 210 | 5 211 | 70 72 73 74 75 212 | 1 213 | 5 214 | 69 72 76 73 74 215 | 13 216 | 9 217 | 68 79 75 74 73 67 66 65 64 218 | 3 219 | 7 220 | 69 70 76 73 74 78 79 221 | 6 222 | 8 223 | 76 72 69 70 74 75 79 71 224 | 7 225 | 9 226 | 73 76 72 69 70 78 75 79 71 227 | 9 228 | 8 229 | 74 73 69 79 68 71 67 66 230 | 4 231 | 7 232 | 72 70 73 74 78 79 80 233 | 5 234 | 2 235 | 78 80 236 | 8 237 | 6 238 | 74 77 76 72 79 80 239 | 10 240 | 9 241 | 75 78 74 73 76 72 80 71 81 242 | 11 243 | 5 244 | 79 78 77 76 81 245 | 14 246 | 3 247 | 80 79 82 248 | 24 249 | 1 250 | 81 251 | -------------------------------------------------------------------------------- /data/mexdolphins/README.md: -------------------------------------------------------------------------------- 1 | Pantropical spotted dolphins in the Gulf of Mexico 2 | ================================================== 3 | 4 | Data copied from [this example DSM analysis](http://distancesampling.org/R/vignettes/mexico-analysis.html), after correcting for detectability. 5 | 6 | Data was collected by teh NOAA South East Fisheries Science Center (sic). The OBIS-SEAMAP page for the data may be found at the SEFSC [GoMex Oceanic 1996 survey page](http://seamap.env.duke.edu/dataset/25). 7 | 8 | ## Data citation 9 | 10 | Garrison, L. 2013. SEFSC GoMex Oceanic 1996. Data downloaded from OBIS-SEAMAP (http://seamap.env.duke.edu/dataset/25). 11 | -------------------------------------------------------------------------------- /data/mexdolphins/mexdolphins.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/data/mexdolphins/mexdolphins.RData -------------------------------------------------------------------------------- /data/mexdolphins/soap_pred.R: -------------------------------------------------------------------------------- 1 | # make soap pred grid 2 | 3 | library(mapdata) 4 | library(maptools) 5 | library(rgeos) 6 | library(rgdal) 7 | library(mgcv) 8 | 9 | load("mexdolphins.RData") 10 | 11 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 12 | 13 | usmap <- map('usa', plot=FALSE, xlim=c(-100,-80), ylim=c(22,33)) 14 | uspoly <- map2SpatialPolygons(usmap, IDs=usmap$names, 15 | proj4string=CRS("+proj=longlat +datum=WGS84")) 16 | 17 | # simplify the polygon 18 | uspoly <- gSimplify(uspoly, 0.08) 19 | 20 | 21 | # bounding box 22 | c1 = cbind(c(-97.5, -82, -82, -97.5), 23 | c(22.5, 22.5, 32, 32)) 24 | r1 = rbind(c1, c1[1, ]) # join 25 | P1 = Polygon(r1) 26 | Ps1 = Polygons(list(P1), ID = "a") 27 | 28 | SPs = SpatialPolygons(list(Ps1),proj4string=CRS("+proj=longlat +datum=WGS84")) 29 | 30 | # plot to check we got the right area 31 | #plot(uspoly)#, xlim=c(-100,-80), ylim=c(22,33)) 32 | #plot(SPs, add=TRUE) 33 | #points(mexdolphins[,c("longitude","latitude")]) 34 | 35 | # take the polygon difference 36 | soap_area <- gDifference(SPs,uspoly) 37 | # again check 38 | #plot(soap_area) 39 | #points(mexdolphins[,c("longitude","latitude")]) 40 | 41 | 42 | xy_bnd <- spTransform(soap_area, CRSobj=lcc_proj4) 43 | xy_bnd <- xy_bnd@polygons[[1]]@Polygons[[1]]@coords 44 | xy_bnd <- list(x=xy_bnd[,1], y=xy_bnd[,2]) 45 | 46 | gr <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), by=16469.27), 47 | y=seq(min(xy_bnd$y), max(xy_bnd$y), by=16469.27)) 48 | x <- gr[,1] 49 | y <- gr[,2] 50 | ind <- inSide(xy_bnd, 
x, y) 51 | 52 | soap_preddata <- as.data.frame(gr[ind,]) 53 | soap_preddata$area <- preddata$area[1] 54 | soap_preddata$off.set <- log(soap_preddata$area) 55 | 56 | 57 | save(xy_bnd, soap_preddata, file="soapy.RData") 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /data/mexdolphins/soapy.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/data/mexdolphins/soapy.RData -------------------------------------------------------------------------------- /data/routes.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/data/routes.csv -------------------------------------------------------------------------------- /data/yukon_seeds/seed_source_locations.csv: -------------------------------------------------------------------------------- 1 | Who,Site,Subsite,source_type,polygon,order,Y,X 2 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796723.731,469919.3107 3 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796742.626,469910.4723 4 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796729.702,469974.1999 5 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796725.89,470025.6419 6 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796719.851,470077.7116 7 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796725.935,470126.3351 8 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796718.948,470174.2659 9 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796702.393,470236.7358 10 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796694.535,470294.0962 11 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796742.568,470287.086 12 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796724.452,470303.7841 13 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796785.653,470258.4785 14 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796888.804,470191.593 15 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796964.678,470142.5823 16 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797042.968,470050.4508 17 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797046.439,470002.8427 18 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797040.834,469950.9495 19 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796996.21,469909.3531 20 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796975.594,469904.4163 21 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796888.762,469895.5375 22 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796850.814,469871.8622 23 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797168.951,469864.6928 24 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797133.442,469889.7501 25 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797067.112,470003.8641 26 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797045.311,469933.9624 27 | Eliot,FoxLake,SW_Facing,SSSample,NA,NA,6803025.398,465082.76 28 | Eliot,FoxLake,SW_Facing,SSSample,NA,NA,6803076.228,465133.385 29 | Eliot,FoxLake,Lowland,SSPolygon,38,1,6796715.73,470304.9674 30 | Eliot,FoxLake,Lowland,SSPolygon,38,2,6796728.455,470293.1938 31 | Eliot,FoxLake,Lowland,SSPolygon,38,3,6796753.325,470273.6796 32 | Eliot,FoxLake,Lowland,SSPolygon,38,4,6796788.911,470252.55 33 | Eliot,FoxLake,Lowland,SSPolygon,38,5,6796847.188,470216.8999 34 | Eliot,FoxLake,Lowland,SSPolygon,38,6,6796887.915,470191.8324 35 | Eliot,FoxLake,Lowland,SSPolygon,38,7,6796932.971,470156.1209 36 | Eliot,FoxLake,Lowland,SSPolygon,38,8,6796962.203,470143.2102 37 | Eliot,FoxLake,Lowland,SSPolygon,38,9,6796979.017,470128.2042 38 | Eliot,FoxLake,Lowland,SSPolygon,38,10,6797017.817,470096.0466 39 | 
Eliot,FoxLake,Lowland,SSPolygon,38,11,6797024.449,470083.6527 40 | Eliot,FoxLake,Lowland,SSPolygon,38,32,6797050.904,470081.0951 41 | Eliot,FoxLake,Lowland,SSPolygon,38,33,6796981.673,470133.1883 42 | Eliot,FoxLake,Lowland,SSPolygon,38,34,6796839.563,470223.1343 43 | Eliot,FoxLake,Lowland,SSPolygon,38,35,6796790.373,470257.3939 44 | Eliot,FoxLake,Lowland,SSPolygon,38,12,6797041.589,470051.2157 45 | Eliot,FoxLake,Lowland,SSPolygon,38,13,6797044.77,470038.4604 46 | Eliot,FoxLake,Lowland,SSPolygon,38,14,6797049.136,470021.4898 47 | Eliot,FoxLake,Lowland,SSPolygon,38,15,6797043.093,470009.7016 48 | Eliot,FoxLake,Lowland,SSPolygon,38,16,6797045.841,469991.8912 49 | Eliot,FoxLake,Lowland,SSPolygon,38,17,6797050.612,469984.3664 50 | Eliot,FoxLake,Lowland,SSPolygon,38,18,6797061.129,469968.7692 51 | Eliot,FoxLake,Lowland,SSPolygon,38,19,6797037.335,469950.6634 52 | Eliot,FoxLake,Lowland,SSPolygon,38,20,6797009.75,469923.8775 53 | Eliot,FoxLake,Lowland,SSPolygon,38,21,6797014.931,469917.5015 54 | Eliot,FoxLake,Lowland,SSPolygon,89,1,6796994.101,469909.7297 55 | Eliot,FoxLake,Lowland,SSPolygon,89,2,6796985.436,469906.6158 56 | Eliot,FoxLake,Lowland,SSPolygon,89,3,6796973.54,469906.5494 57 | Eliot,FoxLake,Lowland,SSPolygon,89,4,6796960.759,469898.896 58 | Eliot,FoxLake,Lowland,SSPolygon,89,5,6796919.57,469895.9869 59 | Eliot,FoxLake,Lowland,SSPolygon,89,6,6796906.707,469885.8498 60 | Eliot,FoxLake,Lowland,SSPolygon,89,7,6796855.959,469882.6563 61 | Eliot,FoxLake,Lowland,SSPolygon,89,8,6796848.594,469877.0291 62 | Eliot,FoxLake,Lowland,SSPolygon,89,15,6796843.691,469842.534 63 | Eliot,FoxLake,Lowland,SSPolygon,89,9,6796830.91,469827.1575 64 | Eliot,FoxLake,Lowland,SSPolygon,89,14,6796853.289,469799.318 65 | Eliot,FoxLake,Lowland,SSPolygon,89,12,6796817.835,469784.2797 66 | Eliot,FoxLake,Lowland,SSPolygon,89,10,6796812.723,469793.5348 67 | Eliot,FoxLake,Lowland,SSPolygon,89,13,6796821.899,469785.0872 68 | Eliot,FoxLake,Lowland,SSPolygon,89,11,6796814.147,469781.0331 69 | Eliot,FoxLake,Lowland,SSPolygon,159,1,6797215.359,469905.572 70 | Eliot,FoxLake,Lowland,SSPolygon,159,2,6797206.191,469912.9905 71 | Eliot,FoxLake,Lowland,SSPolygon,159,3,6797203.94,469943.5954 72 | Eliot,FoxLake,Lowland,SSPolygon,159,4,6797202.858,469895.6316 73 | Eliot,FoxLake,Lowland,SSPolygon,159,5,6797194.902,469889.1922 74 | Eliot,FoxLake,Lowland,SSPolygon,159,6,6797204.779,469878.4889 75 | Eliot,FoxLake,Lowland,SSPolygon,159,7,6797205.31,469870.0078 76 | Eliot,FoxLake,Lowland,SSPolygon,159,8,6797186.831,469856.0113 77 | Eliot,FoxLake,Lowland,SSPolygon,159,9,6797169.234,469850.342 78 | Eliot,FoxLake,Lowland,SSPolygon,159,10,6797150.315,469847.3502 79 | Eliot,FoxLake,Lowland,SSPolygon,159,11,6797135.268,469840.288 80 | Eliot,FoxLake,Lowland,SSPolygon,159,12,6797125.43,469837.8281 81 | Eliot,FoxLake,Lowland,SSPolygon,159,13,6797121.712,469844.4725 82 | Eliot,FoxLake,Lowland,SSPolygon,159,14,6797136.985,469852.2816 83 | Eliot,FoxLake,Lowland,SSPolygon,159,15,6797162.709,469858.8117 84 | Eliot,FoxLake,Lowland,SSPolygon,159,16,6797178.135,469866.1774 85 | Eliot,FoxLake,Lowland,SSPolygon,159,17,6797187.21,469872.7701 86 | Eliot,FoxLake,Lowland,SSPolygon,159,18,6797176.944,469888.4768 87 | Eliot,FoxLake,Lowland,SSPolygon,159,19,6797171.087,469910.6188 88 | Eliot,FoxLake,Lowland,SSPolygon,159,20,6797182.537,469924.3886 89 | Eliot,FoxLake,Lowland,SSPolygon,159,21,6797174.278,469956.3611 90 | Eliot,FoxLake,Lowland,SSPolygon,159,22,6797169.264,469966.2896 91 | Eliot,FoxLake,Lowland,SSPolygon,185,1,6797141.76,469950.5774 92 | 
Eliot,FoxLake,Lowland,SSPolygon,185,2,6797134.16,469928.8803 93 | Eliot,FoxLake,Lowland,SSPolygon,185,3,6797133.061,469886.2447 94 | Eliot,FoxLake,Lowland,SSPolygon,185,4,6797112.249,469847.2647 95 | Eliot,FoxLake,Lowland,SSPolygon,185,5,6797112.324,469865.5345 96 | Eliot,FoxLake,Lowland,SSPolygon,185,6,6797121.118,469891.7095 97 | Eliot,FoxLake,Lowland,SSPolygon,185,7,6797127.573,469922.8565 98 | Eliot,FoxLake,Lowland,SSPolygon,185,8,6797126.939,469938.0401 99 | Eliot,FoxLake,Lowland,SSPolygon,38,31,6797091.933,470016.4598 100 | Eliot,FoxLake,Lowland,SSPolygon,38,30,6797079.967,470007.3238 101 | Eliot,FoxLake,Lowland,SSPolygon,38,29,6797067.498,469997.9275 102 | Eliot,FoxLake,Lowland,SSPolygon,38,28,6797063.273,469988.7003 103 | Eliot,FoxLake,Lowland,SSPolygon,38,27,6797070.821,469974.1372 104 | Eliot,FoxLake,Lowland,SSPolygon,38,26,6797063.943,469959.6072 105 | Eliot,FoxLake,Lowland,SSPolygon,38,25,6797049.503,469942.8299 106 | Eliot,FoxLake,Lowland,SSPolygon,38,24,6797039.73,469931.6067 107 | Eliot,FoxLake,Lowland,SSPolygon,38,23,6797030.65,469922.4277 108 | Eliot,FoxLake,Lowland,SSPolygon,38,22,6797020.646,469910.829 109 | Eliot,FoxLake,Lowland,SSPolygon,233,1,6797526.926,469768.1144 110 | Eliot,FoxLake,Lowland,SSPolygon,233,2,6797486.016,469794.4904 111 | Eliot,FoxLake,Lowland,SSPolygon,233,3,6797400.062,469863.5996 112 | Eliot,FoxLake,SW_Facing,SSPolygon,380,1,6803013.014,465080.2516 113 | Eliot,FoxLake,SW_Facing,SSPolygon,380,2,6803089.116,465135.0063 114 | Eliot,FoxLake,SW_Facing,SSPolygon,389,1,6803262.384,464081.8498 115 | Eliot,FoxLake,SW_Facing,SSPolygon,389,2,6803618.396,464040.2266 116 | Eliot,FoxLake,SW_Facing,SSPolygon,391,1,6803587.361,464040.0052 117 | Eliot,FoxLake,SW_Facing,SSPolygon,391,2,6803174.081,464080.9813 118 | Eliot,FoxLake,Lowland,SSIsland,56,NA,6796740.729,470183.8429 119 | Eliot,FoxLake,Lowland,SSIsland,58,NA,6796737.425,470200.11 120 | Eliot,FoxLake,Lowland,SSIsland,65,NA,6796729.556,470238.3273 121 | Eliot,FoxLake,Lowland,SSIsland,67,NA,6796738.312,470243.0132 122 | Eliot,FoxLake,Lowland,SSIsland,68,NA,6796734.045,470246.1296 123 | Eliot,FoxLake,Lowland,SSIsland,97,NA,6796940.921,469902.9191 124 | Eliot,FoxLake,Lowland,SSIsland,98,NA,6796929.918,469896.6909 125 | Eliot,FoxLake,Lowland,SSIsland,99,NA,6796923.639,469901.6075 126 | Eliot,FoxLake,Lowland,SSIsland,114,NA,6796836.526,469792.1375 127 | Eliot,FoxLake,Lowland,SSIsland,156,NA,6796980.95,470060.4577 128 | Eliot,FoxLake,Lowland,SSIsland,182,NA,6797146.336,469972.2485 129 | Eliot,FoxLake,Lowland,SSIsland,183,NA,6797152.1,469950.1815 130 | Eliot,FoxLake,Lowland,SSIsland,184,NA,6797146.21,469953.2912 131 | Eliot,FoxLake,Lowland,SSIsland,212,NA,6796949.689,469802.0181 132 | Eliot,FoxLake,Lowland,SSIsland,213,NA,6796948.007,469797.7336 133 | Eliot,FoxLake,Lowland,SSIsland,214,NA,6796936.248,469789.6354 134 | Eliot,FoxLake,Lowland,SSIsland,231,NA,6797483.691,469755.6377 135 | Eliot,FoxLake,Lowland,SSIsland,232,NA,6797514.164,469726.8687 136 | Eliot,FoxLake,Lowland,SSIndividual,60,NA,6796747.059,470205.7828 137 | Eliot,FoxLake,Lowland,SSIndividual,61,NA,6796737.175,470215.1502 138 | Eliot,FoxLake,Lowland,SSIndividual,62,NA,6796734.953,470215.1313 139 | Eliot,FoxLake,Lowland,SSIndividual,63,NA,6796725.697,470212.2823 140 | Eliot,FoxLake,Lowland,SSIndividual,64,NA,6796727.524,470235.7011 141 | Eliot,FoxLake,Lowland,SSIndividual,66,NA,6796736.703,470238.8686 142 | Eliot,FoxLake,Lowland,SSIndividual,96,NA,6796948.488,469901.2824 143 | Eliot,FoxLake,Lowland,SSIndividual,203,NA,6797043.34,469928.7998 144 | 
Eliot,FoxLake,Lowland,SSIndividual,204,NA,6797039.158,469928.6471 145 | Eliot,FoxLake,Lowland,SSIndividual,208,NA,6797019.777,469904.491 146 | -------------------------------------------------------------------------------- /example-bivariate-timeseries-and-ti.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "More time series; bivariate smooths and ti()" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | fig_width: 10 10 | fig_height: 4 11 | --- 12 | 13 | ## Background 14 | 15 | This example is an extension of the Fort Lauderdale time series model. Here we look at two additional advanced features, namely 16 | 17 | i. allowing the seasonal and trend terms to interact, and 18 | ii. setting up an ANOVA-like decomposition for interaction smooth using `ti()` terms 19 | 20 | The model in the earlier example estimated a constant non-linear seasonal (within-year) effect and a constant non-linear trend (between year) effect. Such a model is unable to capture any potential change in the seasonal effect (the within-year distribution of temperatures) over time. Or, put another way, is unable to capture potentially different trends in temperature are particular times of the year. 21 | 22 | This example extends the simpler model to more generally model changes in temperature. 23 | 24 | ### Loading and viewing the data 25 | 26 | The time series we'll look at is a long-term temperature and precipitation 27 | data set from the convention center here at Fort Lauderdale, consisting of 28 | monthly mean temperature (in degrees c), the number of days it rained that 29 | month, and the amount of precipitation that month (in mm). The data range from 30 | 1950 to 2015, with a column for year and month (with Jan. = 1). The data is from 31 | a great web-app called FetchClim2, and the link for the data [I used is 32 | here](http://fetchclimate2.cloudapp.net/#page=geography&dm=values&t=years&v=airt(13,6,2,1),prate(12,7,2,1),wet(1)&y=1950:1:2015&dc=1,32,60,91,121,152,182,213,244,274,305,335,366&hc=0,24&p=26.099,-80.123,Point%201&ts=2016-07-17T21:57:11.213Z). 33 | 34 | Here we'll load it and plot the air temperature data. 35 | 36 | ```{r, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE, highlight = TRUE} 37 | library("mgcv") 38 | library("ggplot2") 39 | florida <- read.csv("./data/time_series/Florida_climate.csv", 40 | stringsAsFactors = FALSE) 41 | head(florida) 42 | 43 | ggplot(data = florida, aes(x = month, y = air_temp, color = year, group = year)) + 44 | geom_point() + 45 | geom_line() 46 | ``` 47 | 48 | ## Fitting the smooth interaction model 49 | 50 | Following from the earlier example, we fit a smooth interaction model using a tensor product spline of `year` and `month`. Notice how we specify the marginal basis types as a character vector. Here I restrict the model to AR(1) errors as this was the best model in the earlier example and it is unlikely that additional unmodelled temporal structure will turn up when we use a more general model! 
51 | 52 | ```{r fit-smooth-interaction, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE} 53 | m.full <- gamm(air_temp ~ te(year, month, bs = c("tp", "cc")), data = florida, 54 | correlation = corAR1(form = ~ 1 | year), 55 | knots = list(month = c(0.5, 12.5))) 56 | layout(matrix(1:3, nrow = 1)) 57 | plot(m.full$gam, scheme = 1, theta = 40) 58 | plot(m.full$gam, scheme = 1, theta = 80) 59 | plot(m.full$gam, scheme = 1, theta = 120) 60 | layout(1) 61 | summary(m.full$gam) 62 | ``` 63 | 64 | The plot of the tensor product smooth is quite illustrative of what the model is attempting to do. The earlier model would be forced to retain the same seasonal curve throughout the time series. Now, with the more complex model, the seasonal curve is allowed to change shape smoothly through time. 65 | 66 | Our task now is to test whether this additional complexity is worth the extra effort. Are some months warming more than others in Fort Lauderdale? 67 | 68 | ## Fitting the ANOVA-decomposed model 69 | 70 | The logical thing to do to test whether some months are warming more than others would be to drop the interaction term from the model and compare the simpler and more complex models with AIC or a likelihood ratio test. But if you look back at the model we just fitted, there is no term in that model that relates just to the interaction! The tensor product contains both the main and interactive effects. What's worse is that we can't (always) compare this model with the one we fitted in the earlier example because we need to be careful to ensure the models are properly nested. This is where the `ti()` smoother comes in. 71 | 72 | A `ti()` smoother is like a standard tensor product (`te()`) smooth, but without the main effects smooths. Now we can build a model with separate main effects smooths for `year` and `month` and add in the interaction smooth using `ti()`. This will allow us to fit two properly nested models that can be compared to test our null hypothesis that the months are warming at the same rate. 73 | 74 | Let's fit those models 75 | 76 | ```{r fit-anova-decomp, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE} 77 | m.main <- gam(air_temp ~ s(year, bs = "tp") + s(month, bs = "cc"), data = florida, 78 | knots = list(month = c(0.5, 12.5))) 79 | m.int <- gam(air_temp ~ ti(year, bs = "tp") + ti(month, bs = "cc") + 80 | ti(year, month, bs = c("tp", "cc"), k = 11), data = florida, 81 | knots = list(month = c(0.5, 12.5))) 82 | ``` 83 | 84 | If you do this with `gamm()`, the model fitting will fail, and for good reason. At least the two `gam()` models converged, so we can see what is going on. Compare the two models using `AIC()` 85 | 86 | ```{r aic-anova-decomp, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE} 87 | AIC(m.main, m.int) 88 | ``` 89 | 90 | AIC tells us that there is some support for the interaction model, but only when we force a more complex surface by specifying `k`. 91 | 92 | Regardless, the interaction term only affords a >1% improvement in deviance explained; whilst the interaction may be favoured by AIC, any variation in the seasonal effect over time must be small. 93 | 94 | ## Exercises 95 | 96 | 1. 
For an example where there is a stronger change in the seasonal effect, see my two blog posts on applying these models to the Central England Temperature series: 97 | - [Post 1](http://www.fromthebottomoftheheap.net/2015/11/21/climate-change-and-spline-interactions/) 98 | - [Post 2](http://www.fromthebottomoftheheap.net/2015/11/23/are-some-seasons-warming-more-than-others/) 99 | 100 | These two posts take you through a full analysis of this data series. 101 | -------------------------------------------------------------------------------- /example-forest-health.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Markov random field smooths & German forest health data" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | --- 10 | 11 | # Introduction 12 | 13 | In this example, we look at a spatial data set of forest health data from a stand in Germany. We can treat the individual trees as discrete spatial units and fit a spatial term using a special type of spline basis, a Markov random field. 14 | 15 | The data come from a project n the forest of Rothenbuch, Germany, which has been ongoing since 1982. The project studies five species but here we just consider the beech data. Each year the condition of each of 83 trees is categorized into one of nine ordinal classes in terms of the degree (percentage) of defoliation. A defoliation value of 0% indicates a healthy beech tree, whilst a value of 100% indicates a tree is dead. For the purposes of this example, several of the nine ordinal classes are merged to form a three class response variable of interest 16 | 17 | 1. 0%, healthy trees that I'll refer to as the `"low"` defoliation group 18 | 2. 12.5% -- 37.5% defoliation, which I refer to as the `"med"` group, and 19 | 3. 50% -- 100% defoliation, which is the `"high"` group. 20 | 21 | The original classes were `r paste0(c(0,12.5,37.5,50,62.5,75, 87.5, 100), "%")` and are in variable `defol`. 22 | 23 | Alongside the response variable, a number of continuous and categorical covariates are recorded 24 | 25 | - `id` --- location ID number 26 | - `year` --- year of recording 27 | - `x` and `y` --- the x- and y-coordinates of each location 28 | - `age` --- average age of trees at a location 29 | - `canopyd` --- canopy density at the location, in % 30 | - `gradient` --- slope, in % 31 | - `alt` --- altitude above sea level in m 32 | - `depth` --- soil depth in cm 33 | - `ph` --- soil pH at 0--2cm depth 34 | - `watermoisture` --- soil moisture in three categories 35 | 1. `1` moderately dry 36 | 2. `2` moderately moist 37 | 3. `3` moist or temporarily wet 38 | - `alkali` --- fraction of alkali ions in four categories 39 | 1. `1` very low 40 | 2. `2` low 41 | 3. `3` moderate 42 | 4. `4` high 43 | - `humus` thickness of the soil humus layer in five categories 44 | 1. `0` 0cm 45 | 2. `1` 1cm 46 | 3. `2` 2cm 47 | 4. `3` 3cm 48 | 5. `4` $>$3cm 49 | - `type` --- type of forest 50 | - `0` deciduous forest 51 | - `1` mixed forest 52 | - `fert` --- fertilization 53 | - `0` not fertilized 54 | - `1` fertilized 55 | 56 | The aim of the example is to investigate the effect of the measured covariates on the degree of defoliation and to quantify any temporal trend and spatial effect of geographic location in the data set, while adjusting for the effects of the other covariates. 
57 | 58 | The data are extensively analysed in Fahrmeir, Kneib, Lang, and Marx (2013) *Regression: Models, Methods and Applications*. Springer. 59 | 60 | # Getting started 61 | 62 | Begin by loading the packages we'll use in this example 63 | 64 | ```{r load-packages} 65 | ## Load packages 66 | library("mgcv") 67 | library("ggplot2") 68 | ``` 69 | 70 | Next, load the forest health data set 71 | 72 | ```{r read-data} 73 | ## Read in the forest health data 74 | forest <- read.table("./data/forest-health/beech.raw", header = TRUE, na.strings = ".") 75 | head(forest) 76 | ``` 77 | 78 | The data require a little processing to ensure they are correctly coded. The chunk below does 79 | 80 | 1. Makes a nicer stand label so that we can sort stands numerically even though the id is character, 81 | 2. Aggregates the defoliation data into an ordered categorical variable for low, medium, or high levels of defoliation. This is converted to a numeric variable for modelling, 82 | 3. Convert the categorical variables to factors. `humus` is converted to a 4-level factor, 83 | 4. Remove some `NA`s, and 84 | 5. Summarises the response variable in a table. 85 | 86 | ```{r process-forest-data} 87 | forest <- transform(forest, id = factor(formatC(id, width = 2, flag = "0"))) 88 | 89 | ## Aggregate defoliation & convert categorical vars to factors 90 | levs <- c("low","med","high") 91 | forest <- transform(forest, 92 | aggDefol = as.numeric(cut(defol, breaks = c(-1,10,45,101), 93 | labels = levs)), 94 | watermoisture = factor(watermoisture), 95 | alkali = factor(alkali), 96 | humus = cut(humus, breaks = c(-0.5, 0.5, 1.5, 2.5, 3.5), 97 | labels = 1:4), 98 | type = factor(type), 99 | fert = factor(fert)) 100 | forest <- droplevels(na.omit(forest)) 101 | head(forest) 102 | with(forest, table(aggDefol)) 103 | ``` 104 | 105 | To view the spatial arrangement of the forest stand, plot the unique `x` and `y` coordinate pairs 106 | 107 | ```{r plot-forest-stand} 108 | ## Plot data 109 | ggplot(unique(forest[, c("x","y")]), aes(x = x, y = y)) + 110 | geom_point() 111 | ``` 112 | 113 | # Non-spatial ordered categorical GAM 114 | 115 | Our first model will ignore the spatial component of the data. All continuous variables except `ph` and `depth` are modelled using splines, and all categorical variables are included in the model. Note that we can speed up fitting by turning on some multi-threaded parallel processing in parts of the fitting algorithm via the `nthreads` control argument. The `ocat` family is used and we specify that there are three classes. Smoothness selection is via REML. 116 | 117 | ```{r fit-naive-model} 118 | ## Model 119 | ctrl <- gam.control(nthreads = 3) 120 | forest.m1 <- gam(aggDefol ~ ph + depth + watermoisture + alkali + humus + type + fert + 121 | s(age) + s(gradient, k = 20) + s(canopyd) + s(year) + s(alt), 122 | data = forest, family = ocat(R = 3), method = "REML", 123 | control = ctrl) 124 | ``` 125 | 126 | The model summary 127 | 128 | ```{r summary-naive} 129 | summary(forest.m1) 130 | ``` 131 | 132 | indicates all model terms are significant at the 95% level, especially the smooth terms. Don't pay too much attention to this though as we have not yet accounted for spatial structure in the data and several of the variables are likely to be spatially autocorrelated and hence the identified effects may be spurious. 133 | 134 | This is somewhat confirmed by the form of the fitted functions; rather than smooth, monotonic functions man terms are highly non-linear and difficult to interpret. 
135 | 136 | ```{r plot-naive-smooths} 137 | plot(forest.m1, pages = 1, shade = 2, scale = 0) 138 | ``` 139 | We might expect that older trees are more susceptible to damage, yet confusingly there is a decreasing effect for the very oldest trees. The smooth of `gradient` is uninterpretable. Trees in low and high altitude areas are less damaged than those in areas of intermediate elevation, which is also counterintuitive. 140 | 141 | Running `gam.check()` 142 | 143 | ```{r gam-check-naive} 144 | gam.check(forest.m1) 145 | ``` 146 | 147 | suggests no major problems, but the residual plot is difficult to interpret owing to the categorical nature of the response variable. The printed output suggests some smooth terms may need their basis dimension increasing. Before we do this, however, we should add a spatial effect to the model. 148 | 149 | # Spatial GAM via an MRF smooth 150 | 151 | For these data, it would be more natural to fit a spatial effect via a 2-d smoother as we've considered in other examples. However, we can consider the trees as being discrete spatial units and fit a spatial effect via a Markov random field (MRF) smooth. To fit the MRF smoother, we need information on the neighbours of each tree. In this example, any tree within 1.8km of a focal tree was considered a neighbour of that focal tree. This neighbourhood information is stored in a BayesX graph file, which we can convert into the format needed for **mgcv**'s MRF basis function. To facilitate reading the graph file, load the utility function `gra2mgcv()` 152 | 153 | ```{r load-graph-fun} 154 | ## source graph reading function 155 | source("./code_snippets/gra2mgcv.R") 156 | ``` 157 | 158 | Next we load the `.gra` file and do some manipulations to match the `forest` environmental data file 159 | 160 | ```{r load-graph} 161 | ## Read in graph file and output list required by mgcv 162 | nb <- gra2mgcv("./data/forest-health/foresthealth.gra") 163 | nb <- nb[order(as.numeric(names(nb)))] 164 | names(nb) <- formatC(as.numeric(names(nb)), width = 2, flag = "0") 165 | ``` 166 | 167 | Look at the structure of `nb`: 168 | 169 | ```{r head-nb} 170 | head(nb) 171 | ``` 172 | 173 | In **mgcv** the MRF basis can be specified by one or more of 174 | 175 | 1. `polys`; coordinates of vertices defining spatial polygons for each discrete spatial unit. Any two spatial units that share one or more vertices are considered neighbours, 176 | 2. `nb`; a list with one component per spatial unit, where each component contains indices of the neighbouring components of the current spatial unit, and/or 177 | 3. `penalty`; the actual penalty matrix of the MRF basis. This is an $N$ by $N$ matrix with the number of neighbours of each unit on the diagonal, and the $j$th column of the $i$th row set to `-1` if the $j$th spatial unit is a neighbour of the $i$th unit. Elsewhere the penalty matrix is all `0`s. 178 | 179 | **mgcv** will create the penalty matrix for you if you supply `polys` or `nb`. As we don't have polygons here, we'll use the information from the `.gra` file converted to the format needed by `gam()` to indicate the neighbourhood. 180 | 181 | The `nb` list is passed along using the `xt` argument of the `s()` function. For the MRF basis, `xt` is a named list with one or more of the components listed above. In the model call below we use `xt = list(nb = nb)` to pass on the neighbourhood list. 
To indicate that an MRF smooth is needed, we use `bs = "mrf"` and the covariate of the smooth is the factor indicating to which spatial unit each observation belongs. In the call below we use the `id` variable. Note that this doesn't need to be a factor, just something that can be coerced to one. 182 | 183 | All other aspects of the model fit remain the same hence we use `update()` to add the MRF smooth without repeating everything from the original call. 184 | 185 | ```{r fit-mrf-model} 186 | ## Fit model with MRF 187 | ## forest.m2 <- gam(aggDefol ~ ph + depth + watermoisture + alkali + humus + type + fert + 188 | ## s(age) + s(gradient, k = 20) + s(canopyd) + s(year) + s(alt) + 189 | ## s(id, bs = "mrf", xt = list(nb = nb)), 190 | ## data = forest, family = ocat(R = 3), method = "REML", 191 | ## control = ctrl) 192 | forest.m2 <- update(forest.m1, . ~ . + s(id, bs = "mrf", xt = list(nb = nb))) 193 | ``` 194 | 195 | Look at the model summary: 196 | 197 | ```{r summary-mrf-model} 198 | summary(forest.m2) 199 | ``` 200 | 201 | What differences do you see between the model with the MRF spatial effect and the first model that we fitted? 202 | 203 | Quickly create plots of the fitted smooth functions 204 | 205 | ```{r plot-mrf-smooths} 206 | plot(forest.m2, pages = 1, shade = 2, scale = 0, seWithMean = TRUE) 207 | ``` 208 | 209 | Notice that here we use `seWithMean = TRUE` as one of the terms has been shrunk back to a linear function and would otherwise have a *bow tie* confidence interval. 210 | 211 | Compare these fitted functions with those from the original model. Are these more in keeping with expectations? 212 | 213 | Model diagnostics are difficult with models for discrete responses such as these. Instead we can interrogate the model to derive quantities of interest that illustrate or summarise the model fit. First we start by generating posterior probabilities for the three ordinal defoliation/damage categories using `predict()`. This is similar to the code Eric showed this morning for the ordered categorical family. 214 | 215 | ```{r fitted-values} 216 | fit <- predict(forest.m2, type = "response") 217 | colnames(fit) <- levs 218 | head(fit) 219 | ``` 220 | 221 | The `predict()` method returns a 3-column matrix, one column per category. The entries in the matrix are the posterior probabilities of class membership, with rows summing to 1. As we'll see later, we can take a *majority wins* approach and assign each observation a fitted class membership and compare these to the known classes. 222 | 223 | For easier visualisation it would be nicer to have these fitted probabilities in a tidy form, which we now do via a few manipulations 224 | 225 | ```{r process-fitted-values} 226 | fit <- setNames(stack(as.data.frame(fit)), c("Prob", "Defoliation")) 227 | fit <- transform(fit, Defoliation = factor(Defoliation, labels = levs)) 228 | fit <- with(forest, cbind(fit, x, y, year)) 229 | head(fit) 230 | ``` 231 | 232 | The fitted values are now ready for plotting. 
Here we plot just the years 2000--2004, showing the spatial arrangement of trees, faceted by defoliation/damage category: 233 | 234 | ```{r plot-fitted} 235 | ggplot(subset(fit, year >= 2000), aes(x = x, y = y, col = Prob)) + 236 | geom_point(size = 1.2) + 237 | facet_grid(Defoliation ~ year) + 238 | coord_fixed() + 239 | theme(legend.position = "top") 240 | ``` 241 | 242 | A more complex use of the model involves predicting change in class posterior probability over time whilst holding all other variables at the observed mean or category mode 243 | 244 | ```{r timeseries-predictions} 245 | N <- 200 246 | pdat <- with(forest, expand.grid(year = seq(min(year), max(year), length = N), 247 | age = mean(age, na.rm = TRUE), 248 | gradient = mean(gradient, na.rm = TRUE), 249 | canopyd = mean(canopyd, na.rm = TRUE), 250 | alt = mean(alt, na.rm = TRUE), 251 | depth = mean(depth, na.rm = TRUE), 252 | ph = mean(ph, na.rm = TRUE), 253 | humus = factor(2, levels = levels(humus)), 254 | watermoisture = factor(2, levels = levels(watermoisture)), 255 | alkali = factor(2, levels = levels(alkali)), 256 | type = factor(0, levels = c(0,1)), 257 | fert = factor(0, levels = c(0,1)), 258 | id = levels(id))) 259 | pred <- predict(forest.m2, newdata = pdat, type = "response", exclude = "s(id)") 260 | colnames(pred) <- levs 261 | pred <- setNames(stack(as.data.frame(pred)), c("Prob","Defoliation")) 262 | pred <- cbind(pred, pdat) 263 | pred <- transform(pred, Defoliation = factor(Defoliation, levels = levs)) 264 | ``` 265 | 266 | A plot of the predictions is produced using 267 | 268 | ```{r plot-timeseries-predictions} 269 | ggplot(pred, aes(x = year, y = Prob, colour = Defoliation)) + 270 | geom_line() + 271 | theme(legend.position = "top") 272 | ``` 273 | 274 | Using similar code, produce a plot for the effect of `age` holding other values at the mean or mode. 275 | 276 | The final summary of the model that we'll produce is a confusion matrix of observed and predicted class membership. Technically, this is over-optimistic as we are using the same data to both fit and test the model, but it is an illustration of how well the model does at fitting the observed classes. 277 | 278 | ```{r confusion-matrix} 279 | fit <- predict(forest.m2, type = "response") 280 | fitClass <- factor(levs[apply(fit, 1, which.max)], levels = levs) 281 | obsClass <- with(forest, factor(aggDefol, labels = levs)) 282 | sum(fitClass == obsClass) / length(fitClass) 283 | table(fitClass, obsClass) 284 | ``` 285 | 286 | Just don't take it as any indication of how well the model will do at predicting the defoliation class for a new year or tree. 287 | 288 | How much better does the model with the MRF spatial effect fit the data compared to the original model that we fitted? We can use AIC as one means of answering that question 289 | 290 | ```{r aic} 291 | AIC(forest.m1, forest.m2) 292 | ``` 293 | 294 | Which model does AIC indicate as fitting the data best? 295 | 296 | ## Exercise 297 | 298 | As an additional exercise, replace the MRF smooth with a 2-d spline of the `x` and `y` coordinates and produce maps of the class probabilities over the spatial domain. You'll need to use ideas and techniques from some of the other spatial examples in order to complete this task. One possible starting point is sketched below.
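One possible starting point is the sketch below (not evaluated here). It simply swaps the MRF term for a 2-d thin plate spline of the coordinates and reuses the tidying and plotting steps shown above; the object names `forest.m3` and `fit3` are placeholders, and this is only one of several reasonable ways to approach the exercise.

```{r xy-spline-sketch, eval=FALSE}
## Sketch: replace the MRF smooth with a 2-d spline of the coordinates
forest.m3 <- update(forest.m1, . ~ . + s(x, y))

## Posterior class probabilities, tidied exactly as for the MRF model above
fit3 <- predict(forest.m3, type = "response")
colnames(fit3) <- levs
fit3 <- setNames(stack(as.data.frame(fit3)), c("Prob", "Defoliation"))
fit3 <- transform(fit3, Defoliation = factor(Defoliation, labels = levs))
fit3 <- with(forest, cbind(fit3, x, y, year))

## Map the class probabilities over the spatial domain, faceted by class
ggplot(subset(fit3, year >= 2000), aes(x = x, y = y, col = Prob)) +
    geom_point(size = 1.2) +
    facet_grid(Defoliation ~ year) +
    coord_fixed() +
    theme(legend.position = "top")
```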
299 | -------------------------------------------------------------------------------- /example-linear-functionals.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Linear functionals; or: what to do when you've got a bunch of x-values and only one y" 3 | author: Eric Pedersen 4 | output: 5 | html_document: 6 | toc: true 7 | toc_float: true 8 | theme: readable 9 | highlight: haddock 10 | --- 11 | 12 | ##1. Background 13 | 14 | Up to now, we've been looking at data where every predictor we're using to 15 | predict our y-values consists of a single value for each y. However, not all 16 | predictor data looks like this. In many cases, a single predictor may have 17 | several values associated with the same outcome variable. For instance, let's 18 | say we've measured total ecosystem production in several different lakes, as 19 | well as temperature along a gradient away from the shoreline in each lake. We want to know: 20 | 21 | 1. how well temperature predicts production 22 | 2. whether temperatures at different distances from the shore have different effects on 23 | production (as production varies a lot between the littoral and pelagic zones). 24 | 25 | Now, it looks like there might be a few ways to answer this in a standard gam 26 | setting. First we could just average temperatures across locations and use that 27 | as a predictor to answer question 1. However, that doesn't give us any insight 28 | into question 2, and it's pretty easy to imagine a case where warmer temperatures 29 | at the shore increase production, but warmer temperatures in the middle of the 30 | lake have very little effect, or where a single warm area could substantially 31 | increase production even in a cold lake. We could also try to fit a separate 32 | smooth of production on temperature at each shore distance. However, if we have 33 | a lot of distances, this ends up fitting a bunch of separate smooths, and we'd 34 | rapidly run out of degrees of freedom. Also, it would mean throwing away 35 | information; we'd expect that the effect of temperature a meter away from the 36 | shore should be very similar to the effect of temperature right at the 37 | shoreline, but by fitting a separate smooth for each, we're ignoring that (and 38 | likely going to suffer from concurvity issues to boot!). 39 | 40 | What we need is a method that can account for the fact that we have multiple x 41 | values for a given predictor. Fortunately, `mgcv` can handle this pretty easily. 42 | It does this by allowing you to pass matrix-valued predictors to `s()` and `te()` 43 | terms. When you pass `mgcv` a matrix, it will fit the same smooth function to 44 | each of the columns of the matrix, then sum across the rows of the matrix of 45 | transformed values to give the estimated mean value for a given `y`. Let's say 46 | `y` is a one-dimensional outcome, with n measured values, and `x` is a matrix 47 | with n rows and k columns. The predicted values for the ith value of `y` would 48 | be: $y_i \sim \sum_{j=1}^k f(x_{i,j})$, where $f(x_{i,j})$ is itself a sum of 49 | basis functions multiplied by model coefficients as before. 50 | 51 | In mathematical terms, this makes our smooth term a *functional*, where our 52 | outcome is a function of a vector of terms rather than a single term, and a 53 | linear functional, as it's a linear sum of smooth terms for each column; but you 54 | really don't need to know much about functionals to use these.
I only mention 55 | the functional thing because, if you want to dig into this approach further or if 56 | you want to read the help files on these, you have to search 57 | `?linear.functional.terms`. 58 | 59 | There are a few major uses I've found for these. There are likely more than this, 60 | but these are all cases I've encountered. Also, each of these cases requires you 61 | to set up your predictor matrices carefully, so make sure you know what kind of 62 | functional you'll be fitting, and what predictor matrix is going where in the `s()` 63 | function. 64 | 65 | ### Case 1. Nonlinear averaging. 66 | The first case, and the simplest to fit, is to estimate 67 | a nonlinear average. Let's look at the lake example from before. Say we think 68 | that distance from shore shouldn't matter, but that production increases 69 | non-linearly with temperature. In that case, a cold lake with a few hot spots 70 | may actually be more productive than a lake that's consistently lukewarm. 71 | Therefore, if you average over temperatures before estimating a function, you'll 72 | miss this effect (as we know from statistics that the average of a non-linear 73 | function applied to x does not, in general, equal the function applied to the 74 | average of x). In this case, we can just provide a matrix of values (the predictor matrix) to `s()`: here, the predictor matrix is a matrix of temperatures where each column is the temperature measured at one of our sites. 75 | We do this in the code below with the variable `temp_matrix`. 76 | 77 | 78 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE} 79 | library(dplyr) 80 | library(mgcv) 81 | library(ggplot2) 82 | library(tidyr) 83 | n_lakes = 200 84 | lake_mean_temps = rnorm(n_lakes, 25,6) 85 | lake_data = as.data.frame(expand.grid(lake = 1:n_lakes, 86 | shore_dist_m = c(0,1,5,10,20,40))) 87 | lake_data$temp = rnorm(n_lakes*6, lake_mean_temps[lake_data$lake], 8) 88 | lake_data$production = 5*dnorm(lake_data$temp, 30, sd = 5)/dnorm(1,0,5)+rnorm(n_lakes*6, 0, 1) 89 | lake_data = lake_data %>% group_by(lake)%>% 90 | mutate(production = mean(production), 91 | mean_temp = mean(temp))%>% 92 | spread(shore_dist_m, temp) 93 | 94 | 95 | lake_data$temp_matrix = lake_data %>% 96 | select(`0`:`40`) %>% 97 | as.matrix(.) 98 | head(lake_data) 99 | 100 | 101 | mean_temp_model = gam(production~s(mean_temp),data=lake_data) 102 | nonlin_temp_model = gam(production~s(temp_matrix), 103 | data=lake_data) 104 | 105 | layout(matrix(1:2, nrow=1)) 106 | plot(mean_temp_model) 107 | plot(nonlin_temp_model) 108 | layout(1) 109 | temp_predict_data = data.frame(temp = seq(10,40,length=100), 110 | mean_temp = seq(10,40,length=100)) 111 | 112 | temp_matrix = matrix(seq(10,40,length=100),nrow= 100,ncol=6, 113 | byrow = F) 114 | 115 | temp_predict_data = temp_predict_data %>% 116 | mutate(linear_average = as.numeric(predict(mean_temp_model,.)), 117 | nonlinear_average = as.numeric(predict(nonlin_temp_model, .)), 118 | true_function = 5*dnorm(temp, 30, sd = 5)/dnorm(1,0,5))%>% 119 | gather(model, value,linear_average:true_function) 120 | 121 | #This plots the two fitted models vs. the true model. 122 | ggplot(aes(temp, value,color=model), data=temp_predict_data)+ 123 | geom_line()+ 124 | scale_color_brewer(palette="Set1")+ 125 | theme_bw(20) 126 | ``` 127 | 128 | The nonlinear model is substantially more uncertain at the ends, 129 | but it fits the true function much more effectively.
The function from 130 | the average data substantially underestimates the effect of temperature on 131 | production. 132 | 133 | ### Case 2: Weighted averaging 134 | The next major case where linear functionals are useful is for weighted 135 | averages. Here, you have some predictor variable measured at various points at 136 | different distances or lags from a given point, or at different locations along 137 | some gradient. This could be a variable that's been measured at various distances 138 | away from each observed site, and you want to understand at what scale that 139 | variable will affect your parameter of interest. It could also be a predictor 140 | variable measured at several time points before the observation, and you want to 141 | know at what lags the two variables interact. 142 | 143 | In this case, we'll assume that the relationship between our variable of 144 | interest (x) and the outcome is linear at any given lag, but that linear 145 | relationship changes smoothly with the lag. We'll look at the case where both 146 | relationships are nonlinear next. Here we have to create two matrices. The first 147 | is the lag matrix, and it will have one column for each lag we're interested in. 148 | All the values in a given column will be equal to the lag value. 149 | The second matrix is the predictor matrix, and it is the same as in Case 1 (see [the section below](#by_var) on `by=` terms for how this works). 150 | 151 | 152 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE} 153 | library(dplyr) 154 | library(mgcv) 155 | library(ggplot2) 156 | library(tidyr) 157 | n_lakes = 200 158 | lake_mean_temps = rnorm(n_lakes, 25,6) 159 | lake_data = as.data.frame(expand.grid(lake = 1:n_lakes, 160 | shore_dist_m = c(0,1,5,10,20,40))) 161 | lake_data$temp = rnorm(n_lakes*6, lake_mean_temps[lake_data$lake], 8) 162 | lake_data$production = with(lake_data, rnorm(n_lakes*6, 163 | temp*3*exp(-shore_dist_m/10),1)) 164 | lake_data = lake_data %>% group_by(lake)%>% 165 | mutate(production = mean(production), 166 | mean_temp = mean(temp))%>% 167 | spread(shore_dist_m, temp) 168 | 169 | #This is our lag matrix for this example 170 | shore_dist_matrix = matrix(c(0,1,5,10,20,40), nrow=n_lakes,ncol=6, byrow = T) 171 | head(shore_dist_matrix) 172 | 173 | #This is our predictor matrix 174 | lake_data$temp_matrix = lake_data %>% 175 | select(`0`:`40`) %>% 176 | as.matrix(.) 177 | head(lake_data) 178 | 179 | #Note: we need to set k=6 here, as we really only have 6 degrees of freedom (one 180 | #for each distance we've measured from the shore). 181 | 182 | nonlin_temp_model = gam(production~s(shore_dist_matrix, by= temp_matrix,k=6), 183 | data=lake_data) 184 | plot(nonlin_temp_model) 185 | 186 | #For comparison, this re-plots the two Case 1 fitted models vs. the true function. 187 | ggplot(aes(temp, value,color=model), data=temp_predict_data)+ 188 | geom_line()+ 189 | scale_color_brewer(palette="Set1")+ 190 | theme_bw(20) 191 | ``` 192 | 193 | ## 2. Key concepts and functions 194 | 195 | ### Using `by=` terms in smoothers {#by_var} 196 | One of the most useful features in `mgcv` is the `by=` argument for smooths. 197 | This has two uses. 198 | 199 | 1. For a given smooth (say, `y~s(x)`), if you set `s(x, by=group)`, where group 200 | is some factor predictor, `mgcv` will fit a separate smooth of x for 201 | each level of that factor. The model will produce a different smooth $s_k(x)$ 202 | for each level k, allowing you to test if a given relationship varies between 203 | groups. 204 | 2. 
If you instead you set `s(x, by=z)` where z is a numerical value, `mgcv` will 205 | fit a varying slopes regression. Instead of modeling y as a smooth function of 206 | x, this will model y as a *linear* function of z, where the slope of the 207 | relationship between z and y changes smoothly as a function of x, so the 208 | predicted value for the ith value of y would be: $y_i \sim f(x_i) \cdot z_i$. 209 | If you're using matrix predictors as discussed above, for numerical predictors 210 | the `by` variable `z` also has to be a matrix with the same dimensions as `x`. 211 | The predicted value in this case will be: $y_i \sim \sum_{j=1}^k f(x_{i,j})\cdot z_{i,j}$, where $j$ are the columns of the matrices $x$ and $z$. 212 | 213 | 214 | ##Calculating dispersal kernels 215 | 216 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE} 217 | yukon_seedling_data = read.csv("data/yukon_seeds/seed_data.csv") 218 | yukon_source_data =read.csv("data/yukon_seeds/seed_source_locations.csv") 219 | seed_dist = matrix(0, nrow = nrow(yukon_seedling_data), 220 | ncol= nrow(yukon_source_data)) 221 | for(i in 1:nrow(yukon_seedling_data)){ 222 | seed_dist[i,] = sqrt((yukon_source_data$X- yukon_seedling_data$X[i])^2 + (yukon_source_data$Y- yukon_seedling_data$Y[i])^2) 223 | } 224 | seed_dist_l = log(seed_dist) 225 | yukon_seedling_data$min_dist_l = apply(seed_dist_l, MARGIN = 1,min) 226 | 227 | basic_dispersal_model = gam(n_spruce~s(min_dist_l)+offset(log(plot_area_m2)), 228 | data=yukon_seedling_data, family=nb) 229 | full_dispersal_model = gam(n_spruce~s(seed_dist_l)+offset(log(plot_area_m2)), 230 | data=yukon_seedling_data, family=nb) 231 | ``` 232 | 233 | -------------------------------------------------------------------------------- /example-spatial-mexdolphins-solutions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Further analysis of pantropical spotted dolphins in the Gulf of Mexico" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | 10 | --- 11 | 12 | # Preamble 13 | 14 | This exercise is based on the [Appendix of Miller et al 2013](http://distancesampling.org/R/vignettes/mexico-analysis.html). In this example we're ignoring all kinds of important things like detectability and availability. This should not be treated as a serious analysis of these data! For a more complete treatment of detection-corrected abundance estimation via distance sampling and generalized additive models, see Miller et al. (2013). 15 | 16 | From that appendix: 17 | 18 | *The analysis is based on a dataset of observations of pantropical dolphins in the Gulf of Mexico (shipped with Distance 6.0 and later). For convenience the data are bundled in an `R`-friendly format, although all of the code necessary for creating the data from the Distance project files is available [on github](http://github.com/dill/mexico-data). The OBIS-SEAMAP page for the data may be found at the [SEFSC GoMex Oceanic 1996](http://seamap.env.duke.edu/dataset/25) survey page.* 19 | 20 | 21 | ## Doing these exercises 22 | 23 | Probably the easiest way to do these exercises is to open this document in RStudio and go through the code blocks one by one (hitting the "play" button in the editor window), filling in the code where necessary and executing the commands one-by-one. You can then compile the document once you're done to check that everything works. 
24 | 25 | # Data format 26 | 27 | The data are provided in the `data/mexdolphins` folder as the file `mexdolphins.RData`. Loading this we can see what is provided: 28 | 29 | 30 | ```{r loaddata} 31 | load("data/mexdolphins/mexdolphins.RData") 32 | ls() 33 | ``` 34 | 35 | - `mexdolphins` the `data.frame` containing the observations and covariates, used to fit the model. 36 | - `pred_latlong` an `sp` object that has the shapefile for the prediction grid, used for fancy graphs 37 | - `preddata` prediction grid without any fancy spatial stuff 38 | 39 | Looking further into the `mexdolphins` frame we see: 40 | 41 | ```{r frameinspect} 42 | str(mexdolphins) 43 | ``` 44 | 45 | A brief explanation of each entry: 46 | 47 | - `Sample.Label` identifier for the effort "segment" (approximately square sampling area) 48 | - `Transect.Label` identifier for the transect that this segment belongs to 49 | - `longitude`, `latitude` location in lat/long of this segment 50 | - `x`, `y` location in projected coordinates (projected using the [North American Lambert Conformal Conic projection](https://en.wikipedia.org/wiki/Lambert_conformal_conic_projection)) 51 | - `Effort` the length of the current segment 52 | - `depth` the bathymetry at the segment's position 53 | - `count` number of dolphins observed in this segment 54 | - `segment.area` the area of the segment (`Effort` multiplied by the width of the segment 55 | - `off.set` the logarithm of the `segment.area` multiplied by a correction for detectability (see link to appendix above for more information on this) 56 | 57 | 58 | # Modelling 59 | 60 | Our objective here is to build a spatially explicit model of abundance of the dolphins. In some sense this is a kind of species distribution model. 61 | 62 | Our possible covariates to model abundance are location and depth. These are fairly good predictors of the abundance (SPOILER ALERT), though we could probably improve the model further by including things like sea surface temperature and chlorophyll *a*. 63 | 64 | ## A simple model to start with 65 | 66 | We can begin with the model we showed in the first lecture: 67 | 68 | ```{r simplemodel} 69 | library(mgcv) 70 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set), 71 | data = mexdolphins, 72 | family = quasipoisson(), 73 | method = "REML") 74 | ``` 75 | 76 | That is we fit the counts as a function of depth, using the offset to take into account effort. We use a quasi-Poisson response (i.e., modelling just the mean-variance relationship such that the variance is proportional to the mean) and use REML for smoothness selection. 77 | 78 | We can check the assumptions of this model by using `gam.check`: 79 | 80 | ```{r simplemodel-check} 81 | gam.check(dolphins_depth) 82 | ``` 83 | 84 | As is usual for count data, these plots are a bit tricky to interpret. For example the residuals vs linear predictor plot in the top right has that nasty line through it that makes looking for pattern tricky. We can see easily that the line equates to the zero count observations: 85 | 86 | ```{r zeroresids, fig.width=7, fig.height=7} 87 | # code from the insides of mgcv::gam.check 88 | resid <- residuals(dolphins_depth, type="deviance") 89 | linpred <- napredict(dolphins_depth$na.action, dolphins_depth$linear.predictors) 90 | plot(linpred, resid, main = "Resids vs. 
linear pred.", 91 | xlab = "linear predictor", ylab = "residuals") 92 | 93 | # now add red dots corresponding to the zero counts 94 | points(linpred[mexdolphins$count==0],resid[mexdolphins$count==0], 95 | pch=19, col="red", cex=0.5) 96 | ``` 97 | 98 | We can use randomised quantile residuals instead of deviance residuals to get around this in some cases (though not quasi-Poisson, as we don't have a proper likelihood!). 99 | 100 | Ignoring the plots for now (as we'll address them in the next section), let's look at the text output. It seems that the `k` value we set (or rather the default of 10) has been adequate. 101 | 102 | We could increase the value of `k` by replacing the `s(...)` with, for example, `s(depth, k=25)` (for a possibly very wiggly function) or `s(depth, k=3)` (for a much less wiggly function). Making `k` big will create a bigger design matrix and penalty matrix. 103 | 104 | 105 | **Exercise** 106 | 107 | Look at the differences in the size of the design and penalty matrices by using `dim(model.matrix(...))` and `dim(model$smooth[[1]]$S[[1]])`, replacing `...` and `model` appropriately for models with `k=3` and `k=30`. 108 | 109 | ```{r simplemodel-bigsmall} 110 | dolphins_depth_bigk <- gam(count ~ s(depth, k=30) + offset(off.set), 111 | data = mexdolphins, 112 | family = quasipoisson(), 113 | method = "REML") 114 | dim(model.matrix(dolphins_depth_bigk)) 115 | dim(dolphins_depth_bigk$smooth[[1]]$S[[1]]) 116 | dolphins_depth_smallk <- gam(count ~ s(depth, k=3) + offset(off.set), 117 | data = mexdolphins, 118 | family = quasipoisson(), 119 | method = "REML") 120 | dim(model.matrix(dolphins_depth_smallk)) 121 | dim(dolphins_depth_smallk$smooth[[1]]$S[[1]]) 122 | ``` 123 | 124 | (Don't worry about the many square brackets etc to get the penalty matrix!) 125 | 126 | ### Plotting 127 | 128 | We can plot the smooth we fitted using `plot`. 129 | 130 | **Exercise** 131 | 132 | Compare the first model we fitted with the two using different `k` values above. Use `par(mfrow=c(1,3))` to put them all in one graphic. Look at `?plot.gam` and plot the confidence intervals as a filled "confidence band". Title the plots appropriately so you can check which is which. 133 | 134 | ```{r plotk, } 135 | par(mfrow=c(1,3)) 136 | plot(dolphins_depth, shade=TRUE, main="Default k") 137 | plot(dolphins_depth_bigk, shade=TRUE, main="big k") 138 | plot(dolphins_depth_smallk, shade=TRUE, main="small k") 139 | ``` 140 | 141 | ## Count distributions 142 | 143 | In general quasi-Poisson doesn't seem to do too great a job at modelling data with many zeros. Luckily we have a few tricks up our sleeves... 144 | 145 | 146 | ### Tweedie 147 | 148 | Adding a smooth of `x` and `y` to our model with `s(x,y)`, we can then switch the `family=` argument to use `tw()` for a Tweedie distribution. 149 | 150 | ```{r tw} 151 | dolphins_xy_tw <- gam(count ~ s(x,y) + s(depth) + offset(off.set), 152 | data = mexdolphins, 153 | family = tw(), 154 | method = "REML") 155 | ``` 156 | 157 | More information on Tweedie distributions can be found in Foster & Bravington (2012) and Shono (2008). 158 | 159 | 160 | 161 | ### Negative binomial 162 | 163 | **Exercise** 164 | 165 | Now do the same using the negative binomial distribution (`nb()`).
166 | 167 | ```{r nb} 168 | dolphins_xy_nb <- gam(count ~ s(x,y) + s(depth) + offset(off.set), 169 | data = mexdolphins, 170 | family = nb(), 171 | method = "REML") 172 | ``` 173 | 174 | Looking at the quantile-quantile plots only in the `gam.check` output for these two models, which do you prefer? Why? 175 | 176 | *Looks like Tweedie is better here as the points are closer to the x=y line in the Q-Q plot. Also the histogram of residuals looks more (though not very) normal.* 177 | 178 | Look at the results of calling `summary` on both models and note that there are differences in the resulting models, due to the differing mean-variance relationships. 179 | 180 | ## Smoothers 181 | 182 | Now let's move onto using different bases for the smoothers. We have a couple of different options here. 183 | 184 | 185 | ### Thin plate splines with shrinkage 186 | 187 | By default we use the `"tp"` basis. This is just plain thin plate regression splines (as defined in Wood, 2003). We can also use the `"ts"` basis, which is the same but with extra shrinkage on the usually unpenalised parts of model. In the univariate case this is the linear slope term of the smooth. 188 | 189 | **Exercise** 190 | 191 | Compare the results from one of the models above with a version using the thin plate with shrinkage using the `bs="ts"` argument to `s()` for both terms. 192 | 193 | ```{r tw-ts} 194 | dolphins_xy_tw_ts <- gam(count ~ s(x,y, bs="ts") + s(depth, bs="ts") + 195 | offset(off.set), 196 | data = mexdolphins, 197 | family = tw(), 198 | method = "REML") 199 | ``` 200 | 201 | What are the differences (use `summary`)? 202 | 203 | *EDFs are different for both terms* 204 | 205 | What are the visual differences (use `plot`)? 206 | 207 | *Not really much difference here!* 208 | 209 | 210 | ### Soap film smoother 211 | 212 | We can use a soap film smoother (Wood, 2008) to take into account a complex boundary, such as a coastline or islands. 213 | 214 | Here I've built a simple coastline of the US states bordering the Gulf of Mexico (see the `soap_pred.R` file for how this was constructed). We can load up this boundary and the prediction grid from the following `RData` file: 215 | 216 | ```{r soapy} 217 | load("data/mexdolphins/soapy.RData") 218 | ``` 219 | 220 | Now we need to build knots for the soap film, for this we simply create a grid, then find the grid points inside the boundary. We don't need too many of them. 221 | 222 | ```{r soapknots} 223 | soap_knots <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), length.out=10), 224 | y=seq(min(xy_bnd$y), max(xy_bnd$y), length.out=10)) 225 | x <- soap_knots$x; y <- soap_knots$y 226 | ind <- inSide(xy_bnd, x, y) 227 | rm(x,y) 228 | soap_knots <- soap_knots[ind, ] 229 | ## inSide doesn't work perfectly, so if you get an error like: 230 | ## Error in crunch.knots(ret$G, knots, x0, y0, dx, dy) : 231 | ## knot 54 is on or outside boundary 232 | ## just remove that knot as follows: 233 | soap_knots <- soap_knots[-8, ] 234 | soap_knots <- soap_knots[-54, ] 235 | ``` 236 | 237 | We can now fit our model. Note that we specify a basis via `bs=` and the boundary via `xt` (for e`xt`ra information) in the `s()` term. We also include the knots as a `knots=` argument to `gam`. 
238 | 239 | ```{r soapmodel} 240 | dolphins_soap <- gam(count ~ s(x,y, bs="so", xt=list(bnd=list(xy_bnd))) + 241 | offset(off.set), 242 | data = mexdolphins, 243 | family = tw(), 244 | knots = soap_knots, 245 | method = "REML") 246 | 247 | ``` 248 | 249 | **Exercise** 250 | 251 | Look at the `summary` output for this model and compare it to the other models. 252 | 253 | ```{r soap-summary} 254 | summary(dolphins_soap) 255 | ``` 256 | 257 | The plotting function for soap film smooths looks much nicer by default than for other 2D smooths -- try it out. 258 | 259 | 260 | ```{r soap-plot} 261 | plot(dolphins_soap) 262 | ``` 263 | 264 | # Predictions 265 | 266 | As we saw in the intro to GAMs slides, `predict` is your friend when it comes to making predictions for the GAM. 267 | 268 | We can do this very simply, calling predict as one would with a `glm`. For example: 269 | 270 | ```{r pred-qp} 271 | pred_qp <- predict(dolphins_depth, preddata, type="response") 272 | ``` 273 | 274 | Now, this just gives a long vector of numbers for the predicted number of animals per cell. We can find the total abundance using `sum`. 275 | 276 | **Exercise** 277 | 278 | How many dolphins are there in the total area? What is the maximum in a given cell? What is the minimum? 279 | 280 | ```{r total-max-min} 281 | sum(pred_qp) 282 | range(pred_qp) 283 | ``` 284 | 285 | ## Plotting predictions 286 | 287 | *Note that this section requires quite a few additional packages to run the examples, so may not run the first time. You can use* `install.packages` *to grab the packages you need.* 288 | 289 | Plotting predictions in projected coordinate systems is tricky. I'll show to methods here but not go into too much detail, as that's not the aim of this workshop. 290 | 291 | For the non-soap models, we'll use the below helper function to put the predictions into a bunch of squares and then return an appropriate `ggplot2` object for us to plot: 292 | 293 | ```{r gridplotfn} 294 | library(plyr) 295 | # fill must be in the same order as the polygon data 296 | grid_plot_obj <- function(fill, name, sp){ 297 | 298 | # what was the data supplied? 
299 | names(fill) <- NULL 300 | row.names(fill) <- NULL 301 | data <- data.frame(fill) 302 | names(data) <- name 303 | 304 | spdf <- SpatialPolygonsDataFrame(sp, data) 305 | spdf@data$id <- rownames(spdf@data) 306 | spdf.points <- fortify(spdf, region="id") 307 | spdf.df <- join(spdf.points, spdf@data, by="id") 308 | 309 | # seems to store the x/y even when projected as labelled as 310 | # "long" and "lat" 311 | spdf.df$x <- spdf.df$long 312 | spdf.df$y <- spdf.df$lat 313 | 314 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df) 315 | } 316 | ``` 317 | 318 | Then we can plot the predicted abundance: 319 | 320 | ```{r plotpred} 321 | library(ggplot2) 322 | library(viridis) 323 | library(sp) 324 | library(rgeos) 325 | library(rgdal) 326 | library(maptools) 327 | 328 | # projection string 329 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 330 | # need sp to transform the data 331 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4) 332 | 333 | p <- ggplot() + 334 | # abundance 335 | grid_plot_obj(pred_qp, "N", pred.polys) + 336 | # survey lines 337 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 338 | # observations 339 | geom_point(aes(x, y, size=count), 340 | data=mexdolphins[mexdolphins$count>0,], 341 | colour="red", alpha=I(0.7)) + 342 | # make the coordinate system fixed 343 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 344 | xlim = range(mexdolphins$x)) + 345 | # use the viridis colourscheme, which is more colourblind-friendly 346 | scale_fill_viridis() + 347 | # labels 348 | labs(fill="Predicted\ndensity",x="x",y="y",size="Count") + 349 | # keep things simple 350 | theme_minimal() 351 | print(p) 352 | ``` 353 | 354 | If you have the `maps` package installed you can also try the following plot the coastline too: 355 | 356 | ```{r maptoo} 357 | library(maps) 358 | map_dat <- map_data("usa") 359 | map_sp <- SpatialPoints(map_dat[,c("long","lat")]) 360 | 361 | # give the sp object a projection 362 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84") 363 | # re-project 364 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4) 365 | map_dat$x <- map_sp.t$long 366 | map_dat$y <- map_sp.t$lat 367 | 368 | p <- p + geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 369 | 370 | print(p) 371 | ``` 372 | 373 | ## Comparing predictions from soap and thin plate splines 374 | 375 | The `soap_preddata` frame has a much larger prediction grid that covers a much wider area. 376 | 377 | Predicting using the soap film smoother is exactly the same as for the other smoothers: 378 | 379 | ```{r soap-pred} 380 | soap_pred_N <- predict(dolphins_soap, soap_preddata, type="response") 381 | soap_pred_N <- cbind.data.frame(soap_preddata, N=soap_pred_N, model="soap") 382 | ``` 383 | 384 | Use the above as a template (along with `ggplot2::facet_wrap()`) to build a comparison plot with the predictions of the soap film and another model. 
385 | 386 | ```{r xy-soap-pred} 387 | dolphins_xy_q <- gam(count ~ s(x,y, bs="ts") + offset(off.set), 388 | data = mexdolphins, 389 | family = tw(), 390 | method = "REML") 391 | xy_pred_N <- predict(dolphins_xy_q, soap_preddata, type="response") 392 | xy_pred_N <- cbind.data.frame(soap_preddata, N=xy_pred_N, model="xy") 393 | all_pred_N <- rbind(soap_pred_N, xy_pred_N) 394 | 395 | p <- ggplot() + 396 | # abundance 397 | geom_tile(aes(x=x, y=y, width=sqrt(area), height=sqrt(area), fill=N), data=all_pred_N) + 398 | # facet it! 399 | facet_wrap(~model) + 400 | # make the coordinate system fixed 401 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 402 | xlim = range(mexdolphins$x)) + 403 | # use the viridis colourscheme, which is more colourblind-friendly 404 | scale_fill_viridis() + 405 | # labels 406 | labs(fill="Predicted\ndensity", x="x", y="y") + 407 | # keep things simple 408 | theme_minimal() + 409 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 410 | print(p) 411 | ``` 412 | 413 | ## Plotting uncertainty 414 | 415 | **Exercise** 416 | 417 | We can use the `se.fit=TRUE` argument to get the per-cell standard errors, then divide these through by the abundances to get a coefficient of variation (CV) per cell. We can then plot that using the same technique as above. Try this out below. 418 | 419 | Note that the resulting object from setting `se.fit=TRUE` is a list with two elements. 420 | 421 | ```{r CVmap} 422 | pred_se_qp <- predict(dolphins_depth, preddata, type="response", se.fit=TRUE) 423 | cv <- pred_se_qp$se.fit/pred_se_qp$fit 424 | 425 | p <- ggplot() + 426 | # abundance 427 | grid_plot_obj(cv, "CV", pred.polys) + 428 | # survey lines 429 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 430 | # make the coordinate system fixed 431 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 432 | xlim = range(mexdolphins$x)) + 433 | # use the viridis colourscheme, which is more colourblind-friendly 434 | scale_fill_viridis() + 435 | # labels 436 | labs(fill="CV",x="x",y="y",size="Count") + 437 | # keep things simple 438 | theme_minimal() 439 | print(p) 440 | ``` 441 | 442 | Compare the uncertainties of the different models you've fitted so far. 443 | 444 | ## `lpmatrix` magic 445 | 446 | Now for some `lpmatrix` magic. We said before that the `lpmatrix` maps the parameters onto the predictions. Let's show that's true: 447 | 448 | ```{r lppred} 449 | # make the Lp matrix 450 | lp <- predict(dolphins_depth, preddata, type="lpmatrix") 451 | # get the linear predictor 452 | lin_pred <- lp %*% coef(dolphins_depth) 453 | # apply the link and multiply by the offset as lpmatrix ignores this 454 | pred <- preddata$area * exp(lin_pred) 455 | # all the same? 456 | all.equal(pred[,1], as.numeric(pred_qp), check.attributes=FALSE) 457 | ``` 458 | 459 | What else can we do? We can also grab the uncertainty for the sum of the predictions: 460 | 461 | ```{r lpNvar} 462 | # extract the variance-covariance matrix 463 | vc <- vcov(dolphins_depth) 464 | 465 | # reproject the var-covar matrix to be on the linear predictor scale 466 | lin_pred_var <- tcrossprod(lp %*% vc, lp) 467 | 468 | # pre and post multiply by the derivatives of the link, evaluated at the 469 | # predictions -- since the link is exp, we just use the predictions 470 | pred_var <- matrix(pred,nrow=1) %*% lin_pred_var %*% matrix(pred,ncol=1) 471 | 472 | # we can then calculate a total CV 473 | sqrt(pred_var)/sum(pred) 474 | ``` 475 | 476 | As you can see, the `lpmatrix` can be very useful! 
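As a further (optional) illustration of what the `lpmatrix` makes easy, here is a small sketch of simulating from the approximate posterior of the model coefficients to get an uncertainty interval for the total abundance. It reuses the `lp` matrix and `preddata` from the chunks above and uses `mgcv::rmvn()` to draw the coefficient vectors; the number of draws (1000), the seed, and the object names are arbitrary choices for illustration.

```{r lp-posterior-sim, eval=FALSE}
## Sketch: posterior simulation for total abundance via the lpmatrix
set.seed(1)
# draw coefficient vectors from their approximate posterior
betas <- rmvn(1000, coef(dolphins_depth), vcov(dolphins_depth))
# each draw gives a predicted surface; sum over cells for a total abundance
Nhat <- apply(betas, 1, function(b) sum(preddata$area * exp(lp %*% b)))
# an approximate 95% interval for the total abundance
quantile(Nhat, c(0.025, 0.975))
```

For a well-behaved model this should give a spread similar to the delta-method style calculation above, but without the linearisation step.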
477 | 478 | # Extra credit 479 | 480 | - Experiment with `vis.gam` (watch out for the `view=` option) and plot the 2D smooths you've fitted. Check out the `too.far=` agument. 481 | - Use the randomized quantile residuals plot to inspect the models you fitted above. What are the differences you see between them and the deviance residuals you see above? Which model would you choose? 482 | - Redo the side-by-side soap film and another smoother plot, but showing the coefficient of variation. What difference does the soap film make? 483 | 484 | # References 485 | 486 | - Foster, S. D., & Bravington, M. V. (2012). A Poisson–Gamma model for analysis of ecological non-negative continuous data. Environmental and Ecological Statistics, 20(4), 533–552. http://doi.org/10.1007/s10651-012-0233-0 487 | - Miller, D. L., Burt, M. L., Rexstad, E. A., & Thomas, L. (2013). Spatial models for distance sampling data: recent developments and future directions. Methods in Ecology and Evolution, 4(11), 1001–1010. http://doi.org/10.1111/2041-210X.12105 488 | - Shono, H. (2008). Application of the Tweedie distribution to zero-catch data in CPUE analysis. Fisheries Research, 93(1-2), 154–162. http://doi.org/10.1016/j.fishres.2008.03.006 489 | - Wood, S. N. (2003). Thin plate regression splines. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 65(1), 95–114. 490 | - Wood, S. N., Bravington, M. V., & Hedley, S. L. (2008). Soap film smoothing. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 70(5), 931–955. http://doi.org/10.1111/j.1467-9868.2008.00665.x 491 | 492 | -------------------------------------------------------------------------------- /example-spatial-mexdolphins.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Further analysis of pantropical spotted dolphins in the Gulf of Mexico" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | 10 | --- 11 | 12 | # Preamble 13 | 14 | This exercise is based on the [Appendix of Miller et al 2013](http://distancesampling.org/R/vignettes/mexico-analysis.html). In this example we're ignoring all kinds of important things like detectability and availability. This should not be treated as a serious analysis of these data! For a more complete treatment of detection-corrected abundance estimation via distance sampling and generalized additive models, see Miller et al. (2013). 15 | 16 | From that appendix: 17 | 18 | *The analysis is based on a dataset of observations of pantropical dolphins in the Gulf of Mexico (shipped with Distance 6.0 and later). For convenience the data are bundled in an `R`-friendly format, although all of the code necessary for creating the data from the Distance project files is available [on github](http://github.com/dill/mexico-data). The OBIS-SEAMAP page for the data may be found at the [SEFSC GoMex Oceanic 1996](http://seamap.env.duke.edu/dataset/25) survey page.* 19 | 20 | 21 | ## Doing these exercises 22 | 23 | Probably the easiest way to do these exercises is to open this document in RStudio and go through the code blocks one by one (hitting the "play" button in the editor window), filling in the code where necessary and executing the commands one-by-one. You can then compile the document once you're done to check that everything works. 24 | 25 | # Data format 26 | 27 | The data are provided in the `data/mexdolphins` folder as the file `mexdolphins.RData`. 
Loading this we can see what is provided: 28 | 29 | 30 | ```{r loaddata} 31 | load("data/mexdolphins/mexdolphins.RData") 32 | ls() 33 | ``` 34 | 35 | - `mexdolphins` the `data.frame` containing the observations and covariates, used to fit the model. 36 | - `pred_latlong` an `sp` object that has the shapefile for the prediction grid, used for fancy graphs 37 | - `preddata` prediction grid without any fancy spatial stuff 38 | 39 | Looking further into the `mexdolphins` frame we see: 40 | 41 | ```{r frameinspect} 42 | str(mexdolphins) 43 | ``` 44 | 45 | A brief explanation of each entry: 46 | 47 | - `Sample.Label` identifier for the effort "segment" (approximately square sampling area) 48 | - `Transect.Label` identifier for the transect that this segment belongs to 49 | - `longitude`, `latitude` location in lat/long of this segment 50 | - `x`, `y` location in projected coordinates (projected using the [North American Lambert Conformal Conic projection](https://en.wikipedia.org/wiki/Lambert_conformal_conic_projection)) 51 | - `Effort` the length of the current segment 52 | - `depth` the bathymetry at the segment's position 53 | - `count` number of dolphins observed in this segment 54 | - `segment.area` the area of the segment (`Effort` multiplied by the width of the segment 55 | - `off.set` the logarithm of the `segment.area` multiplied by a correction for detectability (see link to appendix above for more information on this) 56 | 57 | 58 | # Modelling 59 | 60 | Our objective here is to build a spatially explicit model of abundance of the dolphins. In some sense this is a kind of species distribution model. 61 | 62 | Our possible covariates to model abundance are location and depth. These are fairly good predictors of the abundance (SPOILER ALERT), though we could probably improve the model further by including things like sea surface temperature and chlorophyll *a*. 63 | 64 | ## A simple model to start with 65 | 66 | We can begin with the model we showed in the first lecture: 67 | 68 | ```{r simplemodel} 69 | library(mgcv) 70 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set), 71 | data = mexdolphins, 72 | family = quasipoisson(), 73 | method = "REML") 74 | ``` 75 | 76 | That is we fit the counts as a function of depth, using the offset to take into account effort. We use a quasi-Poisson response (i.e., modelling just the mean-variance relationship such that the variance is proportional to the mean) and use REML for smoothness selection. 77 | 78 | We can check the assumptions of this model by using `gam.check`: 79 | 80 | ```{r simplemodel-check} 81 | gam.check(dolphins_depth) 82 | ``` 83 | 84 | As is usual for count data, these plots are a bit tricky to interpret. For example the residuals vs linear predictor plot in the top right has that nasty line through it that makes looking for pattern tricky. We can see easily that the line equates to the zero count observations: 85 | 86 | ```{r zeroresids, fig.width=7, fig.height=7} 87 | # code from the insides of mgcv::gam.check 88 | resid <- residuals(dolphins_depth, type="deviance") 89 | linpred <- napredict(dolphins_depth$na.action, dolphins_depth$linear.predictors) 90 | plot(linpred, resid, main = "Resids vs. 
linear pred.", 91 | xlab = "linear predictor", ylab = "residuals") 92 | 93 | # now add red dots corresponding to the zero counts 94 | points(linpred[mexdolphins$count==0],resid[mexdolphins$count==0], 95 | pch=19, col="red", cex=0.5) 96 | ``` 97 | 98 | We can use randomised quantile residuals instead of deviance residuals to get around this in some cases (though not quasi-Poisson, as we don't have a proper likelihood!). 99 | 100 | Ignoring the plots for now (as we'll address them in the next section), let's look at the text output. It seems that the `k` value we set (or rather the default of 10) has been adequate. 101 | 102 | We could increase the value of `k` by replacing the `s(...)` with, for example, `s(depth, k=25)` (for a possibly very wiggly function) or `s(depth, k=3)` (for a much less wiggly function). Making `k` big will create a bigger design matrix and penalty matrix. 103 | 104 | 105 | **Exercise** 106 | 107 | Look at the differences in the size of the design and penalty matrices by using `dim(model.matrix(...))` and `dim(model$smooth[[1]]$S[[1]])`, replacing `...` and `model` appropriately for models with `k=3` and `k=30`. 108 | 109 | ```{r simplemodel-bigsmall} 110 | 111 | ``` 112 | 113 | (Don't worry about the many square brackets etc to get the penalty matrix!) 114 | 115 | ### Plotting 116 | 117 | We can plot the smooth we fitted using `plot`. 118 | 119 | **Exercise** 120 | 121 | Compare the first model we fitted with the two using different `k` values above. Use `par(mfrow=c(1,3))` to put them all in one graphic. Look at `?plot.gam` and plot the confidence intervals as a filled "confidence band". Title the plots appropriately so you can check which is which. 122 | 123 | ```{r plotk, } 124 | par(mfrow=c(1,3)) 125 | 126 | ``` 127 | 128 | ## Count distributions 129 | 130 | In general quasi-Poisson doesn't seem to do too great a job at modelling data with many zeros. Luckily we have a few tricks up our sleeves... 131 | 132 | 133 | ### Tweedie 134 | 135 | Adding a smooth of `x` and `y` to our model with `s(x,y)`, we can then switch the `family=` argument to use `tw()` for a Tweedie distribution. 136 | 137 | ```{r tw} 138 | dolphins_xy_tw <- gam(count ~ s(x,y) + s(depth) + offset(off.set), 139 | data = mexdolphins, 140 | family = tw(), 141 | method = "REML") 142 | ``` 143 | 144 | More information on Tweedie distributions can be found in Foster & Bravington (2012) and Shono (2008). 145 | 146 | 147 | 148 | ### Negative binomial 149 | 150 | **Exercise** 151 | 152 | Now do the same using the negative binomial distribution (`nb()`). 153 | 154 | ```{r nb} 155 | 156 | ``` 157 | 158 | Looking at the quantile-quantile plots only in the `gam.check` output for these two models, which do you prefer? Why? 159 | 160 | *Looks like Tweedie is better here as the points are closer to the x=y line in the Q-Q plot. Also the histogram of residuals looks more (though not very) normal.* 161 | 162 | Look at the results of calling `summary` on both models and note that there are differences in the resulting models, due to the differing mean-variance relationships. 163 | 164 | ## Smoothers 165 | 166 | Now let's move onto using different bases for the smoothers. We have a couple of different options here. 167 | 168 | 169 | ### Thin plate splines with shrinkage 170 | 171 | By default we use the `"tp"` basis. This is just plain thin plate regression splines (as defined in Wood, 2003).
We can also use the `"ts"` basis, which is the same but with extra shrinkage on the usually unpenalised parts of model. In the univariate case this is the linear slope term of the smooth. 172 | 173 | **Exercise** 174 | 175 | Compare the results from one of the models above with a version using the thin plate with shrinkage using the `bs="ts"` argument to `s()` for both terms. 176 | 177 | ```{r tw-ts} 178 | ``` 179 | 180 | What are the differences (use `summary`)? 181 | 182 | What are the visual differences (use `plot`)? 183 | 184 | 185 | ### Soap film smoother 186 | 187 | We can use a soap film smoother (Wood, 2008) to take into account a complex boundary, such as a coastline or islands. 188 | 189 | Here I've built a simple coastline of the US states bordering the Gulf of Mexico (see the `soap_pred.R` file for how this was constructed). We can load up this boundary and the prediction grid from the following `RData` file: 190 | 191 | ```{r soapy} 192 | load("data/mexdolphins/soapy.RData") 193 | ``` 194 | 195 | Now we need to build knots for the soap film, for this we simply create a grid, then find the grid points inside the boundary. We don't need too many of them. 196 | 197 | ```{r soapknots} 198 | soap_knots <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), length.out=10), 199 | y=seq(min(xy_bnd$y), max(xy_bnd$y), length.out=10)) 200 | x <- soap_knots$x; y <- soap_knots$y 201 | ind <- inSide(xy_bnd, x, y) 202 | rm(x,y) 203 | soap_knots <- soap_knots[ind, ] 204 | ## inSide doesn't work perfectly, so if you get an error like: 205 | ## Error in crunch.knots(ret$G, knots, x0, y0, dx, dy) : 206 | ## knot 54 is on or outside boundary 207 | ## just remove that knot as follows: 208 | soap_knots <- soap_knots[-8, ] 209 | soap_knots <- soap_knots[-54, ] 210 | ``` 211 | 212 | We can now fit our model. Note that we specify a basis via `bs=` and the boundary via `xt` (for e`xt`ra information) in the `s()` term. We also include the knots as a `knots=` argument to `gam`. 213 | 214 | ```{r soapmodel} 215 | dolphins_soap <- gam(count ~ s(x,y, bs="so", xt=list(bnd=list(xy_bnd))) + 216 | offset(off.set), 217 | data = mexdolphins, 218 | family = tw(), 219 | knots = soap_knots, 220 | method = "REML") 221 | 222 | ``` 223 | 224 | **Exercise** 225 | 226 | Look at the `summary` output for this model and compare it to the other models. 227 | 228 | ```{r soap-summary} 229 | ``` 230 | 231 | The plotting function for soap film smooths looks much nicer by default than for other 2D smooths -- try it out. 232 | 233 | 234 | ```{r soap-plot} 235 | ``` 236 | 237 | # Predictions 238 | 239 | As we saw in the intro to GAMs slides, `predict` is your friend when it comes to making predictions for the GAM. 240 | 241 | We can do this very simply, calling predict as one would with a `glm`. For example: 242 | 243 | ```{r pred-qp} 244 | pred_qp <- predict(dolphins_depth, preddata, type="response") 245 | ``` 246 | 247 | Now, this just gives a long vector of numbers for the predicted number of animals per cell. We can find the total abundance using `sum`. 248 | 249 | **Exercise** 250 | 251 | How many dolphins are there in the total area? What is the maximum in a given cell? What is the minimum? 252 | 253 | ## Plotting predictions 254 | 255 | *Note that this section requires quite a few additional packages to run the examples, so may not run the first time. You can use* `install.packages` *to grab the packages you need.* 256 | 257 | Plotting predictions in projected coordinate systems is tricky. 
I'll show to methods here but not go into too much detail, as that's not the aim of this workshop. 258 | 259 | For the non-soap models, we'll use the below helper function to put the predictions into a bunch of squares and then return an appropriate `ggplot2` object for us to plot: 260 | 261 | ```{r gridplotfn} 262 | library(plyr) 263 | # fill must be in the same order as the polygon data 264 | grid_plot_obj <- function(fill, name, sp){ 265 | 266 | # what was the data supplied? 267 | names(fill) <- NULL 268 | row.names(fill) <- NULL 269 | data <- data.frame(fill) 270 | names(data) <- name 271 | 272 | spdf <- SpatialPolygonsDataFrame(sp, data) 273 | spdf@data$id <- rownames(spdf@data) 274 | spdf.points <- fortify(spdf, region="id") 275 | spdf.df <- join(spdf.points, spdf@data, by="id") 276 | 277 | # seems to store the x/y even when projected as labelled as 278 | # "long" and "lat" 279 | spdf.df$x <- spdf.df$long 280 | spdf.df$y <- spdf.df$lat 281 | 282 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df) 283 | } 284 | ``` 285 | 286 | Then we can plot the predicted abundance: 287 | 288 | ```{r plotpred} 289 | library(ggplot2) 290 | library(viridis) 291 | library(sp) 292 | library(rgeos) 293 | library(rgdal) 294 | library(maptools) 295 | 296 | # projection string 297 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 298 | # need sp to transform the data 299 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4) 300 | 301 | p <- ggplot() + 302 | # abundance 303 | grid_plot_obj(pred_qp, "N", pred.polys) + 304 | # survey lines 305 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 306 | # observations 307 | geom_point(aes(x, y, size=count), 308 | data=mexdolphins[mexdolphins$count>0,], 309 | colour="red", alpha=I(0.7)) + 310 | # make the coordinate system fixed 311 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 312 | xlim = range(mexdolphins$x)) + 313 | # use the viridis colourscheme, which is more colourblind-friendly 314 | scale_fill_viridis() + 315 | # labels 316 | labs(fill="Predicted\ndensity",x="x",y="y",size="Count") + 317 | # keep things simple 318 | theme_minimal() 319 | print(p) 320 | ``` 321 | 322 | If you have the `maps` package installed you can also try the following plot the coastline too: 323 | 324 | ```{r maptoo} 325 | library(maps) 326 | map_dat <- map_data("usa") 327 | map_sp <- SpatialPoints(map_dat[,c("long","lat")]) 328 | 329 | # give the sp object a projection 330 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84") 331 | # re-project 332 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4) 333 | map_dat$x <- map_sp.t$long 334 | map_dat$y <- map_sp.t$lat 335 | 336 | p <- p + geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 337 | 338 | print(p) 339 | ``` 340 | 341 | ## Comparing predictions from soap and thin plate splines 342 | 343 | The `soap_preddata` frame has a much larger prediction grid that covers a much wider area. 344 | 345 | Predicting using the soap film smoother is exactly the same as for the other smoothers: 346 | 347 | ```{r soap-pred} 348 | soap_pred_N <- predict(dolphins_soap, soap_preddata, type="response") 349 | soap_pred_N <- cbind.data.frame(soap_preddata, N=soap_pred_N, model="soap") 350 | ``` 351 | 352 | Use the above as a template (along with `ggplot2::facet_wrap()`) to build a comparison plot with the predictions of the soap film and another model. 
353 | 354 | ```{r xy-soap-pred} 355 | dolphins_xy_q <- gam(count ~ s(x,y, bs="ts") + offset(off.set), 356 | data = mexdolphins, 357 | family = tw(), 358 | method = "REML") 359 | xy_pred_N <- predict(dolphins_xy_q, soap_preddata, type="response") 360 | xy_pred_N <- cbind.data.frame(soap_preddata, N=xy_pred_N, model="xy") 361 | all_pred_N <- rbind(soap_pred_N, xy_pred_N) 362 | 363 | p <- ggplot() + 364 | # abundance 365 | geom_tile(aes(x=x, y=y, width=sqrt(area), height=sqrt(area), fill=N), data=all_pred_N) + 366 | # facet it! 367 | facet_wrap(~model) + 368 | # make the coordinate system fixed 369 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 370 | xlim = range(mexdolphins$x)) + 371 | # use the viridis colourscheme, which is more colourblind-friendly 372 | scale_fill_viridis() + 373 | # labels 374 | labs(fill="Predicted\ndensity", x="x", y="y") + 375 | # keep things simple 376 | theme_minimal() + 377 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 378 | print(p) 379 | ``` 380 | 381 | ## Plotting uncertainty 382 | 383 | **Exercise** 384 | 385 | We can use the `se.fit=TRUE` argument to get the per-cell standard errors, then divide these through by the abundances to get a coefficient of variation (CV) per cell. We can then plot that using the same technique as above. Try this out below. 386 | 387 | Note that the resulting object from setting `se.fit=TRUE` is a list with two elements. 388 | 389 | ```{r CVmap} 390 | 391 | ``` 392 | 393 | Compare the uncertainties of the different models you've fitted so far. 394 | 395 | ## `lpmatrix` magic 396 | 397 | Now for some `lpmatrix` magic. We said before that the `lpmatrix` maps the parameters onto the predictions. Let's show that's true: 398 | 399 | ```{r lppred} 400 | # make the Lp matrix 401 | lp <- predict(dolphins_depth, preddata, type="lpmatrix") 402 | # get the linear predictor 403 | lin_pred <- lp %*% coef(dolphins_depth) 404 | # apply the link and multiply by the offset as lpmatrix ignores this 405 | pred <- preddata$area * exp(lin_pred) 406 | # all the same? 407 | all.equal(pred[,1], as.numeric(pred_qp), check.attributes=FALSE) 408 | ``` 409 | 410 | What else can we do? We can also grab the uncertainty for the sum of the predictions: 411 | 412 | ```{r lpNvar} 413 | # extract the variance-covariance matrix 414 | vc <- vcov(dolphins_depth) 415 | 416 | # reproject the var-covar matrix to be on the linear predictor scale 417 | lin_pred_var <- tcrossprod(lp %*% vc, lp) 418 | 419 | # pre and post multiply by the derivatives of the link, evaluated at the 420 | # predictions -- since the link is exp, we just use the predictions 421 | pred_var <- matrix(pred,nrow=1) %*% lin_pred_var %*% matrix(pred,ncol=1) 422 | 423 | # we can then calculate a total CV 424 | sqrt(pred_var)/sum(pred) 425 | ``` 426 | 427 | As you can see, the `lpmatrix` can be very useful! 428 | 429 | # Extra credit 430 | 431 | - Experiment with `vis.gam` (watch out for the `view=` option) and plot the 2D smooths you've fitted. Check out the `too.far=` agument. 432 | - Use the randomized quantile residuals plot to inspect the models you fitted above. What are the differences you see between them and the deviance residuals you see above? Which model would you choose? 433 | - Redo the side-by-side soap film and another smoother plot, but showing the coefficient of variation. What difference does the soap film make? 434 | 435 | # References 436 | 437 | - Foster, S. D., & Bravington, M. V. (2012). 
A Poisson–Gamma model for analysis of ecological non-negative continuous data. Environmental and Ecological Statistics, 20(4), 533–552. http://doi.org/10.1007/s10651-012-0233-0 438 | - Miller, D. L., Burt, M. L., Rexstad, E. A., & Thomas, L. (2013). Spatial models for distance sampling data: recent developments and future directions. Methods in Ecology and Evolution, 4(11), 1001–1010. http://doi.org/10.1111/2041-210X.12105 439 | - Shono, H. (2008). Application of the Tweedie distribution to zero-catch data in CPUE analysis. Fisheries Research, 93(1-2), 154–162. http://doi.org/10.1016/j.fishres.2008.03.006 440 | - Wood, S. N. (2003). Thin plate regression splines. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 65(1), 95–114. 441 | - Wood, S. N., Bravington, M. V., & Hedley, S. L. (2008). Soap film smoothing. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 70(5), 931–955. http://doi.org/10.1111/j.1467-9868.2008.00665.x 442 | 443 | -------------------------------------------------------------------------------- /example-spatio-temporal data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Spatio-temporal dynamics is for the birds" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | --- 10 | 11 | ## 1. Background 12 | We've talked about modelling spatial trends in the main workshop and in an 13 | extended example, and about the trials and tribulations of time series data 14 | in another extended example. Here, we're going to combine these, to look at 15 | one of the most challenging types of data for linear models to deal with: 16 | spatiotemporal data. A spatiotemporal dataset is one where the outcome of 17 | interest has been observed at multiple locations and at multiple points in time, 18 | and we want to know how temporal trends change across the landscape (or, from 19 | the other view, how spatial patterns are changing over time). 20 | 21 | We don't have time to really get into the nuts-and-bolts issues around fitting 22 | complex spatio-temporal models. This exercise is instead designed to give you 23 | a feel for how to use `mgcv` to model complex interactions between variables 24 | with different natural scales, and how to include simple random effects in 25 | gam models. 26 | 27 | ## 2. Key concepts and functions: 28 | Here are a few key ideas and R functions you should familiarize yourself with if 29 | you haven't already encountered them before. The R functions will be 30 | highlighted `like this`; use `?function_name` to look them up. 31 | 32 | ### Tensor product (`te`) smooths 33 | This tutorial will rely on tensor product smooths. These are used to model 34 | nonlinear interactions between variables that have different natural scales. The 35 | interaction smooths we saw at the start of the workshop were thin plate splines 36 | (`y~s(x1,x2, bs="tp")`). These splines assume that the relationship between y and 37 | x1 should be roughly as "wiggly" as the relationship between y and x2, and have only 38 | one smoothing parameter. This makes sense if x1 and x2 are similar variables 39 | (say latitude and longitude, or arm length and leg length), but if we're looking 40 | at the interacting effects of, say, temperature and body weight, a single 41 | smoothing term doesn't make sense. 42 | 43 | This is where tensor product smooths come in. 
These assume that each variable of 44 | interest should have its own smooth term. They create multi-dimensional smooth 45 | terms by multiplying the values of each one-dimensional basis by each value of 46 | the second. This is a pretty complicated idea, and hard to describe effectively 47 | without more math than we want to go into here. But you can get the idea from a 48 | simple example. Let's say we have two basis functions, one for variable x1 and 49 | one for variable x2. The function for x1 is linear, and the one for 50 | x2 is bell-curve shaped. The basis functions will look like this: 51 | 52 | ```{r, echo=T,tidy=F,results="hide", include=T, message=FALSE,highlight=TRUE} 53 | library(mgcv) 54 | library(ggplot2) 55 | library(dplyr) 56 | library(viridis) 57 | set.seed(1) 58 | n=25 59 | dat = as.data.frame(expand.grid(x1 = seq(-1,1,length.out = n), 60 | x2= seq(-1,1,length=n))) 61 | dat$x1_margin = -dat$x1 62 | dat$x2_margin = dnorm(dat$x2,0,0.5) 63 | dat$x2_margin = dat$x2_margin - mean(dat$x2_margin) 64 | 65 | layout(matrix(1:2,ncol=2)) 66 | plot(x1_margin~x1,type="l", data= dat%>%filter(!duplicated(x1))) 67 | plot(x2_margin~x2,type="l", data= dat%>%filter(!duplicated(x2))) 68 | layout(1) 69 | ``` 70 | 71 | However, the tensor product of the two variables is more complicated. Note that 72 | when x1 is below zero, the effect of x2 is positive at middle values and negative 73 | at extreme values, and vice versa when x1 is above zero. 74 | ```{r, echo=T,tidy=F,results="hide", include=T, message=FALSE,highlight=TRUE} 75 | dat$x1x2_tensor = dat$x1_margin*dat$x2_margin 76 | ggplot(aes(x=x1,y=x2,fill=x1x2_tensor),data=dat)+ 77 | geom_tile()+ 78 | scale_x_continuous(expand=c(0,0))+ 79 | scale_y_continuous(expand=c(0,0))+ 80 | scale_fill_viridis()+ 81 | theme_bw() 82 | ``` 83 | 84 | A full tensor product smooth will consist of several basis functions constructed 85 | like this. The resulting smooth is then fitted using the standard gam model, but 86 | with multiple penalty terms (one for each term in the tensor product). Each 87 | penalty penalizes how much the function deviates from a smooth function in the 88 | direction of one of the variables, averaged over all the other variables. This 89 | allows us to have the function be more or less wiggly in one direction than 90 | another. 91 | 92 | They are built using the `te()` function, like this: 93 | 94 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 95 | # We'll recycle our fake marginal and tensor products as the expected value for 96 | # a simulation. Waste not, want not! 97 | dat$y = rnorm(n^2, dat$x1x2_tensor+dat$x1_margin+dat$x2_margin,0.5) 98 | 99 | te_demo = gam(y~te(x1, x2, k = 4^2),data=dat,method="REML") # k sets the basis dimension for the tensor product smooth (see ?te) 100 | plot.gam(te_demo,scheme = 2) 101 | summary(te_demo) 102 | print(te_demo$sp) 103 | #These are the estimated smoothing parameters for the two dimensions. Note that the smooth for 104 | #x1 (the linear margin) is way smoother than the one for x2 105 | ``` 106 | 107 | 108 | 109 | ### Tensor interaction smooths 110 | There are many cases when we want an estimate of both the average effect of each 111 | variable as well as the interaction. This is tough to extract from just using 112 | `te()` terms, but if we try to combine the two (say, with 113 | `y~s(x1)+s(x2)+te(x1,x2)`) the result is often numerically unstable; it won't 114 | give reliable, consistent separation into the terms we want. Instead, we use `ti` 115 | terms for the interaction. 
The `ti` terms are set up so the average effect of 116 | each variable in the smooth is already excluded from the interaction (and they'll 117 | have fewer basis functions than the equivalent `te` term). See `?te` if you're 118 | curious about how `mgcv` sets these up; for now, we'll just use them as a black 119 | box. 120 | 121 | This is how you use them in practice: 122 | 123 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 124 | # We'll recycle our fake tensor product as the expected value for a simulation 125 | # Waste not, want not! 126 | 127 | ti_demo = gam(y~s(x1,k=4)+s(x2,k=4) +ti(x1, x2, k = 4^2), 128 | data=dat,method="REML") 129 | layout(matrix(1:3,nrow=1)) 130 | plot.gam(ti_demo,scheme = 2) 131 | layout(1) 132 | summary(ti_demo) 133 | print(ti_demo$sp) 134 | #These are the estimated smoothing parameters. Note that the smooth for 135 | #x1 (the linear basis) is way smoother than the one for x2 136 | ``` 137 | 138 | Note that it does a pretty good job of extracting our assumed marginal 139 | basis functions (linear and Gaussian) and the tensor product into three clearly 140 | separate functions. 141 | 142 | NOTE: Never use `ti` on its own without including the marginal smooths! This 143 | is for the same reason you shouldn't exclude individual linear terms but include 144 | interactions in `glm` models. It virtually never makes sense to talk about a function with a strong interaction but no marginal effects. 145 | 146 | ### Random effects smooths 147 | The last smooth type we'll use in this exercise is the random effects smooth. 148 | This is just what it sounds like: it's a smooth for factor terms, with one 149 | coefficient for each level of the group (these act as the basis functions here), and 150 | a penalty term on the squared value of the coefficients to draw them towards the 151 | global mean. This is exactly the same mechanism that packages like `lme4` and 152 | `nlme` use to calculate random effects. In fact, there's a very deep connection 153 | between gams and mixed effects models, but we won't go into that here. 154 | 155 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 156 | n= 10 157 | n_groups= 10 158 | group_means = rnorm(n_groups, 0, 2) 159 | dat = data_frame(group = rep(1:n_groups, each=n), 160 | group_f = factor(group)) 161 | dat$y = rnorm(n*n_groups, 3+group_means[dat$group],2) 162 | re_demo = gam(y~s(group_f, bs="re"), 163 | data=dat,method="REML") 164 | plot.gam(re_demo,scheme = 2) 165 | summary(re_demo) 166 | print(re_demo$sp) 167 | ``` 168 | 169 | It's important to know when using `bs="re"` terms whether the variable you're 170 | giving it is numeric or a factor. `mgcv` will not raise an error if you give 171 | numeric variables to `bs="re"` terms. Instead, it'll just fit a linear model with a 172 | penalty on the slope of the line between x and y. Try it with the data above to 173 | see this, by replacing `s(group_f, bs="re")` with `s(group, bs="re")`. `bs="re"` is 174 | set up like this to allow you to use random effects smooths to fit things like 175 | varying-slope random effects models, but make sure you know what you're doing 176 | when venturing into this. 177 | 178 | ## 3. Spatiotemporal models 179 | Now that we've covered the key smoothers, let's take a look at how we'd use them 180 | to model spatiotemporal data. For this section, we'll use data from the Florida 181 | routes of the Breeding Bird Survey. 
If you haven't heard of it before, this is a 182 | long-term monitoring program tracking bird abundance and diversity across the US 183 | and Canada. It consists of many (over 4000) 24.5-mile-long routes. Trained 184 | observers survey these routes each year during peak breeding season, and every 0.5 185 | miles, they stop and count and identify all the birds they can see or hear within a 186 | 0.25 mile radius of the stop. This is one of the best long-term data sets we 187 | have on bird diversity and abundance, and has been used in countless papers. 188 | 189 | Here, we're going to use this data to answer a couple of relatively simple 190 | questions: how does bird species richness vary across the state of Florida, has 191 | average richness changed over time, and has the spatial pattern of richness 192 | varied? We obtained data from [this 193 | site](https://www.pwrc.usgs.gov/bbs/RawData/), and summarized it into the total 194 | number of different species and the total number of individual birds observed on 195 | each route in each year in the state of Florida. 196 | 197 | First, we'll load the data, and quickly plot it: 198 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 199 | library(maps) 200 | florida_map = map_data("state","florida")%>% 201 | transmute(Longitude= long, Latitude = lat, order=order) 202 | florida_birds = read.csv("data/bbs_data/bbs_florida_richness.csv") 203 | head(florida_birds) 204 | 205 | ggplot(aes(x=Year,y=Richness),data=florida_birds)+geom_point()+ 206 | theme_bw() 207 | ggplot(aes(x=Longitude,y=Latitude,col=Richness), data=florida_birds) + 208 | geom_polygon(data=florida_map,fill=NA, col="black")+ 209 | geom_point()+theme_bw() 210 | ``` 211 | 212 | It looks like richness may have declined toward the end of the time series, and 213 | there seems to be somewhat of a north-south richness gradient, but we need to 214 | model it to really tease these issues out. 215 | 216 | ### Modelling space and time separately 217 | We'll first take a stab at this to determine if these mean trends actually come 218 | out of the model. As richness is a count variable 219 | we'll use a Poisson gam to model it. This means 220 | that we're assuming mean richness is the product of a smooth 221 | function of location and of date, with no interactions; 222 | E(richness) = exp(f(year))*exp(g(location)). 223 | 224 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 225 | richness_base_model = gam(Richness~ s(Longitude,Latitude)+s(Year), 226 | data=florida_birds,family=poisson,method="REML") 227 | layout(matrix(1:2,ncol=2)) 228 | plot(richness_base_model,scheme=2) 229 | layout(1) 230 | summary(richness_base_model) 231 | ``` 232 | 233 | Note that we do have a thin-plate interaction between longitude and latitude. 234 | This assumes that the gradient should be equally smooth latitudinally and 235 | longitudinally. This may not be totally accurate, as the function seems to 236 | change faster latitudinally, but we'll ignore that until the exercises at the 237 | bottom. The model seems to be doing pretty well; it captures the original patterns 238 | we saw, and explains a not-inconsiderable 45% of the deviance. 
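Before adding more structure, it's worth a quick check that the default basis dimensions (`k`) are large enough here. A minimal check, using `gam.check()` from the model checking section (low k-index values and p-values suggest a smooth may need a larger `k`):

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# quick basis-size check for the base model: prints k', edf, k-index and
# p-values for each smooth, plus the standard residual plots
gam.check(richness_base_model)
```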
239 | 240 | Let's see if there are any remaining spatiotemporal patterns though: 241 | 242 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 243 | source("code_snippets/quantile_resid.R") 244 | 245 | florida_birds$base_resid = rqresiduals(richness_base_model) 246 | 247 | ggplot(aes(Longitude, Latitude), 248 | data= florida_birds %>% filter(Year%%6==0))+ #only look at every 6th year 249 | geom_point(aes(color=base_resid))+ 250 | geom_polygon(data=florida_map,fill=NA, col="black")+ 251 | scale_color_viridis()+ 252 | facet_wrap(~Year)+ 253 | theme_bw() 254 | 255 | ``` 256 | There's definitely still a pattern in the residuals, such as higher than 257 | expected diversity in the mid-latitudes during the 1990s. 258 | 259 | ### Joint models of space and time 260 | The next step is to determine if there is a significant interaction. This is 261 | where tensor product splines come in. Here, we'll use the `ti` formulation, as 262 | it allows us to directly test whether the interaction significantly improves our 263 | model. 264 | 265 | Something important to note is the `d=c(2,1)` argument to `ti`. This tells the 266 | function that the smooth should consist of a tensor product between a 267 | 2-dimensional smooth (lat-long) and a 1-dimensional term (Year). 268 | 269 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 270 | richness_ti_model = gam(Richness~ s(Longitude,Latitude)+s(Year) + 271 | ti(Longitude, Latitude, Year, d=c(2,1)), 272 | data=florida_birds,family=poisson,method="REML") 273 | layout(matrix(1:2,ncol=2)) 274 | plot(richness_ti_model,scheme=2) 275 | layout(1) 276 | summary(richness_ti_model) 277 | 278 | ``` 279 | This improves the model fit substantially, which you can see by directly 280 | comparing the models with `anova`: 281 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 282 | anova(richness_base_model,richness_ti_model,test = "Chisq") 283 | 284 | ``` 285 | 286 | 287 | The question then becomes: what does this actually mean? Where are we seeing the 288 | fastest and slowest rates of change in richness? It's often difficult to 289 | effectively show spatiotemporal changes, partly because it would require a 290 | 4-dimensional graph to really show what's going on. 
Still, we can look at a 291 | couple different views by using `predict` effectively: 292 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 293 | #First we'll create gridded data 294 | predict_richess = expand.grid( 295 | Latitude= seq(min(florida_birds$Latitude), 296 | max(florida_birds$Latitude), 297 | length=50), 298 | Longitude = seq(min(florida_birds$Longitude), 299 | max(florida_birds$Longitude), 300 | length=50), 301 | Year = seq(1970,2015,by=5) 302 | ) 303 | # This now selects only that data that falls within Florida's border 304 | predict_richess = predict_richess[with(predict_richess, 305 | inSide(florida_map, Latitude,Longitude)),] 306 | predict_richess$model_fit = predict(richness_ti_model, 307 | predict_richess,type = "response") 308 | ggplot(aes(Longitude, Latitude, fill= model_fit), 309 | data=predict_richess)+ 310 | geom_tile()+ 311 | facet_wrap(~Year,nrow=2)+ 312 | scale_fill_viridis("# of species")+ 313 | theme_bw(10) 314 | ``` 315 | 316 | 317 | Another way of looking at this is by estimating the rate of change 318 | at each location at each time point, and figuring out which locations 319 | are changing most quickly: 320 | 321 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 322 | 323 | predict_richess$model_change =predict(richness_ti_model, 324 | predict_richess%>%mutate(Year=Year+1), 325 | type = "response") - 326 | predict_richess$model_fit 327 | ggplot(aes(Longitude, Latitude, fill= model_change), 328 | data=predict_richess)+ 329 | geom_tile()+ 330 | facet_wrap(~Year,nrow=2)+ 331 | scale_fill_gradient2("Rate of change\n(species per year)")+ 332 | theme_bw(10) 333 | ``` 334 | 335 | ### Accounting for local heterogeneity with `bs="re"` 336 | Examining this, it seems to indicate that richness fluctuated across the state 337 | throughout the study period, but then began sharply declining state-wide in the 338 | 2000's. (Disclaimer: I don't know enough about either the BBS or Florida to say 339 | how much of this decline is real! Please don't quote me in newspapers saying 340 | biodiversity is collapsing in Florida). 341 | 342 | There is still one persistent issue: viewing the maps of richness, there appear 343 | to be small hotspots and coldspots of richness across the state. This may be due 344 | to a factor I've ignored so far: the data is collected in specific, repeated 345 | routes. If it's just easier to spot birds on one route, or there's a lot of 346 | local nesting habitat there, it will have higher richness than expected over the 347 | whole study. Adjusting for these kinds of hot and cold spots will force our 348 | model to be overly wiggly. Further, not all routes are sampled every year, so we 349 | want to make sure that our trends don't just result from less diverse routes 350 | getting sampled later in the survey. 351 | 352 | To try and account for this effect (and other grouped local factors), we 353 | can use random effects smooths. 
That would make our model look like this: 354 | 355 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 356 | florida_birds$Route_f = factor(florida_birds$Route) 357 | richness_ti_re_model = gam(Richness~ s(Longitude,Latitude)+s(Year) + 358 | ti(Longitude, Latitude, Year, d=c(2,1))+ 359 | s(Route_f, bs="re"), 360 | data=florida_birds,family=poisson,method="REML") 361 | layout(matrix(1:3,nrow=1)) 362 | plot(richness_ti_re_model,scheme=2) 363 | layout(1) 364 | 365 | summary(richness_ti_re_model) 366 | anova(richness_ti_model,richness_ti_re_model) 367 | ``` 368 | 369 | We can view the same plots as before with our new model: 370 | ```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE} 371 | predict_richess$Route_f = 1 372 | predict_richess$model_fit = predict(richness_ti_re_model, 373 | predict_richess,type = "response") 374 | predict_richess$model_change =predict(richness_ti_re_model, 375 | predict_richess%>%mutate(Year=Year+1), 376 | type = "response") - 377 | predict_richess$model_fit 378 | 379 | ggplot(aes(Longitude, Latitude, fill= model_fit), 380 | data=predict_richess)+ 381 | geom_tile()+ 382 | facet_wrap(~Year,nrow=2)+ 383 | scale_fill_viridis("# of species")+ 384 | theme_bw(10) 385 | 386 | ggplot(aes(Longitude, Latitude, fill= model_change), 387 | data=predict_richess)+ 388 | geom_tile()+ 389 | facet_wrap(~Year,nrow=2)+ 390 | scale_fill_gradient2("Rate of change\n(species per year)")+ 391 | theme_bw(10) 392 | ``` 393 | This definitely seems to have levelled off some of the hot spots from before, 394 | implying that bird richness is more evenly distributed than our prior model 395 | suggested. Still, the large-scale trends remain the same as before. 396 | 397 | ## 4. Exercises 398 | 399 | 1. Given that most of the variability appears to be latitudinal, try to adapt 400 | the tensor smoothers so that latitude, longitude, and year all have their own 401 | penalty terms. How does this change model fit? 402 | 403 | 2. Following up on the prior 404 | exercise: build a model that allows you to explicitly calculate the marginal 405 | latitudinal gradient in diversity, and to determine how the latitudinal gradient 406 | has changed over time. 407 | 408 | 3. We know from basic sampling theory that if you sample 409 | fewer individuals over time, you also expect to find lower richness, even if 410 | actual richness stays unchanged. What does the spatiotemporal pattern of 411 | abundance look like? Does adding bird abundance as a predictor explain any of the 412 | patterns in richness? 413 | -------------------------------------------------------------------------------- /mgcv-esa-workshop.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /project-requirements.md: -------------------------------------------------------------------------------- 1 | ### Title of the session 2 | 3 | The mgcv package as a one-stop-shop for fitting non-linear ecological models 4 | 5 | ### Description of the session (appears online only; 250 words max.)
6 | 7 | To address the increase in both quantity and complexity of available data, ecologists require flexible, robust statistical models, as well as software to perform such analyses. This workshop will focus on how a single tool, the mgcv package for the R language, can be used to fit models to data from a wide range of sources. 8 | 9 | mgcv is one of the most popular packages for modelling non-linear relationships. However, many users do not know how versatile and powerful a tool it can be. This workshop will focus on teaching participants how to use mgcv in a wide variety of situations (including spatio-temporal, zero-inflated, heavy-tailed, time series, and survival data) and advanced use of mgcv (fitting smooth interactions, seasonal effects, spatial effects, Markov random fields and varying-coefficient models). 10 | 11 | The workshop will give paricipants an understanding of: (a) practical elements of smoothing theory, with a focus on why they would choose to use different types of smoothers (b) model checking and selection (c) the range of modelling possibilities using mgcv. Participants will be assumed to be familiar with the basics of R (loading/manipulating data, functions, and plotting) and regression in R (lm() and glm()). The organizers have extensive practical experience with ecological statistics and modelling using mgcv. 12 | 13 | ### Summary sentence (appears in print only; 50 word max.) 14 | 15 | mgcv is one of the most popular R packages for non-linear modelling, but many users do not know how versatile and powerful it is. This workshop will focus on using mgcv in a wide variety of situations common in ecology, and advanced use of mgcv for model fitting. 16 | 17 | ### Name and contact information (affiliation, email) for the lead organizer and any co-organizers 18 | 19 | Eric Pedersen 20 | Center for Limnology 21 | 680 North Park Street 22 | University of Wisconsin-Madison 23 | Madison, WI, USA 53704 24 | eric.pedersen@wisc.edu 25 | 26 | David L Miller 27 | Northeast Fisheries Science Center 28 | National Oceanic and Atmospheric Administration 29 | 166 Water Street 30 | Woods Hole, MA 02543 31 | 32 | Gavin L. Simpson 33 | Institute of Environmental Change and Society 34 | University of Regina 35 | 3737 Wascana Parkway 36 | Regina 37 | Saskatchewan 38 | Canada 39 | S4S 0A2 40 | 41 | ### Minimum and maximum number of participants (to assist in room assignment). 42 | 10 minimum, 30 maximum. 43 | 44 | ### Requested scheduling 45 | Saturday (August 6th), all day. 46 | 47 | ### Any additional A/V equipment (standard A/V setup is a screen, LCD projector, and laptop) 48 | We will need the standard setup, as well as wifi access for all participants. 49 | 50 | ### Room set-up desired: theater, conference, hollow square, rounds, other 51 | A conference room. 52 | 53 | ### Food and beverage requests 54 | No food or beverages requested. 55 | 56 | ### Underwriting of workshop costs by a group or agency 57 | None of the costs will be underwritten. 58 | 59 | ### Is the session intended to be linked to a scientific session? 60 | No. 61 | 62 | ### Is the session intended to be linked to a business meeting or mixer? 63 | No. 64 | 65 | ### Describe any known (workshop/event) scheduling conflicts (what should it follow/precede/not conflict with?) 66 | We would require a Saturday time slot, as one of the organizers (Gavin Simpson) is also organizing 67 | the Advanced Vegan Workshop Proposal on Sunday AM and attending the ESA Council meeting on Sunday PM. 
68 | -------------------------------------------------------------------------------- /project_description.md: -------------------------------------------------------------------------------- 1 | # The mgcv package as a one-stop-shop for fitting non-linear ecological models 2 | 3 | To address the increase in both quantity and complexity of available data, ecologists require flexible, robust statistical models, as well as software to perform such analyses. This workshop will focus on how a single tool, the mgcv package for the R language, can be used to fit models to data from a wide range of sources. 4 | 5 | mgcv is one of the most popular packages for modelling non-linear relationships. However, many users do not know how versatile and powerful a tool it can be. This workshop will focus on teaching participants how to use mgcv in a wide variety of situations (including spatio-temporal, zero-inflated, heavy-tailed, time series, and survival data) and advanced use of mgcv (fitting smooth interactions, seasonal effects, spatial effects, Markov random fields and varying-coefficient models). 6 | 7 | The workshop will give paricipants an understanding of: (a) practical elements of smoothing theory, with a focus on why they would choose to use different types of smoothers (b) model checking and selection (c) the range of modelling possibilities using mgcv. Participants will be assumed to be familiar with the basics of R (loading/manipulating data, functions, and plotting) and regression in R (lm() and glm()). The organizers have extensive practical experience with ecological statistics and modelling using mgcv. 8 | -------------------------------------------------------------------------------- /slides/00-preamble.Rpres: -------------------------------------------------------------------------------- 1 | The mgcv package as a one-stop-shop for fitting non-linear ecological models 2 | ============================ 3 | author: David L Miller, Eric Pedersen, and Gavin L Simpson 4 | css: custom.css 5 | transition: none 6 | 7 | 8 | Good morning! 9 | ============== 10 | type: section 11 | 12 | 13 | 14 | Logistics 15 | ========== 16 | - where is the coffee? 17 | - where is the bathroom? 18 | - what if there is a fire? 19 | 20 | Who are we? 21 | =========== 22 | type:section 23 | 24 | 25 | Eric Pedersen 26 | ============= 27 | - Populuation, community, and movement ecologist 28 | - (sort of) Fisheries (sort of) ecologist 29 | - Movement and life-history modelling 30 | - Connecting theoretical models to data with statistics 31 | - Unreasonably obsessed with smoothing and penalization as a solution to all of life's problems 32 | 33 | Gavin Simpson 34 | ============= 35 | 36 | - (Palaeo)ecologist interested in community dynamics & environmental change 37 | - (Palaeo)limnologist interested in environmental change 38 | - Self-taught not-statistician 39 | - Ecological stats software (vegan plus some palaeo stuff) 40 | - To me most things are a model I can (will!) fit with *mgcv* 41 | 42 | David L Miller 43 | =============== 44 | 45 | - Cetacean distribution modelling 46 | - Spatial modelling (esp. model checking) 47 | - Distance sampling [distancesampling.org](http://distancesampling.org) 48 | - Statistical software (`Distance`, `mrds`, `dsm`) 49 | - Trying to shoehorn all projects into `mgcv` 50 | 51 | 52 | Who are you? 53 | ============ 54 | type:section 55 | 56 | 57 | What is the structure of the day? 
58 | ================================= 59 | type:section 60 | 61 | Timing 62 | ====== 63 | 64 | - 8am to 5pm 65 | - Morning session: 8am-12pm 66 | - Aternoon session: 1pm-5pm 67 | 68 | (Approximate) schedule 69 | ====================== 70 | 71 | Time | Thing 72 | -----------|-------------------- 73 | 0800-0815 | Intro 74 | 0815-0945 | Generalized Additive Models 75 | 0945-1000 | Coffee 76 | 1000-1045 | Model checking 77 | 1045-1130 | Model selection 78 | 1130-1200 | Beyond the exponential family 79 | 1200-1300 | Lunch 80 | 1300-1700 | Extended examples/demos 81 | 82 | 83 | Foraging 84 | ======== 85 | 86 | 87 | Wifi 88 | ==== 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /slides/01-intro.Rpres: -------------------------------------------------------------------------------- 1 | Generalized Additive Models 2 | ============================ 3 | author: David L Miller 4 | css: custom.css 5 | transition: none 6 | 7 | 8 | Overview 9 | ========= 10 | 11 | - What is a GAM? 12 | - What is smoothing? 13 | - How do GAMs work? (*Roughly*) 14 | - Fitting and plotting simple models 15 | 16 | ```{r setup, include=FALSE} 17 | library(knitr) 18 | library(viridis) 19 | library(ggplot2) 20 | library(reshape2) 21 | library(animation) 22 | library(mgcv) 23 | opts_chunk$set(cache=TRUE, echo=FALSE) 24 | ``` 25 | 26 | What is a GAM? 27 | =============== 28 | type:section 29 | 30 | Generalized Additive Models 31 | ============================ 32 | 33 | - Generalized: many response distributions 34 | - Additive: terms **add** together 35 | - Models: well, it's a model... 36 | 37 | To GAMs from GLMs and LMs 38 | ============================= 39 | type:section 40 | 41 | 42 | (Generalized) Linear Models 43 | ============================= 44 | 45 | Models that look like: 46 | 47 | $$ 48 | y_i = \beta_0 + x_{1i}\beta_1 + x_{2i}\beta_2 + \ldots + \epsilon_i 49 | $$ 50 | 51 | (describe the response, $y_i$, as linear combination of the covariates, $x_{ji}$, with an offset) 52 | 53 | We can make $y_i\sim$ any exponential family distribution (Normal, Poisson, etc). 54 | 55 | Error term $\epsilon_i$ is normally distributed (usually). 56 | 57 | Why bother with anything more complicated?! 58 | ============================= 59 | type:section 60 | 61 | Is this linear? 62 | ============================= 63 | 64 | ```{r islinear, fig.width=12, fig.height=6} 65 | set.seed(2) ## simulate some data... 66 | dat <- gamSim(1, n=400, dist="normal", scale=1, verbose=FALSE) 67 | dat <- dat[,c("y", "x0", "x1", "x2", "x3")] 68 | p <- ggplot(dat,aes(y=y,x=x1)) + 69 | geom_point() + 70 | theme_minimal() 71 | print(p) 72 | ``` 73 | 74 | Is this linear? Maybe? 75 | ============================= 76 | 77 | ```{r eval=FALSE, echo=TRUE} 78 | lm(y ~ x1, data=dat) 79 | ``` 80 | 81 | 82 | ```{r maybe, fig.width=12, fig.height=6} 83 | p <- ggplot(dat, aes(y=y, x=x1)) + geom_point() + 84 | theme_minimal() 85 | print(p + geom_smooth(method="lm")) 86 | ``` 87 | 88 | 89 | 90 | What can we do? 91 | ============================= 92 | type:section 93 | 94 | Adding a quadratic term? 95 | ============================= 96 | 97 | ```{r eval=FALSE, echo=TRUE} 98 | lm(y ~ x1 + poly(x1, 2), data=dat) 99 | ``` 100 | 101 | ```{r quadratic, fig.width=12, fig.height=6} 102 | p <- ggplot(dat, aes(y=y, x=x1)) + geom_point() + 103 | theme_minimal() 104 | print(p + geom_smooth(method="lm", formula=y~x+poly(x, 2))) 105 | ``` 106 | 107 | 108 | 109 | 110 | Is this sustainable? 
111 | ============================= 112 | 113 | - Adding in quadratic (and higher terms) *can* make sense 114 | - This feels a bit *ad hoc* 115 | - Better if we had a **framework** to deal with these issues? 116 | 117 | ```{r ruhroh, fig.width=12, fig.height=6} 118 | p <- ggplot(dat, aes(y=y, x=x2)) + geom_point() + 119 | theme_minimal() 120 | print(p + geom_smooth(method="lm", formula=y~x+poly(x, 2))) 121 | ``` 122 | 123 | 124 | [drumroll] 125 | ============================= 126 | type:section 127 | 128 | 129 | What does a model look like? 130 | ============================= 131 | 132 | $$ 133 | y_i = \beta_0 + \sum_j s_j(x_{ji}) + \epsilon_i 134 | $$ 135 | 136 | where $\epsilon_i \sim N(0, \sigma^2)$, $y_i \sim \text{Normal}$ (for now) 137 | 138 | Remember that we're modelling the mean of this distribution! 139 | 140 | Call the above equation the **linear predictor** 141 | 142 | Okay, but what about these "s" things? 143 | ==================================== 144 | right:55% 145 | 146 | ```{r smoothdat, fig.width=8, fig.height=8} 147 | 148 | spdat <- melt(dat, id.vars = c("y")) 149 | p <- ggplot(spdat,aes(y=y,x=value)) + 150 | geom_point() + 151 | theme_minimal() + 152 | facet_wrap(~variable, nrow=2) 153 | print(p) 154 | ``` 155 | *** 156 | - Think $s$=**smooth** 157 | - Want to model the covariates flexibly 158 | - Covariates and response not necessarily linearly related! 159 | - Want some "wiggles" 160 | 161 | Okay, but what about these "s" things? 162 | ==================================== 163 | right:55% 164 | 165 | ```{r wsmooths, fig.width=8, fig.height=8} 166 | p <- p + geom_smooth() 167 | print(p) 168 | ``` 169 | *** 170 | - Think $s$=**smooth** 171 | - Want to model the covariates flexibly 172 | - Covariates and response not necessarily linearly related! 173 | - Want some "wiggles" 174 | 175 | What is smoothing? 176 | =============== 177 | type:section 178 | 179 | 180 | Straight lines vs. interpolation 181 | ================================= 182 | right:55% 183 | 184 | ```{r wiggles, fig.height=8, fig.width=8} 185 | library(mgcv) 186 | # hacked from the example in ?gam 187 | set.seed(2) ## simulate some data... 
188 | dat <- gamSim(1,n=50,dist="normal",scale=0.5, verbose=FALSE) 189 | dat$y <- dat$f2 + rnorm(length(dat$f2), sd = sqrt(0.5)) 190 | f2 <- function(x) 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10-mean(dat$y) 191 | ylim <- c(-4,6) 192 | 193 | # fit some models 194 | b.justright <- gam(y~s(x2),data=dat) 195 | b.sp0 <- gam(y~s(x2, sp=0, k=50),data=dat) 196 | b.spinf <- gam(y~s(x2),data=dat, sp=1e10) 197 | 198 | curve(f2,0,1, col="blue", ylim=ylim) 199 | points(dat$x2, dat$y-mean(dat$y)) 200 | 201 | ``` 202 | *** 203 | - Want a line that is "close" to all the data 204 | - Don't want interpolation -- we know there is "error" 205 | - Balance between interpolation and "fit" 206 | 207 | Splines 208 | ======== 209 | 210 | - Functions made of other, simpler functions 211 | - **Basis functions** $b_k$, estimate $\beta_k$ 212 | - $s(x) = \sum_{k=1}^K \beta_k b_k(x)$ 213 | - Makes the math(s) much easier 214 | 215 | 216 | 217 | Design matrices 218 | =============== 219 | 220 | - We often write models as $X\boldsymbol{\beta}$ 221 | - $X$ is our data 222 | - $\boldsymbol{\beta}$ are parameters we need to estimate 223 | - For a GAM it's the same 224 | - $X$ has columns for each basis, evaluated at each observation 225 | - again, this is the linear predictor 226 | 227 | Measuring wigglyness 228 | ====================== 229 | 230 | - Visually: 231 | - Lots of wiggles == NOT SMOOTH 232 | - Straight line == VERY SMOOTH 233 | - How do we do this mathematically? 234 | - Derivatives! 235 | - (Calculus *was* a useful class afterall!) 236 | 237 | 238 | 239 | Wigglyness by derivatives 240 | ========================== 241 | 242 | ```{r wigglyanim, results="hide"} 243 | library(numDeriv) 244 | f2 <- function(x) 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10 - mean(dat$y) 245 | 246 | xvals <- seq(0,1,len=100) 247 | 248 | plot_wiggly <- function(f2, xvals){ 249 | 250 | # pre-calculate 251 | f2v <- f2(xvals) 252 | f2vg <- grad(f2,xvals) 253 | f2vg2 <- unlist(lapply(xvals, hessian, func=f2)) 254 | f2vg2min <- min(f2vg2) -2 255 | 256 | # now plot 257 | for(i in 1:length(xvals)){ 258 | par(mfrow=c(1,3)) 259 | plot(xvals, f2v, type="l", main="function", ylab="f") 260 | points(xvals[i], f2v[i], pch=19, col="red") 261 | 262 | plot(xvals, f2vg, type="l", main="derivative", ylab="df/dx") 263 | points(xvals[i], f2vg[i], pch=19, col="red") 264 | 265 | plot(xvals, f2vg2, type="l", main="2nd derivative", ylab="d2f/dx2") 266 | points(xvals[i], f2vg2[i], pch=19, col="red") 267 | polygon(x=c(0,xvals[1:i], xvals[i],f2vg2min), 268 | y=c(f2vg2min,f2vg2[1:i],f2vg2min,f2vg2min), col = "grey") 269 | 270 | ani.pause() 271 | } 272 | } 273 | 274 | saveGIF(plot_wiggly(f2, xvals), "wiggly.gif", interval = 0.2, ani.width = 800, ani.height = 400) 275 | ``` 276 | 277 | ![Animation of derivatives](wiggly.gif) 278 | 279 | What was that grey bit? 280 | ========================= 281 | 282 | $$ 283 | \int_\mathbb{R} \left( \frac{\partial^2 f(x)}{\partial^2 x}\right)^2 \text{d}x\\ 284 | $$ 285 | 286 | (Take some derivatives of the smooth and integrate them over $x$) 287 | 288 | (*Turns out* we can always write this as $\boldsymbol{\beta}^\text{T}S\boldsymbol{\beta}$, so the $\boldsymbol{\beta}$ is separate from the derivatives) 289 | 290 | (Call $S$ the **penalty matrix**) 291 | 292 | Making wigglyness matter 293 | ========================= 294 | 295 | - $\boldsymbol{\beta}^\text{T}S\boldsymbol{\beta}$ measures wigglyness 296 | - "Likelihood" measures closeness to the data 297 | - Penalise closeness to the data... 
298 | - Use a **smoothing parameter** to decide on that trade-off... 299 | - $\lambda \beta^\text{T}S\beta$ 300 | - Estimate the $\beta_k$ terms but penalise objective 301 | - "closeness to data" + penalty 302 | 303 | Smoothing parameter 304 | ======================= 305 | 306 | 307 | ```{r wiggles-plot, fig.width=15} 308 | # make three plots, w. estimated smooth, truth and data on each 309 | par(mfrow=c(1,3), cex.main=3.5) 310 | 311 | plot(b.justright, se=FALSE, ylim=ylim, main=expression(lambda*plain("= just right"))) 312 | points(dat$x2, dat$y-mean(dat$y)) 313 | curve(f2,0,1, col="blue", add=TRUE) 314 | 315 | plot(b.sp0, se=FALSE, ylim=ylim, main=expression(lambda*plain("=")*0)) 316 | points(dat$x2, dat$y-mean(dat$y)) 317 | curve(f2,0,1, col="blue", add=TRUE) 318 | 319 | plot(b.spinf, se=FALSE, ylim=ylim, main=expression(lambda*plain("=")*infinity)) 320 | points(dat$x2, dat$y-mean(dat$y)) 321 | curve(f2,0,1, col="blue", add=TRUE) 322 | 323 | ``` 324 | 325 | Smoothing parameter selection 326 | ============================== 327 | 328 | - Many methods: AIC, Mallow's $C_p$, GCV, ML, REML 329 | - Recommendation, based on simulation and practice: 330 | - Use REML or ML 331 | - Reiss \& Ogden (2009), Wood (2011) 332 | 333 | 334 | 335 | 336 | Maximum wiggliness 337 | ======================== 338 | 339 | - We can set **basis complexity** or "size" ($k$) 340 | - Maximum wigglyness 341 | - Smooths have **effective degrees of freedom** (EDF) 342 | - EDF < $k$ 343 | - Set $k$ "large enough" 344 | - Penalty does the rest 345 | 346 | 347 | More on this in a bit... 348 | 349 | uhoh 350 | ====== 351 | title: none 352 | type:section 353 | 354 |

![spock sobbing mathematically](images/mathematical_sobbing.jpg)

355 | 356 | GAM summary 357 | =========== 358 | 359 | - Straight lines suck --- we want **wiggles** 360 | - Use little functions (**basis functions**) to make big functions (**smooths**) 361 | - Need to make sure your smooths are **wiggly enough** 362 | - Use a **penalty** to trade off wiggliness/generality 363 | 364 | 365 | Fitting GAMs in practice 366 | ========================= 367 | type:section 368 | 369 | Translating maths into R 370 | ========================== 371 | 372 | A simple example: 373 | 374 | $$ 375 | y_i = \beta_0 + s(x) + s(w) + \epsilon_i 376 | $$ 377 | 378 | where $\epsilon_i \sim N(0, \sigma^2)$ 379 | 380 | Let's pretend that $y_i \sim \text{Normal}$ 381 | 382 | - linear predictor: `formula = y ~ s(x) + s(w)` 383 | - response distribution: `family=gaussian()` 384 | - data: `data=some_data_frame` 385 | 386 | Putting that together 387 | ====================== 388 | 389 | ```{r echo=TRUE, eval=FALSE} 390 | my_model <- gam(y ~ s(x) + s(w), 391 | family = gaussian(), 392 | data = some_data_frame, 393 | method = "REML") 394 | ``` 395 | 396 | - `method="REML"` uses REML for smoothness selection (default is `"GCV.Cp"`) 397 | 398 | What about a practical example? 399 | ================================ 400 | type:section 401 | 402 | 403 | Pantropical spotted dolphins 404 | ============================== 405 | 406 | - Example taken from Miller et al (2013) 407 | - [Paper appendix](http://distancesampling.org/R/vignettes/mexico-analysis.html) has a better analysis 408 | - Simple example here, ignoring all kinds of important stuff! 409 | 410 | ![a pantropical spotted dolphin doing its thing](images/spotteddolphin_swfsc.jpg) 411 | 412 | 413 | Inferential aims 414 | ================= 415 | 416 | ```{r loaddat} 417 | load("../data/mexdolphins/mexdolphins.RData") 418 | ``` 419 | 420 | ```{r gridplotfn} 421 | library(rgdal) 422 | library(rgeos) 423 | library(maptools) 424 | library(plyr) 425 | # fill must be in the same order as the polygon data 426 | grid_plot_obj <- function(fill, name, sp){ 427 | 428 | # what was the data supplied? 429 | names(fill) <- NULL 430 | row.names(fill) <- NULL 431 | data <- data.frame(fill) 432 | names(data) <- name 433 | 434 | spdf <- SpatialPolygonsDataFrame(sp, data) 435 | spdf@data$id <- rownames(spdf@data) 436 | spdf.points <- fortify(spdf, region="id") 437 | spdf.df <- join(spdf.points, spdf@data, by="id") 438 | 439 | # seems to store the x/y even when projected as labelled as 440 | # "long" and "lat" 441 | spdf.df$x <- spdf.df$long 442 | spdf.df$y <- spdf.df$lat 443 | 444 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df) 445 | } 446 | ``` 447 | 448 | - How many dolphins are there? 449 | - Where are the dolphins? 450 | - What are they interested in? 
451 | 452 | ```{r spatialEDA, fig.cap="", fig.width=15} 453 | 454 | # some nearby states, transformed 455 | library(mapdata) 456 | map_dat <- map_data("worldHires",c("usa","mexico")) 457 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 458 | map_sp <- SpatialPoints(map_dat[,c("long","lat")]) 459 | 460 | # give the sp object a projection 461 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84") 462 | # re-project 463 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4) 464 | map_dat$x <- map_sp.t$long 465 | map_dat$y <- map_sp.t$lat 466 | 467 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4) 468 | p <- ggplot() + 469 | grid_plot_obj(preddata$depth, "Depth", pred.polys) + 470 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 471 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) + 472 | geom_point(aes(x, y, size=count), 473 | data=mexdolphins[mexdolphins$count>0,], 474 | colour="red", alpha=I(0.7)) + 475 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), xlim = range(mexdolphins$x)) + 476 | scale_fill_viridis() + 477 | labs(fill="Depth",x="x",y="y",size="Count") + 478 | theme_minimal() 479 | #p <- p + gg.opts 480 | print(p) 481 | ``` 482 | 483 | A simple dolphin model 484 | =============== 485 | 486 | ```{r firstdsm, echo=TRUE} 487 | library(mgcv) 488 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set), 489 | data = mexdolphins, 490 | family = quasipoisson(), 491 | method = "REML") 492 | ``` 493 | 494 | - count is a function of depth 495 | - `off.set` is the effort expended 496 | - we have count data, try quasi-Poisson distribution 497 | 498 | What did that do? 499 | =================== 500 | 501 | ```{r echo=TRUE} 502 | summary(dolphins_depth) 503 | ``` 504 | 505 | Plotting 506 | ================ 507 | 508 | ```{r plotsmooth} 509 | plot(dolphins_depth) 510 | ``` 511 | *** 512 | - `plot(dolphins_depth)` 513 | - Dashed lines indicate +/- 2 standard errors 514 | - Rug plot 515 | - On the link scale 516 | - EDF on $y$ axis 517 | 518 | Thin plate regression splines 519 | ================================ 520 | 521 | - Default basis 522 | - One basis function per data point 523 | - Reduce # basis functions (eigendecomposition) 524 | - Fitting on reduced problem 525 | - Multidimensional 526 | - Wood (2003) 527 | 528 | 529 | Bivariate terms 530 | ================ 531 | 532 | - Assumed an additive structure 533 | - No interaction 534 | - We can specify `s(x,y)` (and `s(x,y,z,...)`) 535 | - (Assuming *isotropy* here...) 
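(An aside: if isotropy is not reasonable, for example when the covariates are on very different scales, a tensor product smooth gives each covariate its own smoothing parameter. A minimal sketch, reusing the dolphin data from above:)

```{r eval=FALSE, echo=TRUE}
# anisotropic alternative to s(x, y): te() penalises each margin separately
gam(count ~ te(x, y) + offset(off.set),
    data = mexdolphins,
    family = quasipoisson(), method = "REML")
```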
536 | 537 | ```{r xydsmplot, fig.width=15, fig.height=7} 538 | dolphins_depth_xy <- gam(count ~ s(x, y) + offset(off.set), 539 | data = mexdolphins, 540 | family=quasipoisson(), method="REML") 541 | par(mfrow=c(1,3)) 542 | vis.gam(dolphins_depth_xy, view=c("x","y"), phi=45, theta=20, asp=1) 543 | vis.gam(dolphins_depth_xy, view=c("x","y"), phi=45, theta=60, asp=1) 544 | vis.gam(dolphins_depth_xy, view=c("x","y"), phi=45, theta=160, asp=1) 545 | ``` 546 | 547 | Adding a term 548 | =============== 549 | 550 | - Add a **surface** for location ($x$ and $y$) 551 | - Just use `+` for an extra term 552 | 553 | ```{r xydsm, echo=TRUE} 554 | dolphins_depth_xy <- gam(count ~ s(depth) + s(x, y) + offset(off.set), 555 | data = mexdolphins, 556 | family=quasipoisson(), method="REML") 557 | ``` 558 | 559 | 560 | Summary 561 | =================== 562 | 563 | ```{r echo=TRUE} 564 | summary(dolphins_depth_xy) 565 | ``` 566 | 567 | Plotting 568 | ================ 569 | 570 | ```{r plotsmooth-xy, fig.width=12, echo=TRUE} 571 | plot(dolphins_depth_xy, scale=0, pages=1) 572 | ``` 573 | - `scale=0`: each plot on different scale 574 | - `pages=1`: plot together 575 | 576 | 577 | Plotting 2d terms... erm... 578 | ================ 579 | 580 | ```{r plotsmooth-xy-biv, fig.width=15, fig.height=7, echo=TRUE} 581 | plot(dolphins_depth_xy, select=2, cex=2, asp=1, lwd=2) 582 | ``` 583 | 584 | - `select=` picks which smooth to plot 585 | 586 | Let's try something different 587 | =============================== 588 | 589 | ```{r plot-scheme2, echo=TRUE, fig.width=10} 590 | plot(dolphins_depth_xy, select=2, cex=2, asp=1, lwd=2, scheme=2) 591 | ``` 592 | - `scheme=2` much better for bivariate terms 593 | - `vis.gam()` is much more general 594 | 595 | More complex plots 596 | =================== 597 | 598 | ```{r visgam, fig.width=15, echo=TRUE} 599 | par(mfrow=c(1,2)) 600 | vis.gam(dolphins_depth_xy, view=c("depth","x"), too.far=0.1, phi=30, theta=45) 601 | vis.gam(dolphins_depth_xy, view=c("depth","x"), plot.type="contour", too.far=0.1,asp=1/1000) 602 | ``` 603 | 604 | 605 | Fitting/plotting GAMs summary 606 | ============================= 607 | 608 | - `gam` does all the work 609 | - very similar to `glm` 610 | - `s` indicates a smooth term 611 | - `plot` can give simple plots 612 | - `vis.gam` for more advanced stuff 613 | 614 | 615 | Prediction 616 | =========== 617 | type:section 618 | 619 | What is a prediction? 620 | ===================== 621 | 622 | - Evaluate the model, at a particular covariate combination 623 | - Answering (e.g.) the question "at a given depth, how many dolphins?" 624 | - Steps: 625 | 1. evaluate the $s(\ldots)$ terms 626 | 2. move to the response scale (exponentiate? Do nothing?) 627 | 3. (multiply any offset etc) 628 | 629 | Example of prediction 630 | ====================== 631 | 632 | - in maths: 633 | - Model: $\text{count}_i = A_i \exp \left( \beta_0 + s(x_i, y_i) + s(\text{Depth}_i)\right)$ 634 | - Drop in the values of $x, y, \text{Depth}$ (and $A$) 635 | - in R: 636 | - build a `data.frame` with $x, y, \text{Depth}, A$ 637 | - use `predict()` 638 | 639 | ```{r echo=TRUE, eval=FALSE} 640 | preds <- predict(my_model, newdat=my_data, type="response") 641 | ``` 642 | 643 | (`se.fit=TRUE` gives a standard error for each prediction) 644 | 645 | Back to the dolphins... 646 | ======================= 647 | type:section 648 | 649 | Where are the dolphins? 
650 | ======================= 651 | 652 | ```{r echo=TRUE} 653 | dolphin_preds <- predict(dolphins_depth_xy, newdata=preddata, 654 | type="response") 655 | ``` 656 | 657 | ```{r fig.width=20} 658 | p <- ggplot() + 659 | grid_plot_obj(dolphin_preds, "N", pred.polys) + 660 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 661 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) + 662 | geom_point(aes(x, y, size=count), 663 | data=mexdolphins[mexdolphins$count>0,], 664 | colour="red", alpha=I(0.7)) + 665 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), xlim = range(mexdolphins$x)) + 666 | scale_fill_viridis() + 667 | labs(fill="Predicted\ndensity", x="x", y="y", size="Count") + 668 | theme_minimal() 669 | print(p) 670 | ``` 671 | 672 | (`ggplot2` code included in the slide source) 673 | 674 | Prediction summary 675 | ================== 676 | 677 | - Evaluate the fitted model at a given point 678 | - Can evaluate many at once (`data.frame`) 679 | - Don't forget the `type=...` argument! 680 | - Obtain per-prediction standard error with `se.fit` 681 | 682 | What about uncertainty? 683 | ======================== 684 | type:section 685 | 686 | Without uncertainty, we're not doing statistics 687 | ======================== 688 | type:section 689 | 690 | Where does uncertainty come from? 691 | ================================= 692 | 693 | - $\boldsymbol{\beta}$: uncertainty in the spline parameters 694 | - $\boldsymbol{\lambda}$: uncertainty in the smoothing parameter 695 | 696 | - (Traditionally we've only addressed the former) 697 | - (New tools let us address the latter...) 698 | 699 | 700 | Parameter uncertainty 701 | ======================= 702 | 703 | From theory: 704 | 705 | $$ 706 | \boldsymbol{\beta} \sim N(\hat{\boldsymbol{\beta}}, \mathbf{V}_\boldsymbol{\beta}) 707 | $$ 708 | 709 | (*caveat: the normality is only* **approximate** *for non-normal response*) 710 | 711 | 712 | **What does this mean?** Variance for each parameter. 713 | 714 | In `mgcv`: `vcov(model)` returns $\mathbf{V}_\boldsymbol{\beta}$. 715 | 716 | 717 | What can we do this this? 718 | =========================== 719 | 720 | - confidence intervals in `plot` 721 | - standard errors using `se.fit` 722 | - derived quantities? (see bibliography) 723 | 724 | blah 725 | ==== 726 | title:none 727 | type:section 728 | 729 | 730 | 731 | 732 | 733 | The lpmatrix, magic, etc 734 | ============================== 735 | 736 | For regular predictions: 737 | 738 | $$ 739 | \hat{\boldsymbol{\eta}}_p = L_p \hat{\boldsymbol{\beta}} 740 | $$ 741 | 742 | form $L_p$ using the prediction data, evaluating basis functions as we go. 743 | 744 | (Need to apply the link function to $\hat{\boldsymbol{\eta}}_p$) 745 | 746 | But the $L_p$ fun doesn't stop there... 747 | 748 | [[mathematics intensifies]] 749 | ============================ 750 | type:section 751 | 752 | Variance and lpmatrix 753 | ====================== 754 | 755 | To get variance on the scale of the linear predictor: 756 | 757 | $$ 758 | V_{\hat{\boldsymbol{\eta}}} = L_p^\text{T} V_\hat{\boldsymbol{\beta}} L_p 759 | $$ 760 | 761 | pre-/post-multiplication shifts the variance matrix from parameter space to linear predictor-space. 
762 | 763 | (Can then pre-/post-multiply by derivatives of the link to put variance on response scale) 764 | 765 | Simulating parameters 766 | ====================== 767 | 768 | - $\boldsymbol{\beta}$ has a distribution, we can simulate 769 | 770 | ```{r paramsim, results="hide"} 771 | library(mvtnorm) 772 | 773 | # get the Lp matrix 774 | Lp <- predict(dolphins_depth_xy, newdata=preddata, type="lpmatrix") 775 | 776 | # how many realisations do we want? 777 | frames <- 100 778 | 779 | # generate the betas from the GAM "posterior" 780 | betas <- rmvnorm(frames, coef(dolphins_depth_xy), vcov(dolphins_depth_xy)) 781 | 782 | 783 | # use a function to get animation to play nice... 784 | anim_map <- function(){ 785 | # loop to make plots 786 | for(frame in 1:frames){ 787 | 788 | # make the prediction 789 | preddata$preds <- preddata$area * exp(Lp%*%betas[frame,]) 790 | 791 | # plot it (using viridis) 792 | p <- ggplot() + 793 | grid_plot_obj(preddata$preds, "N", pred.polys) + 794 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) + 795 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 796 | xlim = range(mexdolphins$x)) + 797 | scale_fill_viridis(limits=c(0,200)) + 798 | labs(fill="Predicted\ndensity",x="x",y="y",size="Count") + 799 | theme_minimal() 800 | 801 | print(p) 802 | } 803 | } 804 | 805 | # make the animation! 806 | saveGIF(anim_map(), "uncertainty.gif", outdir = "new", interval = 0.15, ani.width = 800, ani.height = 400) 807 | ``` 808 | 809 | ![Animation of uncertainty](uncertainty.gif) 810 | 811 | Uncertainty in smoothing parameter 812 | ================================== 813 | 814 | - Recent work by Simon Wood 815 | - "smoothing parameter uncertainty corrected" version of $V_\hat{\boldsymbol{\beta}}$ 816 | - In a fitted model, we have: 817 | - `$Vp` what we got with `vcov` 818 | - `$Vc` the corrected version 819 | - Still experimental 820 | 821 | Variance summary 822 | ================ 823 | 824 | - Everything comes from variance of parameters 825 | - Need to re-project/scale them to get the quantities we need 826 | - `mgcv` does most of the hard work for us 827 | - Fancy stuff possible with a little maths 828 | - Can include uncertainty in the smoothing parameter too 829 | 830 | Okay, that was a lot of information 831 | =================================== 832 | type:section 833 | 834 | Summary 835 | ======= 836 | 837 | - GAMs are GLMs plus some extra wiggles 838 | - Need to make sure things are *just wiggly enough* 839 | - Basis + penalty is the way to do this 840 | - Fitting looks like `glm` with extra `s()` terms 841 | - Most stuff comes down to matrix algebra, that `mgcv` sheilds you from 842 | - To do fancy stuff, get inside the matrices 843 | 844 | COFFEE 845 | ====== 846 | type:section -------------------------------------------------------------------------------- /slides/02-model_checking.Rpres: -------------------------------------------------------------------------------- 1 | Model checking 2 | ============================ 3 | author: Eric J Pedersen 4 | date: August 6th, 2016 5 | css: custom.css 6 | transition: none 7 | 8 | 9 | Outline 10 | ================= 11 | So you have a GAM: 12 | - How do you know you have the right degrees of freedom? `gam.check()` 13 | - Diagnosing model issues: `gam.check()` part 2 14 | - Do you have the right error model? 
Randomized quantile residuals 15 | - When covariates aren't independent: estimating concurvity 16 | 17 | 18 | Packages you'll need to follow along: 19 | ====================================== 20 | ```{r setup, include=TRUE} 21 | 22 | library(mgcv) 23 | library(magrittr) 24 | library(ggplot2) 25 | library(dplyr) 26 | library(statmod) 27 | source("quantile_resid.R") 28 | 29 | ``` 30 | 31 | ```{r pres_setup, include =F} 32 | library(knitr) 33 | opts_chunk$set(cache=TRUE, echo=FALSE,fig.align="center") 34 | ``` 35 | 36 | 37 | GAMs are models too 38 | ==================== 39 | With all models, how accurate your predictions will be depends on how good the model is 40 | 41 | ```{r misspecify,fig.width=15, fig.height=7} 42 | set.seed(15) 43 | model_list = c("right model", 44 | "wrong distribution", 45 | "heteroskedasticity", 46 | "dependent data", 47 | "wrong functional form") 48 | n = 60 49 | sigma=1 50 | x = seq(-1,1, length=n) 51 | model_data = as.data.frame(expand.grid( x=x,model=model_list)) 52 | model_data$y = 5*model_data$x^2 + 2*model_data$x 53 | for(i in model_list){ 54 | if(i == "right model"){ 55 | model_data[model_data$model==i, "y"] = model_data[model_data$model==i, "y"]+ 56 | rnorm(n,0, sigma) 57 | } else if(i == "wrong distribution"){ 58 | model_data[model_data$model==i, "y"] = model_data[model_data$model==i, "y"]+ 59 | rt(n,df = 3)*sigma 60 | } else if(i == "heteroskedasticity"){ 61 | model_data[model_data$model==i, "y"] = model_data[model_data$model==i, "y"]+ 62 | rnorm(n,0, sigma*10^(model_data[model_data$model==i, "x"])) 63 | } else if(i == "dependent data"){ 64 | model_data[model_data$model==i, "y"] = model_data[model_data$model==i, "y"]+ 65 | arima.sim(model = list(ar=c(.7)),n = n,sd=sigma) 66 | } else if(i=="wrong functional form") { 67 | model_data[model_data$model==i, "y"] = model_data[model_data$model==i, "y"]+ 68 | rnorm(n,0, sigma) + ifelse(model_data[model_data$model==i, "x"]>0, 5,-5) 69 | } 70 | } 71 | ggplot(aes(x,y), data= model_data)+ 72 | geom_point()+ 73 | geom_line(color=ifelse(model_data$model=="dependent data", "black",NA))+ 74 | facet_wrap(~model)+ 75 | geom_smooth(method=gam, formula = y~s(x,k=12),method.args = list(method="REML"))+ 76 | theme_bw()+ 77 | theme(strip.text = element_text(size=20)) 78 | ``` 79 | 80 | 81 | So how do we test how well our model fits? 82 | =========================================== 83 | type:section 84 | 85 | 86 | Examples: 87 | ============================ 88 | 89 | ```{r sims, include=TRUE,echo=TRUE} 90 | set.seed(2) 91 | n = 400 92 | x1 = rnorm(n) 93 | x2 = rnorm(n) 94 | y_val =1 + 2*cos(pi*x1) + 2/(1+exp(-5*(x2))) 95 | y_norm = y_val + rnorm(n, 0, 0.5) 96 | y_negbinom = rnbinom(n, mu = exp(y_val),size=10) 97 | y_binom = rbinom(n,1,prob = exp(y_val)/(1+exp(y_val))) 98 | ``` 99 | 100 | ```{r sims_plot,fig.width=15,fig.align="center"} 101 | layout(matrix(1:6, ncol=3)) 102 | plot(x1,y_norm,cex.lab=2,cex.axis=2) 103 | plot(x2,y_norm,cex.lab=2,cex.axis=2) 104 | plot(x1,y_negbinom,cex.lab=2,cex.axis=2) 105 | plot(x2,y_negbinom,cex.lab=2,cex.axis=2) 106 | plot(x1,y_binom,cex.lab=2,cex.axis=2) 107 | plot(x2,y_binom,cex.lab=2,cex.axis=2) 108 | layout(1) 109 | ``` 110 | 111 | 112 | gam.check() part 1: do you have the right functional form? 113 | ============================= 114 | type:section 115 | 116 | 117 | How well does the model fit? 118 | ============================= 119 | - Many choices: k, family, type of smoother, ... 120 | - How do we assess how well our model fits? 
121 | 122 | 123 | 124 | Basis size (k) 125 | ============== 126 | 127 | - Set `k` per term 128 | - e.g. `s(x, k=10)` or `s(x, y, k=100)` 129 | - Penalty removes "extra" wigglyness 130 | - *up to a point!* 131 | - (But computation is slower with bigger `k`) 132 | 133 | 134 | Checking basis size 135 | ==================== 136 | 137 | ```{r gam_check_norm1, fig.keep="none", include=TRUE,echo=TRUE, fig.width=15,fig.align="center"} 138 | norm_model_1 = gam(y_norm~s(x1,k=4)+s(x2,k=4),method= "REML") 139 | gam.check(norm_model_1) 140 | ``` 141 | 142 | Checking basis size 143 | ==================== 144 | 145 | ```{r gam_check_norm2, fig.keep="none", include=TRUE,echo=TRUE, fig.width=15,fig.align="center"} 146 | norm_model_2 = gam(y_norm~s(x1,k=12)+s(x2,k=4),method= "REML") 147 | gam.check(norm_model_2) 148 | ``` 149 | 150 | Checking basis size 151 | ==================== 152 | 153 | ```{r gam_check_norm3, fig.keep="none", include=TRUE,echo=TRUE, fig.width=15,fig.align="center"} 154 | norm_model_3 = gam(y_norm~s(x1,k=12)+s(x2,k=12),method= "REML") 155 | gam.check(norm_model_3) 156 | ``` 157 | 158 | Checking basis size 159 | ==================== 160 | 161 | ```{r gam_check_norm4, include=TRUE,echo=TRUE, fig.width=12, fig.height=6,fig.align="center"} 162 | layout(matrix(1:6,ncol=2,byrow = T)) 163 | plot(norm_model_1);plot(norm_model_2);plot(norm_model_3) 164 | layout(1) 165 | ``` 166 | 167 | 168 | 169 | Using gam.check() part 2: visual checks 170 | ============================= 171 | type:section 172 | 173 | 174 | gam.check() plots 175 | ============================= 176 | 177 | `gam.check()` creates 4 plots: 178 | 179 | 1. Quantile-quantile plots of residuals. If the model is right, should follow 1-1 line 180 | 181 | 2. Histogram of residuals 182 | 183 | 3. Residuals vs. linear predictor 184 | 185 | 4. Observed vs. fitted values 186 | 187 | `gam.check()` uses deviance residuals by default 188 | 189 | 190 | gam.check() plots: Gaussian data, Gaussian model 191 | ============================= 192 | 193 | 194 | ```{r gam_check_plots1, include=T,echo=TRUE, results="hide", fig.width=12, fig.height=6,fig.align="center"} 195 | norm_model = gam(y_norm~s(x1,k=12)+s(x2,k=12),method= "REML") 196 | gam.check(norm_model) 197 | ``` 198 | 199 | 200 | gam.check() plots: negative binomial data, Poisson model 201 | ============================= 202 | 203 | 204 | ```{r gam_check_plots2, include=T,echo=TRUE, results="hide", fig.width=12, fig.height=6,fig.align="center"} 205 | pois_model = gam(y_negbinom~s(x1,k=12)+s(x2,k=12),family=poisson,method= "REML") 206 | gam.check(pois_model) 207 | ``` 208 | 209 | gam.check() plots: negative binomial data, negative binomial model 210 | ============================= 211 | 212 | 213 | ```{r gam_check_plots3, include=T,echo=TRUE, results="hide", fig.width=12, fig.height=6,fig.align="center"} 214 | negbin_model = gam(y_negbinom~s(x1,k=12)+s(x2,k=12),family=nb,method= "REML") 215 | gam.check(negbin_model) 216 | ``` 217 | 218 | 219 | 220 | 221 | Exercises 222 | ============= 223 | 1. Work with the `y_negbinom` data. Try fitting both Poisson and negative 224 | binomial model, with different degrees of freedom. Using model summaries and 225 | `gam.check`, determine which model requires more degrees of freedom to fit well. 226 | How do the fitted functions vary between the two? 227 | 228 | 2. Using the `y_binomial` data, fit a smooth model with `family=binomial`, and 229 | use `gam.check()` to determine the appropriate degrees of freedom. 
What do the 230 | `gam.check()` plots tell you about model fit? 231 | 232 | Fixing Residuals 233 | ================= 234 | type:section 235 | 236 | What are residuals? 237 | ==================== 238 | 239 | - Generally residuals = observed value - fitted value 240 | - BUT hard to see patterns in these "raw" residuals 241 | - Need to standardise -- **deviance residuals** 242 | - Residual sum of squares $\Rightarrow$ linear model 243 | - deviance $\Rightarrow$ GAM 244 | - Expect these residuals $\sim N(0,1)$ 245 | 246 | 247 | 248 | Shortcomings 249 | ============= 250 | 251 | - `gam.check` left side can be helpful 252 | - Right side is victim of artifacts 253 | - Need an alternative 254 | - "Randomised quantile residuals" (*experimental*) 255 | - `rqresiduals` 256 | - Exactly normal residuals ... if the model is right! 257 | 258 | 259 | Using randomized quantile residuals 260 | ============= 261 | 262 | 263 | ```{r rqresid1, include=T,echo=TRUE, results="hide", fig.width=12, fig.height=6,fig.align="center"} 264 | pois_model = gam(y_negbinom~s(x1,k=12)+s(x2,k=12),family=poisson,method= "REML") 265 | pois_resid = residuals(pois_model, type="deviance") 266 | pois_rqresid = rqresiduals(pois_model) 267 | layout(matrix(1:2, nrow=1)) 268 | plot(pois_model$linear.predictors,pois_resid) 269 | plot(pois_model$linear.predictors,pois_rqresid) 270 | ``` 271 | 272 | Using randomized quantile residuals 273 | ============= 274 | 275 | 276 | ```{r rqresid2, include=T,echo=TRUE, results="hide", fig.width=12, fig.height=6,fig.align="center"} 277 | negbin_model = gam(y_negbinom~s(x1,k=12)+s(x2,k=12),family=nb,method= "REML") 278 | negbin_rqresid = rqresiduals(negbin_model) 279 | layout(matrix(1:2, nrow=1)) 280 | plot(pois_model$linear.predictors,pois_rqresid) 281 | plot(negbin_model$linear.predictors,negbin_rqresid) 282 | ``` 283 | 284 | 285 | Exercise 286 | ========= 287 | Use the `rqresiduals` function to test the model residuals for patterns 288 | from your binomial model from the previous exercise 289 | 290 | Concurvity 291 | ============================= 292 | type:section 293 | 294 | What is concurvity? 
295 | ====================== 296 | 297 | - Nonlinear measure, similar to co-linearity 298 | 299 | - Measures, for each smooth term, how well this term could be approximated by 300 | - `concurvity(model, full=TRUE)`: some combination of all other smooth terms 301 | - `concurvity(model, full=FALSE)`: Each of the other smooth terms in the model 302 | (useful for identifying which terms are causing issues) 303 | 304 | A demonstration 305 | ============================= 306 | 307 | 308 | ```{r concurve1,fig.width=12, fig.height=5} 309 | library(mgcv) 310 | set.seed(1) 311 | n=200 312 | alpha = 0 313 | x1_cc = rnorm(n) 314 | mean_constant = alpha 315 | var_constant = alpha^2 316 | x2_cc = alpha*x1_cc^2 - mean_constant + rnorm(n,0,1-var_constant) 317 | par(mfrow=c(1,3)) 318 | plot(x1_cc,x2_cc) 319 | y = 3 + cos(pi*x1_cc) + 1/(1+exp(-5*(x2_cc))) 320 | m1 = gam(y~s(x1_cc)+s(x2_cc),method= "REML") 321 | plot(m1,scale=0) 322 | print("concurvity(m1)") 323 | print(round(concurvity(m1),2)) 324 | ``` 325 | 326 | 327 | 328 | A demonstration 329 | ============================= 330 | 331 | 332 | ```{r concurve2,fig.width=12, fig.height=5} 333 | set.seed(1) 334 | n=200 335 | alpha = 0.33 336 | mean_constant = alpha 337 | var_constant = alpha^2 338 | x1_cc = rnorm(n) 339 | x2_cc = alpha*x1_cc^2-mean_constant + rnorm(n,0,1-var_constant) 340 | par(mfrow=c(1,3)) 341 | plot(x1_cc,x2_cc) 342 | y = 3 + cos(pi*x1_cc) + 1/(1+exp(-5*(x2_cc))) 343 | m1 = gam(y~s(x1_cc)+s(x2_cc),method= "REML") 344 | plot(m1,scale=0) 345 | print("concurvity(m1, full=TRUE)") 346 | print(round(concurvity(m1),2)) 347 | ``` 348 | 349 | 350 | A demonstration 351 | ============================= 352 | 353 | ```{r concurve3,fig.width=12, fig.height=5} 354 | library(mgcv) 355 | set.seed(1) 356 | n=200 357 | max_val = sqrt(pi/(pi-2)) 358 | alpha = 0.66 359 | x1_cc = rnorm(n) 360 | mean_constant = alpha 361 | var_constant = alpha^2 362 | x2_cc = alpha*x1_cc^2-mean_constant + rnorm(n,0,1-var_constant) 363 | par(mfrow=c(1,3)) 364 | plot(x1_cc,x2_cc) 365 | y = 3 + cos(pi*x1_cc) + 1/(1+exp(-5*(x2_cc))) 366 | m1 = gam(y~s(x1_cc)+s(x2_cc),method= "REML") 367 | plot(m1,scale=0) 368 | print("concurvity(m1, full=TRUE)") 369 | print(round(concurvity(m1),2)) 370 | ``` 371 | 372 | 373 | A demonstration 374 | ============================= 375 | 376 | 377 | 378 | ```{r concurve4,fig.width=12, fig.height=5} 379 | set.seed(1) 380 | alpha = 1 381 | mean_constant = alpha 382 | var_constant = alpha^2 383 | x2_cc = alpha*x1_cc^2-mean_constant + rnorm(n,0,1-var_constant) 384 | par(mfrow=c(1,3)) 385 | plot(x1_cc,x2_cc) 386 | y = 3 + cos(pi*x1_cc) + 1/(1+exp(-5*(x2_cc))) 387 | m1 = gam(y~s(x1_cc)+s(x2_cc),method= "REML") 388 | plot(m1,scale=0) 389 | print("concurvity(m1, full=TRUE)") 390 | print(round(concurvity(m1),2)) 391 | par(mfrow=c(1,1)) 392 | ``` 393 | 394 | Concurvity: things to remember 395 | ============================== 396 | - Can make your model unstable to small changes 397 | - `cor(data)` not sufficient: use the `concurvity(model)` function 398 | - Not always obvious from plots of smooths!! 399 | 400 | 401 | Overall 402 | ========= 403 | Make sure to test your model! GAMs are powerful, but with great power... 404 | 405 | You should at least: 406 | 407 | 1. Check if your smooths are sufficiently smooth 408 | 409 | 2. Test if you have the right distribution 410 | 411 | 3. Make sure there's no patterns left in your data 412 | 413 | 4. If you have time series, grouped, spatial, etc. 
data, check for dependencies 414 | 415 | 416 | We'll get into testing for dependence later on in the extended examples. 417 | 418 | -------------------------------------------------------------------------------- /slides/03-model-selection.Rpres: -------------------------------------------------------------------------------- 1 | GAMs: Model Selection 2 | ============================ 3 | author: David L Miller, Eric Pedersen, and Gavin L Simpson 4 | date: August 6th, 2016 5 | css: custom.css 6 | transition: none 7 | 8 | Overview 9 | ========= 10 | 11 | - Model selection 12 | - Shrinkage smooths 13 | - Shrinkage via double penalty (`select = TRUE`) 14 | - Confidence intervals for smooths 15 | - *p* values 16 | - `anova()` 17 | - AIC 18 | 19 | ```{r setup, include=FALSE} 20 | library("knitr") 21 | library("viridis") 22 | library("ggplot2") 23 | library("mgcv") 24 | library("cowplot") 25 | theme_set(theme_bw()) 26 | opts_chunk$set(cache=TRUE, echo=FALSE) 27 | ``` 28 | 29 | Model selection 30 | =============== 31 | type:section 32 | 33 | Model selection 34 | =============== 35 | 36 | Model (or variable) selection --- and important area of theoretical and applied interest 37 | 38 | - In statistics we aim for a balance between *fit* and *parsimony* 39 | - In applied research we seek the set of covariates with strongest effects on $y$ 40 | 41 | We seek a subset of covariates that improves *interpretability* and *prediction accuracy* 42 | 43 | Shrinkage & additional penalties 44 | ================================ 45 | type:section 46 | 47 | Shrinkage & additional penalties 48 | ================================ 49 | 50 | Smoothing parameter estimation allows selection of a wide range of potentially complex functions for smooths... 51 | 52 | But, cannot remove a term entirely from the model because the penalties used act only on the *range space* of a spline basis. The *null space* of the basis is unpenalised. 53 | 54 | - **Null space** --- the basis functions that are smooth (constant, linear) 55 | - **Range space** --- the basis functions that are wiggly 56 | 57 | Shrinkage & additional penalties 58 | ================================ 59 | 60 | **mgcv** has two ways to penalize the null space, i.e. to do selection 61 | 62 | - *double penalty approach* via `select = TRUE` 63 | - *shrinkage approach* via special bases for thin plate and cubic splines 64 | 65 | Other shrinkage/selection approaches are available 66 | 67 | Double-penalty shrinkage 68 | ======================== 69 | 70 | $\mathbf{S}_j$ is the smoothing penalty matrix & can be decomposed as 71 | 72 | $$ 73 | \mathbf{S}_j = \mathbf{U}_j\mathbf{\Lambda}_j\mathbf{U}_j^{T} 74 | $$ 75 | 76 | where $\mathbf{U}_j$ is a matrix of eigenvectors and $\mathbf{\Lambda}_j$ a diagonal matrix of eigenvalues (i.e. this is an eigen decomposition of $\mathbf{S}_j$). 77 | 78 | $\mathbf{\Lambda}_j$ contains some **0**s due to the spline basis null space --- no matter how large the penalty $\lambda_j$ might get no guarantee a smooth term will be suppressed completely. 79 | 80 | To solve this we need an extra penalty... 
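
Not from the slides: a minimal sketch of what those zero eigenvalues look like in practice. It uses `mgcv::smoothCon()` to build a default thin plate basis for a made-up covariate `x`, then counts the numerically-zero eigenvalues of its penalty matrix; these span the unpenalised null space (constant and linear functions for a one-dimensional thin plate spline).

```{r null-space-sketch, echo = TRUE, eval = FALSE}
library(mgcv)

# made-up covariate, default thin plate basis with k = 10
set.seed(1)
df <- data.frame(x = runif(100))
sm <- smoothCon(s(x, k = 10), data = df)[[1]]

# eigendecomposition of the penalty matrix S_j
ev <- eigen(sm$S[[1]], symmetric = TRUE)$values
round(ev, 4)

# numerically-zero eigenvalues span the null space
sum(ev < max(ev) * 1e-8)  # should be 2: constant + linear
```

However large $\lambda_j$ gets, those two basis directions are never penalised, which is exactly why an extra penalty (or a shrinkage basis) is needed to remove a term from the model entirely.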
81 | 82 | Double-penalty shrinkage 83 | ======================== 84 | 85 | Create a second penalty matrix from $\mathbf{U}_j$, considering only the matrix of eigenvectors associated with the zero eigenvalues 86 | 87 | $$ 88 | \mathbf{S}_j^{*} = \mathbf{U}_j^{*}\mathbf{U}_j^{*T} 89 | $$ 90 | 91 | Now we can fit a GAM with two penalties of the form 92 | 93 | $$ 94 | \lambda_j \mathbf{\beta}^T \mathbf{S}_j \mathbf{\beta} + \lambda_j^{*} \mathbf{\beta}^T \mathbf{S}_j^{*} \mathbf{\beta} 95 | $$ 96 | 97 | Which implies two sets of penalties need to be estimated. 98 | 99 | In practice, add `select = TRUE` to your `gam()` call 100 | 101 | Shrinkage 102 | ========= 103 | 104 | The double penalty approach requires twice as many smoothness parameters to be estimated. An alternative is the shrinkage approach, where $\mathbf{S}_j$ is replaced by 105 | 106 | 107 | $$ 108 | \tilde{\mathbf{S}}_j = \mathbf{U}_j\tilde{\mathbf{\Lambda}}_j\mathbf{U}_j^{T} 109 | $$ 110 | 111 | where $\tilde{\mathbf{\Lambda}}_j$ is as before except the zero eigenvalues are set to some small value $\epsilon$. 112 | 113 | This allows the null space terms to be shrunk by the standard smoothing parameters. 114 | 115 | Use `s(..., bs = "ts")` or `s(..., bs = "cs")` in **mgcv** 116 | 117 | Empirical Bayes...? 118 | =================== 119 | 120 | $\mathbf{S}_j$ can be viewed as prior precision matrices and $\lambda_j$ as improper Gaussian priors on the spline coefficients. 121 | 122 | The impropriety derives from $\mathbf{S}_j$ not being of full rank (zeroes in $\mathbf{\Lambda}_j$). 123 | 124 | Both the double penalty and shrinkage smooths remove the impropriety from the Gaussian prior 125 | 126 | Empirical Bayes...? 127 | =================== 128 | 129 | - **Double penalty** --- makes no assumption as to how much to shrink the null space. 
This is determined from the data via estimation of $\lambda_j^{*}$ 130 | - **Shrinkage smooths** --- assumes null space should be shrunk less than the wiggly part 131 | 132 | Marra & Wood (2011) show that the double penalty and the shrinkage smooth approaches 133 | 134 | - performed significantly better than alternatives in terms of *predictive ability*, and 135 | - performed as well as alternatives in terms of variable selection 136 | 137 | Example 138 | ======= 139 | left: 60% 140 | ```{r setup-shrinkage-example, echo = FALSE, include = FALSE} 141 | ## an example of automatic model selection via null space penalization 142 | set.seed(3) 143 | n <- 200 144 | dat <- gamSim(1, n=n, scale=.15, dist="poisson") ## simulate data 145 | dat <- transform(dat, x4 = runif(n, 0, 1), x5 = runif(n, 0, 1)) ## spurious 146 | b <- gam(y ~ s(x0) + s(x1) + s(x2) + s(x3) + s(x4) + s(x5), data = dat, 147 | family=poisson, select = TRUE, method = "REML") 148 | #summary(b) 149 | #plot(b, pages = 1) 150 | ``` 151 | ```{r shrinkage-example-truth} 152 | p1 <- ggplot(dat, aes(x = x0, y = f0)) + geom_line() 153 | p2 <- ggplot(dat, aes(x = x1, y = f1)) + geom_line() 154 | p3 <- ggplot(dat, aes(x = x2, y = f2)) + geom_line() 155 | p4 <- ggplot(dat, aes(x = x3, y = f3)) + geom_line() 156 | plot_grid(p1, p2, p3, p4, ncol = 2, align = "vh", labels = paste0("x", 1:4)) 157 | ``` 158 | 159 | *** 160 | 161 | - Simulate Poisson counts 162 | - 4 known functions 163 | - 2 spurious covariates 164 | 165 | Example 166 | ======= 167 | ```{r shrinkage-example-summary} 168 | summary(b) 169 | ``` 170 | 171 | Example 172 | ======= 173 | ```{r shrinkage-example-plot, fig.width = 16, fig.height = 9} 174 | plot(b, scheme = 1, pages = 1) 175 | ``` 176 | 177 | Confidence intervals for smooths 178 | ================================ 179 | type:section 180 | 181 | Confidence intervals for smooths 182 | ================================ 183 | 184 | `plot.gam()` produces approximate 95% intervals (at +/- 2 SEs) 185 | 186 | What do these intervals represent? 187 | 188 | Nychka (1988) showed that standard Wahba/Silverman type Bayesian confidence intervals on smooths had good **across-the-function** frequentist coverage properties. 189 | 190 | Confidence intervals for smooths 191 | ================================ 192 | 193 | Marra & Wood (2012) extended this theory to the generalised case and explain where the coverage properties failed: 194 | 195 | *Musn't over-smooth too much, which happens when $\lambda_j$ are over-estimated* 196 | 197 | Two situations where this might occur 198 | 199 | 1. where true effect is almost in the penalty null space, $\hat{\lambda}_j \rightarrow \infty$ 200 | 2. where $\hat{\lambda}_j$ difficult to estimate due to highly correlated covariates 201 | - if 2 correlated covariates have different amounts of wiggliness, estimated effects can have degree of smoothness *reversed* 202 | 203 | Don't over-smooth 204 | ================= 205 | 206 | > In summary, we have shown that Bayesian componentwise variable width intervals... for the smooth components of an additive model **should achieve close to nominal *across-the-function* coverage probability**, provided only that we do not over-smooth so heavily... Beyond this requirement not to oversmooth too heavily, the results appear to have rather weak dependence on smoothing parameter values, suggesting that the neglect of smoothing parameter variability should not significantly degrade interval performance. 
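
Tying back to the shrinkage slides above: a minimal sketch (assuming `dat` is the Poisson data simulated with `gamSim()` in the earlier example, with spurious covariates `x4` and `x5`) comparing the double-penalty approach against shrinkage bases. Both should drive the effective degrees of freedom of the spurious smooths towards zero.

```{r shrinkage-comparison-sketch, echo = TRUE, eval = FALSE}
# double penalty: an extra smoothing parameter per term
b_dp <- gam(y ~ s(x0) + s(x1) + s(x2) + s(x3) + s(x4) + s(x5),
            data = dat, family = poisson, method = "REML", select = TRUE)

# shrinkage thin plate bases: null space shrunk by the usual lambda_j
b_ts <- gam(y ~ s(x0, bs = "ts") + s(x1, bs = "ts") + s(x2, bs = "ts") +
                s(x3, bs = "ts") + s(x4, bs = "ts") + s(x5, bs = "ts"),
            data = dat, family = poisson, method = "REML")

# EDFs of the spurious terms should be close to zero under both
round(summary(b_dp)$edf, 2)
round(summary(b_ts)$edf, 2)
```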
207 | 208 | Confidence intervals for smooths 209 | ================================ 210 | 211 | Marra & Wood (2012) suggested a solution to situation 1., namely true functions close to the penalty null space. 212 | 213 | Smooths are normally subject to *identifiability* constraints (centred), which leads to zero variance where the estimated function crosses the zero line. 214 | 215 | Instead, compute intervals for $j$ th smooth as if it alone had the intercept; identifiability constraints go on the other smooth terms. 216 | 217 | Use `seWithMean = TRUE` in call to `plot.gam()` 218 | 219 | Example 220 | ======= 221 | 222 | ```{r setup-confint-example, fig = TRUE, fig.width = 11, fig.height = 10, results = "hide"} 223 | library(mgcv) 224 | set.seed(0) 225 | ## fake some data... 226 | f1 <- function(x) {exp(2 * x)} 227 | f2 <- function(x) { 228 | 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10 229 | } 230 | f3 <- function(x) {x*0} 231 | 232 | n<-200 233 | sig2 <- 12 234 | x0 <- rep(1:4,50) 235 | x1 <- runif(n, 0, 1) 236 | x2 <- runif(n, 0, 1) 237 | x3 <- runif(n, 0, 1) 238 | e <- rnorm(n, 0, sqrt(sig2)) 239 | y <- 2*x0 + f1(x1) + f2(x2) + f3(x3) + e 240 | x0 <- factor(x0) 241 | 242 | ## fit and plot... 243 | b <- gam(y ~ x0 + s(x1) + s(x2) + s(x3)) 244 | 245 | op <- par(mar = c(4,4,1,1) + 0.1) 246 | layout(matrix(1:9, ncol = 3, byrow = TRUE)) 247 | curve(f1) 248 | curve(f2) 249 | curve(f3) 250 | plot(b, shade=TRUE) 251 | plot(b, shade = TRUE, seWithMean = TRUE) ## better coverage intervals 252 | layout(1) 253 | par(op) 254 | ``` 255 | 256 | p values for smooths 257 | ==================== 258 | type:section 259 | 260 | p values for smooths 261 | ==================== 262 | 263 | ...are approximate: 264 | 265 | 1. they don't really account for the estimation of $\lambda_j$ --- treated as known 266 | 2. rely on asymptotic behaviour --- they tend towards being right as sample size tends to $\infty$ 267 | 268 | Also, *p* values in `summary.gam()` have changed a lot over time --- all options except current default are deprecated as of `v1.18-13`. 269 | 270 | The approach described in Wood (2006) is "*no longer recommended*"! 271 | 272 | p values for smooths 273 | ==================== 274 | 275 | ...are a test of **zero-effect** of a smooth term 276 | 277 | Default *p* values rely on theory of Nychka (1988) and Marra & Wood (2012) for confidence interval coverage. 278 | 279 | If the Bayesian CI have good across-the-function properties, Wood (2013a) showed that the *p* values have 280 | 281 | - almost the correct null distribution 282 | - reasonable power 283 | 284 | Test statistic is a form of $\chi^2$ statistic, but with complicated degrees of freedom. 285 | 286 | p values for unpenalized smooths 287 | ================================ 288 | 289 | The results of Nychka (1988) and Marra & Wood (2012) break down if smooth terms are unpenalized. 290 | 291 | This include i.i.d. Gaussian random effects, (e.g. `bs = "re"`.) 292 | 293 | Wood (2013b) proposed instead a test based on a likelihood ratio statistic: 294 | 295 | - the reference distribution used is appropriate for testing a $\mathrm{H}_0$ on the boundary of the allowed parameter space... 296 | - ...in other words, it corrects for a $\mathrm{H}_0$ that a variance term is zero. 297 | 298 | p values for smooths 299 | ==================== 300 | 301 | have the best behaviour when smoothness selection is done using **ML**, then **REML**. 
302 | 303 | Neither of these are the default, so remember to use `method = "ML"` or `method = "REML"` as appropriate 304 | 305 | p values for parametric terms 306 | ============================= 307 | 308 | ...are based on Wald statistics using the Bayesian covariance matrix for the coefficients. 309 | 310 | This is the "right thing to do" when there are random effects terms present and doesn't really affect performance if there aren't. 311 | 312 | Hence in most instances you won't need to change the default `freq = FALSE` in `summary.gam()` 313 | 314 | anova() 315 | =========== 316 | type:section 317 | 318 | anova() 319 | =========== 320 | 321 | **mgcv** provides an `anova()` method for `"gam"` objects: 322 | 323 | 1. Single model form: `anova(m1)` 324 | 2. Multi model form: `anova(m1, m2, m3)` 325 | 326 | anova() --- single model form 327 | ============================= 328 | 329 | This differs from `anova()` methods for `"lm"` or `"glm"` objects: 330 | 331 | - the tests are Wald-like tests as described for `summary.gam()` of a $\mathrm{H}_0$ of zero-effect of a smooth term 332 | - these are not *sequential* tests! 333 | 334 | anova() 335 | =========== 336 | 337 | ```{r anova-example-single, echo = TRUE} 338 | b1 <- gam(y ~ x0 + s(x1) + s(x2) + s(x3), method = "REML") 339 | anova(b1) 340 | ``` 341 | 342 | anova() --- multi model form 343 | ============================= 344 | 345 | The multi-model form should really be used with care --- the *p* values are really *approximate* 346 | 347 | ```{r anova-example-multi, echo = TRUE} 348 | b1 <- gam(y ~ s(x0) + s(x1) + s(x2) + s(x3) + s(x4) + s(x5), data = dat, 349 | family=poisson, method = "ML") 350 | b2 <- update(b1, . ~ . - s(x3) - s(x4) - s(x5)) 351 | anova(b2, b1, test = "LRT") 352 | ``` 353 | 354 | For *general smooths* deviance is replaced by $-2\mathcal{L}(\hat{\beta})$ 355 | 356 | AIC for GAMs 357 | ============ 358 | type:section 359 | 360 | AIC for GAMs 361 | ============ 362 | 363 | - Comparison of GAMs by a form of AIC is an alternative frequentist approach to model selection 364 | - Rather than using the marginal likelihood, the likelihood of the $\mathbf{\beta}_j$ *conditional* upon $\lambda_j$ is used, with the EDF replacing $k$, the number of model parameters 365 | - This *conditional* AIC tends to select complex models, especially those with random effects, as the EDF ignores that $\lambda_j$ are estimated 366 | - Wood et al (2015) suggests a correction that accounts for uncertainty in $\lambda_j$ 367 | 368 | $$ 369 | AIC = -2l(\hat{\beta}) + 2\mathrm{tr}(\widehat{\mathcal{I}}V^{'}_{\beta}) 370 | $$ 371 | 372 | AIC 373 | ==================== 374 | 375 | In this example, $x_3$, $x_4$, and $x_5$ have no effects on $y$ 376 | 377 | ```{r aic-example, echo = TRUE} 378 | AIC(b1, b2) 379 | ``` 380 | 381 | References 382 | ========== 383 | - [Marra & Wood (2011) *Computational Statistics and Data Analysis* **55** 2372--2387.](http://doi.org/10.1016/j.csda.2011.02.004) 384 | - [Marra & Wood (2012) *Scandinavian journal of statistics, theory and applications* **39**(1), 53--74.](http://doi.org/10.1111/j.1467-9469.2011.00760.x.) 385 | - [Nychka (1988) *Journal of the American Statistical Association* **83**(404) 1134--1143.](http://doi.org/10.1080/01621459.1988.10478711) 386 | - Wood (2006) *Generalized Additive Models: An Introduction with R*. Chapman and Hall/CRC. 
387 | - [Wood (2013a) *Biometrika* **100**(1) 221--228.](http://doi.org/10.1093/biomet/ass048) 388 | - [Wood (2013b) *Biometrika* **100**(4) 1005--1010.](http://doi.org/10.1093/biomet/ast038) -------------------------------------------------------------------------------- /slides/04-Beyond_the_exponential_family.Rpres: -------------------------------------------------------------------------------- 1 | Beyond the exponential family 2 | ============================ 3 | author: Eric Pedersen, Gavin Simpson, David Miller 4 | date: August 6th, 2016 5 | css: custom.css 6 | transition: none 7 | 8 | 9 | Away from the exponential family 10 | =================================== 11 | incremental: true 12 | 13 | Most glm families (Poisson, Gamma, Gaussian, Binomial) are *exponential families* 14 | 15 | $$ f(x|\theta) \sim exp(\sum_i \eta_i(\theta)T_i(x) - A(\theta))$$ 16 | 17 | - Computationally easy 18 | - Has sufficient statistics: easier to estimate parameter variance 19 | - ... but it doesn't describe everything 20 | - `mgcv` has expanded to cover many new families 21 | - Lets you model a much wider range of scenarios with smooths 22 | 23 | 24 | ```{r setup, include=F} 25 | 26 | library(mgcv) 27 | library(magrittr) 28 | library(ggplot2) 29 | library(dplyr) 30 | library(tidyr) 31 | 32 | ``` 33 | 34 | ```{r pres_setup, include =F} 35 | library(knitr) 36 | opts_chunk$set(cache=TRUE, echo=FALSE,fig.align="center") 37 | ``` 38 | 39 | 40 | What we'll cover 41 | =================================== 42 | 43 | - "Counts": Negative binomial and Tweedie distributions 44 | - Modelling proportions with the Beta distribution 45 | - Robust regression with the Student's t distribution 46 | - Ordered and unorderd categorical data 47 | - Multivariate normal data 48 | - Modelling exta zeros with zero-inflated and adjusted families 49 | 50 | - _NOTE_: All the distributions we're covering here have their own quirks. Read 51 | the help files carefully before using them! 
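
Not from the slides, but worth seeing up front: a minimal sketch (using made-up negative binomial counts) showing that these extended families slot into `gam()` exactly like the standard exponential-family ones; only the `family` argument changes.

```{r extended-family-sketch, echo = TRUE, eval = FALSE}
set.seed(1)
n <- 200
sim <- data.frame(x = runif(n))
sim$y <- rnbinom(n, mu = exp(1 + sin(2 * pi * sim$x)), size = 2)

m_pois <- gam(y ~ s(x), data = sim, family = poisson, method = "REML")
m_nb   <- gam(y ~ s(x), data = sim, family = nb(),    method = "REML")  # negative binomial
m_tw   <- gam(y ~ s(x), data = sim, family = tw(),    method = "REML")  # Tweedie

AIC(m_pois, m_nb, m_tw)  # all three are ordinary "gam" objects
```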
52 | 53 | Modelling "counts" 54 | ================ 55 | type:section 56 | 57 | Counts and count-like things 58 | ============================ 59 | 60 | - Response is a count (not always integer) 61 | - Often, it's mostly zero (that's complicated) 62 | - Could also be catch per unit effort, biomass etc 63 | - Flexible mean-variance relationship 64 | 65 | ![The Count from Sesame Street](images/count.jpg) 66 | 67 | Tweedie distribution 68 | ===================== 69 | 70 | ```{r tweedie} 71 | library(tweedie) 72 | library(RColorBrewer) 73 | 74 | # tweedie 75 | y<-seq(0.01,5,by=0.01) 76 | pows <- seq(1.2, 1.9, by=0.1) 77 | 78 | fymat <- matrix(NA, length(y), length(pows)) 79 | 80 | i <- 1 81 | for(pow in pows){ 82 | fymat[,i] <- dtweedie( y=y, power=pow, mu=2, phi=1) 83 | i <- i+1 84 | } 85 | 86 | plot(range(y), range(fymat), type="n", ylab="Density", xlab="x", cex.lab=1.5, 87 | main="") 88 | 89 | rr <- brewer.pal(8,"Dark2") 90 | 91 | for(i in 1:ncol(fymat)){ 92 | lines(y, fymat[,i], type="l", col=rr[i], lwd=2) 93 | } 94 | ``` 95 | *** 96 | - $\text{Var}\left(\text{count}\right) = \phi\mathbb{E}(\text{count})^q$ 97 | - Common distributions are sub-cases: 98 | - $q=1 \Rightarrow$ Poisson 99 | - $q=2 \Rightarrow$ Gamma 100 | - $q=3 \Rightarrow$ Normal 101 | - We are interested in $1 < q < 2$ 102 | - (here $q = 1.2, 1.3, \ldots, 1.9$) 103 | - `tw()` 104 | 105 | 106 | Negative binomial 107 | ================== 108 | 109 | ```{r negbin} 110 | y<-seq(1,12,by=1) 111 | disps <- seq(0.001, 1, len=10) 112 | 113 | fymat <- matrix(NA, length(y), length(disps)) 114 | 115 | i <- 1 116 | for(disp in disps){ 117 | fymat[,i] <- dnbinom(y, size=disp, mu=5) 118 | i <- i+1 119 | } 120 | 121 | plot(range(y), range(fymat), type="n", ylab="Density", xlab="x", cex.lab=1.5, 122 | main="") 123 | 124 | rr <- brewer.pal(8,"Dark2") 125 | 126 | for(i in 1:ncol(fymat)){ 127 | lines(y, fymat[,i], type="l", col=rr[i], lwd=2) 128 | } 129 | ``` 130 | *** 131 | - $\text{Var}\left(\text{count}\right) =$ $\mathbb{E}(\text{count}) + \kappa \mathbb{E}(\text{count})^2$ 132 | - Estimate $\kappa$ 133 | - Is quadratic relationship a "strong" assumption? 134 | - Similar to Poisson: $\text{Var}\left(\text{count}\right) =\mathbb{E}(\text{count})$ 135 | - `nb()` 136 | 137 | 138 | 139 | 140 | Modelling proportions 141 | ====================== 142 | type:section 143 | 144 | The Beta distribution 145 | ====================== 146 | 147 | ```{r beta-dist} 148 | shape1 <- c(0.2, 1, 5, 1, 3, 1.5) 149 | shape2 <- c(0.2, 3, 1, 1, 1.5, 3) 150 | x <- seq(0.01, 0.99, length = 200) 151 | rr <- brewer.pal(length(shape1), "Dark2") 152 | fymat <- mapply(dbeta, shape1, shape2, MoreArgs = list(x = x)) 153 | matplot(x, fymat, type = "l", col = rr, lwd = 2, lty = "solid") 154 | legend("top", bty = "n", 155 | legend = expression(alpha == 0.2 ~~ beta == 0.2, 156 | alpha == 1.0 ~~ beta == 3.0, 157 | alpha == 5.0 ~~ beta == 1.0, 158 | alpha == 1.0 ~~ beta == 1.0, 159 | alpha == 3.0 ~~ beta == 1.5, 160 | alpha == 1.5 ~~ beta == 3.0), 161 | col = rr, cex = 1.25, lty = "solid", lwd = 2) 162 | ``` 163 | 164 | *** 165 | 166 | - Proportions; continuous, bounded at 0 & 1 167 | - Beta distribution is convenient choice 168 | - Two strictly positive shape parameters, $\alpha$ & $\beta$ 169 | - Has support on $x \in (0,1)$ 170 | - Density at $x = 0$ & $x = 1$ is $\infty$, fudge 171 | - **betareg** package 172 | - `betar()` family in **mgcv** 173 | 174 | Beta or Binomial? 
175 | ================= 176 | 177 | The binomial model also model's proportions --- more specifically it models the number of successes in $m$ trials. If you have data of this form then model the binomial counts as this can yield predicted *counts* if required. 178 | 179 | If you have true percentage or proportion data, say estimated prpotional plant cover in a quadrat, then the beta model is appropriate. 180 | 181 | Also, if all you have is the percentages, the beta model is unlikely to be terribly bad. 182 | 183 | Stereotypic behaviour in captive cheetahs 184 | ========================================= 185 | 186 | To illustrate the use of the `betar()` family in **mgcv** we use a behavioural data set of observations on captive cheetahs. These data are prvided and extensively analysed in Zuur et al () and originate from Quirke et al (2012). 187 | 188 | Stereotypic behaviour in captive cheetahs 189 | ========================================= 190 | 191 | - data collected from nine zoos 192 | - at randomised times of day a random number of scans (videos) of captive cheetah behaviour were recorded and analysed over a period of several months 193 | - presence of stereotypical behaviour was recorded 194 | - all individuals in an enclosure were assessed; where more than 1 individual data were aggregated over individuals to achieve 1 data point per enclosure per sampling occasion 195 | - a number of covariates were also recorded 196 | - data technically a binomial counts but we'll ignore count data and model the proportion of scans showing stereotypical behaviour 197 | 198 | Cheetah: data processing 199 | ======================== 200 | 201 | ```{r cheetah-load-data, echo = TRUE} 202 | cheetah <- read.table("../data/beta-regression/ZooData.txt", header = TRUE) 203 | names(cheetah) 204 | cheetah <- transform(cheetah, Raised = factor(Raised), 205 | Feeding = factor(Feeding), 206 | Oc = factor(Oc), 207 | Other = factor(Other), 208 | Enrichment = factor(Enrichment), 209 | Group = factor(Group), 210 | Sex = factor(Sex, labels = c("Male","Female")), 211 | Zoo = factor(Zoo)) 212 | ``` 213 | 214 | Cheetah: model fitting 215 | ====================== 216 | 217 | ```{r cheetah-model, echo = TRUE} 218 | m <- gam(Proportion ~ s(log(Size)) + s(Visitors) + s(Enclosure) + 219 | s(Vehicle) + s(Age) + s(Zoo, bs = "re") + 220 | Feeding + Oc + Other + Enrichment + Group + Sex, 221 | data = cheetah, family = betar(), method = "REML") 222 | ``` 223 | 224 | Cheetah: model summary 225 | ====================== 226 | title: false 227 | 228 | ```{R cheetah-summary} 229 | summary(m) 230 | ``` 231 | 232 | Cheetah: model smooths 233 | ====================== 234 | 235 | ```{R cheetah-plot-smooths, fig.width = 15, fig.height = 8} 236 | layout(matrix(1:6, ncol = 3, byrow = TRUE)) 237 | plot(m, shade = TRUE, scale = 0, seWithMean = TRUE) 238 | layout(1) 239 | ``` 240 | 241 | Modelling outliers 242 | ==================== 243 | type:section 244 | 245 | The student-t distribution 246 | ============================ 247 | - Models continuous data w/ longer tails than normal 248 | - Far less sensitive to outliers 249 | - Has one extra parameter: df. 
250 | - bigger df: t dist approaches normal 251 | 252 | 253 | 254 | 255 | ```{r tplot,fig.width=15, fig.height=6} 256 | set.seed(2) 257 | dat = data.frame(df = rep(c(2,4,50),each=500)) 258 | dat$x = rt(1500,df=dat$df) 259 | dat$df_val = paste("df = ",dat$df, sep ="") 260 | x_val = seq(min(dat$x),max(dat$x), length=200) 261 | ggplot(aes(x=x),data=dat)+ 262 | geom_density(col="red")+ 263 | facet_grid(.~df_val)+ 264 | annotate(x=x_val,y=dnorm(x_val), geom = "line")+ 265 | theme_bw(base_size = 20) 266 | ``` 267 | 268 | 269 | 270 | 271 | The student-t distribution: Usage 272 | ============================ 273 | ```{r texample, eval=FALSE,echo=T} 274 | set.seed(4) 275 | n=300 276 | dat = data.frame(x=seq(0,10,length=n)) 277 | dat$f = 20*exp(-dat$x)*dat$x 278 | dat$y = 1*rt(n,df = 3) + dat$f 279 | norm_mod = gam(y~s(x,k=20), data=dat, family=gaussian(link="identity")) 280 | t_mod = gam(y~s(x,k=20), data=dat, family=scat(link="identity")) 281 | ``` 282 | 283 | The student-t distribution: Usage 284 | ============================ 285 | ```{r texample2, include =T,echo=F,results= "hide", fig.width=15, fig.height=8} 286 | set.seed(4) 287 | n=300 288 | dat = data.frame(x=seq(0,10,length=n)) 289 | dat$f = 20*exp(-dat$x)*dat$x 290 | dat$y = 1*rt(n,df = 3) + dat$f 291 | norm_mod = gam(y~s(x,k=20), data=dat, family=gaussian(link="identity")) 292 | t_mod = gam(y~s(x,k=20), data=dat, family=scat(link="identity")) 293 | predict_norm = predict(norm_mod,se.fit=T) 294 | predict_t = predict(t_mod,se.fit=T) 295 | fit_vals = data.frame(x = c(dat$x,dat$x), 296 | fit =c(predict_norm[[1]],predict_t[[1]]), 297 | se_min = c(predict_norm[[1]] - 2*predict_norm[[2]], 298 | predict_t[[1]] - 2*predict_t[[2]]), 299 | se_max = c(predict_norm[[1]] + 2*predict_norm[[2]], 300 | predict_t[[1]] + 2*predict_t[[2]]), 301 | model = rep(c("normal errors","t-errors"),each=n)) 302 | ggplot(aes(x=x,y=fit),data=fit_vals)+ 303 | facet_grid(.~model)+ 304 | geom_line(col="red")+ 305 | geom_ribbon(aes(ymin =se_min,ymax = se_max),alpha=0.5,fill="red")+ 306 | annotate(x = dat$x,y=dat$y,size=2,geom="point")+ 307 | annotate(x = dat$x,y=dat$f,size=2,geom="line")+ 308 | theme_bw(20) 309 | ``` 310 | 311 | 312 | 313 | The student-t distribution: Usage 314 | ============================ 315 | ```{r texample3, include =T} 316 | summary(t_mod) 317 | ``` 318 | 319 | 320 | 321 | 322 | Modelling multi-dimensional data 323 | =========================== 324 | type:section 325 | 326 | Ordered categorical data 327 | =========================== 328 | - Assumes data are in discrete categories, and categories fall in order 329 | - e.g.: conservation status: "least concern", "vulnerable", "endangered", "extinct" 330 | - fits a linear latent model using covariates, w/ threshold for each level 331 | - First cut-off always occurs at -1 332 | 333 | 334 | Ordered categorical data 335 | =========================== 336 | ```{r ocat_ex1, include =T,echo=F,results= "hide", fig.width=15, fig.height=8} 337 | set.seed(4) 338 | n= 100 339 | dat = data.frame(body_size = seq(-2,2, length=200)) 340 | dat$linear_predictor = 6*exp(dat$body_size*2)/(1+exp(dat$body_size*2))-2 341 | 342 | ggplot(aes(x=body_size, y=linear_predictor),data=dat) + 343 | annotate(x= dat$body_size, ymin=-3,ymax=-1, alpha=0.25,geom="ribbon")+ 344 | annotate(x= 0, y=-2, label = "least concern",geom="text",size=10)+ 345 | annotate(x= dat$body_size, ymin=-1,ymax=1.5, alpha=0.25,geom="ribbon",fill="red")+ 346 | annotate(x= 0, y=0.25, label = "vulnerable",geom="text",size=10)+ 347 | annotate(x= dat$body_size, 
ymin=1.5,ymax=2, alpha=0.25,geom="ribbon",fill="blue")+ 348 | annotate(x= 0, y=1.75, label = "endangered",geom="text",size=10)+ 349 | annotate(x= 0, y=3.5, label = "extinct",geom="text",size=10)+ 350 | scale_y_continuous("linear predictor", expand=c(0,0),limits=c(-3,5))+ 351 | scale_x_continuous("relative body size", expand=c(0,0))+ 352 | theme_bw(30)+ 353 | theme(panel.grid = element_blank()) 354 | 355 | ``` 356 | 357 | 358 | Ordered categorical data 359 | =========================== 360 | ```{r ocat_ex2, include =T,echo=F,results= "hide", fig.width=15, fig.height=8} 361 | set.seed(4) 362 | n= 100 363 | dat = data.frame(body_size = seq(-2,2, length=200)) 364 | dat$linear_predictor = 6*exp(dat$body_size*2)/(1+exp(dat$body_size*2))-2 365 | 366 | ggplot(aes(x=body_size, y=linear_predictor),data=dat) + 367 | geom_line()+ 368 | geom_ribbon(aes(ymin=linear_predictor-1,ymax=linear_predictor+1),alpha=0.25)+ 369 | annotate(x= dat$body_size, ymin=-3,ymax=-1, alpha=0.25,geom="ribbon")+ 370 | annotate(x= 0, y=-2, label = "least concern",geom="text",size=10)+ 371 | annotate(x= dat$body_size, ymin=-1,ymax=1.5, alpha=0.25,geom="ribbon",fill="red")+ 372 | annotate(x= 0, y=0.25, label = "vulnerable",geom="text",size=10)+ 373 | annotate(x= dat$body_size, ymin=1.5,ymax=2, alpha=0.25,geom="ribbon",fill="blue")+ 374 | annotate(x= 0, y=1.75, label = "endangered",geom="text",size=10)+ 375 | annotate(x= 0, y=3.5, label = "extinct",geom="text",size=10)+ 376 | scale_x_continuous("relative body size", expand=c(0,0))+ 377 | scale_y_continuous("linear predictor",limits=c(-3,5), expand=c(0,0))+ 378 | theme_bw(30)+ 379 | theme(panel.grid = element_blank()) 380 | 381 | ``` 382 | 383 | 384 | Using ocat 385 | =========================== 386 | ```{r ocat_ex3, include =T,echo=T,results= "hide", fig.width=15, fig.height=5} 387 | n= 200 388 | dat = data.frame(x1 = runif(n,-1,1),x2=2*pi*runif(n)) 389 | dat$f = dat$x1^2 + sin(dat$x2) 390 | dat$y_latent = dat$f + rnorm(n,dat$f) 391 | dat$y = ifelse(dat$y_latent<0,1, ifelse(dat$y_latent<0.5,2,3)) 392 | ocat_model = gam(y~s(x1)+s(x2), family=ocat(R=3),data=dat) 393 | plot(ocat_model,page=1) 394 | ``` 395 | 396 | Using ocat 397 | =========================== 398 | ```{r ocat_ex4, include =T,echo=T, fig.width=15, fig.height=8} 399 | summary(ocat_model) 400 | ``` 401 | 402 | Using ocat 403 | =========================== 404 | ```{r ocat_ex5, include =T,echo=F,results= "hide", fig.width=15, fig.height=8} 405 | ocat_predict = predict(ocat_model,type = "response") 406 | colnames(ocat_predict) = c("1","2","3") 407 | ocat_predict = as.data.frame(ocat_predict)%>% 408 | mutate(x= fitted(ocat_model),y=as.numeric(dat$y))%>% 409 | gather(pred_level,prediction,`1`:`3`)%>% 410 | mutate(pred_level = as.numeric(pred_level), 411 | obs_val = as.numeric(y==pred_level)) 412 | 413 | ggplot(aes(x= x, y= obs_val),data=ocat_predict)+ 414 | facet_wrap(~pred_level)+ 415 | geom_point()+ 416 | geom_line(aes(y= prediction))+ 417 | theme_bw(30) 418 | ``` 419 | 420 | 421 | 422 | Unordered categorical data 423 | =========================== 424 | - What do you do if categorical data doesn't fall in a nice order? 425 | 426 | Unordered categorical data 427 | =========================== 428 | - What do you do if categorical data doesn't fall in a nice order? 
429 | ![](images/animal_choice.png) 430 | 431 | Unordered categorical data 432 | =========================== 433 | incremental: true 434 | - Model probability of a category occuring relative to an (arbitrary) reference level 435 | - one linear equation for each category except the reference class 436 | - $p(y=i|\mathbf{x}) = exp(\mu_i(\mathbf{x}))/(1+\sum_j exp(\mu_j(\mathbf{x}))$ 437 | - $\mu_i(\mathbf{x}) = s_{1,j}(x_1) + s_{2,j}(x_2)$ 438 | - $p(y=0|\mathbf{x})= 1/(1+\sum_j exp(\mu_j(\mathbf{x}))$ 439 | 440 | 441 | 442 | Using the multinom function 443 | =========================== 444 | ![](images/animal_codes.png) 445 | 446 | *** 447 | ![](images/animal_functions.png) 448 | 449 | 450 | 451 | Using the multinom function 452 | =========================== 453 | ```{r multinom1, include =T,echo=F,results= "hide", fig.width=15, fig.height=8} 454 | set.seed(10) 455 | n= 500 456 | dat = data.frame(tree_cover = round(runif(n,0,1),2),road_dist=round(10*runif(n),1)) 457 | dat$f_deer = with(dat, 4*exp(-road_dist/2)*road_dist - 2*(tree_cover)^2) 458 | dat$f_pig = with(dat, log(exp(-road_dist)+0.1)+1) 459 | prob_matrix = cbind(1, exp(dat$f_deer),exp(dat$f_pig)) 460 | dat$y = apply(prob_matrix,MARGIN = 1,function(x) sample(0:2,size = 1,prob = x)) 461 | model_dat = dat%>% select(tree_cover, road_dist, y) 462 | ``` 463 | 464 | ```{r multinom2, include =T,echo=T, fig.width=15, fig.height=8} 465 | head(model_dat) 466 | ``` 467 | 468 | *** 469 | ```{r multinom3, include =T,echo=T, fig.width=6, fig.height=6} 470 | pairs(model_dat) 471 | ``` 472 | 473 | 474 | Using the multinom function 475 | =========================== 476 | ```{r multinom4, include =T,echo=F,results= "hide", fig.width=15, fig.height=8} 477 | multinom_model = gam(list(y~s(tree_cover)+s(road_dist), ~s(tree_cover)+s(road_dist)), 478 | data= model_dat, family=multinom(K=2)) 479 | plot(multinom_model,page=1,cex.axis=2,cex.lab=2) 480 | ``` 481 | 482 | 483 | 484 | Understanding the results 485 | =========================== 486 | ```{r multinom5, include =T,echo=T,results= "hide", fig.width=15, fig.height=4} 487 | multinom_pred_data = as.data.frame(expand.grid(road_dist =seq(0,10,length=50), 488 | tree_cover =c(0,0.33,0.66,1))) 489 | multinom_pred = predict(multinom_model, multinom_pred_data,type = "response") 490 | colnames(multinom_pred) = c("monkey","deer","pig") 491 | multinom_pred_data = cbind(multinom_pred_data,multinom_pred) 492 | multinom_pred_data_long = multinom_pred_data %>% 493 | gather(species, probability, monkey, deer,pig)%>% 494 | mutate(tree_cover =paste("tree cover = ", tree_cover,sep="")) 495 | ggplot(aes(road_dist, probability,color=species),data=multinom_pred_data_long)+ 496 | geom_line()+ 497 | facet_grid(.~tree_cover)+ 498 | theme_bw(20) 499 | 500 | ``` 501 | 502 | 503 | Other multivariate distributions to check out 504 | =================================== 505 | type: section 506 | 507 | 508 | Multivariate normal (family = mvn) 509 | =========================== 510 | incremental: true 511 | 512 | - Fit a different smooth model for multiple y-variables, but allowing 513 | correlation between y's 514 | - Example uses: multi-species distribution models, measuring latent correlations 515 | between environmental predictors 516 | - mgcv code: formula=list(y1~s(x1)+s(x2), y2 = s(x1)+s(x3)), family = mvn(d=2) 517 | 518 | Cox Proportional hazards (family = cox.ph) 519 | =========================== 520 | incremental: true 521 | 522 | - Censored data: y measures time until an event occurs, or the study was stopped (censoring) 523 | 
- Measures relative rates, rather than absolute rates (no intercepts) 524 | - Example uses: time until an individual is infected, time until a subpopulation goes 525 | extinct, time until lake is invaded 526 | - mgcv code: `formula = y~s(x1)+s(x2), weights= censor.var,family=cox.ph` 527 | - censor.var = 0 if censored, 1 if not 528 | 529 | 530 | Gaussian location-scale models (family = gaulss) 531 | =========================== 532 | incremental: true 533 | 534 | - Model both the mean ("location") and variance ("scale") as smooth functions of predictors 535 | - Example uses: detecting early warning signs in time series, finding factors driving 536 | population variability 537 | - mgcv code: `formula = list(y~s(x1)+s(x2), ~s(x2)+s(x3)), family=gaulss` 538 | - censor.var = 0 if censored, 1 if not 539 | 540 | 541 | 542 | Zero-inflated Poisson location-scale models (family = ziplss) 543 | =========================== 544 | incremental: true 545 | 546 | - Models the probability of zeros seperately from mean counts given that you've observed more than zero 547 | at a location. 548 | - Example uses: Counts of prey caught when a predator might switch between not hunting at all (zeros) 549 | and active hunting 550 | - mgcv code: `formula = list(y~s(x1)+s(x2), ~s(x2)+s(x3)), family=ziplss` 551 | 552 | 553 | The end of the distribution zoo 554 | ============== 555 | type:section 556 | 557 | That's the end of this section! We convene after lunch (1:00 PM). You'll 558 | get to work through a few more advanced examples of your choice. 559 | 560 | 561 | -------------------------------------------------------------------------------- /slides/correct_mathjax.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # correct the paths to the mathjax files for HTML versions of the slides 4 | 5 | # changes the link to be to a folder called mathjax-23 in the current directory 6 | # this avoids having lots of copes of the same mathjax stuff 7 | 8 | # also use decktape to make a PDF of the slides (probably only works on 9 | # Dave's laptop) 10 | 11 | for i in $( ls *.html); do 12 | echo processing: $i 13 | FN=${i/.html/}_files\\/ 14 | sed "s/$FN//" "$i" > tmp.html 15 | mv tmp.html $i 16 | PDFFN=${i/.html/}.pdf 17 | ~/sources/decktape/bin/phantomjs ~/sources/decktape/decktape.js automatic -s 1024x768 $i $PDFFN 18 | done 19 | -------------------------------------------------------------------------------- /slides/custom.css: -------------------------------------------------------------------------------- 1 | .reveal section img{ 2 | box-shadow: 0px 0px 0px rgba(0, 0, 0, 0); 3 | margin: 0px; 4 | } 5 | 6 | .reveal h1, .reveal h2, .reveal h3 { 7 | word-wrap: normal; 8 | -moz-hyphens: none; 9 | } 10 | 11 | .reveal pre{ 12 | width: 100%; 13 | font-size: 0.51em; 14 | } 15 | 16 | .reveal pre code { 17 | font-size: 1.2em; 18 | } 19 | 20 | div.bigq { 21 | font-size: 200%; 22 | line-height: normal; 23 | } 24 | 25 | div.medq { 26 | font-size: 150%; 27 | line-height: normal; 28 | } 29 | 30 | .section .reveal .state-background { 31 | background: #000 none repeat scroll 0% 0%; 32 | background-color: #000; 33 | } 34 | -------------------------------------------------------------------------------- /slides/images/addbasis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/addbasis.png 
-------------------------------------------------------------------------------- /slides/images/animal_choice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/animal_choice.png -------------------------------------------------------------------------------- /slides/images/animal_codes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/animal_codes.png -------------------------------------------------------------------------------- /slides/images/animal_functions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/animal_functions.png -------------------------------------------------------------------------------- /slides/images/count.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/count.jpg -------------------------------------------------------------------------------- /slides/images/mathematical_sobbing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/mathematical_sobbing.jpg -------------------------------------------------------------------------------- /slides/images/remlgcv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/remlgcv.png -------------------------------------------------------------------------------- /slides/images/spotteddolphin_swfsc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/spotteddolphin_swfsc.jpg -------------------------------------------------------------------------------- /slides/images/tina-modelling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/images/tina-modelling.png -------------------------------------------------------------------------------- /slides/quantile_resid.R: -------------------------------------------------------------------------------- 1 | #This code is modified for D.L. Miller's dsm package for distance sampling, from 2 | #the rqgam.check function. The code is designed to extract randomized quantile 3 | #residuals from GAMs, using the family definitions in mgcv. 
Note statmod only 4 | #supports RQ residuals for the following families: Tweedie, Poisson, Gaussian, Any errors are due to Eric Pedersen 5 | library(statmod) #This has functions for randomized quantile residuals 6 | rqresiduals = function (gam.obj) { 7 | if(!"gam" %in% attr(gam.obj,"class")){ 8 | stop('"gam.obj has to be of class "gam"') 9 | } 10 | if (!grepl("^Tweedie|^Negative Binomial|^poisson|^binomial|^gaussian|^Gamma|^inverse.gaussian", 11 | gam.obj$family$family)){ 12 | stop(paste("family " , gam.obj$family$family, 13 | " is not currently supported by the statmod library, 14 | and any randomized quantile residuals would be inaccurate.", 15 | sep="")) 16 | } 17 | if (grepl("^Tweedie", gam.obj$family$family)) { 18 | if (is.null(environment(gam.obj$family$variance)$p)) { 19 | p.val <- gam.obj$family$getTheta(TRUE) 20 | environment(gam.obj$family$variance)$p <- p.val 21 | } 22 | qres <- qres.tweedie(gam.obj) 23 | } 24 | else if (grepl("^Negative Binomial", gam.obj$family$family)) { 25 | if ("extended.family" %in% class(gam.obj$family)) { 26 | gam.obj$theta <- gam.obj$family$getTheta(TRUE) 27 | } 28 | else { 29 | gam.obj$theta <- gam.obj$family$getTheta() 30 | } 31 | qres <- qres.nbinom(gam.obj) 32 | } 33 | else { 34 | qres <- qresid(gam.obj) 35 | } 36 | return(qres) 37 | } -------------------------------------------------------------------------------- /slides/uncertainty.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/uncertainty.gif -------------------------------------------------------------------------------- /slides/wiggly.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric-pedersen/mgcv-esa-workshop/37fd31ac1a9e14ee524c2841eff2f37a0d962fde/slides/wiggly.gif --------------------------------------------------------------------------------