├── .gitignore ├── LICENSE ├── README.md ├── code_snippets ├── gra2mgcv.R ├── quantile_resid.R └── vis.concurvity.R ├── course_outline.jpg ├── data ├── bbs_data │ ├── bbs_florida.csv │ ├── bbs_florida_richness.csv │ └── routes.csv ├── bbs_florida.csv ├── bbs_florida_richness.csv ├── beta-regression │ └── ZooData.txt ├── drake_griffen │ ├── README │ ├── drake_griffen.RData │ └── make_dat.R ├── forest-health │ ├── beech.raw │ ├── foresthealth.bnd │ ├── foresthealth.dat │ └── foresthealth.gra ├── mexdolphins │ ├── README.md │ ├── mexdolphins.RData │ ├── soap_pred.R │ └── soapy.RData ├── routes.csv ├── time_series │ └── Florida_climate.csv └── yukon_seeds │ ├── seed_data.csv │ └── seed_source_locations.csv ├── example-bivariate-timeseries-and-ti.Rmd ├── example-bivariate-timeseries-and-ti.html ├── example-forest-health.Rmd ├── example-forest-health.html ├── example-linear-functionals.Rmd ├── example-nonlinear-timeseries.Rmd ├── example-nonlinear-timeseries.html ├── example-spatial-mexdolphins-solutions.Rmd ├── example-spatial-mexdolphins-solutions.html ├── example-spatial-mexdolphins.Rmd ├── example-spatial-mexdolphins.html ├── example-spatio-temporal data.Rmd ├── example-spatio-temporal_data.html ├── pre_course_questionnaire.txt ├── slides ├── 00-preamble.Rpres ├── 01-intro.Rpres ├── 02-intro-mgcv.Rpres ├── 03-model_checking.Rpres ├── 04-model_selection.Rpres ├── 05-inference.Rpres ├── 06-fancy.Rpres ├── correct_mathjax_PDF_and_move.sh ├── custom.css ├── images │ ├── Daphnia_magna.png │ ├── addbasis.png │ ├── animal_choice.png │ ├── animal_codes.png │ ├── animal_functions.png │ ├── by.png │ ├── concurvity.png │ ├── count.jpg │ ├── cyclic.png │ ├── jenny_models.png │ ├── mathematical_sobbing.jpg │ ├── mgcv-inside.png │ ├── remlgcv.png │ ├── rummy.gif │ ├── slope-aspect.png │ ├── soap-soap.png │ ├── soap-tprs.png │ ├── soap-truth.png │ ├── spermcovars.png │ ├── spermwhalecovs.R │ ├── spotteddolphin_swfsc.jpg │ ├── thanks_for_all_the_fish.gif │ └── tina-modelling.png └── wiggly.gif └── vis.concurvity.R /.gitignore: -------------------------------------------------------------------------------- 1 | *.Rhistory 2 | .Rproj.user/ 3 | .Rproj.user -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Code in this repository is licensed under the GPL version >= 2. 2 | 3 | All other material is Creative Commons BY licensed (except where it does not belong to us, we do not claim copyright over anyone else's material). -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mgcv-workshop 2 | 3 | Dave Miller's fork of the course we (Dave, Eric Pedersen and Gavin Simpson) taught at the ESA 2016 annual meeting, adapted for other situations. 4 | 5 | Original repo (with further materials) can be found at [Eric Pedersen's github page](http://github.com/eric-pedersen/mgcv-esa-workshop/).
6 | -------------------------------------------------------------------------------- /code_snippets/gra2mgcv.R: -------------------------------------------------------------------------------- 1 | ## Read in a BayesX graph file and output list required by mgcv's mrf smooth 2 | gra2mgcv <- function(file) { 3 | ## first row contains number of objects in file 4 | ## graph info is stored in triplets of rows after first row 5 | ## - 1st row of triplet is ID 6 | ## - 2nd row of triplet is number of neighbours 7 | ## - 3rd row of triplet is vector of neighbours 8 | gra <- readLines(file) 9 | N <- as.numeric(gra[1]) # readLines() gives characters, so coerce to a number 10 | gra <- gra[-1] # pop the first row which contains N 11 | ids <- seq.int(1, by = 3, length.out = N) 12 | nbs <- seq.int(3, by = 3, length.out = N) 13 | node2id <- function(nodes, ids) { 14 | as.numeric(ids[as.numeric(nodes) + 1]) 15 | } 16 | l <- lapply(strsplit(gra[nbs], " "), node2id, ids = gra[ids]) 17 | names(l) <- gra[ids] 18 | l 19 | } 20 | -------------------------------------------------------------------------------- /code_snippets/quantile_resid.R: -------------------------------------------------------------------------------- 1 | #This code is modified from the rqgam.check function in D.L. Miller's dsm package 2 | #for distance sampling. The code is designed to extract randomized quantile 3 | #residuals from GAMs, using the family definitions in mgcv. Note statmod only 4 | #supports RQ residuals for the families checked below (Tweedie, negative binomial, Poisson, binomial, Gaussian, Gamma, inverse Gaussian). Any errors are due to Eric Pedersen 5 | library(statmod) #This has functions for randomized quantile residuals 6 | rqresiduals = function (gam.obj) { 7 | if(!"gam" %in% attr(gam.obj,"class")){ 8 | stop('gam.obj has to be of class "gam"') 9 | } 10 | if (!grepl("^Tweedie|^Negative Binomial|^poisson|^binomial|^gaussian|^Gamma|^inverse.gaussian", 11 | gam.obj$family$family)){ 12 | stop(paste("family " , gam.obj$family$family, 13 | " is not currently supported by the statmod library, 14 | and any randomized quantile residuals would be inaccurate.", 15 | sep="")) 16 | } 17 | if (grepl("^Tweedie", gam.obj$family$family)) { 18 | if (is.null(environment(gam.obj$family$variance)$p)) { 19 | p.val <- gam.obj$family$getTheta(TRUE) 20 | environment(gam.obj$family$variance)$p <- p.val 21 | } 22 | qres <- qres.tweedie(gam.obj) 23 | } 24 | else if (grepl("^Negative Binomial", gam.obj$family$family)) { 25 | if ("extended.family" %in% class(gam.obj$family)) { 26 | gam.obj$theta <- gam.obj$family$getTheta(TRUE) 27 | } 28 | else { 29 | gam.obj$theta <- gam.obj$family$getTheta() 30 | } 31 | qres <- qres.nbinom(gam.obj) 32 | } 33 | else { 34 | qres <- qresid(gam.obj) 35 | } 36 | return(qres) 37 | } -------------------------------------------------------------------------------- /code_snippets/vis.concurvity.R: -------------------------------------------------------------------------------- 1 | # visualise concurvity between terms in a GAM 2 | # David L Miller 2015, MIT license 3 | 4 | # arguments: 5 | # b -- a fitted gam 6 | # type -- concurvity measure to plot, see ?concurvity 7 | 8 | vis.concurvity <- function(b, type="estimate"){ 9 | cc <- concurvity(b, full=FALSE)[[type]] 10 | 11 | diag(cc) <- NA 12 | cc[lower.tri(cc)]<-NA 13 | 14 | layout(matrix(1:2, ncol=2), widths=c(5,1)) 15 | opar <- par(mar=c(5, 6, 5, 0) + 0.1) 16 | # main plot 17 | image(z=cc, x=1:ncol(cc), y=1:nrow(cc), ylab="", xlab="", 18 | axes=FALSE, asp=1, zlim=c(0,1)) 19 | axis(1, at=1:ncol(cc), labels = colnames(cc), las=2) 20 | axis(2, at=1:nrow(cc), labels = rownames(cc), las=2) 21 | # 
legend 22 | opar <- par(mar=c(5, 0, 4, 3) + 0.1) 23 | image(t(matrix(rep(seq(0, 1, len=100), 2), ncol=2)), 24 | x=1:3, y=1:101, zlim=c(0,1), axes=FALSE, xlab="", ylab="") 25 | axis(4, at=seq(1,101,len=5), labels = round(seq(0,1,len=5),1), las=2) 26 | par(opar) 27 | } 28 | -------------------------------------------------------------------------------- /course_outline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/course_outline.jpg -------------------------------------------------------------------------------- /data/bbs_data/routes.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/data/bbs_data/routes.csv -------------------------------------------------------------------------------- /data/beta-regression/ZooData.txt: -------------------------------------------------------------------------------- 1 | Number Scans Proportion Size Visual Raised Visitors Feeding Oc Other Enrichment Group Sex Enclosure Vehicle Diet Age Zoo Eps 41 300 0.136666667 650 2 2 6418 1 1 2 2 2 1 0 59 3 2 1 1 47 150 0.313333333 2405 2 1 13607 1 2 2 2 1 1 0 263 1 2 2 2 28 300 0.093333333 1781 4 2 13607 1 2 2 2 2 1 0 263 1 3 2 3 26 150 0.173333333 390 5 1 0 1 2 2 1 1 2 0 0 1 3 2 4 13 148 0.087837838 390 2 1 0 1 2 2 1 1 2 0 0 1 3 2 5 25 152 0.164473684 390 3 1 0 1 2 2 1 1 2 0 0 1 2 2 6 15 301 0.049833887 1200 4 2 11713 1 2 2 2 2 2 20 215 3 8 3 7 78 299 0.260869565 1200 4 2 11713 1 2 2 2 2 2 12 215 3 2 3 8 77 300 0.256666667 1200 4 2 11713 1 2 2 2 2 1 16 215 3 2 3 9 64 300 0.213333333 70 1 1 11713 1 2 2 2 2 1 20 215 3 8 3 10 80 300 0.266666667 40 1 1 11713 1 2 2 2 2 2 12 215 3 2 3 11 28 240 0.116666667 1200 4 2 4225 1 2 2 2 2 1 16 28 3 8 3 12 2 150 0.013333333 650 5 2 96513 1 1 1 2 1 2 20 3 3 7 4 13 12 150 0.08 650 5 2 96513 1 1 1 2 1 2 20 3 3 1 4 14 2 150 0.013333333 650 5 2 7294 1 1 1 1 1 2 0 7 3 7 4 15 16 150 0.106666667 100 1 1 0 1 1 1 1 1 2 20 6 3 7 4 16 12 150 0.08 100 1 1 0 1 1 1 1 1 2 20 6 3 1 4 17 6 356 0.016853933 2030 8 2 76000 1 2 2 1 2 1 6 0 1 2 5 18 45 120 0.375 715 6 2 76000 1 2 2 1 1 2 6 0 1 2 5 19 27 270 0.1 715 6 2 14195 2 2 2 1 2 1 0 0 1 2 5 20 28 90 0.311111111 2030 8 2 14195 2 2 2 1 1 2 0 0 1 2 5 21 14 100 0.14 420 2 1 8971 1 1 2 1 1 1 0 112 3 1 6 22 9 100 0.09 434 2 2 39120 1 1 2 1 1 1 0 116 3 1 6 23 10 100 0.1 434 2 2 27589 1 1 2 1 1 1 0 157 3 1 6 24 13 497 0.026156942 434 2 2 8971 1 1 2 1 2 2 0 112 3 3 6 25 10 495 0.02020202 1700 4 1 39120 1 1 2 1 2 2 1 116 3 3 6 26 9 499 0.018036072 1700 4 1 27589 1 1 2 1 2 2 0 157 3 3 6 27 20 199 0.100502513 3800 8 2 8971 1 1 2 1 2 2 0 112 3 5 6 28 26 200 0.13 3000 8 2 39120 1 1 2 2 2 2 0 63 3 5 6 29 21 200 0.105 3000 8 2 27589 1 1 2 2 2 2 0 84 3 5 6 30 0 296 0 3000 8 2 8971 1 1 2 2 2 1 0 51 3 10 6 31 6 300 0.02 504 4 2 27589 1 1 2 1 2 1 1 157 3 10 6 32 63 299 0.210702341 1700 4 1 8971 1 1 2 1 2 1 1 112 3 10 6 33 21 100 0.21 504 4 2 8971 1 1 2 1 1 2 0 112 3 8 6 34 18 100 0.18 504 4 2 39120 1 1 2 1 1 2 0 116 3 3 6 35 13 192 0.067708333 434 2 1 22000 2 1 2 2 1 2 0 128 3 5 6 36 34 150 0.226666667 1000 2 1 627 1 2 1 2 1 1 0 83 2 7 7 37 21 150 0.14 600 2 1 627 1 2 1 2 1 2 0 83 2 4 7 38 35 150 0.233333333 578 4 1 627 1 2 1 2 1 2 0 83 2 3 7 39 16 150 0.106666667 630 4 1 627 1 2 1 2 1 2 0 83 2 8 7 40 18 150 0.12 587 2 1 627 1 2 1 2 1 2 0 83 2 7 7 41 22 150 0.146666667 203 1 1 627 1 2 1 2 1 1 0 83 2 2 7 42 27 150 
0.18 1100 3 1 627 1 2 1 2 1 1 0 54 1 8 7 43 16 300 0.053333333 1400 8 1 627 1 2 1 2 2 1 1 27 1 8 7 44 4 302 0.013245033 600 5 1 0 2 2 1 1 2 1 0 22 1 11 7 45 16 147 0.108843537 1600 9 1 0 2 2 1 1 1 1 0 26 1 9 7 46 4 150 0.026666667 400 4 1 0 2 2 1 1 1 2 0 26 1 8 7 47 35 300 0.116666667 400 4 1 0 2 2 1 1 2 1 0 26 1 5 7 48 11 150 0.073333333 500 9 1 0 2 2 1 1 1 2 0 26 1 11 7 49 8 149 0.053691275 500 9 1 0 2 2 1 1 1 2 0 26 1 12 7 50 18 150 0.12 500 9 1 0 2 2 1 1 1 2 0 26 1 11 7 51 19 299 0.063545151 500 9 1 0 2 2 1 1 2 1 0 26 1 10 7 52 3 150 0.02 500 9 1 0 2 2 1 1 1 1 0 26 1 10 7 53 15 495 0.03030303 1800 8 1 0 2 2 2 1 2 2 0 26 1 1 7 54 36 150 0.24 431 2 1 0 2 2 1 1 1 1 2 32 1 4 7 55 10 150 0.066666667 336 2 1 0 2 2 1 1 1 2 0 32 1 9 7 56 3 149 0.020134228 67.5 1 1 0 2 2 1 1 1 1 0 32 1 7 7 57 1 150 0.006666667 195 2 1 0 2 2 1 1 1 1 2 32 1 8 7 58 10 150 0.066666667 270 3 1 0 2 2 1 1 1 1 0 32 1 12 7 59 12 298 0.040268456 1900 9 1 0 2 2 1 1 2 1 1 19 1 9 7 60 14 150 0.093333333 360 4 1 0 2 2 1 1 1 1 0 11 1 7 7 61 33 150 0.22 360 3 1 0 2 2 1 1 1 2 0 11 1 3 7 62 9 149 0.060402685 266 5 1 0 2 2 1 1 1 2 0 11 1 9 7 63 19 150 0.126666667 360 7 1 0 2 2 1 1 1 2 0 11 1 6 7 64 39 600 0.065 15000 4 1 718 1 2 1 2 2 1 8 51 3 3 8 65 17 300 0.056666667 25000 4 2 718 1 2 1 2 2 2 10 49 3 5 8 66 9 150 0.06 10000 3 1 718 1 2 1 1 1 1 0 27 3 15 8 67 11 300 0.036666667 25000 4 2 718 1 2 1 2 2 2 8 40 3 1 8 68 10 591 0.016920474 10000 9 1 0 2 2 2 2 2 2 0 12 3 9 8 69 2 749 0.002670227 40000 9 2 0 2 1 2 1 2 1 0 16 3 5 8 70 20 448 0.044642857 10000 4 2 0 1 2 1 1 2 1 3 18 3 1 8 71 0 150 0 25000 10 2 0 2 2 2 1 1 1 0 22 3 10 8 72 1 300 0.003333333 25000 10 2 0 2 2 2 1 2 1 0 22 3 10 8 73 13 150 0.086666667 80000 9 2 718 2 2 2 2 1 2 0 47 3 10 8 74 10 150 0.066666667 80000 9 2 718 2 2 2 2 1 2 0 60 3 7 8 75 0 300 0 80000 9 2 0 2 2 2 2 2 2 0 44 3 2 8 76 19 150 0.126666667 3000 5 2 33172 1 2 2 1 1 2 16 11 2 4 9 77 12 150 0.08 62 1 1 0 1 1 2 1 1 2 16 21 2 4 9 78 10 452 0.022123894 3000 5 2 33172 1 2 2 1 2 2 16 1 2 1 9 79 10 450 0.022222222 62 1 1 0 1 1 2 1 2 2 16 17 2 1 9 80 15 149 0.100671141 69 1 1 0 1 2 1 1 1 1 16 10 2 2 9 81 37 150 0.246666667 69 1 1 0 1 2 1 1 1 2 14 17 2 5 9 82 44 300 0.146666667 625 1 2 0 1 2 2 1 2 1 0 34 2 8 9 83 21 150 0.14 625 1 1 0 1 2 2 1 1 2 6 33 2 5 9 84 14 150 0.093333333 10000 2 1 0 1 2 2 1 1 1 2 29 2 9 9 85 55 149 0.369127517 600 2 2 0 1 2 2 1 1 1 2 34 2 9 9 86 28 150 0.186666667 450 1 1 0 1 2 1 1 1 2 2 35 2 2 9 87 17 150 0.113333333 18.6 1 1 0 1 2 1 1 1 2 0 35 2 8 9 88 -------------------------------------------------------------------------------- /data/drake_griffen/README: -------------------------------------------------------------------------------- 1 | Data adapted from: 2 | 3 | Drake, J.M. & B.D. Griffen. 2010. Early warning signals of extinction in deteriorating environments. Nature 467:456-459. 
doi:10.1038/nature09389 4 | 5 | Downloaded at: http://datadryad.org/resource/doi:10.5061/dryad.q3p64 6 | 7 | 8 | Data adapted from the below files, using the script make_dat.R: 9 | 10 | timeseries.csv - Contains census records (triplicate counting) for all experimental populations 11 | ID: Unique identifier for each experimental population 12 | Subpop: Unique identifier for each chamber of a population 13 | Date: Census date 14 | sample1: First census 15 | sample2: Second census 16 | sample3: Third census 17 | HoleNum: Experimental treatment - number of connecting passages (mm) 18 | HoleSize: Experimental treatment - diameter of connecting passages (mm) 19 | Deteriorating: Experimental treatment - indicator for whether or not population was in the deteriorating food treatment group 20 | FoodLevel: Volume of food suspension supplied on the census date 21 | 22 | preprocess.R - Auxiliary script to format the data for analysis 23 | -------------------------------------------------------------------------------- /data/drake_griffen/drake_griffen.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/data/drake_griffen/drake_griffen.RData -------------------------------------------------------------------------------- /data/drake_griffen/make_dat.R: -------------------------------------------------------------------------------- 1 | # run this in the data-and-code directory of the zip file downloaded 2 | # from http://datadryad.org/resource/doi:10.5061/dryad.q3p64 3 | 4 | #Preprocess raw time series data for analysis 5 | source('preprocess.R') 6 | 7 | library(lubridate) 8 | 9 | # data munging 10 | timeseries$date <- mdy(as.character(timeseries$Date.)) 11 | populations$Nhat <- populations$x 12 | populations$x <- NULL 13 | 14 | # table of which groups were deteriorating or not 15 | groups <- unique(timeseries[,c("ID", "Deteriorating")]) 16 | 17 | populations$deteriorating <- populations$ID %in% groups$ID[groups$Deteriorating==1] 18 | 19 | # smooth of population size over time 20 | pop_happy <- subset(populations, !deteriorating) 21 | pop_unhappy <- subset(populations, deteriorating) 22 | 23 | 24 | save.image(file="drake_griffen.RData") 25 | 26 | -------------------------------------------------------------------------------- /data/forest-health/foresthealth.bnd: -------------------------------------------------------------------------------- 1 | "82",5 2 | 15.9882,1.8882 3 | 15.9882,2.1118 4 | 16.2118,2.1118 5 | 16.2118,1.8882 6 | 15.9882,1.8882 7 | "83",5 8 | 15.9882,2.8882 9 | 15.9882,3.1118 10 | 16.2118,3.1118 11 | 16.2118,2.8882 12 | 15.9882,2.8882 13 | "79",5 14 | 14.9882,1.8882 15 | 14.9882,2.1118 16 | 15.2118,2.1118 17 | 15.2118,1.8882 18 | 14.9882,1.8882 19 | "81",5 20 | 14.9882,3.8882 21 | 14.9882,4.1118 22 | 15.2118,4.1118 23 | 15.2118,3.8882 24 | 14.9882,3.8882 25 | "78",5 26 | 13.9882,4.8882 27 | 13.9882,5.1118 28 | 14.2118,5.1118 29 | 14.2118,4.8882 30 | 13.9882,4.8882 31 | "62",5 32 | 10.1882,8.8882 33 | 10.1882,9.1118 34 | 10.4118,9.1118 35 | 10.4118,8.8882 36 | 10.1882,8.8882 37 | "55",5 38 | 9.6882,8.3882 39 | 9.6882,8.6118 40 | 9.9118,8.6118 41 | 9.9118,8.3882 42 | 9.6882,8.3882 43 | "80",5 44 | 14.9882,2.8882 45 | 14.9882,3.1118 46 | 15.2118,3.1118 47 | 15.2118,2.8882 48 | 14.9882,2.8882 49 | "77",5 50 | 13.9882,3.8882 51 | 13.9882,4.1118 52 | 14.2118,4.1118 53 | 14.2118,3.8882 54 | 13.9882,3.8882 55 | "75",5 56 | 12.9882,4.9882 57 | 12.9882,5.2118 58 |
13.2118,5.2118 59 | 13.2118,4.9882 60 | 12.9882,4.9882 61 | "72",5 62 | 12.0882,5.9882 63 | 12.0882,6.2118 64 | 12.3118,6.2118 65 | 12.3118,5.9882 66 | 12.0882,5.9882 67 | "61",5 68 | 10.0882,7.8882 69 | 10.0882,8.1118 70 | 10.3118,8.1118 71 | 10.3118,7.8882 72 | 10.0882,7.8882 73 | "76",5 74 | 13.9882,2.8882 75 | 13.9882,3.1118 76 | 14.2118,3.1118 77 | 14.2118,2.8882 78 | 13.9882,2.8882 79 | "74",5 80 | 12.9882,3.8882 81 | 12.9882,4.1118 82 | 13.2118,4.1118 83 | 13.2118,3.8882 84 | 12.9882,3.8882 85 | "71",5 86 | 12.0882,4.9882 87 | 12.0882,5.2118 88 | 12.3118,5.2118 89 | 12.3118,4.9882 90 | 12.0882,4.9882 91 | "60",5 92 | 10.0882,6.9882 93 | 10.0882,7.2118 94 | 10.3118,7.2118 95 | 10.3118,6.9882 96 | 10.0882,6.9882 97 | "73",5 98 | 12.9882,2.8882 99 | 12.9882,3.1118 100 | 13.2118,3.1118 101 | 13.2118,2.8882 102 | 12.9882,2.8882 103 | "70",5 104 | 11.9882,3.8882 105 | 11.9882,4.1118 106 | 12.2118,4.1118 107 | 12.2118,3.8882 108 | 11.9882,3.8882 109 | "67",5 110 | 11.0882,4.8882 111 | 11.0882,5.1118 112 | 11.3118,5.1118 113 | 11.3118,4.8882 114 | 11.0882,4.8882 115 | "59",5 116 | 10.0882,5.8882 117 | 10.0882,6.1118 118 | 10.3118,6.1118 119 | 10.3118,5.8882 120 | 10.0882,5.8882 121 | "54",5 122 | 9.0882,6.9882 123 | 9.0882,7.2118 124 | 9.3118,7.2118 125 | 9.3118,6.9882 126 | 9.0882,6.9882 127 | "69",5 128 | 11.9882,2.8882 129 | 11.9882,3.1118 130 | 12.2118,3.1118 131 | 12.2118,2.8882 132 | 11.9882,2.8882 133 | "66",5 134 | 10.9882,3.8882 135 | 10.9882,4.1118 136 | 11.2118,4.1118 137 | 11.2118,3.8882 138 | 10.9882,3.8882 139 | "68",5 140 | 11.9882,1.9882 141 | 11.9882,2.2118 142 | 12.2118,2.2118 143 | 12.2118,1.9882 144 | 11.9882,1.9882 145 | "63",5 146 | 10.9882,0.988197 147 | 10.9882,1.2118 148 | 11.2118,1.2118 149 | 11.2118,0.988197 150 | 10.9882,0.988197 151 | "53",5 152 | 9.0882,5.8882 153 | 9.0882,6.1118 154 | 9.3118,6.1118 155 | 9.3118,5.8882 156 | 9.0882,5.8882 157 | "48",5 158 | 8.0882,6.9882 159 | 8.0882,7.2118 160 | 8.3118,7.2118 161 | 8.3118,6.9882 162 | 8.0882,6.9882 163 | "65",5 164 | 10.9882,2.8882 165 | 10.9882,3.1118 166 | 11.2118,3.1118 167 | 11.2118,2.8882 168 | 10.9882,2.8882 169 | "64",5 170 | 10.9882,1.8882 171 | 10.9882,2.1118 172 | 11.2118,2.1118 173 | 11.2118,1.8882 174 | 10.9882,1.8882 175 | "56",5 176 | 9.9882,0.988197 177 | 9.9882,1.2118 178 | 10.2118,1.2118 179 | 10.2118,0.988197 180 | 9.9882,0.988197 181 | "47",5 182 | 8.0882,5.9882 183 | 8.0882,6.2118 184 | 8.3118,6.2118 185 | 8.3118,5.9882 186 | 8.0882,5.9882 187 | "58",5 188 | 9.9882,2.9882 189 | 9.9882,3.2118 190 | 10.2118,3.2118 191 | 10.2118,2.9882 192 | 9.9882,2.9882 193 | "57",5 194 | 9.9882,1.9882 195 | 9.9882,2.2118 196 | 10.2118,2.2118 197 | 10.2118,1.9882 198 | 9.9882,1.9882 199 | "50",5 200 | 8.9882,0.988197 201 | 8.9882,1.2118 202 | 9.2118,1.2118 203 | 9.2118,0.988197 204 | 8.9882,0.988197 205 | "49",5 206 | 8.9882,0.388197 207 | 8.9882,0.611803 208 | 9.2118,0.611803 209 | 9.2118,0.388197 210 | 8.9882,0.388197 211 | "46",5 212 | 8.0882,4.8882 213 | 8.0882,5.1118 214 | 8.3118,5.1118 215 | 8.3118,4.8882 216 | 8.0882,4.8882 217 | "52",5 218 | 8.9882,2.9882 219 | 8.9882,3.2118 220 | 9.2118,3.2118 221 | 9.2118,2.9882 222 | 8.9882,2.9882 223 | "51",5 224 | 8.9882,1.9882 225 | 8.9882,2.2118 226 | 9.2118,2.2118 227 | 9.2118,1.9882 228 | 8.9882,1.9882 229 | "42",5 230 | 7.9882,0.988197 231 | 7.9882,1.2118 232 | 8.2118,1.2118 233 | 8.2118,0.988197 234 | 7.9882,0.988197 235 | "45",5 236 | 7.9882,3.8882 237 | 7.9882,4.1118 238 | 8.2118,4.1118 239 | 8.2118,3.8882 240 | 7.9882,3.8882 241 | "41",5 242 | 
6.9882,4.9882 243 | 6.9882,5.2118 244 | 7.2118,5.2118 245 | 7.2118,4.9882 246 | 6.9882,4.9882 247 | "44",5 248 | 7.9882,2.9882 249 | 7.9882,3.2118 250 | 8.2118,3.2118 251 | 8.2118,2.9882 252 | 7.9882,2.9882 253 | "43",5 254 | 7.9882,1.9882 255 | 7.9882,2.2118 256 | 8.2118,2.2118 257 | 8.2118,1.9882 258 | 7.9882,1.9882 259 | "40",5 260 | 6.9882,3.9882 261 | 6.9882,4.2118 262 | 7.2118,4.2118 263 | 7.2118,3.9882 264 | 6.9882,3.9882 265 | "39",5 266 | 6.9882,2.9882 267 | 6.9882,3.2118 268 | 7.2118,3.2118 269 | 7.2118,2.9882 270 | 6.9882,2.9882 271 | "38",5 272 | 6.9882,1.9882 273 | 6.9882,2.2118 274 | 7.2118,2.2118 275 | 7.2118,1.9882 276 | 6.9882,1.9882 277 | "37",5 278 | 6.0882,3.2882 279 | 6.0882,3.5118 280 | 6.3118,3.5118 281 | 6.3118,3.2882 282 | 6.0882,3.2882 283 | "36",5 284 | 6.0882,2.2882 285 | 6.0882,2.5118 286 | 6.3118,2.5118 287 | 6.3118,2.2882 288 | 6.0882,2.2882 289 | "33",5 290 | 5.0882,3.2882 291 | 5.0882,3.5118 292 | 5.3118,3.5118 293 | 5.3118,3.2882 294 | 5.0882,3.2882 295 | "35",5 296 | 5.2882,1.7882 297 | 5.2882,2.0118 298 | 5.5118,2.0118 299 | 5.5118,1.7882 300 | 5.2882,1.7882 301 | "31",5 302 | 5.0882,1.2882 303 | 5.0882,1.5118 304 | 5.3118,1.5118 305 | 5.3118,1.2882 306 | 5.0882,1.2882 307 | "25",5 308 | 4.0882,0.288197 309 | 4.0882,0.511803 310 | 4.3118,0.511803 311 | 4.3118,0.288197 312 | 4.0882,0.288197 313 | "34",5 314 | 5.1882,1.9882 315 | 5.1882,2.2118 316 | 5.4118,2.2118 317 | 5.4118,1.9882 318 | 5.1882,1.9882 319 | "32",5 320 | 5.0882,2.2882 321 | 5.0882,2.5118 322 | 5.3118,2.5118 323 | 5.3118,2.2882 324 | 5.0882,2.2882 325 | "21",5 326 | 3.5882,0.788197 327 | 3.5882,1.0118 328 | 3.8118,1.0118 329 | 3.8118,0.788197 330 | 3.5882,0.788197 331 | "18",5 332 | 3.1882,0.688197 333 | 3.1882,0.911803 334 | 3.4118,0.911803 335 | 3.4118,0.688197 336 | 3.1882,0.688197 337 | "15",5 338 | 3.0882,0.288197 339 | 3.0882,0.511803 340 | 3.3118,0.511803 341 | 3.3118,0.288197 342 | 3.0882,0.288197 343 | "28",5 344 | 4.4882,2.8882 345 | 4.4882,3.1118 346 | 4.7118,3.1118 347 | 4.7118,2.8882 348 | 4.4882,2.8882 349 | "30",5 350 | 4.5882,2.3882 351 | 4.5882,2.6118 352 | 4.8118,2.6118 353 | 4.8118,2.3882 354 | 4.5882,2.3882 355 | "29",5 356 | 4.5882,1.7882 357 | 4.5882,2.0118 358 | 4.8118,2.0118 359 | 4.8118,1.7882 360 | 4.5882,1.7882 361 | "26",5 362 | 4.0882,1.2882 363 | 4.0882,1.5118 364 | 4.3118,1.5118 365 | 4.3118,1.2882 366 | 4.0882,1.2882 367 | "27",5 368 | 4.0882,2.2882 369 | 4.0882,2.5118 370 | 4.3118,2.5118 371 | 4.3118,2.2882 372 | 4.0882,2.2882 373 | "22",5 374 | 3.5882,1.7882 375 | 3.5882,2.0118 376 | 3.8118,2.0118 377 | 3.8118,1.7882 378 | 3.5882,1.7882 379 | "16",5 380 | 3.0882,1.2882 381 | 3.0882,1.5118 382 | 3.3118,1.5118 383 | 3.3118,1.2882 384 | 3.0882,1.2882 385 | "23",5 386 | 3.5882,2.7882 387 | 3.5882,3.0118 388 | 3.8118,3.0118 389 | 3.8118,2.7882 390 | 3.5882,2.7882 391 | "20",5 392 | 3.4882,2.0882 393 | 3.4882,2.3118 394 | 3.7118,2.3118 395 | 3.7118,2.0882 396 | 3.4882,2.0882 397 | "19",5 398 | 3.1882,2.5882 399 | 3.1882,2.8118 400 | 3.4118,2.8118 401 | 3.4118,2.5882 402 | 3.1882,2.5882 403 | "17",5 404 | 3.0882,2.2882 405 | 3.0882,2.5118 406 | 3.3118,2.5118 407 | 3.3118,2.2882 408 | 3.0882,2.2882 409 | "12",5 410 | 2.5882,1.7882 411 | 2.5882,2.0118 412 | 2.8118,2.0118 413 | 2.8118,1.7882 414 | 2.5882,1.7882 415 | "2",5 416 | 1.0882,2.2882 417 | 1.0882,2.5118 418 | 1.3118,2.5118 419 | 1.3118,2.2882 420 | 1.0882,2.2882 421 | "1",5 422 | 0.588197,2.7882 423 | 0.588197,3.0118 424 | 0.811803,3.0118 425 | 0.811803,2.7882 426 | 0.588197,2.7882 427 | "13",5 428 | 
2.5882,2.7882 429 | 2.5882,3.0118 430 | 2.8118,3.0118 431 | 2.8118,2.7882 432 | 2.5882,2.7882 433 | "3",5 434 | 1.0882,3.2882 435 | 1.0882,3.5118 436 | 1.3118,3.5118 437 | 1.3118,3.2882 438 | 1.0882,3.2882 439 | "6",5 440 | 1.4882,2.4882 441 | 1.4882,2.7118 442 | 1.7118,2.7118 443 | 1.7118,2.4882 444 | 1.4882,2.4882 445 | "7",5 446 | 1.5882,2.7882 447 | 1.5882,3.0118 448 | 1.8118,3.0118 449 | 1.8118,2.7882 450 | 1.5882,2.7882 451 | "9",5 452 | 2.0882,2.2882 453 | 2.0882,2.5118 454 | 2.3118,2.5118 455 | 2.3118,2.2882 456 | 2.0882,2.2882 457 | "4",5 458 | 1.3882,3.4882 459 | 1.3882,3.7118 460 | 1.6118,3.7118 461 | 1.6118,3.4882 462 | 1.3882,3.4882 463 | "5",5 464 | 1.3882,4.8882 465 | 1.3882,5.1118 466 | 1.6118,5.1118 467 | 1.6118,4.8882 468 | 1.3882,4.8882 469 | "8",5 470 | 1.5882,3.7882 471 | 1.5882,4.0118 472 | 1.8118,4.0118 473 | 1.8118,3.7882 474 | 1.5882,3.7882 475 | "10",5 476 | 2.0882,3.2882 477 | 2.0882,3.5118 478 | 2.3118,3.5118 479 | 2.3118,3.2882 480 | 2.0882,3.2882 481 | "11",5 482 | 2.0882,4.2882 483 | 2.0882,4.5118 484 | 2.3118,4.5118 485 | 2.3118,4.2882 486 | 2.0882,4.2882 487 | "14",5 488 | 2.7882,3.9882 489 | 2.7882,4.2118 490 | 3.0118,4.2118 491 | 3.0118,3.9882 492 | 2.7882,3.9882 493 | "24",5 494 | 3.6882,3.9882 495 | 3.6882,4.2118 496 | 3.9118,4.2118 497 | 3.9118,3.9882 498 | 3.6882,3.9882 499 | -------------------------------------------------------------------------------- /data/forest-health/foresthealth.gra: -------------------------------------------------------------------------------- 1 | 83 2 | 82 3 | 2 4 | 2 1 5 | 83 6 | 2 7 | 0 7 8 | 79 9 | 2 10 | 7 0 11 | 81 12 | 2 13 | 7 8 14 | 78 15 | 2 16 | 8 9 17 | 62 18 | 2 19 | 11 6 20 | 55 21 | 2 22 | 11 5 23 | 80 24 | 4 25 | 2 12 3 1 26 | 77 27 | 4 28 | 12 13 4 3 29 | 75 30 | 3 31 | 13 14 4 32 | 72 33 | 1 34 | 14 35 | 61 36 | 3 37 | 15 6 5 38 | 76 39 | 3 40 | 16 8 7 41 | 74 42 | 4 43 | 16 17 9 8 44 | 71 45 | 4 46 | 17 18 10 9 47 | 60 48 | 3 49 | 19 20 11 50 | 73 51 | 3 52 | 21 13 12 53 | 70 54 | 4 55 | 21 22 14 13 56 | 67 57 | 2 58 | 22 14 59 | 59 60 | 2 61 | 25 15 62 | 54 63 | 3 64 | 25 26 15 65 | 69 66 | 4 67 | 23 27 17 16 68 | 66 69 | 3 70 | 27 18 17 71 | 68 72 | 2 73 | 28 21 74 | 63 75 | 2 76 | 29 28 77 | 53 78 | 3 79 | 30 20 19 80 | 48 81 | 2 82 | 30 20 83 | 65 84 | 4 85 | 28 31 22 21 86 | 64 87 | 4 88 | 24 32 27 23 89 | 56 90 | 4 91 | 33 34 32 24 92 | 47 93 | 3 94 | 35 26 25 95 | 58 96 | 3 97 | 32 36 27 98 | 57 99 | 4 100 | 29 37 31 28 101 | 50 102 | 4 103 | 34 38 37 29 104 | 49 105 | 3 106 | 38 33 29 107 | 46 108 | 3 109 | 39 40 30 110 | 52 111 | 3 112 | 37 41 31 113 | 51 114 | 4 115 | 33 42 36 32 116 | 42 117 | 3 118 | 42 34 33 119 | 45 120 | 3 121 | 41 43 35 122 | 41 123 | 2 124 | 43 35 125 | 44 126 | 4 127 | 42 44 39 36 128 | 43 129 | 4 130 | 38 45 41 37 131 | 40 132 | 4 133 | 44 46 40 39 134 | 39 135 | 5 136 | 45 46 47 43 41 137 | 38 138 | 3 139 | 47 44 42 140 | 37 141 | 4 142 | 47 48 44 43 143 | 36 144 | 6 145 | 49 52 53 46 45 44 146 | 33 147 | 4 148 | 53 58 57 46 149 | 35 150 | 6 151 | 52 53 50 58 59 47 152 | 31 153 | 5 154 | 59 60 53 52 49 155 | 25 156 | 4 157 | 54 55 56 60 158 | 34 159 | 8 160 | 53 50 58 59 57 61 49 47 161 | 32 162 | 9 163 | 50 58 59 57 61 48 52 49 47 164 | 21 165 | 6 166 | 55 63 56 62 51 60 167 | 18 168 | 6 169 | 63 56 54 62 51 60 170 | 15 171 | 4 172 | 63 55 54 51 173 | 28 174 | 7 175 | 61 64 59 58 53 48 52 176 | 30 177 | 10 178 | 59 57 61 64 62 65 53 48 52 49 179 | 29 180 | 10 181 | 57 61 60 62 65 58 50 53 52 49 182 | 26 183 | 9 184 | 51 62 54 65 55 63 61 59 50 185 | 27 186 | 11 187 | 
60 64 62 65 66 67 57 59 58 53 52 188 | 22 189 | 12 190 | 54 65 66 55 67 63 68 64 60 61 59 58 191 | 16 192 | 8 193 | 56 68 67 55 65 54 62 60 194 | 23 195 | 8 196 | 62 65 66 67 71 61 57 58 197 | 20 198 | 11 199 | 66 67 63 71 68 62 64 60 61 59 58 200 | 19 201 | 8 202 | 67 71 68 75 65 62 64 61 203 | 17 204 | 9 205 | 63 71 68 75 66 65 62 64 61 206 | 12 207 | 7 208 | 75 71 63 67 66 65 62 209 | 2 210 | 5 211 | 70 72 73 74 75 212 | 1 213 | 5 214 | 69 72 76 73 74 215 | 13 216 | 9 217 | 68 79 75 74 73 67 66 65 64 218 | 3 219 | 7 220 | 69 70 76 73 74 78 79 221 | 6 222 | 8 223 | 76 72 69 70 74 75 79 71 224 | 7 225 | 9 226 | 73 76 72 69 70 78 75 79 71 227 | 9 228 | 8 229 | 74 73 69 79 68 71 67 66 230 | 4 231 | 7 232 | 72 70 73 74 78 79 80 233 | 5 234 | 2 235 | 78 80 236 | 8 237 | 6 238 | 74 77 76 72 79 80 239 | 10 240 | 9 241 | 75 78 74 73 76 72 80 71 81 242 | 11 243 | 5 244 | 79 78 77 76 81 245 | 14 246 | 3 247 | 80 79 82 248 | 24 249 | 1 250 | 81 251 | -------------------------------------------------------------------------------- /data/mexdolphins/README.md: -------------------------------------------------------------------------------- 1 | Pantropical spotted dolphins in the Gulf of Mexico 2 | ================================================== 3 | 4 | Data copied from [this example DSM analysis](http://distancesampling.org/R/vignettes/mexico-analysis.html), after correcting for detectability. 5 | 6 | Data was collected by the NOAA South East Fisheries Science Center (sic). The OBIS-SEAMAP page for the data may be found at the SEFSC [GoMex Oceanic 1996 survey page](http://seamap.env.duke.edu/dataset/25). 7 | 8 | ## Data citation 9 | 10 | Garrison, L. 2013. SEFSC GoMex Oceanic 1996. Data downloaded from OBIS-SEAMAP (http://seamap.env.duke.edu/dataset/25). 11 | -------------------------------------------------------------------------------- /data/mexdolphins/mexdolphins.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/data/mexdolphins/mexdolphins.RData -------------------------------------------------------------------------------- /data/mexdolphins/soap_pred.R: -------------------------------------------------------------------------------- 1 | # make soap pred grid 2 | 3 | library(mapdata) 4 | library(maptools) 5 | library(rgeos) 6 | library(rgdal) 7 | library(mgcv) 8 | 9 | load("mexdolphins.RData") 10 | 11 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 12 | 13 | usmap <- map('usa', plot=FALSE, xlim=c(-100,-80), ylim=c(22,33)) 14 | uspoly <- map2SpatialPolygons(usmap, IDs=usmap$names, 15 | proj4string=CRS("+proj=longlat +datum=WGS84")) 16 | 17 | # simplify the polygon 18 | uspoly <- gSimplify(uspoly, 0.08) 19 | 20 | 21 | # bounding box 22 | c1 = cbind(c(-97.5, -82, -82, -97.5), 23 | c(22.5, 22.5, 32, 32)) 24 | r1 = rbind(c1, c1[1, ]) # join 25 | P1 = Polygon(r1) 26 | Ps1 = Polygons(list(P1), ID = "a") 27 | 28 | SPs = SpatialPolygons(list(Ps1),proj4string=CRS("+proj=longlat +datum=WGS84")) 29 | 30 | # plot to check we got the right area 31 | #plot(uspoly)#, xlim=c(-100,-80), ylim=c(22,33)) 32 | #plot(SPs, add=TRUE) 33 | #points(mexdolphins[,c("longitude","latitude")]) 34 | 35 | # take the polygon difference 36 | soap_area <- gDifference(SPs,uspoly) 37 | # again check 38 | #plot(soap_area) 39 | #points(mexdolphins[,c("longitude","latitude")]) 40 | 41 | 42 | xy_bnd <- 
spTransform(soap_area, CRSobj=lcc_proj4) 43 | xy_bnd <- xy_bnd@polygons[[1]]@Polygons[[1]]@coords 44 | xy_bnd <- list(x=xy_bnd[,1], y=xy_bnd[,2]) 45 | 46 | gr <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), by=16469.27), 47 | y=seq(min(xy_bnd$y), max(xy_bnd$y), by=16469.27)) 48 | x <- gr[,1] 49 | y <- gr[,2] 50 | ind <- inSide(xy_bnd, x, y) 51 | 52 | soap_preddata <- as.data.frame(gr[ind,]) 53 | soap_preddata$area <- preddata$area[1] 54 | soap_preddata$off.set <- log(soap_preddata$area) 55 | 56 | 57 | save(xy_bnd, soap_preddata, file="soapy.RData") 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /data/mexdolphins/soapy.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/data/mexdolphins/soapy.RData -------------------------------------------------------------------------------- /data/routes.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/data/routes.csv -------------------------------------------------------------------------------- /data/yukon_seeds/seed_source_locations.csv: -------------------------------------------------------------------------------- 1 | Who,Site,Subsite,source_type,polygon,order,Y,X 2 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796723.731,469919.3107 3 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796742.626,469910.4723 4 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796729.702,469974.1999 5 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796725.89,470025.6419 6 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796719.851,470077.7116 7 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796725.935,470126.3351 8 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796718.948,470174.2659 9 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796702.393,470236.7358 10 | Eliot,FoxLake,Lowland,SSWall,NA,NA,6796694.535,470294.0962 11 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796742.568,470287.086 12 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796724.452,470303.7841 13 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796785.653,470258.4785 14 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796888.804,470191.593 15 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796964.678,470142.5823 16 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797042.968,470050.4508 17 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797046.439,470002.8427 18 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797040.834,469950.9495 19 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796996.21,469909.3531 20 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796975.594,469904.4163 21 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796888.762,469895.5375 22 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6796850.814,469871.8622 23 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797168.951,469864.6928 24 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797133.442,469889.7501 25 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797067.112,470003.8641 26 | Eliot,FoxLake,Lowland,SSSample,NA,NA,6797045.311,469933.9624 27 | Eliot,FoxLake,SW_Facing,SSSample,NA,NA,6803025.398,465082.76 28 | Eliot,FoxLake,SW_Facing,SSSample,NA,NA,6803076.228,465133.385 29 | Eliot,FoxLake,Lowland,SSPolygon,38,1,6796715.73,470304.9674 30 | Eliot,FoxLake,Lowland,SSPolygon,38,2,6796728.455,470293.1938 31 | Eliot,FoxLake,Lowland,SSPolygon,38,3,6796753.325,470273.6796 32 | Eliot,FoxLake,Lowland,SSPolygon,38,4,6796788.911,470252.55 33 | Eliot,FoxLake,Lowland,SSPolygon,38,5,6796847.188,470216.8999 34 | 
Eliot,FoxLake,Lowland,SSPolygon,38,6,6796887.915,470191.8324 35 | Eliot,FoxLake,Lowland,SSPolygon,38,7,6796932.971,470156.1209 36 | Eliot,FoxLake,Lowland,SSPolygon,38,8,6796962.203,470143.2102 37 | Eliot,FoxLake,Lowland,SSPolygon,38,9,6796979.017,470128.2042 38 | Eliot,FoxLake,Lowland,SSPolygon,38,10,6797017.817,470096.0466 39 | Eliot,FoxLake,Lowland,SSPolygon,38,11,6797024.449,470083.6527 40 | Eliot,FoxLake,Lowland,SSPolygon,38,32,6797050.904,470081.0951 41 | Eliot,FoxLake,Lowland,SSPolygon,38,33,6796981.673,470133.1883 42 | Eliot,FoxLake,Lowland,SSPolygon,38,34,6796839.563,470223.1343 43 | Eliot,FoxLake,Lowland,SSPolygon,38,35,6796790.373,470257.3939 44 | Eliot,FoxLake,Lowland,SSPolygon,38,12,6797041.589,470051.2157 45 | Eliot,FoxLake,Lowland,SSPolygon,38,13,6797044.77,470038.4604 46 | Eliot,FoxLake,Lowland,SSPolygon,38,14,6797049.136,470021.4898 47 | Eliot,FoxLake,Lowland,SSPolygon,38,15,6797043.093,470009.7016 48 | Eliot,FoxLake,Lowland,SSPolygon,38,16,6797045.841,469991.8912 49 | Eliot,FoxLake,Lowland,SSPolygon,38,17,6797050.612,469984.3664 50 | Eliot,FoxLake,Lowland,SSPolygon,38,18,6797061.129,469968.7692 51 | Eliot,FoxLake,Lowland,SSPolygon,38,19,6797037.335,469950.6634 52 | Eliot,FoxLake,Lowland,SSPolygon,38,20,6797009.75,469923.8775 53 | Eliot,FoxLake,Lowland,SSPolygon,38,21,6797014.931,469917.5015 54 | Eliot,FoxLake,Lowland,SSPolygon,89,1,6796994.101,469909.7297 55 | Eliot,FoxLake,Lowland,SSPolygon,89,2,6796985.436,469906.6158 56 | Eliot,FoxLake,Lowland,SSPolygon,89,3,6796973.54,469906.5494 57 | Eliot,FoxLake,Lowland,SSPolygon,89,4,6796960.759,469898.896 58 | Eliot,FoxLake,Lowland,SSPolygon,89,5,6796919.57,469895.9869 59 | Eliot,FoxLake,Lowland,SSPolygon,89,6,6796906.707,469885.8498 60 | Eliot,FoxLake,Lowland,SSPolygon,89,7,6796855.959,469882.6563 61 | Eliot,FoxLake,Lowland,SSPolygon,89,8,6796848.594,469877.0291 62 | Eliot,FoxLake,Lowland,SSPolygon,89,15,6796843.691,469842.534 63 | Eliot,FoxLake,Lowland,SSPolygon,89,9,6796830.91,469827.1575 64 | Eliot,FoxLake,Lowland,SSPolygon,89,14,6796853.289,469799.318 65 | Eliot,FoxLake,Lowland,SSPolygon,89,12,6796817.835,469784.2797 66 | Eliot,FoxLake,Lowland,SSPolygon,89,10,6796812.723,469793.5348 67 | Eliot,FoxLake,Lowland,SSPolygon,89,13,6796821.899,469785.0872 68 | Eliot,FoxLake,Lowland,SSPolygon,89,11,6796814.147,469781.0331 69 | Eliot,FoxLake,Lowland,SSPolygon,159,1,6797215.359,469905.572 70 | Eliot,FoxLake,Lowland,SSPolygon,159,2,6797206.191,469912.9905 71 | Eliot,FoxLake,Lowland,SSPolygon,159,3,6797203.94,469943.5954 72 | Eliot,FoxLake,Lowland,SSPolygon,159,4,6797202.858,469895.6316 73 | Eliot,FoxLake,Lowland,SSPolygon,159,5,6797194.902,469889.1922 74 | Eliot,FoxLake,Lowland,SSPolygon,159,6,6797204.779,469878.4889 75 | Eliot,FoxLake,Lowland,SSPolygon,159,7,6797205.31,469870.0078 76 | Eliot,FoxLake,Lowland,SSPolygon,159,8,6797186.831,469856.0113 77 | Eliot,FoxLake,Lowland,SSPolygon,159,9,6797169.234,469850.342 78 | Eliot,FoxLake,Lowland,SSPolygon,159,10,6797150.315,469847.3502 79 | Eliot,FoxLake,Lowland,SSPolygon,159,11,6797135.268,469840.288 80 | Eliot,FoxLake,Lowland,SSPolygon,159,12,6797125.43,469837.8281 81 | Eliot,FoxLake,Lowland,SSPolygon,159,13,6797121.712,469844.4725 82 | Eliot,FoxLake,Lowland,SSPolygon,159,14,6797136.985,469852.2816 83 | Eliot,FoxLake,Lowland,SSPolygon,159,15,6797162.709,469858.8117 84 | Eliot,FoxLake,Lowland,SSPolygon,159,16,6797178.135,469866.1774 85 | Eliot,FoxLake,Lowland,SSPolygon,159,17,6797187.21,469872.7701 86 | Eliot,FoxLake,Lowland,SSPolygon,159,18,6797176.944,469888.4768 87 | 
Eliot,FoxLake,Lowland,SSPolygon,159,19,6797171.087,469910.6188 88 | Eliot,FoxLake,Lowland,SSPolygon,159,20,6797182.537,469924.3886 89 | Eliot,FoxLake,Lowland,SSPolygon,159,21,6797174.278,469956.3611 90 | Eliot,FoxLake,Lowland,SSPolygon,159,22,6797169.264,469966.2896 91 | Eliot,FoxLake,Lowland,SSPolygon,185,1,6797141.76,469950.5774 92 | Eliot,FoxLake,Lowland,SSPolygon,185,2,6797134.16,469928.8803 93 | Eliot,FoxLake,Lowland,SSPolygon,185,3,6797133.061,469886.2447 94 | Eliot,FoxLake,Lowland,SSPolygon,185,4,6797112.249,469847.2647 95 | Eliot,FoxLake,Lowland,SSPolygon,185,5,6797112.324,469865.5345 96 | Eliot,FoxLake,Lowland,SSPolygon,185,6,6797121.118,469891.7095 97 | Eliot,FoxLake,Lowland,SSPolygon,185,7,6797127.573,469922.8565 98 | Eliot,FoxLake,Lowland,SSPolygon,185,8,6797126.939,469938.0401 99 | Eliot,FoxLake,Lowland,SSPolygon,38,31,6797091.933,470016.4598 100 | Eliot,FoxLake,Lowland,SSPolygon,38,30,6797079.967,470007.3238 101 | Eliot,FoxLake,Lowland,SSPolygon,38,29,6797067.498,469997.9275 102 | Eliot,FoxLake,Lowland,SSPolygon,38,28,6797063.273,469988.7003 103 | Eliot,FoxLake,Lowland,SSPolygon,38,27,6797070.821,469974.1372 104 | Eliot,FoxLake,Lowland,SSPolygon,38,26,6797063.943,469959.6072 105 | Eliot,FoxLake,Lowland,SSPolygon,38,25,6797049.503,469942.8299 106 | Eliot,FoxLake,Lowland,SSPolygon,38,24,6797039.73,469931.6067 107 | Eliot,FoxLake,Lowland,SSPolygon,38,23,6797030.65,469922.4277 108 | Eliot,FoxLake,Lowland,SSPolygon,38,22,6797020.646,469910.829 109 | Eliot,FoxLake,Lowland,SSPolygon,233,1,6797526.926,469768.1144 110 | Eliot,FoxLake,Lowland,SSPolygon,233,2,6797486.016,469794.4904 111 | Eliot,FoxLake,Lowland,SSPolygon,233,3,6797400.062,469863.5996 112 | Eliot,FoxLake,SW_Facing,SSPolygon,380,1,6803013.014,465080.2516 113 | Eliot,FoxLake,SW_Facing,SSPolygon,380,2,6803089.116,465135.0063 114 | Eliot,FoxLake,SW_Facing,SSPolygon,389,1,6803262.384,464081.8498 115 | Eliot,FoxLake,SW_Facing,SSPolygon,389,2,6803618.396,464040.2266 116 | Eliot,FoxLake,SW_Facing,SSPolygon,391,1,6803587.361,464040.0052 117 | Eliot,FoxLake,SW_Facing,SSPolygon,391,2,6803174.081,464080.9813 118 | Eliot,FoxLake,Lowland,SSIsland,56,NA,6796740.729,470183.8429 119 | Eliot,FoxLake,Lowland,SSIsland,58,NA,6796737.425,470200.11 120 | Eliot,FoxLake,Lowland,SSIsland,65,NA,6796729.556,470238.3273 121 | Eliot,FoxLake,Lowland,SSIsland,67,NA,6796738.312,470243.0132 122 | Eliot,FoxLake,Lowland,SSIsland,68,NA,6796734.045,470246.1296 123 | Eliot,FoxLake,Lowland,SSIsland,97,NA,6796940.921,469902.9191 124 | Eliot,FoxLake,Lowland,SSIsland,98,NA,6796929.918,469896.6909 125 | Eliot,FoxLake,Lowland,SSIsland,99,NA,6796923.639,469901.6075 126 | Eliot,FoxLake,Lowland,SSIsland,114,NA,6796836.526,469792.1375 127 | Eliot,FoxLake,Lowland,SSIsland,156,NA,6796980.95,470060.4577 128 | Eliot,FoxLake,Lowland,SSIsland,182,NA,6797146.336,469972.2485 129 | Eliot,FoxLake,Lowland,SSIsland,183,NA,6797152.1,469950.1815 130 | Eliot,FoxLake,Lowland,SSIsland,184,NA,6797146.21,469953.2912 131 | Eliot,FoxLake,Lowland,SSIsland,212,NA,6796949.689,469802.0181 132 | Eliot,FoxLake,Lowland,SSIsland,213,NA,6796948.007,469797.7336 133 | Eliot,FoxLake,Lowland,SSIsland,214,NA,6796936.248,469789.6354 134 | Eliot,FoxLake,Lowland,SSIsland,231,NA,6797483.691,469755.6377 135 | Eliot,FoxLake,Lowland,SSIsland,232,NA,6797514.164,469726.8687 136 | Eliot,FoxLake,Lowland,SSIndividual,60,NA,6796747.059,470205.7828 137 | Eliot,FoxLake,Lowland,SSIndividual,61,NA,6796737.175,470215.1502 138 | Eliot,FoxLake,Lowland,SSIndividual,62,NA,6796734.953,470215.1313 139 | 
Eliot,FoxLake,Lowland,SSIndividual,63,NA,6796725.697,470212.2823 140 | Eliot,FoxLake,Lowland,SSIndividual,64,NA,6796727.524,470235.7011 141 | Eliot,FoxLake,Lowland,SSIndividual,66,NA,6796736.703,470238.8686 142 | Eliot,FoxLake,Lowland,SSIndividual,96,NA,6796948.488,469901.2824 143 | Eliot,FoxLake,Lowland,SSIndividual,203,NA,6797043.34,469928.7998 144 | Eliot,FoxLake,Lowland,SSIndividual,204,NA,6797039.158,469928.6471 145 | Eliot,FoxLake,Lowland,SSIndividual,208,NA,6797019.777,469904.491 146 | -------------------------------------------------------------------------------- /example-bivariate-timeseries-and-ti.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "More time series; bivariate smooths and ti()" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | fig_width: 10 10 | fig_height: 4 11 | --- 12 | 13 | ## Background 14 | 15 | This example is an extension of the Fort Lauderdale time series model. Here we look at two additional advanced features, namely 16 | 17 | i. allowing the seasonal and trend terms to interact, and 18 | ii. setting up an ANOVA-like decomposition for the interaction smooth using `ti()` terms 19 | 20 | The model in the earlier example estimated a constant non-linear seasonal (within-year) effect and a constant non-linear trend (between-year) effect. Such a model is unable to capture any potential change in the seasonal effect (the within-year distribution of temperatures) over time. Or, put another way, it is unable to capture potentially different trends in temperature at particular times of the year. 21 | 22 | This example extends the simpler model to model changes in temperature more generally. 23 | 24 | ### Loading and viewing the data 25 | 26 | The time series we'll look at is a long-term temperature and precipitation 27 | data set from the convention center here at Fort Lauderdale, consisting of 28 | monthly mean temperature (in degrees C), the number of days it rained that 29 | month, and the amount of precipitation that month (in mm). The data range from 30 | 1950 to 2015, with columns for year and month (with Jan. = 1). The data are from 31 | a great web-app called FetchClim2, and the link for the data [I used is 32 | here](http://fetchclimate2.cloudapp.net/#page=geography&dm=values&t=years&v=airt(13,6,2,1),prate(12,7,2,1),wet(1)&y=1950:1:2015&dc=1,32,60,91,121,152,182,213,244,274,305,335,366&hc=0,24&p=26.099,-80.123,Point%201&ts=2016-07-17T21:57:11.213Z). 33 | 34 | Here we'll load it and plot the air temperature data. 35 | 36 | ```{r, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE, highlight = TRUE} 37 | library("mgcv") 38 | library("ggplot2") 39 | florida <- read.csv("./data/time_series/Florida_climate.csv", 40 | stringsAsFactors = FALSE) 41 | head(florida) 42 | 43 | ggplot(data = florida, aes(x = month, y = air_temp, color = year, group = year)) + 44 | geom_point() + 45 | geom_line() 46 | ``` 47 | 48 | ## Fitting the smooth interaction model 49 | 50 | Following from the earlier example, we fit a smooth interaction model using a tensor product spline of `year` and `month`. Notice how we specify the marginal basis types as a character vector. Here I restrict the model to AR(1) errors as this was the best model in the earlier example and it is unlikely that additional unmodelled temporal structure will turn up when we use a more general model!
51 | 52 | ```{r fit-smooth-interaction, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE} 53 | m.full <- gamm(air_temp ~ te(year, month, bs = c("tp", "cc")), data = florida, 54 | correlation = corAR1(form = ~ 1 | year), 55 | knots = list(month = c(0.5, 12.5))) 56 | layout(matrix(1:3, nrow = 1)) 57 | plot(m.full$gam, scheme = 1, theta = 40) 58 | plot(m.full$gam, scheme = 1, theta = 80) 59 | plot(m.full$gam, scheme = 1, theta = 120) 60 | layout(1) 61 | summary(m.full$gam) 62 | ``` 63 | 64 | The plot of the tensor product smooth is quite illustrative of what the model is attempting to do. The earlier model would be forced to retain the same seasonal curve throughout the time series. Now, with the more complex model, the seasonal curve is allowed to change shape smoothly through time. 65 | 66 | Our task now is to test whether this additional complexity is worth the extra effort. Are some months warming more than others in Fort Lauderdale? 67 | 68 | ## Fitting the ANOVA-decomposed model 69 | 70 | The logical thing to do to test whether some months are warming more than others would be to drop the interaction term from the model and compare the simpler and more complex models with AIC or a likelihood ratio test. But if you look back at the model we just fitted, there is no term in that model that relates just to the interaction! The tensor product contains both the main and interactive effects. What's worse is that we can't (always) compare this model with the one we fitted in the earlier example because we need to be careful to ensure the models are properly nested. This is where the `ti()` smoother comes in. 71 | 72 | A `ti()` smoother is like a standard tensor product (`te()`) smooth, but without the main effects smooths. Now we can build a model with separate main effects smooths for `year` and `month` and add in the interaction smooth using `ti()`. This will allow us to fit two properly nested models that can be compared to test our null hypothesis that the months are warming at the same rate. 73 | 74 | Let's fit those models 75 | 76 | ```{r fit-anova-decomp, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE} 77 | m.main <- gam(air_temp ~ s(year, bs = "tp") + s(month, bs = "cc"), data = florida, 78 | knots = list(month = c(0.5, 12.5))) 79 | m.int <- gam(air_temp ~ ti(year, bs = "tp") + ti(month, bs = "cc") + 80 | ti(year, month, bs = c("tp", "cc"), k = 11), data = florida, 81 | knots = list(month = c(0.5, 12.5))) 82 | ``` 83 | 84 | If you do this with `gamm()` the model fitting will fail, and for good reason. At least the two `gam()` models converged so we can see what is going on. Compare the two models using `AIC()` 85 | 86 | ```{r aic-anova-decomp, echo = TRUE, tidy = FALSE, include = TRUE, message = FALSE} 87 | AIC(m.main, m.int) 88 | ``` 89 | 90 | AIC tells us that there is some support for the interaction model, but only when we force a more complex surface by specifying `k`. 91 | 92 | Regardless, the interaction term affords only a little over a 1% improvement in deviance explained; whilst the interaction may be favoured by AIC, any variation in the seasonal effect over time must be small. 93 | 94 | ## Exercises 95 | 96 | 1.
For an example where there is a stronger change in the seasonal effect, see my two blog posts on applying these models to the Central England Temperature series: 97 | - [Post 1](http://www.fromthebottomoftheheap.net/2015/11/21/climate-change-and-spline-interactions/) 98 | - [Post 2](http://www.fromthebottomoftheheap.net/2015/11/23/are-some-seasons-warming-more-than-others/) 99 | 100 | These two posts take you through a full analysis of this data series. 101 | -------------------------------------------------------------------------------- /example-forest-health.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Markov random field smooths & German forest health data" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | --- 10 | 11 | # Introduction 12 | 13 | In this example, we look at a spatial data set of forest health data from a stand in Germany. We can treat the individual trees as discrete spatial units and fit a spatial term using a special type of spline basis, a Markov random field. 14 | 15 | The data come from a project in the forest of Rothenbuch, Germany, which has been ongoing since 1982. The project studies five species but here we just consider the beech data. Each year the condition of each of 83 trees is categorized into one of nine ordinal classes in terms of the degree (percentage) of defoliation. A defoliation value of 0% indicates a healthy beech tree, whilst a value of 100% indicates a tree is dead. For the purposes of this example, several of the nine ordinal classes are merged to form a three-class response variable of interest 16 | 17 | 1. 0%, healthy trees that I'll refer to as the `"low"` defoliation group 18 | 2. 12.5% -- 37.5% defoliation, which I refer to as the `"med"` group, and 19 | 3. 50% -- 100% defoliation, which is the `"high"` group. 20 | 21 | The original classes were `r paste0(c(0,12.5,37.5,50,62.5,75, 87.5, 100), "%")` and are in variable `defol`. 22 | 23 | Alongside the response variable, a number of continuous and categorical covariates are recorded 24 | 25 | - `id` --- location ID number 26 | - `year` --- year of recording 27 | - `x` and `y` --- the x- and y-coordinates of each location 28 | - `age` --- average age of trees at a location 29 | - `canopyd` --- canopy density at the location, in % 30 | - `gradient` --- slope, in % 31 | - `alt` --- altitude above sea level in m 32 | - `depth` --- soil depth in cm 33 | - `ph` --- soil pH at 0--2cm depth 34 | - `watermoisture` --- soil moisture in three categories 35 | 1. `1` moderately dry 36 | 2. `2` moderately moist 37 | 3. `3` moist or temporarily wet 38 | - `alkali` --- fraction of alkali ions in four categories 39 | 1. `1` very low 40 | 2. `2` low 41 | 3. `3` moderate 42 | 4. `4` high 43 | - `humus` --- thickness of the soil humus layer in five categories 44 | 1. `0` 0cm 45 | 2. `1` 1cm 46 | 3. `2` 2cm 47 | 4. `3` 3cm 48 | 5. `4` $>$3cm 49 | - `type` --- type of forest 50 | - `0` deciduous forest 51 | - `1` mixed forest 52 | - `fert` --- fertilization 53 | - `0` not fertilized 54 | - `1` fertilized 55 | 56 | The aim of the example is to investigate the effect of the measured covariates on the degree of defoliation and to quantify any temporal trend and spatial effect of geographic location in the data set, while adjusting for the effects of the other covariates.
57 | 58 | The data are extensively analysed in Fahrmeir, Kneib, Lang, and Marx (2013) *Regression: Models, Methods and Applications*. Springer. 59 | 60 | # Getting started 61 | 62 | Begin by loading the packages we'll use in this example 63 | 64 | ```{r load-packages} 65 | ## Load packages 66 | library("mgcv") 67 | library("ggplot2") 68 | ``` 69 | 70 | Next, load the forest health data set 71 | 72 | ```{r read-data} 73 | ## Read in the forest health data 74 | forest <- read.table("./data/forest-health/beech.raw", header = TRUE, na.strings = ".") 75 | head(forest) 76 | ``` 77 | 78 | The data require a little processing to ensure they are correctly coded. The chunk below does the following: 79 | 80 | 1. Makes a nicer stand label so that we can sort stands numerically even though the id is character, 81 | 2. Aggregates the defoliation data into an ordered categorical variable for low, medium, or high levels of defoliation. This is converted to a numeric variable for modelling, 82 | 3. Converts the categorical variables to factors. `humus` is converted to a 4-level factor, 83 | 4. Removes some `NA`s, and 84 | 5. Summarises the response variable in a table. 85 | 86 | ```{r process-forest-data} 87 | forest <- transform(forest, id = factor(formatC(id, width = 2, flag = "0"))) 88 | 89 | ## Aggregate defoliation & convert categorical vars to factors 90 | levs <- c("low","med","high") 91 | forest <- transform(forest, 92 | aggDefol = as.numeric(cut(defol, breaks = c(-1,10,45,101), 93 | labels = levs)), 94 | watermoisture = factor(watermoisture), 95 | alkali = factor(alkali), 96 | humus = cut(humus, breaks = c(-0.5, 0.5, 1.5, 2.5, 3.5), 97 | labels = 1:4), 98 | type = factor(type), 99 | fert = factor(fert)) 100 | forest <- droplevels(na.omit(forest)) 101 | head(forest) 102 | with(forest, table(aggDefol)) 103 | ``` 104 | 105 | To view the spatial arrangement of the forest stand, plot the unique `x` and `y` coordinate pairs 106 | 107 | ```{r plot-forest-stand} 108 | ## Plot data 109 | ggplot(unique(forest[, c("x","y")]), aes(x = x, y = y)) + 110 | geom_point() 111 | ``` 112 | 113 | # Non-spatial ordered categorical GAM 114 | 115 | Our first model will ignore the spatial component of the data. All continuous variables except `ph` and `depth` are modelled using splines, and all categorical variables are included in the model. Note that we can speed up fitting by turning on some multi-threaded parallel processing in parts of the fitting algorithm via the `nthreads` control argument. The `ocat` family is used and we specify that there are three classes. Smoothness selection is via REML. 116 | 117 | ```{r fit-naive-model} 118 | ## Model 119 | ctrl <- gam.control(nthreads = 3) 120 | forest.m1 <- gam(aggDefol ~ ph + depth + watermoisture + alkali + humus + type + fert + 121 | s(age) + s(gradient, k = 20) + s(canopyd) + s(year) + s(alt), 122 | data = forest, family = ocat(R = 3), method = "REML", 123 | control = ctrl) 124 | ``` 125 | 126 | The model summary 127 | 128 | ```{r summary-naive} 129 | summary(forest.m1) 130 | ``` 131 | 132 | indicates all model terms are significant at the 95% level, especially the smooth terms. Don't pay too much attention to this though as we have not yet accounted for spatial structure in the data and several of the variables are likely to be spatially autocorrelated and hence the identified effects may be spurious. 133 | 134 | This is somewhat confirmed by the form of the fitted functions; rather than smooth, monotonic functions, many terms are highly non-linear and difficult to interpret.
135 | 136 | ```{r plot-naive-smooths} 137 | plot(forest.m1, pages = 1, shade = 2, scale = 0) 138 | ``` 139 | We might expect that older trees are more susceptible to damage yet confusingly there is a decreasing effect for the very oldest trees. The smooth of `gradient` is uninterpretable. Trees in low and high altitude areas are less damaged than those in areas of intermediate elevation, which is also counterintuitive. 140 | 141 | Running `gam.check()` 142 | 143 | ```{r gam-check-naive} 144 | gam.check(forest.m1) 145 | ``` 146 | 147 | suggests no major problems, but the residual plot is difficult to interpret owing to the categorical nature of the response variable. The printed output suggests some smooth terms may need their basis dimension increasing. Before we do this, however, we should add a spatial effect to the model. 148 | 149 | # Spatial GAM via an MRF smooth 150 | 151 | For these data, it would be more natural to fit a spatial effect via a 2-d smoother as we've considered in other examples. However, we can consider the trees as being discrete spatial units and fit a spatial effect via a Markov random field (MRF) smooth. To fit the MRF smoother, we need information on the neighbours of each tree. In this example, any tree within 1.8km of a focal tree was considered a neighbour of that focal tree. This neighbourhood information is stored in a BayesX graph file, which we can convert into the format needed for **mgcv**'s MRF basis function. To facilitate reading the graph file, load the utility function `gra2mgcv()` 152 | 153 | ```{r load-graph-fun} 154 | ## source graph reading function 155 | source("./code_snippets/gra2mgcv.R") 156 | ``` 157 | 158 | Next we load the `.gra` file and do some manipulations to match the `forest` environmental data file 159 | 160 | ```{r load-graph} 161 | ## Read in graph file and output list required by mgcv 162 | nb <- gra2mgcv("./data/forest-health/foresthealth.gra") 163 | nb <- nb[order(as.numeric(names(nb)))] 164 | names(nb) <- formatC(as.numeric(names(nb)), width = 2, flag = "0") 165 | ``` 166 | 167 | Look at the structure of `nb`: 168 | 169 | ```{r head-nb} 170 | head(nb) 171 | ``` 172 | 173 | In **mgcv** the MRF basis can be specified by one or more of 174 | 175 | 1. `polys`; coordinates of vertices defining spatial polygons for each discrete spatial unit. Any two spatial units that share one or more vertices are considered neighbours, 176 | 2. `nb`; a list with one component per spatial unit, where each component contains indices of the neighbouring components of the current spatial unit, and/or 177 | 3. `penalty`; the actual penalty matrix of the MRF basis. This is an $N$ by $N$ matrix with the number of neighbours of each unit on the diagonal, and the $j$th column of the $i$th row set to `-1` if the $j$th spatial unit is a neighbour of the $i$th unit. Elsewhere the penalty matrix is all `0`s; a small sketch of this construction is given below. 178 | 179 | **mgcv** will create the penalty matrix for you if you supply `polys` or `nb`. As we don't have polygons here, we'll use the information from the `.gra` file converted to the format needed by `gam()` to indicate the neighbourhood. 180 | 181 | The `nb` list is passed along using the `xt` argument of the `s()` function. For the MRF basis, `xt` is a named list with one or more of the components listed above. In the model call below we use `xt = list(nb = nb)` to pass on the neighbourhood list.
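To make the penalty in point 3 concrete, here is a minimal sketch (an illustration only, and not needed for the model fit; **mgcv** constructs the penalty itself when given `nb`) of building such a matrix from a neighbourhood list whose components are assumed to hold the positional indices of each unit's neighbours:

```{r penalty-sketch}
## sketch: build an MRF penalty matrix from a neighbourhood list `nbl`,
## assuming the i-th component holds the indices of unit i's neighbours
nb2penalty <- function(nbl) {
    n <- length(nbl)
    P <- matrix(0, n, n, dimnames = list(names(nbl), names(nbl)))
    for (i in seq_len(n)) {
        P[i, nbl[[i]]] <- -1          # -1 marks each neighbour of unit i
        P[i, i] <- length(nbl[[i]])   # number of neighbours on the diagonal
    }
    P
}
```

A matrix like this could be supplied directly via `xt = list(penalty = ...)`; here we instead let **mgcv** build it from the neighbourhood list.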
To indicate that an MRF smooth is needed, we use `bs = "mrf"` and the covariate of the smooth is the factor indicating to which spatial unit each observation belongs. In the call below we use the `id` variable. Note that this doesn't need to be a factor, just something that can be coerced to one. 182 | 183 | All other aspects of the model fit remain the same hence we use `update()` to add the MRF smooth without repeating everything from the original call. 184 | 185 | ```{r fit-mrf-model} 186 | ## Fit model with MRF 187 | ## forest.m2 <- gam(aggDefol ~ ph + depth + watermoisture + alkali + humus + type + fert + 188 | ## s(age) + s(gradient, k = 20) + s(canopyd) + s(year) + s(alt) + 189 | ## s(id, bs = "mrf", xt = list(nb = nb)), 190 | ## data = forest, family = ocat(R = 3), method = "REML", 191 | ## control = ctrl) 192 | forest.m2 <- update(forest.m1, . ~ . + s(id, bs = "mrf", xt = list(nb = nb))) 193 | ``` 194 | 195 | Look at the model summary: 196 | 197 | ```{r summary-mrf-model} 198 | summary(forest.m2) 199 | ``` 200 | 201 | What differences do you see between the model with the MRF spatial effect and the first model that we fitted? 202 | 203 | Quickly create plots of the fitted smooth functions 204 | 205 | ```{r plot-mrf-smooths} 206 | plot(forest.m2, pages = 1, shade = 2, scale = 0, seWithMean = TRUE) 207 | ``` 208 | 209 | Notice that here we use `seWithMean = TRUE` as one of the terms has been shrunk back to a linear function and would otherwise have a *bow tie* confidence interval. 210 | 211 | Compare these fitted functions with those from the original model. Are these more in keeping with expectations? 212 | 213 | Model diagnostics are difficult with models for discrete responses such as these. Instead we can interrogate the model to derive quantities of interest that illustrate or summarise the model fit. First we start by generating posterior probabilities for the three ordinal defoliation/damage categories using `predict()`. This is similar to the code Eric showed this morning for the ordered categorical family. 214 | 215 | ```{r fitted-values} 216 | fit <- predict(forest.m2, type = "response") 217 | colnames(fit) <- levs 218 | head(fit) 219 | ``` 220 | 221 | The `predict()` method returns a 3-column matrix, one column per category. The entries in the matrix are the posterior probabilities of class membership, with rows summing to 1. As we'll see later, we can take a *majority wins* approach and assign each observation a fitted class membership and compare these to the known classes. 222 | 223 | For easier visualisation it would be nicer to have these fitted probabilities in a tidy form, which we now do via a few manipulations 224 | 225 | ```{r process-fitted-values} 226 | fit <- setNames(stack(as.data.frame(fit)), c("Prob", "Defoliation")) 227 | fit <- transform(fit, Defoliation = factor(Defoliation, labels = levs)) 228 | fit <- with(forest, cbind(fit, x, y, year)) 229 | head(fit) 230 | ``` 231 | 232 | The fitted values are now ready for plotting. 
Here we plot just the years 2000--2004, showing the spatial arrangement of trees, faceted by defoliation/damage category: 233 | 234 | ```{r plot-fitted} 235 | ggplot(subset(fit, year >= 2000), aes(x = x, y = y, col = Prob)) + 236 | geom_point(size = 1.2) + 237 | facet_grid(Defoliation ~ year) + 238 | coord_fixed() + 239 | theme(legend.position = "top") 240 | ``` 241 | 242 | A more complex use of the model involves predicting change in class posterior probability over time whilst holding all other variables at the observed mean or category mode 243 | 244 | ```{r timeseries-predictions} 245 | N <- 200 246 | pdat <- with(forest, expand.grid(year = seq(min(year), max(year), length = N), 247 | age = mean(age, na.rm = TRUE), 248 | gradient = mean(gradient, na.rm = TRUE), 249 | canopyd = mean(canopyd, na.rm = TRUE), 250 | alt = mean(alt, na.rm = TRUE), 251 | depth = mean(depth, na.rm = TRUE), 252 | ph = mean(ph, na.rm = TRUE), 253 | humus = factor(2, levels = levels(humus)), 254 | watermoisture = factor(2, levels = levels(watermoisture)), 255 | alkali = factor(2, levels = levels(alkali)), 256 | type = factor(0, levels = c(0,1)), 257 | fert = factor(0, levels = c(0,1)), 258 | id = levels(id))) 259 | pred <- predict(forest.m2, newdata = pdat, type = "response", exclude = "s(id)") 260 | colnames(pred) <- levs 261 | pred <- setNames(stack(as.data.frame(pred)), c("Prob","Defoliation")) 262 | pred <- cbind(pred, pdat) 263 | pred <- transform(pred, Defoliation = factor(Defoliation, levels = levs)) 264 | ``` 265 | 266 | A plot of the predictions is produced using 267 | 268 | ```{r plot-timeseries-predictions} 269 | ggplot(pred, aes(x = year, y = Prob, colour = Defoliation)) + 270 | geom_line() + 271 | theme(legend.position = "top") 272 | ``` 273 | 274 | Using similar code, produce a plot for the effect of `age` holding other values at the mean or mode. 275 | 276 | The final summary of the model that we'll produce is a confusion matrix of observed and predicted class membership. Technically, this is over-optimistic as we are using the same data to both fit and test the model, but it is an illustration of how well the model does at fitting the observed classes. 277 | 278 | ```{r confusion-matrix} 279 | fit <- predict(forest.m2, type = "response") 280 | fitClass <- factor(levs[apply(fit, 1, which.max)], levels = levs) 281 | obsClass <- with(forest, factor(aggDefol, labels = levs)) 282 | sum(fitClass == obsClass) / length(fitClass) 283 | table(fitClass, obsClass) 284 | ``` 285 | 286 | Just don't take it as any indication of how well the model will do at predicting the defoliation class for a new year or tree. 287 | 288 | How much better does the model with the MRF spatial effect fit the data than the original model? We can use AIC as one means of answering that question 289 | 290 | ```{r aic} 291 | AIC(forest.m1, forest.m2) 292 | ``` 293 | 294 | Which model does AIC indicate as fitting the data best? 295 | 296 | ## Exercise 297 | 298 | As an additional exercise, replace the MRF smooth with a 2-d spline of the `x` and `y` coordinates and produce maps of the class probabilities over the spatial domain. You'll need to use ideas and techniques from some of the other spatial examples in order to complete this task. A possible starting point is sketched below.
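One (hedged) way to begin, assuming the same covariates as before, is to swap in the 2-d smooth via `update()` — the mapping of predicted class probabilities is left to you:

```{r xy-exercise-sketch}
## One possible starting point (not a full solution): swap the MRF for a 2-d spline
forest.m3 <- update(forest.m1, . ~ . + s(x, y))
summary(forest.m3)
```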
299 | -------------------------------------------------------------------------------- /example-linear-functionals.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Linear functionals; or: what to do when you've got a bunch of x-values and only one y" 3 | author: Eric Pedersen 4 | output: 5 | html_document: 6 | toc: true 7 | toc_float: true 8 | theme: readable 9 | highlight: haddock 10 | --- 11 | 12 | ##1. Background 13 | 14 | Up to now, we've been looking at data where every predictor we're using to 15 | predict our y-values consists of a single value for each y. However, not all 16 | predictor data looks like this. In many cases, a single predictor may have 17 | several values associated with the same outcome variable. For instance, let's 18 | say we've measured total ecosystem production in several different lakes, as 19 | well as temperature along a gradient away from shoreline in the lake. We want to know: 20 | 21 | 1. how well temperature predicts production 22 | 2. whether temperatures at different distances from the shore have different effects on 23 | production (as production varies a lot between the littoral and pelagic zone). 24 | 25 | Now, it looks like there might be a few ways to answer this in a standard gam 26 | setting. First we could just average temperatures across locations and use that 27 | as a predictor to answer question 1. However, that doesn't give us any insight 28 | into question 2, and it's pretty easy to imagine a case where warmer temperatures 29 | at the shore increase production but warmer temperatures in the middle of the 30 | lake have very little effect, or where a single warm area could substantially 31 | increase production even in a cold lake. We could also try to fit a separate 32 | smooth of production on temperature at each shore distance. However, if we have 33 | a lot of distances, this ends up fitting a bunch of separate smooths, and we'd 34 | rapidly run out of degrees of freedom. Also, it would mean throwing away 35 | information; we'd expect that the effect of temperature a meter from the 36 | shore should be very similar to the effect of temperature right at the 37 | shoreline, but by fitting a separate smooth for each, we're ignoring that (and 38 | likely going to suffer from concurvity issues to boot!). 39 | 40 | What we need is a method that can account for the fact that we have multiple x 41 | values for a given predictor. Fortunately, `mgcv` can handle this pretty easily. 42 | It does this by allowing you to pass matrix-valued predictors to `s()` and `te()` 43 | terms. When you pass `mgcv` a matrix, it will fit the same smooth function to 44 | each of the columns of the matrix, then sum the transformed values across the 45 | columns of each row to give the estimated mean value for a given `y`. Let's say 46 | `y` is a one-dimensional outcome, with n measured values, and `x` is a matrix 47 | with n rows and k columns. The predicted values for the ith value of `y` would 48 | be: $y_i \sim \sum_{j=1}^k f(x_{i,j})$, where $f(x_{i,j})$ is itself a sum of 49 | basis functions multiplied by model coefficients as before. 50 | 51 | In mathematical terms, this makes our smooth term a *functional*, where our 52 | outcome is a function of a vector of terms rather than a single term, and a 53 | *linear* functional, as it's a linear sum of smooth terms for each column; but you 54 | really don't need to know much about functionals to use these.
I only mention 55 | the functional terminology because, if you want to dig into this approach further or if 56 | you want to read the help files on these, you have to search 57 | `?linear.functional.terms`. 58 | 59 | There are a few major uses I've found for these. There are likely more than this, 60 | but these are all cases I've encountered. Also, each of these cases requires you 61 | to set up your predictor matrices carefully, so make sure you know what kind of 62 | functional you'll be fitting, and what predictor matrix is going where in the `s()` 63 | function. 64 | 65 | ### Case 1. Nonlinear averaging. 66 | The first case, and the simplest to fit, is to estimate 67 | a nonlinear average. Let's look at the lake example from before. Say we think 68 | that distance from shore shouldn't matter, but that production increases 69 | non-linearly with temperature. In that case, a cold lake with a few hot spots 70 | may actually be more productive than a lake that's consistently lukewarm. 71 | Therefore, if you average over temperatures before estimating a function, you'll 72 | miss this effect (as we know from statistics that the average of a non-linear 73 | function applied to x does not, in general, equal the function applied to the 74 | average of x). In this case, we can just provide a matrix of values (the predictor matrix) to `s()`: here the predictor matrix is a matrix of temperatures where each column is the temperature measured at one of the sampled distances from shore. 75 | We do this in the code below with the variable `temp_matrix`. 76 | 77 | 78 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE} 79 | library(dplyr) 80 | library(mgcv) 81 | library(ggplot2) 82 | library(tidyr) 83 | n_lakes = 200 84 | lake_mean_temps = rnorm(n_lakes, 25,6) 85 | lake_data = as.data.frame(expand.grid(lake = 1:n_lakes, 86 | shore_dist_m = c(0,1,5,10,20,40))) 87 | lake_data$temp = rnorm(n_lakes*6, lake_mean_temps[lake_data$lake], 8) 88 | lake_data$production = 5*dnorm(lake_data$temp, 30, sd = 5)/dnorm(1,0,5)+rnorm(n_lakes*6, 0, 1) 89 | lake_data = lake_data %>% group_by(lake)%>% 90 | mutate(production = mean(production), 91 | mean_temp = mean(temp))%>% 92 | spread(shore_dist_m, temp) 93 | 94 | 95 | lake_data$temp_matrix = lake_data %>% 96 | select(`0`:`40`) %>% 97 | as.matrix(.) 98 | head(lake_data) 99 | 100 | 101 | mean_temp_model = gam(production~s(mean_temp),data=lake_data) 102 | nonlin_temp_model = gam(production~s(temp_matrix), 103 | data=lake_data) 104 | 105 | layout(matrix(1:2, nrow=1)) 106 | plot(mean_temp_model) 107 | plot(nonlin_temp_model) 108 | layout(1) 109 | temp_predict_data = data.frame(temp = seq(10,40,length=100), 110 | mean_temp = seq(10,40,length=100)) 111 | 112 | temp_matrix = matrix(seq(10,40,length=100),nrow= 100,ncol=6, 113 | byrow = F) 114 | 115 | temp_predict_data = temp_predict_data %>% 116 | mutate(linear_average = as.numeric(predict(mean_temp_model,.)), 117 | nonlinear_average = as.numeric(predict(nonlin_temp_model, .)), 118 | true_function = 5*dnorm(temp, 30, sd = 5)/dnorm(1,0,5))%>% 119 | gather(model, value,linear_average:true_function) 120 | 121 | #This plots the two fitted models vs. the true model. 122 | ggplot(aes(temp, value,color=model), data=temp_predict_data)+ 123 | geom_line()+ 124 | scale_color_brewer(palette="Set1")+ 125 | theme_bw(20) 126 | ``` 127 | 128 | The nonlinear model is substantially more uncertain at the ends, 129 | but it fits the true function much more effectively.
The function from 130 | the average data substantially underestimates the effect of temperature on 131 | production. 132 | 133 | ### Case 2: Weighted averaging 134 | The next major case where linear functionals are useful is for weighted 135 | averages. Here, you have some predictor variable measured at various points at 136 | different distances or lags from a given point, or at different locations along 137 | some gradient. This could be a variable that's been measured at various distances 138 | away from each observed site, and you want to understand at what scale that 139 | variable will affect your parameter of interest. It could also be a predictor 140 | variable measured at several time points before the observation, and you want to 141 | know at what lags the two variables interact. 142 | 143 | In this case, we'll assume that the relationship between our variable of 144 | interest (x) and the outcome is linear at any given lag, but that linear 145 | relationship changes smoothly with the lag. We'll look at the case where both 146 | relationships are nonlinear next. Here we have to create two matrices. The first 147 | is the lag matrix, and it will have one column for each lag we're interested in. 148 | All the values in a given column will be equal to the lag value. 149 | The second matrix is the predictor matrix, the same as in Case 1 (see [the section below](#by_var) on `by=` terms for how this works). 150 | 151 | 152 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE} 153 | library(dplyr) 154 | library(mgcv) 155 | library(ggplot2) 156 | library(tidyr) 157 | n_lakes = 200 158 | lake_mean_temps = rnorm(n_lakes, 25,6) 159 | lake_data = as.data.frame(expand.grid(lake = 1:n_lakes, 160 | shore_dist_m = c(0,1,5,10,20,40))) 161 | lake_data$temp = rnorm(n_lakes*6, lake_mean_temps[lake_data$lake], 8) 162 | lake_data$production = with(lake_data, rnorm(n_lakes*6, 163 | temp*3*exp(-shore_dist_m/10),1)) 164 | lake_data = lake_data %>% group_by(lake)%>% 165 | mutate(production = mean(production), 166 | mean_temp = mean(temp))%>% 167 | spread(shore_dist_m, temp) 168 | 169 | #This is our lag matrix for this example 170 | shore_dist_matrix = matrix(c(0,1,5,10,20,40), nrow=n_lakes,ncol=6, byrow = T) 171 | head(shore_dist_matrix) 172 | 173 | #This is our predictor matrix 174 | lake_data$temp_matrix = lake_data %>% 175 | select(`0`:`40`) %>% 176 | as.matrix(.) 177 | head(lake_data) 178 | 179 | #Note: we need to set k=6 here, as we really only have 6 degrees of freedom (one 180 | #for each distance we've measured from the shore). 181 | 182 | nonlin_temp_model = gam(production~s(shore_dist_matrix, by= temp_matrix,k=6), 183 | data=lake_data) 184 | plot(nonlin_temp_model) 185 | 186 | #The fitted smooth shows how the (linear) effect of temperature on production 187 | #changes with distance from the shore; in this simulation the true weight 188 | #function declines exponentially with distance. 189 | 190 | 191 | ``` 192 | 193 | ## 2. Key concepts and functions 194 | 195 | ### using `by=` terms in smoothers {#by_var} 196 | One of the most useful features in `mgcv` is the `by=` argument for smooths. 197 | This has two uses. 198 | 199 | 1. For a given smooth (say, `y~s(x)`), if you set `s(x, by=group)`, where group 200 | is some factor predictor, `mgcv` will fit a separate smooth of x for 201 | each level of that factor. The model will produce a different smooth $s_k(x)$ 202 | for each kth level, allowing you to test if a given relationship varies between 203 | groups. 204 | 2.
If instead you set `s(x, by=z)` where z is a numerical value, `mgcv` will 205 | fit a varying slopes regression. Instead of modeling y as a smooth function of 206 | x, this will model y as a *linear* function of z, where the slope of the 207 | relationship between z and y changes smoothly as a function of x, so the 208 | predicted value for the ith value of y would be: $y_i \sim f(x_i) \cdot z_i$. 209 | If you're using matrix predictors as discussed above, a numeric 210 | `by` variable `z` also has to be a matrix with the same dimensions as `x`. 211 | The predicted value in this case will be: $y_i \sim \sum_{j=1}^k f(x_{i,j})\cdot z_{i,j}$, where $j$ indexes the columns of the matrices $x$ and $z$. 212 | 213 | 214 | ##Calculating dispersal kernels 215 | As a final example, we can use a linear functional of the log distances from each seedling plot to every seed source to model seedling counts, and compare it with a model that only uses the distance to the nearest source. 216 | ```{r, echo=T,tidy=F,include=T, message=FALSE,highlight=TRUE} 217 | yukon_seedling_data = read.csv("data/yukon_seeds/seed_data.csv") 218 | yukon_source_data =read.csv("data/yukon_seeds/seed_source_locations.csv") 219 | seed_dist = matrix(0, nrow = nrow(yukon_seedling_data), 220 | ncol= nrow(yukon_source_data)) 221 | for(i in 1:nrow(yukon_seedling_data)){ #distance from each plot to every source 222 | seed_dist[i,] = sqrt((yukon_source_data$X- yukon_seedling_data$X[i])^2 + (yukon_source_data$Y- yukon_seedling_data$Y[i])^2) 223 | } 224 | seed_dist_l = log(seed_dist) 225 | yukon_seedling_data$min_dist_l = apply(seed_dist_l, MARGIN = 1,min) #log distance to the nearest source 226 | 227 | basic_dispersal_model = gam(n_spruce~s(min_dist_l)+offset(log(plot_area_m2)), #nearest source only 228 | data=yukon_seedling_data, family=nb) 229 | full_dispersal_model = gam(n_spruce~s(seed_dist_l)+offset(log(plot_area_m2)), #linear functional over all sources 230 | data=yukon_seedling_data, family=nb) 231 | ``` 232 | 233 | -------------------------------------------------------------------------------- /example-spatial-mexdolphins-solutions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Further analysis of pantropical spotted dolphins in the Gulf of Mexico" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | 10 | --- 11 | 12 | # Preamble 13 | 14 | This exercise is based on the [Appendix of Miller et al 2013](http://distancesampling.org/R/vignettes/mexico-analysis.html). In this example we're ignoring all kinds of important things like detectability and availability. This should not be treated as a serious analysis of these data! For a more complete treatment of detection-corrected abundance estimation via distance sampling and generalized additive models, see Miller et al. (2013). 15 | 16 | From that appendix: 17 | 18 | *The analysis is based on a dataset of observations of pantropical dolphins in the Gulf of Mexico (shipped with Distance 6.0 and later). For convenience the data are bundled in an `R`-friendly format, although all of the code necessary for creating the data from the Distance project files is available [on github](http://github.com/dill/mexico-data). The OBIS-SEAMAP page for the data may be found at the [SEFSC GoMex Oceanic 1996](http://seamap.env.duke.edu/dataset/25) survey page.* 19 | 20 | 21 | ## Doing these exercises 22 | 23 | Probably the easiest way to do these exercises is to open this document in RStudio and go through the code blocks one by one (hitting the "play" button in the editor window), filling in the code where necessary and executing the commands one-by-one. You can then compile the document once you're done to check that everything works.
24 | 25 | # Data format 26 | 27 | The data are provided in the `data/mexdolphins` folder as the file `mexdolphins.RData`. Loading this we can see what is provided: 28 | 29 | 30 | ```{r loaddata} 31 | load("data/mexdolphins/mexdolphins.RData") 32 | ls() 33 | ``` 34 | 35 | - `mexdolphins` the `data.frame` containing the observations and covariates, used to fit the model. 36 | - `pred_latlong` an `sp` object that has the shapefile for the prediction grid, used for fancy graphs 37 | - `preddata` prediction grid without any fancy spatial stuff 38 | 39 | Looking further into the `mexdolphins` frame we see: 40 | 41 | ```{r frameinspect} 42 | str(mexdolphins) 43 | ``` 44 | 45 | A brief explanation of each entry: 46 | 47 | - `Sample.Label` identifier for the effort "segment" (approximately square sampling area) 48 | - `Transect.Label` identifier for the transect that this segment belongs to 49 | - `longitude`, `latitude` location in lat/long of this segment 50 | - `x`, `y` location in projected coordinates (projected using the [North American Lambert Conformal Conic projection](https://en.wikipedia.org/wiki/Lambert_conformal_conic_projection)) 51 | - `Effort` the length of the current segment 52 | - `depth` the bathymetry at the segment's position 53 | - `count` number of dolphins observed in this segment 54 | - `segment.area` the area of the segment (`Effort` multiplied by the width of the segment) 55 | - `off.set` the logarithm of the `segment.area` multiplied by a correction for detectability (see link to appendix above for more information on this) 56 | 57 | 58 | # Modelling 59 | 60 | Our objective here is to build a spatially explicit model of abundance of the dolphins. In some sense this is a kind of species distribution model. 61 | 62 | Our possible covariates to model abundance are location and depth. These are fairly good predictors of the abundance (SPOILER ALERT), though we could probably improve the model further by including things like sea surface temperature and chlorophyll *a*. 63 | 64 | ## A simple model to start with 65 | 66 | We can begin with the model we showed in the first lecture: 67 | 68 | ```{r simplemodel} 69 | library(mgcv) 70 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set), 71 | data = mexdolphins, 72 | family = quasipoisson(), 73 | method = "REML") 74 | ``` 75 | 76 | That is, we fit the counts as a function of depth, using the offset to take into account effort. We use a quasi-Poisson response (i.e., modelling just the mean-variance relationship such that the variance is proportional to the mean) and use REML for smoothness selection. 77 | 78 | We can check the assumptions of this model by using `gam.check`: 79 | 80 | ```{r simplemodel-check} 81 | gam.check(dolphins_depth) 82 | ``` 83 | 84 | As is usual for count data, these plots are a bit tricky to interpret. For example the residuals vs linear predictor plot in the top right has that nasty line through it that makes looking for pattern tricky. We can see easily that the line equates to the zero count observations: 85 | 86 | ```{r zeroresids, fig.width=7, fig.height=7} 87 | # code from the insides of mgcv::gam.check 88 | resid <- residuals(dolphins_depth, type="deviance") 89 | linpred <- napredict(dolphins_depth$na.action, dolphins_depth$linear.predictors) 90 | plot(linpred, resid, main = "Resids vs.
linear pred.", 91 | xlab = "linear predictor", ylab = "residuals") 92 | 93 | # now add red dots corresponding to the zero counts 94 | points(linpred[mexdolphins$count==0],resid[mexdolphins$count==0], 95 | pch=19, col="red", cex=0.5) 96 | ``` 97 | 98 | We can use randomised quantile residuals instead of deviance residuals to get around this in some cases (though not quasi-Poisson, as we don't have a proper likelihood!). 99 | 100 | Ignoring the plots for now (as we'll address them in the next section), let's look at the text output. It seems that the `k` value we set (or rather the default of 10) has been adequate. 101 | 102 | We could increase the value of `k` by replacing the `s(...)` with, for example, `s(depth, k=25)` (for a possibly very wiggly function) or `s(depth, k=3)` (for a much less wiggly function). Making `k` big will create a bigger design matrix and penalty matrix. 103 | 104 | 105 | **Exercise** 106 | 107 | Look at the differences in the size of the design and penalty matrices by using `dim(model.matrix(...))` and `dim(model$smooth[[1]]$S[[1]])`, replacing `...` and `model` appropriately for models with `k=3` and `k=30`. 108 | 109 | ```{r simplemodel-bigsmall} 110 | dolphins_depth_bigk <- gam(count ~ s(depth, k=30) + offset(off.set), 111 | data = mexdolphins, 112 | family = quasipoisson(), 113 | method = "REML") 114 | dim(model.matrix(dolphins_depth_bigk)) 115 | dim(dolphins_depth_bigk$smooth[[1]]$S[[1]]) 116 | dolphins_depth_smallk <- gam(count ~ s(depth, k=3) + offset(off.set), 117 | data = mexdolphins, 118 | family = quasipoisson(), 119 | method = "REML") 120 | dim(model.matrix(dolphins_depth_smallk)) 121 | dim(dolphins_depth_smallk$smooth[[1]]$S[[1]]) 122 | ``` 123 | 124 | (Don't worry about the many square brackets etc to get the penalty matrix!) 125 | 126 | ### Plotting 127 | 128 | We can plot the smooth we fitted using `plot`. 129 | 130 | **Exercise** 131 | 132 | Compare the first model we fitted with the two using different `k` values above. Use `par(mfrow=c(1,3))` to put them all in one graphic. Look at `?plot.gam` and plot the confidence intervals as a filled "confidence band". Title the plots appropriately so you can check which is which. 133 | 134 | ```{r plotk, } 135 | par(mfrow=c(1,3)) 136 | plot(dolphins_depth, shade=TRUE, main="Default k") 137 | plot(dolphins_depth_bigk, shade=TRUE, main="big k") 138 | plot(dolphins_depth_smallk, shade=TRUE, main="small k") 139 | ``` 140 | 141 | ## Count distributions 142 | 143 | In general quasi-Poisson doesn't seem to do too great a job at modelling data with many zeros. Luckily we have a few tricks up our sleeves... 144 | 145 | 146 | ### Tweedie 147 | 148 | Adding a smooth of `x` and `y` to our model with `s(x,y)`, we can then switch the `family=` argument to use `tw()` for a Tweedie distribution. 149 | 150 | ```{r tw} 151 | dolphins_xy_tw <- gam(count ~ s(x,y) + s(depth) + offset(off.set), 152 | data = mexdolphins, 153 | family = tw(), 154 | method = "REML") 155 | ``` 156 | 157 | More information on Tweedie distributions can be found in Foster & Bravington (2012) and Shono (2008). 158 | 159 | 160 | 161 | ### Negative binomial 162 | 163 | **Exercise** 164 | 165 | Now do the same using the negative binomial distribution (`nb()`).
166 | 167 | ```{r nb} 168 | dolphins_xy_nb <- gam(count ~ s(x,y) + s(depth) + offset(off.set), 169 | data = mexdolphins, 170 | family = nb(), 171 | method = "REML") 172 | ``` 173 | 174 | Looking at the quantile-quantile plots only in the `gam.check` output for these two models, which do you prefer? Why? 175 | 176 | *Looks like Tweedie is better here as the points are closer to the x=y line in the Q-Q plot. Also the histogram of residuals looks more (though not very) normal.* 177 | 178 | Look at the results of calling `summary` on both models and note that there are differences in the resulting models, due to the differing mean-variance relationships. 179 | 180 | ## Smoothers 181 | 182 | Now let's move on to using different bases for the smoothers. We have a couple of different options here. 183 | 184 | 185 | ### Thin plate splines with shrinkage 186 | 187 | By default we use the `"tp"` basis. This is just plain thin plate regression splines (as defined in Wood, 2003). We can also use the `"ts"` basis, which is the same but with extra shrinkage on the usually unpenalised parts of the model. In the univariate case this is the linear slope term of the smooth. 188 | 189 | **Exercise** 190 | 191 | Compare the results from one of the models above with a version using the thin plate with shrinkage using the `bs="ts"` argument to `s()` for both terms. 192 | 193 | ```{r tw-ts} 194 | dolphins_xy_tw_ts <- gam(count ~ s(x,y, bs="ts") + s(depth, bs="ts") + 195 | offset(off.set), 196 | data = mexdolphins, 197 | family = tw(), 198 | method = "REML") 199 | ``` 200 | 201 | What are the differences (use `summary`)? 202 | 203 | *EDFs are different for both terms* 204 | 205 | What are the visual differences (use `plot`)? 206 | 207 | *Not really much difference here!* 208 | 209 | 210 | ### Soap film smoother 211 | 212 | We can use a soap film smoother (Wood et al., 2008) to take into account a complex boundary, such as a coastline or islands. 213 | 214 | Here I've built a simple coastline of the US states bordering the Gulf of Mexico (see the `soap_pred.R` file for how this was constructed). We can load up this boundary and the prediction grid from the following `RData` file: 215 | 216 | ```{r soapy} 217 | load("data/mexdolphins/soapy.RData") 218 | ``` 219 | 220 | Now we need to build knots for the soap film; for this we simply create a grid, then find the grid points inside the boundary. We don't need too many of them. 221 | 222 | ```{r soapknots} 223 | soap_knots <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), length.out=10), 224 | y=seq(min(xy_bnd$y), max(xy_bnd$y), length.out=10)) 225 | x <- soap_knots$x; y <- soap_knots$y 226 | ind <- inSide(xy_bnd, x, y) 227 | rm(x,y) 228 | soap_knots <- soap_knots[ind, ] 229 | ## inSide doesn't work perfectly, so if you get an error like: 230 | ## Error in crunch.knots(ret$G, knots, x0, y0, dx, dy) : 231 | ## knot 54 is on or outside boundary 232 | ## just remove that knot as follows: 233 | soap_knots <- soap_knots[-8, ] 234 | soap_knots <- soap_knots[-54, ] 235 | ``` 236 | 237 | We can now fit our model. Note that we specify a basis via `bs=` and the boundary via `xt` (for e`xt`ra information) in the `s()` term. We also include the knots as a `knots=` argument to `gam`.
238 | 239 | ```{r soapmodel} 240 | dolphins_soap <- gam(count ~ s(x,y, bs="so", xt=list(bnd=list(xy_bnd))) + 241 | offset(off.set), 242 | data = mexdolphins, 243 | family = tw(), 244 | knots = soap_knots, 245 | method = "REML") 246 | 247 | ``` 248 | 249 | **Exercise** 250 | 251 | Look at the `summary` output for this model and compare it to the other models. 252 | 253 | ```{r soap-summary} 254 | summary(dolphins_soap) 255 | ``` 256 | 257 | The plotting function for soap film smooths looks much nicer by default than for other 2D smooths -- try it out. 258 | 259 | 260 | ```{r soap-plot} 261 | plot(dolphins_soap) 262 | ``` 263 | 264 | # Predictions 265 | 266 | As we saw in the intro to GAMs slides, `predict` is your friend when it comes to making predictions for the GAM. 267 | 268 | We can do this very simply, calling predict as one would with a `glm`. For example: 269 | 270 | ```{r pred-qp} 271 | pred_qp <- predict(dolphins_depth, preddata, type="response") 272 | ``` 273 | 274 | Now, this just gives a long vector of numbers for the predicted number of animals per cell. We can find the total abundance using `sum`. 275 | 276 | **Exercise** 277 | 278 | How many dolphins are there in the total area? What is the maximum in a given cell? What is the minimum? 279 | 280 | ```{r total-max-min} 281 | sum(pred_qp) 282 | range(pred_qp) 283 | ``` 284 | 285 | ## Plotting predictions 286 | 287 | *Note that this section requires quite a few additional packages to run the examples, so may not run the first time. You can use* `install.packages` *to grab the packages you need.* 288 | 289 | Plotting predictions in projected coordinate systems is tricky. I'll show two methods here but not go into too much detail, as that's not the aim of this workshop. 290 | 291 | For the non-soap models, we'll use the below helper function to put the predictions into a bunch of squares and then return an appropriate `ggplot2` object for us to plot: 292 | 293 | ```{r gridplotfn} 294 | library(plyr) 295 | # fill must be in the same order as the polygon data 296 | grid_plot_obj <- function(fill, name, sp){ 297 | 298 | # what was the data supplied?
299 | names(fill) <- NULL 300 | row.names(fill) <- NULL 301 | data <- data.frame(fill) 302 | names(data) <- name 303 | 304 | spdf <- SpatialPolygonsDataFrame(sp, data) 305 | spdf@data$id <- rownames(spdf@data) 306 | spdf.points <- fortify(spdf, region="id") 307 | spdf.df <- join(spdf.points, spdf@data, by="id") 308 | 309 | # the x/y seem to be stored as "long" and "lat" even when 310 | # the data are projected 311 | spdf.df$x <- spdf.df$long 312 | spdf.df$y <- spdf.df$lat 313 | 314 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df) 315 | } 316 | ``` 317 | 318 | Then we can plot the predicted abundance: 319 | 320 | ```{r plotpred} 321 | library(ggplot2) 322 | library(viridis) 323 | library(sp) 324 | library(rgeos) 325 | library(rgdal) 326 | library(maptools) 327 | 328 | # projection string 329 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 330 | # need sp to transform the data 331 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4) 332 | 333 | p <- ggplot() + 334 | # abundance 335 | grid_plot_obj(pred_qp, "N", pred.polys) + 336 | # survey lines 337 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 338 | # observations 339 | geom_point(aes(x, y, size=count), 340 | data=mexdolphins[mexdolphins$count>0,], 341 | colour="red", alpha=I(0.7)) + 342 | # make the coordinate system fixed 343 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 344 | xlim = range(mexdolphins$x)) + 345 | # use the viridis colourscheme, which is more colourblind-friendly 346 | scale_fill_viridis() + 347 | # labels 348 | labs(fill="Predicted\ndensity",x="x",y="y",size="Count") + 349 | # keep things simple 350 | theme_minimal() 351 | print(p) 352 | ``` 353 | 354 | If you have the `maps` package installed you can also try the following to plot the coastline too: 355 | 356 | ```{r maptoo} 357 | library(maps) 358 | map_dat <- map_data("usa") 359 | map_sp <- SpatialPoints(map_dat[,c("long","lat")]) 360 | 361 | # give the sp object a projection 362 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84") 363 | # re-project 364 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4) 365 | map_dat$x <- map_sp.t$long 366 | map_dat$y <- map_sp.t$lat 367 | 368 | p <- p + geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 369 | 370 | print(p) 371 | ``` 372 | 373 | ## Comparing predictions from soap and thin plate splines 374 | 375 | The `soap_preddata` frame has a much larger prediction grid that covers a much wider area. 376 | 377 | Predicting using the soap film smoother is exactly the same as for the other smoothers: 378 | 379 | ```{r soap-pred} 380 | soap_pred_N <- predict(dolphins_soap, soap_preddata, type="response") 381 | soap_pred_N <- cbind.data.frame(soap_preddata, N=soap_pred_N, model="soap") 382 | ``` 383 | 384 | Use the above as a template (along with `ggplot2::facet_wrap()`) to build a comparison plot with the predictions of the soap film and another model.
385 | 386 | ```{r xy-soap-pred} 387 | dolphins_xy_q <- gam(count ~ s(x,y, bs="ts") + offset(off.set), 388 | data = mexdolphins, 389 | family = tw(), 390 | method = "REML") 391 | xy_pred_N <- predict(dolphins_xy_q, soap_preddata, type="response") 392 | xy_pred_N <- cbind.data.frame(soap_preddata, N=xy_pred_N, model="xy") 393 | all_pred_N <- rbind(soap_pred_N, xy_pred_N) 394 | 395 | p <- ggplot() + 396 | # abundance 397 | geom_tile(aes(x=x, y=y, width=sqrt(area), height=sqrt(area), fill=N), data=all_pred_N) + 398 | # facet it! 399 | facet_wrap(~model) + 400 | # make the coordinate system fixed 401 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 402 | xlim = range(mexdolphins$x)) + 403 | # use the viridis colourscheme, which is more colourblind-friendly 404 | scale_fill_viridis() + 405 | # labels 406 | labs(fill="Predicted\ndensity", x="x", y="y") + 407 | # keep things simple 408 | theme_minimal() + 409 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 410 | print(p) 411 | ``` 412 | 413 | ## Plotting uncertainty 414 | 415 | **Exercise** 416 | 417 | We can use the `se.fit=TRUE` argument to get the per-cell standard errors, then divide these through by the abundances to get a coefficient of variation (CV) per cell. We can then plot that using the same technique as above. Try this out below. 418 | 419 | Note that the resulting object from setting `se.fit=TRUE` is a list with two elements. 420 | 421 | ```{r CVmap} 422 | pred_se_qp <- predict(dolphins_depth, preddata, type="response", se.fit=TRUE) 423 | cv <- pred_se_qp$se.fit/pred_se_qp$fit 424 | 425 | p <- ggplot() + 426 | # abundance 427 | grid_plot_obj(cv, "CV", pred.polys) + 428 | # survey lines 429 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 430 | # make the coordinate system fixed 431 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 432 | xlim = range(mexdolphins$x)) + 433 | # use the viridis colourscheme, which is more colourblind-friendly 434 | scale_fill_viridis() + 435 | # labels 436 | labs(fill="CV",x="x",y="y",size="Count") + 437 | # keep things simple 438 | theme_minimal() 439 | print(p) 440 | ``` 441 | 442 | Compare the uncertainties of the different models you've fitted so far. 443 | 444 | ## `lpmatrix` magic 445 | 446 | Now for some `lpmatrix` magic. We said before that the `lpmatrix` maps the parameters onto the predictions. Let's show that's true: 447 | 448 | ```{r lppred} 449 | # make the Lp matrix 450 | lp <- predict(dolphins_depth, preddata, type="lpmatrix") 451 | # get the linear predictor 452 | lin_pred <- lp %*% coef(dolphins_depth) 453 | # apply the link and multiply by the offset as lpmatrix ignores this 454 | pred <- preddata$area * exp(lin_pred) 455 | # all the same? 456 | all.equal(pred[,1], as.numeric(pred_qp), check.attributes=FALSE) 457 | ``` 458 | 459 | What else can we do? We can also grab the uncertainty for the sum of the predictions: 460 | 461 | ```{r lpNvar} 462 | # extract the variance-covariance matrix 463 | vc <- vcov(dolphins_depth) 464 | 465 | # reproject the var-covar matrix to be on the linear predictor scale 466 | lin_pred_var <- tcrossprod(lp %*% vc, lp) 467 | 468 | # pre and post multiply by the derivatives of the link, evaluated at the 469 | # predictions -- since the link is exp, we just use the predictions 470 | pred_var <- matrix(pred,nrow=1) %*% lin_pred_var %*% matrix(pred,ncol=1) 471 | 472 | # we can then calculate a total CV 473 | sqrt(pred_var)/sum(pred) 474 | ``` 475 | 476 | As you can see, the `lpmatrix` can be very useful! 
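One further illustration (a sketch, not part of the original analysis): because the `lpmatrix` maps coefficients to predictions, we can combine it with `mgcv::rmvn()` to simulate coefficient vectors from their approximate posterior and get an interval for the total abundance, reusing `lp` and `vc` from above:

```{r lp-posterior-sim}
# simulate coefficient vectors from their approximate posterior
set.seed(42)
betas <- rmvn(1000, coef(dolphins_depth), vc)
# total abundance implied by each simulated coefficient vector
total_N <- apply(betas, 1, function(b) sum(preddata$area * exp(lp %*% b)))
# an approximate 95% interval for the total abundance
quantile(total_N, c(0.025, 0.975))
```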
477 | 478 | # Extra credit 479 | 480 | - Experiment with `vis.gam` (watch out for the `view=` option) and plot the 2D smooths you've fitted. Check out the `too.far=` argument. 481 | - Use the randomized quantile residuals plot to inspect the models you fitted above. What are the differences you see between them and the deviance residuals you see above? Which model would you choose? 482 | - Redo the side-by-side soap film and another smoother plot, but showing the coefficient of variation. What difference does the soap film make? 483 | 484 | # References 485 | 486 | - Foster, S. D., & Bravington, M. V. (2012). A Poisson–Gamma model for analysis of ecological non-negative continuous data. Environmental and Ecological Statistics, 20(4), 533–552. http://doi.org/10.1007/s10651-012-0233-0 487 | - Miller, D. L., Burt, M. L., Rexstad, E. A., & Thomas, L. (2013). Spatial models for distance sampling data: recent developments and future directions. Methods in Ecology and Evolution, 4(11), 1001–1010. http://doi.org/10.1111/2041-210X.12105 488 | - Shono, H. (2008). Application of the Tweedie distribution to zero-catch data in CPUE analysis. Fisheries Research, 93(1-2), 154–162. http://doi.org/10.1016/j.fishres.2008.03.006 489 | - Wood, S. N. (2003). Thin plate regression splines. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 65(1), 95–114. 490 | - Wood, S. N., Bravington, M. V., & Hedley, S. L. (2008). Soap film smoothing. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 70(5), 931–955. http://doi.org/10.1111/j.1467-9868.2008.00665.x 491 | 492 | -------------------------------------------------------------------------------- /example-spatial-mexdolphins.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Further analysis of pantropical spotted dolphins in the Gulf of Mexico" 3 | output: 4 | html_document: 5 | toc: true 6 | toc_float: true 7 | theme: readable 8 | highlight: haddock 9 | 10 | --- 11 | 12 | # Preamble 13 | 14 | This exercise is based on the [Appendix of Miller et al 2013](http://distancesampling.org/R/vignettes/mexico-analysis.html). In this example we're ignoring all kinds of important things like detectability and availability. This should not be treated as a serious analysis of these data! For a more complete treatment of detection-corrected abundance estimation via distance sampling and generalized additive models, see Miller et al. (2013). 15 | 16 | From that appendix: 17 | 18 | *The analysis is based on a dataset of observations of pantropical dolphins in the Gulf of Mexico (shipped with Distance 6.0 and later). For convenience the data are bundled in an `R`-friendly format, although all of the code necessary for creating the data from the Distance project files is available [on github](http://github.com/dill/mexico-data). The OBIS-SEAMAP page for the data may be found at the [SEFSC GoMex Oceanic 1996](http://seamap.env.duke.edu/dataset/25) survey page.* 19 | 20 | 21 | ## Doing these exercises 22 | 23 | Probably the easiest way to do these exercises is to open this document in RStudio and go through the code blocks one by one (hitting the "play" button in the editor window), filling in the code where necessary and executing the commands one-by-one. You can then compile the document once you're done to check that everything works. 24 | 25 | # Data format 26 | 27 | The data are provided in the `data/mexdolphins` folder as the file `mexdolphins.RData`.
Loading this we can see what is provided: 28 | 29 | 30 | ```{r loaddata} 31 | load("data/mexdolphins/mexdolphins.RData") 32 | ls() 33 | ``` 34 | 35 | - `mexdolphins` the `data.frame` containing the observations and covariates, used to fit the model. 36 | - `pred_latlong` an `sp` object that has the shapefile for the prediction grid, used for fancy graphs 37 | - `preddata` prediction grid without any fancy spatial stuff 38 | 39 | Looking further into the `mexdolphins` frame we see: 40 | 41 | ```{r frameinspect} 42 | str(mexdolphins) 43 | ``` 44 | 45 | A brief explanation of each entry: 46 | 47 | - `Sample.Label` identifier for the effort "segment" (approximately square sampling area) 48 | - `Transect.Label` identifier for the transect that this segment belongs to 49 | - `longitude`, `latitude` location in lat/long of this segment 50 | - `x`, `y` location in projected coordinates (projected using the [North American Lambert Conformal Conic projection](https://en.wikipedia.org/wiki/Lambert_conformal_conic_projection)) 51 | - `Effort` the length of the current segment 52 | - `depth` the bathymetry at the segment's position 53 | - `count` number of dolphins observed in this segment 54 | - `segment.area` the area of the segment (`Effort` multiplied by the width of the segment) 55 | - `off.set` the logarithm of the `segment.area` multiplied by a correction for detectability (see link to appendix above for more information on this) 56 | 57 | 58 | # Modelling 59 | 60 | Our objective here is to build a spatially explicit model of abundance of the dolphins. In some sense this is a kind of species distribution model. 61 | 62 | Our possible covariates to model abundance are location and depth. These are fairly good predictors of the abundance (SPOILER ALERT), though we could probably improve the model further by including things like sea surface temperature and chlorophyll *a*. 63 | 64 | ## A simple model to start with 65 | 66 | We can begin with the model we showed in the first lecture: 67 | 68 | ```{r simplemodel} 69 | library(mgcv) 70 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set), 71 | data = mexdolphins, 72 | family = quasipoisson(), 73 | method = "REML") 74 | ``` 75 | 76 | That is, we fit the counts as a function of depth, using the offset to take into account effort. We use a quasi-Poisson response (i.e., modelling just the mean-variance relationship such that the variance is proportional to the mean) and use REML for smoothness selection. 77 | 78 | We can check the assumptions of this model by using `gam.check`: 79 | 80 | ```{r simplemodel-check} 81 | gam.check(dolphins_depth) 82 | ``` 83 | 84 | As is usual for count data, these plots are a bit tricky to interpret. For example the residuals vs linear predictor plot in the top right has that nasty line through it that makes looking for pattern tricky. We can see easily that the line equates to the zero count observations: 85 | 86 | ```{r zeroresids, fig.width=7, fig.height=7} 87 | # code from the insides of mgcv::gam.check 88 | resid <- residuals(dolphins_depth, type="deviance") 89 | linpred <- napredict(dolphins_depth$na.action, dolphins_depth$linear.predictors) 90 | plot(linpred, resid, main = "Resids vs.
linear pred.", 91 | xlab = "linear predictor", ylab = "residuals") 92 | 93 | # now add red dots corresponding to the zero counts 94 | points(linpred[mexdolphins$count==0],resid[mexdolphins$count==0], 95 | pch=19, col="red", cex=0.5) 96 | ``` 97 | 98 | We can use randomised quantile residuals instead of deviance residuals to get around this in some cases (though not quasi-Poisson, as we don't have a proper likelihood!). 99 | 100 | Ignoring the plots for now (as we'll address them in the next section), let's look at the text output. It seems that the `k` value we set (or rather the default of 10) has been adequate. 101 | 102 | We could increase the value of `k` by replacing the `s(...)` with, for example, `s(depth, k=25)` (for a possibly very wiggly function) or `s(depth, k=3)` (for a much less wiggly function). Making `k` big will create a bigger design matrix and penalty matrix. 103 | 104 | 105 | **Exercise** 106 | 107 | Look at the differences in the size of the design and penalty matrices by using `dim(model.matrix(...))` and `dim(model$smooth[[1]]$S[[1]])`, replacing `...` and `model` appropriately for models with `k=3` and `k=30`. 108 | 109 | ```{r simplemodel-bigsmall} 110 | 111 | ``` 112 | 113 | (Don't worry about the many square brackets etc to get the penalty matrix!) 114 | 115 | ### Plotting 116 | 117 | We can plot the smooth we fitted using `plot`. 118 | 119 | **Exercise** 120 | 121 | Compare the first model we fitted with the two using different `k` values above. Use `par(mfrow=c(1,3))` to put them all in one graphic. Look at `?plot.gam` and plot the confidence intervals as a filled "confidence band". Title the plots appropriately so you can check which is which. 122 | 123 | ```{r plotk, } 124 | par(mfrow=c(1,3)) 125 | 126 | ``` 127 | 128 | ## Count distributions 129 | 130 | In general quasi-Poisson doesn't seem to do too great a job at modelling data with many zeros. Luckily we have a few tricks up our sleeves... 131 | 132 | 133 | ### Tweedie 134 | 135 | Adding a smooth of `x` and `y` to our model with `s(x,y)`, we can then switch the `family=` argument to use `tw()` for a Tweedie distribution. 136 | 137 | ```{r tw} 138 | dolphins_xy_tw <- gam(count ~ s(x,y) + s(depth) + offset(off.set), 139 | data = mexdolphins, 140 | family = tw(), 141 | method = "REML") 142 | ``` 143 | 144 | More information on Tweedie distributions can be found in Foster & Bravington (2012) and Shono (2008). 145 | 146 | 147 | 148 | ### Negative binomial 149 | 150 | **Exercise** 151 | 152 | Now do the same using the negative binomial distribution (`nb()`). 153 | 154 | ```{r nb} 155 | 156 | ``` 157 | 158 | Looking at the quantile-quantile plots only in the `gam.check` output for these two models, which do you prefer? Why? 159 | 160 | *Looks like Tweedie is better here as the points are closer to the x=y line in the Q-Q plot. Also the histogram of residuals looks more (though not very) normal.* 161 | 162 | Look at the results of calling `summary` on both models and note that there are differences in the resulting models, due to the differing mean-variance relationships. 163 | 164 | ## Smoothers 165 | 166 | Now let's move on to using different bases for the smoothers. We have a couple of different options here. 167 | 168 | 169 | ### Thin plate splines with shrinkage 170 | 171 | By default we use the `"tp"` basis. This is just plain thin plate regression splines (as defined in Wood, 2003).
We can also use the `"ts"` basis, which is the same but with extra shrinkage on the usually unpenalised parts of the model. In the univariate case this is the linear slope term of the smooth. 172 | 173 | **Exercise** 174 | 175 | Compare the results from one of the models above with a version using the thin plate with shrinkage using the `bs="ts"` argument to `s()` for both terms. 176 | 177 | ```{r tw-ts} 178 | ``` 179 | 180 | What are the differences (use `summary`)? 181 | 182 | What are the visual differences (use `plot`)? 183 | 184 | 185 | ### Soap film smoother 186 | 187 | We can use a soap film smoother (Wood et al., 2008) to take into account a complex boundary, such as a coastline or islands. 188 | 189 | Here I've built a simple coastline of the US states bordering the Gulf of Mexico (see the `soap_pred.R` file for how this was constructed). We can load up this boundary and the prediction grid from the following `RData` file: 190 | 191 | ```{r soapy} 192 | load("data/mexdolphins/soapy.RData") 193 | ``` 194 | 195 | Now we need to build knots for the soap film; for this we simply create a grid, then find the grid points inside the boundary. We don't need too many of them. 196 | 197 | ```{r soapknots} 198 | soap_knots <- expand.grid(x=seq(min(xy_bnd$x), max(xy_bnd$x), length.out=10), 199 | y=seq(min(xy_bnd$y), max(xy_bnd$y), length.out=10)) 200 | x <- soap_knots$x; y <- soap_knots$y 201 | ind <- inSide(xy_bnd, x, y) 202 | rm(x,y) 203 | soap_knots <- soap_knots[ind, ] 204 | ## inSide doesn't work perfectly, so if you get an error like: 205 | ## Error in crunch.knots(ret$G, knots, x0, y0, dx, dy) : 206 | ## knot 54 is on or outside boundary 207 | ## just remove that knot as follows: 208 | soap_knots <- soap_knots[-8, ] 209 | soap_knots <- soap_knots[-54, ] 210 | ``` 211 | 212 | We can now fit our model. Note that we specify a basis via `bs=` and the boundary via `xt` (for e`xt`ra information) in the `s()` term. We also include the knots as a `knots=` argument to `gam`. 213 | 214 | ```{r soapmodel} 215 | dolphins_soap <- gam(count ~ s(x,y, bs="so", xt=list(bnd=list(xy_bnd))) + 216 | offset(off.set), 217 | data = mexdolphins, 218 | family = tw(), 219 | knots = soap_knots, 220 | method = "REML") 221 | 222 | ``` 223 | 224 | **Exercise** 225 | 226 | Look at the `summary` output for this model and compare it to the other models. 227 | 228 | ```{r soap-summary} 229 | ``` 230 | 231 | The plotting function for soap film smooths looks much nicer by default than for other 2D smooths -- try it out. 232 | 233 | 234 | ```{r soap-plot} 235 | ``` 236 | 237 | # Predictions 238 | 239 | As we saw in the intro to GAMs slides, `predict` is your friend when it comes to making predictions for the GAM. 240 | 241 | We can do this very simply, calling predict as one would with a `glm`. For example: 242 | 243 | ```{r pred-qp} 244 | pred_qp <- predict(dolphins_depth, preddata, type="response") 245 | ``` 246 | 247 | Now, this just gives a long vector of numbers for the predicted number of animals per cell. We can find the total abundance using `sum`. 248 | 249 | **Exercise** 250 | 251 | How many dolphins are there in the total area? What is the maximum in a given cell? What is the minimum? 252 | 253 | ## Plotting predictions 254 | 255 | *Note that this section requires quite a few additional packages to run the examples, so may not run the first time. You can use* `install.packages` *to grab the packages you need.* 256 | 257 | Plotting predictions in projected coordinate systems is tricky.
I'll show two methods here but not go into too much detail, as that's not the aim of this workshop. 258 | 259 | For the non-soap models, we'll use the below helper function to put the predictions into a bunch of squares and then return an appropriate `ggplot2` object for us to plot: 260 | 261 | ```{r gridplotfn} 262 | library(plyr) 263 | # fill must be in the same order as the polygon data 264 | grid_plot_obj <- function(fill, name, sp){ 265 | 266 | # what was the data supplied? 267 | names(fill) <- NULL 268 | row.names(fill) <- NULL 269 | data <- data.frame(fill) 270 | names(data) <- name 271 | 272 | spdf <- SpatialPolygonsDataFrame(sp, data) 273 | spdf@data$id <- rownames(spdf@data) 274 | spdf.points <- fortify(spdf, region="id") 275 | spdf.df <- join(spdf.points, spdf@data, by="id") 276 | 277 | # the x/y seem to be stored as "long" and "lat" even when 278 | # the data are projected 279 | spdf.df$x <- spdf.df$long 280 | spdf.df$y <- spdf.df$lat 281 | 282 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df) 283 | } 284 | ``` 285 | 286 | Then we can plot the predicted abundance: 287 | 288 | ```{r plotpred} 289 | library(ggplot2) 290 | library(viridis) 291 | library(sp) 292 | library(rgeos) 293 | library(rgdal) 294 | library(maptools) 295 | 296 | # projection string 297 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 298 | # need sp to transform the data 299 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4) 300 | 301 | p <- ggplot() + 302 | # abundance 303 | grid_plot_obj(pred_qp, "N", pred.polys) + 304 | # survey lines 305 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 306 | # observations 307 | geom_point(aes(x, y, size=count), 308 | data=mexdolphins[mexdolphins$count>0,], 309 | colour="red", alpha=I(0.7)) + 310 | # make the coordinate system fixed 311 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 312 | xlim = range(mexdolphins$x)) + 313 | # use the viridis colourscheme, which is more colourblind-friendly 314 | scale_fill_viridis() + 315 | # labels 316 | labs(fill="Predicted\ndensity",x="x",y="y",size="Count") + 317 | # keep things simple 318 | theme_minimal() 319 | print(p) 320 | ``` 321 | 322 | If you have the `maps` package installed you can also try the following to plot the coastline too: 323 | 324 | ```{r maptoo} 325 | library(maps) 326 | map_dat <- map_data("usa") 327 | map_sp <- SpatialPoints(map_dat[,c("long","lat")]) 328 | 329 | # give the sp object a projection 330 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84") 331 | # re-project 332 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4) 333 | map_dat$x <- map_sp.t$long 334 | map_dat$y <- map_sp.t$lat 335 | 336 | p <- p + geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 337 | 338 | print(p) 339 | ``` 340 | 341 | ## Comparing predictions from soap and thin plate splines 342 | 343 | The `soap_preddata` frame has a much larger prediction grid that covers a much wider area. 344 | 345 | Predicting using the soap film smoother is exactly the same as for the other smoothers: 346 | 347 | ```{r soap-pred} 348 | soap_pred_N <- predict(dolphins_soap, soap_preddata, type="response") 349 | soap_pred_N <- cbind.data.frame(soap_preddata, N=soap_pred_N, model="soap") 350 | ``` 351 | 352 | Use the above as a template (along with `ggplot2::facet_wrap()`) to build a comparison plot with the predictions of the soap film and another model.
353 | 354 | ```{r xy-soap-pred} 355 | dolphins_xy_q <- gam(count ~ s(x,y, bs="ts") + offset(off.set), 356 | data = mexdolphins, 357 | family = tw(), 358 | method = "REML") 359 | xy_pred_N <- predict(dolphins_xy_q, soap_preddata, type="response") 360 | xy_pred_N <- cbind.data.frame(soap_preddata, N=xy_pred_N, model="xy") 361 | all_pred_N <- rbind(soap_pred_N, xy_pred_N) 362 | 363 | p <- ggplot() + 364 | # abundance 365 | geom_tile(aes(x=x, y=y, width=sqrt(area), height=sqrt(area), fill=N), data=all_pred_N) + 366 | # facet it! 367 | facet_wrap(~model) + 368 | # make the coordinate system fixed 369 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), 370 | xlim = range(mexdolphins$x)) + 371 | # use the viridis colourscheme, which is more colourblind-friendly 372 | scale_fill_viridis() + 373 | # labels 374 | labs(fill="Predicted\ndensity", x="x", y="y") + 375 | # keep things simple 376 | theme_minimal() + 377 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) 378 | print(p) 379 | ``` 380 | 381 | ## Plotting uncertainty 382 | 383 | **Exercise** 384 | 385 | We can use the `se.fit=TRUE` argument to get the per-cell standard errors, then divide these through by the abundances to get a coefficient of variation (CV) per cell. We can then plot that using the same technique as above. Try this out below. 386 | 387 | Note that the resulting object from setting `se.fit=TRUE` is a list with two elements. 388 | 389 | ```{r CVmap} 390 | 391 | ``` 392 | 393 | Compare the uncertainties of the different models you've fitted so far. 394 | 395 | ## `lpmatrix` magic 396 | 397 | Now for some `lpmatrix` magic. We said before that the `lpmatrix` maps the parameters onto the predictions. Let's show that's true: 398 | 399 | ```{r lppred} 400 | # make the Lp matrix 401 | lp <- predict(dolphins_depth, preddata, type="lpmatrix") 402 | # get the linear predictor 403 | lin_pred <- lp %*% coef(dolphins_depth) 404 | # apply the link and multiply by the offset as lpmatrix ignores this 405 | pred <- preddata$area * exp(lin_pred) 406 | # all the same? 407 | all.equal(pred[,1], as.numeric(pred_qp), check.attributes=FALSE) 408 | ``` 409 | 410 | What else can we do? We can also grab the uncertainty for the sum of the predictions: 411 | 412 | ```{r lpNvar} 413 | # extract the variance-covariance matrix 414 | vc <- vcov(dolphins_depth) 415 | 416 | # reproject the var-covar matrix to be on the linear predictor scale 417 | lin_pred_var <- tcrossprod(lp %*% vc, lp) 418 | 419 | # pre and post multiply by the derivatives of the link, evaluated at the 420 | # predictions -- since the link is exp, we just use the predictions 421 | pred_var <- matrix(pred,nrow=1) %*% lin_pred_var %*% matrix(pred,ncol=1) 422 | 423 | # we can then calculate a total CV 424 | sqrt(pred_var)/sum(pred) 425 | ``` 426 | 427 | As you can see, the `lpmatrix` can be very useful! 428 | 429 | # Extra credit 430 | 431 | - Experiment with `vis.gam` (watch out for the `view=` option) and plot the 2D smooths you've fitted. Check out the `too.far=` argument. 432 | - Use the randomized quantile residuals plot to inspect the models you fitted above. What are the differences you see between them and the deviance residuals you see above? Which model would you choose? 433 | - Redo the side-by-side soap film and another smoother plot, but showing the coefficient of variation. What difference does the soap film make? 434 | 435 | # References 436 | 437 | - Foster, S. D., & Bravington, M. V. (2012).
# Extra credit

- Experiment with `vis.gam` (watch out for the `view=` option) and plot the 2D smooths you've fitted. Check out the `too.far=` argument.
- Use the randomized quantile residuals plot to inspect the models you fitted above. What differences do you see between them and the deviance residuals above? Which model would you choose?
- Redo the side-by-side plot of the soap film and another smoother, but showing the coefficient of variation. What difference does the soap film make?

# References

- Foster, S. D., & Bravington, M. V. (2012). A Poisson–Gamma model for analysis of ecological non-negative continuous data. Environmental and Ecological Statistics, 20(4), 533–552. http://doi.org/10.1007/s10651-012-0233-0
- Miller, D. L., Burt, M. L., Rexstad, E. A., & Thomas, L. (2013). Spatial models for distance sampling data: recent developments and future directions. Methods in Ecology and Evolution, 4(11), 1001–1010. http://doi.org/10.1111/2041-210X.12105
- Shono, H. (2008). Application of the Tweedie distribution to zero-catch data in CPUE analysis. Fisheries Research, 93(1-2), 154–162. http://doi.org/10.1016/j.fishres.2008.03.006
- Wood, S. N. (2003). Thin plate regression splines. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 65(1), 95–114.
- Wood, S. N., Bravington, M. V., & Hedley, S. L. (2008). Soap film smoothing. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 70(5), 931–955. http://doi.org/10.1111/j.1467-9868.2008.00665.x

--------------------------------------------------------------------------------
/example-spatio-temporal data.Rmd:
--------------------------------------------------------------------------------
---
title: "Spatio-temporal dynamics is for the birds"
output:
  html_document:
    toc: true
    toc_float: true
    theme: readable
    highlight: haddock
---

## 1. Background
We've talked about modelling spatial trends in the main workshop and in an extended example, and about the trials and tribulations of time series data in another extended example. Here, we're going to combine these, to look at one of the most challenging types of data for linear models to deal with: spatiotemporal data. A spatiotemporal dataset is one where the outcome of interest has been observed at multiple locations and at multiple points in time, and we want to know how temporal trends change across the landscape (or, from the other view, how spatial patterns are changing over time).

We don't have time to really get into the nuts-and-bolts issues around fitting complex spatio-temporal models. This exercise is instead designed to give you a feel for how to use `mgcv` to model complex interactions between variables with different natural scales, and how to include simple random effects in GAMs.

## 2. Key concepts and functions:
Here are a few key ideas and R functions you should familiarize yourself with if you haven't already encountered them. For the R functions (which will be highlighted `like this`), use `?function_name` to look them up.

### Tensor product (`te`) smooths
This tutorial will rely on tensor product smooths. These are used to model nonlinear interactions between variables that have different natural scales. The interaction smooths we saw at the start of the workshop were thin plate splines (`y~s(x1,x2, bs="tp")`). These splines assume that the relationship between y and x1 should be roughly as "wiggly" as the relationship between y and x2, and have only one smoothing parameter. This makes sense if x1 and x2 are similar variables (say latitude and longitude, or arm length and leg length), but if we're looking at the interacting effects of, say, temperature and body weight, a single smoothing parameter doesn't make sense.

This is where tensor product smooths come in.
These assume that each variable of interest should have its own smoothness. They create multi-dimensional smooth terms by multiplying the values of each basis function for one variable by the values of each basis function for the second. This is a pretty complicated idea, and hard to describe effectively without more math than we want to go into here. But you can get the idea from a simple example. Let's say we have two basis functions, one for variable x1 and one for variable x2. The function for x1 is linear, and the one for x2 is bell-curve shaped (this matches the code below). The basis functions will look like this:

```{r, echo=T,tidy=F,results="hide", include=T, message=FALSE,highlight=TRUE}
library(mgcv)
library(ggplot2)
library(dplyr)
library(viridis)
set.seed(1)
n = 25
dat = as.data.frame(expand.grid(x1 = seq(-1, 1, length.out = n),
                                x2 = seq(-1, 1, length = n)))
dat$x1_margin = -dat$x1
dat$x2_margin = dnorm(dat$x2, 0, 0.5)
dat$x2_margin = dat$x2_margin - mean(dat$x2_margin)

layout(matrix(1:2, ncol=2))
plot(x1_margin~x1, type="l", data = dat %>% filter(!duplicated(x1)))
plot(x2_margin~x2, type="l", data = dat %>% filter(!duplicated(x2)))
layout(1)
```

However, the tensor product of the two variables is more complicated. Note that when x1 is below zero, the effect of x2 is positive at middle values and negative at low values, and vice versa when x1 is above zero.

```{r, echo=T,tidy=F,results="hide", include=T, message=FALSE,highlight=TRUE}
dat$x1x2_tensor = dat$x1_margin*dat$x2_margin
ggplot(aes(x=x1, y=x2, fill=x1x2_tensor), data=dat) +
  geom_tile() +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  scale_fill_viridis() +
  theme_bw()
```

A full tensor product smooth will consist of several basis functions constructed like this. The resulting smooth is then fitted using the standard GAM machinery, but with multiple penalty terms (one for each marginal basis in the tensor product). Each penalty penalizes how much the function deviates from a smooth function in the direction of one of the variables, averaged over all the other variables. This allows the function to be more or less wiggly in one direction than another.

They are built using the `te()` function, like this:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# We'll recycle our fake marginal and tensor products as the expected value for
# a simulation. Waste not, want not!
dat$y = rnorm(n^2, dat$x1x2_tensor + dat$x1_margin + dat$x2_margin, 0.5)

# a single k is applied to each marginal basis; see ?te for the details
te_demo = gam(y~te(x1, x2, k = 4^2), data=dat, method="REML")
plot.gam(te_demo, scheme = 2)
summary(te_demo)
print(te_demo$sp)
# These are the estimated smoothing parameters for the two dimensions. Note
# that the smooth for x1 (the linear basis) is way smoother than the one for x2
```
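To eyeball the fitted surface against the true tensor product we plotted above, `vis.gam()` gives a quick contour view (a small sketch added here, not part of the original text):

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# contour plot of the fitted te() surface; compare with the x1x2_tensor
# tile plot above (plus the marginal effects)
vis.gam(te_demo, view = c("x1", "x2"), plot.type = "contour")
```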
### Tensor interaction smooths
There are many cases when we want an estimate of both the average effect of each variable as well as the interaction. This is tough to extract from just using `te()` terms, but if we try to combine the two (say, with `y~s(x1)+s(x2)+te(x1,x2)`) the result is often numerically unstable; it won't give reliable, consistent separation into the terms we want. Instead, we use `ti` terms for the interaction. The `ti` terms are set up so the average effect of each variable in the smooth is already excluded from the interaction (and they'll have fewer basis functions than the equivalent `te` term). Search `?te` if you're curious about how `mgcv` sets these up; for now, we'll just use them as a black box.

This is how you use them in practice:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# We'll recycle our fake tensor product as the expected value for a simulation.
# Waste not, want not!

ti_demo = gam(y~s(x1, k=4) + s(x2, k=4) + ti(x1, x2, k = 4^2),
              data=dat, method="REML")
layout(matrix(1:3, nrow=1))
plot.gam(ti_demo, scheme = 2)
layout(1)
summary(ti_demo)
print(ti_demo$sp)
# These are the estimated smoothing parameters. Note that the smooth for
# x1 (the linear basis) is way smoother than the one for x2
```

Note that it does a pretty good job of extracting our assumed marginal basis functions (linear and Gaussian) and the tensor product into three clearly separate functions.

NOTE: Never use `ti` on its own without including the marginal smooths! This is for the same reason you shouldn't exclude individual linear terms but include interactions in `glm` models. It virtually never makes sense to talk about a function with a strong interaction but no marginal effects.

### Random effects smooths
The last smooth type we'll use in this exercise is the random effects smooth. This does just what it says on the box: it's a smooth for factor terms, with one coefficient for each level of the group (these are the basis functions in this case), and a penalty term on the squared values of the coefficients to draw them towards the global mean. This is exactly the same mechanism that packages like `lme4` and `nlme` use to calculate random effects. In fact, there's a very deep connection between GAMs and mixed effects models, but we won't go into that here.

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# note: this overwrites the simulated data from the tensor product examples
n = 10
n_groups = 10
group_means = rnorm(n_groups, 0, 2)
dat = data_frame(group = rep(1:n_groups, each=n),
                 group_f = factor(group))
dat$y = rnorm(n*n_groups, 3 + group_means[dat$group], 2)
re_demo = gam(y~s(group_f, bs="re"),
              data=dat, method="REML")
plot.gam(re_demo, scheme = 2)
summary(re_demo)
print(re_demo$sp)
```
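To make that connection concrete, here's a hedged sketch (an addition, assuming you have `lme4` installed; it isn't run as part of this document) fitting the same random intercept model with `lmer()`; the estimated group effects should agree closely:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE, eval=FALSE}
library(lme4)
re_lmer = lmer(y ~ 1 + (1 | group_f), data = dat)
# the s(group_f) coefficients from gam() play the same role as
# lmer()'s predicted random intercepts
plot(coef(re_demo)[grep("group_f", names(coef(re_demo)))],
     ranef(re_lmer)$group_f[, 1],
     xlab = "gam() bs=\"re\" coefficients",
     ylab = "lmer() random intercepts")
abline(0, 1)
```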
It's important to know when using `bs="re"` terms whether the variable you're giving it is numeric or a factor. `mgcv` will not raise an error if you give numeric variables to `bs="re"`. Instead, it'll just fit a linear model with a penalty on the slope of the line between x and y. Try it with the data above to see, by switching `s(group_f, bs="re")` to `s(group, bs="re")`. `bs="re"` is set up like this to allow you to use random effects smooths to fit things like varying-slope random effects models, but make sure you know what you're doing when venturing into this.

## 3. Spatiotemporal models
Now that we've covered the key smoothers, let's take a look at how we'd use them to model spatiotemporal data. For this section, we'll use data from the Florida routes of the Breeding Bird Survey. If you haven't heard of it before, this is a long-term monitoring program tracking bird abundance and diversity across the US and Canada. It consists of many (over 4000) routes, each 24.5 miles long. Trained observers run these routes each year during peak breeding season, and every 0.5 miles they stop to count and identify all the birds they can see or hear within a 0.25 mile radius of the stop. This is one of the best long-term data sets we have on bird diversity and abundance, and it has been used in countless papers.

Here, we're going to use these data to answer a couple of relatively simple questions: how does bird species richness vary across the state of Florida, has average richness changed over time, and has the spatial pattern of richness varied?  We obtained data from [this site](https://www.pwrc.usgs.gov/bbs/RawData/), and summarized it into the total number of different species and the total number of individual birds observed on each route in each year in the state of Florida.

First, we'll load the data, and quickly plot it:
```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
library(maps)
florida_map = map_data("state", "florida") %>%
  transmute(Longitude = long, Latitude = lat, order = order)
florida_birds = read.csv("data/bbs_data/bbs_florida_richness.csv")
head(florida_birds)

ggplot(aes(x=Year, y=Richness), data=florida_birds) + geom_point() +
  theme_bw()
ggplot(aes(x=Longitude, y=Latitude, col=Richness), data=florida_birds) +
  geom_polygon(data=florida_map, fill=NA, col="black") +
  geom_point() + theme_bw()
```

It looks like richness may have declined toward the end of the time series, and there seems to be somewhat of a north-south richness gradient, but we need to model it to really tease these issues out.

### Modelling space and time separately
We'll first take a stab at this to determine whether these mean trends actually come out of the model. As richness is a count variable, we'll use a Poisson GAM (with its default log link) to model it. This means that we're assuming mean richness is the product of a smooth function of location and a smooth function of year, with no interaction: E(richness) = exp(f(year))*exp(g(location)).

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
richness_base_model = gam(Richness ~ s(Longitude, Latitude) + s(Year),
                          data=florida_birds, family=poisson, method="REML")
layout(matrix(1:2, ncol=2))
plot(richness_base_model, scheme=2)
layout(1)
summary(richness_base_model)
```

Note that we do have a thin-plate interaction between longitude and latitude. This assumes that the gradient should be equally smooth latitudinally and longitudinally. That may not be totally accurate, as the function seems to change faster latitudinally, but we'll ignore this until the exercises at the bottom. The model seems to be doing pretty well; it captured the original patterns we saw, and explains a not-inconsiderable 45% of the deviance.
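As an extra check (a rough sketch added here, not in the original text): since we've assumed a Poisson response, the Pearson statistic divided by the residual degrees of freedom should be close to 1; much larger values suggest overdispersion.

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# rough overdispersion check: values much greater than 1 suggest the
# Poisson assumption is too restrictive
pearson_stat = sum(residuals(richness_base_model, type = "pearson")^2)
resid_df = nrow(florida_birds) - sum(richness_base_model$edf)
pearson_stat / resid_df
```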
Let's see if there are any remaining spatiotemporal patterns, though:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
source("code_snippets/quantile_resid.R")

florida_birds$base_resid = rqresiduals(richness_base_model)

ggplot(aes(Longitude, Latitude),
       data = florida_birds %>% filter(Year %% 6 == 0)) + # only look at every 6th year
  geom_point(aes(color=base_resid)) +
  geom_polygon(data=florida_map, fill=NA, col="black") +
  scale_color_viridis() +
  facet_wrap(~Year) +
  theme_bw()
```

There's definitely still a pattern in the residuals, such as higher-than-expected diversity in the mid-latitudes during the 1990s.

### Joint models of space and time
The next step is to determine if there is a significant interaction. This is where tensor product splines come in. Here, we'll use the `ti` formulation, as it allows us to directly test whether the interaction significantly improves our model.

Something important to note: the `d=c(2,1)` argument to `ti`. This tells the function that the smooth should consist of a tensor product of a 2-dimensional smooth (latitude-longitude) and a 1-dimensional term (Year).

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
richness_ti_model = gam(Richness ~ s(Longitude, Latitude) + s(Year) +
                          ti(Longitude, Latitude, Year, d=c(2,1)),
                        data=florida_birds, family=poisson, method="REML")
layout(matrix(1:2, ncol=2))
plot(richness_ti_model, scheme=2)
layout(1)
summary(richness_ti_model)
```

This improves the model fit substantially, which you can see by directly comparing the models with `anova`:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
anova(richness_base_model, richness_ti_model, test = "Chisq")
```
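AIC points the same way (a quick extra comparison, not in the original text; see the model selection slides for caveats about AIC with smooths):

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
AIC(richness_base_model, richness_ti_model)
```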
The question then becomes: what does this actually mean? Where are we seeing the fastest and slowest rates of change in richness? It's often difficult to effectively show spatiotemporal changes, partly because it would require a 4-dimensional graph to really show what's going on. Still, we can look at a couple of different views by using `predict` effectively:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# First we'll create gridded data
predict_richess = expand.grid(
  Latitude = seq(min(florida_birds$Latitude),
                 max(florida_birds$Latitude),
                 length=50),
  Longitude = seq(min(florida_birds$Longitude),
                  max(florida_birds$Longitude),
                  length=50),
  Year = seq(1970, 2015, by=5)
)
# This now selects only the data that fall within Florida's border
predict_richess = predict_richess[with(predict_richess,
                                       inSide(florida_map, Latitude, Longitude)),]
predict_richess$model_fit = predict(richness_ti_model,
                                    predict_richess, type = "response")
ggplot(aes(Longitude, Latitude, fill = model_fit),
       data=predict_richess) +
  geom_tile() +
  facet_wrap(~Year, nrow=2) +
  scale_fill_viridis("# of species") +
  theme_bw(10)
```

Another way of looking at this is by estimating the rate of change at each location at each time point, and figuring out which locations are changing most quickly:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# rate of change: difference between the prediction one year ahead and
# the current prediction
predict_richess$model_change = predict(richness_ti_model,
                                       predict_richess %>% mutate(Year=Year+1),
                                       type = "response") -
  predict_richess$model_fit
ggplot(aes(Longitude, Latitude, fill = model_change),
       data=predict_richess) +
  geom_tile() +
  facet_wrap(~Year, nrow=2) +
  scale_fill_gradient2("Rate of change\n(species per year)") +
  theme_bw(10)
```

### Accounting for local heterogeneity with `bs="re"`
Examining this, it seems that richness fluctuated across the state throughout the study period, but then began sharply declining state-wide in the 2000s. (Disclaimer: I don't know enough about either the BBS or Florida to say how much of this decline is real! Please don't quote me in newspapers saying biodiversity is collapsing in Florida.)

There is still one persistent issue: viewing the maps of richness, there appear to be small hotspots and coldspots of richness across the state. This may be due to a factor I've ignored so far: the data are collected on specific, repeated routes. If it's just easier to spot birds on one route, or there's a lot of local nesting habitat there, it will have higher richness than expected over the whole study. Forcing the spatial smooth to absorb these kinds of hot and cold spots will make our model overly wiggly. Further, not all routes are sampled every year, so we want to make sure that our trends don't just result from less diverse routes getting sampled later in the survey.

To try to account for this effect (and other grouped local factors), we can use random effects smooths.
That would make our model look like this:

```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
florida_birds$Route_f = factor(florida_birds$Route)
richness_ti_re_model = gam(Richness ~ s(Longitude, Latitude) + s(Year) +
                             ti(Longitude, Latitude, Year, d=c(2,1)) +
                             s(Route_f, bs="re"),
                           data=florida_birds, family=poisson, method="REML")
layout(matrix(1:3, nrow=1))
plot(richness_ti_re_model, scheme=2)
layout(1)

summary(richness_ti_re_model)
anova(richness_ti_model, richness_ti_re_model)
```

We can view the same plots as before with our new model:
```{r, echo=T,tidy=F, include=T, message=FALSE,highlight=TRUE}
# predict() needs some value of Route_f; here every prediction includes the
# effect of a single route (see ?predict.gam's exclude argument for a way to
# drop the route term instead)
predict_richess$Route_f = 1
predict_richess$model_fit = predict(richness_ti_re_model,
                                    predict_richess, type = "response")
predict_richess$model_change = predict(richness_ti_re_model,
                                       predict_richess %>% mutate(Year=Year+1),
                                       type = "response") -
  predict_richess$model_fit

ggplot(aes(Longitude, Latitude, fill = model_fit),
       data=predict_richess) +
  geom_tile() +
  facet_wrap(~Year, nrow=2) +
  scale_fill_viridis("# of species") +
  theme_bw(10)

ggplot(aes(Longitude, Latitude, fill = model_change),
       data=predict_richess) +
  geom_tile() +
  facet_wrap(~Year, nrow=2) +
  scale_fill_gradient2("Rate of change\n(species per year)") +
  theme_bw(10)
```

This definitely seems to have levelled off some of the hot spots from before, implying that bird richness is more evenly distributed than our prior model suggested. Still, the large-scale trends remain the same as before.

## 4. Exercises

1. Given that most of the variability appears to be latitudinal, try to adapt the tensor smoothers so that latitude, longitude, and year all have their own penalty terms. How does this change model fit?

2. Following up on the prior exercise: build a model that allows you to explicitly calculate the marginal latitudinal gradient in diversity, and to determine how the latitudinal gradient has changed over time.

3. We know from basic sampling theory that if you sample fewer individuals over time, you also expect to find lower richness, even if actual richness stays unchanged. What does the spatiotemporal pattern of abundance look like? Does adding bird abundance as a predictor explain any of the patterns in richness?
--------------------------------------------------------------------------------
/pre_course_questionnaire.txt:
--------------------------------------------------------------------------------
What to ask?
============


Experience with R
- how long have you used it?
- how often do you use it?
Experience with mgcv
- how long have you used it?
- how often do you use it?

Mathematical background
- I hate math -> I actively enjoy matrix algebra

What do you do?

What do you want to do?


--------------------------------------------------------------------------------
/slides/00-preamble.Rpres:
--------------------------------------------------------------------------------
The mgcv package as a one-stop-shop for fitting non-linear ecological models
============================
author: David L Miller
css: custom.css
transition: none


Good morning!
==============
type: section

Who is this guy?
================

- (mostly) Cetacean distribution modelling
- Spatial modelling (esp. model checking)
- Distance sampling [distancesampling.org](http://distancesampling.org)
- Statistical software (`Distance`, `mrds`, `dsm`)
- Not a biologist/ecologist (not even really a statistician)


Who are you?
============
type:section


What is the structure of the day?
=================================
type:section

Timing
======

- 9am to 5pm
- Morning session: 9am-12pm
- LUNCH?!
- Afternoon session: 1pm-5pm

Today: me talking a lot

Tomorrow: less of that?

Content
=======

- Broad overview of what one can do in `mgcv`
- Deeper on simple stuff
- Focus on 2 data sets
  - species distribution modelling (dolphins)
  - "time series ish" data (zooplankton)

Overview
========

1. Preamble
1. Generalized additive models
1. Fitting GAMs in practice
1. Model checking
1. Model selection
1. Inference
1. "Fancy" stuff

Please interrupt!
=================

- I have a weird accent
- I sometimes mumble
- I will probably say something that is unclear

Credits
=======

- Course based on one given at ESA 2016
  - Eric Pedersen, UWisc, now DFO
  - Gavin Simpson, URegina, SK
- Previous course on density models
  - Jason Roberts, Duke
- Using many of the materials from these courses
- All these materials are **open**, reuse as you like!

![Creative Commons BY](images/by.png)

Course website
==============
[converged.yt/mgcv-workshop](http://converged.yt/mgcv-workshop)
89 | 90 | 91 | -------------------------------------------------------------------------------- /slides/01-intro.Rpres: -------------------------------------------------------------------------------- 1 | Generalized Additive Models 2 | ============================ 3 | author: David L Miller 4 | css: custom.css 5 | transition: none 6 | 7 | 8 | Overview 9 | ========= 10 | 11 | - What is a GAM? 12 | - What is smoothing? 13 | - How do GAMs work? (*Roughly*) 14 | 15 | ```{r setup, include=FALSE} 16 | library(knitr) 17 | library(viridis) 18 | library(ggplot2) 19 | library(reshape2) 20 | library(animation) 21 | library(mgcv) 22 | opts_chunk$set(cache=TRUE, echo=FALSE) 23 | theme_set(theme_minimal()+theme(text=element_text(size=20))) 24 | ``` 25 | 26 | 27 | From GAMs to GLMs and LMs 28 | ============================= 29 | type:section 30 | 31 | 32 | (Generalized) Linear Models 33 | ============================= 34 | 35 | Models that look like: 36 | 37 | $$ 38 | y_i = \beta_0 + x_{1i}\beta_1 + x_{2i}\beta_2 + \ldots + \epsilon_i 39 | $$ 40 | 41 | (describe the response, $y_i$, as linear combination of the covariates, $x_{ji}$, with an offset) 42 | 43 | We can make $y_i\sim$ any exponential family distribution (Normal, Poisson, etc). 44 | 45 | Error term $\epsilon_i$ is normally distributed (usually). 46 | 47 | Why bother with anything more complicated?! 48 | ============================= 49 | type:section 50 | 51 | Is this relationship linear? 52 | ============================= 53 | 54 | ```{r islinear, fig.width=12, fig.height=7} 55 | set.seed(2) ## simulate some data... 56 | dat <- gamSim(1, n=400, dist="normal", scale=1, verbose=FALSE) 57 | dat <- dat[,c("y", "x0", "x1", "x2", "x3")] 58 | p <- ggplot(dat,aes(y=y,x=x1)) + 59 | geom_point() 60 | print(p) 61 | ``` 62 | 63 | A linear model... 64 | ============================= 65 | type:section 66 | 67 | ```{r eval=FALSE, echo=TRUE} 68 | lm(y ~ x1, data=dat) 69 | ``` 70 | 71 | Is this relationship linear? Maybe? 72 | =================================== 73 | 74 | ```{r maybe, fig.width=12, fig.height=7} 75 | print(p + geom_smooth(method="lm")) 76 | ``` 77 | 78 | 79 | What can we do? 80 | ============================= 81 | type:section 82 | 83 | ```{r eval=FALSE, echo=TRUE} 84 | lm(y ~ x1 + poly(x1, 2), data=dat) 85 | ``` 86 | Adding a quadratic term? 87 | ============================= 88 | 89 | 90 | ```{r quadratic, fig.width=12, fig.height=7} 91 | p <- ggplot(dat, aes(y=y, x=x1)) + geom_point() + 92 | theme_minimal() 93 | print(p + geom_smooth(method="lm", formula=y~x+poly(x, 2))) 94 | ``` 95 | 96 | 97 | 98 | 99 | Is this sustainable? 100 | ============================= 101 | 102 | - Adding in quadratic (and higher terms) *can* make sense 103 | - This feels a bit *ad hoc* 104 | - Better if we had a **framework** to deal with these issues? 105 | 106 | ```{r ruhroh, fig.width=12, fig.height=6} 107 | p <- ggplot(dat, aes(y=y, x=x2)) + geom_point() + 108 | theme_minimal() 109 | print(p + geom_smooth(method="lm", formula=y~x+poly(x, 2))) 110 | ``` 111 | 112 | 113 | [drumroll] 114 | ============================= 115 | type:section 116 | 117 | Generalized Additive Models 118 | ============================ 119 | type:section 120 | 121 | "gam" 122 | ==================== 123 | 124 | 1. *Collective noun used to refer to a group of whales, or rarely also of porpoises; a pod.* 125 | 2. *(by extension) A social gathering of whalers (whaling ships).* 126 | 127 |
128 | 129 | (via Nat Kelly, Australian Antarctic Division) 130 | 131 | 132 | Generalized Additive Models 133 | ============================ 134 | 135 | - Generalized: many response distributions 136 | - Additive: terms **add** together 137 | - Models: well, it's a model... 138 | 139 | What does a model look like? 140 | ============================= 141 | 142 | $$ 143 | y_i = \beta_0 + \sum_j s_j(x_{ji}) + \epsilon_i 144 | $$ 145 | 146 | where $\epsilon_i \sim N(0, \sigma^2)$, $y_i \sim \text{Normal}$ (for now) 147 | 148 | Remember that we're modelling the **mean** of this distribution! 149 | 150 | Call the above equation the **linear predictor** 151 | 152 | Okay, but what about these "s" things? 153 | ==================================== 154 | right:55% 155 | 156 | ```{r smoothdat, fig.width=8, fig.height=8} 157 | 158 | spdat <- melt(dat, id.vars = c("y")) 159 | p <- ggplot(spdat,aes(y=y,x=value)) + 160 | geom_point() + 161 | facet_wrap(~variable, nrow=2) 162 | print(p) 163 | ``` 164 | *** 165 | - Think $s$=**smooth** 166 | - Want to model the covariates flexibly 167 | - Covariates and response not necessarily linearly related! 168 | - Want some "wiggles" 169 | 170 | Okay, but what about these "s" things? 171 | ==================================== 172 | right:55% 173 | 174 | ```{r wsmooths, fig.width=8, fig.height=8} 175 | print(p + geom_smooth()) 176 | ``` 177 | *** 178 | - Think $s$=**smooth** 179 | - Want to model the covariates flexibly 180 | - Covariates and response not necessarily linearly related! 181 | - Want some "wiggles" 182 | 183 | What is smoothing? 184 | =============== 185 | type:section 186 | 187 | 188 | Straight lines vs. interpolation 189 | ================================= 190 | right:55% 191 | 192 | ```{r wiggles, fig.height=8, fig.width=8} 193 | library(mgcv) 194 | # hacked from the example in ?gam 195 | set.seed(2) ## simulate some data... 196 | dat <- gamSim(1,n=50,dist="normal",scale=0.5, verbose=FALSE) 197 | dat$y <- dat$f2 + rnorm(length(dat$f2), sd = sqrt(0.5)) 198 | f2 <- function(x) 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10-mean(dat$y) 199 | ylim <- c(-4,6) 200 | 201 | # fit some models 202 | b.justright <- gam(y~s(x2),data=dat) 203 | b.sp0 <- gam(y~s(x2, sp=0, k=50),data=dat) 204 | b.spinf <- gam(y~s(x2),data=dat, sp=1e10) 205 | 206 | curve(f2,0,1, col="blue", ylim=ylim) 207 | points(dat$x2, dat$y-mean(dat$y)) 208 | 209 | ``` 210 | *** 211 | - Want a line that is "close" to all the data 212 | - Don't want interpolation -- we know there is "error" 213 | - Balance between interpolation and "fit" 214 | 215 | Splines 216 | ======== 217 | 218 | - Functions made of other, simpler functions 219 | - **Basis functions** $b_k(x)$, estimate $\beta_k$ 220 | - $s(x) = \sum_{k=1}^K \beta_k b_k(x)$ 221 | 222 |
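(An aside added here, not in the original deck: you can peek at the basis functions themselves with `mgcv::smoothCon()`; a minimal sketch.)

```{r basis-peek, echo=TRUE, fig.width=10, fig.height=5}
# evaluate a 5-function thin plate basis on a grid and plot the functions
xx <- data.frame(x = seq(0, 1, length.out = 200))
sm <- smoothCon(s(x, k = 5), data = xx, knots = NULL)[[1]]
matplot(xx$x, sm$X, type = "l", xlab = "x", ylab = "basis function value")
```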
223 | 224 | 225 | 226 | Design matrices 227 | =============== 228 | 229 | - We often write models as $X\boldsymbol{\beta}$ 230 | - $X$ is our data 231 | - $\boldsymbol{\beta}$ are parameters we need to estimate 232 | - For a GAM it's the same 233 | - $X$ has columns for each basis, evaluated at each observation (row) 234 | - again, this is the linear predictor 235 | 236 | Measuring wigglyness 237 | ====================== 238 | 239 | - Visually: 240 | - Lots of wiggles == NOT SMOOTH 241 | - Straight line == VERY SMOOTH 242 | - How do we do this mathematically? 243 | - Derivatives! 244 | - (Calculus *was* a useful class afterall!) 245 | 246 | 247 | 248 | Wigglyness by derivatives 249 | ========================== 250 | 251 | ```{r wigglyanim, results="hide"} 252 | library(numDeriv) 253 | f2 <- function(x) 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10 - mean(dat$y) 254 | 255 | xvals <- seq(0,1,len=100) 256 | 257 | plot_wiggly <- function(f2, xvals){ 258 | 259 | # pre-calculate 260 | f2v <- f2(xvals) 261 | f2vg <- grad(f2,xvals) 262 | f2vg2 <- unlist(lapply(xvals, hessian, func=f2)) 263 | f2vg2min <- min(f2vg2) -2 264 | 265 | # now plot 266 | for(i in 1:length(xvals)){ 267 | par(mfrow=c(1,3)) 268 | plot(xvals, f2v, type="l", main="function", ylab="f") 269 | points(xvals[i], f2v[i], pch=19, col="red") 270 | 271 | plot(xvals, f2vg, type="l", main="derivative", ylab="df/dx") 272 | points(xvals[i], f2vg[i], pch=19, col="red") 273 | 274 | plot(xvals, f2vg2, type="l", main="2nd derivative", ylab="d2f/dx2") 275 | points(xvals[i], f2vg2[i], pch=19, col="red") 276 | polygon(x=c(0,xvals[1:i], xvals[i],f2vg2min), 277 | y=c(f2vg2min,f2vg2[1:i],f2vg2min,f2vg2min), col = "grey") 278 | 279 | ani.pause() 280 | } 281 | } 282 | 283 | saveGIF(plot_wiggly(f2, xvals), "wiggly.gif", interval = 0.2, ani.width = 800, ani.height = 400) 284 | ``` 285 | 286 | ![Animation of derivatives](wiggly.gif) 287 | 288 | What was that grey bit? 289 | ========================= 290 | 291 | $$ 292 | \int_\mathbb{R} \left( \frac{\partial^2 f(x)}{\partial x^2}\right)^2 \text{d}x\\ 293 | $$ 294 | 295 | - *Turns out* we can always write this as $\boldsymbol{\beta}^\text{T}S\boldsymbol{\beta}$, so the $\boldsymbol{\beta}$ is separate from the derivatives 296 | - Call $S$ the **penalty matrix** 297 | - Different penalties lead to difference $f$ s $\Rightarrow$ different $b_k(x)$ s 298 | 299 | Making wigglyness matter 300 | ========================= 301 | 302 | - $\boldsymbol{\beta}^\text{T}S\boldsymbol{\beta}$ measures wigglyness 303 | - "Likelihood" measures closeness to the data 304 | - Penalise closeness to the data... 305 | - Use a **smoothing parameter** to decide on that trade-off... 306 | - $\lambda \boldsymbol{\beta}^\text{T}S\boldsymbol{\beta}$ 307 | - Estimate the $\beta_k$ terms but penalise objective 308 | - "closeness to data" + penalty 309 | 310 | Smoothing parameter 311 | ======================= 312 | 313 | 314 | ```{r wiggles-plot, fig.width=15} 315 | # make three plots, w. 
estimated smooth, truth and data on each 316 | par(mfrow=c(1,3), cex.main=3.5) 317 | 318 | plot(b.justright, se=FALSE, ylim=ylim, main=expression(lambda*plain("= just right"))) 319 | points(dat$x2, dat$y-mean(dat$y)) 320 | curve(f2,0,1, col="blue", add=TRUE) 321 | 322 | plot(b.sp0, se=FALSE, ylim=ylim, main=expression(lambda*plain("=")*0)) 323 | points(dat$x2, dat$y-mean(dat$y)) 324 | curve(f2,0,1, col="blue", add=TRUE) 325 | 326 | plot(b.spinf, se=FALSE, ylim=ylim, main=expression(lambda*plain("=")*infinity)) 327 | points(dat$x2, dat$y-mean(dat$y)) 328 | curve(f2,0,1, col="blue", add=TRUE) 329 | 330 | ``` 331 | 332 | Smoothing parameter selection 333 | ============================== 334 | 335 | - Many methods: AIC, Mallow's $C_p$, GCV, ML, REML 336 | - Recommendation, based on simulation and practice: 337 | - Use REML or ML 338 | - Reiss \& Ogden (2009), Wood (2011) 339 | 340 | 341 | 342 | 343 | Maximum wiggliness 344 | ======================== 345 | 346 | - We can set **basis complexity** or "size" ($k$) 347 | - Maximum wigglyness 348 | - Smooths have **effective degrees of freedom** (EDF) 349 | - EDF < $k$ 350 | - Set $k$ "large enough" 351 | - Penalty does the rest 352 | 353 | 354 | More on this in a bit... 355 | 356 | Response distributions 357 | ====================== 358 | 359 | - Exponential family distributions are available 360 | - Normal, Poisson, binomial, gamma, quasi etc (`?family`) 361 | - Tweedie and negative binomial 362 | - Plus more! (More on that in a bit) 363 | 364 | 365 | uhoh 366 | ====== 367 | title: none 368 | type:section 369 | 370 |

![spock sobbing mathematically](images/mathematical_sobbing.jpg)

371 | 372 | GAM summary 373 | =========== 374 | 375 | - Straight lines suck --- we want **wiggles** 376 | - Use little functions (**basis functions**) to make big functions (**smooths**) 377 | - Need to make sure your smooths are **wiggly enough** 378 | - Use a **penalty** to trade off wiggliness/generality 379 | 380 | 381 | -------------------------------------------------------------------------------- /slides/02-intro-mgcv.Rpres: -------------------------------------------------------------------------------- 1 | Fitting GAMs in practice 2 | ======================== 3 | author: David L Miller 4 | css: custom.css 5 | transition: none 6 | 7 | 8 | Overview 9 | ========= 10 | 11 | - Introduction to some data 12 | - Fitting simple models 13 | - Plotting simple models 14 | 15 | ```{r setup, include=FALSE} 16 | library(knitr) 17 | library(viridis) 18 | library(ggplot2) 19 | library(reshape2) 20 | library(animation) 21 | library(mgcv) 22 | opts_chunk$set(cache=TRUE, echo=FALSE) 23 | theme_set(theme_minimal()+theme(text=element_text(size=20))) 24 | ``` 25 | 26 | 27 | Data 28 | ==== 29 | type:section 30 | 31 | 32 | Extinction experiment 33 | ===================== 34 | 35 | - 60 populations of *Daphnia magna* in lab 36 | - 1/2 in constant conditions, 1/2 in "deteriorating" 37 | - Data from Drake & Griffen (Nature [doi:10.1038/nature09389](http://doi.org/10.1038/nature09389)) 38 | 39 | ![D. magna doi:10.1371/journal.pbio.0030253](images/Daphnia_magna.png) 40 | 41 | Photo credit [doi:10.1371/journal.pbio.0030253](http://doi.org/10.1371/journal.pbio.0030253) 42 | 43 | 44 | Extinction experiment 45 | ===================== 46 | 47 | ```{r data-extinct, fig.width=12, fig.height=8} 48 | load("../data/drake_griffen/drake_griffen.RData") 49 | populations$Treatment <- "Control" 50 | populations$Treatment[populations$deteriorating] <- "Deteriorating" 51 | p <- ggplot(populations) + 52 | geom_point(aes(x=day, y=Nhat, colour=Treatment, group=ID)) + 53 | geom_vline(xintercept=154) + 54 | geom_text(aes(x=x,y=y, label=label), size=9, 55 | data=data.frame(x=230, y=300, label="Start deterioration")) + 56 | labs(y="Count") 57 | print(p) 58 | ``` 59 | 60 | Inferential goals 61 | ================= 62 | 63 | - Make sense of all these dots! 64 | - What are the "average" trends? 65 | - (What are non-average trends?) 66 | - When do the deteriorating populations go extinct on average? 67 | 68 | 69 | Pantropical spotted dolphins 70 | ============================== 71 | 72 | - Example taken from Miller et al (2013) 73 | - [Paper appendix](http://distancesampling.org/R/vignettes/mexico-analysis.html) has a better analysis 74 | - Simple example here, ignoring all kinds of important stuff! 75 | 76 | ![a pantropical spotted dolphin doing its thing](images/spotteddolphin_swfsc.jpg) 77 | 78 | 79 | Inferential aims 80 | ================= 81 | 82 | ```{r loaddat} 83 | load("../data/mexdolphins/mexdolphins.RData") 84 | ``` 85 | 86 | ```{r gridplotfn} 87 | library(rgdal) 88 | library(rgeos) 89 | library(maptools) 90 | library(plyr) 91 | # fill must be in the same order as the polygon data 92 | grid_plot_obj <- function(fill, name, sp){ 93 | 94 | # what was the data supplied? 
95 | names(fill) <- NULL 96 | row.names(fill) <- NULL 97 | data <- data.frame(fill) 98 | names(data) <- name 99 | 100 | spdf <- SpatialPolygonsDataFrame(sp, data) 101 | spdf@data$id <- rownames(spdf@data) 102 | spdf.points <- fortify(spdf, region="id") 103 | spdf.df <- join(spdf.points, spdf@data, by="id") 104 | 105 | # seems to store the x/y even when projected as labelled as 106 | # "long" and "lat" 107 | spdf.df$x <- spdf.df$long 108 | spdf.df$y <- spdf.df$lat 109 | 110 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df) 111 | } 112 | ``` 113 | 114 | - How many dolphins are there? 115 | - Where are the dolphins? 116 | - What are they interested in? 117 | 118 | ```{r spatialEDA, fig.cap="", fig.width=15} 119 | # some nearby states, transformed 120 | library(mapdata) 121 | map_dat <- map_data("worldHires",c("usa","mexico")) 122 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 123 | map_sp <- SpatialPoints(map_dat[,c("long","lat")]) 124 | 125 | # give the sp object a projection 126 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84") 127 | # re-project 128 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4) 129 | map_dat$x <- map_sp.t$long 130 | map_dat$y <- map_sp.t$lat 131 | 132 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4) 133 | p <- ggplot() + 134 | grid_plot_obj(preddata$depth, "Depth", pred.polys) + 135 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 136 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) + 137 | geom_point(aes(x, y, size=count), 138 | data=mexdolphins[mexdolphins$count>0,], 139 | colour="red", alpha=I(0.7)) + 140 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), xlim = range(mexdolphins$x)) + 141 | scale_fill_viridis(direction=-1) + 142 | labs(fill="Depth",x="x",y="y",size="Count") + 143 | theme_minimal() 144 | #p <- p + gg.opts 145 | print(p) 146 | ``` 147 | 148 | 149 | Translating maths into R 150 | ========================== 151 | 152 | A simple example: 153 | 154 | $$ 155 | y_i = \beta_0 + s(x) + s(w) + \epsilon_i 156 | $$ 157 | 158 | where $\epsilon_i \sim N(0, \sigma^2)$ 159 | 160 | Let's pretend that $y_i \sim \text{Normal}$ 161 | 162 | - linear predictor: `formula = y ~ s(x) + s(w)` 163 | - response distribution: `family=gaussian()` 164 | - data: `data=some_data_frame` 165 | 166 | Putting that together 167 | ====================== 168 | 169 | ```{r echo=TRUE, eval=FALSE} 170 | my_model <- gam(y ~ s(x) + s(w), 171 | family = gaussian(), 172 | data = some_data_frame, 173 | method = "REML") 174 | ``` 175 | 176 | - `method="REML"` uses REML for smoothness selection (default is `"GCV.Cp"`) 177 | 178 | 179 | What about a practical example? 180 | ================================ 181 | type:section 182 | 183 | Simple extinction example 184 | ========================= 185 | 186 | ```{r ext-ex, echo=TRUE} 187 | dmagna_happy <- gam(Nhat~s(day, k=50), data=pop_happy, method="REML") 188 | dmagna_unhappy <- gam(Nhat~s(day, k=50), data=pop_unhappy, method="REML") 189 | ``` 190 | 191 | - Fit 2 models, one to control group, one to experimental group 192 | - `Nhat` is the count of *D. magna* (estimated from 3 censuses) 193 | - Model as a function of `day` 194 | 195 | What did that do? 196 | ================= 197 | 198 | ```{r ext-ex-summary, echo=TRUE} 199 | summary(dmagna_unhappy) 200 | ``` 201 | 202 | Cool, but what about a plot? 
203 | ============================ 204 | 205 | ```{r ext-plot, fig.show="hold", fig.width=12, fig.height=7} 206 | par(mfrow=c(1,2)) 207 | plot(dmagna_happy, main="happy zooplankton", shift=mean(pop_happy$Nhat), scale=0, shade=TRUE) 208 | points(pop_happy[,c("day", "Nhat")], pch=19, cex=0.3, alpha=0.3, col="darkgrey") 209 | abline(h=0) 210 | plot(dmagna_unhappy, main="unhappy zooplankton", shift=mean(pop_unhappy$Nhat), scale=0, shade=TRUE) 211 | abline(h=0) 212 | points(pop_unhappy[,c("day", "Nhat")], pch=19, cex=0.3, col="darkgrey") 213 | ``` 214 | 215 | How did we do that? 216 | =================== 217 | 218 | ```{r ext-plot-t, echo=TRUE, eval=FALSE} 219 | plot(dmagna_unhappy, main="unhappy zooplankton", shift=mean(pop_unhappy$Nhat), scale=0, shade=TRUE) 220 | abline(h=0) 221 | ``` 222 | 223 | - `scale=0` puts each term on it's own $y$ axis 224 | - Terms are centred, need `shift=` to re-centre (don't in models w/ > 1 term!) 225 | - `shade=TRUE` makes +/-2 se into shaded area 226 | - `main=` sets title, as usual 227 | 228 | A simple dolphin model 229 | ====================== 230 | 231 | ```{r firstdsm, echo=TRUE} 232 | library(mgcv) 233 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set), 234 | data = mexdolphins, 235 | family = quasipoisson(), 236 | method = "REML") 237 | ``` 238 | 239 | - count is a function of depth 240 | - `off.set` is the effort expended 241 | - we have count data, try quasi-Poisson distribution 242 | 243 | What did that do? 244 | =================== 245 | 246 | ```{r echo=TRUE} 247 | summary(dolphins_depth) 248 | ``` 249 | 250 | Plotting 251 | ================ 252 | 253 | ```{r plotsmooth} 254 | plot(dolphins_depth) 255 | ``` 256 | *** 257 | - `plot(dolphins_depth)` 258 | - Dashed lines indicate +/- 2 standard errors 259 | - Rug plot 260 | - On the link scale 261 | - EDF on $y$ axis 262 | 263 | Plotting (shaded se) 264 | ==================== 265 | 266 | ```{r plotsmooth-shade} 267 | plot(dolphins_depth, shade=TRUE) 268 | ``` 269 | *** 270 | - `plot(dolphins_depth, shade=TRUE)` 271 | - `?plot` has a **lot** of options 272 | - `plotdat <- plot(model)` gives you the data that generates the plot 273 | 274 | Thin plate regression splines 275 | ================================ 276 | 277 | - Default basis 278 | - One basis function per data point 279 | - Reduce # basis functions (eigendecomposition) 280 | - Fitting on reduced problem 281 | - Multidimensional 282 | - Wood (2003) 283 | 284 | 285 | Bivariate terms 286 | ================ 287 | 288 | - Assumed an additive structure 289 | - No interaction 290 | - We can specify `s(x,y)` (and `s(x,y,z,...)`) 291 | - (Assuming *isotropy* here...) 
292 | 293 | ```{r xydsmplot, fig.width=15, fig.height=7} 294 | dolphins_depth_xy <- gam(count ~ s(x, y) + offset(off.set), 295 | data = mexdolphins, 296 | family=quasipoisson(), method="REML") 297 | par(mfrow=c(1,3)) 298 | vis.gam(dolphins_depth_xy, view=c("x","y"), phi=45, theta=20, asp=1) 299 | vis.gam(dolphins_depth_xy, view=c("x","y"), phi=45, theta=60, asp=1) 300 | vis.gam(dolphins_depth_xy, view=c("x","y"), phi=45, theta=160, asp=1) 301 | ``` 302 | 303 | Adding a term 304 | =============== 305 | 306 | - Add a **surface** for location ($x$ and $y$) 307 | - Just use `+` for an extra term 308 | 309 | ```{r xydsm, echo=TRUE} 310 | dolphins_depth_xy <- gam(count ~ s(depth) + s(x, y) + offset(off.set), 311 | data = mexdolphins, 312 | family=quasipoisson(), method="REML") 313 | ``` 314 | 315 | 316 | Summary 317 | =================== 318 | 319 | ```{r echo=TRUE} 320 | summary(dolphins_depth_xy) 321 | ``` 322 | 323 | 324 | Plot x,y term 325 | ============= 326 | 327 | ```{r plot-scheme2, echo=TRUE, fig.width=10, eval=FALSE} 328 | plot(dolphins_depth_xy, select=2, cex=3, asp=1, lwd=2, scheme=2) 329 | ``` 330 | - `scale=0`: each plot on different scale 331 | - `select=` picks which smooth to plot 332 | - `scheme=2` much better for bivariate terms 333 | - `vis.gam()` also useful 334 | 335 | Plot x,y term 336 | ============== 337 | ```{r plot-scheme2p, echo=TRUE, fig.width=12} 338 | plot(dolphins_depth_xy, select=2, cex=3, asp=1, lwd=2, scheme=2) 339 | ``` 340 | 341 | Fitting/plotting GAMs summary 342 | ============================= 343 | 344 | - `gam` does all the work 345 | - very similar to `glm` 346 | - `s` indicates a smooth term 347 | - `summary`, `plot` methods 348 | 349 | -------------------------------------------------------------------------------- /slides/03-model_checking.Rpres: -------------------------------------------------------------------------------- 1 | Model checking 2 | ============================ 3 | author: David L Miller 4 | css: custom.css 5 | transition: none 6 | 7 | 8 | Outline 9 | ================= 10 | 11 | You fitted a GAM, everything is fine, right? *Right?* 12 | 13 | But what about? 14 | 15 | - Smooth terms flexibility? 16 | - Non-constant variance? 17 | - Response distribution selection? 18 | - Correlated covariates? 19 | 20 | 21 | ```{r setup, echo=FALSE} 22 | library(mgcv) 23 | library(magrittr) 24 | library(ggplot2) 25 | library(dplyr) 26 | library(statmod) 27 | ``` 28 | 29 | ```{r pres_setup, include=FALSE} 30 | library(knitr) 31 | opts_chunk$set(cache=TRUE, echo=FALSE, fig.align="center") 32 | ``` 33 | 34 | blah 35 | ==== 36 | type:section 37 | title:none 38 | 39 |
"perhaps the most important part of applied statistical modelling"
40 | 41 | Simon Wood, Generalized Additive Models: An Introduction in R 42 | 43 | Basis size (k) 44 | ============== 45 | 46 | - `k` $\approx$ number of basis functions 47 | - Set `k` per term 48 | - e.g. `s(x, k=10)` or `s(x, y, k=100)` 49 | - Penalty removes "extra" wigglyness 50 | - *up to a point!* 51 | - (But computation is slower with bigger `k`) 52 | 53 | ```{r loaddat} 54 | load("../data/drake_griffen/drake_griffen.RData") 55 | ``` 56 | 57 | Default basis size 58 | ==================== 59 | 60 | ```{r gam_check, fig.keep="none", include=TRUE,echo=TRUE, fig.width=15} 61 | b <- gam(Nhat ~ s(day), data=pop_unhappy, method="REML") 62 | gam.check(b) 63 | ``` 64 | 65 | Increasing basis size 66 | ==================== 67 | 68 | ```{r gam_check2, fig.keep="none", include=TRUE,echo=TRUE, fig.width=15} 69 | b_50 <- gam(Nhat ~ s(day, k=50), data=pop_unhappy, method="REML") 70 | gam.check(b_50) 71 | ``` 72 | 73 | Does it make a difference?! 74 | =========================== 75 | 76 | ```{r ext-flex, fig.width=12} 77 | par(mfrow=c(1,2)) 78 | plot(b, main="k=10", shift=mean(pop_unhappy$Nhat), scale=0, shade=TRUE) 79 | abline(h=0) 80 | points(pop_unhappy[,c("day", "Nhat")], pch=19, cex=0.3, col="darkgrey") 81 | plot(b_50, main="k=50", shift=mean(pop_unhappy$Nhat), scale=0, shade=TRUE) 82 | abline(h=0) 83 | points(pop_unhappy[,c("day", "Nhat")], pch=19, cex=0.3, col="darkgrey") 84 | ``` 85 | 86 | General strategy 87 | ================ 88 | 89 | - leave `k` at default, see what happens 90 | - double `k` depending on results from `gam.check` 91 | - repeat 92 | 93 |
94 | `?choose.k` has some good advice 95 | 96 | Historical/philosophical note 97 | ============================= 98 | 99 | - "Keep relationships simple and interpretable" 100 | - What does this mean? 101 | - Bias confirmation? 102 | - Limit model to get "clean" relationships? 103 | - Some literature suggests "limit `k=5`" or somesuch 104 | - Original `gam` package for S+ had a default `k=5` 105 | - Coincidence? 106 | - (Simon Wood, pers. comm.) 107 | 108 | 109 | 110 | Residual checks 111 | =============== 112 | type:section 113 | 114 | Residuals 115 | ========= 116 | 117 | - Deal with 2 types of residuals 118 | - Deviance 119 | - Randomized quantile 120 | - Raw residuals are just (observed - fitted) 121 | - Analog to R^2 122 | - Difficult to assess mean-variance relationship graphically 123 | - Need to rescale so mean-variance is constant 124 | 125 | Deviance residuals 126 | ================== 127 | 128 | - Deviance $\approx$ "R^2 for GAMs" 129 | - Per-observation deviance $\approx$ raw resids? 130 | - Multiply by sign of (observed-fitted) 131 | - Should be Normal(0, 1) distributed 132 | 133 | gam.check() plots 134 | ============================= 135 | 136 | `gam.check()` creates 4 plots: 137 | 138 | 1. Quantile-quantile plots of residuals 139 | 2. Histogram of residuals 140 | 3. Residuals vs. linear predictor 141 | 4. Observed vs. fitted values 142 | 143 | 144 | Checking response distribution 145 | ============================== 146 | 147 | - Left side of `gam.check` plots 148 | - Examples from the Drake & Griffen data 149 | - Looking at the "deteriorating" populations 150 | 151 | ```{r, fig.width=12} 152 | plot(b_50, main="unhappy zooplankton", shift=mean(pop_unhappy$Nhat), scale=0, shade=TRUE) 153 | abline(h=0) 154 | points(pop_unhappy[,c("day", "Nhat")], pch=19, cex=0.3, col="darkgrey") 155 | ``` 156 | 157 | Normal response with count data 158 | =================================== 159 | 160 | ```{r gam_gauss, echo=TRUE, results="hide"} 161 | b <- gam(Nhat ~ s(day, k=50), data=pop_unhappy, method="REML") 162 | ``` 163 | ```{r gam_gauss_p, results="hide", fig.width=10, fig.height=6} 164 | gam.check(b) 165 | ``` 166 | 167 | What about a count distribution? 168 | ================================ 169 | 170 | ```{r gam_quasi, echo=TRUE, results="hide"} 171 | b_quasi <- gam(Nhat ~ s(day, k=50), data=pop_unhappy, method="REML", family=quasipoisson()) 172 | ``` 173 | ```{r gam_quasi_p, results="hide", fig.width=10, fig.height=6} 174 | gam.check(b_quasi) 175 | ``` 176 | 177 | 178 | What about a fancier count distribution? 179 | ================================ 180 | 181 | ```{r gam_nb, echo=TRUE, results="hide"} 182 | b_nb <- gam(Nhat ~ s(day, k=50), data=pop_happy, method="REML", family=nb()) 183 | ``` 184 | ```{r gam_nb_p, results="hide", fig.width=10, fig.height=8} 185 | gam.check(b_nb) 186 | ``` 187 | 188 | 189 | Variance relationships 190 | ====================== 191 | 192 | - Heteroskedasticity 193 | - Do we know that the mean-variance relationship is right? 194 | - Deviance resids should give us constant variance if model correct? 195 | - Right column of `gam.check`: 196 | - residuals vs. linear prediction == cloud 197 | - Response vs. 
fitted == line-ish 198 | 199 | Mean-variance incorrect 200 | ======================= 201 | 202 | ```{r gam_nb_p2, results="hide", fig.width=10, fig.height=8} 203 | gam.check(b_nb) 204 | ``` 205 | 206 | Close up 207 | ======== 208 | 209 | ```{r res-linear-nb, results="hide", fig.width=10, fig.height=8} 210 | resid <- residuals(b_nb, type = "deviance") 211 | linpred <- predict(b_nb) 212 | plot(linpred, resid, main = "Resids vs. linear pred.", xlab = "linear predictor", 213 | ylab = "residuals", cex=1.5) 214 | ``` 215 | 216 | Randomized quantile residuals 217 | ============================= 218 | 219 | ```{r qres-linear-nb, results="hide", fig.width=10, fig.height=8} 220 | library(statmod) 221 | b_nb$theta <- b_nb$family$getTheta(TRUE) 222 | resid <- qresid(b_nb) 223 | plot(linpred, resid, main = "RQ-Resids vs. linear pred.", xlab = "linear predictor", 224 | ylab = "randomized quantile residuals", cex=1.5) 225 | ``` 226 | 227 | 228 | Shortcomings 229 | ============= 230 | 231 | - Deviance resids vs. linear predictor is victim of artifacts 232 | - Need an alternative 233 | - "Randomised quantile residuals" (*experimental*) 234 | - `rqresiduals` in `statmod` 235 | - Exactly normal residuals ... if the model is right! 236 | - `rqgam.check` in `dsm` (ignore left side plots!) 237 | 238 | These plots are just the start 239 | ============================== 240 | 241 | - Need to go further 242 | - Look at aggregations of residuals by other variables 243 | 244 | ```{r resids-by-pop, results="hide", fig.width=12, fig.height=7} 245 | pop_happy$resids <- residuals(b_nb) 246 | boxplot(resids~ID, pop_happy, main="Residuals per experimental population") 247 | ``` 248 | 249 | Residual checking as art form 250 | ============================= 251 | 252 | - Residuals can tell you a **lot** about your model 253 | - No general method 254 | - Depends on data 255 | - Depends on inferential goals 256 | - Highlight model deficiencies 257 | - Inform what to do next; which other questions are interesting 258 | 259 | 260 | Tobler's first law of geography 261 | ================================== 262 | type:section 263 | 264 | "Everything is related to everything else, but near things are more related than distant things" 265 | 266 | Tobler (1970) 267 | 268 | 269 | Concurvity 270 | ========== 271 | 272 | - We all know correlated covariates are bad 273 | - What about non-linear correlations? 274 | - Can we describe covariates as functions of each other? 
275 | - Important for model selection --- sensitivity analysis 276 | 277 | Example - US East coast 278 | ======================= 279 | 280 | ![us east coast with covariates](images/spermcovars.png) 281 | 282 | Checking for concurvity 283 | ======================= 284 | 285 | - Measures, for each smooth term, how well this term could be approximated by 286 | - `concurvity(model, full=TRUE)`: some combination of all other smooth terms 287 | - `concurvity(model, full=FALSE)`: each of the other smooth terms in the model 288 | (useful for identifying which terms are causing issues) 289 | 290 | Plotting concurvity 291 | =================== 292 | 293 | ![concurvity vis](images/concurvity.png) 294 | *** 295 | - We can visualise 296 | - `vis.concurvity` on course site 297 | 298 | Concurvity: things to remember 299 | ============================== 300 | - Can make your model unstable to small changes 301 | - `cor(data)` not sufficient: use the `concurvity(model)` function 302 | - Not always obvious from plots of covariates or smooths 303 | 304 | 305 | 306 | -------------------------------------------------------------------------------- /slides/04-model_selection.Rpres: -------------------------------------------------------------------------------- 1 | Model selection 2 | =============== 3 | author: David L Miller 4 | css: custom.css 5 | transition: none 6 | 7 | 8 | What do we mean by "model selection"? 9 | ===================================== 10 | 11 | - Term "selection" 12 | - Path dependence 13 | - Shrinkage 14 | - Selection between models 15 | - Term formulation 16 | - Selection between structurally different models 17 | 18 | ```{r setup, include=FALSE} 19 | library("knitr") 20 | library("viridis") 21 | library("ggplot2") 22 | library("mgcv") 23 | theme_set(theme_minimal()) 24 | opts_chunk$set(cache=TRUE, echo=FALSE) 25 | ``` 26 | 27 | Term selection 28 | ============== 29 | type:section 30 | 31 | Term selection via p-values 32 | =========================== 33 | 34 | - Old paradigm -- select terms using $p$-values 35 | - $p$-values are **approximate** 36 | 37 | 1. treat smoothing parameters as *known* 38 | 2. rely on asymptotic behaviour 39 | 40 | ($p$-values in `summary.gam()` have changed a lot over time --- all options except current default are deprecated as of `v1.18-13` (*i.e.,* ignore what's in the book!).) 41 | 42 | Technical stuff 43 | =============== 44 | 45 | Test of **zero-effect** of a smooth term 46 | 47 | Default $p$-values rely on theory of Nychka (1988) and Marra & Wood (2012) for confidence interval coverage. 48 | 49 | If the Bayesian CI have good across-the-function properties, Wood (2013a) showed that the $p$-values have: 50 | 51 | - almost the correct null distribution 52 | - reasonable power 53 | 54 | Test statistic is a form of $\chi^2$ statistic, but with complicated degrees of freedom. 55 | 56 | (RE)ML rant again... 57 | ==================== 58 | 59 | Best behaviour when smoothness selection is by **ML** (**REML** also good) 60 | 61 | Neither of these are the default, so remember to use `method = "ML"` or `method = "REML"` as appropriate 62 | 63 | 64 | Term selection by shrinkage 65 | ============================ 66 | type:section 67 | 68 | Shrinkage & additional penalties 69 | ================================ 70 | 71 | - Usually can't remove a whole function when smoothing 72 | - Penalties used act only on *range space* 73 | - *null space* of the basis is unpenalised. 
74 |
75 | Parts of $f$ whose 2nd derivative is zero (the constant and linear parts) aren't penalised by
76 |
77 | $$
78 | \int_\mathbb{R} \left( \frac{\partial^2 f(x)}{\partial x^2}\right)^2 \text{d}x
79 | $$
80 |
81 | (Note that penalty form depends on the basis!)
82 |
83 | Double-penalty shrinkage
84 | ========================
85 |
86 | Take $\mathbf{S}_j$, the penalty matrix for term $j$, and eigendecompose:
87 |
88 | $$
89 | \mathbf{S}_j = \mathbf{U}_j\mathbf{\Lambda}_j\mathbf{U}_j^{T}
90 | $$
91 |
92 | where $\mathbf{U}_j$ is a matrix of eigenvectors and $\mathbf{\Lambda}_j$ a diagonal matrix of eigenvalues (i.e., this is an eigen decomposition of $\mathbf{S}_j$).
93 |
94 | $\mathbf{\Lambda}_j$ contains some **0**s due to the spline basis null space --- no matter how large the penalty $\lambda_j$ gets, there is no guarantee that a smooth term will be suppressed completely.
95 |
96 |
97 | Shrinkage & additional penalties
98 | ================================
99 |
100 | `mgcv` has two ways to penalize the null space $\Rightarrow$ term selection
101 |
102 | - *double penalty approach* via `select = TRUE`
103 | - *shrinkage approach* via special bases `"ts"` and `"cs"`
104 |
105 | Marra & Wood (2011) review other options.
106 |
107 | Double-penalty shrinkage
108 | ========================
109 |
110 | Create a second penalty matrix from $\mathbf{U}_j$, considering only the matrix of eigenvectors associated with the zero eigenvalues
111 |
112 | $$
113 | \mathbf{S}_j^{*} = \mathbf{U}_j^{*}\mathbf{U}_j^{*T}
114 | $$
115 |
116 | Now we can fit a GAM with two penalties of the form
117 |
118 | $$
119 | \lambda_j \mathbf{\beta}^T \mathbf{S}_j \mathbf{\beta} + \lambda_j^{*} \mathbf{\beta}^T \mathbf{S}_j^{*} \mathbf{\beta}
120 | $$
121 |
122 | In practice, add `select = TRUE` to your `gam()` call
123 |
124 | Shrinkage
125 | =========
126 |
127 | - Double penalty $\Rightarrow$ twice as many smoothing parameters
128 | - Alternative is shrinkage: add a small value to the zero eigenvalues
129 | - Null space terms are then shrunk at the same time as the range space
130 |
131 | Use `s(..., bs = "ts")` or `s(..., bs = "cs")` in **mgcv**
132 |
133 | Selecting between models
134 | ========================
135 | type:section
136 |
137 | Model selection by REML/ML
138 | ==========================
139 |
140 | - If you **don't** use shrinkage/double penalty you can't use REML scores to select models
141 | - Need the model to be fully penalized for REML score selection
142 | - You can use ML though
143 | - *but* there are other options...
144 |
145 | AIC
146 | ===
147 | type:section
148 |
149 | AIC
150 | ============
151 |
152 | - The full likelihood of $\boldsymbol{\beta}$ *conditional* upon $\lambda_j$ is used, with the EDF replacing $k$, the number of model parameters
153 | - This *conditional* AIC tends to select complex models, especially those with random effects, as the EDF ignores that $\lambda_j$ are estimated
154 | - Wood et al (2015) suggest a correction that accounts for uncertainty in $\lambda_j$ (`AIC`)
155 |
156 | Okay, fine, but...
157 | ==================
158 | type:section
159 |
160 | GAMs are Bayesian models
161 | ========================
162 | type:section
163 |
164 | Bayesian models
165 | ===============
166 |
167 | - *duh*
168 | - we can build Bayesian GLMs
169 | - see also INLA and BayesX
170 | - `mgcv` fits Bayesian models
171 | - penalties are prior precision matrices
172 | - (improper) Gaussian prior on $\boldsymbol{\beta}$
173 |
174 | Empirical Bayes...?
175 | ===================
176 |
177 | - Improper prior derives from $\mathbf{S}_j$ not being of full rank
178 | - zeroes in $\mathbf{\Lambda}_j$.
179 | - Double penalty and shrinkage smooths make prior proper
180 | - **Double penalty**: no assumption as to how much to shrink the null space
181 | - **Shrinkage smooths**: assume null space should be shrunk less than the wiggles
182 |
183 | Practical Bayes
184 | ===============
185 |
186 | Marra & Wood (2011) show that the double penalty and the shrinkage smooth approaches:
187 |
188 | - performed significantly better than alternatives in terms of *predictive ability*, and
189 | - performed as well as alternatives in terms of variable selection
190 |
191 |
192 |
193 | Examples
194 | ========
195 | type:section
196 |
197 | Back to the dolphins...
198 | =======================
199 | type:section
200 |
201 | ```{r}
202 | load("../data/mexdolphins/mexdolphins.RData")
203 | ```
204 |
205 | Fitting some models
206 | =======================
207 |
208 | ```{r echo=TRUE}
209 | dolphins_depth_xy <- gam(count ~ s(x, y) + s(depth) + offset(off.set),
210 | data = mexdolphins,
211 | family=nb(), method="REML")
212 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set),
213 | data = mexdolphins,
214 | family=nb(), method="REML")
215 | dolphins_xy <- gam(count ~ s(x, y) + offset(off.set),
216 | data = mexdolphins,
217 | family=nb(), method="REML")
218 | ```
219 |
220 | Comparing terms by p-value
221 | ==========================
222 |
223 | ```{r echo=TRUE}
224 | summary(dolphins_depth_xy)
225 | ```
226 |
227 | Comparing models by AIC
228 | =======================
229 |
230 | ```{r echo=TRUE}
231 | AIC(dolphins_xy, dolphins_depth, dolphins_depth_xy)
232 | ```
233 |
234 |
235 | Shrinkage (basis)
236 | =================
237 |
238 | ```{r echo=TRUE}
239 | dolphins_depth_xy_s <- gam(count ~ s(x, y, bs="ts") + s(depth, bs="ts") + offset(off.set),
240 | data = mexdolphins,
241 | family=nb(), method="REML")
242 | ```
243 |
244 |
245 | Shrinkage (basis)
246 | =================
247 | ```{r echo=TRUE}
248 | summary(dolphins_depth_xy_s)
249 | ```
250 |
251 |
252 | Shrinkage (extra penalty)
253 | =========================
254 |
255 | ```{r echo=TRUE}
256 | dolphins_depth_xy_e <- gam(count ~ s(x, y) + s(depth) + offset(off.set),
257 | data = mexdolphins, select=TRUE,
258 | family=nb(), method="REML")
259 | ```
260 |
261 |
262 | Shrinkage (extra penalty)
263 | =========================
264 | ```{r echo=TRUE}
265 | summary(dolphins_depth_xy_e)
266 | ```
267 |
268 | These last two models were empirical Bayes models
269 | =================================================
270 | type:section
271 |
272 | That's all from the dolphins for now...
273 | =======================================
274 | type:section
275 |
276 |
277 | ![dolphins from HHG2G](images/thanks_for_all_the_fish.gif)
278 |
279 |
280 |
281 |
282 |
--------------------------------------------------------------------------------
/slides/05-inference.Rpres:
--------------------------------------------------------------------------------
1 | Inference
2 | =========
3 | author: David L Miller
4 | css: custom.css
5 | transition: none
6 |
7 |
8 | ```{r setup, include=FALSE}
9 | library(knitr)
10 | library(viridis)
11 | library(ggplot2)
12 | library(reshape2)
13 | library(animation)
14 | library(mgcv)
15 | opts_chunk$set(cache=TRUE, echo=FALSE)
16 | theme_set(theme_minimal()+theme(text=element_text(size=20)))
17 | ```
18 |
19 |
20 | What do we want to know?
21 | ========================
22 |
23 | - Don't just fit models for the sake of it!
24 | - What are our questions?
25 | - Relationship to covariates
26 | - Abundance
27 | - Distribution
28 | - Response to disturbance
29 | - Temporal changes
30 | - Other stuff?
31 |
32 |
33 |
34 | Prediction
35 | ===========
36 | type:section
37 |
38 | What is a prediction?
39 | =====================
40 |
41 | - Evaluate the model, at a particular covariate combination
42 | - Answering (e.g.) the question "at a given depth, how many dolphins?"
43 | - Steps:
44 | 1. evaluate the $s(\ldots)$ terms
45 | 2. move to the response scale (exponentiate? Do nothing?)
46 | 3. (multiply by any offset etc)
47 |
48 | Example of prediction
49 | ======================
50 |
51 | - in maths:
52 | - Model: $\text{count}_i = A_i \exp \left( \beta_0 + s(x_i, y_i) + s(\text{Depth}_i)\right)$
53 | - Drop in the values of $x, y, \text{Depth}$ (and $A$)
54 | - in R:
55 | - build a `data.frame` with $x, y, \text{Depth}, A$
56 | - use `predict()`
57 |
58 | ```{r echo=TRUE, eval=FALSE}
59 | preds <- predict(my_model, newdata=my_data, type="response")
60 | ```
61 |
62 | (`se.fit=TRUE` gives a standard error for each prediction)
63 |
64 | Back to the dolphins...
65 | =======================
66 | type:section
67 |
68 | Where are the dolphins?
69 | =======================
70 |
71 | ```{r echo=FALSE}
72 |
73 | ```
74 |
75 | ```{r loaddat}
76 | load("../data/mexdolphins/mexdolphins.RData")
77 | ```
78 |
79 | ```{r gridplotfn}
80 | library(rgdal)
81 | library(rgeos)
82 | library(maptools)
83 | library(plyr)
84 | # fill must be in the same order as the polygon data
85 | grid_plot_obj <- function(fill, name, sp){
86 |
87 | # what was the data supplied?
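# strip names so the fill values pair with the polygons by position,
# then build a data.frame to attach to the SpatialPolygonsDataFrame below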
88 | names(fill) <- NULL 89 | row.names(fill) <- NULL 90 | data <- data.frame(fill) 91 | names(data) <- name 92 | 93 | spdf <- SpatialPolygonsDataFrame(sp, data) 94 | spdf@data$id <- rownames(spdf@data) 95 | spdf.points <- fortify(spdf, region="id") 96 | spdf.df <- join(spdf.points, spdf@data, by="id") 97 | 98 | # seems to store the x/y even when projected as labelled as 99 | # "long" and "lat" 100 | spdf.df$x <- spdf.df$long 101 | spdf.df$y <- spdf.df$lat 102 | 103 | geom_polygon(aes_string(x="x",y="y",fill=name, group="group"), data=spdf.df) 104 | } 105 | # some nearby states, transformed 106 | library(mapdata) 107 | map_dat <- map_data("worldHires",c("usa","mexico")) 108 | lcc_proj4 <- CRS("+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ") 109 | map_sp <- SpatialPoints(map_dat[,c("long","lat")]) 110 | pred.polys <- spTransform(pred_latlong, CRSobj=lcc_proj4) 111 | 112 | # give the sp object a projection 113 | proj4string(map_sp) <-CRS("+proj=longlat +datum=WGS84") 114 | # re-project 115 | map_sp.t <- spTransform(map_sp, CRSobj=lcc_proj4) 116 | map_dat$x <- map_sp.t$long 117 | map_dat$y <- map_sp.t$lat 118 | dolphins_depth <- gam(count ~ s(depth) + offset(off.set), 119 | data = mexdolphins, 120 | family=nb(), method="REML") 121 | ``` 122 | 123 | ```{r echo=TRUE} 124 | dolphin_preds <- predict(dolphins_depth, newdata=preddata, 125 | type="response") 126 | ``` 127 | 128 | ```{r fig.width=20} 129 | p <- ggplot() + 130 | grid_plot_obj(dolphin_preds, "N", pred.polys) + 131 | geom_line(aes(x, y, group=Transect.Label), data=mexdolphins) + 132 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) + 133 | geom_point(aes(x, y, size=count), 134 | data=mexdolphins[mexdolphins$count>0,], 135 | colour="red", alpha=I(0.7)) + 136 | coord_fixed(ratio=1, ylim = range(mexdolphins$y), xlim = range(mexdolphins$x)) + 137 | scale_fill_viridis() + 138 | labs(fill="Predicted\ndensity", x="x", y="y", size="Count") 139 | print(p) 140 | ``` 141 | 142 | (`ggplot2` code included in the slide source) 143 | 144 | Prediction summary 145 | ================== 146 | 147 | - Evaluate the fitted model at a given point 148 | - Can evaluate many at once (`data.frame`) 149 | - Don't forget the `type=...` argument! 150 | - Obtain per-prediction standard error with `se.fit` 151 | 152 | Without uncertainty, we're not doing statistics 153 | =============================================== 154 | type:section 155 | 156 | 157 |
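As a first look at uncertainty, a minimal sketch of the `se.fit` route mentioned in the prediction summary (reusing `dolphins_depth` and `preddata` from the previous slides):

```{r sefit-sketch, echo=TRUE, eval=FALSE}
# per-prediction standard errors to go with the predictions
pr <- predict(dolphins_depth, newdata=preddata, type="response", se.fit=TRUE)
head(pr$fit)    # predicted counts
head(pr$se.fit) # their standard errors
```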
161 |
162 |
163 | Where does uncertainty come from?
164 | =================================
165 |
166 | - $\boldsymbol{\beta}$: uncertainty in the spline parameters
167 | - $\boldsymbol{\lambda}$: uncertainty in the smoothing parameter
168 |
169 | - (Traditionally we've only addressed the former)
170 | - (New tools let us address the latter...)
171 |
172 |
173 | Parameter uncertainty
174 | =======================
175 |
176 | From theory:
177 |
178 | $$
179 | \boldsymbol{\beta} \sim N(\hat{\boldsymbol{\beta}}, \mathbf{V}_\boldsymbol{\beta})
180 | $$
181 |
182 | (*caveat: the normality is only* **approximate** *for non-normal response*)
183 |
184 |
185 | **What does this mean?** A variance for each parameter (and covariances between them).
186 |
187 | In `mgcv`: `vcov(model)` returns $\mathbf{V}_\boldsymbol{\beta}$.
188 |
189 |
190 | What can we do with this?
191 | =========================
192 |
193 | - confidence intervals in `plot`
194 | - standard errors using `se.fit`
195 | - derived quantities? (see bibliography)
196 |
197 | blah
198 | ====
199 | title:none
200 | type:section
201 |
202 |
203 |
204 |
205 |
206 | The lpmatrix, magic, etc
207 | ==============================
208 |
209 | For regular predictions:
210 |
211 | $$
212 | \hat{\boldsymbol{\eta}}_p = L_p \hat{\boldsymbol{\beta}}
213 | $$
214 |
215 | form $L_p$ using the prediction data, evaluating basis functions as we go.
216 |
217 | (Need to apply the *inverse* link function to $\hat{\boldsymbol{\eta}}_p$ to get response-scale predictions)
218 |
219 | But the $L_p$ fun doesn't stop there...
220 |
221 | [[mathematics intensifies]]
222 | ============================
223 | type:section
224 |
225 | Variance and lpmatrix
226 | ======================
227 |
228 | To get variance on the scale of the linear predictor:
229 |
230 | $$
231 | V_{\hat{\boldsymbol{\eta}}} = L_p V_{\hat{\boldsymbol{\beta}}} L_p^\text{T}
232 | $$
233 |
234 | pre-/post-multiplication shifts the variance matrix from parameter space to linear predictor space.
235 |
236 | (Can then pre-/post-multiply by derivatives of the link to put variance on the response scale)
237 |
238 | Simulating parameters
239 | ======================
240 |
241 | - $\boldsymbol{\beta}$ has a distribution, we can simulate
242 |
243 | ```{r paramsim, results="hide"}
244 | library(mvtnorm)
245 |
246 | # get the Lp matrix
247 | Lp <- predict(dolphins_depth, newdata=preddata, type="lpmatrix")
248 |
249 | # how many realisations do we want?
250 | frames <- 100
251 |
252 | # generate the betas from the GAM "posterior"
253 | betas <- rmvnorm(frames, coef(dolphins_depth), vcov(dolphins_depth))
254 |
255 |
256 | # use a function to get animation to play nice...
257 | anim_map <- function(frames){
258 | # loop to make plots
259 | for(frame in 1:frames){
260 |
261 | # make the prediction
262 | preddata$preds <- preddata$area * exp(Lp%*%betas[frame,])
263 |
264 | # plot it (using viridis)
265 | p <- ggplot() +
266 | grid_plot_obj(preddata$preds, "N", pred.polys) +
267 | geom_polygon(aes(x=x, y=y, group = group), fill = "#1A9850", data=map_dat) +
268 | coord_fixed(ratio=1, ylim = range(mexdolphins$y),
269 | xlim = range(mexdolphins$x)) +
270 | scale_fill_viridis(limits=c(0,400)) +
271 | labs(fill="Predicted\ndensity",x="x",y="y",size="Count")
272 |
273 | print(p)
274 | }
275 | }
276 |
277 | # make the animation!
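# (saveGIF is from the animation package loaded in the setup chunk; it
# assumes an external tool such as ImageMagick is installed to stitch frames)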
278 | saveGIF(anim_map(frames), "uncertainty.gif", interval = 0.15, ani.width = 800, ani.height = 400)
279 | ```
280 |
281 | ![Animation of uncertainty](uncertainty.gif)
282 |
283 | Uncertainty in smoothing parameter
284 | ==================================
285 |
286 | - Recent work by Simon Wood
287 | - "smoothing parameter uncertainty corrected" version of $V_\hat{\boldsymbol{\beta}}$
288 | - In a fitted model, we have:
289 | - `$Vp` what we got with `vcov`
290 | - `$Vc` the corrected version
291 |
292 | Variance summary
293 | ================
294 |
295 | - Everything comes from variance of parameters
296 | - Need to re-project/scale them to get the quantities we need
297 | - `mgcv` does most of the hard work for us
298 | - Fancy stuff possible with a little maths
299 | - Can include uncertainty in the smoothing parameter too
300 |
301 | Summary
302 | =======
303 |
304 | - `predict` is your friend
305 | - Most stuff comes down to matrix algebra that `mgcv` shields you from
306 | - To do fancy stuff, get inside the matrices
307 |
--------------------------------------------------------------------------------
/slides/06-fancy.Rpres:
--------------------------------------------------------------------------------
1 | Fancy stuff
2 | ============================
3 | author: David L Miller
4 | css: custom.css
5 |
6 | A word of warning
7 | =================
8 |
9 |
12 | ![jenny is the best](images/jenny_models.png)
13 |
14 |
15 | Away from the exponential family
16 | ===================================
17 | type:section
18 |
19 |
20 | ```{r setup, include=F}
21 | library(mgcv)
22 | library(ggplot2)
23 | ```
24 |
25 | ```{r pres_setup, include =F}
26 | library(knitr)
27 | opts_chunk$set(cache=TRUE, echo=FALSE,fig.align="center")
28 | ```
29 |
30 |
31 | Modelling "counts"
32 | ================
33 | type:section
34 |
35 | Counts and count-like things
36 | ============================
37 |
38 | - Response is a count (not always integer)
39 | - Often, it's mostly zero (that's complicated)
40 | - Could also be catch per unit effort, biomass etc
41 | - Flexible mean-variance relationship
42 |
43 | ![The Count from Sesame Street](images/count.jpg)
44 |
45 | Tweedie distribution
46 | =====================
47 |
48 | ```{r tweedie}
49 | library(tweedie)
50 | library(RColorBrewer)
51 |
52 | # tweedie
53 | y<-seq(0.01,5,by=0.01)
54 | pows <- seq(1.2, 1.9, by=0.1)
55 |
56 | fymat <- matrix(NA, length(y), length(pows))
57 |
58 | i <- 1
59 | for(pow in pows){
60 | fymat[,i] <- dtweedie( y=y, power=pow, mu=2, phi=1)
61 | i <- i+1
62 | }
63 |
64 | plot(range(y), range(fymat), type="n", ylab="Density", xlab="x", cex.lab=1.5,
65 | main="")
66 |
67 | rr <- brewer.pal(8,"Dark2")
68 |
69 | for(i in 1:ncol(fymat)){
70 | lines(y, fymat[,i], type="l", col=rr[i], lwd=2)
71 | }
72 | ```
73 | ***
74 | - $\text{Var}\left(\text{count}\right) = \phi\mathbb{E}(\text{count})^q$
75 | - Common distributions are sub-cases:
76 | - $q=0 \Rightarrow$ Normal
77 | - $q=1 \Rightarrow$ Poisson
78 | - $q=2 \Rightarrow$ Gamma
79 | - We are interested in $1 < q < 2$
80 | - (here $q = 1.2, 1.3, \ldots, 1.9$)
81 | - `tw()`
82 |
83 |
84 | Negative binomial
85 | ==================
86 |
87 | ```{r negbin}
88 | y<-seq(1,12,by=1)
89 | disps <- seq(0.001, 1, len=10)
90 |
91 | fymat <- matrix(NA, length(y), length(disps))
92 |
93 | i <- 1
94 | for(disp in disps){
95 | fymat[,i] <- dnbinom(y, size=disp, mu=5)
96 | i <- i+1
97 | }
98 |
99 | plot(range(y), range(fymat), type="n", ylab="Density", xlab="x", cex.lab=1.5,
100 | main="")
101 |
102 | rr <- brewer.pal(8,"Dark2")
103 |
104 | for(i in 1:ncol(fymat)){
105 | lines(y, fymat[,i], type="l", col=rr[i], lwd=2)
106 | }
107 | ```
108 | ***
109 | - $\text{Var}\left(\text{count}\right) =$ $\mathbb{E}(\text{count}) + \kappa \mathbb{E}(\text{count})^2$
110 | - Estimate $\kappa$
111 | - Is quadratic relationship a "strong" assumption?
112 | - Similar to Poisson: $\text{Var}\left(\text{count}\right) =\mathbb{E}(\text{count})$
113 | - `nb()`
114 |
115 |
116 | Zero-inflated distributions
117 | ===========================
118 |
119 | - Models the probability of zero separately from the mean count, given that you've observed more than zero
120 | at a location.
121 | - `ziP` and `ziplss` (for location-scale models)
122 | - zero inflation is assessed *conditional* on the model
123 | - is what you have zero inflation or just lots of zeros?
124 | - don't just jump straight to zero inflation 125 | 126 | 127 | 128 | Other distributions 129 | ====================== 130 | type:section 131 | 132 | The Beta distribution 133 | ====================== 134 | 135 | ```{r beta-dist} 136 | shape1 <- c(0.2, 1, 5, 1, 3, 1.5) 137 | shape2 <- c(0.2, 3, 1, 1, 1.5, 3) 138 | x <- seq(0.01, 0.99, length = 200) 139 | rr <- brewer.pal(length(shape1), "Dark2") 140 | fymat <- mapply(dbeta, shape1, shape2, MoreArgs = list(x = x)) 141 | matplot(x, fymat, type = "l", col = rr, lwd = 2, lty = "solid") 142 | legend("top", bty = "n", 143 | legend = expression(alpha == 0.2 ~~ beta == 0.2, 144 | alpha == 1.0 ~~ beta == 3.0, 145 | alpha == 5.0 ~~ beta == 1.0, 146 | alpha == 1.0 ~~ beta == 1.0, 147 | alpha == 3.0 ~~ beta == 1.5, 148 | alpha == 1.5 ~~ beta == 3.0), 149 | col = rr, cex = 1.25, lty = "solid", lwd = 2) 150 | ``` 151 | 152 | *** 153 | 154 | - Proportions; continuous, bounded at 0 & 1 155 | - Beta distribution is convenient choice 156 | - Two strictly positive shape parameters, $\alpha$ & $\beta$ 157 | - Has support on $x \in (0,1)$ 158 | - Density at $x = 0$ & $x = 1$ is $\infty$, fudge 159 | - `betar()` family in **mgcv** 160 | 161 | t-distribution 162 | ============================ 163 | - Models continuous data w/ longer tails than normal 164 | - Far less sensitive to outliers 165 | - Has one extra parameter: df. 166 | - bigger df: t dist approaches normal 167 | 168 | 169 | ```{r texample2, include =T,echo=F,results= "hide", fig.width=15, fig.height=8} 170 | set.seed(4) 171 | n=300 172 | dat = data.frame(x=seq(0,10,length=n)) 173 | dat$f = 20*exp(-dat$x)*dat$x 174 | dat$y = 1*rt(n,df = 3) + dat$f 175 | norm_mod = gam(y~s(x,k=20), data=dat, family=gaussian(link="identity")) 176 | t_mod = gam(y~s(x,k=20), data=dat, family=scat(link="identity")) 177 | predict_norm = predict(norm_mod,se.fit=T) 178 | predict_t = predict(t_mod,se.fit=T) 179 | fit_vals = data.frame(x = c(dat$x,dat$x), 180 | fit =c(predict_norm[[1]],predict_t[[1]]), 181 | se_min = c(predict_norm[[1]] - 2*predict_norm[[2]], 182 | predict_t[[1]] - 2*predict_t[[2]]), 183 | se_max = c(predict_norm[[1]] + 2*predict_norm[[2]], 184 | predict_t[[1]] + 2*predict_t[[2]]), 185 | model = rep(c("normal errors","t-errors"),each=n)) 186 | ggplot(aes(x=x,y=fit),data=fit_vals)+ 187 | facet_grid(.~model)+ 188 | geom_line(col="red")+ 189 | geom_ribbon(aes(ymin =se_min,ymax = se_max),alpha=0.5,fill="red")+ 190 | annotate(x = dat$x,y=dat$y,size=2,geom="point")+ 191 | annotate(x = dat$x,y=dat$f,size=2,geom="line")+ 192 | theme_bw(20) 193 | ``` 194 | 195 | 196 | 197 | Ordered categorical data 198 | =========================== 199 | ```{r ocat_ex2, include =T,echo=F,results= "hide", fig.width=6} 200 | set.seed(4) 201 | n= 100 202 | dat = data.frame(body_size = seq(-2,2, length=200)) 203 | dat$linear_predictor = 6*exp(dat$body_size*2)/(1+exp(dat$body_size*2))-2 204 | 205 | ggplot(aes(x=body_size, y=linear_predictor),data=dat) + 206 | geom_line()+ 207 | geom_ribbon(aes(ymin=linear_predictor-1,ymax=linear_predictor+1),alpha=0.25)+ 208 | annotate(x= dat$body_size, ymin=-3,ymax=-1, alpha=0.25,geom="ribbon")+ 209 | annotate(x= 0, y=-2, label = "least concern",geom="text",size=10)+ 210 | annotate(x= dat$body_size, ymin=-1,ymax=1.5, alpha=0.25,geom="ribbon",fill="red")+ 211 | annotate(x= 0, y=0.25, label = "vulnerable",geom="text",size=10)+ 212 | annotate(x= dat$body_size, ymin=1.5,ymax=2, alpha=0.25,geom="ribbon",fill="blue")+ 213 | annotate(x= 0, y=1.75, label = "endangered",geom="text",size=10)+ 214 | annotate(x= 
0, y=3.5, label = "extinct",geom="text",size=10)+
215 | scale_x_continuous("relative body size", expand=c(0,0))+
216 | scale_y_continuous("linear predictor",limits=c(-3,5), expand=c(0,0))+
217 | theme_bw(30)+
218 | theme(panel.grid = element_blank())
219 |
220 | ```
221 |
222 | ***
223 |
224 | - Data are categories, have order
225 | - e.g.: conservation status: "least concern", "vulnerable", "endangered", "extinct"
226 | - fits a linear latent model using covariates, w/ a threshold for each level
227 | - see `?ocat`
228 | - for unordered categories, see `?multinom`
229 |
230 |
231 |
232 | Other distributions (quickly)
233 | ===================================
234 |
235 | - Multivariate normal (`family = mvn(d = 2)`)
236 | - Multivariate response, each has a different smooth, allow correlation
237 | - Cox proportional hazards (`family = cox.ph()`)
238 | - Censored data: time until an event occurs, or the study was stopped
239 | - Gaussian location-scale models (`family = gaulss()`)
240 | - mean ("location") and variance ("scale") as smooths
241 |
242 | All of these distributions have quirks! Read the manual!
243 |
244 | `?family` and `?family.mgcv`
245 |
246 | The end of the distribution zoo
247 | ==============
248 | type:section
249 |
250 | Fancy smoothers
251 | ===============
252 | type:section
253 |
254 | Cyclic smooths
255 | ==============
256 |
257 | ![cyclic smooths](images/cyclic.png)
[image: tensor of cyclic smooths]
262 |
263 |
264 | ***
265 |
266 | - cyclic smooths (`bs="cc"`)
267 | - what if smooths need to "match up"?
268 | - ensure up to 2nd derivs match
269 | - need to be careful with end points
270 | - `?smooth.construct.cc.smooth.spec`
271 |
272 |
273 | "Simple" random effects
274 | ==============
275 |
276 | - Earlier: "penalties can be thought of as variance components"
277 | - We can think of random effects as splines too!
278 | - in `mgcv` we can set `bs="re"`
279 | - these are **simple**, non-nested random effects
280 |
281 | ```{r ext-re-get}
282 | load("../data/drake_griffen/drake_griffen.RData")
283 | ```
284 |
285 | ```{r ext-re-model, fig.width=12, fig.height=4}
286 | ext_re <- gam(Nhat ~ s(day, k=50) + s(ID, bs="re"), data=pop_unhappy)
287 | plot(ext_re, shade=TRUE, pages=1, scale=0)
288 | ```
289 |
290 |
291 | Complicated random effects
292 | ==========================
293 |
294 | - `gamm` --- uses spline-random effects equiv.
295 | - cast splines as random effects, fit using `nlme`
296 | - random effects are sparse, splines are dense
297 | - often convergence problems with complex models
298 | - `random=...` argument for nesting etc
299 | - model has `$gam` and `$lme` parts
300 |
301 |
302 |
303 | Correlation structures
304 | =====================
305 |
306 | - again, need to use `gamm`
307 | - `correlation=...` gives structure
308 | - `corAR1`, `corARMA`, `corCAR1` etc
309 | - tend to be hard to fit for SDMs
310 |
311 |
312 | Fancy 2D smoothing
313 | ==================
314 | type:section
315 |
316 | Funny-shaped regions
317 | ====================
318 |
319 |
320 | ![fs plot truth](images/soap-truth.png)
322 | ![fs plot trprs](images/soap-tprs.png)
324 | ![fs plot soap](images/soap-soap.png)
326 |
327 | ***
328 |
329 | - Soap film smoother (`bs="so"`)
330 | - Model takes boundary into account by construction
331 | - Need to specify a boundary and internal knots
332 | - see `?soap`
333 |
334 |
335 | Spatial models using areas
336 | ==========================
337 |
338 | ```{r mrf}
339 | data(columb) ## data frame
340 | data(columb.polys) ## district shapes list
341 | xt <- list(polys=columb.polys) ## neighbourhood structure info for MRF
342 | par(mfrow=c(2,2))
343 | polys.plot(columb.polys, columb$crime, main="raw crime data")
344 |
345 |
346 | ## An important covariate added...
347 | b <- gam(crime ~ s(district,bs="mrf",k=20,xt=xt)+s(income),
348 | data=columb,method="REML")
349 |
350 | ## plot fitted values by district
351 | fv <- fitted(b)
352 | names(fv) <- as.character(columb$district)
353 | polys.plot(columb.polys,fv, main="fitted values")
354 |
355 | # each "smooth"
356 | plot(b,scheme=c(0,1))
357 | ```
358 |
359 |
360 | ***
361 |
362 | - Markov random fields (`bs="mrf"`)
363 | - Need to specify polygons or adjacency matrix
364 | - Not necessarily that useful for marine work?
365 | - see `?mrf`
366 |
367 |
368 | Very general modelling
369 | ======================
370 |
371 | `mgcv` can fit *anything* you can write as (on the link scale):
372 |
373 | $$
374 | \mathbf{y} = \mathbf{X}\boldsymbol{\beta} \qquad \text{subject to penalties} \qquad \sum_j \lambda_j \boldsymbol{\beta}^\text{T} \mathbf{S}_j \boldsymbol{\beta}
375 | $$
376 |
377 | if you can write your penalty as a quadratic form in the parameters, it can be part of a model in `mgcv`
378 |
379 | `?paraPen`
380 |
381 |
382 | Models for large datasets
383 | =========================
384 |
385 | - `bam` for big additive models
386 | - can handle simple correlation structures
387 | - parallel (block QR decompositions)
388 | - fast! (still experimental)
389 | - Wood, Goude, Shaw (2015)
390 | - (a small sketch follows the summary below)
391 |
392 | Fancy summary
393 | =============
394 |
395 | - You can do *a lot* of things in `mgcv`
396 | - Start small, work up to complex models
397 | - Sometimes convergence is against you
398 | - There is *a lot* of information in the manual
399 |
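Picking up the `bam` slide above, a minimal sketch on simulated data (`gamSim()` ships with `mgcv`; everything here is made up for illustration):

```{r bam-sketch, echo=TRUE, eval=FALSE}
# simulate a biggish dataset and fit it with bam()
set.seed(1)
dat <- gamSim(1, n=20000, dist="normal")
big <- bam(y ~ s(x0) + s(x1) + s(x2) + s(x3), data=dat, method="fREML")
summary(big)
```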
402 | 403 | Okay, that's enough 404 | =================== 405 | type:section 406 | 407 |
409 |
410 | converged.yt/mgcv-workshop
411 |
412 |
413 |
414 |
--------------------------------------------------------------------------------
/slides/correct_mathjax_PDF_and_move.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # correct the paths to the mathjax files for HTML versions of the slides
4 |
5 | # changes the link to be to a folder called mathjax-23 in the current directory
6 | # this avoids having lots of copies of the same mathjax stuff
7 |
8 | # also use decktape to make a PDF of the slides (probably only works on
9 | # Dave's laptop)
10 |
11 | for i in $( ls *.html); do
12 | echo processing: $i
13 | FN=${i/.html/}_files\\/
14 | sed "s/$FN//" "$i" > tmp.html
15 | mv tmp.html $i
16 | PDFFN=${i/.html/}.pdf
17 | ~/sources/decktape/bin/phantomjs ~/sources/decktape/decktape.js automatic -s 1024x768 $i $PDFFN
18 | mv $i ~/current/webwebweb/mgcv-workshop/slides/
19 | mv $PDFFN ~/current/webwebweb/mgcv-workshop/slides/
20 | done
--------------------------------------------------------------------------------
/slides/custom.css:
--------------------------------------------------------------------------------
1 | .reveal section img{
2 | box-shadow: 0px 0px 0px rgba(0, 0, 0, 0);
3 | margin: 0px;
4 | }
5 |
6 | .reveal h1, .reveal h2, .reveal h3 {
7 | word-wrap: normal;
8 | -moz-hyphens: none;
9 | }
10 |
11 | .reveal pre{
12 | width: 100%;
13 | font-size: 0.51em;
14 | }
15 |
16 | .reveal pre code {
17 | font-size: 1.2em;
18 | }
19 |
20 | div.bigq {
21 | font-size: 200%;
22 | line-height: normal;
23 | }
24 | div.bigqw {
25 | font-size: 200%;
26 | color: #fff;
27 | line-height: normal;
28 | }
29 | div.bigqmono {
30 | font-size: 200%;
31 | color: #fff;
32 | line-height: normal;
33 | font-family: monospace;
34 | }
35 |
36 | div.medq {
37 | font-size: 150%;
38 | line-height: normal;
39 | }
40 |
41 | .section .reveal .state-background {
42 | background: #000 none repeat scroll 0% 0%;
43 | background-color: #000;
44 | }
45 |
46 | .reveal code.r {
47 | background-color: #F8F8F8;
48 | font-size: 1.2em;
49 | }
50 |
51 | .reveal pre {
52 | width: 100%;
53 | }
--------------------------------------------------------------------------------
/slides/images/Daphnia_magna.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/Daphnia_magna.png
--------------------------------------------------------------------------------
/slides/images/addbasis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/addbasis.png
--------------------------------------------------------------------------------
/slides/images/animal_choice.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/animal_choice.png
--------------------------------------------------------------------------------
/slides/images/animal_codes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/animal_codes.png
--------------------------------------------------------------------------------
/slides/images/animal_functions.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/animal_functions.png -------------------------------------------------------------------------------- /slides/images/by.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/by.png -------------------------------------------------------------------------------- /slides/images/concurvity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/concurvity.png -------------------------------------------------------------------------------- /slides/images/count.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/count.jpg -------------------------------------------------------------------------------- /slides/images/cyclic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/cyclic.png -------------------------------------------------------------------------------- /slides/images/jenny_models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/jenny_models.png -------------------------------------------------------------------------------- /slides/images/mathematical_sobbing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/mathematical_sobbing.jpg -------------------------------------------------------------------------------- /slides/images/mgcv-inside.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/mgcv-inside.png -------------------------------------------------------------------------------- /slides/images/remlgcv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/remlgcv.png -------------------------------------------------------------------------------- /slides/images/rummy.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/rummy.gif -------------------------------------------------------------------------------- /slides/images/slope-aspect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/slope-aspect.png -------------------------------------------------------------------------------- /slides/images/soap-soap.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/soap-soap.png -------------------------------------------------------------------------------- /slides/images/soap-tprs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/soap-tprs.png -------------------------------------------------------------------------------- /slides/images/soap-truth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/soap-truth.png -------------------------------------------------------------------------------- /slides/images/spermcovars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/spermcovars.png -------------------------------------------------------------------------------- /slides/images/spermwhalecovs.R: -------------------------------------------------------------------------------- 1 | # grab sperm whale covariates and make a figure 2 | 3 | # only works on my computer, sorry! 4 | load("~/current/spatial-workshops/spermwhale-analysis/predgrid.RData") 5 | 6 | library(ggplot2) 7 | library(gridExtra) 8 | library(viridis) 9 | library(ggalt) 10 | library(sp) 11 | 12 | pp <- list() 13 | 14 | map_dat2 <- map_data("world", c("usa","canada")) 15 | 16 | 17 | 18 | locs <- map_dat2[,c("long","lat")] 19 | coordinates(locs) <- c("long", "lat") # set spatial coordinates 20 | crs.geo <- CRS("+proj=longlat +ellps=WGS84 +datum=WGS84") 21 | proj4string(locs) <- crs.geo 22 | crs.aea <- CRS("+proj=aea +lat_1=38 +lat_2=30 +lat_0=34 +lon_0=-73 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs +ellps=WGS84 +towgs84=0,0,0") 23 | locs.aea <- spTransform(locs, crs.aea) 24 | 25 | map_dat2$y <- locs.aea$lat 26 | map_dat2$x <- locs.aea$long 27 | 28 | for(value in c("Depth", "SST", "NPP")){ 29 | p <- ggplot(predgrid) + 30 | geom_tile(aes_string(x="x", y="y", fill=value)) + 31 | geom_polygon(aes(x=x, y=y, group = group), 32 | fill = "#1A9850", data=map_dat2) + 33 | scale_fill_viridis() + 34 | coord_equal(xlim=c(-1e6,0.75e6), ylim = c(-0.75e6,0.75e6)) + 35 | theme_minimal() 36 | pp[[value]] <- p 37 | } 38 | 39 | png(file="spermcovars.png", width=800, height=500) 40 | grid.arrange(pp[["Depth"]], pp[["SST"]], pp[["NPP"]], ncol=2) 41 | dev.off() 42 | 43 | -------------------------------------------------------------------------------- /slides/images/spotteddolphin_swfsc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/spotteddolphin_swfsc.jpg -------------------------------------------------------------------------------- /slides/images/thanks_for_all_the_fish.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/thanks_for_all_the_fish.gif -------------------------------------------------------------------------------- /slides/images/tina-modelling.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/images/tina-modelling.png -------------------------------------------------------------------------------- /slides/wiggly.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dill/mgcv-workshop/9a23801d4e149cab1e7332a9ca0a3c0ecd6cb705/slides/wiggly.gif -------------------------------------------------------------------------------- /vis.concurvity.R: -------------------------------------------------------------------------------- 1 | # visualise concurvity between terms in a GAM 2 | # David L Miller 2015, MIT license 3 | 4 | # arguments: 5 | # b -- a fitted gam 6 | # type -- concurvity measure to plot, see ?concurvity 7 | 8 | vis.concurvity <- function(b, type="estimate"){ 9 | cc <- concurvity(b, full=FALSE)[[type]] 10 | 11 | diag(cc) <- NA 12 | cc[lower.tri(cc)]<-NA 13 | 14 | layout(matrix(1:2, ncol=2), widths=c(5,1)) 15 | opar <- par(mar=c(5, 6, 5, 0) + 0.1) 16 | # main plot 17 | image(z=cc, x=1:ncol(cc), y=1:nrow(cc), ylab="", xlab="", 18 | axes=FALSE, asp=1, zlim=c(0,1)) 19 | axis(1, at=1:ncol(cc), labels = colnames(cc), las=2) 20 | axis(2, at=1:nrow(cc), labels = rownames(cc), las=2) 21 | # legend 22 | opar <- par(mar=c(5, 0, 4, 3) + 0.1) 23 | image(t(matrix(rep(seq(0, 1, len=100), 2), ncol=2)), 24 | x=1:3, y=1:101, zlim=c(0,1), axes=FALSE, xlab="", ylab="") 25 | axis(4, at=seq(1,101,len=5), labels = round(seq(0,1,len=5),1), las=2) 26 | par(opar) 27 | } 28 | --------------------------------------------------------------------------------