├── .gitignore
├── Code
│   ├── 0_install_packgs.R
│   ├── Session_1.R
│   ├── Session_1.pdf
│   ├── Session_2
│   │   └── Session_2_extract.R
│   ├── Session_2_exerc.R
│   ├── Session_3
│   │   └── Session_3_extract.R
│   ├── Session_4
│   │   └── Session_4_extr.R
│   ├── Session_5
│   │   └── Session_5_extr.R
│   ├── Session_5_exerc.R
│   ├── Session_6
│   │   ├── Session_6_extr.R
│   │   └── glm_explorer
│   │       ├── README.md
│   │       ├── exercises.md
│   │       ├── functions.R
│   │       ├── introduction.md
│   │       ├── server.R
│   │       ├── shiny.Rproj
│   │       └── ui.R
│   ├── Session_6_7_exerc.R
│   ├── Session_7
│   │   ├── Session_7_extr.R
│   │   └── data_EC.csv
│   ├── Session_8
│   │   └── Session_8_extr.R
│   ├── Session_8_exerc.R
│   ├── Session_9
│   │   └── Session_9_extr.R
│   └── Session_9_exerc.R
├── Data
│   ├── OstraMRegS400JB.txt
│   ├── envtrain.csv
│   └── installed_pkgs.txt
├── LICENSE
├── Literature_commented.pdf
├── README.md
├── Schedule.pdf
└── Slides
    ├── 0_Definitions.pdf
    ├── 1_for_extern.pdf
    ├── 1_for_internal_students.pdf
    ├── 2.pdf
    ├── 3.pdf
    ├── 4.pdf
    ├── 5.pdf
    ├── 6.pdf
    ├── 7.pdf
    ├── 8.pdf
    └── 9.pdf

/.gitignore:
--------------------------------------------------------------------------------
 1 | Code/.Rhistory
 2 | Code/0_a_prep_script.R
 3 | Slides/0_Definitions.Rmd
 4 | Slides/.Rhistory
 5 | .gitignore
 6 | Code/Session_2/*
 7 | Code/Session_3/*
 8 | Code/Session_4/*
 9 | Code/Session_5/*
10 | Code/Session_6/*
11 | Code/Session_6/glm_explorer/*
12 | Code/Session_7/*
13 | Code/Session_8/*
14 | Code/Session_9/*
15 | Code/.Rapp.history
16 | Data/.Rapp.history
17 | # exceptions
18 | !Code/Session_2_exerc.R
19 | !Code/Session_2/Session_2_extract.R
20 | !Code/Session_3/Session_3_extract.R
21 | !Code/Session_4/Session_4_extr.R
22 | !Code/Session_5/Session_5_extr.R
23 | !Code/Session_6/Session_6_extr.R
24 | !Code/Session_6/glm_explorer/exercises.md
25 | !Code/Session_6/glm_explorer/functions.R
26 | !Code/Session_6/glm_explorer/introduction.md
27 | !Code/Session_6/glm_explorer/README.md
28 | !Code/Session_6/glm_explorer/server.R
29 | !Code/Session_6/glm_explorer/shiny.Rproj
30 | !Code/Session_6/glm_explorer/ui.R
31 | !Code/Session_7/Session_7_extr.R
32 | !Code/Session_7/data_EC.csv
33 | !Code/Session_8/Session_8_extr.R
34 | !Code/Session_9/Session_9_extr.R
35 | .DS_Store
36 | .Rproj.user
--------------------------------------------------------------------------------
/Code/0_install_packgs.R:
--------------------------------------------------------------------------------
1 | pkgs <- readLines(file("https://raw.githubusercontent.com/rbslandau/data_analysis/master/Data/installed_pkgs.txt", "r"))
2 | str(pkgs)
3 | install.packages(pkgs)
--------------------------------------------------------------------------------
/Code/Session_1.R:
--------------------------------------------------------------------------------
  1 | #' ---
  2 | #' title: "Session 1: First steps in R and data exploration"
  3 | #' author: "Ralf B. Schäfer"
  4 | #' date: "`r Sys.Date()`"
  5 | #' output: pdf_document
  6 | #' urlcolor: blue
  7 | #' bibliography: /Users/ralfs/Literatur/Natural_Sciences.bib
  8 | #'
  9 | #' ---
 10 | #'
 11 | #'
 12 | #' # First steps
 13 | #' Before we begin, some notes on notation and terminology: **Bold statements** mostly refer to little exercises or questions; R functions
 14 | #' inside the text look like this: `function()` and [this is a URL](https://www.uni-koblenz-landau.de/en/campus-landau/faculty7/environmental-sciences/landscape-ecology/teaching/r-statistics)
 15 | #' that brings you to the course website.
R code chunks are formatted as follows
 16 | #' (This code does not run!):
 17 | #+ eval=FALSE
 18 | object_assignment <- thisfunction(argument = does_not_work$it_only_serves_illustration)
 19 | # Comments within R code are coloured in brown.
 20 | #' If you write your own R code, consider the [style guide by Hadley Wickham](http://adv-r.had.co.nz/Style.html) as well as using the [styler package](https://cran.r-project.org/web/packages/styler/index.html)
 21 | #' on [files or on code through the RStudio Addins](https://github.com/r-lib/styler).
 22 | #' Although called "First steps", we assume that you have followed short tutorials on installing and using R/R Studio and on the very
 23 | #' basics of R. We recommend the free datacamp course [Introduction to R](https://www.datacamp.com/courses/free-introduction-to-r) and the first free chapters of this
 24 | #' [Online book](https://rafalab.github.io/dsbook/). Moreover, a brief introduction to RStudio is available in [this blog](https://www.statsandr.com/blog/how-to-install-r-and-rstudio/).
 25 | #'
 26 | #' Whenever you start working with R, you should set a working directory. This is the directory where R, unless
 27 | #' specified otherwise, will look for files to load or will save files. The working directory can be set through the
 28 | #' R Studio [Graphical User Interface (GUI)](https://rafalab.github.io/dsbook/getting-started.html#rstudio):
 29 | #' Go to Session –> Set Working Directory –> Choose Directory... . However,
 30 | #' you can also do this from the command line using the command `setwd()`:
 31 | setwd("~/Gitprojects/Teaching/Data_analysis/Code")
 32 | #' If you run the script, you have to replace my file path with a path to a working directory
 33 | #' on your local machine. To simplify the identification of your path, you can use the
 34 | #' following function:
 35 | #+ eval=FALSE
 36 | file.choose()
 37 | #' and select a file in your desired working directory. Subsequently, copy the path
 38 | #' **without** the file reference into the `setwd()` function. Make sure
 39 | #' to enclose the path with double quotation marks. Finally, we set an option to reduce the amount of output
 40 | #' that is printed to save some space in this document:
 41 | options(max.print = 50, width = 80)
 42 | #' For an overview of the options available to influence computations and display, check the help:
 43 | #+ eval=FALSE
 44 | ?options
 45 | #'
 46 | #' We start by importing a data file from the university website (obviously, you require an internet connection to
 47 | #' successfully run the command). To store the data after import, we assign (<-) the data to an object (*data*).
 48 | link <- "http://www.uni-koblenz-landau.de/en/campus-landau/faculty7/environmental-sciences/landscape-ecology/Teaching/possum.csv"
 49 | # link is not properly displayed in knitted document, but you can copy it from
 50 | # the Session_1.R document (which you should use anyway to run the R code yourself)
 51 | data <- read.table(link)
 52 | #' The object *data* is now shown in the *Environment pane* in R Studio.
 53 | #' For interested students: **What happens if you run the `read.table()` function without assignment to an object?**
 54 | #' *Hint: Inspect the Console pane in R Studio.*
 55 | #'
 56 | #' Useful functions to inspect imported data are `head()` and `str()`.
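#' As an aside beyond the course material (a suggestion, not required for what follows):
#' if you already use the tidyverse, `glimpse()` from the dplyr package, which is
#' introduced later in this session, offers a compact alternative to `str()`:
#+ eval=FALSE
dplyr::glimpse(data)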
 57 | #' **Try what `head()` and `str()` do by running them on the imported data and by calling the related help pages
 58 | #' as shown below:**
 59 | #+ eval=FALSE
 60 | head(data)
 61 | ?head
 62 | str(data)
 63 | ?str
 64 |
 65 | #' The imported data look mixed up. This happens when the arguments in the import function have not been set properly.
 66 | #' We can set different arguments including:
 67 | #'
 68 | #' * header: specify if the data have a header (TRUE) or not (FALSE)
 69 | #' * sep: specify the column separator (in this case ";")
 70 | #' * dec: specify the character used for the decimal point (in this case "."). In Germany and several other countries,
 71 | #' the character for the decimal point is ",", which often leads to trouble (unless you never exchange
 72 | #' files with others). In academic contexts, I generally recommend setting all software products such as spreadsheet programs
 73 | #' (e.g. Microsoft Excel, LibreOffice Calc) [to locale "English (Great Britain)"](https://help.libreoffice.org/Common/Languages#Locale_setting)
 74 | #' or another locale related to an English-speaking country.
 75 | #' * row.names: specify row names, a variable that provides row names (our data set actually contains a variable
 76 | #' *row.names*, but we ignore that) or import without row names (= NULL)
 77 | #'
 78 | #' Call `?read.table` for further details and options.
 79 | #' Misspecification of import arguments is one of the most frequent errors of beginners.
 80 | #' We run the import function again, now with the arguments properly specified.
 81 | pos_dat <- read.table(url(link), dec = ".", sep = ";", header = TRUE, row.names = NULL)
 82 | # close connection after import
 83 | close(url(link))
 84 | #' For further details on how to import data refer to [this tutorial](https://www.datacamp.com/community/tutorials/r-data-import-tutorial).
 85 | #'
 86 | #' We inspect the imported data again.
 87 | # show first 6 rows of dataframe
 88 | head(pos_dat)
 89 | # looks ok
 90 | str(pos_dat)
 91 | # structure of object also looks meaningful now
 92 | # data.frame with 15 variables and 104 rows (obs. = observations)
 93 | # $ indicates variables with type of data (i.e. Factor: categorical, chr: character,
 94 | # int: integer = natural number, num: numeric = real number)
 95 |
 96 | #' The file was taken from the R package [DAAG](https://cran.r-project.org/web/packages/DAAG/index.html) and contains information on
 97 | #' [possums](https://en.wikipedia.org/wiki/Phalangeridae#/media/File:Brushtail_possum.jpg) that was published in @lindenmayerMorphologicalVariationPopulations1995. To access an R package, we have
 98 | #' to load and attach a package with the function `library()`:
 99 | library(DAAG)
100 | #' If you do not have the package installed, you need to install the package via:
101 | # install.packages("DAAG") # (remove hashtag in this case)
102 | #' If loaded and attached, we can subsequently load the possum data, which we have imported from a file above,
103 | #' quite conveniently (e.g. without the need to specify separator or decimal point) from the package:
104 | data(possum)
105 | # display first rows
106 | head(possum)
107 | #' The [metadata](https://en.wikipedia.org/wiki/Metadata) for data that are in a package can be called via help. **Call the help for the possum data and study the metadata.**
108 | #'
109 | #' You can also save data that are in the session workspace for later use.
For example, if you wanted to save the
110 | #' imported possum data for later use (ignoring that we have them in the package) on your local hard drive,
111 | #' execute the following code:
112 | #+ eval=FALSE
113 | save(pos_dat, file = "Possum.Rdata")
114 | #' Of course, you can provide a different path to the file argument in the `save()` function.
115 | #' **Where has the file been saved?** *Hint: If you can't find the file, run the following function:*
116 | #+ eval=FALSE
117 | getwd()
118 | #' If saved, you can load the data into a new R session with:
119 | #+ eval=FALSE
120 | load("Possum.Rdata")
121 | # Works only if file is in working directory!
122 |
123 | #' # Data handling
124 | #' Now that we have properly imported the data into R, we can explore and analyse the data. However, before data exploration and analysis,
125 | #' the data often require (re-)organisation including joining data sets, transformation and subsetting. Here, we focus on
126 | #' subsetting based on conditions. But let us start with some basics. To return the column names (i.e. variable names) call:
127 | names(pos_dat)
128 | #' or:
129 | colnames(pos_dat)
130 | #' The function `colnames()`, in contrast to `names()`, also works for matrices.
131 | #' If we want to access variables in a dataframe, we can do this as follows:
132 | pos_dat$totlngth
133 | # Displays the data stored in the column totlngth (total length of a possum)
134 | #' Generally, we can access parts of objects including vectors, matrices, data.frames and lists with square brackets:
135 | # Select column via name
136 | pos_dat[ , "totlngth"]
137 | #' **If you assign the resulting data to a new object, of what class (e.g. list, vector, matrix) is the resulting object?**
138 | #'
139 | #' We can also select rows and columns via column and row numbers:
140 | # Select rows 1 to 3 of columns 5 and 6
141 | pos_dat[1:3, 5:6]
142 | # Select rows 1, 3 and 4 of columns 7 and 9
143 | pos_dat[c(1,3,4), c(7,9)]
144 | #' If we want to store the selected rows and columns, we can simply assign them to a new object:
145 | #+ eval=FALSE
146 | new_obj <- pos_dat[c(1,3,4), c(7,9)]
147 | #'
148 | #' Often we want to subset data based on conditions. If we apply a condition to a vector, we obtain a logical (i.e. TRUE/FALSE) vector:
149 | pos_dat$totlngth > 95
150 | # TRUE: condition met, FALSE: condition not met
151 | #' To use a condition to select data, we apply the logical vector to an object (e.g. vector or dataframe):
152 | log_vector <- pos_dat$totlngth > 95
153 | pos_dat$totlngth[log_vector]
154 | # Subset variable directly without storing logical vector in object
155 | pos_dat$totlngth[pos_dat$totlngth > 95]
156 | # Different way to do this
157 | pos_dat[pos_dat$totlngth > 95, "totlngth"]
158 | # Subset dataframe
159 | pos_dat[pos_dat$totlngth > 95, ]
160 | #' Values *smaller than or equal to* a threshold are queried in R via `<=`, values *larger than or equal to* a threshold
161 | #' via `>=`. Similarly, `==` means *equal*,
162 | #' and `!=` means *not equal*. We exemplify this by querying selected variables conditioned
163 | #' on the sex of the possums:
164 | # Select male possums
165 | pos_dat[pos_dat$sex == "m", c(5,7:9)]
166 | # Select female possums
167 | pos_dat[pos_dat$sex == "f", c(5,7:9)]
168 | # Select female possums as those not male
169 | pos_dat[pos_dat$sex != "m", c(5,7:9)]
170 | #' Sometimes it is necessary to know the row numbers that meet a condition.
These can be queried using the `which()` function:
171 | which(pos_dat$totlngth > 95)
172 | #' In every scripting or programming language, you often have multiple ways to reach a result.
173 | #' **Try yourself: Store the vector resulting from the 'which' function and use it as a condition for subsetting the dataframe pos_dat**.
174 | #'
175 | #' Of course, you can also combine conditions (& = logical AND, | = logical OR) to subset data:
176 | # selects all possums that are male and larger than 95 (cm)
177 | pos_dat[pos_dat$sex == "m" & pos_dat$totlngth > 95, ] # AND
178 | # selects all possums that are male or larger than 95 (cm)
179 | pos_dat[pos_dat$sex == "m" | pos_dat$totlngth > 95, ] # OR
180 |
181 | #' The combination of conditions looks complicated. The [dplyr package](https://cran.r-project.org/web/packages/dplyr/index.html) provides functions
182 | #' that are intended to simplify such operations on dataframes. We only outline a few examples for the application of dplyr functions;
183 | #' please refer to a [blog post](https://datascienceplus.com/getting-started-with-dplyr-in-r-using-titanic-dataset/) and the
184 | #' course material on OpenOLAT for more extensive tutorials.
185 | # load dplyr library
186 | library(dplyr)
187 | # Select variables with the select function
188 | # First argument is data set followed by the variable names
189 | select(pos_dat, totlngth, sex, skullw)
190 | # Note: No need for quotation marks.
191 | # Select rows with the filter function
192 | filter(pos_dat, totlngth > 95)
193 | filter(pos_dat, sex == "m")
194 | # Combine conditions
195 | filter(pos_dat, totlngth > 95 & sex == "m")
196 | #' We can also combine `select()` and `filter()`:
197 | select(filter(pos_dat, sex == "m"), totlngth, sex, skullw)
198 | #' In this example, the output of `filter()` takes the position of
199 | #' the *data* argument in the `select()` function. A particular strength of dplyr is the use of
200 | #' [pipelines](https://en.wikipedia.org/wiki/Pipeline_(Unix)), defined in the R context as a sequence of functions, where the output of one
201 | #' function feeds directly into the next function as input. This can also enhance readability. Consider for example the previous code
202 | #' (combination of `select()` and `filter()`) rewritten as a pipe (pipe operator: %>%):
203 | pos_dat %>%
204 |   filter(sex == "m") %>%
205 |   select(totlngth, sex, skullw)
206 | #' dplyr also provides a useful function (`arrange()`) to sort a dataframe according to selected variables:
207 | pos_dat %>%
208 |   arrange(totlngth)
209 | # now in descending order
210 | pos_dat %>%
211 |   arrange(desc(totlngth))
212 | #' Compare this to the sorting of dataframes in base R, which is much less elegant:
213 | ord_1 <- order(pos_dat[ , "totlngth"])
214 | pos_dat[ord_1, ]
215 | #' We can also easily sort by multiple columns and afterwards select a few variables:
216 | pos_dat %>%
217 |   arrange(age, desc(totlngth)) %>%
218 |   select(age, totlngth, belly)
219 | #' Another useful function is `rename()`:
220 | pos_dat %>%
221 |   rename(total_length = totlngth)
222 | # We inspect the original dataframe
223 | head(pos_dat)
224 | #' Why is the original name still in the dataframe?
**What would you need to do to keep the
225 | #' changed name?**
226 | #'
227 | #' Finally, the mutate function allows us to create new columns, for example, as
228 | #' a function of existing columns:
229 | pos_dat %>%
230 |   mutate(Sum_hdln_totlng = hdlngth + totlngth)
231 | # Combined with subsetting to a few columns
232 | pos_dat %>%
233 |   mutate(Sum_hdln_totlng = hdlngth + totlngth) %>%
234 |   select(sex, age, hdlngth, totlngth, Sum_hdln_totlng)
235 | #' Still, you would need to assign this to a new object to store the changes.
236 | #'
237 | #' # Data exploration
238 | #'
239 | #' After we have learnt how to process data, we explore the data that will be analysed in the next session. Although graphical tools are most suitable
240 | #' to obtain an overview of the data, the `summary()` function quickly provides information on potential outliers,
241 | #' missing values (*NA's*) and the range of data:
242 | # Reset max.print option to 120 to avoid that information is omitted
243 | options(max.print = 120)
244 | summary(pos_dat)
245 | #' For the categorical variables *Pop* and *sex* the function returns the number of cases per level. For numerical variables, the minimum, maximum,
246 | #' quartiles, median and mean are returned. In the following we use the Cleveland plot and boxplot to check for potential errors and outliers.
247 | #' Let us first look at a Cleveland plot:
248 | dotchart(pos_dat$totlngth)
249 | # Provides an overview, but plot would benefit from polishing.
250 | # Increase font size of labels and symbols
251 | par(cex = 1.4)
252 | # Check ?par for explanation and overview of other arguments
253 | dotchart(pos_dat$totlngth, cex.lab = 1.3,
254 |          xlab = "Total length [cm]", main = "Data overview")
255 | #' No outlier is visible. This is what an outlier would look like:
256 | #+ echo = FALSE
257 | totlng_outl <- c(pos_dat$totlngth, 830)
258 | dotchart(totlng_outl, cex.lab = 1.3,
259 |          xlab = "Total length [cm]", main = "Data overview")
260 | #' The observation at 830 is an extreme outlier. If you spot such an extreme difference from the remainder
261 | #' of the data, you should scrutinise the raw data, because an order of magnitude difference points
262 | #' to an error with a decimal point during data entry. Another useful tool that can be used for
263 | #' different purposes including checking for outliers is the boxplot (for a brief explanation and different types
264 | #' [visit the R graph gallery](https://www.r-graph-gallery.com/boxplot/)).
265 | boxplot(pos_dat$totlngth, las = 1, cex.lab = 1.3,
266 |         ylab = "Total length [cm]", main = "Data overview")
267 | # For the same variable with an added outlier
268 | boxplot(totlng_outl, las = 1, cex.lab = 1.3,
269 |         ylab = "Total length [cm]", main = "Data overview")
270 | #' You can save a figure from the graphics device using the graphical user interface (*Export* in R Studio). Direct export of
271 | #' a figure without plotting in R Studio can be done using specific functions such as `jpeg()`,
272 | #' `png()` or `pdf()`.
For details see `?jpeg` and
273 | #' `?pdf`.
274 | #+ eval = FALSE
275 | # example for directly exporting the boxplot to a file
276 | jpeg("Boxplot.jpeg", quality = 100)
277 | par(cex = 1.4)
278 | boxplot(pos_dat$totlngth, las = 1, cex.lab = 1.3,
279 |         ylab = "Total length [cm]", main = "Data overview")
280 | dev.off()
281 | # Switches off the device (here: saves content to file in working directory)
282 | #' Although the boxplot is widely used and you should be familiar with its interpretation, interesting alternatives such as the beanplot
283 | #' have been introduced [@kampstraBeanplotBoxplotAlternative2008]. The related paper is [available via open access](http://www.jstatsoft.org/v28/c01/).
284 | #' Refer to the paper and the lecture for an explanation of the plot. We first load and attach the package:
285 | library(beanplot)
286 | #' Now we create a beanplot for the total length of possums:
287 | beanplot(pos_dat$totlngth)
288 | # provides single observations (lines), mean (thick line) and
289 | # displays the estimated probability density distribution (black polygon)
290 | #' Again, a few additional arguments improve the quality of the figure. It is quite handy
291 | #' that these arguments are the same as for the boxplot or dotchart:
292 | beanplot(pos_dat$totlngth, las = 1, cex.lab = 1.3,
293 |          ylab = "Total length [cm]", main = "Data overview")
294 | #' Both the boxplot and beanplot are practical tools to compare the distribution among variables or groups. Several statistical tests
295 | #' for between-group differences (e.g. *t*-test or analysis of variance) require that the variance is homogeneous across groups.
296 | #' This homogeneity of variance translates to a similar spread of the data around the mean. For example, we inspect the distribution
297 | #' of possums from Victoria and from other states with a conditional boxplot:
298 | boxplot(totlngth ~ Pop, data = pos_dat, las = 1, cex.lab = 1.3,
299 |         ylab = "Total length [cm]", xlab = "Possum population")
300 | #' The *tilde* sign ~ is used in R to formulate statistical models, where the response variable(s) are on the left-hand side
301 | #' and the explanatory variables/predictors on the right-hand side. Here, the notation results in the plotting of the responses
302 | #' per factor level of the variable *Pop*.
303 | #'
304 | #' To ease visual comparison of the spread around the median, we put both groups on the same median. This can be done with the base
305 | #' R function `tapply()`, which applies a function to each group of data defined by the levels of a factor.
306 | #' We calculate the median for each group and assign it to the object *med*:
307 | med <- tapply(pos_dat$totlngth, pos_dat$Pop, median)
308 | # first argument: data, second argument: factor, third argument: function
309 | # to be applied to each group defined by the factor
310 | med
311 | #' The same calculation can be done using dplyr functions. We need two new functions, `group_by()` and
312 | #' `summarise()`, to do this elegantly. **Check what is done by calling the help for the new functions and
313 | #' by sequential execution of the code below (i.e.
first execute the code until the second %>% (not included), then until the third %>%
314 | #' (not included)):**
315 | pos_dat %>%
316 |   group_by(Pop) %>%
317 |   select(totlngth, Pop) %>%
318 |   summarise(med = median(totlngth))
319 | #' Note that a different type of object is produced using dplyr than when using the base R function above:
320 | #' a so-called [tibble](https://cran.r-project.org/web/packages/tibble/vignettes/tibble.html).
321 | #'
322 | #' Anyway, we proceed within the framework of base R and now subtract the respective median from each observation of the groups.
323 | w <- pos_dat$totlngth - med[pos_dat$Pop]
324 | #' **Call the object *med* and check how it is modified by *med[pos_dat$Pop]*.**
325 | #'
326 | #' The resulting vector *w* gives the distance of each observation from the median of its group.
327 | #' This is equivalent to setting the median of both data sets to 0. We plot again:
328 | boxplot(w ~ Pop, data = pos_dat, las = 1, cex.lab = 1.3,
329 |         ylab = "Distance to median [cm]", xlab = "Possum population")
330 | #' The variance looks slightly higher in Victoria, but such a small difference would be irrelevant (i.e. it would not violate
331 | #' the assumption of equal variance of statistical tests). If this were a serious analysis, we would also need to account
332 | #' for the sex of possums as this is known to influence the body length. Ignoring the
333 | #' sex in the analysis could lead to flawed inference if for example the proportion of females differs between the
334 | #' two sample populations *Pop* (see the Chapter *Sample and Population* in the document *Key terms and concepts*, in particular
335 | #' the explanation of the *Simpson's paradox*). Indeed, the proportion of females differs strongly between possums measured
336 | #' in Victoria (*Vic*) and other states (*other*):
337 | pos_dat %>%
338 |   select(sex, Pop) %>%
339 |   group_by(Pop, sex) %>%
340 |   count
341 | # more than 50% females in Victoria, but only 1/3 in other states.
342 |
343 | #' A question you may still have is: *When should we be concerned about a difference
344 | #' in the variance?* We discuss this question in the context of the respective method as there is no general answer.
345 | #'
346 | #' We can also use the beanplot for comparison:
347 | beanplot(w ~ Pop, data = pos_dat, las = 1, cex.lab = 1.3,
348 |          ylab = "Distance to median [cm]", xlab = "Possum population")
349 |
350 | #' The Victorian population is less symmetric than the population from other states, presumably due to the higher proportion of
351 | #' females in the population (superimposition of two distributions). Note that we can again use the same arguments for the boxplot
352 | #' and beanplot function. Quite convenient, isn't it? However,
353 | #' the beanplot displays the mean instead of the median. **Try to produce the same plot with the mean subtracted from each observation!** *Hint:
354 | #' the function to calculate the mean is* `mean()`.
355 | #'
356 | #' Another assumption of several data analysis tools is that the data are normally distributed. This can be checked using
357 | #' the so-called *QQ-plot*, which plots theoretical quantiles from a normal distribution against the sample quantiles
358 | #' (the definition and interpretation of quantiles is explained in the lecture). If the data originate from a
359 | #' normal distribution, the points should approximately fall on a 1:1 line. For statistical tests focusing on
360 | #' between-group differences, the assumption would need to be checked for each group.
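#' As a minimal sketch of such a per-group check (an addition to the original script,
#' using the *Pop* factor of the possum data), one could draw one QQ-plot per group:
#+ eval=FALSE
par(mfrow = c(1, 2))
for (lev in unique(pos_dat$Pop)) {
  # QQ-plot for the observations of one group
  qqnorm(pos_dat$totlngth[pos_dat$Pop == lev], main = lev)
  qqline(pos_dat$totlngth[pos_dat$Pop == lev])
}
par(mfrow = c(1, 1))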
Here, we ignore any potential grouping
361 | #' structure of the data and exemplify the QQ-plot for the variable total length:
362 | # Quantile-quantile plot
363 | qqnorm(pos_dat$totlngth)
364 | # We add a line that goes through the first and third quartiles,
365 | # which helps to spot deviations.
366 | qqline(pos_dat$totlngth)
367 | #' The deviations here are minor and can be ignored. Again, you may ask: *When should we be concerned about a deviation?*
368 | #' A helpful function in this context is `qreference()` provided
369 | #' in the package [DAAG](https://cran.r-project.org/web/packages/DAAG/index.html), which relates to the book
370 | #' by Maindonald & Braun [-@maindonaldDataAnalysisGraphics2010]. It produces reference plots to aid in
371 | #' the evaluation of whether the data are normally distributed. The reference plots are based on sample quantiles from
372 | #' data that have been generated through random sampling from a normal distribution with the same parameters (sample mean and sample
373 | #' variance) and sample size as the data under evaluation.
374 |
375 | library(DAAG)
376 | qreference(pos_dat$totlngth, nrep = 8, xlab = "Theoretical quantiles")
377 | # nrep controls the number of reference plots
378 | #' The reference plots give an idea of how data randomly sampled from a normal distribution can deviate from the theoretical
379 | #' normal distribution. Clearly, the data (blue) do not look conspicuous when compared to the reference plots (purple).
380 | #'
381 | #' A similar approach is to plot the QQ-plot for the data among reference QQ-plots without indicating in the figure
382 | #' which of the QQ-plots relates to the data. Unless the data deviate strongly from a normal distribution, the QQ-plot
383 | #' related to the data is presumably indistinguishable from the reference plots. Hence, if you cannot identify the
384 | #' QQ-plot related to the data, there is no need for concern regarding a potential deviation from the normal distribution.
385 | #' The code for this approach is provided in a [blog](https://biologyforfun.wordpress.com/2014/04/16/checking-glm-model-assumptions-in-r/).
386 | #'
387 | #' What would a strong deviation look like? We discuss two examples. First, we draw samples from a binomial distribution:
388 | set.seed(2018)
389 | # set.seed is used for reproducibility
390 | # i.e. the function returns the same random numbers
391 | x <- rbinom(n = 15, size = 5, p = 0.6)
392 | # n = sample size, size = number of trials
393 | # p = probability of success
394 | #' We use the QQ plot to check for normal distribution:
395 | qqnorm(x)
396 | # Strong deviation
397 | qqline(x)
398 | # The deviation is particularly obvious when compared to reference plots
399 | qreference(x, nrep = 8, xlab = "Theoretical quantiles")
400 | #' Even if the data deviated less from the 1:1 line, the data clearly come from a discrete distribution, whereas the normal distribution
401 | #' is continuous. In the second example, we draw samples from a [uniform distribution](https://en.wikipedia.org/wiki/Uniform_distribution_(continuous)).
402 | set.seed(2018)
403 | y <- runif(50, min = 0, max = 20)
404 | #' Again, we use the QQ plot to check for normal distribution:
405 | qqnorm(y)
406 | qqline(y)
407 | #' A strong deviation is visible, particularly for the lower and upper quantiles. This impression is confirmed when
408 | #' comparing the QQ-plot for the data to reference QQ-plots. A much stronger curvature is visible.
409 | qreference(y, nrep = 8, xlab = "Theoretical quantiles")
410 |
411 | #' Another useful tool is the histogram. It can be used to check normality of the data, symmetry and whether the data are
412 | #' bi- or multi-modal. The histogram displays the frequency with which the values of the data fall into, typically
413 | #' same-sized, intervals. For example, we plot a histogram for the total length of the Victorian possum population:
414 | # extract Victorian possum data for variable total length
415 | Vic_pos <- pos_dat %>%
416 |   filter(Pop == "Vic") %>%
417 |   select(totlngth)
418 | # convert to vector, otherwise histogram function throws an error
419 | Vic_pos2 <- as.vector(t(Vic_pos))
420 | # create histogram
421 | hist(Vic_pos2)
422 | # Intervals have a width of 5
423 | #' The histogram shows that more than 15 possums had a length between 85 and 90 cm, whereas 11 and 12 possums had a length
424 | #' between 80 and 85 cm and between 90 and 95 cm, respectively. Instead of absolute frequencies, the histogram can also be used
425 | #' to display relative frequencies, i.e. the probability densities:
426 | hist(Vic_pos2, probability = TRUE)
427 | #' Note that different intervals affect the appearance and potentially the interpretation of a histogram:
428 | # We use the data from the non-Victorian possum population.
429 | nVic_pos <- pos_dat %>%
430 |   filter(Pop == "other") %>%
431 |   select(totlngth)
432 | # convert to vector
433 | nVic_pos2 <- as.vector(t(nVic_pos))
434 | # plot histogram, manually provide breaks
435 | hist(nVic_pos2, breaks = 72.5 + (0:5) * 5,
436 |      xlim = c(70, 105), ylim = c(0, 26),
437 |      xlab = "Total length (cm)", main = "", las = 1)
438 | # Plot suggests a normal distribution
439 | # Provide different set of breaks
440 | hist(nVic_pos2, breaks = 75 + (0:5) * 5,
441 |      xlim = c(70, 105), ylim = c(0, 26),
442 |      xlab = "Total length (cm)", main = "", las = 1)
443 | # With different break points between the intervals, the data look rather skewed
444 | #' The appearance and interpretation are also affected by the number of breaks:
445 | par(mfrow = c(2,2), las = 1)
446 | hist(nVic_pos2, breaks = 5, main = "5 breaks", xlab = "Total length (cm)")
447 | hist(nVic_pos2, breaks = 10, main = "10 breaks", xlab = "Total length (cm)")
448 | hist(nVic_pos2, breaks = 20, main = "20 breaks", xlab = "Total length (cm)")
449 | hist(nVic_pos2, breaks = 50, main = "50 breaks", xlab = "Total length (cm)")
450 | #' Adding density lines to histograms aids in reducing the effect of the number
451 | #' of breaks and of the position of the break points on the interpretation. The density lines are derived
452 | #' from the empirical density distribution of the data.
453 | # Derive density line
454 | dens <- density(nVic_pos2)
455 | # add to first histogram that suggested normal distribution
456 | # only works for relative frequency histogram, which requires
457 | # setting the argument probability = TRUE
458 | par(mfrow = c(1,2), las = 1)
459 | hist(nVic_pos2, breaks = 72.5 + (0:5) * 5,
460 |      xlim = c(70, 105), ylim = c(0, 0.11), probability = TRUE,
461 |      xlab = "Total length (cm)", main = "")
462 | lines(dens)
463 | # add to histogram for which data look rather skewed
464 | hist(nVic_pos2, breaks = 75 + (0:5) * 5,
465 |      xlim = c(70, 105), ylim = c(0, 0.11), probability = TRUE,
466 |      xlab = "Total length (cm)", main = "")
467 | lines(dens)
468 | #' The density lines are the same and confirm the approximate normal distribution of the data.
469 | #' Similarly, the density lines can alleviate the effect of the number of breaks on the interpretation,
470 | #' where too few or too many breaks may result in flawed conclusions.
471 | par(mfrow = c(1,1), las = 1)
472 | hist(nVic_pos2, breaks = 20,
473 |      xlim = c(70, 105), ylim = c(0, 0.2), probability = TRUE,
474 |      xlab = "Total length (cm)", main = "20 breaks")
475 | lines(dens)
476 | #' Setting break points manually or using a high number of breaks is particularly useful when the aim is to assess
477 | #' the frequency of distinct values such as zeros.
478 | #'
479 | #' Normality of the data can also be evaluated in a histogram by overlaying a density line from
480 | #' a theoretical normal probability distribution. To do this, we generate a normal distribution
481 | #' with the parameters (i.e. mean and variance) taken from the sample data.
482 | # calculate sample mean
483 | mean_samp <- mean(nVic_pos2)
484 | # calculate sample standard deviation
485 | sd_samp <- sd(nVic_pos2)
486 | # derive densities for normal distribution
487 | dens_norm <- dnorm(seq(70, 105, by = .5), mean = mean_samp, sd = sd_samp)
488 | # add to plot
489 | hist(nVic_pos2, breaks = 20,
490 |      xlim = c(70, 105), ylim = c(0, 0.2), probability = TRUE,
491 |      xlab = "Total length (cm)", main = "20 breaks", las = 1)
492 | lines(dens)
493 | lines(seq(70, 105, by = .5), dens_norm, col = "blue", lwd = 2)
494 | #' The blue line displays the normal distribution. A slight deviation of the sample
495 | #' distribution is visible, which is presumably due to ignoring the influence of sex
496 | #' on the length of possums.
497 | #'
498 | #' A range of plots that illustrate different deviations from normality are available on the
499 | #' [histogram Wikipedia page](https://en.wikipedia.org/wiki/Histogram): [right-skewed distribution](https://en.wikipedia.org/wiki/Histogram#/media/File:Skewed-right.png),
500 | #' [left-skewed distribution](https://en.wikipedia.org/wiki/Histogram#/media/File:Skewed-left.png),
501 | #' [bimodal distribution](https://en.wikipedia.org/wiki/Histogram#/media/File:Bimodal-histogram.png) and
502 | #' [multimodal distribution](https://en.wikipedia.org/wiki/Histogram#/media/File:Multimodal.png).
503 | #'
504 | #' Several other tools for exploratory analysis that have been mentioned in the lecture will be used
505 | #' and introduced in the context of specific methods of data analysis later in the course.
506 | #'
507 | #' You can render the Rmarkdown document related to this pdf by executing the following function:
508 | # rmarkdown::render("/Users/ralfs/Gitprojects/Teaching/Data_analysis/Code/Session_1.R")
509 | # You need to replace the file location with the path to your file location.
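#' As a closing sketch (an addition to the original script, with a hypothetical helper
#' name), the density-overlay check from above can be wrapped into a small function so
#' that it is easy to repeat for other variables:
#+ eval=FALSE
hist_norm_check <- function(x, breaks = 20) {
  # histogram of the data on the density scale
  hist(x, breaks = breaks, probability = TRUE,
       xlab = deparse(substitute(x)), main = "", las = 1)
  # empirical density of the data
  lines(density(x))
  # theoretical normal density with the sample mean and standard deviation
  xs <- seq(min(x), max(x), length.out = 200)
  lines(xs, dnorm(xs, mean = mean(x), sd = sd(x)), col = "blue", lwd = 2)
}
hist_norm_check(nVic_pos2)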
510 | 511 | #' # References 512 | 513 | 514 | -------------------------------------------------------------------------------- /Code/Session_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schaeferRCOHR/Data_analysis/af20101c70a3c63386397141493b129b777bf752/Code/Session_1.pdf -------------------------------------------------------------------------------- /Code/Session_2/Session_2_extract.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=FALSE, context = "setup"----------------------------- 2 | library(learnr) 3 | library(sandwich) 4 | library(lmtest) 5 | library(vegan) 6 | data("varechem") 7 | knitr::opts_chunk$set(echo = FALSE) 8 | 9 | ## ----data, echo = TRUE, include = TRUE----------------------------------- 10 | data("varechem") 11 | 12 | ## ----data_inspec, exercise = TRUE---------------------------------------- 13 | ?varechem 14 | head() 15 | 16 | ## ----linmod, echo = TRUE, include = TRUE, context = "setup"-------------- 17 | mod_1 <- lm(S ~ K, data = varechem) 18 | 19 | ## ----linmod_plot, echo = TRUE, include = TRUE---------------------------- 20 | par(cex = 1.4, las = 1) 21 | plot(S ~ K, data = varechem, ylab = "S mg/kg", xlab = "K mg/kg") 22 | abline(mod_1, lwd = 2) 23 | 24 | ## ----linmod_sum, echo = TRUE, include = TRUE----------------------------- 25 | summary.lm(mod_1) 26 | 27 | ## ----matrix_mult-solution------------------------------------------------ 28 | # Create vector of 1s as intercept term 29 | inter <- rep(1, nrow(varechem)) 30 | inter 31 | # Combine intercept term with values from X (K) 32 | full <- cbind(inter, varechem$K) 33 | full 34 | 35 | # Implement the equation for the calculation of b in R: 36 | betas <- solve(t(full) %*% full) %*% (t(full) %*% varechem$S) 37 | betas 38 | 39 | ## ----matrix_mult_2, exercise = TRUE-------------------------------------- 40 | coef(mod_1) 41 | 42 | ## ----linmod_intervals, exercise=TRUE, exercise.eval = TRUE, exercise.lines = 4---- 43 | ci <- predict(mod_1, interval = "confidence", level = 0.95) 44 | pi <- predict(mod_1, interval = "prediction", level = 0.95) 45 | 46 | ## ----linmod_int_plot, echo = TRUE, include = TRUE------------------------ 47 | par(las = 1, cex = 1.8, mar = c(5,5,1,1)) 48 | plot(S ~ K, data = varechem, ylab = expression(paste("S (mg ", kg^-1,")")), xlab = expression(paste("K (mg ", kg^-1,")"))) 49 | abline(mod_1, lwd = 2) 50 | lines(sort(varechem$K), ci[order(varechem$K) ,2], lty = 2, col = 'blue', lwd = 2) 51 | lines(sort(varechem$K), ci[order(varechem$K) ,3], lty = 2, col = 'blue', lwd = 2) 52 | lines(sort(varechem$K), pi[order(varechem$K) ,2], lty = 2, col = 'red', lwd = 2) 53 | lines(sort(varechem$K), pi[order(varechem$K) ,3], lty = 2, col = 'red', lwd = 2) 54 | 55 | ## ----intervals_exer-hint-2----------------------------------------------- 56 | ci2 <- predict(mod_1, interval = "confidence", level = 0.68) 57 | par(las = 1, cex = 1.8, mar = c(5,5,1,1)) 58 | plot(S ~ K, data = varechem, ylab = expression(paste("S (mg ", kg^-1,")")), xlab = expression(paste("K (mg ", kg^-1,")"))) 59 | abline(mod_1, lwd = 2) 60 | lines(sort(varechem$K), ci2[order(varechem$K) ,2], lty = 2, col = 'blue', lwd = 2) 61 | lines(sort(varechem$K), ci2[order(varechem$K) ,3], lty = 2, col = 'blue', lwd = 2) 62 | 63 | ## ----summary_linmod, echo = TRUE, include = TRUE------------------------- 64 | summary(mod_1) 65 | 66 | ## ----anova_linmod, echo = TRUE, include = TRUE--------------------------- 67 | 
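# The ANOVA table for a simple regression partitions the variation in S into
# the part explained by K and the residual part; its F-test is equivalent to
# the t-test of the slope in summary(mod_1) (for a single predictor, F = t^2).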
anova(mod_1) 68 | 69 | ## ----linmod_diag_general, echo = TRUE, include = TRUE, eval = FALSE------ 70 | ## par(mfrow = c(1, 4)) 71 | ## plot(mod_1) 72 | 73 | ## ----linmod_diag_qq, echo = TRUE, include = TRUE------------------------- 74 | par(mfrow = c(1, 1)) 75 | plot(mod_1, which = 2) 76 | 77 | ## ----linmod_diag_qq2, echo = TRUE, include = TRUE------------------------ 78 | library(DAAG) 79 | qreference(rstandard(mod_1), nrep = 8, xlab = "Theoretical Quantiles") 80 | 81 | ## ----linmod_diag_assum, echo = TRUE, include = TRUE---------------------- 82 | par(mfrow = c(1,2)) 83 | plot(mod_1, which = c(1,3)) 84 | 85 | ## ----setup_2, include = FALSE, context = "setup"------------------------- 86 | set.seed(568) # this makes the example exactly reproducible 87 | # define parameters 88 | n_start <- rep(1:100, 2) 89 | n <- sort(n_start) 90 | b0 <- 0 91 | b1 <- 1 92 | sigma2_a <- n^1.5 93 | sigma2_b <- n^3 94 | err_l <- rnorm(n, mean = 0, sd = sqrt(sigma2_a)) 95 | err_h <- rnorm(n, mean = 0, sd = sqrt(sigma2_b)) 96 | err_n <- rnorm(n, mean = 0, sd = 2) 97 | # set up different models 98 | y_l <- b0 + b1 * n + err_l 99 | y_h <- b0 + b1 * n + err_h 100 | y_n <- b0 + b1 * n + err_n 101 | y_nl1 <- b1 * n + 0.5 * n^2 -0.04 * n^3 + err_n 102 | y_nl2 <- b1 * n + 0.5 * n^2 -0.005 * n^3 + err_n 103 | # model with linear model 104 | mod_incvar <- lm(y_l ~ n) 105 | mod_h <- lm(y_h ~ n) 106 | mod_n <- lm(y_n ~ n) 107 | mod_nl1 <- lm(y_nl1 ~ n) 108 | mod_nl2 <- lm(y_nl2 ~ n) 109 | 110 | ## ----linmod_assum_test1, include = TRUE, echo = FALSE-------------------- 111 | par(mfrow = c(1,3)) 112 | plot(mod_incvar, which = c(1:3)) 113 | 114 | ## ----linmod_assum_test2, include = TRUE, echo = FALSE-------------------- 115 | par(mfrow = c(1,3)) 116 | plot(mod_n, which = c(1:3)) 117 | 118 | ## ----linmod_assum_test3, include = TRUE, echo = FALSE-------------------- 119 | par(mfrow = c(1,3)) 120 | plot(mod_h, which = c(1:3)) 121 | 122 | ## ----linmod_assum_test4, include = TRUE, echo = FALSE-------------------- 123 | par(mfrow = c(1,3)) 124 | plot(mod_nl1, which = c(1:3)) 125 | 126 | ## ----linmod_assum_test5, include = TRUE, echo = FALSE-------------------- 127 | par(mfrow = c(1,3)) 128 | plot(mod_nl2, which = c(1:3)) 129 | 130 | ## ----linmod_hetero1, include = TRUE, echo = TRUE------------------------- 131 | plot(y_l ~ n, xlab = "Explanatory variable", ylab = "Response", las = 1) 132 | abline(mod_incvar) 133 | par(mfrow = c(1,3)) 134 | plot(mod_incvar, which = c(1:3)) 135 | 136 | ## ----linmod_hetero2, include = TRUE, echo = TRUE, context = "setup"------ 137 | library(sandwich) 138 | library(lmtest) 139 | # Calculate variance covariance matrix 140 | corse_lm <- vcovHC(mod_incvar) 141 | # Compare original and corrected standard errors 142 | coeftest(mod_incvar) 143 | coeftest(mod_incvar, vcov = corse_lm) 144 | 145 | ## ----heterosc_exer-solution---------------------------------------------- 146 | coeftest(mod_incvar, vcov = corse_lm) 147 | summary(mod_incvar) 148 | 149 | ## ----linmod_nonlin1, include = TRUE, echo = TRUE------------------------- 150 | mod_nl1 <- lm(y_nl1 ~ n) 151 | plot(y_nl1 ~ n, xlab = "Explanatory variable", ylab = "Response", las = 1) 152 | abline(mod_nl1, lwd = 2, col = "red") 153 | par(mfrow = c(1,3)) 154 | plot(mod_nl1, which = c(1:3)) 155 | summary(mod_nl1) 156 | 157 | ## ----linmod_nonlin2, exercise = TRUE------------------------------------- 158 | mod_nl1b <- lm(y_nl1 ~ n + I(n^2)) 159 | plot(y_nl1 ~ n, xlab = "Explanatory variable", ylab = "Response", las = 1) 160 | # Extract fitted values 
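# (predict() without a newdata argument returns the fitted values,
#  i.e. the same as fitted(mod_nl1b))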
161 | fit_nl1 <- predict(mod_nl1b) 162 | # Plot 163 | lines(n, fit_nl1, lwd = 2, col = "red") 164 | par(mfrow = c(1,3)) 165 | plot(mod_nl1b, which = c(1:3)) 166 | summary(mod_nl1b) 167 | 168 | ## ----linmod_nonlin3-solution--------------------------------------------- 169 | mod_nl1c <- lm(y_nl1 ~ n + I(n^2) + I(n^3)) 170 | plot(y_nl1 ~ n, xlab = "Explanatory variable", ylab = "Response", las = 1) 171 | # Extract fitted values 172 | fit_nl2 <- predict(mod_nl1c) 173 | # Plot 174 | lines(n, fit_nl2, lwd = 2, col = "red") 175 | par(mfrow = c(1,3)) 176 | plot(mod_nl1c, which = c(1:3)) 177 | summary(mod_nl1c) 178 | 179 | ## ----nonlin_corr, include = TRUE, echo = TRUE---------------------------- 180 | cor(n, n^2) 181 | 182 | ## ----linmod_cook, include = TRUE, echo = TRUE---------------------------- 183 | plot(mod_1, which = 5) 184 | 185 | ## ----linmod_all, include = TRUE, echo = TRUE----------------------------- 186 | par(mfrow = c(1, 3)) 187 | plot(mod_1) 188 | 189 | ## ----linmod_dfbetas, include = TRUE, echo = TRUE------------------------- 190 | # First display original coefficients 191 | coef(mod_1) 192 | # Inspect change in coefficients when removing the respective observation 193 | dfbeta(mod_1) 194 | 195 | ## ----linmod_incvareaveout, include = TRUE, echo = TRUE, context = "setup"---- 196 | rem_obs <- which(rownames(varechem) == "9") 197 | mod_2 <- lm(S ~ K, data = varechem[-rem_obs, ]) 198 | 199 | ## ----linmod_incvareaveout_exerc, exercise = TRUE------------------------- 200 | # Plot without observation 201 | plot(S ~ K, data = varechem[-rem_obs, ]) 202 | # Add point in different colour 203 | points(varechem$K[rem_obs], varechem$S[rem_obs], col = "red") 204 | # Plot both models 205 | abline() 206 | abline() 207 | # Compare summaries 208 | summary() 209 | summary() 210 | # The difference between the coefficients is reported with dfbeta 211 | dfbeta(mod_1)[rem_obs, ] 212 | 213 | ## ----linmod_outputform, echo = TRUE, include = TRUE---------------------- 214 | library(sjPlot) 215 | tab_model(mod_1) 216 | 217 | -------------------------------------------------------------------------------- /Code/Session_2_exerc.R: -------------------------------------------------------------------------------- 1 | ############################ 2 | # R exercise Session 2 # 3 | ############################ 4 | # Ralf B. Schäfer, 11.11.2019 5 | 6 | ############################## 7 | # Preparation for Exercise # 8 | ############################## 9 | 10 | # We load data from the university website 11 | read.table("http://www.uni-koblenz-landau.de/en/campus-landau/faculty7/environmental-sciences/landscape-ecology/Teaching/possum.csv") 12 | # Formatting does not look good 13 | 14 | # We have to specify different options: 15 | # header = TRUE 16 | # sep (for separator) = ";" 17 | # dec (for decimal point) = "." 18 | # We also omit the row.names. 
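# (row.names = NULL makes read.table number the rows instead of taking
#  row names from a column of the file)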
19 | # See ?read.table for the various options
20 | pos_dat <- read.table(url("http://www.uni-koblenz-landau.de/en/campus-landau/faculty7/environmental-sciences/landscape-ecology/Teaching/possum.csv"), dec = ".", sep = ";", header = TRUE, row.names = NULL)
21 | close(url("http://www.uni-koblenz-landau.de/en/campus-landau/faculty7/environmental-sciences/landscape-ecology/Teaching/possum.csv"))
22 |
23 | pos_dat
24 | # looks ok
25 | head(pos_dat)
26 | # first 6 rows (for long data.frames)
27 | str(pos_dat)
28 | # structure of object (data.frame with 14 variables and 104 rows, also class of variables)
29 |
30 | ###########################################
31 | # file information
32 | ###########################################
33 |
34 | # This file was taken from the DAAG package, which is maintained by John Maindonald
35 | # and W. John Braun, and contains information on possums that was published in:
36 | #
37 | # Lindenmayer, D. B., Viggers, K. L., Cunningham, R. B., and Donnelly, C. F. 1995.
38 | # Morphological variation among columns of the mountain brushtail possum,
39 | # Trichosurus caninus Ogilby (Phalangeridae: Marsupialia).
40 | # Australian Journal of Zoology 43: 449-458
41 | #
42 | # You can find more details on the variables in the DAAG package (if installed):
43 | # library(DAAG)
44 | # data(possum)
45 | # ?possum
46 |
47 | # You should save the file for later use on your local hard drive
48 | save(pos_dat, file = "Possum.Rdata")
49 | # The file will be saved in your current working directory (run getwd())
50 | # If you start a new session you can just load the saved data with:
51 | # load("Possum.Rdata")
52 | # (assuming that you have set the working directory to the directory with the file)
53 |
54 | # Finally, you could conventionally download the file to your local hard drive and read it from there:
55 | # read.csv2("/Users/ralfs/Downloads/Possum.csv")
56 | # would also work if adjusted to your path
57 | # read.csv and read.csv2 are different functions for reading data frames
58 | # that have predefined options for csv files
59 | # The most general function is read.table, where you have to define the format of your data
60 | # (header, separator, decimal point, ...)
61 |
62 |
63 | #################################################################
64 | # Exercise: Chasing possums can be laborious.                   #
65 | # An easy way would be to predict their length from footprints. #
66 | # Would you recommend predicting the total length of the possum #
67 | # from traces of their feet in the snow?                        #
68 | # And is an invasive measurement of the skull width necessary   #
69 | # or can it be approximated with the head length?
# 70 | # Identify the type of research question, create linear # 71 | # regression models and run complete model diagnostics # 72 | ################################################################# 73 | -------------------------------------------------------------------------------- /Code/Session_3/Session_3_extract.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=FALSE------------------------------------------------ 2 | library(learnr) 3 | knitr::opts_chunk$set(echo = FALSE) 4 | 5 | ## ----load-data, echo = FALSE, include = FALSE, context = "setup"--------- 6 | pos_dat <- read.table(url("http://www.uni-koblenz-landau.de/en/campus-landau/faculty7/environmental-sciences/landscape-ecology/Teaching/possum.csv"), dec = ".", sep = ";", header = TRUE, row.names = NULL) 7 | 8 | ## ----beanplot, exercise = TRUE, exercise.eval = TRUE--------------------- 9 | library(beanplot) 10 | 11 | 12 | ## ----beanplot-hint-2----------------------------------------------------- 13 | mean_pops <- tapply(pos_dat$totlngth, pos_dat$Pop, mean) 14 | totlngth_sub <- pos_dat$totlngth - mean_pops[pos_dat$Pop] 15 | beanplot(totlngth_sub ~ Pop, data = pos_dat, las = 1, cex.lab = 1.3, 16 | ylab = "Distance to mean [cm]", xlab = "Possum population") 17 | 18 | ## ----explore-sd, echo = TRUE, include = TRUE----------------------------- 19 | summary(pos_dat$Pop) 20 | # Calculate standard deviation for each population 21 | sd_pops <- tapply(pos_dat$totlngth, pos_dat$Pop, sd) 22 | sd_pops 23 | # Calculate ratio 24 | sd_pops[2]/sd_pops[1] 25 | 26 | ## ----t_test_normaldist, echo = TRUE, include = TRUE---------------------- 27 | qqnorm(pos_dat$totlngth[pos_dat$Pop == "other" ], datax = TRUE) 28 | 29 | ## ----normaldist_ref-solution--------------------------------------------- 30 | library(DAAG) 31 | # set seed to make example reproducible 32 | set.seed(999) 33 | qreference(pos_dat$totlngth[pos_dat$Pop == "other" ], nrep = 8) 34 | 35 | ## ----normaldist-exec2-solution------------------------------------------- 36 | qqnorm(pos_dat$totlngth[pos_dat$Pop == "Vic" ], datax = TRUE) 37 | # set seed to make example reproducible 38 | set.seed(999) 39 | qreference(pos_dat$totlngth[pos_dat$Pop == "Vic" ], nrep = 8) 40 | 41 | ## ----normdist-altern, echo = TRUE, include = TRUE, context = "setup"----- 42 | set.seed(1246) 43 | # Set position of the real plot in a 3x3 panel 44 | s <- sample(1:12, size = 1) 45 | par(mfrow = c(2, 3)) 46 | # Plotting will be rowwise, i.e. by row 47 | for (i in 1:12) { 48 | if (i == s) { 49 | # the real plot 50 | qqnorm(pos_dat$totlngth[pos_dat$Pop == "Vic" ]) 51 | qqline(pos_dat$totlngth[pos_dat$Pop == "Vic" ]) 52 | } else { 53 | # draw values from normal distribution 54 | samp_dat <- rnorm(n = length(pos_dat$totlngth[pos_dat$Pop == "Vic" ]), 55 | mean = mean(pos_dat$totlngth[pos_dat$Pop == "Vic" ]), 56 | sd = sd(pos_dat$totlngth[pos_dat$Pop == "Vic" ])) 57 | qqnorm(samp_dat) 58 | qqline(samp_dat) 59 | } 60 | } 61 | 62 | ## ----normaldist-exec3-solution------------------------------------------- 63 | s 64 | 65 | ## ----extra_sect-shapiro, echo = TRUE, include = TRUE--------------------- 66 | # Set seed for random number generator. This makes the example reproducible 67 | set.seed(3300) 68 | # We draw 15 observations from a binomial distribution 69 | # with each of 5 trials and the probability of success is 0.6 70 | x <- rbinom(15, 5, 0.6) 71 | # We use the so-called Shapiro-Wilk test of normal distribution. 
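# The null hypothesis is that the sample comes from a normal distribution,
# so a small p-value indicates a deviation from normality. Note that with
# large samples the test flags even trivial deviations, while with small
# samples it has little power; one reason to prefer graphical checks.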
72 | shapiro.test(x) 73 | 74 | ## ----extra_sect-shapiro_2, echo = TRUE, include = TRUE------------------- 75 | qreference(x, nrep = 8) 76 | 77 | ## ----t-test, echo = TRUE, include = TRUE--------------------------------- 78 | t.test(totlngth ~ Pop, var.equal = TRUE, data = pos_dat) 79 | 80 | ## ----mean_function, echo = TRUE, include = TRUE, context = "setup"------- 81 | meanDif <- function(data, group) { 82 | mean(data[group == "other"]) - mean(data[group == "Vic"]) 83 | } 84 | 85 | ## ----perm_prep, echo = TRUE, include = TRUE, context = "setup"----------- 86 | perm_vec <- numeric(length = 10000) 87 | N <- nrow(pos_dat) 88 | 89 | ## ----permutation, echo = TRUE, include = TRUE, context = "setup"--------- 90 | library(permute) 91 | # Make reproducible through setting a seed 92 | set.seed(999) 93 | for (i in seq_len(length(perm_vec) - 1)) # Loop runs 9999 times 94 | { 95 | perm <- shuffle(N) 96 | perm_vec[i] <- with(pos_dat, meanDif(totlngth, Pop[perm])) 97 | } 98 | 99 | ## ----permutation_add, echo = TRUE, include = TRUE, context = "setup"----- 100 | perm_vec[10000] <- with(pos_dat, meanDif(totlngth, Pop)) 101 | 102 | ## ----permutation_plot, echo = TRUE, include = TRUE----------------------- 103 | par(cex = 1.2, mfrow = c(1, 1)) 104 | hist(perm_vec, breaks = 20, xlab = "Mean difference of possum populations") 105 | rug(perm_vec[10000], col = "red", lwd = 2, ticksize = 0.5) 106 | 107 | ## ----permutation_pvalue, exercise = TRUE, exercise.eval = TRUE----------- 108 | Dbig2 <- sum(abs(perm_vec) >= abs(perm_vec[10000])) 109 | Dbig2 / length(perm_vec) 110 | 111 | ## ----permutation_direct, echo = TRUE, include = TRUE--------------------- 112 | library(DAAG) 113 | with(pos_dat, twot.permutation(totlngth[Pop == "other"], totlngth[Pop == "Vic"], nsim=9999, plotit = FALSE)) 114 | 115 | ## ----bootstr_mean, echo = TRUE, include = TRUE--------------------------- 116 | library(boot) 117 | # Set up boot function 118 | boot_samp1 <- boot( 119 | data = pos_dat[pos_dat$Pop == "Vic", "totlngth"], 120 | statistic = function(x, i) { 121 | mean(x[i]) 122 | }, 123 | # x refers to the data, i to indices (see help of boot) 124 | R = 10000, # number of bootstrap replicates 125 | parallel = "multicore", ncpus = 8 126 | # on Windows OS set parallel ="no" and remove "ncpus = 8" 127 | ) 128 | # Plot distribution of bootstrapped statistic and a QQ plot 129 | plot(boot_samp1) 130 | # Plot a frequency plot of the bootstrapped statistic 131 | par(cex = 1.4) 132 | hist(boot_samp1$t, breaks = 100, main = "", xlab = "t*") 133 | 134 | ## ----bootstr_confint, echo = TRUE, include = TRUE------------------------ 135 | (cis <- boot.ci(boot_samp1, type = "bca")) 136 | 137 | ## ----bootstr_confint_plot, echo = TRUE, include = TRUE------------------- 138 | par(cex = 1.4) 139 | hist(boot_samp1$t, breaks = 100, main = "", xlab = "t*") 140 | lines(x = c(cis$bca[4], cis$bca[5]), y = c(300, 300), col = "red", lwd = 2) 141 | mtext(text = c("95% Confidence interval"), side = 3, col = "red", cex = 1.2) 142 | 143 | ## ----linmod_prep, echo = FALSE, include = FALSE-------------------------- 144 | library(vegan) 145 | data("varechem") 146 | mod_1 <- lm(S ~ K, data = varechem) 147 | 148 | ## ----linmod_exer, exercise = TRUE---------------------------------------- 149 | 150 | 151 | ## ----boot_resid, echo = TRUE, include = TRUE----------------------------- 152 | library(car) 153 | # Make example reproducable 154 | set.seed(30) 155 | boot_mod_res <- Boot(mod_1, f = coef, R = 9999, method = c("residual")) 156 | # see help of the Boot function 
for explanation 157 | confint(boot_mod_res, level = .95, type = "bca") 158 | confint(mod_1) 159 | 160 | ## ----boot_case, echo = TRUE, include = TRUE------------------------------ 161 | set.seed(30) 162 | boot_mod_case <- Boot(mod_1, f = coef, R = 9999, method = c("case")) 163 | confint(boot_mod_case, level = .95, type = "bca") 164 | confint(boot_mod_res, level = .95, type = "bca") 165 | confint(mod_1) 166 | 167 | ## ----crossv, echo = TRUE, include = TRUE--------------------------------- 168 | library(caret) 169 | set.seed(111) # Make reproducible 170 | # Define training control 171 | train.control <- trainControl(method = "cv", 172 | number = 5) 173 | # number gives the k in k-fold cross-validation 174 | # Train the model 175 | model_cv <- train(S ~ K, data = varechem, method = "lm", 176 | trControl = train.control) 177 | # Summarize the results 178 | print(model_cv) 179 | 180 | ## ----crossv_MSPE, echo = TRUE, include = TRUE---------------------------- 181 | library(DAAG) 182 | # m gives the k in k-fold cross-validation 183 | # seed is an internal set.seed argument 184 | # plotit = FALSE suppresses automatic plotting 185 | cv.lm(data = varechem, form.lm = formula(S ~ K), m = 5, seed = 30, plotit = FALSE) 186 | 187 | ## ----mod1_MSPE, echo = TRUE, include = TRUE------------------------------ 188 | mean(resid(mod_1)^2) 189 | 190 | -------------------------------------------------------------------------------- /Code/Session_4/Session_4_extr.R: -------------------------------------------------------------------------------- 1 | ## ----data_imp, echo = TRUE, include = TRUE, context = "setup"---------------------------------------------------- 2 | omb_dat <- read.csv("https://raw.githubusercontent.com/rbslandau/Function_div/master/omb.csv", sep = "", header = TRUE, as.is = TRUE) 3 | # create categorical predictor from sampling site codes 4 | omb_dat$Land_type <- substr(omb_dat$Sites, start = 1, stop = 1) 5 | # quick look at dataframe 6 | head(omb_dat) 7 | # plot data 8 | library(beanplot) 9 | par(cex = 1.3, las = 1) 10 | beanplot(OMB ~ Land_type, data = omb_dat, 11 | ylab = expression(italic("k")["inv"]), 12 | xlab = "Land use type") 13 | 14 | 15 | 16 | 17 | ## ----two_level_preprocess, echo = TRUE, include = TRUE, message = FALSE, context = "setup"----------------------- 18 | library(dplyr) 19 | omb_two <- omb_dat %>% filter(Land_type == "A" | Land_type == "F") 20 | 21 | 22 | ## ----two_level_lm, echo = TRUE, include = TRUE, context = "setup"------------------------------------------------ 23 | mod_omb <- lm(OMB ~ Land_type, data = omb_two) 24 | # show results 25 | summary(mod_omb) 26 | # Compare to the output of a t-test 27 | t.test(OMB ~ Land_type, data = omb_two, var.equal = TRUE) 28 | 29 | 30 | ## ----two_level_fitted, echo = TRUE, include = TRUE--------------------------------------------------------------- 31 | fitted(mod_omb) 32 | 33 | 34 | ## ----two_level_fitted_calc, echo = TRUE, include = TRUE---------------------------------------------------------- 35 | calc_obs <- fitted(mod_omb) + residuals(mod_omb) 36 | # print beside original observations 37 | print(cbind(omb_two$OMB, calc_obs)) 38 | 39 | 40 | ## ----se_illustrated, echo = TRUE, include = TRUE----------------------------------------------------------------- 41 | # extract matrix X 42 | X <- model.matrix(mod_omb) 43 | # extract RMSE 44 | rmse <- sigma(mod_omb) 45 | # matrix operations 46 | X_ops <- sqrt(diag(solve(t(X) %*% X))) 47 | rmse * X_ops 48 | # Matches with Std. 
Error from summary function 49 | summary(mod_omb) 50 | 51 | 52 | 53 | 54 | 55 | ## ----exercise-fitmod-solution------------------------------------------------------------------------------------ 56 | mod_omb2 <- lm(OMB ~ Land_type, data = omb_dat) 57 | 58 | 59 | 60 | ## ----summary_mod_omb, echo = TRUE, include = TRUE---------------------------------------------------------------- 61 | summary(mod_omb2) 62 | 63 | 64 | 65 | 66 | 67 | ## ----anova-mod, echo = TRUE, include = TRUE, context = "setup"--------------------------------------------------- 68 | mod_aov <- aov(OMB ~ Land_type, data = omb_dat) 69 | summary(mod_aov) 70 | # compare anova output to regression output 71 | summary(mod_omb2) 72 | 73 | 74 | ## ----anova-summary, echo = TRUE, include = TRUE------------------------------------------------------------------ 75 | summary.lm(mod_aov) 76 | 77 | 78 | ## ----anova-summary-comp, echo = TRUE, include = TRUE------------------------------------------------------------- 79 | summary(mod_omb2) 80 | 81 | 82 | 83 | 84 | 85 | 86 | ## ----model_check, echo = TRUE, include = TRUE-------------------------------------------------------------------- 87 | par(mfrow = c(1,2)) 88 | plot(mod_aov, which = c(1,3)) 89 | 90 | 91 | 92 | 93 | ## ----anova_variance_residuals, include= TRUE, echo = TRUE-------------------------------------------------------- 94 | par(cex = 1.3, las = 1) 95 | beanplot(residuals(mod_aov) ~ omb_dat$Land_type, 96 | ylab = "Residual", 97 | xlab = "Land use type") 98 | 99 | 100 | ## ----anova_normdist, include= TRUE, echo = TRUE------------------------------------------------------------------ 101 | library(DAAG) 102 | # set seed to make example reproducible 103 | set.seed(999) 104 | qreference(residuals(mod_aov), nrep = 8) 105 | 106 | 107 | ## ----summary2_mod_omb, echo = TRUE, include = TRUE--------------------------------------------------------------- 108 | summary(mod_omb2) 109 | 110 | 111 | 112 | 113 | ## ----level_sort, include= TRUE, echo = TRUE, context = "setup"--------------------------------------------------- 114 | # Land_type currently is a character vector 115 | str(omb_dat) 116 | # A character vector is automatically converted to a factor when entered into a linear model 117 | # To change the order of the levels, we need to convert the character vector to a factor 118 | land_fac <- factor(omb_dat$Land_type) 119 | levels(land_fac) 120 | 121 | 122 | 123 | 124 | 125 | 126 | ## ----contrast_1, echo = TRUE, include = TRUE--------------------------------------------------------------------- 127 | contrasts(land_fac) 128 | 129 | 130 | ## ----contrast_matrix, echo = TRUE, include = TRUE, context = "setup"--------------------------------------------- 131 | cont_matrix <- cbind(c(1,1,1,1), contrasts(land_fac)) 132 | cont_matrix 133 | 134 | 135 | ## ----contrast_inverse, echo = TRUE, include = TRUE, context = "setup"-------------------------------------------- 136 | solve(cont_matrix) 137 | 138 | 139 | ## ----mod_omb_corrp, echo = TRUE, include = TRUE, context = "setup"----------------------------------------------- 140 | # Remember the original p-values 141 | summary(mod_omb2) 142 | # extract p-values for pairwise comparisons 143 | summary(mod_omb2)$coefficients 144 | # given in rows 2-4 of fourth column 145 | p_vals <- summary(mod_omb2)$coefficients[ 2:4,4] 146 | p.adjust(p_vals, method = "bonferroni") 147 | 148 | 149 | ## ----mod_omb_padj2, echo = TRUE, include = TRUE------------------------------------------------------------------ 150 | p.adjust(p_vals, method = "fdr") 
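# --- Added illustration (not part of the original session script): ---
# p.adjust() only rescales the raw p-values, so the Bonferroni result
# can be reproduced by hand: multiply each p-value by the number of
# tests and cap the result at 1
pmin(p_vals * length(p_vals), 1)
# this matches p.adjust(p_vals, method = "bonferroni") from above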
151 | 152 | 153 | ## ----p_adj, echo = TRUE, include = TRUE-------------------------------------------------------------------------- 154 | # we create a set of p-values: 155 | p_values <- c(0.0001, 0.0001, 0.005, 0.01, 0.02, 0.04, 0.1, 0.5, 0.5, 0.8, 0.9, 0.9) 156 | p.adjust(p_values, method = "fdr") 157 | # compare to bonferroni correction 158 | p.adjust(p_values, method = "bonferroni") 159 | 160 | 161 | 162 | 163 | ## ----load-data-pos, echo = TRUE, include = TRUE, context = "setup"----------------------------------------------- 164 | pos_dat <- read.table(url("http://www.uni-koblenz-landau.de/en/campus-landau/faculty7/environmental-sciences/landscape-ecology/Teaching/possum.csv"), dec = ".", sep = ";", header = TRUE, row.names = NULL) 165 | # convert to factors 166 | pos_dat$Pop <- factor(pos_dat$Pop) 167 | pos_dat$sex <- factor(pos_dat$sex) 168 | # look at observations across factor levels 169 | xtabs(~ sex + Pop, data = pos_dat) 170 | 171 | 172 | ## ----lm-pos2, echo = TRUE, include = TRUE, context = "setup"----------------------------------------------------- 173 | pos_mod <- lm(totlngth ~ sex * Pop, data = pos_dat) 174 | summary(pos_mod) # equivalent to summary.lm() 175 | summary.aov(pos_mod) 176 | 177 | 178 | ## ----lm-anova, echo = TRUE, include = TRUE, context = "setup"---------------------------------------------------- 179 | null_mod <- lm(totlngth ~ 1, data = pos_dat) 180 | summary(null_mod) 181 | 182 | 183 | 184 | 185 | ## ----lm-anova2, echo = TRUE, include = TRUE---------------------------------------------------------------------- 186 | anova(null_mod, pos_mod) 187 | summary(pos_mod) 188 | 189 | 190 | 191 | 192 | 193 | ## ----anova_type3, echo = TRUE, include = TRUE, context = "setup"------------------------------------------------- 194 | library(car) 195 | Anova(pos_mod, type = 3) 196 | # compare to model with different sequence 197 | pos_mod2 <- lm(totlngth ~ Pop * sex, data = pos_dat) 198 | Anova(pos_mod2, type = 3) 199 | # Set orthogonal contrasts 200 | contrasts(pos_dat$Pop) <- contrasts(pos_dat$sex) <- "contr.sum" 201 | # update models 202 | pos_mod_ortho <- update(pos_mod) 203 | pos_mod_ortho2 <- update(pos_mod2) 204 | Anova(pos_mod_ortho, type = 3) 205 | Anova(pos_mod_ortho2, type = 3) 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | ## ----sjPlot, echo = TRUE, include = TRUE------------------------------------------------------------------------- 218 | library(sjPlot) 219 | plot_model(pos_mod, type = "int", title = "", axis.title = c("Sex of possum", "Total lenght of possum in cm"), axis.lim = c(80,95), legend.title = "Population") 220 | 221 | 222 | ## ----interaction_plot, echo = TRUE, include = TRUE, purl = TRUE-------------------------------------------------- 223 | library(png) 224 | img <- readPNG('Riesch_etal_figs.png') 225 | grid::grid.raster(img) 226 | 227 | 228 | 229 | ## ----interaction-data1, echo = TRUE, include = TRUE, context = "setup"------------------------------------------- 230 | # create data 231 | data <- c(17,18,18,19,20, 17,18,18,19,20,17,18,18,19,20,70,75,77,80,71) 232 | factor1a <- c(rep("control", 5), rep("treated", 5), rep("control", 5), rep("treated", 5)) 233 | factor2a <- c(rep("low", 10), rep("high", 10)) 234 | factor1 <- factor(factor1a) 235 | factor2 <- factor(factor2a) 236 | dataset <- data.frame(data, factor1, factor2) 237 | dataset$factor2 <- relevel(dataset$factor2, ref = "low") 238 | # set orthogonal contrast for Type 3 ANOVA 239 | xtabs(~ factor1 + factor2, data = dataset) 240 | 
contrasts(dataset$factor1) <- contrasts(dataset$factor2) <- "contr.sum" 241 | 242 | 243 | ## ----interaction-modelfit1, echo = TRUE, include = TRUE---------------------------------------------------------- 244 | mod <- lm(data ~ factor1*factor2, data = dataset) 245 | summary.lm(mod) 246 | plot_model(mod, type = "int", title = "", axis.title = c("Factor 1", "Response"), axis.lim = c(10,85), legend.title = "Factor 2") 247 | Anova(mod, type = 2) 248 | Anova(mod, type = 3) 249 | summary.aov(mod) 250 | 251 | 252 | ## ----interaction-data2, echo = TRUE, include = TRUE, context = "setup"------------------------------------------- 253 | # create data sets 254 | resp1 <- c(20:25, 25:30, 27:32, 70:75) 255 | fact1a <- c(rep("Control", 12), rep("Treat", 12)) 256 | fact2a <- c(rep("Low", 6), rep("High", 6), rep("Low", 6), rep("High", 6)) 257 | fact1 <- factor(fact1a) 258 | fact2 <- factor(fact2a) 259 | data1 <- data.frame(resp1, fact1, fact2) 260 | data1$fact2 <- relevel(data1$fact2, ref = "Low") 261 | # set orthogonal contrast for Type 3 ANOVA 262 | contrasts(data1$fact1) <- contrasts(data1$fact2) <- "contr.sum" 263 | xtabs(~ fact1 + fact2, data = data1) 264 | 265 | 266 | ## ----interaction-modelfit2, echo = TRUE, include = TRUE---------------------------------------------------------- 267 | mod1 <- lm(resp1 ~ fact1*fact2, data = data1) 268 | summary.lm(mod1) 269 | plot_model(mod1, type = "int", title = "", axis.title = c("Factor 1", "Response"), axis.lim = c(10,85), legend.title = "Factor 2") 270 | Anova(mod1, type = 2) 271 | Anova(mod1, type = 3) 272 | summary.aov(mod1) 273 | 274 | 275 | 276 | 277 | ## ----ancova_1, echo = TRUE, include = TRUE----------------------------------------------------------------------- 278 | anc_mod <- lm(totlngth ~ sex * skullw, data = pos_dat) 279 | summary.lm(anc_mod) 280 | Anova(anc_mod, type = 2) 281 | library(ggplot2) 282 | pred <- predict(anc_mod) 283 | ggplot(cbind(pos_dat, pred), aes(x = skullw, y = totlngth, colour = sex)) + 284 | geom_line(aes(y = pred)) + geom_point() + 285 | xlab("Skull width (cm)") + ylab("Total lenght of possum in cm") 286 | 287 | 288 | ## ----ancova_2, echo = TRUE, include = TRUE----------------------------------------------------------------------- 289 | anc_mod2 <- update(anc_mod, .~. 
-sex:skullw) 290 | summary.lm(anc_mod2) 291 | Anova(anc_mod2, type = 2) 292 | library(ggplot2) 293 | pred2 <- predict(anc_mod2) 294 | ggplot(cbind(pos_dat, pred2), aes(x = skullw, y = totlngth, colour = sex)) + 295 | geom_line(aes(y = pred2)) + geom_point() + 296 | xlab("Skull width (cm)") + ylab("Total lenght of possum in cm") 297 | 298 | -------------------------------------------------------------------------------- /Code/Session_5/Session_5_extr.R: -------------------------------------------------------------------------------- 1 | ## ----load_data, include = TRUE, echo = TRUE, purl = TRUE, context = "setup"-------------------------------------- 2 | data_oc <- read.csv("https://raw.githubusercontent.com/rbslandau/Data_analysis/master/Data/OstraMRegS400JB.txt", sep = "\t") 3 | 4 | 5 | 6 | 7 | 8 | ## ----data_range_summ, include = TRUE, echo = TRUE, purl = TRUE--------------------------------------------------- 9 | summary(data_oc2) 10 | 11 | 12 | ## ----data_range_hist, include = TRUE, echo = TRUE, purl = TRUE--------------------------------------------------- 13 | par(mfrow = c(1, 3)) 14 | hist(data_oc2$DP) 15 | hist(data_oc2$RC) 16 | hist(data_oc2$BT) 17 | hist(data_oc2$SA) 18 | hist(data_oc2$SP) 19 | hist(data_oc2$IC) 20 | hist(data_oc2$P) 21 | hist(data_oc2$LAT) 22 | hist(data_oc2$LON) 23 | 24 | 25 | ## ----data_range_hist3, include = TRUE, echo = TRUE, purl = TRUE-------------------------------------------------- 26 | hist(log10(data_oc2$DP)) 27 | 28 | 29 | ## ----add_variable, include = TRUE, echo = TRUE, context = "setup"------------------------------------------------ 30 | data_oc2$DPlog <- log10(data_oc$DP) 31 | 32 | 33 | 34 | 35 | ## ----remove_vars2, include = TRUE, echo = TRUE, context = "setup"------------------------------------------------ 36 | data_check <- data_oc2[ , !names(data_oc2) %in% c("E100", "DP", "SR")] 37 | 38 | ## ----data_callin_code, include = TRUE, echo = TRUE, eval = FALSE------------------------------------------------- 39 | ## library(ggplot2) 40 | ## library(GGally) 41 | ## # We define a function to change the colour of points and lines (otherwise both are black) 42 | ## lowerFn <- function(data, mapping, method = "lm", ...) { 43 | ## p <- ggplot(data = data, mapping = mapping) + 44 | ## geom_point(colour = "blue") + 45 | ## geom_smooth(method = method, color = "red", ...) 
46 | ## p 47 | ## } 48 | ## # Run ggpairs() 49 | ## ggpairs(data_check, lower = list(continuous = wrap(lowerFn, method = "lm")), 50 | ## diag = list(continuous = wrap("densityDiag", colour = "blue")), 51 | ## upper = list(continuous = wrap("cor", size = 10))) 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | ## ----vif_prep, include = TRUE, echo = TRUE----------------------------------------------------------------------- 60 | mod_vif <- lm(data_oc2$E100 ~ ., data = data_check) 61 | 62 | 63 | ## ----vif_calc, include = TRUE, echo = TRUE----------------------------------------------------------------------- 64 | library(car) 65 | vif(mod_vif) 66 | 67 | 68 | ## ----vif_proof, include = TRUE, echo = TRUE---------------------------------------------------------------------- 69 | mod_vifb <- lm(data_check$IC ~ ., data = data_check[ , names(data_check) != "IC"]) 70 | r2 <- summary.lm(mod_vifb)$r.squared 71 | 1 / (1 - r2) 72 | 73 | 74 | 75 | 76 | 77 | ## ----sample_size, include = TRUE, echo = TRUE-------------------------------------------------------------------- 78 | library(Hmisc) 79 | describe(data_env) 80 | 81 | 82 | 83 | 84 | ## ----full_mod, include = TRUE, echo = TRUE, context = "setup"---------------------------------------------------- 85 | mod_1 <- lm(data_oc2$E100 ~ . + BT:SP, data = data_env, na.action = "na.fail") 86 | summary(mod_1) 87 | 88 | 89 | ## ----full_mod_wo_inter, include = TRUE, echo = TRUE-------------------------------------------------------------- 90 | mod_1_wointer <- update(mod_1, . ~ . -1) 91 | summary(mod_1_wointer) 92 | 93 | 94 | 95 | 96 | ## ----full_mod_outputanova, include = TRUE, echo = TRUE----------------------------------------------------------- 97 | library(car) 98 | Anova(mod_1, type = 2) 99 | 100 | 101 | 102 | 103 | ## ----all_mods, include = TRUE, echo = TRUE----------------------------------------------------------------------- 104 | library(MuMIn) 105 | allmodels <- dredge(mod_1, extra = "R^2") 106 | print(allmodels) 107 | 108 | 109 | ## ----extr_top_mod, include = TRUE, echo = TRUE------------------------------------------------------------------- 110 | topmod <- get.models(allmodels, subset = 1) 111 | print(topmod) 112 | 113 | 114 | ## ----model_avg_mod, include = TRUE, echo = TRUE------------------------------------------------------------------ 115 | library(MuMIn) 116 | avg_model <- model.avg(allmodels, subset = delta < 2, fit = TRUE) 117 | summary(avg_model) 118 | 119 | 120 | ## ----recalc_models, include = TRUE, echo = TRUE------------------------------------------------------------------ 121 | allmodels2 <- dredge(mod_1, extra = "R^2", rank = "BIC") 122 | avg_model_bic <- model.avg(allmodels2, subset = delta < 2, fit = TRUE) 123 | summary(avg_model_bic) 124 | 125 | 126 | 127 | 128 | 129 | ## ----model_avg_accuracy_meas, include = TRUE, echo = TRUE-------------------------------------------------------- 130 | fit_y <- predict(avg_model) 131 | res_ssq_avg <- sum((data_oc2$E100 - fit_y) ^ 2) 132 | tot_ssq_avg <- sum((data_oc2$E100 - mean(data_oc2$E100)) ^ 2) 133 | # Based on the residual sum of squares and the total sum of squares we can compute the R^2: 134 | 1 - res_ssq_avg / tot_ssq_avg 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | ## ----stepwise_mod1, include = TRUE, echo = TRUE, context = "setup"----------------------------------------------- 144 | summary(mod_1) 145 | # Remove interaction 146 | mod_2 <- update(mod_1, ~. 
- BT:SP) 147 | # Check model 148 | summary(mod_2) 149 | 150 | 151 | ## ----stepwise_mod2, include = TRUE, echo = TRUE------------------------------------------------------------------ 152 | # Regression output 153 | summary(mod_2) 154 | # Anova Type 2 output 155 | Anova(mod_2, type = 2) 156 | # Output of partial F-test 157 | mod_3 <- update(mod_2, ~. - LAT) 158 | anova(mod_2, mod_3) 159 | # Output of drop1 160 | drop1(mod_2, test = "F") 161 | 162 | 163 | 164 | 165 | 166 | ## ----stepwise_mod_anova, include = TRUE, echo = TRUE------------------------------------------------------------- 167 | anova(mod_1, mod_3) 168 | 169 | 170 | 171 | 172 | 173 | ## ----stepwise_inform_theoretic, include = TRUE, echo = TRUE------------------------------------------------------ 174 | # Calculation of AIC 175 | AIC(mod_1) 176 | AIC(mod_2) 177 | # Calculation of BIC 178 | BIC(mod_1) 179 | BIC(mod_2) 180 | # Calculation of corrected AIC 181 | library(MuMIn) 182 | AICc(mod_1) 183 | AICc(mod_2) 184 | 185 | 186 | 187 | 188 | 189 | ## ----manual_calc_AICc, include = TRUE, echo = TRUE--------------------------------------------------------------- 190 | # extract number of parameters and add 1 for the estimated variance 191 | p <- length(mod_1$coefficients) + 1 192 | # extract sample size 193 | n <- nrow(data_env) 194 | # calculate corrected AIC: 195 | AIC(mod_1) + 2 * p * (p + 1) / (n - p - 1) 196 | 197 | # Same as 198 | library(MuMIn) 199 | AICc(mod_1) 200 | 201 | 202 | ## ----auto_modelling, include = TRUE, echo = TRUE----------------------------------------------------------------- 203 | # fit intercept-only nullmodel: no variables, only mean 204 | nullmodel <- lm(data_oc2$E100 ~ 1, data = data_env) 205 | # start stepwise algorithm 206 | step(object = mod_1, scope = list(upper = mod_1, lower = nullmodel), direction = "backward", trace = 100, k = log(n)) 207 | 208 | 209 | ## ----auto_modelling2, include = TRUE, echo = TRUE---------------------------------------------------------------- 210 | step( 211 | nullmodel, direction = "forward", trace = 100, 212 | scope = list(upper = mod_1, lower = nullmodel), k = log(n) 213 | ) 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | ## ----rela_impo, include = TRUE, echo = TRUE---------------------------------------------------------------------- 224 | library(relaimpo) 225 | pred_imp_lmg <- calc.relimp(mod_2, type = c("lmg"), rela = TRUE) 226 | pred_imp_beta <- calc.relimp(mod_2, type = c("betasq"), rela = TRUE) 227 | plot(pred_imp_lmg, main = "") 228 | plot(pred_imp_beta, main = "") 229 | 230 | 231 | ## ----rela_hierpart, include = TRUE, echo = TRUE------------------------------------------------------------------ 232 | library(hier.part) 233 | hier.part(y = data_oc2$E100, xcan = data_env, gof = "Rsqu") 234 | 235 | 236 | 237 | 238 | ## ----shrink_prep, include = TRUE, echo = TRUE-------------------------------------------------------------------- 239 | mod_5_s <- lm(data_oc2$E100 ~ BT + SP + P + LON + DPlog, data = data_env, x = TRUE, y = TRUE) 240 | 241 | 242 | ## ----shrink_conduc, include = TRUE, echo = TRUE------------------------------------------------------------------ 243 | library(shrink) 244 | # global shrinkage 245 | shrink_res1 <- shrink(mod_5_s, type = "global") 246 | shrink_res1 247 | # reproduce results manually 248 | coef(mod_5_s)[-1] * shrink_res1$ShrinkageFactors 249 | # note that the intercept is removed because the intercept requires no shrinkage 250 | 251 | # parameterwise shrinkage 252 | shrink_res2 <- shrink(mod_5_s, type = "parameterwise") 
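# --- Added sketch (not part of the original script): as for the global
# shrinkage factor above, the parameterwise result can in principle be
# reproduced by multiplying each coefficient with its own shrinkage factor;
# this assumes ShrinkageFactors is ordered like the coefficients,
# as in the global case
coef(mod_5_s)[-1] * shrink_res2$ShrinkageFactors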
253 | shrink_res2 254 | 255 | 256 | 257 | 258 | ## ----lasso_1, include = TRUE, echo = TRUE------------------------------------------------------------------------ 259 | library(glmnet) 260 | # fit model with lasso, requires predictors as matrix 261 | # and response as vector 262 | lasso_mod <- glmnet(x = as.matrix(data_env), y = data_oc2$E100) 263 | plot(lasso_mod, label = TRUE) 264 | 265 | 266 | ## ----lasso_2, include = TRUE, echo = TRUE------------------------------------------------------------------------ 267 | plot(lasso_mod, label = TRUE, xvar = "lambda") 268 | plot(lasso_mod, label = TRUE, xvar = "dev") 269 | 270 | 271 | ## ----lasso_cv, include = TRUE, echo = TRUE----------------------------------------------------------------------- 272 | # set seed to make reproducible 273 | set.seed(111) 274 | cvfit <- cv.glmnet(as.matrix(data_env), data_oc2$E100) 275 | plot(cvfit) 276 | 277 | 278 | ## ----lasso_extract, include = TRUE, echo = TRUE------------------------------------------------------------------ 279 | # extract lambdas 280 | cvfit$lambda.min 281 | cvfit$lambda.1se 282 | # extract regression coefficients 283 | coef(cvfit, s = "lambda.min") 284 | coef(cvfit, s = "lambda.1se") 285 | 286 | 287 | ## ----lasso_extract_zero, include = TRUE, echo = TRUE------------------------------------------------------------- 288 | coef(cvfit, s = 1) 289 | 290 | 291 | ## ----lasso_stabsel, include = TRUE, echo = TRUE------------------------------------------------------------------ 292 | library(stabs) 293 | ## make reproducible 294 | set.seed(1204) 295 | (stab.lasso <- stabsel(x = as.matrix(data_env), y = data_oc2$E100, 296 | fitfun = glmnet.lasso, cutoff = 0.70, PFER = 1)) 297 | # plot estimate for selection probability 298 | plot(stab.lasso, main = "Lasso") 299 | 300 | 301 | 302 | ## ----mod_vis_pedagog, include = TRUE, echo = TRUE---------------------------------------------------------------- 303 | library(car) 304 | mcPlots(mod_2, ~SP, overlaid = FALSE) 305 | 306 | 307 | ## ----mod_vis_effects, include = TRUE, echo = TRUE---------------------------------------------------------------- 308 | library(effects) 309 | plot(predictorEffects(mod = mod_5_s, predictor = "SP"), ylab = "Rarefied richness (100 ind.)") 310 | plot(predictorEffects(mod = mod_5_s, predictor = "BT"), ylab = "Rarefied richness (100 ind.)") 311 | plot(predictorEffects(mod = mod_5_s, predictor = "DPlog"), ylab = "Rarefied richness (100 ind.)") 312 | 313 | 314 | ## ----select_inf, include = TRUE, echo = TRUE--------------------------------------------------------------------- 315 | # load library for functions 316 | library(selectiveInference) 317 | # fit model 318 | # the function requires that the object with the predictors is of class matrix 319 | # and the response should be of class vector 320 | fs_model <- fs(as.matrix(data_env), data_oc2$E100) 321 | # we plot the model 322 | plot(fs_model) 323 | # Displays change in coefficients over stepwise process 324 | # inference for model 325 | fsInf(fs_model) 326 | 327 | 328 | ## ----select_inf_names, include = TRUE, echo = TRUE--------------------------------------------------------------- 329 | names(data_env) 330 | 331 | 332 | ## ----select_inf_final, include = TRUE, echo = TRUE--------------------------------------------------------------- 333 | coef(fs_model, s = 4) 334 | 335 | 336 | ## ----bootstrapping_BIC, include = TRUE, echo = TRUE-------------------------------------------------------------- 337 | library(bootStepAIC) 338 | # set seed to make analysis reproducible 
339 | set.seed(111)
340 | # See help for details on the function
341 | data_env$E100 <- data_oc2$E100
342 | # we need a scoping assignment here (will locally run with "normal" assignment)
343 | data_env <<- as.data.frame(data_env)
344 | mod_1 <- lm(E100 ~ RC + BT + SP + P + LAT + LON, data = data_env)
345 | # as before, k can be used to select the BIC; here we set it to the log of the number of rows
346 | n <- log(nrow(data_env))
347 | boot.stepAIC(object = mod_1, data = data_env, k = n, direction = "backward")
--------------------------------------------------------------------------------
/Code/Session_5_exerc.R:
--------------------------------------------------------------------------------
1 | ############################
2 | # R exercise Session 5 #
3 | ############################
4 | # Ralf B. Schäfer, 11.11.2019
5 | 
6 | ##############################
7 | # Preparation for Exercise #
8 | ##############################
9 | 
10 | # We load data that is contained within an R package
11 | # If you cannot access the data, install the package
12 | # via: install.packages("HSAUR2")
13 | 
14 | data("USairpollution", package = "HSAUR2")
15 | # See package information for details on the data set
16 | head(USairpollution)
17 | 
18 | #############################################################################
19 | # Exercise: For effective environmental protection, #
20 | # you need to know the causes of pollution. #
21 | # In this case study (using real world data), the aim is to identify #
22 | # the variables exhibiting the highest explanatory power #
23 | # for the SO2 air concentrations. #
24 | # Model the SO2 concentration as response and use the other variables #
25 | # as predictors. Compare the results for the following methods: #
26 | # 1) manual model building based on hypotheses, #
27 | # 2) automatic backward model selection with BIC #
28 | # 3) LASSO #
29 | # Also compare the regression coefficients from post-selection shrinkage #
30 | # for 1) or 2) with those from the LASSO. Finally, conduct model diagnosis #
31 | # and plot the model with effect plots and determine the variable importance#
32 | # for a final model that you select.
# 33 | ############################################################################# 34 | 35 | -------------------------------------------------------------------------------- /Code/Session_6/Session_6_extr.R: -------------------------------------------------------------------------------- 1 | ## ----load_data, include = TRUE, echo = TRUE, purl = TRUE----------------- 2 | library(DAAG) 3 | # display first rows 4 | head(frogs) 5 | 6 | ## ----gen_response, include = TRUE, echo = TRUE, purl = TRUE-------------- 7 | # generate response with error 8 | resp <- -15 + 0.1* frogs$avrain + rnorm(length(frogs$avrain), sd = 2) 9 | 10 | ## ----rel_resp_avrain, include = TRUE, echo = TRUE, purl = TRUE----------- 11 | library(xkcd) 12 | library(extrafont) 13 | library(ggplot2) 14 | # create plot 15 | p <- ggplot() + geom_point(aes(x = frogs$avrain, y = resp), color='blue') + 16 | theme(text = element_text(size = 20, family = "xkcd")) + xlab("Average rain in spring") + ylab ("Response") + 17 | geom_smooth(method = "lm", aes(x = frogs$avrain, y = resp)) 18 | p 19 | 20 | ## ----logit_avrain, include = TRUE, echo = TRUE, purl = TRUE-------------- 21 | # convert response to probability 22 | pr <- exp(resp) / ( 1 + exp(resp)) 23 | # use probabilities to sample from distribution 24 | resp_obs <- rbinom(length(frogs$avrain), 1, pr) 25 | 26 | # fit linear model and extract fitted values 27 | lm_mod <- lm(resp ~ frogs$avrain) 28 | fit_values <- fitted(lm_mod) 29 | # convert fitted values to fitted probabilities using logit 30 | pr_fit <- exp(fit_values) / ( 1 + exp(fit_values)) 31 | # plot model 32 | p2 <- ggplot() + geom_point(aes(x = frogs$avrain, y = resp_obs), color='blue') + geom_line(aes(x = frogs$avrain, y = pr_fit), color='blue') + 33 | theme(text = element_text(size = 20, family = "xkcd")) + xlab("Average rain in spring") + ylab ("Transformed response") 34 | p2 35 | 36 | ## ----spm_code, include = TRUE, echo = TRUE, eval = FALSE----------------- 37 | ## library(car) 38 | ## spm(frogs[, c(4:10)]) 39 | 40 | ## ----trans_check, exercise = TRUE, eval = FALSE-------------------------- 41 | ## par(mfrow = c(1, 3)) 42 | ## for (nam in c("distance", "NoOfPools")) { 43 | ## y <- frogs[, nam] 44 | ## plot(density(y), main = "", xlab = nam) 45 | ## plot(density(sqrt(y)), main = "", xlab = nam) 46 | ## plot(density(log(y)), main = "", xlab = nam) 47 | ## } 48 | 49 | ## ----transformation, include = TRUE, echo = TRUE, purl = TRUE, context = "setup"---- 50 | # log transformation of NoOfPools and distance, 51 | logNoPools <- log(frogs$NoOfPools) 52 | logdistance <- log(frogs$distance) 53 | # create easily accessible data sets 54 | predictors <- data.frame(frogs[ , c(4, 7:10)], logNoPools, logdistance) 55 | resp_frogs <- frogs$pres.abs 56 | 57 | ## ----coll_check, include = TRUE, echo = TRUE, eval = FALSE--------------- 58 | ## spm(predictors) 59 | 60 | ## ----vif, include = TRUE, echo = TRUE, purl = TRUE---------------------- 61 | frog.glm <- glm(resp_frogs ~ ., family = binomial(link = "logit"), 62 | data = predictors, na.action = "na.fail") 63 | vif(frog.glm) 64 | 65 | ## ----vif_recalc, include = TRUE, echo = TRUE, purl = TRUE--------------- 66 | frog.glm2 <- glm(resp_frogs ~ logNoPools + logdistance + NoOfSites + avrain + I(meanmin + meanmax), 67 | family = binomial(link = "logit"), data = predictors, na.action = "na.fail") 68 | vif(frog.glm2) 69 | 70 | ## ----Wald_select1, include = TRUE, echo = TRUE, purl = TRUE------------- 71 | summary(frog.glm2) 72 | 73 | ## ----Wald_select2, include = TRUE, echo = TRUE, purl = 
TRUE------------- 74 | frog.glm3 <- update(frog.glm2, ~ . - NoOfSites) 75 | summary(frog.glm3) 76 | 77 | ## ----LRT_1, include = TRUE, echo = TRUE, purl = TRUE-------------------- 78 | anova(frog.glm3, frog.glm2, test = "Chisq") 79 | 80 | ## ----likelihood_manual, include = TRUE, echo = TRUE, purl = TRUE-------- 81 | LL1 <- logLik(frog.glm2) 82 | LL2 <- logLik(frog.glm3) 83 | 84 | ## ----likelihood_manual_2, include = TRUE, echo = TRUE, purl = TRUE------ 85 | # calculate difference 86 | -2*(LL2-LL1) 87 | # compare to chi-square distribution 88 | pchisq(-2*(LL2-LL1), df = 1, lower.tail=FALSE) 89 | 90 | ## ----drop1_LRT, include = TRUE, echo = TRUE, purl = TRUE---------------- 91 | drop1(frog.glm3, test = "Chisq") 92 | 93 | ## ----sequ_LRT, include = TRUE, echo = TRUE, purl = TRUE----------------- 94 | anova(frog.glm3, test = "Chisq") 95 | 96 | ## ----typetwo_LRT, include = TRUE, echo = TRUE, purl = TRUE-------------- 97 | library(car) 98 | Anova(frog.glm3) 99 | 100 | ## ----information_theoretic, include = TRUE, echo = TRUE, purl = TRUE---- 101 | # calculation of AIC 102 | AIC(frog.glm2) 103 | AIC(frog.glm3) 104 | # calculation of BIC 105 | BIC(frog.glm2) 106 | BIC(frog.glm3) 107 | # calculation of drop1 for AIC 108 | drop1(frog.glm3) 109 | # calculation of drop1 for BIC 110 | # requires sample size n 111 | samp_size_n <- nrow(predictors) 112 | drop1(frog.glm3, k = log(samp_size_n)) 113 | 114 | ## ----lasso_glm_1, include = TRUE, echo = TRUE, context = "data"---------- 115 | library(dplyr) 116 | # add combined variable 117 | predictors_new <- predictors %>% 118 | mutate(mean_maxmin = meanmin + meanmax) 119 | # remove collinear variables 120 | drop_cols <- c("altitude", "meanmin", "meanmax") 121 | predictors_lasso <- predictors_new %>% 122 | select(-one_of(drop_cols)) 123 | 124 | ## ----lasso_glm_2, include = TRUE, echo = TRUE---------------------------- 125 | library(glmnet) 126 | # fit model with lasso, requires predictors as matrix 127 | # and response as vector 128 | lasso_mod <- glmnet(x = as.matrix(predictors_lasso), y = resp_frogs, family = "binomial") 129 | par(cex = 1.2) 130 | plot(lasso_mod, label = TRUE, xvar = "lambda") 131 | 132 | ## ----lasso_plot_dev, include = TRUE, echo = TRUE------------------------- 133 | plot(lasso_mod, label = TRUE, xvar = "dev") 134 | 135 | ## ----deviance_expl, include = TRUE, echo = TRUE-------------------------- 136 | library(modEvA) 137 | Dsquared(frog.glm3) 138 | 139 | ## ----lasso_cv, include = TRUE, echo = TRUE------------------------------- 140 | # set seed to make reproducible 141 | set.seed(222) 142 | cvfit <- cv.glmnet(x = as.matrix(predictors_lasso), y = resp_frogs, family = "binomial") 143 | plot(cvfit) 144 | 145 | ## ----lasso_coefs, include = TRUE, echo = TRUE---------------------------- 146 | coef(cvfit, s = "lambda.min") 147 | coef(cvfit, s = "lambda.1se") 148 | 149 | ## ----summary_disp, include = TRUE, echo = TRUE--------------------------- 150 | summary(frog.glm3) 151 | 152 | ## ----residuals_disp, include = TRUE, echo = TRUE------------------------- 153 | # extract residuals 154 | pearson_resid <- residuals(frog.glm, type = "pearson") 155 | # calculate sum of squared residuals 156 | sum_square_resid <- sum(pearson_resid^2) 157 | # divide by degrees of freedom 158 | sum_square_resid/df.residual(frog.glm) 159 | 160 | ## ----dharma_disp, include = TRUE, echo = TRUE---------------------------- 161 | library(DHARMa) 162 | sim_glm3 <- simulateResiduals(frog.glm3) 163 | # plot residuals 164 | plot(sim_glm3) 165 | # statistical test 166 | 
testDispersion(sim_glm3) 167 | 168 | ## ----dharma_disp_simul, include = TRUE, echo = TRUE---------------------------- 169 | # load library 170 | library(DHARMa) 171 | # Create overdispersed data using a function in the package 172 | Overdisp_data <- createData(sampleSize = 200, overdispersion = 2.5, family = poisson()) 173 | # Fit GLM 174 | Overdisp_mod <- glm(observedResponse ~ Environment1 , family = "poisson", data = Overdisp_data) 175 | # Simulate residuals 176 | Simul_output <- simulateResiduals(fittedModel = Overdisp_mod) 177 | # Diagnostic plots 178 | plot(Simul_output) 179 | # statistical test 180 | testDispersion(Simul_output) 181 | 182 | ## ----residual_plot_per_component, include = TRUE, echo = TRUE------------ 183 | plotResiduals(sim_glm3, form = predictors_lasso$logNoPools) 184 | plotResiduals(sim_glm3, form = predictors_lasso$logdistance) 185 | plotResiduals(sim_glm3, form = predictors_lasso$avrain) 186 | plotResiduals(sim_glm3, form = predictors_lasso$mean_maxmin) 187 | 188 | ## ----residual_plot_diag1, include = TRUE, echo = TRUE-------------------- 189 | plotResiduals(sim_glm3, form = frogs$distance) 190 | 191 | ## ----simul_quad_prep, include = TRUE, echo = TRUE------------------------ 192 | # define maximum and minimum of gradient and difference between possible values 193 | seq_vals <- seq(from = -10, to = 10, by = 0.01) 194 | # draw 50 samples from gradient -> random gradient 195 | set.seed(222) # make example reproducible 196 | x <- sample(seq_vals, size = 50, replace = TRUE) 197 | # generate quadratic response and add random error 198 | resp_new <- 0.1* x^2 - 0.1 * x -3 + rnorm(length(x), sd = 1) 199 | # convert response to probability pi 200 | prob_new <- exp(resp_new) / ( 1 + exp(resp_new)) 201 | # use pis to sample from binomial distribution 202 | resp_obs_new <- rbinom(length(x), 1, prob_new) 203 | 204 | ## ----simul_quad, include = TRUE, echo = TRUE----------------------------- 205 | # fit GLM 206 | mod_quad <- glm(resp_obs_new ~ x, family = binomial(link = "logit")) 207 | # simulate residuals 208 | sim_quad_mod <- simulateResiduals(mod_quad) 209 | # plot residuals 210 | set.seed(111) 211 | plot(sim_quad_mod) 212 | # call summary 213 | summary(mod_quad) 214 | 215 | ## ----rootogram, include = TRUE, echo = TRUE, eval = FALSE---------------- 216 | ## install.packages("countreg", repos="http://R-Forge.R-project.org") 217 | ## library(countreg) 218 | ## rootogram() 219 | 220 | ## ----cooks_leverage, include = TRUE, echo = TRUE, eval = TRUE------------ 221 | library(ggplot2) 222 | library(ggfortify) 223 | par(mfrow = c(1, 1)) 224 | autoplot(frog.glm3, which = 5) 225 | 226 | ## ----cooks_leverage_index, include = TRUE, echo = TRUE, eval = TRUE------ 227 | library(car) 228 | influenceIndexPlot(frog.glm3, vars = c("Cook"), id = TRUE, grid = TRUE) 229 | influenceIndexPlot(frog.glm3, vars = c("hat"), id = TRUE, grid = TRUE) 230 | 231 | ## ----comp_coeffs, include = TRUE, echo = TRUE, eval = TRUE--------------- 232 | # remove observations using subset 233 | frog.glm3_red <- update(frog.glm3, subset = -c(77, 182)) 234 | compareCoefs(frog.glm3, frog.glm3_red) 235 | 236 | ## ----mod_vis_effects, include = TRUE, echo = TRUE------------------------ 237 | # calculate combined variable 238 | meanmix <- frogs$meanmax + frogs$meanmin 239 | # refit model 240 | frog.glm3b <- glm(resp_frogs ~ logNoPools + logdistance + avrain + meanmix, 241 | family = binomial(link = "logit"), data = predictors, na.action = "na.fail") 242 | # create plots 243 | library(effects) 244 | 
plot(predictorEffects(frog.glm3b, predictor = ~ logNoPools), ylab = "Probability of occurrence") 245 | plot(predictorEffects(frog.glm3b, predictor = ~ logdistance), ylab = "Probability of occurrence") 246 | plot(predictorEffects(frog.glm3b, predictor = ~ avrain), ylab = "Probability of occurrence") 247 | plot(predictorEffects(frog.glm3b, predictor = ~ meanmix), ylab = "Probability of occurrence") 248 | 249 | ## ----mod_vis_effects_scales, include = TRUE, echo = TRUE----------------- 250 | plot(predictorEffects(frog.glm3b, predictor = ~ meanmix), axes=list(y=list(type="link", lab="logit scale, logit labels"))) 251 | plot(predictorEffects(frog.glm3b, predictor = ~ meanmix), axes=list(y=list(type="response", lab="response scale, probability labels"))) 252 | 253 | ## ----mod_vis_effects_pr, include = TRUE, echo = TRUE--------------------- 254 | plot(predictorEffects(frog.glm3b, predictor = ~ meanmix, residuals = TRUE), ylab = "Logit of estimated probabilities", axes=list(y=list(type="link"))) 255 | 256 | ## ----quadmod_vis_effects_pr, include = TRUE, echo = TRUE----------------- 257 | plot(predictorEffects(mod_quad, residuals = TRUE), ylab = "Logit of estimated probabilities", axes=list(y=list(type="link"))) 258 | 259 | -------------------------------------------------------------------------------- /Code/Session_6/glm_explorer/README.md: -------------------------------------------------------------------------------- 1 | Shiny app to interactively explore GLMs. 2 | 3 | Simulates data (which can be varied by the user), fits GLMs (specified by the user) to the data and shows diagnostic plots. 4 | -------------------------------------------------------------------------------- /Code/Session_6/glm_explorer/exercises.md: -------------------------------------------------------------------------------- 1 | ## Exercises 2 | 3 | ### Gaussian Linear Model 4 | #### Data Generation 5 | 6 | 1. Vary only the intercept. What happens to the simulated data? 7 | 2. Set the intercept back to 0, vary only the slope. What happens to the simulated data? 8 | 3. Set the slope back to zero. Vary only the group difference. What do you observe? 9 | 4. Set the group difference to 2, change only the slope. What happens to the simulated data? 10 | 5. Set the group difference to 2, and the slope to 2. 11 | Change only the intercept. What happens to the simulated data? 12 | 6. Set the intercept to 0, the group difference to 2, the slope to 2. 13 | Change the interaction. What do you observe for a negative and positive interaction? 14 | 8. Set the intercept to 0, the group difference to 2, the slope to 2 and the interaction to 0. Change $\sigma$. What is happening? 15 | 16 | 17 | #### Model fitting 18 | 19 | 1- `Simulate data`: Reset the app (press `F5`). Set the intercept to 1 and the slope to -1. 20 | 21 | `Fit model`: Change the model formula between `y ~ x` and `y ~ 1`. What is the difference between the fitted values? 22 | 23 | `Model summary`: Which model has the lower AIC? Where can you read the result for the estimate of $\sigma$? 24 | 25 | `Model coefficients`: Compare the estimates of the regression coefficients. Which model recovers the coefficients? 26 | 27 | `Model diagnostics`: Inspect the differences between the two models. Why is there no *Residuals vs. Leverage* plot? 28 | 29 | 2- `Simulate data`: Keep the intercept at 1 and set the slope to -1. Set the group difference to 2. 
30 | 31 | `Fit model`: What is the difference in the plotted model between `y ~ x`, `y ~ fac`, `y ~ x + fac` and `y ~ x + fac + x:fac` as model formula? 32 | 33 | `Model summary`: What is the difference between the model formulas `y ~ x`, `y ~ fac`, `y ~ x + fac` and `y ~ x + fac + x:fac` regarding the coefficients? What is the slope for group B? 34 | 35 | `Model diagnostics`: Inspect the differences between the models? 36 | 37 | 3- `Simulate data`: Keep as is and set the interaction to -1. 38 | 39 | `Fit model`: What is the difference between `y ~ x + fac` and `y ~ x + fac + x:fac` as model formula? 40 | 41 | `Model summary`: What is the difference in AIC and the estimate of $\sigma$ between `y ~ x + fac` and `y ~ x + fac + x:fac` as fitted terms? What is the slope for group B? 42 | 43 | `Model coefficients`: What is the difference between the models `y ~ x + fac` and `y ~ x + fac + x:fac`? 44 | 45 | `Model diagnostics`: What is the difference between the models `y ~ x + fac` and `y ~ x + fac + x:fac`? 46 | 47 | 48 | 4- `Simulate data`: Keep as is but set the interaction to 1. 49 | 50 | `Model summary`: What is the slope for group B? 51 | 52 | 53 | 54 | 5- `Simulate data`: Keep all settings. Set $\sigma$ to the following levels: 0.2, 1.5 and 3. 55 | 56 | `Fit model`: What do you observe for the `y ~ x + fac + x:fac` model? 57 | 58 | `Model summary`: What do you observe for the `y ~ x + fac + x:fac` model? 59 | 60 | `Model coefficients`: What do you observe for the `y ~ x + fac + x:fac` model? 61 | 62 | `Model diagnostics`: What do you observe for the `y ~ x + fac + x:fac` model? 63 | 64 | 65 | 6- `Simulate data`: Keep all settings. Set the number of observations to the following levels: 10, 100, 500, 1000. 66 | 67 | `Fit model`: What do you observe for the `y ~ x + fac + x:fac` model? 68 | 69 | `Model summary`: What do you observe for the `y ~ x + fac + x:fac` model? 70 | 71 | `Model coefficients`: What do you observe for the `y ~ x + fac + x:fac` model? 72 | 73 | `Model diagnostics`: What do you observe for the `y ~ x + fac + x:fac` model? 74 | 75 | 76 | 77 | 78 | ### Count data models 79 | 80 | Ignore the `Model coefficients` for these models. 81 | 82 | ### Data generation 83 | 84 | 7- `Simulate data`: Reset the app. Set family to `Poisson` and Link function to `log`. 85 | 86 | What is the difference in the generated data compared to data from the Gaussian model with identity link? What happens in the Poisson GLM if you change the intercept to -3 or 3 (mind the scale)? 87 | 88 | 89 | ### Model fitting 90 | 91 | 92 | 8- `Simulate data`: Set family to `Poisson` and Link function to `log`, the intercept to -1, the slope to 0.3 and the interaction to -0.1. 93 | 94 | `Fit model`: Set family to `Poisson` and fit the models `y ~ x + fac` and `y ~ x + fac + x:fac`. What is the difference between the link functions? 95 | 96 | `Model summary`: What is the difference in AIC between the link functions for the models `y ~ x + fac` and `y ~ x + fac + x:fac`? 97 | 98 | `Model diagnostics`: What is the difference between the link functions for the model `y ~ x + fac`? How can we spot an incorrectly specified link? 99 | 100 | 101 | 9- `Simulate data`: Keep settings as is. 102 | 103 | `Fit model`: Compare the `Poisson` model with `log` link to the `Gaussian` model with `identity` link for the model formula `y ~ x + fac`. Are negative values meaningful? 
104 | 
105 | Note that you cannot compare the AIC between the linear (Gaussian) model and the Poisson GLM, as the likelihood functions differ and a comparison is therefore not meaningful. Hence, ignore the `Model summary`.
106 | 
107 | `Model diagnostics`: Compare the related model diagnostics.
108 | 
109 | 
110 | 10- `Simulate data`: Keep settings as is.
111 | 
112 | `Fit model`: Compare the `Poisson` model with `log` link to the `Gaussian` model with `log` link for the model formula `y ~ x + fac`. Which model fits better?
113 | 
114 | Note that you cannot compare the AIC between the linear (Gaussian) model and the Poisson GLM, as the likelihood functions differ and a comparison is therefore not meaningful. Hence, ignore the `Model summary`.
115 | 
116 | `Model diagnostics`: Compare the `Poisson` model with `log` link to the `Gaussian` model with `log` link for the model formula `y ~ x + fac`. Which model violates assumptions?
117 | 
118 | 
119 | 11- `Simulate data`: Set family to `Negative binomial` and Link function to `log`, the intercept to -2, the slope to 0.2 and the interaction to -0.1.
120 | 
121 | `Fit model`: Compare the `Poisson` model with `log` link to the `Negative binomial` model with `log` link for the model formulas `y ~ x + fac` and `y ~ x + fac + x:fac`. Also check the prediction bands. Does the Poisson model provide good predictions?
122 | 
123 | `Model summary`: Compare the `Poisson` model with `log` link to the `Negative binomial` model with `log` link for the model formula `y ~ x + fac`. Compare the estimates and the standard errors. Which model gives a lower standard error? What are the consequences? What about the ignored overdispersion (residual deviance >> the degrees of freedom) in the Poisson model?
124 | 
125 | `Model diagnostics`: How can you detect the overdispersion of the Poisson model? Which plots are useful?
126 | 
127 | 
128 | 12- `Simulate data`: Set family to `Negative binomial` and Link function to `log`, the intercept to -2, the slope to 0.2 and the interaction to -0.1.
129 | 
130 | `Fit model`: Fit the negative binomial model `y ~ x + fac + x:fac`. Compare different values of $\kappa$ (e.g. 0.3, 1, 1.5, 3). What does $\kappa$ control?
131 | 
132 | `Model summary`: How does $\kappa$ affect the summary?
133 | 
134 | 
135 | 
136 | 13- `Simulate data`: Keep as is.
137 | 
138 | `Fit model`: Fit the Poisson model `y ~ x + fac`. Compare the model fits including prediction bands for different values of $\kappa$ (e.g. 0.3, 1, 1.5, 3).
139 | 
140 | `Model summary`: How does $\kappa$ affect the summary?
141 | 
142 | `Model diagnostics`: At which value of $\kappa$ does the overdispersion of the Poisson model disappear?
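
If you want to verify your conclusions outside the app, the negative binomial simulation behind exercises 11-13 can be sketched in a few lines of plain R. This is a minimal sketch with hypothetical parameter values; note that `rnbinom()` parameterises the dispersion via `size`, which plays the role of $\kappa$ here.

```r
library(MASS) # for glm.nb()
set.seed(1)
x <- runif(200, min = 0, max = 20)
mu <- exp(-2 + 0.2 * x)              # log link: intercept -2, slope 0.2
y <- rnbinom(200, mu = mu, size = 1) # overdispersed counts
mod_pois <- glm(y ~ x, family = poisson(link = "log"))
mod_nb <- glm.nb(y ~ x)
# a residual deviance far above the residual degrees of freedom in the
# Poisson summary indicates the overdispersion discussed above
summary(mod_pois)
summary(mod_nb)
```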
143 | 144 | -------------------------------------------------------------------------------- /Code/Session_6/glm_explorer/functions.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(MASS) 3 | library(ggfortify) 4 | library(gridExtra) 5 | library(markdown) 6 | library(DHARMa) 7 | 8 | 9 | #' @param n number of observations 10 | #' @param a intercept 11 | #' @param b_x effect of continuous variable 12 | #' @param b_fac effect of categorial variable 13 | #' @param b_int interaction effect 14 | #' @param link link function 15 | #' @param family error distribution function 16 | datagen <- function(n = 100, 17 | a = 0, 18 | b_x = 2, 19 | b_fac = -1, 20 | b_int = -3, 21 | link = c('identity', 'log'), 22 | family = c('gaussian', 'poisson', 'negbin'), 23 | sigma = 1, 24 | dispersion = 4) { 25 | link <- match.arg(link) 26 | family <- match.arg(family) 27 | 28 | x <- runif(n, min = 0, max = 20) 29 | fac1 <- sample(c('A', 'B'), n, replace = TRUE) 30 | fac <- factor(fac1) 31 | fac_dummy <- ifelse(fac1 == 'A', 0, 1) 32 | 33 | # mean 34 | link_mu <- a + b_x*x + b_fac*fac_dummy + b_int*fac_dummy*x 35 | mu <- switch(link, 36 | identity = link_mu, 37 | log = exp(link_mu)) 38 | if (family %in% c('poisson', 'negbin') && any(mu < 0)) 39 | stop("Cannot simulate Poisson or NegBin with negative mean. 40 | Try changing the link function.") 41 | # response 42 | y <- switch(family, 43 | poisson = rpois(n, mu), 44 | gaussian = rnorm(n, mean = mu, sd = sigma), 45 | negbin = rnbinom(n, mu = mu, size = 1/dispersion)) 46 | 47 | # return 48 | df <- data.frame(x, y, fac) 49 | return(df) 50 | } 51 | 52 | 53 | 54 | #' @param df data.frame as returned by datagen 55 | #' @param link link function 56 | #' @param family error distribution function 57 | datamodel <- function(df, 58 | family = c('gaussian', 'poisson', 'negbin'), 59 | link = c('identity', 'log'), 60 | terms = c('intercept', 'x', 'fac', 'both', 'interaction')){ 61 | link <- match.arg(link) 62 | family <- match.arg(family) 63 | terms <- match.arg(terms) 64 | form <- switch(terms, 65 | intercept = as.formula(y ~ 1), 66 | x = as.formula(y ~ x), 67 | fac = as.formula(y ~ fac), 68 | both = as.formula(y ~ x+fac), 69 | interaction = as.formula(y ~ x+fac+x:fac) 70 | ) 71 | start <- switch(terms, 72 | intercept = 1, 73 | x = rep(1, 2), 74 | fac = rep(1, 2), 75 | both = rep(1, 3), 76 | interaction = rep(1, 4) 77 | ) 78 | 79 | mod <- switch(family, 80 | poisson = glm(form, data = df, family = poisson(link = link), 81 | start = start), 82 | gaussian = glm(form, data = df, family = gaussian(link = link), 83 | start = start), 84 | negbin = glm.nb(form, data = df), link = link) 85 | return(mod) 86 | } 87 | 88 | mod_char <- function(family = c('gaussian', 'poisson', 'negbin'), 89 | link = c('identity', 'log'), 90 | terms = c('intercept', 'x', 'fac', 'both', 'interaction')) { 91 | form <- switch(terms, 92 | intercept = "y ~ 1", 93 | x = "y ~ x", 94 | fac = "y ~ fac", 95 | both = "y ~ x + fac", 96 | interaction = "y ~ x + fac + x:fac" 97 | ) 98 | 99 | 100 | switch(family, 101 | negbin = paste0("glm.nb(", form, ", data = df, link = ", link, ")"), 102 | gaussian = paste0("glm(", form, ", data = df, gaussian(link = ", link, ")"), 103 | poisson = paste0("glm(", form, ", data = df, poisson(link = ", link, ")") 104 | ) 105 | } 106 | 107 | 108 | #' @param df data.frame as returned by datagen 109 | #' @param mod model as returned by datamodel 110 | #' @param show_pred logical; show prediction band? 
111 | #' @param lim ylimits of plot 112 | dataplot <- function(df, mod = NULL, show_pred, ylim = NULL 113 | ) { 114 | # model fit + ci 115 | pdat <- expand.grid(x = seq(min(df$x), max(df$x), 116 | length.out = 100), 117 | fac = levels(df$fac)) 118 | pdat$fit <- predict(mod, newdata = pdat, type = "link") 119 | pdat$se <- predict(mod, newdata = pdat, type = "link", se.fit = TRUE)$se.fit 120 | mod_fam <- mod$family$family 121 | mod_fam <- ifelse(grepl('Negative Binomial', mod_fam), 'negbin', mod_fam) 122 | # 95% CI 123 | crit <- switch(mod_fam, 124 | gaussian = qt(0.975, df = mod$df.residual), 125 | poisson = qnorm(0.975), 126 | negbin = qt(0.975, df = mod$df.residual)) 127 | pdat$lwr <- pdat$fit - crit * pdat$se 128 | pdat$upr <- pdat$fit + crit * pdat$se 129 | pdat$fit_r <- mod$family$linkinv(pdat$fit) 130 | pdat$lwr_r <- mod$family$linkinv(pdat$lwr) 131 | pdat$upr_r <- mod$family$linkinv(pdat$upr) 132 | 133 | p <- ggplot() + 134 | geom_line(data = pdat, aes(x = x, y = fit_r, col = fac)) + 135 | geom_line(data = pdat, aes(x = x, y = upr_r, col = fac), 136 | linetype = 'dashed') + 137 | geom_line(data = pdat, aes(x = x, y = lwr_r, col = fac), 138 | linetype = 'dashed') + 139 | geom_point(data = df, aes(x = x, y = y, color = fac)) + 140 | labs(y = 'y') + 141 | # ylim(lim) + 142 | theme_bw() + 143 | geom_hline(aes(yintercept = 0), linetype = 'dotted') + 144 | geom_vline(aes(xintercept = 0), linetype = 'dotted') 145 | 146 | if (!is.null(ylim)) { 147 | p <- p + 148 | ylim(ylim) 149 | } 150 | 151 | # simulate from model for PI 152 | if (show_pred) { 153 | nsim <- 1000 154 | y_sim <- simulate(mod, nsim = nsim) 155 | y_sim_minmax <- apply(y_sim, 1, quantile, probs = c(0.025, 0.975)) 156 | simdat <- data.frame(ysim_min = y_sim_minmax[1, ], 157 | ysim_max = y_sim_minmax[2, ], 158 | x = df$x, 159 | fac = df$fac) 160 | p <- p + 161 | geom_ribbon(data = simdat, aes(x = x, ymax = ysim_max, ymin = ysim_min, 162 | fill = fac), alpha = 0.2) 163 | } 164 | 165 | p 166 | } 167 | 168 | range_warn <- function(df){ 169 | if (any(df$y > 10) | any(df$y < -10)) 170 | return('*** Simulated data out of plotting range!. \n Not all data points are displayed. 
Models are fitted to all data.***') 171 | } 172 | 173 | 174 | rawplot <- function(df, ylim = NULL) { 175 | p <- ggplot() + 176 | geom_point(data = df, aes(x = x, y = y, color = fac)) + 177 | labs(y = 'y') + 178 | # ylim(lim) + 179 | theme_bw() + 180 | geom_hline(aes(yintercept = 0), linetype = 'dotted') + 181 | geom_vline(aes(xintercept = 0), linetype = 'dotted') 182 | 183 | if (!is.null(ylim)) { 184 | p <- p + 185 | ylim(ylim) 186 | } 187 | p 188 | } 189 | 190 | coefplot <- function(a = 2, 191 | b_x = 1, 192 | b_fac = 1, 193 | b_int = -2, 194 | mod) { 195 | coefs <- coef(mod) 196 | se <- diag(vcov(mod))^0.5 197 | terms <- c('Intercept', 'x', 'fac', 'x:fac') 198 | terms <- terms[seq_along(coefs)] 199 | truths <- c(a, b_x, b_fac, b_int) 200 | truths <- truths[seq_along(coefs)] 201 | df <- data.frame(term = terms, estimate = coefs, se, truths) 202 | mod_fam <- mod$family$family 203 | mod_fam <- ifelse(grepl('Negative Binomial', mod_fam), 'negbin', mod_fam) 204 | crit <- switch(mod_fam, 205 | gaussian = qt(0.975, df = mod$df.residual), 206 | poisson = qnorm(0.975), 207 | negbin = qt(0.975, df = mod$df.residual)) 208 | df$lwr <- df$estimate - crit * df$se 209 | df$upr <- df$estimate + crit * df$se 210 | tlev <- levels(df$term) 211 | #reorder levels 212 | df$term <- factor(df$term, levels = c('Intercept', 'x', 'fac', 'x:fac')) 213 | # df$term <- factor(df$term, levels = c(tlev[which(levels(df$term) == 'Intercept')], 214 | # tlev[which(levels(df$term) == 'x')], 215 | # tlev[which(levels(df$term) == 'fac')], 216 | # tlev[which(levels(df$term) == 'x:fac')])) 217 | p <- ggplot(df, aes(x = term)) + 218 | geom_pointrange(aes(y = estimate, ymax = upr, ymin = lwr)) + 219 | geom_point(aes(y = truths), col = 'red') + 220 | geom_hline(yintercept = 0, linetype = 'dashed') + 221 | coord_flip() + 222 | theme_bw() + 223 | labs(x = 'Coefficient', y = 'Value') + 224 | scale_x_discrete(breaks = c('Intercept', 'x', 'fac', 'x:fac')) 225 | p 226 | } 227 | 228 | diagplot <- function(df, mod) { 229 | par(mfrow = c(1,2)) 230 | plot(mod, which = 1) 231 | plot(df$y, 232 | predict(mod, type = 'response'), 233 | main = 'Observed vs. Predicted', 234 | xlab = 'Observed', 235 | ylab = 'Predicted') 236 | abline(0, 1, col = 'red') 237 | } 238 | 239 | diagplot2 <- function(df, mod) { 240 | par(mfrow = c(1,2)) 241 | plot(mod, which = 2) 242 | plot(mod, which = 5) 243 | 244 | } 245 | 246 | dharmaplot <- function(mod) { 247 | simulationOutput <- simulateResiduals(fittedModel = mod, n = 200) 248 | plotSimulatedResiduals(simulationOutput = simulationOutput) 249 | } 250 | 251 | 252 | chk_pos <- function(y, fam) { 253 | if (fam %in% c('poisson', 'negbin') && any(y < 0)) { 254 | "Negative values in data. Cannot fit Poisson or NegBin." 255 | } else { 256 | NULL 257 | } 258 | } 259 | -------------------------------------------------------------------------------- /Code/Session_6/glm_explorer/introduction.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | This application allows you to simulate data with known effects and properties. Subsequently, you can explore the fit of Generalized Linear Models (GLMs) with different specifications to these data. Hence, you can explore interactively what happens if you fit a GLM with an assumed distribution that matches or does not match the distribution of the simulated data. In particular, you can inspect the resulting diagnostic plots. For example, you can check how diagnostic plots look if you specify an incorrect model. 
4 | Overall, you will gain a better understanding of GLMs, their application and diagnostics.
5 | 
6 | 
7 | To use the app, click the `Modelling` tab in the top navigation bar.
8 | You can modify the data and model in two modules (tabs):
9 | 
10 | 1. `Simulate`
11 | 2. `Model`
12 | 
13 | Under `Simulate` you can specify the properties of the simulated data, including error structure, model parameters and sample size.
14 | Under `Model` you can specify the GLM that is fitted to the simulated data.
15 | Additional tabs provide you with model information. The `Model summary` tab provides the output of the `summary()` function, `Model coefficients` provides a visual overview of the estimated regression coefficients and `Model diagnostics` provides diagnostic plots for the fitted model. You can either explore for yourself or follow the `Exercises` (see tab). I recommend following the exercises, as they are designed to provide you with the most relevant insights.
16 | 
17 | 
18 | 
19 | ## Data Simulation
20 | 
21 | Data are generated from a GLM (`Simulate` tab).
22 | The simulated data depend on two predictors (one continuous *x* and one categorical *fac*) and their interaction (*x:fac*), unless the related regression coefficient is set to 0.
23 | You can vary the number of observations (i.e. sample size), the distribution from which the data originate, the link function and the variability within the data.
24 | 
25 | The simulation model can be written as:
26 | 
27 | $$Y \sim D(\mu)$$
28 | $$\eta = g(\mu)$$
29 | $$\eta = \beta_{0} + \beta_{1} x + \beta_{2} \text {fac} + \beta_{3} \: x \: \text {fac}$$
30 | 
31 | The distribution of the response ($D()$) can be set to:
32 | 
33 | 1. Gaussian (i.e. Normal distribution): Normal$(\mu, \sigma)$
34 | 2. Poisson distribution: Pois$(\mu)$
35 | 3. Negative Binomial distribution: Neg.Bin$(\mu, \kappa)$
36 | 
37 | The link function $g()$ can be set to:
38 | 
39 | 1. identity link: $g(\mu) = \mu$
40 | 2. log link: $g(\mu) = \log_e(\mu)$
41 | 
42 | For ordinary linear regression you need to select `Gaussian` and the `identity` link. If you use a log link, you should set a small slope (i.e. $\beta_1$) of around 0.1 and a small interaction term (i.e. $\beta_3$) of 0 or 0.1, otherwise you won't see much of the variation.
43 | 
44 | The effects of the continuous and categorical variables *x* and *fac* can be set via the related regression coefficients:
45 | 
46 | 1. Intercept: Intercept ($\beta_0$)
47 | 2. Continuous predictor: Slope ($\beta_1$)
48 | 3. Categorical predictor with two levels (A and B): Group difference ($\beta_2$)
49 | 4. The interaction between the continuous and categorical predictor: Interaction between *x* and *fac* ($\beta_3$)
50 | 
51 | Additionally, the variance can be varied (via $\sigma$ for normally distributed data and $\kappa$ for negative binomial data).
52 | 
53 | All of these specifications can be made under the `Simulate` tab, on the left.
54 | To the right you will see a plot of the simulated data.
55 | 
56 | 
57 | ## Fit a model to these data
58 | 
59 | In the `Model` tab you can set, on the left, the specifications of the GLM, including the distribution, link and predictors.
60 | On the right you will see:
61 | 
62 | 1. The simulated data
63 | 2. The fitted model (solid line)
64 | 3. The pointwise 95% Confidence Interval (dashed line)
65 | 
66 | Optional:
67 | 4. The 95% Prediction Interval (PI, shaded band). The PI is based on simulations from the model.
68 | 
69 | Below the plot you find the corresponding R command.
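
For example, for a Poisson GLM with log link and both predictors (without interaction), the displayed command corresponds to something like the sketch below, where `df` stands for the app's simulated data frame with columns `y`, `x` and `fac`:

```r
mod <- glm(y ~ x + fac, data = df, family = poisson(link = "log"))
```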
70 | 
71 | 
72 | ## Model diagnostics
73 | 
74 | The application provides several model outputs and diagnostics:
75 | 
76 | 1. `Model summary`
77 | 2. `Model coefficients`
78 | 3. `Model diagnostics`
79 | 
80 | Within each module, you can change the fitted model on the left to see how this affects the output and diagnostics.
81 | 
82 | 
83 | `Model summary` prints the `summary()` for the model.
84 | 
85 | `Model coefficients` shows a plot of the estimated coefficients (black dots),
86 | their 95% confidence intervals (black lines) and the true (= value in the statistical population used for simulation) coefficients (red dots).
87 | 
88 | `Model diagnostics` shows diagnostic plots from base R at the top:
89 | 
90 | 1. (Pearson) Residuals vs. Fitted values
91 | 2. Observed vs. Predicted (Fitted) values
92 | 3. QQ-plot
93 | 4. (Pearson) Residuals vs. Leverage
94 | 
95 | 
96 | Below you find two plots from the [DHARMa package](https://cran.r-project.org/web/packages/DHARMa/index.html) for diagnosing GL(M)Ms.
97 | The left plot shows a QQ-plot for *randomized quantile residuals*, the right plot the *randomized quantile residuals* vs. fitted values.
98 | 
99 | 
100 | ## Exercises
101 | 
102 | You find several exercises under the `Exercises` tab.
103 | 
104 | You can also find them [here](https://raw.githubusercontent.com/rbslandau/Data_analysis/master/Code/Session_6/glm_explorer/exercises.md). Working through the exercises is most convenient if you have them open in a separate (browser) window. You can also copy and paste the text into a text editor, or download it to a file and open it in a suitable program.
105 | 
106 | You can always reset the app by pressing `F5` in your browser.
107 | 
108 | 
109 | 
110 | ## Meta
111 | This app was created using [Shiny](https://shiny.rstudio.com/) from RStudio.
112 | It was originally written by [Eduard Szöcs](http://edild.github.io/) and has been modified by [Ralf B. Schäfer](https://github.com/rbslandau). It is licensed under [MIT](https://opensource.org/licenses/MIT).
113 | 
114 | 

--------------------------------------------------------------------------------
/Code/Session_6/glm_explorer/server.R:
--------------------------------------------------------------------------------

1 | 
2 | # This is the server logic for a Shiny web application.
3 | # You can find out more about building applications with Shiny here: 4 | # 5 | # http://shiny.rstudio.com 6 | # 7 | 8 | library(shiny) 9 | source('functions.R') 10 | 11 | 12 | shinyServer(function(input, output) { 13 | df <- reactive({ 14 | datagen(n = input$n, 15 | a = input$a, 16 | b_x = input$b_x, 17 | b_fac = input$b_fac, 18 | b_int = input$b_int, 19 | link = input$link, 20 | family = input$family, 21 | sigma = input$sigma, 22 | dispersion = input$dispersion) 23 | }) 24 | 25 | mod <- reactive({ 26 | validate( 27 | chk_pos(df()$y, input$family_mod) 28 | ) 29 | 30 | datamodel(df(), 31 | family = input$family_mod, 32 | link = input$link_mod, 33 | terms = input$terms_mod) 34 | }) 35 | 36 | 37 | output$Plot_model <- renderPlot({ 38 | rawplot(df()) 39 | }) 40 | 41 | output$Plot_model2 <- renderPlot({ 42 | dataplot(df(), mod(), show_pred = input$show_pred) 43 | }) 44 | 45 | output$Summary <- renderPrint({ 46 | summary(mod()) 47 | }) 48 | 49 | output$model_char <- renderText({ 50 | mod_char(family = input$family_mod, 51 | link = input$link_mod, 52 | terms = input$terms_mod) 53 | }) 54 | 55 | output$range_warn <- renderText({ 56 | range_warn(df()) 57 | }) 58 | 59 | 60 | output$Plot_coefs <- renderPlot({ 61 | coefplot(a = input$a, 62 | b_x = input$b_x, 63 | b_fac = input$b_fac, 64 | b_int = input$b_int, 65 | mod = mod()) 66 | }) 67 | 68 | output$Plot_diag <- renderPlot({ 69 | diagplot(df(), mod()) 70 | }) 71 | 72 | output$Plot_diag2 <- renderPlot({ 73 | diagplot2(df(), mod()) 74 | }) 75 | 76 | output$Plot_dharma <- renderPlot({ 77 | validate( 78 | need(input$terms_mod != 'intercept', 79 | "DHARMa not working for intercept only models.") 80 | ) 81 | dharmaplot(mod()) 82 | }) 83 | }) 84 | -------------------------------------------------------------------------------- /Code/Session_6/glm_explorer/shiny.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | -------------------------------------------------------------------------------- /Code/Session_6/glm_explorer/ui.R: -------------------------------------------------------------------------------- 1 | 7 2 | # This is the user-interface definition of a Shiny web application. 
3 | # You can find out more about building applications with Shiny here: 4 | # 5 | # http://shiny.rstudio.com 6 | # 7 | 8 | library(shiny) 9 | 10 | shinyUI(navbarPage("Explore Generalized Linear Models", 11 | tabPanel("Introduction", 12 | withMathJax(), 13 | includeMarkdown("introduction.md") 14 | ), 15 | tabPanel("Modelling", 16 | sidebarLayout( 17 | sidebarPanel( 18 | conditionalPanel(condition="input.conditionedPanels==1", 19 | h3('Simulated data'), 20 | selectInput("family", 21 | "Family:", 22 | c("Gaussian" = "gaussian", 23 | "Poisson" = "poisson", 24 | "Negative binomial" = "negbin"), 25 | "gaussian", 26 | FALSE 27 | ), 28 | selectInput("link", 29 | "Link function:", 30 | c("identity" = "identity", 31 | "log" = "log"), 32 | "identity", 33 | FALSE 34 | ), 35 | sliderInput("n", 36 | "Number of observations:", 37 | min = 10, 38 | max = 1000, 39 | value = 200, 40 | step = 10), 41 | sliderInput("a", 42 | HTML("Intercept (β0):"), 43 | min = -3, 44 | max = 3, 45 | value = 0, 46 | step = 0.5), 47 | sliderInput("b_x", 48 | HTML("Slope (β1):"), 49 | min = -2, 50 | max = 2, 51 | value = 0.3, 52 | step = 0.1), 53 | sliderInput("b_fac", 54 | HTML("Group difference (β2):"), 55 | min = -2, 56 | max = 2, 57 | value = 0, 58 | step = 0.1), 59 | sliderInput("b_int", 60 | HTML("Interaction between x and fac (β3):"), 61 | min = -3, 62 | max = 3, 63 | value = 0, 64 | step = 0.1), 65 | sliderInput("sigma", 66 | HTML("σ (only Gaussian):"), 67 | min = 0, 68 | max = 3, 69 | value = 1, 70 | step = 0.1), 71 | sliderInput("dispersion", 72 | HTML("κ (only Negative Binomial):"), 73 | min = 0, 74 | max = 3, 75 | value = 1.5, 76 | step = 0.1) 77 | ), 78 | conditionalPanel(condition = "input.conditionedPanels==2", 79 | h3('Fit model'), 80 | selectInput("family_mod", 81 | "Family:", 82 | c("Gaussian" = "gaussian", 83 | "Poisson" = "poisson", 84 | "Negative binomial" = "negbin"), 85 | "gaussian", 86 | FALSE), 87 | selectInput("link_mod", 88 | "Link function:", 89 | c("identity" = "identity", 90 | "log" = "log"), 91 | "identity", 92 | FALSE), 93 | selectInput("terms_mod", 94 | "Model formula:", 95 | c("y ~ 1 (Intercept only)" = "intercept", 96 | "y ~ x" = "x", 97 | "y ~ fac" = "fac", 98 | "y ~ x + fac" = "both", 99 | "y ~ x + fac + x:fac" = "interaction"), 100 | "x", 101 | FALSE), 102 | checkboxInput("show_pred", "Show 95% prediction intervals?", FALSE) 103 | ) 104 | ), 105 | mainPanel( 106 | tabsetPanel( 107 | tabPanel("Simulate data", value = 1, 108 | plotOutput("Plot_model") 109 | # , verbatimTextOutput("range_warn") 110 | ), 111 | tabPanel("Fit model", value = 2, 112 | plotOutput("Plot_model2"), 113 | verbatimTextOutput("model_char") 114 | ), 115 | tabPanel("Model summary", value = 2, 116 | verbatimTextOutput("Summary") 117 | ), 118 | tabPanel("Model coefficients", value = 2, 119 | plotOutput("Plot_coefs") 120 | ), 121 | tabPanel("Model diagnostics", value = 2, 122 | plotOutput("Plot_diag"), 123 | plotOutput("Plot_diag2"), 124 | plotOutput("Plot_dharma") 125 | ), 126 | id = "conditionedPanels" 127 | ) 128 | ) 129 | ) 130 | ), 131 | tabPanel("Exercises", 132 | includeMarkdown("exercises.md") 133 | ) 134 | )) 135 | -------------------------------------------------------------------------------- /Code/Session_6_7_exerc.R: -------------------------------------------------------------------------------- 1 | ############################### 2 | # R exercise Session 6 & 7 # 3 | ############################### 4 | # Ralf B. 
Schäfer, 20.12.2019
5 | 
6 | #############################################################################
7 | # Exercise: Climate change influences the distribution of species.          #
8 | # Predicting potential changes in the distribution requires knowledge of    #
9 | # the relationship between occurrence and climatic variables. In this       #
10 | # exercise we aim to identify the most important climatic variables that    #
11 | # determine the distribution of three-toed sloths, in particular of         #
12 | # Bradypus variegatus, which occurs in Central and South America.           #
13 | # First, we retrieve the data.                                              #
14 | #############################################################################
15 | 
16 | 
17 | 
18 | ##############################
19 | #  Preparation for Exercise  #
20 | ##############################
21 | 
22 | # Read data set that consists of presence-absence data for the Bradypus (column: pa),
23 | # which has been taken from https://www.gbif.org, and climatic variables, taken from http://www.worldclim.org.
24 | # See http://www.worldclim.org/bioclim regarding the meaning of the individual climatic variables.
25 | 
26 | pa_data <- read.csv("https://raw.githubusercontent.com/rbslandau/Data_analysis/master/Data/envtrain.csv")
27 | 
28 | ## Check data
29 | head(pa_data)
30 | str(pa_data)
31 | 
32 | # Convert variable biome to factor
33 | pa_data$biome <- factor(pa_data$biome)
34 | 
35 | 
36 | #############################################################################
37 | # Identify the climatic variable(s) that is/are most important for the      #
38 | # occurrence of the Bradypus!                                               #
39 | # (1) Use a GLM to model the presence-absence as response variable and      #
40 | # the climatic variables as explanatory variables. Select a method of your  #
41 | # choice for model selection and run model diagnostics afterwards.          #
42 | # Plot and interpret the model with effect plots.                           #
43 | # (2) Repeat the same analysis using a CART (optional: Random Forest).      #
44 | # Compare the results and the prediction accuracy of both models.
# 45 | ############################################################################# 46 | -------------------------------------------------------------------------------- /Code/Session_7/Session_7_extr.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include = FALSE, purl = TRUE--------------------------------- 2 | knitr::opts_chunk$set(echo = FALSE) 3 | library(learnr) 4 | library(DAAG) 5 | library(rpart) 6 | library(party) 7 | library(partykit) 8 | library(dplyr) 9 | library(InformationValue) 10 | 11 | ## ----load_data, include = TRUE, echo = TRUE, purl = TRUE----------------- 12 | library(DAAG) 13 | # display first rows 14 | head(frogs) 15 | 16 | ## ----data_prep, include = FALSE, eval = TRUE, echo = FALSE, purl = TRUE, context = "setup"---- 17 | library(dplyr) 18 | frogs_new <- frogs %>% 19 | mutate(mean_av = meanmin + meanmax) 20 | predictors <- frogs_new[ , c(5:8, 11)] 21 | resp_frogs <- frogs$pres.abs 22 | 23 | ## ----fit_tree, include = TRUE, echo = TRUE, purl = TRUE, context = "setup"---- 24 | library(rpart) 25 | set.seed(3333) # make example reproducible (cross-validation, which will be called later) 26 | frog_tree <- rpart(resp_frogs ~ ., data = predictors, method = "class", control = rpart.control(minsplit = 10)) 27 | 28 | ## ----plot_tree_large, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 29 | par(cex = 1.2, mar = c(0, 0, 0, 0)) 30 | plot(frog_tree, margin = 0.05) 31 | text(frog_tree) 32 | 33 | ## ----cross_val_tree, include = TRUE, echo = TRUE, purl = TRUE------------ 34 | plotcp(frog_tree) 35 | 36 | ## ----prune_tree, include = TRUE, echo = TRUE, purl = TRUE---------------- 37 | tree_pruned <- prune(frog_tree, cp = 0.031) 38 | # plot result 39 | par(cex = 1.1, mar = c(0, 0, 0, 0)) 40 | plot(tree_pruned, margin = 0.05) 41 | text(tree_pruned) 42 | 43 | ## ----nicer_tree, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 44 | par(cex = 1.1, mar = c(0, 0, 0, 0)) 45 | plot(tree_pruned, uniform = TRUE, branch = 0.35, margin = 0.05) 46 | text(tree_pruned, pretty = 1, all = T, use.n = T, fancy = T) 47 | 48 | ## ----nicer_tree_2, include = TRUE, echo = TRUE, purl = TRUE-------------- 49 | library(rpart.plot) 50 | prp(tree_pruned, type = 2, extra = 1) 51 | 52 | ## ----accurac_tree, include = TRUE, echo = TRUE, purl = TRUE-------------- 53 | printcp(tree_pruned) 54 | 55 | ## ----calc_predict_tree, include = TRUE, echo = TRUE, purl = TRUE--------- 56 | # predict response 57 | pred_prunded <- predict(tree_pruned, type = "class") 58 | # mean of logical test for equality of predicted and observed response 59 | # if values match, returns 1, else 0 60 | # the mean is equivalent to dividing the number of matches by the number of comparisons 61 | mean(pred_prunded == resp_frogs) 62 | # 1 - mean yields to (estimate of) the error rate 63 | 1 - mean(pred_prunded == resp_frogs) 64 | 65 | ## ----transformation, include = TRUE, echo = TRUE, purl = TRUE, context = "setup"---- 66 | # log transformation of NoOfPools and distance, 67 | logNoPools <- log(frogs$NoOfPools) 68 | logdistance <- log(frogs$distance) 69 | # create easily accessible data set 70 | mod_glm_dat <- data.frame(predictors[ , c(3:5)], logNoPools, logdistance, resp_frogs) 71 | 72 | ## ----GLM_predict, include = TRUE, echo = TRUE, purl = TRUE-------------- 73 | pred_glm <- predict(frog.glm3, type = "response") 74 | 75 | ## ----GLM_predict_convert, include = TRUE, echo = TRUE, purl = TRUE------ 76 | library(InformationValue) 77 | misClassError(resp_frogs, pred_glm) 
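# Reproducibility note: the object frog.glm3 used in the two chunks above is
# not defined anywhere in this extract; it is fitted step-by-step in the
# tutorial from which this script was generated. To run the extract
# stand-alone, a binomial GLM on mod_glm_dat would have to be fitted before
# the predict() call above. A plausible sketch (an assumption, not
# necessarily the tutorial's final model) would be:
# frog.glm3 <- glm(resp_frogs ~ ., data = mod_glm_dat, family = binomial)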
78 | 79 | ## ----GLM_poptimal_cutoff, include = TRUE, echo = TRUE, purl = TRUE------ 80 | library(InformationValue) 81 | optim_cut_glm <- optimalCutoff(resp_frogs, pred_glm)[1] 82 | # optimal value is 83 | optim_cut_glm 84 | # compute related classification error 85 | misClassError(resp_frogs, pred_glm, optim_cut_glm) 86 | 87 | ## ----GLM_cv_cutoff, include = TRUE, echo = TRUE, purl = TRUE------------ 88 | library(boot) 89 | # define cost function - see help of cv.glm 90 | cost_func <- function(resp_frogs, pi = 0) mean(abs(resp_frogs-pi) > 0.5) 91 | # set seed to make reproducible 92 | set.seed(42) 93 | cv.err <- cv.glm(mod_glm_dat, frog.glm3, cost = cost_func, K = 10) 94 | cv.err$delta[1] 95 | 96 | ## ----class_tree_var_import, include = TRUE, echo = TRUE, purl = TRUE---- 97 | tree_pruned$variable.importance 98 | 99 | ## ----class_tree_summary, include = TRUE, echo = TRUE, purl = TRUE------- 100 | summary(tree_pruned) 101 | 102 | ## ----load_data_2, include = TRUE, echo = TRUE, purl = TRUE, context = "setup"---- 103 | data_oc <- read.csv("https://raw.githubusercontent.com/rbslandau/Data_analysis/master/Data/OstraMRegS400JB.txt", sep = "\t") 104 | data_oc2 <- data_oc[ , !names(data_oc) %in% c("MDS1", "MDS2", "DCA1", "DCA2", "IC","SA", "SR")] 105 | 106 | ## ----reg_tree_fitting, include = TRUE, echo = TRUE, purl = TRUE--------- 107 | # we set a seed to make the example reproducible. 108 | set.seed(21) 109 | reg_ostrac <- rpart(E100 ~ ., data = data_oc2, control = rpart.control(minsplit = 10)) 110 | 111 | ## ----reg_tree_cp, include = TRUE, echo = TRUE, purl = TRUE-------------- 112 | plotcp(reg_ostrac) 113 | 114 | ## ----prune_reg_tree, include = TRUE, echo = TRUE, purl = TRUE----------- 115 | ostrac_pruned <- prune(reg_ostrac, cp = 0.057) 116 | # plot tree 117 | library(rpart.plot) 118 | prp(ostrac_pruned, type = 2, extra = 1) 119 | 120 | ## ----reg_tree_rsqplot, include = TRUE, echo = TRUE, purl = TRUE--------- 121 | # two plots on one page 122 | par(mfrow=c(1,2)) 123 | rsq.rpart(ostrac_pruned) 124 | 125 | ## ----reg_tree_rsq, include = TRUE, echo = TRUE, purl = TRUE------------- 126 | rsq_extr <- printcp(ostrac_pruned) 127 | # extract rsquare values 128 | rsqu_1 <- 1-rsq_extr[, 3:4] 129 | # adjust headers 130 | colnames(rsqu_1) <- c("Rsquare", "Xval Rsquare") 131 | rsqu_1 132 | 133 | ## ----cond_tree, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 134 | library(party) 135 | library(partykit) 136 | # convert response to factor 137 | resp_frogfac <- as.factor(resp_frogs) 138 | frog_ctree <- ctree(resp_frogfac ~ ., data = predictors, control = ctree_control(minsplit = 10)) 139 | plot(frog_ctree) 140 | 141 | ## ----cond_tree_predict, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 142 | print(frog_ctree) 143 | pred_ctree <- predict(frog_ctree) 144 | # calculation of error rate 145 | 1 - mean(pred_ctree == resp_frogs) 146 | 147 | ## ----cond_tree_table, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 148 | table(pred_ctree, resp_frogs) 149 | 150 | ## ----cond_regtree, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 151 | cit_ostrac <- ctree(E100 ~ ., data = data_oc2, control = ctree_control(minsplit = 10)) 152 | 153 | ## ----cond_regtree_plot, include = TRUE, echo = TRUE, eval = FALSE, purl = TRUE---- 154 | ## plot(cit_ostrac) 155 | 156 | ## ----cond_regtree_print, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 157 | print(cit_ostrac) 158 | 159 | ## ----cit_crossval, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 160 | # load 
package 161 | library(caret) 162 | # set parameters for algorithm. See ?trainControl for details. 163 | fitControl <- trainControl(method = "cv", number = 10) 164 | gridcont <- expand.grid(mincriterion = 0.95) 165 | 166 | ## ----cit_crossval_2, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 167 | # set seed for reproducibility 168 | set.seed(2020) 169 | fit.ctree2CV <- train(as.factor(pres.abs) ~ ., data = frogs_new[ , c(1, 5:8, 11)], method = 'ctree', trControl = fitControl, tuneGrid = gridcont) 170 | print(fit.ctree2CV) 171 | 172 | ## ----cit_multi, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 173 | data("HuntingSpiders", package = "partykit") 174 | # create formula 175 | form_multtree <- formula(arct.lute + pard.lugu + zora.spin + pard.nigr + pard.pull + aulo.albi + troc.terr + alop.cune + pard.mont + alop.acce + alop.fabr + arct.peri ~ herbs + reft + moss + sand + twigs + water) 176 | sptree <- ctree(form_multtree, data = HuntingSpiders, teststat = "max", minsplit = 5, pargs = GenzBretz(abseps = .1, releps = .1)) 177 | 178 | ## ----cit_multi_plot, include = TRUE, echo = TRUE, eval = FALSE, purl = TRUE---- 179 | ## plot(sptree, terminal_panel = node_barplot) 180 | 181 | ## ----cit_multi_plot2, include = TRUE, echo = TRUE, eval = FALSE, purl = TRUE---- 182 | ## plot(sptree) 183 | 184 | ## ----inst_mvpart, include = TRUE, echo = TRUE, eval = FALSE, purl = TRUE---- 185 | ## install.packages("devtools") 186 | ## # load package 187 | ## library(devtools) 188 | ## # install mvpart and extension package 189 | ## install_github("cran/mvpart", force = TRUE) 190 | ## install_github("cran/MVPARTwrap", force = TRUE) 191 | 192 | ## ----mvtree_prep, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 193 | # extract spider data 194 | spider_init <- HuntingSpiders[ , 1:12] 195 | # convert to matrix 196 | spiders <- data.matrix(spider_init) 197 | # extract environmental variables 198 | env_vars <- HuntingSpiders[ , 13:18] 199 | 200 | ## ----mvtree_result, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE, context = "setup"---- 201 | library(mvpart) 202 | # set graphic parameter to avoid cluttered plot 203 | par(cex = 0.75) 204 | set.seed(555) 205 | mult_cart <- mvpart(spiders ~. , data = env_vars, xv = "min", minsplit = 5) 206 | 207 | ## ----mvtree_print_results, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 208 | print(mult_cart) 209 | 210 | ## ----mvtree_summary_results, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 211 | summary(mult_cart) 212 | 213 | ## ----mvtree_result2, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 214 | library(mvpart) 215 | # set graphic parameter to avoid cluttered plot 216 | par(cex = 0.75) 217 | set.seed(555) 218 | mult_cart_mcv <- mvpart(spiders ~. 
, data = env_vars, xv = "min", minsplit = 5, xvmult = 100) 219 | 220 | ## ----mvtpart_extract, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 221 | # extract terminal node numbers 222 | term_nod_num <- mult_cart$where 223 | # convert to factor and subsequently extract levels 224 | term_nod_fct <- as.factor(term_nod_num) 225 | groups_mvpart <- levels(term_nod_fct) 226 | # now we have the terminal nodes in an object 227 | groups_mvpart 228 | 229 | ## ----mvtpart_extract_node, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 230 | # select number of terminal node (refers to position in groups_mvpart vector) 231 | leaf_id <- 1 232 | # inspect spider composition 233 | spiders[which(mult_cart$where == groups_mvpart[leaf_id]), ] 234 | # inspect environmental variables 235 | env_vars[which(mult_cart$where == groups_mvpart[leaf_id]), ] 236 | 237 | ## ----mvtpart_tab1, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 238 | leaf_sum <- matrix(data = 0, nrow = length(groups_mvpart), ncol = ncol(spiders)) 239 | # matrix 240 | leaf_sum 241 | # assign column names 242 | colnames(leaf_sum) <- colnames(spiders) 243 | # assign row names from group vector 244 | rownames(leaf_sum) <- groups_mvpart 245 | # look at matrix again 246 | leaf_sum 247 | 248 | ## ----mvtpart_tab2, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 249 | for(i in 1:length(groups_mvpart)) 250 | { 251 | leaf_sum[i, ] <- apply(spiders[which(mult_cart$where == groups_mvpart[i]), ], 2, sum) 252 | } 253 | # look at table with filled information 254 | leaf_sum 255 | 256 | ## ----mvtpart_tab3, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 257 | # first sort vector with information on grouping in ascending order 258 | term_nod_sort <- sort(term_nod_num) 259 | # sum per group level 260 | term_tab <- table(term_nod_sort) 261 | # print table - first row gives group, second row number of observations 262 | term_tab 263 | # extract number of observations 264 | num_obs_term <- as.vector(term_tab) 265 | # divide sums of spider species by number of observations. Vector is applied to columns, which means that the calculations are done node-wise. 
266 | leaf_avg <- leaf_sum/num_obs_term 267 | # look at table with filled information 268 | leaf_avg 269 | 270 | ## ----mvtpart_barplots, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 271 | # set plooting window with the length of the group vector as number of rows 272 | par(mfrow = c(1, 2)) 273 | for (i in 1:2) 274 | { 275 | barplot(leaf_avg[i, ], main = paste("leaf no.", groups_mvpart[i]), las = 2, ylim = c(0, 8)) 276 | } 277 | par(mfrow = c(1, 2)) 278 | for (i in 3:4) 279 | { 280 | barplot(leaf_avg[i, ], main = paste("leaf no.", groups_mvpart[i]), las = 2, ylim = c(0, 8)) 281 | } 282 | par(mfrow = c(1, 2)) 283 | for (i in 5) 284 | { 285 | barplot(leaf_avg[i, ], main = paste("leaf no.", groups_mvpart[i]), las = 2, ylim = c(0, 8)) 286 | } 287 | 288 | ## ----mvtpart_barplots_code, include = TRUE, echo = TRUE, eval = FALSE, purl = TRUE---- 289 | ## # loop for plotting 290 | ## par(mfrow = c(3, 2)) 291 | ## for (i in 1:length(groups_mvpart)) 292 | ## { 293 | ## barplot(leaf_sum[i, ], main = paste("leaf no.", groups_mvpart[i])) 294 | ## } 295 | 296 | ## ----mvrt_wrap, exercise = TRUE, exercise.eval = FALSE, purl = TRUE------ 297 | library(MVPARTwrap) 298 | # use MRT function to extract information from an mvpart object 299 | extract_mrtwrap <- MRT(mult_cart, percent = 10) 300 | 301 | 302 | 303 | ## ----mvrt_cluster, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 304 | trclcomp(mult_cart, method = "com") 305 | 306 | ## ----mvrt_pca, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 307 | rpart.pca(mult_cart, interact = TRUE, wgt.ave = TRUE) 308 | 309 | ## ----rf_ostracod, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 310 | library(randomForest) 311 | set.seed(2019) 312 | ostrac_rf <- randomForest(E100 ~ ., data = data_oc2, importance = TRUE) 313 | print(ostrac_rf) 314 | 315 | ## ----rf_ostra_varimp, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 316 | # putting an expression into brackets means that the resulting object is called after execution 317 | (imp <- importance(ostrac_rf)) 318 | 319 | ## ----rf_ostra_pdp, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 320 | # extract variable names 321 | vars <- rownames(imp) 322 | # order variables by order of importance 323 | # create order vector 324 | imp_seq <- order(imp[ ,1], decreasing = TRUE) 325 | # use order vector to sort variables 326 | impvar <- vars[imp_seq] 327 | # loop to create plots 328 | for (i in seq_along(impvar)) 329 | { 330 | partialPlot(ostrac_rf, pred.data = data_oc2, x.var = impvar[i], xlab = impvar[i], main = paste("Partial Dependence Plot for", impvar[i])) 331 | } 332 | 333 | ## ----read_ec_data, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE, context = "setup"---- 334 | ec <- read.csv("data_EC.csv", header = TRUE, sep = ",") 335 | # overview on variables in dataset 336 | names(ec) 337 | # remove id 338 | ec_1 <- ec %>% select(-site_id) 339 | # extract predictors 340 | predictors_ec <- ec_1 %>% select(-EC_value_fin) 341 | # extract response 342 | resp_ec <- ec_1 %>% select(EC_value_fin) 343 | 344 | ## ----read_ec_data-exercise, exercise = TRUE, exercise.eval = FALSE, purl = TRUE---- 345 | 346 | 347 | ## ----ec_near_zero, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 348 | library(caret) 349 | zero_var_variables <- nearZeroVar(predictors_ec, saveMetrics = TRUE) 350 | zero_var_variables 351 | 352 | ## ----rf_ec, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE------- 353 | set.seed(2019) 354 | ec_rf <- randomForest(EC_value_fin ~ ., data 
= ec_1, importance = TRUE) 355 | print(ec_rf) 356 | 357 | ## ----rf_ec_pdp, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 358 | imp_ec <- importance(ec_rf) 359 | # extract variable names 360 | vars_ec <- rownames(imp_ec) 361 | # order variables by order of importance 362 | # create order vector 363 | imp_seqec <- order(imp_ec[ ,1], decreasing = TRUE) 364 | # use order vector to sort variables 365 | impvarec <- vars_ec[imp_seqec] 366 | # loop to create plots 367 | for (i in seq_along(impvarec)) 368 | { 369 | partialPlot(ec_rf, pred.data = ec_1, x.var = impvarec[i], xlab = impvarec[i], main = paste("Partial Dependence Plot for", impvarec[i])) 370 | } 371 | 372 | ## ----rf_ec_remvars, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 373 | # Create vector of variables to be dropped 374 | drop_vars <- c("soil_permeability", "area_sqkm", "water_capability", "MgO_mean", "Bulk_mean", "S_mean") 375 | # drop columns 376 | ec_2 <- ec_1 %>% select(-one_of(drop_vars)) 377 | # refit model 378 | set.seed(2019) 379 | ec_rf_up <- randomForest(EC_value_fin ~ ., data = ec_2, importance = TRUE) 380 | print(ec_rf_up) 381 | 382 | ## ----rf_ec_tune_ntree, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 383 | # fit model with 1000 trees 384 | set.seed(2019) 385 | ec_rf_up_1000 <- randomForest(EC_value_fin ~ ., data = ec_2, importance = TRUE, ntree = 1000) 386 | print(ec_rf_up_1000) 387 | # fit model with 2000 trees 388 | set.seed(2019) 389 | ec_rf_up_2000 <- randomForest(EC_value_fin ~ ., data = ec_2, importance = TRUE, ntree = 2000) 390 | print(ec_rf_up_2000) 391 | 392 | ## ----rf_ec_imp_plots, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 393 | varImpPlot(ec_rf_up, main = "Default fit, 500 trees", scale = TRUE, type = 1) 394 | varImpPlot(ec_rf_up_1000, main = "1000 trees", scale = TRUE, type = 1) 395 | varImpPlot(ec_rf_up_2000, main = "2000 trees", scale = TRUE, type = 1) 396 | 397 | ## ----rf_ec_tune_mtry, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 398 | set.seed(2019) 399 | tuneRF(x = ec_2[ , -1], y = ec_2[ , 1], mtryStart = 3, ntree = 2000, stepFactor = 1.5, improve = 0.01, trace = TRUE, plot = TRUE) 400 | 401 | ## ----rf_ec_plot_finalmod, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 402 | # fit model with 2000 trees 403 | set.seed(2019) 404 | ec_rf_fin <- randomForest(EC_value_fin ~ ., data = ec_2, importance = TRUE, ntree = 2000, mtry = 3) 405 | print(ec_rf_fin) 406 | varImpPlot(ec_rf_fin, main = "2000 trees", scale = FALSE, type = 1) 407 | 408 | ## ----rf_ec_pred_obsplot, include = TRUE, echo = TRUE, eval = TRUE, purl = TRUE---- 409 | pred_ec <- predict(ec_rf_fin) 410 | # plot EC_preds and EC_obs 411 | plot(pred_ec, ec_2$EC_value_fin, ylab = "Observed EC [mS/cm]", xlab = "Predicted EC [mS/cm]") 412 | # add regression line 413 | abline(lm(ec_2$EC_value_fin ~ pred_ec)) 414 | 415 | -------------------------------------------------------------------------------- /Code/Session_8/Session_8_extr.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include = FALSE, purl = TRUE-------------------------------------------------------------------------------- 2 | knitr::opts_chunk$set(echo = FALSE) 3 | library(vegan) 4 | library(learnr) 5 | library(ggplot2) 6 | library(GGally) 7 | lowerFn <- function(data, mapping, method = "lm", ...) { 8 | p <- ggplot(data = data, mapping = mapping) + 9 | geom_point(colour = "blue") + 10 | geom_smooth(method = method, color = "red", ...) 
11 | p 12 | } 13 | data("varechem") 14 | # setup species with unimodal gradient 15 | hss <- c(1, 2, 4, 7, 8, 7, 4, 2, 1) 16 | # unimodal gradient 17 | spec1 <- c(hss, rep(0, 10)) # 18 | spec2 <- c(rep(0, 5), hss, rep(0, 5)) 19 | spec3 <- c(rep(0, 10), hss) 20 | data <- cbind(spec1, spec2, spec3) 21 | # create data.frame 22 | species_dat <- data.frame(data) 23 | 24 | 25 | 26 | 27 | 28 | 29 | ## ----load_data, include = TRUE, echo = TRUE, purl = TRUE---------------------------------------------------------------- 30 | library(vegan) 31 | # load data 32 | data(varechem) 33 | # reduce to chemical data 34 | varechem_red <- varechem[ ,1:11] 35 | 36 | 37 | 38 | 39 | 40 | 41 | ## ----data_mult_normality, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE----------------------------------------- 42 | library(mvoutlier) 43 | par(mfrow = c(1, 2), las = 1) 44 | chisq.plot(varechem_red, quan = 1, ask = FALSE) 45 | 46 | 47 | ## ----data_callin_code, include = TRUE, echo = TRUE, eval = FALSE-------------------------------------------------------- 48 | ## library(ggplot2) 49 | ## library(GGally) 50 | ## # We define a function to change the colour of points and lines (otherwise both are black) 51 | ## lowerFn <- function(data, mapping, method = "lm", ...) { 52 | ## p <- ggplot(data = data, mapping = mapping) + 53 | ## geom_point(colour = "blue") + 54 | ## geom_smooth(method = method, color = "red", ...) 55 | ## p 56 | ## } 57 | ## # Run ggpairs() 58 | ## ggpairs(varechem_red, lower = list(continuous = wrap(lowerFn, method = "lm")), 59 | ## diag = list(continuous = wrap("densityDiag", colour = "blue")), 60 | ## upper = list(continuous = wrap("cor", size = 10))) 61 | 62 | 63 | 64 | 65 | 66 | ## ----data_transform_sqrt, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE----------------------------------------- 67 | varechemical <- sqrt(varechem_red) 68 | 69 | 70 | 71 | 72 | ## ----species_data_generation, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE------------------------------------- 73 | # setup species with unimodal gradient 74 | hss <- c(1, 2, 4, 7, 8, 7, 4, 2, 1) 75 | # unimodal gradient 76 | spec1 <- c(hss, rep(0, 10)) # 77 | spec2 <- c(rep(0, 5), hss, rep(0, 5)) 78 | spec3 <- c(rep(0, 10), hss) 79 | data <- cbind(spec1, spec2, spec3) 80 | # create data.frame 81 | species_dat <- data.frame(data) 82 | # display species distribution along sites 83 | par(las = 1) 84 | plot(spec1, col = "blue", type = "o", ylab = "Species abundance", xlab = "Site", main = "Three species against environmental gradient") 85 | points(spec2, col = "red") 86 | lines(spec2, col = "red") 87 | points(spec3, col = "magenta") 88 | lines(spec3, col = "magenta") 89 | 90 | 91 | ## ----specdat_mult_normality, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE-------------------------------------- 92 | library(mvoutlier) 93 | par(mfrow = c(1, 2), las = 1) 94 | chisq.plot(species_dat, quan = 1, ask = FALSE) 95 | # add information on data 96 | mtext("Species data") 97 | # plot soil data again 98 | chisq.plot(varechemical, quan = 1, ask = FALSE) 99 | mtext( "Soil data") 100 | 101 | 102 | ## ----specdat_pairs, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE----------------------------------------------- 103 | ggpairs(species_dat, lower = list(continuous = wrap(lowerFn, method = "lm")), 104 | diag = list(continuous = wrap("densityDiag", colour = "blue")), 105 | upper = list(continuous = wrap("cor", size = 10))) 106 | 107 | 108 | 109 | 110 | ## ----soil_standard, include = TRUE, eval = TRUE, echo = TRUE, purl = 
TRUE----------------------------------------------- 111 | soil_scaled <- scale(varechemical) 112 | # convert to data.frame 113 | soil_scaled_df <- data.frame(soil_scaled) 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | ## ----pca_soil, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE---------------------------------------------------- 122 | va_pca <- rda(soil_scaled_df) 123 | summary(va_pca, display = NULL) 124 | 125 | 126 | 127 | 128 | ## ----pca_extract_evs, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE--------------------------------------------- 129 | ev <- va_pca$CA$eig 130 | # calculate the sum of eigenvalues 131 | sum(ev) 132 | # confirms that the total variance is preserved in the eigenvalues 133 | 134 | 135 | 136 | 137 | ## ----pca_sum_criterion, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE------------------------------------------- 138 | # set alpha as sum criterion 139 | alpha <- 0.9 140 | # extract the cumulative proportion of explained variance from object 141 | cum_prop_var <- summary(va_pca)$cont$importance[3, ] 142 | # check how many components are required 143 | cum_prop_var > alpha 144 | # or even more automated 145 | ncol(soil_scaled_df) - sum(cum_prop_var > alpha) +1 146 | 147 | 148 | ## ----pca_screeplot, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE----------------------------------------------- 149 | screeplot(va_pca, type = "lines") 150 | 151 | 152 | ## ----pca_brokenstick, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE--------------------------------------------- 153 | screeplot(va_pca, type = "lines", bstick = TRUE) 154 | 155 | 156 | ## ----pca_rowcrossvalid, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE------------------------------------------- 157 | library(chemometrics) 158 | pca_cv <- pcaCV(soil_scaled_df, center = FALSE, scale = FALSE, segments = 5, plot.opt = TRUE) 159 | 160 | 161 | ## ----NA_added, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE---------------------------------------------------- 162 | # add a random NA value 163 | # determine row 164 | set.seed(2019) 165 | row_val <- sample(1:nrow(soil_scaled_df), size = 1) 166 | # determine column 167 | set.seed(2019) 168 | col_val <- sample(1:ncol(soil_scaled_df), size = 1) 169 | # set to NA after creating a copy of the dataframe 170 | soil_scaled_dfna <- soil_scaled_df 171 | soil_scaled_dfna[row_val, col_val] <- NA 172 | 173 | 174 | ## ----pca_ekcv, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE---------------------------------------------------- 175 | library(missMDA) 176 | # make example reproducible 177 | set.seed(1001) 178 | estim_ncpPCA(soil_scaled_dfna, method.cv = "Kfold", pNA = 0.20, ncp.max = 6, verbose = FALSE) 179 | 180 | 181 | ## ----pca_gcv, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE----------------------------------------------------- 182 | # for details 183 | library(missMDA) 184 | # make example reproducible 185 | set.seed(100) 186 | estim_ncpPCA(soil_scaled_dfna, method.cv = "gcv", ncp.max = 6, verbose = FALSE) 187 | 188 | 189 | ## ----broken_stick_sim, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE-------------------------------------------- 190 | # set maximum number of pieces 191 | p <- 2 192 | # create vector to store simulation results 193 | vec_l <- c(1:10000) 194 | # means that simulation is repeated 10000 times 195 | # we obtain 10000 values for l 196 | # set seed for reproducible example 197 | set.seed(2019) 198 | # create loop 199 | for (i in 1:length(vec_l)) # loop runs vec_l times 200 | { 201 | y <- 
runif(p - 1) # sample from uniform distribution between p-1 = 1 and 0 202 | # save value after identifying longest piece 203 | vec_l[i] <- ifelse(y >= 0.5, y, 1 - y) 204 | } 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | ## ----pca_biplot_code, include = TRUE, echo = TRUE, eval = FALSE--------------------------------------------------------- 214 | ## biplot(va_pca, scaling = 3, display = c("sp", "site"), cex = 1.2) 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | ## ----pca_outlier_diag, include = TRUE, eval = TRUE, echo = TRUE, purl = TRUE-------------------------------------------- 223 | # run PCA 224 | soil_pca_princ <- princomp(soil_scaled_df) 225 | # set plotting parameters, see ?par 226 | par(mfrow = c(1, 2), cex = 2) 227 | # run function 228 | library(chemometrics) 229 | pcaDiagplot(soil_scaled_df, soil_pca_princ, a = 2) 230 | 231 | 232 | ## ----pca_biplot_sc1, include = TRUE, echo = TRUE, eval = FALSE---------------------------------------------------------- 233 | ## biplot(va_pca, scaling = 1, display = c("sp", "site"), main = "Scaling 1: Distance biplot") 234 | 235 | 236 | 237 | 238 | 239 | ## ----pca_biplot_sc2, include = TRUE, echo = TRUE, eval = FALSE---------------------------------------------------------- 240 | ## biplot(va_pca, scaling = 2, display = c("sp", "site"), main = "Scaling 2: Correlation biplot") 241 | 242 | 243 | 244 | 245 | 246 | ## ----pca_biplot_sc3, include = TRUE, echo = TRUE, eval = FALSE---------------------------------------------------------- 247 | ## biplot(va_pca, scaling = 3, display = c("sp", "site"), main = "Scaling 3: Symmetric biplot") 248 | 249 | 250 | 251 | 252 | 253 | ## ----load_iris, include = TRUE, echo = TRUE, eval = TRUE---------------------------------------------------------------- 254 | data(iris) 255 | # we take a random subsample of the data to enhance readability of the plots 256 | set.seed(2019) 257 | samp_vec <- sample(1:nrow(iris), 80) 258 | # use sample vector for extraction 259 | # remove column containing species information 260 | iris_sub <- iris[samp_vec, 1:4] 261 | # run PCA 262 | iris_pca <- rda(iris_sub, scale = TRUE) 263 | summary(iris_pca, display = NULL) 264 | 265 | 266 | ## ----pca_biplot_sc1_iris, include = TRUE, echo = TRUE, eval = FALSE----------------------------------------------------- 267 | ## biplot(iris_pca, scaling = 1, display = c("sp", "site"), main = "Scaling 1: Distance biplot") 268 | 269 | 270 | 271 | 272 | 273 | ## ----pca_biplot_sc2_iris, include = TRUE, echo = TRUE, eval = FALSE----------------------------------------------------- 274 | ## biplot(iris_pca, scaling = 2, display = c("sp", "site"), main = "Scaling 2: Correlation biplot") 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | ## ----loadings, include = TRUE, echo = TRUE, eval = TRUE----------------------------------------------------------------- 283 | # Extract and display loadings 284 | (loadings_pca <- va_pca$CA$v) 285 | 286 | 287 | 288 | 289 | ## ----loadings-2, include = TRUE, echo = TRUE, eval = TRUE--------------------------------------------------------------- 290 | (ident_mat <- loadings_pca %*% t(loadings_pca)) 291 | 292 | 293 | ## ----loadings-3-rounding, include = TRUE, echo = TRUE, eval = TRUE------------------------------------------------------ 294 | round(ident_mat) 295 | 296 | 297 | ## ----loadings-4-corr_loadings, include = TRUE, echo = TRUE, eval = TRUE------------------------------------------------- 298 | # extract and display square root of eigenvalues 299 | sdev <- sqrt(va_pca$CA$eig) 300 | sdev 301 | # we have to 
transpose the matrix to obtain the correct result 302 | cor_loadings <- t(loadings_pca) * sdev 303 | cor_loadings 304 | 305 | 306 | ## ----loadings-5-corr_loadings, include = TRUE, echo = TRUE, eval = TRUE------------------------------------------------- 307 | # we transpose the correlation loadings to have the same format as the result from the correlation below 308 | t(cor_loadings) 309 | 310 | # prepare choice of components 311 | num_col_soilsdata <- ncol(varechemical) 312 | # create vector with all columns identified by their number 313 | (col_vec_pca <- 1:num_col_soilsdata) 314 | # run scores function from vegan package 315 | pca_scores <- vegan::scores(va_pca, disp = "sites", choices = col_vec_pca, scaling = 0) 316 | # calculate correlation coefficients between (scaled) soil data and the principal components 317 | cor(scale(varechemical), pca_scores) 318 | 319 | 320 | 321 | 322 | ## ----spca-1, include = TRUE, echo = TRUE, eval = TRUE------------------------------------------------------------------- 323 | library(pcaPP) 324 | # set k.max as the max number of considered sparse PCs 325 | k.max <- 2 326 | # run function 327 | oTPO <- opt.TPO(soil_scaled_df, k.max = k.max) 328 | oTPO$pc.noord$lambda 329 | 330 | 331 | ## ----spca-2, include = TRUE, echo = TRUE, eval = TRUE------------------------------------------------------------------- 332 | plot(oTPO, k = 1) 333 | 334 | 335 | ## ----spca-3, include = TRUE, echo = TRUE, eval = TRUE------------------------------------------------------------------- 336 | oTPO$pc.noord$lambda 337 | # use optimized lambdas to compute sparsePCA 338 | spc <- sPCAgrid(soil_scaled_df, k = k.max, lambda = oTPO$pc.noord$lambda) 339 | summary(spc, loadings = TRUE) 340 | 341 | 342 | ## ----spca-3-eigenvals, include = TRUE, echo = TRUE, eval = TRUE--------------------------------------------------------- 343 | spc$sdev^2 344 | 345 | 346 | 347 | 348 | ## ----spca-4-corrloadings, include = TRUE, echo = TRUE, eval = TRUE------------------------------------------------------ 349 | # extract loadings 350 | spc$loadings[] 351 | 352 | 353 | ## ----spca-5-plot, include = TRUE, echo = TRUE, eval = TRUE-------------------------------------------------------------- 354 | biplot(spc, cex = 0.6) 355 | 356 | 357 | ## ----pca-reg-1, include = TRUE, echo = TRUE, eval = TRUE---------------------------------------------------------------- 358 | # create index for all PCs, number of PCs equals number of variables in original dataset 359 | pca_index <- c(1:ncol(varechemical)) 360 | # extract scores 361 | pc_scores_reg <- vegan::scores(va_pca, display = "sites", choices = pca_index, scaling = 0) 362 | 363 | 364 | ## ----pca-reg-2, include = TRUE, echo = TRUE, eval = TRUE---------------------------------------------------------------- 365 | vegan::scores(spc, display = "sites", scaling = 0) 366 | # same as: 367 | spc$scores 368 | 369 | 370 | ## ----pca-fail1, include = TRUE, echo = TRUE, eval = TRUE---------------------------------------------------------------- 371 | pca_specdat <- rda(species_dat) 372 | 373 | 374 | ## ----pca-fail_biplot_code, include = TRUE, echo = TRUE, eval = FALSE---------------------------------------------------- 375 | ## biplot(pca_specdat, scaling = 3, display = c("sp", "site"), cex = 1.8) 376 | 377 | 378 | 379 | 380 | 381 | ## ----pca-fail2, include = TRUE, echo = TRUE, eval = TRUE---------------------------------------------------------------- 382 | pca_specdat$CA$v 383 | 384 | 385 | ## ----pca-fail3, include = TRUE, echo = TRUE, eval = 
TRUE---------------------------------------------------------------- 386 | summary(pca_specdat, display = NULL) 387 | 388 | 389 | ## ----pca-hell1, include = TRUE, echo = TRUE, eval = TRUE---------------------------------------------------------------- 390 | # Apply hellinger transformation 391 | spec_hell <- decostand(species_dat, method = "hellinger") 392 | # Run PCA 393 | pca_spec_hell <- rda(spec_hell) 394 | 395 | 396 | ## ----pca-hell2, include = TRUE, echo = TRUE, eval = FALSE--------------------------------------------------------------- 397 | ## # Biplot 398 | ## biplot(pca_spec_hell, scaling = 3, display = c("sp", "site"), cex = 1.8) 399 | 400 | -------------------------------------------------------------------------------- /Code/Session_8_exerc.R: -------------------------------------------------------------------------------- 1 | ############################ 2 | # R exercise Session 8 # 3 | ############################ 4 | # Ralf B. Schäfer, 20.12.2019 5 | 6 | ############################## 7 | # Preparation for Exercise # 8 | ############################## 9 | 10 | library(chemometrics) 11 | data(ash) 12 | 13 | ############################################################################################################# 14 | # The ash softening temperature (SOT) is an important variable to determine the characteristics of fuel. # 15 | # Run a PCA for the ash data set. Make sure you remove the response variable (SOT) and the log-transformed # 16 | # variables before the PCA and take an own (substantiated) decision regarding potential transformations. # 17 | # Once you have run the PCA, answer the following questions: # 18 | # How much variance is captured when plotting the first two principal components in a biplot? # 19 | # Identify the number of meaningful PCs and check the intercorrelation of these axes. What do you observe? # 20 | # Which variables contribute most to the construction of the first axis? # 21 | # Compare these results to those of a sparse PCA. # 22 | # # 23 | # Optional: Run a principal component regression (response: SOT, predictors: PCs) and compare the results # 24 | # to those from an ordinary multiple linear regression analysis for this data set in terms of selected # 25 | # variables, interpretation and explained variance. 
# 26 | ############################################################################################################# 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /Code/Session_9/Session_9_extr.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=FALSE, context = "setup"-------------------------------------------------------------------------------------------------------------- 2 | library(learnr) 3 | library(vegan) 4 | library(dplyr) 5 | knitr::opts_chunk$set(echo = FALSE) 6 | # remove variables from chemical data 7 | remove_vars <- c("Mn", "Mg", "P", "S", "Zn", "Fe") 8 | data(varechem) 9 | varechem_red <- varechem %>% select(-one_of(remove_vars)) 10 | 11 | 12 | 13 | 14 | ## ----load_data, include = TRUE, echo = TRUE, purl = TRUE-------------------------------------------------------------------------------------------------- 15 | library(vegan) 16 | # load chemical data 17 | data(varechem) 18 | # load vegetation data 19 | data(varespec) 20 | head(varespec) 21 | 22 | 23 | ## ----rare_spec, include = TRUE, echo = TRUE, purl = TRUE-------------------------------------------------------------------------------------------------- 24 | # transform data into presence-absence 25 | varespec_pa <- decostand(varespec, "pa") 26 | # calculate sum per species 27 | vare_sum <- apply(varespec_pa, 2, sum) 28 | # display number of occurrences in ascending order 29 | sort(vare_sum) 30 | # remove species that occur at less than 5 sites 31 | varespec_fin <- varespec[ , !vare_sum < 5] 32 | 33 | 34 | ## ----mean_var, include = TRUE, echo = TRUE, purl = TRUE--------------------------------------------------------------------------------------------------- 35 | # calculate mean 36 | vare_mean <- apply(varespec_fin, 2, mean) 37 | # calculate variance 38 | vare_var <- apply(varespec_fin, 2, var) 39 | # plot on log scale 40 | par(las = 1, cex = 1.5, mfrow = c(1,2)) 41 | plot(vare_mean, vare_var, log = "xy", xlab = "Mean", ylab = "Variance") 42 | 43 | 44 | ## ----dca_1, include = TRUE, echo = TRUE, purl = TRUE------------------------------------------------------------------------------------------------------ 45 | decorana(varespec_fin) 46 | 47 | 48 | ## ----dca_2, include = TRUE, echo = TRUE, purl = TRUE------------------------------------------------------------------------------------------------------ 49 | varespec_hel <- decostand(varespec_fin, "hellinger") 50 | decorana(varespec_hel) 51 | 52 | 53 | ## ----remove_vars, include = TRUE, echo = TRUE, purl = TRUE------------------------------------------------------------------------------------------------ 54 | library(dplyr) 55 | remove_vars <- c("Mn", "Mg", "P", "S", "Zn", "Fe") 56 | varechem_red <- varechem %>% select(-one_of(remove_vars)) 57 | 58 | 59 | ## ----data_callin_code, include = TRUE, echo = TRUE, eval = FALSE------------------------------------------------------------------------------------------ 60 | ## library(ggplot2) 61 | ## library(GGally) 62 | ## # We define a function to change the colour of points and lines (otherwise both are black) 63 | ## lowerFn <- function(data, mapping, method = "lm", ...) { 64 | ## p <- ggplot(data = data, mapping = mapping) + 65 | ## geom_point(colour = "blue") + 66 | ## geom_smooth(method = method, color = "red", ...) 
67 | ## p 68 | ## } 69 | ## # Run ggpairs() 70 | ## ggpairs(varechem_red, lower = list(continuous = wrap(lowerFn, method = "lm")), 71 | ## diag = list(continuous = wrap("densityDiag", colour = "blue")), 72 | ## upper = list(continuous = wrap("cor", size = 10))) 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | ## ----rda, include = TRUE, echo = TRUE, purl = TRUE-------------------------------------------------------------------------------------------------------- 83 | var_rda <- rda(varespec_hel ~ ., data = varechem_red) 84 | summary(var_rda, display = NA) 85 | 86 | 87 | ## ----rda-rare, include = TRUE, echo = TRUE, purl = TRUE--------------------------------------------------------------------------------------------------- 88 | # We first need to Hellinger transform the data with rare taxa 89 | varespec_raw_hel <- decostand(varespec, "hellinger") 90 | # run RDA 91 | var_rda_rare <- rda(varespec_raw_hel ~ ., data = varechem_red) 92 | summary(var_rda_rare, display = NA) 93 | 94 | 95 | ## ----rda-test, include = TRUE, echo = TRUE, purl = TRUE--------------------------------------------------------------------------------------------------- 96 | # Global test of the RDA result 97 | set.seed(2222) 98 | anova.cca(var_rda, step = 1000) 99 | 100 | # Tests of all canonical axes 101 | set.seed(2222) 102 | anova.cca(var_rda, by = "axis", step = 1000) 103 | 104 | 105 | ## ----rda_triplot_1, include = TRUE, echo = TRUE, eval = FALSE--------------------------------------------------------------------------------------------- 106 | ## plot(var_rda, scaling = 1, main = "RDA scaling 1: Distance triplot", display = c("sp", "lc", "cn")) 107 | ## var.sc <- scores(var_rda, choices = 1:2, scaling = 1, display = c("sp")) 108 | ## arrows(0, 0, var.sc[ , 1], var.sc[ , 2], length = 0, lty = 1, col = "red") 109 | 110 | 111 | 112 | 113 | 114 | ## ----rda_triplot_2, include = TRUE, echo = TRUE, eval = FALSE--------------------------------------------------------------------------------------------- 115 | ## plot(var_rda, main = "RDA scaling 2: Correlation triplot", display = c("sp", "lc", "cn")) 116 | ## var2.sc <- scores(var_rda, choices = 1:2, display = c("sp")) 117 | ## arrows(0, 0, var2.sc[ , 1], var2.sc[ , 2], length = 0, lty = 1, col = "red") 118 | 119 | 120 | 121 | 122 | 123 | ## ----rda-mod-selection, include = TRUE, echo = TRUE, purl = TRUE------------------------------------------------------------------------------------------ 124 | # Check R2 of model that is used for scope (full model) 125 | RsquareAdj(var_rda) 126 | # run stepwise forward algorithm 127 | step_R2 <- ordiR2step(rda(varespec_hel ~ 1, data = varechem_red), scope = formula(var_rda)) 128 | 129 | 130 | ## ----rda_triplot_3, include = TRUE, echo = TRUE, eval = FALSE--------------------------------------------------------------------------------------------- 131 | ## plot(step_R2, scaling = 1, main = "RDA scaling 1: Distance triplot", display = c("sp", "lc", "cn")) 132 | ## var.sc <- scores(step_R2, choices = 1:2, scaling = 1, display = c("sp")) 133 | ## arrows(0, 0, var.sc[ , 1], var.sc[ , 2], length = 0, lty = 1, col = "red") 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | ## ----rda-partial, include = TRUE, echo = TRUE, purl = TRUE------------------------------------------------------------------------------------------------ 142 | var_rda_partial <- rda(varespec_hel ~ Al + Condition(N + K + Ca), data = varechem_red) 143 | 144 | 145 | 146 | 147 | 148 | 149 | ## ----prc-inspect, include = TRUE, echo = TRUE, purl = 
TRUE------------------------------------------------------------------------------------------------ 150 | library(vegan) 151 | data(pyrifos) 152 | head(pyrifos[ , c(1:15)]) 153 | summary(pyrifos[ , c(1:15)]) 154 | 155 | 156 | ## ----prc-prep, include = TRUE, echo = TRUE, purl = TRUE--------------------------------------------------------------------------------------------------- 157 | ditch <- gl(12, 1, length = 132) 158 | week <- gl(11, 12, labels = c(-4, -1, 0.1, 1, 2, 4, 8, 12, 15, 19, 24)) 159 | # negative week means pre-treatment 160 | conc <- factor(rep(c(0.1, 0, 0, 0.9, 0, 44, 6, 0.1, 44, 0.9, 0, 6), 11)) 161 | 162 | 163 | ## ----prc-prep-rare, include = TRUE, echo = TRUE, purl = TRUE---------------------------------------------------------------------------------------------- 164 | # transform data into presence-absence 165 | pyrifos_pa <- decostand(pyrifos, "pa") 166 | # calculate sum per species 167 | pyri_sum <- apply(pyrifos_pa, 2, sum) 168 | # remove species that occur less than 11 sites 169 | pyrifos_fin <- pyrifos[ , !pyri_sum < 11] 170 | 171 | 172 | ## ----prc-run, include = TRUE, echo = TRUE, purl = TRUE---------------------------------------------------------------------------------------------------- 173 | pyrifos_prc <- prc(response = pyrifos, treatment = conc, time = week) 174 | pyrifos_prc 175 | 176 | 177 | ## ----prc-plot, include = TRUE, echo = TRUE, purl = TRUE, eval = FALSE------------------------------------------------------------------------------------- 178 | ## # extract information for plotting, i.e. to limit plotting to species with higher scores 179 | ## pyrifos_prc_sum <- summary(pyrifos_prc, scaling = "species") 180 | ## # create plot 181 | ## par(cex = 1.7, mar = c(6,6,1,1)) 182 | ## plot(pyrifos_prc, select = abs(pyrifos_prc_sum$sp) > 0.5, lwd = 3, scaling = "species") 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | ## ----matrix_gen, include = TRUE, echo = TRUE, purl = TRUE------------------------------------------------------------------------------------------------- 191 | mat1 <- matrix(data = c(40, 5, 12, 0, 0, 40, 10, 1, 18, 15, 22, 3, 100, 5, 22, 16, 200, 50, 2, 1, 1, 0, 0, 0, 40, 10, 0, 0, 0, 20), nrow = 5, byrow = TRUE) 192 | mat1 193 | 194 | 195 | ## ----dist_eu, include = TRUE, echo = TRUE, purl = TRUE---------------------------------------------------------------------------------------------------- 196 | library(vegan) 197 | # calculation of euclidean distance 198 | eu_dist <- vegdist(mat1, method = "euclidean") 199 | eu_dist 200 | 201 | 202 | ## ----dist_bray, include = TRUE, echo = TRUE, purl = TRUE-------------------------------------------------------------------------------------------------- 203 | br_dist <- vegdist(mat1, method = "bray") 204 | br_dist 205 | 206 | 207 | 208 | 209 | ## ----nmds, include = TRUE, echo = TRUE, purl = TRUE------------------------------------------------------------------------------------------------------- 210 | specnmds <- metaMDS(varespec_fin, k = 2) 211 | 212 | 213 | ## ----nmds_results, include = TRUE, echo = TRUE, purl = TRUE----------------------------------------------------------------------------------------------- 214 | specnmds 215 | 216 | 217 | ## ----nmds_plot, include = TRUE, echo = TRUE, purl = TRUE, eval = FALSE------------------------------------------------------------------------------------ 218 | ## par(cex = 1.5) 219 | ## plot(specnmds, type = "t") 220 | 221 | 222 | 223 | 224 | 225 | ## ----nmds_nice, include = TRUE, echo = TRUE, purl = TRUE, eval = 
FALSE------------------------------------------------------------------------------------ 226 | ## sumcols <- colSums(varespec_fin) 227 | ## # calculate sum of columns, i.e. total abundance of species. If two species would be plotted 228 | ## # on top of each other in the ordination, select the species that has higher column sums. 229 | ## par(cex = 1.5) 230 | ## # preparation for plotting, create empty plot that will be manually filled 231 | ## plot(specnmds, dis = "sp", type = "n") 232 | ## orditorp(specnmds, display = "sp", priority = sumcols, col = "red", pcol = "black", pch = "+", cex = 0.8) 233 | ## # species scores are obtained by weighted averaging as for RDA (using WA scores) 234 | ## # add sites 235 | ## orditorp(specnmds, display = "sites", col = "blue", pcol = "lightblue", pch = "#", cex = 0.8) 236 | 237 | 238 | 239 | 240 | 241 | ## ----nmds_stress, include = TRUE, echo = TRUE, purl = TRUE, eval = FALSE---------------------------------------------------------------------------------- 242 | ## stressplot(specnmds, main = "Shepard plot") 243 | 244 | 245 | 246 | 247 | 248 | ## ----nmds_goodness, include = TRUE, echo = TRUE, purl = TRUE, eval = FALSE-------------------------------------------------------------------------------- 249 | ## # calculate goodness of fit per site 250 | ## good_site <- goodness(specnmds) 251 | ## par(cex = 2.5) 252 | ## plot(specnmds, type = "t", main = "Goodness of fit") 253 | ## points(specnmds, display = "sites", cex = 2*good_site/mean(good_site)) 254 | 255 | 256 | 257 | 258 | 259 | ## ----nmds_3d, include = TRUE, echo = TRUE, purl = TRUE, eval = FALSE-------------------------------------------------------------------------------------- 260 | ## library(vegan3d) 261 | ## # in case you need to install this library, run: 262 | ## # install.packages("vegan3d") 263 | ## # and then execute the function above 264 | ## 265 | ## # 3d plot for a model object with 3 dimensions nmds_3d 266 | ## ordirgl(nmds_3d, type = "t") 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | ## ----pyrifos_backtrans, include = TRUE, echo = TRUE, purl = TRUE, eval = TRUE----------------------------------------------------------------------------- 276 | pyrifos_t <- round((exp(pyrifos_fin) - 1)/10) 277 | 278 | 279 | ## ----pyrifos_mvglm_fit_1, include = TRUE, echo = TRUE, purl = TRUE, eval = TRUE--------------------------------------------------------------------------- 280 | library(mvabund) 281 | # convert to mvabund object 282 | pyrifos_mv <- mvabund(pyrifos_t) 283 | # combine into dataframe 284 | env <- data.frame(conc, week) 285 | # fit GLMmv 286 | mod_full <- manyglm(pyrifos_mv ~ conc + week + conc:week, data = env, family = "poisson") 287 | 288 | 289 | ## ----model_inspect_1, include = TRUE, echo = TRUE, purl = TRUE, eval = TRUE------------------------------------------------------------------------------- 290 | plot(mod_full) 291 | 292 | 293 | ## ----pyrifos_mvglm_fit_2, include = TRUE, echo = TRUE, purl = TRUE, eval = TRUE--------------------------------------------------------------------------- 294 | # fit GLMmv 295 | mod_full_nb <- manyglm(pyrifos_mv ~ conc + week + conc:week, data = env, family = "negative.binomial") 296 | # inspect model 297 | plot(mod_full_nb) 298 | 299 | 300 | ## ----mean_var_pyrifos, include = TRUE, echo = TRUE, purl = TRUE, eval = TRUE------------------------------------------------------------------------------ 301 | meanvar.plot(pyrifos_mv ~ conc) 302 | abline(a = 0, b = 1, col = "green") 303 | 304 | 305 | ## ----permutations_GLMmv, include = 
305 | ## ----permutations_GLMmv, include = TRUE, echo = TRUE, purl = TRUE, eval = TRUE---------------------------------------------------------------------------- 306 | # define permutation scheme: samples within a ditch are kept fixed (repeated measures), whole ditches are permuted freely 307 | control <- how(within = Within(type = "none"), plots = Plots(strata = factor(ditch), type = "free"), nperm = 99) 308 | # construct permutation matrix 309 | set.seed(222) 310 | permutations <- shuffleSet(nrow(pyrifos_t), control = control) 311 | # show part of the permutation matrix (first 10 permutations, first 24 samples) 312 | permutations[1:10, 1:24] 313 | 314 | 315 | ## ----GLMmv_anova, include = TRUE, echo = TRUE, purl = TRUE, eval = TRUE----------------------------------------------------------------------------------- 316 | # note that this function may take about 3 minutes to run 317 | aov_mglm <- anova(mod_full_nb, bootID = permutations, test = "LR", p.uni = "adjusted", rep.seed = TRUE) 318 | aov_mglm$table 319 | 320 | 321 | ## ----GLMmv_example_access, include = TRUE, echo = TRUE, purl = TRUE, eval = FALSE------------------------------------------------------------------------- 322 | ## # access test statistics (deviance) 323 | ## aov_mglm$uni.test 324 | ## # access p-values 325 | ## aov_mglm$uni.p 326 | 327 | 328 | ## ----GLMmv_extract_uni, include = TRUE, echo = TRUE, purl = TRUE, eval = TRUE----------------------------------------------------------------------------- 329 | # deviance of all taxa for treatment variable conc 330 | aov_mglm$uni.test[ 2, ] 331 | # deviance of top 10 taxa 332 | (topt_GLMmv_conc <- sort(aov_mglm$uni.test[2, ], dec = TRUE)[1:10]) 333 | # compute fraction of total deviance from fitted model 334 | # total deviance from model 335 | total_dev_conc <- aov_mglm$table[2,3] 336 | # calculate fraction 337 | sum(topt_GLMmv_conc)/total_dev_conc 338 | 339 | 340 | ## ----GLMmv_mod_comp, include = TRUE, echo = TRUE, purl = TRUE, eval = TRUE-------------------------------------------------------------------------------- 341 | # fit reduced model 342 | mod_reduced_nb <- manyglm(pyrifos_mv ~ week, data = env, family = "negative.binomial") 343 | aov_mglm2 <- anova(mod_reduced_nb, mod_full_nb, bootID = permutations, p.uni = "adjusted", test = "LR", rep.seed = TRUE) 344 | 345 | -------------------------------------------------------------------------------- /Code/Session_9_exerc.R: -------------------------------------------------------------------------------- 1 | ############################### 2 | # R exercise Session 9 # 3 | ############################### 4 | # Ralf B. Schäfer, 27.2.2020 5 | 6 | 7 | # Exercise 1 8 | ##################################################################################### 9 | # Sea level rise may affect the ecology of the Dutch coastal system. # 10 | # The Dutch governmental institute RIKZ therefore started a research project # 11 | # on the relationship between several abiotic environmental variables # 12 | # (e.g., sediment composition, slope of the beach) and the benthic fauna. # 13 | # The aim is to identify the main drivers of the benthic community composition. # 14 | # Use an RDA to find the explanatory variables that explain the community pattern. # 15 | ##################################################################################### 16 | 17 | 18 | ############################## 19 | # Preparation for Exercise # 20 | ############################## 21 | # We use the RIKZ data set (taken from Zuur et al. 2007). 22 | # Conduct a complete RDA and evaluate how many RDA axes are required. 23 | 24 | # Load data set 25 | RIKZ <- read.table("https://www.uni-koblenz-landau.de/en/campus-landau/faculty7/environmental-sciences/landscape-ecology/teaching/RIKZ_data/at_download/file", header = TRUE) 26 |
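# A minimal sketch of one possible model setup (assuming the species columns
# carry the names given in the description below; selecting the relevant
# explanatory variables and evaluating the RDA axes is left to you):
library(vegan) # if not yet loaded
species_resp <- RIKZ[ , c("Polychaeta", "Crustacea", "Mollusca", "Insecta")]
env_expl <- RIKZ[ , 9:15]
rikz_rda <- rda(species_resp ~ ., data = env_expl)
screeplot(rikz_rda) # eigenvalues help to judge how many RDA axes are needed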
27 | # You need to extract the species data (Polychaeta, Crustacea, Mollusca, Insecta) as responses 28 | # and the variables in columns 9 to 15 as explanatory variables. 29 | # Information on environmental variables: 30 | # Exposure is an index that is composed of the following elements: wave action, length of the surf zone, 31 | # slope, grain size and the depth of the anaerobic layer. 32 | # Salinity and temperature are classical parameters. 33 | # NAP is the height of the sampling station relative to the mean tidal level, measured in meters. 34 | # Penetrability: Habitat variable indicating the resistance of the ground, measured in N per cm2 35 | # Grain size: Measured in mm 36 | # Humus: the amount of organic material in % 37 | # Chalk: the amount of chalk in % 38 | 39 | 40 | # Exercise 2 41 | ##################################################################################################### 42 | # a) Compute the Bray-Curtis, Euclidean and Jaccard dissimilarities/distances # 43 | # for the matrix given below. How do the coefficients differ # 44 | # regarding the ratio of the highest to the lowest dissimilarity/distance? # 45 | # What else do you observe? Use the function vegdist() to calculate dissimilarity/distance # 46 | # and check which arguments you need to provide. Be careful when calculating Jaccard dissimilarity # 47 | # (you need to set a further argument)! # 48 | # # 49 | # b) How does standardisation of the data affect the results? # 50 | # Use decostand(yourmatrix, method="max") and recalculate the dissimilarities/distances. # 51 | # This will divide each observation by the maximum value of each variable (species). 
# 52 | ##################################################################################################### 53 | 54 | ############################## 55 | # Preparation for Exercise # 56 | ############################## 57 | library(vegan) 58 | # We generate a matrix with 6 species observed at 5 sites 59 | # that you should use to calculate some dissimilarity and distance measures 60 | mat1 <- matrix(data = c(40, 5, 12, 0, 0, 40, 10, 1, 18, 15, 22, 3, 100, 5, 22, 16, 200, 50, 2, 1, 1, 0, 0, 0, 40, 10, 0, 0, 0, 20), nrow = 5, byrow = TRUE) 61 | mat1 62 | 63 | 64 | -------------------------------------------------------------------------------- /Data/OstraMRegS400JB.txt: -------------------------------------------------------------------------------- 1 | E100 MDS1 MDS2 DP RC BT SA SP IC P DCA1 DCA2 LAT LON SR 13.07835929 -0.421782262 -0.109599162 26 2 -0.8923 31.2973 2.407069488 293.3 62.2709046 -0.695245 -0.161782 70.3 -146.08 14 11.50393789 -0.687217026 0.058053531 31 2 -0.8991 31.3444 2.726143023 307.3 46.83229563 -0.825609 -0.52251 70.35 -146.57 12 11.98495746 -0.193091929 0.007325566 27 2 -0.756 31.2805 4.244606629 330.8 64.8012962 -0.364178 -0.302336 70.22 -148.07 14 14.64481452 -0.10556473 -0.194642873 41 2 -0.8733 31.9668 3.300004614 322.4 18.04454704 -0.369583 -0.420674 70.82 -148.05 15 21.63442633 -0.217091589 0.124089231 24 2 -1.2173 31.4632 2.924773838 322.9 28.1575449 -0.369887 -0.180748 70.73 -148.8 23 7.794046938 -0.685612449 -0.053002807 19.8 2 -0.8745 30.7309 2.862883285 313.6 46.2440839 -1.196125 -0.434827 70.87 -150.5 10 6.870186558 -0.471004351 -0.221217553 39 2 -0.9741 31.2562 2.39576339 298.4 77.2498188 -0.841082 -0.360164 70.14 -145.48 7 12.41905119 -0.474030046 0.05069296 26 2 -0.8991 31.3444 2.726143023 307.3 46.83229563 -0.678971 -0.289629 70.36 -146.6 16 9.235466516 -0.128024004 -0.481929596 19 2 -0.8745 30.7309 3.27417203 316 36.79187201 -0.581833 -0.129747 70.99 -150.53 10 7.617344889 -0.505377368 -0.522113307 4 2 -0.4913 28.2688 2.161362972 291.2 106.3568752 -1.043385 0.277559 70.46 -148.77 8 6.663122939 -0.442717285 -0.381797672 5.8 2 -1.29 31.16 2.501879642 308.4 71.604238 -1.170311 0.06024 70.56 -149.45 7 11.90938807 -0.372151464 -0.24312244 11 2 -1.315 31.2286 2.337552291 301.3 98.1563487 -0.914648 -0.211094 70.57 -149.45 15 5.229980411 -0.45115751 -0.352719168 5 2 -1.315 31.2286 2.337552291 301.3 98.1563487 -1.116325 0.041967 70.56 -149.46 6 12.74809193 -0.542030646 0.26009264 19.5 2 -1.1671 30.9923 2.311520443 290.9 86.4431529 -0.632481 -0.299085 70.52 -147.82 14 9.697510249 -0.027496009 -0.380829644 5.8 2 -1.2776 30.2478 2.12174278 289.6 110.1650039 -0.080814 -0.04571 70.32 -147.56 12 13.71698785 0.004187554 -0.189050716 6.5 2 -1.3206 30.6793 2.12174278 289.6 110.1650039 -0.077247 -0.07285 70.3 -147.5 15 4.429012291 -0.742930416 -0.546511446 1.8 2 -0.8411 28.9594 2.158876048 292.9 149.148385 -1.131257 0.12761 70.59 -151.96 5 5.115856326 -0.114405714 -0.928863211 1.5 2 -0.4913 28.2688 2.161362972 291.2 106.3568752 -0.478341 -0.107935 70.47 -148.85 6 6.552801151 -0.502968512 -0.622073426 1.8 2 -0.4913 28.2688 2.086063217 288.5 115.3947681 -1.269706 0.231089 70.43 -148.67 9 10.53286017 -0.220994165 -0.431227798 13.4 2 -1.0991 30.0739 2.582560301 306.1 59.3993048 -0.629133 -0.187012 70.27 -146.5 11 10.08769198 -0.257744433 -0.469753557 4 2 -0.1329 27.2346 2.550299856 305.6 75.3283775 -0.833628 -0.081337 70.2 -146.5 16 12.92294513 -0.451307486 -0.192309415 14 2 -1.0814 31.2584 2.329531816 295.7 107.8377679 -0.881707 -0.274191 70.64 -150.5 13 7.862959788 
-0.486849014 -0.301238625 12.5 2 -1.108 30.0992 2.199881014 292.8 94.7263046 -1.300863 -0.212138 70.27 -146.61 8 7.974987148 -0.769524923 -0.664527159 6 2 -1.1109 29.8816 2.19120124 292 95.5367748 -0.634251 0.021745 70.27 -146.85 9 12.44647906 -0.637284234 0.016409108 22 2 -1.0154 31.4083 2.637698855 307.7 54.7112934 -0.969271 -0.227829 70.82 -150.04 13 14.13298415 -0.598776609 0.230763787 23 2 -1.0154 31.4083 2.637698855 307.7 54.7112934 -0.863143 -0.242523 70.83 -150.2 17 15.13419488 -0.034859171 -0.29341489 17.5 2 -0.8745 30.7309 3.27417203 316 36.79187201 -0.148832 -0.389244 70.96 -150.64 16 20.29394052 -0.050521154 -0.226483514 14.1 2 -1.3396 30.9188 2.248472961 291.7 90.4688632 -0.258603 -0.326817 70.52 -148.17 37 6.69253479 -0.442459486 -0.461283584 5 2 -1.2413 29.7754 2.237993177 287.7 94.6749239 -1.097436 -0.114861 69.68 -141.32 7 17.69536701 -0.07417252 0.028319658 34 2 -1.3563 31.4208 2.134952123 270.9 100.7725201 -0.198051 -0.458308 69.88 -141.15 20 17.12484704 -0.213467577 0.194078135 32 2 -1.3563 31.4208 2.138518714 270.9 102.5794933 -0.392781 -0.118143 69.89 -141.24 21 13.98178018 -0.480884655 0.152491632 30 2 -1.3097 31.4454 2.09403743 272.5 101.9757827 -0.79815 -0.087174 69.82 -141.26 19 15.31708587 -0.318094205 0.046374479 23 2 -1.3021 30.6584 2.082981823 278.1 97.9528357 -0.487705 -0.329575 69.79 -141.37 17 14.88627464 0.067873456 -0.27525306 16.5 2 -1.3021 30.6584 2.240000008 288.5 90.5177704 -0.051844 -0.357476 69.75 -141.44 21 6.245931672 -0.288720621 -0.092400493 4 2 -0.7791 27.4204 2.237993177 287.7 94.6749239 -0.689299 0.062616 69.66 -141.28 8 5.777498122 -0.703383748 -0.220835439 4 2 -0.7791 27.4204 2.418840383 298.9 85.4798148 -1.401381 0.0327 69.66 -141.36 8 17.95751304 -0.321381329 0.093451329 23.5 2 -1.2878 30.4828 2.276854181 286.9 78.3493834 -0.34941 -0.317597 70.06 -142.49 19 11.5892667 0.041626398 -0.48069761 16 2 -1.3095 30.4271 2.242625474 286.9 79.1752907 -0.298058 -0.254191 70.02 -142.52 16 16.58581143 0.141743317 -0.065286613 18 2 -1.1671 30.9923 2.926917816 314.9 35.9447791 0.097156 -0.34219 70.62 -148.13 19 4.77025504 -0.71006407 -0.337890141 20 20 -0.9756 30.6654 2.015441243 282.9 169.9894736 -1.324628 0.324257 69.71 -136.13 5 5.833322722 -0.771249687 -0.205060543 3 20 1.3146 20.4717 1.988472105 282.9 189.8928828 -1.715452 -0.049824 69.62 -135.87 6 1.476190476 -2.026526073 0.496134301 1 20 0.9214 17.7846 2.053688465 289.3 196.924109 -3.023436 -0.090055 69.66 -134.53 2 4.226551703 -0.662013938 -0.341364019 8 20 -0.3284 28.825 2.053428048 285.3 175.2827397 -1.298792 0.295871 69.78 -132.84 6 9.043733861 -0.526282764 -0.254964442 10 20 -0.3284 28.825 1.949003384 274.1 182.4312119 -1.096349 -0.157304 69.91 -132.82 11 4.810535561 -0.878431793 -0.301886484 3 20 1.1103 17.5715 2.031011093 284.5 186.4179736 -1.567239 0.023161 69.78 -132.07 7 8.124079981 -0.520603535 -0.153515894 12 20 0.1418 28.9796 1.970983448 280.5 183.6564742 -1.215477 -0.144418 70.07 -131.2 13 8.159204556 -0.270994516 -0.799579719 8 20 -0.015 29.5247 2.064353902 288.5 152.2237317 -1.123882 0.159012 70.26 -130.08 10 13.58851966 -0.643409378 0.446086107 20 20 -0.9488 31.5945 1.65719544 237.6 153.1801296 -0.448793 -0.799296 70.5 -130.18 15 8.725601184 -0.324222278 -0.544417531 13 20 -0.0589 29.8512 2.077463256 285.2 149.5721071 -1.092282 -0.010921 70.41 -128.79 14 11.20628429 1.129546888 -1.146674793 64 19 0.5555 34.657 1.733018486 224.8 65.8300524 1.945933 0.187307 75.19 22.23 16 16.95666239 0.829732997 -0.3544845 30 19 -0.63 34.85 2.516128799 288.4 22.31869708 1.446487 0.547341 79.95 
50.02 19 22.78895933 1.104946712 0.124220711 15 8 -1.44 34.74 2.339667302 290 18.66308908 1.578815 0.002755 76.22 62.67 33 7.945891014 -0.48894663 -0.700133241 2.5 2 -0.165 27.4763 3.27791681 320.8 52.5423131 -0.812076 1.237756 70.16 -146.09 11 9.66667286 -0.288717489 -0.613991277 3 2 -0.1329 27.2346 2.681719718 312 61.5299886 -0.86015 0.041933 70.19 -146.38 14 8.641374726 -0.331011583 -0.430083636 4.5 2 -0.1175 26.9942 2.519874437 306.4 87.7482266 -0.944447 -0.029653 70.13 -146.66 11 14.46064603 -0.325005426 -0.334855058 5.5 2 -1.1964 29.959 2.18414189 295.2 101.2618247 -0.900615 -0.046457 70.13 -147.03 16 11.87863663 -0.200477602 -0.276393861 5 2 -1.1964 29.959 2.166990104 292.8 99.5362499 -0.601647 -0.150898 70.29 -147.03 17 9.853374231 -0.16497344 -0.329910484 7 2 -1.3206 30.6793 2.130342981 291.2 107.519268 -0.47119 -0.143663 70.28 -147.36 16 7.727236352 -0.387859877 -0.44303025 3 2 -0.5557 27.502 2.133573527 292 111.4090654 -0.989544 -0.011902 70.22 -147.54 8 15.66565078 -0.004124063 -0.321105497 7.1 2 -1.3206 30.6793 2.143098641 289.3 97.5712274 -0.184796 -0.225706 70.36 -147.5 27 11.89448321 -0.0279253 -0.345591538 8 2 -1.37 31.16 2.274951586 298 79.5032715 -0.193657 -0.312772 70.47 -148.4 21 14.66830919 -0.266942459 -0.194783527 3 2 -0.8446 28.8257 2.186201001 290.9 131.108669 -0.629978 -0.395979 70.62 -150.48 16 12.25394573 -0.333975962 -0.136219094 19 2 -1.0914 30.7016 2.390846343 293.3 65.3396752 -0.715116 -0.287477 70.24 -146.03 16 8.342910552 -0.197355563 -0.65439974 17.2 2 -1.1 30.87 3.108145321 312.8 35.38498321 -0.789708 -0.162747 70.98 -150.72 9 2.890519033 -0.733301247 -1.216092293 14 2 -1.04 30.13 3.53033529 325.2 17.9295739 -1.421204 0.184506 70.99 -150.79 3 5.342011722 -0.425277439 -0.501209709 3.2 2 -0.6644 29.2743 2.337744589 298.4 134.4428241 -1.039087 0.145911 70.7 -152.33 8 5.536775361 -0.390138028 -0.585626382 16 2 -0.6999 30.5374 2.892046513 310.1 54.46143 -1.120711 0.272651 71 -151.35 6 11.04775585 0.015353321 -0.628480077 13 2 -1.0814 31.2584 2.186201001 290.9 131.108669 -0.202348 -0.453294 70.61 -150.41 14 15.39317049 -0.277071221 -0.149291075 21 2 -1.2173 31.4632 2.625884861 304.5 63.129175 -0.706919 -0.185452 70.71 -149.27 18 16.90799016 1.204884802 -0.75805454 25 25 -0.4763 32.8659 1.738464515 247.6 37.95881295 2.189112 -0.104379 66.09 -84.99 18 18.64585724 1.350291646 -0.19421167 26 26 0.2461 32.8149 2.343413641 290.9 26.5749066 1.771691 -0.274104 66.22 -83.58 25 17.25652542 1.407485266 -0.239417941 26 26 -0.4763 32.8659 2.045304481 278 27.26359534 1.829359 -0.349459 66.13 -84.63 26 15.60178604 1.238759494 -0.208679603 26 23 -0.4763 32.8659 2.045304481 278 27.26359534 1.709664 -0.391575 66.13 -84.63 19 18.25216732 1.434689239 -0.000271761 26 21 -0.0901 32.8452 2.435538882 295.7 29.25592181 1.919293 -0.151804 66.12 -83.7 23 17.41142621 1.310293201 -0.041985721 50 21 -0.09 32.85 2.435538882 295.7 29.25592181 1.855951 -0.239793 66.12 -83.7 24 12.70938737 1.062819196 -0.277323575 26 23 -0.4763 32.8659 2.077752232 278.8 27.04362311 1.241538 -0.811475 66.13 -84.59 18 17.77247253 1.397439627 0.10384888 30 32 -1.0229 33.343 2.000439341 270.8 52.3811486 1.727325 -0.037695 76.46 -70.21 23 18.78246904 1.318463217 -0.358196148 31 28 -0.9896 33.3313 3.045202805 311.6 31.9469205 1.773383 -0.452898 76.55 -68.9 22 22.81518747 0.667144478 0.520599023 200 24 -0.1835 34.4235 4.241858072 339.6 8.64905248 0.902937 0.310862 75.33 -19 29 20.21994569 0.618772689 -0.188623502 97.5 30 -0.1727 34.4474 3.156631658 320.4 21.4465063 0.840603 -0.242104 74.08 -20.74 21 17.66019791 
1.241343515 -0.359487712 34 28 -0.9896 33.3313 3.059069903 312.4 31.7556896 1.76752 -0.295738 76.56 -68.9 22 13.26040141 0.763080613 -0.708971202 25 27 -0.0773 31.8811 2.718144983 310.9 22.52291963 1.844639 -0.365896 69.37 -81.8 14 6.883993247 1.941870164 -0.327785291 11 31 1.1488 31.8918 1.323587545 172.4 79.46495359 2.936689 0.202341 60.83 -46.8 7 4.797520661 1.830784889 -1.194386528 6 31 1.0129 32.9878 6.734781851 364.4 0.4781088 3.486407 0.316516 60.97 -46.53 5 5.395581978 1.841134969 -0.906791056 8 31 1.0129 32.9878 6.734781851 364.4 0.4781088 3.113619 0.087705 60.97 -46.53 7 11.73204843 -0.008817443 0.453823715 49 8 -1.09 33.5 2.939239447 306 70.09469114 0.06751 -0.669472 74.83 83.43 12 12.60139413 0.523760052 0.467201489 55 8 -1.31 34 3.33177814 330.8 26.6303957 0.434876 -0.30833 76.66 83.88 13 12.23716207 0.506879328 0.588598144 73 8 -1.38 34.2 3.581982717 326 22.7716655 0.415785 -0.212831 76.94 85.76 13 11.99084778 0.011660638 0.47308781 48 8 -1.23 33.71 3.985762094 336.4 33.0026802 0.112673 -0.45043 75.46 82.55 15 9.577959834 0.631635178 0.31512328 27 8 -1.56 32.1 2.710931036 304.4 112.219159 0.469603 -0.771505 74.5 74 10 7.659758875 -0.620038109 0.911880562 37 8 -1.54 33 2.118677121 281.2 155.3659298 0.082097 -0.308496 74 80.01 10 8.92081448 -0.670074169 1.259259451 41 8 -1.56 32.1 1.970257924 273.2 179.9169224 -0.171204 -0.403362 73.77 79.99 9 9.656285053 0.214041774 0.487583367 26 8 -1.59 32 2.282533102 294.8 156.9287739 0.068177 -0.999039 74 74 10 6.960054008 0.307205562 0.176677788 23 8 -1.59 29.5 1.832387628 256.4 198.8554404 0.056118 -0.105273 73.42 78.81 8 15.76439041 1.377130212 0.231275529 36 8 -1.21 33.93 3.101131654 310 36.2816713 1.921477 0.219081 74.36 59.48 18 9.738459397 -0.128935554 0.274167433 66 9 -1.65 33.78 4.162063135 342 17.4666126 -0.327232 0.361078 77.6 132.27 11 13.74479357 -0.008631339 0.842546607 76 9 -1.65 34.11 3.646346286 344.4 11.4854105 0.098447 1.130567 77.83 132.23 14 15.32109613 -0.065332227 0.484133646 276.4 9 1.18 34.7 3.776789737 348.4 10.3991295 -0.011518 0.860682 77.28 120.06 17 18.0506336 0.014685256 0.937575147 68 9 -1.31 33.48 4.323491587 346 9.3315084 -0.029513 1.157294 76.95 118.59 19 12.87352819 -0.198318133 0.340133658 61.6 9 -1.57 33.68 3.664409053 343.6 12.72572387 -0.316269 0.677512 76.77 116.05 15 7.177822014 0.924102737 1.266462754 265 8 1.8034 34.9475 5.666383863 352.4 3.35833062 0.622251 2.168583 81.46 97.57 8 18.12850868 1.292756675 0.028786247 55 22 1.6327 32.715 1.439448049 227.6 131.2468465 1.62937 -0.237041 65.99 -168.46 23 12.33095567 0.782077031 1.459444439 265 8 1.8034 34.9475 5.666383863 352.4 3.35833062 0.613858 2.685447 81.46 97.57 18 10.71954545 0.694104086 -0.481657868 30 24 2.469 34.0594 1.634639407 204.1 45.31568226 0.768617 -0.456289 65.58 -37.18 14 16.22061101 -0.334374645 0.368023922 66 20 -1.4641 32.3987 1.711348837 241.3 199.0925835 -0.178129 -0.692784 70 -137.33 22 16.54410153 0.514854678 0.168219808 115 20 -1.4895 33.0023 1.884784874 259.7 162.0481322 0.549303 -0.479857 70.08 -137.83 20 19.66636605 0.149918889 0.046849698 66 20 -1.4545 32.4005 1.806340788 248.5 184.981947 0.234017 -0.670986 70.08 -137.55 27 27.97321008 0.335566604 -0.019250111 128 20 -1.5009 33.0017 1.642004491 227.7 220.9444837 0.350883 -0.389852 69.6 -138.2 30 8.095053161 -0.868013468 0.007420917 22 20 -1.0055 30.8415 1.681154581 241.3 229.5327834 -1.34831 -0.1533 69.75 -136.65 9 7.657903966 -1.006973629 0.062138172 43 20 -1.02 31.54 1.698270816 241.2 171.1571988 -1.414073 -0.335043 69.92 -137.1 10 7.834275033 -0.950146615 0.083766446 
22 20 -0.9513 30.6668 1.672503547 238.1 200.3442656 -1.34332 -0.391407 69.97 -135.79 11 11.84130849 0.360813901 0.560123082 78 20 -1.4573 32.4467 2.309886127 279.2 53.02152358 0.042026 -0.697181 70.84 -133.89 13 7.170695475 -0.874059532 -0.029752057 14 20 -0.5336 28.3957 1.793086279 253.3 194.4124629 -1.388182 -0.41281 69.97 -134.74 10 16.32323847 0.608573227 -0.044727348 67 20 -1.454 32.4103 2.232268977 282.4 124.7385967 0.538512 -1.133284 70.4 -136.6 17 5.053635009 -0.813898266 -0.181879728 22 20 -0.9224 30.6901 1.926095581 274.1 189.400372 -1.45498 0.224013 69.79 -135.52 7 15.35297728 -0.452793805 0.302098426 19.5 2 -0.7921 31.5669 1.954941028 264.4 105.0486049 -0.426611 -0.374363 70.67 -128.53 23 8.918846546 -0.970599732 0.462956095 18.7 20 -0.8365 31.6113 2.030721925 272 134.0074646 -0.990811 -0.605052 70.58 -129.41 16 11.71941222 0.321661621 1.489683688 132 8 -1.4558 34.4509 4.126549911 343.6 14.05504895 0.275853 1.978508 77.89 101.59 13 11.86139971 0.5985446 1.5697855 191 9 1.6702 34.8071 4.018648715 339.6 17.33868097 0.314211 1.916359 78.07 133.61 12 9.69047153 0.004404283 1.020964429 73 9 -1.6457 33.8777 3.637930357 338 21.0788793 0.174741 0.41923 77.91 133.56 10 16.468041 0.318411865 1.023234393 92 29 -1.4611 34.1968 2.861213184 338.8 24.8181758 0.316615 1.069346 77.03 126.41 19 11.6264136 0.137808636 1.273268773 193 9 0.4803 34.7265 3.688037365 338.8 16.346522 0.115894 1.531342 77.25 118.55 12 18.91507205 0.035887778 1.020879817 101 9 -1.4394 34.2074 3.880521956 342 15.5135833 0.021041 1.122772 77.17 118.71 20 13.66318579 0.253687634 1.306224016 235 9 0.77 34.87 3.635324149 339.6 13.28444625 0.027554 2.00733 78.58 111.39 14 3.333332959 -1.07554293 -0.20119891 7 20 -0.2807 28.808 2.047182087 285.3 182.6045566 -1.693962 -0.20589 69.77 -132.7 4 11.5715296 -0.693901815 0.93113673 51.45 22 -1.43 32.48 2.700733808 315.6 17.0670194 -0.089943 -0.57133 71.21 -168.34 12 6.056334429 1.85807796 0.170997911 145 20 6.7699 35.0945 1.39238745 181.3 116.5138656 1.35823 -0.0287 68.24 11.16 9 3.472108045 1.84367632 1.122134103 188.5 19 6.8526 35.152 1.336721775 176.5 120.123644 1.05254 2.44455 67.6 10.55 4 15.61099194 1.88534027 0.484282993 184 20 6.7863 35.0169 1.300903495 184.4 134.9629425 1.81398 0.30953 69.82 17.63 23 9.716962169 1.54902623 0.618914692 136 20 6.6548 34.929 1.342420344 178.1 118.4672485 1.23591 0.28506 68.78 13.05 16 11.90428484 1.67880501 0.310479733 190 20 5.68 35.11 1.407110293 201.2 112.4964611 1.42653 -0.07066 70.48 17.45 20 3.859876567 1.51165286 1.164977377 120 20 6.7964 34.6518 1.207628845 174.9 114.9639332 0.88876 2.72946 67.4 12.72 6 4.430578447 1.204292 1.048409433 120 20 6.9129 34.7193 1.220220867 172.5 121.1569215 0.87873 2.25317 67.15 12.17 7 5.168809362 0.92164466 -2.486183582 77 21 7.3445 32.1854 1.15127158 137.2 223.8269285 5.36425 0.04654 59.08 -138.67 6 5.689360448 0.43343879 -1.965319273 132 21 5.6378 32.9268 1.116450169 132.4 201.8551112 4.58132 0.16794 59.56 -141.42 6 7.754385965 0.5415787 -2.086193995 104 21 6.0405 32.4985 1.113992132 131.6 212.347647 4.47794 -0.33005 59.46 -139.84 8 -------------------------------------------------------------------------------- /Data/installed_pkgs.txt: -------------------------------------------------------------------------------- 1 | 2 | DAAG 3 | dplyr 4 | beanplot 5 | learnr 6 | sandwich 7 | lmtest 8 | vegan 9 | sjPlot 10 | permute 11 | boot 12 | car 13 | caret 14 | png 15 | ggplot2 16 | GGally 17 | Hmisc 18 | MuMIn 19 | relaimpo 20 | hier.part 21 | shrink 22 | glmnet 23 | stabs 24 | effects 25 | selectiveInference 
26 | bootStepAIC 27 | MASS 28 | ggfortify 29 | gridExtra 30 | markdown 31 | DHARMa 32 | shiny 33 | xkcd 34 | extrafont 35 | modEvA 36 | countreg 37 | rpart 38 | party 39 | partykit 40 | InformationValue 41 | rpart.plot 42 | devtools 43 | mvpart 44 | MVPARTwrap 45 | randomForest 46 | chemometrics 47 | mvoutlier 48 | missMDA 49 | pcaPP 50 | vegan3d 51 | mvabund 52 | -------------------------------------------------------------------------------- /Literature_commented.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schaeferRCOHR/Data_analysis/af20101c70a3c63386397141493b129b777bf752/Literature_commented.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Course material for the lecture "Tools for complex data analysis" 2 | 3 | by Ralf B. Schäfer, University of Koblenz-Landau, Winter Semester 2022/23 4 | 5 | **Note: [As I have left](https://www.uni-due.de/biologie/oekotoxikologie/index_en.php) the University of Koblenz-Landau (as of 1.1.2023 the RPTU Kaiserslautern-Landau), material on the university server such as videos and tutorials may stop working at some stage and this course will not be maintained anymore.** 6 | 7 | ## Overview 8 | 9 | This repository provides all course materials including R code, slides and data as well as the links to teaching videos. 10 | Note that all material comes without guarantee! In case you have any comments regarding content, 11 | please feel free to contact me, but note that I generally do not respond to computer issues or requests 12 | for statistical advice (unless working on a joint publication). 13 | 14 | ## Prerequisites 15 | 16 | To prepare your computer for the lecture, you need to install [R](http://www.r-project.org/) and 17 | [R Studio](http://www.rstudio.com/). I use several packages in the course; if you want to have all packages 18 | ready at the beginning of the course, run the script 0_install_packgs.R in the Code folder. Note that 19 | the course is work in progress and you should run this script again from time to time to be up to date with 20 | the packages required. 
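If you only want to add the packages that are still missing on your machine, a minimal sketch (assuming you have cloned this repository and set its root as your working directory) is:

```
# read the course package list shipped in the Data folder
pkgs <- readLines("Data/installed_pkgs.txt")
# drop empty lines
pkgs <- pkgs[nzchar(pkgs)]
# install only the packages that are not yet installed
missing_pkgs <- setdiff(pkgs, rownames(installed.packages()))
install.packages(missing_pkgs)
```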
Alternatively, you can just install a new package whenever it is needed: 21 | 22 | ``` 23 | install.packages("package_name") 24 | ``` 25 | 26 | ## Links to videos 27 | [Session 1: Introduction to data analysis](https://vcm.uni-kl.de/Panopto/Pages/Sessions/List.aspx?folderID=3b8033e8-f181-425e-a3ce-ad7c00a108cb) 28 | 29 | [Session 2: Linear regression](https://vcm.uni-kl.de/Panopto/Pages/Sessions/List.aspx?folderID=1ab66bfe-a33d-4a6c-889e-ad7c00a108d0) 30 | 31 | [Session 3: Assessing hypotheses and simulation-based tools](https://vcm.uni-kl.de/Panopto/Pages/Sessions/List.aspx?folderID=8186339e-19fe-4bd6-b587-ad7c00a108d4) 32 | 33 | [Session 4: ANOVA, ANCOVA, multiple regression and interactions](https://vcm.uni-kl.de/Panopto/Pages/Sessions/List.aspx?folderID=e904ac58-dd3c-4379-87ca-ad7c00a108de) 34 | 35 | [Session 5: Multiple regression: Modelling strategies](https://vcm.uni-kl.de/Panopto/Pages/Sessions/List.aspx?folderID=6de626ae-8fa4-428a-b7b0-ad7c00a111e9) 36 | 37 | [Session 6: GLMs](https://vcm.uni-kl.de/Panopto/Pages/Sessions/List.aspx?folderID=856a11a9-7379-4ee5-bfac-ad7c00a1130c) 38 | 39 | [Session 7: Supervised learning: CART](https://vcm.uni-kl.de/Panopto/Pages/Sessions/List.aspx?folderID=864561bc-be80-49eb-9b0b-ad7c00a1153e) 40 | 41 | [Session 8: Principal component analysis](https://vcm.uni-kl.de/Panopto/Pages/Sessions/List.aspx?folderID=2129473d-6a6e-46a3-8864-ad7c00a11cdc) 42 | 43 | [Session 9: Redundancy analysis, Similarity measures, NMDS and multivariate GLMs](https://vcm.uni-kl.de/Panopto/Pages/Sessions/List.aspx?folderID=f993c161-3678-45f8-8a36-ad7c00a12b7d) 44 | 45 | ## Links to R tutorials available for some sessions 46 | (**university account required, choose these links if you are a student of the university**) 47 | 48 | [Session 2: Linear regression](https://data-analysis.uni-landau.de/auth_session/2/) 49 | 50 | [Session 3: Assessing hypotheses and simulation-based tools](https://data-analysis.uni-landau.de/auth_session/3/) 51 | 52 | [Session 4: ANOVA, ANCOVA, multiple regression and interactions](https://data-analysis.uni-landau.de/auth_session/4/) 53 | 54 | [Session 5: Multiple regression: Modelling strategies](https://data-analysis.uni-landau.de/auth_session/5/) 55 | 56 | [Session 6a: GLM explorer](https://data-analysis.uni-landau.de/auth_session/6_glm/) 57 | 58 | [Session 6b: GLM tutorial](https://data-analysis.uni-landau.de/auth_session/6/) 59 | 60 | [Session 7: CART tutorial](https://data-analysis.uni-landau.de/auth_session/7/) 61 | 62 | [Session 8: PCA tutorial](https://data-analysis.uni-landau.de/auth_session/8/) 63 | 64 | [Session 9: Multivariate gradients tutorial](https://data-analysis.uni-landau.de/auth_session/9/) 65 | 66 | *** 67 | 68 | ## Public Links to R tutorials for some sessions 69 | (**no university account required, choose these links if you are in an online study program without a university account or for general access**) 70 | 71 | [Session 2: Linear regression](https://data-analysis.uni-landau.de/open_session/2/) 72 | 73 | [Session 3: Assessing hypotheses and simulation-based tools](https://data-analysis.uni-landau.de/open_session/3/) 74 | 75 | [Session 4: ANOVA, ANCOVA, multiple regression and interactions](https://data-analysis.uni-landau.de/open_session/4/) 76 | 77 | [Session 5: Multiple regression: Modelling strategies](https://data-analysis.uni-landau.de/open_session/5/) 78 | 79 | [Session 6a: GLM explorer](https://data-analysis.uni-landau.de/open_session/6_glm/) 80 | 81 | 
[Session 6b: GLM tutorial](https://data-analysis.uni-landau.de/open_session/6/) 82 | 83 | [Session 7: CART tutorial](https://data-analysis.uni-landau.de/open_session/7/) 84 | 85 | [Session 8: PCA tutorial](https://data-analysis.uni-landau.de/open_session/8/) 86 | 87 | [Session 9: Multivariate gradients tutorial](https://data-analysis.uni-landau.de/open_session/9/) 88 | 89 | 90 | ## Acknowledgments 91 | * [Noel Juvigny-Khenafou](https://www.uni-koblenz-landau.de/en/campus-landau/faculty7/environmental-sciences/landscape-ecology/staff/juvigny-khenafou) is thanked for replacing me in teaching this lecture and dealing with the students (I was given partial teaching relief to join university management in the merger process with the TU Kaiserslautern). 92 | * [Achim Zeileis](https://eeecon.uibk.ac.at/~zeileis/) is thanked for development and support with the 93 | [R exams](http://www.r-exams.org) package that is used for automated test generation in this course. [Felix Högerl](https://www.uni-koblenz-landau.de/en/campus-landau/faculty7/environmental-sciences/landscape-ecology/staff/hoegerl) is thanked for help with exam evaluation. 94 | * And of course, a huge thanks to all the package authors and the whole R community, as 95 | well as the Stack Exchange and Stack Overflow communities. 96 | -------------------------------------------------------------------------------- /Schedule.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schaeferRCOHR/Data_analysis/af20101c70a3c63386397141493b129b777bf752/Schedule.pdf -------------------------------------------------------------------------------- /Slides/0_Definitions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schaeferRCOHR/Data_analysis/af20101c70a3c63386397141493b129b777bf752/Slides/0_Definitions.pdf -------------------------------------------------------------------------------- /Slides/1_for_extern.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schaeferRCOHR/Data_analysis/af20101c70a3c63386397141493b129b777bf752/Slides/1_for_extern.pdf -------------------------------------------------------------------------------- /Slides/1_for_internal_students.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schaeferRCOHR/Data_analysis/af20101c70a3c63386397141493b129b777bf752/Slides/1_for_internal_students.pdf -------------------------------------------------------------------------------- /Slides/2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schaeferRCOHR/Data_analysis/af20101c70a3c63386397141493b129b777bf752/Slides/2.pdf -------------------------------------------------------------------------------- /Slides/3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schaeferRCOHR/Data_analysis/af20101c70a3c63386397141493b129b777bf752/Slides/3.pdf -------------------------------------------------------------------------------- /Slides/4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schaeferRCOHR/Data_analysis/af20101c70a3c63386397141493b129b777bf752/Slides/4.pdf -------------------------------------------------------------------------------- /Slides/5.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/schaeferRCOHR/Data_analysis/af20101c70a3c63386397141493b129b777bf752/Slides/5.pdf -------------------------------------------------------------------------------- /Slides/6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schaeferRCOHR/Data_analysis/af20101c70a3c63386397141493b129b777bf752/Slides/6.pdf -------------------------------------------------------------------------------- /Slides/7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schaeferRCOHR/Data_analysis/af20101c70a3c63386397141493b129b777bf752/Slides/7.pdf -------------------------------------------------------------------------------- /Slides/8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schaeferRCOHR/Data_analysis/af20101c70a3c63386397141493b129b777bf752/Slides/8.pdf -------------------------------------------------------------------------------- /Slides/9.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schaeferRCOHR/Data_analysis/af20101c70a3c63386397141493b129b777bf752/Slides/9.pdf --------------------------------------------------------------------------------