├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── CONTRIBUTING.md
├── DESCRIPTION
├── LICENSE-MIT.md
├── LICENSE.md
├── README.md
├── _commands
    ├── build-pdf.R
    └── build-slides.R
├── _includes
    ├── footer.html
    ├── header.html
    ├── preamble.tex
    └── style.html
├── _site.yml
├── _templates
    └── lectures.Rmd
├── about.Rmd
├── assignment-01.Rmd
├── assignment-02.Rmd
├── assignment-03.Rmd
├── assignment-04.Rmd
├── assignment-05.Rmd
├── assignment-06.Rmd
├── assignment-07.Rmd
├── assignment-09-challenge.Rmd
├── assignment-final.Rmd
├── codemeta.json
├── data
    ├── Assign05_Question3.csv
    ├── Fitzpatrick_2018.csv
    ├── NEON_PlantPA_HARV_201707.csv
    ├── Rivkin_2018_AJB.txt
    ├── Santangelo_JEB_2018.csv
    ├── Thompson-Johnson_2016_Evol.csv
    ├── africa.wide.csv
    ├── iris.csv
    ├── jellyfish.csv
    ├── kenya.wide.csv
    ├── lec09_CommunityMatrix_Example.csv
    ├── plant-biomass-preprocess.csv
    ├── plant_phenology.csv
    ├── portal_data.csv
    ├── predator_prey_body_size.txt
    ├── pseudo.LTRs
    ├── pseudo.ara.busco
    ├── pseudo.euk.busco
    ├── pseudoMol_Kdist.txt
    ├── rikz_data.txt
    ├── survey.csv.gz
    ├── wc2.0_bio_10m_01.tif
    └── wc2.0_bio_10m_12.tif
├── image
    ├── Liriodendron_tulipifera.png
    ├── RIKZ_data.png
    ├── RIKZ_data_Crossed.png
    ├── RIKZ_data_DeepNest.png
    ├── SEM-figure.png
    ├── SEMfig.png
    ├── assignment-8-figure-q1.png
    ├── boxplot-problem.gif
    ├── colourblind.png
    ├── comic-filenaming.gif
    ├── dynamite-bars.png
    ├── dynamite-vs-dists.png
    ├── favicon.png
    ├── fig_scientific_method.png
    ├── git_lesson
    │   ├── branch_dropdown.png
    │   ├── branches.png
    │   ├── delete_branch.png
    │   ├── sample_rmd.png
    │   └── yellow_prompt.png
    ├── heatmap.png
    ├── logistic.gif
    ├── lotka-volterra.gif
    ├── model.png
    ├── predator-prey.gif
    └── signal-transduction-pathway.png
├── index.Rmd
├── lec01-introduction.Rmd
├── lec02-basic-r.Rmd
├── lec03-basic-r.Rmd
├── lec04-dplyr.Rmd
├── lec05-dplyr.Rmd
├── lec06-exploratory-data-analysis.Rmd
├── lec07-linear-modelling.Rmd
├── lec08-linear-mixed-effects-models.Rmd
├── lec09-model-selection.Rmd
├── lec10-multivariate-stats.Rmd
├── lec11-spatial-stats.Rmd
├── lec12-randomization-tests.Rmd
├── lec13-theory.Rmd
├── lec14-datasets.Rmd
├── lec15-git-projects.Rmd
├── mid-project-update.Rmd
├── paper.bib
├── paper.md
├── rcourse.Rproj
├── resources.Rmd
└── resources
    ├── HighstatLibV6.R
    └── Statistical-decision-tree.pdf


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^README\.Rmd$
4 | ^README-.*\.png$
5 | ^\.travis\.yml$
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .RData
 4 | .Ruserdata
 5 | _site/
 6 | _pdf/
 7 | misc/
 8 | about.html
 9 | assignment-01.html
10 | assignment-02.html
11 | assignment-03.html
12 | assignment-04.html
13 | assignment-05.html
14 | assignment-06.html
15 | assignment-07.html
16 | assignment-08.html
17 | assignment-08_files/
18 | assignment-final.html
19 | index.docx
20 | index.html
21 | index.pdf
22 | lec01-introduction.html
23 | lec01-introduction_files/
24 | lec02-basic-r.html
25 | lec03-basic-r.html
26 | lec04-dplyr.html
27 | lec04-dplyr_files/
28 | lec05-dplyr.html
29 | lec05-dplyr_files/
30 | lec06-pop-models.html
31 | lec06-pop-models_files/
32 | lec07-pop-models.html
33 | lec07-pop-models_files/
34 | lec08-linear-mixed-effects-models.html
35 | lec08-linear-mixed-effects-models_files/
36 | site_libs/
37 | Fitzpatrick_2018.csv
38 | NEON_PlantPA_HARV_201707.csv
39 | lec09_CommunityMatrix_Example.csv
40 | rikz_data.txt
41 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: R
 2 | sudo: false
 3 | cache: packages
 4 | script:
 5 | - Rscript -e "rmarkdown::render_site('.')"
 6 | - touch _site/.nojekyll
 7 | dist: trusty
 8 | addons:
 9 |   apt:
10 |     packages:
11 |       - gdal-bin
12 |       - libgdal1-dev
13 |       - libproj-dev
14 | deploy:
15 |   provider: pages
16 |   skip_cleanup: true
17 |   github_token: $GITHUB_PAT
18 |   local_dir: _site
19 |   on:
20 |     branch: master
21 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing
 2 | 
 3 | Quantitative Methods in R for Biology is an open-source course,
 4 | aimed at a third- to fourth-year undergraduate level. 
 5 | 
 6 | ## How to Contribute
 7 | 
 8 | Anyone can contribute to the course repository via [pull requests][pull-requests].
 9 | 
10 | We use [GitHub flow][github-flow] to manage changes:
11 | 
12 | 1. Create a [fork][fork-explanation] of this repository, and [clone][clone-explanation] it to your local computer.
13 | 2. In your local copy of this repository, create a new [branch][branch-explanation].
14 | 3. Commit your changes to that branch.
15 | 4. Push the edits on that branch to your fork on GitHub.
16 | 5. Submit a pull request to the master repository (`UofTCoders/rcourse`).
17 | 6. Your pull request will trigger a [Travis][travis-website] build (see below for details).
18 | 7. If you receive feedback on your pull request, or encounter errors in the Travis build, 
19 | make further commits to the new branch on your fork. These will automatically be added to 
20 | your pull request. 
21 | 
22 | You may wish to look at [How to Contribute to an Open Source Project on GitHub][contribute]
23 | for more detailed instructions. The [GitHub Glossary][glossary] is also a useful resource that explains
24 | Git-related terminology. 
25 | 
26 | ## Continuous Integration with Travis
27 | 
28 | We use [Travis CI][travis-website] to test all materials in the course repo. Any changes
29 | in the form of a pull request will trigger a Travis build, where 
30 | Travis will attempt to test the code in the repo,
31 | [knitting][knitr-explanation] all lesson materials in the process. 
32 | Any errors will cause the Travis build to fail.
33 | 
34 | Pull requests can only be merged into the repo with a passing Travis build;
35 | this is to ensure that all course material is functional. If you submit a pull
36 | request that does not pass a Travis build, a traceback to the error can be found 
37 | on the link to that pull request's respective build. 
38 | 
39 | More on the practice of continuous integration can be found on [Travis CI's website][ci-explanation]. 
40 | 
41 | ## Format
42 | 
43 | We follow a consistent format across all course materials. A lesson
44 | template can be found [here][lecture-template].
45 | 
46 | ### Lessons
47 | 
48 | 1. All lesson material is in R Markdown (`.Rmd`) format.
49 | 
50 | 2. All lessons begin with a Lesson Preamble, subdivided into
51 | 'Learning objectives' and a 'Lesson outline'. The outline should
52 | also list approximate time requirements for each segment. 
53 | 
54 | 3. Lessons include a mix of code chunks and text, organized using Markdown headers.
55 | 
56 | 4. Students should be able to follow the contents of the lesson from the text alone;
57 | i.e. the file should contain _all_ conceptual explanations.
58 | 
59 | ### Assignments
60 | 
61 | 1. All assignments are in R Markdown (`.Rmd`) format.
62 | 
63 | 2. Assignment files contain a numbered list of questions and are comparatively light on code.
64 | They are designed such that students fill in answers by adding in code chunks of their own.
65 | 
66 | 3. Code chunks in assignments should mostly be limited to loading required packages,
67 | downloading required data, or data cleaning if necessary. 
68 | 
69 | 
70 | [branch-explanation]: https://help.github.com/articles/about-branches/
71 | [ci-explanation]: https://docs.travis-ci.com/user/for-beginners/
72 | [clone-explanation]: https://help.github.com/articles/cloning-a-repository/
73 | [contribute]: https://egghead.io/courses/how-to-contribute-to-an-open-source-project-on-github
74 | [fork-explanation]: https://help.github.com/articles/fork-a-repo/
75 | [github-flow]: https://guides.github.com/introduction/flow/
76 | [glossary]: https://help.github.com/articles/github-glossary/
77 | [knitr-explanation]: https://rmarkdown.rstudio.com/authoring_quick_tour.html
78 | [lecture-template]: https://github.com/UofTCoders/rcourse/blob/master/_templates/lectures.Rmd
79 | [pull-requests]: https://help.github.com/articles/about-pull-requests/
80 | [travis-website]: https://travis-ci.org/
81 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Type: Website
 2 | Package: rcourse
 3 | Title: Reproducible Quantitative Methods in R Course in Ecology and Evolutionary Biology
 4 | Version: 2.2.0
 5 | Maintainer: Luke Johnston <lwjohnst@gmail.com>
 6 | Author: Luke Johnston <lwjohnst@gmail.com>
 7 | Authors@R: c(
 8 |     person("Luke", "Johnston", email = "lwjohnst@gmail.com", role = c("aut", "cre")),
 9 |     person("Madeleine", "Bonsma", email = "m.bonsma@mail.utoronto.ca", role = c("aut")),
10 |     person("Lindsay", "Coome", email = "lindsay.coome@mail.utoronto.ca", role = c("aut")),
11 |     person("Joel", "Ostblom", email = "joel.ostblom@gmail.com", role = c("aut")),
12 |     person("Elliott", "Sales de Andrade", email = "esalesde@physics.utoronto.ca", role = c("aut")),
13 |     person("Lina", "Tran", email = "lina.mntran@gmail.com", role = c("aut")),
14 |     person("Sara", "Mahallati", email = "sara.mahallati@gmail.com", role = c("aut")),
15 |     person("James", "Santangelo", email = "james.santangelo37@gmail.com", role = c("aut")),
16 |     person("Ahmed", "Hasan", email = "ahmed.hasan@mail.utoronto.ca", role = c("aut")),
17 |     person("Amber", "Hoi", email = "amber.hoi@mail.utoronto.ca", role = c("ctb")),
18 |     person("Zoe", "Humphries", email = "zoe.humphries@mail.utoronto.ca", role = c("ctb"))
19 |     )
20 | Depends: R (>= 3.4.0)
21 | License: CC-BY
22 | Encoding: UTF-8
23 | Imports:
24 |     rmarkdown,
25 |     knitr,
26 |     tidyverse,
27 |     GGally,
28 |     broom,
29 |     lattice,
30 |     lme4,
31 |     lmerTest,
32 |     reshape2,
33 |     EcoSimR,
34 |     car,
35 |     multcomp,
36 |     MuMIn,
37 |     deSolve,
38 |     PerformanceAnalytics,
39 |     viridis,
40 |     lavaan,
41 |     ggfortify,
42 |     nlme,
43 |     sp,
44 |     ape,
45 |     rgdal,
46 |     raster,
47 |     maps
48 | 


--------------------------------------------------------------------------------
/LICENSE-MIT.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "MIT License"
 3 | ---
 4 | 
 5 | MIT License
 6 | 
 7 | Copyright (c) 2017-2018 UofTCoders
 8 | 
 9 | Permission is hereby granted, free of charge, to any person obtaining a copy
10 | of this software and associated documentation files (the "Software"), to deal
11 | in the Software without restriction, including without limitation the rights
12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 | copies of the Software, and to permit persons to whom the Software is
14 | furnished to do so, subject to the following conditions:
15 | 
16 | The above copyright notice and this permission notice shall be included in all
17 | copies or substantial portions of the Software.
18 | 
19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 | SOFTWARE.
26 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Licensing and copyright information"
 3 | ---
 4 | 
 5 | The course material is licensed under the 
 6 | [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/)
 7 | and the course code is licensed under a [MIT License](LICENSE-MIT.html)
 8 | by the [UofTCoders](https://uoftcoders.github.io) (see the [about](about.html)
 9 | for those involved in creating the course).
10 | In addition to our own developed material, we have also modified material from
11 | other courses and workshops:
12 | 
13 | - [Data Carpentry](http://datacarpentry.org) (licensed under the 
14 | [CC-BY 2.0 Generic License](https://creativecommons.org/licenses/by/2.0/))
15 | - [Reproducible Quantitative Methods Course](https://cbahlai.github.io/rqm-template/)
16 | (licensed under the [CC-BY 4.0 International License](https://creativecommons.org/licenses/by/4.0/))
17 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Quantitative Methods in R for Biology
 2 | =========================================
 3 | 
 4 | Course in Ecology and Evolutionary Biology
 5 | ------------------------------------------
 6 | 
 7 | [![Build Status](https://travis-ci.org/UofTCoders/rcourse.svg?branch=master)](https://travis-ci.org/UofTCoders/rcourse)
 8 | [![DOI](https://zenodo.org/badge/97400494.svg)](https://zenodo.org/badge/latestdoi/97400494)
 9 | [![License: CC BY 4.0](https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by/4.0/)
10 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
11 | [![status](http://jose.theoj.org/papers/1a083e69c49c15011f9404dfab9b1ec8/status.svg)](http://jose.theoj.org/papers/1a083e69c49c15011f9404dfab9b1ec8)
12 | 
13 | ## Description
14 | 
15 | Quantitative Methods in R for Biology is a course aimed at undergraduates at a third year level or above. 
16 | The course covers statistics and data analysis for ecology and reproducible quantitative methods in R. 
17 | 
18 | Statistical analysis, modelling, and data simulation are essential skills for ecologists and evolutionary biologists. 
19 | Furthermore, ever larger datasets are quickly becoming the norm in a variety of scientific disciplines. 
20 | This course is therefore designed to meet a growing demand for reproducible, openly accessible, 
21 | analytically thorough, and well-documented science. Students will learn to develop ecological population models, 
22 | analyze large datasets, and document their research using the R programming language. No prior programming experience is required.
23 | 
24 | For more detail on the course, check out the [syllabus](https://uoftcoders.github.io/rcourse/).
25 | 
26 | ## Instructional Design
27 | 
28 | The lectures in this course are designed to be presented using a participatory live-coding approach.
29 | This involves an instructor typing and running code in [RStudio](https://www.rstudio.com/) in front of the class, while the class follows
30 | along using their own computers. Challenges are interspersed in the lesson material, allowing students to 
31 | collaboratively work on smaller coding problems for a few minutes. All lesson materials are provided ahead
32 | of time on the course website for students to refer to during lectures.
33 | 
34 | The bulk of the course's assessment structure involves weekly assignments. These assignments
35 | are primarily code-based and are designed to also be completed in RStudio using the R Markdown format.
36 | 
37 | At the end of the course, students undertake a group project, wherein they attempt to address a scientific
38 | question by applying techniques learned over the course to open ecological data. At the end of the semester,
39 | groups present their work in a conference-style presentation, and submit a report in the style of a scientific paper. 
40 | 
41 | ## Lecture Content
42 | 
43 | The course's lesson material is broadly subdivided into three main topics:
44 | 
45 | 1. Introductory R (Lectures 1-5)
46 |     * Introduces students to the R programming language, with a focus on 
47 |       data wrangling and visualization.
48 | 2. Statistical analysis (Lectures 6-12)
49 |     * Introduces concepts such as regression, principal component analysis, statistical models, and numerical models.
50 | 3. Reproducible science (Lectures 13-15)
51 |     * Prepares students for project work period and introduces methods for reproducible science (GitHub, R Markdown).
52 | 
53 | ## Content Reuse Instructions
54 | 
55 | If you are interested in using or modifying this content and repository for your 
56 | own course, there are a few steps you need to take:
57 | 
58 | 1. Create a fork of this repository.
59 | 1. Create a [personal access token](https://help.github.com/en/articles/creating-a-personal-access-token-for-the-command-line) for your account on GitHub (make sure to enable the "repo" scope so that using this token will enable writing to your GitHub repos) and copy the token to your clipboard.
60 | 1. Go to https://travis-ci.org/USER/REPO/settings replacing `USER` with your GitHub ID and `REPO` with the name of the forked repository.
61 | 1. Under the section "Environment Variables", type `GITHUB_PAT` (the variable name that `.travis.yml` passes to the deploy step as `github_token`) in the "Name" text box and paste your personal access token into the "Value" text box.
62 | 
63 | In general, the first Travis CI build can take about 15-25 minutes, but subsequent builds take about 5-6 minutes.
64 | 
65 | ## Contributing
66 | 
67 | If you are interested in contributing to the course material, please refer to the guidelines in [CONTRIBUTING.md](https://github.com/UofTCoders/rcourse/blob/master/CONTRIBUTING.md). 
68 | 
69 | ## Related Publications
70 | 
71 | Santangelo JS (2019). Data simulation and randomization tests. NEON Faculty Mentoring Network, QUBES Educational Resources. doi:10.25334/Q4CT7P. [Available online](https://qubeshub.org/qubesresources/publications/996/1).
72 | 
73 | Bonsma-Fisher M, Hasan AR (2018). Working with plant phenology data and fitting a nonlinear model using least squares in R. NEON Faculty Mentoring Network, QUBES Educational Resources. doi:10.25334/Q4Q73D. [Available online](https://qubeshub.org/qubesresources/publications/978/1).
74 | 
75 | ## Acknowledgements
76 | 
77 | We thank Dr. Christie Bahlai, Dr. Asher Cutter, Dr. Martin Krkosek, and the Department of Ecology
78 | and Evolutionary Biology at the University of Toronto for helping make this course a reality.
79 | 
80 | We also thank Dr. Megan Jones and Dr. Kusum Naithani for their support and guidance, particularly
81 | around use of the NEON Ecological Observatory data. 
82 | 


--------------------------------------------------------------------------------
/_commands/build-pdf.R:
--------------------------------------------------------------------------------
 1 | # Run from the repository root ("rcourse/"), not from "rcourse/_commands".
 2 | #
 3 | # Usage:
 4 | #
 5 | #   Rscript _commands/build-pdf.R
 6 | 
 7 | # Select the lecture Rmd files to convert to PDF. The pattern currently
 8 | # matches only lecture 11; widen it (e.g. "^lec.*\\.Rmd$") to build every
 9 | # lecture. The dot before "Rmd" is escaped so it matches a literal ".".
10 | rmd_files <- list.files(pattern = "^lec11.*\\.Rmd$")
11 | 
12 | # Options forwarded to rmarkdown::pdf_document(). `includes` is spelled out in
13 | # full instead of relying on partial argument matching of `include`.
14 | pdf_options <- list(
15 |     pandoc_args = c("-V", "fontsize=12pt", "-V", "papersize=letter"),
16 |     includes = list(in_header = "_includes/preamble.tex")
17 | )
18 | 
19 | # Render each file for its side effect of writing a PDF into "_pdf"; looping
20 | # over an empty selection simply does nothing.
21 | for (rmd_file in rmd_files) {
22 |     rmarkdown::render(
23 |         rmd_file,
24 |         output_format = "pdf_document",
25 |         output_dir = "_pdf",
26 |         output_options = pdf_options
27 |     )
28 | }
29 | 


--------------------------------------------------------------------------------
/_commands/build-slides.R:
--------------------------------------------------------------------------------
 1 | # Run this command from the repository root (`rcourse/`).
 2 | #
 3 | # Usage:
 4 | #
 5 | #   Rscript _commands/build-slides.R
 6 | #
 7 | 
 8 | # Lecture Rmd files to render as ioslides HTML slides; add further file names
 9 | # to this vector to build more slide decks.
10 | rmd_files <- c(
11 |     "lec14-datasets.Rmd"
12 | )
13 | 
14 | # Drop NA placeholders instead of testing `!is.na(rmd_files)` inside `if`:
15 | # `if` needs a length-one condition, so that test stops working as soon as the
16 | # vector lists more than one file (or none at all).
17 | rmd_files <- rmd_files[!is.na(rmd_files)]
18 | 
19 | # Options forwarded to rmarkdown::ioslides_presentation().
20 | slide_options <- list(
21 |     df_print = "kable",
22 |     slide_level = 3,
23 |     smaller = TRUE,
24 |     transition = 0.01
25 | )
26 | 
27 | # Render each deck for its side effect of writing HTML into "_site/slides";
28 | # an empty selection renders nothing.
29 | for (rmd_file in rmd_files) {
30 |     rmarkdown::render(
31 |         rmd_file,
32 |         output_format = "ioslides_presentation",
33 |         output_dir = "_site/slides",
34 |         output_options = slide_options
35 |     )
36 | }
37 | 


--------------------------------------------------------------------------------
/_includes/footer.html:
--------------------------------------------------------------------------------
1 | 
2 | <hr>
3 | 
4 | <p>This work is licensed under a <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</a>. See the <a href="LICENSE.html">licensing</a> page for more details about copyright information.</p>
5 | 


--------------------------------------------------------------------------------
/_includes/header.html:
--------------------------------------------------------------------------------
1 | <link rel="icon" type="image/png" href="image/favicon.png">
2 | 


--------------------------------------------------------------------------------
/_includes/preamble.tex:
--------------------------------------------------------------------------------
1 | 
2 | \usepackage[bitstream-charter]{mathdesign}
3 | \usepackage[T1]{fontenc}
4 | \usepackage[utf8]{inputenc}
5 | 


--------------------------------------------------------------------------------
/_includes/style.html:
--------------------------------------------------------------------------------
 1 | <style type="text/css">
 2 | 
 3 | table {
 4 |     border-collapse: collapse;
 5 | }
 6 | 
 7 | thead {
 8 |     border-top: solid #DCDCDC;
 9 |     border-bottom: solid #DCDCDC;
10 | }
11 | 
12 | tr.odd {
13 |     background-color: #F8F8F8;
14 | }
15 | 
16 | tr:last-child {
17 |     border-bottom: solid #DCDCDC;
18 | }
19 | 
20 | </style>
21 | 


--------------------------------------------------------------------------------
/_site.yml:
--------------------------------------------------------------------------------
  1 | name: EEB313H1 Theoretical Ecology and Reproducible Quantitative Methods in R
  2 | output_dir: _site
  3 | exclude:
  4 | - DESCRIPTION
  5 | - LICENSE
  6 | new_session: true
  7 | navbar:
  8 |   title: EEB313H1
  9 |   left:
 10 |   - text: Syllabus
 11 |     icon: fa-map-o
 12 |     href: index.html
 13 |   - text: Lectures
 14 |     icon: fa-book
 15 |     menu:
 16 |     - text: 'Sept. 10: Intro to course, programming, RStudio, and R Markdown'
 17 |       href: lec01-introduction.html
 18 |     - text: 'Sept. 12: Assignment, vectors, functions'
 19 |       href: lec02-basic-r.html
 20 |     - text: 'Sept. 17: Data frames, intro to dplyr'
 21 |       href: lec03-basic-r.html
 22 |     - text: 'Sept. 19: Data wrangling in dplyr, ggplot, tidy data'
 23 |       href: lec04-dplyr.html
 24 |     - text: 'Sept. 24: More dplyr and ggplot'
 25 |       href: lec05-dplyr.html
 26 |     - text: 'Sept. 26: Exploratory data analysis'
 27 |       href: lec06-exploratory-data-analysis.html
 28 |     - text: 'Oct. 01: Linear models and statistical modelling'
 29 |       href: lec07-linear-modelling.html
 30 |     - text: 'Oct. 03: Mixed effects models'
 31 |       href: lec08-linear-mixed-effects-models.html
 32 |     - text: 'Oct. 08: Model Selection'
 33 |       href: lec09-model-selection.html
 34 |     - text: 'Oct. 10: Multivariate stats'
 35 |       href: lec10-multivariate-stats.html
 36 |     - text: 'Oct. 15: Spatial stats'
 37 |       href: lec11-spatial-stats.html
 38 |     - text: 'Oct. 17: Simulating data: Randomization tests'
 39 |       href: lec12-randomization-tests.html
 40 |     - text: 'Oct. 22 & 24: Mathematical models in EEB'
 41 |       href: lec13-theory.html
 42 |     - text: 'Oct. 29: Datasets, hypotheses, begin projects'
 43 |       href: lec14-datasets.html
 44 |     - text: 'Oct. 29 (cont.): Collaborating with GitHub'
 45 |       href: lec15-git-projects.html
 46 |     - text: 'Oct. 31: Project work (no lesson)'
 47 |     - text: 'Nov. 12: Project work (no lesson)'
 48 |     - text: 'Nov. 14: Project work (no lesson)'
 49 |     - text: 'Nov. 19: Project work (no lesson)'
 50 |     - text: 'Nov. 21: Project work (no lesson)'
 51 |     - text: 'Nov. 26: Project work (no lesson)'
 52 |     - text: 'Nov. 28: Project work (no lesson)'
 53 |     - text: 'Dec. 03: Project work (no lesson)'
 54 |     - text: 'Dec. 05: Group presentations (no lesson)'
 55 |   - text: Assignments
 56 |     icon: fa-book
 57 |     menu:
 58 |     - text: Assignment 1
 59 |       href: assignment-01.html
 60 |     - text: Assignment 2
 61 |       href: assignment-02.html
 62 |     - text: Assignment 3
 63 |       href: assignment-03.html
 64 |     - text: Assignment 4
 65 |       href: assignment-04.html
 66 |     - text: Assignment 5
 67 |       href: assignment-05.html
 68 |     - text: Assignment 6
 69 |       href: assignment-06.html
 70 |     - text: Assignment 7
 71 |       href: assignment-07.html
 72 |     - text: Mid-project update
 73 |       href: mid-project-update.html
 74 |     - text: Challenge assignment
 75 |       href: assignment-09-challenge.html
 76 |     - text: Final project
 77 |       href: assignment-final.html
 78 |   - text: Resources and FAQ
 79 |     icon: fa-question
 80 |     href: resources.html
 81 |   - text: About
 82 |     icon: fa-info
 83 |     href: about.html
 84 |   right:
 85 |   - icon: fa-bars
 86 |     menu:
 87 |     - text: Contact
 88 |       icon: fa-envelope
 89 |       href: mailto:ahmed.hasan@mail.utoronto.ca
 90 |     - text: GitHub
 91 |       icon: fa-github
 92 |       href: https://github.com/uoftcoders/rcourse
 93 | 
 94 | output:  # HTML output options shared by the site's pages
 95 |   html_document:
 96 |     toc: yes
 97 |     toc_depth: 4
 98 |     toc_float:
 99 |       collapsed: no
100 |       smooth_scroll: no
101 |     includes:  # full argument name of rmarkdown::html_document()
102 |       in_header:
103 |       - _includes/header.html
104 |       - _includes/style.html
105 |       after_body: _includes/footer.html
106 |     theme: lumen
107 |     highlight: haddock
108 |     mathjax: default  # rmarkdown's default MathJax URL; cdn.mathjax.org was retired in 2017
109 |     lib_dir: site_libs
110 |     self_contained: no
111 | 
112 | 


--------------------------------------------------------------------------------
/_templates/lectures.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Lecture title"
 3 | ---
 4 | 
 5 | <!-- Make sure to remove all commented out notes -->
 6 | 
 7 | <!-- horizontal line for website -->
 8 | 
 9 | ----
10 | 
11 | <!-- Clearly stated lesson objectives -->
12 | 
13 | ## Lesson preamble
14 | 
15 | <!-- `##` = website/pdf header 3, slide header 1 (slide title) -->
16 | 
17 | ### Lesson objectives:
18 | 
19 | <!-- items can (but don't have to) reveal one at a time in slides -->
20 | 
21 | - Item 1 
22 | - Item 2
23 | 
24 | ### Lesson outline:
25 | 
26 | <!-- good to have an estimated time that each section takes, for us and them -->
27 | Total lesson time: {{num}} hours
28 | 
29 | - Outline 1 ({{num}} min)
30 | - Outline 2 ({{num}} min)
31 | - Outline 3 ({{num}} min)
32 | 
33 | <!-- horizontal line for website -->
34 | 
35 | ----
36 | 
37 | ## Header 2 ('section' header)
38 | 
39 | <!-- Header 3 will be a block section in slides -->
40 | 
41 | ### Header 3 (slide title)
42 | 
43 | ```{r}
44 | # R code chunk
45 | ```
46 | 
47 | #### Header 4 (slide block title)
48 | 
49 | ## {{ Group | Individual }} exercise
50 | 


--------------------------------------------------------------------------------
/about.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "About the course and instructors/TAs"
 3 | ---
 4 | 
 5 | ## Course inspiration
 6 | 
 7 | This course is based on [RQM](https://cbahlai.github.io/rqm-template/). It draws much inspiration and content from [Software Carpentry](https://software-carpentry.org/) and [Data Carpentry](http://www.datacarpentry.org/).
 8 | 
 9 | ## The course creators and instructors/TAs
10 | 
11 | - Madeleine Bonsma-Fisher (Physics)
12 | - Lindsay Coome (Psychology)
13 | - Luke Johnston (Nutritional Sciences)
14 | - Sara Mahallati (IBBME)
15 | - Joel Östblom (IBBME)
16 | - Elliott Sales de Andrade (Physics)
17 | - Lina Tran (Physiology)
18 | - James Santangelo (EEB)
19 | - Ahmed Hasan (CSB)
20 | - Zoe Humphries (EEB)
21 | - Amber Hoi (EEB)
22 | 
23 | Licensing information can be found in the [license](LICENSE.html) page.
24 | 


--------------------------------------------------------------------------------
/assignment-01.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Assignment 1: Intro to programming (4 marks)'
 3 | output:
 4 |     html_document:
 5 |         toc: false
 6 | ---
 7 | 
 8 | *To submit this assignment, upload the full document on Quercus,
 9 | including the original questions, your code, and the output. Submit
10 | your assignment as a knitted `.pdf` (preferred) or `.html` file.*
11 | 
12 | 1.  Get set up at home (or on a lab computer after hours). (1.5 marks)
13 |     -   Install [R](https://cran.rstudio.com/) and
14 |         [RStudio](https://www.rstudio.com/products/rstudio/download/)
15 |         (already installed on the lab computers).
16 |     -   Open a new R Notebook and read the instructions about how to use
17 |         the R Markdown syntax.
18 |     -   Open this assignment file (`assignment-01.Rmd`) in RStudio or
19 |         copy its content into an empty R Notebook.
20 |     -   Insert a code chunk below, above question 2.
21 |     -   In the code chunk, use `install.packages("<package_name>")` to
22 |         install `tidyverse` and `rmarkdown`. Remember to run the code
23 |         chunk to execute the commands.
24 |     -   Load the two libraries you just installed into your environment
25 |         with `library(<package_name>)` (no surrounding quotation marks
26 |         as with `install.packages()`). Add this to the same code chunk
27 |         you created previously and execute it again (don't worry that
28 |         the `install.packages()` commands have already been executed
29 |         once, R is smart and checks if you already have those
30 |         installed).
31 |     -   Run `sessionInfo()` to list all the loaded packages.
32 |         -   You should see the following packages under "other attached
33 |             packages": `rmarkdown`, `dplyr`, `purrr`, `readr`, `tidyr`,
34 |             `tibble`, `ggplot2`, and `tidyverse`.
35 |     -   Since this is your first assignment, we have already completed
36 |         most of this question below. You still need to run the code
37 |         chunk on your computer to confirm that the packages installed
38 |         without errors and to get the `sessionInfo()` output for your
39 |         computer. You might receive warnings that functions from other
40 |         packages are masked when you load `tidyverse`, this is fine.
41 | 
42 |         ```{r}
43 |         install.packages("tidyverse")
44 |         install.packages("rmarkdown")
45 |     
46 |         library(tidyverse)
47 |         library(rmarkdown)
48 |     
49 |         sessionInfo()
50 |     
51 |         # Expected answer
52 |         # The output is included below this code chunk with the appropriate 
53 |         # packages loaded.
54 |         ```
55 | 
56 | 2.  What is R Markdown and why are we using it in this class? *Hint:*
57 |     You are using R Markdown right now to complete this assignment! (1
58 |     mark)
59 |     -   Which key combination would you use to insert a chunk of code in
60 |         an R Markdown document?
61 |     -   Which key combination would you use to execute a code chunk?
62 | 
63 | 3.  Provide a few reasons as to why it is beneficial to create documents
64 |     like R Notebooks rather than using spreadsheet software for
65 |     exploratory data analyses. (1 mark)
66 | 
67 | 4.  Fill out the pre-course survey posted on Quercus. Type your student number 
68 |     below to confirm that you are done. (0.5 marks)
69 | 


--------------------------------------------------------------------------------
/assignment-02.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'Assignment 2: Base R (8 marks)'
  3 | output:
  4 |     html_document:
  5 |         toc: false
  6 | ---
  7 | 
  8 | *To submit this assignment, upload the full document on blackboard,
  9 | including the original questions, your code, and the output. Submit
 10 | your assignment as a knitted `.pdf` (preferred) or `.html` file.*
 11 | 
 12 | 1.  Variable assignment (1 mark)
 13 | 
 14 |     a.  Assign the value `5` to the variable/object `a`. Display `a`.
 15 |         (0.25 marks)
 16 | 
 17 |     b.  Assign the result of `10/3` to the variable `b`. Display `b`.
 18 |         (0.25 marks)
 19 | 
 20 |     c.  Write a function that adds two numbers and returns their sum.
 21 |         Use it to assign the sum of `a` and `b` to `result`. Display `result`.
 22 |         (In practice, there is already a more sophisticated built-in
 23 |         function for this: `result <- sum(a, b)`) (0.25 marks)
 24 | 
 25 |     d.  Write a function that multiplies two numbers and returns their product.
 26 |         Use it to assign the product of `a` and `b` to `product`. Display `product`.
 27 |         (In practice, there is already a more sophisticated built-in
 28 |         function for this: `product <- prod(a, b)`) (0.25 marks)
 29 | 
 30 | 2.  Vectors (1 mark)
 31 | 
 32 |     a.  Create a vector `v` with all integers 0-30, and a vector `w`
 33 |         with every third integer in the same range. (0.25 marks)
 34 | 
 35 |     b.  What is the difference in lengths of the vectors `v` and `w`?
 36 |         (0.25 marks)
 37 | 
 38 |     c.  Create a new vector, `v_square`, with the square of elements at indices
 39 |         3, 6, 7, 10, 15, 22, 23, 24, and 30 from the variable `v`. *Hint:
 40 |         Use indexing rather than a for loop.* (0.25 marks)
 41 | 
 42 |     d.  Calculate the mean and median of the first five values from
 43 |         `v_square`. (0.25 marks)
 44 | 
 45 | 3.  Boolean indexing (1 mark)
 46 | 
 47 |     a.  Create a boolean vector `v_bool`, indicating which vector `v`
 48 |         elements are bigger than 20. How many values are over 20? *Hint:
 49 |         In R, TRUE = 1, and FALSE = 0, so you can use simple arithmetic
 50 |         to find this out.* (0.5 marks)
 51 | 
 52 |     b.  Display the output of `v[TRUE]`. Explain why you think R outputs this.
 53 |         (0.25 marks) _(Note: this is not really something you would ever need 
 54 |         to do in practice!)_
 55 | 
 56 |     c.  Use the variable `v_bool` as an index to extract the elements
 57 |         from `v` that are bigger than 20. What are the min and max
 58 |         values of this new vector? (0.25 marks)
 59 | 
 60 | 4.  Data frames (2 marks)
 61 | 
 62 |     a.  There are many built-in data frames in R, which you can find
 63 |         [more details about
 64 |         online](https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html).
 65 |         What are the column names of the built-in dataframe `beaver1`?
 66 |         How many observations (rows) and variables (columns) are there?
 67 |         (0.5 marks)
 68 | 
 69 |     b.  Display both the first 6 and last 6 rows of this data frame.
 70 |         Show how to do so with both indexing as well as specialized functions. (0.5 marks)
 71 | 
 72 |     c.  What is the min, mean, and max body temperature in this data set?
 73 |         *Hint: Remember that each column in a data frame is a vector, so
 74 |         you can use the same functions as in the previous question on
 75 |         vectors.* (0.5 marks)
 76 | 
 77 |     d.  Use the `summary` function to display an overview of the `temp`
 78 |         column. (0.25 marks)
 79 | 
 80 |     e.  Use a single instance of the `summary` function to display an overview 
 81 |         of the `time` and `temp` columns. (0.25 marks)
 82 | 
 83 | 5.  Data frames with dplyr (3 marks)
 84 | 
 85 |     a.  Say we're attempting to calculate mean temperature in the `beaver1` dataset.
 86 |         What is wrong with the following chain of dplyr commands? (0.5 marks)
 87 |         ```
 88 |         beaver1 %>%
 89 |             filter(is.na(temp)) %>%
 90 |             summarise(mean_temp = mean(temp))
 91 |         ```
 92 | 
 93 |     b.  Use dplyr to randomly sample 20 rows from `beaver1`. Calculate
 94 |         mean temperature from this subsetted dataset. (0.5 marks)
 95 |         _Hint: you may want to refer to the dplyr cheatsheet for this_
 96 | 
 97 |     b.  Using the full `beaver1` dataset, calculate the mean temperature 
 98 |         for day 346. (0.25 marks) 
 99 |         _Note: use the full dataset for parts c-f below as well._
100 | 
101 |     c.  Rather than using `filter()` to calculate the mean for each day
102 |         separately, the more convenient `group_by()` can be used to
103 |         aggregate measurements by a categorical value (such as the `day`
104 |         column in `beaver1`). Use this approach to calculate the mean
105 |         temperature and activity level for each of the days in the
106 |         dataset. (0.5 marks)
107 | 
108 |     d.  Express in writing what the average activity level from the
109 |         above calculation means. *Hint: Remember that you can [read a
110 |         description of the columns
111 |         online](https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html).*
112 |         (0.25 marks)
113 | 
114 |     e.  How many observations are there per day in this dataset? (0.25
115 |         marks)
116 | 
117 |     f.  How many observations are there per day when the beaver is
118 |         active outside the retreat? (0.25 marks)
119 | 
120 |     g.  Grouping by activity level *and* the day of the observation.
121 |         Which variable seems to be more related to high body
122 |         temperature: activity level or day of measurement? (0.5 marks)
123 | 


--------------------------------------------------------------------------------
/assignment-03.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'Assignment 3: dplyr and ggplot (8 marks)'
  3 | output:
  4 |     html_document:
  5 |         toc: false
  6 | ---
  7 | 
  8 | *To submit this assignment, upload the full document on blackboard,
  9 | including the original questions, your code, and the output. Submit
 10 | your assignment as a knitted `.pdf` (preferred) or `.html` file.*
 11 | 
 12 | 1.  Plotting (1 mark)
 13 | 
 14 |     Run the block below to create a categorical variable of the `activ`
 15 |     column. This will make dplyr recognize that there are only two
 16 |     levels of activity (0 and 1), rather than a continuous range 0-1,
 17 |     which will facilitate plotting.
 18 | 
 19 |     ```{r}
 20 |     library(tidyverse)
 21 |     beaver1 <- beaver1 %>%
 22 |         mutate(factor_activ = factor(activ))
 23 |     ```
 24 | 
 25 |     a.  In the previous assignment, we saw that the beaver's body
 26 |         temperature was the highest when the beaver was outside the
 27 |         retreat. However, we did not explore the distribution of
 28 |         temperatures for the active and inactive conditions. Create a
 29 |         histogram with the temperature on the x-axis and color the bins
 30 |         corresponding to the activity variable. *Hint: You need to use
 31 |         the `fill` parameter rather than `color`; and make sure you are
 32 |         using the correct `activ` column!* (0.25 marks)
 33 | 
 34 |     b.  We already know that the beaver's body temperature is correlated
 35 |         with whether it is outside the retreat or not. However, we did
 36 |         not control for the time of day, maybe the beaver's temperature
 37 |         is even better predicted by knowing what time of day it is. To
 38 |         satisfactorily answer this question, we should perform a
 39 |         regression analysis, but we easily can get a good overview by
 40 |         plotting the data. Make a scatter plot with the time of day on
 41 |         the x-axis and the body temperature on the y-axis. Color the
 42 |         scatter points according to the beaver's activity level and
 43 |         separate the measurements into one plot per day. *Hint: To
 44 |         separate measurements per day, you could use `filter()` and two
 45 |         chunks of code, but try the more efficient way of facetting into
 46 |         subplots, which we talked about in the lecture.* (0.75 marks)
 47 | 
 48 | 2.  Read in and pre-process data (1.5 marks)
 49 | 
 50 |     Ok, that's enough about beaver body temperatures. Now you will apply
 51 |     your data wrangling skills on the yearly change in biomass of plants
 52 |     in the [beautiful Abisko national park in northern
 53 |     Sweden](https://en.wikipedia.org/wiki/Abisko_National_Park). We have
 54 |     preprocessed this data and made [it available as a csv file via this
 55 |     link](https://uoftcoders.github.io/rcourse/data/plant-biomass-preprocess.csv).
 56 |     You can find the original data and a short readme on
 57 |     [figshare](https://figshare.com/articles/Time_Series_of_plant_biomass_during_1998-2013/4149648)
 58 |     and [dryad](https://datadryad.org/resource/doi:10.5061/dryad.38s21).
 59 |     The original study[^1] is available with an open access license.
 60 |     Reading through the readme on figshare, and the study abstract will
 61 |     increase your understanding for working with the data.
 62 | 
 63 |     a.  Read the data directly from the provided URL into a variable
 64 |         called `plant_biomass` and display the first six rows. (0.25
 65 |         mark)
 66 | 
 67 |     b.  Convert the Latin column names into their common English names:
 68 |         lingonberry, bilberry, bog bilberry, dwarf birch, crowberry, and
 69 |         wavy hair grass. After this, display all column names. *Hint:
 70 |         Search online to find out which Latin and English names pair up.
 71 |         There is a function in the `dplyr` cheat sheet that might help you
 72 |         rename these columns. Finally, check the [tidyverse style
 73 |         guide](http://style.tidyverse.org/syntax.html#object-names) to make
 74 |         sure your new column names are formatted correctly.* (0.5 marks)
 75 | 
 76 |     c.  This is a wide data frame (species make up the column names). A
 77 |         long format is easier to analyze, so gather the species names
 78 |         into one column (`species`) and the measurement values into
 79 |         another column (`biomass`). Assign it to the variable
 80 |         `plant_biomass` to overwrite the previous data frame. Make
 81 |         sure you don't lose any columns in the reshaping process!
 82 |         *Hint: Make sure the output is correct before overwriting the
 83 |         old variable.* (0.75 marks)
 84 | 
 85 | 3.  Data exploration (4.5 marks)
 86 | 
 87 |     Now that our data is in a tidy format, we can start exploring it!
 88 | 
 89 |     a.  What is the average biomass in g/m^2 for all observations in
 90 |         the study? (0.25 marks)
 91 | 
 92 |     b.  How does the average biomass compare between the grazed control
 93 |         sites and those that were protected from herbivores? (0.25
 94 |         marks)
 95 | 
 96 |     c.  Display a table of the average plant biomass for each year.
 97 |         (0.25 marks)
 98 | 
 99 |     d.  What is the mean plant biomass per year for the `grazedcontrol`
100 |         and `rodentexclosure` groups (spread these variables as separate
101 |         columns in a table). (0.5 marks)
102 | 
103 |     e.  Compare the biomass for `grazedcontrol` with that of
104 |         `rodentexclosure` graphically in a line plot. What could explain
105 |         the big dip in biomass year 2012? *Hint: The published study
106 |         might be able to help with the second question...* (0.5 marks)
107 | 
108 |     f.  How many distinct species are there? (0.25 marks)
109 | 
110 |     g.  Check whether there is an equal number of observations per
111 |         species. (0.25 marks)
112 | 
113 |     h.  Compare the yearly change in mean biomass for each species in a
114 |         lineplot. (0.5 marks)
115 | 
116 |     i.  From the previous two questions, we found that the biomass is
117 |         higher in the sites with rodent exclosures (especially in recent
118 |         years), and that the crowberry is the dominant species. Notice
119 |         how the lines for `rodentexclosure` (refer back to 3.d above) 
120 |         and `crowberry` are of similar shape. Coincidence? Let's find out! 
121 |         Use a facetted line plot to explore whether all plant species are 
122 |         impacted equally by grazing. (0.75 mark)
123 | 
124 |     j.  The habitat could also be affecting the biomass of different
125 |         species. Explore graphically if this is the case. *Hint: Think
126 |         about how to change your dataset groupings to make this plot* 
127 |         (0.5 marks)
128 | 
129 |     k.  It looks like both habitat and treatment have an effect on most
130 |         of the species! Let's dissect the data further by visualizing
131 |         the effect on each species of _both_ the habitat and treatment by
132 |         facetting the plot accordingly. *Hint: This is a hard one! You may want 
133 |         to explore R's documentation for ggplot's `facet_grid`* (0.5 marks)
134 | 
135 | 4.  Create a new column that represents the square of the biomass.
136 |     Display the three largest `squared_biomass` observations in
137 |     descending order. Only include the columns `year`, `squared_biomass`
138 |     and `species` and only observations between the years 2003 and 2008
139 |     from the forest habitat. *Hint: Break this down into single criteria
140 |     and add one at a time. You will be able to obtain the desired result
141 |     with five operations.* (1 mark)
142 | 
143 | [^1]: Olofsson J, te Beest M, Ericson L (2013) Complex biotic
144 |     interactions drive long-term vegetation dynamics in a subarctic
145 |     ecosystem. Philosophical Transactions of the Royal Society B
146 |     368(1624): 20120486. <https://dx.doi.org/10.1098/rstb.2012.0486>
147 | 


--------------------------------------------------------------------------------
/assignment-04.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'Assignment 4: Exploration, linear and mixed-effects models'
  3 | output:
  4 |     html_document:
  5 |         toc: false
  6 | ---
  7 | 
  8 | *To submit this assignment, upload the full document on blackboard,
  9 | including the original questions, your code, and the output. Submit
 10 | your assignment as a knitted `.pdf` (preferred) or `.html` file.*
 11 | 
 12 | 1. Visualization (3 marks)
 13 | 
 14 |     Import the tidyverse library. We will be using the same beaver1 dataset that
 15 |     we used in last week's assignment.
 16 | 
 17 |     ```{r message=FALSE, warning=FALSE}
 18 |     library(tidyverse)
 19 |     ```
 20 | 
 21 |     a. Create a histogram to visualize the distribution of the beavers' body
 22 |     temperatures, separating the temperature data based on the beaver's activity level
 23 |     (after transforming it into a categorical variable the way you did for your
 24 |     last assignment). Describe the properties of the distribution. When
 25 |     creating this plot for the purpose of evaluating temperature, what
 26 |     argument did you adjust and why? (1 mark)
 27 | 
 28 |     b. What type of variables are temperature and time of day? With this in
 29 |     mind, create a visualization that will help you get a better understanding
 30 |     of the relationship between these variables. (0.5 mark)
 31 | 
 32 |     c. Create a single box plot to simultaneously visualise temperature,
 33 |      activity, and day. (0.5 mark)
 34 |     
 35 |     d. What is one prediction you might make about the relationships among your
 36 |     variables (based on the patterns you observed)? Create a visualization that
 37 |     illustrates your prediction, improving on your previous plots in at least one
 38 |     way. State why this plot is an improvement. (1 mark)
 39 | 
 40 | 2. Outliers (2 marks)
 41 | 
 42 |     a. In the beaver1 dataset, there are some particularly high/low body
 43 |     temperature measurements. Give an example of a systematic or random error
 44 |     (state which) that could have influenced these values. (0.5 marks)
 45 | 
 46 |     b. Consider whether these values would affect your ability to test whether
 47 |     temperature varies by activity level. You should generate plots and/or
 48 |     perform statistical tests with and without these points, and make an
 49 |     informed decision about whether they should be kept or dropped (Hint: you
 50 |     may want to either create a second data set or get creative with colour.)
 51 |     State whether you would remove the points and why. (1.5 marks)
 52 |     
 53 | 3. Linear models (3 marks)
 54 | 
 55 |     Run the following code to load the CO2 dataset.
 56 | 
 57 |     ```{r}
 58 |     co2_df <- as_data_frame(as.matrix(CO2)) %>% 
 59 |         mutate(conc = as.integer(conc),
 60 |                uptake = as.numeric(uptake))
 61 |     ```
 62 | 
 63 |     a. Look through the help documentation (?CO2) to understand what each
 64 |     variable means. Imagine you were running a statistical model to assess the
 65 |     effects of chilling on plant CO2 uptake. What would the $y$ and $x$
 66 |     variables be in such a model? What about if you were trying to assess the
 67 |     relationship between ambient CO~2~ concentrations and plant uptake? Briefly
 68 |     defend these choices. (1 mark)
 69 | 
 70 |     b. How much does `uptake` change if `conc` goes up by 10 mL/L? Write out the
 71 |     interpretation as a simple statement of this contribution of `conc` on
 72 |     `uptake`. How much CO2 would you predict plants to uptake if atmospheric
 73 |     concentrations were 2,450 mL/L? Show your work. (2 marks)
 74 |         
 75 | 4. Linear mixed-effects models (4 marks).
 76 | 
 77 |     Santangelo _et al._ (2018) were interested in understanding how plant
 78 |     defenses, herbivores, and pollinators influence the expression of plant
 79 |     floral traits (e.g. flower size). Their experiment had 3 treatments, each
 80 |     with 2 levels: Plant defense (2 levels: defended vs. undefended), herbivory
 81 |     (2 levels: reduced vs. ambient) and pollination (2 levels: open vs.
 82 |     supplemental). These treatments were fully crossed for a total of 8
 83 |     treatment combinations. In each treatment combination, they grew 4
 84 |     individuals from each of 25 plant genotypes for a total of 800 plants (8
 85 |     treatment combinations x 25 genotypes x 4 individuals per genotype). Plants
 86 |     were grown in a common garden at the Koffler Scientific Reserve (UofT's field
 87 |     research station) and 6 floral traits were measured on all plants throughout
 88 |     the summer. We will analyze how the treatments influenced one of these
 89 |     traits in this exercise. Run the code chunk below to download the data,
 90 |     which includes only a subset of the columns from the full dataset:
 91 |     
 92 |     ```{r}
 93 |     library(tidyverse)
 94 |     
 95 |     plant_data <- "https://uoftcoders.github.io/rcourse/data/Santangelo_JEB_2018.csv"
 96 |     download.file(plant_data, "Santangelo_JEB_2018.csv")
 97 |     plant_data <- read_csv("Santangelo_JEB_2018.csv", 
 98 |                            col_names = TRUE)
 99 |     glimpse(plant_data)
100 |     head(plant_data)
101 |     ```
102 | 
103 |     You can see that the data contain 792 observations (i.e. plants, 8 died
104 |     during the experiment). There are 50 genotypes across 3 treatments:
105 |     Herbivory, Pollination, and HCN (i.e. hydrogen cyanide, a plant defense).
106 |     There are 6 plant floral traits: Number of days to first flower, banner
107 |     petal length, banner petal width, plant biomass, number of flowers, and
108 |     number of inflorescences. Finally, since plants that are closer in space in
109 |     the common garden may have similar trait expression due to more similar
110 |     environments, the authors included 6 spatial "blocks" to account for this
111 |     environmental variation (i.e. Plant from block A "share" an environment and
112 |     those from block B "share" an environment, etc.). Also keep in mind that
113 |     each treatment combination contains 4 individuals of each genotype, which
114 |     are likely to have similar trait expression due simply to shared genetics.
115 |     
116 |     a. Use the `lme4` and `lmerTest` R packages to run a linear mixed-effects
117 |     model examining how herbivores (`Herbivory`), Pollinators (`Pollination`),
118 |     plant defenses (`HCN`) _and all interactions_ influences the width of
119 |     banner petals (`Avg.Bnr.Wdth`) produced by plants while accounting for
120 |     variation due to spatial block and plant genotype. Also allow the intercept
121 |     for `Genotype` to vary across the levels of the herbivory treatment. (1
122 |     mark: 0.5 for correct fixed effects specification and 0.5 for correct random
123 |     effects structure). You only need to specify the model for this part of the
124 |     question.
125 | 
126 |     b. Summarize (i.e. get the output) the model that you ran in part (a). Did
127 |     any of the treatments have a significant effect on banner petal width? If
128 |     so, which ones? Based on your examination of the model output, how can you
129 |     tell which level of the significant treatments resulted in longer or shorter
130 |     mean banner petal widths? Make a statement for each significant **main**
131 |     effects in the model (i.e. not interactions). If none of the main effects
132 |     are significant, then simply write "there are no significant main effects in
133 |     the model" (0.5 marks).
134 | 
135 |     c. Using `dplyr` and `ggplot2`, plot the mean banner width for one of the
136 |     significant interactions in the model above (whichever you choose). The idea
137 |     is to show how both treatments interact to influence the mean width of
138 |     banner petals using a combination of different colours, linetypes, shapes,
139 |     etc. on the same plot (i.e., no faceting). Avoid overlapping points in the
140 |     figure. Also include error bars/bands with one standard error around the
141 |     mean. As a reminder, I have included the formula to calculate the standard
142 |     error of the mean below. (1.5 marks).
143 |     
144 |     $$ SE = \frac{sd}{\sqrt{n}}  $$
145 | 
146 |     d. After accounting for the fixed effects, what percentage of the variation
147 |     in banner petal width was explained by each of the random effects in the
148 |     model? Show your work. (0.5 marks).
149 | 
150 |     e. Describe the pattern you see in the figure generated in part (c). Why do
151 |     you think the interaction you plotted was significant in the model? (0.5 marks)
152 | 
153 | 
154 | 
155 | 
156 | 
157 | 
158 | 


--------------------------------------------------------------------------------
/assignment-05.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'Assignment 5: Model selection and multivariate statistics (9 marks)'
  3 | output:
  4 |     html_document:
  5 |         toc: false
  6 | ---
  7 | 
  8 | *To submit this assignment, upload the full document on blackboard,
  9 | including the original questions, your code, and the output. Submit
 10 | your assignment as a knitted `.pdf` (preferred) or `.html` file.*
 11 | 
 12 | 1. In this exercise, we will once again use the data of Santangelo _et al._
 13 | (2019) that you used in assignment 4. Let's go ahead and load in the data. See
 14 | assignment 4 if you need a refresher on the details of the experiment and the
 15 | dataset. (5 marks total)
 16 | 
 17 |     ```{r message=FALSE, warning=FALSE}
 18 |     library(tidyverse)
 19 |     library(lme4)
 20 |     library(lmerTest)
 21 |     
 22 |     
 23 |     Santangelo_data <- "https://uoftcoders.github.io/rcourse/data/Santangelo_JEB_2018.csv"
 24 |     download.file(Santangelo_data, "Santangelo_JEB_2018.csv")
 25 |     Santangelo_data <- read_csv("Santangelo_JEB_2018.csv", 
 26 |                                 col_names = TRUE)
 27 |     glimpse(Santangelo_data)
 28 |     head(Santangelo_data)
 29 |     ```
 30 | 
 31 | 
 32 |     a. Model selection works best when there are no missing values in your
 33 |     dataset. We will be identifying the best model that predicts variation in
 34 |     flowering time (`Flower.date`) across plants. Create a dataset that excludes
 35 |     rows where there is missing flowering date data. (0.25 marks)
 36 |     
 37 |     b. We want to know how HCN, herbivores, and pollinators influence flowering
 38 |     date. We also think that the effect of herbivores and pollinators on
 39 |     flowering date might depend on whether the plant is producing HCN. Create a
 40 |     model that includes fixed-effects that test these predictions. Be sure to
 41 |     account for variation due to `Genotype`, `Block` and the `Genotype` by
 42 |     `Herbivory` interactions by including these terms as random effects. This
 43 |     will be our saturated model. You can ignore `boundary (singular) fit`
 44 |     warnings that may arise. (1 mark)
 45 |     
 46 |     c. We will generate a reduced model from the saturated model in (b). Should
 47 |     we use AIC or AIC~c~? Why? Show your calculation. (0.5 marks)
 48 |     
 49 |     d. Using the approach described in lecture 11, optimize the random effect
 50 |     structure of this model. Show the AIC/AIC~c~ output for each model of
 51 |     varying random effect structure. Provide a one sentence justification for
 52 |     each random effect in the model, justifying whether it is fixed (i.e., in every
 53 |     model) or whether some models will drop this effect. Describe in one
 54 |     sentence what the optimal random effect structure of the model is and why.
 55 |     (0.5 marks)
 56 |     
 57 |     e. Using the model with the optimal random-effect structure identified in
 58 |     (d), find the optimal fixed-effect structure. Be sure to show all the models
 59 |     and their AIC/AIC~c~ scores. (1.5 marks)
 60 |     
 61 |     f. Based on the AIC/AIC~c~ output from (e), generate your final model with
 62 |     both optimized fixed and random effects. Summarize the model and interpret
 63 |     its output. Is there a significant effect of any treatment? If so, which
 64 |     one(s) and in which direction. Make a statement about the significant
 65 |     treatments' effects on flowering date. Use the model's output to support
 66 |     your answer. You only need to interpret significant main effects here (i.e.
 67 |     not interactions). (0.75 marks)
 68 |     
 69 |     g. Do you think we were justified in interpreting a single model? Why or why
 70 |     not? What alternative approach could we have used? (0.25 marks).
 71 | 
 72 |     h. Use `dplyr` and `ggplot2` to plot the flowering date of plants by the
 73 |     _main_ effect that showed a significant effect in the optimized model above.
 74 |     The figure should show the mean plus and minus a single standard error of
 75 |     the mean. Suggest one biological interpretation of the pattern you see in
 76 |     the figure and in the model (i.e. why do you think this would happen). If
 77 |     there are no significant effects in the model, simply write "There are no
 78 |     significant effects!". (0.25 marks)
 79 |     
 80 | 2. During the multivariate statistics lecture, we made use of vector community
 81 | and malaria survey data collected by Mbogo _et al._ (2003) to disentangle the
 82 | effects of vector abundance, species richness, and composition, on malaria
 83 | prevalence (see path diagram in lecture 10 for a reminder of these
 84 | relationships). In this exercise, we will complete the analysis of the
 85 | structural equation model we began building in class. (1.5 marks total)
 86 | 
 87 | Here are some relevant snippets of code taken from the lecture notes to get you
 88 | started on this exercise.
 89 | 
 90 | ```{r eval=FALSE}
 91 | 
 92 | library(lavaan)
 93 | 
 94 | kenya.wide <- read.csv("kenya.wide.csv", header=TRUE, sep=",")
 95 | 
 96 | kenya.pca <- kenya.wide %>% 
 97 |   dplyr::select(arabiensis, gambiae, funestus, merus) %>% #choose relevant columns
 98 |   mutate_all(sqrt) %>% #this is the Hellinger transformation 
 99 |   prcomp(.) #pipe directly into baseR function for PCA
100 | 
101 | axes <- data.frame(kenya.pca$x)
102 | kenya.wide <- bind_cols(kenya.wide, axes) 
103 | 
104 | kenya.wide$l.abun <- log(kenya.wide$total.abundance)
105 | kenya.wide$l.sr <- log(kenya.wide$SR)
106 | kenya.wide$l.pfpr <- log(kenya.wide$PfPR)
107 | 
108 | sem02 <- '
109 | # regressions
110 | l.pfpr ~ a*l.sr + b*l.abun + c*PC2
111 | # correlations
112 | l.sr ~~ d*l.abun
113 | PC2 ~~ e*l.sr
114 | # defined parameters
115 | indirect.abun := (a*d) #indirect effect of abundance via SR
116 | indirect.abun2 := (d*e*c) #indirect effect of abundance via PC2
117 | total.abun := b + (a*d) + (d*e*c) 
118 | '
119 | 
120 | ```
121 | 
122 |     a. Complete the structural equation model by adding in calculations for the
123 |     indirect and total effects of species richness (SR) and composition (PC2) on
124 |     malaria prevalence (PfPR). (0.5 marks)
125 |   
126 |     b. Evaluate the model, bootstrapping confidence intervals for path
127 |     coefficients with seed #778. Which predictor had the largest _direct_ effect
128 |     on malaria prevalence? How about _total_ effect? Briefly explain these
129 |     effects in plain english (1 sentence each). (1 mark)
130 | 
131 | 
132 | 3. In this exercise, we will be investigating the relationship between vector
133 | community structure and another commonly used metric of disease risk --
134 | entomological inoculation rate (EIR). EIR is a measure of the number of bites by
135 | infectious mosquitoes per person per unit time. We will be making use of the
136 | same data from Mbogo _et al._ (2003) as before, only this time, we will start
137 | with the long form data. (2.5 marks total)
138 | 
139 | ```{r eval=FALSE}
140 | 
141 | kenya.long <- read.csv("kenya.long.csv", header=TRUE, sep=",")
142 | 
143 | ```
144 | 
145 | This dataset consists of the same information as kenya.wide, with the addition
146 | of one new column for "EIR".
147 | 
148 |     a. Convert this dataset to the wide format. Fill cells in the wide dataset
149 |     with the **relative abundance** of each species, and include the columns
150 |     "total abundance" and "EIR" in the final product. (Hint: use xxxx_join to
151 |     add the desired columns to the wide dataset after you spread it) (Hint2:
152 |     pivot_wider() may be easier to use than spread()) (1 mark)
153 | 
154 |     b. Construct a series of linear models to investigate the relationship
155 |     between EIR and i) total mosquito abundance and ii) the abundance of each
156 |     species. Interpret the results of these models. (Hint: is EIR a simple
157 |     function of total mosquito abundance, or is there a particular species that
158 |     is contributing disproportionately to it?) (0.5 mark)
159 |   
160 |     c. Investigate the influence of total abundance and community structure (use
161 |     the first two PC axes) on EIR with a structural equation model. Include only
162 |     direct effects in this model, and pretend we have reason to believe
163 |     total abundance is associated with community composition.
164 |         i. Briefly explain the correlation structure you have chosen for your
165 |         predictors, total abundance, PC1, and PC2. (0.5 marks)
166 |         ii. Evaluate the model. Are these results congruent with your findings
167 |         from part (a)? (0.5 marks)
168 | 
169 | 


--------------------------------------------------------------------------------
/assignment-06.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Assignment 6: Spatial statistics, simulating data, and randomization tests (8 marks)'
 3 | output:
 4 |     html_document:
 5 |         toc: false
 6 | ---
 7 | 
 8 | <!-- keep this part -->
 9 | *To submit this assignment, upload the full document on blackboard,
10 | including the original questions, your code, and the output. Submit
11 | your assignment as a knitted `.pdf` (preferred) or `.html` file.*
12 | 
13 | 1. In this exercise, we will continue to use the vector community and malaria survey data collected by Mbogo _et al._ (2003) (i.e., `kenya.wide.csv`). We will inspect whether spatial autocorrelation affects our inference of the effects of vector abundance, species richness, and composition, on malaria prevalence. We will also investigate which environmental factors underpin the distribution of mosquitoes, all in the context of space. (4 marks).
14 | 
15 |     a. Compute Moran's autocorrelation statistic to assess whether mosquito abundance and species richness, and malaria prevalence are correlated in space. (1 mark)
16 | 
17 |     b. Extract annual average temperature and rainfall data from WorldClim2 using the raster files provided in class. Create maps (see appendix to lecture notes) to show the variation in these climatic factors across the sites. (1 mark)
18 | 
19 |     c. Investigate whether rainfall influences mosquito abundance across sites. Your complete analysis should include formally testing whether the temperature and rainfall patterns across sites are correlated in space (construct a variogram and interpret when necessary), regression models with the appropriate autocorrelation structure, and an interpretation of your model outputs. Feel free to make use of additional plots to help explain your findings. (2 marks)
20 | 
21 | 
22 | 2. Simulating data (2 marks)
23 | 
24 |     a. Generate a gamma distribution by randomly sampling 30 points from a distribution with shape parameter equal to 1.35 and rate parameter equal to 0.5. Plot this distribution. Set a seed of 42. (0.5 mark).
25 | 
26 |     b. Plot the distribution of sample means obtained by generating 5000 gamma distributions with the same parameters as in (a). In other words, the distribution should be made up of 5000 means, each from a different simulated gamma distribution. Set a seed of 43. What do you notice about this distribution when compared to the original distribution in (a)? Why would we expect this? (0.5 marks)
27 | 
28 |     c. In this exercise you will simulate a multiple regression. Remember, multiple regression means that there is more than one explanatory (aka predictor, independent) variable for a given response variable. Multiple regression thus estimates a separate effect (i.e. _beta_) for each explanatory variable in the model, while holding the other variables constant. This exercise is only a slight extension of the model that we simulated in lecture. Simulate a model that satisfies the conditions below and show the model output using `summary()`. Set a seed of 44. (1 mark).
29 | 
30 |         1. `x1` is an explanatory variable with _sequence_ from 51 to 70 with 1 unit intervals between each value (i.e. 20 values total).
31 |         2. `x2` is an explanatory variable of length 20 randomly drawn from a normal distribution with mean equal to 62 and standard deviation equal to 2.7.
32 |         3. `x3` is an explanatory variable of length 20 randomly drawn from a gamma distribution with shape equal to 5 and rate equal to 0.5.
33 |         4. The `y_intercept` is 22.
34 |         5. The beta associated with `x1` is 0.62.
35 |         6. The beta associated with `x2` is 0.047.
36 |         7. The beta associated with `x3` is 0.185.
37 |         8. The error is drawn from a normal distribution with mean equal to 0 and standard deviation equal to 1.65.
38 |         9. `y` is a linear combination of `x1`, `x2` and `x3`. There are no interactions.
39 | 
40 | 3. Randomization test (2 marks)
41 | 
42 |     Run the code chunk below to load the data that will be used in this exercise.
43 | 
44 |     ```{r message=FALSE, warning=FALSE}
45 |     library(tidyverse)
46 |     df_url <- "https://uoftcoders.github.io/rcourse/data/Assign05_Question3.csv"
47 |     # Save the CSV in the working directory, then read that same file back in
48 |     # (the old code read from a data/ folder that the download never created).
49 |     download.file(df_url, "Assign05_Question3.csv")
50 |     df <- read_csv("Assign05_Question3.csv", col_names = TRUE); glimpse(df)
51 |     ```
52 | 
53 |     a. Generate a histogram showing the null distribution of t-statistics between groups one and two from the `df` dataframe that you just loaded. The null distribution should be based on 5,000 reshufflings of the data. (1 mark). Overlay onto this histogram the observed t-statistic as a dashed vertical line. Set a seed of 45. **Hint:** t-tests return list objects that can be indexed using `$`.
54 | 
55 |     b. Perform a permutation test testing whether the observed t statistic between groups one and two is different than what would be expected by chance. Include a statement about whether there is a significant difference between groups based on your permutation test and be sure to include the P-value. How does this P-value compare to one obtained from a simple t-test? Why? (1 mark)
56 |     
57 | 


--------------------------------------------------------------------------------
/assignment-07.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Assignment 7: Modelling (9 marks)'
 3 | output:
 4 |   pdf_document: default
 5 |   html_document: default
 6 | ---
 7 | 
 8 | ```{r}
 9 | 
10 | library(deSolve)
11 | 
12 | ```
13 | 
14 | 
15 | 1. In this exercise, we will be working with the Lotka-Volterra competition
16 | model introduced in class. Use the following parameter set in this question. (3
17 | marks)
18 | 
19 | - `r` = 0.5
20 | - `K` = 1000
21 | - `alpha12` = 3
22 | - `alpha21` = 1.5
23 | 
24 | 
25 |     a. Solve the Lotka-Volterra competition model and plot the trajectory of the
26 |     population for 100 time steps. Briefly explain the population dynamics of these
27 |     species. (1 mark)
28 | 
29 |     b. Catastrophe hits population 2 at t=10, such that its numbers are
30 |     drastically decreased to a quarter of what they were (hint: at t=10, N2 is reset
31 |     to 1/4 its non-catastrophe value)! Use a simulation to show the trajectory of
32 |     these species over the 100 time steps. Explain what you saw. (2 marks)
33 | 
34 | 
35 | 2. In this exercise, we will be working with the malaria dynamics model we
36 | worked with in class, and we will be thinking of ways in which we can
37 | "implement" various mosquito control methods in this hypothetical population.
38 | For each of the following mosquito control strategies, describe how you would
39 | implement them in terms of math. For example, you may wish to modify some
40 | parameter, add a parameter, or outright change the structure of the model.
41 | Explain your choices, and include any new parameters or new equations where
42 | applicable (e.g., include an equation to show where a new parameter would
43 | appear). (3 marks)
44 | 
45 |     a. Use of insecticide to kill off adult mosquitoes. (1 mark)
46 |     b. Use of bednet to reduce contact between mosquito and host. (1 mark)
47 |     c. Provide hosts with vaccines. (1 mark)
48 | 
49 | 
50 | 3.  The Allee Effect (3 marks)
51 | 
52 |     Generally, as population size increases, a population will
53 |     experience a decreased growth rate due to greater competition for
54 |     resources. This is a negative density-dependent growth rate, and one
55 |     example of this is the logistic model.
56 | 
57 |     The Allee effect introduces positive density dependence, where
58 |     increases in population size result in increased growth rates over a
59 |     certain range of population sizes. One way to incorporate
60 |     the Allee effect into the logistic growth equation is as follows:
61 | 
62 |     $$\frac{dN}{dt} = rN\left(1-\frac{N}{K}\right)\left(\frac{N-A}{K}\right)$$
63 | 
64 |     Here $r$ represents the growth rate of the population, $K$ is the
65 |     carrying capacity, and $A$ is the critical population size above which 
66 |     the total growth rate is positive.
67 | 
68 |     a. Take $r=1$, $A=10$, and $K=50$. Plot $\frac{dN}{dt}$ vs. $N$ for
69 |     $0 \le N \le 55$. For which values of $N$ is the growth rate
70 |     ($\frac{dN}{dt}$) positive or negative? (0.5 marks)
71 | 
72 |     b. Plot the **per capita** growth rate ($\frac{1}{N}\frac{dN}{dt}$)
73 |     vs. $N$ for this model of the Allee effect and for the logistic growth
74 |     model: $\frac{dN}{dt} = rN(1-\frac{N}{K})$. (1 mark)
75 | 
76 |     c. What do you notice about the density ($N$) dependence of the per capita
77 |     growth rate in each case? Hint: in the logistic model, the growth rate **per
78 |     capita** (per organism) decreases in a straight line as $N$ increases. (0.5
79 |     marks)
80 |     
81 |      d. What happens to the Allee effect as $A$ decreases? Plot curves for $A=0$
82 |      and a few values of $A>0$. (0.5 marks)
83 | 
84 |     e. Describe two biological situations in which you might expect to see an Allee
85 |     effect (either weak or strong). (0.5 marks)
86 | 
87 | 
88 | 
89 | 
90 | 
91 | 
92 | 
93 | 


--------------------------------------------------------------------------------
/assignment-09-challenge.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Challenge assignment (14.5 marks)"
  3 | output:
  4 |     html_document:
  5 |         toc: false
  6 | ---
  7 | 
  8 | *To submit this assignment, upload the full document on Quercus,
  9 | including the original questions, your code, and the output. Submit
 10 | your assignment as a knitted `.pdf` (preferred) or `.html` file.*
 11 | 
 12 | 
 13 | Part of being an effective scientist involves being able to solve problems you
 14 | have not encountered before. This is certainly true of programming as well,
 15 | where problems are typically solved by furious bouts of Googling, reading
 16 | documentation, and trial and error of proposed solutions. In this assignment,
 17 | like previous ones, you will be evaluated on your ability to solve data
 18 | manipulation and analysis tasks. However, unlike previous assignments, some of
 19 | the solutions to the problems will require more research and effort on your
 20 | part. It may require the use of packages and techniques not explored in class,
 21 | but all problems are solvable, often with only a few lines of code. By now, you
 22 | should all have the terminology required to search for solutions to the problems
 23 | below. As with all programming problems, there are many possible ways to get the
 24 | answer to the problems below.
 25 | 
 26 | 1. Simpson's diversity index and bootstrapping (6 marks)
 27 | 
 28 |     In lecture 12, we used data from the National Ecological Observatory Network on
 29 | the abundance and percent cover of plant species across sites in the Harvard
 30 | Forest from the year 2017. Run the code chunk below to read in the data if you
 31 | do not already have it.
 32 | 
 33 | ```{r message=FALSE}
 34 | library(tidyverse)
 35 | # Fetch the NEON CSV into the working directory, then load it from there.
 36 | neon_url <- "https://uoftcoders.github.io/rcourse/data/NEON_PlantPA_HARV_201707.csv"
 37 | download.file(url = neon_url, destfile = "NEON_PlantPA_HARV_201707.csv")
 38 | neon_data <- read_csv(file = "NEON_PlantPA_HARV_201707.csv", col_names = TRUE)
 39 | ```
 40 | 
 41 | a. Using the raw NEON data, create a matrix with plant species as rows and sites
 42 | as columns. The cell values should represent the abundance of each species at a
 43 | given site. (1 mark)
 44 | 
 45 | b. Write a function that takes a single numeric vector as an argument and
 46 | returns Simpson's Index of Diversity for the numeric vector. Test your
 47 | function by computing Simpson's diversity index on the `test_vector` in the
 48 | code chunk below. Be sure to report the Simpson's index for `test_vector` in
 49 | your final assignment. As a reminder, I have included the formula for
 50 | Simpson's index below:
 51 | 
 52 | ```{r}
 53 | test_vector <- c(0, 1, 0, 5, 0, 1, 0, 4, 3, 0, 0, 0, 0, 1, 4, 0, 5, 0, 
 54 |                  3, 0, 11, 2, 19, 0, 11, 0, 0, 0, 0, 0) # 30 values, summing to 70
 55 | ```
 56 | 
 57 |     
 58 | $$D = 1 - \sum_{i = 1}^{s}(p_i)^2$$
 59 | 
 60 | where `s` is the species richness (i.e. number of species) and `p_i` is the
 61 | relative abundance of species _i_. A `D` value of 0 represents no diversity
 62 | and a `D` value of 1 represents infinite diversity. (1 mark)
 63 | 
 64 | c. In lecture 12, we discussed how resampling techniques can be used to
 65 | conduct hypothesis tests (i.e., permutation tests) by comparing an observed
 66 | population parameter (e.g., mean, median) to a null distribution of that
 67 | parameter generated by resampling your observed data _without_ replacement
 68 | many times. Resampling can additionally be used to generate confidence
 69 | intervals around a population parameter (e.g., mean, median, slope) or other
 70 | statistic (e.g., C-score) using a technique known as _bootstrapping_.
 71 | Bootstrapping allows us to estimate the true distribution of a statistic in
 72 | cases where it is unknown, which we can use to estimate our uncertainty
 73 | (e.g., Standard Error, Confidence Intervals) in a population parameter.
 74 | Importantly, this applies _regardless of the shape of the distribution_,
 75 | although some adjustments often have to be made for strongly skewed
 76 | distributions. While there are a few different types of bootstrapped
 77 | confidence intervals (e.g., bias-corrected and accelerated, _t_ with
 78 | bootstrap), in this exercise, you will write a function that calculates the
 79 | simple 95% percentile bootstrapped confidence interval from a given numeric
 80 | vector. Your function should have the following properties:
 81 | 
 82 |     1. It should take in two arguments: A numeric vector and an integer
 83 |     representing the number of iterations 
 84 |     2. It should return a data frame with two columns: The lower and upper
 85 |     quantiles of the distribution, representing the 2.5 and 97.5
 86 |     percentiles, respectively (**hint:** The `quantile()` function).
 87 |         
 88 | Write the function as described above. How does the bootstrap differ from a
 89 | permutation test (note: you don't need code here, just tell me in words).
 90 | Set a seed of 42 and test your function on the `test_vector` from part (b).
 91 | (1.5 marks)
 92 | 
 93 | d. Use your functions defined in (b) and (c) to estimate the Simpson's diversity
 94 | index and corresponding 95% bootstrapped confidence intervals around the
 95 | Simpson's index for each of the sites in your dataframe from (a). Use 1000
 96 | iterations for the bootstrapping procedure. Your answer should be a single
 97 | dataframe with four columns: `site`, `simpson`, `lower`, and `upper`. Points are
 98 | awarded for conciseness of the code. (**Hint:** The most concise answer will
 99 | likely make use of the `purrr` package, which is part of the `tidyverse`). Set a
100 | seed of 43. (1.5 marks)
101 | 
102 | e. Using the dataframe from (d), plot the Simpson's index (y-axis) for each
103 | of the sites (x-axis) as a single point surrounded by its lower and upper
104 | 95% CIs. Order the x-axis from lowest to highest Simpson's index. (1 mark)
105 | 
106 | 2. Recreating a figure 
107 | 
108 |     In assignment 3, we explored a dataset containing changes in yearly plant
109 |     biomass in Abisko National Park, Sweden. In this question, we will use `ggplot2` to
110 |     reproduce a figure in the original paper (Olofsson et al, 2013;
111 |     [link](https://royalsocietypublishing.org/doi/full/10.1098/rstb.2012.0486)). 
112 | 
113 |     You should still have the dataset from when you completed assignment 3, but
114 |     if not, run the code chunk below to download it.
115 | 
116 | ```{r message=FALSE}
117 | plant_biomass_url <- "https://raw.githubusercontent.com/UofTCoders/rcourse/master/data/plant-biomass-preprocess.csv"
118 | download.file(url = plant_biomass_url, destfile = "plant-biomass-preprocess.csv")
119 | plant_biomass <- read_csv(file = "plant-biomass-preprocess.csv") # load the downloaded copy
120 | ```
121 | 
122 | Reproduce figure 4 from the paper using `ggplot2`. Pay close attention to the
123 | overall structure of the figure, the scale on the axes, and so on. The colors of
124 | the points do not need to be the exact same colors as those in the figure, but
125 | they should be sufficiently close. _(Note: you can ignore the SEM points, and
126 | the 'look' of your axes does not have to match the figure exactly. The species
127 | names also do not have to be included in the body of the plot (like in the paper)
128 | as long as they are visible in some form.)_ (1 mark)
129 | 
130 | 3. Use the built-in R dataset `iris`, for this question. (2 marks) 
131 | 
132 |      a. Test the relationship between sepal length and sepal width _for each
133 |      species_ using linear models. Set sepal width as the response and sepal
134 |      length as the predictor. Output a single data frame containing the beta
135 |      estimate and p-value associated with the predictor for each of the three
136 |      models, and also make sure to include a column called 'species' in the
137 |      final data frame. Your final data frame should only have species, term,
138 |      estimate, and p.value as columns. Do not include the intercepts. _(Hint:
139 |      You want to simultaneously perform the same model on different subsets of
140 |      the data with only a few lines of code)_ (1 mark)
141 |      
142 |     b. Use `ggplot2` to plot a scatter plot of sepal width by sepal length,
143 |     coloured by species. Plot the three linear fits as well, also coloured by
144 |     species. Below your code, comment on how the estimate values from your
145 |     linear models above correspond to the plotted fits. (1 mark)
146 |     
147 | 4. The Canadian lynx population cycle (3.5 marks)
148 | 
149 |     The Canadian lynx experiences large periodic changes in its population 
150 |     size over a timescale of several years. This is thought to be driven by 
151 |     oscillations in the population size of the snowshoe hare, the primary food 
152 |     source for the lynx. Read more about the lynx population cycle on this 
153 |     [Northwest Territories website](https://www.enr.gov.nt.ca/en/services/lynx/lynx-snowshoe-hare-cycle).
154 |     
155 |     R has a built-in dataset called `lynx` which contains annual population 
156 |     measurements for the Canadian lynx as a time series. 
157 | 
158 |     (a) Plot the abundance of `lynx` vs. time in years using either `ggplot` or
159 |     `qplot`. Plot points and connect them with a line. Create a time series that
160 |     starts at 0 and ends at the total number of years in the dataset (total years $=
161 |     1934-1821$). By eye, estimate the time between peaks in the population. (0.5
162 |     marks)
163 | 
164 |     (b) Define a function called `sine_model` that takes 5 arguments: a vector of years
165 |     for the x-axis and four parameters (amplitude, period, phase, and offset). 
166 |     Recall the general formula for a sine wave:
167 |     $$y = A \text{sin}(kx - b) + c$$
168 |     where $k = 2\pi / T$, $T$ is the period or length of time between peaks,
169 |     $A$ is the amplitude, $b$ is the phase, and $c$ is the offset.
170 |     Using a value of $A = c = 1700$ for both the amplitude and offset and a value of 
171 |     $b = 2.75$ for the phase, plot the lynx data as before and add a sine curve 
172 |     using your guess of the timescale from part (a) for the period.
173 |     Use a colour other than black to plot the sine wave. 
174 |     Note that the x axis must start at 0 in order for the phase of $2.75$
175 |     to match the data. (1 mark)
176 | 
177 |     (c) Use least-squares fitting to refine your estimate of the lynx cycle length. (1.5 marks)
178 |         - Create a numeric vector with a range of values for the period that
179 |         span your guess from part (a).
180 |         - Write a function to calculate the Residual Sums of Squares (RSS) from
181 |         a model fitted to the `lynx` data. 
182 |             - This function should calculate the differences
183 |             (*residuals*) between the lynx data and your prediction, then return
184 |             the sum of the squared residuals.
185 |         - Apply this function over the numeric vector of period values you
186 |         created. Essentially, you should be striving to obtain an RSS value for
187 |         models fitted using all of the different period values in your numeric
188 |         vector.
189 |         - Plot the sum of the residuals squared vs. the range of period values. By eye,
190 |         what is the minimum of this curve? 
191 |         - Identify the period value that provides the best fit to the `lynx`
192 |         data. Models with lowest RSS fit the data best. What is your calculated
193 |         length of the lynx population cycle?
194 | 
195 |     (d) Plot the lynx data again and plot your best fit curve on top. 
196 |     (0.5 marks)
197 | 
198 | 5. In class, we worked with a simple one-locus haploid model
199 | of evolution and investigated how different forces of evolution (e.g., selection
200 | vs. drift) affect allele frequencies. In this challenge assignment, we will
201 | naturally upgrade to a diploid model to see how evolution works in this kind of
202 | system. 
203 | 
204 |     We will once again be considering the case of malaria (of course, what else does
205 | Amber think about anyway). In assignment 7, we (you) worked on ways to
206 | incorporate various facets of biological detail into a baseline, super simple,
207 | disease model. We will incorporate another level of detail in this exercise:
208 | host genotype. The trait that we will be working with is the sickle cell trait.
209 | You can read more about the sickle-cell trait on Wikipedia. Basically, this is a
210 | blood-cell-shape trait that is controlled at a single locus by two alleles: $A$
211 | for wildtype "normal" blood shape, and $S$ for sickle-shaped blood cells. There
212 | are thus three possible host genotypes with respect to the sickle cell trait:
213 | homozygous "normal" blood ($AA$), heterozygous ($AS$), and homozygous sickle
214 | blood ($SS$). People with homozygous $SS$ suffer from debilitating illness
215 | (sickle-cell disease), which often results in mortality at a young age if untreated.
216 | Heterozygotes ($AS$), while still suffering minor illness, receive partial
217 | immunity against malaria. It was thus hypothesized that the deleterious $S$
218 | allele is maintained in the population due to **balancing selection** -- the
219 | strong selective pressure exerted by malaria gave heterozygotes an advantage
220 | over the homozygotes, thus saving the $S$ allele from being purged from the
221 | population.
222 | 
223 |     We will be using mathematical models to help us test this hypothesis. A good
224 | start is to first extend the framework we used in lecture and think about how to
225 | represent allele frequency change in a diploid population.
226 | 
227 |     In population genetics it is often easier to work with
228 | allele frequencies (proportion of a certain allele in a population) rather than
229 | changes to their absolute numbers. Here, we will use $p$ and $q$ to denote the
230 | proportions of the $A$ and $S$ alleles in the population, respectively. The
231 | frequencies of the three genotypes in the population are thus given by $p^2$,
232 | $2pq$, and $q^2$ for AA, AS, and SS individuals, respectively. $p^2$, $2pq$, and
233 | $q^2$ are known as Hardy-Weinberg proportions and they have the convenient
234 | property of summing to 1.
235 | 
236 |     Applying selection to this population alters the frequency of each _genotype_ by
237 | an amount that is proportional to their fitness. We will denote the fitness of
238 | each host genotype group as $W_{AA}$, $W_{AS}$, and $W_{SS}$. The frequencies of
239 | each genotype following selection (i.e., as weighted by its fitness) is
240 | therefore $p^2W_{AA}$, $2pqW_{AS}$, and $q^2W_{SS}$. Dividing these quantities
241 | by their sum (the mean fitness of the population, often denoted with
242 | ${\bar{W}}$) allows us to retrieve the frequency of the three genotypes after
243 | selection (e.g., $p^2 \frac{W_{AA}}{\bar{W}}$). (2 marks)
244 | 
245 |     a) Derive a recursion equation that describes the change in **allele**
246 |     frequencies of $A$ and $S$ in one time step. (Hint: How do you go from genotype
247 |     frequency to allele frequency?) (1 mark)
248 | 
249 |     b) Simulate the trajectory of change of the A allele. Use the parameter set
250 |     provided, and include a graph in your answer. Explain what you see, and refer to
251 |     the parameter values provided in your explanation. (1 mark)
252 | 
253 | 
254 | ```{r}
255 | # Parameter set for part (b): time grid, genotype fitnesses, and starting allele frequency.
256 | # Time
257 | times <- 500 # last value of the time vector
258 | timevec <- seq(1, times, 1) # 1, 2, ..., times in steps of 1
259 | 
260 | # Parameters
261 | N <- 10000 # Total population size (assume constant)
262 | WAA <- 0.5 # fitness of the AA genotype ($W_{AA}$ in the question text)
263 | WAS <- 0.7 # fitness of the AS genotype ($W_{AS}$)
264 | WSS <- 0.2 # fitness of the SS genotype ($W_{SS}$)
265 | 
266 | # Initial condition
267 | p <- 0.5 # proportion of A allele
268 | 
269 | ```
270 | 


--------------------------------------------------------------------------------
/assignment-final.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Final assignment: Scientific report and presentation"
  3 | output:
  4 |     html_document:
  5 |         toc: false
  6 | ---
  7 | 
  8 | ## Project description
  9 | 
 10 | The course project is a self-directed group data analysis project using real 
 11 | ecological data and rigorous scientific methods. Groups are expected to hypothesize
 12 | about their chosen data, examine their hypotheses with reproducible and quantitative
 13 | analysis techniques, visualize their results, and create scientific products in the
 14 | form of a report and a presentation.
 15 | 
 16 | You might end up with a publishable scientific product! 
 17 | [This paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5210691/) was written by a
 18 | group of graduate students as part of the first version of this course, 
 19 | which was created by Dr. Christie Bahlai. 
 20 | 
 21 | ### Data
 22 | 
 23 | A list of recommended datasets can be found
 24 | [here](https://uoftcoders.github.io/rcourse/lec14-datasets.html#datasets_available_for_use). 
 25 | You are welcome to choose a
 26 | dataset not listed, or data collected as part of a research project, but keep in mind
 27 | that you may not submit anything twice: any work you do as part of this course may not
 28 | be submitted for credit in another course (such as a fourth-year research project) and vice versa. 
 29 | If choosing a dataset not listed, make sure it is well-documented, legitimate, and 
 30 | complex enough to support your analysis efforts. Your work should be original; your
 31 | project should not be a reproduction of published analyses. 
 32 | 
 33 | ### Project deliverables
 34 | 
 35 | The following components will be graded as part of the project:
 36 | 
 37 | 1. Mid-project update (Due Nov. 14):
 38 |     * Details for this assignment can be found [HERE](https://uoftcoders.github.io/rcourse/mid-project-update.html).
 39 |   
 40 | 2. Report styled as a journal article, with these or similar sections (more info
 41 | below) (Due Dec. 5):
 42 |     * Abstract
 43 |     * Introduction / Background and Rationale
 44 |     * Methods (with "Data Description" and "Data Analysis" subsections)
 45 |     * Results 
 46 |     * Discussion 
 47 |     * Conclusion
 48 |     * Code: project results must be reproducible by someone else
 49 |    
 50 | 3. 10 minute presentation with 2 minutes for questions, styled as a conference
 51 | presentation (assume not too much familiarity with the topic in the audience).
 52 | The presentations will be held on the last day of class (Dec 3).
 53 | 
 54 | While you may not submit your work for this course for credit in another course, you
 55 | are welcome to publish or present your work in an academic setting. Groups are
 56 | encouraged to publish their work on [figshare](https://figshare.com/), an open, 
 57 | citable repository of scientific content. 
 58 | 
 59 | ### Report guidelines
 60 | 
 61 | For the report, you are expected to:
 62 | 
 63 | - Search the previous research and literature on your research questions.
 64 | - Have clear and explicit objectives and hypotheses.
 65 | - Adequately describe and properly cite the data source(s) you will analyze.
 66 | - Describe your data analysis in sufficient detail for others to understand what
 67 | you did and why.
 68 | - Show all the results of your pre-planned data analysis and any additional 
 69 | explorations you did.
 70 | - Discuss the meaning of your results and how they fit with the previous
 71 | literature.
 72 | 
 73 | The report and associated code is expected to:
 74 | 
 75 | - Be entirely reproducible: You may find
 76 | [Rprojects](https://r4ds.had.co.nz/workflow-projects.html) helpful in making
 77 | your projects reproducible. Rprojects can be committed to GitHub, allowing anyone
 78 | to clone the repo and run your analyses without having to worry about the paths
 79 | to all the files being different on their computers. [This
 80 | lesson](https://utm-coders.github.io/studyGroup/lessons/misc/project-management-R/lesson/)
 81 | on reproducible project management in R may also be helpful.
 82 | - Have well documented code: A well documented project will have README files
 83 | describing the contents of all folders in your GitHub repos. It will also
 84 | contain effective in-line comments in your scripts that showcase the logic of
 85 | your analyses and data-wrangling tasks. [This
 86 | lesson](https://swcarpentry.github.io/r-novice-inflammation/06-best-practices-R/)
 87 | on best practices for writing R code is a good starting place.
 88 | 
 89 | You are also expected to work well as a team, and use GitHub to submit and store 
 90 | your final product (more details below).
 91 | 
 92 | As a *guideline*, aim for at least 2500 words and about 6-8 figures/tables.
 93 | *This is **not** a hard criterion*. We are flexible in these *guidelines*, since
 94 | we want you to learn to work as a team and create a scientific product. You'll
 95 | be surprised how quickly the words, figures, and tables start adding up.
 96 | 
 97 | Your code should follow the coding style found [on our resources page](resources.html). 
 98 | 
 99 | All items (except the presentation) are due on December 5th at 11:59 pm.
100 | 
101 | ## Project submission
102 | 
103 | The project report and code should be submitted on GitHub. The report should
104 | also be submitted on Quercus. Each group will have their own GitHub repository
105 | in the [EEB313-2019](https://github.com/eeb313-2019) organization to which you
106 | can upload your report and code. You are welcome to use your GitHub repository
107 | for collaborative work during the project, but feel free to use other tools such
108 | as Google Drive, Dropbox, Overleaf, etc. if you prefer.
109 | 
110 | ## Project grading rubric
111 | 
112 | |                                                | Inadequate (0 marks)                                                                                                                                                                                          | Adequate (4 marks)                                                                                                                                                | Excellent (8 marks)                                                                                                                                                                    |
113 | |------------|--------------------|--------------------|--------------------|
114 | | Contribution to group work | Student contributed little to project; self-assessed contributions are low in quality and/or quantity; self-assessment is not consistent with actual contribution.                                            | Student contributed adequately to project; made some significant contributions                                                                                    | Student substantially contributed to project to ensure success; self-assessed contributions are crucial to project; self-assessment is consistent with actual contribution. |
115 | | Content                                        | Missing crucial information; methods and results are inconsistent, not logical, or not adequately explained; conclusions are confusing or unsupported by results; unnecessary information included as clutter | Most essential information included; methods and results are adequately described; conclusions supported by results; most included material is relevant to report | All essential information included; methods and results are succinct, clear, logical, and scientifically valid; conclusions are creative and meaningful; project is concise throughout |
116 | | Style and reproducibility                      | Code and writing are poorly organized, poorly formatted, missing units, difficult to read, poorly documented, difficult to reproduce analyses                                                                 | Code and writing are well-organized, well-formatted, consistent use of units and significant figures                                                              | Code and writing are precise and clear throughout, free of errors, well-organized, well-documented, easily reproducible analyses, publication-ready                                    |
117 | | Presentation                      | Presentation is poorly organized; much too long or much too short; presentation is unclear; presentation is missing information; presentation is not scientific and professional; presentation uses too much jargon; not all team members participate; does not adequately address audience questions            | Presentation is adequately organized; timing is appropriate; most information is presented logically; presentation is scientific and professional; most jargon is avoided; all team members participate but not equally; audience questions are sometimes addressed well | Presentation is clearly and logically organized; presentation flows and is easy to follow; presentation includes appropriate information without jargon; presentation is well-rehearsed and high-quality; all team members participate equally; audience questions are clearly addressed |
118 | 
119 | As the final project is a team effort, all members within a group will receive the same mark in the final three categories and an individual mark for their contribution to group work. A final project that is considered to lie between two of the defined levels will be marked accordingly, e.g. between "Adequate" and "Excellent" would be 5, 6, or 7 marks.
120 | 
121 | 


--------------------------------------------------------------------------------
/codemeta.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
 3 |   "@type": "Code",
 4 |   "author": [
 5 |     {
 6 |       "@id": "0000-0003-4169-2616",
 7 |       "@type": "Person",
 8 |       "email": "lwjohnst@ph.au.dk",
 9 |       "name": "Luke Johnston",
10 |       "affiliation": "Department of Nutritional Sciences, University of Toronto; Department of Public Health, Aarhus University"
11 |     },
12 |     {
13 |       "@id": "0000-0002-5813-4664",
14 |       "@type": "Person",
15 |       "email": "m.bonsma@mail.utoronto.ca",
16 |       "name": "Madeleine Bonsma-Fisher",
17 |       "affiliation": "Department of Physics, University of Toronto"
18 |     },
19 |     {
20 |       "@id": "0000-0003-0051-3239",
21 |       "@type": "Person",
22 |       "email": "joel.ostblom@mail.utoronto.ca",
23 |       "name": "Joel Ostblom",
24 |       "affiliation": "Institute of Biomaterials and Biomedical Engineering, University of Toronto"
25 |     },
26 |     {
27 |       "@id": "0000-0003-0002-8399",
28 |       "@type": "Person",
29 |       "email": "ahmed.hasan@mail.utoronto.ca",
30 |       "name": "Ahmed Hasan",
31 |       "affiliation": "Department of Cell and Systems Biology, University of Toronto"
32 |     },
33 |     {
34 |       "@id": "0000-0002-5921-2548",
35 |       "@type": "Person",
36 |       "email": "james.santangelo@mail.utoronto.ca",
37 |       "name": "James Santangelo",
38 |       "affiliation": "Department of Ecology and Evolutionary Biology, University of Toronto"
39 |     },
40 |     {
41 |       "@id": "0000-0003-3504-4524",
42 |       "@type": "Person",
43 |       "email": "lina.tran@mail.utoronto.ca",
44 |       "name": "Lina Tran",
45 |       "affiliation": "Department of Physiology, University of Toronto"
46 |     },
47 |     {
48 |       "@id": "0000-0001-7310-8942",
49 |       "@type": "Person",
50 |       "email": "esalesde@physics.utoronto.ca",
51 |       "name": "Elliott Sales de Andrade",
52 |       "affiliation": "Department of Physics, University of Toronto"
53 |     },
54 |     {
55 |       "@id": "0000-0001-8126-3571",
56 |       "@type": "Person",
57 |       "email": "lindsay.coome@mail.utoronto.ca",
58 |       "name": "Lindsay Coome",
59 |       "affiliation": "Department of Psychology, University of Toronto"
60 |     },
61 |     {
62 |       "@id": "0000-0002-6765-0898",
63 |       "@type": "Person",
64 |       "email": "sara.mahallati@mail.utoronto.ca",
65 |       "name": "Sara Mahallati",
66 |       "affiliation": "Institute of Biomaterials and Biomedical Engineering, University of Toronto"
67 |     }
68 |   ],
69 |   "identifier": "https://doi.org/10.5281/zenodo.2335179",
70 |   "codeRepository": "https://github.com/UofTCoders/rcourse",
71 |   "datePublished": "2019-01-03",
72 |   "dateModified": "2019-01-03",
73 |   "dateCreated": "2019-01-03",
74 |   "description": "This material contains modular participatory live-coding lectures covering statistics and data analysis for ecology and reproducible quantitative methods in R. Statistical analysis, modelling, simulation, and data analysis are essential skills for applying ecology concepts to data. This material is designed to meet a growing demand for reproducible, openly accessible, analytically thorough, and well documented science.",
75 |   "keywords": "R, programming, coding, ecology, statistics, differential equations, modelling, regression, population dynamics",
76 |   "license": "MIT, CC-BY-4.0",
77 |   "title": "A graduate student-led participatory live-coding quantitative methods course in R: Experiences on initiating, developing, and teaching",
78 |   "version": "v2.1.0"
79 | }
80 | 


--------------------------------------------------------------------------------
/data/Assign05_Question3.csv:
--------------------------------------------------------------------------------
 1 | "x","group"
 2 | 6.42599961350736,"One"
 3 | 5.12082462316966,"One"
 4 | 5.52557783669731,"One"
 5 | 5.06744070340457,"One"
 6 | 4.87736583761432,"One"
 7 | 5.58150736478004,"One"
 8 | 5.37327730829425,"One"
 9 | 5.78183037915858,"One"
10 | 8.26129675390806,"One"
11 | 5.71236878207077,"One"
12 | 4.58697722696339,"One"
13 | 6.26998610870784,"One"
14 | 7.54029660627869,"One"
15 | 8.01169838357271,"One"
16 | 6.50193828344865,"One"
17 | 5.65876995724927,"One"
18 | 5.95480957203462,"One"
19 | 5.81211095689094,"One"
20 | 10.7110129391544,"One"
21 | 3.93438002921211,"One"
22 | 4.58106862210215,"One"
23 | 6.28458771094597,"One"
24 | 5.7708518286089,"One"
25 | 5.48310171915475,"One"
26 | 5.45300589678716,"One"
27 | 5.96726955712034,"One"
28 | 4.92520727831643,"One"
29 | 6.20818072170388,"One"
30 | 7.84436341251713,"One"
31 | 6.24427864300449,"One"
32 | 4.46362057038254,"Two"
33 | 2.22232496254666,"Two"
34 | 1.98157958025982,"Two"
35 | 2.18236156089807,"Two"
36 | 6.42835743190577,"Two"
37 | 9.54107898626658,"Two"
38 | 9.53893270912479,"Two"
39 | 12.5571987052659,"Two"
40 | 1.68003859331248,"Two"
41 | 0.903556644459741,"Two"
42 | 3.03841374553939,"Two"
43 | 6.33546732283046,"Two"
44 | 4.50159690356383,"Two"
45 | 8.06059074469022,"Two"
46 | 3.07028592501813,"Two"
47 | 8.0740754861533,"Two"
48 | 7.02112356971714,"Two"
49 | 1.1424767963171,"Two"
50 | 4.32155813691801,"Two"
51 | 0.845567933804901,"Two"
52 | 9.60419330543276,"Two"
53 | 4.98987313114606,"Two"
54 | 4.59513912092922,"Two"
55 | 10.8146670109643,"Two"
56 | 7.8797937958823,"Two"
57 | 2.32450148400854,"Two"
58 | 2.70998938787323,"Two"
59 | 2.40385691756134,"Two"
60 | 5.13332639128709,"Two"
61 | 7.44063769131989,"Two"
62 | 


--------------------------------------------------------------------------------
/data/africa.wide.csv:
--------------------------------------------------------------------------------
 1 | siteID,country,year,lat,long,marPOS,PfEX,gdp,WDIregion,WHOregion,PfPR,total.abundance,SR,percent
 2 | 2,Madagascar,1991,-18.58,47.15,23,48,448.0355093,Eastern Africa,Africa,0.479166667,0.145686909,2,1
 3 | 3,Madagascar,1989,-18.95,47.57,25,62,491.4355261,Eastern Africa,Africa,0.403225806,118.9660198,1,0.333333333
 4 | 4,Madagascar,1997,-19.4,46.72,580,1909,414.7742537,Eastern Africa,Africa,0.303823992,39.94314602,2,0.333333333
 5 | 7,Kenya,1998,-3.07,40.14,51,100,855.0818286,Eastern Africa,Africa,0.51,201.9725175,4,1
 6 | 8,Kenya,1998,-3.13,40.11,69,101,855.0818286,Eastern Africa,Africa,0.683168317,202.8294379,4,1
 7 | 9,Kenya,1998,-3.14,40.14,60,100,855.0818286,Eastern Africa,Africa,0.6,200.5822147,4,1
 8 | 10,Kenya,1998,-3.21,40.07,49,100,855.0818286,Eastern Africa,Africa,0.49,203.6217999,3,1
 9 | 11,Kenya,1998,-3.26,40.01,67,100,855.0818286,Eastern Africa,Africa,0.67,196.8772455,3,1
10 | 12,Kenya,1998,-3.29,40.08,59,100,855.0818286,Eastern Africa,Africa,0.59,202.6825175,3,1
11 | 13,Kenya,1998,-3.34,40,40,99,855.0818286,Eastern Africa,Africa,0.404040404,194.9104229,2,1
12 | 14,Kenya,1998,-3.53,39.78,69,99,855.0818286,Eastern Africa,Africa,0.696969697,202.0763041,2,1
13 | 15,Kenya,1998,-3.54,39.61,73,100,855.0818286,Eastern Africa,Africa,0.73,204.3535139,3,1
14 | 16,Kenya,1998,-3.59,39.53,72,100,855.0818286,Eastern Africa,Africa,0.72,203.324865,3,1
15 | 17,Kenya,1998,-3.63,39.73,53,98,855.0818286,Eastern Africa,Africa,0.540816327,205.2243983,4,1
16 | 18,Kenya,1998,-3.67,39.75,76,101,855.0818286,Eastern Africa,Africa,0.752475248,202.6983479,3,1
17 | 19,Kenya,1998,-3.68,39.85,57,100,855.0818286,Eastern Africa,Africa,0.57,198.5154037,3,1
18 | 20,Kenya,1998,-3.73,39.7,40,100,855.0818286,Eastern Africa,Africa,0.4,205.3764645,1,1
19 | 21,Kenya,1998,-3.73,39.8,62,101,855.0818286,Eastern Africa,Africa,0.613861386,202.0250468,4,1
20 | 22,Kenya,1998,-3.7,39.73,70,100,855.0818286,Eastern Africa,Africa,0.7,206.3674447,2,1
21 | 23,Kenya,1998,-3.79,39.82,57,100,855.0818286,Eastern Africa,Africa,0.57,201.5500176,1,1
22 | 24,Kenya,1998,-3.92,39.77,65,100,855.0818286,Eastern Africa,Africa,0.65,194.9217999,3,1
23 | 25,Kenya,1998,-3.9,39.73,83,99,855.0818286,Eastern Africa,Africa,0.838383838,197.7376686,3,1
24 | 26,Kenya,1998,-3,40.2,82,100,855.0818286,Eastern Africa,Africa,0.82,201.6029205,4,1
25 | 27,Kenya,1998,-4.12,39.28,73,100,855.0818286,Eastern Africa,Africa,0.73,199.4587563,4,1
26 | 28,Kenya,1998,-4.12,39.37,84,100,855.0818286,Eastern Africa,Africa,0.84,201.8712231,3,1
27 | 29,Kenya,1998,-4.14,39.39,78,100,855.0818286,Eastern Africa,Africa,0.78,203.7867121,3,1
28 | 30,Kenya,1998,-4.16,39.45,59,100,855.0818286,Eastern Africa,Africa,0.59,203.3170876,2,1
29 | 31,Kenya,1998,-4.18,39.5,44,100,855.0818286,Eastern Africa,Africa,0.44,200.2900023,2,1
30 | 32,Kenya,1998,-4.18,39.53,66,100,855.0818286,Eastern Africa,Africa,0.66,200.2900023,2,1
31 | 33,Kenya,1998,-4.27,39.58,51,100,855.0818286,Eastern Africa,Africa,0.51,192.8893965,3,1
32 | 34,Kenya,1998,-4.38,39.47,50,78,855.0818286,Eastern Africa,Africa,0.641025641,190.178306,3,1
33 | 35,Kenya,1998,-4.43,39.5,60,100,855.0818286,Eastern Africa,Africa,0.6,193.2649569,4,1
34 | 36,Kenya,1998,-4.6,39.17,67,100,855.0818286,Eastern Africa,Africa,0.67,188.6794788,4,1
35 | 51,Kenya,2003,0.17,34.75,230,709,824.4529698,Eastern Africa,Africa,0.324400564,61.82157094,2,0.5
36 | 57,Guinea-Bissau,1995,11.91,-15.6,52,112,638.1742814,Western Africa,Africa,0.464285714,169.1899305,2,1
37 | 58,Burkina Faso,1995,12.67,-1.23,907,1189,361.121422,Western Africa,Africa,0.73510252,156.2303621,3,1
38 | 59,Mali,1989,13.22,-5.92,2,123,502.8532591,Western Africa,Africa,0.074417961,79.92881402,2,1
39 | 60,The Gambia,1988,13.48,-16.68,8,386,511.0285137,Western Africa,Africa,0.020725389,184.315253,1,0.5
40 | 61,Senegal,1994,13.72,-16.42,117,220,995.4211649,Western Africa,Africa,0.636354435,172.8095093,4,0.571428571
41 | 62,Senegal,1995,13.93,-16.76,41,661,1020.69932,Western Africa,Africa,0.062027231,193.8609932,4,0.8
42 | 63,Senegal,1995,13.98,-16.77,49,403,1020.69932,Western Africa,Africa,0.121588089,191.3167558,4,0.8
43 | 64,Senegal,1995,14.05,-16.68,121,773,1020.69932,Western Africa,Africa,0.156532988,178.172849,4,0.8
44 | 65,Senegal,1995,14.1,-16.67,21,512,1020.69932,Western Africa,Africa,0.041015625,171.0232027,3,0.75
45 | 66,Senegal,1995,14.15,-16.65,113,691,1020.69932,Western Africa,Africa,0.163531114,161.9986126,3,0.75
46 | 67,Senegal,1995,14.53,-16.43,144,332,1020.69932,Western Africa,Africa,0.43373494,156.0269517,3,0.6
47 | 69,Senegal,1995,14.54,-16.44,261,366,1020.69932,Western Africa,Africa,0.713114754,156.0269517,4,0.8
48 | 70,Senegal,1995,14.55,-16.45,110,172,1020.69932,Western Africa,Africa,0.639534884,156.6305489,4,0.8
49 | 72,Senegal,1991,14.91,-17.07,10,222,1060.78698,Western Africa,Africa,0.045045045,185.3918401,5,0.833333333
50 | 73,Eritrea,2002,15.11,36.65,10,300,601.0342469,Eastern Africa,Africa,0.033333333,147.9212393,1,1
51 | 75,Mali,1988,15,2.97,83,186,490.2892417,Western Africa,Africa,0.446236559,52.32812383,2,1
52 | 77,Senegal,1990,16.5,-14.44,9,118,1066.3899,Western Africa,Africa,0.076271186,74.72705868,3,1
53 | 78,Senegal,1990,16.52,-14.43,5,61,1066.3899,Western Africa,Africa,0.081967213,76.91797725,3,0.75
54 | 79,Senegal,1990,16.52,-14.62,10,109,1066.3899,Western Africa,Africa,0.091743119,79.85343679,3,1
55 | 80,Mali,1988,16.96,-0.36,11,206,490.2892417,Western Africa,Africa,0.053398058,29.91677606,2,1
56 | 86,Mali,1988,17.33,0.12,5,188,490.2892417,Western Africa,Africa,0.026595745,19.86864387,1,1
57 | 89,Mali,1988,18.44,1.4,2,251,490.2892417,Western Africa,Africa,0.007968127,8.111480598,2,1
58 | 95,Cameroon,1998,3.82,11.48,205,372,1132.952026,Western Africa,Africa,0.551075269,200.3646769,4,0.5
59 | 101,Cameroon,2001,4.01,9.19,74,174,1183.43807,Western Africa,Africa,0.425287356,214.5101509,3,1
60 | 102,Cameroon,2001,4.03,9.18,174,689,1183.43807,Western Africa,Africa,0.252539913,214.5101509,4,0.8
61 | 103,Cameroon,2001,4.07,9.36,556,1690,1183.43807,Western Africa,Africa,0.328994083,203.4368237,3,0.75
62 | 107,Cote d'Ivoire,1996,5.95,-7.47,304,317,1372.790201,Western Africa,Africa,0.846252918,206.44001,2,1
63 | 111,Ethiopia,1997,8.05,38.73,8,127,199.1113028,Eastern Africa,Africa,0.062992126,36.48174272,2,1
64 | 113,Cameroon,2002,9.4,13.51,41,122,1201.912083,Western Africa,Africa,0.336065574,8.700992323,5,0.833333333


--------------------------------------------------------------------------------
/data/iris.csv:
--------------------------------------------------------------------------------
  1 | "sepal_length","sepal_width","petal_length","petal_width","species"
  2 | 5.1,3.5,1.4,0.2,"setosa"
  3 | 4.9,3,1.4,0.2,"setosa"
  4 | 4.7,3.2,1.3,0.2,"setosa"
  5 | 4.6,3.1,1.5,0.2,"setosa"
  6 | 5,3.6,1.4,0.2,"setosa"
  7 | 5.4,3.9,1.7,0.4,"setosa"
  8 | 4.6,3.4,1.4,0.3,"setosa"
  9 | 5,3.4,1.5,0.2,"setosa"
 10 | 4.4,2.9,1.4,0.2,"setosa"
 11 | 4.9,3.1,1.5,0.1,"setosa"
 12 | 5.4,3.7,1.5,0.2,"setosa"
 13 | 4.8,3.4,1.6,0.2,"setosa"
 14 | 4.8,3,1.4,0.1,"setosa"
 15 | 4.3,3,1.1,0.1,"setosa"
 16 | 5.8,4,1.2,0.2,"setosa"
 17 | 5.7,4.4,1.5,0.4,"setosa"
 18 | 5.4,3.9,1.3,0.4,"setosa"
 19 | 5.1,3.5,1.4,0.3,"setosa"
 20 | 5.7,3.8,1.7,0.3,"setosa"
 21 | 5.1,3.8,1.5,0.3,"setosa"
 22 | 5.4,3.4,1.7,0.2,"setosa"
 23 | 5.1,3.7,1.5,0.4,"setosa"
 24 | 4.6,3.6,1,0.2,"setosa"
 25 | 5.1,3.3,1.7,0.5,"setosa"
 26 | 4.8,3.4,1.9,0.2,"setosa"
 27 | 5,3,1.6,0.2,"setosa"
 28 | 5,3.4,1.6,0.4,"setosa"
 29 | 5.2,3.5,1.5,0.2,"setosa"
 30 | 5.2,3.4,1.4,0.2,"setosa"
 31 | 4.7,3.2,1.6,0.2,"setosa"
 32 | 4.8,3.1,1.6,0.2,"setosa"
 33 | 5.4,3.4,1.5,0.4,"setosa"
 34 | 5.2,4.1,1.5,0.1,"setosa"
 35 | 5.5,4.2,1.4,0.2,"setosa"
 36 | 4.9,3.1,1.5,0.2,"setosa"
 37 | 5,3.2,1.2,0.2,"setosa"
 38 | 5.5,3.5,1.3,0.2,"setosa"
 39 | 4.9,3.6,1.4,0.1,"setosa"
 40 | 4.4,3,1.3,0.2,"setosa"
 41 | 5.1,3.4,1.5,0.2,"setosa"
 42 | 5,3.5,1.3,0.3,"setosa"
 43 | 4.5,2.3,1.3,0.3,"setosa"
 44 | 4.4,3.2,1.3,0.2,"setosa"
 45 | 5,3.5,1.6,0.6,"setosa"
 46 | 5.1,3.8,1.9,0.4,"setosa"
 47 | 4.8,3,1.4,0.3,"setosa"
 48 | 5.1,3.8,1.6,0.2,"setosa"
 49 | 4.6,3.2,1.4,0.2,"setosa"
 50 | 5.3,3.7,1.5,0.2,"setosa"
 51 | 5,3.3,1.4,0.2,"setosa"
 52 | 7,3.2,4.7,1.4,"versicolor"
 53 | 6.4,3.2,4.5,1.5,"versicolor"
 54 | 6.9,3.1,4.9,1.5,"versicolor"
 55 | 5.5,2.3,4,1.3,"versicolor"
 56 | 6.5,2.8,4.6,1.5,"versicolor"
 57 | 5.7,2.8,4.5,1.3,"versicolor"
 58 | 6.3,3.3,4.7,1.6,"versicolor"
 59 | 4.9,2.4,3.3,1,"versicolor"
 60 | 6.6,2.9,4.6,1.3,"versicolor"
 61 | 5.2,2.7,3.9,1.4,"versicolor"
 62 | 5,2,3.5,1,"versicolor"
 63 | 5.9,3,4.2,1.5,"versicolor"
 64 | 6,2.2,4,1,"versicolor"
 65 | 6.1,2.9,4.7,1.4,"versicolor"
 66 | 5.6,2.9,3.6,1.3,"versicolor"
 67 | 6.7,3.1,4.4,1.4,"versicolor"
 68 | 5.6,3,4.5,1.5,"versicolor"
 69 | 5.8,2.7,4.1,1,"versicolor"
 70 | 6.2,2.2,4.5,1.5,"versicolor"
 71 | 5.6,2.5,3.9,1.1,"versicolor"
 72 | 5.9,3.2,4.8,1.8,"versicolor"
 73 | 6.1,2.8,4,1.3,"versicolor"
 74 | 6.3,2.5,4.9,1.5,"versicolor"
 75 | 6.1,2.8,4.7,1.2,"versicolor"
 76 | 6.4,2.9,4.3,1.3,"versicolor"
 77 | 6.6,3,4.4,1.4,"versicolor"
 78 | 6.8,2.8,4.8,1.4,"versicolor"
 79 | 6.7,3,5,1.7,"versicolor"
 80 | 6,2.9,4.5,1.5,"versicolor"
 81 | 5.7,2.6,3.5,1,"versicolor"
 82 | 5.5,2.4,3.8,1.1,"versicolor"
 83 | 5.5,2.4,3.7,1,"versicolor"
 84 | 5.8,2.7,3.9,1.2,"versicolor"
 85 | 6,2.7,5.1,1.6,"versicolor"
 86 | 5.4,3,4.5,1.5,"versicolor"
 87 | 6,3.4,4.5,1.6,"versicolor"
 88 | 6.7,3.1,4.7,1.5,"versicolor"
 89 | 6.3,2.3,4.4,1.3,"versicolor"
 90 | 5.6,3,4.1,1.3,"versicolor"
 91 | 5.5,2.5,4,1.3,"versicolor"
 92 | 5.5,2.6,4.4,1.2,"versicolor"
 93 | 6.1,3,4.6,1.4,"versicolor"
 94 | 5.8,2.6,4,1.2,"versicolor"
 95 | 5,2.3,3.3,1,"versicolor"
 96 | 5.6,2.7,4.2,1.3,"versicolor"
 97 | 5.7,3,4.2,1.2,"versicolor"
 98 | 5.7,2.9,4.2,1.3,"versicolor"
 99 | 6.2,2.9,4.3,1.3,"versicolor"
100 | 5.1,2.5,3,1.1,"versicolor"
101 | 5.7,2.8,4.1,1.3,"versicolor"
102 | 6.3,3.3,6,2.5,"virginica"
103 | 5.8,2.7,5.1,1.9,"virginica"
104 | 7.1,3,5.9,2.1,"virginica"
105 | 6.3,2.9,5.6,1.8,"virginica"
106 | 6.5,3,5.8,2.2,"virginica"
107 | 7.6,3,6.6,2.1,"virginica"
108 | 4.9,2.5,4.5,1.7,"virginica"
109 | 7.3,2.9,6.3,1.8,"virginica"
110 | 6.7,2.5,5.8,1.8,"virginica"
111 | 7.2,3.6,6.1,2.5,"virginica"
112 | 6.5,3.2,5.1,2,"virginica"
113 | 6.4,2.7,5.3,1.9,"virginica"
114 | 6.8,3,5.5,2.1,"virginica"
115 | 5.7,2.5,5,2,"virginica"
116 | 5.8,2.8,5.1,2.4,"virginica"
117 | 6.4,3.2,5.3,2.3,"virginica"
118 | 6.5,3,5.5,1.8,"virginica"
119 | 7.7,3.8,6.7,2.2,"virginica"
120 | 7.7,2.6,6.9,2.3,"virginica"
121 | 6,2.2,5,1.5,"virginica"
122 | 6.9,3.2,5.7,2.3,"virginica"
123 | 5.6,2.8,4.9,2,"virginica"
124 | 7.7,2.8,6.7,2,"virginica"
125 | 6.3,2.7,4.9,1.8,"virginica"
126 | 6.7,3.3,5.7,2.1,"virginica"
127 | 7.2,3.2,6,1.8,"virginica"
128 | 6.2,2.8,4.8,1.8,"virginica"
129 | 6.1,3,4.9,1.8,"virginica"
130 | 6.4,2.8,5.6,2.1,"virginica"
131 | 7.2,3,5.8,1.6,"virginica"
132 | 7.4,2.8,6.1,1.9,"virginica"
133 | 7.9,3.8,6.4,2,"virginica"
134 | 6.4,2.8,5.6,2.2,"virginica"
135 | 6.3,2.8,5.1,1.5,"virginica"
136 | 6.1,2.6,5.6,1.4,"virginica"
137 | 7.7,3,6.1,2.3,"virginica"
138 | 6.3,3.4,5.6,2.4,"virginica"
139 | 6.4,3.1,5.5,1.8,"virginica"
140 | 6,3,4.8,1.8,"virginica"
141 | 6.9,3.1,5.4,2.1,"virginica"
142 | 6.7,3.1,5.6,2.4,"virginica"
143 | 6.9,3.1,5.1,2.3,"virginica"
144 | 5.8,2.7,5.1,1.9,"virginica"
145 | 6.8,3.2,5.9,2.3,"virginica"
146 | 6.7,3.3,5.7,2.5,"virginica"
147 | 6.7,3,5.2,2.3,"virginica"
148 | 6.3,2.5,5,1.9,"virginica"
149 | 6.5,3,5.2,2,"virginica"
150 | 6.2,3.4,5.4,2.3,"virginica"
151 | 5.9,3,5.1,1.8,"virginica"
152 | 


--------------------------------------------------------------------------------
/data/jellyfish.csv:
--------------------------------------------------------------------------------
1 | Location,Width,Length D,6,9D,6.5,8D,6.5,9D,7,9D,7,10D,7,11D,8,9.5D,8,10D,8,10D,8,11D,9,11D,10,13D,11,13D,11,14D,11,14D,12,13D,13,14D,14,16D,15,16D,15,16D,15,19D,16,16S,12,14S,13,17S,14,16.5S,14,19S,15,16S,15,17S,15,18S,15,18S,15,19S,15,21S,16,18S,16,19S,16,20S,16,20S,16,21S,16.5,19S,17,20S,18,19S,18,19S,18,20S,19,20S,19,22S,20,22S,21,21M,11,8M,15,10M,11,11M,16,16M,16,20M,12,20M,8,21M,16.5,19M,13,18M,14,18M,13,16.5M,7,13M,6,13


--------------------------------------------------------------------------------
/data/kenya.wide.csv:
--------------------------------------------------------------------------------
 1 | site.id,gambiae,funestus,arabiensis,merus,lat,long,site.name,total.abundance,PfPOS,PfNEG,PfEX,PfPR,distance,SR
 2 | 24,0.075056861,0.93025019,0.0015163,0,-4.38,39.48,Magaoni,1327,64,14,78,0.8205128,42.15,3
 3 | 2,0.398981324,0.0237691,0.281833616,0.295415959,-3,40.2,Garithe,589,79,21,100,0.79,130.7,4
 4 | 3,0.90512334,0.018975332,0.075901328,0,-3.54,39.61,Kagombani,527,72,28,100,0.72,57.23,3
 5 | 4,0.836852207,0.034548944,0.10940499,0.019193858,-3.15,40.15,Majenjeni,521,58,42,100,0.58,113.4,4
 6 | 21,0.78372591,0.17130621,0.040685225,0.004282655,-4.13,39.29,Amani,467,73,27,100,0.73,43.11,4
 7 | 14,0.444191344,0.466970387,0.061503417,0.027334852,-3.64,39.74,Jaribuni,439,53,45,98,0.540816327,46.25,4
 8 | 27,0.016393443,0.7470726,0.06323185,0.173302108,-4.6,39.167,Tsuini,427,67,33,100,0.67,82.61,4
 9 | 9,0.720095694,0.107655502,0.172248804,0,-3.59,39.53,Paziani,418,72,28,100,0.72,53.66,3
10 | 23,0.112531969,0.846547315,0.007672634,0.033248082,-4.43,39.5,Gazi,391,60,40,100,0.6,45.68,4
11 | 5,0.731903485,0.024128686,0.214477212,0.02919571,-3.13,40.11,Masheheni,373,67,34,101,0.663366337,113.4,4
12 | 28,0.114285714,0.794285714,0,0,-4.19,39.54,Vinuni,350,66,34,100,0.66,21.22,2
13 | 13,0.683890578,0.282674772,0.012158055,0.021276596,-3.74,39.8,Dindiri,329,62,39,101,0.613861386,37.37,4
14 | 11,0.335548173,0.651162791,0.013289037,0,-3.93,39.77,Barani,301,65,35,100,0.65,63.54,3
15 | 6,0.627586207,0.296551724,0.075862069,0,-3.21,40.07,Maziwani,290,48,52,100,0.48,103.4,3
16 | 17,0.641304348,0.329710145,0,0.028985507,-3.67,39.75,Majajani,276,76,25,101,0.752475248,43.18,3
17 | 8,0.496183206,0.030534351,0.404580153,0.06870229,-3.07,40.15,Mjanaheri,262,49,51,100,0.49,121.3,4
18 | 29,0.212765957,0.787234043,0,0,-4.19,39.5,Vuga,188,44,56,100,0.44,24.45,2
19 | 1,0.353658537,0,0.646341463,0,-3.34,40.01,Dabaso,164,38,61,99,0.383838384,87.5,2
20 | 18,0.787234043,0.191489362,0.021276596,0,-3.91,39.74,Mtepeni,141,82,17,99,0.828282828,17.4,3
21 | 10,0.656033058,0.305785124,0.031239669,0,-3.26,40.02,Mijomboni,121,63,37,100,0.63,96.05,3
22 | 30,0.744186047,0.255813953,0,0,-4.16,39.46,Ziwani,86,59,41,100,0.59,26.3,2
23 | 22,0.894117647,0.094117647,0.011764706,0,-4.12,39.37,Dumbule,85,82,18,100,0.82,34.2,3
24 | 15,0.880597015,0.119402985,0,0,-3.71,39.74,Kitsoeni,67,69,31,100,0.69,38.6,2
25 | 19,1,0,0,0,-3.79,39.83,Shariani,58,54,46,100,0.54,33.92,1
26 | 7,0.571428571,0.017857143,0.410714286,0,-3.3,40.09,Mbaraka Chembe,56,59,41,100,0.59,95.54,3
27 | 25,0.652173913,0.304347826,0.043478261,0,-4.15,39.4,Moyeni,46,78,22,100,0.78,31.95,3
28 | 26,0.604651163,0.162790698,0.23255814,0,-4.27,39.58,Mwaroni,43,50,50,100,0.5,26.42,3
29 | 16,0.666666667,0.333333333,0,0,-3.53,39.78,Kitengwani,30,69,30,99,0.696969697,59.1,2
30 | 20,0.631578947,0.315789474,0.052631579,0,-3.68,39.85,Takaungu,19,52,48,100,0.52,45.59,3
31 | 12,1,0,0,0,-3.73,39.71,Chasimba,2,39,61,100,0.39,35.86,1


--------------------------------------------------------------------------------
/data/lec09_CommunityMatrix_Example.csv:
--------------------------------------------------------------------------------
1 | ,Site 1,Site 2,Site 3,Site 4
2 | Species 1,0,0,0,1
3 | Species 2,0,1,1,1
4 | Species 3,1,0,1,0
5 | Species 4,1,1,1,0
6 | Species 5,0,1,0,1


--------------------------------------------------------------------------------
/data/predator_prey_body_size.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/data/predator_prey_body_size.txt


--------------------------------------------------------------------------------
/data/rikz_data.txt:
--------------------------------------------------------------------------------
 1 | "Richness"	"Exposure"	"NAP"	"Beach"	"Site"
 2 | 11	10	0.045	1	1
 3 | 10	10	-1.036	1	2
 4 | 13	10	-1.336	1	3
 5 | 11	10	0.616	1	4
 6 | 10	10	-0.684	1	5
 7 | 8	8	1.19	2	1
 8 | 9	8	0.82	2	2
 9 | 8	8	0.635	2	3
10 | 19	8	0.061	2	4
11 | 17	8	-1.334	2	5
12 | 6	11	-0.976	3	1
13 | 1	11	1.494	3	2
14 | 4	11	-0.201	3	3
15 | 3	11	-0.482	3	4
16 | 3	11	0.167	3	5
17 | 1	11	1.768	4	1
18 | 3	11	-0.03	4	2
19 | 3	11	0.46	4	3
20 | 1	11	1.367	4	4
21 | 4	11	-0.811	4	5
22 | 3	10	1.117	5	1
23 | 22	10	-0.503	5	2
24 | 6	10	0.729	5	3
25 | 0	10	1.627	5	4
26 | 6	10	0.054	5	5
27 | 5	11	-0.578	6	1
28 | 4	11	-0.348	6	2
29 | 1	11	2.222	6	3
30 | 6	11	-0.893	6	4
31 | 4	11	0.766	6	5
32 | 2	11	0.883	7	1
33 | 1	11	1.786	7	2
34 | 1	11	1.375	7	3
35 | 3	11	-0.06	7	4
36 | 4	11	0.367	7	5
37 | 3	10	1.671	8	1
38 | 5	10	-0.375	8	2
39 | 7	10	-1.005	8	3
40 | 5	10	0.17	8	4
41 | 0	10	2.052	8	5
42 | 7	10	-0.356	9	1
43 | 11	10	0.094	9	2
44 | 3	10	-0.002	9	3
45 | 0	10	2.255	9	4
46 | 2	10	0.865	9	5
47 | 


--------------------------------------------------------------------------------
/data/survey.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/data/survey.csv.gz


--------------------------------------------------------------------------------
/data/wc2.0_bio_10m_01.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/data/wc2.0_bio_10m_01.tif


--------------------------------------------------------------------------------
/data/wc2.0_bio_10m_12.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/data/wc2.0_bio_10m_12.tif


--------------------------------------------------------------------------------
/image/Liriodendron_tulipifera.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/Liriodendron_tulipifera.png


--------------------------------------------------------------------------------
/image/RIKZ_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/RIKZ_data.png


--------------------------------------------------------------------------------
/image/RIKZ_data_Crossed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/RIKZ_data_Crossed.png


--------------------------------------------------------------------------------
/image/RIKZ_data_DeepNest.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/RIKZ_data_DeepNest.png


--------------------------------------------------------------------------------
/image/SEM-figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/SEM-figure.png


--------------------------------------------------------------------------------
/image/SEMfig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/SEMfig.png


--------------------------------------------------------------------------------
/image/assignment-8-figure-q1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/assignment-8-figure-q1.png


--------------------------------------------------------------------------------
/image/boxplot-problem.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/boxplot-problem.gif


--------------------------------------------------------------------------------
/image/colourblind.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/colourblind.png


--------------------------------------------------------------------------------
/image/comic-filenaming.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/comic-filenaming.gif


--------------------------------------------------------------------------------
/image/dynamite-bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/dynamite-bars.png


--------------------------------------------------------------------------------
/image/dynamite-vs-dists.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/dynamite-vs-dists.png


--------------------------------------------------------------------------------
/image/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/favicon.png


--------------------------------------------------------------------------------
/image/fig_scientific_method.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/fig_scientific_method.png


--------------------------------------------------------------------------------
/image/git_lesson/branch_dropdown.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/git_lesson/branch_dropdown.png


--------------------------------------------------------------------------------
/image/git_lesson/branches.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/git_lesson/branches.png


--------------------------------------------------------------------------------
/image/git_lesson/delete_branch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/git_lesson/delete_branch.png


--------------------------------------------------------------------------------
/image/git_lesson/sample_rmd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/git_lesson/sample_rmd.png


--------------------------------------------------------------------------------
/image/git_lesson/yellow_prompt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/git_lesson/yellow_prompt.png


--------------------------------------------------------------------------------
/image/heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/heatmap.png


--------------------------------------------------------------------------------
/image/logistic.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/logistic.gif


--------------------------------------------------------------------------------
/image/lotka-volterra.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/lotka-volterra.gif


--------------------------------------------------------------------------------
/image/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/model.png


--------------------------------------------------------------------------------
/image/predator-prey.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/predator-prey.gif


--------------------------------------------------------------------------------
/image/signal-transduction-pathway.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/signal-transduction-pathway.png


--------------------------------------------------------------------------------
/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'Syllabus: EEB313 Quantitative Methods in R for Biology [24L, 12P]'
  3 | ---
  4 | 
  5 | This course covers statistics and data analysis for ecology and reproducible quantitative methods in R. Statistical analysis, modelling, simulation, and data analysis are essential skills for applying ecology concepts to data. This course is designed to meet a growing demand for reproducible, openly accessible, analytically thorough, and well documented science. Students will learn to develop ecological population models, analyze data, and document their research using the R programming language. No prerequisite programming experience is required.
  6 | 
  7 | Prerequisites: BIO220H1 and one of EEB225H1, STA288H1, or STA220H1
  8 | 
  9 | ## Time
 10 | Tue and Thu 2:10 - 4:00 pm. Office hours are Tue 4:00 - 5:00 pm.
 11 | 
 12 | ## Class locations
 13 | 
 14 | | Day | Room         |
 15 | |-----|--------------|
 16 | | Tue | [Ramsay Wright](http://map.utoronto.ca/utsg/building/072) (RW 109) |
 17 | | Thu | [Ramsay Wright](http://map.utoronto.ca/utsg/building/072) (RW 109) |
 18 | 
 19 | Office hours are on Tuesdays from 4 to 5 PM in RW 109.
 20 | 
 21 | The lecture hall has access to individual computers for the students. To use the computer workstations, students can login with their UTORid and password. Programs and packages that you install, and files that you save, will be deleted from these computers daily. Please bring a USB key to save files onto or email them to yourself. Students can use any of the lecture halls when there are no classes scheduled. Lecture halls are usually open 9 am - 5 pm, see the [online schedules](http://lab.chass.utoronto.ca/carr.php) for available times.
 22 | 
 23 | ## Contact info
 24 | Quercus is the preferred communication channel. If you need to use email instead, please address all general course-related issues to james.santangelo@mail.utoronto.ca, and project specific communication to the respective TA of your group. Prefix the subject matter with "EEB313". If you do not receive a reply within 48 hours (excluding week-ends), please send a reminder.
 25 | 
 26 | ### Course Instructors
 27 | - James Santangelo, james.santangelo@mail.utoronto.ca
 28 | - Ahmed Hasan, ahmed.hasan@mail.utoronto.ca
 29 | - Zoe Humphries, zoe.humphries@mail.utoronto.ca
 30 | - Amber Hoi, amber.hoi@mail.utoronto.ca
 31 | 
 32 | ### Supervising professor
 33 | Prof. Benjamin Gilbert, benjamin.gilbert@utoronto.ca , 416-978-4065, ES3035
 34 | 
 35 | ## Course Website and Quercus
 36 | All course information is accessible [on its own website](https://uoftcoders.github.io/rcourse/) and on [Quercus](https://q.utoronto.ca), including the syllabus, assessments, and lecture slides. If you have any problem accessing the material, let us know via email right away so we can fix the problem.
 37 | 
 38 | ## Recommended resources
 39 | - [R for Data science](http://r4ds.had.co.nz/), H Wickham, G Grolemund, 2017
 40 |     - Excellent open access resource for R.
 41 | - [RStudio cheat sheets](https://www.rstudio.com/resources/cheatsheets/), RStudio, 2017
 42 |     - As good as it sounds, great quick reference.
 43 | - [R for ecological data science](https://datacarpentry.org/R-ecology-lesson/index.html)
 44 |     - An inspiration for our lectures.
 45 | 
 46 | ## Course learning outcomes
 47 | 1. Develop proficiency in the programming language R.
 48 | 2. Use R to apply statistics to analyze and interpret data.
 49 | 3. Choose appropriate analysis techniques for a variety of data types and formats.
 50 | 4. Learn and use techniques and best practices for reproducible, high-quality science.
 51 | 5. Learn how to work as part of a research team to produce a scientific product.
 52 | 6. Learn what is required to generate a scientific item ready for publishing.
 53 | 
 54 | ## Improving your writing skills
 55 | Effective communication is crucial in science. The [University of Toronto provides services](http://writing.utoronto.ca/) to help you improve your writing, from general advice on effective writing to writing centers and writing courses. The Faculty of Arts & Science also offers an English Language Learning (ELL) program, which provides free individualized instruction in English skills. Take advantage of these!
 56 | 
 57 | ## Academic integrity
 58 | You should be aware of the University of Toronto Code of Behaviour on Academic Matters. Also see [How Not to Plagiarize](http://advice.writing.utoronto.ca/using-sources/how-not-to-plagiarize/). Note that it is NOT appropriate to use large sections from internet sources, and inserting a few words here and there does not make it an original piece of writing. Be careful in using internet sources – there is no review of most online material and there are many errors out there. Use only academic or government internet sources when absolutely necessary. Make sure you read material from many sources (published, peer-reviewed, trusted internet sources) and that you write an original text using this information. Always cite your sources. In case of doubt about plagiarism, talk to your instructor. Please make sure that what you submit for the final project does not overlap with what you submit for other classes, such as the 4th year research project. We will not enforce this, but the department will.
 59 | 
 60 | ## Lecture schedule
 61 | | Week | Date   | Topic                                      | Instructor                |
 62 | |------|--------|--------------------------------------------|---------------------------|
 63 | | 1    | Sep 10 | Intro to course, programming, RStudio, R Markdown   | Everyone       |
 64 | | 1    | Sep 12 | Assignment, vectors, functions                      | Ahmed          |
 65 | | 2    | Sep 17 | Data frames, intro to dplyr                         | Ahmed          |
 66 | | 2    | Sep 19 | Data wrangling in dplyr, ggplot, tidy data          | Ahmed          |
 67 | | 3    | Sep 24 | More dplyr and ggplot                               | Ahmed          |
 68 | | 3    | Sep 26 | Exploratory data analysis                           | Zoe            |
 69 | | 4    | Oct 01 | Linear models and statistical modelling             | Zoe            |
 70 | | 4    | Oct 03 | Mixed effects models                                | James          |
 71 | | 5    | Oct 08 | Model selection                                     | James          |
 72 | | 5    | Oct 10 | Multivariate stats                                  | Amber          |
 73 | | 6    | Oct 15 | Spatial stats                                       | Amber          |
 74 | | 6    | Oct 17 | Simulating data                                     | James          |
 75 | | 7    | Oct 22 | Ecological modelling                                | Amber          |
 76 | | 7    | Oct 24 | Evolutionary modelling                              | Zoe            |
 77 | | 8    | Oct 29 | Reproducible science                                | Everyone       |
 78 | | 8    | Oct 31 | Datasets, hypotheses, begin projects                | Everyone       |
 79 | | -    | Nov 05 | Fall break                                          | -              |
 80 | | -    | Nov 07 | Fall break                                          | -              |
 81 | | 9    | Nov 12 | Project work                                        | Everyone       |
 82 | | 9    | Nov 14 | Project work                                        | Everyone       |
 83 | | 10   | Nov 19 | Project work                                        | Everyone       |
 84 | | 10   | Nov 21 | Project work                                        | Everyone       |
 85 | | 11   | Nov 26 | Project work                                        | Everyone       |
 86 | | 11   | Nov 28 | Project work                                        | Everyone       |
 87 | | 12   | Dec 03 | Group presentations                                 | Everyone       |
 88 | 
 89 | ## Assessment schedule
 90 | | Assignment                       | Type                | Due date   | Marks |
 91 | |----------------------------------|---------------------|------------|-------|
 92 | | Getting set up                   | Individual          | Sep 19     | 4     |
 93 | | Basic R and dplyr                | Individual          | Sep 26     | 8     |
 94 | | dplyr and tidy data              | Individual          | Oct 03     | 8     |
 95 | | Data exploration, linear models  | Individual          | Oct 10     | 8     |
 96 | | Model selection, multivar. stats | Individual          | Oct 17     | 8     |
 97 | | Spatial stats, randomization     | Individual          | Oct 24     | 8     |
 98 | | Modelling                        | Individual          | Oct 31     | 8     |
 99 | | Mid-project update               | Project, Group      | Nov 14     | 10    |
100 | | Challenge assignment             | Individual          | Nov 21     | 16    |
101 | | Final report, presentation       | Project, Group      | Dec 03     | 22    |
102 | 
103 | There are 100 marks in total. Your final course mark will be the sum of your assignment scores, which will be translated to a letter grade according to the [official grading scale](http://www.artsci.utoronto.ca/faculty-staff/teacher-info/academic-handbook-for-instructors/sections-9-11#official) of the Faculty of Arts and Science.
104 | 
105 | Assignments will be distributed and submitted in the R Markdown format via Quercus. Assignments will be handed out on Tuesdays and are due 11:59 pm on the Thursday seven weekdays later. _There will be a penalty of 5% per day (including week-ends) for late submissions_.
106 | 
107 | 
108 | ### Final project grading rubric
109 | 
110 | |                                                | Inadequate (0 marks)                                                                                                                                                                                          | Adequate (4 marks)                                                                                                                                                | Excellent (8 marks)                                                                                                                                                                    |
111 | |------------|--------------------|--------------------|--------------------|
112 | | Contribution to group work | Student contributed little to project; self-assessed contributions are low in quality and/or quantity; self-assessment is not consistent with actual contribution.                                            | Student contributed adequately to project; made some significant contributions                                                                                    | Student substantially contributed to project to ensure success; self-assessed contributions are crucial to project; self-assessment is consistent with actual contribution. |
113 | | Content                                        | Missing crucial information; methods and results are inconsistent, not logical, or not adequately explained; conclusions are confusing or unsupported by results; unnecessary information included as clutter | Most essential information included; methods and results are adequately described; conclusions supported by results; most included material is relevant to report | All essential information included; methods and results are succinct, clear, logical, and scientifically valid; conclusions are creative and meaningful; project is concise throughout |
114 | | Style and reproducibility                      | Code and writing are poorly organized, poorly formatted, missing units, difficult to read, poorly documented, difficult to reproduce analyses                                                                 | Code and writing are well-organized, well-formatted, consistent use of units and significant figures                                                              | Code and writing are precise and clear throughout, free of errors, well-organized, well-documented, easily reproducible analyses, publication-ready                                    |
115 | | Presentation                      | Presentation is poorly organized; much too long or much too short; presentation is unclear; presentation is missing information; presentation is not scientific and professional; presentation uses too much jargon; not all team members participate; does not adequately address audience questions            | Presentation is adequately organized; timing is appropriate; most information is presented logically; presentation is scientific and professional; most jargon is avoided; all team members participate but not equally; audience questions are sometimes addressed well | Presentation is clearly and logically organized; presentation flows and is easy to follow; presentation includes appropriate information without jargon; presentation is well-rehearsed and high-quality; all team members participate equally; audience questions are clearly addressed |
116 | 
117 | As the final project is a team effort, all members within a group will receive the same mark in the final three categories and an individual mark for their contribution to group work. A final project that is considered to lie between two of the defined levels will be marked accordingly, e.g. between "Adequate" and "Excellent" would be 5, 6, or 7 marks.
118 | 


--------------------------------------------------------------------------------
/lec04-dplyr.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Data wrangling and visualization in the tidyverse
  3 | author: Joel Östblom
  4 | ---
  5 | 
  6 | ## Lesson preamble
  7 | 
  8 | > ### Learning Objectives
  9 | >
 10 | > - Understand the split-apply-combine concept for data analysis.
 11 | > - Use `summarize`, `group_by`, and `tally` to split a data frame into groups
 12 | > of observations, apply a summary statistic to each group, and then combine
 13 | > the results.
 14 | > - Produce scatter plots, line plots, and histograms using ggplot.
 15 | > - Set universal plot settings.
 16 | > 
 17 | > ### Lesson outline
 18 | >
 19 | > - Split-apply-combine techniques in **`dplyr`** (25 min)
 20 | > - Using `tally` to summarize categorical data (15 min)
 21 | > - Plotting with **`ggplot2`** (20 min)
 22 | > - Building plots iteratively (25 min)
 23 | 
 24 | -----
 25 | 
 26 | ## Setting up
 27 | 
 28 | Start by loading the required packages. Both **`ggplot2`** and **`dplyr`** are
 29 | included in the **`tidyverse`** package collection.
 30 | 
 31 | ```{r}
 32 | # Install if needed
 33 | # install.packages('tidyverse')
 34 | library(tidyverse)
 35 | ```
 36 | 
 37 | Load the data we saved in the previous lesson.
 38 | 
 39 | ```{r, eval=FALSE}
 40 | # Download if needed; this saves into data/, so point read_csv() at that path too
 41 | # download.file("https://ndownloader.figshare.com/files/2292169", "data/portal_data.csv")
 42 | surveys <- read_csv('portal_data.csv')
 43 | ```
 44 | 
 45 | ```{r, echo=FALSE}
 46 | surveys <- read_csv('data/portal_data.csv')
 47 | ```
 48 | 
 49 | ```{r}
 50 | surveys
 51 | ```
 52 | 
 53 | 
 54 | ## Split-apply-combine techniques in dplyr
 55 | 
 56 | Many data analysis tasks can be approached using the *split-apply-combine*
 57 | paradigm: split the data into groups, apply some analysis to each group, and
 58 | then combine the results.
 59 | 
 60 | **`dplyr`** facilitates this workflow through the use of `group_by()`
 61 | to split data and `summarize()`, which collapses each group into a single-row
 62 | summary of that group. The arguments to `group_by()` are the column names that
 63 | contain the **categorical** variables for which you want to calculate the
 64 | summary statistics. Let's view the mean `weight` by sex.
 65 | 
 66 | ```{r}
 67 | surveys %>%
 68 |     group_by(sex) %>%
 69 |     summarize(mean_weight = mean(weight))
 70 | ```
 71 | 
 72 | The mean weights become `NA` since there are individual observations that are
 73 | `NA`. Let's remove those observations.
 74 | 
 75 | ```{r}
 76 | surveys %>%
 77 |     filter(!is.na(weight)) %>%
 78 |     group_by(sex) %>%
 79 |     summarize(mean_weight = mean(weight))
 80 | ```
 81 | 
 82 | There is one row here that is neither male nor female; these are observations
 83 | where the animal escaped before the sex could be determined. Let's remove
 84 | those as well.
 85 | 
 86 | ```{r}
 87 | surveys %>%
 88 |     filter(!is.na(weight) & !is.na(sex)) %>%
 89 |     group_by(sex) %>%
 90 |     summarize(mean_weight = mean(weight))
 91 | ```
 92 | 
 93 | You can also group by multiple columns:
 94 | 
 95 | ```{r}
 96 | surveys %>%
 97 |     filter(!is.na(weight) & !is.na(sex)) %>%
 98 |     group_by(genus, sex) %>%
 99 |     summarize(mean_weight = mean(weight))
100 | ```
101 | 
102 | Since we will use the same filtered and grouped data frame in multiple code
103 | chunks below, we could assign this subset of the data to a new variable and use
104 | this variable in the subsequent code chunks instead of typing out the functions
105 | each time.
106 | 
107 | ```{r}
108 | filtered_surveys <- surveys %>%
109 |     filter(!is.na(weight) & !is.na(sex)) %>% # drop rows missing weight or sex
110 |     group_by(genus, sex) # grouping is stored, so later summarize() calls work per genus and sex
111 | ```
112 | 
113 | If you want to display more data, you can use the `print()` function at the end
114 | of your chain with the argument `n` specifying the number of rows to display.
115 | 
116 | ```{r}
117 | filtered_surveys %>%
118 |     summarize(mean_weight = mean(weight)) %>%
119 |     print(n = 15) # Will change the knitted output, not the notebook
120 | ```
121 | 
122 | Once the data are grouped, you can also summarize multiple variables at the same
123 | time. For instance, we could add a column indicating the minimum weight for each
124 | genus for each sex:
125 | 
126 | ```{r}
127 | filtered_surveys %>%
128 |     summarize(mean_weight = mean(weight),
129 |               min_weight = min(weight))
130 | ```
131 | 
132 | #### Challenge
133 | 
134 | 1. Use `group_by()` and `summarize()` to find the mean, min, and max hindfoot
135 | length for each species.
136 | 
137 | 2. What was the heaviest animal measured in each year? Return the columns `year`,
138 | `genus`, `species`, and `weight`.
139 | 
140 | ```{r, include=FALSE}
141 | ## Answer 1: hindfoot length summary per species (rows without a measurement are dropped first)
142 | surveys %>%
143 |     filter(!is.na(hindfoot_length)) %>%
144 |     group_by(species) %>%
145 |     summarize(
146 |         mean_hindfoot_length = mean(hindfoot_length),
147 |         min_hindfoot_length = min(hindfoot_length),
148 |         max_hindfoot_length = max(hindfoot_length)
149 |     )
150 | ## Answer 2: heaviest animal(s) recorded in each year
151 | surveys %>%
152 |     filter(!is.na(weight)) %>%
153 |     group_by(year) %>%
154 |     filter(weight == max(weight)) %>% # max() is computed within each year; every row tying for the maximum is kept
155 |     select(year, genus, species, weight) %>%
156 |     arrange(year)
157 | ```
158 | 
159 | 
160 | ### Using tally to summarize categorical data
161 | 
162 | When working with data, it is also common to want to know the number of
163 | observations found for each factor or combination of factors. For this, **`dplyr`**
164 | provides `tally()`. For example, if we want to group by taxa and find the
165 | number of observations for each taxon, we would do:
166 | 
167 | ```{r}
168 | surveys %>%
169 |     group_by(taxa) %>%
170 |     tally()
171 | ```
172 | 
173 | We can also use `tally()` when grouping on multiple variables:
174 | 
175 | ```{r}
176 | surveys %>%
177 |     group_by(taxa, sex) %>%
178 |     tally()
179 | ```
180 | 
181 | Here, `tally()` is the action applied to the groups created by `group_by()` and
182 | counts the total number of records for each category.
183 | 
184 | If there are many groups, `tally()` is not that useful on its own. For example,
185 | when we want to view the five most abundant species among the observations:
186 | 
187 | ```{r}
188 | surveys %>%
189 |     group_by(species) %>%
190 |     tally()
191 | ```
192 | 
193 | Since there are 40 rows in this output, we would like to order the table to
194 | display the most abundant species first. In `dplyr`, we say that we want to
195 | `arrange()` the data.
196 | 
197 | ```{r}
198 | surveys %>%
199 |     group_by(species) %>%
200 |     tally() %>%
201 |     arrange(n)
202 | ```
203 | 
204 | Still not that useful. Since we are interested in the most abundant species, we
205 | want to display those with the highest count first, in other words, we want to
206 | arrange the column `n` in descending order:
207 | 
208 | ```{r}
209 | surveys %>%
210 |     group_by(species) %>%
211 |     tally() %>%
212 |     arrange(desc(n)) %>%
213 |     head(5)
214 | ```
215 | 
216 | If we want to include more attributes about these species, we can include these
217 | in the call to `group_by()`:
218 | 
219 | ```{r}
220 | surveys %>%
221 |     group_by(species, taxa, genus) %>%
222 |     tally() %>%
223 |     arrange(desc(n)) %>%
224 |     head(5)
225 | ```
226 | 
227 | Be careful not to include anything that would split the group into subgroups,
228 | such as `sex`, `year` etc.
229 | 
230 | #### Challenge
231 | 
232 | 1. How many individuals were caught in each `plot_type` surveyed?
233 | 
234 | 2. You saw above how to count the number of individuals of each `sex` using a
235 | combination of `group_by()` and `tally()`. How could you get the same result
236 | using `group_by()` and `summarize()`? Hint: see `?n`.
237 | 
238 | 
239 | ```{r, include=FALSE}
240 | ## Answer 1: number of records per plot type
241 | surveys %>%
242 |     group_by(plot_type) %>%
243 |     tally()
244 | 
245 | ## Answer 2: the same count as tally(), written with summarize()
246 | surveys %>%
247 |   group_by(sex) %>%
248 |   summarize(n = n()) # n() returns the number of rows in the current group
249 | ```
250 | 
251 | 
252 | ## Plotting with ggplot2
253 | 
254 | **`ggplot2`** is a plotting package that makes it simple to create complex plots
255 | from data frames. The name **`ggplot2`** comes from its inspiration, the book "A
256 | grammar of graphics", and the main goal is to allow coders to express
257 | their desired outcome on a high level instead of telling the computer every
258 | detail about what will happen. For example, you would say "color my data by
259 | species" instead of "go through this data frame and plot any observations of
260 | species1 in blue, any observations of species2 in red, etc". Thanks to this
261 | functional way of interfacing with data, only minimal changes are required if the
262 | underlying data change or to change the type of plot. This helps in thinking
263 | about the data and creating publication quality plots with minimal amounts of
264 | adjustments and tweaking.
265 | 
266 | ggplot graphics are built step by step by adding new elements, or layers. Adding layers in
267 | this fashion allows for extensive flexibility and customization of plots. To
268 | build a ggplot, we need to:
269 | 
270 | 1. Use the `ggplot()` function and bind the plot to a specific data frame using the
271 |       `data` argument
272 | 
273 | ```{r}
274 | ggplot(data = surveys)
275 | ```
276 | 
277 | Remember, if the arguments are provided in the right order then the names of the
278 | arguments can be omitted.
279 | 
280 | ```{r}
281 | ggplot(surveys)
282 | ```
283 | 
284 | 2. Define aesthetics (`aes`), by selecting the variables to be plotted and the
285 |    variables to define the presentation such as plotting size, shape, color, etc.
286 | 
287 | ```{r}
288 | ggplot(surveys, aes(x = weight, y = hindfoot_length))
289 | ```
290 | 
291 | 3. Add `geoms` -- geometrical objects as a graphical representation of the data
292 | in the plot (points, lines, bars). **`ggplot2`** offers many different geoms; we
293 | will use a few common ones today, including:
294 |       * `geom_point()` for scatter plots, dot plots, etc.
295 |       * `geom_line()` for trend lines, time-series, etc.
296 |       * `geom_histogram()` for histograms
297 | 
298 | To add a geom to the plot use `+` operator. Because we have two continuous
299 | variables, let's use `geom_point()` first:
300 | 
301 | ```{r}
302 | # If this takes way too long on your machine, create a subset from a random
303 | # sample of a suitable size and continue working with this instead of `surveys`.
304 | # survey_subset <- sample_n(surveys, size = 5000)
305 | 
306 | ggplot(surveys, aes(x = weight, y = hindfoot_length)) +
307 |   geom_point()
308 | ```
309 | 
310 | The `+` in the **`ggplot2`** package is particularly useful because it allows you
311 | to modify existing `ggplot` objects. This means you can easily set up plot
312 | "templates" and conveniently explore different types of plots, so the above
313 | plot can also be generated with code like this:
314 | 
315 | ```{r, first-ggplot-with-plus}
316 | # Assign plot to a variable
317 | surveys_plot <- ggplot(surveys, aes(x = weight, y = hindfoot_length))
318 | 
319 | # Draw the plot
320 | surveys_plot + geom_point()
321 | ```
322 | 
323 | Notes:
324 | 
325 | - Anything you put in the `ggplot()` function can be seen by any geom layers
326 |   that you add (i.e., these are universal plot settings). This includes the x and
327 |   y axis you set up in `aes()`.
328 | - You can also specify aesthetics for a given geom independently of the
329 |   aesthetics defined globally in the `ggplot()` function.
330 | - The `+` sign used to add layers must be placed at the end of each line containing
331 | a layer. If, instead, the `+` sign is placed at the start of the next layer's line,
332 | **`ggplot2`** will not add the new layer and R will return an error message.
333 | 
334 | 
335 | ### Building plots iteratively
336 | 
337 | Building plots with ggplot is typically an iterative process. We start by
338 | defining the dataset we'll use, lay the axes, and choose a geom:
339 | 
340 | ```{r}
341 | ggplot(surveys, aes(x = weight, y = hindfoot_length)) +
342 |     geom_point()
343 | ```
344 | 
345 | Then, we start modifying this plot to extract more information from it. For
346 | instance, we can add transparency (`alpha`) to reduce overplotting:
347 | 
348 | 
349 | ```{r}
350 | ggplot(data = surveys, aes(x = weight, y = hindfoot_length)) +
351 |     geom_point(alpha = 0.2)
352 | ```
353 | 
354 | Based on the hindfoot length and the weights, there appear to be 4-5 clusters
355 | in this data. Potentially, one of the categorical variables we have in the data
356 | could explain this pattern. Coloring the data points according to a
357 | categorical variable is an easy way to find out if there seems to be
358 | correlation. Let's try this with `plot_type`.
359 | 
360 | ```{r}
361 | ggplot(surveys, aes(x = weight, y = hindfoot_length, color = plot_type)) +
362 |     geom_point(alpha = 0.2)
363 | ```
364 | 
365 | It seems like the type of plot the animal was captured on correlates well with
366 | some of these clusters, but there are still many that are quite mixed. Let's try
367 | to do better! This time, the information about the data can provide some clues
368 | to which variable to look at. The plot above suggests that there might be 4-5
369 | clusters, so a variable with 4-5 values is a good guess for what could explain
370 | the observed pattern in the scatter plot.
371 | 
372 | ```{r}
373 | surveys %>%
374 |     summarize_all(n_distinct)
375 | ```
376 | 
377 | Remember that there are still `NA` values here; that's why there appear to be
378 | three sexes although there are only male and female. There are four taxa, so that
379 | could be a good candidate. Let's see which those are.
380 | 
381 | ```{r}
382 | surveys %>%
383 |     distinct(taxa)
384 | ```
385 | 
386 | It seems reasonable that these taxa contain animals different enough to have
387 | diverse weights and hindfoot lengths. Let's use this categorical variable to
388 | color the scatter plot.
389 | 
390 | ```{r}
391 | ggplot(surveys, aes(x = weight, y = hindfoot_length, color = taxa)) +
392 |     geom_point(alpha = 0.2)
393 | ```
394 | 
395 | Only rodents? That was unexpected... Let's check what's going on.
396 | 
397 | ```{r}
398 | surveys %>%
399 |     group_by(taxa) %>%
400 |     tally()
401 | ```
402 | 
403 | There are definitely mostly rodents in our data set...
404 | 
405 | ```{r}
406 | surveys %>%
407 |     filter(!is.na(hindfoot_length)) %>% # control by removing `!`
408 |     group_by(taxa) %>%
409 |     tally()
410 | ```
411 | 
412 | ...and it turns out that only rodents have had their hindfeet measured!
413 | 
414 | Let's remove all animals that did not have their hindfeet measured, including
415 | those rodents that did not. Animals without their weight measured will also be
416 | removed.
417 | 
418 | ```{r}
419 | surveys_hf_wt <- surveys %>%
420 |     filter(!is.na(hindfoot_length) & !is.na(weight))
421 | 
422 | surveys_hf_wt %>%
423 |     summarize_all(n_distinct)
424 | ```
425 | 
426 | Maybe the genus can explain what we are seeing.
427 | 
428 | ```{r}
429 | ggplot(surveys_hf_wt, aes(x = weight, y = hindfoot_length, color = genus)) +
430 |     geom_point(alpha = 0.2)
431 | ```
432 | 
433 | Now this looks good! There is a clear separation between different genera, but
434 | also significant spread within each genus, for example in the weight of the green
435 | Neotoma observations. There are also two clearly separate clusters that are both
436 | colored in olive green (Dipodomys). Maybe separating the observations into
437 | different species would be better?
438 | 
439 | ```{r}
440 | ggplot(surveys_hf_wt, aes(x = weight, y = hindfoot_length, color = species)) +
441 |     geom_point(alpha = 0.2)
442 | ```
443 | 
444 | Great! Together with the genus plot, this definitely seems to explain most of the
445 | variance we see in the hindfoot length and weight measurements. It is still a
446 | bit messy as it appears like we have around 5 clusters, but there are 21 species
447 | in the legend.
448 | 
449 | ```{r}
450 | surveys %>%
451 |     filter(!is.na(hindfoot_length) & !is.na(weight)) %>%
452 |     group_by(species) %>%
453 |     tally() %>%
454 |     arrange(desc(n))
455 | ```
456 | 
457 | There is a big drop from 838 to 159, let's include only those with more than 800
458 | observations.
459 | 
460 | ```{r}
461 | surveys_abun_species <- surveys %>%
462 |     filter(!is.na(hindfoot_length) & !is.na(weight)) %>% # keep rows with both measurements
463 |     group_by(species) %>%
464 |     mutate(n = n()) %>% # add count value to each row
465 |     filter(n > 800) %>% # keep only species with more than 800 such rows
466 |     select(-n) # drop the helper column; the result is still grouped by species
467 | 
468 | surveys_abun_species
469 | ```
470 | 
471 | We still have almost 25k observations, so only 10k were removed.
472 | 
473 | ```{r}
474 | ggplot(surveys_abun_species, aes(x = weight, y = hindfoot_length, color = species)) +
475 |     geom_point(alpha = 0.2)
476 | ```
477 | 
478 | 
479 | #### Challenge
480 | 
481 | Create a scatter plot of `hindfoot_length` over `species` with the `weight` showing in different colors.
482 | Is there any problem with this plot? *Hint: think about how many observations there are*
483 | 
484 | ```{r, include=FALSE}
485 | ggplot(surveys_abun_species, aes(x = species, y = hindfoot_length, color = weight)) + # hindfoot_length over species, weight as color
486 |     geom_point(size = 0.1, position = 'jitter') # small, jittered points because many observations overlap
487 | ```
488 | 
489 | 
490 | *Parts of this lesson material were taken and modified from [Data
491 | Carpentry](https://datacarpentry.org) under their CC-BY copyright license. See
492 | their [lesson page](https://datacarpentry.org/R-ecology-lesson/03-dplyr.html)
493 | for the original source.*
494 | 


--------------------------------------------------------------------------------
/lec14-datasets.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Scientific method, team dynamics, and project datasets"
  3 | author: "Luke Johnston"
  4 | output: pdf_document
  5 | ---
  6 | 
  7 | ## Lesson preamble:
  8 | 
  9 | > ### Lesson objectives:
 10 | > 
 11 | > - Learn about the scientific method and applying it
 12 | > - Learn basics of group dynamics in a team setting
 13 | > - Get into your groups, start your projects
 14 | >       - Identify which dataset to use
 15 | >       - Brainstorm possible research questions
 16 | >
 17 | > ### Lesson outline:
 18 | >
 19 | > - What is the scientific method (20 min)
 20 | > - Team dynamics and assigning roles and tasks (15-20 min)
 21 | > - Getting into groups (10 min)
 22 | >     - Exercise (25-30 min)
 23 | > - Start project work (20-25 min)
 24 | > - Datasets available for use (10-20 min)
 25 | 
 26 | -----
 27 | 
 28 | ```{r message=FALSE, warning=FALSE, include=FALSE, eval=FALSE}
 29 | # Run these locally if making changes (eval=FALSE, so this chunk never runs on knit).
 30 | library(dplyr) # only the %>% pipe is used below
 31 | DiagrammeR::grViz('
 32 | digraph rmarkdown {
 33 |     graph [layout = neato, overlap = false, splines = true]
 34 |     node [shape = box, style = rounded]
 35 | 
 36 |     Hypothesis [pos = "0.75,3.85!"]
 37 |     CollectData [label = "Collect Data", pos = "0.75,3!"]
 38 |     AnalyzeData [label = "Analyze Data", pos = "0.75,2.25!"]
 39 |     HypothesisTrue [label = "Hypothesis\nif TRUE", pos = "0,1.5!"]
 40 |     HypothesisFalse [label = "Hypothesis\nif FALSE", pos = "1.5,1.5!"]
 41 |     Report [label = "Disseminate", pos = "0.75,0.75!"]
 42 | 
 43 |     Hypothesis -> CollectData -> AnalyzeData -> HypothesisTrue -> Report
 44 |     AnalyzeData -> HypothesisFalse -> Report
 45 |     {HypothesisTrue HypothesisFalse AnalyzeData CollectData} -> Hypothesis
 46 | }') %>% 
 47 |         DiagrammeRsvg::export_svg() %>% # presumably returns the graph as SVG text -- confirm
 48 |         charToRaw() %>% # character string -> raw bytes
 49 |         rsvg::rsvg_png(file = "image/fig_scientific_method.png", 
 50 |                        width = 500, height = 700) # writes the PNG embedded later in this lesson
 51 | ```
 52 | 
 53 | ## Scientific method
 54 | 
 55 | <!-- What is the scientific method? -->
 56 | 
 57 | ### Simplified diagram of the scientific method
 58 | 
 59 | ![Simplified process flow for the scientific method](image/fig_scientific_method.png)
 60 | 
 61 | ### Steps in the scientific process[^sci_method]
 62 | 
 63 | 1. Identify research question(s).
 64 | 2. Look into what the previous literature shows.
 65 | 3. Create one or more hypotheses or objectives.
 66 | 4. Write up an outline or expected approach to answering those
 67 | questions/objectives (analysis and presentation plan):
 68 |     - How will the data be obtained and what is the data (i.e. the source)?
 69 |     - What statistical/mathematical techniques have previous researchers used?
 70 |     Will you use them? Are they appropriate (optional, may need expert)?
 71 |     - How will the results/data be presented or visualized (possible
 72 |     tables/figures)?
 73 | 5. Run the planned analyses (or additional ones that may come up).
 74 | 6. Visualize or present all results from the analyses.
 75 | 7. Interpret the results and how they fit with the previous literature.
 76 | 8. Draw conclusions based on the hypotheses/objectives.
 77 | 9. Disseminate your results (in blogs, pre-print archives, journals, conferences)
 78 | 
 79 | [^sci_method]: See [Khan Academy on Scientific Method](https://www.khanacademy.org/science/biology/intro-to-biology/science-of-biology/a/the-science-of-biology) for a brief overview and explanation on the scientific process.
 80 | 
 81 | ## Team dynamics
 82 | 
 83 | ### Basics of succeeding as a team
 84 | 
 85 | Final assignment is in a group and to succeed, you need to understand team dynamics:
 86 | 
 87 | - Communication is **vital** to work together and to achieve the goal
 88 | - Teams go through various stages
 89 | - Need consensus for group norms, goals, duties/responsibilities, and conduct/behaviour
 90 |     - Important that everyone has a stake in the project
 91 | - Rotate roles (specifically for the leader/facilitator)
 92 | 
 93 | ### Stages of group formation
 94 | 
 95 | - "Forming-Storming-Norming-Performing-Adjourning Model"[^group_forming]
 96 | - Essentially, groups go through stages (not always all and not always in order):
 97 |     1. Getting to know one another
 98 |     2. Feeling comfortable and safer, testing boundaries
 99 |     3. Opening up, voicing opinions, potential for conflict (which can be good!)
100 |     4. Stronger bonds form, cooperative
101 |     5. Greater focus and energy on completion of project
102 |     6. Team ends after project is successfully completed
103 | - These stages can be fast-tracked by discussing norms, duties, and conduct early on
104 | 
105 | [^group_forming]: See [Principles of Group Dynamics](https://open.lib.umn.edu/principlesmanagement/chapter/13-3-group-dynamics/) 
106 | 
107 | ### Roles and responsibilities
108 | 
109 | - Leader/Facilitator's duty:
110 |     - Goal is to keep things running smoothly, focused on the task, and on track for time
111 |     - Keep everyone on topic and on task; stay aware of the time
112 |     - (Try to) mediate or resolve any conflicts (there will *always* be some type of conflict; how it's dealt with is what matters)
113 |     - (Try to) encourage everyone to participate and allow everyone a chance at talking
114 | - Recorder's duty:
115 |     - Goal is to write/type down main or important points raised or discussed when team is meeting
116 |     - Keep notes and files organized and orderly
117 | - Organizer's duty:
118 |     - Arrange for next meeting time and location
119 |     - Send reminders to members a day or two before meeting
120 |     - Make and email a simple agenda of tasks to do or to discuss
121 | 
122 | ### Code of conduct
123 | 
124 | - Vital to establishing boundaries and expectations of being a team member
125 |     - How do you want each member to treat each other?
126 |     - How do you deal with conflict?
127 |     - What is acceptable and unacceptable behaviour?
128 | - These are outlined in the code of conduct
129 | - Mostly it's common sense (be kind and respectful)
130 |     - But it's important that you as a team write out what everyone wants and agrees to
131 | 
132 | #### Example Code of Conduct:
133 | - [Contributor Covenant](https://www.contributor-covenant.org/version/1/4/code-of-conduct)
134 | - [UofT Coders Code of Conduct](https://github.com/UofTCoders/studyGroup/blob/gh-pages/codeOfConduct.md)
135 | 
136 | ## Exercise
137 | 
138 | ### As a group, complete these tasks
139 | 
140 | <!-- show assigned groups -->
141 | 
142 | - Get into your groups
143 | - Introduce each other:
144 |     - Find out everyone's name and year of study
145 |     - Find out other things about each other (e.g. any plans for next year, etc)
146 | - Create a one or two word "team name". We'll use this to create a shared folder for everyone to use.
147 | - Assign roles to each person (these roles will be rotated in every group setting):
148 |     - You need: facilitator, recorder, organizer
149 |     - Discuss how and when roles will be rotated *(record it)*
150 | - Discuss and brainstorm some codes of conduct you want your team to follow *(record it)*
151 | - Take a few minutes, think about your own skills and what you feel you are most competent in
152 |     - Then, share the top one or two of those skills *(record those skills)*
153 | - Discuss how responsibilities of each member will be decided on *(record it)*
154 | 
155 | ## Starting the projects
156 | 
157 | ### Datasets available for use
158 | 
159 | - [Continuous Plankton Recorder Dataset](https://www.gbif.org/dataset/67c54f85-7910-4cbf-8de4-6f0b136a0e34)
160 | 
161 |     Data on northern hemisphere plankton species, latitude, longitude, date.
162 |     Going back to 1946. 935 Mb size, almost 2.75 million rows of data.
163 | 
164 | - [Insecta of Costa Rica](https://www.gbif.org/dataset/3e9817c1-8302-4955-87e3-a408db0ea379)
165 | 
166 |     Data on insects species in Costa Rica, latitude, longitude, elevation, date.
167 |     1.4 Gb size, almost 3.25 million rows of data.
168 | 
169 | - [Marine predator and prey body sizes](http://www.esapubs.org/archive/ecol/E089/051/default.htm#data)
170 | 
171 |     Data from 27 different global locations on species, body measurements,
172 |     latitude, longitude, date. 21 Mb, almost 35,000 rows of data (in long
173 |     format).
174 | 
175 | - [Mammalian life history](http://www.esapubs.org/archive/ecol/E084/093/default.htm)
176 | 
177 |     Data about general mammalian life history with species, body size, lifespan,
178 |     litter size, and other reproductive variables. 150 Kb size, 1440 rows of
179 |     data.
180 | 
181 | - [North American Bird Breeding Survey](https://www.pwrc.usgs.gov/BBS/?CFID=36951359&CFTOKEN=5135bf261f2f1478-471B9FA3-C648-BE26-7C2176ADADE30428)
182 | 
183 |     Data about number of birds at multiple stops in North America. Many datasets
184 |     of varying rows that need to be linked together. ~`r (50*114)/1000` Gb size
185 |     (can be shortened) across >50 files.
186 |     
187 | - [National Ecological Observatory Network](http://data.neonscience.org/static/browse.html)
188 | 
189 |     A repository of many large-scale ecological datasets from a variety of systems collected over multiple years at approximately 50 sites in the USA. Feel free to browse the datasets for ones of interest to you, but I have highlighted a few below. 
190 |     
191 |     + [Ground beetles in pitfall traps](http://data.neonscience.org/data-product-view?dpCode=DP1.10022.001): ~26 Mb .csv file with ID of ground beetle species from 40 traps arrayed in each of ~50 NEON sites since 2013. 
192 |     + [Macroinvertebrate collection](http://data.neonscience.org/data-product-view?dpCode=DP1.20120.001): ~9 Mb .csv file with IDs of benthic macroinvertebrates from lakes, non-wadeable streams, and wadeable streams from sites across the NEON network dating back to 2014. 
193 |     + [Plant presence/absence and percent cover](http://data.neonscience.org/data-product-view?dpCode=DP1.10058.001): Presence/absence and percent cover of species in 10m^2^, 100m^2^, and 400m^2^ quadrats from multiple plots in each of 50 NEON sites dating back to 2013. This is the dataset we worked with in lecture 9. 
194 |     
195 | - [US EPA National Aquatic Resource Surveys](https://www.epa.gov/national-aquatic-resource-surveys/data-national-aquatic-resource-surveys)
196 | 
197 |     Numerous datasets from annual surveys of aquatic habitats conducted by the US Environmental Protection Agency. Includes data ranging from the physical environment (e.g. water quality, chemical properties, landscape variables, etc.) to the biotic environment (e.g. phytoplankton concentrations, benthic macroinvertebrates, etc.). Be sure to download the metadata as well, which is on the same page linked above. 
198 |     
199 | - [International Council for the Exploration of the Sea](http://www.ices.dk/marine-data/data-portals/Pages/default.aspx)
200 | 
201 |     Many large datasets from oceanic fish surveys. Includes oceanographic data in addition to biodiversity datasets, fish stomach content data, physical environmental condition and contaminants, predation, etc.
202 | 
203 | - [Alberta Ecological Information System](https://open.alberta.ca/opendata/ecological-information-systems-data)
204 | 
205 |     Data on vegetation and soil plots in Alberta. Over 26 000 sites are available.
206 |     
207 | - [Beaver abundance and distribution](https://open.canada.ca/data/en/dataset/b9f21e91-d34d-4730-8195-edf051121e9d)
208 | 
209 |     Aerial and ground surveys of beaver abundance, feeding, and lodge distribution in Elk Island National Park           starting in 1959.
210 |     
211 | - Benthic invertebrate abundance in [Ivvavik](https://open.canada.ca/data/en/dataset/3bad5ce0-0b16-43ee-be32-78cc2f64843f), [Tuktut](https://open.canada.ca/data/en/dataset/9046af59-81c4-4759-8979-f6185af8387d), and [Aulavik](https://open.canada.ca/data/en/dataset/2770949b-043c-4073-bc6c-b38b03a5f528)
212 | 
213 |     Counts of benthic invertebrate taxa from 3 river corridors from 2009 to 2015. Recommended to use all     3 datasets. 
214 |     
215 | - [Red-backed salamander abundance](https://open.canada.ca/data/en/dataset/3571474b-8d75-491d-816e-f84677b81a7c)
216 | 
217 |     Abundance of red-backed salamanders from 4 sites in the Bruce Peninsula from 2004 to 2017. 
218 | 
219 | In addition to the datasets shown above, we encourage students to bring their own datasets for use in their group projects.
220 | 
221 | ### Example hypotheses with figures
222 | 
223 | > Simple hypothesis: Bigger predators eat bigger prey. 
224 | 
225 | Easy to identify independent and dependent variables and visualize with plots
226 | and test with linear regression.
227 | 
228 | > Advanced hypothesis: Which characteristics determine prey size among marine
229 | predators?
230 | 
231 | Lots of possible groupings available. Start with visualizing some good
232 | candidate variables such as predator weight and length. Realize that the
233 | relationship is more complex than this and start dividing the data set
234 | according to species, water temperature, weather, etc. Use a combination of
235 | visualization and regression analyses. Fit models to the data to determine
236 | which types of regressions are appropriate.
237 | 
238 | ```{r, message=FALSE}
239 | library(tidyverse)
240 | ```
241 | 
242 | ```{r, eval=FALSE}
243 | # Read the data from the online archive (column names are fixed in a later chunk)
244 | pred_prey <- read_tsv('http://www.esapubs.org/archive/ecol/E089/051/Predator_and_prey_body_sizes_in_marine_food_webs_vsn4.txt')
245 | ```
246 | 
247 | ```{r, echo=FALSE}
248 | # download.file(
249 | #     'http://www.esapubs.org/archive/ecol/E089/051/Predator_and_prey_body_sizes_in_marine_food_webs_vsn4.txt',
250 | #     "data/predator_prey_body_size.txt"
251 | #     )
252 | # pred_prey <- read_tsv("data/predator_prey_body_size.txt")
253 | # pred_prey <- sample_n(pred_prey, 1000)
254 | # write_tsv(pred_prey, "data/predator_prey_body_size.txt")
255 | pred_prey <- read_tsv("data/predator_prey_body_size.txt")
256 | ```
257 | 
258 | ```{r}
259 | colnames(pred_prey) <- tolower(gsub(' ', '_', colnames(pred_prey))) # lower-case names, spaces -> underscores
260 | 
261 | # Only adults and some columns of interest
262 | adult_pred_food <- pred_prey %>% 
263 |     mutate(predator_lifestage = tolower(predator_lifestage)) %>% # Mix of upper and lower case...
264 |     filter(predator_lifestage == 'adult') %>% 
265 |     select(predator_common_name, predator_length, prey_mass, predator_mass)
266 | 
267 | # Summarize data to plot: one row of means per predator common name
268 | plot_data <- adult_pred_food %>% 
269 |     group_by(predator_common_name) %>% 
270 |     summarize(
271 |         mean_pred_mass = mean(predator_mass), # NOTE(review): no na.rm, so any NA in a group gives an NA mean -- confirm intended
272 |         mean_prey_mass = mean(prey_mass),
273 |         mean_pred_length = mean(predator_length)) %>% 
274 |     filter(mean_pred_mass < 8000) # "outliers": keeps only predators whose mean mass is below 8000 (NA means are dropped too)
275 | 
276 | # Plot pred weight vs prey weight
277 | ggplot(plot_data, aes(x = mean_pred_mass, y = mean_prey_mass)) +
278 |     geom_point() +
279 |     geom_smooth() # no method given, so ggplot2 picks its default smoother
280 | 
281 | # Plot pred length vs prey weight
282 | ggplot(plot_data, aes(x = mean_pred_length, y = mean_prey_mass)) +
283 |     geom_point() +
284 |     geom_smooth()
285 | ```
286 | 
287 | 
288 | ### As a group, complete these tasks
289 | 
290 | In your group, rotate roles (need a facilitator and recorder at minimum). Before
291 | the end of class, finish these:
292 | 
293 | - Choose two possible datasets (or more) your team would like to work from (can also be
294 | datasets not presented in class) *(record them)*
295 | - Look into the data documentation, see what type of variables there are, what
296 | published articles are available.
297 | - Then, brainstorm as many research questions as possible for those two datasets
298 | *(record them)*
299 |     - Goal is to write down as many ideas as possible
300 |     - No question is off limits and no question is too simple or too complex! 
301 |     - (You might combine or split questions later, just get whatever down!)
302 |     - Just write whatever comes to mind, whether it seems like a good idea or not. Just start writing!
303 | 
304 | Make sure to *record* everything! Many of these tasks will also be part of
305 | your mid-project update!
306 | 
307 | ### Set up GitHub account
308 | 
309 | Before we finish the class, we need to prepare a bit for next class. We need to
310 | create a [GitHub](https://github.com) account!
311 | 


--------------------------------------------------------------------------------
/mid-project-update.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'EEB313 Mid-project update (12 marks)'
  3 | output:
  4 |     html_document:
  5 |         toc: false
  6 | ---
  7 | 
  8 | *To submit this assignment, upload the full document on Quercus, including the
  9 | original questions, your code (if applicable), and the output. Submit your
 10 | assignment as a knitted `.pdf` or `.html` file.*
 11 | 
 12 | Prior to beginning this assignment, we suggest you take a look through [this
 13 | link](https://swcarpentry.github.io/r-novice-gapminder/02-project-intro/index.html)
 14 | on how to manage projects in RStudio using Rprojects. While not a component of
 15 | this assignment, the tutorial may prove useful as you move forward with your
 16 | group projects. 
 17 | 
 18 | These tasks seem like a lot of work. However, we will be doing these tasks
 19 | already during class and during project work. The purpose of the mid-project
 20 | update is for you to set up your repo and get something started on your final
 21 | project. While most of these tasks are group based, please fill out each as an
 22 | individual. In particular, the first question should be of *your* forked
 23 | repository, not one of your team members. The remainder of the questions each
 24 | involve a single pull request; make sure everyone in your group does at least
 25 | one. Each team member should submit their *own assignment*.
 26 | 
 27 | 1.  A new repository has been created for your project on the
 28 |     `EEB313-2019` GitHub organization. Fork it to your own account and paste
 29 |     the URL of your forked version below. (0.25 marks)
 30 | 
 31 |     -   URL of *your* fork:
 32 | 
 33 | 2.  On the main repo (you will be working on this from here on out, unless you
 34 |     are doing the fork-based workflow) -- have one of your group members create
 35 |     a new branch from `master` called `add-conduct`. Create a `CONDUCT.md` file in
 36 |     this branch. Write down what you as a team decided on for acceptable
 37 |     conduct/behaviour of team members to each other (e.g. "Must be considerate and
 38 |     respectful"). You may reuse existing Codes of Conduct, such as the UofT Coders
 39 |     Code of Conduct or the Contributor Covenant, but make sure to both a) credit
 40 |     whichever you use and b) still expand upon it with project-specific items (i.e.
 41 |     meeting frequency, PR approval policy, role rotations). The designated group
 42 |     member needs to create a pull request of this new file to the `master` branch.
 43 |     Copy and paste the link to the pull request below. Every team member must
 44 |     approve this pull request via whatever means you as a team decide on ("all
 45 |     members must 'thumbs up' the PR message", or "all members must make a new
 46 |     comment saying 'good to merge'"). Make sure the expectations and behaviours are
 47 |     explicit and clear. (1.0 marks)
 48 | 
 49 |     -   URL of team member's pull request for `CONDUCT.md`:
 50 | 
 51 | 3.  A `README.md` file should already exist in your project. Once the previous
 52 |     PR has been merged into `master`, complete the following tasks in the
 53 |     README, discussed and written up *as a team*. Decide who on your team will
 54 |     create the branch, make these edits, and create a pull request of this file to
 55 |     the main project repository. This person should *not* be the same person who
 56 |     did task 2. Every team member must approve this pull request. (1.75 marks)
 57 | 
 58 |     -   Create the following headers (make sure to use Markdown headers
 59 |         `#`): "Introduction to the project", "Description of the data",
 60 |         "Team description". 
 61 | 
 62 |     -   Fill out the "Introduction to the project" section, answering
 63 |         these questions in a paragraph form (don't include these
 64 |         questions in the section). What is your project about? What is
 65 |         the goal? Why are you doing it?
 66 | 
 67 |     -   Fill out the "Description of the data" section, and briefly
 68 |         write down what the data is about, what are the variables you
 69 |         think you'll use, how the data was collected, and how it will
 70 |         answer your research questions. Include a reference of the
 71 |         dataset if one is available, for instance:
 72 | 
 73 |             Forstmann BU, et al. (2014) Data from: Multi-modal ultra-high resolution
 74 |             structural 7-Tesla MRI data repository. Dryad Digital Repository.
 75 |             (https://dx.doi.org/10.5061/dryad.fb41s)
 76 | 
 77 |     -   Fill out the "Team description" section by writing down a brief
 78 |         biography of each member (including what their skills are and
 79 |         what their approximate responsibilities -- which can change later --
 80 |         are for the project) as well as how team roles will be rotated.
 81 | 
 82 |     -   URL of team member's pull request for `README.md`:
 83 | 
 84 | 4.  Once the previous PR has been merged into `master`, have a different group member create a file called
 85 |     `doc/objectives.md` in a new branch. Note that this will simultaneously create a new folder
 86 |     called `doc` containing a file called `objectives.md` -- this is [how folders are created on GitHub repos](https://github.com/KirstieJane/STEMMRoleModels/wiki/Creating-new-folders-in-GitHub-repository-via-the-browser).
 87 |     In `objectives.md`, create one header (`#` markdown header) called "Study
 88 |     objectives". Create a list (`-` markdown syntax) of each of your research
 89 |     questions that you *as a team* thought of. It doesn't matter what the questions
 90 |     are, how simple, complicated or obvious they are. Just have something written
 91 |     down, and make sure these are _explicitly_ framed as questions or
 92 |     hypotheses. In this file, also list explicit predictions wherever possible;
 93 |     i.e. what you think a given relationship might look like. Bear in mind that
 94 |     these research questions *will* change as you do your analyses. The point is
 95 |     for you to get started thinking about ideas. Decide on another (new) team
 96 |     member to create this file, record it, and make a pull request of this new
 97 |     file. Every team member must approve this pull request. (3 marks)
 98 | 
 99 |     -   URL of team member's pull request for `objectives.md`:
100 | 
101 | 5.  Once the previous PR has been merged, start up another branch and create a
102 |     file in the `doc/` folder called `analysis-plan.md`. Create three headers
103 |     (`#` markdown headers): "Possible analyses", "Possible results tables", and
104 |     "Possible results figures". As a team, discuss and record some possible
105 |     analyses on the data that you could do to answer the research questions.
106 |     Discuss and record possible ways to present your results (possible tables, some
107 |     visualizations). Moreover, discuss how you anticipate getting from the raw data
108 |     to whatever summary data you will use to generate a given plot (i.e. explain a
109 |     data cleaning/transformation plan). Moreover, discuss what kinds of statistical
110 |     approaches you anticipate employing. Once again, these do not have to be final,
111 |     but you need to show that your team has thought about how to approach this.
112 |     Decide on another (new) team member to create this file in a new branch, record
113 |     it, and make a pull request of this new file. Every team member must approve
114 |     this pull request. *Note:* this analysis and presentation plan does **not**
115 |     have to be accurate, nor do you have to use this later on.  It could and will
116 |     (very likely) change. The point is to get you as a group thinking about how you
117 |     will answer the research questions. (3 marks)
118 | 
119 |     -   URL of team member's pull request for `analysis-plan.md`:
120 | 
121 | 6.  Finally, once the previous PR has been merged, have another group member
122 |     create a file called `plots/mock/README.md` in a new branch. This will also
123 |     create a folder called `plots` and a subfolder called `mock` within it. Add a
124 |     few mock figures into this folder showing your predictions. These do not have
125 |     to be 'publication-ready' plots, and can be made in any software of your
126 |     choosing (R, Excel, PowerPoint, etc) with or without simulated data points; the
127 |     important thing is that the predictions are clear and that your team can show
128 |     you have been thinking about how to present your data. In `README.md`, list
129 |     details about the mock figures in pseudo-figure caption format. Note that image
130 |     files (png, pdf, etc) can also be uploaded to GitHub via dragging and dropping
131 |     -- but make sure you are in a branch before you do this. The image files should
132 |     be part of the same PR as `README.md`. (3 marks) 
133 | 
134 |     - URL of team member's pull request for `plots/mock/README.md` and image files:
135 | 
136 | 
137 | 


--------------------------------------------------------------------------------
/paper.bib:
--------------------------------------------------------------------------------
  1 | @inproceedings{haaranen_programming_2017,
  2 | 	address = {New York, NY, USA},
  3 | 	series = {{ITiCSE} '17},
  4 | 	title = {Programming as a performance: Live-streaming and its implications for {Computer} {Science} education},
  5 | 	isbn = {978-1-4503-4704-4},
  6 | 	shorttitle = {Programming as a performance},
  7 | 	url = {https://doi.acm.org/10.1145/3059009.3059035},
  8 | 	doi = {10.1145/3059009.3059035},
  9 | 	abstract = {This article discusses an emerging phenomenon of streaming programming to a live audience who in turn can interact with the streamer. In essence, this means broadcasting the programming environment and typically a web camera feed of the streamer to viewers. Streaming programming bears many similarities with live-streaming playing of video games, which has become extremely popular among gamers over the recent years. In fact, streaming programming often use the same web services as streaming gaming, and the audiences overlap. In this article, we describe this novel approach to programming and situate it in the broader context of computer science education. To gain a deeper insight into this phenomena, we analyzed viewer discussions during a particular programming stream broadcasted during a game programming competition. Finally, we discuss the benefits this approach could offer to computer science education.},
 10 | 	urldate = {2018-12-21},
 11 | 	booktitle = {Proceedings of the 2017 {ACM} {Conference} on {Innovation} and {Technology} in {Computer} {Science} {Education}},
 12 | 	publisher = {ACM},
 13 | 	author = {Haaranen, Lassi},
 14 | 	year = {2017},
 15 | 	keywords = {computer science education, game-based learning, online communities, streaming},
 16 | 	pages = {353--358}
 17 | }
 18 | 
 19 | @inproceedings{rubin_effectiveness_2013,
 20 | 	address = {New York, NY, USA},
 21 | 	series = {{SIGCSE} '13},
 22 | 	title = {The effectiveness of live-coding to teach introductory programming},
 23 | 	isbn = {978-1-4503-1868-6},
 24 | 	url = {https://doi.acm.org/10.1145/2445196.2445388},
 25 | 	doi = {10.1145/2445196.2445388},
 26 | 	abstract = {Live-coding is defined as "the process of designing and implementing a [coding] project in front of class during lecture period". In this article we present our research design and results regarding the effectiveness of live-coding to teach introductory programming. The research design includes two experimental groups spread across four sections of an introductory C++ course at Colorado School of Mines. In the control group, students were taught using static code, meaning that instructors never typed, but instead viewed, compiled, and executed code examples. In the experimental or "live-coding" group, instructors started each lecture with a blank screen, and taught code examples by systematically typing, compiling, and testing code to solve example problems. To assess the effectiveness of live-coding, we administered four surveys and analyzed final grades. Two of the surveys were given at the beginning of the course, and were used to measure baseline programming knowledge and student learning preferences (i.e., VARK). The other two surveys, given at the end of the course, were designed to measure the amount of programming knowledge obtained as well as preferences towards live coding. Lastly, final grades were analyzed in terms of its subcomponents: the assignments, exams, final project, and overall grade. Based on our results, we conclude that teaching via live-coding is as good as if not better than using static code examples.},
 27 | 	urldate = {2018-12-21},
 28 | 	booktitle = {Proceeding of the 44th {ACM} {Technical} {Symposium} on {Computer} {Science} {Education}},
 29 | 	publisher = {ACM},
 30 | 	author = {Rubin, Marc J.},
 31 | 	year = {2013},
 32 | 	keywords = {introductory, live-coding, pedagogy, programming},
 33 | 	pages = {651--656}
 34 | }
 35 | 
 36 | @book{wilson_teaching_2018,
 37 | 	address = {Leipzig},
 38 | 	title = {Teaching tech together: how to design and deliver lessons that work and build a teaching community around them},
 39 | 	isbn = {978-0-9881137-0-1},
 40 | 	shorttitle = {Teaching tech together},
 41 | 	language = {en},
 42 | 	publisher = {Amazon Distribution GmbH},
 43 | 	author = {Wilson, Greg},
 44 | 	year = {2018},
 45 | 	url = {http://teachtogether.tech/}
 46 | }
 47 | 
 48 | @article{strobel_when_2009,
 49 | 	title = {When is {PBL} more effective? {A} meta-synthesis of meta-analyses comparing {PBL} to conventional classrooms},
 50 | 	volume = {3},
 51 | 	issn = {1541-5015},
 52 | 	shorttitle = {When is {PBL} more effective?},
 53 | 	url = {https://docs.lib.purdue.edu/ijpbl/vol3/iss1/4},
 54 | 	doi = {10.7771/1541-5015.1046},
 55 | 	number = {1},
 56 | 	journal = {Interdisciplinary Journal of Problem-Based Learning},
 57 | 	author = {Strobel, Johannes and Barneveld, Angela van},
 58 | 	month = mar,
 59 | 	year = {2009}
 60 | }
 61 | 
 62 | @article{markham_project_2011,
 63 | 	title = {Project-based learning: A bridge just far enough},
 64 | 	volume = {39},
 65 | 	copyright = {Copyright E L Kurdyla Publishing LLC Dec 2011},
 66 | 	issn = {14811782},
 67 | 	url = {https://search.proquest.com/docview/915254354/abstract/707DEDB5F1E145E5PQ/1},
 68 | 	abstract = {[...] well-executed PBL emphasizes a carefully planned assessment that incorporates formative feedback, detailed rubrics, and multiple evaluations of content and skills. [...] PBL can be defined as an extended learning process that uses inquiry and challenge to stimulate the growth and mastery of skills. [...] PBL refocuses education on the student, not the curriculum-a shift mandated by the global world, which rewards intangible assets such as drive, passion, creativity, empathy, and resiliency.},
 69 | 	language = {English},
 70 | 	number = {2},
 71 | 	urldate = {2018-12-21},
 72 | 	journal = {Teacher Librarian; Bowie},
 73 | 	author = {Markham, Thom},
 74 | 	month = dec,
 75 | 	year = {2011},
 76 | 	keywords = {Advantages, Core curriculum, Design, Education, Methods, Teaching, Young adults},
 77 | 	pages = {38--42}
 78 | }
 79 | 
 80 | @book{sawyer_cambridge_2006,
 81 | 	address = {Cambridge, NY, USA},
 82 | 	title = {The {Cambridge} handbook of the learning sciences},
 83 | 	isbn = {978-0-521-84554-0 978-0-521-60777-3},
 84 | 	language = {en},
 85 | 	publisher = {Cambridge University Press},
 86 | 	editor = {Sawyer, R. Keith},
 87 | 	year = {2006},
 88 | 	doi = {10.1192/bjp.bp.106.029678},
 89 | 	note = {OCLC: ocm62728545},
 90 | 	keywords = {Cognitive learning, Learning, Learning, Psychology of, Social aspects}
 91 | }
 92 | 
 93 | @article{wilson-software-carpentry,
 94 |     author =  {Greg Wilson},
 95 |     title =   {{Software} {Carpentry}: Getting scientists to write better code by making them more productive},
 96 |     journal = {Computing in Science \& Engineering},
 97 |     month =   {November--December},
 98 |     year =    {2006},
 99 |     doi = {10.1109/MCSE.2006.122},
100 |     note =    {Summarizes the what and why of Version 3 of the course.}
101 | }
102 | 
103 | @Manual{tidyverse,
104 |   title = {tidyverse: Easily install and load the 'Tidyverse'},
105 |   author = {Hadley Wickham},
106 |   year = {2017},
107 |   note = {R package version 1.2.1},
108 |   url = {https://CRAN.R-project.org/package=tidyverse},
109 | }
110 | 
111 | @Manual{R,
112 |   title = {R: A language and environment for statistical computing},
113 |   author = {{R Core Team}},
114 |   organization = {R Foundation for Statistical Computing},
115 |   address = {Vienna, Austria},
116 |   year = {2018},
117 |   url = {https://www.R-project.org/},
118 | }
119 | 
120 | @Misc{carpentry,
121 |   author = {Achaz {von Hardenberg} and Adam Obeng and Aleksandra Pawlik and Alex Pletzer and Alexey Shiklomanov and Anne Fouilloux and April Wright and Auriel Fournier and Ben Marwick and C. Titus Brown and Carolina Johnson and Carolyn Voter and Catherine Hulshof and Christie Bahlai and Clara Shaw and Daijiang Li and Daina Bouquin and Daniel Stubbs and Danielle Quinn and Darya Vanichkina and Dmytro Fishman and Earle Wilson and Edmund Hart and Eilis Hannon and Elena Sügis and Eli Strauss and Emilia Gan and Erin Becker and Ethan White and Francisco Rodriguez-Sanchez and Francois Michonneau and Fred Boehm and {GMoncrieff} and Hao Ye and Harriet Dashnow and Hilmar Lapp and {JSurman} and Jaime Ashander and Jarrett Byrnes and Jeffrey W Hollister and Jieming Chen and Jillian Dunic and {Jon} and Jonathan Keane and Joseph Stachelek and Josh Herr and K. A. S. Mislan and Kara Woo and Karen Cranston and Kari L. Jordan and Karthik Ram and Kate Hertweck and Kathe Todd-Brown and Katie Lotterhos and Kayla Peck and Kenan Direk and Kevin Hall and Kristian Tylén and Kyriakos Chatzidimitriou and Lachlan Deer and Laurent Gatto and Leah Wasser and Leszek Tarkowski and Lisa Breckels and M. Foos and Marco Chiapello and Mark Robinson and Markus J. Akenbrand and Mateusz Kuzak and Matthias Grenié and Matthias Grenié and Maëlle Salmon and Meghan Duffy and Michael Koontz and Myfanwy Johnston and Nicholas Marino and Nick Carchedi and Olivia Burge and Philip Lijnzaad and Philip Lijnzaad and Ryan Peek and Sarah Supp and Shawn Taylor and Stephanie Labou and Steve Pederson and Tara Webster and Taylor Reiter and Thomas Sandmann and Tracy Teal and Will Furnass and Will Pearse and Ye Li and Zena Lapp and {ab604} and {ashander} and {cengel} and Brian Seok and {sfn_brt} and {suparee}},
122 |   title = {{Data} {Carpentry}: {R} for data analysis and visualization of ecological data},
123 |   editor = {Francois Michonneau and Auriel Fournier},
124 |   month = {November},
125 |   year = {2018},
126 |   url = {https://datacarpentry.org/R-ecology-lesson/},
127 |   doi = {10.5281/zenodo.569338},
128 | }
129 | 
130 | @Manual{dplyr,
131 |   title = {dplyr: A grammar of data manipulation},
132 |   author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller},
133 |   year = {2018},
134 |   note = {R package version 0.7.8},
135 |   url = {https://CRAN.R-project.org/package=dplyr},
136 | }
137 | 
138 | @Article{lme4,
139 |   title = {Fitting linear mixed-effects models using {lme4}},
140 |   author = {Douglas Bates and Martin M{\"a}chler and Ben Bolker and Steve Walker},
141 |   journal = {Journal of Statistical Software},
142 |   year = {2015},
143 |   volume = {67},
144 |   number = {1},
145 |   pages = {1--48},
146 |   doi = {10.18637/jss.v067.i01},
147 | }
148 | 
149 | @Article{lmerTest,
150 |   title = {{lmerTest} package: Tests in linear mixed effects models},
151 |   author = {Alexandra Kuznetsova and Per B. Brockhoff and Rune H. B. Christensen},
152 |   journal = {Journal of Statistical Software},
153 |   year = {2017},
154 |   volume = {82},
155 |   number = {13},
156 |   pages = {1--26},
157 |   doi = {10.18637/jss.v082.i13},
158 | }
159 | 
160 | @Manual{rmarkdown,
161 |   title = {rmarkdown: Dynamic documents for {R}},
162 |   author = {JJ Allaire and Yihui Xie and Jonathan McPherson and Javier Luraschi and Kevin Ushey and Aron Atkins and Hadley Wickham and Joe Cheng and Winston Chang and Richard Iannone},
163 |   year = {2018},
164 |   note = {R package version 1.11},
165 |   url = {https://rmarkdown.rstudio.com},
166 | }
167 | 
168 | @Article{mice,
169 |   title = {{mice}: Multivariate imputation by chained equations in {R}},
170 |   author = {Stef {van Buuren} and Karin Groothuis-Oudshoorn},
171 |   journal = {Journal of Statistical Software},
172 |   year = {2011},
173 |   volume = {45},
174 |   number = {3},
175 |   pages = {1--67},
176 |   doi = {10.18637/jss.v045.i03},
177 |   url = {https://www.jstatsoft.org/v45/i03/},
178 | }
179 | 
180 | @Article{multcomp,
181 |   title = {Simultaneous inference in general parametric models},
182 |   author = {Torsten Hothorn and Frank Bretz and Peter Westfall},
183 |   journal = {Biometrical Journal},
184 |   year = {2008},
185 |   volume = {50},
186 |   number = {3},
187 |   pages = {346--363},
188 |   doi = {10.1002/bimj.200810425}
189 | }
190 | 
191 | @Manual{psych,
192 |   title = {{psych}: Procedures for psychological, psychometric, and personality research},
193 |   author = {William Revelle},
194 |   organization = {Northwestern University},
195 |   address = {Evanston, IL, USA},
196 |   year = {2018},
197 |   note = {R package version 1.8.10},
198 |   url = {https://CRAN.R-project.org/package=psych},
199 | }
200 | 
201 | @Book{car,
202 |   title = {An {R} companion to applied regression},
203 |   edition = 2,
204 |   author = {John Fox and Sanford Weisberg},
205 |   year = {2011},
206 |   publisher = {Sage},
207 |   address = {Thousand Oaks, CA, USA},
208 |   url = {https://socialsciences.mcmaster.ca/jfox/Books/Companion-2E},
209 | }
210 | 
211 | @Article{reshape2,
212 |   title = {Reshaping data with the {reshape} package},
213 |   author = {Hadley Wickham},
214 |   journal = {Journal of Statistical Software},
215 |   year = {2007},
216 |   volume = {21},
217 |   number = {12},
218 |   pages = {1--20},
219 |   url = {https://www.jstatsoft.org/v21/i12/},
220 |   doi = {10.18637/jss.v021.i12},
221 | }
222 | 
223 | @Article{plyr,
224 |   title = {The split-apply-combine strategy for data analysis},
225 |   author = {Hadley Wickham},
226 |   journal = {Journal of Statistical Software},
227 |   year = {2011},
228 |   volume = {40},
229 |   number = {1},
230 |   pages = {1--29},
231 |   url = {https://www.jstatsoft.org/v40/i01/},
232 |   doi = {10.18637/jss.v040.i01}
233 | }
234 | 
235 | @Manual{MuMIn,
236 |   title = {{MuMIn}: Multi-model inference},
237 |   author = {Kamil Bartoń},
238 |   year = {2018},
239 |   note = {R package version 1.42.1},
240 |   url = {https://CRAN.R-project.org/package=MuMIn},
241 | }
242 | 
243 | @Article{deSolve,
244 |   title = {Solving differential equations in {R}: package {deSolve}},
245 |   author = {Karline Soetaert and Thomas Petzoldt and R. Woodrow Setzer},
246 |   journal = {Journal of Statistical Software},
247 |   volume = {33},
248 |   number = {9},
249 |   pages = {1--25},
250 |   year = {2010},
251 |   coden = {JSSOBK},
252 |   issn = {1548-7660},
253 |   url = {https://www.jstatsoft.org/v33/i09},
254 |   doi = {10.18637/jss.v033.i09},
255 |   keywords = {ordinary differential equations, partial differential
256 |   equations, differential algebraic equations, initial value problems,
257 |   R, FORTRAN, C},
258 | }
259 | 
260 | @Manual{knitr,
261 |   title = {{knitr}: A general-purpose package for dynamic report generation in {R}},
262 |   author = {Yihui Xie},
263 |   year = {2018},
264 |   note = {R package version 1.21},
265 |   url = {https://yihui.name/knitr/},
266 | }
267 | 
268 | @Manual{EcoSimR,
269 |   title = {{EcoSimR}: Null model analysis for ecological data},
270 |   author = {Nicholas J. Gotelli and Edmund M. Hart and Aaron M. Ellison},
271 |   year = {2015},
272 |   note = {R package version 0.1.0},
273 |   url = {https://github.com/gotellilab/EcoSimR},
274 |   doi = {10.5281/zenodo.16522}
275 | }
276 | 


--------------------------------------------------------------------------------
/paper.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "A graduate student-led participatory live-coding quantitative methods course in R: Experiences on initiating, developing, and teaching"
  3 | tags:
  4 | - R
  5 | - ecology
  6 | - statistics
  7 | - biology
  8 | - undergraduate
  9 | authors:
 10 | - name: Luke W. Johnston
 11 |   orcid: 0000-0003-4169-2616
 12 |   affiliation: "2,3"
 13 | - name: Madeleine Bonsma-Fisher
 14 |   orcid: 0000-0002-5813-4664
 15 |   affiliation: 1
 16 | - name: Joel Ostblom
 17 |   orcid: 0000-0003-0051-3239
 18 |   affiliation: 4
 19 | - name: Ahmed R. Hasan
 20 |   orcid: 0000-0003-0002-8399
 21 |   affiliation: 6
 22 | - name: James S. Santangelo
 23 |   orcid: 0000-0002-5921-2548
 24 |   affiliation: 7
 25 | - name: Lindsay Coome
 26 |   orcid: 0000-0001-8126-3571
 27 |   affiliation: 5
 28 | - name: Lina Tran
 29 |   orcid: 0000-0003-3504-4524
 30 |   affiliation: 8
 31 | - name: Elliott Sales de Andrade
 32 |   orcid: 0000-0001-7310-8942
 33 |   affiliation: 1
 34 | - name: Sara Mahallati
 35 |   orcid: 0000-0002-6765-0898
 36 |   affiliation: 4
 37 | affiliations:
 38 | - name: Department of Physics, University of Toronto
 39 |   index: 1
 40 | - name: Department of Nutritional Sciences, University of Toronto
 41 |   index: 2
 42 | - name: Department of Public Health, Aarhus University
 43 |   index: 3
 44 | - name: Institute of Biomaterials and Biomedical Engineering, University of Toronto
 45 |   index: 4
 46 | - name: Department of Psychology, University of Toronto
 47 |   index: 5
 48 | - name: Department of Cell and Systems Biology, University of Toronto
 49 |   index: 6
 50 | - name: Department of Ecology and Evolutionary Biology, University of Toronto
 51 |   index: 7
 52 | - name: Department of Physiology, University of Toronto
 53 |   index: 8
 54 | date: 15 January 2019
 55 | bibliography: paper.bib
 56 | ---
 57 | 
 58 | # Introduction
 59 | 
 60 | We present an open source learning module suitable for a semester-long course
 61 | and designed to leverage participatory live-coding techniques to teach both
 62 | statistical and programming skills to primarily upper-year undergraduate biology
 63 | students. Our learning module has three self-contained submodules spanning
 64 | sixteen lessons: 1) Programming in R, basic data wrangling, and visualizations;
 65 | 2) Exploratory data analysis, statistics, and modelling; and 3) Collaborative
 66 | and reproducible science. Our learning module includes eight assignments
 67 | distributed throughout the term to assess students' learning and understanding.
 68 | The material is made available as R Markdown documents and designed to be taught
 69 | using R Notebooks. Students are not expected to have any prior knowledge of the
 70 | R language. Our material is licensed under CC-BY 4.0 while the code is under the
 71 | MIT License. Our course is a response to the growing need for programmatic
 72 | training emphasizing sound data analysis practices among researchers. We believe
 73 | the included lesson topics, open accessibility, and modularity of our course
 74 | make it an ideal resource for instructors.
 75 | 
 76 | # Statement of Need
 77 | 
 78 | In traditional undergraduate biology education, students learn statistical
 79 | skills and biological concepts separately, without any practical application
 80 | through coding. Designed primarily for upper-year undergraduate students, this
 81 | learning module emphasizes gaining skills in R coding in the context of learning
 82 | statistics and ecology. Notably, the material covers statistical concepts that
 83 | are broadly useful in biological sciences, including mixed effects models,
 84 | randomization tests, model selection, and differential equations. While we
 85 | delivered the material and concepts as a four-month-long course, these concepts
 86 | are structured into primarily independent submodules focused around several
 87 | lessons, which could easily be mixed and matched to suit any desired learning
 88 | outcome. Lessons were designed to be interactive and delivered in a
 89 | participatory live-coding format so students learn experientially. The teaching
 90 | material includes assignments to hone and reinforce students' understanding and
 91 | allow them to critically apply their skills to new problems. Reproducible
 92 | quantitative research skills are emphasized throughout, culminating in an
 93 | open-ended self-directed project that requires students to apply their skills to
 94 | a real ecological dataset and problem. The teaching material is hosted in a
 95 | public GitHub repository which automatically generates a website that presents
 96 | the text, code, and code output together on the same page. The material is
 97 | openly available and licensed; anyone can easily copy and modify it for their own
 98 | purposes.
 99 | 
100 | # Learning Objectives and Content
101 | 
102 | The overarching objective of the course is to teach reproducible and
103 | collaborative quantitative research skills. The lessons are described in more
104 | detail in Table 1 and are organized into three submodules:
105 | 
106 | 1. Programming in R [@R], basic data wrangling, and visualization (lessons 1-5).
107 | 2. Exploratory and statistical data analysis (lessons 6-13).
108 | 3. Collaborative and reproducible science (lessons 14-16).
109 | 
110 | | **Submodule** | **Lesson** | **Description** | **Packages used** |
111 | |:----------|:-------------|:-------------|:------------------|
112 | | Programming in R, data wrangling, visualization | 1 | Introducing R, RStudio, and R Markdown | |
113 | | | 2 | Vectors, data frames, basic operations, and functions  | `tidyverse` [@tidyverse] |
114 | | | 3 | Introduction to exploratory data analysis | `tidyverse` |
115 | | | 4 | Introduction to statistics and visualization | `tidyverse` |
116 | | | 5 | Data transformation and visualization | `tidyverse` |
117 | | Exploratory and statistical data analysis | 6 | Cleaning and preprocessing raw data | `tidyverse`; `mice` [@mice] |
118 | | | 7 | Descriptive and inferential statistics | `tidyverse`; `car` [@car]; `psych` [@psych]; `multcomp` [@multcomp] |
119 | | | 8 | Linear mixed-effects models | `tidyverse`; `plyr` [@plyr]; `lme4` [@lme4]; `lmerTest` [@lmerTest] |
120 | | | 9 | Randomization tests and data simulation  | `tidyverse`; `reshape2` [@reshape2]; `EcoSimR` [@EcoSimR] |
121 | | | 10 | Multivariate statistics (e.g. PCA) | `tidyverse`; `car`; `psych`; `multcomp`|
122 | | | 11 | Model selection and averaging  | `tidyverse`; `lme4`; `lmerTest`; `MuMIn` [@MuMIn] |
123 | | Numerical models | 12| Population modelling with differential equations  | `tidyverse`; `deSolve` [@deSolve] |
124 | | | 13 | Time-series data and numerical models  | `tidyverse`; `deSolve` |
125 | | Collaborative and reproducible science | 14 | Scientific methods | |
126 | | | 15 | Collaborating through Git and GitHub | |
127 | | | 16 | Manuscript preparation in R Markdown | `knitr` [@knitr]; `rmarkdown` [@rmarkdown] |
128 | 
129 | Table: Overview of submodules, lessons, and packages used in the learning module.
130 | 
131 | # Instructional Design
132 | 
133 | Drawing on the instructors' previous experiences teaching introductory
134 | programming workshops, we designed our lessons to have the following components:
135 | 
136 | 1. *Lesson Outline*: Each lesson has a clearly defined outline of the lesson
137 | objectives, including expected time spent on each objective. This gives students
138 | a clear expectation of what they should learn and gain from the lesson. It also
139 | provides a structured template for instructors to prioritize content and gauge
140 | how much time each objective should take.
141 | 2. *Participatory Live-Coding*: Coding in real time, with the students actively
142 | coding along, forms the primary focus of each lesson. This hands-on approach to
143 | teaching is frequently used by teaching organizations such as 
144 | [Software Carpentry](https://software-carpentry.org/blog/2016/04/tips-tricks-live-coding.html)
145 | [@carpentry;@rubin_effectiveness_2013;@haaranen_programming_2017;@wilson_teaching_2018].
146 | While many learning outcomes focus on developing programming proficiency, some
147 | lessons are centred around concepts (such as "Statistical Modelling" or
148 | "Differential Equations"), during which we still use the live-coding approach.
149 | This approach not only demonstrates the concepts in a step-by-step fashion but
150 | also helps students practice writing code.
151 | 3. *Interwoven Exercises*: Coding exercises or discussion points are
152 | interspersed throughout each lesson to assess and reinforce the concepts and
153 | skills being taught. These exercises challenge the students and help build
154 | confidence in the material and in their coding skills. They also help
155 | instructors identify problem areas that should be further reinforced later in
156 | the lesson or submodule.
157 | 4. *Summative Assignments*: Lesson-specific assignments are given every two
158 | lessons to test students' competency in the lesson material and the skills they
159 | are expected to gain, while a comprehensive final assignment is used to test the
160 | students' ability to bring together all concepts learned throughout the learning
161 | module.
162 | 
163 | Each of our submodules and individual lessons built on skills and concepts that
164 | would ultimately allow students to complete a final open-ended analysis of real
165 | open ecological data. We deliberately chose large and messy (e.g. missing
166 | values) datasets for the students, reflecting the types of data that are being
167 | increasingly generated across various disciplines. With this goal in mind, we
168 | designed lessons to provide the building blocks to clean, manipulate, visualize,
169 | and analyze any dataset the students may come across, both for the final project
170 | and in their future research.
171 | 
172 | # Teaching Experience
173 | 
174 | For the first iteration of the course, our teaching team consisted of six
175 | graduate students from diverse fields of research; we divided course topics
176 | among the instructors to develop and deliver individual lessons and assignments
177 | to the eight students. We reduced the number of instructors to four graduate
178 | students for the second iteration and the number of students increased to 26. We
179 | estimate four instructors could effectively teach the current iteration of the
180 | course to around 40 students. We consider having instructors come from multiple
181 | fields as a major strength and strongly recommend this practice for teaching
182 | quantitative research methods and skills.
183 | 
184 | To maximize the learning experience, we prioritized in-class participation,
185 | engagement, and hands-on experience. The main teaching techniques we used to
186 | achieve this goal were participatory live-coding, exercises interwoven with
187 | teaching, and project-based learning
188 | [@sawyer_cambridge_2006;@strobel_when_2009;@markham_project_2011] where students
189 | collaborated in teams on data analysis problems to mimic a real world scenario.
190 | 
191 | To ensure proper teaching assistance was available at all times, we adopted a
192 | technique used successfully in workshops developed by The Carpentries
193 | [@wilson-software-carpentry]. This technique involved having at least two
194 | instructors present for each lesson, where one instructed and another acted as a
195 | "helper". Students would signal for assistance by attaching colored sticky notes
196 | to the back of their laptop monitor. This method avoided interrupting the lesson
197 | flow when individual students needed assistance.
198 | 
199 | # Story of the project
200 | 
201 | While there are many excellent open source software packages available for
202 | quantitative data analysis, the use of less capable tools (such as spreadsheet
203 | software) is still prevalent among researchers, even though these drastically
204 | reduce analytical reproducibility, power, and efficiency. This happens partly
205 | due to lack of awareness, and partly because graduate students, many of whom
206 | will be future researchers, often are not incentivized to learn new and better
207 | tools, as they usually must use what their supervisor or colleagues use. Those
208 | who do try to learn these modern tools often do so in isolation and without much
209 | formal training available. These are major barriers to learning. To help break
210 | down these barriers, we launched the graduate student group 
211 | [University of Toronto Coders](https://uoftcoders.github.io/) where we run
212 | peer-led learning sessions on using code for research through skill sharing,
213 | co-working, and community building in a friendly and supportive environment.
214 | 
215 | After running many sessions and consistently receiving overwhelmingly positive
216 | feedback on our content and teaching style, we sought to formally share our
217 | experiences through the university curriculum. We designed a course on open,
218 | reproducible data analysis, and contacted multiple departments that could be
219 | interested in hosting this course. The Department of Ecology and Evolutionary
220 | Biology at the University of Toronto agreed, and we ran a pilot of the course
221 | with the title "Theoretical Ecology and Reproducible Quantitative Methods in R"
222 | to fourth-year undergraduate students. We modelled the structure and portions of
223 | the course content after the course ["Reproducible Quantitative
224 | Methods"](https://cbahlai.github.io/rqm-template/), which was created by Dr.
225 | Christie Bahlai. We extensively modified the lesson content to include expanded
226 | material on data wrangling, visualization, reproducibility, collaborative
227 | science, and additional theoretical ecology topics.
228 | 
229 | Following a successful pilot term, we modified our lesson material further
230 | to include more generally applicable statistical concepts and far fewer
231 | theoretical ecological concepts. We also renamed the course to "Quantitative
232 | Methods in R for Biology" to reflect this change. On both occasions,
233 | the course received excellent feedback from the students and the supervising
234 | professors and has been incorporated into the long-term curriculum as a
235 | third-year-level course.
236 | 
237 | # Contributions
238 | 
239 | LWJ, MB-F, LT, and LC conceptualized the course. JO led course development. JO,
240 | MB-F, LWJ, LC, ES, and LT designed and taught the first iteration of the course.
241 | JSS, LC, MB-F, and ARH taught the second iteration of the course, with guest
242 | lectures from SM and LT. Lesson development for second iteration: JO and ARH
243 | (1-5), JSS (8, 9, 11), LC (6, 7, 10), MB-F (12, 13), LWJ (14), ARH and SM (15),
244 | LT (16). LWJ, MB-F, JO, SM, LT, ARH, and JSS wrote the paper. LWJ, MB-F, ES, JO,
245 | LT, JSS, and ARH proofread and edited the final draft.
246 | 
247 | # References
248 | 


--------------------------------------------------------------------------------
/rcourse.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: No
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 4
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: knitr
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Website
19 | 


--------------------------------------------------------------------------------
/resources.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Resources and Additional Information"
 3 | ---
 4 | 
 5 | ## R Coding Style Guide
 6 | 
 7 | *All code submitted for grading* for the final assignment will need to conform to the coding style
 8 | guidelines as outlined in [this Style Guide](http://style.tidyverse.org/index.html).
 9 | We will be discussing and highlighting how to automatically (or at least
10 | consistently) use this style as we go through the lectures and labs. For the
11 | first few assignments, you will not be required to follow these guidelines;
12 | however, we will make a note in your assignment for you to fix it next time. In
13 | later assignments, part of the grade will come from adhering to the coding
14 | styles.
15 | 
16 | ## R Markdown Workflow
17 | 
18 | - [Predictive Ecology - Rmarkdown in a scientific workflow](http://predictiveecology.org/2016/10/21/Rmarkdown-science-workflow.html)
19 | - [R for Data Science - R Markdown workflow](http://r4ds.had.co.nz/r-markdown-workflow.html)
20 | - [Decision tree for choosing appropriate statistical tests](resources/Statistical-decision-tree.pdf)
21 | 


--------------------------------------------------------------------------------
/resources/HighstatLibV6.R:
--------------------------------------------------------------------------------
  1 | #Library files for courses provided by: Highland Statistics Ltd.
  2 | #To cite these functions, use:
  3 | #Mixed effects models and extensions in ecology with R. (2009).
  4 | #Zuur, AF, Ieno, EN, Walker, N, Saveliev, AA, and Smith, GM. Springer.
  5 | 
  6 | #Copyright Highland Statistics LTD.
  7 | 
  8 | #####################################################################
  9 | #VIF FUNCTION.
 10 | #To use:  corvif(YourDataFile)
 11 | #dataz: the covariates only, as a data frame or anything as.data.frame()
 12 | #accepts. Prints the variance inflation factors computed by myvif().
 13 | corvif <- function(dataz) {
 14 |   #The response is a constant dummy: myvif() works from cov2cor(vcov(fit)),
 15 |   #i.e. only from the correlation structure of the coefficient estimates.
 16 |   dataz   <- data.frame(fooy=1,as.data.frame(dataz))
 17 |   
 18 |   #Use the names as data.frame() stored them, so that columns it renamed
 19 |   #(spaces, duplicates, a clash with fooy) still match the formula.
 20 |   form    <- reformulate(names(dataz)[-1],response="fooy")
 21 |   lm_mod  <- lm(form,dataz)
 22 |   
 23 |   cat("\n\nVariance inflation factors\n\n")
 24 |   print(myvif(lm_mod))
 25 | }
 26 | 
 27 | #Support function for corvif. Will not be called by the user.
 28 | #mod: fitted model with >= 2 terms. Returns invisibly a data frame (GVIF) if
 29 | #every term has one coefficient, else a matrix (GVIF, Df, GVIF^(1/2Df)).
 30 | #Returns a message string when some coefficients could not be estimated.
 31 | myvif <- function(mod) {
 32 |   v <- vcov(mod)
 33 |   X <- model.matrix(mod)
 34 |   assign <- attributes(X)$assign
 35 |   if (names(coefficients(mod)[1]) == "(Intercept)") {
 36 |     v <- v[-1, -1]
 37 |     assign <- assign[-1]
 38 |     X <- X[, -1, drop=FALSE]
 39 |   } else warning("No intercept: vifs may not be sensible.")
 40 |   terms <- labels(terms(mod))
 41 |   n.terms <- length(terms)
 42 |   if (n.terms < 2) stop("The model contains fewer than 2 terms")
 43 |   #Inestimable coefficients are either absent from vcov() or NA/NaN in it.
 44 |   #tmp_cor was an undefined variable here; it now comes from the design matrix.
 45 |   if (length(assign) > dim(v)[1] || anyNA(v)) {
 46 |     tmp_cor <- abs(suppressWarnings(cor(X)))
 47 |     diag(tmp_cor) <- 0
 48 |     if (any(tmp_cor > 1 - 1e-8, na.rm=TRUE))
 49 |       return("Sample size is too small, 100% collinearity is present")
 50 |     return("Sample size is too small")
 51 |   }
 52 |   R <- cov2cor(v)
 53 |   detR <- det(R)
 54 |   result <- matrix(0, n.terms, 3, dimnames=list(terms, c("GVIF", "Df", "GVIF^(1/2Df)")))
 55 |   for (term in seq_len(n.terms)) {
 56 |     subs <- which(assign == term)
 57 |     result[term, 1] <- det(as.matrix(R[subs, subs])) * det(as.matrix(R[-subs, -subs])) / detR
 58 |     result[term, 2] <- length(subs)
 59 |   }
 60 |   if (all(result[, 2] == 1)) result <- data.frame(GVIF=result[, 1])
 61 |   else result[, 3] <- result[, 1]^(1/(2 * result[, 2]))
 62 |   invisible(result)
 63 | }
 64 | #END VIF FUNCTIONS
 65 | 
 66 | 
 67 | 
 68 | 
 69 | 
 70 | ##################################################################
 71 | ##################################################################
 72 | #Here are some functions that we took from the pairs help file and
 73 | #modified, or wrote ourselves. To cite these, use the r citation: citation()
 74 | 
 75 | panel.cor <- function(x, y, digits=1, prefix="", cex.cor = 6)
 76 | {
 77 |   usr <- par("usr"); on.exit(par(usr))
 78 |   par(usr = c(0, 1, 0, 1))
 79 |   r1=cor(x,y,use="pairwise.complete.obs")
 80 |   r <- abs(cor(x, y,use="pairwise.complete.obs"))
 81 |   txt <- format(c(r1, 0.123456789), digits=digits)[1]
 82 |   txt <- paste(prefix, txt, sep="")
 83 |   if(missing(cex.cor)) { cex <- 0.9/strwidth(txt) } else {
 84 |      cex = cex.cor}
 85 |   text(0.5, 0.5, txt, cex = cex * r)
 86 | }
 87 | 
 88 | ##################################################################
 89 | panel.smooth2=function (x, y, col = par("col"), bg = NA, pch = par("pch"),
 90 |                         cex = 1, col.smooth = "black", span = 2/3, iter = 3, ...)
 91 | {
 92 |   points(x, y, pch = pch, col = col, bg = bg, cex = cex)
 93 |   ok <- is.finite(x) & is.finite(y)
 94 |   if (any(ok))
 95 |     lines(stats::lowess(x[ok], y[ok], f = span, iter = iter),
 96 |           col = 1, ...)
 97 | }
 98 | 
 99 | ##################################################################
# Panel function: scatterplot of x and y with the least-squares line of
# y on x drawn through the finite observations.
#
# Args:
#   x, y:              numeric vectors of the panel.
#   col, bg, pch, cex: passed to points().
#   ...:               accepted but not used.
panel.lines2 <- function(x, y, col = par("col"), bg = NA, pch = par("pch"),
                         cex = 1, ...)
{
  points(x, y, pch = pch, col = col, bg = bg, cex = cex)
  usable <- is.finite(x) & is.finite(y)
  # Nothing to regress on: leave the panel with the points only.
  if (!any(usable)) {
    return(invisible(NULL))
  }
  abline(lm(y[usable] ~ x[usable]))
}
109 | 
110 | ##################################################################
# Panel function for pairs(): draws a histogram of x, scaled so that the
# tallest bar has height 1 in a panel whose y-range is set to [0, 1.5].
#
# Args:
#   x:   numeric vector of the panel.
#   ...: further arguments passed to rect().
panel.hist <- function(x, ...)
{
  usr <- par("usr")
  # Keep the x-range, replace the y-range. par() returns the old setting as
  # a list, which is the form needed to restore it on exit.
  old_par <- par(usr = c(usr[1:2], 0, 1.5))
  on.exit(par(old_par), add = TRUE)
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks
  nB <- length(breaks)
  y <- h$counts / max(h$counts)
  rect(breaks[-nB], 0, breaks[-1], y, col = "white", ...)
}
120 | ##################################################################
121 | ##################################################################
122 | 
123 | 
124 | 
125 | ##################################################################
126 | ##################################################################
127 | #Functions for variograms
128 | #To cite these functions, use:
129 | #Mixed effects models and extensions in ecology with R. (2009).
130 | #Zuur, AF, Ieno, EN, Walker, N, Saveliev, AA, and Smith, GM. Springer.
131 | #Make a variogram for one variable
132 | #To use, type:  MyVariogram(XUTM, YUTM, E , MyDistance=10)
133 | # XUTM is x coordinates
134 | # YUTM is y coordinates
135 | # E is variable used in sample variogram
136 | # MyDistance is the cutoff value for the distances
137 | 
# Sample variogram of one variable.
#
# Args:
#   x, y:       spatial coordinates of the observations.
#   z:          values of the variable at those coordinates.
#   MyDistance: cutoff passed to variogram().
# Returns: data frame with columns Var.np, Var.dist and Var.gamma taken from
#   the `np`, `dist` and `gamma` components of the variogram() result.
MyVariogram <- function(x,y,z, MyDistance) {
  library(gstat)
  mydata <- data.frame(z, x, y)
  # `coordinates<-` is provided by sp, which attaching gstat does not
  # necessarily put on the search path, so call it through its namespace.
  sp::coordinates(mydata) <- c("x", "y")
  Var <- variogram(z ~ 1, mydata, cutoff = MyDistance)
  data.frame(Var$np, Var$dist, Var$gamma)
}
145 | 
146 | #Function for making multiple variograms in an xyplot
147 | #To use, type:  MultiVariogram(Z, MyVar,XUTM, YUTM, MyDistance=10)
148 | # Z is a data frame with all the data 
149 | # MyVar is a character vector with the variable names that will be used in the xyplot
150 | # XUTM is x coordinates
151 | # YUTM is y coordinates
152 | # MyDistance is the cutoff value for the distances
153 | 
# Multi-panel plot of sample variograms, one panel per variable.
#
# Args:
#   Z:          data frame with the data.
#   MyVar:      variables (columns of Z) for which variograms are calculated.
#   x, y:       spatial coordinates.
#   MyDistance: limit for distances in the variogram.
# Returns: the lattice plot object, as returned by print().
MultiVariogram <- function(Z, MyVar, x, y, MyDistance) {
  library(lattice)

  # One variogram table per variable, tagged with the variable in column
  # `i`; collected in a list and bound once instead of growing a data frame.
  pieces <- vector("list", length(MyVar))
  k <- 0
  for (i in MyVar) {
    k <- k + 1
    vi <- MyVariogram(x, y, Z[, i], MyDistance)
    pieces[[k]] <- cbind(vi, i)
  }
  VarAll <- do.call(rbind, pieces)

  P <- xyplot(Var.gamma ~ Var.dist | factor(i), col = 1, type = "p", pch = 16,
              data = VarAll,
              xlab = "Distance",
              ylab = "Semi-variogram",
              strip = function(bg='white', ...)
                strip.default(bg='white', ...),
              scales = list(alternating = TRUE,
                            x = list(relation = "same"),
                            y = list(relation = "same"))
              )

  # A lattice plot created inside a function must be printed explicitly.
  print(P)
}
182 | #End variogram code
183 | ##########################################################
184 | 
185 | #Function for multi-panel Cleveland dotplot.
186 | #The input file must contain no categorical variables
# Multi-panel Cleveland dotplot, one panel per column of the input.
#
# Args:
#   DataSelected: data with numeric columns only (it is converted with
#                 as.matrix()).
# Returns: the lattice plot object, as returned by print().
Mydotplot <- function(DataSelected){
  # dotplot() and strip.custom() come from lattice; load it here like the
  # other plotting helpers in this file do.
  library(lattice)

  P <- dotplot(as.matrix(DataSelected),
          groups=FALSE,
          strip = strip.custom(bg = 'white',
                               par.strip.text = list(cex = 1.2)),
          scales = list(x = list(relation = "free", draw = TRUE),
                        y = list(relation = "free", draw = FALSE)),
          col=1, cex  = 0.5, pch = 16,
          xlab = list(label = "Value of the variable", cex = 1.5),
          ylab = list(label = "Order of the data from text file", cex = 1.5))

  # A lattice plot created inside a function must be printed explicitly.
  print(P)
}
201 | 
202 | 
203 | #Add more code here:
204 | 
205 | 
# Multipanel boxplots: one panel per variable in MyVar, each showing that
# variable split by the levels of TargetVar.
#
# Args:
#   Z:         data set.
#   MyVar:     names of the columns of Z to plot.
#   TargetVar: name of the column used on the x-axis (converted with
#              factor()).
# Returns: the lattice plot object, as returned by print().
Mybwplot <- function(Z, MyVar, TargetVar){
  # bwplot() and strip.custom() come from lattice; load it here like the
  # other plotting helpers in this file do.
  library(lattice)

  # Stack the selected columns into one long vector and repeat the grouping
  # variable and the column label alongside it.
  AllY <- as.vector(as.matrix(Z[,MyVar]))
  AllX <- rep(Z[,TargetVar], length(MyVar))
  ID <- rep(MyVar, each = nrow(Z))

  P <- bwplot(AllY ~ factor(AllX) | ID, horizontal = FALSE,
         ylab = "", xlab = "",
         scales = list(alternating = TRUE,cex.lab = 1.5,
                       x = list(relation = "same",rot =90, abbreviate = TRUE, cex = 1.5),
                       y = list(relation = "free", draw = FALSE)),
         strip = strip.custom(bg = 'white',
                              par.strip.text = list(cex = 1.2)),
         cex = .5,
         par.settings = list(
           box.rectangle = list(col = 1),
           box.umbrella  = list(col = 1),
           plot.symbol   = list(cex = .5, col = 1)))

  # A lattice plot created inside a function must be printed explicitly.
  print(P)
}
230 | 
231 | 
232 | 
233 | #######################################################
# Multi-panel scatterplots of a binary response against several covariates,
# each panel with a binomial GAM smoother.
#
# Args:
#   Z:      data frame with the data.
#   MyV:    names of the covariate columns (one panel each).
#   NameY1: name of the response column.
# Returns: the lattice plot object, as returned by print().
MyxyplotBin <- function(Z, MyV, NameY1) {
  # Stack the covariates into one long vector and repeat the response and
  # the covariate label alongside it.
  AllX  <- as.vector(as.matrix(Z[,MyV]))
  AllY  <- rep(Z[,NameY1] , length(MyV))
  AllID <- rep(MyV, each = nrow(Z))


  library(mgcv)
  library(lattice)

  P <- xyplot(AllY ~ AllX | factor(AllID), col = 1,
              strip = function(bg='white', ...) strip.default(bg='white', ...),
              scales = list(alternating = TRUE,
                            x = list(relation = "free"),
                            y = list(relation = "same")),
              xlab = "Covariate",
              ylab = "Probability of presence",
              panel=function(x,y){
                panel.grid(h=-1, v= 2)
                panel.points(x,y,col=1)
                tmp<-gam(y~s(x, k = 4), family = binomial)
                # The prediction grid spans the observed covariate range;
                # na.rm keeps it defined when the covariate has missing
                # values.
                MyData <- data.frame(x = seq(min(x, na.rm = TRUE),
                                             max(x, na.rm = TRUE),
                                             length.out = 25))
                p1 <- predict(tmp, newdata = MyData, type ="response")
                panel.lines(MyData$x,p1, col = 1, lwd = 3)
              })

  # A lattice plot created inside a function must be printed explicitly.
  print(P)
}
261 | #######################################################
262 | 
263 | #######################################################
# Multi-panel scatterplots of one response against several explanatory
# variables, each panel with a loess smoother.
#
# Args:
#   Z:      data frame with the data.
#   MyV:    names of the explanatory columns (one panel each).
#   NameY1: name of the response column.
#   MyYlab: label for the y-axis.
# Returns: the lattice plot object, as returned by print().
Myxyplot <- function(Z, MyV, NameY1,MyYlab="") {
  # Long format: all explanatory columns stacked, with the response and the
  # column label repeated to match.
  AllX  <- as.vector(as.matrix(Z[, MyV]))
  AllY  <- rep(Z[, NameY1], length(MyV))
  AllID <- rep(MyV, each = nrow(Z))

  library(mgcv)
  library(lattice)

  # White strip background instead of the lattice default.
  white_strip <- function(bg='white', ...) {
    strip.default(bg='white', ...)
  }

  # Each panel gets its own x-range; the y-range is shared.
  axis_scales <- list(alternating = TRUE,
                      x = list(relation = "free"),
                      y = list(relation = "same"))

  # Grid, points and a loess curve in every panel.
  draw_panel <- function(x, y) {
    panel.grid(h = -1, v = 2)
    panel.points(x, y, col = 1)
    panel.loess(x, y, span = 0.8, col = 1, lwd = 2)
  }

  P <- xyplot(AllY ~ AllX|factor(AllID), col = 1,
              xlab = list("Explanatory variables", cex = 1.5),
              ylab = list(MyYlab, cex = 1.5),
              strip = white_strip,
              scales = axis_scales,
              panel = draw_panel)

  # A lattice plot created inside a function must be printed explicitly.
  print(P)
}
291 | #######################################################
292 | 
# Multi-panel scatterplots of one response against several explanatory
# variables, each panel with a GAM smoother and a shaded band of
# +/- 2 standard errors around it.
#
# Args:
#   Z:      data frame with the data.
#   MyV:    names of the explanatory columns (one panel each).
#   NameY1: name of the response column.
# Returns: the lattice plot object, as returned by print().
MyxyplotPolygon <- function(Z, MyV, NameY1) {
  # Stack the explanatory columns into one long vector and repeat the
  # response and the column label alongside it.
  AllX  <- as.vector(as.matrix(Z[,MyV]))
  AllY  <- rep(Z[,NameY1] , length(MyV))
  AllID <- rep(MyV, each = nrow(Z))


  library(mgcv)
  library(lattice)
  P <- xyplot(AllY ~ AllX|factor(AllID), col = 1,
              xlab = list(label = "Explanatory variables", cex = 1.5),
              ylab = "",
              strip = function(bg='white',cex.lab = 1.5,...)
                strip.default(bg='white', ...),
              scales = list(alternating = TRUE,
                            x = list(relation = "free"),
                            y = list(relation = "same")),
              panel=function(x, y){
                # Fit on complete pairs only, so that the fitted values
                # line up one-to-one with the x values they are drawn at.
                ok <- !(is.na(x) | is.na(y))
                xo <- x[ok]
                yo <- y[ok]
                t1 <- gam(yo ~ s(xo))
                P1 <- predict(t1, se.fit = TRUE)
                # Draw the curve and the band from left to right.
                I1 <- order(xo)
                xs <- xo[I1]
                panel.lines(xs, P1$fit[I1], col = 1)
                panel.polygon(c(xs, rev(xs)),
                              c(P1$fit[I1]-2*P1$se.fit[I1],
                                rev(P1$fit[I1]+2*P1$se.fit[I1])),
                              col = gray(0.7),
                              density = 10 )
                panel.grid(h=-1, v= 2)
                panel.abline(0,0)
                panel.points(x, y, col = 1)

              })
  # A lattice plot created inside a function must be printed explicitly.
  print(P)
}
332 | 
333 | ################################################
334 | #Mypairs
335 | #Make fancy pair plots
# Pair plot of the columns of Z: scatterplots above the diagonal and
# correlations (drawn by panel.cor) below it.
#
# Args:
#   Z: data with one column per variable; the column names label the
#      diagonal.
Mypairs <- function(Z) {
  # Lower triangle: correlation, 2 digits, base text size 7.
  draw_correlation <- function(x, y, digits=2, prefix="", cex.cor = 7) {
    panel.cor(x, y, digits, prefix, cex.cor)
  }

  # Upper triangle: small filled dark-grey points.
  draw_points <- function(x, y) {
    points(x, y, pch = 16, cex = 0.8, col = gray(0.1))
  }

  pairs(Z,
        labels = colnames(Z),
        cex.labels = 2,
        lower.panel = draw_correlation,
        upper.panel = draw_points)
}


--------------------------------------------------------------------------------
/resources/Statistical-decision-tree.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/resources/Statistical-decision-tree.pdf


--------------------------------------------------------------------------------