├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── CONTRIBUTING.md ├── DESCRIPTION ├── LICENSE-MIT.md ├── LICENSE.md ├── README.md ├── _commands ├── build-pdf.R └── build-slides.R ├── _includes ├── footer.html ├── header.html ├── preamble.tex └── style.html ├── _site.yml ├── _templates └── lectures.Rmd ├── about.Rmd ├── assignment-01.Rmd ├── assignment-02.Rmd ├── assignment-03.Rmd ├── assignment-04.Rmd ├── assignment-05.Rmd ├── assignment-06.Rmd ├── assignment-07.Rmd ├── assignment-09-challenge.Rmd ├── assignment-final.Rmd ├── codemeta.json ├── data ├── Assign05_Question3.csv ├── Fitzpatrick_2018.csv ├── NEON_PlantPA_HARV_201707.csv ├── Rivkin_2018_AJB.txt ├── Santangelo_JEB_2018.csv ├── Thompson-Johnson_2016_Evol.csv ├── africa.wide.csv ├── iris.csv ├── jellyfish.csv ├── kenya.wide.csv ├── lec09_CommunityMatrix_Example.csv ├── plant-biomass-preprocess.csv ├── plant_phenology.csv ├── portal_data.csv ├── predator_prey_body_size.txt ├── pseudo.LTRs ├── pseudo.ara.busco ├── pseudo.euk.busco ├── pseudoMol_Kdist.txt ├── rikz_data.txt ├── survey.csv.gz ├── wc2.0_bio_10m_01.tif └── wc2.0_bio_10m_12.tif ├── image ├── Liriodendron_tulipifera.png ├── RIKZ_data.png ├── RIKZ_data_Crossed.png ├── RIKZ_data_DeepNest.png ├── SEM-figure.png ├── SEMfig.png ├── assignment-8-figure-q1.png ├── boxplot-problem.gif ├── colourblind.png ├── comic-filenaming.gif ├── dynamite-bars.png ├── dynamite-vs-dists.png ├── favicon.png ├── fig_scientific_method.png ├── git_lesson │ ├── branch_dropdown.png │ ├── branches.png │ ├── delete_branch.png │ ├── sample_rmd.png │ └── yellow_prompt.png ├── heatmap.png ├── logistic.gif ├── lotka-volterra.gif ├── model.png ├── predator-prey.gif └── signal-transduction-pathway.png ├── index.Rmd ├── lec01-introduction.Rmd ├── lec02-basic-r.Rmd ├── lec03-basic-r.Rmd ├── lec04-dplyr.Rmd ├── lec05-dplyr.Rmd ├── lec06-exploratory-data-analysis.Rmd ├── lec07-linear-modelling.Rmd ├── lec08-linear-mixed-effects-models.Rmd ├── lec09-model-selection.Rmd ├── 
lec10-multivariate-stats.Rmd ├── lec11-spatial-stats.Rmd ├── lec12-randomization-tests.Rmd ├── lec13-theory.Rmd ├── lec14-datasets.Rmd ├── lec15-git-projects.Rmd ├── mid-project-update.Rmd ├── paper.bib ├── paper.md ├── rcourse.Rproj ├── resources.Rmd └── resources ├── HighstatLibV6.R └── Statistical-decision-tree.pdf /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^README-.*\.png$ 5 | ^\.travis\.yml$ 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | _site/ 6 | _pdf/ 7 | misc/ 8 | about.html 9 | assignment-01.html 10 | assignment-02.html 11 | assignment-03.html 12 | assignment-04.html 13 | assignment-05.html 14 | assignment-06.html 15 | assignment-07.html 16 | assignment-08.html 17 | assignment-08_files/ 18 | assignment-final.html 19 | index.docx 20 | index.html 21 | index.pdf 22 | lec01-introduction.html 23 | lec01-introduction_files/ 24 | lec02-basic-r.html 25 | lec03-basic-r.html 26 | lec04-dplyr.html 27 | lec04-dplyr_files/ 28 | lec05-dplyr.html 29 | lec05-dplyr_files/ 30 | lec06-pop-models.html 31 | lec06-pop-models_files/ 32 | lec07-pop-models.html 33 | lec07-pop-models_files/ 34 | lec08-linear-mixed-effects-models.html 35 | lec08-linear-mixed-effects-models_files/ 36 | site_libs/ 37 | Fitzpatrick_2018.csv 38 | NEON_PlantPA_HARV_201707.csv 39 | lec09_CommunityMatrix_Example.csv 40 | rikz_data.txt 41 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: R 2 | sudo: false 3 | cache: packages 4 | script: 5 | - Rscript -e "rmarkdown::render_site('.')" 6 | - touch _site/.nojekyll 7 | dist: trusty 8 | addons: 9 | apt: 10 | packages: 11 | - 
gdal-bin 12 | - libgdal1-dev 13 | - libproj-dev 14 | deploy: 15 | provider: pages 16 | skip_cleanup: true 17 | github_token: $GITHUB_PAT 18 | local_dir: _site 19 | on: 20 | branch: master 21 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Quantitative Methods in R for Biology is an open-source course, 4 | aimed at a third- to fourth-year undergraduate level. 5 | 6 | ## How to Contribute 7 | 8 | Anyone can contribute to the course repository via [pull requests][pull-requests]. 9 | 10 | We use [GitHub flow][github-flow] to manage changes: 11 | 12 | 1. Create a [fork][fork-explanation] of this repository, and [clone][clone-explanation] it to your local computer. 13 | 2. In your local copy of this repository, create a new [branch][branch-explanation]. 14 | 3. Commit your changes to that branch. 15 | 4. Push the edits on that branch to your fork on GitHub. 16 | 5. Submit a pull request to the master repository (`UofTCoders/rcourse`). 17 | 6. Your pull request will trigger a [Travis][travis-website] build (see below for details). 18 | 7. If you receive feedback on your pull request, or encounter errors in the Travis build, 19 | make further commits to the new branch on your fork. These will automatically be added to 20 | your pull request. 21 | 22 | You may wish to look at [How to Contribute to an Open Source Project on GitHub][contribute] 23 | for more detailed instructions. The [GitHub Glossary][glossary] is also a useful resource that explains 24 | Git-related terminology. 25 | 26 | ## Continuous Integration with Travis 27 | 28 | We use [Travis CI][travis-website] to test all materials in the course repo. Any changes 29 | in the form of a pull request will trigger a Travis build, where 30 | Travis will attempt to test the code in the repo, 31 | [knitting][knitr-explanation] all lesson materials in the process. 
32 | Any errors will cause the Travis build to fail. 33 | 34 | Pull requests can only be merged into the repo with a passing Travis build; 35 | this is to ensure that all course material is functional. If you submit a pull 36 | request that does not pass a Travis build, a traceback to the error can be found 37 | on the link to that pull request's respective build. 38 | 39 | More on the practice of continuous integration can be found on [Travis CI's website][ci-explanation]. 40 | 41 | ## Format 42 | 43 | We follow a consistent format across all course materials. A lesson 44 | template can be found [here][lecture-template]. 45 | 46 | ### Lessons 47 | 48 | 1. All lesson material is in R Markdown (`.Rmd`) format. 49 | 50 | 2. All lessons begin with a Lesson Preamble, subdivided into 51 | 'Learning objectives' and a 'Lesson outline'. The outline should 52 | also list approximate time requirements for each segment. 53 | 54 | 3. Lessons include a mix of code chunks and text, organized using Markdown headers. 55 | 56 | 4. Students should be able to follow the contents of the lesson from the text alone; 57 | i.e. the file should contain _all_ conceptual explanations. 58 | 59 | ### Assignments 60 | 61 | 1. All assignments are in R Markdown (`.Rmd`) format. 62 | 63 | 2. Assignment files contain a numbered list of questions and are comparatively light on code. 64 | They are designed such that students fill in answers by adding in code chunks of their own. 65 | 66 | 3. Code chunks in assignments should mostly be limited to loading required packages, 67 | downloading required data, or data cleaning if necessary. 
68 | 69 | 70 | [branch-explanation]: https://help.github.com/articles/about-branches/ 71 | [ci-explanation]: https://docs.travis-ci.com/user/for-beginners/ 72 | [clone-explanation]: https://help.github.com/articles/cloning-a-repository/ 73 | [contribute]: https://egghead.io/courses/how-to-contribute-to-an-open-source-project-on-github 74 | [fork-explanation]: https://help.github.com/articles/fork-a-repo/ 75 | [github-flow]: https://guides.github.com/introduction/flow/ 76 | [glossary]: https://help.github.com/articles/github-glossary/ 77 | [knitr-explanation]: https://rmarkdown.rstudio.com/authoring_quick_tour.html 78 | [lecture-template]: https://github.com/UofTCoders/rcourse/blob/master/_templates/lectures.Rmd 79 | [pull-requests]: https://help.github.com/articles/about-pull-requests/ 80 | [travis-website]: https://travis-ci.org/ 81 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Type: Website 2 | Package: rcourse 3 | Title: Reproducible Quantitative Methods in R Course in Ecology and Evolutionary Biology 4 | Version: 2.2.0 5 | Maintainer: Luke Johnston 6 | Author: Luke Johnston 7 | Authors@R: c( 8 | person("Luke", "Johnston", email = "lwjohnst@gmail.com", role = c("aut", "cre")), 9 | person("Madeleine", "Bonsma", email = "m.bonsma@mail.utoronto.ca", role = c("aut")), 10 | person("Lindsay", "Coome", email = "lindsay.coome@mail.utoronto.ca", role = c("aut")), 11 | person("Joel", "Ostblom", email = "joel.ostblom@gmail.com", role = c("aut")), 12 | person("Elliott", "Sales de Andrade", email = "esalesde@physics.utoronto.ca", role = c("aut")), 13 | person("Lina", "Tran", email = "lina.mntran@gmail.com", role = c("aut")), 14 | person("Sara", "Mahallati", email = "sara.mahallati@gmail.com", role = c("aut")), 15 | person("James", "Santangelo", email = "james.santangelo37@gmail.com", role = c("aut")), 16 | person("Ahmed", "Hasan", email = 
"ahmed.hasan@mail.utoronto.ca", role = c("aut")), 17 | person("Amber", "Hoi", email = "amber.hoi@mail.utoronto.ca", role = c("ctb")), 18 | person("Zoe", "Humphries", email = "zoe.humphries@mail.utoronto.ca", role = c("ctb")) 19 | ) 20 | Depends: R (>= 3.4.0) 21 | License: CC-BY 22 | Encoding: UTF-8 23 | Imports: 24 | rmarkdown, 25 | knitr, 26 | tidyverse, 27 | GGally, 28 | broom, 29 | lattice, 30 | lme4, 31 | lmerTest, 32 | reshape2, 33 | EcoSimR, 34 | car, 35 | multcomp, 36 | MuMIn, 37 | deSolve, 38 | PerformanceAnalytics, 39 | viridis, 40 | lavaan, 41 | ggfortify, 42 | nlme, 43 | sp, 44 | ape, 45 | rgdal, 46 | raster, 47 | maps 48 | -------------------------------------------------------------------------------- /LICENSE-MIT.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "MIT License" 3 | --- 4 | 5 | MIT License 6 | 7 | Copyright (c) 2017-2018 UofTCoders 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in all 17 | copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Licensing and copyright information" 3 | --- 4 | 5 | The course material is licensed under the 6 | [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/) 7 | and the course code is licensed under a [MIT License](LICENSE-MIT.html) 8 | by the [UofTCoders](https://uoftcoders.github.io) (see the [about](about.html) 9 | for those involved in creating the course). 10 | In addition to our own developed material, we have also modified material from 11 | other courses and workshops: 12 | 13 | - [Data Carpentry](http://datacarpentry.org) (licensed under the 14 | [CC-BY 2.0 Generic License](https://creativecommons.org/licenses/by/2.0/)) 15 | - [Reproducible Quantitative Methods Course](https://cbahlai.github.io/rqm-template/) 16 | (licensed under the [CC-BY 4.0 International License](https://creativecommons.org/licenses/by/4.0/)) 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Quantitative Methods in R for Biology 2 | ========================================= 3 | 4 | Course in Ecology and Evolutionary Biology 5 | ------------------------------------------ 6 | 7 | [![Build Status](https://travis-ci.org/UofTCoders/rcourse.svg?branch=master)](https://travis-ci.org/UofTCoders/rcourse) 8 | [![DOI](https://zenodo.org/badge/97400494.svg)](https://zenodo.org/badge/latestdoi/97400494) 9 | [![License: CC BY 
4.0](https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by/4.0/) 10 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 11 | [![status](http://jose.theoj.org/papers/1a083e69c49c15011f9404dfab9b1ec8/status.svg)](http://jose.theoj.org/papers/1a083e69c49c15011f9404dfab9b1ec8) 12 | 13 | ## Description 14 | 15 | Quantitative Methods in R for Biology is a course aimed at undergraduates at a third year level or above. 16 | The course covers statistics and data analysis for ecology and reproducible quantitative methods in R. 17 | 18 | Statistical analysis, modelling, and data simulation are essential skills for ecologists and evolutionary biologists. 19 | Furthermore, ever larger datasets are quickly becoming the norm in a variety of scientific disciplines. 20 | This course is therefore designed to meet a growing demand for reproducible, openly accessible, 21 | analytically thorough, and well-documented science. Students will learn to develop ecological population models, 22 | analyze large datasets, and document their research using the R programming language. No prior programming experience is required. 23 | 24 | For more detail on the course, check out the [syllabus](https://uoftcoders.github.io/rcourse/). 25 | 26 | ## Instructional Design 27 | 28 | The lectures in this course are designed to be presented using a participatory live-coding approach. 29 | This involves an instructor typing and running code in [RStudio](https://www.rstudio.com/) in front of the class, while the class follows 30 | along using their own computers. Challenges are interspersed in the lesson material, allowing students to 31 | collaboratively work on smaller coding problems for a few minutes. All lesson materials are provided ahead 32 | of time on the course website for students to refer to during lectures. 
33 | 34 | The bulk of the course's assessment structure involves weekly assignments. These assignments 35 | are primarily code-based and are designed to also be completed in RStudio using the R Markdown format. 36 | 37 | At the end of the course, students undertake a group project, wherein they attempt to address a scientific 38 | question by applying techniques learned over the course to open ecological data. At the end of the semester, 39 | groups present their work in a conference-style presentation, and submit a report in the style of a scientific paper. 40 | 41 | ## Lecture Content 42 | 43 | The course's lesson material is broadly subdivided into three main topics: 44 | 45 | 1. Introductory R (Lectures 1-5) 46 | * Introduces students to the R programming language, with a focus on 47 | data wrangling and visualization. 48 | 2. Statistical analysis (Lectures 6-12) 49 | * Introduces concepts such as regression, principal component analysis, statistical models, and numerical models. 50 | 3. Reproducible science (Lectures 13-15) 51 | * Prepares students for project work period and introduces methods for reproducible science (GitHub, R Markdown). 52 | 53 | ## Content Reuse Instructions 54 | 55 | If you are interested in using or modifying this content and repository for your 56 | own course, there are a few steps you need to take: 57 | 58 | 1. Create a fork of this repository. 59 | 1. Create a [personal access token](https://help.github.com/en/articles/creating-a-personal-access-token-for-the-command-line) for your account on GitHub (make sure to enable the "repo" scope so that using this token will enable writing to your GitHub repos) and copy the token to your clipboard. 60 | 1. Go to https://travis-ci.org/USER/REPO/settings replacing `USER` with your GitHub ID and `REPO` with the name of the forked repository. 61 | 1. 
Under the section "Environment Variables", type `GITHUB_PAT` (the variable name that `.travis.yml` reads for deployment) in the "Name" text box and paste your personal access token into the "Value" text box. 62 | 63 | In general, the first Travis CI build can take about 15-25 minutes, but subsequent builds take about 5-6 minutes. 64 | 65 | ## Contributing 66 | 67 | If you are interested in contributing to the course material, please refer to the guidelines in [CONTRIBUTING.md](https://github.com/UofTCoders/rcourse/blob/master/CONTRIBUTING.md). 68 | 69 | ## Related Publications 70 | 71 | Santangelo JS (2019). Data simulation and randomization tests. NEON Faculty Mentoring Network, QUBES Educational Resources. doi:10.25334/Q4CT7P. [Available online](https://qubeshub.org/qubesresources/publications/996/1). 72 | 73 | Bonsma-Fisher M, Hasan AR (2018). Working with plant phenology data and fitting a nonlinear model using least squares in R. NEON Faculty Mentoring Network, QUBES Educational Resources. doi:10.25334/Q4Q73D. [Available online](https://qubeshub.org/qubesresources/publications/978/1). 74 | 75 | ## Acknowledgements 76 | 77 | We thank Dr. Christie Bahlai, Dr. Asher Cutter, Dr. Martin Krkosek, and the Department of Ecology 78 | and Evolutionary Biology at the University of Toronto for helping make this course a reality. 79 | 80 | We also thank Dr. Megan Jones and Dr. Kusum Naithani for their support and guidance, particularly 81 | around use of the NEON Ecological Observatory data. 82 | -------------------------------------------------------------------------------- /_commands/build-pdf.R: -------------------------------------------------------------------------------- 1 | # Run in parent directory ("rcourse/", not "rcourse/R"). 
2 | # 3 | # Usage: 4 | # 5 | # Rscript _commands/build-pdf.R 6 | 7 | # Render every Rmd file matching the pattern below (currently lecture 11 only) to PDF in _pdf/ 8 | rmd_files <- list.files(pattern = "^lec11.*\\.Rmd$") 9 | sapply( 10 | rmd_files, 11 | rmarkdown::render, 12 | output_format = "pdf_document", 13 | output_dir = "_pdf", 14 | output_options = list( 15 | pandoc_args = c("-V", "fontsize=12pt", "-V", "papersize=letter"), 16 | includes = list(in_header = "_includes/preamble.tex") 17 | ) 18 | ) 19 | -------------------------------------------------------------------------------- /_commands/build-slides.R: -------------------------------------------------------------------------------- 1 | # Run this command in the parent directory (`rcourse/`) 2 | # 3 | # Usage: 4 | # 5 | # Rscript _commands/build-slides.R 6 | # 7 | 8 | # Generate the ioslides HTML slides for the lectures listed below (written to _site/slides). 9 | rmd_files <- c( 10 | "lec14-datasets.Rmd" 11 | ) 12 | # Guard on length: skips an empty list and stays a scalar condition when several files are listed. 13 | if (length(rmd_files) > 0) { 14 | sapply( 15 | rmd_files, 16 | rmarkdown::render, 17 | output_format = "ioslides_presentation", 18 | output_dir = "_site/slides", 19 | output_options = list( 20 | df_print = "kable", 21 | slide_level = 3, 22 | smaller = TRUE, 23 | transition = 0.01 24 | ) 25 | ) 26 | } 27 | -------------------------------------------------------------------------------- /_includes/footer.html: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 |

This work is licensed under a Creative Commons Attribution 4.0 International License. See the licensing page for more details about copyright information.

5 | -------------------------------------------------------------------------------- /_includes/header.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /_includes/preamble.tex: -------------------------------------------------------------------------------- 1 | 2 | \usepackage[bitstream-charter]{mathdesign} 3 | \usepackage[T1]{fontenc} 4 | \usepackage[utf8]{inputenc} 5 | -------------------------------------------------------------------------------- /_includes/style.html: -------------------------------------------------------------------------------- 1 | 21 | -------------------------------------------------------------------------------- /_site.yml: -------------------------------------------------------------------------------- 1 | name: EEB313H1 Theoretical Ecology and Reproducible Quantitative Methods in R 2 | output_dir: _site 3 | exclude: 4 | - DESCRIPTION 5 | - LICENSE 6 | new_session: true 7 | navbar: 8 | title: EEB313H1 9 | left: 10 | - text: Syllabus 11 | icon: fa-map-o 12 | href: index.html 13 | - text: Lectures 14 | icon: fa-book 15 | menu: 16 | - text: 'Sept. 10: Intro to course, programming, RStudio, and R Markdown' 17 | href: lec01-introduction.html 18 | - text: 'Sept. 12: Assignment, vectors, functions' 19 | href: lec02-basic-r.html 20 | - text: 'Sept. 17: Data frames, intro to dplyr' 21 | href: lec03-basic-r.html 22 | - text: 'Sept. 19: Data wrangling in dplyr, ggplot, tidy data' 23 | href: lec04-dplyr.html 24 | - text: 'Sept. 24: More dplyr and ggplot' 25 | href: lec05-dplyr.html 26 | - text: 'Sept. 26: Exploratory data analysis' 27 | href: lec06-exploratory-data-analysis.html 28 | - text: 'Oct. 01: Linear models and statistical modelling' 29 | href: lec07-linear-modelling.html 30 | - text: 'Oct. 03: Mixed effects models' 31 | href: lec08-linear-mixed-effects-models.html 32 | - text: 'Oct. 
08: Model Selection' 33 | href: lec09-model-selection.html 34 | - text: 'Oct. 10: Multivariate stats' 35 | href: lec10-multivariate-stats.html 36 | - text: 'Oct. 15: Spatial stats' 37 | href: lec11-spatial-stats.html 38 | - text: 'Oct. 17: Simulating data: Randomization tests' 39 | href: lec12-randomization-tests.html 40 | - text: 'Oct. 22 & 24: Mathematical models in EEB' 41 | href: lec13-theory.html 42 | - text: 'Oct. 29: Datasets, hypotheses, begin projects' 43 | href: lec14-datasets.html 44 | - text: 'Oct. 29 (cont.): Collaborating with GitHub' 45 | href: lec15-git-projects.html 46 | - text: 'Oct. 31: Project work (no lesson)' 47 | - text: 'Nov. 12: Project work (no lesson)' 48 | - text: 'Nov. 14: Project work (no lesson)' 49 | - text: 'Nov. 19: Project work (no lesson)' 50 | - text: 'Nov. 21: Project work (no lesson)' 51 | - text: 'Nov. 26: Project work (no lesson)' 52 | - text: 'Nov. 28: Project work (no lesson)' 53 | - text: 'Dec. 03: Project work (no lesson)' 54 | - text: 'Dec. 05: Group presentations (no lesson)' 55 | - text: Assignments 56 | icon: fa-book 57 | menu: 58 | - text: Assignment 1 59 | href: assignment-01.html 60 | - text: Assignment 2 61 | href: assignment-02.html 62 | - text: Assignment 3 63 | href: assignment-03.html 64 | - text: Assignment 4 65 | href: assignment-04.html 66 | - text: Assignment 5 67 | href: assignment-05.html 68 | - text: Assignment 6 69 | href: assignment-06.html 70 | - text: Assignment 7 71 | href: assignment-07.html 72 | - text: Mid-project update 73 | href: mid-project-update.html 74 | - text: Challenge assignment 75 | href: assignment-09-challenge.html 76 | - text: Final project 77 | href: assignment-final.html 78 | - text: Resources and FAQ 79 | icon: fa-question 80 | href: resources.html 81 | - text: About 82 | icon: fa-info 83 | href: about.html 84 | right: 85 | - icon: fa-bars 86 | menu: 87 | - text: Contact 88 | icon: fa-envelope 89 | href: mailto:ahmed.hasan@mail.utoronto.ca 90 | - text: GitHub 91 | icon: 
fa-github 92 | href: https://github.com/uoftcoders/rcourse 93 | 94 | output: 95 | html_document: 96 | toc: yes 97 | toc_depth: 4 98 | toc_float: 99 | collapsed: no 100 | smooth_scroll: no 101 | include: 102 | in_header: 103 | - _includes/header.html 104 | - _includes/style.html 105 | after_body: _includes/footer.html 106 | theme: lumen 107 | highlight: haddock 108 | mathjax: https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML 109 | lib_dir: site_libs 110 | self_contained: no 111 | 112 | -------------------------------------------------------------------------------- /_templates/lectures.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lecture title" 3 | --- 4 | 5 | 6 | 7 | 8 | 9 | ---- 10 | 11 | 12 | 13 | ## Lesson preamble 14 | 15 | 16 | 17 | ### Lesson objectives: 18 | 19 | 20 | 21 | - Item 1 22 | - Item 2 23 | 24 | ### Lesson outline: 25 | 26 | 27 | Total lesson time: {{num}} hours 28 | 29 | - Outline 1 ({{num}} min) 30 | - Outline 2 ({{num}} min) 31 | - Outline 3 ({{num}} min) 32 | 33 | 34 | 35 | ---- 36 | 37 | ## Header 2 ('section' header) 38 | 39 | 40 | 41 | ### Header 3 (slide title) 42 | 43 | ```{r} 44 | # R code chunk 45 | ``` 46 | 47 | #### Header 4 (slide block title) 48 | 49 | ## {{ Group | Individual }} exercise 50 | -------------------------------------------------------------------------------- /about.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "About the course and instructors/TAs" 3 | --- 4 | 5 | ## Course inspiration 6 | 7 | Based on [RQM](https://cbahlai.github.io/rqm-template/). Draw much inspiration and content from [Software Carpentry](https://software-carpentry.org/) and [Data Carpentry](http://www.datacarpentry.org/). 
8 | 9 | ## The course creators and instructors/TAs 10 | 11 | - Madeleine Bonsma-Fisher (Physics) 12 | - Lindsay Coome (Psychology) 13 | - Luke Johnston (Nutritional Sciences) 14 | - Sara Mahallati (IBBME) 15 | - Joel Östblom (IBBME) 16 | - Elliott Sales de Andrade (Physics) 17 | - Lina Tran (Physiology) 18 | - James Santangelo (EEB) 19 | - Ahmed Hasan (CSB) 20 | - Zoe Humphries (EEB) 21 | - Amber Hoi (EEB) 22 | 23 | Licensing information can be found in the [license](LICENSE.html) page. 24 | -------------------------------------------------------------------------------- /assignment-01.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Assignment 1: Intro to programming (4 marks)' 3 | output: 4 | html_document: 5 | toc: false 6 | --- 7 | 8 | *To submit this assignment, upload the full document on Quercus, 9 | including the original questions, your code, and the output. Submit 10 | your assignment as a knitted `.pdf` (preferred) or `.html` file.* 11 | 12 | 1. Get set up at home (or on a lab computer after hours). (1.5 marks) 13 | - Install [R](https://cran.rstudio.com/) and 14 | [RStudio](https://www.rstudio.com/products/rstudio/download/) 15 | (already installed on the lab computers). 16 | - Open a new R Notebook and read the instructions about how to use 17 | the R Markdown syntax. 18 | - Open this assignment file (`assignment-01.Rmd`) in RStudio or 19 | copy its content into an empty R Notebook. 20 | - Insert a code chunk below, above question 2. 21 | - In the code chunk, use `install.packages("<package_name>")` to 22 | install `tidyverse` and `rmarkdown`. Remember to run the code 23 | chunk to execute the commands. 24 | - Load the two libraries you just installed into your environment 25 | with `library(<package_name>)` (no surrounding quotation marks 26 | as with `install.packages()`). 
Add this to the same code chunk 27 | you created previously and execute it again (don't worry that 28 | the `install.packages()` commands have already been executed 29 | once; running them again simply reinstalls the packages, which 30 | is harmless). 31 | - Run `sessionInfo()` to list all the loaded packages. 32 | - You should see the following packages under "other attached 33 | packages": `rmarkdown`, `dplyr`, `purrr`, `readr`, `tidyr`, 34 | `tibble`, `ggplot2`, and `tidyverse`. 35 | - Since this is your first assignment, we have already completed 36 | most of this question below. You still need to run the code 37 | chunk on your computer to confirm that the packages installed 38 | without errors and to get the `sessionInfo()` output for your 39 | computer. You might receive warnings that functions from other 40 | packages are masked when you load `tidyverse`; this is fine. 41 | 42 | ```{r} 43 | install.packages("tidyverse") 44 | install.packages("rmarkdown") 45 | 46 | library(tidyverse) 47 | library(rmarkdown) 48 | 49 | sessionInfo() 50 | 51 | # Expected answer 52 | # The output is included below this code chunk with the appropriate 53 | # packages loaded. 54 | ``` 55 | 56 | 2. What is R Markdown and why are we using it in this class? *Hint:* 57 | You are using R Markdown right now to complete this assignment! (1 58 | mark) 59 | - Which key combination would you use to insert a chunk of code in 60 | an R Markdown document? 61 | - Which key combination would you use to execute a code chunk? 62 | 63 | 3. Provide a few reasons as to why it is beneficial to create documents 64 | like R Notebooks rather than using spreadsheet software for 65 | exploratory data analyses. (1 mark) 66 | 67 | 4. Fill out the pre-course survey posted on Quercus. Type your student number 68 | below to confirm that you are done. 
(0.5 marks) 69 | -------------------------------------------------------------------------------- /assignment-02.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Assignment 2: Base R (8 marks)' 3 | output: 4 | html_document: 5 | toc: false 6 | --- 7 | 8 | *To submit this assignment, upload the full document on Blackboard, 9 | including the original questions, your code, and the output. Submit 10 | your assignment as a knitted `.pdf` (preferred) or `.html` file.* 11 | 12 | 1. Variable assignment (1 mark) 13 | 14 | a. Assign the value `5` to the variable/object `a`. Display `a`. 15 | (0.25 marks) 16 | 17 | b. Assign the result of `10/3` to the variable `b`. Display `b`. 18 | (0.25 marks) 19 | 20 | c. Write a function that adds two numbers and returns their sum. 21 | Use it to assign the sum of `a` and `b` to `result`. Display `result`. 22 | (In practice, there is already a more sophisticated built-in 23 | function for this: `result <- sum(a, b)`) (0.25 marks) 24 | 25 | d. Write a function that multiplies two numbers and returns their product. 26 | Use it to assign the product of `a` and `b` to `product`. Display `product`. 27 | (In practice, there is already a more sophisticated built-in 28 | function for this: `product <- prod(a, b)`) (0.25 marks) 29 | 30 | 2. Vectors (1 mark) 31 | 32 | a. Create a vector `v` with all integers 0-30, and a vector `w` 33 | with every third integer in the same range. (0.25 marks) 34 | 35 | b. What is the difference in lengths of the vectors `v` and `w`? 36 | (0.25 marks) 37 | 38 | c. Create a new vector, `v_square`, with the square of elements at indices 39 | 3, 6, 7, 10, 15, 22, 23, 24, and 30 from the variable `v`. *Hint: 40 | Use indexing rather than a for loop.* (0.25 marks) 41 | 42 | d. Calculate the mean and median of the first five values from 43 | `v_square`. (0.25 marks) 44 | 45 | 3. Boolean indexing (1 mark) 46 | 47 | a. 
Create a boolean vector `v_bool`, indicating which elements of 48 | `v` are bigger than 20. How many values are over 20? *Hint: 49 | In R, TRUE = 1, and FALSE = 0, so you can use simple arithmetic 50 | to find this out.* (0.5 marks) 51 | 52 | b. Display the output of `v[TRUE]`. Explain why you think R outputs this. 53 | (0.25 marks) _(Note: this is not really something you would ever need 54 | to do in practice!)_ 55 | 56 | c. Use the variable `v_bool` as an index to extract the elements 57 | from `v` that are bigger than 20. What are the min and max 58 | values of this new vector? (0.25 marks) 59 | 60 | 4. Data frames (2 marks) 61 | 62 | a. There are many built-in data frames in R, which you can find 63 | [more details about 64 | online](https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html). 65 | What are the column names of the built-in data frame `beaver1`? 66 | How many observations (rows) and variables (columns) are there? 67 | (0.5 marks) 68 | 69 | b. Display both the first 6 and last 6 rows of this data frame. 70 | Show how to do so using both indexing and specialized functions. (0.5 marks) 71 | 72 | c. What are the min, mean, and max body temperatures in this data set? 73 | *Hint: Remember that each column in a data frame is a vector, so 74 | you can use the same functions as in the previous question on 75 | vectors.* (0.5 marks) 76 | 77 | d. Use the `summary` function to display an overview of the `temp` 78 | column. (0.25 marks) 79 | 80 | e. Use a single instance of the `summary` function to display an overview 81 | of the `time` and `temp` columns. (0.25 marks) 82 | 83 | 5. Data frames with dplyr (3 marks) 84 | 85 | a. Say we're attempting to calculate mean temperature in the `beaver1` dataset. 86 | What is wrong with the following chain of dplyr commands? (0.5 marks) 87 | ``` 88 | beaver1 %>% 89 | filter(is.na(temp)) %>% 90 | summarise(mean_temp = mean(temp)) 91 | ``` 92 | 93 | b. 
Use dplyr to randomly sample 20 rows from `beaver1`. Calculate 94 | mean temperature from this subsetted dataset. (0.5 marks) 95 | _Hint: you may want to refer to the dplyr cheatsheet for this_ 96 | 97 | b. Using the full `beaver1` dataset, calculate the mean temperature 98 | for day 346. (0.25 marks) 99 | _Note: use the full dataset for parts c-f below as well._ 100 | 101 | c. Rather than using `filter()` to calculate the mean for each day 102 | separately, the more convenient `group_by()` can be used to 103 | aggregate measurements by a categorical value (such as the `day` 104 | column in `beaver1`). Use this approach to calculate the mean 105 | temperature and activity level for each of the days in the 106 | dataset. (0.5 marks) 107 | 108 | d. Express in writing what the average activity level from the 109 | above calculation means. *Hint: Remember that you can [read a 110 | description of the columns 111 | online](https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html).* 112 | (0.25 marks) 113 | 114 | e. How many observations are there per day in this dataset? (0.25 115 | marks) 116 | 117 | f. How many observations are there per day when the beaver is 118 | active outside the retreat? (0.25 marks) 119 | 120 | g. Grouping by activity level *and* the day of the observation. 121 | Which variable seems to be more related to high body 122 | temperature: activity level or day of measurement? (0.5 marks) 123 | -------------------------------------------------------------------------------- /assignment-03.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Assignment 3: dplyr and ggplot (8 marks)' 3 | output: 4 | html_document: 5 | toc: false 6 | --- 7 | 8 | *To submit this assignment, upload the full document on blackboard, 9 | including the original questions, your code, and the output. Submit 10 | your assignment as a knitted `.pdf` (preferred) or `.html` file.* 11 | 12 | 1.
Plotting (1 mark) 13 | 14 | Run the block below to create a categorical variable of the `activ` 15 | column. This will make dplyr recognize that there are only two 16 | levels of activity (0 and 1), rather than a continuous range 0-1, 17 | which will facilitate plotting. 18 | 19 | ```{r} 20 | library(tidyverse) 21 | beaver1 <- beaver1 %>% 22 | mutate(factor_activ = factor(activ)) 23 | ``` 24 | 25 | a. In the previous assignment, we saw that the beaver's body 26 | temperature was the highest when the beaver was outside the 27 | retreat. However, we did not explore the distribution of 28 | temperatures for the active and inactive conditions. Create a 29 | histogram with the temperature on the x-axis and color the bins 30 | corresponding to the activity variable. *Hint: You need to use 31 | the `fill` parameter rather than `color`; and make sure you are 32 | using the correct `activ` column!* (0.25 marks) 33 | 34 | b. We already know that the beaver's body temperature is correlated 35 | with whether it is outside the retreat or not. However, we did 36 | not control for the time of day, maybe the beaver's temperature 37 | is even better predicted by knowing what time of day it is. To 38 | satisfactorily answer this question, we should perform a 39 | regression analysis, but we easily can get a good overview by 40 | plotting the data. Make a scatter plot with the time of day on 41 | the x-axis and the body temperature on the y-axis. Color the 42 | scatter points according to the beaver's activity level and 43 | separate the measurements into one plot per day. *Hint: To 44 | separate measurements per day, you could use `filter()` and two 45 | chunks of code, but try the more efficient way of facetting into 46 | subplots, which we talked about in the lecture.* (0.75 marks) 47 | 48 | 2. Read in and pre-process data (1.5 marks) 49 | 50 | Ok, that's enough about beaver body temperatures.
Now you will apply 51 | your data wrangling skills on the yearly change in biomass of plants 52 | in the [beautiful Abisko national park in northern 53 | Sweden](https://en.wikipedia.org/wiki/Abisko_National_Park). We have 54 | preprocessed this data and made [it available as a csv file via this 55 | link](https://uoftcoders.github.io/rcourse/data/plant-biomass-preprocess.csv). 56 | You can find the original data and a short readme on 57 | [figshare](https://figshare.com/articles/Time_Series_of_plant_biomass_during_1998-2013/4149648) 58 | and [dryad](https://datadryad.org/resource/doi:10.5061/dryad.38s21). 59 | The original study[^1] is available with an open access license. 60 | Reading through the readme on figshare, and the study abstract will 61 | increase your understanding for working with the data. 62 | 63 | a. Read the data directly from the provided URL into a variable 64 | called `plant_biomass` and display the first six rows. (0.25 65 | mark) 66 | 67 | b. Convert the Latin column names into their common English names: 68 | lingonberry, bilberry, bog bilberry, dwarf birch, crowberry, and 69 | wavy hair grass. After this, display all column names. *Hint: 70 | Search online to find out which Latin and English names pair up. 71 | There is a function in the `dplyr` cheat sheet that might help you 72 | rename these columns. Finally, check the [tidyverse style 73 | guide](http://style.tidyverse.org/syntax.html#object-names) to make 74 | sure your new column names are formatted correctly.* (0.5 marks) 75 | 76 | c. This is a wide data frame (species make up the column names). A 77 | long format is easier to analyze, so gather the species names 78 | into one column (`species`) and the measurement values into 79 | another column (`biomass`). Assign it to the variable 80 | `plant_biomass` to overwrite the previous data frame. Make 81 | sure you don't lose any columns in the reshaping process! 
82 | *Hint: Make sure the output is correct before overwriting the 83 | old variable.* (0.75 marks) 84 | 85 | 3. Data exploration (4.5 marks) 86 | 87 | Now that our data is in a tidy format, we can start exploring it! 88 | 89 | a. What is the average biomass in g/m^2 for all observations in 90 | the study? (0.25 marks) 91 | 92 | b. How does the average biomass compare between the grazed control 93 | sites and those that were protected from herbivores? (0.25 94 | marks) 95 | 96 | c. Display a table of the average plant biomass for each year. 97 | (0.25 marks) 98 | 99 | d. What is the mean plant biomass per year for the `grazedcontrol` 100 | and `rodentexclosure` groups (spread these variables as separate 101 | columns in a table). (0.5 marks) 102 | 103 | e. Compare the biomass for `grazedcontrol` with that of 104 | `rodentexclosure` graphically in a line plot. What could explain 105 | the big dip in biomass in year 2012? *Hint: The published study 106 | might be able to help with the second question...* (0.5 marks) 107 | 108 | f. How many distinct species are there? (0.25 marks) 109 | 110 | g. Check whether there is an equal number of observations per 111 | species. (0.25 marks) 112 | 113 | h. Compare the yearly change in mean biomass for each species in a 114 | line plot. (0.5 marks) 115 | 116 | i. From the previous two questions, we found that the biomass is 117 | higher in the sites with rodent exclosures (especially in recent 118 | years), and that the crowberry is the dominant species. Notice 119 | how the lines for `rodentexclosure` (refer back to 3.d above) 120 | and `crowberry` are of similar shape. Coincidence? Let's find out! 121 | Use a facetted line plot to explore whether all plant species are 122 | impacted equally by grazing. (0.75 mark) 123 | 124 | j. The habitat could also be affecting the biomass of different 125 | species. Explore graphically if this is the case.
*Hint: Think 126 | about how to change your dataset groupings to make this plot* 127 | (0.5 marks) 128 | 129 | k. It looks like both habitat and treatment have an effect on most 130 | of the species! Let's dissect the data further by visualizing 131 | the effect on each species of _both_ the habitat and treatment by 132 | facetting the plot accordingly. *Hint: This is a hard one! You may want 133 | to explore R's documentation for ggplot's `facet_grid`* (0.5 marks) 134 | 135 | 4. Create a new column that represents the square of the biomass. 136 | Display the three largest `squared_biomass` observations in 137 | descending order. Only include the columns `year`, `squared_biomass` 138 | and `species` and only observations between the years 2003 and 2008 139 | from the forest habitat. *Hint: Break this down into single criteria 140 | and add one at a time. You will be able to obtain the desired result 141 | with five operations.* (1 mark) 142 | 143 | [^1]: Olofsson J, te Beest M, Ericson L (2013) Complex biotic 144 | interactions drive long-term vegetation dynamics in a subarctic 145 | ecosystem. Philosophical Transactions of the Royal Society B 146 | 368(1624): 20120486. 147 | -------------------------------------------------------------------------------- /assignment-04.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Assignment 4: Exploration, linear and mixed-effects models' 3 | output: 4 | html_document: 5 | toc: false 6 | --- 7 | 8 | *To submit this assignment, upload the full document on blackboard, 9 | including the original questions, your code, and the output. Submit 10 | your assignment as a knitted `.pdf` (preferred) or `.html` file.* 11 | 12 | 1. Visualization (3 marks) 13 | 14 | Import the tidyverse library. We will be using the same beaver1 dataset that 15 | we used in last week's assignment. 16 | 17 | ```{r message=FALSE, warning=FALSE} 18 | library(tidyverse) 19 | ``` 20 | 21 | a.
Create a histogram to visualize the distribution of the beavers' body 22 | temperatures, separating the temperature data based on the beaver's activity level. 23 | (after transforming it into a categorical variable the way you did for your 24 | last assignment). Describe the properties of the distribution. When 25 | creating this plot for the purpose of evaluating temperature, what 26 | argument did you adjust and why? (1 mark) 27 | 28 | b. What type of variables are temperature and time of day? With this in 29 | mind, create a visualization that will help you get a better understanding 30 | of the relationship between these variables. (0.5 mark) 31 | 32 | c. Create a single box plot to simultaneously visualise temperature, 33 | activity, and day. (0.5 mark) 34 | 35 | d. What is one prediction you might make about the relationships among your 36 | variables (based on the patterns you observed)? Create a visualization that 37 | illustrates your prediction, improving on your previous plots in at least one 38 | way. State why this plot is an improvement. (1 mark) 39 | 40 | 2. Outliers (2 marks) 41 | 42 | a. In the beaver1 dataset, there are some particularly high/low body 43 | temperature measurements. Give an example of a systematic or random error 44 | (state which) that could have influenced these values. (0.5 marks) 45 | 46 | b. Consider whether these values would affect your ability to test whether 47 | temperature varies by activity level. You should generate plots and/or 48 | perform statistical tests with and without these points, and make an 49 | informed decision about whether they should be kept or dropped (Hint: you 50 | may want to either create a second data set or get creative with colour.) 51 | State whether you would remove the points and why. (1.5 marks) 52 | 53 | 3. Linear models (3 marks) 54 | 55 | Run the following code to load the CO2 dataset. 
56 | 57 | ```{r} 58 | co2_df <- as_data_frame(as.matrix(CO2)) %>% 59 | mutate(conc = as.integer(conc), 60 | uptake = as.numeric(uptake)) 61 | ``` 62 | 63 | a. Look through the help documentation (?CO2) to understand what each 64 | variable means. Imagine you were running a statistical model to assess the 65 | effects of chilling on plant CO2 uptake. What would the $y$ and $x$ 66 | variables be in such a model? What about if you were trying to assess the 67 | relationship between ambient CO~2~ concentrations and plant uptake? Briefly 68 | defend these choices. (1 mark) 69 | 70 | b. How much does `uptake` change if `conc` goes up by 10 mL/L? Write out the 71 | interpretation as a simple statement of this contribution of `conc` on 72 | `uptake`. How much CO2 would you predict plants to uptake if atmospheric 73 | concentrations were 2,450 mL/L?. Show your work. (2 marks) 74 | 75 | 4. Linear mixed-effects models (4 marks). 76 | 77 | Santangelo _et al._ (2018) were interested in understanding how plant 78 | defenses, herbivores, and pollinators influence the expression of plant 79 | floral traits (e.g. flower size). Their experiment had 3 treatments, each 80 | with 2 levels: Plant defense (2 levels: defended vs. undefended), herbivory 81 | (2 levels: reduced vs. ambient) and pollination (2 levels: open vs. 82 | supplemental). These treatments were fully crossed for a total of 8 83 | treatment combinations. In each treatment combination, they grew 4 84 | individuals from each of 25 plant genotypes for a total of 800 plants (8 85 | treatment combinations x 25 genotypes x 4 individuals per genotype). Plants 86 | were grown in a common garden at the Koffler Scientific Reserve (UofT's field 87 | research station) and 6 floral traits were measured on all plants throughout 88 | the summer. We will analyze how the treatments influenced one of these 89 | traits in this exercise. 
Run the code chunk below to download the data, 90 | which includes only a subset of the columns from the full dataset: 91 | 92 | ```{r} 93 | library(tidyverse) 94 | 95 | plant_data <- "https://uoftcoders.github.io/rcourse/data/Santangelo_JEB_2018.csv" 96 | download.file(plant_data, "Santangelo_JEB_2018.csv") 97 | plant_data <- read_csv("Santangelo_JEB_2018.csv", 98 | col_names = TRUE) 99 | glimpse(plant_data) 100 | head(plant_data) 101 | ``` 102 | 103 | You can see that the data contain 792 observations (i.e. plants, 8 died 104 | during the experiment). There are 50 genotypes across 3 treatments: 105 | Herbivory, Pollination, and HCN (i.e. hydrogen cyanide, a plant defense). 106 | There are 6 plant floral traits: Number of days to first flower, banner 107 | petal length, banner petal width, plant biomass, number of flowers, and 108 | number of inflorescences. Finally, since plants that are closer in space in 109 | the common garden may have similar trait expression due to more similar 110 | environments, the authors included 6 spatial "blocks" to account for this 111 | environmental variation (i.e. Plants from block A "share" an environment and 112 | those from block B "share" an environment, etc.). Also keep in mind that 113 | each treatment combination contains 4 individuals of each genotype, which 114 | are likely to have similar trait expression due simply to shared genetics. 115 | 116 | a. Use the `lme4` and `lmerTest` R packages to run a linear mixed-effects 117 | model examining how herbivores (`Herbivory`), Pollinators (`Pollination`), 118 | plant defenses (`HCN`) _and all interactions_ influence the width of 119 | banner petals (`Avg.Bnr.Wdth`) produced by plants while accounting for 120 | variation due to spatial block and plant genotype. Also allow the intercept 121 | for `Genotype` to vary across the levels of the herbivory treatment. (1 122 | mark: 0.5 for correct fixed effects specification and 0.5 for correct random 123 | effects structure).
You only need to specify the model for this part of the 124 | question. 125 | 126 | b. Summarize (i.e. get the output) the model that you ran in part (a). Did 127 | any of the treatments have a significant effect on banner petal width? If 128 | so, which ones? Based on your examination of the model output, how can you 129 | tell which level of the significant treatments resulted in longer or shorter 130 | mean banner petal widths? Make a statement for each significant **main** 131 | effect in the model (i.e. not interactions). If none of the main effects 132 | are significant, then simply write "there are no significant main effects in 133 | the model" (0.5 marks). 134 | 135 | c. Using `dplyr` and `ggplot2`, plot the mean banner width for one of the 136 | significant interactions in the model above (whichever you choose). The idea 137 | is to show how both treatments interact to influence the mean width of 138 | banner petals using a combination of different colours, linetypes, shapes, 139 | etc. on the same plot (i.e., no faceting). Avoid overlapping points in the 140 | figure. Also include error bars/bands with one standard error around the 141 | mean. As a reminder, I have included the formula to calculate the standard 142 | error of the mean below. (1.5 marks). 143 | 144 | $$ SE = \frac{sd}{\sqrt{n}} $$ 145 | 146 | d. After accounting for the fixed effects, what percentage of the variation 147 | in banner petal width was explained by each of the random effects in the 148 | model? Show your work. (0.5 marks). 149 | 150 | e. Describe the pattern you see in the figure generated in part (c). Why do 151 | you think the interaction you plotted was significant in the model?
(0.5 marks) 152 | 153 | 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /assignment-05.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Assignment 5: Model selection and multivariate statistics (9 marks)' 3 | output: 4 | html_document: 5 | toc: false 6 | --- 7 | 8 | *To submit this assignment, upload the full document on blackboard, 9 | including the original questions, your code, and the output. Submit 10 | your assignment as a knitted `.pdf` (preferred) or `.html` file.* 11 | 12 | 1. In this exercise, we will once again use the data of Santangelo _et al._ 13 | (2019) that you used in assignment 4. Let's go ahead and load in the data. See 14 | assignment 4 if you need a refresher on the details of the experiment and the 15 | dataset. (5 marks total) 16 | 17 | ```{r message=FALSE, warning=FALSE} 18 | library(tidyverse) 19 | library(lme4) 20 | library(lmerTest) 21 | 22 | 23 | Santangelo_data <- "https://uoftcoders.github.io/rcourse/data/Santangelo_JEB_2018.csv" 24 | download.file(Santangelo_data, "Santangelo_JEB_2018.csv") 25 | Santangelo_data <- read_csv("Santangelo_JEB_2018.csv", 26 | col_names = TRUE) 27 | glimpse(Santangelo_data) 28 | head(Santangelo_data) 29 | ``` 30 | 31 | 32 | a. Model selection works best when there are no missing values in your 33 | dataset. We will be identifying the best model that predicts variation in 34 | flowering time (`Flower.date`) across plants. Create a dataset that excludes 35 | rows where there is missing flowering date data. (0.25 marks) 36 | 37 | b. We want to know how HCN, herbivores, and pollinators influence flowering 38 | date. We also think that the effect of herbivores and pollinators on 39 | flowering date might depend on whether the plant is producing HCN. Create a 40 | model that includes fixed-effects that test these predictions.
Be sure to 41 | account for variation due to `Genotype`, `Block` and the `Genotype` by 42 | `Herbivory` interactions by including these terms as random effects. This 43 | will be our saturated model. You can ignore `boundary (singular) fit` 44 | warnings that may arise. (1 mark) 45 | 46 | c. We will generate a reduced model from the saturated model in (b). Should 47 | we use AIC or AIC~c~? Why? Show your calculation. (0.5 marks) 48 | 49 | d. Using the approach described in lecture 11, optimize the random effect 50 | structure of this model. Show the AIC/AIC~c~ output for each model of 51 | varying random effect structure. Provide a one sentence justification for 52 | each random effect in the model, justifying whether it is fixed (i.e., in every 53 | model) or whether some models will drop this effect. Describe in one 54 | sentence what the optimal random effect structure of the model is and why. 55 | (0.5 marks) 56 | 57 | e. Using the model with the optimal random-effect structure identified in 58 | (d), find the optimal fixed-effect structure. Be sure to show all the models 59 | and their AIC/AIC~c~ scores. (1.5 marks) 60 | 61 | f. Based on the AIC/AIC~c~ output from (e), generate your final model with 62 | both optimized fixed and random effects. Summarize the model and interpret 63 | its output. Is there a significant effect of any treatment? If so, which 64 | one(s) and in which direction. Make a statement about the significant 65 | treatments' effects on flowering date. Use the model's output to support 66 | your answer. You only need to interpret significant main effects here (i.e. 67 | not interactions). (0.75 marks) 68 | 69 | g. Do you think we were justified in interpreting a single model? Why or why 70 | not? What alternative approach could we have used? (0.25 marks). 71 | 72 | h. Use `dplyr` and `ggplot2` to plot the flowering date of plants by the 73 | _main_ effect that showed a significant effect in the optimized model above.
74 | The figure should show the mean plus and minus a single standard error of 75 | the mean. Suggest one biological interpretation of the pattern you see in 76 | the figure and in the model (i.e. why do you think this would happen). If 77 | there are no significant effects in the model, simply write "There are no 78 | significant effects!". (0.25 marks) 79 | 80 | 2. During the multivariate statistics lecture, we made use of vector community 81 | and malaria survey data collected by Mbogo _et al._ (2003) to disentangle the 82 | effects of vector abundance, species richness, and composition, on malaria 83 | prevalence (see path diagram in lecture 10 for a reminder of these 84 | relationships). In this exercise, we will complete the analysis of the 85 | structural equation model we began building in class. (1.5 marks total) 86 | 87 | Here are some relevant snippets of code taken from the lecture notes to get you 88 | started on this exercise. 89 | 90 | ```{r eval=FALSE} 91 | 92 | library(lavaan) 93 | 94 | kenya.wide <- read.csv("kenya.wide.csv", header=TRUE, sep=",") 95 | 96 | kenya.pca <- kenya.wide %>% 97 | dplyr::select(arabiensis, gambiae, funestus, merus) %>% #choose relevant columns 98 | mutate_all(sqrt) %>% #this is the Hellinger transformation 99 | prcomp(.) #pipe directly into baseR function for PCA 100 | 101 | axes <- data.frame(kenya.pca$x) 102 | kenya.wide <- bind_cols(kenya.wide, axes) 103 | 104 | kenya.wide$s.abun <- log(kenya.wide$total.abundance) 105 | kenya.wide$s.sr <- log(kenya.wide$SR) 106 | kenya.wide$s.pfpr <- log(kenya.wide$PfPR) 107 | 108 | sem02 <- ' 109 | # regressions 110 | l.pfpr ~ a*l.sr + b*l.abun + c*PC2 111 | # correlations 112 | l.sr ~~ d*l.abun 113 | PC2 ~~ e*l.sr 114 | # defined parameters 115 | indirect.abun := (a*d) #indirect effect of abundance via SR 116 | indirect.abun2 := (d*e*c) #indirect effect of abundance via PC2 117 | total.abun := b + (a*d) + (d*e*c) 118 | ' 119 | 120 | ``` 121 | 122 | a.
Complete the structural equation model by adding in calculations for the 123 | indirect and total effects of species richness (SR) and composition (PC2) on 124 | malaria prevalence (PfPR). (0.5 marks) 125 | 126 | b. Evaluate the model, bootstrapping confidence intervals for path 127 | coefficients with seed #778. Which predictor had the largest _direct_ effect 128 | on malaria prevalence? How about _total_ effect? Briefly explain these 129 | effects in plain English (1 sentence each). (1 mark) 130 | 131 | 132 | 3. In this exercise, we will be investigating the relationship between vector 133 | community structure and another commonly used metric of disease risk -- 134 | entomological inoculation rate (EIR). EIR is a measure of the number of bites by 135 | infectious mosquitoes per person per unit time. We will be making use of the 136 | same data from Mbogo _et al._ (2003) as before, only this time, we will start 137 | with the long form data. (2.5 marks total) 138 | 139 | ```{r eval=FALSE} 140 | 141 | kenya.long <- read.csv("kenya.long.csv", header=TRUE, sep=",") 142 | 143 | ``` 144 | 145 | This dataset consists of the same information as kenya.wide, with the addition 146 | of one new column for "EIR". 147 | 148 | a. Convert this dataset to the wide format. Fill cells in the wide dataset 149 | with the **relative abundance** of each species, and include the columns 150 | "total abundance" and "EIR" in the final product. (Hint: use xxxx_join to 151 | add the desired columns to the wide dataset after you spread it) (Hint2: 152 | pivot_wider() may be easier to use than spread()) (1 marks) 153 | 154 | b. Construct a series of linear models to investigate the relationship 155 | between EIR and i) total mosquito abundance and ii) the abundance of each 156 | species. Interpret the results of these models. (Hint: is EIR a simple 157 | function of total mosquito abundance, or is there a particular species that 158 | is contributing disproportionately to it?)
(0.5 mark) 159 | 160 | c. Investigate the influence of total abundance and community structure (use 161 | the first two PC axes) on EIR with a structural equation model. Include only 162 | direct effects in this model, and pretend we have reason to believe 163 | total abundance is associated with community composition. 164 | i. Briefly explain the correlation structure you have chosen for your 165 | predictors, total abundance, PC1, and PC2. (0.5 marks) 166 | ii. Evaluate the model. Are these results congruent with your findings 167 | from part (b)? (0.5 marks) 168 | 169 | -------------------------------------------------------------------------------- /assignment-06.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Assignment 6: Spatial statistics, simulating data, and randomization tests (8 marks)' 3 | output: 4 | html_document: 5 | toc: false 6 | --- 7 | 8 | 9 | *To submit this assignment, upload the full document on blackboard, 10 | including the original questions, your code, and the output. Submit 11 | your assignment as a knitted `.pdf` (preferred) or `.html` file.* 12 | 13 | 1. In this exercise, we will continue to use the vector community and malaria survey data collected by Mbogo _et al._ (2003) (i.e., `kenya.wide.csv`). We will inspect whether spatial autocorrelation affects our inference of the effects of vector abundance, species richness, and composition, on malaria prevalence. We will also investigate which environmental factors underpin the distribution of mosquitoes, all in the context of space. (4 marks). 14 | 15 | a. Compute Moran's autocorrelation statistic to assess whether mosquito abundance and species richness, and malaria prevalence are correlated in space. (1 mark) 16 | 17 | b. Extract annual average temperature and rainfall data from WorldClim2 using the raster files provided in class.
Create maps (see appendix to lecture notes) to show the variation in these climatic factors across the sites. (1 mark) 18 | 19 | c. Investigate whether rainfall influences mosquito abundance across sites. Your complete analysis should include formally testing whether the temperature and rainfall patterns across sites are correlated in space (construct a variogram and interpret when necessary), regression models with the appropriate autocorrelation structure, and an interpretation of your model outputs. Feel free to make use of additional plots to help explain your findings (2 marks) 20 | 21 | 22 | 2. Simulating data (2 marks) 23 | 24 | a. Generate a gamma distribution by randomly sampling 30 points from a distribution with shape parameter equal to 1.35 and rate parameter equal to 0.5. Plot this distribution. Set a seed of 42. (0.5 mark). 25 | 26 | b. Plot the distribution of sample means obtained by generating 5000 gamma distributions with the same parameters as in (a). In other words, the distribution should be made up of 5000 means, each from a different simulated gamma distribution. Set a seed of 43. What do you notice about this distribution when compared to the original distribution in (a)? Why would we expect this? (0.5 marks) 27 | 28 | c. In this exercise you will simulate a multiple regression. Remember, multiple regression means that there is more than one explanatory (aka predictor, independent) variable for a given response variable. Multiple regression thus estimates a separate effect (i.e. _beta_) for each explanatory variable in the model, while holding the other variables constant. This exercise is only a slight extension of the model that we simulated in lecture. Simulate a model that satisfies the conditions below and show the model output using `summary()`. Set a seed of 44. (1 mark). 29 | 30 | 1. `x1` is an explanatory variable with _sequence_ from 51 to 70 with 1 unit intervals between each value (i.e. 20 values total). 31 | 2.
`x2` is an explanatory variable of length 20 randomly drawn from a normal distribution with mean equal to 62 and standard deviation equal to 2.7. 32 | 2. `x3` is an explanatory variable of length 20 randomly drawn from a gamma distribution with shape equal to 5 and rate equal to 0.5. 33 | 3. the `y_intercept` is 22 34 | 4. The beta associated with `x1` is 0.62. 35 | 5. The beta associated with `x2` is 0.047 36 | 6. The beta associated with `x3` is 0.185 37 | 6. The error is drawn from a normal distribution with mean equal to 0 and standard deviation equal to 1.65. 38 | 7. `y` is a linear combination of `x1`, `x2` and `x3`. There are no interactions. 39 | 40 | 3. Randomization test (2 marks) 41 | 42 | Run the code chunk below to load the data that will be used in this exercise. 43 | 44 | ```{r message=FALSE, warning=FALSE} 45 | library(tidyverse) 46 | df <- "https://uoftcoders.github.io/rcourse/data/Assign05_Question3.csv" 47 | download.file(df, "Assign05_Question3.csv") 48 | df <- read_csv("data/Assign05_Question3.csv", 49 | col_names = TRUE) 50 | glimpse(df) 51 | ``` 52 | 53 | a. Generate a histogram showing the null distribution of t-statistics between groups one and two from the `df` dataframe that you just loaded. The null distribution should be based on 5,000 reshufflings of the data. (1 mark). Overlay onto this histogram the observed t-statistic as a dashed vertical line. Set a seed of 45. **Hint:** t-tests return list objects that can be indexed using `$` 54 | 55 | b. Perform a permutation test testing whether the observed t statistic between groups one and two is different than what would be expected by chance. Include a statement about whether there is a significant difference between groups based on your permutation test and be sure to include the P-value. How does this P-value compare to one obtained from a simple t-test? Why?
(1 mark) 56 | 57 | -------------------------------------------------------------------------------- /assignment-07.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Assignment 7: Modelling (9 marks)' 3 | output: 4 | pdf_document: default 5 | html_document: default 6 | --- 7 | 8 | ```{r} 9 | 10 | library(deSolve) 11 | 12 | ``` 13 | 14 | 15 | 1. In this exercise, we will be working with the Lotka-Volterra competition 16 | model introduced in class. Use the following parameter set in this question. (3 17 | marks) 18 | 19 | - `r` = 0.5 20 | - `K` = 1000 21 | - `alpha12` = 3 22 | - `alpha21` = 1.5 23 | 24 | 25 | a. Solve the Lotka-Volterra competition model and plot the trajectory of the 26 | population for 100 time steps. Briefly explain the population dynamics of these 27 | species. (1 mark) 28 | 29 | b. Catastrophe hits population 2 at t=10, such that its numbers were 30 | drastically decreased to a quarter of what they were (hint: at t=10, N2 reset 31 | to 1/4 its non-catastrophe value)! Use a simulation to show the trajectory of 32 | these species over the 100 time steps. Explain what you saw. (2 marks) 33 | 34 | 35 | 2. In this exercise, we will be working with the malaria dynamics model we 36 | worked with in class, and we will be thinking of ways in which we can 37 | "implement" various mosquito control methods in this hypothetical population. 38 | For each of the following mosquito control strategies, describe how you would 39 | implement them in terms of math. For example, you may wish to modify some 40 | parameter, add a parameter, or outright change the structure of the model. 41 | Explain your choices, and include any new parameters or new equations where 42 | applicable (e.g., include an equation to show where a new parameter would 43 | appear). (3 marks) 44 | 45 | a. Use of insecticide to kill off adult mosquitoes. (1 mark) 46 | b. Use of bednet to reduce contact between mosquito and host.
(1 mark) 47 | c. Provide hosts with vaccines. (1 mark) 48 | 49 | 50 | 3. The Allee Effect (3 marks) 51 | 52 | Generally, as population size increases, a population will 53 | experience a decreased growth rate due to greater competition for 54 | resources. This is a negative density-dependent growth rate, and one 55 | example of this is the logistic model. 56 | 57 | The Allee effect introduces positive density dependence, where 58 | increases in population size result in increased growth rates over a 59 | certain range of population sizes. One way to incorporate 60 | the Allee effect into the logistic growth equation is as follows: 61 | 62 | $$\frac{dN}{dt} = rN\left(1-\frac{N}{K}\right)\left(\frac{N-A}{K}\right)$$ 63 | 64 | Here $r$ represents the growth rate of the population, $K$ is the 65 | carrying capacity, and $A$ is the critical population size above which 66 | the total growth rate is positive. 67 | 68 | a. Take $r=1$, $A=10$, and $K=50$. Plot $\frac{dN}{dt}$ vs. $N$ for 69 | $0 \le N \le 55$. For which values of $N$ is the growth rate 70 | ($\frac{dN}{dt}$) positive or negative? (0.5 marks) 71 | 72 | b. Plot the **per capita** growth rate ($\frac{1}{N}\frac{dN}{dt}$) 73 | vs. $N$ for this model of the Allee effect and for the logistic growth 74 | model: $\frac{dN}{dt} = rN(1-\frac{N}{K})$. (1 mark) 75 | 76 | c. What do you notice about the density ($N$) dependence of the per capita 77 | growth rate in each case? Hint: in the logistic model, the growth rate **per 78 | capita** (per organism) decreases in a straight line as $N$ increases. (0.5 79 | marks) 80 | 81 | d. What happens to the Allee effect as $A$ decreases? Plot curves for $A=0$ 82 | and a few values of $A>0$. (0.5 marks) 83 | 84 | e. Describe two biological situations in which you might expect to see an Allee 85 | effect (either weak or strong).
(0.5 marks) 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /assignment-09-challenge.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Challenge assignment (14.5 marks)" 3 | output: 4 | html_document: 5 | toc: false 6 | --- 7 | 8 | *To submit this assignment, upload the full document on Quercus, 9 | including the original questions, your code, and the output. Submit 10 | your assignment as a knitted `.pdf` (preferred) or `.html` file.* 11 | 12 | 13 | Part of being an effective scientist involves being able to solve problems you 14 | have not encountered before. This is certainly true of programming as well, 15 | where problems are typically solved by furious bouts of Googling, reading 16 | documentation, and trial and error of proposed solutions. In this assignment, 17 | like previous ones, you will be evaluated on your ability to solve data 18 | manipulation and analysis tasks. However, unlike previous assignments, some of 19 | the solutions to the problems will require more research and effort on your 20 | part. It may require the use of packages and techniques not explored in class, 21 | but all problems are solvable, often with only a few lines of code. By now, you 22 | should all have the terminology required to search for solutions to the problems 23 | below. As with all programming problems, there are many possible ways to get the 24 | answer to the problems below. 25 | 26 | 1. Simpson's diversity index and bootstrapping (6 marks) 27 | 28 | In lecture 12, we used data from the National Ecological Observatory Network on 29 | the abundance and percent cover of plant species across sites in the Harvard 30 | Forest from the year 2017. Run the code chunk below to read in the data if you 31 | do not already have it.
32 | 33 | ```{r message=FALSE} 34 | library(tidyverse) 35 | neon_data <- "https://uoftcoders.github.io/rcourse/data/NEON_PlantPA_HARV_201707.csv" 36 | download.file(neon_data, "NEON_PlantPA_HARV_201707.csv") 37 | neon_data <- read_csv("NEON_PlantPA_HARV_201707.csv", 38 | col_names = TRUE) 39 | ``` 40 | 41 | a. Using the raw NEON data, create a matrix with plant species as rows and sites 42 | as columns. The cell values should represent the abundance of each species at a 43 | given site. (1 mark) 44 | 45 | b. Write a function that takes a single numeric vector as an argument and 46 | returns Simpson's Index of Diversity for the numeric vector. Test your 47 | function by computing Simpson's diversity index on the `test_vector` in the 48 | code chunk below. Be sure to report the Simpson's index for `test_vector` in 49 | your final assignment. As a reminder, I have included the formula for 50 | Simpson's index below: 51 | 52 | ```{r} 53 | test_vector <- c(0, 1, 0, 5, 0, 1, 0, 4, 3, 0, 0, 0, 0, 1, 4, 0, 5, 0, 54 | 3, 0, 11, 2, 19, 0, 11, 0, 0, 0, 0, 0) 55 | ``` 56 | 57 | 58 | $$D = 1 - \sum_{i = 1}^{s}(p_i)^2$$ 59 | 60 | where `s` is the species richness (i.e. number of species) and `p_i` is the 61 | relative abundance of species _i_. A `D` value of 0 represents no diversity 62 | and a D value of 1 represents infinite diversity. (1 mark) 63 | 64 | c. In lecture 12, we discussed how resampling techniques can be used to 65 | conduct hypothesis tests (i.e., permutation tests) by comparing an observed 66 | population parameter (e.g., mean, median) to a null distribution of that 67 | parameter generated by resampling your observed data _without_ replacement 68 | many times. Resampling can additionally be used to generate confidence 69 | intervals around a population parameter (e.g., mean, median, slope) or other 70 | statistic (e.g., C-score) using a technique known as _bootstrapping_.
71 | Bootstrapping allows us to estimate the true distribution of a statistic in 72 | cases where it is unknown, which we can use to estimate our uncertainty 73 | (e.g., Standard Error, Confidence Intervals) in a population parameter. 74 | Importantly, this applies _regardless of the shape of the distribution_, 75 | although some adjustments often have to be made for strongly skewed 76 | distributions. While there are a few different types of bootstrapped 77 | confidence intervals (e.g., bias-corrected and accelerated, _t_ with 78 | bootstrap), in this exercise, you will write a function that calculates the 79 | simple 95% percentile bootstrapped confidence interval from a given numeric 80 | vector. Your function should have the following properties: 81 | 82 | 1. It should take in two arguments: A numeric vector and an integer 83 | representing the number of iterations 84 | 2. It should return a data frame with two columns: The lower and upper 85 | quantiles of the distribution, representing the 2.5 and 97.5 86 | percentiles, respectively (**hint:** The `quantile()` function). 87 | 88 | Write the function as described above. How does the bootstrap differ from a 89 | permutation test (note: you don't need code here, just tell me in words). 90 | Set a seed of 42 and test your function on the `test_vector` from part (b). 91 | (1.5 marks) 92 | 93 | d. Use your functions defined in (b) and (c) to estimate the Simpson's diversity 94 | index and corresponding 95% bootstrapped confidence intervals around the 95 | Simpson's index for each of the sites in your dataframe from (a). Use 1000 96 | iterations for the bootstrapping procedure. Your answer should be a single 97 | dataframe with four columns: `site`, `simpson`, `lower`, and `upper`. Points are 98 | awarded for conciseness of the code. (**Hint:** The most concise answer will 99 | likely make use of the `purrr` package, which is part of the `tidyverse`). Set a 100 | seed of 43. (1.5 marks) 101 | 102 | e. 
Using the dataframe from (d), plot the Simpson's index (y-axis) for each 103 | of the sites (x-axis) as a single point surrounded by its lower and upper 104 | 95% CIs. Order the x-axis from lowest to highest Simpson's index. (1 mark) 105 | 106 | 2. Recreating a figure 107 | 108 | In assignment 3, we explored a dataset containing changes in yearly plant 109 | biomass in Abisko National Park, Sweden. In this question, we will use `ggplot2` to 110 | reproduce a figure in the original paper (Olofsson et al., 2013; 111 | [link](https://royalsocietypublishing.org/doi/full/10.1098/rstb.2012.0486)). 112 | 113 | You should still have the dataset from when you completed assignment 3, but 114 | if not, run the code chunk below to download it. 115 | 116 | ```{r message=FALSE} 117 | plant_biomass_url <- 'https://raw.githubusercontent.com/UofTCoders/rcourse/master/data/plant-biomass-preprocess.csv' 118 | download.file(plant_biomass_url, 'plant-biomass-preprocess.csv') 119 | plant_biomass <- read_csv('plant-biomass-preprocess.csv') 120 | ``` 121 | 122 | Reproduce figure 4 from the paper using `ggplot2`. Pay close attention to the 123 | overall structure of the figure, the scale on the axes, and so on. The colors of 124 | the points do not need to be the exact same colors as those in the figure, but 125 | they should be sufficiently close. _(Note: you can ignore the SEM points, and 126 | the 'look' of your axes does not have to match the figure exactly. The species 127 | names also do not have to be included in the body of the plot (like in the paper) 128 | as long as they are visible in some form.)_ (1 mark) 129 | 130 | 3. Use the built-in R dataset `iris` for this question. (2 marks) 131 | 132 | a. Test the relationship between sepal length and sepal width _for each 133 | species_ using linear models. Set sepal width as the response and sepal 134 | length as the predictor.
Output a single data frame containing the beta 135 | estimate and p-value associated with the predictor for each of the three 136 | models, and also make sure to include a column called 'species' in the 137 | final data frame. Your final data frame should only have species, term, 138 | estimate, and p.value as columns. Do not include the intercepts. _(Hint: 139 | You want to simultaneously perform the same model on different subsets of 140 | the data with only a few lines of code)_ (1 mark) 141 | 142 | b. Use `ggplot2` to plot a scatter plot of sepal width by sepal length, 143 | coloured by species. Plot the three linear fits as well, also coloured by 144 | species. Below your code, comment on how the estimate values from your 145 | linear models above correspond to the plotted fits. (1 mark) 146 | 147 | 4. The Canadian lynx population cycle (3.5 marks) 148 | 149 | The Canadian lynx experiences large periodic changes in its population 150 | size over a timescale of several years. This is thought to be driven by 151 | oscillations in the population size of the snowshoe hare, the primary food 152 | source for the lynx. Read more about the lynx population cycle on this 153 | [Northwest Territories website](https://www.enr.gov.nt.ca/en/services/lynx/lynx-snowshoe-hare-cycle). 154 | 155 | R has a built-in dataset called `lynx` which contains annual population 156 | measurements for the Canadian lynx as a time series. 157 | 158 | (a) Plot the abundance of `lynx` vs. time in years using either `ggplot` or 159 | `qplot`. Plot points and connect them by a line. Create a time series that 160 | starts at 0 and ends at the total number of years in the dataset (total years $= 161 | 1934-1821$). By eye, estimate the time between peaks in the population. (0.5 162 | marks) 163 | 164 | (b) Define a function called `sine_model` that takes 5 arguments: a vector of years 165 | for the x-axis and four parameters (amplitude, period, phase, and offset).
166 | Recall the general formula for a sine wave: 167 | $$y = A \text{sin}(kx - b) + c$$ 168 | where $k = 2\pi / T$, $T$ is the period or length of time between peaks, 169 | $A$ is the amplitude, $b$ is the phase, and $c$ is the offset. 170 | Using a value of $A = c = 1700$ for both the amplitude and offset and a value of 171 | $b = 2.75$ for the phase, plot the lynx data as before and add a sine curve 172 | using your guess of the timescale from part (a) for the period. 173 | Use a colour other than black to plot the sine wave. 174 | Note that the x axis must start at 0 in order for the phase of $2.75$ 175 | to match the data. (1 mark) 176 | 177 | (c) Use least-squares fitting to refine your estimate of the lynx cycle length. (1.5 marks) 178 | - Create a numeric vector with a range of values for the period that 179 | span your guess from part (a). 180 | - Write a function to calculate the Residual Sum of Squares (RSS) from 181 | a model fitted to the `lynx` data. 182 | - This function should calculate the differences 183 | (*residuals*) between the lynx data and your prediction, then return 184 | the sum of the squared residuals. 185 | - Apply this function over the numeric vector of period values you 186 | created. Essentially, you should be striving to obtain an RSS value for 187 | models fitted using all of the different period values in your numeric 188 | vector. 189 | - Plot the sum of the squared residuals vs. the range of period values. By eye, 190 | what is the minimum of this curve? 191 | - Identify the period value that provides the best fit to the `lynx` 192 | data. Models with lowest RSS fit the data best. What is your calculated 193 | length of the lynx population cycle? 194 | 195 | (d) Plot the lynx data again and plot your best fit curve on top. 196 | (0.5 marks) 197 | 198 | 5. In class, we worked with a simple one-locus haploid model 199 | of evolution and investigated how different forces of evolution (e.g., selection 200 | vs.
drift) affect allele frequencies. In this challenge assignment, we will 201 | naturally upgrade to a diploid model to see how evolution works in this kind of 202 | system. 203 | 204 | We will once again be considering the case of malaria (of course, what else does 205 | Amber think about anyway). In assignment 7, we (you) worked on ways to 206 | incorporate various facets of biological detail into a baseline, super simple, 207 | disease model. We will incorporate another level of detail in this exercise: 208 | host genotype. The trait that we will be working with is the sickle cell trait. 209 | You can read more about the sickle-cell trait on Wikipedia. Basically, this is a 210 | blood-cell-shape trait that is controlled at a single locus by two alleles: $A$ 211 | for wildtype "normal" blood shape, and $S$ for sickle-shaped blood cells. There 212 | are thus three possible host genotypes with respect to the sickle cell trait: 213 | homozygous "normal" blood ($AA$), heterozygous ($AS$), and homozygous sickle 214 | blood ($SS$). People with homozygous $SS$ suffer from debilitating illness 215 | (sickle-cell disease), which often results in mortality at a young age if untreated. 216 | Heterozygotes ($AS$), while still suffering minor illness, receive partial 217 | immunity against malaria. It was thus hypothesized that the deleterious $S$ 218 | allele is maintained in the population due to **balancing selection** -- the 219 | strong selective pressure exerted by malaria gave heterozygotes an advantage 220 | over the homozygotes, thus saving the $S$ allele from being purged from the 221 | population. 222 | 223 | We will be using mathematical models to help us test this hypothesis. A good 224 | start is to first extend the framework we used in lecture and think about how to 225 | represent allele frequency change in a diploid population.
226 | 227 | In population genetics it is often easier to work with 228 | allele frequencies (proportion of a certain allele in a population) rather than 229 | changes to their absolute numbers. Here, we will use $p$ and $q$ to denote the 230 | proportions of the $A$ and $S$ alleles in the population, respectively. The 231 | frequencies of the three genotypes in the population are thus given by $p^2$, 232 | $2pq$, and $q^2$ for AA, AS, and SS individuals, respectively. $p^2$, $2pq$, and 233 | $q^2$ are known as Hardy-Weinberg proportions and they have the convenient 234 | property of summing to 1. 235 | 236 | Applying selection to this population alters the frequency of each _genotype_ by 237 | an amount that is proportional to its fitness. We will denote the fitness of 238 | each host genotype group as $W_{AA}$, $W_{AS}$, and $W_{SS}$. The frequencies of 239 | each genotype following selection (i.e., as weighted by its fitness) are 240 | therefore $p^2W_{AA}$, $2pqW_{AS}$, and $q^2W_{SS}$. Dividing these quantities 241 | by their sum (the mean fitness of the population, often denoted with 242 | ${\bar{W}}$) allows us to retrieve the frequency of the three genotypes after 243 | selection (e.g., $p^2 \frac{W_{AA}}{\bar{W}}$). (2 marks) 244 | 245 | a) Derive a recursion equation that describes the change in **allele** 246 | frequencies of $A$ and $S$ in one time step. (Hint: How do you go from genotype 247 | frequency to allele frequency?) (1 mark) 248 | 249 | b) Simulate the trajectory of change of the A allele. Use the parameter set 250 | provided, and include a graph in your answer. Explain what you see, and refer to 251 | the parameter values provided in your explanation.
(1 mark) 252 | 253 | 254 | ```{r} 255 | 256 | # Time 257 | times <- 500 258 | timevec <- seq(1, times, 1) 259 | 260 | # Parameters 261 | N <- 10000 # Total population size (assume constant) 262 | WAA <- 0.5 263 | WAS <- 0.7 264 | WSS <- 0.2 265 | 266 | # Intitial condition 267 | p <- 0.5 # proportion of A allele 268 | 269 | ``` 270 | -------------------------------------------------------------------------------- /assignment-final.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Final assignment: Scientific report and presentation" 3 | output: 4 | html_document: 5 | toc: false 6 | --- 7 | 8 | ## Project description 9 | 10 | The course project is a self-directed group data analysis project using real 11 | ecological data and rigorous scientific methods. Groups are expected to hypothesize 12 | about their chosen data, examine their hypotheses with reproducible and quantitative 13 | analysis techniques, visualize their results, and create scientific products in the 14 | form of a report and a presentation. 15 | 16 | You might end up with a publishable scientific product! 17 | [This paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5210691/) was written by a 18 | group of graduate students as part of the first version of this course, 19 | which was created by Dr. Christie Bahlai. 20 | 21 | ### Data 22 | 23 | A list of recommended datasets can be found 24 | [here](https://uoftcoders.github.io/rcourse/lec14-datasets.html#datasets_available_for_use). 25 | You are welcome to choose a 26 | dataset not listed, or data collected as part of a research project, but keep in mind 27 | that you may not submit anything twice: any work you do as part of this course may not 28 | be submitted for credit in another course (such as a fourth-year research project) and vice versa. 29 | If choosing a dataset not listed, make sure it is well-documented, legitimate, and 30 | complex enough to support your analysis efforts. 
Your work should be original; your 31 | project should not be a reproduction of published analyses. 32 | 33 | ### Project deliverables 34 | 35 | The following components will be graded as part of the project: 36 | 37 | 1. Mid-project update (Due Nov. 14): 38 | * Details for this assignment can be found [HERE](https://uoftcoders.github.io/rcourse/mid-project-update.html). 39 | 40 | 2. Report styled as a journal article, with these or similar sections (more info 41 | below) (Due. Dec. 5): 42 | * Abstract 43 | * Introduction / Background and Rationale 44 | * Methods (with "Data Description" and "Data Analysis" subsections) 45 | * Results 46 | * Discussion 47 | * Conclusion 48 | * Code: project results must be reproducible by someone else 49 | 50 | 3. 10 minute presentation with 2 minutes for questions, styled as a conference 51 | presentation (assume not too much familiarity with the topic in the audience). 52 | The presentations will be held on the last day of class (Dec 3). 53 | 54 | While you may not submit your work for this course for credit in another course, you 55 | are welcome to publish or present your work in an academic setting. Groups are 56 | encouraged to publish their work on [figshare](https://figshare.com/), an open, 57 | citable repository of scientific content. 58 | 59 | ### Report guidelines 60 | 61 | For the report, you are expected to: 62 | 63 | - Search the previous research and literature on your research questions. 64 | - Have clear and explicit objectives and hypotheses. 65 | - Adequately describe and properly cite the data source(s) you will analyze. 66 | - Describe your data analysis in sufficient detail for others to understand what 67 | you did and why. 68 | - Show all the results of your pre-planned data analysis and any additional 69 | explorations you did. 70 | - Discuss the meaning of your results and how they fit with the previous 71 | literature. 
72 | 73 | The report and associated code are expected to: 74 | 75 | - Be entirely reproducible: You may find 76 | [Rprojects](https://r4ds.had.co.nz/workflow-projects.html) helpful in making 77 | your projects reproducible. Rprojects can be committed to GitHub, allowing anyone 78 | to clone the repo and run your analyses without having to worry about the paths 79 | to all the files being different on their computers. [This 80 | lesson](https://utm-coders.github.io/studyGroup/lessons/misc/project-management-R/lesson/) 81 | on reproducible project management in R may also be helpful. 82 | - Have well documented code: A well documented project will have README files 83 | describing the contents of all folders in your GitHub repos. It will also 84 | contain effective in-line comments in your scripts that showcase the logic of 85 | your analyses and data-wrangling tasks. [This 86 | lesson](https://swcarpentry.github.io/r-novice-inflammation/06-best-practices-R/) 87 | on best practices for writing R code is a good starting place. 88 | 89 | You are also expected to work well as a team, and use GitHub to submit and store 90 | your final product (more details below). 91 | 92 | As a *guideline*, aim for at least 2500 words and about 6-8 figures/tables. 93 | *This is **not** a hard criterion*. We are flexible in these *guidelines*, since 94 | we want you to learn to work as a team and create a scientific product. You'll 95 | be surprised how quickly the words, figures, and tables start adding up. 96 | 97 | Your code should follow the coding style found [on our resources page](resources.html). 98 | 99 | All items (except the presentation) are due on December 5th at 11:59 pm. 100 | 101 | ## Project submission 102 | 103 | The project report and code should be submitted on GitHub. The report should 104 | also be submitted on Quercus.
Each group will have their own GitHub repository 105 | in the [EEB313-2019](https://github.com/eeb313-2019) organization to which you 106 | can upload your report and code. You are welcome to use your GitHub repository 107 | for collaborative work during the project, but feel free to use other tools such 108 | as Google Drive, Dropbox, Overleaf, etc. if you prefer. 109 | 110 | ## Project grading rubric 111 | 112 | | | Inadequate (0 marks) | Adequate (4 marks) | Excellent (8 marks) | 113 | |------------|--------------------|--------------------|--------------------| 114 | | Contribution to group work | Student contributed little to project; self-assessed contributions are low in quality and/or quantity; self-assessment is not consistent with actual contribution. | Student contributed adequately to project; made some significant contributions | Student substantially contributed to project to ensure success; self-assessed contributions are crucial to project; self-assessment is consistent with actual contribution. 
| 115 | | Content | Missing crucial information; methods and results are inconsistent, not logical, or not adequately explained; conclusions are confusing or unsupported by results; unnecessary information included as clutter | Most essential information included; methods and results are adequately described; conclusions supported by results; most included material is relevant to report | All essential information included; methods and results are succinct, clear, logical, and scientifically valid; conclusions are creative and meaningful; project is concise throughout | 116 | | Style and reproducibility | Code and writing are poorly organized, poorly formatted, missing units, difficult to read, poorly documented, difficult to reproduce analyses | Code and writing are well-organized, well-formatted, consistent use of units and significant figures | Code and writing are precise and clear throughout, free of errors, well-organized, well-documented, easily reproducible analyses, publication-ready | 117 | | Presentation | Presentation is poorly organized; much too long or much too short; presentation is unclear; presentation is missing information; presentation is not scientific and professional; presentation uses too much jargon; not all team members participate; does not adequately address audience questions | Presentation is adequately organized; timing is appropriate; most information is presented logically; presentation is scientific and professional; most jargon is avoided; all team members participate but not equally; audience questions are sometimes addressed well | Presentation is clearly and logically organized; presentation flows and is easy to follow; presentation includes appropriate information without jargon; presentation is well-rehearsed and high-quality; all team members participate equally; audience questions are clearly addressed | 118 | 119 | As the final project is a team effort, all members within a group will receive the same mark in the final three
categories and an individual mark for their contribution to group work. A final project that is considered to lie between two of the defined levels will be marked accordingly, e.g. between "Adequate" and "Excellent" would be 5, 6, or 7 marks. 120 | 121 | -------------------------------------------------------------------------------- /codemeta.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", 3 | "@type": "Code", 4 | "author": [ 5 | { 6 | "@id": "0000-0003-4169-2616", 7 | "@type": "Person", 8 | "email": "lwjohnst@ph.au.dk", 9 | "name": "Luke Johnston", 10 | "affiliation": "Department of Nutritional Sciences, University of Toronto; Department of Public Health, Aarhus University" 11 | }, 12 | { 13 | "@id": "0000-0002-5813-4664", 14 | "@type": "Person", 15 | "email": "m.bonsma@mail.utoronto.ca", 16 | "name": "Madeleine Bonsma-Fisher", 17 | "affiliation": "Department of Physics, University of Toronto" 18 | }, 19 | { 20 | "@id": "0000-0003-0051-3239", 21 | "@type": "Person", 22 | "email": "joel.ostblom@mail.utoronto.ca", 23 | "name": "Joel Ostblom", 24 | "affiliation": "Institute of Biomaterials and Biomedical Engineering, University of Toronto" 25 | }, 26 | { 27 | "@id": "0000-0003-0002-8399", 28 | "@type": "Person", 29 | "email": "ahmed.hasan@mail.utoronto.ca", 30 | "name": "Ahmed Hasan", 31 | "affiliation": "Department of Cell and Systems Biology, University of Toronto" 32 | }, 33 | { 34 | "@id": "0000-0002-5921-2548", 35 | "@type": "Person", 36 | "email": "james.santangelo@mail.utoronto.ca", 37 | "name": "James Santangelo", 38 | "affiliation": "Department of Ecology and Evolutionary Biology, University of Toronto" 39 | }, 40 | { 41 | "@id": "0000-0003-3504-4524", 42 | "@type": "Person", 43 | "email": "lina.tran@mail.utoronto.ca", 44 | "name": "Lina Tran", 45 | "affiliation": "Department of Physiology, University of Toronto" 46 | }, 
47 | { 48 | "@id": "0000-0001-7310-8942", 49 | "@type": "Person", 50 | "email": "esalesde@physics.utoronto.ca", 51 | "name": "Elliott Sales de Andrade", 52 | "affiliation": "Department of Physics, University of Toronto" 53 | }, 54 | { 55 | "@id": "0000-0001-8126-3571", 56 | "@type": "Person", 57 | "email": "lindsay.coome@mail.utoronto.ca", 58 | "name": "Lindsay Coome", 59 | "affiliation": "Department of Psychology, University of Toronto" 60 | }, 61 | { 62 | "@id": "0000-0002-6765-0898", 63 | "@type": "Person", 64 | "email": "sara.mahallati@mail.utoronto.ca", 65 | "name": "Sara Mahallati", 66 | "affiliation": "Institute of Biomaterials and Biomedical Engineering, University of Toronto" 67 | } 68 | ], 69 | "identifier": "https://doi.org/10.5281/zenodo.2335179", 70 | "codeRepository": "https://github.com/UofTCoders/rcourse", 71 | "datePublished": "2019-01-03", 72 | "dateModified": "2019-01-03", 73 | "dateCreated": "2019-01-03", 74 | "description": "This material contains modular participatory live-coding lectures covering statistics and data analysis for ecology and reproducible quantitative methods in R. Statistical analysis, modelling, simulation, and data analysis are essential skills for applying ecology concepts to data. 
This material is designed to meet a growing demand for reproducible, openly accessible, analytically thorough, and well documented science.", 75 | "keywords": "R, programming, coding, ecology, statistics, differential equations, modelling, regression, population dynamics", 76 | "license": "MIT, CC-BY-4.0", 77 | "title": "A graduate student-led participatory live-coding quantitative methods course in R: Experiences on initiating, developing, and teaching", 78 | "version": "v2.1.0" 79 | } 80 | -------------------------------------------------------------------------------- /data/Assign05_Question3.csv: -------------------------------------------------------------------------------- 1 | "x","group" 2 | 6.42599961350736,"One" 3 | 5.12082462316966,"One" 4 | 5.52557783669731,"One" 5 | 5.06744070340457,"One" 6 | 4.87736583761432,"One" 7 | 5.58150736478004,"One" 8 | 5.37327730829425,"One" 9 | 5.78183037915858,"One" 10 | 8.26129675390806,"One" 11 | 5.71236878207077,"One" 12 | 4.58697722696339,"One" 13 | 6.26998610870784,"One" 14 | 7.54029660627869,"One" 15 | 8.01169838357271,"One" 16 | 6.50193828344865,"One" 17 | 5.65876995724927,"One" 18 | 5.95480957203462,"One" 19 | 5.81211095689094,"One" 20 | 10.7110129391544,"One" 21 | 3.93438002921211,"One" 22 | 4.58106862210215,"One" 23 | 6.28458771094597,"One" 24 | 5.7708518286089,"One" 25 | 5.48310171915475,"One" 26 | 5.45300589678716,"One" 27 | 5.96726955712034,"One" 28 | 4.92520727831643,"One" 29 | 6.20818072170388,"One" 30 | 7.84436341251713,"One" 31 | 6.24427864300449,"One" 32 | 4.46362057038254,"Two" 33 | 2.22232496254666,"Two" 34 | 1.98157958025982,"Two" 35 | 2.18236156089807,"Two" 36 | 6.42835743190577,"Two" 37 | 9.54107898626658,"Two" 38 | 9.53893270912479,"Two" 39 | 12.5571987052659,"Two" 40 | 1.68003859331248,"Two" 41 | 0.903556644459741,"Two" 42 | 3.03841374553939,"Two" 43 | 6.33546732283046,"Two" 44 | 4.50159690356383,"Two" 45 | 8.06059074469022,"Two" 46 | 3.07028592501813,"Two" 47 | 8.0740754861533,"Two" 48 | 
7.02112356971714,"Two" 49 | 1.1424767963171,"Two" 50 | 4.32155813691801,"Two" 51 | 0.845567933804901,"Two" 52 | 9.60419330543276,"Two" 53 | 4.98987313114606,"Two" 54 | 4.59513912092922,"Two" 55 | 10.8146670109643,"Two" 56 | 7.8797937958823,"Two" 57 | 2.32450148400854,"Two" 58 | 2.70998938787323,"Two" 59 | 2.40385691756134,"Two" 60 | 5.13332639128709,"Two" 61 | 7.44063769131989,"Two" 62 | -------------------------------------------------------------------------------- /data/africa.wide.csv: -------------------------------------------------------------------------------- 1 | siteID,country,year,lat,long,marPOS,PfEX,gdp,WDIregion,WHOregion,PfPR,total.abundance,SR,percent 2 | 2,Madagascar,1991,-18.58,47.15,23,48,448.0355093,Eastern Africa,Africa,0.479166667,0.145686909,2,1 3 | 3,Madagascar,1989,-18.95,47.57,25,62,491.4355261,Eastern Africa,Africa,0.403225806,118.9660198,1,0.333333333 4 | 4,Madagascar,1997,-19.4,46.72,580,1909,414.7742537,Eastern Africa,Africa,0.303823992,39.94314602,2,0.333333333 5 | 7,Kenya,1998,-3.07,40.14,51,100,855.0818286,Eastern Africa,Africa,0.51,201.9725175,4,1 6 | 8,Kenya,1998,-3.13,40.11,69,101,855.0818286,Eastern Africa,Africa,0.683168317,202.8294379,4,1 7 | 9,Kenya,1998,-3.14,40.14,60,100,855.0818286,Eastern Africa,Africa,0.6,200.5822147,4,1 8 | 10,Kenya,1998,-3.21,40.07,49,100,855.0818286,Eastern Africa,Africa,0.49,203.6217999,3,1 9 | 11,Kenya,1998,-3.26,40.01,67,100,855.0818286,Eastern Africa,Africa,0.67,196.8772455,3,1 10 | 12,Kenya,1998,-3.29,40.08,59,100,855.0818286,Eastern Africa,Africa,0.59,202.6825175,3,1 11 | 13,Kenya,1998,-3.34,40,40,99,855.0818286,Eastern Africa,Africa,0.404040404,194.9104229,2,1 12 | 14,Kenya,1998,-3.53,39.78,69,99,855.0818286,Eastern Africa,Africa,0.696969697,202.0763041,2,1 13 | 15,Kenya,1998,-3.54,39.61,73,100,855.0818286,Eastern Africa,Africa,0.73,204.3535139,3,1 14 | 16,Kenya,1998,-3.59,39.53,72,100,855.0818286,Eastern Africa,Africa,0.72,203.324865,3,1 15 | 
17,Kenya,1998,-3.63,39.73,53,98,855.0818286,Eastern Africa,Africa,0.540816327,205.2243983,4,1 16 | 18,Kenya,1998,-3.67,39.75,76,101,855.0818286,Eastern Africa,Africa,0.752475248,202.6983479,3,1 17 | 19,Kenya,1998,-3.68,39.85,57,100,855.0818286,Eastern Africa,Africa,0.57,198.5154037,3,1 18 | 20,Kenya,1998,-3.73,39.7,40,100,855.0818286,Eastern Africa,Africa,0.4,205.3764645,1,1 19 | 21,Kenya,1998,-3.73,39.8,62,101,855.0818286,Eastern Africa,Africa,0.613861386,202.0250468,4,1 20 | 22,Kenya,1998,-3.7,39.73,70,100,855.0818286,Eastern Africa,Africa,0.7,206.3674447,2,1 21 | 23,Kenya,1998,-3.79,39.82,57,100,855.0818286,Eastern Africa,Africa,0.57,201.5500176,1,1 22 | 24,Kenya,1998,-3.92,39.77,65,100,855.0818286,Eastern Africa,Africa,0.65,194.9217999,3,1 23 | 25,Kenya,1998,-3.9,39.73,83,99,855.0818286,Eastern Africa,Africa,0.838383838,197.7376686,3,1 24 | 26,Kenya,1998,-3,40.2,82,100,855.0818286,Eastern Africa,Africa,0.82,201.6029205,4,1 25 | 27,Kenya,1998,-4.12,39.28,73,100,855.0818286,Eastern Africa,Africa,0.73,199.4587563,4,1 26 | 28,Kenya,1998,-4.12,39.37,84,100,855.0818286,Eastern Africa,Africa,0.84,201.8712231,3,1 27 | 29,Kenya,1998,-4.14,39.39,78,100,855.0818286,Eastern Africa,Africa,0.78,203.7867121,3,1 28 | 30,Kenya,1998,-4.16,39.45,59,100,855.0818286,Eastern Africa,Africa,0.59,203.3170876,2,1 29 | 31,Kenya,1998,-4.18,39.5,44,100,855.0818286,Eastern Africa,Africa,0.44,200.2900023,2,1 30 | 32,Kenya,1998,-4.18,39.53,66,100,855.0818286,Eastern Africa,Africa,0.66,200.2900023,2,1 31 | 33,Kenya,1998,-4.27,39.58,51,100,855.0818286,Eastern Africa,Africa,0.51,192.8893965,3,1 32 | 34,Kenya,1998,-4.38,39.47,50,78,855.0818286,Eastern Africa,Africa,0.641025641,190.178306,3,1 33 | 35,Kenya,1998,-4.43,39.5,60,100,855.0818286,Eastern Africa,Africa,0.6,193.2649569,4,1 34 | 36,Kenya,1998,-4.6,39.17,67,100,855.0818286,Eastern Africa,Africa,0.67,188.6794788,4,1 35 | 51,Kenya,2003,0.17,34.75,230,709,824.4529698,Eastern Africa,Africa,0.324400564,61.82157094,2,0.5 36 | 
57,Guinea-Bissau,1995,11.91,-15.6,52,112,638.1742814,Western Africa,Africa,0.464285714,169.1899305,2,1 37 | 58,Burkina Faso,1995,12.67,-1.23,907,1189,361.121422,Western Africa,Africa,0.73510252,156.2303621,3,1 38 | 59,Mali,1989,13.22,-5.92,2,123,502.8532591,Western Africa,Africa,0.074417961,79.92881402,2,1 39 | 60,The Gambia,1988,13.48,-16.68,8,386,511.0285137,Western Africa,Africa,0.020725389,184.315253,1,0.5 40 | 61,Senegal,1994,13.72,-16.42,117,220,995.4211649,Western Africa,Africa,0.636354435,172.8095093,4,0.571428571 41 | 62,Senegal,1995,13.93,-16.76,41,661,1020.69932,Western Africa,Africa,0.062027231,193.8609932,4,0.8 42 | 63,Senegal,1995,13.98,-16.77,49,403,1020.69932,Western Africa,Africa,0.121588089,191.3167558,4,0.8 43 | 64,Senegal,1995,14.05,-16.68,121,773,1020.69932,Western Africa,Africa,0.156532988,178.172849,4,0.8 44 | 65,Senegal,1995,14.1,-16.67,21,512,1020.69932,Western Africa,Africa,0.041015625,171.0232027,3,0.75 45 | 66,Senegal,1995,14.15,-16.65,113,691,1020.69932,Western Africa,Africa,0.163531114,161.9986126,3,0.75 46 | 67,Senegal,1995,14.53,-16.43,144,332,1020.69932,Western Africa,Africa,0.43373494,156.0269517,3,0.6 47 | 69,Senegal,1995,14.54,-16.44,261,366,1020.69932,Western Africa,Africa,0.713114754,156.0269517,4,0.8 48 | 70,Senegal,1995,14.55,-16.45,110,172,1020.69932,Western Africa,Africa,0.639534884,156.6305489,4,0.8 49 | 72,Senegal,1991,14.91,-17.07,10,222,1060.78698,Western Africa,Africa,0.045045045,185.3918401,5,0.833333333 50 | 73,Eritrea,2002,15.11,36.65,10,300,601.0342469,Eastern Africa,Africa,0.033333333,147.9212393,1,1 51 | 75,Mali,1988,15,2.97,83,186,490.2892417,Western Africa,Africa,0.446236559,52.32812383,2,1 52 | 77,Senegal,1990,16.5,-14.44,9,118,1066.3899,Western Africa,Africa,0.076271186,74.72705868,3,1 53 | 78,Senegal,1990,16.52,-14.43,5,61,1066.3899,Western Africa,Africa,0.081967213,76.91797725,3,0.75 54 | 79,Senegal,1990,16.52,-14.62,10,109,1066.3899,Western Africa,Africa,0.091743119,79.85343679,3,1 55 | 
80,Mali,1988,16.96,-0.36,11,206,490.2892417,Western Africa,Africa,0.053398058,29.91677606,2,1 56 | 86,Mali,1988,17.33,0.12,5,188,490.2892417,Western Africa,Africa,0.026595745,19.86864387,1,1 57 | 89,Mali,1988,18.44,1.4,2,251,490.2892417,Western Africa,Africa,0.007968127,8.111480598,2,1 58 | 95,Cameroon,1998,3.82,11.48,205,372,1132.952026,Western Africa,Africa,0.551075269,200.3646769,4,0.5 59 | 101,Cameroon,2001,4.01,9.19,74,174,1183.43807,Western Africa,Africa,0.425287356,214.5101509,3,1 60 | 102,Cameroon,2001,4.03,9.18,174,689,1183.43807,Western Africa,Africa,0.252539913,214.5101509,4,0.8 61 | 103,Cameroon,2001,4.07,9.36,556,1690,1183.43807,Western Africa,Africa,0.328994083,203.4368237,3,0.75 62 | 107,Cote d'Ivoire,1996,5.95,-7.47,304,317,1372.790201,Western Africa,Africa,0.846252918,206.44001,2,1 63 | 111,Ethiopia,1997,8.05,38.73,8,127,199.1113028,Eastern Africa,Africa,0.062992126,36.48174272,2,1 64 | 113,Cameroon,2002,9.4,13.51,41,122,1201.912083,Western Africa,Africa,0.336065574,8.700992323,5,0.833333333 -------------------------------------------------------------------------------- /data/iris.csv: -------------------------------------------------------------------------------- 1 | "sepal_length","sepal_width","petal_length","petal_width","species" 2 | 5.1,3.5,1.4,0.2,"setosa" 3 | 4.9,3,1.4,0.2,"setosa" 4 | 4.7,3.2,1.3,0.2,"setosa" 5 | 4.6,3.1,1.5,0.2,"setosa" 6 | 5,3.6,1.4,0.2,"setosa" 7 | 5.4,3.9,1.7,0.4,"setosa" 8 | 4.6,3.4,1.4,0.3,"setosa" 9 | 5,3.4,1.5,0.2,"setosa" 10 | 4.4,2.9,1.4,0.2,"setosa" 11 | 4.9,3.1,1.5,0.1,"setosa" 12 | 5.4,3.7,1.5,0.2,"setosa" 13 | 4.8,3.4,1.6,0.2,"setosa" 14 | 4.8,3,1.4,0.1,"setosa" 15 | 4.3,3,1.1,0.1,"setosa" 16 | 5.8,4,1.2,0.2,"setosa" 17 | 5.7,4.4,1.5,0.4,"setosa" 18 | 5.4,3.9,1.3,0.4,"setosa" 19 | 5.1,3.5,1.4,0.3,"setosa" 20 | 5.7,3.8,1.7,0.3,"setosa" 21 | 5.1,3.8,1.5,0.3,"setosa" 22 | 5.4,3.4,1.7,0.2,"setosa" 23 | 5.1,3.7,1.5,0.4,"setosa" 24 | 4.6,3.6,1,0.2,"setosa" 25 | 5.1,3.3,1.7,0.5,"setosa" 26 | 
4.8,3.4,1.9,0.2,"setosa" 27 | 5,3,1.6,0.2,"setosa" 28 | 5,3.4,1.6,0.4,"setosa" 29 | 5.2,3.5,1.5,0.2,"setosa" 30 | 5.2,3.4,1.4,0.2,"setosa" 31 | 4.7,3.2,1.6,0.2,"setosa" 32 | 4.8,3.1,1.6,0.2,"setosa" 33 | 5.4,3.4,1.5,0.4,"setosa" 34 | 5.2,4.1,1.5,0.1,"setosa" 35 | 5.5,4.2,1.4,0.2,"setosa" 36 | 4.9,3.1,1.5,0.2,"setosa" 37 | 5,3.2,1.2,0.2,"setosa" 38 | 5.5,3.5,1.3,0.2,"setosa" 39 | 4.9,3.6,1.4,0.1,"setosa" 40 | 4.4,3,1.3,0.2,"setosa" 41 | 5.1,3.4,1.5,0.2,"setosa" 42 | 5,3.5,1.3,0.3,"setosa" 43 | 4.5,2.3,1.3,0.3,"setosa" 44 | 4.4,3.2,1.3,0.2,"setosa" 45 | 5,3.5,1.6,0.6,"setosa" 46 | 5.1,3.8,1.9,0.4,"setosa" 47 | 4.8,3,1.4,0.3,"setosa" 48 | 5.1,3.8,1.6,0.2,"setosa" 49 | 4.6,3.2,1.4,0.2,"setosa" 50 | 5.3,3.7,1.5,0.2,"setosa" 51 | 5,3.3,1.4,0.2,"setosa" 52 | 7,3.2,4.7,1.4,"versicolor" 53 | 6.4,3.2,4.5,1.5,"versicolor" 54 | 6.9,3.1,4.9,1.5,"versicolor" 55 | 5.5,2.3,4,1.3,"versicolor" 56 | 6.5,2.8,4.6,1.5,"versicolor" 57 | 5.7,2.8,4.5,1.3,"versicolor" 58 | 6.3,3.3,4.7,1.6,"versicolor" 59 | 4.9,2.4,3.3,1,"versicolor" 60 | 6.6,2.9,4.6,1.3,"versicolor" 61 | 5.2,2.7,3.9,1.4,"versicolor" 62 | 5,2,3.5,1,"versicolor" 63 | 5.9,3,4.2,1.5,"versicolor" 64 | 6,2.2,4,1,"versicolor" 65 | 6.1,2.9,4.7,1.4,"versicolor" 66 | 5.6,2.9,3.6,1.3,"versicolor" 67 | 6.7,3.1,4.4,1.4,"versicolor" 68 | 5.6,3,4.5,1.5,"versicolor" 69 | 5.8,2.7,4.1,1,"versicolor" 70 | 6.2,2.2,4.5,1.5,"versicolor" 71 | 5.6,2.5,3.9,1.1,"versicolor" 72 | 5.9,3.2,4.8,1.8,"versicolor" 73 | 6.1,2.8,4,1.3,"versicolor" 74 | 6.3,2.5,4.9,1.5,"versicolor" 75 | 6.1,2.8,4.7,1.2,"versicolor" 76 | 6.4,2.9,4.3,1.3,"versicolor" 77 | 6.6,3,4.4,1.4,"versicolor" 78 | 6.8,2.8,4.8,1.4,"versicolor" 79 | 6.7,3,5,1.7,"versicolor" 80 | 6,2.9,4.5,1.5,"versicolor" 81 | 5.7,2.6,3.5,1,"versicolor" 82 | 5.5,2.4,3.8,1.1,"versicolor" 83 | 5.5,2.4,3.7,1,"versicolor" 84 | 5.8,2.7,3.9,1.2,"versicolor" 85 | 6,2.7,5.1,1.6,"versicolor" 86 | 5.4,3,4.5,1.5,"versicolor" 87 | 6,3.4,4.5,1.6,"versicolor" 88 | 6.7,3.1,4.7,1.5,"versicolor" 89 | 
6.3,2.3,4.4,1.3,"versicolor" 90 | 5.6,3,4.1,1.3,"versicolor" 91 | 5.5,2.5,4,1.3,"versicolor" 92 | 5.5,2.6,4.4,1.2,"versicolor" 93 | 6.1,3,4.6,1.4,"versicolor" 94 | 5.8,2.6,4,1.2,"versicolor" 95 | 5,2.3,3.3,1,"versicolor" 96 | 5.6,2.7,4.2,1.3,"versicolor" 97 | 5.7,3,4.2,1.2,"versicolor" 98 | 5.7,2.9,4.2,1.3,"versicolor" 99 | 6.2,2.9,4.3,1.3,"versicolor" 100 | 5.1,2.5,3,1.1,"versicolor" 101 | 5.7,2.8,4.1,1.3,"versicolor" 102 | 6.3,3.3,6,2.5,"virginica" 103 | 5.8,2.7,5.1,1.9,"virginica" 104 | 7.1,3,5.9,2.1,"virginica" 105 | 6.3,2.9,5.6,1.8,"virginica" 106 | 6.5,3,5.8,2.2,"virginica" 107 | 7.6,3,6.6,2.1,"virginica" 108 | 4.9,2.5,4.5,1.7,"virginica" 109 | 7.3,2.9,6.3,1.8,"virginica" 110 | 6.7,2.5,5.8,1.8,"virginica" 111 | 7.2,3.6,6.1,2.5,"virginica" 112 | 6.5,3.2,5.1,2,"virginica" 113 | 6.4,2.7,5.3,1.9,"virginica" 114 | 6.8,3,5.5,2.1,"virginica" 115 | 5.7,2.5,5,2,"virginica" 116 | 5.8,2.8,5.1,2.4,"virginica" 117 | 6.4,3.2,5.3,2.3,"virginica" 118 | 6.5,3,5.5,1.8,"virginica" 119 | 7.7,3.8,6.7,2.2,"virginica" 120 | 7.7,2.6,6.9,2.3,"virginica" 121 | 6,2.2,5,1.5,"virginica" 122 | 6.9,3.2,5.7,2.3,"virginica" 123 | 5.6,2.8,4.9,2,"virginica" 124 | 7.7,2.8,6.7,2,"virginica" 125 | 6.3,2.7,4.9,1.8,"virginica" 126 | 6.7,3.3,5.7,2.1,"virginica" 127 | 7.2,3.2,6,1.8,"virginica" 128 | 6.2,2.8,4.8,1.8,"virginica" 129 | 6.1,3,4.9,1.8,"virginica" 130 | 6.4,2.8,5.6,2.1,"virginica" 131 | 7.2,3,5.8,1.6,"virginica" 132 | 7.4,2.8,6.1,1.9,"virginica" 133 | 7.9,3.8,6.4,2,"virginica" 134 | 6.4,2.8,5.6,2.2,"virginica" 135 | 6.3,2.8,5.1,1.5,"virginica" 136 | 6.1,2.6,5.6,1.4,"virginica" 137 | 7.7,3,6.1,2.3,"virginica" 138 | 6.3,3.4,5.6,2.4,"virginica" 139 | 6.4,3.1,5.5,1.8,"virginica" 140 | 6,3,4.8,1.8,"virginica" 141 | 6.9,3.1,5.4,2.1,"virginica" 142 | 6.7,3.1,5.6,2.4,"virginica" 143 | 6.9,3.1,5.1,2.3,"virginica" 144 | 5.8,2.7,5.1,1.9,"virginica" 145 | 6.8,3.2,5.9,2.3,"virginica" 146 | 6.7,3.3,5.7,2.5,"virginica" 147 | 6.7,3,5.2,2.3,"virginica" 148 | 6.3,2.5,5,1.9,"virginica" 149 | 
6.5,3,5.2,2,"virginica" 150 | 6.2,3.4,5.4,2.3,"virginica" 151 | 5.9,3,5.1,1.8,"virginica" 152 | -------------------------------------------------------------------------------- /data/jellyfish.csv: -------------------------------------------------------------------------------- 1 | Location,Width,Length D,6,9 D,6.5,8 D,6.5,9 D,7,9 D,7,10 D,7,11 D,8,9.5 D,8,10 D,8,10 D,8,11 D,9,11 D,10,13 D,11,13 D,11,14 D,11,14 D,12,13 D,13,14 D,14,16 D,15,16 D,15,16 D,15,19 D,16,16 S,12,14 S,13,17 S,14,16.5 S,14,19 S,15,16 S,15,17 S,15,18 S,15,18 S,15,19 S,15,21 S,16,18 S,16,19 S,16,20 S,16,20 S,16,21 S,16.5,19 S,17,20 S,18,19 S,18,19 S,18,20 S,19,20 S,19,22 S,20,22 S,21,21 M,11,8 M,15,10 M,11,11 M,16,16 M,16,20 M,12,20 M,8,21 M,16.5,19 M,13,18 M,14,18 M,13,16.5 M,7,13 M,6,13 -------------------------------------------------------------------------------- /data/kenya.wide.csv: -------------------------------------------------------------------------------- 1 | site.id,gambiae,funestus,arabiensis,merus,lat,long,site.name,total.abundance,PfPOS,PfNEG,PfEX,PfPR,distance,SR 2 | 24,0.075056861,0.93025019,0.0015163,0,-4.38,39.48,Magaoni,1327,64,14,78,0.8205128,42.15,3 3 | 2,0.398981324,0.0237691,0.281833616,0.295415959,-3,40.2,Garithe,589,79,21,100,0.79,130.7,4 4 | 3,0.90512334,0.018975332,0.075901328,0,-3.54,39.61,Kagombani,527,72,28,100,0.72,57.23,3 5 | 4,0.836852207,0.034548944,0.10940499,0.019193858,-3.15,40.15,Majenjeni,521,58,42,100,0.58,113.4,4 6 | 21,0.78372591,0.17130621,0.040685225,0.004282655,-4.13,39.29,Amani,467,73,27,100,0.73,43.11,4 7 | 14,0.444191344,0.466970387,0.061503417,0.027334852,-3.64,39.74,Jaribuni,439,53,45,98,0.540816327,46.25,4 8 | 27,0.016393443,0.7470726,0.06323185,0.173302108,-4.6,39.167,Tsuini,427,67,33,100,0.67,82.61,4 9 | 9,0.720095694,0.107655502,0.172248804,0,-3.59,39.53,Paziani,418,72,28,100,0.72,53.66,3 10 | 23,0.112531969,0.846547315,0.007672634,0.033248082,-4.43,39.5,Gazi,391,60,40,100,0.6,45.68,4 11 | 
5,0.731903485,0.024128686,0.214477212,0.02919571,-3.13,40.11,Masheheni,373,67,34,101,0.663366337,113.4,4 12 | 28,0.114285714,0.794285714,0,0,-4.19,39.54,Vinuni,350,66,34,100,0.66,21.22,2 13 | 13,0.683890578,0.282674772,0.012158055,0.021276596,-3.74,39.8,Dindiri,329,62,39,101,0.613861386,37.37,4 14 | 11,0.335548173,0.651162791,0.013289037,0,-3.93,39.77,Barani,301,65,35,100,0.65,63.54,3 15 | 6,0.627586207,0.296551724,0.075862069,0,-3.21,40.07,Maziwani,290,48,52,100,0.48,103.4,3 16 | 17,0.641304348,0.329710145,0,0.028985507,-3.67,39.75,Majajani,276,76,25,101,0.752475248,43.18,3 17 | 8,0.496183206,0.030534351,0.404580153,0.06870229,-3.07,40.15,Mjanaheri,262,49,51,100,0.49,121.3,4 18 | 29,0.212765957,0.787234043,0,0,-4.19,39.5,Vuga,188,44,56,100,0.44,24.45,2 19 | 1,0.353658537,0,0.646341463,0,-3.34,40.01,Dabaso,164,38,61,99,0.383838384,87.5,2 20 | 18,0.787234043,0.191489362,0.021276596,0,-3.91,39.74,Mtepeni,141,82,17,99,0.828282828,17.4,3 21 | 10,0.656033058,0.305785124,0.031239669,0,-3.26,40.02,Mijomboni,121,63,37,100,0.63,96.05,3 22 | 30,0.744186047,0.255813953,0,0,-4.16,39.46,Ziwani,86,59,41,100,0.59,26.3,2 23 | 22,0.894117647,0.094117647,0.011764706,0,-4.12,39.37,Dumbule,85,82,18,100,0.82,34.2,3 24 | 15,0.880597015,0.119402985,0,0,-3.71,39.74,Kitsoeni,67,69,31,100,0.69,38.6,2 25 | 19,1,0,0,0,-3.79,39.83,Shariani,58,54,46,100,0.54,33.92,1 26 | 7,0.571428571,0.017857143,0.410714286,0,-3.3,40.09,Mbaraka Chembe,56,59,41,100,0.59,95.54,3 27 | 25,0.652173913,0.304347826,0.043478261,0,-4.15,39.4,Moyeni,46,78,22,100,0.78,31.95,3 28 | 26,0.604651163,0.162790698,0.23255814,0,-4.27,39.58,Mwaroni,43,50,50,100,0.5,26.42,3 29 | 16,0.666666667,0.333333333,0,0,-3.53,39.78,Kitengwani,30,69,30,99,0.696969697,59.1,2 30 | 20,0.631578947,0.315789474,0.052631579,0,-3.68,39.85,Takaungu,19,52,48,100,0.52,45.59,3 31 | 12,1,0,0,0,-3.73,39.71,Chasimba,2,39,61,100,0.39,35.86,1 -------------------------------------------------------------------------------- 
/data/lec09_CommunityMatrix_Example.csv: -------------------------------------------------------------------------------- 1 | ,Site 1,Site 2,Site 3,Site 4 2 | Species 1,0,0,0,1 3 | Species 2,0,1,1,1 4 | Species 3,1,0,1,0 5 | Species 4,1,1,1,0 6 | Species 5,0,1,0,1 -------------------------------------------------------------------------------- /data/predator_prey_body_size.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/data/predator_prey_body_size.txt -------------------------------------------------------------------------------- /data/rikz_data.txt: -------------------------------------------------------------------------------- 1 | "Richness" "Exposure" "NAP" "Beach" "Site" 2 | 11 10 0.045 1 1 3 | 10 10 -1.036 1 2 4 | 13 10 -1.336 1 3 5 | 11 10 0.616 1 4 6 | 10 10 -0.684 1 5 7 | 8 8 1.19 2 1 8 | 9 8 0.82 2 2 9 | 8 8 0.635 2 3 10 | 19 8 0.061 2 4 11 | 17 8 -1.334 2 5 12 | 6 11 -0.976 3 1 13 | 1 11 1.494 3 2 14 | 4 11 -0.201 3 3 15 | 3 11 -0.482 3 4 16 | 3 11 0.167 3 5 17 | 1 11 1.768 4 1 18 | 3 11 -0.03 4 2 19 | 3 11 0.46 4 3 20 | 1 11 1.367 4 4 21 | 4 11 -0.811 4 5 22 | 3 10 1.117 5 1 23 | 22 10 -0.503 5 2 24 | 6 10 0.729 5 3 25 | 0 10 1.627 5 4 26 | 6 10 0.054 5 5 27 | 5 11 -0.578 6 1 28 | 4 11 -0.348 6 2 29 | 1 11 2.222 6 3 30 | 6 11 -0.893 6 4 31 | 4 11 0.766 6 5 32 | 2 11 0.883 7 1 33 | 1 11 1.786 7 2 34 | 1 11 1.375 7 3 35 | 3 11 -0.06 7 4 36 | 4 11 0.367 7 5 37 | 3 10 1.671 8 1 38 | 5 10 -0.375 8 2 39 | 7 10 -1.005 8 3 40 | 5 10 0.17 8 4 41 | 0 10 2.052 8 5 42 | 7 10 -0.356 9 1 43 | 11 10 0.094 9 2 44 | 3 10 -0.002 9 3 45 | 0 10 2.255 9 4 46 | 2 10 0.865 9 5 47 | -------------------------------------------------------------------------------- /data/survey.csv.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/data/survey.csv.gz -------------------------------------------------------------------------------- /data/wc2.0_bio_10m_01.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/data/wc2.0_bio_10m_01.tif -------------------------------------------------------------------------------- /data/wc2.0_bio_10m_12.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/data/wc2.0_bio_10m_12.tif -------------------------------------------------------------------------------- /image/Liriodendron_tulipifera.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/Liriodendron_tulipifera.png -------------------------------------------------------------------------------- /image/RIKZ_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/RIKZ_data.png -------------------------------------------------------------------------------- /image/RIKZ_data_Crossed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/RIKZ_data_Crossed.png -------------------------------------------------------------------------------- /image/RIKZ_data_DeepNest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/RIKZ_data_DeepNest.png 
-------------------------------------------------------------------------------- /image/SEM-figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/SEM-figure.png -------------------------------------------------------------------------------- /image/SEMfig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/SEMfig.png -------------------------------------------------------------------------------- /image/assignment-8-figure-q1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/assignment-8-figure-q1.png -------------------------------------------------------------------------------- /image/boxplot-problem.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/boxplot-problem.gif -------------------------------------------------------------------------------- /image/colourblind.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/colourblind.png -------------------------------------------------------------------------------- /image/comic-filenaming.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/comic-filenaming.gif -------------------------------------------------------------------------------- /image/dynamite-bars.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/dynamite-bars.png -------------------------------------------------------------------------------- /image/dynamite-vs-dists.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/dynamite-vs-dists.png -------------------------------------------------------------------------------- /image/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/favicon.png -------------------------------------------------------------------------------- /image/fig_scientific_method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/fig_scientific_method.png -------------------------------------------------------------------------------- /image/git_lesson/branch_dropdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/git_lesson/branch_dropdown.png -------------------------------------------------------------------------------- /image/git_lesson/branches.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/git_lesson/branches.png -------------------------------------------------------------------------------- /image/git_lesson/delete_branch.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/git_lesson/delete_branch.png -------------------------------------------------------------------------------- /image/git_lesson/sample_rmd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/git_lesson/sample_rmd.png -------------------------------------------------------------------------------- /image/git_lesson/yellow_prompt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/git_lesson/yellow_prompt.png -------------------------------------------------------------------------------- /image/heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/heatmap.png -------------------------------------------------------------------------------- /image/logistic.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/logistic.gif -------------------------------------------------------------------------------- /image/lotka-volterra.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/lotka-volterra.gif -------------------------------------------------------------------------------- /image/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/model.png 
-------------------------------------------------------------------------------- /image/predator-prey.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/predator-prey.gif -------------------------------------------------------------------------------- /image/signal-transduction-pathway.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/image/signal-transduction-pathway.png -------------------------------------------------------------------------------- /index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Syllabus: EEB313 Quantitative Methods in R for Biology [24L, 12P]' 3 | --- 4 | 5 | This course covers statistics and data analysis for ecology and reproducible quantitative methods in R. Statistical analysis, modelling, simulation, and data analysis are essential skills for applying ecology concepts to data. This course is designed to meet a growing demand for reproducible, openly accessible, analytically thorough, and well documented science. Students will learn to develop ecological population models, analyze data, and document their research using the R programming language. No prerequisite programming experience is required. 6 | 7 | Prerequisites: BIO220H1 and one of EEB225H1, STA288H1, or STA220H1 8 | 9 | ## Time 10 | Tue and Thu 2:10 - 4:00 pm. Office hours are Tue 4:00 - 5:00 pm. 11 | 12 | ## Class locations 13 | 14 | | Day | Room | 15 | |-----|--------------| 16 | | Tue | [Ramsay Wright](http://map.utoronto.ca/utsg/building/072) (RW 109) | 17 | | Thu | [Ramsay Wright](http://map.utoronto.ca/utsg/building/072) (RW 109) | 18 | 19 | Office hours are on Tuesdays from 4 to 5 PM in RW 109. 
20 | 21 | The lecture hall has access to individual computers for the students. To use the computer workstations, students can log in with their UTORid and password. Programs and packages that you install, and files that you save, will be deleted from these computers daily. Please bring a USB key to save files onto or email them to yourself. Students can use any of the lecture halls when there are no classes scheduled. Lecture halls are usually open 9 am - 5 pm; see the [online schedules](http://lab.chass.utoronto.ca/carr.php) for available times. 22 | 23 | ## Contact info 24 | Quercus is the preferred communication channel. If you need to use email instead, please address all general course-related issues to james.santangelo@mail.utoronto.ca, and project-specific communication to the respective TA of your group. Prefix the subject line with "EEB313". If you do not receive a reply within 48 hours (excluding weekends), please send a reminder. 25 | 26 | ### Course Instructors 27 | - James Santangelo, james.santangelo@mail.utoronto.ca 28 | - Ahmed Hasan, ahmed.hasan@mail.utoronto.ca 29 | - Zoe Humphries, zoe.humphries@mail.utoronto.ca 30 | - Amber Hoi, amber.hoi@mail.utoronto.ca 31 | 32 | ### Supervising professor 33 | Prof. Benjamin Gilbert, benjamin.gilbert@utoronto.ca, 416-978-4065, ES3035 34 | 35 | ## Course Website and Quercus 36 | All course information is accessible [on its own website](https://uoftcoders.github.io/rcourse/) and on [Quercus](https://q.utoronto.ca), including the syllabus, assessments, and lecture slides. If you have any problem accessing the material, let us know via email right away so we can fix the problem. 37 | 38 | ## Recommended resources 39 | - [R for Data Science](http://r4ds.had.co.nz/), H Wickham, G Grolemund, 2017 40 | - Excellent open access resource for R. 41 | - [RStudio cheat sheets](https://www.rstudio.com/resources/cheatsheets/), RStudio, 2017 42 | - As good as it sounds, great quick reference.
43 | - [R for ecological data science](https://datacarpentry.org/R-ecology-lesson/index.html) 44 | - An inspiration for our lectures. 45 | 46 | ## Course learning outcomes 47 | 1. Develop proficiency in the programming language R. 48 | 2. Use R to apply statistics to analyze and interpret data. 49 | 3. Choose appropriate analysis techniques for a variety of data types and formats. 50 | 4. Learn and use techniques and best practices for reproducible, high-quality science. 51 | 5. Learn how to work as part of a research team to produce a scientific product. 52 | 6. Learn what is required to generate a scientific item ready for publishing. 53 | 54 | ## Improving your writing skills 55 | Effective communication is crucial in science. The [University of Toronto provides services](http://writing.utoronto.ca/) to help you improve your writing, from general advice on effective writing to writing centers and writing courses. The Faculty of Arts & Science also offers an English Language Learning (ELL) program, which provides free individualized instruction in English skills. Take advantage of these! 56 | 57 | ## Academic integrity 58 | You should be aware of the University of Toronto Code of Behaviour on Academic Matters. Also see [How Not to Plagiarize](http://advice.writing.utoronto.ca/using-sources/how-not-to-plagiarize/). Note that it is NOT appropriate to use large sections from internet sources, and inserting a few words here and there does not make it an original piece of writing. Be careful in using internet sources – there is no review of most online material and there are many errors out there. Use internet sources only when absolutely necessary, and then only academic or government ones. Make sure you read material from many sources (published, peer-reviewed, trusted internet sources) and that you write an original text using this information. Always cite your sources. In case of doubt about plagiarism, talk to your instructor.
Please make sure that what you submit for the final project does not overlap with what you submit for other classes, such as the 4th year research project. We will not enforce this, but the department will. 59 | 60 | ## Lecture schedule 61 | | Week | Date | Topic | Instructor | 62 | |------|--------|--------------------------------------------|---------------------------| 63 | | 1 | Sep 10 | Intro to course, programming, RStudio, R Markdown | Everyone | 64 | | 1 | Sep 12 | Assignment, vectors, functions | Ahmed | 65 | | 2 | Sep 17 | Data frames, intro to dplyr | Ahmed | 66 | | 2 | Sep 19 | Data wrangling in dplyr, ggplot, tidy data | Ahmed | 67 | | 3 | Sep 24 | More dplyr and ggplot | Ahmed | 68 | | 3 | Sep 26 | Exploratory data analysis | Zoe | 69 | | 4 | Oct 01 | Linear models and statistical modelling | Zoe | 70 | | 4 | Oct 03 | Mixed effects models | James | 71 | | 5 | Oct 08 | Model selection | James | 72 | | 5 | Oct 10 | Multivariate stats | Amber | 73 | | 6 | Oct 15 | Spatial stats | Amber | 74 | | 6 | Oct 17 | Simulating data | James | 75 | | 7 | Oct 22 | Ecological modelling | Amber | 76 | | 7 | Oct 24 | Evolutionary modelling | Zoe | 77 | | 8 | Oct 29 | Reproducible science | Everyone | 78 | | 8 | Oct 31 | Datasets, hypotheses, begin projects | Everyone | 79 | | - | Nov 05 | Fall break | - | 80 | | - | Nov 07 | Fall break | - | 81 | | 9 | Nov 12 | Project work | Everyone | 82 | | 9 | Nov 14 | Project work | Everyone | 83 | | 10 | Nov 19 | Project work | Everyone | 84 | | 10 | Nov 21 | Project work | Everyone | 85 | | 11 | Nov 26 | Project work | Everyone | 86 | | 11 | Nov 28 | Project work | Everyone | 87 | | 12 | Dec 03 | Group presentations | Everyone | 88 | 89 | ## Assessment schedule 90 | | Assignment | Type | Due date | Marks | 91 | |----------------------------------|---------------------|------------|-------| 92 | | Getting set up | Individual | Sep 19 | 4 | 93 | | Basic R and dplyr | Individual | Sep 26 | 8 | 94 | | dplyr and tidy data | 
Individual | Oct 03 | 8 | 95 | | Data exploration, linear models | Individual | Oct 10 | 8 | 96 | | Model selection, multivar. stats | Individual | Oct 17 | 8 | 97 | | Spatial stats, randomization | Individual | Oct 24 | 8 | 98 | | Modelling | Individual | Oct 31 | 8 | 99 | | Mid-project update | Project, Group | Nov 14 | 10 | 100 | | Challenge assignment | Individual | Nov 21 | 16 | 101 | | Final report, presentation | Project, Group | Dec 03 | 22 | 102 | 103 | There are 100 marks in total. Your final course mark will be the sum of your assignment scores, which will be translated to a letter grade according to the [official grading scale](http://www.artsci.utoronto.ca/faculty-staff/teacher-info/academic-handbook-for-instructors/sections-9-11#official) of the Faculty of Arts and Science. 104 | 105 | Assignments will be distributed and submitted in the R Markdown format via Quercus. Assignments will be handed out on Tuesdays and are due at 11:59 pm on the Thursday seven weekdays later. _There will be a penalty of 5% per day (including weekends) for late submissions_. 106 | 107 | 108 | ### Final project grading rubric 109 | 110 | | | Inadequate (0 marks) | Adequate (4 marks) | Excellent (8 marks) | 111 | |------------|--------------------|--------------------|--------------------| 112 | | Contribution to group work | Student contributed little to project; self-assessed contributions are low in quality and/or quantity; self-assessment is not consistent with actual contribution. | Student contributed adequately to project; made some significant contributions | Student substantially contributed to project to ensure success; self-assessed contributions are crucial to project; self-assessment is consistent with actual contribution.
| 113 | | Content | Missing crucial information; methods and results are inconsistent, not logical, or not adequately explained; conclusions are confusing or unsupported by results; unnecessary information included as clutter | Most essential information included; methods and results are adequately described; conclusions supported by results; most included material is relevant to report | All essential information included; methods and results are succinct, clear, logical, and scientifically valid; conclusions are creative and meaningful; project is concise throughout | 114 | | Style and reproducibility | Code and writing are poorly organized, poorly formatted, missing units, difficult to read, poorly documented, difficult to reproduce analyses | Code and writing are well-organized, well-formatted, consistent use of units and significant figures | Code and writing are precise and clear throughout, free of errors, well-organized, well-documented, easily reproducible analyses, publication-ready | 115 | | Presentation | Presentation is poorly organized; much too long or much too short; presentation is unclear; presentation is missing information; presentation is not scientific and professional; presentation uses too much jargon; not all team members participate; does not adequately address audience questions | Presentation is adequately organized; timing is appropriate; most information is presented logically; presentation is scientific and professional; most jargon is avoided; all team members participate but not equally; audience questions are sometimes addressed well | Presentation is clearly and logically organized; presentation flows and is easy to follow; presentation includes appropriate information without jargon; presentation is well-rehearsed and high-quality; all team members participate equally; audience questions are clearly addressed | 116 | 117 | As the final project is a team effort, all members within a group will receive the same mark in the final three 
categories and an individual mark for their contribution to group work. A final project that is considered to lie between two of the defined levels will be marked accordingly, e.g. between "Adequate" and "Excellent" would be 5, 6, or 7 marks. 118 | -------------------------------------------------------------------------------- /lec04-dplyr.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Data wrangling and visualization in the tidyverse 3 | author: Joel Östblom 4 | --- 5 | 6 | ## Lesson preamble 7 | 8 | > ### Learning Objectives 9 | > 10 | > - Understand the split-apply-combine concept for data analysis. 11 | > - Use `summarize`, `group_by`, and `tally` to split a data frame into groups 12 | > of observations, apply a summary statistics for each group, and then combine 13 | > the results. 14 | > - Produce scatter plots, line plots, and histograms using ggplot. 15 | > - Set universal plot settings. 16 | > 17 | > ### Lesson outline 18 | > 19 | > - Split-apply-combine techniques in **`dplyr`** (25 min) 20 | > - Using `tally` to summarize categorical data (15 min) 21 | > - Plotting with **`ggplot2`** (20 min) 22 | > - Building plots iteratively (25 min) 23 | 24 | ----- 25 | 26 | ## Setting up 27 | 28 | Start by loading the required packages. Both **`ggplot2`** and **`dplyr`** are 29 | included in the **`tidyverse`** package collection. 30 | 31 | ```{r} 32 | # Install if needed 33 | # install.packages('tidyverse') 34 | library(tidyverse) 35 | ``` 36 | 37 | Load the data we saved in the previous lesson. 
38 | 39 | ```{r, eval=FALSE} 40 | # Download if needed 41 | # download.file("https://ndownloader.figshare.com/files/2292169", "data/portal_data.csv") 42 | surveys <- read_csv('portal_data.csv') 43 | ``` 44 | 45 | ```{r, echo=FALSE} 46 | surveys <- read_csv('data/portal_data.csv') 47 | ``` 48 | 49 | ```{r} 50 | surveys 51 | ``` 52 | 53 | 54 | ## Split-apply-combine techniques in dplyr 55 | 56 | Many data analysis tasks can be approached using the *split-apply-combine* 57 | paradigm: split the data into groups, apply some analysis to each group, and 58 | then combine the results. 59 | 60 | **`dplyr`** facilitates this workflow through the use of `group_by()` 61 | to split data and `summarize()`, which collapses each group into a single-row 62 | summary of that group. The arguments to `group_by()` are the column names that 63 | contain the **categorical** variables for which you want to calculate the 64 | summary statistics. Let's view the mean `weight` by sex. 65 | 66 | ```{r} 67 | surveys %>% 68 | group_by(sex) %>% 69 | summarize(mean_weight = mean(weight)) 70 | ``` 71 | 72 | The mean weights become `NA` since there are individual observations that are 73 | `NA`. Let's remove those observations. 74 | 75 | ```{r} 76 | surveys %>% 77 | filter(!is.na(weight)) %>% 78 | group_by(sex) %>% 79 | summarize(mean_weight = mean(weight)) 80 | ``` 81 | 82 | There is one row here that is neither male nor female; these are observations 83 | where the animal escaped before the sex could be determined. Let's remove 84 | those as well. 
85 | 86 | ```{r} 87 | surveys %>% 88 | filter(!is.na(weight) & !is.na(sex)) %>% 89 | group_by(sex) %>% 90 | summarize(mean_weight = mean(weight)) 91 | ``` 92 | 93 | You can also group by multiple columns: 94 | 95 | ```{r} 96 | surveys %>% 97 | filter(!is.na(weight) & !is.na(sex)) %>% 98 | group_by(genus, sex) %>% 99 | summarize(mean_weight = mean(weight)) 100 | ``` 101 | 102 | Since we will use the same filtered and grouped data frame in multiple code 103 | chunks below, we could assign this subset of the data to a new variable and use 104 | this variable in the subsequent code chunks instead of typing out the functions 105 | each time. 106 | 107 | ```{r} 108 | filtered_surveys <- surveys %>% 109 | filter(!is.na(weight) & !is.na(sex)) %>% 110 | group_by(genus, sex) 111 | ``` 112 | 113 | If you want to display more data, you can use the `print()` function at the end 114 | of your chain with the argument `n` specifying the number of rows to display. 115 | 116 | ```{r} 117 | filtered_surveys %>% 118 | summarize(mean_weight = mean(weight)) %>% 119 | print(n = 15) # Will change the knitted output, not the notebook 120 | ``` 121 | 122 | Once the data are grouped, you can also summarize multiple variables at the same 123 | time. For instance, we could add a column indicating the minimum weight for each 124 | species for each sex: 125 | 126 | ```{r} 127 | filtered_surveys %>% 128 | summarize(mean_weight = mean(weight), 129 | min_weight = min(weight)) 130 | ``` 131 | 132 | #### Challenge 133 | 134 | 1. Use `group_by()` and `summarize()` to find the mean, min, and max hindfoot 135 | length for each species. 136 | 137 | 2. What was the heaviest animal measured in each year? Return the columns `year`, 138 | `genus`, `species`, and `weight`. 
139 | 140 | ```{r, include=FALSE} 141 | ## Answer 1 142 | surveys %>% 143 | filter(!is.na(hindfoot_length)) %>% 144 | group_by(species) %>% 145 | summarize( 146 | mean_hindfoot_length = mean(hindfoot_length), 147 | min_hindfoot_length = min(hindfoot_length), 148 | max_hindfoot_length = max(hindfoot_length) 149 | ) 150 | ## Answer 2 151 | surveys %>% 152 | filter(!is.na(weight)) %>% 153 | group_by(year) %>% 154 | filter(weight == max(weight)) %>% # This is going to compare to the max weight within each group 155 | select(year, genus, species, weight) %>% 156 | arrange(year) 157 | ``` 158 | 159 | 160 | ### Using tally to summarize categorical data 161 | 162 | When working with data, it is also common to want to know the number of 163 | observations found for each factor or combination of factors. For this, **`dplyr`** 164 | provides `tally()`. For example, if we want to group by taxa and find the 165 | number of observations for each taxa, we would do: 166 | 167 | ```{r} 168 | surveys %>% 169 | group_by(taxa) %>% 170 | tally() 171 | ``` 172 | 173 | We can also use `tally()` when grouping on multiple variables: 174 | 175 | ```{r} 176 | surveys %>% 177 | group_by(taxa, sex) %>% 178 | tally() 179 | ``` 180 | 181 | Here, `tally()` is the action applied to the groups created by `group_by()` and 182 | counts the total number of records for each category. 183 | 184 | If there are many groups, `tally()` is not that useful on its own. For example, 185 | when we want to view the five most abundant species among the observations: 186 | 187 | ```{r} 188 | surveys %>% 189 | group_by(species) %>% 190 | tally() 191 | ``` 192 | 193 | Since there are 40 rows in this output, we would like to order the table to 194 | display the most abundant species first. In `dplyr`, we say that we want to 195 | `arrange()` the data. 196 | 197 | ```{r} 198 | surveys %>% 199 | group_by(species) %>% 200 | tally() %>% 201 | arrange(n) 202 | ``` 203 | 204 | Still not that useful. 
Since we are interested in the most abundant species, we 205 | want to display those with the highest count first, in other words, we want to 206 | arrange the column `n` in descending order: 207 | 208 | ```{r} 209 | surveys %>% 210 | group_by(species) %>% 211 | tally() %>% 212 | arrange(desc(n)) %>% 213 | head(5) 214 | ``` 215 | 216 | If we want to include more attributes about these species, we can include these 217 | in the call to `group_by()`: 218 | 219 | ```{r} 220 | surveys %>% 221 | group_by(species, taxa, genus) %>% 222 | tally() %>% 223 | arrange(desc(n)) %>% 224 | head(5) 225 | ``` 226 | 227 | Be careful not to include anything that would split the group into subgroups, 228 | such as `sex`, `year` etc. 229 | 230 | #### Challenge 231 | 232 | 1. How many individuals were caught in each `plot_type` surveyed? 233 | 234 | 2. You saw above how to count the number of individuals of each `sex` using a 235 | combination of `group_by()` and `tally()`. How could you get the same result 236 | using `group_by()` and `summarize()`? Hint: see `?n`. 237 | 238 | 239 | ```{r, include=FALSE} 240 | ## Answer 1 241 | surveys %>% 242 | group_by(plot_type) %>% 243 | tally() 244 | 245 | ## Answer 2 246 | surveys %>% 247 | group_by(sex) %>% 248 | summarize(n = n()) 249 | ``` 250 | 251 | 252 | ## Plotting with ggplot2 253 | 254 | **`ggplot2`** is a plotting package that makes it simple to create complex plots 255 | from data frames. The name **`ggplot2`** comes from its inspiration, the book "A 256 | grammar of graphics", and the main goal is to allow coders to express 257 | their desired outcome on a high level instead of telling the computer every 258 | detail about what will happen. For example, you would say "color my data by 259 | species" instead of "go through this data frame and plot any observations of 260 | species1 in blue, any observations of species2 in red, etc". 
Thanks to this 261 | functional way of interfacing with data, only minimal changes are required if the 262 | underlying data change or to change the type of plot. This helps in thinking 263 | about the data and creating publication quality plots with minimal amounts of 264 | adjustments and tweaking. 265 | 266 | ggplot graphics are built step by step by adding new elements, or layers. Adding layers in 267 | this fashion allows for extensive flexibility and customization of plots. To 268 | build a ggplot, we need to: 269 | 270 | 1. Use the `ggplot()` function and bind the plot to a specific data frame using the 271 | `data` argument 272 | 273 | ```{r} 274 | ggplot(data = surveys) 275 | ``` 276 | 277 | Remember, if the arguments are provided in the right order then the names of the 278 | arguments can be omitted. 279 | 280 | ```{r} 281 | ggplot(surveys) 282 | ``` 283 | 284 | 2. Define aesthetics (`aes`), by selecting the variables to be plotted and the 285 | variables to define the presentation such as plotting size, shape, color, etc. 286 | 287 | ```{r} 288 | ggplot(surveys, aes(x = weight, y = hindfoot_length)) 289 | ``` 290 | 291 | 3. Add `geoms` -- geometrical objects as a graphical representation of the data 292 | in the plot (points, lines, bars). **`ggplot2`** offers many different geoms; we 293 | will use a few common ones today, including: 294 | * `geom_point()` for scatter plots, dot plots, etc. 295 | * `geom_line()` for trend lines, time-series, etc. 296 | * `geom_histogram()` for histograms 297 | 298 | To add a geom to the plot use the `+` operator. Because we have two continuous 299 | variables, let's use `geom_point()` first: 300 | 301 | ```{r} 302 | # If this takes way too long on your machine, create a subset from a random 303 | # sample of a suitable size and continue working with this instead of `surveys`. 
304 | #survey_subset <- sample_n(surveys, size = 5000) 305 | 306 | ggplot(surveys, aes(x = weight, y = hindfoot_length)) + 307 | geom_point() 308 | ``` 309 | 310 | The `+` in the **`ggplot2`** package is particularly useful because it allows you 311 | to modify existing `ggplot` objects. This means you can easily set up plot 312 | "templates" and conveniently explore different types of plots, so the above 313 | plot can also be generated with code like this: 314 | 315 | ```{r, first-ggplot-with-plus} 316 | # Assign plot to a variable 317 | surveys_plot <- ggplot(surveys, aes(x = weight, y = hindfoot_length)) 318 | 319 | # Draw the plot 320 | surveys_plot + geom_point() 321 | ``` 322 | 323 | Notes: 324 | 325 | - Anything you put in the `ggplot()` function can be seen by any geom layers 326 | that you add (i.e., these are universal plot settings). This includes the x and 327 | y axis you set up in `aes()`. 328 | - You can also specify aesthetics for a given geom independently of the 329 | aesthetics defined globally in the `ggplot()` function. 330 | - The `+` sign used to add layers must be placed at the end of each line containing 331 | a layer. If, instead, the `+` sign is added in the line before the other layer, 332 | **`ggplot2`** will not add the new layer and R will return an error message. 333 | 334 | 335 | ### Building plots iteratively 336 | 337 | Building plots with ggplot is typically an iterative process. We start by 338 | defining the dataset we'll use, lay the axes, and choose a geom: 339 | 340 | ```{r} 341 | ggplot(surveys, aes(x = weight, y = hindfoot_length)) + 342 | geom_point() 343 | ``` 344 | 345 | Then, we start modifying this plot to extract more information from it. 
For 346 | instance, we can add transparency (`alpha`) to reduce overplotting: 347 | 348 | 349 | ```{r} 350 | ggplot(data = surveys, aes(x = weight, y = hindfoot_length)) + 351 | geom_point(alpha = 0.2) 352 | ``` 353 | 354 | Based on the hindfoot length and the weights, there appears to be 4-5 clusters 355 | in this data. Potentially, one of the categorical variables we have in the data 356 | could explain this pattern. Coloring the data points according to a 357 | categorical variable is an easy way to find out if there seems to be 358 | correlation. Let's try this with `plot_type`. 359 | 360 | ```{r} 361 | ggplot(surveys, aes(x = weight, y = hindfoot_length, color = plot_type)) + 362 | geom_point(alpha = 0.2) 363 | ``` 364 | 365 | It seems like the type of plot the animal was captured on correlates well with 366 | some of these clusters, but there are still many that are quite mixed. Let's try 367 | to do better! This time, the information about the data can provide some clues 368 | to which variable to look at. The plot above suggests that there might be 4-5 369 | clusters, so a variable with 4-5 values is a good guess for what could explain 370 | the observed pattern in the scatter plot. 371 | 372 | ```{r} 373 | surveys %>% 374 | summarize_all(n_distinct) 375 | ``` 376 | 377 | Remember that there are still `NA` values here, that's why there appears to be 378 | three sexes although there is only male and female. There are four taxa so that 379 | could be a good candidate, let's see which those are. 380 | 381 | ```{r} 382 | surveys %>% 383 | distinct(taxa) 384 | ``` 385 | 386 | It seems reasonable that these taxa contain animals different enough to have 387 | diverse weights and length of their feet. Lets use this categorical variable to 388 | color the scatter plot. 389 | 390 | ```{r} 391 | ggplot(surveys, aes(x = weight, y = hindfoot_length, color = taxa)) + 392 | geom_point(alpha = 0.2) 393 | ``` 394 | 395 | Only rodents? That was unexpected... 
Let's check what's going on. 396 | 397 | ```{r} 398 | surveys %>% 399 | group_by(taxa) %>% 400 | tally() 401 | ``` 402 | 403 | There are definitely mostly rodents in our data set... 404 | 405 | ```{r} 406 | surveys %>% 407 | filter(!is.na(hindfoot_length)) %>% # control by removing `!` 408 | group_by(taxa) %>% 409 | tally() 410 | ``` 411 | 412 | ...and it turns out that only rodents have had their hindfeet measured! 413 | 414 | Let's remove all animals that did not have their hindfeet measured, including 415 | those rodents that did not. Animals without their weight measured will also be 416 | removed. 417 | 418 | ```{r} 419 | surveys_hf_wt <- surveys %>% 420 | filter(!is.na(hindfoot_length) & !is.na(weight)) 421 | 422 | surveys_hf_wt %>% 423 | summarize_all(n_distinct) 424 | ``` 425 | 426 | Maybe the genus can explain what we are seeing. 427 | 428 | ```{r} 429 | ggplot(surveys_hf_wt, aes(x = weight, y = hindfoot_length, color = genus)) + 430 | geom_point(alpha = 0.2) 431 | ``` 432 | 433 | Now this looks good! There is a clear separation between different genera, but 434 | also significant spread within genera, for example in the weight of the green 435 | Neotoma observations. There are also two clearly separate clusters that are both 436 | colored in olive green (Dipodomys). Maybe separating the observations into 437 | different species would be better? 438 | 439 | ```{r} 440 | ggplot(surveys_hf_wt, aes(x = weight, y = hindfoot_length, color = species)) + 441 | geom_point(alpha = 0.2) 442 | ``` 443 | 444 | Great! Together with the genus plot, this definitely seems to explain most of the 445 | variance we see in the hindfoot length and weight measurements. It is still a 446 | bit messy as it appears like we have around 5 clusters, but there are 21 species 447 | in the legend. 
448 | 449 | ```{r} 450 | surveys %>% 451 | filter(!is.na(hindfoot_length) & !is.na(weight)) %>% 452 | group_by(species) %>% 453 | tally() %>% 454 | arrange(desc(n)) 455 | ``` 456 | 457 | There is a big drop from 838 to 159, let's include only those with more than 800 458 | observations. 459 | 460 | ```{r} 461 | surveys_abun_species <- surveys %>% 462 | filter(!is.na(hindfoot_length) & !is.na(weight)) %>% 463 | group_by(species) %>% 464 | mutate(n = n()) %>% # add count value to each row 465 | filter(n > 800) %>% 466 | select(-n) 467 | 468 | surveys_abun_species 469 | ``` 470 | 471 | Still has almost 25k observations, so only 10k was removed. 472 | 473 | ```{r} 474 | ggplot(surveys_abun_species, aes(x = weight, y = hindfoot_length, color = species)) + 475 | geom_point(alpha = 0.2) 476 | ``` 477 | 478 | 479 | #### Challenge 480 | 481 | Create a scatter plot of `hindfoot_length` over `species` with the `weight` showing in different colors. 482 | Is there any problem with this plot? *Hint: think about how many observations there are* 483 | 484 | ```{r, include=FALSE} 485 | ggplot(surveys_abun_species, aes(x = species, y = hindfoot_length, color = weight)) + # species on x, hindfoot length on y, weight as color, as the challenge asks 486 | geom_point(size = 0.1, position = 'jitter') # small jittered points: many observations share each species value 487 | ``` 488 | 489 | 490 | *Parts of this lesson material were taken and modified from [Data 491 | Carpentry](https://datacarpentry.org) under their CC-BY copyright license. 
See 492 | their [lesson page](https://datacarpentry.org/R-ecology-lesson/03-dplyr.html) 493 | for the original source.* 494 | -------------------------------------------------------------------------------- /lec14-datasets.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Scientific method, team dynamics, and project datasets" 3 | author: "Luke Johnston" 4 | output: pdf_document 5 | --- 6 | 7 | ## Lesson preamble: 8 | 9 | > ### Lesson objectives: 10 | > 11 | > - Learn about the scientific method and applying it 12 | > - Learn basics of group dynamics in a team setting 13 | > - Get into your groups, start your projects 14 | > - Identify which dataset to use 15 | > - Brainstorm possible research questions 16 | > 17 | > ### Lesson outline: 18 | > 19 | > - What is the scientific method (20 min) 20 | > - Team dynamics and assigning roles and tasks (15-20 min) 21 | > - Getting into groups (10 min) 22 | > - Exercise (25-30 min) 23 | > - Start project work (20-25 min) 24 | > - Datasets available for use (10-20 min) 25 | 26 | ----- 27 | 28 | ```{r message=FALSE, warning=FALSE, include=FALSE, eval=FALSE} 29 | # Run these locally if making changes. 
30 | library(dplyr) 31 | DiagrammeR::grViz(' 32 | digraph rmarkdown { 33 | graph [layout = neato, overlap = false, splines = true] 34 | node [shape = box, style = rounded] 35 | 36 | Hypothesis [pos = "0.75,3.85!"] 37 | CollectData [label = "Collect Data", pos = "0.75,3!"] 38 | AnalyzeData [label = "Analyze Data", pos = "0.75,2.25!"] 39 | HypothesisTrue [label = "Hypothesis\nif TRUE", pos = "0,1.5!"] 40 | HypothesisFalse [label = "Hypothesis\nif FALSE", pos = "1.5,1.5!"] 41 | Report [label = "Disseminate", pos = "0.75,0.75!"] 42 | 43 | Hypothesis -> CollectData -> AnalyzeData -> HypothesisTrue -> Report 44 | AnalyzeData -> HypothesisFalse -> Report 45 | {HypothesisTrue HypothesisFalse AnalyzeData CollectData} -> Hypothesis 46 | }') %>% 47 | DiagrammeRsvg::export_svg() %>% 48 | charToRaw() %>% 49 | rsvg::rsvg_png(file = "image/fig_scientific_method.png", 50 | width = 500, height = 700) 51 | ``` 52 | 53 | ## Scientific method 54 | 55 | 56 | 57 | ### Simplified diagram of the scientific method 58 | 59 | ![Simplified process flow for the scientific method](image/fig_scientific_method.png) 60 | 61 | ### Steps in the scientific process[^sci_method] 62 | 63 | 1. Identify research question(s). 64 | 2. Look into what the previous literature shows. 65 | 3. Create one or more hypotheses or objectives. 66 | 4. Write up an outline or expected approach to answering those 67 | questions/objectives (analysis and presentation plan): 68 | - How will the data be obtained and what is the data (i.e. the source)? 69 | - What statistical/mathematical techniques have previous researchers used? 70 | Will you use them? Are they appropriate (optional, may need expert)? 71 | - How will the results/data be presented or visualized (possible 72 | tables/figures)? 73 | 5. Run the planned analyses (or additional ones that may come up). 74 | 6. Visualize or present all results from the analyses. 75 | 7. Interpret the results and how they fit with the previous literature. 76 | 8. 
Draw conclusions based on the hypotheses/objectives. 77 | 9. Disseminate your results (in blogs, pre-print archives, journals, conferences) 78 | 79 | [^sci_method]: See [Khan Academy on Scientific Method](https://www.khanacademy.org/science/biology/intro-to-biology/science-of-biology/a/the-science-of-biology) for a brief overview and explanation of the scientific process. 80 | 81 | ## Team dynamics 82 | 83 | ### Basics of succeeding as a team 84 | 85 | The final assignment is done in a group, and to succeed you need to understand team dynamics: 86 | 87 | - Communication is **vital** to work together and to achieve the goal 88 | - Teams go through various stages 89 | - Need consensus for group norms, goals, duties/responsibilities, and conduct/behaviour 90 | - Important that everyone has a stake in the project 91 | - Rotate roles (specifically for the leader/facilitator) 92 | 93 | ### Stages of group formation 94 | 95 | - "Forming-Storming-Norming-Performing-Adjourning Model"[^group_forming] 96 | - Essentially, groups go through stages (not always all and not always in order): 97 | 1. Getting to know one another 98 | 2. Feeling comfortable and safer, testing boundaries 99 | 3. Opening up, voicing opinions, potential for conflict (which can be good!) 100 | 4. Stronger bonds form, cooperative 101 | 5. Greater focus and energy on completion of project 102 | 6. 
Team ends after project is successfully completed 103 | - These stages can be fast-tracked by discussing norms, duties, and conduct early on 104 | 105 | [^group_forming]: See [Principles of Group Dynamics](https://open.lib.umn.edu/principlesmanagement/chapter/13-3-group-dynamics/) 106 | 107 | ### Roles and responsibilities 108 | 109 | - Leader/Facilitator's duty: 110 | - Goal is to keep things running smoothly, focused on the task, and on track for time 111 | - Keep everyone on topic and on task; stay aware of the time 112 | - (Try to) mediate or resolve any conflicts (there will *always* be some type of conflict; how it's dealt with is what matters) 113 | - (Try to) encourage everyone to participate and allow everyone a chance at talking 114 | - Recorder's duty: 115 | - Goal is to write/type down main or important points raised or discussed when team is meeting 116 | - Keep notes and files organized and orderly 117 | - Organizer's duty: 118 | - Arrange for next meeting time and location 119 | - Send reminders to members a day or two before meeting 120 | - Make and email a simple agenda of tasks to do or to discuss 121 | 122 | ### Code of conduct 123 | 124 | - Vital to establishing boundaries and expectations of being a team member 125 | - How do you want each member to treat each other? 126 | - How do you deal with conflict? 127 | - What is acceptable and unacceptable behaviour? 
128 | - These are outlined in the code of conduct 129 | - Mostly it's common sense (be kind and respectful) 130 | - But it's important that you as a team write out what everyone wants and agrees to 131 | 132 | #### Example Code of Conduct: 133 | - [Contributor Covenant](https://www.contributor-covenant.org/version/1/4/code-of-conduct) 134 | - [UofT Coders Code of Conduct](https://github.com/UofTCoders/studyGroup/blob/gh-pages/codeOfConduct.md) 135 | 136 | ## Exercise 137 | 138 | ### As a group, complete these tasks 139 | 140 | 141 | 142 | - Get into your groups 143 | - Introduce each other: 144 | - Find out everyone's name and year of study 145 | - Find out other things about each other (e.g. any plans for next year, etc) 146 | - Create a one or two word "team name". We'll use this to create a shared folder for everyone to use. 147 | - Assign roles to each person (these roles will be rotated in every group setting): 148 | - You need: facilitator, recorder, organizer 149 | - Discuss how and when roles will be rotated *(record it)* 150 | - Discuss and brainstorm some codes of conduct you want your team to follow *(record it)* 151 | - Take a few minutes, think about your own skills and what you feel you are most competent in 152 | - Then, share the top one or two of those skills *(record those skills)* 153 | - Discuss how responsibilities of each member will be decided on *(record it)* 154 | 155 | ## Starting the projects 156 | 157 | ### Datasets available for use 158 | 159 | - [Continuous Plankton Recorder Dataset](https://www.gbif.org/dataset/67c54f85-7910-4cbf-8de4-6f0b136a0e34) 160 | 161 | Data on northern hemisphere plankton species, latitude, longitude, date. 162 | Going back to 1946. 935 Mb size, almost 2.75 million rows of data. 163 | 164 | - [Insecta of Costa Rica](https://www.gbif.org/dataset/3e9817c1-8302-4955-87e3-a408db0ea379) 165 | 166 | Data on insect species in Costa Rica, latitude, longitude, elevation, date. 
167 | 1.4 Gb size, almost 3.25 million rows of data. 168 | 169 | - [Marine predator and prey body sizes](http://www.esapubs.org/archive/ecol/E089/051/default.htm#data) 170 | 171 | Data from 27 different global locations on species, body measurements, 172 | latitude, longitude, date. 21 Mb, almost 35,000 rows of data (in long 173 | format). 174 | 175 | - [Mammalian life history](http://www.esapubs.org/archive/ecol/E084/093/default.htm) 176 | 177 | Data about general mammalian life history with species, body size, lifespan, 178 | litter size, and other reproductive variables. 150 Kb size, 1440 rows of 179 | data. 180 | 181 | - [North American Breeding Bird Survey](https://www.pwrc.usgs.gov/BBS/?CFID=36951359&CFTOKEN=5135bf261f2f1478-471B9FA3-C648-BE26-7C2176ADADE30428) 182 | 183 | Data about number of birds at multiple stops in North America. Many datasets 184 | of varying rows that need to be linked together. ~`r (50*114)/1000` Gb size 185 | (can be shortened) across >50 files. 186 | 187 | - [National Ecological Observatory Network](http://data.neonscience.org/static/browse.html) 188 | 189 | A repository of many large scale ecological datasets from a variety of systems collected over multiple years at approximately 50 sites in the USA. Feel free to browse the datasets for ones of interest to you, but I have highlighted a few below. 190 | 191 | + [Ground beetles in pitfall traps](http://data.neonscience.org/data-product-view?dpCode=DP1.10022.001): ~26 Mb .csv file with ID of ground beetle species from 40 traps arrayed in each of ~50 NEON sites since 2013. 192 | + [Macroinvertebrate collection](http://data.neonscience.org/data-product-view?dpCode=DP1.20120.001): ~9 Mb .csv file with IDs of benthic macroinvertebrates from lakes, non-wadeable streams, and wadeable streams from sites across the NEON network dating back to 2014. 
193 | + [Plant presence/absence and percent cover](http://data.neonscience.org/data-product-view?dpCode=DP1.10058.001): Presence/absence and percent cover of species in 10m^2^, 100m^2^, and 400m^2^ quadrats from multiple plots in each of 50 NEON sites dating back to 2013. This is the dataset we worked with in lecture 9. 194 | 195 | - [US EPA National Aquatic Resource Surveys](https://www.epa.gov/national-aquatic-resource-surveys/data-national-aquatic-resource-surveys) 196 | 197 | Numerous datasets from annual surveys of aquatic habitats conducted by the US Environmental Protection Agency. Includes data ranging from the physical environment (e.g. water quality, chemical properties, landscape variables, etc.) to the biotic environment (e.g. phytoplankton concentrations, benthic macroinvertebrates, etc.). Be sure to download the metadata as well, which is on the same page linked above. 198 | 199 | - [International Council for the Exploration of the Sea](http://www.ices.dk/marine-data/data-portals/Pages/default.aspx) 200 | 201 | Many large datasets from oceanic fish surveys. Includes oceanographic data in addition to biodiversity datasets, fish stomach content data, physical environmental conditions and contaminants, predation, etc. 202 | 203 | - [Alberta Ecological Information System](https://open.alberta.ca/opendata/ecological-information-systems-data) 204 | 205 | Data on vegetation and soil plots in Alberta. Over 26 000 sites are available. 206 | 207 | - [Beaver abundance and distribution](https://open.canada.ca/data/en/dataset/b9f21e91-d34d-4730-8195-edf051121e9d) 208 | 209 | Aerial and ground surveys of beaver abundance, feeding, and lodge distribution in Elk Island National Park starting in 1959. 
210 | 211 | - Benthic invertebrate abundance in [Ivvavik](https://open.canada.ca/data/en/dataset/3bad5ce0-0b16-43ee-be32-78cc2f64843f), [Tuktut](https://open.canada.ca/data/en/dataset/9046af59-81c4-4759-8979-f6185af8387d), and [Aulavik](https://open.canada.ca/data/en/dataset/2770949b-043c-4073-bc6c-b38b03a5f528) 212 | 213 | Counts of benthic invertebrate taxa from 3 river corridors from 2009 to 2015. Recommended to use all 3 datasets. 214 | 215 | - [Red-backed salamander abundance](https://open.canada.ca/data/en/dataset/3571474b-8d75-491d-816e-f84677b81a7c) 216 | 217 | Abundance of red-backed salamanders from 4 sites in the Bruce Peninsula from 2004 to 2017. 218 | 219 | In addition to the datasets shown above, we encourage students to bring their own datasets for use in their group projects. 220 | 221 | ### Example hypotheses with figures 222 | 223 | > Simple hypothesis: Bigger predators eat bigger prey. 224 | 225 | Easy to identify independent and dependent variables and visualize with plots 226 | and test with linear regression. 227 | 228 | > Advanced hypothesis: Which characteristics determine prey size among marine 229 | predators? 230 | 231 | Lots of possible groupings available. Start with visualizing some good 232 | candidate variables such as predator weight and length. Realize that the 233 | relationship is more complex than this and start dividing the data set 234 | according to species, water temperature, weather, etc. Use a combination of 235 | visualization and regression analyses. Fit models to the data to determine 236 | which types of regressions are appropriate. 

```{r, message=FALSE}
library(tidyverse)
```

```{r, eval=FALSE}
# Load the predator/prey body size table straight from the online archive
# (the column names are tidied in a later chunk)
pred_prey <- read_tsv('http://www.esapubs.org/archive/ecol/E089/051/Predator_and_prey_body_sizes_in_marine_food_webs_vsn4.txt')
```

```{r, echo=FALSE}
# Hidden chunk: load the copy of the data that is stored in the repository.
# The commented-out commands below show how such a local copy can be made:
# download the full table, keep a random sample of 1000 rows, and save it.
# download.file(
#   'http://www.esapubs.org/archive/ecol/E089/051/Predator_and_prey_body_sizes_in_marine_food_webs_vsn4.txt',
#   "data/predator_prey_body_size.txt"
# )
# pred_prey <- read_tsv("data/predator_prey_body_size.txt")
# pred_prey <- sample_n(pred_prey, 1000)
# write_tsv(pred_prey, "data/predator_prey_body_size.txt")
pred_prey <- read_tsv("data/predator_prey_body_size.txt")
```

```{r}
# Tidy the column names: spaces become underscores, then everything lower case
colnames(pred_prey) <- gsub(" ", "_", colnames(pred_prey))
colnames(pred_prey) <- tolower(colnames(pred_prey))

# Keep adult predators only, restricted to the columns of interest.
# The life stage labels mix upper and lower case, so compare in lower case.
adult_pred_food <- pred_prey %>%
  filter(tolower(predator_lifestage) == "adult") %>%
  select(predator_common_name, predator_length, prey_mass, predator_mass)

# One row per predator species, holding the averages that will be plotted
plot_data <- adult_pred_food %>%
  group_by(predator_common_name) %>%
  summarise(
    mean_pred_mass = mean(predator_mass),
    mean_prey_mass = mean(prey_mass),
    mean_pred_length = mean(predator_length)
  ) %>%
  # Drop the very heavy predators ("outliers")
  filter(mean_pred_mass < 8000)

# Mean prey weight as a function of mean predator weight
plot_data %>%
  ggplot(aes(mean_pred_mass, mean_prey_mass)) +
  geom_point() +
  geom_smooth()

# Mean prey weight as a function of mean predator length
plot_data %>%
  ggplot(aes(mean_pred_length, mean_prey_mass)) +
  geom_point() +
  geom_smooth()
```


### As a group, complete these tasks

In your group, rotate roles (need a facilitator and recorder at minimum).
Before 291 | the end of class, finish these: 292 | 293 | - Choose two possible datasets (or more) your team would like to work from (can also be 294 | datasets not presented in class) *(record them)* 295 | - Look into the data documentation, see what type of variables there are, what 296 | published articles are available. 297 | - Then, brainstorm as many research questions as possible for those two datasets 298 | *(record them)* 299 | - Goal is to write down as many ideas as possible 300 | - No question is off limits and no question is too simple or too complex! 301 | - (You might combine or split questions later, just get your ideas down!) 302 | - Just write whatever comes to mind, whether it seems like a good idea or not. Just start writing! 303 | 304 | Make sure to *record* everything! Many of these tasks will also be part of 305 | your mid-project update! 306 | 307 | ### Set up GitHub account 308 | 309 | Before we finish the class, we need to prepare a bit for next class. We need to 310 | create a [GitHub](https://github.com) account! 311 | -------------------------------------------------------------------------------- /mid-project-update.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'EEB313 Mid-project update (12 marks)' 3 | output: 4 | html_document: 5 | toc: false 6 | --- 7 | 8 | *To submit this assignment, upload the full document on Quercus, including the 9 | original questions, your code (if applicable), and the output. Submit your 10 | assignment as a knitted `.pdf` or `.html` file.* 11 | 12 | Prior to beginning this assignment, we suggest you take a look through [this 13 | link](https://swcarpentry.github.io/r-novice-gapminder/02-project-intro/index.html) 14 | on how to manage projects in RStudio using R Projects. While not a component of 15 | this assignment, the tutorial may prove useful as you move forward with your 16 | group projects. 17 | 18 | These tasks seem like a lot of work.
However, we will be doing these tasks 19 | already during class and during project work. The purpose of the mid-project 20 | update is for you to set up your repo and get something started on your final 21 | project. While most of these tasks are group based, please fill out each as an 22 | individual. In particular, the first question should be about *your* forked 23 | repository, not that of one of your team members. The remainder of the questions each 24 | involve a single pull request; make sure everyone in your group does at least 25 | one. Each team member should submit their *own assignment*. 26 | 27 | 1. A new repository has been created for your project on the 28 | `EEB313-2019` GitHub organization. Paste the URL of this forked 29 | version below. (0.25 marks) 30 | 31 | - URL of *your* fork: 32 | 33 | 2. On the main repo (you will be working on this from here on out, unless you 34 | are doing the fork-based workflow) -- have one of your group members create 35 | a new branch from `master` called `add-conduct`. Create a `CONDUCT.md` file in 36 | this branch. Write down what you as a team decided on for acceptable 37 | conduct/behaviour of team members to each other (e.g. "Must be considerate and 38 | respectful"). You may reuse existing Codes of Conduct, such as the UofT Coders 39 | Code of Conduct or the Contributor Covenant, but make sure to both a) credit 40 | whichever you use and b) still expand upon it with project-specific items (e.g. 41 | meeting frequency, PR approval policy, role rotations). The designated group 42 | member needs to create a pull request of this new file to the `master` branch. 43 | Copy and paste the link to the pull request below. Every team member must 44 | approve this pull request via whatever means you as a team decide on (e.g. "all 45 | members must 'thumbs up' the PR message" or "all members must make a new 46 | comment saying 'good to merge'"). Make sure the expectations and behaviours are 47 | explicit and clear.
(1.0 marks) 48 | 49 | - URL of team member's pull request for `CONDUCT.md`: 50 | 51 | 3. A `README.md` file should already exist in your project. Once the previous 52 | PR has been merged into `master`, complete the following tasks in the 53 | README, discussed and written up *as a team*. Decide who on your team will 54 | create the branch, make these edits, and create a pull request of this file to 55 | the main project repository. This person should *not* be the same person who 56 | did task 2. Every team member must approve this pull request. (1.75 marks) 57 | 58 | - Create the following headers (make sure to use Markdown headers 59 | `#`): "Introduction to the project", "Description of the data", 60 | "Team description". 61 | 62 | - Fill out the "Introduction to the project" section, answering 63 | these questions in a paragraph form (don't include these 64 | questions in the section). What is your project about? What is 65 | the goal? Why are you doing it? 66 | 67 | - Fill out the "Description of the data" section, and briefly 68 | write down what the data is about, what are the variables you 69 | think you'll use, how the data was collected, and how it will 70 | answer your research questions. Include a reference of the 71 | dataset if one is available, for instance: 72 | 73 | Forstmann BU, et al. (2014) Data from: Multi-modal ultra-high resolution 74 | structural 7-Tesla MRI data repository. Dryad Digital Repository. 75 | (https://dx.doi.org/10.5061/dryad.fb41s) 76 | 77 | - Fill out the "Team description" section by writing down a brief 78 | biography of each member (including what their skills are and 79 | what their approximate responsibilities -- which can change later -- 80 | are for the project) as well as how team roles will be rotated. 81 | 82 | - URL of team member's pull request for `README.md`: 83 | 84 | 4. 
Once the previous PR has been merged into `master`, have a different group member create a file called 85 | `doc/objectives.md` in a new branch. Note that this will simultaneously create a new folder 86 | called `doc` containing a file called `objectives.md` -- this is [how folders are created on GitHub repos](https://github.com/KirstieJane/STEMMRoleModels/wiki/Creating-new-folders-in-GitHub-repository-via-the-browser). 87 | In `objectives.md`, create one header (`#` markdown header) called "Study 88 | objectives". Create a list (`-` markdown syntax) of each of your research 89 | questions that you *as a team* thought of. It doesn't matter what the questions 90 | are, how simple, complicated or obvious they are. Just have something written 91 | down, and make sure these are _explicitly_ framed as questions or 92 | hypotheses. In this file, also list explicit predictions wherever possible; 93 | i.e. what you think a given relationship might look like. Bear in mind that 94 | these research questions *will* change as you do your analyses. The point is 95 | for you to get started thinking about ideas. Decide on another (new) team 96 | member to create this file, record it, and make a pull request of this new 97 | file. Every team member must approve this pull request. (3 marks) 98 | 99 | - URL of team member's pull request for `objectives.md`: 100 | 101 | 5. Once the previous PR has been merged, start up another branch and create a 102 | file in the `doc/` folder called `analysis-plan.md`. Create three headers 103 | (`#` markdown headers): "Possible analyses", "Possible results tables", and 104 | "Possible results figures". As a team, discuss and record some possible 105 | analyses on the data that you could do to answer the research questions. 106 | Discuss and record possible ways to present your results (possible tables, some 107 | visualizations). 
Moreover, discuss how you anticipate getting from the raw data 108 | to whatever summary data you will use to generate a given plot (i.e. explain a 109 | data cleaning/transformation plan). Moreover, discuss what kinds of statistical 110 | approaches you anticipate employing. Once again, these do not have to be final, 111 | but you need to show that your team has thought about how to approach this. 112 | Decide on another (new) team member to create this file in a new branch, record 113 | it, and make a pull request of this new file. Every team member must approve 114 | this pull request. *Note:* this analysis and presentation plan does **not** 115 | have to be accurate, nor do you have to use this later on. It could and will 116 | (very likely) change. The point is to get you as a group thinking about how you 117 | will answer the research questions. (3 marks) 118 | 119 | - URL of team member's pull request for `analysis-plan.md`: 120 | 121 | 6. Finally, once the previous PR has been merged, have another group member 122 | create a file called `plots/mock/README.md` in a new branch. This will also 123 | create a folder called `plots` and a subfolder called `mock` within it. Add a 124 | few mock figures into this folder showing your predictions. These do not have 125 | to be 'publication-ready' plots, and can be made in any software of your 126 | choosing (R, Excel, PowerPoint, etc) with or without simulated data points; the 127 | important thing is that the predictions are clear and that your team can show 128 | you have been thinking about how to present your data. In `README.md`, list 129 | details about the mock figures in pseudo-figure caption format. Note that image 130 | files (png, pdf, etc) can also be uploaded to GitHub via dragging and dropping 131 | -- but make sure you are in a branch before you do this. The image files should 132 | be part of the same PR as `README.md`. 
(3 marks) 133 | 134 | - URL of team member's pull request for `plots/mock/README.md` and image files: 135 | 136 | 137 | -------------------------------------------------------------------------------- /paper.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{haaranen_programming_2017, 2 | address = {New York, NY, USA}, 3 | series = {{ITiCSE} '17}, 4 | title = {Programming as a performance: Live-streaming and its implications for {Computer} {Science} education}, 5 | isbn = {978-1-4503-4704-4}, 6 | shorttitle = {Programming as a performance}, 7 | url = {https://doi.acm.org/10.1145/3059009.3059035}, 8 | doi = {10.1145/3059009.3059035}, 9 | abstract = {This article discusses an emerging phenomenon of streaming programming to a live audience who in turn can interact with the streamer. In essence, this means broadcasting the programming environment and typically a web camera feed of the streamer to viewers. Streaming programming bears many similarities with live-streaming playing of video games, which has become extremely popular among gamers over the recent years. In fact, streaming programming often use the same web services as streaming gaming, and the audiences overlap. In this article, we describe this novel approach to programming and situate it in the broader context of computer science education. To gain a deeper insight into this phenomena, we analyzed viewer discussions during a particular programming stream broadcasted during a game programming competition. 
Finally, we discuss the benefits this approach could offer to computer science education.}, 10 | urldate = {2018-12-21}, 11 | booktitle = {Proceedings of the 2017 {ACM} {Conference} on {Innovation} and {Technology} in {Computer} {Science} {Education}}, 12 | publisher = {ACM}, 13 | author = {Haaranen, Lassi}, 14 | year = {2017}, 15 | keywords = {computer science education, game-based learning, online communities, streaming}, 16 | pages = {353--358} 17 | } 18 | 19 | @inproceedings{rubin_effectiveness_2013, 20 | address = {New York, NY, USA}, 21 | series = {{SIGCSE} '13}, 22 | title = {The effectiveness of live-coding to teach introductory programming}, 23 | isbn = {978-1-4503-1868-6}, 24 | url = {https://doi.acm.org/10.1145/2445196.2445388}, 25 | doi = {10.1145/2445196.2445388}, 26 | abstract = {Live-coding is defined as "the process of designing and implementing a [coding] project in front of class during lecture period". In this article we present our research design and results regarding the effectiveness of live-coding to teach introductory programming. The research design includes two experimental groups spread across four sections of an introductory C++ course at Colorado School of Mines. In the control group, students were taught using static code, meaning that instructors never typed, but instead viewed, compiled, and executed code examples. In the experimental or "live-coding" group, instructors started each lecture with a blank screen, and taught code examples by systematically typing, compiling, and testing code to solve example problems. To assess the effectiveness of live-coding, we administered four surveys and analyzed final grades. Two of the surveys were given at the beginning of the course, and were used to measure baseline programming knowledge and student learning preferences (i.e., VARK). 
The other two surveys, given at the end of the course, were designed to measure the amount of programming knowledge obtained as well as preferences towards live coding. Lastly, final grades were analyzed in terms of its subcomponents: the assignments, exams, final project, and overall grade. Based on our results, we conclude that teaching via live-coding is as good as if not better than using static code examples.}, 27 | urldate = {2018-12-21}, 28 | booktitle = {Proceeding of the 44th {ACM} {Technical} {Symposium} on {Computer} {Science} {Education}}, 29 | publisher = {ACM}, 30 | author = {Rubin, Marc J.}, 31 | year = {2013}, 32 | keywords = {introductory, live-coding, pedagogy, programming}, 33 | pages = {651--656} 34 | } 35 | 36 | @book{wilson_teaching_2018, 37 | address = {Leipzig}, 38 | title = {Teaching tech together: how to design and deliver lessons that work and build a teaching community around them}, 39 | isbn = {978-0-9881137-0-1}, 40 | shorttitle = {Teaching tech together}, 41 | language = {en}, 42 | publisher = {Amazon Distribution GmbH}, 43 | author = {Wilson, Greg}, 44 | year = {2018}, 45 | url = {http://teachtogether.tech/} 46 | } 47 | 48 | @article{strobel_when_2009, 49 | title = {When is {PBL} more effective? 
{A} meta-synthesis of meta-analyses comparing {PBL} to conventional classrooms}, 50 | volume = {3}, 51 | issn = {1541-5015}, 52 | shorttitle = {When is {PBL} more effective?}, 53 | url = {https://docs.lib.purdue.edu/ijpbl/vol3/iss1/4}, 54 | doi = {10.7771/1541-5015.1046}, 55 | number = {1}, 56 | journal = {Interdisciplinary Journal of Problem-Based Learning}, 57 | author = {Strobel, Johannes and Barneveld, Angela van}, 58 | month = mar, 59 | year = {2009} 60 | } 61 | 62 | @article{markham_project_2011, 63 | title = {Project-based learning: A bridge just far enough}, 64 | volume = {39}, 65 | copyright = {Copyright E L Kurdyla Publishing LLC Dec 2011}, 66 | issn = {14811782}, 67 | url = {https://search.proquest.com/docview/915254354/abstract/707DEDB5F1E145E5PQ/1}, 68 | abstract = {[...] well-executed PBL emphasizes a carefully planned assessment that incorporates formative feedback, detailed rubrics, and multiple evaluations of content and skills. [...] PBL can be defined as an extended learning process that uses inquiry and challenge to stimulate the growth and mastery of skills. [...] PBL refocuses education on the student, not the curriculum---a shift mandated by the global world, which rewards intangible assets such as drive, passion, creativity, empathy, and resiliency.}, 69 | language = {English}, 70 | number = {2}, 71 | urldate = {2018-12-21}, 72 | journal = {Teacher Librarian; Bowie}, 73 | author = {Markham, Thom}, 74 | month = dec, 75 | year = {2011}, 76 | keywords = {Advantages, Core curriculum, Design, Education, Methods, Teaching, Young adults}, 77 | pages = {38--42} 78 | } 79 | 80 | @book{sawyer_cambridge_2006, 81 | address = {Cambridge, NY, USA}, 82 | title = {The {Cambridge} handbook of the learning sciences}, 83 | isbn = {978-0-521-84554-0 978-0-521-60777-3}, 84 | language = {en}, 85 | publisher = {Cambridge University Press}, 86 | editor = {Sawyer, R.
Keith}, 87 | year = {2006}, 88 | doi = {10.1192/bjp.bp.106.029678}, 89 | note = {OCLC: ocm62728545}, 90 | keywords = {Cognitive learning, Learning, Learning, Psychology of, Social aspects} 91 | } 92 | 93 | @article{wilson-software-carpentry, 94 | author = {Greg Wilson}, 95 | title = {{Software} {Carpentry}: Getting scientists to write better code by making them more productive}, 96 | journal = {Computing in Science \& Engineering}, 97 | month = {November--December}, 98 | year = {2006}, 99 | doi = {10.1109/MCSE.2006.122}, 100 | note = {Summarizes the what and why of Version 3 of the course.} 101 | } 102 | 103 | @Manual{tidyverse, 104 | title = {tidyverse: Easily install and load the 'Tidyverse'}, 105 | author = {Hadley Wickham}, 106 | year = {2017}, 107 | note = {R package version 1.2.1}, 108 | url = {https://CRAN.R-project.org/package=tidyverse}, 109 | } 110 | 111 | @Manual{R, 112 | title = {R: A language and environment for statistical computing}, 113 | author = {{R Core Team}}, 114 | organization = {R Foundation for Statistical Computing}, 115 | address = {Vienna, Austria}, 116 | year = {2018}, 117 | url = {https://www.R-project.org/}, 118 | } 119 | 120 | @Misc{carpentry, 121 | author = {Achaz {von Hardenberg} and Adam Obeng and Aleksandra Pawlik and Alex Pletzer and Alexey Shiklomanov and Anne Fouilloux and April Wright and Auriel Fournier and Ben Marwick and C. 
Titus Brown and Carolina Johnson and Carolyn Voter and Catherine Hulshof and Christie Bahlai and Clara Shaw and Daijiang Li and Daina Bouquin and Daniel Stubbs and Danielle Quinn and Darya Vanichkina and Dmytro Fishman and Earle Wilson and Edmund Hart and Eilis Hannon and Elena Sügis and Eli Strauss and Emilia Gan and Erin Becker and Ethan White and Francisco Rodriguez-Sanchez and Francois Michonneau and Fred Boehm and {GMoncrieff} and Hao Ye and Harriet Dashnow and Hilmar Lapp and {JSurman} and Jaime Ashander and Jarrett Byrnes and Jeffrey W Hollister and Jieming Chen and Jillian Dunic and {Jon} and Jonathan Keane and Joseph Stachelek and Josh Herr and K. A. S. Mislan and Kara Woo and Karen Cranston and Kari L. Jordan and Karthik Ram and Kate Hertweck and Kathe Todd-Brown and Katie Lotterhos and Kayla Peck and Kenan Direk and Kevin Hall and Kristian Tylén and Kyriakos Chatzidimitriou and Lachlan Deer and Laurent Gatto and Leah Wasser and Leszek Tarkowski and Lisa Breckels and M. Foos and Marco Chiapello and Mark Robinson and Markus J. 
Akenbrand and Mateusz Kuzak and Matthias Grenié and Matthias Grenié and Maëlle Salmon and Meghan Duffy and Michael Koontz and Myfanwy Johnston and Nicholas Marino and Nick Carchedi and Olivia Burge and Philip Lijnzaad and Philip Lijnzaad and Ryan Peek and Sarah Supp and Shawn Taylor and Stephanie Labou and Steve Pederson and Tara Webster and Taylor Reiter and Thomas Sandmann and Tracy Teal and Will Furnass and Will Pearse and Ye Li and Zena Lapp and {ab604} and {ashander} and {cengel} and Brian Seok and {sfn_brt} and {suparee}}, 122 | title = {{Data} {Carpentry}: {R} for data analysis and visualization of ecological data}, 123 | editor = {Francois Michonneau and Auriel Fournier}, 124 | month = {November}, 125 | year = {2018}, 126 | url = {https://datacarpentry.org/R-ecology-lesson/}, 127 | doi = {10.5281/zenodo.569338}, 128 | } 129 | 130 | @Manual{dplyr, 131 | title = {dplyr: A grammar of data manipulation}, 132 | author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller}, 133 | year = {2018}, 134 | note = {R package version 0.7.8}, 135 | url = {https://CRAN.R-project.org/package=dplyr}, 136 | } 137 | 138 | @Article{lme4, 139 | title = {Fitting linear mixed-effects models using {lme4}}, 140 | author = {Douglas Bates and Martin M{\"a}chler and Ben Bolker and Steve Walker}, 141 | journal = {Journal of Statistical Software}, 142 | year = {2015}, 143 | volume = {67}, 144 | number = {1}, 145 | pages = {1--48}, 146 | doi = {10.18637/jss.v067.i01}, 147 | } 148 | 149 | @Article{lmerTest, 150 | title = {{lmerTest} package: Tests in linear mixed effects models}, 151 | author = {Alexandra Kuznetsova and Per B. Brockhoff and Rune H. B. 
Christensen}, 152 | journal = {Journal of Statistical Software}, 153 | year = {2017}, 154 | volume = {82}, 155 | number = {13}, 156 | pages = {1--26}, 157 | doi = {10.18637/jss.v082.i13}, 158 | } 159 | 160 | @Manual{rmarkdown, 161 | title = {rmarkdown: Dynamic documents for {R}}, 162 | author = {JJ Allaire and Yihui Xie and Jonathan McPherson and Javier Luraschi and Kevin Ushey and Aron Atkins and Hadley Wickham and Joe Cheng and Winston Chang and Richard Iannone}, 163 | year = {2018}, 164 | note = {R package version 1.11}, 165 | url = {https://rmarkdown.rstudio.com}, 166 | } 167 | 168 | @Article{mice, 169 | title = {{mice}: Multivariate imputation by chained equations in {R}}, 170 | author = {Stef {van Buuren} and Karin Groothuis-Oudshoorn}, 171 | journal = {Journal of Statistical Software}, 172 | year = {2011}, 173 | volume = {45}, 174 | number = {3}, 175 | pages = {1--67}, 176 | doi = {10.18637/jss.v045.i03}, 177 | url = {https://www.jstatsoft.org/v45/i03/}, 178 | } 179 | 180 | @Article{multcomp, 181 | title = {Simultaneous inference in general parametric models}, 182 | author = {Torsten Hothorn and Frank Bretz and Peter Westfall}, 183 | journal = {Biometrical Journal}, 184 | year = {2008}, 185 | volume = {50}, 186 | number = {3}, 187 | pages = {346--363}, 188 | doi = {10.1002/bimj.200810425} 189 | } 190 | 191 | @Manual{psych, 192 | title = {{psych}: Procedures for psychological, psychometric, and personality research}, 193 | author = {William Revelle}, 194 | organization = {Northwestern University}, 195 | address = {Evanston, IL, USA}, 196 | year = {2018}, 197 | note = {R package version 1.8.10}, 198 | url = {https://CRAN.R-project.org/package=psych}, 199 | } 200 | 201 | @Book{car, 202 | title = {An {R} companion to applied regression}, 203 | edition = 2, 204 | author = {John Fox and Sanford Weisberg}, 205 | year = {2011}, 206 | publisher = {Sage}, 207 | address = {Thousand Oaks, CA, USA}, 208 | url = {https://socialsciences.mcmaster.ca/jfox/Books/Companion-2E},
209 | } 210 | 211 | @Article{reshape2, 212 | title = {Reshaping data with the {reshape} package}, 213 | author = {Hadley Wickham}, 214 | journal = {Journal of Statistical Software}, 215 | year = {2007}, 216 | volume = {21}, 217 | number = {12}, 218 | pages = {1--20}, 219 | url = {https://www.jstatsoft.org/v21/i12/}, 220 | doi = {10.18637/jss.v021.i12}, 221 | } 222 | 223 | @Article{plyr, 224 | title = {The split-apply-combine strategy for data analysis}, 225 | author = {Hadley Wickham}, 226 | journal = {Journal of Statistical Software}, 227 | year = {2011}, 228 | volume = {40}, 229 | number = {1}, 230 | pages = {1--29}, 231 | url = {https://www.jstatsoft.org/v40/i01/}, 232 | doi = {10.18637/jss.v040.i01} 233 | } 234 | 235 | @Manual{MuMIn, 236 | title = {{MuMIn}: Multi-model inference}, 237 | author = {Kamil Bartoń}, 238 | year = {2018}, 239 | note = {R package version 1.42.1}, 240 | url = {https://CRAN.R-project.org/package=MuMIn}, 241 | } 242 | 243 | @Article{deSolve, 244 | title = {Solving differential equations in {R}: package {deSolve}}, 245 | author = {Karline Soetaert and Thomas Petzoldt and R. Woodrow Setzer}, 246 | journal = {Journal of Statistical Software}, 247 | volume = {33}, 248 | number = {9}, 249 | pages = {1--25}, 250 | year = {2010}, 251 | coden = {JSSOBK}, 252 | issn = {1548-7660}, 253 | url = {https://www.jstatsoft.org/v33/i09}, 254 | doi = {10.18637/jss.v033.i09}, 255 | keywords = {ordinary differential equations, partial differential 256 | equations, differential algebraic equations, initial value problems, 257 | R, FORTRAN, C}, 258 | } 259 | 260 | @Manual{knitr, 261 | title = {{knitr}: A general-purpose package for dynamic report generation in {R}}, 262 | author = {Yihui Xie}, 263 | year = {2018}, 264 | note = {R package version 1.21}, 265 | url = {https://yihui.name/knitr/}, 266 | } 267 | 268 | @Manual{EcoSimR, 269 | title = {{EcoSimR}: Null model analysis for ecological data}, 270 | author = {Nicholas J. Gotelli and Edmund M. 
Hart and Aaron M. Ellison}, 271 | year = {2015}, 272 | note = {R package version 0.1.0}, 273 | url = {https://github.com/gotellilab/EcoSimR}, 274 | doi = {10.5281/zenodo.16522} 275 | } 276 | -------------------------------------------------------------------------------- /paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "A graduate student-led participatory live-coding quantitative methods course in R: Experiences on initiating, developing, and teaching" 3 | tags: 4 | - R 5 | - ecology 6 | - statistics 7 | - biology 8 | - undergraduate 9 | authors: 10 | - name: Luke W. Johnston 11 | orcid: 0000-0003-4169-2616 12 | affiliation: "2,3" 13 | - name: Madeleine Bonsma-Fisher 14 | orcid: 0000-0002-5813-4664 15 | affiliation: 1 16 | - name: Joel Ostblom 17 | orcid: 0000-0003-0051-3239 18 | affiliation: 4 19 | - name: Ahmed R. Hasan 20 | orcid: 0000-0003-0002-8399 21 | affiliation: 6 22 | - name: James S. Santangelo 23 | orcid: 0000-0002-5921-2548 24 | affiliation: 7 25 | - name: Lindsay Coome 26 | orcid: 0000-0001-8126-3571 27 | affiliation: 5 28 | - name: Lina Tran 29 | orcid: 0000-0003-3504-4524 30 | affiliation: 8 31 | - name: Elliott Sales de Andrade 32 | orcid: 0000-0001-7310-8942 33 | affiliation: 1 34 | - name: Sara Mahallati 35 | orcid: 0000-0002-6765-0898 36 | affiliation: 4 37 | affiliations: 38 | - name: Department of Physics, University of Toronto 39 | index: 1 40 | - name: Department of Nutritional Sciences, University of Toronto 41 | index: 2 42 | - name: Department of Public Health, Aarhus University 43 | index: 3 44 | - name: Institute of Biomaterials and Biomedical Engineering, University of Toronto 45 | index: 4 46 | - name: Department of Psychology, University of Toronto 47 | index: 5 48 | - name: Department of Cell and Systems Biology, University of Toronto 49 | index: 6 50 | - name: Department of Ecology and Evolutionary Biology, University of Toronto 51 | index: 7 52 | - name: Department of 
Physiology, University of Toronto 53 | index: 8 54 | date: 15 January 2019 55 | bibliography: paper.bib 56 | --- 57 | 58 | # Introduction 59 | 60 | We present an open-source learning module suitable for a semester-long course 61 | and designed to leverage participatory live-coding techniques to teach both 62 | statistical and programming skills to primarily upper-year undergraduate biology 63 | students. Our learning module has three self-contained submodules spanning 64 | sixteen lessons: 1) Programming in R, basic data wrangling, and visualizations; 65 | 2) Exploratory data analysis, statistics, and modelling; and 3) Collaborative 66 | and reproducible science. Our learning module includes eight assignments 67 | distributed throughout the term to assess students' learning and understanding. 68 | The material is made available as R Markdown documents and designed to be taught 69 | using R Notebooks. Students are not expected to have any prior knowledge of the 70 | R language. Our material is licensed under CC-BY 4.0 while the code is under the 71 | MIT License. Our course is a response to the growing need for programming 72 | training emphasizing sound data analysis practices among researchers. We believe 73 | the included lesson topics, open accessibility, and modularity of our course 74 | make it an ideal resource for instructors. 75 | 76 | # Statement of Need 77 | 78 | In traditional undergraduate biology education, students learn statistical 79 | skills and biological concepts separately, without any practical application 80 | through coding. Designed primarily for upper-year undergraduate students, this 81 | learning module emphasizes gaining skills in R coding in the context of learning 82 | statistics and ecology. Notably, the material covers statistical concepts that 83 | are broadly useful in biological sciences, including mixed effects models, 84 | randomization tests, model selection, and differential equations.
While we 85 | delivered the material and concepts as a four-month-long course, these concepts 86 | are structured into primarily independent submodules focused around several 87 | lessons, which could easily be mixed and matched to suit any desired learning 88 | outcome. Lessons were designed to be interactive and delivered in a 89 | participatory live-coding format so students learn experientially. The teaching 90 | material includes assignments to hone and reinforce students' understanding and 91 | allow them to critically apply their skills to new problems. Reproducible 92 | quantitative research skills are emphasized throughout, culminating in an 93 | open-ended self-directed project that requires students to apply their skills to 94 | a real ecological dataset and problem. The teaching material is hosted in a 95 | public GitHub repository which automatically generates a website that presents 96 | the text, code, and code output together on the same page. The material is 97 | openly available and licensed; anyone can easily copy and modify it for their own 98 | purposes. 99 | 100 | # Learning Objectives and Content 101 | 102 | The overarching objective of the course is to teach reproducible and 103 | collaborative quantitative research skills. The lessons are described in more 104 | detail in Table 1 and are organized into three submodules: 105 | 106 | 1. Programming in R [@R], basic data wrangling, and visualization (lessons 1-5). 107 | 2. Exploratory and statistical data analysis (lessons 6-13). 108 | 3. Collaborative and reproducible science (lessons 14-16).
109 | 110 | | **Submodule** | **Lesson** | **Description** | **Packages used** | 111 | |:----------|:-------------|:-------------|:------------------| 112 | | Programming in R, data wrangling, visualization | 1 | Introducing R, RStudio, and R Markdown | | 113 | | | 2 | Vectors, data frames, basic operations, and functions | `tidyverse` [@tidyverse] | 114 | | | 3 | Introduction to exploratory data analysis | `tidyverse` | 115 | | | 4 | Introduction to statistics and visualization | `tidyverse` | 116 | | | 5 | Data transformation and visualization | `tidyverse` | 117 | | Exploratory and statistical data analysis | 6 | Cleaning and preprocessing raw data | `tidyverse`; `mice` [@mice] | 118 | | | 7 | Descriptive and inferential statistics | `tidyverse`; `car` [@car]; `psych` [@psych]; `multcomp` [@multcomp] | 119 | | | 8 | Linear mixed-effects models | `tidyverse`; `plyr` [@plyr]; `lme4` [@lme4]; `lmerTest` [@lmerTest] | 120 | | | 9 | Randomization tests and data simulation | `tidyverse`; `reshape2` [@reshape2]; `EcoSimR` [@EcoSimR] | 121 | | | 10 | Multivariate statistics (e.g. PCA) | `tidyverse`; `car`; `psych`; `multcomp`| 122 | | | 11 | Model selection and averaging | `tidyverse`; `lme4`; `lmerTest`; `MuMIn` [@MuMIn] | 123 | | Numerical models | 12| Population modelling with differential equations | `tidyverse`; `deSolve` [@deSolve] | 124 | | | 13 | Time-series data and numerical models | `tidyverse`; `deSolve` | 125 | | Collaborative and reproducible science | 14 | Scientific methods | | 126 | | | 15 | Collaborating through Git and GitHub | | 127 | | | 16 | Manuscript preparation in R Markdown | `knitr` [@knitr]; `rmarkdown` [@rmarkdown] | 128 | 129 | Table: Overview of submodules, lessons, and packages used in the learning module. 130 | 131 | # Instructional Design 132 | 133 | Drawing on the instructors' previous experiences teaching introductory 134 | programming workshops, we designed our lessons to have the following components: 135 | 136 | 1. 
*Lesson Outline*: Each lesson has a clearly defined outline of the lesson 137 | objectives, including expected time spent on each objective. This gives students 138 | a clear expectation of what they should learn and gain from the lesson. It also 139 | provides a structured template for instructors to prioritize content and gauge 140 | how much time each objective should take. 141 | 2. *Participatory Live-Coding*: Coding in real time, with the students actively 142 | coding along, forms the primary focus of each lesson. This hands-on approach to 143 | teaching is frequently used by teaching organizations such as 144 | [Software Carpentry](https://software-carpentry.org/blog/2016/04/tips-tricks-live-coding.html) 145 | [@carpentry;@rubin_effectiveness_2013;@haaranen_programming_2017;@wilson_teaching_2018]. 146 | While many learning outcomes focus on developing programming proficiency, some 147 | lessons are centred around concepts (such as "Statistical Modelling" or 148 | "Differential Equations"), during which we still use the live-coding approach. 149 | This approach not only demonstrates the concepts in a step-by-step fashion but 150 | also helps students practice writing code. 151 | 3. *Interwoven Exercises*: Coding exercises or discussion points are 152 | interspersed throughout each lesson to assess and reinforce the concepts and 153 | skills being taught. These exercises challenge the students and help build 154 | confidence in the material and in their coding skills. They also help 155 | instructors identify problem areas that should be further reinforced later in 156 | the lesson or submodule. 157 | 4. *Summative Assignments*: Lesson-specific assignments are used every two 158 | lessons to test students' competency in the lesson material and the skills 159 | they are expected to gain, while a comprehensive final assignment is used to test the 160 | students' ability to bring together all concepts learned throughout the learning 161 | module. 
162 | 163 | Each of our submodules and individual lessons built on skills and concepts that 164 | would ultimately allow students to complete a final open-ended analysis of real 165 | open ecological data. We deliberately chose large and messy (e.g. missing 166 | values) datasets for the students, reflecting the types of data that are being 167 | increasingly generated across various disciplines. With this goal in mind, we 168 | designed lessons to provide the building blocks to clean, manipulate, visualize, 169 | and analyze any dataset the students may come across, both for the final project 170 | and in their future research. 171 | 172 | # Teaching Experience 173 | 174 | For the first iteration of the course, our teaching team consisted of six 175 | graduate students from diverse fields of research; we divided course topics 176 | among each instructor to develop and deliver individual lessons and assignments 177 | to the eight students. We reduced the number of instructors to four graduate 178 | students for the second iteration and the number of students increased to 26. We 179 | estimate four instructors could effectively teach the current iteration of the 180 | course to around 40 students. We consider having instructors come from multiple 181 | fields as a major strength and strongly recommend this practice for teaching 182 | quantitative research methods and skills. 183 | 184 | To maximize the learning experience, we prioritized in-class participation, 185 | engagement, and hands-on experience. The main teaching techniques we used to 186 | achieve this goal were participatory live-coding, exercises interwoven with 187 | teaching, and project-based learning 188 | [@sawyer_cambridge_2006;@strobel_when_2009;@markham_project_2011] where students 189 | collaborated in teams on data analysis problems to mimic a real world scenario. 
190 | 191 | To ensure proper teaching assistance was available at all times, we adopted a 192 | technique used successfully in workshops developed by The Carpentries 193 | [@wilson-software-carpentry]. This technique involved having at least two 194 | instructors present for each lesson, where one instructed and another acted as a 195 | "helper". Students would signal for assistance by attaching colored sticky notes 196 | to the back of their laptop monitor. This method avoided interrupting the lesson 197 | flow when individual students needed assistance. 198 | 199 | # Story of the project 200 | 201 | While there are many excellent open source software packages available for 202 | quantitative data analysis, the use of less capable tools (such as spreadsheet 203 | software) is still prevalent among researchers, even though these drastically 204 | reduce analytical reproducibility, power, and efficiency. This happens partly 205 | due to lack of awareness, and partly because graduate students, many of whom 206 | will be future researchers, often are not incentivized to learn new and better 207 | tools, as they usually must use what their supervisor or colleagues use. Those 208 | who do try to learn these modern tools often do so in isolation and without much 209 | formal training available. These are major barriers to learning. To help break 210 | down these barriers, we launched the graduate student group 211 | [University of Toronto Coders](https://uoftcoders.github.io/) where we run 212 | peer-led learning sessions on using code for research through skill sharing, 213 | co-working, and community building in a friendly and supportive environment. 214 | 215 | After running many sessions and consistently receiving overwhelmingly positive 216 | feedback on our content and teaching style, we sought to formally share our 217 | experiences through the university curriculum. 
We designed a course on open, 218 | reproducible data analysis, and contacted multiple departments that could be 219 | interested in hosting this course. The Department of Ecology and Evolutionary 220 | Biology at the University of Toronto agreed, and we ran a pilot of the course 221 | with the title "Theoretical Ecology and Reproducible Quantitative Methods in R" 222 | to fourth-year undergraduate students. We modelled the structure and portions of 223 | the course content after the course ["Reproducible Quantitative 224 | Methods"](https://cbahlai.github.io/rqm-template/), which was created by Dr. 225 | Christie Bahlai. We extensively modified the lesson content to include expanded 226 | material on data wrangling, visualization, reproducibility, collaborative 227 | science, and additional theoretical ecology topics. 228 | 229 | Following a successful pilot term, we modified our lesson material further 230 | to include more generally applicable statistical concepts and far fewer 231 | theoretical ecological concepts. We also renamed the course to "Quantitative 232 | Methods in R for Biology" to reflect this change. On both occasions, 233 | the course received excellent feedback from the students and the supervising 234 | professors and has been incorporated into the long-term curriculum as a third 235 | year level course. 236 | 237 | # Contributions 238 | 239 | LWJ, MB-F, LT, and LC conceptualized the course. JO led course development. JO, 240 | MB-F, LWJ, LC, ES, and LT designed and taught the first iteration of the course. 241 | JSS, LC, MB-F, and ARH taught the second iteration of the course, with guest 242 | lectures from SM and LT. Lesson development for second iteration: JO and ARH 243 | (1-5), JSS (8, 9, 11), LC (6, 7, 10), MB-F (12, 13), LWJ (14), ARH and SM (15), 244 | LT (16). LWJ, MB-F, JO, SM, LT, ARH, and JSS wrote the paper. LWJ, MB-F, ES, JO, 245 | LT, JSS, and AH proofread and edited the final draft. 
246 | 247 | # References 248 | -------------------------------------------------------------------------------- /rcourse.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: No 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Website 19 | -------------------------------------------------------------------------------- /resources.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Resources and Additional Information" 3 | --- 4 | 5 | ## R Coding Style Guide 6 | 7 | *All code submitted for grading* for the final assignment will need to conform to the coding style 8 | guidelines as outlined in [this Style Guide](http://style.tidyverse.org/index.html). 9 | We will be discussing and highlighting how to automatically (or at least 10 | consistently) use this style as we go through the lectures and labs. For the 11 | first few assignments, you will not be required to follow these guidelines; 12 | however, we will make a note in your assignment for you to fix it next time. In 13 | later assignments, part of the grade will come from adhering to the coding 14 | style. 
15 | 16 | ## R Markdown Workflow 17 | 18 | - [Predictive Ecology - Rmarkdown in a scientific workflow](http://predictiveecology.org/2016/10/21/Rmarkdown-science-workflow.html) 19 | - [R for Data Science - R Markdown workflow](http://r4ds.had.co.nz/r-markdown-workflow.html) 20 | - [Decision tree for choosing appropriate statistical tests](resources/Statistical-decision-tree.pdf) 21 | -------------------------------------------------------------------------------- /resources/HighstatLibV6.R: -------------------------------------------------------------------------------- 1 | #Library files for courses provided by: Highland Statistics Ltd. 2 | #To cite these functions, use: 3 | #Mixed effects models and extensions in ecology with R. (2009). 4 | #Zuur, AF, Ieno, EN, Walker, N, Saveliev, AA, and Smith, GM. Springer. 5 | 6 | #Copyright Highland Statistics LTD. 7 | 8 | ##################################################################### 9 | #VIF FUNCTION. 10 | #To use: corvif(YourDataFile) -- YourDataFile is coerced with as.data.frame(); a constant dummy response (fooy = 1) is regressed on all of its columns with lm(), and the value returned by myvif() for that model is printed under a "Variance inflation factors" heading 11 | corvif <- function(dataz) { 12 | dataz <- as.data.frame(dataz) 13 | #correlation part (disabled: the three statements below are commented out, so no tmp_cor object is created here) 14 | #cat("Correlations of the variables\n\n") 15 | #tmp_cor <- cor(dataz,use="complete.obs") 16 | #print(tmp_cor) 17 | 18 | #vif part: build the formula "fooy ~ col1 + col2 + ..." from the column names, then prepend the dummy response column fooy 19 | form <- formula(paste("fooy ~ ",paste(strsplit(names(dataz)," "),collapse=" + "))) 20 | dataz <- data.frame(fooy=1,dataz) 21 | lm_mod <- lm(form,dataz) 22 | 23 | cat("\n\nVariance inflation factors\n\n") 24 | print(myvif(lm_mod)) 25 | } 26 | 27 | 28 | #Support function for corvif. 
# Will not be called by the user.
#
# myvif(mod): (generalised) variance inflation factors for a fitted model.
#
# Arguments:
#   mod  a fitted model object that supports vcov(), model.matrix(),
#        coefficients() and terms() (corvif() passes an lm fit).
#
# Value (returned invisibly):
#   * a one-column data frame "GVIF" (one row per model term) when every
#     term has a single coefficient;
#   * otherwise a matrix with columns "GVIF", "Df" and "GVIF^(1/2Df)";
#   * a character message when the model has non-estimable (aliased)
#     coefficients, i.e. there is not enough information to separate the
#     predictors.
#
# Stops with an error when the model has fewer than 2 terms, and warns when
# the model has no intercept.
myvif <- function(mod) {
  v <- vcov(mod)
  mm <- model.matrix(mod)
  assign <- attributes(mm)$assign
  coefs <- coefficients(mod)

  # Drop the intercept row/column so that only predictor terms remain.
  if (names(coefs)[1] == "(Intercept)") {
    v <- v[-1, -1, drop = FALSE]
    assign <- assign[-1]
    mm <- mm[, -1, drop = FALSE]
    coefs <- coefs[-1]
  } else {
    warning("No intercept: vifs may not be sensible.")
  }

  term_labels <- labels(terms(mod))
  n.terms <- length(term_labels)
  if (n.terms < 2) stop("The model contains fewer than 2 terms")

  # Aliased coefficients show up either as NA estimates (vcov() then carries
  # NA rows) or as a covariance matrix with fewer rows than design columns.
  # The correlation matrix is computed here from the design matrix; the
  # original code referred to a `tmp_cor` object that was never defined in
  # this function and therefore failed with "object 'tmp_cor' not found".
  if (anyNA(coefs) || length(assign) > nrow(v)) {
    tmp_cor <- cor(mm, use = "complete.obs")
    diag(tmp_cor) <- 0
    # Tolerance instead of `== 1`: correlations are floating-point values.
    perfect <- abs(tmp_cor) > 1 - sqrt(.Machine$double.eps)
    if (any(perfect, na.rm = TRUE)) {
      return("Sample size is too small, 100% collinearity is present")
    } else {
      return("Sample size is too small")
    }
  }

  R <- cov2cor(v)
  detR <- det(R)
  result <- matrix(0, n.terms, 3)
  rownames(result) <- term_labels
  colnames(result) <- c("GVIF", "Df", "GVIF^(1/2Df)")
  for (term in seq_len(n.terms)) {
    # Columns of the design matrix that belong to this term.
    subs <- which(assign == term)
    result[term, 1] <- det(as.matrix(R[subs, subs])) *
      det(as.matrix(R[-subs, -subs])) / detR
    result[term, 2] <- length(subs)
  }

  if (all(result[, 2] == 1)) {
    # Every term has one coefficient: GVIF is the ordinary VIF.
    result <- data.frame(GVIF = result[, 1])
  } else {
    result[, 3] <- result[, 1]^(1 / (2 * result[, 2]))
  }
  invisible(result)
}
#END VIF FUNCTIONS


##################################################################
##################################################################
#Here are some functions that we took from the pairs help file and
#modified, or wrote ourselves.
# To cite these, use the r citation: citation()

# Panel function for pairs(): writes the correlation of x and y in the middle
# of the panel, with the text size scaled by the absolute correlation.
#   digits   significant digits used to format the correlation
#   prefix   text pasted in front of the formatted correlation
#   cex.cor  base text size; when not supplied, the size is derived from the
#            width of the text instead (0.9 / strwidth)
panel.cor <- function(x, y, digits = 1, prefix = "", cex.cor = 6) {
  usr <- par("usr")
  # par() needs a *named* argument to restore the setting; the unnamed
  # par(usr) form only warns and restores nothing.
  on.exit(par(usr = usr), add = TRUE)
  par(usr = c(0, 1, 0, 1))
  r1 <- cor(x, y, use = "pairwise.complete.obs")
  r <- abs(r1)
  txt <- format(c(r1, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  cex <- if (missing(cex.cor)) 0.9 / strwidth(txt) else cex.cor
  text(0.5, 0.5, txt, cex = cex * r)
}

##################################################################
# Panel function for pairs(): scatterplot plus a lowess smoother through the
# finite (x, y) pairs.
#   col, bg, pch, cex  passed to points()
#   col.smooth         colour of the smoother line (now honoured; it was
#                      previously ignored in favour of a hard-coded col = 1)
#   span, iter         passed to stats::lowess() as f and iter
#   ...                further arguments passed to lines()
panel.smooth2 <- function(x, y, col = par("col"), bg = NA, pch = par("pch"),
                          cex = 1, col.smooth = "black", span = 2/3, iter = 3,
                          ...) {
  points(x, y, pch = pch, col = col, bg = bg, cex = cex)
  ok <- is.finite(x) & is.finite(y)
  if (any(ok)) {
    lines(stats::lowess(x[ok], y[ok], f = span, iter = iter),
          col = col.smooth, ...)
  }
}

##################################################################
# Panel function for pairs(): scatterplot plus the least-squares line fitted
# to the finite (x, y) pairs. Arguments in ... are accepted but not used.
panel.lines2 <- function(x, y, col = par("col"), bg = NA, pch = par("pch"),
                         cex = 1, ...) {
  points(x, y, pch = pch, col = col, bg = bg, cex = cex)
  ok <- is.finite(x) & is.finite(y)
  if (any(ok)) {
    tmp <- lm(y[ok] ~ x[ok])
    abline(tmp)
  }
}

##################################################################
# Panel function for the diagonal of pairs(): histogram of x with bar heights
# rescaled so that the tallest bar has height 1 (panel y-range is 0 to 1.5).
#   ...  further arguments passed to rect()
panel.hist <- function(x, ...) {
  usr <- par("usr")
  # Named argument so the user coordinates really are restored on exit.
  on.exit(par(usr = usr), add = TRUE)
  par(usr = c(usr[1:2], 0, 1.5))
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks
  nB <- length(breaks)
  y <- h$counts
  y <- y / max(y)
  rect(breaks[-nB], 0, breaks[-1], y, col = "white", ...)
}
##################################################################
##################################################################



##################################################################
##################################################################
#Functions for variograms
#To cite these functions, use:
#Mixed effects models and extensions in ecology with R. (2009).
#Zuur, AF, Ieno, EN, Walker, N, Saveliev, AA, and Smith, GM. Springer.
#Make a variogram for one variable
#To use, type:  MyVariogram(XUTM, YUTM, E , MyDistance=10)
# XUTM is x coordinates
# YUTM is y coordinates
# E is variable used in sample variogram
# MyDistance is the cutoff value for the distances
#Returns a data frame with columns Var.np, Var.dist and Var.gamma.

MyVariogram <- function(x, y, z, MyDistance) {
  # `coordinates<-` comes from sp; attach it explicitly because attaching
  # gstat alone does not necessarily put sp on the search path.
  library(sp)
  library(gstat)
  mydata <- data.frame(z, x, y)
  coordinates(mydata) <- c("x", "y")
  Var <- variogram(z ~ 1, mydata, cutoff = MyDistance)
  data.frame(Var$np, Var$dist, Var$gamma)
}

#Function for making multiple variograms in an xyplot
#To use, type: MultiVariogram(Z, MyVar,XUTM, YUTM, MyDistance=10)
# Z is a data frame with all the data
# MyVar is a character string with variable names that will be used in the xyplot
# XUTM is x coordinates
# YUTM is y coordinates
# MyDistance is the cutoff value for the distances

MultiVariogram <- function(Z, MyVar, x, y, MyDistance) {
  #Z is the data frame with data
  #MyVar is a list of variables for which variograms are calculated
  #x, y: spatial coordinates
  #MyDistance: limit for distances in the variogram

  library(lattice)
  if (length(MyVar) == 0) stop("MyVar must name at least one variable")

  # One variogram table per variable, tagged with the variable in column
  # `i`, then stacked once (instead of growing a data frame with rbind()
  # from an NA seed row).
  pieces <- lapply(MyVar, function(var_name) {
    vi <- MyVariogram(x, y, Z[, var_name], MyDistance)
    cbind(vi, i = var_name)
  })
  VarAll <- do.call(rbind, pieces)

  P <- xyplot(Var.gamma ~ Var.dist | factor(i), col = 1, type = "p", pch = 16,
              data = VarAll,
              xlab = "Distance",
              ylab = "Semi-variogram",
              strip = function(bg = 'white', ...)
                strip.default(bg = 'white', ...),
              scales = list(alternating = TRUE,
                            x = list(relation = "same"),
                            y = list(relation = "same"))
              )

  print(P)
}
#End variogram code
##########################################################

#Function for multi-panel Cleveland dotplot.
#The input file must contain no categorical variables
Mydotplot <- function(DataSelected) {
  # dotplot() and strip.custom() are lattice functions; load lattice here as
  # the other plotting helpers in this file do.
  library(lattice)

  P <- dotplot(as.matrix(DataSelected),
               groups = FALSE,
               strip = strip.custom(bg = 'white',
                                    par.strip.text = list(cex = 1.2)),
               scales = list(x = list(relation = "free", draw = TRUE),
                             y = list(relation = "free", draw = FALSE)),
               col = 1, cex = 0.5, pch = 16,
               xlab = list(label = "Value of the variable", cex = 1.5),
               ylab = list(label = "Order of the data from text file", cex = 1.5))

  print(P)
}


#Add more code here:


Mybwplot <- function(Z, MyVar, TargetVar) {
  #Multipanel boxplots
  #Z: data set
  #MyVar: character string
  #TargetVar: variable for the x-axis..must be a factor

  # bwplot() and strip.custom() are lattice functions.
  library(lattice)

  # Stack the selected columns into one long response vector, repeating the
  # grouping variable and labelling each stretch with its column name.
  AllY <- as.vector(as.matrix(Z[, MyVar]))
  AllX <- rep(Z[, TargetVar], length(MyVar))
  ID <- rep(MyVar, each = nrow(Z))

  P <- bwplot(AllY ~ factor(AllX) | ID, horizontal = FALSE,
              ylab = "", xlab = "",
              scales = list(alternating = TRUE, cex.lab = 1.5,
                            x = list(relation = "same", rot = 90,
                                     abbreviate = TRUE, cex = 1.5),
                            y = list(relation = "free", draw = FALSE)),
              strip = strip.custom(bg = 'white',
                                   par.strip.text = list(cex = 1.2)),
              cex = .5,
              par.settings = list(
                box.rectangle = list(col = 1),
                box.umbrella  = list(col = 1),
                plot.symbol   = list(cex = .5, col = 1)))
  print(P)
}
#######################################################
# Multi-panel plot of a binary response against several covariates, one
# panel per covariate, each with a binomial GAM smoother (s(x, k = 4)).
#   Z       data frame
#   MyV     names of the covariate columns in Z
#   NameY1  name of the response column in Z
# Prints the lattice plot.
MyxyplotBin <- function(Z, MyV, NameY1) {
  # Stack the covariates into long format: values, repeated response, and a
  # label identifying which covariate each value came from.
  AllX  <- as.vector(as.matrix(Z[, MyV]))
  AllY  <- rep(Z[, NameY1], length(MyV))
  AllID <- rep(MyV, each = nrow(Z))


  library(mgcv)
  library(lattice)

  P <- xyplot(AllY ~ AllX | factor(AllID), col = 1,
              strip = function(bg = 'white', ...) strip.default(bg = 'white', ...),
              scales = list(alternating = TRUE,
                            x = list(relation = "free"),
                            y = list(relation = "same")),
              xlab = "Covariate",
              ylab = "Probability of presence",
              panel = function(x, y) {
                panel.grid(h = -1, v = 2)
                panel.points(x, y, col = 1)
                tmp <- gam(y ~ s(x, k = 4), family = binomial)
                # length.out spelled out (was the partial match `length =`)
                MyData <- data.frame(x = seq(min(x), max(x), length.out = 25))
                p1 <- predict(tmp, newdata = MyData, type = "response")
                panel.lines(MyData$x, p1, col = 1, lwd = 3)
              })

  print(P)
}
#######################################################

#######################################################
# Multi-panel scatterplot of one response against several explanatory
# variables, one panel per variable, each with a loess smoother (span 0.8).
#   Z       data frame
#   MyV     names of the explanatory columns in Z
#   NameY1  name of the response column in Z
#   MyYlab  label for the y-axis
# Prints the lattice plot.
Myxyplot <- function(Z, MyV, NameY1, MyYlab = "") {
  AllX  <- as.vector(as.matrix(Z[, MyV]))
  AllY  <- rep(Z[, NameY1], length(MyV))
  AllID <- rep(MyV, each = nrow(Z))


  # mgcv is not used by this function; the call is kept so that calling
  # Myxyplot() still attaches mgcv exactly as before.
  library(mgcv)
  library(lattice)

  P <- xyplot(AllY ~ AllX | factor(AllID), col = 1,
              xlab = list("Explanatory variables", cex = 1.5),
              #ylab = list("Response variable", cex = 1.5),
              #ylab = list("Pearson residuals", cex = 1.5),
              ylab = list(MyYlab, cex = 1.5),
              #layout = c(2,2),   #Modify
              strip = function(bg = 'white', ...)
                strip.default(bg = 'white', ...),
              scales = list(alternating = TRUE,
                            x = list(relation = "free"),
                            y = list(relation = "same")),
              panel = function(x, y) {
                panel.grid(h = -1, v = 2)
                panel.points(x, y, col = 1)
                panel.loess(x, y, span = 0.8, col = 1, lwd = 2)
              })

  print(P)
}
#######################################################

# Multi-panel scatterplot of one response against several explanatory
# variables; each panel shows a Gaussian GAM smoother (y ~ s(x)) with a
# band of +/- 2 standard errors, a horizontal line at 0, and the points.
#   Z       data frame
#   MyV     names of the explanatory columns in Z
#   NameY1  name of the response column in Z
# Prints the lattice plot.
MyxyplotPolygon <- function(Z, MyV, NameY1) {
  AllX  <- as.vector(as.matrix(Z[, MyV]))
  AllY  <- rep(Z[, NameY1], length(MyV))
  AllID <- rep(MyV, each = nrow(Z))


  library(mgcv)
  library(lattice)
  P <- xyplot(AllY ~ AllX | factor(AllID), col = 1,
              xlab = list(label = "Explanatory variables", cex = 1.5),
              ylab = "",
              strip = function(bg = 'white', cex.lab = 1.5, ...)
                strip.default(bg = 'white', ...),
              scales = list(alternating = TRUE,
                            x = list(relation = "free"),
                            y = list(relation = "same")),
              panel = function(x, y) {
                # Fit and draw on complete pairs only. Previously sort(x)
                # dropped NAs while order(x) and the NA-dropped gam fit did
                # not line up, so any missing value misaligned the curve.
                ok <- is.finite(x) & is.finite(y)
                xo <- x[ok]
                yo <- y[ok]
                t1 <- gam(yo ~ s(xo))
                P1 <- predict(t1, se.fit = TRUE)
                I1 <- order(xo)
                xs <- xo[I1]
                fit_s <- P1$fit[I1]
                se_s <- P1$se.fit[I1]
                panel.lines(xs, fit_s, col = 1)
                panel.polygon(c(xs, rev(xs)),
                              c(fit_s - 2 * se_s,
                                rev(fit_s + 2 * se_s)),
                              col = gray(0.7),
                              density = 10)
                panel.grid(h = -1, v = 2)
                panel.abline(0, 0)
                panel.points(x, y, col = 1)
              })
  #Because the xyplot is inside a function you need to print
  #construction below
  print(P)
}

################################################
#Mypairs
#Make fancy pair plots
# Z: data frame or matrix of numeric variables. The lower panels show the
# correlation via panel.cor() (digits = 2, cex.cor = 7 by default); the upper
# panels show scatterplots.
Mypairs <- function(Z) {
  MyVarx <- colnames(Z)
  pairs(Z, labels = MyVarx,
        cex.labels = 2,
        lower.panel = function(x, y, digits = 2, prefix = "", cex.cor = 7) {
          panel.cor(x, y, digits, prefix, cex.cor)
        },
        upper.panel = function(x, y) {
          points(x, y, pch = 16, cex = 0.8, col = gray(0.1))
        })
  #print(P)
}
# --------------------------------------------------------------------------------
# /resources/Statistical-decision-tree.pdf:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/UofTCoders/rcourse/ea3ff68c83777babb659ed3e62e9897796326915/resources/Statistical-decision-tree.pdf
# --------------------------------------------------------------------------------