├── .Rprofile
├── .gitignore
├── .nojekyll
├── 01_introduction.Rmd
├── 02_ebird-data.Rmd
├── 03_covariates.Rmd
├── 04_encounter.Rmd
├── 05_occupancy.Rmd
├── 06_abundance.Rmd
├── 07_references.Rmd
├── DESCRIPTION
├── LICENSE
├── README.md
├── _bookdown.yml
├── _bookdown_files
└── ebird-best-practices_files
│ └── figure-html
│ ├── abundance-assess-mag-plot-1.png
│ ├── abundance-model-cov-plot-1.png
│ ├── abundance-model-dist-1.png
│ ├── abundance-predict-map-1.png
│ ├── abundance-predict-peak-1.png
│ ├── ebird-explore-distance-1.png
│ ├── ebird-explore-duration-1.png
│ ├── ebird-explore-map-1.png
│ ├── ebird-explore-observers-1.png
│ ├── ebird-explore-time-1.png
│ ├── encounter-habitat-pd-1.png
│ ├── encounter-habitat-pi-1.png
│ ├── encounter-predict-map-1.png
│ ├── encounter-predict-time-1.png
│ ├── encounter-rf-cal-cal-1.png
│ ├── encounter-sss-map-1.png
│ ├── encounter-sss-toy-1.png
│ ├── landcover-prediction-map-1.png
│ └── occupancy-predict-map-1.png
├── _common.R
├── _config.yml
├── _output.yml
├── data
├── data.zip
├── ebd_checklists_june_bcr27.txt
├── ebd_woothr_june_bcr27.txt
├── ebd_woothr_june_bcr27_zf.csv
├── gis-data.gpkg
├── mcd12q1_classes.csv
├── modis
│ ├── modis_mcd12q1_umd_2010.tif
│ ├── modis_mcd12q1_umd_2011.tif
│ ├── modis_mcd12q1_umd_2012.tif
│ ├── modis_mcd12q1_umd_2013.tif
│ ├── modis_mcd12q1_umd_2014.tif
│ ├── modis_mcd12q1_umd_2015.tif
│ ├── modis_mcd12q1_umd_2016.tif
│ ├── modis_mcd12q1_umd_2017.tif
│ └── modis_mcd12q1_umd_2018.tif
├── modis_pland_location-year.csv
├── pland-elev_location-year.csv
├── pland-elev_prediction-surface.csv
└── prediction-surface.tif
├── docs
├── abundance.html
├── covariates.html
├── ebird-best-practices_files
│ └── figure-html
│ │ ├── abundance-assess-mag-plot-1.png
│ │ ├── abundance-model-cov-plot-1.png
│ │ ├── abundance-model-dist-1.png
│ │ ├── abundance-predict-map-1.png
│ │ ├── abundance-predict-peak-1.png
│ │ ├── ebird-explore-distance-1.png
│ │ ├── ebird-explore-duration-1.png
│ │ ├── ebird-explore-map-1.png
│ │ ├── ebird-explore-observers-1.png
│ │ ├── ebird-explore-time-1.png
│ │ ├── encounter-habitat-pd-1.png
│ │ ├── encounter-habitat-pi-1.png
│ │ ├── encounter-predict-map-1.png
│ │ ├── encounter-predict-time-1.png
│ │ ├── encounter-rf-cal-cal-1.png
│ │ ├── encounter-sss-map-1.png
│ │ ├── encounter-sss-toy-1.png
│ │ ├── landcover-prediction-map-1.png
│ │ └── occupancy-predict-map-1.png
├── ebird.html
├── encounter.html
├── images
│ ├── 02_ebird-data_checklist.png
│ ├── 03_covariates_elevation.png
│ └── 09_ebird_split_custom.png
├── index.html
├── intro.html
├── libs
│ ├── gitbook-2.6.7
│ │ ├── css
│ │ │ ├── fontawesome
│ │ │ │ └── fontawesome-webfont.ttf
│ │ │ ├── plugin-bookdown.css
│ │ │ ├── plugin-clipboard.css
│ │ │ ├── plugin-fontsettings.css
│ │ │ ├── plugin-highlight.css
│ │ │ ├── plugin-search.css
│ │ │ ├── plugin-table.css
│ │ │ └── style.css
│ │ └── js
│ │ │ ├── app.min.js
│ │ │ ├── clipboard.min.js
│ │ │ ├── jquery.highlight.js
│ │ │ ├── lunr.js
│ │ │ ├── plugin-bookdown.js
│ │ │ ├── plugin-clipboard.js
│ │ │ ├── plugin-fontsettings.js
│ │ │ ├── plugin-search.js
│ │ │ └── plugin-sharing.js
│ ├── header-attrs-2.2
│ │ └── header-attrs.js
│ └── jquery-2.2.3
│ │ └── jquery.min.js
├── occupancy.html
├── references.html
├── search_index.json
└── style.css
├── ebird-best-practices.Rproj
├── images
├── 02_ebird-data_checklist.png
├── 03_covariates_elevation.png
└── 09_ebird_split_custom.png
├── index.Rmd
├── index.html
├── output
├── abundance-model_abd_woothr.tif
├── abundance-model_se_woothr.tif
├── occupancy-model_prob_woothr.tif
├── occupancy-model_se_woothr.tif
├── rf-model_encounter-rate_woothr.tif
├── woothr_occupancy-model_gof.rds
└── woothr_occupancy-model_predictions.rds
├── packages.bib
├── preamble.tex
├── references.bib
└── style.css
/.Rprofile:
--------------------------------------------------------------------------------
1 | # Project startup file: attach bookdown for every session in this project.
2 | library(bookdown)
3 | # Also run the user-level profile, if one exists.
4 | if (file.exists("~/.Rprofile")) source("~/.Rprofile")
5 |
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rhistory
2 | .RData
3 | .Rproj.user
4 | .Rbuildignore
5 | data/calculate-cci.R
6 | data/ebd_all_june_bcr27.txt
7 | data/cci_june_bcr27.csv
8 | data/elevation_1KMmd_GMTEDmd.tif
--------------------------------------------------------------------------------
/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/.nojekyll
--------------------------------------------------------------------------------
/01_introduction.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: html_document
3 | editor_options:
4 | chunk_output_type: console
5 | ---
6 |
7 | # Introduction and Setup {#intro}
8 |
9 | ## Introduction {#intro-intro}
10 |
11 | Citizen science data are increasingly making important contributions to ecological research and conservation. One of the most common forms of citizen science data is derived from members of the public recording species observations. [eBird](https://ebird.org/) [@sullivanEBirdEnterpriseIntegrated2014] is the largest of these biological citizen science programs. The eBird database contains well over one billion bird observations from every country in the world, with observations of nearly every bird species on Earth. The eBird database is valuable to researchers across the globe, due to its year-round, broad spatial coverage, high volumes of [open access](https://en.wikipedia.org/wiki/Open_data) data, and applications to many ecological questions. These data have been widely used in scientific research to study phenology, species distributions, population trends, evolution, behavior, global change, and conservation. However, robust inference with eBird data requires careful processing of the data to address the challenges associated with citizen science datasets. This book, and the [associated paper](https://onlinelibrary.wiley.com/doi/10.1111/ddi.13271), outlines a set of best practices for addressing these challenges and making reliable estimates of species distributions from eBird data.
12 |
13 | There are two key characteristics that distinguish eBird from many other citizen science projects and facilitate robust ecological analyses: the checklist structure enables non-detection to be inferred and the effort information associated with a checklist facilitates robust analyses by accounting for variation in the observation process [@lasorteOpportunitiesChallengesBig2018; @kellingFindingSignalNoise2018]. When a participant submits data to eBird, sightings of multiple species from the same observation period are grouped together into a single **checklist**. **Complete checklists** are those for which the participant reported all birds that they were able to detect and identify. Critically, this enables scientists to infer counts of zero individuals for the species that were not reported. If checklists are not complete, it's not possible to ascertain whether the absence of a species on a list was a non-detection or the result of a participant not recording the species. In addition, citizen science projects occur on a spectrum from those with predefined sampling structures that resemble more traditional survey designs, to those that are unstructured and collect observations opportunistically. eBird is a **semi-structured** project, having flexible, easy to follow protocols that attract many participants, but also collecting data on the observation process (e.g. amount of time spent birding, number of observers, etc.), which can be used in subsequent analyses [@kellingFindingSignalNoise2018].
14 |
15 | Despite the strengths of eBird data, species observations collected through citizen science projects present a number of challenges that are not found in conventional scientific data. The following are some of the primary challenges associated with these data; challenges that will be addressed throughout this book:
16 |
17 | - **Taxonomic bias:** participants often have preferences for certain species, which may lead to preferential recording of some species over others [@greenwoodCitizensScienceBird2007; @tullochBehaviouralEcologyApproach2012]. Restricting analyses to complete checklists largely mitigates this issue.
18 | - **Spatial bias:** most participants in citizen science surveys sample near their homes [@luckAlleviatingSpatialConflict2004], in easily accessible areas such as roadsides [@kadmonEffectRoadsideBias2004], or in areas and habitats of known high biodiversity [@prendergastCorrectingVariationRecording1993]. A simple method to reduce the spatial bias that we describe is to create an equal area grid over the region of interest, and sample a given number of checklists from within each grid cell.
19 | - **Temporal bias:** participants preferentially sample when they are available, such as weekends [@courterWeekendBiasCitizen2013], and at times of year when they expect to observe more birds, notably during spring migration [@sullivanEBirdEnterpriseIntegrated2014]. To address the weekend bias, we recommend using a temporal scale of a week or multiple weeks for most analyses.
20 | - **Spatial precision:** the spatial location of an eBird checklist is given as a single latitude-longitude point; however, this may not be precise for two main reasons. First, for traveling checklists, this location represents just one point on the journey. Second, eBird checklists are often assigned to a **hotspot** (a common location for all birders visiting a popular birding site) rather than their true location. For these reasons, it's not appropriate to align the eBird locations with very precise habitat covariates, and we recommend summarizing covariates within a neighborhood around the checklist location.
21 | - **Class imbalance:** bird species that are rare or hard to detect may have data with high class imbalance, with many more checklists with non-detections than detections. For these species, a distribution model predicting that the species is absent everywhere will have high accuracy, but no ecological value. We'll follow the methods for addressing class imbalance proposed by Robinson et al. [-@robinsonUsingCitizenScience2018].
22 | - **Variation in detectability:** detectability describes the probability of a species that is present in an area being detected and identified. Detectability varies by season, habitat, and species [@johnstonSpeciesTraitsExplain2014; @johnstonEstimatesObserverExpertise2018]. Furthermore, eBird data are collected with high variation in effort, time of day, number of observers, and external conditions such as weather, all of which can affect the detectability of species [@ellisEffectsWeatherTime2018; @oliveiraObservationDiurnalSoaring2018]. Therefore, detectability is particularly important to consider when comparing between seasons, habitats or species. Since eBird uses a semi-structured protocol, that collects variables associated with variation in detectability, we'll be able to account for a larger proportion of this variation in our analyses.
23 |
24 | The remainder of this book will demonstrate how to address these challenges using real data from eBird to produce reliable estimates of species distributions. In general, we'll take a two-pronged approach to dealing with unstructured data and maximizing the value of citizen science data: imposing more structure onto the data via data filtering and including covariates in models to account for the remaining variation.
25 |
26 | The next two chapters show how to access and prepare [eBird data](#ebird) and [land cover covariates](#covariates), respectively. The remaining three chapters provide examples of different species distribution models that can be fit using these data: [encounter rate models](#encounter), [occupancy models](#occupancy), and [abundance models](#abundance). Although these examples focus on the use of eBird data, in many cases they also apply to similar citizen science datasets.
27 |
28 | ## Prerequisites {#intro-pre}
29 |
30 | To understand the code examples used throughout this book, some knowledge of the programming language [R](https://www.r-project.org/) is required. If you don't meet this requirement, or begin to feel lost trying to understand the code used in this book, we suggest consulting one of the excellent free resources available online for learning R. For those with little or no prior programming experience, [Hands-On Programming with R](https://rstudio-education.github.io/hopr/) is an excellent introduction. For those with some familiarity with the basics of R that want to take their skills to the next level, we suggest [R for Data Science](https://r4ds.hadley.nz/) as the best resource for learning how to work with data within R.
31 |
32 | ## Setup {#intro-setup}
33 |
34 | ### Software {#intro-setup-software}
35 |
36 | The examples throughout this website use the programming language **R** [@R-base] to work with eBird data. If you don't have R installed, [download it now](https://cloud.r-project.org/), if you already have R, chances are you're using an outdated version, so [update it to the latest version now](https://cloud.r-project.org/). R is updated regularly, and **it is important that you have the most recent version of R** to avoid headaches when installing packages. We suggest checking every couple months to see if a new version has been released.
37 |
38 | We strongly encourage R users to use **RStudio**. RStudio is not required to follow along with this book; however, it will make your R experience significantly better. If you don't have RStudio, [download it now](https://www.rstudio.com/products/rstudio/download/#download), if you already have it, [update it](https://www.rstudio.com/products/rstudio/download/#download) because new versions with useful additional features are regularly released. Pro tip: immediately go into RStudio preferences (Tools > Global Options) and on the General pane uncheck "Restore .RData into workspace at startup" and set "Save workspace to .RData on exit" to "Never". This will avoid cluttering your R session with old data and save you headaches down the road.
39 |
40 | ### R packages {#intro-setup-packages}
41 |
42 | The examples in this book use a variety of R packages for accessing eBird data, working with spatial data, data processing and manipulation, and model fitting. To install all the packages necessary to work through this book, run the following code:
43 |
44 | ```{r packages, eval = FALSE}
45 | # bootstrap pak if it is not installed yet; called with no arguments, pak::pak()
46 | # presumably installs the dependencies of the package in the working directory
47 | if (!requireNamespace("pak", quietly = TRUE)) install.packages("pak")
48 | pak::pak()
49 | ```
50 |
51 | Note that several of the spatial packages require dependencies. If installing these packages fails, consult the [instructions for installing dependencies on the `sf` package website](https://r-spatial.github.io/sf/#installing). Finally, **ensure all R packages are updated** to their most recent version by clicking on the Update button on the Packages tab in RStudio.
52 |
53 | ### Tidyverse {#intro-setup-tidyverse}
54 |
55 | Throughout this book, we use packages from the [Tidyverse](https://www.tidyverse.org/), an opinionated collection of R packages designed for data science. Packages such as [`ggplot2`](http://ggplot2.tidyverse.org/), for data visualization, and [`dplyr`](http://dplyr.tidyverse.org/), for data manipulation, are two of the most well known Tidyverse packages; however, there are many more. In the following chapters, we often use Tidyverse functions without explanation. If you encounter a function you're unfamiliar with, consult the documentation for help (e.g. `?mutate` to see help for the `dplyr` function `mutate()`). More generally, the free online book [R for Data Science](http://r4ds.had.co.nz/) by [Hadley Wickham](http://hadley.nz/) is the best introduction to working with data in R using the Tidyverse.
56 |
57 | The one piece of the Tidyverse that we will cover here, because it is ubiquitous throughout this book and unfamiliar to many, is the pipe operator `%>%`. The pipe operator takes the expression to the left of it and "pipes" it into the first argument of the expression on the right, i.e. one can replace `f(x)` with `x %>% f()`. The pipe makes code significantly more readable by avoiding nested function calls, reducing the need for intermediate variables, and making sequential operations read left-to-right. For example, to add a new variable to a data frame, then summarize using a grouping variable, the following are equivalent:
58 |
59 | ```{r pipes}
60 | library(dplyr)
61 |
62 | # pipes: each step's result is passed as the first argument of the next call
63 | mtcars %>%
64 | mutate(wt_kg = 454 * wt) %>%
65 | group_by(cyl) %>%
66 | summarize(wt_kg = mean(wt_kg))
67 |
68 | # intermediate variables: the same three steps, each result stored under a name
69 | mtcars_kg <- mutate(mtcars, wt_kg = 454 * wt)
70 | mtcars_grouped <- group_by(mtcars_kg, cyl)
71 | summarize(mtcars_grouped, wt_kg = mean(wt_kg))
72 |
73 | # nested function calls: the same three steps, read from the innermost call outwards
74 | summarize(
75 | group_by(
76 | mutate(mtcars, wt_kg = 454 * wt),
77 | cyl
78 | ),
79 | wt_kg = mean(wt_kg)
80 | )
81 | ```
82 |
83 | Once you become familiar with the pipe operator, we believe you'll find the above example using the pipe the easiest of the three to read and interpret.
84 |
85 | ### Getting eBird data access {#intro-setup-ebird}
86 |
87 | The complete eBird database is provided via the [eBird Basic Dataset (EBD)](https://ebird.org/science/download-ebird-data-products), a large text file. To access the EBD, begin by [creating an eBird account and signing in](https://secure.birds.cornell.edu/cassso/account/create). Then visit the [eBird Data Access page](https://ebird.org/data/download) and fill out the data access request form. eBird data access is free; however, you will need to request access in order to download the EBD. Filling out the access request form allows eBird to keep track of the number of people using the data and obtain information on the applications for which the data are used.
88 |
89 | Once you have access to the data, proceed to the [download page](https://ebird.org/data/download/ebd). Download both the World EBD (~ 42 GB compressed, ~ 210 GB uncompressed) and corresponding Sampling Event Data (~ 3.5 GB compressed, ~ 11 GB uncompressed). The former provides observation-level data, while the latter provides checklist-level data; both files are required for species distribution modeling. If limited hard drive space or a slow internet connection make dealing with these large files challenging, consult Section \@ref(ebird-size-custom) for details on a method for downloading a subset of EBD.
90 |
91 | The downloaded data files will be in `.tar` format, and should be unarchived. The resulting directories will contain files with extension `.txt.gz`, these files should be uncompressed (on Windows use [7-Zip](https://www.7-zip.org/), on Mac use the default system uncompression utility) to produce two text files (e.g., `ebd_relAug-2019.txt` and `ebd_sampling_relAug-2019.txt`). Move these two large, uncompressed .txt files to a sensible, central location on your computer. In general, we suggest creating an `ebird/` folder nested in a `data/` folder within your home directory (i.e. `~/data/ebird/`) to store these files, and throughout the remainder of this chapter we'll assume you've placed the data there. If you choose to store the EBD elsewhere, you will need to update references to this folder in the code. If the files are too large to fit on your computer's hard drive, they can be stored on an external hard drive.
92 |
93 | Each time you want to access eBird data in an R project, you'll need to reference the full path to these text files, for example `~/data/ebird/ebd_relAug-2019.txt`. In general, it's best to avoid using absolute paths in R scripts because it makes them less portable–if you're sharing the files with someone else, they'll need to change the file paths to point to the location at which they've stored the eBird data. The R package `auk` provides a workaround for this, by allowing users to set an environment variable (`EBD_PATH`) that points to the directory where you've stored the eBird data. To set this variable, use the function [`auk_set_ebd_path()`](https://cornelllabofornithology.github.io/auk/reference/auk_set_ebd_path.html). For example, if the EBD and Sampling Event Data files are in `~/data/ebird/`, use:
94 |
95 | ```{r set-ebd-path, eval = FALSE}
96 | # point auk at the directory holding the EBD and Sampling Event Data files
97 | auk::auk_set_ebd_path("~/data/ebird/")
98 | ```
99 |
100 | After **restarting your R session**, you should be able to refer directly to the EBD or Sampling Event Data files within `auk` functions (e.g., `auk_ebd("ebd_relAug-2019.txt")`). Provided your collaborators have also set `EBD_PATH`, your scripts should now be portable.
101 |
102 | You now have access to the full eBird dataset! Note, however, that **the EBD is updated monthly**. If you want the most recent eBird records, be sure to **regularly download an updated version**. Finally, **whenever you update the EBD, always update the `auk` package as well**, this will ensure that `auk` will be able to handle any changes to the EBD that may have occurred.
103 |
104 | ### GIS data {#intro-setup-gis}
105 |
106 | Throughout this book, we'll be producing maps of species distributions. To provide context for these distributions, we'll need GIS data for political boundaries. [Natural Earth](https://www.naturalearthdata.com/) is the best source for a range of tightly integrated vector and raster GIS data for producing professional cartographic maps. The R package, [`rnaturalearth`](https://github.com/ropensci/rnaturalearth) provides a convenient method for accessing these data from within R. We'll also need [Bird Conservation Region (BCR)](http://nabci-us.org/resources/bird-conservation-regions/) boundaries, which are available through [Bird Studies Canada](https://www.birdscanada.org/research/gislab/index.jsp?targetpg=bcr&targetpg=bcr).
107 |
108 | These GIS data layers are most easily accessed by [**downloading the data package**](https://github.com/cornelllabofornithology/ebird-best-practices/raw/master/data/data.zip) for this book.
109 | These data were generated using the following code. If you intend to run this code yourself, first create an RStudio project, so the files will be stored within the `data/` subdirectory of the project. This will allow us to load these data in later chapters as they're needed. Note that calls to `ne_download()` often produce warnings suggesting that you've used the incorrect "category"; these can safely be ignored.
110 |
111 | ```{r intro-setup-gis, eval = FALSE}
112 | library(sf)
113 | library(rnaturalearth)
114 | library(dplyr)
115 |
116 | # geopackage that will hold every gis layer, stored in the data/ subdirectory
117 | gpkg_dir <- "data"
118 | if (!dir.exists(gpkg_dir)) {
119 |   dir.create(gpkg_dir)
120 | }
121 | f_ne <- file.path(gpkg_dir, "gis-data.gpkg")
122 |
123 | # download bcrs; the zip is a binary file, so request a binary transfer explicitly
124 | tmp_dir <- normalizePath(tempdir())
125 | tmp_bcr <- file.path(tmp_dir, "bcr.zip")
126 | paste0("https://www.birdscanada.org/research/gislab/download/",
127 |        "bcr_terrestrial_shape.zip") %>%
128 |   download.file(destfile = tmp_bcr, mode = "wb")
129 | unzip(tmp_bcr, exdir = tmp_dir)
130 | bcr <- file.path(tmp_dir, "BCR_Terrestrial_master_International.shp") %>%
131 |   read_sf() %>%
132 |   select(bcr_code = BCR, bcr_name = LABEL) %>%
133 |   filter(bcr_code == 27)
134 | # clean up: remove the zip and the extracted files whose names contain "bcr"
135 | list.files(tmp_dir, "bcr", ignore.case = TRUE, full.names = TRUE) %>%
136 |   unlink()
137 |
138 | # political boundaries
139 | # land border with lakes removed, dissolved into a single north america geometry
140 | ne_land <- ne_download(scale = 50, category = "cultural",
141 |                        type = "admin_0_countries_lakes",
142 |                        returnclass = "sf") %>%
143 |   filter(CONTINENT == "North America") %>%
144 |   st_set_precision(1e6) %>%
145 |   st_union()
146 | # country lines
147 | # downloaded globally then filtered to north america with st_intersects()
148 | ne_country_lines <- ne_download(scale = 50, category = "cultural",
149 |                                 type = "admin_0_boundary_lines_land",
150 |                                 returnclass = "sf") %>%
151 |   st_geometry()
152 | ne_country_lines <- st_intersects(ne_country_lines, ne_land, sparse = FALSE) %>%
153 |   as.logical() %>%
154 |   {ne_country_lines[.]}
155 | # state lines for the usa and canada, tagged with iso alpha-2 country codes
156 | ne_state_lines <- ne_download(scale = 50, category = "cultural",
157 |                               type = "admin_1_states_provinces_lines",
158 |                               returnclass = "sf") %>%
159 |   filter(adm0_a3 %in% c("USA", "CAN")) %>%
160 |   mutate(iso_a2 = recode(adm0_a3, USA = "US", CAN = "CA")) %>%
161 |   select(country = adm0_name, country_code = iso_a2)
162 |
163 | # output: delete any existing geopackage, then write one layer per dataset
164 | unlink(f_ne)
165 | write_sf(ne_land, f_ne, "ne_land")
166 | write_sf(ne_country_lines, f_ne, "ne_country_lines")
167 | write_sf(ne_state_lines, f_ne, "ne_state_lines")
168 | write_sf(bcr, f_ne, "bcr")
169 | ```
170 |
--------------------------------------------------------------------------------
/03_covariates.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: html_document
3 | editor_options:
4 | chunk_output_type: console
5 | ---
6 |
7 | # Habitat Covariates {#covariates}
8 |
9 | ## Introduction {#covariates-intro}
10 |
11 | Species distribution models work by finding associations between species occurrence or abundance and environmental variables. Using these relationships, it's possible to predict the distribution in areas that aren't sampled, provided we know the value of the environmental variables in these areas. Therefore, to proceed with the modeling in the next several chapters, we'll need to prepare a suite of environmental variables to be used as covariates in our models. The particular set of covariates that's most suitable for a given study will depend on the focal species, region, and time period, as well as the availability of data. When species distributions are well defined by the environmental covariates, extrapolations to unsurveyed areas will be more accurate. So, it's worth considering which environmental covariates are important for your species. Scientists can use many variables to characterise a species distribution - for example, climate, weather, and soil type. Here we use only landcover and elevation as example environmental covariates.
12 |
13 | Fortunately, there is an abundance of freely available, satellite-based land cover products derived from satellites such as [Landsat](https://en.wikipedia.org/wiki/Landsat_program), [SPOT](https://en.wikipedia.org/wiki/SPOT_(satellite)), and [MODIS](https://en.wikipedia.org/wiki/Moderate_Resolution_Imaging_Spectroradiometer) that are suitable for distribution modeling. This land cover data will act as a proxy for habitat and throughout this book we'll often use habitat and land cover interchangeably. In addition, we'll include elevation as an additional covariate, which can be important for many species.
14 |
15 | For the examples in this book, we'll use land cover covariates derived from the [MODIS MCD12Q1 v006](https://lpdaac.usgs.gov/products/mcd12q1v006/) land cover product [@friedlMCD12Q1MODISTerra2015]. This product has global coverage at 500 m spatial resolution and annual temporal resolution from 2001-2018. These data are available for several different classification schemes. We'll use the University of Maryland (UMD) land cover classification, which provides a globally accurate classification of land cover in our experience. This system classifies pixels into one of 16 different land cover classes:
16 |
17 | ```{r landcover, echo = FALSE}
18 | lc_classes <- readr::read_csv("data/mcd12q1_classes.csv")
19 | lc_classes <- setNames(lc_classes, stringr::str_to_title(names(lc_classes)))  # title-case the column headers
20 | knitr::kable(lc_classes)
21 | ```
22 |
23 | For a wide range of studies, this MODIS land cover dataset will be suitable for generating habitat covariates; however, there may be particular cases where the study species, habitat, or ecological question requires different, or more specialized, data. For example, shorebird distribution modeling would benefit from data on the [extent of tidal flats](https://www.intertidal.app), seabird distributions are often influenced by [ocean depth](https://eatlas.org.au/data/uuid/80301676-97fb-4bdf-b06c-e961e5c0cb0b), and in many regions [elevation](https://github.com/jhollist/elevatr) plays a critical role in shaping species distributions. Regardless of which habitat data you decide to use for your project, this chapter should provide a template for how to prepare these data as covariates for modeling species distributions.
24 |
25 | The following section will cover how to access and download MODIS land cover data. Next, we'll demonstrate how to summarize these data within a neighborhood around each checklist location. Then, we'll calculate a set of covariates over a regular grid, which we'll use to make predictions of species distributions throughout our study area. Finally, as an example of including covariate data from multiple sources, we'll demonstrate how to incorporate elevation data as an additional covariate. If you want to skip this section and jump straight to the modeling, you can download the [data package](https://github.com/cornelllabofornithology/ebird-best-practices/raw/master/data/data.zip), which includes all the prepared MODIS data that we'll use in the remainder of this book.
26 |
27 | ## Downloading MODIS data {#covariates-dl}
28 |
29 | As with most satellite data, MODIS data are provided as [1200 km by 1200 km tiles](https://modis-land.gsfc.nasa.gov/MODLAND_grid.html) for ease of download. Each tile is a [raster GIS dataset](http://desktop.arcgis.com/en/arcmap/10.3/manage-data/raster-and-images/what-is-raster-data.htm) consisting of a regular grid of 500 m resolution cells. The surface of the Earth is divided up into a grid of these tiles, each given an ID, for example, h10v12 is the tile from the 10th column and 12th row of the grid. Compiling MODIS data for a given region requires figuring out which set of tiles covers the region, downloading those tiles, combining the tiles together into a single raster dataset, and converting from the native MODIS HDF format, which R can't read, to a standard GeoTIFF format. This needs to be done for each year for which we want habitat data, and can be a time consuming and error prone process. Fortunately, the [R package `MODIS`](https://github.com/MatMatt/MODIS) automates most of these steps. Unfortunately, this package can be challenging and confusing to get working. With this in mind, this section will provide detailed instruction for setting up and using the `MODIS` package.
30 |
31 | Let's start by figuring out the tile IDs for the tiles that BCR 27 spans. Recall that we prepared a BCR boundary in Section \@ref(intro-setup-gis) of the Introduction; if you haven't already done so, [download the data package](https://github.com/cornelllabofornithology/ebird-best-practices/raw/master/data/data.zip) now to get that boundary. Given a set of spatial features, the `MODIS` package can quickly tell us which MODIS tiles we need.
32 |
33 | ```{r landcover-dl-data}
34 | library(sf)
35 | library(raster)
36 | library(MODIS)
37 | library(exactextractr)
38 | library(viridis)
39 | library(tidyverse)
40 | # resolve namespace conflicts by pinning which package's function each name refers to
41 | select <- dplyr::select
42 | map <- purrr::map
43 | projection <- raster::projection
44 |
45 | # bcr 27 boundary, read from the "bcr" layer of the gis geopackage
46 | bcr <- read_sf("data/gis-data.gpkg", "bcr") %>%
47 | filter(bcr_code == 27) %>%
48 | # project to the native modis projection (sinusoidal, per the +proj=sinu string)
49 | st_transform(crs = paste("+proj=sinu +lon_0=0 +x_0=0 +y_0=0",
50 | "+a=6371007.181 +b=6371007.181 +units=m +no_defs"))
51 | # load ebird data
52 | ebird <- read_csv("data/ebd_woothr_june_bcr27_zf.csv")
53 | # get list of tiles required to cover this bcr, from its bounding box in EPSG:4326
54 | tiles <- bcr %>%
55 | st_transform(crs = 4326) %>%
56 | st_bbox() %>%
57 | as.list() %>%
58 | getTile()
59 | tiles[["tile"]]
60 | ```
61 |
62 | So, we'll need to download these three tiles for each of the 10 years from 2010-2019.
63 |
64 | ### `MODIS` setup {#covariates-dl-setup}
65 |
66 | Before we start using `MODIS` for the first time, a bit of setup is required. First, [sign up for a NASA Earthdata account](https://urs.earthdata.nasa.gov/users/new) to get access to MODIS, and other NASA data. Then use `MODIS::EarthdataLogin(usr = "username", pwd = "password")`, with the username and password you just created, to store your login credentials so the `MODIS` package can access them.
67 |
68 | Next, you'll need to install [GDAL](https://www.gdal.org/), an open source library for working with geospatial data that's needed for processing the MODIS tiles. The steps for installing GDAL are system dependent:
69 |
70 | - **Mac OS X:** First, check if GDAL is installed with HDF4 support by running `gdal-config --formats` in Terminal. If you see `hdf4` in the list, you don't need to do anything else! If not, [install the Homebrew](https://brew.sh/) package manager by following the [instructions on the website](https://brew.sh/). Then, run the following commands in Terminal to install GDAL:
71 |
72 | ```
73 | brew tap osgeo/osgeo4mac
74 | brew install hdf4
75 | brew link --overwrite hdf4
76 | brew install osgeo-gdal
77 | brew link --force osgeo-gdal
78 | ```
79 |
80 | - **Windows:** install GDAL using [OSGeo4W](http://trac.osgeo.org/osgeo4w/), a suite of open source geospatial tools. In R, run `MODIS:::checkTools("GDAL")`, which will search your system for GDAL and suggest a command such as `MODIS::MODISoptions(gdalPath = "c:/OSGeo4W64/bin")` that will make GDAL available to the `MODIS` package. Run this command and, when it asks, agree to making the settings permanent.
81 | - **Linux:** run `sudo apt-get install gdal-bin` in the terminal.
82 |
83 | Finally, run `MODIS:::checkTools("GDAL")` to check that GDAL is installed and that the `MODIS` package can find it. If GDAL can't be found, you'll need to manually locate it and use `MODIS::MODISoptions(gdalPath = "path/to/gdal/")` to tell the `MODIS` package where it is.
84 |
85 | ### Download using R {#covariates-dl-r}
86 |
87 | Once all the setup steps have been completed, we can start downloading some data! The `MODIS` function `runGdal()` downloads and processes MODIS tiles into a single GeoTIFF for each year. Note that at the time of writing, land cover data from 2019 haven't been prepared yet, so we'll use 2018 data for both 2018 and 2019. The key arguments to `runGdal()` are:
88 |
89 | - `product`: is the specific MODIS product to download. For a full list of available datasets use `MODIS::getProduct()`.
90 | - `collection`: each MODIS product may have multiple collections, corresponding roughly to versions. Use `getCollection()` to find the available collection for a given product.
91 | - `SDSstring`: a string specifying which bands to extract, with zeros for bands to drop and 1 for bands to keep. Most MODIS products have multiple bands stored in a single raster file, for example, reflectances in different wavelength ranges or, in our case, land cover using different land cover classification systems. The [documentation for the MCD12Q1 dataset](https://lpdaac.usgs.gov/products/mcd12q1v006/) shows that there are 13 bands in the downloaded files, and we're interested in band 2, which contains the UMD landcover classification.
92 | - `extent`: any of several different spatial objects specifying the region that we want data for. In our case, we'll use the BCR polygon; however, for a list of available options consult the help for `getTile()`. Note that `runGdal()` will return raster data in the same projection as the input extent, which is why we projected the BCR boundary to the MODIS sinusoidal projection.
93 | - `begin` and `end`: the start and end dates of the time period from which to extract data. Although the land cover data are only available annually, we need to specify full dates because some other products are available on a more granular basis.
94 | - `outDirPath`: directory to store processed MODIS data.
95 | - `job`: a name for this task, which will become the sub-directory of `outDirPath` within which the processed data are stored.
96 |
97 | ```{r landcover-dl-r, eval = FALSE}
98 | # first day of the earliest year of ebird data, formatted as yyyy.mm.dd
99 | begin_year <- format(min(ebird$observation_date), "%Y.01.01")
100 | # last day of the latest year of ebird data, formatted as yyyy.mm.dd
101 | end_year <- format(max(ebird$observation_date), "%Y.12.31")
102 | # download tiles and combine into a single raster for each year (bcr buffered by 10000 map units)
103 | tifs <- runGdal(product = "MCD12Q1", collection = "006", SDSstring = "01",
104 | extent = bcr %>% st_buffer(dist = 10000),
105 | begin = begin_year, end = end_year,
106 | outDirPath = "data", job = "modis",
107 | MODISserverOrder = "LPDAAC") %>%
108 | pluck("MCD12Q1.006") %>%
109 | unlist()
110 |
111 | # rename tifs to modis_mcd12q1_umd_<year>.tif, keeping each file in its current directory
112 | new_names <- format(as.Date(names(tifs)), "%Y") %>%
113 | sprintf("modis_mcd12q1_umd_%s.tif", .) %>%
114 | file.path(dirname(tifs), .)
115 | file.rename(tifs, new_names)
116 | ```
117 |
118 | If everything ran smoothly, we now have annual GeoTIFFs of MODIS land cover data that we can load into R. You may see error messages stating `Cannot find proj.db`, or something similar; these can be safely ignored provided the MODIS GeoTIFFs have been created in the `data/modis/` directory.
119 |
120 | ```{r landcover-dl-load}
121 | # load the renamed landcover GeoTIFFs into a single multi-layer raster stack
122 | landcover <- list.files("data/modis", "^modis_mcd12q1_umd",
123 | full.names = TRUE) %>%
124 | stack()
125 | # label each layer with "y" followed by the 4-digit year extracted from its name
126 | landcover <- names(landcover) %>%
127 | str_extract("(?<=modis_mcd12q1_umd_)[0-9]{4}") %>%
128 | paste0("y", .) %>%
129 | setNames(landcover, .)
130 | landcover
131 | ```
132 |
133 | These data have not been prepared yet for the last couple years, so we'll need to fill in the missing years using the most recent year for which there is data. To facilitate that, let's figure out which is the most recent year with data.
134 |
135 | ```{r landcover-dl-load-max}
136 | max_lc_year <- names(landcover) %>% # most recent 4-digit year found in the layer names
137 | str_extract("[0-9]{4}") %>%
138 | as.integer() %>%
139 | max()
140 | ```
141 |
142 | So, we have landcover data up to `r max_lc_year`.
143 |
144 | ### Troubleshooting {#covariates-dl-trouble}
145 |
146 | If the call to `runGdal()` didn't work for you, don't worry, you're not alone! It's challenging to get the `MODIS` package working and errors are common when you're first trying to get it set up. The most common error is not having GDAL installed correctly, which will give an error like `GDAL not installed or configured`. Either you don't have GDAL at all or you have it, but it doesn't have support for HDF4 files (this is the native format for MODIS data). Try following the [above instructions](#covariates-dl-setup) again. If it still doesn't work, consult the instructions on the `MODIStsp` website for [installing GDAL](http://ropensci.github.io/MODIStsp/articles/installation.html#installing-gdal-1-11-1).
147 |
148 | Another error you may see is: `Make sure either 'wget' or 'curl' is available in order to download data from LP DAAC or NSIDC.`. This should only arise on versions of Windows before Windows 10. If you see this error, you'll need to install `curl`, which is used by R to download the MODIS tiles. There is a StackOverflow question with [excellent instructions](https://stackoverflow.com/questions/9507353/how-do-i-install-and-use-curl-on-windows) for installing `curl` and getting it setup on your system.
149 |
150 | If these tips haven't solved your particular problem, you'll need to turn to Google to troubleshoot or find someone who has experience with these tools and ask them to help. Good luck!
151 |
152 | ## Landscape metrics {#covariates-lsm}
153 |
154 | At this point we could use the MODIS land cover data directly, simply extracting the land cover class for each checklist location. However, we instead advocate summarizing the land cover data within a neighborhood around the checklist locations. As discussed in Section \@ref(intro-intro), checklist locations are not precise, so it's more appropriate to use the habitat in the surrounding area, rather than only at the checklist location. More fundamentally, organisms interact with their environment not at a single point, but at the scale of a landscape, so it's important to include habitat information characterizing a suitably-sized landscape around the observation location.
155 |
156 | There are a variety of **landscape metrics** that can be used to characterize the composition (what habitat is available) and configuration (how that habitat is arranged spatially) of landscapes. The simplest metric of landscape composition is the proportion of the landscape in each land cover class (PLAND in the parlance of [FRAGSTATS](https://www.umass.edu/landeco/research/fragstats/fragstats.html)). For a broad range of scenarios, PLAND is a reliable choice for calculating habitat covariates in distribution modeling. Based on our experience working with eBird data, an approximately 2.5 km by 2.5 km neighborhood (5 by 5 MODIS cells) centered on the checklist location is sufficient to account for the spatial precision in the data when the maximum distance of travelling counts has been limited to 5 km, while being a relevant ecological scale for many bird species.
157 |
158 | We'll start by finding the full set of unique checklist locations for each year in the eBird data. Then we convert these locations to spatial `sf` features and project them to the sinusoidal equal area projection used by MODIS. We'll buffer these points to create a neighborhood around each location with a diameter equal to 5 MODIS cells. Finally, we split the neighborhoods up by year so we can match to MODIS land cover data from the corresponding year.
159 |
160 | ```{r landcover-lsm-buffer}
161 | neighborhood_radius <- 5 * ceiling(max(res(landcover))) / 2 # half the width of 5 raster cells
162 | ebird_buff <- ebird %>%
163 | distinct(year = format(observation_date, "%Y"),
164 | locality_id, latitude, longitude) %>%
165 | # for years after the last landcover year, use the most recent landcover data
166 | mutate(year_lc = if_else(as.integer(year) > max_lc_year,
167 | as.character(max_lc_year), year),
168 | year_lc = paste0("y", year_lc)) %>%
169 | # convert to spatial features
170 | st_as_sf(coords = c("longitude", "latitude"), crs = 4326) %>%
171 | # transform to modis projection
172 | st_transform(crs = projection(landcover)) %>%
173 | # buffer to create neighborhood around each point
174 | st_buffer(dist = neighborhood_radius) %>%
175 | # nest by landcover year: year_lc is the only column left outside the nested data
176 | nest(data = c(year, locality_id, geometry))
177 | ```
178 |
179 | Now, we'll loop over the years and for each circular neighborhood extract all the raster values within that neighborhood and count the number of cells of each landcover class. We use the `exactextractr` package for this, since it's often orders of magnitude faster than using `raster::extract()`.
180 |
181 | ```{r landcover-lsm-extract}
182 | lc_extract <- NULL
183 | for (yr in intersect(names(landcover), ebird_buff$year_lc)) {
184 | # get the buffered checklists for a given year; landcover years with no checklists are skipped
185 | regions <- ebird_buff$data[[which(yr == ebird_buff$year_lc)]]
186 | # get landcover values within each buffered checklist area, one data frame per buffer
187 | ee <- exact_extract(landcover[[yr]], regions, progress = FALSE)
188 | # count the number of each landcover class for each checklist buffer
189 | ee_count <- map(ee, ~ count(., landcover = value))
190 | # attach the year and locality id back to the checklists
191 | ee_summ <- tibble(st_drop_geometry(regions), data = ee_count) %>%
192 | unnest(data)
193 | # bind to results accumulated from previous years
194 | lc_extract <- bind_rows(lc_extract, ee_summ)
195 | }
196 | ```
197 |
198 | Now we have the set of land cover values within a neighborhood around each checklist location. We can summarize these data within each neighborhood to calculate PLAND: the proportion of the neighborhood within each land cover class.
199 |
200 | ```{r landcover-lsm-pland}
201 | pland <- lc_extract %>%
202 | # calculate proportion of cells in each landcover class within each location-year
203 | group_by(locality_id, year) %>%
204 | mutate(pland = n / sum(n)) %>%
205 | ungroup() %>%
206 | select(-n) %>%
207 | # remove NAs after tallying so pland is relative to total number of cells
208 | filter(!is.na(landcover))
209 | ```
210 |
211 | Finally, we'll convert the numeric landcover codes to more descriptive names and transform the data to a wide format with each row a location and the PLAND values in columns.
212 |
213 | ```{r landcover-lsm-trans}
214 | # lookup table from numeric landcover code (0-15) to a descriptive column name
215 | lc_names <- tibble(landcover = 0:15,
216 | lc_name = c("pland_00_water",
217 | "pland_01_evergreen_needleleaf",
218 | "pland_02_evergreen_broadleaf",
219 | "pland_03_deciduous_needleleaf",
220 | "pland_04_deciduous_broadleaf",
221 | "pland_05_mixed_forest",
222 | "pland_06_closed_shrubland",
223 | "pland_07_open_shrubland",
224 | "pland_08_woody_savanna",
225 | "pland_09_savanna",
226 | "pland_10_grassland",
227 | "pland_11_wetland",
228 | "pland_12_cropland",
229 | "pland_13_urban",
230 | "pland_14_mosiac", # NOTE(review): misspelling of "mosaic"; becomes a column name, so renaming needs matching changes wherever it is read
231 | "pland_15_barren"))
232 | pland <- pland %>%
233 | inner_join(lc_names, by = "landcover") %>%
234 | arrange(landcover) %>%
235 | select(-landcover)
236 |
237 | # transform to wide format, filling in implicit missing values with 0s
238 | pland <- pland %>%
239 | pivot_wider(names_from = lc_name,
240 | values_from = pland,
241 | values_fill = list(pland = 0))
242 |
243 | # save
244 | write_csv(pland, "data/modis_pland_location-year.csv")
245 | ```
246 |
247 | ## Prediction surface {#covariates-prediction}
248 |
249 | After fitting species distribution models, the goal is typically to make predictions throughout the study area. To do this, we'll need a regular grid of habitat covariates over which to make predictions. In this section, we'll create such a prediction surface for BCR 27 using the MODIS land cover data from the most recent year for which they're available. To start, we'll need a template raster with cells equal in size to the neighborhoods we defined in the previous section: 5 by 5 MODIS land cover cells. We can use `raster::aggregate()` to achieve this. We'll also use `raster::rasterize()` to assign the value 1 to all cells within BCR 27 and leave all cells outside BCR 27 empty.
250 |
251 | ```{r landcover-prediction-template}
252 | agg_factor <- round(2 * neighborhood_radius / res(landcover)) # landcover cells per neighborhood width
253 | r <- raster(landcover) %>% # template raster built from the landcover stack
254 | aggregate(agg_factor)
255 | r <- bcr %>%
256 | st_transform(crs = projection(r)) %>%
257 | rasterize(r, field = 1) %>% # cells within the bcr get the value 1
258 | # remove any empty cells at edges
259 | trim()
260 | r <- writeRaster(r, filename = "data/prediction-surface.tif", overwrite = TRUE)
261 | ```
262 |
263 | Next, for each cell of this raster, we'll calculate the PLAND metrics using the same approach as the previous section. Note that we will only be creating this prediction surface for the most current year of landcover data in our example.
264 |
265 | ```{r landcover-prediction-calc}
266 | # get cell centers and create neighborhoods
267 | r_centers <- rasterToPoints(r, spatial = TRUE) %>%
268 | st_as_sf() %>%
269 | transmute(id = row_number())
270 | r_cells <- st_buffer(r_centers, dist = neighborhood_radius)
271 |
272 | # extract landcover values for the most recent year only; a single layer is needed so each result has a `value` column
273 | lc_extract_pred <- landcover[[paste0("y", max_lc_year)]] %>%
274 | exact_extract(r_cells, progress = FALSE) %>%
275 | map(~ count(., landcover = value)) %>%
276 | tibble(id = r_cells$id, data = .) %>%
277 | unnest(data)
278 |
279 | # calculate the proportion for each landcover class
280 | pland_pred <- lc_extract_pred %>%
281 | group_by(id) %>%
282 | mutate(pland = n / sum(n)) %>%
283 | ungroup() %>%
284 | select(-n) %>%
285 | # remove NAs after tallying so pland is relative to total number of cells
286 | filter(!is.na(landcover))
287 |
288 | # convert names to be more descriptive
289 | pland_pred <- pland_pred %>%
290 | inner_join(lc_names, by = "landcover") %>%
291 | arrange(landcover) %>%
292 | select(-landcover)
293 |
294 | # transform to wide format, filling in implicit missing values with 0s
295 | pland_pred <- pland_pred %>%
296 | pivot_wider(names_from = lc_name,
297 | values_from = pland,
298 | values_fill = list(pland = 0)) %>%
299 | mutate(year = max_lc_year) %>%
300 | select(id, year, everything())
301 |
302 | # join in lon/lat coordinates of the cell centers
303 | pland_coords <- st_transform(r_centers, crs = 4326) %>%
304 | st_coordinates() %>%
305 | as.data.frame() %>%
306 | cbind(id = r_centers$id, .) %>%
307 | rename(longitude = X, latitude = Y) %>%
308 | inner_join(pland_pred, by = "id")
309 | ```
310 |
311 | Keeping these data in a data frame is a compact way to store them and will be required once we make model predictions in later chapters. However, we can always use the raster template to convert these PLAND metrics into a spatial format, for example, if we want to map them. Let's look at how this works for land cover class 4: deciduous broadleaf forest.
312 |
313 | ```{r landcover-prediction-map}
314 | map_proj <- st_crs("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96") # albers equal-area, used for mapping
315 | forest_cover <- pland_coords %>%
316 | # convert to spatial features
317 | st_as_sf(coords = c("longitude", "latitude"), crs = 4326) %>%
318 | st_transform(crs = projection(r)) %>%
319 | # rasterize points onto the template raster, using the deciduous broadleaf column as cell values
320 | rasterize(r, field = "pland_04_deciduous_broadleaf") %>%
321 | # project to albers equal-area for mapping
322 | projectRaster(crs = map_proj$proj4string, method = "ngb") %>%
323 | # trim off empty edges of raster
324 | trim()
325 |
326 | # make a map, with a margin at the top for the two-line title
327 | par(mar = c(0.25, 0.25, 2, 0.25))
328 | t <- str_glue("Proportion of Deciduous Broadleaf Forest\n",
329 | "{max_lc_year} MODIS Landcover")
330 | plot(forest_cover, axes = FALSE, box = FALSE, col = viridis(10), main = t)
331 | ```
332 |
333 | ## Elevation {#covariates-elevation}
334 |
335 | In some scenarios, you may want to include additional covariates to complement the land cover variables. There is a wealth of open access raster data available for this purpose; however, in most cases, these data will not have a simple R interface for accessing them. Instead, you'll typically have to manually download and process these data. As an example of how this works, we'll demonstrate how to include covariates for elevation, which frequently plays an important role in shaping species distributions.
336 |
337 | Amatulli et al. [-@amatulliSuiteGlobalCrossscale2018] provide a suite of global, 1km resolution topographic variables designed for use in distribution modeling. A range of variables are available, including elevation, slope, roughness, and many others; we'll focus on elevation here, but the approach can easily be applied to other variables. To start, visit [the website for these data](http://www.earthenv.org/topography), download the 1 km resolution median elevation product, and save the file (`elevation_1KMmd_GMTEDmd.tif`) in the `data/` subdirectory of your project:
338 |
339 | 
340 |
341 | ```{r covariates-elevation-dl, echo=FALSE}
342 | f_dem <- "elevation_1KMmd_GMTEDmd.tif"
343 | if (!file.exists(file.path("data", f_dem))) { # download only when the file is not already in data/
344 | download.file(paste0("https://data.earthenv.org/topography/", f_dem),
345 | file.path("data", f_dem))
346 | }
347 | ```
348 |
349 | Next we'll load the file, crop it down from its full global extent to just the portion we need for BCR 27, and reproject it to the MODIS sinusoidal projection.
350 |
351 | ```{r covariates-elevation-load}
352 | elev <- raster("data/elevation_1KMmd_GMTEDmd.tif")
353 | # crop, buffer bcr by 10 km to provide a little wiggle room
354 | elev <- bcr %>%
355 | st_buffer(dist = 10000) %>%
356 | st_transform(crs = projection(elev)) %>%
357 | crop(elev, .) %>%
358 | projectRaster(crs = projection(landcover)) # reproject to match the landcover data
359 | ```
360 |
361 | Now we extract the elevation values within the neighborhood of each checklist location just as we did before for the land cover data. Then we'll calculate the median and standard deviation of the elevation within each neighborhood.
362 |
363 | ```{r covariates-elevation-extract}
364 | # buffer each checklist location
365 | ebird_buff_noyear <- ebird %>%
366 | distinct(locality_id, latitude, longitude) %>%
367 | st_as_sf(coords = c("longitude", "latitude"), crs = 4326) %>%
368 | st_transform(crs = projection(elev)) %>%
369 | st_buffer(dist = neighborhood_radius)
370 |
371 | # extract elevation values and calculate median and sd
372 | locs <- st_set_geometry(ebird_buff_noyear, NULL) %>%
373 | mutate(id = row_number())
374 | elev_checklists <- exact_extract(elev, ebird_buff_noyear, progress = FALSE) %>%
375 | map_dfr(~ tibble(elevation_median = median(.$value, na.rm = TRUE),
376 | elevation_sd = sd(.$value, na.rm = TRUE))) %>%
377 | # bind to lookup table to get locality_id; relies on results being in the order of the input buffers
378 | bind_cols(locs, .)
379 | ```
380 |
381 | We'll need to repeat this process to calculate the elevation covariates for the prediction surface.
382 |
383 | ```{r covariates-elevation-pred}
384 | # extract and calculate median and sd
385 | elev_pred <- exact_extract(elev, r_cells, progress = FALSE) %>%
386 | map_dfr(~ tibble(elevation_median = median(.$value, na.rm = TRUE),
387 | elevation_sd = sd(.$value, na.rm = TRUE))) %>%
388 | # bind to the prediction cells to get the cell id
389 | bind_cols(st_drop_geometry(r_cells), .)
390 | ```
391 |
392 | Finally, we'll combine these elevation covariates with the land cover covariates.
393 |
394 | ```{r covariates-elevation-combine}
395 | # checklist covariates: attach elevation to the land cover proportions by locality
396 | pland_elev_checklist <- inner_join(pland, elev_checklists, by = "locality_id")
397 | write_csv(pland_elev_checklist, "data/pland-elev_location-year.csv")
398 |
399 | # prediction surface covariates: attach elevation to the land cover proportions by cell id
400 | pland_elev_pred <- inner_join(pland_coords, elev_pred, by = "id")
401 | write_csv(pland_elev_pred, "data/pland-elev_prediction-surface.csv")
402 | glimpse(pland_elev_pred)
403 | ```
404 |
405 | This completes the data preparation. The following chapters will focus on using these data to model species distributions.
406 |
--------------------------------------------------------------------------------
/07_references.Rmd:
--------------------------------------------------------------------------------
1 | `r if (knitr::is_html_output()) '# References {-}'`
2 |
3 |
4 | ```{r prepare-data-package, echo = FALSE}
5 | zip_file <- "data/data.zip"
6 | files <- c("ebd_woothr_june_bcr27.txt", # files in data/ to bundle into the archive
7 | "ebd_checklists_june_bcr27.txt",
8 | "ebd_woothr_june_bcr27_zf.csv",
9 | "gis-data.gpkg",
10 | "mcd12q1_classes.csv",
11 | "pland-elev_location-year.csv",
12 | "pland-elev_prediction-surface.csv",
13 | "prediction-surface.tif")
14 | unlink(zip_file) # delete any existing archive before zip() is called
15 | zip(zip_file, file.path("data", files))
16 | ```
17 |
18 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: ebirstbestpractices
2 | Title: Best Practices for Using eBird Data
3 | Version: 2.0
4 | Authors@R: c(
5 | person("Matthew", "Strimas-Mackey", , email = "mes335@cornell.edu",
6 | c("aut", "cre"),
7 | comment = c(ORCID = "0000-0001-8929-7776")),
8 | person("Wesley", "Hochachka", "M.", "wmh6@cornell.edu", "aut"),
9 | person("Viviana", "Ruiz-Gutierrez", , "vr45@cornell.edu", "aut"),
10 | person("Orin", "Robinson", "J.", "ojr7@cornell.edu", "aut"),
11 | person("Eliot", "Miller", "T.", "etm45@cornell.edu", "aut"),
12 | person("Tom", "Auer", , "mta45@cornell.edu", "aut"),
13 | person("Steve", "Kelling", , "stk2@cornell.edu", "aut"),
14 | person("Daniel", "Fink", , "garrett@cornell.edu", "aut"),
15 | person("Alison", "Johnston", , "aj327@cornell.edu", "aut",
16 | comment = c(ORCID = "0000-0001-8221-013X"))
17 | )
18 | URL: https://github.com/CornellLabOfOrnithology/ebird-best-practices
19 | License: GPL-3
20 | Depends: R (>= 3.5.0)
21 | Imports:
22 | AICcmodavg,
23 | auk (>= 0.3.3),
24 | bookdown,
25 | data.table,
26 | dggridR,
27 | dplyr (>= 0.8.3),
28 | ebirdst,
29 | exactextractr,
30 | fields,
31 | fitdistrplus,
32 | forcats,
33 | ggplot2,
34 | ggthemes,
35 | gridExtra,
36 | hexbin,
37 | janitor,
38 | knitr,
39 | lubridate,
40 | lwgeom,
41 | mgcv,
42 | MODIS,
43 | MuMIn,
44 | pdp,
45 | PresenceAbsence,
46 | purrr,
47 | ranger (>= 0.11.2),
48 | raster (>= 2.9-5),
49 | readr,
50 | rlang (>= 0.4.1),
51 | rmarkdown,
52 | rnaturalearth,
53 | rvest,
54 | scales,
55 | scam,
56 | sf (>= 0.8-0),
57 | smoothr,
58 | stringr,
59 | tibble,
60 | tidyr (>= 1.2.1),
61 | tidyverse,
62 | unmarked,
63 | verification,
64 | viridis
65 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | This work is licensed under the Creative Commons Attribution-NonCommercial-NoDerivs 3.0 United States License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-nd/3.0/us/ or send a letter to Creative Commons, 444 Castro Street, Suite 900, Mountain View, California, 94041, USA.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Best Practices for Using eBird Data
2 |
3 | [](https://zenodo.org/badge/latestdoi/233674076)
4 | []()
5 | [](http://www.gnu.org/licenses/gpl-3.0)
7 |
8 | This repository contains the source for the book [Best Practices for Using eBird Data](https://cornelllabofornithology.github.io/ebird-best-practices/),
9 | which is a supplement to *Analytical guidelines to increase the value of community science data: An example using eBird data to estimate species distributions* ([Johnston et al. 2021](https://onlinelibrary.wiley.com/doi/10.1111/ddi.13271)).
10 |
11 | The R packages used in this book can be installed with:
12 |
13 | ```{r}
14 | # install.packages("remotes")
15 | remotes::install_github("mstrimas/ebppackages")
16 | ```
17 |
18 | Please cite this book as:
19 |
20 | > Strimas-Mackey, M., W.M. Hochachka, V. Ruiz-Gutierrez, O.J. Robinson, E.T. Miller, T. Auer, S. Kelling, D. Fink, A. Johnston. 2023. Best Practices for Using eBird Data. Version 2.0. https://cornelllabofornithology.github.io/ebird-best-practices/. Cornell Lab of Ornithology, Ithaca, New York. https://doi.org/10.5281/zenodo.3620739
--------------------------------------------------------------------------------
/_bookdown.yml:
--------------------------------------------------------------------------------
1 | book_filename: "ebird-best-practices"
2 | delete_merged_file: true
3 | before_chapter_script: "_common.R" # must match the on-disk file name exactly (case-sensitive file systems)
4 | output_dir: "docs"
5 | language:
6 | ui:
7 | chapter_name: "Chapter "
8 |
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/abundance-assess-mag-plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/abundance-assess-mag-plot-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/abundance-model-cov-plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/abundance-model-cov-plot-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/abundance-model-dist-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/abundance-model-dist-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/abundance-predict-map-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/abundance-predict-map-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/abundance-predict-peak-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/abundance-predict-peak-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/ebird-explore-distance-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/ebird-explore-distance-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/ebird-explore-duration-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/ebird-explore-duration-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/ebird-explore-map-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/ebird-explore-map-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/ebird-explore-observers-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/ebird-explore-observers-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/ebird-explore-time-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/ebird-explore-time-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/encounter-habitat-pd-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/encounter-habitat-pd-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/encounter-habitat-pi-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/encounter-habitat-pi-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/encounter-predict-map-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/encounter-predict-map-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/encounter-predict-time-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/encounter-predict-time-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/encounter-rf-cal-cal-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/encounter-rf-cal-cal-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/encounter-sss-map-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/encounter-sss-map-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/encounter-sss-toy-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/encounter-sss-toy-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/landcover-prediction-map-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/landcover-prediction-map-1.png
--------------------------------------------------------------------------------
/_bookdown_files/ebird-best-practices_files/figure-html/occupancy-predict-map-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_bookdown_files/ebird-best-practices_files/figure-html/occupancy-predict-map-1.png
--------------------------------------------------------------------------------
/_common.R:
--------------------------------------------------------------------------------
1 | knitr::opts_knit$set(root.dir = here::here())  # knitr package option: use the path returned by here::here() as root.dir for evaluating chunks
2 | set.seed(1)  # fix the RNG seed at 1 so random draws made after this point are repeatable
3 | options(digits = 3)  # R's global `digits` option: print numbers with 3 significant digits
4 |
5 | knitr::opts_chunk$set(  # default chunk options; an individual chunk can still override any of these in its header
6 | comment = "#>",  # prefix placed in front of each line of printed chunk output
7 | collapse = TRUE,  # merge a chunk's source and its output into a single block where possible
8 | cache = FALSE,  # chunk caching off: every chunk is re-evaluated on each build
9 | warning = FALSE,  # do not show warnings in the rendered output
10 | error = FALSE,  # per knitr docs, FALSE stops the build on a chunk error instead of printing it — TODO confirm this is intended
11 | message = FALSE,  # do not show messages in the rendered output
12 | out.width = "\\textwidth",  # NOTE(review): LaTeX length; the visible _output.yml only configures bookdown::gitbook (HTML) — confirm this value is meaningful there
13 | fig.align = "center",  # centre figures
14 | fig.width = 7,  # figure width passed to the graphics device (inches)
15 | fig.asp = 0.618, # 1 / phi (inverse golden ratio): figure height = 0.618 * fig.width
16 | fig.show = "hold"  # hold all plots from a chunk and emit them after the chunk's code
17 | )
18 | options(knitr.kable.NA = "")  # knitr::kable() renders NA cells as empty strings
19 | options(dplyr.print_min = 6, dplyr.print_max = 6)  # both dplyr print limits set to 6 rows, keeping printed tibbles short
20 |
21 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CornellLabofOrnithology/ebird-best-practices/c50c52745cda828e0da9882dae0a4beb49a561a8/_config.yml
--------------------------------------------------------------------------------
/_output.yml:
--------------------------------------------------------------------------------
1 | bookdown::gitbook:
2 | css: style.css
3 | config:
4 | toc:
5 | collapse: section
6 | before: |
7 |
Matthew Strimas-Mackey, Wesley M. Hochachka, Viviana Ruiz-Gutierrez, Orin J. Robinson, Eliot T. Miller, Tom Auer, Steve Kelling, Daniel Fink, Alison Johnston
256 |
Version 1.0
257 |
258 |
259 |
260 |
Version 1.0 is Retired!
261 |
262 |
This version of Best Practices for Using eBird Data is no longer being maintained. This webpage is now for reference purposes only. Please consult Version 2.0 of Best Practices for Using eBird Data instead.