├── .github ├── .gitignore └── workflows │ └── R-CMD-check.yaml ├── .gitignore ├── tests ├── testthat.R ├── spelling.R └── testthat │ └── test-ebird.R ├── .Rbuildignore ├── inst ├── extdata │ ├── ebd_relAug-2021.tar │ └── ebd_sampling_relAug-2021.tar └── WORDLIST ├── NAMESPACE ├── codecov.yml ├── birddb.Rproj ├── R ├── duckdb_pragmas.R ├── sample_data.R ├── ebird_data_dir.R ├── ebird.R ├── ebird_remote.R ├── ebird_conn.R └── import_ebird.R ├── man ├── sample_data.Rd ├── ebird_data_dir.Rd ├── ebird_remote.Rd ├── ebird_conn.Rd ├── ebird_tbl.Rd └── import_ebird.Rd ├── LICENSE ├── DESCRIPTION ├── README.Rmd └── README.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(birddb) 3 | 4 | test_check("birddb") 5 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^\.github$ 5 | ^codecov\.yml$ 6 | -------------------------------------------------------------------------------- /inst/extdata/ebd_relAug-2021.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cboettig/birddb/HEAD/inst/extdata/ebd_relAug-2021.tar -------------------------------------------------------------------------------- /inst/extdata/ebd_sampling_relAug-2021.tar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cboettig/birddb/HEAD/inst/extdata/ebd_sampling_relAug-2021.tar -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | CMD 2 | Codecov 3 | dir 4 | dplyr 5 | dbplyr 6 | duckdb 7 | DuckDB 8 | eBird 9 | ebd 10 | ebird 11 | http 12 | tarfile 13 | unarchive -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if(requireNamespace('spelling', quietly = TRUE)) 2 | spelling::spell_check_test(vignettes = TRUE, error = FALSE, 3 | skip_on_cran = TRUE) 4 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(checklists) 4 | export(ebird_conn) 5 | export(ebird_data_dir) 6 | export(ebird_remote) 7 | export(import_ebird) 8 | export(observations) 9 | export(sample_checklist_data) 10 | export(sample_observation_data) 11 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /birddb.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | 
RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageRoxygenize: rd,collate,namespace 19 | -------------------------------------------------------------------------------- /R/duckdb_pragmas.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | duckdb_mem_limit <- function(conn, memory_limit = 16, units = "GB"){ 4 | DBI::dbExecute(conn = conn, 5 | paste0("PRAGMA memory_limit='", memory_limit, units, "'")) 6 | } 7 | # set CPU parallel 8 | duckdb_parallel <- function(conn, mc.cores = options("mc.cores", 2L)){ 9 | DBI::dbExecute(conn, paste0("PRAGMA threads=", mc.cores)) 10 | } 11 | 12 | ## Used by in-memory connections when creating temporary tables 13 | duckdb_set_tempdir <- function(conn, temp = tempdir()){ 14 | DBI::dbExecute(conn, paste0("PRAGMA temp_directory='", temp, "'")) 15 | } 16 | -------------------------------------------------------------------------------- /man/sample_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sample_data.R 3 | \name{sample_data} 4 | \alias{sample_data} 5 | \alias{sample_checklist_data} 6 | \alias{sample_observation_data} 7 | \title{Provide path to a small subset of eBird data} 8 | \usage{ 9 | sample_checklist_data() 10 | 11 | sample_observation_data() 12 | } 13 | \value{ 14 | The path to the sample tar archive file. 15 | } 16 | \description{ 17 | These small sample dataset consists of all observations from Hong Kong in the 18 | year 2012. Sample files are provided for checklist and observation data, both 19 | packaged as tar archive files to mimic the format of the eBird Basic Dataset 20 | download. 
21 | } 22 | \examples{ 23 | sample_checklist_data() 24 | sample_observation_data() 25 | } 26 | -------------------------------------------------------------------------------- /R/sample_data.R: -------------------------------------------------------------------------------- 1 | #' Provide path to a small subset of eBird data 2 | #' 3 | #' These small sample dataset consists of all observations from Hong Kong in the 4 | #' year 2012. Sample files are provided for checklist and observation data, both 5 | #' packaged as tar archive files to mimic the format of the eBird Basic Dataset 6 | #' download. 7 | #' 8 | #' @name sample_data 9 | #' @return The path to the sample tar archive file. 10 | #' @examples 11 | #' sample_checklist_data() 12 | #' sample_observation_data() 13 | NULL 14 | 15 | #' @export 16 | #' @rdname sample_data 17 | sample_checklist_data <- function() { 18 | system.file("extdata", "ebd_sampling_relAug-2021.tar", package = "birddb", 19 | mustWork = TRUE) 20 | } 21 | 22 | #' @export 23 | #' @rdname sample_data 24 | sample_observation_data <- function() { 25 | system.file("extdata", "ebd_relAug-2021.tar", package = "birddb", 26 | mustWork = TRUE) 27 | } 28 | -------------------------------------------------------------------------------- /man/ebird_data_dir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebird_data_dir.R 3 | \name{ebird_data_dir} 4 | \alias{ebird_data_dir} 5 | \title{Retrieve directory used to store eBird data parquet files} 6 | \usage{ 7 | ebird_data_dir() 8 | } 9 | \description{ 10 | Show the location used by \code{birddb} to store eBird data parquet files. The 11 | default location is that chosen by R based on your OS, see 12 | \code{\link[tools:userdir]{tools::R_user_dir()}}. 
Alternately, users can configure a different permanent 13 | storage location by setting their desired path in the environmental variable 14 | \code{BIRDDB_HOME}. This may be desirable when multiple users of the same machine 15 | or server want to access a single copy of the eBird data. To set 16 | \code{BIRDDB_HOME}, add it to your \code{.Renviron} file, for example by using 17 | \code{usethis::edit_r_environ()}. 18 | } 19 | \examples{ 20 | ebird_data_dir() 21 | } 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Carl Boettiger 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: birddb 2 | Version: 0.1.0 3 | Title: Local Database Interface to eBird 4 | Description: Yet another package for working with 'eBird' data in R. This package 5 | is similar to 'auk' but provides a relational database interface. This allows 6 | users to query the data using familiar operations such as 'dplyr'. The 7 | high-performance backend is made possible by 'parquet' and 'duckdb'. 8 | Authors@R: c( 9 | person("Carl", "Boettiger", , "cboettig@gmail.com", c("aut", "cre"), 10 | comment = c(ORCID = "0000-0002-1642-628X")) 11 | ) 12 | License: MIT + file LICENSE 13 | Encoding: UTF-8 14 | ByteCompile: true 15 | Depends: R (>= 4.0) 16 | Imports: 17 | arrow (>= 7.0.0), 18 | duckdb (>= 0.2.9), 19 | DBI, 20 | dplyr, 21 | dbplyr, 22 | digest, 23 | utils 24 | Suggests: 25 | spelling, 26 | testthat (>= 3.0.0), 27 | covr, 28 | knitr, 29 | rmarkdown, 30 | progress 31 | URL: https://github.com/cboettig/birddb 32 | BugReports: https://github.com/cboettig/birddb 33 | Language: en-US 34 | Roxygen: list(markdown = TRUE) 35 | RoxygenNote: 7.1.2 36 | Config/testthat/edition: 3 37 | -------------------------------------------------------------------------------- /tests/testthat/test-ebird.R: -------------------------------------------------------------------------------- 1 | test_that("birddb works", { 2 | temp_dir <- file.path(tempdir(), "birddb") 3 | Sys.setenv("BIRDDB_HOME" = temp_dir) 4 | 5 | import_ebird(sample_observation_data()) 6 | import_ebird(sample_checklist_data()) 7 | 8 | # observations 9 | con <- ebird_conn("observations") 10 | observations <- observations(con) 11 | expect_s3_class(observations, "tbl") 12 | expect_s3_class(observations, "tbl_dbi") 13 | expect_equal(DBI::dbListTables(con), "observations") 14 | 15 | out <- observations %>% dplyr::count(common_name) %>% 
dplyr::collect() 16 | expect_s3_class(out, "data.frame") 17 | expect_gt(nrow(out), 0) 18 | 19 | # checklists 20 | con <- ebird_conn("checklists") 21 | checklists <- checklists(con) 22 | expect_s3_class(checklists, "tbl") 23 | expect_s3_class(checklists, "tbl_dbi") 24 | # ensure that both tables are in the same database 25 | expect_equal(sort(DBI::dbListTables(con)), 26 | c("checklists", "observations")) 27 | 28 | out <- checklists %>% dplyr::count(country) %>% dplyr::collect() 29 | expect_s3_class(out, "data.frame") 30 | expect_gt(nrow(out), 0) 31 | 32 | # cleanup 33 | DBI::dbDisconnect(con, shutdown = TRUE) 34 | unlink(temp_dir, recursive = TRUE) 35 | }) 36 | -------------------------------------------------------------------------------- /R/ebird_data_dir.R: -------------------------------------------------------------------------------- 1 | #' Retrieve directory used to store eBird data parquet files 2 | #' 3 | #' Show the location used by `birddb` to store eBird data parquet files. The 4 | #' default location is that chosen by R based on your OS, see 5 | #' [tools::R_user_dir()]. Alternately, users can configure a different permanent 6 | #' storage location by setting their desired path in the environmental variable 7 | #' `BIRDDB_HOME`. This may be desirable when multiple users of the same machine 8 | #' or server want to access a single copy of the eBird data. To set 9 | #' `BIRDDB_HOME`, add it to your `.Renviron` file, for example by using 10 | #' `usethis::edit_r_environ()`. 
11 | #' 12 | #' @export 13 | #' @examples 14 | #' ebird_data_dir() 15 | ebird_data_dir <- function() { 16 | Sys.getenv("BIRDDB_HOME", 17 | tools::R_user_dir("birddb", "data") 18 | ) 19 | } 20 | 21 | # a location for duckdb view files 22 | # very small, but should not be shared between users 23 | # currently defaults to storing in memory, making it ephemeral 24 | ebird_db_dir <- function() { 25 | path <- Sys.getenv("BIRDDB_DUCKDB", ":memory:") 26 | if (path == ":memory:") { 27 | return(path) 28 | } 29 | dir.create(path, recursive = TRUE, showWarnings = FALSE) 30 | file.path(path, "database") 31 | } 32 | -------------------------------------------------------------------------------- /man/ebird_remote.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebird_remote.R 3 | \name{ebird_remote} 4 | \alias{ebird_remote} 5 | \title{ebird remote} 6 | \usage{ 7 | ebird_remote( 8 | dataset = c("observations", "checklists"), 9 | version = "Oct-2021", 10 | bucket = "ebird", 11 | to_duckdb = FALSE, 12 | host = "minio.cirrus.carlboettiger.info", 13 | ... 14 | ) 15 | } 16 | \arguments{ 17 | \item{dataset}{name of dataset (table) to access.} 18 | 19 | \item{version}{eBird snapshot date} 20 | 21 | \item{bucket}{eBird bucket name (including region)} 22 | 23 | \item{to_duckdb}{Return a remote duckdb connection or arrow connection? 24 | Note that leaving as FALSE may be faster but is limited to the dplyr-style 25 | operations supported by \link{arrow} alone.} 26 | 27 | \item{host}{Remote S3-based host of eBird parquet data} 28 | 29 | \item{...}{additional parameters passed to the s3_bucket() (e.g. for remote 30 | access to independently hosted buckets)} 31 | } 32 | \description{ 33 | Connect to an eBird snapshot remote. Can be much faster than downloading 34 | for one-off use or when using the package from a server in the same region 35 | as the data. 
36 | } 37 | \examples{ 38 | \dontshow{if (interactive()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} 39 | \dontshow{\}) # examplesIf} 40 | } 41 | -------------------------------------------------------------------------------- /R/ebird.R: -------------------------------------------------------------------------------- 1 | #' Return a remote connection to a table in your local eBird database 2 | #' 3 | #' Parquet files setup with a view in a DuckDB database, as done by 4 | #' [ebird_conn()], can be queried with [dplyr] syntax. This function sets up 5 | #' [tbl_dbi] object, which are remote tables referencing either the checklist or 6 | #' observation dataset. These remote tables can then by queried with [dplyr] 7 | #' similarly to a [data.frame]. 8 | #' 9 | #' @param conn a connection to the local eBird database, see [ebird_conn()]. 10 | #' 11 | #' @details 12 | #' When working with a remote table in [dplyr], the primary different compared 13 | #' to working with a normal [data.frame] is that calls are evaluated lazily, 14 | #' generating SQL that is only sent to the database when you request the data. 15 | #' The [dplyr] functions [collect()] and [compute()] can be used to force 16 | #' evaluation. 17 | #' 18 | #' @return A [tbl_dbi] object referencing either the checklist or observation 19 | #' data in DuckDB. 
20 | #' @name ebird_tbl 21 | #' @examples 22 | #' # only use a tempdir for this example, don't copy this for real data 23 | #' temp_dir <- file.path(tempdir(), "birddb") 24 | #' Sys.setenv("BIRDDB_HOME" = temp_dir) 25 | #' 26 | #' # get the path to a sample dataset provided with the package 27 | #' tar <- sample_observation_data() 28 | #' # import the sample dataset to parquet 29 | #' import_ebird(tar) 30 | #' 31 | #' # set up the database connection to the observations data 32 | #' observations <- observations() 33 | #' # query the data, number of observations of each species 34 | #' dplyr::count(observations, common_name) 35 | #' 36 | #' unlink(temp_dir, recursive = TRUE) 37 | NULL 38 | 39 | #' @rdname ebird_tbl 40 | #' @export 41 | observations <- function(conn = ebird_conn("observations")) { 42 | dplyr::tbl(conn, "observations") 43 | } 44 | 45 | #' @rdname ebird_tbl 46 | #' @export 47 | checklists <- function(conn = ebird_conn("checklists")) { 48 | dplyr::tbl(conn, "checklists") 49 | } 50 | -------------------------------------------------------------------------------- /man/ebird_conn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebird_conn.R 3 | \name{ebird_conn} 4 | \alias{ebird_conn} 5 | \title{Set up a \code{DBI}-style database connection to the imported eBird data} 6 | \usage{ 7 | ebird_conn( 8 | dataset = c("observations", "checklists"), 9 | cache_connection = TRUE, 10 | memory_limit = 16 11 | ) 12 | } 13 | \arguments{ 14 | \item{dataset}{the type of dataset to set up a connection to, either the 15 | observations of checklists.} 16 | 17 | \item{cache_connection}{should we preserve a cache of the connection? 
allows 18 | faster load times and prevents connection from being garbage-collected.} 19 | 20 | \item{memory_limit}{the memory limit for DuckDB.} 21 | } 22 | \value{ 23 | A \link{DBI} connection object using to communicate with the DuckDB 24 | database containing the eBird data. 25 | } 26 | \description{ 27 | Parquet files can be accessed as though they were relational database tables 28 | by setting up a view to the file using DuckDB. This function sets up a view 29 | on either the checklist or observation dataset and returns a \link{DBI}-style 30 | database connection to the data. The returned object can then be queried 31 | with SQL syntax via \link{DBI} or with \link{dplyr} syntax via \link{dbplyr}. For the latter 32 | approach, consider using the \code{\link[=checklists]{checklists()}} and \code{\link[=observations]{observations()}} functions 33 | which will return \link{tbl} objects ready for access using \link{dplyr} syntax. 34 | } 35 | \examples{ 36 | # only use a tempdir for this example, don't copy this for real data 37 | temp_dir <- file.path(tempdir(), "birddb") 38 | Sys.setenv("BIRDDB_HOME" = temp_dir) 39 | 40 | # get the path to a sample dataset provided with the package 41 | tar <- sample_observation_data() 42 | # import the sample dataset to parquet 43 | import_ebird(tar) 44 | # set up the database connection 45 | con <- ebird_conn(dataset = "observations") 46 | 47 | unlink(temp_dir, recursive = TRUE) 48 | } 49 | -------------------------------------------------------------------------------- /R/ebird_remote.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' ebird remote 4 | #' 5 | #' Connect to an eBird snapshot remote. Can be much faster than downloading 6 | #' for one-off use or when using the package from a server in the same region 7 | #' as the data. 
8 | #' 9 | #' @param version eBird snapshot date 10 | #' @param bucket eBird bucket name (including region) 11 | #' @param to_duckdb Return a remote duckdb connection or arrow connection? 12 | #' Note that leaving as FALSE may be faster but is limited to the dplyr-style 13 | #' operations supported by [arrow] alone. 14 | #' @param dataset name of dataset (table) to access. 15 | #' @param host Remote S3-based host of eBird parquet data 16 | #' @param ... additional parameters passed to the s3_bucket() (e.g. for remote 17 | #' access to independently hosted buckets) 18 | #' @examplesIf interactive() 19 | #' @export 20 | #' 21 | ebird_remote <- 22 | function(dataset = c("observations", "checklists"), 23 | version = "Oct-2021", 24 | bucket = "ebird", 25 | to_duckdb = FALSE, 26 | host = "minio.cirrus.carlboettiger.info", 27 | ...) { 28 | dataset <- match.arg(dataset) 29 | 30 | ## Not ideal, but these will cause problems if set 31 | unset_aws_env() 32 | 33 | server <- arrow::s3_bucket(bucket, 34 | endpoint_override = host, 35 | ...) 36 | 37 | path <- server$path(file.path(version, dataset, fsep = "/")) 38 | df <- arrow::open_dataset(path) 39 | if (to_duckdb) { 40 | df <- arrow::to_duckdb(df) 41 | } 42 | df 43 | } 44 | 45 | 46 | 47 | unset_aws_env <- function() { 48 | ## Consider re-setting these afterwards. 49 | ## What about ~/.aws ? 50 | ## Maybe set these to empty strings instead of unsetting? 
51 | 52 | ## Would be nice if we could simply override the detection of these 53 | Sys.unsetenv("AWS_DEFAULT_REGION") 54 | Sys.unsetenv("AWS_S3_ENDPOINT") 55 | Sys.unsetenv("AWS_ACCESS_KEY_ID") 56 | Sys.unsetenv("AWS_SECRET_ACCESS_KEY") 57 | } 58 | -------------------------------------------------------------------------------- /man/ebird_tbl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebird.R 3 | \name{ebird_tbl} 4 | \alias{ebird_tbl} 5 | \alias{observations} 6 | \alias{checklists} 7 | \title{Return a remote connection to a table in your local eBird database} 8 | \usage{ 9 | observations(conn = ebird_conn("observations")) 10 | 11 | checklists(conn = ebird_conn("checklists")) 12 | } 13 | \arguments{ 14 | \item{conn}{a connection to the local eBird database, see \code{\link[=ebird_conn]{ebird_conn()}}.} 15 | } 16 | \value{ 17 | A \link{tbl_dbi} object referencing either the checklist or observation 18 | data in DuckDB. 19 | } 20 | \description{ 21 | Parquet files setup with a view in a DuckDB database, as done by 22 | \code{\link[=ebird_conn]{ebird_conn()}}, can be queried with \link{dplyr} syntax. This function sets up 23 | \link{tbl_dbi} object, which are remote tables referencing either the checklist or 24 | observation dataset. These remote tables can then by queried with \link{dplyr} 25 | similarly to a \link{data.frame}. 26 | } 27 | \details{ 28 | When working with a remote table in \link{dplyr}, the primary different compared 29 | to working with a normal \link{data.frame} is that calls are evaluated lazily, 30 | generating SQL that is only sent to the database when you request the data. 31 | The \link{dplyr} functions \code{\link[=collect]{collect()}} and \code{\link[=compute]{compute()}} can be used to force 32 | evaluation. 
33 | } 34 | \examples{ 35 | # only use a tempdir for this example, don't copy this for real data 36 | temp_dir <- file.path(tempdir(), "birddb") 37 | Sys.setenv("BIRDDB_HOME" = temp_dir) 38 | 39 | # get the path to a sample dataset provided with the package 40 | tar <- sample_observation_data() 41 | # import the sample dataset to parquet 42 | import_ebird(tar) 43 | 44 | # set up the database connection to the observations data 45 | observations <- observations() 46 | # query the data, number of observations of each species 47 | dplyr::count(observations, common_name) 48 | 49 | unlink(temp_dir, recursive = TRUE) 50 | } 51 | -------------------------------------------------------------------------------- /man/import_ebird.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/import_ebird.R 3 | \name{import_ebird} 4 | \alias{import_ebird} 5 | \title{Import eBird data to parquet} 6 | \usage{ 7 | import_ebird(tarfile) 8 | } 9 | \arguments{ 10 | \item{tarfile}{path to the tar archive file downloaded from the eBird 11 | website. Files containing either observation data (e.g. 12 | \verb{ebd_rel.tar}) or checklist (e.g. \verb{ebd_sampling_rel.tar}) data 13 | can be provided} 14 | } 15 | \value{ 16 | Invisibly return the path to the directory containing eBird parquet 17 | files. 18 | } 19 | \description{ 20 | eBird data are released as tab-separated text files, packaged into tar 21 | archives. Given a path to an eBird tarfile, this function will extract and 22 | import the tar archive into a parquet-based database in your 23 | \code{\link[=ebird_data_dir]{ebird_data_dir()}}. 24 | } 25 | \details{ 26 | \href{https://ebird.org/home}{eBird} data are collected and organized around the 27 | concept of a checklist, representing observations from a single birding 28 | event. 
Each checklist contains a list of species observed, counts of the 29 | number of individuals seen of each species, the location and time of the 30 | observations, and a measure of the effort expended while collecting these 31 | data. The majority of the \href{https://ebird.org/home}{eBird} database is 32 | available for download in the form of the \href{https://ebird.org/data/download}{eBird Basic Dataset (EBD)}, a set of two tab-separated text 33 | files. 34 | 35 | The \strong{checklist} dataset (referred to as the Sampling Event Data on the 36 | eBird website) consists of one row for each eBird checklist and columns 37 | contain checklist-level information such as location, date, and search 38 | effort. The \strong{observation} dataset consists of one row for each species 39 | observed on each checklist and columns contain checklist-level information 40 | such as number of individuals detected. This dataset also contains all 41 | checklist-level variables, duplicated for each species on the same checklist. 42 | 43 | After \href{https://ebird.org/data/download}{submitting a request for data access}, users can download either or both 44 | of these datasets as tar archive files. \code{import_ebird()} takes the path to a 45 | tar file as input and imports the text file contained within to a parquet 46 | file, which will allow much easier access to the data. This function will 47 | automatically detect whether you are importing a checklist or observation 48 | dataset provided you \strong{do not change the name of the downloaded file or 49 | unarchive the tar file}. The parquet files will be stored in the directory 50 | specified by \code{\link[=ebird_data_dir]{ebird_data_dir()}}, consult the help for that function to learn 51 | how to modify the parquet directory. 
52 | } 53 | \examples{ 54 | # only use a tempdir for this example, don't copy this for real data 55 | temp_dir <- file.path(tempdir(), "birddb") 56 | Sys.setenv("BIRDDB_HOME" = temp_dir) 57 | 58 | # get the path to a sample dataset provided with the package 59 | tar <- sample_observation_data() 60 | # import the sample dataset to parquet 61 | import_ebird(tar) 62 | 63 | unlink(temp_dir, recursive = TRUE) 64 | } 65 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. 2 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | pull_request: 9 | branches: 10 | - main 11 | - master 12 | 13 | name: R-CMD-check 14 | 15 | jobs: 16 | R-CMD-check: 17 | runs-on: ${{ matrix.config.os }} 18 | 19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | config: 25 | - {os: windows-latest, r: 'release'} 26 | - {os: macOS-latest, r: 'release'} 27 | - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} 28 | - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest", http-user-agent: "R/4.1.0 (ubuntu-20.04) R (4.1.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" } 29 | 30 | env: 31 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 32 | RSPM: ${{ matrix.config.rspm }} 33 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 34 | 35 | steps: 36 | - uses: actions/checkout@v2 37 | 38 | - uses: r-lib/actions/setup-r@v1 39 | with: 40 | r-version: ${{ matrix.config.r }} 41 | 42 | - uses: r-lib/actions/setup-pandoc@v1 43 | 44 | - name: Query dependencies 45 | run: | 46 | 
install.packages('remotes') 47 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 48 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 49 | shell: Rscript {0} 50 | 51 | - name: Restore R package cache 52 | uses: actions/cache@v2 53 | with: 54 | path: ${{ env.R_LIBS_USER }} 55 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 56 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 57 | 58 | - name: Install system dependencies 59 | if: runner.os == 'Linux' 60 | run: | 61 | while read -r cmd 62 | do 63 | eval sudo $cmd 64 | done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') 65 | 66 | - name: Install dependencies 67 | run: | 68 | remotes::install_deps(dependencies = TRUE) 69 | remotes::install_cran("rcmdcheck") 70 | install.packages("https://github.com/duckdb/duckdb/releases/download/master-builds/duckdb_r_src.tar.gz", repos = NULL) 71 | shell: Rscript {0} 72 | 73 | - name: Check 74 | env: 75 | _R_CHECK_CRAN_INCOMING_REMOTE_: false 76 | run: | 77 | options(crayon.enabled = TRUE) 78 | rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") 79 | shell: Rscript {0} 80 | 81 | - name: Upload check results 82 | if: failure() 83 | uses: actions/upload-artifact@main 84 | with: 85 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 86 | path: check 87 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | editor_options: 4 | chunk_output_type: console 5 | --- 6 | 7 | 8 | 9 | ```{r, include = FALSE} 10 | knitr::opts_chunk$set( 11 | collapse = TRUE, 12 | comment = "#>", 13 | fig.path = "man/figures/README-", 14 | out.width = "100%" 15 | ) 16 | 17 | Sys.setenv("BIRDDB_HOME" = 
tempdir()) 18 | ``` 19 | 20 | # birddb 21 | 22 | 23 | [![R-CMD-check](https://github.com/cboettig/birddb/workflows/R-CMD-check/badge.svg)](https://github.com/cboettig/birddb/actions) 24 | [![Codecov test coverage](https://codecov.io/gh/cboettig/birddb/branch/main/graph/badge.svg)](https://codecov.io/gh/cboettig/birddb?branch=main) 25 | [![CRAN status](https://www.r-pkg.org/badges/version/birddb)](https://CRAN.R-project.org/package=birddb) 26 | 27 | 28 | The goal of `birddb` is to provide a relational database interface to a local copy of eBird. `birddb` works by importing the text-based ebird file into a local parquet file using [arrow](https://cran.r-project.org/package=arrow), which can be queried as a relational database using the familiar `dplyr` interface. 29 | `dplyr` translates R-based queries into SQL commands which are past to [`duckdb`](https://duckdb.org), which then queries the parquet database. Unlike the native `arrow` interface, `duckdb` supports the full set of SQL instructions, including windowed operations like `group_by`+`summarise` as well as table joins. 30 | 31 | 32 | ## Installation 33 | 34 | You can install the released version of `birddb` from [CRAN](https://CRAN.R-project.org) with: 35 | 36 | ``` r 37 | install.packages("birddb") 38 | ``` 39 | 40 | And the development version from [GitHub](https://github.com/) with: 41 | 42 | ``` r 43 | # install.packages("devtools") 44 | devtools::install_github("cboettig/birddb") 45 | ``` 46 | ## Getting Started 47 | 48 | ```{r message=FALSE} 49 | library(birddb) 50 | library(dplyr) 51 | ``` 52 | 53 | Before you can use `birddb` you will need to download the latest version of the eBird Basic Dataset from http://ebird.org/ebird/data/download. 54 | Once you have obtained a downloaded copy of the `tar` file, `birddb` can import it for you. 
The one-time import of the full data dump is a little slow (about 1 hr in my benchmark) due to the time required to extract the tar file and convert the text data into parquet format. 55 | 56 | For illustration and testing purposes, we will use the small eBird sample data, included in the package for convenience and testing purposes: 57 | 58 | ```{r} 59 | observations_tar <- birddb::sample_observation_data() 60 | checklists_tar <- birddb::sample_checklist_data() 61 | ``` 62 | 63 | Importing will now create the local parquet-based copies in the default directory given by `ebird_data_dir()`. 64 | Users can set an alternative location by setting the environmental variable `BIRDDB_HOME` to the desired path. 65 | 66 | ```{r} 67 | import_ebird(observations_tar) 68 | import_ebird(checklists_tar) 69 | ``` 70 | 71 | Once the data have been downloaded and imported successfully, a user can access the full eBird dataset quite quickly: 72 | 73 | ```{r} 74 | observations <- observations() 75 | checklists <- checklists() 76 | ``` 77 | 78 | To see the available columns in each dataset use: 79 | 80 | ```{r} 81 | colnames(observations) 82 | colnames(checklists) 83 | ``` 84 | 85 | Now, we can use `dplyr` to perform standard queries. For example, to see the number of observations for each species in the sample dataset: 86 | 87 | ```{r} 88 | observations %>% count(scientific_name, sort = TRUE) 89 | ``` 90 | 91 | 92 | ```{r include=FALSE} 93 | Sys.unsetenv("BIRDDB_HOME") 94 | ``` 95 | -------------------------------------------------------------------------------- /R/ebird_conn.R: -------------------------------------------------------------------------------- 1 | #' Set up a `DBI`-style database connection to the imported eBird data 2 | #' 3 | #' Parquet files can be accessed as though they were relational database tables 4 | #' by setting up a view to the file using DuckDB. 
#' Set up a `DBI`-style database connection to the imported eBird data
#'
#' Parquet files can be accessed as though they were relational database tables
#' by setting up a view to the file using DuckDB. This function sets up a view
#' on either the checklist or observation dataset and returns a [DBI]-style
#' database connection to the data. The returned object can then be queried
#' with SQL syntax via [DBI] or with [dplyr] syntax via [dbplyr]. For the latter
#' approach, consider using the [checklists()] and [observations()] functions
#' which will return [tbl] objects ready for access using [dplyr] syntax.
#'
#' @param dataset the type of dataset to set up a connection to, either the
#'   observations or checklists.
#' @param cache_connection should we preserve a cache of the connection? allows
#'   faster load times and prevents connection from being garbage-collected.
#' @param memory_limit the memory limit for DuckDB, in GB.
#'
#' @return A [DBI] connection object used to communicate with the DuckDB
#'   database containing the eBird data.
#' @export
#' @examples
#' # only use a tempdir for this example, don't copy this for real data
#' temp_dir <- file.path(tempdir(), "birddb")
#' Sys.setenv("BIRDDB_HOME" = temp_dir)
#'
#' # get the path to a sample dataset provided with the package
#' tar <- sample_observation_data()
#' # import the sample dataset to parquet
#' import_ebird(tar)
#' # set up the database connection
#' con <- ebird_conn(dataset = "observations")
#'
#' unlink(temp_dir, recursive = TRUE)
ebird_conn <- function(dataset = c("observations", "checklists"),
                       cache_connection = TRUE,
                       memory_limit = 16) {

  dataset <- match.arg(dataset)

  conn <- duckdb_connection(memory_limit = memory_limit,
                            cache_connection = cache_connection)

  # glob matching every parquet file of the requested dataset
  parquet <- paste0(file.path(ebird_data_dir(), dataset), "/*.parquet")

  # Create the view over the parquet files if it does not already exist.
  # dbExecute() (not dbSendQuery()) is the correct call for a DDL statement:
  # it runs the statement and clears the result, leaving no open result set.
  if (!dataset %in% DBI::dbListTables(conn)) {
    view_query <- paste0("CREATE VIEW '", dataset,
                         "' AS SELECT * FROM parquet_scan('",
                         parquet, "');")
    DBI::dbExecute(conn, view_query)
  }

  conn
}

# Create (or fetch from the package cache) a DuckDB connection with the
# standard birddb PRAGMA settings (memory limit, threads, temp directory).
# `dir` is the database location; defaults are evaluated lazily so arrow is
# only touched when a new connection is actually created.
duckdb_connection <- function(dir = ebird_db_dir(),
                              memory_limit = 16,
                              mc.cores = arrow::cpu_count(),
                              cache_connection = TRUE
                              ) {
  stopifnot(is.logical(cache_connection),
            length(cache_connection) == 1)
  stopifnot(is.numeric(memory_limit),
            length(memory_limit) == 1,
            !is.na(memory_limit),
            memory_limit > 0)

  # check for a cached connection
  conn <- mget("birddb", envir = birddb_cache, ifnotfound = NA)[["birddb"]]

  # Disconnect if it's an invalid connection (expired in cache)
  if (db_is_invalid(conn)) {
    conn <- DBI::dbDisconnect(conn, shutdown = TRUE)
  }

  # We don't have a valid cached connection, so we must create one
  if (!inherits(conn, "DBIConnection")) {
    conn <- DBI::dbConnect(drv = duckdb::duckdb(), dir)
  }

  ## PRAGMAs
  duckdb_mem_limit(conn, memory_limit, "GB")
  duckdb_parallel(conn, mc.cores)
  duckdb_set_tempdir(conn, tempdir())

  # (re)-cache the connection
  if (cache_connection) {
    assign("birddb", conn, envir = birddb_cache)
  }

  conn
}

# TRUE when `conn` is a DBI connection that is no longer valid (e.g. it
# expired while sitting in the cache); FALSE for anything else, including
# non-connection objects.
db_is_invalid <- function(conn) {
  inherits(conn, "DBIConnection") && !DBI::dbIsValid(conn)
}

# List the parquet files for a dataset, erroring when none have been
# imported yet. (Currently unused by ebird_conn(), which relies on a glob.)
ebird_parquet_files <- function(dataset = c("observations", "checklists")) {
  dataset <- match.arg(dataset)

  # list of all parquet files
  dir <- file.path(ebird_data_dir(), dataset)
  file <- list.files(dir, pattern = "[.]parquet",
                     full.names = TRUE, recursive = TRUE)

  # currently we're assuming no partitioning is being used hence 1 file
  # will need to modify later if partitioning is implemented
  if (length(file) == 0) {
    stop("No parquet files found in: ", dir)
  }

  return(file)
}

# environment to store the cached copy of the connection
birddb_cache <- new.env()

# Close a birddb connection and drop it from the cache.
local_db_disconnect <- function(db = ebird_conn()) {
  if (inherits(db, "DBIConnection")) {
    suppressWarnings({
      DBI::dbDisconnect(db, shutdown = TRUE)
    })
  }
  if (exists("birddb", envir = birddb_cache)) {
    suppressWarnings({
      rm("birddb", envir = birddb_cache)
    })
  }
}

# Finalizer to close the cached connection on exit. Note reg.finalizer()
# invokes its handler with the object being finalized -- here the cache
# *environment*, not a connection -- so we must look the connection up
# inside that environment before disconnecting. (Passing local_db_disconnect
# directly would receive the environment, fail the DBIConnection check, and
# never actually disconnect.)
reg.finalizer(birddb_cache, function(e) {
  conn <- mget("birddb", envir = e, ifnotfound = NA)[["birddb"]]
  if (inherits(conn, "DBIConnection")) {
    suppressWarnings({
      DBI::dbDisconnect(conn, shutdown = TRUE)
    })
  }
}, onexit = TRUE)
25 | 26 | ## Installation 27 | 28 | 36 | 37 | And the development version from [GitHub](https://github.com/) with: 38 | 39 | ``` r 40 | # install.packages("devtools") 41 | devtools::install_github("cboettig/birddb") 42 | ``` 43 | 44 | ## Getting Started 45 | 46 | ``` r 47 | library(birddb) 48 | library(dplyr) 49 | ``` 50 | 51 | Before you can use `birddb` you will need to download the latest version 52 | of the eBird Basic Dataset from . 53 | Once you have obtained a downloaded copy of the `tar` file, `birddb` can 54 | import it for you. The one-time import of the full data dump is a little 55 | slow (about 1 hr in my benchmark) due to the time required to extract 56 | the tar file and convert the text data into parquet format. 57 | 58 | For illustration and testing purposes, we will use the small eBird 59 | sample data, included in the package for convenience and testing 60 | purposes: 61 | 62 | ``` r 63 | observations_tar <- birddb::sample_observation_data() 64 | checklists_tar <- birddb::sample_checklist_data() 65 | ``` 66 | 67 | Importing will now create the local parquet-based copies in the default 68 | directory given by `ebird_data_dir()`. Users can set an alternative 69 | location by setting the environmental variable `BIRDDB_HOME` to the 70 | desired path. 71 | 72 | ``` r 73 | import_ebird(observations_tar) 74 | #> Importing observations data from the eBird Basic Dataset: ebd_relAug-2021.tar 75 | #> Extracting from tar archive... 76 | #> Importing to parquet... 77 | import_ebird(checklists_tar) 78 | #> Importing checklists data from the eBird Basic Dataset: ebd_sampling_relAug-2021.tar 79 | #> Extracting from tar archive... 80 | #> Importing to parquet... 
81 | ``` 82 | 83 | Once the data have been downloaded and imported successfully, a user can 84 | access the full eBird dataset quite quickly: 85 | 86 | ``` r 87 | observations <- observations() 88 | checklists <- checklists() 89 | ``` 90 | 91 | To see the available columns in each dataset use: 92 | 93 | ``` r 94 | colnames(observations) 95 | #> [1] "global_unique_identifier" "last_edited_date" 96 | #> [3] "taxonomic_order" "category" 97 | #> [5] "common_name" "scientific_name" 98 | #> [7] "subspecies_common_name" "subspecies_scientific_name" 99 | #> [9] "observation_count" "breeding_code" 100 | #> [11] "breeding_category" "behavior_code" 101 | #> [13] "age_sex" "country" 102 | #> [15] "country_code" "state" 103 | #> [17] "state_code" "county" 104 | #> [19] "county_code" "iba_code" 105 | #> [21] "bcr_code" "usfws_code" 106 | #> [23] "atlas_block" "locality" 107 | #> [25] "locality_id" "locality_type" 108 | #> [27] "latitude" "longitude" 109 | #> [29] "observation_date" "time_observations_started" 110 | #> [31] "observer_id" "sampling_event_identifier" 111 | #> [33] "protocol_type" "protocol_code" 112 | #> [35] "project_code" "duration_minutes" 113 | #> [37] "effort_distance_km" "effort_area_ha" 114 | #> [39] "number_observers" "all_species_reported" 115 | #> [41] "group_identifier" "has_media" 116 | #> [43] "approved" "reviewed" 117 | #> [45] "reason" "trip_comments" 118 | #> [47] "species_comments" 119 | colnames(checklists) 120 | #> [1] "last_edited_date" "country" 121 | #> [3] "country_code" "state" 122 | #> [5] "state_code" "county" 123 | #> [7] "county_code" "iba_code" 124 | #> [9] "bcr_code" "usfws_code" 125 | #> [11] "atlas_block" "locality" 126 | #> [13] "locality_id" "locality_type" 127 | #> [15] "latitude" "longitude" 128 | #> [17] "observation_date" "time_observations_started" 129 | #> [19] "observer_id" "sampling_event_identifier" 130 | #> [21] "protocol_type" "protocol_code" 131 | #> [23] "project_code" "duration_minutes" 132 | #> [25] 
"effort_distance_km" "effort_area_ha" 133 | #> [27] "number_observers" "all_species_reported" 134 | #> [29] "group_identifier" "trip_comments" 135 | ``` 136 | 137 | Now, we can use `dplyr` to perform standard queries. For example, to see 138 | the number of observations for each species in the sample dataset: 139 | 140 | ``` r 141 | observations %>% count(scientific_name, sort = TRUE) 142 | #> # Source: lazy query [?? x 2] 143 | #> # Database: duckdb_connection 144 | #> # Ordered by: desc(n) 145 | #> scientific_name n 146 | #> 147 | #> 1 Pycnonotus sinensis 275 148 | #> 2 Pycnonotus jocosus 270 149 | #> 3 Streptopelia chinensis 258 150 | #> 4 Milvus migrans 251 151 | #> 5 Copsychus saularis 228 152 | #> 6 Zosterops simplex 201 153 | #> 7 Acridotheres cristatellus 181 154 | #> 8 Passer montanus 174 155 | #> 9 Pterorhinus perspicillatus 172 156 | #> 10 Motacilla alba 172 157 | #> # … with more rows 158 | ``` 159 | -------------------------------------------------------------------------------- /R/import_ebird.R: -------------------------------------------------------------------------------- 1 | #' Import eBird data to parquet 2 | #' 3 | #' eBird data are released as tab-separated text files, packaged into tar 4 | #' archives. Given a path to an eBird tarfile, this function will extract and 5 | #' import the tar archive into a parquet-based database in your 6 | #' [ebird_data_dir()]. 7 | #' 8 | #' @param tarfile path to the tar archive file downloaded from the eBird 9 | #' website. Files containing either observation data (e.g. 10 | #' `ebd_rel.tar`) or checklist (e.g. `ebd_sampling_rel.tar`) data 11 | #' can be provided 12 | #' 13 | #' @details 14 | #' [eBird](https://ebird.org/home) data are collected and organized around the 15 | #' concept of a checklist, representing observations from a single birding 16 | #' event. 
#' Import eBird data to parquet
#'
#' eBird data are released as tab-separated text files, packaged into tar
#' archives. Given a path to an eBird tarfile, this function will extract and
#' import the tar archive into a parquet-based database in your
#' [ebird_data_dir()].
#'
#' @param tarfile path to the tar archive file downloaded from the eBird
#'   website. Files containing either observation data (e.g.
#'   `ebd_rel.tar`) or checklist (e.g. `ebd_sampling_rel.tar`) data
#'   can be provided
#'
#' @details
#' [eBird](https://ebird.org/home) data are collected and organized around the
#' concept of a checklist, representing observations from a single birding
#' event. Each checklist contains a list of species observed, counts of the
#' number of individuals seen of each species, the location and time of the
#' observations, and a measure of the effort expended while collecting these
#' data. The majority of the [eBird](https://ebird.org/home) database is
#' available for download in the form of the [eBird Basic Dataset
#' (EBD)](https://ebird.org/data/download), a set of two tab-separated text
#' files.
#'
#' The **checklist** dataset (referred to as the Sampling Event Data on the
#' eBird website) consists of one row for each eBird checklist and columns
#' contain checklist-level information such as location, date, and search
#' effort. The **observation** dataset consists of one row for each species
#' observed on each checklist and columns contain species-level information
#' such as number of individuals detected. This dataset also contains all
#' checklist-level variables, duplicated for each species on the same checklist.
#'
#' After [submitting a request for data
#' access](https://ebird.org/data/download), users can download either or both
#' of these datasets as tar archive files. `import_ebird()` takes the path to a
#' tar file as input and imports the text file contained within to a parquet
#' file, which will allow much easier access to the data. This function will
#' automatically detect whether you are importing a checklist or observation
#' dataset provided you **do not change the name of the downloaded file or
#' unarchive the tar file**. The parquet files will be stored in the directory
#' specified by [ebird_data_dir()], consult the help for that function to learn
#' how to modify the parquet directory.
#'
#' @return Invisibly return the path to the directory containing eBird parquet
#'   files.
#' @export
#' @examples
#' # only use a tempdir for this example, don't copy this for real data
#' temp_dir <- file.path(tempdir(), "birddb")
#' Sys.setenv("BIRDDB_HOME" = temp_dir)
#'
#' # get the path to a sample dataset provided with the package
#' tar <- sample_observation_data()
#' # import the sample dataset to parquet
#' import_ebird(tar)
#'
#' unlink(temp_dir, recursive = TRUE)
import_ebird <- function(tarfile) {
  # Classify the archive from its filename. Note the allow_subset = TRUE
  # check must be reachable for .zip Custom Download files so users get the
  # informative message below rather than a generic extension error.
  if (is_checklists(tarfile)) {
    dataset <- "checklists"
  } else if (is_observations(tarfile, allow_subset = FALSE)) {
    dataset <- "observations"
  } else if (is_observations(tarfile, allow_subset = TRUE)) {
    stop("It appears you downloaded a subset of eBird data using the ",
         "Custom Download form. birddb currently only supports importing ",
         "the full eBird Basic Dataset.")
  } else {
    stop("Non-standard eBird data filename provided: ", basename(tarfile))
  }
  dest <- file.path(ebird_data_dir(), dataset)

  # confirm overwrite
  if (dir.exists(dest)) {
    if (interactive()) {
      msg <- paste("eBird", dataset, "data already exists in",
                   ebird_data_dir(),
                   "would you like to overwrite this data?")
      overwrite <- utils::askYesNo(msg, default = NA)
      if (!isTRUE(overwrite)) {
        warning("Cancelling data import to avoid overwriting existing data.")
        return(invisible())
      }
    } else {
      message("Overwriting existing eBird ", dataset, " data.")
    }
  }

  message(sprintf("Importing %s data from the eBird Basic Dataset: %s",
                  dataset, basename(tarfile)))

  # extract the tarfile to a temp directory inside the data dir
  message("Extracting from tar archive...")
  source_dir <- file.path(ebird_data_dir(), "ebird_tmp")
  unlink(source_dir, recursive = TRUE)
  dir.create(source_dir, recursive = TRUE)
  utils::untar(tarfile = tarfile, exdir = source_dir)
  ebd <- list.files(source_dir, pattern = "ebd.*\\.txt\\.gz",
                    full.names = TRUE, recursive = TRUE)
  if (length(ebd) != 1 || !file.exists(ebd)) {
    stop("txt.gz file not successfully extracted from tarfile.")
  }

  # open tsv and set up data schema
  ds <- arrow_open_ebird_txt(ebd, dest)

  # stream to parquet
  message("Importing to parquet...")
  arrow::write_dataset(ds, dest, format = "parquet",
                       max_rows_per_file = 1000000L)

  # save metadata
  record_metadata(tarfile)

  unlink(source_dir, recursive = TRUE)
  invisible(dest)
}

# Open the extracted EBD tab-separated text file as an arrow dataset with an
# explicit schema and cleaned (lower-case, underscore-separated) column names.
# `dest` is currently unused; kept for interface stability with callers.
arrow_open_ebird_txt <- function(ebd, dest) {
  # first pass: open with inferred schema just to learn the column names
  ds <- arrow::open_dataset(ebd, format = "tsv")
  col_names <- names(ds)

  # map each column's logical type name to a concrete arrow type
  col_types <- ebird_col_type(col_names)
  expand_schema <- list(string = arrow::string(),
                        binary = arrow::binary(),
                        integer = arrow::int64(),
                        double = arrow::float64(),
                        timestamp = arrow::timestamp(unit = "us"),
                        date = arrow::date64())
  ebd_schema <- expand_schema[col_types]
  names(ebd_schema) <- col_names
  sch <- do.call(arrow::schema, ebd_schema)

  # based on the schema defined above open tsv file for streaming;
  # skip_rows = 1 skips the header now that the schema is explicit
  ds <- arrow::open_dataset(ebd, format = "tsv", schema = sch, skip_rows = 1)

  # clean up column names: lower case, "/" and " " become "_";
  # drop the empty trailing column name produced by EBD's trailing tab
  col_names <- names(ds)
  col_names <- col_names[col_names != ""]
  names(col_names) <- gsub("[/ ]", "_", tolower(col_names))
  ds <- dplyr::select(ds, dplyr::all_of(col_names))

  return(ds)
}

# Return a named character vector of logical type names ("string", "integer",
# "double", "timestamp", "date") for the given EBD column names; any column
# not explicitly listed is assumed to be character data.
ebird_col_type <- function(col_names) {
  # types for columns that are not character
  col_types <- c(`LAST EDITED DATE` = "timestamp",
                 `TAXONOMIC ORDER` = "integer",
                 `LATITUDE` = "double", `LONGITUDE` = "double",
                 `OBSERVATION DATE` = "date",
                 `DURATION MINUTES` = "integer",
                 `EFFORT DISTANCE KM` = "double",
                 `EFFORT AREA HA` = "double",
                 `NUMBER OBSERVERS` = "integer",
                 `ALL SPECIES REPORTED` = "integer",
                 `HAS MEDIA` = "integer",
                 `APPROVED` = "integer",
                 `REVIEWED` = "integer")
  # assume anything else is character
  col_types <- col_types[col_names]
  col_types[is.na(col_types)] <- "string"
  names(col_types) <- col_names
  return(col_types)
}

# Record provenance metadata (dataset type, release version, crc32 hash, file
# size, timestamp) for an imported tarfile to a csv inside ebird_data_dir().
record_metadata <- function(tarfile) {
  stopifnot(is.character(tarfile), length(tarfile) == 1, file.exists(tarfile))

  f <- basename(tarfile)
  if (is_checklists(f)) {
    dataset <- "checklists"
    subset <- NA_character_
  } else if (is_observations(f, allow_subset = FALSE)) {
    dataset <- "observations"
    subset <- NA_character_
    # todo: implement ability to import ebd subset, currently in a zip file
    # } else if (is_observations(f, allow_subset = TRUE)) {
    #   dataset <- "observations"
    #   subset <- sub("ebd_([-_A-Za-z0-9]+)_rel[A-Z]{1}[a-z]{2}-[0-9]{4}\\.tar",
    #                 "\\1", f)
  } else {
    stop("The provided tar filename does not appear to contain eBird data. ",
         "The expected format is, e.g., ebd_relJul-2021.tar.")
  }

  # parse the release month and year (e.g. "Aug-2021") from the filename
  rawdate <- sub("ebd[-_A-Za-z0-9]*_rel([A-Z]{1}[a-z]{2}-[0-9]{4})\\.tar",
                 "\\1", f)
  date <- strsplit(rawdate, "-")[[1]]
  date[1] <- match(date[1], month.abb)
  date <- paste(date[2], date[1], "1", sep = "-")
  date <- as.Date(date, format = "%Y-%m-%d")
  if (is.na(date)) {
    stop("Month and year could not be parsed from filename: ", rawdate)
  }
  version <- format(date, "%Y-%m")

  if (!is.na(subset)) {
    message("EBD subset detected for: ", subset)
  }

  # crc32 hash of the source tarfile
  hash <- digest::digest(tarfile, algo = "crc32", file = TRUE)

  # save to csv
  file_metadata <- data.frame(dataset = dataset,
                              version = version,
                              subset = subset,
                              source_file = tarfile,
                              file_size = file.size(tarfile),
                              hash_crc32 = as.character(hash),
                              timestamp = Sys.time())
  f_metadata <- file.path(ebird_data_dir(),
                          paste0(dataset, "-metadata.csv"))
  utils::write.csv(file_metadata, file = f_metadata, row.names = FALSE, na = "")

  invisible(file_metadata)
}

# Validate the archive extension of an eBird download. Full EBD downloads are
# .tar files; Custom Download subsets arrive as .zip. Both extensions are let
# through here (.zip simply fails the .tar filename patterns below) so that
# import_ebird() can reach its informative "Custom Download" message instead
# of stopping with a generic extension error. Anything else is rejected.
check_ebird_ext <- function(x) {
  if (!grepl("\\.(tar|zip)$", x)) {
    stop("The provided file does not appear to be a tar archive. The file ",
         "extension should be .tar.")
  }
  invisible(x)
}

# TRUE when the filename matches the checklist (Sampling Event Data) archive,
# e.g. ebd_sampling_relAug-2021.tar.
is_checklists <- function(x) {
  x <- basename(x)
  check_ebird_ext(x)
  grepl("ebd_sampling_rel[A-Z]{1}[a-z]{2}-[0-9]{4}\\.tar$", x)
}

# TRUE when the filename matches the full observation (EBD) archive, e.g.
# ebd_relAug-2021.tar. With allow_subset = TRUE, also TRUE for Custom
# Download subsets delivered as .zip, e.g. ebd_US-CA_relAug-2021.zip.
is_observations <- function(x, allow_subset = FALSE) {
  x <- basename(x)
  check_ebird_ext(x)
  is_obs <- grepl("ebd_rel[A-Z]{1}[a-z]{2}-[0-9]{4}\\.tar$", x)
  if (allow_subset) {
    is_ss <- grepl("ebd[-_A-Za-z0-9]*_rel[A-Z]{1}[a-z]{2}-[0-9]{4}\\.zip$", x)
    is_obs <- is_obs | is_ss
  }
  return(is_obs)
}



# https://ebird.org/data/download/ebd
# https://download.ebird.org/ebd/prepackaged/ebd_sampling_relOct-2021.tar
# https://download.ebird.org/ebd/prepackaged/ebd_relOct-2021.tar