├── .github ├── .gitignore └── workflows │ └── R-CMD-check.yaml ├── .gitignore ├── tests ├── testthat.R ├── spelling.R └── testthat │ └── test-ebird.R ├── .Rbuildignore ├── inst ├── extdata │ ├── ebd_relAug-2021.tar │ └── ebd_sampling_relAug-2021.tar └── WORDLIST ├── NAMESPACE ├── codecov.yml ├── birddb.Rproj ├── R ├── duckdb_pragmas.R ├── sample_data.R ├── ebird_data_dir.R ├── ebird.R ├── ebird_remote.R ├── ebird_conn.R └── import_ebird.R ├── man ├── sample_data.Rd ├── ebird_data_dir.Rd ├── ebird_remote.Rd ├── ebird_conn.Rd ├── ebird_tbl.Rd └── import_ebird.Rd ├── LICENSE ├── DESCRIPTION ├── README.Rmd └── README.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(birddb) 3 | 4 | test_check("birddb") 5 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^\.github$ 5 | ^codecov\.yml$ 6 | -------------------------------------------------------------------------------- /inst/extdata/ebd_relAug-2021.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cboettig/birddb/HEAD/inst/extdata/ebd_relAug-2021.tar -------------------------------------------------------------------------------- /inst/extdata/ebd_sampling_relAug-2021.tar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cboettig/birddb/HEAD/inst/extdata/ebd_sampling_relAug-2021.tar -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | CMD 2 | Codecov 3 | dir 4 | dplyr 5 | dbplyr 6 | duckdb 7 | DuckDB 8 | eBird 9 | ebd 10 | ebird 11 | http 12 | tarfile 13 | unarchive -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if(requireNamespace('spelling', quietly = TRUE)) 2 | spelling::spell_check_test(vignettes = TRUE, error = FALSE, 3 | skip_on_cran = TRUE) 4 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(checklists) 4 | export(ebird_conn) 5 | export(ebird_data_dir) 6 | export(ebird_remote) 7 | export(import_ebird) 8 | export(observations) 9 | export(sample_checklist_data) 10 | export(sample_observation_data) 11 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /birddb.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | 
RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageRoxygenize: rd,collate,namespace 19 | -------------------------------------------------------------------------------- /R/duckdb_pragmas.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | duckdb_mem_limit <- function(conn, memory_limit = 16, units = "GB"){ 4 | DBI::dbExecute(conn = conn, 5 | paste0("PRAGMA memory_limit='", memory_limit, units, "'")) 6 | } 7 | # set CPU parallel 8 | duckdb_parallel <- function(conn, mc.cores = options("mc.cores", 2L)){ 9 | DBI::dbExecute(conn, paste0("PRAGMA threads=", mc.cores)) 10 | } 11 | 12 | ## Used by in-memory connections when creating temporary tables 13 | duckdb_set_tempdir <- function(conn, temp = tempdir()){ 14 | DBI::dbExecute(conn, paste0("PRAGMA temp_directory='", temp, "'")) 15 | } 16 | -------------------------------------------------------------------------------- /man/sample_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sample_data.R 3 | \name{sample_data} 4 | \alias{sample_data} 5 | \alias{sample_checklist_data} 6 | \alias{sample_observation_data} 7 | \title{Provide path to a small subset of eBird data} 8 | \usage{ 9 | sample_checklist_data() 10 | 11 | sample_observation_data() 12 | } 13 | \value{ 14 | The path to the sample tar archive file. 15 | } 16 | \description{ 17 | These small sample dataset consists of all observations from Hong Kong in the 18 | year 2012. Sample files are provided for checklist and observation data, both 19 | packaged as tar archive files to mimic the format of the eBird Basic Dataset 20 | download. 
21 | } 22 | \examples{ 23 | sample_checklist_data() 24 | sample_observation_data() 25 | } 26 | -------------------------------------------------------------------------------- /R/sample_data.R: -------------------------------------------------------------------------------- 1 | #' Provide path to a small subset of eBird data 2 | #' 3 | #' These small sample dataset consists of all observations from Hong Kong in the 4 | #' year 2012. Sample files are provided for checklist and observation data, both 5 | #' packaged as tar archive files to mimic the format of the eBird Basic Dataset 6 | #' download. 7 | #' 8 | #' @name sample_data 9 | #' @return The path to the sample tar archive file. 10 | #' @examples 11 | #' sample_checklist_data() 12 | #' sample_observation_data() 13 | NULL 14 | 15 | #' @export 16 | #' @rdname sample_data 17 | sample_checklist_data <- function() { 18 | system.file("extdata", "ebd_sampling_relAug-2021.tar", package = "birddb", 19 | mustWork = TRUE) 20 | } 21 | 22 | #' @export 23 | #' @rdname sample_data 24 | sample_observation_data <- function() { 25 | system.file("extdata", "ebd_relAug-2021.tar", package = "birddb", 26 | mustWork = TRUE) 27 | } 28 | -------------------------------------------------------------------------------- /man/ebird_data_dir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebird_data_dir.R 3 | \name{ebird_data_dir} 4 | \alias{ebird_data_dir} 5 | \title{Retrieve directory used to store eBird data parquet files} 6 | \usage{ 7 | ebird_data_dir() 8 | } 9 | \description{ 10 | Show the location used by \code{birddb} to store eBird data parquet files. The 11 | default location is that chosen by R based on your OS, see 12 | \code{\link[tools:userdir]{tools::R_user_dir()}}. 
Alternately, users can configure a different permanent 13 | storage location by setting their desired path in the environmental variable 14 | \code{BIRDDB_HOME}. This may be desirable when multiple users of the same machine 15 | or server want to access a single copy of the eBird data. To set 16 | \code{BIRDDB_HOME}, add it to your \code{.Renviron} file, for example by using 17 | \code{usethis::edit_r_environ()}. 18 | } 19 | \examples{ 20 | ebird_data_dir() 21 | } 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Carl Boettiger 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: birddb 2 | Version: 0.1.0 3 | Title: Local Database Interface to eBird 4 | Description: Yet another package for working with 'eBird' data in R. This package 5 | is similar to 'auk' but provides a relational database interface. This allows 6 | users to query the data using familiar operations such as 'dplyr'. The 7 | high-performance backend is made possible by 'parquet' and 'duckdb'. 8 | Authors@R: c( 9 | person("Carl", "Boettiger", , "cboettig@gmail.com", c("aut", "cre"), 10 | comment = c(ORCID = "0000-0002-1642-628X")) 11 | ) 12 | License: MIT + file LICENSE 13 | Encoding: UTF-8 14 | ByteCompile: true 15 | Depends: R (>= 4.0) 16 | Imports: 17 | arrow (>= 7.0.0), 18 | duckdb (>= 0.2.9), 19 | DBI, 20 | dplyr, 21 | dbplyr, 22 | digest, 23 | utils 24 | Suggests: 25 | spelling, 26 | testthat (>= 3.0.0), 27 | covr, 28 | knitr, 29 | rmarkdown, 30 | progress 31 | URL: https://github.com/cboettig/birddb 32 | BugReports: https://github.com/cboettig/birddb 33 | Language: en-US 34 | Roxygen: list(markdown = TRUE) 35 | RoxygenNote: 7.1.2 36 | Config/testthat/edition: 3 37 | -------------------------------------------------------------------------------- /tests/testthat/test-ebird.R: -------------------------------------------------------------------------------- 1 | test_that("birddb works", { 2 | temp_dir <- file.path(tempdir(), "birddb") 3 | Sys.setenv("BIRDDB_HOME" = temp_dir) 4 | 5 | import_ebird(sample_observation_data()) 6 | import_ebird(sample_checklist_data()) 7 | 8 | # observations 9 | con <- ebird_conn("observations") 10 | observations <- observations(con) 11 | expect_s3_class(observations, "tbl") 12 | expect_s3_class(observations, "tbl_dbi") 13 | expect_equal(DBI::dbListTables(con), "observations") 14 | 15 | out <- observations %>% dplyr::count(common_name) %>% 
dplyr::collect() 16 | expect_s3_class(out, "data.frame") 17 | expect_gt(nrow(out), 0) 18 | 19 | # checklists 20 | con <- ebird_conn("checklists") 21 | checklists <- checklists(con) 22 | expect_s3_class(checklists, "tbl") 23 | expect_s3_class(checklists, "tbl_dbi") 24 | # ensure that both tables are in the same database 25 | expect_equal(sort(DBI::dbListTables(con)), 26 | c("checklists", "observations")) 27 | 28 | out <- checklists %>% dplyr::count(country) %>% dplyr::collect() 29 | expect_s3_class(out, "data.frame") 30 | expect_gt(nrow(out), 0) 31 | 32 | # cleanup 33 | DBI::dbDisconnect(con, shutdown = TRUE) 34 | unlink(temp_dir, recursive = TRUE) 35 | }) 36 | -------------------------------------------------------------------------------- /R/ebird_data_dir.R: -------------------------------------------------------------------------------- 1 | #' Retrieve directory used to store eBird data parquet files 2 | #' 3 | #' Show the location used by `birddb` to store eBird data parquet files. The 4 | #' default location is that chosen by R based on your OS, see 5 | #' [tools::R_user_dir()]. Alternately, users can configure a different permanent 6 | #' storage location by setting their desired path in the environmental variable 7 | #' `BIRDDB_HOME`. This may be desirable when multiple users of the same machine 8 | #' or server want to access a single copy of the eBird data. To set 9 | #' `BIRDDB_HOME`, add it to your `.Renviron` file, for example by using 10 | #' `usethis::edit_r_environ()`. 
11 | #' 12 | #' @export 13 | #' @examples 14 | #' ebird_data_dir() 15 | ebird_data_dir <- function() { 16 | Sys.getenv("BIRDDB_HOME", 17 | tools::R_user_dir("birddb", "data") 18 | ) 19 | } 20 | 21 | # a location for duckdb view files 22 | # very small, but should not be shared between users 23 | # currently defaults to storing in memory, making it ephemeral 24 | ebird_db_dir <- function() { 25 | path <- Sys.getenv("BIRDDB_DUCKDB", ":memory:") 26 | if (path == ":memory:") { 27 | return(path) 28 | } 29 | dir.create(path, recursive = TRUE, showWarnings = FALSE) 30 | file.path(path, "database") 31 | } 32 | -------------------------------------------------------------------------------- /man/ebird_remote.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebird_remote.R 3 | \name{ebird_remote} 4 | \alias{ebird_remote} 5 | \title{ebird remote} 6 | \usage{ 7 | ebird_remote( 8 | dataset = c("observations", "checklists"), 9 | version = "Oct-2021", 10 | bucket = "ebird", 11 | to_duckdb = FALSE, 12 | host = "minio.cirrus.carlboettiger.info", 13 | ... 14 | ) 15 | } 16 | \arguments{ 17 | \item{dataset}{name of dataset (table) to access.} 18 | 19 | \item{version}{eBird snapshot date} 20 | 21 | \item{bucket}{eBird bucket name (including region)} 22 | 23 | \item{to_duckdb}{Return a remote duckdb connection or arrow connection? 24 | Note that leaving as FALSE may be faster but is limited to the dplyr-style 25 | operations supported by \link{arrow} alone.} 26 | 27 | \item{host}{Remote S3-based host of eBird parquet data} 28 | 29 | \item{...}{additional parameters passed to the s3_bucket() (e.g. for remote 30 | access to independently hosted buckets)} 31 | } 32 | \description{ 33 | Connect to an eBird snapshot remote. Can be much faster than downloading 34 | for one-off use or when using the package from a server in the same region 35 | as the data. 
36 | } 37 | \examples{ 38 | \dontshow{if (interactive()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} 39 | \dontshow{\}) # examplesIf} 40 | } 41 | -------------------------------------------------------------------------------- /R/ebird.R: -------------------------------------------------------------------------------- 1 | #' Return a remote connection to a table in your local eBird database 2 | #' 3 | #' Parquet files setup with a view in a DuckDB database, as done by 4 | #' [ebird_conn()], can be queried with [dplyr] syntax. This function sets up 5 | #' [tbl_dbi] object, which are remote tables referencing either the checklist or 6 | #' observation dataset. These remote tables can then by queried with [dplyr] 7 | #' similarly to a [data.frame]. 8 | #' 9 | #' @param conn a connection to the local eBird database, see [ebird_conn()]. 10 | #' 11 | #' @details 12 | #' When working with a remote table in [dplyr], the primary different compared 13 | #' to working with a normal [data.frame] is that calls are evaluated lazily, 14 | #' generating SQL that is only sent to the database when you request the data. 15 | #' The [dplyr] functions [collect()] and [compute()] can be used to force 16 | #' evaluation. 17 | #' 18 | #' @return A [tbl_dbi] object referencing either the checklist or observation 19 | #' data in DuckDB. 
20 | #' @name ebird_tbl 21 | #' @examples 22 | #' # only use a tempdir for this example, don't copy this for real data 23 | #' temp_dir <- file.path(tempdir(), "birddb") 24 | #' Sys.setenv("BIRDDB_HOME" = temp_dir) 25 | #' 26 | #' # get the path to a sample dataset provided with the package 27 | #' tar <- sample_observation_data() 28 | #' # import the sample dataset to parquet 29 | #' import_ebird(tar) 30 | #' 31 | #' # set up the database connection to the observations data 32 | #' observations <- observations() 33 | #' # query the data, number of observations of each species 34 | #' dplyr::count(observations, common_name) 35 | #' 36 | #' unlink(temp_dir, recursive = TRUE) 37 | NULL 38 | 39 | #' @rdname ebird_tbl 40 | #' @export 41 | observations <- function(conn = ebird_conn("observations")) { 42 | dplyr::tbl(conn, "observations") 43 | } 44 | 45 | #' @rdname ebird_tbl 46 | #' @export 47 | checklists <- function(conn = ebird_conn("checklists")) { 48 | dplyr::tbl(conn, "checklists") 49 | } 50 | -------------------------------------------------------------------------------- /man/ebird_conn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebird_conn.R 3 | \name{ebird_conn} 4 | \alias{ebird_conn} 5 | \title{Set up a \code{DBI}-style database connection to the imported eBird data} 6 | \usage{ 7 | ebird_conn( 8 | dataset = c("observations", "checklists"), 9 | cache_connection = TRUE, 10 | memory_limit = 16 11 | ) 12 | } 13 | \arguments{ 14 | \item{dataset}{the type of dataset to set up a connection to, either the 15 | observations of checklists.} 16 | 17 | \item{cache_connection}{should we preserve a cache of the connection? 
allows 18 | faster load times and prevents connection from being garbage-collected.} 19 | 20 | \item{memory_limit}{the memory limit for DuckDB.} 21 | } 22 | \value{ 23 | A \link{DBI} connection object using to communicate with the DuckDB 24 | database containing the eBird data. 25 | } 26 | \description{ 27 | Parquet files can be accessed as though they were relational database tables 28 | by setting up a view to the file using DuckDB. This function sets up a view 29 | on either the checklist or observation dataset and returns a \link{DBI}-style 30 | database connection to the data. The returned object can then be queried 31 | with SQL syntax via \link{DBI} or with \link{dplyr} syntax via \link{dbplyr}. For the latter 32 | approach, consider using the \code{\link[=checklists]{checklists()}} and \code{\link[=observations]{observations()}} functions 33 | which will return \link{tbl} objects ready for access using \link{dplyr} syntax. 34 | } 35 | \examples{ 36 | # only use a tempdir for this example, don't copy this for real data 37 | temp_dir <- file.path(tempdir(), "birddb") 38 | Sys.setenv("BIRDDB_HOME" = temp_dir) 39 | 40 | # get the path to a sample dataset provided with the package 41 | tar <- sample_observation_data() 42 | # import the sample dataset to parquet 43 | import_ebird(tar) 44 | # set up the database connection 45 | con <- ebird_conn(dataset = "observations") 46 | 47 | unlink(temp_dir, recursive = TRUE) 48 | } 49 | -------------------------------------------------------------------------------- /R/ebird_remote.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' ebird remote 4 | #' 5 | #' Connect to an eBird snapshot remote. Can be much faster than downloading 6 | #' for one-off use or when using the package from a server in the same region 7 | #' as the data. 
8 | #' 9 | #' @param version eBird snapshot date 10 | #' @param bucket eBird bucket name (including region) 11 | #' @param to_duckdb Return a remote duckdb connection or arrow connection? 12 | #' Note that leaving as FALSE may be faster but is limited to the dplyr-style 13 | #' operations supported by [arrow] alone. 14 | #' @param dataset name of dataset (table) to access. 15 | #' @param host Remote S3-based host of eBird parquet data 16 | #' @param ... additional parameters passed to the s3_bucket() (e.g. for remote 17 | #' access to independently hosted buckets) 18 | #' @examplesIf interactive() 19 | #' @export 20 | #' 21 | ebird_remote <- 22 | function(dataset = c("observations", "checklists"), 23 | version = "Oct-2021", 24 | bucket = "ebird", 25 | to_duckdb = FALSE, 26 | host = "minio.cirrus.carlboettiger.info", 27 | ...) { 28 | dataset <- match.arg(dataset) 29 | 30 | ## Not ideal, but these will cause problems if set 31 | unset_aws_env() 32 | 33 | server <- arrow::s3_bucket(bucket, 34 | endpoint_override = host, 35 | ...) 36 | 37 | path <- server$path(file.path(version, dataset, fsep = "/")) 38 | df <- arrow::open_dataset(path) 39 | if (to_duckdb) { 40 | df <- arrow::to_duckdb(df) 41 | } 42 | df 43 | } 44 | 45 | 46 | 47 | unset_aws_env <- function() { 48 | ## Consider re-setting these afterwards. 49 | ## What about ~/.aws ? 50 | ## Maybe set these to empty strings instead of unsetting? 
51 | 52 | ## Would be nice if we could simply override the detection of these 53 | Sys.unsetenv("AWS_DEFAULT_REGION") 54 | Sys.unsetenv("AWS_S3_ENDPOINT") 55 | Sys.unsetenv("AWS_ACCESS_KEY_ID") 56 | Sys.unsetenv("AWS_SECRET_ACCESS_KEY") 57 | } 58 | -------------------------------------------------------------------------------- /man/ebird_tbl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebird.R 3 | \name{ebird_tbl} 4 | \alias{ebird_tbl} 5 | \alias{observations} 6 | \alias{checklists} 7 | \title{Return a remote connection to a table in your local eBird database} 8 | \usage{ 9 | observations(conn = ebird_conn("observations")) 10 | 11 | checklists(conn = ebird_conn("checklists")) 12 | } 13 | \arguments{ 14 | \item{conn}{a connection to the local eBird database, see \code{\link[=ebird_conn]{ebird_conn()}}.} 15 | } 16 | \value{ 17 | A \link{tbl_dbi} object referencing either the checklist or observation 18 | data in DuckDB. 19 | } 20 | \description{ 21 | Parquet files setup with a view in a DuckDB database, as done by 22 | \code{\link[=ebird_conn]{ebird_conn()}}, can be queried with \link{dplyr} syntax. This function sets up 23 | \link{tbl_dbi} object, which are remote tables referencing either the checklist or 24 | observation dataset. These remote tables can then by queried with \link{dplyr} 25 | similarly to a \link{data.frame}. 26 | } 27 | \details{ 28 | When working with a remote table in \link{dplyr}, the primary different compared 29 | to working with a normal \link{data.frame} is that calls are evaluated lazily, 30 | generating SQL that is only sent to the database when you request the data. 31 | The \link{dplyr} functions \code{\link[=collect]{collect()}} and \code{\link[=compute]{compute()}} can be used to force 32 | evaluation. 
33 | } 34 | \examples{ 35 | # only use a tempdir for this example, don't copy this for real data 36 | temp_dir <- file.path(tempdir(), "birddb") 37 | Sys.setenv("BIRDDB_HOME" = temp_dir) 38 | 39 | # get the path to a sample dataset provided with the package 40 | tar <- sample_observation_data() 41 | # import the sample dataset to parquet 42 | import_ebird(tar) 43 | 44 | # set up the database connection to the observations data 45 | observations <- observations() 46 | # query the data, number of observations of each species 47 | dplyr::count(observations, common_name) 48 | 49 | unlink(temp_dir, recursive = TRUE) 50 | } 51 | -------------------------------------------------------------------------------- /man/import_ebird.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/import_ebird.R 3 | \name{import_ebird} 4 | \alias{import_ebird} 5 | \title{Import eBird data to parquet} 6 | \usage{ 7 | import_ebird(tarfile) 8 | } 9 | \arguments{ 10 | \item{tarfile}{path to the tar archive file downloaded from the eBird 11 | website. Files containing either observation data (e.g. 12 | \verb{ebd_rel.tar}) or checklist (e.g. \verb{ebd_sampling_rel.tar}) data 13 | can be provided} 14 | } 15 | \value{ 16 | Invisibly return the path to the directory containing eBird parquet 17 | files. 18 | } 19 | \description{ 20 | eBird data are released as tab-separated text files, packaged into tar 21 | archives. Given a path to an eBird tarfile, this function will extract and 22 | import the tar archive into a parquet-based database in your 23 | \code{\link[=ebird_data_dir]{ebird_data_dir()}}. 24 | } 25 | \details{ 26 | \href{https://ebird.org/home}{eBird} data are collected and organized around the 27 | concept of a checklist, representing observations from a single birding 28 | event. 
Each checklist contains a list of species observed, counts of the 29 | number of individuals seen of each species, the location and time of the 30 | observations, and a measure of the effort expended while collecting these 31 | data. The majority of the \href{https://ebird.org/home}{eBird} database is 32 | available for download in the form of the \href{https://ebird.org/data/download}{eBird Basic Dataset (EBD)}, a set of two tab-separated text 33 | files. 34 | 35 | The \strong{checklist} dataset (referred to as the Sampling Event Data on the 36 | eBird website) consists of one row for each eBird checklist and columns 37 | contain checklist-level information such as location, date, and search 38 | effort. The \strong{observation} dataset consists of one row for each species 39 | observed on each checklist and columns contain checklist-level information 40 | such as number of individuals detected. This dataset also contains all 41 | checklist-level variables, duplicated for each species on the same checklist. 42 | 43 | After \href{https://ebird.org/data/download}{submitting a request for data access}, users can download either or both 44 | of these datasets as tar archive files. \code{import_ebird()} takes the path to a 45 | tar file as input and imports the text file contained within to a parquet 46 | file, which will allow much easier access to the data. This function will 47 | automatically detect whether you are importing a checklist or observation 48 | dataset provided you \strong{do not change the name of the downloaded file or 49 | unarchive the tar file}. The parquet files will be stored in the directory 50 | specified by \code{\link[=ebird_data_dir]{ebird_data_dir()}}, consult the help for that function to learn 51 | how to modify the parquet directory. 
52 | } 53 | \examples{ 54 | # only use a tempdir for this example, don't copy this for real data 55 | temp_dir <- file.path(tempdir(), "birddb") 56 | Sys.setenv("BIRDDB_HOME" = temp_dir) 57 | 58 | # get the path to a sample dataset provided with the package 59 | tar <- sample_observation_data() 60 | # import the sample dataset to parquet 61 | import_ebird(tar) 62 | 63 | unlink(temp_dir, recursive = TRUE) 64 | } 65 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. 2 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | pull_request: 9 | branches: 10 | - main 11 | - master 12 | 13 | name: R-CMD-check 14 | 15 | jobs: 16 | R-CMD-check: 17 | runs-on: ${{ matrix.config.os }} 18 | 19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | config: 25 | - {os: windows-latest, r: 'release'} 26 | - {os: macOS-latest, r: 'release'} 27 | - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} 28 | - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest", http-user-agent: "R/4.1.0 (ubuntu-20.04) R (4.1.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" } 29 | 30 | env: 31 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 32 | RSPM: ${{ matrix.config.rspm }} 33 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 34 | 35 | steps: 36 | - uses: actions/checkout@v2 37 | 38 | - uses: r-lib/actions/setup-r@v1 39 | with: 40 | r-version: ${{ matrix.config.r }} 41 | 42 | - uses: r-lib/actions/setup-pandoc@v1 43 | 44 | - name: Query dependencies 45 | run: | 46 | 
install.packages('remotes') 47 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 48 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 49 | shell: Rscript {0} 50 | 51 | - name: Restore R package cache 52 | uses: actions/cache@v2 53 | with: 54 | path: ${{ env.R_LIBS_USER }} 55 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 56 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 57 | 58 | - name: Install system dependencies 59 | if: runner.os == 'Linux' 60 | run: | 61 | while read -r cmd 62 | do 63 | eval sudo $cmd 64 | done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') 65 | 66 | - name: Install dependencies 67 | run: | 68 | remotes::install_deps(dependencies = TRUE) 69 | remotes::install_cran("rcmdcheck") 70 | install.packages("https://github.com/duckdb/duckdb/releases/download/master-builds/duckdb_r_src.tar.gz", repos = NULL) 71 | shell: Rscript {0} 72 | 73 | - name: Check 74 | env: 75 | _R_CHECK_CRAN_INCOMING_REMOTE_: false 76 | run: | 77 | options(crayon.enabled = TRUE) 78 | rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") 79 | shell: Rscript {0} 80 | 81 | - name: Upload check results 82 | if: failure() 83 | uses: actions/upload-artifact@main 84 | with: 85 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 86 | path: check 87 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | editor_options: 4 | chunk_output_type: console 5 | --- 6 | 7 | 8 | 9 | ```{r, include = FALSE} 10 | knitr::opts_chunk$set( 11 | collapse = TRUE, 12 | comment = "#>", 13 | fig.path = "man/figures/README-", 14 | out.width = "100%" 15 | ) 16 | 17 | Sys.setenv("BIRDDB_HOME" = 
tempdir()) 18 | ``` 19 | 20 | # birddb 21 | 22 | 23 | [![R-CMD-check](https://github.com/cboettig/birddb/workflows/R-CMD-check/badge.svg)](https://github.com/cboettig/birddb/actions) 24 | [![Codecov test coverage](https://codecov.io/gh/cboettig/birddb/branch/main/graph/badge.svg)](https://codecov.io/gh/cboettig/birddb?branch=main) 25 | [![CRAN status](https://www.r-pkg.org/badges/version/birddb)](https://CRAN.R-project.org/package=birddb) 26 | 27 | 28 | The goal of `birddb` is to provide a relational database interface to a local copy of eBird. `birddb` works by importing the text-based ebird file into a local parquet file using [arrow](https://cran.r-project.org/package=arrow), which can be queried as a relational database using the familiar `dplyr` interface. 29 | `dplyr` translates R-based queries into SQL commands which are past to [`duckdb`](https://duckdb.org), which then queries the parquet database. Unlike the native `arrow` interface, `duckdb` supports the full set of SQL instructions, including windowed operations like `group_by`+`summarise` as well as table joins. 30 | 31 | 32 | ## Installation 33 | 34 | You can install the released version of `birddb` from [CRAN](https://CRAN.R-project.org) with: 35 | 36 | ``` r 37 | install.packages("birddb") 38 | ``` 39 | 40 | And the development version from [GitHub](https://github.com/) with: 41 | 42 | ``` r 43 | # install.packages("devtools") 44 | devtools::install_github("cboettig/birddb") 45 | ``` 46 | ## Getting Started 47 | 48 | ```{r message=FALSE} 49 | library(birddb) 50 | library(dplyr) 51 | ``` 52 | 53 | Before you can use `birddb` you will need to download the latest version of the eBird Basic Dataset from http://ebird.org/ebird/data/download. 54 | Once you have obtained a downloaded copy of the `tar` file, `birddb` can import it for you. 
The one-time import of the full data dump is a little slow (about 1 hr in my benchmark) due to the time required to extract the tar file and convert the text data into parquet format. 55 | 56 | For illustration and testing purposes, we will use the small eBird sample data, included in the package for convenience and testing purposes: 57 | 58 | ```{r} 59 | observations_tar <- birddb::sample_observation_data() 60 | checklists_tar <- birddb::sample_checklist_data() 61 | ``` 62 | 63 | Importing will now create the local parquet-based copies in the default directory given by `ebird_data_dir()`. 64 | Users can set an alternative location by setting the environmental variable `BIRDDB_HOME` to the desired path. 65 | 66 | ```{r} 67 | import_ebird(observations_tar) 68 | import_ebird(checklists_tar) 69 | ``` 70 | 71 | Once the data have been downloaded and imported successfully, a user can access the full eBird dataset quite quickly: 72 | 73 | ```{r} 74 | observations <- observations() 75 | checklists <- checklists() 76 | ``` 77 | 78 | To see the available columns in each dataset use: 79 | 80 | ```{r} 81 | colnames(observations) 82 | colnames(checklists) 83 | ``` 84 | 85 | Now, we can use `dplyr` to perform standard queries. For example, to see the number of observations for each species in the sample dataset: 86 | 87 | ```{r} 88 | observations %>% count(scientific_name, sort = TRUE) 89 | ``` 90 | 91 | 92 | ```{r include=FALSE} 93 | Sys.unsetenv("BIRDDB_HOME") 94 | ``` 95 | -------------------------------------------------------------------------------- /R/ebird_conn.R: -------------------------------------------------------------------------------- 1 | #' Set up a `DBI`-style database connection to the imported eBird data 2 | #' 3 | #' Parquet files can be accessed as though they were relational database tables 4 | #' by setting up a view to the file using DuckDB. 
#' Set up a `DBI`-style database connection to the imported eBird data
#'
#' Parquet files can be accessed as though they were relational database tables
#' by setting up a view to the file using DuckDB. This function sets up a view
#' on either the checklist or observation dataset and returns a [DBI]-style
#' database connection to the data. The returned object can then be queried
#' with SQL syntax via [DBI] or with [dplyr] syntax via [dbplyr]. For the latter
#' approach, consider using the [checklists()] and [observations()] functions
#' which will return [tbl] objects ready for access using [dplyr] syntax.
#'
#' @param dataset the type of dataset to set up a connection to, either the
#'   observations or checklists.
#' @param cache_connection should we preserve a cache of the connection? allows
#'   faster load times and prevents connection from being garbage-collected.
#' @param memory_limit the memory limit for DuckDB, in GB.
#'
#' @return A [DBI] connection object used to communicate with the DuckDB
#'   database containing the eBird data.
#' @export
#' @examples
#' # only use a tempdir for this example, don't copy this for real data
#' temp_dir <- file.path(tempdir(), "birddb")
#' Sys.setenv("BIRDDB_HOME" = temp_dir)
#'
#' # get the path to a sample dataset provided with the package
#' tar <- sample_observation_data()
#' # import the sample dataset to parquet
#' import_ebird(tar)
#' # set up the database connection
#' con <- ebird_conn(dataset = "observations")
#'
#' unlink(temp_dir, recursive = TRUE)
ebird_conn <- function(dataset = c("observations", "checklists"),
                       cache_connection = TRUE,
                       memory_limit = 16) {

  dataset <- match.arg(dataset)

  conn <- duckdb_connection(memory_limit = memory_limit,
                            cache_connection = cache_connection)

  # glob matching every parquet file of the requested dataset
  parquet <- paste0(file.path(ebird_data_dir(), dataset), "/*.parquet")

  # Create the view over the parquet files if it does not already exist.
  # dbExecute() (not dbSendQuery()) is the correct call for a DDL statement:
  # it runs the statement and clears the result, leaving no open result set.
  if (!dataset %in% DBI::dbListTables(conn)) {
    view_query <- paste0("CREATE VIEW '", dataset,
                         "' AS SELECT * FROM parquet_scan('",
                         parquet, "');")
    DBI::dbExecute(conn, view_query)
  }

  conn
}

# Create (or fetch from the package cache) a DuckDB connection with the
# standard birddb PRAGMA settings (memory limit, threads, temp directory).
# `dir` is the database location; defaults are evaluated lazily so arrow is
# only touched when a new connection is actually created.
duckdb_connection <- function(dir = ebird_db_dir(),
                              memory_limit = 16,
                              mc.cores = arrow::cpu_count(),
                              cache_connection = TRUE
                              ) {
  stopifnot(is.logical(cache_connection),
            length(cache_connection) == 1)
  stopifnot(is.numeric(memory_limit),
            length(memory_limit) == 1,
            !is.na(memory_limit),
            memory_limit > 0)

  # check for a cached connection
  conn <- mget("birddb", envir = birddb_cache, ifnotfound = NA)[["birddb"]]

  # Disconnect if it's an invalid connection (expired in cache)
  if (db_is_invalid(conn)) {
    conn <- DBI::dbDisconnect(conn, shutdown = TRUE)
  }

  # We don't have a valid cached connection, so we must create one
  if (!inherits(conn, "DBIConnection")) {
    conn <- DBI::dbConnect(drv = duckdb::duckdb(), dir)
  }

  ## PRAGMAs
  duckdb_mem_limit(conn, memory_limit, "GB")
  duckdb_parallel(conn, mc.cores)
  duckdb_set_tempdir(conn, tempdir())

  # (re)-cache the connection
  if (cache_connection) {
    assign("birddb", conn, envir = birddb_cache)
  }

  conn
}

# TRUE when `conn` is a DBI connection that is no longer valid (e.g. it
# expired while sitting in the cache); FALSE for anything else, including
# non-connection objects.
db_is_invalid <- function(conn) {
  inherits(conn, "DBIConnection") && !DBI::dbIsValid(conn)
}

# List the parquet files for a dataset, erroring when none have been
# imported yet. (Currently unused by ebird_conn(), which relies on a glob.)
ebird_parquet_files <- function(dataset = c("observations", "checklists")) {
  dataset <- match.arg(dataset)

  # list of all parquet files
  dir <- file.path(ebird_data_dir(), dataset)
  file <- list.files(dir, pattern = "[.]parquet",
                     full.names = TRUE, recursive = TRUE)

  # currently we're assuming no partitioning is being used hence 1 file
  # will need to modify later if partitioning is implemented
  if (length(file) == 0) {
    stop("No parquet files found in: ", dir)
  }

  return(file)
}

# environment to store the cached copy of the connection
birddb_cache <- new.env()

# Close a birddb connection and drop it from the cache.
local_db_disconnect <- function(db = ebird_conn()) {
  if (inherits(db, "DBIConnection")) {
    suppressWarnings({
      DBI::dbDisconnect(db, shutdown = TRUE)
    })
  }
  if (exists("birddb", envir = birddb_cache)) {
    suppressWarnings({
      rm("birddb", envir = birddb_cache)
    })
  }
}

# Finalizer to close the cached connection on exit. Note reg.finalizer()
# invokes its handler with the object being finalized -- here the cache
# *environment*, not a connection -- so we must look the connection up
# inside that environment before disconnecting. (Passing local_db_disconnect
# directly would receive the environment, fail the DBIConnection check, and
# never actually disconnect.)
reg.finalizer(birddb_cache, function(e) {
  conn <- mget("birddb", envir = e, ifnotfound = NA)[["birddb"]]
  if (inherits(conn, "DBIConnection")) {
    suppressWarnings({
      DBI::dbDisconnect(conn, shutdown = TRUE)
    })
  }
}, onexit = TRUE)
25 | 26 | ## Installation 27 | 28 | 36 | 37 | And the development version from [GitHub](https://github.com/) with: 38 | 39 | ``` r 40 | # install.packages("devtools") 41 | devtools::install_github("cboettig/birddb") 42 | ``` 43 | 44 | ## Getting Started 45 | 46 | ``` r 47 | library(birddb) 48 | library(dplyr) 49 | ``` 50 | 51 | Before you can use `birddb` you will need to download the latest version 52 | of the eBird Basic Dataset from . 53 | Once you have obtained a downloaded copy of the `tar` file, `birddb` can 54 | import it for you. The one-time import of the full data dump is a little 55 | slow (about 1 hr in my benchmark) due to the time required to extract 56 | the tar file and convert the text data into parquet format. 57 | 58 | For illustration and testing purposes, we will use the small eBird 59 | sample data, included in the package for convenience and testing 60 | purposes: 61 | 62 | ``` r 63 | observations_tar <- birddb::sample_observation_data() 64 | checklists_tar <- birddb::sample_checklist_data() 65 | ``` 66 | 67 | Importing will now create the local parquet-based copies in the default 68 | directory given by `ebird_data_dir()`. Users can set an alternative 69 | location by setting the environmental variable `BIRDDB_HOME` to the 70 | desired path. 71 | 72 | ``` r 73 | import_ebird(observations_tar) 74 | #> Importing observations data from the eBird Basic Dataset: ebd_relAug-2021.tar 75 | #> Extracting from tar archive... 76 | #> Importing to parquet... 77 | import_ebird(checklists_tar) 78 | #> Importing checklists data from the eBird Basic Dataset: ebd_sampling_relAug-2021.tar 79 | #> Extracting from tar archive... 80 | #> Importing to parquet... 
81 | ``` 82 | 83 | Once the data have been downloaded and imported successfully, a user can 84 | access the full eBird dataset quite quickly: 85 | 86 | ``` r 87 | observations <- observations() 88 | checklists <- checklists() 89 | ``` 90 | 91 | To see the available columns in each dataset use: 92 | 93 | ``` r 94 | colnames(observations) 95 | #> [1] "global_unique_identifier" "last_edited_date" 96 | #> [3] "taxonomic_order" "category" 97 | #> [5] "common_name" "scientific_name" 98 | #> [7] "subspecies_common_name" "subspecies_scientific_name" 99 | #> [9] "observation_count" "breeding_code" 100 | #> [11] "breeding_category" "behavior_code" 101 | #> [13] "age_sex" "country" 102 | #> [15] "country_code" "state" 103 | #> [17] "state_code" "county" 104 | #> [19] "county_code" "iba_code" 105 | #> [21] "bcr_code" "usfws_code" 106 | #> [23] "atlas_block" "locality" 107 | #> [25] "locality_id" "locality_type" 108 | #> [27] "latitude" "longitude" 109 | #> [29] "observation_date" "time_observations_started" 110 | #> [31] "observer_id" "sampling_event_identifier" 111 | #> [33] "protocol_type" "protocol_code" 112 | #> [35] "project_code" "duration_minutes" 113 | #> [37] "effort_distance_km" "effort_area_ha" 114 | #> [39] "number_observers" "all_species_reported" 115 | #> [41] "group_identifier" "has_media" 116 | #> [43] "approved" "reviewed" 117 | #> [45] "reason" "trip_comments" 118 | #> [47] "species_comments" 119 | colnames(checklists) 120 | #> [1] "last_edited_date" "country" 121 | #> [3] "country_code" "state" 122 | #> [5] "state_code" "county" 123 | #> [7] "county_code" "iba_code" 124 | #> [9] "bcr_code" "usfws_code" 125 | #> [11] "atlas_block" "locality" 126 | #> [13] "locality_id" "locality_type" 127 | #> [15] "latitude" "longitude" 128 | #> [17] "observation_date" "time_observations_started" 129 | #> [19] "observer_id" "sampling_event_identifier" 130 | #> [21] "protocol_type" "protocol_code" 131 | #> [23] "project_code" "duration_minutes" 132 | #> [25] 
"effort_distance_km" "effort_area_ha" 133 | #> [27] "number_observers" "all_species_reported" 134 | #> [29] "group_identifier" "trip_comments" 135 | ``` 136 | 137 | Now, we can use `dplyr` to perform standard queries. For example, to see 138 | the number of observations for each species in the sample dataset: 139 | 140 | ``` r 141 | observations %>% count(scientific_name, sort = TRUE) 142 | #> # Source: lazy query [?? x 2] 143 | #> # Database: duckdb_connection 144 | #> # Ordered by: desc(n) 145 | #> scientific_name n 146 | #> 147 | #> 1 Pycnonotus sinensis 275 148 | #> 2 Pycnonotus jocosus 270 149 | #> 3 Streptopelia chinensis 258 150 | #> 4 Milvus migrans 251 151 | #> 5 Copsychus saularis 228 152 | #> 6 Zosterops simplex 201 153 | #> 7 Acridotheres cristatellus 181 154 | #> 8 Passer montanus 174 155 | #> 9 Pterorhinus perspicillatus 172 156 | #> 10 Motacilla alba 172 157 | #> # … with more rows 158 | ``` 159 | -------------------------------------------------------------------------------- /R/import_ebird.R: -------------------------------------------------------------------------------- 1 | #' Import eBird data to parquet 2 | #' 3 | #' eBird data are released as tab-separated text files, packaged into tar 4 | #' archives. Given a path to an eBird tarfile, this function will extract and 5 | #' import the tar archive into a parquet-based database in your 6 | #' [ebird_data_dir()]. 7 | #' 8 | #' @param tarfile path to the tar archive file downloaded from the eBird 9 | #' website. Files containing either observation data (e.g. 10 | #' `ebd_rel.tar`) or checklist (e.g. `ebd_sampling_rel.tar`) data 11 | #' can be provided 12 | #' 13 | #' @details 14 | #' [eBird](https://ebird.org/home) data are collected and organized around the 15 | #' concept of a checklist, representing observations from a single birding 16 | #' event. 
#' Import eBird data to parquet
#'
#' eBird data are released as tab-separated text files, packaged into tar
#' archives. Given a path to an eBird tarfile, this function will extract and
#' import the tar archive into a parquet-based database in your
#' [ebird_data_dir()].
#'
#' @param tarfile path to the tar archive file downloaded from the eBird
#'   website. Files containing either observation data (e.g.
#'   `ebd_rel.tar`) or checklist (e.g. `ebd_sampling_rel.tar`) data
#'   can be provided
#'
#' @details
#' [eBird](https://ebird.org/home) data are collected and organized around the
#' concept of a checklist, representing observations from a single birding
#' event. Each checklist contains a list of species observed, counts of the
#' number of individuals seen of each species, the location and time of the
#' observations, and a measure of the effort expended while collecting these
#' data. The majority of the [eBird](https://ebird.org/home) database is
#' available for download in the form of the [eBird Basic Dataset
#' (EBD)](https://ebird.org/data/download), a set of two tab-separated text
#' files.
#'
#' The **checklist** dataset (referred to as the Sampling Event Data on the
#' eBird website) consists of one row for each eBird checklist and columns
#' contain checklist-level information such as location, date, and search
#' effort. The **observation** dataset consists of one row for each species
#' observed on each checklist and columns contain species-level information
#' such as number of individuals detected. This dataset also contains all
#' checklist-level variables, duplicated for each species on the same checklist.
#'
#' After [submitting a request for data
#' access](https://ebird.org/data/download), users can download either or both
#' of these datasets as tar archive files. `import_ebird()` takes the path to a
#' tar file as input and imports the text file contained within to a parquet
#' file, which will allow much easier access to the data. This function will
#' automatically detect whether you are importing a checklist or observation
#' dataset provided you **do not change the name of the downloaded file or
#' unarchive the tar file**. The parquet files will be stored in the directory
#' specified by [ebird_data_dir()], consult the help for that function to learn
#' how to modify the parquet directory.
#'
#' @return Invisibly return the path to the directory containing eBird parquet
#'   files.
#' @export
#' @examples
#' # only use a tempdir for this example, don't copy this for real data
#' temp_dir <- file.path(tempdir(), "birddb")
#' Sys.setenv("BIRDDB_HOME" = temp_dir)
#'
#' # get the path to a sample dataset provided with the package
#' tar <- sample_observation_data()
#' # import the sample dataset to parquet
#' import_ebird(tar)
#'
#' unlink(temp_dir, recursive = TRUE)
import_ebird <- function(tarfile) {
  # Classify the archive from its filename. Note the allow_subset = TRUE
  # check must be reachable for .zip Custom Download files so users get the
  # informative message below rather than a generic extension error.
  if (is_checklists(tarfile)) {
    dataset <- "checklists"
  } else if (is_observations(tarfile, allow_subset = FALSE)) {
    dataset <- "observations"
  } else if (is_observations(tarfile, allow_subset = TRUE)) {
    stop("It appears you downloaded a subset of eBird data using the ",
         "Custom Download form. birddb currently only supports importing ",
         "the full eBird Basic Dataset.")
  } else {
    stop("Non-standard eBird data filename provided: ", basename(tarfile))
  }
  dest <- file.path(ebird_data_dir(), dataset)

  # confirm overwrite
  if (dir.exists(dest)) {
    if (interactive()) {
      msg <- paste("eBird", dataset, "data already exists in",
                   ebird_data_dir(),
                   "would you like to overwrite this data?")
      overwrite <- utils::askYesNo(msg, default = NA)
      if (!isTRUE(overwrite)) {
        warning("Cancelling data import to avoid overwriting existing data.")
        return(invisible())
      }
    } else {
      message("Overwriting existing eBird ", dataset, " data.")
    }
  }

  message(sprintf("Importing %s data from the eBird Basic Dataset: %s",
                  dataset, basename(tarfile)))

  # extract the tarfile to a temp directory inside the data dir
  message("Extracting from tar archive...")
  source_dir <- file.path(ebird_data_dir(), "ebird_tmp")
  unlink(source_dir, recursive = TRUE)
  dir.create(source_dir, recursive = TRUE)
  utils::untar(tarfile = tarfile, exdir = source_dir)
  ebd <- list.files(source_dir, pattern = "ebd.*\\.txt\\.gz",
                    full.names = TRUE, recursive = TRUE)
  if (length(ebd) != 1 || !file.exists(ebd)) {
    stop("txt.gz file not successfully extracted from tarfile.")
  }

  # open tsv and set up data schema
  ds <- arrow_open_ebird_txt(ebd, dest)

  # stream to parquet
  message("Importing to parquet...")
  arrow::write_dataset(ds, dest, format = "parquet",
                       max_rows_per_file = 1000000L)

  # save metadata
  record_metadata(tarfile)

  unlink(source_dir, recursive = TRUE)
  invisible(dest)
}

# Open the extracted EBD tab-separated text file as an arrow dataset with an
# explicit schema and cleaned (lower-case, underscore-separated) column names.
# `dest` is currently unused; kept for interface stability with callers.
arrow_open_ebird_txt <- function(ebd, dest) {
  # first pass: open with inferred schema just to learn the column names
  ds <- arrow::open_dataset(ebd, format = "tsv")
  col_names <- names(ds)

  # map each column's logical type name to a concrete arrow type
  col_types <- ebird_col_type(col_names)
  expand_schema <- list(string = arrow::string(),
                        binary = arrow::binary(),
                        integer = arrow::int64(),
                        double = arrow::float64(),
                        timestamp = arrow::timestamp(unit = "us"),
                        date = arrow::date64())
  ebd_schema <- expand_schema[col_types]
  names(ebd_schema) <- col_names
  sch <- do.call(arrow::schema, ebd_schema)

  # based on the schema defined above open tsv file for streaming;
  # skip_rows = 1 skips the header now that the schema is explicit
  ds <- arrow::open_dataset(ebd, format = "tsv", schema = sch, skip_rows = 1)

  # clean up column names: lower case, "/" and " " become "_";
  # drop the empty trailing column name produced by EBD's trailing tab
  col_names <- names(ds)
  col_names <- col_names[col_names != ""]
  names(col_names) <- gsub("[/ ]", "_", tolower(col_names))
  ds <- dplyr::select(ds, dplyr::all_of(col_names))

  return(ds)
}

# Return a named character vector of logical type names ("string", "integer",
# "double", "timestamp", "date") for the given EBD column names; any column
# not explicitly listed is assumed to be character data.
ebird_col_type <- function(col_names) {
  # types for columns that are not character
  col_types <- c(`LAST EDITED DATE` = "timestamp",
                 `TAXONOMIC ORDER` = "integer",
                 `LATITUDE` = "double", `LONGITUDE` = "double",
                 `OBSERVATION DATE` = "date",
                 `DURATION MINUTES` = "integer",
                 `EFFORT DISTANCE KM` = "double",
                 `EFFORT AREA HA` = "double",
                 `NUMBER OBSERVERS` = "integer",
                 `ALL SPECIES REPORTED` = "integer",
                 `HAS MEDIA` = "integer",
                 `APPROVED` = "integer",
                 `REVIEWED` = "integer")
  # assume anything else is character
  col_types <- col_types[col_names]
  col_types[is.na(col_types)] <- "string"
  names(col_types) <- col_names
  return(col_types)
}

# Record provenance metadata (dataset type, release version, crc32 hash, file
# size, timestamp) for an imported tarfile to a csv inside ebird_data_dir().
record_metadata <- function(tarfile) {
  stopifnot(is.character(tarfile), length(tarfile) == 1, file.exists(tarfile))

  f <- basename(tarfile)
  if (is_checklists(f)) {
    dataset <- "checklists"
    subset <- NA_character_
  } else if (is_observations(f, allow_subset = FALSE)) {
    dataset <- "observations"
    subset <- NA_character_
    # todo: implement ability to import ebd subset, currently in a zip file
    # } else if (is_observations(f, allow_subset = TRUE)) {
    #   dataset <- "observations"
    #   subset <- sub("ebd_([-_A-Za-z0-9]+)_rel[A-Z]{1}[a-z]{2}-[0-9]{4}\\.tar",
    #                 "\\1", f)
  } else {
    stop("The provided tar filename does not appear to contain eBird data. ",
         "The expected format is, e.g., ebd_relJul-2021.tar.")
  }

  # parse the release month and year (e.g. "Aug-2021") from the filename
  rawdate <- sub("ebd[-_A-Za-z0-9]*_rel([A-Z]{1}[a-z]{2}-[0-9]{4})\\.tar",
                 "\\1", f)
  date <- strsplit(rawdate, "-")[[1]]
  date[1] <- match(date[1], month.abb)
  date <- paste(date[2], date[1], "1", sep = "-")
  date <- as.Date(date, format = "%Y-%m-%d")
  if (is.na(date)) {
    stop("Month and year could not be parsed from filename: ", rawdate)
  }
  version <- format(date, "%Y-%m")

  if (!is.na(subset)) {
    message("EBD subset detected for: ", subset)
  }

  # crc32 hash of the source tarfile
  hash <- digest::digest(tarfile, algo = "crc32", file = TRUE)

  # save to csv
  file_metadata <- data.frame(dataset = dataset,
                              version = version,
                              subset = subset,
                              source_file = tarfile,
                              file_size = file.size(tarfile),
                              hash_crc32 = as.character(hash),
                              timestamp = Sys.time())
  f_metadata <- file.path(ebird_data_dir(),
                          paste0(dataset, "-metadata.csv"))
  utils::write.csv(file_metadata, file = f_metadata, row.names = FALSE, na = "")

  invisible(file_metadata)
}

# Validate the archive extension of an eBird download. Full EBD downloads are
# .tar files; Custom Download subsets arrive as .zip. Both extensions are let
# through here (.zip simply fails the .tar filename patterns below) so that
# import_ebird() can reach its informative "Custom Download" message instead
# of stopping with a generic extension error. Anything else is rejected.
check_ebird_ext <- function(x) {
  if (!grepl("\\.(tar|zip)$", x)) {
    stop("The provided file does not appear to be a tar archive. The file ",
         "extension should be .tar.")
  }
  invisible(x)
}

# TRUE when the filename matches the checklist (Sampling Event Data) archive,
# e.g. ebd_sampling_relAug-2021.tar.
is_checklists <- function(x) {
  x <- basename(x)
  check_ebird_ext(x)
  grepl("ebd_sampling_rel[A-Z]{1}[a-z]{2}-[0-9]{4}\\.tar$", x)
}

# TRUE when the filename matches the full observation (EBD) archive, e.g.
# ebd_relAug-2021.tar. With allow_subset = TRUE, also TRUE for Custom
# Download subsets delivered as .zip, e.g. ebd_US-CA_relAug-2021.zip.
is_observations <- function(x, allow_subset = FALSE) {
  x <- basename(x)
  check_ebird_ext(x)
  is_obs <- grepl("ebd_rel[A-Z]{1}[a-z]{2}-[0-9]{4}\\.tar$", x)
  if (allow_subset) {
    is_ss <- grepl("ebd[-_A-Za-z0-9]*_rel[A-Z]{1}[a-z]{2}-[0-9]{4}\\.zip$", x)
    is_obs <- is_obs | is_ss
  }
  return(is_obs)
}



# https://ebird.org/data/download/ebd
# https://download.ebird.org/ebd/prepackaged/ebd_sampling_relOct-2021.tar
# https://download.ebird.org/ebd/prepackaged/ebd_relOct-2021.tar