├── .github ├── .gitignore └── workflows │ ├── pkgdown.yaml │ └── R-CMD-check.yaml ├── LICENSE ├── cran-comments.md ├── .gitignore ├── _pkgdown.yml ├── inst ├── extdata │ ├── metro.fgb │ ├── world.fgb │ ├── world.gpkg │ └── spatial-test.csv ├── WORDLIST └── examples │ ├── spatial_module.R │ ├── s3_bucket.R │ ├── neon-read-remote-csv.R │ ├── s3-tests.R │ ├── more-spatial.R │ └── s3-spatial.R ├── CRAN-SUBMISSION ├── man ├── figures │ ├── README-unnamed-chunk-8-1.png │ └── README-unnamed-chunk-10-1.png ├── duckdb_reset.Rd ├── duckdb_extensions.Rd ├── as_dataset.Rd ├── close_connection.Rd ├── as_view.Rd ├── load_spatial.Rd ├── to_json.Rd ├── duckdb_get_config.Rd ├── to_h3j.Rd ├── duckdb_config.Rd ├── to_geojson.Rd ├── load_h3.Rd ├── st_read_meta.Rd ├── duckdb_secrets.Rd ├── write_dataset.Rd ├── to_sf.Rd ├── write_geo.Rd ├── spatial_join.Rd ├── duckdb_s3_config.Rd ├── cached_connection.Rd └── open_dataset.Rd ├── tests ├── spelling.R ├── testthat │ ├── test-secrets.R │ ├── test-config.R │ ├── test-load_extension.R │ ├── test-h3.R │ ├── test-s3_uri.R │ ├── test-spatial.R │ ├── test-open_dataset.R │ └── test-write_dataset.R └── testthat.R ├── .Rbuildignore ├── todo.md ├── duckdbfs.Rproj ├── NAMESPACE ├── R ├── load_spatial.R ├── to_h3j.R ├── to_json.R ├── load_h3.R ├── st_read_meta.R ├── write_geo.R ├── to_sf.R ├── utils.R ├── duckdb_secrets.R ├── to_geojson.R ├── write_dataset.R ├── parse_uri.R ├── cached_connection.R ├── spatial_join.R ├── duckdb_config.R └── open_dataset.R ├── LICENSE.md ├── DESCRIPTION ├── NEWS.md ├── README.Rmd └── README.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2023 2 | COPYRIGHT HOLDER: duckdbfs authors 3 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## R CMD check results 2 | 3 | 0 errors | 0 warnings | 0 notes 4 | 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | docs 6 | duckdbfs.Rproj 7 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://cboettig.github.io/duckdbfs/ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /inst/extdata/metro.fgb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cboettig/duckdbfs/HEAD/inst/extdata/metro.fgb -------------------------------------------------------------------------------- /inst/extdata/world.fgb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cboettig/duckdbfs/HEAD/inst/extdata/world.fgb -------------------------------------------------------------------------------- /inst/extdata/world.gpkg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cboettig/duckdbfs/HEAD/inst/extdata/world.gpkg 
-------------------------------------------------------------------------------- /CRAN-SUBMISSION: -------------------------------------------------------------------------------- 1 | Version: 0.1.2 2 | Date: 2025-10-12 05:58:15 UTC 3 | SHA: b9c90fe2751836e080f22decf00dbb7d8ccd4a38 4 | -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cboettig/duckdbfs/HEAD/man/figures/README-unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cboettig/duckdbfs/HEAD/man/figures/README-unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /inst/extdata/spatial-test.csv: -------------------------------------------------------------------------------- 1 | site,latitude,longitude 2 | a,1,1 3 | b,2,2 4 | c,3,3 5 | d,4,4 6 | e,5,5 7 | f,6,6 8 | g,7,7 9 | h,8,8 10 | i,9,9 11 | j,10,10 12 | -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if(requireNamespace('spelling', quietly = TRUE)) 2 | spelling::spell_check_test(vignettes = TRUE, error = FALSE, 3 | skip_on_cran = TRUE) 4 | -------------------------------------------------------------------------------- /tests/testthat/test-secrets.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("test secrets", { 3 | 4 | skip_on_cran() 5 | status <- duckdb_secrets() 6 | 7 | expect_true(status == 1) 8 | close_connection() 9 | 10 | }) 11 | 12 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^duckdbfs\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^\.github$ 5 | ^README\.Rmd$ 6 | ^_pkgdown\.yml$ 7 | ^docs$ 8 | ^pkgdown$ 9 | ^cran-comments\.md$ 10 | ^CRAN-SUBMISSION$ 11 | ^todo\.md$ 12 | -------------------------------------------------------------------------------- /tests/testthat/test-config.R: -------------------------------------------------------------------------------- 1 | test_that("config", { 2 | 3 | skip_on_cran() # All examples must run on slow machine in 5 secs 4 | 5 | duckdb_config(threads = 1, memory_limit = '10GB') 6 | duckdb_config(threads = 10) 7 | 8 | threads = duckdb_get_config("threads") 9 | expect_equal(threads, '10') 10 | duckdb_reset("threads") 11 | }) 12 | -------------------------------------------------------------------------------- /todo.md: -------------------------------------------------------------------------------- 1 | - [ ] Standardize API naming conventions (mimic ibis more? and arrow?). 
we have: 2 | 3 | - open_dataset 4 | - spatial_join 5 | - st_read_meta 6 | 7 | - to_geojson 8 | - to_h3j 9 | - to_sf 10 | - write_dataset 11 | - write_geo 12 | 13 | - as_dataset 14 | - as_view 15 | 16 | - cached_connection 17 | - close_connection 18 | 19 | 20 | - duckdb_config 21 | - duckdb_connect 22 | - duckdb_extensions 23 | - duckdb_get_config 24 | - duckdb_reset 25 | - duckdb_s3_config 26 | - duckdb_secrets 27 | 28 | - load_h3 29 | - load_spatial 30 | 31 | -------------------------------------------------------------------------------- /duckdbfs.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | ProjectId: bf7dba09-62b8-4ca3-a8e0-1e071a9108f7 3 | 4 | RestoreWorkspace: No 5 | SaveWorkspace: No 6 | AlwaysSaveHistory: Default 7 | 8 | EnableCodeIndexing: Yes 9 | UseSpacesForTab: Yes 10 | NumSpacesForTab: 2 11 | Encoding: UTF-8 12 | 13 | RnwWeave: Sweave 14 | LaTeX: pdfLaTeX 15 | 16 | AutoAppendNewline: Yes 17 | StripTrailingWhitespace: Yes 18 | LineEndingConversion: Posix 19 | 20 | BuildType: Package 21 | PackageUseDevtools: Yes 22 | PackageInstallArgs: --no-multiarch --with-keep.source 23 | PackageRoxygenize: rd,collate,namespace 24 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(as_dataset) 4 | export(as_view) 5 | export(cached_connection) 6 | export(close_connection) 7 | export(duckdb_config) 8 | export(duckdb_connect) 9 | export(duckdb_extensions) 10 | export(duckdb_get_config) 11 | export(duckdb_reset) 12 | export(duckdb_s3_config) 13 | export(duckdb_secrets) 14 | export(load_h3) 15 | export(load_spatial) 16 | export(open_dataset) 17 | export(spatial_join) 18 | export(st_read_meta) 19 | export(to_geojson) 20 | export(to_h3j) 21 | export(to_sf) 22 | export(write_dataset) 23 | export(write_geo) 24 | -------------------------------------------------------------------------------- /R/load_spatial.R: -------------------------------------------------------------------------------- 1 | #' load the duckdb geospatial data plugin 2 | #' 3 | #' @inheritParams duckdb_s3_config 4 | #' @param nightly should we use the nightly version or not? 5 | #' default FALSE, configurable as `duckdbfs_use_nightly` option. 6 | #' @param force force re-install? 7 | #' @return loads the extension and returns status invisibly. 
8 | #' @references 9 | #' @export 10 | load_spatial <- function( 11 | conn = cached_connection(), 12 | nightly = getOption("duckdbfs_use_nightly", FALSE), 13 | force = FALSE 14 | ) { 15 | load_extension("spatial", conn = conn, nightly = nightly, force = force) 16 | } 17 | -------------------------------------------------------------------------------- /man/duckdb_reset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/duckdb_config.R 3 | \name{duckdb_reset} 4 | \alias{duckdb_reset} 5 | \title{duckdb reset configuration to default} 6 | \usage{ 7 | duckdb_reset(x, conn = cached_connection()) 8 | } 9 | \arguments{ 10 | \item{x}{parameter name} 11 | 12 | \item{conn}{A connection to a database.} 13 | } 14 | \description{ 15 | duckdb reset configuration to default 16 | } 17 | \examples{ 18 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 19 | duckdb_config(threads = 10) 20 | duckdb_get_config("threads") 21 | duckdb_reset("threads") 22 | \dontshow{\}) # examplesIf} 23 | } 24 | \seealso{ 25 | duckdb_config, duckdb_get_config 26 | } 27 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | CMD 2 | CRS 3 | DuckDB 4 | EPSG 5 | GBIF 6 | GC 7 | GDAL 8 | Geospatial 9 | JSON 10 | MINIO 11 | Quickstart 12 | README 13 | SSL 14 | URI 15 | URIs 16 | WKB 17 | WKT 18 | behaviour 19 | bigint 20 | bugfix 21 | cachable 22 | config 23 | containsproperly 24 | crs 25 | csv 26 | dbExecute 27 | dbplyr 28 | dbpylr 29 | dplyr 30 | duckdb 31 | duckdb's 32 | duckdb’s 33 | dwithin 34 | filesize 35 | finalizer 36 | gc 37 | gdal 38 | geo 39 | geojson 40 | geospatial 41 | globbing 42 | http 43 | https 44 | json 45 | md 46 | minio 47 | parsers 48 | pipline 49 | postgis 50 | pre 51 | proj 52 | repo 53 | reprojection 54 | schemas 55 | serializer 56 | tbl 57 | tempdir 58 | tibble 59 | un 60 | uploader 61 | vhost 62 | vsis 63 | ’s 64 | -------------------------------------------------------------------------------- /man/duckdb_extensions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/duckdb_config.R 3 | \name{duckdb_extensions} 4 | \alias{duckdb_extensions} 5 | \title{show duckdb extensions} 6 | \usage{ 7 | duckdb_extensions(conn = cached_connection()) 8 | } 9 | \arguments{ 10 | \item{conn}{A connection to a database.} 11 | } 12 | \value{ 13 | a data frame listing all available extensions, with boolean columns 14 | indicating which extensions are installed or loaded, and a description of each 15 | extension. 16 | } 17 | \description{ 18 | show duckdb extensions 19 | } 20 | \examples{ 21 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 22 | duckdb_extensions() 23 | \dontshow{\}) # examplesIf} 24 | } 25 | -------------------------------------------------------------------------------- /man/as_dataset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/write_dataset.R 3 | \name{as_dataset} 4 | \alias{as_dataset} 5 | \title{as_dataset} 6 | \usage{ 7 | as_dataset(df, conn = cached_connection()) 8 | } 9 | \arguments{ 10 | \item{df}{a local data frame. 
Otherwise will be passed back without side effects} 11 | 12 | \item{conn}{A connection to a database.} 13 | } 14 | \value{ 15 | a remote \code{dplyr::tbl} connection to the table. 16 | } 17 | \description{ 18 | Push a local (in-memory) dataset into the duckdb database as a table. 19 | This enables it to share the connection source with other data. 20 | This is equivalent to the behavior of copy=TRUE on many (but not all) of the two-table verbs in dplyr. 21 | } 22 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration? 5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/tests.html 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files 8 | 9 | library(testthat) 10 | library(duckdbfs) 11 | 12 | 13 | has_spatial <- function() { 14 | duckdbfs::duckdb_extensions() |> 15 | dplyr::filter(extension_name == "spatial") |> 16 | dplyr::pull(installed) 17 | } 18 | 19 | 20 | # tests that don't need extensions loaded shouldn't access internet 21 | #options("duckdbfs_autoload_extensions" = FALSE) 22 | 23 | test_check("duckdbfs") 24 | -------------------------------------------------------------------------------- /tests/testthat/test-load_extension.R: -------------------------------------------------------------------------------- 1 | test_that("extensions installation", { 2 | # cran tests cannot fail if no network is available 3 | skip_on_cran() 4 | skip_if_offline() 5 | 6 | close_connection() 7 | duckdb_connect() 8 | 9 | load_httpfs(nightly = FALSE, force = FALSE) 10 | 11 | # core extensions only, don't test 'spatial' or 'h3' 12 | 13 | exts <- duckdb_extensions() 14 | status <- exts[exts$extension_name == "httpfs", ] 15 | expect_true(status$installed) 16 | expect_equal(status$installed_from, "core") 17 | 18 | exts <- duckdb_extensions() 19 | status <- exts[exts$extension_name == "httpfs", ] 20 | expect_true(status$installed) 21 | expect_equal(status$installed_from, "core") 22 | 23 | load_extension("json") 24 | }) 25 | -------------------------------------------------------------------------------- /inst/examples/spatial_module.R: -------------------------------------------------------------------------------- 1 | st_read <- function() { 2 | 3 | } 4 | 5 | st_write <- function() { 6 | 7 | } 8 | 9 | st_perimeter <- function() { 10 | # no sf equivalent 11 | } 12 | 13 | # functions that operate on geometries already work within `mutate` calls 14 | st_area <- function() { 15 | 16 | } 17 | 18 | 19 | st_intersection <- function(x, y, ...) { 20 | sf::st_intersection(x, y, ...) 21 | } 22 | 23 | st_intersects <- function(x, y, ...) { 24 | if(inherits(x, "sf")) { 25 | sf::st_intersects(x, y, ...)
26 | } 27 | 28 | } 29 | 30 | 31 | st_union <- function(x, y, ..., 32 | by_feature = by_feature, is_coverage = is_coverage) { 33 | if(inherits(x, "sf")) { 34 | sf::st_union(x, y, ..., by_feature = by_feature, is_coverage = is_coverage) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /R/to_h3j.R: -------------------------------------------------------------------------------- 1 | #' Write H3 hexagon data out as an h3j-compliant JSON file 2 | #' NOTE: the column containing H3 hashes must be named `hexid` 3 | #' 4 | #' @inheritParams write_dataset 5 | #' @examplesIf interactive() 6 | #' # example code 7 | #' 8 | #' @export 9 | to_h3j <- function(dataset, path, conn = cached_connection(), as_http = FALSE) { 10 | cols <- paste(colnames(dataset), collapse = ", ") 11 | sql <- dbplyr::sql_render(dataset) 12 | q <- glue::glue( 13 | " 14 | COPY ( 15 | WITH t1 AS ({sql}) 16 | SELECT json_group_array(struct_pack({cols})) 17 | AS cells 18 | FROM t1 19 | ) TO '{path}' (FORMAT JSON) 20 | " 21 | ) 22 | DBI::dbExecute(conn, q) 23 | if (as_http) { 24 | path <- s3_as_http(path) 25 | } 26 | invisible(path) 27 | } 28 | -------------------------------------------------------------------------------- /inst/examples/s3_bucket.R: -------------------------------------------------------------------------------- 1 | 2 | s3_bucket <- function(bucket, anonymous = NULL, access_key = NULL, 3 | secret_key = NULL, session_token = NULL, 4 | region = NULL, endpoint_override = NULL, scheme = NULL, 5 | conn = cached_connection()) { 6 | 7 | if (!grepl("^s3://", bucket)) { 8 | bucket <- paste0("s3://", bucket) 9 | } 10 | 11 | duckdb_s3_config(conn = conn, 12 | anonymous = anonymous, 13 | s3_access_key_id = access_key, 14 | s3_secret_access_key = secret_key, 15 | s3_session_token = session_token, 16 | s3_endpoint = endpoint_override, 17 | s3_use_ssl = !identical(scheme, "http"), 18 | ) 19 | return(bucket) 20 | 21 | } 22 | -------------------------------------------------------------------------------- /man/close_connection.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cached_connection.R 3 | \name{close_connection} 4 | \alias{close_connection} 5 | \title{close connection} 6 | \usage{ 7 | close_connection(conn = cached_connection()) 8 | } 9 | \arguments{ 10 | \item{conn}{a duckdb connection (leave blank) 11 | Closes the invisible cached connection to duckdb} 12 | } 13 | \value{ 14 | returns nothing. 15 | } 16 | \description{ 17 | close connection 18 | } 19 | \details{ 20 | Shuts down connection before gc removes it. 21 | Then clear cached reference to avoid using a stale connection 22 | This avoids complaint about connection being garbage collected. 
23 | } 24 | \examples{ 25 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 26 | 27 | close_connection() 28 | \dontshow{\}) # examplesIf} 29 | } 30 | -------------------------------------------------------------------------------- /man/as_view.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spatial_join.R 3 | \name{as_view} 4 | \alias{as_view} 5 | \title{as_view} 6 | \usage{ 7 | as_view(x, tblname = tmp_tbl_name(), conn = cached_connection()) 8 | } 9 | \arguments{ 10 | \item{x}{a duckdb spatial dataset} 11 | 12 | \item{tblname}{The name of the table to create in the database.} 13 | 14 | \item{conn}{A connection to a database.} 15 | } 16 | \description{ 17 | Create a View of the current query. This can be an effective way to allow 18 | a query chain to remain lazy 19 | } 20 | \examples{ 21 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 22 | path <- system.file("extdata/spatial-test.csv", package="duckdbfs") 23 | df <- open_dataset(path) 24 | library(dplyr) 25 | 26 | df |> filter(latitude > 5) |> as_view() 27 | \dontshow{\}) # examplesIf} 28 | } 29 | -------------------------------------------------------------------------------- /R/to_json.R: -------------------------------------------------------------------------------- 1 | #' to_json 2 | #' write data out as a JSON object 3 | #' 4 | #' @inheritParams write_dataset 5 | #' @param array generate a JSON array? 6 | #' @param options additional options 7 | #' @param options additional options as a char string, see 8 | # https://duckdb.org/docs/sql/statements/copy.html#json-options 9 | #' @return path, invisibly 10 | 11 | to_json <- function( 12 | dataset, 13 | path, 14 | conn = cached_connection(), 15 | array = TRUE, 16 | options = NULL, 17 | as_http = FALSE 18 | ) { 19 | sql <- dbplyr::sql_render(dataset) 20 | if (array) { 21 | options <- c("ARRAY true", options) 22 | } 23 | 24 | options <- paste("FORMAT JSON", options, sep = ", ", collapse = ", ") 25 | 26 | q <- glue::glue("COPY ({sql}) TO '{path}' ({options});") 27 | DBI::dbExecute(conn, q) 28 | 29 | if (as_http) { 30 | path <- s3_as_http(path) 31 | } 32 | 33 | invisible(path) 34 | } 35 | -------------------------------------------------------------------------------- /man/load_spatial.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/load_spatial.R 3 | \name{load_spatial} 4 | \alias{load_spatial} 5 | \title{load the duckdb geospatial data plugin} 6 | \usage{ 7 | load_spatial( 8 | conn = cached_connection(), 9 | nightly = getOption("duckdbfs_use_nightly", FALSE), 10 | force = FALSE 11 | ) 12 | } 13 | \arguments{ 14 | \item{conn}{A database connection object created using the 15 | \code{cache_connection} function (default: \code{cache_connection()}).} 16 | 17 | \item{nightly}{should we use the nightly version or not? 18 | default FALSE, configurable as \code{duckdbfs_use_nightly} option.} 19 | 20 | \item{force}{force re-install?} 21 | } 22 | \value{ 23 | loads the extension and returns status invisibly. 
24 | } 25 | \description{ 26 | load the duckdb geospatial data plugin 27 | } 28 | \references{ 29 | \url{https://duckdb.org/docs/extensions/spatial.html} 30 | } 31 | -------------------------------------------------------------------------------- /man/to_json.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/to_json.R 3 | \name{to_json} 4 | \alias{to_json} 5 | \title{to_json 6 | write data out as a JSON object} 7 | \usage{ 8 | to_json( 9 | dataset, 10 | path, 11 | conn = cached_connection(), 12 | array = TRUE, 13 | options = NULL, 14 | as_http = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{dataset}{a remote tbl object from \code{open_dataset}, 19 | or an in-memory data.frame.} 20 | 21 | \item{path}{a local file path or S3 path with write credentials} 22 | 23 | \item{conn}{duckdbfs database connection} 24 | 25 | \item{array}{generate a JSON array?} 26 | 27 | \item{options}{additional options as a char string, see \url{https://duckdb.org/docs/sql/statements/copy.html#json-options}} 28 | 29 | \item{as_http}{if path is an S3 location, will return corresponding HTTP address.} 30 | } 31 | \value{ 32 | path, invisibly 33 | } 34 | \description{ 35 | to_json 36 | write data out as a JSON object 37 | } 38 | -------------------------------------------------------------------------------- /man/duckdb_get_config.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/duckdb_config.R 3 | \name{duckdb_get_config} 4 | \alias{duckdb_get_config} 5 | \title{duckdb get configuration} 6 | \usage{ 7 | duckdb_get_config(x = NULL, conn = cached_connection()) 8 | } 9 | \arguments{ 10 | \item{x}{parameter name. Omit to see a table of all settings.} 11 | 12 | \item{conn}{A connection to a database.} 13 | } 14 | \description{ 15 | look up the current value of duckdb configuration settings 16 | } 17 | \examples{ 18 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 19 | # Full config table 20 | duckdb_get_config() 21 | 22 | # look up single config value 23 | duckdb_get_config("threads") 24 | 25 | # set a different value, test, reset.
26 | duckdb_config(threads = 10) 27 | duckdb_get_config("threads") 28 | duckdb_reset("threads") 29 | \dontshow{\}) # examplesIf} 30 | } 31 | \seealso{ 32 | duckdb_config, duckdb_get_config 33 | } 34 | -------------------------------------------------------------------------------- /man/to_h3j.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/to_h3j.R 3 | \name{to_h3j} 4 | \alias{to_h3j} 5 | \title{Write H3 hexagon data out as an h3j-compliant JSON file 6 | NOTE: the column containing H3 hashes must be named \code{hexid}} 7 | \usage{ 8 | to_h3j(dataset, path, conn = cached_connection(), as_http = FALSE) 9 | } 10 | \arguments{ 11 | \item{dataset}{a remote tbl object from \code{open_dataset}, 12 | or an in-memory data.frame.} 13 | 14 | \item{path}{a local file path or S3 path with write credentials} 15 | 16 | \item{conn}{duckdbfs database connection} 17 | 18 | \item{as_http}{if path is an S3 location, will return corresponding HTTP address.} 19 | } 20 | \description{ 21 | Write H3 hexagon data out as an h3j-compliant JSON file 22 | NOTE: the column containing H3 hashes must be named \code{hexid} 23 | } 24 | \examples{ 25 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 26 | # example code 27 | \dontshow{\}) # examplesIf} 28 | } 29 | -------------------------------------------------------------------------------- /tests/testthat/test-h3.R: -------------------------------------------------------------------------------- 1 | test_that("h3", { 2 | skip_if_offline() # needs to be able to load the spatial module 3 | skip_if_not_installed("sf") 4 | skip_on_cran() 5 | skip_on_os("windows") # h3 extension not built for windows(?) 6 | skip_if_not(has_spatial(), "spatial extension not available") 7 | 8 | # start a fresh connection 9 | options("duckdbfs_autoload_extensions" = TRUE) 10 | close_connection() 11 | 12 | library(dplyr) 13 | load_h3() 14 | 15 | # requires json extension, autoload: 16 | 17 | path <- tempfile(fileext = ".h3j") 18 | ex <- system.file("extdata/spatial-test.csv", package = "duckdbfs") 19 | 20 | zoom <- 9L # Zoom must be explicit integer, L 21 | query <- ex |> 22 | open_dataset(format = "csv") |> 23 | mutate(h3id = h3_latlng_to_cell_string(latitude, longitude, zoom)) 24 | 25 | df <- collect(query) 26 | expect_s3_class(df, "data.frame") 27 | 28 | query |> to_h3j(path) 29 | expect_true(file.exists(path)) 30 | 31 | # unset autoload 32 | options("duckdbfs_autoload_extensions" = TRUE) 33 | }) 34 | -------------------------------------------------------------------------------- /man/duckdb_config.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/duckdb_config.R 3 | \name{duckdb_config} 4 | \alias{duckdb_config} 5 | \title{duckdb configuration} 6 | \usage{ 7 | duckdb_config(..., conn = cached_connection()) 8 | } 9 | \arguments{ 10 | \item{...}{named argument of the parameters to set, see examples 11 | see all possible configuration options at \url{https://duckdb.org/docs/sql/configuration.html}} 12 | 13 | \item{conn}{A connection to a database.} 14 | } 15 | \value{ 16 | the active duckdb connection, invisibly 17 | } 18 | \description{ 19 | duckdb configuration 20 | } 21 | \details{ 22 | Note: in I/O bound tasks such as streaming data, it can be helpful to set 23 | thread parallelism significantly higher than available CPU cores. 
24 | } 25 | \examples{ 26 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 27 | duckdb_config(threads = 1, memory_limit = '10GB') 28 | duckdb_get_config("threads") 29 | duckdb_reset("threads") 30 | \dontshow{\}) # examplesIf} 31 | } 32 | \seealso{ 33 | duckdb_reset, duckdb_get_config 34 | } 35 | -------------------------------------------------------------------------------- /R/load_h3.R: -------------------------------------------------------------------------------- 1 | #' load the duckdb geospatial data plugin 2 | #' @inheritParams load_spatial 3 | #' @param repo repository path for community extensions 4 | #' @return loads the extension and returns status invisibly. 5 | #' @references <https://github.com/isaacbrodsky/h3-duckdb> 6 | #' 7 | #' @examplesIf interactive() 8 | #' 9 | #' library(dplyr) 10 | #' load_h3() 11 | #' ex <- system.file("extdata/spatial-test.csv", package="duckdbfs") 12 | #' 13 | #' zoom <- 9L # Zoom must be explicit integer, L 14 | #' query <- ex |> 15 | #' open_dataset(format = "csv") |> 16 | #' mutate(h3id = h3_latlng_to_cell_string(latitude, longitude, zoom)) 17 | #' 18 | #' # as data.frame 19 | #' collect(query) 20 | #' 21 | #' # write to a file 22 | #' path <- tempfile(fileext = ".h3j") 23 | #' query |> to_h3j(path) 24 | #' 25 | #' @export 26 | load_h3 <- function( 27 | conn = cached_connection(), 28 | repo = "http://community-extensions.duckdb.org" 29 | ) { 30 | DBI::dbExecute(conn, glue::glue("INSTALL h3 from '{repo}'")) 31 | status <- DBI::dbExecute(conn, "LOAD h3") 32 | 33 | invisible(status) 34 | } 35 | -------------------------------------------------------------------------------- /man/to_geojson.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/to_geojson.R 3 | \name{to_geojson} 4 | \alias{to_geojson} 5 | \title{Write geojson using duckdb's native JSON writer} 6 | \usage{ 7 | to_geojson( 8 | dataset, 9 | path, 10 | conn = cached_connection(), 11 | id_col = NULL, 12 | as_http = FALSE, 13 | server = Sys.getenv("AWS_S3_ENDPOINT", "s3.amazonaws.com"), 14 | use_ssl = Sys.getenv("AWS_HTTPS", "TRUE") 15 | ) 16 | } 17 | \arguments{ 18 | \item{dataset}{a remote tbl object from \code{open_dataset}, 19 | or an in-memory data.frame.} 20 | 21 | \item{path}{a local file path or S3 path with write credentials} 22 | 23 | \item{conn}{duckdbfs database connection} 24 | 25 | \item{id_col}{(deprecated). to_geojson() will preserve all atomic columns 26 | as properties.} 27 | 28 | \item{as_http}{convert returned S3 path to URL (e.g.
for public buckets)} 29 | 30 | \item{server}{aws endpoint if converting s3 path to URL} 31 | 32 | \item{use_ssl}{should url use https} 33 | } 34 | \value{ 35 | path, invisibly 36 | } 37 | \description{ 38 | Write geojson using duckdb's native JSON writer 39 | } 40 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2023 duckdbfs authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/testthat/test-s3_uri.R: -------------------------------------------------------------------------------- 1 | 2 | # consider deprecating 3 | 4 | test_that("s3 uri parsing", { 5 | 6 | skip_on_cran() 7 | 8 | url <- "s3://neon4cast-scores/parquet/aquatics" 9 | parts <- url_parse(url) 10 | expect_true(parts$scheme == "s3") 11 | expect_null(parts$username) 12 | expect_equal(parts$path, "/parquet/aquatics") 13 | 14 | url <- "s3://neon4cast-scores/parquet/aquatics?endpoint_url=data.ecoforecast.org" 15 | parts <- url_parse(url) 16 | expect_true(parts$scheme == "s3") 17 | expect_null(parts$username) 18 | expect_equal(parts$path, "/parquet/aquatics") 19 | expect_equal(parts$query[["endpoint_url"]], "data.ecoforecast.org") 20 | 21 | 22 | url <- "s3://user:password:token@neon4cast-scores/parquet/aquatics?endpoint_url=data.ecoforecast.org" 23 | parts <- url_parse(url) 24 | expect_true(parts$scheme == "s3") 25 | expect_equal(parts$username, "user") 26 | expect_equal(parts$password, "password") 27 | expect_equal(parts$token, "token") 28 | expect_equal(parts$path, "/parquet/aquatics") 29 | expect_equal(parts$query[["endpoint_url"]], "data.ecoforecast.org") 30 | 31 | url <- "s3://anonymous@neon4cast-scores/parquet/aquatics?endpoint_url=data.ecoforecast.org" 32 | parts <- url_parse(url) 33 | expect_equal(parts$username, "anonymous") 34 | 35 | }) 36 | -------------------------------------------------------------------------------- /man/load_h3.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/load_h3.R 3 | \name{load_h3} 4 | \alias{load_h3} 5 | \title{load the duckdb geospatial data plugin} 6 | \usage{ 7 | load_h3( 8 | conn = cached_connection(), 9 | repo = "http://community-extensions.duckdb.org" 10 | ) 11 | } 12 | 
\arguments{ 13 | \item{conn}{A database connection object created using the 14 | \code{cache_connection} function (default: \code{cache_connection()}).} 15 | 16 | \item{repo}{repository path for community extensions} 17 | } 18 | \value{ 19 | loads the extension and returns status invisibly. 20 | } 21 | \description{ 22 | load the duckdb geospatial data plugin 23 | } 24 | \examples{ 25 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 26 | 27 | library(dplyr) 28 | load_h3() 29 | ex <- system.file("extdata/spatial-test.csv", package="duckdbfs") 30 | 31 | zoom <- 9L # Zoom must be explicit integer, L 32 | query <- ex |> 33 | open_dataset(format = "csv") |> 34 | mutate(h3id = h3_latlng_to_cell_string(latitude, longitude, zoom)) 35 | 36 | # as data.frame 37 | collect(query) 38 | 39 | # write to a file 40 | path <- tempfile(fileext = ".h3j") 41 | query |> to_h3j(path) 42 | \dontshow{\}) # examplesIf} 43 | } 44 | \references{ 45 | \url{https://github.com/isaacbrodsky/h3-duckdb} 46 | } 47 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: duckdbfs 2 | Title: High Performance Remote File System, Database and 'Geospatial' Access Using 'duckdb' 3 | Version: 0.1.2.99 4 | Authors@R: 5 | c(person("Carl", "Boettiger", , "cboettig@gmail.com", c("aut", "cre"), 6 | comment = c(ORCID = "0000-0002-1642-628X")), 7 | person("Michael D.","Sumner", role = c("ctb"), email = 8 | "mdsumner@gmail.com", comment=c(ORCID = "0000-0002-2471-7511"))) 9 | Description: Provides friendly wrappers for creating 'duckdb'-backed connections 10 | to tabular datasets ('csv', parquet, etc) on local or remote file systems. 11 | This mimics the behaviour of "open_dataset" in the 'arrow' package, 12 | but in addition to 'S3' file system also generalizes to any list of 'http' URLs. 
13 | License: MIT + file LICENSE 14 | Encoding: UTF-8 15 | Roxygen: list(markdown = TRUE) 16 | RoxygenNote: 7.3.3 17 | URL: https://github.com/cboettig/duckdbfs, https://cboettig.github.io/duckdbfs/ 18 | BugReports: https://github.com/cboettig/duckdbfs/issues 19 | Depends: R (>= 4.2) 20 | Imports: 21 | DBI, 22 | dbplyr, 23 | dplyr, 24 | duckdb (>= 1.1), 25 | fs, 26 | glue 27 | Suggests: 28 | curl, 29 | sf, 30 | jsonlite, 31 | spelling, 32 | minioclient, 33 | testthat (>= 3.0.0) 34 | Config/testthat/edition: 3 35 | Language: en-US 36 | -------------------------------------------------------------------------------- /inst/examples/neon-read-remote-csv.R: -------------------------------------------------------------------------------- 1 | 2 | # remotes::install_github("duckdbfs") 3 | library(duckdbfs) 4 | library(neonstore) 5 | library(dplyr) 6 | df <- neonstore:::neon_data(product = "DP1.20288.001", 7 | start_date = "2023-06-01", 8 | end_date = "2023-08-01", 9 | type="basic" 10 | ) 11 | urls <- df |> 12 | dplyr::filter(grepl("waq_instantaneous", name)) |> 13 | pull(url) 14 | 15 | 16 | 17 | ds <- duckdbfs::open_dataset(urls, 18 | format="csv", 19 | filename = TRUE) 20 | 21 | 22 | sites <- duckdbfs::open_dataset("https://raw.githubusercontent.com/eco4cast/neon4cast-targets/main/NEON_Field_Site_Metadata_20220412.csv", 23 | format = "csv") 24 | 25 | 26 | aq <- ds |> 27 | mutate(field_site_id = regexp_extract(filename, "NEON.DOM.SITE.DP1.20288.001/(\\w{4})", 1L)) |> 28 | left_join(sites) |> 29 | mutate(geometry = ST_Point(field_longitude, field_latitude)) |> 30 | to_sf() 31 | 32 | aq |> select(geometry) |> distinct() |> plot() 33 | 34 | 35 | 36 | sites <- duckdbfs::open_dataset("https://raw.githubusercontent.com/eco4cast/neon4cast-targets/main/NEON_Field_Site_Metadata_20220412.csv", 37 | format = "csv") 38 | -------------------------------------------------------------------------------- /man/st_read_meta.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/st_read_meta.R 3 | \name{st_read_meta} 4 | \alias{st_read_meta} 5 | \title{read spatial metadata} 6 | \usage{ 7 | st_read_meta( 8 | path, 9 | layer = 1L, 10 | tblname = tbl_name(path), 11 | conn = cached_connection(), 12 | ... 13 | ) 14 | } 15 | \arguments{ 16 | \item{path}{URL or path to spatial data file} 17 | 18 | \item{layer}{layer number to read metadata for, defaults to first layer.} 19 | 20 | \item{tblname}{metadata will be stored as a view with this name, 21 | by default this is based on the name of the file.} 22 | 23 | \item{conn}{A connection to a database.} 24 | 25 | \item{...}{optional additional arguments passed to \code{\link[=duckdb_s3_config]{duckdb_s3_config()}}. 26 | Note these apply after those set by the URI notation and thus may be used 27 | to override or provide settings not supported in that format.} 28 | } 29 | \value{ 30 | A lazy \code{dplyr::tbl} object containing core spatial metadata such 31 | as projection information. 32 | } 33 | \description{ 34 | At this time, reads a subset of spatial metadata. 
35 | This is similar to what is reported by \code{ogrinfo -json} 36 | } 37 | \examples{ 38 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 39 | st_read_meta("https://github.com/duckdb/duckdb_spatial/raw/main/test/data/amsterdam_roads.fgb") 40 | \dontshow{\}) # examplesIf} 41 | } 42 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | permissions: 23 | contents: write 24 | steps: 25 | - uses: actions/checkout@v3 26 | 27 | - uses: r-lib/actions/setup-pandoc@v2 28 | 29 | - uses: r-lib/actions/setup-r@v2 30 | with: 31 | use-public-rspm: true 32 | 33 | - uses: r-lib/actions/setup-r-dependencies@v2 34 | with: 35 | extra-packages: any::pkgdown, github::cboettig/minioclient, local::. 36 | needs: website 37 | 38 | - name: Build site 39 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 40 | shell: Rscript {0} 41 | 42 | - name: Deploy to GitHub pages 🚀 43 | if: github.event_name != 'pull_request' 44 | uses: JamesIves/github-pages-deploy-action@v4.4.1 45 | with: 46 | clean: false 47 | branch: gh-pages 48 | folder: docs 49 | -------------------------------------------------------------------------------- /man/duckdb_secrets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/duckdb_secrets.R 3 | \name{duckdb_secrets} 4 | \alias{duckdb_secrets} 5 | \title{duckdb secrets} 6 | \usage{ 7 | duckdb_secrets( 8 | key = Sys.getenv("AWS_ACCESS_KEY_ID", ""), 9 | secret = Sys.getenv("AWS_SECRET_ACCESS_KEY", ""), 10 | endpoint = Sys.getenv("AWS_S3_ENDPOINT", "s3.amazonaws.com"), 11 | region = Sys.getenv("AWS_REGION", "us-east-1"), 12 | bucket = NULL, 13 | url_style = NULL, 14 | use_ssl = Sys.getenv("AWS_HTTPS", "TRUE"), 15 | url_compatibility_mode = TRUE, 16 | session_token = Sys.getenv("AWS_SESSION_TOKEN", ""), 17 | type = "S3", 18 | conn = cached_connection() 19 | ) 20 | } 21 | \arguments{ 22 | \item{key}{key} 23 | 24 | \item{secret}{secret} 25 | 26 | \item{endpoint}{endpoint address} 27 | 28 | \item{region}{AWS region (ignored by some other S3 providers)} 29 | 30 | \item{bucket}{restricts the "SCOPE" of this key to only objects in this 31 | bucket-name. note that the bucket name is currently insensitive to endpoint} 32 | 33 | \item{url_style}{path or vhost, for S3} 34 | 35 | \item{use_ssl}{Use SSL address (https instead of http), default TRUE} 36 | 37 | \item{url_compatibility_mode}{optional mode for increased compatibility with some endpoints} 38 | 39 | \item{session_token}{AWS session token, used in some AWS authentication with short-lived tokens} 40 | 41 | \item{type}{Key type, e.g. S3. See duckdb docs for details. 
42 | references \url{https://duckdb.org/docs/configuration/secrets_manager.html}} 43 | 44 | \item{conn}{A connection to a database.} 45 | } 46 | \description{ 47 | Configure the duckdb secrets for remote access. 48 | } 49 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | # - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v3 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | # - name: extra deps 43 | # run: R -e "remotes::install_github('cboettig/minioclient')" 44 | 45 | - uses: r-lib/actions/setup-r-dependencies@v2 46 | with: 47 | extra-packages: any::rcmdcheck, github::cboettig/minioclient 48 | needs: check 49 | 50 | 51 | - uses: r-lib/actions/check-r-package@v2 52 | with: 53 | upload-snapshots: true 54 | -------------------------------------------------------------------------------- /man/write_dataset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/write_dataset.R 3 | \name{write_dataset} 4 | \alias{write_dataset} 5 | \title{write_dataset} 6 | \usage{ 7 | write_dataset( 8 | dataset, 9 | path, 10 | conn = cached_connection(), 11 | format = c("parquet", "csv"), 12 | partitioning = dplyr::group_vars(dataset), 13 | overwrite = TRUE, 14 | options = list(), 15 | as_http = FALSE, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{dataset}{a remote tbl object from \code{open_dataset}, 21 | or an in-memory data.frame.} 22 | 23 | \item{path}{a local file path or S3 path with write credentials} 24 | 25 | \item{conn}{duckdbfs database connection} 26 | 27 | \item{format}{export format} 28 | 29 | \item{partitioning}{names of columns to use as partition variables} 30 | 31 | \item{overwrite}{allow overwriting of existing files?} 32 | 33 | \item{options}{Additional arguments to COPY, see \url{https://duckdb.org/docs/stable/sql/statements/copy.html#copy--to-options} 34 | Note, uses duckdb native syntax, e.g. c("PER_THREAD_OUTPUT false"), for named arguments, see examples. 35 | (Recall SQL is case-insensitive).} 36 | 37 | \item{as_http}{if path is an S3 location, will return corresponding HTTP address.} 38 | 39 | \item{...}{additional arguments to \code{\link[=duckdb_s3_config]{duckdb_s3_config()}}} 40 | } 41 | \value{ 42 | Returns the path, invisibly. 
43 | } 44 | \description{ 45 | write_dataset 46 | } 47 | \examples{ 48 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 49 | write_dataset(mtcars, tempfile()) 50 | \dontshow{\}) # examplesIf} 51 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 52 | write_dataset(mtcars, tempdir()) 53 | write_dataset(mtcars, tempdir(), options = c("PER_THREAD_OUTPUT FALSE", "RETURN_STATS TRUE")) 54 | \dontshow{\}) # examplesIf} 55 | } 56 | \seealso{ 57 | to_sf to_json to_geojson write_geo 58 | } 59 | -------------------------------------------------------------------------------- /inst/examples/s3-tests.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("s3 minio", { 3 | 4 | skip_if_offline() 5 | skip_on_cran() 6 | skip_if_not_installed("minioclient") 7 | 8 | # Hmm... this is quite an involved setup but... 9 | # Put some parquet the MINIO test server: 10 | base <- paste0("https://github.com/duckdb/duckdb/raw/main/", 11 | "data/parquet-testing/hive-partitioning/union_by_name/") 12 | f1 <- paste0(base, "x=1/f1.parquet") 13 | tmp <- tempfile(fileext = ".parquet") 14 | download.file(f1, tmp, quiet = TRUE) 15 | minioclient::mc("mb -p play/duckdbfs", verbose = FALSE) 16 | minioclient::mc_cp(tmp, "play/duckdbfs") 17 | 18 | # allow password-less access 19 | minioclient::mc("anonymous set download play/duckdbfs", verbose=FALSE) 20 | 21 | # Could set passwords here if necessary 22 | duckdb_s3_config(s3_endpoint = "play.min.io", 23 | s3_url_style="path") 24 | df <- open_dataset("s3://duckdbfs/") 25 | 26 | expect_s3_class(df, "tbl") 27 | expect_s3_class(df, "tbl_duckdb_connection") 28 | 29 | minioclient::mc("rb --force play/duckdbfs", verbose = FALSE) 30 | 31 | }) 32 | 33 | 34 | 35 | test_that("write_dataset to s3:", { 36 | 37 | # skip("S3 write not enabled") 38 | skip_if_offline() 39 | skip_on_cran() 40 | skip_if_not_installed("jsonlite") 41 | skip_if_not_installed("minioclient") 42 | minioclient::install_mc(force = TRUE) 43 | p <- minioclient::mc_alias_ls("play --json") 44 | config <- jsonlite::fromJSON(p$stdout) 45 | 46 | minioclient::mc_mb("play/duckdbfs") 47 | 48 | library(dplyr) 49 | 50 | mtcars |> group_by(cyl, gear) |> 51 | write_dataset("s3://duckdbfs/mtcars", 52 | s3_access_key_id = config$accessKey, 53 | s3_secret_access_key = config$secretKey, 54 | s3_endpoint = config$URL, 55 | s3_use_ssl=TRUE, 56 | s3_url_style="path" 57 | ) 58 | 59 | expect_true(TRUE) 60 | minioclient::mc("rb --force play/duckdbfs") 61 | 62 | }) 63 | -------------------------------------------------------------------------------- /R/st_read_meta.R: -------------------------------------------------------------------------------- 1 | 2 | #url <- "https://github.com/duckdb/duckdb_spatial/raw/main/test/data/amsterdam_roads.fgb" 3 | #path <- paste0("/vsicurl/", url) 4 | 5 | #' read spatial metadata 6 | #' 7 | #' At this time, reads a subset of spatial metadata. 8 | #' This is similar to what is reported by `ogrinfo -json` 9 | #' @param path URL or path to spatial data file 10 | #' @param layer layer number to read metadata for, defaults to first layer. 11 | #' @param tblname metadata will be stored as a view with this name, 12 | #' by default this is based on the name of the file. 13 | #' @inheritParams open_dataset 14 | #' @return A lazy `dplyr::tbl` object containing core spatial metadata such 15 | #' as projection information. 
16 | #' @export 17 | #' @examplesIf interactive() 18 | #' st_read_meta("https://github.com/duckdb/duckdb_spatial/raw/main/test/data/amsterdam_roads.fgb") 19 | #' 20 | st_read_meta <- function(path, 21 | layer = 1L, 22 | tblname = tbl_name(path), 23 | conn = cached_connection(), 24 | ...){ 25 | 26 | 27 | duckdbfs::duckdb_s3_config(conn, ...) 28 | load_httpfs(conn) 29 | load_spatial(conn) 30 | 31 | ##strip VSI, not supported 32 | path <- strip_vsi(path) 33 | query <- glue::glue( 34 | "CREATE OR REPLACE VIEW {tblname}_meta AS SELECT 35 | layers[{i}].feature_count as feature_count, 36 | layers[{i}].geometry_fields[1].name as geom_column_name, 37 | layers[{i}].geometry_fields[1].type as geom_type, 38 | layers[{i}].geometry_fields[1].crs.auth_name as name, 39 | layers[{i}].geometry_fields[1].crs.auth_code as code, 40 | layers[{i}].geometry_fields[1].crs.wkt as wkt, 41 | layers[{i}].geometry_fields[1].crs.proj4 as proj4 42 | FROM st_read_meta('{path}'); 43 | ", i = layer) 44 | 45 | DBI::dbSendQuery(conn, query) 46 | out <- dplyr::tbl(conn, glue::glue("{tblname}_meta")) 47 | dplyr::collect(out) # small table, no point in being lazy 48 | } 49 | 50 | -------------------------------------------------------------------------------- /man/to_sf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/to_sf.R 3 | \name{to_sf} 4 | \alias{to_sf} 5 | \title{Convert output to sf object} 6 | \usage{ 7 | to_sf(x, crs = NA, conn = cached_connection()) 8 | } 9 | \arguments{ 10 | \item{x}{a remote duckdb \code{tbl} (from \code{open_dataset}) or dplyr-pipeline thereof.} 11 | 12 | \item{crs}{The coordinate reference system, any format understood by \code{sf::st_crs}.} 13 | 14 | \item{conn}{the connection object from the tbl. 15 | Takes a duckdb table (from \code{open_dataset}) or a dataset or dplyr 16 | pipline and returns an sf object. \strong{Important}: the table must have 17 | a \code{geometry} column, which you will almost always have to create 18 | first. 19 | 20 | Note: \code{to_sf()} triggers collection into R. This function is suitable 21 | to use at the end of a dplyr pipeline that will subset the data. 22 | Using this function on a large dataset without filtering first may 23 | exceed available memory.} 24 | } 25 | \value{ 26 | an \code{sf} class object (in memory). 27 | } 28 | \description{ 29 | Convert output to sf object 30 | } 31 | \examples{ 32 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 33 | 34 | library(dplyr) 35 | csv_file <- system.file("extdata/spatial-test.csv", package="duckdbfs") 36 | 37 | # Note that we almost always must first create a `geometry` column, e.g. 38 | # from lat/long columns using the `st_point` method. 39 | sf <- 40 | open_dataset(csv_file, format = "csv") |> 41 | mutate(geom = ST_Point(longitude, latitude)) |> 42 | to_sf() 43 | 44 | # We can use the full space of spatial operations, including spatial 45 | # and normal dplyr filters. 
All operations are translated into a 46 | # spatial SQL query by `to_sf`: 47 | open_dataset(csv_file, format = "csv") |> 48 | mutate(geom = ST_Point(longitude, latitude)) |> 49 | mutate(dist = ST_Distance(geom, ST_Point(0,0))) |> 50 | filter(site \%in\% c("a", "b", "e")) |> 51 | to_sf() 52 | 53 | \dontshow{\}) # examplesIf} 54 | } 55 | -------------------------------------------------------------------------------- /R/write_geo.R: -------------------------------------------------------------------------------- 1 | #' Write a spatial file with gdal 2 | #' 3 | #' Write out to any spatial data format supported by GDAL. 4 | #' @inheritParams write_dataset 5 | #' @param driver driver, see <https://duckdb.org/docs/stable/extensions/spatial/gdal> 6 | #' @param layer_creation_options to GDAL, see <https://duckdb.org/docs/stable/extensions/spatial/gdal> 7 | #' @param srs Set a spatial reference system as metadata to use for the export. 8 | #' This can be a WKT string, an EPSG code or a proj-string, basically anything 9 | #' you would normally be able to pass to GDAL. Note that this will not perform 10 | #' any reprojection of the input geometry, it just sets the metadata if the 11 | #' target driver supports it. 12 | #' @return path, invisibly 13 | #' @details NOTE: This uses the version of GDAL packaged inside of duckdb, and not the 14 | #' system GDAL. At this time, duckdb's pre-packaged GDAL does not support s3 writes, 15 | #' and will produce a "Error: Not implemented Error: GDAL Error (6): Seek not supported on writable /vsis3/ files". 16 | #' Use to_geojson() to export using duckdb's native JSON serializer instead. 17 | #' @examplesIf interactive() 18 | #' local_file <- system.file("extdata/spatial-test.csv", package="duckdbfs") 19 | #' load_spatial() 20 | #' tbl <- open_dataset(local_file, format='csv') 21 | #' write_geo(tbl, "spatial.geojson") 22 | #' 23 | #' @export 24 | write_geo <- function( 25 | dataset, 26 | path, 27 | conn = cached_connection(), 28 | driver = 'GeoJSON', 29 | layer_creation_options = 'WRITE_BBOX=YES', 30 | srs = 'EPSG:4326', 31 | as_http = FALSE 32 | ) { 33 | cols <- paste(colnames(dataset), collapse = ", ") 34 | sql <- dbplyr::sql_render(dataset) 35 | q <- glue::glue( 36 | " 37 | COPY ({sql}) TO '{path}' 38 | WITH (FORMAT gdal, DRIVER '{driver}', 39 | LAYER_CREATION_OPTIONS '{layer_creation_options}', 40 | SRS '{srs}' 41 | ); 42 | " 43 | ) 44 | DBI::dbExecute(conn, q) 45 | 46 | if (as_http) { 47 | path <- s3_as_http(path) 48 | } 49 | invisible(path) 50 | } 51 | -------------------------------------------------------------------------------- /man/write_geo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/write_geo.R 3 | \name{write_geo} 4 | \alias{write_geo} 5 | \title{Write a spatial file with gdal} 6 | \usage{ 7 | write_geo( 8 | dataset, 9 | path, 10 | conn = cached_connection(), 11 | driver = "GeoJSON", 12 | layer_creation_options = "WRITE_BBOX=YES", 13 | srs = "EPSG:4326", 14 | as_http = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{dataset}{a remote tbl object from \code{open_dataset}, 19 | or an in-memory data.frame.} 20 | 21 | \item{path}{a local file path or S3 path with write credentials} 22 | 23 | \item{conn}{duckdbfs database connection} 24 | 25 | \item{driver}{driver, see \url{https://duckdb.org/docs/stable/extensions/spatial/gdal}} 26 | 27 | \item{layer_creation_options}{to GDAL, see \url{https://duckdb.org/docs/stable/extensions/spatial/gdal}} 28 | 29 | \item{srs}{Set a spatial reference system as metadata to use for the export.
30 | This can be a WKT string, an EPSG code or a proj-string, basically anything 31 | you would normally be able to pass to GDAL. Note that this will not perform 32 | any reprojection of the input geometry, it just sets the metadata if the 33 | target driver supports it.} 34 | 35 | \item{as_http}{if path is an S3 location, will return corresponding HTTP address.} 36 | } 37 | \value{ 38 | path, invisibly 39 | } 40 | \description{ 41 | Write out to any spatial data format supported by GDAL. 42 | } 43 | \details{ 44 | NOTE: This uses the version of GDAL packaged inside of duckdb, and not the 45 | system GDAL. At this time, duckdb's pre-packaged GDAL does not support s3 writes, 46 | and will produce a "Error: Not implemented Error: GDAL Error (6): Seek not supported on writable /vsis3/ files". 47 | Use to_geojson() to export using duckdb's native JSON serializer instead. 48 | } 49 | \examples{ 50 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 51 | local_file <- system.file("extdata/spatial-test.csv", package="duckdbfs") 52 | load_spatial() 53 | tbl <- open_dataset(local_file, format='csv') 54 | write_geo(tbl, "spatial.geojson") 55 | \dontshow{\}) # examplesIf} 56 | } 57 | -------------------------------------------------------------------------------- /inst/examples/more-spatial.R: -------------------------------------------------------------------------------- 1 | library(duckdbfs) 2 | library(dplyr) 3 | library(DBI) 4 | 5 | #load_spatial() 6 | #con <- cached_connection() 7 | 8 | devtools::install_github("cboettig/duckdbfs@spatial-read") 9 | countries <- open_dataset("/vsicurl/https://github.com/cboettig/duckdbfs/raw/spatial-read/inst/extdata/world.gpkg", 10 | format = "sf", tblname = "countries") 11 | 12 | cities <- open_dataset("/vsicurl/https://github.com/cboettig/duckdbfs/raw/spatial-read/inst/extdata/metro.fgb", 13 | format = "sf", tblname = "cities") 14 | 15 | con <- cached_connection() 16 | ## We can count number of cities in each country with a bit of SQL 17 | x <- DBI::dbGetQuery(con, " 18 | SELECT countries.iso_a3, count(cities.geom) AS total 19 | FROM countries 20 | LEFT JOIN cities 21 | ON st_contains(countries.geom, cities.geom) 22 | GROUP BY countries.iso_a3 23 | ORDER BY total DESC 24 | LIMIT 6 25 | ") 26 | 27 | # in dplyr this could be nice and pretty, but `join_by` refuses the syntax 28 | countries |> 29 | left_join(cities, join_by(st_contains(geom, geom))) |> 30 | count(iso_a3, sort=TRUE) 31 | 32 | 33 | # other dplyr functions have no difficulty passing on these arguments: 34 | melbourne <- st_point(c(144.9633, -37.814)) |> st_as_text() 35 | countries |> filter(st_contains(geom, ST_GeomFromText({melbourne}))) 36 | 37 | 38 | # Aside: left_join() without count() looks like this in SQL .. 
much more verbose than dplyr 39 | x <- DBI::dbGetQuery(con," 40 | SELECT countries.iso_a3, cities.geom, countries.geom AS geometry 41 | FROM countries 42 | LEFT JOIN cities 43 | ON st_contains(countries.geom, cities.geom) 44 | ") |> as_tibble() 45 | 46 | 47 | 48 | 49 | 50 | 51 | ## accessing secure data with credentials 52 | 53 | KBAs <- "/vsis3/biodiversity/KBAsGlobal_2023_March_01_POL.shp" 54 | kba_pts <- "/vsis3/biodiversity/KBAsGlobal_2023_March_01_PNT.shp" 55 | Sys.setenv("AWS_ACCESS_KEY_ID"=Sys.getenv("NVME_KEY")) 56 | Sys.setenv("AWS_SECRET_ACCESS_KEY"=Sys.getenv("NVME_SECRET")) 57 | Sys.setenv("AWS_S3_ENDPOINT"="minio.carlboettiger.info") 58 | Sys.setenv("AWS_VIRTUAL_HOSTING"=FALSE) 59 | x <- sf::read_sf(kba_pts) 60 | kbas <- open_dataset(kba_pts, format="sf", tblname="kbas") 61 | 62 | -------------------------------------------------------------------------------- /R/to_sf.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Convert output to sf object 4 | #' 5 | #' @param x a remote duckdb `tbl` (from `open_dataset`) or dplyr-pipeline thereof. 6 | #' @param crs The coordinate reference system, any format understood by `sf::st_crs`. 7 | #' @param conn the connection object from the tbl. 8 | #' Takes a duckdb table (from `open_dataset`) or a dataset or dplyr 9 | #' pipline and returns an sf object. **Important**: the table must have 10 | #' a `geometry` column, which you will almost always have to create 11 | #' first. 12 | #' 13 | #' Note: `to_sf()` triggers collection into R. This function is suitable 14 | #' to use at the end of a dplyr pipeline that will subset the data. 15 | #' Using this function on a large dataset without filtering first may 16 | #' exceed available memory. 17 | #' @return an `sf` class object (in memory). 18 | #' 19 | #' @examplesIf interactive() 20 | #' 21 | #' library(dplyr) 22 | #' csv_file <- system.file("extdata/spatial-test.csv", package="duckdbfs") 23 | #' 24 | #' # Note that we almost always must first create a `geometry` column, e.g. 25 | #' # from lat/long columns using the `st_point` method. 26 | #' sf <- 27 | #' open_dataset(csv_file, format = "csv") |> 28 | #' mutate(geom = ST_Point(longitude, latitude)) |> 29 | #' to_sf() 30 | #' 31 | #' # We can use the full space of spatial operations, including spatial 32 | #' # and normal dplyr filters. 
All operations are translated into a 33 | #' # spatial SQL query by `to_sf`: 34 | #' open_dataset(csv_file, format = "csv") |> 35 | #' mutate(geom = ST_Point(longitude, latitude)) |> 36 | #' mutate(dist = ST_Distance(geom, ST_Point(0,0))) |> 37 | #' filter(site %in% c("a", "b", "e")) |> 38 | #' to_sf() 39 | #' 40 | #' 41 | #' @export 42 | to_sf <- function(x, 43 | crs = NA, 44 | conn = cached_connection()) { 45 | 46 | load_spatial(conn) 47 | 48 | if("geometry" %in% colnames(x)) { 49 | x <- x |> dplyr::rename(geom=geometry) 50 | geometry_column <- "geom" 51 | } 52 | sql <- x |> 53 | dplyr::mutate(geom = ST_AsWKB(geom)) |> 54 | dbplyr::sql_render() 55 | 56 | requireNamespace("sf", quietly = TRUE) 57 | out <- sf::st_read(conn, query=sql, geometry_column = "geom") 58 | if (!is.na(crs)) { 59 | sf::st_crs(out) <- crs 60 | } 61 | out 62 | } 63 | 64 | utils::globalVariables(c("ST_AsWKB", "geom", "geometry"), package = "duckdbfs") 65 | -------------------------------------------------------------------------------- /tests/testthat/test-spatial.R: -------------------------------------------------------------------------------- 1 | test_that("spatial", { 2 | skip_if_offline() # needs to be able to load the spatial module 3 | skip_if_not_installed("sf") 4 | skip_on_cran() 5 | 6 | skip_if_not(has_spatial(), "spatial extension not available") 7 | 8 | library(dplyr) 9 | library(sf) 10 | ex <- system.file("extdata/spatial-test.csv", package = "duckdbfs") |> 11 | open_dataset(format = "csv") |> 12 | dplyr::mutate(geometry = ST_Point(longitude, latitude)) |> 13 | to_sf() 14 | 15 | expect_true(TRUE) 16 | }) 17 | 18 | test_that("spatial vector read", { 19 | skip_if_not_installed("sf") 20 | skip_if_offline() # needs to be able to load the spatial module 21 | skip_on_cran() 22 | 23 | skip_if_not(has_spatial(), "spatial extension not available") 24 | 25 | # lazy-read external data ( urls work too!) 
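  # (illustration only, not executed by this test: the same call also accepts a remote URL,
  #  e.g. open_dataset("https://github.com/cboettig/duckdbfs/raw/main/inst/extdata/world.fgb", format = "sf"))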
26 | path <- system.file("extdata/world.fgb", package = "duckdbfs") 27 | x <- open_dataset(path, format = "sf") 28 | 29 | # read into R 30 | y <- x |> to_sf() 31 | 32 | expect_s3_class(x, "tbl_lazy") 33 | expect_s3_class(x, "tbl") 34 | expect_s3_class(y, "sf") 35 | }) 36 | 37 | 38 | test_that("spatial_join", { 39 | skip_if_not_installed("sf") 40 | skip_if_offline() # needs to be able to load the spatial module 41 | skip_on_cran() 42 | skip_if_not(has_spatial(), "spatial extension not available") 43 | 44 | countries <- 45 | paste0( 46 | "https://github.com/cboettig/duckdbfs/", 47 | "raw/main/inst/extdata/world.fgb" 48 | ) |> 49 | open_dataset() 50 | 51 | cities <- 52 | paste0( 53 | "https://github.com/cboettig/duckdbfs/raw/", 54 | "main/inst/extdata/metro.fgb" 55 | ) |> 56 | open_dataset() 57 | 58 | out <- 59 | countries |> 60 | dplyr::filter(iso_a3 == "AUS") |> 61 | spatial_join(cities) 62 | 63 | expect_s3_class(out, "tbl_lazy") 64 | 65 | local <- to_sf(out) 66 | expect_s3_class(local, "sf") 67 | expect_true(all(local$iso_a3 == "AUS")) 68 | 69 | ## add examples of other types of spatial joins 70 | }) 71 | 72 | 73 | ## Test st_read_meta 74 | 75 | test_that("st_read_meta", { 76 | skip_if_offline() # needs to be able to load the spatial module 77 | skip_if_not_installed("sf") 78 | skip_on_cran() 79 | skip_if_not(has_spatial(), "spatial extension not available") 80 | 81 | load_spatial() 82 | 83 | df <- 84 | "https://github.com/duckdb/duckdb_spatial/raw/main/test/data/amsterdam_roads.fgb" |> 85 | st_read_meta() 86 | expect_equal(df$code, "3857") 87 | }) 88 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | tbl_name <- function(path) { 2 | if (length(path) > 1) { 3 | path <- path[[1]] 4 | } 5 | # sql-safe names based on path 6 | name <- basename(tools::file_path_sans_ext(path)) 7 | name <- gsub("[^a-zA-Z0-9]", "_", name) 8 | ## what if it starts with a digit 9 | if (grepl("^[0-9]", name)) { 10 | name <- paste0("file_", name) 11 | } 12 | name 13 | } 14 | 15 | tmp_tbl_name <- function(n = 15) { 16 | paste0(sample(letters, n, replace = TRUE), collapse = "") 17 | } 18 | 19 | remote_src <- function(conn) { 20 | dbplyr::remote_src(conn) 21 | } 22 | 23 | strip_vsi <- function(path) { 24 | if (grepl("^/vsi\\w+/", path)) { 25 | path <- gsub("^/vsi\\w+/", "", path) 26 | } 27 | path 28 | } 29 | 30 | 31 | # Convert S3 addresses to http addresses, suitable for sharing publicly. 
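# For example (sketch; the bucket name is hypothetical and the endpoint comes from AWS_S3_ENDPOINT):
#   "s3://my-bucket/data.parquet" -> "https://<endpoint>/my-bucket/data.parquet"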
32 | # no change on paths that are local or already http 33 | s3_as_http <- function( 34 | path, 35 | endpoint = Sys.getenv("AWS_S3_ENDPOINT", "s3.amazonaws.com"), 36 | use_ssl = Sys.getenv("AWS_HTTPS", "TRUE") 37 | ) { 38 | if (use_ssl) { 39 | http <- "https" 40 | } else { 41 | http <- "http" 42 | } 43 | 44 | # handle GDAL-type paths too 45 | if (grepl("^/vsis3/", path)) { 46 | path <- gsub("^/vsis3/", glue::glue("{http}://{endpoint}/"), path) 47 | } 48 | 49 | if (grepl("^s3://", path)) { 50 | path <- gsub("^s3://", glue::glue("{http}://{endpoint}/"), path) 51 | } 52 | path 53 | } 54 | 55 | 56 | load_extension <- 57 | function( 58 | extension = "httpfs", 59 | conn = cached_connection(), 60 | nightly = getOption("duckdbfs_use_nightly", FALSE), 61 | force = FALSE 62 | ) { 63 | exts <- duckdb_extensions() 64 | source <- "" 65 | if (nightly) { 66 | source <- " FROM 'http://nightly-extensions.duckdb.org'" 67 | } 68 | status <- exts[exts$extension_name == extension, ] 69 | status_code <- 0 70 | if (force) { 71 | FORCE <- "FORCE " 72 | } else { 73 | FORCE <- "" 74 | } 75 | 76 | if (!status$installed) { 77 | if (!nightly) { 78 | DBI::dbExecute( 79 | conn, 80 | paste0(FORCE, glue::glue("INSTALL '{extension}'"), source, ";") 81 | ) 82 | } else { 83 | source <- " FROM 'http://nightly-extensions.duckdb.org'" 84 | status_code <- DBI::dbExecute( 85 | conn, 86 | paste0(FORCE, glue::glue("INSTALL '{extension}'"), source, ";") 87 | ) 88 | } 89 | } 90 | if (!status$loaded) { 91 | status_code <- DBI::dbExecute(conn, glue::glue("LOAD '{extension}';")) 92 | } 93 | 94 | invisible(status_code) 95 | } 96 | -------------------------------------------------------------------------------- /tests/testthat/test-open_dataset.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("local csv files", { 3 | cars <- tempfile() 4 | write.csv(mtcars, cars) 5 | df <- open_dataset(cars, format = "csv") 6 | expect_true(inherits(df, "tbl_duckdb_connection")) 7 | unlink(cars) 8 | 9 | close_connection() 10 | 11 | 12 | }) 13 | 14 | test_that("duckdb_s3_config", { 15 | 16 | skip_if_offline() 17 | skip_on_cran() 18 | status <- duckdb_s3_config( 19 | s3_access_key_id = "YOUR_ACCESS_KEY_ID", 20 | s3_secret_access_key = "YOUR_SECRET_ACCESS_KEY", 21 | s3_endpoint = "YOUR_S3_ENDPOINT", 22 | s3_region = "YOUR_S3_REGION", 23 | s3_uploader_max_filesize = "800GB", 24 | s3_uploader_max_parts_per_file = 1000, 25 | s3_url_compatibility_mode = FALSE, 26 | s3_url_style = "vhost", 27 | s3_use_ssl = TRUE) 28 | 29 | expect_identical(status, 0) 30 | 31 | 32 | }) 33 | 34 | 35 | test_that("https", { 36 | 37 | skip_if_offline() 38 | skip_on_cran() 39 | 40 | base <- paste0("https://github.com/duckdb/duckdb/raw/main/", 41 | "data/parquet-testing/hive-partitioning/union_by_name/") 42 | f1 <- paste0(base, "x=1/f1.parquet") 43 | f2 <- paste0(base, "x=1/f2.parquet") 44 | f3 <- paste0(base, "x=2/f2.parquet") 45 | 46 | conn <- cached_connection() 47 | ds <- open_dataset( c(f1,f2,f3), 48 | conn = conn, 49 | unify_schemas = TRUE) 50 | expect_s3_class(ds, "tbl") 51 | 52 | df <- dplyr::collect(ds) 53 | expect_s3_class(df, "data.frame") 54 | close_connection(conn) 55 | }) 56 | 57 | 58 | test_that("close_connection", { 59 | 60 | skip_on_cran() 61 | 62 | close_connection() 63 | close_connection() 64 | expect_true(TRUE) 65 | }) 66 | 67 | 68 | 69 | test_that("s3", { 70 | 71 | skip_if_offline() 72 | skip_on_cran() 73 | close_connection() 74 | parquet <- "s3://gbif-open-data-us-east-1/occurrence/2023-06-01/occurrence.parquet" 
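  # public GBIF bucket: request anonymous access and set the bucket's region (us-east-1) explicitly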
75 | gbif <- open_dataset(parquet, 76 | anonymous = TRUE, 77 | s3_region="us-east-1") 78 | expect_s3_class(gbif, "tbl_dbi") 79 | expect_s3_class(gbif, "tbl") 80 | 81 | }) 82 | 83 | 84 | 85 | test_that("custom csv parsing", { 86 | cars <- tempfile() 87 | write.table(mtcars, cars, row.names = FALSE) 88 | df <- open_dataset(cars, format = "csv", parser_options = c(delim = "' '", header = TRUE)) 89 | expect_true(inherits(df, "tbl_duckdb_connection")) 90 | df <- dplyr::collect(df) 91 | expect_true(nrow(df) > 1) 92 | expect_true(ncol(df) > 1) 93 | expect_true("mpg" %in% names(df)) 94 | unlink(cars) 95 | 96 | close_connection() 97 | 98 | 99 | }) 100 | -------------------------------------------------------------------------------- /R/duckdb_secrets.R: -------------------------------------------------------------------------------- 1 | 2 | #' duckdb secrets 3 | #' 4 | #' Configure the duckdb secrets for remote access. 5 | #' @inheritParams open_dataset 6 | #' @param key key 7 | #' @param secret secret 8 | #' @param endpoint endpoint address 9 | #' @param region AWS region (ignored by some other S3 providers) 10 | #' @param bucket restricts the "SCOPE" of this key to only objects in this 11 | #' bucket-name. note that the bucket name is currently insensitive to endpoint 12 | #' @param url_style path or vhost, for S3 13 | #' @param use_ssl Use SSL address (https instead of http), default TRUE 14 | #' @param url_compatibility_mode optional mode for increased compatibility with some endpoints 15 | #' @param session_token AWS session token, used in some AWS authentication with short-lived tokens 16 | #' @param type Key type, e.g. S3. See duckdb docs for details. 17 | #' references 18 | #' @export 19 | duckdb_secrets <- function(key = Sys.getenv("AWS_ACCESS_KEY_ID", ""), 20 | secret = Sys.getenv("AWS_SECRET_ACCESS_KEY", ""), 21 | endpoint = Sys.getenv("AWS_S3_ENDPOINT", 22 | "s3.amazonaws.com"), 23 | region = Sys.getenv("AWS_REGION", "us-east-1"), 24 | bucket = NULL, 25 | url_style = NULL, 26 | use_ssl = Sys.getenv("AWS_HTTPS", "TRUE"), 27 | url_compatibility_mode = TRUE, 28 | session_token = Sys.getenv("AWS_SESSION_TOKEN", ""), 29 | type = "S3", 30 | conn = cached_connection()) { 31 | 32 | g <- glue::glue 33 | 34 | if (!is.null(url_style)){ 35 | url_style <- g("URL_STYLE '{url_style}'") 36 | } else { 37 | if (grepl('amazonaws.com', endpoint)) { 38 | url_style <- "URL_STYLE 'vhost'" 39 | } else if (type == "S3") { 40 | url_style <- "URL_STYLE 'path'" 41 | } 42 | } 43 | 44 | 45 | if (!is.null(bucket)) { 46 | bucket <- g("SCOPE 's3://{bucket}'") 47 | } 48 | 49 | if(!is.null(session_token) || session_token != "") { 50 | session_token <- g("SESSION_TOKEN '{session_token}'") 51 | } 52 | 53 | query <- paste0( 54 | g("CREATE OR REPLACE SECRET s3_{key} ("), 55 | paste(c( 56 | g("TYPE {type}"), 57 | g("KEY_ID '{key}'"), 58 | g("SECRET '{secret}'"), 59 | g("ENDPOINT '{endpoint}'"), 60 | g("REGION '{region}'"), 61 | g("URL_COMPATIBILITY_MODE {url_compatibility_mode}"), 62 | g("USE_SSL {use_ssl}"), 63 | url_style, 64 | bucket, 65 | session_token), 66 | collapse = ", " 67 | ), 68 | ");" 69 | ) 70 | 71 | DBI::dbExecute(conn, query) 72 | } 73 | 74 | -------------------------------------------------------------------------------- /R/to_geojson.R: -------------------------------------------------------------------------------- 1 | #' Write geojson using duckdb's native JSON writer 2 | #' 3 | #' @inheritParams write_dataset 4 | #' @param id_col (deprecated). 
to_geojson() will preserve all atomic columns 5 | #' as properties. 6 | #' @param as_http convert returned S3 path to URL (e.g. for public buckets) 7 | #' @param server aws endpoint if converting s3 path to URL 8 | #' @param use_ssl should url use https 9 | #' @return path, invisibly 10 | #' @export 11 | to_geojson <- function( 12 | dataset, 13 | path, 14 | conn = cached_connection(), 15 | id_col = NULL, 16 | as_http = FALSE, 17 | server = Sys.getenv("AWS_S3_ENDPOINT", "s3.amazonaws.com"), 18 | use_ssl = Sys.getenv("AWS_HTTPS", "TRUE") 19 | ) { 20 | # In geojson it must be called "geometry" 21 | dataset <- safe_geometry_name(dataset) 22 | # Forget about nested list columns/properties 23 | dataset <- drop_nested_cols(dataset) 24 | 25 | who <- colnames(dataset) 26 | properties <- who[who != "geometry"] 27 | 28 | collection <- glue::glue_sql("'FeatureCollection'", .con = conn) 29 | sql <- dbplyr::sql_render(dataset) 30 | 31 | # Build the properties object dynamically 32 | prop_pairs <- paste0("'", properties, "': t1.", properties, collapse = ", ") 33 | 34 | q <- glue::glue( 35 | " 36 | COPY ( 37 | WITH t1 AS (<sql>) 38 | SELECT json_group_array( 39 | {'type': 'Feature', 40 | 'properties': {<prop_pairs>}, 41 | 'geometry': ST_AsGeoJSON(t1.geometry) 42 | }) as features, 43 | <collection> as type 44 | FROM t1 45 | ) TO '<path>' (FORMAT json); 46 | ", 47 | .open = "<", 48 | .close = ">" 49 | ) 50 | 51 | DBI::dbExecute(conn, q) 52 | 53 | if (as_http) { 54 | path <- s3_as_http(path, server, use_ssl) 55 | } 56 | 57 | invisible(path) 58 | } 59 | 60 | # Make geometry column always called "geometry" (GeoJSON standard name) 61 | safe_geometry_name <- function(dataset) { 62 | # FIXME identify geometry-type column in duckdb. 63 | # error if there are multiple such. 64 | if ("geom" %in% colnames(dataset)) { 65 | dataset <- dplyr::rename(dataset, geometry = geom) 66 | } 67 | if ("Shape" %in% colnames(dataset)) { 68 | dataset <- dplyr::rename(dataset, geometry = Shape) 69 | } 70 | if ("SHAPE" %in% colnames(dataset)) { 71 | dataset <- dplyr::rename(dataset, geometry = SHAPE) 72 | } 73 | dataset 74 | } 75 | 76 | drop_nested_cols <- function(gdf) { 77 | # Use native R types from parsing first row.
78 | x <- utils::head(gdf, 1) 79 | # lazy tables must be collected before column types can be inspected locally 80 | if (inherits(x, "tbl_lazy")) x <- dplyr::collect(x) 81 | keep <- lapply(x, function(x) is.atomic(x) || inherits(x, "sfc")) 82 | cols <- c(names(keep[unlist(keep)]), "geometry") 83 | dplyr::select(gdf, dplyr::any_of(cols)) 84 | } 85 | 86 | utils::globalVariables( 87 | c("geom", "geometry", "Shape", "SHAPE"), 88 | package = "duckdbfs" 89 | ) 90 | 91 | # smoketest 92 | #local_file <- system.file("extdata/world.fgb", package = "duckdbfs") 93 | #dataset <- open_dataset(local_file, format = 'sf') |> head(3) 94 | #dataset |> to_geojson("testme.json") 95 | #terra::vect("testme.json") 96 | -------------------------------------------------------------------------------- /man/spatial_join.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spatial_join.R 3 | \name{spatial_join} 4 | \alias{spatial_join} 5 | \title{spatial_join} 6 | \usage{ 7 | spatial_join( 8 | x, 9 | y, 10 | by = c("st_intersects", "st_within", "st_dwithin", "st_touches", "st_contains", 11 | "st_containsproperly", "st_covers", "st_overlaps", "st_crosses", "st_equals", 12 | "st_disjoint"), 13 | args = "", 14 | join = "left", 15 | tblname = tmp_tbl_name(), 16 | conn = cached_connection() 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{a duckdb table with a spatial geometry column called "geom"} 21 | 22 | \item{y}{a duckdb table with a spatial geometry column called "geom"} 23 | 24 | \item{by}{A spatial join function, see details.} 25 | 26 | \item{args}{additional arguments to join function (e.g. distance for st_dwithin)} 27 | 28 | \item{join}{JOIN type (left, right, inner, full)} 29 | 30 | \item{tblname}{name for the temporary view} 31 | 32 | \item{conn}{the duckdb connection (imputed by duckdbfs by default, 33 | must be shared across both tables)} 34 | } 35 | \value{ 36 | a (lazy) view of the resulting table. Users can continue to operate 37 | on it using dplyr operations and call to_sf() to collect this as an sf object. 38 | } 39 | \description{ 40 | spatial_join 41 | } 42 | \details{ 43 | Possible \href{https://postgis.net/workshops/postgis-intro/spatial_relationships.html}{spatial joins} include:\tabular{ll}{ 44 | Function \tab Description \cr 45 | st_intersects \tab Geometry A intersects with geometry B \cr 46 | st_disjoint \tab The complement of intersects \cr 47 | st_within \tab Geometry A is within geometry B (complement of contains) \cr 48 | st_dwithin \tab Geometries are within a specified distance, expressed in the same units as the coordinate reference system. \cr 49 | st_touches \tab Two polygons touch if they have at least one point in common, even if their interiors do not touch. \cr 50 | st_contains \tab Geometry A entirely contains geometry B. (complement of within) \cr 51 | st_containsproperly \tab stricter version of \code{st_contains} (boundary counts as external) \cr 52 | st_covers \tab geometry B is inside or on boundary of A. (A polygon covers a point on its boundary but does not contain it.) \cr 53 | st_overlaps \tab geometry A intersects but does not completely contain geometry B \cr 54 | st_equals \tab geometry A is equal to geometry B \cr 55 | st_crosses \tab Lines or points in geometry A cross geometry B. \cr 56 | } 57 | 58 | 59 | Although SQL is not case sensitive, this function expects only 60 | lower case names for "by" functions.
61 | } 62 | \examples{ 63 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 64 | 65 | # note we can read in remote data in a variety of vector formats: 66 | countries <- 67 | paste0("/vsicurl/", 68 | "https://github.com/cboettig/duckdbfs/", 69 | "raw/spatial-read/inst/extdata/world.gpkg") |> 70 | open_dataset(format = "sf") 71 | 72 | cities <- 73 | paste0("/vsicurl/https://github.com/cboettig/duckdbfs/raw/", 74 | "spatial-read/inst/extdata/metro.fgb") |> 75 | open_dataset(format = "sf") 76 | 77 | countries |> 78 | dplyr::filter(iso_a3 == "AUS") |> 79 | spatial_join(cities) 80 | \dontshow{\}) # examplesIf} 81 | } 82 | -------------------------------------------------------------------------------- /man/duckdb_s3_config.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/duckdb_config.R 3 | \name{duckdb_s3_config} 4 | \alias{duckdb_s3_config} 5 | \title{Configure S3 settings for database connection} 6 | \usage{ 7 | duckdb_s3_config( 8 | conn = cached_connection(), 9 | s3_access_key_id = NULL, 10 | s3_secret_access_key = NULL, 11 | s3_endpoint = NULL, 12 | s3_region = NULL, 13 | s3_session_token = NULL, 14 | s3_uploader_max_filesize = NULL, 15 | s3_uploader_max_parts_per_file = NULL, 16 | s3_uploader_thread_limit = NULL, 17 | s3_url_compatibility_mode = NULL, 18 | s3_url_style = NULL, 19 | s3_use_ssl = NULL, 20 | anonymous = NULL 21 | ) 22 | } 23 | \arguments{ 24 | \item{conn}{A database connection object created using the 25 | \code{cache_connection} function (default: \code{cache_connection()}).} 26 | 27 | \item{s3_access_key_id}{The S3 access key ID (default: \code{NULL}).} 28 | 29 | \item{s3_secret_access_key}{The S3 secret access key (default: \code{NULL}).} 30 | 31 | \item{s3_endpoint}{The S3 endpoint (default: \code{NULL}).} 32 | 33 | \item{s3_region}{The S3 region (default: \code{NULL}).} 34 | 35 | \item{s3_session_token}{The S3 session token (default: \code{NULL}).} 36 | 37 | \item{s3_uploader_max_filesize}{The maximum filesize for S3 uploader 38 | (between 50GB and 5TB, default 800GB).} 39 | 40 | \item{s3_uploader_max_parts_per_file}{The maximum number of parts per file 41 | for S3 uploader (between 1 and 10000, default 10000).} 42 | 43 | \item{s3_uploader_thread_limit}{The thread limit for S3 uploader 44 | (default: 50).} 45 | 46 | \item{s3_url_compatibility_mode}{Disable Globs and Query Parameters on 47 | S3 URLs (default: 0, allows globs/queries).} 48 | 49 | \item{s3_url_style}{The style of S3 URLs to use. Default is 50 | "vhost" unless s3_endpoint is set, which makes default "path" 51 | (i.e. MINIO systems).} 52 | 53 | \item{s3_use_ssl}{Enable or disable SSL for S3 connections 54 | (default: 1 (TRUE)).} 55 | 56 | \item{anonymous}{request anonymous access (sets \code{s3_access_key_id} and 57 | \code{s3_secret_access_key} to \code{""}, allowing anonymous access to public buckets).} 58 | } 59 | \value{ 60 | Returns silently (NULL) if successful. 61 | } 62 | \description{ 63 | This function is used to configure S3 settings for a database connection. 64 | It allows you to set various S3-related parameters such as access key, 65 | secret access key, endpoint, region, session token, uploader settings, 66 | URL compatibility mode, URL style, and SSL usage. 
67 | } 68 | \details{ 69 | see \url{https://duckdb.org/docs/sql/configuration.html} 70 | } 71 | \examples{ 72 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 73 | # Configure S3 settings 74 | duckdb_s3_config( 75 | s3_access_key_id = "YOUR_ACCESS_KEY_ID", 76 | s3_secret_access_key = "YOUR_SECRET_ACCESS_KEY", 77 | s3_endpoint = "YOUR_S3_ENDPOINT", 78 | s3_region = "YOUR_S3_REGION", 79 | s3_uploader_max_filesize = "800GB", 80 | s3_uploader_max_parts_per_file = 100, 81 | s3_uploader_thread_limit = 8, 82 | s3_url_compatibility_mode = FALSE, 83 | s3_url_style = "vhost", 84 | s3_use_ssl = TRUE, 85 | anonymous = TRUE) 86 | \dontshow{\}) # examplesIf} 87 | } 88 | -------------------------------------------------------------------------------- /inst/examples/s3-spatial.R: -------------------------------------------------------------------------------- 1 | 2 | library(dplyr) 3 | library(duckdbfs) 4 | system.file("extdata/spatial-test.csv", package="duckdbfs") |> 5 | open_dataset(format = "csv") |> 6 | mutate(geometry = ST_Point(longitude, latitude)) |> 7 | to_sf() 8 | 9 | library(dplyr) 10 | library(sf) 11 | library(spData) 12 | library(duckdbfs) 13 | 14 | gbif <- duckdbfs::open_dataset("s3://gbif-open-data-us-east-1/occurrence/2022-12-01/occurrence.parquet/**", tblname = "gbif") 15 | duckdbfs::load_spatial() 16 | con <- duckdbfs::cached_connection() 17 | 18 | # let's filter the parquet data by spatial polygon! 19 | # of course it would be much faster using x/y lims from bbox using vanilla SQL, this is just a proof-of-concept 20 | costa_rica <- world |> filter(grepl("Costa Rica", name_long)) |> pull(geom) |> st_as_text() 21 | 22 | ## FIXME wrap this so we don't need cached_connection() call and sql_render() and st_read? 23 | 24 | bench::bench_time({ 25 | sql <- gbif |> 26 | mutate(geometry = ST_Point(decimallongitude, decimallatitude), 27 | geom = ST_AsWKB(geometry)) |> 28 | filter(class == "Mammalia") |> 29 | filter(ST_Within(geometry, ST_GeomFromText({costa_rica}))) |> 30 | dbplyr::sql_render() 31 | 32 | cr_species <- st_read(con, query=sql, geometry_column = "geom", EWKB=FALSE) 33 | cr_species |> as_tibble() 34 | }) 35 | 36 | 37 | 38 | 39 | devtools::load_all() 40 | library(dplyr) 41 | costa_rica <- spData::world |> 42 | dplyr::filter(grepl("Costa Rica", name_long)) |> 43 | dplyr::pull(geom) |> 44 | sf::st_as_text() 45 | 46 | gbif <- duckdbfs::open_dataset("s3://gbif-open-data-us-east-1/occurrence/2022-12-01/occurrence.parquet/**", tblname = "gbif") 47 | x <- 48 | gbif |> 49 | mutate(geometry = ST_Point(decimallongitude, decimallatitude)) |> 50 | filter(class == "Mammalia") |> 51 | filter(ST_Within(geometry, ST_GeomFromText({costa_rica}))) 52 | x |> to_sf() 53 | 54 | 55 | 56 | 57 | 58 | con <- duckdbfs::cached_connection() 59 | load_spatial(con) 60 | sql <- x |> 61 | dplyr::mutate(wkb_geometry = ST_AsWKB(geometry)) |> 62 | dbplyr::sql_render() 63 | sf::st_read(con, query=sql, geometry_column = "wkb_geometry") 64 | 65 | 66 | 67 | 68 | 69 | ## with helpers 70 | species <- gbif |> 71 | mutate(geometry = ST_Point(decimallongitude, decimallatitude)) |> 72 | filter(class == "Mammalia") |> 73 | filter(ST_Within(geometry, ST_GeomFromText({costa_rica}))) |> 74 | to_sf() 75 | 76 | 77 | 78 | ## compare to: 79 | bench::bench_time({ 80 | ex <- gbif |> 81 | filter(class == "Mammalia", 82 | between(decimallongitude, -85.94, -82.55), 83 | between(decimallatitude, 8.22, 11.22)) |> 84 | collect() 85 | 86 | }) 87 | 88 | 89 | 90 | 91 | ## boilerplate setup 92 | library(duckdb) 93 | conn <- 
DBI::dbConnect(duckdb::duckdb()) 94 | status <- DBI::dbExecute(conn, "INSTALL 'spatial';") 95 | status <- DBI::dbExecute(conn, "LOAD 'spatial';") 96 | test <- data.frame(site = letters[1:10], latitude = 1:10, longitude = 1:10) 97 | DBI::dbWriteTable(conn, "test", test) 98 | 99 | ## Here we go: 100 | sql <- tbl(conn, "test") |> 101 | mutate(geom = ST_AsWKB(ST_Point(longitude, latitude))) |> 102 | dbplyr::sql_render() 103 | 104 | ex <- st_read(conn, query=sql, geometry_column = "geom", EWKB=FALSE) 105 | 106 | -------------------------------------------------------------------------------- /man/cached_connection.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cached_connection.R 3 | \name{cached_connection} 4 | \alias{cached_connection} 5 | \alias{duckdb_connect} 6 | \title{create a cachable duckdb connection} 7 | \usage{ 8 | cached_connection( 9 | dbdir = ":memory:", 10 | read_only = FALSE, 11 | bigint = "numeric", 12 | config = list(temp_directory = tempfile()), 13 | autoload_exts = getOption("duckdbfs_autoload_extensions", TRUE), 14 | with_spatial = not_windows() && getOption("duckdbfs_autoload_extensions", TRUE), 15 | with_h3 = not_windows() && getOption("duckdbfs_autoload_extensions", TRUE) 16 | ) 17 | } 18 | \arguments{ 19 | \item{dbdir}{Location for database files. Should be a path to an existing 20 | directory in the file system. With the default (or \code{""}), all 21 | data is kept in RAM.} 22 | 23 | \item{read_only}{Set to \code{TRUE} for read-only operation. 24 | For file-based databases, this is only applied when the database file is opened for the first time. 25 | Subsequent connections (via the same \code{drv} object or a \code{drv} object pointing to the same path) 26 | will silently ignore this flag.} 27 | 28 | \item{bigint}{How 64-bit integers should be returned. There are two options: \code{"numeric"} and \code{"integer64"}. 29 | If \code{"numeric"} is selected, bigint integers will be treated as double/numeric. 30 | If \code{"integer64"} is selected, bigint integers will be set to bit64 encoding.} 31 | 32 | \item{config}{Named list with DuckDB configuration flags, see 33 | \url{https://duckdb.org/docs/configuration/overview#configuration-reference} for the possible options. 34 | These flags are only applied when the database object is instantiated. 35 | Subsequent connections will silently ignore these flags.} 36 | 37 | \item{autoload_exts}{should we auto-load extensions? TRUE by default, 38 | can be configured with \code{options(duckdbfs_autoload_extensions = FALSE)}} 39 | 40 | \item{with_spatial}{install (if missing) and load spatial extension, default TRUE 41 | Opt out by closing any active cached connection first (with 42 | \code{close_connection()}) and re-instantiating the with 43 | \code{connect(with_spatial = FALSE)}.} 44 | 45 | \item{with_h3}{install (if missing) and load the h3 spatial index extension. Default TRUE} 46 | } 47 | \value{ 48 | a \code{\link[duckdb:duckdb]{duckdb::duckdb()}} connection object 49 | } 50 | \description{ 51 | This function is primarily intended for internal use by other 52 | \code{duckdbfs} functions. However, it can be called directly by 53 | the user whenever it is desirable to have direct access to the 54 | connection object. 
55 | } 56 | \details{ 57 | When first called (by a user or internal function), 58 | this function both creates a duckdb connection and places 59 | that connection into a cache (\code{duckdbfs_conn} option). 60 | On subsequent calls, this function returns the cached connection, 61 | rather than recreating a fresh connection. 62 | 63 | This frees the user from the responsibility of managing a 64 | connection object, because functions needing access to the 65 | connection can use this to create or access the existing connection. 66 | At the close of the global environment, this function's finalizer 67 | should gracefully shutdown the connection before removing the cache. 68 | 69 | By default, this function creates an in-memory connection. When reading 70 | from on-disk or remote files (parquet or csv), this option can still 71 | effectively support most operations on much-larger-than-RAM data. 72 | However, some operations require additional working space, so by default 73 | we set a temporary storage location in configuration as well. 74 | } 75 | \examples{ 76 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 77 | 78 | con <- cached_connection() 79 | close_connection(con) 80 | \dontshow{\}) # examplesIf} 81 | } 82 | -------------------------------------------------------------------------------- /R/write_dataset.R: -------------------------------------------------------------------------------- 1 | #' write_dataset 2 | #' 3 | #' @param dataset a remote tbl object from `open_dataset`, 4 | #' or an in-memory data.frame. 5 | #' @param path a local file path or S3 path with write credentials 6 | #' @param conn duckdbfs database connection 7 | #' @param format export format 8 | #' @param partitioning names of columns to use as partition variables 9 | #' @param overwrite allow overwriting of existing files? 10 | #' @param as_http if path is an S3 location, will return corresponding HTTP address. 11 | #' @param options Additional arguments to COPY, see 12 | #' Note, uses duckdb native syntax, e.g. c("PER_THREAD_OUTPUT false"), for named arguments, see examples. 13 | #' (Recall SQL is case-insensitive). 14 | #' @param ... additional arguments to [duckdb_s3_config()] 15 | #' 16 | #' @seealso to_sf to_json to_geojson write_geo 17 | #' @examplesIf interactive() 18 | #' write_dataset(mtcars, tempfile()) 19 | #' @return Returns the path, invisibly. 20 | #' @export 21 | #' @examplesIf interactive() 22 | #' write_dataset(mtcars, tempdir()) 23 | #' write_dataset(mtcars, tempdir(), options = c("PER_THREAD_OUTPUT FALSE", "RETURN_STATS TRUE")) 24 | #' 25 | write_dataset <- function( 26 | dataset, 27 | path, 28 | conn = cached_connection(), 29 | format = c("parquet", "csv"), 30 | partitioning = dplyr::group_vars(dataset), 31 | overwrite = TRUE, 32 | options = list(), 33 | as_http = FALSE, 34 | ... 35 | ) { 36 | format <- match.arg(format) 37 | version <- DBI::dbExecute(conn, "PRAGMA version;") 38 | 39 | if (is_not_remote(dataset)) { 40 | tblname = tmp_tbl_name() 41 | DBI::dbWriteTable(conn, name = tblname, value = dataset) 42 | } else { 43 | tblname <- as.character(remote_name(dataset, conn)) 44 | } 45 | 46 | ## local writes use different notation to allow overwrites: 47 | allow_overwrite <- character(0) 48 | if (overwrite) { 49 | allow_overwrite <- "OVERWRITE_OR_IGNORE" 50 | } 51 | 52 | path <- parse_uri(path, conn = conn, recursive = FALSE) 53 | if (grepl("^s3://", path)) { 54 | duckdb_s3_config(conn = conn, ...) 
55 | } 56 | 57 | partition_by <- character(0) 58 | if (length(partitioning) > 0) { 59 | partition_by <- paste0( 60 | "PARTITION_BY (", 61 | paste(partitioning, collapse = ", "), 62 | ") " 63 | ) 64 | } 65 | 66 | format <- toupper(format) 67 | format_by <- glue::glue("FORMAT {format}") 68 | options_vec <- c(format_by, partition_by, allow_overwrite, options) 69 | copy_options <- glue::glue_collapse(options_vec, sep = ", ") 70 | 71 | copy <- glue::glue("COPY {tblname} TO '{path}' ") 72 | query <- glue::glue(copy, "({copy_options})", ";") 73 | status <- DBI::dbSendQuery(conn, query) 74 | 75 | if (as_http) { 76 | path <- s3_as_http(path) 77 | } 78 | 79 | invisible(path) 80 | } 81 | 82 | is_not_remote <- function(x) { 83 | is.null(suppressWarnings(dbplyr::remote_src(x))) 84 | } 85 | 86 | 87 | remote_name <- function(x, con) { 88 | out <- dbplyr::remote_name(x) 89 | if (is.null(out)) { 90 | out <- paste0("(", dbplyr::sql_render(x$lazy_query, con = con), ")") 91 | } 92 | out 93 | } 94 | 95 | #' as_dataset 96 | #' 97 | #' Push a local (in-memory) dataset into a the duckdb database as a table. 98 | #' This enables it to share the connection source with other data. 99 | #' This is equivalent to the behavior of copy=TRUE on many (but not all) of the two-table verbs in dplyr. 100 | #' @param df a local data frame. Otherwise will be passed back without side effects 101 | #' @return a remote `dplyr::tbl` connection to the table. 102 | #' @inheritParams open_dataset 103 | #' @export 104 | as_dataset <- function(df, conn = cached_connection()) { 105 | if (is_not_remote(df)) { 106 | tblname = tmp_tbl_name() 107 | DBI::dbWriteTable(conn, name = tblname, value = df) 108 | df = dplyr::tbl(conn, tblname) 109 | } 110 | return(df) 111 | } 112 | 113 | #local_file <- system.file("extdata/world.fgb", package="duckdbfs") 114 | #dataset <- open_dataset(local_file, format='sf') |> head(3) 115 | #dataset |> to_geojson("testme.json") 116 | #terra::vect("testme.json") 117 | -------------------------------------------------------------------------------- /man/open_dataset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_dataset.R 3 | \name{open_dataset} 4 | \alias{open_dataset} 5 | \title{Open a dataset from a variety of sources} 6 | \usage{ 7 | open_dataset( 8 | sources, 9 | schema = NULL, 10 | hive_style = TRUE, 11 | unify_schemas = FALSE, 12 | format = c("parquet", "csv", "tsv", "sf"), 13 | conn = cached_connection(), 14 | tblname = tmp_tbl_name(), 15 | mode = "VIEW", 16 | filename = FALSE, 17 | recursive = TRUE, 18 | parser_options = list(), 19 | ... 20 | ) 21 | } 22 | \arguments{ 23 | \item{sources}{A character vector of paths to the dataset files.} 24 | 25 | \item{schema}{The schema for the dataset. If NULL, the schema will be 26 | inferred from the dataset files.} 27 | 28 | \item{hive_style}{A logical value indicating whether to the dataset uses 29 | Hive-style partitioning.} 30 | 31 | \item{unify_schemas}{A logical value indicating whether to unify the schemas 32 | of the dataset files (union_by_name). If TRUE, will execute a UNION by 33 | column name across all files (NOTE: this can add considerably to 34 | the initial execution time)} 35 | 36 | \item{format}{The format of the dataset files. One of \code{"parquet"}, \code{"csv"}, 37 | or \code{"sf"} (spatial vector files supported by the sf package / GDAL). 
38 | if no argument is provided, the function will try to guess the type based 39 | on minimal heuristics.} 40 | 41 | \item{conn}{A connection to a database.} 42 | 43 | \item{tblname}{The name of the table to create in the database.} 44 | 45 | \item{mode}{The mode to create the table in. One of \code{"VIEW"} or \code{"TABLE"}. 46 | Creating a \code{VIEW}, the default, will execute more quickly because it 47 | does not create a local copy of the dataset. \code{TABLE} will create a local 48 | copy in duckdb's native format, downloading the full dataset if necessary. 49 | When using \code{TABLE} mode with large data, please be sure to use a \code{conn} 50 | connections with disk-based storage, e.g. by calling \code{\link[=cached_connection]{cached_connection()}}, 51 | e.g. \code{cached_connection("storage_path")}, otherwise the full data must fit 52 | into RAM. Using \code{TABLE} assumes familiarity with R's DBI-based interface.} 53 | 54 | \item{filename}{A logical value indicating whether to include the filename in 55 | the table name.} 56 | 57 | \item{recursive}{should we assume recursive path? default TRUE. Set to FALSE 58 | if trying to open a single, un-partitioned file.} 59 | 60 | \item{parser_options}{additional options passed to the parser, e.g. to 61 | read_csv(), see \url{https://duckdb.org/docs/stable/data/csv/overview.html#parameters}} 62 | 63 | \item{...}{optional additional arguments passed to \code{\link[=duckdb_s3_config]{duckdb_s3_config()}}. 64 | Note these apply after those set by the URI notation and thus may be used 65 | to override or provide settings not supported in that format.} 66 | } 67 | \value{ 68 | A lazy \code{dplyr::tbl} object representing the opened dataset backed 69 | by a duckdb SQL connection. Most \code{dplyr} (and some \code{tidyr}) verbs can be 70 | used directly on this object, as they can be translated into SQL commands 71 | automatically via \code{dbplyr}. Generic R commands require using 72 | \code{\link[dplyr:compute]{dplyr::collect()}} on the table, which forces evaluation and reading the 73 | resulting data into memory. 74 | } 75 | \description{ 76 | This function opens a dataset from a variety of sources, including Parquet, 77 | CSV, etc, using either local file system paths, URLs, or S3 bucket URI 78 | notation. 
79 | } 80 | \examples{ 81 | \dontshow{if (interactive()) withAutoprint(\{ # examplesIf} 82 | # A remote, hive-partitioned Parquet dataset 83 | base <- paste0("https://github.com/duckdb/duckdb/raw/main/", 84 | "data/parquet-testing/hive-partitioning/union_by_name/") 85 | f1 <- paste0(base, "x=1/f1.parquet") 86 | f2 <- paste0(base, "x=1/f2.parquet") 87 | f3 <- paste0(base, "x=2/f2.parquet") 88 | 89 | open_dataset(c(f1,f2,f3), unify_schemas = TRUE) 90 | 91 | # Access an S3 database specifying an independently-hosted (MINIO) endpoint 92 | efi <- open_dataset("s3://neon4cast-scores/parquet/aquatics", 93 | s3_access_key_id="", 94 | s3_endpoint="data.ecoforecast.org") 95 | 96 | # Use parser-options for non-standard csv: 97 | cars <- tempfile() # dummy data 98 | write.table(mtcars, cars, row.names = FALSE) 99 | 100 | # Note nested quotes on parser option for delimiter: 101 | df <- open_dataset(cars, format = "csv", 102 | parser_options = c(delim = "' '", header = TRUE)) 103 | \dontshow{\}) # examplesIf} 104 | } 105 | -------------------------------------------------------------------------------- /R/parse_uri.R: -------------------------------------------------------------------------------- 1 | parse_uri <- function(sources, conn = cached_connection(), recursive = TRUE) { 2 | sources 3 | } 4 | 5 | 6 | # Attempts URI compatibility with arrow 7 | # Maybe not a good idea.... 8 | deprecated <- list( 9 | parse_uri <- function(sources, conn = cached_connection(), recursive = TRUE) { 10 | 11 | if(any(grepl("^\\w+://", sources))) { 12 | # local file paths that don't require network should not attempt to load it 13 | # Maybe unnecessary as httpfs should be bundled with R's binary duckdb 14 | load_httpfs(conn) 15 | } 16 | 17 | # http URLs pass through as is, can't do recursion 18 | if(any(grepl("^http", sources))) { 19 | return(sources) 20 | } 21 | 22 | ## for now only parse sources of length-1 23 | if(length(sources) > 1) return(sources) 24 | 25 | if (grepl("^s3://", sources)) { 26 | # first strip any * for compatibility 27 | # sources <- gsub("/\\*+$", "", sources) 28 | 29 | if(grepl("\\?", sources)) { 30 | warning("using a query parameter to configure S3 is deprecated") 31 | 32 | url <- url_parse(sources) 33 | scheme <- url$query[["scheme"]] 34 | use_ssl <- !identical(scheme, "http") 35 | 36 | if(identical(url$username, "anonymous")) { 37 | url$username <- "" 38 | url$password <- "" 39 | } 40 | 41 | duckdb_s3_config(conn = conn, 42 | s3_access_key_id = url$username, 43 | s3_secret_access_key = url$password, 44 | s3_session_token = url$token, 45 | s3_endpoint = url$query[["endpoint_override"]], 46 | s3_region = url$query[["region"]], 47 | s3_use_ssl = as.integer(use_ssl)) 48 | 49 | sources <- paste0(url$scheme, "://", url$hostname, url$path) 50 | } 51 | } 52 | 53 | if(recursive) { 54 | # Don't use recursive directory globs if we know it is a local file, 55 | # or if it has a standard extension, 56 | # or already ends with a glob 57 | # Otherwise, we append the "/**". 
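        # e.g. (sketch of the intended rewrite): "s3://bucket/path/" -> "s3://bucket/path/**"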
58 | if ( !fs::is_file(sources) && 59 | !grepl("\\*$", sources) && 60 | (!grepl("\\.parquet$", sources) || 61 | !grepl("\\.csv$", sources) || 62 | !grepl("\\.csv.gz$", sources) ) 63 | ){ 64 | sources <- gsub("\\/$", "", sources) 65 | sources <- paste0(sources, "/**") 66 | } 67 | } 68 | sources 69 | } 70 | ) 71 | 72 | 73 | 74 | ## Adapted from httr2 0.2.3, MIT License, RStudio 75 | parse_match <- function(x,pattern) { 76 | m <- regexec(pattern, x, perl = TRUE) 77 | pieces <- regmatches(x, m)[[1]][-1] 78 | lapply(pieces, empty_to_null) 79 | } 80 | 81 | empty_to_null <- function(x) { 82 | if (x == "") 83 | NULL 84 | else x 85 | } 86 | null_to_empty <- function(x) { 87 | if (is.null(x)) 88 | "" 89 | else x 90 | } 91 | 92 | parse_delim <- function(x, delim, quote = "\"", ...) { 93 | scan(text = x, what = character(), sep = delim, quote = quote, 94 | quiet = TRUE, strip.white = TRUE, ...) 95 | } 96 | 97 | parse_name_equals_value <- function (x) { 98 | loc <- regexpr("=", x, fixed = TRUE) 99 | pieces <- regmatches(x, loc, invert = TRUE) 100 | expand <- function(x) if (length(x) == 1) 101 | c(x, "") 102 | else x 103 | pieces <- lapply(pieces, expand) 104 | val <- trimws(vapply(pieces, "[[", "", 2)) 105 | name <- trimws(vapply(pieces, "[[", "", 1)) 106 | stats::setNames(as.list(val), name) 107 | } 108 | 109 | query_parse <- function(x) { 110 | x <- gsub("^\\?", "", x) 111 | params <- parse_name_equals_value(parse_delim(x, "&")) 112 | if (length(params) == 0) { 113 | return(NULL) 114 | } 115 | #out <- as.list(curl::curl_unescape(params)) 116 | #names(out) <- curl::curl_unescape(names(params)) 117 | #out 118 | params 119 | } 120 | 121 | 122 | url_parse <- function(url) { 123 | 124 | pieces <- parse_match(url, "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?") 125 | scheme <- pieces[[2]] 126 | authority <- null_to_empty(pieces[[4]]) 127 | path <- pieces[[5]] 128 | query <- pieces[[7]] 129 | if (!is.null(query)) { 130 | query <- query_parse(query) 131 | } 132 | fragment <- pieces[[9]] 133 | pieces <- parse_match(authority, "^(([^@]+)@)?([^:]+)?(:([^#]+))?") 134 | 135 | 136 | username <- NULL 137 | password <- NULL 138 | token <- NULL 139 | 140 | username <- pieces[[2]] 141 | if (!is.null(username)) { 142 | keys <- strsplit(username, ":")[[1]] 143 | if(length(keys) > 0) { 144 | username <- keys[1] 145 | } 146 | if(length(keys) > 1) { 147 | password <- keys[2] 148 | } 149 | if(length(keys) > 2) { 150 | token <- keys[3] 151 | } 152 | } 153 | hostname <- pieces[[3]] 154 | port <- pieces[[5]] 155 | list(scheme = scheme, hostname = hostname, username = username, 156 | password = password, token = token, port = port, path = path, 157 | query = query, fragment = fragment) 158 | } 159 | 160 | 161 | -------------------------------------------------------------------------------- /R/cached_connection.R: -------------------------------------------------------------------------------- 1 | duckdbfs_env <- new.env() 2 | 3 | 4 | #' create a cachable duckdb connection 5 | #' 6 | #' This function is primarily intended for internal use by other 7 | #' `duckdbfs` functions. However, it can be called directly by 8 | #' the user whenever it is desirable to have direct access to the 9 | #' connection object. 10 | #' 11 | #' When first called (by a user or internal function), 12 | #' this function both creates a duckdb connection and places 13 | #' that connection into a cache (`duckdbfs_conn` option). 
14 | #' On subsequent calls, this function returns the cached connection, 15 | #' rather than recreating a fresh connection. 16 | #' 17 | #' This frees the user from the responsibility of managing a 18 | #' connection object, because functions needing access to the 19 | #' connection can use this to create or access the existing connection. 20 | #' At the close of the global environment, this function's finalizer 21 | #' should gracefully shutdown the connection before removing the cache. 22 | #' 23 | #' 24 | #' By default, this function creates an in-memory connection. When reading 25 | #' from on-disk or remote files (parquet or csv), this option can still 26 | #' effectively support most operations on much-larger-than-RAM data. 27 | #' However, some operations require additional working space, so by default 28 | #' we set a temporary storage location in configuration as well. 29 | #' @inheritParams duckdb::duckdb 30 | #' @param autoload_exts should we auto-load extensions? TRUE by default, 31 | #' can be configured with `options(duckdbfs_autoload_extensions = FALSE)` 32 | #' @param with_spatial install (if missing) and load spatial extension, default TRUE 33 | #' Opt out by closing any active cached connection first (with 34 | #' `close_connection()`) and re-instantiating the with 35 | #' `connect(with_spatial = FALSE)`. 36 | #' @param with_h3 install (if missing) and load the h3 spatial index extension. Default TRUE 37 | #' @returns a [duckdb::duckdb()] connection object 38 | #' @aliases cached_connection duckdb_connect 39 | #' @examplesIf interactive() 40 | #' 41 | #' con <- cached_connection() 42 | #' close_connection(con) 43 | #' 44 | #' @export 45 | #' 46 | cached_connection <- function( 47 | dbdir = ":memory:", 48 | read_only = FALSE, 49 | bigint = "numeric", 50 | config = list(temp_directory = tempfile()), 51 | autoload_exts = getOption("duckdbfs_autoload_extensions", TRUE), 52 | with_spatial = not_windows() && 53 | getOption("duckdbfs_autoload_extensions", TRUE), 54 | with_h3 = not_windows() && 55 | getOption("duckdbfs_autoload_extensions", TRUE) 56 | ) { 57 | #conn <- mget("duckdbfs_conn", envir = duckdbfs_env, 58 | # ifnotfound = list(NULL))$duckdbfs_conn 59 | 60 | conn <- getOption("duckdbfs_conn", NULL) 61 | 62 | ## destroy invalid (closed) connections first 63 | if (inherits(conn, "duckdb_connection")) { 64 | if (!DBI::dbIsValid(conn)) { 65 | close_connection(conn) 66 | conn <- NULL 67 | } 68 | } 69 | 70 | if (!inherits(conn, "duckdb_connection")) { 71 | if (getOption("duckdbfs_debug", FALSE)) { 72 | message("Making a duckdb connection!") 73 | } 74 | 75 | conn <- DBI::dbConnect( 76 | duckdb::duckdb(), 77 | dbdir = dbdir, 78 | read_only = read_only, 79 | bigint = bigint, 80 | config = config 81 | ) 82 | 83 | if (with_spatial) { 84 | # can't use load_spatial here, creates infinite recursion 85 | DBI::dbExecute(conn, "INSTALL spatial;") 86 | DBI::dbExecute(conn, "LOAD spatial;") 87 | } 88 | 89 | if (with_h3) { 90 | DBI::dbExecute(conn, "INSTALL h3 from community;") 91 | DBI::dbExecute(conn, "LOAD h3;") 92 | } 93 | 94 | if (autoload_exts) { 95 | DBI::dbExecute(conn, "SET autoinstall_known_extensions=1;") 96 | DBI::dbExecute(conn, "SET autoload_known_extensions=1;") 97 | } 98 | 99 | options(duckdbfs_conn = conn) 100 | # assign("duckdbfs_conn", conn, envir = duckdbfs_env) 101 | } 102 | 103 | ## create finalizer to avoid duckdb complaining that connection 104 | ## was not shut down before gc 105 | e <- globalenv() 106 | reg.finalizer(e, function(e) close_connection(), TRUE) 107 | 
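  # return the cached (or newly created) connection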
108 | conn 109 | } 110 | 111 | 112 | #' close connection 113 | #' 114 | #' Closes the invisible cached connection to duckdb. 115 | #' @param conn a duckdb connection (leave blank to use the cached connection) 116 | #' @details 117 | #' Shuts down the connection before gc removes it. 118 | #' Then clears the cached reference to avoid using a stale connection. 119 | #' This avoids complaint about connection being garbage collected. 120 | #' @returns returns nothing. 121 | #' @examplesIf interactive() 122 | #' 123 | #' close_connection() 124 | #' 125 | #' @export 126 | close_connection <- function(conn = cached_connection()) { 127 | if (DBI::dbIsValid(conn)) { 128 | DBI::dbDisconnect(conn, shutdown = TRUE) 129 | } 130 | 131 | ## clear cached reference to the now-closed connection 132 | # name <- ls("duckdbfs_conn", envir = duckdbfs_env) 133 | #if(length(name) > 0) rm("duckdbfs_conn", envir = duckdbfs_env) 134 | options(duckdbfs_conn = NULL) 135 | 136 | rm(conn) 137 | } 138 | 139 | 140 | #' @export 141 | duckdb_connect <- cached_connection 142 | 143 | 144 | not_windows <- function() { 145 | tolower(Sys.info()[["sysname"]]) != "windows" 146 | } 147 | -------------------------------------------------------------------------------- /R/spatial_join.R: -------------------------------------------------------------------------------- 1 | #' spatial_join 2 | #' 3 | #' @param x a duckdb table with a spatial geometry column called "geom" 4 | #' @param y a duckdb table with a spatial geometry column called "geom" 5 | #' @param by A spatial join function, see details. 6 | #' @param join JOIN type (left, right, inner, full) 7 | #' @param args additional arguments to join function (e.g. distance for st_dwithin) 8 | #' @param tblname name for the temporary view 9 | #' @param conn the duckdb connection (imputed by duckdbfs by default, 10 | #' must be shared across both tables) 11 | #' @return a (lazy) view of the resulting table. Users can continue to operate 12 | #' on it using dplyr operations and call to_sf() to collect this as an sf object. 13 | #' @details 14 | #' 15 | #' Possible [spatial joins](https://postgis.net/workshops/postgis-intro/spatial_relationships.html) include: 16 | #' 17 | #' Function | Description 18 | #' -------------------- | -------------------------------------------------------------------------------------------- 19 | #' st_intersects | Geometry A intersects with geometry B 20 | #' st_disjoint | The complement of intersects 21 | #' st_within | Geometry A is within geometry B (complement of contains) 22 | #' st_dwithin | Geometries are within a specified distance, expressed in the same units as the coordinate reference system. 23 | #' st_touches | Two polygons touch if they have at least one point in common, even if their interiors do not touch. 24 | #' st_contains | Geometry A entirely contains geometry B. (complement of within) 25 | #' st_containsproperly | stricter version of `st_contains` (boundary counts as external) 26 | #' st_covers | geometry B is inside or on boundary of A. (A polygon covers a point on its boundary but does not contain it.) 27 | #' st_overlaps | geometry A intersects but does not completely contain geometry B 28 | #' st_equals | geometry A is equal to geometry B 29 | #' st_crosses | Lines or points in geometry A cross geometry B. 30 | #' 31 | #' Although SQL is not case sensitive, this function expects only 32 | #' lower case names for "by" functions.
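#'
#' For example (an illustrative sketch; the distance value is expressed in the units of
#' the data's coordinate reference system):
#' `spatial_join(x, y, by = "st_dwithin", args = "1000")`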
33 | #' 34 | #' @examplesIf interactive() 35 | #' 36 | #' # note we can read in remote data in a variety of vector formats: 37 | #' countries <- 38 | #' paste0("/vsicurl/", 39 | #' "https://github.com/cboettig/duckdbfs/", 40 | #' "raw/spatial-read/inst/extdata/world.gpkg") |> 41 | #' open_dataset(format = "sf") 42 | #' 43 | #' cities <- 44 | #' paste0("/vsicurl/https://github.com/cboettig/duckdbfs/raw/", 45 | #' "spatial-read/inst/extdata/metro.fgb") |> 46 | #' open_dataset(format = "sf") 47 | #' 48 | #' countries |> 49 | #' dplyr::filter(iso_a3 == "AUS") |> 50 | #' spatial_join(cities) 51 | #' 52 | #' @export 53 | spatial_join <- function(x, 54 | y, 55 | by=c("st_intersects", "st_within", 56 | "st_dwithin", "st_touches", 57 | "st_contains", "st_containsproperly", 58 | "st_covers", "st_overlaps", 59 | "st_crosses", "st_equals", 60 | "st_disjoint"), 61 | args = "", 62 | join="left", 63 | tblname = tmp_tbl_name(), 64 | conn = cached_connection()) { 65 | 66 | by <- match.arg(by) 67 | ## x,y may be promised queries 68 | x <- as_view(x) 69 | y <- as_view(y) 70 | 71 | # buil spatial join query 72 | x.name <- remote_name(x, conn) 73 | y.name <- remote_name(y, conn) 74 | x.geom <- paste0(x.name, ".geom") 75 | y.geom <- paste0(y.name, ".geom") 76 | 77 | if(args != ""){ 78 | args <- paste(",", args) 79 | } 80 | 81 | # be more careful than SELECT * 82 | 83 | # x.geom becomes the "geom" column, y.geom becomes geom:1 84 | query <- paste( 85 | "SELECT *", 86 | "FROM", x.name, 87 | join, "JOIN", y.name, 88 | "ON", paste0(by, "(", x.geom, ", ", y.geom, args, ")") 89 | ) 90 | query_to_view(query, tblname, conn) 91 | 92 | } 93 | 94 | 95 | #' as_view 96 | #' 97 | #' Create a View of the current query. This can be an effective way to allow 98 | #' a query chain to remain lazy 99 | #' @param x a duckdb spatial dataset 100 | #' @inheritParams open_dataset 101 | #' @examplesIf interactive() 102 | #' path <- system.file("extdata/spatial-test.csv", package="duckdbfs") 103 | #' df <- open_dataset(path) 104 | #' library(dplyr) 105 | #' 106 | #' df |> filter(latitude > 5) |> as_view() 107 | #' 108 | #' @export 109 | as_view <- function(x, tblname = tmp_tbl_name(), conn = cached_connection()) { 110 | 111 | # assert x is a tbl_lazy, a tbl_sql, and a tbl_duckdb_connection 112 | 113 | ## lazy_base_query objects are good to go. 114 | if(inherits(x$lazy_query, "lazy_base_query")) { 115 | return(x) 116 | } 117 | ## lazy_select_query objects are unnamed, 118 | ## convert to named views so we can re-use them in queries 119 | q <- dbplyr::sql_render(x) 120 | query_to_view(q, tblname, conn) 121 | } 122 | 123 | query_to_view <- function(query, 124 | tblname = tmp_tbl_name(), 125 | conn = cached_connection()) { 126 | q <- paste("CREATE OR REPLACE TEMPORARY VIEW", tblname, "AS", query) 127 | DBI::dbSendQuery(conn, q) 128 | dplyr::tbl(conn, tblname) 129 | } 130 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # duckdbfs 0.1.2.99 2 | 3 | * All methods that write to a file / path now return that path (invisibly). Previously the return was just inherited from dbExecute() call, except for write_dataset() which always followed this convention. An additional optional argument has been added which can format the returned path as an HTTP address. 
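  For example (an illustrative sketch; bucket and endpoint names are hypothetical):
  `path <- write_dataset(df, "s3://my-bucket/df", as_http = TRUE)` returns something like
  `"https://my-endpoint/my-bucket/df"`.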
4 | 5 | # duckdbfs 0.1.2 6 | 7 | * `write_geo()` now takes argument, `srs` for projection information 8 | * `to_geojson()` now writes all atomic columns, not just an id column. 9 | 10 | # duckdbfs 0.1.1 11 | 12 | * new function `duckdb_config()` streamlines common configurations, like `duckdb_config(threads = 1, memory_limit = '10GB')` 13 | * related helpers `duckdb_get_config()` shows any or all configuration settings, `duckdb_reset()` restores defaults. 14 | * new function `duckdb_extensions()` lists all available, installed, or loaded extensions and descriptions. 15 | * `cached_connection()` is aliased as `duckdb_connect()`, reflecting its use as more than an under-the-hood utility. 16 | * `load_h3()` and `load_spatial()` are called by default. Opt out by closing any active cached connection first (with `close_connection()`) and re-instantiating the with `connect(with_h3 = FALSE)` etc. 17 | * `open_dataset()` gains the argument `parser_options` to pass arbitrary options to parsers such as duckdb's read_csv(), see . 18 | * `write_dataset()` gains the argument `options` to support custom options controlling the COPY behavior writing to file, such as thread parallelism, file naming conventions, and more. see 19 | * S3-based access will no longer automatically try recursion if path ends in a recognized extension, `.parquet`, `.csv`, or `.csv.gz` 20 | 21 | # duckdbfs 0.1.0 22 | 23 | * Adds `to_h3j()` method for streaming data to H3J format 24 | * Adds `duckdb_secrets()` as more modern [credential manager](https://duckdb.org/docs/stable/configuration/secrets_manager.html) 25 | * Adds `write_geo()` method, currently writes geojson [#37](https://github.com/cboettig/duckdbfs/issues/37) 26 | * `cached_connection()` / `connect()` now supports `config` argument and sets a temporary directory to the R tempdir by default, allowing disk-backed storage when duckdb detects memory limits. 27 | 28 | # duckdbfs 0.0.9 29 | 30 | * Restore default to non-nightly. 31 | 32 | # duckdbfs 0.0.8 33 | 34 | * work-around for error `The file was built for DuckDB version 'v1.1.3', but we can only load extensions built for DuckDB version '19864453f7'.` 35 | by using nightly repo for extensions by default. 36 | 37 | 38 | # duckdbfs 0.0.7 39 | 40 | * The default `cached_connection()` helper will configure a temporary storage location by default. 41 | It also now supports all options supported by `duckdb::duckdb()` for connection creation. 42 | * New `as_dataset()` utility copies a local in-memory data.frame into the connection. 43 | 44 | # duckdbfs 0.0.6 45 | 46 | * bugfix: reading from local disk recursively no longer requires manual `**`. 47 | Also, trying to read from an existing _local_ file won't try and append recursive search 48 | even when given the default recursive=TRUE option. 49 | * bugfix: `open_dataset()` uses random table name by default, avoid naming collisions. 50 | 51 | # duckdbfs 0.0.5 52 | 53 | * bugfix `write_dataset()` no longer adds `**` into paths when writing some partitions. 54 | * Protect from unsupported table names generated from file names that start with a digit, fixes #21. 55 | 56 | # duckdbfs 0.0.4 57 | 58 | * `open_dataset()` gains the ability to read spatial vector data formats 59 | (objects read by `sf`) using `format="sf"` 60 | * default geometry column in `to_sf()` is now termed `geom`, to match the default 61 | used in `duckdb`'s `st_read()` function. 
62 | * `open_dataset()` now tries to guess the data format instead of defaulting to 63 | parquet when no format is explicitly provided. 64 | 65 | * a new function, `spatial_join()`, allows a variety of spatial joins. 66 | * a new function, `st_read_meta()`, exposes the spatial metadata of remote spatial objects. 67 | * new helper function, `as_view()`, creates a temporary view of a query. 68 | 69 | # duckdbfs 0.0.3 70 | 71 | * `write_dataset()` now understands lazy queries, not just lazy tables. 72 | 73 | # duckdbfs 0.0.2 74 | 75 | * duckdbfs now has spatial data query support! Users can leverage spatial 76 | data operations like `st_distance()` and `st_area()` and request return 77 | values as `sf` objects. Supports network-based access too. See README.md 78 | 79 | * Added `write_dataset()` which can write to (potentially partitioned) parquet 80 | to local directories or remote (S3) buckets. 81 | 82 | * The S3 interface supports `arrow`-compatible URI notation: 83 | - Alternate endpoints can now be passed like so 84 | `s3://userid:secret_token@bucket-name?endpoint_override=data.ecoforecast.org` 85 | - Users can omit the use of `*` (match any file) or `**` 86 | (recursive search) and just supply a path. Recursive search is then 87 | assumed automatically. Note: unlike `arrow`, this still supports the 88 | use of globs (`*`) elsewhere in the path, e.g. `s3://bucket/*/path` 89 | 90 | * `duckdb_s3_config` gains argument `anonymous` allowing users to ignore existing 91 | AWS keys that may be set in environmental variables or AWS configuration files. 92 | This can also be passed as the username position in URI notation, e.g. 93 | `s3://anonymous@bucket_name`. 94 | 95 | * `open_dataset` drops use of `endpoint` as an argument. Instead, alternative 96 | S3 endpoints can be set either by using the URI query notation or calling 97 | `duckdb_s3_config()` first. Additionally, any arguments to `duckdb_s3_config()`, 98 | including `s3_endpoint`, can now be passed to `open_dataset` through the `...`. 99 | Note these settings will override any set by the URI notation. 
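For reference, a brief sketch of the URI notation and the `duckdb_s3_config()` equivalent described in the notes above, using the same public bucket and endpoint that appear in this package's README examples:

``` r
library(duckdbfs)

# anonymous access plus an alternate endpoint, expressed in the URI itself
efi <- open_dataset(
  "s3://anonymous@neon4cast-scores/parquet/aquatics?endpoint_override=data.ecoforecast.org"
)

# the same settings can instead be applied ahead of time
duckdb_s3_config(anonymous = TRUE, s3_endpoint = "data.ecoforecast.org")
efi <- open_dataset("s3://neon4cast-scores/parquet/aquatics")
```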
100 | 101 | # duckdbfs 0.0.1 102 | 103 | * Initial release to CRAN 104 | -------------------------------------------------------------------------------- /tests/testthat/test-write_dataset.R: -------------------------------------------------------------------------------- 1 | #' Note that it is not possible to open from one S3 source and write to another 2 | #' 3 | test_that("write_dataset", { 4 | skip_on_cran() 5 | ## write an in-memory dataset 6 | path <- file.path(tempdir(), "mtcars.parquet") 7 | write_dataset(mtcars, path) 8 | expect_true(file.exists(path)) 9 | df <- open_dataset(path) 10 | expect_s3_class(df, "tbl") 11 | 12 | write_dataset( 13 | mtcars, 14 | path, 15 | options = c("PER_THREAD_OUTPUT FALSE", "FILENAME_PATTERN 'cars_{i}'") 16 | ) 17 | 18 | expect_true(file.exists(path)) 19 | df <- open_dataset(path) 20 | expect_s3_class(df, "tbl") 21 | 22 | ## write from an on-disk dataset 23 | local_file <- system.file("extdata/spatial-test.csv", package = "duckdbfs") 24 | tbl <- open_dataset(local_file, format = 'csv') 25 | path <- file.path(tempdir(), "spatial.parquet") 26 | write_dataset(tbl, path) 27 | 28 | expect_true(file.exists(path)) 29 | df <- open_dataset(path) 30 | expect_s3_class(df, "tbl") 31 | 32 | ## Write from a query string 33 | path2 <- file.path(tempdir(), "spatial2.parquet") 34 | 35 | dataset <- tbl |> 36 | dplyr::mutate(new = "test") 37 | dataset |> 38 | write_dataset(path2) 39 | }) 40 | 41 | test_that("write_dataset partitions", { 42 | skip_on_cran() 43 | ## write an in-memory dataset 44 | path <- file.path(tempdir(), "mtcars") 45 | library(dplyr) 46 | 47 | mtcars |> 48 | group_by(cyl, gear) |> 49 | write_dataset(path, options = "FILENAME_PATTERN 'cars_{uuid}'") 50 | 51 | expect_true(dir.exists(path)) 52 | df <- open_dataset(path) 53 | expect_s3_class(df, "tbl") 54 | parts <- list.files(path) 55 | expect_true(any(grepl("cyl=4", parts))) 56 | 57 | path <- file.path(tempdir(), "mtcars2") 58 | mtcars |> write_dataset(path, partitioning = "cyl", overwrite = TRUE) 59 | expect_true(file.exists(path)) 60 | df <- open_dataset(path) 61 | expect_s3_class(df, "tbl") 62 | 63 | unlink(path, recursive = TRUE) 64 | }) 65 | 66 | 67 | test_that("write_dataset, remote input", { 68 | skip_on_cran() 69 | skip_if_offline() 70 | 71 | tbl <- open_dataset( 72 | paste0( 73 | "https://raw.githubusercontent.com/cboettig/duckdbfs/", 74 | "main/inst/extdata/spatial-test.csv" 75 | ), 76 | format = "csv" 77 | ) 78 | 79 | path <- file.path(tempdir(), "spatial.parquet") 80 | write_dataset(tbl, path) 81 | 82 | expect_true(file.exists(path)) 83 | df <- open_dataset(path) 84 | expect_s3_class(df, "tbl") 85 | }) 86 | 87 | test_that("write_dataset to s3:", { 88 | skip_on_os("windows") 89 | skip_if_offline() 90 | skip_on_cran() 91 | skip_if_not_installed("jsonlite") 92 | skip_if_not_installed("minioclient") 93 | 94 | minioclient::install_mc(force = TRUE) 95 | 96 | p <- minioclient::mc_alias_ls("play --json") 97 | config <- jsonlite::fromJSON(p$stdout) 98 | 99 | minioclient::mc_mb("play/duckdbfs") 100 | duckdb_secrets( 101 | config$accessKey, 102 | config$secretKey, 103 | gsub("https://", "", config$URL) 104 | ) 105 | 106 | mtcars |> write_dataset("s3://duckdbfs/mtcars.parquet") 107 | 108 | expect_true(TRUE) 109 | minioclient::mc("rb --force play/duckdbfs") 110 | 111 | close_connection() 112 | }) 113 | 114 | mc_config_get <- function(alias = "play") { 115 | # this can fail tp parse on windows, stdout is not pure json 116 | # p <- minioclient::mc_alias_ls(paste(alias, "--json")) 117 | # config <- 
jsonlite::fromJSON(p$stdout) 118 | 119 | ## fails to find config on remote 120 | path <- getOption("minioclient.dir", tools::R_user_dir("minioclient", "data")) 121 | json <- jsonlite::read_json(file.path(path, "config.json")) 122 | config <- json$aliases[[alias]] 123 | config$alias <- alias 124 | config$URL <- config$url 125 | config 126 | } 127 | 128 | 129 | test_that("write_geo", { 130 | skip_on_cran() 131 | skip_if_not_installed("sf") 132 | skip_if_not(has_spatial(), "spatial extension not available") 133 | 134 | ## write from an on-disk dataset 135 | local_file <- system.file("extdata/world.fgb", package = "duckdbfs") 136 | load_spatial() 137 | tbl <- open_dataset(local_file, format = 'sf') 138 | path <- file.path(tempdir(), "spatial.geojson") 139 | write_geo(tbl, path) 140 | 141 | expect_true(file.exists(path)) 142 | df <- sf::st_read(path) 143 | expect_s3_class(df, "sf") 144 | expect_gt(nrow(df), 1) 145 | }) 146 | 147 | 148 | test_that("to_geojson", { 149 | skip_on_cran() 150 | skip_if_offline() # extensions need internet 151 | skip_if_not(has_spatial(), "spatial extension not available") 152 | 153 | load_extension("json") 154 | 155 | ## write from an on-disk dataset 156 | local_file <- system.file("extdata/world.fgb", package = "duckdbfs") 157 | load_spatial() 158 | tbl <- open_dataset(local_file, format = 'sf') 159 | path <- file.path(tempdir(), "spatial1.geojson") 160 | to_geojson(tbl, path, id_col = "iso_a3") 161 | 162 | expect_true(file.exists(path)) 163 | 164 | skip_if_not_installed("sf") 165 | 166 | ## not sure why sf doesn't recognize this file! 167 | #df <- sf::st_read(path) 168 | #expect_s3_class(df, "sf") 169 | #expect_gt(nrow(df), 1) 170 | }) 171 | 172 | 173 | test_that("to_geojson s3", { 174 | skip_on_cran() 175 | skip_if_offline() # extensions need internet 176 | skip_if_not_installed("sf") 177 | skip_if_not_installed("jsonlite") 178 | skip_if_not_installed("minioclient") 179 | skip_if_not(has_spatial(), "spatial extension not available") 180 | 181 | minioclient::install_mc(force = TRUE) 182 | 183 | skip_on_os("windows") 184 | p <- minioclient::mc_alias_ls("play --json") 185 | config <- jsonlite::fromJSON(p$stdout) 186 | minioclient::mc_mb("play/duckdbfs") 187 | 188 | duckdb_secrets( 189 | config$accessKey, 190 | config$secretKey, 191 | gsub("https://", "", config$URL) 192 | ) 193 | load_spatial() 194 | 195 | ## write from an on-disk dataset 196 | local_file <- system.file("extdata/world.fgb", package = "duckdbfs") 197 | tbl <- open_dataset(local_file, format = 'sf') 198 | path <- "s3://duckdbfs/spatial-test.geojson" 199 | to_geojson(tbl, path, id_col = "iso_a3") 200 | 201 | expect_true(TRUE) 202 | }) 203 | -------------------------------------------------------------------------------- /R/duckdb_config.R: -------------------------------------------------------------------------------- 1 | #' duckdb configuration 2 | #' 3 | #' @inheritParams open_dataset 4 | #' @param ... named argument of the parameters to set, see examples 5 | #' see all possible configuration options at 6 | #' @return the active duckdb connection, invisibly 7 | #' @details Note: in I/O bound tasks such as streaming data, it can be helpful to set 8 | #' thread parallelism significantly higher than available CPU cores. 
9 | #' @seealso duckdb_reset, duckdb_get_config 10 | #' @export 11 | #' @examplesIf interactive() 12 | #' duckdb_config(threads = 1, memory_limit = '10GB') 13 | #' duckdb_get_config("threads") 14 | #' duckdb_reset("threads") 15 | duckdb_config <- function(..., conn = cached_connection()) { 16 | parameters <- list(...) 17 | for (p in names(parameters)) { 18 | cmd <- paste0("SET ", p, "='", parameters[p], "';") 19 | DBI::dbExecute(conn, cmd) 20 | } 21 | invisible(conn) 22 | } 23 | 24 | 25 | #' duckdb reset configuration to default 26 | #' 27 | #' @inheritParams open_dataset 28 | #' @param x parameter name 29 | #' @seealso duckdb_config, duckdb_get_config 30 | #' @export 31 | #' @examplesIf interactive() 32 | #' duckdb_config(threads = 10) 33 | #' duckdb_get_config("threads") 34 | #' duckdb_reset("threads") 35 | duckdb_reset <- function(x, conn = cached_connection()) { 36 | cmd <- paste0("RESET ", x, ";") 37 | DBI::dbExecute(conn, cmd) 38 | invisible(conn) 39 | } 40 | 41 | 42 | #' duckdb show current configuration 43 | #' 44 | #' @inheritParams open_dataset 45 | #' @param x parameter name. Omit to see a table of all settings. 46 | #' @seealso duckdb_config, duckdb_reset 47 | #' @export 48 | #' @examplesIf interactive() 49 | #' # Full config table 50 | #' duckdb_get_config() 51 | #' 52 | #' # look up single config value 53 | #' duckdb_get_config("threads") 54 | #' 55 | #' # set a different value, test, reset. 56 | #' duckdb_config(threads = 10) 57 | #' duckdb_get_config("threads") 58 | #' duckdb_reset("threads") 59 | #' 60 | duckdb_get_config <- function(x = NULL, conn = cached_connection()) { 61 | cmd <- paste0("SELECT * FROM duckdb_settings()") 62 | settings <- DBI::dbGetQuery(conn, cmd) 63 | settings <- dplyr::as_tibble(settings) 64 | 65 | if (is.null(x)) { 66 | return(settings) 67 | } 68 | 69 | settings$value[settings$name == tolower(x)] 70 | } 71 | 72 | 73 | # internal 74 | duckdb_set <- function(x, conn = cached_connection()) { 75 | if (!is.null(x)) { 76 | name <- deparse(substitute(x)) 77 | cmd <- paste0("SET ", name, "='", x, "';") 78 | DBI::dbExecute(conn, cmd) 79 | } 80 | } 81 | 82 | 83 | #' Configure S3 settings for database connection 84 | #' 85 | #' This function is used to configure S3 settings for a database connection. 86 | #' It allows you to set various S3-related parameters such as access key, 87 | #' secret access key, endpoint, region, session token, uploader settings, 88 | #' URL compatibility mode, URL style, and SSL usage. 89 | #' 90 | #' @param conn A database connection object created using the 91 | #' \code{cached_connection} function (default: \code{cached_connection()}). 92 | #' @param s3_access_key_id The S3 access key ID (default: \code{NULL}). 93 | #' @param s3_secret_access_key The S3 secret access key (default: \code{NULL}). 94 | #' @param s3_endpoint The S3 endpoint (default: \code{NULL}). 95 | #' @param s3_region The S3 region (default: \code{NULL}). 96 | #' @param s3_session_token The S3 session token (default: \code{NULL}). 97 | #' @param s3_uploader_max_filesize The maximum filesize for S3 uploader 98 | #' (between 50GB and 5TB, default 800GB). 99 | #' @param s3_uploader_max_parts_per_file The maximum number of parts per file 100 | #' for S3 uploader (between 1 and 10000, default 10000). 101 | #' @param s3_uploader_thread_limit The thread limit for S3 uploader 102 | #' (default: 50). 103 | #' @param s3_url_compatibility_mode Disable Globs and Query Parameters on 104 | #' S3 URLs (default: 0, allows globs/queries).
105 | #' @param s3_url_style The style of S3 URLs to use. Default is 106 | #' "vhost" unless s3_endpoint is set, which makes default "path" 107 | #' (i.e. MINIO systems). 108 | #' @param s3_use_ssl Enable or disable SSL for S3 connections 109 | #' (default: 1 (TRUE)). 110 | #' @param anonymous request anonymous access (sets `s3_access_key_id` and 111 | #' `s3_secret_access_key` to `""`, allowing anonymous access to public buckets). 112 | #' @details see 113 | #' @return Returns silently (NULL) if successful. 114 | #' 115 | #' @examplesIf interactive() 116 | #' # Configure S3 settings 117 | #' duckdb_s3_config( 118 | #' s3_access_key_id = "YOUR_ACCESS_KEY_ID", 119 | #' s3_secret_access_key = "YOUR_SECRET_ACCESS_KEY", 120 | #' s3_endpoint = "YOUR_S3_ENDPOINT", 121 | #' s3_region = "YOUR_S3_REGION", 122 | #' s3_uploader_max_filesize = "800GB", 123 | #' s3_uploader_max_parts_per_file = 100, 124 | #' s3_uploader_thread_limit = 8, 125 | #' s3_url_compatibility_mode = FALSE, 126 | #' s3_url_style = "vhost", 127 | #' s3_use_ssl = TRUE, 128 | #' anonymous = TRUE) 129 | #' 130 | #' @export 131 | duckdb_s3_config <- function( 132 | conn = cached_connection(), 133 | s3_access_key_id = NULL, 134 | s3_secret_access_key = NULL, 135 | s3_endpoint = NULL, 136 | s3_region = NULL, 137 | s3_session_token = NULL, 138 | s3_uploader_max_filesize = NULL, 139 | s3_uploader_max_parts_per_file = NULL, 140 | s3_uploader_thread_limit = NULL, 141 | s3_url_compatibility_mode = NULL, 142 | s3_url_style = NULL, 143 | s3_use_ssl = NULL, 144 | anonymous = NULL 145 | ) { 146 | if (!is.null(s3_endpoint) && is.null(s3_url_style)) { 147 | s3_url_style <- "path" 148 | } 149 | 150 | if (!is.null(s3_endpoint)) { 151 | s3_endpoint <- gsub("^http[s]://", "", s3_endpoint) 152 | } 153 | 154 | if (!is.null(anonymous)) { 155 | if (!is.null(s3_access_key_id) || !is.null(s3_secret_access_key)) { 156 | warning(paste( 157 | "access keys provided when anonymous access requested.\n", 158 | "keys will be ignored" 159 | )) 160 | } 161 | s3_access_key_id <- "" 162 | s3_secret_access_key <- "" 163 | } 164 | 165 | load_httpfs(conn) 166 | duckdb_set(s3_access_key_id, conn = conn) 167 | duckdb_set(s3_secret_access_key, conn = conn) 168 | duckdb_set(s3_endpoint, conn = conn) 169 | duckdb_set(s3_region, conn = conn) 170 | duckdb_set(s3_secret_access_key, conn = conn) 171 | duckdb_set(s3_session_token, conn = conn) 172 | duckdb_set(s3_uploader_max_filesize, conn = conn) 173 | duckdb_set(s3_uploader_max_parts_per_file, conn = conn) 174 | duckdb_set(s3_uploader_thread_limit, conn = conn) 175 | duckdb_set(s3_url_compatibility_mode, conn = conn) 176 | duckdb_set(s3_url_style, conn = conn) 177 | duckdb_set(s3_use_ssl, conn = conn) 178 | } 179 | 180 | 181 | load_httpfs <- function( 182 | conn = cached_connection(), 183 | nightly = getOption("duckdbfs_use_nightly", FALSE), 184 | force = FALSE 185 | ) { 186 | load_extension("httpfs", conn = conn, nightly = nightly, force = force) 187 | } 188 | 189 | 190 | enable_parallel <- function( 191 | conn = cached_connection(), 192 | duckdb_cores = parallel::detectCores() 193 | ) { 194 | status <- DBI::dbExecute(conn, paste0("PRAGMA threads=", duckdb_cores)) 195 | invisible(status) 196 | } 197 | 198 | 199 | #' show duckdb extensions 200 | #' 201 | #' @inheritParams open_dataset 202 | #' @return a data frame listing all available extensions, with boolean columns 203 | #' indicating which extensions are installed or loaded, and a description of each 204 | #' extension. 
205 | #' @export 206 | #' @examplesIf interactive() 207 | #' duckdb_extensions() 208 | duckdb_extensions <- function(conn = cached_connection()) { 209 | query <- "SELECT * FROM duckdb_extensions();" 210 | DBI::dbGetQuery(conn, query) 211 | } 212 | -------------------------------------------------------------------------------- /R/open_dataset.R: -------------------------------------------------------------------------------- 1 | #' Open a dataset from a variety of sources 2 | #' 3 | #' This function opens a dataset from a variety of sources, including Parquet, 4 | #' CSV, etc, using either local file system paths, URLs, or S3 bucket URI 5 | #' notation. 6 | #' 7 | #' @param sources A character vector of paths to the dataset files. 8 | #' @param schema The schema for the dataset. If NULL, the schema will be 9 | #' inferred from the dataset files. 10 | #' @param hive_style A logical value indicating whether to the dataset uses 11 | #' Hive-style partitioning. 12 | #' @param unify_schemas A logical value indicating whether to unify the schemas 13 | #' of the dataset files (union_by_name). If TRUE, will execute a UNION by 14 | #' column name across all files (NOTE: this can add considerably to 15 | #' the initial execution time) 16 | #' @param format The format of the dataset files. One of `"parquet"`, `"csv"`, 17 | #' or `"sf"` (spatial vector files supported by the sf package / GDAL). 18 | #' if no argument is provided, the function will try to guess the type based 19 | #' on minimal heuristics. 20 | #' @param conn A connection to a database. 21 | #' @param tblname The name of the table to create in the database. 22 | #' @param mode The mode to create the table in. One of `"VIEW"` or `"TABLE"`. 23 | #' Creating a `VIEW`, the default, will execute more quickly because it 24 | #' does not create a local copy of the dataset. `TABLE` will create a local 25 | #' copy in duckdb's native format, downloading the full dataset if necessary. 26 | #' When using `TABLE` mode with large data, please be sure to use a `conn` 27 | #' connections with disk-based storage, e.g. by calling [cached_connection()], 28 | #' e.g. `cached_connection("storage_path")`, otherwise the full data must fit 29 | #' into RAM. Using `TABLE` assumes familiarity with R's DBI-based interface. 30 | #' @param filename A logical value indicating whether to include the filename in 31 | #' the table name. 32 | #' @param recursive should we assume recursive path? default TRUE. Set to FALSE 33 | #' if trying to open a single, un-partitioned file. 34 | #' @param parser_options additional options passed to the parser, e.g. to 35 | #' read_csv(), see 36 | #' @param ... optional additional arguments passed to [duckdb_s3_config()]. 37 | #' Note these apply after those set by the URI notation and thus may be used 38 | #' to override or provide settings not supported in that format. 39 | #' @return A lazy `dplyr::tbl` object representing the opened dataset backed 40 | #' by a duckdb SQL connection. Most `dplyr` (and some `tidyr`) verbs can be 41 | #' used directly on this object, as they can be translated into SQL commands 42 | #' automatically via `dbplyr`. Generic R commands require using 43 | #' [dplyr::collect()] on the table, which forces evaluation and reading the 44 | #' resulting data into memory. 
45 | #' 46 | #' @examplesIf interactive() 47 | #' # A remote, hive-partitioned Parquet dataset 48 | #' base <- paste0("https://github.com/duckdb/duckdb/raw/main/", 49 | #' "data/parquet-testing/hive-partitioning/union_by_name/") 50 | #' f1 <- paste0(base, "x=1/f1.parquet") 51 | #' f2 <- paste0(base, "x=1/f2.parquet") 52 | #' f3 <- paste0(base, "x=2/f2.parquet") 53 | #' 54 | #' open_dataset(c(f1,f2,f3), unify_schemas = TRUE) 55 | #' 56 | #' # Access an S3 database specifying an independently-hosted (MINIO) endpoint 57 | #' efi <- open_dataset("s3://neon4cast-scores/parquet/aquatics", 58 | #' s3_access_key_id="", 59 | #' s3_endpoint="data.ecoforecast.org") 60 | #' 61 | #' # Use parser-options for non-standard csv: 62 | #' cars <- tempfile() # dummy data 63 | #' write.table(mtcars, cars, row.names = FALSE) 64 | #' 65 | #' # Note nested quotes on parser option for delimiter: 66 | #' df <- open_dataset(cars, format = "csv", 67 | #' parser_options = c(delim = "' '", header = TRUE)) 68 | #' 69 | #' @export 70 | open_dataset <- function(sources, 71 | schema = NULL, 72 | hive_style = TRUE, 73 | unify_schemas = FALSE, 74 | format = c("parquet", "csv", "tsv", "sf"), 75 | conn = cached_connection(), 76 | tblname = tmp_tbl_name(), 77 | mode = "VIEW", 78 | filename = FALSE, 79 | recursive = TRUE, 80 | parser_options = list(), 81 | ...) { 82 | 83 | format <- select_format(sources, format) 84 | sources <- parse_uri(sources, conn = conn, recursive = recursive) 85 | 86 | if(length(list(...)) > 0) { # can also be specified in URI query notation 87 | duckdb_s3_config(conn = conn, ...) 88 | } 89 | 90 | # ensure active connection 91 | version <- DBI::dbExecute(conn, "PRAGMA version;") 92 | 93 | if(format == "sf") { 94 | load_spatial(conn = conn) 95 | # for now VSI prefixes are not supported, but httpfs can handle spatial 96 | sources <- strip_vsi(sources) 97 | } 98 | view_query <- query_string(tblname, 99 | sources, 100 | format = format, 101 | mode = mode, 102 | hive_partitioning = hive_style, 103 | union_by_name = unify_schemas, 104 | filename = filename, 105 | parser_options = parser_options 106 | ) 107 | 108 | DBI::dbSendQuery(conn, view_query) 109 | dplyr::tbl(conn, tblname) 110 | } 111 | 112 | 113 | 114 | select_format <- function(sources, format) { 115 | ## does not guess file types in s3 buckets. 
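## Heuristic applied below: an explicitly supplied (single) format is returned
## as-is; s3:// paths default to parquet; for a local directory we inspect the
## first file found recursively; recognized spatial extensions are mapped to
## format = "sf"; paths with no extension fall back to parquet.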
116 | 117 | if(length(format) == 1) { 118 | return(format) 119 | } 120 | 121 | # format for vector sources always based on first element 122 | sources <- sources[[1]] 123 | 124 | # default to parquet for S3 addresses 125 | if(grepl("^s3://", sources)) { 126 | return("parquet") 127 | } 128 | 129 | if( fs::is_dir(sources) ) { 130 | sources <- fs::dir_ls(sources, recurse = TRUE, type = "file") 131 | sources <- sources[[1]] 132 | } 133 | 134 | format <- tools::file_ext(sources) 135 | 136 | #if(grepl("^/vsi", sources)) { 137 | # return("sf") 138 | #} 139 | 140 | # detect spatial types 141 | if(format %in% c("fgb", "shp", "json", "geojson", "gdb", "gpkg", 142 | "kml", "gmt")) { 143 | return("sf") 144 | } 145 | 146 | # default 147 | if (format == "") { 148 | return("parquet") 149 | } 150 | 151 | format 152 | } 153 | 154 | 155 | use_recursive <- function(sources) { 156 | !all(identical(tools::file_ext(sources), "")) 157 | } 158 | 159 | vec_as_str <- function(x) { 160 | if(length(x) <= 1) return(paste0("'",x,"'")) 161 | paste0("[", paste0(paste0("'", x, "'"), collapse = ","),"]") 162 | } 163 | 164 | query_string <- function(tblname, 165 | sources, 166 | format = c("parquet", "csv", "tsv", "text", "sf"), 167 | mode = c("VIEW", "TABLE"), 168 | hive_partitioning = TRUE, 169 | union_by_name = FALSE, 170 | filename = FALSE, 171 | parser_options = list()) { 172 | # format <- match.arg(format) 173 | scanner <- switch(format, 174 | "parquet" = "parquet_scan(", 175 | "csv" = "read_csv_auto(", 176 | "sf" = "st_read(", 177 | "read_csv_auto(" 178 | ) 179 | 180 | source_uris <- vec_as_str(sources) 181 | 182 | ## Allow overwrites on VIEW 183 | mode <- switch(mode, 184 | "VIEW" = "OR REPLACE TEMPORARY VIEW", 185 | "TABLE" = "TABLE") 186 | 187 | tabular_options <- paste0( 188 | ", HIVE_PARTITIONING=", hive_partitioning, 189 | ", UNION_BY_NAME=", union_by_name, 190 | ", FILENAME=", filename) 191 | 192 | options <- switch(format, 193 | "parquet" = tabular_options, 194 | "csv" = tabular_options, 195 | "sf" = "", 196 | tabular_options 197 | ) 198 | 199 | # append any custom options 200 | if(length(parser_options) > 0) { 201 | pairs <- paste(names(parser_options), "=", parser_options, 202 | sep = "", collapse = ", ") 203 | options <- paste0(c(options, pairs), collapse=", ") 204 | } 205 | paste0( 206 | paste("CREATE", mode, tblname, "AS SELECT * FROM "), 207 | paste0(scanner, source_uris, options, 208 | ");") 209 | ) 210 | } 211 | 212 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | 8 | ```{r, include = FALSE} 9 | knitr::opts_chunk$set( 10 | collapse = TRUE, 11 | comment = "#>", 12 | fig.path = "man/figures/README-", 13 | out.width = "100%" 14 | ) 15 | ``` 16 | 17 | # duckdbfs 18 | 19 | 20 | [![R-CMD-check](https://github.com/cboettig/duckdbfs/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/cboettig/duckdbfs/actions/workflows/R-CMD-check.yaml) 21 | 22 | 23 | 24 | duckdbfs is a simple wrapper around the `duckdb` package to facilitate working with the construction of a single lazy table (SQL connection) from a set of file paths, URLs, or S3 URIs. 
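For instance, a minimal sketch of that workflow (the file paths and column names below are hypothetical placeholders):

```{r eval=FALSE}
library(duckdbfs)
library(dplyr)

# any mix of local paths, URLs, or s3:// URIs can be combined the same way
files <- c("data/2022.parquet", "data/2023.parquet")  # hypothetical files

ds <- open_dataset(files)   # a single lazy table over all files
ds |>
  filter(year == 2023) |>   # translated to SQL by dbplyr, executed by duckdb
  count(site) |>
  collect()                 # only now are results pulled into R
```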
25 | 26 | ## Installation 27 | 28 | You can install the development version of duckdbfs from [GitHub](https://github.com/) with: 29 | 30 | ``` r 31 | # install.packages("devtools") 32 | devtools::install_github("cboettig/duckdbfs") 33 | ``` 34 | 35 | ## Quickstart 36 | 37 | ```{r setup} 38 | library(duckdbfs) 39 | library(dplyr) 40 | ``` 41 | 42 | Imagine we have a collection of URLs to files we want to combine into a single tibble in R. The files could be parquet or csv, and some files may have additional columns not present in other files. The combined data may be very large, potentially bigger than available RAM or slow to download completely, but we may only want a subset using methods like `dplyr::filter()` or `dplyr::summarise()`. 43 | 44 | 45 | ```{r example} 46 | base <- paste0("https://github.com/duckdb/duckdb/raw/main/", 47 | "data/parquet-testing/hive-partitioning/union_by_name/") 48 | f1 <- paste0(base, "x=1/f1.parquet") 49 | f2 <- paste0(base, "x=1/f2.parquet") 50 | f3 <- paste0(base, "x=2/f2.parquet") 51 | urls <- c(f1,f2,f3) 52 | ``` 53 | 54 | We can easily access this data without downloading by passing a vector of URLs. 55 | Note that if schemas (column names) do not match, we must explicitly request 56 | that `duckdb` join the two schemas. Leave this at the default, `FALSE`, when it is 57 | not needed, to achieve much better performance. 58 | 59 | ```{r} 60 | ds <- open_dataset(urls, unify_schemas = TRUE) 61 | ds 62 | ``` 63 | 64 | Use `filter()`, `select()`, etc. from dplyr to subset and process data -- [any method supported by dbplyr](https://dbplyr.tidyverse.org/reference/index.html). Then use `dplyr::collect()` to trigger evaluation and ingest results of the query into R. 65 | 66 | ## S3-based access 67 | 68 | We can also access remote data over the S3 protocol. An advantage of S3 is that 69 | unlike https, it can discover all files in a given folder, so we don't have to list them individually. 70 | This is particularly convenient for accessing large, partitioned datasets, like GBIF (nearly 200 GB of data split across more than 2000 parquet files): 71 | 72 | ```{r} 73 | parquet <- "s3://gbif-open-data-us-east-1/occurrence/2023-06-01/occurrence.parquet" 74 | duckdb_s3_config() 75 | gbif <- open_dataset(parquet, anonymous = TRUE, s3_region="us-east-1") 76 | ``` 77 | 78 | 79 | The additional configuration arguments are passed to the helper function `duckdb_s3_config()` to set access credentials and configure other settings, like alternative endpoints (for use with S3-compliant systems like [minio](https://www.min.io/)). Of course it is also possible to set these ahead of time by calling `duckdb_s3_config()` directly. Many of these settings can also be passed along more compactly using the URI query notation found in the `arrow` package. For instance, we can request anonymous access to a bucket on an alternative endpoint as: 80 | 81 | ```{r} 82 | efi <- open_dataset("s3://anonymous@neon4cast-scores/parquet/aquatics?endpoint_override=data.ecoforecast.org") 83 | ``` 84 | 85 | 86 | ## Spatial data 87 | 88 | `duckdb` can also understand a wide array of spatial data queries for spatial 89 | vector data, similar to operations found in the popular `sf` package. 90 | See [the list of supported functions](https://github.com/duckdb/duckdb-spatial#supported-functions) for details. 91 | Most spatial query operations require a geometry column that expresses the 92 | simple feature geometry in `duckdb`'s internal geometry format 93 | (nearly but not exactly WKB).
94 | 95 | ### Generating spatial data from tabular 96 | 97 | A common pattern will first generate the 98 | geometry column from raw columns, such as `latitude` and `longitude` columns, 99 | using the `duckdb` implementation of a method familiar to postgis, `st_point`: 100 | 101 | ```{r} 102 | spatial_ex <- paste0("https://raw.githubusercontent.com/cboettig/duckdbfs/", 103 | "main/inst/extdata/spatial-test.csv") |> 104 | open_dataset(format = "csv") 105 | 106 | spatial_ex |> 107 | mutate(geometry = st_point(longitude, latitude)) |> 108 | mutate(dist = st_distance(geometry, st_point(0,0))) |> 109 | to_sf(crs = 4326) 110 | ``` 111 | 112 | Note that when coercing generic tabular data such as CSV into spatial data, the user 113 | is responsible for specifying the coordinate reference system (crs) used by the columns. 114 | For instance, in this case our data is latitude-longitude, so we specify the 115 | corresponding EPSG code. This is optional (sf allows objects to have unknown CRS), 116 | but advisable. 117 | 118 | 119 | Recall that when used against any sort of external database like `duckdb`, 120 | most `dplyr` functions like `dplyr::mutate()` are being transcribed into SQL 121 | by `dbplyr`, and not actually ever run in R. This allows us to seamlessly pass 122 | along spatial functions like `st_point`, despite this not being an available R 123 | function. (Also note that SQL is not case-sensitive, so this function is also 124 | written as `ST_Point`). 125 | Optionally, we can do additional operations on this geometry column, such as 126 | computing distances (`st_distance` shown here), spatial filters, and so forth. 127 | The `to_sf()` coercion will parse its input into a SQL query that gets 128 | passed to `duckdb`, and the return object will be collected through 129 | `sf::st_read`, returning an (in-memory) `sf` object. 130 | 131 | For more details including a complete list of the dozens of spatial operations currently supported and notes on performance and current limitations, see the [duckdb spatial docs](https://github.com/duckdb/duckdb-spatial) 132 | 133 | ### Reading spatial vector files 134 | 135 | The `duckdb` spatial package can also use GDAL to read large spatial vector files. 136 | This includes support for remote files. This means that we can 137 | easily subset columns from a wide array of potentially remote file types and 138 | filter on rows and columns, and perform many spatial operations without ever 139 | reading the entire object into memory in R. 140 | 141 | 142 | ```{r} 143 | url <- "https://github.com/cboettig/duckdbfs/raw/main/inst/extdata/world.fgb" 144 | countries <- open_dataset(url, format = "sf") 145 | ``` 146 | 147 | Note that `open_dataset()` always returns a lazy remote table -- we have not yet 148 | downloaded the data, let alone read it into R. We simply have a connection allowing 149 | us to stream the data. 150 | 151 | We can examine the spatial metadata associated with this remote dataset using the duckdbfs spatial helper function, `st_read_meta`, 152 | 153 | ```{r} 154 | countries_meta <- st_read_meta(url) 155 | countries_meta 156 | ``` 157 | 158 | Because this is a small dataset, we can bring the entire data into R (in memory) using `to_sf()`, specifying the CRS indicated in this metadata: 159 | 160 | ```{r} 161 | in_mem <- countries |> to_sf(crs = countries_meta$wkt) 162 | ``` 163 | However, we can also do a wide range of spatial operations without importing the data. 164 | This can be particularly helpful when working with very large datasets.
For example: which country polygon contains Melbourne? 166 | Note the result is still a lazy read; we haven't downloaded or read in the full spatial data object. 167 | 168 | ```{r} 169 | library(sf) 170 | melbourne <- st_point(c(144.9633, -37.814)) |> st_as_text() 171 | 172 | countries |> 173 | filter(st_contains(geom, ST_GeomFromText({melbourne}))) 174 | 175 | ``` 176 | 177 | As before, we use `to_sf()` to read in the query results as a native (in-memory) `sf` object: 178 | 179 | ```{r} 180 | sf_obj <- countries |> filter(continent == "Africa") |> to_sf() 181 | plot(sf_obj["name"]) 182 | ``` 183 | 184 | ## Spatial joins 185 | 186 | One very common operation is the spatial join, which can be a very powerful way to subset large data. Let's consider a set of point geometries representing the coordinates of major cities around the world: 187 | 188 | ```{r} 189 | url_cities <- "https://github.com/cboettig/duckdbfs/raw/main/inst/extdata/metro.fgb" 190 | cities <- open_dataset(url_cities, format="sf") 191 | ``` 192 | 193 | Note that metadata must be read directly from the source file; it is not embedded into the duckdb table view. 194 | Before combining this data with the countries data, we confirm that the CRS is the same for both datasets: 195 | 196 | ```{r} 197 | countries_meta$proj4 198 | st_read_meta(url_cities)$proj4 199 | ``` 200 | 201 | For instance, we can return all points (cities) within a collection of polygons (all country boundaries in the Oceania continent): 202 | 203 | ```{r} 204 | countries |> 205 | dplyr::filter(continent == "Oceania") |> 206 | spatial_join(cities, by = "st_intersects", join="inner") |> 207 | select(name_long, sovereignt, pop2020) 208 | 209 | ``` 210 | 211 | 212 | Possible [spatial joins](https://postgis.net/workshops/postgis-intro/spatial_relationships.html) include: 213 | 214 | Function | Description 215 | -------------------- | -------------------------------------------------------------------------------------------- 216 | st_intersects | Geometry A intersects with geometry B 217 | st_disjoint | The complement of intersects 218 | st_within | Geometry A is within geometry B (complement of contains) 219 | st_dwithin | Geometries are within a specified distance, expressed in the same units as the coordinate reference system. 220 | st_touches | Two polygons touch if they have at least one point in common, even if their interiors do not touch. 221 | st_contains | Geometry A entirely contains geometry B. (complement of within) 222 | st_containsproperly | stricter version of `st_contains` (boundary counts as external) 223 | st_covers | geometry B is inside or on boundary of A. (A polygon covers a point on its boundary but does not contain it.) 224 | st_overlaps | geometry A intersects but does not completely contain geometry B 225 | st_equals | geometry A is equal to geometry B 226 | st_crosses | Lines or points in geometry A cross geometry B. 227 | 228 | Note that while SQL functions are not case-sensitive, `spatial_join` expects lower-case names. 229 | 230 | ## Writing datasets 231 | 232 | Like `arrow::write_dataset()`, `duckdbfs::write_dataset()` can write partitioned parquet files to local disks and also directly to an S3 bucket. Partitioned writes should take advantage of threading.
Partition variables can be specified explicitly, or any `dplyr` grouping variables will be used by default: 233 | 234 | ```{r message=FALSE} 235 | mtcars |> group_by(cyl, gear) |> write_dataset(tempfile()) 236 | ``` 237 | 238 | 239 | ## Local files 240 | 241 | Of course, `open_dataset()` and `write_dataset()` can also be used with local files. Remember that parquet format is not required; we can read csv files (including multiple and hive-partitioned csv files). 242 | 243 | ```{r} 244 | write.csv(mtcars, "mtcars.csv", row.names=FALSE) 245 | lazy_cars <- open_dataset("mtcars.csv", format = "csv") 246 | ``` 247 | 248 | 249 | ```{r include = FALSE} 250 | unlink("mtcars.csv") 251 | ``` 252 | 253 | 254 | ## Mechanism / motivation 255 | 256 | This package simply creates a duckdb connection, ensures the `httpfs` and `spatial` extensions are installed if necessary, sets the S3 configuration, and then constructs a `VIEW` using duckdb's `parquet_scan()` or `read_csv_auto()` methods and associated options. It then returns a `dplyr::tbl()` for the resulting view. Though straightforward, this process is substantially more verbose than the analogous single function call provided by `arrow::open_dataset()` due mostly to the necessary string manipulation to construct the VIEW as a SQL statement. I've used this pattern a lot, especially when arrow is not an option (http data) or has substantially worse performance (many S3 URIs). 257 | 258 | 259 | ## Advanced notes 260 | 261 | This is very similar to the behavior of `arrow::open_dataset()`, with a few exceptions: 262 | 263 | - at this time, `arrow` does not support access over HTTP -- remote sources must be in an S3 or GC-based object store. 264 | - With local file system or S3 paths, `duckdb` can support "globbing" at any point in the path, e.g. `open_dataset(data/*/subdir)`. (Like arrow, `duckdbfs::open_dataset` will assume recursive path discovery on directories). Note that http(s) URLs will always require the full vector since a `ls()` method is not possible. Even with URLs or vector-based paths, `duckdb` can automatically populate column names given only by hive structure when `hive_style=TRUE` (default). Note that passing a vector of paths can be significantly faster than globbing with S3 sources where the `ls()` operation is relatively expensive when there are many partitions. 265 | 266 | 267 | ## Performance notes 268 | 269 | - In some settings, `duckdbfs::open_dataset` can give substantially better performance (orders of magnitude) than `arrow::open_dataset()`, while in other settings it may be comparable or even slower. Package versions, system libraries, network architecture, remote storage performance, network traffic, and other factors can all influence performance, making precise benchmark comparisons in real-world contexts difficult. 270 | - On slow network connections or when accessing a remote table repeatedly, it may improve performance to create a local copy of the table rather than perform all operations over the network. The simplest way to do this is by setting `mode = "TABLE"` instead of "VIEW" in `open_dataset()`, as sketched below. It is probably desirable to pass a duckdb connection backed by a persistent disk location in this case instead of the default `cached_connection()` unless available RAM is not limiting. 271 | - `unify_schemas` is very computationally expensive. Ensuring all files/partitions match schema in advance or processing different files separately can greatly improve performance.
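A sketch of that local-copy pattern follows; the cache directory is illustrative, and the CSV is the small example file used earlier in this README:

```{r eval=FALSE}
library(duckdbfs)
library(dplyr)

# a disk-backed connection, so the copied table need not fit in RAM
con <- cached_connection(file.path(tempdir(), "duckdbfs-cache"))

remote <- paste0("https://raw.githubusercontent.com/cboettig/duckdbfs/",
                 "main/inst/extdata/spatial-test.csv")
local_copy <- open_dataset(remote, format = "csv", conn = con, mode = "TABLE")

# subsequent queries run against the local duckdb table, not the network
local_copy |> filter(latitude > 5) |> collect()
```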
272 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # duckdbfs 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/cboettig/duckdbfs/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/cboettig/duckdbfs/actions/workflows/R-CMD-check.yaml) 9 | 10 | 11 | duckdbfs is a simple wrapper around the `duckdb` package to facilitate 12 | working with the construction of a single lazy table (SQL connection) 13 | from a set of file paths, URLs, or S3 URIs. 14 | 15 | ## Installation 16 | 17 | You can install the development version of duckdbfs from 18 | [GitHub](https://github.com/) with: 19 | 20 | ``` r 21 | # install.packages("devtools") 22 | devtools::install_github("cboettig/duckdbfs") 23 | ``` 24 | 25 | ## Quickstart 26 | 27 | ``` r 28 | library(duckdbfs) 29 | library(dplyr) 30 | #> 31 | #> Attaching package: 'dplyr' 32 | #> The following objects are masked from 'package:stats': 33 | #> 34 | #> filter, lag 35 | #> The following objects are masked from 'package:base': 36 | #> 37 | #> intersect, setdiff, setequal, union 38 | ``` 39 | 40 | Imagine we have a collection of URLs to files we want to combine into a 41 | single tibble in R. The files could be parquet or csv, and some files 42 | may have additional columns not present in other files. The combined 43 | data may be very large, potentially bigger than available RAM or slow to 44 | download completely, but we may only want a subset using methods like 45 | `dplyr::filter()` or `dplyr::summarise()`. 46 | 47 | ``` r 48 | base <- paste0("https://github.com/duckdb/duckdb/raw/main/", 49 | "data/parquet-testing/hive-partitioning/union_by_name/") 50 | f1 <- paste0(base, "x=1/f1.parquet") 51 | f2 <- paste0(base, "x=1/f2.parquet") 52 | f3 <- paste0(base, "x=2/f2.parquet") 53 | urls <- c(f1,f2,f3) 54 | ``` 55 | 56 | We can easily access this data without downloading by passing a vector 57 | of URLs. Note that if schemas (column names) do not match, we must 58 | explicitly request `duckdb` join the two schemas. Leave this as default, 59 | `FALSE` when not required to achieve much better performance. 60 | 61 | ``` r 62 | ds <- open_dataset(urls, unify_schemas = TRUE) 63 | ds 64 | #> # Source: table [3 x 4] 65 | #> # Database: DuckDB v0.10.1 [unknown@Linux 6.6.10-76060610-generic:R 4.3.2/:memory:] 66 | #> i j x k 67 | #> 68 | #> 1 42 84 1 NA 69 | #> 2 42 84 1 NA 70 | #> 3 NA 128 2 33 71 | ``` 72 | 73 | Use `filter()`, `select()`, etc from dplyr to subset and process data – 74 | [any method supported by 75 | dbpylr](https://dbplyr.tidyverse.org/reference/index.html). Then use 76 | `dplyr::collect()` to trigger evaluation and ingest results of the query 77 | into R. 78 | 79 | ## S3-based access 80 | 81 | We can also access remote data over the S3 protocol. An advantage of S3 82 | is that unlike https, it can discover all files in a given folder, so we 83 | don’t have to list them individually. 
This is particularly convenient 84 | for accessing large, partitioned datasets, like GBIF: (nearly 200 GB of 85 | data split across more than 2000 parquet files) 86 | 87 | ``` r 88 | parquet <- "s3://gbif-open-data-us-east-1/occurrence/2023-06-01/occurrence.parquet" 89 | duckdb_s3_config() 90 | gbif <- open_dataset(parquet, anonymous = TRUE, s3_region="us-east-1") 91 | ``` 92 | 93 | The additional configuration arguments are passed to the helper function 94 | `duckdb_s3_config()` to set access credentials and configure other 95 | settings, like alternative endpoints (for use with S3-compliant systems 96 | like [minio](https://www.min.io/)). Of course it also possible to set these 97 | ahead of time by calling `duckdb_s3_config()` directly. Many of these 98 | settings can also be passed along more compactly using the URI query 99 | notation found in the `arrow` package. For instance, we can request 100 | anonymous access to a bucket on an alternative endpoint as: 101 | 102 | ``` r 103 | efi <- open_dataset("s3://anonymous@neon4cast-scores/parquet/aquatics?endpoint_override=data.ecoforecast.org") 104 | ``` 105 | 106 | ## Spatial data 107 | 108 | `duckdb` can also understand a wide array of spatial data queries for 109 | spatial vector data, similar to operations found in the popular `sf` 110 | package. See [the list of supported 111 | functions](https://github.com/duckdb/duckdb-spatial#supported-functions) 112 | for details. Most spatial query operations require an geometry column 113 | that expresses the simple feature geometry in `duckdb`’s internal 114 | geometry format (nearly but not exactly WKB). 115 | 116 | ### Generating spatial data from tabular 117 | 118 | A common pattern will first generate the geometry column from raw 119 | columns, such as `latitude` and `lognitude` columns, using the `duckdb` 120 | implementation of the a method familiar to postgis, `st_point`: 121 | 122 | ``` r 123 | spatial_ex <- paste0("https://raw.githubusercontent.com/cboettig/duckdbfs/", 124 | "main/inst/extdata/spatial-test.csv") |> 125 | open_dataset(format = "csv") 126 | 127 | spatial_ex |> 128 | mutate(geometry = st_point(longitude, latitude)) |> 129 | mutate(dist = st_distance(geometry, st_point(0,0))) |> 130 | to_sf(crs = 4326) 131 | #> Simple feature collection with 10 features and 4 fields 132 | #> Geometry type: POINT 133 | #> Dimension: XY 134 | #> Bounding box: xmin: 1 ymin: 1 xmax: 10 ymax: 10 135 | #> Geodetic CRS: WGS 84 136 | #> site latitude longitude dist geom 137 | #> 1 a 1 1 1.414214 POINT (1 1) 138 | #> 2 b 2 2 2.828427 POINT (2 2) 139 | #> 3 c 3 3 4.242641 POINT (3 3) 140 | #> 4 d 4 4 5.656854 POINT (4 4) 141 | #> 5 e 5 5 7.071068 POINT (5 5) 142 | #> 6 f 6 6 8.485281 POINT (6 6) 143 | #> 7 g 7 7 9.899495 POINT (7 7) 144 | #> 8 h 8 8 11.313708 POINT (8 8) 145 | #> 9 i 9 9 12.727922 POINT (9 9) 146 | #> 10 j 10 10 14.142136 POINT (10 10) 147 | ``` 148 | 149 | Note that when coercing generic tabular such as CSV into spatial data, 150 | the user is responsible for specifying the coordinate reference system 151 | (crs) used by the columns. For instance, in this case our data is 152 | latitude-longitude, so we specify the corresponding EPSG code. This is 153 | optional (sf allows objects to have unknown CRS), but advisable. 154 | 155 | Recall that when used against any sort of external database like 156 | `duckdb`, most `dplyr` functions like `dplyr::mutate()` are being 157 | transcribed into SQL by `dbplyr`, and not actually ever run in R. 
This 158 | allows us to seamlessly pass along spatial functions like `st_point`, 159 | despite this not being an available R function. (Also note that SQL is 160 | not case-sensitive, so this function is also written as `ST_Point`). 161 | Optionally, we can do additional operations on this geometry column, 162 | such as computing distances (`st_distance` shown here), spatial filters, 163 | and so forth. The `to_sf()` coercion will parse its input into a SQL 164 | query that gets passed to `duckdb`, and the return object will be 165 | collected through `sf::st_read`, returning an (in-memory) `sf` object. 166 | 167 | For more details including a complete list of the dozens of spatial 168 | operations currently supported and notes on performance and current 169 | limitations, see the [duckdb spatial 170 | docs](https://github.com/duckdb/duckdb-spatial) 171 | 172 | ### Reading spatial vector files 173 | 174 | The `duckdb` spatial package can also use GDAL to read large spatial 175 | vector files. This includes support for remote files. This means that we 176 | can easily subset columns from a wide array of potentially remote file 177 | types and filter on rows and columns, and perform many spatial 178 | operations without ever reading the entire objects into memory in R. 179 | 180 | ``` r 181 | url <- "https://github.com/cboettig/duckdbfs/raw/main/inst/extdata/world.fgb" 182 | countries <- open_dataset(url, format = "sf") 183 | ``` 184 | 185 | Note that `open_dataset()` always returns a lazy remote table – we have 186 | not yet downloaded the data, let alone read it into R. We simply have a 187 | connection allowing us to stream the data. 188 | 189 | We can examine the spatial metadata associated with this remote dataset 190 | using the duckdbfs spatial helper function, `st_read_meta`, 191 | 192 | ``` r 193 | countries_meta <- st_read_meta(url) 194 | countries_meta 195 | #> # A tibble: 1 × 7 196 | #> feature_count geom_column_name geom_type name code wkt proj4 197 | #> 198 | #> 1 177 geom Multi Polygon EPSG 4326 "GEOGCS[\"WGS … +pro… 199 | ``` 200 | 201 | Because this is a small dataset, we can bring the entire data into R (in 202 | memory) using `to_sf()`, specifying the CRS indicated in this metadata: 203 | 204 | ``` r 205 | in_mem <- countries |> to_sf(crs = countries_meta$wkt) 206 | ``` 207 | 208 | However, we can also do a wide range of spatial observations without 209 | importing the data. This can be particularly helpful when working with 210 | very large datasets. For example: which country polygon contains 211 | Melbourne? 212 | Note the result is still a lazy read, we haven’t downloaded or read in 213 | the full spatial data object. 214 | 215 | ``` r 216 | library(sf) 217 | #> Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.4.0; sf_use_s2() is TRUE 218 | melbourne <- st_point(c(144.9633, -37.814)) |> st_as_text() 219 | 220 | countries |> 221 | filter(st_contains(geom, ST_GeomFromText({melbourne}))) 222 | #> # Source: SQL [1 x 16] 223 | #> # Database: DuckDB v0.10.1 [unknown@Linux 6.6.10-76060610-generic:R 4.3.2/:memory:] 224 | #> iso_a3 name sovereignt continent area pop_est pop_est_dens economy 225 | #> 226 | #> 1 AUS Australia Australia Oceania 7682300 21262641 2.77 2. 
Develo… 227 | #> # ℹ 8 more variables: income_grp , gdp_cap_est , life_exp , 228 | #> # well_being , footprint , inequality , HPI , geom 229 | ``` 230 | 231 | As before, we use `to_sf()` to read in the query results as a native 232 | (in-memory) `sf` object: 233 | 234 | ``` r 235 | sf_obj <- countries |> filter(continent == "Africa") |> to_sf() 236 | plot(sf_obj["name"]) 237 | ``` 238 | 239 | 240 | 241 | ## Spatial joins 242 | 243 | One very common operation are spatial joins, which can be a very 244 | powerful way to subset large data. Lets consider a set of point 245 | geometries representing the coordinates of major cities around the 246 | world: 247 | 248 | ``` r 249 | url_cities <- "https://github.com/cboettig/duckdbfs/raw/spatial-read/inst/extdata/metro.fgb" 250 | cities <- open_dataset(url_cities, format="sf") 251 | ``` 252 | 253 | Note that metadata must be read directly from the source file, it is not 254 | embedded into the duckdb table view. Before combining this data with the 255 | countries data, we confirm that the CRS is the same for both datasets: 256 | 257 | ``` r 258 | countries_meta$proj4 259 | #> [1] "+proj=longlat +datum=WGS84 +no_defs" 260 | st_read_meta(url_cities)$proj4 261 | #> [1] "+proj=longlat +datum=WGS84 +no_defs" 262 | ``` 263 | 264 | For instance, we can return all points (cities) within a collection of 265 | polygons (all country boundaries in Oceania continent): 266 | 267 | ``` r 268 | countries |> 269 | dplyr::filter(continent == "Oceania") |> 270 | spatial_join(cities, by = "st_intersects", join="inner") |> 271 | select(name_long, sovereignt, pop2020) 272 | #> # Source: SQL [6 x 3] 273 | #> # Database: DuckDB v0.10.1 [unknown@Linux 6.6.10-76060610-generic:R 4.3.2/:memory:] 274 | #> name_long sovereignt pop2020 275 | #> 276 | #> 1 Brisbane Australia 2388517 277 | #> 2 Perth Australia 2036118 278 | #> 3 Sydney Australia 4729406 279 | #> 4 Adelaide Australia 1320783 280 | #> 5 Auckland New Zealand 1426070 281 | #> 6 Melbourne Australia 4500501 282 | ``` 283 | 284 | Possible [spatial 285 | joins](https://postgis.net/workshops/postgis-intro/spatial_relationships.html) 286 | include: 287 | 288 | | Function | Description | 289 | |---------------------|---------------------------------------------------------------------------------------------------------------| 290 | | st_intersects | Geometry A intersects with geometry B | 291 | | st_disjoint | The complement of intersects | 292 | | st_within | Geometry A is within geometry B (complement of contains) | 293 | | st_dwithin | Geometries are within a specified distance, expressed in the same units as the coordinate reference system. | 294 | | st_touches | Two polygons touch if the that have at least one point in common, even if their interiors do not touch. | 295 | | st_contains | Geometry A entirely contains to geometry B. (complement of within) | 296 | | st_containsproperly | stricter version of `st_contains` (boundary counts as external) | 297 | | st_covers | geometry B is inside or on boundary of A. (A polygon covers a point on its boundary but does not contain it.) | 298 | | st_overlaps | geometry A intersects but does not completely contain geometry B | 299 | | st_equals | geometry A is equal to geometry B | 300 | | st_crosses | Lines or points in geometry A cross geometry B. | 301 | 302 | Note that while SQL functions are not case-sensitive, `spatial_join` 303 | expects lower-case names. 
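A hedged variation on the example above: with an inner join we can count how many
of these cities fall inside each country polygon, reusing the `countries` and
`cities` tables opened earlier and the `sovereignt` column shown in the output
above.

``` r
countries |>
  dplyr::filter(continent == "Oceania") |>
  spatial_join(cities, by = "st_intersects", join = "inner") |>
  count(sovereignt, sort = TRUE)
```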
304 | 305 | ## Writing datasets 306 | 307 | Like `arrow::write_dataset()`, `duckdbfs::write_dataset()` can write 308 | partitioned parquet files to local disks and also directly to an S3 309 | bucket. Partitioned writes should take advantage of threading. Partition 310 | variables can be specified explicitly, or any `dplyr` grouping variables 311 | will be used by default: 312 | 313 | ``` r 314 | mtcars |> group_by(cyl, gear) |> write_dataset(tempfile()) 315 | ``` 316 | 317 | ## Local files 318 | 319 | Of course, `open_dataset()` and `write_dataset()` also be used with 320 | local files. Remember that parquet format is not required, we can read 321 | csv files (including multiple and hive-partitioned csv files). 322 | 323 | ``` r 324 | write.csv(mtcars, "mtcars.csv", row.names=FALSE) 325 | lazy_cars <- open_dataset("mtcars.csv", format = "csv") 326 | ``` 327 | 328 | ## Mechanism / motivation 329 | 330 | This package simply creates a duckdb connection, ensures the `httpfs` 331 | and `spatial` extensions are installed if necessary, sets the S3 332 | configuration, and then constructs a `VIEW` using duckdb’s 333 | `parquet_scan()` or `read_csv_auto()` methods and associated options. It 334 | then returns a `dplyr::tbl()` for the resulting view. Though 335 | straightforward, this process is substantially more verbose than the 336 | analogous single function call provided by `arrow::open_dataset()` due 337 | mostly to the necessary string manipulation to construct the VIEW as a 338 | SQL statement. I’ve used this pattern a lot, especially when arrow is 339 | not an option (http data) or has substantially worse performance (many 340 | S3 URIs). 341 | 342 | ## Advanced notes 343 | 344 | This is very similar to the behavior of `arrow::open_dataset()`, with a 345 | few exceptions: 346 | 347 | - at this time, `arrow` does not support access over HTTP – remote 348 | sources must be in an S3 or GC-based object store. 349 | - With local file system or S3 paths, `duckdb` can support “globbing” at 350 | any point in the path, e.g. `open_dataset(data/*/subdir)`. (Like 351 | arrow, `duckdbfs::open_dataset` will assume recursive path discovery 352 | on directories). Note that http(s) URLs will always require the full 353 | vector since a `ls()` method is not possible. Even with URLs or 354 | vector-based paths, `duckdb` can automatically populate column names 355 | given only by hive structure when `hive_style=TRUE` (default). Note 356 | that passing a vector of paths can be significantly faster than 357 | globbing with S3 sources where the `ls()` operation is relatively 358 | expensive when there are many partitions. 359 | 360 | ## Performance notes 361 | 362 | - In some settings, `duckdbfs::open_dataset` can give substantially 363 | better performance (orders of magnitude) than `arrow::open_dataset()`, 364 | while in other settings it may be comparable or even slower. Package 365 | versions, system libraries, network architecture, remote storage 366 | performance, network traffic, and other factors can all influence 367 | performance, making precise benchmark comparisons in real-world 368 | contexts difficult. 369 | - On slow network connections or when accessing a remote table 370 | repeatedly, it may improve performance to create a local copy of the 371 | table rather than perform all operations over the network. The 372 | simplest way to do this is by setting the `mode = "TABLE"` instead of 373 | “VIEW” on open dataset. 
It is probably desirable to pass a duckdb 374 | connection backed by persistent disk location in this case instead of 375 | the default `cached_connection()` unless available RAM is not 376 | limiting. 377 | - `unify_schema` is very computationally expensive. Ensuring all 378 | files/partitions match schema in advance or processing different files 379 | separately can greatly improve performance. 380 | --------------------------------------------------------------------------------
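A sketch of that last point, assuming two hypothetical parquet files whose
schemas differ: open them separately, keep the shared columns, and combine with
`union_all()` rather than paying for `unify_schemas = TRUE`.

``` r
library(duckdbfs)
library(dplyr)

# hypothetical files with slightly different columns
old <- open_dataset("archive/old-schema.parquet")
new <- open_dataset("archive/new-schema.parquet")

# columns present in both lazy tables
shared <- intersect(tbl_vars(old), tbl_vars(new))

combined <- union_all(
  select(old, all_of(shared)),
  select(new, all_of(shared))
)
collect(combined)
```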