├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R
    ├── duckreg-package.R
    ├── duckreg.R
    └── print.R
├── README.md
├── inst
    └── tinytest
    │   └── test_duckreg.R
├── man
    ├── duckreg-package.Rd
    ├── duckreg.Rd
    └── print.duckreg.Rd
└── tests
    └── tinytest.R


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.git$
2 | ^.*\.Rproj$
3 | ^\.Rproj\.user$
4 | ^\.github$
5 | ^LICENSE\.md$
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Retrieved 2017-Oct-12 from https://github.com/github/gitignore/blob/master/R.gitignore
 2 | # Licensed under CC0-1.0 https://github.com/github/gitignore/blob/master/LICENSE
 3 | 
 4 | # History files
 5 | .Rhistory
 6 | .Rapp.history
 7 | 
 8 | # Session Data files
 9 | .RData
10 | 
11 | # Example code in package build process
12 | *-Ex.R
13 | 
14 | # Output files from R CMD build
15 | /*.tar.gz
16 | 
17 | # Output files from R CMD check
18 | /*.Rcheck/
19 | 
20 | # RStudio files
21 | .Rproj.user/
22 | *.Rproj
23 | 
24 | # produced vignettes
25 | vignettes/*.html
26 | vignettes/*.pdf
27 | 
28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
29 | .httr-oauth
30 | 
31 | # knitr and R markdown default cache directories
32 | /*_cache/
33 | /cache/
34 | 
35 | # Temporary files created by R markdown
36 | *.utf8.md
37 | *.knit.md
38 | 
39 | 
40 | 
41 | ## Retrieved 2023-11-16 from https://github.com/github/gitignore/blob/main/C%2B%2B.gitignore
42 | 
43 | # Prerequisites
44 | *.d
45 | 
46 | # Compiled Object files
47 | *.slo
48 | *.lo
49 | *.o
50 | *.obj
51 | 
52 | # Precompiled Headers
53 | *.gch
54 | *.pch
55 | 
56 | # Compiled Dynamic libraries
57 | *.so
58 | *.dylib
59 | *.dll
60 | 
61 | # Fortran module files
62 | *.mod
63 | *.smod
64 | 
65 | # Compiled Static libraries
66 | *.lai
67 | *.la
68 | *.a
69 | *.lib
70 | 
71 | # Executables
72 | *.exe
73 | *.out
74 | *.app
75 | 
76 | SCRATCH/


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: duckreg
 2 | Type: Package
 3 | Title: Out-of-Core Regressions With DuckDB
 4 | Version: 0.0.1.99
 5 | Date: 2025-01-29
 6 | Authors@R:
 7 |   c(
 8 |     person(
 9 |       given = "Grant",
10 |       family = "McDermott",
11 |       role = c("aut", "cre"),
12 |       email = "gmcd@amazon.com",
13 |       comment = c(ORCID = "0000-0001-7883-8573")
14 |     ),
15 |     person(
16 |       given = "Apoorva",
17 |       family = "Lal",
18 |       email = "lal.apoorva@gmail.com",
19 |       role = c("ctb"),
20 |       comment = c(ORCID = "0000-0002-3697-614X")
21 |     )
22 |   )
23 | Description: Leverages the power of DuckDB to run regressions on very large
24 |   datasets, which may not fit into R's memory. The core procedure follows Wong
25 |   et al. (2021) <doi:10.48550/arXiv.2102.11297> by reducing ("compressing") the
26 |   data to a set of summary statistics and then running frequency-weighted least
27 |   squares on this smaller dataset. Robust standard errors are computed from
28 |   sufficient statistics, while clustered standard errors are computed using the
29 |   cluster bootstrap.
30 | Imports: 
31 |   DBI,
32 |   duckdb,
33 |   Formula,
34 |   glue,
35 |   Matrix,
36 |   stats
37 | Suggests: 
38 |   tinytest
39 | Encoding: UTF-8
40 | RoxygenNote: 7.3.2
41 | License: MIT + file LICENSE
42 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2024
2 | COPYRIGHT HOLDER: duckreg authors
3 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2024 duckreg authors
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(print,duckreg)
 4 | export(duckreg)
 5 | importFrom(DBI,dbConnect)
 6 | importFrom(DBI,dbDisconnect)
 7 | importFrom(DBI,dbGetQuery)
 8 | importFrom(Formula,Formula)
 9 | importFrom(Matrix,Diagonal)
10 | importFrom(Matrix,chol2inv)
11 | importFrom(Matrix,crossprod)
12 | importFrom(Matrix,sparse.model.matrix)
13 | importFrom(duckdb,duckdb)
14 | importFrom(duckdb,duckdb_register)
15 | importFrom(glue,glue)
16 | importFrom(glue,glue_sql)
17 | importFrom(stats,reformulate)
18 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # duckreg 0.0.1
2 | 
3 | * Initial GitHub release.
4 | 


--------------------------------------------------------------------------------
/R/duckreg-package.R:
--------------------------------------------------------------------------------
1 | #' @keywords internal
2 | "_PACKAGE"
3 | 
4 | NULL


--------------------------------------------------------------------------------
/R/duckreg.R:
--------------------------------------------------------------------------------
  1 | #' Run a compressed regression with a DuckDB backend.
  2 | #'
  3 | #' @md
  4 | #' @description
  5 | #' Leverages the power of DuckDB to run regressions on very large datasets,
  6 | #' which may not fit into R's memory. The core procedure follows Wong et al.
  7 | #' (2021) by reducing ("compressing") the data to a set of summary statistics
  8 | #' and then running frequency-weighted least squares on this smaller dataset.
  9 | #' Robust standard errors are computed from sufficient statistics.
 10 | #' 
 11 | #' @param fml A \code{\link[stats]{formula}} representing the relation to be
 12 | #' estimated. Fixed-effects should be included after a pipe, e.g.
 13 | #' `fml = y ~ x1 + x2 | fe1 + fe2`. Currently, only simple additive terms
 14 | #' are supported (i.e., no interaction terms, transformations or literals).
 15 | #' @param conn Connection to a DuckDB database, e.g. created with
 16 | #' \code{\link[DBI]{dbConnect}}. Can be either persistent (disk-backed) or
 17 | #' ephemeral (in-memory). If no connection is provided, then an ephemeral
 18 | #' connection will be created automatically and closed before the function
 19 | #' exits. Note that a persistent (disk-backed) database connection is
 20 | #' required for larger-than-RAM datasets in order to take advantage of DuckDB's
 21 | #' streaming functionality.
 22 | #' @param table,data,path Mutually exclusive arguments for specifying the data
 23 | #' table (object) to be queried. In order of precedence:
 24 | #' - `table`: Character string giving the name of the data table in an
 25 | #' existing (open) DuckDB connection.
 26 | #' - `data`: R dataframe that can be copied over to `conn` as a temporary
 27 | #' table for querying via the DuckDB query engine. Ignored if `table` is
 28 | #' provided.
 29 | #' - `path`: Character string giving a path to the data file(s) on disk, which
 30 | #' will be read into `conn`. Internally, this string is passed to the `FROM`
 31 | #' query statement, so could (should) include file globbing for
 32 | #' Hive-partitioned datasets, e.g. `"mydata/**/*.parquet"`. For more precision,
 33 | #' however, it is recommended to pass the desired DuckDB reader function as
 34 | #' part of this string, e.g. `"read_parquet('mydata/**/*.parquet')"`;
 35 | #' note the use of single quotes.
 36 | #' Ignored if either `table` or `data` is provided. 
 37 | #' @param vcov Character string denoting the desired type of variance-
 38 | #' covariance correction / standard errors. At present, only "hc1"
 39 | #' (heteroskedasticity-consistent) is supported, which is thus also
 40 | #' the default.
 41 | #' @param query_only Logical indicating whether only the underlying compression
 42 | #'   SQL query should be returned (i.e., no computation will be performed).
 43 | #'   Default is `FALSE`.
 44 | #' @param data_only Logical indicating whether only the compressed dataset
 45 | #'   should be returned (i.e., no regression is run). Default is `FALSE`.
 46 | #' 
 47 | #' @return A list of class "duckreg" containing various slots, including a table
 48 | #' of coefficients (which the associated print method will display).
 49 | #' @references
 50 | #' Wong, J., Forsell, E., Lewis, R., Mao, T., & Wardrop, M. (2021).
 51 | #' \cite{You Only Compress Once: Optimal Data Compression for Estimating Linear Models.} 
 52 | #' arXiv preprint arXiv:2102.11297.
 53 | #' Available: https://doi.org/10.48550/arXiv.2102.11297
 54 | #' 
 55 | #' @importFrom DBI dbConnect dbDisconnect dbGetQuery
 56 | #' @importFrom duckdb duckdb duckdb_register
 57 | #' @importFrom Formula Formula
 58 | #' @importFrom Matrix chol2inv crossprod Diagonal sparse.model.matrix 
 59 | #' @importFrom stats reformulate
 60 | #' @importFrom glue glue glue_sql
 61 | #' 
 62 | #' @examples
 63 | #' 
 64 | #' # A not very compelling example using a small in-memory dataset:
 65 | #' (mod = duckreg(Temp ~ Wind | Month, data = airquality))
 66 | #' 
 67 | #' # Same result as lm
 68 | #' summary(lm(Temp ~ Wind + factor(Month), data = airquality))
 69 | #' 
 70 | #' # Aside: duckreg's default print method hides the "nuisance" coefficients
 71 | #' # like the intercept and fixed effect(s). But we can grab them if we want.
 72 | #' print(mod, fes = TRUE)
 73 | #' 
 74 | #' # Note: for a more compelling and appropriate use-case, i.e. regression on a
 75 | #' # big (~180 million row) dataset of Hive-partitioned parquet files, see the
 76 | #' # package website:
 77 | #' # https://github.com/grantmcdermott/duckreg?tab=readme-ov-file#quickstart
 78 | #' @export
 79 | duckreg = function(
 80 |    fml,
 81 |    conn = NULL,
 82 |    table = NULL,
 83 |    data = NULL,
 84 |    path = NULL,
 85 |    vcov = "hc1",
 86 |    query_only = FALSE,
 87 |    data_only = FALSE
 88 |    ) {
 89 | 
 90 |   # Open an ephemeral (in-memory) connection if none was supplied, and make
 91 |   # sure that it is closed again when the function exits.
 92 |   if (is.null(conn)) {
 93 |     conn = dbConnect(duckdb(), shutdown = TRUE)
 94 |     on.exit(dbDisconnect(conn), add = TRUE)
 95 |   }
 96 | 
 97 |   # Resolve the data source. Order of precedence: table > data > path.
 98 |   if (!is.null(table)) {
 99 |     if (!is.character(table)) stop("\nThe `table` argument must be a character string.\n")
100 |     from_statement = glue("FROM {table}")
101 |   } else if (!is.null(data)) {
102 |     if (!inherits(data, "data.frame")) stop("\nThe `data` argument must be a data.frame.\n")
103 |     # Expose the data frame to DuckDB under a fixed name
104 |     duckdb_register(conn, "tmp_table", data)
105 |     from_statement = "FROM tmp_table"
106 |   } else if (!is.null(path)) {
107 |     if (!is.character(path)) stop("\nThe `path` argument must be a character string.\n")
108 |     if (!(grepl("^read|^scan", path) && grepl("'", path))) {
109 |       # Not an explicit reader call: treat as a file path / glob and quote it
110 |       path = gsub('"', "'", path)
111 |       from_statement = glue("FROM '{path}'")
112 |     } else {
113 |       # Explicit reader call, e.g. read_parquet('...'): pass through as-is
114 |       from_statement = glue("FROM {path}")
115 |     }
116 |   } else {
117 |     stop("\nOne of the `table`, `data`, or `path` arguments is required.\n")
118 |   }
119 | 
120 |   # Variables of interest. Fixed effects, if any, are the second RHS part.
121 |   fml = Formula(fml)
122 |   yvar = all.vars(stats::formula(fml, lhs = 1, rhs = 0))
123 |   xvars = all.vars(stats::formula(fml, lhs = 0, rhs = 1))
124 |   fes = if (length(fml)[2] > 1) all.vars(stats::formula(fml, lhs = 0, rhs = 2)) else NULL
125 | 
126 |   # Compression query: one row per unique combination of regressors and
127 |   # fixed effects, along with the count and the sum / sum of squares of Y.
128 |   query_string = paste0(
129 |          "
130 |         WITH cte AS (
131 |            SELECT
132 |               ",
133 |          paste(c(xvars, fes), collapse = ", "), ",
134 |               COUNT(*) AS n,
135 |               SUM(", yvar, ") as sum_Y,
136 |               SUM(POW(", yvar, ", 2)) as sum_Y_sq,
137 |            ", from_statement, "
138 |            GROUP BY ALL
139 |         )
140 |         FROM cte
141 |         SELECT
142 |            *,
143 |            sum_Y / n AS mean_Y,
144 |            sqrt(n) AS wts
145 |         "
146 |   )
147 | 
148 |   if (isTRUE(query_only)) return(query_string)
149 | 
150 |   # Run the compression query and fetch the (compressed) result into R
151 |   compressed_dat = dbGetQuery(conn = conn, query_string)
152 | 
153 |   # Turn the fixed effects into factors
154 |   for (f in fes) {
155 |     compressed_dat[[f]] = factor(compressed_dat[[f]])
156 |   }
157 | 
158 |   if (isTRUE(data_only)) return(compressed_dat)
159 | 
160 |   # Fail informatively on an unsupported vcov type, rather than running into
161 |   # an obscure "object 'ses' not found" error after the estimation.
162 |   vcov_type = tolower(vcov)
163 |   if (!identical(vcov_type, "hc1")) {
164 |     stop("\nThe `vcov` argument must be \"hc1\"; no other type is currently supported.\n")
165 |   }
166 | 
167 |   # Design and outcome matrices. Scaling the rows by sqrt(n) means that OLS
168 |   # on the compressed data is equivalent to frequency-weighted least squares.
169 |   X = sparse.model.matrix(reformulate(c(xvars, fes)), compressed_dat)
170 |   Y = compressed_dat[, "mean_Y"]
171 |   Xw = X * compressed_dat[["wts"]]
172 |   Yw = Y * compressed_dat[["wts"]]
173 | 
174 |   # Coefficients, via the normal equations
175 |   betahat = chol2inv(chol(crossprod(Xw))) %*% crossprod(Xw, Yw)
176 | 
177 |   # HC1 variance-covariance matrix, computed from the sufficient statistics
178 |   n = compressed_dat[["n"]]
179 |   yprime = compressed_dat[["sum_Y"]]
180 |   yprimeprime = compressed_dat[["sum_Y_sq"]]
181 |   yhat = X %*% betahat
182 |   # Residual sum of squares per compressed row: sum((y - yhat)^2), expanded
183 |   rss_g = (yhat^2) * n - 2 * yhat * yprime + yprimeprime
184 |   bread = solve(crossprod(X, Diagonal(x = n) %*% X))
185 |   meat = crossprod(X, Diagonal(x = as.vector(rss_g)) %*% X)
186 |   n_nk = sum(n) / (sum(n) - ncol(X)) # degrees-of-freedom (HC1) adjustment
187 |   vcov = n_nk * (bread %*% meat %*% bread)
188 |   vcov = as.matrix(vcov)
189 |   ses = sqrt(diag(vcov))
190 |   attr(vcov, "type") = vcov_type
191 | 
192 |   # Coefficient table
193 |   coefs = betahat[, 1]
194 |   zvalues = coefs / ses
195 |   nparams = length(coefs)
196 |   nobs = nrow(compressed_dat)
197 |   nobs_orig = sum(compressed_dat$n)
198 |   # The residual degrees of freedom are based on the original number of
199 |   # observations, since the compressed rows merely summarise the full data.
200 |   pvalues = 2 * stats::pt(-abs(zvalues), max(nobs_orig - nparams, 1))
201 |   coeftable = cbind(
202 |     estimate = coefs,
203 |     std.error = ses,
204 |     statistic = zvalues,
205 |     p.values = pvalues
206 |   )
207 | 
208 |   ret = list(
209 |     coeftable = coeftable,
210 |     vcov = vcov,
211 |     fml = fml,
212 |     yvar = yvar,
213 |     xvars = xvars,
214 |     fes = fes,
215 |     query_string = query_string,
216 |     nobs = nobs,
217 |     nobs_orig = nobs_orig
218 |   )
219 | 
220 |   class(ret) = c("duckreg", class(ret))
221 | 
222 |   ret
223 | }
224 | 


--------------------------------------------------------------------------------
/R/print.R:
--------------------------------------------------------------------------------
 1 | #' Print method for duckreg objects
 2 | #' @param x A \code{duckreg} object.
 3 | #' @param fes For models with fixed effects, should the intercept and fixed-effect coefficients be displayed too? Default is \code{FALSE}.
 4 | #' @param ... Currently ignored.
 5 | #' @return The displayed coefficient table, invisibly.
 6 | #' @export
 7 | print.duckreg = function(x, fes = FALSE, ...) {
 8 |   coef_tab = x[["coeftable"]]
 9 |   colnames(coef_tab) = c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
10 |   if (!is.null(x$fes) && !isTRUE(fes)) coef_tab = coef_tab[x[["xvars"]], , drop = FALSE]
11 |   fmt_n = function(v) prettyNum(v, big.mark = ",")
12 |   cat("Compressed OLS estimation, Dep. Var.:", x$yvar, "\n")
13 |   cat("Observations.:", fmt_n(x$nobs_orig), "(original) |", fmt_n(x$nobs), "(compressed)", "\n")
14 |   print_coeftable(coef_tab)
15 |   invisible(coef_tab)
16 | }
17 | 
18 | # Borrowed from Laurent's fixest package:
19 | # https://github.com/lrberge/fixest/blob/5523d48ef4a430fa2e82815ca589fc8a47168fe7/R/miscfuns.R#L3758
20 | #
21 | # Print a coefficient table, with significance stars appended.
22 | # - coeftable: matrix or data frame. Columns 1-3 are passed through
23 | #   decimalFormat() and column 4 is treated as the p-values.
24 | # - lastLine: string that is written (via cat) directly after the table.
25 | # - show_signif: if TRUE, finish by printing the significance-code legend.
26 | print_coeftable = function(coeftable, lastLine = "", show_signif = TRUE){
27 |   tab = coeftable
28 |   if(!is.data.frame(tab)){
29 |     class(tab) = NULL
30 |     tab = as.data.frame(tab)
31 |   }
32 | 
33 |   # Upper bound of each p-value bin, named by the stars that the bin receives
34 |   cutoffs = c("***"=0.001, "** "=0.01, "*  "=0.05, ".  "=0.1)
35 |   pval = tab[, 4]
36 |   signif_stars = cut(pval, breaks = c(-1, cutoffs, 100), labels = c(names(cutoffs), ""))
37 |   signif_stars[is.na(signif_stars)] = ""
38 | 
39 |   # p-values below this threshold are displayed as "< 2.2e-16"
40 |   tiny = !is.na(pval) & pval < 2.2e-16
41 | 
42 |   # Round (rather than format) columns 1-3, which avoids hard-to-read xxe-yy notation
43 |   for(j in seq_len(3)) tab[, j] = decimalFormat(tab[, j])
44 | 
45 |   # p-value column: formatted values, or a fixed label for the tiny ones
46 |   tab[!tiny, 4] = format(tab[!tiny, 4], digits = 5)
47 |   tab[tiny, 4] = "< 2.2e-16"
48 |   tab[is.na(tab[, 4]), 4] = "NA"
49 | 
50 |   # The stars go into a fifth column with an empty header
51 |   tab[, 5] = signif_stars
52 |   names(tab)[5] = ""
53 | 
54 |   print(tab)
55 |   cat(lastLine)
56 | 
57 |   if(show_signif){
58 |     cat("---\nSignif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n")
59 |   }
60 | }
61 | 
62 | # Round a numeric vector to a number of decimal places that depends on the
63 | # magnitude of its values (used to keep printed tables free of xxe-yy
64 | # notation). Input without any non-missing numeric value is returned unchanged.
65 | decimalFormat = function(x){
66 | 
67 |   who_valid = which(!is.na(x) & is.numeric(x))
68 |   if(length(who_valid) == 0) return(x)
69 | 
70 |   res = x
71 |   x_valid = x[who_valid]
72 | 
73 |   # Order of magnitude of each value. Exact zeros are left out: log10(0) is
74 |   # -Inf, which would push the number of digits to Inf and thereby leave all
75 |   # of the other values unrounded.
76 |   xPower = log10(abs(x_valid[x_valid != 0]))
77 | 
78 |   if(length(xPower) == 0){
79 |     pow_round = 6
80 |   } else if(min(xPower) > 0){
81 |     pow_round = max(1, 6 - ceiling(xPower))
82 |   } else if(min(xPower) < -5){
83 |     pow_round = ceiling(abs(min(xPower))) + 2
84 |   } else {
85 |     pow_round = 6
86 |   }
87 | 
88 |   res[who_valid] = round(x_valid, pow_round)
89 | 
90 |   res
91 | }
92 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # duckreg
  2 | 
  3 | Very fast out-of-memory regressions with DuckDB.
  4 | 
  5 | ## What
  6 | 
  7 | R package that leverages the power of [DuckDB](https://duckdb.org/) to run
  8 | regressions on very large datasets, which may not fit into R's memory.
  9 | The core procedure follows
 10 | [Wong _et al_. (2021)](https://doi.org/10.48550/arXiv.2102.11297)
 11 | by reducing ("compressing") the data to a set of summary statistics and then
 12 | running frequency-weighted least squares on this smaller dataset. Robust
 13 | standard errors are computed from sufficient statistics.
 14 | 
 15 | The **duckreg** package is inspired by, and has similar aims to, the
 16 | [Python package of the same name](https://github.com/py-econometrics/duckreg).
 17 | Compared to the Python implementation, the functionality of this R version is
 18 | currently limited to compressed regressions only. But we plan to add support for
 19 | Mundlak regression, double-demeaning, etc. in the near future. On the other
 20 | hand, this R version does benefit from a significantly smaller dependency
 21 | footprint (<5 recursive dependencies vs. over 40), which should hopefully
 22 | enable faster and simpler installs, as well as a lower long-term maintenance
 23 | burden. I've also added some R syntactic sugar, so that you can specify
 24 | regressions using the familiar formula syntax, along with "pretty" print
 25 | methods.
 26 | 
 27 | ## Install
 28 | 
 29 | ```r
 30 | # install.packages("remotes")
 31 | remotes::install_github("grantmcdermott/duckreg")
 32 | ```
 33 | 
 34 | ## Quickstart
 35 | 
 36 | ### Small dataset
 37 | 
 38 | To get ourselves situated, we'll first demonstrate by using an in-memory R
 39 | dataset.
 40 | 
 41 | ``` r
 42 | library(duckreg)
 43 | library(fixest)   # for data and comparison
 44 | 
 45 | data("trade", package = "fixest")
 46 | 
 47 | duckreg(Euros ~ dist_km | Destination + Origin, data = trade, vcov = 'hc1')
 48 | #> Compressed OLS estimation, Dep. Var.: Euros 
 49 | #> Observations.: 38,325 (original) | 210 (compressed) 
 50 | #>         Estimate Std. Error t value  Pr(>|t|)    
 51 | #> dist_km -45709.8    1195.84 -38.224 < 2.2e-16 ***
 52 | #> ---
 53 | #> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 54 | ```
 55 | 
 56 | Behind the scenes, **duckreg** has compressed the original dataset down from
 57 | nearly 40,000 observations to only 210, before running the final (weighted)
 58 | regression on this much smaller data object. We can confirm that this
 59 | compression approach still gives us the same result as running `fixest::feols`
 60 | on the full dataset:
 61 | 
 62 | ```r
 63 | feols(Euros ~ dist_km | Destination + Origin, data = trade, vcov = 'hc1')
 64 | #> OLS estimation, Dep. Var.: Euros
 65 | #> Observations: 38,325
 66 | #> Fixed-effects: Destination: 15,  Origin: 15
 67 | #> Standard-errors: Heteroskedasticity-robust 
 68 | #>         Estimate Std. Error t value  Pr(>|t|)    
 69 | #> dist_km -45709.8    1195.84 -38.224 < 2.2e-16 ***
 70 | #> ---
 71 | #> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 72 | #> RMSE: 124,221,786.3     Adj. R2: 0.215289
 73 | #>                       Within R2: 0.025914
 74 | ```
 75 | 
 76 | ### Big dataset
 77 | 
 78 | For a more appropriate **duckreg** use-case, let's run a regression on some NYC
 79 | taxi data. (Download instructions
 80 | [here](https://grantmcdermott.com/duckdb-polars/requirements.html).)
 81 | **duckreg** offers two basic ways to interact with, and analyse, data of this
 82 | size.
 83 | 
 84 | #### Option 1: "On-the-fly"
 85 | 
 86 | Use the `path` argument to read the data directly from disk and perform the
 87 | compression computation in an ephemeral DuckDB connection. This requires that
 88 | data are small enough to fit into RAM... but please note that "small enough" is
 89 | a very relative concept. Thanks to DuckDB's incredible efficiency, your RAM
 90 | should be able to handle very large datasets that would otherwise crash your R
 91 | session, and require only a fraction of the computation time.
 92 | 
 93 | ```r
 94 | duckreg(
 95 |     tip_amount ~ fare_amount + passenger_count | month + vendor_name,
 96 |     path = "read_parquet('nyc-taxi/**/*.parquet')" ## path to hive-partitioned dataset
 97 | )
 98 | #> Compressed OLS estimation, Dep. Var.: tip_amount 
 99 | #> Observations.: 178,544,324 (original) | 70,782 (compressed) 
100 | #>                  Estimate Std. Error  t value  Pr(>|t|)    
101 | #> fare_amount      0.106744   0.000068 1564.742 < 2.2e-16 ***
102 | #> passenger_count -0.029086   0.000106 -273.866 < 2.2e-16 ***
103 | #> ---
104 | #> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
105 | ```
106 | 
107 | Note the size of the original dataset, which is nearly 180 million rows, versus
108 | the compressed dataset, which is down to only 70k. On my laptop this regression
109 | completes in **under 4 seconds**... and that includes the time it took to read
110 | and compress the data from disk!
111 | 
112 | #### Option 2: Persistent database
113 | 
114 | While querying on-the-fly is both convenient and extremely performant, you can
115 | of course also run regressions against existing tables in a persistent DuckDB
116 | database. This latter approach requires that you specify appropriate `conn` and
117 | `table` arguments. But note that querying against a persistent database also
118 | means that you can run regressions against bigger than RAM data, since we will
119 | automatically take advantage of DuckDB's
120 | [out-of-core functionality](https://duckdb.org/2024/07/09/memory-management.html) 
121 | (streaming, hash aggregation, etc.).
122 | 
123 | ```r
124 | ## Explicitly load duckdb (and thus also DBI) so that we can create a persistent
125 | ## database, and then create a table with our taxi data.
126 | library(duckdb)
127 | #> Loading required package: DBI
128 | 
129 | # create connection to persistent database
130 | con = dbConnect(duckdb(), dbdir = "nyc.db")
131 | 
132 | # create a 'taxi' table in our new nyc.db database
133 | dbExecute(
134 |    con,
135 |    "
136 |    CREATE TABLE taxi AS
137 |       FROM read_parquet('~/Documents/Projects/duckdb-polars/nyc-taxi/**/*.parquet')
138 |       SELECT tip_amount, fare_amount, passenger_count, month, vendor_name
139 |    "
140 | )
141 | 
142 | # same result as earlier
143 | duckreg(
144 |     tip_amount ~ fare_amount + passenger_count | month + vendor_name,
145 |     conn = con,    # database connection,
146 |     table = "taxi" # table name
147 | )
148 | #> Compressed OLS estimation, Dep. Var.: tip_amount 
149 | #> Observations.: 178,544,324 (original) | 70,782 (compressed) 
150 | #>                  Estimate Std. Error  t value  Pr(>|t|)    
151 | #> fare_amount      0.106744   0.000068 1564.742 < 2.2e-16 ***
152 | #> passenger_count -0.029086   0.000106 -273.866 < 2.2e-16 ***
153 | #> ---
154 | #> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
155 | 
156 | ## (Optional clean-up)
157 | dbRemoveTable(con, "taxi")
158 | dbDisconnect(con)
159 | unlink("nyc.db") # remove from disk
160 | ```


--------------------------------------------------------------------------------
/inst/tinytest/test_duckreg.R:
--------------------------------------------------------------------------------
1 | 
2 | ## example of simple test that always succeeds
3 | expect_equal(1 + 1, 2)
4 | 
5 | # ## test of placeholder function hello()
6 | # expect_stdout(hello(), "Hello, world")
7 | # expect_stdout(hello("kitty"), "Hello, kitty")
8 | 


--------------------------------------------------------------------------------
/man/duckreg-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/duckreg-package.R
 3 | \docType{package}
 4 | \name{duckreg-package}
 5 | \alias{duckreg-package}
 6 | \title{duckreg: Out-of-Core Regressions With DuckDB}
 7 | \description{
 8 | Leverages the power of DuckDB to run regressions on very large datasets, which may not fit into R's memory. The core procedure follows Wong et al. (2021) \doi{10.48550/arXiv.2102.11297} by reducing ("compressing") the data to a set of summary statistics and then running frequency-weighted least squares on this smaller dataset. Robust standard errors are computed from sufficient statistics, while clustered standard errors are computed using the cluster bootstrap.
 9 | }
10 | \author{
11 | \strong{Maintainer}: Grant McDermott \email{gmcd@amazon.com} (\href{https://orcid.org/0000-0001-7883-8573}{ORCID})
12 | 
13 | Other contributors:
14 | \itemize{
15 |   \item Apoorva Lal \email{lal.apoorva@gmail.com} (\href{https://orcid.org/0000-0002-3697-614X}{ORCID}) [contributor]
16 | }
17 | 
18 | }
19 | \keyword{internal}
20 | 


--------------------------------------------------------------------------------
/man/duckreg.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/duckreg.R
 3 | \name{duckreg}
 4 | \alias{duckreg}
 5 | \title{Run a compressed regression with a DuckDB backend.}
 6 | \usage{
 7 | duckreg(
 8 |   fml,
 9 |   conn = NULL,
10 |   table = NULL,
11 |   data = NULL,
12 |   path = NULL,
13 |   vcov = "hc1",
14 |   query_only = FALSE,
15 |   data_only = FALSE
16 | )
17 | }
18 | \arguments{
19 | \item{fml}{A \code{\link[stats]{formula}} representing the relation to be
20 | estimated. Fixed-effects should be included after a pipe, e.g.
21 | \code{fml = y ~ x1 + x2 | fe1 + fe2}. Currently, only simple additive terms
22 | are supported (i.e., no interaction terms, transformations or literals).}
23 | 
24 | \item{conn}{Connection to a DuckDB database, e.g. created with
25 | \code{\link[DBI]{dbConnect}}. Can be either persistent (disk-backed) or
26 | ephemeral (in-memory). If no connection is provided, then an ephemeral
27 | connection will be created automatically and closed before the function
28 | exits. Note that a persistent (disk-backed) database connection is
29 | required for larger-than-RAM datasets in order to take advantage of DuckDB's
30 | streaming functionality.}
31 | 
32 | \item{table, data, path}{Mutually exclusive arguments for specifying the data
33 | table (object) to be queried. In order of precedence:
34 | \itemize{
35 | \item \code{table}: Character string giving the name of the data table in an
36 | existing (open) DuckDB connection.
37 | \item \code{data}: R dataframe that can be copied over to \code{conn} as a temporary
38 | table for querying via the DuckDB query engine. Ignored if \code{table} is
39 | provided.
40 | \item \code{path}: Character string giving a path to the data file(s) on disk, which
41 | will be read into \code{conn}. Internally, this string is passed to the \code{FROM}
42 | query statement, so could (should) include file globbing for
43 | Hive-partitioned datasets, e.g. \code{"mydata/**/*.parquet"}. For more precision,
44 | however, it is recommended to pass the desired DuckDB reader function as
45 | part of this string, e.g. \code{"read_parquet('mydata/**/*.parquet')"};
46 | note the use of single quotes.
47 | Ignored if either \code{table} or \code{data} is provided.
48 | }}
49 | 
50 | \item{vcov}{Character string denoting the desired type of variance-
51 | covariance correction / standard errors. At present, only "hc1"
52 | (heteroskedasticity-consistent) is supported, which is thus also
53 | the default.}
54 | 
55 | \item{query_only}{Logical indicating whether only the underlying compression
56 | SQL query should be returned (i.e., no computation will be performed).
57 | Default is \code{FALSE}.}
58 | 
59 | \item{data_only}{Logical indicating whether only the compressed dataset
60 | should be returned (i.e., no regression is run). Default is \code{FALSE}.}
61 | }
62 | \value{
63 | A list of class "duckreg" containing various slots, including a table
64 | of coefficients (which the associated print method will display).
65 | }
66 | \description{
67 | Leverages the power of DuckDB to run regressions on very large datasets,
68 | which may not fit into R's memory. The core procedure follows Wong et al.
69 | (2021) by reducing ("compressing") the data to a set of summary statistics
70 | and then running frequency-weighted least squares on this smaller dataset.
71 | Robust standard errors are computed from sufficient statistics.
72 | }
73 | \examples{
74 | 
75 | # A not very compelling example using a small in-memory dataset:
76 | (mod = duckreg(Temp ~ Wind | Month, data = airquality))
77 | 
78 | # Same result as lm
79 | summary(lm(Temp ~ Wind + factor(Month), data = airquality))
80 | 
81 | # Aside: duckreg's default print method hides the "nuisance" coefficients
82 | # like the intercept and fixed effect(s). But we can grab them if we want.
83 | print(mod, fes = TRUE)
84 | 
85 | # Note: for a more compelling and appropriate use-case, i.e. regression on a
86 | # big (~180 million row) dataset of Hive-partioned parquet files, see the
87 | # package website:
88 | # https://github.com/grantmcdermott/duckreg?tab=readme-ov-file#quickstart
89 | }
90 | \references{
91 | Wong, J., Forsell, E., Lewis, R., Mao, T., & Wardrop, M. (2021).
92 | \cite{You Only Compress Once: Optimal Data Compression for Estimating Linear Models.}
93 | arXiv preprint arXiv:2102.11297.
94 | Available: https://doi.org/10.48550/arXiv.2102.11297
95 | }
96 | 


--------------------------------------------------------------------------------
/man/print.duckreg.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/print.R
 3 | \name{print.duckreg}
 4 | \alias{print.duckreg}
 5 | \title{Print method for duckreg objects}
 6 | \usage{
 7 | \method{print}{duckreg}(x, fes = FALSE, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{`duckreg` object.}
11 | 
12 | \item{fes}{Should the fixed effects be displayed? Default is `FALSE`.}
13 | }
14 | \description{
15 | Print method for duckreg objects
16 | }
17 | 


--------------------------------------------------------------------------------
/tests/tinytest.R:
--------------------------------------------------------------------------------
1 | # Run the package's tinytest suite, but only if tinytest (a suggested package) is installed
2 | if (requireNamespace("tinytest", quietly = TRUE)) {
3 |   tinytest::test_package("duckreg")
4 | }
5 | 
6 | 


--------------------------------------------------------------------------------