├── .Rbuildignore
├── .editorconfig
├── .github
└── workflows
│ ├── pkgdown.yml
│ └── r-cmd-check.yml
├── .gitignore
├── .ignore
├── .lintr
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── NEWS.md
├── R
├── DataBackendDplyr.R
├── DataBackendDuckDB.R
├── DataBackendPolars.R
├── as_duckdb_backend.R
├── as_polars_backend.R
├── as_sqlite_backend.R
├── helper.R
└── zzz.R
├── README.Rmd
├── README.md
├── inst
└── extdata
│ ├── spam.parquet
│ ├── userdata1.parquet
│ ├── userdata2.parquet
│ ├── userdata3.parquet
│ ├── userdata4.parquet
│ └── userdata5.parquet
├── man-roxygen
├── field_connector.R
├── field_levels.R
├── param_connector.R
├── param_path.R
├── param_primary_key.R
└── param_strings_as_factors.R
├── man
├── DataBackendDplyr.Rd
├── DataBackendDuckDB.Rd
├── DataBackendPolars.Rd
├── as_duckdb_backend.Rd
├── as_polars_backend.Rd
├── as_sqlite_backend.Rd
├── figures
│ └── logo_navbar.png
└── mlr3db-package.Rd
├── mlr3db.Rproj
├── pkgdown
├── _pkgdown.yml
└── favicon
│ ├── apple-touch-icon-120x120.png
│ ├── apple-touch-icon-152x152.png
│ ├── apple-touch-icon-180x180.png
│ ├── apple-touch-icon-60x60.png
│ ├── apple-touch-icon-76x76.png
│ ├── apple-touch-icon.png
│ ├── favicon-16x16.png
│ ├── favicon-32x32.png
│ └── favicon.ico
└── tests
├── testthat.R
└── testthat
├── helper.R
├── setup.R
├── teardown.R
├── test_as_duckdb_backend.R
├── test_as_polars_backend.R
├── test_as_sqlite_backend.R
├── test_dplyr.R
├── test_duckdb.R
├── test_polars.R
├── test_reconnect.R
├── test_train_predict_dplyr.R
├── test_train_predict_duckdb.R
└── test_train_predict_polars.R
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^LICENSE$
2 | .ignore
3 | .editorconfig
4 | .gitignore
5 | ^.git$
6 | ^.github$
7 | ^.*\.Rproj$
8 | ^\.Rproj\.user$
9 | ^man-roxygen$
10 | ^docs$
11 | ^pkgdown$
12 | ^\.ccache$
13 | ^\.github$
14 | ^.lintr$
15 | ^README\.Rmd$
16 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | # See http://editorconfig.org
2 | root = true
3 |
4 | [*]
5 | charset = utf-8
6 | end_of_line = lf
7 | insert_final_newline = true
8 | indent_style = space
9 | trim_trailing_whitespace = true
10 |
11 | [*.{r,R,md,Rmd}]
12 | indent_size = 2
13 |
14 | [*.{c,h}]
15 | indent_size = 4
16 |
17 | [*.{cpp,hpp}]
18 | indent_size = 4
19 |
20 | [{NEWS.md,DESCRIPTION,LICENSE}]
21 | max_line_length = 80
22 |
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yml:
--------------------------------------------------------------------------------
1 | # pkgdown workflow of the mlr3 ecosystem v0.1.0
2 | # https://github.com/mlr-org/actions
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - main
10 | release:
11 | types:
12 | - published
13 | workflow_dispatch:
14 |
15 | name: pkgdown
16 |
17 | jobs:
18 | pkgdown:
19 | runs-on: ubuntu-latest
20 |
21 | concurrency:
22 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
23 | env:
24 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
25 | steps:
26 | - uses: actions/checkout@v3
27 |
28 | - uses: r-lib/actions/setup-pandoc@v2
29 |
30 | - uses: r-lib/actions/setup-r@v2
31 | with:
32 | extra-repositories: 'https://community.r-multiverse.org'
33 |
34 | - uses: r-lib/actions/setup-r-dependencies@v2
35 | with:
36 | extra-packages: any::pkgdown, local::.
37 | needs: website
38 |
39 | - name: Install template
40 | run: pak::pkg_install("mlr-org/mlr3pkgdowntemplate")
41 | shell: Rscript {0}
42 |
43 | - name: Build site
44 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
45 | shell: Rscript {0}
46 |
47 | - name: Deploy
48 | if: github.event_name != 'pull_request'
49 | uses: JamesIves/github-pages-deploy-action@v4.4.1
50 | with:
51 | clean: false
52 | branch: gh-pages
53 | folder: docs
54 |
--------------------------------------------------------------------------------
/.github/workflows/r-cmd-check.yml:
--------------------------------------------------------------------------------
1 | # r cmd check workflow of the mlr3 ecosystem v0.1.0
2 | # https://github.com/mlr-org/actions
3 | on:
4 | workflow_dispatch:
5 | push:
6 | branches:
7 | - main
8 | pull_request:
9 | branches:
10 | - main
11 |
12 | name: r-cmd-check
13 |
14 | jobs:
15 | r-cmd-check:
16 | runs-on: ${{ matrix.config.os }}
17 |
18 | name: ${{ matrix.config.os }} (${{ matrix.config.r }})
19 |
20 | env:
21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
22 |
23 | strategy:
24 | fail-fast: false
25 | matrix:
26 | config:
27 | - {os: ubuntu-latest, r: 'devel'}
28 | - {os: ubuntu-latest, r: 'release'}
29 |
30 | steps:
31 | - uses: actions/checkout@v3
32 |
33 | - uses: r-lib/actions/setup-r@v2
34 | with:
35 | r-version: ${{ matrix.config.r }}
36 | extra-repositories: 'https://community.r-multiverse.org'
37 |
38 | - uses: r-lib/actions/setup-r-dependencies@v2
39 | with:
40 | extra-packages: any::rcmdcheck
41 | needs: check
42 |
43 | - uses: r-lib/actions/check-r-package@v2
44 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | docs/
6 | .DS_Store
7 |
--------------------------------------------------------------------------------
/.ignore:
--------------------------------------------------------------------------------
1 | man/
2 | docs/
3 | pkgdown/
4 |
--------------------------------------------------------------------------------
/.lintr:
--------------------------------------------------------------------------------
1 | linters: linters_with_defaults(
2 | # lintr defaults: https://github.com/jimhester/lintr#available-linters
3 | # the following setup changes/removes certain linters
4 | assignment_linter = NULL, # do not force using <- for assignments
5 | object_name_linter = object_name_linter(c("snake_case", "CamelCase")), # only allow snake case and camel case object names
6 | cyclocomp_linter = NULL, # do not check function complexity
7 | commented_code_linter = NULL, # allow code in comments
8 | line_length_linter = line_length_linter(120)
9 | )
10 |
11 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: mlr3db
2 | Title: Data Base Backend for 'mlr3'
3 | Version: 0.5.1-9000
4 | Authors@R:
5 | c(
6 | person(given = "Michel",
7 | family = "Lang",
8 | role = c("cre", "aut"),
9 | email = "michellang@gmail.com",
10 | comment = c(ORCID = "0000-0001-9754-0393")),
11 | person(given = "Lona",
12 | family = "Koers",
13 | role = c("aut"),
14 | email = "lona.koers@gmail.com")
15 | )
16 | Description: Extends the 'mlr3' package with a backend to
17 | transparently work with databases such as 'SQLite', 'DuckDB', 'MySQL',
18 | 'MariaDB', or 'PostgreSQL'. The package provides two additional backends:
19 | 'DataBackendDplyr' relies on the abstraction of package 'dbplyr' to
20 | interact with most DBMS. 'DataBackendDuckDB' operates on 'DuckDB' data bases
21 | and also on Apache Parquet files.
22 | License: LGPL-3
23 | URL: https://mlr3db.mlr-org.com,
24 | https://github.com/mlr-org/mlr3db
25 | BugReports: https://github.com/mlr-org/mlr3db/issues
26 | Depends:
27 | mlr3 (>= 0.13.0),
28 | R (>= 3.1.0)
29 | Imports:
30 | R6,
31 | backports,
32 | checkmate,
33 | data.table,
34 | mlr3misc (>= 0.10.0)
35 | Suggests:
36 | DBI,
37 | RSQLite,
38 | dbplyr,
39 | dplyr,
40 | duckdb (>= 0.4.0),
41 | future,
42 | future.apply,
43 | future.callr,
44 | lgr,
45 | polars,
46 | testthat (>= 3.0.0),
47 | tibble
48 | Encoding: UTF-8
49 | Config/testthat/edition: 3
50 | Roxygen: list(markdown = TRUE)
51 | RoxygenNote: 7.3.2
52 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method(as_data_backend,RPolarsDataFrame)
4 | S3method(as_data_backend,RPolarsLazyFrame)
5 | S3method(as_data_backend,tbl_SQLiteConnection)
6 | S3method(as_data_backend,tbl_duckdb_connection)
7 | S3method(as_data_backend,tbl_lazy)
8 | S3method(as_duckdb_backend,DataBackend)
9 | S3method(as_duckdb_backend,character)
10 | S3method(as_duckdb_backend,data.frame)
11 | S3method(as_polars_backend,DataBackend)
12 | S3method(as_polars_backend,data.frame)
13 | S3method(as_sqlite_backend,DataBackend)
14 | S3method(as_sqlite_backend,data.frame)
15 | export(DataBackendDplyr)
16 | export(DataBackendDuckDB)
17 | export(DataBackendPolars)
18 | export(as_duckdb_backend)
19 | export(as_polars_backend)
20 | export(as_sqlite_backend)
21 | if (getRversion() >= "3.6.0") S3method(dplyr::show_query, DataBackendDplyr)
22 | import(checkmate)
23 | import(data.table)
24 | importFrom(R6,R6Class)
25 | importFrom(mlr3,DataBackend)
26 | importFrom(mlr3,as_data_backend)
27 | importFrom(mlr3misc,calculate_hash)
28 | importFrom(mlr3misc,map_lgl)
29 | importFrom(stats,setNames)
30 | importFrom(utils,head)
31 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # mlr3db 0.5.2
2 |
3 | - Bugfix: `DataBackendDuckDB` calculated missing values incorrectly.
4 | - Compatibility with future versions of `dbplyr` (#35).
5 |
6 | # mlr3db 0.5.1
7 |
8 | - Compatibility with new duckdb version (#36).
9 |
10 | # mlr3db 0.5.0
11 |
12 | - Support for parquet files as Backend via DuckDB.
13 | - New converter `as_duckdb_backend()`.
14 |
15 | # mlr3db 0.4.2
16 |
17 | - Compatibility fixes with new duckdb version.
18 |
19 | # mlr3db 0.4.1
20 |
21 | - Temporarily disabled some tests to overcome some regressions in duckdb.
22 |
23 | # mlr3db 0.4.0
24 |
25 | * Added a `show_query()` method for `DataBackendDplyr` (#4).
26 | * A reconnector is automatically added in `as_data_backend()` for objects of
27 | type `tbl_SQLiteConnection` and `tbl_duckdb_connection`.
28 |
29 | # mlr3db 0.3.0
30 |
31 | * New backend `DataBackendDuckDB`.
32 | * `dplyr` is now optional (moved from imports to suggests).
33 |
34 | # mlr3db 0.2.0
35 |
36 | * Set a primary key for SQLite databases generated from data frames.
37 | * Set a reconnector for SQLite databases generated from data frames.
38 | * Resolved a warning signaled by dplyr-1.0.0.
39 |
40 | # mlr3db 0.1.5
41 |
42 | * `as_data_backend()` method to construct a `DataBackendDplyr` now specialized
43 | to operate on objects of type `"tbl_lazy"` (was `"tbl"` before). This way,
44 | local `"tbl"` objects such as tibbles are converted to a
45 | `DataBackendDataTable` by `mlr3::as_data_backend.data.frame()`.
46 |
47 | # mlr3db 0.1.4
48 |
49 | * Connections can now be automatically re-connected via a user-provided function.
50 | * `DataBackendDplyr` now has a finalizer which automatically disconnects the
51 | database connection during garbage collection.
52 |
53 | # mlr3db 0.1.3
54 |
55 | * During construction of `DataBackendDplyr`, you can now select columns to be
56 | converted from string to factor. This simplifies the work with SQL databases
57 | which do not naturally support factors (or where the level information is
58 | lost in the transaction).
59 |
60 | # mlr3db 0.1.2
61 |
62 | * Fixed `$distinct()` to not return missing values per default.
63 | * Added `na_rm` argument to `$distinct()`.
64 | * Renamed `as_sqlite()` to `as_sqlite_backend()`
65 |
66 | # mlr3db 0.1.1
67 |
68 | * Initial release.
69 |
--------------------------------------------------------------------------------
/R/DataBackendDplyr.R:
--------------------------------------------------------------------------------
1 | #' @title DataBackend for dplyr/dbplyr
2 | #'
3 | #' @description
4 | #' A [mlr3::DataBackend] using [dplyr::tbl()] from packages \CRANpkg{dplyr}/\CRANpkg{dbplyr}.
5 | #' This includes [`tibbles`][tibble::tibble()] and abstract database connections interfaced by \CRANpkg{dbplyr}.
6 | #' The latter allows [mlr3::Task]s to interface an out-of-memory database.
7 | #'
8 | #'
9 | #' @param rows `integer()`\cr
10 | #' Row indices.
11 | #' @param cols `character()`\cr
12 | #' Column names.
13 | #' @param data_format (`character(1)`)\cr
14 | #' Desired data format, e.g. `"data.table"` or `"Matrix"`.
15 | #' @param na_rm `logical(1)`\cr
16 | #' Whether to remove NAs or not.
17 | #'
18 | #' @template param_primary_key
19 | #' @template param_strings_as_factors
20 | #' @template param_connector
21 | #'
22 | #' @importFrom mlr3 DataBackend
23 | #' @export
24 | #' @examples
25 | #' if (mlr3misc::require_namespaces(c("tibble", "RSQLite", "dbplyr"), quietly = TRUE)) {
26 | #' # Backend using a in-memory tibble
27 | #' data = tibble::as_tibble(iris)
28 | #' data$Sepal.Length[1:30] = NA
29 | #' data$row_id = 1:150
30 | #' b = DataBackendDplyr$new(data, primary_key = "row_id")
31 | #'
32 | #' # Object supports all accessors of DataBackend
33 | #' print(b)
34 | #' b$nrow
35 | #' b$ncol
36 | #' b$colnames
37 | #' b$data(rows = 100:101, cols = "Species")
38 | #' b$distinct(b$rownames, "Species")
39 | #'
40 | #' # Classification task using this backend
41 | #' task = mlr3::TaskClassif$new(id = "iris_tibble", backend = b, target = "Species")
42 | #' print(task)
43 | #' head(task)
44 | #'
45 | #' # Create a temporary SQLite database
46 | #' con = DBI::dbConnect(RSQLite::SQLite(), ":memory:")
47 | #' dplyr::copy_to(con, data)
48 | #' tbl = dplyr::tbl(con, "data")
49 | #'
50 | #' # Define a backend on a subset of the database: do not use column "Sepal.Width"
51 | #' tbl = dplyr::select_at(tbl, setdiff(colnames(tbl), "Sepal.Width"))
52 | #' tbl = dplyr::filter(tbl, row_id %in% 1:120) # Use only first 120 rows
53 | #' b = DataBackendDplyr$new(tbl, primary_key = "row_id")
54 | #' print(b)
55 | #'
56 | #' # Query distinct values
57 | #' b$distinct(b$rownames, "Species")
58 | #'
59 | #' # Query number of missing values
60 | #' b$missings(b$rownames, b$colnames)
61 | #'
62 | #' # Note that SQLite does not support factors, column Species has been converted to character
63 | #' lapply(b$head(), class)
64 | #'
65 | #' # Cleanup
66 | #' rm(tbl)
67 | #' DBI::dbDisconnect(con)
68 | #' }
69 | DataBackendDplyr = R6Class("DataBackendDplyr", inherit = DataBackend, cloneable = FALSE,
70 | public = list(
71 | #' @template field_levels
72 | levels = NULL,
73 |
74 | #' @template field_connector
75 | connector = NULL,
76 |
77 | #' @description
78 | #'
79 | #' Creates a backend for a [dplyr::tbl()] object.
80 | #'
81 | #' @param data ([dplyr::tbl()])\cr
82 | #' The data object.
83 | #'
84 | #' Instead of calling the constructor yourself, you can call [mlr3::as_data_backend()]
85 | #' on a [dplyr::tbl()].
86 | #' Note that only objects of class `"tbl_lazy"` will be converted to a [DataBackendDplyr]
87 | #' (this includes all connectors from \CRANpkg{dbplyr}).
88 | #' Local `"tbl"` objects such as [`tibbles`][tibble::tibble()] will be converted to a
89 | #' [DataBackendDataTable][mlr3::DataBackendDataTable].
90 | initialize = function(data, primary_key, strings_as_factors = TRUE, connector = NULL) {
91 | loadNamespace("DBI")
92 | loadNamespace("dbplyr")
93 |
94 | if (!dplyr::is.tbl(data)) {
95 | stop("Argument 'data' must be of class 'tbl'")
96 | }
97 |
98 | if (inherits(data, "tbl_sql")) {
99 | requireNamespace("dbplyr")
100 | }
101 |
102 | super$initialize(data, primary_key)
103 | assert_choice(primary_key, colnames(data))
104 |
105 | if (isFALSE(strings_as_factors)) {
106 | self$levels = list()
107 | } else {
108 | h = self$head(1L)
109 | string_cols = setdiff(names(h)[map_lgl(h, is.character)], self$primary_key)
110 |
111 | if (isTRUE(strings_as_factors)) {
112 | strings_as_factors = string_cols
113 | } else {
114 | assert_subset(strings_as_factors, string_cols)
115 | }
116 |
117 | self$levels = self$distinct(rows = NULL, cols = strings_as_factors)
118 | }
119 |
120 | self$connector = assert_function(connector, args = character(), null.ok = TRUE)
121 | },
122 |
123 | #' @description
124 | #' Finalizer which disconnects from the database.
125 | #' This is called during garbage collection of the instance.
126 | #' @return `logical(1)`, the return value of [DBI::dbDisconnect()].
127 | finalize = function() {
128 | if (isTRUE(self$valid)) {
129 | DBI::dbDisconnect(private$.data$src$con)
130 | }
131 | },
132 |
133 | #' @description
134 | #' Returns a slice of the data.
135 | #' Calls [dplyr::filter()] and [dplyr::select()] on the table and converts it to a [data.table::data.table()].
136 | #'
137 | #' The rows must be addressed as vector of primary key values, columns must be referred to via column names.
138 | #' Queries for rows with no matching row id and queries for columns with no matching
139 | #' column name are silently ignored.
140 | #' Rows are guaranteed to be returned in the same order as `rows`, columns may be returned in an arbitrary order.
141 | #' Duplicated row ids result in duplicated rows, duplicated column names lead to an exception.
142 | data = function(rows, cols, data_format = "data.table") {
143 | private$.reconnect()
144 | rows = assert_integerish(rows, coerce = TRUE)
145 | assert_names(cols, type = "unique")
146 | assert_choice(data_format, self$data_formats)
147 | cols = intersect(cols, colnames(private$.data))
148 |
149 | res = setDT(dplyr::collect(dplyr::select_at(
150 | dplyr::filter_at(private$.data, self$primary_key, dplyr::all_vars(. %in% rows)),
151 | union(cols, self$primary_key))))
152 |
153 | recode(res[list(rows), cols, nomatch = NULL, with = FALSE, on = self$primary_key],
154 | self$levels)
155 | },
156 |
157 | #' @description
158 | #' Retrieve the first `n` rows.
159 | #'
160 | #' @param n (`integer(1)`)\cr
161 | #' Number of rows.
162 | #'
163 | #' @return [data.table::data.table()] of the first `n` rows.
164 | head = function(n = 6L) {
165 | private$.reconnect()
166 | recode(setDT(dplyr::collect(head(private$.data, n))), self$levels)
167 | },
168 |
169 | #' @description
170 | #' Returns a named list of vectors of distinct values for each column
171 | #' specified. If `na_rm` is `TRUE`, missing values are removed from the
172 | #' returned vectors of distinct values. Non-existing rows and columns are
173 | #' silently ignored.
174 | #'
175 | #' @return Named `list()` of distinct values.
176 | distinct = function(rows, cols, na_rm = TRUE) {
177 | private$.reconnect()
178 | # TODO: what does dplyr::distinct return for enums?
179 | assert_names(cols, type = "unique")
180 | cols = intersect(cols, self$colnames)
181 |
182 | tbl = private$.data
183 | if (!is.null(rows)) {
184 | tbl = dplyr::filter_at(tbl, self$primary_key, dplyr::all_vars(. %in% rows))
185 | }
186 |
187 | get_distinct = function(col) {
188 | x = dplyr::collect(dplyr::distinct(dplyr::select_at(tbl, col)))[[1L]]
189 | if (is.factor(x)) {
190 | x = as.character(x)
191 | }
192 | if (na_rm) {
193 | x = x[!is.na(x)]
194 | }
195 | x
196 | }
197 | setNames(lapply(cols, get_distinct), cols)
198 | },
199 |
200 | #' @description
201 | #' Returns the number of missing values per column in the specified slice
202 | #' of data. Non-existing rows and columns are silently ignored.
203 | #'
204 | #' @return Total of missing values per column (named `numeric()`).
205 | missings = function(rows, cols) {
206 | private$.reconnect()
207 | rows = assert_integerish(rows, coerce = TRUE)
208 | assert_names(cols, type = "unique")
209 |
210 | cols = intersect(cols, self$colnames)
211 | if (length(cols) == 0L) {
212 | return(setNames(integer(0L), character(0L)))
213 | }
214 |
215 | res = dplyr::collect(dplyr::summarize_at(
216 | dplyr::filter_at(private$.data, self$primary_key, dplyr::all_vars(. %in% rows)),
217 | cols, list(~ sum(is.na(.), na.rm = TRUE))))
218 |
219 | if (nrow(res) == 0L) {
220 | return(setNames(integer(length(cols)), cols))
221 | }
222 | unlist(res, recursive = FALSE)
223 | }
224 | ),
225 |
226 | active = list(
227 | #' @field rownames (`integer()`)\cr
228 | #' Returns vector of all distinct row identifiers, i.e. the contents of the primary key column.
229 | rownames = function() {
230 | private$.reconnect()
231 | dplyr::collect(dplyr::select_at(private$.data, self$primary_key))[[1L]]
232 | },
233 |
234 | #' @field colnames (`character()`)\cr
235 | #' Returns vector of all column names, including the primary key column.
236 | colnames = function() {
237 | private$.reconnect()
238 | colnames(private$.data)
239 | },
240 |
241 | #' @field nrow (`integer(1)`)\cr
242 | #' Number of rows (observations).
243 | nrow = function() {
244 | private$.reconnect()
245 | dplyr::collect(dplyr::tally(private$.data))[[1L]]
246 | },
247 |
248 | #' @field ncol (`integer(1)`)\cr
249 | #' Number of columns (variables), including the primary key column.
250 | ncol = function() {
251 | private$.reconnect()
252 | ncol(private$.data)
253 | },
254 |
255 | #' @field valid (`logical(1)`)\cr
256 | #' Returns `NA` if the data does not inherit from `"tbl_sql"` (i.e., it is not a real SQL data base).
257 | #' Returns the result of [DBI::dbIsValid()] otherwise.
258 | valid = function() {
259 | if (!inherits(private$.data, "tbl_sql")) {
260 | return(NA)
261 | }
262 |
263 | loadNamespace("DBI")
264 | loadNamespace("dbplyr")
265 |
266 | # workaround for https://github.com/r-dbi/DBI/issues/302
267 | force(names(private$.data$src$con))
268 |
269 | DBI::dbIsValid(private$.data$src$con)
270 | }
271 | ),
272 |
273 | private = list(
274 | .calculate_hash = function() {
275 | private$.reconnect()
276 | calculate_hash(private$.data)
277 | },
278 |
279 | .reconnect = function() {
280 | if (isFALSE(self$valid)) {
281 | if (is.null(self$connector)) {
282 | stop("Invalid connection. Provide a connector during construction to automatically reconnect", call. = FALSE)
283 | }
284 |
285 | con = self$connector()
286 |
287 | if (!all(class(private$.data$src$con) == class(con))) {
288 | stop(sprintf("Reconnecting failed. Expected a connection of class %s, but got %s",
289 | paste0(class(private$.data$src$con), collapse = "/"), paste0(class(con), collapse = "/")), call. = FALSE)
290 | }
291 |
292 | private$.data$src$con = con
293 | }
294 | }
295 | )
296 | )
297 |
298 | #' @importFrom mlr3 as_data_backend
299 | #' @export
300 | as_data_backend.tbl_SQLiteConnection = function(data, primary_key, strings_as_factors = TRUE, ...) { # nolint
301 | b = DataBackendDplyr$new(data, primary_key)
302 | path = data$src$con@dbname
303 | if (!identical(path, ":memory:") && test_string(path) && file.exists(path)) {
304 | b$connector = sqlite_reconnector(path)
305 | }
306 | return(b)
307 | }
308 |
309 | #' @importFrom mlr3 as_data_backend
310 | #' @export
311 | as_data_backend.tbl_lazy = function(data, primary_key, strings_as_factors = TRUE, ...) { # nolint
312 | DataBackendDplyr$new(data, primary_key)
313 | }
314 |
315 | #' @rawNamespace if (getRversion() >= "3.6.0") S3method(dplyr::show_query, DataBackendDplyr)
316 | show_query.DataBackendDplyr = function(x, ...) { # nolint
317 | requireNamespace("dplyr")
318 | requireNamespace("dbplyr")
319 | dplyr::show_query(x$.__enclos_env__$private$.data)
320 | }
321 |
--------------------------------------------------------------------------------
/R/DataBackendDuckDB.R:
--------------------------------------------------------------------------------
1 | #' @title DataBackend for DuckDB
2 | #'
3 | #' @description
4 | #' A [mlr3::DataBackend] for \CRANpkg{duckdb}.
5 | #' Can be easily constructed with [as_duckdb_backend()].
6 | #'
7 | #' @seealso
8 | #' \url{https://duckdb.org/}
9 | #'
10 | #' @param rows `integer()`\cr
11 | #' Row indices.
12 | #' @param cols `character()`\cr
13 | #' Column names.
14 | #' @param data_format (`character(1)`)\cr
15 | #' Desired data format, e.g. `"data.table"` or `"Matrix"`.
16 | #' @param na_rm `logical(1)`\cr
17 | #' Whether to remove NAs or not.
18 | #'
19 | #' @template param_primary_key
20 | #' @template param_strings_as_factors
21 | #' @template param_connector
22 | #'
23 | #' @importFrom mlr3 DataBackend
24 | #' @export
25 | DataBackendDuckDB = R6Class("DataBackendDuckDB", inherit = DataBackend, cloneable = FALSE,
26 | public = list(
27 | #' @template field_levels
28 | levels = NULL,
29 |
30 | #' @template field_connector
31 | connector = NULL,
32 |
33 | #' @field table (`character(1)`)\cr
34 | #' Data base table or view to operate on.
35 | table = NULL,
36 |
37 | #' @description
38 | #'
39 | #' Creates a backend for a [duckdb::duckdb()] database.
40 | #'
41 | #' @param data (connection)\cr
42 | #' A connection created with [DBI::dbConnect()].
43 | #' If constructed manually (and not via the helper function [as_duckdb_backend()],
44 | #' make sure that there exists an (unique) index for the key column.
45 | #' @param table (`character(1)`)\cr
46 | #' Table or view to operate on.
47 | initialize = function(data, table, primary_key, strings_as_factors = TRUE, connector = NULL) {
48 | loadNamespace("duckdb")
49 |
50 | assert_class(data, "duckdb_connection")
51 | super$initialize(data, primary_key)
52 | self$table = assert_string(table)
53 |
54 | info = self$table_info
55 | assert_choice(self$primary_key, info$name)
56 | assert_choice(self$table, DBI::dbGetQuery(private$.data, "PRAGMA show_tables")$name)
57 | self$connector = assert_function(connector, args = character(), null.ok = TRUE)
58 |
59 | if (isFALSE(strings_as_factors)) {
60 | self$levels = list()
61 | } else {
62 | string_cols = info$name[tolower(info$type) %in% c("varchar", "string", "text")]
63 | string_cols = setdiff(string_cols, self$primary_key)
64 |
65 | if (isTRUE(strings_as_factors)) {
66 | strings_as_factors = string_cols
67 | } else {
68 | assert_subset(strings_as_factors, string_cols)
69 | }
70 |
71 | self$levels = self$distinct(rows = NULL, cols = strings_as_factors)
72 | }
73 |
74 | },
75 |
76 | #' @description
77 | #' Finalizer which disconnects from the database.
78 | #' This is called during garbage collection of the instance.
79 | #' @return `logical(1)`, the return value of [DBI::dbDisconnect()].
80 | finalize = function() {
81 | if (isTRUE(self$valid)) {
82 | DBI::dbDisconnect(private$.data, shutdown = TRUE)
83 | }
84 | },
85 |
86 | #' @description
87 | #' Returns a slice of the data.
88 | #'
89 | #' The rows must be addressed as vector of primary key values, columns must be referred to via column names.
90 | #' Queries for rows with no matching row id and queries for columns with no matching
91 | #' column name are silently ignored.
92 | #' Rows are guaranteed to be returned in the same order as `rows`, columns may be returned in an arbitrary order.
93 | #' Duplicated row ids result in duplicated rows, duplicated column names lead to an exception.
94 |     data = function(rows, cols, data_format = "data.table") {
95 |       private$.reconnect()
96 |       rows = assert_integerish(rows, coerce = TRUE)
97 |       assert_names(cols, type = "unique")
98 |       assert_choice(data_format, self$data_formats)
99 |       cols = intersect(cols, self$colnames)  # silently drop unknown columns
100 |       tmp_tbl = write_temp_table(private$.data, rows)  # stage requested row ids as a temp table for the join
101 |       on.exit(DBI::dbRemoveTable(private$.data, tmp_tbl, temporary = TRUE))
102 | 
103 |       query = sprintf('SELECT %1$s FROM "%2$s" INNER JOIN "%3$s" ON "%2$s"."row_id" = "%3$s"."%4$s"',
104 |         paste0(sprintf('"%s"."%s"', self$table, union(cols, self$primary_key)), collapse = ","),
105 |         tmp_tbl, self$table, self$primary_key)
106 | 
107 |       res = setDT(DBI::dbGetQuery(private$.data, query), key = self$primary_key)
108 |       recode(res[list(rows), cols, nomatch = NULL, on = self$primary_key, with = FALSE],
109 |         self$levels)  # join on `rows` restores requested order/duplicates; recode() applies stored factor levels
110 |     },
111 |
112 | #' @description
113 | #' Retrieve the first `n` rows.
114 | #'
115 | #' @param n (`integer(1)`)\cr
116 | #' Number of rows.
117 | #'
118 | #' @return [data.table::data.table()] of the first `n` rows.
119 | head = function(n = 6L) {
120 | private$.reconnect()
121 | res = DBI::dbGetQuery(private$.data,
122 | sprintf('SELECT * FROM "%s" ORDER BY "%s" LIMIT %i', self$table, self$primary_key, n))
123 | recode(setDT(res), self$levels)
124 | },
125 |
126 | #' @description
127 | #' Returns a named list of vectors of distinct values for each column
128 | #' specified. If `na_rm` is `TRUE`, missing values are removed from the
129 | #' returned vectors of distinct values. Non-existing rows and columns are
130 | #' silently ignored.
131 | #'
132 | #' @return Named `list()` of distinct values.
133 | distinct = function(rows, cols, na_rm = TRUE) {
134 | private$.reconnect()
135 | assert_names(cols, type = "unique")
136 | cols = intersect(cols, self$colnames)
137 | order = sprintf('ORDER BY "%s"', self$primary_key)
138 |
139 | if (is.null(rows)) {
140 | get_query = function(col) {
141 | sprintf('SELECT DISTINCT("%s") FROM "%s"', col, self$table)
142 | }
143 | } else {
144 | tmp_tbl = write_temp_table(private$.data, rows)
145 | on.exit(DBI::dbRemoveTable(private$.data, tmp_tbl, temporary = TRUE))
146 |
147 | get_query = function(col) {
148 | sprintf('SELECT DISTINCT("%1$s"."%2$s") FROM "%3$s" LEFT JOIN "%1$s" ON "%3$s"."row_id" = "%1$s"."%4$s"',
149 | self$table, col, tmp_tbl, self$primary_key)
150 | }
151 | }
152 |
153 | res = lapply(cols, function(col) {
154 | query = get_query(col)
155 | if (na_rm) {
156 | query = sprintf('%s WHERE "%s"."%s" IS NOT NULL', query, self$table, col)
157 | }
158 | levels = DBI::dbGetQuery(private$.data, paste(query, order))[[1L]]
159 | if (is.factor(levels)) as.character(levels) else levels
160 | })
161 |
162 | setNames(res, cols)
163 | },
164 |
165 | #' @description
166 | #' Returns the number of missing values per column in the specified slice
167 | #' of data. Non-existing rows and columns are silently ignored.
168 | #'
169 | #' @return Total of missing values per column (named `numeric()`).
170 | missings = function(rows, cols) {
171 | private$.reconnect()
172 | rows = assert_integerish(rows, coerce = TRUE)
173 | assert_names(cols, type = "unique")
174 |
175 | cols = intersect(cols, self$colnames)
176 | if (length(cols) == 0L) {
177 | return(setNames(integer(0L), character(0L)))
178 | }
179 |
180 | tmp_tbl = write_temp_table(private$.data, rows)
181 | on.exit(DBI::dbRemoveTable(private$.data, tmp_tbl, temporary = TRUE))
182 |
183 | query = sprintf('SELECT %1$s FROM (SELECT * FROM "%2$s" INNER JOIN "%3$s" ON "%2$s"."%4$s" = "%3$s"."row_id")',
184 | paste0(sprintf('COUNT("%s")', cols), collapse = ","),
185 | self$table,
186 | tmp_tbl,
187 | self$primary_key
188 | )
189 |
190 | counts = unlist(DBI::dbGetQuery(private$.data, query), recursive = FALSE)
191 | setNames(as.integer(length(rows) - counts), cols)
192 | }
193 | ),
194 |
195 | active = list(
196 | #' @field table_info (`data.frame()`)\cr
197 | #' Data frame as returned by pragma `table_info()`.
198 | table_info = function() {
199 | private$.reconnect()
200 | DBI::dbGetQuery(private$.data, sprintf("PRAGMA table_info('%s')", self$table))
201 | },
202 |
203 | #' @field rownames (`integer()`)\cr
204 | #' Returns vector of all distinct row identifiers, i.e. the contents of the primary key column.
205 | rownames = function() {
206 | private$.reconnect()
207 | res = DBI::dbGetQuery(private$.data,
208 | sprintf('SELECT "%1$s" FROM "%2$s" ORDER BY "%1$s"', self$primary_key, self$table))
209 | res[[1L]]
210 | },
211 |
212 | #' @field colnames (`character()`)\cr
213 | #' Returns vector of all column names, including the primary key column.
214 | colnames = function() {
215 | private$.reconnect()
216 | self$table_info$name
217 | },
218 |
219 | #' @field nrow (`integer(1)`)\cr
220 | #' Number of rows (observations).
221 | nrow = function() {
222 | private$.reconnect()
223 | res = DBI::dbGetQuery(private$.data,
224 | sprintf('SELECT COUNT(*) AS n FROM "%s"', self$table))
225 | as.integer(res$n)
226 | },
227 |
228 | #' @field ncol (`integer(1)`)\cr
229 | #' Number of columns (variables), including the primary key column.
230 | ncol = function() {
231 | private$.reconnect()
232 | nrow(self$table_info)
233 | },
234 |
235 |     #' @field valid (`logical(1)`)\cr
236 |     #' Returns the result of [DBI::dbIsValid()] on the stored duckdb connection.
238 | valid = function() {
239 | loadNamespace("DBI")
240 | loadNamespace("duckdb")
241 | DBI::dbIsValid(private$.data)
242 | }
243 | ),
244 |
245 | private = list(
246 | .calculate_hash = function() {
247 | private$.reconnect()
248 | calculate_hash(private$.data@driver@dbdir)
249 | },
250 |
251 |     .reconnect = function() {
252 |       # Re-establish the duckdb connection via the user-supplied connector if it
253 |       # became invalid (e.g. after serialization to a different R session).
254 |       if (isFALSE(self$valid)) {
255 |         if (is.null(self$connector)) {
256 |           stop("Invalid connection. Provide a connector during construction to automatically reconnect", call. = FALSE)
257 |         }
258 | 
259 |         con = self$connector()
260 | 
261 |         # For this backend, private$.data IS the (S4) duckdb connection. The old
262 |         # code built the error message from private$.data$src$con, which does not
263 |         # exist here ($ on an S4 connection errors) -- that was copied from the
264 |         # dplyr backend. Use class(private$.data), matching the check above.
265 |         if (!all(class(private$.data) == class(con))) {
266 |           stop(sprintf("Reconnecting failed. Expected a connection of class %s, but got %s",
267 |             paste0(class(private$.data), collapse = "/"), paste0(class(con), collapse = "/")), call. = FALSE)
268 |         }
269 | 
270 |         private$.data = con
271 |       }
272 |     }
267 | )
268 | )
269 |
270 | # Writes the requested row ids into a pid-unique temporary table and returns
271 | # its name; the caller is responsible for dropping the table afterwards.
272 | write_temp_table = function(con, rows) {
273 |   tbl_name = sprintf("rows_%i", Sys.getpid())
274 |   row_ids = data.frame(row_id = sort(unique(rows)))
275 |   DBI::dbWriteTable(con, tbl_name, row_ids, temporary = TRUE, overwrite = TRUE, append = FALSE)
276 |   tbl_name
277 | }
276 |
277 | #' @importFrom mlr3 as_data_backend
278 | #' @export
279 | as_data_backend.tbl_duckdb_connection = function(data, primary_key, strings_as_factors = TRUE, ...) { # nolint
280 |   # `data` is a dbplyr lazy table on a duckdb connection (note data$src$con
281 |   # below), not a raw DBI connection, so it must be wrapped in a
282 |   # DataBackendDplyr -- cf. the tbl_SQLiteConnection method. The previous
283 |   # DataBackendDuckDB$new(data, primary_key) call would fail its
284 |   # assert_class(data, "duckdb_connection") and misplace primary_key as `table`.
285 |   b = DataBackendDplyr$new(data, primary_key)
286 |   # for file-backed databases, install a connector so the backend can
287 |   # transparently re-open the database after serialization
288 |   path = data$src$con@driver@dbdir
289 |   if (!identical(path, ":memory:") && test_string(path) && file.exists(path)) {
290 |     b$connector = duckdb_reconnector(path)
291 |   }
292 |   return(b)
293 | }
287 |
--------------------------------------------------------------------------------
/R/DataBackendPolars.R:
--------------------------------------------------------------------------------
1 | #' @title DataBackend for Polars
2 | #'
3 | #' @description
4 | #' A [mlr3::DataBackend] using `RPolarsLazyFrame` from package \CRANpkg{polars}.
5 | #' Can be easily constructed with [as_polars_backend()].
6 | #' [mlr3::Task]s can interface out-of-memory files if the `polars::RPolarsLazyFrame` was imported using a `polars::scan_x` function.
7 | #' Streaming, a \CRANpkg{polars} alpha feature, is always enabled, but only used when applicable.
8 | #' A connector is not required but can be useful e.g. for scanning larger than memory files
9 | #'
10 | #' @seealso
11 | #' \url{https://pola-rs.github.io/r-polars/}
12 | #'
13 | #' @param rows (`integer()`)\cr
14 | #' Row indices.
15 | #' @param cols (`character()`)\cr
16 | #' Column names.
17 | #' @param na_rm (`logical(1)`)\cr
18 | #' Whether to remove NAs or not.
19 | #' @param primary_key (`character(1)`)\cr
20 | #' Name of the primary key column.
21 | #' Because `polars` does not natively support primary keys, uniqueness of the primary key column is expected but not enforced.
22 | #' @param connector (`function()`)\cr
23 | #' Optional function which is called to re-connect to e.g. a source file in case the connection became invalid.
24 | #'
25 | #' @template param_strings_as_factors
26 | #'
27 | #' @importFrom mlr3 DataBackend
28 | #' @export
29 | #' @examples
30 | #' if (mlr3misc::require_namespaces("polars", quietly = TRUE)) {
31 | #' # Backend using a in-memory data set
32 | #' data = iris
33 | #' data$Sepal.Length[1:30] = NA
34 | #' data$row_id = 1:150
35 | #' data = polars::as_polars_lf(data)
36 | #' b = DataBackendPolars$new(data, primary_key = "row_id")
37 | #'
38 | #' # Object supports all accessors of DataBackend
39 | #' print(b)
40 | #' b$nrow
41 | #' b$ncol
42 | #' b$colnames
43 | #' b$data(rows = 100:101, cols = "Species")
44 | #' b$distinct(b$rownames, "Species")
45 | #'
46 | #' # Classification task using this backend
47 | #' task = mlr3::TaskClassif$new(id = "iris_polars", backend = b, target = "Species")
48 | #' print(task)
49 | #' head(task)
50 | #'
51 | #' # Write a parquet file to scan
52 | #' data$collect()$write_parquet("iris.parquet")
53 | #' data = polars::pl$scan_parquet("iris.parquet")
54 | #'
55 | #' # Backend that re-reads the parquet file if the connection fails
56 | #' b = DataBackendPolars$new(data, "row_id",
57 | #' connector = function() polars::pl$scan_parquet("iris.parquet"))
58 | #' print(b)
59 | #'
60 | #' # Define a backend on a subset of the database: do not use column "Sepal.Width"
61 | #' data = data$select(
62 | #' polars::pl$col(setdiff(colnames(data), "Sepal.Width"))
63 | #' )$filter(
64 | #' polars::pl$col("row_id")$is_in(1:120) # Use only first 120 rows
65 | #' )
66 | #'
67 | #' # Backend with only scanned data
68 | #' b = DataBackendPolars$new(data, "row_id", strings_as_factors = TRUE)
69 | #' print(b)
70 | #'
71 | #' # Query distinct values
72 | #' b$distinct(b$rownames, "Species")
73 | #'
74 | #' # Query number of missing values
75 | #' b$missings(b$rownames, b$colnames)
76 | #'
77 | #' # Cleanup
78 | #' if (file.exists("iris.parquet")) {
79 | #' file.remove("iris.parquet")
80 | #' }
81 | #' }
82 | DataBackendPolars = R6Class("DataBackendPolars", inherit = DataBackend, cloneable = FALSE,
83 | public = list(
84 | #' @template field_levels
85 | levels = NULL,
86 |
87 | #' @template field_connector
88 | connector = NULL,
89 |
90 | #' @description
91 | #'
92 | #' Creates a backend for a [polars::RPolarsDataFrame] object.
93 | #'
94 | #' @param data ([polars::RPolarsLazyFrame])\cr
95 | #' The data object.
96 | #'
97 | #' Instead of calling the constructor itself, please call [mlr3::as_data_backend()] on
98 | #' a [polars::RPolarsLazyFrame] or [polars::RPolarsDataFrame].
99 | #' Note that only [polars::RPolarsLazyFrame]s will be converted to a [DataBackendPolars].
100 | #' [polars::RPolarsDataFrame] objects without lazy execution will be converted to a
101 | #' [DataBackendDataTable][mlr3::DataBackendDataTable].
102 |     initialize = function(data, primary_key, strings_as_factors = TRUE, connector = NULL) {
103 |       loadNamespace("polars")
104 |       # assert_class() checks inheritance and tolerates class() returning a
105 |       # vector; assert_choice(class(data), ...) would error on multi-class objects
106 |       assert_class(data, "RPolarsLazyFrame")
107 | 
108 |       super$initialize(data, primary_key)
109 |       assert_choice(primary_key, colnames(data))
110 |       self$connector = assert_function(connector, args = character(), null.ok = TRUE)
111 | 
112 |       if (isFALSE(strings_as_factors)) {
113 |         self$levels = list()  # factor recoding disabled
114 |       } else {
115 |         # detect string-like columns from a one-row sample of the frame
116 |         h = self$head(1L)
117 |         string_cols = setdiff(names(h)[map_lgl(h, function(x) {is.character(x) || is.factor(x)})], self$primary_key)
118 | 
119 |         if (isTRUE(strings_as_factors)) {
120 |           strings_as_factors = string_cols
121 |         } else {
122 |           assert_subset(strings_as_factors, string_cols)
123 |         }
124 | 
125 |         # store complete level sets so recoding is deterministic across slices
126 |         self$levels = self$distinct(rows = NULL, cols = strings_as_factors)
127 |       }
128 |     },
125 |
126 | #' @description
127 | #' Returns a slice of the data.
128 | #'
129 | #' The rows must be addressed as vector of primary key values, columns must be referred to via column names.
130 | #' Queries for rows with no matching row id and queries for columns with no matching
131 | #' column name are silently ignored.
132 | data = function(rows, cols) {
133 | private$.reconnect()
134 | rows = assert_integerish(rows, coerce = TRUE)
135 | assert_names(cols, type = "unique")
136 | cols = intersect(cols, self$colnames)
137 |
138 | data = private$.data
139 | res = data$filter(polars::pl$col(self$primary_key)$is_in(rows))$select(polars::pl$col(union(self$primary_key, cols)))$collect(streaming = TRUE)
140 | res = as.data.table(res)
141 |
142 | recode(res[list(rows), cols, nomatch = NULL, on = self$primary_key, with = FALSE],
143 | self$levels)
144 | },
145 |
146 | #' @description
147 | #' Retrieve the first `n` rows.
148 | #'
149 | #' @param n (`integer(1)`)\cr
150 | #' Number of rows.
151 | #'
152 | #' @return [data.table::data.table()] of the first `n` rows.
153 | head = function(n = 6L) {
154 | private$.reconnect()
155 | recode(as.data.table(private$.data$head(n)$collect(streaming = TRUE)), self$levels)
156 | },
157 |
158 | #' @description
159 | #' Returns a named list of vectors of distinct values for each column
160 | #' specified. If `na_rm` is `TRUE`, missing values are removed from the
161 | #' returned vectors of distinct values. Non-existing rows and columns are
162 | #' silently ignored.
163 | #'
164 | #' @return Named `list()` of distinct values.
165 | distinct = function(rows, cols, na_rm = TRUE) {
166 | private$.reconnect()
167 | assert_names(cols, type = "unique")
168 | cols = intersect(cols, self$colnames)
169 |
170 | dat = private$.data
171 |
172 | if (!is.null(rows)) {
173 | dat = dat$filter(polars::pl$col(self$primary_key)$is_in(rows))
174 | }
175 |
176 | get_distinct = function(col) {
177 | x = as.vector(
178 | dat$select(
179 | polars::pl$col(col)$unique()
180 | )$collect(streaming = TRUE)$get_column(col)
181 | )
182 |
183 | if (is.factor(x)) {
184 | x = as.character(x)
185 | }
186 | if (na_rm) {
187 | x = x[!is.na(x)]
188 | }
189 | x
190 | }
191 | setNames(lapply(cols, get_distinct), cols)
192 | },
193 |
194 | #' @description
195 | #' Returns the number of missing values per column in the specified slice
196 | #' of data. Non-existing rows and columns are silently ignored.
197 | #'
198 | #' @return Total of missing values per column (named `numeric()`).
199 | missings = function(rows, cols) {
200 | private$.reconnect()
201 | rows = assert_integerish(rows, coerce = TRUE)
202 | assert_names(cols, type = "unique")
203 |
204 | cols = intersect(cols, self$colnames)
205 | if (length(cols) == 0L) {
206 | return(setNames(integer(0L), character(0L)))
207 | }
208 |
209 | res = private$.data$filter(
210 | polars::pl$col(self$primary_key)$is_in(rows)
211 | )
212 | res = res$select(
213 | lapply(cols, function(col) {
214 | polars::pl$col(col)$is_null()$sum()$alias(col)
215 | })
216 | )$collect(streaming = TRUE)
217 |
218 | if (res$height == 0L) {
219 | return(setNames(integer(length(cols)), cols))
220 | }
221 |
222 | setNames(mlr3misc::map_int(cols, function(col) as.integer(as.vector(res$get_column(col)))), cols)
223 | }
224 | ),
225 |
226 | active = list(
227 | #' @field rownames (`integer()`)\cr
228 | #' Returns vector of all distinct row identifiers, i.e. the contents of the primary key column.
229 | rownames = function() {
230 | private$.reconnect()
231 |
232 | as.vector(
233 | private$.data$
234 | select(polars::pl$col(self$primary_key))$
235 | collect()$
236 | get_column(self$primary_key)
237 | )
238 | },
239 |
240 | #' @field colnames (`character()`)\cr
241 | #' Returns vector of all column names, including the primary key column.
242 | colnames = function() {
243 | private$.reconnect()
244 | names(private$.data$schema)
245 | },
246 |
247 | #' @field nrow (`integer(1)`)\cr
248 | #' Number of rows (observations).
249 | nrow = function() {
250 | private$.reconnect()
251 | n = private$.data$select(polars::pl$len())$collect(streaming = TRUE)$item()
252 | as.integer(n)
253 | },
254 |
255 | #' @field ncol (`integer(1)`)\cr
256 | #' Number of columns (variables), including the primary key column.
257 | ncol = function() {
258 | private$.reconnect()
259 | length(private$.data$schema)
260 | }
261 | ),
262 |
263 | private = list(
264 | .calculate_hash = function() {
265 | private$.reconnect()
266 | calculate_hash(private$.data)
267 | },
268 |
269 |     .reconnect = function() {  # refresh the lazy frame from the connector; no-op without one
270 |       if (is.null(self$connector)) {
271 |         return(invisible())  # lazy frames have no validity probe, so absence of a connector means "keep as is"
272 |       }
273 | 
274 |       con = self$connector()  # note: runs on every accessor call when a connector is set
275 | 
276 |       if (!all(class(private$.data) == class(con))) {  # replacement must have the same class
277 |         stop(sprintf("Reconnecting failed. Expected a connection of class %s, but got %s",
278 |           paste0(class(private$.data), collapse = "/"), paste0(class(con), collapse = "/")), call. = FALSE)
279 |       }
280 | 
281 |       private$.data = con
282 |     }
283 | )
284 | )
285 |
286 | #' @importFrom mlr3 as_data_backend
287 | #' @export
288 | as_data_backend.RPolarsDataFrame = function(data, primary_key = NULL, ...) { # nolint
289 |   # eager polars frames are materialized and delegated to mlr3's data.frame
290 |   # method, yielding a DataBackendDataTable
291 |   tab = as.data.frame(data)
292 | 
293 |   # mlr3 expects integer row ids; narrow an integerish primary key column
294 |   if (!is.null(primary_key) && test_integerish(tab[[primary_key]])) {
295 |     tab[[primary_key]] = as.integer(tab[[primary_key]])
296 |   }
297 | 
298 |   as_data_backend(tab, primary_key = primary_key)
299 | }
297 |
298 | #' @importFrom mlr3 as_data_backend
299 | #' @export
300 | as_data_backend.RPolarsLazyFrame = function(data, primary_key, strings_as_factors = TRUE, ...) { # nolint
301 |   # lazy frames keep their deferred execution inside a DataBackendPolars
302 |   backend = DataBackendPolars$new(data, primary_key, strings_as_factors)
303 |   backend
304 | }
303 |
--------------------------------------------------------------------------------
/R/as_duckdb_backend.R:
--------------------------------------------------------------------------------
1 | #' @title Convert to DuckDB Backend
2 | #'
3 | #' @description
4 | #' Converts to a [DataBackendDuckDB] using the \CRANpkg{duckdb} database, depending on the input type:
5 | #'
6 | #' * `data.frame`: Creates a new [DataBackendDataTable] first using [as_data_backend()], then proceeds
7 | #' with the conversion from [DataBackendDataTable] to [DataBackendDuckDB].
8 | #' * [mlr3::DataBackend]: Creates a new DuckDB data base in the specified path.
9 | #' The filename is determined by the hash of the [DataBackend].
10 | #' If the file already exists, a connection to the existing database is established and the existing
11 | #' files are reused.
12 | #'
13 | #' The created backend automatically reconnects to the database if the connection was lost, e.g. because
14 | #' the object was serialized to the filesystem and restored in a different R session.
15 | #' The only requirement is that the path does not change and that the path is accessible
16 | #' on all workers.
17 | #'
18 | #' @param data (`data.frame()` | [mlr3::DataBackend])\cr
19 | #' See description.
20 | #' @param ... (`any`)\cr
21 | #' Additional arguments, passed to [DataBackendDuckDB].
22 | #' @template param_path
23 | #'
24 | #' @return [DataBackendDuckDB] or [Task].
25 | #' @export
26 | as_duckdb_backend = function(data, path = getOption("mlr3db.duckdb_dir", ":temp:"), ...) {
27 |   UseMethod("as_duckdb_backend")  # dispatches on class(data): data.frame, character (parquet paths), DataBackend
28 | }
29 |
30 | #' @export
31 | as_duckdb_backend.data.frame = function(data, path = getOption("mlr3db.duckdb_dir", ":temp:"), primary_key = NULL, ...) { # nolint
32 | backend = as_data_backend(data, primary_key = primary_key)
33 | as_duckdb_backend.DataBackend(backend, path = path, ...)
34 | }
35 |
36 | #' @export
37 | as_duckdb_backend.character = function(data, path = getOption("mlr3db.duckdb_dir", ":temp:"), primary_key = NULL, ...) {
38 |   assert_file_exists(data, access = "r", extension = "parquet")  # `data` is one or more readable parquet paths
39 |   con = DBI::dbConnect(duckdb::duckdb())  # in-memory database; the view merely references the parquet files
40 | 
41 |   query = "CREATE OR REPLACE VIEW 'mlr3db_view' AS SELECT *"
42 |   if (is.null(primary_key)) {
43 |     primary_key = "mlr3_row_id"
44 |     query = paste0(query, ", row_number() OVER () AS mlr3_row_id")  # synthesize a row id column
45 |   } else {
46 |     assert_string(primary_key)
47 |   }
48 | 
49 |   query = sprintf("%s FROM parquet_scan(['%s'])", query, paste0(data, collapse = "','"))  # NOTE(review): paths are interpolated into SQL; a path containing a single quote breaks the statement -- consider escaping
50 |   DBI::dbExecute(con, query)
51 | 
52 |   DataBackendDuckDB$new(con, table = "mlr3db_view", primary_key = primary_key)  # backend operates on the view
53 | }
54 |
55 | #' @export
56 | as_duckdb_backend.DataBackend = function(data, path = getOption("mlr3db.duckdb_dir", ":temp:"), ...) { # nolint
57 |   # Materializes the backend into a duckdb file named after its hash; if the
58 |   # file already exists it is reused as-is.
59 |   path = get_db_path(path, hash = data$hash, "duckdb")
60 |   primary_key = data$primary_key
61 | 
62 |   con = NULL
63 |   on.exit({
64 |     if (!is.null(con)) DBI::dbDisconnect(con, shutdown = TRUE)  # close connection on error paths
65 |   }, add = TRUE)
66 | 
67 |   if (!file.exists(path)) {
68 |     on.exit({
69 |       # drop partially written database files on error; the closing paren was
70 |       # previously misplaced so that `recursive = TRUE` was pasted into the
71 |       # filenames (yielding "<path>TRUE" etc.) and nothing was ever removed
72 |       if (file.exists(path)) unlink(paste0(path, c("", ".wal", ".tmp")), recursive = TRUE)
73 |     }, add = TRUE)
74 | 
75 |     con = DBI::dbConnect(duckdb::duckdb(), dbdir = path, read_only = FALSE)
76 |     DBI::dbWriteTable(con, "data", data$head(Inf), row.names = FALSE)  # materialize the full backend
77 |     DBI::dbExecute(con, sprintf('CREATE UNIQUE INDEX primary_key ON "%s" ("%s")', "data", primary_key))
78 |     DBI::dbDisconnect(con, shutdown = TRUE)
79 |     con = NULL  # already closed; prevents a double disconnect in the exit handler
80 |   }
81 | 
82 |   con = DBI::dbConnect(duckdb::duckdb(), dbdir = path, read_only = TRUE)
83 |   backend = DataBackendDuckDB$new(con, table = "data", primary_key = primary_key, ...)
84 |   backend$connector = duckdb_reconnector(path)  # auto-reconnect after serialization
85 | 
86 |   on.exit()  # success: cancel all cleanup handlers
87 |   return(backend)
88 | }
83 |
84 | # Returns a closure that re-opens the duckdb database at `path` read-only.
85 | duckdb_reconnector = function(path) {
86 |   force(path)  # capture by value before the closure escapes
87 |   function() DBI::dbConnect(duckdb::duckdb(), path, read_only = TRUE)
88 | }
90 |
--------------------------------------------------------------------------------
/R/as_polars_backend.R:
--------------------------------------------------------------------------------
1 | #' @title Convert to Polars Backend
2 | #'
3 | #' @description
4 | #' Converts to a [DataBackendPolars] using the \CRANpkg{polars} database, depending on the input type:
5 | #'
6 | #' * `data.frame`: Creates a new [DataBackendDataTable] first using [as_data_backend()], then proceeds
7 | #' with the conversion from [DataBackendDataTable] to [DataBackendPolars].
8 | #' * [mlr3::DataBackend]: Creates a new [DataBackendPolars].
9 | #'
10 | #' There is no automatic connection to the origin file set.
11 | #' If the data is obtained using scanning and the data is streamed, a `connector` can be set manually but is not required.
12 | #'
13 | #' @param data (`data.frame()` | [mlr3::DataBackend])\cr
14 | #' See description.
15 | #' @param streaming (`logical(1)`)\cr
16 | #' Whether the data should be only scanned (recommended for large data sets) and streamed with
17 | #' every [DataBackendPolars] operation or loaded into memory completely.
18 | #'
19 | #' @param ... (`any`)\cr
20 | #' Additional arguments, passed to [DataBackendPolars].
21 | #'
22 | #' @return [DataBackendPolars] or [Task].
23 | #' @export
24 | as_polars_backend = function(data, streaming = FALSE, ...) {
25 |   UseMethod("as_polars_backend")  # dispatches on class(data): data.frame or DataBackend
26 | }
27 |
28 |
29 | #' @export
30 | as_polars_backend.data.frame = function(data, streaming = FALSE, primary_key = NULL, ...) {
31 | backend = as_data_backend(data, primary_key = primary_key, streaming = streaming)
32 | as_polars_backend.DataBackend(backend, ...)
33 | }
34 |
35 |
36 | #' @export
37 | as_polars_backend.DataBackend = function(data, streaming = FALSE, ...) {
38 |   path = get_db_path(tempfile(), data$hash, "polars")  # scratch location below tempdir()
39 | 
40 |   on.exit({
41 |     if (file.exists(path)) file.remove(path)  # NOTE(review): the parquet is written to sprintf("%s.parquet", path), not `path`, so this check never fires; removing the parquet here would also invalidate the lazy scan -- confirm intended temp-file lifetime
42 |   })
43 | 
44 |   primary_key = data$primary_key
45 | 
46 |   if(streaming) {
47 |     polars::as_polars_df(data$head(Inf))$write_parquet(sprintf("%s.parquet", path))  # materialize once, then re-scan lazily
48 |     data = polars::pl$scan_parquet(sprintf("%s.parquet", path))
49 |   } else {
50 |     data = polars::as_polars_lf(data$head(Inf))  # fully in-memory lazy frame
51 |   }
52 | 
53 |   DataBackendPolars$new(data = data, primary_key = primary_key, ...)
54 | }
55 |
--------------------------------------------------------------------------------
/R/as_sqlite_backend.R:
--------------------------------------------------------------------------------
1 | #' @title Convert to SQLite Backend
2 | #'
3 | #' @description
4 | #' Converts to a [DataBackendDplyr] using a \CRANpkg{RSQLite} database, depending on the input type:
5 | #'
6 | #' * `data.frame`: Creates a new [DataBackendDataTable] first using [as_data_backend()], then proceeds
7 | #' with the conversion from [DataBackendDataTable] to [DataBackendDplyr].
8 | #' * [mlr3::DataBackend]: Creates a new SQLite data base in the specified path.
9 | #' The filename is determined by the hash of the [DataBackend].
10 | #' If the file already exists, a connection to the existing database is established and the existing
11 | #' files are reused.
12 | #'
13 | #' The created backend automatically reconnects to the database if the connection was lost, e.g. because
14 | #' the object was serialized to the filesystem and restored in a different R session.
15 | #' The only requirement is that the path does not change and that the path is accessible
16 | #' on all workers.
17 | #'
18 | #' @param data (`data.frame()` | [mlr3::DataBackend]\cr
19 | #' See description.
20 | #' @param ... (`any`)\cr
21 | #' Additional arguments, passed to [DataBackendDplyr].
22 | #' @template param_path
23 | #'
24 | #' @return [DataBackendDplyr] or [Task].
25 | #' @export
26 | as_sqlite_backend = function(data, path = getOption("mlr3db.sqlite_dir", ":temp:"), ...) {
27 |   UseMethod("as_sqlite_backend")  # dispatches on class(data): data.frame or DataBackend
28 | }
29 |
30 | #' @inheritParams as_data_backend
31 | #' @export
32 | as_sqlite_backend.data.frame = function(data, path = getOption("mlr3db.sqlite_dir", ":temp:"), primary_key = NULL, keep_rownames = FALSE, ...) { # nolint
33 |   # build an in-memory backend first, then materialize it as an SQLite file
34 |   b = as_data_backend(data, primary_key = primary_key, keep_rownames = keep_rownames)
35 |   as_sqlite_backend.DataBackend(b, path = path, ...)
36 | }
36 |
37 | #' @export
38 | as_sqlite_backend.DataBackend = function(data, path = getOption("mlr3db.sqlite_dir", ":temp:"), ...) { # nolint
39 |   path = get_db_path(path, data$hash, "sqlite")  # db file is named after the backend's hash; existing files are reused
40 |   primary_key = data$primary_key
41 | 
42 |   if (!file.exists(path)) {
43 |     on.exit({
44 |       if (file.exists(path)) file.remove(path)  # drop a partially written db on error
45 |     })
46 | 
47 |     con = DBI::dbConnect(RSQLite::SQLite(), dbname = path, flags = RSQLite::SQLITE_RWC)  # create read-write
48 |     field_types = setNames("INTEGER NOT NULL PRIMARY KEY", primary_key)  # enforce a unique integer key column
49 |     DBI::dbWriteTable(con, "data", data$head(Inf), row.names = FALSE, field.types = field_types)
50 |     DBI::dbDisconnect(con)
51 |   }
52 | 
53 |   con = DBI::dbConnect(RSQLite::SQLite(), path, flags = RSQLite::SQLITE_RO)  # re-open read-only
54 |   backend = DataBackendDplyr$new(dplyr::tbl(con, "data"), primary_key = primary_key, ...)
55 |   backend$connector = sqlite_reconnector(path)  # auto-reconnect after serialization
56 | 
57 |   on.exit()  # success: cancel the cleanup handler
58 |   return(backend)
59 | }
60 |
61 | # Returns a closure that re-opens the SQLite database at `path` read-only.
62 | sqlite_reconnector = function(path) {
63 |   force(path)  # capture by value before the closure escapes
64 |   function() DBI::dbConnect(RSQLite::SQLite(), path, flags = RSQLite::SQLITE_RO)
65 | }
67 |
--------------------------------------------------------------------------------
/R/helper.R:
--------------------------------------------------------------------------------
1 | # Re-encodes (by reference) those columns of `tab` that have a recorded level
2 | # set in `levels` as factors with exactly those levels; returns the table.
3 | recode = function(tab, levels) {
4 |   for (col in intersect(names(tab), names(levels))) {
5 |     set(tab, i = NULL, j = col, value = factor(tab[[col]], levels = levels[[col]]))
6 |   }
7 | 
8 |   tab[]
9 | }
10 |
11 | # Builds the full path "<parent>/<sanitized hash>.<extension>" for a database
12 | # file, creating the parent directory if necessary. The magic values ":temp:"
13 | # and ":user:" map to tempdir() and the package's user cache directory.
14 | get_db_path = function(path, hash, extension) {
15 |   assert_string(path)
16 |   parent = switch(path,
17 |     ":temp:" = tempdir(),
18 |     ":user:" = R_user_dir("mlr3db", "cache"),
19 |     path
20 |   )
21 |   if (!dir.exists(parent)) {
22 |     dir.create(parent, recursive = TRUE)
23 |   }
24 | 
25 |   filename = sprintf("%s.%s", gsub("[^[:alnum:]._-]", "_", hash), extension)
26 |   file.path(parent, filename)
27 | }
27 |
--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | #' @import data.table
2 | #' @import checkmate
3 | #' @importFrom stats setNames
4 | #' @importFrom utils head
5 | #' @importFrom R6 R6Class
6 | #' @importFrom mlr3misc map_lgl calculate_hash
7 | #' @section Options:
8 | #' * `mlr3db.sqlite_dir`: Default directory to store SQLite databases constructed
9 | #'   with [as_sqlite_backend()].
10 | #' * `mlr3db.duckdb_dir`: Default directory to store DuckDB databases constructed
11 | #'   with [as_duckdb_backend()].
12 | "_PACKAGE"
13 |
14 | .onLoad = function(libname, pkgname) { # nolint
15 |   # nocov start
16 |   backports::import(pkgname)  # backfill base functions missing in older R versions
17 |   backports::import(pkgname, "R_user_dir", force = TRUE)  # R_user_dir is used by get_db_path() for ":user:" paths
18 | } # nocov end
19 |
20 | mlr3misc::leanify_package()
21 | utils::globalVariables(".", "mlr3db", add = TRUE)
22 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 |
5 | ```{r, include = FALSE}
6 | knitr::opts_chunk$set(
7 | collapse = TRUE,
8 | comment = "#>",
9 | fig.path = "man/figures/README-",
10 | out.width = "100%"
11 | )
12 | lgr::get_logger("mlr3")$set_threshold("warn")
13 | ```
14 |
15 | # mlr3db
16 |
17 |
18 | [](https://github.com/mlr-org/mlr3db/actions/workflows/r-cmd-check.yml)
19 | [](https://cran.r-project.org/package=mlr3db)
20 | [](https://stackoverflow.com/questions/tagged/mlr3)
21 | [](https://lmmisld-lmu-stats-slds.srv.mwn.de/mlr_invite/)
22 |
23 |
24 | Package website: [release](https://mlr3db.mlr-org.com/) | [dev](https://mlr3db.mlr-org.com/dev/)
25 |
26 | Extends the [mlr3](https://mlr3.mlr-org.com/) package with a DataBackend to transparently work with databases.
27 | Two additional backends are currently implemented:
28 |
29 | * `DataBackendDplyr`: Relies internally on the abstraction of [dplyr](https://dplyr.tidyverse.org/) and [dbplyr](https://dbplyr.tidyverse.org/).
30 | This allows working on a broad range of DBMS, such as SQLite, MySQL, MariaDB, or PostgreSQL.
31 | * `DataBackendDuckDB`: Connector to [duckdb](https://cran.r-project.org/package=duckdb).
32 | This includes support for Parquet files (see example below).
33 |
34 | To construct the backends, you have to establish a connection to the DBMS yourself with the [DBI](https://cran.r-project.org/package=DBI) package.
35 | For the serverless SQLite and DuckDB, we provide the converters `as_sqlite_backend()` and `as_duckdb_backend()`.
36 |
37 |
38 | ## Installation
39 |
40 | You can install the released version of mlr3db from [CRAN](https://CRAN.R-project.org) with:
41 |
42 | ```{r, eval = FALSE}
43 | install.packages("mlr3db")
44 | ```
45 |
46 | And the development version from [GitHub](https://github.com/) with:
47 |
48 | ```{r, eval = FALSE}
49 | # install.packages("devtools")
50 | devtools::install_github("mlr-org/mlr3db")
51 | ```
52 |
53 | ## Example
54 |
55 | ### DataBackendDplyr
56 |
57 | ```{r}
58 | library("mlr3db")
59 |
60 | # Create a classification task:
61 | task = tsk("spam")
62 |
63 | # Convert the task backend from an in-memory backend (DataBackendDataTable)
64 | # to an out-of-memory SQLite backend via DataBackendDplyr.
65 | # A temporary directory is used here to store the database files.
66 | task$backend = as_sqlite_backend(task$backend, path = tempfile())
67 |
68 | # Resample a classification tree using a 3-fold CV.
69 | # The requested data will be queried and fetched from the database in the background.
70 | resample(task, lrn("classif.rpart"), rsmp("cv", folds = 3))
71 | ```
72 |
73 | ### DataBackendDuckDB
74 |
75 | ```{r}
76 | library("mlr3db")
77 |
78 | # Get an example parquet file from the package install directory:
79 | # spam dataset (tsk("spam")) stored as parquet file
80 | file = system.file(file.path("extdata", "spam.parquet"), package = "mlr3db")
81 |
82 | # Create a backend on the file
83 | backend = as_duckdb_backend(file)
84 |
85 | # Construct classification task on the constructed backend
86 | task = as_task_classif(backend, target = "type")
87 |
88 | # Resample a classification tree using a 3-fold CV.
89 | # The requested data will be queried and fetched from the database in the background.
90 | resample(task, lrn("classif.rpart"), rsmp("cv", folds = 3))
91 | ```
92 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # mlr3db
3 |
4 |
5 |
6 | [](https://github.com/mlr-org/mlr3db/actions/workflows/r-cmd-check.yml)
7 | [](https://cran.r-project.org/package=mlr3db)
9 | [](https://stackoverflow.com/questions/tagged/mlr3)
10 | [](https://lmmisld-lmu-stats-slds.srv.mwn.de/mlr_invite/)
11 |
12 |
13 | Package website: [release](https://mlr3db.mlr-org.com/) \|
14 | [dev](https://mlr3db.mlr-org.com/dev/)
15 |
16 | Extends the [mlr3](https://mlr3.mlr-org.com/) package with a DataBackend
17 | to transparently work with databases. Two additional backends are
18 | currently implemented:
19 |
20 | - `DataBackendDplyr`: Relies internally on the abstraction of
21 | [dplyr](https://dplyr.tidyverse.org/) and
22 | [dbplyr](https://dbplyr.tidyverse.org/). This allows working on a
23 | broad range of DBMS, such as SQLite, MySQL, MariaDB, or PostgreSQL.
24 | - `DataBackendDuckDB`: Connector to
25 | [duckdb](https://cran.r-project.org/package=duckdb). This includes
26 | support for Parquet files (see example below).
27 |
28 | To construct the backends, you have to establish a connection to the
29 | DBMS yourself with the [DBI](https://cran.r-project.org/package=DBI)
30 | package. For the serverless SQLite and DuckDB, we provide the converters
31 | `as_sqlite_backend()` and `as_duckdb_backend()`.
32 |
33 | ## Installation
34 |
35 | You can install the released version of mlr3db from
36 | [CRAN](https://CRAN.R-project.org) with:
37 |
38 | ``` r
39 | install.packages("mlr3db")
40 | ```
41 |
42 | And the development version from [GitHub](https://github.com/) with:
43 |
44 | ``` r
45 | # install.packages("devtools")
46 | devtools::install_github("mlr-org/mlr3db")
47 | ```
48 |
49 | ## Example
50 |
51 | ### DataBackendDplyr
52 |
53 | ``` r
54 | library("mlr3db")
55 | #> Loading required package: mlr3
56 |
57 | # Create a classification task:
58 | task = tsk("spam")
59 |
60 | # Convert the task backend from an in-memory backend (DataBackendDataTable)
61 | # to an out-of-memory SQLite backend via DataBackendDplyr.
62 | # A temporary directory is used here to store the database files.
63 | task$backend = as_sqlite_backend(task$backend, path = tempfile())
64 |
65 | # Resample a classification tree using a 3-fold CV.
66 | # The requested data will be queried and fetched from the database in the background.
67 | resample(task, lrn("classif.rpart"), rsmp("cv", folds = 3))
68 | #> of 3 iterations
69 | #> * Task: spam
70 | #> * Learner: classif.rpart
71 | #> * Warnings: 0 in 0 iterations
72 | #> * Errors: 0 in 0 iterations
73 | ```
74 |
75 | ### DataBackendDuckDB
76 |
77 | ``` r
78 | library("mlr3db")
79 |
80 | # Get an example parquet file from the package install directory:
81 | # spam dataset (tsk("spam")) stored as parquet file
82 | file = system.file(file.path("extdata", "spam.parquet"), package = "mlr3db")
83 |
84 | # Create a backend on the file
85 | backend = as_duckdb_backend(file)
86 |
87 | # Construct classification task on the constructed backend
88 | task = as_task_classif(backend, target = "type")
89 |
90 | # Resample a classification tree using a 3-fold CV.
91 | # The requested data will be queried and fetched from the database in the background.
92 | resample(task, lrn("classif.rpart"), rsmp("cv", folds = 3))
93 | #> of 3 iterations
94 | #> * Task: backend
95 | #> * Learner: classif.rpart
96 | #> * Warnings: 0 in 0 iterations
97 | #> * Errors: 0 in 0 iterations
98 | ```
99 |
--------------------------------------------------------------------------------
/inst/extdata/spam.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlr-org/mlr3db/b0b90a9968d5f86fc07629367c3a7ed1f8c00e5f/inst/extdata/spam.parquet
--------------------------------------------------------------------------------
/inst/extdata/userdata1.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlr-org/mlr3db/b0b90a9968d5f86fc07629367c3a7ed1f8c00e5f/inst/extdata/userdata1.parquet
--------------------------------------------------------------------------------
/inst/extdata/userdata2.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlr-org/mlr3db/b0b90a9968d5f86fc07629367c3a7ed1f8c00e5f/inst/extdata/userdata2.parquet
--------------------------------------------------------------------------------
/inst/extdata/userdata3.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlr-org/mlr3db/b0b90a9968d5f86fc07629367c3a7ed1f8c00e5f/inst/extdata/userdata3.parquet
--------------------------------------------------------------------------------
/inst/extdata/userdata4.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlr-org/mlr3db/b0b90a9968d5f86fc07629367c3a7ed1f8c00e5f/inst/extdata/userdata4.parquet
--------------------------------------------------------------------------------
/inst/extdata/userdata5.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlr-org/mlr3db/b0b90a9968d5f86fc07629367c3a7ed1f8c00e5f/inst/extdata/userdata5.parquet
--------------------------------------------------------------------------------
/man-roxygen/field_connector.R:
--------------------------------------------------------------------------------
1 | #' @field connector (`function()`)\cr
2 | #' Function which is called to re-connect in case the connection became invalid.
3 |
--------------------------------------------------------------------------------
/man-roxygen/field_levels.R:
--------------------------------------------------------------------------------
1 | #' @field levels (named `list()`)\cr
2 | #' List (named with column names) of factor levels as `character()`.
3 | #' Used to auto-convert character columns to factor variables.
4 |
--------------------------------------------------------------------------------
/man-roxygen/param_connector.R:
--------------------------------------------------------------------------------
1 | #' @param connector (`function()`)\cr
2 | #' If not `NULL`, a function which re-connects to the database in case the connection has become invalid.
3 | #' Database connections can become invalid due to timeouts or if the backend is serialized
4 | #' to the file system and then de-serialized again.
5 | #' This round trip is often performed for parallelization, e.g. to send the objects to remote workers.
6 | #' [DBI::dbIsValid()] is called to validate the connection.
7 | #' The function must return just the connection, not a [dplyr::tbl()] object!
8 | #' Note that this function is serialized together with the backend, including
9 | #' possible sensitive information such as login credentials.
10 | #' These can be retrieved from the stored [mlr3::DataBackend]/[mlr3::Task].
11 | #' To protect your credentials, it is recommended to use the \CRANpkg{secret} package.
12 |
--------------------------------------------------------------------------------
/man-roxygen/param_path.R:
--------------------------------------------------------------------------------
1 | #' @param path (`character(1)`)\cr
2 | #' Path for the DuckDB databases.
3 | #' Either a valid path to a directory which will be created if it does not exist, or one of the special strings:
4 | #'
5 | #' * `":temp:"` (default): Temporary directory of the R session is used, see [tempdir()].
6 | #' Note that this directory will be removed during the shutdown of the R session.
7 | #' Also note that this usually does not work for parallelization on remote workers.
8 | #'   Set to a custom path or use the special string `":user:"` instead.
9 | #' * `":user:"`: User cache directory as returned by [R_user_dir()] is used.
10 | #'
11 | #'
12 | #' The default for this argument can be configured via option `"mlr3db.sqlite_dir"` or `"mlr3db.duckdb_dir"`,
13 | #' respectively. The database files will use the hash of the [DataBackend] as filename with
14 | #' file extension `".duckdb"` or `".sqlite"`.
15 | #' If the database already exists on the file system, the converters will just establish a new read-only
16 | #' connection.
17 |
--------------------------------------------------------------------------------
/man-roxygen/param_primary_key.R:
--------------------------------------------------------------------------------
1 | #' @param primary_key (`character(1)`)\cr
2 | #' Name of the primary key column.
3 |
--------------------------------------------------------------------------------
/man-roxygen/param_strings_as_factors.R:
--------------------------------------------------------------------------------
1 | #' @param strings_as_factors (`logical(1)` || `character()`)\cr
2 | #' Either a character vector of column names to convert to factors, or a single logical flag:
3 | #' if `FALSE`, no column will be converted; if `TRUE`, all string columns (except the primary key) are converted.
4 | #' For conversion, the backend is queried for distinct values of the respective columns
5 | #' on construction and their levels are stored in `$levels`.
6 |
--------------------------------------------------------------------------------
/man/DataBackendDplyr.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/DataBackendDplyr.R
3 | \name{DataBackendDplyr}
4 | \alias{DataBackendDplyr}
5 | \title{DataBackend for dplyr/dbplyr}
6 | \description{
7 | A \link[mlr3:DataBackend]{mlr3::DataBackend} using \code{\link[dplyr:tbl]{dplyr::tbl()}} from packages \CRANpkg{dplyr}/\CRANpkg{dbplyr}.
8 | This includes \code{\link[tibble:tibble]{tibbles}} and abstract database connections interfaced by \CRANpkg{dbplyr}.
9 | The latter allows \link[mlr3:Task]{mlr3::Task}s to interface an out-of-memory database.
10 | }
11 | \examples{
12 | if (mlr3misc::require_namespaces(c("tibble", "RSQLite", "dbplyr"), quietly = TRUE)) {
13 | # Backend using a in-memory tibble
14 | data = tibble::as_tibble(iris)
15 | data$Sepal.Length[1:30] = NA
16 | data$row_id = 1:150
17 | b = DataBackendDplyr$new(data, primary_key = "row_id")
18 |
19 | # Object supports all accessors of DataBackend
20 | print(b)
21 | b$nrow
22 | b$ncol
23 | b$colnames
24 | b$data(rows = 100:101, cols = "Species")
25 | b$distinct(b$rownames, "Species")
26 |
27 | # Classification task using this backend
28 | task = mlr3::TaskClassif$new(id = "iris_tibble", backend = b, target = "Species")
29 | print(task)
30 | head(task)
31 |
32 | # Create a temporary SQLite database
33 | con = DBI::dbConnect(RSQLite::SQLite(), ":memory:")
34 | dplyr::copy_to(con, data)
35 | tbl = dplyr::tbl(con, "data")
36 |
37 | # Define a backend on a subset of the database: do not use column "Sepal.Width"
38 | tbl = dplyr::select_at(tbl, setdiff(colnames(tbl), "Sepal.Width"))
39 | tbl = dplyr::filter(tbl, row_id \%in\% 1:120) # Use only first 120 rows
40 | b = DataBackendDplyr$new(tbl, primary_key = "row_id")
41 | print(b)
42 |
43 | # Query distinct values
44 | b$distinct(b$rownames, "Species")
45 |
46 | # Query number of missing values
47 | b$missings(b$rownames, b$colnames)
48 |
49 | # Note that SQLite does not support factors, column Species has been converted to character
50 | lapply(b$head(), class)
51 |
52 | # Cleanup
53 | rm(tbl)
54 | DBI::dbDisconnect(con)
55 | }
56 | }
57 | \section{Super class}{
58 | \code{\link[mlr3:DataBackend]{mlr3::DataBackend}} -> \code{DataBackendDplyr}
59 | }
60 | \section{Public fields}{
61 | \if{html}{\out{
}}
62 | \describe{
63 | \item{\code{levels}}{(named \code{list()})\cr
64 | List (named with column names) of factor levels as \code{character()}.
65 | Used to auto-convert character columns to factor variables.}
66 |
67 | \item{\code{connector}}{(\verb{function()})\cr
68 | Function which is called to re-connect in case the connection became invalid.}
69 | }
70 | \if{html}{\out{
}}
74 | \describe{
75 | \item{\code{rownames}}{(\code{integer()})\cr
76 | Returns vector of all distinct row identifiers, i.e. the contents of the primary key column.}
77 |
78 | \item{\code{colnames}}{(\code{character()})\cr
79 | Returns vector of all column names, including the primary key column.}
80 |
81 | \item{\code{nrow}}{(\code{integer(1)})\cr
82 | Number of rows (observations).}
83 |
84 | \item{\code{ncol}}{(\code{integer(1)})\cr
85 | Number of columns (variables), including the primary key column.}
86 |
87 | \item{\code{valid}}{(\code{logical(1)})\cr
88 | Returns \code{NA} if the data does not inherit from \code{"tbl_sql"} (i.e., it is not a real SQL data base).
89 | Returns the result of \code{\link[DBI:dbIsValid]{DBI::dbIsValid()}} otherwise.}
90 | }
91 | \if{html}{\out{
}}
128 | \describe{
129 | \item{\code{data}}{(\code{\link[dplyr:tbl]{dplyr::tbl()}})\cr
130 | The data object.
131 |
132 | Instead of calling the constructor yourself, you can call \code{\link[mlr3:as_data_backend]{mlr3::as_data_backend()}}
133 | on a \code{\link[dplyr:tbl]{dplyr::tbl()}}.
134 | Note that only objects of class \code{"tbl_lazy"} will be converted to a \link{DataBackendDplyr}
135 | (this includes all connectors from \CRANpkg{dbplyr}).
136 | Local \code{"tbl"} objects such as \code{\link[tibble:tibble]{tibbles}} will converted to a
137 | \link[mlr3:DataBackendDataTable]{DataBackendDataTable}.}
138 |
139 | \item{\code{primary_key}}{(\code{character(1)})\cr
140 | Name of the primary key column.}
141 |
142 | \item{\code{strings_as_factors}}{(\code{logical(1)} || \code{character()})\cr
143 | Either a character vector of column names to convert to factors, or a single logical flag:
144 | if \code{FALSE}, no column will be converted, if \code{TRUE} all string columns (except the primary key).
145 | For conversion, the backend is queried for distinct values of the respective columns
146 | on construction and their levels are stored in \verb{$levels}.}
147 |
148 | \item{\code{connector}}{(\verb{function()})\cr
149 | If not \code{NULL}, a function which re-connects to the database in case the connection has become invalid.
150 | Database connections can become invalid due to timeouts or if the backend is serialized
151 | to the file system and then de-serialized again.
152 | This round trip is often performed for parallelization, e.g. to send the objects to remote workers.
153 | \code{\link[DBI:dbIsValid]{DBI::dbIsValid()}} is called to validate the connection.
154 | The function must return just the connection, not a \code{\link[dplyr:tbl]{dplyr::tbl()}} object!
155 | Note that this function is serialized together with the backend, including possible sensitive information such as login credentials.
156 | These can be retrieved from the stored \link[mlr3:DataBackend]{mlr3::DataBackend}/\link[mlr3:Task]{mlr3::Task}.
157 | To protect your credentials, it is recommended to use the \CRANpkg{secret} package.}
158 | }
159 | \if{html}{\out{
}}
160 | }
161 | }
162 | \if{html}{\out{}}
163 | \if{html}{\out{}}
164 | \if{latex}{\out{\hypertarget{method-DataBackendDplyr-finalize}{}}}
165 | \subsection{Method \code{finalize()}}{
166 | Finalizer which disconnects from the database.
167 | This is called during garbage collection of the instance.
168 | \subsection{Usage}{
169 | \if{html}{\out{
}}
170 | }
171 |
172 | \subsection{Returns}{
173 | \code{logical(1)}, the return value of \code{\link[DBI:dbDisconnect]{DBI::dbDisconnect()}}.
174 | }
175 | }
176 | \if{html}{\out{}}
177 | \if{html}{\out{}}
178 | \if{latex}{\out{\hypertarget{method-DataBackendDplyr-data}{}}}
179 | \subsection{Method \code{data()}}{
180 | Returns a slice of the data.
181 | Calls \code{\link[dplyr:filter]{dplyr::filter()}} and \code{\link[dplyr:select]{dplyr::select()}} on the table and converts it to a \code{\link[data.table:data.table]{data.table::data.table()}}.
182 |
183 | The rows must be addressed as vector of primary key values, columns must be referred to via column names.
184 | Queries for rows with no matching row id and queries for columns with no matching
185 | column name are silently ignored.
186 | Rows are guaranteed to be returned in the same order as \code{rows}, columns may be returned in an arbitrary order.
187 | Duplicated row ids result in duplicated rows, duplicated column names lead to an exception.
188 | \subsection{Usage}{
189 | \if{html}{\out{
}}
218 | \describe{
219 | \item{\code{n}}{(\code{integer(1)})\cr
220 | Number of rows.}
221 | }
222 | \if{html}{\out{
}}
223 | }
224 | \subsection{Returns}{
225 | \code{\link[data.table:data.table]{data.table::data.table()}} of the first \code{n} rows.
226 | }
227 | }
228 | \if{html}{\out{}}
229 | \if{html}{\out{}}
230 | \if{latex}{\out{\hypertarget{method-DataBackendDplyr-distinct}{}}}
231 | \subsection{Method \code{distinct()}}{
232 | Returns a named list of vectors of distinct values for each column
233 | specified. If \code{na_rm} is \code{TRUE}, missing values are removed from the
234 | returned vectors of distinct values. Non-existing rows and columns are
235 | silently ignored.
236 | \subsection{Usage}{
237 | \if{html}{\out{
}}
278 | }
279 | \subsection{Returns}{
280 | Total of missing values per column (named \code{numeric()}).
281 | }
282 | }
283 | }
284 |
--------------------------------------------------------------------------------
/man/DataBackendDuckDB.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/DataBackendDuckDB.R
3 | \name{DataBackendDuckDB}
4 | \alias{DataBackendDuckDB}
5 | \title{DataBackend for DuckDB}
6 | \description{
7 | A \link[mlr3:DataBackend]{mlr3::DataBackend} for \CRANpkg{duckdb}.
8 | Can be easily constructed with \code{\link[=as_duckdb_backend]{as_duckdb_backend()}}.
9 | }
10 | \seealso{
11 | \url{https://duckdb.org/}
12 | }
13 | \section{Super class}{
14 | \code{\link[mlr3:DataBackend]{mlr3::DataBackend}} -> \code{DataBackendDuckDB}
15 | }
16 | \section{Public fields}{
17 | \if{html}{\out{
}}
18 | \describe{
19 | \item{\code{levels}}{(named \code{list()})\cr
20 | List (named with column names) of factor levels as \code{character()}.
21 | Used to auto-convert character columns to factor variables.}
22 |
23 | \item{\code{connector}}{(\verb{function()})\cr
24 | Function which is called to re-connect in case the connection became invalid.}
25 |
26 | \item{\code{table}}{(\code{character(1)})\cr
27 | Data base table or view to operate on.}
28 | }
29 | \if{html}{\out{
}}
33 | \describe{
34 | \item{\code{table_info}}{(\code{data.frame()})\cr
35 | Data frame as returned by pragma \code{table_info()}.}
36 |
37 | \item{\code{rownames}}{(\code{integer()})\cr
38 | Returns vector of all distinct row identifiers, i.e. the contents of the primary key column.}
39 |
40 | \item{\code{colnames}}{(\code{character()})\cr
41 | Returns vector of all column names, including the primary key column.}
42 |
43 | \item{\code{nrow}}{(\code{integer(1)})\cr
44 | Number of rows (observations).}
45 |
46 | \item{\code{ncol}}{(\code{integer(1)})\cr
47 | Number of columns (variables), including the primary key column.}
48 |
49 | \item{\code{valid}}{(\code{logical(1)})\cr
50 | Returns \code{NA} if the data does not inherit from \code{"tbl_sql"} (i.e., it is not a real SQL data base).
51 | Returns the result of \code{\link[DBI:dbIsValid]{DBI::dbIsValid()}} otherwise.}
52 | }
53 | \if{html}{\out{
}}
91 | \describe{
92 | \item{\code{data}}{(connection)\cr
93 | A connection created with \code{\link[DBI:dbConnect]{DBI::dbConnect()}}.
94 | If constructed manually (and not via the helper function \code{\link[=as_duckdb_backend]{as_duckdb_backend()}},
95 | make sure that there exists an (unique) index for the key column.}
96 |
97 | \item{\code{table}}{(\code{character(1)})\cr
98 | Table or view to operate on.}
99 |
100 | \item{\code{primary_key}}{(\code{character(1)})\cr
101 | Name of the primary key column.}
102 |
103 | \item{\code{strings_as_factors}}{(\code{logical(1)} || \code{character()})\cr
104 | Either a character vector of column names to convert to factors, or a single logical flag:
105 | if \code{FALSE}, no column will be converted, if \code{TRUE} all string columns (except the primary key).
106 | For conversion, the backend is queried for distinct values of the respective columns
107 | on construction and their levels are stored in \verb{$levels}.}
108 |
109 | \item{\code{connector}}{(\verb{function()})\cr
110 | If not \code{NULL}, a function which re-connects to the database in case the connection has become invalid.
111 | Database connections can become invalid due to timeouts or if the backend is serialized
112 | to the file system and then de-serialized again.
113 | This round trip is often performed for parallelization, e.g. to send the objects to remote workers.
114 | \code{\link[DBI:dbIsValid]{DBI::dbIsValid()}} is called to validate the connection.
115 | The function must return just the connection, not a \code{\link[dplyr:tbl]{dplyr::tbl()}} object!
116 | Note that this function is serialized together with the backend, including possible sensitive information such as login credentials.
117 | These can be retrieved from the stored \link[mlr3:DataBackend]{mlr3::DataBackend}/\link[mlr3:Task]{mlr3::Task}.
118 | To protect your credentials, it is recommended to use the \CRANpkg{secret} package.}
119 | }
120 | \if{html}{\out{
}}
121 | }
122 | }
123 | \if{html}{\out{}}
124 | \if{html}{\out{}}
125 | \if{latex}{\out{\hypertarget{method-DataBackendDuckDB-finalize}{}}}
126 | \subsection{Method \code{finalize()}}{
127 | Finalizer which disconnects from the database.
128 | This is called during garbage collection of the instance.
129 | \subsection{Usage}{
130 | \if{html}{\out{
}}
131 | }
132 |
133 | \subsection{Returns}{
134 | \code{logical(1)}, the return value of \code{\link[DBI:dbDisconnect]{DBI::dbDisconnect()}}.
135 | }
136 | }
137 | \if{html}{\out{}}
138 | \if{html}{\out{}}
139 | \if{latex}{\out{\hypertarget{method-DataBackendDuckDB-data}{}}}
140 | \subsection{Method \code{data()}}{
141 | Returns a slice of the data.
142 |
143 | The rows must be addressed as vector of primary key values, columns must be referred to via column names.
144 | Queries for rows with no matching row id and queries for columns with no matching
145 | column name are silently ignored.
146 | Rows are guaranteed to be returned in the same order as \code{rows}, columns may be returned in an arbitrary order.
147 | Duplicated row ids result in duplicated rows, duplicated column names lead to an exception.
148 | \subsection{Usage}{
149 | \if{html}{\out{
}}
178 | \describe{
179 | \item{\code{n}}{(\code{integer(1)})\cr
180 | Number of rows.}
181 | }
182 | \if{html}{\out{
}}
183 | }
184 | \subsection{Returns}{
185 | \code{\link[data.table:data.table]{data.table::data.table()}} of the first \code{n} rows.
186 | }
187 | }
188 | \if{html}{\out{}}
189 | \if{html}{\out{}}
190 | \if{latex}{\out{\hypertarget{method-DataBackendDuckDB-distinct}{}}}
191 | \subsection{Method \code{distinct()}}{
192 | Returns a named list of vectors of distinct values for each column
193 | specified. If \code{na_rm} is \code{TRUE}, missing values are removed from the
194 | returned vectors of distinct values. Non-existing rows and columns are
195 | silently ignored.
196 | \subsection{Usage}{
197 | \if{html}{\out{
}}
238 | }
239 | \subsection{Returns}{
240 | Total of missing values per column (named \code{numeric()}).
241 | }
242 | }
243 | }
244 |
--------------------------------------------------------------------------------
/man/DataBackendPolars.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/DataBackendPolars.R
3 | \name{DataBackendPolars}
4 | \alias{DataBackendPolars}
5 | \title{DataBackend for Polars}
6 | \description{
7 | A \link[mlr3:DataBackend]{mlr3::DataBackend} using \code{RPolarsLazyFrame} from package \CRANpkg{polars}.
8 | Can be easily constructed with \code{\link[=as_polars_backend]{as_polars_backend()}}.
9 | \link[mlr3:Task]{mlr3::Task}s can interface out-of-memory files if the \code{polars::RPolarsLazyFrame} was imported using a \code{polars::scan_x} function.
10 | Streaming, a \CRANpkg{polars} alpha feature, is always enabled, but only used when applicable.
11 | A connector is not required but can be useful, e.g. for scanning larger-than-memory files.
12 | }
13 | \examples{
14 | if (mlr3misc::require_namespaces("polars", quietly = TRUE)) {
15 | # Backend using a in-memory data set
16 | data = iris
17 | data$Sepal.Length[1:30] = NA
18 | data$row_id = 1:150
19 | data = polars::as_polars_lf(data)
20 | b = DataBackendPolars$new(data, primary_key = "row_id")
21 |
22 | # Object supports all accessors of DataBackend
23 | print(b)
24 | b$nrow
25 | b$ncol
26 | b$colnames
27 | b$data(rows = 100:101, cols = "Species")
28 | b$distinct(b$rownames, "Species")
29 |
30 | # Classification task using this backend
31 | task = mlr3::TaskClassif$new(id = "iris_polars", backend = b, target = "Species")
32 | print(task)
33 | head(task)
34 |
35 | # Write a parquet file to scan
36 | data$collect()$write_parquet("iris.parquet")
37 | data = polars::pl$scan_parquet("iris.parquet")
38 |
39 | # Backend that re-reads the parquet file if the connection fails
40 | b = DataBackendPolars$new(data, "row_id",
41 | connector = function() polars::pl$scan_parquet("iris.parquet"))
42 | print(b)
43 |
44 | # Define a backend on a subset of the database: do not use column "Sepal.Width"
45 | data = data$select(
46 | polars::pl$col(setdiff(colnames(data), "Sepal.Width"))
47 | )$filter(
48 | polars::pl$col("row_id")$is_in(1:120) # Use only first 120 rows
49 | )
50 |
51 | # Backend with only scanned data
52 | b = DataBackendPolars$new(data, "row_id", strings_as_factors = TRUE)
53 | print(b)
54 |
55 | # Query distinct values
56 | b$distinct(b$rownames, "Species")
57 |
58 | # Query number of missing values
59 | b$missings(b$rownames, b$colnames)
60 |
61 | # Cleanup
62 | if (file.exists("iris.parquet")) {
63 | file.remove("iris.parquet")
64 | }
65 | }
66 | }
67 | \seealso{
68 | \url{https://pola-rs.github.io/r-polars/}
69 | }
70 | \section{Super class}{
71 | \code{\link[mlr3:DataBackend]{mlr3::DataBackend}} -> \code{DataBackendPolars}
72 | }
73 | \section{Public fields}{
74 | \if{html}{\out{
}}
75 | \describe{
76 | \item{\code{levels}}{(named \code{list()})\cr
77 | List (named with column names) of factor levels as \code{character()}.
78 | Used to auto-convert character columns to factor variables.}
79 |
80 | \item{\code{connector}}{(\verb{function()})\cr
81 | Function which is called to re-connect in case the connection became invalid.}
82 | }
83 | \if{html}{\out{
}}
87 | \describe{
88 | \item{\code{rownames}}{(\code{integer()})\cr
89 | Returns vector of all distinct row identifiers, i.e. the contents of the primary key column.}
90 |
91 | \item{\code{colnames}}{(\code{character()})\cr
92 | Returns vector of all column names, including the primary key column.}
93 |
94 | \item{\code{nrow}}{(\code{integer(1)})\cr
95 | Number of rows (observations).}
96 |
97 | \item{\code{ncol}}{(\code{integer(1)})\cr
98 | Number of columns (variables), including the primary key column.}
99 | }
100 | \if{html}{\out{
}}
136 | \describe{
137 | \item{\code{data}}{(\link[polars:RPolarsLazyFrame]{polars::RPolarsLazyFrame})\cr
138 | The data object.
139 |
140 | Instead of calling the constructor itself, please call \code{\link[mlr3:as_data_backend]{mlr3::as_data_backend()}} on
141 | a \link[polars:RPolarsLazyFrame]{polars::RPolarsLazyFrame} or \link[polars:RPolarsDataFrame]{polars::RPolarsDataFrame}.
142 | Note that only \link[polars:RPolarsLazyFrame]{polars::RPolarsLazyFrame}s will be converted to a \link{DataBackendPolars}.
143 | \link[polars:RPolarsDataFrame]{polars::RPolarsDataFrame} objects without lazy execution will be converted to a
144 | \link[mlr3:DataBackendDataTable]{DataBackendDataTable}.}
145 |
146 | \item{\code{primary_key}}{(\code{character(1)})\cr
147 | Name of the primary key column.
148 | Because \code{polars} does not natively support primary keys, uniqueness of the primary key column is expected but not enforced.}
149 |
150 | \item{\code{strings_as_factors}}{(\code{logical(1)} || \code{character()})\cr
151 | Either a character vector of column names to convert to factors, or a single logical flag:
152 | if \code{FALSE}, no columns are converted; if \code{TRUE}, all string columns (except the primary key) are.
153 | For conversion, the backend is queried for distinct values of the respective columns
154 | on construction and their levels are stored in \verb{$levels}.}
155 |
156 | \item{\code{connector}}{(\verb{function()})\cr
157 | Optional function which is called to re-connect to e.g. a source file in case the connection became invalid.}
158 | }
159 | \if{html}{\out{
}}
160 | }
161 | }
162 | \if{html}{\out{}}
163 | \if{html}{\out{}}
164 | \if{latex}{\out{\hypertarget{method-DataBackendPolars-data}{}}}
165 | \subsection{Method \code{data()}}{
166 | Returns a slice of the data.
167 |
168 | The rows must be addressed as vector of primary key values, columns must be referred to via column names.
169 | Queries for rows with no matching row id and queries for columns with no matching
170 | column name are silently ignored.
171 | \subsection{Usage}{
172 | \if{html}{\out{
}}
198 | \describe{
199 | \item{\code{n}}{(\code{integer(1)})\cr
200 | Number of rows.}
201 | }
202 | \if{html}{\out{
}}
203 | }
204 | \subsection{Returns}{
205 | \code{\link[data.table:data.table]{data.table::data.table()}} of the first \code{n} rows.
206 | }
207 | }
208 | \if{html}{\out{}}
209 | \if{html}{\out{}}
210 | \if{latex}{\out{\hypertarget{method-DataBackendPolars-distinct}{}}}
211 | \subsection{Method \code{distinct()}}{
212 | Returns a named list of vectors of distinct values for each column
213 | specified. If \code{na_rm} is \code{TRUE}, missing values are removed from the
214 | returned vectors of distinct values. Non-existing rows and columns are
215 | silently ignored.
216 | \subsection{Usage}{
217 | \if{html}{\out{
}}
258 | }
259 | \subsection{Returns}{
260 | Total of missing values per column (named \code{numeric()}).
261 | }
262 | }
263 | }
264 |
--------------------------------------------------------------------------------
/man/as_duckdb_backend.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/as_duckdb_backend.R
3 | \name{as_duckdb_backend}
4 | \alias{as_duckdb_backend}
5 | \title{Convert to DuckDB Backend}
6 | \usage{
7 | as_duckdb_backend(data, path = getOption("mlr3db.duckdb_dir", ":temp:"), ...)
8 | }
9 | \arguments{
10 | \item{data}{(\code{data.frame()} | \link[mlr3:DataBackend]{mlr3::DataBackend})\cr
11 | See description.}
12 |
13 | \item{path}{(\code{character(1)})\cr
14 | Path for the DuckDB databases.
15 | Either a valid path to a directory which will be created if it does not exist, or one of the special strings:
16 | \itemize{
17 | \item \code{":temp:"} (default): Temporary directory of the R session is used, see \code{\link[=tempdir]{tempdir()}}.
18 | Note that this directory will be removed during the shutdown of the R session.
19 | Also note that this usually does not work for parallelization on remote workers.
20 | Set to a custom path or use the special string \code{":user:"} instead.
21 | \item \code{":user:"}: User cache directory as returned by \code{\link[=R_user_dir]{R_user_dir()}} is used.
22 | }
23 |
24 | The default for this argument can be configured via option \code{"mlr3db.sqlite_dir"} or \code{"mlr3db.duckdb_dir"},
25 | respectively. The database files will use the hash of the \link{DataBackend} as filename with
26 | file extension \code{".duckdb"} or \code{".sqlite"}.
27 | If the database already exists on the file system, the converters will just establish a new read-only
28 | connection.}
29 |
30 | \item{...}{(\code{any})\cr
31 | Additional arguments, passed to \link{DataBackendDuckDB}.}
32 | }
33 | \value{
34 | \link{DataBackendDuckDB} or \link{Task}.
35 | }
36 | \description{
37 | Converts to a \link{DataBackendDuckDB} using the \CRANpkg{duckdb} database, depending on the input type:
38 | \itemize{
39 | \item \code{data.frame}: Creates a new \link{DataBackendDataTable} first using \code{\link[=as_data_backend]{as_data_backend()}}, then proceeds
40 | with the conversion from \link{DataBackendDataTable} to \link{DataBackendDuckDB}.
41 | \item \link[mlr3:DataBackend]{mlr3::DataBackend}: Creates a new DuckDB database in the specified path.
42 | The filename is determined by the hash of the \link{DataBackend}.
43 | If the file already exists, a connection to the existing database is established and the existing
44 | files are reused.
45 | }
46 |
47 | The created backend automatically reconnects to the database if the connection was lost, e.g. because
48 | the object was serialized to the filesystem and restored in a different R session.
49 | The only requirement is that the path does not change and that the path is accessible
50 | on all workers.
51 | }
52 |
--------------------------------------------------------------------------------
/man/as_polars_backend.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/as_polars_backend.R
3 | \name{as_polars_backend}
4 | \alias{as_polars_backend}
5 | \title{Convert to Polars Backend}
6 | \usage{
7 | as_polars_backend(data, streaming = FALSE, ...)
8 | }
9 | \arguments{
10 | \item{data}{(\code{data.frame()} | \link[mlr3:DataBackend]{mlr3::DataBackend})\cr
11 | See description.}
12 |
13 | \item{streaming}{(\code{logical(1)})\cr
14 | Whether the data should be only scanned (recommended for large data sets) and streamed with
15 | every \link{DataBackendPolars} operation or loaded into memory completely.}
16 |
17 | \item{...}{(\code{any})\cr
18 | Additional arguments, passed to \link{DataBackendPolars}.}
19 | }
20 | \value{
21 | \link{DataBackendPolars} or \link{Task}.
22 | }
23 | \description{
24 | Converts to a \link{DataBackendPolars} using the \CRANpkg{polars} package, depending on the input type:
25 | \itemize{
26 | \item \code{data.frame}: Creates a new \link{DataBackendDataTable} first using \code{\link[=as_data_backend]{as_data_backend()}}, then proceeds
27 | with the conversion from \link{DataBackendDataTable} to \link{DataBackendPolars}.
28 | \item \link[mlr3:DataBackend]{mlr3::DataBackend}: Creates a new \link{DataBackendPolars}.
29 | }
30 |
31 | There is no automatic connection to the origin file set.
32 | If the data is obtained using scanning and the data is streamed, a \code{connector} can be set manually but is not required.
33 | }
34 |
--------------------------------------------------------------------------------
/man/as_sqlite_backend.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/as_sqlite_backend.R
3 | \name{as_sqlite_backend}
4 | \alias{as_sqlite_backend}
5 | \title{Convert to SQLite Backend}
6 | \usage{
7 | as_sqlite_backend(data, path = getOption("mlr3db.sqlite_dir", ":temp:"), ...)
8 | }
9 | \arguments{
10 | \item{data}{(\code{data.frame()} | \link[mlr3:DataBackend]{mlr3::DataBackend})\cr
11 | See description.}
12 |
13 | \item{path}{(\code{character(1)})\cr
14 | Path for the SQLite databases.
15 | Either a valid path to a directory which will be created if it does not exist, or one of the special strings:
16 | \itemize{
17 | \item \code{":temp:"} (default): Temporary directory of the R session is used, see \code{\link[=tempdir]{tempdir()}}.
18 | Note that this directory will be removed during the shutdown of the R session.
19 | Also note that this usually does not work for parallelization on remote workers.
20 | Set to a custom path or use the special string \code{":user:"} instead.
21 | \item \code{":user:"}: User cache directory as returned by \code{\link[=R_user_dir]{R_user_dir()}} is used.
22 | }
23 |
24 | The default for this argument can be configured via option \code{"mlr3db.sqlite_dir"} or \code{"mlr3db.duckdb_dir"},
25 | respectively. The database files will use the hash of the \link{DataBackend} as filename with
26 | file extension \code{".duckdb"} or \code{".sqlite"}.
27 | If the database already exists on the file system, the converters will just establish a new read-only
28 | connection.}
29 |
30 | \item{...}{(\code{any})\cr
31 | Additional arguments, passed to \link{DataBackendDplyr}.}
32 | }
33 | \value{
34 | \link{DataBackendDplyr} or \link{Task}.
35 | }
36 | \description{
37 | Converts to a \link{DataBackendDplyr} using an \CRANpkg{RSQLite} database, depending on the input type:
38 | \itemize{
39 | \item \code{data.frame}: Creates a new \link{DataBackendDataTable} first using \code{\link[=as_data_backend]{as_data_backend()}}, then proceeds
40 | with the conversion from \link{DataBackendDataTable} to \link{DataBackendDplyr}.
41 | \item \link[mlr3:DataBackend]{mlr3::DataBackend}: Creates a new SQLite database in the specified path.
42 | The filename is determined by the hash of the \link{DataBackend}.
43 | If the file already exists, a connection to the existing database is established and the existing
44 | files are reused.
45 | }
46 |
47 | The created backend automatically reconnects to the database if the connection was lost, e.g. because
48 | the object was serialized to the filesystem and restored in a different R session.
49 | The only requirement is that the path does not change and that the path is accessible
50 | on all workers.
51 | }
52 |
--------------------------------------------------------------------------------
/man/figures/logo_navbar.png:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 | mlr3/logo_navbar.png at master · mlr-org/mlr3
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
929 |
930 | You signed in with another tab or window. Reload to refresh your session.
931 | You signed out in another tab or window. Reload to refresh your session.
932 |