├── .github
│   ├── .gitignore
│   └── workflows
│       ├── check-release.yaml
│       ├── pkgdown.yaml
│       └── test-coverage.yaml
├── vignettes
│   ├── .gitignore
│   └── aa-conversions.Rmd
├── inst
│   └── extdata
│       ├── iris.fst
│       ├── iris.rds
│       ├── iris.duckdb
│       ├── iris.parquet
│       ├── iris.sqlite
│       ├── multifile.zip
│       ├── iris_dataset
│       │   ├── Species=setosa
│       │   │   └── part-0.parquet
│       │   ├── Species=versicolor
│       │   │   └── part-0.parquet
│       │   └── Species=virginica
│       │       └── part-0.parquet
│       ├── region_2022.txt
│       ├── region_2022.csv
│       ├── region_2022_with_comment.csv
│       ├── iris.ndjson
│       └── iris.json
├── man
│   ├── figures
│   │   ├── hex_parquetize.png
│   │   └── Insee_example_csv.gif
│   ├── expect_missing_argument.Rd
│   ├── parquetize_example.Rd
│   ├── expect_parquet.Rd
│   ├── get_partitions.Rd
│   ├── parquetize-package.Rd
│   ├── get_parquet_info.Rd
│   ├── check_parquet.Rd
│   ├── write_parquet_at_once.Rd
│   ├── download_extract.Rd
│   ├── rbind_parquet.Rd
│   ├── fst_to_parquet.Rd
│   ├── rds_to_parquet.Rd
│   ├── json_to_parquet.Rd
│   ├── sqlite_to_parquet.Rd
│   ├── write_parquet_by_chunk.Rd
│   ├── dbi_to_parquet.Rd
│   ├── csv_to_parquet.Rd
│   └── table_to_parquet.Rd
├── .Rbuildignore
├── .gitignore
├── data-raw
│   ├── iris-rds.R
│   ├── iris-fst.R
│   ├── iris-sqlite.R
│   ├── iris-parquet.R
│   └── region-2022.R
├── tests
│   ├── testthat
│   │   ├── test-check_parquet.R
│   │   ├── test-parquetize_example.R
│   │   ├── test-get_parquet_info.R
│   │   ├── test-rbind_parquet.R
│   │   ├── test-get_partitions.R
│   │   ├── test-fst_to_parquet.R
│   │   ├── test-rds_to_parquet.R
│   │   ├── test-write_parquet_at_once.R
│   │   ├── test-download_extract.R
│   │   ├── test-utilities.R
│   │   ├── test-json_to_parquet.R
│   │   ├── test-sqlite_to_parquet.R
│   │   ├── test-dbi_to_parquet.R
│   │   ├── test-write_parquet_by_chunk.R
│   │   ├── test-testthat-helpers.R
│   │   ├── test-csv_to_parquet.R
│   │   └── test-table_to_parquet.R
│   └── testthat.R
├── parquetize.Rproj
├── _pkgdown.yml
├── R
│   ├── package-parquetize.R
│   ├── parquetize_example.R
│   ├── get_partitions.R
│   ├── check_parquet.R
│   ├── get_parquet_info.R
│   ├── testthat-helpers.R
│   ├── rds_to_parquet.R
│   ├── fst_to_parquet.R
│   ├── download_extract.R
│   ├── write_parquet_at_once.R
│   ├── utilities.R
│   ├── json_to_parquet.R
│   ├── rbind_parquet.R
│   ├── sqlite_to_parquet.R
│   ├── write_parquet_by_chunk.R
│   ├── dbi_to_parquet.R
│   ├── csv_to_parquet.R
│   └── table_to_parquet.R
├── DESCRIPTION
├── NAMESPACE
├── dev
│   └── dev_history.R
├── CONTRIBUTING.md
├── README.md
└── NEWS.md
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/inst/extdata/iris.fst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris.fst
--------------------------------------------------------------------------------
/inst/extdata/iris.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris.rds
--------------------------------------------------------------------------------
/inst/extdata/iris.duckdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris.duckdb
--------------------------------------------------------------------------------
/inst/extdata/iris.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris.parquet
--------------------------------------------------------------------------------
/inst/extdata/iris.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris.sqlite
--------------------------------------------------------------------------------
/inst/extdata/multifile.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/multifile.zip
--------------------------------------------------------------------------------
/man/figures/hex_parquetize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/man/figures/hex_parquetize.png
--------------------------------------------------------------------------------
/man/figures/Insee_example_csv.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/man/figures/Insee_example_csv.gif
--------------------------------------------------------------------------------
/inst/extdata/iris_dataset/Species=setosa/part-0.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris_dataset/Species=setosa/part-0.parquet
--------------------------------------------------------------------------------
/inst/extdata/iris_dataset/Species=versicolor/part-0.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris_dataset/Species=versicolor/part-0.parquet
--------------------------------------------------------------------------------
/inst/extdata/iris_dataset/Species=virginica/part-0.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris_dataset/Species=virginica/part-0.parquet
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^dev/dev_history\.R$
4 | ^data-raw$
5 | ^_pkgdown\.yml$
6 | ^docs$
7 | ^pkgdown$
8 | ^\.github$
9 | ^CONTRIBUTING.md
10 |
11 | # These folders
12 | tests/testthat/output/
13 | tests/testthat/Data_test/
14 | ^doc$
15 | ^Meta$
16 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # These files
2 | .Rproj.user
3 | .Rhistory
4 | .RData
5 | .Ruserdata
6 |
7 | Thumbs.db
8 | docs
9 | inst/doc
10 |
11 | # These folders
12 | Data/
13 | Data_test/
14 | tests/testthat/output/
15 | tests/testthat/Data/
16 |
17 | # Files with these extensions
18 | *.sas7bdat
19 | /doc/
20 | /Meta/
21 |
--------------------------------------------------------------------------------
/data-raw/iris-rds.R:
--------------------------------------------------------------------------------
1 | #################################################################%#
2 | #### Code to create the rds file `iris.rds` in `inst/extdata` ####
3 | ###############################################################%#
4 |
5 | data(iris)
6 |
7 | saveRDS(object = iris,
8 | file = "inst/extdata/iris.rds")
9 |
--------------------------------------------------------------------------------
/data-raw/iris-fst.R:
--------------------------------------------------------------------------------
1 | #################################################################%#
2 | #### Code to create the fst file `iris.fst` in `inst/extdata` ####
3 | ###############################################################%#
4 |
5 | library(fst)
6 |
7 | data(iris)
8 |
9 | fst::write.fst(x = iris,
10 | path = "inst/extdata/iris.fst")
11 |
--------------------------------------------------------------------------------
/tests/testthat/test-check_parquet.R:
--------------------------------------------------------------------------------
1 | test_that("check_parquet fails on bad file", {
2 | expect_error(
3 | check_parquet(parquetize_example("iris.sqlite")),
4 | regexp = "Error creating dataset"
5 | )
6 | })
7 |
8 | test_that("check_parquet fails on missing file", {
9 | expect_error(
10 | check_parquet("no_such_file"),
11 | class = "no_such_file"
12 | )
13 | })
14 |
--------------------------------------------------------------------------------
/data-raw/iris-sqlite.R:
--------------------------------------------------------------------------------
1 | ####################################################################%#
2 | #### Code to create the sqlite file `iris.sqlite` in `inst/extdata` ####
3 | ##################################################################%#
4 |
5 | library(RSQLite)
6 | library(DBI)
7 | con <- DBI::dbConnect(RSQLite::SQLite(), "inst/extdata/iris.sqlite")
8 | dbWriteTable(con, "iris", iris)
9 | dbDisconnect(con)
10 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | # This file is part of the standard setup for testthat.
2 | # It is recommended that you do not modify it.
3 | #
4 | # Where should you do additional test configuration?
5 | # Learn more about the roles of various files in:
6 | # * https://r-pkgs.org/tests.html
7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files
8 |
9 | library(testthat)
10 | library(parquetize)
11 |
12 | test_check("parquetize")
13 |
--------------------------------------------------------------------------------
/parquetize.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 | ProjectId: 673eed3b-84af-4afe-8e19-fe69a63307c1
3 |
4 | RestoreWorkspace: Default
5 | SaveWorkspace: Default
6 | AlwaysSaveHistory: Default
7 |
8 | EnableCodeIndexing: Yes
9 | UseSpacesForTab: Yes
10 | NumSpacesForTab: 2
11 | Encoding: UTF-8
12 |
13 | RnwWeave: Sweave
14 | LaTeX: pdfLaTeX
15 |
16 | AutoAppendNewline: Yes
17 | StripTrailingWhitespace: Yes
18 |
19 | BuildType: Package
20 | PackageUseDevtools: Yes
21 | PackageInstallArgs: --no-multiarch --with-keep.source
22 |
--------------------------------------------------------------------------------
/man/expect_missing_argument.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/testthat-helpers.R
3 | \name{expect_missing_argument}
4 | \alias{expect_missing_argument}
5 | \title{Check if missing argument error is raised}
6 | \usage{
7 | expect_missing_argument(object, regexp)
8 | }
9 | \arguments{
10 | \item{object}{the object to check}
11 |
12 | \item{regexp}{a regexp with the message we must find}
13 | }
14 | \value{
15 | same as expect_error
16 | }
17 | \description{
18 | Check if missing argument error is raised
19 | }
20 | \keyword{internal}
21 |
--------------------------------------------------------------------------------
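For illustration, a usage sketch of this internal helper, mirroring the pattern used in tests/testthat/test-fst_to_parquet.R further down in this package (no new API, only the call shape):

test_that("fst_to_parquet fails without path_to_file", {
  expect_missing_argument(
    fst_to_parquet(path_to_parquet = tempfile()),
    regexp = "path_to_file"
  )
})
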
/tests/testthat/test-parquetize_example.R:
--------------------------------------------------------------------------------
1 | test_that("test number of sample files in the package positive", {
2 | expect_true(
3 | length(parquetize_example()) > 0
4 | )
5 | })
6 |
7 | test_that("test with file", {
8 | expect_no_error(
9 | parquetize_example("iris.json")
10 | )
11 | })
12 |
13 | test_that("test with directory without extension", {
14 | expect_no_error(
15 | parquetize_example("iris_dataset")
16 | )
17 | })
18 |
19 | test_that("test fails if file does not exist", {
20 | expect_error(
21 | parquetize_example("no_such_dataset"),
22 | class = "no_such_file"
23 | )
24 | })
25 |
--------------------------------------------------------------------------------
/data-raw/iris-parquet.R:
--------------------------------------------------------------------------------
1 | #########################################################################################%#
2 | #### Code to create the parquet file `iris.parquet` and the partitioned dataset in `inst/extdata` ####
3 | #######################################################################################%#
4 |
5 | library(arrow)
6 |
7 | data(iris)
8 |
9 | # For iris.parquet
10 | arrow::write_parquet(x = iris,
11 | sink = "inst/extdata/iris.parquet")
12 |
13 | # For partitioned files
14 |
15 | arrow::write_dataset(dataset = iris,
16 |                      path = "inst/extdata/iris_dataset",
17 | partitioning = c("Species"))
18 |
--------------------------------------------------------------------------------
/man/parquetize_example.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/parquetize_example.R
3 | \name{parquetize_example}
4 | \alias{parquetize_example}
5 | \title{Get path to parquetize example}
6 | \usage{
7 | parquetize_example(file = NULL)
8 | }
9 | \arguments{
10 | \item{file}{Name of file or directory. If \code{NULL}, the example files will be listed.}
11 | }
12 | \value{
13 | A character string
14 | }
15 | \description{
16 | parquetize comes bundled with a number of sample files in its \code{inst/extdata}
17 | directory. This function makes them easy to access.
18 | }
19 | \examples{
20 | parquetize_example()
21 | parquetize_example("region_2022.csv")
22 | parquetize_example("iris_dataset")
23 | }
24 |
--------------------------------------------------------------------------------
/tests/testthat/test-get_parquet_info.R:
--------------------------------------------------------------------------------
1 | test_that("get_parquet_info works for file", {
2 | parquet <- system.file("extdata", "iris.parquet", package = "parquetize")
3 | info <- get_parquet_info(parquet)
4 |
5 | expect_s3_class(info, "tbl")
6 | expect_equal(nrow(info), 1)
7 | expect_equal(ncol(info), 5)
8 |
9 | expect_equal(info[[1, "path"]], parquet)
10 | expect_equal(info[[1, "num_rows"]], 150)
11 | expect_equal(info[[1, "num_row_groups"]], 1)
12 | expect_equal(info[[1, "num_columns"]], 5)
13 | expect_equal(info[[1, "mean_row_group_size"]], 150)
14 | })
15 |
16 | test_that("get_parquet_info works for dataset", {
17 | parquet <- system.file("extdata", "iris_dataset", package = "parquetize")
18 | info <- get_parquet_info(parquet)
19 |
20 | expect_s3_class(info, "tbl")
21 | expect_equal(nrow(info), 3)
22 | expect_equal(ncol(info), 5)
23 | })
24 |
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | template:
2 | bootstrap: 5
3 | bootswatch: litera
4 |
5 | navbar:
6 | right:
7 | - text: Contribute
8 | icon: fab fa-github fa-lg
9 | href: https://github.com/ddotta/parquetize
10 |
11 | reference:
12 | - title: Functions
13 | desc: The conversion functions available in this package
14 | contents:
15 | - csv_to_parquet
16 | - json_to_parquet
17 | - rds_to_parquet
18 | - fst_to_parquet
19 | - table_to_parquet
20 | - sqlite_to_parquet
21 | - dbi_to_parquet
22 | - title: Other functions
23 | contents:
24 | - get_parquet_info
25 | - get_partitions
26 | - check_parquet
27 | - download_extract
28 | - rbind_parquet
29 | - parquetize_example
30 | - title: Developers
31 | contents:
32 | - write_parquet_by_chunk
33 | - write_parquet_at_once
34 |
--------------------------------------------------------------------------------
/.github/workflows/check-release.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - main
10 |
11 | name: R-CMD-check
12 |
13 | jobs:
14 | R-CMD-check:
15 | runs-on: ubuntu-latest
16 | env:
17 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
18 | R_KEEP_PKG_SOURCE: yes
19 | steps:
20 | - uses: actions/checkout@v3
21 |
22 | - uses: r-lib/actions/setup-r@v2
23 | with:
24 | use-public-rspm: true
25 |
26 | - uses: r-lib/actions/setup-r-dependencies@v2
27 | with:
28 | extra-packages: any::rcmdcheck
29 | needs: check
30 |
31 | - uses: r-lib/actions/check-r-package@v2
32 |
--------------------------------------------------------------------------------
/R/package-parquetize.R:
--------------------------------------------------------------------------------
1 | #' @keywords internal
2 | #' @importFrom DBI dbClearResult dbConnect dbDisconnect dbFetch dbHasCompleted dbListTables dbReadTable dbSendQuery
3 | #' @importFrom RSQLite SQLite
4 | #' @importFrom arrow open_dataset read_json_arrow read_parquet write_dataset write_parquet
5 | #' @importFrom cli cli_abort cli_alert_danger cli_alert_info cli_alert_success cli_alert_warning cli_progress_bar cli_progress_message
6 | #' @importFrom curl curl_download
7 | #' @importFrom fst read.fst
8 | #' @importFrom glue glue glue_sql
9 | #' @importFrom haven read_dta read_sas read_sav
10 | #' @importFrom jsonlite read_json
11 | #' @importFrom lifecycle deprecate_warn deprecated
12 | #' @importFrom readr locale read_delim
13 | #' @importFrom tibble as_tibble
14 | #' @importFrom tidyselect all_of everything
15 | #' @importFrom tools file_ext file_path_sans_ext
16 | #' @importFrom utils object.size unzip
17 | #' @importFrom rlang inject
18 | #' @import dplyr
19 | "_PACKAGE"
20 |
--------------------------------------------------------------------------------
/man/expect_parquet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/testthat-helpers.R
3 | \name{expect_parquet}
4 | \alias{expect_parquet}
5 | \title{Check if parquet dataset/file is readable and has the correct number of rows}
6 | \usage{
7 | expect_parquet(
8 | path,
9 | with_lines,
10 | with_partitions = NULL,
11 | with_columns = NULL,
12 | with_files = NULL
13 | )
14 | }
15 | \arguments{
16 | \item{path}{to the parquet file or dataset}
17 |
18 | \item{with_lines}{number of lines the file/dataset should have}
19 |
20 | \item{with_partitions}{NULL or a vector with the partition names the dataset should have}
21 |
22 | \item{with_columns}{NULL or a vector of column names the dataset/file should have}
23 |
24 | \item{with_files}{NULL or number of files a dataset should have}
25 | }
26 | \value{
27 | the dataset handle
28 | }
29 | \description{
30 | Check if parquet dataset/file is readable and has the correct number of rows
31 | }
32 | \keyword{internal}
33 |
--------------------------------------------------------------------------------
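As a sketch of how this helper is typically called, using the bundled iris.parquet file (150 rows and 5 columns, per the package's own tests); the column names are the standard iris ones:

expect_parquet(
  system.file("extdata", "iris.parquet", package = "parquetize"),
  with_lines = 150,
  with_columns = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species")
)
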
/man/get_partitions.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_partitions.R
3 | \name{get_partitions}
4 | \alias{get_partitions}
5 | \title{Get unique values from a table's column}
6 | \usage{
7 | get_partitions(conn, table, column)
8 | }
9 | \arguments{
10 | \item{conn}{A \code{DBIConnection} object, as returned by \code{DBI::dbConnect}}
11 |
12 | \item{table}{a DB table name}
13 |
14 | \item{column}{a column name from the table passed in \code{table}}
15 | }
16 | \value{
17 | a vector with unique values for the column of the table
18 | }
19 | \description{
20 | This function allows you to extract unique values from a table's column to use as partitions.\cr
21 |
22 | Internally, this function does "SELECT DISTINCT(\code{mycolumn}) FROM \code{mytable};"
23 | }
24 | \examples{
25 | dbi_connection <- DBI::dbConnect(RSQLite::SQLite(),
26 | system.file("extdata","iris.sqlite",package = "parquetize"))
27 |
28 | get_partitions(dbi_connection, "iris", "Species")
29 | }
30 |
--------------------------------------------------------------------------------
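A minimal sketch of how the returned values can drive a partition-by-partition export; the one-file-per-value layout here is only an illustration, not the package's own conversion logic:

library(parquetize)

con <- DBI::dbConnect(RSQLite::SQLite(),
                      system.file("extdata", "iris.sqlite", package = "parquetize"))
for (sp in get_partitions(con, "iris", "Species")) {
  # glue_sql() quotes the value safely; one parquet file per distinct Species
  df <- DBI::dbGetQuery(con, glue::glue_sql("SELECT * FROM iris WHERE Species = {sp}", .con = con))
  arrow::write_parquet(df, file.path(tempdir(), paste0("Species=", sp, ".parquet")))
}
DBI::dbDisconnect(con)
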
/tests/testthat/test-rbind_parquet.R:
--------------------------------------------------------------------------------
1 | test_that("Checks rbind_parquet creates correct output file", {
2 | temp_dir <- tempfile()
3 |
4 | dir.create(temp_dir, showWarnings = FALSE)
5 |
6 | file.create(fileext = file.path(temp_dir, "test_data1-4.parquet"))
7 | write_parquet(data.frame(
8 | x = c("a","b","c"),
9 | y = c(1L,2L,3L)
10 | ), file.path(temp_dir, "test_data1-4.parquet"))
11 |
12 | file.create(fileext = file.path(temp_dir, "test_data4-6.parquet"))
13 | write_parquet(data.frame(
14 | x = c("d","e","f"),
15 | y = c(4L,5L,6L)
16 | ), file.path(temp_dir, "test_data4-6.parquet"))
17 |
18 | test_data <- rbind_parquet(folder = temp_dir,
19 | output_name = "test_data",
20 | delete_initial_files = FALSE)
21 |
22 | expect_equal(
23 | unname(unlist(lapply(test_data, class))),
24 | c("character", "integer")
25 | )
26 |
27 | expect_equal(names(test_data), c("x",
28 | "y"))
29 |
30 | })
31 |
--------------------------------------------------------------------------------
/man/parquetize-package.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/package-parquetize.R
3 | \docType{package}
4 | \name{parquetize-package}
5 | \alias{parquetize}
6 | \alias{parquetize-package}
7 | \title{parquetize: Convert Files to Parquet Format}
8 | \description{
9 | Collection of functions to get files in parquet format. Parquet is a columnar storage file format \url{https://parquet.apache.org/}. The files to convert can be of several formats ("csv", "RData", "rds", "RSQLite", "json", "ndjson", "SAS", "SPSS"...).
10 | }
11 | \seealso{
12 | Useful links:
13 | \itemize{
14 | \item \url{https://ddotta.github.io/parquetize/}
15 | \item \url{https://github.com/ddotta/parquetize}
16 | \item Report bugs at \url{https://github.com/ddotta/parquetize/issues}
17 | }
18 |
19 | }
20 | \author{
21 | \strong{Maintainer}: Damien Dotta \email{damien.dotta@live.fr}
22 |
23 | Authors:
24 | \itemize{
25 | \item Nicolas Chuche \email{nicolas.chuche@barna.be}
26 | }
27 |
28 | }
29 | \keyword{internal}
30 |
--------------------------------------------------------------------------------
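A typical end-to-end use of the package, based on the conversion functions documented below (the bundled region_2022.csv sample is used as input, exactly as in the download_extract examples):

library(parquetize)

csv_to_parquet(
  parquetize_example("region_2022.csv"),
  path_to_parquet = tempfile(fileext = ".parquet")
)
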
/R/parquetize_example.R:
--------------------------------------------------------------------------------
1 | #' @name parquetize_example
2 | #'
3 | #' @title Get path to parquetize example
4 | #'
5 | #' @description parquetize comes bundled with a number of sample files in its `inst/extdata`
6 | #' directory. This function makes them easy to access.
7 | #'
8 | #' @param file Name of file or directory. If `NULL`, the example files will be listed.
9 | #'
10 | #' @return A character string
11 | #'
12 | #' @export
13 | #' @examples
14 | #' parquetize_example()
15 | #' parquetize_example("region_2022.csv")
16 | #' parquetize_example("iris_dataset")
17 |
18 | parquetize_example <- function(file = NULL) {
19 | # To show all example files contained in parquetize
20 | if (is.null(file)) {
21 | return(dir(system.file("extdata", package = "parquetize")))
22 | }
23 |
24 | #To get the path to a file or a directory
25 | tryCatch(
26 | system.file("extdata", file, package = "parquetize", mustWork = TRUE),
27 | error = function(cond) cli_abort("Be careful, {file} doesn't exist in parquetize", class = "no_such_file")
28 | )
29 | }
30 |
--------------------------------------------------------------------------------
/data-raw/region-2022.R:
--------------------------------------------------------------------------------
1 | ################################################################################################%#
2 | #### Code to create the csv/txt file `region_2022.csv` and `region_2022.txt` in `inst/extdata`####
3 | ################################################################################################%#
4 |
5 | # The file `region_2022.csv` comes from the site insee.fr.
6 | # It can be downloaded at the following URL :
7 | # https://www.insee.fr/fr/information/6051727
8 |
9 | library(curl)
10 | library(readr)
11 |
12 | zipinseefr <- curl_download("https://www.insee.fr/fr/statistiques/fichier/6051727/cog_ensemble_2022_csv.zip",
13 | tempfile())
14 | filesinseefr <- unzip(zipfile=zipinseefr)
15 |
16 | region_2022 <- read_delim(filesinseefr[11],
17 | show_col_types = FALSE)
18 |
19 | write.csv2(
20 | region_2022,
21 | file = "inst/extdata/region_2022.csv",
22 | row.names = FALSE)
23 |
24 | write.table(
25 | region_2022,
26 | file = "inst/extdata/region_2022.txt",
27 | row.names = FALSE
28 | )
29 |
--------------------------------------------------------------------------------
/tests/testthat/test-get_partitions.R:
--------------------------------------------------------------------------------
1 | dbi_connection <- DBI::dbConnect(RSQLite::SQLite(),
2 | system.file("extdata","iris.sqlite",package = "parquetize"))
3 | on.exit(DBI::dbDisconnect(dbi_connection))
4 |
5 | test_that("Checks get_partitions returns the good value", {
6 | partitions <- expect_no_error(
7 | get_partitions(
8 | conn = dbi_connection,
9 | table = "iris",
10 | column = "Species"
11 | ),
12 | )
13 |
14 | testthat::expect_setequal(partitions, c("setosa", "versicolor", "virginica"))
15 | })
16 |
17 | test_that("Checks arguments are correctly filled in", {
18 | expect_missing_argument(
19 | get_partitions(
20 | table = "iris",
21 | column = "Species"
22 | ),
23 | regexp = "conn"
24 | )
25 | expect_missing_argument(
26 | get_partitions(
27 | conn = dbi_connection,
28 | column = "Species"
29 | ),
30 | regexp = "table"
31 | )
32 | expect_missing_argument(
33 | get_partitions(
34 | conn = dbi_connection,
35 | table = "iris",
36 | ),
37 | regexp = "column"
38 | )
39 | })
40 |
41 |
--------------------------------------------------------------------------------
/man/get_parquet_info.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_parquet_info.R
3 | \name{get_parquet_info}
4 | \alias{get_parquet_info}
5 | \title{Get various info on parquet files}
6 | \usage{
7 | get_parquet_info(path)
8 | }
9 | \arguments{
10 | \item{path}{parquet file path or directory. If directory is given,
11 | \code{get_parquet_info} will be applied on all parquet files found in
12 | subdirectories}
13 | }
14 | \value{
15 | a tibble with 5 columns :
16 | \itemize{
17 | \item path, file path
18 | \item num_rows, number of rows
19 | \item num_row_groups, number of row groups
20 | \item num_columns, number of columns
21 | \item mean_row_group_size, mean row group size
22 | }
23 |
24 | If any column contains \code{NA}, the parquet file may be malformed.
25 | }
26 | \description{
27 | One very important piece of parquet metadata is the row group size.\cr
28 | 
29 | If its value is low (below 10 000), you should rebuild your parquet files.\cr
30 | 
31 | A normal value is between 30 000 and 1 000 000.
32 | }
33 | \examples{
34 | get_parquet_info(system.file("extdata", "iris.parquet", package = "parquetize"))
35 |
36 | get_parquet_info(system.file("extdata", "iris_dataset", package = "parquetize"))
37 | }
38 |
--------------------------------------------------------------------------------
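A sketch of acting on the row-group advice above; the mean_row_group_size column name follows the package's test suite (tests/testthat/test-get_parquet_info.R):

library(parquetize)

info <- get_parquet_info(system.file("extdata", "iris_dataset", package = "parquetize"))
# keep only the files whose mean row group size falls below the 10 000 threshold
info[!is.na(info$mean_row_group_size) & info$mean_row_group_size < 10000,
     c("path", "mean_row_group_size")]
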
/inst/extdata/region_2022.txt:
--------------------------------------------------------------------------------
1 | "REG" "CHEFLIEU" "TNCC" "NCC" "NCCENR" "LIBELLE"
2 | 1 "97105" 3 "GUADELOUPE" "Guadeloupe" "Guadeloupe"
3 | 2 "97209" 3 "MARTINIQUE" "Martinique" "Martinique"
4 | 3 "97302" 3 "GUYANE" "Guyane" "Guyane"
5 | 4 "97411" 0 "LA REUNION" "La Réunion" "La Réunion"
6 | 6 "97608" 0 "MAYOTTE" "Mayotte" "Mayotte"
7 | 11 "75056" 1 "ILE DE FRANCE" "Île-de-France" "Île-de-France"
8 | 24 "45234" 2 "CENTRE VAL DE LOIRE" "Centre-Val de Loire" "Centre-Val de Loire"
9 | 27 "21231" 0 "BOURGOGNE FRANCHE COMTE" "Bourgogne-Franche-Comté" "Bourgogne-Franche-Comté"
10 | 28 "76540" 0 "NORMANDIE" "Normandie" "Normandie"
11 | 32 "59350" 4 "HAUTS DE FRANCE" "Hauts-de-France" "Hauts-de-France"
12 | 44 "67482" 2 "GRAND EST" "Grand Est" "Grand Est"
13 | 52 "44109" 4 "PAYS DE LA LOIRE" "Pays de la Loire" "Pays de la Loire"
14 | 53 "35238" 0 "BRETAGNE" "Bretagne" "Bretagne"
15 | 75 "33063" 3 "NOUVELLE AQUITAINE" "Nouvelle-Aquitaine" "Nouvelle-Aquitaine"
16 | 76 "31555" 1 "OCCITANIE" "Occitanie" "Occitanie"
17 | 84 "69123" 1 "AUVERGNE RHONE ALPES" "Auvergne-Rhône-Alpes" "Auvergne-Rhône-Alpes"
18 | 93 "13055" 0 "PROVENCE ALPES COTE D AZUR" "Provence-Alpes-Côte d'Azur" "Provence-Alpes-Côte d'Azur"
19 | 94 "2A004" 0 "CORSE" "Corse" "Corse"
20 |
--------------------------------------------------------------------------------
/inst/extdata/region_2022.csv:
--------------------------------------------------------------------------------
1 | "REG";"CHEFLIEU";"TNCC";"NCC";"NCCENR";"LIBELLE"
2 | "01";"97105";3;"GUADELOUPE";"Guadeloupe";"Guadeloupe"
3 | "02";"97209";3;"MARTINIQUE";"Martinique";"Martinique"
4 | "03";"97302";3;"GUYANE";"Guyane";"Guyane"
5 | "04";"97411";0;"LA REUNION";"La Réunion";"La Réunion"
6 | "06";"97608";0;"MAYOTTE";"Mayotte";"Mayotte"
7 | "11";"75056";1;"ILE DE FRANCE";"Île-de-France";"Île-de-France"
8 | "24";"45234";2;"CENTRE VAL DE LOIRE";"Centre-Val de Loire";"Centre-Val de Loire"
9 | "27";"21231";0;"BOURGOGNE FRANCHE COMTE";"Bourgogne-Franche-Comté";"Bourgogne-Franche-Comté"
10 | "28";"76540";0;"NORMANDIE";"Normandie";"Normandie"
11 | "32";"59350";4;"HAUTS DE FRANCE";"Hauts-de-France";"Hauts-de-France"
12 | "44";"67482";2;"GRAND EST";"Grand Est";"Grand Est"
13 | "52";"44109";4;"PAYS DE LA LOIRE";"Pays de la Loire";"Pays de la Loire"
14 | "53";"35238";0;"BRETAGNE";"Bretagne";"Bretagne"
15 | "75";"33063";3;"NOUVELLE AQUITAINE";"Nouvelle-Aquitaine";"Nouvelle-Aquitaine"
16 | "76";"31555";1;"OCCITANIE";"Occitanie";"Occitanie"
17 | "84";"69123";1;"AUVERGNE RHONE ALPES";"Auvergne-Rhône-Alpes";"Auvergne-Rhône-Alpes"
18 | "93";"13055";0;"PROVENCE ALPES COTE D AZUR";"Provence-Alpes-Côte d'Azur";"Provence-Alpes-Côte d'Azur"
19 | "94";"2A004";0;"CORSE";"Corse";"Corse"
20 |
--------------------------------------------------------------------------------
/inst/extdata/region_2022_with_comment.csv:
--------------------------------------------------------------------------------
1 | # A comment
2 | "REG";"CHEFLIEU";"TNCC";"NCC";"NCCENR";"LIBELLE"
3 | "01";"97105";3;"GUADELOUPE";"Guadeloupe";"Guadeloupe"
4 | "02";"97209";3;"MARTINIQUE";"Martinique";"Martinique"
5 | "03";"97302";3;"GUYANE";"Guyane";"Guyane"
6 | "04";"97411";0;"LA REUNION";"La Réunion";"La Réunion"
7 | "06";"97608";0;"MAYOTTE";"Mayotte";"Mayotte"
8 | "11";"75056";1;"ILE DE FRANCE";"Île-de-France";"Île-de-France"
9 | "24";"45234";2;"CENTRE VAL DE LOIRE";"Centre-Val de Loire";"Centre-Val de Loire"
10 | "27";"21231";0;"BOURGOGNE FRANCHE COMTE";"Bourgogne-Franche-Comté";"Bourgogne-Franche-Comté"
11 | "28";"76540";0;"NORMANDIE";"Normandie";"Normandie"
12 | "32";"59350";4;"HAUTS DE FRANCE";"Hauts-de-France";"Hauts-de-France"
13 | "44";"67482";2;"GRAND EST";"Grand Est";"Grand Est"
14 | "52";"44109";4;"PAYS DE LA LOIRE";"Pays de la Loire";"Pays de la Loire"
15 | "53";"35238";0;"BRETAGNE";"Bretagne";"Bretagne"
16 | "75";"33063";3;"NOUVELLE AQUITAINE";"Nouvelle-Aquitaine";"Nouvelle-Aquitaine"
17 | "76";"31555";1;"OCCITANIE";"Occitanie";"Occitanie"
18 | "84";"69123";1;"AUVERGNE RHONE ALPES";"Auvergne-Rhône-Alpes";"Auvergne-Rhône-Alpes"
19 | "93";"13055";0;"PROVENCE ALPES COTE D AZUR";"Provence-Alpes-Côte d'Azur";"Provence-Alpes-Côte d'Azur"
20 | "94";"2A004";0;"CORSE";"Corse";"Corse"
21 |
--------------------------------------------------------------------------------
/man/check_parquet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/check_parquet.R
3 | \name{check_parquet}
4 | \alias{check_parquet}
5 | \title{Check if parquet file or dataset is readable and return basic information}
6 | \usage{
7 | check_parquet(path)
8 | }
9 | \arguments{
10 | \item{path}{path to the file or dataset}
11 | }
12 | \value{
13 | a tibble with information on parquet dataset/file's columns with
14 | three columns : field name, arrow type and nullable
15 | }
16 | \description{
17 | This function checks that a file/dataset is in valid parquet format.
18 | It will print the number of lines/columns and return a tibble with
19 | column information.
20 | }
21 | \details{
22 | This function will :
23 | \itemize{
24 | \item open the parquet dataset/file to check if it's valid
25 | \item print the number of lines
26 | \item print the number of columns
27 | \item return a tibble with 2 columns :
28 | \itemize{
29 | \item the column name (string)
30 | \item the arrow type (string)
31 | }
32 | }
33 |
34 | You can find a list of arrow types in the documentation
35 | \href{https://arrow.apache.org/docs/r/articles/data_types.html}{on this page}.
36 | }
37 | \examples{
38 |
39 | # check a parquet file
40 | check_parquet(parquetize_example("iris.parquet"))
41 |
42 | # check a parquet dataset
43 | check_parquet(parquetize_example("iris_dataset"))
44 | }
45 |
--------------------------------------------------------------------------------
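The returned tibble can be used for quick assertions; a sketch, assuming only that its first column holds the field names (the exact column labels are not documented here):

library(parquetize)

cols <- check_parquet(parquetize_example("iris.parquet"))
stopifnot("Species" %in% cols[[1]])
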
/R/get_partitions.R:
--------------------------------------------------------------------------------
1 | #' @name get_partitions
2 | #'
3 | #' @title Get unique values from a table's column
4 | #'
5 | #' @description This function allows you to extract unique values from a table's column to use as partitions.\cr
6 | #'
7 | #' Internally, this function does "SELECT DISTINCT(`mycolumn`) FROM `mytable`;"
8 | #'
9 | #' @param conn A `DBIConnection` object, as returned by `DBI::dbConnect`
10 | #' @param table a DB table name
11 | #' @param column a column name from the table passed in `table`
12 | #'
13 | #' @return a vector with unique values for the column of the table
14 | #' @export
15 | #'
16 | #' @examples
17 | #' dbi_connection <- DBI::dbConnect(RSQLite::SQLite(),
18 | #' system.file("extdata","iris.sqlite",package = "parquetize"))
19 | #'
20 | #' get_partitions(dbi_connection, "iris", "Species")
21 | get_partitions <- function(conn, table, column) {
22 | if (missing(conn)) {
23 | cli_abort("Be careful, the argument conn must be filled in", class = "parquetize_missing_argument")
24 | }
25 | if (missing(table)) {
26 | cli_abort("Be careful, the argument table must be filled in", class = "parquetize_missing_argument")
27 | }
28 | if (missing(column)) {
29 | cli_abort("Be careful, the argument column must be filled in", class = "parquetize_missing_argument")
30 | }
31 |
32 | DBI::dbGetQuery(conn, glue::glue("SELECT distinct({`column`}) FROM {`table`}", .con = conn))[,1]
33 | }
34 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: parquetize
2 | Type: Package
3 | Title: Convert Files to Parquet Format
4 | Version: 0.5.8
5 | Authors@R:
6 | c(person(given = "Damien",
7 | family = "Dotta",
8 | role = c("aut", "cre"),
9 | email = "damien.dotta@live.fr"),
10 | person(given = "Nicolas",
11 | family = "Chuche",
12 | role = c("aut"),
13 | email = "nicolas.chuche@barna.be"))
14 | Description: Collection of functions to get files in parquet format.
15 |     Parquet is a columnar storage file format <https://parquet.apache.org/>.
16 | The files to convert can be of several formats
17 | ("csv", "RData", "rds", "RSQLite",
18 | "json", "ndjson", "SAS", "SPSS"...).
19 | License: Apache License (>= 2.0)
20 | Encoding: UTF-8
21 | Depends:
22 | R (>= 3.5.0)
23 | URL: https://ddotta.github.io/parquetize/,
24 | https://github.com/ddotta/parquetize
25 | BugReports: https://github.com/ddotta/parquetize/issues
26 | Roxygen: list(markdown = TRUE)
27 | RoxygenNote: 7.3.2
28 | Suggests:
29 | knitr,
30 | rmarkdown,
31 | testthat (>= 3.0.0)
32 | Config/testthat/edition: 3
33 | Imports:
34 | haven (>= 2.4.0),
35 | arrow,
36 | curl,
37 | readr,
38 | jsonlite,
39 | DBI,
40 | RSQLite,
41 | cli,
42 | tidyselect,
43 | lifecycle,
44 | tools,
45 | glue,
46 | fst,
47 | rlang,
48 | dplyr,
49 | tibble
50 | VignetteBuilder: knitr
51 |
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 | branches: [main, master]
8 | release:
9 | types: [published]
10 | workflow_dispatch:
11 |
12 | name: pkgdown
13 |
14 | jobs:
15 | pkgdown:
16 | runs-on: ubuntu-latest
17 | # Only restrict concurrency for non-PR jobs
18 | concurrency:
19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
20 | env:
21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
22 | steps:
23 | - uses: actions/checkout@v3
24 |
25 | - uses: r-lib/actions/setup-pandoc@v2
26 |
27 | - uses: r-lib/actions/setup-r@v2
28 | with:
29 | use-public-rspm: true
30 |
31 | - uses: r-lib/actions/setup-r-dependencies@v2
32 | with:
33 | extra-packages: any::pkgdown, local::.
34 | needs: website
35 |
36 | - name: Build site
37 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
38 | shell: Rscript {0}
39 |
40 | - name: Deploy to GitHub pages 🚀
41 | if: github.event_name != 'pull_request'
42 | uses: JamesIves/github-pages-deploy-action@v4.4.1
43 | with:
44 | clean: false
45 | branch: gh-pages
46 | folder: docs
47 |
--------------------------------------------------------------------------------
/tests/testthat/test-fst_to_parquet.R:
--------------------------------------------------------------------------------
1 | test_that("Checks arguments are correctly filled in", {
2 | expect_missing_argument(
3 | fst_to_parquet(
4 | path_to_file = system.file("extdata","iris.fst",package = "parquetize")
5 | ),
6 | regexp = "path_to_parquet"
7 | )
8 | expect_missing_argument(
9 | fst_to_parquet(
10 | path_to_parquet = tempfile()
11 | ),
12 | regexp = "path_to_file"
13 | )
14 | })
15 |
16 | test_that("Checks message is displayed with fst file", {
17 | path_to_parquet <- tempfile()
18 |
19 | expect_no_error(
20 | fst_to_parquet(
21 | path_to_file = system.file("extdata","iris.fst",package = "parquetize"),
22 | path_to_parquet = path_to_parquet
23 | )
24 | )
25 | expect_parquet(
26 | file.path(path_to_parquet),
27 | with_lines = 150
28 | )
29 |
30 | })
31 |
32 | test_that("Checks message is displayed with by adding partition and partitioning argument", {
33 | path_to_parquet <- tempfile()
34 |
35 | expect_no_error(
36 | fst_to_parquet(
37 | path_to_file = system.file("extdata","iris.fst",package = "parquetize"),
38 | path_to_parquet = path_to_parquet,
39 | partition = "yes",
40 | partitioning = c("Species")
41 | )
42 | )
43 |
44 | expect_parquet(
45 | file.path(path_to_parquet),
46 | with_lines = 150
47 | )
48 | expect_identical(
49 | dir(path_to_parquet),
50 | c('Species=setosa', 'Species=versicolor', 'Species=virginica')
51 | )
52 | })
53 |
--------------------------------------------------------------------------------
/tests/testthat/test-rds_to_parquet.R:
--------------------------------------------------------------------------------
1 | test_that("Checks arguments are correctly filled in", {
2 | expect_missing_argument(
3 | rds_to_parquet(
4 | path_to_file = system.file("extdata","iris.rds",package = "parquetize")
5 | ),
6 | regexp = "path_to_parquet"
7 | )
8 | expect_missing_argument(
9 | rds_to_parquet(
10 | path_to_parquet = tempfile()
11 | ),
12 | regexp = "path_to_file"
13 | )
14 | })
15 |
16 | test_that("Checks message is displayed with rds file", {
17 | path_to_parquet <- tempfile()
18 |
19 | expect_no_error(
20 | rds_to_parquet(
21 | path_to_file = system.file("extdata","iris.rds",package = "parquetize"),
22 | path_to_parquet = path_to_parquet
23 | )
24 | )
25 | expect_parquet(
26 | file.path(path_to_parquet),
27 | with_lines = 150
28 | )
29 |
30 | })
31 |
32 | test_that("Checks message is displayed with by adding partition and partitioning argument", {
33 | path_to_parquet <- tempfile()
34 |
35 | expect_no_error(
36 | rds_to_parquet(
37 | path_to_file = system.file("extdata","iris.rds",package = "parquetize"),
38 | path_to_parquet = path_to_parquet,
39 | partition = "yes",
40 | partitioning = c("Species")
41 | )
42 | )
43 |
44 | expect_parquet(
45 | file.path(path_to_parquet),
46 | with_lines = 150
47 | )
48 | expect_identical(
49 | dir(path_to_parquet),
50 | c('Species=setosa', 'Species=versicolor', 'Species=virginica')
51 | )
52 | })
53 |
--------------------------------------------------------------------------------
/tests/testthat/test-write_parquet_at_once.R:
--------------------------------------------------------------------------------
1 | test_that("write_parquet_at_once warn if path_to_parquet is a directory for a parquet file", {
2 | path_to_parquet <- tempfile()
3 | dir.create(path_to_parquet, showWarnings = FALSE)
4 | expect_message(
5 | write_parquet_at_once(mtcars, path_to_parquet = path_to_parquet, partition = "no"),
6 | regexp = "path_to_parquet should be a file name"
7 | )
8 | })
9 |
10 | test_that("write_parquet_at_once fails on missing argument", {
11 | expect_missing_argument(
12 | write_parquet_at_once(
13 | path_to_parquet = path_to_parquet
14 | ),
15 | regexp = "data"
16 | )
17 |
18 | expect_missing_argument(
19 | write_parquet_at_once(
20 | data = iris
21 | ),
22 | regexp = "path_to_parquet"
23 | )
24 | })
25 |
26 | test_that("write_parquet_at_once works for simple parquet file", {
27 | path_to_parquet <- tempfile()
28 | expect_no_error(
29 | write_parquet_at_once(iris, path_to_parquet)
30 | )
31 |
32 | expect_parquet(
33 | path_to_parquet,
34 | with_lines = 150,
35 |     with_files = 1
36 | )
37 | })
38 |
39 | test_that("write_parquet_at_once works for partitioned dataset", {
40 | path_to_parquet <- tempfile()
41 | expect_no_error(
42 | write_parquet_at_once(iris, path_to_parquet, partition = "yes", partitioning = "Species")
43 | )
44 |
45 | expect_parquet(
46 | path_to_parquet,
47 | with_lines = 150,
48 |     with_files = 3,
49 | with_partitions = c("Species=setosa", "Species=versicolor", "Species=virginica")
50 | )
51 | })
52 |
--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 | branches: [main, master]
8 |
9 | name: test-coverage
10 |
11 | jobs:
12 | test-coverage:
13 | runs-on: ubuntu-latest
14 | env:
15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 |
17 | steps:
18 | - uses: actions/checkout@v3
19 |
20 | - uses: r-lib/actions/setup-r@v2
21 | with:
22 | use-public-rspm: true
23 |
24 | - uses: r-lib/actions/setup-r-dependencies@v2
25 | with:
26 | extra-packages: any::covr
27 | needs: coverage
28 |
29 | - name: Test coverage
30 | run: |
31 | covr::codecov(
32 | quiet = FALSE,
33 | clean = FALSE,
34 | install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package")
35 | )
36 | shell: Rscript {0}
37 |
38 | - name: Show testthat output
39 | if: always()
40 | run: |
41 | ## --------------------------------------------------------------------
42 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
43 | shell: bash
44 |
45 | - name: Upload test results
46 | if: failure()
47 | uses: actions/upload-artifact@v4
48 | with:
49 | name: coverage-test-failures
50 | path: ${{ runner.temp }}/package
51 |
--------------------------------------------------------------------------------
/tests/testthat/test-download_extract.R:
--------------------------------------------------------------------------------
1 | test_that("Checks download_extract return local file if not a zip", {
2 | expect_equal(
3 | download_extract("/my/local/file.truc"),
4 | "/my/local/file.truc"
5 | )
6 | })
7 |
8 | test_that("Checks download_extract returns the csv file of local zip", {
9 | expect_match(
10 | download_extract(system.file("extdata","mtcars.csv.zip", package = "readr")),
11 | ".*/mtcars.csv"
12 | )
13 | })
14 |
15 | test_that("Checks download_extract fails with error if zip has more than one file and no filename_in_zip", {
16 | skip_if_offline()
17 |
18 | expect_missing_argument(
19 | download_extract(
20 | system.file("extdata","multifile.zip",package = "parquetize")
21 | ),
22 | regexp = "filename_in_zip"
23 | )
24 | })
25 |
26 | test_that("Checks download_extract works with multi files zip", {
27 | file <- download_extract(
28 | system.file("extdata","multifile.zip",package = "parquetize"),
29 | filename_in_zip = "region_2022.csv"
30 | )
31 |
32 | expect_match(
33 | file,
34 | ".*/region_2022.csv"
35 | )
36 |
37 | expect_true(
38 | file.exists(file)
39 | )
40 | })
41 |
42 | test_that("Checks download_extract returns the csv file of remote zip", {
43 | skip_if_offline()
44 |
45 | file <- download_extract(
46 | "https://www.stats.govt.nz/assets/Uploads/Business-employment-data/Business-employment-data-June-2022-quarter/Download-data/business-employment-data-june-2022-quarter-csv.zip"
47 | )
48 |
49 | expect_match(
50 | file,
51 | ".*/machine-readable-business-employment-data-june-2022-quarter.csv"
52 | )
53 |
54 | expect_true(
55 | file.exists(file)
56 | )
57 | })
58 |
59 |
--------------------------------------------------------------------------------
/R/check_parquet.R:
--------------------------------------------------------------------------------
1 | #' @name check_parquet
2 | #'
3 | #' @title Check if parquet file or dataset is readable and return basic information
4 | #'
5 | #' @description This function checks that a file/dataset is in valid parquet format.
6 | #' It will print the number of lines/columns and return a tibble with
7 | #' column information.
8 | #'
9 | #' @details This function will :
10 | #'
11 | #' * open the parquet dataset/file to check if it's valid
12 | #' * print the number of lines
13 | #' * print the number of columns
14 | #' * return a tibble with 2 columns :
15 | #'
16 | #' * the column name (string)
17 | #' * the arrow type (string)
18 | #'
19 | #' You can find a list of arrow types in the documentation
20 | #' \href{https://arrow.apache.org/docs/r/articles/data_types.html}{on this page}.
21 | #'
22 | #' @param path path to the file or dataset
23 | #'
24 | #' @return a tibble with information on parquet dataset/file's columns with
25 | #' three columns : field name, arrow type and nullable
26 | #'
27 | #' @export
28 | #'
29 | #' @examples
30 | #'
31 | #' # check a parquet file
32 | #' check_parquet(parquetize_example("iris.parquet"))
33 | #'
34 | #' # check a parquet dataset
35 | #' check_parquet(parquetize_example("iris_dataset"))
36 | check_parquet <- function(path) {
37 |
38 | if (isFALSE(file.exists(path))) {
39 | cli_abort("Be careful, {path} doesn't exist", class = "no_such_file")
40 | }
41 |
42 | cli_alert_info("checking: {path}")
43 |
44 | ds <- arrow::open_dataset(path, unify_schemas = TRUE)
45 | cli_alert_success("loading dataset: ok")
46 |
47 | cli_alert_success("number of lines: {nrow(ds)}")
48 | cli_alert_success("number of columns: {length(names(ds))}")
49 |
50 | get_col_types(ds)
51 | }
52 |
--------------------------------------------------------------------------------
/tests/testthat/test-utilities.R:
--------------------------------------------------------------------------------
1 | test_that("test get_haven_read_function_by_extension returns the good method", {
2 | file <- system.file("examples","iris.dta", package = "haven")
3 | fun <- get_haven_read_function_for_file(file)
4 | expect_s3_class(fun(file), "tbl")
5 |
6 | file <- system.file("examples","iris.sas7bdat", package = "haven")
7 | fun <- get_haven_read_function_for_file(file)
8 | expect_s3_class(fun(file), "tbl")
9 |
10 | file <- system.file("examples","iris.sav", package = "haven")
11 | fun <- get_haven_read_function_for_file(file)
12 | expect_s3_class(fun(file), "tbl")
13 | })
14 |
15 |
16 | test_that("tests get_haven_read_function_by_extension fails when needed", {
17 | expect_error(
18 | get_haven_read_function_for_file("/some/bad/file/without_extension"),
19 | class = "parquetize_bad_argument"
20 | )
21 |
22 | expect_error(
23 | get_haven_read_function_for_file("/some/bad/file/with_bad_extension.xlsx"),
24 | class = "parquetize_bad_argument"
25 | )
26 | })
27 |
28 | test_that("test get_lines_for_memory return the good number of lines", {
29 | file <- system.file("examples","iris.dta", package = "haven")
30 | read_method <- get_haven_read_function_for_file(file)
31 | data <- read_method(file, n_max = Inf)
32 |
33 | expect_equal(
34 | get_lines_for_memory(data, max_memory = 1 / 1024),
35 | 16
36 | )
37 | })
38 |
39 | test_that("test is_remote works", {
40 | expect_true(is_remote("https://my_url/"))
41 | expect_true(is_remote("http://my_url/"))
42 | expect_true(is_remote("ftp://my_url/"))
43 | expect_true(is_remote("ftps://my_url/"))
44 |
45 | expect_false(is_remote("c://my_url/"))
46 | expect_false(is_remote("/my_url/"))
47 | })
48 |
49 |
50 |
--------------------------------------------------------------------------------
/man/write_parquet_at_once.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/write_parquet_at_once.R
3 | \name{write_parquet_at_once}
4 | \alias{write_parquet_at_once}
5 | \title{Write parquet file or dataset based on partition argument}
6 | \usage{
7 | write_parquet_at_once(
8 | data,
9 | path_to_parquet,
10 | partition = "no",
11 | compression = "snappy",
12 | compression_level = NULL,
13 | ...
14 | )
15 | }
16 | \arguments{
17 | \item{data}{the data.frame/tibble to write}
18 |
19 | \item{path_to_parquet}{String that indicates the path to the directory where
20 | the output parquet file or dataset will be stored.}
21 |
22 | \item{partition}{string ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file.
23 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}.}
24 |
25 | \item{compression}{compression algorithm. Default "snappy".}
26 |
27 | \item{compression_level}{compression level. Meaning depends on compression algorithm.}
28 |
29 | \item{...}{Additional format-specific arguments, see
30 | \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}}
31 | }
32 | \value{
33 | a dataset as returned by arrow::open_dataset
34 | }
35 | \description{
36 | Low level function that implements the logic to write a parquet file or a dataset from data
37 | }
38 | \examples{
39 |
40 | write_parquet_at_once(iris, tempfile())
41 |
42 | write_parquet_at_once(iris, tempfile(), partition = "yes", partitioning = c("Species"))
43 |
44 | \dontrun{
45 | write_parquet_at_once(iris, tempfile(), compression="gzip", compression_level = 5)
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
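Because the return value is a dataset as returned by arrow::open_dataset, it can be queried lazily with dplyr verbs; a small sketch (native |> pipe, R >= 4.1), assuming the usual arrow/dplyr integration:

library(parquetize)

ds <- write_parquet_at_once(iris, tempfile(), partition = "yes", partitioning = "Species")
ds |>
  dplyr::count(Species) |>
  dplyr::collect()
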
/man/download_extract.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/download_extract.R
3 | \name{download_extract}
4 | \alias{download_extract}
5 | \title{download and uncompress file if needed}
6 | \usage{
7 | download_extract(path, filename_in_zip)
8 | }
9 | \arguments{
10 | \item{path}{the input file's path or url.}
11 |
12 | \item{filename_in_zip}{name of the csv file in the zip. Required if
13 | several csv are included in the zip.}
14 | }
15 | \value{
16 | the path to the usable (uncompressed) file, invisibly.
17 | }
18 | \description{
19 | This function will download the file if the file is remote and
20 | unzip it if it is zipped. It will just return the input path argument if
21 | it's neither. \cr
22 |
23 | If the zip contains multiple files, you can use \code{filename_in_zip} to set the file you want to unzip and use.
24 |
25 | You can pipe the output into all \verb{*_to_parquet} functions.
26 | }
27 | \examples{
28 |
29 | # 1. unzip a local zip file
30 | # 2. parquetize it
31 |
32 | file_path <- download_extract(system.file("extdata","mtcars.csv.zip", package = "readr"))
33 | csv_to_parquet(
34 | file_path,
35 | path_to_parquet = tempfile(fileext = ".parquet")
36 | )
37 |
38 | # 1. download a remote file
39 | # 2. extract the file census2021-ts007-ctry.csv
40 | # 3. parquetize it
41 |
42 | file_path <- download_extract(
43 | "https://www.nomisweb.co.uk/output/census/2021/census2021-ts007.zip",
44 | filename_in_zip = "census2021-ts007-ctry.csv"
45 | )
46 | csv_to_parquet(
47 | file_path,
48 | path_to_parquet = tempfile(fileext = ".parquet")
49 | )
50 |
51 | # the file is local and not zipped so :
52 | # 1. parquetize it
53 |
54 | file_path <- download_extract(parquetize_example("region_2022.csv"))
55 | csv_to_parquet(
56 | file_path,
57 | path_to_parquet = tempfile(fileext = ".parquet")
58 | )
59 |
60 | }
61 |
--------------------------------------------------------------------------------
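The piping mentioned in the description can look like this (native |> pipe, R >= 4.1; same files as in the examples above):

library(parquetize)

download_extract(system.file("extdata", "mtcars.csv.zip", package = "readr")) |>
  csv_to_parquet(path_to_parquet = tempfile(fileext = ".parquet"))
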
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(check_parquet)
4 | export(csv_to_parquet)
5 | export(dbi_to_parquet)
6 | export(download_extract)
7 | export(expect_missing_argument)
8 | export(expect_parquet)
9 | export(fst_to_parquet)
10 | export(get_parquet_info)
11 | export(get_partitions)
12 | export(json_to_parquet)
13 | export(parquetize_example)
14 | export(rbind_parquet)
15 | export(rds_to_parquet)
16 | export(sqlite_to_parquet)
17 | export(table_to_parquet)
18 | export(write_parquet_at_once)
19 | export(write_parquet_by_chunk)
20 | import(dplyr)
21 | importFrom(DBI,dbClearResult)
22 | importFrom(DBI,dbConnect)
23 | importFrom(DBI,dbDisconnect)
24 | importFrom(DBI,dbFetch)
25 | importFrom(DBI,dbHasCompleted)
26 | importFrom(DBI,dbListTables)
27 | importFrom(DBI,dbReadTable)
28 | importFrom(DBI,dbSendQuery)
29 | importFrom(RSQLite,SQLite)
30 | importFrom(arrow,open_dataset)
31 | importFrom(arrow,read_json_arrow)
32 | importFrom(arrow,read_parquet)
33 | importFrom(arrow,write_dataset)
34 | importFrom(arrow,write_parquet)
35 | importFrom(cli,cli_abort)
36 | importFrom(cli,cli_alert_danger)
37 | importFrom(cli,cli_alert_info)
38 | importFrom(cli,cli_alert_success)
39 | importFrom(cli,cli_alert_warning)
40 | importFrom(cli,cli_progress_bar)
41 | importFrom(cli,cli_progress_message)
42 | importFrom(curl,curl_download)
43 | importFrom(fst,read.fst)
44 | importFrom(glue,glue)
45 | importFrom(glue,glue_sql)
46 | importFrom(haven,read_dta)
47 | importFrom(haven,read_sas)
48 | importFrom(haven,read_sav)
49 | importFrom(jsonlite,read_json)
50 | importFrom(lifecycle,deprecate_warn)
51 | importFrom(lifecycle,deprecated)
52 | importFrom(readr,locale)
53 | importFrom(readr,read_delim)
54 | importFrom(rlang,inject)
55 | importFrom(tibble,as_tibble)
56 | importFrom(tidyselect,all_of)
57 | importFrom(tidyselect,everything)
58 | importFrom(tools,file_ext)
59 | importFrom(tools,file_path_sans_ext)
60 | importFrom(utils,object.size)
61 | importFrom(utils,unzip)
62 |
--------------------------------------------------------------------------------
/man/rbind_parquet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/rbind_parquet.R
3 | \name{rbind_parquet}
4 | \alias{rbind_parquet}
5 | \title{Function to bind multiple parquet files by row}
6 | \usage{
7 | rbind_parquet(
8 | folder,
9 | output_name,
10 | delete_initial_files = TRUE,
11 | compression = "snappy",
12 | compression_level = NULL
13 | )
14 | }
15 | \arguments{
16 | \item{folder}{the folder where the initial files are stored}
17 |
18 | \item{output_name}{name of the output parquet file}
19 |
20 | \item{delete_initial_files}{Boolean. Should the function delete the initial files ? By default TRUE.}
21 |
22 | \item{compression}{compression algorithm. Default "snappy".}
23 |
24 | \item{compression_level}{compression level. Meaning depends on compression algorithm.}
25 | }
26 | \value{
27 | Parquet files, invisibly
28 | }
29 | \description{
30 | This function reads all parquet files in the \code{folder} argument that start with \code{output_name},
31 | combines them using rbind and writes the result to a new parquet file. \cr
32 |
33 | It can also delete the initial files if \code{delete_initial_files} argument is TRUE. \cr
34 |
35 | Be careful, this function will not work if files with different structures
36 | are present in the folder given with the argument \code{folder}.
37 | }
38 | \examples{
39 | \dontrun{
40 | library(arrow)
41 | if (!dir.exists("output")) {
42 | dir.create("output")
43 | }
44 |
45 | file.create("output/test_data1-4.parquet")
46 | write_parquet(data.frame(
47 | x = c("a","b","c"),
48 | y = c(1L,2L,3L)
49 | ),
50 | "output/test_data1-4.parquet")
51 |
52 | file.create("output/test_data4-6.parquet")
53 | write_parquet(data.frame(
54 | x = c("d","e","f"),
55 | y = c(4L,5L,6L)
56 | ), "output/test_data4-6.parquet")
57 |
58 | test_data <- rbind_parquet(folder = "output",
59 | output_name = "test_data",
60 | delete_initial_files = FALSE)
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/tests/testthat/test-json_to_parquet.R:
--------------------------------------------------------------------------------
1 | test_that("Checks arguments are correctly filled in", {
2 | testthat::local_edition(3)
3 |
4 | expect_missing_argument(
5 | json_to_parquet(
6 | path_to_file = system.file("extdata","iris.ndjson",package = "parquetize")
7 | ),
8 | regexp = "path_to_parquet"
9 | )
10 |
11 | expect_missing_argument(
12 | json_to_parquet(
13 | path_to_parquet = tempfile()
14 | ),
15 | regexp = "path_to_file"
16 | )
17 |
18 | expect_error(
19 | json_to_parquet(
20 | path_to_file = system.file("extdata","iris.json",package = "parquetize"),
21 | path_to_parquet = tempfile(),
22 | format = "xjson"
23 | ),
24 | class = "parquetize_bad_format"
25 | )
26 | })
27 |
28 | test_that("Checks converting json file works", {
29 | path_to_parquet <- tempfile()
30 |
31 | json_to_parquet(
32 | path_to_file = system.file("extdata","iris.json",package = "parquetize"),
33 | path_to_parquet = path_to_parquet
34 | )
35 |
36 | expect_parquet(
37 | path_to_parquet,
38 | with_lines = 150
39 | )
40 | })
41 |
42 | test_that("Checks converting ndjson file works", {
43 | path_to_parquet <- tempfile()
44 |
45 | json_to_parquet(
46 | path_to_file = system.file("extdata","iris.ndjson",package = "parquetize"),
47 | path_to_parquet = path_to_parquet,
48 | format = "ndjson"
49 | )
50 | expect_parquet(
51 | path_to_parquet,
52 | with_lines = 150
53 | )
54 |
55 | })
56 |
57 | test_that("Checks adding partition and partitioning argument works", {
58 | path_to_parquet <- tempfile()
59 |
60 | json_to_parquet(
61 | path_to_file = system.file("extdata","iris.json",package = "parquetize"),
62 | path_to_parquet = path_to_parquet,
63 | partition = "yes",
64 | partitioning = c("Species")
65 | )
66 | expect_parquet(
67 | path_to_parquet,
68 | with_lines = 150,
69 | with_partitions = c('Species=setosa', 'Species=versicolor', 'Species=virginica')
70 | )
71 | })
72 |
--------------------------------------------------------------------------------
/R/get_parquet_info.R:
--------------------------------------------------------------------------------
1 | #' @name get_parquet_info
2 | #'
3 | #' @title Get various info on parquet files
4 | #'
5 | #' @description One very important piece of parquet metadata is the row group size.\cr
6 | #'
7 | #' If its value is low (below 10 000), you should rebuild your parquet files.\cr
8 | #'
9 | #' Typical values are between 30 000 and 1 000 000.
10 | #'
11 | #' @param path parquet file path or directory. If directory is given,
12 | #' `get_parquet_info` will be applied on all parquet files found in
13 | #' subdirectories
14 | #'
15 | #' @return a tibble with 5 columns:
16 | #' * path, file path
17 | #' * num_rows, number of rows
18 | #' * num_row_groups, number of row groups
19 | #' * num_columns, number of columns
20 | #' * mean_row_group_size, mean row group size
21 | #'
22 | #' If one column contains `NA`, the parquet file may be malformed.
23 | #'
24 | #' @export
25 | #'
26 | #' @examples
27 | #' get_parquet_info(system.file("extdata", "iris.parquet", package = "parquetize"))
28 | #'
29 | #' get_parquet_info(system.file("extdata", "iris_dataset", package = "parquetize"))
30 | get_parquet_info <- function(path) {
31 | if (dir.exists(path)) {
32 |     files <- list.files(path, recursive = TRUE, pattern = "\\.parquet$", full.names = TRUE)
33 | } else if (file.exists(path)) {
34 | files <- path
35 | } else {
36 | stop("path must be a file or a directory")
37 | }
38 |
39 | tibble::tibble(
40 | path = files,
41 | num_rows = sapply(files, get_parquet_attribute, attribute = "num_rows"),
42 | num_row_groups = sapply(files, get_parquet_attribute, attribute = "num_row_groups"),
43 | num_columns = sapply(files, get_parquet_attribute, attribute = "num_columns")
44 | ) %>%
45 | dplyr::mutate(
46 | mean_row_group_size = .data$num_rows / .data$num_row_groups
47 | )
48 | }
49 |
50 | #' @name get_parquet_attribute
51 | #'
52 | #' @title Utility to get attributes from a parquet file
53 | #'
54 | #' @param path parquet file path or directory.
55 | #' @param attribute name of searched attribute
56 | #'
57 | #' @noRd
58 | get_parquet_attribute <- function(path, attribute) {
59 | tryCatch({
60 | reader <- arrow::ParquetFileReader$create(path)
61 | reader[[attribute]]
62 | },
63 | error = function(e) { return(NA_real_) }
64 | )
65 | }
66 |
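67 | # Illustrative sketch (not exported, not part of the package API): one possible
68 | # way to use get_parquet_info() to flag files whose mean row group size falls
69 | # below the threshold mentioned in the documentation above. The function name
70 | # and default threshold are only an example.
71 | flag_small_row_groups <- function(path, threshold = 10000) {
72 |   info <- get_parquet_info(path)
73 |   info[!is.na(info$mean_row_group_size) & info$mean_row_group_size < threshold, ]
74 | }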
--------------------------------------------------------------------------------
/R/testthat-helpers.R:
--------------------------------------------------------------------------------
1 | #' Check if parquet dataset/file is readable and has the expected number of rows
2 | #'
3 | #' @param path path to the parquet file or dataset
4 | #' @param with_lines number of lines the file/dataset should have
5 | #' @param with_partitions NULL or a vector with the partition names the dataset should have
6 | #' @param with_columns NULL or a vector of column names the dataset/file should have
7 | #' @param with_files NULL or number of files a dataset should have
8 | #'
9 | #' @return the dataset handle
10 | #' @export
11 | #'
12 | #' @keywords internal
13 | expect_parquet <- function(
14 | path,
15 | with_lines,
16 | with_partitions = NULL,
17 | with_columns = NULL,
18 | with_files = NULL) {
19 | dataset <- testthat::expect_no_error(arrow::open_dataset(path))
20 | testthat::expect_equal(nrow(dataset), with_lines)
21 |
22 | if (!is.null(with_partitions)) {
23 | tryCatch(
24 | testthat::expect_setequal(dir(path), with_partitions),
25 | error = function(cond) { cli::cli_abort("{with_partitions} different from {dir(path)}", class = "partquetize_test_with_partitions")}
26 | )
27 | }
28 |
29 | if (!is.null(with_columns)) {
30 | tryCatch(
31 | testthat::expect_setequal(names(dataset), with_columns),
32 | error = function(cond) { cli::cli_abort("{with_columns} different from {names(dataset)}", class = "partquetize_test_with_columns") }
33 | )
34 | }
35 |
36 | if (!is.null(with_files)) {
37 | files_number <- length(dataset$files)
38 |
39 | tryCatch(
40 | testthat::expect_equal(files_number, with_files),
41 | error = function(cond) { cli::cli_abort("we should have {with_files} files. We have {files_number}", class = "partquetize_test_with_files") }
42 | )
43 | }
44 | return(dataset)
45 | }
46 |
47 | #' Check if missing argument error is raised
48 | #'
49 | #' @param object the object to check
50 | #' @param regexp a regexp with the message we must find
51 | #'
52 | #' @return same as expect_error
53 | #' @export
54 | #'
55 | #' @keywords internal
56 | expect_missing_argument <- function(object, regexp) {
57 | testthat::expect_error(
58 | object,
59 | class = "parquetize_missing_argument",
60 | regexp = regexp
61 | )
62 | }
63 |
--------------------------------------------------------------------------------
/tests/testthat/test-sqlite_to_parquet.R:
--------------------------------------------------------------------------------
1 | test_that("Checks arguments are correctly filled in", {
2 | expect_missing_argument(
3 | sqlite_to_parquet(
4 | path_to_file = system.file("extdata","iris.sqlite",package = "parquetize")
5 | ),
6 | regexp = "path_to_parquet"
7 | )
8 | expect_missing_argument(
9 | sqlite_to_parquet(
10 | path_to_parquet = tempfile()
11 | ),
12 | regexp = "path_to_file"
13 | )
14 | })
15 |
16 | test_that("Check if extension used in path_to_file is correct", {
17 | expect_error(
18 | sqlite_to_parquet(
19 | path_to_file = system.file("extdata","iris.sqliteee",package = "parquetize")
20 | ),
21 | class = "parquetize_bad_format"
22 | )
23 | })
24 |
25 | test_that("Check if parquetize fails when table does not exist", {
26 | expect_error(
27 | sqlite_to_parquet(
28 | path_to_file = system.file("extdata","iris.sqlite",package = "parquetize"),
29 | path_to_parquet = tempfile(),
30 | table = "nosuchtable"
31 | ),
32 | class = "parquetize_missing_table",
33 | regexp = "nosuchtable"
34 | )
35 | })
36 |
37 | test_that("Checks message is displayed with sqlite file", {
38 | path_to_parquet <- tempfile()
39 |
40 | expect_no_error(
41 | sqlite_to_parquet(
42 | path_to_file = system.file("extdata","iris.sqlite",package = "parquetize"),
43 | table_in_sqlite = "iris",
44 | path_to_parquet = path_to_parquet
45 | )
46 | )
47 |
48 | expect_parquet(
49 | file.path(path_to_parquet),
50 | with_lines = 150
51 | )
52 |
53 | })
54 |
55 | test_that("Checks message is displayed when adding partition and partitioning argument", {
56 | path_to_parquet <- tempfile()
57 |
58 | expect_no_error(
59 | sqlite_to_parquet(
60 | path_to_file = system.file("extdata","iris.sqlite",package = "parquetize"),
61 | table_in_sqlite = "iris",
62 | path_to_parquet = path_to_parquet,
63 | partition = "yes",
64 | partitioning = c("Species")
65 | )
66 | )
67 |
68 | expect_parquet(
69 | file.path(path_to_parquet),
70 | with_lines = 150
71 | )
72 |
73 | expect_identical(
74 | dir(path_to_parquet),
75 | c('Species=setosa', 'Species=versicolor', 'Species=virginica')
76 | )
77 |
78 | })
79 |
--------------------------------------------------------------------------------
/dev/dev_history.R:
--------------------------------------------------------------------------------
1 | #------------------------------------------------------------------------#
2 | # Example at https://linogaliana.gitlab.io/collaboratif/package.html      #
3 |
4 | #################### DAY-TO-DAY WORK ###############################
5 | # 3.a. Add code, document it and test it
6 | # For each function of the package:
7 | usethis::use_r("csv_to_parquet")
8 | usethis::use_test("csv_to_parquet")
9 | # write the function code
10 | # document the function
11 | # # To update the documentation and the NAMESPACE
12 | # devtools::document()
13 | roxygen2::roxygenise()
14 | # write the tests
15 | # run the tests
16 | devtools::test()
17 |
18 | # 3.b. If needed, declare a dependency in DESCRIPTION
19 | usethis::use_package("readr")
20 | # to use %>% in a package
21 | # usethis::use_pipe()
22 |
23 | # To run the package conformity checks
24 | devtools::check()
25 |
26 | # 3.c. Tip that can help during development
27 | # Load all the functions of the package
28 | devtools::load_all()
29 |
30 | # For code coverage
31 | covr::package_coverage()
32 | covr::report()
33 | #------------------------------------------------#
34 |
35 | # Add `dev/dev_history.R` to .Rbuildignore
36 | usethis::use_build_ignore("dev/dev_history.R")
37 |
38 | # Add a NEWS file
39 | usethis::use_news_md()
40 |
41 | # Create the pkgdown skeleton
42 | usethis::use_pkgdown()
43 |
44 | # Configure the GitHub Actions (GHA)
45 | usethis::use_github_action(name = "check-release")
46 |
47 | # Add files to `data-raw`
48 | usethis::use_data_raw("region-2022")
49 |
50 | # Create the vignettes
51 | usethis::use_vignette("aa-conversions")
52 |
53 | # Create the testthat directory
54 | usethis::use_testthat()
55 |
56 | # To get the code coverage detail by function
57 | covr::report()
58 |
59 | ################ At the end of development ##########
60 |
61 | # Build the website (only on SSP Cloud)
62 | pkgdown::build_site(override = list(destination = "../website"))
63 |
64 | # Build the .tar.gz file
65 | devtools::build()
66 |
67 | # Build the .zip file (binary format)
68 | devtools::build(binary=TRUE)
69 |
70 | # Build the manual as a pdf
71 | devtools::build_manual(path = "manuel")
72 |
--------------------------------------------------------------------------------
/R/rds_to_parquet.R:
--------------------------------------------------------------------------------
1 | #' @name rds_to_parquet
2 | #'
3 | #' @title Convert a rds file to parquet format
4 | #'
5 | #' @description This function allows you to convert a rds file to parquet format. \cr
6 | #'
7 | #' Two conversion possibilities are offered:
8 | #'
9 | #'\itemize{
10 | #'
11 | #' \item{Convert to a single parquet file. Argument `path_to_parquet` must then be used;}
12 | #' \item{Convert to a partitioned parquet file. Additional arguments `partition` and `partitioning` must then be used;}
13 | #'
14 | #' }
15 | #'
16 | #' @inheritParams table_to_parquet
17 | #' @param ... additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
18 | #' and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.
19 | #' @return A parquet file, invisibly
20 | #'
21 | #' @export
22 | #'
23 | #' @examples
24 | #'
25 | #' # Conversion from a local rds file to a single parquet file ::
26 | #'
27 | #' rds_to_parquet(
28 | #' path_to_file = system.file("extdata","iris.rds",package = "parquetize"),
29 | #' path_to_parquet = tempfile(fileext = ".parquet")
30 | #' )
31 | #'
32 | #' # Conversion from a local rds file to a partitioned parquet file ::
33 | #'
34 | #' rds_to_parquet(
35 | #' path_to_file = system.file("extdata","iris.rds",package = "parquetize"),
36 | #' path_to_parquet = tempfile(fileext = ".parquet"),
37 | #' partition = "yes",
38 | #' partitioning = c("Species")
39 | #' )
40 |
41 | rds_to_parquet <- function(
42 | path_to_file,
43 | path_to_parquet,
44 | partition = "no",
45 | compression = "snappy",
46 | compression_level = NULL,
47 | ...
48 | ) {
49 |
50 | # Check if path_to_file is missing
51 | if (missing(path_to_file)) {
52 | cli_abort("Be careful, the argument path_to_file must be filled in", class = "parquetize_missing_argument")
53 | }
54 |
55 | # Check if path_to_parquet is missing
56 | if (missing(path_to_parquet)) {
57 | cli_abort("Be careful, the argument path_to_parquet must be filled in", class = "parquetize_missing_argument")
58 | }
59 |
60 | Sys.sleep(0.01)
61 | cli_progress_message("Reading data...")
62 |
63 | rds_output <- readRDS(file = path_to_file)
64 |
65 | dataset <- write_parquet_at_once(
66 | rds_output,
67 | path_to_parquet,
68 | partition,
69 | compression,
70 | compression_level,
71 | ...)
72 |
73 | return(invisible(dataset))
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/R/fst_to_parquet.R:
--------------------------------------------------------------------------------
1 | #' @name fst_to_parquet
2 | #'
3 | #' @title Convert a fst file to parquet format
4 | #'
5 | #' @description This function allows you to convert a fst file to parquet format. \cr
6 | #'
7 | #' Two conversion possibilities are offered:
8 | #'
9 | #'\itemize{
10 | #'
11 | #' \item{Convert to a single parquet file. Argument `path_to_parquet` must then be used;}
12 | #' \item{Convert to a partitioned parquet file. Additional arguments `partition` and `partitioning` must then be used;}
13 | #'
14 | #' }
15 | #'
16 | #' @inheritParams table_to_parquet
17 | #' @param ... additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
18 | #' and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.
19 | #' @return A parquet file, invisibly
20 | #'
21 | #' @export
22 | #'
23 | #' @examples
24 | #'
25 | #' # Conversion from a local fst file to a single parquet file ::
26 | #'
27 | #' fst_to_parquet(
28 | #' path_to_file = system.file("extdata","iris.fst",package = "parquetize"),
29 | #' path_to_parquet = tempfile(fileext = ".parquet")
30 | #' )
31 | #'
32 | #' # Conversion from a local fst file to a partitioned parquet file ::
33 | #'
34 | #' fst_to_parquet(
35 | #' path_to_file = system.file("extdata","iris.fst",package = "parquetize"),
36 | #' path_to_parquet = tempfile(fileext = ".parquet"),
37 | #' partition = "yes",
38 | #' partitioning = c("Species")
39 | #' )
40 |
41 | fst_to_parquet <- function(
42 | path_to_file,
43 | path_to_parquet,
44 | partition = "no",
45 | compression = "snappy",
46 | compression_level = NULL,
47 | ...
48 | ) {
49 |
50 | # Check if path_to_file is missing
51 | if (missing(path_to_file)) {
52 | cli_abort("Be careful, the argument path_to_file must be filled in", class = "parquetize_missing_argument")
53 | }
54 |
55 | # Check if path_to_parquet is missing
56 | if (missing(path_to_parquet)) {
57 | cli_abort("Be careful, the argument path_to_parquet must be filled in", class = "parquetize_missing_argument")
58 | }
59 |
60 | Sys.sleep(0.01)
61 | cli_progress_message("Reading data...")
62 |
63 | fst_output <- fst::read.fst(path = path_to_file)
64 |
65 | dataset <- write_parquet_at_once(
66 | fst_output,
67 | path_to_parquet,
68 | partition,
69 | compression,
70 | compression_level,
71 | ...)
72 |
73 | return(invisible(dataset))
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/man/fst_to_parquet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fst_to_parquet.R
3 | \name{fst_to_parquet}
4 | \alias{fst_to_parquet}
5 | \title{Convert a fst file to parquet format}
6 | \usage{
7 | fst_to_parquet(
8 | path_to_file,
9 | path_to_parquet,
10 | partition = "no",
11 | compression = "snappy",
12 | compression_level = NULL,
13 | ...
14 | )
15 | }
16 | \arguments{
17 | \item{path_to_file}{String that indicates the path to the input file (don't forget the extension).}
18 |
19 | \item{path_to_parquet}{String that indicates the path to the directory where the parquet files will be stored.}
20 |
21 | \item{partition}{String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file.
22 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}.
23 | Be careful, this argument can not be "yes" if \code{max_memory} or \code{max_rows} argument are not NULL.}
24 |
25 | \item{compression}{compression algorithm. Default "snappy".}
26 |
27 | \item{compression_level}{compression level. Meaning depends on compression algorithm.}
28 |
29 | \item{...}{additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
30 | and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.}
31 | }
32 | \value{
33 | A parquet file, invisibly
34 | }
35 | \description{
36 | This function allows you to convert a fst file to parquet format. \cr
37 |
38 | Two conversion possibilities are offered:
39 |
40 | \itemize{
41 |
42 | \item{Convert to a single parquet file. Argument \code{path_to_parquet} must then be used;}
43 | \item{Convert to a partitioned parquet file. Additional arguments \code{partition} and \code{partitioning} must then be used;}
44 |
45 | }
46 | }
47 | \examples{
48 |
49 | # Conversion from a local fst file to a single parquet file ::
50 |
51 | fst_to_parquet(
52 | path_to_file = system.file("extdata","iris.fst",package = "parquetize"),
53 | path_to_parquet = tempfile(fileext = ".parquet")
54 | )
55 |
56 | # Conversion from a local fst file to a partitioned parquet file ::
57 |
58 | fst_to_parquet(
59 | path_to_file = system.file("extdata","iris.fst",package = "parquetize"),
60 | path_to_parquet = tempfile(fileext = ".parquet"),
61 | partition = "yes",
62 | partitioning = c("Species")
63 | )
64 | }
65 |
--------------------------------------------------------------------------------
/man/rds_to_parquet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/rds_to_parquet.R
3 | \name{rds_to_parquet}
4 | \alias{rds_to_parquet}
5 | \title{Convert a rds file to parquet format}
6 | \usage{
7 | rds_to_parquet(
8 | path_to_file,
9 | path_to_parquet,
10 | partition = "no",
11 | compression = "snappy",
12 | compression_level = NULL,
13 | ...
14 | )
15 | }
16 | \arguments{
17 | \item{path_to_file}{String that indicates the path to the input file (don't forget the extension).}
18 |
19 | \item{path_to_parquet}{String that indicates the path to the directory where the parquet files will be stored.}
20 |
21 | \item{partition}{String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file.
22 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}.
23 | Be careful, this argument can not be "yes" if \code{max_memory} or \code{max_rows} argument are not NULL.}
24 |
25 | \item{compression}{compression algorithm. Default "snappy".}
26 |
27 | \item{compression_level}{compression level. Meaning depends on compression algorithm.}
28 |
29 | \item{...}{additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
30 | and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.}
31 | }
32 | \value{
33 | A parquet file, invisibly
34 | }
35 | \description{
36 | This function allows you to convert a rds file to parquet format. \cr
37 |
38 | Two conversion possibilities are offered:
39 |
40 | \itemize{
41 |
42 | \item{Convert to a single parquet file. Argument \code{path_to_parquet} must then be used;}
43 | \item{Convert to a partitioned parquet file. Additional arguments \code{partition} and \code{partitioning} must then be used;}
44 |
45 | }
46 | }
47 | \examples{
48 |
49 | # Conversion from a local rds file to a single parquet file ::
50 |
51 | rds_to_parquet(
52 | path_to_file = system.file("extdata","iris.rds",package = "parquetize"),
53 | path_to_parquet = tempfile(fileext = ".parquet")
54 | )
55 |
56 | # Conversion from a local rds file to a partitioned parquet file ::
57 |
58 | rds_to_parquet(
59 | path_to_file = system.file("extdata","iris.rds",package = "parquetize"),
60 | path_to_parquet = tempfile(fileext = ".parquet"),
61 | partition = "yes",
62 | partitioning = c("Species")
63 | )
64 | }
65 |
--------------------------------------------------------------------------------
/R/download_extract.R:
--------------------------------------------------------------------------------
1 | #' @name download_extract
2 | #'
3 | #' @title download and uncompress file if needed
4 | #'
5 | #' @description This function will download the file if the file is remote and
6 | #' unzip it if it is zipped. It will just return the input path argument if
7 | #' it's neither. \cr
8 | #'
9 | #' If the zip contains multiple files, you can use `filename_in_zip` to set the file you want to unzip and use.
10 | #'
11 | #' You can pipe its output into all `*_to_parquet` functions.
12 | #'
13 | #'
14 | #' @param path the input file's path or url.
15 | #' @param filename_in_zip name of the csv file in the zip. Required if
16 | #' several csv files are included in the zip.
17 | #'
18 | #' @return the path to the usable (uncompressed) file, invisibly.
19 | #'
20 | #' @export
21 | #'
22 | #' @examples
23 | #'
24 | #' # 1. unzip a local zip file
25 | #' # 2. parquetize it
26 | #'
27 | #' file_path <- download_extract(system.file("extdata","mtcars.csv.zip", package = "readr"))
28 | #' csv_to_parquet(
29 | #' file_path,
30 | #' path_to_parquet = tempfile(fileext = ".parquet")
31 | #' )
32 | #'
33 | #' # 1. download a remote file
34 | #' # 2. extract the file census2021-ts007-ctry.csv
35 | #' # 3. parquetize it
36 | #'
37 | #' file_path <- download_extract(
38 | #' "https://www.nomisweb.co.uk/output/census/2021/census2021-ts007.zip",
39 | #' filename_in_zip = "census2021-ts007-ctry.csv"
40 | #' )
41 | #' csv_to_parquet(
42 | #' file_path,
43 | #' path_to_parquet = tempfile(fileext = ".parquet")
44 | #' )
45 | #'
46 | #' # the file is local and not zipped so :
47 | #' # 1. parquetize it
48 | #'
49 | #' file_path <- download_extract(parquetize_example("region_2022.csv"))
50 | #' csv_to_parquet(
51 | #' file_path,
52 | #' path_to_parquet = tempfile(fileext = ".parquet")
53 | #' )
54 | #'
55 | download_extract <- function(path, filename_in_zip) {
56 | if (is_remote(path)) {
57 | tmp_file <- curl_download(path,tempfile(fileext = file_ext(path)))
58 | } else {
59 | tmp_file <- path
60 | }
61 |
62 | if (!is_zip(path)) return(invisible(tmp_file))
63 |
64 | csv_files <- unzip(zipfile=tmp_file,exdir=tempfile())
65 | names(csv_files) <- basename(csv_files)
66 |
67 |   if (length(csv_files) > 1 && missing(filename_in_zip)) {
68 |     cli_abort("Be careful, the zip file contains more than one file, you must set the filename_in_zip argument",
69 |               class = "parquetize_missing_argument")
70 | } else if (length(csv_files) > 1) {
71 | path <- csv_files[[filename_in_zip]]
72 | } else {
73 | path <- csv_files[[1]]
74 | }
75 | invisible(path)
76 | }
77 |
78 |
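79 | # Illustrative usage sketch (comments only, mirroring the examples above): the
80 | # returned path can be piped straight into any `*_to_parquet` function, e.g.
81 | #   download_extract(parquetize_example("region_2022.csv")) |>
82 | #     csv_to_parquet(path_to_parquet = tempfile(fileext = ".parquet"))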
--------------------------------------------------------------------------------
/R/write_parquet_at_once.R:
--------------------------------------------------------------------------------
1 | #' @name write_parquet_at_once
2 | #'
3 | #' @title write parquet file or dataset based on partition argument \cr
4 | #'
5 | #' @description Low level function that implements the logic to write a parquet file or a dataset from data
6 | #'
7 | #' @param data the data.frame/tibble to write
8 | #' @inheritParams write_parquet_by_chunk
9 | #' @param partition string ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file.
10 | #' If "yes", `"partitioning"` argument must be filled in. In this case, a folder will be created for each modality of the variable filled in `"partitioning"`.
11 | #'
12 | #' @return a dataset as returned by arrow::open_dataset
13 | #'
14 | #' @export
15 | #'
16 | #' @examples
17 | #'
18 | #' write_parquet_at_once(iris, tempfile())
19 | #'
20 | #' write_parquet_at_once(iris, tempfile(), partition = "yes", partitioning = c("Species"))
21 | #'
22 | #' \dontrun{
23 | #' write_parquet_at_once(iris, tempfile(), compression="gzip", compression_level = 5)
24 | #' }
25 | write_parquet_at_once <- function(
26 | data,
27 | path_to_parquet,
28 | partition = "no",
29 | compression = "snappy",
30 | compression_level = NULL,
31 | ...) {
32 | Sys.sleep(0.01)
33 | cli_progress_message("Writing data...")
34 |
35 | if (missing(data)) {
36 | cli_abort("Be careful, data argument is mandatory", class = "parquetize_missing_argument")
37 | }
38 |
39 | if (missing(path_to_parquet)) {
40 | cli_abort("Be careful, path_to_parquet argument is mandatory", class = "parquetize_missing_argument")
41 | }
42 |
43 | if (partition == "no") {
44 | if (isTRUE(file.info(path_to_parquet)$isdir)) {
45 | path_to_parquet <- file.path(path_to_parquet, paste0(basename(path_to_parquet), ".parquet"))
46 | cli_alert_warning("Be careful, path_to_parquet should be a file name, using : {path_to_parquet}")
47 | }
48 |
49 | write_parquet(data,
50 | sink = path_to_parquet,
51 | compression = compression,
52 | compression_level = compression_level,
53 | ...)
54 | parquet_type <- "file"
55 | } else if (partition == "yes") {
56 | write_dataset(data,
57 | path = path_to_parquet,
58 | compression = compression,
59 | compression_level = compression_level,
60 | ...)
61 | parquet_type <- "dataset"
62 | }
63 | Sys.sleep(0.01)
64 | cli_alert_success("\nData are available in parquet {parquet_type} under {path_to_parquet}")
65 | invisible(arrow::open_dataset(path_to_parquet))
66 | }
67 |
--------------------------------------------------------------------------------
/man/json_to_parquet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/json_to_parquet.R
3 | \name{json_to_parquet}
4 | \alias{json_to_parquet}
5 | \title{Convert a json file to parquet format}
6 | \usage{
7 | json_to_parquet(
8 | path_to_file,
9 | path_to_parquet,
10 | format = "json",
11 | partition = "no",
12 | compression = "snappy",
13 | compression_level = NULL,
14 | ...
15 | )
16 | }
17 | \arguments{
18 | \item{path_to_file}{String that indicates the path to the input file (don't forget the extension).}
19 |
20 | \item{path_to_parquet}{String that indicates the path to the directory where the parquet files will be stored.}
21 |
22 | \item{format}{string that indicates if the format is "json" (by default) or "ndjson"}
23 |
24 | \item{partition}{String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file.
25 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}.
26 | Be careful, this argument can not be "yes" if \code{max_memory} or \code{max_rows} argument are not NULL.}
27 |
28 | \item{compression}{compression algorithm. Default "snappy".}
29 |
30 | \item{compression_level}{compression level. Meaning depends on compression algorithm.}
31 |
32 | \item{...}{additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
33 | and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.}
34 | }
35 | \value{
36 | A parquet file, invisibly
37 | }
38 | \description{
39 | This function allows you to convert a \href{https://www.json.org/json-en.html}{json}
40 | or \href{https://docs.mulesoft.com/dataweave/latest/dataweave-formats-ndjson}{ndjson} file to parquet format. \cr
41 |
42 | Two conversion possibilities are offered:
43 |
44 | \itemize{
45 |
46 | \item{Convert to a single parquet file. Argument \code{path_to_parquet} must then be used;}
47 | \item{Convert to a partitioned parquet file. Additional arguments \code{partition} and \code{partitioning} must then be used;}
48 |
49 | }
50 | }
51 | \examples{
52 |
53 | # Conversion from a local json file to a single parquet file ::
54 |
55 | json_to_parquet(
56 | path_to_file = system.file("extdata","iris.json",package = "parquetize"),
57 | path_to_parquet = tempfile(fileext = ".parquet")
58 | )
59 |
60 | # Conversion from a local ndjson file to a partitioned parquet file ::
61 |
62 | json_to_parquet(
63 | path_to_file = system.file("extdata","iris.ndjson",package = "parquetize"),
64 | path_to_parquet = tempfile(fileext = ".parquet"),
65 | format = "ndjson"
66 | )
67 | }
68 |
--------------------------------------------------------------------------------
/tests/testthat/test-dbi_to_parquet.R:
--------------------------------------------------------------------------------
1 | dbi_connection <- DBI::dbConnect(RSQLite::SQLite(),
2 | system.file("extdata","iris.sqlite",package = "parquetize"))
3 | on.exit(DBI::dbDisconnect(dbi_connection))
4 |
5 | test_that("Checks arguments are correctly filled in", {
6 | expect_missing_argument(
7 | dbi_to_parquet(
8 | sql_query = "SELECT * FROM iris",
9 | path_to_parquet = "Data_test"
10 | ),
11 | regexp = "conn"
12 | )
13 |
14 | expect_missing_argument(
15 | dbi_to_parquet(
16 | conn = dbi_connection,
17 | path_to_parquet = "Data_test"
18 | ),
19 | regexp = "sql_query"
20 | )
21 |
22 | expect_missing_argument(
23 | dbi_to_parquet(
24 | conn = dbi_connection,
25 | sql_query = "SELECT * FROM iris"
26 | ),
27 | regexp = "path_to_parquet"
28 | )
29 | })
30 |
31 | test_that("Checks simple query generate a parquet file", {
32 | path_to_parquet <- tempfile()
33 |
34 | expect_no_error(
35 | dbi_to_parquet(
36 | conn = dbi_connection,
37 | sql_query = "SELECT * FROM iris",
38 | path_to_parquet = path_to_parquet
39 | )
40 | )
41 |
42 | expect_parquet(
43 | path_to_parquet,
44 | with_lines = 150
45 | )
46 | })
47 |
48 | test_that("Checks simple query generate a parquet file with good messages", {
49 | path_to_parquet <- tempfile()
50 |
51 | expect_no_error(
52 | dbi_to_parquet(
53 | conn = dbi_connection,
54 | sql_query = "SELECT * FROM iris",
55 | path_to_parquet = path_to_parquet,
56 | partition = "yes",
57 | partitioning = "Species"
58 | )
59 | )
60 |
61 | expect_parquet(
62 | path_to_parquet,
63 | with_lines = 150,
64 | with_partitions = c("Species=setosa", "Species=versicolor", "Species=virginica")
65 | )
66 | })
67 |
68 | test_that("Checks simple query works by chunk with max_rows", {
69 | path_to_parquet <- tempfile()
70 |
71 | expect_no_error(
72 | dbi_to_parquet(
73 | conn = dbi_connection,
74 | sql_query = "SELECT * FROM iris",
75 | path_to_parquet = path_to_parquet,
76 | max_rows = 49
77 | )
78 | )
79 |
80 | expect_parquet(
81 | path_to_parquet,
82 | with_lines = 150
83 | )
84 | })
85 |
86 | test_that("Checks simple query works by chunk with max_memory", {
87 | path_to_parquet <- tempfile()
88 | parquetname <- "iris"
89 |
90 | expect_no_error(
91 | dbi_to_parquet(
92 | conn = dbi_connection,
93 | sql_query = "SELECT * FROM iris",
94 | path_to_parquet = path_to_parquet,
95 | max_memory = 2 / 1024
96 | )
97 | )
98 |
99 | expect_parquet(
100 | path_to_parquet,
101 | with_lines = 150
102 | )
103 | })
104 |
105 |
--------------------------------------------------------------------------------
/man/sqlite_to_parquet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/sqlite_to_parquet.R
3 | \name{sqlite_to_parquet}
4 | \alias{sqlite_to_parquet}
5 | \title{Convert a sqlite file to parquet format}
6 | \usage{
7 | sqlite_to_parquet(
8 | path_to_file,
9 | table_in_sqlite,
10 | path_to_parquet,
11 | partition = "no",
12 | compression = "snappy",
13 | compression_level = NULL,
14 | ...
15 | )
16 | }
17 | \arguments{
18 | \item{path_to_file}{String that indicates the path to the input file (don't forget the extension).}
19 |
20 | \item{table_in_sqlite}{string that indicates the name of the table to convert in the sqlite file}
21 |
22 | \item{path_to_parquet}{String that indicates the path to the directory where the parquet files will be stored.}
23 |
24 | \item{partition}{String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file.
25 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}.
26 | Be careful, this argument can not be "yes" if \code{max_memory} or \code{max_rows} argument are not NULL.}
27 |
28 | \item{compression}{compression algorithm. Default "snappy".}
29 |
30 | \item{compression_level}{compression level. Meaning depends on compression algorithm.}
31 |
32 | \item{...}{additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
33 | and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.}
34 | }
35 | \value{
36 | A parquet file, invisibly
37 | }
38 | \description{
39 | This function allows you to convert a table from a sqlite file to parquet format. \cr
40 | The following extensions are supported:
41 | "db","sdb","sqlite","db3","s3db","sqlite3","sl3","db2","s2db","sqlite2","sl2". \cr
42 |
43 | Two conversion possibilities are offered:
44 |
45 | \itemize{
46 |
47 | \item{Convert to a single parquet file. Argument \code{path_to_parquet} must then be used;}
48 | \item{Convert to a partitioned parquet file. Additional arguments \code{partition} and \code{partitioning} must then be used;}
49 |
50 | }
51 | }
52 | \examples{
53 |
54 | # Conversion from a local sqlite file to a single parquet file :
55 |
56 | sqlite_to_parquet(
57 | path_to_file = system.file("extdata","iris.sqlite",package = "parquetize"),
58 | table_in_sqlite = "iris",
59 | path_to_parquet = tempfile(fileext = ".parquet")
60 | )
61 |
62 | # Conversion from a local sqlite file to a partitioned parquet file :
63 |
64 | sqlite_to_parquet(
65 | path_to_file = system.file("extdata","iris.sqlite",package = "parquetize"),
66 | table_in_sqlite = "iris",
67 | path_to_parquet = tempfile(),
68 | partition = "yes",
69 | partitioning = c("Species")
70 | )
71 | }
72 |
--------------------------------------------------------------------------------
/tests/testthat/test-write_parquet_by_chunk.R:
--------------------------------------------------------------------------------
1 | # we create the closure to loop over the data.frame
2 | my_read_closure <- function() {
3 | function(input, skip = 0L, n_max = Inf) {
4 | # if we are after the end of the input we return an empty data.frame
5 | if (skip+1 > nrow(input)) { return(data.frame()) }
6 |
7 | input[(skip+1):(min(skip+n_max, nrow(input))),]
8 | }
9 | }
10 |
11 | test_that("checks that argument works", {
12 | read_method <- my_read_closure()
13 |
14 | expect_missing_argument(
15 | write_parquet_by_chunk(
16 | input = mtcars,
17 | path_to_parquet = tempfile(),
18 | max_rows = 10,
19 | ),
20 | regexp = "read_method"
21 | )
22 |
23 | expect_missing_argument(
24 | write_parquet_by_chunk(
25 | read_method = read_method,
26 | path_to_parquet = tempfile(),
27 | max_rows = 10,
28 | ),
29 | regexp = "input"
30 | )
31 |
32 | expect_error(
33 | write_parquet_by_chunk(
34 | read_method = "",
35 | input = mtcars,
36 | path_to_parquet = tempfile(),
37 | max_rows = 10,
38 | ),
39 | regexp = "read_method",
40 | class = "parquetize_bad_argument"
41 | )
42 |
43 | expect_error(
44 | write_parquet_by_chunk(
45 | read_method = read_method,
46 | input = mtcars,
47 | path_to_parquet = tempfile(),
48 | max_rows = 10,
49 | max_memory = 10,
50 | ),
51 | regexp = "can not be used together",
52 | class = "parquetize_bad_argument"
53 | )
54 | })
55 |
56 | test_that("works with empty data", {
57 | path_to_parquet <- tempfile()
58 | read_method <- my_read_closure()
59 |
60 | expect_no_error(
61 | write_parquet_by_chunk(
62 | read_method = read_method,
63 | input = data.frame(),
64 | path_to_parquet = path_to_parquet,
65 | max_rows = 50,
66 | )
67 | )
68 |
69 | expect_parquet(path_to_parquet, with_lines = 0)
70 | })
71 |
72 | test_that("Checks parquetizing by nrow chunks works", {
73 | path_to_parquet <- tempfile()
74 | read_method <- my_read_closure()
75 |
76 | expect_no_error(
77 | write_parquet_by_chunk(
78 | read_method = read_method,
79 | input = iris,
80 | path_to_parquet = path_to_parquet,
81 | max_rows = 50,
82 | )
83 | )
84 |
85 | expect_parquet(path_to_parquet, with_lines = 150, with_files = 3)
86 | })
87 |
88 | test_that("Checks parquetizing by memory size chunks works", {
89 | path_to_parquet <- tempfile()
90 | read_method <- my_read_closure()
91 |
92 | expect_no_error(
93 | write_parquet_by_chunk(
94 | read_method = read_method,
95 | input = iris,
96 | path_to_parquet = path_to_parquet,
97 | max_memory = 2 / 1024,
98 | )
99 | )
100 |
101 | expect_parquet(path_to_parquet, with_lines = 150, with_files = 4)
102 | })
103 |
--------------------------------------------------------------------------------
/R/utilities.R:
--------------------------------------------------------------------------------
1 | #' @name get_lines_for_memory
2 | #'
3 | #' @title Utility to guess the number of lines fitting in a given max_memory
4 | #'
5 | #' @param data a tibble/dataframe or equivalent with the data sample used to guess memory usage
6 | #' @param max_memory memory (in MB) to use for one chunk, defaults to 4000 MB
7 | #'
8 | #' This method tries to estimate the number of lines that fit in the
9 | #' max_memory argument
10 | #'
11 | #' @noRd
12 | get_lines_for_memory <- function(data, max_memory = 4000) {
13 | data_memory_size <- object.size(data)
14 | # cosmetic : remove object.size attribute
15 | attributes(data_memory_size) <- NULL
16 |
17 | # max_memory is in Mb and data_memory_size in bytes
18 | lines <- ceiling(max_memory * 1024 * 1024 * nrow(data) / data_memory_size)
19 | lines
20 | }
21 |
22 | haven_read_function_by_extension <- list(
23 | "sas7bdat" = haven::read_sas,
24 | "SAS7BDAT" = haven::read_sas,
25 | "sav" = haven::read_sav,
26 | "SAV" = haven::read_sav,
27 | "dta" = haven::read_dta,
28 | "DTA" = haven::read_dta
29 | )
30 |
31 | #' @name get_haven_read_function_for_file
32 | #'
33 | #' @title Utility that returns the haven read method to use for a given file
34 | #'
35 | #' @param file_name string that indicates the path to the input file
36 | #'
37 | #' @noRd
38 | get_haven_read_function_for_file <- function(file_name) {
39 | ext <- tools::file_ext(file_name)
40 | if (ext == "") {
41 | cli_abort("Be careful, unable to find a read method for \"{file_name}\", it has no extension",
42 | class = "parquetize_bad_argument")
43 | }
44 |
45 | fun <- haven_read_function_by_extension[[ext]]
46 | if (is.null(fun)) {
47 | cli_abort("Be careful, no method to read \"{file_name}\" file",
48 | class = "parquetize_bad_argument")
49 | }
50 |
51 | fun
52 | }
53 |
54 |
55 | #' @name is_remote
56 | #'
57 | #' @title Utility to check if file is local or remote
58 | #'
59 | #' @param path file's path
60 | #' @return TRUE if remote, FALSE otherwise
61 | #'
62 | #' @noRd
63 |
64 | is_remote <- function(path) {
65 | grepl('(http|ftp)s?://', path)
66 | }
67 |
68 | #' @name is_zip
69 | #'
70 | #' @title Utility to check if file is a zip
71 | #'
72 | #' @param path file's path
73 | #' @return TRUE if zip, FALSE otherwise
74 | #'
75 | #' @noRd
76 |
77 | is_zip <- function(path) {
78 | grepl('\\.zip$', path, ignore.case = TRUE)
79 | }
80 |
81 | #' @name get_col_types
82 | #'
83 | #' @title Utility to get information on a parquet file's columns
84 | #'
85 | #' @param ds a dataset/parquet file
86 | #'
87 | #' @return a tibble with 2 columns :
88 | #'
89 | #' * the column name (string)
90 | #' * the arrow type (string)
91 | #
92 | #' @noRd
93 | get_col_types <- function(ds) {
94 | fields <- ds$schema$fields
95 |
96 | tibble(
97 | name = unlist(lapply(fields, function(x) { x$name })),
98 | type = unlist(lapply(fields, function(x) { x$type$name }))
99 | )
100 | }
101 |
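102 | # Worked example for get_lines_for_memory() above (illustrative figures only):
103 | # if a 10 000 row sample occupies about 2 MB (2 097 152 bytes), a 4000 MB chunk
104 | # holds roughly ceiling(4000 * 1024 * 1024 * 10000 / 2097152) = 20 000 000 rows.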
--------------------------------------------------------------------------------
/tests/testthat/test-testthat-helpers.R:
--------------------------------------------------------------------------------
1 | test_that("expect_parquet fails on file error", {
2 | expect_error(
3 | expect_parquet(parquetize_example("region_2022.csv"), with_lines = 25),
4 | regexp = "Invalid"
5 | )
6 | })
7 |
8 | test_that("expect_parquet fails on file's number of line", {
9 | expect_error(
10 | expect_parquet(parquetize_example("iris_dataset"), with_lines = 25),
11 | class = "expectation_failure"
12 | )
13 | })
14 |
15 | test_that("expect_parquet works without partitions", {
16 | expect_no_error(
17 | expect_parquet(parquetize_example("iris_dataset"), with_lines = 150)
18 | )
19 | })
20 |
21 | test_that("expect_parquet works with partitions", {
22 | expect_no_error(
23 | expect_parquet(parquetize_example("iris_dataset"),
24 | with_lines = 150,
25 | with_partitions = c('Species=setosa', 'Species=versicolor', 'Species=virginica'),
26 | with_files = 3)
27 | )
28 | })
29 |
30 | test_that("expect_parquet works with columns", {
31 | expect_no_error(
32 | expect_parquet(parquetize_example("iris_dataset"),
33 | with_lines = 150,
34 | with_columns = c("Petal.Width", "Sepal.Length", "Sepal.Width", "Species", "Petal.Length"))
35 | )
36 | })
37 |
38 | test_that("expect_parquet fails with bad partitions", {
39 | expect_error(
40 | expect_parquet(parquetize_example("iris_dataset"),
41 | with_lines = 150,
42 | with_partitions = c('Species=setosa')),
43 | class = "partquetize_test_with_partitions"
44 | )
45 | })
46 |
47 | test_that("expect_parquet fails with bad columns", {
48 | expect_error(
49 | expect_parquet(parquetize_example("iris_dataset"),
50 | with_lines = 150,
51 | with_columns = c("Petal.Length", "Petal.Width", "Sepal.Length")),
52 | class = "partquetize_test_with_columns"
53 | )
54 | })
55 |
56 | test_that("expect_missing_argument checks good errors", {
57 | raising_fun <- function() {
58 | cli_abort("string", class = "parquetize_missing_argument")
59 | }
60 | expect_no_error(
61 | expect_missing_argument(raising_fun(), regexp = "string")
62 | )
63 | })
64 |
65 | test_that("expect_missing_argument fails on bad string", {
66 | raising_fun <- function() {
67 | cli_abort("string", class = "parquetize_missing_argument")
68 | }
69 | expect_error(
70 | expect_missing_argument(raising_fun(), regexp = "message")
71 | )
72 | })
73 |
74 | test_that("expect_missing_argument fails on bad error type", {
75 | raising_fun <- function() {
76 | cli_abort("string", class = "a_class")
77 | }
78 | expect_error(
79 | expect_missing_argument(raising_fun(), regexp = "string"),
80 | class = "a_class"
81 | )
82 | })
83 |
84 | test_that("expect_parquet fails with bad files number", {
85 | expect_error(
86 | expect_parquet(parquetize_example("iris_dataset"), with_lines = 150, with_files = 100),
87 | class = "partquetize_test_with_files"
88 | )
89 | })
90 |
--------------------------------------------------------------------------------
/R/json_to_parquet.R:
--------------------------------------------------------------------------------
1 | #' @name json_to_parquet
2 | #'
3 | #' @title Convert a json file to parquet format
4 | #'
5 | #' @description This function allows you to convert a \href{https://www.json.org/json-en.html}{json}
6 | #' or \href{https://docs.mulesoft.com/dataweave/latest/dataweave-formats-ndjson}{ndjson} file to parquet format. \cr
7 | #'
8 | #' Two conversion possibilities are offered:
9 | #'
10 | #'\itemize{
11 | #'
12 | #' \item{Convert to a single parquet file. Argument `path_to_parquet` must then be used;}
13 | #' \item{Convert to a partitioned parquet file. Additional arguments `partition` and `partitioning` must then be used;}
14 | #'
15 | #' }
16 | #'
17 | #' @param format string that indicates if the format is "json" (by default) or "ndjson"
18 | #' @inheritParams table_to_parquet
19 | #' @param ... additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
20 | #' and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.
21 | #' @return A parquet file, invisibly
22 | #'
23 | #' @export
24 | #'
25 | #' @examples
26 | #'
27 | #' # Conversion from a local json file to a single parquet file ::
28 | #'
29 | #' json_to_parquet(
30 | #' path_to_file = system.file("extdata","iris.json",package = "parquetize"),
31 | #' path_to_parquet = tempfile(fileext = ".parquet")
32 | #' )
33 | #'
34 | #' # Conversion from a local ndjson file to a partitioned parquet file ::
35 | #'
36 | #' json_to_parquet(
37 | #' path_to_file = system.file("extdata","iris.ndjson",package = "parquetize"),
38 | #' path_to_parquet = tempfile(fileext = ".parquet"),
39 | #' format = "ndjson"
40 | #' )
41 |
42 | json_to_parquet <- function(
43 | path_to_file,
44 | path_to_parquet,
45 | format = "json",
46 | partition = "no",
47 | compression = "snappy",
48 | compression_level = NULL,
49 | ...
50 | ) {
51 |
52 | # Check if path_to_file is missing
53 | if (missing(path_to_file)) {
54 | cli_abort("Be careful, the argument path_to_file must be filled in", class = "parquetize_missing_argument")
55 | }
56 |
57 | # Check if path_to_parquet is missing
58 | if (missing(path_to_parquet)) {
59 | cli_abort("Be careful, the argument path_to_parquet must be filled in", class = "parquetize_missing_argument")
60 | }
61 |
62 | # Check if format is equal to "json" or "ndjson"
63 | if (!(format %in% c("json","ndjson"))) {
64 | cli_abort("Be careful, the argument format must be equal to 'json' or 'ndjson'", class = "parquetize_bad_format")
65 | }
66 |
67 | Sys.sleep(0.01)
68 | cli_progress_message("Reading data...")
69 |
70 | if (format == "json") {
71 | json_output <- jsonlite::read_json(path = path_to_file,
72 | simplifyVector = TRUE)
73 | } else if (format == "ndjson") {
74 | json_output <- read_json_arrow(file = path_to_file,
75 | as_data_frame = TRUE)
76 | }
77 |
78 |   dataset <- write_parquet_at_once(json_output, path_to_parquet, partition, compression, compression_level, ...)
79 |
80 | return(invisible(dataset))
81 |
82 | }
83 |
--------------------------------------------------------------------------------
/R/rbind_parquet.R:
--------------------------------------------------------------------------------
1 | #' @name rbind_parquet
2 | #'
3 | #' @title Function to bind multiple parquet files by row
4 | #'
5 | #' @description This function reads all parquet files in the `folder` argument that start with `output_name`,
6 | #' combines them using rbind and writes the result to a new parquet file. \cr
7 | #'
8 | #' It can also delete the initial files if `delete_initial_files` argument is TRUE. \cr
9 | #'
10 | #' Be careful, this function will not work if files with different structures
11 | #' are present in the folder given with the argument `folder`.
12 | #'
13 | #' @param folder the folder where the initial files are stored
14 | #' @param output_name name of the output parquet file
15 | #' @param delete_initial_files Boolean. Should the function delete the initial files? By default TRUE.
16 | #' @param compression compression algorithm. Default "snappy".
17 | #' @param compression_level compression level. Meaning depends on compression algorithm.
18 | #'
19 | #' @return The combined data.frame, invisibly. The resulting parquet file is written in `folder`.
20 | #'
21 | #' @export
22 | #'
23 | #' @examples
24 | #' \dontrun{
25 | #' library(arrow)
26 | #' if (!dir.exists("output")) {
27 | #' dir.create("output")
28 | #' }
29 | #'
30 | #' file.create("output/test_data1-4.parquet")
31 | #' write_parquet(data.frame(
32 | #' x = c("a","b","c"),
33 | #' y = c(1L,2L,3L)
34 | #' ),
35 | #' "output/test_data1-4.parquet")
36 | #'
37 | #' file.create("output/test_data4-6.parquet")
38 | #' write_parquet(data.frame(
39 | #' x = c("d","e","f"),
40 | #' y = c(4L,5L,6L)
41 | #' ), "output/test_data4-6.parquet")
42 | #'
43 | #' test_data <- rbind_parquet(folder = "output",
44 | #' output_name = "test_data",
45 | #' delete_initial_files = FALSE)
46 | #' }
47 |
48 | rbind_parquet <- function(folder,
49 | output_name,
50 | delete_initial_files = TRUE,
51 | compression = "snappy",
52 | compression_level = NULL) {
53 |
54 | # Get the list of files in the folder
55 | files <- list.files(folder, pattern = paste0("^",output_name,".*\\.parquet$"))
56 |
57 | # Initialize an empty list to store the data frames
58 | data_frames <- list()
59 |
60 | # Loop through the files
61 | for (file in files) {
62 | # Read the parquet file into a data frame
63 | df <- read_parquet(file.path(folder,file))
64 |
65 | # Add the data frame to the list
66 | data_frames[[file]] <- df
67 | }
68 |
69 | # Use rbind to combine the data frames into a single data frame
70 | combined_df <- do.call(rbind, data_frames)
71 |
72 | # Delete the initial parquet files
73 | if (isTRUE(delete_initial_files)) {
74 | unlink(file.path(folder,files))
75 | }
76 |
77 | # Write the combined data frame to a new parquet file
78 | write_parquet(combined_df,
79 | file.path(folder, paste0(output_name,".parquet")),
80 | compression = compression,
81 | compression_level = compression_level)
82 |
83 | cli_alert_success("\nThe {output_name} parquet file is available under {folder}")
84 |
85 | return(invisible(combined_df))
86 | }
87 |
--------------------------------------------------------------------------------
/R/sqlite_to_parquet.R:
--------------------------------------------------------------------------------
1 | #' @name sqlite_to_parquet
2 | #'
3 | #' @title Convert a sqlite file to parquet format
4 | #'
5 | #' @description This function allows you to convert a table from a sqlite file to parquet format. \cr
6 | #' The following extensions are supported:
7 | #' "db","sdb","sqlite","db3","s3db","sqlite3","sl3","db2","s2db","sqlite2","sl2". \cr
8 | #'
9 | #' Two conversion possibilities are offered:
10 | #'
11 | #'\itemize{
12 | #'
13 | #' \item{Convert to a single parquet file. Argument `path_to_parquet` must then be used;}
14 | #' \item{Convert to a partitioned parquet file. Additional arguments `partition` and `partitioning` must then be used;}
15 | #'
16 | #' }
17 | #'
18 | #' @param table_in_sqlite string that indicates the name of the table to convert in the sqlite file
19 | #' @inheritParams table_to_parquet
20 | #' @param ... additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
21 | #' and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.
22 | #' @return A parquet file, invisibly
23 | #'
24 | #' @export
25 | #'
26 | #' @examples
27 | #'
28 | #' # Conversion from a local sqlite file to a single parquet file :
29 | #'
30 | #' sqlite_to_parquet(
31 | #' path_to_file = system.file("extdata","iris.sqlite",package = "parquetize"),
32 | #' table_in_sqlite = "iris",
33 | #' path_to_parquet = tempfile(fileext = ".parquet")
34 | #' )
35 | #'
36 | #' # Conversion from a local sqlite file to a partitioned parquet file :
37 | #'
38 | #' sqlite_to_parquet(
39 | #' path_to_file = system.file("extdata","iris.sqlite",package = "parquetize"),
40 | #' table_in_sqlite = "iris",
41 | #' path_to_parquet = tempfile(),
42 | #' partition = "yes",
43 | #' partitioning = c("Species")
44 | #' )
45 |
46 | sqlite_to_parquet <- function(
47 | path_to_file,
48 | table_in_sqlite,
49 | path_to_parquet,
50 | partition = "no",
51 | compression = "snappy",
52 | compression_level = NULL,
53 | ...
54 | ) {
55 |
56 | # Check if path_to_file is missing
57 | if (missing(path_to_file)) {
58 | cli_abort("Be careful, the argument path_to_file must be filled in", class = "parquetize_missing_argument")
59 | }
60 |
61 | # Check if extension used in path_to_file is correct
62 | if (!(sub(".*\\.", "", path_to_file) %in% c("db","sdb","sqlite","db3","s3db","sqlite3","sl3","db2","s2db","sqlite2","sl2"))) {
63 | cli_abort("Be careful, the extension used in path_to_file is not correct", class = "parquetize_bad_format")
64 | }
65 |
66 | # Check if path_to_parquet is missing
67 | if (missing(path_to_parquet)) {
68 | cli_abort("Be careful, the argument path_to_parquet must be filled in", class = "parquetize_missing_argument")
69 | }
70 |
71 | Sys.sleep(0.01)
72 | cli_progress_message("Reading data...")
73 |
74 | con_sqlite <- DBI::dbConnect(RSQLite::SQLite(), path_to_file)
75 |
76 | # Check if table_in_sqlite exists in sqlite file
77 | list_table <- DBI::dbListTables(con_sqlite)
78 |   if (!(table_in_sqlite %in% list_table)) {
79 | cli_abort("Be careful, the table filled in the table_in_sqlite argument {table_in_sqlite} does not exist in your sqlite file",
80 | class = "parquetize_missing_table")
81 | }
82 |
83 | sqlite_output <- DBI::dbReadTable(con_sqlite, table_in_sqlite)
84 |
85 | DBI::dbDisconnect(con_sqlite, shutdown=TRUE)
86 |
87 | Sys.sleep(0.01)
88 | cli_progress_message("Writing data...")
89 |
90 | dataset <- write_parquet_at_once(
91 | sqlite_output,
92 | path_to_parquet,
93 | partition,
94 | compression,
95 | compression_level,
96 | ...)
97 |
98 | return(invisible(dataset))
99 |
100 | }
101 |
--------------------------------------------------------------------------------
/man/write_parquet_by_chunk.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/write_parquet_by_chunk.R
3 | \name{write_parquet_by_chunk}
4 | \alias{write_parquet_by_chunk}
5 | \title{Read input by chunk with a function and create a dataset \cr}
6 | \usage{
7 | write_parquet_by_chunk(
8 | read_method,
9 | input,
10 | path_to_parquet,
11 | max_rows = NULL,
12 | max_memory = NULL,
13 | chunk_memory_sample_lines = 10000,
14 | compression = "snappy",
15 | compression_level = NULL,
16 | ...
17 | )
18 | }
19 | \arguments{
20 | \item{read_method}{a method to read input files. This method takes only three
21 | arguments
22 |
23 | \code{input} : some kind of data, typically a file path or a data.frame
24 | \code{skip} : the number of rows to skip
25 | \code{n_max} : the maximum number of rows to return
26 |
27 | This method will be called until it returns a dataframe/tibble with zero rows.}
28 |
29 | \item{input}{the input passed to \code{read_method}. It can be anything you
30 | want, but most often a file's path or a data.frame.}
31 |
32 | \item{path_to_parquet}{String that indicates the path to the directory where
33 | the output parquet file or dataset will be stored.}
34 |
35 | \item{max_rows}{Number of lines that defines the size of the chunk. This
36 | argument can not be filled in if max_memory is used.}
37 |
38 | \item{max_memory}{Memory size (in Mb) in which data of one parquet file
39 | should roughly fit.}
40 |
41 | \item{chunk_memory_sample_lines}{Number of lines to read to evaluate
42 | max_memory. Default to 10 000.}
43 |
44 | \item{compression}{compression algorithm. Default "snappy".}
45 |
46 | \item{compression_level}{compression level. Meaning depends on compression algorithm.}
47 |
48 | \item{...}{Additional format-specific arguments, see
49 | \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}}
50 | }
51 | \value{
52 | a dataset, as returned by arrow::open_dataset
53 | }
54 | \description{
55 | Low level function that implements the logic to read an input file by chunk and write a
56 | dataset. \cr
57 |
58 | It will:
59 |
60 | \itemize{
61 | \item{calculate the number of rows by chunk if needed;}
62 | \item{loop over the input file by chunk;}
63 | \item{write each output file.}
64 | }
65 | }
66 | \examples{
67 |
68 | # example with a dataframe
69 |
70 | # we create the function to loop over the data.frame
71 |
72 | read_method <- function(input, skip = 0L, n_max = Inf) {
73 | # if we are after the end of the input we return an empty data.frame
74 | if (skip+1 > nrow(input)) { return(data.frame()) }
75 |
76 | # return the n_max row from skip + 1
77 | input[(skip+1):(min(skip+n_max, nrow(input))),]
78 | }
79 |
80 | # we use it
81 |
82 | write_parquet_by_chunk(
83 | read_method = read_method,
84 | input = mtcars,
85 | path_to_parquet = tempfile(),
86 | max_rows = 10,
87 | )
88 |
89 |
90 | #
91 | # Example with haven::read_sas
92 | #
93 |
94 | # we need to pass two arguments beside the 3 (input, skip and n_max).
95 | # We will use a closure :
96 |
97 | my_read_closure <- function(encoding, columns) {
98 | function(input, skip = 0L, n_max = Inf) {
99 | haven::read_sas(data_file = input,
100 | n_max = n_max,
101 | skip = skip,
102 | encoding = encoding,
103 | col_select = all_of(columns))
104 | }
105 | }
106 |
107 | # we initialize the closure
108 |
109 | read_method <- my_read_closure(encoding = "WINDOWS-1252", columns = c("Species", "Petal_Width"))
110 |
111 | # we use it
112 | write_parquet_by_chunk(
113 | read_method = read_method,
114 | input = system.file("examples","iris.sas7bdat", package = "haven"),
115 | path_to_parquet = tempfile(),
116 | max_rows = 75,
117 | )
118 |
119 | }
120 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to parquetize development
2 |
3 | The goal of this guide is to help you get up and contributing to `parquetize` as
4 | quickly as possible. The guide is divided into two main pieces:
5 |
6 | 1. Filing a bug report or feature request in an issue.
7 | 1. Suggesting a change via a pull request.
8 |
9 |
10 | ## Issues
11 |
12 | When filing an issue, the most important thing is to include a minimal
13 | reproducible example so that I can quickly verify the problem, and then figure
14 | out how to fix it. There are three things you need to include to make your
15 | example reproducible: required packages, data, code.
16 |
17 | 1. **Packages** should be loaded at the top of the script, so it's easy to
18 | see which ones the example needs.
19 |
20 | 1. The easiest way to include **data** is to use `dput()` to generate the R code
21 | to recreate it. For example, to recreate the `mtcars` dataset in R,
22 | I'd perform the following steps:
23 |
24 | 1. Run `dput(mtcars)` in R
25 | 2. Copy the output
26 | 3. In my reproducible script, type `mtcars <- ` then paste.
27 |
28 | But even better is if you can create a `data.frame()` with just a handful
29 | of rows and columns that still illustrates the problem.
30 |
31 | 1. Spend a little bit of time ensuring that your **code** is easy for others to
32 | read:
33 |
34 | * make sure you've used spaces and your variable names are concise, but
35 | informative
36 |
37 | * use comments to indicate where your problem lies
38 |
39 | * do your best to remove everything that is not related to the problem.
40 | The shorter your code is, the easier it is to understand.
41 |
42 | You can check you have actually made a reproducible example by starting up a
43 | fresh R session and pasting your script in.
44 |
45 | (Unless you've been specifically asked for it, please don't include the output
46 | of `sessionInfo()`.)
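
Putting these three pieces together, a minimal reproducible script might look like the following sketch (the dataset and the call are made up for illustration; adapt them to your actual problem):

``` r
# packages loaded at the top
library(parquetize)

# data recreated inline (a handful of rows is enough)
df <- data.frame(id = 1:3, value = c("a", "b", "c"))
path_to_rds <- tempfile(fileext = ".rds")
saveRDS(df, path_to_rds)

# the call that illustrates the problem, with a comment on what goes wrong
rds_to_parquet(
  path_to_file = path_to_rds,
  path_to_parquet = tempfile(fileext = ".parquet")
)
```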
47 |
48 | ## Pull requests
49 |
50 | To contribute a change to `parquetize`, follow these steps:
51 |
52 | 1. Create a branch in git and make your changes.
53 | 1. Push branch to github and issue pull request (PR).
54 | 1. Discuss the pull request.
55 | 1. Iterate until either I accept the PR or decide that it's not
56 | a good fit for `parquetize`.
57 |
58 | Each of these steps is described in more detail below. This might feel
59 | overwhelming the first time you get set up, but it gets easier with practice.
60 |
61 | If you're not familiar with git or github, please start by reading an introduction to git and GitHub first.
62 |
63 | Pull requests will be evaluated against the following checklist:
64 |
65 | 1. __Motivation__. Your pull request should clearly and concisely motivate the
66 | need for change.
67 |
68 | Also include this motivation in `NEWS` so that when a new release of
69 | parquetize comes out it's easy for users to see what's changed. Add your
70 | item at the top of the file and use markdown for formatting. The
71 | news item should end with `(@yourGithubUsername, #the_issue_number)`.
72 |
73 | 2. __Only related changes__. Before you submit your pull request, please
74 | check to make sure that you haven't accidentally included any unrelated
75 | changes. These make it harder to see exactly what's changed, and to
76 | evaluate any unexpected side effects.
77 |
78 | Each PR corresponds to a git branch, so if you expect to submit
79 | multiple changes make sure to create multiple branches. If you have
80 | multiple changes that depend on each other, start with the first one
81 | and don't submit any others until the first one has been processed.
82 |
83 | 3. If you're adding new parameters or a new function, you'll also need
84 | to document them with [roxygen](https://github.com/klutometis/roxygen).
85 | Make sure to re-run `devtools::document()` on the code before submitting.
86 |
87 | 4. If fixing a bug or adding a new feature,
88 | please add a [testthat](https://github.com/r-lib/testthat) unit test.
89 |
90 | This seems like a lot of work but don't worry if your pull request isn't perfect.
91 | A pull request ("PR") is a process, and unless you've submitted a few in the
92 | past it's unlikely that your pull request will be accepted as is.
93 |
94 | Many thanks in advance!
95 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | 
4 | [](https://CRAN.R-project.org/package=parquetize)
5 | [](https://CRAN.R-project.org/package=parquetize)
6 | [](https://cran.r-project.org/package=parquetize)
7 | [](https://github.com/ddotta/parquetize/actions/workflows/check-release.yaml)
9 | [](https://app.codecov.io/gh/ddotta/parquetize)
10 | [](https://www.codefactor.io/repository/github/ddotta/parquetize)
11 |
12 |
13 | :package: Package `parquetize`
14 | ======================================
15 |
16 | R package that allows you to convert databases of different formats (csv, SAS, SPSS, Stata, rds, sqlite, JSON, ndJSON) to [parquet](https://parquet.apache.org/) format with a single function.
17 |
18 | ## Installation
19 |
20 | To install `parquetize` from CRAN :
21 |
22 | ``` r
23 | install.packages("parquetize")
24 | ```
25 |
26 | Or alternatively to install the development version from GitHub :
27 |
28 | ``` r
29 | remotes::install_github("ddotta/parquetize")
30 | ```
31 |
32 | Then to load it :
33 |
34 | ``` r
35 | library(parquetize)
36 | ```
37 |
38 | ## Why this package ?
39 |
40 | This package is a simple wrapper of some very useful functions from the [haven](https://github.com/tidyverse/haven), [readr](https://github.com/tidyverse/readr/), [jsonlite](https://github.com/jeroen/jsonlite), [RSQLite](https://github.com/r-dbi/RSQLite) and [arrow](https://github.com/apache/arrow) packages.
41 |
42 | While working, I realized that I was often repeating the same operations when working with parquet files :
43 |
44 | - I import the file in R with {haven}, {jsonlite}, {readr}, {DBI} or {RSQLite}.
45 | - And I export the file in parquet format.
46 |
47 | As a fervent follower of the DRY principle (don't repeat yourself), the exported functions of this package make my life easier and **execute these operations within the same function**, as sketched below.
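
For instance, here is a sketch of the two-step workflow versus the single `{parquetize}` call (the file paths are hypothetical):

``` r
# without parquetize : two steps
df <- haven::read_sas("iris.sas7bdat")
arrow::write_parquet(df, "iris.parquet")

# with parquetize : one step
table_to_parquet(
  path_to_file = "iris.sas7bdat",
  path_to_parquet = "iris.parquet"
)
```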
48 |
49 | **The last benefit** of using the `{parquetize}` package is that its functions allow you to create either single parquet files or partitioned parquet datasets, depending on the arguments chosen in the functions.
50 |
51 | - [csv_to_parquet()](https://ddotta.github.io/parquetize/reference/csv_to_parquet.html)
52 | - **The other benefit of this function** is that it allows you to convert csv or txt files whether they are stored locally or available on the internet, and whether they are plain csv/txt files or packed inside a zip.
53 | - [json_to_parquet()](https://ddotta.github.io/parquetize/reference/json_to_parquet.html)
54 | - **The other benefit of this function** is that it handles JSON and ndJSON files in the same function. There is only one function to use for these 2 cases.
55 | - [rds_to_parquet()](https://ddotta.github.io/parquetize/reference/rds_to_parquet.html)
56 | - [fst_to_parquet()](https://ddotta.github.io/parquetize/reference/fst_to_parquet.html)
57 | - [table_to_parquet()](https://ddotta.github.io/parquetize/reference/table_to_parquet.html)
58 | - **The other benefit of this function** is that it handles SAS, SPSS and Stata files in the same function. There is only one function to use for these 3 cases. To avoid overloading R's RAM with huge tables, the conversion can be done by chunk. For more information, see [here](https://ddotta.github.io/parquetize/articles/aa-conversions.html)
59 | - [sqlite_to_parquet()](https://ddotta.github.io/parquetize/reference/sqlite_to_parquet.html)
60 | - [dbi_to_parquet()](https://ddotta.github.io/parquetize/reference/dbi_to_parquet.html)
61 |
62 |
63 | For more details, see the examples associated with each function in the documentation.
64 |
65 | ## Example
66 |
67 | Do you want to use the Insee file of first names by birth department? Use R and the {parquetize} package, which takes care of everything: it downloads the data (3.7 million rows) and converts it to parquet format in a few seconds!
68 |
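A sketch of what such a call could look like with `csv_to_parquet()` (the URL and file name below are placeholders, not the actual Insee archive):

``` r
csv_to_parquet(
  # hypothetical URL and file name: replace them with the actual Insee zip archive
  path_to_file = "https://example.com/prenoms_par_departement.zip",
  filename_in_zip = "prenoms.csv",
  path_to_parquet = tempfile(fileext = ".parquet")
)
```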
69 |
70 |
71 | ## Contribution
72 |
73 | Feel free to contribute and add features that you find useful in your daily work.
74 | Ideas are welcomed in [the issues](https://github.com/ddotta/parquetize/issues).
75 |
--------------------------------------------------------------------------------
/man/dbi_to_parquet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/dbi_to_parquet.R
3 | \name{dbi_to_parquet}
4 | \alias{dbi_to_parquet}
5 | \title{Convert a SQL Query on a DBI connection to parquet format}
6 | \usage{
7 | dbi_to_parquet(
8 | conn,
9 | sql_query,
10 | path_to_parquet,
11 | max_memory,
12 | max_rows,
13 | chunk_memory_sample_lines = 10000,
14 | partition = "no",
15 | compression = "snappy",
16 | compression_level = NULL,
17 | ...
18 | )
19 | }
20 | \arguments{
21 | \item{conn}{A DBIConnection object, as returned by DBI::dbConnect}
22 |
23 | \item{sql_query}{a character string containing an SQL query (this argument is passed to DBI::dbSendQuery)}
24 |
25 | \item{path_to_parquet}{String that indicates the path to the directory where the parquet files will be stored.}
26 |
27 | \item{max_memory}{Memory size (in Mb) in which data of one parquet file should roughly fit.}
28 |
29 | \item{max_rows}{Number of lines that defines the size of the chunk.
30 | This argument can not be filled in if max_memory is used.}
31 |
32 | \item{chunk_memory_sample_lines}{Number of lines to read to evaluate max_memory. Default to 10 000.}
33 |
34 | \item{partition}{String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file.
35 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}.
36 | Be careful, this argument can not be "yes" if the \code{max_memory} or \code{max_rows} arguments are not NULL.}
37 |
38 | \item{compression}{compression algorithm. Default "snappy".}
39 |
40 | \item{compression_level}{compression level. Meaning depends on compression algorithm.}
41 |
42 | \item{...}{additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
43 | and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.}
44 | }
45 | \value{
46 | A parquet file, invisibly
47 | }
48 | \description{
49 | This function allows you to convert the result of a SQL query on a DBI connection to parquet format.\cr
50 |
51 | It handles all DBI supported databases.
52 |
53 | Two conversion possibilities are offered :
54 |
55 | \itemize{
56 |
57 | \item{Convert to a single parquet file. Argument \code{path_to_parquet} must then be used;}
58 | \item{Convert to a partitioned parquet file. Additional arguments \code{partition} and \code{partitioning} must then be used;}
59 |
60 | }
61 |
62 | Examples explain how to convert a query to a chunked dataset
63 | }
64 | \examples{
65 |
66 | # Conversion from a sqlite dbi connection to a single parquet file :
67 |
68 | dbi_connection <- DBI::dbConnect(RSQLite::SQLite(),
69 | system.file("extdata","iris.sqlite",package = "parquetize"))
70 |
71 | # Reading iris table from local sqlite database
72 | # and conversion to one parquet file :
73 |
74 | dbi_to_parquet(
75 | conn = dbi_connection,
76 | sql_query = "SELECT * FROM iris",
77 | path_to_parquet = tempfile(fileext=".parquet"),
78 | )
79 |
80 | # Reading iris table from local sqlite database by chunk (using
81 | # `max_memory` argument) and conversion to multiple parquet files
82 |
83 | dbi_to_parquet(
84 | conn = dbi_connection,
85 | sql_query = "SELECT * FROM iris",
86 | path_to_parquet = tempdir(),
87 | max_memory = 2 / 1024
88 | )
89 |
90 | # Using chunk and partition together is not possible directly but easy to do :
91 | # Reading iris table from local sqlite database by chunk (using
92 | # `max_memory` argument) and conversion to arrow dataset partitioned by
93 | # species
94 |
95 | # get unique values of column "Species" from table "iris"
96 | partitions <- get_partitions(dbi_connection, table = "iris", column = "Species")
97 |
98 | # loop over those values
99 | for (species in partitions) {
100 | dbi_to_parquet(
101 | conn = dbi_connection,
102 | # use glue_sql to create the query filtering the partition
103 | sql_query = glue::glue_sql("SELECT * FROM iris where Species = {species}",
104 | .con = dbi_connection),
105 | # add the partition name in the output dir to respect parquet partition schema
106 | path_to_parquet = file.path(tempdir(), "iris", paste0("Species=", species)),
107 | max_memory = 2 / 1024,
108 | )
109 | }
110 |
111 | # If you need a more complicated query to get your partitions, you can use
112 | # dbGetQuery directly :
113 | col_to_partition <- DBI::dbGetQuery(dbi_connection, "SELECT distinct(`Species`) FROM `iris`")[,1]
114 |
115 | }
116 |
--------------------------------------------------------------------------------
/tests/testthat/test-csv_to_parquet.R:
--------------------------------------------------------------------------------
1 | options(timeout=200)
2 |
3 | test_that("Checks arguments are correctly filled in", {
4 | expect_missing_argument(
5 | csv_to_parquet(
6 | path_to_parquet = tempfile()
7 | ),
8 | regexp = "path_to_file"
9 | )
10 |
11 | expect_missing_argument(
12 | csv_to_parquet(
13 | path_to_file = parquetize_example("region_2022.csv")
14 | ),
15 | regexp = "path_to_parquet"
16 | )
17 | })
18 |
19 | test_that("Checks simple conversion works", {
20 | path_to_parquet <- tempfile()
21 |
22 | expect_no_error(
23 | csv_to_parquet(
24 | path_to_file = parquetize_example("region_2022.csv"),
25 | path_to_parquet = path_to_parquet
26 | )
27 | )
28 |
29 | expect_parquet(path = path_to_parquet, with_lines = 18)
30 | })
31 |
32 | test_that("Checks url_to_csv argument is deprecated", {
33 | expect_warning(
34 | csv_to_parquet(
35 | url_to_csv = "https://github.com/sidsriv/Introduction-to-Data-Science-in-python/raw/master/census.csv",
36 | path_to_parquet = tempfile()
37 | ),
38 | regexp = "deprecated"
39 | )
40 | })
41 | test_that("Checks csv_as_a_zip is deprecated", {
42 | expect_warning(
43 | csv_to_parquet(
44 | path_to_file = system.file("extdata","mtcars.csv.zip", package = "readr"),
45 | path_to_parquet = tempfile(),
46 | csv_as_a_zip = TRUE
47 | ),
48 | regexp = "deprecated"
49 | )
50 | })
51 |
52 |
53 | test_that("Checks it works with compression", {
54 | skip_if_offline()
55 |
56 | path_to_parquet <- tempfile()
57 |
58 | expect_no_error(
59 | csv_to_parquet(
60 | path_to_file = parquetize_example("region_2022.csv"),
61 | path_to_parquet = path_to_parquet,
62 | compression = "gzip",
63 | compression_level = 5
64 | )
65 | )
66 |
67 | expect_parquet(path = path_to_parquet, with_lines = 18)
68 | })
69 |
70 | test_that("Checks it works when partitioning", {
71 | path_to_parquet <- tempfile()
72 |
73 | expect_no_error(
74 | csv_to_parquet(
75 | path_to_file = parquetize_example("region_2022.csv"),
76 | path_to_parquet = path_to_parquet,
77 | partition = "yes",
78 | partitioning = c("REG")
79 | )
80 | )
81 |
82 | expect_parquet(path = path_to_parquet, with_lines = 18)
83 | })
84 |
85 | test_that("Checks error if argument columns is not a character vector", {
86 | expect_error(
87 | csv_to_parquet(
88 | path_to_file = parquetize_example("region_2022.csv"),
89 | path_to_parquet = tempfile(),
90 | columns = matrix(1:10)
91 | ),
92 | class = "parquetize_bad_argument"
93 | )
94 | })
95 |
96 | test_that("Checks columns are selected as wanted", {
97 | path_to_parquet <- tempfile()
98 | columns <- c("REG","LIBELLE")
99 |
100 | expect_no_error(
101 | csv_to_parquet(
102 | path_to_file = parquetize_example("region_2022.csv"),
103 | path_to_parquet = path_to_parquet,
104 | columns = columns
105 | )
106 | )
107 |
108 | expect_parquet(
109 | path_to_parquet,
110 | with_lines = 18,
111 | with_columns = columns)
112 | })
113 |
114 | test_that("Checks message zip with one file works", {
115 | path_to_parquet <- tempfile()
116 |
117 | expect_no_error(
118 | csv_to_parquet(
119 | path_to_file = system.file("extdata","mtcars.csv.zip", package = "readr"),
120 | path_to_parquet = path_to_parquet,
121 | )
122 | )
123 |
124 | expect_parquet(path = path_to_parquet, with_lines = 32)
125 | })
126 |
127 |
128 | test_that("Checks we have only selected columns in parquet file", {
129 | path_to_parquet <- tempfile()
130 | columns <- c("REG","LIBELLE")
131 |
132 | csv_to_parquet(
133 | path_to_file = parquetize_example("region_2022.csv"),
134 | path_to_parquet = path_to_parquet,
135 | columns = columns
136 | )
137 |
138 | expect_setequal(
139 | names(read_parquet(path_to_parquet)),
140 | columns
141 | )
142 | })
143 |
144 |
145 | test_that("Checks error if csv starts with a comment", {
146 | expect_error(
147 | csv_to_parquet(
148 | path_to_file = parquetize_example("region_2022_with_comment.csv"),
149 | path_to_parquet = tempfile()
150 | ),
151 | regexp = 'Could not guess the delimiter'
152 | )
153 | })
154 |
155 |
156 | test_that("Checks conversion works with read_delim_args", {
157 | path_to_parquet <- tempfile()
158 |
159 | expect_no_error(
160 | csv_to_parquet(
161 | path_to_file = parquetize_example("region_2022_with_comment.csv"),
162 | path_to_parquet = path_to_parquet,
163 | read_delim_args = list(comment = '#')
164 | )
165 | )
166 |
167 | expect_parquet(path = path_to_parquet, with_lines = 18)
168 | })
169 |
--------------------------------------------------------------------------------
/man/csv_to_parquet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/csv_to_parquet.R
3 | \name{csv_to_parquet}
4 | \alias{csv_to_parquet}
5 | \title{Convert a csv or a txt file to parquet format}
6 | \usage{
7 | csv_to_parquet(
8 | path_to_file,
9 | url_to_csv = lifecycle::deprecated(),
10 | csv_as_a_zip = lifecycle::deprecated(),
11 | filename_in_zip,
12 | path_to_parquet,
13 | columns = "all",
14 | compression = "snappy",
15 | compression_level = NULL,
16 | partition = "no",
17 | encoding = "UTF-8",
18 | read_delim_args = list(),
19 | ...
20 | )
21 | }
22 | \arguments{
23 | \item{path_to_file}{String that indicates the path to the input file (don't forget the extension).}
24 |
25 | \item{url_to_csv}{DEPRECATED use path_to_file instead}
26 |
27 | \item{csv_as_a_zip}{DEPRECATED}
28 |
29 | \item{filename_in_zip}{name of the csv/txt file in the zip. Required if several csv/txt are included in the zip.}
30 |
31 | \item{path_to_parquet}{String that indicates the path to the directory where the parquet files will be stored.}
32 |
33 | \item{columns}{Character vector of columns to select from the input file (by default, all columns are selected).}
34 |
35 | \item{compression}{compression algorithm. Default "snappy".}
36 |
37 | \item{compression_level}{compression level. Meaning depends on compression algorithm.}
38 |
39 | \item{partition}{String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file.
40 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}.
41 | Be careful, this argument can not be "yes" if the \code{max_memory} or \code{max_rows} arguments are not NULL.}
42 |
43 | \item{encoding}{String that indicates the character encoding for the input file.}
44 |
45 | \item{read_delim_args}{list of arguments for \code{read_delim}.}
46 |
47 | \item{...}{additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
48 | and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.}
49 | }
50 | \value{
51 | A parquet file, invisibly
52 | }
53 | \description{
54 | This function allows you to convert a csv or a txt file to parquet format. \cr
55 |
56 | Two conversion possibilities are offered :
57 |
58 | \itemize{
59 |
60 | \item{Convert to a single parquet file. Argument \code{path_to_parquet} must then be used;}
61 | \item{Convert to a partitioned parquet file. Additional arguments \code{partition} and \code{partitioning} must then be used;}
62 |
63 | }
64 | }
65 | \note{
66 | Be careful, if the zip size exceeds 4 GB, the function may truncate
67 | the data (because unzip() won't work reliably in this case -
68 | see \href{https://rdrr.io/r/utils/unzip.html}{here}).
69 | In this case, it's advised to unzip your csv/txt file by hand
70 | (for example with \href{https://www.7-zip.org/}{7-Zip})
71 | then use the function with the argument \code{path_to_file}.
72 | }
73 | \examples{
74 |
75 | # Conversion from a local csv file to a single parquet file :
76 |
77 | csv_to_parquet(
78 | path_to_file = parquetize_example("region_2022.csv"),
79 | path_to_parquet = tempfile(fileext=".parquet")
80 | )
81 |
82 | # Conversion from a local txt file to a single parquet file :
83 |
84 | csv_to_parquet(
85 | path_to_file = parquetize_example("region_2022.txt"),
86 | path_to_parquet = tempfile(fileext=".parquet")
87 | )
88 |
89 | # Conversion from a local csv file to a single parquet file and select only
90 | # few columns :
91 |
92 | csv_to_parquet(
93 | path_to_file = parquetize_example("region_2022.csv"),
94 | path_to_parquet = tempfile(fileext = ".parquet"),
95 | columns = c("REG","LIBELLE")
96 | )
97 |
98 | # Conversion from a local csv file to a partitioned parquet file :
99 |
100 | csv_to_parquet(
101 | path_to_file = parquetize_example("region_2022.csv"),
102 | path_to_parquet = tempfile(fileext = ".parquet"),
103 | partition = "yes",
104 | partitioning = c("REG")
105 | )
106 |
107 | # Conversion from a URL and a zipped file (csv) :
108 |
109 | csv_to_parquet(
110 | path_to_file = "https://www.nomisweb.co.uk/output/census/2021/census2021-ts007.zip",
111 | filename_in_zip = "census2021-ts007-ctry.csv",
112 | path_to_parquet = tempfile(fileext = ".parquet")
113 | )
114 |
115 | \dontrun{
116 | # Conversion from a URL and a zipped file (txt) :
117 |
118 | csv_to_parquet(
119 | path_to_file = "https://sourceforge.net/projects/irisdss/files/latest/download",
120 | filename_in_zip = "IRIS TEST data.txt",
121 | path_to_parquet = tempfile(fileext=".parquet")
122 | )
123 |
124 | # Conversion from a URL and a csv file with "gzip" compression :
125 |
126 | csv_to_parquet(
127 | path_to_file =
128 | "https://github.com/sidsriv/Introduction-to-Data-Science-in-python/raw/master/census.csv",
129 | path_to_parquet = tempfile(fileext = ".parquet"),
130 | compression = "gzip",
131 | compression_level = 5
132 | )
133 | }
134 | }
135 |
--------------------------------------------------------------------------------
/vignettes/aa-conversions.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Convert huge input file to parquet"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 | %\VignetteIndexEntry{aa-conversions}
6 | %\VignetteEngine{knitr::rmarkdown}
7 | %\VignetteEncoding{UTF-8}
8 | ---
9 |
10 | ```{r, include = FALSE}
11 | knitr::opts_chunk$set(
12 | collapse = TRUE,
13 | comment = "#>"
14 | )
15 | ```
16 |
17 | ```{r setup}
18 | library(parquetize)
19 | ```
20 |
21 | ## With `table_to_parquet()`
22 |
23 | For **huge input files in SAS, SPSS and Stata formats**, the parquetize package allows you to perform a clever conversion by using `max_memory` or `max_rows` in the [`table_to_parquet()`](https://ddotta.github.io/parquetize/reference/table_to_parquet.html) function.
24 | The native behavior of this function (and all other functions in the package) is to load the entire table to be converted into R and then write it to disk (in a single file or a partitioned directory).
25 |
26 | When handling very large files, a frequent risk is that the R session aborts because it cannot load the entire database into memory.
27 | This risk is even greater if you work locally on your computer; it is more limited if you work on remote servers.
28 | **`table_to_parquet()` offers this solution which answers a need expressed by parquetize users.**
29 |
30 | -------
31 | **The idea is to split the very large table into "chunks" based on the memory consumption of the input data or on the number of rows in the table, in order to be able to simultaneously :**
32 | - **read a chunk of the very large database**
33 | - **write this chunk to a parquet file**
34 | -------
35 |
36 | Here are examples from the documentation using the iris table. There are two ways to split the output files :
37 |
38 | * by memory consumption
39 | * by number of lines
40 |
41 | ### Splitting data by memory consumption
42 |
43 | `table_to_parquet` can guess the number of lines to put in a file based on the
44 | memory consumption, with the argument `max_memory` expressed in Mb.
45 |
46 | Here we cut the 150 rows into chunks of roughly 5 Kb, measured once a chunk is loaded as a
47 | tibble.
48 | In this example we get 2 parquet files (of 89 and 61 lines) called `iris1-89.parquet` and `iris90-150.parquet`.
49 |
50 | ```{r iris-memory-example}
51 | table_to_parquet(
52 | path_to_file = system.file("examples", "iris.sas7bdat", package = "haven"),
53 | path_to_parquet = tempfile(),
54 | max_memory = 5 / 1024,
55 | encoding = "utf-8"
56 | )
57 | ```
58 |
59 | In real life, you should use a `max_memory` in the Gb range, for example
60 | with a SAS file of 50 000 000 lines and using `max_memory` of 5000 Mb :
61 |
62 |
63 | ```{r real-memory-example, eval=FALSE}
64 | table_to_parquet(
65 | path_to_file = "myhugefile.sas7bdat",
66 | path_to_parquet = tempdir(),
67 | max_memory = 5000,
68 | encoding = "utf-8"
69 | )
70 | ```
71 |
72 |
73 | ### Splitting data by number of lines
74 |
75 | > Tip: each chunk must be small enough to fit in the RAM of your computer/server. Ideally, the number of chunks should also stay limited: in the tens rather than the hundreds, to keep the number of intermediate files down (see example below).
76 |
77 | Here we cut the 150 rows into 3 chunks of 50 rows. In this example we get 3 parquet files of 50 lines called `iris1-50.parquet`, `iris51-100.parquet` and `iris101-150.parquet`.
78 |
79 | ```{r iris-example, eval=FALSE}
80 | table_to_parquet(
81 | path_to_file = system.file("examples", "iris.sas7bdat", package = "haven"),
82 | path_to_parquet = tempfile(),
83 | max_rows = 50,
84 | encoding = "utf-8"
85 | )
86 | ```
87 |
88 | In real life, we can perform this kind of request with the parquetize API (for example with a SAS file of 50 000 000 lines and defining 25 chunks of 2 000 000 rows each) :
89 |
90 |
91 | ```{r real-example, eval=FALSE}
92 | table_to_parquet(
93 | path_to_file = "myhugefile.sas7bdat",
94 | path_to_parquet = tempdir(),
95 | max_rows = 2000000,
96 | encoding = "utf-8"
97 | )
98 | ```
99 |
100 | Files `myhugefile1-2000000.parquet`, `myhugefile2000001-4000000.parquet` ... will be created.
101 |
102 | ## Function `rbind_parquet()`
103 |
104 | If, at the end of the conversion with `table_to_parquet()`, **you want to reconstitute a unique initial table** and **if you have the computer resources (in RAM) to do so**, you can use the `rbind_parquet()` helper function.
105 | This function allows you to bind multiple parquet files by row.
106 | Here's an example that keeps the initial files (`delete_initial_files = FALSE`) :
107 |
108 | ```{r rbind_parquet-example, eval=FALSE}
109 | rbind_parquet(
110 | folder = tempdir(),
111 | output_name = "myhugefile",
112 | delete_initial_files = FALSE
113 | )
114 | ```
115 | The `myhugefile.parquet` file will be created from the `myhugefile1-2000000.parquet`, `myhugefile2000001-4000000.parquet`... files!
116 |
117 | ## Alternatives to `{parquetize}`
118 |
119 | Despite our best efforts, you may not be able to convert your very large database with {parquetize}.
120 | In this case, one solution is probably to turn to [duckdb](https://github.com/duckdb/duckdb-r), which offers undeniable advantages when it comes to conversion operations.
121 |
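As an illustration, here is a minimal sketch of such a conversion done directly with duckdb (the input file name is hypothetical; the SQL relies on duckdb's `read_csv_auto` and `COPY ... TO`, so the full table is never loaded into R):

```{r duckdb-example, eval=FALSE}
library(DBI)

# open an in-memory duckdb connection
con <- DBI::dbConnect(duckdb::duckdb())

# convert a (hypothetical) huge csv file to parquet entirely inside duckdb
DBI::dbExecute(con, "
  COPY (SELECT * FROM read_csv_auto('myhugefile.csv'))
  TO 'myhugefile.parquet' (FORMAT parquet)
")

DBI::dbDisconnect(con, shutdown = TRUE)
```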
--------------------------------------------------------------------------------
/tests/testthat/test-table_to_parquet.R:
--------------------------------------------------------------------------------
1 | test_that("Checks arguments are filled in", {
2 | expect_missing_argument(
3 | table_to_parquet(
4 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
5 | encoding = "utf-8"
6 | ),
7 | regexp = "path_to_parquet"
8 | )
9 |
10 | expect_missing_argument(
11 | table_to_parquet(
12 | path_to_parquet = tempfile(),
13 | encoding = "utf-8"
14 | ),
15 | regexp = "path_to_file"
16 | )
17 | })
18 |
19 | test_that("Checks we can not use chunk_size with negative skip", {
20 | expect_error(
21 | table_to_parquet(
22 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
23 | path_to_parquet = tempfile(),
24 | encoding = "utf-8",
25 | max_rows = 50,
26 | skip = -100
27 | ),
28 | class = "parquetize_bad_argument",
29 | regexp = "skip must be must be greater than"
30 | )
31 | })
32 |
33 | test_that("Checks by_chunk is deprecated", {
34 | expect_warning(
35 | table_to_parquet(
36 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
37 | path_to_parquet = tempfile(),
38 | by_chunk = TRUE,
39 | max_rows = 50
40 | ),
41 | regexp = "This argument is no longer needed"
42 | )
43 | })
44 |
45 | test_that("Checks chunk_size and chunk_memory_size are deprecated", {
46 | expect_warning(
47 | table_to_parquet(
48 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
49 | path_to_parquet = tempfile(),
50 | chunk_size = 1000
51 | ),
52 | regexp = "This argument is deprecated"
53 | )
54 |
55 | expect_warning(
56 | table_to_parquet(
57 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
58 | path_to_parquet = tempfile(),
59 | chunk_memory_size = 1000
60 | ),
61 | regexp = "This argument is deprecated"
62 | )
63 | })
64 |
65 |
66 | test_that("Checks argument columns is a character vector", {
67 | expect_error(
68 | table_to_parquet(
69 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
70 | path_to_parquet = tempfile(),
71 | columns = matrix(1:10)
72 | ),
73 | class = "parquetize_bad_type"
74 | )
75 | })
76 |
77 | test_that("Checks parquetizing all formats works and return files with the good number of lines", {
78 | for (extension in c("sas7bdat", "sav", "dta")) {
79 | path_to_parquet <- tempfile()
80 | file <- paste0("iris.", extension)
81 |
82 | expect_no_error(
83 | table_to_parquet(
84 | path_to_file = system.file("examples",file, package = "haven"),
85 | path_to_parquet = path_to_parquet
86 | )
87 | )
88 |
89 | expect_parquet(path_to_parquet, with_lines = 150)
90 | }
91 | })
92 |
93 | test_that("Checks parquetizing by chunk with encoding works", {
94 | path_to_parquet <- tempfile()
95 |
96 | expect_no_error(
97 | table_to_parquet(
98 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
99 | path_to_parquet = path_to_parquet,
100 | max_rows = 50,
101 | encoding = "utf-8"
102 | )
103 | )
104 |
105 | expect_parquet(path_to_parquet, with_lines = 150, with_files = 3)
106 | })
107 |
108 | test_that("Checks parquetizing works with partitioning", {
109 | path_to_parquet <- tempfile()
110 |
111 | expect_no_error(
112 | table_to_parquet(
113 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
114 | path_to_parquet = path_to_parquet,
115 | partition = "yes",
116 | partitioning = "Species"
117 | )
118 | )
119 | expect_parquet(
120 | path_to_parquet,
121 | with_lines = 150,
122 | with_partitions = c("Species=setosa", "Species=versic", "Species=virgin")
123 | )
124 |
125 | })
126 |
127 | test_that("Checks it fails with SAS by adding max_rows, partition and partitioning argument", {
128 | expect_error(
129 | table_to_parquet(
130 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
131 | path_to_parquet = tempfile(),
132 | max_rows = 50,
133 | partition = "yes",
134 | partitioning = "Species"
135 | ),
136 | class = "parquetize_bad_argument"
137 | )
138 | })
139 |
140 | test_that("Checks we have only selected columns in parquet file", {
141 | input_file <- system.file("examples","iris.sas7bdat", package = "haven")
142 |
143 | path_to_parquet <- tempfile()
144 | columns <- c("Species","Sepal_Length")
145 |
146 | table_to_parquet(
147 | path_to_file = input_file,
148 | path_to_parquet = path_to_parquet,
149 | columns = columns
150 | )
151 |
152 | expect_parquet(
153 | path_to_parquet,
154 | with_lines = 150,
155 | with_columns = columns
156 | )
157 | })
158 |
159 | test_that("Checks we have only selected columns in parquet dataset", {
160 | input_file <- system.file("examples","iris.sas7bdat", package = "haven")
161 | path_to_parquet <- tempfile()
162 | columns <- c("Species","Sepal_Length")
163 |
164 | table_to_parquet(
165 | path_to_file = input_file,
166 | path_to_parquet = path_to_parquet,
167 | columns = columns,
168 | max_rows = 50
169 | )
170 |
171 | expect_parquet(
172 | path_to_parquet,
173 | with_lines = 150,
174 | with_columns = columns
175 | )
176 | })
177 |
--------------------------------------------------------------------------------
/R/write_parquet_by_chunk.R:
--------------------------------------------------------------------------------
1 | #' @name write_parquet_by_chunk
2 | #'
3 | #' @title Read input by chunk with a function and create a parquet dataset \cr
4 | #'
5 | #' @description Low level function that implements the logic to read an input file by chunk and write a
6 | #' dataset. \cr
7 | #'
8 | #' It will:
9 | #'
10 | #' \itemize{
11 | #' \item{calculate the number of rows by chunk if needed;}
12 | #' \item{loop over the input file by chunk;}
13 | #' \item{write each output file.}
14 | #' }
15 | #'
16 | #' @param read_method a method to read input files. This method takes only three
17 | #' arguments:
18 | #'
19 | #' `input` : some kind of data. Can be a file's path or a data.frame, for example
20 | #' `skip` : the number of rows to skip
21 | #' `n_max` : the maximum number of rows to return
22 | #'
23 | #' This method will be called until it returns a dataframe/tibble with zero rows.
24 | #'
25 | #' @param input the input passed to `read_method`. It can be anything you
26 | #' want, but most often a file's path or a data.frame.
27 | #' @param path_to_parquet String that indicates the path to the directory where
28 | #' the output parquet file or dataset will be stored.
29 | #' @param max_memory Memory size (in Mb) in which data of one parquet file
30 | #' should roughly fit.
31 | #' @param max_rows Number of lines that defines the size of the chunk. This
32 | #' argument can not be filled in if max_memory is used.
33 | #' @param chunk_memory_sample_lines Number of lines to read to evaluate
34 | #' max_memory. Default to 10 000.
35 | #' @param compression compression algorithm. Default "snappy".
36 | #' @param compression_level compression level. Meaning depends on compression algorithm.
37 | #' @param ... Additional format-specific arguments, see
38 | #' \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
39 | #'
40 | #' @return a dataset, as returned by arrow::open_dataset
41 | #' @export
42 | #'
43 | #' @examples
44 | #'
45 | #' # example with a dataframe
46 | #'
47 | #' # we create the function to loop over the data.frame
48 | #'
49 | #' read_method <- function(input, skip = 0L, n_max = Inf) {
50 | #' # if we are after the end of the input we return an empty data.frame
51 | #' if (skip+1 > nrow(input)) { return(data.frame()) }
52 | #'
53 | #' # return the n_max row from skip + 1
54 | #' input[(skip+1):(min(skip+n_max, nrow(input))),]
55 | #' }
56 | #'
57 | #' # we use it
58 | #'
59 | #' write_parquet_by_chunk(
60 | #' read_method = read_method,
61 | #' input = mtcars,
62 | #' path_to_parquet = tempfile(),
63 | #' max_rows = 10,
64 | #' )
65 | #'
66 | #'
67 | #' #
68 | #' # Example with haven::read_sas
69 | #' #
70 | #'
71 | #' # we need to pass two arguments beside the 3 (input, skip and n_max).
72 | #' # We will use a closure :
73 | #'
74 | #' my_read_closure <- function(encoding, columns) {
75 | #' function(input, skip = 0L, n_max = Inf) {
76 | #' haven::read_sas(data_file = input,
77 | #' n_max = n_max,
78 | #' skip = skip,
79 | #' encoding = encoding,
80 | #' col_select = all_of(columns))
81 | #' }
82 | #' }
83 | #'
84 | #' # we initialize the closure
85 | #'
86 | #' read_method <- my_read_closure(encoding = "WINDOWS-1252", columns = c("Species", "Petal_Width"))
87 | #'
88 | #' # we use it
89 | #' write_parquet_by_chunk(
90 | #' read_method = read_method,
91 | #' input = system.file("examples","iris.sas7bdat", package = "haven"),
92 | #' path_to_parquet = tempfile(),
93 | #' max_rows = 75,
94 | #' )
95 | #'
96 | write_parquet_by_chunk <- function(
97 | read_method,
98 | input,
99 | path_to_parquet,
100 | max_rows = NULL,
101 | max_memory = NULL,
102 | chunk_memory_sample_lines = 10000,
103 | compression = "snappy",
104 | compression_level = NULL,
105 | ...
106 | ) {
107 | if (missing(read_method)) {
108 | cli_abort("Be careful, read_method argument is mandatory", class = "parquetize_missing_argument")
109 | }
110 |
111 | if (!is.function(read_method)) {
112 | cli_abort("Be careful, read_method must be a function", class = "parquetize_bad_argument")
113 | }
114 |
115 | if (missing(input)) {
116 | cli_abort("Be careful, input argument is mandatory", class = "parquetize_missing_argument")
117 | }
118 |
119 | # max_rows and max_memory can not be used together so fails
120 | if (!is.null(max_rows) & !is.null(max_memory)) {
121 | cli_abort("Be careful, max_rows and max_memory can not be used together", class = "parquetize_bad_argument")
122 | }
123 |
124 | if (is.null(max_rows)) {
125 | data <- read_method(input, n_max = chunk_memory_sample_lines)
126 | max_rows <- get_lines_for_memory(data,
127 | max_memory = max_memory)
128 | }
129 |
130 | dir.create(path_to_parquet, showWarnings = FALSE, recursive = TRUE)
131 |
132 | parquetname <- tools::file_path_sans_ext(basename(path_to_parquet))
133 |
134 | skip <- 0
135 | while (TRUE) {
136 | Sys.sleep(0.01)
137 | cli_progress_message("Reading data...")
138 |
139 | tbl <- read_method(input, skip = skip, n_max = max_rows)
140 | if (nrow(tbl) != 0) {
141 | Sys.sleep(0.01)
142 | parquetizename <- glue::glue("{parquetname}-{skip+1}-{skip+nrow(tbl)}.parquet")
143 | cli_progress_message("Writing {parquetizename}...")
144 | write_parquet(tbl,
145 | sink = file.path(path_to_parquet,
146 | parquetizename),
147 | compression = compression,
148 | compression_level = compression_level,
149 | ...
150 | )
151 | }
152 | skip <- skip + nrow(tbl)
153 | if (nrow(tbl) < max_rows) { break }
154 | }
155 |
156 | Sys.sleep(0.01)
157 | cli_alert_success("\nData are available in parquet dataset under {path_to_parquet}/")
158 |
159 | invisible(arrow::open_dataset(path_to_parquet))
160 | }
161 |
--------------------------------------------------------------------------------
/R/dbi_to_parquet.R:
--------------------------------------------------------------------------------
1 | #' @name dbi_to_parquet
2 | #'
3 | #' @title Convert a SQL Query on a DBI connection to parquet format
4 | #'
5 | #' @description This function allows you to convert the result of a SQL query on a DBI connection to parquet format.\cr
6 | #'
7 | #' It handles all DBI supported databases.
8 | #'
9 | #' Two conversion possibilities are offered :
10 | #'
11 | #'\itemize{
12 | #'
13 | #' \item{Convert to a single parquet file. Argument `path_to_parquet` must then be used;}
14 | #' \item{Convert to a partitioned parquet file. Additional arguments `partition` and `partitioning` must then be used;}
15 | #'
16 | #' }
17 | #'
18 | #' Examples explain how to convert a query to a chunked dataset
19 | #'
20 | #' @param conn A DBIConnection object, as returned by DBI::dbConnect
21 | #' @param sql_query a character string containing an SQL query (this argument is passed to DBI::dbSendQuery)
22 | #' @inheritParams table_to_parquet
23 | #' @param ... additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
24 | #' and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.
25 | #' @return A parquet file, invisibly
26 | #'
27 | #' @export
28 | #'
29 | #' @examples
30 | #'
31 | #' # Conversion from a sqlite dbi connection to a single parquet file :
32 | #'
33 | #' dbi_connection <- DBI::dbConnect(RSQLite::SQLite(),
34 | #' system.file("extdata","iris.sqlite",package = "parquetize"))
35 | #'
36 | #' # Reading iris table from local sqlite database
37 | #' # and conversion to one parquet file :
38 | #'
39 | #' dbi_to_parquet(
40 | #' conn = dbi_connection,
41 | #' sql_query = "SELECT * FROM iris",
42 | #' path_to_parquet = tempfile(fileext=".parquet"),
43 | #' )
44 | #'
45 | #' # Reading iris table from local sqlite database by chunk (using
46 | #' # `max_memory` argument) and conversion to multiple parquet files
47 | #'
48 | #' dbi_to_parquet(
49 | #' conn = dbi_connection,
50 | #' sql_query = "SELECT * FROM iris",
51 | #' path_to_parquet = tempdir(),
52 | #' max_memory = 2 / 1024
53 | #' )
54 | #'
55 | #' # Using chunk and partition together is not possible directly but easy to do :
56 | #' # Reading iris table from local sqlite database by chunk (using
57 | #' # `max_memory` argument) and conversion to arrow dataset partitioned by
58 | #' # species
59 | #'
60 | #' # get unique values of column "Species" from table "iris"
61 | #' partitions <- get_partitions(dbi_connection, table = "iris", column = "Species")
62 | #'
63 | #' # loop over those values
64 | #' for (species in partitions) {
65 | #' dbi_to_parquet(
66 | #' conn = dbi_connection,
67 | #' # use glue_sql to create the query filtering the partition
68 | #' sql_query = glue::glue_sql("SELECT * FROM iris where Species = {species}",
69 | #' .con = dbi_connection),
70 | #' # add the partition name in the output dir to respect parquet partition schema
71 | #' path_to_parquet = file.path(tempdir(), "iris", paste0("Species=", species)),
72 | #' max_memory = 2 / 1024,
73 | #' )
74 | #' }
75 | #'
76 | #' # If you need a more complicated query to get your partitions, you can use
77 | #' # dbGetQuery directly :
78 | #' col_to_partition <- DBI::dbGetQuery(dbi_connection, "SELECT distinct(`Species`) FROM `iris`")[,1]
79 | #'
80 | dbi_to_parquet <- function(
81 | conn,
82 | sql_query,
83 | path_to_parquet,
84 | max_memory,
85 | max_rows,
86 | chunk_memory_sample_lines = 10000,
87 | partition = "no",
88 | compression = "snappy",
89 | compression_level = NULL,
90 | ...
91 | ) {
92 | if (missing(conn)) {
93 | cli_abort("Be careful, the argument conn must be filled in", class = "parquetize_missing_argument")
94 | }
95 |
96 | # Check if sql_query is missing
97 | if (missing(sql_query)) {
98 | cli_abort("Be careful, the argument sql_query must be filled in", class = "parquetize_missing_argument")
99 | }
100 |
101 | # Check if path_to_parquet is missing
102 | if (missing(path_to_parquet)) {
103 | cli_abort("Be careful, the argument path_to_parquet must be filled in", class = "parquetize_missing_argument")
104 | }
105 |
106 | by_chunk <- !(missing(max_rows) & missing(max_memory))
107 |
108 | if (by_chunk == TRUE) {
109 |
110 | dir.create(path_to_parquet, showWarnings = FALSE, recursive = TRUE)
111 |
112 | if (missing(max_rows)) {
113 | # create the query and send it to the DB
114 | result <- dbSendQuery(conn, sql_query)
115 | # fetch a sample of result
116 | data <- dbFetch(result, n = chunk_memory_sample_lines)
117 | # close the query in DB
118 | dbClearResult(result)
119 |
120 | max_rows <- get_lines_for_memory(data,
121 | max_memory = max_memory)
122 | }
123 |
124 | result <- dbSendQuery(conn, sql_query)
125 | on.exit(dbClearResult(result))
126 |
127 | skip <- 0
128 | while (!dbHasCompleted(result)) {
129 | Sys.sleep(0.01)
130 | cli_progress_message("Reading data...")
131 | data <- dbFetch(result, n = max_rows)
132 |
133 | parquetizename <- glue::glue("part-{skip+1}-{skip+nrow(data)}.parquet")
134 | Sys.sleep(0.01)
135 | cli_progress_message("Writing data in {parquetizename}...")
136 | write_parquet(data,
137 | sink = file.path(path_to_parquet,
138 | parquetizename),
139 | compression = compression,
140 | compression_level = compression_level,
141 | ...
142 | )
143 | skip <- skip + nrow(data)
144 | }
145 | cli_alert_success("\nParquet dataset is available under {path_to_parquet}/")
146 | return(invisible(TRUE))
147 | }
148 |
149 | result <- dbSendQuery(conn, sql_query)
150 | on.exit(dbClearResult(result))
151 |
152 | Sys.sleep(0.01)
153 | cli_progress_message("Reading data...")
154 | output <- dbFetch(result)
155 |
156 | parquetfile <- write_parquet_at_once(
157 | output,
158 | path_to_parquet,
159 | partition,
160 | compression,
161 | compression_level,
162 | ...)
163 |
164 | return(invisible(parquetfile))
165 | }
166 |
--------------------------------------------------------------------------------
/R/csv_to_parquet.R:
--------------------------------------------------------------------------------
1 | #' @name csv_to_parquet
2 | #' @title Convert a csv or a txt file to parquet format
3 | #'
4 | #' @description This function allows you to convert a csv or a txt file to parquet format. \cr
5 | #'
6 | #' Two conversion possibilities are offered :
7 | #'
8 | #'\itemize{
9 | #'
10 | #' \item{Convert to a single parquet file. Argument `path_to_parquet` must then be used;}
11 | #' \item{Convert to a partitioned parquet file. Additional arguments `partition` and `partitioning` must then be used;}
12 | #'
13 | #' }
14 | #'
15 | #' @param filename_in_zip name of the csv/txt file in the zip. Required if several csv/txt are included in the zip.
16 | #' @param url_to_csv DEPRECATED use path_to_file instead
17 | #' @param csv_as_a_zip DEPRECATED
18 | #' @inheritParams table_to_parquet
19 | #' @param read_delim_args list of arguments for `read_delim`.
20 | #' @param ... additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
21 | #' and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.
22 | #'
23 | #' @note Be careful, if the zip size exceeds 4 GB, the function may truncate
24 | #' the data (because unzip() won't work reliably in this case -
25 | #' see \href{https://rdrr.io/r/utils/unzip.html}{here}).
26 | #' In this case, it's advised to unzip your csv/txt file by hand
27 | #' (for example with \href{https://www.7-zip.org/}{7-Zip})
28 | #' then use the function with the argument `path_to_file`.
29 | #'
30 | #' @return A parquet file, invisibly
31 | #'
32 | #' @export
33 | #'
34 | #' @examples
35 | #'
36 | #' # Conversion from a local csv file to a single parquet file :
37 | #'
38 | #' csv_to_parquet(
39 | #' path_to_file = parquetize_example("region_2022.csv"),
40 | #' path_to_parquet = tempfile(fileext=".parquet")
41 | #' )
42 | #'
43 | #' # Conversion from a local txt file to a single parquet file :
44 | #'
45 | #' csv_to_parquet(
46 | #' path_to_file = parquetize_example("region_2022.txt"),
47 | #' path_to_parquet = tempfile(fileext=".parquet")
48 | #' )
49 | #'
50 | #' # Conversion from a local csv file to a single parquet file and select only
51 | #' # few columns :
52 | #'
53 | #' csv_to_parquet(
54 | #' path_to_file = parquetize_example("region_2022.csv"),
55 | #' path_to_parquet = tempfile(fileext = ".parquet"),
56 | #' columns = c("REG","LIBELLE")
57 | #' )
58 | #'
59 | #' # Conversion from a local csv file to a partitioned parquet file :
60 | #'
61 | #' csv_to_parquet(
62 | #' path_to_file = parquetize_example("region_2022.csv"),
63 | #' path_to_parquet = tempfile(fileext = ".parquet"),
64 | #' partition = "yes",
65 | #' partitioning = c("REG")
66 | #' )
67 | #'
68 | #' # Conversion from a URL and a zipped file (csv) :
69 | #'
70 | #' csv_to_parquet(
71 | #' path_to_file = "https://www.nomisweb.co.uk/output/census/2021/census2021-ts007.zip",
72 | #' filename_in_zip = "census2021-ts007-ctry.csv",
73 | #' path_to_parquet = tempfile(fileext = ".parquet")
74 | #' )
75 | #'
76 | #' \dontrun{
77 | #' # Conversion from a URL and a zipped file (txt) :
78 | #'
79 | #' csv_to_parquet(
80 | #' path_to_file = "https://sourceforge.net/projects/irisdss/files/latest/download",
81 | #' filename_in_zip = "IRIS TEST data.txt",
82 | #' path_to_parquet = tempfile(fileext=".parquet")
83 | #' )
84 | #'
85 | #' # Conversion from a URL and a csv file with "gzip" compression :
86 | #'
87 | #' csv_to_parquet(
88 | #' path_to_file =
89 | #' "https://github.com/sidsriv/Introduction-to-Data-Science-in-python/raw/master/census.csv",
90 | #' path_to_parquet = tempfile(fileext = ".parquet"),
91 | #' compression = "gzip",
92 | #' compression_level = 5
93 | #' )
94 | #' }
95 | csv_to_parquet <- function(
96 | path_to_file,
97 | url_to_csv = lifecycle::deprecated(),
98 | csv_as_a_zip = lifecycle::deprecated(),
99 | filename_in_zip,
100 | path_to_parquet,
101 | columns = "all",
102 | compression = "snappy",
103 | compression_level = NULL,
104 | partition = "no",
105 | encoding = "UTF-8",
106 | read_delim_args = list(),
107 | ...
108 | ) {
109 | if (!missing(url_to_csv)) {
110 | lifecycle::deprecate_warn(
111 | when = "0.5.5",
112 | what = "csv_to_parquet(url_to_csv)",
113 | details = "This argument is replaced by path_to_file."
114 | )
115 | }
116 |
117 | if (!missing(csv_as_a_zip)) {
118 | lifecycle::deprecate_warn(
119 | when = "0.5.5",
120 | what = "csv_to_parquet(csv_as_a_zip)",
121 | details = "This argument is no longer needed, parquetize detect zip file by extension."
122 | )
123 | }
124 |
125 | # Check if at least one of the two arguments path_to_file or url_to_csv is set
126 | if (missing(path_to_file) & missing(url_to_csv)) {
127 | cli_abort("Be careful, you have to fill the path_to_file argument", class = "parquetize_missing_argument")
128 | }
129 |
130 | # Check if path_to_parquet is missing
131 | if (missing(path_to_parquet)) {
132 | cli_abort("Be careful, the argument path_to_parquet must be filled in", class = "parquetize_missing_argument")
133 | }
134 |
135 | # Check if columns argument is a character vector
136 | if (isFALSE(is.vector(columns) & is.character(columns))) {
137 | cli_abort(c("Be careful, the argument columns must be a character vector",
138 | 'You can use `all` or `c("col1", "col2")`'),
139 | class = "parquetize_bad_argument")
140 | }
141 |
142 | if (missing(path_to_file)) {
143 | path_to_file <- url_to_csv
144 | }
145 |
146 | input_file <- download_extract(path_to_file, filename_in_zip)
147 |
148 | Sys.sleep(0.01)
149 | cli_progress_message("Reading data...")
150 |
151 | csv_output <- inject(
152 | read_delim(
153 | file = input_file,
154 | locale = locale(encoding = encoding),
155 | lazy = TRUE,
156 | show_col_types = FALSE,
157 | col_select = if (identical(columns,"all")) everything() else all_of(columns),
158 | !!!read_delim_args
159 | )
160 | )
161 |
162 | dataset <- write_parquet_at_once(
163 | csv_output,
164 | path_to_parquet,
165 | partition,
166 | compression,
167 | compression_level,
168 | ...)
169 |
170 | return(invisible(dataset))
171 | }
172 |
--------------------------------------------------------------------------------
/man/table_to_parquet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/table_to_parquet.R
3 | \name{table_to_parquet}
4 | \alias{table_to_parquet}
5 | \title{Convert an input file to parquet format}
6 | \usage{
7 | table_to_parquet(
8 | path_to_file,
9 | path_to_parquet,
10 | max_memory = NULL,
11 | max_rows = NULL,
12 | chunk_size = lifecycle::deprecated(),
13 | chunk_memory_size = lifecycle::deprecated(),
14 | columns = "all",
15 | by_chunk = lifecycle::deprecated(),
16 | skip = 0,
17 | partition = "no",
18 | encoding = NULL,
19 | chunk_memory_sample_lines = 10000,
20 | compression = "snappy",
21 | compression_level = NULL,
22 | user_na = FALSE,
23 | ...
24 | )
25 | }
26 | \arguments{
27 | \item{path_to_file}{String that indicates the path to the input file (don't forget the extension).}
28 |
29 | \item{path_to_parquet}{String that indicates the path to the directory where the parquet files will be stored.}
30 |
31 | \item{max_memory}{Memory size (in Mb) in which data of one parquet file should roughly fit.}
32 |
33 | \item{max_rows}{Number of lines that defines the size of the chunk.
34 | This argument can not be filled in if max_memory is used.}
35 |
36 | \item{chunk_size}{DEPRECATED use max_rows}
37 |
38 | \item{chunk_memory_size}{DEPRECATED use max_memory}
39 |
40 | \item{columns}{Character vector of columns to select from the input file (by default, all columns are selected).}
41 |
42 | \item{by_chunk}{DEPRECATED use max_memory or max_rows instead}
43 |
44 | \item{skip}{By default 0. This argument must be filled in if \code{by_chunk} is TRUE. Number of lines to ignore when converting.}
45 |
46 | \item{partition}{String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file.
47 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}.
48 | Be careful, this argument can not be "yes" if the \code{max_memory} or \code{max_rows} arguments are not NULL.}
49 |
50 | \item{encoding}{String that indicates the character encoding for the input file.}
51 |
52 | \item{chunk_memory_sample_lines}{Number of lines to read to evaluate max_memory. Default to 10 000.}
53 |
54 | \item{compression}{compression algorithm. Default "snappy".}
55 |
56 | \item{compression_level}{compression level. Meaning depends on compression algorithm.}
57 |
58 | \item{user_na}{If \code{TRUE} variables with user defined missing will be read
59 | into \code{\link[haven:labelled_spss]{haven::labelled_spss()}} objects. If \code{FALSE}, the default, user-defined missings will be converted to \code{NA}.}
60 |
61 | \item{...}{Additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
62 | and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.}
63 | }
64 | \value{
65 | Parquet files, invisibly
66 | }
67 | \description{
68 | This function allows you to convert an input file to parquet format. \cr
69 |
70 | It handles SAS, SPSS and Stata files with the same function. There is only one function to use for these 3 cases.
71 | For these 3 cases, the function guesses the data format using the extension of the input file (in the \code{path_to_file} argument). \cr
72 |
73 | Two conversion possibilities are offered:
74 |
75 | \itemize{
76 |
77 | \item{Convert to a single parquet file. Argument \code{path_to_parquet} must then be used;}
78 | \item{Convert to a partitioned parquet file. Additional arguments \code{partition} and \code{partitioning} must then be used;}
79 |
80 | }
81 |
82 | To avoid overloading R's RAM, the conversion can be done by chunk. One of the arguments \code{max_memory} or \code{max_rows} must then be used.
83 | This is very useful for huge tables and for computers with little RAM because the conversion is then done
84 | with less memory consumption. For more information, see \href{https://ddotta.github.io/parquetize/articles/aa-conversions.html}{here}.
85 | }
86 | \examples{
87 | # Conversion from a SAS file to a single parquet file :
88 |
89 | table_to_parquet(
90 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
91 | path_to_parquet = tempfile(fileext = ".parquet")
92 | )
93 |
94 | # Conversion from a SPSS file to a single parquet file :
95 |
96 | table_to_parquet(
97 | path_to_file = system.file("examples","iris.sav", package = "haven"),
98 | path_to_parquet = tempfile(fileext = ".parquet"),
99 | )
100 | # Conversion from a Stata file to a single parquet file without progress bar :
101 |
102 | table_to_parquet(
103 | path_to_file = system.file("examples","iris.dta", package = "haven"),
104 | path_to_parquet = tempfile(fileext = ".parquet")
105 | )
106 |
107 | # Reading SPSS file by chunk (using `max_rows` argument)
108 | # and conversion to multiple parquet files :
109 |
110 | table_to_parquet(
111 | path_to_file = system.file("examples","iris.sav", package = "haven"),
112 | path_to_parquet = tempfile(),
113 | max_rows = 50,
114 | )
115 |
116 | # Reading SPSS file by chunk (using `max_memory` argument)
117 | # and conversion to multiple parquet files of 5 Kb when loaded (5 Mb / 1024)
118 | # (in real files, you should use a bigger value that fits in memory, like 3000
119 | # or 4000):
120 |
121 | table_to_parquet(
122 | path_to_file = system.file("examples","iris.sav", package = "haven"),
123 | path_to_parquet = tempfile(),
124 | max_memory = 5 / 1024
125 | )
126 |
127 | # Reading SAS file by chunk of 50 lines with encoding
128 | # and conversion to multiple files :
129 |
130 | table_to_parquet(
131 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
132 | path_to_parquet = tempfile(),
133 | max_rows = 50,
134 | encoding = "utf-8"
135 | )
136 |
137 | # Conversion from a SAS file to a single parquet file and select only
138 | # few columns :
139 |
140 | table_to_parquet(
141 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
142 | path_to_parquet = tempfile(fileext = ".parquet"),
143 | columns = c("Species","Petal_Length")
144 | )
145 |
146 | # Conversion from a SAS file to a partitioned parquet file :
147 |
148 | table_to_parquet(
149 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
150 | path_to_parquet = tempfile(),
151 | partition = "yes",
152 |   partitioning = c("Species") # vector used as the partition key
153 | )
154 |
155 | # Reading SAS file by chunk of 50 lines
156 | # and conversion to multiple files with zstd, compression level 10
157 |
158 | if (isTRUE(arrow::arrow_info()$capabilities[['zstd']])) {
159 | table_to_parquet(
160 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
161 | path_to_parquet = tempfile(),
162 | max_rows = 50,
163 | compression = "zstd",
164 | compression_level = 10
165 | )
166 | }
167 | }
168 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # parquetize (WIP)
2 |
3 | This release includes :
4 |
5 | - `{parquetize}` now has a new `get_parquet_info` function for retrieving metadata from parquet files. This function is particularly useful for checking row group sizes (added by @nbc).
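   | 
   | A minimal sketch of a call, assuming it simply takes the path to a parquet file like the package's other helpers:
   | 
   | ```{r}
   | # inspect metadata (including row group sizes) of the parquet file shipped with the package
   | get_parquet_info(parquetize_example("iris.parquet"))
   | ```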
6 |
7 | # parquetize 0.5.7
8 |
9 | This release includes :
10 |
11 | - bugfix by @leungi: remove single quotes in the SQL statement that generates incorrect SQL syntax for connections of type Microsoft SQL Server #45
12 | - `{parquetize}` now has a minimal version (2.4.0) for `{haven}` dependency package to ensure that conversions are performed correctly from SAS files compressed in BINARY mode #46
13 | - `csv_to_parquet` now has a `read_delim_args` argument, allowing arguments to be passed to `read_delim` (added by @nikostr); see the sketch below.
14 | - `table_to_parquet` can now convert files with uppercase extensions (.SAS7BDAT, .SAV, .DTA)
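   | 
   | A minimal sketch of `read_delim_args` in use (the arguments are forwarded as-is to `readr::read_delim()`):
   | 
   | ```{r}
   | csv_to_parquet(
   |   path_to_file = parquetize_example("region_2022.csv"),
   |   path_to_parquet = tempfile(fileext = ".parquet"),
   |   read_delim_args = list(trim_ws = TRUE) # any read_delim() argument can go here
   | )
   | ```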
15 |
16 |
17 | # parquetize 0.5.6.1
18 |
19 | This release includes :
20 |
21 | #### fst_to_parquet function
22 |
23 | - a new [fst_to_parquet](https://ddotta.github.io/parquetize/reference/fst_to_parquet.html) function that converts a fst file to parquet format.
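   | 
   | A minimal sketch, using the fst file shipped in `inst/extdata`:
   | 
   | ```{r}
   | fst_to_parquet(
   |   path_to_file = parquetize_example("iris.fst"),
   |   path_to_parquet = tempfile(fileext = ".parquet")
   | )
   | ```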
24 |
25 | #### Other
26 |
27 | - Rely more on `@inheritParams` to simplify the documentation of function arguments #38. This leads to some renaming of arguments (e.g. `path_to_csv` -> `path_to_file`...)
28 | - Arguments `compression` and `compression_level` are now passed to the write_parquet_at_once and write_parquet_by_chunk functions and are available in the main conversion functions of `parquetize` #36
29 | - Group `@importFrom` directives in a single file to facilitate their maintenance #37
30 | - work on download_extract tests #43
31 |
32 | # parquetize 0.5.6
33 |
34 | This release includes :
35 |
36 | #### Possibility to use a RDBMS as source
37 |
38 | You can convert to parquet any query you want on any DBI-compatible RDBMS:
39 |
40 | ```{r}
41 | dbi_connection <- DBI::dbConnect(RSQLite::SQLite(),
42 | system.file("extdata","iris.sqlite",package = "parquetize"))
43 |
44 | # Reading iris table from local sqlite database
45 | # and conversion to one parquet file :
46 | dbi_to_parquet(
47 | conn = dbi_connection,
48 | sql_query = "SELECT * FROM iris",
49 | path_to_parquet = tempdir(),
50 | parquetname = "iris"
51 | )
52 | ```
53 |
54 | You can find more information in the
55 | [`dbi_to_parquet`](https://ddotta.github.io/parquetize/reference/dbi_to_parquet.html) documentation.
56 |
57 | #### check_parquet function
58 |
59 | - a new [check_parquet](https://ddotta.github.io/parquetize/reference/check_parquet.html) function that checks if a dataset/file is valid and returns its columns and arrow types
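   | 
   | For example, a quick sketch on the single parquet file shipped with the package:
   | 
   | ```{r}
   | # returns the columns and their arrow types if the file is a valid parquet file
   | check_parquet(parquetize_example("iris.parquet"))
   | ```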
60 |
61 | #### Deprecations
62 |
63 | Two arguments are deprecated to avoid confusion with arrow concepts and to keep naming consistent (see the sketch after this list):
64 |
65 | * `chunk_size` is replaced by `max_rows` (chunk size is an arrow concept).
66 | * `chunk_memory_size` is replaced by `max_memory` for consistency
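   | 
   | A sketch of the renamed arguments in use:
   | 
   | ```{r}
   | # before: chunk_size = 50 (and chunk_memory_size for a size in Mb)
   | table_to_parquet(
   |   path_to_file = system.file("examples", "iris.sas7bdat", package = "haven"),
   |   path_to_parquet = tempfile(),
   |   max_rows = 50
   | )
   | ```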
67 |
68 | #### Other
69 |
70 | - refactoring: extract the logic to write parquet files by chunk or at once into write_parquet_by_chunk and write_parquet_at_once
71 | - a big test refactoring: all _to_parquet output files are formally validated (readable as parquet, number of lines, partitions, number of files).
72 | - use cli_abort instead of cli_alert_danger with stop("") everywhere
73 | - some minor changes
74 | - bugfix: table_to_parquet did not select columns as expected
75 | - bugfix: skip_if_offline tests with download
76 |
77 | # parquetize 0.5.5
78 |
79 | This release includes :
80 |
81 | #### A very important new contributor to `parquetize`!
82 | 
83 | Due to their numerous contributions, @nbc is now officially one of the project authors!
84 |
85 | #### Three arguments deprecation
86 |
87 | After a big refactoring, three arguments are deprecated:
88 | 
89 | * `by_chunk`: `table_to_parquet` will automatically chunk if you use one of `chunk_memory_size` or `chunk_size`.
90 | * `csv_as_a_zip`: `csv_to_table` will detect if the file is a zip from its extension.
91 | * `url_to_csv`: use `path_to_csv` instead; `csv_to_table` will detect from the file path whether the file is remote.
92 |
93 | They will raise a deprecation warning for the moment.
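   | 
   | A sketch of the new style at the time of this release (note that `path_to_csv` was itself later renamed `path_to_file`):
   | 
   | ```{r}
   | # works the same whether path_to_csv points to a local file, a URL or a zip
   | csv_to_parquet(
   |   path_to_csv = parquetize_example("region_2022.csv"),
   |   path_to_parquet = tempdir()
   | )
   | ```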
94 |
95 | #### Chunking by memory size
96 |
97 | The possibility to chunk parquet output by memory size with `table_to_parquet()`:
98 | `table_to_parquet()` takes a `chunk_memory_size` argument to convert an input
99 | file into parquet files of roughly `chunk_memory_size` Mb each when the data are
100 | loaded in memory.
101 |
102 | Argument `by_chunk` is deprecated (see above).
103 |
104 | Example of use of the argument `chunk_memory_size`:
105 |
106 | ```{r}
107 | table_to_parquet(
108 | path_to_table = system.file("examples","iris.sas7bdat", package = "haven"),
109 | path_to_parquet = tempdir(),
110 | chunk_memory_size = 5000, # this will create files of around 5Gb when loaded in memory
111 | )
112 | ```
113 |
114 | #### Passing argument like compression to `write_parquet` when chunking
115 |
116 | Users can now pass arguments to `write_parquet()` when
117 | chunking (via the ellipsis). This can be used, for example, to pass
118 | `compression` and `compression_level`.
119 |
120 | Example:
121 |
122 | ```{r}
123 | table_to_parquet(
124 | path_to_table = system.file("examples","iris.sas7bdat", package = "haven"),
125 | path_to_parquet = tempdir(),
126 | compression = "zstd",
127 | compression_level = 10,
128 | chunk_memory_size = 5000
129 | )
130 | ```
131 |
132 | #### A new function `download_extract`
133 |
134 | This function is added to ... download and unzip a file if needed.
135 |
136 | ```{r}
137 | file_path <- download_extract(
138 | "https://www.nomisweb.co.uk/output/census/2021/census2021-ts007.zip",
139 | filename_in_zip = "census2021-ts007-ctry.csv"
140 | )
141 | csv_to_parquet(
142 | file_path,
143 | path_to_parquet = tempdir()
144 | )
145 | ```
146 |
147 | #### Other
148 |
149 | Under the hood, this release hardens the tests.
150 |
151 | # parquetize 0.5.4
152 |
153 | This release fixes an error when converting a SAS file by chunk.
154 |
155 | # parquetize 0.5.3
156 |
157 | This release includes :
158 |
159 | - Added column selection to the `table_to_parquet()` and `csv_to_parquet()` functions #20
160 | - The parquet example files for the iris table have been moved to the `inst/extdata` directory.
161 |
162 | # parquetize 0.5.2
163 |
164 | This release includes :
165 |
166 | - The behaviour of the `table_to_parquet()` function has been fixed when the argument `by_chunk` is TRUE.
167 |
168 | # parquetize 0.5.1
169 |
170 | This release removes the `duckdb_to_parquet()` function on the advice of Brian Ripley from CRAN.
171 | Indeed, DuckDB's storage format is not yet stable; it will be stabilized when version 1.0 is released.
172 |
173 | # parquetize 0.5.0
174 |
175 | This release includes corrections for CRAN submission.
176 |
177 | # parquetize 0.4.0
178 |
179 | **This release includes an important feature :**
180 |
181 | The `table_to_parquet()` function can now convert tables to parquet format with less memory consumption.
182 | This is useful for huge tables and for computers with little RAM (#15).
183 | A vignette has been written about it. See [here](https://ddotta.github.io/parquetize/articles/aa-conversions.html).
184 |
185 | * Removal of the `nb_rows` argument in the `table_to_parquet()` function
186 | * Replaced by new arguments `by_chunk`, `chunk_size` and `skip` (see documentation)
187 | * Progress bars are now managed with [{cli} package](https://github.com/r-lib/cli)
188 |
189 | # parquetize 0.3.0
190 |
191 | * Added `duckdb_to_parquet()` function to convert duckdb files to parquet format.
192 | * Added `sqlite_to_parquet()` function to convert sqlite files to parquet format (see the sketch below).
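   | 
   | A sketch of `sqlite_to_parquet()` with the current argument names (assuming the table is selected with `table_in_sqlite`), using the sqlite file shipped in `inst/extdata`:
   | 
   | ```{r}
   | sqlite_to_parquet(
   |   path_to_file = parquetize_example("iris.sqlite"),
   |   table_in_sqlite = "iris",
   |   path_to_parquet = tempfile(fileext = ".parquet")
   | )
   | ```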
193 |
194 | # parquetize 0.2.0
195 |
196 | * Added `rds_to_parquet()` function to convert rds files to parquet format (see the sketch after this list).
197 | * Added `json_to_parquet()` function to convert json and ndjson files to parquet format.
198 | * Added the possibility to convert a csv file to a partitioned parquet file.
199 | * Improving code coverage (#9)
200 | * Check if `path_to_parquet` exists in functions `csv_to_parquet()` or `table_to_parquet()` (@py-b)
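   | 
   | A sketch of the two new converters with the current argument names, using the files shipped in `inst/extdata`:
   | 
   | ```{r}
   | rds_to_parquet(
   |   path_to_file = parquetize_example("iris.rds"),
   |   path_to_parquet = tempfile(fileext = ".parquet")
   | )
   | 
   | json_to_parquet(
   |   path_to_file = parquetize_example("iris.json"),
   |   path_to_parquet = tempfile(fileext = ".parquet")
   | )
   | ```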
201 |
202 |
203 | # parquetize 0.1.0
204 |
205 | * Added `table_to_parquet()` function to convert SAS, SPSS and Stata files to parquet format.
206 | * Added `csv_to_parquet()` function to convert csv files to parquet format.
207 | * Added `parquetize_example()` function to get path to package data examples.
208 | * Added a `NEWS.md` file to track changes to the package.
209 |
--------------------------------------------------------------------------------
/R/table_to_parquet.R:
--------------------------------------------------------------------------------
1 | #' @name table_to_parquet
2 | #'
3 | #' @title Convert an input file to parquet format
4 | #'
5 | #' @description This function allows you to convert an input file to parquet format. \cr
6 | #'
7 | #' It handles SAS, SPSS and Stata files with the same function. There is only one function to use for these 3 cases.
8 | #' For these 3 cases, the function guesses the data format using the extension of the input file (in the `path_to_file` argument). \cr
9 | #'
10 | #' Two conversion possibilities are offered:
11 | #'
12 | #'\itemize{
13 | #'
14 | #' \item{Convert to a single parquet file. Argument `path_to_parquet` must then be used;}
15 | #' \item{Convert to a partitioned parquet file. Additional arguments `partition` and `partitioning` must then be used;}
16 | #'
17 | #' }
18 | #'
19 | #' To avoid overloading R's RAM, the conversion can be done by chunk. One of the arguments `max_memory` or `max_rows` must then be used.
20 | #' This is very useful for huge tables and for computers with little RAM because the conversion is then done
21 | #' with less memory consumption. For more information, see \href{https://ddotta.github.io/parquetize/articles/aa-conversions.html}{here}.
22 | #'
23 | #' @param path_to_file String that indicates the path to the input file (don't forget the extension).
24 | #' @param path_to_parquet String that indicates the path to the directory where the parquet files will be stored.
25 | #' @param columns Character vector of columns to select from the input file (by default, all columns are selected).
26 | #' @param max_memory Memory size (in Mb) in which data of one parquet file should roughly fit.
27 | #' @param max_rows Number of lines that defines the size of the chunk.
28 | #' This argument can not be filled in if max_memory is used.
29 | #' @param chunk_memory_sample_lines Number of lines to read to evaluate max_memory. Defaults to 10,000.
30 | #' @param by_chunk DEPRECATED use max_memory or max_rows instead
31 | #' @param chunk_size DEPRECATED use max_rows
32 | #' @param chunk_memory_size DEPRECATED use max_memory
33 | #' @param skip By default 0. This argument must be filled in if `by_chunk` is TRUE. Number of lines to ignore when converting.
34 | #' @param partition String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file.
35 | #' If "yes", `"partitioning"` argument must be filled in. In this case, a folder will be created for each modality of the variable filled in `"partitioning"`.
36 | #' Be careful, this argument cannot be "yes" if the `max_memory` or `max_rows` arguments are not NULL.
37 | #' @param encoding String that indicates the character encoding for the input file.
38 | #' @param compression compression algorithm. Default "snappy".
39 | #' @param compression_level compression level. Meaning depends on compression algorithm.
40 | #' @param user_na If `TRUE` variables with user defined missing will be read
41 | #' into [haven::labelled_spss()] objects. If `FALSE`, the default, user-defined missings will be converted to `NA`.
42 | #' @param ... Additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}
43 | #' and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.
44 | #'
45 | #' @return Parquet files, invisibly
46 | #'
47 | #' @export
48 | #'
49 | #' @examples
50 | #' # Conversion from a SAS file to a single parquet file :
51 | #'
52 | #' table_to_parquet(
53 | #' path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
54 | #' path_to_parquet = tempfile(fileext = ".parquet")
55 | #' )
56 | #'
57 | #' # Conversion from a SPSS file to a single parquet file :
58 | #'
59 | #' table_to_parquet(
60 | #' path_to_file = system.file("examples","iris.sav", package = "haven"),
61 | #' path_to_parquet = tempfile(fileext = ".parquet"),
62 | #' )
63 | #' # Conversion from a Stata file to a single parquet file without progress bar :
64 | #'
65 | #' table_to_parquet(
66 | #' path_to_file = system.file("examples","iris.dta", package = "haven"),
67 | #' path_to_parquet = tempfile(fileext = ".parquet")
68 | #' )
69 | #'
70 | #' # Reading SPSS file by chunk (using `max_rows` argument)
71 | #' # and conversion to multiple parquet files :
72 | #'
73 | #' table_to_parquet(
74 | #' path_to_file = system.file("examples","iris.sav", package = "haven"),
75 | #' path_to_parquet = tempfile(),
76 | #' max_rows = 50,
77 | #' )
78 | #'
79 | #' # Reading SPSS file by chunk (using `max_memory` argument)
80 | #' # and conversion to multiple parquet files of 5 Kb when loaded (5 Mb / 1024)
81 | #' # (in real files, you should use a bigger value that fits in memory, like 3000
82 | #' # or 4000):
83 | #'
84 | #' table_to_parquet(
85 | #' path_to_file = system.file("examples","iris.sav", package = "haven"),
86 | #' path_to_parquet = tempfile(),
87 | #' max_memory = 5 / 1024
88 | #' )
89 | #'
90 | #' # Reading SAS file by chunk of 50 lines with encoding
91 | #' # and conversion to multiple files :
92 | #'
93 | #' table_to_parquet(
94 | #' path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
95 | #' path_to_parquet = tempfile(),
96 | #' max_rows = 50,
97 | #' encoding = "utf-8"
98 | #' )
99 | #'
100 | #' # Conversion from a SAS file to a single parquet file and select only
101 | #' # few columns :
102 | #'
103 | #' table_to_parquet(
104 | #' path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
105 | #' path_to_parquet = tempfile(fileext = ".parquet"),
106 | #' columns = c("Species","Petal_Length")
107 | #' )
108 | #'
109 | #' # Conversion from a SAS file to a partitioned parquet file :
110 | #'
111 | #' table_to_parquet(
112 | #' path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
113 | #' path_to_parquet = tempfile(),
114 | #' partition = "yes",
115 | #'   partitioning = c("Species") # vector used as the partition key
116 | #' )
117 | #'
118 | #' # Reading SAS file by chunk of 50 lines
119 | #' # and conversion to multiple files with zstd, compression level 10
120 | #'
121 | #' if (isTRUE(arrow::arrow_info()$capabilities[['zstd']])) {
122 | #' table_to_parquet(
123 | #' path_to_file = system.file("examples","iris.sas7bdat", package = "haven"),
124 | #' path_to_parquet = tempfile(),
125 | #' max_rows = 50,
126 | #' compression = "zstd",
127 | #' compression_level = 10
128 | #' )
129 | #' }
130 |
131 | table_to_parquet <- function(
132 | path_to_file,
133 | path_to_parquet,
134 | max_memory = NULL,
135 | max_rows = NULL,
136 | chunk_size = lifecycle::deprecated(),
137 | chunk_memory_size = lifecycle::deprecated(),
138 | columns = "all",
139 | by_chunk = lifecycle::deprecated(),
140 | skip = 0,
141 | partition = "no",
142 | encoding = NULL,
143 | chunk_memory_sample_lines = 10000,
144 | compression = "snappy",
145 | compression_level = NULL,
146 | user_na = FALSE,
147 | ...
148 | ) {
149 | if (!missing(by_chunk)) {
150 | lifecycle::deprecate_warn(
151 | when = "0.5.5",
152 | what = "table_to_parquet(by_chunk)",
153 |       details = "This argument is no longer needed, table_to_parquet will chunk if one of max_memory or max_rows is set"
154 | )
155 | }
156 |
157 | if (!missing(chunk_size)) {
158 | lifecycle::deprecate_warn(
159 | when = "0.5.5",
160 | what = "table_to_parquet(chunk_size)",
161 | details = "This argument is deprecated, use max_rows."
162 | )
163 | max_rows <- chunk_size
164 | }
165 |
166 | if (!missing(chunk_memory_size)) {
167 | lifecycle::deprecate_warn(
168 | when = "0.5.5",
169 | what = "table_to_parquet(chunk_memory_size)",
170 | details = "This argument is deprecated, use max_memory."
171 | )
172 | max_memory <- chunk_memory_size
173 | }
174 |
175 | # Check if path_to_file is missing
176 | if (missing(path_to_file)) {
177 | cli_abort("Be careful, the argument path_to_file must be filled in", class = "parquetize_missing_argument")
178 | }
179 |
180 | # Check if path_to_parquet is missing
181 | if (missing(path_to_parquet)) {
182 | cli_abort("Be careful, the argument path_to_parquet must be filled in", class = "parquetize_missing_argument")
183 | }
184 |
185 | # Check if columns argument is a character vector
186 | if (isFALSE(is.vector(columns) & is.character(columns))) {
187 | cli_abort(c("Be careful, the argument columns must be a character vector",
188 |                 'You can use `all` or `c("col1", "col2")`'),
189 | class = "parquetize_bad_type")
190 | }
191 |
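    |   # Chunked conversion is triggered as soon as the user supplies max_rows or max_memory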
192 | by_chunk <- !(missing(max_rows) & missing(max_memory))
193 |
194 |   # Check if the skip argument is correctly filled in when by_chunk is TRUE
195 | if (by_chunk==TRUE & skip<0) {
196 |     cli_abort("Be careful, if you want to do a conversion by chunk then the argument skip must be greater than or equal to 0",
197 | class = "parquetize_bad_argument")
198 | }
199 |
200 | # If by_chunk argument is TRUE and partition argument is equal to "yes" it fails
201 | if (by_chunk==TRUE & partition == "yes") {
202 | cli_abort("Be careful, when max_rows or max_memory are used, partition and partitioning can not be used", class = "parquetize_bad_argument")
203 | }
204 |
205 |
206 |   # Closure that builds the function used to read the input file
207 | closure_read_method <- function(encoding, columns, user_na) {
208 | method <- get_haven_read_function_for_file(path_to_file)
209 | function(path, n_max = Inf, skip = 0L) {
210 |
211 | ext <- tools::file_ext(path_to_file)
212 |
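    |       # user_na only applies to haven::read_sav(), so it is passed in the .sav branch only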
213 | if (ext != "sav") {
214 | method(path,
215 | n_max = n_max,
216 | skip = skip,
217 | encoding = encoding,
218 | col_select = if (identical(columns,"all")) everything() else all_of(columns))
219 |
220 | } else if (ext == "sav") {
221 | method(path,
222 | n_max = n_max,
223 | skip = skip,
224 | encoding = encoding,
225 | col_select = if (identical(columns,"all")) everything() else all_of(columns),
226 | user_na = user_na)
227 | }
228 | }
229 | }
230 |
231 | read_method <- closure_read_method(encoding = encoding, columns = columns, user_na = user_na)
232 |
233 | if (by_chunk) {
234 | ds <- write_parquet_by_chunk(
235 | read_method = read_method,
236 | input = path_to_file,
237 | path_to_parquet = path_to_parquet,
238 | max_rows = max_rows,
239 | max_memory = max_memory,
240 | chunk_memory_sample_lines = chunk_memory_sample_lines,
241 | ...
242 | )
243 | return(invisible(ds))
244 | }
245 |
246 | Sys.sleep(0.01)
247 | cli_progress_message("Reading data...")
248 | table_output <- read_method(path_to_file)
249 |
250 | parquetfile <- write_parquet_at_once(
251 | table_output,
252 | path_to_parquet,
253 | partition,
254 | compression,
255 | compression_level,
256 | ...)
257 |
258 | cli_alert_success("\nThe {path_to_file} file is available in parquet format under {path_to_parquet}")
259 |
260 | return(invisible(parquetfile))
261 | }
262 |
--------------------------------------------------------------------------------
/inst/extdata/iris.ndjson:
--------------------------------------------------------------------------------
1 | {"sepalLength": 5.1, "sepalWidth": 3.5, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"}
2 | {"sepalLength": 4.9, "sepalWidth": 3.0, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"}
3 | {"sepalLength": 4.7, "sepalWidth": 3.2, "petalLength": 1.3, "petalWidth": 0.2, "species": "setosa"}
4 | {"sepalLength": 4.6, "sepalWidth": 3.1, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"}
5 | {"sepalLength": 5.0, "sepalWidth": 3.6, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"}
6 | {"sepalLength": 5.4, "sepalWidth": 3.9, "petalLength": 1.7, "petalWidth": 0.4, "species": "setosa"}
7 | {"sepalLength": 4.6, "sepalWidth": 3.4, "petalLength": 1.4, "petalWidth": 0.3, "species": "setosa"}
8 | {"sepalLength": 5.0, "sepalWidth": 3.4, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"}
9 | {"sepalLength": 4.4, "sepalWidth": 2.9, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"}
10 | {"sepalLength": 4.9, "sepalWidth": 3.1, "petalLength": 1.5, "petalWidth": 0.1, "species": "setosa"}
11 | {"sepalLength": 5.4, "sepalWidth": 3.7, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"}
12 | {"sepalLength": 4.8, "sepalWidth": 3.4, "petalLength": 1.6, "petalWidth": 0.2, "species": "setosa"}
13 | {"sepalLength": 4.8, "sepalWidth": 3.0, "petalLength": 1.4, "petalWidth": 0.1, "species": "setosa"}
14 | {"sepalLength": 4.3, "sepalWidth": 3.0, "petalLength": 1.1, "petalWidth": 0.1, "species": "setosa"}
15 | {"sepalLength": 5.8, "sepalWidth": 4.0, "petalLength": 1.2, "petalWidth": 0.2, "species": "setosa"}
16 | {"sepalLength": 5.7, "sepalWidth": 4.4, "petalLength": 1.5, "petalWidth": 0.4, "species": "setosa"}
17 | {"sepalLength": 5.4, "sepalWidth": 3.9, "petalLength": 1.3, "petalWidth": 0.4, "species": "setosa"}
18 | {"sepalLength": 5.1, "sepalWidth": 3.5, "petalLength": 1.4, "petalWidth": 0.3, "species": "setosa"}
19 | {"sepalLength": 5.7, "sepalWidth": 3.8, "petalLength": 1.7, "petalWidth": 0.3, "species": "setosa"}
20 | {"sepalLength": 5.1, "sepalWidth": 3.8, "petalLength": 1.5, "petalWidth": 0.3, "species": "setosa"}
21 | {"sepalLength": 5.4, "sepalWidth": 3.4, "petalLength": 1.7, "petalWidth": 0.2, "species": "setosa"}
22 | {"sepalLength": 5.1, "sepalWidth": 3.7, "petalLength": 1.5, "petalWidth": 0.4, "species": "setosa"}
23 | {"sepalLength": 4.6, "sepalWidth": 3.6, "petalLength": 1.0, "petalWidth": 0.2, "species": "setosa"}
24 | {"sepalLength": 5.1, "sepalWidth": 3.3, "petalLength": 1.7, "petalWidth": 0.5, "species": "setosa"}
25 | {"sepalLength": 4.8, "sepalWidth": 3.4, "petalLength": 1.9, "petalWidth": 0.2, "species": "setosa"}
26 | {"sepalLength": 5.0, "sepalWidth": 3.0, "petalLength": 1.6, "petalWidth": 0.2, "species": "setosa"}
27 | {"sepalLength": 5.0, "sepalWidth": 3.4, "petalLength": 1.6, "petalWidth": 0.4, "species": "setosa"}
28 | {"sepalLength": 5.2, "sepalWidth": 3.5, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"}
29 | {"sepalLength": 5.2, "sepalWidth": 3.4, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"}
30 | {"sepalLength": 4.7, "sepalWidth": 3.2, "petalLength": 1.6, "petalWidth": 0.2, "species": "setosa"}
31 | {"sepalLength": 4.8, "sepalWidth": 3.1, "petalLength": 1.6, "petalWidth": 0.2, "species": "setosa"}
32 | {"sepalLength": 5.4, "sepalWidth": 3.4, "petalLength": 1.5, "petalWidth": 0.4, "species": "setosa"}
33 | {"sepalLength": 5.2, "sepalWidth": 4.1, "petalLength": 1.5, "petalWidth": 0.1, "species": "setosa"}
34 | {"sepalLength": 5.5, "sepalWidth": 4.2, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"}
35 | {"sepalLength": 4.9, "sepalWidth": 3.1, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"}
36 | {"sepalLength": 5.0, "sepalWidth": 3.2, "petalLength": 1.2, "petalWidth": 0.2, "species": "setosa"}
37 | {"sepalLength": 5.5, "sepalWidth": 3.5, "petalLength": 1.3, "petalWidth": 0.2, "species": "setosa"}
38 | {"sepalLength": 4.9, "sepalWidth": 3.6, "petalLength": 1.4, "petalWidth": 0.1, "species": "setosa"}
39 | {"sepalLength": 4.4, "sepalWidth": 3.0, "petalLength": 1.3, "petalWidth": 0.2, "species": "setosa"}
40 | {"sepalLength": 5.1, "sepalWidth": 3.4, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"}
41 | {"sepalLength": 5.0, "sepalWidth": 3.5, "petalLength": 1.3, "petalWidth": 0.3, "species": "setosa"}
42 | {"sepalLength": 4.5, "sepalWidth": 2.3, "petalLength": 1.3, "petalWidth": 0.3, "species": "setosa"}
43 | {"sepalLength": 4.4, "sepalWidth": 3.2, "petalLength": 1.3, "petalWidth": 0.2, "species": "setosa"}
44 | {"sepalLength": 5.0, "sepalWidth": 3.5, "petalLength": 1.6, "petalWidth": 0.6, "species": "setosa"}
45 | {"sepalLength": 5.1, "sepalWidth": 3.8, "petalLength": 1.9, "petalWidth": 0.4, "species": "setosa"}
46 | {"sepalLength": 4.8, "sepalWidth": 3.0, "petalLength": 1.4, "petalWidth": 0.3, "species": "setosa"}
47 | {"sepalLength": 5.1, "sepalWidth": 3.8, "petalLength": 1.6, "petalWidth": 0.2, "species": "setosa"}
48 | {"sepalLength": 4.6, "sepalWidth": 3.2, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"}
49 | {"sepalLength": 5.3, "sepalWidth": 3.7, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"}
50 | {"sepalLength": 5.0, "sepalWidth": 3.3, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"}
51 | {"sepalLength": 7.0, "sepalWidth": 3.2, "petalLength": 4.7, "petalWidth": 1.4, "species": "versicolor"}
52 | {"sepalLength": 6.4, "sepalWidth": 3.2, "petalLength": 4.5, "petalWidth": 1.5, "species": "versicolor"}
53 | {"sepalLength": 6.9, "sepalWidth": 3.1, "petalLength": 4.9, "petalWidth": 1.5, "species": "versicolor"}
54 | {"sepalLength": 5.5, "sepalWidth": 2.3, "petalLength": 4.0, "petalWidth": 1.3, "species": "versicolor"}
55 | {"sepalLength": 6.5, "sepalWidth": 2.8, "petalLength": 4.6, "petalWidth": 1.5, "species": "versicolor"}
56 | {"sepalLength": 5.7, "sepalWidth": 2.8, "petalLength": 4.5, "petalWidth": 1.3, "species": "versicolor"}
57 | {"sepalLength": 6.3, "sepalWidth": 3.3, "petalLength": 4.7, "petalWidth": 1.6, "species": "versicolor"}
58 | {"sepalLength": 4.9, "sepalWidth": 2.4, "petalLength": 3.3, "petalWidth": 1.0, "species": "versicolor"}
59 | {"sepalLength": 6.6, "sepalWidth": 2.9, "petalLength": 4.6, "petalWidth": 1.3, "species": "versicolor"}
60 | {"sepalLength": 5.2, "sepalWidth": 2.7, "petalLength": 3.9, "petalWidth": 1.4, "species": "versicolor"}
61 | {"sepalLength": 5.0, "sepalWidth": 2.0, "petalLength": 3.5, "petalWidth": 1.0, "species": "versicolor"}
62 | {"sepalLength": 5.9, "sepalWidth": 3.0, "petalLength": 4.2, "petalWidth": 1.5, "species": "versicolor"}
63 | {"sepalLength": 6.0, "sepalWidth": 2.2, "petalLength": 4.0, "petalWidth": 1.0, "species": "versicolor"}
64 | {"sepalLength": 6.1, "sepalWidth": 2.9, "petalLength": 4.7, "petalWidth": 1.4, "species": "versicolor"}
65 | {"sepalLength": 5.6, "sepalWidth": 2.9, "petalLength": 3.6, "petalWidth": 1.3, "species": "versicolor"}
66 | {"sepalLength": 6.7, "sepalWidth": 3.1, "petalLength": 4.4, "petalWidth": 1.4, "species": "versicolor"}
67 | {"sepalLength": 5.6, "sepalWidth": 3.0, "petalLength": 4.5, "petalWidth": 1.5, "species": "versicolor"}
68 | {"sepalLength": 5.8, "sepalWidth": 2.7, "petalLength": 4.1, "petalWidth": 1.0, "species": "versicolor"}
69 | {"sepalLength": 6.2, "sepalWidth": 2.2, "petalLength": 4.5, "petalWidth": 1.5, "species": "versicolor"}
70 | {"sepalLength": 5.6, "sepalWidth": 2.5, "petalLength": 3.9, "petalWidth": 1.1, "species": "versicolor"}
71 | {"sepalLength": 5.9, "sepalWidth": 3.2, "petalLength": 4.8, "petalWidth": 1.8, "species": "versicolor"}
72 | {"sepalLength": 6.1, "sepalWidth": 2.8, "petalLength": 4.0, "petalWidth": 1.3, "species": "versicolor"}
73 | {"sepalLength": 6.3, "sepalWidth": 2.5, "petalLength": 4.9, "petalWidth": 1.5, "species": "versicolor"}
74 | {"sepalLength": 6.1, "sepalWidth": 2.8, "petalLength": 4.7, "petalWidth": 1.2, "species": "versicolor"}
75 | {"sepalLength": 6.4, "sepalWidth": 2.9, "petalLength": 4.3, "petalWidth": 1.3, "species": "versicolor"}
76 | {"sepalLength": 6.6, "sepalWidth": 3.0, "petalLength": 4.4, "petalWidth": 1.4, "species": "versicolor"}
77 | {"sepalLength": 6.8, "sepalWidth": 2.8, "petalLength": 4.8, "petalWidth": 1.4, "species": "versicolor"}
78 | {"sepalLength": 6.7, "sepalWidth": 3.0, "petalLength": 5.0, "petalWidth": 1.7, "species": "versicolor"}
79 | {"sepalLength": 6.0, "sepalWidth": 2.9, "petalLength": 4.5, "petalWidth": 1.5, "species": "versicolor"}
80 | {"sepalLength": 5.7, "sepalWidth": 2.6, "petalLength": 3.5, "petalWidth": 1.0, "species": "versicolor"}
81 | {"sepalLength": 5.5, "sepalWidth": 2.4, "petalLength": 3.8, "petalWidth": 1.1, "species": "versicolor"}
82 | {"sepalLength": 5.5, "sepalWidth": 2.4, "petalLength": 3.7, "petalWidth": 1.0, "species": "versicolor"}
83 | {"sepalLength": 5.8, "sepalWidth": 2.7, "petalLength": 3.9, "petalWidth": 1.2, "species": "versicolor"}
84 | {"sepalLength": 6.0, "sepalWidth": 2.7, "petalLength": 5.1, "petalWidth": 1.6, "species": "versicolor"}
85 | {"sepalLength": 5.4, "sepalWidth": 3.0, "petalLength": 4.5, "petalWidth": 1.5, "species": "versicolor"}
86 | {"sepalLength": 6.0, "sepalWidth": 3.4, "petalLength": 4.5, "petalWidth": 1.6, "species": "versicolor"}
87 | {"sepalLength": 6.7, "sepalWidth": 3.1, "petalLength": 4.7, "petalWidth": 1.5, "species": "versicolor"}
88 | {"sepalLength": 6.3, "sepalWidth": 2.3, "petalLength": 4.4, "petalWidth": 1.3, "species": "versicolor"}
89 | {"sepalLength": 5.6, "sepalWidth": 3.0, "petalLength": 4.1, "petalWidth": 1.3, "species": "versicolor"}
90 | {"sepalLength": 5.5, "sepalWidth": 2.5, "petalLength": 4.0, "petalWidth": 1.3, "species": "versicolor"}
91 | {"sepalLength": 5.5, "sepalWidth": 2.6, "petalLength": 4.4, "petalWidth": 1.2, "species": "versicolor"}
92 | {"sepalLength": 6.1, "sepalWidth": 3.0, "petalLength": 4.6, "petalWidth": 1.4, "species": "versicolor"}
93 | {"sepalLength": 5.8, "sepalWidth": 2.6, "petalLength": 4.0, "petalWidth": 1.2, "species": "versicolor"}
94 | {"sepalLength": 5.0, "sepalWidth": 2.3, "petalLength": 3.3, "petalWidth": 1.0, "species": "versicolor"}
95 | {"sepalLength": 5.6, "sepalWidth": 2.7, "petalLength": 4.2, "petalWidth": 1.3, "species": "versicolor"}
96 | {"sepalLength": 5.7, "sepalWidth": 3.0, "petalLength": 4.2, "petalWidth": 1.2, "species": "versicolor"}
97 | {"sepalLength": 5.7, "sepalWidth": 2.9, "petalLength": 4.2, "petalWidth": 1.3, "species": "versicolor"}
98 | {"sepalLength": 6.2, "sepalWidth": 2.9, "petalLength": 4.3, "petalWidth": 1.3, "species": "versicolor"}
99 | {"sepalLength": 5.1, "sepalWidth": 2.5, "petalLength": 3.0, "petalWidth": 1.1, "species": "versicolor"}
100 | {"sepalLength": 5.7, "sepalWidth": 2.8, "petalLength": 4.1, "petalWidth": 1.3, "species": "versicolor"}
101 | {"sepalLength": 6.3, "sepalWidth": 3.3, "petalLength": 6.0, "petalWidth": 2.5, "species": "virginica"}
102 | {"sepalLength": 5.8, "sepalWidth": 2.7, "petalLength": 5.1, "petalWidth": 1.9, "species": "virginica"}
103 | {"sepalLength": 7.1, "sepalWidth": 3.0, "petalLength": 5.9, "petalWidth": 2.1, "species": "virginica"}
104 | {"sepalLength": 6.3, "sepalWidth": 2.9, "petalLength": 5.6, "petalWidth": 1.8, "species": "virginica"}
105 | {"sepalLength": 6.5, "sepalWidth": 3.0, "petalLength": 5.8, "petalWidth": 2.2, "species": "virginica"}
106 | {"sepalLength": 7.6, "sepalWidth": 3.0, "petalLength": 6.6, "petalWidth": 2.1, "species": "virginica"}
107 | {"sepalLength": 4.9, "sepalWidth": 2.5, "petalLength": 4.5, "petalWidth": 1.7, "species": "virginica"}
108 | {"sepalLength": 7.3, "sepalWidth": 2.9, "petalLength": 6.3, "petalWidth": 1.8, "species": "virginica"}
109 | {"sepalLength": 6.7, "sepalWidth": 2.5, "petalLength": 5.8, "petalWidth": 1.8, "species": "virginica"}
110 | {"sepalLength": 7.2, "sepalWidth": 3.6, "petalLength": 6.1, "petalWidth": 2.5, "species": "virginica"}
111 | {"sepalLength": 6.5, "sepalWidth": 3.2, "petalLength": 5.1, "petalWidth": 2.0, "species": "virginica"}
112 | {"sepalLength": 6.4, "sepalWidth": 2.7, "petalLength": 5.3, "petalWidth": 1.9, "species": "virginica"}
113 | {"sepalLength": 6.8, "sepalWidth": 3.0, "petalLength": 5.5, "petalWidth": 2.1, "species": "virginica"}
114 | {"sepalLength": 5.7, "sepalWidth": 2.5, "petalLength": 5.0, "petalWidth": 2.0, "species": "virginica"}
115 | {"sepalLength": 5.8, "sepalWidth": 2.8, "petalLength": 5.1, "petalWidth": 2.4, "species": "virginica"}
116 | {"sepalLength": 6.4, "sepalWidth": 3.2, "petalLength": 5.3, "petalWidth": 2.3, "species": "virginica"}
117 | {"sepalLength": 6.5, "sepalWidth": 3.0, "petalLength": 5.5, "petalWidth": 1.8, "species": "virginica"}
118 | {"sepalLength": 7.7, "sepalWidth": 3.8, "petalLength": 6.7, "petalWidth": 2.2, "species": "virginica"}
119 | {"sepalLength": 7.7, "sepalWidth": 2.6, "petalLength": 6.9, "petalWidth": 2.3, "species": "virginica"}
120 | {"sepalLength": 6.0, "sepalWidth": 2.2, "petalLength": 5.0, "petalWidth": 1.5, "species": "virginica"}
121 | {"sepalLength": 6.9, "sepalWidth": 3.2, "petalLength": 5.7, "petalWidth": 2.3, "species": "virginica"}
122 | {"sepalLength": 5.6, "sepalWidth": 2.8, "petalLength": 4.9, "petalWidth": 2.0, "species": "virginica"}
123 | {"sepalLength": 7.7, "sepalWidth": 2.8, "petalLength": 6.7, "petalWidth": 2.0, "species": "virginica"}
124 | {"sepalLength": 6.3, "sepalWidth": 2.7, "petalLength": 4.9, "petalWidth": 1.8, "species": "virginica"}
125 | {"sepalLength": 6.7, "sepalWidth": 3.3, "petalLength": 5.7, "petalWidth": 2.1, "species": "virginica"}
126 | {"sepalLength": 7.2, "sepalWidth": 3.2, "petalLength": 6.0, "petalWidth": 1.8, "species": "virginica"}
127 | {"sepalLength": 6.2, "sepalWidth": 2.8, "petalLength": 4.8, "petalWidth": 1.8, "species": "virginica"}
128 | {"sepalLength": 6.1, "sepalWidth": 3.0, "petalLength": 4.9, "petalWidth": 1.8, "species": "virginica"}
129 | {"sepalLength": 6.4, "sepalWidth": 2.8, "petalLength": 5.6, "petalWidth": 2.1, "species": "virginica"}
130 | {"sepalLength": 7.2, "sepalWidth": 3.0, "petalLength": 5.8, "petalWidth": 1.6, "species": "virginica"}
131 | {"sepalLength": 7.4, "sepalWidth": 2.8, "petalLength": 6.1, "petalWidth": 1.9, "species": "virginica"}
132 | {"sepalLength": 7.9, "sepalWidth": 3.8, "petalLength": 6.4, "petalWidth": 2.0, "species": "virginica"}
133 | {"sepalLength": 6.4, "sepalWidth": 2.8, "petalLength": 5.6, "petalWidth": 2.2, "species": "virginica"}
134 | {"sepalLength": 6.3, "sepalWidth": 2.8, "petalLength": 5.1, "petalWidth": 1.5, "species": "virginica"}
135 | {"sepalLength": 6.1, "sepalWidth": 2.6, "petalLength": 5.6, "petalWidth": 1.4, "species": "virginica"}
136 | {"sepalLength": 7.7, "sepalWidth": 3.0, "petalLength": 6.1, "petalWidth": 2.3, "species": "virginica"}
137 | {"sepalLength": 6.3, "sepalWidth": 3.4, "petalLength": 5.6, "petalWidth": 2.4, "species": "virginica"}
138 | {"sepalLength": 6.4, "sepalWidth": 3.1, "petalLength": 5.5, "petalWidth": 1.8, "species": "virginica"}
139 | {"sepalLength": 6.0, "sepalWidth": 3.0, "petalLength": 4.8, "petalWidth": 1.8, "species": "virginica"}
140 | {"sepalLength": 6.9, "sepalWidth": 3.1, "petalLength": 5.4, "petalWidth": 2.1, "species": "virginica"}
141 | {"sepalLength": 6.7, "sepalWidth": 3.1, "petalLength": 5.6, "petalWidth": 2.4, "species": "virginica"}
142 | {"sepalLength": 6.9, "sepalWidth": 3.1, "petalLength": 5.1, "petalWidth": 2.3, "species": "virginica"}
143 | {"sepalLength": 5.8, "sepalWidth": 2.7, "petalLength": 5.1, "petalWidth": 1.9, "species": "virginica"}
144 | {"sepalLength": 6.8, "sepalWidth": 3.2, "petalLength": 5.9, "petalWidth": 2.3, "species": "virginica"}
145 | {"sepalLength": 6.7, "sepalWidth": 3.3, "petalLength": 5.7, "petalWidth": 2.5, "species": "virginica"}
146 | {"sepalLength": 6.7, "sepalWidth": 3.0, "petalLength": 5.2, "petalWidth": 2.3, "species": "virginica"}
147 | {"sepalLength": 6.3, "sepalWidth": 2.5, "petalLength": 5.0, "petalWidth": 1.9, "species": "virginica"}
148 | {"sepalLength": 6.5, "sepalWidth": 3.0, "petalLength": 5.2, "petalWidth": 2.0, "species": "virginica"}
149 | {"sepalLength": 6.2, "sepalWidth": 3.4, "petalLength": 5.4, "petalWidth": 2.3, "species": "virginica"}
150 | {"sepalLength": 5.9, "sepalWidth": 3.0, "petalLength": 5.1, "petalWidth": 1.8, "species": "virginica"}
151 |
--------------------------------------------------------------------------------
/inst/extdata/iris.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "Sepal.Length": 5.1,
4 | "Sepal.Width": 3.5,
5 | "Petal.Length": 1.4,
6 | "Petal.Width": 0.2,
7 | "Species": "setosa"
8 | },
9 | {
10 | "Sepal.Length": 4.9,
11 | "Sepal.Width": 3,
12 | "Petal.Length": 1.4,
13 | "Petal.Width": 0.2,
14 | "Species": "setosa"
15 | },
16 | {
17 | "Sepal.Length": 4.7,
18 | "Sepal.Width": 3.2,
19 | "Petal.Length": 1.3,
20 | "Petal.Width": 0.2,
21 | "Species": "setosa"
22 | },
23 | {
24 | "Sepal.Length": 4.6,
25 | "Sepal.Width": 3.1,
26 | "Petal.Length": 1.5,
27 | "Petal.Width": 0.2,
28 | "Species": "setosa"
29 | },
30 | {
31 | "Sepal.Length": 5,
32 | "Sepal.Width": 3.6,
33 | "Petal.Length": 1.4,
34 | "Petal.Width": 0.2,
35 | "Species": "setosa"
36 | },
37 | {
38 | "Sepal.Length": 5.4,
39 | "Sepal.Width": 3.9,
40 | "Petal.Length": 1.7,
41 | "Petal.Width": 0.4,
42 | "Species": "setosa"
43 | },
44 | {
45 | "Sepal.Length": 4.6,
46 | "Sepal.Width": 3.4,
47 | "Petal.Length": 1.4,
48 | "Petal.Width": 0.3,
49 | "Species": "setosa"
50 | },
51 | {
52 | "Sepal.Length": 5,
53 | "Sepal.Width": 3.4,
54 | "Petal.Length": 1.5,
55 | "Petal.Width": 0.2,
56 | "Species": "setosa"
57 | },
58 | {
59 | "Sepal.Length": 4.4,
60 | "Sepal.Width": 2.9,
61 | "Petal.Length": 1.4,
62 | "Petal.Width": 0.2,
63 | "Species": "setosa"
64 | },
65 | {
66 | "Sepal.Length": 4.9,
67 | "Sepal.Width": 3.1,
68 | "Petal.Length": 1.5,
69 | "Petal.Width": 0.1,
70 | "Species": "setosa"
71 | },
72 | {
73 | "Sepal.Length": 5.4,
74 | "Sepal.Width": 3.7,
75 | "Petal.Length": 1.5,
76 | "Petal.Width": 0.2,
77 | "Species": "setosa"
78 | },
79 | {
80 | "Sepal.Length": 4.8,
81 | "Sepal.Width": 3.4,
82 | "Petal.Length": 1.6,
83 | "Petal.Width": 0.2,
84 | "Species": "setosa"
85 | },
86 | {
87 | "Sepal.Length": 4.8,
88 | "Sepal.Width": 3,
89 | "Petal.Length": 1.4,
90 | "Petal.Width": 0.1,
91 | "Species": "setosa"
92 | },
93 | {
94 | "Sepal.Length": 4.3,
95 | "Sepal.Width": 3,
96 | "Petal.Length": 1.1,
97 | "Petal.Width": 0.1,
98 | "Species": "setosa"
99 | },
100 | {
101 | "Sepal.Length": 5.8,
102 | "Sepal.Width": 4,
103 | "Petal.Length": 1.2,
104 | "Petal.Width": 0.2,
105 | "Species": "setosa"
106 | },
107 | {
108 | "Sepal.Length": 5.7,
109 | "Sepal.Width": 4.4,
110 | "Petal.Length": 1.5,
111 | "Petal.Width": 0.4,
112 | "Species": "setosa"
113 | },
114 | {
115 | "Sepal.Length": 5.4,
116 | "Sepal.Width": 3.9,
117 | "Petal.Length": 1.3,
118 | "Petal.Width": 0.4,
119 | "Species": "setosa"
120 | },
121 | {
122 | "Sepal.Length": 5.1,
123 | "Sepal.Width": 3.5,
124 | "Petal.Length": 1.4,
125 | "Petal.Width": 0.3,
126 | "Species": "setosa"
127 | },
128 | {
129 | "Sepal.Length": 5.7,
130 | "Sepal.Width": 3.8,
131 | "Petal.Length": 1.7,
132 | "Petal.Width": 0.3,
133 | "Species": "setosa"
134 | },
135 | {
136 | "Sepal.Length": 5.1,
137 | "Sepal.Width": 3.8,
138 | "Petal.Length": 1.5,
139 | "Petal.Width": 0.3,
140 | "Species": "setosa"
141 | },
142 | {
143 | "Sepal.Length": 5.4,
144 | "Sepal.Width": 3.4,
145 | "Petal.Length": 1.7,
146 | "Petal.Width": 0.2,
147 | "Species": "setosa"
148 | },
149 | {
150 | "Sepal.Length": 5.1,
151 | "Sepal.Width": 3.7,
152 | "Petal.Length": 1.5,
153 | "Petal.Width": 0.4,
154 | "Species": "setosa"
155 | },
156 | {
157 | "Sepal.Length": 4.6,
158 | "Sepal.Width": 3.6,
159 | "Petal.Length": 1,
160 | "Petal.Width": 0.2,
161 | "Species": "setosa"
162 | },
163 | {
164 | "Sepal.Length": 5.1,
165 | "Sepal.Width": 3.3,
166 | "Petal.Length": 1.7,
167 | "Petal.Width": 0.5,
168 | "Species": "setosa"
169 | },
170 | {
171 | "Sepal.Length": 4.8,
172 | "Sepal.Width": 3.4,
173 | "Petal.Length": 1.9,
174 | "Petal.Width": 0.2,
175 | "Species": "setosa"
176 | },
177 | {
178 | "Sepal.Length": 5,
179 | "Sepal.Width": 3,
180 | "Petal.Length": 1.6,
181 | "Petal.Width": 0.2,
182 | "Species": "setosa"
183 | },
184 | {
185 | "Sepal.Length": 5,
186 | "Sepal.Width": 3.4,
187 | "Petal.Length": 1.6,
188 | "Petal.Width": 0.4,
189 | "Species": "setosa"
190 | },
191 | {
192 | "Sepal.Length": 5.2,
193 | "Sepal.Width": 3.5,
194 | "Petal.Length": 1.5,
195 | "Petal.Width": 0.2,
196 | "Species": "setosa"
197 | },
198 | {
199 | "Sepal.Length": 5.2,
200 | "Sepal.Width": 3.4,
201 | "Petal.Length": 1.4,
202 | "Petal.Width": 0.2,
203 | "Species": "setosa"
204 | },
205 | {
206 | "Sepal.Length": 4.7,
207 | "Sepal.Width": 3.2,
208 | "Petal.Length": 1.6,
209 | "Petal.Width": 0.2,
210 | "Species": "setosa"
211 | },
212 | {
213 | "Sepal.Length": 4.8,
214 | "Sepal.Width": 3.1,
215 | "Petal.Length": 1.6,
216 | "Petal.Width": 0.2,
217 | "Species": "setosa"
218 | },
219 | {
220 | "Sepal.Length": 5.4,
221 | "Sepal.Width": 3.4,
222 | "Petal.Length": 1.5,
223 | "Petal.Width": 0.4,
224 | "Species": "setosa"
225 | },
226 | {
227 | "Sepal.Length": 5.2,
228 | "Sepal.Width": 4.1,
229 | "Petal.Length": 1.5,
230 | "Petal.Width": 0.1,
231 | "Species": "setosa"
232 | },
233 | {
234 | "Sepal.Length": 5.5,
235 | "Sepal.Width": 4.2,
236 | "Petal.Length": 1.4,
237 | "Petal.Width": 0.2,
238 | "Species": "setosa"
239 | },
240 | {
241 | "Sepal.Length": 4.9,
242 | "Sepal.Width": 3.1,
243 | "Petal.Length": 1.5,
244 | "Petal.Width": 0.2,
245 | "Species": "setosa"
246 | },
247 | {
248 | "Sepal.Length": 5,
249 | "Sepal.Width": 3.2,
250 | "Petal.Length": 1.2,
251 | "Petal.Width": 0.2,
252 | "Species": "setosa"
253 | },
254 | {
255 | "Sepal.Length": 5.5,
256 | "Sepal.Width": 3.5,
257 | "Petal.Length": 1.3,
258 | "Petal.Width": 0.2,
259 | "Species": "setosa"
260 | },
261 | {
262 | "Sepal.Length": 4.9,
263 | "Sepal.Width": 3.6,
264 | "Petal.Length": 1.4,
265 | "Petal.Width": 0.1,
266 | "Species": "setosa"
267 | },
268 | {
269 | "Sepal.Length": 4.4,
270 | "Sepal.Width": 3,
271 | "Petal.Length": 1.3,
272 | "Petal.Width": 0.2,
273 | "Species": "setosa"
274 | },
275 | {
276 | "Sepal.Length": 5.1,
277 | "Sepal.Width": 3.4,
278 | "Petal.Length": 1.5,
279 | "Petal.Width": 0.2,
280 | "Species": "setosa"
281 | },
282 | {
283 | "Sepal.Length": 5,
284 | "Sepal.Width": 3.5,
285 | "Petal.Length": 1.3,
286 | "Petal.Width": 0.3,
287 | "Species": "setosa"
288 | },
289 | {
290 | "Sepal.Length": 4.5,
291 | "Sepal.Width": 2.3,
292 | "Petal.Length": 1.3,
293 | "Petal.Width": 0.3,
294 | "Species": "setosa"
295 | },
296 | {
297 | "Sepal.Length": 4.4,
298 | "Sepal.Width": 3.2,
299 | "Petal.Length": 1.3,
300 | "Petal.Width": 0.2,
301 | "Species": "setosa"
302 | },
303 | {
304 | "Sepal.Length": 5,
305 | "Sepal.Width": 3.5,
306 | "Petal.Length": 1.6,
307 | "Petal.Width": 0.6,
308 | "Species": "setosa"
309 | },
310 | {
311 | "Sepal.Length": 5.1,
312 | "Sepal.Width": 3.8,
313 | "Petal.Length": 1.9,
314 | "Petal.Width": 0.4,
315 | "Species": "setosa"
316 | },
317 | {
318 | "Sepal.Length": 4.8,
319 | "Sepal.Width": 3,
320 | "Petal.Length": 1.4,
321 | "Petal.Width": 0.3,
322 | "Species": "setosa"
323 | },
324 | {
325 | "Sepal.Length": 5.1,
326 | "Sepal.Width": 3.8,
327 | "Petal.Length": 1.6,
328 | "Petal.Width": 0.2,
329 | "Species": "setosa"
330 | },
331 | {
332 | "Sepal.Length": 4.6,
333 | "Sepal.Width": 3.2,
334 | "Petal.Length": 1.4,
335 | "Petal.Width": 0.2,
336 | "Species": "setosa"
337 | },
338 | {
339 | "Sepal.Length": 5.3,
340 | "Sepal.Width": 3.7,
341 | "Petal.Length": 1.5,
342 | "Petal.Width": 0.2,
343 | "Species": "setosa"
344 | },
345 | {
346 | "Sepal.Length": 5,
347 | "Sepal.Width": 3.3,
348 | "Petal.Length": 1.4,
349 | "Petal.Width": 0.2,
350 | "Species": "setosa"
351 | },
352 | {
353 | "Sepal.Length": 7,
354 | "Sepal.Width": 3.2,
355 | "Petal.Length": 4.7,
356 | "Petal.Width": 1.4,
357 | "Species": "versicolor"
358 | },
359 | {
360 | "Sepal.Length": 6.4,
361 | "Sepal.Width": 3.2,
362 | "Petal.Length": 4.5,
363 | "Petal.Width": 1.5,
364 | "Species": "versicolor"
365 | },
366 | {
367 | "Sepal.Length": 6.9,
368 | "Sepal.Width": 3.1,
369 | "Petal.Length": 4.9,
370 | "Petal.Width": 1.5,
371 | "Species": "versicolor"
372 | },
373 | {
374 | "Sepal.Length": 5.5,
375 | "Sepal.Width": 2.3,
376 | "Petal.Length": 4,
377 | "Petal.Width": 1.3,
378 | "Species": "versicolor"
379 | },
380 | {
381 | "Sepal.Length": 6.5,
382 | "Sepal.Width": 2.8,
383 | "Petal.Length": 4.6,
384 | "Petal.Width": 1.5,
385 | "Species": "versicolor"
386 | },
387 | {
388 | "Sepal.Length": 5.7,
389 | "Sepal.Width": 2.8,
390 | "Petal.Length": 4.5,
391 | "Petal.Width": 1.3,
392 | "Species": "versicolor"
393 | },
394 | {
395 | "Sepal.Length": 6.3,
396 | "Sepal.Width": 3.3,
397 | "Petal.Length": 4.7,
398 | "Petal.Width": 1.6,
399 | "Species": "versicolor"
400 | },
401 | {
402 | "Sepal.Length": 4.9,
403 | "Sepal.Width": 2.4,
404 | "Petal.Length": 3.3,
405 | "Petal.Width": 1,
406 | "Species": "versicolor"
407 | },
408 | {
409 | "Sepal.Length": 6.6,
410 | "Sepal.Width": 2.9,
411 | "Petal.Length": 4.6,
412 | "Petal.Width": 1.3,
413 | "Species": "versicolor"
414 | },
415 | {
416 | "Sepal.Length": 5.2,
417 | "Sepal.Width": 2.7,
418 | "Petal.Length": 3.9,
419 | "Petal.Width": 1.4,
420 | "Species": "versicolor"
421 | },
422 | {
423 | "Sepal.Length": 5,
424 | "Sepal.Width": 2,
425 | "Petal.Length": 3.5,
426 | "Petal.Width": 1,
427 | "Species": "versicolor"
428 | },
429 | {
430 | "Sepal.Length": 5.9,
431 | "Sepal.Width": 3,
432 | "Petal.Length": 4.2,
433 | "Petal.Width": 1.5,
434 | "Species": "versicolor"
435 | },
436 | {
437 | "Sepal.Length": 6,
438 | "Sepal.Width": 2.2,
439 | "Petal.Length": 4,
440 | "Petal.Width": 1,
441 | "Species": "versicolor"
442 | },
443 | {
444 | "Sepal.Length": 6.1,
445 | "Sepal.Width": 2.9,
446 | "Petal.Length": 4.7,
447 | "Petal.Width": 1.4,
448 | "Species": "versicolor"
449 | },
450 | {
451 | "Sepal.Length": 5.6,
452 | "Sepal.Width": 2.9,
453 | "Petal.Length": 3.6,
454 | "Petal.Width": 1.3,
455 | "Species": "versicolor"
456 | },
457 | {
458 | "Sepal.Length": 6.7,
459 | "Sepal.Width": 3.1,
460 | "Petal.Length": 4.4,
461 | "Petal.Width": 1.4,
462 | "Species": "versicolor"
463 | },
464 | {
465 | "Sepal.Length": 5.6,
466 | "Sepal.Width": 3,
467 | "Petal.Length": 4.5,
468 | "Petal.Width": 1.5,
469 | "Species": "versicolor"
470 | },
471 | {
472 | "Sepal.Length": 5.8,
473 | "Sepal.Width": 2.7,
474 | "Petal.Length": 4.1,
475 | "Petal.Width": 1,
476 | "Species": "versicolor"
477 | },
478 | {
479 | "Sepal.Length": 6.2,
480 | "Sepal.Width": 2.2,
481 | "Petal.Length": 4.5,
482 | "Petal.Width": 1.5,
483 | "Species": "versicolor"
484 | },
485 | {
486 | "Sepal.Length": 5.6,
487 | "Sepal.Width": 2.5,
488 | "Petal.Length": 3.9,
489 | "Petal.Width": 1.1,
490 | "Species": "versicolor"
491 | },
492 | {
493 | "Sepal.Length": 5.9,
494 | "Sepal.Width": 3.2,
495 | "Petal.Length": 4.8,
496 | "Petal.Width": 1.8,
497 | "Species": "versicolor"
498 | },
499 | {
500 | "Sepal.Length": 6.1,
501 | "Sepal.Width": 2.8,
502 | "Petal.Length": 4,
503 | "Petal.Width": 1.3,
504 | "Species": "versicolor"
505 | },
506 | {
507 | "Sepal.Length": 6.3,
508 | "Sepal.Width": 2.5,
509 | "Petal.Length": 4.9,
510 | "Petal.Width": 1.5,
511 | "Species": "versicolor"
512 | },
513 | {
514 | "Sepal.Length": 6.1,
515 | "Sepal.Width": 2.8,
516 | "Petal.Length": 4.7,
517 | "Petal.Width": 1.2,
518 | "Species": "versicolor"
519 | },
520 | {
521 | "Sepal.Length": 6.4,
522 | "Sepal.Width": 2.9,
523 | "Petal.Length": 4.3,
524 | "Petal.Width": 1.3,
525 | "Species": "versicolor"
526 | },
527 | {
528 | "Sepal.Length": 6.6,
529 | "Sepal.Width": 3,
530 | "Petal.Length": 4.4,
531 | "Petal.Width": 1.4,
532 | "Species": "versicolor"
533 | },
534 | {
535 | "Sepal.Length": 6.8,
536 | "Sepal.Width": 2.8,
537 | "Petal.Length": 4.8,
538 | "Petal.Width": 1.4,
539 | "Species": "versicolor"
540 | },
541 | {
542 | "Sepal.Length": 6.7,
543 | "Sepal.Width": 3,
544 | "Petal.Length": 5,
545 | "Petal.Width": 1.7,
546 | "Species": "versicolor"
547 | },
548 | {
549 | "Sepal.Length": 6,
550 | "Sepal.Width": 2.9,
551 | "Petal.Length": 4.5,
552 | "Petal.Width": 1.5,
553 | "Species": "versicolor"
554 | },
555 | {
556 | "Sepal.Length": 5.7,
557 | "Sepal.Width": 2.6,
558 | "Petal.Length": 3.5,
559 | "Petal.Width": 1,
560 | "Species": "versicolor"
561 | },
562 | {
563 | "Sepal.Length": 5.5,
564 | "Sepal.Width": 2.4,
565 | "Petal.Length": 3.8,
566 | "Petal.Width": 1.1,
567 | "Species": "versicolor"
568 | },
569 | {
570 | "Sepal.Length": 5.5,
571 | "Sepal.Width": 2.4,
572 | "Petal.Length": 3.7,
573 | "Petal.Width": 1,
574 | "Species": "versicolor"
575 | },
576 | {
577 | "Sepal.Length": 5.8,
578 | "Sepal.Width": 2.7,
579 | "Petal.Length": 3.9,
580 | "Petal.Width": 1.2,
581 | "Species": "versicolor"
582 | },
583 | {
584 | "Sepal.Length": 6,
585 | "Sepal.Width": 2.7,
586 | "Petal.Length": 5.1,
587 | "Petal.Width": 1.6,
588 | "Species": "versicolor"
589 | },
590 | {
591 | "Sepal.Length": 5.4,
592 | "Sepal.Width": 3,
593 | "Petal.Length": 4.5,
594 | "Petal.Width": 1.5,
595 | "Species": "versicolor"
596 | },
597 | {
598 | "Sepal.Length": 6,
599 | "Sepal.Width": 3.4,
600 | "Petal.Length": 4.5,
601 | "Petal.Width": 1.6,
602 | "Species": "versicolor"
603 | },
604 | {
605 | "Sepal.Length": 6.7,
606 | "Sepal.Width": 3.1,
607 | "Petal.Length": 4.7,
608 | "Petal.Width": 1.5,
609 | "Species": "versicolor"
610 | },
611 | {
612 | "Sepal.Length": 6.3,
613 | "Sepal.Width": 2.3,
614 | "Petal.Length": 4.4,
615 | "Petal.Width": 1.3,
616 | "Species": "versicolor"
617 | },
618 | {
619 | "Sepal.Length": 5.6,
620 | "Sepal.Width": 3,
621 | "Petal.Length": 4.1,
622 | "Petal.Width": 1.3,
623 | "Species": "versicolor"
624 | },
625 | {
626 | "Sepal.Length": 5.5,
627 | "Sepal.Width": 2.5,
628 | "Petal.Length": 4,
629 | "Petal.Width": 1.3,
630 | "Species": "versicolor"
631 | },
632 | {
633 | "Sepal.Length": 5.5,
634 | "Sepal.Width": 2.6,
635 | "Petal.Length": 4.4,
636 | "Petal.Width": 1.2,
637 | "Species": "versicolor"
638 | },
639 | {
640 | "Sepal.Length": 6.1,
641 | "Sepal.Width": 3,
642 | "Petal.Length": 4.6,
643 | "Petal.Width": 1.4,
644 | "Species": "versicolor"
645 | },
646 | {
647 | "Sepal.Length": 5.8,
648 | "Sepal.Width": 2.6,
649 | "Petal.Length": 4,
650 | "Petal.Width": 1.2,
651 | "Species": "versicolor"
652 | },
653 | {
654 | "Sepal.Length": 5,
655 | "Sepal.Width": 2.3,
656 | "Petal.Length": 3.3,
657 | "Petal.Width": 1,
658 | "Species": "versicolor"
659 | },
660 | {
661 | "Sepal.Length": 5.6,
662 | "Sepal.Width": 2.7,
663 | "Petal.Length": 4.2,
664 | "Petal.Width": 1.3,
665 | "Species": "versicolor"
666 | },
667 | {
668 | "Sepal.Length": 5.7,
669 | "Sepal.Width": 3,
670 | "Petal.Length": 4.2,
671 | "Petal.Width": 1.2,
672 | "Species": "versicolor"
673 | },
674 | {
675 | "Sepal.Length": 5.7,
676 | "Sepal.Width": 2.9,
677 | "Petal.Length": 4.2,
678 | "Petal.Width": 1.3,
679 | "Species": "versicolor"
680 | },
681 | {
682 | "Sepal.Length": 6.2,
683 | "Sepal.Width": 2.9,
684 | "Petal.Length": 4.3,
685 | "Petal.Width": 1.3,
686 | "Species": "versicolor"
687 | },
688 | {
689 | "Sepal.Length": 5.1,
690 | "Sepal.Width": 2.5,
691 | "Petal.Length": 3,
692 | "Petal.Width": 1.1,
693 | "Species": "versicolor"
694 | },
695 | {
696 | "Sepal.Length": 5.7,
697 | "Sepal.Width": 2.8,
698 | "Petal.Length": 4.1,
699 | "Petal.Width": 1.3,
700 | "Species": "versicolor"
701 | },
702 | {
703 | "Sepal.Length": 6.3,
704 | "Sepal.Width": 3.3,
705 | "Petal.Length": 6,
706 | "Petal.Width": 2.5,
707 | "Species": "virginica"
708 | },
709 | {
710 | "Sepal.Length": 5.8,
711 | "Sepal.Width": 2.7,
712 | "Petal.Length": 5.1,
713 | "Petal.Width": 1.9,
714 | "Species": "virginica"
715 | },
716 | {
717 | "Sepal.Length": 7.1,
718 | "Sepal.Width": 3,
719 | "Petal.Length": 5.9,
720 | "Petal.Width": 2.1,
721 | "Species": "virginica"
722 | },
723 | {
724 | "Sepal.Length": 6.3,
725 | "Sepal.Width": 2.9,
726 | "Petal.Length": 5.6,
727 | "Petal.Width": 1.8,
728 | "Species": "virginica"
729 | },
730 | {
731 | "Sepal.Length": 6.5,
732 | "Sepal.Width": 3,
733 | "Petal.Length": 5.8,
734 | "Petal.Width": 2.2,
735 | "Species": "virginica"
736 | },
737 | {
738 | "Sepal.Length": 7.6,
739 | "Sepal.Width": 3,
740 | "Petal.Length": 6.6,
741 | "Petal.Width": 2.1,
742 | "Species": "virginica"
743 | },
744 | {
745 | "Sepal.Length": 4.9,
746 | "Sepal.Width": 2.5,
747 | "Petal.Length": 4.5,
748 | "Petal.Width": 1.7,
749 | "Species": "virginica"
750 | },
751 | {
752 | "Sepal.Length": 7.3,
753 | "Sepal.Width": 2.9,
754 | "Petal.Length": 6.3,
755 | "Petal.Width": 1.8,
756 | "Species": "virginica"
757 | },
758 | {
759 | "Sepal.Length": 6.7,
760 | "Sepal.Width": 2.5,
761 | "Petal.Length": 5.8,
762 | "Petal.Width": 1.8,
763 | "Species": "virginica"
764 | },
765 | {
766 | "Sepal.Length": 7.2,
767 | "Sepal.Width": 3.6,
768 | "Petal.Length": 6.1,
769 | "Petal.Width": 2.5,
770 | "Species": "virginica"
771 | },
772 | {
773 | "Sepal.Length": 6.5,
774 | "Sepal.Width": 3.2,
775 | "Petal.Length": 5.1,
776 | "Petal.Width": 2,
777 | "Species": "virginica"
778 | },
779 | {
780 | "Sepal.Length": 6.4,
781 | "Sepal.Width": 2.7,
782 | "Petal.Length": 5.3,
783 | "Petal.Width": 1.9,
784 | "Species": "virginica"
785 | },
786 | {
787 | "Sepal.Length": 6.8,
788 | "Sepal.Width": 3,
789 | "Petal.Length": 5.5,
790 | "Petal.Width": 2.1,
791 | "Species": "virginica"
792 | },
793 | {
794 | "Sepal.Length": 5.7,
795 | "Sepal.Width": 2.5,
796 | "Petal.Length": 5,
797 | "Petal.Width": 2,
798 | "Species": "virginica"
799 | },
800 | {
801 | "Sepal.Length": 5.8,
802 | "Sepal.Width": 2.8,
803 | "Petal.Length": 5.1,
804 | "Petal.Width": 2.4,
805 | "Species": "virginica"
806 | },
807 | {
808 | "Sepal.Length": 6.4,
809 | "Sepal.Width": 3.2,
810 | "Petal.Length": 5.3,
811 | "Petal.Width": 2.3,
812 | "Species": "virginica"
813 | },
814 | {
815 | "Sepal.Length": 6.5,
816 | "Sepal.Width": 3,
817 | "Petal.Length": 5.5,
818 | "Petal.Width": 1.8,
819 | "Species": "virginica"
820 | },
821 | {
822 | "Sepal.Length": 7.7,
823 | "Sepal.Width": 3.8,
824 | "Petal.Length": 6.7,
825 | "Petal.Width": 2.2,
826 | "Species": "virginica"
827 | },
828 | {
829 | "Sepal.Length": 7.7,
830 | "Sepal.Width": 2.6,
831 | "Petal.Length": 6.9,
832 | "Petal.Width": 2.3,
833 | "Species": "virginica"
834 | },
835 | {
836 | "Sepal.Length": 6,
837 | "Sepal.Width": 2.2,
838 | "Petal.Length": 5,
839 | "Petal.Width": 1.5,
840 | "Species": "virginica"
841 | },
842 | {
843 | "Sepal.Length": 6.9,
844 | "Sepal.Width": 3.2,
845 | "Petal.Length": 5.7,
846 | "Petal.Width": 2.3,
847 | "Species": "virginica"
848 | },
849 | {
850 | "Sepal.Length": 5.6,
851 | "Sepal.Width": 2.8,
852 | "Petal.Length": 4.9,
853 | "Petal.Width": 2,
854 | "Species": "virginica"
855 | },
856 | {
857 | "Sepal.Length": 7.7,
858 | "Sepal.Width": 2.8,
859 | "Petal.Length": 6.7,
860 | "Petal.Width": 2,
861 | "Species": "virginica"
862 | },
863 | {
864 | "Sepal.Length": 6.3,
865 | "Sepal.Width": 2.7,
866 | "Petal.Length": 4.9,
867 | "Petal.Width": 1.8,
868 | "Species": "virginica"
869 | },
870 | {
871 | "Sepal.Length": 6.7,
872 | "Sepal.Width": 3.3,
873 | "Petal.Length": 5.7,
874 | "Petal.Width": 2.1,
875 | "Species": "virginica"
876 | },
877 | {
878 | "Sepal.Length": 7.2,
879 | "Sepal.Width": 3.2,
880 | "Petal.Length": 6,
881 | "Petal.Width": 1.8,
882 | "Species": "virginica"
883 | },
884 | {
885 | "Sepal.Length": 6.2,
886 | "Sepal.Width": 2.8,
887 | "Petal.Length": 4.8,
888 | "Petal.Width": 1.8,
889 | "Species": "virginica"
890 | },
891 | {
892 | "Sepal.Length": 6.1,
893 | "Sepal.Width": 3,
894 | "Petal.Length": 4.9,
895 | "Petal.Width": 1.8,
896 | "Species": "virginica"
897 | },
898 | {
899 | "Sepal.Length": 6.4,
900 | "Sepal.Width": 2.8,
901 | "Petal.Length": 5.6,
902 | "Petal.Width": 2.1,
903 | "Species": "virginica"
904 | },
905 | {
906 | "Sepal.Length": 7.2,
907 | "Sepal.Width": 3,
908 | "Petal.Length": 5.8,
909 | "Petal.Width": 1.6,
910 | "Species": "virginica"
911 | },
912 | {
913 | "Sepal.Length": 7.4,
914 | "Sepal.Width": 2.8,
915 | "Petal.Length": 6.1,
916 | "Petal.Width": 1.9,
917 | "Species": "virginica"
918 | },
919 | {
920 | "Sepal.Length": 7.9,
921 | "Sepal.Width": 3.8,
922 | "Petal.Length": 6.4,
923 | "Petal.Width": 2,
924 | "Species": "virginica"
925 | },
926 | {
927 | "Sepal.Length": 6.4,
928 | "Sepal.Width": 2.8,
929 | "Petal.Length": 5.6,
930 | "Petal.Width": 2.2,
931 | "Species": "virginica"
932 | },
933 | {
934 | "Sepal.Length": 6.3,
935 | "Sepal.Width": 2.8,
936 | "Petal.Length": 5.1,
937 | "Petal.Width": 1.5,
938 | "Species": "virginica"
939 | },
940 | {
941 | "Sepal.Length": 6.1,
942 | "Sepal.Width": 2.6,
943 | "Petal.Length": 5.6,
944 | "Petal.Width": 1.4,
945 | "Species": "virginica"
946 | },
947 | {
948 | "Sepal.Length": 7.7,
949 | "Sepal.Width": 3,
950 | "Petal.Length": 6.1,
951 | "Petal.Width": 2.3,
952 | "Species": "virginica"
953 | },
954 | {
955 | "Sepal.Length": 6.3,
956 | "Sepal.Width": 3.4,
957 | "Petal.Length": 5.6,
958 | "Petal.Width": 2.4,
959 | "Species": "virginica"
960 | },
961 | {
962 | "Sepal.Length": 6.4,
963 | "Sepal.Width": 3.1,
964 | "Petal.Length": 5.5,
965 | "Petal.Width": 1.8,
966 | "Species": "virginica"
967 | },
968 | {
969 | "Sepal.Length": 6,
970 | "Sepal.Width": 3,
971 | "Petal.Length": 4.8,
972 | "Petal.Width": 1.8,
973 | "Species": "virginica"
974 | },
975 | {
976 | "Sepal.Length": 6.9,
977 | "Sepal.Width": 3.1,
978 | "Petal.Length": 5.4,
979 | "Petal.Width": 2.1,
980 | "Species": "virginica"
981 | },
982 | {
983 | "Sepal.Length": 6.7,
984 | "Sepal.Width": 3.1,
985 | "Petal.Length": 5.6,
986 | "Petal.Width": 2.4,
987 | "Species": "virginica"
988 | },
989 | {
990 | "Sepal.Length": 6.9,
991 | "Sepal.Width": 3.1,
992 | "Petal.Length": 5.1,
993 | "Petal.Width": 2.3,
994 | "Species": "virginica"
995 | },
996 | {
997 | "Sepal.Length": 5.8,
998 | "Sepal.Width": 2.7,
999 | "Petal.Length": 5.1,
1000 | "Petal.Width": 1.9,
1001 | "Species": "virginica"
1002 | },
1003 | {
1004 | "Sepal.Length": 6.8,
1005 | "Sepal.Width": 3.2,
1006 | "Petal.Length": 5.9,
1007 | "Petal.Width": 2.3,
1008 | "Species": "virginica"
1009 | },
1010 | {
1011 | "Sepal.Length": 6.7,
1012 | "Sepal.Width": 3.3,
1013 | "Petal.Length": 5.7,
1014 | "Petal.Width": 2.5,
1015 | "Species": "virginica"
1016 | },
1017 | {
1018 | "Sepal.Length": 6.7,
1019 | "Sepal.Width": 3,
1020 | "Petal.Length": 5.2,
1021 | "Petal.Width": 2.3,
1022 | "Species": "virginica"
1023 | },
1024 | {
1025 | "Sepal.Length": 6.3,
1026 | "Sepal.Width": 2.5,
1027 | "Petal.Length": 5,
1028 | "Petal.Width": 1.9,
1029 | "Species": "virginica"
1030 | },
1031 | {
1032 | "Sepal.Length": 6.5,
1033 | "Sepal.Width": 3,
1034 | "Petal.Length": 5.2,
1035 | "Petal.Width": 2,
1036 | "Species": "virginica"
1037 | },
1038 | {
1039 | "Sepal.Length": 6.2,
1040 | "Sepal.Width": 3.4,
1041 | "Petal.Length": 5.4,
1042 | "Petal.Width": 2.3,
1043 | "Species": "virginica"
1044 | },
1045 | {
1046 | "Sepal.Length": 5.9,
1047 | "Sepal.Width": 3,
1048 | "Petal.Length": 5.1,
1049 | "Petal.Width": 1.8,
1050 | "Species": "virginica"
1051 | }
1052 | ]
1053 |
--------------------------------------------------------------------------------