├── .github
│   ├── .gitignore
│   └── workflows
│       ├── check-release.yaml
│       ├── pkgdown.yaml
│       └── test-coverage.yaml
├── vignettes
│   ├── .gitignore
│   └── aa-conversions.Rmd
├── inst
│   └── extdata
│       ├── iris.fst
│       ├── iris.rds
│       ├── iris.duckdb
│       ├── iris.parquet
│       ├── iris.sqlite
│       ├── multifile.zip
│       ├── iris_dataset
│       │   ├── Species=setosa
│       │   │   └── part-0.parquet
│       │   ├── Species=versicolor
│       │   │   └── part-0.parquet
│       │   └── Species=virginica
│       │       └── part-0.parquet
│       ├── region_2022.txt
│       ├── region_2022.csv
│       ├── region_2022_with_comment.csv
│       ├── iris.ndjson
│       └── iris.json
├── man
│   ├── figures
│   │   ├── hex_parquetize.png
│   │   └── Insee_example_csv.gif
│   ├── expect_missing_argument.Rd
│   ├── parquetize_example.Rd
│   ├── expect_parquet.Rd
│   ├── get_partitions.Rd
│   ├── parquetize-package.Rd
│   ├── get_parquet_info.Rd
│   ├── check_parquet.Rd
│   ├── write_parquet_at_once.Rd
│   ├── download_extract.Rd
│   ├── rbind_parquet.Rd
│   ├── fst_to_parquet.Rd
│   ├── rds_to_parquet.Rd
│   ├── json_to_parquet.Rd
│   ├── sqlite_to_parquet.Rd
│   ├── write_parquet_by_chunk.Rd
│   ├── dbi_to_parquet.Rd
│   ├── csv_to_parquet.Rd
│   └── table_to_parquet.Rd
├── .Rbuildignore
├── .gitignore
├── data-raw
│   ├── iris-rds.R
│   ├── iris-fst.R
│   ├── iris-sqlite.R
│   ├── iris-parquet.R
│   └── region-2022.R
├── tests
│   ├── testthat
│   │   ├── test-check_parquet.R
│   │   ├── test-parquetize_example.R
│   │   ├── test-get_parquet_info.R
│   │   ├── test-rbind_parquet.R
│   │   ├── test-get_partitions.R
│   │   ├── test-fst_to_parquet.R
│   │   ├── test-rds_to_parquet.R
│   │   ├── test-write_parquet_at_once.R
│   │   ├── test-download_extract.R
│   │   ├── test-utilities.R
│   │   ├── test-json_to_parquet.R
│   │   ├── test-sqlite_to_parquet.R
│   │   ├── test-dbi_to_parquet.R
│   │   ├── test-write_parquet_by_chunk.R
│   │   ├── test-testthat-helpers.R
│   │   ├── test-csv_to_parquet.R
│   │   └── test-table_to_parquet.R
│   └── testthat.R
├── parquetize.Rproj
├── _pkgdown.yml
├── R
│   ├── package-parquetize.R
│   ├── parquetize_example.R
│   ├── get_partitions.R
│   ├── check_parquet.R
│   ├── get_parquet_info.R
│   ├── testthat-helpers.R
│   ├── rds_to_parquet.R
│   ├── fst_to_parquet.R
│   ├── download_extract.R
│   ├── write_parquet_at_once.R
│   ├── utilities.R
│   ├── json_to_parquet.R
│   ├── rbind_parquet.R
│   ├── sqlite_to_parquet.R
│   ├── write_parquet_by_chunk.R
│   ├── dbi_to_parquet.R
│   ├── csv_to_parquet.R
│   └── table_to_parquet.R
├── DESCRIPTION
├── NAMESPACE
├── dev
│   └── dev_history.R
├── CONTRIBUTING.md
├── README.md
└── NEWS.md

--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
--------------------------------------------------------------------------------
/inst/extdata/iris.fst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris.fst
--------------------------------------------------------------------------------
/inst/extdata/iris.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris.rds
--------------------------------------------------------------------------------
/inst/extdata/iris.duckdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris.duckdb
--------------------------------------------------------------------------------
/inst/extdata/iris.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris.parquet
--------------------------------------------------------------------------------
/inst/extdata/iris.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris.sqlite
--------------------------------------------------------------------------------
/inst/extdata/multifile.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/multifile.zip
--------------------------------------------------------------------------------
/man/figures/hex_parquetize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/man/figures/hex_parquetize.png
--------------------------------------------------------------------------------
/man/figures/Insee_example_csv.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/man/figures/Insee_example_csv.gif
--------------------------------------------------------------------------------
/inst/extdata/iris_dataset/Species=setosa/part-0.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris_dataset/Species=setosa/part-0.parquet
--------------------------------------------------------------------------------
/inst/extdata/iris_dataset/Species=versicolor/part-0.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris_dataset/Species=versicolor/part-0.parquet
--------------------------------------------------------------------------------
/inst/extdata/iris_dataset/Species=virginica/part-0.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddotta/parquetize/HEAD/inst/extdata/iris_dataset/Species=virginica/part-0.parquet
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^dev/dev_history\.R$
4 | ^data-raw$
5 | ^_pkgdown\.yml$
6 | ^docs$
7 | ^pkgdown$
8 | ^\.github$
9 | ^CONTRIBUTING.md
10 | 
11 | # These folders
12 | tests/testthat/output/
13 | tests/testthat/Data_test/
14 | ^doc$
15 | ^Meta$
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # These files
2 | .Rproj.user
3 | .Rhistory
4 | .RData
5 | .Ruserdata
6 | 
7 | Thumbs.db
8 | docs
9 | inst/doc
10 | 
11 | # These folders
12 | Data/
13 | Data_test/
14 | tests/testthat/output/
15 | tests/testthat/Data/
16 | 
17 | # Files with these extensions
18 | *.sas7bdat
19 | /doc/
20 | /Meta/
--------------------------------------------------------------------------------
/data-raw/iris-rds.R:
--------------------------------------------------------------------------------
1 | #################################################################%#
2 | #### Code to create the rds file `iris.rds` in `inst/extdata` ####
3 | ###############################################################%#
4 | 
5 | data(iris)
6 | 
7 | saveRDS(object = iris,
8 |         file = "inst/extdata/iris.rds")
--------------------------------------------------------------------------------
/data-raw/iris-fst.R:
--------------------------------------------------------------------------------
1 | #################################################################%#
2 | #### Code to create the fst file `iris.fst` in `inst/extdata` ####
3 | ###############################################################%#
4 | 
5 | library(fst)
6 | 
7 | data(iris)
8 | 
9 | fst::write.fst(x = iris,
10 |                path = "inst/extdata/iris.fst")
--------------------------------------------------------------------------------
/tests/testthat/test-check_parquet.R:
--------------------------------------------------------------------------------
1 | test_that("check_parquet fails on bad file", {
2 |   expect_error(
3 |     check_parquet(parquetize_example("iris.sqlite")),
4 |     regexp = "Error creating dataset"
5 |   )
6 | })
7 | 
8 | test_that("check_parquet fails on missing file", {
9 |   expect_error(
10 |     check_parquet("no_such_file"),
11 |     class = "no_such_file"
12 |   )
13 | })
--------------------------------------------------------------------------------
/data-raw/iris-sqlite.R:
--------------------------------------------------------------------------------
1 | ####################################################################%#
2 | #### Code to create the sqlite file `iris.sqlite` in `inst/extdata` ####
3 | ##################################################################%#
4 | 
5 | library(RSQLite)
6 | library(DBI)
7 | con <- DBI::dbConnect(RSQLite::SQLite(), "inst/extdata/iris.sqlite")
8 | dbWriteTable(con, "iris", iris)
9 | dbDisconnect(con)
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | # This file is part of the standard setup for testthat.
2 | # It is recommended that you do not modify it.
3 | #
4 | # Where should you do additional test configuration?
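# One way to answer that question (a sketch, not part of the stock template):
# shared helpers conventionally live in tests/testthat/helper-*.R files, which
# testthat sources automatically before running the tests. This package instead
# exports its helpers (expect_parquet(), expect_missing_argument()) from
# R/testthat-helpers.R. A hypothetical helper file could look like this:
#
#   # tests/testthat/helper-paths.R (hypothetical)
#   extdata_path <- function(file) {
#     system.file("extdata", file, package = "parquetize")
#   }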
5 | # Learn more about the roles of various files in:
6 | # * https://r-pkgs.org/tests.html
7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files
8 | 
9 | library(testthat)
10 | library(parquetize)
11 | 
12 | test_check("parquetize")
--------------------------------------------------------------------------------
/parquetize.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 | ProjectId: 673eed3b-84af-4afe-8e19-fe69a63307c1
3 | 
4 | RestoreWorkspace: Default
5 | SaveWorkspace: Default
6 | AlwaysSaveHistory: Default
7 | 
8 | EnableCodeIndexing: Yes
9 | UseSpacesForTab: Yes
10 | NumSpacesForTab: 2
11 | Encoding: UTF-8
12 | 
13 | RnwWeave: Sweave
14 | LaTeX: pdfLaTeX
15 | 
16 | AutoAppendNewline: Yes
17 | StripTrailingWhitespace: Yes
18 | 
19 | BuildType: Package
20 | PackageUseDevtools: Yes
21 | PackageInstallArgs: --no-multiarch --with-keep.source
--------------------------------------------------------------------------------
/man/expect_missing_argument.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/testthat-helpers.R
3 | \name{expect_missing_argument}
4 | \alias{expect_missing_argument}
5 | \title{Check if missing argument error is raised}
6 | \usage{
7 | expect_missing_argument(object, regexp)
8 | }
9 | \arguments{
10 | \item{object}{the object to check}
11 | 
12 | \item{regexp}{a regexp with the message we must find}
13 | }
14 | \value{
15 | same as expect_error
16 | }
17 | \description{
18 | Check if missing argument error is raised
19 | }
20 | \keyword{internal}
--------------------------------------------------------------------------------
/tests/testthat/test-parquetize_example.R:
--------------------------------------------------------------------------------
1 | test_that("test that the number of sample files in the package is positive", {
2 |   expect_true(
3 |     length(parquetize_example()) > 0
4 |   )
5 | })
6 | 
7 | test_that("test with file", {
8 |   expect_no_error(
9 |     parquetize_example("iris.json")
10 |   )
11 | })
12 | 
13 | test_that("test with directory without extension", {
14 |   expect_no_error(
15 |     parquetize_example("iris_dataset")
16 |   )
17 | })
18 | 
19 | test_that("test fails if file does not exist", {
20 |   expect_error(
21 |     parquetize_example("no_such_dataset"),
22 |     class = "no_such_file"
23 |   )
24 | })
--------------------------------------------------------------------------------
/data-raw/iris-parquet.R:
--------------------------------------------------------------------------------
1 | #########################################################################################%#
2 | #### Code to create the parquet file `iris.parquet` and partitioned files in `inst/extdata` ####
3 | #######################################################################################%#
4 | 
5 | library(arrow)
6 | 
7 | data(iris)
8 | 
9 | # For iris.parquet
10 | arrow::write_parquet(x = iris,
11 |                      sink = "inst/extdata/iris.parquet")
12 | 
13 | # For partitioned files
14 | 
15 | arrow::write_dataset(dataset = iris,
16 |                      path = "inst/extdata/",
17 |                      partitioning = c("Species"))
--------------------------------------------------------------------------------
/man/parquetize_example.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/parquetize_example.R
3 | \name{parquetize_example}
4 | \alias{parquetize_example}
5 | \title{Get path to parquetize example}
6 | \usage{
7 | parquetize_example(file = NULL)
8 | }
9 | \arguments{
10 | \item{file}{Name of file or directory. If \code{NULL}, the example files will be listed.}
11 | }
12 | \value{
13 | A character string
14 | }
15 | \description{
16 | parquetize comes bundled with a number of sample files in its \code{inst/extdata}
17 | directory. This function makes them easy to access.
18 | }
19 | \examples{
20 | parquetize_example()
21 | parquetize_example("region_2022.csv")
22 | parquetize_example("iris_dataset")
23 | }
--------------------------------------------------------------------------------
/tests/testthat/test-get_parquet_info.R:
--------------------------------------------------------------------------------
1 | test_that("get_parquet_info works for file", {
2 |   parquet <- system.file("extdata", "iris.parquet", package = "parquetize")
3 |   info <- get_parquet_info(parquet)
4 | 
5 |   expect_s3_class(info, "tbl")
6 |   expect_equal(nrow(info), 1)
7 |   expect_equal(ncol(info), 5)
8 | 
9 |   expect_equal(info[[1, "path"]], parquet)
10 |   expect_equal(info[[1, "num_rows"]], 150)
11 |   expect_equal(info[[1, "num_row_groups"]], 1)
12 |   expect_equal(info[[1, "num_columns"]], 5)
13 |   expect_equal(info[[1, "mean_row_group_size"]], 150)
14 | })
15 | 
16 | test_that("get_parquet_info works for dataset", {
17 |   parquet <- system.file("extdata", "iris_dataset", package = "parquetize")
18 |   info <- get_parquet_info(parquet)
19 | 
20 |   expect_s3_class(info, "tbl")
21 |   expect_equal(nrow(info), 3)
22 |   expect_equal(ncol(info), 5)
23 | })
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | template:
2 |   bootstrap: 5
3 |   bootswatch: litera
4 | 
5 | navbar:
6 |   right:
7 |   - text: Contribute
8 |     icon: fab fa-github fa-lg
9 |     href: https://github.com/ddotta/parquetize
10 | 
11 | reference:
12 | - title: Functions
13 |   desc: The conversion functions available in this package
14 |   contents:
15 |   - csv_to_parquet
16 |   - json_to_parquet
17 |   - rds_to_parquet
18 |   - fst_to_parquet
19 |   - table_to_parquet
20 |   - sqlite_to_parquet
21 |   - dbi_to_parquet
22 | - title: Other functions
23 |   contents:
24 |   - get_parquet_info
25 |   - get_partitions
26 |   - check_parquet
27 |   - download_extract
28 |   - rbind_parquet
29 |   - parquetize_example
30 | - title: Developers
31 |   contents:
32 |   - write_parquet_by_chunk
33 |   - write_parquet_at_once
--------------------------------------------------------------------------------
/.github/workflows/check-release.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 |   pull_request:
8 |     branches:
9 |       - main
10 | 
11 | name: R-CMD-check
12 | 
13 | jobs:
14 |   R-CMD-check:
15 |     runs-on: ubuntu-latest
16 |     env:
17 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
18 |       R_KEEP_PKG_SOURCE: yes
19 |     steps:
20 |       - uses: actions/checkout@v3
21 | 
22 |       - uses: r-lib/actions/setup-r@v2
23 |         with:
24 |           use-public-rspm: true
25 | 
26 |       - uses: r-lib/actions/setup-r-dependencies@v2
27 |         with:
28 |           extra-packages: any::rcmdcheck
29 |           needs: check
30 | 
31 |       - uses: r-lib/actions/check-r-package@v2
--------------------------------------------------------------------------------
/R/package-parquetize.R:
--------------------------------------------------------------------------------
1 | #' @keywords internal
2 | #' @importFrom DBI dbClearResult dbConnect dbDisconnect dbFetch dbHasCompleted dbListTables dbReadTable dbSendQuery
3 | #' @importFrom RSQLite SQLite
4 | #' @importFrom arrow open_dataset read_json_arrow read_parquet write_dataset write_parquet
5 | #' @importFrom cli cli_abort cli_alert_danger cli_alert_info cli_alert_success cli_alert_warning cli_progress_bar cli_progress_message
6 | #' @importFrom curl curl_download
7 | #' @importFrom fst read.fst
8 | #' @importFrom glue glue glue_sql
9 | #' @importFrom haven read_dta read_sas read_sav
10 | #' @importFrom jsonlite read_json
11 | #' @importFrom lifecycle deprecate_warn deprecated
12 | #' @importFrom readr locale read_delim
13 | #' @importFrom tibble as_tibble
14 | #' @importFrom tidyselect all_of everything
15 | #' @importFrom tools file_ext file_path_sans_ext
16 | #' @importFrom utils object.size unzip
17 | #' @importFrom rlang inject
18 | #' @import dplyr
19 | "_PACKAGE"
--------------------------------------------------------------------------------
/man/expect_parquet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/testthat-helpers.R
3 | \name{expect_parquet}
4 | \alias{expect_parquet}
5 | \title{Check if parquet dataset/file is readable and has the right number of rows}
6 | \usage{
7 | expect_parquet(
8 |   path,
9 |   with_lines,
10 |   with_partitions = NULL,
11 |   with_columns = NULL,
12 |   with_files = NULL
13 | )
14 | }
15 | \arguments{
16 | \item{path}{path to the parquet file or dataset}
17 | 
18 | \item{with_lines}{number of lines the file/dataset should have}
19 | 
20 | \item{with_partitions}{NULL or a vector with the partition names the dataset should have}
21 | 
22 | \item{with_columns}{NULL or a vector of column names the dataset/file should have}
23 | 
24 | \item{with_files}{NULL or number of files a dataset should have}
25 | }
26 | \value{
27 | the dataset handle
28 | }
29 | \description{
30 | Check if parquet dataset/file is readable and has the right number of rows
31 | }
32 | \keyword{internal}
--------------------------------------------------------------------------------
/man/get_partitions.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_partitions.R
3 | \name{get_partitions}
4 | \alias{get_partitions}
5 | \title{get unique values from a table's column}
6 | \usage{
7 | get_partitions(conn, table, column)
8 | }
9 | \arguments{
10 | \item{conn}{A \code{DBIConnection} object, as returned by \code{DBI::dbConnect}}
11 | 
12 | \item{table}{a DB table name}
13 | 
14 | \item{column}{a column name of the table passed in the \code{table} argument}
15 | }
16 | \value{
17 | a vector with unique values for the column of the table
18 | }
19 | \description{
20 | This function allows you to extract unique values from a table's column to use as partitions.\cr
21 | 
22 | Internally, this function does "SELECT DISTINCT(\code{mycolumn}) FROM \code{mytable};"
23 | }
24 | \examples{
25 | dbi_connection <- DBI::dbConnect(RSQLite::SQLite(),
26 |   system.file("extdata","iris.sqlite",package = "parquetize"))
27 | 
28 | get_partitions(dbi_connection, "iris", "Species")
29 | }
--------------------------------------------------------------------------------
/tests/testthat/test-rbind_parquet.R:
--------------------------------------------------------------------------------
1 | test_that("Checks rbind_parquet creates correct output file", {
2 |   temp_dir <- tempfile()
3 | 
4 |   dir.create(temp_dir, showWarnings = FALSE)
5 | 
6 |   file.create(file.path(temp_dir, "test_data1-4.parquet"))
7 |   write_parquet(data.frame(
8 |     x = c("a","b","c"),
9 |     y = c(1L,2L,3L)
10 |   ), file.path(temp_dir, "test_data1-4.parquet"))
11 | 
12 |   file.create(file.path(temp_dir, "test_data4-6.parquet"))
13 |   write_parquet(data.frame(
14 |     x = c("d","e","f"),
15 |     y = c(4L,5L,6L)
16 |   ), file.path(temp_dir, "test_data4-6.parquet"))
17 | 
18 |   test_data <- rbind_parquet(folder = temp_dir,
19 |                              output_name = "test_data",
20 |                              delete_initial_files = FALSE)
21 | 
22 |   expect_equal(
23 |     unname(unlist(lapply(test_data, class))),
24 |     c("character", "integer")
25 |   )
26 | 
27 |   expect_equal(names(test_data), c("x",
28 |                                    "y"))
29 | 
30 | })
--------------------------------------------------------------------------------
/man/parquetize-package.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/package-parquetize.R
3 | \docType{package}
4 | \name{parquetize-package}
5 | \alias{parquetize}
6 | \alias{parquetize-package}
7 | \title{parquetize: Convert Files to Parquet Format}
8 | \description{
9 | Collection of functions to get files in parquet format. Parquet is a columnar storage file format \url{https://parquet.apache.org/}. The files to convert can be of several formats ("csv", "RData", "rds", "RSQLite", "json", "ndjson", "SAS", "SPSS"...).
10 | }
11 | \seealso{
12 | Useful links:
13 | \itemize{
14 | \item \url{https://ddotta.github.io/parquetize/}
15 | \item \url{https://github.com/ddotta/parquetize}
16 | \item Report bugs at \url{https://github.com/ddotta/parquetize/issues}
17 | }
18 | 
19 | }
20 | \author{
21 | \strong{Maintainer}: Damien Dotta \email{damien.dotta@live.fr}
22 | 
23 | Authors:
24 | \itemize{
25 | \item Nicolas Chuche \email{nicolas.chuche@barna.be}
26 | }
27 | 
28 | }
29 | \keyword{internal}
--------------------------------------------------------------------------------
/R/parquetize_example.R:
--------------------------------------------------------------------------------
1 | #' @name parquetize_example
2 | #'
3 | #' @title Get path to parquetize example
4 | #'
5 | #' @description parquetize comes bundled with a number of sample files in its `inst/extdata`
6 | #' directory. This function makes them easy to access.
7 | #'
8 | #' @param file Name of file or directory. If `NULL`, the example files will be listed.
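#'
#' @details A short usage sketch (illustrative only): the returned path can be
#' fed straight into one of the converters, for example
#' `csv_to_parquet(parquetize_example("region_2022.csv"), path_to_parquet = tempfile(fileext = ".parquet"))`.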
9 | #'
10 | #' @return A character string
11 | #'
12 | #' @export
13 | #' @examples
14 | #' parquetize_example()
15 | #' parquetize_example("region_2022.csv")
16 | #' parquetize_example("iris_dataset")
17 | 
18 | parquetize_example <- function(file = NULL) {
19 |   # To show all example files contained in parquetize
20 |   if (is.null(file)) {
21 |     return(dir(system.file("extdata", package = "parquetize")))
22 |   }
23 | 
24 |   # To get the path to a file or a directory
25 |   tryCatch(
26 |     system.file("extdata", file, package = "parquetize", mustWork = TRUE),
27 |     error = function(cond) cli_abort("Be careful, {file} doesn't exist in parquetize", class = "no_such_file")
28 |   )
29 | }
--------------------------------------------------------------------------------
/data-raw/region-2022.R:
--------------------------------------------------------------------------------
1 | ################################################################################################%#
2 | #### Code to create the csv/txt files `region_2022.csv` and `region_2022.txt` in `inst/extdata` ####
3 | ################################################################################################%#
4 | 
5 | # The file `region_2022.csv` comes from the site insee.fr.
6 | # It can be downloaded at the following URL:
7 | # https://www.insee.fr/fr/information/6051727
8 | 
9 | library(curl)
10 | library(readr)
11 | 
12 | zipinseefr <- curl_download("https://www.insee.fr/fr/statistiques/fichier/6051727/cog_ensemble_2022_csv.zip",
13 |                             tempfile())
14 | filesinseefr <- unzip(zipfile=zipinseefr)
15 | 
16 | region_2022 <- read_delim(filesinseefr[11],
17 |                           show_col_types = FALSE)
18 | 
19 | write.csv2(
20 |   region_2022,
21 |   file = "inst/extdata/region_2022.csv",
22 |   row.names = FALSE)
23 | 
24 | write.table(
25 |   region_2022,
26 |   file = "inst/extdata/region_2022.txt",
27 |   row.names = FALSE
28 | )
--------------------------------------------------------------------------------
/tests/testthat/test-get_partitions.R:
--------------------------------------------------------------------------------
1 | dbi_connection <- DBI::dbConnect(RSQLite::SQLite(),
2 |   system.file("extdata","iris.sqlite",package = "parquetize"))
3 | on.exit(DBI::dbDisconnect(dbi_connection))
4 | 
5 | test_that("Checks get_partitions returns the expected values", {
6 |   partitions <- expect_no_error(
7 |     get_partitions(
8 |       conn = dbi_connection,
9 |       table = "iris",
10 |       column = "Species"
11 |     ),
12 |   )
13 | 
14 |   testthat::expect_setequal(partitions, c("setosa", "versicolor", "virginica"))
15 | })
16 | 
17 | test_that("Checks arguments are correctly filled in", {
18 |   expect_missing_argument(
19 |     get_partitions(
20 |       table = "iris",
21 |       column = "Species"
22 |     ),
23 |     regexp = "conn"
24 |   )
25 |   expect_missing_argument(
26 |     get_partitions(
27 |       conn = dbi_connection,
28 |       column = "Species"
29 |     ),
30 |     regexp = "table"
31 |   )
32 |   expect_missing_argument(
33 |     get_partitions(
34 |       conn = dbi_connection,
35 |       table = "iris",
36 |     ),
37 |     regexp = "column"
38 |   )
39 | })
--------------------------------------------------------------------------------
/man/get_parquet_info.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_parquet_info.R
3 | \name{get_parquet_info}
4 | \alias{get_parquet_info}
5 | \title{Get various info on parquet files}
6 | \usage{
7 | get_parquet_info(path)
8 | }
9 | \arguments{
10 | \item{path}{parquet file path or directory. If a directory is given,
11 | \code{get_parquet_info} will be applied to all parquet files found in
12 | subdirectories}
13 | }
14 | \value{
15 | a tibble with 5 columns:
16 | \itemize{
17 | \item path, file path
18 | \item num_rows, number of rows
19 | \item num_row_groups, number of row groups
20 | \item num_columns, number of columns
21 | \item mean_row_group_size, mean row group size
22 | }
23 | 
24 | If one column contains \code{NA}, the parquet file may be malformed.
25 | }
26 | \description{
27 | One very important piece of parquet metadata is the row group size.\cr
28 | 
29 | If its value is low (below 10 000), you should rebuild your parquet files.\cr
30 | 
31 | A normal value is between 30 000 and 1 000 000.
32 | }
33 | \examples{
34 | get_parquet_info(system.file("extdata", "iris.parquet", package = "parquetize"))
35 | 
36 | get_parquet_info(system.file("extdata", "iris_dataset", package = "parquetize"))
37 | }
--------------------------------------------------------------------------------
/inst/extdata/region_2022.txt:
--------------------------------------------------------------------------------
1 | "REG" "CHEFLIEU" "TNCC" "NCC" "NCCENR" "LIBELLE"
2 | 1 "97105" 3 "GUADELOUPE" "Guadeloupe" "Guadeloupe"
3 | 2 "97209" 3 "MARTINIQUE" "Martinique" "Martinique"
4 | 3 "97302" 3 "GUYANE" "Guyane" "Guyane"
5 | 4 "97411" 0 "LA REUNION" "La Réunion" "La Réunion"
6 | 6 "97608" 0 "MAYOTTE" "Mayotte" "Mayotte"
7 | 11 "75056" 1 "ILE DE FRANCE" "Île-de-France" "Île-de-France"
8 | 24 "45234" 2 "CENTRE VAL DE LOIRE" "Centre-Val de Loire" "Centre-Val de Loire"
9 | 27 "21231" 0 "BOURGOGNE FRANCHE COMTE" "Bourgogne-Franche-Comté" "Bourgogne-Franche-Comté"
10 | 28 "76540" 0 "NORMANDIE" "Normandie" "Normandie"
11 | 32 "59350" 4 "HAUTS DE FRANCE" "Hauts-de-France" "Hauts-de-France"
12 | 44 "67482" 2 "GRAND EST" "Grand Est" "Grand Est"
13 | 52 "44109" 4 "PAYS DE LA LOIRE" "Pays de la Loire" "Pays de la Loire"
14 | 53 "35238" 0 "BRETAGNE" "Bretagne" "Bretagne"
15 | 75 "33063" 3 "NOUVELLE AQUITAINE" "Nouvelle-Aquitaine" "Nouvelle-Aquitaine"
16 | 76 "31555" 1 "OCCITANIE" "Occitanie" "Occitanie"
17 | 84 "69123" 1 "AUVERGNE RHONE ALPES" "Auvergne-Rhône-Alpes" "Auvergne-Rhône-Alpes"
18 | 93 "13055" 0 "PROVENCE ALPES COTE D AZUR" "Provence-Alpes-Côte d'Azur" "Provence-Alpes-Côte d'Azur"
19 | 94 "2A004" 0 "CORSE" "Corse" "Corse"
--------------------------------------------------------------------------------
/inst/extdata/region_2022.csv:
--------------------------------------------------------------------------------
1 | "REG";"CHEFLIEU";"TNCC";"NCC";"NCCENR";"LIBELLE"
2 | "01";"97105";3;"GUADELOUPE";"Guadeloupe";"Guadeloupe"
3 | "02";"97209";3;"MARTINIQUE";"Martinique";"Martinique"
4 | "03";"97302";3;"GUYANE";"Guyane";"Guyane"
5 | "04";"97411";0;"LA REUNION";"La Réunion";"La Réunion"
6 | "06";"97608";0;"MAYOTTE";"Mayotte";"Mayotte"
7 | "11";"75056";1;"ILE DE FRANCE";"Île-de-France";"Île-de-France"
8 | "24";"45234";2;"CENTRE VAL DE LOIRE";"Centre-Val de Loire";"Centre-Val de Loire"
9 | "27";"21231";0;"BOURGOGNE FRANCHE COMTE";"Bourgogne-Franche-Comté";"Bourgogne-Franche-Comté"
10 | "28";"76540";0;"NORMANDIE";"Normandie";"Normandie"
11 | "32";"59350";4;"HAUTS DE FRANCE";"Hauts-de-France";"Hauts-de-France"
12 | "44";"67482";2;"GRAND EST";"Grand Est";"Grand Est"
13 | "52";"44109";4;"PAYS DE LA LOIRE";"Pays de la Loire";"Pays de la Loire"
14 | "53";"35238";0;"BRETAGNE";"Bretagne";"Bretagne"
15 | "75";"33063";3;"NOUVELLE AQUITAINE";"Nouvelle-Aquitaine";"Nouvelle-Aquitaine"
16 | "76";"31555";1;"OCCITANIE";"Occitanie";"Occitanie"
17 | "84";"69123";1;"AUVERGNE RHONE ALPES";"Auvergne-Rhône-Alpes";"Auvergne-Rhône-Alpes"
18 | "93";"13055";0;"PROVENCE ALPES COTE D AZUR";"Provence-Alpes-Côte d'Azur";"Provence-Alpes-Côte d'Azur"
19 | "94";"2A004";0;"CORSE";"Corse";"Corse"
--------------------------------------------------------------------------------
/inst/extdata/region_2022_with_comment.csv:
--------------------------------------------------------------------------------
1 | # A comment
2 | "REG";"CHEFLIEU";"TNCC";"NCC";"NCCENR";"LIBELLE"
3 | "01";"97105";3;"GUADELOUPE";"Guadeloupe";"Guadeloupe"
4 | "02";"97209";3;"MARTINIQUE";"Martinique";"Martinique"
5 | "03";"97302";3;"GUYANE";"Guyane";"Guyane"
6 | "04";"97411";0;"LA REUNION";"La Réunion";"La Réunion"
7 | "06";"97608";0;"MAYOTTE";"Mayotte";"Mayotte"
8 | "11";"75056";1;"ILE DE FRANCE";"Île-de-France";"Île-de-France"
9 | "24";"45234";2;"CENTRE VAL DE LOIRE";"Centre-Val de Loire";"Centre-Val de Loire"
10 | "27";"21231";0;"BOURGOGNE FRANCHE COMTE";"Bourgogne-Franche-Comté";"Bourgogne-Franche-Comté"
11 | "28";"76540";0;"NORMANDIE";"Normandie";"Normandie"
12 | "32";"59350";4;"HAUTS DE FRANCE";"Hauts-de-France";"Hauts-de-France"
13 | "44";"67482";2;"GRAND EST";"Grand Est";"Grand Est"
14 | "52";"44109";4;"PAYS DE LA LOIRE";"Pays de la Loire";"Pays de la Loire"
15 | "53";"35238";0;"BRETAGNE";"Bretagne";"Bretagne"
16 | "75";"33063";3;"NOUVELLE AQUITAINE";"Nouvelle-Aquitaine";"Nouvelle-Aquitaine"
17 | "76";"31555";1;"OCCITANIE";"Occitanie";"Occitanie"
18 | "84";"69123";1;"AUVERGNE RHONE ALPES";"Auvergne-Rhône-Alpes";"Auvergne-Rhône-Alpes"
19 | "93";"13055";0;"PROVENCE ALPES COTE D AZUR";"Provence-Alpes-Côte d'Azur";"Provence-Alpes-Côte d'Azur"
20 | "94";"2A004";0;"CORSE";"Corse";"Corse"
--------------------------------------------------------------------------------
/man/check_parquet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/check_parquet.R
3 | \name{check_parquet}
4 | \alias{check_parquet}
5 | \title{Check if parquet file or dataset is readable and return basic information}
6 | \usage{
7 | check_parquet(path)
8 | }
9 | \arguments{
10 | \item{path}{path to the file or dataset}
11 | }
12 | \value{
13 | a tibble with information on the parquet dataset/file's columns, with
14 | three columns: field name, arrow type and nullable
15 | }
16 | \description{
17 | This function checks if a file/dataset is a valid parquet format.
18 | It will print the number of lines/columns and return a tibble with column
19 | information.
20 | }
21 | \details{
22 | This function will:
23 | \itemize{
24 | \item open the parquet dataset/file to check if it's valid
25 | \item print the number of lines
26 | \item print the number of columns
27 | \item return a tibble with one row per column, giving:
28 | \itemize{
29 | \item the column name (string)
30 | \item the arrow type (string)
31 | }
32 | }
33 | 
34 | You can find a list of arrow types in the documentation
35 | \href{https://arrow.apache.org/docs/r/articles/data_types.html}{on this page}.
36 | }
37 | \examples{
38 | 
39 | # check a parquet file
40 | check_parquet(parquetize_example("iris.parquet"))
41 | 
42 | # check a parquet dataset
43 | check_parquet(parquetize_example("iris_dataset"))
44 | }
--------------------------------------------------------------------------------
/R/get_partitions.R:
--------------------------------------------------------------------------------
1 | #' @name get_partitions
2 | #'
3 | #' @title get unique values from a table's column
4 | #'
5 | #' @description This function allows you to extract unique values from a table's column to use as partitions.\cr
6 | #'
7 | #' Internally, this function does "SELECT DISTINCT(`mycolumn`) FROM `mytable`;"
8 | #'
9 | #' @param conn A `DBIConnection` object, as returned by `DBI::dbConnect`
10 | #' @param table a DB table name
11 | #' @param column a column name of the table passed in the `table` argument
12 | #'
13 | #' @return a vector with unique values for the column of the table
14 | #' @export
15 | #'
16 | #' @examples
17 | #' dbi_connection <- DBI::dbConnect(RSQLite::SQLite(),
18 | #'   system.file("extdata","iris.sqlite",package = "parquetize"))
19 | #'
20 | #' get_partitions(dbi_connection, "iris", "Species")
21 | get_partitions <- function(conn, table, column) {
22 |   if (missing(conn)) {
23 |     cli_abort("Be careful, the argument conn must be filled in", class = "parquetize_missing_argument")
24 |   }
25 |   if (missing(table)) {
26 |     cli_abort("Be careful, the argument table must be filled in", class = "parquetize_missing_argument")
27 |   }
28 |   if (missing(column)) {
29 |     cli_abort("Be careful, the argument column must be filled in", class = "parquetize_missing_argument")
30 |   }
31 | 
32 |   DBI::dbGetQuery(conn, glue::glue("SELECT distinct({`column`}) FROM {`table`}", .con = conn))[,1]
33 | }
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: parquetize
2 | Type: Package
3 | Title: Convert Files to Parquet Format
4 | Version: 0.5.8
5 | Authors@R:
6 |     c(person(given = "Damien",
7 |              family = "Dotta",
8 |              role = c("aut", "cre"),
9 |              email = "damien.dotta@live.fr"),
10 |       person(given = "Nicolas",
11 |              family = "Chuche",
12 |              role = c("aut"),
13 |              email = "nicolas.chuche@barna.be"))
14 | Description: Collection of functions to get files in parquet format.
15 |     Parquet is a columnar storage file format <https://parquet.apache.org/>.
16 |     The files to convert can be of several formats
17 |     ("csv", "RData", "rds", "RSQLite",
18 |     "json", "ndjson", "SAS", "SPSS"...).
19 | License: Apache License (>= 2.0)
20 | Encoding: UTF-8
21 | Depends:
22 |     R (>= 3.5.0)
23 | URL: https://ddotta.github.io/parquetize/,
24 |     https://github.com/ddotta/parquetize
25 | BugReports: https://github.com/ddotta/parquetize/issues
26 | Roxygen: list(markdown = TRUE)
27 | RoxygenNote: 7.3.2
28 | Suggests:
29 |     knitr,
30 |     rmarkdown,
31 |     testthat (>= 3.0.0)
32 | Config/testthat/edition: 3
33 | Imports:
34 |     haven (>= 2.4.0),
35 |     arrow,
36 |     curl,
37 |     readr,
38 |     jsonlite,
39 |     DBI,
40 |     RSQLite,
41 |     cli,
42 |     tidyselect,
43 |     lifecycle,
44 |     tools,
45 |     glue,
46 |     fst,
47 |     rlang,
48 |     dplyr,
49 |     tibble
50 | VignetteBuilder: knitr
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 |   push:
5 |     branches: [main, master]
6 |   pull_request:
7 |     branches: [main, master]
8 |   release:
9 |     types: [published]
10 |   workflow_dispatch:
11 | 
12 | name: pkgdown
13 | 
14 | jobs:
15 |   pkgdown:
16 |     runs-on: ubuntu-latest
17 |     # Only restrict concurrency for non-PR jobs
18 |     concurrency:
19 |       group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
20 |     env:
21 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
22 |     steps:
23 |       - uses: actions/checkout@v3
24 | 
25 |       - uses: r-lib/actions/setup-pandoc@v2
26 | 
27 |       - uses: r-lib/actions/setup-r@v2
28 |         with:
29 |           use-public-rspm: true
30 | 
31 |       - uses: r-lib/actions/setup-r-dependencies@v2
32 |         with:
33 |           extra-packages: any::pkgdown, local::.
34 |           needs: website
35 | 
36 |       - name: Build site
37 |         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
38 |         shell: Rscript {0}
39 | 
40 |       - name: Deploy to GitHub pages 🚀
41 |         if: github.event_name != 'pull_request'
42 |         uses: JamesIves/github-pages-deploy-action@v4.4.1
43 |         with:
44 |           clean: false
45 |           branch: gh-pages
46 |           folder: docs
--------------------------------------------------------------------------------
/tests/testthat/test-fst_to_parquet.R:
--------------------------------------------------------------------------------
1 | test_that("Checks arguments are correctly filled in", {
2 |   expect_missing_argument(
3 |     fst_to_parquet(
4 |       path_to_file = system.file("extdata","iris.fst",package = "parquetize")
5 |     ),
6 |     regexp = "path_to_parquet"
7 |   )
8 |   expect_missing_argument(
9 |     fst_to_parquet(
10 |       path_to_parquet = tempfile()
11 |     ),
12 |     regexp = "path_to_file"
13 |   )
14 | })
15 | 
16 | test_that("Checks conversion works with fst file", {
17 |   path_to_parquet <- tempfile()
18 | 
19 |   expect_no_error(
20 |     fst_to_parquet(
21 |       path_to_file = system.file("extdata","iris.fst",package = "parquetize"),
22 |       path_to_parquet = path_to_parquet
23 |     )
24 |   )
25 |   expect_parquet(
26 |     file.path(path_to_parquet),
27 |     with_lines = 150
28 |   )
29 | 
30 | })
31 | 
32 | test_that("Checks conversion works when adding partition and partitioning arguments", {
33 |   path_to_parquet <- tempfile()
34 | 
35 |   expect_no_error(
36 |     fst_to_parquet(
37 |       path_to_file = system.file("extdata","iris.fst",package = "parquetize"),
38 |       path_to_parquet = path_to_parquet,
39 |       partition = "yes",
40 |       partitioning = c("Species")
41 |     )
42 |   )
43 | 
44 |   expect_parquet(
45 |     file.path(path_to_parquet),
46 |     with_lines = 150
47 |   )
48 |   expect_identical(
49 |     dir(path_to_parquet),
50 |     c('Species=setosa', 'Species=versicolor', 'Species=virginica')
51 |   )
52 | })
--------------------------------------------------------------------------------
/tests/testthat/test-rds_to_parquet.R:
--------------------------------------------------------------------------------
1 | test_that("Checks arguments are correctly filled in", {
2 |   expect_missing_argument(
3 |     rds_to_parquet(
4 |       path_to_file = system.file("extdata","iris.rds",package = "parquetize")
5 |     ),
6 |     regexp = "path_to_parquet"
7 |   )
8 |   expect_missing_argument(
9 |     rds_to_parquet(
10 |       path_to_parquet = tempfile()
11 |     ),
12 |     regexp = "path_to_file"
13 |   )
14 | })
15 | 
16 | test_that("Checks conversion works with rds file", {
17 |   path_to_parquet <- tempfile()
18 | 
19 |   expect_no_error(
20 |     rds_to_parquet(
21 |       path_to_file = system.file("extdata","iris.rds",package = "parquetize"),
22 |       path_to_parquet = path_to_parquet
23 |     )
24 |   )
25 |   expect_parquet(
26 |     file.path(path_to_parquet),
27 |     with_lines = 150
28 |   )
29 | 
30 | })
31 | 
32 | test_that("Checks conversion works when adding partition and partitioning arguments", {
33 |   path_to_parquet <- tempfile()
34 | 
35 |   expect_no_error(
36 |     rds_to_parquet(
37 |       path_to_file = system.file("extdata","iris.rds",package = "parquetize"),
38 |       path_to_parquet = path_to_parquet,
39 |       partition = "yes",
40 |       partitioning = c("Species")
41 |     )
42 |   )
43 | 
44 |   expect_parquet(
45 |     file.path(path_to_parquet),
46 |     with_lines = 150
47 |   )
48 |   expect_identical(
49 |     dir(path_to_parquet),
50 |     c('Species=setosa', 'Species=versicolor', 'Species=virginica')
51 |   )
52 | })
--------------------------------------------------------------------------------
/tests/testthat/test-write_parquet_at_once.R:
--------------------------------------------------------------------------------
1 | test_that("write_parquet_at_once warns if path_to_parquet is a directory for a parquet file", {
2 |   path_to_parquet <- tempfile()
3 |   dir.create(path_to_parquet, showWarnings = FALSE)
4 |   expect_message(
5 |     write_parquet_at_once(mtcars, path_to_parquet = path_to_parquet, partition = "no"),
6 |     regexp = "path_to_parquet should be a file name"
7 |   )
8 | })
9 | 
10 | test_that("write_parquet_at_once fails on missing argument", {
11 |   expect_missing_argument(
12 |     write_parquet_at_once(
13 |       path_to_parquet = path_to_parquet
14 |     ),
15 |     regexp = "data"
16 |   )
17 | 
18 |   expect_missing_argument(
19 |     write_parquet_at_once(
20 |       data = iris
21 |     ),
22 |     regexp = "path_to_parquet"
23 |   )
24 | })
25 | 
26 | test_that("write_parquet_at_once works for simple parquet file", {
27 |   path_to_parquet <- tempfile()
28 |   expect_no_error(
29 |     write_parquet_at_once(iris, path_to_parquet)
30 |   )
31 | 
32 |   expect_parquet(
33 |     path_to_parquet,
34 |     with_lines = 150,
35 |     with_files = 1
36 |   )
37 | })
38 | 
39 | test_that("write_parquet_at_once works for partitioned dataset", {
40 |   path_to_parquet <- tempfile()
41 |   expect_no_error(
42 |     write_parquet_at_once(iris, path_to_parquet, partition = "yes", partitioning = "Species")
43 |   )
44 | 
45 |   expect_parquet(
46 |     path_to_parquet,
47 |     with_lines = 150,
48 |     with_files = 3,
49 |     with_partitions = c("Species=setosa", "Species=versicolor", "Species=virginica")
50 |   )
51 | })
--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 |   push:
5 |     branches: [main, master]
6 |   pull_request:
7 |     branches: [main, master]
8 | 
9 | name: test-coverage
10 | 
11 | jobs:
12 |   test-coverage:
13 |     runs-on: ubuntu-latest
14 |     env:
15 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 | 
17 |     steps:
18 |       - uses: actions/checkout@v3
19 | 
20 |       - uses: r-lib/actions/setup-r@v2
21 |         with:
22 |           use-public-rspm: true
23 | 
24 |       - uses: r-lib/actions/setup-r-dependencies@v2
25 |         with:
26 |           extra-packages: any::covr
27 |           needs: coverage
28 | 
29 |       - name: Test coverage
30 |         run: |
31 |           covr::codecov(
32 |             quiet = FALSE,
33 |             clean = FALSE,
34 |             install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package")
35 |           )
36 |         shell: Rscript {0}
37 | 
38 |       - name: Show testthat output
39 |         if: always()
40 |         run: |
41 |           ## --------------------------------------------------------------------
42 |           find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
43 |         shell: bash
44 | 
45 |       - name: Upload test results
46 |         if: failure()
47 |         uses: actions/upload-artifact@v4
48 |         with:
49 |           name: coverage-test-failures
50 |           path: ${{ runner.temp }}/package
--------------------------------------------------------------------------------
/tests/testthat/test-download_extract.R:
--------------------------------------------------------------------------------
1 | test_that("Checks download_extract returns local file if not a zip", {
2 |   expect_equal(
3 |     download_extract("/my/local/file.truc"),
4 |     "/my/local/file.truc"
5 |   )
6 | })
7 | 
8 | test_that("Checks download_extract returns the csv file of local zip", {
9 |   expect_match(
10 |     download_extract(system.file("extdata","mtcars.csv.zip", package = "readr")),
11 |     ".*/mtcars.csv"
12 |   )
13 | })
14 | 
15 | test_that("Checks download_extract fails with error if zip has more than one file and no filename_in_zip", {
16 |   skip_if_offline()
17 | 
18 |   expect_missing_argument(
19 |     download_extract(
20 |       system.file("extdata","multifile.zip",package = "parquetize")
21 |     ),
22 |     regexp = "filename_in_zip"
23 |   )
24 | })
25 | 
26 | test_that("Checks download_extract works with multi files zip", {
27 |   file <- download_extract(
28 |     system.file("extdata","multifile.zip",package = "parquetize"),
29 |     filename_in_zip = "region_2022.csv"
30 |   )
31 | 
32 |   expect_match(
33 |     file,
34 |     ".*/region_2022.csv"
35 |   )
36 | 
37 |   expect_true(
38 |     file.exists(file)
39 |   )
40 | })
41 | 
42 | test_that("Checks download_extract returns the csv file of remote zip", {
43 |   skip_if_offline()
44 | 
45 |   file <- download_extract(
46 |     "https://www.stats.govt.nz/assets/Uploads/Business-employment-data/Business-employment-data-June-2022-quarter/Download-data/business-employment-data-june-2022-quarter-csv.zip"
47 |   )
48 | 
49 |   expect_match(
50 |     file,
51 |     ".*/machine-readable-business-employment-data-june-2022-quarter.csv"
52 |   )
53 | 
54 |   expect_true(
55 |     file.exists(file)
56 |   )
57 | })
--------------------------------------------------------------------------------
/R/check_parquet.R:
--------------------------------------------------------------------------------
1 | #' @name check_parquet
2 | #'
3 | #' @title Check if parquet file or dataset is readable and return basic information
4 | #'
5 | #' @description This function checks if a file/dataset is a valid parquet format.
6 | #' It will print the number of lines/columns and return a tibble with column
7 | #' information.
8 | #'
9 | #' @details This function will:
10 | #'
11 | #' * open the parquet dataset/file to check if it's valid
12 | #' * print the number of lines
13 | #' * print the number of columns
14 | #' * return a tibble with one row per column, giving:
15 | #'
16 | #'   * the column name (string)
17 | #'   * the arrow type (string)
18 | #'
19 | #' You can find a list of arrow types in the documentation
20 | #' \href{https://arrow.apache.org/docs/r/articles/data_types.html}{on this page}.
21 | #'
22 | #' @param path path to the file or dataset
23 | #'
24 | #' @return a tibble with information on the parquet dataset/file's columns, with
25 | #' three columns: field name, arrow type and nullable
26 | #'
27 | #' @export
28 | #'
29 | #' @examples
30 | #'
31 | #' # check a parquet file
32 | #' check_parquet(parquetize_example("iris.parquet"))
33 | #'
34 | #' # check a parquet dataset
35 | #' check_parquet(parquetize_example("iris_dataset"))
36 | check_parquet <- function(path) {
37 | 
38 |   if (isFALSE(file.exists(path))) {
39 |     cli_abort("Be careful, {path} doesn't exist", class = "no_such_file")
40 |   }
41 | 
42 |   cli_alert_info("checking: {path}")
43 | 
44 |   ds <- arrow::open_dataset(path, unify_schemas = TRUE)
45 |   cli_alert_success("loading dataset: ok")
46 | 
47 |   cli_alert_success("number of lines: {nrow(ds)}")
48 |   cli_alert_success("number of columns: {length(names(ds))}")
49 | 
50 |   get_col_types(ds)
51 | }
--------------------------------------------------------------------------------
/tests/testthat/test-utilities.R:
--------------------------------------------------------------------------------
1 | test_that("test get_haven_read_function_for_file returns the right method", {
2 |   file <- system.file("examples","iris.dta", package = "haven")
3 |   fun <- get_haven_read_function_for_file(file)
4 |   expect_s3_class(fun(file), "tbl")
5 | 
6 |   file <- system.file("examples","iris.sas7bdat", package = "haven")
7 |   fun <- get_haven_read_function_for_file(file)
8 |   expect_s3_class(fun(file), "tbl")
9 | 
10 |   file <- system.file("examples","iris.sav", package = "haven")
11 |   fun <- get_haven_read_function_for_file(file)
12 |   expect_s3_class(fun(file), "tbl")
13 | })
14 | 
15 | 
16 | test_that("tests get_haven_read_function_for_file fails when needed", {
17 |   expect_error(
18 |     get_haven_read_function_for_file("/some/bad/file/without_extension"),
19 |     class = "parquetize_bad_argument"
20 |   )
21 | 
22 |   expect_error(
23 |     get_haven_read_function_for_file("/some/bad/file/with_bad_extension.xlsx"),
24 |     class = "parquetize_bad_argument"
25 |   )
26 | })
27 | 
28 | test_that("test get_lines_for_memory returns the right number of lines", {
29 |   file <- system.file("examples","iris.dta", package = "haven")
30 |   read_method <- get_haven_read_function_for_file(file)
31 |   data <- read_method(file, n_max = Inf)
32 | 
33 |   expect_equal(
34 |     get_lines_for_memory(data, max_memory = 1 / 1024),
35 |     16
36 |   )
37 | })
38 | 
39 | test_that("test is_remote works", {
40 |   expect_true(is_remote("https://my_url/"))
41 |   expect_true(is_remote("http://my_url/"))
42 |   expect_true(is_remote("ftp://my_url/"))
43 |   expect_true(is_remote("ftps://my_url/"))
44 | 
45 |   expect_false(is_remote("c://my_url/"))
46 |   expect_false(is_remote("/my_url/"))
47 | })
--------------------------------------------------------------------------------
/man/write_parquet_at_once.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/write_parquet_at_once.R
3 | \name{write_parquet_at_once}
4 | \alias{write_parquet_at_once}
5 | \title{Write parquet file or dataset based on partition argument}
6 | \usage{
7 | write_parquet_at_once(
8 |   data,
9 |   path_to_parquet,
10 |   partition = "no",
11 |   compression = "snappy",
12 |   compression_level = NULL,
13 |   ...
14 | )
15 | }
16 | \arguments{
17 | \item{data}{the data.frame/tibble to write}
18 | 
19 | \item{path_to_parquet}{String that indicates the path to the directory where
20 | the output parquet file or dataset will be stored.}
21 | 
22 | \item{partition}{string ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file.
23 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}.}
24 | 
25 | \item{compression}{compression algorithm. Default "snappy".}
26 | 
27 | \item{compression_level}{compression level. Meaning depends on compression algorithm.}
28 | 
29 | \item{...}{Additional format-specific arguments, see
30 | \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}}
31 | }
32 | \value{
33 | a dataset as returned by arrow::open_dataset
34 | }
35 | \description{
36 | Low level function that implements the logic to write a parquet file or a dataset from data
37 | }
38 | \examples{
39 | 
40 | write_parquet_at_once(iris, tempfile())
41 | 
42 | write_parquet_at_once(iris, tempfile(), partition = "yes", partitioning = c("Species"))
43 | 
44 | \dontrun{
45 | write_parquet_at_once(iris, tempfile(), compression="gzip", compression_level = 5)
46 | }
47 | }
--------------------------------------------------------------------------------
/man/download_extract.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/download_extract.R
3 | \name{download_extract}
4 | \alias{download_extract}
5 | \title{download and uncompress file if needed}
6 | \usage{
7 | download_extract(path, filename_in_zip)
8 | }
9 | \arguments{
10 | \item{path}{the input file's path or URL.}
11 | 
12 | \item{filename_in_zip}{name of the csv file in the zip. Required if
13 | several csv are included in the zip.}
14 | }
15 | \value{
16 | the path to the usable (uncompressed) file, invisibly.
17 | }
18 | \description{
19 | This function will download the file if the file is remote and
20 | unzip it if it is zipped. It will just return the input path argument if
21 | it's neither. \cr
22 | 
23 | If the zip contains multiple files, you can use \code{filename_in_zip} to set the file you want to unzip and use.
24 | 
25 | You can pipe the output into all \verb{*_to_parquet} functions.
26 | }
27 | \examples{
28 | 
29 | # 1. unzip a local zip file
30 | # 2. parquetize it
31 | 
32 | file_path <- download_extract(system.file("extdata","mtcars.csv.zip", package = "readr"))
33 | csv_to_parquet(
34 |   file_path,
35 |   path_to_parquet = tempfile(fileext = ".parquet")
36 | )
37 | 
38 | # 1. download a remote file
39 | # 2. extract the file census2021-ts007-ctry.csv
40 | # 3. parquetize it
41 | 
42 | file_path <- download_extract(
43 |   "https://www.nomisweb.co.uk/output/census/2021/census2021-ts007.zip",
44 |   filename_in_zip = "census2021-ts007-ctry.csv"
45 | )
46 | csv_to_parquet(
47 |   file_path,
48 |   path_to_parquet = tempfile(fileext = ".parquet")
49 | )
50 | 
51 | # the file is local and not zipped so:
52 | # 1. parquetize it
53 | 
54 | file_path <- download_extract(parquetize_example("region_2022.csv"))
55 | csv_to_parquet(
56 |   file_path,
57 |   path_to_parquet = tempfile(fileext = ".parquet")
58 | )
59 | 
60 | }
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | export(check_parquet)
4 | export(csv_to_parquet)
5 | export(dbi_to_parquet)
6 | export(download_extract)
7 | export(expect_missing_argument)
8 | export(expect_parquet)
9 | export(fst_to_parquet)
10 | export(get_parquet_info)
11 | export(get_partitions)
12 | export(json_to_parquet)
13 | export(parquetize_example)
14 | export(rbind_parquet)
15 | export(rds_to_parquet)
16 | export(sqlite_to_parquet)
17 | export(table_to_parquet)
18 | export(write_parquet_at_once)
19 | export(write_parquet_by_chunk)
20 | import(dplyr)
21 | importFrom(DBI,dbClearResult)
22 | importFrom(DBI,dbConnect)
23 | importFrom(DBI,dbDisconnect)
24 | importFrom(DBI,dbFetch)
25 | importFrom(DBI,dbHasCompleted)
26 | importFrom(DBI,dbListTables)
27 | importFrom(DBI,dbReadTable)
28 | importFrom(DBI,dbSendQuery)
29 | importFrom(RSQLite,SQLite)
30 | importFrom(arrow,open_dataset)
31 | importFrom(arrow,read_json_arrow)
32 | importFrom(arrow,read_parquet)
33 | importFrom(arrow,write_dataset)
34 | importFrom(arrow,write_parquet)
35 | importFrom(cli,cli_abort)
36 | importFrom(cli,cli_alert_danger)
37 | importFrom(cli,cli_alert_info)
38 | importFrom(cli,cli_alert_success)
39 | importFrom(cli,cli_alert_warning)
40 | importFrom(cli,cli_progress_bar)
41 | importFrom(cli,cli_progress_message)
42 | importFrom(curl,curl_download)
43 | importFrom(fst,read.fst)
44 | importFrom(glue,glue)
45 | importFrom(glue,glue_sql)
46 | importFrom(haven,read_dta)
47 | importFrom(haven,read_sas)
48 | importFrom(haven,read_sav)
49 | importFrom(jsonlite,read_json)
50 | importFrom(lifecycle,deprecate_warn)
51 | importFrom(lifecycle,deprecated)
52 | importFrom(readr,locale)
53 | importFrom(readr,read_delim)
54 | importFrom(rlang,inject)
55 | importFrom(tibble,as_tibble)
56 | importFrom(tidyselect,all_of)
57 | importFrom(tidyselect,everything)
58 | importFrom(tools,file_ext)
59 | importFrom(tools,file_path_sans_ext)
60 | importFrom(utils,object.size)
61 | importFrom(utils,unzip)
--------------------------------------------------------------------------------
/man/rbind_parquet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/rbind_parquet.R
3 | \name{rbind_parquet}
4 | \alias{rbind_parquet}
5 | \title{Function to bind multiple parquet files by row}
6 | \usage{
7 | rbind_parquet(
8 |   folder,
9 |   output_name,
10 |   delete_initial_files = TRUE,
11 |   compression = "snappy",
12 |   compression_level = NULL
13 | )
14 | }
15 | \arguments{
16 | \item{folder}{the folder where the initial files are stored}
17 | 
18 | \item{output_name}{name of the output parquet file}
19 | 
20 | \item{delete_initial_files}{Boolean. Should the function delete the initial files? By default TRUE.}
21 | 
22 | \item{compression}{compression algorithm. Default "snappy".}
23 | 
24 | \item{compression_level}{compression level. Meaning depends on compression algorithm.}
25 | }
26 | \value{
27 | Parquet files, invisibly
28 | }
29 | \description{
30 | This function reads all parquet files in the \code{folder} argument that start with \code{output_name},
31 | combines them using rbind and writes the result to a new parquet file. \cr
32 | 
33 | It can also delete the initial files if the \code{delete_initial_files} argument is TRUE. \cr
34 | 
35 | Be careful, this function will not work if files with different structures
36 | are present in the folder given with the argument \code{folder}.
37 | }
38 | \examples{
39 | \dontrun{
40 | library(arrow)
41 | if (!file.exists('output')) {
42 |   dir.create("output")
43 | }
44 | 
45 | file.create("output/test_data1-4.parquet")
46 | write_parquet(data.frame(
47 |   x = c("a","b","c"),
48 |   y = c(1L,2L,3L)
49 | ),
50 | "output/test_data1-4.parquet")
51 | 
52 | file.create("output/test_data4-6.parquet")
53 | write_parquet(data.frame(
54 |   x = c("d","e","f"),
55 |   y = c(4L,5L,6L)
56 | ), "output/test_data4-6.parquet")
57 | 
58 | test_data <- rbind_parquet(folder = "output",
59 |                            output_name = "test_data",
60 |                            delete_initial_files = FALSE)
61 | }
62 | }
--------------------------------------------------------------------------------
/tests/testthat/test-json_to_parquet.R:
--------------------------------------------------------------------------------
1 | test_that("Checks arguments are correctly filled in", {
2 |   testthat::local_edition(3)
3 | 
4 |   expect_missing_argument(
5 |     json_to_parquet(
6 |       path_to_file = system.file("extdata","iris.ndjson",package = "parquetize")
7 |     ),
8 |     regexp = "path_to_parquet"
9 |   )
10 | 
11 |   expect_missing_argument(
12 |     json_to_parquet(
13 |       path_to_parquet = tempfile()
14 |     ),
15 |     regexp = "path_to_file"
16 |   )
17 | 
18 |   expect_error(
19 |     json_to_parquet(
20 |       path_to_file = system.file("extdata","iris.json",package = "parquetize"),
21 |       path_to_parquet = tempfile(),
22 |       format = "xjson"
23 |     ),
24 |     class = "parquetize_bad_format"
25 |   )
26 | })
27 | 
28 | test_that("Checks converting json file works", {
29 |   path_to_parquet <- tempfile()
30 | 
31 |   json_to_parquet(
32 |     path_to_file = system.file("extdata","iris.json",package = "parquetize"),
33 |     path_to_parquet = path_to_parquet
34 |   )
35 | 
36 |   expect_parquet(
37 |     path_to_parquet,
38 |     with_lines = 150
39 |   )
40 | })
41 | 
42 | test_that("Checks converting ndjson file works", {
43 |   path_to_parquet <- tempfile()
44 | 
45 |   json_to_parquet(
46 |     path_to_file = system.file("extdata","iris.ndjson",package = "parquetize"),
47 |     path_to_parquet = path_to_parquet,
48 |     format = "ndjson"
49 |   )
50 |   expect_parquet(
51 |     path_to_parquet,
52 |     with_lines = 150
53 |   )
54 | 
55 | })
56 | 
57 | test_that("Checks adding partition and partitioning argument works", {
58 |   path_to_parquet <- tempfile()
59 | 
60 |   json_to_parquet(
61 |     path_to_file = system.file("extdata","iris.json",package = "parquetize"),
62 |     path_to_parquet = path_to_parquet,
63 |     partition = "yes",
64 |     partitioning = c("Species")
65 |   )
66 |   expect_parquet(
67 |     path_to_parquet,
68 |     with_lines = 150,
69 |     with_partitions = c('Species=setosa', 'Species=versicolor', 'Species=virginica')
70 |   )
71 | })
--------------------------------------------------------------------------------
/R/get_parquet_info.R:
-------------------------------------------------------------------------------- 1 | #' @name get_parquet_info 2 | #' 3 | #' @title Get various info on parquet files 4 | #' 5 | #' @description One very important piece of parquet metadata is the row group size.\cr 6 | #' 7 | #' If its value is low (below 10 000), you should rebuild your parquet files.\cr 8 | #' 9 | #' A typical value is between 30 000 and 1 000 000. 10 | #' 11 | #' @param path parquet file path or directory. If a directory is given, 12 | #' `get_parquet_info` will be applied to all parquet files found in 13 | #' subdirectories 14 | #' 15 | #' @return a tibble with 5 columns: 16 | #' * path, file path 17 | #' * num_rows, number of rows 18 | #' * num_row_groups, number of row groups 19 | #' * num_columns, number of columns 20 | #' * mean_row_group_size, mean row group size 21 | #' 22 | #' If one column contains `NA`, the parquet file may be malformed. 23 | #' 24 | #' @export 25 | #' 26 | #' @examples 27 | #' get_parquet_info(system.file("extdata", "iris.parquet", package = "parquetize")) 28 | #' 29 | #' get_parquet_info(system.file("extdata", "iris_dataset", package = "parquetize")) 30 | get_parquet_info <- function(path) { 31 | if (dir.exists(path)) { 32 | files <- list.files(path, recursive = TRUE, pattern = "*.parquet$", full.names = TRUE) 33 | } else if (file.exists(path)) { 34 | files <- path 35 | } else { 36 | stop("path must be a file or a directory") 37 | } 38 | 39 | tibble::tibble( 40 | path = files, 41 | num_rows = sapply(files, get_parquet_attribute, attribute = "num_rows"), 42 | num_row_groups = sapply(files, get_parquet_attribute, attribute = "num_row_groups"), 43 | num_columns = sapply(files, get_parquet_attribute, attribute = "num_columns") 44 | ) %>% 45 | dplyr::mutate( 46 | mean_row_group_size = .data$num_rows / .data$num_row_groups 47 | ) 48 | } 49 | 50 | #' @name get_parquet_attribute 51 | #' 52 | #' @title Utility to get attributes from a parquet file 53 | #' 54 | #' @param path parquet file path or directory.
55 | #' @param attribute name of searched attribute 56 | #' 57 | #' @noRd 58 | get_parquet_attribute <- function(path, attribute) { 59 | tryCatch({ 60 | reader <- arrow::ParquetFileReader$create(path) 61 | reader[[attribute]] 62 | }, 63 | error = function(e) { return(NA_real_) } 64 | ) 65 | } 66 | -------------------------------------------------------------------------------- /R/testthat-helpers.R: -------------------------------------------------------------------------------- 1 | #' Check if parquet dataset/file is readable and has the good number of rows 2 | #' 3 | #' @param path to the parquet file or dataset 4 | #' @param with_lines number of lines the file/dataset should have 5 | #' @param with_partitions NULL or a vector with the partition names the dataset should have 6 | #' @param with_columns NULL or a column's name vector the dataset/file should have 7 | #' @param with_files NULL or number of files a dataset should have 8 | #' 9 | #' @return the dataset handle 10 | #' @export 11 | #' 12 | #' @keywords internal 13 | expect_parquet <- function( 14 | path, 15 | with_lines, 16 | with_partitions = NULL, 17 | with_columns = NULL, 18 | with_files = NULL) { 19 | dataset <- testthat::expect_no_error(arrow::open_dataset(path)) 20 | testthat::expect_equal(nrow(dataset), with_lines) 21 | 22 | if (!is.null(with_partitions)) { 23 | tryCatch( 24 | testthat::expect_setequal(dir(path), with_partitions), 25 | error = function(cond) { cli::cli_abort("{with_partitions} different from {dir(path)}", class = "partquetize_test_with_partitions")} 26 | ) 27 | } 28 | 29 | if (!is.null(with_columns)) { 30 | tryCatch( 31 | testthat::expect_setequal(names(dataset), with_columns), 32 | error = function(cond) { cli::cli_abort("{with_columns} different from {names(dataset)}", class = "partquetize_test_with_columns") } 33 | ) 34 | } 35 | 36 | if (!is.null(with_files)) { 37 | files_number <- length(dataset$files) 38 | 39 | tryCatch( 40 | testthat::expect_equal(files_number, with_files), 41 | error = function(cond) { cli::cli_abort("we should have {with_files} files. 
We have {files_number}", class = "partquetize_test_with_files") } 42 | ) 43 | } 44 | return(dataset) 45 | } 46 | 47 | #' Check if missing argument error is raised 48 | #' 49 | #' @param object the object to check 50 | #' @param regexp a regexp with the message we must find 51 | #' 52 | #' @return same as expect_error 53 | #' @export 54 | #' 55 | #' @keywords internal 56 | expect_missing_argument <- function(object, regexp) { 57 | testthat::expect_error( 58 | object, 59 | class = "parquetize_missing_argument", 60 | regexp = regexp 61 | ) 62 | } 63 | -------------------------------------------------------------------------------- /tests/testthat/test-sqlite_to_parquet.R: -------------------------------------------------------------------------------- 1 | test_that("Checks arguments are correctly filled in", { 2 | expect_missing_argument( 3 | sqlite_to_parquet( 4 | path_to_file = system.file("extdata","iris.sqlite",package = "parquetize") 5 | ), 6 | regexp = "path_to_parquet" 7 | ) 8 | expect_missing_argument( 9 | sqlite_to_parquet( 10 | path_to_parquet = tempfile() 11 | ), 12 | regexp = "path_to_file" 13 | ) 14 | }) 15 | 16 | test_that("Check if extension used in path_to_file is correct", { 17 | expect_error( 18 | sqlite_to_parquet( 19 | path_to_file = system.file("extdata","iris.sqliteee",package = "parquetize") 20 | ), 21 | class = "parquetize_bad_format" 22 | ) 23 | }) 24 | 25 | test_that("Check if parquetize fails when table does not exist", { 26 | expect_error( 27 | sqlite_to_parquet( 28 | path_to_file = system.file("extdata","iris.sqlite",package = "parquetize"), 29 | path_to_parquet = tempfile(), 30 | table = "nosuchtable" 31 | ), 32 | class = "parquetize_missing_table", 33 | regexp = "nosuchtable" 34 | ) 35 | }) 36 | 37 | test_that("Checks message is displayed with sqlite file", { 38 | path_to_parquet <- tempfile() 39 | 40 | expect_no_error( 41 | sqlite_to_parquet( 42 | path_to_file = system.file("extdata","iris.sqlite",package = "parquetize"), 43 | table_in_sqlite = "iris", 44 | path_to_parquet = path_to_parquet 45 | ) 46 | ) 47 | 48 | expect_parquet( 49 | file.path(path_to_parquet), 50 | with_lines = 150 51 | ) 52 | 53 | }) 54 | 55 | test_that("Checks message is displayed with by adding partition and partitioning argument", { 56 | path_to_parquet <- tempfile() 57 | 58 | expect_no_error( 59 | sqlite_to_parquet( 60 | path_to_file = system.file("extdata","iris.sqlite",package = "parquetize"), 61 | table_in_sqlite = "iris", 62 | path_to_parquet = path_to_parquet, 63 | partition = "yes", 64 | partitioning = c("Species") 65 | ) 66 | ) 67 | 68 | expect_parquet( 69 | file.path(path_to_parquet), 70 | with_lines = 150 71 | ) 72 | 73 | expect_identical( 74 | dir(path_to_parquet), 75 | c('Species=setosa', 'Species=versicolor', 'Species=virginica') 76 | ) 77 | 78 | }) 79 | -------------------------------------------------------------------------------- /dev/dev_history.R: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------# 2 | # Exemple sous https://linogaliana.gitlab.io/collaboratif/package.html # 3 | 4 | #################### AU QUOTIDIEN ############################### 5 | # 3.a. 
Inclure du code, le documenter et le tester 6 | # Pour chaque fonction du package : 7 | usethis::use_r("csv_to_parquet") 8 | usethis::use_test("csv_to_parquet") 9 | # écrire le code de la fonction 10 | # documenter la fonction 11 | # # Pour mettre à jour la documentation et le NAMESPACE 12 | # devtools::document() 13 | roxygen2::roxygenise() 14 | # écrire les tests 15 | # exécuter les tests 16 | devtools::test() 17 | 18 | # 3.b. Si besoin, déclarer une dépendance dans DESCRIPTION 19 | usethis::use_package("readr") 20 | # pour utiliser %>% dans un package 21 | # usethis::use_pipe() 22 | 23 | # Pour réaliser le contrôle de conformité du package 24 | devtools::check() 25 | 26 | # 3.c. Astuce qui peut aider durant le développement 27 | # Charger l'ensemble des fonctions de son package 28 | devtools::load_all() 29 | 30 | # Pour le code coverage 31 | covr::package_coverage() 32 | covr::report() 33 | #------------------------------------------------# 34 | 35 | # Ajout de `dev/dev_history.R` au .Rbuildignore 36 | usethis::use_build_ignore("dev/dev_history.R") 37 | 38 | # Ajout d'un fichier NEWS 39 | usethis::use_news_md() 40 | 41 | # Creation du squelette du pkgdown 42 | usethis::use_pkgdown() 43 | 44 | # Configuration des GHA 45 | usethis::use_github_action(name = "check-release") 46 | 47 | # Ajout des fichiers dans `data-raw` 48 | usethis::use_data_raw("region-2022") 49 | 50 | # Creation des vignettes 51 | usethis::use_vignette("aa-conversions") 52 | 53 | # Creation du repertoire testthat 54 | usethis::use_testthat() 55 | 56 | # Pour avoir le détail du code coverage par fonction 57 | covr::report() 58 | 59 | ################ En fin de developpement ########## 60 | 61 | # Construction du site (uniquement sur SSP Cloud) 62 | pkgdown::build_site(override = list(destination = "../website")) 63 | 64 | # Construction du fichier .tar.gz 65 | devtools::build() 66 | 67 | # Construction du fichier .zip (format binaire) 68 | devtools::build(binary=TRUE) 69 | 70 | # Construction du manuel au format pdf 71 | devtools::build_manual(path = "manuel") 72 | -------------------------------------------------------------------------------- /R/rds_to_parquet.R: -------------------------------------------------------------------------------- 1 | #' @name rds_to_parquet 2 | #' 3 | #' @title Convert a rds file to parquet format 4 | #' 5 | #' @description This function allows to convert a rds file to parquet format. \cr 6 | #' 7 | #' Two conversions possibilities are offered : 8 | #' 9 | #'\itemize{ 10 | #' 11 | #' \item{Convert to a single parquet file. Argument `path_to_parquet` must then be used;} 12 | #' \item{Convert to a partitioned parquet file. Additionnal arguments `partition` and `partitioning` must then be used;} 13 | #' 14 | #' } 15 | #' 16 | #' @inheritParams table_to_parquet 17 | #' @param ... additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 18 | #' and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more informations. 
19 | #' @return A parquet file, invisibly 20 | #' 21 | #' @export 22 | #' 23 | #' @examples 24 | #' 25 | #' # Conversion from a local rds file to a single parquet file :: 26 | #' 27 | #' rds_to_parquet( 28 | #' path_to_file = system.file("extdata","iris.rds",package = "parquetize"), 29 | #' path_to_parquet = tempfile(fileext = ".parquet") 30 | #' ) 31 | #' 32 | #' # Conversion from a local rds file to a partitioned parquet file :: 33 | #' 34 | #' rds_to_parquet( 35 | #' path_to_file = system.file("extdata","iris.rds",package = "parquetize"), 36 | #' path_to_parquet = tempfile(fileext = ".parquet"), 37 | #' partition = "yes", 38 | #' partitioning = c("Species") 39 | #' ) 40 | 41 | rds_to_parquet <- function( 42 | path_to_file, 43 | path_to_parquet, 44 | partition = "no", 45 | compression = "snappy", 46 | compression_level = NULL, 47 | ... 48 | ) { 49 | 50 | # Check if path_to_file is missing 51 | if (missing(path_to_file)) { 52 | cli_abort("Be careful, the argument path_to_file must be filled in", class = "parquetize_missing_argument") 53 | } 54 | 55 | # Check if path_to_parquet is missing 56 | if (missing(path_to_parquet)) { 57 | cli_abort("Be careful, the argument path_to_parquet must be filled in", class = "parquetize_missing_argument") 58 | } 59 | 60 | Sys.sleep(0.01) 61 | cli_progress_message("Reading data...") 62 | 63 | rds_output <- readRDS(file = path_to_file) 64 | 65 | dataset <- write_parquet_at_once( 66 | rds_output, 67 | path_to_parquet, 68 | partition, 69 | compression, 70 | compression_level, 71 | ...) 72 | 73 | return(invisible(dataset)) 74 | 75 | } 76 | -------------------------------------------------------------------------------- /R/fst_to_parquet.R: -------------------------------------------------------------------------------- 1 | #' @name fst_to_parquet 2 | #' 3 | #' @title Convert a fst file to parquet format 4 | #' 5 | #' @description This function allows to convert a fst file to parquet format. \cr 6 | #' 7 | #' Two conversions possibilities are offered : 8 | #' 9 | #'\itemize{ 10 | #' 11 | #' \item{Convert to a single parquet file. Argument `path_to_parquet` must then be used;} 12 | #' \item{Convert to a partitioned parquet file. Additionnal arguments `partition` and `partitioning` must then be used;} 13 | #' 14 | #' } 15 | #' 16 | #' @inheritParams table_to_parquet 17 | #' @param ... additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 18 | #' and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more informations. 19 | #' @return A parquet file, invisibly 20 | #' 21 | #' @export 22 | #' 23 | #' @examples 24 | #' 25 | #' # Conversion from a local fst file to a single parquet file :: 26 | #' 27 | #' fst_to_parquet( 28 | #' path_to_file = system.file("extdata","iris.fst",package = "parquetize"), 29 | #' path_to_parquet = tempfile(fileext = ".parquet") 30 | #' ) 31 | #' 32 | #' # Conversion from a local fst file to a partitioned parquet file :: 33 | #' 34 | #' fst_to_parquet( 35 | #' path_to_file = system.file("extdata","iris.fst",package = "parquetize"), 36 | #' path_to_parquet = tempfile(fileext = ".parquet"), 37 | #' partition = "yes", 38 | #' partitioning = c("Species") 39 | #' ) 40 | 41 | fst_to_parquet <- function( 42 | path_to_file, 43 | path_to_parquet, 44 | partition = "no", 45 | compression = "snappy", 46 | compression_level = NULL, 47 | ... 
48 | ) { 49 | 50 | # Check if path_to_file is missing 51 | if (missing(path_to_file)) { 52 | cli_abort("Be careful, the argument path_to_file must be filled in", class = "parquetize_missing_argument") 53 | } 54 | 55 | # Check if path_to_parquet is missing 56 | if (missing(path_to_parquet)) { 57 | cli_abort("Be careful, the argument path_to_parquet must be filled in", class = "parquetize_missing_argument") 58 | } 59 | 60 | Sys.sleep(0.01) 61 | cli_progress_message("Reading data...") 62 | 63 | fst_output <- fst::read.fst(path = path_to_file) 64 | 65 | dataset <- write_parquet_at_once( 66 | fst_output, 67 | path_to_parquet, 68 | partition, 69 | compression, 70 | compression_level, 71 | ...) 72 | 73 | return(invisible(dataset)) 74 | 75 | } 76 | -------------------------------------------------------------------------------- /man/fst_to_parquet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fst_to_parquet.R 3 | \name{fst_to_parquet} 4 | \alias{fst_to_parquet} 5 | \title{Convert a fst file to parquet format} 6 | \usage{ 7 | fst_to_parquet( 8 | path_to_file, 9 | path_to_parquet, 10 | partition = "no", 11 | compression = "snappy", 12 | compression_level = NULL, 13 | ... 14 | ) 15 | } 16 | \arguments{ 17 | \item{path_to_file}{String that indicates the path to the input file (don't forget the extension).} 18 | 19 | \item{path_to_parquet}{String that indicates the path to the directory where the parquet files will be stored.} 20 | 21 | \item{partition}{String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file. 22 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}. 23 | Be careful, this argument can not be "yes" if \code{max_memory} or \code{max_rows} argument are not NULL.} 24 | 25 | \item{compression}{compression algorithm. Default "snappy".} 26 | 27 | \item{compression_level}{compression level. Meaning depends on compression algorithm.} 28 | 29 | \item{...}{additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 30 | and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more informations.} 31 | } 32 | \value{ 33 | A parquet file, invisibly 34 | } 35 | \description{ 36 | This function allows to convert a fst file to parquet format. \cr 37 | 38 | Two conversions possibilities are offered : 39 | 40 | \itemize{ 41 | 42 | \item{Convert to a single parquet file. Argument \code{path_to_parquet} must then be used;} 43 | \item{Convert to a partitioned parquet file. 
Additionnal arguments \code{partition} and \code{partitioning} must then be used;} 44 | 45 | } 46 | } 47 | \examples{ 48 | 49 | # Conversion from a local fst file to a single parquet file :: 50 | 51 | fst_to_parquet( 52 | path_to_file = system.file("extdata","iris.fst",package = "parquetize"), 53 | path_to_parquet = tempfile(fileext = ".parquet") 54 | ) 55 | 56 | # Conversion from a local fst file to a partitioned parquet file :: 57 | 58 | fst_to_parquet( 59 | path_to_file = system.file("extdata","iris.fst",package = "parquetize"), 60 | path_to_parquet = tempfile(fileext = ".parquet"), 61 | partition = "yes", 62 | partitioning = c("Species") 63 | ) 64 | } 65 | -------------------------------------------------------------------------------- /man/rds_to_parquet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rds_to_parquet.R 3 | \name{rds_to_parquet} 4 | \alias{rds_to_parquet} 5 | \title{Convert a rds file to parquet format} 6 | \usage{ 7 | rds_to_parquet( 8 | path_to_file, 9 | path_to_parquet, 10 | partition = "no", 11 | compression = "snappy", 12 | compression_level = NULL, 13 | ... 14 | ) 15 | } 16 | \arguments{ 17 | \item{path_to_file}{String that indicates the path to the input file (don't forget the extension).} 18 | 19 | \item{path_to_parquet}{String that indicates the path to the directory where the parquet files will be stored.} 20 | 21 | \item{partition}{String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file. 22 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}. 23 | Be careful, this argument can not be "yes" if \code{max_memory} or \code{max_rows} argument are not NULL.} 24 | 25 | \item{compression}{compression algorithm. Default "snappy".} 26 | 27 | \item{compression_level}{compression level. Meaning depends on compression algorithm.} 28 | 29 | \item{...}{additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 30 | and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more informations.} 31 | } 32 | \value{ 33 | A parquet file, invisibly 34 | } 35 | \description{ 36 | This function allows to convert a rds file to parquet format. \cr 37 | 38 | Two conversions possibilities are offered : 39 | 40 | \itemize{ 41 | 42 | \item{Convert to a single parquet file. Argument \code{path_to_parquet} must then be used;} 43 | \item{Convert to a partitioned parquet file. 
Additionnal arguments \code{partition} and \code{partitioning} must then be used;} 44 | 45 | } 46 | } 47 | \examples{ 48 | 49 | # Conversion from a local rds file to a single parquet file :: 50 | 51 | rds_to_parquet( 52 | path_to_file = system.file("extdata","iris.rds",package = "parquetize"), 53 | path_to_parquet = tempfile(fileext = ".parquet") 54 | ) 55 | 56 | # Conversion from a local rds file to a partitioned parquet file :: 57 | 58 | rds_to_parquet( 59 | path_to_file = system.file("extdata","iris.rds",package = "parquetize"), 60 | path_to_parquet = tempfile(fileext = ".parquet"), 61 | partition = "yes", 62 | partitioning = c("Species") 63 | ) 64 | } 65 | -------------------------------------------------------------------------------- /R/download_extract.R: -------------------------------------------------------------------------------- 1 | #' @name download_extract 2 | #' 3 | #' @title download and uncompress file if needed 4 | #' 5 | #' @description This function will download the file if the file is remote and 6 | #' unzip it if it is zipped. It will just return the input path argument if 7 | #' it's neither. \cr 8 | #' 9 | #' If the zip contains multiple files, you can use `filename_in_zip` to set the file you want to unzip and use. 10 | #' 11 | #' You can pipe output on all `*_to_parquet` functions. 12 | #' 13 | #' 14 | #' @param path the input file's path or url. 15 | #' @param filename_in_zip name of the csv file in the zip. Required if 16 | #' several csv are included in the zip. 17 | #' 18 | #' @return the path to the usable (uncompressed) file, invisibly. 19 | #' 20 | #' @export 21 | #' 22 | #' @examples 23 | #' 24 | #' # 1. unzip a local zip file 25 | #' # 2. parquetize it 26 | #' 27 | #' file_path <- download_extract(system.file("extdata","mtcars.csv.zip", package = "readr")) 28 | #' csv_to_parquet( 29 | #' file_path, 30 | #' path_to_parquet = tempfile(fileext = ".parquet") 31 | #' ) 32 | #' 33 | #' # 1. download a remote file 34 | #' # 2. extract the file census2021-ts007-ctry.csv 35 | #' # 3. parquetize it 36 | #' 37 | #' file_path <- download_extract( 38 | #' "https://www.nomisweb.co.uk/output/census/2021/census2021-ts007.zip", 39 | #' filename_in_zip = "census2021-ts007-ctry.csv" 40 | #' ) 41 | #' csv_to_parquet( 42 | #' file_path, 43 | #' path_to_parquet = tempfile(fileext = ".parquet") 44 | #' ) 45 | #' 46 | #' # the file is local and not zipped so : 47 | #' # 1. 
parquetize it 48 | #' 49 | #' file_path <- download_extract(parquetize_example("region_2022.csv")) 50 | #' csv_to_parquet( 51 | #' file_path, 52 | #' path_to_parquet = tempfile(fileext = ".parquet") 53 | #' ) 54 | #' 55 | download_extract <- function(path, filename_in_zip) { 56 | if (is_remote(path)) { 57 | tmp_file <- curl_download(path,tempfile(fileext = file_ext(path))) 58 | } else { 59 | tmp_file <- path 60 | } 61 | 62 | if (!is_zip(path)) return(invisible(tmp_file)) 63 | 64 | csv_files <- unzip(zipfile=tmp_file,exdir=tempfile()) 65 | names(csv_files) <- basename(csv_files) 66 | 67 | if (length(csv_files) > 1 & missing(filename_in_zip)) { 68 | cli_abort("Be careful, zip files contains more than one file, you must set filename_in_zip argument", 69 | class = "parquetize_missing_argument") 70 | } else if (length(csv_files) > 1) { 71 | path <- csv_files[[filename_in_zip]] 72 | } else { 73 | path <- csv_files[[1]] 74 | } 75 | invisible(path) 76 | } 77 | 78 | -------------------------------------------------------------------------------- /R/write_parquet_at_once.R: -------------------------------------------------------------------------------- 1 | #' @name write_parquet_at_once 2 | #' 3 | #' @title write parquet file or dataset based on partition argument \cr 4 | #' 5 | #' @description Low level function that implements the logic to write a parquet file or a dataset from data 6 | #' 7 | #' @param data the data.frame/tibble to write 8 | #' @inheritParams write_parquet_by_chunk 9 | #' @param partition string ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file. 10 | #' If "yes", `"partitioning"` argument must be filled in. In this case, a folder will be created for each modality of the variable filled in `"partitioning"`. 11 | #' 12 | #' @return a dataset as return by arrow::open_dataset 13 | #' 14 | #' @export 15 | #' 16 | #' @examples 17 | #' 18 | #' write_parquet_at_once(iris, tempfile()) 19 | #' 20 | #' write_parquet_at_once(iris, tempfile(), partition = "yes", partitioning = c("Species")) 21 | #' 22 | #' \dontrun{ 23 | #' write_parquet_at_once(iris, tempfile(), compression="gzip", compression_level = 5) 24 | #' } 25 | write_parquet_at_once <- function( 26 | data, 27 | path_to_parquet, 28 | partition = "no", 29 | compression = "snappy", 30 | compression_level = NULL, 31 | ...) { 32 | Sys.sleep(0.01) 33 | cli_progress_message("Writing data...") 34 | 35 | if (missing(data)) { 36 | cli_abort("Be careful, data argument is mandatory", class = "parquetize_missing_argument") 37 | } 38 | 39 | if (missing(path_to_parquet)) { 40 | cli_abort("Be careful, path_to_parquet argument is mandatory", class = "parquetize_missing_argument") 41 | } 42 | 43 | if (partition == "no") { 44 | if (isTRUE(file.info(path_to_parquet)$isdir)) { 45 | path_to_parquet <- file.path(path_to_parquet, paste0(basename(path_to_parquet), ".parquet")) 46 | cli_alert_warning("Be careful, path_to_parquet should be a file name, using : {path_to_parquet}") 47 | } 48 | 49 | write_parquet(data, 50 | sink = path_to_parquet, 51 | compression = compression, 52 | compression_level = compression_level, 53 | ...) 54 | parquet_type <- "file" 55 | } else if (partition == "yes") { 56 | write_dataset(data, 57 | path = path_to_parquet, 58 | compression = compression, 59 | compression_level = compression_level, 60 | ...) 
61 | parquet_type <- "dataset" 62 | } 63 | Sys.sleep(0.01) 64 | cli_alert_success("\nData are available in parquet {parquet_type} under {path_to_parquet}") 65 | invisible(arrow::open_dataset(path_to_parquet)) 66 | } 67 | -------------------------------------------------------------------------------- /man/json_to_parquet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/json_to_parquet.R 3 | \name{json_to_parquet} 4 | \alias{json_to_parquet} 5 | \title{Convert a json file to parquet format} 6 | \usage{ 7 | json_to_parquet( 8 | path_to_file, 9 | path_to_parquet, 10 | format = "json", 11 | partition = "no", 12 | compression = "snappy", 13 | compression_level = NULL, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{path_to_file}{String that indicates the path to the input file (don't forget the extension).} 19 | 20 | \item{path_to_parquet}{String that indicates the path to the directory where the parquet files will be stored.} 21 | 22 | \item{format}{string that indicates if the format is "json" (by default) or "ndjson"} 23 | 24 | \item{partition}{String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file. 25 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}. 26 | Be careful, this argument can not be "yes" if \code{max_memory} or \code{max_rows} argument are not NULL.} 27 | 28 | \item{compression}{compression algorithm. Default "snappy".} 29 | 30 | \item{compression_level}{compression level. Meaning depends on compression algorithm.} 31 | 32 | \item{...}{additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 33 | and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more informations.} 34 | } 35 | \value{ 36 | A parquet file, invisibly 37 | } 38 | \description{ 39 | This function allows to convert a \href{https://www.json.org/json-en.html}{json} 40 | or \href{https://docs.mulesoft.com/dataweave/latest/dataweave-formats-ndjson}{ndjson} file to parquet format. \cr 41 | 42 | Two conversions possibilities are offered : 43 | 44 | \itemize{ 45 | 46 | \item{Convert to a single parquet file. Argument \code{path_to_parquet} must then be used;} 47 | \item{Convert to a partitioned parquet file. 
Additionnal arguments \code{partition} and \code{partitioning} must then be used;} 48 | 49 | } 50 | } 51 | \examples{ 52 | 53 | # Conversion from a local json file to a single parquet file :: 54 | 55 | json_to_parquet( 56 | path_to_file = system.file("extdata","iris.json",package = "parquetize"), 57 | path_to_parquet = tempfile(fileext = ".parquet") 58 | ) 59 | 60 | # Conversion from a local ndjson file to a partitioned parquet file :: 61 | 62 | json_to_parquet( 63 | path_to_file = system.file("extdata","iris.ndjson",package = "parquetize"), 64 | path_to_parquet = tempfile(fileext = ".parquet"), 65 | format = "ndjson" 66 | ) 67 | } 68 | -------------------------------------------------------------------------------- /tests/testthat/test-dbi_to_parquet.R: -------------------------------------------------------------------------------- 1 | dbi_connection <- DBI::dbConnect(RSQLite::SQLite(), 2 | system.file("extdata","iris.sqlite",package = "parquetize")) 3 | on.exit(DBI::dbDisconnect(dbi_connection)) 4 | 5 | test_that("Checks arguments are correctly filled in", { 6 | expect_missing_argument( 7 | dbi_to_parquet( 8 | sql_query = "SELECT * FROM iris", 9 | path_to_parquet = "Data_test" 10 | ), 11 | regexp = "conn" 12 | ) 13 | 14 | expect_missing_argument( 15 | dbi_to_parquet( 16 | conn = dbi_connection, 17 | path_to_parquet = "Data_test" 18 | ), 19 | regexp = "sql_query" 20 | ) 21 | 22 | expect_missing_argument( 23 | dbi_to_parquet( 24 | conn = dbi_connection, 25 | sql_query = "SELECT * FROM iris" 26 | ), 27 | regexp = "path_to_parquet" 28 | ) 29 | }) 30 | 31 | test_that("Checks simple query generate a parquet file", { 32 | path_to_parquet <- tempfile() 33 | 34 | expect_no_error( 35 | dbi_to_parquet( 36 | conn = dbi_connection, 37 | sql_query = "SELECT * FROM iris", 38 | path_to_parquet = path_to_parquet 39 | ) 40 | ) 41 | 42 | expect_parquet( 43 | path_to_parquet, 44 | with_lines = 150 45 | ) 46 | }) 47 | 48 | test_that("Checks simple query generate a parquet file with good messages", { 49 | path_to_parquet <- tempfile() 50 | 51 | expect_no_error( 52 | dbi_to_parquet( 53 | conn = dbi_connection, 54 | sql_query = "SELECT * FROM iris", 55 | path_to_parquet = path_to_parquet, 56 | partition = "yes", 57 | partitioning = "Species" 58 | ) 59 | ) 60 | 61 | expect_parquet( 62 | path_to_parquet, 63 | with_lines = 150, 64 | with_partitions = c("Species=setosa", "Species=versicolor", "Species=virginica") 65 | ) 66 | }) 67 | 68 | test_that("Checks simple query works by chunk with max_rows", { 69 | path_to_parquet <- tempfile() 70 | 71 | expect_no_error( 72 | dbi_to_parquet( 73 | conn = dbi_connection, 74 | sql_query = "SELECT * FROM iris", 75 | path_to_parquet = path_to_parquet, 76 | max_rows = 49 77 | ) 78 | ) 79 | 80 | expect_parquet( 81 | path_to_parquet, 82 | with_lines = 150 83 | ) 84 | }) 85 | 86 | test_that("Checks simple query works by chunk with max_memory", { 87 | path_to_parquet <- tempfile() 88 | parquetname <- "iris" 89 | 90 | expect_no_error( 91 | dbi_to_parquet( 92 | conn = dbi_connection, 93 | sql_query = "SELECT * FROM iris", 94 | path_to_parquet = path_to_parquet, 95 | max_memory = 2 / 1024 96 | ) 97 | ) 98 | 99 | expect_parquet( 100 | path_to_parquet, 101 | with_lines = 150 102 | ) 103 | }) 104 | 105 | -------------------------------------------------------------------------------- /man/sqlite_to_parquet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in 
R/sqlite_to_parquet.R 3 | \name{sqlite_to_parquet} 4 | \alias{sqlite_to_parquet} 5 | \title{Convert a sqlite file to parquet format} 6 | \usage{ 7 | sqlite_to_parquet( 8 | path_to_file, 9 | table_in_sqlite, 10 | path_to_parquet, 11 | partition = "no", 12 | compression = "snappy", 13 | compression_level = NULL, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{path_to_file}{String that indicates the path to the input file (don't forget the extension).} 19 | 20 | \item{table_in_sqlite}{string that indicates the name of the table to convert in the sqlite file} 21 | 22 | \item{path_to_parquet}{String that indicates the path to the directory where the parquet files will be stored.} 23 | 24 | \item{partition}{String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file. 25 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}. 26 | Be careful, this argument can not be "yes" if \code{max_memory} or \code{max_rows} argument are not NULL.} 27 | 28 | \item{compression}{compression algorithm. Default "snappy".} 29 | 30 | \item{compression_level}{compression level. Meaning depends on compression algorithm.} 31 | 32 | \item{...}{additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 33 | and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more informations.} 34 | } 35 | \value{ 36 | A parquet file, invisibly 37 | } 38 | \description{ 39 | This function allows to convert a table from a sqlite file to parquet format. \cr 40 | The following extensions are supported : 41 | "db","sdb","sqlite","db3","s3db","sqlite3","sl3","db2","s2db","sqlite2","sl2". \cr 42 | 43 | Two conversions possibilities are offered : 44 | 45 | \itemize{ 46 | 47 | \item{Convert to a single parquet file. Argument \code{path_to_parquet} must then be used;} 48 | \item{Convert to a partitioned parquet file. 
Additionnal arguments \code{partition} and \code{partitioning} must then be used;} 49 | 50 | } 51 | } 52 | \examples{ 53 | 54 | # Conversion from a local sqlite file to a single parquet file : 55 | 56 | sqlite_to_parquet( 57 | path_to_file = system.file("extdata","iris.sqlite",package = "parquetize"), 58 | table_in_sqlite = "iris", 59 | path_to_parquet = tempfile(fileext = ".parquet") 60 | ) 61 | 62 | # Conversion from a local sqlite file to a partitioned parquet file : 63 | 64 | sqlite_to_parquet( 65 | path_to_file = system.file("extdata","iris.sqlite",package = "parquetize"), 66 | table_in_sqlite = "iris", 67 | path_to_parquet = tempfile(), 68 | partition = "yes", 69 | partitioning = c("Species") 70 | ) 71 | } 72 | -------------------------------------------------------------------------------- /tests/testthat/test-write_parquet_by_chunk.R: -------------------------------------------------------------------------------- 1 | # we create the closure to loop over the data.frame 2 | my_read_closure <- function() { 3 | function(input, skip = 0L, n_max = Inf) { 4 | # if we are after the end of the input we return an empty data.frame 5 | if (skip+1 > nrow(input)) { return(data.frame()) } 6 | 7 | input[(skip+1):(min(skip+n_max, nrow(input))),] 8 | } 9 | } 10 | 11 | test_that("checks that argument works", { 12 | read_method <- my_read_closure() 13 | 14 | expect_missing_argument( 15 | write_parquet_by_chunk( 16 | input = mtcars, 17 | path_to_parquet = tempfile(), 18 | max_rows = 10, 19 | ), 20 | regexp = "read_method" 21 | ) 22 | 23 | expect_missing_argument( 24 | write_parquet_by_chunk( 25 | read_method = read_method, 26 | path_to_parquet = tempfile(), 27 | max_rows = 10, 28 | ), 29 | regexp = "input" 30 | ) 31 | 32 | expect_error( 33 | write_parquet_by_chunk( 34 | read_method = "", 35 | input = mtcars, 36 | path_to_parquet = tempfile(), 37 | max_rows = 10, 38 | ), 39 | regexp = "read_method", 40 | class = "parquetize_bad_argument" 41 | ) 42 | 43 | expect_error( 44 | write_parquet_by_chunk( 45 | read_method = read_method, 46 | input = mtcars, 47 | path_to_parquet = tempfile(), 48 | max_rows = 10, 49 | max_memory = 10, 50 | ), 51 | regexp = "can not be used together", 52 | class = "parquetize_bad_argument" 53 | ) 54 | }) 55 | 56 | test_that("works with empty data", { 57 | path_to_parquet <- tempfile() 58 | read_method <- my_read_closure() 59 | 60 | expect_no_error( 61 | write_parquet_by_chunk( 62 | read_method = read_method, 63 | input = data.frame(), 64 | path_to_parquet = path_to_parquet, 65 | max_rows = 50, 66 | ) 67 | ) 68 | 69 | expect_parquet(path_to_parquet, with_lines = 0) 70 | }) 71 | 72 | test_that("Checks parquetizing by nrow chunks works", { 73 | path_to_parquet <- tempfile() 74 | read_method <- my_read_closure() 75 | 76 | expect_no_error( 77 | write_parquet_by_chunk( 78 | read_method = read_method, 79 | input = iris, 80 | path_to_parquet = path_to_parquet, 81 | max_rows = 50, 82 | ) 83 | ) 84 | 85 | expect_parquet(path_to_parquet, with_lines = 150, with_files = 3) 86 | }) 87 | 88 | test_that("Checks parquetizing by memory size chunks works", { 89 | path_to_parquet <- tempfile() 90 | read_method <- my_read_closure() 91 | 92 | expect_no_error( 93 | write_parquet_by_chunk( 94 | read_method = read_method, 95 | input = iris, 96 | path_to_parquet = path_to_parquet, 97 | max_memory = 2 / 1024, 98 | ) 99 | ) 100 | 101 | expect_parquet(path_to_parquet, with_lines = 150, with_files = 4) 102 | }) 103 | -------------------------------------------------------------------------------- 
/R/utilities.R: -------------------------------------------------------------------------------- 1 | #' @name get_lines_for_memory 2 | #' 3 | #' @title Utility to guess the number of lines fiting in given memory_size 4 | #' 5 | #' @param data a tibble/dataframe of equivalent with the data sample used to guess memory 6 | #' @param memory_size memory (in Mo) to use for one chunk, default to 4000Mb 7 | #' 8 | #' This method tries to estimate the number lines that fit in argument 9 | #' memory_size 10 | #' 11 | #' @noRd 12 | get_lines_for_memory <- function(data, max_memory = 4000) { 13 | data_memory_size <- object.size(data) 14 | # cosmetic : remove object.size attribute 15 | attributes(data_memory_size) <- NULL 16 | 17 | # max_memory is in Mb and data_memory_size in bytes 18 | lines <- ceiling(max_memory * 1024 * 1024 * nrow(data) / data_memory_size) 19 | lines 20 | } 21 | 22 | haven_read_function_by_extension <- list( 23 | "sas7bdat" = haven::read_sas, 24 | "SAS7BDAT" = haven::read_sas, 25 | "sav" = haven::read_sav, 26 | "SAV" = haven::read_sav, 27 | "dta" = haven::read_dta, 28 | "DTA" = haven::read_dta 29 | ) 30 | 31 | #' @name get_read_function_for_file 32 | #' 33 | #' @title Utility that returns the haven method to use for given file 34 | #' 35 | #' @param file_name string that indicates the path to the input file 36 | #' 37 | #' @noRd 38 | get_haven_read_function_for_file <- function(file_name) { 39 | ext <- tools::file_ext(file_name) 40 | if (ext == "") { 41 | cli_abort("Be careful, unable to find a read method for \"{file_name}\", it has no extension", 42 | class = "parquetize_bad_argument") 43 | } 44 | 45 | fun <- haven_read_function_by_extension[[ext]] 46 | if (is.null(fun)) { 47 | cli_abort("Be careful, no method to read \"{file_name}\" file", 48 | class = "parquetize_bad_argument") 49 | } 50 | 51 | fun 52 | } 53 | 54 | 55 | #' @name is_remote 56 | #' 57 | #' @title Utility to check if file is local or remote 58 | #' 59 | #' @param path file's path 60 | #' @return TRUE if remote, FALSE otherwise 61 | #' 62 | #' @noRd 63 | 64 | is_remote <- function(path) { 65 | grepl('(http|ftp)s?://', path) 66 | } 67 | 68 | #' @name is_zip 69 | #' 70 | #' @title Utility to check if file is a zip 71 | #' 72 | #' @param path file's path 73 | #' @return TRUE if zip, FALSE otherwise 74 | #' 75 | #' @noRd 76 | 77 | is_zip <- function(path) { 78 | grepl('\\.zip$', path, ignore.case = TRUE) 79 | } 80 | 81 | #' @name get_col_types 82 | #' 83 | #' @title Utility to get informations on parquet file's columns 84 | #' 85 | #' @param ds a dataset/parquet file 86 | #' 87 | #' @return a tibble with 2 columns : 88 | #' 89 | #' * the column name (string) 90 | #' * the arrow type (string) 91 | # 92 | #' @noRd 93 | get_col_types <- function(ds) { 94 | fields <- ds$schema$fields 95 | 96 | tibble( 97 | name = unlist(lapply(fields, function(x) { x$name })), 98 | type = unlist(lapply(fields, function(x) { x$type$name })) 99 | ) 100 | } 101 | -------------------------------------------------------------------------------- /tests/testthat/test-testthat-helpers.R: -------------------------------------------------------------------------------- 1 | test_that("expect_parquet fails on file error", { 2 | expect_error( 3 | expect_parquet(parquetize_example("region_2022.csv"), with_lines = 25), 4 | regexp = "Invalid" 5 | ) 6 | }) 7 | 8 | test_that("expect_parquet fails on file's number of line", { 9 | expect_error( 10 | expect_parquet(parquetize_example("iris_dataset"), with_lines = 25), 11 | class = "expectation_failure" 12 | ) 
13 | }) 14 | 15 | test_that("expect_parquet works without partitions", { 16 | expect_no_error( 17 | expect_parquet(parquetize_example("iris_dataset"), with_lines = 150) 18 | ) 19 | }) 20 | 21 | test_that("expect_parquet works with partitions", { 22 | expect_no_error( 23 | expect_parquet(parquetize_example("iris_dataset"), 24 | with_lines = 150, 25 | with_partitions = c('Species=setosa', 'Species=versicolor', 'Species=virginica'), 26 | with_files = 3) 27 | ) 28 | }) 29 | 30 | test_that("expect_parquet works with columns", { 31 | expect_no_error( 32 | expect_parquet(parquetize_example("iris_dataset"), 33 | with_lines = 150, 34 | with_columns = c("Petal.Width", "Sepal.Length", "Sepal.Width", "Species", "Petal.Length")) 35 | ) 36 | }) 37 | 38 | test_that("expect_parquet fails works with partitions", { 39 | expect_error( 40 | expect_parquet(parquetize_example("iris_dataset"), 41 | with_lines = 150, 42 | with_partitions = c('Species=setosa')), 43 | class = "partquetize_test_with_partitions" 44 | ) 45 | }) 46 | 47 | test_that("expect_parquet fails with bad columns columns", { 48 | expect_error( 49 | expect_parquet(parquetize_example("iris_dataset"), 50 | with_lines = 150, 51 | with_columns = c("Petal.Length", "Petal.Width", "Sepal.Length")), 52 | class = "partquetize_test_with_columns" 53 | ) 54 | }) 55 | 56 | test_that("expect_missing_argument check good errors", { 57 | raising_fun <- function() { 58 | cli_abort("string", class = "parquetize_missing_argument") 59 | } 60 | expect_no_error( 61 | expect_missing_argument(raising_fun(), regexp = "string") 62 | ) 63 | }) 64 | 65 | test_that("expect_missing_argument fails on bad string", { 66 | raising_fun <- function() { 67 | cli_abort("string", class = "parquetize_missing_argument") 68 | } 69 | expect_error( 70 | expect_missing_argument(raising_fun(), regexp = "message") 71 | ) 72 | }) 73 | 74 | test_that("expect_missing_argument fails on bad error type", { 75 | raising_fun <- function() { 76 | cli_abort("string", class = "a_class") 77 | } 78 | expect_error( 79 | expect_missing_argument(raising_fun(), regexp = "string"), 80 | class = "a_class" 81 | ) 82 | }) 83 | 84 | test_that("expect_parquet fails with bad files number", { 85 | expect_error( 86 | expect_parquet(parquetize_example("iris_dataset"), with_lines = 150, with_files = 100), 87 | class = "partquetize_test_with_files" 88 | ) 89 | }) 90 | -------------------------------------------------------------------------------- /R/json_to_parquet.R: -------------------------------------------------------------------------------- 1 | #' @name json_to_parquet 2 | #' 3 | #' @title Convert a json file to parquet format 4 | #' 5 | #' @description This function allows to convert a \href{https://www.json.org/json-en.html}{json} 6 | #' or \href{https://docs.mulesoft.com/dataweave/latest/dataweave-formats-ndjson}{ndjson} file to parquet format. \cr 7 | #' 8 | #' Two conversions possibilities are offered : 9 | #' 10 | #'\itemize{ 11 | #' 12 | #' \item{Convert to a single parquet file. Argument `path_to_parquet` must then be used;} 13 | #' \item{Convert to a partitioned parquet file. Additionnal arguments `partition` and `partitioning` must then be used;} 14 | #' 15 | #' } 16 | #' 17 | #' @param format string that indicates if the format is "json" (by default) or "ndjson" 18 | #' @inheritParams table_to_parquet 19 | #' @param ... 
additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 20 | #' and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information. 21 | #' @return A parquet file, invisibly 22 | #' 23 | #' @export 24 | #' 25 | #' @examples 26 | #' 27 | #' # Conversion from a local json file to a single parquet file :: 28 | #' 29 | #' json_to_parquet( 30 | #' path_to_file = system.file("extdata","iris.json",package = "parquetize"), 31 | #' path_to_parquet = tempfile(fileext = ".parquet") 32 | #' ) 33 | #' 34 | #' # Conversion from a local ndjson file to a partitioned parquet file :: 35 | #' 36 | #' json_to_parquet( 37 | #' path_to_file = system.file("extdata","iris.ndjson",package = "parquetize"), 38 | #' path_to_parquet = tempfile(fileext = ".parquet"), 39 | #' format = "ndjson" 40 | #' ) 41 | 42 | json_to_parquet <- function( 43 | path_to_file, 44 | path_to_parquet, 45 | format = "json", 46 | partition = "no", 47 | compression = "snappy", 48 | compression_level = NULL, 49 | ... 50 | ) { 51 | 52 | # Check if path_to_file is missing 53 | if (missing(path_to_file)) { 54 | cli_abort("Be careful, the argument path_to_file must be filled in", class = "parquetize_missing_argument") 55 | } 56 | 57 | # Check if path_to_parquet is missing 58 | if (missing(path_to_parquet)) { 59 | cli_abort("Be careful, the argument path_to_parquet must be filled in", class = "parquetize_missing_argument") 60 | } 61 | 62 | # Check if format is equal to "json" or "ndjson" 63 | if (!(format %in% c("json","ndjson"))) { 64 | cli_abort("Be careful, the argument format must be equal to 'json' or 'ndjson'", class = "parquetize_bad_format") 65 | } 66 | 67 | Sys.sleep(0.01) 68 | cli_progress_message("Reading data...") 69 | 70 | if (format == "json") { 71 | json_output <- jsonlite::read_json(path = path_to_file, 72 | simplifyVector = TRUE) 73 | } else if (format == "ndjson") { 74 | json_output <- read_json_arrow(file = path_to_file, 75 | as_data_frame = TRUE) 76 | } 77 | 78 | dataset <- write_parquet_at_once(json_output, path_to_parquet, partition, compression, compression_level, ...) 79 | 80 | return(invisible(dataset)) 81 | 82 | } 83 | -------------------------------------------------------------------------------- /R/rbind_parquet.R: -------------------------------------------------------------------------------- 1 | #' @name rbind_parquet 2 | #' 3 | #' @title Function to bind multiple parquet files by row 4 | #' 5 | #' @description This function reads all parquet files in the `folder` argument that start with `output_name`, 6 | #' combines them using rbind and writes the result to a new parquet file. \cr 7 | #' 8 | #' It can also delete the initial files if the `delete_initial_files` argument is TRUE. \cr 9 | #' 10 | #' Be careful, this function will not work if files with different structures 11 | #' are present in the folder given with the argument `folder`. 12 | #' 13 | #' @param folder the folder where the initial files are stored 14 | #' @param output_name name of the output parquet file 15 | #' @param delete_initial_files Boolean. Should the function delete the initial files? By default TRUE. 16 | #' @param compression compression algorithm. Default "snappy". 17 | #' @param compression_level compression level. Meaning depends on compression algorithm.
18 | #' 19 | #' @return Parquet files, invisibly 20 | #' 21 | #' @export 22 | #' 23 | #' @examples 24 | #' \dontrun{ 25 | #' library(arrow) 26 | #' if (file.exists('output')==FALSE) { 27 | #' dir.create("output") 28 | #' } 29 | #' 30 | #' file.create(fileext = "output/test_data1-4.parquet") 31 | #' write_parquet(data.frame( 32 | #' x = c("a","b","c"), 33 | #' y = c(1L,2L,3L) 34 | #' ), 35 | #' "output/test_data1-4.parquet") 36 | #' 37 | #' file.create(fileext = "output/test_data4-6.parquet") 38 | #' write_parquet(data.frame( 39 | #' x = c("d","e","f"), 40 | #' y = c(4L,5L,6L) 41 | #' ), "output/test_data4-6.parquet") 42 | #' 43 | #' test_data <- rbind_parquet(folder = "output", 44 | #' output_name = "test_data", 45 | #' delete_initial_files = FALSE) 46 | #' } 47 | 48 | rbind_parquet <- function(folder, 49 | output_name, 50 | delete_initial_files = TRUE, 51 | compression = "snappy", 52 | compression_level = NULL) { 53 | 54 | # Get the list of files in the folder 55 | files <- list.files(folder, pattern = paste0("^",output_name,".*\\.parquet$")) 56 | 57 | # Initialize an empty list to store the data frames 58 | data_frames <- list() 59 | 60 | # Loop through the files 61 | for (file in files) { 62 | # Read the parquet file into a data frame 63 | df <- read_parquet(file.path(folder,file)) 64 | 65 | # Add the data frame to the list 66 | data_frames[[file]] <- df 67 | } 68 | 69 | # Use rbind to combine the data frames into a single data frame 70 | combined_df <- do.call(rbind, data_frames) 71 | 72 | # Delete the initial parquet files 73 | if (isTRUE(delete_initial_files)) { 74 | unlink(file.path(folder,files)) 75 | } 76 | 77 | # Write the combined data frame to a new parquet file 78 | write_parquet(combined_df, 79 | file.path(folder, paste0(output_name,".parquet")), 80 | compression = compression, 81 | compression_level = compression_level) 82 | 83 | cli_alert_success("\nThe {output_name} parquet file is available under {folder}") 84 | 85 | return(invisible(combined_df)) 86 | } 87 | -------------------------------------------------------------------------------- /R/sqlite_to_parquet.R: -------------------------------------------------------------------------------- 1 | #' @name sqlite_to_parquet 2 | #' 3 | #' @title Convert a sqlite file to parquet format 4 | #' 5 | #' @description This function allows to convert a table from a sqlite file to parquet format. \cr 6 | #' The following extensions are supported : 7 | #' "db","sdb","sqlite","db3","s3db","sqlite3","sl3","db2","s2db","sqlite2","sl2". \cr 8 | #' 9 | #' Two conversions possibilities are offered : 10 | #' 11 | #'\itemize{ 12 | #' 13 | #' \item{Convert to a single parquet file. Argument `path_to_parquet` must then be used;} 14 | #' \item{Convert to a partitioned parquet file. Additionnal arguments `partition` and `partitioning` must then be used;} 15 | #' 16 | #' } 17 | #' 18 | #' @param table_in_sqlite string that indicates the name of the table to convert in the sqlite file 19 | #' @inheritParams table_to_parquet 20 | #' @param ... additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 21 | #' and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more informations. 
22 | #' @return A parquet file, invisibly 23 | #' 24 | #' @export 25 | #' 26 | #' @examples 27 | #' 28 | #' # Conversion from a local sqlite file to a single parquet file : 29 | #' 30 | #' sqlite_to_parquet( 31 | #' path_to_file = system.file("extdata","iris.sqlite",package = "parquetize"), 32 | #' table_in_sqlite = "iris", 33 | #' path_to_parquet = tempfile(fileext = ".parquet") 34 | #' ) 35 | #' 36 | #' # Conversion from a local sqlite file to a partitioned parquet file : 37 | #' 38 | #' sqlite_to_parquet( 39 | #' path_to_file = system.file("extdata","iris.sqlite",package = "parquetize"), 40 | #' table_in_sqlite = "iris", 41 | #' path_to_parquet = tempfile(), 42 | #' partition = "yes", 43 | #' partitioning = c("Species") 44 | #' ) 45 | 46 | sqlite_to_parquet <- function( 47 | path_to_file, 48 | table_in_sqlite, 49 | path_to_parquet, 50 | partition = "no", 51 | compression = "snappy", 52 | compression_level = NULL, 53 | ... 54 | ) { 55 | 56 | # Check if path_to_file is missing 57 | if (missing(path_to_file)) { 58 | cli_abort("Be careful, the argument path_to_file must be filled in", class = "parquetize_missing_argument") 59 | } 60 | 61 | # Check if extension used in path_to_file is correct 62 | if (!(sub(".*\\.", "", path_to_file) %in% c("db","sdb","sqlite","db3","s3db","sqlite3","sl3","db2","s2db","sqlite2","sl2"))) { 63 | cli_abort("Be careful, the extension used in path_to_file is not correct", class = "parquetize_bad_format") 64 | } 65 | 66 | # Check if path_to_parquet is missing 67 | if (missing(path_to_parquet)) { 68 | cli_abort("Be careful, the argument path_to_parquet must be filled in", class = "parquetize_missing_argument") 69 | } 70 | 71 | Sys.sleep(0.01) 72 | cli_progress_message("Reading data...") 73 | 74 | con_sqlite <- DBI::dbConnect(RSQLite::SQLite(), path_to_file) 75 | 76 | # Check if table_in_sqlite exists in sqlite file 77 | list_table <- DBI::dbListTables(con_sqlite) 78 | if (!(table_in_sqlite %in% list_table)==TRUE) { 79 | cli_abort("Be careful, the table filled in the table_in_sqlite argument {table_in_sqlite} does not exist in your sqlite file", 80 | class = "parquetize_missing_table") 81 | } 82 | 83 | sqlite_output <- DBI::dbReadTable(con_sqlite, table_in_sqlite) 84 | 85 | DBI::dbDisconnect(con_sqlite, shutdown=TRUE) 86 | 87 | Sys.sleep(0.01) 88 | cli_progress_message("Writing data...") 89 | 90 | dataset <- write_parquet_at_once( 91 | sqlite_output, 92 | path_to_parquet, 93 | partition, 94 | compression, 95 | compression_level, 96 | ...) 97 | 98 | return(invisible(dataset)) 99 | 100 | } 101 | -------------------------------------------------------------------------------- /man/write_parquet_by_chunk.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/write_parquet_by_chunk.R 3 | \name{write_parquet_by_chunk} 4 | \alias{write_parquet_by_chunk} 5 | \title{read input by chunk on function and create dataset \cr} 6 | \usage{ 7 | write_parquet_by_chunk( 8 | read_method, 9 | input, 10 | path_to_parquet, 11 | max_rows = NULL, 12 | max_memory = NULL, 13 | chunk_memory_sample_lines = 10000, 14 | compression = "snappy", 15 | compression_level = NULL, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{read_method}{a method to read input files. This method take only three 21 | arguments 22 | 23 | \code{input} : some kind of data. 
Can be a 24 | \code{skip} : the number of row to skip 25 | \code{n_max} : the number of row to return 26 | 27 | This method will be called until it returns a dataframe/tibble with zero row.} 28 | 29 | \item{input}{that indicates the path to the input. It can be anything you 30 | want but more often a file's path or a data.frame.} 31 | 32 | \item{path_to_parquet}{String that indicates the path to the directory where 33 | the output parquet file or dataset will be stored.} 34 | 35 | \item{max_rows}{Number of lines that defines the size of the chunk. This 36 | argument can not be filled in if max_memory is used.} 37 | 38 | \item{max_memory}{Memory size (in Mb) in which data of one parquet file 39 | should roughly fit.} 40 | 41 | \item{chunk_memory_sample_lines}{Number of lines to read to evaluate 42 | max_memory. Default to 10 000.} 43 | 44 | \item{compression}{compression algorithm. Default "snappy".} 45 | 46 | \item{compression_level}{compression level. Meaning depends on compression algorithm.} 47 | 48 | \item{...}{Additional format-specific arguments, see 49 | \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()}} 50 | } 51 | \value{ 52 | a dataset as return by arrow::open_dataset 53 | } 54 | \description{ 55 | Low level function that implements the logic to to read input file by chunk and write a 56 | dataset. \cr 57 | 58 | It will: 59 | 60 | \itemize{ 61 | \item{calculate the number of row by chunk if needed;} 62 | \item{loop over the input file by chunk;} 63 | \item{write each output files.} 64 | } 65 | } 66 | \examples{ 67 | 68 | # example with a dataframe 69 | 70 | # we create the function to loop over the data.frame 71 | 72 | read_method <- function(input, skip = 0L, n_max = Inf) { 73 | # if we are after the end of the input we return an empty data.frame 74 | if (skip+1 > nrow(input)) { return(data.frame()) } 75 | 76 | # return the n_max row from skip + 1 77 | input[(skip+1):(min(skip+n_max, nrow(input))),] 78 | } 79 | 80 | # we use it 81 | 82 | write_parquet_by_chunk( 83 | read_method = read_method, 84 | input = mtcars, 85 | path_to_parquet = tempfile(), 86 | max_rows = 10, 87 | ) 88 | 89 | 90 | # 91 | # Example with haven::read_sas 92 | # 93 | 94 | # we need to pass two argument beside the 3 input, skip and n_max. 95 | # We will use a closure : 96 | 97 | my_read_closure <- function(encoding, columns) { 98 | function(input, skip = OL, n_max = Inf) { 99 | haven::read_sas(data_file = input, 100 | n_max = n_max, 101 | skip = skip, 102 | encoding = encoding, 103 | col_select = all_of(columns)) 104 | } 105 | } 106 | 107 | # we initialize the closure 108 | 109 | read_method <- my_read_closure(encoding = "WINDOWS-1252", columns = c("Species", "Petal_Width")) 110 | 111 | # we use it 112 | write_parquet_by_chunk( 113 | read_method = read_method, 114 | input = system.file("examples","iris.sas7bdat", package = "haven"), 115 | path_to_parquet = tempfile(), 116 | max_rows = 75, 117 | ) 118 | 119 | } 120 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to parquetize development 2 | 3 | The goal of this guide is to help you get up and contributing to `parquetize` as 4 | quickly as possible. The guide is divided into two main pieces: 5 | 6 | 1. Filing a bug report or feature request in an issue. 7 | 1. Suggesting a change via a pull request. 
8 | 9 | 10 | ## Issues 11 | 12 | When filing an issue, the most important thing is to include a minimal 13 | reproducible example so that I can quickly verify the problem, and then figure 14 | out how to fix it. There are three things you need to include to make your 15 | example reproducible: required packages, data, code. 16 | 17 | 1. **Packages** should be loaded at the top of the script, so it's easy to 18 | see which ones the example needs. 19 | 20 | 1. The easiest way to include **data** is to use `dput()` to generate the R code 21 | to recreate it. For example, to recreate the `mtcars` dataset in R, 22 | I'd perform the following steps: 23 | 24 | 1. Run `dput(mtcars)` in R 25 | 2. Copy the output 26 | 3. In my reproducible script, type `mtcars <- ` then paste. 27 | 28 | But even better is if you can create a `data.frame()` with just a handful 29 | of rows and columns that still illustrates the problem. 30 | 31 | 1. Spend a little bit of time ensuring that your **code** is easy for others to 32 | read: 33 | 34 | * make sure you've used spaces and your variable names are concise, but 35 | informative 36 | 37 | * use comments to indicate where your problem lies 38 | 39 | * do your best to remove everything that is not related to the problem. 40 | The shorter your code is, the easier it is to understand. 41 | 42 | You can check you have actually made a reproducible example by starting up a 43 | fresh R session and pasting your script in. 44 | 45 | (Unless you've been specifically asked for it, please don't include the output 46 | of `sessionInfo()`.) 47 | 48 | ## Pull requests 49 | 50 | To contribute a change to `parquetize`, follow these steps: 51 | 52 | 1. Create a branch in git and make your changes. 53 | 1. Push the branch to GitHub and open a pull request (PR). 54 | 1. Discuss the pull request. 55 | 1. Iterate until either I accept the PR or decide that it's not 56 | a good fit for `parquetize`. 57 | 58 | Each of these steps is described in more detail below. This might feel 59 | overwhelming the first time you get set up, but it gets easier with practice. 60 | 61 | If you're not familiar with git or GitHub, please start by reading an introduction to both. 62 | 63 | Pull requests will be evaluated against the following checklist: 64 | 65 | 1. __Motivation__. Your pull request should clearly and concisely motivate the 66 | need for change. 67 | 68 | Also include this motivation in `NEWS` so that when a new release of 69 | parquetize comes out it's easy for users to see what's changed. Add your 70 | item at the top of the file and use markdown for formatting. The 71 | news item should end with `(@yourGithubUsername, #the_issue_number)`. 72 | 73 | 2. __Only related changes__. Before you submit your pull request, please 74 | check to make sure that you haven't accidentally included any unrelated 75 | changes. These make it harder to see exactly what's changed, and to 76 | evaluate any unexpected side effects. 77 | 78 | Each PR corresponds to a git branch, so if you expect to submit 79 | multiple changes make sure to create multiple branches. If you have 80 | multiple changes that depend on each other, start with the first one 81 | and don't submit any others until the first one has been processed. 82 | 83 | 3. If you're adding new parameters or a new function, you'll also need 84 | to document them with [roxygen](https://github.com/klutometis/roxygen). 85 | Make sure to re-run `devtools::document()` on the code before submitting. 86 | 87 | 4. 
If fixing a bug or adding a new feature, 88 | please add a [testthat](https://github.com/r-lib/testthat) unit test. 89 | 90 | This seems like a lot of work but don't worry if your pull request isn't perfect. 91 | A pull request ("PR") is a process, and unless you've submitted a few in the 92 | past it's unlikely that your pull request will be accepted as is. 93 | 94 | Many thanks in advance ! 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ![GitHub top 3 | language](https://img.shields.io/github/languages/top/ddotta/parquetize) 4 | [![version](http://www.r-pkg.org/badges/version/parquetize)](https://CRAN.R-project.org/package=parquetize) 5 | [![cranlogs](http://cranlogs.r-pkg.org/badges/parquetize)](https://CRAN.R-project.org/package=parquetize) 6 | [![Downloads](https://cranlogs.r-pkg.org/badges/grand-total/parquetize?color=brightgreen)](https://cran.r-project.org/package=parquetize) 7 | [![R check 8 | status](https://github.com/ddotta/parquetize/workflows/R-CMD-check/badge.svg)](https://github.com/ddotta/parquetize/actions/workflows/check-release.yaml) 9 | [![codecov](https://codecov.io/gh/ddotta/parquetize/branch/main/graph/badge.svg?token=25MHI8O62M)](https://app.codecov.io/gh/ddotta/parquetize) 10 | [![CodeFactor](https://www.codefactor.io/repository/github/ddotta/parquetize/badge)](https://www.codefactor.io/repository/github/ddotta/parquetize) 11 | 12 | 13 | :package: Package `parquetize` 14 | ====================================== 15 | 16 | R package that allows you to convert files of different formats (csv, SAS, SPSS, Stata, rds, sqlite, JSON, ndJSON) to [parquet](https://parquet.apache.org/) format within a single function. 17 | 18 | ## Installation 19 | 20 | To install `parquetize` from CRAN : 21 | 22 | ``` r 23 | install.packages("parquetize") 24 | ``` 25 | 26 | Or alternatively to install the development version from GitHub : 27 | 28 | ``` r 29 | remotes::install_github("ddotta/parquetize") 30 | ``` 31 | 32 | Then to load it : 33 | 34 | ``` r 35 | library(parquetize) 36 | ``` 37 | 38 | ## Why this package ? 39 | 40 | This package is a simple wrapper of some very useful functions from the [haven](https://github.com/tidyverse/haven), [readr](https://github.com/tidyverse/readr/), [jsonlite](https://github.com/jeroen/jsonlite), [RSQLite](https://github.com/r-dbi/RSQLite) and [arrow](https://github.com/apache/arrow) packages. 41 | 42 | While working, I realized that I was often repeating the same operation when working with parquet files : 43 | 44 | - I import the file in R with {haven}, {jsonlite}, {readr}, {DBI} or {RSQLite}. 45 | - And I export the file in parquet format 46 | 47 | As a fervent advocate of the DRY principle (don't repeat yourself), the exported functions of this package make my life easier and **execute these operations within the same function**. 48 | 49 | **The last benefit** of using package `{parquetize}` is that its functions allow you to create single parquet files or partitioned files depending on the arguments chosen in the functions. 50 | 51 | - [csv_to_parquet()](https://ddotta.github.io/parquetize/reference/csv_to_parquet.html) 52 | - **The other benefit of this function** is that it allows you to convert csv or txt files, whether they are stored locally or available on the internet, either as plain csv/txt files or inside a zip. 
53 | - [json_to_parquet()](https://ddotta.github.io/parquetize/reference/json_to_parquet.html) 54 | - **The other benefit of this function** is that it handles JSON and ndJSON files in the same function. There is only one function to use for these 2 cases. 55 | - [rds_to_parquet()](https://ddotta.github.io/parquetize/reference/rds_to_parquet.html) 56 | - [fst_to_parquet()](https://ddotta.github.io/parquetize/reference/fst_to_parquet.html) 57 | - [table_to_parquet()](https://ddotta.github.io/parquetize/reference/table_to_parquet.html) 58 | - **The other benefit of this function** is that it handles SAS, SPSS and Stata files in the same function. There is only one function to use for these 3 cases. To avoid overloading R's RAM with huge tables, the conversion can be done by chunk. For more information, see [here](https://ddotta.github.io/parquetize/articles/aa-conversions.html) 59 | - [sqlite_to_parquet()](https://ddotta.github.io/parquetize/reference/sqlite_to_parquet.html) 60 | - [dbi_to_parquet()](https://ddotta.github.io/parquetize/reference/dbi_to_parquet.html) 61 | 62 | 63 | For more details, see the examples associated with each function in the documentation. 64 | 65 | ## Example 66 | 67 | You want to use the Insee file of first names by birth department? Use R and the {parquetize} package, which takes care of everything: it downloads the data (3.7 million rows) and converts it to parquet format in a few seconds ! 68 | 69 | 70 | 71 | ## Contribution 72 | 73 | Feel free to contribute and add features that you find useful in your daily work. 74 | Ideas are welcome in [the issues](https://github.com/ddotta/parquetize/issues). 75 | -------------------------------------------------------------------------------- /man/dbi_to_parquet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dbi_to_parquet.R 3 | \name{dbi_to_parquet} 4 | \alias{dbi_to_parquet} 5 | \title{Convert a SQL Query on a DBI connection to parquet format} 6 | \usage{ 7 | dbi_to_parquet( 8 | conn, 9 | sql_query, 10 | path_to_parquet, 11 | max_memory, 12 | max_rows, 13 | chunk_memory_sample_lines = 10000, 14 | partition = "no", 15 | compression = "snappy", 16 | compression_level = NULL, 17 | ... 18 | ) 19 | } 20 | \arguments{ 21 | \item{conn}{A DBIConnection object, as returned by DBI::dbConnect} 22 | 23 | \item{sql_query}{a character string containing an SQL query (this argument is passed to DBI::dbSendQuery)} 24 | 25 | \item{path_to_parquet}{String that indicates the path to the directory where the parquet files will be stored.} 26 | 27 | \item{max_memory}{Memory size (in Mb) in which data of one parquet file should roughly fit.} 28 | 29 | \item{max_rows}{Number of lines that defines the size of the chunk. 30 | This argument can not be filled in if max_memory is used.} 31 | 32 | \item{chunk_memory_sample_lines}{Number of lines to read to evaluate max_memory. Default to 10 000.} 33 | 34 | \item{partition}{String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file. 35 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}. 36 | Be careful, this argument can not be "yes" if \code{max_memory} or \code{max_rows} arguments are not NULL.} 37 | 38 | \item{compression}{compression algorithm. Default "snappy".} 39 | 40 | \item{compression_level}{compression level. 
Meaning depends on compression algorithm.} 41 | 42 | \item{...}{additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 43 | and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.} 44 | } 45 | \value{ 46 | A parquet file, invisibly 47 | } 48 | \description{ 49 | This function allows you to convert a SQL query from a DBI connection to parquet format.\cr 50 | 51 | It handles all DBI supported databases. 52 | 53 | Two conversion possibilities are offered : 54 | 55 | \itemize{ 56 | 57 | \item{Convert to a single parquet file. Argument \code{path_to_parquet} must then be used;} 58 | \item{Convert to a partitioned parquet file. Additional arguments \code{partition} and \code{partitioning} must then be used;} 59 | 60 | } 61 | 62 | Examples explain how to convert a query to a chunked dataset. 63 | } 64 | \examples{ 65 | 66 | # Conversion from a sqlite dbi connection to a single parquet file : 67 | 68 | dbi_connection <- DBI::dbConnect(RSQLite::SQLite(), 69 | system.file("extdata","iris.sqlite",package = "parquetize")) 70 | 71 | # Reading iris table from local sqlite database 72 | # and conversion to one parquet file : 73 | 74 | dbi_to_parquet( 75 | conn = dbi_connection, 76 | sql_query = "SELECT * FROM iris", 77 | path_to_parquet = tempfile(fileext=".parquet"), 78 | ) 79 | 80 | # Reading iris table from local sqlite database by chunk (using 81 | # `max_memory` argument) and conversion to multiple parquet files 82 | 83 | dbi_to_parquet( 84 | conn = dbi_connection, 85 | sql_query = "SELECT * FROM iris", 86 | path_to_parquet = tempdir(), 87 | max_memory = 2 / 1024 88 | ) 89 | 90 | # Using chunk and partition together is not possible directly but easy to do : 91 | # Reading iris table from local sqlite database by chunk (using 92 | # `max_memory` argument) and conversion to arrow dataset partitioned by 93 | # species 94 | 95 | # get unique values of column "Species" from table "iris" 96 | partitions <- get_partitions(dbi_connection, table = "iris", column = "Species") 97 | 98 | # loop over those values 99 | for (species in partitions) { 100 | dbi_to_parquet( 101 | conn = dbi_connection, 102 | # use glue_sql to create the query filtering the partition 103 | sql_query = glue::glue_sql("SELECT * FROM iris where Species = {species}", 104 | .con = dbi_connection), 105 | # add the partition name in the output dir to respect parquet partition schema 106 | path_to_parquet = file.path(tempdir(), "iris", paste0("Species=", species)), 107 | max_memory = 2 / 1024, 108 | ) 109 | } 110 | 111 | # If you need a more complicated query to get your partitions, you can use 112 | # dbGetQuery directly : 113 | col_to_partition <- DBI::dbGetQuery(dbi_connection, "SELECT distinct(`Species`) FROM `iris`")[,1] 114 | 115 | } 116 | -------------------------------------------------------------------------------- /tests/testthat/test-csv_to_parquet.R: -------------------------------------------------------------------------------- 1 | options(timeout=200) 2 | 3 | test_that("Checks arguments are correctly filled in", { 4 | expect_missing_argument( 5 | csv_to_parquet( 6 | path_to_parquet = tempfile() 7 | ), 8 | regexp = "path_to_file" 9 | ) 10 | 11 | expect_missing_argument( 12 | csv_to_parquet( 13 | path_to_file = parquetize_example("region_2022.csv") 14 | ), 15 | regexp = "path_to_parquet" 16 | ) 17 | }) 18 | 19 | test_that("Checks simple conversion works", { 20 | path_to_parquet <- tempfile() 21 | 22 | 
expect_no_error( 23 | csv_to_parquet( 24 | path_to_file = parquetize_example("region_2022.csv"), 25 | path_to_parquet = path_to_parquet 26 | ) 27 | ) 28 | 29 | expect_parquet(path = path_to_parquet, with_lines = 18) 30 | }) 31 | 32 | test_that("Checks url_to_csv argument is deprecated", { 33 | expect_warning( 34 | csv_to_parquet( 35 | url_to_csv = "https://github.com/sidsriv/Introduction-to-Data-Science-in-python/raw/master/census.csv", 36 | path_to_parquet = tempfile() 37 | ), 38 | regexp = "deprecated" 39 | ) 40 | }) 41 | test_that("Checks csv_as_a_zip is deprecated", { 42 | expect_warning( 43 | csv_to_parquet( 44 | path_to_file = system.file("extdata","mtcars.csv.zip", package = "readr"), 45 | path_to_parquet = tempfile(), 46 | csv_as_a_zip = TRUE 47 | ), 48 | regexp = "deprecated" 49 | ) 50 | }) 51 | 52 | 53 | test_that("Checks it works with compression", { 54 | skip_if_offline() 55 | 56 | path_to_parquet <- tempfile() 57 | 58 | expect_no_error( 59 | csv_to_parquet( 60 | path_to_file = parquetize_example("region_2022.csv"), 61 | path_to_parquet = path_to_parquet, 62 | compression = "gzip", 63 | compression_level = 5 64 | ) 65 | ) 66 | 67 | expect_parquet(path = path_to_parquet, with_lines = 18) 68 | }) 69 | 70 | test_that("Checks it works when partitioning", { 71 | path_to_parquet <- tempfile() 72 | 73 | expect_no_error( 74 | csv_to_parquet( 75 | path_to_file = parquetize_example("region_2022.csv"), 76 | path_to_parquet = path_to_parquet, 77 | partition = "yes", 78 | partitioning = c("REG") 79 | ) 80 | ) 81 | 82 | expect_parquet(path = path_to_parquet, with_lines = 18) 83 | }) 84 | 85 | test_that("Checks error if argument columns is not a character vector", { 86 | expect_error( 87 | csv_to_parquet( 88 | path_to_file = parquetize_example("region_2022.csv"), 89 | path_to_parquet = tempfile(), 90 | columns = matrix(1:10) 91 | ), 92 | class = "parquetize_bad_argument" 93 | ) 94 | }) 95 | 96 | test_that("Checks columns are selected as wanted", { 97 | path_to_parquet <- tempfile() 98 | columns <- c("REG","LIBELLE") 99 | 100 | expect_no_error( 101 | csv_to_parquet( 102 | path_to_file = parquetize_example("region_2022.csv"), 103 | path_to_parquet = path_to_parquet, 104 | columns = columns 105 | ) 106 | ) 107 | 108 | expect_parquet( 109 | path_to_parquet, 110 | with_lines = 18, 111 | with_columns = columns) 112 | }) 113 | 114 | test_that("Checks message zip with one file works", { 115 | path_to_parquet <- tempfile() 116 | 117 | expect_no_error( 118 | csv_to_parquet( 119 | path_to_file = system.file("extdata","mtcars.csv.zip", package = "readr"), 120 | path_to_parquet = path_to_parquet, 121 | ) 122 | ) 123 | 124 | expect_parquet(path = path_to_parquet, with_lines = 32) 125 | }) 126 | 127 | 128 | test_that("Checks we have only selected columns in parquet file", { 129 | path_to_parquet <- tempfile() 130 | columns <- c("REG","LIBELLE") 131 | 132 | csv_to_parquet( 133 | path_to_file = parquetize_example("region_2022.csv"), 134 | path_to_parquet = path_to_parquet, 135 | columns = columns 136 | ) 137 | 138 | expect_setequal( 139 | names(read_parquet(path_to_parquet)), 140 | columns 141 | ) 142 | }) 143 | 144 | 145 | test_that("Checks error if csv starts with a comment", { 146 | expect_error( 147 | csv_to_parquet( 148 | path_to_file = parquetize_example("region_2022_with_comment.csv"), 149 | path_to_parquet = tempfile() 150 | ), 151 | regexp = 'Could not guess the delimiter' 152 | ) 153 | }) 154 | 155 | 156 | test_that("Checks conversion works with read_delim_args", { 157 | path_to_parquet <- 
tempfile() 158 | 159 | expect_no_error( 160 | csv_to_parquet( 161 | path_to_file = parquetize_example("region_2022_with_comment.csv"), 162 | path_to_parquet = path_to_parquet, 163 | read_delim_args = list(comment = '#') 164 | ) 165 | ) 166 | 167 | expect_parquet(path = path_to_parquet, with_lines = 18) 168 | }) 169 | -------------------------------------------------------------------------------- /man/csv_to_parquet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/csv_to_parquet.R 3 | \name{csv_to_parquet} 4 | \alias{csv_to_parquet} 5 | \title{Convert a csv or a txt file to parquet format} 6 | \usage{ 7 | csv_to_parquet( 8 | path_to_file, 9 | url_to_csv = lifecycle::deprecated(), 10 | csv_as_a_zip = lifecycle::deprecated(), 11 | filename_in_zip, 12 | path_to_parquet, 13 | columns = "all", 14 | compression = "snappy", 15 | compression_level = NULL, 16 | partition = "no", 17 | encoding = "UTF-8", 18 | read_delim_args = list(), 19 | ... 20 | ) 21 | } 22 | \arguments{ 23 | \item{path_to_file}{String that indicates the path to the input file (don't forget the extension).} 24 | 25 | \item{url_to_csv}{DEPRECATED use path_to_file instead} 26 | 27 | \item{csv_as_a_zip}{DEPRECATED} 28 | 29 | \item{filename_in_zip}{name of the csv/txt file in the zip. Required if several csv/txt are included in the zip.} 30 | 31 | \item{path_to_parquet}{String that indicates the path to the directory where the parquet files will be stored.} 32 | 33 | \item{columns}{Character vector of columns to select from the input file (by default, all columns are selected).} 34 | 35 | \item{compression}{compression algorithm. Default "snappy".} 36 | 37 | \item{compression_level}{compression level. Meaning depends on compression algorithm.} 38 | 39 | \item{partition}{String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file. 40 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}. 41 | Be careful, this argument can not be "yes" if \code{max_memory} or \code{max_rows} arguments are not NULL.} 42 | 43 | \item{encoding}{String that indicates the character encoding for the input file.} 44 | 45 | \item{read_delim_args}{list of arguments for \code{read_delim}.} 46 | 47 | \item{...}{additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 48 | and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.} 49 | } 50 | \value{ 51 | A parquet file, invisibly 52 | } 53 | \description{ 54 | This function allows you to convert a csv or a txt file to parquet format. \cr 55 | 56 | Two conversion possibilities are offered : 57 | 58 | \itemize{ 59 | 60 | \item{Convert to a single parquet file. Argument \code{path_to_parquet} must then be used;} 61 | \item{Convert to a partitioned parquet file. Additional arguments \code{partition} and \code{partitioning} must then be used;} 62 | 63 | } 64 | } 65 | \note{ 66 | Be careful, if the zip size exceeds 4 GB, the function may truncate 67 | the data (because unzip() won't work reliably in this case - 68 | see \href{https://rdrr.io/r/utils/unzip.html}{here}). 
69 | In this case, it's advised to unzip your csv/txt file by hand 70 | (for example with \href{https://www.7-zip.org/}{7-Zip}) 71 | then use the function with the argument \code{path_to_file}. 72 | } 73 | \examples{ 74 | 75 | # Conversion from a local csv file to a single parquet file : 76 | 77 | csv_to_parquet( 78 | path_to_file = parquetize_example("region_2022.csv"), 79 | path_to_parquet = tempfile(fileext=".parquet") 80 | ) 81 | 82 | # Conversion from a local txt file to a single parquet file : 83 | 84 | csv_to_parquet( 85 | path_to_file = parquetize_example("region_2022.txt"), 86 | path_to_parquet = tempfile(fileext=".parquet") 87 | ) 88 | 89 | # Conversion from a local csv file to a single parquet file and select only 90 | # few columns : 91 | 92 | csv_to_parquet( 93 | path_to_file = parquetize_example("region_2022.csv"), 94 | path_to_parquet = tempfile(fileext = ".parquet"), 95 | columns = c("REG","LIBELLE") 96 | ) 97 | 98 | # Conversion from a local csv file to a partitioned parquet file : 99 | 100 | csv_to_parquet( 101 | path_to_file = parquetize_example("region_2022.csv"), 102 | path_to_parquet = tempfile(fileext = ".parquet"), 103 | partition = "yes", 104 | partitioning = c("REG") 105 | ) 106 | 107 | # Conversion from a URL and a zipped file (csv) : 108 | 109 | csv_to_parquet( 110 | path_to_file = "https://www.nomisweb.co.uk/output/census/2021/census2021-ts007.zip", 111 | filename_in_zip = "census2021-ts007-ctry.csv", 112 | path_to_parquet = tempfile(fileext = ".parquet") 113 | ) 114 | 115 | \dontrun{ 116 | # Conversion from a URL and a zipped file (txt) : 117 | 118 | csv_to_parquet( 119 | path_to_file = "https://sourceforge.net/projects/irisdss/files/latest/download", 120 | filename_in_zip = "IRIS TEST data.txt", 121 | path_to_parquet = tempfile(fileext=".parquet") 122 | ) 123 | 124 | # Conversion from a URL and a csv file with "gzip" compression : 125 | 126 | csv_to_parquet( 127 | path_to_file = 128 | "https://github.com/sidsriv/Introduction-to-Data-Science-in-python/raw/master/census.csv", 129 | path_to_parquet = tempfile(fileext = ".parquet"), 130 | compression = "gzip", 131 | compression_level = 5 132 | ) 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /vignettes/aa-conversions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Convert huge input file to parquet" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{aa-conversions} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | ``` 16 | 17 | ```{r setup} 18 | library(parquetize) 19 | ``` 20 | 21 | ## With `table_to_parquet()` 22 | 23 | For **huge input files in SAS, SPSS and Stata formats**, the parquetize package allows you to perform a clever conversion by using `max_memory` or `max_rows` in the [`table_to_parquet()`](https://ddotta.github.io/parquetize/reference/table_to_parquet.html) function. 24 | The native behavior of this function (and all other functions in the package) is to load the entire table to be converted into R and then write it to disk (in a single file or a partitioned directory). 25 | 26 | When handling very large files, the risk that frequently occurs is that the R session aborts because it cannot load the entire database into memory. 
27 | This risk is even greater if you work locally on your computer; it is reduced if you work on remote servers. 28 | **`table_to_parquet()` offers this solution which answers a need expressed by parquetize users.** 29 | 30 | ------- 31 | **The idea is to split the very large table into "chunks" based on memory consumption of input data or on the number of rows in the table in order to be able to simultaneously :** 32 | - **read a chunk of the very large database** 33 | - **write this chunk in the parquet file** 34 | ------ 35 | 36 | Here are examples from the documentation using the iris table. There are two ways to split output files : 37 | 38 | * by memory consumption 39 | * by number of lines 40 | 41 | ### Splitting data by memory consumption 42 | 43 | `table_to_parquet` can guess the number of lines to put in a file based on the 44 | memory consumption with the argument `max_memory` expressed in Mb. 45 | 46 | Here we cut the 150 rows into chunks of roughly 5 Kb when a file is loaded as a 47 | tibble. 48 | In this example we get 2 parquet files called `iris1-89.parquet` and `iris90-150.parquet` 49 | 50 | ```{r iris-memory-example} 51 | table_to_parquet( 52 | path_to_file = system.file("examples", "iris.sas7bdat", package = "haven"), 53 | path_to_parquet = tempfile(), 54 | max_memory = 5 / 1024, 55 | encoding = "utf-8" 56 | ) 57 | ``` 58 | 59 | In real life, you should use a `max_memory` in the Gb range, for example 60 | with a SAS file of 50 000 000 lines and using `max_memory` of 5000 Mb : 61 | 62 | 63 | ```{r real-memory-example, eval=FALSE} 64 | table_to_parquet( 65 | path_to_file = "myhugefile.sas7bdat", 66 | path_to_parquet = tempdir(), 67 | max_memory = 5000, 68 | encoding = "utf-8" 69 | ) 70 | ``` 71 | 72 | 73 | ### Splitting data by number of lines 74 | 75 | > Tip: The number of lines that each chunk must contain must be supported by the RAM of your computer/server. Ideally, the number of chunks to be defined must be limited. It should be in the tens and not hundreds to limit the number of intermediate files (see example below). 76 | 77 | Here we cut the 150 rows into 3 chunks of 50 rows. In this example we get 3 parquet files of 50 lines called `iris1-50.parquet`, `iris51-100.parquet` and `iris101-150.parquet` 78 | 79 | ```{r iris-example, eval=FALSE} 80 | table_to_parquet( 81 | path_to_file = system.file("examples", "iris.sas7bdat", package = "haven"), 82 | path_to_parquet = tempfile(), 83 | max_rows = 50, 84 | encoding = "utf-8" 85 | ) 86 | ``` 87 | 88 | In real life, we can perform this kind of request with the parquetize API (for example with a SAS file of 50 000 000 lines and defining 25 chunks of 2 000 000 rows each) : 89 | 90 | 91 | ```{r real-example, eval=FALSE} 92 | table_to_parquet( 93 | path_to_file = "myhugefile.sas7bdat", 94 | path_to_parquet = tempdir(), 95 | max_rows = 2000000, 96 | encoding = "utf-8" 97 | ) 98 | ``` 99 | 100 | Files `myhugefile1-2000000.parquet`, `myhugefile2000001-4000000.parquet` ... will be created. 101 | 102 | ## Function `rbind_parquet()` 103 | 104 | If at the end of the conversion with `table_to_parquet()`, **you want to reconstitute a unique initial table** and **if you have the computer resources (in RAM) to do so**, you can use the `rbind_parquet()` helper function provided with the API. 105 | This function allows you to bind multiple parquet files by row. 
106 | Here's an example without deleting initial files (`delete_initial_files`=FALSE) : 107 | 108 | ```{r rbind_parquet-example, eval=FALSE} 109 | rbind_parquet( 110 | folder = tempfile(), 111 | output_name = "myhugefile", 112 | delete_initial_files = FALSE 113 | ) 114 | ``` 115 | The `myhugefile.parquet` file will be created from the `myhugefile1-2000000.parquet`, `myhugefile2000001-4000000.parquet`... files! 116 | 117 | ## Alternatives to `{parquetize}` 118 | 119 | Despite our best efforts, you may not be able to convert your very large database with {parquetize}. 120 | In this case, one solution is probably to turn to [duckdb](https://github.com/duckdb/duckdb-r), which offers undeniable advantages when it comes to conversion operations. 121 | -------------------------------------------------------------------------------- /tests/testthat/test-table_to_parquet.R: -------------------------------------------------------------------------------- 1 | test_that("Checks arguments are filled in", { 2 | expect_missing_argument( 3 | table_to_parquet( 4 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 5 | encoding = "utf-8" 6 | ), 7 | regexp = "path_to_parquet" 8 | ) 9 | 10 | expect_missing_argument( 11 | table_to_parquet( 12 | path_to_parquet = tempfile(), 13 | encoding = "utf-8" 14 | ), 15 | regexp = "path_to_file" 16 | ) 17 | }) 18 | 19 | test_that("Checks we can not use chunk_size with negative skip", { 20 | expect_error( 21 | table_to_parquet( 22 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 23 | path_to_parquet = tempfile(), 24 | encoding = "utf-8", 25 | max_rows = 50, 26 | skip = -100 27 | ), 28 | class = "parquetize_bad_argument", 29 | regexp = "skip must be must be greater than" 30 | ) 31 | }) 32 | 33 | test_that("Checks by_chunk is deprecated", { 34 | expect_warning( 35 | table_to_parquet( 36 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 37 | path_to_parquet = tempfile(), 38 | by_chunk = TRUE, 39 | max_rows = 50 40 | ), 41 | regexp = "This argument is no longer needed" 42 | ) 43 | }) 44 | 45 | test_that("Checks chunk_size and chunk_memory_size are deprecated", { 46 | expect_warning( 47 | table_to_parquet( 48 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 49 | path_to_parquet = tempfile(), 50 | chunk_size = 1000 51 | ), 52 | regexp = "This argument is deprecated" 53 | ) 54 | 55 | expect_warning( 56 | table_to_parquet( 57 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 58 | path_to_parquet = tempfile(), 59 | chunk_memory_size = 1000 60 | ), 61 | regexp = "This argument is deprecated" 62 | ) 63 | }) 64 | 65 | 66 | test_that("Checks argument columns is a character vector", { 67 | expect_error( 68 | table_to_parquet( 69 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 70 | path_to_parquet = tempfile(), 71 | columns = matrix(1:10) 72 | ), 73 | class = "parquetize_bad_type" 74 | ) 75 | }) 76 | 77 | test_that("Checks parquetizing all formats works and return files with the good number of lines", { 78 | for (extension in c("sas7bdat", "sav", "dta")) { 79 | path_to_parquet <- tempfile() 80 | file <- paste0("iris.", extension) 81 | 82 | expect_no_error( 83 | table_to_parquet( 84 | path_to_file = system.file("examples",file, package = "haven"), 85 | path_to_parquet = path_to_parquet 86 | ) 87 | ) 88 | 89 | expect_parquet(path_to_parquet, with_lines = 150) 90 | } 91 | }) 92 | 93 | test_that("Checks parquetizing by chunk with 
encoding works", { 94 | path_to_parquet <- tempfile() 95 | 96 | expect_no_error( 97 | table_to_parquet( 98 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 99 | path_to_parquet = path_to_parquet, 100 | max_rows = 50, 101 | encoding = "utf-8" 102 | ) 103 | ) 104 | 105 | expect_parquet(path_to_parquet, with_lines = 150, with_files = 3) 106 | }) 107 | 108 | test_that("Checks parquetizing works with partitioning", { 109 | path_to_parquet <- tempfile() 110 | 111 | expect_no_error( 112 | table_to_parquet( 113 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 114 | path_to_parquet = path_to_parquet, 115 | partition = "yes", 116 | partitioning = "Species" 117 | ) 118 | ) 119 | expect_parquet( 120 | path_to_parquet, 121 | with_lines = 150, 122 | with_partitions = c("Species=setosa", "Species=versic", "Species=virgin") 123 | ) 124 | 125 | }) 126 | 127 | test_that("Checks it fails with SAS by adding max_rows, partition and partitioning argument", { 128 | expect_error( 129 | table_to_parquet( 130 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 131 | path_to_parquet = tempfile(), 132 | max_rows = 50, 133 | partition = "yes", 134 | partitioning = "Species" 135 | ), 136 | class = "parquetize_bad_argument" 137 | ) 138 | }) 139 | 140 | test_that("Checks we have only selected columns in parquet file", { 141 | input_file <- system.file("examples","iris.sas7bdat", package = "haven") 142 | 143 | path_to_parquet <- tempfile() 144 | columns <- c("Species","Sepal_Length") 145 | 146 | table_to_parquet( 147 | path_to_file = input_file, 148 | path_to_parquet = path_to_parquet, 149 | columns = columns 150 | ) 151 | 152 | expect_parquet( 153 | path_to_parquet, 154 | with_lines = 150, 155 | with_columns = columns 156 | ) 157 | }) 158 | 159 | test_that("Checks we have only selected columns in parquet dataset", { 160 | input_file <- system.file("examples","iris.sas7bdat", package = "haven") 161 | path_to_parquet <- tempfile() 162 | columns <- c("Species","Sepal_Length") 163 | 164 | table_to_parquet( 165 | path_to_file = input_file, 166 | path_to_parquet = path_to_parquet, 167 | columns = columns, 168 | max_rows = 50 169 | ) 170 | 171 | expect_parquet( 172 | path_to_parquet, 173 | with_lines = 150, 174 | with_columns = columns 175 | ) 176 | }) 177 | -------------------------------------------------------------------------------- /R/write_parquet_by_chunk.R: -------------------------------------------------------------------------------- 1 | #' @name write_parquet_by_chunk 2 | #' 3 | #' @title read input by chunk on function and create dataset \cr 4 | #' 5 | #' @description Low level function that implements the logic to to read input file by chunk and write a 6 | #' dataset. \cr 7 | #' 8 | #' It will: 9 | #' 10 | #' \itemize{ 11 | #' \item{calculate the number of row by chunk if needed;} 12 | #' \item{loop over the input file by chunk;} 13 | #' \item{write each output files.} 14 | #' } 15 | #' 16 | #' @param read_method a method to read input files. This method take only three 17 | #' arguments 18 | #' 19 | #' `input` : some kind of data. Can be a 20 | #' `skip` : the number of row to skip 21 | #' `n_max` : the number of row to return 22 | #' 23 | #' This method will be called until it returns a dataframe/tibble with zero row. 24 | #' 25 | #' @param input that indicates the path to the input. It can be anything you 26 | #' want but more often a file's path or a data.frame. 
27 | #' @param path_to_parquet String that indicates the path to the directory where 28 | #' the output parquet file or dataset will be stored. 29 | #' @param max_memory Memory size (in Mb) in which data of one parquet file 30 | #' should roughly fit. 31 | #' @param max_rows Number of lines that defines the size of the chunk. This 32 | #' argument can not be filled in if max_memory is used. 33 | #' @param chunk_memory_sample_lines Number of lines to read to evaluate 34 | #' max_memory. Default to 10 000. 35 | #' @param compression compression algorithm. Default "snappy". 36 | #' @param compression_level compression level. Meaning depends on compression algorithm. 37 | #' @param ... Additional format-specific arguments, see 38 | #' \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 39 | #' 40 | #' @return a dataset as returned by arrow::open_dataset 41 | #' @export 42 | #' 43 | #' @examples 44 | #' 45 | #' # example with a dataframe 46 | #' 47 | #' # we create the function to loop over the data.frame 48 | #' 49 | #' read_method <- function(input, skip = 0L, n_max = Inf) { 50 | #' # if we are after the end of the input we return an empty data.frame 51 | #' if (skip+1 > nrow(input)) { return(data.frame()) } 52 | #' 53 | #' # return the n_max row from skip + 1 54 | #' input[(skip+1):(min(skip+n_max, nrow(input))),] 55 | #' } 56 | #' 57 | #' # we use it 58 | #' 59 | #' write_parquet_by_chunk( 60 | #' read_method = read_method, 61 | #' input = mtcars, 62 | #' path_to_parquet = tempfile(), 63 | #' max_rows = 10, 64 | #' ) 65 | #' 66 | #' 67 | #' # 68 | #' # Example with haven::read_sas 69 | #' # 70 | #' 71 | #' # we need to pass two arguments besides the 3 standard ones (input, skip and n_max). 72 | #' # We will use a closure : 73 | #' 74 | #' my_read_closure <- function(encoding, columns) { 75 | #' function(input, skip = 0L, n_max = Inf) { 76 | #' haven::read_sas(data_file = input, 77 | #' n_max = n_max, 78 | #' skip = skip, 79 | #' encoding = encoding, 80 | #' col_select = all_of(columns)) 81 | #' } 82 | #' } 83 | #' 84 | #' # we initialize the closure 85 | #' 86 | #' read_method <- my_read_closure(encoding = "WINDOWS-1252", columns = c("Species", "Petal_Width")) 87 | #' 88 | #' # we use it 89 | #' write_parquet_by_chunk( 90 | #' read_method = read_method, 91 | #' input = system.file("examples","iris.sas7bdat", package = "haven"), 92 | #' path_to_parquet = tempfile(), 93 | #' max_rows = 75, 94 | #' ) 95 | #' 96 | write_parquet_by_chunk <- function( 97 | read_method, 98 | input, 99 | path_to_parquet, 100 | max_rows = NULL, 101 | max_memory = NULL, 102 | chunk_memory_sample_lines = 10000, 103 | compression = "snappy", 104 | compression_level = NULL, 105 | ... 
106 | ) { 107 | if (missing(read_method)) { 108 | cli_abort("Be careful, read_method argument is mandatory", class = "parquetize_missing_argument") 109 | } 110 | 111 | if (!is.function(read_method)) { 112 | cli_abort("Be careful, read_method must be a function", class = "parquetize_bad_argument") 113 | } 114 | 115 | if (missing(input)) { 116 | cli_abort("Be careful, input argument is mandatory", class = "parquetize_missing_argument") 117 | } 118 | 119 | # max_rows and max_memory can not be used together so fails 120 | if (!is.null(max_rows) & !is.null(max_memory)) { 121 | cli_abort("Be careful, max_rows and max_memory can not be used together", class = "parquetize_bad_argument") 122 | } 123 | 124 | if (is.null(max_rows)) { 125 | data <- read_method(input, n_max = chunk_memory_sample_lines) 126 | max_rows <- get_lines_for_memory(data, 127 | max_memory = max_memory) 128 | } 129 | 130 | dir.create(path_to_parquet, showWarnings = FALSE, recursive = TRUE) 131 | 132 | parquetname <- tools::file_path_sans_ext(basename(path_to_parquet)) 133 | 134 | skip <- 0 135 | while (TRUE) { 136 | Sys.sleep(0.01) 137 | cli_progress_message("Reading data...") 138 | 139 | tbl <- read_method(input, skip = skip, n_max = max_rows) 140 | if (nrow(tbl) != 0) { 141 | Sys.sleep(0.01) 142 | parquetizename <- glue::glue("{parquetname}-{skip+1}-{skip+nrow(tbl)}.parquet") 143 | cli_progress_message("Writing {parquetizename}...") 144 | write_parquet(tbl, 145 | sink = file.path(path_to_parquet, 146 | parquetizename), 147 | compression = compression, 148 | compression_level = compression_level, 149 | ... 150 | ) 151 | } 152 | skip <- skip + nrow(tbl) 153 | if (nrow(tbl) < max_rows) { break } 154 | } 155 | 156 | Sys.sleep(0.01) 157 | cli_alert_success("\nData are available in parquet dataset under {path_to_parquet}/") 158 | 159 | invisible(arrow::open_dataset(path_to_parquet)) 160 | } 161 | -------------------------------------------------------------------------------- /R/dbi_to_parquet.R: -------------------------------------------------------------------------------- 1 | #' @name dbi_to_parquet 2 | #' 3 | #' @title Convert a SQL Query on a DBI connection to parquet format 4 | #' 5 | #' @description This function allows you to convert a SQL query from a DBI connection to parquet format.\cr 6 | #' 7 | #' It handles all DBI supported databases. 8 | #' 9 | #' Two conversion possibilities are offered : 10 | #' 11 | #'\itemize{ 12 | #' 13 | #' \item{Convert to a single parquet file. Argument `path_to_parquet` must then be used;} 14 | #' \item{Convert to a partitioned parquet file. Additional arguments `partition` and `partitioning` must then be used;} 15 | #' 16 | #' } 17 | #' 18 | #' Examples explain how to convert a query to a chunked dataset. 19 | #' 20 | #' @param conn A DBIConnection object, as returned by DBI::dbConnect 21 | #' @param sql_query a character string containing an SQL query (this argument is passed to DBI::dbSendQuery) 22 | #' @inheritParams table_to_parquet 23 | #' @param ... additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 24 | #' and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information. 
25 | #' @return A parquet file, invisibly 26 | #' 27 | #' @export 28 | #' 29 | #' @examples 30 | #' 31 | #' # Conversion from a sqlite dbi connection to a single parquet file : 32 | #' 33 | #' dbi_connection <- DBI::dbConnect(RSQLite::SQLite(), 34 | #' system.file("extdata","iris.sqlite",package = "parquetize")) 35 | #' 36 | #' # Reading iris table from local sqlite database 37 | #' # and conversion to one parquet file : 38 | #' 39 | #' dbi_to_parquet( 40 | #' conn = dbi_connection, 41 | #' sql_query = "SELECT * FROM iris", 42 | #' path_to_parquet = tempfile(fileext=".parquet"), 43 | #' ) 44 | #' 45 | #' # Reading iris table from local sqlite database by chunk (using 46 | #' # `max_memory` argument) and conversion to multiple parquet files 47 | #' 48 | #' dbi_to_parquet( 49 | #' conn = dbi_connection, 50 | #' sql_query = "SELECT * FROM iris", 51 | #' path_to_parquet = tempdir(), 52 | #' max_memory = 2 / 1024 53 | #' ) 54 | #' 55 | #' # Using chunk and partition together is not possible directly but easy to do : 56 | #' # Reading iris table from local sqlite database by chunk (using 57 | #' # `max_memory` argument) and conversion to arrow dataset partitioned by 58 | #' # species 59 | #' 60 | #' # get unique values of column "Species" from table "iris" 61 | #' partitions <- get_partitions(dbi_connection, table = "iris", column = "Species") 62 | #' 63 | #' # loop over those values 64 | #' for (species in partitions) { 65 | #' dbi_to_parquet( 66 | #' conn = dbi_connection, 67 | #' # use glue_sql to create the query filtering the partition 68 | #' sql_query = glue::glue_sql("SELECT * FROM iris where Species = {species}", 69 | #' .con = dbi_connection), 70 | #' # add the partition name in the output dir to respect parquet partition schema 71 | #' path_to_parquet = file.path(tempdir(), "iris", paste0("Species=", species)), 72 | #' max_memory = 2 / 1024, 73 | #' ) 74 | #' } 75 | #' 76 | #' # If you need a more complicated query to get your partitions, you can use 77 | #' # dbGetQuery directly : 78 | #' col_to_partition <- DBI::dbGetQuery(dbi_connection, "SELECT distinct(`Species`) FROM `iris`")[,1] 79 | #' 80 | dbi_to_parquet <- function( 81 | conn, 82 | sql_query, 83 | path_to_parquet, 84 | max_memory, 85 | max_rows, 86 | chunk_memory_sample_lines = 10000, 87 | partition = "no", 88 | compression = "snappy", 89 | compression_level = NULL, 90 | ... 
91 | ) { 92 | if (missing(conn)) { 93 | cli_abort("Be careful, the argument conn must be filled in", class = "parquetize_missing_argument") 94 | } 95 | 96 | # Check if sql_query is missing 97 | if (missing(sql_query)) { 98 | cli_abort("Be careful, the argument sql_query must be filled in", class = "parquetize_missing_argument") 99 | } 100 | 101 | # Check if path_to_parquet is missing 102 | if (missing(path_to_parquet)) { 103 | cli_abort("Be careful, the argument path_to_parquet must be filled in", class = "parquetize_missing_argument") 104 | } 105 | 106 | by_chunk <- !(missing(max_rows) & missing(max_memory)) 107 | 108 | if (by_chunk) { 109 | 110 | dir.create(path_to_parquet, showWarnings = FALSE, recursive = TRUE) 111 | 112 | if (missing(max_rows)) { 113 | # create the query and send it to the DB 114 | result <- dbSendQuery(conn, sql_query) 115 | # fetch a sample of result 116 | data <- dbFetch(result, n = chunk_memory_sample_lines) 117 | # close the query in DB 118 | dbClearResult(result) 119 | 120 | max_rows <- get_lines_for_memory(data, 121 | max_memory = max_memory) 122 | } 123 | 124 | result <- dbSendQuery(conn, sql_query) 125 | on.exit(dbClearResult(result)) 126 | 127 | skip <- 0 128 | while (!dbHasCompleted(result)) { 129 | Sys.sleep(0.01) 130 | cli_progress_message("Reading data...") 131 | data <- dbFetch(result, n = max_rows) 132 | 133 | parquetizename <- glue::glue("part-{skip+1}-{skip+nrow(data)}.parquet") 134 | Sys.sleep(0.01) 135 | cli_progress_message("Writing data in {parquetizename}...") 136 | write_parquet(data, 137 | sink = file.path(path_to_parquet, 138 | parquetizename), 139 | compression = compression, 140 | compression_level = compression_level, 141 | ... 142 | ) 143 | skip <- skip + nrow(data) 144 | } 145 | cli_alert_success("\nParquet dataset is available under {path_to_parquet}/") 146 | return(invisible(TRUE)) 147 | } 148 | 149 | result <- dbSendQuery(conn, sql_query) 150 | on.exit(dbClearResult(result)) 151 | 152 | Sys.sleep(0.01) 153 | cli_progress_message("Reading data...") 154 | output <- dbFetch(result) 155 | 156 | parquetfile <- write_parquet_at_once( 157 | output, 158 | path_to_parquet, 159 | partition, 160 | compression, 161 | compression_level, 162 | ...) 163 | 164 | return(invisible(parquetfile)) 165 | } 166 | -------------------------------------------------------------------------------- /R/csv_to_parquet.R: -------------------------------------------------------------------------------- 1 | #' @name csv_to_parquet 2 | #' @title Convert a csv or a txt file to parquet format 3 | #' 4 | #' @description This function allows you to convert a csv or a txt file to parquet format. \cr 5 | #' 6 | #' Two conversion possibilities are offered : 7 | #' 8 | #'\itemize{ 9 | #' 10 | #' \item{Convert to a single parquet file. Argument `path_to_parquet` must then be used;} 11 | #' \item{Convert to a partitioned parquet file. Additional arguments `partition` and `partitioning` must then be used;} 12 | #' 13 | #' } 14 | #' 15 | #' @param filename_in_zip name of the csv/txt file in the zip. Required if several csv/txt are included in the zip. 16 | #' @param url_to_csv DEPRECATED use path_to_file instead 17 | #' @param csv_as_a_zip DEPRECATED 18 | #' @inheritParams table_to_parquet 19 | #' @param read_delim_args list of arguments for `read_delim`. 20 | #' @param ... 
additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 21 | #' and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information. 22 | #' 23 | #' @note Be careful, if the zip size exceeds 4 GB, the function may truncate 24 | #' the data (because unzip() won't work reliably in this case - 25 | #' see \href{https://rdrr.io/r/utils/unzip.html}{here}). 26 | #' In this case, it's advised to unzip your csv/txt file by hand 27 | #' (for example with \href{https://www.7-zip.org/}{7-Zip}) 28 | #' then use the function with the argument `path_to_file`. 29 | #' 30 | #' @return A parquet file, invisibly 31 | #' 32 | #' @export 33 | #' 34 | #' @examples 35 | #' 36 | #' # Conversion from a local csv file to a single parquet file : 37 | #' 38 | #' csv_to_parquet( 39 | #' path_to_file = parquetize_example("region_2022.csv"), 40 | #' path_to_parquet = tempfile(fileext=".parquet") 41 | #' ) 42 | #' 43 | #' # Conversion from a local txt file to a single parquet file : 44 | #' 45 | #' csv_to_parquet( 46 | #' path_to_file = parquetize_example("region_2022.txt"), 47 | #' path_to_parquet = tempfile(fileext=".parquet") 48 | #' ) 49 | #' 50 | #' # Conversion from a local csv file to a single parquet file and select only 51 | #' # few columns : 52 | #' 53 | #' csv_to_parquet( 54 | #' path_to_file = parquetize_example("region_2022.csv"), 55 | #' path_to_parquet = tempfile(fileext = ".parquet"), 56 | #' columns = c("REG","LIBELLE") 57 | #' ) 58 | #' 59 | #' # Conversion from a local csv file to a partitioned parquet file : 60 | #' 61 | #' csv_to_parquet( 62 | #' path_to_file = parquetize_example("region_2022.csv"), 63 | #' path_to_parquet = tempfile(fileext = ".parquet"), 64 | #' partition = "yes", 65 | #' partitioning = c("REG") 66 | #' ) 67 | #' 68 | #' # Conversion from a URL and a zipped file (csv) : 69 | #' 70 | #' csv_to_parquet( 71 | #' path_to_file = "https://www.nomisweb.co.uk/output/census/2021/census2021-ts007.zip", 72 | #' filename_in_zip = "census2021-ts007-ctry.csv", 73 | #' path_to_parquet = tempfile(fileext = ".parquet") 74 | #' ) 75 | #' 76 | #' \dontrun{ 77 | #' # Conversion from a URL and a zipped file (txt) : 78 | #' 79 | #' csv_to_parquet( 80 | #' path_to_file = "https://sourceforge.net/projects/irisdss/files/latest/download", 81 | #' filename_in_zip = "IRIS TEST data.txt", 82 | #' path_to_parquet = tempfile(fileext=".parquet") 83 | #' ) 84 | #' 85 | #' # Conversion from a URL and a csv file with "gzip" compression : 86 | #' 87 | #' csv_to_parquet( 88 | #' path_to_file = 89 | #' "https://github.com/sidsriv/Introduction-to-Data-Science-in-python/raw/master/census.csv", 90 | #' path_to_parquet = tempfile(fileext = ".parquet"), 91 | #' compression = "gzip", 92 | #' compression_level = 5 93 | #' ) 94 | #' } 95 | csv_to_parquet <- function( 96 | path_to_file, 97 | url_to_csv = lifecycle::deprecated(), 98 | csv_as_a_zip = lifecycle::deprecated(), 99 | filename_in_zip, 100 | path_to_parquet, 101 | columns = "all", 102 | compression = "snappy", 103 | compression_level = NULL, 104 | partition = "no", 105 | encoding = "UTF-8", 106 | read_delim_args = list(), 107 | ... 108 | ) { 109 | if (!missing(url_to_csv)) { 110 | lifecycle::deprecate_warn( 111 | when = "0.5.5", 112 | what = "csv_to_parquet(url_to_csv)", 113 | details = "This argument is replaced by path_to_file." 
114 | ) 115 | } 116 | 117 | if (!missing(csv_as_a_zip)) { 118 | lifecycle::deprecate_warn( 119 | when = "0.5.5", 120 | what = "csv_to_parquet(csv_as_a_zip)", 121 | details = "This argument is no longer needed, parquetize detect zip file by extension." 122 | ) 123 | } 124 | 125 | # Check if at least one of the two arguments path_to_file or url_to_csv is set 126 | if (missing(path_to_file) & missing(url_to_csv)) { 127 | cli_abort("Be careful, you have to fill the path_to_file argument", class = "parquetize_missing_argument") 128 | } 129 | 130 | # Check if path_to_parquet is missing 131 | if (missing(path_to_parquet)) { 132 | cli_abort("Be careful, the argument path_to_parquet must be filled in", class = "parquetize_missing_argument") 133 | } 134 | 135 | # Check if columns argument is a character vector 136 | if (isFALSE(is.vector(columns) & is.character(columns))) { 137 | cli_abort(c("Be careful, the argument columns must be a character vector", 138 | 'You can use `all` or `c("col1", "col2"))`'), 139 | class = "parquetize_bad_argument") 140 | } 141 | 142 | if (missing(path_to_file)) { 143 | path_to_file <- url_to_csv 144 | } 145 | 146 | input_file <- download_extract(path_to_file, filename_in_zip) 147 | 148 | Sys.sleep(0.01) 149 | cli_progress_message("Reading data...") 150 | 151 | csv_output <- inject( 152 | read_delim( 153 | file = input_file, 154 | locale = locale(encoding = encoding), 155 | lazy = TRUE, 156 | show_col_types = FALSE, 157 | col_select = if (identical(columns,"all")) everything() else all_of(columns), 158 | !!!read_delim_args 159 | ) 160 | ) 161 | 162 | dataset <- write_parquet_at_once( 163 | csv_output, 164 | path_to_parquet, 165 | partition, 166 | compression, 167 | compression_level, 168 | ...) 169 | 170 | return(invisible(dataset)) 171 | } 172 | -------------------------------------------------------------------------------- /man/table_to_parquet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/table_to_parquet.R 3 | \name{table_to_parquet} 4 | \alias{table_to_parquet} 5 | \title{Convert an input file to parquet format} 6 | \usage{ 7 | table_to_parquet( 8 | path_to_file, 9 | path_to_parquet, 10 | max_memory = NULL, 11 | max_rows = NULL, 12 | chunk_size = lifecycle::deprecated(), 13 | chunk_memory_size = lifecycle::deprecated(), 14 | columns = "all", 15 | by_chunk = lifecycle::deprecated(), 16 | skip = 0, 17 | partition = "no", 18 | encoding = NULL, 19 | chunk_memory_sample_lines = 10000, 20 | compression = "snappy", 21 | compression_level = NULL, 22 | user_na = FALSE, 23 | ... 24 | ) 25 | } 26 | \arguments{ 27 | \item{path_to_file}{String that indicates the path to the input file (don't forget the extension).} 28 | 29 | \item{path_to_parquet}{String that indicates the path to the directory where the parquet files will be stored.} 30 | 31 | \item{max_memory}{Memory size (in Mb) in which data of one parquet file should roughly fit.} 32 | 33 | \item{max_rows}{Number of lines that defines the size of the chunk. 34 | This argument can not be filled in if max_memory is used.} 35 | 36 | \item{chunk_size}{DEPRECATED use max_rows} 37 | 38 | \item{chunk_memory_size}{DEPRECATED use max_memory} 39 | 40 | \item{columns}{Character vector of columns to select from the input file (by default, all columns are selected).} 41 | 42 | \item{by_chunk}{DEPRECATED use max_memory or max_rows instead} 43 | 44 | \item{skip}{By default 0. 
This argument must be filled in if \code{by_chunk} is TRUE. Number of lines to ignore when converting.} 45 | 46 | \item{partition}{String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file. 47 | If "yes", \code{"partitioning"} argument must be filled in. In this case, a folder will be created for each modality of the variable filled in \code{"partitioning"}. 48 | Be careful, this argument can not be "yes" if \code{max_memory} or \code{max_rows} arguments are not NULL.} 49 | 50 | \item{encoding}{String that indicates the character encoding for the input file.} 51 | 52 | \item{chunk_memory_sample_lines}{Number of lines to read to evaluate max_memory. Default to 10 000.} 53 | 54 | \item{compression}{compression algorithm. Default "snappy".} 55 | 56 | \item{compression_level}{compression level. Meaning depends on compression algorithm.} 57 | 58 | \item{user_na}{If \code{TRUE} variables with user defined missings will be read 59 | into \code{\link[haven:labelled_spss]{haven::labelled_spss()}} objects. If \code{FALSE}, the default, user-defined missings will be converted to \code{NA}.} 60 | 61 | \item{...}{Additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 62 | and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information.} 63 | } 64 | \value{ 65 | Parquet files, invisibly 66 | } 67 | \description{ 68 | This function allows you to convert an input file to parquet format. \cr 69 | 70 | It handles SAS, SPSS and Stata files in the same function. There is only one function to use for these 3 cases. 71 | For these 3 cases, the function guesses the data format using the extension of the input file (in the \code{path_to_file} argument). \cr 72 | 73 | Two conversion possibilities are offered : 74 | 75 | \itemize{ 76 | 77 | \item{Convert to a single parquet file. Argument \code{path_to_parquet} must then be used;} 78 | \item{Convert to a partitioned parquet file. Additional arguments \code{partition} and \code{partitioning} must then be used;} 79 | 80 | } 81 | 82 | To avoid overloading R's RAM, the conversion can be done by chunk. One of arguments \code{max_memory} or \code{max_rows} must then be used. 83 | This is very useful for huge tables and for computers with little RAM because the conversion is then done 84 | with less memory consumption. For more information, see \href{https://ddotta.github.io/parquetize/articles/aa-conversions.html}{here}. 
85 | } 86 | \examples{ 87 | # Conversion from a SAS file to a single parquet file : 88 | 89 | table_to_parquet( 90 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 91 | path_to_parquet = tempfile(fileext = ".parquet") 92 | ) 93 | 94 | # Conversion from a SPSS file to a single parquet file : 95 | 96 | table_to_parquet( 97 | path_to_file = system.file("examples","iris.sav", package = "haven"), 98 | path_to_parquet = tempfile(fileext = ".parquet"), 99 | ) 100 | # Conversion from a Stata file to a single parquet file without progress bar : 101 | 102 | table_to_parquet( 103 | path_to_file = system.file("examples","iris.dta", package = "haven"), 104 | path_to_parquet = tempfile(fileext = ".parquet") 105 | ) 106 | 107 | # Reading SPSS file by chunk (using `max_rows` argument) 108 | # and conversion to multiple parquet files : 109 | 110 | table_to_parquet( 111 | path_to_file = system.file("examples","iris.sav", package = "haven"), 112 | path_to_parquet = tempfile(), 113 | max_rows = 50, 114 | ) 115 | 116 | # Reading SPSS file by chunk (using `max_memory` argument) 117 | # and conversion to multiple parquet files of 5 Kb when loaded (5 Mb / 1024) 118 | # (in real files, you should use bigger value that fit in memory like 3000 119 | # or 4000) : 120 | 121 | table_to_parquet( 122 | path_to_file = system.file("examples","iris.sav", package = "haven"), 123 | path_to_parquet = tempfile(), 124 | max_memory = 5 / 1024 125 | ) 126 | 127 | # Reading SAS file by chunk of 50 lines with encoding 128 | # and conversion to multiple files : 129 | 130 | table_to_parquet( 131 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 132 | path_to_parquet = tempfile(), 133 | max_rows = 50, 134 | encoding = "utf-8" 135 | ) 136 | 137 | # Conversion from a SAS file to a single parquet file and select only 138 | # few columns : 139 | 140 | table_to_parquet( 141 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 142 | path_to_parquet = tempfile(fileext = ".parquet"), 143 | columns = c("Species","Petal_Length") 144 | ) 145 | 146 | # Conversion from a SAS file to a partitioned parquet file : 147 | 148 | table_to_parquet( 149 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 150 | path_to_parquet = tempfile(), 151 | partition = "yes", 152 | partitioning = c("Species") # vector use as partition key 153 | ) 154 | 155 | # Reading SAS file by chunk of 50 lines 156 | # and conversion to multiple files with zstd, compression level 10 157 | 158 | if (isTRUE(arrow::arrow_info()$capabilities[['zstd']])) { 159 | table_to_parquet( 160 | path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 161 | path_to_parquet = tempfile(), 162 | max_rows = 50, 163 | compression = "zstd", 164 | compression_level = 10 165 | ) 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # parquetize (WIP) 2 | 3 | This release includes : 4 | 5 | - `{parquetize}` now has a new `get_parquet_info` function for retrieving metadata from parquet files. This function is particularly useful for row group size (added by @nbc). 
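  A minimal sketch of how it can be used (assuming `get_parquet_info()` simply takes the path to a parquet file or dataset and returns its metadata, including row group sizes):

```{r}
# hypothetical usage: inspect the metadata of the example parquet file
# shipped with the package (row groups, number of rows, etc.)
get_parquet_info(parquetize_example("iris.parquet"))
```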
6 | 7 | # parquetize 0.5.7 8 | 9 | This release includes: 10 | 11 | - bugfix by @leungi: remove single quotes in a SQL statement that generates incorrect SQL syntax for Microsoft SQL Server connections #45 12 | - `{parquetize}` now requires a minimum version (2.4.0) of the `{haven}` dependency package to ensure that conversions are performed correctly from SAS files compressed in BINARY mode #46 13 | - `csv_to_parquet` now has a `read_delim_args` argument, allowing passing of arguments to `read_delim` (added by @nikostr). 14 | - `table_to_parquet` can now convert files with uppercase extensions (.SAS7BDAT, .SAV, .DTA) 15 | 16 | 17 | # parquetize 0.5.6.1 18 | 19 | This release includes: 20 | 21 | #### fst_to_parquet function 22 | 23 | - a new [fst_to_parquet](https://ddotta.github.io/parquetize/reference/fst_to_parquet.html) function that converts an fst file to parquet format. 24 | 25 | #### Other 26 | 27 | - Rely more on `@inheritParams` to simplify documentation of function arguments #38. This leads to some renaming of arguments (e.g. `path_to_csv` -> `path_to_file`...) 28 | - Arguments `compression` and `compression_level` are now passed to the write_parquet_at_once and write_parquet_by_chunk functions and are now available in the main conversion functions of `parquetize` #36 29 | - Group `@importFrom` tags in a single file to facilitate their maintenance #37 30 | - work on download_extract tests #43 31 | 32 | # parquetize 0.5.6 33 | 34 | This release includes: 35 | 36 | #### Possibility to use an RDBMS as a source 37 | 38 | You can convert to parquet any query you want on any DBI-compatible RDBMS: 39 | 40 | ```{r} 41 | dbi_connection <- DBI::dbConnect(RSQLite::SQLite(), 42 | system.file("extdata","iris.sqlite",package = "parquetize")) 43 | 44 | # Reading iris table from local sqlite database 45 | # and conversion to one parquet file: 46 | dbi_to_parquet( 47 | conn = dbi_connection, 48 | sql_query = "SELECT * FROM iris", 49 | path_to_parquet = tempdir(), 50 | parquetname = "iris" 51 | ) 52 | ``` 53 | 54 | You can find more information in the 55 | [`dbi_to_parquet`](https://ddotta.github.io/parquetize/reference/dbi_to_parquet.html) documentation. 56 | 57 | #### check_parquet function 58 | 59 | - a new [check_parquet](https://ddotta.github.io/parquetize/reference/check_parquet.html) function that checks if a dataset/file is valid and returns columns and arrow types 60 | 61 | #### Deprecations 62 | 63 | Two arguments are deprecated to avoid confusion with arrow concepts and to keep consistency: 64 | 65 | * `chunk_size` is replaced by `max_rows` (chunk size is an arrow concept). 66 | * `chunk_memory_size` is replaced by `max_memory` for consistency. 67 | 68 | #### Other 69 | 70 | - refactoring: extract the logic to write parquet files by chunk or at once into write_parquet_by_chunk and write_parquet_at_once 71 | - a big test refactoring: all _to_parquet output files are formally validated (readable as parquet, number of lines, partitions, number of files). 72 | - use cli_abort instead of cli_alert_danger with stop("") everywhere 73 | - some minor changes 74 | - bugfix: table_to_parquet did not select columns as expected 75 | - bugfix: skip_if_offline tests with download 76 | 77 | # parquetize 0.5.5 78 | 79 | This release includes: 80 | 81 | #### A very important new contributor to `parquetize`! 82 | 83 | Due to these numerous contributions, @nbc is now officially part of the project authors!
84 | 85 | #### Deprecation of three arguments 86 | 87 | After a big refactoring, three arguments are deprecated: 88 | 89 | * `by_chunk`: `table_to_parquet` will automatically chunk if you use one of `chunk_memory_size` or `chunk_size`. 90 | * `csv_as_a_zip`: `csv_to_parquet` will detect if the file is a zip from its extension. 91 | * `url_to_csv`: use `path_to_csv` instead, `csv_to_parquet` will detect if the file is remote from the file path. 92 | 93 | They will raise a deprecation warning for the moment. 94 | 95 | #### Chunking by memory size 96 | 97 | The possibility to chunk parquet output by memory size with `table_to_parquet()`: 98 | `table_to_parquet()` takes a `chunk_memory_size` argument to convert an input 99 | file into parquet files of roughly `chunk_memory_size` Mb when data are 100 | loaded in memory. 101 | 102 | Argument `by_chunk` is deprecated (see above). 103 | 104 | Example of use of the argument `chunk_memory_size`: 105 | 106 | ```{r} 107 | table_to_parquet( 108 | path_to_table = system.file("examples","iris.sas7bdat", package = "haven"), 109 | path_to_parquet = tempdir(), 110 | chunk_memory_size = 5000, # this will create files of around 5 Gb when loaded in memory 111 | ) 112 | ``` 113 | 114 | #### Passing arguments like compression to `write_parquet` when chunking 115 | 116 | Users can now pass arguments to `write_parquet()` when 117 | chunking (through the ellipsis). This can be used for example to pass 118 | `compression` and `compression_level`. 119 | 120 | Example: 121 | 122 | ```{r} 123 | table_to_parquet( 124 | path_to_table = system.file("examples","iris.sas7bdat", package = "haven"), 125 | path_to_parquet = tempdir(), 126 | compression = "zstd", 127 | compression_level = 10, 128 | chunk_memory_size = 5000 129 | ) 130 | ``` 131 | 132 | #### A new function `download_extract` 133 | 134 | This function is added to ... download and unzip a file if needed. 135 | 136 | ```{r} 137 | file_path <- download_extract( 138 | "https://www.nomisweb.co.uk/output/census/2021/census2021-ts007.zip", 139 | filename_in_zip = "census2021-ts007-ctry.csv" 140 | ) 141 | csv_to_parquet( 142 | file_path, 143 | path_to_parquet = tempdir() 144 | ) 145 | ``` 146 | 147 | #### Other 148 | 149 | Under the hood, this release has hardened tests. 150 | 151 | # parquetize 0.5.4 152 | 153 | This release fixes an error when converting a SAS file by chunk. 154 | 155 | # parquetize 0.5.3 156 | 157 | This release includes: 158 | 159 | - Added column selection to the `table_to_parquet()` and `csv_to_parquet()` functions #20 160 | - The example files in parquet format of the iris table have been migrated to the `inst/extdata` directory. 161 | 162 | # parquetize 0.5.2 163 | 164 | This release includes: 165 | 166 | - The behaviour of the `table_to_parquet()` function has been fixed when the argument `by_chunk` is TRUE. 167 | 168 | # parquetize 0.5.1 169 | 170 | This release removes the `duckdb_to_parquet()` function on the advice of Brian Ripley from CRAN. 171 | Indeed, the storage format of DuckDB is not yet stable. The storage will be stabilized when version 1.0 is released. 172 | 173 | # parquetize 0.5.0 174 | 175 | This release includes corrections for CRAN submission. 176 | 177 | # parquetize 0.4.0 178 | 179 | **This release includes an important feature:** 180 | 181 | The `table_to_parquet()` function can now convert tables to parquet format with less memory consumption. 182 | Useful for huge tables and for computers with little RAM. (#15) 183 | A vignette has been written about it.
See [here](https://ddotta.github.io/parquetize/articles/aa-conversions.html). 184 | 185 | * Removal of the `nb_rows` argument in the `table_to_parquet()` function 186 | * Replaced by new arguments `by_chunk`, `chunk_size` and `skip` (see documentation) 187 | * Progress bars are now managed with the [{cli} package](https://github.com/r-lib/cli) 188 | 189 | # parquetize 0.3.0 190 | 191 | * Added `duckdb_to_parquet()` function to convert duckdb files to parquet format. 192 | * Added `sqlite_to_parquet()` function to convert sqlite files to parquet format. 193 | 194 | # parquetize 0.2.0 195 | 196 | * Added `rds_to_parquet()` function to convert rds files to parquet format. 197 | * Added `json_to_parquet()` function to convert json and ndjson files to parquet format. 198 | * Added the possibility to convert a csv file to a partitioned parquet file. 199 | * Improved code coverage (#9) 200 | * Check if `path_to_parquet` exists in functions `csv_to_parquet()` or `table_to_parquet()` (@py-b) 201 | 202 | 203 | # parquetize 0.1.0 204 | 205 | * Added `table_to_parquet()` function to convert SAS, SPSS and Stata files to parquet format. 206 | * Added `csv_to_parquet()` function to convert csv files to parquet format. 207 | * Added `parquetize_example()` function to get the path to package data examples. 208 | * Added a `NEWS.md` file to track changes to the package. 209 | -------------------------------------------------------------------------------- /R/table_to_parquet.R: -------------------------------------------------------------------------------- 1 | #' @name table_to_parquet 2 | #' 3 | #' @title Convert an input file to parquet format 4 | #' 5 | #' @description This function converts an input file to parquet format. \cr 6 | #' 7 | #' It handles SAS, SPSS and Stata files in the same function. There is only one function to use for these 3 cases. 8 | #' For these 3 cases, the function guesses the data format using the extension of the input file (in the `path_to_file` argument). \cr 9 | #' 10 | #' Two conversion possibilities are offered: 11 | #' 12 | #'\itemize{ 13 | #' 14 | #' \item{Convert to a single parquet file. Argument `path_to_parquet` must then be used;} 15 | #' \item{Convert to a partitioned parquet file. Additional arguments `partition` and `partitioning` must then be used;} 16 | #' 17 | #' } 18 | #' 19 | #' To avoid overloading R's RAM, the conversion can be done by chunk. One of the arguments `max_memory` or `max_rows` must then be used. 20 | #' This is very useful for huge tables and for computers with little RAM because the conversion is then done 21 | #' with less memory consumption. For more information, see \href{https://ddotta.github.io/parquetize/articles/aa-conversions.html}{here}. 22 | #' 23 | #' @param path_to_file String that indicates the path to the input file (don't forget the extension). 24 | #' @param path_to_parquet String that indicates the path to the directory where the parquet files will be stored. 25 | #' @param columns Character vector of columns to select from the input file (by default, all columns are selected). 26 | #' @param max_memory Memory size (in Mb) in which data of one parquet file should roughly fit. 27 | #' @param max_rows Number of lines that defines the size of the chunk. 28 | #' This argument cannot be filled in if max_memory is used. 29 | #' @param chunk_memory_sample_lines Number of lines to read to evaluate max_memory. Defaults to 10,000.
30 | #' @param by_chunk DEPRECATED use max_memory or max_rows instead 31 | #' @param chunk_size DEPRECATED use max_rows 32 | #' @param chunk_memory_size DEPRECATED use max_memory 33 | #' @param skip By default 0. This argument must be filled in if `by_chunk` is TRUE. Number of lines to ignore when converting. 34 | #' @param partition String ("yes" or "no" - by default) that indicates whether you want to create a partitioned parquet file. 35 | #' If "yes", `"partitioning"` argument must be filled in. In this case, a folder will be created for each modality of the variable filled in `"partitioning"`. 36 | #' Be careful, this argument cannot be "yes" if `max_memory` or `max_rows` arguments are not NULL. 37 | #' @param encoding String that indicates the character encoding for the input file. 38 | #' @param compression Compression algorithm. Defaults to "snappy". 39 | #' @param compression_level Compression level. Its meaning depends on the compression algorithm. 40 | #' @param user_na If `TRUE`, variables with user-defined missing values will be read 41 | #' into [haven::labelled_spss()] objects. If `FALSE`, the default, user-defined missings will be converted to `NA`. 42 | #' @param ... Additional format-specific arguments, see \href{https://arrow.apache.org/docs/r/reference/write_parquet.html}{arrow::write_parquet()} 43 | #' and \href{https://arrow.apache.org/docs/r/reference/write_dataset.html}{arrow::write_dataset()} for more information. 44 | #' 45 | #' @return Parquet files, invisibly 46 | #' 47 | #' @export 48 | #' 49 | #' @examples 50 | #' # Conversion from a SAS file to a single parquet file: 51 | #' 52 | #' table_to_parquet( 53 | #' path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 54 | #' path_to_parquet = tempfile(fileext = ".parquet") 55 | #' ) 56 | #' 57 | #' # Conversion from an SPSS file to a single parquet file: 58 | #' 59 | #' table_to_parquet( 60 | #' path_to_file = system.file("examples","iris.sav", package = "haven"), 61 | #' path_to_parquet = tempfile(fileext = ".parquet"), 62 | #' ) 63 | #' # Conversion from a Stata file to a single parquet file without progress bar: 64 | #' 65 | #' table_to_parquet( 66 | #' path_to_file = system.file("examples","iris.dta", package = "haven"), 67 | #' path_to_parquet = tempfile(fileext = ".parquet") 68 | #' ) 69 | #' 70 | #' # Reading SPSS file by chunk (using `max_rows` argument) 71 | #' # and conversion to multiple parquet files: 72 | #' 73 | #' table_to_parquet( 74 | #' path_to_file = system.file("examples","iris.sav", package = "haven"), 75 | #' path_to_parquet = tempfile(), 76 | #' max_rows = 50, 77 | #' ) 78 | #' 79 | #' # Reading SPSS file by chunk (using `max_memory` argument) 80 | #' # and conversion to multiple parquet files of 5 Kb when loaded (5 Mb / 1024) 81 | #' # (in real files, you should use a bigger value that fits in memory, like 3000 82 | #' # or 4000): 83 | #' 84 | #' table_to_parquet( 85 | #' path_to_file = system.file("examples","iris.sav", package = "haven"), 86 | #' path_to_parquet = tempfile(), 87 | #' max_memory = 5 / 1024 88 | #' ) 89 | #' 90 | #' # Reading SAS file by chunk of 50 lines with encoding 91 | #' # and conversion to multiple files: 92 | #' 93 | #' table_to_parquet( 94 | #' path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 95 | #' path_to_parquet = tempfile(), 96 | #' max_rows = 50, 97 | #' encoding = "utf-8" 98 | #' ) 99 | #' 100 | #' # Conversion from a SAS file to a single parquet file, selecting only 101 | #' # a few columns: 102 | #' 103 | #' table_to_parquet( 104
| #' path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 105 | #' path_to_parquet = tempfile(fileext = ".parquet"), 106 | #' columns = c("Species","Petal_Length") 107 | #' ) 108 | #' 109 | #' # Conversion from a SAS file to a partitioned parquet file: 110 | #' 111 | #' table_to_parquet( 112 | #' path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 113 | #' path_to_parquet = tempfile(), 114 | #' partition = "yes", 115 | #' partitioning = c("Species") # vector used as partition key 116 | #' ) 117 | #' 118 | #' # Reading SAS file by chunk of 50 lines 119 | #' # and conversion to multiple files with zstd, compression level 10 120 | #' 121 | #' if (isTRUE(arrow::arrow_info()$capabilities[['zstd']])) { 122 | #' table_to_parquet( 123 | #' path_to_file = system.file("examples","iris.sas7bdat", package = "haven"), 124 | #' path_to_parquet = tempfile(), 125 | #' max_rows = 50, 126 | #' compression = "zstd", 127 | #' compression_level = 10 128 | #' ) 129 | #' } 130 | 131 | table_to_parquet <- function( 132 | path_to_file, 133 | path_to_parquet, 134 | max_memory = NULL, 135 | max_rows = NULL, 136 | chunk_size = lifecycle::deprecated(), 137 | chunk_memory_size = lifecycle::deprecated(), 138 | columns = "all", 139 | by_chunk = lifecycle::deprecated(), 140 | skip = 0, 141 | partition = "no", 142 | encoding = NULL, 143 | chunk_memory_sample_lines = 10000, 144 | compression = "snappy", 145 | compression_level = NULL, 146 | user_na = FALSE, 147 | ... 148 | ) { 149 | if (!missing(by_chunk)) { 150 | lifecycle::deprecate_warn( 151 | when = "0.5.5", 152 | what = "table_to_parquet(by_chunk)", 153 | details = "This argument is no longer needed, table_to_parquet will chunk if one of max_memory or max_rows is set" 154 | ) 155 | } 156 | 157 | if (!missing(chunk_size)) { 158 | lifecycle::deprecate_warn( 159 | when = "0.5.5", 160 | what = "table_to_parquet(chunk_size)", 161 | details = "This argument is deprecated, use max_rows." 162 | ) 163 | max_rows <- chunk_size 164 | } 165 | 166 | if (!missing(chunk_memory_size)) { 167 | lifecycle::deprecate_warn( 168 | when = "0.5.5", 169 | what = "table_to_parquet(chunk_memory_size)", 170 | details = "This argument is deprecated, use max_memory."
171 | ) 172 | max_memory <- chunk_memory_size 173 | } 174 | 175 | # Check if path_to_file is missing 176 | if (missing(path_to_file)) { 177 | cli_abort("Be careful, the argument path_to_file must be filled in", class = "parquetize_missing_argument") 178 | } 179 | 180 | # Check if path_to_parquet is missing 181 | if (missing(path_to_parquet)) { 182 | cli_abort("Be careful, the argument path_to_parquet must be filled in", class = "parquetize_missing_argument") 183 | } 184 | 185 | # Check if columns argument is a character vector 186 | if (isFALSE(is.vector(columns) & is.character(columns))) { 187 | cli_abort(c("Be careful, the argument columns must be a character vector", 188 | 'You can use `all` or `c("col1", "col2")`'), 189 | class = "parquetize_bad_type") 190 | } 191 | 192 | by_chunk <- !(missing(max_rows) & missing(max_memory)) 193 | 194 | # Check if skip argument is correctly filled in when the conversion is done by chunk 195 | if (by_chunk==TRUE & skip<0) { 196 | cli_abort("Be careful, if you want to do a conversion by chunk then the argument skip must be greater than or equal to 0", 197 | class = "parquetize_bad_argument") 198 | } 199 | 200 | # If by_chunk argument is TRUE and partition argument is equal to "yes" it fails 201 | if (by_chunk==TRUE & partition == "yes") { 202 | cli_abort("Be careful, when max_rows or max_memory are used, partition and partitioning cannot be used", class = "parquetize_bad_argument") 203 | } 204 | 205 | 206 | # Closure that builds the function used to read the input data 207 | closure_read_method <- function(encoding, columns, user_na) { 208 | method <- get_haven_read_function_for_file(path_to_file) 209 | function(path, n_max = Inf, skip = 0L) { 210 | 211 | ext <- tools::file_ext(path_to_file) 212 | 213 | if (ext != "sav") { 214 | method(path, 215 | n_max = n_max, 216 | skip = skip, 217 | encoding = encoding, 218 | col_select = if (identical(columns,"all")) everything() else all_of(columns)) 219 | 220 | } else if (ext == "sav") { 221 | method(path, 222 | n_max = n_max, 223 | skip = skip, 224 | encoding = encoding, 225 | col_select = if (identical(columns,"all")) everything() else all_of(columns), 226 | user_na = user_na) 227 | } 228 | } 229 | } 230 | 231 | read_method <- closure_read_method(encoding = encoding, columns = columns, user_na = user_na) 232 | 233 | if (by_chunk) { 234 | ds <- write_parquet_by_chunk( 235 | read_method = read_method, 236 | input = path_to_file, 237 | path_to_parquet = path_to_parquet, 238 | max_rows = max_rows, 239 | max_memory = max_memory, 240 | chunk_memory_sample_lines = chunk_memory_sample_lines, 241 | ... 242 | ) 243 | return(invisible(ds)) 244 | } 245 | 246 | Sys.sleep(0.01) 247 | cli_progress_message("Reading data...") 248 | table_output <- read_method(path_to_file) 249 | 250 | parquetfile <- write_parquet_at_once( 251 | table_output, 252 | path_to_parquet, 253 | partition, 254 | compression, 255 | compression_level, 256 | ...)
257 | 258 | cli_alert_success("\nThe {path_to_file} file is available in parquet format under {path_to_parquet}") 259 | 260 | return(invisible(parquetfile)) 261 | } 262 | -------------------------------------------------------------------------------- /inst/extdata/iris.ndjson: -------------------------------------------------------------------------------- 1 | {"sepalLength": 5.1, "sepalWidth": 3.5, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"} 2 | {"sepalLength": 4.9, "sepalWidth": 3.0, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"} 3 | {"sepalLength": 4.7, "sepalWidth": 3.2, "petalLength": 1.3, "petalWidth": 0.2, "species": "setosa"} 4 | {"sepalLength": 4.6, "sepalWidth": 3.1, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"} 5 | {"sepalLength": 5.0, "sepalWidth": 3.6, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"} 6 | {"sepalLength": 5.4, "sepalWidth": 3.9, "petalLength": 1.7, "petalWidth": 0.4, "species": "setosa"} 7 | {"sepalLength": 4.6, "sepalWidth": 3.4, "petalLength": 1.4, "petalWidth": 0.3, "species": "setosa"} 8 | {"sepalLength": 5.0, "sepalWidth": 3.4, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"} 9 | {"sepalLength": 4.4, "sepalWidth": 2.9, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"} 10 | {"sepalLength": 4.9, "sepalWidth": 3.1, "petalLength": 1.5, "petalWidth": 0.1, "species": "setosa"} 11 | {"sepalLength": 5.4, "sepalWidth": 3.7, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"} 12 | {"sepalLength": 4.8, "sepalWidth": 3.4, "petalLength": 1.6, "petalWidth": 0.2, "species": "setosa"} 13 | {"sepalLength": 4.8, "sepalWidth": 3.0, "petalLength": 1.4, "petalWidth": 0.1, "species": "setosa"} 14 | {"sepalLength": 4.3, "sepalWidth": 3.0, "petalLength": 1.1, "petalWidth": 0.1, "species": "setosa"} 15 | {"sepalLength": 5.8, "sepalWidth": 4.0, "petalLength": 1.2, "petalWidth": 0.2, "species": "setosa"} 16 | {"sepalLength": 5.7, "sepalWidth": 4.4, "petalLength": 1.5, "petalWidth": 0.4, "species": "setosa"} 17 | {"sepalLength": 5.4, "sepalWidth": 3.9, "petalLength": 1.3, "petalWidth": 0.4, "species": "setosa"} 18 | {"sepalLength": 5.1, "sepalWidth": 3.5, "petalLength": 1.4, "petalWidth": 0.3, "species": "setosa"} 19 | {"sepalLength": 5.7, "sepalWidth": 3.8, "petalLength": 1.7, "petalWidth": 0.3, "species": "setosa"} 20 | {"sepalLength": 5.1, "sepalWidth": 3.8, "petalLength": 1.5, "petalWidth": 0.3, "species": "setosa"} 21 | {"sepalLength": 5.4, "sepalWidth": 3.4, "petalLength": 1.7, "petalWidth": 0.2, "species": "setosa"} 22 | {"sepalLength": 5.1, "sepalWidth": 3.7, "petalLength": 1.5, "petalWidth": 0.4, "species": "setosa"} 23 | {"sepalLength": 4.6, "sepalWidth": 3.6, "petalLength": 1.0, "petalWidth": 0.2, "species": "setosa"} 24 | {"sepalLength": 5.1, "sepalWidth": 3.3, "petalLength": 1.7, "petalWidth": 0.5, "species": "setosa"} 25 | {"sepalLength": 4.8, "sepalWidth": 3.4, "petalLength": 1.9, "petalWidth": 0.2, "species": "setosa"} 26 | {"sepalLength": 5.0, "sepalWidth": 3.0, "petalLength": 1.6, "petalWidth": 0.2, "species": "setosa"} 27 | {"sepalLength": 5.0, "sepalWidth": 3.4, "petalLength": 1.6, "petalWidth": 0.4, "species": "setosa"} 28 | {"sepalLength": 5.2, "sepalWidth": 3.5, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"} 29 | {"sepalLength": 5.2, "sepalWidth": 3.4, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"} 30 | {"sepalLength": 4.7, "sepalWidth": 3.2, "petalLength": 1.6, "petalWidth": 0.2, "species": "setosa"} 31 | {"sepalLength": 4.8, "sepalWidth": 3.1, 
"petalLength": 1.6, "petalWidth": 0.2, "species": "setosa"} 32 | {"sepalLength": 5.4, "sepalWidth": 3.4, "petalLength": 1.5, "petalWidth": 0.4, "species": "setosa"} 33 | {"sepalLength": 5.2, "sepalWidth": 4.1, "petalLength": 1.5, "petalWidth": 0.1, "species": "setosa"} 34 | {"sepalLength": 5.5, "sepalWidth": 4.2, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"} 35 | {"sepalLength": 4.9, "sepalWidth": 3.1, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"} 36 | {"sepalLength": 5.0, "sepalWidth": 3.2, "petalLength": 1.2, "petalWidth": 0.2, "species": "setosa"} 37 | {"sepalLength": 5.5, "sepalWidth": 3.5, "petalLength": 1.3, "petalWidth": 0.2, "species": "setosa"} 38 | {"sepalLength": 4.9, "sepalWidth": 3.6, "petalLength": 1.4, "petalWidth": 0.1, "species": "setosa"} 39 | {"sepalLength": 4.4, "sepalWidth": 3.0, "petalLength": 1.3, "petalWidth": 0.2, "species": "setosa"} 40 | {"sepalLength": 5.1, "sepalWidth": 3.4, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"} 41 | {"sepalLength": 5.0, "sepalWidth": 3.5, "petalLength": 1.3, "petalWidth": 0.3, "species": "setosa"} 42 | {"sepalLength": 4.5, "sepalWidth": 2.3, "petalLength": 1.3, "petalWidth": 0.3, "species": "setosa"} 43 | {"sepalLength": 4.4, "sepalWidth": 3.2, "petalLength": 1.3, "petalWidth": 0.2, "species": "setosa"} 44 | {"sepalLength": 5.0, "sepalWidth": 3.5, "petalLength": 1.6, "petalWidth": 0.6, "species": "setosa"} 45 | {"sepalLength": 5.1, "sepalWidth": 3.8, "petalLength": 1.9, "petalWidth": 0.4, "species": "setosa"} 46 | {"sepalLength": 4.8, "sepalWidth": 3.0, "petalLength": 1.4, "petalWidth": 0.3, "species": "setosa"} 47 | {"sepalLength": 5.1, "sepalWidth": 3.8, "petalLength": 1.6, "petalWidth": 0.2, "species": "setosa"} 48 | {"sepalLength": 4.6, "sepalWidth": 3.2, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"} 49 | {"sepalLength": 5.3, "sepalWidth": 3.7, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"} 50 | {"sepalLength": 5.0, "sepalWidth": 3.3, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"} 51 | {"sepalLength": 7.0, "sepalWidth": 3.2, "petalLength": 4.7, "petalWidth": 1.4, "species": "versicolor"} 52 | {"sepalLength": 6.4, "sepalWidth": 3.2, "petalLength": 4.5, "petalWidth": 1.5, "species": "versicolor"} 53 | {"sepalLength": 6.9, "sepalWidth": 3.1, "petalLength": 4.9, "petalWidth": 1.5, "species": "versicolor"} 54 | {"sepalLength": 5.5, "sepalWidth": 2.3, "petalLength": 4.0, "petalWidth": 1.3, "species": "versicolor"} 55 | {"sepalLength": 6.5, "sepalWidth": 2.8, "petalLength": 4.6, "petalWidth": 1.5, "species": "versicolor"} 56 | {"sepalLength": 5.7, "sepalWidth": 2.8, "petalLength": 4.5, "petalWidth": 1.3, "species": "versicolor"} 57 | {"sepalLength": 6.3, "sepalWidth": 3.3, "petalLength": 4.7, "petalWidth": 1.6, "species": "versicolor"} 58 | {"sepalLength": 4.9, "sepalWidth": 2.4, "petalLength": 3.3, "petalWidth": 1.0, "species": "versicolor"} 59 | {"sepalLength": 6.6, "sepalWidth": 2.9, "petalLength": 4.6, "petalWidth": 1.3, "species": "versicolor"} 60 | {"sepalLength": 5.2, "sepalWidth": 2.7, "petalLength": 3.9, "petalWidth": 1.4, "species": "versicolor"} 61 | {"sepalLength": 5.0, "sepalWidth": 2.0, "petalLength": 3.5, "petalWidth": 1.0, "species": "versicolor"} 62 | {"sepalLength": 5.9, "sepalWidth": 3.0, "petalLength": 4.2, "petalWidth": 1.5, "species": "versicolor"} 63 | {"sepalLength": 6.0, "sepalWidth": 2.2, "petalLength": 4.0, "petalWidth": 1.0, "species": "versicolor"} 64 | {"sepalLength": 6.1, "sepalWidth": 2.9, "petalLength": 4.7, "petalWidth": 
1.4, "species": "versicolor"} 65 | {"sepalLength": 5.6, "sepalWidth": 2.9, "petalLength": 3.6, "petalWidth": 1.3, "species": "versicolor"} 66 | {"sepalLength": 6.7, "sepalWidth": 3.1, "petalLength": 4.4, "petalWidth": 1.4, "species": "versicolor"} 67 | {"sepalLength": 5.6, "sepalWidth": 3.0, "petalLength": 4.5, "petalWidth": 1.5, "species": "versicolor"} 68 | {"sepalLength": 5.8, "sepalWidth": 2.7, "petalLength": 4.1, "petalWidth": 1.0, "species": "versicolor"} 69 | {"sepalLength": 6.2, "sepalWidth": 2.2, "petalLength": 4.5, "petalWidth": 1.5, "species": "versicolor"} 70 | {"sepalLength": 5.6, "sepalWidth": 2.5, "petalLength": 3.9, "petalWidth": 1.1, "species": "versicolor"} 71 | {"sepalLength": 5.9, "sepalWidth": 3.2, "petalLength": 4.8, "petalWidth": 1.8, "species": "versicolor"} 72 | {"sepalLength": 6.1, "sepalWidth": 2.8, "petalLength": 4.0, "petalWidth": 1.3, "species": "versicolor"} 73 | {"sepalLength": 6.3, "sepalWidth": 2.5, "petalLength": 4.9, "petalWidth": 1.5, "species": "versicolor"} 74 | {"sepalLength": 6.1, "sepalWidth": 2.8, "petalLength": 4.7, "petalWidth": 1.2, "species": "versicolor"} 75 | {"sepalLength": 6.4, "sepalWidth": 2.9, "petalLength": 4.3, "petalWidth": 1.3, "species": "versicolor"} 76 | {"sepalLength": 6.6, "sepalWidth": 3.0, "petalLength": 4.4, "petalWidth": 1.4, "species": "versicolor"} 77 | {"sepalLength": 6.8, "sepalWidth": 2.8, "petalLength": 4.8, "petalWidth": 1.4, "species": "versicolor"} 78 | {"sepalLength": 6.7, "sepalWidth": 3.0, "petalLength": 5.0, "petalWidth": 1.7, "species": "versicolor"} 79 | {"sepalLength": 6.0, "sepalWidth": 2.9, "petalLength": 4.5, "petalWidth": 1.5, "species": "versicolor"} 80 | {"sepalLength": 5.7, "sepalWidth": 2.6, "petalLength": 3.5, "petalWidth": 1.0, "species": "versicolor"} 81 | {"sepalLength": 5.5, "sepalWidth": 2.4, "petalLength": 3.8, "petalWidth": 1.1, "species": "versicolor"} 82 | {"sepalLength": 5.5, "sepalWidth": 2.4, "petalLength": 3.7, "petalWidth": 1.0, "species": "versicolor"} 83 | {"sepalLength": 5.8, "sepalWidth": 2.7, "petalLength": 3.9, "petalWidth": 1.2, "species": "versicolor"} 84 | {"sepalLength": 6.0, "sepalWidth": 2.7, "petalLength": 5.1, "petalWidth": 1.6, "species": "versicolor"} 85 | {"sepalLength": 5.4, "sepalWidth": 3.0, "petalLength": 4.5, "petalWidth": 1.5, "species": "versicolor"} 86 | {"sepalLength": 6.0, "sepalWidth": 3.4, "petalLength": 4.5, "petalWidth": 1.6, "species": "versicolor"} 87 | {"sepalLength": 6.7, "sepalWidth": 3.1, "petalLength": 4.7, "petalWidth": 1.5, "species": "versicolor"} 88 | {"sepalLength": 6.3, "sepalWidth": 2.3, "petalLength": 4.4, "petalWidth": 1.3, "species": "versicolor"} 89 | {"sepalLength": 5.6, "sepalWidth": 3.0, "petalLength": 4.1, "petalWidth": 1.3, "species": "versicolor"} 90 | {"sepalLength": 5.5, "sepalWidth": 2.5, "petalLength": 4.0, "petalWidth": 1.3, "species": "versicolor"} 91 | {"sepalLength": 5.5, "sepalWidth": 2.6, "petalLength": 4.4, "petalWidth": 1.2, "species": "versicolor"} 92 | {"sepalLength": 6.1, "sepalWidth": 3.0, "petalLength": 4.6, "petalWidth": 1.4, "species": "versicolor"} 93 | {"sepalLength": 5.8, "sepalWidth": 2.6, "petalLength": 4.0, "petalWidth": 1.2, "species": "versicolor"} 94 | {"sepalLength": 5.0, "sepalWidth": 2.3, "petalLength": 3.3, "petalWidth": 1.0, "species": "versicolor"} 95 | {"sepalLength": 5.6, "sepalWidth": 2.7, "petalLength": 4.2, "petalWidth": 1.3, "species": "versicolor"} 96 | {"sepalLength": 5.7, "sepalWidth": 3.0, "petalLength": 4.2, "petalWidth": 1.2, "species": "versicolor"} 97 | {"sepalLength": 5.7, 
"sepalWidth": 2.9, "petalLength": 4.2, "petalWidth": 1.3, "species": "versicolor"} 98 | {"sepalLength": 6.2, "sepalWidth": 2.9, "petalLength": 4.3, "petalWidth": 1.3, "species": "versicolor"} 99 | {"sepalLength": 5.1, "sepalWidth": 2.5, "petalLength": 3.0, "petalWidth": 1.1, "species": "versicolor"} 100 | {"sepalLength": 5.7, "sepalWidth": 2.8, "petalLength": 4.1, "petalWidth": 1.3, "species": "versicolor"} 101 | {"sepalLength": 6.3, "sepalWidth": 3.3, "petalLength": 6.0, "petalWidth": 2.5, "species": "virginica"} 102 | {"sepalLength": 5.8, "sepalWidth": 2.7, "petalLength": 5.1, "petalWidth": 1.9, "species": "virginica"} 103 | {"sepalLength": 7.1, "sepalWidth": 3.0, "petalLength": 5.9, "petalWidth": 2.1, "species": "virginica"} 104 | {"sepalLength": 6.3, "sepalWidth": 2.9, "petalLength": 5.6, "petalWidth": 1.8, "species": "virginica"} 105 | {"sepalLength": 6.5, "sepalWidth": 3.0, "petalLength": 5.8, "petalWidth": 2.2, "species": "virginica"} 106 | {"sepalLength": 7.6, "sepalWidth": 3.0, "petalLength": 6.6, "petalWidth": 2.1, "species": "virginica"} 107 | {"sepalLength": 4.9, "sepalWidth": 2.5, "petalLength": 4.5, "petalWidth": 1.7, "species": "virginica"} 108 | {"sepalLength": 7.3, "sepalWidth": 2.9, "petalLength": 6.3, "petalWidth": 1.8, "species": "virginica"} 109 | {"sepalLength": 6.7, "sepalWidth": 2.5, "petalLength": 5.8, "petalWidth": 1.8, "species": "virginica"} 110 | {"sepalLength": 7.2, "sepalWidth": 3.6, "petalLength": 6.1, "petalWidth": 2.5, "species": "virginica"} 111 | {"sepalLength": 6.5, "sepalWidth": 3.2, "petalLength": 5.1, "petalWidth": 2.0, "species": "virginica"} 112 | {"sepalLength": 6.4, "sepalWidth": 2.7, "petalLength": 5.3, "petalWidth": 1.9, "species": "virginica"} 113 | {"sepalLength": 6.8, "sepalWidth": 3.0, "petalLength": 5.5, "petalWidth": 2.1, "species": "virginica"} 114 | {"sepalLength": 5.7, "sepalWidth": 2.5, "petalLength": 5.0, "petalWidth": 2.0, "species": "virginica"} 115 | {"sepalLength": 5.8, "sepalWidth": 2.8, "petalLength": 5.1, "petalWidth": 2.4, "species": "virginica"} 116 | {"sepalLength": 6.4, "sepalWidth": 3.2, "petalLength": 5.3, "petalWidth": 2.3, "species": "virginica"} 117 | {"sepalLength": 6.5, "sepalWidth": 3.0, "petalLength": 5.5, "petalWidth": 1.8, "species": "virginica"} 118 | {"sepalLength": 7.7, "sepalWidth": 3.8, "petalLength": 6.7, "petalWidth": 2.2, "species": "virginica"} 119 | {"sepalLength": 7.7, "sepalWidth": 2.6, "petalLength": 6.9, "petalWidth": 2.3, "species": "virginica"} 120 | {"sepalLength": 6.0, "sepalWidth": 2.2, "petalLength": 5.0, "petalWidth": 1.5, "species": "virginica"} 121 | {"sepalLength": 6.9, "sepalWidth": 3.2, "petalLength": 5.7, "petalWidth": 2.3, "species": "virginica"} 122 | {"sepalLength": 5.6, "sepalWidth": 2.8, "petalLength": 4.9, "petalWidth": 2.0, "species": "virginica"} 123 | {"sepalLength": 7.7, "sepalWidth": 2.8, "petalLength": 6.7, "petalWidth": 2.0, "species": "virginica"} 124 | {"sepalLength": 6.3, "sepalWidth": 2.7, "petalLength": 4.9, "petalWidth": 1.8, "species": "virginica"} 125 | {"sepalLength": 6.7, "sepalWidth": 3.3, "petalLength": 5.7, "petalWidth": 2.1, "species": "virginica"} 126 | {"sepalLength": 7.2, "sepalWidth": 3.2, "petalLength": 6.0, "petalWidth": 1.8, "species": "virginica"} 127 | {"sepalLength": 6.2, "sepalWidth": 2.8, "petalLength": 4.8, "petalWidth": 1.8, "species": "virginica"} 128 | {"sepalLength": 6.1, "sepalWidth": 3.0, "petalLength": 4.9, "petalWidth": 1.8, "species": "virginica"} 129 | {"sepalLength": 6.4, "sepalWidth": 2.8, "petalLength": 5.6, "petalWidth": 2.1, 
"species": "virginica"} 130 | {"sepalLength": 7.2, "sepalWidth": 3.0, "petalLength": 5.8, "petalWidth": 1.6, "species": "virginica"} 131 | {"sepalLength": 7.4, "sepalWidth": 2.8, "petalLength": 6.1, "petalWidth": 1.9, "species": "virginica"} 132 | {"sepalLength": 7.9, "sepalWidth": 3.8, "petalLength": 6.4, "petalWidth": 2.0, "species": "virginica"} 133 | {"sepalLength": 6.4, "sepalWidth": 2.8, "petalLength": 5.6, "petalWidth": 2.2, "species": "virginica"} 134 | {"sepalLength": 6.3, "sepalWidth": 2.8, "petalLength": 5.1, "petalWidth": 1.5, "species": "virginica"} 135 | {"sepalLength": 6.1, "sepalWidth": 2.6, "petalLength": 5.6, "petalWidth": 1.4, "species": "virginica"} 136 | {"sepalLength": 7.7, "sepalWidth": 3.0, "petalLength": 6.1, "petalWidth": 2.3, "species": "virginica"} 137 | {"sepalLength": 6.3, "sepalWidth": 3.4, "petalLength": 5.6, "petalWidth": 2.4, "species": "virginica"} 138 | {"sepalLength": 6.4, "sepalWidth": 3.1, "petalLength": 5.5, "petalWidth": 1.8, "species": "virginica"} 139 | {"sepalLength": 6.0, "sepalWidth": 3.0, "petalLength": 4.8, "petalWidth": 1.8, "species": "virginica"} 140 | {"sepalLength": 6.9, "sepalWidth": 3.1, "petalLength": 5.4, "petalWidth": 2.1, "species": "virginica"} 141 | {"sepalLength": 6.7, "sepalWidth": 3.1, "petalLength": 5.6, "petalWidth": 2.4, "species": "virginica"} 142 | {"sepalLength": 6.9, "sepalWidth": 3.1, "petalLength": 5.1, "petalWidth": 2.3, "species": "virginica"} 143 | {"sepalLength": 5.8, "sepalWidth": 2.7, "petalLength": 5.1, "petalWidth": 1.9, "species": "virginica"} 144 | {"sepalLength": 6.8, "sepalWidth": 3.2, "petalLength": 5.9, "petalWidth": 2.3, "species": "virginica"} 145 | {"sepalLength": 6.7, "sepalWidth": 3.3, "petalLength": 5.7, "petalWidth": 2.5, "species": "virginica"} 146 | {"sepalLength": 6.7, "sepalWidth": 3.0, "petalLength": 5.2, "petalWidth": 2.3, "species": "virginica"} 147 | {"sepalLength": 6.3, "sepalWidth": 2.5, "petalLength": 5.0, "petalWidth": 1.9, "species": "virginica"} 148 | {"sepalLength": 6.5, "sepalWidth": 3.0, "petalLength": 5.2, "petalWidth": 2.0, "species": "virginica"} 149 | {"sepalLength": 6.2, "sepalWidth": 3.4, "petalLength": 5.4, "petalWidth": 2.3, "species": "virginica"} 150 | {"sepalLength": 5.9, "sepalWidth": 3.0, "petalLength": 5.1, "petalWidth": 1.8, "species": "virginica"} 151 | -------------------------------------------------------------------------------- /inst/extdata/iris.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Sepal.Length": 5.1, 4 | "Sepal.Width": 3.5, 5 | "Petal.Length": 1.4, 6 | "Petal.Width": 0.2, 7 | "Species": "setosa" 8 | }, 9 | { 10 | "Sepal.Length": 4.9, 11 | "Sepal.Width": 3, 12 | "Petal.Length": 1.4, 13 | "Petal.Width": 0.2, 14 | "Species": "setosa" 15 | }, 16 | { 17 | "Sepal.Length": 4.7, 18 | "Sepal.Width": 3.2, 19 | "Petal.Length": 1.3, 20 | "Petal.Width": 0.2, 21 | "Species": "setosa" 22 | }, 23 | { 24 | "Sepal.Length": 4.6, 25 | "Sepal.Width": 3.1, 26 | "Petal.Length": 1.5, 27 | "Petal.Width": 0.2, 28 | "Species": "setosa" 29 | }, 30 | { 31 | "Sepal.Length": 5, 32 | "Sepal.Width": 3.6, 33 | "Petal.Length": 1.4, 34 | "Petal.Width": 0.2, 35 | "Species": "setosa" 36 | }, 37 | { 38 | "Sepal.Length": 5.4, 39 | "Sepal.Width": 3.9, 40 | "Petal.Length": 1.7, 41 | "Petal.Width": 0.4, 42 | "Species": "setosa" 43 | }, 44 | { 45 | "Sepal.Length": 4.6, 46 | "Sepal.Width": 3.4, 47 | "Petal.Length": 1.4, 48 | "Petal.Width": 0.3, 49 | "Species": "setosa" 50 | }, 51 | { 52 | "Sepal.Length": 5, 53 | "Sepal.Width": 3.4, 54 | 
"Petal.Length": 1.5, 55 | "Petal.Width": 0.2, 56 | "Species": "setosa" 57 | }, 58 | { 59 | "Sepal.Length": 4.4, 60 | "Sepal.Width": 2.9, 61 | "Petal.Length": 1.4, 62 | "Petal.Width": 0.2, 63 | "Species": "setosa" 64 | }, 65 | { 66 | "Sepal.Length": 4.9, 67 | "Sepal.Width": 3.1, 68 | "Petal.Length": 1.5, 69 | "Petal.Width": 0.1, 70 | "Species": "setosa" 71 | }, 72 | { 73 | "Sepal.Length": 5.4, 74 | "Sepal.Width": 3.7, 75 | "Petal.Length": 1.5, 76 | "Petal.Width": 0.2, 77 | "Species": "setosa" 78 | }, 79 | { 80 | "Sepal.Length": 4.8, 81 | "Sepal.Width": 3.4, 82 | "Petal.Length": 1.6, 83 | "Petal.Width": 0.2, 84 | "Species": "setosa" 85 | }, 86 | { 87 | "Sepal.Length": 4.8, 88 | "Sepal.Width": 3, 89 | "Petal.Length": 1.4, 90 | "Petal.Width": 0.1, 91 | "Species": "setosa" 92 | }, 93 | { 94 | "Sepal.Length": 4.3, 95 | "Sepal.Width": 3, 96 | "Petal.Length": 1.1, 97 | "Petal.Width": 0.1, 98 | "Species": "setosa" 99 | }, 100 | { 101 | "Sepal.Length": 5.8, 102 | "Sepal.Width": 4, 103 | "Petal.Length": 1.2, 104 | "Petal.Width": 0.2, 105 | "Species": "setosa" 106 | }, 107 | { 108 | "Sepal.Length": 5.7, 109 | "Sepal.Width": 4.4, 110 | "Petal.Length": 1.5, 111 | "Petal.Width": 0.4, 112 | "Species": "setosa" 113 | }, 114 | { 115 | "Sepal.Length": 5.4, 116 | "Sepal.Width": 3.9, 117 | "Petal.Length": 1.3, 118 | "Petal.Width": 0.4, 119 | "Species": "setosa" 120 | }, 121 | { 122 | "Sepal.Length": 5.1, 123 | "Sepal.Width": 3.5, 124 | "Petal.Length": 1.4, 125 | "Petal.Width": 0.3, 126 | "Species": "setosa" 127 | }, 128 | { 129 | "Sepal.Length": 5.7, 130 | "Sepal.Width": 3.8, 131 | "Petal.Length": 1.7, 132 | "Petal.Width": 0.3, 133 | "Species": "setosa" 134 | }, 135 | { 136 | "Sepal.Length": 5.1, 137 | "Sepal.Width": 3.8, 138 | "Petal.Length": 1.5, 139 | "Petal.Width": 0.3, 140 | "Species": "setosa" 141 | }, 142 | { 143 | "Sepal.Length": 5.4, 144 | "Sepal.Width": 3.4, 145 | "Petal.Length": 1.7, 146 | "Petal.Width": 0.2, 147 | "Species": "setosa" 148 | }, 149 | { 150 | "Sepal.Length": 5.1, 151 | "Sepal.Width": 3.7, 152 | "Petal.Length": 1.5, 153 | "Petal.Width": 0.4, 154 | "Species": "setosa" 155 | }, 156 | { 157 | "Sepal.Length": 4.6, 158 | "Sepal.Width": 3.6, 159 | "Petal.Length": 1, 160 | "Petal.Width": 0.2, 161 | "Species": "setosa" 162 | }, 163 | { 164 | "Sepal.Length": 5.1, 165 | "Sepal.Width": 3.3, 166 | "Petal.Length": 1.7, 167 | "Petal.Width": 0.5, 168 | "Species": "setosa" 169 | }, 170 | { 171 | "Sepal.Length": 4.8, 172 | "Sepal.Width": 3.4, 173 | "Petal.Length": 1.9, 174 | "Petal.Width": 0.2, 175 | "Species": "setosa" 176 | }, 177 | { 178 | "Sepal.Length": 5, 179 | "Sepal.Width": 3, 180 | "Petal.Length": 1.6, 181 | "Petal.Width": 0.2, 182 | "Species": "setosa" 183 | }, 184 | { 185 | "Sepal.Length": 5, 186 | "Sepal.Width": 3.4, 187 | "Petal.Length": 1.6, 188 | "Petal.Width": 0.4, 189 | "Species": "setosa" 190 | }, 191 | { 192 | "Sepal.Length": 5.2, 193 | "Sepal.Width": 3.5, 194 | "Petal.Length": 1.5, 195 | "Petal.Width": 0.2, 196 | "Species": "setosa" 197 | }, 198 | { 199 | "Sepal.Length": 5.2, 200 | "Sepal.Width": 3.4, 201 | "Petal.Length": 1.4, 202 | "Petal.Width": 0.2, 203 | "Species": "setosa" 204 | }, 205 | { 206 | "Sepal.Length": 4.7, 207 | "Sepal.Width": 3.2, 208 | "Petal.Length": 1.6, 209 | "Petal.Width": 0.2, 210 | "Species": "setosa" 211 | }, 212 | { 213 | "Sepal.Length": 4.8, 214 | "Sepal.Width": 3.1, 215 | "Petal.Length": 1.6, 216 | "Petal.Width": 0.2, 217 | "Species": "setosa" 218 | }, 219 | { 220 | "Sepal.Length": 5.4, 221 | "Sepal.Width": 3.4, 222 | "Petal.Length": 1.5, 223 | 
"Petal.Width": 0.4, 224 | "Species": "setosa" 225 | }, 226 | { 227 | "Sepal.Length": 5.2, 228 | "Sepal.Width": 4.1, 229 | "Petal.Length": 1.5, 230 | "Petal.Width": 0.1, 231 | "Species": "setosa" 232 | }, 233 | { 234 | "Sepal.Length": 5.5, 235 | "Sepal.Width": 4.2, 236 | "Petal.Length": 1.4, 237 | "Petal.Width": 0.2, 238 | "Species": "setosa" 239 | }, 240 | { 241 | "Sepal.Length": 4.9, 242 | "Sepal.Width": 3.1, 243 | "Petal.Length": 1.5, 244 | "Petal.Width": 0.2, 245 | "Species": "setosa" 246 | }, 247 | { 248 | "Sepal.Length": 5, 249 | "Sepal.Width": 3.2, 250 | "Petal.Length": 1.2, 251 | "Petal.Width": 0.2, 252 | "Species": "setosa" 253 | }, 254 | { 255 | "Sepal.Length": 5.5, 256 | "Sepal.Width": 3.5, 257 | "Petal.Length": 1.3, 258 | "Petal.Width": 0.2, 259 | "Species": "setosa" 260 | }, 261 | { 262 | "Sepal.Length": 4.9, 263 | "Sepal.Width": 3.6, 264 | "Petal.Length": 1.4, 265 | "Petal.Width": 0.1, 266 | "Species": "setosa" 267 | }, 268 | { 269 | "Sepal.Length": 4.4, 270 | "Sepal.Width": 3, 271 | "Petal.Length": 1.3, 272 | "Petal.Width": 0.2, 273 | "Species": "setosa" 274 | }, 275 | { 276 | "Sepal.Length": 5.1, 277 | "Sepal.Width": 3.4, 278 | "Petal.Length": 1.5, 279 | "Petal.Width": 0.2, 280 | "Species": "setosa" 281 | }, 282 | { 283 | "Sepal.Length": 5, 284 | "Sepal.Width": 3.5, 285 | "Petal.Length": 1.3, 286 | "Petal.Width": 0.3, 287 | "Species": "setosa" 288 | }, 289 | { 290 | "Sepal.Length": 4.5, 291 | "Sepal.Width": 2.3, 292 | "Petal.Length": 1.3, 293 | "Petal.Width": 0.3, 294 | "Species": "setosa" 295 | }, 296 | { 297 | "Sepal.Length": 4.4, 298 | "Sepal.Width": 3.2, 299 | "Petal.Length": 1.3, 300 | "Petal.Width": 0.2, 301 | "Species": "setosa" 302 | }, 303 | { 304 | "Sepal.Length": 5, 305 | "Sepal.Width": 3.5, 306 | "Petal.Length": 1.6, 307 | "Petal.Width": 0.6, 308 | "Species": "setosa" 309 | }, 310 | { 311 | "Sepal.Length": 5.1, 312 | "Sepal.Width": 3.8, 313 | "Petal.Length": 1.9, 314 | "Petal.Width": 0.4, 315 | "Species": "setosa" 316 | }, 317 | { 318 | "Sepal.Length": 4.8, 319 | "Sepal.Width": 3, 320 | "Petal.Length": 1.4, 321 | "Petal.Width": 0.3, 322 | "Species": "setosa" 323 | }, 324 | { 325 | "Sepal.Length": 5.1, 326 | "Sepal.Width": 3.8, 327 | "Petal.Length": 1.6, 328 | "Petal.Width": 0.2, 329 | "Species": "setosa" 330 | }, 331 | { 332 | "Sepal.Length": 4.6, 333 | "Sepal.Width": 3.2, 334 | "Petal.Length": 1.4, 335 | "Petal.Width": 0.2, 336 | "Species": "setosa" 337 | }, 338 | { 339 | "Sepal.Length": 5.3, 340 | "Sepal.Width": 3.7, 341 | "Petal.Length": 1.5, 342 | "Petal.Width": 0.2, 343 | "Species": "setosa" 344 | }, 345 | { 346 | "Sepal.Length": 5, 347 | "Sepal.Width": 3.3, 348 | "Petal.Length": 1.4, 349 | "Petal.Width": 0.2, 350 | "Species": "setosa" 351 | }, 352 | { 353 | "Sepal.Length": 7, 354 | "Sepal.Width": 3.2, 355 | "Petal.Length": 4.7, 356 | "Petal.Width": 1.4, 357 | "Species": "versicolor" 358 | }, 359 | { 360 | "Sepal.Length": 6.4, 361 | "Sepal.Width": 3.2, 362 | "Petal.Length": 4.5, 363 | "Petal.Width": 1.5, 364 | "Species": "versicolor" 365 | }, 366 | { 367 | "Sepal.Length": 6.9, 368 | "Sepal.Width": 3.1, 369 | "Petal.Length": 4.9, 370 | "Petal.Width": 1.5, 371 | "Species": "versicolor" 372 | }, 373 | { 374 | "Sepal.Length": 5.5, 375 | "Sepal.Width": 2.3, 376 | "Petal.Length": 4, 377 | "Petal.Width": 1.3, 378 | "Species": "versicolor" 379 | }, 380 | { 381 | "Sepal.Length": 6.5, 382 | "Sepal.Width": 2.8, 383 | "Petal.Length": 4.6, 384 | "Petal.Width": 1.5, 385 | "Species": "versicolor" 386 | }, 387 | { 388 | "Sepal.Length": 5.7, 389 | "Sepal.Width": 2.8, 390 | 
"Petal.Length": 4.5, 391 | "Petal.Width": 1.3, 392 | "Species": "versicolor" 393 | }, 394 | { 395 | "Sepal.Length": 6.3, 396 | "Sepal.Width": 3.3, 397 | "Petal.Length": 4.7, 398 | "Petal.Width": 1.6, 399 | "Species": "versicolor" 400 | }, 401 | { 402 | "Sepal.Length": 4.9, 403 | "Sepal.Width": 2.4, 404 | "Petal.Length": 3.3, 405 | "Petal.Width": 1, 406 | "Species": "versicolor" 407 | }, 408 | { 409 | "Sepal.Length": 6.6, 410 | "Sepal.Width": 2.9, 411 | "Petal.Length": 4.6, 412 | "Petal.Width": 1.3, 413 | "Species": "versicolor" 414 | }, 415 | { 416 | "Sepal.Length": 5.2, 417 | "Sepal.Width": 2.7, 418 | "Petal.Length": 3.9, 419 | "Petal.Width": 1.4, 420 | "Species": "versicolor" 421 | }, 422 | { 423 | "Sepal.Length": 5, 424 | "Sepal.Width": 2, 425 | "Petal.Length": 3.5, 426 | "Petal.Width": 1, 427 | "Species": "versicolor" 428 | }, 429 | { 430 | "Sepal.Length": 5.9, 431 | "Sepal.Width": 3, 432 | "Petal.Length": 4.2, 433 | "Petal.Width": 1.5, 434 | "Species": "versicolor" 435 | }, 436 | { 437 | "Sepal.Length": 6, 438 | "Sepal.Width": 2.2, 439 | "Petal.Length": 4, 440 | "Petal.Width": 1, 441 | "Species": "versicolor" 442 | }, 443 | { 444 | "Sepal.Length": 6.1, 445 | "Sepal.Width": 2.9, 446 | "Petal.Length": 4.7, 447 | "Petal.Width": 1.4, 448 | "Species": "versicolor" 449 | }, 450 | { 451 | "Sepal.Length": 5.6, 452 | "Sepal.Width": 2.9, 453 | "Petal.Length": 3.6, 454 | "Petal.Width": 1.3, 455 | "Species": "versicolor" 456 | }, 457 | { 458 | "Sepal.Length": 6.7, 459 | "Sepal.Width": 3.1, 460 | "Petal.Length": 4.4, 461 | "Petal.Width": 1.4, 462 | "Species": "versicolor" 463 | }, 464 | { 465 | "Sepal.Length": 5.6, 466 | "Sepal.Width": 3, 467 | "Petal.Length": 4.5, 468 | "Petal.Width": 1.5, 469 | "Species": "versicolor" 470 | }, 471 | { 472 | "Sepal.Length": 5.8, 473 | "Sepal.Width": 2.7, 474 | "Petal.Length": 4.1, 475 | "Petal.Width": 1, 476 | "Species": "versicolor" 477 | }, 478 | { 479 | "Sepal.Length": 6.2, 480 | "Sepal.Width": 2.2, 481 | "Petal.Length": 4.5, 482 | "Petal.Width": 1.5, 483 | "Species": "versicolor" 484 | }, 485 | { 486 | "Sepal.Length": 5.6, 487 | "Sepal.Width": 2.5, 488 | "Petal.Length": 3.9, 489 | "Petal.Width": 1.1, 490 | "Species": "versicolor" 491 | }, 492 | { 493 | "Sepal.Length": 5.9, 494 | "Sepal.Width": 3.2, 495 | "Petal.Length": 4.8, 496 | "Petal.Width": 1.8, 497 | "Species": "versicolor" 498 | }, 499 | { 500 | "Sepal.Length": 6.1, 501 | "Sepal.Width": 2.8, 502 | "Petal.Length": 4, 503 | "Petal.Width": 1.3, 504 | "Species": "versicolor" 505 | }, 506 | { 507 | "Sepal.Length": 6.3, 508 | "Sepal.Width": 2.5, 509 | "Petal.Length": 4.9, 510 | "Petal.Width": 1.5, 511 | "Species": "versicolor" 512 | }, 513 | { 514 | "Sepal.Length": 6.1, 515 | "Sepal.Width": 2.8, 516 | "Petal.Length": 4.7, 517 | "Petal.Width": 1.2, 518 | "Species": "versicolor" 519 | }, 520 | { 521 | "Sepal.Length": 6.4, 522 | "Sepal.Width": 2.9, 523 | "Petal.Length": 4.3, 524 | "Petal.Width": 1.3, 525 | "Species": "versicolor" 526 | }, 527 | { 528 | "Sepal.Length": 6.6, 529 | "Sepal.Width": 3, 530 | "Petal.Length": 4.4, 531 | "Petal.Width": 1.4, 532 | "Species": "versicolor" 533 | }, 534 | { 535 | "Sepal.Length": 6.8, 536 | "Sepal.Width": 2.8, 537 | "Petal.Length": 4.8, 538 | "Petal.Width": 1.4, 539 | "Species": "versicolor" 540 | }, 541 | { 542 | "Sepal.Length": 6.7, 543 | "Sepal.Width": 3, 544 | "Petal.Length": 5, 545 | "Petal.Width": 1.7, 546 | "Species": "versicolor" 547 | }, 548 | { 549 | "Sepal.Length": 6, 550 | "Sepal.Width": 2.9, 551 | "Petal.Length": 4.5, 552 | "Petal.Width": 1.5, 553 | "Species": 
"versicolor" 554 | }, 555 | { 556 | "Sepal.Length": 5.7, 557 | "Sepal.Width": 2.6, 558 | "Petal.Length": 3.5, 559 | "Petal.Width": 1, 560 | "Species": "versicolor" 561 | }, 562 | { 563 | "Sepal.Length": 5.5, 564 | "Sepal.Width": 2.4, 565 | "Petal.Length": 3.8, 566 | "Petal.Width": 1.1, 567 | "Species": "versicolor" 568 | }, 569 | { 570 | "Sepal.Length": 5.5, 571 | "Sepal.Width": 2.4, 572 | "Petal.Length": 3.7, 573 | "Petal.Width": 1, 574 | "Species": "versicolor" 575 | }, 576 | { 577 | "Sepal.Length": 5.8, 578 | "Sepal.Width": 2.7, 579 | "Petal.Length": 3.9, 580 | "Petal.Width": 1.2, 581 | "Species": "versicolor" 582 | }, 583 | { 584 | "Sepal.Length": 6, 585 | "Sepal.Width": 2.7, 586 | "Petal.Length": 5.1, 587 | "Petal.Width": 1.6, 588 | "Species": "versicolor" 589 | }, 590 | { 591 | "Sepal.Length": 5.4, 592 | "Sepal.Width": 3, 593 | "Petal.Length": 4.5, 594 | "Petal.Width": 1.5, 595 | "Species": "versicolor" 596 | }, 597 | { 598 | "Sepal.Length": 6, 599 | "Sepal.Width": 3.4, 600 | "Petal.Length": 4.5, 601 | "Petal.Width": 1.6, 602 | "Species": "versicolor" 603 | }, 604 | { 605 | "Sepal.Length": 6.7, 606 | "Sepal.Width": 3.1, 607 | "Petal.Length": 4.7, 608 | "Petal.Width": 1.5, 609 | "Species": "versicolor" 610 | }, 611 | { 612 | "Sepal.Length": 6.3, 613 | "Sepal.Width": 2.3, 614 | "Petal.Length": 4.4, 615 | "Petal.Width": 1.3, 616 | "Species": "versicolor" 617 | }, 618 | { 619 | "Sepal.Length": 5.6, 620 | "Sepal.Width": 3, 621 | "Petal.Length": 4.1, 622 | "Petal.Width": 1.3, 623 | "Species": "versicolor" 624 | }, 625 | { 626 | "Sepal.Length": 5.5, 627 | "Sepal.Width": 2.5, 628 | "Petal.Length": 4, 629 | "Petal.Width": 1.3, 630 | "Species": "versicolor" 631 | }, 632 | { 633 | "Sepal.Length": 5.5, 634 | "Sepal.Width": 2.6, 635 | "Petal.Length": 4.4, 636 | "Petal.Width": 1.2, 637 | "Species": "versicolor" 638 | }, 639 | { 640 | "Sepal.Length": 6.1, 641 | "Sepal.Width": 3, 642 | "Petal.Length": 4.6, 643 | "Petal.Width": 1.4, 644 | "Species": "versicolor" 645 | }, 646 | { 647 | "Sepal.Length": 5.8, 648 | "Sepal.Width": 2.6, 649 | "Petal.Length": 4, 650 | "Petal.Width": 1.2, 651 | "Species": "versicolor" 652 | }, 653 | { 654 | "Sepal.Length": 5, 655 | "Sepal.Width": 2.3, 656 | "Petal.Length": 3.3, 657 | "Petal.Width": 1, 658 | "Species": "versicolor" 659 | }, 660 | { 661 | "Sepal.Length": 5.6, 662 | "Sepal.Width": 2.7, 663 | "Petal.Length": 4.2, 664 | "Petal.Width": 1.3, 665 | "Species": "versicolor" 666 | }, 667 | { 668 | "Sepal.Length": 5.7, 669 | "Sepal.Width": 3, 670 | "Petal.Length": 4.2, 671 | "Petal.Width": 1.2, 672 | "Species": "versicolor" 673 | }, 674 | { 675 | "Sepal.Length": 5.7, 676 | "Sepal.Width": 2.9, 677 | "Petal.Length": 4.2, 678 | "Petal.Width": 1.3, 679 | "Species": "versicolor" 680 | }, 681 | { 682 | "Sepal.Length": 6.2, 683 | "Sepal.Width": 2.9, 684 | "Petal.Length": 4.3, 685 | "Petal.Width": 1.3, 686 | "Species": "versicolor" 687 | }, 688 | { 689 | "Sepal.Length": 5.1, 690 | "Sepal.Width": 2.5, 691 | "Petal.Length": 3, 692 | "Petal.Width": 1.1, 693 | "Species": "versicolor" 694 | }, 695 | { 696 | "Sepal.Length": 5.7, 697 | "Sepal.Width": 2.8, 698 | "Petal.Length": 4.1, 699 | "Petal.Width": 1.3, 700 | "Species": "versicolor" 701 | }, 702 | { 703 | "Sepal.Length": 6.3, 704 | "Sepal.Width": 3.3, 705 | "Petal.Length": 6, 706 | "Petal.Width": 2.5, 707 | "Species": "virginica" 708 | }, 709 | { 710 | "Sepal.Length": 5.8, 711 | "Sepal.Width": 2.7, 712 | "Petal.Length": 5.1, 713 | "Petal.Width": 1.9, 714 | "Species": "virginica" 715 | }, 716 | { 717 | "Sepal.Length": 7.1, 718 | 
"Sepal.Width": 3, 719 | "Petal.Length": 5.9, 720 | "Petal.Width": 2.1, 721 | "Species": "virginica" 722 | }, 723 | { 724 | "Sepal.Length": 6.3, 725 | "Sepal.Width": 2.9, 726 | "Petal.Length": 5.6, 727 | "Petal.Width": 1.8, 728 | "Species": "virginica" 729 | }, 730 | { 731 | "Sepal.Length": 6.5, 732 | "Sepal.Width": 3, 733 | "Petal.Length": 5.8, 734 | "Petal.Width": 2.2, 735 | "Species": "virginica" 736 | }, 737 | { 738 | "Sepal.Length": 7.6, 739 | "Sepal.Width": 3, 740 | "Petal.Length": 6.6, 741 | "Petal.Width": 2.1, 742 | "Species": "virginica" 743 | }, 744 | { 745 | "Sepal.Length": 4.9, 746 | "Sepal.Width": 2.5, 747 | "Petal.Length": 4.5, 748 | "Petal.Width": 1.7, 749 | "Species": "virginica" 750 | }, 751 | { 752 | "Sepal.Length": 7.3, 753 | "Sepal.Width": 2.9, 754 | "Petal.Length": 6.3, 755 | "Petal.Width": 1.8, 756 | "Species": "virginica" 757 | }, 758 | { 759 | "Sepal.Length": 6.7, 760 | "Sepal.Width": 2.5, 761 | "Petal.Length": 5.8, 762 | "Petal.Width": 1.8, 763 | "Species": "virginica" 764 | }, 765 | { 766 | "Sepal.Length": 7.2, 767 | "Sepal.Width": 3.6, 768 | "Petal.Length": 6.1, 769 | "Petal.Width": 2.5, 770 | "Species": "virginica" 771 | }, 772 | { 773 | "Sepal.Length": 6.5, 774 | "Sepal.Width": 3.2, 775 | "Petal.Length": 5.1, 776 | "Petal.Width": 2, 777 | "Species": "virginica" 778 | }, 779 | { 780 | "Sepal.Length": 6.4, 781 | "Sepal.Width": 2.7, 782 | "Petal.Length": 5.3, 783 | "Petal.Width": 1.9, 784 | "Species": "virginica" 785 | }, 786 | { 787 | "Sepal.Length": 6.8, 788 | "Sepal.Width": 3, 789 | "Petal.Length": 5.5, 790 | "Petal.Width": 2.1, 791 | "Species": "virginica" 792 | }, 793 | { 794 | "Sepal.Length": 5.7, 795 | "Sepal.Width": 2.5, 796 | "Petal.Length": 5, 797 | "Petal.Width": 2, 798 | "Species": "virginica" 799 | }, 800 | { 801 | "Sepal.Length": 5.8, 802 | "Sepal.Width": 2.8, 803 | "Petal.Length": 5.1, 804 | "Petal.Width": 2.4, 805 | "Species": "virginica" 806 | }, 807 | { 808 | "Sepal.Length": 6.4, 809 | "Sepal.Width": 3.2, 810 | "Petal.Length": 5.3, 811 | "Petal.Width": 2.3, 812 | "Species": "virginica" 813 | }, 814 | { 815 | "Sepal.Length": 6.5, 816 | "Sepal.Width": 3, 817 | "Petal.Length": 5.5, 818 | "Petal.Width": 1.8, 819 | "Species": "virginica" 820 | }, 821 | { 822 | "Sepal.Length": 7.7, 823 | "Sepal.Width": 3.8, 824 | "Petal.Length": 6.7, 825 | "Petal.Width": 2.2, 826 | "Species": "virginica" 827 | }, 828 | { 829 | "Sepal.Length": 7.7, 830 | "Sepal.Width": 2.6, 831 | "Petal.Length": 6.9, 832 | "Petal.Width": 2.3, 833 | "Species": "virginica" 834 | }, 835 | { 836 | "Sepal.Length": 6, 837 | "Sepal.Width": 2.2, 838 | "Petal.Length": 5, 839 | "Petal.Width": 1.5, 840 | "Species": "virginica" 841 | }, 842 | { 843 | "Sepal.Length": 6.9, 844 | "Sepal.Width": 3.2, 845 | "Petal.Length": 5.7, 846 | "Petal.Width": 2.3, 847 | "Species": "virginica" 848 | }, 849 | { 850 | "Sepal.Length": 5.6, 851 | "Sepal.Width": 2.8, 852 | "Petal.Length": 4.9, 853 | "Petal.Width": 2, 854 | "Species": "virginica" 855 | }, 856 | { 857 | "Sepal.Length": 7.7, 858 | "Sepal.Width": 2.8, 859 | "Petal.Length": 6.7, 860 | "Petal.Width": 2, 861 | "Species": "virginica" 862 | }, 863 | { 864 | "Sepal.Length": 6.3, 865 | "Sepal.Width": 2.7, 866 | "Petal.Length": 4.9, 867 | "Petal.Width": 1.8, 868 | "Species": "virginica" 869 | }, 870 | { 871 | "Sepal.Length": 6.7, 872 | "Sepal.Width": 3.3, 873 | "Petal.Length": 5.7, 874 | "Petal.Width": 2.1, 875 | "Species": "virginica" 876 | }, 877 | { 878 | "Sepal.Length": 7.2, 879 | "Sepal.Width": 3.2, 880 | "Petal.Length": 6, 881 | "Petal.Width": 1.8, 882 | 
"Species": "virginica" 883 | }, 884 | { 885 | "Sepal.Length": 6.2, 886 | "Sepal.Width": 2.8, 887 | "Petal.Length": 4.8, 888 | "Petal.Width": 1.8, 889 | "Species": "virginica" 890 | }, 891 | { 892 | "Sepal.Length": 6.1, 893 | "Sepal.Width": 3, 894 | "Petal.Length": 4.9, 895 | "Petal.Width": 1.8, 896 | "Species": "virginica" 897 | }, 898 | { 899 | "Sepal.Length": 6.4, 900 | "Sepal.Width": 2.8, 901 | "Petal.Length": 5.6, 902 | "Petal.Width": 2.1, 903 | "Species": "virginica" 904 | }, 905 | { 906 | "Sepal.Length": 7.2, 907 | "Sepal.Width": 3, 908 | "Petal.Length": 5.8, 909 | "Petal.Width": 1.6, 910 | "Species": "virginica" 911 | }, 912 | { 913 | "Sepal.Length": 7.4, 914 | "Sepal.Width": 2.8, 915 | "Petal.Length": 6.1, 916 | "Petal.Width": 1.9, 917 | "Species": "virginica" 918 | }, 919 | { 920 | "Sepal.Length": 7.9, 921 | "Sepal.Width": 3.8, 922 | "Petal.Length": 6.4, 923 | "Petal.Width": 2, 924 | "Species": "virginica" 925 | }, 926 | { 927 | "Sepal.Length": 6.4, 928 | "Sepal.Width": 2.8, 929 | "Petal.Length": 5.6, 930 | "Petal.Width": 2.2, 931 | "Species": "virginica" 932 | }, 933 | { 934 | "Sepal.Length": 6.3, 935 | "Sepal.Width": 2.8, 936 | "Petal.Length": 5.1, 937 | "Petal.Width": 1.5, 938 | "Species": "virginica" 939 | }, 940 | { 941 | "Sepal.Length": 6.1, 942 | "Sepal.Width": 2.6, 943 | "Petal.Length": 5.6, 944 | "Petal.Width": 1.4, 945 | "Species": "virginica" 946 | }, 947 | { 948 | "Sepal.Length": 7.7, 949 | "Sepal.Width": 3, 950 | "Petal.Length": 6.1, 951 | "Petal.Width": 2.3, 952 | "Species": "virginica" 953 | }, 954 | { 955 | "Sepal.Length": 6.3, 956 | "Sepal.Width": 3.4, 957 | "Petal.Length": 5.6, 958 | "Petal.Width": 2.4, 959 | "Species": "virginica" 960 | }, 961 | { 962 | "Sepal.Length": 6.4, 963 | "Sepal.Width": 3.1, 964 | "Petal.Length": 5.5, 965 | "Petal.Width": 1.8, 966 | "Species": "virginica" 967 | }, 968 | { 969 | "Sepal.Length": 6, 970 | "Sepal.Width": 3, 971 | "Petal.Length": 4.8, 972 | "Petal.Width": 1.8, 973 | "Species": "virginica" 974 | }, 975 | { 976 | "Sepal.Length": 6.9, 977 | "Sepal.Width": 3.1, 978 | "Petal.Length": 5.4, 979 | "Petal.Width": 2.1, 980 | "Species": "virginica" 981 | }, 982 | { 983 | "Sepal.Length": 6.7, 984 | "Sepal.Width": 3.1, 985 | "Petal.Length": 5.6, 986 | "Petal.Width": 2.4, 987 | "Species": "virginica" 988 | }, 989 | { 990 | "Sepal.Length": 6.9, 991 | "Sepal.Width": 3.1, 992 | "Petal.Length": 5.1, 993 | "Petal.Width": 2.3, 994 | "Species": "virginica" 995 | }, 996 | { 997 | "Sepal.Length": 5.8, 998 | "Sepal.Width": 2.7, 999 | "Petal.Length": 5.1, 1000 | "Petal.Width": 1.9, 1001 | "Species": "virginica" 1002 | }, 1003 | { 1004 | "Sepal.Length": 6.8, 1005 | "Sepal.Width": 3.2, 1006 | "Petal.Length": 5.9, 1007 | "Petal.Width": 2.3, 1008 | "Species": "virginica" 1009 | }, 1010 | { 1011 | "Sepal.Length": 6.7, 1012 | "Sepal.Width": 3.3, 1013 | "Petal.Length": 5.7, 1014 | "Petal.Width": 2.5, 1015 | "Species": "virginica" 1016 | }, 1017 | { 1018 | "Sepal.Length": 6.7, 1019 | "Sepal.Width": 3, 1020 | "Petal.Length": 5.2, 1021 | "Petal.Width": 2.3, 1022 | "Species": "virginica" 1023 | }, 1024 | { 1025 | "Sepal.Length": 6.3, 1026 | "Sepal.Width": 2.5, 1027 | "Petal.Length": 5, 1028 | "Petal.Width": 1.9, 1029 | "Species": "virginica" 1030 | }, 1031 | { 1032 | "Sepal.Length": 6.5, 1033 | "Sepal.Width": 3, 1034 | "Petal.Length": 5.2, 1035 | "Petal.Width": 2, 1036 | "Species": "virginica" 1037 | }, 1038 | { 1039 | "Sepal.Length": 6.2, 1040 | "Sepal.Width": 3.4, 1041 | "Petal.Length": 5.4, 1042 | "Petal.Width": 2.3, 1043 | "Species": "virginica" 1044 | }, 
1045 | { 1046 | "Sepal.Length": 5.9, 1047 | "Sepal.Width": 3, 1048 | "Petal.Length": 5.1, 1049 | "Petal.Width": 1.8, 1050 | "Species": "virginica" 1051 | } 1052 | ] 1053 | --------------------------------------------------------------------------------