├── .Rbuildignore
├── .github
│   ├── .gitignore
│   └── workflows
│       ├── R-CMD-check.yaml
│       └── test-coverage.yaml
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── R
│   ├── RcppExports.R
│   ├── bigreadr-package.R
│   ├── bind.R
│   ├── nlines-split.R
│   ├── read.R
│   └── zzz.R
├── README.md
├── _pkgdown.yml
├── bigreadr.Rproj
├── codecov.yml
├── docs
│   ├── 404.html
│   ├── articles
│   │   ├── csv2sqlite.html
│   │   └── index.html
│   ├── authors.html
│   ├── bootstrap-toc.css
│   ├── bootstrap-toc.js
│   ├── docsearch.css
│   ├── docsearch.js
│   ├── index.html
│   ├── link.svg
│   ├── pkgdown.css
│   ├── pkgdown.js
│   ├── pkgdown.yml
│   ├── reference
│   │   ├── Rplot001.png
│   │   ├── big_fread1.html
│   │   ├── big_fread2.html
│   │   ├── bigreadr-package.html
│   │   ├── cbind_df.html
│   │   ├── fread2.html
│   │   ├── fwrite2.html
│   │   ├── index.html
│   │   ├── nlines.html
│   │   ├── rbind_df.html
│   │   └── split_file.html
│   └── sitemap.xml
├── inst
│   ├── WORDLIST
│   └── testdata
│       ├── cars_with_newline.csv
│       ├── cars_without_newline.csv
│       └── wrong_string.rds
├── man
│   ├── big_fread1.Rd
│   ├── big_fread2.Rd
│   ├── bigreadr-package.Rd
│   ├── cbind_df.Rd
│   ├── fread2.Rd
│   ├── fwrite2.Rd
│   ├── nlines.Rd
│   ├── rbind_df.Rd
│   └── split_file.Rd
├── src
│   ├── .gitignore
│   ├── RcppExports.cpp
│   └── nlines-split.cpp
├── tests
│   ├── spelling.R
│   ├── testthat.R
│   └── testthat
│       ├── test-bind.R
│       ├── test-nlines.R
│       ├── test-read.R
│       └── test-split.R
├── tmp-save
│   └── nlines.cpp
├── tmp-tests
│   ├── bench-acc.R
│   ├── bench-rbind.R
│   ├── bench-read.R
│   ├── bench-read2.R
│   ├── bench-read3.R
│   ├── bench-read4.R
│   ├── bench-read5.R
│   ├── bench-read6.R
│   ├── bench-read7.R
│   ├── has-header.R
│   ├── split.cpp
│   ├── test-file2string.cpp
│   ├── test-mmap-nlines.cpp
│   ├── test-parallel.R
│   ├── test-parallel2.R
│   ├── test-setvbuf.cpp
│   ├── test-setvbuf2.cpp
│   ├── test-setvbuf3.cpp
│   ├── test-setvbuf4.cpp
│   ├── test-setvbuf5.cpp
│   ├── test-setvbuf6.cpp
│   ├── test-string.cpp
│   ├── text-write.txt
│   └── text-write2.txt
└── vignettes
    └── csv2sqlite.Rmd
/.Rbuildignore: --------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^tmp-tests$
 4 | ^tmp-data$
 5 | ^\.travis\.yml$
 6 | ^appveyor\.yml$
 7 | ^codecov\.yml$
 8 | ^tmp-save$
 9 | ^_pkgdown\.yml$
10 | ^docs$
11 | ^vignettes$
12 | ^\.github$
13 |
-------------------------------------------------------------------------------- /.github/.gitignore: --------------------------------------------------------------------------------
1 | *.html
2 |
-------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: --------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures?
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v3 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: | 31 | covr::codecov( 32 | quiet = FALSE, 33 | clean = FALSE, 34 | install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package") 35 | ) 36 | shell: Rscript {0} 37 | 38 | - name: Show testthat output 39 | if: always() 40 | run: | 41 | ## -------------------------------------------------------------------- 42 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true 43 | shell: bash 44 | 45 | - name: Upload test results 46 | if: failure() 47 | uses: actions/upload-artifact@v3 48 | with: 49 | name: coverage-test-failures 50 | path: ${{ runner.temp }}/package 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | tmp-data/ 6 | tmp-tests/tmp/ 7 | tmp-tests/tmp2/ 8 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: bigreadr 2 | Version: 0.2.5 3 | Date: 2022-12-06 4 | Title: Read Large Text Files 5 | Description: Read large text files by splitting them in smaller files. 6 | Package 'bigreadr' also provides some convenient wrappers around fread() 7 | and fwrite() from package 'data.table'. 
8 | Authors@R: 9 | person(given = "Florian", 10 | family = "Privé", 11 | role = c("aut", "cre"), 12 | email = "florian.prive.21@gmail.com") 13 | License: GPL-3 14 | Encoding: UTF-8 15 | ByteCompile: true 16 | Roxygen: list(markdown = TRUE) 17 | RoxygenNote: 6.1.0 18 | Imports: 19 | bigassertr (>= 0.1.1), 20 | data.table, 21 | parallelly, 22 | Rcpp, 23 | utils 24 | Suggests: 25 | spelling, 26 | testthat, 27 | covr, 28 | RSQLite 29 | LinkingTo: 30 | Rcpp 31 | Language: en-US 32 | URL: https://github.com/privefl/bigreadr 33 | BugReports: https://github.com/privefl/bigreadr/issues 34 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(big_fread1) 4 | export(big_fread2) 5 | export(cbind_df) 6 | export(fread2) 7 | export(fwrite2) 8 | export(get_split_files) 9 | export(nlines) 10 | export(rbind_df) 11 | export(split_file) 12 | importFrom(Rcpp,sourceCpp) 13 | importFrom(bigassertr,assert_exist) 14 | importFrom(bigassertr,assert_int) 15 | importFrom(bigassertr,assert_pos) 16 | importFrom(bigassertr,message2) 17 | importFrom(bigassertr,stop2) 18 | importFrom(bigassertr,warning2) 19 | useDynLib(bigreadr, .registration = TRUE) 20 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | nlines_cpp <- function(file) { 5 | .Call(`_bigreadr_nlines_cpp`, file) 6 | } 7 | 8 | split_every_nlines <- function(name_in, prefix_out, every_nlines, repeat_header) { 9 | .Call(`_bigreadr_split_every_nlines`, name_in, prefix_out, every_nlines, repeat_header) 10 | } 11 | 12 | -------------------------------------------------------------------------------- /R/bigreadr-package.R: -------------------------------------------------------------------------------- 1 | #' @useDynLib bigreadr, .registration = TRUE 2 | #' @importFrom Rcpp sourceCpp 3 | #' @importFrom bigassertr message2 warning2 stop2 assert_exist assert_int assert_pos 4 | #' @keywords internal 5 | "_PACKAGE" 6 | -------------------------------------------------------------------------------- /R/bind.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | 3 | #' Merge data frames 4 | #' 5 | #' @param list_df A list of multiple data frames with the same variables in the 6 | #' same order. 7 | #' 8 | #' @return One merged data frame with the names of the first input data frame. 
9 | #' @export 10 | #' 11 | #' @examples 12 | #' str(iris) 13 | #' str(rbind_df(list(iris, iris))) 14 | #' 15 | rbind_df <- function(list_df) { 16 | 17 | first_df <- list_df[[1]] 18 | if (data.table::is.data.table(first_df)) { 19 | data.table::rbindlist(list_df) 20 | } else if (is.data.frame(first_df)) { 21 | list_df_merged <- lapply(seq_along(first_df), function(k) { 22 | unlist(lapply(list_df, function(l) l[[k]]), recursive = FALSE) 23 | }) 24 | list_df_merged_named <- stats::setNames(list_df_merged, names(list_df[[1]])) 25 | as.data.frame(list_df_merged_named, stringsAsFactors = FALSE) 26 | } else { 27 | stop2("'list_df' should contain data tables or data frames.") 28 | } 29 | } 30 | 31 | ################################################################################ 32 | 33 | #' Merge data frames 34 | #' 35 | #' @param list_df A list of multiple data frames with the same observations in 36 | #' the same order. 37 | #' 38 | #' @return One merged data frame. 39 | #' @export 40 | #' 41 | #' @examples 42 | #' str(iris) 43 | #' str(cbind_df(list(iris, iris))) 44 | #' 45 | cbind_df <- function(list_df) { 46 | do.call(cbind, list_df) 47 | } 48 | 49 | ################################################################################ 50 | -------------------------------------------------------------------------------- /R/nlines-split.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | 3 | #' Number of lines 4 | #' 5 | #' Get the number of lines of a file. 6 | #' 7 | #' @param file Path of the file. 8 | #' 9 | #' @return The number of lines as one integer. 10 | #' @export 11 | #' 12 | #' @examples 13 | #' tmp <- fwrite2(iris) 14 | #' nlines(tmp) 15 | #' 16 | nlines <- function(file) { 17 | nlines_cpp( normalizePath(file, mustWork = TRUE) ) 18 | } 19 | 20 | ################################################################################ 21 | 22 | #' Split file every nlines 23 | #' 24 | #' @param file Path to file that you want to split. 25 | #' @param every_nlines Maximum number of lines in new file parts. 26 | #' @param prefix_out Prefix for created files. Default uses `tempfile()`. 27 | #' @param repeat_header Whether to repeat the header row in each file. 28 | #' Default is `FALSE`. 29 | #' 30 | #' @return A list with 31 | #' - `name_in`: input parameter `file`, 32 | #' - `prefix_out`: input parameter `prefix_out``, 33 | #' - `nfiles`: Number of files (parts) created, 34 | #' - `nlines_part`: input parameter `every_nlines`, 35 | #' - `nlines_all`: total number of lines of `file`. 36 | #' @export 37 | #' 38 | #' @examples 39 | #' tmp <- fwrite2(iris) 40 | #' infos <- split_file(tmp, 100) 41 | #' str(infos) 42 | #' get_split_files(infos) 43 | split_file <- function(file, every_nlines, 44 | prefix_out = tempfile(), 45 | repeat_header = FALSE) { 46 | 47 | split_every_nlines( 48 | name_in = normalizePath(file, mustWork = TRUE), 49 | prefix_out = path.expand(prefix_out), 50 | every_nlines = every_nlines, 51 | repeat_header = repeat_header 52 | ) 53 | } 54 | 55 | ################################################################################ 56 | 57 | #' Get files from splitting. 58 | #' 59 | #' @param split_file_out Output of [split_file]. 60 | #' 61 | #' @return Vector of file paths created by [split_file]. 
62 | #' @export 63 | #' @rdname split_file 64 | #' 65 | get_split_files <- function(split_file_out) { 66 | 67 | sprintf("%s_%s.txt", 68 | split_file_out[["prefix_out"]], 69 | seq_len(split_file_out[["nfiles"]])) 70 | } 71 | 72 | ################################################################################ 73 | -------------------------------------------------------------------------------- /R/read.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | 3 | #' Read text file(s) 4 | #' 5 | #' @param input Path to the file(s) that you want to read from. 6 | #' This can also be a command, some text or an URL. 7 | #' If a vector of inputs is provided, resulting data frames are appended. 8 | #' @param ... Other arguments to be passed to [data.table::fread]. 9 | #' @param data.table Whether to return a `data.table` or just a `data.frame`? 10 | #' Default is `FALSE` (and is the opposite of [data.table::fread]). 11 | #' @param nThread Number of threads to use. Default uses all threads minus one. 12 | #' 13 | #' @return A `data.frame` by default; a `data.table` when `data.table = TRUE`. 14 | #' @export 15 | #' 16 | #' @examples 17 | #' tmp <- fwrite2(iris) 18 | #' iris2 <- fread2(tmp) 19 | #' all.equal(iris2, iris) ## fread doesn't use factors 20 | fread2 <- function(input, ..., 21 | data.table = FALSE, 22 | nThread = getOption("bigreadr.nThread")) { 23 | 24 | if (missing(input)) { 25 | data.table::fread(..., data.table = data.table, nThread = nThread) 26 | } else if (length(input) > 1) { 27 | rbind_df(lapply(input, fread2, ..., data.table = data.table, nThread = nThread)) 28 | } else { 29 | data.table::fread(input, ..., data.table = data.table, nThread = nThread) 30 | } 31 | } 32 | 33 | ################################################################################ 34 | 35 | #' Write a data frame to a text file 36 | #' 37 | #' @param x Data frame to write. 38 | #' @param file Path to the file that you want to write to. 39 | #' Defaults uses `tempfile()`. 40 | #' @param ... Other arguments to be passed to [data.table::fwrite]. 41 | #' @param quote Whether to quote strings (default is `FALSE`). 42 | #' @param nThread Number of threads to use. Default uses all threads minus one. 43 | #' 44 | #' @return Input parameter `file`, invisibly. 45 | #' @export 46 | #' 47 | #' @examples 48 | #' tmp <- fwrite2(iris) 49 | #' iris2 <- fread2(tmp) 50 | #' all.equal(iris2, iris) ## fread doesn't use factors 51 | fwrite2 <- function(x, file = tempfile(), ..., 52 | quote = FALSE, 53 | nThread = getOption("bigreadr.nThread")) { 54 | 55 | data.table::fwrite(x, file, ..., quote = quote, nThread = nThread) 56 | invisible(file) 57 | } 58 | 59 | ################################################################################ 60 | 61 | #' Read large text file 62 | #' 63 | #' Read large text file by splitting lines. 64 | #' 65 | #' @param file Path to file that you want to read. 66 | #' @inheritParams split_file 67 | #' @param .transform Function to transform each data frame corresponding to each 68 | #' part of the `file`. Default doesn't change anything. 69 | #' @param .combine Function to combine results (list of data frames). 70 | #' @param skip Number of lines to skip at the beginning of `file`. 71 | #' @param ... Other arguments to be passed to [data.table::fread], 72 | #' excepted `input`, `file`, `skip`, `col.names` and `showProgress`. 73 | #' @param print_timings Whether to print timings? 
Default is `TRUE`. 74 | #' 75 | #' @inherit fread2 return 76 | #' @export 77 | #' 78 | big_fread1 <- function(file, every_nlines, 79 | .transform = identity, .combine = rbind_df, 80 | skip = 0, ..., 81 | print_timings = TRUE) { 82 | 83 | begin <- proc.time()[3] 84 | print_proc <- function(action) { 85 | if (print_timings) { 86 | reset <- proc.time()[3] 87 | message2("%s: %s seconds.", action, round(reset - begin, 1)) 88 | begin <<- reset 89 | } 90 | } 91 | 92 | ## Split file 93 | infos_split <- split_file(file, every_nlines = every_nlines) 94 | file_parts <- get_split_files(infos_split) 95 | on.exit(unlink(file_parts), add = TRUE) 96 | 97 | print_proc("Splitting") 98 | 99 | ## Read first part to get names and to skip some lines 100 | part1 <- fread2(file_parts[1], skip = skip, ..., showProgress = FALSE) 101 | names_df <- names(part1) 102 | part1 <- .transform(part1) 103 | 104 | print_proc("Reading + transforming first part") 105 | 106 | ## Read + transform other parts 107 | other_parts <- lapply(file_parts[-1], function(file_part) { 108 | .transform(fread2(file_part, skip = 0, col.names = names_df, 109 | ..., showProgress = FALSE)) 110 | }) 111 | 112 | print_proc("Reading + transforming other parts") 113 | 114 | ## Combine 115 | all_parts <- unname(c(list(part1), other_parts)) 116 | res <- tryCatch(.combine(all_parts), error = function(e) { 117 | warning2("Combining failed. Returning list of parts instead..") 118 | all_parts 119 | }) 120 | 121 | print_proc("Combining") 122 | 123 | res 124 | } 125 | 126 | ################################################################################ 127 | 128 | cut_in_nb <- function(x, nb) { 129 | split(x, sort(rep_len(seq_len(nb), length(x)))) 130 | } 131 | 132 | #' Read large text file 133 | #' 134 | #' Read large text file by splitting columns. 135 | #' 136 | #' @param file Path to file that you want to read. 137 | #' @param nb_parts Number of parts in which to split reading (and transforming). 138 | #' Parts are referring to blocks of selected columns. 139 | #' Default uses `part_size` to set a good value. 140 | #' @param .transform Function to transform each data frame corresponding to each 141 | #' block of selected columns. Default doesn't change anything. 142 | #' @param .combine Function to combine results (list of data frames). 143 | #' @param skip Number of lines to skip at the beginning of `file`. 144 | #' @param select Indices of columns to keep (sorted). Default keeps them all. 145 | #' @param ... Other arguments to be passed to [data.table::fread], 146 | #' excepted `input`, `file`, `skip`, `select` and `showProgress`. 147 | #' @param progress Show progress? Default is `FALSE`. 148 | #' @param part_size Size of the parts if `nb_parts` is not supplied. 149 | #' Default is `500 * 1024^2` (500 MB). 150 | #' 151 | #' @return The outputs of `fread2` + `.transform`, combined with `.combine`. 152 | #' @export 153 | #' 154 | big_fread2 <- function(file, nb_parts = NULL, 155 | .transform = identity, 156 | .combine = cbind_df, 157 | skip = 0, 158 | select = NULL, 159 | progress = FALSE, 160 | part_size = 500 * 1024^2, ## 500 MB 161 | ...) 
{ 162 | 163 | assert_exist(file) 164 | ## Split selected columns in nb_parts 165 | if (is.null(select)) { 166 | nb_cols <- ncol(fread2(file, nrows = 1, skip = skip, ...)) 167 | select <- seq_len(nb_cols) 168 | } else { 169 | assert_int(select); assert_pos(select) 170 | if (is.unsorted(select, strictly = TRUE)) 171 | stop2("Argument 'select' should be sorted.") 172 | } 173 | # Number of parts 174 | if (is.null(nb_parts)) { 175 | nb_parts <- ceiling(file.size(file) / part_size) 176 | if (progress) message2("Will read the file in %d parts.", nb_parts) 177 | } 178 | split_cols <- cut_in_nb(select, nb_parts) 179 | 180 | if (progress) { 181 | pb <- utils::txtProgressBar(min = 0, max = length(select), style = 3) 182 | on.exit(close(pb), add = TRUE) 183 | } 184 | 185 | ## Read + transform other parts 186 | already_read <- 0 187 | all_parts <- lapply(split_cols, function(cols) { 188 | part <- .transform( 189 | fread2(file, skip = skip, select = cols, ..., showProgress = FALSE) 190 | ) 191 | already_read <<- already_read + length(cols) 192 | if (progress) utils::setTxtProgressBar(pb, already_read) 193 | part 194 | }) 195 | all_parts <- unname(all_parts) 196 | 197 | ## Combine 198 | tryCatch(.combine(all_parts), error = function(e) { 199 | warning2("Combining failed. Returning list of parts instead..") 200 | all_parts 201 | }) 202 | } 203 | 204 | ################################################################################ 205 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | 3 | .onLoad <- function(libname, pkgname) { 4 | options( 5 | bigreadr.nThread = max(parallelly::availableCores() - 1L, 1L) 6 | ) 7 | } 8 | 9 | ################################################################################ 10 | 11 | .onUnload <- function(libpath) { 12 | options( 13 | bigreadr.nThread = NULL 14 | ) 15 | } 16 | 17 | ################################################################################ 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![R-CMD-check](https://github.com/privefl/bigreadr/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/privefl/bigreadr/actions/workflows/R-CMD-check.yaml) 3 | [![CRAN status](https://www.r-pkg.org/badges/version/bigreadr)](https://cran.r-project.org/package=bigreadr) 4 | [![Codecov test coverage](https://codecov.io/gh/privefl/bigreadr/branch/master/graph/badge.svg)](https://app.codecov.io/gh/privefl/bigreadr?branch=master) 5 | 6 | 7 | 8 | # R package {bigreadr} 9 | 10 | Read large text files based on splitting + `data.table::fread` 11 | 12 | 13 | ## Example 14 | 15 | ```r 16 | # remotes::install_github("privefl/bigreadr") 17 | library(bigreadr) 18 | 19 | # Create a temporary file of ~141 MB (just as an example) 20 | csv <- fwrite2(iris[rep(seq_len(nrow(iris)), 1e4), rep(1:5, 4)], tempfile()) 21 | format(file.size(csv), big.mark = ",") 22 | 23 | ## Splitting lines (1) 24 | # Read (by parts) all data -> using `fread` would be faster 25 | nlines(csv) ## 1M5 lines -> split every 500,000 26 | big_iris1 <- big_fread1(csv, every_nlines = 5e5) 27 | # Read and subset (by parts) 28 | big_iris1_setosa <- big_fread1(csv, every_nlines = 5e5, .transform = function(df) { 29 | dplyr::filter(df, Species == "setosa") 30 | }) 31 
| 32 | ## Splitting columns (2) 33 | big_iris2 <- big_fread2(csv, nb_parts = 3) 34 | # Read and subset (by parts) 35 | species_setosa <- (fread2(csv, select = 5)[[1]] == "setosa") 36 | big_iris2_setosa <- big_fread2(csv, nb_parts = 3, .transform = function(df) { 37 | dplyr::filter(df, species_setosa) 38 | }) 39 | 40 | ## Verification 41 | identical(big_iris1_setosa, dplyr::filter(big_iris1, Species == "setosa")) 42 | identical(big_iris2, big_iris1) 43 | identical(big_iris2_setosa, big_iris1_setosa) 44 | ``` 45 | 46 | ## Use cases 47 | 48 | Please send me your use cases! 49 | 50 | - [Convert a CSV to SQLite by parts](https://privefl.github.io/bigreadr/articles/csv2sqlite.html) 51 | 52 | - [Read a text file as a disk.frame](https://diskframe.com/articles/ingesting-data.html) 53 | 54 | - [Read a text file as a Filebacked Big Matrix](https://privefl.github.io/bigstatsr/reference/big_read.html) 55 | 56 | - [Read a text file as a Filebacked Data Frame](https://privefl.github.io/bigdfr/reference/FDF_read.html) 57 | 58 | - Read multiple files at once using `bigreadr::fread2()`. 59 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/privefl/bigreadr/2d8806f1067b19610a2d633bf2e863b910570d5d/_pkgdown.yml -------------------------------------------------------------------------------- /bigreadr.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Page not found (404) • bigreadr 9 | 10 | 11 | 12 | 13 | 14 | 15 | 19 | 20 | 21 | 22 | 23 |
[Remainder of the pkgdown-generated 404 page: navbar links, the message "Content not found. Please use links in the navbar.", and footer markup; stripped to fragments in this dump.]
-------------------------------------------------------------------------------- /docs/articles/index.html: --------------------------------------------------------------------------------
[pkgdown-generated articles index (title: "Articles"): under "All vignettes" it lists the package's single vignette, "Convert a CSV to SQLite by parts" (csv2sqlite.html); markup stripped in this dump. A sketch of that workflow follows below.]
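To give a concrete flavor of that vignette's workflow, here is a minimal sketch of converting a CSV to SQLite by parts. It assumes the RSQLite package (listed in Suggests) is installed; the table name "iris", the chunk size, and the generated example CSV are illustrative choices, and the vignette itself may proceed differently.

```r
library(bigreadr)
library(RSQLite)

csv <- fwrite2(iris[rep(1:150, 100), ])                   # example CSV (~15,000 rows)
con <- dbConnect(SQLite(), tempfile(fileext = ".sqlite"))

# Read the CSV in chunks of 5000 lines; each chunk is appended to a SQLite
# table and NULL is returned so that no chunk stays in memory.
big_fread1(csv, every_nlines = 5000, .transform = function(df) {
  dbWriteTable(con, "iris", df, append = TRUE)
  NULL
}, .combine = unlist)

dbGetQuery(con, "SELECT COUNT(*) AS n FROM iris")         # 15000
dbDisconnect(con)
```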
-------------------------------------------------------------------------------- /docs/authors.html: --------------------------------------------------------------------------------
[pkgdown-generated page "Authors and Citation". Author and maintainer: Florian Privé. Citation (source: DESCRIPTION): Privé F (2022). bigreadr: Read Large Text Files. R package version 0.2.5, https://github.com/privefl/bigreadr. The corresponding BibTeX entry:]

@Manual{,
  title = {bigreadr: Read Large Text Files},
  author = {Florian Privé},
  year = {2022},
  note = {R package version 0.2.5},
  url = {https://github.com/privefl/bigreadr},
}
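The same citation can also be pulled from an R session once the package is installed; citation() and toBibtex() are standard utils functions, shown here only as a convenience.

```r
# Print the citation for the installed version, and its BibTeX form
citation("bigreadr")
toBibtex(citation("bigreadr"))
```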
98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /docs/bootstrap-toc.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) 3 | * Copyright 2015 Aidan Feldman 4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ 5 | 6 | /* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ 7 | 8 | /* All levels of nav */ 9 | nav[data-toggle='toc'] .nav > li > a { 10 | display: block; 11 | padding: 4px 20px; 12 | font-size: 13px; 13 | font-weight: 500; 14 | color: #767676; 15 | } 16 | nav[data-toggle='toc'] .nav > li > a:hover, 17 | nav[data-toggle='toc'] .nav > li > a:focus { 18 | padding-left: 19px; 19 | color: #563d7c; 20 | text-decoration: none; 21 | background-color: transparent; 22 | border-left: 1px solid #563d7c; 23 | } 24 | nav[data-toggle='toc'] .nav > .active > a, 25 | nav[data-toggle='toc'] .nav > .active:hover > a, 26 | nav[data-toggle='toc'] .nav > .active:focus > a { 27 | padding-left: 18px; 28 | font-weight: bold; 29 | color: #563d7c; 30 | background-color: transparent; 31 | border-left: 2px solid #563d7c; 32 | } 33 | 34 | /* Nav: second level (shown on .active) */ 35 | nav[data-toggle='toc'] .nav .nav { 36 | display: none; /* Hide by default, but at >768px, show it */ 37 | padding-bottom: 10px; 38 | } 39 | nav[data-toggle='toc'] .nav .nav > li > a { 40 | padding-top: 1px; 41 | padding-bottom: 1px; 42 | padding-left: 30px; 43 | font-size: 12px; 44 | font-weight: normal; 45 | } 46 | nav[data-toggle='toc'] .nav .nav > li > a:hover, 47 | nav[data-toggle='toc'] .nav .nav > li > a:focus { 48 | padding-left: 29px; 49 | } 50 | nav[data-toggle='toc'] .nav .nav > .active > a, 51 | nav[data-toggle='toc'] .nav .nav > .active:hover > a, 52 | nav[data-toggle='toc'] .nav .nav > .active:focus > a { 53 | padding-left: 28px; 54 | font-weight: 500; 55 | } 56 | 57 | /* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ 58 | nav[data-toggle='toc'] .nav > .active > ul { 59 | display: block; 60 | } 61 | -------------------------------------------------------------------------------- /docs/bootstrap-toc.js: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) 3 | * Copyright 2015 Aidan Feldman 4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ 5 | (function() { 6 | 'use strict'; 7 | 8 | window.Toc = { 9 | helpers: { 10 | // return all matching elements in the set, or their descendants 11 | findOrFilter: function($el, selector) { 12 | // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ 13 | // http://stackoverflow.com/a/12731439/358804 14 | var $descendants = $el.find(selector); 15 | return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); 16 | }, 17 | 18 | generateUniqueIdBase: function(el) { 19 | var text = $(el).text(); 20 | var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); 21 | return anchor || el.tagName.toLowerCase(); 22 | }, 23 | 24 | generateUniqueId: function(el) { 25 | var anchorBase = this.generateUniqueIdBase(el); 26 | for (var i = 0; ; i++) { 27 | var anchor = anchorBase; 28 | if (i > 0) { 29 | // add suffix 30 | anchor += '-' + i; 31 | } 32 | // check if ID already exists 33 | if (!document.getElementById(anchor)) { 34 | return anchor; 35 | } 36 | } 37 | }, 38 | 39 | generateAnchor: function(el) { 40 | if (el.id) { 41 | return el.id; 42 | } else { 43 | var anchor = this.generateUniqueId(el); 44 | el.id = anchor; 45 | return anchor; 46 | } 47 | }, 48 | 49 | createNavList: function() { 50 | return $(''); 51 | }, 52 | 53 | createChildNavList: function($parent) { 54 | var $childList = this.createNavList(); 55 | $parent.append($childList); 56 | return $childList; 57 | }, 58 | 59 | generateNavEl: function(anchor, text) { 60 | var $a = $(''); 61 | $a.attr('href', '#' + anchor); 62 | $a.text(text); 63 | var $li = $('
  • '); 64 | $li.append($a); 65 | return $li; 66 | }, 67 | 68 | generateNavItem: function(headingEl) { 69 | var anchor = this.generateAnchor(headingEl); 70 | var $heading = $(headingEl); 71 | var text = $heading.data('toc-text') || $heading.text(); 72 | return this.generateNavEl(anchor, text); 73 | }, 74 | 75 | // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). 76 | getTopLevel: function($scope) { 77 | for (var i = 1; i <= 6; i++) { 78 | var $headings = this.findOrFilter($scope, 'h' + i); 79 | if ($headings.length > 1) { 80 | return i; 81 | } 82 | } 83 | 84 | return 1; 85 | }, 86 | 87 | // returns the elements for the top level, and the next below it 88 | getHeadings: function($scope, topLevel) { 89 | var topSelector = 'h' + topLevel; 90 | 91 | var secondaryLevel = topLevel + 1; 92 | var secondarySelector = 'h' + secondaryLevel; 93 | 94 | return this.findOrFilter($scope, topSelector + ',' + secondarySelector); 95 | }, 96 | 97 | getNavLevel: function(el) { 98 | return parseInt(el.tagName.charAt(1), 10); 99 | }, 100 | 101 | populateNav: function($topContext, topLevel, $headings) { 102 | var $context = $topContext; 103 | var $prevNav; 104 | 105 | var helpers = this; 106 | $headings.each(function(i, el) { 107 | var $newNav = helpers.generateNavItem(el); 108 | var navLevel = helpers.getNavLevel(el); 109 | 110 | // determine the proper $context 111 | if (navLevel === topLevel) { 112 | // use top level 113 | $context = $topContext; 114 | } else if ($prevNav && $context === $topContext) { 115 | // create a new level of the tree and switch to it 116 | $context = helpers.createChildNavList($prevNav); 117 | } // else use the current $context 118 | 119 | $context.append($newNav); 120 | 121 | $prevNav = $newNav; 122 | }); 123 | }, 124 | 125 | parseOps: function(arg) { 126 | var opts; 127 | if (arg.jquery) { 128 | opts = { 129 | $nav: arg 130 | }; 131 | } else { 132 | opts = arg; 133 | } 134 | opts.$scope = opts.$scope || $(document.body); 135 | return opts; 136 | } 137 | }, 138 | 139 | // accepts a jQuery object, or an options object 140 | init: function(opts) { 141 | opts = this.helpers.parseOps(opts); 142 | 143 | // ensure that the data attribute is in place for styling 144 | opts.$nav.attr('data-toggle', 'toc'); 145 | 146 | var $topContext = this.helpers.createChildNavList(opts.$nav); 147 | var topLevel = this.helpers.getTopLevel(opts.$scope); 148 | var $headings = this.helpers.getHeadings(opts.$scope, topLevel); 149 | this.helpers.populateNav($topContext, topLevel, $headings); 150 | } 151 | }; 152 | 153 | $(function() { 154 | $('nav[data-toggle="toc"]').each(function(i, el) { 155 | var $nav = $(el); 156 | Toc.init($nav); 157 | }); 158 | }); 159 | })(); 160 | -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e. 
"?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | $(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 59 | for (var idx in hierarchy) { 60 | words = words.concat(hierarchy[idx].matchedWords); 61 | } 62 | 63 | var content = hit._highlightResult.content; 64 | if (content) { 65 | words = words.concat(content.matchedWords); 66 | } 67 | 68 | // return unique words 69 | var words_uniq = [...new Set(words)]; 70 | return words_uniq; 71 | } 72 | 73 | function updateHitURL(hit) { 74 | 75 | var words = matchedWords(hit); 76 | var url = ""; 77 | 78 | if (hit.anchor) { 79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; 80 | } else { 81 | url = hit.url + '?q=' + escape(words.join(" ")); 82 | } 83 | 84 | return url; 85 | } 86 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticky footer */ 2 | 3 | /** 4 | * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ 5 | * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css 6 | * 7 | * .Site -> body > .container 8 | * .Site-content -> body > .container .row 9 | * .footer -> footer 10 | * 11 | * Key idea seems to be to ensure that .container and __all its parents__ 12 | * have height set to 100% 13 | * 14 | */ 15 | 16 | html, body { 17 | height: 100%; 18 | } 19 | 20 | body { 21 | position: relative; 22 | } 23 | 24 | body > .container { 25 | display: flex; 26 | height: 100%; 27 | flex-direction: column; 28 | } 29 | 30 | body > .container .row { 31 | flex: 1 0 auto; 32 | } 33 | 34 | footer { 35 | margin-top: 45px; 36 | padding: 35px 0 36px; 37 | border-top: 1px solid #e5e5e5; 38 | color: #666; 39 | display: flex; 40 | flex-shrink: 0; 41 | } 42 | footer p { 43 | margin-bottom: 0; 44 | } 45 | footer div { 46 | flex: 1; 47 | } 48 | footer .pkgdown { 49 | text-align: right; 50 | } 51 | footer p { 52 | margin-bottom: 0; 53 | } 54 | 55 | img.icon { 56 | float: right; 57 | } 58 | 59 | /* Ensure in-page images don't run 
outside their container */ 60 | .contents img { 61 | max-width: 100%; 62 | height: auto; 63 | } 64 | 65 | /* Fix bug in bootstrap (only seen in firefox) */ 66 | summary { 67 | display: list-item; 68 | } 69 | 70 | /* Typographic tweaking ---------------------------------*/ 71 | 72 | .contents .page-header { 73 | margin-top: calc(-60px + 1em); 74 | } 75 | 76 | dd { 77 | margin-left: 3em; 78 | } 79 | 80 | /* Section anchors ---------------------------------*/ 81 | 82 | a.anchor { 83 | display: none; 84 | margin-left: 5px; 85 | width: 20px; 86 | height: 20px; 87 | 88 | background-image: url(./link.svg); 89 | background-repeat: no-repeat; 90 | background-size: 20px 20px; 91 | background-position: center center; 92 | } 93 | 94 | h1:hover .anchor, 95 | h2:hover .anchor, 96 | h3:hover .anchor, 97 | h4:hover .anchor, 98 | h5:hover .anchor, 99 | h6:hover .anchor { 100 | display: inline-block; 101 | } 102 | 103 | /* Fixes for fixed navbar --------------------------*/ 104 | 105 | .contents h1, .contents h2, .contents h3, .contents h4 { 106 | padding-top: 60px; 107 | margin-top: -40px; 108 | } 109 | 110 | /* Navbar submenu --------------------------*/ 111 | 112 | .dropdown-submenu { 113 | position: relative; 114 | } 115 | 116 | .dropdown-submenu>.dropdown-menu { 117 | top: 0; 118 | left: 100%; 119 | margin-top: -6px; 120 | margin-left: -1px; 121 | border-radius: 0 6px 6px 6px; 122 | } 123 | 124 | .dropdown-submenu:hover>.dropdown-menu { 125 | display: block; 126 | } 127 | 128 | .dropdown-submenu>a:after { 129 | display: block; 130 | content: " "; 131 | float: right; 132 | width: 0; 133 | height: 0; 134 | border-color: transparent; 135 | border-style: solid; 136 | border-width: 5px 0 5px 5px; 137 | border-left-color: #cccccc; 138 | margin-top: 5px; 139 | margin-right: -10px; 140 | } 141 | 142 | .dropdown-submenu:hover>a:after { 143 | border-left-color: #ffffff; 144 | } 145 | 146 | .dropdown-submenu.pull-left { 147 | float: none; 148 | } 149 | 150 | .dropdown-submenu.pull-left>.dropdown-menu { 151 | left: -100%; 152 | margin-left: 10px; 153 | border-radius: 6px 0 6px 6px; 154 | } 155 | 156 | /* Sidebar --------------------------*/ 157 | 158 | #pkgdown-sidebar { 159 | margin-top: 30px; 160 | position: -webkit-sticky; 161 | position: sticky; 162 | top: 70px; 163 | } 164 | 165 | #pkgdown-sidebar h2 { 166 | font-size: 1.5em; 167 | margin-top: 1em; 168 | } 169 | 170 | #pkgdown-sidebar h2:first-child { 171 | margin-top: 0; 172 | } 173 | 174 | #pkgdown-sidebar .list-unstyled li { 175 | margin-bottom: 0.5em; 176 | } 177 | 178 | /* bootstrap-toc tweaks ------------------------------------------------------*/ 179 | 180 | /* All levels of nav */ 181 | 182 | nav[data-toggle='toc'] .nav > li > a { 183 | padding: 4px 20px 4px 6px; 184 | font-size: 1.5rem; 185 | font-weight: 400; 186 | color: inherit; 187 | } 188 | 189 | nav[data-toggle='toc'] .nav > li > a:hover, 190 | nav[data-toggle='toc'] .nav > li > a:focus { 191 | padding-left: 5px; 192 | color: inherit; 193 | border-left: 1px solid #878787; 194 | } 195 | 196 | nav[data-toggle='toc'] .nav > .active > a, 197 | nav[data-toggle='toc'] .nav > .active:hover > a, 198 | nav[data-toggle='toc'] .nav > .active:focus > a { 199 | padding-left: 5px; 200 | font-size: 1.5rem; 201 | font-weight: 400; 202 | color: inherit; 203 | border-left: 2px solid #878787; 204 | } 205 | 206 | /* Nav: second level (shown on .active) */ 207 | 208 | nav[data-toggle='toc'] .nav .nav { 209 | display: none; /* Hide by default, but at >768px, show it */ 210 | padding-bottom: 10px; 211 | } 212 | 213 
| nav[data-toggle='toc'] .nav .nav > li > a { 214 | padding-left: 16px; 215 | font-size: 1.35rem; 216 | } 217 | 218 | nav[data-toggle='toc'] .nav .nav > li > a:hover, 219 | nav[data-toggle='toc'] .nav .nav > li > a:focus { 220 | padding-left: 15px; 221 | } 222 | 223 | nav[data-toggle='toc'] .nav .nav > .active > a, 224 | nav[data-toggle='toc'] .nav .nav > .active:hover > a, 225 | nav[data-toggle='toc'] .nav .nav > .active:focus > a { 226 | padding-left: 15px; 227 | font-weight: 500; 228 | font-size: 1.35rem; 229 | } 230 | 231 | /* orcid ------------------------------------------------------------------- */ 232 | 233 | .orcid { 234 | font-size: 16px; 235 | color: #A6CE39; 236 | /* margins are required by official ORCID trademark and display guidelines */ 237 | margin-left:4px; 238 | margin-right:4px; 239 | vertical-align: middle; 240 | } 241 | 242 | /* Reference index & topics ----------------------------------------------- */ 243 | 244 | .ref-index th {font-weight: normal;} 245 | 246 | .ref-index td {vertical-align: top; min-width: 100px} 247 | .ref-index .icon {width: 40px;} 248 | .ref-index .alias {width: 40%;} 249 | .ref-index-icons .alias {width: calc(40% - 40px);} 250 | .ref-index .title {width: 60%;} 251 | 252 | .ref-arguments th {text-align: right; padding-right: 10px;} 253 | .ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px} 254 | .ref-arguments .name {width: 20%;} 255 | .ref-arguments .desc {width: 80%;} 256 | 257 | /* Nice scrolling for wide elements --------------------------------------- */ 258 | 259 | table { 260 | display: block; 261 | overflow: auto; 262 | } 263 | 264 | /* Syntax highlighting ---------------------------------------------------- */ 265 | 266 | pre, code, pre code { 267 | background-color: #f8f8f8; 268 | color: #333; 269 | } 270 | pre, pre code { 271 | white-space: pre-wrap; 272 | word-break: break-all; 273 | overflow-wrap: break-word; 274 | } 275 | 276 | pre { 277 | border: 1px solid #eee; 278 | } 279 | 280 | pre .img, pre .r-plt { 281 | margin: 5px 0; 282 | } 283 | 284 | pre .img img, pre .r-plt img { 285 | background-color: #fff; 286 | } 287 | 288 | code a, pre a { 289 | color: #375f84; 290 | } 291 | 292 | a.sourceLine:hover { 293 | text-decoration: none; 294 | } 295 | 296 | .fl {color: #1514b5;} 297 | .fu {color: #000000;} /* function */ 298 | .ch,.st {color: #036a07;} /* string */ 299 | .kw {color: #264D66;} /* keyword */ 300 | .co {color: #888888;} /* comment */ 301 | 302 | .error {font-weight: bolder;} 303 | .warning {font-weight: bolder;} 304 | 305 | /* Clipboard --------------------------*/ 306 | 307 | .hasCopyButton { 308 | position: relative; 309 | } 310 | 311 | .btn-copy-ex { 312 | position: absolute; 313 | right: 0; 314 | top: 0; 315 | visibility: hidden; 316 | } 317 | 318 | .hasCopyButton:hover button.btn-copy-ex { 319 | visibility: visible; 320 | } 321 | 322 | /* headroom.js ------------------------ */ 323 | 324 | .headroom { 325 | will-change: transform; 326 | transition: transform 200ms linear; 327 | } 328 | .headroom--pinned { 329 | transform: translateY(0%); 330 | } 331 | .headroom--unpinned { 332 | transform: translateY(-100%); 333 | } 334 | 335 | /* mark.js ----------------------------*/ 336 | 337 | mark { 338 | background-color: rgba(255, 255, 51, 0.5); 339 | border-bottom: 2px solid rgba(255, 153, 51, 0.3); 340 | padding: 1px; 341 | } 342 | 343 | /* vertical spacing after htmlwidgets */ 344 | .html-widget { 345 | margin-bottom: 10px; 346 | } 347 | 348 | /* fontawesome ------------------------ */ 349 | 350 | .fab 
{ 351 | font-family: "Font Awesome 5 Brands" !important; 352 | } 353 | 354 | /* don't display links in code chunks when printing */ 355 | /* source: https://stackoverflow.com/a/10781533 */ 356 | @media print { 357 | code a:link:after, code a:visited:after { 358 | content: ""; 359 | } 360 | } 361 | 362 | /* Section anchors --------------------------------- 363 | Added in pandoc 2.11: https://github.com/jgm/pandoc-templates/commit/9904bf71 364 | */ 365 | 366 | div.csl-bib-body { } 367 | div.csl-entry { 368 | clear: both; 369 | } 370 | .hanging-indent div.csl-entry { 371 | margin-left:2em; 372 | text-indent:-2em; 373 | } 374 | div.csl-left-margin { 375 | min-width:2em; 376 | float:left; 377 | } 378 | div.csl-right-inline { 379 | margin-left:2em; 380 | padding-left:1em; 381 | } 382 | div.csl-indent { 383 | margin-left: 2em; 384 | } 385 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | /* http://gregfranko.com/blog/jquery-best-practices/ */ 2 | (function($) { 3 | $(function() { 4 | 5 | $('.navbar-fixed-top').headroom(); 6 | 7 | $('body').css('padding-top', $('.navbar').height() + 10); 8 | $(window).resize(function(){ 9 | $('body').css('padding-top', $('.navbar').height() + 10); 10 | }); 11 | 12 | $('[data-toggle="tooltip"]').tooltip(); 13 | 14 | var cur_path = paths(location.pathname); 15 | var links = $("#navbar ul li a"); 16 | var max_length = -1; 17 | var pos = -1; 18 | for (var i = 0; i < links.length; i++) { 19 | if (links[i].getAttribute("href") === "#") 20 | continue; 21 | // Ignore external links 22 | if (links[i].host !== location.host) 23 | continue; 24 | 25 | var nav_path = paths(links[i].pathname); 26 | 27 | var length = prefix_length(nav_path, cur_path); 28 | if (length > max_length) { 29 | max_length = length; 30 | pos = i; 31 | } 32 | } 33 | 34 | // Add class to parent
  • , and enclosing
  • if in dropdown 35 | if (pos >= 0) { 36 | var menu_anchor = $(links[pos]); 37 | menu_anchor.parent().addClass("active"); 38 | menu_anchor.closest("li.dropdown").addClass("active"); 39 | } 40 | }); 41 | 42 | function paths(pathname) { 43 | var pieces = pathname.split("/"); 44 | pieces.shift(); // always starts with / 45 | 46 | var end = pieces[pieces.length - 1]; 47 | if (end === "index.html" || end === "") 48 | pieces.pop(); 49 | return(pieces); 50 | } 51 | 52 | // Returns -1 if not found 53 | function prefix_length(needle, haystack) { 54 | if (needle.length > haystack.length) 55 | return(-1); 56 | 57 | // Special case for length-0 haystack, since for loop won't run 58 | if (haystack.length === 0) { 59 | return(needle.length === 0 ? 0 : -1); 60 | } 61 | 62 | for (var i = 0; i < haystack.length; i++) { 63 | if (needle[i] != haystack[i]) 64 | return(i); 65 | } 66 | 67 | return(haystack.length); 68 | } 69 | 70 | /* Clipboard --------------------------*/ 71 | 72 | function changeTooltipMessage(element, msg) { 73 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 74 | element.setAttribute('data-original-title', msg); 75 | $(element).tooltip('show'); 76 | element.setAttribute('data-original-title', tooltipOriginalTitle); 77 | } 78 | 79 | if(ClipboardJS.isSupported()) { 80 | $(document).ready(function() { 81 | var copyButton = ""; 82 | 83 | $("div.sourceCode").addClass("hasCopyButton"); 84 | 85 | // Insert copy buttons: 86 | $(copyButton).prependTo(".hasCopyButton"); 87 | 88 | // Initialize tooltips: 89 | $('.btn-copy-ex').tooltip({container: 'body'}); 90 | 91 | // Initialize clipboard: 92 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { 93 | text: function(trigger) { 94 | return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, ""); 95 | } 96 | }); 97 | 98 | clipboardBtnCopies.on('success', function(e) { 99 | changeTooltipMessage(e.trigger, 'Copied!'); 100 | e.clearSelection(); 101 | }); 102 | 103 | clipboardBtnCopies.on('error', function() { 104 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 105 | }); 106 | }); 107 | } 108 | })(window.jQuery || window.$) 109 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 2.19.2 2 | pkgdown: 2.0.6 3 | pkgdown_sha: ~ 4 | articles: 5 | csv2sqlite: csv2sqlite.html 6 | last_built: 2022-12-06T14:39Z 7 | 8 | -------------------------------------------------------------------------------- /docs/reference/Rplot001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/privefl/bigreadr/2d8806f1067b19610a2d633bf2e863b910570d5d/docs/reference/Rplot001.png -------------------------------------------------------------------------------- /docs/reference/big_fread1.html: -------------------------------------------------------------------------------- 1 | 2 | Read large text file — big_fread1 • bigreadr 6 | 7 | 8 |
[Remainder of the pkgdown reference page for big_fread1() (title: "Read large text file", i.e. read a large text file by splitting lines). Usage: big_fread1(file, every_nlines, .transform = identity, .combine = rbind_df, skip = 0, ..., print_timings = TRUE). The Arguments and Value sections mirror the roxygen documentation of big_fread1() in R/read.R; navbar and footer markup ("Site built with pkgdown 2.0.6.") stripped in this dump.]
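Since no example is rendered on this page, here is a small self-contained sketch of a typical big_fread1() call; the chunk size and the filtering step are arbitrary choices for illustration.

```r
library(bigreadr)

# Build a CSV of 15,000 rows so that splitting actually has something to do
csv <- fwrite2(iris[rep(1:150, 100), ])
nlines(csv)  # 15001 (header included)

# Read it back in parts of 5000 lines, keeping only one species in each part;
# the parts are recombined with rbind_df(), the default .combine
setosa_only <- big_fread1(csv, every_nlines = 5e3,
                          .transform = function(df) df[df$Species == "setosa", ])
str(setosa_only)
```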
-------------------------------------------------------------------------------- /docs/reference/big_fread2.html: --------------------------------------------------------------------------------
[Remainder of the pkgdown reference page for big_fread2() (title: "Read large text file", i.e. read a large text file by splitting columns). Usage: big_fread2(file, nb_parts = NULL, .transform = identity, .combine = cbind_df, skip = 0, select = NULL, progress = FALSE, part_size = 500 * 1024^2, ...). The Arguments and Value sections mirror the roxygen documentation of big_fread2() in R/read.R; navbar and footer markup stripped in this dump.]
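This page has no rendered example either, so below is a short sketch of big_fread2(), which splits the selected columns into blocks rather than splitting lines; using two parts and keeping only numeric columns are arbitrary choices for illustration.

```r
library(bigreadr)
csv <- fwrite2(iris)

# Read the five columns in two blocks, dropping non-numeric columns in each
# block; blocks are recombined with cbind_df(), the default .combine
num_only <- big_fread2(csv, nb_parts = 2,
                       .transform = function(df) df[sapply(df, is.numeric)])
str(num_only)  # 150 obs. of 4 numeric variables (Species dropped)
```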
-------------------------------------------------------------------------------- /docs/reference/bigreadr-package.html: --------------------------------------------------------------------------------
[Remainder of the pkgdown page for the package itself (title: "bigreadr: Read Large Text Files"). It repeats the DESCRIPTION text (read large text files by splitting them into smaller files, plus convenient wrappers around fread() and fwrite() from data.table), a See also section, and lists Florian Privé <florian.prive.21@gmail.com> as author and maintainer; markup stripped in this dump.]
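To complement this overview, here is a short self-contained tour of the lower-level helpers that big_fread1() builds on (nlines(), split_file(), get_split_files()); the chunk size is arbitrary and the exact number of parts depends on it.

```r
library(bigreadr)

csv <- fwrite2(iris)   # write iris to a temporary CSV
nlines(csv)            # 151 (150 rows + header)

# Split into parts of at most 50 lines, repeating the header in each part
infos <- split_file(csv, every_nlines = 50, repeat_header = TRUE)
parts <- get_split_files(infos)
length(parts)          # number of part files created
sapply(parts, nlines)  # lines per part

# Each part can then be read (and processed) independently
str(fread2(parts[1]))
```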
-------------------------------------------------------------------------------- /docs/reference/cbind_df.html: --------------------------------------------------------------------------------
[Remainder of the pkgdown reference page for cbind_df() (title: "Merge data frames"). Usage: cbind_df(list_df). The Arguments and Value sections mirror the roxygen documentation in R/bind.R, and the rendered example shows str(iris) (150 obs. of 5 variables) and str(cbind_df(list(iris, iris))) (150 obs. of 10 variables); markup stripped in this dump.]
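As a complement to the rendered example, the sketch below shows cbind_df() together with its row-wise counterpart rbind_df() on plain in-memory data frames; nothing here depends on files.

```r
library(bigreadr)

# Stack data frames that share the same variables (row-wise merge)
rbind_df(list(head(iris, 2), tail(iris, 2)))

# Bind data frames that share the same observations (column-wise merge)
str(cbind_df(list(iris[1:2], iris["Species"])))
```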
-------------------------------------------------------------------------------- /docs/reference/fread2.html: --------------------------------------------------------------------------------
    9 |
    47 | 48 | 49 | 50 |
    51 |
    52 | 57 | 58 |
    59 |

    Read text file(s)

    60 |
    61 | 62 |
    63 |
    fread2(input, ..., data.table = FALSE,
     64 |   nThread = getOption("bigreadr.nThread"))
    65 |
    66 | 67 |
    68 |

    Arguments

    69 |
    input
    70 |

Path to the file(s) that you want to read from. 71 | This can also be a command, some text or a URL. 72 | If a vector of inputs is provided, resulting data frames are appended.

    73 | 74 | 75 |
    ...
    76 |

    Other arguments to be passed to data.table::fread.

    77 | 78 | 79 |
    data.table
    80 |

    Whether to return a data.table or just a data.frame? 81 | Default is FALSE (and is the opposite of data.table::fread).

    82 | 83 | 84 |
    nThread
    85 |

    Number of threads to use. Default uses all threads minus one.

    86 | 87 |
    88 |
    89 |

    Value

    90 | 91 | 92 |

    A data.frame by default; a data.table when data.table = TRUE.

    93 |
    94 | 95 |
    96 |

    Examples

    97 |
    tmp <- fwrite2(iris)
     98 | iris2 <- fread2(tmp)
     99 | all.equal(iris2, iris)  ## fread doesn't use factors
    100 | #> [1] "Component \"Species\": Modes: character, numeric"                      
    101 | #> [2] "Component \"Species\": Attributes: < target is NULL, current is list >"
    102 | #> [3] "Component \"Species\": target is character, current is factor"         
    103 | 
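Because a vector of inputs is appended row-wise, fread2() can read a set of file parts in a single call. A minimal sketch, assuming the inputs share the same columns:

library(bigreadr)
tmp <- fwrite2(iris)
both <- fread2(c(tmp, tmp))  # both inputs are read, then appended
nrow(both)                   # 300 rows, i.e. iris twice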
    104 |
    105 |
    106 | 109 |
    110 | 111 | 112 |
    115 | 116 |
    117 |


    118 |
    119 | 120 |
    121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /docs/reference/fwrite2.html: -------------------------------------------------------------------------------- 1 | 2 | Write a data frame to a text file — fwrite2 • bigreadr 6 | 7 | 8 |
    9 |
    47 | 48 | 49 | 50 |
    51 |
    52 | 57 | 58 |
    59 |

    Write a data frame to a text file

    60 |
    61 | 62 |
    63 |
    fwrite2(x, file = tempfile(), ..., quote = FALSE,
     64 |   nThread = getOption("bigreadr.nThread"))
    65 |
    66 | 67 |
    68 |

    Arguments

    69 |
    x
    70 |

    Data frame to write.

    71 | 72 | 73 |
    file
    74 |

Path to the file that you want to write to. 75 | Default uses tempfile().

    76 | 77 | 78 |
    ...
    79 |

    Other arguments to be passed to data.table::fwrite.

    80 | 81 | 82 |
    quote
    83 |

    Whether to quote strings (default is FALSE).

    84 | 85 | 86 |
    nThread
    87 |

    Number of threads to use. Default uses all threads minus one.

    88 | 89 |
    90 |
    91 |

    Value

    92 | 93 | 94 |

    Input parameter file, invisibly.

    95 |
    96 | 97 |
    98 |

    Examples

    99 |
    tmp <- fwrite2(iris)
    100 | iris2 <- fread2(tmp)
    101 | all.equal(iris2, iris)  ## fread doesn't use factors
    102 | #> [1] "Component \"Species\": Modes: character, numeric"                      
    103 | #> [2] "Component \"Species\": Attributes: < target is NULL, current is list >"
    104 | #> [3] "Component \"Species\": target is character, current is factor"         
    105 | 
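Arguments other than quote and nThread are forwarded to data.table::fwrite(), so for instance the field separator can be changed. A minimal sketch; the ';' separator is only for illustration:

library(bigreadr)
csv <- fwrite2(iris, tempfile(fileext = ".csv"), sep = ";")
readLines(csv, n = 1)  # header row written with ';' as separator
iris2 <- fread2(csv)   # fread() detects the separator automatically
dim(iris2)             # 150 x 5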
    106 |
    107 |
    108 | 111 |
    112 | 113 | 114 |
    117 | 118 |
    119 |


    120 |
    121 | 122 |
    123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /docs/reference/index.html: -------------------------------------------------------------------------------- 1 | 2 | Function reference • bigreadr 6 | 7 | 8 |
    9 |
    47 | 48 | 49 | 50 |
    51 |
    52 | 55 | 56 | 60 | 63 | 64 | 67 | 68 | 71 | 72 | 75 | 76 | 79 | 80 | 83 | 84 | 87 | 88 | 91 | 92 |
    57 |

    All functions

    58 |

    59 |
    61 |

    big_fread1()

    62 |

    Read large text file

    65 |

    big_fread2()

    66 |

    Read large text file

    69 |

    cbind_df()

    70 |

    Merge data frames

    73 |

    fread2()

    74 |

    Read text file(s)

    77 |

    fwrite2()

    78 |

    Write a data frame to a text file

    81 |

    nlines()

    82 |

    Number of lines

    85 |

    rbind_df()

    86 |

    Merge data frames

    89 |

    split_file() get_split_files()

    90 |

    Split file every nlines

    93 | 94 | 97 |
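As a complement to this index, a minimal sketch contrasting the two big_fread*() entry points (splitting by lines vs. by columns); the chunk sizes are only illustrative:

library(bigreadr)
csv <- fwrite2(iris, tempfile(fileext = ".csv"))
df1 <- big_fread1(csv, every_nlines = 50, print_timings = FALSE)  # split by lines, parts are row-bound
df2 <- big_fread2(csv, nb_parts = 2)                              # split by columns, parts are column-bound
identical(dim(df1), dim(df2))  # both give 150 x 5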
    98 | 99 | 100 |
    103 | 104 |
    105 |


    106 |
    107 | 108 |
    109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /docs/reference/nlines.html: -------------------------------------------------------------------------------- 1 | 2 | Number of lines — nlines • bigreadr 6 | 7 | 8 |
    9 |
    47 | 48 | 49 | 50 |
    51 |
    52 | 57 | 58 |
    59 |

    Get the number of lines of a file.

    60 |
    61 | 62 |
    63 |
    nlines(file)
    64 |
    65 | 66 |
    67 |

    Arguments

    68 |
    file
    69 |

    Path of the file.

    70 | 71 |
    72 |
    73 |

    Value

    74 | 75 | 76 |

    The number of lines as one integer.

    77 |
    78 | 79 |
    80 |

    Examples

    81 |
    tmp <- fwrite2(iris)
     82 | nlines(tmp)
     83 | #> [1] 151
     84 | 
     85 | 
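nlines() is convenient for choosing every_nlines when a file should be split into a target number of parts. A minimal sketch aiming for roughly three parts:

library(bigreadr)
tmp <- fwrite2(iris)
n <- nlines(tmp)  # 151 (header + 150 rows)
infos <- split_file(tmp, every_nlines = ceiling(n / 3))
length(get_split_files(infos))  # 3 parts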
    86 |
    87 |
    88 | 91 |
    92 | 93 | 94 |
    97 | 98 |
    99 |


    100 |
    101 | 102 |
    103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /docs/reference/rbind_df.html: -------------------------------------------------------------------------------- 1 | 2 | Merge data frames — rbind_df • bigreadr 6 | 7 | 8 |
    9 |
    47 | 48 | 49 | 50 |
    51 |
    52 | 57 | 58 |
    59 |

    Merge data frames

    60 |
    61 | 62 |
    63 |
    rbind_df(list_df)
    64 |
    65 | 66 |
    67 |

    Arguments

    68 |
    list_df
    69 |

    A list of multiple data frames with the same variables in the 70 | same order.

    71 | 72 |
    73 |
    74 |

    Value

    75 | 76 | 77 |

    One merged data frame with the names of the first input data frame.

    78 |
    79 | 80 |
    81 |

    Examples

    82 |
    str(iris)
     83 | #> 'data.frame':	150 obs. of  5 variables:
     84 | #>  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
     85 | #>  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
     86 | #>  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
     87 | #>  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
     88 | #>  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
     89 | str(rbind_df(list(iris, iris)))
     90 | #> 'data.frame':	300 obs. of  5 variables:
     91 | #>  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
     92 | #>  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
     93 | #>  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
     94 | #>  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
     95 | #>  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
     96 | 
     97 | 
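rbind_df() is the natural way to put the parts of a split file back together (it is also the default .combine of big_fread1()). A minimal sketch, using repeat_header = TRUE so that each part can be read on its own:

library(bigreadr)
tmp <- fwrite2(iris, tempfile(fileext = ".csv"))
infos <- split_file(tmp, every_nlines = 60, repeat_header = TRUE)
parts <- lapply(get_split_files(infos), fread2)  # read each part separately
full  <- rbind_df(parts)
dim(full)  # 150 x 5 again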
    98 |
    99 |
    100 | 103 |
    104 | 105 | 106 |
    109 | 110 |
    111 |


    112 |
    113 | 114 |
    115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /docs/reference/split_file.html: -------------------------------------------------------------------------------- 1 | 2 | Split file every nlines — split_file • bigreadr 7 | 8 | 9 |
    10 |
    48 | 49 | 50 | 51 |
    52 |
    53 | 58 | 59 |
    60 |

    Split file every nlines

    61 |

    Get files from splitting.

    62 |
    63 | 64 |
    65 |
    split_file(file, every_nlines, prefix_out = tempfile(),
     66 |   repeat_header = FALSE)
     67 | 
     68 | get_split_files(split_file_out)
    69 |
    70 | 71 |
    72 |

    Arguments

    73 |
    file
    74 |

    Path to file that you want to split.

    75 | 76 | 77 |
    every_nlines
    78 |

    Maximum number of lines in new file parts.

    79 | 80 | 81 |
    prefix_out
    82 |

    Prefix for created files. Default uses tempfile().

    83 | 84 | 85 |
    repeat_header
    86 |

    Whether to repeat the header row in each file. 87 | Default is FALSE.

    88 | 89 | 90 |
    split_file_out
    91 |

    Output of split_file.

    92 | 93 |
    94 |
    95 |

    Value

    96 | 97 | 98 |

    A list with

    • name_in: input parameter file,

    • 99 |
• prefix_out: input parameter prefix_out,

    • 100 |
    • nfiles: Number of files (parts) created,

    • 101 |
    • nlines_part: input parameter every_nlines,

    • 102 |
• nlines_all: total number of lines of file,
• repeat_header: input parameter repeat_header.

    • 103 |

    Vector of file paths created by split_file.

    104 |
    105 | 106 |
    107 |

    Examples

    108 |
    tmp <- fwrite2(iris)
    109 | infos <- split_file(tmp, 100)
    110 | str(infos)
    111 | #> List of 6
    112 | #>  $ name_in      : chr "C:\\Users\\au639593\\AppData\\Local\\Temp\\Rtmpq2HStE\\file40f821d7102d"
    113 | #>  $ prefix_out   : chr "C:\\Users\\au639593\\AppData\\Local\\Temp\\Rtmpq2HStE\\file40f855f46bc3"
    114 | #>  $ nfiles       : int 2
    115 | #>  $ nlines_part  : int 100
    116 | #>  $ nlines_all   : num 151
    117 | #>  $ repeat_header: logi FALSE
    118 | get_split_files(infos)
    119 | #> [1] "C:\\Users\\au639593\\AppData\\Local\\Temp\\Rtmpq2HStE\\file40f855f46bc3_1.txt"
    120 | #> [2] "C:\\Users\\au639593\\AppData\\Local\\Temp\\Rtmpq2HStE\\file40f855f46bc3_2.txt"
    121 | 
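With repeat_header = TRUE each part gets its own header row, so the parts can be read back directly, here in a single fread2() call (a vector of inputs is appended). A minimal sketch; the part size of 50 lines is arbitrary:

library(bigreadr)
csv <- fwrite2(iris, tempfile(fileext = ".csv"))
infos <- split_file(csv, every_nlines = 50, repeat_header = TRUE)
parts <- get_split_files(infos)
iris2 <- fread2(parts)  # parts are read and appended row-wise
dim(iris2)              # 150 x 5, the original data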
    122 |
    123 |
    124 | 127 |
    128 | 129 | 130 |
    133 | 134 |
    135 |


    136 |
    137 | 138 |
    139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /docs/sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /404.html 5 | 6 | 7 | /articles/csv2sqlite.html 8 | 9 | 10 | /articles/index.html 11 | 12 | 13 | /authors.html 14 | 15 | 16 | /index.html 17 | 18 | 19 | /reference/bigreadr-package.html 20 | 21 | 22 | /reference/big_fread1.html 23 | 24 | 25 | /reference/big_fread2.html 26 | 27 | 28 | /reference/cbind_df.html 29 | 30 | 31 | /reference/fread2.html 32 | 33 | 34 | /reference/fwrite2.html 35 | 36 | 37 | /reference/index.html 38 | 39 | 40 | /reference/nlines.html 41 | 42 | 43 | /reference/rbind_df.html 44 | 45 | 46 | /reference/split_file.html 47 | 48 | 49 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | Filebacked 2 | Florian 3 | fpeek 4 | fread 5 | fwrite 6 | nlines 7 | Privà 8 | Privé 9 | -------------------------------------------------------------------------------- /inst/testdata/cars_with_newline.csv: -------------------------------------------------------------------------------- 1 | speed,dist 2 | 4,2 3 | 4,10 4 | 7,4 5 | 7,22 6 | 8,16 7 | 9,10 8 | 10,18 9 | 10,26 10 | 10,34 11 | 11,17 12 | 11,28 13 | 12,14 14 | 12,20 15 | 12,24 16 | 12,28 17 | 13,26 18 | 13,34 19 | 13,34 20 | 13,46 21 | 14,26 22 | 14,36 23 | 14,60 24 | 14,80 25 | 15,20 26 | 15,26 27 | 15,54 28 | 16,32 29 | 16,40 30 | 17,32 31 | 17,40 32 | 17,50 33 | 18,42 34 | 18,56 35 | 18,76 36 | 18,84 37 | 19,36 38 | 19,46 39 | 19,68 40 | 20,32 41 | 20,48 42 | 20,52 43 | 20,56 44 | 20,64 45 | 22,66 46 | 23,54 47 | 24,70 48 | 24,92 49 | 24,93 50 | 24,120 51 | 25,85 52 | -------------------------------------------------------------------------------- /inst/testdata/cars_without_newline.csv: -------------------------------------------------------------------------------- 1 | speed,dist 2 | 4,2 3 | 4,10 4 | 7,4 5 | 7,22 6 | 8,16 7 | 9,10 8 | 10,18 9 | 10,26 10 | 10,34 11 | 11,17 12 | 11,28 13 | 12,14 14 | 12,20 15 | 12,24 16 | 12,28 17 | 13,26 18 | 13,34 19 | 13,34 20 | 13,46 21 | 14,26 22 | 14,36 23 | 14,60 24 | 14,80 25 | 15,20 26 | 15,26 27 | 15,54 28 | 16,32 29 | 16,40 30 | 17,32 31 | 17,40 32 | 17,50 33 | 18,42 34 | 18,56 35 | 18,76 36 | 18,84 37 | 19,36 38 | 19,46 39 | 19,68 40 | 20,32 41 | 20,48 42 | 20,52 43 | 20,56 44 | 20,64 45 | 22,66 46 | 23,54 47 | 24,70 48 | 24,92 49 | 24,93 50 | 24,120 51 | 25,85 -------------------------------------------------------------------------------- /inst/testdata/wrong_string.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/privefl/bigreadr/2d8806f1067b19610a2d633bf2e863b910570d5d/inst/testdata/wrong_string.rds -------------------------------------------------------------------------------- /man/big_fread1.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read.R 3 | \name{big_fread1} 4 | \alias{big_fread1} 5 | \title{Read large text file} 6 | \usage{ 7 | big_fread1(file, every_nlines, .transform = identity, 8 | .combine = rbind_df, skip = 0, ..., print_timings = TRUE) 9 | } 10 | \arguments{ 11 | \item{file}{Path to file that you want to read.} 12 | 13 | \item{every_nlines}{Maximum number of lines in new file parts.} 14 | 15 
| \item{.transform}{Function to transform each data frame corresponding to each 16 | part of the \code{file}. Default doesn't change anything.} 17 | 18 | \item{.combine}{Function to combine results (list of data frames).} 19 | 20 | \item{skip}{Number of lines to skip at the beginning of \code{file}.} 21 | 22 | \item{...}{Other arguments to be passed to \link[data.table:fread]{data.table::fread}, 23 | excepted \code{input}, \code{file}, \code{skip}, \code{col.names} and \code{showProgress}.} 24 | 25 | \item{print_timings}{Whether to print timings? Default is \code{TRUE}.} 26 | } 27 | \value{ 28 | A \code{data.frame} by default; a \code{data.table} when \code{data.table = TRUE}. 29 | } 30 | \description{ 31 | Read large text file by splitting lines. 32 | } 33 | -------------------------------------------------------------------------------- /man/big_fread2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read.R 3 | \name{big_fread2} 4 | \alias{big_fread2} 5 | \title{Read large text file} 6 | \usage{ 7 | big_fread2(file, nb_parts = NULL, .transform = identity, 8 | .combine = cbind_df, skip = 0, select = NULL, progress = FALSE, 9 | part_size = 500 * 1024^2, ...) 10 | } 11 | \arguments{ 12 | \item{file}{Path to file that you want to read.} 13 | 14 | \item{nb_parts}{Number of parts in which to split reading (and transforming). 15 | Parts are referring to blocks of selected columns. 16 | Default uses \code{part_size} to set a good value.} 17 | 18 | \item{.transform}{Function to transform each data frame corresponding to each 19 | block of selected columns. Default doesn't change anything.} 20 | 21 | \item{.combine}{Function to combine results (list of data frames).} 22 | 23 | \item{skip}{Number of lines to skip at the beginning of \code{file}.} 24 | 25 | \item{select}{Indices of columns to keep (sorted). Default keeps them all.} 26 | 27 | \item{progress}{Show progress? Default is \code{FALSE}.} 28 | 29 | \item{part_size}{Size of the parts if \code{nb_parts} is not supplied. 30 | Default is \code{500 * 1024^2} (500 MB).} 31 | 32 | \item{...}{Other arguments to be passed to \link[data.table:fread]{data.table::fread}, 33 | excepted \code{input}, \code{file}, \code{skip}, \code{select} and \code{showProgress}.} 34 | } 35 | \value{ 36 | The outputs of \code{fread2} + \code{.transform}, combined with \code{.combine}. 37 | } 38 | \description{ 39 | Read large text file by splitting columns. 40 | } 41 | -------------------------------------------------------------------------------- /man/bigreadr-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bigreadr-package.R 3 | \docType{package} 4 | \name{bigreadr-package} 5 | \alias{bigreadr} 6 | \alias{bigreadr-package} 7 | \title{bigreadr: Read Large Text Files} 8 | \description{ 9 | Read large text files by splitting them in smaller files. 10 | Package 'bigreadr' also provides some convenient wrappers around fread() 11 | and fwrite() from package 'data.table'. 
12 | } 13 | \seealso{ 14 | Useful links: 15 | \itemize{ 16 | \item \url{https://github.com/privefl/bigreadr} 17 | \item Report bugs at \url{https://github.com/privefl/bigreadr/issues} 18 | } 19 | 20 | } 21 | \author{ 22 | \strong{Maintainer}: Florian Privé \email{florian.prive.21@gmail.com} 23 | 24 | } 25 | \keyword{internal} 26 | -------------------------------------------------------------------------------- /man/cbind_df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bind.R 3 | \name{cbind_df} 4 | \alias{cbind_df} 5 | \title{Merge data frames} 6 | \usage{ 7 | cbind_df(list_df) 8 | } 9 | \arguments{ 10 | \item{list_df}{A list of multiple data frames with the same observations in 11 | the same order.} 12 | } 13 | \value{ 14 | One merged data frame. 15 | } 16 | \description{ 17 | Merge data frames 18 | } 19 | \examples{ 20 | str(iris) 21 | str(cbind_df(list(iris, iris))) 22 | 23 | } 24 | -------------------------------------------------------------------------------- /man/fread2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read.R 3 | \name{fread2} 4 | \alias{fread2} 5 | \title{Read text file(s)} 6 | \usage{ 7 | fread2(input, ..., data.table = FALSE, 8 | nThread = getOption("bigreadr.nThread")) 9 | } 10 | \arguments{ 11 | \item{input}{Path to the file(s) that you want to read from. 12 | This can also be a command, some text or an URL. 13 | If a vector of inputs is provided, resulting data frames are appended.} 14 | 15 | \item{...}{Other arguments to be passed to \link[data.table:fread]{data.table::fread}.} 16 | 17 | \item{data.table}{Whether to return a \code{data.table} or just a \code{data.frame}? 18 | Default is \code{FALSE} (and is the opposite of \link[data.table:fread]{data.table::fread}).} 19 | 20 | \item{nThread}{Number of threads to use. Default uses all threads minus one.} 21 | } 22 | \value{ 23 | A \code{data.frame} by default; a \code{data.table} when \code{data.table = TRUE}. 24 | } 25 | \description{ 26 | Read text file(s) 27 | } 28 | \examples{ 29 | tmp <- fwrite2(iris) 30 | iris2 <- fread2(tmp) 31 | all.equal(iris2, iris) ## fread doesn't use factors 32 | } 33 | -------------------------------------------------------------------------------- /man/fwrite2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read.R 3 | \name{fwrite2} 4 | \alias{fwrite2} 5 | \title{Write a data frame to a text file} 6 | \usage{ 7 | fwrite2(x, file = tempfile(), ..., quote = FALSE, 8 | nThread = getOption("bigreadr.nThread")) 9 | } 10 | \arguments{ 11 | \item{x}{Data frame to write.} 12 | 13 | \item{file}{Path to the file that you want to write to. 14 | Defaults uses \code{tempfile()}.} 15 | 16 | \item{...}{Other arguments to be passed to \link[data.table:fwrite]{data.table::fwrite}.} 17 | 18 | \item{quote}{Whether to quote strings (default is \code{FALSE}).} 19 | 20 | \item{nThread}{Number of threads to use. Default uses all threads minus one.} 21 | } 22 | \value{ 23 | Input parameter \code{file}, invisibly. 
24 | } 25 | \description{ 26 | Write a data frame to a text file 27 | } 28 | \examples{ 29 | tmp <- fwrite2(iris) 30 | iris2 <- fread2(tmp) 31 | all.equal(iris2, iris) ## fread doesn't use factors 32 | } 33 | -------------------------------------------------------------------------------- /man/nlines.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nlines-split.R 3 | \name{nlines} 4 | \alias{nlines} 5 | \title{Number of lines} 6 | \usage{ 7 | nlines(file) 8 | } 9 | \arguments{ 10 | \item{file}{Path of the file.} 11 | } 12 | \value{ 13 | The number of lines as one integer. 14 | } 15 | \description{ 16 | Get the number of lines of a file. 17 | } 18 | \examples{ 19 | tmp <- fwrite2(iris) 20 | nlines(tmp) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/rbind_df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bind.R 3 | \name{rbind_df} 4 | \alias{rbind_df} 5 | \title{Merge data frames} 6 | \usage{ 7 | rbind_df(list_df) 8 | } 9 | \arguments{ 10 | \item{list_df}{A list of multiple data frames with the same variables in the 11 | same order.} 12 | } 13 | \value{ 14 | One merged data frame with the names of the first input data frame. 15 | } 16 | \description{ 17 | Merge data frames 18 | } 19 | \examples{ 20 | str(iris) 21 | str(rbind_df(list(iris, iris))) 22 | 23 | } 24 | -------------------------------------------------------------------------------- /man/split_file.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nlines-split.R 3 | \name{split_file} 4 | \alias{split_file} 5 | \alias{get_split_files} 6 | \title{Split file every nlines} 7 | \usage{ 8 | split_file(file, every_nlines, prefix_out = tempfile(), 9 | repeat_header = FALSE) 10 | 11 | get_split_files(split_file_out) 12 | } 13 | \arguments{ 14 | \item{file}{Path to file that you want to split.} 15 | 16 | \item{every_nlines}{Maximum number of lines in new file parts.} 17 | 18 | \item{prefix_out}{Prefix for created files. Default uses \code{tempfile()}.} 19 | 20 | \item{repeat_header}{Whether to repeat the header row in each file. 21 | Default is \code{FALSE}.} 22 | 23 | \item{split_file_out}{Output of \link{split_file}.} 24 | } 25 | \value{ 26 | A list with 27 | \itemize{ 28 | \item \code{name_in}: input parameter \code{file}, 29 | \item \code{prefix_out}: input parameter `prefix_out``, 30 | \item \code{nfiles}: Number of files (parts) created, 31 | \item \code{nlines_part}: input parameter \code{every_nlines}, 32 | \item \code{nlines_all}: total number of lines of \code{file}. 33 | } 34 | 35 | Vector of file paths created by \link{split_file}. 36 | } 37 | \description{ 38 | Split file every nlines 39 | 40 | Get files from splitting. 
41 | } 42 | \examples{ 43 | tmp <- fwrite2(iris) 44 | infos <- split_file(tmp, 100) 45 | str(infos) 46 | get_split_files(infos) 47 | } 48 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 9 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 10 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 11 | #endif 12 | 13 | // nlines_cpp 14 | double nlines_cpp(std::string file); 15 | RcppExport SEXP _bigreadr_nlines_cpp(SEXP fileSEXP) { 16 | BEGIN_RCPP 17 | Rcpp::RObject rcpp_result_gen; 18 | Rcpp::RNGScope rcpp_rngScope_gen; 19 | Rcpp::traits::input_parameter< std::string >::type file(fileSEXP); 20 | rcpp_result_gen = Rcpp::wrap(nlines_cpp(file)); 21 | return rcpp_result_gen; 22 | END_RCPP 23 | } 24 | // split_every_nlines 25 | List split_every_nlines(std::string name_in, std::string prefix_out, int every_nlines, bool repeat_header); 26 | RcppExport SEXP _bigreadr_split_every_nlines(SEXP name_inSEXP, SEXP prefix_outSEXP, SEXP every_nlinesSEXP, SEXP repeat_headerSEXP) { 27 | BEGIN_RCPP 28 | Rcpp::RObject rcpp_result_gen; 29 | Rcpp::RNGScope rcpp_rngScope_gen; 30 | Rcpp::traits::input_parameter< std::string >::type name_in(name_inSEXP); 31 | Rcpp::traits::input_parameter< std::string >::type prefix_out(prefix_outSEXP); 32 | Rcpp::traits::input_parameter< int >::type every_nlines(every_nlinesSEXP); 33 | Rcpp::traits::input_parameter< bool >::type repeat_header(repeat_headerSEXP); 34 | rcpp_result_gen = Rcpp::wrap(split_every_nlines(name_in, prefix_out, every_nlines, repeat_header)); 35 | return rcpp_result_gen; 36 | END_RCPP 37 | } 38 | 39 | static const R_CallMethodDef CallEntries[] = { 40 | {"_bigreadr_nlines_cpp", (DL_FUNC) &_bigreadr_nlines_cpp, 1}, 41 | {"_bigreadr_split_every_nlines", (DL_FUNC) &_bigreadr_split_every_nlines, 4}, 42 | {NULL, NULL, 0} 43 | }; 44 | 45 | RcppExport void R_init_bigreadr(DllInfo *dll) { 46 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 47 | R_useDynamicSymbols(dll, FALSE); 48 | } 49 | -------------------------------------------------------------------------------- /src/nlines-split.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | 3 | #include 4 | using namespace Rcpp; 5 | 6 | #define INIT_SIZE 64 7 | 8 | /******************************************************************************/ 9 | 10 | char * fgets_full_line(char * str, FILE * stream, size_t * p_size) { 11 | 12 | while (true) { 13 | 14 | str = fgets(str, *p_size, stream); 15 | if (str == NULL) return NULL; 16 | // Rcout << *p_size << " -> " << (str[strlen(str) - 1] == '\n') << std::endl; 17 | 18 | if (feof(stream) | (str[strlen(str) - 1] == '\n')) { // reached EOF or EOL 19 | 20 | // Rcout << strlen(str) << " / " << (str[strlen(str) - 1] == '\n') << std::endl; 21 | return str; 22 | 23 | } else { // increase size of str and try again 24 | 25 | fseek(stream , 1 - *p_size, SEEK_CUR); 26 | *p_size *= 2; 27 | 28 | 
delete [] str; 29 | str = new char[*p_size]; 30 | 31 | } 32 | } 33 | } 34 | 35 | /******************************************************************************/ 36 | 37 | // [[Rcpp::export]] 38 | double nlines_cpp(std::string file) { 39 | 40 | FILE *fp_in = fopen(file.c_str(), "r"); 41 | if (fp_in == NULL) Rcpp::stop("Error while opening file '%s'.", file); 42 | 43 | size_t size = INIT_SIZE; 44 | 45 | char *line = new char[size]; 46 | size_t nline_all = 0; 47 | 48 | while (!feof(fp_in)) { 49 | 50 | line = fgets_full_line(line, fp_in, &size); 51 | 52 | if (ferror(fp_in)) { 53 | delete [] line; 54 | Rcpp::stop("Error while reading file '%s'.", file); 55 | } 56 | 57 | if (line != NULL) nline_all++; 58 | } 59 | 60 | fclose(fp_in); 61 | delete [] line; 62 | 63 | return nline_all; 64 | } 65 | 66 | /******************************************************************************/ 67 | 68 | // [[Rcpp::export]] 69 | List split_every_nlines(std::string name_in, 70 | std::string prefix_out, 71 | int every_nlines, 72 | bool repeat_header) { 73 | 74 | FILE *fp_in = fopen(name_in.c_str(), "r"), *fp_out; 75 | if (fp_in == NULL) 76 | Rcpp::stop("Error while opening file '%s'.", name_in); 77 | 78 | const char *fn_out = prefix_out.c_str(); 79 | size_t max_len = strlen(fn_out) + 20; 80 | char *name_out = new char[max_len]; 81 | 82 | size_t size = INIT_SIZE; 83 | 84 | char *line = new char[size]; 85 | 86 | // read header once and store it 87 | line = fgets_full_line(line, fp_in, &size); 88 | char *head = new char[size]; 89 | strcpy(head, line); 90 | rewind(fp_in); 91 | 92 | bool not_eof = true, header_added = false; 93 | int nfile = 0; 94 | size_t nline_all = 0; 95 | 96 | while (not_eof) { 97 | 98 | // Open file number 'nfile' 99 | snprintf(name_out, max_len, "%s_%d.txt", fn_out, ++nfile); 100 | fp_out = fopen(name_out, "w"); 101 | 102 | // Fill it with 'every_nlines' lines 103 | int nline_file = 0; 104 | while (nline_file < every_nlines) { 105 | 106 | if ( (line = fgets_full_line(line, fp_in, &size)) == NULL ) { 107 | not_eof = false; 108 | break; 109 | } 110 | 111 | if (repeat_header & (nline_file == 0) & (nfile > 1)) { 112 | fputs(head, fp_out); 113 | header_added = true; 114 | }; 115 | 116 | fputs(line, fp_out); 117 | nline_file++; 118 | } 119 | 120 | // Close file number 'nfile' 121 | fflush(fp_out); 122 | fclose(fp_out); 123 | if (nline_file == 0) { 124 | // nothing has been written because of EOF -> remove file 125 | remove(name_out); 126 | nfile--; 127 | } else { 128 | nline_all += nline_file + header_added; 129 | } 130 | } 131 | 132 | fclose(fp_in); 133 | 134 | delete[] name_out; 135 | delete[] line; 136 | delete[] head; 137 | 138 | return List::create( 139 | _["name_in"] = name_in, 140 | _["prefix_out"] = prefix_out, 141 | _["nfiles"] = nfile, 142 | _["nlines_part"] = every_nlines, 143 | _["nlines_all"] = nline_all, 144 | _["repeat_header"] = repeat_header 145 | ); 146 | } 147 | 148 | /******************************************************************************/ 149 | -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | spelling::spell_check_test(vignettes = TRUE, error = FALSE) 2 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(bigreadr) 3 | 4 | test_check("bigreadr") 5 | 
-------------------------------------------------------------------------------- /tests/testthat/test-bind.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | 3 | context("test-bind.R") 4 | 5 | ################################################################################ 6 | 7 | test_that("'cbind_df' works", { 8 | 9 | # No copies with 'cbind.data.frame' 10 | iris$Species <- as.character(iris$Species) 11 | addr <- sapply(iris, data.table::address) 12 | iris2 <- cbind_df(list(iris, iris)) 13 | expect_identical(sapply(iris2, data.table::address), c(addr, addr)) 14 | 15 | # Data frame with factors 16 | df <- datasets::iris 17 | df2 <- cbind_df(list(df)) 18 | expect_identical(df2, df) 19 | df3 <- cbind_df(list(df, df, df)) 20 | expect_equal(dim(df3), c(150, 15)) 21 | expect_identical(class(df3), "data.frame") 22 | 23 | # Data table 24 | dt <- data.table::as.data.table(df) 25 | dt2 <- cbind_df(list(dt)) 26 | expect_identical(class(dt2), c("data.table", "data.frame")) 27 | expect_identical(dt2, dt) 28 | dt3 <- cbind_df(list(dt, dt, dt)) 29 | expect_equal(dim(dt3), c(150, 15)) 30 | expect_identical(class(dt3), c("data.table", "data.frame")) 31 | 32 | # Data frame without factors 33 | df$Species <- as.character(df$Species) 34 | df2 <- cbind_df(list(df)) 35 | expect_identical(df2, df) 36 | df3 <- cbind_df(list(df, df, df)) 37 | expect_equal(dim(df3), c(150, 15)) 38 | expect_identical(class(df3), "data.frame") 39 | }) 40 | 41 | ################################################################################ 42 | 43 | test_that("'rbind_df' works", { 44 | 45 | # Data frame with factors 46 | df <- datasets::iris 47 | df2 <- rbind_df(list(df)) 48 | expect_identical(df2, df) 49 | df3 <- rbind_df(list(df, df, df)) 50 | expect_equal(dim(df3), c(450, 5)) 51 | expect_identical(class(df3), "data.frame") 52 | 53 | # Data table 54 | dt <- data.table::as.data.table(df) 55 | dt2 <- rbind_df(list(dt)) 56 | expect_identical(class(dt2), c("data.table", "data.frame")) 57 | expect_identical(dt2, dt) 58 | dt3 <- rbind_df(list(dt, dt, dt)) 59 | expect_equal(dim(dt3), c(450, 5)) 60 | expect_identical(class(dt3), c("data.table", "data.frame")) 61 | 62 | # Data frame without factors 63 | df$Species <- as.character(df$Species) 64 | df2 <- rbind_df(list(df)) 65 | expect_identical(df2, df) 66 | df3 <- rbind_df(list(df, df, df)) 67 | expect_equal(dim(df3), c(450, 5)) 68 | expect_identical(class(df3), "data.frame") 69 | 70 | # Error 71 | expect_error(rbind_df(list(as.matrix(iris), iris)), 72 | "'list_df' should contain data tables or data frames.", fixed = TRUE) 73 | }) 74 | 75 | ################################################################################ 76 | -------------------------------------------------------------------------------- /tests/testthat/test-nlines.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | 3 | context("test-nlines.R") 4 | 5 | ################################################################################ 6 | 7 | test_that("'nlines()' works", { 8 | 9 | expect_error(nlines("does_not_exist.txt")) 10 | 11 | strings <- readRDS(system.file("testdata", "wrong_string.rds", package = "bigreadr")) 12 | writeLines(strings, tmp <- tempfile()) 13 | expect_equal(nlines(tmp), 24) 14 | 15 | strings <- c("", "", " ", sapply(10^(seq(0, 4, by = 0.2)), function(i) { 16 | 
paste(as.matrix(iris)[sample(nrow(iris), i, TRUE), ], collapse = " ") 17 | })) 18 | replicate(100, { 19 | writeLines(sample(strings, replace = TRUE), tmp <- tempfile()) 20 | expect_equal(nlines(tmp), length(readLines(tmp))) 21 | }) 22 | }) 23 | 24 | ################################################################################ 25 | 26 | test_that("'nlines()' works with or without newline", { 27 | csv1 <- system.file("testdata", "cars_with_newline.csv", package = "bigreadr") 28 | expect_identical(nlines(csv1), 51) 29 | csv2 <- system.file("testdata", "cars_without_newline.csv", package = "bigreadr") 30 | expect_identical(nlines(csv2), 51) 31 | }) 32 | 33 | ################################################################################ 34 | -------------------------------------------------------------------------------- /tests/testthat/test-read.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | 3 | context("test-read.R") 4 | 5 | iris$Species <- as.character(iris$Species) 6 | csv <- fwrite2(iris, tempfile(fileext = ".csv")) 7 | 8 | ################################################################################ 9 | 10 | test_that("'fread2' changes default", { 11 | no_dt <- fread2(csv) 12 | expect_equal(no_dt, iris) 13 | expect_s3_class(no_dt, "data.frame") 14 | expect_failure(expect_s3_class(no_dt, "data.table")) 15 | expect_s3_class(fread2(csv, data.table = TRUE), "data.table") 16 | }) 17 | 18 | test_that("'fread2' works with multiple files", { 19 | csv2 <- rep(csv, 3) 20 | no_dt <- fread2(csv2) 21 | expect_equal(no_dt, rbind(iris, iris, iris)) 22 | expect_s3_class(no_dt, "data.frame") 23 | expect_failure(expect_s3_class(no_dt, "data.table")) 24 | expect_s3_class(fread2(csv2, data.table = TRUE), "data.table") 25 | 26 | expect_equal(dim(fread2(csv2, nrows = 5)), c(15, 5)) 27 | expect_equal(dim(fread2(csv2, select = "Species")), c(450, 1)) 28 | }) 29 | 30 | test_that("'fread2' can use different types of input", { 31 | 32 | cmd <- sprintf("grep -v setosa %s", fwrite2(datasets::iris)) 33 | expect_equal(fread2(cmd), data.table::fread(cmd, data.table = FALSE)) 34 | 35 | url <- "https://raw.githubusercontent.com/privefl/bigsnpr/master/inst/extdata/example.fam" 36 | expect_equal(fread2(url), data.table::fread(url, data.table = FALSE)) 37 | 38 | text <- paste(readLines(url), collapse = "\n") 39 | expect_equal(fread2(text), data.table::fread(url, data.table = FALSE)) 40 | }) 41 | 42 | ################################################################################ 43 | 44 | test_that("'big_fread1' works", { 45 | 46 | iris1 <- big_fread1(file = csv, 50, print_timings = FALSE) 47 | expect_equal(iris1, iris) 48 | 49 | expect_warning( 50 | iris1 <- big_fread1(file = csv, 50, print_timings = FALSE, 51 | .combine = function() stop("ERROR")), 52 | "Combining failed.") 53 | expect_length(iris1, 4) 54 | expect_equal(rbind_df(iris1), iris) 55 | 56 | iris2 <- big_fread1(file = csv, 250, print_timings = FALSE) 57 | expect_equal(iris2, iris) 58 | 59 | ind3 <- 1:4 60 | iris3 <- big_fread1(file = csv, 7, select = ind3, skip = 1, print_timings = FALSE) 61 | expect_equal(iris3, iris[ind3], check.attributes = FALSE) 62 | expect_identical(names(iris3), paste0("V", ind3)) 63 | 64 | iris4 <- big_fread1(file = csv, 50, print_timings = FALSE, 65 | .transform = function(df) subset(df, Species == "virginica")) 66 | expect_equal(iris4, subset(iris, Species == "virginica"), check.attributes = FALSE) 67 | 
68 | expect_message(big_fread1(file = csv, 50, print_timings = TRUE), "seconds") 69 | }) 70 | 71 | ################################################################################ 72 | 73 | test_that("'big_fread2' works", { 74 | 75 | for (nb_parts in 1:7) { 76 | 77 | iris1 <- big_fread2(file = csv, nb_parts) 78 | expect_equal(iris1, iris) 79 | 80 | expect_warning( 81 | iris1 <- big_fread2(file = csv, nb_parts, 82 | .combine = function() stop("ERROR")), 83 | "Combining failed.") 84 | expect_length(iris1, min(nb_parts, ncol(iris))) 85 | expect_equal(cbind_df(iris1), iris) 86 | 87 | ind2 <- 1 88 | iris2 <- big_fread2(file = csv, nb_parts, select = ind2, skip = 0) 89 | expect_equal(iris2, iris[ind2]) 90 | 91 | ind3 <- 1:4 92 | iris3 <- big_fread2(file = csv, nb_parts, select = ind3, skip = 1) 93 | expect_equal(iris3, iris[ind3], check.attributes = FALSE) 94 | expect_identical(names(iris3), paste0("V", ind3)) 95 | 96 | expect_error(big_fread2(file = csv, nb_parts, select = c(4, 1:3), skip = 0), 97 | "Argument 'select' should be sorted.", fixed = TRUE) 98 | } 99 | }) 100 | 101 | ################################################################################ 102 | 103 | test_that("Same column accessor", { 104 | iris_dt <- data.table::as.data.table(iris) 105 | expect_equal(iris[, 1:3], as.data.frame(iris_dt[, 1:3])) 106 | expect_equal(iris[, 3, drop = FALSE], 107 | as.data.frame(iris_dt[, 3, drop = FALSE])) 108 | }) 109 | 110 | ################################################################################ 111 | 112 | test_that("Use 'scan' correctly", { 113 | expect_identical(scan(csv, "", skip = 0, nlines = 1, sep = "\n", quiet = TRUE), 114 | paste(names(iris), collapse = ",")) 115 | expect_identical(scan(csv, "", skip = 1, nlines = 1, sep = "\n", quiet = TRUE), 116 | paste(as.matrix(iris)[1, ], collapse = ",")) 117 | }) 118 | 119 | ################################################################################ 120 | -------------------------------------------------------------------------------- /tests/testthat/test-split.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | 3 | context("test-split.R") 4 | 5 | ################################################################################ 6 | 7 | test_that("'split_every_nlines()' works", { 8 | 9 | tmp <- bigreadr::fwrite2(iris) 10 | test <- bigreadr:::split_every_nlines(tmp, tmp, 20, TRUE) 11 | files <- list.files(tempdir(), basename(tmp), full.names = TRUE) 12 | files2 <- c(tmp, paste0(tmp, "_", 1:8, ".txt")) 13 | expect_identical(normalizePath(sort(files)), normalizePath(files2)) 14 | }) 15 | 16 | ################################################################################ 17 | 18 | test_that("'split_file()' works", { 19 | 20 | strings <- c("", "", " ", sapply(10^(seq(0, 4, by = 0.2)), function(i) { 21 | paste(as.matrix(iris)[sample(nrow(iris), i, TRUE), ], collapse = " ") 22 | })) 23 | for (every in c(1, 2, 4, 12, 24, 25)) { 24 | writeLines(sample(strings, replace = TRUE), tmp <- tempfile()) 25 | # Infos are correct 26 | infos <- split_file(tmp, every, tmp2 <- tempfile()) 27 | expect_identical(infos[["name_in"]], normalizePath(tmp)) 28 | expect_identical(infos[["prefix_out"]], path.expand(tmp2)) 29 | expect_identical(infos[["repeat_header"]], FALSE) 30 | expect_equal(ceiling(infos[["nlines_all"]] / infos[["nlines_part"]]), 31 | infos[["nfiles"]]) 32 | expect_equal(infos[["nlines_all"]], 24) 33 | # New files all exist 
34 | files <- get_split_files(infos) 35 | expect_true(all(file.exists(files))) 36 | # Number of lines and size is summing to whole input file 37 | expect_identical(sum(sapply(files, nlines)), nlines(tmp)) 38 | expect_identical(sum(file.size(files)), file.size(tmp)) 39 | # Content is the same 40 | expect_identical(do.call('c', lapply(files, readLines)), readLines(tmp)) 41 | } 42 | }) 43 | 44 | ################################################################################ 45 | 46 | test_that("'split_file()' works with a repeated header", { 47 | 48 | # Reading splitted files is easier 49 | tf <- fwrite2(cars, tempfile(fileext = ".csv")) 50 | sf1 <- split_file(tf, 10) 51 | gsf1 <- get_split_files(sf1) 52 | expect_equal(sum(sapply(gsf1, nlines)), 51) 53 | expect_error(Reduce(rbind, lapply(gsf1, fread2)), 54 | "names do not match previous names") 55 | 56 | sf2 <- split_file(tf, 10, repeat_header = TRUE) 57 | gsf2 <- get_split_files(sf2) 58 | expect_equal(sapply(gsf2, readLines, n = 1), rep(readLines(tf, n = 1), 6), 59 | check.attributes = FALSE) 60 | 61 | loaded_df <- Reduce(rbind, lapply(gsf2, read.csv)) 62 | expect_equal(names(loaded_df), c("speed", "dist")) 63 | expect_equal(nrow(loaded_df), 50) 64 | 65 | # Content is the same 66 | first_part <- readLines(gsf2[1]) 67 | other_parts <- unlist(lapply(gsf2[-1], function(f) readLines(f)[-1])) 68 | expect_identical(c(first_part, other_parts), readLines(tf)) 69 | }) 70 | 71 | ################################################################################ 72 | 73 | test_that("'split_file()' works with a repeated header (special cases)", { 74 | 75 | strings <- c("", "", " ", sapply(10^(seq(0, 4, by = 0.2)), function(i) { 76 | paste(as.matrix(iris)[sample(nrow(iris), i, TRUE), ], collapse = " ") 77 | })) 78 | for (every in c(1, 2, 4, 12, 24, 25)) { 79 | writeLines(sample(strings, replace = TRUE), tmp <- tempfile()) 80 | # Infos are correct 81 | infos <- split_file(tmp, every, tmp2 <- tempfile(), repeat_header = TRUE) 82 | expect_identical(infos[["name_in"]], normalizePath(tmp)) 83 | expect_identical(infos[["prefix_out"]], path.expand(tmp2)) 84 | expect_identical(infos[["repeat_header"]], TRUE) 85 | nlines_all_without_header <- infos[["nlines_all"]] - infos[["nfiles"]] 86 | expect_equal(nlines_all_without_header + 1, 24) 87 | expect_equal(ceiling((nlines_all_without_header + 1) / infos[["nlines_part"]]), 88 | infos[["nfiles"]]) 89 | # New files all exist 90 | files <- get_split_files(infos) 91 | expect_true(all(file.exists(files))) 92 | # Same first line for each file 93 | expect_equal(sapply(files, readLines, n = 1), 94 | rep(readLines(tmp, n = 1), infos[["nfiles"]]), 95 | check.attributes = FALSE) 96 | # Content is the same 97 | first_part <- readLines(files[1]) 98 | other_parts <- unlist(lapply(files[-1], function(f) readLines(f)[-1])) 99 | expect_identical(c(first_part, other_parts), readLines(tmp)) 100 | } 101 | }) 102 | 103 | ################################################################################ 104 | -------------------------------------------------------------------------------- /tmp-save/nlines.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define BUFSIZE (1024 * 1024) 4 | 5 | 6 | //' Count number of lines 7 | //' 8 | //' @param filename Path to the file. 
9 | //' 10 | //' @export 11 | //' 12 | // [[Rcpp::export]] 13 | double nlines1(std::string filename) { 14 | 15 | FILE *fp_in = fopen(filename.c_str(), "rb"); 16 | // setvbuf(fp_in, NULL, _IOLBF, BUFSIZE); 17 | 18 | size_t size = 100; 19 | size_t last = size - 2; 20 | 21 | char *line = new char[size]; 22 | char *temp= NULL; 23 | size_t c = 0; 24 | bool not_eol; 25 | 26 | while (fgets(line, size, fp_in) != NULL) { 27 | 28 | if (strlen(line) > last) { 29 | 30 | not_eol = (line[last] != '\n'); 31 | 32 | size *= 2; 33 | temp = new char[size]; 34 | delete [] line; 35 | line = temp; 36 | last = size - 2; 37 | 38 | if (not_eol) continue; 39 | } 40 | 41 | // End of line 42 | c++; 43 | } 44 | 45 | fclose(fp_in); 46 | 47 | return c; 48 | } 49 | 50 | #include 51 | #include 52 | using namespace std; 53 | 54 | int FileRead(istream& is, char* buff) { 55 | is.read(buff, BUFSIZE); 56 | return is.gcount(); 57 | } 58 | 59 | // [[Rcpp::export]] 60 | double nlines2(const char * filename) { 61 | 62 | ifstream ifs(filename, ios::in | ios::binary); 63 | 64 | char *buff = new char[BUFSIZE]; 65 | 66 | size_t nlines = 0; 67 | while (int cc = FileRead(ifs, buff)) { 68 | nlines += std::count(buff, buff + cc, '\n'); 69 | } 70 | 71 | delete [] buff; 72 | 73 | return nlines; 74 | } 75 | 76 | // [[Rcpp::export]] 77 | double nlines3(const char * filename) { 78 | 79 | FILE *fp = fopen(filename, "r"); 80 | 81 | size_t nlines = 0; 82 | 83 | char c = 'a'; 84 | while (c != EOF) { 85 | c = getc(fp); 86 | if (c == '\n') nlines++; 87 | } 88 | 89 | fclose(fp); 90 | 91 | return nlines; 92 | } 93 | 94 | // [[Rcpp::export]] 95 | double nlines4(std::string filename, int buff_size = 1024) { 96 | 97 | FILE *fp_in = fopen(filename.c_str(), "rb"); 98 | // setvbuf(fp_in, NULL, _IOFBF, BUFSIZE); 99 | 100 | char *buff = new char[buff_size]; 101 | // int buff_size_minus_one = buff_size - 1; 102 | size_t nlines = 0; 103 | 104 | while (feof(fp_in) == 0) { 105 | if (fgets(buff, buff_size, fp_in) == NULL) 106 | Rcpp::Rcout << "Error?" 
<< std::endl; 107 | 108 | // Rcpp::Rcout << " : "<< strlen(buff) << 109 | // " => " << (buff[strlen(buff) - 1] == '\n') << std::endl; 110 | 111 | if ((buff[strlen(buff) - 1] == '\n')) nlines++; 112 | } 113 | 114 | fclose(fp_in); 115 | 116 | return nlines; 117 | } 118 | 119 | // [[Rcpp::export]] 120 | double nlines5(std::string filename, int buff_size = 1024) { 121 | 122 | FILE *input_file = fopen(filename.c_str(), "rb"); 123 | char buffer[buff_size + 1]; 124 | size_t line_count = 0; 125 | 126 | while (!feof(input_file)) 127 | { 128 | size_t chars_read = fread(buffer, 1, buff_size, input_file); 129 | for (unsigned int i = 0; i < chars_read; ++i) 130 | { 131 | if (buffer[i] == '\n') 132 | { 133 | ++line_count; 134 | } 135 | } 136 | } 137 | 138 | fclose(input_file); 139 | 140 | return line_count; 141 | } 142 | 143 | // [[Rcpp::export]] 144 | double nlines6(std::string filename) { 145 | 146 | size_t newlines = 0; 147 | char buf[BUFSIZE]; 148 | size_t BUFSIZE_M1 = BUFSIZE - 1; 149 | size_t BUFSIZE_M2 = BUFSIZE - 2; 150 | FILE* file = fopen(filename.c_str(), "rb"); 151 | 152 | while (fgets(buf, BUFSIZE, file)) { 153 | if (strlen(buf) != BUFSIZE_M1 || buf[BUFSIZE_M2] != '\n') 154 | newlines++; 155 | } 156 | 157 | return newlines; 158 | } 159 | 160 | 161 | 162 | #include 163 | #include 164 | #include 165 | #include 166 | 167 | // [[Rcpp::export]] 168 | double nlines7(std::string filename) { 169 | 170 | int fd = open(filename.c_str(), O_RDONLY, 0); 171 | 172 | char *buff = new char[BUFSIZE]; 173 | size_t nlines = 0; 174 | 175 | while (int len = read(fd, buff, BUFSIZE)) { 176 | 177 | if (len == -1) { 178 | (void)close(fd); 179 | break; 180 | } 181 | 182 | for (int i = 0; i < len; i++) 183 | if (buff[i] == '\n') nlines++; 184 | } 185 | 186 | (void)close(fd); 187 | 188 | return nlines; 189 | } 190 | 191 | /*** R 192 | cars2 <- cars[rep(1:50, 20e2), rep(1:2, 100)] 193 | # cars2 <- cars[rep(1:50, 5), rep(1:2, 30e3)] 194 | bigreadr::fwrite2(cars2, "tmp-data/cars.csv") 195 | for (i in 2:10) bigreadr::fwrite2(cars2, "tmp-data/cars.csv", append = TRUE) 196 | 197 | system.time(print(nlines7("tmp-data/cars.csv"))) 198 | system.time(print(nlines1("tmp-data/cars.csv"))) 199 | system.time(system("wc -l tmp-data/cars.csv")) 200 | system.time(print(bigreadr::nlines("tmp-data/cars.csv"))) 201 | 202 | 203 | # microbenchmark::microbenchmark( 204 | # nlines1("tmp-data/cars.csv"), # 1000 205 | # # nlines2("tmp-data/cars.csv"), # 1500 206 | # # nlines3("tmp-data/cars.csv"), # 33500 207 | # # nlines4("tmp-data/cars.csv", 1024), # 1050 208 | # # nlines4("tmp-data/cars.csv", 1024 * 1024), # 1100 209 | # # nlines5("tmp-data/cars.csv", 1024), # 1050 210 | # # nlines5("tmp-data/cars.csv", 1024 * 1024), # 1100 211 | # nlines6("tmp-data/cars.csv"), # 1050 212 | # # nlines5("tmp-data/cars.csv", 1024 * 1024 * 64), # 1100 213 | # # nlines_mmap("tmp-data/cars.csv"), # 1900 214 | # # bigreadr::nlines("tmp-data/cars.csv"), # 3400 215 | # system("wc -l tmp-data/cars.csv"), # 400 216 | # # system("grep -c '\n' tmp-data/cars.csv"), # 400 217 | # times = 5 218 | # ) 219 | #### 5M x 200 #### 220 | # Unit: milliseconds 221 | # expr min lq mean median 222 | # nlines("tmp-data/cars.csv") 2092.0324 2098.8990 2138.311 2101.8745 223 | # bigreadr::nlines("tmp-data/cars.csv") 6746.9176 6762.7296 6868.384 6799.3394 224 | # system("wc -l tmp-data/cars.csv") 853.2787 856.6954 863.299 862.6793 225 | # uq max neval 226 | # 2113.3909 2448.5013 10 227 | # 6816.9416 7438.5126 10 228 | # 867.3886 883.3312 10 229 | 230 | #### 5K x 200K #### 231 | # Unit: 
milliseconds 232 | # expr min lq mean median 233 | # nlines("tmp-data/cars.csv") 1852.4570 1858.6921 2429.795 1934.0913 234 | # bigreadr::nlines("tmp-data/cars.csv") 6557.9264 6621.6394 6982.951 6836.6807 235 | # system("wc -l tmp-data/cars.csv") 798.7292 845.8318 1426.601 864.2086 236 | # uq max neval 237 | # 2312.193 5831.689 10 238 | # 7211.877 7922.534 10 239 | # 1092.094 5640.510 10 240 | 241 | val <- try(system(paste("wc -l", "tmp-data/cars.csv"), intern = TRUE, 242 | ignore.stderr = TRUE), silent = TRUE) 243 | val <- `if`(class(val) == "try-error", nlines1("tmp-data/cars.csv"), 244 | as.numeric(strsplit(val, " ")[[1]][1])) 245 | */ 246 | -------------------------------------------------------------------------------- /tmp-tests/bench-acc.R: -------------------------------------------------------------------------------- 1 | 2 | library(data.table) 3 | iris_dt <- as.data.table(iris) 4 | microbenchmark::microbenchmark( 5 | iris[, 1:3], 6 | iris[1:3], 7 | iris_dt[, 1:3], 8 | iris[, 3, drop = FALSE], 9 | iris[3], 10 | iris_dt[, 3, drop = FALSE] 11 | ) 12 | 13 | -------------------------------------------------------------------------------- /tmp-tests/bench-rbind.R: -------------------------------------------------------------------------------- 1 | mtcars <- datasets::mtcars 2 | mtcars <- mtcars[rep(1:32, 1000), rep(1:11, 10)] 3 | mtcars_dt <- data.table::as.data.table(mtcars) 4 | 5 | list_mtcars <- rep(list(mtcars), 10) 6 | list_mtcars_dt <- rep(list(mtcars_dt), 10) 7 | 8 | rbind_df <- function(list_df) { 9 | list_df_merged <- lapply(seq_along(list_df[[1]]), function(k) { 10 | unlist(lapply(list_df, function(l) l[[k]])) 11 | }) 12 | list_df_merged_named <- stats::setNames(list_df_merged, names(list_df[[1]])) 13 | as.data.frame(list_df_merged_named, stringsAsFactors = FALSE) 14 | } 15 | 16 | rbind_df2 <- function(list_df) { 17 | data.table::rbindlist(list_df) 18 | } 19 | 20 | microbenchmark::microbenchmark( 21 | 22 | A1 = rbind.data.frame(mtcars), 23 | A2 = rbind.data.frame(mtcars_dt), 24 | B1 = rbind_df(list(mtcars)), 25 | B2 = rbind_df(list(mtcars_dt)), 26 | C1 = rbind_df2(list(mtcars)), 27 | C2 = rbind_df2(list(mtcars_dt)), 28 | 29 | AA1 = do.call(rbind.data.frame, list_mtcars), 30 | AA2 = do.call(rbind.data.frame, list_mtcars_dt), 31 | BB1 = rbind_df(list_mtcars), 32 | BB2 = rbind_df(list_mtcars_dt), 33 | CC1 = rbind_df2(list_mtcars), 34 | CC2 = rbind_df2(list_mtcars_dt), 35 | 36 | times = 10 37 | ) 38 | -------------------------------------------------------------------------------- /tmp-tests/bench-read.R: -------------------------------------------------------------------------------- 1 | csv <- readr::readr_example("mtcars.csv") 2 | df <- data.table::fread(csv, data.table = FALSE) 3 | 4 | ## LONG CSV 5 | csv2 <- "tmp-data/mtcars-long.csv" 6 | # data.table::fwrite(df[rep(seq_len(nrow(df)), 500000), ], csv2, 7 | # quote = FALSE, row.names = FALSE) 8 | 9 | system.time( 10 | df2 <- data.table::fread(csv2) 11 | ) # 3.5 12 | 13 | system.time( 14 | df3 <- readr::read_csv(csv2) 15 | ) # 25 16 | rm(df2, df3); gc(reset = TRUE) 17 | 18 | 19 | system.time(nlines <- fpeek::peek_count_lines(csv2)) # 1.8 20 | system.time(nlines2 <- nrow(data.table::fread(csv2, select = 1))) # 2.8 21 | 22 | tmp <- tempfile() 23 | if (Sys.info()[["sysname"]] == "Windows") { 24 | 25 | # https://sourceforge.net/projects/gnuwin32/ 26 | awk <- shortPathName("C:/Program Files (x86)/GnuWin32/bin/awk.exe") # Windows 27 | cmd <- sprintf("%s \"NR%%%d==1{x=\"\"\"%s\"\"\"++i;}{print > x}\" %s", 28 | awk, 20, gsub("\\\\", 
"\\\\\\\\", tmp), normalizePath(csv)) 29 | 30 | } else { 31 | 32 | cmd <- sprintf("awk 'NR%%%d==1{x=\"%s\"++i;}{print > x}' %s", 33 | tmp, 20, normalizePath(csv)) 34 | 35 | } 36 | system(cmd) 37 | readLines(paste0(tmp, 1), 1) 38 | 39 | cmd <- sprintf("%s \"NR%%%d==1{x=\"\"\"%s\"\"\"++i;}{print > x}\" %s", 40 | awk, 20000, gsub("\\\\", "\\\\\\\\", tmp), normalizePath(csv2)) 41 | system.time(system(cmd)) # 1.4 42 | # readLines(paste0(tmp, 1)) 43 | 44 | 45 | ## LARGE CSV 46 | csv3 <- "tmp-data/mtcars-wide.csv" 47 | data.table::fwrite(df[rep(seq_len(nrow(df)), 500), rep(seq_len(ncol(df)), 1000)], csv3, 48 | quote = FALSE, row.names = FALSE) 49 | 50 | system.time( 51 | df2 <- data.table::fread(csv3, data.table = FALSE) 52 | ) # 0.06 -> 0.65 -> 9.8 53 | system.time( 54 | nlines <- nrow(data.table::fread(csv3, select = 1)) 55 | ) # 0.1 -> 0.45 -> 4.5 56 | system.time(nlines2 <- fpeek::peek_count_lines(csv3)) 57 | 58 | # system.time( 59 | # df3 <- readr::read_csv(csv3) 60 | # ) # 6 61 | 62 | cmd <- sprintf("%s \"NR%%%d==1{x=\"\"\"%s\"\"\"++i;}{print > x}\" %s", 63 | awk, 2, gsub("\\\\", "\\\\\\\\", tmp), normalizePath(csv3)) 64 | system.time(system(cmd)) # 1.4 65 | # readLines(paste0(tmp, 1)) 66 | 67 | -------------------------------------------------------------------------------- /tmp-tests/bench-read2.R: -------------------------------------------------------------------------------- 1 | # https://sourceforge.net/projects/gnuwin32/files/coreutils/5.3.0/coreutils-5.3.0.exe/download 2 | 3 | csv <- readr::readr_example("mtcars.csv") 4 | # split <- shortPathName("C:\\Program Files (x86)\\GnuWin32/bin/split.exe") 5 | split <- "split" 6 | 7 | system(sprintf("%s --version", split)) == 0 8 | # system(sprintf("%s -l 5 %s", split, csv)) 9 | 10 | ## LONG CSV 11 | df <- data.table::fread(csv, data.table = FALSE) 12 | csv2 <- tempfile(fileext = ".csv") 13 | data.table::fwrite(df[rep(seq_len(nrow(df)), 500000), ], csv2, 14 | quote = FALSE, row.names = FALSE) 15 | file.size(csv2) 16 | 17 | # system.time(system(sprintf("find /c /v \"aabbccdd\" %s", csv2))) 18 | 19 | system.time(data.table::fread(csv2, nThread = 1)) ## 2.2 20 | system.time(data.table::fread(csv2, nThread = 2)) ## 1.5 21 | system.time(data.table::fread(csv2, nThread = 4)) ## 1 22 | system.time(data.table::fread(csv2, nThread = 7)) ## 0.7 23 | 24 | tmp <- tempfile() 25 | system.time(system(sprintf("%s -l 200000 %s %s", split, csv2, tmp))) ## 12 sec 26 | system.time(fpeek::peek_count_lines(csv2)) ## 3 sec 27 | system.time(nrow(data.table::fread(csv2, select = 1))) 28 | 29 | files <- list.files(dirname(tmp), basename(tmp), full.names = TRUE) 30 | df1 <- data.table::fread(files[1], data.table = FALSE) 31 | data.table::fread(tail(files, 1), col.names = names(df1), data.table = FALSE) 32 | 33 | scan(csv, "", sep = ",", nlines = 1, skip = 0) 34 | 35 | 36 | df <- mtcars 37 | df2 <- unname(mtcars) 38 | 39 | sapply(df, data.table::address) 40 | sapply(df2, data.table::address) 41 | 42 | 43 | microbenchmark::microbenchmark( 44 | as.matrix(unname(mtcars), rownames.force = FALSE), 45 | as.matrix(mtcars) 46 | ) 47 | -------------------------------------------------------------------------------- /tmp-tests/bench-read3.R: -------------------------------------------------------------------------------- 1 | 2 | ## LONG CSV 3 | csv2 <- "tmp-data/mtcars-long.csv" 4 | # data.table::fwrite(df[rep(seq_len(nrow(df)), 500000), ], csv2, 5 | # quote = FALSE, row.names = FALSE) 6 | 7 | library(bigreadr) 8 | if (Sys.info()[["sysname"]] == "Windows") { 9 | 
options(bigreadr.split = "C:\\Program Files (x86)\\GnuWin32/bin/split.exe") 10 | } 11 | 12 | system.time( 13 | test <- split_file(csv2) 14 | ) 15 | 16 | rm(test2); gc(reset = TRUE) 17 | system.time( 18 | test2 <- big_fread(csv2, every_x_mb = 100) 19 | ) 20 | gc() # + 2 GB 21 | 22 | rm(test2); gc(reset = TRUE) 23 | system.time( 24 | test2 <- data.table::fread(csv2) 25 | ) 26 | gc() # + 1 GB 27 | 28 | # system.time(test <- split_file(csv2, every_x_mb = 1000)) 29 | # system.time(test <- split_file(csv2, every_x_mb = 10)) 30 | system.time(tmp <- lapply(test, function(f) data.table::fread(f, data.table = FALSE))) 31 | 32 | system.time(tmp2 <- do.call(my_rbind, tmp)) 33 | 34 | system.time( 35 | test2 <- big_fread(csv2, every_x_mb = 100) 36 | ) 37 | system.time( 38 | test3 <- data.table::fread(csv2) 39 | ) 40 | 41 | 42 | tmp <- tempfile() 43 | system.time( 44 | status <- system(sprintf("%s -C %dm %s %s", "split", 100, csv2, tmp)) 45 | ) 46 | file_parts <- list.files(dirname(tmp), basename(tmp), full.names = TRUE) 47 | 48 | dt1 <- data.table::fread(file_parts[1]) 49 | 50 | system.time(df2 <- data.table::fread(csv2, data.table = FALSE)) 51 | system.time(df3 <- bigreadr::big_fread( 52 | csv2, .transform = identity 53 | )) 54 | -------------------------------------------------------------------------------- /tmp-tests/bench-read4.R: -------------------------------------------------------------------------------- 1 | 2 | ## LONG CSV 3 | csv2 <- "tmp-data/mtcars-long.csv" 4 | 5 | Rcpp::sourceCpp('tmp-tests/test-setvbuf.cpp') 6 | 7 | # system.time(test <- test_setvbuf(csv2, 10)) 8 | system.time(test <- test_setvbuf2(csv2)) 9 | system.time(test2 <- fpeek::peek_count_lines(csv2)) 10 | 11 | csv2.2 <- sub("\\.csv$", "2.csv", csv2) 12 | system.time(test <- test_setvbuf3(csv2, csv2.2)) 13 | 14 | # df1 <- data.table::fread(csv2) 15 | # df2 <- data.table::fread(csv2.2) 16 | # identical(df1, df2) 17 | # 18 | # system.time(file.copy(csv2, sub("\\.csv$", "3.csv", csv2))) # 1.5 sec 19 | -------------------------------------------------------------------------------- /tmp-tests/bench-read5.R: -------------------------------------------------------------------------------- 1 | 2 | library(bigreadr) 3 | if (Sys.info()[["sysname"]] == "Windows") { 4 | options(bigreadr.split = "C:\\Program Files (x86)\\GnuWin32/bin/split.exe") 5 | } 6 | 7 | 8 | ## LONG CSV 9 | csv2 <- "tmp-data/mtcars-long.csv" 10 | # csv <- readr::readr_example("mtcars.csv") 11 | # df <- data.table::fread(csv, data.table = FALSE) 12 | # data.table::fwrite(df[rep(seq_len(nrow(df)), 500000), ], csv2, 13 | # quote = FALSE, row.names = FALSE) 14 | 15 | nlines(csv2) 16 | system.time( 17 | test <- split_file(csv2) 18 | ) 19 | # Windows: 4.6 / 8.2 / 8.9 20 | # Linux: 1.5 / 1.8 / 1.4 21 | # Linux2: 1.4 / 1.3 / 1.1 / 1.3 22 | 23 | Rcpp::sourceCpp('tmp-tests/test-setvbuf5.cpp') 24 | tmp <- tempfile() 25 | system.time( 26 | test2 <- test_setvbuf6(csv2, tmp, 1e6) 27 | ) 28 | # Windows: 15 / 4.8 / 5.0 / 4.4 29 | # Linux: 5.4 / 3.3 / 3.6 / 3.5 / 2.8 30 | # Linux2: 1.3 / 1.8 / 1.8 / 1.7 31 | as.integer(test2) 32 | list.files(dirname(tmp), basename(tmp)) 33 | 34 | 35 | 36 | ## LARGE CSV 37 | csv3 <- "tmp-data/mtcars-wide.csv" 38 | # data.table::fwrite(df[rep(seq_len(nrow(df)), 50), rep(seq_len(ncol(df)), 10000)], 39 | # csv3, quote = FALSE, row.names = FALSE) 40 | 41 | nlines(csv3) 42 | system.time( 43 | test <- split_file(csv3) 44 | ) 45 | # Windows: 4.3 / 3.9 / 9.6 46 | # Linux: 3.2 / 1.4 / 3.7 47 | # Linux2: 1.4 / 1.2 / 1.1 48 | 49 | 
Rcpp::sourceCpp('tmp-tests/test-setvbuf5.cpp') 50 | tmp <- tempfile() 51 | system.time( 52 | test2 <- test_setvbuf6(csv3, tmp, 100) 53 | ) 54 | # Windows: 14. / 5.0 / 4.6 55 | # Linux: 1.7 / 1.7 / 6.5 56 | # Linux2: 0.4 / 1.1 / 1.2 / 1.2 57 | as.integer(test2) 58 | list.files(dirname(tmp), basename(tmp)) 59 | -------------------------------------------------------------------------------- /tmp-tests/bench-read6.R: -------------------------------------------------------------------------------- 1 | library(bigreadr) 2 | 3 | long <- FALSE 4 | if (long) { 5 | csv2 <- "tmp-data/mtcars-long.csv" 6 | block <- 1e6 7 | M <- 11 8 | block2 <- 3 9 | } else { 10 | csv2 <- "tmp-data/mtcars-wide.csv" 11 | block <- 1e3 12 | M <- 11e3 13 | block2 <- 3 14 | } 15 | 16 | 17 | library(bigstatsr) 18 | (n1 <- bigreadr::nlines(csv2)) 19 | 20 | # debugonce(big_read) 21 | # tmp <- gc(reset = TRUE) 22 | # system.time( 23 | # test <- big_read(csv2, header = TRUE, sep = ",", 24 | # nlines = n1, confirmed = TRUE, 25 | # nlines.block = block, type = "double") 26 | # ) # 38 sec // 912 sec 27 | # gc() - tmp 28 | 29 | tmp <- gc(reset = TRUE) 30 | system.time({ 31 | X <- FBM(n1 - 1, M) 32 | offset <- 0 33 | test2 <- big_fread1(csv2, block, .transform = function(df) { 34 | ind <- rows_along(df) 35 | X[offset + ind, ] <- as.matrix(df) 36 | offset <<- offset + length(ind) 37 | NULL 38 | }, .combine = c) 39 | }) # 16 sec // 122 sec 40 | gc() - tmp 41 | 42 | # all.equal(dim(test$FBM), dim(X)) 43 | # all.equal(test$FBM[, 1], X[, 1]) 44 | # all.equal(test$FBM[, 11], X[, 11]) 45 | 46 | tmp <- gc(reset = TRUE) 47 | system.time({ 48 | X2 <- FBM(n1 - 1, M) 49 | offset <- 0 50 | test3 <- big_fread2(csv2, block2, .transform = function(df) { 51 | print(offset) 52 | ind <- cols_along(df) 53 | X2[, offset + ind] <- as.matrix(df) 54 | offset <<- offset + length(ind) 55 | NULL 56 | }, .combine = c) 57 | }) # 16 sec // 122 sec 58 | gc() - tmp 59 | 60 | all.equal(dim(X2), dim(X)) 61 | all.equal(X2[, 1], X[, 1]) 62 | all.equal(X2[, 11], X[, 11]) 63 | all.equal(X2[, M], X[, M]) 64 | 65 | -------------------------------------------------------------------------------- /tmp-tests/bench-read7.R: -------------------------------------------------------------------------------- 1 | csv <- "tmp-data/mtcars-long.csv" 2 | csv2 <- "tmp-data/mtcars-wide.csv" 3 | 4 | ## System command 'cut' is super slow on my Windows. 
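## The timings below compare two ways of reading a subset of columns:
## piping the file through 'cut' and letting fread() parse the command output,
## versus using fread()'s built-in select = argument, on both the long and the
## wide CSV.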
5 | 6 | tmp <- gc(reset = TRUE) 7 | system.time( 8 | test2 <- data.table::fread(sprintf("cut -f1-5 -s -d',' %s", csv)) 9 | ) 10 | gc() - tmp 11 | 12 | tmp <- gc(reset = TRUE) 13 | system.time( 14 | test2 <- data.table::fread(sprintf("cut -f1-50000 -s -d',' %s", csv2)) 15 | ) 16 | gc() - tmp 17 | 18 | 19 | tmp <- gc(reset = TRUE) 20 | system.time( 21 | test2 <- data.table::fread(csv, select = 1:5) 22 | ) 23 | gc() - tmp 24 | 25 | tmp <- gc(reset = TRUE) 26 | system.time( 27 | test2 <- data.table::fread(csv2, select = 1:50000) 28 | ) 29 | gc() - tmp 30 | 31 | 32 | tmp <- gc(reset = TRUE) 33 | system.time( 34 | test2 <- data.table::fread(csv2, select = 1:10000) 35 | ) 36 | gc() - tmp 37 | 38 | tryCatch(data.table::fread(file = csv, nrows = 0, skip = 1), 39 | error = function(e) NULL) 40 | dt <- data.table::fread(file = csv, select = c(5, 1, 3), verbose = TRUE) 41 | names(dt) 42 | names(mtcars)[c(5, 1, 3)] 43 | dt2 <- `[.data.frame`(dt, names(mtcars)[c(5, 1, 3)]) 44 | dt2[1] 45 | class(dt2) 46 | 47 | library(data.table) 48 | fwrite(iris, tmp <- tempfile()) 49 | debugonce(fread) 50 | data.table::fread(file = tmp, select = c(5, 1, 3), skip = 0) 51 | data.table::fread(file = tmp, select = c(5, 1, 3), skip = 1) 52 | 53 | system.time(first_line <- fread(csv2, nrows = 1)) 54 | system.time(zero_line <- fread(csv2, nrows = 0)) 55 | system.time(first_line <- fread(csv2, nrows = 1, skip = 1)) 56 | 57 | # system.time( 58 | # df4 <- limma::read.columns(csv, names(mtcars)[1:4], sep = ",") 59 | # ) # 32 sec 60 | -------------------------------------------------------------------------------- /tmp-tests/has-header.R: -------------------------------------------------------------------------------- 1 | part1 <- fread2(file_parts[1], skip = skip, ...) 2 | first_line <- scan(file, "", skip = skip, nlines = 1, sep = "\n", quiet = TRUE) 3 | match_names <- sapply(names(part1), regexpr, text = first_line, fixed = TRUE) 4 | has_header <- all( diff(match_names) > 0 ) 5 | -------------------------------------------------------------------------------- /tmp-tests/split.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define BUFLEN (64 * 1024) 4 | 5 | // [[Rcpp::export]] 6 | NumericVector test_setvbuf7(std::string filename, 7 | std::string filename2, 8 | int every_nlines, 9 | Environment parts_) { 10 | 11 | XPtr xptr = parts_["address"]; 12 | BMAcc parts(xptr); 13 | 14 | FILE *fp_in = fopen(filename.c_str(), "rb"), *fp_out; 15 | setvbuf(fp_in, NULL, _IOLBF, BUFLEN); 16 | 17 | const char *fn_out = filename2.c_str(); 18 | char name_out[strlen(fn_out) + 20]; 19 | 20 | size_t line_size; 21 | size_t size = 100; 22 | size_t last = size - 2; 23 | 24 | char *line = new char[size]; 25 | char *temp; 26 | 27 | bool not_eol, not_eof = true; 28 | int i, k = 0, c = 0; 29 | 30 | 31 | while (not_eof) { 32 | 33 | // Open file number 'k' 34 | sprintf(name_out, "%s%d.txt", fn_out, ++k); 35 | fp_out = fopen(name_out, "wb"); 36 | setvbuf(fp_out, NULL, _IOFBF, BUFLEN); 37 | 38 | // Fill it with 'every_nlines' lines 39 | i = 0; 40 | while (i < every_nlines) { 41 | 42 | if (fgets(line, size, fp_in) == NULL) { 43 | not_eof = false; 44 | break; 45 | } 46 | 47 | line_size = strlen(line); 48 | 49 | fputs(line, fp_out); 50 | 51 | if (line_size > last) { 52 | 53 | not_eol = (line[last] != '\n'); 54 | 55 | fflush(fp_out); 56 | size *= 2; 57 | temp = new char[size]; 58 | delete [] line; 59 | line = temp; 60 | last = size - 2; 61 | 62 | if (not_eol) continue; 63 | } 64 | 65 | // End of line 66 | 
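      // (i counts only completed lines of the current part: a truncated read of
      //  an over-long line takes the `continue` above and is not counted until
      //  its final chunk, ending in '\n' or at end-of-file, has been written)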
i++; 67 | 68 | } 69 | 70 | c += i; 71 | 72 | // Close file number 'k' 73 | fflush(fp_out); 74 | fclose(fp_out); 75 | parts(k - 1, 0) = 1; // OK to porcess 76 | Rcout << k << std::endl; 77 | 78 | } 79 | 80 | fclose(fp_in); 81 | 82 | return NumericVector::create(_["K"] = k, _["every"] = every_nlines, _["N"] = c); 83 | } 84 | 85 | -------------------------------------------------------------------------------- /tmp-tests/test-file2string.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | using namespace std; 5 | 6 | // [[Rcpp::export]] 7 | std::string file2string(std::string fn) { 8 | 9 | std::string str, strTotal; 10 | ifstream in; 11 | in.open(fn.c_str()); 12 | getline(in, str); 13 | while ( in ) { 14 | Rcpp::Rcout << strTotal.max_size() << std::endl; 15 | strTotal += str + '\n'; 16 | getline(in, str); 17 | } 18 | 19 | return strTotal; 20 | } 21 | 22 | // [[Rcpp::export]] 23 | std::string file2string2(std::string fn) { 24 | 25 | std::ifstream ifs(fn.c_str()); 26 | std::string content( (std::istreambuf_iterator(ifs) ), 27 | (std::istreambuf_iterator() ) ); 28 | 29 | return content; 30 | } 31 | 32 | 33 | /*** R 34 | test <- file2string("text-write.txt") 35 | writeLines(test) 36 | test2 <- file2string2("text-write.txt") 37 | writeLines(test2) 38 | csv2 <- "tmp-data/mtcars-long.csv" 39 | # system.time(test3 <- file2string2(csv2)) 40 | */ 41 | -------------------------------------------------------------------------------- /tmp-tests/test-mmap-nlines.cpp: -------------------------------------------------------------------------------- 1 | // [[Rcpp::depends(rmio)]] 2 | // [[Rcpp::plugins(cpp11)]] 3 | #include 4 | #include // for std::error_code 5 | #include 6 | 7 | using std::size_t; 8 | 9 | 10 | // [[Rcpp::export]] 11 | double nlines_mmap(std::string path) { 12 | 13 | // Memory-map the file 14 | std::error_code error; 15 | mio::ummap_source ro_ummap; 16 | ro_ummap.map(path, error); 17 | if (error) Rcpp::stop("Error when mapping file:\n %s.\n", error.message()); 18 | 19 | int nlines = std::count_if(ro_ummap.begin(), ro_ummap.end(), 20 | [](unsigned char x) { return x == '\n'; }); 21 | 22 | size_t nbytes = ro_ummap.size(); 23 | // size_t nlines = 0; 24 | // for (size_t k = 0; k < nbytes; k++) { 25 | // if (ro_ummap[k] == '\n') nlines++; 26 | // } 27 | 28 | if (ro_ummap[nbytes - 1] != '\n') nlines++; 29 | 30 | return nlines; 31 | } 32 | 33 | // [[Rcpp::export]] 34 | double nlines_mmap2(std::string path) { 35 | 36 | // Memory-map the file 37 | std::error_code error; 38 | mio::ummap_source ro_ummap; 39 | ro_ummap.map(path, error); 40 | if (error) Rcpp::stop("Error when mapping file:\n %s.\n", error.message()); 41 | 42 | size_t nbytes = ro_ummap.size(); 43 | size_t nlines = 0; 44 | for (size_t k = 0; k < (nbytes - 4); k += 4) { 45 | nlines += ((ro_ummap[k] == '\n') + (ro_ummap[k + 1] == '\n')) + 46 | ((ro_ummap[k + 2] == '\n') + (ro_ummap[k + 3] == '\n')); 47 | } 48 | 49 | // TODO: add the test and test that more than 4 bytes 50 | 51 | if (ro_ummap[nbytes - 1] != '\n') nlines++; 52 | 53 | return nlines; 54 | } 55 | 56 | /*** R 57 | nlines_mmap("../tmp-data/cars.csv.bk") 58 | nlines_mmap2("../tmp-data/cars.csv.bk") 59 | */ 60 | -------------------------------------------------------------------------------- /tmp-tests/test-parallel.R: -------------------------------------------------------------------------------- 1 | library(bigreadr) 2 | library(bigstatsr) 3 | library(foreach) 4 | 5 | ## Need to handle 'skip' 6 | csv2 <- 
"tmp-data/mtcars-long.csv" 7 | n <- nlines(csv2) 8 | K <- 20 9 | every_lines <- ceiling(n / 20) 10 | 11 | Rcpp::sourceCpp('tmp-tests/test-setvbuf6.cpp') 12 | tmp <- tempfile() 13 | parts <- FBM(K, 1, init = 0, type = "integer") 14 | system.time( 15 | test <- test_setvbuf7(csv2, tmp, every_nlines = every_lines, parts) 16 | ) 17 | as.integer(test) 18 | files <- paste0(tmp, 1:K, ".txt") 19 | file.exists(files) 20 | 21 | system.time({ 22 | res2 <- foreach(ic = 1:K) %do% { 23 | while (parts[ic] == 0) Sys.sleep(TIME) 24 | bigreadr:::fread2(files[ic], nThread = 8) 25 | } 26 | }) # 0.9 / 1 (8) -> 2.4 (1) 27 | ## Either all or only 1 28 | 29 | 30 | -------------------------------------------------------------------------------- /tmp-tests/test-parallel2.R: -------------------------------------------------------------------------------- 1 | library(bigreadr) 2 | library(bigstatsr) 3 | library(foreach) 4 | 5 | ## Need to handle 'skip' 6 | csv2 <- "tmp-data/mtcars-long.csv" 7 | n <- nlines(csv2) 8 | K <- 20 9 | every_lines <- ceiling(n / 20) 10 | 11 | parallel <- TRUE 12 | if (!parallel) { 13 | registerDoSEQ() 14 | } else { 15 | cl <- parallel::makeCluster(2) 16 | doParallel::registerDoParallel(cl) 17 | # on.exit(parallel::stopCluster(cl), add = TRUE) 18 | } 19 | 20 | TIME <- 1 / (10 * K) 21 | parts <- FBM(K, 1, init = 0, type = "integer") 22 | tmp <- tempfile() 23 | files <- paste0(tmp, 1:K, ".txt") 24 | system.time({ 25 | res <- foreach(job = 1:2) %dopar% { 26 | 27 | if (job == 1) { 28 | print(1) 29 | system.time( 30 | test <- bigreadr:::test_setvbuf7(csv2, tmp, every_nlines = every_lines, parts) 31 | ) 32 | # NULL 33 | } else { 34 | print(2) 35 | system.time({ 36 | lapply(seq_along(files), function(k) { 37 | while (parts[k] == 0) Sys.sleep(TIME) 38 | bigreadr:::fread2(files[k]) 39 | }) 40 | }) 41 | } 42 | } 43 | }) 44 | parallel::stopCluster(cl) 45 | res 46 | # res <- do.call(bigreadr::my_rbind, res[[2]]) 47 | 48 | 49 | #### PROBLEM: fread reading (second job) is slowing down first job #### 50 | 51 | 52 | system.time({ 53 | lapply(seq_along(files), function(k) { 54 | while (parts[k] == 0) Sys.sleep(TIME) 55 | bigreadr:::fread2(files[k], nThread = 8) 56 | }) 57 | }) 58 | -------------------------------------------------------------------------------- /tmp-tests/test-setvbuf.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define BUFLEN (64 * 1024) 10 | 11 | // [[Rcpp::export]] 12 | int test_setvbuf(std::string filename, int size = 100) { 13 | 14 | FILE *fp = fopen(filename.c_str(), "r"); 15 | 16 | unsigned sizem1 = size - 1; 17 | int last = size - 2; 18 | 19 | char line[size]; 20 | // char *id; 21 | // char *token; 22 | char *buf = (char*)malloc(BUFLEN); 23 | int c = 0; 24 | 25 | setvbuf ( fp , buf , _IOLBF, BUFLEN ); 26 | while (fgets(line, size, fp) != NULL) { 27 | // Rcout << strlen(line) << std::endl; 28 | if (strlen(line) < sizem1) { 29 | c++; 30 | } else { 31 | // Rcout << (line[last] == '\n') << std::endl; 32 | if (line[last] == '\n') c++; 33 | } 34 | // id = strtok(line, "\t"); 35 | // token = strtok(NULL, "\t"); 36 | // 37 | // char *fnout = malloc(strlen(id)+5); 38 | // fnout = strcat(fnout, id); 39 | // fnout = strcat(fnout, ".seq"); 40 | // 41 | // fpout = fopen(fnout, "w"); 42 | // setvbuf ( fpout , NULL , _IONBF , 0 ); 43 | // fprintf(fpout, "%s", token); 44 | // fclose(fpout); 45 | } 46 | 47 | fclose(fp); 48 | 49 | return c; 50 | 51 | } 52 | 53 | 
// [[Rcpp::export]] 54 | int test_setvbuf2(std::string filename, int size = 100) { 55 | 56 | FILE *fp = fopen(filename.c_str(), "r"); 57 | 58 | unsigned sizem1 = size - 1; 59 | int last = size - 2; 60 | 61 | char * line = new char[size]; 62 | char * temp; 63 | // char *id; 64 | // char *token; 65 | // char *buf = (char*)malloc(BUFLEN); 66 | int c = 0; 67 | 68 | setvbuf ( fp , NULL , _IOLBF, BUFLEN ); 69 | while (fgets(line, size, fp) != NULL) { 70 | // Rcout << strlen(line) << std::endl; 71 | if (strlen(line) < sizem1) { 72 | c++; 73 | } else { 74 | // Rcout << (line[last] == '\n') << std::endl; 75 | if (line[last] == '\n') c++; 76 | size *= 2; 77 | temp = new char[size]; 78 | delete [] line; 79 | line = temp; 80 | sizem1 = size - 1; 81 | last = size - 2; 82 | } 83 | // id = strtok(line, "\t"); 84 | // token = strtok(NULL, "\t"); 85 | // 86 | // char *fnout = malloc(strlen(id)+5); 87 | // fnout = strcat(fnout, id); 88 | // fnout = strcat(fnout, ".seq"); 89 | // 90 | // fpout = fopen(fnout, "w"); 91 | // setvbuf ( fpout , NULL , _IONBF , 0 ); 92 | // fprintf(fpout, "%s", token); 93 | // fclose(fpout); 94 | } 95 | 96 | fclose(fp); 97 | 98 | return c; 99 | 100 | } 101 | 102 | /*** R 103 | test_setvbuf("text-write.txt") 104 | test_setvbuf2("text-write.txt") 105 | */ 106 | -------------------------------------------------------------------------------- /tmp-tests/test-setvbuf2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define BUFLEN (64 * 1024) 10 | 11 | // [[Rcpp::export]] 12 | int test_setvbuf3(std::string filename, 13 | std::string filename2, 14 | int size = 100) { 15 | 16 | FILE *fp_in = fopen(filename.c_str(), "rb"); 17 | FILE *fp_out = fopen(filename2.c_str(), "wb"); 18 | 19 | unsigned sizem1 = size - 1; 20 | int last = size - 2; 21 | 22 | char * line = new char[size]; 23 | char * temp; 24 | // char *id; 25 | // char *token; 26 | // char *buf = (char*)malloc(BUFLEN); 27 | int c = 0; 28 | 29 | setvbuf ( fp_in , NULL , _IOLBF, BUFLEN ); 30 | setvbuf ( fp_out , NULL , _IOFBF, BUFLEN ); 31 | 32 | 33 | while (fgets(line, size, fp_in) != NULL) { 34 | 35 | fputs(line, fp_out); 36 | 37 | // Rcout << strlen(line) << std::endl; 38 | if (strlen(line) < sizem1) { 39 | c++; 40 | // if (c % 1000 == 1) fflush(fp_out); 41 | } else { 42 | // Rcout << (line[last] == '\n') << std::endl; 43 | if (line[last] == '\n') c++; 44 | size *= 2; 45 | temp = new char[size]; 46 | delete [] line; 47 | line = temp; 48 | sizem1 = size - 1; 49 | last = size - 2; 50 | } 51 | 52 | // id = strtok(line, "\t"); 53 | // token = strtok(NULL, "\t"); 54 | // 55 | // char *fnout = malloc(strlen(id)+5); 56 | // fnout = strcat(fnout, id); 57 | // fnout = strcat(fnout, ".seq"); 58 | // 59 | // fpout = fopen(fnout, "w"); 60 | // setvbuf ( fpout , NULL , _IONBF , 0 ); 61 | // fprintf(fpout, "%s", token); 62 | // fclose(fpout); 63 | } 64 | 65 | fclose(fp_in); 66 | fflush(fp_out); 67 | fclose(fp_out); 68 | 69 | return c; 70 | } 71 | 72 | /*** R 73 | test_setvbuf3("text-write.txt", "text-write2.txt") 74 | */ 75 | -------------------------------------------------------------------------------- /tmp-tests/test-setvbuf3.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define BUFLEN (64 * 1024) 4 | 5 | // [[Rcpp::export]] 6 | int test_setvbuf4(std::string filename, std::string filename2) { 7 | 8 | FILE *fp_in = fopen(filename.c_str(), "rb"); 
9 | FILE *fp_out = fopen(filename2.c_str(), "wb"); 10 | 11 | size_t line_size; 12 | size_t size = 100; 13 | size_t last = size - 2; 14 | 15 | char *line = new char[size]; 16 | char *temp; 17 | int c = 0; 18 | bool not_eol; 19 | 20 | setvbuf(fp_in, NULL, _IOLBF, BUFLEN); 21 | setvbuf(fp_out, NULL, _IOFBF, BUFLEN); 22 | 23 | while (fgets(line, size, fp_in) != NULL) { 24 | 25 | line_size = strlen(line); 26 | 27 | fputs(line, fp_out); 28 | 29 | if (line_size > last) { 30 | 31 | not_eol = (line[last] != '\n'); 32 | 33 | size *= 2; 34 | temp = new char[size]; 35 | delete [] line; 36 | line = temp; 37 | last = size - 2; 38 | 39 | if (not_eol) continue; 40 | } 41 | 42 | // End of line 43 | c++; 44 | 45 | } 46 | 47 | fclose(fp_in); 48 | fflush(fp_out); 49 | fclose(fp_out); 50 | 51 | return c; 52 | } 53 | 54 | /*** R 55 | test_setvbuf4("text-write.txt", "text-write2.txt") 56 | */ 57 | -------------------------------------------------------------------------------- /tmp-tests/test-setvbuf4.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | #define BUFLEN (64 * 1024) 5 | 6 | // [[Rcpp::export]] 7 | int test_setvbuf5(std::string filename, std::string filename2) { 8 | 9 | FILE *fp_in = fopen(filename.c_str(), "rb"), *fp_out; 10 | setvbuf(fp_in, NULL, _IOLBF, BUFLEN); 11 | 12 | const char *fn_out = filename2.c_str(); 13 | char name_out[strlen(fn_out) + 20]; 14 | 15 | size_t line_size; 16 | size_t size = 100; 17 | size_t last = size - 2; 18 | 19 | char *line = new char[size]; 20 | char *temp; 21 | int c = 0; 22 | bool not_eol; 23 | 24 | sprintf(name_out, "%s%d.txt", fn_out, c); 25 | fp_out = fopen(name_out, "wb"); 26 | setvbuf(fp_out, NULL, _IOFBF, BUFLEN); 27 | 28 | while (fgets(line, size, fp_in) != NULL) { 29 | 30 | line_size = strlen(line); 31 | 32 | fputs(line, fp_out); 33 | 34 | if (line_size > last) { 35 | 36 | not_eol = (line[last] != '\n'); 37 | 38 | fflush(fp_out); 39 | size *= 2; 40 | temp = new char[size]; 41 | delete [] line; 42 | line = temp; 43 | last = size - 2; 44 | 45 | if (not_eol) continue; 46 | } 47 | 48 | // End of line 49 | c++; 50 | fflush(fp_out); 51 | fclose(fp_out); 52 | sprintf(name_out, "%s%d.txt", fn_out, c); 53 | fp_out = fopen(name_out, "wb"); 54 | setvbuf(fp_out, NULL, _IOFBF, BUFLEN); 55 | 56 | } 57 | 58 | fflush(fp_out); 59 | fclose(fp_out); // last one has nothing inside 60 | fclose(fp_in); 61 | 62 | return c; 63 | } 64 | 65 | /*** R 66 | test_setvbuf5("text-write.txt", "tmp/text-write-part") 67 | readLines("text-write.txt")[[6]] 68 | readLines("tmp/text-write-part5.txt") 69 | */ 70 | -------------------------------------------------------------------------------- /tmp-tests/test-setvbuf5.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | #define BUFLEN (64 * 1024) 5 | 6 | // [[Rcpp::export]] 7 | NumericVector test_setvbuf6(std::string filename, 8 | std::string filename2, 9 | int every_nlines) { 10 | 11 | FILE *fp_in = fopen(filename.c_str(), "rb"), *fp_out; 12 | setvbuf(fp_in, NULL, _IOLBF, BUFLEN); 13 | 14 | const char *fn_out = filename2.c_str(); 15 | char name_out[strlen(fn_out) + 20]; 16 | 17 | size_t line_size; 18 | size_t size = 100; 19 | size_t last = size - 2; 20 | 21 | char *line = new char[size]; 22 | char *temp; 23 | 24 | bool not_eol, not_eof = true; 25 | int i, k = 0, c = 0; 26 | 27 | 28 | while (not_eof) { 29 | 30 | // Open file number 'k' 31 | sprintf(name_out, "%s%d.txt", fn_out, ++k); 32 | 
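    // part 'k' is written to '<filename2><k>.txt', with k starting at 1
    // (e.g. "tmp2/text-write-part4.txt", checked in the R chunk at the end of this file)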
fp_out = fopen(name_out, "wb"); 33 | setvbuf(fp_out, NULL, _IOFBF, BUFLEN); 34 | 35 | // Fill it with 'every_nlines' lines 36 | i = 0; 37 | while (i < every_nlines) { 38 | 39 | if (fgets(line, size, fp_in) == NULL) { 40 | not_eof = false; 41 | break; 42 | } 43 | 44 | line_size = strlen(line); 45 | 46 | fputs(line, fp_out); 47 | 48 | if (line_size > last) { 49 | 50 | not_eol = (line[last] != '\n'); 51 | 52 | fflush(fp_out); 53 | size *= 2; 54 | temp = new char[size]; 55 | delete [] line; 56 | line = temp; 57 | last = size - 2; 58 | 59 | if (not_eol) continue; 60 | } 61 | 62 | // End of line 63 | i++; 64 | 65 | } 66 | 67 | c += i; 68 | 69 | // Close file number 'k' 70 | fflush(fp_out); 71 | fclose(fp_out); 72 | 73 | } 74 | 75 | fclose(fp_in); 76 | 77 | return NumericVector::create(_["K"] = k, _["every"] = every_nlines, _["N"] = c); 78 | } 79 | 80 | /***R 81 | test_setvbuf6("text-write.txt", "tmp2/text-write-part", 2) 82 | readLines("text-write.txt")[[7]] 83 | readLines("tmp2/text-write-part4.txt") 84 | */ 85 | -------------------------------------------------------------------------------- /tmp-tests/test-setvbuf6.cpp: -------------------------------------------------------------------------------- 1 | // [[Rcpp::depends(BH, bigstatsr)]] 2 | #include 3 | 4 | #define BUFLEN (64 * 1024) 5 | 6 | // [[Rcpp::export]] 7 | NumericVector test_setvbuf7(std::string filename, 8 | std::string filename2, 9 | int every_nlines, 10 | Environment parts_) { 11 | 12 | XPtr xptr = parts_["address"]; 13 | BMAcc parts(xptr); 14 | 15 | FILE *fp_in = fopen(filename.c_str(), "rb"), *fp_out; 16 | setvbuf(fp_in, NULL, _IOLBF, BUFLEN); 17 | 18 | const char *fn_out = filename2.c_str(); 19 | char name_out[strlen(fn_out) + 20]; 20 | 21 | size_t line_size; 22 | size_t size = 100; 23 | size_t last = size - 2; 24 | 25 | char *line = new char[size]; 26 | char *temp; 27 | 28 | bool not_eol, not_eof = true; 29 | int i, k = 0, c = 0; 30 | 31 | 32 | while (not_eof) { 33 | 34 | // Open file number 'k' 35 | sprintf(name_out, "%s%d.txt", fn_out, ++k); 36 | fp_out = fopen(name_out, "wb"); 37 | setvbuf(fp_out, NULL, _IOFBF, BUFLEN); 38 | 39 | // Fill it with 'every_nlines' lines 40 | i = 0; 41 | while (i < every_nlines) { 42 | 43 | if (fgets(line, size, fp_in) == NULL) { 44 | not_eof = false; 45 | break; 46 | } 47 | 48 | line_size = strlen(line); 49 | 50 | fputs(line, fp_out); 51 | 52 | if (line_size > last) { 53 | 54 | not_eol = (line[last] != '\n'); 55 | 56 | fflush(fp_out); 57 | size *= 2; 58 | temp = new char[size]; 59 | delete [] line; 60 | line = temp; 61 | last = size - 2; 62 | 63 | if (not_eol) continue; 64 | } 65 | 66 | // End of line 67 | i++; 68 | 69 | } 70 | 71 | c += i; 72 | 73 | // Close file number 'k' 74 | fflush(fp_out); 75 | fclose(fp_out); 76 | parts(k - 1, 0) = 1; 77 | Rcout << k << std::endl; 78 | 79 | } 80 | 81 | fclose(fp_in); 82 | 83 | return NumericVector::create(_["K"] = k, _["every"] = every_nlines, _["N"] = c); 84 | } 85 | 86 | -------------------------------------------------------------------------------- /tmp-tests/test-string.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | #define BUFLEN (64 * 1024) 5 | 6 | // [[Rcpp::export]] 7 | void test_string(std::string filename) { 8 | 9 | const char *fn = filename.c_str(); 10 | char name_out[strlen(fn) + 20]; 11 | 12 | for (int k = 1; k < 10; k++) { 13 | sprintf(name_out, "%s%d.txt", fn, k); 14 | Rcout << filename << std::endl; 15 | Rcout << name_out << std::endl; 16 | } 17 | } 
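// (quick check that a buffer of strlen(fn) + 20 bytes leaves enough room for
//  the "%d.txt" suffix that the split helpers append to the temporary-file prefix)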
18 | 19 | /*** R 20 | test_string(tempfile()) 21 | */ 22 | -------------------------------------------------------------------------------- /tmp-tests/text-write.txt: -------------------------------------------------------------------------------- 1 | a 199 23.45 2 | b 1e+8 3 | c 23339999 4 | errrrr 5 | dde 6 | mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmeeeeeeeeeeeeeeeeeeeeeeeelllllllllllllllllllllllllddddddddddddddddddddddddddddddddddddddddeeeeeeeeeeeeeeeeeeeeeeeeeeeggggggggggggggggggggggaaacvdgbfetgdfghmethdpfa 7 | mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmeeeeeeeeeeeeeeeeeeeeeeeelllllllllllllllllllllllllddddddddddddddddddddddddddddddddddddddddeeeeeeeeeeeeeeeeeeeeeeeeeeeggggggggggggggggggggggaaacvdgbfetgdfghmethdpf 8 | -------------------------------------------------------------------------------- /tmp-tests/text-write2.txt: -------------------------------------------------------------------------------- 1 | a 199 23.45 2 | b 1e+8 3 | c 23339999 4 | errrrr 5 | dde 6 | mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmeeeeeeeeeeeeeeeeeeeeeeeelllllllllllllllllllllllllddddddddddddddddddddddddddddddddddddddddeeeeeeeeeeeeeeeeeeeeeeeeeeeggggggggggggggggggggggaaacvdgbfetgdfghmethdpfa 7 | mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmeeeeeeeeeeeeeeeeeeeeeeeelllllllllllllllllllllllllddddddddddddddddddddddddddddddddddddddddeeeeeeeeeeeeeeeeeeeeeeeeeeeggggggggggggggggggggggaaacvdgbfetgdfghmethdpf 8 | -------------------------------------------------------------------------------- /vignettes/csv2sqlite.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Convert a CSV to SQLite by parts" 3 | author: "Florian Privé" 4 | date: "August 26, 2018" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE, fig.align = "center", eval = FALSE) 10 | ``` 11 | 12 | ## How 13 | 14 | You can easily use this package {bigreadr} to convert a CSV to an SQLite database without loading the whole CSV in memory. 15 | 16 | You can use the following function: 17 | 18 | ```{r} 19 | csv2sqlite <- function(csv, 20 | every_nlines, 21 | table_name, 22 | dbname = sub("\\.csv$", ".sqlite", csv), 23 | ...) { 24 | 25 | # Prepare reading 26 | con <- RSQLite::dbConnect(RSQLite::SQLite(), dbname) 27 | init <- TRUE 28 | fill_sqlite <- function(df) { 29 | 30 | if (init) { 31 | RSQLite::dbCreateTable(con, table_name, df) 32 | init <<- FALSE 33 | } 34 | 35 | RSQLite::dbAppendTable(con, table_name, df) 36 | NULL 37 | } 38 | 39 | # Read and fill by parts 40 | bigreadr::big_fread1(csv, every_nlines, 41 | .transform = fill_sqlite, 42 | .combine = unlist, 43 | ... = ...) 44 | 45 | # Returns 46 | con 47 | } 48 | ``` 49 | 50 | Function `bigreadr::big_fread1()` first splits the CSV in smaller CSV files, then it reads these CSV files as data frames and transform them, and finally combine the results. 51 | 52 | Here, the transformation is just appending the data frame to the SQLite database (and creating this DB the first time). Moreover, you don't want to return anything (`NULL`). 
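The same split / transform / combine pattern works for other tasks that do not fit in memory. As a further illustration, here is a minimal sketch (assuming a hypothetical `big.csv` with a `year` column) that keeps only a subset of rows while reading by parts:

```{r}
# Read 'big.csv' one million lines at a time, keep only the recent rows of
# each chunk, and stack the filtered chunks into a single data frame
recent <- bigreadr::big_fread1(
  "big.csv", every_nlines = 1e6,
  .transform = function(df) df[df$year > 2000, , drop = FALSE],
  .combine = bigreadr::rbind_df
)
```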
53 | 54 | ## Use case 55 | 56 | For example, with this function, I was able to convert a [9 GB CSV file](https://www.data.gouv.fr/fr/datasets/base-sirene-des-entreprises-et-de-leurs-etablissements-siren-siret/) to SQLite in 40 minutes, using less than 2 GB of memory (here, `csv` is the path to the downloaded CSV file): 57 | 58 | ```{r} 59 | con <- csv2sqlite(csv, every_nlines = 1e6, table_name = "sirene", 60 |                   encoding = "Latin-1") 61 | ``` 62 | 63 | --------------------------------------------------------------------------------
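Once the conversion has finished, the returned connection can be used to sanity-check the import and to query the table without ever loading it entirely in memory. A short sketch, assuming the `con` and the `"sirene"` table created just above:

```{r}
# Number of imported rows, counted by SQLite itself
RSQLite::dbGetQuery(con, "SELECT COUNT(*) AS n_rows FROM sirene")

# Inspect only the first few rows
RSQLite::dbGetQuery(con, "SELECT * FROM sirene LIMIT 5")

# Close the connection when done
RSQLite::dbDisconnect(con)
```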