├── .Rbuildignore
├── .github
│   ├── .gitignore
│   └── workflows
│       ├── R-CMD-check.yaml
│       └── test-coverage.yaml
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── R
│   ├── RcppExports.R
│   ├── bigreadr-package.R
│   ├── bind.R
│   ├── nlines-split.R
│   ├── read.R
│   └── zzz.R
├── README.md
├── _pkgdown.yml
├── bigreadr.Rproj
├── codecov.yml
├── docs
│   ├── 404.html
│   ├── articles
│   │   ├── csv2sqlite.html
│   │   └── index.html
│   ├── authors.html
│   ├── bootstrap-toc.css
│   ├── bootstrap-toc.js
│   ├── docsearch.css
│   ├── docsearch.js
│   ├── index.html
│   ├── link.svg
│   ├── pkgdown.css
│   ├── pkgdown.js
│   ├── pkgdown.yml
│   ├── reference
│   │   ├── Rplot001.png
│   │   ├── big_fread1.html
│   │   ├── big_fread2.html
│   │   ├── bigreadr-package.html
│   │   ├── cbind_df.html
│   │   ├── fread2.html
│   │   ├── fwrite2.html
│   │   ├── index.html
│   │   ├── nlines.html
│   │   ├── rbind_df.html
│   │   └── split_file.html
│   └── sitemap.xml
├── inst
│   ├── WORDLIST
│   └── testdata
│       ├── cars_with_newline.csv
│       ├── cars_without_newline.csv
│       └── wrong_string.rds
├── man
│   ├── big_fread1.Rd
│   ├── big_fread2.Rd
│   ├── bigreadr-package.Rd
│   ├── cbind_df.Rd
│   ├── fread2.Rd
│   ├── fwrite2.Rd
│   ├── nlines.Rd
│   ├── rbind_df.Rd
│   └── split_file.Rd
├── src
│   ├── .gitignore
│   ├── RcppExports.cpp
│   └── nlines-split.cpp
├── tests
│   ├── spelling.R
│   ├── testthat.R
│   └── testthat
│       ├── test-bind.R
│       ├── test-nlines.R
│       ├── test-read.R
│       └── test-split.R
├── tmp-save
│   └── nlines.cpp
├── tmp-tests
│   ├── bench-acc.R
│   ├── bench-rbind.R
│   ├── bench-read.R
│   ├── bench-read2.R
│   ├── bench-read3.R
│   ├── bench-read4.R
│   ├── bench-read5.R
│   ├── bench-read6.R
│   ├── bench-read7.R
│   ├── has-header.R
│   ├── split.cpp
│   ├── test-file2string.cpp
│   ├── test-mmap-nlines.cpp
│   ├── test-parallel.R
│   ├── test-parallel2.R
│   ├── test-setvbuf.cpp
│   ├── test-setvbuf2.cpp
│   ├── test-setvbuf3.cpp
│   ├── test-setvbuf4.cpp
│   ├── test-setvbuf5.cpp
│   ├── test-setvbuf6.cpp
│   ├── test-string.cpp
│   ├── text-write.txt
│   └── text-write2.txt
└── vignettes
    └── csv2sqlite.Rmd
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^tmp-tests$
4 | ^tmp-data$
5 | ^\.travis\.yml$
6 | ^appveyor\.yml$
7 | ^codecov\.yml$
8 | ^tmp-save$
9 | ^_pkgdown\.yml$
10 | ^docs$
11 | ^vignettes$
12 | ^\.github$
13 |
--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 | branches: [main, master]
8 |
9 | name: R-CMD-check
10 |
11 | jobs:
12 | R-CMD-check:
13 | runs-on: ${{ matrix.config.os }}
14 |
15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }})
16 |
17 | strategy:
18 | fail-fast: false
19 | matrix:
20 | config:
21 | - {os: macos-latest, r: 'release'}
22 | - {os: windows-latest, r: 'release'}
23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
24 | - {os: ubuntu-latest, r: 'release'}
25 | - {os: ubuntu-latest, r: 'oldrel-1'}
26 |
27 | env:
28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
29 | R_KEEP_PKG_SOURCE: yes
30 |
31 | steps:
32 | - uses: actions/checkout@v3
33 |
34 | - uses: r-lib/actions/setup-pandoc@v2
35 |
36 | - uses: r-lib/actions/setup-r@v2
37 | with:
38 | r-version: ${{ matrix.config.r }}
39 | http-user-agent: ${{ matrix.config.http-user-agent }}
40 | use-public-rspm: true
41 |
42 | - uses: r-lib/actions/setup-r-dependencies@v2
43 | with:
44 | extra-packages: any::rcmdcheck
45 | needs: check
46 |
47 | - uses: r-lib/actions/check-r-package@v2
48 | with:
49 | upload-snapshots: true
50 |
--------------------------------------------------------------------------------
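
Running the same check locally is the quickest way to reproduce a failure from this matrix. A minimal sketch using the rcmdcheck package; the `--no-manual` flag and the `error_on` level are illustrative choices, not settings this workflow pins down:

```r
# install.packages("rcmdcheck")
# Roughly the local counterpart of r-lib/actions/check-r-package@v2:
# build the package in the current directory and run R CMD check on it.
rcmdcheck::rcmdcheck(".", args = "--no-manual", error_on = "warning")
```
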
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 | branches: [main, master]
8 |
9 | name: test-coverage
10 |
11 | jobs:
12 | test-coverage:
13 | runs-on: ubuntu-latest
14 | env:
15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 |
17 | steps:
18 | - uses: actions/checkout@v3
19 |
20 | - uses: r-lib/actions/setup-r@v2
21 | with:
22 | use-public-rspm: true
23 |
24 | - uses: r-lib/actions/setup-r-dependencies@v2
25 | with:
26 | extra-packages: any::covr
27 | needs: coverage
28 |
29 | - name: Test coverage
30 | run: |
31 | covr::codecov(
32 | quiet = FALSE,
33 | clean = FALSE,
34 | install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package")
35 | )
36 | shell: Rscript {0}
37 |
38 | - name: Show testthat output
39 | if: always()
40 | run: |
41 | ## --------------------------------------------------------------------
42 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
43 | shell: bash
44 |
45 | - name: Upload test results
46 | if: failure()
47 | uses: actions/upload-artifact@v3
48 | with:
49 | name: coverage-test-failures
50 | path: ${{ runner.temp }}/package
51 |
--------------------------------------------------------------------------------
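
The coverage step can also be reproduced locally before pushing. A minimal sketch with the covr package; `covr::codecov()` in the workflow both measures and uploads, whereas the calls below only measure and inspect:

```r
# install.packages("covr")
cov <- covr::package_coverage(".")  # run the tests and track line coverage
covr::percent_coverage(cov)         # overall percentage, as reported to Codecov
covr::report(cov)                   # per-file interactive report (needs the DT package)
```
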
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | tmp-data/
6 | tmp-tests/tmp/
7 | tmp-tests/tmp2/
8 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: bigreadr
2 | Version: 0.2.5
3 | Date: 2022-12-06
4 | Title: Read Large Text Files
5 | Description: Read large text files by splitting them into smaller files.
6 | Package 'bigreadr' also provides some convenient wrappers around fread()
7 | and fwrite() from package 'data.table'.
8 | Authors@R:
9 | person(given = "Florian",
10 | family = "Privé",
11 | role = c("aut", "cre"),
12 | email = "florian.prive.21@gmail.com")
13 | License: GPL-3
14 | Encoding: UTF-8
15 | ByteCompile: true
16 | Roxygen: list(markdown = TRUE)
17 | RoxygenNote: 6.1.0
18 | Imports:
19 | bigassertr (>= 0.1.1),
20 | data.table,
21 | parallelly,
22 | Rcpp,
23 | utils
24 | Suggests:
25 | spelling,
26 | testthat,
27 | covr,
28 | RSQLite
29 | LinkingTo:
30 | Rcpp
31 | Language: en-US
32 | URL: https://github.com/privefl/bigreadr
33 | BugReports: https://github.com/privefl/bigreadr/issues
34 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(big_fread1)
4 | export(big_fread2)
5 | export(cbind_df)
6 | export(fread2)
7 | export(fwrite2)
8 | export(get_split_files)
9 | export(nlines)
10 | export(rbind_df)
11 | export(split_file)
12 | importFrom(Rcpp,sourceCpp)
13 | importFrom(bigassertr,assert_exist)
14 | importFrom(bigassertr,assert_int)
15 | importFrom(bigassertr,assert_pos)
16 | importFrom(bigassertr,message2)
17 | importFrom(bigassertr,stop2)
18 | importFrom(bigassertr,warning2)
19 | useDynLib(bigreadr, .registration = TRUE)
20 |
--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 |
4 | nlines_cpp <- function(file) {
5 | .Call(`_bigreadr_nlines_cpp`, file)
6 | }
7 |
8 | split_every_nlines <- function(name_in, prefix_out, every_nlines, repeat_header) {
9 | .Call(`_bigreadr_split_every_nlines`, name_in, prefix_out, every_nlines, repeat_header)
10 | }
11 |
12 |
--------------------------------------------------------------------------------
/R/bigreadr-package.R:
--------------------------------------------------------------------------------
1 | #' @useDynLib bigreadr, .registration = TRUE
2 | #' @importFrom Rcpp sourceCpp
3 | #' @importFrom bigassertr message2 warning2 stop2 assert_exist assert_int assert_pos
4 | #' @keywords internal
5 | "_PACKAGE"
6 |
--------------------------------------------------------------------------------
/R/bind.R:
--------------------------------------------------------------------------------
1 | ################################################################################
2 |
3 | #' Merge data frames
4 | #'
5 | #' @param list_df A list of multiple data frames with the same variables in the
6 | #' same order.
7 | #'
8 | #' @return One merged data frame with the names of the first input data frame.
9 | #' @export
10 | #'
11 | #' @examples
12 | #' str(iris)
13 | #' str(rbind_df(list(iris, iris)))
14 | #'
15 | rbind_df <- function(list_df) {
16 |
17 | first_df <- list_df[[1]]
18 | if (data.table::is.data.table(first_df)) {
19 | data.table::rbindlist(list_df)
20 | } else if (is.data.frame(first_df)) {
21 | list_df_merged <- lapply(seq_along(first_df), function(k) {
22 | unlist(lapply(list_df, function(l) l[[k]]), recursive = FALSE)
23 | })
24 | list_df_merged_named <- stats::setNames(list_df_merged, names(list_df[[1]]))
25 | as.data.frame(list_df_merged_named, stringsAsFactors = FALSE)
26 | } else {
27 | stop2("'list_df' should contain data tables or data frames.")
28 | }
29 | }
30 |
31 | ################################################################################
32 |
33 | #' Merge data frames
34 | #'
35 | #' @param list_df A list of multiple data frames with the same observations in
36 | #' the same order.
37 | #'
38 | #' @return One merged data frame.
39 | #' @export
40 | #'
41 | #' @examples
42 | #' str(iris)
43 | #' str(cbind_df(list(iris, iris)))
44 | #'
45 | cbind_df <- function(list_df) {
46 | do.call(cbind, list_df)
47 | }
48 |
49 | ################################################################################
50 |
--------------------------------------------------------------------------------
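
A minimal usage sketch of the two helpers defined above (plain data frames here; if the first element were a data.table, `rbind_df()` would instead go through `data.table::rbindlist()`):

```r
library(bigreadr)

# Row-bind: same variables in the same order; names come from the first element
str(rbind_df(list(head(iris, 3), tail(iris, 3))))  # 6 obs. of 5 variables

# Column-bind: same observations in the same order (thin wrapper around cbind)
str(cbind_df(list(iris[1:2], iris[3:5])))          # 150 obs. of 5 variables
```
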
/R/nlines-split.R:
--------------------------------------------------------------------------------
1 | ################################################################################
2 |
3 | #' Number of lines
4 | #'
5 | #' Get the number of lines of a file.
6 | #'
7 | #' @param file Path of the file.
8 | #'
9 | #' @return The number of lines as one integer.
10 | #' @export
11 | #'
12 | #' @examples
13 | #' tmp <- fwrite2(iris)
14 | #' nlines(tmp)
15 | #'
16 | nlines <- function(file) {
17 | nlines_cpp( normalizePath(file, mustWork = TRUE) )
18 | }
19 |
20 | ################################################################################
21 |
22 | #' Split file every nlines
23 | #'
24 | #' @param file Path to file that you want to split.
25 | #' @param every_nlines Maximum number of lines in new file parts.
26 | #' @param prefix_out Prefix for created files. Default uses `tempfile()`.
27 | #' @param repeat_header Whether to repeat the header row in each file.
28 | #' Default is `FALSE`.
29 | #'
30 | #' @return A list with
31 | #' - `name_in`: input parameter `file`,
32 | #' - `prefix_out`: input parameter `prefix_out`,
33 | #' - `nfiles`: Number of files (parts) created,
34 | #' - `nlines_part`: input parameter `every_nlines`,
35 | #' - `nlines_all`: total number of lines of `file`.
36 | #' @export
37 | #'
38 | #' @examples
39 | #' tmp <- fwrite2(iris)
40 | #' infos <- split_file(tmp, 100)
41 | #' str(infos)
42 | #' get_split_files(infos)
43 | split_file <- function(file, every_nlines,
44 | prefix_out = tempfile(),
45 | repeat_header = FALSE) {
46 |
47 | split_every_nlines(
48 | name_in = normalizePath(file, mustWork = TRUE),
49 | prefix_out = path.expand(prefix_out),
50 | every_nlines = every_nlines,
51 | repeat_header = repeat_header
52 | )
53 | }
54 |
55 | ################################################################################
56 |
57 | #' Get files from splitting.
58 | #'
59 | #' @param split_file_out Output of [split_file].
60 | #'
61 | #' @return Vector of file paths created by [split_file].
62 | #' @export
63 | #' @rdname split_file
64 | #'
65 | get_split_files <- function(split_file_out) {
66 |
67 | sprintf("%s_%s.txt",
68 | split_file_out[["prefix_out"]],
69 | seq_len(split_file_out[["nfiles"]]))
70 | }
71 |
72 | ################################################################################
73 |
--------------------------------------------------------------------------------
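
A short sketch tying `nlines()`, `split_file()` and `get_split_files()` together; the exact number of parts depends on `every_nlines`, so no part counts are asserted here:

```r
library(bigreadr)

tmp <- fwrite2(iris)     # one header line + 150 data rows
nlines(tmp)              # 151

# Split into parts of roughly 60 lines, repeating the header in every part
infos <- split_file(tmp, every_nlines = 60, repeat_header = TRUE)
parts <- get_split_files(infos)   # "<prefix_out>_1.txt", "<prefix_out>_2.txt", ...
stopifnot(length(parts) == infos$nfiles)

# Read the parts back and row-bind them
iris2 <- rbind_df(lapply(parts, fread2))
```
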
/R/read.R:
--------------------------------------------------------------------------------
1 | ################################################################################
2 |
3 | #' Read text file(s)
4 | #'
5 | #' @param input Path to the file(s) that you want to read from.
6 | #' This can also be a command, some text or an URL.
7 | #' If a vector of inputs is provided, resulting data frames are appended.
8 | #' @param ... Other arguments to be passed to [data.table::fread].
9 | #' @param data.table Whether to return a `data.table` or just a `data.frame`?
10 | #' Default is `FALSE` (and is the opposite of [data.table::fread]).
11 | #' @param nThread Number of threads to use. Default uses all threads minus one.
12 | #'
13 | #' @return A `data.frame` by default; a `data.table` when `data.table = TRUE`.
14 | #' @export
15 | #'
16 | #' @examples
17 | #' tmp <- fwrite2(iris)
18 | #' iris2 <- fread2(tmp)
19 | #' all.equal(iris2, iris) ## fread doesn't use factors
20 | fread2 <- function(input, ...,
21 | data.table = FALSE,
22 | nThread = getOption("bigreadr.nThread")) {
23 |
24 | if (missing(input)) {
25 | data.table::fread(..., data.table = data.table, nThread = nThread)
26 | } else if (length(input) > 1) {
27 | rbind_df(lapply(input, fread2, ..., data.table = data.table, nThread = nThread))
28 | } else {
29 | data.table::fread(input, ..., data.table = data.table, nThread = nThread)
30 | }
31 | }
32 |
33 | ################################################################################
34 |
35 | #' Write a data frame to a text file
36 | #'
37 | #' @param x Data frame to write.
38 | #' @param file Path to the file that you want to write to.
39 | #' Default uses `tempfile()`.
40 | #' @param ... Other arguments to be passed to [data.table::fwrite].
41 | #' @param quote Whether to quote strings (default is `FALSE`).
42 | #' @param nThread Number of threads to use. Default uses all threads minus one.
43 | #'
44 | #' @return Input parameter `file`, invisibly.
45 | #' @export
46 | #'
47 | #' @examples
48 | #' tmp <- fwrite2(iris)
49 | #' iris2 <- fread2(tmp)
50 | #' all.equal(iris2, iris) ## fread doesn't use factors
51 | fwrite2 <- function(x, file = tempfile(), ...,
52 | quote = FALSE,
53 | nThread = getOption("bigreadr.nThread")) {
54 |
55 | data.table::fwrite(x, file, ..., quote = quote, nThread = nThread)
56 | invisible(file)
57 | }
58 |
59 | ################################################################################
60 |
61 | #' Read large text file
62 | #'
63 | #' Read large text file by splitting lines.
64 | #'
65 | #' @param file Path to file that you want to read.
66 | #' @inheritParams split_file
67 | #' @param .transform Function to transform each data frame corresponding to each
68 | #' part of the `file`. Default doesn't change anything.
69 | #' @param .combine Function to combine results (list of data frames).
70 | #' @param skip Number of lines to skip at the beginning of `file`.
71 | #' @param ... Other arguments to be passed to [data.table::fread],
72 | #' except `input`, `file`, `skip`, `col.names` and `showProgress`.
73 | #' @param print_timings Whether to print timings? Default is `TRUE`.
74 | #'
75 | #' @inherit fread2 return
76 | #' @export
77 | #'
78 | big_fread1 <- function(file, every_nlines,
79 | .transform = identity, .combine = rbind_df,
80 | skip = 0, ...,
81 | print_timings = TRUE) {
82 |
83 | begin <- proc.time()[3]
84 | print_proc <- function(action) {
85 | if (print_timings) {
86 | reset <- proc.time()[3]
87 | message2("%s: %s seconds.", action, round(reset - begin, 1))
88 | begin <<- reset
89 | }
90 | }
91 |
92 | ## Split file
93 | infos_split <- split_file(file, every_nlines = every_nlines)
94 | file_parts <- get_split_files(infos_split)
95 | on.exit(unlink(file_parts), add = TRUE)
96 |
97 | print_proc("Splitting")
98 |
99 | ## Read first part to get names and to skip some lines
100 | part1 <- fread2(file_parts[1], skip = skip, ..., showProgress = FALSE)
101 | names_df <- names(part1)
102 | part1 <- .transform(part1)
103 |
104 | print_proc("Reading + transforming first part")
105 |
106 | ## Read + transform other parts
107 | other_parts <- lapply(file_parts[-1], function(file_part) {
108 | .transform(fread2(file_part, skip = 0, col.names = names_df,
109 | ..., showProgress = FALSE))
110 | })
111 |
112 | print_proc("Reading + transforming other parts")
113 |
114 | ## Combine
115 | all_parts <- unname(c(list(part1), other_parts))
116 | res <- tryCatch(.combine(all_parts), error = function(e) {
117 | warning2("Combining failed. Returning list of parts instead..")
118 | all_parts
119 | })
120 |
121 | print_proc("Combining")
122 |
123 | res
124 | }
125 |
126 | ################################################################################
127 |
128 | cut_in_nb <- function(x, nb) {
129 | split(x, sort(rep_len(seq_len(nb), length(x))))
130 | }
131 |
132 | #' Read large text file
133 | #'
134 | #' Read large text file by splitting columns.
135 | #'
136 | #' @param file Path to file that you want to read.
137 | #' @param nb_parts Number of parts in which to split reading (and transforming).
138 | #' Parts refer to blocks of selected columns.
139 | #' Default uses `part_size` to set a good value.
140 | #' @param .transform Function to transform each data frame corresponding to each
141 | #' block of selected columns. Default doesn't change anything.
142 | #' @param .combine Function to combine results (list of data frames).
143 | #' @param skip Number of lines to skip at the beginning of `file`.
144 | #' @param select Indices of columns to keep (sorted). Default keeps them all.
145 | #' @param ... Other arguments to be passed to [data.table::fread],
146 | #' except `input`, `file`, `skip`, `select` and `showProgress`.
147 | #' @param progress Show progress? Default is `FALSE`.
148 | #' @param part_size Size of the parts if `nb_parts` is not supplied.
149 | #' Default is `500 * 1024^2` (500 MB).
150 | #'
151 | #' @return The outputs of `fread2` + `.transform`, combined with `.combine`.
152 | #' @export
153 | #'
154 | big_fread2 <- function(file, nb_parts = NULL,
155 | .transform = identity,
156 | .combine = cbind_df,
157 | skip = 0,
158 | select = NULL,
159 | progress = FALSE,
160 | part_size = 500 * 1024^2, ## 500 MB
161 | ...) {
162 |
163 | assert_exist(file)
164 | ## Split selected columns in nb_parts
165 | if (is.null(select)) {
166 | nb_cols <- ncol(fread2(file, nrows = 1, skip = skip, ...))
167 | select <- seq_len(nb_cols)
168 | } else {
169 | assert_int(select); assert_pos(select)
170 | if (is.unsorted(select, strictly = TRUE))
171 | stop2("Argument 'select' should be sorted.")
172 | }
173 | # Number of parts
174 | if (is.null(nb_parts)) {
175 | nb_parts <- ceiling(file.size(file) / part_size)
176 | if (progress) message2("Will read the file in %d parts.", nb_parts)
177 | }
178 | split_cols <- cut_in_nb(select, nb_parts)
179 |
180 | if (progress) {
181 | pb <- utils::txtProgressBar(min = 0, max = length(select), style = 3)
182 | on.exit(close(pb), add = TRUE)
183 | }
184 |
185 | ## Read + transform other parts
186 | already_read <- 0
187 | all_parts <- lapply(split_cols, function(cols) {
188 | part <- .transform(
189 | fread2(file, skip = skip, select = cols, ..., showProgress = FALSE)
190 | )
191 | already_read <<- already_read + length(cols)
192 | if (progress) utils::setTxtProgressBar(pb, already_read)
193 | part
194 | })
195 | all_parts <- unname(all_parts)
196 |
197 | ## Combine
198 | tryCatch(.combine(all_parts), error = function(e) {
199 | warning2("Combining failed. Returning list of parts instead..")
200 | all_parts
201 | })
202 | }
203 |
204 | ################################################################################
205 |
--------------------------------------------------------------------------------
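
A brief sketch of the two splitting strategies above, plus multi-file reading with `fread2()`; the file is tiny, so the splitting only illustrates the mechanics:

```r
library(bigreadr)

csv <- fwrite2(iris[rep(seq_len(nrow(iris)), 100), ])   # small stand-in for a big file

## (1) big_fread1(): split by lines, read + transform each part, then row-bind
setosa_only <- big_fread1(csv, every_nlines = 5000, print_timings = FALSE,
                          .transform = function(df) df[df$Species == "setosa", ])

## (2) big_fread2(): split by blocks of columns, read each block, then column-bind
all_again <- big_fread2(csv, nb_parts = 2)

## fread2() also accepts a vector of files and appends the resulting data frames
both <- fread2(c(csv, csv))
nrow(both)   # twice the number of rows of a single read
```
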
/R/zzz.R:
--------------------------------------------------------------------------------
1 | ################################################################################
2 |
3 | .onLoad <- function(libname, pkgname) {
4 | options(
5 | bigreadr.nThread = max(parallelly::availableCores() - 1L, 1L)
6 | )
7 | }
8 |
9 | ################################################################################
10 |
11 | .onUnload <- function(libpath) {
12 | options(
13 | bigreadr.nThread = NULL
14 | )
15 | }
16 |
17 | ################################################################################
18 |
--------------------------------------------------------------------------------
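
The `bigreadr.nThread` option set in `.onLoad()` is what `fread2()` and `fwrite2()` pick up as their default `nThread`; a small sketch of inspecting and overriding it:

```r
library(bigreadr)

getOption("bigreadr.nThread")   # availableCores() - 1, but at least 1

# Override globally ...
options(bigreadr.nThread = 2)

# ... or per call, via the nThread argument
tmp <- fwrite2(iris, nThread = 1)
head(fread2(tmp, nThread = 1))
```
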
/README.md:
--------------------------------------------------------------------------------
1 |
2 | [](https://github.com/privefl/bigreadr/actions/workflows/R-CMD-check.yaml)
3 | [](https://cran.r-project.org/package=bigreadr)
4 | [](https://app.codecov.io/gh/privefl/bigreadr?branch=master)
5 |
6 |
7 |
8 | # R package {bigreadr}
9 |
10 | Read large text files based on splitting + `data.table::fread`
11 |
12 |
13 | ## Example
14 |
15 | ```r
16 | # remotes::install_github("privefl/bigreadr")
17 | library(bigreadr)
18 |
19 | # Create a temporary file of ~141 MB (just as an example)
20 | csv <- fwrite2(iris[rep(seq_len(nrow(iris)), 1e4), rep(1:5, 4)], tempfile())
21 | format(file.size(csv), big.mark = ",")
22 |
23 | ## Splitting lines (1)
24 | # Read (by parts) all data -> using `fread` would be faster
25 | nlines(csv) ## 1.5M lines -> split every 500,000
26 | big_iris1 <- big_fread1(csv, every_nlines = 5e5)
27 | # Read and subset (by parts)
28 | big_iris1_setosa <- big_fread1(csv, every_nlines = 5e5, .transform = function(df) {
29 | dplyr::filter(df, Species == "setosa")
30 | })
31 |
32 | ## Splitting columns (2)
33 | big_iris2 <- big_fread2(csv, nb_parts = 3)
34 | # Read and subset (by parts)
35 | species_setosa <- (fread2(csv, select = 5)[[1]] == "setosa")
36 | big_iris2_setosa <- big_fread2(csv, nb_parts = 3, .transform = function(df) {
37 | dplyr::filter(df, species_setosa)
38 | })
39 |
40 | ## Verification
41 | identical(big_iris1_setosa, dplyr::filter(big_iris1, Species == "setosa"))
42 | identical(big_iris2, big_iris1)
43 | identical(big_iris2_setosa, big_iris1_setosa)
44 | ```
45 |
46 | ## Use cases
47 |
48 | Please send me your use cases!
49 |
50 | - [Convert a CSV to SQLite by parts](https://privefl.github.io/bigreadr/articles/csv2sqlite.html)
51 |
52 | - [Read a text file as a disk.frame](https://diskframe.com/articles/ingesting-data.html)
53 |
54 | - [Read a text file as a Filebacked Big Matrix](https://privefl.github.io/bigstatsr/reference/big_read.html)
55 |
56 | - [Read a text file as a Filebacked Data Frame](https://privefl.github.io/bigdfr/reference/FDF_read.html)
57 |
58 | - Read multiple files at once using `bigreadr::fread2()`.
59 |
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/privefl/bigreadr/2d8806f1067b19610a2d633bf2e863b910570d5d/_pkgdown.yml
--------------------------------------------------------------------------------
/bigreadr.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace
22 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | comment: false
2 |
3 | coverage:
4 | status:
5 | project:
6 | default:
7 | target: auto
8 | threshold: 1%
9 | informational: true
10 | patch:
11 | default:
12 | target: auto
13 | threshold: 1%
14 | informational: true
15 |
--------------------------------------------------------------------------------
/docs/404.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | Page not found (404) • bigreadr
9 |
10 |
11 |
12 |
13 |
14 |
15 |
19 |
20 |
21 |
22 |
23 |
24 |
75 |
76 |
79 |
80 | Content not found. Please use links in the navbar.
81 |
82 |
83 |
84 |
88 |
89 |
90 |
91 |
92 |
93 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
--------------------------------------------------------------------------------
/docs/articles/index.html:
--------------------------------------------------------------------------------
1 |
2 | Articles • bigreadr
6 |
7 |
8 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
--------------------------------------------------------------------------------
/docs/authors.html:
--------------------------------------------------------------------------------
1 |
2 | Authors and Citation • bigreadr
6 |
7 |
8 |
9 |
51 |
52 |
63 |
69 |
70 |
71 |
Privé F (2022).
72 | bigreadr: Read Large Text Files .
73 | R package version 0.2.5, https://github.com/privefl/bigreadr .
74 |
75 |
@Manual{,
76 | title = {bigreadr: Read Large Text Files},
77 | author = {Florian Privé},
78 | year = {2022},
79 | note = {R package version 0.2.5},
80 | url = {https://github.com/privefl/bigreadr},
81 | }
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
--------------------------------------------------------------------------------
/docs/bootstrap-toc.css:
--------------------------------------------------------------------------------
1 | /*!
2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/)
3 | * Copyright 2015 Aidan Feldman
4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */
5 |
6 | /* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */
7 |
8 | /* All levels of nav */
9 | nav[data-toggle='toc'] .nav > li > a {
10 | display: block;
11 | padding: 4px 20px;
12 | font-size: 13px;
13 | font-weight: 500;
14 | color: #767676;
15 | }
16 | nav[data-toggle='toc'] .nav > li > a:hover,
17 | nav[data-toggle='toc'] .nav > li > a:focus {
18 | padding-left: 19px;
19 | color: #563d7c;
20 | text-decoration: none;
21 | background-color: transparent;
22 | border-left: 1px solid #563d7c;
23 | }
24 | nav[data-toggle='toc'] .nav > .active > a,
25 | nav[data-toggle='toc'] .nav > .active:hover > a,
26 | nav[data-toggle='toc'] .nav > .active:focus > a {
27 | padding-left: 18px;
28 | font-weight: bold;
29 | color: #563d7c;
30 | background-color: transparent;
31 | border-left: 2px solid #563d7c;
32 | }
33 |
34 | /* Nav: second level (shown on .active) */
35 | nav[data-toggle='toc'] .nav .nav {
36 | display: none; /* Hide by default, but at >768px, show it */
37 | padding-bottom: 10px;
38 | }
39 | nav[data-toggle='toc'] .nav .nav > li > a {
40 | padding-top: 1px;
41 | padding-bottom: 1px;
42 | padding-left: 30px;
43 | font-size: 12px;
44 | font-weight: normal;
45 | }
46 | nav[data-toggle='toc'] .nav .nav > li > a:hover,
47 | nav[data-toggle='toc'] .nav .nav > li > a:focus {
48 | padding-left: 29px;
49 | }
50 | nav[data-toggle='toc'] .nav .nav > .active > a,
51 | nav[data-toggle='toc'] .nav .nav > .active:hover > a,
52 | nav[data-toggle='toc'] .nav .nav > .active:focus > a {
53 | padding-left: 28px;
54 | font-weight: 500;
55 | }
56 |
57 | /* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */
58 | nav[data-toggle='toc'] .nav > .active > ul {
59 | display: block;
60 | }
61 |
--------------------------------------------------------------------------------
/docs/bootstrap-toc.js:
--------------------------------------------------------------------------------
1 | /*!
2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/)
3 | * Copyright 2015 Aidan Feldman
4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */
5 | (function() {
6 | 'use strict';
7 |
8 | window.Toc = {
9 | helpers: {
10 | // return all matching elements in the set, or their descendants
11 | findOrFilter: function($el, selector) {
12 | // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/
13 | // http://stackoverflow.com/a/12731439/358804
14 | var $descendants = $el.find(selector);
15 | return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])');
16 | },
17 |
18 | generateUniqueIdBase: function(el) {
19 | var text = $(el).text();
20 | var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-');
21 | return anchor || el.tagName.toLowerCase();
22 | },
23 |
24 | generateUniqueId: function(el) {
25 | var anchorBase = this.generateUniqueIdBase(el);
26 | for (var i = 0; ; i++) {
27 | var anchor = anchorBase;
28 | if (i > 0) {
29 | // add suffix
30 | anchor += '-' + i;
31 | }
32 | // check if ID already exists
33 | if (!document.getElementById(anchor)) {
34 | return anchor;
35 | }
36 | }
37 | },
38 |
39 | generateAnchor: function(el) {
40 | if (el.id) {
41 | return el.id;
42 | } else {
43 | var anchor = this.generateUniqueId(el);
44 | el.id = anchor;
45 | return anchor;
46 | }
47 | },
48 |
49 | createNavList: function() {
50 | return $('');
51 | },
52 |
53 | createChildNavList: function($parent) {
54 | var $childList = this.createNavList();
55 | $parent.append($childList);
56 | return $childList;
57 | },
58 |
59 | generateNavEl: function(anchor, text) {
60 | var $a = $('<a></a>');
61 | $a.attr('href', '#' + anchor);
62 | $a.text(text);
63 | var $li = $('<li></li>');
64 | $li.append($a);
65 | return $li;
66 | },
67 |
68 | generateNavItem: function(headingEl) {
69 | var anchor = this.generateAnchor(headingEl);
70 | var $heading = $(headingEl);
71 | var text = $heading.data('toc-text') || $heading.text();
72 | return this.generateNavEl(anchor, text);
73 | },
74 |
75 | // Find the first heading level (`<h1>`, then `<h2>`, etc.) that has more than one element. Defaults to 1 (for `<h1>`).
76 | getTopLevel: function($scope) {
77 | for (var i = 1; i <= 6; i++) {
78 | var $headings = this.findOrFilter($scope, 'h' + i);
79 | if ($headings.length > 1) {
80 | return i;
81 | }
82 | }
83 |
84 | return 1;
85 | },
86 |
87 | // returns the elements for the top level, and the next below it
88 | getHeadings: function($scope, topLevel) {
89 | var topSelector = 'h' + topLevel;
90 |
91 | var secondaryLevel = topLevel + 1;
92 | var secondarySelector = 'h' + secondaryLevel;
93 |
94 | return this.findOrFilter($scope, topSelector + ',' + secondarySelector);
95 | },
96 |
97 | getNavLevel: function(el) {
98 | return parseInt(el.tagName.charAt(1), 10);
99 | },
100 |
101 | populateNav: function($topContext, topLevel, $headings) {
102 | var $context = $topContext;
103 | var $prevNav;
104 |
105 | var helpers = this;
106 | $headings.each(function(i, el) {
107 | var $newNav = helpers.generateNavItem(el);
108 | var navLevel = helpers.getNavLevel(el);
109 |
110 | // determine the proper $context
111 | if (navLevel === topLevel) {
112 | // use top level
113 | $context = $topContext;
114 | } else if ($prevNav && $context === $topContext) {
115 | // create a new level of the tree and switch to it
116 | $context = helpers.createChildNavList($prevNav);
117 | } // else use the current $context
118 |
119 | $context.append($newNav);
120 |
121 | $prevNav = $newNav;
122 | });
123 | },
124 |
125 | parseOps: function(arg) {
126 | var opts;
127 | if (arg.jquery) {
128 | opts = {
129 | $nav: arg
130 | };
131 | } else {
132 | opts = arg;
133 | }
134 | opts.$scope = opts.$scope || $(document.body);
135 | return opts;
136 | }
137 | },
138 |
139 | // accepts a jQuery object, or an options object
140 | init: function(opts) {
141 | opts = this.helpers.parseOps(opts);
142 |
143 | // ensure that the data attribute is in place for styling
144 | opts.$nav.attr('data-toggle', 'toc');
145 |
146 | var $topContext = this.helpers.createChildNavList(opts.$nav);
147 | var topLevel = this.helpers.getTopLevel(opts.$scope);
148 | var $headings = this.helpers.getHeadings(opts.$scope, topLevel);
149 | this.helpers.populateNav($topContext, topLevel, $headings);
150 | }
151 | };
152 |
153 | $(function() {
154 | $('nav[data-toggle="toc"]').each(function(i, el) {
155 | var $nav = $(el);
156 | Toc.init($nav);
157 | });
158 | });
159 | })();
160 |
--------------------------------------------------------------------------------
/docs/docsearch.js:
--------------------------------------------------------------------------------
1 | $(function() {
2 |
3 | // register a handler to move the focus to the search bar
4 | // upon pressing shift + "/" (i.e. "?")
5 | $(document).on('keydown', function(e) {
6 | if (e.shiftKey && e.keyCode == 191) {
7 | e.preventDefault();
8 | $("#search-input").focus();
9 | }
10 | });
11 |
12 | $(document).ready(function() {
13 | // do keyword highlighting
14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */
15 | var mark = function() {
16 |
17 | var referrer = document.URL ;
18 | var paramKey = "q" ;
19 |
20 | if (referrer.indexOf("?") !== -1) {
21 | var qs = referrer.substr(referrer.indexOf('?') + 1);
22 | var qs_noanchor = qs.split('#')[0];
23 | var qsa = qs_noanchor.split('&');
24 | var keyword = "";
25 |
26 | for (var i = 0; i < qsa.length; i++) {
27 | var currentParam = qsa[i].split('=');
28 |
29 | if (currentParam.length !== 2) {
30 | continue;
31 | }
32 |
33 | if (currentParam[0] == paramKey) {
34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20"));
35 | }
36 | }
37 |
38 | if (keyword !== "") {
39 | $(".contents").unmark({
40 | done: function() {
41 | $(".contents").mark(keyword);
42 | }
43 | });
44 | }
45 | }
46 | };
47 |
48 | mark();
49 | });
50 | });
51 |
52 | /* Search term highlighting ------------------------------*/
53 |
54 | function matchedWords(hit) {
55 | var words = [];
56 |
57 | var hierarchy = hit._highlightResult.hierarchy;
58 | // loop to fetch from lvl0, lvl1, etc.
59 | for (var idx in hierarchy) {
60 | words = words.concat(hierarchy[idx].matchedWords);
61 | }
62 |
63 | var content = hit._highlightResult.content;
64 | if (content) {
65 | words = words.concat(content.matchedWords);
66 | }
67 |
68 | // return unique words
69 | var words_uniq = [...new Set(words)];
70 | return words_uniq;
71 | }
72 |
73 | function updateHitURL(hit) {
74 |
75 | var words = matchedWords(hit);
76 | var url = "";
77 |
78 | if (hit.anchor) {
79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor;
80 | } else {
81 | url = hit.url + '?q=' + escape(words.join(" "));
82 | }
83 |
84 | return url;
85 | }
86 |
--------------------------------------------------------------------------------
/docs/link.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 |
8 |
12 |
13 |
--------------------------------------------------------------------------------
/docs/pkgdown.css:
--------------------------------------------------------------------------------
1 | /* Sticky footer */
2 |
3 | /**
4 | * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/
5 | * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css
6 | *
7 | * .Site -> body > .container
8 | * .Site-content -> body > .container .row
9 | * .footer -> footer
10 | *
11 | * Key idea seems to be to ensure that .container and __all its parents__
12 | * have height set to 100%
13 | *
14 | */
15 |
16 | html, body {
17 | height: 100%;
18 | }
19 |
20 | body {
21 | position: relative;
22 | }
23 |
24 | body > .container {
25 | display: flex;
26 | height: 100%;
27 | flex-direction: column;
28 | }
29 |
30 | body > .container .row {
31 | flex: 1 0 auto;
32 | }
33 |
34 | footer {
35 | margin-top: 45px;
36 | padding: 35px 0 36px;
37 | border-top: 1px solid #e5e5e5;
38 | color: #666;
39 | display: flex;
40 | flex-shrink: 0;
41 | }
42 | footer p {
43 | margin-bottom: 0;
44 | }
45 | footer div {
46 | flex: 1;
47 | }
48 | footer .pkgdown {
49 | text-align: right;
50 | }
51 | footer p {
52 | margin-bottom: 0;
53 | }
54 |
55 | img.icon {
56 | float: right;
57 | }
58 |
59 | /* Ensure in-page images don't run outside their container */
60 | .contents img {
61 | max-width: 100%;
62 | height: auto;
63 | }
64 |
65 | /* Fix bug in bootstrap (only seen in firefox) */
66 | summary {
67 | display: list-item;
68 | }
69 |
70 | /* Typographic tweaking ---------------------------------*/
71 |
72 | .contents .page-header {
73 | margin-top: calc(-60px + 1em);
74 | }
75 |
76 | dd {
77 | margin-left: 3em;
78 | }
79 |
80 | /* Section anchors ---------------------------------*/
81 |
82 | a.anchor {
83 | display: none;
84 | margin-left: 5px;
85 | width: 20px;
86 | height: 20px;
87 |
88 | background-image: url(./link.svg);
89 | background-repeat: no-repeat;
90 | background-size: 20px 20px;
91 | background-position: center center;
92 | }
93 |
94 | h1:hover .anchor,
95 | h2:hover .anchor,
96 | h3:hover .anchor,
97 | h4:hover .anchor,
98 | h5:hover .anchor,
99 | h6:hover .anchor {
100 | display: inline-block;
101 | }
102 |
103 | /* Fixes for fixed navbar --------------------------*/
104 |
105 | .contents h1, .contents h2, .contents h3, .contents h4 {
106 | padding-top: 60px;
107 | margin-top: -40px;
108 | }
109 |
110 | /* Navbar submenu --------------------------*/
111 |
112 | .dropdown-submenu {
113 | position: relative;
114 | }
115 |
116 | .dropdown-submenu>.dropdown-menu {
117 | top: 0;
118 | left: 100%;
119 | margin-top: -6px;
120 | margin-left: -1px;
121 | border-radius: 0 6px 6px 6px;
122 | }
123 |
124 | .dropdown-submenu:hover>.dropdown-menu {
125 | display: block;
126 | }
127 |
128 | .dropdown-submenu>a:after {
129 | display: block;
130 | content: " ";
131 | float: right;
132 | width: 0;
133 | height: 0;
134 | border-color: transparent;
135 | border-style: solid;
136 | border-width: 5px 0 5px 5px;
137 | border-left-color: #cccccc;
138 | margin-top: 5px;
139 | margin-right: -10px;
140 | }
141 |
142 | .dropdown-submenu:hover>a:after {
143 | border-left-color: #ffffff;
144 | }
145 |
146 | .dropdown-submenu.pull-left {
147 | float: none;
148 | }
149 |
150 | .dropdown-submenu.pull-left>.dropdown-menu {
151 | left: -100%;
152 | margin-left: 10px;
153 | border-radius: 6px 0 6px 6px;
154 | }
155 |
156 | /* Sidebar --------------------------*/
157 |
158 | #pkgdown-sidebar {
159 | margin-top: 30px;
160 | position: -webkit-sticky;
161 | position: sticky;
162 | top: 70px;
163 | }
164 |
165 | #pkgdown-sidebar h2 {
166 | font-size: 1.5em;
167 | margin-top: 1em;
168 | }
169 |
170 | #pkgdown-sidebar h2:first-child {
171 | margin-top: 0;
172 | }
173 |
174 | #pkgdown-sidebar .list-unstyled li {
175 | margin-bottom: 0.5em;
176 | }
177 |
178 | /* bootstrap-toc tweaks ------------------------------------------------------*/
179 |
180 | /* All levels of nav */
181 |
182 | nav[data-toggle='toc'] .nav > li > a {
183 | padding: 4px 20px 4px 6px;
184 | font-size: 1.5rem;
185 | font-weight: 400;
186 | color: inherit;
187 | }
188 |
189 | nav[data-toggle='toc'] .nav > li > a:hover,
190 | nav[data-toggle='toc'] .nav > li > a:focus {
191 | padding-left: 5px;
192 | color: inherit;
193 | border-left: 1px solid #878787;
194 | }
195 |
196 | nav[data-toggle='toc'] .nav > .active > a,
197 | nav[data-toggle='toc'] .nav > .active:hover > a,
198 | nav[data-toggle='toc'] .nav > .active:focus > a {
199 | padding-left: 5px;
200 | font-size: 1.5rem;
201 | font-weight: 400;
202 | color: inherit;
203 | border-left: 2px solid #878787;
204 | }
205 |
206 | /* Nav: second level (shown on .active) */
207 |
208 | nav[data-toggle='toc'] .nav .nav {
209 | display: none; /* Hide by default, but at >768px, show it */
210 | padding-bottom: 10px;
211 | }
212 |
213 | nav[data-toggle='toc'] .nav .nav > li > a {
214 | padding-left: 16px;
215 | font-size: 1.35rem;
216 | }
217 |
218 | nav[data-toggle='toc'] .nav .nav > li > a:hover,
219 | nav[data-toggle='toc'] .nav .nav > li > a:focus {
220 | padding-left: 15px;
221 | }
222 |
223 | nav[data-toggle='toc'] .nav .nav > .active > a,
224 | nav[data-toggle='toc'] .nav .nav > .active:hover > a,
225 | nav[data-toggle='toc'] .nav .nav > .active:focus > a {
226 | padding-left: 15px;
227 | font-weight: 500;
228 | font-size: 1.35rem;
229 | }
230 |
231 | /* orcid ------------------------------------------------------------------- */
232 |
233 | .orcid {
234 | font-size: 16px;
235 | color: #A6CE39;
236 | /* margins are required by official ORCID trademark and display guidelines */
237 | margin-left:4px;
238 | margin-right:4px;
239 | vertical-align: middle;
240 | }
241 |
242 | /* Reference index & topics ----------------------------------------------- */
243 |
244 | .ref-index th {font-weight: normal;}
245 |
246 | .ref-index td {vertical-align: top; min-width: 100px}
247 | .ref-index .icon {width: 40px;}
248 | .ref-index .alias {width: 40%;}
249 | .ref-index-icons .alias {width: calc(40% - 40px);}
250 | .ref-index .title {width: 60%;}
251 |
252 | .ref-arguments th {text-align: right; padding-right: 10px;}
253 | .ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px}
254 | .ref-arguments .name {width: 20%;}
255 | .ref-arguments .desc {width: 80%;}
256 |
257 | /* Nice scrolling for wide elements --------------------------------------- */
258 |
259 | table {
260 | display: block;
261 | overflow: auto;
262 | }
263 |
264 | /* Syntax highlighting ---------------------------------------------------- */
265 |
266 | pre, code, pre code {
267 | background-color: #f8f8f8;
268 | color: #333;
269 | }
270 | pre, pre code {
271 | white-space: pre-wrap;
272 | word-break: break-all;
273 | overflow-wrap: break-word;
274 | }
275 |
276 | pre {
277 | border: 1px solid #eee;
278 | }
279 |
280 | pre .img, pre .r-plt {
281 | margin: 5px 0;
282 | }
283 |
284 | pre .img img, pre .r-plt img {
285 | background-color: #fff;
286 | }
287 |
288 | code a, pre a {
289 | color: #375f84;
290 | }
291 |
292 | a.sourceLine:hover {
293 | text-decoration: none;
294 | }
295 |
296 | .fl {color: #1514b5;}
297 | .fu {color: #000000;} /* function */
298 | .ch,.st {color: #036a07;} /* string */
299 | .kw {color: #264D66;} /* keyword */
300 | .co {color: #888888;} /* comment */
301 |
302 | .error {font-weight: bolder;}
303 | .warning {font-weight: bolder;}
304 |
305 | /* Clipboard --------------------------*/
306 |
307 | .hasCopyButton {
308 | position: relative;
309 | }
310 |
311 | .btn-copy-ex {
312 | position: absolute;
313 | right: 0;
314 | top: 0;
315 | visibility: hidden;
316 | }
317 |
318 | .hasCopyButton:hover button.btn-copy-ex {
319 | visibility: visible;
320 | }
321 |
322 | /* headroom.js ------------------------ */
323 |
324 | .headroom {
325 | will-change: transform;
326 | transition: transform 200ms linear;
327 | }
328 | .headroom--pinned {
329 | transform: translateY(0%);
330 | }
331 | .headroom--unpinned {
332 | transform: translateY(-100%);
333 | }
334 |
335 | /* mark.js ----------------------------*/
336 |
337 | mark {
338 | background-color: rgba(255, 255, 51, 0.5);
339 | border-bottom: 2px solid rgba(255, 153, 51, 0.3);
340 | padding: 1px;
341 | }
342 |
343 | /* vertical spacing after htmlwidgets */
344 | .html-widget {
345 | margin-bottom: 10px;
346 | }
347 |
348 | /* fontawesome ------------------------ */
349 |
350 | .fab {
351 | font-family: "Font Awesome 5 Brands" !important;
352 | }
353 |
354 | /* don't display links in code chunks when printing */
355 | /* source: https://stackoverflow.com/a/10781533 */
356 | @media print {
357 | code a:link:after, code a:visited:after {
358 | content: "";
359 | }
360 | }
361 |
362 | /* Section anchors ---------------------------------
363 | Added in pandoc 2.11: https://github.com/jgm/pandoc-templates/commit/9904bf71
364 | */
365 |
366 | div.csl-bib-body { }
367 | div.csl-entry {
368 | clear: both;
369 | }
370 | .hanging-indent div.csl-entry {
371 | margin-left:2em;
372 | text-indent:-2em;
373 | }
374 | div.csl-left-margin {
375 | min-width:2em;
376 | float:left;
377 | }
378 | div.csl-right-inline {
379 | margin-left:2em;
380 | padding-left:1em;
381 | }
382 | div.csl-indent {
383 | margin-left: 2em;
384 | }
385 |
--------------------------------------------------------------------------------
/docs/pkgdown.js:
--------------------------------------------------------------------------------
1 | /* http://gregfranko.com/blog/jquery-best-practices/ */
2 | (function($) {
3 | $(function() {
4 |
5 | $('.navbar-fixed-top').headroom();
6 |
7 | $('body').css('padding-top', $('.navbar').height() + 10);
8 | $(window).resize(function(){
9 | $('body').css('padding-top', $('.navbar').height() + 10);
10 | });
11 |
12 | $('[data-toggle="tooltip"]').tooltip();
13 |
14 | var cur_path = paths(location.pathname);
15 | var links = $("#navbar ul li a");
16 | var max_length = -1;
17 | var pos = -1;
18 | for (var i = 0; i < links.length; i++) {
19 | if (links[i].getAttribute("href") === "#")
20 | continue;
21 | // Ignore external links
22 | if (links[i].host !== location.host)
23 | continue;
24 |
25 | var nav_path = paths(links[i].pathname);
26 |
27 | var length = prefix_length(nav_path, cur_path);
28 | if (length > max_length) {
29 | max_length = length;
30 | pos = i;
31 | }
32 | }
33 |
34 | // Add class to parent <li>, and enclosing <li> if in dropdown
35 | if (pos >= 0) {
36 | var menu_anchor = $(links[pos]);
37 | menu_anchor.parent().addClass("active");
38 | menu_anchor.closest("li.dropdown").addClass("active");
39 | }
40 | });
41 |
42 | function paths(pathname) {
43 | var pieces = pathname.split("/");
44 | pieces.shift(); // always starts with /
45 |
46 | var end = pieces[pieces.length - 1];
47 | if (end === "index.html" || end === "")
48 | pieces.pop();
49 | return(pieces);
50 | }
51 |
52 | // Returns -1 if not found
53 | function prefix_length(needle, haystack) {
54 | if (needle.length > haystack.length)
55 | return(-1);
56 |
57 | // Special case for length-0 haystack, since for loop won't run
58 | if (haystack.length === 0) {
59 | return(needle.length === 0 ? 0 : -1);
60 | }
61 |
62 | for (var i = 0; i < haystack.length; i++) {
63 | if (needle[i] != haystack[i])
64 | return(i);
65 | }
66 |
67 | return(haystack.length);
68 | }
69 |
70 | /* Clipboard --------------------------*/
71 |
72 | function changeTooltipMessage(element, msg) {
73 | var tooltipOriginalTitle=element.getAttribute('data-original-title');
74 | element.setAttribute('data-original-title', msg);
75 | $(element).tooltip('show');
76 | element.setAttribute('data-original-title', tooltipOriginalTitle);
77 | }
78 |
79 | if(ClipboardJS.isSupported()) {
80 | $(document).ready(function() {
81 | var copyButton = " ";
82 |
83 | $("div.sourceCode").addClass("hasCopyButton");
84 |
85 | // Insert copy buttons:
86 | $(copyButton).prependTo(".hasCopyButton");
87 |
88 | // Initialize tooltips:
89 | $('.btn-copy-ex').tooltip({container: 'body'});
90 |
91 | // Initialize clipboard:
92 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', {
93 | text: function(trigger) {
94 | return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, "");
95 | }
96 | });
97 |
98 | clipboardBtnCopies.on('success', function(e) {
99 | changeTooltipMessage(e.trigger, 'Copied!');
100 | e.clearSelection();
101 | });
102 |
103 | clipboardBtnCopies.on('error', function() {
104 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy');
105 | });
106 | });
107 | }
108 | })(window.jQuery || window.$)
109 |
--------------------------------------------------------------------------------
/docs/pkgdown.yml:
--------------------------------------------------------------------------------
1 | pandoc: 2.19.2
2 | pkgdown: 2.0.6
3 | pkgdown_sha: ~
4 | articles:
5 | csv2sqlite: csv2sqlite.html
6 | last_built: 2022-12-06T14:39Z
7 |
8 |
--------------------------------------------------------------------------------
/docs/reference/Rplot001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/privefl/bigreadr/2d8806f1067b19610a2d633bf2e863b910570d5d/docs/reference/Rplot001.png
--------------------------------------------------------------------------------
/docs/reference/big_fread1.html:
--------------------------------------------------------------------------------
1 |
2 | Read large text file — big_fread1 • bigreadr
6 |
7 |
8 |
9 |
51 |
52 |
57 |
58 |
59 |
Read large text file by splitting lines.
60 |
61 |
62 |
63 |
big_fread1 ( file , every_nlines , .transform = identity ,
64 | .combine = rbind_df , skip = 0 , ... , print_timings = TRUE )
65 |
66 |
67 |
68 |
Arguments
69 |
file
70 | Path to file that you want to read.
71 |
72 |
73 | every_nlines
74 | Maximum number of lines in new file parts.
75 |
76 |
77 | .transform
78 | Function to transform each data frame corresponding to each
79 | part of the file
. Default doesn't change anything.
80 |
81 |
82 | .combine
83 | Function to combine results (list of data frames).
84 |
85 |
86 | skip
87 | Number of lines to skip at the beginning of file
.
88 |
89 |
90 | ...
91 | Other arguments to be passed to data.table::fread ,
92 | excepted input
, file
, skip
, col.names
and showProgress
.
93 |
94 |
95 | print_timings
96 | Whether to print timings? Default is TRUE
.
97 |
98 |
99 |
100 |
Value
101 |
102 |
103 |
A data.frame
by default; a data.table
when data.table = TRUE
.
104 |
105 |
106 |
107 |
110 |
111 |
112 |
113 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
--------------------------------------------------------------------------------
/docs/reference/big_fread2.html:
--------------------------------------------------------------------------------
1 |
2 | Read large text file — big_fread2 • bigreadr
6 |
7 |
8 |
9 |
51 |
52 |
57 |
58 |
59 |
Read large text file by splitting columns.
60 |
61 |
62 |
63 |
big_fread2 ( file , nb_parts = NULL , .transform = identity ,
64 | .combine = cbind_df , skip = 0 , select = NULL , progress = FALSE ,
65 | part_size = 500 * 1024 ^ 2 , ... )
66 |
67 |
68 |
69 |
Arguments
70 |
file
71 | Path to file that you want to read.
72 |
73 |
74 | nb_parts
75 | Number of parts in which to split reading (and transforming).
76 | Parts are referring to blocks of selected columns.
77 | Default uses part_size
to set a good value.
78 |
79 |
80 | .transform
81 | Function to transform each data frame corresponding to each
82 | block of selected columns. Default doesn't change anything.
83 |
84 |
85 | .combine
86 | Function to combine results (list of data frames).
87 |
88 |
89 | skip
90 | Number of lines to skip at the beginning of file
.
91 |
92 |
93 | select
94 | Indices of columns to keep (sorted). Default keeps them all.
95 |
96 |
97 | progress
98 | Show progress? Default is FALSE
.
99 |
100 |
101 | part_size
102 | Size of the parts if nb_parts
is not supplied.
103 | Default is 500 * 1024^2
(500 MB).
104 |
105 |
106 | ...
107 | Other arguments to be passed to data.table::fread ,
108 | excepted input
, file
, skip
, select
and showProgress
.
109 |
110 |
111 |
112 |
Value
113 |
114 |
115 |
The outputs of fread2
+ .transform
, combined with .combine
.
116 |
117 |
118 |
119 |
122 |
123 |
124 |
125 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
--------------------------------------------------------------------------------
/docs/reference/bigreadr-package.html:
--------------------------------------------------------------------------------
1 |
2 | bigreadr: Read Large Text Files — bigreadr-package • bigreadr
8 |
9 |
10 |
11 |
53 |
54 |
59 |
60 |
61 |
Read large text files by splitting them in smaller files.
62 | Package 'bigreadr' also provides some convenient wrappers around fread()
63 | and fwrite() from package 'data.table'.
64 |
65 |
66 |
67 |
73 |
77 |
78 |
79 |
82 |
83 |
84 |
85 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
--------------------------------------------------------------------------------
/docs/reference/cbind_df.html:
--------------------------------------------------------------------------------
1 |
2 | Merge data frames — cbind_df • bigreadr
6 |
7 |
8 |
9 |
51 |
52 |
57 |
58 |
59 |
Merge data frames
60 |
61 |
62 |
65 |
66 |
67 |
Arguments
68 |
list_df
69 | A list of multiple data frames with the same observations in
70 | the same order.
71 |
72 |
73 |
74 |
Value
75 |
76 |
77 |
One merged data frame.
78 |
79 |
80 |
81 |
Examples
82 |
str ( iris )
83 | #> 'data.frame': 150 obs. of 5 variables:
84 | #> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
85 | #> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
86 | #> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
87 | #> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
88 | #> $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
89 | str ( cbind_df ( list ( iris , iris ) ) )
90 | #> 'data.frame': 150 obs. of 10 variables:
91 | #> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
92 | #> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
93 | #> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
94 | #> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
95 | #> $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
96 | #> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
97 | #> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
98 | #> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
99 | #> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
100 | #> $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
101 |
102 |
103 |
104 |
105 |
108 |
109 |
110 |
111 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
--------------------------------------------------------------------------------
/docs/reference/fread2.html:
--------------------------------------------------------------------------------
1 |
2 | Read text file(s) — fread2 • bigreadr
6 |
7 |
8 |
9 |
51 |
52 |
57 |
58 |
59 |
Read text file(s)
60 |
61 |
62 |
63 |
fread2 ( input , ... , data.table = FALSE ,
64 | nThread = getOption ( "bigreadr.nThread" ) )
65 |
66 |
67 |
68 |
Arguments
69 |
input
70 | Path to the file(s) that you want to read from.
71 | This can also be a command, some text or an URL.
72 | If a vector of inputs is provided, resulting data frames are appended.
73 |
74 |
75 | ...
76 | Other arguments to be passed to data.table::fread .
77 |
78 |
79 | data.table
80 | Whether to return a data.table
or just a data.frame
?
81 | Default is FALSE
(and is the opposite of data.table::fread ).
82 |
83 |
84 | nThread
85 | Number of threads to use. Default uses all threads minus one.
86 |
87 |
88 |
89 |
Value
90 |
91 |
92 |
A data.frame
by default; a data.table
when data.table = TRUE
.
93 |
94 |
95 |
96 |
Examples
97 |
tmp <- fwrite2 ( iris )
98 | iris2 <- fread2 ( tmp )
99 | all.equal ( iris2 , iris ) ## fread doesn't use factors
100 | #> [1] "Component \"Species\": Modes: character, numeric"
101 | #> [2] "Component \"Species\": Attributes: < target is NULL, current is list >"
102 | #> [3] "Component \"Species\": target is character, current is factor"
103 |
104 |
105 |
106 |
109 |
110 |
111 |
112 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
--------------------------------------------------------------------------------
/docs/reference/fwrite2.html:
--------------------------------------------------------------------------------
1 |
2 | Write a data frame to a text file — fwrite2 • bigreadr
6 |
7 |
8 |
9 |
51 |
52 |
57 |
58 |
59 |
Write a data frame to a text file
60 |
61 |
62 |
63 |
fwrite2 ( x , file = tempfile ( ) , ... , quote = FALSE ,
64 | nThread = getOption ( "bigreadr.nThread" ) )
65 |
66 |
67 |
68 |
Arguments
69 |
x
70 | Data frame to write.
71 |
72 |
73 | file
74 | Path to the file that you want to write to.
75 | Defaults uses tempfile()
.
76 |
77 |
78 | ...
79 | Other arguments to be passed to data.table::fwrite .
80 |
81 |
82 | quote
83 | Whether to quote strings (default is FALSE
).
84 |
85 |
86 | nThread
87 | Number of threads to use. Default uses all threads minus one.
88 |
89 |
90 |
91 |
Value
92 |
93 |
94 | Input parameter file, invisibly.
95 |
96 |
97 |
98 |
Examples
99 |
tmp <- fwrite2 ( iris )
100 | iris2 <- fread2 ( tmp )
101 | all.equal ( iris2 , iris ) ## fread doesn't use factors
102 | #> [1] "Component \"Species\": Modes: character, numeric"
103 | #> [2] "Component \"Species\": Attributes: < target is NULL, current is list >"
104 | #> [3] "Component \"Species\": target is character, current is factor"
105 |
106 |
107 |
108 |
111 |
112 |
113 |
114 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
--------------------------------------------------------------------------------
/docs/reference/index.html:
--------------------------------------------------------------------------------
1 |
2 | Function reference • bigreadr
6 |
7 |
8 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
--------------------------------------------------------------------------------
/docs/reference/nlines.html:
--------------------------------------------------------------------------------
1 |
2 | Number of lines — nlines • bigreadr
6 |
7 |
8 |
9 |
51 |
52 |
57 |
58 |
59 |
Get the number of lines of a file.
60 |
61 |
62 |
65 |
66 |
67 |
Arguments
68 |
file
69 | Path of the file.
70 |
71 |
72 |
73 |
Value
74 |
75 |
76 |
The number of lines as one integer.
77 |
78 |
79 |
80 |
Examples
81 |
tmp <- fwrite2 ( iris )
82 | nlines ( tmp )
83 | #> [1] 151
84 |
85 |
86 |
87 |
88 |
91 |
92 |
93 |
94 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
--------------------------------------------------------------------------------
/docs/reference/rbind_df.html:
--------------------------------------------------------------------------------
1 |
2 | Merge data frames — rbind_df • bigreadr
6 |
7 |
8 |
9 |
51 |
52 |
57 |
58 |
59 |
Merge data frames
60 |
61 |
62 |
65 |
66 |
67 |
Arguments
68 |
list_df
69 | A list of multiple data frames with the same variables in the
70 | same order.
71 |
72 |
73 |
74 |
Value
75 |
76 |
77 |
One merged data frame with the names of the first input data frame.
78 |
79 |
80 |
81 |
Examples
82 |
str ( iris )
83 | #> 'data.frame': 150 obs. of 5 variables:
84 | #> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
85 | #> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
86 | #> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
87 | #> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
88 | #> $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
89 | str ( rbind_df ( list ( iris , iris ) ) )
90 | #> 'data.frame': 300 obs. of 5 variables:
91 | #> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
92 | #> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
93 | #> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
94 | #> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
95 | #> $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
96 |
97 |
98 |
99 |
100 |
103 |
104 |
105 |
106 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
--------------------------------------------------------------------------------
/docs/reference/split_file.html:
--------------------------------------------------------------------------------
1 |
2 | Split file every nlines — split_file • bigreadr
7 |
8 |
9 |
10 |
52 |
53 |
58 |
59 |
60 |
Split file every nlines
61 |
Get files from splitting.
62 |
63 |
64 |
65 |
split_file ( file , every_nlines , prefix_out = tempfile ( ) ,
66 | repeat_header = FALSE )
67 |
68 | get_split_files ( split_file_out )
69 |
70 |
71 |
72 |
Arguments
73 |
file
74 | Path to file that you want to split.
75 |
76 |
77 | every_nlines
78 | Maximum number of lines in new file parts.
79 |
80 |
81 | prefix_out
82 | Prefix for created files. Default uses tempfile().
83 |
84 |
85 | repeat_header
86 | Whether to repeat the header row in each file.
87 | Default is FALSE.
88 |
89 |
90 | split_file_out
91 | Output of split_file.
92 |
93 |
94 |
95 |
Value
96 |
97 |
98 |
A list with name_in: input parameter file,
99 | prefix_out: input parameter prefix_out,
100 | nfiles: Number of files (parts) created,
101 | nlines_part: input parameter every_nlines,
102 | nlines_all: total number of lines of file.
103 | Vector of file paths created by split_file.
104 |
105 |
106 |
107 |
Examples
108 |
tmp <- fwrite2 ( iris )
109 | infos <- split_file ( tmp , 100 )
110 | str ( infos )
111 | #> List of 6
112 | #> $ name_in : chr "C:\\Users\\au639593\\AppData\\Local\\Temp\\Rtmpq2HStE\\file40f821d7102d"
113 | #> $ prefix_out : chr "C:\\Users\\au639593\\AppData\\Local\\Temp\\Rtmpq2HStE\\file40f855f46bc3"
114 | #> $ nfiles : int 2
115 | #> $ nlines_part : int 100
116 | #> $ nlines_all : num 151
117 | #> $ repeat_header: logi FALSE
118 | get_split_files ( infos )
119 | #> [1] "C:\\Users\\au639593\\AppData\\Local\\Temp\\Rtmpq2HStE\\file40f855f46bc3_1.txt"
120 | #> [2] "C:\\Users\\au639593\\AppData\\Local\\Temp\\Rtmpq2HStE\\file40f855f46bc3_2.txt"
121 |
122 |
123 |
124 |
127 |
128 |
129 |
130 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
--------------------------------------------------------------------------------
/docs/sitemap.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | /404.html
5 |
6 |
7 | /articles/csv2sqlite.html
8 |
9 |
10 | /articles/index.html
11 |
12 |
13 | /authors.html
14 |
15 |
16 | /index.html
17 |
18 |
19 | /reference/bigreadr-package.html
20 |
21 |
22 | /reference/big_fread1.html
23 |
24 |
25 | /reference/big_fread2.html
26 |
27 |
28 | /reference/cbind_df.html
29 |
30 |
31 | /reference/fread2.html
32 |
33 |
34 | /reference/fwrite2.html
35 |
36 |
37 | /reference/index.html
38 |
39 |
40 | /reference/nlines.html
41 |
42 |
43 | /reference/rbind_df.html
44 |
45 |
46 | /reference/split_file.html
47 |
48 |
49 |
--------------------------------------------------------------------------------
/inst/WORDLIST:
--------------------------------------------------------------------------------
1 | Filebacked
2 | Florian
3 | fpeek
4 | fread
5 | fwrite
6 | nlines
8 | Privé
9 |
--------------------------------------------------------------------------------
/inst/testdata/cars_with_newline.csv:
--------------------------------------------------------------------------------
1 | speed,dist
2 | 4,2
3 | 4,10
4 | 7,4
5 | 7,22
6 | 8,16
7 | 9,10
8 | 10,18
9 | 10,26
10 | 10,34
11 | 11,17
12 | 11,28
13 | 12,14
14 | 12,20
15 | 12,24
16 | 12,28
17 | 13,26
18 | 13,34
19 | 13,34
20 | 13,46
21 | 14,26
22 | 14,36
23 | 14,60
24 | 14,80
25 | 15,20
26 | 15,26
27 | 15,54
28 | 16,32
29 | 16,40
30 | 17,32
31 | 17,40
32 | 17,50
33 | 18,42
34 | 18,56
35 | 18,76
36 | 18,84
37 | 19,36
38 | 19,46
39 | 19,68
40 | 20,32
41 | 20,48
42 | 20,52
43 | 20,56
44 | 20,64
45 | 22,66
46 | 23,54
47 | 24,70
48 | 24,92
49 | 24,93
50 | 24,120
51 | 25,85
52 |
--------------------------------------------------------------------------------
/inst/testdata/cars_without_newline.csv:
--------------------------------------------------------------------------------
1 | speed,dist
2 | 4,2
3 | 4,10
4 | 7,4
5 | 7,22
6 | 8,16
7 | 9,10
8 | 10,18
9 | 10,26
10 | 10,34
11 | 11,17
12 | 11,28
13 | 12,14
14 | 12,20
15 | 12,24
16 | 12,28
17 | 13,26
18 | 13,34
19 | 13,34
20 | 13,46
21 | 14,26
22 | 14,36
23 | 14,60
24 | 14,80
25 | 15,20
26 | 15,26
27 | 15,54
28 | 16,32
29 | 16,40
30 | 17,32
31 | 17,40
32 | 17,50
33 | 18,42
34 | 18,56
35 | 18,76
36 | 18,84
37 | 19,36
38 | 19,46
39 | 19,68
40 | 20,32
41 | 20,48
42 | 20,52
43 | 20,56
44 | 20,64
45 | 22,66
46 | 23,54
47 | 24,70
48 | 24,92
49 | 24,93
50 | 24,120
51 | 25,85
--------------------------------------------------------------------------------
/inst/testdata/wrong_string.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/privefl/bigreadr/2d8806f1067b19610a2d633bf2e863b910570d5d/inst/testdata/wrong_string.rds
--------------------------------------------------------------------------------
/man/big_fread1.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/read.R
3 | \name{big_fread1}
4 | \alias{big_fread1}
5 | \title{Read large text file}
6 | \usage{
7 | big_fread1(file, every_nlines, .transform = identity,
8 | .combine = rbind_df, skip = 0, ..., print_timings = TRUE)
9 | }
10 | \arguments{
11 | \item{file}{Path to file that you want to read.}
12 |
13 | \item{every_nlines}{Maximum number of lines in new file parts.}
14 |
15 | \item{.transform}{Function to transform each data frame corresponding to each
16 | part of the \code{file}. Default doesn't change anything.}
17 |
18 | \item{.combine}{Function to combine results (list of data frames).}
19 |
20 | \item{skip}{Number of lines to skip at the beginning of \code{file}.}
21 |
22 | \item{...}{Other arguments to be passed to \link[data.table:fread]{data.table::fread},
23 | except \code{input}, \code{file}, \code{skip}, \code{col.names} and \code{showProgress}.}
24 |
25 | \item{print_timings}{Whether to print timings? Default is \code{TRUE}.}
26 | }
27 | \value{
28 | A \code{data.frame} by default; a \code{data.table} when \code{data.table = TRUE}.
29 | }
30 | \description{
31 | Read large text file by splitting lines.
32 | }
33 |
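A minimal usage sketch of big_fread1() (not shipped in this .Rd): it reuses the iris-to-tempfile pattern from the other man pages as a stand-in for a genuinely large file, and filters each chunk of rows before recombining them.

    library(bigreadr)
    csv <- fwrite2(iris)  # stand-in for a large text file
    big_iris <- big_fread1(csv, every_nlines = 50,
                           .transform = function(df) df[df$Sepal.Length > 5, ],
                           .combine = rbind_df,
                           print_timings = FALSE)
    nrow(big_iris)  # only the rows kept by .transform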
--------------------------------------------------------------------------------
/man/big_fread2.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/read.R
3 | \name{big_fread2}
4 | \alias{big_fread2}
5 | \title{Read large text file}
6 | \usage{
7 | big_fread2(file, nb_parts = NULL, .transform = identity,
8 | .combine = cbind_df, skip = 0, select = NULL, progress = FALSE,
9 | part_size = 500 * 1024^2, ...)
10 | }
11 | \arguments{
12 | \item{file}{Path to file that you want to read.}
13 |
14 | \item{nb_parts}{Number of parts in which to split reading (and transforming).
15 | Parts are referring to blocks of selected columns.
16 | Default uses \code{part_size} to set a good value.}
17 |
18 | \item{.transform}{Function to transform each data frame corresponding to each
19 | block of selected columns. Default doesn't change anything.}
20 |
21 | \item{.combine}{Function to combine results (list of data frames).}
22 |
23 | \item{skip}{Number of lines to skip at the beginning of \code{file}.}
24 |
25 | \item{select}{Indices of columns to keep (sorted). Default keeps them all.}
26 |
27 | \item{progress}{Show progress? Default is \code{FALSE}.}
28 |
29 | \item{part_size}{Size of the parts if \code{nb_parts} is not supplied.
30 | Default is \code{500 * 1024^2} (500 MB).}
31 |
32 | \item{...}{Other arguments to be passed to \link[data.table:fread]{data.table::fread},
33 | except \code{input}, \code{file}, \code{skip}, \code{select} and \code{showProgress}.}
34 | }
35 | \value{
36 | The outputs of \code{fread2} + \code{.transform}, combined with \code{.combine}.
37 | }
38 | \description{
39 | Read large text file by splitting columns.
40 | }
41 |
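A minimal usage sketch of big_fread2() (not shipped in this .Rd): it reads the first four (numeric) columns in two column blocks, rounds each block, and binds the blocks back together with cbind_df.

    library(bigreadr)
    csv <- fwrite2(iris)  # stand-in for a wide text file
    iris_round <- big_fread2(csv, nb_parts = 2, select = 1:4,
                             .transform = function(df) round(df, 1),
                             .combine = cbind_df)
    str(iris_round)  # 150 obs. of the 4 selected (rounded) variables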
--------------------------------------------------------------------------------
/man/bigreadr-package.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/bigreadr-package.R
3 | \docType{package}
4 | \name{bigreadr-package}
5 | \alias{bigreadr}
6 | \alias{bigreadr-package}
7 | \title{bigreadr: Read Large Text Files}
8 | \description{
9 | Read large text files by splitting them in smaller files.
10 | Package 'bigreadr' also provides some convenient wrappers around fread()
11 | and fwrite() from package 'data.table'.
12 | }
13 | \seealso{
14 | Useful links:
15 | \itemize{
16 | \item \url{https://github.com/privefl/bigreadr}
17 | \item Report bugs at \url{https://github.com/privefl/bigreadr/issues}
18 | }
19 |
20 | }
21 | \author{
22 | \strong{Maintainer}: Florian Privé \email{florian.prive.21@gmail.com}
23 |
24 | }
25 | \keyword{internal}
26 |
--------------------------------------------------------------------------------
/man/cbind_df.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/bind.R
3 | \name{cbind_df}
4 | \alias{cbind_df}
5 | \title{Merge data frames}
6 | \usage{
7 | cbind_df(list_df)
8 | }
9 | \arguments{
10 | \item{list_df}{A list of multiple data frames with the same observations in
11 | the same order.}
12 | }
13 | \value{
14 | One merged data frame.
15 | }
16 | \description{
17 | Merge data frames
18 | }
19 | \examples{
20 | str(iris)
21 | str(cbind_df(list(iris, iris)))
22 |
23 | }
24 |
--------------------------------------------------------------------------------
/man/fread2.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/read.R
3 | \name{fread2}
4 | \alias{fread2}
5 | \title{Read text file(s)}
6 | \usage{
7 | fread2(input, ..., data.table = FALSE,
8 | nThread = getOption("bigreadr.nThread"))
9 | }
10 | \arguments{
11 | \item{input}{Path to the file(s) that you want to read from.
12 | This can also be a command, some text, or a URL.
13 | If a vector of inputs is provided, resulting data frames are appended.}
14 |
15 | \item{...}{Other arguments to be passed to \link[data.table:fread]{data.table::fread}.}
16 |
17 | \item{data.table}{Whether to return a \code{data.table} or just a \code{data.frame}?
18 | Default is \code{FALSE} (and is the opposite of \link[data.table:fread]{data.table::fread}).}
19 |
20 | \item{nThread}{Number of threads to use. Default uses all threads minus one.}
21 | }
22 | \value{
23 | A \code{data.frame} by default; a \code{data.table} when \code{data.table = TRUE}.
24 | }
25 | \description{
26 | Read text file(s)
27 | }
28 | \examples{
29 | tmp <- fwrite2(iris)
30 | iris2 <- fread2(tmp)
31 | all.equal(iris2, iris) ## fread doesn't use factors
32 | }
33 |
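A short sketch (beyond the example above) of the vector-of-inputs behaviour mentioned in the description: the data frames read from each input are appended.

    csv <- fwrite2(iris)
    iris3 <- fread2(rep(csv, 3))  # three inputs -> rows are appended
    nrow(iris3)                   # 450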
--------------------------------------------------------------------------------
/man/fwrite2.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/read.R
3 | \name{fwrite2}
4 | \alias{fwrite2}
5 | \title{Write a data frame to a text file}
6 | \usage{
7 | fwrite2(x, file = tempfile(), ..., quote = FALSE,
8 | nThread = getOption("bigreadr.nThread"))
9 | }
10 | \arguments{
11 | \item{x}{Data frame to write.}
12 |
13 | \item{file}{Path to the file that you want to write to.
14 | Default uses \code{tempfile()}.}
15 |
16 | \item{...}{Other arguments to be passed to \link[data.table:fwrite]{data.table::fwrite}.}
17 |
18 | \item{quote}{Whether to quote strings (default is \code{FALSE}).}
19 |
20 | \item{nThread}{Number of threads to use. Default uses all threads minus one.}
21 | }
22 | \value{
23 | Input parameter \code{file}, invisibly.
24 | }
25 | \description{
26 | Write a data frame to a text file
27 | }
28 | \examples{
29 | tmp <- fwrite2(iris)
30 | iris2 <- fread2(tmp)
31 | all.equal(iris2, iris) ## fread doesn't use factors
32 | }
33 |
--------------------------------------------------------------------------------
/man/nlines.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/nlines-split.R
3 | \name{nlines}
4 | \alias{nlines}
5 | \title{Number of lines}
6 | \usage{
7 | nlines(file)
8 | }
9 | \arguments{
10 | \item{file}{Path of the file.}
11 | }
12 | \value{
13 | The number of lines as one integer.
14 | }
15 | \description{
16 | Get the number of lines of a file.
17 | }
18 | \examples{
19 | tmp <- fwrite2(iris)
20 | nlines(tmp)
21 |
22 | }
23 |
--------------------------------------------------------------------------------
/man/rbind_df.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/bind.R
3 | \name{rbind_df}
4 | \alias{rbind_df}
5 | \title{Merge data frames}
6 | \usage{
7 | rbind_df(list_df)
8 | }
9 | \arguments{
10 | \item{list_df}{A list of multiple data frames with the same variables in the
11 | same order.}
12 | }
13 | \value{
14 | One merged data frame with the names of the first input data frame.
15 | }
16 | \description{
17 | Merge data frames
18 | }
19 | \examples{
20 | str(iris)
21 | str(rbind_df(list(iris, iris)))
22 |
23 | }
24 |
--------------------------------------------------------------------------------
/man/split_file.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/nlines-split.R
3 | \name{split_file}
4 | \alias{split_file}
5 | \alias{get_split_files}
6 | \title{Split file every nlines}
7 | \usage{
8 | split_file(file, every_nlines, prefix_out = tempfile(),
9 | repeat_header = FALSE)
10 |
11 | get_split_files(split_file_out)
12 | }
13 | \arguments{
14 | \item{file}{Path to file that you want to split.}
15 |
16 | \item{every_nlines}{Maximum number of lines in new file parts.}
17 |
18 | \item{prefix_out}{Prefix for created files. Default uses \code{tempfile()}.}
19 |
20 | \item{repeat_header}{Whether to repeat the header row in each file.
21 | Default is \code{FALSE}.}
22 |
23 | \item{split_file_out}{Output of \link{split_file}.}
24 | }
25 | \value{
26 | A list with
27 | \itemize{
28 | \item \code{name_in}: input parameter \code{file},
29 | \item \code{prefix_out}: input parameter \code{prefix_out},
30 | \item \code{nfiles}: Number of files (parts) created,
31 | \item \code{nlines_part}: input parameter \code{every_nlines},
32 | \item \code{nlines_all}: total number of lines of \code{file}.
33 | }
34 |
35 | Vector of file paths created by \link{split_file}.
36 | }
37 | \description{
38 | Split file every nlines
39 |
40 | Get files from splitting.
41 | }
42 | \examples{
43 | tmp <- fwrite2(iris)
44 | infos <- split_file(tmp, 100)
45 | str(infos)
46 | get_split_files(infos)
47 | }
48 |
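A short sketch (beyond the example above) of repeat_header = TRUE: each part then starts with the original header line, so the parts can be read back independently and recombined.

    tmp <- fwrite2(iris)
    infos <- split_file(tmp, every_nlines = 50, repeat_header = TRUE)
    parts <- get_split_files(infos)
    all.equal(rbind_df(lapply(parts, fread2)), fread2(tmp))  # TRUE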
--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 |
--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 |
4 | #include <Rcpp.h>
5 |
6 | using namespace Rcpp;
7 |
8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
9 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
10 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
11 | #endif
12 |
13 | // nlines_cpp
14 | double nlines_cpp(std::string file);
15 | RcppExport SEXP _bigreadr_nlines_cpp(SEXP fileSEXP) {
16 | BEGIN_RCPP
17 | Rcpp::RObject rcpp_result_gen;
18 | Rcpp::RNGScope rcpp_rngScope_gen;
19 | Rcpp::traits::input_parameter< std::string >::type file(fileSEXP);
20 | rcpp_result_gen = Rcpp::wrap(nlines_cpp(file));
21 | return rcpp_result_gen;
22 | END_RCPP
23 | }
24 | // split_every_nlines
25 | List split_every_nlines(std::string name_in, std::string prefix_out, int every_nlines, bool repeat_header);
26 | RcppExport SEXP _bigreadr_split_every_nlines(SEXP name_inSEXP, SEXP prefix_outSEXP, SEXP every_nlinesSEXP, SEXP repeat_headerSEXP) {
27 | BEGIN_RCPP
28 | Rcpp::RObject rcpp_result_gen;
29 | Rcpp::RNGScope rcpp_rngScope_gen;
30 | Rcpp::traits::input_parameter< std::string >::type name_in(name_inSEXP);
31 | Rcpp::traits::input_parameter< std::string >::type prefix_out(prefix_outSEXP);
32 | Rcpp::traits::input_parameter< int >::type every_nlines(every_nlinesSEXP);
33 | Rcpp::traits::input_parameter< bool >::type repeat_header(repeat_headerSEXP);
34 | rcpp_result_gen = Rcpp::wrap(split_every_nlines(name_in, prefix_out, every_nlines, repeat_header));
35 | return rcpp_result_gen;
36 | END_RCPP
37 | }
38 |
39 | static const R_CallMethodDef CallEntries[] = {
40 | {"_bigreadr_nlines_cpp", (DL_FUNC) &_bigreadr_nlines_cpp, 1},
41 | {"_bigreadr_split_every_nlines", (DL_FUNC) &_bigreadr_split_every_nlines, 4},
42 | {NULL, NULL, 0}
43 | };
44 |
45 | RcppExport void R_init_bigreadr(DllInfo *dll) {
46 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
47 | R_useDynamicSymbols(dll, FALSE);
48 | }
49 |
--------------------------------------------------------------------------------
/src/nlines-split.cpp:
--------------------------------------------------------------------------------
1 | /******************************************************************************/
2 |
3 | #include <Rcpp.h>
4 | using namespace Rcpp;
5 |
6 | #define INIT_SIZE 64
7 |
8 | /******************************************************************************/
9 |
10 | char * fgets_full_line(char * str, FILE * stream, size_t * p_size) { // read one full line, doubling the buffer and re-reading until the whole line fits; returns NULL at EOF
11 |
12 | while (true) {
13 |
14 | str = fgets(str, *p_size, stream);
15 | if (str == NULL) return NULL;
16 | // Rcout << *p_size << " -> " << (str[strlen(str) - 1] == '\n') << std::endl;
17 |
18 | if (feof(stream) || (str[strlen(str) - 1] == '\n')) { // reached EOF or EOL
19 |
20 | // Rcout << strlen(str) << " / " << (str[strlen(str) - 1] == '\n') << std::endl;
21 | return str;
22 |
23 | } else { // increase size of str and try again
24 |
25 | fseek(stream , 1 - *p_size, SEEK_CUR);
26 | *p_size *= 2;
27 |
28 | delete [] str;
29 | str = new char[*p_size];
30 |
31 | }
32 | }
33 | }
34 |
35 | /******************************************************************************/
36 |
37 | // [[Rcpp::export]]
38 | double nlines_cpp(std::string file) {
39 |
40 | FILE *fp_in = fopen(file.c_str(), "r");
41 | if (fp_in == NULL) Rcpp::stop("Error while opening file '%s'.", file);
42 |
43 | size_t size = INIT_SIZE;
44 |
45 | char *line = new char[size];
46 | size_t nline_all = 0;
47 |
48 | while (!feof(fp_in)) {
49 |
50 | line = fgets_full_line(line, fp_in, &size);
51 |
52 | if (ferror(fp_in)) {
53 | delete [] line;
54 | Rcpp::stop("Error while reading file '%s'.", file);
55 | }
56 |
57 | if (line != NULL) nline_all++;
58 | }
59 |
60 | fclose(fp_in);
61 | delete [] line;
62 |
63 | return nline_all;
64 | }
65 |
66 | /******************************************************************************/
67 |
68 | // [[Rcpp::export]]
69 | List split_every_nlines(std::string name_in,
70 | std::string prefix_out,
71 | int every_nlines,
72 | bool repeat_header) {
73 |
74 | FILE *fp_in = fopen(name_in.c_str(), "r"), *fp_out;
75 | if (fp_in == NULL)
76 | Rcpp::stop("Error while opening file '%s'.", name_in);
77 |
78 | const char *fn_out = prefix_out.c_str();
79 | size_t max_len = strlen(fn_out) + 20;
80 | char *name_out = new char[max_len];
81 |
82 | size_t size = INIT_SIZE;
83 |
84 | char *line = new char[size];
85 |
86 | // read header once and store it
87 | line = fgets_full_line(line, fp_in, &size);
88 | char *head = new char[size];
89 | strcpy(head, line);
90 | rewind(fp_in);
91 |
92 | bool not_eof = true, header_added = false;
93 | int nfile = 0;
94 | size_t nline_all = 0;
95 |
96 | while (not_eof) {
97 |
98 | // Open file number 'nfile'
99 | snprintf(name_out, max_len, "%s_%d.txt", fn_out, ++nfile);
100 | fp_out = fopen(name_out, "w");
101 |
102 | // Fill it with 'every_nlines' lines
103 | int nline_file = 0;
104 | while (nline_file < every_nlines) {
105 |
106 | if ( (line = fgets_full_line(line, fp_in, &size)) == NULL ) {
107 | not_eof = false;
108 | break;
109 | }
110 |
111 | if (repeat_header && (nline_file == 0) && (nfile > 1)) {
112 | fputs(head, fp_out);
113 | header_added = true;
114 | }
115 |
116 | fputs(line, fp_out);
117 | nline_file++;
118 | }
119 |
120 | // Close file number 'nfile'
121 | fflush(fp_out);
122 | fclose(fp_out);
123 | if (nline_file == 0) {
124 | // nothing has been written because of EOF -> remove file
125 | remove(name_out);
126 | nfile--;
127 | } else {
128 | nline_all += nline_file + header_added;
129 | }
130 | }
131 |
132 | fclose(fp_in);
133 |
134 | delete[] name_out;
135 | delete[] line;
136 | delete[] head;
137 |
138 | return List::create(
139 | _["name_in"] = name_in,
140 | _["prefix_out"] = prefix_out,
141 | _["nfiles"] = nfile,
142 | _["nlines_part"] = every_nlines,
143 | _["nlines_all"] = nline_all,
144 | _["repeat_header"] = repeat_header
145 | );
146 | }
147 |
148 | /******************************************************************************/
149 |
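An R-level check of what split_every_nlines() reports through its exported wrapper split_file() (not part of this source file): as in the `nline_all += nline_file + header_added` line above, nlines_all also counts the header lines written again when repeat_header is used.

    tmp <- bigreadr::fwrite2(iris)                  # 151 lines: header + 150 rows
    bigreadr::split_file(tmp, 100)[["nlines_all"]]  # 151
    bigreadr::split_file(tmp, 100, repeat_header = TRUE)[["nlines_all"]]  # 152 (header repeated in part 2)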
--------------------------------------------------------------------------------
/tests/spelling.R:
--------------------------------------------------------------------------------
1 | spelling::spell_check_test(vignettes = TRUE, error = FALSE)
2 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(bigreadr)
3 |
4 | test_check("bigreadr")
5 |
--------------------------------------------------------------------------------
/tests/testthat/test-bind.R:
--------------------------------------------------------------------------------
1 | ################################################################################
2 |
3 | context("test-bind.R")
4 |
5 | ################################################################################
6 |
7 | test_that("'cbind_df' works", {
8 |
9 | # No copies with 'cbind.data.frame'
10 | iris$Species <- as.character(iris$Species)
11 | addr <- sapply(iris, data.table::address)
12 | iris2 <- cbind_df(list(iris, iris))
13 | expect_identical(sapply(iris2, data.table::address), c(addr, addr))
14 |
15 | # Data frame with factors
16 | df <- datasets::iris
17 | df2 <- cbind_df(list(df))
18 | expect_identical(df2, df)
19 | df3 <- cbind_df(list(df, df, df))
20 | expect_equal(dim(df3), c(150, 15))
21 | expect_identical(class(df3), "data.frame")
22 |
23 | # Data table
24 | dt <- data.table::as.data.table(df)
25 | dt2 <- cbind_df(list(dt))
26 | expect_identical(class(dt2), c("data.table", "data.frame"))
27 | expect_identical(dt2, dt)
28 | dt3 <- cbind_df(list(dt, dt, dt))
29 | expect_equal(dim(dt3), c(150, 15))
30 | expect_identical(class(dt3), c("data.table", "data.frame"))
31 |
32 | # Data frame without factors
33 | df$Species <- as.character(df$Species)
34 | df2 <- cbind_df(list(df))
35 | expect_identical(df2, df)
36 | df3 <- cbind_df(list(df, df, df))
37 | expect_equal(dim(df3), c(150, 15))
38 | expect_identical(class(df3), "data.frame")
39 | })
40 |
41 | ################################################################################
42 |
43 | test_that("'rbind_df' works", {
44 |
45 | # Data frame with factors
46 | df <- datasets::iris
47 | df2 <- rbind_df(list(df))
48 | expect_identical(df2, df)
49 | df3 <- rbind_df(list(df, df, df))
50 | expect_equal(dim(df3), c(450, 5))
51 | expect_identical(class(df3), "data.frame")
52 |
53 | # Data table
54 | dt <- data.table::as.data.table(df)
55 | dt2 <- rbind_df(list(dt))
56 | expect_identical(class(dt2), c("data.table", "data.frame"))
57 | expect_identical(dt2, dt)
58 | dt3 <- rbind_df(list(dt, dt, dt))
59 | expect_equal(dim(dt3), c(450, 5))
60 | expect_identical(class(dt3), c("data.table", "data.frame"))
61 |
62 | # Data frame without factors
63 | df$Species <- as.character(df$Species)
64 | df2 <- rbind_df(list(df))
65 | expect_identical(df2, df)
66 | df3 <- rbind_df(list(df, df, df))
67 | expect_equal(dim(df3), c(450, 5))
68 | expect_identical(class(df3), "data.frame")
69 |
70 | # Error
71 | expect_error(rbind_df(list(as.matrix(iris), iris)),
72 | "'list_df' should contain data tables or data frames.", fixed = TRUE)
73 | })
74 |
75 | ################################################################################
76 |
--------------------------------------------------------------------------------
/tests/testthat/test-nlines.R:
--------------------------------------------------------------------------------
1 | ################################################################################
2 |
3 | context("test-nlines.R")
4 |
5 | ################################################################################
6 |
7 | test_that("'nlines()' works", {
8 |
9 | expect_error(nlines("does_not_exist.txt"))
10 |
11 | strings <- readRDS(system.file("testdata", "wrong_string.rds", package = "bigreadr"))
12 | writeLines(strings, tmp <- tempfile())
13 | expect_equal(nlines(tmp), 24)
14 |
15 | strings <- c("", "", " ", sapply(10^(seq(0, 4, by = 0.2)), function(i) {
16 | paste(as.matrix(iris)[sample(nrow(iris), i, TRUE), ], collapse = " ")
17 | }))
18 | replicate(100, {
19 | writeLines(sample(strings, replace = TRUE), tmp <- tempfile())
20 | expect_equal(nlines(tmp), length(readLines(tmp)))
21 | })
22 | })
23 |
24 | ################################################################################
25 |
26 | test_that("'nlines()' works with or without newline", {
27 | csv1 <- system.file("testdata", "cars_with_newline.csv", package = "bigreadr")
28 | expect_identical(nlines(csv1), 51)
29 | csv2 <- system.file("testdata", "cars_without_newline.csv", package = "bigreadr")
30 | expect_identical(nlines(csv2), 51)
31 | })
32 |
33 | ################################################################################
34 |
--------------------------------------------------------------------------------
/tests/testthat/test-read.R:
--------------------------------------------------------------------------------
1 | ################################################################################
2 |
3 | context("test-read.R")
4 |
5 | iris$Species <- as.character(iris$Species)
6 | csv <- fwrite2(iris, tempfile(fileext = ".csv"))
7 |
8 | ################################################################################
9 |
10 | test_that("'fread2' changes default", {
11 | no_dt <- fread2(csv)
12 | expect_equal(no_dt, iris)
13 | expect_s3_class(no_dt, "data.frame")
14 | expect_failure(expect_s3_class(no_dt, "data.table"))
15 | expect_s3_class(fread2(csv, data.table = TRUE), "data.table")
16 | })
17 |
18 | test_that("'fread2' works with multiple files", {
19 | csv2 <- rep(csv, 3)
20 | no_dt <- fread2(csv2)
21 | expect_equal(no_dt, rbind(iris, iris, iris))
22 | expect_s3_class(no_dt, "data.frame")
23 | expect_failure(expect_s3_class(no_dt, "data.table"))
24 | expect_s3_class(fread2(csv2, data.table = TRUE), "data.table")
25 |
26 | expect_equal(dim(fread2(csv2, nrows = 5)), c(15, 5))
27 | expect_equal(dim(fread2(csv2, select = "Species")), c(450, 1))
28 | })
29 |
30 | test_that("'fread2' can use different types of input", {
31 |
32 | cmd <- sprintf("grep -v setosa %s", fwrite2(datasets::iris))
33 | expect_equal(fread2(cmd), data.table::fread(cmd, data.table = FALSE))
34 |
35 | url <- "https://raw.githubusercontent.com/privefl/bigsnpr/master/inst/extdata/example.fam"
36 | expect_equal(fread2(url), data.table::fread(url, data.table = FALSE))
37 |
38 | text <- paste(readLines(url), collapse = "\n")
39 | expect_equal(fread2(text), data.table::fread(url, data.table = FALSE))
40 | })
41 |
42 | ################################################################################
43 |
44 | test_that("'big_fread1' works", {
45 |
46 | iris1 <- big_fread1(file = csv, 50, print_timings = FALSE)
47 | expect_equal(iris1, iris)
48 |
49 | expect_warning(
50 | iris1 <- big_fread1(file = csv, 50, print_timings = FALSE,
51 | .combine = function() stop("ERROR")),
52 | "Combining failed.")
53 | expect_length(iris1, 4)
54 | expect_equal(rbind_df(iris1), iris)
55 |
56 | iris2 <- big_fread1(file = csv, 250, print_timings = FALSE)
57 | expect_equal(iris2, iris)
58 |
59 | ind3 <- 1:4
60 | iris3 <- big_fread1(file = csv, 7, select = ind3, skip = 1, print_timings = FALSE)
61 | expect_equal(iris3, iris[ind3], check.attributes = FALSE)
62 | expect_identical(names(iris3), paste0("V", ind3))
63 |
64 | iris4 <- big_fread1(file = csv, 50, print_timings = FALSE,
65 | .transform = function(df) subset(df, Species == "virginica"))
66 | expect_equal(iris4, subset(iris, Species == "virginica"), check.attributes = FALSE)
67 |
68 | expect_message(big_fread1(file = csv, 50, print_timings = TRUE), "seconds")
69 | })
70 |
71 | ################################################################################
72 |
73 | test_that("'big_fread2' works", {
74 |
75 | for (nb_parts in 1:7) {
76 |
77 | iris1 <- big_fread2(file = csv, nb_parts)
78 | expect_equal(iris1, iris)
79 |
80 | expect_warning(
81 | iris1 <- big_fread2(file = csv, nb_parts,
82 | .combine = function() stop("ERROR")),
83 | "Combining failed.")
84 | expect_length(iris1, min(nb_parts, ncol(iris)))
85 | expect_equal(cbind_df(iris1), iris)
86 |
87 | ind2 <- 1
88 | iris2 <- big_fread2(file = csv, nb_parts, select = ind2, skip = 0)
89 | expect_equal(iris2, iris[ind2])
90 |
91 | ind3 <- 1:4
92 | iris3 <- big_fread2(file = csv, nb_parts, select = ind3, skip = 1)
93 | expect_equal(iris3, iris[ind3], check.attributes = FALSE)
94 | expect_identical(names(iris3), paste0("V", ind3))
95 |
96 | expect_error(big_fread2(file = csv, nb_parts, select = c(4, 1:3), skip = 0),
97 | "Argument 'select' should be sorted.", fixed = TRUE)
98 | }
99 | })
100 |
101 | ################################################################################
102 |
103 | test_that("Same column accessor", {
104 | iris_dt <- data.table::as.data.table(iris)
105 | expect_equal(iris[, 1:3], as.data.frame(iris_dt[, 1:3]))
106 | expect_equal(iris[, 3, drop = FALSE],
107 | as.data.frame(iris_dt[, 3, drop = FALSE]))
108 | })
109 |
110 | ################################################################################
111 |
112 | test_that("Use 'scan' correctly", {
113 | expect_identical(scan(csv, "", skip = 0, nlines = 1, sep = "\n", quiet = TRUE),
114 | paste(names(iris), collapse = ","))
115 | expect_identical(scan(csv, "", skip = 1, nlines = 1, sep = "\n", quiet = TRUE),
116 | paste(as.matrix(iris)[1, ], collapse = ","))
117 | })
118 |
119 | ################################################################################
120 |
--------------------------------------------------------------------------------
/tests/testthat/test-split.R:
--------------------------------------------------------------------------------
1 | ################################################################################
2 |
3 | context("test-split.R")
4 |
5 | ################################################################################
6 |
7 | test_that("'split_every_nlines()' works", {
8 |
9 | tmp <- bigreadr::fwrite2(iris)
10 | test <- bigreadr:::split_every_nlines(tmp, tmp, 20, TRUE)
11 | files <- list.files(tempdir(), basename(tmp), full.names = TRUE)
12 | files2 <- c(tmp, paste0(tmp, "_", 1:8, ".txt"))
13 | expect_identical(normalizePath(sort(files)), normalizePath(files2))
14 | })
15 |
16 | ################################################################################
17 |
18 | test_that("'split_file()' works", {
19 |
20 | strings <- c("", "", " ", sapply(10^(seq(0, 4, by = 0.2)), function(i) {
21 | paste(as.matrix(iris)[sample(nrow(iris), i, TRUE), ], collapse = " ")
22 | }))
23 | for (every in c(1, 2, 4, 12, 24, 25)) {
24 | writeLines(sample(strings, replace = TRUE), tmp <- tempfile())
25 | # Infos are correct
26 | infos <- split_file(tmp, every, tmp2 <- tempfile())
27 | expect_identical(infos[["name_in"]], normalizePath(tmp))
28 | expect_identical(infos[["prefix_out"]], path.expand(tmp2))
29 | expect_identical(infos[["repeat_header"]], FALSE)
30 | expect_equal(ceiling(infos[["nlines_all"]] / infos[["nlines_part"]]),
31 | infos[["nfiles"]])
32 | expect_equal(infos[["nlines_all"]], 24)
33 | # New files all exist
34 | files <- get_split_files(infos)
35 | expect_true(all(file.exists(files)))
36 | # Number of lines and size is summing to whole input file
37 | expect_identical(sum(sapply(files, nlines)), nlines(tmp))
38 | expect_identical(sum(file.size(files)), file.size(tmp))
39 | # Content is the same
40 | expect_identical(do.call('c', lapply(files, readLines)), readLines(tmp))
41 | }
42 | })
43 |
44 | ################################################################################
45 |
46 | test_that("'split_file()' works with a repeated header", {
47 |
48 | # Reading split files is easier
49 | tf <- fwrite2(cars, tempfile(fileext = ".csv"))
50 | sf1 <- split_file(tf, 10)
51 | gsf1 <- get_split_files(sf1)
52 | expect_equal(sum(sapply(gsf1, nlines)), 51)
53 | expect_error(Reduce(rbind, lapply(gsf1, fread2)),
54 | "names do not match previous names")
55 |
56 | sf2 <- split_file(tf, 10, repeat_header = TRUE)
57 | gsf2 <- get_split_files(sf2)
58 | expect_equal(sapply(gsf2, readLines, n = 1), rep(readLines(tf, n = 1), 6),
59 | check.attributes = FALSE)
60 |
61 | loaded_df <- Reduce(rbind, lapply(gsf2, read.csv))
62 | expect_equal(names(loaded_df), c("speed", "dist"))
63 | expect_equal(nrow(loaded_df), 50)
64 |
65 | # Content is the same
66 | first_part <- readLines(gsf2[1])
67 | other_parts <- unlist(lapply(gsf2[-1], function(f) readLines(f)[-1]))
68 | expect_identical(c(first_part, other_parts), readLines(tf))
69 | })
70 |
71 | ################################################################################
72 |
73 | test_that("'split_file()' works with a repeated header (special cases)", {
74 |
75 | strings <- c("", "", " ", sapply(10^(seq(0, 4, by = 0.2)), function(i) {
76 | paste(as.matrix(iris)[sample(nrow(iris), i, TRUE), ], collapse = " ")
77 | }))
78 | for (every in c(1, 2, 4, 12, 24, 25)) {
79 | writeLines(sample(strings, replace = TRUE), tmp <- tempfile())
80 | # Infos are correct
81 | infos <- split_file(tmp, every, tmp2 <- tempfile(), repeat_header = TRUE)
82 | expect_identical(infos[["name_in"]], normalizePath(tmp))
83 | expect_identical(infos[["prefix_out"]], path.expand(tmp2))
84 | expect_identical(infos[["repeat_header"]], TRUE)
85 | nlines_all_without_header <- infos[["nlines_all"]] - infos[["nfiles"]]
86 | expect_equal(nlines_all_without_header + 1, 24)
87 | expect_equal(ceiling((nlines_all_without_header + 1) / infos[["nlines_part"]]),
88 | infos[["nfiles"]])
89 | # New files all exist
90 | files <- get_split_files(infos)
91 | expect_true(all(file.exists(files)))
92 | # Same first line for each file
93 | expect_equal(sapply(files, readLines, n = 1),
94 | rep(readLines(tmp, n = 1), infos[["nfiles"]]),
95 | check.attributes = FALSE)
96 | # Content is the same
97 | first_part <- readLines(files[1])
98 | other_parts <- unlist(lapply(files[-1], function(f) readLines(f)[-1]))
99 | expect_identical(c(first_part, other_parts), readLines(tmp))
100 | }
101 | })
102 |
103 | ################################################################################
104 |
--------------------------------------------------------------------------------
/tmp-save/nlines.cpp:
--------------------------------------------------------------------------------
1 | #include <Rcpp.h>
2 |
3 | #define BUFSIZE (1024 * 1024)
4 |
5 |
6 | //' Count number of lines
7 | //'
8 | //' @param filename Path to the file.
9 | //'
10 | //' @export
11 | //'
12 | // [[Rcpp::export]]
13 | double nlines1(std::string filename) {
14 |
15 | FILE *fp_in = fopen(filename.c_str(), "rb");
16 | // setvbuf(fp_in, NULL, _IOLBF, BUFSIZE);
17 |
18 | size_t size = 100;
19 | size_t last = size - 2;
20 |
21 | char *line = new char[size];
22 | char *temp= NULL;
23 | size_t c = 0;
24 | bool not_eol;
25 |
26 | while (fgets(line, size, fp_in) != NULL) {
27 |
28 | if (strlen(line) > last) {
29 |
30 | not_eol = (line[last] != '\n');
31 |
32 | size *= 2;
33 | temp = new char[size];
34 | delete [] line;
35 | line = temp;
36 | last = size - 2;
37 |
38 | if (not_eol) continue;
39 | }
40 |
41 | // End of line
42 | c++;
43 | }
44 |
45 | fclose(fp_in);
46 |
47 | return c;
48 | }
49 |
50 | #include <fstream>
51 | #include <algorithm>
52 | using namespace std;
53 |
54 | int FileRead(istream& is, char* buff) {
55 | is.read(buff, BUFSIZE);
56 | return is.gcount();
57 | }
58 |
59 | // [[Rcpp::export]]
60 | double nlines2(const char * filename) {
61 |
62 | ifstream ifs(filename, ios::in | ios::binary);
63 |
64 | char *buff = new char[BUFSIZE];
65 |
66 | size_t nlines = 0;
67 | while (int cc = FileRead(ifs, buff)) {
68 | nlines += std::count(buff, buff + cc, '\n');
69 | }
70 |
71 | delete [] buff;
72 |
73 | return nlines;
74 | }
75 |
76 | // [[Rcpp::export]]
77 | double nlines3(const char * filename) {
78 |
79 | FILE *fp = fopen(filename, "r");
80 |
81 | size_t nlines = 0;
82 |
83 | char c = 'a';
84 | while (c != EOF) {
85 | c = getc(fp);
86 | if (c == '\n') nlines++;
87 | }
88 |
89 | fclose(fp);
90 |
91 | return nlines;
92 | }
93 |
94 | // [[Rcpp::export]]
95 | double nlines4(std::string filename, int buff_size = 1024) {
96 |
97 | FILE *fp_in = fopen(filename.c_str(), "rb");
98 | // setvbuf(fp_in, NULL, _IOFBF, BUFSIZE);
99 |
100 | char *buff = new char[buff_size];
101 | // int buff_size_minus_one = buff_size - 1;
102 | size_t nlines = 0;
103 |
104 | while (feof(fp_in) == 0) {
105 | if (fgets(buff, buff_size, fp_in) == NULL)
106 | Rcpp::Rcout << "Error?" << std::endl;
107 |
108 | // Rcpp::Rcout << " : "<< strlen(buff) <<
109 | // " => " << (buff[strlen(buff) - 1] == '\n') << std::endl;
110 |
111 | if ((buff[strlen(buff) - 1] == '\n')) nlines++;
112 | }
113 |
114 | fclose(fp_in);
115 |
116 | return nlines;
117 | }
118 |
119 | // [[Rcpp::export]]
120 | double nlines5(std::string filename, int buff_size = 1024) {
121 |
122 | FILE *input_file = fopen(filename.c_str(), "rb");
123 | char buffer[buff_size + 1];
124 | size_t line_count = 0;
125 |
126 | while (!feof(input_file))
127 | {
128 | size_t chars_read = fread(buffer, 1, buff_size, input_file);
129 | for (unsigned int i = 0; i < chars_read; ++i)
130 | {
131 | if (buffer[i] == '\n')
132 | {
133 | ++line_count;
134 | }
135 | }
136 | }
137 |
138 | fclose(input_file);
139 |
140 | return line_count;
141 | }
142 |
143 | // [[Rcpp::export]]
144 | double nlines6(std::string filename) {
145 |
146 | size_t newlines = 0;
147 | char buf[BUFSIZE];
148 | size_t BUFSIZE_M1 = BUFSIZE - 1;
149 | size_t BUFSIZE_M2 = BUFSIZE - 2;
150 | FILE* file = fopen(filename.c_str(), "rb");
151 |
152 | while (fgets(buf, BUFSIZE, file)) {
153 | if (strlen(buf) != BUFSIZE_M1 || buf[BUFSIZE_M2] != '\n')
154 | newlines++;
155 | }
156 |
157 | return newlines;
158 | }
159 |
160 |
161 |
162 | #include <sys/types.h>
163 | #include <sys/stat.h>
164 | #include <fcntl.h>
165 | #include <unistd.h>
166 |
167 | // [[Rcpp::export]]
168 | double nlines7(std::string filename) {
169 |
170 | int fd = open(filename.c_str(), O_RDONLY, 0);
171 |
172 | char *buff = new char[BUFSIZE];
173 | size_t nlines = 0;
174 |
175 | while (int len = read(fd, buff, BUFSIZE)) {
176 |
177 | if (len == -1) {
178 | (void)close(fd);
179 | break;
180 | }
181 |
182 | for (int i = 0; i < len; i++)
183 | if (buff[i] == '\n') nlines++;
184 | }
185 |
186 | (void)close(fd);
187 |
188 | return nlines;
189 | }
190 |
191 | /*** R
192 | cars2 <- cars[rep(1:50, 20e2), rep(1:2, 100)]
193 | # cars2 <- cars[rep(1:50, 5), rep(1:2, 30e3)]
194 | bigreadr::fwrite2(cars2, "tmp-data/cars.csv")
195 | for (i in 2:10) bigreadr::fwrite2(cars2, "tmp-data/cars.csv", append = TRUE)
196 |
197 | system.time(print(nlines7("tmp-data/cars.csv")))
198 | system.time(print(nlines1("tmp-data/cars.csv")))
199 | system.time(system("wc -l tmp-data/cars.csv"))
200 | system.time(print(bigreadr::nlines("tmp-data/cars.csv")))
201 |
202 |
203 | # microbenchmark::microbenchmark(
204 | # nlines1("tmp-data/cars.csv"), # 1000
205 | # # nlines2("tmp-data/cars.csv"), # 1500
206 | # # nlines3("tmp-data/cars.csv"), # 33500
207 | # # nlines4("tmp-data/cars.csv", 1024), # 1050
208 | # # nlines4("tmp-data/cars.csv", 1024 * 1024), # 1100
209 | # # nlines5("tmp-data/cars.csv", 1024), # 1050
210 | # # nlines5("tmp-data/cars.csv", 1024 * 1024), # 1100
211 | # nlines6("tmp-data/cars.csv"), # 1050
212 | # # nlines5("tmp-data/cars.csv", 1024 * 1024 * 64), # 1100
213 | # # nlines_mmap("tmp-data/cars.csv"), # 1900
214 | # # bigreadr::nlines("tmp-data/cars.csv"), # 3400
215 | # system("wc -l tmp-data/cars.csv"), # 400
216 | # # system("grep -c '\n' tmp-data/cars.csv"), # 400
217 | # times = 5
218 | # )
219 | #### 5M x 200 ####
220 | # Unit: milliseconds
221 | # expr min lq mean median
222 | # nlines("tmp-data/cars.csv") 2092.0324 2098.8990 2138.311 2101.8745
223 | # bigreadr::nlines("tmp-data/cars.csv") 6746.9176 6762.7296 6868.384 6799.3394
224 | # system("wc -l tmp-data/cars.csv") 853.2787 856.6954 863.299 862.6793
225 | # uq max neval
226 | # 2113.3909 2448.5013 10
227 | # 6816.9416 7438.5126 10
228 | # 867.3886 883.3312 10
229 |
230 | #### 5K x 200K ####
231 | # Unit: milliseconds
232 | # expr min lq mean median
233 | # nlines("tmp-data/cars.csv") 1852.4570 1858.6921 2429.795 1934.0913
234 | # bigreadr::nlines("tmp-data/cars.csv") 6557.9264 6621.6394 6982.951 6836.6807
235 | # system("wc -l tmp-data/cars.csv") 798.7292 845.8318 1426.601 864.2086
236 | # uq max neval
237 | # 2312.193 5831.689 10
238 | # 7211.877 7922.534 10
239 | # 1092.094 5640.510 10
240 |
241 | val <- try(system(paste("wc -l", "tmp-data/cars.csv"), intern = TRUE,
242 | ignore.stderr = TRUE), silent = TRUE)
243 | val <- `if`(class(val) == "try-error", nlines1("tmp-data/cars.csv"),
244 | as.numeric(strsplit(val, " ")[[1]][1]))
245 | */
246 |
--------------------------------------------------------------------------------
/tmp-tests/bench-acc.R:
--------------------------------------------------------------------------------
1 |
2 | library(data.table)
3 | iris_dt <- as.data.table(iris)
4 | microbenchmark::microbenchmark(
5 | iris[, 1:3],
6 | iris[1:3],
7 | iris_dt[, 1:3],
8 | iris[, 3, drop = FALSE],
9 | iris[3],
10 | iris_dt[, 3, drop = FALSE]
11 | )
12 |
13 |
--------------------------------------------------------------------------------
/tmp-tests/bench-rbind.R:
--------------------------------------------------------------------------------
1 | mtcars <- datasets::mtcars
2 | mtcars <- mtcars[rep(1:32, 1000), rep(1:11, 10)]
3 | mtcars_dt <- data.table::as.data.table(mtcars)
4 |
5 | list_mtcars <- rep(list(mtcars), 10)
6 | list_mtcars_dt <- rep(list(mtcars_dt), 10)
7 |
8 | rbind_df <- function(list_df) {
9 | list_df_merged <- lapply(seq_along(list_df[[1]]), function(k) {
10 | unlist(lapply(list_df, function(l) l[[k]]))
11 | })
12 | list_df_merged_named <- stats::setNames(list_df_merged, names(list_df[[1]]))
13 | as.data.frame(list_df_merged_named, stringsAsFactors = FALSE)
14 | }
15 |
16 | rbind_df2 <- function(list_df) {
17 | data.table::rbindlist(list_df)
18 | }
19 |
20 | microbenchmark::microbenchmark(
21 |
22 | A1 = rbind.data.frame(mtcars),
23 | A2 = rbind.data.frame(mtcars_dt),
24 | B1 = rbind_df(list(mtcars)),
25 | B2 = rbind_df(list(mtcars_dt)),
26 | C1 = rbind_df2(list(mtcars)),
27 | C2 = rbind_df2(list(mtcars_dt)),
28 |
29 | AA1 = do.call(rbind.data.frame, list_mtcars),
30 | AA2 = do.call(rbind.data.frame, list_mtcars_dt),
31 | BB1 = rbind_df(list_mtcars),
32 | BB2 = rbind_df(list_mtcars_dt),
33 | CC1 = rbind_df2(list_mtcars),
34 | CC2 = rbind_df2(list_mtcars_dt),
35 |
36 | times = 10
37 | )
38 |
--------------------------------------------------------------------------------
/tmp-tests/bench-read.R:
--------------------------------------------------------------------------------
1 | csv <- readr::readr_example("mtcars.csv")
2 | df <- data.table::fread(csv, data.table = FALSE)
3 |
4 | ## LONG CSV
5 | csv2 <- "tmp-data/mtcars-long.csv"
6 | # data.table::fwrite(df[rep(seq_len(nrow(df)), 500000), ], csv2,
7 | # quote = FALSE, row.names = FALSE)
8 |
9 | system.time(
10 | df2 <- data.table::fread(csv2)
11 | ) # 3.5
12 |
13 | system.time(
14 | df3 <- readr::read_csv(csv2)
15 | ) # 25
16 | rm(df2, df3); gc(reset = TRUE)
17 |
18 |
19 | system.time(nlines <- fpeek::peek_count_lines(csv2)) # 1.8
20 | system.time(nlines2 <- nrow(data.table::fread(csv2, select = 1))) # 2.8
21 |
22 | tmp <- tempfile()
23 | if (Sys.info()[["sysname"]] == "Windows") {
24 |
25 | # https://sourceforge.net/projects/gnuwin32/
26 | awk <- shortPathName("C:/Program Files (x86)/GnuWin32/bin/awk.exe") # Windows
27 | cmd <- sprintf("%s \"NR%%%d==1{x=\"\"\"%s\"\"\"++i;}{print > x}\" %s",
28 | awk, 20, gsub("\\\\", "\\\\\\\\", tmp), normalizePath(csv))
29 |
30 | } else {
31 |
32 | cmd <- sprintf("awk 'NR%%%d==1{x=\"%s\"++i;}{print > x}' %s",
33 | 20, tmp, normalizePath(csv))
34 |
35 | }
36 | system(cmd)
37 | readLines(paste0(tmp, 1), 1)
38 |
39 | cmd <- sprintf("%s \"NR%%%d==1{x=\"\"\"%s\"\"\"++i;}{print > x}\" %s",
40 | awk, 20000, gsub("\\\\", "\\\\\\\\", tmp), normalizePath(csv2))
41 | system.time(system(cmd)) # 1.4
42 | # readLines(paste0(tmp, 1))
43 |
44 |
45 | ## LARGE CSV
46 | csv3 <- "tmp-data/mtcars-wide.csv"
47 | data.table::fwrite(df[rep(seq_len(nrow(df)), 500), rep(seq_len(ncol(df)), 1000)], csv3,
48 | quote = FALSE, row.names = FALSE)
49 |
50 | system.time(
51 | df2 <- data.table::fread(csv3, data.table = FALSE)
52 | ) # 0.06 -> 0.65 -> 9.8
53 | system.time(
54 | nlines <- nrow(data.table::fread(csv3, select = 1))
55 | ) # 0.1 -> 0.45 -> 4.5
56 | system.time(nlines2 <- fpeek::peek_count_lines(csv3))
57 |
58 | # system.time(
59 | # df3 <- readr::read_csv(csv3)
60 | # ) # 6
61 |
62 | cmd <- sprintf("%s \"NR%%%d==1{x=\"\"\"%s\"\"\"++i;}{print > x}\" %s",
63 | awk, 2, gsub("\\\\", "\\\\\\\\", tmp), normalizePath(csv3))
64 | system.time(system(cmd)) # 1.4
65 | # readLines(paste0(tmp, 1))
66 |
67 |
--------------------------------------------------------------------------------
/tmp-tests/bench-read2.R:
--------------------------------------------------------------------------------
1 | # https://sourceforge.net/projects/gnuwin32/files/coreutils/5.3.0/coreutils-5.3.0.exe/download
2 |
3 | csv <- readr::readr_example("mtcars.csv")
4 | # split <- shortPathName("C:\\Program Files (x86)\\GnuWin32/bin/split.exe")
5 | split <- "split"
6 |
7 | system(sprintf("%s --version", split)) == 0
8 | # system(sprintf("%s -l 5 %s", split, csv))
9 |
10 | ## LONG CSV
11 | df <- data.table::fread(csv, data.table = FALSE)
12 | csv2 <- tempfile(fileext = ".csv")
13 | data.table::fwrite(df[rep(seq_len(nrow(df)), 500000), ], csv2,
14 | quote = FALSE, row.names = FALSE)
15 | file.size(csv2)
16 |
17 | # system.time(system(sprintf("find /c /v \"aabbccdd\" %s", csv2)))
18 |
19 | system.time(data.table::fread(csv2, nThread = 1)) ## 2.2
20 | system.time(data.table::fread(csv2, nThread = 2)) ## 1.5
21 | system.time(data.table::fread(csv2, nThread = 4)) ## 1
22 | system.time(data.table::fread(csv2, nThread = 7)) ## 0.7
23 |
24 | tmp <- tempfile()
25 | system.time(system(sprintf("%s -l 200000 %s %s", split, csv2, tmp))) ## 12 sec
26 | system.time(fpeek::peek_count_lines(csv2)) ## 3 sec
27 | system.time(nrow(data.table::fread(csv2, select = 1)))
28 |
29 | files <- list.files(dirname(tmp), basename(tmp), full.names = TRUE)
30 | df1 <- data.table::fread(files[1], data.table = FALSE)
31 | data.table::fread(tail(files, 1), col.names = names(df1), data.table = FALSE)
32 |
33 | scan(csv, "", sep = ",", nlines = 1, skip = 0)
34 |
35 |
36 | df <- mtcars
37 | df2 <- unname(mtcars)
38 |
39 | sapply(df, data.table::address)
40 | sapply(df2, data.table::address)
41 |
42 |
43 | microbenchmark::microbenchmark(
44 | as.matrix(unname(mtcars), rownames.force = FALSE),
45 | as.matrix(mtcars)
46 | )
47 |
--------------------------------------------------------------------------------
/tmp-tests/bench-read3.R:
--------------------------------------------------------------------------------
1 |
2 | ## LONG CSV
3 | csv2 <- "tmp-data/mtcars-long.csv"
4 | # data.table::fwrite(df[rep(seq_len(nrow(df)), 500000), ], csv2,
5 | # quote = FALSE, row.names = FALSE)
6 |
7 | library(bigreadr)
8 | if (Sys.info()[["sysname"]] == "Windows") {
9 | options(bigreadr.split = "C:\\Program Files (x86)\\GnuWin32/bin/split.exe")
10 | }
11 |
12 | system.time(
13 | test <- split_file(csv2)
14 | )
15 |
16 | rm(test2); gc(reset = TRUE)
17 | system.time(
18 | test2 <- big_fread(csv2, every_x_mb = 100)
19 | )
20 | gc() # + 2 GB
21 |
22 | rm(test2); gc(reset = TRUE)
23 | system.time(
24 | test2 <- data.table::fread(csv2)
25 | )
26 | gc() # + 1 GB
27 |
28 | # system.time(test <- split_file(csv2, every_x_mb = 1000))
29 | # system.time(test <- split_file(csv2, every_x_mb = 10))
30 | system.time(tmp <- lapply(test, function(f) data.table::fread(f, data.table = FALSE)))
31 |
32 | system.time(tmp2 <- do.call(my_rbind, tmp))
33 |
34 | system.time(
35 | test2 <- big_fread(csv2, every_x_mb = 100)
36 | )
37 | system.time(
38 | test3 <- data.table::fread(csv2)
39 | )
40 |
41 |
42 | tmp <- tempfile()
43 | system.time(
44 | status <- system(sprintf("%s -C %dm %s %s", "split", 100, csv2, tmp))
45 | )
46 | file_parts <- list.files(dirname(tmp), basename(tmp), full.names = TRUE)
47 |
48 | dt1 <- data.table::fread(file_parts[1])
49 |
50 | system.time(df2 <- data.table::fread(csv2, data.table = FALSE))
51 | system.time(df3 <- bigreadr::big_fread(
52 | csv2, .transform = identity
53 | ))
54 |
--------------------------------------------------------------------------------
/tmp-tests/bench-read4.R:
--------------------------------------------------------------------------------
1 |
2 | ## LONG CSV
3 | csv2 <- "tmp-data/mtcars-long.csv"
4 |
5 | Rcpp::sourceCpp('tmp-tests/test-setvbuf.cpp')
6 |
7 | # system.time(test <- test_setvbuf(csv2, 10))
8 | system.time(test <- test_setvbuf2(csv2))
9 | system.time(test2 <- fpeek::peek_count_lines(csv2))
10 |
11 | csv2.2 <- sub("\\.csv$", "2.csv", csv2)
12 | system.time(test <- test_setvbuf3(csv2, csv2.2))
13 |
14 | # df1 <- data.table::fread(csv2)
15 | # df2 <- data.table::fread(csv2.2)
16 | # identical(df1, df2)
17 | #
18 | # system.time(file.copy(csv2, sub("\\.csv$", "3.csv", csv2))) # 1.5 sec
19 |
--------------------------------------------------------------------------------
/tmp-tests/bench-read5.R:
--------------------------------------------------------------------------------
1 |
2 | library(bigreadr)
3 | if (Sys.info()[["sysname"]] == "Windows") {
4 | options(bigreadr.split = "C:\\Program Files (x86)\\GnuWin32/bin/split.exe")
5 | }
6 |
7 |
8 | ## LONG CSV
9 | csv2 <- "tmp-data/mtcars-long.csv"
10 | # csv <- readr::readr_example("mtcars.csv")
11 | # df <- data.table::fread(csv, data.table = FALSE)
12 | # data.table::fwrite(df[rep(seq_len(nrow(df)), 500000), ], csv2,
13 | # quote = FALSE, row.names = FALSE)
14 |
15 | nlines(csv2)
16 | system.time(
17 | test <- split_file(csv2)
18 | )
19 | # Windows: 4.6 / 8.2 / 8.9
20 | # Linux: 1.5 / 1.8 / 1.4
21 | # Linux2: 1.4 / 1.3 / 1.1 / 1.3
22 |
23 | Rcpp::sourceCpp('tmp-tests/test-setvbuf5.cpp')
24 | tmp <- tempfile()
25 | system.time(
26 | test2 <- test_setvbuf6(csv2, tmp, 1e6)
27 | )
28 | # Windows: 15 / 4.8 / 5.0 / 4.4
29 | # Linux: 5.4 / 3.3 / 3.6 / 3.5 / 2.8
30 | # Linux2: 1.3 / 1.8 / 1.8 / 1.7
31 | as.integer(test2)
32 | list.files(dirname(tmp), basename(tmp))
33 |
34 |
35 |
36 | ## LARGE CSV
37 | csv3 <- "tmp-data/mtcars-wide.csv"
38 | # data.table::fwrite(df[rep(seq_len(nrow(df)), 50), rep(seq_len(ncol(df)), 10000)],
39 | # csv3, quote = FALSE, row.names = FALSE)
40 |
41 | nlines(csv3)
42 | system.time(
43 | test <- split_file(csv3)
44 | )
45 | # Windows: 4.3 / 3.9 / 9.6
46 | # Linux: 3.2 / 1.4 / 3.7
47 | # Linux2: 1.4 / 1.2 / 1.1
48 |
49 | Rcpp::sourceCpp('tmp-tests/test-setvbuf5.cpp')
50 | tmp <- tempfile()
51 | system.time(
52 | test2 <- test_setvbuf6(csv3, tmp, 100)
53 | )
54 | # Windows: 14. / 5.0 / 4.6
55 | # Linux: 1.7 / 1.7 / 6.5
56 | # Linux2: 0.4 / 1.1 / 1.2 / 1.2
57 | as.integer(test2)
58 | list.files(dirname(tmp), basename(tmp))
59 |
--------------------------------------------------------------------------------
/tmp-tests/bench-read6.R:
--------------------------------------------------------------------------------
1 | library(bigreadr)
2 |
3 | long <- FALSE
4 | if (long) {
5 | csv2 <- "tmp-data/mtcars-long.csv"
6 | block <- 1e6
7 | M <- 11
8 | block2 <- 3
9 | } else {
10 | csv2 <- "tmp-data/mtcars-wide.csv"
11 | block <- 1e3
12 | M <- 11e3
13 | block2 <- 3
14 | }
15 |
16 |
17 | library(bigstatsr)
18 | (n1 <- bigreadr::nlines(csv2))
19 |
20 | # debugonce(big_read)
21 | # tmp <- gc(reset = TRUE)
22 | # system.time(
23 | # test <- big_read(csv2, header = TRUE, sep = ",",
24 | # nlines = n1, confirmed = TRUE,
25 | # nlines.block = block, type = "double")
26 | # ) # 38 sec // 912 sec
27 | # gc() - tmp
28 |
29 | tmp <- gc(reset = TRUE)
30 | system.time({
31 | X <- FBM(n1 - 1, M)
32 | offset <- 0
33 | test2 <- big_fread1(csv2, block, .transform = function(df) {
34 | ind <- rows_along(df)
35 | X[offset + ind, ] <- as.matrix(df)
36 | offset <<- offset + length(ind)
37 | NULL
38 | }, .combine = c)
39 | }) # 16 sec // 122 sec
40 | gc() - tmp
41 |
42 | # all.equal(dim(test$FBM), dim(X))
43 | # all.equal(test$FBM[, 1], X[, 1])
44 | # all.equal(test$FBM[, 11], X[, 11])
45 |
46 | tmp <- gc(reset = TRUE)
47 | system.time({
48 | X2 <- FBM(n1 - 1, M)
49 | offset <- 0
50 | test3 <- big_fread2(csv2, block2, .transform = function(df) {
51 | print(offset)
52 | ind <- cols_along(df)
53 | X2[, offset + ind] <- as.matrix(df)
54 | offset <<- offset + length(ind)
55 | NULL
56 | }, .combine = c)
57 | }) # 16 sec // 122 sec
58 | gc() - tmp
59 |
60 | all.equal(dim(X2), dim(X))
61 | all.equal(X2[, 1], X[, 1])
62 | all.equal(X2[, 11], X[, 11])
63 | all.equal(X2[, M], X[, M])
64 |
65 |
--------------------------------------------------------------------------------
/tmp-tests/bench-read7.R:
--------------------------------------------------------------------------------
1 | csv <- "tmp-data/mtcars-long.csv"
2 | csv2 <- "tmp-data/mtcars-wide.csv"
3 |
4 | ## System command 'cut' is super slow on my Windows.
5 |
6 | tmp <- gc(reset = TRUE)
7 | system.time(
8 | test2 <- data.table::fread(sprintf("cut -f1-5 -s -d',' %s", csv))
9 | )
10 | gc() - tmp
11 |
12 | tmp <- gc(reset = TRUE)
13 | system.time(
14 | test2 <- data.table::fread(sprintf("cut -f1-50000 -s -d',' %s", csv2))
15 | )
16 | gc() - tmp
17 |
18 |
19 | tmp <- gc(reset = TRUE)
20 | system.time(
21 | test2 <- data.table::fread(csv, select = 1:5)
22 | )
23 | gc() - tmp
24 |
25 | tmp <- gc(reset = TRUE)
26 | system.time(
27 | test2 <- data.table::fread(csv2, select = 1:50000)
28 | )
29 | gc() - tmp
30 |
31 |
32 | tmp <- gc(reset = TRUE)
33 | system.time(
34 | test2 <- data.table::fread(csv2, select = 1:10000)
35 | )
36 | gc() - tmp
37 |
38 | tryCatch(data.table::fread(file = csv, nrows = 0, skip = 1),
39 | error = function(e) NULL)
40 | dt <- data.table::fread(file = csv, select = c(5, 1, 3), verbose = TRUE)
41 | names(dt)
42 | names(mtcars)[c(5, 1, 3)]
43 | dt2 <- `[.data.frame`(dt, names(mtcars)[c(5, 1, 3)])
44 | dt2[1]
45 | class(dt2)
46 |
47 | library(data.table)
48 | fwrite(iris, tmp <- tempfile())
49 | debugonce(fread)
50 | data.table::fread(file = tmp, select = c(5, 1, 3), skip = 0)
51 | data.table::fread(file = tmp, select = c(5, 1, 3), skip = 1)
52 |
53 | system.time(first_line <- fread(csv2, nrows = 1))
54 | system.time(zero_line <- fread(csv2, nrows = 0))
55 | system.time(first_line <- fread(csv2, nrows = 1, skip = 1))
56 |
57 | # system.time(
58 | # df4 <- limma::read.columns(csv, names(mtcars)[1:4], sep = ",")
59 | # ) # 32 sec
60 |
--------------------------------------------------------------------------------
/tmp-tests/has-header.R:
--------------------------------------------------------------------------------
1 | ## Draft heuristic to detect whether the original file has a header:
2 | ## locate each column name of the first read part inside the raw first line of
3 | ## the file; if all names appear at increasing positions, assume a header.
4 | part1 <- fread2(file_parts[1], skip = skip, ...)
5 | first_line <- scan(file, "", skip = skip, nlines = 1, sep = "\n", quiet = TRUE)
6 | match_names <- sapply(names(part1), regexpr, text = first_line, fixed = TRUE)
7 | has_header <- all( diff(match_names) > 0 )
8 |
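9 | ## Toy, self-contained illustration of the same check (the file, column names
10 | ## and values below are made up for this sketch, not part of the snippet above):
11 | tmp <- tempfile()
12 | writeLines(c("mpg,cyl,disp", "21,6,160", "22.8,4,108"), tmp)
13 | first_line_demo <- scan(tmp, "", nlines = 1, sep = "\n", quiet = TRUE)
14 | match_demo <- sapply(c("mpg", "cyl", "disp"), regexpr, text = first_line_demo, fixed = TRUE)
15 | all(diff(match_demo) > 0)  # TRUE -> treat the first line as a header
16 |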
--------------------------------------------------------------------------------
/tmp-tests/split.cpp:
--------------------------------------------------------------------------------
1 | // [[Rcpp::depends(BH, bigstatsr)]]
2 | #include <bigstatsr/BMAcc.h>
3 | #define BUFLEN (64 * 1024)
4 |
5 | // [[Rcpp::export]]
6 | NumericVector test_setvbuf7(std::string filename,
7 | std::string filename2,
8 | int every_nlines,
9 | Environment parts_) {
10 |
11 | XPtr<FBM> xptr = parts_["address"];
12 | BMAcc<int> parts(xptr);
13 |
14 | FILE *fp_in = fopen(filename.c_str(), "rb"), *fp_out;
15 | setvbuf(fp_in, NULL, _IOLBF, BUFLEN);
16 |
17 | const char *fn_out = filename2.c_str();
18 | char name_out[strlen(fn_out) + 20];
19 |
20 | size_t line_size;
21 | size_t size = 100;
22 | size_t last = size - 2;
23 |
24 | char *line = new char[size];
25 | char *temp;
26 |
27 | bool not_eol, not_eof = true;
28 | int i, k = 0, c = 0;
29 |
30 |
31 | while (not_eof) {
32 |
33 | // Open file number 'k'
34 | sprintf(name_out, "%s%d.txt", fn_out, ++k);
35 | fp_out = fopen(name_out, "wb");
36 | setvbuf(fp_out, NULL, _IOFBF, BUFLEN);
37 |
38 | // Fill it with 'every_nlines' lines
39 | i = 0;
40 | while (i < every_nlines) {
41 |
42 | if (fgets(line, size, fp_in) == NULL) {
43 | not_eof = false;
44 | break;
45 | }
46 |
47 | line_size = strlen(line);
48 |
49 | fputs(line, fp_out);
50 |
51 | if (line_size > last) {
52 |
53 | not_eol = (line[last] != '\n');
54 |
55 | fflush(fp_out);
56 | size *= 2;
57 | temp = new char[size];
58 | delete [] line;
59 | line = temp;
60 | last = size - 2;
61 |
62 | if (not_eol) continue;
63 | }
64 |
65 | // End of line
66 | i++;
67 |
68 | }
69 |
70 | c += i;
71 |
72 | // Close file number 'k'
73 | fflush(fp_out);
74 | fclose(fp_out);
75 | parts(k - 1, 0) = 1; // OK to process
76 | Rcout << k << std::endl;
77 |
78 | }
79 |
80 | fclose(fp_in);
81 |
82 | return NumericVector::create(_["K"] = k, _["every"] = every_nlines, _["N"] = c);
83 | }
84 |
85 |
--------------------------------------------------------------------------------
/tmp-tests/test-file2string.cpp:
--------------------------------------------------------------------------------
1 | #include <Rcpp.h>
2 | #include <fstream>
3 | #include <string>
4 | using namespace std;
5 |
6 | // [[Rcpp::export]]
7 | std::string file2string(std::string fn) {
8 |
9 | std::string str, strTotal;
10 | ifstream in;
11 | in.open(fn.c_str());
12 | getline(in, str);
13 | while ( in ) {
14 | Rcpp::Rcout << strTotal.max_size() << std::endl;
15 | strTotal += str + '\n';
16 | getline(in, str);
17 | }
18 |
19 | return strTotal;
20 | }
21 |
22 | // [[Rcpp::export]]
23 | std::string file2string2(std::string fn) {
24 |
25 | std::ifstream ifs(fn.c_str());
26 | std::string content( (std::istreambuf_iterator<char>(ifs) ),
27 | (std::istreambuf_iterator<char>() ) );
28 |
29 | return content;
30 | }
31 |
32 |
33 | /*** R
34 | test <- file2string("text-write.txt")
35 | writeLines(test)
36 | test2 <- file2string2("text-write.txt")
37 | writeLines(test2)
38 | csv2 <- "tmp-data/mtcars-long.csv"
39 | # system.time(test3 <- file2string2(csv2))
40 | */
41 |
--------------------------------------------------------------------------------
/tmp-tests/test-mmap-nlines.cpp:
--------------------------------------------------------------------------------
1 | // [[Rcpp::depends(rmio)]]
2 | // [[Rcpp::plugins(cpp11)]]
3 | #include <mio/mmap.hpp>
4 | #include <system_error> // for std::error_code
5 | #include <Rcpp.h>
6 |
7 | using std::size_t;
8 |
9 |
10 | // [[Rcpp::export]]
11 | double nlines_mmap(std::string path) {
12 |
13 | // Memory-map the file
14 | std::error_code error;
15 | mio::ummap_source ro_ummap;
16 | ro_ummap.map(path, error);
17 | if (error) Rcpp::stop("Error when mapping file:\n %s.\n", error.message());
18 |
19 | int nlines = std::count_if(ro_ummap.begin(), ro_ummap.end(),
20 | [](unsigned char x) { return x == '\n'; });
21 |
22 | size_t nbytes = ro_ummap.size();
23 | // size_t nlines = 0;
24 | // for (size_t k = 0; k < nbytes; k++) {
25 | // if (ro_ummap[k] == '\n') nlines++;
26 | // }
27 |
28 | if (ro_ummap[nbytes - 1] != '\n') nlines++;
29 |
30 | return nlines;
31 | }
32 |
33 | // [[Rcpp::export]]
34 | double nlines_mmap2(std::string path) {
35 |
36 | // Memory-map the file
37 | std::error_code error;
38 | mio::ummap_source ro_ummap;
39 | ro_ummap.map(path, error);
40 | if (error) Rcpp::stop("Error when mapping file:\n %s.\n", error.message());
41 |
42 | size_t nbytes = ro_ummap.size();
43 | size_t nlines = 0;
44 | for (size_t k = 0; k < (nbytes - 4); k += 4) {
45 | nlines += ((ro_ummap[k] == '\n') + (ro_ummap[k + 1] == '\n')) +
46 | ((ro_ummap[k + 2] == '\n') + (ro_ummap[k + 3] == '\n'));
47 | }
48 |
49 | // TODO: also count '\n' in the remaining trailing bytes, and check that the file has more than 4 bytes
50 |
51 | if (ro_ummap[nbytes - 1] != '\n') nlines++;
52 |
53 | return nlines;
54 | }
55 |
56 | /*** R
57 | nlines_mmap("../tmp-data/cars.csv.bk")
58 | nlines_mmap2("../tmp-data/cars.csv.bk")
59 | */
60 |
--------------------------------------------------------------------------------
/tmp-tests/test-parallel.R:
--------------------------------------------------------------------------------
1 | library(bigreadr)
2 | library(bigstatsr)
3 | library(foreach)
4 |
5 | ## Need to handle 'skip'
6 | csv2 <- "tmp-data/mtcars-long.csv"
7 | n <- nlines(csv2)
8 | K <- 20
9 | every_lines <- ceiling(n / 20)
10 | TIME <- 1 / (10 * K)  # polling interval, as in test-parallel2.R
11 | Rcpp::sourceCpp('tmp-tests/test-setvbuf6.cpp')
12 | tmp <- tempfile()
13 | parts <- FBM(K, 1, init = 0, type = "integer")
14 | system.time(
15 | test <- test_setvbuf7(csv2, tmp, every_nlines = every_lines, parts)
16 | )
17 | as.integer(test)
18 | files <- paste0(tmp, 1:K, ".txt")
19 | file.exists(files)
20 |
21 | system.time({
22 | res2 <- foreach(ic = 1:K) %do% {
23 | while (parts[ic] == 0) Sys.sleep(TIME)
24 | bigreadr:::fread2(files[ic], nThread = 8)
25 | }
26 | }) # 0.9 / 1 (8) -> 2.4 (1)
27 | ## Either use all threads (8) or only 1
28 |
29 |
30 |
--------------------------------------------------------------------------------
/tmp-tests/test-parallel2.R:
--------------------------------------------------------------------------------
1 | library(bigreadr)
2 | library(bigstatsr)
3 | library(foreach)
4 |
5 | ## Need to handle 'skip'
6 | csv2 <- "tmp-data/mtcars-long.csv"
7 | n <- nlines(csv2)
8 | K <- 20
9 | every_lines <- ceiling(n / 20)
10 |
11 | parallel <- TRUE
12 | if (!parallel) {
13 | registerDoSEQ()
14 | } else {
15 | cl <- parallel::makeCluster(2)
16 | doParallel::registerDoParallel(cl)
17 | # on.exit(parallel::stopCluster(cl), add = TRUE)
18 | }
19 |
20 | TIME <- 1 / (10 * K)
21 | parts <- FBM(K, 1, init = 0, type = "integer")
22 | tmp <- tempfile()
23 | files <- paste0(tmp, 1:K, ".txt")
24 | system.time({
25 | res <- foreach(job = 1:2) %dopar% {
26 |
27 | if (job == 1) {
28 | print(1)
29 | system.time(
30 | test <- bigreadr:::test_setvbuf7(csv2, tmp, every_nlines = every_lines, parts)
31 | )
32 | # NULL
33 | } else {
34 | print(2)
35 | system.time({
36 | lapply(seq_along(files), function(k) {
37 | while (parts[k] == 0) Sys.sleep(TIME)
38 | bigreadr:::fread2(files[k])
39 | })
40 | })
41 | }
42 | }
43 | })
44 | parallel::stopCluster(cl)
45 | res
46 | # res <- do.call(bigreadr::my_rbind, res[[2]])
47 |
48 |
49 | #### PROBLEM: fread() in the second job is slowing down the first job ####
50 |
51 |
52 | system.time({
53 | lapply(seq_along(files), function(k) {
54 | while (parts[k] == 0) Sys.sleep(TIME)
55 | bigreadr:::fread2(files[k], nThread = 8)
56 | })
57 | })
58 |
--------------------------------------------------------------------------------
/tmp-tests/test-setvbuf.cpp:
--------------------------------------------------------------------------------
1 | #include <Rcpp.h>
2 | using namespace Rcpp;
3 |
4 | #include <cstdio>
5 | #include <cstdlib>
6 | #include <cstring>
7 | #include <string>
8 |
9 | #define BUFLEN (64 * 1024)
10 |
11 | // [[Rcpp::export]]
12 | int test_setvbuf(std::string filename, int size = 100) {
13 |
14 | FILE *fp = fopen(filename.c_str(), "r");
15 |
16 | unsigned sizem1 = size - 1;
17 | int last = size - 2;
18 |
19 | char line[size];
20 | // char *id;
21 | // char *token;
22 | char *buf = (char*)malloc(BUFLEN);
23 | int c = 0;
24 |
25 | setvbuf ( fp , buf , _IOLBF, BUFLEN );
26 | while (fgets(line, size, fp) != NULL) {
27 | // Rcout << strlen(line) << std::endl;
28 | if (strlen(line) < sizem1) {
29 | c++;
30 | } else {
31 | // Rcout << (line[last] == '\n') << std::endl;
32 | if (line[last] == '\n') c++;
33 | }
34 | // id = strtok(line, "\t");
35 | // token = strtok(NULL, "\t");
36 | //
37 | // char *fnout = malloc(strlen(id)+5);
38 | // fnout = strcat(fnout, id);
39 | // fnout = strcat(fnout, ".seq");
40 | //
41 | // fpout = fopen(fnout, "w");
42 | // setvbuf ( fpout , NULL , _IONBF , 0 );
43 | // fprintf(fpout, "%s", token);
44 | // fclose(fpout);
45 | }
46 |
47 | fclose(fp);
48 |
49 | return c;
50 |
51 | }
52 |
53 | // [[Rcpp::export]]
54 | int test_setvbuf2(std::string filename, int size = 100) {
55 |
56 | FILE *fp = fopen(filename.c_str(), "r");
57 |
58 | unsigned sizem1 = size - 1;
59 | int last = size - 2;
60 |
61 | char * line = new char[size];
62 | char * temp;
63 | // char *id;
64 | // char *token;
65 | // char *buf = (char*)malloc(BUFLEN);
66 | int c = 0;
67 |
68 | setvbuf ( fp , NULL , _IOLBF, BUFLEN );
69 | while (fgets(line, size, fp) != NULL) {
70 | // Rcout << strlen(line) << std::endl;
71 | if (strlen(line) < sizem1) {
72 | c++;
73 | } else {
74 | // Rcout << (line[last] == '\n') << std::endl;
75 | if (line[last] == '\n') c++;
76 | size *= 2;
77 | temp = new char[size];
78 | delete [] line;
79 | line = temp;
80 | sizem1 = size - 1;
81 | last = size - 2;
82 | }
83 | // id = strtok(line, "\t");
84 | // token = strtok(NULL, "\t");
85 | //
86 | // char *fnout = malloc(strlen(id)+5);
87 | // fnout = strcat(fnout, id);
88 | // fnout = strcat(fnout, ".seq");
89 | //
90 | // fpout = fopen(fnout, "w");
91 | // setvbuf ( fpout , NULL , _IONBF , 0 );
92 | // fprintf(fpout, "%s", token);
93 | // fclose(fpout);
94 | }
95 |
96 | fclose(fp);
97 |
98 | return c;
99 |
100 | }
101 |
102 | /*** R
103 | test_setvbuf("text-write.txt")
104 | test_setvbuf2("text-write.txt")
105 | */
106 |
--------------------------------------------------------------------------------
/tmp-tests/test-setvbuf2.cpp:
--------------------------------------------------------------------------------
1 | #include <Rcpp.h>
2 | using namespace Rcpp;
3 |
4 | #include <cstdio>
5 | #include <cstdlib>
6 | #include <cstring>
7 | #include <string>
8 |
9 | #define BUFLEN (64 * 1024)
10 |
11 | // [[Rcpp::export]]
12 | int test_setvbuf3(std::string filename,
13 | std::string filename2,
14 | int size = 100) {
15 |
16 | FILE *fp_in = fopen(filename.c_str(), "rb");
17 | FILE *fp_out = fopen(filename2.c_str(), "wb");
18 |
19 | unsigned sizem1 = size - 1;
20 | int last = size - 2;
21 |
22 | char * line = new char[size];
23 | char * temp;
24 | // char *id;
25 | // char *token;
26 | // char *buf = (char*)malloc(BUFLEN);
27 | int c = 0;
28 |
29 | setvbuf ( fp_in , NULL , _IOLBF, BUFLEN );
30 | setvbuf ( fp_out , NULL , _IOFBF, BUFLEN );
31 |
32 |
33 | while (fgets(line, size, fp_in) != NULL) {
34 |
35 | fputs(line, fp_out);
36 |
37 | // Rcout << strlen(line) << std::endl;
38 | if (strlen(line) < sizem1) {
39 | c++;
40 | // if (c % 1000 == 1) fflush(fp_out);
41 | } else {
42 | // Rcout << (line[last] == '\n') << std::endl;
43 | if (line[last] == '\n') c++;
44 | size *= 2;
45 | temp = new char[size];
46 | delete [] line;
47 | line = temp;
48 | sizem1 = size - 1;
49 | last = size - 2;
50 | }
51 |
52 | // id = strtok(line, "\t");
53 | // token = strtok(NULL, "\t");
54 | //
55 | // char *fnout = malloc(strlen(id)+5);
56 | // fnout = strcat(fnout, id);
57 | // fnout = strcat(fnout, ".seq");
58 | //
59 | // fpout = fopen(fnout, "w");
60 | // setvbuf ( fpout , NULL , _IONBF , 0 );
61 | // fprintf(fpout, "%s", token);
62 | // fclose(fpout);
63 | }
64 |
65 | fclose(fp_in);
66 | fflush(fp_out);
67 | fclose(fp_out);
68 |
69 | return c;
70 | }
71 |
72 | /*** R
73 | test_setvbuf3("text-write.txt", "text-write2.txt")
74 | */
75 |
--------------------------------------------------------------------------------
/tmp-tests/test-setvbuf3.cpp:
--------------------------------------------------------------------------------
1 | #include <Rcpp.h>
2 |
3 | #define BUFLEN (64 * 1024)
4 |
5 | // [[Rcpp::export]]
6 | int test_setvbuf4(std::string filename, std::string filename2) {
7 |
8 | FILE *fp_in = fopen(filename.c_str(), "rb");
9 | FILE *fp_out = fopen(filename2.c_str(), "wb");
10 |
11 | size_t line_size;
12 | size_t size = 100;
13 | size_t last = size - 2;
14 |
15 | char *line = new char[size];
16 | char *temp;
17 | int c = 0;
18 | bool not_eol;
19 |
20 | setvbuf(fp_in, NULL, _IOLBF, BUFLEN);
21 | setvbuf(fp_out, NULL, _IOFBF, BUFLEN);
22 |
23 | while (fgets(line, size, fp_in) != NULL) {
24 |
25 | line_size = strlen(line);
26 |
27 | fputs(line, fp_out);
28 |
29 | if (line_size > last) {
30 |
31 | not_eol = (line[last] != '\n');
32 |
33 | size *= 2;
34 | temp = new char[size];
35 | delete [] line;
36 | line = temp;
37 | last = size - 2;
38 |
39 | if (not_eol) continue;
40 | }
41 |
42 | // End of line
43 | c++;
44 |
45 | }
46 |
47 | fclose(fp_in);
48 | fflush(fp_out);
49 | fclose(fp_out);
50 |
51 | return c;
52 | }
53 |
54 | /*** R
55 | test_setvbuf4("text-write.txt", "text-write2.txt")
56 | */
57 |
--------------------------------------------------------------------------------
/tmp-tests/test-setvbuf4.cpp:
--------------------------------------------------------------------------------
1 | #include <Rcpp.h>
2 | using namespace Rcpp;
3 |
4 | #define BUFLEN (64 * 1024)
5 |
6 | // [[Rcpp::export]]
7 | int test_setvbuf5(std::string filename, std::string filename2) {
8 |
9 | FILE *fp_in = fopen(filename.c_str(), "rb"), *fp_out;
10 | setvbuf(fp_in, NULL, _IOLBF, BUFLEN);
11 |
12 | const char *fn_out = filename2.c_str();
13 | char name_out[strlen(fn_out) + 20];
14 |
15 | size_t line_size;
16 | size_t size = 100;
17 | size_t last = size - 2;
18 |
19 | char *line = new char[size];
20 | char *temp;
21 | int c = 0;
22 | bool not_eol;
23 |
24 | sprintf(name_out, "%s%d.txt", fn_out, c);
25 | fp_out = fopen(name_out, "wb");
26 | setvbuf(fp_out, NULL, _IOFBF, BUFLEN);
27 |
28 | while (fgets(line, size, fp_in) != NULL) {
29 |
30 | line_size = strlen(line);
31 |
32 | fputs(line, fp_out);
33 |
34 | if (line_size > last) {
35 |
36 | not_eol = (line[last] != '\n');
37 |
38 | fflush(fp_out);
39 | size *= 2;
40 | temp = new char[size];
41 | delete [] line;
42 | line = temp;
43 | last = size - 2;
44 |
45 | if (not_eol) continue;
46 | }
47 |
48 | // End of line
49 | c++;
50 | fflush(fp_out);
51 | fclose(fp_out);
52 | sprintf(name_out, "%s%d.txt", fn_out, c);
53 | fp_out = fopen(name_out, "wb");
54 | setvbuf(fp_out, NULL, _IOFBF, BUFLEN);
55 |
56 | }
57 |
58 | fflush(fp_out);
59 | fclose(fp_out); // last one has nothing inside
60 | fclose(fp_in);
61 |
62 | return c;
63 | }
64 |
65 | /*** R
66 | test_setvbuf5("text-write.txt", "tmp/text-write-part")
67 | readLines("text-write.txt")[[6]]
68 | readLines("tmp/text-write-part5.txt")
69 | */
70 |
--------------------------------------------------------------------------------
/tmp-tests/test-setvbuf5.cpp:
--------------------------------------------------------------------------------
1 | #include <Rcpp.h>
2 | using namespace Rcpp;
3 |
4 | #define BUFLEN (64 * 1024)
5 |
6 | // [[Rcpp::export]]
7 | NumericVector test_setvbuf6(std::string filename,
8 | std::string filename2,
9 | int every_nlines) {
10 |
11 | FILE *fp_in = fopen(filename.c_str(), "rb"), *fp_out;
12 | setvbuf(fp_in, NULL, _IOLBF, BUFLEN);
13 |
14 | const char *fn_out = filename2.c_str();
15 | char name_out[strlen(fn_out) + 20];
16 |
17 | size_t line_size;
18 | size_t size = 100;
19 | size_t last = size - 2;
20 |
21 | char *line = new char[size];
22 | char *temp;
23 |
24 | bool not_eol, not_eof = true;
25 | int i, k = 0, c = 0;
26 |
27 |
28 | while (not_eof) {
29 |
30 | // Open file number 'k'
31 | sprintf(name_out, "%s%d.txt", fn_out, ++k);
32 | fp_out = fopen(name_out, "wb");
33 | setvbuf(fp_out, NULL, _IOFBF, BUFLEN);
34 |
35 | // Fill it with 'every_nlines' lines
36 | i = 0;
37 | while (i < every_nlines) {
38 |
39 | if (fgets(line, size, fp_in) == NULL) {
40 | not_eof = false;
41 | break;
42 | }
43 |
44 | line_size = strlen(line);
45 |
46 | fputs(line, fp_out);
47 |
48 | if (line_size > last) {
49 |
50 | not_eol = (line[last] != '\n');
51 |
52 | fflush(fp_out);
53 | size *= 2;
54 | temp = new char[size];
55 | delete [] line;
56 | line = temp;
57 | last = size - 2;
58 |
59 | if (not_eol) continue;
60 | }
61 |
62 | // End of line
63 | i++;
64 |
65 | }
66 |
67 | c += i;
68 |
69 | // Close file number 'k'
70 | fflush(fp_out);
71 | fclose(fp_out);
72 |
73 | }
74 |
75 | fclose(fp_in);
76 |
77 | return NumericVector::create(_["K"] = k, _["every"] = every_nlines, _["N"] = c);
78 | }
79 |
80 | /*** R
81 | test_setvbuf6("text-write.txt", "tmp2/text-write-part", 2)
82 | readLines("text-write.txt")[[7]]
83 | readLines("tmp2/text-write-part4.txt")
84 | */
85 |
--------------------------------------------------------------------------------
/tmp-tests/test-setvbuf6.cpp:
--------------------------------------------------------------------------------
1 | // [[Rcpp::depends(BH, bigstatsr)]]
2 | #include <bigstatsr/BMAcc.h>
3 |
4 | #define BUFLEN (64 * 1024)
5 |
6 | // [[Rcpp::export]]
7 | NumericVector test_setvbuf7(std::string filename,
8 | std::string filename2,
9 | int every_nlines,
10 | Environment parts_) {
11 |
12 | XPtr<FBM> xptr = parts_["address"];
13 | BMAcc<int> parts(xptr);
14 |
15 | FILE *fp_in = fopen(filename.c_str(), "rb"), *fp_out;
16 | setvbuf(fp_in, NULL, _IOLBF, BUFLEN);
17 |
18 | const char *fn_out = filename2.c_str();
19 | char name_out[strlen(fn_out) + 20];
20 |
21 | size_t line_size;
22 | size_t size = 100;
23 | size_t last = size - 2;
24 |
25 | char *line = new char[size];
26 | char *temp;
27 |
28 | bool not_eol, not_eof = true;
29 | int i, k = 0, c = 0;
30 |
31 |
32 | while (not_eof) {
33 |
34 | // Open file number 'k'
35 | sprintf(name_out, "%s%d.txt", fn_out, ++k);
36 | fp_out = fopen(name_out, "wb");
37 | setvbuf(fp_out, NULL, _IOFBF, BUFLEN);
38 |
39 | // Fill it with 'every_nlines' lines
40 | i = 0;
41 | while (i < every_nlines) {
42 |
43 | if (fgets(line, size, fp_in) == NULL) {
44 | not_eof = false;
45 | break;
46 | }
47 |
48 | line_size = strlen(line);
49 |
50 | fputs(line, fp_out);
51 |
52 | if (line_size > last) {
53 |
54 | not_eol = (line[last] != '\n');
55 |
56 | fflush(fp_out);
57 | size *= 2;
58 | temp = new char[size];
59 | delete [] line;
60 | line = temp;
61 | last = size - 2;
62 |
63 | if (not_eol) continue;
64 | }
65 |
66 | // End of line
67 | i++;
68 |
69 | }
70 |
71 | c += i;
72 |
73 | // Close file number 'k'
74 | fflush(fp_out);
75 | fclose(fp_out);
76 | parts(k - 1, 0) = 1;
77 | Rcout << k << std::endl;
78 |
79 | }
80 |
81 | fclose(fp_in);
82 |
83 | return NumericVector::create(_["K"] = k, _["every"] = every_nlines, _["N"] = c);
84 | }
85 |
86 |
--------------------------------------------------------------------------------
/tmp-tests/test-string.cpp:
--------------------------------------------------------------------------------
1 | #include <Rcpp.h>
2 | using namespace Rcpp;
3 |
4 | #define BUFLEN (64 * 1024)
5 |
6 | // [[Rcpp::export]]
7 | void test_string(std::string filename) {
8 |
9 | const char *fn = filename.c_str();
10 | char name_out[strlen(fn) + 20];
11 |
12 | for (int k = 1; k < 10; k++) {
13 | sprintf(name_out, "%s%d.txt", fn, k);
14 | Rcout << filename << std::endl;
15 | Rcout << name_out << std::endl;
16 | }
17 | }
18 |
19 | /*** R
20 | test_string(tempfile())
21 | */
22 |
--------------------------------------------------------------------------------
/tmp-tests/text-write.txt:
--------------------------------------------------------------------------------
1 | a 199 23.45
2 | b 1e+8
3 | c 23339999
4 | errrrr
5 | dde
6 | mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmeeeeeeeeeeeeeeeeeeeeeeeelllllllllllllllllllllllllddddddddddddddddddddddddddddddddddddddddeeeeeeeeeeeeeeeeeeeeeeeeeeeggggggggggggggggggggggaaacvdgbfetgdfghmethdpfa
7 | mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmeeeeeeeeeeeeeeeeeeeeeeeelllllllllllllllllllllllllddddddddddddddddddddddddddddddddddddddddeeeeeeeeeeeeeeeeeeeeeeeeeeeggggggggggggggggggggggaaacvdgbfetgdfghmethdpf
8 |
--------------------------------------------------------------------------------
/tmp-tests/text-write2.txt:
--------------------------------------------------------------------------------
1 | a 199 23.45
2 | b 1e+8
3 | c 23339999
4 | errrrr
5 | dde
6 | mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmeeeeeeeeeeeeeeeeeeeeeeeelllllllllllllllllllllllllddddddddddddddddddddddddddddddddddddddddeeeeeeeeeeeeeeeeeeeeeeeeeeeggggggggggggggggggggggaaacvdgbfetgdfghmethdpfa
7 | mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmeeeeeeeeeeeeeeeeeeeeeeeelllllllllllllllllllllllllddddddddddddddddddddddddddddddddddddddddeeeeeeeeeeeeeeeeeeeeeeeeeeeggggggggggggggggggggggaaacvdgbfetgdfghmethdpf
8 |
--------------------------------------------------------------------------------
/vignettes/csv2sqlite.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Convert a CSV to SQLite by parts"
3 | author: "Florian Privé"
4 | date: "August 26, 2018"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, fig.align = "center", eval = FALSE)
10 | ```
11 |
12 | ## How
13 |
14 | You can easily use the {bigreadr} package to convert a CSV file to an SQLite database without loading the whole CSV into memory.
15 |
16 | You can use the following function:
17 |
18 | ```{r}
19 | csv2sqlite <- function(csv,
20 | every_nlines,
21 | table_name,
22 | dbname = sub("\\.csv$", ".sqlite", csv),
23 | ...) {
24 |
25 | # Prepare reading
26 | con <- RSQLite::dbConnect(RSQLite::SQLite(), dbname)
27 | init <- TRUE
28 | fill_sqlite <- function(df) {
29 |
30 | if (init) {
31 | RSQLite::dbCreateTable(con, table_name, df)
32 | init <<- FALSE
33 | }
34 |
35 | RSQLite::dbAppendTable(con, table_name, df)
36 | NULL
37 | }
38 |
39 | # Read and fill by parts
40 | bigreadr::big_fread1(csv, every_nlines,
41 | .transform = fill_sqlite,
42 | .combine = unlist,
43 | ... = ...)
44 |
45 | # Returns
46 | con
47 | }
48 | ```
49 |
50 | Function `bigreadr::big_fread1()` first splits the CSV into smaller CSV files, then reads these files as data frames and transforms them, and finally combines the results.
51 |
52 | Here, the transformation just appends each data frame to the SQLite database (creating the table the first time), and it returns `NULL` so that nothing is accumulated in memory. A minimal sketch of `big_fread1()` on its own is shown at the end of this vignette.
53 |
54 | ## Use case
55 |
56 | For example, with this function, I was able to convert a [CSV file of 9 GB](https://www.data.gouv.fr/fr/datasets/base-sirene-des-entreprises-et-de-leurs-etablissements-siren-siret/) in 40 minutes using less than 2 GB of memory.
57 |
58 | ```{r}
59 | con <- csv2sqlite(csv, every_nlines = 1e6, table_name = "sirene",
60 | encoding = "Latin-1")
61 | ```
62 |
63 |
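64 | ## Minimal sketch of `big_fread1()`
65 |
66 | As a toy, self-contained illustration of reading by parts (independent of the SQLite example above; the temporary file and the chunk size below are arbitrary choices for this sketch), you can write `iris` to a CSV and read it back 50 lines at a time, keeping only the number of rows of each part:
67 |
68 | ```{r}
69 | library(bigreadr)
70 | tmp_csv <- tempfile(fileext = ".csv")
71 | fwrite2(iris, tmp_csv)
72 | # Read back by parts of 50 lines; each part is transformed into its row count
73 | nrow_parts <- big_fread1(tmp_csv, every_nlines = 50,
74 |                          .transform = nrow, .combine = unlist)
75 | nrow_parts
76 | sum(nrow_parts)  # total number of rows read; should equal nrow(iris)
77 | ```
78 |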
--------------------------------------------------------------------------------