├── .Rbuildignore ├── .covrignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ ├── pr-commands.yaml │ └── test-coverage.yaml ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── callback.R ├── cpp11.R ├── date-symbols.R ├── example.R ├── locale.R ├── melt_delim.R ├── melt_delim_chunked.R ├── melt_fwf.R ├── melt_table.R ├── meltr-package.R ├── problems.R ├── source.R ├── sysdata.rda ├── tokenizer.R └── utils.R ├── README.Rmd ├── README.md ├── codecov.yml ├── cran-comments.md ├── data-raw └── date-symbols.R ├── inst └── extdata │ ├── epa78.txt │ ├── fwf-sample.txt │ ├── massey-rating.txt │ └── mtcars.csv ├── man ├── Tokenizers.Rd ├── callback.Rd ├── clipboard.Rd ├── datasource.Rd ├── date_names.Rd ├── locale.Rd ├── melt_delim.Rd ├── melt_delim_chunked.Rd ├── melt_fwf.Rd ├── melt_table.Rd ├── meltr_example.Rd ├── problems.Rd └── show_progress.Rd ├── src ├── .gitignore ├── Collector.cpp ├── Collector.h ├── CollectorGuess.cpp ├── DateTimeParser.h ├── Iconv.cpp ├── Iconv.h ├── LocaleInfo.cpp ├── LocaleInfo.h ├── Progress.h ├── QiParsers.h ├── Reader.cpp ├── Reader.h ├── Source.cpp ├── Source.h ├── SourceFile.h ├── SourceRaw.h ├── SourceString.h ├── Token.h ├── Tokenizer.cpp ├── Tokenizer.h ├── TokenizerDelim.cpp ├── TokenizerDelim.h ├── TokenizerFwf.cpp ├── TokenizerFwf.h ├── TokenizerWs.cpp ├── TokenizerWs.h ├── Warnings.h ├── connection.cpp ├── cpp11.cpp ├── mio.h ├── read.cpp ├── unicode_fopen.h └── utils.h └── tests ├── testthat.R └── testthat ├── basic-df-singlequote.csv ├── basic-df.csv ├── empty-file ├── enc-iso-8859-1.txt ├── fwf-trailing.txt ├── helper.R ├── non-tabular.csv ├── raw.csv ├── table-crash ├── test-melt-chunked.R ├── test-melt-csv.R ├── test-melt-fwf.R └── test-melt-table.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^LICENSE\.md$ 2 | ^data-raw$ 3 | ^codecov\.yml$ 4 | ^\.github$ 5 | ^README\.Rmd$ 6 | ^\.covrignore$ 7 | ^cran-comments\.md$ 8 | ^CRAN-SUBMISSION$ 9 | -------------------------------------------------------------------------------- /.covrignore: -------------------------------------------------------------------------------- 1 | src/mio.h 2 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | # 4 | # NOTE: This workflow is overkill for most R packages and 5 | # check-standard.yaml is likely a better choice. 6 | # usethis::use_github_action("check-standard") will install it. 
7 | on: 8 | push: 9 | branches: [main, master] 10 | pull_request: 11 | branches: [main, master] 12 | 13 | name: R-CMD-check 14 | 15 | jobs: 16 | R-CMD-check: 17 | runs-on: ${{ matrix.config.os }} 18 | 19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | config: 25 | - {os: macOS-latest, r: 'release'} 26 | 27 | - {os: windows-latest, r: 'release'} 28 | # Use 3.6 to trigger usage of RTools35 29 | - {os: windows-latest, r: '3.6'} 30 | 31 | # Use older ubuntu to maximise backward compatibility 32 | - {os: ubuntu-18.04, r: 'devel', http-user-agent: 'release'} 33 | - {os: ubuntu-18.04, r: 'release'} 34 | - {os: ubuntu-18.04, r: 'oldrel-1'} 35 | - {os: ubuntu-18.04, r: 'oldrel-2'} 36 | - {os: ubuntu-18.04, r: 'oldrel-3'} 37 | - {os: ubuntu-18.04, r: 'oldrel-4'} 38 | 39 | env: 40 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 41 | R_KEEP_PKG_SOURCE: yes 42 | 43 | steps: 44 | - uses: actions/checkout@v3 45 | 46 | - uses: r-lib/actions/setup-pandoc@v2 47 | 48 | - uses: r-lib/actions/setup-r@v2 49 | with: 50 | r-version: ${{ matrix.config.r }} 51 | http-user-agent: ${{ matrix.config.http-user-agent }} 52 | use-public-rspm: true 53 | 54 | - uses: r-lib/actions/setup-r-dependencies@v2 55 | with: 56 | extra-packages: rcmdcheck 57 | 58 | - uses: r-lib/actions/check-r-package@v2 59 | 60 | - name: Show testthat output 61 | if: always() 62 | run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true 63 | shell: bash 64 | 65 | - name: Upload check results 66 | if: failure() 67 | uses: actions/upload-artifact@main 68 | with: 69 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 70 | path: check 71 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | tags: ['*'] 7 | 8 | name: pkgdown 9 | 10 | jobs: 11 | pkgdown: 12 | runs-on: ubuntu-latest 13 | env: 14 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | - uses: r-lib/actions/setup-pandoc@v1 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v1 25 | with: 26 | extra-packages: pkgdown 27 | needs: website 28 | 29 | - name: Deploy package 30 | run: | 31 | git config --local user.name "$GITHUB_ACTOR" 32 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 33 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' 34 | -------------------------------------------------------------------------------- /.github/workflows/pr-commands.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | issue_comment: 5 | types: [created] 6 | 7 | name: Commands 8 | 9 | jobs: 10 | document: 11 | if: startsWith(github.event.comment.body, '/document') 12 | name: document 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | steps: 17 | - uses: actions/checkout@v3 18 | 19 | - uses: r-lib/actions/pr-fetch@v2 20 | with: 21 | repo-token: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - uses: r-lib/actions/setup-r@v2 24 | with: 25 | use-public-rspm: true 26 | 27 | - uses: r-lib/actions/setup-r-dependencies@v2 28 | with: 29 | extra-packages: roxygen2 30 | 31 | - name: Document 32 | run: Rscript -e 'roxygen2::roxygenise()' 33 | 34 | - name: commit 35 | run: | 36 | git config --local user.name "$GITHUB_ACTOR" 37 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 38 | git add man/\* NAMESPACE 39 | git commit -m 'Document' 40 | 41 | - uses: r-lib/actions/pr-push@v2 42 | with: 43 | repo-token: ${{ secrets.GITHUB_TOKEN }} 44 | 45 | style: 46 | if: startsWith(github.event.comment.body, '/style') 47 | name: style 48 | runs-on: ubuntu-latest 49 | env: 50 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 51 | steps: 52 | - uses: actions/checkout@v3 53 | 54 | - uses: r-lib/actions/pr-fetch@v2 55 | with: 56 | repo-token: ${{ secrets.GITHUB_TOKEN }} 57 | 58 | - uses: r-lib/actions/setup-r@v2 59 | 60 | - name: Install dependencies 61 | run: Rscript -e 'install.packages("styler")' 62 | 63 | - name: Style 64 | run: Rscript -e 'styler::style_pkg()' 65 | 66 | - name: commit 67 | run: | 68 | git config --local user.name "$GITHUB_ACTOR" 69 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 70 | git add \*.R 71 | git commit -m 'Style' 72 | 73 | - uses: r-lib/actions/pr-push@v2 74 | with: 75 | repo-token: ${{ secrets.GITHUB_TOKEN }} 76 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: covr 27 | 28 | - name: Test coverage 29 | run: covr::codecov() 30 | shell: Rscript {0} 31 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: meltr 2 | Title: Read Non-Rectangular Text Data 3 | Version: 1.0.2 4 | Authors@R: 5 | c(person(given = "Hadley", 6 | family = "Wickham", 7 | role = "aut", 8 | email = "hadley@rstudio.com"), 9 | person(given = "Duncan", 10 | family = "Garmonsway", 11 | role = c("aut", "cre"), 12 | email = "nacnudus@gmail.com", 13 | comment = "@nacnudus"), 14 | person(given = "Jim", 15 | family = "Hester", 16 | role = "aut", 17 | email = "jim.hester@rstudio.com", 18 | comment = c(ORCID = "0000-0002-2739-7082")), 19 | person(given = "RStudio", 20 | role = c("cph", "fnd")), 21 | person(given = "https://github.com/mandreyel/", 22 | role = "cph", 23 | comment = "mio library")) 24 | Description: The goal of 'meltr' is to provide a fast and friendly way to 25 | read non-rectangular data, such as ragged forms of csv (comma-separated 26 | values), tsv (tab-separated values), and fwf (fixed-width format) files. 27 | License: MIT + file LICENSE 28 | URL: https://r-lib.github.io/meltr/, 29 | https://github.com/r-lib/meltr 30 | BugReports: https://github.com/r-lib/meltr/issues 31 | Depends: 32 | R (>= 2.10) 33 | Imports: 34 | cli, 35 | methods, 36 | R6, 37 | rlang, 38 | tibble 39 | Suggests: 40 | clipr, 41 | covr, 42 | crayon, 43 | curl, 44 | readr, 45 | testthat (>= 3.0.0), 46 | withr 47 | LinkingTo: 48 | cpp11 49 | Config/testthat/edition: 3 50 | Config/Needs/website: dplyr 51 | Encoding: UTF-8 52 | LazyData: true 53 | Roxygen: list(markdown = TRUE) 54 | RoxygenNote: 7.2.1 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2021 2 | COPYRIGHT HOLDER: meltr authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2021 meltr authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method("[",meltr_spec_tbl_df) 4 | S3method(as.data.frame,meltr_spec_tbl_df) 5 | S3method(as_tibble,meltr_spec_tbl_df) 6 | S3method(print,date_names) 7 | S3method(print,locale) 8 | export(AccumulateCallback) 9 | export(ChunkCallback) 10 | export(DataFrameCallback) 11 | export(ListCallback) 12 | export(SideEffectChunkCallback) 13 | export(clipboard) 14 | export(datasource) 15 | export(date_names) 16 | export(date_names_lang) 17 | export(date_names_langs) 18 | export(default_locale) 19 | export(fwf_cols) 20 | export(fwf_empty) 21 | export(fwf_positions) 22 | export(fwf_widths) 23 | export(locale) 24 | export(melt_csv) 25 | export(melt_csv2) 26 | export(melt_csv2_chunked) 27 | export(melt_csv_chunked) 28 | export(melt_delim) 29 | export(melt_delim_chunked) 30 | export(melt_fwf) 31 | export(melt_table) 32 | export(melt_table2) 33 | export(melt_tsv) 34 | export(melt_tsv_chunked) 35 | export(meltr_example) 36 | export(problems) 37 | export(show_progress) 38 | export(stop_for_problems) 39 | export(tokenizer_csv) 40 | export(tokenizer_delim) 41 | export(tokenizer_fwf) 42 | export(tokenizer_line) 43 | export(tokenizer_log) 44 | export(tokenizer_tsv) 45 | export(tokenizer_ws) 46 | importFrom(methods,setOldClass) 47 | importFrom(tibble,as_tibble) 48 | importFrom(tibble,tibble) 49 | useDynLib(meltr, .registration = TRUE) 50 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # meltr 1.0.2 2 | 3 | * Fix CRAN warnings 4 | 5 | # meltr 1.0.1 6 | 7 | * Fix buffer overflow when trying to parse a field that is over 64 characters long (#10) 8 | 9 | # meltr 1.0.0 10 | 11 | * Added a `NEWS.md` file to track changes to the package. 12 | -------------------------------------------------------------------------------- /R/callback.R: -------------------------------------------------------------------------------- 1 | as_chunk_callback <- function(x) UseMethod("as_chunk_callback") 2 | as_chunk_callback.function <- function(x) { 3 | SideEffectChunkCallback$new(x) 4 | } 5 | as_chunk_callback.R6ClassGenerator <- function(x) { 6 | as_chunk_callback(x$new()) 7 | } 8 | as_chunk_callback.ChunkCallback <- function(x) { 9 | x 10 | } 11 | 12 | #' Callback classes 13 | #' 14 | #' These classes are used to define callback behaviors. 15 | #' 16 | #' \describe{ 17 | #' \item{ChunkCallback}{Callback interface definition, all callback functions should inherit from this class.} 18 | #' \item{SideEffectChunkCallback}{Callback function that is used only for side effects, no results are returned.} 19 | #' \item{DataFrameCallback}{Callback function that combines each result together at the end.} 20 | #' \item{AccumulateCallBack}{ 21 | #' Callback function that accumulates a single result. 
Requires the parameter `acc` to specify 22 | #' the initial value of the accumulator. The parameter `acc` is `NULL` by default. 23 | #' } 24 | #' } 25 | #' @usage NULL 26 | #' @format NULL 27 | #' @name callback 28 | #' @keywords internal 29 | #' @family chunked 30 | #' @export 31 | ChunkCallback <- R6::R6Class("ChunkCallback", 32 | private = list( 33 | callback = NULL 34 | ), 35 | public = list( 36 | initialize = function(callback) NULL, 37 | receive = function(data, index) NULL, 38 | continue = function() TRUE, 39 | result = function() NULL, 40 | finally = function() NULL 41 | ) 42 | ) 43 | 44 | #' @usage NULL 45 | #' @format NULL 46 | #' @rdname callback 47 | #' @export 48 | SideEffectChunkCallback <- R6::R6Class("SideEffectChunkCallback", 49 | inherit = ChunkCallback, 50 | private = list( 51 | cancel = FALSE 52 | ), 53 | public = list( 54 | initialize = function(callback) { 55 | check_callback_fun(callback) 56 | private$callback <- callback 57 | }, 58 | receive = function(data, index) { 59 | result <- private$callback(data, index) 60 | private$cancel <- identical(result, FALSE) 61 | }, 62 | continue = function() { 63 | !private$cancel 64 | } 65 | ) 66 | ) 67 | 68 | #' @usage NULL 69 | #' @format NULL 70 | #' @rdname callback 71 | #' @export 72 | DataFrameCallback <- R6::R6Class("DataFrameCallback", 73 | inherit = ChunkCallback, 74 | private = list( 75 | results = list() 76 | ), 77 | public = list( 78 | initialize = function(callback) { 79 | private$callback <- callback 80 | }, 81 | receive = function(data, index) { 82 | result <- private$callback(data, index) 83 | private$results <- c(private$results, list(result)) 84 | }, 85 | result = function() { 86 | do.call(`rbind`, private$results) 87 | }, 88 | finally = function() { 89 | private$results <- list() 90 | } 91 | ) 92 | ) 93 | 94 | #' @usage NULL 95 | #' @format NULL 96 | #' @rdname callback 97 | #' @export 98 | ListCallback <- R6::R6Class("ListCallback", 99 | inherit = ChunkCallback, 100 | private = list( 101 | results = list() 102 | ), 103 | public = list( 104 | initialize = function(callback) { 105 | private$callback <- callback 106 | }, 107 | receive = function(data, index) { 108 | result <- private$callback(data, index) 109 | private$results <- c(private$results, list(result)) 110 | }, 111 | result = function() { 112 | private$results 113 | }, 114 | finally = function() { 115 | private$results <- list() 116 | } 117 | ) 118 | ) 119 | 120 | #' @usage NULL 121 | #' @format NULL 122 | #' @rdname callback 123 | #' @export 124 | AccumulateCallback <- R6::R6Class("AccumulateCallback", 125 | inherit = ChunkCallback, 126 | private = list( 127 | acc = NULL 128 | ), 129 | public = list( 130 | initialize = function(callback, acc = NULL) { 131 | check_callback_fun(callback, 132 | req_args = 3, 133 | message = "`callback` must have three or more arguments" 134 | ) 135 | private$acc <- acc 136 | private$callback <- callback 137 | }, 138 | receive = function(data, index) { 139 | private$acc <- private$callback(data, index, private$acc) 140 | }, 141 | result = function() { 142 | private$acc 143 | } 144 | ) 145 | ) 146 | 147 | check_callback_fun <- function(callback, req_args = 2, message = NULL) { 148 | if (is.null(message)) { 149 | message <- "`callback` must have two or more arguments" 150 | } 151 | n_args <- length(formals(callback)) 152 | if (n_args < req_args) { 153 | stop(message, call. 
= FALSE) 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /R/cpp11.R: -------------------------------------------------------------------------------- 1 | # Generated by cpp11: do not edit by hand 2 | 3 | collectorGuess <- function(input, locale_, guessInteger) { 4 | .Call(`_meltr_collectorGuess`, input, locale_, guessInteger) 5 | } 6 | 7 | read_connection_ <- function(con, filename, chunk_size) { 8 | .Call(`_meltr_read_connection_`, con, filename, chunk_size) 9 | } 10 | 11 | read_file_ <- function(sourceSpec, locale_) { 12 | .Call(`_meltr_read_file_`, sourceSpec, locale_) 13 | } 14 | 15 | read_file_raw_ <- function(sourceSpec) { 16 | .Call(`_meltr_read_file_raw_`, sourceSpec) 17 | } 18 | 19 | melt_tokens_ <- function(sourceSpec, tokenizerSpec, colSpecs, locale_, n_max, progress) { 20 | .Call(`_meltr_melt_tokens_`, sourceSpec, tokenizerSpec, colSpecs, locale_, n_max, progress) 21 | } 22 | 23 | melt_tokens_chunked_ <- function(sourceSpec, callback, chunkSize, tokenizerSpec, colSpecs, locale_, progress) { 24 | invisible(.Call(`_meltr_melt_tokens_chunked_`, sourceSpec, callback, chunkSize, tokenizerSpec, colSpecs, locale_, progress)) 25 | } 26 | 27 | whitespaceColumns <- function(sourceSpec, n, comment) { 28 | .Call(`_meltr_whitespaceColumns`, sourceSpec, n, comment) 29 | } 30 | -------------------------------------------------------------------------------- /R/date-symbols.R: -------------------------------------------------------------------------------- 1 | #' Create or retrieve date names 2 | #' 3 | #' When parsing dates, you often need to know how weekdays of the week and 4 | #' months are represented as text. This pair of functions allows you to either 5 | #' create your own, or retrieve from a standard list. The standard list is 6 | #' derived from ICU () via the stringi package. 7 | #' 8 | #' @param mon,mon_ab Full and abbreviated month names. 9 | #' @param day,day_ab Full and abbreviated week day names. Starts with Sunday. 10 | #' @param am_pm Names used for AM and PM. 11 | #' @return A date names object 12 | #' @export 13 | #' @examples 14 | #' date_names(mon = LETTERS[1:12], day = letters[1:7]) 15 | #' date_names_lang("en") 16 | #' date_names_lang("ko") 17 | #' date_names_lang("fr") 18 | date_names <- function(mon, mon_ab = mon, day, day_ab = day, 19 | am_pm = c("AM", "PM")) { 20 | stopifnot(is.character(mon), length(mon) == 12) 21 | stopifnot(is.character(mon_ab), length(mon_ab) == 12) 22 | stopifnot(is.character(day), length(day) == 7) 23 | stopifnot(is.character(day_ab), length(day_ab) == 7) 24 | 25 | structure( 26 | list( 27 | mon = enc2utf8(mon), 28 | mon_ab = enc2utf8(mon_ab), 29 | day = enc2utf8(day), 30 | day_ab = enc2utf8(day_ab), 31 | am_pm = enc2utf8(am_pm) 32 | ), 33 | class = "date_names" 34 | ) 35 | } 36 | 37 | #' @export 38 | #' @rdname date_names 39 | #' @param language A BCP 47 locale, made up of a language and a region, 40 | #' e.g. `"en_US"` for American English. See `date_names_langs()` 41 | #' for a complete list of available locales. 42 | date_names_lang <- function(language) { 43 | stopifnot(is.character(language), length(language) == 1) 44 | 45 | symbols <- date_symbols[[language]] 46 | if (is.null(symbols)) { 47 | stop("Unknown language '", language, "'", call. = FALSE) 48 | } 49 | 50 | symbols 51 | } 52 | 53 | #' @export 54 | #' @rdname date_names 55 | date_names_langs <- function() { 56 | names(date_symbols) 57 | } 58 | 59 | #' @export 60 | print.date_names <- function(x, ...) 
{ 61 | cat("\n") 62 | 63 | if (identical(x$day, x$day_ab)) { 64 | day <- paste0(x$day, collapse = ", ") 65 | } else { 66 | day <- paste0(x$day, " (", x$day_ab, ")", collapse = ", ") 67 | } 68 | 69 | if (identical(x$mon, x$mon_ab)) { 70 | mon <- paste0(x$mon, collapse = ", ") 71 | } else { 72 | mon <- paste0(x$mon, " (", x$mon_ab, ")", collapse = ", ") 73 | } 74 | am_pm <- paste0(x$am_pm, collapse = "/") 75 | 76 | cat_wrap("Days: ", day) 77 | cat_wrap("Months: ", mon) 78 | cat_wrap("AM/PM: ", am_pm) 79 | } 80 | 81 | is.date_names <- function(x) inherits(x, "date_names") 82 | 83 | cat_wrap <- function(header, body) { 84 | body <- strwrap(body, exdent = nchar(header)) 85 | cat(header, paste(body, collapse = "\n"), "\n", sep = "") 86 | } 87 | -------------------------------------------------------------------------------- /R/example.R: -------------------------------------------------------------------------------- 1 | #' Get path to meltr example 2 | #' 3 | #' meltr comes bundled with a number of sample files in its `inst/extdata` 4 | #' directory. This function make them easy to access 5 | #' 6 | #' @param file Name of file. If `NULL`, the example files will be listed. 7 | #' @return A file path or a vector of file names 8 | #' @export 9 | #' @examples 10 | #' meltr_example() 11 | #' meltr_example("mtcars.csv") 12 | meltr_example <- function(file = NULL) { 13 | if (is.null(file)) { 14 | dir(system.file("extdata", package = "meltr")) 15 | } else { 16 | system.file("extdata", file, package = "meltr", mustWork = TRUE) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /R/locale.R: -------------------------------------------------------------------------------- 1 | #' Create locales 2 | #' 3 | #' A locale object tries to capture all the defaults that can vary between 4 | #' countries. You set the locale in once, and the details are automatically 5 | #' passed on down to the columns parsers. The defaults have been chosen to 6 | #' match R (i.e. US English) as closely as possible. See 7 | #' `vignette("locales")` for more details. 8 | #' 9 | #' @param date_names Character representations of day and month names. Either 10 | #' the language code as string (passed on to [date_names_lang()]) 11 | #' or an object created by [date_names()]. 12 | #' @param date_format,time_format Default date and time formats. 13 | #' @param decimal_mark,grouping_mark Symbols used to indicate the decimal 14 | #' place, and to chunk larger numbers. Decimal mark can only be `,` or 15 | #' `.`. 16 | #' @param tz Default tz. This is used both for input (if the time zone isn't 17 | #' present in individual strings), and for output (to control the default 18 | #' display). The default is to use "UTC", a time zone that does not use 19 | #' daylight savings time (DST) and hence is typically most useful for data. 20 | #' The absence of time zones makes it approximately 50x faster to generate 21 | #' UTC times than any other time zone. 22 | #' 23 | #' Use `""` to use the system default time zone, but beware that this 24 | #' will not be reproducible across systems. 25 | #' 26 | #' For a complete list of possible time zones, see [OlsonNames()]. 27 | #' Americans, note that "EST" is a Canadian time zone that does not have 28 | #' DST. It is *not* Eastern Standard Time. It's better to use 29 | #' "US/Eastern", "US/Central" etc. 30 | #' @param encoding Default encoding. This only affects how the file is 31 | #' read - meltr always converts the output to UTF-8. 
32 | #' @return A locale object 33 | #' @export 34 | #' @examples 35 | #' locale() 36 | #' locale("fr") 37 | #' 38 | #' # South American locale 39 | #' locale("es", decimal_mark = ",") 40 | locale <- function(date_names = "en", 41 | date_format = "%AD", time_format = "%AT", 42 | decimal_mark = ".", grouping_mark = ",", 43 | tz = "UTC", encoding = "UTF-8") { 44 | if (is.character(date_names)) { 45 | date_names <- date_names_lang(date_names) 46 | } 47 | stopifnot(is.date_names(date_names)) 48 | 49 | if (missing(grouping_mark) && !missing(decimal_mark)) { 50 | grouping_mark <- if (decimal_mark == ".") "," else "." 51 | } else if (missing(decimal_mark) && !missing(grouping_mark)) { 52 | decimal_mark <- if (grouping_mark == ".") "," else "." 53 | } 54 | 55 | stopifnot(decimal_mark %in% c(".", ",")) 56 | stopifnot(is.character(grouping_mark), length(grouping_mark) == 1) 57 | if (decimal_mark == grouping_mark) { 58 | stop("`decimal_mark` and `grouping_mark` must be different", call. = FALSE) 59 | } 60 | 61 | tz <- check_tz(tz) 62 | check_encoding(encoding) 63 | 64 | structure( 65 | list( 66 | date_names = date_names, 67 | date_format = date_format, 68 | time_format = time_format, 69 | decimal_mark = decimal_mark, 70 | grouping_mark = grouping_mark, 71 | tz = tz, 72 | encoding = encoding 73 | ), 74 | class = "locale" 75 | ) 76 | } 77 | 78 | is.locale <- function(x) inherits(x, "locale") 79 | 80 | #' @export 81 | print.locale <- function(x, ...) { 82 | cat("\n") 83 | cat("Numbers: ", prettyNum(123456.78, 84 | big.mark = x$grouping_mark, 85 | decimal.mark = x$decimal_mark, digits = 8 86 | ), "\n", sep = "") 87 | cat("Formats: ", x$date_format, " / ", x$time_format, "\n", sep = "") 88 | cat("Timezone: ", x$tz, "\n", sep = "") 89 | cat("Encoding: ", x$encoding, "\n", sep = "") 90 | print(x$date_names) 91 | } 92 | 93 | #' @export 94 | #' @rdname locale 95 | default_locale <- function() { 96 | loc <- getOption("readr.default_locale") 97 | if (is.null(loc)) { 98 | loc <- locale() 99 | options("readr.default_locale" = loc) 100 | } 101 | 102 | loc 103 | } 104 | 105 | check_tz <- function(x) { 106 | stopifnot(is.character(x), length(x) == 1) 107 | 108 | if (identical(x, "")) { 109 | x <- Sys.timezone() 110 | 111 | if (identical(x, "") || identical(x, NA_character_)) { 112 | x <- "UTC" 113 | } 114 | } 115 | 116 | x 117 | } 118 | 119 | check_encoding <- function(x) { 120 | stopifnot(is.character(x), length(x) == 1) 121 | 122 | if (tolower(x) %in% tolower(iconvlist())) { 123 | return(TRUE) 124 | } 125 | 126 | stop("Unknown encoding ", x, call. = FALSE) 127 | } 128 | -------------------------------------------------------------------------------- /R/melt_delim.R: -------------------------------------------------------------------------------- 1 | #' Return melted data for each token in a delimited file (including csv & tsv) 2 | #' 3 | #' For certain non-rectangular data formats, it can be useful to parse the data 4 | #' into a melted format where each row represents a single token. 5 | #' 6 | #' `melt_csv()` and `melt_tsv()` are special cases of the general 7 | #' `melt_delim()`. They're useful for reading the most common types of 8 | #' flat file data, comma separated values and tab separated values, 9 | #' respectively. `melt_csv2()` uses `;` for the field separator and `,` for the 10 | #' decimal point. This is common in some European countries. 
11 | #' @inheritParams readr::read_delim 12 | #' @return A [tibble()] of four columns: 13 | #' * `row`, the row that the token comes from in the original file 14 | #' * `col`, the column that the token comes from in the original file 15 | #' * `data_type`, the data type of the token, e.g. `"integer"`, `"character"`, 16 | #' `"date"`, guessed in a similar way to the `guess_parser()` function. 17 | #' * `value`, the token itself as a character string, unchanged from its 18 | #' representation in the original file. 19 | #' 20 | #' If there are parsing problems, a warning tells you 21 | #' how many, and you can retrieve the details with [problems()]. 22 | #' @seealso [readr::read_delim()] for the conventional way to read rectangular data 23 | #' from delimited files. 24 | #' @export 25 | #' @examples 26 | #' # Input sources ------------------------------------------------------------- 27 | #' # Read from a path 28 | #' melt_csv(meltr_example("mtcars.csv")) 29 | #' \dontrun{ 30 | #' melt_csv("https://github.com/tidyverse/readr/raw/master/inst/extdata/mtcars.csv") 31 | #' } 32 | #' 33 | #' # Or directly from a string (must contain a newline) 34 | #' melt_csv("x,y\n1,2\n3,4") 35 | #' 36 | #' # To import empty cells as 'empty' rather than `NA` 37 | #' melt_csv("x,y\n,NA,\"\",''", na = "NA") 38 | #' 39 | #' # File types ---------------------------------------------------------------- 40 | #' melt_csv("a,b\n1.0,2.0") 41 | #' melt_csv2("a;b\n1,0;2,0") 42 | #' melt_tsv("a\tb\n1.0\t2.0") 43 | #' melt_delim("a|b\n1.0|2.0", delim = "|") 44 | #' @export 45 | melt_delim <- function(file, delim, quote = '"', 46 | escape_backslash = FALSE, escape_double = TRUE, 47 | locale = default_locale(), 48 | na = c("", "NA"), quoted_na = TRUE, 49 | comment = "", trim_ws = FALSE, 50 | skip = 0, n_max = Inf, 51 | progress = show_progress(), 52 | skip_empty_rows = FALSE) { 53 | if (!nzchar(delim)) { 54 | stop("`delim` must be at least one character, ", 55 | "use `melt_table()` for whitespace delimited input.", call. = FALSE) 56 | } 57 | tokenizer <- tokenizer_delim(delim, quote = quote, 58 | escape_backslash = escape_backslash, escape_double = escape_double, 59 | na = na, quoted_na = quoted_na, comment = comment, trim_ws = trim_ws, 60 | skip_empty_rows = skip_empty_rows) 61 | melt_delimited(file, tokenizer, locale = locale, skip = skip, 62 | skip_empty_rows = skip_empty_rows, comment = comment, 63 | n_max = n_max, progress = progress) 64 | } 65 | 66 | #' @rdname melt_delim 67 | #' @export 68 | melt_csv <- function(file, locale = default_locale(), na = c("", "NA"), 69 | quoted_na = TRUE, quote = "\"", comment = "", 70 | trim_ws = TRUE, skip = 0, n_max = Inf, 71 | progress = show_progress(), 72 | skip_empty_rows = FALSE) { 73 | tokenizer <- tokenizer_csv(na = na, quoted_na = quoted_na, quote = quote, 74 | comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows) 75 | melt_delimited(file, tokenizer, locale = locale, skip = skip, 76 | skip_empty_rows = skip_empty_rows, comment = comment, n_max = n_max, 77 | progress = progress) 78 | } 79 | 80 | #' @rdname melt_delim 81 | #' @export 82 | melt_csv2 <- function(file, locale = default_locale(), na = c("", "NA"), 83 | quoted_na = TRUE, quote = "\"", comment = "", 84 | trim_ws = TRUE, skip = 0, n_max = Inf, 85 | progress = show_progress(), 86 | skip_empty_rows = FALSE) { 87 | 88 | if (locale$decimal_mark == ".") { 89 | cli::cli_alert_info("Using {.val ','} as decimal and {.val '.'} as grouping mark. 
Use {.fn melt_delim} for more control.") 90 | locale$decimal_mark <- "," 91 | locale$grouping_mark <- "." 92 | } 93 | tokenizer <- tokenizer_delim(delim = ";", na = na, quoted_na = quoted_na, 94 | quote = quote, comment = comment, trim_ws = trim_ws, 95 | skip_empty_rows = skip_empty_rows) 96 | melt_delimited(file, tokenizer, locale = locale, skip = skip, 97 | skip_empty_rows = skip_empty_rows, comment = comment, n_max = n_max, 98 | progress = progress) 99 | } 100 | 101 | 102 | #' @rdname melt_delim 103 | #' @export 104 | melt_tsv <- function(file, locale = default_locale(), na = c("", "NA"), 105 | quoted_na = TRUE, quote = "\"", comment = "", 106 | trim_ws = TRUE, skip = 0, n_max = Inf, 107 | progress = show_progress(), 108 | skip_empty_rows = FALSE) { 109 | tokenizer <- tokenizer_tsv(na = na, quoted_na = quoted_na, quote = quote, 110 | comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows) 111 | melt_delimited(file, tokenizer, locale = locale, skip = skip, 112 | skip_empty_rows = skip_empty_rows, comment = comment, n_max = n_max, 113 | progress = progress) 114 | } 115 | 116 | # Helper functions for reading from delimited files ---------------------------- 117 | col_spec_melt <- 118 | structure(list(row = structure(list(), 119 | class = c("collector_double", 120 | "collector")), 121 | col = structure(list(), 122 | class = c("collector_double", 123 | "collector")), 124 | data_type = structure(list(), 125 | class = c("collector_character", 126 | "collector")), 127 | value = structure(list(), 128 | class = c("collector_character", 129 | "collector"))), 130 | .Names = c("row", "col", "data_type", "value")) 131 | 132 | melt_tokens <- function(data, tokenizer, locale_, n_max, progress) { 133 | if (n_max == Inf) { 134 | n_max <- -1 135 | } 136 | melt_tokens_(data, tokenizer, col_spec_melt, locale_, n_max, progress) 137 | } 138 | 139 | melt_delimited <- function(file, tokenizer, locale = default_locale(), 140 | skip = 0, skip_empty_rows = FALSE, comment = "", n_max = Inf, 141 | progress = show_progress()) { 142 | name <- source_name(file) 143 | # If connection needed, read once. 144 | file <- standardise_path(file) 145 | if (is.connection(file)) { 146 | data <- datasource_connection(file, skip, skip_empty_rows = skip_empty_rows, comment) 147 | } else { 148 | if (empty_file(file)) { 149 | return(tibble::tibble(row = double(), col = double(), 150 | data_type = character(), value = character())) 151 | } 152 | if (is.character(file) && identical(locale$encoding, "UTF-8")) { 153 | # When locale is not set, file is probablly marked as its correct encoding. 154 | # As default_locale() assumes file is UTF-8, file should be encoded as UTF-8 for non-UTF-8 MBCS locales. 
155 | data <- enc2utf8(file) 156 | } else { 157 | data <- file 158 | } 159 | } 160 | ds <- datasource(data, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment) 161 | out <- melt_tokens(ds, tokenizer, locale_ = locale, n_max = n_max, 162 | progress = progress) 163 | warn_problems(out) 164 | } 165 | -------------------------------------------------------------------------------- /R/melt_delim_chunked.R: -------------------------------------------------------------------------------- 1 | # Generates the chunked definition from the melt_* definition 2 | generate_melt_chunked_fun <- function(x) { # nocov start 3 | args <- formals(x) 4 | 5 | # Remove n_max argument 6 | args <- args[names(args) != "n_max"] 7 | 8 | args <- append(args, alist(callback = , chunk_size = 10000), 1) 9 | 10 | b <- as.list(body(x)) 11 | 12 | # Change melt_delimited to melt_delimited_chunked 13 | b[[length(b)]][[1]] <- quote(melt_delimited_chunked) 14 | 15 | call_args <- as.list(b[[length(b)]]) 16 | 17 | # Remove the n_max argument 18 | call_args <- call_args[!names(call_args) == "n_max"] 19 | 20 | # add the callback and chunk_size arguments 21 | b[[length(b)]] <- as.call(append(call_args, alist(callback = callback, chunk_size = chunk_size), 2)) 22 | 23 | body(x) <- as.call(b) 24 | 25 | formals(x) <- args 26 | 27 | x 28 | } # nocov end 29 | 30 | # Generates the modified melt_delimited function 31 | generate_melt_delimited_chunked <- function(x) { # nocov start 32 | args <- formals(x) 33 | args <- args[names(args) != "n_max"] 34 | args <- append(args, alist(callback = , chunk_size = 10000), 1) 35 | 36 | b <- as.list(body(x)) 37 | 38 | for (i in seq_along(b)) { 39 | if (is.call(b[[i]]) && identical(b[[i]][[1]], as.symbol("<-")) && 40 | is.call(b[[i]][[3]]) && identical(b[[i]][[3]][[1]], quote(melt_tokens))) { 41 | 42 | # Change melt_tokens() to melt_tokens_chunked 43 | b[[i]][[3]][[1]] <- quote(melt_tokens_chunked) 44 | chunked_call <- as.list(b[[i]][[3]]) 45 | 46 | # Remove the n_max argument 47 | chunked_call <- chunked_call[!names(chunked_call) == "n_max"] 48 | 49 | # Add the callback and chunk_size arguments 50 | b[[i]] <- as.call(append(chunked_call, alist(callback = callback, chunk_size = chunk_size), 2)) 51 | 52 | # Remove additional calls 53 | b <- b[-seq(i + 1, length(b))] 54 | body(x) <- as.call(b) 55 | formals(x) <- args 56 | return(x) 57 | } 58 | } 59 | 60 | x 61 | } # nocov end 62 | 63 | melt_tokens_chunked <- function(data, callback, chunk_size, tokenizer, locale_, progress) { 64 | callback <- as_chunk_callback(callback) 65 | on.exit(callback$finally(), add = TRUE) 66 | 67 | melt_tokens_chunked_( 68 | data, callback, chunk_size, tokenizer, col_spec_melt, 69 | locale_, progress 70 | ) 71 | 72 | return(callback$result()) 73 | } 74 | 75 | melt_delimited_chunked <- generate_melt_delimited_chunked(melt_delimited) 76 | 77 | #' Melt a delimited file by chunks 78 | #' 79 | #' For certain non-rectangular data formats, it can be useful to parse the data 80 | #' into a melted format where each row represents a single token. 81 | #' 82 | #' `melt_delim_chunked()` and the specialisations `melt_csv_chunked()`, 83 | #' `melt_csv2_chunked()` and `melt_tsv_chunked()` read files by a chunk of rows 84 | #' at a time, executing a given function on one chunk before reading the next. 
85 | #' 86 | #' @inheritParams readr::read_delim_chunked 87 | #' @param callback A callback function to call on each chunk 88 | #' @param chunk_size The number of rows to include in each chunk 89 | #' @return A [tibble()] of four columns: 90 | #' * `row`, the row that the token comes from in the original file 91 | #' * `col`, the column that the token comes from in the original file 92 | #' * `data_type`, the data type of the token, e.g. `"integer"`, `"character"`, 93 | #' `"date"`, guessed in a similar way to the `guess_parser()` function. 94 | #' * `value`, the token itself as a character string, unchanged from its 95 | #' representation in the original file. 96 | #' 97 | #' If there are parsing problems, a warning tells you 98 | #' how many, and you can retrieve the details with [problems()]. 99 | #' @keywords internal 100 | #' @family chunked 101 | #' @export 102 | #' @examples 103 | #' # Cars with 3 gears 104 | #' f <- function(x, pos) subset(x, data_type == "integer") 105 | #' melt_csv_chunked(meltr_example("mtcars.csv"), DataFrameCallback$new(f), chunk_size = 5) 106 | melt_delim_chunked <- generate_melt_chunked_fun(melt_delim) 107 | 108 | #' @rdname melt_delim_chunked 109 | #' @export 110 | melt_csv_chunked <- generate_melt_chunked_fun(melt_csv) 111 | 112 | #' @rdname melt_delim_chunked 113 | #' @export 114 | melt_csv2_chunked <- generate_melt_chunked_fun(melt_csv2) 115 | 116 | #' @rdname melt_delim_chunked 117 | #' @export 118 | melt_tsv_chunked <- generate_melt_chunked_fun(melt_tsv) 119 | 120 | utils::globalVariables(c("callback", "chunk_size")) 121 | -------------------------------------------------------------------------------- /R/melt_fwf.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Return melted data for each token in a fixed width file 4 | #' 5 | #' For certain non-rectangular data formats, it can be useful to parse the data 6 | #' into a melted format where each row represents a single token. 7 | #' 8 | #' `melt_fwf()` parses each token of a fixed width file into a single row, but 9 | #' it still requires that each field is in the same in every row of the 10 | #' source file. 11 | #' 12 | #' @seealso [melt_table()] to melt fixed width files where each 13 | #' column is separated by whitespace, and [melt_fwf()] for the conventional 14 | #' way to read rectangular data from fixed width files. 15 | #' @inheritParams readr::read_fwf 16 | #' @param col_positions Column positions, as created by [fwf_empty()], 17 | #' [fwf_widths()] or [fwf_positions()]. To read in only selected fields, 18 | #' use [fwf_positions()]. If the width of the last column is variable (a 19 | #' ragged fwf file), supply the last end position as NA. 20 | #' @return A [tibble()] of four columns: 21 | #' * `row`, the row that the token comes from in the original file 22 | #' * `col`, the column that the token comes from in the original file 23 | #' * `data_type`, the data type of the token, e.g. `"integer"`, `"character"`, 24 | #' `"date"`, guessed in a similar way to the `guess_parser()` function. 25 | #' * `value`, the token itself as a character string, unchanged from its 26 | #' representation in the original file. 27 | #' 28 | #' If there are parsing problems, a warning tells you 29 | #' how many, and you can retrieve the details with [problems()]. 
30 | #' @export 31 | #' @examples 32 | #' fwf_sample <- meltr_example("fwf-sample.txt") 33 | #' writeLines(readLines(fwf_sample)) 34 | #' 35 | #' # You can specify column positions in several ways: 36 | #' # 1. Guess based on position of empty columns 37 | #' melt_fwf(fwf_sample, fwf_empty(fwf_sample, col_names = c("first", "last", "state", "ssn"))) 38 | #' # 2. A vector of field widths 39 | #' melt_fwf(fwf_sample, fwf_widths(c(20, 10, 12), c("name", "state", "ssn"))) 40 | #' # 3. Paired vectors of start and end positions 41 | #' melt_fwf(fwf_sample, fwf_positions(c(1, 30), c(10, 42), c("name", "ssn"))) 42 | #' # 4. Named arguments with start and end positions 43 | #' melt_fwf(fwf_sample, fwf_cols(name = c(1, 10), ssn = c(30, 42))) 44 | #' # 5. Named arguments with column widths 45 | #' melt_fwf(fwf_sample, fwf_cols(name = 20, state = 10, ssn = 12)) 46 | melt_fwf <- function(file, col_positions, 47 | locale = default_locale(), na = c("", "NA"), 48 | comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, 49 | progress = show_progress(), 50 | skip_empty_rows = FALSE) { 51 | ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows) 52 | if (inherits(ds, "source_file") && empty_file(file)) { 53 | return(tibble::tibble( 54 | row = double(), col = double(), 55 | data_type = character(), value = character() 56 | )) 57 | } 58 | tokenizer <- tokenizer_fwf(as.integer(col_positions$begin), as.integer(col_positions$end), 59 | na = na, 60 | comment = comment, trim_ws = trim_ws, 61 | skip_empty_rows = skip_empty_rows 62 | ) 63 | out <- melt_tokens(ds, tokenizer, 64 | locale_ = locale, 65 | n_max = if (n_max == Inf) -1 else n_max, progress = progress 66 | ) 67 | warn_problems(out) 68 | } 69 | 70 | #' @rdname melt_fwf 71 | #' @export 72 | #' @param n Number of lines the tokenizer will read to determine file structure. By default 73 | #' it is set to 100. 74 | fwf_empty <- function(file, skip = 0, skip_empty_rows = FALSE, col_names = NULL, comment = "", n = 100L) { 75 | ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows) 76 | 77 | out <- whitespaceColumns(ds, comment = comment, n = n) 78 | out$end[length(out$end)] <- NA 79 | 80 | col_names <- fwf_col_names(col_names, length(out$begin)) 81 | out$col_names <- col_names 82 | out 83 | } 84 | 85 | #' @rdname melt_fwf 86 | #' @export 87 | #' @param widths Width of each field. Use NA as width of last field when 88 | #' reading a ragged fwf file. 89 | #' @param col_names Either NULL, or a character vector column names. 90 | fwf_widths <- function(widths, col_names = NULL) { 91 | pos <- cumsum(c(1L, abs(widths))) 92 | fwf_positions(pos[-length(pos)], pos[-1] - 1L, col_names) 93 | } 94 | 95 | #' @rdname melt_fwf 96 | #' @export 97 | #' @param start,end Starting and ending (inclusive) positions of each field. 98 | #' Use NA as last end field when reading a ragged fwf file. 99 | fwf_positions <- function(start, end = NULL, col_names = NULL) { 100 | stopifnot(length(start) == length(end)) 101 | col_names <- fwf_col_names(col_names, length(start)) 102 | 103 | tibble( 104 | begin = start - 1L, 105 | end = end, # -1 to change to 0 offset, +1 to be exclusive, 106 | col_names = as.character(col_names) 107 | ) 108 | } 109 | 110 | 111 | #' @rdname melt_fwf 112 | #' @export 113 | #' @param ... If the first element is a data frame, 114 | #' then it must have all numeric columns and either one or two rows. 115 | #' The column names are the variable names. 
The column values are the 116 | #' variable widths if a length one vector, and if length two, variable start and end 117 | #' positions. The elements of `...` are used to construct a data frame 118 | #' with or or two rows as above. 119 | fwf_cols <- function(...) { 120 | x <- lapply(list(...), as.integer) 121 | names(x) <- fwf_col_names(names(x), length(x)) 122 | x <- tibble::as_tibble(x) 123 | if (nrow(x) == 2) { 124 | res <- fwf_positions(as.integer(x[1, ]), as.integer(x[2, ]), names(x)) 125 | } else if (nrow(x) == 1) { 126 | res <- fwf_widths(as.integer(x[1, ]), names(x)) 127 | } else { 128 | stop("All variables must have either one (width) two (start, end) values.", 129 | call. = FALSE 130 | ) 131 | } 132 | res 133 | } 134 | 135 | fwf_col_names <- function(nm, n) { 136 | nm <- nm %||% rep("", n) 137 | nm_empty <- (nm == "") 138 | nm[nm_empty] <- paste0("X", seq_len(n))[nm_empty] 139 | nm 140 | } 141 | -------------------------------------------------------------------------------- /R/melt_table.R: -------------------------------------------------------------------------------- 1 | #' Return melted data for each token in a whitespace-separated file 2 | #' 3 | #' @description 4 | #' 5 | #' For certain non-rectangular data formats, it can be useful to parse the data 6 | #' into a melted format where each row represents a single token. 7 | #' 8 | #' `melt_table()` and `melt_table2()` are designed to read the type of textual 9 | #' data where each column is separated by one (or more) columns of space. 10 | #' 11 | #' `melt_table2()` allows any number of whitespace characters between columns, 12 | #' and the lines can be of different lengths. 13 | #' 14 | #' `melt_table()` is more strict, each line must be the same length, 15 | #' and each field is in the same position in every line. It first finds empty 16 | #' columns and then parses like a fixed width file. 17 | #' 18 | #' @seealso [melt_fwf()] to melt fixed width files where each column 19 | #' is not separated by whitespace. `melt_fwf()` is also useful for reading 20 | #' tabular data with non-standard formatting. [readr::read_table()] is the 21 | #' conventional way to read tabular data from whitespace-separated files. 22 | #' @inheritParams readr::read_table 23 | #' @return A [tibble()] of four columns: 24 | #' * `row`, the row that the token comes from in the original file 25 | #' * `col`, the column that the token comes from in the original file 26 | #' * `data_type`, the data type of the token, e.g. `"integer"`, `"character"`, 27 | #' `"date"`, guessed in a similar way to the `guess_parser()` function. 28 | #' * `value`, the token itself as a character string, unchanged from its 29 | #' representation in the original file. 30 | #' 31 | #' If there are parsing problems, a warning tells you 32 | #' how many, and you can retrieve the details with [problems()]. 
33 | #' @export 34 | #' @examples 35 | #' # One corner from http://www.masseyratings.com/cf/compare.htm 36 | #' massey <- meltr_example("massey-rating.txt") 37 | #' cat(readLines(massey)) 38 | #' melt_table(massey) 39 | #' 40 | #' # Sample of 1978 fuel economy data from 41 | #' # http://www.fueleconomy.gov/feg/epadata/78data.zip 42 | #' epa <- meltr_example("epa78.txt") 43 | #' writeLines(readLines(epa)) 44 | #' melt_table(epa) 45 | melt_table <- function(file, locale = default_locale(), na = "NA", skip = 0, 46 | n_max = Inf, guess_max = min(n_max, 1000), 47 | progress = show_progress(), comment = "", 48 | skip_empty_rows = FALSE) { 49 | ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows) 50 | if (inherits(ds, "source_file") && empty_file(file)) { 51 | return(tibble::tibble( 52 | row = double(), col = double(), 53 | data_type = character(), value = character() 54 | )) 55 | } 56 | 57 | columns <- fwf_empty(ds, skip = skip, skip_empty_rows = skip_empty_rows, n = guess_max, comment = comment) 58 | tokenizer <- tokenizer_fwf(columns$begin, columns$end, 59 | na = na, 60 | comment = comment, 61 | skip_empty_rows = skip_empty_rows 62 | ) 63 | 64 | ds <- datasource(file = ds, skip = skip, skip_empty_rows = skip_empty_rows) 65 | out <- melt_tokens(ds, tokenizer, 66 | locale_ = locale, n_max = n_max, 67 | progress = progress 68 | ) 69 | warn_problems(out) 70 | } 71 | 72 | #' @rdname melt_table 73 | #' @export 74 | melt_table2 <- function(file, locale = default_locale(), na = "NA", skip = 0, 75 | n_max = Inf, progress = show_progress(), comment = "", 76 | skip_empty_rows = FALSE) { 77 | ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows) 78 | if (inherits(ds, "source_file") && empty_file(file)) { 79 | return(tibble::tibble( 80 | row = double(), col = double(), 81 | data_type = character(), value = character() 82 | )) 83 | } 84 | tokenizer <- tokenizer_ws( 85 | na = na, comment = comment, 86 | skip_empty_rows = skip_empty_rows 87 | ) 88 | 89 | ds <- datasource(file = ds, skip = skip, skip_empty_rows = skip_empty_rows) 90 | melt_delimited(ds, tokenizer, 91 | locale = locale, skip = skip, 92 | comment = comment, n_max = n_max, progress = progress 93 | ) 94 | } 95 | -------------------------------------------------------------------------------- /R/meltr-package.R: -------------------------------------------------------------------------------- 1 | ## usethis namespace: start 2 | #' @useDynLib meltr, .registration = TRUE 3 | ## usethis namespace: end 4 | NULL 5 | -------------------------------------------------------------------------------- /R/problems.R: -------------------------------------------------------------------------------- 1 | #' Retrieve parsing problems 2 | #' 3 | #' Readr functions will only throw an error if parsing fails in an unrecoverable 4 | #' way. However, there are lots of potential problems that you might want to 5 | #' know about - these are stored in the `problems` attribute of the 6 | #' output, which you can easily access with this function. 7 | #' `stop_for_problems()` will throw an error if there are any parsing 8 | #' problems: this is useful for automated scripts where you want to throw 9 | #' an error as soon as you encounter a problem. 10 | #' 11 | #' @param x An data frame (from `read_*()`) or a vector 12 | #' (from `parse_*()`). 
13 | #' @return A data frame with one row for each problem and four columns: 14 | #' \item{row,col}{Row and column of problem} 15 | #' \item{expected}{What readr expected to find} 16 | #' \item{actual}{What it actually got} 17 | #' @export 18 | #' @examples 19 | #' if (requireNamespace("readr")) { 20 | #' x <- readr::parse_integer(c("1X", "blah", "3")) 21 | #' problems(x) 22 | #' 23 | #' y <- readr::parse_integer(c("1", "2", "3")) 24 | #' problems(y) 25 | #' } 26 | problems <- local({ 27 | no_problems <- tibble::tibble( 28 | row = integer(), 29 | col = integer(), 30 | expected = character(), 31 | actual = character() 32 | ) 33 | 34 | function(x = .Last.value) { 35 | problems <- probs(x) 36 | 37 | if (is.null(problems)) { 38 | return(invisible(no_problems)) 39 | } 40 | 41 | problems 42 | } 43 | }) 44 | 45 | #' @export 46 | #' @rdname problems 47 | stop_for_problems <- function(x) { 48 | n <- n_problems(x) 49 | if (n == 0) { 50 | return(invisible(x)) 51 | } 52 | 53 | stop(n, " parsing failure", if (n > 1) "s", call. = FALSE) 54 | } 55 | 56 | probs <- function(x) { 57 | attr(suppressWarnings(x), "problems") 58 | } 59 | 60 | n_problems <- function(x) { 61 | probs <- problems(x) 62 | if (is.null(probs)) 0 else nrow(probs) 63 | } 64 | 65 | problem_rows <- function(x) { 66 | if (n_problems(x) == 0) { 67 | return(x[0, , drop = FALSE]) 68 | } 69 | 70 | probs <- problems(x) 71 | x[unique(probs$row), , drop = FALSE] 72 | } 73 | 74 | warn_problems <- function(x) { 75 | n <- n_problems(x) 76 | if (n == 0) { 77 | return(x) 78 | } 79 | 80 | probs <- as.data.frame(attr(x, "problems")) 81 | many_problems <- nrow(probs) > 5 82 | 83 | probs_f <- format(utils::head(probs, 5), justify = "left") 84 | probs_f[probs_f == "NA"] <- "--" 85 | probs_f <- rbind(names(probs), probs_f) 86 | probs_f <- lapply(probs_f, format, justify = "right") 87 | 88 | if (many_problems) { 89 | # nchar fails with non-ascii characters, so encode characters beforehand. 90 | width <- vapply(probs_f, function(x) max(nchar(encodeString(x))), integer(1)) 91 | dots <- vapply(width, function(i) paste(rep(".", i), collapse = ""), 92 | FUN.VALUE = character(1) 93 | ) 94 | 95 | probs_f <- Map(c, probs_f, dots) 96 | } 97 | 98 | probs_f <- do.call(paste, c(probs_f, list(sep = " ", collapse = "\n"))) 99 | warning(n, " parsing failure", if (n > 1) "s", ".\n", 100 | probs_f, "\n", 101 | if (many_problems) "See problems(...) for more details.\n", 102 | call. = FALSE, immediate. = TRUE, noBreaks. = TRUE 103 | ) 104 | 105 | x 106 | } 107 | 108 | name_problems <- function(x, all_colnames, name = "input") { 109 | if (n_problems(x) == 0) { 110 | return(x) 111 | } 112 | 113 | problems <- problems(x) 114 | problems$file <- name 115 | problems$col <- all_colnames[problems$col] 116 | attr(x, "problems") <- problems 117 | 118 | x 119 | } 120 | -------------------------------------------------------------------------------- /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-lib/meltr/38c5a720afe794d1fd2f36e5bb552dd9a8ca8b47/R/sysdata.rda -------------------------------------------------------------------------------- /R/tokenizer.R: -------------------------------------------------------------------------------- 1 | #' Tokenizers. 2 | #' 3 | #' Explicitly create tokenizer objects. Usually you will not call these 4 | #' function, but will instead use one of the use friendly wrappers like 5 | #' [readr::read_csv()]. 
6 | #' 7 | #' @keywords internal 8 | #' @name Tokenizers 9 | #' @examples 10 | #' tokenizer_csv() 11 | NULL 12 | 13 | #' @export 14 | #' @rdname Tokenizers 15 | #' @param comment A string used to identify comments. Any text after the 16 | #' comment characters will be silently ignored. 17 | #' @param na Character vector of strings to interpret as missing values. Set this 18 | #' option to `character()` to indicate no missing values. 19 | #' @param quoted_na Should missing values inside quotes be treated as missing 20 | #' values (the default) or strings. 21 | #' @param delim Single character used to separate fields within a record. 22 | #' @param quote Single character used to quote strings. 23 | #' @param trim_ws Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from 24 | #' each field before parsing it? 25 | #' @param escape_double Does the file escape quotes by doubling them? 26 | #' i.e. If this option is `TRUE`, the value `""""` represents 27 | #' a single quote, `\"`. 28 | #' @param escape_backslash Does the file use backslashes to escape special 29 | #' characters? This is more general than `escape_double` as backslashes 30 | #' can be used to escape the delimiter character, the quote character, or 31 | #' to add special characters like `\\n`. 32 | #' @param skip_empty_rows Should blank rows be ignored altogether? i.e. If this 33 | #' option is `TRUE` then blank rows will not be represented at all. If it is 34 | #' `FALSE` then they will be represented by `NA` values in all the columns. 35 | #' @return A tokeenizer object 36 | #' @examples 37 | #' tokenizer_delim(",") 38 | tokenizer_delim <- function(delim, quote = '"', na = "NA", quoted_na = TRUE, comment = "", 39 | trim_ws = TRUE, 40 | escape_double = TRUE, 41 | escape_backslash = FALSE, 42 | skip_empty_rows = TRUE) { 43 | structure( 44 | list( 45 | delim = delim, 46 | quote = quote, 47 | na = na, 48 | quoted_na = quoted_na, 49 | comment = comment, 50 | trim_ws = trim_ws, 51 | escape_double = escape_double, 52 | escape_backslash = escape_backslash, 53 | skip_empty_rows = skip_empty_rows 54 | ), 55 | class = "tokenizer_delim" 56 | ) 57 | } 58 | 59 | #' @export 60 | #' @rdname Tokenizers 61 | tokenizer_csv <- function(na = "NA", quoted_na = TRUE, quote = "\"", 62 | comment = "", trim_ws = TRUE, 63 | skip_empty_rows = TRUE) { 64 | tokenizer_delim( 65 | delim = ",", 66 | na = na, 67 | quoted_na = quoted_na, 68 | quote = quote, 69 | comment = comment, 70 | trim_ws = trim_ws, 71 | escape_double = TRUE, 72 | escape_backslash = FALSE, 73 | skip_empty_rows = skip_empty_rows 74 | ) 75 | } 76 | 77 | #' @export 78 | #' @rdname Tokenizers 79 | tokenizer_tsv <- function(na = "NA", quoted_na = TRUE, quote = "\"", 80 | comment = "", trim_ws = TRUE, 81 | skip_empty_rows = TRUE) { 82 | tokenizer_delim( 83 | delim = "\t", 84 | na = na, 85 | quoted_na = quoted_na, 86 | quote = quote, 87 | comment = comment, 88 | trim_ws = trim_ws, 89 | escape_double = TRUE, 90 | escape_backslash = FALSE, 91 | skip_empty_rows = skip_empty_rows 92 | ) 93 | } 94 | 95 | #' @export 96 | #' @rdname Tokenizers 97 | tokenizer_line <- function(na = character(), skip_empty_rows = TRUE) { 98 | structure(list(na = na, skip_empty_rows = skip_empty_rows), 99 | class = "tokenizer_line" 100 | ) 101 | } 102 | 103 | #' @export 104 | #' @rdname Tokenizers 105 | tokenizer_log <- function(trim_ws) { 106 | structure(list(trim_ws = trim_ws), class = "tokenizer_log") 107 | } 108 | 109 | 110 | #' @export 111 | #' @rdname Tokenizers 112 | #' @param begin,end Begin and 
end offsets for each file. These are C++ 113 | #' offsets so the first column is column zero, and the ranges are 114 | #' [begin, end) (i.e. inclusive-exclusive). 115 | tokenizer_fwf <- function(begin, end, na = "NA", comment = "", trim_ws = TRUE, 116 | skip_empty_rows = TRUE) { 117 | structure(list( 118 | begin = as.integer(begin), end = as.integer(end), na = na, comment = comment, 119 | trim_ws = trim_ws, skip_empty_rows = skip_empty_rows 120 | ), 121 | class = "tokenizer_fwf" 122 | ) 123 | } 124 | 125 | #' @export 126 | #' @rdname Tokenizers 127 | tokenizer_ws <- function(na = "NA", comment = "", skip_empty_rows = TRUE) { 128 | structure(list(na = na, comment = comment, skip_empty_rows = skip_empty_rows), 129 | class = "tokenizer_ws" 130 | ) 131 | } 132 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | # Silence R CMD check note 2 | #' @importFrom tibble tibble 3 | NULL 4 | 5 | is.connection <- function(x) inherits(x, "connection") 6 | 7 | `%||%` <- function(a, b) if (is.null(a)) b else a 8 | 9 | #' Determine whether progress bars should be shown 10 | #' 11 | #' Progress bars are shown _unless_ one of the following is `TRUE` 12 | #' - The bar is explicitly disabled by setting `options(readr.show_progress = FALSE)` 13 | #' - The code is run in a non-interactive session (`interactive()` is `FALSE`). 14 | #' - The code is run in an RStudio notebook chunk. 15 | #' - The code is run by knitr / rmarkdown. 16 | #' 17 | #' @return A logical value 18 | #' @export 19 | #' @examples 20 | #' show_progress() 21 | show_progress <- function() { 22 | isTRUE(getOption("readr.show_progress")) && # progress bar enabled by the user 23 | interactive() && # an interactive session 24 | !isTRUE(getOption("rstudio.notebook.executing")) && # Not running in an RStudio notebook chunk 25 | !isTRUE(getOption("knitr.in.progress")) # Not actively knitting a document 26 | } 27 | 28 | #' @importFrom tibble as_tibble 29 | #' @export 30 | as_tibble.meltr_spec_tbl_df <- function(x, ...) { 31 | attr(x, "spec") <- NULL 32 | attr(x, "problems") <- NULL 33 | class(x) <- setdiff(class(x), "meltr_spec_tbl_df") 34 | NextMethod("as_tibble") 35 | } 36 | 37 | #' @export 38 | as.data.frame.meltr_spec_tbl_df <- function(x, ...) { 39 | attr(x, "spec") <- NULL 40 | attr(x, "problems") <- NULL 41 | class(x) <- setdiff(class(x), "meltr_spec_tbl_df") 42 | NextMethod("as.data.frame") 43 | } 44 | 45 | #' @export 46 | `[.meltr_spec_tbl_df` <- function(x, ...) { 47 | attr(x, "spec") <- NULL 48 | attr(x, "problems") <- NULL 49 | class(x) <- setdiff(class(x), "meltr_spec_tbl_df") 50 | NextMethod(`[`) 51 | } 52 | 53 | #' @importFrom methods setOldClass 54 | setOldClass(c("meltr_spec_tbl_df", "tbl_df", "tbl", "data.frame")) 55 | 56 | # @export 57 | compare.meltr_spec_tbl_df <- function(x, y, ...) { 58 | attr(x, "spec") <- NULL 59 | attr(x, "problems") <- NULL 60 | 61 | attr(y, "spec") <- NULL 62 | attr(y, "problems") <- NULL 63 | 64 | NextMethod("compare") 65 | } 66 | 67 | # @export 68 | compare_proxy.meltr_spec_tbl_df <- function(x) { 69 | attr(x, "spec") <- NULL 70 | attr(x, "problems") <- NULL 71 | x 72 | } 73 | 74 | is_named <- function(x) { 75 | nms <- names(x) 76 | 77 | if (is.null(nms)) { 78 | return(FALSE) 79 | } 80 | 81 | all(nms != "" & !is.na(nms)) 82 | } 83 | 84 | .onLoad <- function(...)
{ 85 | register_s3_method("testthat", "compare", "meltr_spec_tbl_df") 86 | register_s3_method("waldo", "compare_proxy", "meltr_spec_tbl_df") 87 | } 88 | 89 | register_s3_method <- function(pkg, generic, class, fun = NULL) { 90 | stopifnot(is.character(pkg), length(pkg) == 1) 91 | stopifnot(is.character(generic), length(generic) == 1) 92 | stopifnot(is.character(class), length(class) == 1) 93 | 94 | if (is.null(fun)) { 95 | fun <- get(paste0(generic, ".", class), envir = parent.frame()) 96 | } else { 97 | stopifnot(is.function(fun)) 98 | } 99 | 100 | if (pkg %in% loadedNamespaces()) { 101 | registerS3method(generic, class, fun, envir = asNamespace(pkg)) 102 | } 103 | 104 | # Always register hook in case package is later unloaded & reloaded 105 | setHook( 106 | packageEvent(pkg, "onLoad"), 107 | function(...) { 108 | registerS3method(generic, class, fun, envir = asNamespace(pkg)) 109 | } 110 | ) 111 | } 112 | 113 | # Silence R CMD check note 114 | # Namespaces in Imports field not imported from: 115 | # ‘R6’ ‘rlang’ 116 | # All declared Imports should be used. 117 | # See https://github.com/hadley/r-pkgs/issues/828 118 | fake_function_1 <- function() R6::R6Class 119 | fake_function_2 <- function() rlang::int 120 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | 16 | # meltr 17 | 18 | 19 | [![R-CMD-check](https://github.com/r-lib/meltr/workflows/R-CMD-check/badge.svg)](https://github.com/r-lib/meltr/actions) 20 | [![Codecov test coverage](https://codecov.io/gh/r-lib/meltr/branch/main/graph/badge.svg)](https://app.codecov.io/gh/r-lib/meltr?branch=main) 21 | 22 | 23 |

24 | [logo image: The wicked witch of the west saying 'I'm Melting, Melting!!!!!'] 25 |

26 | 27 | The goal of 'meltr' is to provide a fast and friendly way to read 28 | non-rectangular data (like ragged forms of 'csv', 'tsv', and 'fwf'). 29 | 30 | Standard tools like [`readr::read_csv()`](https://readr.tidyverse.org/reference/read_delim.html) can cope to some extent with unusual inputs, like files with empty rows or newlines embedded in strings. 31 | But some files are so wacky that standard tools don't work at all, and instead you have to take the file to pieces and reassemble it to get structured data you can work with. 32 | 33 | The meltr package provides tools to do this. 34 | 35 | ## Installation 36 | 37 | You can install the released version of meltr from CRAN with: 38 | 39 | ``` r 40 | install.packages("meltr") 41 | ``` 42 | 43 | Or you can install the development version with: 44 | 45 | ```r 46 | # install.packages("devtools") 47 | devtools::install_github("r-lib/meltr") 48 | ``` 49 | 50 | ## The problem with non-rectangular data 51 | 52 | Here's a contrived example that breaks two assumptions made by common tools like `readr::read_csv()`. 53 | 54 | 1. There are more cells in some rows than others. 55 | 2. There are mixed data types within each column. 56 | 57 | In contrast, the `melt_csv()` function reads the file one cell at a time, importing each cell of the file into a whole row of the final data frame. 58 | 59 | ```{r} 60 | writeLines("Help,,007,I'm 61 | 1960-09-30,FALSE,trapped in,7,1.21 62 | non-rectangular,data,NA", "messy.csv") 63 | 64 | library(meltr) 65 | 66 | melt_csv("messy.csv") 67 | ``` 68 | 69 | The output of `melt_csv()` gives us: 70 | 71 | - A data frame of results – structured data about un-structured data! 72 | - Rows of data corresponding to cells of the input data. 73 | - Empty cells such as the cell on row 1, but not missing cells at the ends of rows 1 and 3. 74 | - The raw, unconverted data: no data type conversion is attempted – every value is imported as a string, and the `data_type` column merely gives meltr's best guess of what the data types ought to be. 75 | 76 | What are some ways you can use this? 77 | To begin with, you can do some simple manipulations with ordinary functions. 78 | 79 | For example, you could extract the words. 80 | 81 | ```{r} 82 | library(dplyr) 83 | 84 | data <- melt_csv("messy.csv") 85 | 86 | data %>% 87 | filter(data_type == "character") 88 | ``` 89 | 90 | Or find if there are missing entries. The `row` and `col` columns pinpoint each empty cell in the original file. 91 | 92 | ```{r} 93 | data %>% 94 | filter(data_type == "missing") 95 | ``` 96 | 97 | ```{r, include = FALSE} 98 | unlink("messy.csv") 99 | ``` 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # meltr 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/r-lib/meltr/workflows/R-CMD-check/badge.svg)](https://github.com/r-lib/meltr/actions) 9 | [![Codecov test 10 | coverage](https://codecov.io/gh/r-lib/meltr/branch/main/graph/badge.svg)](https://app.codecov.io/gh/r-lib/meltr?branch=main) 11 | 12 | 13 |

14 | [logo image: The wicked witch of the west saying 'I'm Melting, Melting!!!!!'] 15 |

16 | 17 | The goal of ‘meltr’ is to provide a fast and friendly way to read 18 | non-rectangular data (like ragged forms of ‘csv’, ‘tsv’, and ‘fwf’). 19 | 20 | Standard tools like 21 | [`readr::read_csv()`](https://readr.tidyverse.org/reference/read_delim.html) 22 | can cope to some extent with unusual inputs, like files with empty rows 23 | or newlines embedded in strings. But some files are so wacky that 24 | standard tools don’t work at all, and instead you have to take the file 25 | to pieces and reassemble it to get structured data you can work with. 26 | 27 | The meltr package provides tools to do this. 28 | 29 | ## Installation 30 | 31 | You can install the released version of meltr from CRAN with: 32 | 33 | ``` r 34 | install.packages("meltr") 35 | ``` 36 | 37 | Or you can install the development version with: 38 | 39 | ``` r 40 | # install.packages("devtools") 41 | devtools::install_github("r-lib/meltr") 42 | ``` 43 | 44 | ## The problem with non-rectangular data 45 | 46 | Here’s a contrived example that breaks two assumptions made by common 47 | tools like `readr::read_csv()`. 48 | 49 | 1. There are more cells in some rows than others. 50 | 2. There are mixed data types within each column. 51 | 52 | In contrast, the `melt_csv()` function reads the file one cell at a 53 | time, importing each cell of the file into a whole row of the final data 54 | frame. 55 | 56 | ``` r 57 | writeLines("Help,,007,I'm 58 | 1960-09-30,FALSE,trapped in,7,1.21 59 | non-rectangular,data,NA", "messy.csv") 60 | 61 | library(meltr) 62 | 63 | melt_csv("messy.csv") 64 | #> # A tibble: 12 × 4 65 | #> row col data_type value 66 | #> 67 | #> 1 1 1 character Help 68 | #> 2 1 2 missing 69 | #> 3 1 3 character 007 70 | #> 4 1 4 character I'm 71 | #> 5 2 1 date 1960-09-30 72 | #> 6 2 2 logical FALSE 73 | #> 7 2 3 character trapped in 74 | #> 8 2 4 integer 7 75 | #> 9 2 5 double 1.21 76 | #> 10 3 1 character non-rectangular 77 | #> 11 3 2 character data 78 | #> 12 3 3 missing 79 | ``` 80 | 81 | The output of `melt_csv()` gives us: 82 | 83 | - A data frame of results – structured data about un-structured data! 84 | - Rows of data corresponding to cells of the input data. 85 | - Empty cells such as the cell on row 1, but not missing cells at the 86 | ends of rows 1 and 3. 87 | - The raw, unconverted data: no data type conversion is attempted – 88 | every value is imported as a string, and the `data_type` column merely 89 | gives meltr’s best guess of what the data types ought to be. 90 | 91 | What are some ways you can use this? To begin with, you can do some 92 | simple manipulations with ordinary functions. 93 | 94 | For example, you could extract the words. 95 | 96 | ``` r 97 | library(dplyr) 98 | #> 99 | #> Attaching package: 'dplyr' 100 | #> The following objects are masked from 'package:stats': 101 | #> 102 | #> filter, lag 103 | #> The following objects are masked from 'package:base': 104 | #> 105 | #> intersect, setdiff, setequal, union 106 | 107 | data <- melt_csv("messy.csv") 108 | 109 | data %>% 110 | filter(data_type == "character") 111 | #> # A tibble: 6 × 4 112 | #> row col data_type value 113 | #> 114 | #> 1 1 1 character Help 115 | #> 2 1 3 character 007 116 | #> 3 1 4 character I'm 117 | #> 4 2 3 character trapped in 118 | #> 5 3 1 character non-rectangular 119 | #> 6 3 2 character data 120 | ``` 121 | 122 | Or find if there are missing entries.
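The `row` and `col` columns pinpoint each empty cell in the original file.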
123 | 124 | ``` r 125 | data %>% 126 | filter(data_type == "missing") 127 | #> # A tibble: 2 × 4 128 | #> row col data_type value 129 | #> 130 | #> 1 1 2 missing 131 | #> 2 3 3 missing 132 | ``` 133 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## R CMD check results 2 | 3 | 0 errors, 0 warnings, 0 notes 4 | -------------------------------------------------------------------------------- /data-raw/date-symbols.R: -------------------------------------------------------------------------------- 1 | library(stringi) 2 | 3 | locs <- stri_locale_list() 4 | base <- unique(stri_split_fixed(locs, "_", n = 2, simplify = TRUE)[, 1]) 5 | 6 | locale_info <- function(x) { 7 | full <- stri_datetime_symbols(x, context = "format", width = "wide") 8 | abbr <- stri_datetime_symbols(x, context = "format", width = "abbreviated") 9 | 10 | date_names( 11 | mon = full$Month, 12 | mon_ab = abbr$Month, 13 | day = full$Weekday, 14 | day_ab = abbr$Weekday, 15 | am_pm = full$AmPm 16 | ) 17 | } 18 | 19 | date_symbols <- lapply(base, locale_info) 20 | names(date_symbols) <- base 21 | 22 | usethis::use_data(date_symbols, internal = TRUE, overwrite = TRUE) 23 | -------------------------------------------------------------------------------- /inst/extdata/epa78.txt: -------------------------------------------------------------------------------- 1 | ALFA ROMEO ALFA ROMEO 78010003 2 | ALFETTA 03 81 8 74 7 89 9 ALFETTA 78010053 3 | SPIDER 2000 01 SPIDER 2000 78010103 4 | AMC AMC 78020002 5 | GREMLIN 03 79 9 79 9 GREMLIN 78020053 6 | PACER 04 89 11 89 11 PACER 78020103 7 | PACER WAGON 07 90 26 91 26 PACER WAGON 78020153 8 | CONCORD 04 88 12 90 11 90 11 83 16 CONCORD 78020203 9 | CONCORD WAGON 07 91 30 91 30 CONCORD WAGON 78020253 10 | MATADOR COUPE 05 97 14 97 14 MATADOR COUPE 78020303 11 | MATADOR SEDAN 06 110 20 110 20 MATADOR SEDAN 78020353 12 | MATADOR WAGON 09 112 50 112 50 MATADOR WAGON 78020403 13 | ASTON MARTIN ASTON MARTIN 78040002 14 | ASTON MARTIN ASTON MARTIN 78040053 15 | AUDI AUDI 78050002 16 | FOX 03 84 11 84 11 84 11 FOX 78050053 17 | FOX WAGON 07 83 40 83 40 FOX WAGON 78050103 18 | 5000 04 90 15 90 15 5000 78050153 19 | AVANTI AVANTI 78065002 20 | AVANTI II 02 75 8 75 8 AVANTI II 78065053 21 | -------------------------------------------------------------------------------- /inst/extdata/fwf-sample.txt: -------------------------------------------------------------------------------- 1 | John Smith WA 418-Y11-4111 2 | Mary Hartford CA 319-Z19-4341 3 | Evan Nolan IL 219-532-c301 4 | -------------------------------------------------------------------------------- /inst/extdata/massey-rating.txt: -------------------------------------------------------------------------------- 1 | UCC PAY LAZ KPK RT COF BIH DII ENG ACU Rank Team Conf 2 | 1 1 1 1 1 1 1 1 1 1 1 Ohio St B10 3 | 2 2 2 2 2 2 2 2 4 2 2 Oregon P12 4 | 3 4 3 4 3 4 3 4 2 3 3 Alabama SEC 5 | 4 3 4 3 4 3 5 3 3 4 4 TCU B12 6 | 6 6 6 5 5 7 6 5 6 11 5 Michigan St B10 7 | 7 7 7 6 7 6 11 8 7 8 6 Georgia SEC 8 | 5 5 5 7 6 8 4 6 5 
5 7 Florida St ACC 9 | 8 8 9 9 10 5 7 7 10 7 8 Baylor B12 10 | 9 11 8 13 11 11 12 9 14 9 9 Georgia Tech ACC 11 | 13 10 13 11 8 9 10 11 9 10 10 Mississippi SEC 12 | -------------------------------------------------------------------------------- /inst/extdata/mtcars.csv: -------------------------------------------------------------------------------- 1 | "mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb" 2 | 21,6,160,110,3.9,2.62,16.46,0,1,4,4 3 | 21,6,160,110,3.9,2.875,17.02,0,1,4,4 4 | 22.8,4,108,93,3.85,2.32,18.61,1,1,4,1 5 | 21.4,6,258,110,3.08,3.215,19.44,1,0,3,1 6 | 18.7,8,360,175,3.15,3.44,17.02,0,0,3,2 7 | 18.1,6,225,105,2.76,3.46,20.22,1,0,3,1 8 | 14.3,8,360,245,3.21,3.57,15.84,0,0,3,4 9 | 24.4,4,146.7,62,3.69,3.19,20,1,0,4,2 10 | 22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2 11 | 19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4 12 | 17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4 13 | 16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3 14 | 17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3 15 | 15.2,8,275.8,180,3.07,3.78,18,0,0,3,3 16 | 10.4,8,472,205,2.93,5.25,17.98,0,0,3,4 17 | 10.4,8,460,215,3,5.424,17.82,0,0,3,4 18 | 14.7,8,440,230,3.23,5.345,17.42,0,0,3,4 19 | 32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1 20 | 30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2 21 | 33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1 22 | 21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1 23 | 15.5,8,318,150,2.76,3.52,16.87,0,0,3,2 24 | 15.2,8,304,150,3.15,3.435,17.3,0,0,3,2 25 | 13.3,8,350,245,3.73,3.84,15.41,0,0,3,4 26 | 19.2,8,400,175,3.08,3.845,17.05,0,0,3,2 27 | 27.3,4,79,66,4.08,1.935,18.9,1,1,4,1 28 | 26,4,120.3,91,4.43,2.14,16.7,0,1,5,2 29 | 30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2 30 | 15.8,8,351,264,4.22,3.17,14.5,0,1,5,4 31 | 19.7,6,145,175,3.62,2.77,15.5,0,1,5,6 32 | 15,8,301,335,3.54,3.57,14.6,0,1,5,8 33 | 21.4,4,121,109,4.11,2.78,18.6,1,1,4,2 34 | -------------------------------------------------------------------------------- /man/Tokenizers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenizer.R 3 | \name{Tokenizers} 4 | \alias{Tokenizers} 5 | \alias{tokenizer_delim} 6 | \alias{tokenizer_csv} 7 | \alias{tokenizer_tsv} 8 | \alias{tokenizer_line} 9 | \alias{tokenizer_log} 10 | \alias{tokenizer_fwf} 11 | \alias{tokenizer_ws} 12 | \title{Tokenizers.} 13 | \usage{ 14 | tokenizer_delim( 15 | delim, 16 | quote = "\\"", 17 | na = "NA", 18 | quoted_na = TRUE, 19 | comment = "", 20 | trim_ws = TRUE, 21 | escape_double = TRUE, 22 | escape_backslash = FALSE, 23 | skip_empty_rows = TRUE 24 | ) 25 | 26 | tokenizer_csv( 27 | na = "NA", 28 | quoted_na = TRUE, 29 | quote = "\\"", 30 | comment = "", 31 | trim_ws = TRUE, 32 | skip_empty_rows = TRUE 33 | ) 34 | 35 | tokenizer_tsv( 36 | na = "NA", 37 | quoted_na = TRUE, 38 | quote = "\\"", 39 | comment = "", 40 | trim_ws = TRUE, 41 | skip_empty_rows = TRUE 42 | ) 43 | 44 | tokenizer_line(na = character(), skip_empty_rows = TRUE) 45 | 46 | tokenizer_log(trim_ws) 47 | 48 | tokenizer_fwf( 49 | begin, 50 | end, 51 | na = "NA", 52 | comment = "", 53 | trim_ws = TRUE, 54 | skip_empty_rows = TRUE 55 | ) 56 | 57 | tokenizer_ws(na = "NA", comment = "", skip_empty_rows = TRUE) 58 | } 59 | \arguments{ 60 | \item{delim}{Single character used to separate fields within a record.} 61 | 62 | \item{quote}{Single character used to quote strings.} 63 | 64 | \item{na}{Character vector of strings to interpret as missing values. 
Set this 65 | option to \code{character()} to indicate no missing values.} 66 | 67 | \item{quoted_na}{Should missing values inside quotes be treated as missing 68 | values (the default) or strings.} 69 | 70 | \item{comment}{A string used to identify comments. Any text after the 71 | comment characters will be silently ignored.} 72 | 73 | \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from 74 | each field before parsing it?} 75 | 76 | \item{escape_double}{Does the file escape quotes by doubling them? 77 | i.e. If this option is \code{TRUE}, the value \verb{""""} represents 78 | a single quote, \verb{\\"}.} 79 | 80 | \item{escape_backslash}{Does the file use backslashes to escape special 81 | characters? This is more general than \code{escape_double} as backslashes 82 | can be used to escape the delimiter character, the quote character, or 83 | to add special characters like \verb{\\\\n}.} 84 | 85 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this 86 | option is \code{TRUE} then blank rows will not be represented at all. If it is 87 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.} 88 | 89 | \item{begin, end}{Begin and end offsets for each file. These are C++ 90 | offsets so the first column is column zero, and the ranges are 91 | [begin, end) (i.e. inclusive-exclusive).} 92 | } 93 | \value{ 94 | A tokenizer object 95 | } 96 | \description{ 97 | Explicitly create tokenizer objects. Usually you will not call these 98 | functions, but will instead use one of the user-friendly wrappers like 99 | \code{\link[readr:read_delim]{readr::read_csv()}}. 100 | } 101 | \examples{ 102 | tokenizer_csv() 103 | tokenizer_delim(",") 104 | } 105 | \keyword{internal} 106 | -------------------------------------------------------------------------------- /man/clipboard.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/source.R 3 | \name{clipboard} 4 | \alias{clipboard} 5 | \title{Returns values from the clipboard} 6 | \usage{ 7 | clipboard() 8 | } 9 | \description{ 10 | This is useful in the \code{\link[readr:read_delim]{readr::read_delim()}} functions to read from the clipboard. 11 | } 12 | \examples{ 13 | \dontrun{ 14 | clipboard() 15 | } 16 | } 17 | \seealso{ 18 | readr::read_delim 19 | } 20 | -------------------------------------------------------------------------------- /man/datasource.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/source.R 3 | \name{datasource} 4 | \alias{datasource} 5 | \title{Create a source object.} 6 | \usage{ 7 | datasource( 8 | file, 9 | skip = 0, 10 | skip_empty_rows = FALSE, 11 | comment = "", 12 | skip_quote = TRUE 13 | ) 14 | } 15 | \arguments{ 16 | \item{file}{Either a path to a file, a connection, or literal data 17 | (either a single string or a raw vector). 18 | 19 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will 20 | be automatically uncompressed. Files starting with \verb{http://}, 21 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically 22 | downloaded. Remote gz files can also be automatically downloaded and 23 | decompressed. 24 | 25 | Literal data is most useful for examples and tests.
It must contain at 26 | least one new line to be recognised as data (instead of a path) or be a 27 | vector of length greater than 1. 28 | 29 | Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} 30 | 31 | \item{skip}{Number of lines to skip before reading data.} 32 | } 33 | \value{ 34 | A source object 35 | } 36 | \description{ 37 | Create a source object. 38 | } 39 | \examples{ 40 | # Literal csv 41 | datasource("a,b,c\n1,2,3") 42 | datasource(charToRaw("a,b,c\n1,2,3")) 43 | 44 | # Strings 45 | datasource(meltr_example("mtcars.csv")) 46 | \dontrun{ 47 | datasource("https://github.com/tidyverse/readr/raw/master/inst/extdata/mtcars.csv") 48 | } 49 | 50 | # Connection 51 | con <- rawConnection(charToRaw("abc\n123")) 52 | datasource(con) 53 | close(con) 54 | } 55 | \keyword{internal} 56 | -------------------------------------------------------------------------------- /man/date_names.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/date-symbols.R 3 | \name{date_names} 4 | \alias{date_names} 5 | \alias{date_names_lang} 6 | \alias{date_names_langs} 7 | \title{Create or retrieve date names} 8 | \usage{ 9 | date_names(mon, mon_ab = mon, day, day_ab = day, am_pm = c("AM", "PM")) 10 | 11 | date_names_lang(language) 12 | 13 | date_names_langs() 14 | } 15 | \arguments{ 16 | \item{mon, mon_ab}{Full and abbreviated month names.} 17 | 18 | \item{day, day_ab}{Full and abbreviated week day names. Starts with Sunday.} 19 | 20 | \item{am_pm}{Names used for AM and PM.} 21 | 22 | \item{language}{A BCP 47 locale, made up of a language and a region, 23 | e.g. \code{"en_US"} for American English. See \code{date_names_langs()} 24 | for a complete list of available locales.} 25 | } 26 | \value{ 27 | A date names object 28 | } 29 | \description{ 30 | When parsing dates, you often need to know how days of the week and 31 | months are represented as text. This pair of functions allows you to either 32 | create your own, or retrieve from a standard list. The standard list is 33 | derived from ICU (\url{https://icu.unicode.org/}) via the stringi package. 34 | } 35 | \examples{ 36 | date_names(mon = LETTERS[1:12], day = letters[1:7]) 37 | date_names_lang("en") 38 | date_names_lang("ko") 39 | date_names_lang("fr") 40 | } 41 | -------------------------------------------------------------------------------- /man/locale.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/locale.R 3 | \name{locale} 4 | \alias{locale} 5 | \alias{default_locale} 6 | \title{Create locales} 7 | \usage{ 8 | locale( 9 | date_names = "en", 10 | date_format = "\%AD", 11 | time_format = "\%AT", 12 | decimal_mark = ".", 13 | grouping_mark = ",", 14 | tz = "UTC", 15 | encoding = "UTF-8" 16 | ) 17 | 18 | default_locale() 19 | } 20 | \arguments{ 21 | \item{date_names}{Character representations of day and month names. Either 22 | the language code as string (passed on to \code{\link[=date_names_lang]{date_names_lang()}}) 23 | or an object created by \code{\link[=date_names]{date_names()}}.} 24 | 25 | \item{date_format, time_format}{Default date and time formats.} 26 | 27 | \item{decimal_mark, grouping_mark}{Symbols used to indicate the decimal 28 | place, and to chunk larger numbers. Decimal mark can only be \verb{,} or 29 | \code{.}.} 30 | 31 | \item{tz}{Default tz.
This is used both for input (if the time zone isn't 32 | present in individual strings), and for output (to control the default 33 | display). The default is to use "UTC", a time zone that does not use 34 | daylight savings time (DST) and hence is typically most useful for data. 35 | The absence of time zones makes it approximately 50x faster to generate 36 | UTC times than any other time zone. 37 | 38 | Use \code{""} to use the system default time zone, but beware that this 39 | will not be reproducible across systems. 40 | 41 | For a complete list of possible time zones, see \code{\link[=OlsonNames]{OlsonNames()}}. 42 | Americans, note that "EST" is a Canadian time zone that does not have 43 | DST. It is \emph{not} Eastern Standard Time. It's better to use 44 | "US/Eastern", "US/Central" etc.} 45 | 46 | \item{encoding}{Default encoding. This only affects how the file is 47 | read - meltr always converts the output to UTF-8.} 48 | } 49 | \value{ 50 | A locale object 51 | } 52 | \description{ 53 | A locale object tries to capture all the defaults that can vary between 54 | countries. You set the locale once, and the details are automatically 55 | passed down to the column parsers. The defaults have been chosen to 56 | match R (i.e. US English) as closely as possible. See 57 | \code{vignette("locales")} for more details. 58 | } 59 | \examples{ 60 | locale() 61 | locale("fr") 62 | 63 | # South American locale 64 | locale("es", decimal_mark = ",") 65 | } 66 | -------------------------------------------------------------------------------- /man/melt_delim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/melt_delim.R 3 | \name{melt_delim} 4 | \alias{melt_delim} 5 | \alias{melt_csv} 6 | \alias{melt_csv2} 7 | \alias{melt_tsv} 8 | \title{Return melted data for each token in a delimited file (including csv & tsv)} 9 | \usage{ 10 | melt_delim( 11 | file, 12 | delim, 13 | quote = "\\"", 14 | escape_backslash = FALSE, 15 | escape_double = TRUE, 16 | locale = default_locale(), 17 | na = c("", "NA"), 18 | quoted_na = TRUE, 19 | comment = "", 20 | trim_ws = FALSE, 21 | skip = 0, 22 | n_max = Inf, 23 | progress = show_progress(), 24 | skip_empty_rows = FALSE 25 | ) 26 | 27 | melt_csv( 28 | file, 29 | locale = default_locale(), 30 | na = c("", "NA"), 31 | quoted_na = TRUE, 32 | quote = "\\"", 33 | comment = "", 34 | trim_ws = TRUE, 35 | skip = 0, 36 | n_max = Inf, 37 | progress = show_progress(), 38 | skip_empty_rows = FALSE 39 | ) 40 | 41 | melt_csv2( 42 | file, 43 | locale = default_locale(), 44 | na = c("", "NA"), 45 | quoted_na = TRUE, 46 | quote = "\\"", 47 | comment = "", 48 | trim_ws = TRUE, 49 | skip = 0, 50 | n_max = Inf, 51 | progress = show_progress(), 52 | skip_empty_rows = FALSE 53 | ) 54 | 55 | melt_tsv( 56 | file, 57 | locale = default_locale(), 58 | na = c("", "NA"), 59 | quoted_na = TRUE, 60 | quote = "\\"", 61 | comment = "", 62 | trim_ws = TRUE, 63 | skip = 0, 64 | n_max = Inf, 65 | progress = show_progress(), 66 | skip_empty_rows = FALSE 67 | ) 68 | } 69 | \arguments{ 70 | \item{file}{Either a path to a file, a connection, or literal data 71 | (either a single string or a raw vector). 72 | 73 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will 74 | be automatically uncompressed. Files starting with \verb{http://}, 75 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically 76 | downloaded.
Remote gz files can also be automatically downloaded and 77 | decompressed. 78 | 79 | Literal data is most useful for examples and tests. To be recognised as 80 | literal data, the input must be either wrapped with \code{I()}, be a string 81 | containing at least one new line, or be a vector containing at least one 82 | string with a new line. 83 | 84 | Using a value of \code{\link[readr:clipboard]{clipboard()}} will read from the system clipboard.} 85 | 86 | \item{delim}{Single character used to separate fields within a record.} 87 | 88 | \item{quote}{Single character used to quote strings.} 89 | 90 | \item{escape_backslash}{Does the file use backslashes to escape special 91 | characters? This is more general than \code{escape_double} as backslashes 92 | can be used to escape the delimiter character, the quote character, or 93 | to add special characters like \verb{\\\\n}.} 94 | 95 | \item{escape_double}{Does the file escape quotes by doubling them? 96 | i.e. If this option is \code{TRUE}, the value \verb{""""} represents 97 | a single quote, \verb{\\"}.} 98 | 99 | \item{locale}{The locale controls defaults that vary from place to place. 100 | The default locale is US-centric (like R), but you can use 101 | \code{\link[readr:locale]{locale()}} to create your own locale that controls things like 102 | the default time zone, encoding, decimal mark, big mark, and day/month 103 | names.} 104 | 105 | \item{na}{Character vector of strings to interpret as missing values. Set this 106 | option to \code{character()} to indicate no missing values.} 107 | 108 | \item{quoted_na}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Should missing values 109 | inside quotes be treated as missing values (the default) or strings. This 110 | parameter is soft deprecated as of readr 2.0.0.} 111 | 112 | \item{comment}{A string used to identify comments. Any text after the 113 | comment characters will be silently ignored.} 114 | 115 | \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from 116 | each field before parsing it?} 117 | 118 | \item{skip}{Number of lines to skip before reading data. If \code{comment} is 119 | supplied any commented lines are ignored \emph{after} skipping.} 120 | 121 | \item{n_max}{Maximum number of lines to read.} 122 | 123 | \item{progress}{Display a progress bar? By default it will only display 124 | in an interactive session and not while knitting a document. The automatic 125 | progress bar can be disabled by setting option \code{readr.show_progress} to 126 | \code{FALSE}.} 127 | 128 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this 129 | option is \code{TRUE} then blank rows will not be represented at all. If it is 130 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.} 131 | } 132 | \value{ 133 | A \code{\link[=tibble]{tibble()}} of four columns: 134 | \itemize{ 135 | \item \code{row}, the row that the token comes from in the original file 136 | \item \code{col}, the column that the token comes from in the original file 137 | \item \code{data_type}, the data type of the token, e.g. \code{"integer"}, \code{"character"}, 138 | \code{"date"}, guessed in a similar way to the \code{guess_parser()} function. 139 | \item \code{value}, the token itself as a character string, unchanged from its 140 | representation in the original file. 
141 | } 142 | 143 | If there are parsing problems, a warning tells you 144 | how many, and you can retrieve the details with \code{\link[=problems]{problems()}}. 145 | } 146 | \description{ 147 | For certain non-rectangular data formats, it can be useful to parse the data 148 | into a melted format where each row represents a single token. 149 | } 150 | \details{ 151 | \code{melt_csv()} and \code{melt_tsv()} are special cases of the general 152 | \code{melt_delim()}. They're useful for reading the most common types of 153 | flat file data, comma separated values and tab separated values, 154 | respectively. \code{melt_csv2()} uses \verb{;} for the field separator and \verb{,} for the 155 | decimal point. This is common in some European countries. 156 | } 157 | \examples{ 158 | # Input sources ------------------------------------------------------------- 159 | # Read from a path 160 | melt_csv(meltr_example("mtcars.csv")) 161 | \dontrun{ 162 | melt_csv("https://github.com/tidyverse/readr/raw/master/inst/extdata/mtcars.csv") 163 | } 164 | 165 | # Or directly from a string (must contain a newline) 166 | melt_csv("x,y\n1,2\n3,4") 167 | 168 | # To import empty cells as 'empty' rather than `NA` 169 | melt_csv("x,y\n,NA,\"\",''", na = "NA") 170 | 171 | # File types ---------------------------------------------------------------- 172 | melt_csv("a,b\n1.0,2.0") 173 | melt_csv2("a;b\n1,0;2,0") 174 | melt_tsv("a\tb\n1.0\t2.0") 175 | melt_delim("a|b\n1.0|2.0", delim = "|") 176 | } 177 | \seealso{ 178 | \code{\link[readr:read_delim]{readr::read_delim()}} for the conventional way to read rectangular data 179 | from delimited files. 180 | } 181 | -------------------------------------------------------------------------------- /man/melt_delim_chunked.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/melt_delim_chunked.R 3 | \name{melt_delim_chunked} 4 | \alias{melt_delim_chunked} 5 | \alias{melt_csv_chunked} 6 | \alias{melt_csv2_chunked} 7 | \alias{melt_tsv_chunked} 8 | \title{Melt a delimited file by chunks} 9 | \usage{ 10 | melt_delim_chunked( 11 | file, 12 | callback, 13 | chunk_size = 10000, 14 | delim, 15 | quote = "\\"", 16 | escape_backslash = FALSE, 17 | escape_double = TRUE, 18 | locale = default_locale(), 19 | na = c("", "NA"), 20 | quoted_na = TRUE, 21 | comment = "", 22 | trim_ws = FALSE, 23 | skip = 0, 24 | progress = show_progress(), 25 | skip_empty_rows = FALSE 26 | ) 27 | 28 | melt_csv_chunked( 29 | file, 30 | callback, 31 | chunk_size = 10000, 32 | locale = default_locale(), 33 | na = c("", "NA"), 34 | quoted_na = TRUE, 35 | quote = "\\"", 36 | comment = "", 37 | trim_ws = TRUE, 38 | skip = 0, 39 | progress = show_progress(), 40 | skip_empty_rows = FALSE 41 | ) 42 | 43 | melt_csv2_chunked( 44 | file, 45 | callback, 46 | chunk_size = 10000, 47 | locale = default_locale(), 48 | na = c("", "NA"), 49 | quoted_na = TRUE, 50 | quote = "\\"", 51 | comment = "", 52 | trim_ws = TRUE, 53 | skip = 0, 54 | progress = show_progress(), 55 | skip_empty_rows = FALSE 56 | ) 57 | 58 | melt_tsv_chunked( 59 | file, 60 | callback, 61 | chunk_size = 10000, 62 | locale = default_locale(), 63 | na = c("", "NA"), 64 | quoted_na = TRUE, 65 | quote = "\\"", 66 | comment = "", 67 | trim_ws = TRUE, 68 | skip = 0, 69 | progress = show_progress(), 70 | skip_empty_rows = FALSE 71 | ) 72 | } 73 | \arguments{ 74 | \item{file}{Either a path to a file, a connection, or literal data 75 | (either a single 
string or a raw vector). 76 | 77 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will 78 | be automatically uncompressed. Files starting with \verb{http://}, 79 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically 80 | downloaded. Remote gz files can also be automatically downloaded and 81 | decompressed. 82 | 83 | Literal data is most useful for examples and tests. To be recognised as 84 | literal data, the input must be either wrapped with \code{I()}, be a string 85 | containing at least one new line, or be a vector containing at least one 86 | string with a new line. 87 | 88 | Using a value of \code{\link[readr:clipboard]{clipboard()}} will read from the system clipboard.} 89 | 90 | \item{callback}{A callback function to call on each chunk} 91 | 92 | \item{chunk_size}{The number of rows to include in each chunk} 93 | 94 | \item{delim}{Single character used to separate fields within a record.} 95 | 96 | \item{quote}{Single character used to quote strings.} 97 | 98 | \item{escape_backslash}{Does the file use backslashes to escape special 99 | characters? This is more general than \code{escape_double} as backslashes 100 | can be used to escape the delimiter character, the quote character, or 101 | to add special characters like \verb{\\\\n}.} 102 | 103 | \item{escape_double}{Does the file escape quotes by doubling them? 104 | i.e. If this option is \code{TRUE}, the value \verb{""""} represents 105 | a single quote, \verb{\\"}.} 106 | 107 | \item{locale}{The locale controls defaults that vary from place to place. 108 | The default locale is US-centric (like R), but you can use 109 | \code{\link[readr:locale]{locale()}} to create your own locale that controls things like 110 | the default time zone, encoding, decimal mark, big mark, and day/month 111 | names.} 112 | 113 | \item{na}{Character vector of strings to interpret as missing values. Set this 114 | option to \code{character()} to indicate no missing values.} 115 | 116 | \item{quoted_na}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Should missing values 117 | inside quotes be treated as missing values (the default) or strings. This 118 | parameter is soft deprecated as of readr 2.0.0.} 119 | 120 | \item{comment}{A string used to identify comments. Any text after the 121 | comment characters will be silently ignored.} 122 | 123 | \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from 124 | each field before parsing it?} 125 | 126 | \item{skip}{Number of lines to skip before reading data. If \code{comment} is 127 | supplied any commented lines are ignored \emph{after} skipping.} 128 | 129 | \item{progress}{Display a progress bar? By default it will only display 130 | in an interactive session and not while knitting a document. The automatic 131 | progress bar can be disabled by setting option \code{readr.show_progress} to 132 | \code{FALSE}.} 133 | 134 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this 135 | option is \code{TRUE} then blank rows will not be represented at all. 
If it is 136 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.} 137 | } 138 | \value{ 139 | A \code{\link[=tibble]{tibble()}} of four columns: 140 | \itemize{ 141 | \item \code{row}, the row that the token comes from in the original file 142 | \item \code{col}, the column that the token comes from in the original file 143 | \item \code{data_type}, the data type of the token, e.g. \code{"integer"}, \code{"character"}, 144 | \code{"date"}, guessed in a similar way to the \code{guess_parser()} function. 145 | \item \code{value}, the token itself as a character string, unchanged from its 146 | representation in the original file. 147 | } 148 | 149 | If there are parsing problems, a warning tells you 150 | how many, and you can retrieve the details with \code{\link[=problems]{problems()}}. 151 | } 152 | \description{ 153 | For certain non-rectangular data formats, it can be useful to parse the data 154 | into a melted format where each row represents a single token. 155 | } 156 | \details{ 157 | \code{melt_delim_chunked()} and the specialisations \code{melt_csv_chunked()}, 158 | \code{melt_csv2_chunked()} and \code{melt_tsv_chunked()} read files by a chunk of rows 159 | at a time, executing a given function on one chunk before reading the next. 160 | } 161 | \examples{ 162 | # Keep just the integer tokens from each chunk 163 | f <- function(x, pos) subset(x, data_type == "integer") 164 | melt_csv_chunked(meltr_example("mtcars.csv"), DataFrameCallback$new(f), chunk_size = 5) 165 | } 166 | \seealso{ 167 | Other chunked: 168 | \code{\link{callback}} 169 | } 170 | \concept{chunked} 171 | \keyword{internal} 172 | -------------------------------------------------------------------------------- /man/melt_fwf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/melt_fwf.R 3 | \name{melt_fwf} 4 | \alias{melt_fwf} 5 | \alias{fwf_empty} 6 | \alias{fwf_widths} 7 | \alias{fwf_positions} 8 | \alias{fwf_cols} 9 | \title{Return melted data for each token in a fixed width file} 10 | \usage{ 11 | melt_fwf( 12 | file, 13 | col_positions, 14 | locale = default_locale(), 15 | na = c("", "NA"), 16 | comment = "", 17 | trim_ws = TRUE, 18 | skip = 0, 19 | n_max = Inf, 20 | progress = show_progress(), 21 | skip_empty_rows = FALSE 22 | ) 23 | 24 | fwf_empty( 25 | file, 26 | skip = 0, 27 | skip_empty_rows = FALSE, 28 | col_names = NULL, 29 | comment = "", 30 | n = 100L 31 | ) 32 | 33 | fwf_widths(widths, col_names = NULL) 34 | 35 | fwf_positions(start, end = NULL, col_names = NULL) 36 | 37 | fwf_cols(...) 38 | } 39 | \arguments{ 40 | \item{file}{Either a path to a file, a connection, or literal data 41 | (either a single string or a raw vector). 42 | 43 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will 44 | be automatically uncompressed. Files starting with \verb{http://}, 45 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically 46 | downloaded. Remote gz files can also be automatically downloaded and 47 | decompressed. 48 | 49 | Literal data is most useful for examples and tests. To be recognised as 50 | literal data, the input must be either wrapped with \code{I()}, be a string 51 | containing at least one new line, or be a vector containing at least one 52 | string with a new line.
53 | 54 | Using a value of \code{\link[readr:clipboard]{clipboard()}} will read from the system clipboard.} 55 | 56 | \item{col_positions}{Column positions, as created by \code{\link[=fwf_empty]{fwf_empty()}}, 57 | \code{\link[=fwf_widths]{fwf_widths()}} or \code{\link[=fwf_positions]{fwf_positions()}}. To read in only selected fields, 58 | use \code{\link[=fwf_positions]{fwf_positions()}}. If the width of the last column is variable (a 59 | ragged fwf file), supply the last end position as NA.} 60 | 61 | \item{locale}{The locale controls defaults that vary from place to place. 62 | The default locale is US-centric (like R), but you can use 63 | \code{\link[readr:locale]{locale()}} to create your own locale that controls things like 64 | the default time zone, encoding, decimal mark, big mark, and day/month 65 | names.} 66 | 67 | \item{na}{Character vector of strings to interpret as missing values. Set this 68 | option to \code{character()} to indicate no missing values.} 69 | 70 | \item{comment}{A string used to identify comments. Any text after the 71 | comment characters will be silently ignored.} 72 | 73 | \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from 74 | each field before parsing it?} 75 | 76 | \item{skip}{Number of lines to skip before reading data.} 77 | 78 | \item{n_max}{Maximum number of lines to read.} 79 | 80 | \item{progress}{Display a progress bar? By default it will only display 81 | in an interactive session and not while knitting a document. The automatic 82 | progress bar can be disabled by setting option \code{readr.show_progress} to 83 | \code{FALSE}.} 84 | 85 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this 86 | option is \code{TRUE} then blank rows will not be represented at all. If it is 87 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.} 88 | 89 | \item{col_names}{Either NULL, or a character vector of column names.} 90 | 91 | \item{n}{Number of lines the tokenizer will read to determine file structure. By default 92 | it is set to 100.} 93 | 94 | \item{widths}{Width of each field. Use NA as width of last field when 95 | reading a ragged fwf file.} 96 | 97 | \item{start, end}{Starting and ending (inclusive) positions of each field. 98 | Use NA as last end field when reading a ragged fwf file.} 99 | 100 | \item{...}{If the first element is a data frame, 101 | then it must have all numeric columns and either one or two rows. 102 | The column names are the variable names. The column values are the 103 | variable widths if a length one vector, and if length two, variable start and end 104 | positions. The elements of \code{...} are used to construct a data frame 105 | with one or two rows as above.} 106 | } 107 | \value{ 108 | A \code{\link[=tibble]{tibble()}} of four columns: 109 | \itemize{ 110 | \item \code{row}, the row that the token comes from in the original file 111 | \item \code{col}, the column that the token comes from in the original file 112 | \item \code{data_type}, the data type of the token, e.g. \code{"integer"}, \code{"character"}, 113 | \code{"date"}, guessed in a similar way to the \code{guess_parser()} function. 114 | \item \code{value}, the token itself as a character string, unchanged from its 115 | representation in the original file. 116 | } 117 | 118 | If there are parsing problems, a warning tells you 119 | how many, and you can retrieve the details with \code{\link[=problems]{problems()}}.
120 | } 121 | \description{ 122 | For certain non-rectangular data formats, it can be useful to parse the data 123 | into a melted format where each row represents a single token. 124 | } 125 | \details{ 126 | \code{melt_fwf()} parses each token of a fixed width file into a single row, but 127 | it still requires that each field is in the same position in every row of the 128 | source file. 129 | } 130 | \examples{ 131 | fwf_sample <- meltr_example("fwf-sample.txt") 132 | writeLines(readLines(fwf_sample)) 133 | 134 | # You can specify column positions in several ways: 135 | # 1. Guess based on position of empty columns 136 | melt_fwf(fwf_sample, fwf_empty(fwf_sample, col_names = c("first", "last", "state", "ssn"))) 137 | # 2. A vector of field widths 138 | melt_fwf(fwf_sample, fwf_widths(c(20, 10, 12), c("name", "state", "ssn"))) 139 | # 3. Paired vectors of start and end positions 140 | melt_fwf(fwf_sample, fwf_positions(c(1, 30), c(10, 42), c("name", "ssn"))) 141 | # 4. Named arguments with start and end positions 142 | melt_fwf(fwf_sample, fwf_cols(name = c(1, 10), ssn = c(30, 42))) 143 | # 5. Named arguments with column widths 144 | melt_fwf(fwf_sample, fwf_cols(name = 20, state = 10, ssn = 12)) 145 | } 146 | \seealso{ 147 | \code{\link[=melt_table]{melt_table()}} to melt fixed width files where each 148 | column is separated by whitespace, and \code{\link[readr:read_fwf]{readr::read_fwf()}} for the conventional 149 | way to read rectangular data from fixed width files. 150 | } 151 | -------------------------------------------------------------------------------- /man/melt_table.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/melt_table.R 3 | \name{melt_table} 4 | \alias{melt_table} 5 | \alias{melt_table2} 6 | \title{Return melted data for each token in a whitespace-separated file} 7 | \usage{ 8 | melt_table( 9 | file, 10 | locale = default_locale(), 11 | na = "NA", 12 | skip = 0, 13 | n_max = Inf, 14 | guess_max = min(n_max, 1000), 15 | progress = show_progress(), 16 | comment = "", 17 | skip_empty_rows = FALSE 18 | ) 19 | 20 | melt_table2( 21 | file, 22 | locale = default_locale(), 23 | na = "NA", 24 | skip = 0, 25 | n_max = Inf, 26 | progress = show_progress(), 27 | comment = "", 28 | skip_empty_rows = FALSE 29 | ) 30 | } 31 | \arguments{ 32 | \item{file}{Either a path to a file, a connection, or literal data 33 | (either a single string or a raw vector). 34 | 35 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will 36 | be automatically uncompressed. Files starting with \verb{http://}, 37 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically 38 | downloaded. Remote gz files can also be automatically downloaded and 39 | decompressed. 40 | 41 | Literal data is most useful for examples and tests. To be recognised as 42 | literal data, the input must be either wrapped with \code{I()}, be a string 43 | containing at least one new line, or be a vector containing at least one 44 | string with a new line. 45 | 46 | Using a value of \code{\link[readr:clipboard]{clipboard()}} will read from the system clipboard.} 47 | 48 | \item{locale}{The locale controls defaults that vary from place to place.
49 | The default locale is US-centric (like R), but you can use 50 | \code{\link[readr:locale]{locale()}} to create your own locale that controls things like 51 | the default time zone, encoding, decimal mark, big mark, and day/month 52 | names.} 53 | 54 | \item{na}{Character vector of strings to interpret as missing values. Set this 55 | option to \code{character()} to indicate no missing values.} 56 | 57 | \item{skip}{Number of lines to skip before reading data.} 58 | 59 | \item{n_max}{Maximum number of lines to read.} 60 | 61 | \item{guess_max}{Maximum number of lines to use for guessing column types. 62 | See \code{vignette("column-types", package = "readr")} for more details.} 63 | 64 | \item{progress}{Display a progress bar? By default it will only display 65 | in an interactive session and not while knitting a document. The automatic 66 | progress bar can be disabled by setting option \code{readr.show_progress} to 67 | \code{FALSE}.} 68 | 69 | \item{comment}{A string used to identify comments. Any text after the 70 | comment characters will be silently ignored.} 71 | 72 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this 73 | option is \code{TRUE} then blank rows will not be represented at all. If it is 74 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.} 75 | } 76 | \value{ 77 | A \code{\link[=tibble]{tibble()}} of four columns: 78 | \itemize{ 79 | \item \code{row}, the row that the token comes from in the original file 80 | \item \code{col}, the column that the token comes from in the original file 81 | \item \code{data_type}, the data type of the token, e.g. \code{"integer"}, \code{"character"}, 82 | \code{"date"}, guessed in a similar way to the \code{guess_parser()} function. 83 | \item \code{value}, the token itself as a character string, unchanged from its 84 | representation in the original file. 85 | } 86 | 87 | If there are parsing problems, a warning tells you 88 | how many, and you can retrieve the details with \code{\link[=problems]{problems()}}. 89 | } 90 | \description{ 91 | For certain non-rectangular data formats, it can be useful to parse the data 92 | into a melted format where each row represents a single token. 93 | 94 | \code{melt_table()} and \code{melt_table2()} are designed to read the type of textual 95 | data where each column is separated by one (or more) columns of space. 96 | 97 | \code{melt_table2()} allows any number of whitespace characters between columns, 98 | and the lines can be of different lengths. 99 | 100 | \code{melt_table()} is more strict: each line must be the same length, 101 | and each field is in the same position in every line. It first finds empty 102 | columns and then parses like a fixed width file. 103 | } 104 | \examples{ 105 | # One corner from http://www.masseyratings.com/cf/compare.htm 106 | massey <- meltr_example("massey-rating.txt") 107 | cat(readLines(massey)) 108 | melt_table(massey) 109 | 110 | # Sample of 1978 fuel economy data from 111 | # http://www.fueleconomy.gov/feg/epadata/78data.zip 112 | epa <- meltr_example("epa78.txt") 113 | writeLines(readLines(epa)) 114 | melt_table(epa) 115 | } 116 | \seealso{ 117 | \code{\link[=melt_fwf]{melt_fwf()}} to melt fixed width files where each column 118 | is not separated by whitespace. \code{melt_fwf()} is also useful for reading 119 | tabular data with non-standard formatting. \code{\link[readr:read_table]{readr::read_table()}} is the 120 | conventional way to read tabular data from whitespace-separated files.
121 | } 122 | -------------------------------------------------------------------------------- /man/meltr_example.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/example.R 3 | \name{meltr_example} 4 | \alias{meltr_example} 5 | \title{Get path to meltr example} 6 | \usage{ 7 | meltr_example(file = NULL) 8 | } 9 | \arguments{ 10 | \item{file}{Name of file. If \code{NULL}, the example files will be listed.} 11 | } 12 | \value{ 13 | A file path or a vector of file names 14 | } 15 | \description{ 16 | meltr comes bundled with a number of sample files in its \code{inst/extdata} 17 | directory. This function makes them easy to access. 18 | } 19 | \examples{ 20 | meltr_example() 21 | meltr_example("mtcars.csv") 22 | } 23 | -------------------------------------------------------------------------------- /man/problems.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/problems.R 3 | \name{problems} 4 | \alias{problems} 5 | \alias{stop_for_problems} 6 | \title{Retrieve parsing problems} 7 | \usage{ 8 | problems(x = .Last.value) 9 | 10 | stop_for_problems(x) 11 | } 12 | \arguments{ 13 | \item{x}{A data frame (from \verb{read_*()}) or a vector 14 | (from \verb{parse_*()}).} 15 | } 16 | \value{ 17 | A data frame with one row for each problem and four columns: 18 | \item{row,col}{Row and column of problem} 19 | \item{expected}{What readr expected to find} 20 | \item{actual}{What it actually got} 21 | } 22 | \description{ 23 | Readr functions will only throw an error if parsing fails in an unrecoverable 24 | way. However, there are lots of potential problems that you might want to 25 | know about - these are stored in the \code{problems} attribute of the 26 | output, which you can easily access with this function. 27 | \code{stop_for_problems()} will throw an error if there are any parsing 28 | problems: this is useful for automated scripts where you want to throw 29 | an error as soon as you encounter a problem. 30 | } 31 | \examples{ 32 | if (requireNamespace("readr")) { 33 | x <- readr::parse_integer(c("1X", "blah", "3")) 34 | problems(x) 35 | 36 | y <- readr::parse_integer(c("1", "2", "3")) 37 | problems(y) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /man/show_progress.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{show_progress} 4 | \alias{show_progress} 5 | \title{Determine whether progress bars should be shown} 6 | \usage{ 7 | show_progress() 8 | } 9 | \value{ 10 | A logical value 11 | } 12 | \description{ 13 | Progress bars are shown \emph{unless} one of the following is \code{TRUE} 14 | \itemize{ 15 | \item The bar is explicitly disabled by setting \code{options(readr.show_progress = FALSE)} 16 | \item The code is run in a non-interactive session (\code{interactive()} is \code{FALSE}). 17 | \item The code is run in an RStudio notebook chunk. 18 | \item The code is run by knitr / rmarkdown.
19 | } 20 | } 21 | \examples{ 22 | show_progress() 23 | } 24 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /src/Collector.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/list.hpp" 2 | 3 | #include "Collector.h" 4 | #include "LocaleInfo.h" 5 | #include "QiParsers.h" 6 | #include "utils.h" 7 | 8 | CollectorPtr Collector::create(const cpp11::list& spec, LocaleInfo* pLocale) { 9 | std::string subclass(cpp11::as_cpp<cpp11::strings>(spec.attr("class"))[0]); 10 | 11 | 12 | if (subclass == "collector_double") { 13 | return CollectorPtr(new CollectorDouble(pLocale->decimalMark_)); 14 | } 15 | if (subclass == "collector_character") { 16 | return CollectorPtr(new CollectorCharacter(&pLocale->encoder_)); 17 | } 18 | 19 | cpp11::stop("Unsupported column type '%s'", subclass.c_str()); 20 | return CollectorPtr(new CollectorSkip()); 21 | } 22 | 23 | std::vector<CollectorPtr> 24 | collectorsCreate(const cpp11::list& specs, LocaleInfo* pLocale) { 25 | std::vector<CollectorPtr> collectors; 26 | for (auto spec : specs) { 27 | CollectorPtr col(Collector::create(SEXP(spec), pLocale)); 28 | collectors.push_back(col); 29 | } 30 | 31 | return collectors; 32 | } 33 | 34 | // Implementations ------------------------------------------------------------ 35 | 36 | void CollectorCharacter::setValue(int i, const Token& t) { 37 | switch (t.type()) { 38 | case TOKEN_STRING: { 39 | std::string buffer; 40 | SourceIterators string = t.getString(&buffer); 41 | 42 | if (t.hasNull()) { 43 | warn(t.row(), t.col(), "", "embedded null"); 44 | } 45 | 46 | SET_STRING_ELT( 47 | column_, 48 | i, 49 | pEncoder_->makeSEXP(string.first, string.second, t.hasNull())); 50 | break; 51 | }; 52 | case TOKEN_MISSING: 53 | SET_STRING_ELT(column_, i, NA_STRING); 54 | break; 55 | case TOKEN_EMPTY: 56 | SET_STRING_ELT(column_, i, Rf_mkCharCE("", CE_UTF8)); 57 | break; 58 | case TOKEN_EOF: 59 | cpp11::stop("Invalid token"); 60 | } 61 | } 62 | 63 | void CollectorCharacter::setValue(int i, const std::string& s) { 64 | SET_STRING_ELT(column_, i, Rf_mkCharCE(s.c_str(), CE_UTF8)); 65 | } 66 | 67 | void CollectorDouble::setValue(int i, size_t st) { REAL(column_)[i] = st; } 68 | -------------------------------------------------------------------------------- /src/Collector.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_COLLECTOR_H_ 2 | #define MELTR_COLLECTOR_H_ 3 | 4 | #include "cpp11/doubles.hpp" 5 | #include "cpp11/integers.hpp" 6 | #include "cpp11/list.hpp" 7 | #include "cpp11/logicals.hpp" 8 | #include "cpp11/strings.hpp" 9 | 10 | #include "DateTimeParser.h" 11 | #include "Iconv.h" 12 | #include "LocaleInfo.h" 13 | #include "Token.h" 14 | #include "Warnings.h" 15 | #include <map> 16 | #include <memory> 17 | 18 | class Collector; 19 | typedef std::shared_ptr<Collector> CollectorPtr; 20 | 21 | class Collector { 22 | protected: 23 | cpp11::sexp column_; 24 | Warnings* pWarnings_; 25 | 26 | int n_; 27 | 28 | public: 29 | Collector(SEXP column, Warnings* pWarnings = NULL) 30 | : column_(column), pWarnings_(pWarnings), n_(0) {} 31 | 32 | virtual ~Collector(){}; 33 | 34 | virtual void setValue(int i, const Token& t) = 0; 35 | virtual void setValue(int /* unused */, const std::string& /* unused */){}; // nocov 36 | virtual void setValue(int /* unused */, size_t /* unused */
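/* size_t overload, implemented by CollectorDouble in Collector.cpp: writes a raw size_t value straight into a double column */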
){}; // nocov 37 | 38 | virtual cpp11::sexp vector() { return column_; }; 39 | 40 | virtual bool skip() { return false; } 41 | 42 | int size() { return n_; } 43 | 44 | void resize(int n) { 45 | if (n == n_) 46 | return; 47 | 48 | if (column_ == R_NilValue) 49 | return; 50 | 51 | #if R_VERSION >= R_Version(3, 4, 0) 52 | if (n > 0 && n < n_) { 53 | SET_TRUELENGTH(column_, n_); 54 | SETLENGTH(column_, n); 55 | SET_GROWABLE_BIT(column_); 56 | } else { 57 | column_ = Rf_lengthgets(column_, n); 58 | } 59 | #else 60 | column_ = Rf_lengthgets(column_, n); 61 | #endif 62 | 63 | n_ = n; 64 | } 65 | 66 | void clear() { resize(0); } 67 | 68 | void setWarnings(Warnings* pWarnings) { pWarnings_ = pWarnings; } 69 | 70 | inline void warn(int row, int col, std::string expected, std::string actual) { 71 | if (pWarnings_ == NULL) { 72 | cpp11::warning( 73 | "[%i, %i]: expected %s, but got '%s'", 74 | row + 1, 75 | col + 1, 76 | expected.c_str(), 77 | actual.c_str()); 78 | return; 79 | } 80 | 81 | pWarnings_->addWarning(row, col, expected, actual); 82 | } 83 | inline void 84 | warn(int row, int col, std::string expected, SourceIterators actual) { 85 | warn(row, col, expected, std::string(actual.first, actual.second)); 86 | } 87 | 88 | static CollectorPtr create(const cpp11::list& spec, LocaleInfo* pLocale); 89 | }; 90 | 91 | // Character ------------------------------------------------------------------- 92 | 93 | class CollectorCharacter : public Collector { 94 | Iconv* pEncoder_; 95 | 96 | public: 97 | CollectorCharacter(Iconv* pEncoder) 98 | : Collector(cpp11::writable::strings(R_xlen_t(0))), pEncoder_(pEncoder) {} 99 | void setValue(int i, const Token& t); 100 | void setValue(int i, const std::string& s); 101 | }; 102 | 103 | // Date ------------------------------------------------------------------------ 104 | 105 | class CollectorDate : public Collector { 106 | std::string format_; 107 | DateTimeParser parser_; 108 | 109 | public: 110 | CollectorDate(LocaleInfo* pLocale, const std::string& format) 111 | : Collector(cpp11::writable::doubles(R_xlen_t(0))), 112 | format_(format), 113 | parser_(pLocale) {} 114 | 115 | void setValue(int i, const Token& t); 116 | 117 | cpp11::sexp vector() { 118 | column_.attr("class") = "Date"; 119 | return column_; 120 | }; 121 | }; 122 | 123 | // Date time ------------------------------------------------------------------- 124 | 125 | class CollectorDateTime : public Collector { 126 | std::string format_; 127 | DateTimeParser parser_; 128 | std::string tz_; 129 | 130 | public: 131 | CollectorDateTime(LocaleInfo* pLocale, const std::string& format) 132 | : Collector(cpp11::writable::doubles(R_xlen_t(0))), 133 | format_(format), 134 | parser_(pLocale), 135 | tz_(pLocale->tz_) {} 136 | 137 | void setValue(int i, const Token& t); 138 | 139 | cpp11::sexp vector() { 140 | column_.attr("class") = {"POSIXct", "POSIXt"}; 141 | column_.attr("tzone") = tz_; 142 | return column_; 143 | }; 144 | }; 145 | 146 | class CollectorDouble : public Collector { 147 | 148 | public: 149 | CollectorDouble(char /* unused */) 150 | : Collector(cpp11::writable::doubles(R_xlen_t(0))) {} 151 | void setValue(int /* unused */, const Token& /* unused */) { /* unused */ }; 152 | void setValue(int i, size_t st); 153 | }; 154 | 155 | class CollectorFactor : public Collector { 156 | Iconv* pEncoder_; 157 | std::vector levels_; 158 | std::map levelset_; 159 | bool ordered_, implicitLevels_, includeNa_; 160 | std::string buffer_; 161 | 162 | void insert(int i, const cpp11::r_string& str, const Token& t); 163 | 
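// [Editor's sketch] CollectorFactor interns each parsed string as an integer
// code through the levels_/levelset_ members above. The insert() helper is
// defined in Collector.cpp and is not shown in this header; assuming implicit
// levels, the core interning pattern is roughly the following (a hedged
// reconstruction from the declarations above, not the verbatim implementation):
//
//   auto it = levelset_.find(str);
//   if (it == levelset_.end()) {              // first occurrence: new level
//     int code = static_cast<int>(levels_.size());
//     levels_.push_back(str);
//     levelset_.insert(std::make_pair(str, code));
//     INTEGER(column_)[i] = code + 1;         // R factor codes are 1-based
//   } else {
//     INTEGER(column_)[i] = it->second + 1;   // existing level
//   }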
164 | public: 165 | CollectorFactor( 166 | Iconv* pEncoder, cpp11::sexp levels, bool ordered, bool includeNa) 167 | : Collector(cpp11::writable::integers(R_xlen_t(0))), 168 | pEncoder_(pEncoder), 169 | ordered_(ordered), 170 | includeNa_(includeNa) { 171 | implicitLevels_ = levels == R_NilValue; 172 | if (!implicitLevels_) { 173 | cpp11::strings lvls(levels); 174 | int n = lvls.size(); 175 | 176 | for (int i = 0; i < n; ++i) { 177 | cpp11::r_string std_level; 178 | if (STRING_ELT(lvls, i) != NA_STRING) { 179 | const char* level = Rf_translateCharUTF8(STRING_ELT(lvls, i)); 180 | std_level = level; 181 | } else { 182 | std_level = NA_STRING; 183 | } 184 | levels_.push_back(std_level); 185 | levelset_.insert(std::make_pair(std_level, i)); 186 | } 187 | } 188 | } 189 | void setValue(int i, const Token& t); 190 | 191 | cpp11::sexp vector() { 192 | if (ordered_) { 193 | column_.attr("class") = {"ordered", "factor"}; 194 | } else { 195 | column_.attr("class") = "factor"; 196 | } 197 | 198 | int n = levels_.size(); 199 | cpp11::writable::strings levels(n); 200 | for (int i = 0; i < n; ++i) { 201 | levels[i] = levels_[i]; 202 | } 203 | 204 | column_.attr("levels") = levels; 205 | return column_; 206 | }; 207 | }; 208 | 209 | class CollectorInteger : public Collector { 210 | public: 211 | CollectorInteger() : Collector(cpp11::writable::integers(R_xlen_t(0))) {} 212 | void setValue(int i, const Token& t); 213 | }; 214 | 215 | class CollectorLogical : public Collector { 216 | public: 217 | CollectorLogical() : Collector(cpp11::writable::logicals(R_xlen_t(0))) {} 218 | void setValue(int i, const Token& t); 219 | }; 220 | 221 | class CollectorNumeric : public Collector { 222 | char decimalMark_, groupingMark_; 223 | 224 | public: 225 | CollectorNumeric(char decimalMark, char groupingMark) 226 | : Collector(cpp11::writable::doubles(R_xlen_t(0))), 227 | decimalMark_(decimalMark), 228 | groupingMark_(groupingMark) {} 229 | void setValue(int i, const Token& t); 230 | bool isNum(char c); 231 | }; 232 | 233 | // Time --------------------------------------------------------------------- 234 | 235 | class CollectorTime : public Collector { 236 | std::string format_; 237 | DateTimeParser parser_; 238 | 239 | public: 240 | CollectorTime(LocaleInfo* pLocale, const std::string& format) 241 | : Collector(cpp11::writable::doubles(R_xlen_t(0))), 242 | format_(format), 243 | parser_(pLocale) {} 244 | 245 | void setValue(int i, const Token& t); 246 | 247 | cpp11::sexp vector() { 248 | column_.attr("class") = {"hms", "difftime"}; 249 | column_.attr("units") = "secs"; 250 | return column_; 251 | }; 252 | }; 253 | 254 | // Skip --------------------------------------------------------------------- 255 | 256 | class CollectorSkip : public Collector { 257 | public: 258 | CollectorSkip() : Collector(R_NilValue) {} 259 | void setValue(int /* unused */, const Token& /* unused */) {} 260 | bool skip() { return true; } 261 | }; 262 | 263 | // Raw ------------------------------------------------------------------------- 264 | class CollectorRaw : public Collector { 265 | public: 266 | CollectorRaw() : Collector(cpp11::writable::list(static_cast(0))) {} 267 | void setValue(int i, const Token& t); 268 | }; 269 | 270 | // Helpers --------------------------------------------------------------------- 271 | 272 | std::vector 273 | collectorsCreate(const cpp11::list& specs, LocaleInfo* pLocale); 274 | void collectorsResize(std::vector& collectors, int n); 275 | void collectorsClear(std::vector& collectors); 276 | std::string 
collectorGuess( 277 | const cpp11::strings& input, 278 | const cpp11::list& locale_, 279 | bool guessInteger = false); 280 | 281 | #endif 282 | -------------------------------------------------------------------------------- /src/CollectorGuess.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/R.hpp" 2 | #include "cpp11/list.hpp" 3 | #include "cpp11/strings.hpp" 4 | 5 | #include "DateTimeParser.h" 6 | #include "LocaleInfo.h" 7 | #include "QiParsers.h" 8 | #include "utils.h" 9 | 10 | typedef bool (*canParseFun)(const std::string&, LocaleInfo* pLocale); 11 | 12 | bool canParse( 13 | const cpp11::strings& x, const canParseFun& canParse, LocaleInfo* pLocale) { 14 | for (const auto & i : x) { 15 | if (i == NA_STRING) { 16 | continue; 17 | } 18 | 19 | if (i.size() == 0) { 20 | continue; 21 | } 22 | 23 | if (!canParse(std::string(i), pLocale)) { 24 | return false; 25 | } 26 | } 27 | return true; 28 | } 29 | 30 | bool allMissing(const cpp11::strings& x) { 31 | for (const auto & i : x) { 32 | if (i != NA_STRING && i.size() > 0) { 33 | return false; 34 | } 35 | } 36 | return true; 37 | } 38 | 39 | bool isLogical(const std::string& x, LocaleInfo* /*unused*/) { 40 | const char* const str = x.data(); 41 | bool res = isLogical(str, str + x.size()); 42 | return res; 43 | } 44 | 45 | bool isNumber(const std::string& x, LocaleInfo* pLocale) { 46 | // Leading zero not followed by decimal mark 47 | if (x[0] == '0' && x.size() > 1 && x[1] != pLocale->decimalMark_) { 48 | return false; 49 | } 50 | 51 | double res = 0; 52 | std::string::const_iterator begin = x.begin(); 53 | 54 | std::string::const_iterator end = x.end(); 55 | 56 | bool ok = parseNumber( 57 | pLocale->decimalMark_, pLocale->groupingMark_, begin, end, res); 58 | return ok && begin == x.begin() && end == x.end(); 59 | } 60 | 61 | bool isInteger(const std::string& x, LocaleInfo* /*unused*/) { 62 | // Leading zero 63 | if (x[0] == '0' && x.size() > 1) { 64 | return false; 65 | } 66 | 67 | double res = 0; 68 | std::string::const_iterator begin = x.begin(); 69 | 70 | std::string::const_iterator end = x.end(); 71 | 72 | return parseInt(begin, end, res) && begin == end; 73 | } 74 | 75 | bool isDouble(const std::string& x, LocaleInfo* pLocale) { 76 | // Leading zero not followed by decimal mark 77 | if (x[0] == '0' && x.size() > 1 && x[1] != pLocale->decimalMark_) { 78 | return false; 79 | } 80 | 81 | double res = 0; 82 | const char* begin = x.c_str(); 83 | const char* end = begin + x.size(); 84 | 85 | return parseDouble(pLocale->decimalMark_, begin, end, res) && 86 | end == begin + x.size(); 87 | } 88 | 89 | bool isTime(const std::string& x, LocaleInfo* pLocale) { 90 | DateTimeParser parser(pLocale); 91 | 92 | parser.setDate(x.c_str()); 93 | return parser.parseLocaleTime(); 94 | } 95 | 96 | bool isDate(const std::string& x, LocaleInfo* pLocale) { 97 | DateTimeParser parser(pLocale); 98 | 99 | parser.setDate(x.c_str()); 100 | return parser.parseLocaleDate(); 101 | } 102 | 103 | static bool isDateTime(const std::string& x, LocaleInfo* pLocale) { 104 | DateTimeParser parser(pLocale); 105 | 106 | parser.setDate(x.c_str()); 107 | bool ok = parser.parseISO8601(); 108 | 109 | if (!ok) { 110 | return false; 111 | } 112 | 113 | if (!parser.compactDate()) { 114 | return true; 115 | } 116 | 117 | // Values like 00014567 are unlikely to be dates, so don't guess 118 | return parser.year() > 999; 119 | } 120 | 121 | [[cpp11::register]] std::string collectorGuess( 122 | const cpp11::strings& input, 123 | const 
cpp11::list& locale_, 124 | bool guessInteger) { 125 | LocaleInfo locale(static_cast(locale_)); 126 | 127 | if (input.size() == 0) { 128 | return "character"; 129 | } 130 | 131 | if (allMissing(input)) { 132 | return "logical"; 133 | } 134 | 135 | // Work from strictest to most flexible 136 | if (canParse(input, isLogical, &locale)) { 137 | return "logical"; 138 | } 139 | if (guessInteger && canParse(input, isInteger, &locale)) { 140 | return "integer"; 141 | } 142 | if (canParse(input, isDouble, &locale)) { 143 | return "double"; 144 | } 145 | if (canParse(input, isNumber, &locale)) { 146 | return "number"; 147 | } 148 | if (canParse(input, isTime, &locale)) { 149 | return "time"; 150 | } 151 | if (canParse(input, isDate, &locale)) { 152 | return "date"; 153 | } 154 | if (canParse(input, isDateTime, &locale)) { 155 | return "datetime"; 156 | } 157 | 158 | // Otherwise can always parse as a character 159 | return "character"; 160 | } 161 | -------------------------------------------------------------------------------- /src/Iconv.cpp: -------------------------------------------------------------------------------- 1 | #include "Iconv.h" 2 | #include "cpp11/protect.hpp" 3 | #include 4 | 5 | Iconv::Iconv(const std::string& from, const std::string& to) { 6 | if (from == "UTF-8") { 7 | cd_ = nullptr; 8 | } else { 9 | cd_ = Riconv_open(to.c_str(), from.c_str()); 10 | if (cd_ == (void*)-1) { 11 | if (errno == EINVAL) { 12 | cpp11::stop("Can't convert from %s to %s", from.c_str(), to.c_str()); 13 | } else { 14 | cpp11::stop("Iconv initialisation failed"); 15 | } 16 | } 17 | 18 | // Allocate space in buffer 19 | buffer_.resize(1024); 20 | } 21 | } 22 | 23 | Iconv::~Iconv() { 24 | if (cd_ != nullptr) { 25 | Riconv_close(cd_); 26 | cd_ = nullptr; 27 | } 28 | } 29 | 30 | size_t Iconv::convert(const char* start, const char* end) { 31 | size_t n = end - start; 32 | 33 | // Ensure buffer is big enough: one input byte can never generate 34 | // more than 4 output bytes 35 | size_t max_size = n * 4; 36 | if (buffer_.size() < max_size) { 37 | buffer_.resize(max_size); 38 | } 39 | 40 | char* outbuf = &buffer_[0]; 41 | size_t inbytesleft = n; 42 | 43 | size_t outbytesleft = max_size; 44 | size_t res = Riconv(cd_, &start, &inbytesleft, &outbuf, &outbytesleft); 45 | 46 | if (res == (size_t)-1) { 47 | switch (errno) { 48 | case EILSEQ: 49 | cpp11::stop("Invalid multibyte sequence"); 50 | case EINVAL: 51 | cpp11::stop("Incomplete multibyte sequence"); 52 | case E2BIG: 53 | cpp11::stop("Iconv buffer too small"); 54 | default: 55 | cpp11::stop("Iconv failed to convert for unknown reason"); 56 | } 57 | } 58 | 59 | return max_size - outbytesleft; 60 | } 61 | 62 | int my_strnlen(const char* s, int maxlen) { 63 | for (int n = 0; n < maxlen; ++n) { 64 | if (s[n] == '\0') { 65 | return n; 66 | } 67 | } 68 | return maxlen; 69 | } 70 | 71 | #if defined(__sun) 72 | #define meltr_strnlen my_strnlen 73 | #else 74 | #define meltr_strnlen strnlen 75 | #endif 76 | 77 | // To be safe, we need to check for nulls - this also needs to emit 78 | // a warning, but this behaviour is better than crashing 79 | SEXP safeMakeChar(const char* start, size_t n, bool hasNull) { 80 | size_t m = hasNull ? 
meltr_strnlen(start, n) : n; 81 | if (m > INT_MAX) { 82 | cpp11::stop("R character strings are limited to 2^31-1 bytes"); 83 | } 84 | return Rf_mkCharLenCE(start, m, CE_UTF8); 85 | } 86 | 87 | SEXP Iconv::makeSEXP(const char* start, const char* end, bool hasNull) { 88 | if (cd_ == nullptr) { 89 | return safeMakeChar(start, end - start, hasNull); 90 | } 91 | 92 | int n = convert(start, end); 93 | return safeMakeChar(&buffer_[0], n, hasNull); 94 | } 95 | 96 | std::string Iconv::makeString(const char* start, const char* end) { 97 | if (cd_ == nullptr) { 98 | return std::string(start, end); 99 | } 100 | 101 | int n = convert(start, end); 102 | return std::string(&buffer_[0], n); 103 | } 104 | -------------------------------------------------------------------------------- /src/Iconv.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_ICONV_H_ 2 | #define MELTR_ICONV_H_ 3 | 4 | #include "cpp11/R.hpp" 5 | #include 6 | 7 | #include "R_ext/Riconv.h" 8 | #include 9 | 10 | class Iconv { 11 | void* cd_; 12 | std::string buffer_; 13 | 14 | public: 15 | Iconv(const std::string& from, const std::string& to = "UTF-8"); 16 | virtual ~Iconv(); 17 | 18 | SEXP makeSEXP(const char* start, const char* end, bool hasNull = true); 19 | std::string makeString(const char* start, const char* end); 20 | 21 | private: 22 | // Returns number of characters in buffer 23 | size_t convert(const char* start, const char* end); 24 | }; 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /src/LocaleInfo.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/as.hpp" 2 | #include "cpp11/list.hpp" 3 | #include "cpp11/strings.hpp" 4 | #include 5 | #include 6 | 7 | #include "LocaleInfo.h" 8 | 9 | LocaleInfo::LocaleInfo(const cpp11::list& x) 10 | : encoding_(cpp11::as_cpp(x["encoding"])), 11 | encoder_(Iconv(encoding_)) { 12 | std::string klass = cpp11::as_cpp(x.attr("class")); 13 | if (klass != "locale") { 14 | cpp11::stop("Invalid input: must be of class locale"); 15 | } 16 | 17 | cpp11::list date_names(x["date_names"]); 18 | mon_ = cpp11::as_cpp>(date_names["mon"]); 19 | monAb_ = cpp11::as_cpp>(date_names["mon_ab"]); 20 | day_ = cpp11::as_cpp>(date_names["day"]); 21 | dayAb_ = cpp11::as_cpp>(date_names["day_ab"]); 22 | amPm_ = cpp11::as_cpp>(date_names["am_pm"]); 23 | 24 | decimalMark_ = cpp11::as_cpp(x["decimal_mark"]); 25 | groupingMark_ = cpp11::as_cpp(x["grouping_mark"]); 26 | 27 | dateFormat_ = cpp11::as_cpp(x["date_format"]); 28 | timeFormat_ = cpp11::as_cpp(x["time_format"]); 29 | 30 | tz_ = cpp11::as_cpp(x["tz"]); 31 | } 32 | -------------------------------------------------------------------------------- /src/LocaleInfo.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_LOCALINFO_H_ 2 | #define MELTR_LOCALINFO_H_ 3 | 4 | #include "Iconv.h" 5 | 6 | #include "cpp11/list.hpp" 7 | #include 8 | #include 9 | 10 | class LocaleInfo { 11 | 12 | public: 13 | // LC_TIME 14 | std::vector mon_, monAb_, day_, dayAb_, amPm_; 15 | std::string dateFormat_, timeFormat_; 16 | 17 | // LC_NUMERIC 18 | char decimalMark_, groupingMark_; 19 | 20 | // LC_MISC 21 | std::string tz_; 22 | std::string encoding_; 23 | Iconv encoder_; 24 | 25 | LocaleInfo(const cpp11::list& x); 26 | }; 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /src/Progress.h: 
-------------------------------------------------------------------------------- 1 | #ifndef MELTR_PROGRESS_H_ 2 | #define MELTR_PROGRESS_H_ 3 | 4 | #include "cpp11/R.hpp" 5 | #include <ctime> 6 | #include <iomanip> 7 | #include <sstream> 8 | 9 | inline int now() { return clock() / CLOCKS_PER_SEC; } 10 | 11 | inline std::string clearLine(int width = 50) { 12 | return "\r" + std::string(width, ' ') + "\r"; 13 | } 14 | 15 | inline std::string showTime(int x) { 16 | std::stringstream ss; 17 | if (x < 60) { 18 | ss << x << " s"; 19 | return ss.str(); 20 | } else if (x < 60 * 60) { 21 | ss << x / 60 << " m"; 22 | return ss.str(); 23 | } else { 24 | ss << x / (60 * 60) << " h"; 25 | return ss.str(); 26 | } 27 | } 28 | 29 | class Progress { 30 | int timeMin_, timeInit_, timeStop_, width_; 31 | bool show_, stopped_; 32 | 33 | public: 34 | Progress(int min = 5, int width = Rf_GetOptionWidth()) 35 | : timeMin_(min), 36 | timeInit_(now()), 37 | timeStop_(now()), 38 | width_(width), 39 | show_(false), 40 | stopped_(false) {} 41 | 42 | void stop() { 43 | timeStop_ = now(); 44 | stopped_ = true; 45 | } 46 | 47 | void show(std::pair<double, size_t> progress) { 48 | double prop = progress.first, size = progress.second / (1024 * 1024); 49 | 50 | double est = (now() - timeInit_) / prop; 51 | if (!show_) { 52 | if (est > timeMin_) { 53 | show_ = true; 54 | } else { 55 | return; 56 | } 57 | } 58 | 59 | std::stringstream labelStream; 60 | labelStream << std::setprecision(2) << std::fixed << " " 61 | << (int)(prop * 100) << "%"; 62 | if (size > 0) { 63 | labelStream << " " << std::setprecision(0) << size << " MB"; 64 | } 65 | 66 | std::string label = labelStream.str(); 67 | 68 | int barSize = width_ - label.size() - 2; 69 | if (barSize < 0) { 70 | return; 71 | } 72 | int nbars = prop * barSize; 73 | int nspaces = (1 - prop) * barSize; 74 | std::string bars(nbars, '='), spaces(nspaces, ' '); 75 | Rprintf("\r|%s%s|%s", bars.c_str(), spaces.c_str(), label.c_str()); 76 | } 77 | 78 | ~Progress() { 79 | try { 80 | if (!show_) 81 | return; 82 | 83 | if (!stopped_) 84 | timeStop_ = now(); 85 | Rprintf("\n"); 86 | 87 | } catch (...)
{ 88 | } 89 | } 90 | }; 91 | 92 | #endif 93 | -------------------------------------------------------------------------------- /src/Reader.cpp: -------------------------------------------------------------------------------- 1 | #include "Reader.h" 2 | 3 | #include "cpp11/function.hpp" 4 | #include "cpp11/list.hpp" 5 | 6 | #include 7 | #include 8 | 9 | Reader::Reader( 10 | SourcePtr source, 11 | TokenizerPtr tokenizer, 12 | std::vector collectors, 13 | bool progress, 14 | const cpp11::strings& colNames) 15 | : source_(std::move(source)), 16 | tokenizer_(std::move(tokenizer)), 17 | collectors_(std::move(collectors)), 18 | progress_(progress), 19 | begun_(false) { 20 | init(colNames); 21 | } 22 | 23 | void Reader::init(const cpp11::strings& colNames) { 24 | tokenizer_->tokenize(source_->begin(), source_->end()); 25 | tokenizer_->setWarnings(&warnings_); 26 | 27 | // Work out which output columns we are keeping and set warnings for each 28 | // collector 29 | size_t p = collectors_.size(); 30 | for (size_t j = 0; j < p; ++j) { 31 | if (!collectors_[j]->skip()) { 32 | keptColumns_.push_back(j); 33 | collectors_[j]->setWarnings(&warnings_); 34 | } 35 | } 36 | 37 | if (colNames.size() > 0) { 38 | outNames_ = cpp11::writable::strings(keptColumns_.size()); 39 | int i = 0; 40 | for (int keptColumn : keptColumns_) { 41 | outNames_[i++] = colNames[keptColumn]; 42 | } 43 | } 44 | } 45 | 46 | void Reader::collectorsResize(R_xlen_t n) { 47 | for (auto & collector : collectors_) { 48 | collector->resize(n); 49 | } 50 | } 51 | 52 | void Reader::collectorsClear() { 53 | for (auto & collector : collectors_) { 54 | collector->clear(); 55 | } 56 | } 57 | 58 | cpp11::sexp 59 | Reader::meltToDataFrame(const cpp11::list& locale_, R_xlen_t lines) { 60 | melt(locale_, lines); 61 | 62 | // Save individual columns into a data frame 63 | cpp11::writable::list out(4); 64 | out[0] = collectors_[0]->vector(); 65 | out[1] = collectors_[1]->vector(); 66 | out[2] = collectors_[2]->vector(); 67 | out[3] = collectors_[3]->vector(); 68 | 69 | out.attr("names") = {"row", "col", "data_type", "value"}; 70 | cpp11::sexp out2(warnings_.addAsAttribute(static_cast(out))); 71 | 72 | collectorsClear(); 73 | warnings_.clear(); 74 | 75 | out.attr("names") = {"row", "col", "data_type", "value"}; 76 | 77 | static cpp11::function as_tibble = cpp11::package("tibble")["as_tibble"]; 78 | return as_tibble(out); 79 | } 80 | 81 | R_xlen_t Reader::melt(const cpp11::list& locale_, R_xlen_t lines) { 82 | 83 | if (t_.type() == TOKEN_EOF) { 84 | return (-1); 85 | } 86 | 87 | R_xlen_t n = (lines < 0) ? 
10000 : lines * 10; // Start with 10 cells per line 88 | 89 | collectorsResize(n); 90 | 91 | R_xlen_t last_row = -1; 92 | 93 | R_xlen_t cells = 0; 94 | R_xlen_t first_row; 95 | if (!begun_) { 96 | t_ = tokenizer_->nextToken(); 97 | begun_ = true; 98 | first_row = 0; 99 | } else { 100 | first_row = t_.row(); 101 | } 102 | 103 | while (t_.type() != TOKEN_EOF) { 104 | ++cells; 105 | 106 | if (progress_ && cells % progressStep_ == 0) { 107 | progressBar_.show(tokenizer_->progress()); 108 | } 109 | 110 | if (lines >= 0 && static_cast(t_.row()) - first_row >= lines) { 111 | --cells; 112 | break; 113 | } 114 | 115 | if (cells >= n) { 116 | // Estimate rows in full dataset and resize collectors 117 | n = (cells / tokenizer_->progress().first) * 1.1; 118 | collectorsResize(n); 119 | } 120 | 121 | collectors_[0]->setValue(cells - 1, t_.row() + 1); 122 | collectors_[1]->setValue(cells - 1, t_.col() + 1); 123 | collectors_[3]->setValue(cells - 1, t_); 124 | 125 | switch (t_.type()) { 126 | case TOKEN_STRING: { 127 | cpp11::sexp str(cpp11::as_sexp(t_.asString())); 128 | collectors_[2]->setValue( 129 | cells - 1, collectorGuess(SEXP(str), locale_, true)); 130 | break; 131 | }; 132 | case TOKEN_MISSING: 133 | collectors_[2]->setValue(cells - 1, "missing"); 134 | break; 135 | case TOKEN_EMPTY: 136 | collectors_[2]->setValue(cells - 1, "empty"); 137 | break; 138 | case TOKEN_EOF: 139 | cpp11::stop("Invalid token"); 140 | } 141 | 142 | last_row = t_.row(); 143 | t_ = tokenizer_->nextToken(); 144 | } 145 | 146 | if (progress_) { 147 | progressBar_.show(tokenizer_->progress()); 148 | } 149 | 150 | progressBar_.stop(); 151 | 152 | // Resize the collectors to the final size (if it is not already at that 153 | // size) 154 | if (last_row == -1) { 155 | collectorsResize(0); 156 | } else if (cells < (n - 1)) { 157 | collectorsResize(cells); 158 | } 159 | 160 | return cells - 1; 161 | } 162 | -------------------------------------------------------------------------------- /src/Reader.h: -------------------------------------------------------------------------------- 1 | #include "Collector.h" 2 | #include "Progress.h" 3 | #include "Source.h" 4 | 5 | #include "cpp11/list.hpp" 6 | #include "cpp11/strings.hpp" 7 | 8 | class Reader { 9 | public: 10 | Reader( 11 | SourcePtr source, 12 | TokenizerPtr tokenizer, 13 | std::vector collectors, 14 | bool progress, 15 | const cpp11::strings& colNames = cpp11::strings()); 16 | 17 | cpp11::sexp meltToDataFrame(const cpp11::list& locale_, R_xlen_t lines = -1); 18 | 19 | private: 20 | Warnings warnings_; 21 | SourcePtr source_; 22 | TokenizerPtr tokenizer_; 23 | std::vector collectors_; 24 | bool progress_; 25 | Progress progressBar_; 26 | std::vector keptColumns_; 27 | cpp11::writable::strings outNames_; 28 | bool begun_; 29 | Token t_; 30 | 31 | const static R_xlen_t progressStep_ = 10000; 32 | 33 | void init(const cpp11::strings& colNames); 34 | R_xlen_t melt(const cpp11::list& locale_, R_xlen_t lines = -1); 35 | 36 | void collectorsResize(R_xlen_t n); 37 | void collectorsClear(); 38 | }; 39 | -------------------------------------------------------------------------------- /src/Source.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/list.hpp" 2 | #include "cpp11/strings.hpp" 3 | 4 | #include "Source.h" 5 | #include "SourceFile.h" 6 | #include "SourceRaw.h" 7 | #include "SourceString.h" 8 | 9 | SourcePtr Source::create(const cpp11::list& spec) { 10 | std::string subclass(cpp11::as_cpp(spec.attr("class"))[0]); 11 | 12 | int 
skip = cpp11::as_cpp(spec["skip"]); 13 | bool skipEmptyRows = cpp11::as_cpp(spec["skip_empty_rows"]); 14 | std::string comment = cpp11::as_cpp(spec["comment"]); 15 | bool skipQuote = cpp11::as_cpp(spec["skip_quote"]); 16 | 17 | if (subclass == "source_raw") { 18 | return SourcePtr( 19 | new SourceRaw(spec[0], skip, skipEmptyRows, comment, skipQuote)); 20 | } 21 | 22 | if (subclass == "source_string") { 23 | return SourcePtr( 24 | new SourceString(spec[0], skip, skipEmptyRows, comment, skipQuote)); 25 | } 26 | 27 | if (subclass == "source_file") { 28 | cpp11::strings path(spec[0]); 29 | return SourcePtr(new SourceFile(Rf_translateCharUTF8(path[0]), skip, skipEmptyRows, comment, skipQuote)); 30 | } 31 | 32 | cpp11::stop("Unknown source type"); 33 | return SourcePtr(); 34 | } 35 | 36 | const char* Source::skipLines( 37 | const char* begin, 38 | const char* end, 39 | int n, 40 | bool skipEmptyRows, 41 | const std::string& comment, 42 | bool skipQuote) { 43 | bool hasComment = !comment.empty(); 44 | bool isComment = false; 45 | 46 | const char* cur = begin; 47 | 48 | while (cur < end && n > 0) { 49 | cur = skipLine( 50 | cur, end, hasComment && inComment(cur, end, comment), skipQuote); 51 | --n; 52 | ++skippedRows_; 53 | } 54 | 55 | // Skip any more trailing empty rows or comments 56 | while (cur < end && 57 | ((skipEmptyRows && (*cur == '\n' || *cur == '\r')) || 58 | (isComment = hasComment && inComment(cur, end, comment)))) { 59 | cur = skipLine(cur, end, isComment, skipQuote); 60 | ++skippedRows_; 61 | } 62 | 63 | return cur; 64 | } 65 | 66 | const char* Source::skipLine( 67 | const char* begin, const char* end, bool isComment, bool skipQuote) { 68 | const char* cur = begin; 69 | // skip the rest of the line until the newline 70 | while (cur < end && !(*cur == '\n' || *cur == '\r')) { 71 | if (!isComment && skipQuote && *cur == '"') { 72 | cur = skipDoubleQuoted(cur, end); 73 | } else { 74 | advanceForLF(&cur, end); 75 | ++cur; 76 | } 77 | } 78 | 79 | advanceForLF(&cur, end); 80 | 81 | // skip the actual newline char 82 | if (cur < end) { 83 | ++cur; 84 | } 85 | 86 | return cur; 87 | } 88 | 89 | const char* Source::skipDoubleQuoted(const char* begin, const char* end) { 90 | const char* cur = begin; 91 | 92 | // This doesn't handle escaped quotes or more sophisticated things, but 93 | // will work for simple cases. 
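// [Editor's note] Concretely, with the cursor on the opening quote, the scan
// below behaves as follows (illustrative inputs, not tests from this repo):
//
//   "abc",next  -> returns an iterator at ',' (one past the closing quote)
//   "a""b",...  -> stops at the first quote of the "" pair and returns one
//                  past it, i.e. the doubled-quote escape is read as a close
//                  immediately followed by a new field character
//   "abc<EOF>   -> unterminated quote: returns end
//
// The tokenizers, not this helper, are responsible for full quote handling.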
94 | // Opening quote 95 | ++cur; 96 | 97 | while (cur < end && *cur != '"') { 98 | ++cur; 99 | } 100 | 101 | // Closing quote 102 | if (cur < end) { 103 | ++cur; 104 | } 105 | 106 | return cur; 107 | } 108 | 109 | const char* Source::skipBom(const char* begin, const char* end) { 110 | 111 | /* Unicode Byte Order Marks 112 | https://en.wikipedia.org/wiki/Byte_order_mark#Representations_of_byte_order_marks_by_encoding 113 | 114 | 00 00 FE FF: UTF-32BE 115 | FF FE 00 00: UTF-32LE 116 | FE FF: UTF-16BE 117 | FF FE: UTF-16LE 118 | EF BB BF: UTF-8 119 | */ 120 | 121 | switch (begin[0]) { 122 | // UTF-32BE 123 | case '\x00': 124 | if (end - begin >= 4 && begin[1] == '\x00' && begin[2] == '\xFE' && 125 | begin[3] == '\xFF') { 126 | return begin + 4; 127 | } 128 | break; 129 | 130 | // UTF-8 131 | case '\xEF': 132 | if (end - begin >= 3 && begin[1] == '\xBB' && begin[2] == '\xBF') { 133 | return begin + 3; 134 | } 135 | break; 136 | 137 | // UTF-16BE 138 | case '\xfe': 139 | if (end - begin >= 2 && begin[1] == '\xff') { 140 | return begin + 2; 141 | } 142 | break; 143 | 144 | case '\xff': 145 | if (end - begin >= 2 && begin[1] == '\xfe') { 146 | 147 | // UTF-32 LE 148 | if (end - begin >= 4 && begin[2] == '\x00' && begin[3] == '\x00') { 149 | return begin + 4; 150 | } 151 | 152 | // UTF-16 LE 153 | return begin + 2; 154 | } 155 | break; 156 | } 157 | return begin; 158 | } 159 | -------------------------------------------------------------------------------- /src/Source.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_SOURCE_H_ 2 | #define MELTR_SOURCE_H_ 3 | 4 | #include "cpp11/list.hpp" 5 | #include "utils.h" 6 | 7 | #include 8 | 9 | class Source; 10 | typedef std::shared_ptr SourcePtr; 11 | 12 | class Source { 13 | public: 14 | Source() : skippedRows_(0) {} 15 | virtual ~Source() {} 16 | 17 | virtual const char* begin() = 0; 18 | virtual const char* end() = 0; 19 | 20 | const char* skipLines( 21 | const char* begin, 22 | const char* end, 23 | int n, 24 | bool skipEmptyRows = true, 25 | const std::string& comment = "", 26 | bool skipQuote = true); 27 | 28 | static const char* 29 | skipLine(const char* begin, const char* end, bool isComment, bool skipQuote); 30 | 31 | static const char* skipDoubleQuoted(const char* begin, const char* end); 32 | 33 | size_t skippedRows() { return skippedRows_; } 34 | 35 | static const char* skipBom(const char* begin, const char* end); 36 | 37 | static SourcePtr create(const cpp11::list& spec); 38 | 39 | private: 40 | static bool 41 | inComment(const char* cur, const char* end, const std::string& comment) { 42 | return starts_with_comment(cur, end, comment); 43 | } 44 | 45 | size_t skippedRows_; 46 | }; 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /src/SourceFile.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_SOURCEFILE_H_ 2 | #define MELTR_SOURCEFILE_H_ 3 | 4 | #include "Source.h" 5 | #include "cpp11/protect.hpp" 6 | 7 | #include "unicode_fopen.h" 8 | 9 | class SourceFile : public Source { 10 | mio::mmap_source source_; 11 | 12 | const char* begin_; 13 | const char* end_; 14 | 15 | public: 16 | SourceFile( 17 | const std::string& path, 18 | int skip = 0, 19 | bool skipEmptyRows = true, 20 | const std::string& comment = "", 21 | bool skipQuotes = true) { 22 | 23 | std::error_code error; 24 | source_ = make_mmap_source(path.c_str(), error); 25 | 26 | if (error) { 27 | cpp11::stop("Cannot read file %s: 
%s", error.message().c_str()); 28 | } 29 | 30 | begin_ = source_.begin(); 31 | end_ = begin_ + source_.size(); 32 | 33 | // Skip byte order mark, if needed 34 | begin_ = skipBom(begin_, end_); 35 | 36 | // Skip lines, if needed 37 | begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes); 38 | } 39 | 40 | const char* begin() { return begin_; } 41 | 42 | const char* end() { return end_; } 43 | }; 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/SourceRaw.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_SOURCERAW_H_ 2 | #define MELTR_SOURCERAW_H_ 3 | 4 | #include "Source.h" 5 | #include "cpp11/raws.hpp" 6 | 7 | class SourceRaw : public Source { 8 | cpp11::raws x_; 9 | const char* begin_; 10 | const char* end_; 11 | 12 | public: 13 | SourceRaw( 14 | cpp11::raws x, 15 | int skip = 0, 16 | bool skipEmptyRows = true, 17 | const std::string& comment = "", 18 | bool skipQuotes = true) 19 | : x_(x) { 20 | begin_ = (const char*)RAW(x); 21 | end_ = (const char*)RAW(x) + Rf_xlength(x); 22 | 23 | // Skip byte order mark, if needed 24 | begin_ = skipBom(begin_, end_); 25 | 26 | // Skip lines, if needed 27 | begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes); 28 | } 29 | 30 | const char* begin() { return begin_; } 31 | 32 | const char* end() { return end_; } 33 | }; 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /src/SourceString.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_SOURCESTRING_H_ 2 | #define MELTR_SOURCESTRING_H_ 3 | 4 | #include "cpp11/strings.hpp" 5 | 6 | #include "Source.h" 7 | 8 | class SourceString : public Source { 9 | cpp11::sexp string_; 10 | 11 | const char* begin_; 12 | const char* end_; 13 | 14 | public: 15 | SourceString( 16 | cpp11::strings x, 17 | int skip = 0, 18 | bool skipEmptyRows = true, 19 | const std::string& comment = "", 20 | bool skipQuotes = true) 21 | : string_(static_cast(x[0])) { 22 | 23 | begin_ = CHAR(string_); 24 | end_ = begin_ + Rf_xlength(string_); 25 | 26 | // Skip byte order mark, if needed 27 | begin_ = skipBom(begin_, end_); 28 | 29 | // Skip lines, if needed 30 | begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes); 31 | } 32 | 33 | const char* begin() { return begin_; } 34 | 35 | const char* end() { return end_; } 36 | }; 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/Token.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_TOKEN_H_ 2 | #define MELTR_TOKEN_H_ 3 | 4 | #include "cpp11/raws.hpp" 5 | 6 | #include "Iconv.h" 7 | #include "Source.h" 8 | #include "Tokenizer.h" 9 | #include 10 | 11 | enum TokenType { 12 | TOKEN_STRING, // a sequence of characters 13 | TOKEN_MISSING, // an missing value 14 | TOKEN_EMPTY, // an empty value 15 | TOKEN_EOF // end of file 16 | }; 17 | 18 | class Token { 19 | TokenType type_; 20 | SourceIterator begin_, end_; 21 | size_t row_, col_; 22 | bool hasNull_; 23 | 24 | Tokenizer* pTokenizer_; 25 | 26 | public: 27 | Token() 28 | : type_(TOKEN_EMPTY), 29 | begin_(0), 30 | end_(0), 31 | row_(0), 32 | col_(0), 33 | hasNull_(false), 34 | pTokenizer_(nullptr) {} 35 | Token(TokenType type, int row, int col) 36 | : type_(type), 37 | begin_(0), 38 | end_(0), 39 | row_(row), 40 | col_(col), 41 | hasNull_(false), 42 | pTokenizer_(nullptr) {} 
43 | Token( 44 | SourceIterator begin, 45 | SourceIterator end, 46 | int row, 47 | int col, 48 | bool hasNull, 49 | Tokenizer* pTokenizer = NULL) 50 | : type_(TOKEN_STRING), 51 | begin_(begin), 52 | end_(end), 53 | row_(row), 54 | col_(col), 55 | hasNull_(hasNull), 56 | pTokenizer_(pTokenizer) { 57 | if (begin_ == end_) 58 | type_ = TOKEN_EMPTY; 59 | } 60 | 61 | std::string asString() const { 62 | switch (type_) { 63 | case TOKEN_STRING: { 64 | std::string buffer; 65 | SourceIterators string = getString(&buffer); 66 | 67 | return std::string(string.first, string.second); 68 | } 69 | case TOKEN_MISSING: 70 | return "[MISSING]"; 71 | case TOKEN_EMPTY: 72 | return "[EMPTY]"; 73 | case TOKEN_EOF: 74 | return "[EOF]"; 75 | } 76 | 77 | return ""; 78 | } 79 | 80 | SEXP asRaw() const { 81 | size_t n = (type_ == TOKEN_STRING) ? end_ - begin_ : 0; 82 | cpp11::writable::raws out(n); 83 | 84 | if (n > 0) 85 | memcpy(RAW(out), begin_, n); 86 | 87 | return out; 88 | } 89 | 90 | SEXP asSEXP(Iconv* pEncoder) const { 91 | switch (type_) { 92 | case TOKEN_STRING: { 93 | std::string buffer; 94 | SourceIterators string = getString(&buffer); 95 | 96 | return pEncoder->makeSEXP(string.first, string.second, hasNull_); 97 | } 98 | default: 99 | return NA_STRING; 100 | } 101 | } 102 | 103 | TokenType type() const { return type_; } 104 | 105 | SourceIterators getString(std::string* pOut) const { 106 | if (pTokenizer_ == NULL) 107 | return std::make_pair(begin_, end_); 108 | 109 | pTokenizer_->unescape(begin_, end_, pOut); 110 | return std::make_pair(pOut->data(), pOut->data() + pOut->size()); 111 | } 112 | 113 | size_t row() const { return row_; } 114 | size_t col() const { return col_; } 115 | 116 | bool hasNull() const { return hasNull_; } 117 | 118 | Token& trim() { 119 | while (begin_ != end_ && (*begin_ == ' ' || *begin_ == '\t')) 120 | begin_++; 121 | while (end_ != begin_ && (*(end_ - 1) == ' ' || *(end_ - 1) == '\t')) 122 | end_--; 123 | 124 | if (begin_ == end_) 125 | type_ = TOKEN_EMPTY; 126 | 127 | return *this; 128 | } 129 | 130 | Token& flagNA(const std::vector& NA) { 131 | 132 | std::vector::const_iterator it; 133 | for (it = NA.begin(); it != NA.end(); ++it) { 134 | if ((size_t)(end_ - begin_) != it->size()) 135 | continue; 136 | 137 | if (strncmp(begin_, it->data(), it->size()) == 0) { 138 | type_ = TOKEN_MISSING; 139 | break; 140 | } 141 | } 142 | 143 | return *this; 144 | } 145 | }; 146 | 147 | #endif 148 | -------------------------------------------------------------------------------- /src/Tokenizer.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/as.hpp" 2 | #include "cpp11/integers.hpp" 3 | #include "cpp11/list.hpp" 4 | 5 | #include "Tokenizer.h" 6 | #include "TokenizerDelim.h" 7 | #include "TokenizerFwf.h" 8 | #include "TokenizerWs.h" 9 | 10 | TokenizerPtr Tokenizer::create(const cpp11::list& spec) { 11 | std::string subclass(cpp11::strings(spec.attr("class"))[0]); 12 | 13 | if (subclass == "tokenizer_delim") { 14 | char delim = cpp11::as_cpp(spec["delim"]); 15 | char quote = cpp11::as_cpp(spec["quote"]); 16 | std::vector na = 17 | cpp11::as_cpp>(spec["na"]); 18 | std::string comment = cpp11::as_cpp(spec["comment"]); 19 | bool trimWs = cpp11::as_cpp(spec["trim_ws"]); 20 | bool escapeDouble = cpp11::as_cpp(spec["escape_double"]); 21 | bool escapeBackslash = cpp11::as_cpp(spec["escape_backslash"]); 22 | bool quotedNA = cpp11::as_cpp(spec["quoted_na"]); 23 | bool skipEmptyRows = cpp11::as_cpp(spec["skip_empty_rows"]); 24 | 25 | return 
TokenizerPtr(new TokenizerDelim( 26 | delim, 27 | quote, 28 | na, 29 | comment, 30 | trimWs, 31 | escapeBackslash, 32 | escapeDouble, 33 | quotedNA, 34 | skipEmptyRows)); 35 | } 36 | 37 | if (subclass == "tokenizer_fwf") { 38 | std::vector begin = cpp11::as_cpp>(spec["begin"]); 39 | std::vector end = cpp11::as_cpp>(spec["end"]); 40 | std::vector na = 41 | cpp11::as_cpp>(spec["na"]); 42 | std::string comment = cpp11::as_cpp(spec["comment"]); 43 | bool trimWs = cpp11::as_cpp(spec["trim_ws"]); 44 | bool skipEmptyRows = cpp11::as_cpp(spec["skip_empty_rows"]); 45 | return TokenizerPtr( 46 | new TokenizerFwf(begin, end, na, comment, trimWs, skipEmptyRows)); 47 | } 48 | 49 | if (subclass == "tokenizer_ws") { 50 | std::vector na = 51 | cpp11::as_cpp>(spec["na"]); 52 | std::string comment = cpp11::as_cpp(spec["comment"]); 53 | bool skipEmptyRows = cpp11::as_cpp(spec["skip_empty_rows"]); 54 | return TokenizerPtr(new TokenizerWs(na, comment, skipEmptyRows)); 55 | } 56 | 57 | cpp11::stop("Unknown tokenizer type"); 58 | return TokenizerPtr(); 59 | } 60 | -------------------------------------------------------------------------------- /src/Tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_TOKENIZER_H_ 2 | #define MELTR_TOKENIZER_H_ 3 | 4 | #include "cpp11/R.hpp" 5 | #include "cpp11/list.hpp" 6 | #include "cpp11/protect.hpp" 7 | 8 | #include "Warnings.h" 9 | #include 10 | 11 | class Token; 12 | 13 | typedef const char* SourceIterator; 14 | typedef std::pair SourceIterators; 15 | typedef void (*UnescapeFun)(SourceIterator, SourceIterator, std::string*); 16 | 17 | class Tokenizer; 18 | typedef std::shared_ptr TokenizerPtr; 19 | 20 | class Tokenizer { 21 | Warnings* pWarnings_; 22 | 23 | public: 24 | Tokenizer() : pWarnings_(NULL) {} 25 | virtual ~Tokenizer() {} 26 | 27 | virtual void tokenize(SourceIterator begin, SourceIterator end) = 0; 28 | virtual Token nextToken() = 0; 29 | // Percentage & bytes 30 | virtual std::pair progress() = 0; 31 | 32 | virtual void 33 | unescape(SourceIterator begin, SourceIterator end, std::string* pOut) { 34 | pOut->reserve(end - begin); 35 | for (SourceIterator cur = begin; cur != end; ++cur) 36 | pOut->push_back(*cur); 37 | } 38 | 39 | void setWarnings(Warnings* pWarnings) { pWarnings_ = pWarnings; } 40 | 41 | inline void warn( 42 | int row, 43 | int col, 44 | const std::string& expected, 45 | const std::string& actual = "") { 46 | if (pWarnings_ == NULL) { 47 | cpp11::warning( 48 | "[%i, %i]: expected %s", row + 1, col + 1, expected.c_str()); 49 | return; 50 | } 51 | pWarnings_->addWarning(row, col, expected, actual); 52 | } 53 | 54 | static TokenizerPtr create(const cpp11::list& spec); 55 | }; 56 | 57 | // ----------------------------------------------------------------------------- 58 | // Helper class for parsers - ensures iterator always advanced no matter 59 | // how loop is exited 60 | 61 | class Advance { 62 | SourceIterator* pIter_; 63 | 64 | public: 65 | Advance(SourceIterator* pIter) : pIter_(pIter) {} 66 | Advance(const Advance&) = delete; 67 | Advance& operator=(const Advance&) = delete; 68 | ~Advance() { (*pIter_)++; } 69 | }; 70 | 71 | #endif 72 | -------------------------------------------------------------------------------- /src/TokenizerDelim.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_TOKENIZEDELIM_H_ 2 | #define MELTR_TOKENIZEDELIM_H_ 3 | #include "cpp11/R.hpp" 4 | 5 | #include "Token.h" 6 | #include "Tokenizer.h" 7 | #include 
"utils.h" 8 | 9 | enum DelimState { 10 | STATE_DELIM, 11 | STATE_FIELD, 12 | STATE_STRING, 13 | STATE_QUOTE, 14 | STATE_ESCAPE_S, 15 | STATE_ESCAPE_F, 16 | STATE_STRING_END, 17 | STATE_COMMENT 18 | }; 19 | 20 | class TokenizerDelim : public Tokenizer { 21 | char delim_, quote_; 22 | std::vector NA_; 23 | std::string comment_; 24 | 25 | bool hasComment_, trimWS_, escapeBackslash_, escapeDouble_, quotedNA_, 26 | hasEmptyNA_; 27 | 28 | SourceIterator begin_, cur_, end_; 29 | DelimState state_; 30 | int row_, col_; 31 | bool moreTokens_; 32 | bool skipEmptyRows_; 33 | 34 | public: 35 | TokenizerDelim( 36 | char delim = ',', 37 | char quote = '"', 38 | std::vector NA = std::vector(1, "NA"), 39 | const std::string& comment = "", 40 | bool trimWS = true, 41 | bool escapeBackslash = false, 42 | bool escapeDouble = true, 43 | bool quotedNA = true, 44 | bool skipEmptyRows = true); 45 | 46 | void tokenize(SourceIterator begin, SourceIterator end); 47 | 48 | std::pair progress(); 49 | 50 | Token nextToken(); 51 | 52 | void unescape(SourceIterator begin, SourceIterator end, std::string* pOut); 53 | 54 | private: 55 | bool isComment(const char* cur) const; 56 | 57 | void newField(); 58 | 59 | void newRecord(); 60 | 61 | Token emptyToken(int row, int col) const; 62 | 63 | Token fieldToken( 64 | SourceIterator begin, 65 | SourceIterator end, 66 | bool hasEscapeB, 67 | bool hasNull, 68 | int row, 69 | int col); 70 | 71 | Token stringToken( 72 | SourceIterator begin, 73 | SourceIterator end, 74 | bool hasEscapeB, 75 | bool hasEscapeD, 76 | bool hasNull, 77 | int row, 78 | int col); 79 | 80 | void unescapeBackslash( 81 | SourceIterator begin, SourceIterator end, std::string* pOut); 82 | 83 | void 84 | unescapeDouble(SourceIterator begin, SourceIterator end, std::string* pOut) const; 85 | }; 86 | #endif 87 | -------------------------------------------------------------------------------- /src/TokenizerFwf.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/list.hpp" 2 | #include "cpp11/protect.hpp" 3 | 4 | #include "Tokenizer.h" 5 | #include "TokenizerFwf.h" 6 | #include "utils.h" 7 | 8 | #include "Source.h" 9 | 10 | #include 11 | #include 12 | 13 | struct skip_t { 14 | SourceIterator begin; 15 | int lines; 16 | }; 17 | 18 | skip_t skip_comments( 19 | SourceIterator begin, SourceIterator end, const std::string& comment = "") { 20 | skip_t out; 21 | if (comment.length() == 0) { 22 | out.begin = begin; 23 | out.lines = 0; 24 | return out; 25 | } 26 | 27 | SourceIterator cur = begin; 28 | int skip = 0; 29 | while (starts_with_comment(cur, end, comment)) { 30 | // Skip rest of line 31 | while (cur != end && *cur != '\n' && *cur != '\r') { 32 | ++cur; 33 | } 34 | 35 | advanceForLF(&cur, end); 36 | ++cur; 37 | ++skip; 38 | } 39 | 40 | out.begin = cur; 41 | out.lines = skip; 42 | return out; 43 | } 44 | 45 | std::vector 46 | emptyCols_(SourceIterator begin, SourceIterator end, size_t n = 100) { 47 | 48 | std::vector is_white; 49 | 50 | size_t row = 0; 51 | 52 | size_t col = 0; 53 | for (SourceIterator cur = begin; cur != end; ++cur) { 54 | if (row > n) { 55 | break; 56 | } 57 | 58 | switch (*cur) { 59 | case '\n': 60 | case '\r': 61 | advanceForLF(&cur, end); 62 | col = 0; 63 | row++; 64 | break; 65 | case ' ': 66 | col++; 67 | break; 68 | default: 69 | // Make sure there's enough room 70 | if (col >= is_white.size()) { 71 | is_white.resize(col + 1, true); 72 | } 73 | is_white[col] = false; 74 | col++; 75 | } 76 | } 77 | 78 | return is_white; 79 | } 80 | 81 | 
[[cpp11::register]] cpp11::list 82 | whitespaceColumns(const cpp11::list& sourceSpec, int n, std::string comment) { 83 | SourcePtr source = Source::create(sourceSpec); 84 | 85 | skip_t s = skip_comments(source->begin(), source->end(), std::move(comment)); 86 | 87 | std::vector empty = emptyCols_(s.begin, source->end(), n); 88 | std::vector begin; 89 | 90 | std::vector end; 91 | 92 | bool in_col = false; 93 | 94 | for (size_t i = 0; i < empty.size(); ++i) { 95 | if (in_col && empty[i]) { 96 | end.push_back(i); 97 | in_col = false; 98 | } else if (!in_col && !empty[i]) { 99 | begin.push_back(i); 100 | in_col = true; 101 | } 102 | } 103 | 104 | if (in_col) { 105 | end.push_back(empty.size()); 106 | } 107 | 108 | using namespace cpp11::literals; 109 | return cpp11::writable::list( 110 | {"begin"_nm = begin, "end"_nm = end, "skip"_nm = s.lines}); 111 | } 112 | 113 | // TokenizerFwf -------------------------------------------------------------- 114 | 115 | #include "TokenizerFwf.h" 116 | 117 | TokenizerFwf::TokenizerFwf( 118 | const std::vector& beginOffset, 119 | const std::vector& endOffset, 120 | std::vector NA, 121 | const std::string& comment, 122 | bool trimWS, 123 | bool skipEmptyRows) 124 | : beginOffset_(beginOffset), 125 | endOffset_(endOffset), 126 | NA_(std::move(NA)), 127 | cols_(beginOffset.size()), 128 | comment_(comment), 129 | moreTokens_(false), 130 | hasComment_(!comment.empty()), 131 | trimWS_(trimWS), 132 | skipEmptyRows_(skipEmptyRows) { 133 | if (beginOffset_.size() != endOffset_.size()) { 134 | cpp11::stop( 135 | "Begin (%i) and end (%i) specifications must have equal length", 136 | beginOffset_.size(), 137 | endOffset_.size()); 138 | } 139 | 140 | if (beginOffset_.empty()) { 141 | cpp11::stop("Zero-length begin and end specifications not supported"); 142 | } 143 | 144 | // File is assumed to be ragged (last column can have variable width) 145 | // when the last element of endOffset_ is NA 146 | isRagged_ = endOffset_[endOffset_.size() - 1L] == NA_INTEGER; 147 | 148 | max_ = 0; 149 | for (int j = 0; j < (cols_ - static_cast(isRagged_)); ++j) { 150 | if (endOffset_[j] <= beginOffset_[j]) { 151 | cpp11::stop( 152 | "Begin offset (%i) must be smaller than end offset (%i)", 153 | beginOffset_[j], 154 | endOffset_[j]); 155 | } 156 | 157 | if (beginOffset_[j] < 0) { 158 | cpp11::stop("Begin offset (%i) must be greater than 0", beginOffset_[j]); 159 | } 160 | 161 | if (endOffset_[j] < 0) { 162 | cpp11::stop("End offset (%i) must be greater than 0", endOffset_[j]); 163 | } 164 | 165 | if (endOffset_[j] > max_) { 166 | max_ = endOffset_[j]; 167 | } 168 | } 169 | } 170 | 171 | void TokenizerFwf::tokenize(SourceIterator begin, SourceIterator end) { 172 | cur_ = begin; 173 | curLine_ = begin; 174 | 175 | begin_ = begin; 176 | end_ = end; 177 | 178 | row_ = 0; 179 | col_ = 0; 180 | moreTokens_ = true; 181 | } 182 | 183 | std::pair TokenizerFwf::progress() { 184 | size_t bytes = cur_ - begin_; 185 | return std::make_pair(bytes / (double)(end_ - begin_), bytes); 186 | } 187 | 188 | Token TokenizerFwf::nextToken() { 189 | if (!moreTokens_) { 190 | return {TOKEN_EOF, 0, 0}; 191 | } 192 | 193 | // Check for comments only at start of line 194 | while (cur_ != end_ && col_ == 0 && 195 | (isComment(cur_) || (isEmpty() && skipEmptyRows_))) { 196 | // Skip rest of line 197 | while (cur_ != end_ && *cur_ != '\n' && *cur_ != '\r') { 198 | ++cur_; 199 | } 200 | advanceForLF(&cur_, end_); 201 | if (cur_ != end_) { 202 | ++cur_; 203 | } 204 | curLine_ = cur_; 205 | } 206 | 207 | // Find start of 
field 208 | SourceIterator fieldBegin = cur_; 209 | findBeginning: 210 | int skip = beginOffset_[col_] - (cur_ - curLine_); 211 | if (skip < 0) { // overlapping case 212 | fieldBegin += skip; 213 | } else if (skip > 0) { // skipped columns case 214 | for (int i = 0; i < skip; ++i) { 215 | if (fieldBegin == end_) { 216 | break; 217 | } 218 | 219 | if (*fieldBegin == '\n' || *fieldBegin == '\r') { 220 | std::stringstream ss1; 221 | ss1 << skip << " chars between fields"; 222 | std::stringstream ss2; 223 | ss2 << skip << " chars until end of line"; 224 | warn(row_, col_, ss1.str(), ss2.str()); 225 | 226 | row_++; 227 | col_ = 0; 228 | 229 | advanceForLF(&fieldBegin, end_); 230 | if (fieldBegin != end_) { 231 | fieldBegin++; 232 | } 233 | cur_ = curLine_ = fieldBegin; 234 | goto findBeginning; 235 | } 236 | fieldBegin++; 237 | } 238 | } 239 | 240 | if (fieldBegin == end_) { 241 | // need to warn here if col != 0/cols - 1 242 | moreTokens_ = false; 243 | return {TOKEN_EOF, 0, 0}; 244 | } 245 | 246 | // Find end of field 247 | SourceIterator fieldEnd = fieldBegin; 248 | bool lastCol = (col_ == cols_ - 1); 249 | 250 | bool tooShort = false; 251 | 252 | bool hasNull = false; 253 | 254 | if (lastCol && isRagged_) { 255 | // Last column is ragged, so read until end of line (ignoring width) 256 | while (fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') { 257 | if (*fieldEnd == '\0') { 258 | hasNull = true; 259 | } 260 | fieldEnd++; 261 | } 262 | } else { 263 | int width = endOffset_[col_] - beginOffset_[col_]; 264 | // Find the end of the field, stopping for newlines 265 | for (int i = 0; i < width; ++i) { 266 | if (fieldEnd == end_ || *fieldEnd == '\n' || *fieldEnd == '\r') { 267 | if (!(col_ == 0 && !skipEmptyRows_)) { 268 | std::stringstream ss1; 269 | ss1 << i << " chars"; 270 | std::stringstream ss2; 271 | ss2 << i; 272 | warn(row_, col_, ss1.str(), ss2.str()); 273 | } 274 | 275 | tooShort = true; 276 | break; 277 | } 278 | if (*fieldEnd == '\0') { 279 | hasNull = true; 280 | } 281 | 282 | fieldEnd++; 283 | } 284 | } 285 | 286 | Token t = fieldToken(fieldBegin, fieldEnd, hasNull); 287 | 288 | if (lastCol || tooShort) { 289 | row_++; 290 | col_ = 0; 291 | 292 | if (!(tooShort || isRagged_)) { 293 | // Advance to the end of the line in case we are not already there. 294 | // This is needed when the last column in the file is not being read.
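// [Editor's note] Example of why this matters: if the declared widths end at
// offset 5 but the physical line is "abcdefgh", the loop below walks past
// "fgh" so that cur_/curLine_ start cleanly on the next row instead of
// mid-line (hypothetical input, not from this repo's tests).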
295 | while (fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') { 296 | fieldEnd++; 297 | } 298 | } 299 | 300 | curLine_ = fieldEnd; 301 | advanceForLF(&curLine_, end_); 302 | if (curLine_ != end_) { 303 | curLine_++; 304 | } 305 | cur_ = curLine_; 306 | } else { 307 | col_++; 308 | cur_ = fieldEnd; 309 | } 310 | 311 | return t; 312 | } 313 | 314 | Token TokenizerFwf::fieldToken( 315 | SourceIterator begin, SourceIterator end, bool hasNull) { 316 | if (begin == end) { 317 | return {TOKEN_MISSING, row_, col_}; 318 | } 319 | 320 | Token t = Token(begin, end, row_, col_, hasNull); 321 | if (trimWS_) { 322 | t.trim(); 323 | } 324 | t.flagNA(NA_); 325 | 326 | return t; 327 | } 328 | 329 | bool TokenizerFwf::isComment(const char* cur) const { 330 | if (!hasComment_) { 331 | return false; 332 | } 333 | 334 | return starts_with_comment(cur, end_, comment_); 335 | } 336 | 337 | bool TokenizerFwf::isEmpty() const { 338 | return cur_ == end_ || *cur_ == '\r' || *cur_ == '\n'; 339 | } 340 | -------------------------------------------------------------------------------- /src/TokenizerFwf.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_TOKENIZERFWF_H_ 2 | #define MELTR_TOKENIZERFWF_H_ 3 | 4 | #include "Token.h" 5 | #include "Tokenizer.h" 6 | #include "utils.h" 7 | 8 | class TokenizerFwf : public Tokenizer { 9 | std::vector beginOffset_; 10 | std::vector endOffset_; 11 | std::vector NA_; 12 | 13 | SourceIterator begin_, cur_, curLine_, end_; 14 | int row_, col_, cols_, max_; 15 | std::string comment_; 16 | bool moreTokens_, isRagged_, hasComment_, trimWS_; 17 | bool skipEmptyRows_; 18 | 19 | public: 20 | TokenizerFwf( 21 | const std::vector& beginOffset, 22 | const std::vector& endOffset, 23 | std::vector NA = std::vector(1, "NA"), 24 | const std::string& comment = "", 25 | bool trimWS = true, 26 | bool skipEmptyRows = true); 27 | 28 | void tokenize(SourceIterator begin, SourceIterator end); 29 | 30 | std::pair progress(); 31 | 32 | Token nextToken(); 33 | 34 | private: 35 | Token fieldToken(SourceIterator begin, SourceIterator end, bool hasNull); 36 | 37 | bool isComment(const char* cur) const; 38 | bool isEmpty() const; 39 | }; 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /src/TokenizerWs.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/R.hpp" 2 | 3 | #include "Tokenizer.h" 4 | #include "TokenizerFwf.h" 5 | #include "TokenizerWs.h" 6 | #include "utils.h" 7 | 8 | #include "Source.h" 9 | 10 | // TokenizerWs 11 | // -------------------------------------------------------------------- 12 | 13 | #include 14 | #include 15 | 16 | TokenizerWs::TokenizerWs( 17 | std::vector NA, const std::string& comment, bool skipEmptyRows) 18 | : NA_(std::move(NA)), 19 | comment_(comment), 20 | moreTokens_(false), 21 | hasComment_(!comment.empty()), 22 | skipEmptyRows_(skipEmptyRows) {} 23 | 24 | void TokenizerWs::tokenize(SourceIterator begin, SourceIterator end) { 25 | cur_ = begin; 26 | curLine_ = begin; 27 | 28 | begin_ = begin; 29 | end_ = end; 30 | 31 | row_ = 0; 32 | col_ = 0; 33 | moreTokens_ = true; 34 | } 35 | 36 | std::pair TokenizerWs::progress() { 37 | size_t bytes = cur_ - begin_; 38 | return std::make_pair(bytes / (double)(end_ - begin_), bytes); 39 | } 40 | 41 | Token TokenizerWs::nextToken() { 42 | // Check for comments and empty lines at the start of a line 43 | while (cur_ != end_ && col_ == 0 && 44 | (isComment(cur_) || 
(skipEmptyRows_ && isEmpty()))) { 45 | ignoreLine(); 46 | } 47 | 48 | if (cur_ == end_) { 49 | return {TOKEN_EOF, 0, 0}; 50 | } 51 | 52 | // Find start of field 53 | SourceIterator fieldBegin = cur_; 54 | while (fieldBegin != end_ && (isblank(*fieldBegin) != 0)) { 55 | ++fieldBegin; 56 | } 57 | 58 | // Make sure we are not at the start of a comment 59 | if (isComment(fieldBegin)) { 60 | ignoreLine(); 61 | row_++; 62 | col_ = 0; 63 | return nextToken(); 64 | } 65 | 66 | SourceIterator fieldEnd = fieldBegin; 67 | while (fieldEnd != end_ && (isspace(*fieldEnd) == 0)) { 68 | ++fieldEnd; 69 | } 70 | bool hasNull = fieldEnd != end_ && *fieldEnd == '\0'; 71 | Token t = fieldToken(fieldBegin, fieldEnd, hasNull); 72 | cur_ = fieldEnd; 73 | ++col_; 74 | if (cur_ != end_ && (*cur_ == '\r' || *cur_ == '\n')) { 75 | advanceForLF(&cur_, end_); 76 | ++cur_; 77 | row_++; 78 | col_ = 0; 79 | } 80 | return t; 81 | } 82 | 83 | Token TokenizerWs::fieldToken( 84 | SourceIterator begin, SourceIterator end, bool hasNull) { 85 | if (begin == end) { 86 | return {TOKEN_MISSING, row_, col_}; 87 | } 88 | 89 | Token t = Token(begin, end, row_, col_, hasNull); 90 | t.trim(); 91 | t.flagNA(NA_); 92 | 93 | return t; 94 | } 95 | 96 | bool TokenizerWs::isComment(const char* cur) const { 97 | if (!hasComment_) { 98 | return false; 99 | } 100 | 101 | return starts_with_comment(cur, end_, comment_); 102 | } 103 | 104 | bool TokenizerWs::isEmpty() const { 105 | return cur_ == end_ || *cur_ == '\r' || *cur_ == '\n'; 106 | } 107 | 108 | void TokenizerWs::ignoreLine() { 109 | // Skip rest of line 110 | while (cur_ != end_ && *cur_ != '\n' && *cur_ != '\r') { 111 | ++cur_; 112 | } 113 | advanceForLF(&cur_, end_); 114 | if (cur_ != end_) { 115 | ++cur_; 116 | } 117 | curLine_ = cur_; 118 | } 119 | -------------------------------------------------------------------------------- /src/TokenizerWs.h: -------------------------------------------------------------------------------- 1 | #ifndef READR_TOKENIZERWS_H_ 2 | #define READR_TOKENIZERWS_H_ 3 | 4 | #include "Token.h" 5 | #include "Tokenizer.h" 6 | #include "utils.h" 7 | 8 | class TokenizerWs : public Tokenizer { 9 | std::vector NA_; 10 | 11 | SourceIterator begin_, cur_, curLine_, end_; 12 | int row_, col_; 13 | std::string comment_; 14 | bool moreTokens_, hasComment_; 15 | bool skipEmptyRows_; 16 | 17 | public: 18 | TokenizerWs( 19 | std::vector NA = std::vector(1, "NA"), 20 | const std::string& comment = "", 21 | bool skipEmptyRows = true); 22 | 23 | void tokenize(SourceIterator begin, SourceIterator end); 24 | 25 | std::pair progress(); 26 | 27 | Token nextToken(); 28 | 29 | private: 30 | Token fieldToken(SourceIterator begin, SourceIterator end, bool hasNull); 31 | 32 | bool isComment(const char* cur) const; 33 | bool isEmpty() const; 34 | void ignoreLine(); 35 | }; 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/Warnings.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_WARNINGS_H_ 2 | #define MELTR_WARNINGS_H_ 3 | 4 | #include "cpp11/data_frame.hpp" 5 | #include "cpp11/sexp.hpp" 6 | #include "cpp11/strings.hpp" 7 | #include 8 | #include 9 | 10 | class Warnings { 11 | std::vector row_, col_; 12 | std::vector expected_, actual_; 13 | 14 | public: 15 | Warnings() {} 16 | 17 | // row and col should be zero-indexed. 
-------------------------------------------------------------------------------- /src/connection.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/R.hpp" 2 | #include "cpp11/function.hpp" 3 | #include "cpp11/raws.hpp" 4 | #include "cpp11/strings.hpp" 5 | 6 | #include <fstream> 7 | 8 | // Wrapper around R's readBin function 9 | SEXP read_bin(const cpp11::sexp& con, int bytes) { 10 | static auto readBin = cpp11::package("base")["readBin"]; 11 | 12 | return readBin(con, "raw", bytes); 13 | } 14 | 15 | // Read data from a connection in chunks, writing each chunk to `filename`; 16 | // returns the path of the file that now holds the connection's bytes. 17 | // 18 | [[cpp11::register]] std::string 19 | read_connection_(const cpp11::sexp& con, std::string filename, int chunk_size) { 20 | 21 | std::ofstream out(filename.c_str(), std::fstream::out | std::fstream::binary); 22 | 23 | SEXP chunk = read_bin(con, chunk_size); 24 | R_xlen_t chunk_len = Rf_xlength(chunk); 25 | 26 | while (chunk_len > 0) { 27 | std::copy( 28 | RAW(chunk), 29 | RAW(chunk) + Rf_xlength(chunk), 30 | std::ostream_iterator<char>(out)); 31 | chunk = read_bin(con, chunk_size); 32 | chunk_len = Rf_xlength(chunk); 33 | } 34 | 35 | return filename; 36 | } 37 |
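The same chunked copy loop sketched standalone, with std::ifstream standing in for the R connection (illustrative; the helper name is made up and is not part of the package):

#include <cstddef>
#include <fstream>
#include <vector>

void copy_in_chunks(const char* src, const char* dst, std::size_t chunk_size) {
  std::ifstream in(src, std::ios::binary);
  std::ofstream out(dst, std::ios::binary);
  std::vector<char> buf(chunk_size);
  // Mirror read_connection_: pull a chunk, write what was actually read,
  // and stop once a read yields zero bytes.
  while (in.read(buf.data(), buf.size()) || in.gcount() > 0) {
    out.write(buf.data(), in.gcount());
  }
}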
-------------------------------------------------------------------------------- /src/cpp11.cpp: -------------------------------------------------------------------------------- 1 | // Generated by cpp11: do not edit by hand 2 | // clang-format off 3 | 4 | 5 | #include "cpp11/declarations.hpp" 6 | #include <R_ext/Visibility.h> 7 | 8 | // CollectorGuess.cpp 9 | std::string collectorGuess(const cpp11::strings& input, const cpp11::list& locale_, bool guessInteger); 10 | extern "C" SEXP _meltr_collectorGuess(SEXP input, SEXP locale_, SEXP guessInteger) { 11 | BEGIN_CPP11 12 | return cpp11::as_sexp(collectorGuess(cpp11::as_cpp<cpp11::decay_t<const cpp11::strings&>>(input), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(locale_), cpp11::as_cpp<cpp11::decay_t<bool>>(guessInteger))); 13 | END_CPP11 14 | } 15 | // connection.cpp 16 | std::string read_connection_(const cpp11::sexp& con, std::string filename, int chunk_size); 17 | extern "C" SEXP _meltr_read_connection_(SEXP con, SEXP filename, SEXP chunk_size) { 18 | BEGIN_CPP11 19 | return cpp11::as_sexp(read_connection_(cpp11::as_cpp<cpp11::decay_t<const cpp11::sexp&>>(con), cpp11::as_cpp<cpp11::decay_t<std::string>>(filename), cpp11::as_cpp<cpp11::decay_t<int>>(chunk_size))); 20 | END_CPP11 21 | } 22 | // read.cpp 23 | cpp11::strings read_file_(const cpp11::list& sourceSpec, const cpp11::list& locale_); 24 | extern "C" SEXP _meltr_read_file_(SEXP sourceSpec, SEXP locale_) { 25 | BEGIN_CPP11 26 | return cpp11::as_sexp(read_file_(cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(sourceSpec), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(locale_))); 27 | END_CPP11 28 | } 29 | // read.cpp 30 | cpp11::raws read_file_raw_(const cpp11::list& sourceSpec); 31 | extern "C" SEXP _meltr_read_file_raw_(SEXP sourceSpec) { 32 | BEGIN_CPP11 33 | return cpp11::as_sexp(read_file_raw_(cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(sourceSpec))); 34 | END_CPP11 35 | } 36 | // read.cpp 37 | cpp11::sexp melt_tokens_(const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec, const cpp11::list& colSpecs, const cpp11::list& locale_, int n_max, bool progress); 38 | extern "C" SEXP _meltr_melt_tokens_(SEXP sourceSpec, SEXP tokenizerSpec, SEXP colSpecs, SEXP locale_, SEXP n_max, SEXP progress) { 39 | BEGIN_CPP11 40 | return cpp11::as_sexp(melt_tokens_(cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(sourceSpec), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(tokenizerSpec), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(colSpecs), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(locale_), cpp11::as_cpp<cpp11::decay_t<int>>(n_max), cpp11::as_cpp<cpp11::decay_t<bool>>(progress))); 41 | END_CPP11 42 | } 43 | // read.cpp 44 | void melt_tokens_chunked_(const cpp11::list& sourceSpec, const cpp11::environment& callback, int chunkSize, const cpp11::list& tokenizerSpec, const cpp11::list& colSpecs, const cpp11::list& locale_, bool progress); 45 | extern "C" SEXP _meltr_melt_tokens_chunked_(SEXP sourceSpec, SEXP callback, SEXP chunkSize, SEXP tokenizerSpec, SEXP colSpecs, SEXP locale_, SEXP progress) { 46 | BEGIN_CPP11 47 | melt_tokens_chunked_(cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(sourceSpec), cpp11::as_cpp<cpp11::decay_t<const cpp11::environment&>>(callback), cpp11::as_cpp<cpp11::decay_t<int>>(chunkSize), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(tokenizerSpec), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(colSpecs), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(locale_), cpp11::as_cpp<cpp11::decay_t<bool>>(progress)); 48 | return R_NilValue; 49 | END_CPP11 50 | } 51 | // TokenizerFwf.cpp 52 | cpp11::list whitespaceColumns(const cpp11::list& sourceSpec, int n, std::string comment); 53 | extern "C" SEXP _meltr_whitespaceColumns(SEXP sourceSpec, SEXP n, SEXP comment) { 54 | BEGIN_CPP11 55 | return cpp11::as_sexp(whitespaceColumns(cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(sourceSpec), cpp11::as_cpp<cpp11::decay_t<int>>(n), cpp11::as_cpp<cpp11::decay_t<std::string>>(comment))); 56 | END_CPP11 57 | } 58 | 59 | extern "C" { 60 | static const R_CallMethodDef CallEntries[] = { 61 | {"_meltr_collectorGuess", (DL_FUNC) &_meltr_collectorGuess, 3}, 62 | {"_meltr_melt_tokens_", (DL_FUNC) &_meltr_melt_tokens_, 6}, 63 | {"_meltr_melt_tokens_chunked_", (DL_FUNC) &_meltr_melt_tokens_chunked_, 7}, 64 | {"_meltr_read_connection_", (DL_FUNC) &_meltr_read_connection_, 3}, 65 | {"_meltr_read_file_", (DL_FUNC) &_meltr_read_file_, 2}, 66 | {"_meltr_read_file_raw_", (DL_FUNC) &_meltr_read_file_raw_, 1}, 67 | {"_meltr_whitespaceColumns", (DL_FUNC) &_meltr_whitespaceColumns, 3}, 68 | {NULL, NULL, 0} 69 | }; 70 | } 71 | 72 | extern "C" attribute_visible void R_init_meltr(DllInfo* dll){ 73 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 74 | R_useDynamicSymbols(dll, FALSE); 75 | R_forceSymbols(dll, TRUE); 76 | } 77 |
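Every generated wrapper above has the same shape: plain SEXP in and out at the C boundary, BEGIN_CPP11/END_CPP11 translating C++ exceptions into R errors, and as_cpp()/as_sexp() performing the conversions. A sketch of that pattern for a hypothetical registered function f (not part of meltr):

#include "cpp11/declarations.hpp"

double f(int n);  // the C++ function that [[cpp11::register]] would expose

extern "C" SEXP _pkg_f(SEXP n) {
  BEGIN_CPP11
  // as_cpp unwraps the incoming SEXP; as_sexp wraps the C++ return value.
  return cpp11::as_sexp(f(cpp11::as_cpp<cpp11::decay_t<int>>(n)));
  END_CPP11
}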
-------------------------------------------------------------------------------- /src/read.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "cpp11/environment.hpp" 4 | #include "cpp11/function.hpp" 5 | #include "cpp11/list.hpp" 6 | #include "cpp11/strings.hpp" 7 | 8 | #include "Collector.h" 9 | #include "LocaleInfo.h" 10 | #include "Progress.h" 11 | #include "Reader.h" 12 | #include "Source.h" 13 | #include "Tokenizer.h" 14 | #include "Warnings.h" 15 | 16 | [[cpp11::register]] cpp11::strings 17 | read_file_(const cpp11::list& sourceSpec, const cpp11::list& locale_) { 18 | SourcePtr source = Source::create(sourceSpec); 19 | LocaleInfo locale(locale_); 20 | 21 | return cpp11::writable::strings( 22 | locale.encoder_.makeSEXP(source->begin(), source->end())); 23 | } 24 | 25 | [[cpp11::register]] cpp11::raws read_file_raw_(const cpp11::list& sourceSpec) { 26 | SourcePtr source = Source::create(sourceSpec); 27 | 28 | cpp11::writable::raws res( 29 | static_cast<R_xlen_t>(source->end() - source->begin())); 30 | std::copy(source->begin(), source->end(), RAW(res)); 31 | return SEXP(res); 32 | } 33 | 34 | cpp11::function 35 | R6method(const cpp11::environment& env, const std::string& method) { 36 | return static_cast<SEXP>(env[method.c_str()]); 37 | } 38 | bool isTrue(SEXP x) { 39 | if (!(TYPEOF(x) == LGLSXP && Rf_length(x) == 1)) { 40 | cpp11::stop("`continue()` must return a length 1 logical vector"); 41 | } 42 | return LOGICAL(x)[0] == TRUE; 43 | } 44 | 45 | typedef std::vector<CollectorPtr>::iterator CollectorItr; 46 | 47 | [[cpp11::register]] cpp11::sexp melt_tokens_( 48 | const cpp11::list& sourceSpec, 49 | const cpp11::list& tokenizerSpec, 50 | const cpp11::list& colSpecs, 51 | const cpp11::list& locale_, 52 | int n_max, 53 | bool progress) { 54 | 55 | LocaleInfo l(locale_); 56 | Reader r( 57 | Source::create(sourceSpec), 58 | Tokenizer::create(tokenizerSpec), 59 | collectorsCreate(colSpecs, &l), 60 | progress); 61 | 62 | return r.meltToDataFrame(cpp11::list(locale_), n_max); 63 | } 64 | 65 | [[cpp11::register]] void melt_tokens_chunked_( 66 | const cpp11::list& sourceSpec, 67 | const cpp11::environment& callback, 68 | int chunkSize, 69 | const cpp11::list& tokenizerSpec, 70 | const cpp11::list& colSpecs, 71 | const cpp11::list& locale_, 72 | bool progress) { 73 | 74 | LocaleInfo l(locale_); 75 | Reader r( 76 | Source::create(sourceSpec), 77 | Tokenizer::create(tokenizerSpec), 78 | collectorsCreate(colSpecs, &l), 79 | progress); 80 | 81 | int pos = 1; 82 | while (isTrue(R6method(callback, "continue")())) { 83 | cpp11::data_frame out( 84 | r.meltToDataFrame(static_cast<SEXP>(locale_), chunkSize)); 85 | if (out.nrow() == 0) { 86 | return; 87 | } 88 | R6method(callback, "receive")(out, pos); 89 | pos += out.nrow(); 90 | } 91 | } 92 |
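The chunked interface is a pull loop over two R6 methods: continue() gates each iteration and receive() is handed the chunk together with the 1-based row position where it starts. A standalone mock of that contract, with std::function standing in for the R6 calls (all names here are illustrative):

#include <functional>

struct ChunkResult { int nrow; };  // stand-in for the melted data frame

void melt_chunked_mock(
    const std::function<bool()>& keep_going,               // continue()
    const std::function<void(ChunkResult, int)>& receive,  // receive(data, pos)
    const std::function<ChunkResult(int)>& next_chunk,
    int chunk_size) {
  int pos = 1;
  while (keep_going()) {
    ChunkResult out = next_chunk(chunk_size);
    if (out.nrow == 0) {
      return;  // input exhausted: stop without invoking receive()
    }
    receive(out, pos);
    pos += out.nrow;  // pos now points at the first row of the next chunk
  }
}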
-------------------------------------------------------------------------------- /src/unicode_fopen.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <cstdio> 4 | // clang-format off 5 | #ifdef __clang__ 6 | # pragma clang diagnostic push 7 | # pragma clang diagnostic ignored "-Wsign-compare" 8 | #include "mio.h" 9 | # pragma clang diagnostic pop 10 | #else 11 | #include "mio.h" 12 | #endif 13 | // clang-format on 14 | 15 | #ifdef _WIN32 16 | #include <windows.h> 17 | #include <R.h> 18 | #endif 19 | 20 | // This is needed to support wide character paths on windows 21 | inline FILE* unicode_fopen(const char* path, const char* mode) { 22 | FILE* out; 23 | #ifdef _WIN32 24 | // First convert the mode to the wide equivalent 25 | // Only usage is 2 characters so max 8 bytes + 2 byte null. 26 | wchar_t mode_w[10]; 27 | MultiByteToWideChar(CP_UTF8, 0, mode, -1, mode_w, 9); 28 | 29 | // Then convert the path 30 | wchar_t* buf; 31 | size_t len = MultiByteToWideChar(CP_UTF8, 0, path, -1, NULL, 0); 32 | if (len <= 0) { 33 | Rf_error("Cannot convert file to Unicode: %s", path); 34 | } 35 | buf = (wchar_t*)R_alloc(len, sizeof(wchar_t)); 36 | if (buf == NULL) { 37 | Rf_error("Could not allocate buffer of size: %zu", len); 38 | } 39 | 40 | MultiByteToWideChar(CP_UTF8, 0, path, -1, buf, len); 41 | out = _wfopen(buf, mode_w); 42 | #else 43 | out = fopen(path, mode); 44 | #endif 45 | 46 | return out; 47 | } 48 | 49 | inline mio::mmap_source 50 | make_mmap_source(const char* file, std::error_code& error) { 51 | #ifdef _WIN32 52 | wchar_t* buf; 53 | size_t len = MultiByteToWideChar(CP_UTF8, 0, file, -1, NULL, 0); 54 | if (len <= 0) { 55 | Rf_error("Cannot convert file to Unicode: %s", file); 56 | } 57 | buf = (wchar_t*)malloc(len * sizeof(wchar_t)); 58 | if (buf == NULL) { 59 | Rf_error("Could not allocate buffer of size: %zu", len); 60 | } 61 | 62 | MultiByteToWideChar(CP_UTF8, 0, file, -1, buf, len); 63 | mio::mmap_source out = mio::make_mmap_source(buf, error); 64 | free(buf); 65 | return out; 66 | #else 67 | return mio::make_mmap_source(file, error); 68 | #endif 69 | } 70 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_UTILS_H_ 2 | #define MELTR_UTILS_H_ 3 | 4 | #include <cstring> 5 | #include <locale> 6 | #include <string> 7 | 8 | // If *pBegin points at a CR that is followed by a LF, advances *pBegin onto the LF. 9 | // Returns an iterator to the end of the current line (the original position). 10 | template <typename Iter> inline Iter advanceForLF(Iter* pBegin, Iter end) { 11 | Iter cur = *pBegin; 12 | if (cur == end) { 13 | return cur; 14 | } 15 | if (*cur == '\r' && (cur + 1 != end) && *(cur + 1) == '\n') 16 | (*pBegin)++; 17 | 18 | return cur; 19 | } 20 | 21 | const static char* const true_values[] = { 22 | "T", "t", "True", "TRUE", "true", (char*)NULL}; 23 | const static char* const false_values[] = { 24 | "F", "f", "False", "FALSE", "false", (char*)NULL}; 25 | 26 | inline bool isTrue(const char* start, const char* end) { 27 | size_t len = end - start; 28 | 29 | for (int i = 0; true_values[i]; i++) { 30 | size_t true_len = strlen(true_values[i]); 31 | if (true_len == len && strncmp(start, true_values[i], len) == 0) { 32 | return true; 33 | } 34 | } 35 | return false; 36 | } 37 | inline bool isFalse(const char* start, const char* end) { 38 | size_t len = end - start; 39 | 40 | for (int i = 0; false_values[i]; i++) { 41 | if (strlen(false_values[i]) == len && 42 | strncmp(start, false_values[i], len) == 0) { 43 | return true; 44 | } 45 | } 46 | return false; 47 | } 48 | 49 | inline bool isLogical(const char* start, const char* end) { 50 | return isTrue(start, end) || isFalse(start, end); 51 | } 52 | 53 | inline bool istarts_with(const std::string& input, const std::string& test) { 54 | if (test.size() > input.size()) { 55 | return false; 56 | } 57 | 58 | auto test_it = test.cbegin(); 59 | auto input_it = input.cbegin(); 60 | auto test_end = test.cend(); 61 | auto locale = std::locale(); 62 | while (test_it != test_end) { 63 | if (std::toupper(*test_it++, locale) != std::toupper(*input_it++, locale)) { 64 | return false; 65 | } 66 | } 67 | return true; 68 | } 69 | 70 | inline bool starts_with_comment( 71 | const char* cur, const char* end, const std::string& comment) { 72 | // If the comment is bigger than what we are testing, it cannot start with it.
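  //   e.g. comment = "#" against cur -> "# note" yields true; against "42", false.
  //   (The byte-by-byte comparison below never reads past end, thanks to the size check.)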
73 | if ((long)comment.size() > (end - cur)) { 74 | return false; 75 | } 76 | for (auto c : comment) { 77 | if (*cur++ != c) { 78 | return false; 79 | } 80 | } 81 | return true; 82 | } 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(meltr) 3 | 4 | test_check("meltr") 5 | -------------------------------------------------------------------------------- /tests/testthat/basic-df-singlequote.csv: -------------------------------------------------------------------------------- 1 | a,b,c,d 2 | TRUE,7,0.181526642525569,'m' 3 | TRUE,2,0.833227441413328,'z' 4 | TRUE,8,0.926790483295918,'r' 5 | FALSE,10,0.375270307529718,'s' 6 | TRUE,6,0.420266286935657,'g' 7 | TRUE,3,0.435449987649918,'h' 8 | TRUE,5,0.0210941969417036,'w' 9 | FALSE,9,0.0915570755023509,'u' 10 | FALSE,1,0.756106866057962,'l' 11 | FALSE,4,0.353530979715288,NA 12 | -------------------------------------------------------------------------------- /tests/testthat/basic-df.csv: -------------------------------------------------------------------------------- 1 | a,b,c,d 2 | TRUE,7,0.181526642525569,"m" 3 | TRUE,2,0.833227441413328,"z" 4 | TRUE,8,0.926790483295918,"r" 5 | FALSE,10,0.375270307529718,"s" 6 | TRUE,6,0.420266286935657,"g" 7 | TRUE,3,0.435449987649918,"h" 8 | TRUE,5,0.0210941969417036,"w" 9 | FALSE,9,0.0915570755023509,"u" 10 | FALSE,1,0.756106866057962,"l" 11 | FALSE,4,0.353530979715288,NA 12 | -------------------------------------------------------------------------------- /tests/testthat/empty-file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-lib/meltr/38c5a720afe794d1fd2f36e5bb552dd9a8ca8b47/tests/testthat/empty-file -------------------------------------------------------------------------------- /tests/testthat/enc-iso-8859-1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-lib/meltr/38c5a720afe794d1fd2f36e5bb552dd9a8ca8b47/tests/testthat/enc-iso-8859-1.txt -------------------------------------------------------------------------------- /tests/testthat/fwf-trailing.txt: -------------------------------------------------------------------------------- 1 | 123 123 2 | 123 123 3 | -------------------------------------------------------------------------------- /tests/testthat/helper.R: -------------------------------------------------------------------------------- 1 | # Provide helper overriding tibble::all.equal.tbl_df as it requires dplyr 2 | # https://github.com/tidyverse/readr/pull/577 3 | # Using this helper allows us to avoid Suggesting dplyr 4 | all.equal.tbl_df <- function(target, current, ..., check.attributes = FALSE) { 5 | all.equal.list(target, current, ..., check.attributes = check.attributes) 6 | } 7 | 8 | is_bz2_file <- function(x) { 9 | 10 | # Magic number for bz2 is "BZh" in ASCII 11 | # https://en.wikipedia.org/wiki/Bzip2#File_format 12 | identical(charToRaw("BZh"), readBin(x, n = 3, what = "raw")) 13 | } 14 | 15 | encoded <- function(x, encoding) { 16 | Encoding(x) <- encoding 17 | x 18 | } 19 | 20 | skip_if_no_clipboard <- function() { 21 | if (!clipr::clipr_available()) { 22 | testthat::skip("System clipboard is not available - skipping test.") 23 | } 24 | return(invisible(TRUE)) 25 | } 26 | 27 | with_crayon <- function(expr) { 28 | old <- options(crayon.enabled = TRUE, 
crayon.colors = 16) 29 | crayon::num_colors(forget = TRUE) 30 | on.exit({ 31 | options(old) 32 | crayon::num_colors(forget = TRUE) 33 | }) 34 | 35 | force(expr) 36 | } 37 | -------------------------------------------------------------------------------- /tests/testthat/non-tabular.csv: -------------------------------------------------------------------------------- 1 | a,"b",'c' 2 | ,,NA,"NA", 3 | a,1,1.0,1.1,1e3 4 | -------------------------------------------------------------------------------- /tests/testthat/raw.csv: -------------------------------------------------------------------------------- 1 | abc,def 2 | abc,def 3 | -------------------------------------------------------------------------------- /tests/testthat/table-crash: -------------------------------------------------------------------------------- 1 | 3.5022800E+05 2.1990000E+02 1.7455317E-03 5.0152367E+00 1.0200010E+00 0.0000000E+00 1.0360000E+03 2 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 3 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 4 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 5 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 6 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 7 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 8 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 9 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 10 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 11 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 12 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 13 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 14 | 1.3231179E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 6.9944140E-03 15 | -9.9920000E+02 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 16 | -9.9920000E+02 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 17 | -9.9920000E+02 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 18 | -9.9920000E+02 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 19 | 2.5980995E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 7.0062219E-03 20 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 21 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 22 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 23 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 24 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 25 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 26 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 27 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 
0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 28 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 29 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 30 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 31 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 32 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 33 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 34 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 35 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 36 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 37 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 38 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 39 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.00000 -------------------------------------------------------------------------------- /tests/testthat/test-melt-chunked.R: -------------------------------------------------------------------------------- 1 | test_that("melt_delim_chunked", { 2 | file <- meltr_example("mtcars.csv") 3 | unchunked <- melt_csv(file) 4 | 5 | get_dims <- function(data, pos) dims[[length(dims) + 1]] <<- dim(data) 6 | 7 | # Full file in one chunk 8 | dims <- list() 9 | melt_csv_chunked(file, get_dims) 10 | expect_equal(dim(unchunked), dims[[1]]) 11 | 12 | # Each line separately 13 | dims <- list() 14 | melt_csv_chunked(file, get_dims, chunk_size = 1) 15 | expect_true(all(vapply(dims[1:6], identical, logical(1), c(11L, 4L)))) 16 | expect_equal(nrow(unchunked) / 11L, length(dims)) 17 | 18 | # In chunks of 5 19 | dims <- list() 20 | melt_csv_chunked(file, get_dims, chunk_size = 5) 21 | expect_true(all(vapply(dims[1:6], identical, logical(1), c(55L, 4L)))) 22 | expect_true(identical(dims[[7]], c(33L, 4L))) 23 | 24 | # Halting early 25 | get_dims_stop <- function(data, pos) { 26 | dims[[length(dims) + 1]] <<- dim(data) 27 | if (pos >= 5) { 28 | return(FALSE) 29 | } 30 | } 31 | dims <- list() 32 | melt_csv_chunked(file, get_dims_stop, chunk_size = 5) 33 | expect_true(length(dims) == 2) 34 | expect_true(all(vapply(dims[1:2], identical, logical(1), c(55L, 4L)))) 35 | }) 36 | 37 | test_that("DataFrameCallback works as intended", { 38 | f <- meltr_example("mtcars.csv") 39 | out0 <- subset(melt_csv(f), data_type == "integer") 40 | fun3 <- DataFrameCallback$new(function(x, pos) { 41 | subset(x, data_type == "integer") 42 | }) 43 | 44 | out1 <- melt_csv_chunked(f, fun3) 45 | out2 <- melt_csv_chunked(f, fun3, chunk_size = 1) 46 | out3 <- melt_csv_chunked(f, fun3, chunk_size = 10) 47 | 48 | expect_true(all.equal(out0, out1)) 49 | expect_true(all.equal(out0, out2)) 50 | expect_true(all.equal(out0, out3)) 51 | 52 | 53 | # The same filter again, with a freshly constructed callback 54 | out0 <- subset(melt_csv(f), data_type == "integer") 55 | 56 | fun5 <- DataFrameCallback$new(function(x, pos) subset(x, data_type == "integer")) 57 | 58 | out1 <- melt_csv_chunked(f, fun5) 59 | 60 | # A chunk_size of 1 feeds the callback one line of the file at a time 61 | out2 <- melt_csv_chunked(f, fun5, chunk_size = 1) 62 | 63 | out3 <- melt_csv_chunked(f, fun5,
chunk_size = 10) 64 | 65 | expect_true(all.equal(out0, out1)) 66 | expect_true(all.equal(out0, out2)) 67 | expect_true(all.equal(out0, out3)) 68 | }) 69 | 70 | test_that("ListCallback works as intended", { 71 | f <- meltr_example("mtcars.csv") 72 | out0 <- melt_csv(f) 73 | 74 | fun <- ListCallback$new(function(x, pos) x[["value"]]) 75 | out1 <- melt_csv_chunked(f, fun, chunk_size = 10) 76 | 77 | expect_equal(out0[["value"]], unlist(out1)) 78 | }) 79 | -------------------------------------------------------------------------------- /tests/testthat/test-melt-csv.R: -------------------------------------------------------------------------------- 1 | test_that("melt_csv type imputation and NA detection works", { 2 | skip_on_os("windows") 3 | melt_data <- melt_csv("non-tabular.csv", na = "NA") 4 | expect_equal( 5 | melt_data$data_type[7:11], 6 | c("missing", "empty", "character", "integer", "double") 7 | ) 8 | }) 9 | 10 | test_that("melt_tsv works on a simple file", { 11 | melt_data <- melt_tsv("a\tb\n1\t2") 12 | expect_equal(melt_data$data_type, rep(c("character", "integer"), each = 2)) 13 | }) 14 | 15 | test_that("melt_csv's 'NA' option genuinely changes the NA values", { 16 | expect_equal(melt_csv("z\n", na = "z")$data_type, "missing") 17 | }) 18 | 19 | test_that("melt_csv's 'NA' option works with multiple NA values", { 20 | expect_equal( 21 | melt_csv("NA\nmiss\n13", na = c("13", "miss"))$data_type, 22 | c("character", "missing", "missing") 23 | ) 24 | }) 25 | 26 | test_that('passing character() to melt_csv\'s "NA" option reads "" correctly', { 27 | expect_equal(melt_csv("foo\n", na = character())$value, "foo") 28 | }) 29 | 30 | test_that("passing \"\" to melt_csv's 'NA' option reads \"\" correctly", { 31 | expect_equal( 32 | melt_csv("foo,bar\nfoo,\n", na = "")$value, 33 | c("foo", "bar", "foo", NA) 34 | ) 35 | }) 36 | 37 | test_that("changing melt_csv's 'quote' argument works correctly", { 38 | test_data <- melt_csv("basic-df.csv") 39 | test_data_singlequote <- melt_csv("basic-df-singlequote.csv", quote = "'") 40 | expect_identical(test_data, test_data_singlequote) 41 | }) 42 | 43 | test_that("melt_csv's 'skip' option allows for skipping", { 44 | test_data <- melt_csv("basic-df.csv", skip = 1) 45 | expect_equal(nrow(test_data), 40) 46 | }) 47 | 48 | test_that("melt_csv's 'n_max' allows for a maximum number of records and does not corrupt any", { 49 | test_data <- melt_csv("basic-df.csv", n_max = 7) 50 | expect_equal(nrow(test_data), 28) 51 | expect_equal(sum(test_data$data_type == "missing"), 0) 52 | }) 53 | 54 | test_that("can read more than 100 columns", { 55 | set.seed(2015 - 3 - 13) 56 | x <- as.data.frame(matrix(rbinom(300, 2, .5), nrow = 2)) 57 | f <- tempfile() 58 | on.exit(unlink(f)) 59 | write.csv(x, f, row.names = FALSE) 60 | expect_equal(max(melt_csv(f)$col), 150) 61 | }) 62 | 63 | test_that("encoding affects text", { 64 | x <- melt_csv("enc-iso-8859-1.txt", locale = locale(encoding = "ISO-8859-1")) 65 | expect_identical(x$value[2], "\u00e9l\u00e8ve") 66 | }) 67 | 68 | test_that("nuls are dropped with a warning", { 69 | expect_warning(x <- melt_csv("raw.csv")) 70 | expect_equal(n_problems(x), 1) 71 | expect_equal(x$value[3], "ab") 72 | }) 73 | 74 | test_that("can read from the clipboard", { 75 | skip_on_cran() 76 | skip_if_no_clipboard() 77 | clipr::write_clip("a,b,c\n1,2,3") 78 | expect_identical(melt_csv(clipboard()), melt_csv("a,b,c\n1,2,3")) 79 | }) 80 | 81 | test_that("can read from a multi-line character vector", { 82 | expect_identical(max(melt_csv(c("a,b,c",
"1,2,3"))$row), 2) 83 | }) 84 | 85 | # Column warnings --------------------------------------------------------- 86 | 87 | test_that("missing lines are not skipped", { 88 | # first 89 | expect_equal(max(melt_csv("a,b\n\n\n1,2")$row), 4) 90 | 91 | # middle 92 | expect_equal(max(melt_csv("a,b\n1,2\n\n\n2,3\n")$row), 5) 93 | 94 | # last (trailing \n is ignored) 95 | expect_equal(max(melt_csv("a,b\n1,2\n\n\n")$row), 4) 96 | }) 97 | 98 | # melt_csv2 --------------------------------------------------------------- 99 | 100 | test_that("decimal mark automatically set to ,", { 101 | expect_message( 102 | x <- melt_csv2("x\n1,23"), 103 | if (default_locale()$decimal_mark == ".") "decimal .*grouping .*mark" else NA 104 | ) 105 | expect_equal(x$data_type[2], "double") 106 | }) 107 | 108 | # Zero rows --------------------------------------------------------------- 109 | 110 | test_that("n_max 0 gives zero row data frame", { 111 | x <- melt_csv("a,b\n1,2", n_max = 0) 112 | expect_equal(dim(x), c(0, 4)) 113 | }) 114 | 115 | # Comments ---------------------------------------------------------------- 116 | 117 | test_that("comments are ignored regardless of where they appear", { 118 | out1 <- melt_csv("x\n1#comment", comment = "#") 119 | out2 <- melt_csv("x\n1#comment\n#comment", comment = "#") 120 | out3 <- melt_csv('x\n"1"#comment', comment = "#") 121 | 122 | chk1 <- tibble::tibble( 123 | row = c(1, 2), 124 | col = c(1, 1), 125 | data_type = c("character", "integer"), 126 | value = c("x", "1") 127 | ) 128 | 129 | expect_true(all.equal(chk1, out1)) 130 | expect_true(all.equal(chk1, out2)) 131 | expect_true(all.equal(chk1, out3)) 132 | 133 | out5 <- melt_csv("x1,x2,x3\nA2,B2,C2\nA3#,B2,C2\nA4,A5,A6", comment = "#") 134 | out6 <- melt_csv("x1,x2,x3\nA2,B2,C2\nA3,#B2,C2\nA4,A5,A6", comment = "#") 135 | out7 <- melt_csv("x1,x2,x3\nA2,B2,C2\nA3,#B2,C2\n#comment\nA4,A5,A6", comment = "#") 136 | 137 | chk2 <- tibble::tibble( 138 | row = c(1, 1, 1, 2, 2, 2, 3, 4, 4, 4), 139 | col = c(1, 2, 3, 1, 2, 3, 1, 1, 2, 3), 140 | data_type = "character", 141 | value = c("x1", "x2", "x3", "A2", "B2", "C2", "A3", "A4", "A5", "A6") 142 | ) 143 | 144 | expect_true(all.equal(chk2, out5)) 145 | expect_true(all.equal(chk2, out6)) 146 | expect_true(all.equal(chk2, out7)) 147 | }) 148 | 149 | test_that("escaped/quoted comments are ignored", { 150 | out1 <- melt_delim("x\n\\#", 151 | comment = "#", delim = ",", 152 | escape_backslash = TRUE, escape_double = FALSE 153 | ) 154 | out2 <- melt_csv('x\n"#"', comment = "#") 155 | 156 | expect_equal(out1$value[2], "#") 157 | expect_equal(out2$value[2], "#") 158 | }) 159 | 160 | test_that("leading comments are ignored", { 161 | out <- melt_csv("#a\n#b\nx\n1", comment = "#") 162 | 163 | expect_equal(nrow(out), 2) 164 | expect_equal(out$value[2], "1") 165 | }) 166 | 167 | test_that("skip respects comments", { 168 | melt_x <- function(...) 
{ 169 | melt_csv("#a\nb\nc", ...)$value 170 | } 171 | 172 | expect_equal(melt_x(), c("#a", "b", "c")) 173 | expect_equal(melt_x(skip = 1), c("b", "c")) 174 | expect_equal(melt_x(comment = "#"), c("b", "c")) 175 | expect_equal(melt_x(comment = "#", skip = 2), c("c")) 176 | }) 177 | 178 | test_that("melt_csv returns a four-col zero-row data.frame on an empty file", { 179 | expect_equal(dim(melt_csv("empty-file")), c(0, 4)) 180 | }) 181 | 182 | test_that("melt_delim errors on length 0 delimiter", { 183 | expect_error( 184 | melt_delim("a b\n1 2\n", delim = ""), 185 | "`delim` must be at least one character, use `melt_table\\(\\)` for whitespace delimited input\\." 186 | ) 187 | }) 188 | 189 | test_that("melt_csv handles whitespace between delimiters and quoted fields", { 190 | x <- melt_csv('1, \"hi,there\"\n3,4') 191 | expect_equal(x$value[2:3], c("hi,there", "3")) 192 | }) 193 | 194 | test_that("melt_csv works with raw inputs", { 195 | x <- melt_csv(as.raw(charToRaw("a,b\n1,2"))) 196 | expect_equal(x, 197 | tibble::tibble( 198 | row = c(1, 1, 2, 2), 199 | col = c(1, 2, 1, 2), 200 | data_type = c("character", "character", "integer", "integer"), 201 | value = c("a", "b", "1", "2") 202 | ) 203 | ) 204 | }) 205 | 206 | 207 | test_that("melt_csv works with dates and datetimes", { 208 | x <- melt_csv('a\n2020-01-01,2021-01-01 10:01:00') 209 | expect_equal(x, 210 | tibble::tibble( 211 | row = c(1, 2, 2), 212 | col = c(1, 1, 2), 213 | data_type = c("character", "date", "datetime"), 214 | value = c("a", "2020-01-01", "2021-01-01 10:01:00") 215 | ) 216 | ) 217 | }) 218 | -------------------------------------------------------------------------------- /tests/testthat/test-melt-fwf.R: -------------------------------------------------------------------------------- 1 | test_that("trailing spaces omitted", { 2 | withr::local_options(lifecycle_verbosity = "quiet") 3 | spec <- fwf_empty("fwf-trailing.txt") 4 | expect_equal(spec$begin, c(0, 4)) 5 | expect_equal(spec$end, c(3, NA)) 6 | 7 | df <- melt_fwf("fwf-trailing.txt", spec) 8 | expect_true(all(df$value == "123")) 9 | }) 10 | 11 | test_that("respects the trim_ws argument", { 12 | withr::local_options(lifecycle_verbosity = "quiet") 13 | x <- "a11 b22 c33\nd e f " 14 | out1 <- melt_fwf(x, fwf_empty(I(x)), trim_ws = FALSE) 15 | expect_equal(out1$value, c("a11", "b22", "c33", "d ", "e ", "f ")) 16 | 17 | out2 <- melt_fwf(x, fwf_empty(I(x)), trim_ws = TRUE) 18 | expect_equal(out2$value, c("a11", "b22", "c33", "d", "e", "f")) 19 | }) 20 | 21 | test_that("respects the trim_ws argument with empty fields", { 22 | withr::local_options(lifecycle_verbosity = "quiet") 23 | x <- "a11 b22 c33\nd f " 24 | out1 <- melt_fwf(x, fwf_empty(I(x)), trim_ws = FALSE) 25 | expect_equal(out1$value, c("a11", "b22", "c33", "d ", " ", "f ")) 26 | 27 | out2 <- melt_fwf(x, fwf_empty(I(x)), trim_ws = TRUE, na = "NA") 28 | expect_equal(out2$value, c("a11", "b22", "c33", "d", "", "f")) 29 | }) 30 | 31 | test_that("fwf_empty can skip comments", { 32 | withr::local_options(lifecycle_verbosity = "quiet") 33 | x <- "COMMENT\n1 2 3\n4 5 6" 34 | 35 | out1 <- melt_fwf(x, fwf_empty(I(x), comment = "COMMENT"), comment = "COMMENT") 36 | expect_equal(dim(out1), c(6, 4)) 37 | }) 38 | 39 | test_that("missing lines are not skipped", { 40 | withr::local_options(lifecycle_verbosity = "quiet") 41 | # first 42 | x <- "a b\n\n\n1 2" 43 | expect_equal(max(melt_fwf(x, fwf_empty(I(x)))$row), 4) 44 | 45 | # middle 46 | x <- "a b\n1 2\n\n\n2 3" 47 | expect_equal(max(melt_fwf(x, fwf_empty(I(x)))$row), 5) 48 |
49 | # last (trailing \n is ignored) 50 | x <- "a b\n1 2\n\n\n" 51 | expect_equal(max(melt_fwf(x, fwf_empty(I(x)))$row), 4) 52 | }) 53 | 54 | test_that("passing \"\" to melt_fwf's 'na' option", { 55 | withr::local_options(lifecycle_verbosity = "quiet") 56 | expect_equal( 57 | melt_fwf("foobar\nfoo ", fwf_widths(c(3, 3)), na = "")$value, 58 | c("foo", "bar", "foo", NA) 59 | ) 60 | }) 61 | 62 | test_that("ragged last column expanded with NA", { 63 | withr::local_options(lifecycle_verbosity = "quiet") 64 | x <- melt_fwf("1a\n2ab\n3abc", fwf_widths(c(1, NA))) 65 | expect_equal(x$value[c(2, 4, 6)], c("a", "ab", "abc")) 66 | expect_equal(n_problems(x), 0) 67 | }) 68 | 69 | test_that("ragged last column shrunk with warning", { 70 | withr::local_options(lifecycle_verbosity = "quiet") 71 | expect_warning(x <- melt_fwf("1a\n2ab\n3abc", fwf_widths(c(1, 3)))) 72 | expect_equal(x$value[c(2, 4, 6)], c("a", "ab", "abc")) 73 | expect_equal(n_problems(x), 2) 74 | }) 75 | 76 | test_that("melt all columns with positions, non ragged", { 77 | withr::local_options(lifecycle_verbosity = "quiet") 78 | col_pos <- fwf_positions(c(1, 3, 6), c(2, 5, 6)) 79 | x <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos) 80 | expect_equal(x$value[c(3, 6, 9)], c("A", "B", "C")) 81 | expect_equal(n_problems(x), 0) 82 | }) 83 | 84 | test_that("melt subset columns with positions", { 85 | withr::local_options(lifecycle_verbosity = "quiet") 86 | col_pos <- fwf_positions(c(1, 3), c(2, 5)) 87 | x <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos) 88 | expect_equal(x$value[c(1, 3, 5)], as.character(c(12, 67, 54))) 89 | expect_equal(x$value[c(2, 4, 6)], as.character(c(345, 890, 321))) 90 | expect_equal(n_problems(x), 0) 91 | }) 92 | 93 | test_that("melt columns with positions, ragged", { 94 | withr::local_options(lifecycle_verbosity = "quiet") 95 | col_pos <- fwf_positions(c(1, 3, 6), c(2, 5, NA)) 96 | x <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos) 97 | expect_equal(x$value[c(1, 4, 7)], as.character(c(12, 67, 54))) 98 | expect_equal(x$value[c(2, 5, 8)], as.character(c(345, 890, 321))) 99 | expect_equal(x$value[c(3, 6, 9)], c("A", "BBBBBBBBB", "C")) 100 | expect_equal(n_problems(x), 0) 101 | }) 102 | 103 | test_that("melt columns with width, ragged", { 104 | withr::local_options(lifecycle_verbosity = "quiet") 105 | col_pos <- fwf_widths(c(2, 3, NA)) 106 | x <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos) 107 | expect_equal(x$value[c(1, 4, 7)], as.character(c(12, 67, 54))) 108 | expect_equal(x$value[c(2, 5, 8)], as.character(c(345, 890, 321))) 109 | expect_equal(x$value[c(3, 6, 9)], c("A", "BBBBBBBBB", "C")) 110 | expect_equal(n_problems(x), 0) 111 | }) 112 | 113 | test_that("melt_fwf returns an empty data.frame on an empty file", { 114 | withr::local_options(lifecycle_verbosity = "quiet") 115 | empty_df <- tibble::tibble( 116 | row = double(), col = double(), 117 | data_type = character(), value = character() 118 | ) 119 | expect_true(all.equal(melt_fwf("empty-file"), empty_df)) 120 | }) 121 | 122 | test_that("check for line breaks in between widths", { 123 | withr::local_options(lifecycle_verbosity = "quiet") 124 | txt1 <- paste( 125 | "1 1", 126 | "2", 127 | "1 1 ", 128 | sep = "\n" 129 | ) 130 | expect_warning(out1 <- melt_fwf(txt1, fwf_empty(I(txt1)))) 131 | expect_equal(n_problems(out1), 1) 132 | 133 | txt2 <- paste( 134 | " 1 1", 135 | " 2", 136 | " 1 1 ", 137 | sep = "\n" 138 | ) 139 | expect_warning(out2 <- melt_fwf(txt2, fwf_empty(I(txt2)))) 
140 | expect_equal(n_problems(out2), 1) 141 | 142 | exp <- tibble::tibble( 143 | row = c(1, 1, 2, 3, 3), 144 | col = c(1, 2, 1, 1, 2), 145 | data_type = "integer", 146 | value = as.character(c(1, 1, 2, 1, 1)) 147 | ) 148 | expect_true(all.equal(out1, exp, check.attributes = FALSE)) 149 | expect_true(all.equal(out2, exp, check.attributes = FALSE)) 150 | }) 151 | 152 | test_that("ignore commented lines anywhere in file", { 153 | withr::local_options(lifecycle_verbosity = "quiet") 154 | col_pos <- fwf_positions(c(1, 3, 6), c(2, 5, 6)) 155 | x1 <- melt_fwf("COMMENT\n12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos, comment = "COMMENT") 156 | x2 <- melt_fwf("12345A\n67890BBBBBBBBB\nCOMMENT\n54321C", col_positions = col_pos, comment = "COMMENT") 157 | x3 <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C\nCOMMENT", col_positions = col_pos, comment = "COMMENT") 158 | x4 <- melt_fwf("COMMENT\n12345A\nCOMMENT\n67890BBBBBBBBB\n54321C\nCOMMENT", col_positions = col_pos, comment = "COMMENT") 159 | 160 | expect_identical(x1, x2) 161 | expect_identical(x1, x3) 162 | expect_identical(x1, x4) 163 | 164 | expect_equal(x1$value[c(3, 6, 9)], c("A", "B", "C")) 165 | expect_equal(n_problems(x1), 0) 166 | }) 167 | 168 | test_that("error on empty spec", { 169 | withr::local_options(lifecycle_verbosity = "quiet") 170 | txt <- "foo\n" 171 | pos <- fwf_positions(start = numeric(0), end = numeric(0)) 172 | expect_error(melt_fwf(txt, pos), "Zero-length.*specifications not supported") 173 | }) 174 | -------------------------------------------------------------------------------- /tests/testthat/test-melt-table.R: -------------------------------------------------------------------------------- 1 | # melt_table ------------------------------------------------------------------- 2 | 3 | test_that("melt_table silently reads ragged last column", { 4 | x <- melt_table("foo bar\n1 2\n3 4\n5 6\n") 5 | expect_equal(x$value[-1:-2], as.character(1:6)) 6 | }) 7 | 8 | test_that("melt_table skips all comment lines", { 9 | x <- melt_table("foo bar\n1 2\n3 4\n5 6\n") 10 | 11 | y <- melt_table("#comment1\n#comment2\nfoo bar\n1 2\n3 4\n5 6\n", comment = "#") 12 | 13 | expect_equal(x, y) 14 | }) 15 | 16 | test_that("missing lines are not skipped", { 17 | # first 18 | expect_equal(max(melt_table("a b\n\n\n12 34")$row), 4) 19 | 20 | # middle 21 | expect_equal(max(melt_table("a b\n12 34\n\n\n23 45")$row), 5) 22 | 23 | # last (trailing \n is ignored) 24 | expect_equal(max(melt_table("a b\n12 34\n\n\n")$row), 4) 25 | }) 26 | 27 | test_that("melt_table can read from a pipe", { 28 | x <- melt_table(pipe("echo a b c && echo 1 2 3 && echo 4 5 6")) 29 | expect_equal(x$value[-1:-3], as.character(1:6)) 30 | }) 31 | 32 | test_that("melt_table can read a truncated file without crashing", { 33 | expect_warning(expect_error(melt_table("table-crash"), NA)) 34 | }) 35 | 36 | test_that("melt_table returns an empty data.frame on an empty file", { 37 | empty_df <- tibble::tibble( 38 | row = double(), col = double(), 39 | data_type = character(), value = character() 40 | ) 41 | expect_true(all.equal(melt_table("empty-file"), empty_df)) 42 | }) 43 | 44 | # melt_table2 ------------------------------------------------------------------- 45 | 46 | test_that("melt_table2 silently reads ragged columns", { 47 | x <- melt_table2("foo bar\n1 2\n3 4\n5 6\n") 48 | expect_equal(x$value[-1:-2], as.character(1:6)) 49 | }) 50 | 51 | test_that("melt_table2 skips all comment lines", { 52 | x <- melt_table2("foo bar\n1 2\n3 4\n5 6\n") 53 | 54 | y <- 
melt_table2("#comment1\n#comment2\nfoo bar\n1 2\n3 4\n5 6\n", comment = "#") 55 | 56 | expect_equal(x, y) 57 | }) 58 | 59 | test_that("melt_table2 can read from a pipe", { 60 | x <- melt_table2(pipe("echo a b c&& echo 1 2 3&& echo 4 5 6")) 61 | expect_equal(x$value[-1:-3], as.character(1:6)) 62 | }) 63 | 64 | test_that("melt_table2 does not duplicate header rows for leading whitespace", { 65 | x <- melt_table2("foo bar\n1 2\n") 66 | expect_equal(nrow(x), 4L) 67 | expect_equal(x$value[-1:-2], as.character(1:2)) 68 | }) 69 | 70 | test_that("melt_table2 ignores blank lines at the end of a file", { 71 | expect_warning(x <- melt_table2("x y\n1 2\n\n"), NA) 72 | expect_equal(nrow(x), 5L) 73 | expect_equal(x$value[3:4], as.character(1:2)) 74 | }) 75 | 76 | test_that("melt_table2 returns an empty data.frame on an empty file", { 77 | empty_df <- tibble::tibble( 78 | row = double(), col = double(), 79 | data_type = character(), value = character() 80 | ) 81 | expect_true(all.equal(melt_table2("empty-file"), empty_df)) 82 | }) 83 | --------------------------------------------------------------------------------