├── .Rbuildignore ├── .covrignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ ├── pr-commands.yaml │ └── test-coverage.yaml ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── callback.R ├── cpp11.R ├── date-symbols.R ├── example.R ├── locale.R ├── melt_delim.R ├── melt_delim_chunked.R ├── melt_fwf.R ├── melt_table.R ├── meltr-package.R ├── problems.R ├── source.R ├── sysdata.rda ├── tokenizer.R └── utils.R ├── README.Rmd ├── README.md ├── codecov.yml ├── cran-comments.md ├── data-raw └── date-symbols.R ├── inst └── extdata │ ├── epa78.txt │ ├── fwf-sample.txt │ ├── massey-rating.txt │ └── mtcars.csv ├── man ├── Tokenizers.Rd ├── callback.Rd ├── clipboard.Rd ├── datasource.Rd ├── date_names.Rd ├── locale.Rd ├── melt_delim.Rd ├── melt_delim_chunked.Rd ├── melt_fwf.Rd ├── melt_table.Rd ├── meltr_example.Rd ├── problems.Rd └── show_progress.Rd ├── src ├── .gitignore ├── Collector.cpp ├── Collector.h ├── CollectorGuess.cpp ├── DateTimeParser.h ├── Iconv.cpp ├── Iconv.h ├── LocaleInfo.cpp ├── LocaleInfo.h ├── Progress.h ├── QiParsers.h ├── Reader.cpp ├── Reader.h ├── Source.cpp ├── Source.h ├── SourceFile.h ├── SourceRaw.h ├── SourceString.h ├── Token.h ├── Tokenizer.cpp ├── Tokenizer.h ├── TokenizerDelim.cpp ├── TokenizerDelim.h ├── TokenizerFwf.cpp ├── TokenizerFwf.h ├── TokenizerWs.cpp ├── TokenizerWs.h ├── Warnings.h ├── connection.cpp ├── cpp11.cpp ├── mio.h ├── read.cpp ├── unicode_fopen.h └── utils.h └── tests ├── testthat.R └── testthat ├── basic-df-singlequote.csv ├── basic-df.csv ├── empty-file ├── enc-iso-8859-1.txt ├── fwf-trailing.txt ├── helper.R ├── non-tabular.csv ├── raw.csv ├── table-crash ├── test-melt-chunked.R ├── test-melt-csv.R ├── test-melt-fwf.R └── test-melt-table.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^LICENSE\.md$ 2 | ^data-raw$ 3 | ^codecov\.yml$ 4 | ^\.github$ 5 | ^README\.Rmd$ 6 | ^\.covrignore$ 7 | ^cran-comments\.md$ 8 | ^CRAN-SUBMISSION$ 9 | -------------------------------------------------------------------------------- /.covrignore: -------------------------------------------------------------------------------- 1 | src/mio.h 2 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | # 4 | # NOTE: This workflow is overkill for most R packages and 5 | # check-standard.yaml is likely a better choice. 6 | # usethis::use_github_action("check-standard") will install it. 
7 | on: 8 | push: 9 | branches: [main, master] 10 | pull_request: 11 | branches: [main, master] 12 | 13 | name: R-CMD-check 14 | 15 | jobs: 16 | R-CMD-check: 17 | runs-on: ${{ matrix.config.os }} 18 | 19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | config: 25 | - {os: macOS-latest, r: 'release'} 26 | 27 | - {os: windows-latest, r: 'release'} 28 | # Use 3.6 to trigger usage of RTools35 29 | - {os: windows-latest, r: '3.6'} 30 | 31 | # Use older ubuntu to maximise backward compatibility 32 | - {os: ubuntu-18.04, r: 'devel', http-user-agent: 'release'} 33 | - {os: ubuntu-18.04, r: 'release'} 34 | - {os: ubuntu-18.04, r: 'oldrel-1'} 35 | - {os: ubuntu-18.04, r: 'oldrel-2'} 36 | - {os: ubuntu-18.04, r: 'oldrel-3'} 37 | - {os: ubuntu-18.04, r: 'oldrel-4'} 38 | 39 | env: 40 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 41 | R_KEEP_PKG_SOURCE: yes 42 | 43 | steps: 44 | - uses: actions/checkout@v3 45 | 46 | - uses: r-lib/actions/setup-pandoc@v2 47 | 48 | - uses: r-lib/actions/setup-r@v2 49 | with: 50 | r-version: ${{ matrix.config.r }} 51 | http-user-agent: ${{ matrix.config.http-user-agent }} 52 | use-public-rspm: true 53 | 54 | - uses: r-lib/actions/setup-r-dependencies@v2 55 | with: 56 | extra-packages: rcmdcheck 57 | 58 | - uses: r-lib/actions/check-r-package@v2 59 | 60 | - name: Show testthat output 61 | if: always() 62 | run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true 63 | shell: bash 64 | 65 | - name: Upload check results 66 | if: failure() 67 | uses: actions/upload-artifact@main 68 | with: 69 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 70 | path: check 71 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | tags: ['*'] 7 | 8 | name: pkgdown 9 | 10 | jobs: 11 | pkgdown: 12 | runs-on: ubuntu-latest 13 | env: 14 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | - uses: r-lib/actions/setup-pandoc@v1 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v1 25 | with: 26 | extra-packages: pkgdown 27 | needs: website 28 | 29 | - name: Deploy package 30 | run: | 31 | git config --local user.name "$GITHUB_ACTOR" 32 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 33 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' 34 | -------------------------------------------------------------------------------- /.github/workflows/pr-commands.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | issue_comment: 5 | types: [created] 6 | 7 | name: Commands 8 | 9 | jobs: 10 | document: 11 | if: startsWith(github.event.comment.body, '/document') 12 | name: document 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | steps: 17 | - uses: actions/checkout@v3 18 | 19 | - uses: r-lib/actions/pr-fetch@v2 20 | with: 21 | repo-token: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - uses: r-lib/actions/setup-r@v2 24 | with: 25 | use-public-rspm: true 26 | 27 | - uses: r-lib/actions/setup-r-dependencies@v2 28 | with: 29 | extra-packages: roxygen2 30 | 31 | - name: Document 32 | run: Rscript -e 'roxygen2::roxygenise()' 33 | 34 | - name: commit 35 | run: | 36 | git config --local user.name "$GITHUB_ACTOR" 37 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 38 | git add man/\* NAMESPACE 39 | git commit -m 'Document' 40 | 41 | - uses: r-lib/actions/pr-push@v2 42 | with: 43 | repo-token: ${{ secrets.GITHUB_TOKEN }} 44 | 45 | style: 46 | if: startsWith(github.event.comment.body, '/style') 47 | name: style 48 | runs-on: ubuntu-latest 49 | env: 50 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 51 | steps: 52 | - uses: actions/checkout@v3 53 | 54 | - uses: r-lib/actions/pr-fetch@v2 55 | with: 56 | repo-token: ${{ secrets.GITHUB_TOKEN }} 57 | 58 | - uses: r-lib/actions/setup-r@v2 59 | 60 | - name: Install dependencies 61 | run: Rscript -e 'install.packages("styler")' 62 | 63 | - name: Style 64 | run: Rscript -e 'styler::style_pkg()' 65 | 66 | - name: commit 67 | run: | 68 | git config --local user.name "$GITHUB_ACTOR" 69 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 70 | git add \*.R 71 | git commit -m 'Style' 72 | 73 | - uses: r-lib/actions/pr-push@v2 74 | with: 75 | repo-token: ${{ secrets.GITHUB_TOKEN }} 76 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: covr 27 | 28 | - name: Test coverage 29 | run: covr::codecov() 30 | shell: Rscript {0} 31 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: meltr 2 | Title: Read Non-Rectangular Text Data 3 | Version: 1.0.2 4 | Authors@R: 5 | c(person(given = "Hadley", 6 | family = "Wickham", 7 | role = "aut", 8 | email = "hadley@rstudio.com"), 9 | person(given = "Duncan", 10 | family = "Garmonsway", 11 | role = c("aut", "cre"), 12 | email = "nacnudus@gmail.com", 13 | comment = "@nacnudus"), 14 | person(given = "Jim", 15 | family = "Hester", 16 | role = "aut", 17 | email = "jim.hester@rstudio.com", 18 | comment = c(ORCID = "0000-0002-2739-7082")), 19 | person(given = "RStudio", 20 | role = c("cph", "fnd")), 21 | person(given = "https://github.com/mandreyel/", 22 | role = "cph", 23 | comment = "mio library")) 24 | Description: The goal of 'meltr' is to provide a fast and friendly way to 25 | read non-rectangular data, such as ragged forms of csv (comma-separated 26 | values), tsv (tab-separated values), and fwf (fixed-width format) files. 27 | License: MIT + file LICENSE 28 | URL: https://r-lib.github.io/meltr/, 29 | https://github.com/r-lib/meltr 30 | BugReports: https://github.com/r-lib/meltr/issues 31 | Depends: 32 | R (>= 2.10) 33 | Imports: 34 | cli, 35 | methods, 36 | R6, 37 | rlang, 38 | tibble 39 | Suggests: 40 | clipr, 41 | covr, 42 | crayon, 43 | curl, 44 | readr, 45 | testthat (>= 3.0.0), 46 | withr 47 | LinkingTo: 48 | cpp11 49 | Config/testthat/edition: 3 50 | Config/Needs/website: dplyr 51 | Encoding: UTF-8 52 | LazyData: true 53 | Roxygen: list(markdown = TRUE) 54 | RoxygenNote: 7.2.1 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2021 2 | COPYRIGHT HOLDER: meltr authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2021 meltr authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method("[",meltr_spec_tbl_df) 4 | S3method(as.data.frame,meltr_spec_tbl_df) 5 | S3method(as_tibble,meltr_spec_tbl_df) 6 | S3method(print,date_names) 7 | S3method(print,locale) 8 | export(AccumulateCallback) 9 | export(ChunkCallback) 10 | export(DataFrameCallback) 11 | export(ListCallback) 12 | export(SideEffectChunkCallback) 13 | export(clipboard) 14 | export(datasource) 15 | export(date_names) 16 | export(date_names_lang) 17 | export(date_names_langs) 18 | export(default_locale) 19 | export(fwf_cols) 20 | export(fwf_empty) 21 | export(fwf_positions) 22 | export(fwf_widths) 23 | export(locale) 24 | export(melt_csv) 25 | export(melt_csv2) 26 | export(melt_csv2_chunked) 27 | export(melt_csv_chunked) 28 | export(melt_delim) 29 | export(melt_delim_chunked) 30 | export(melt_fwf) 31 | export(melt_table) 32 | export(melt_table2) 33 | export(melt_tsv) 34 | export(melt_tsv_chunked) 35 | export(meltr_example) 36 | export(problems) 37 | export(show_progress) 38 | export(stop_for_problems) 39 | export(tokenizer_csv) 40 | export(tokenizer_delim) 41 | export(tokenizer_fwf) 42 | export(tokenizer_line) 43 | export(tokenizer_log) 44 | export(tokenizer_tsv) 45 | export(tokenizer_ws) 46 | importFrom(methods,setOldClass) 47 | importFrom(tibble,as_tibble) 48 | importFrom(tibble,tibble) 49 | useDynLib(meltr, .registration = TRUE) 50 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # meltr 1.0.2 2 | 3 | * Fix CRAN warnings 4 | 5 | # meltr 1.0.1 6 | 7 | * Fix buffer overflow when trying to parse a field that is over 64 characters long (#10) 8 | 9 | # meltr 1.0.0 10 | 11 | * Added a `NEWS.md` file to track changes to the package. 12 | -------------------------------------------------------------------------------- /R/callback.R: -------------------------------------------------------------------------------- 1 | as_chunk_callback <- function(x) UseMethod("as_chunk_callback") 2 | as_chunk_callback.function <- function(x) { 3 | SideEffectChunkCallback$new(x) 4 | } 5 | as_chunk_callback.R6ClassGenerator <- function(x) { 6 | as_chunk_callback(x$new()) 7 | } 8 | as_chunk_callback.ChunkCallback <- function(x) { 9 | x 10 | } 11 | 12 | #' Callback classes 13 | #' 14 | #' These classes are used to define callback behaviors. 15 | #' 16 | #' \describe{ 17 | #' \item{ChunkCallback}{Callback interface definition, all callback functions should inherit from this class.} 18 | #' \item{SideEffectChunkCallback}{Callback function that is used only for side effects, no results are returned.} 19 | #' \item{DataFrameCallback}{Callback function that combines each result together at the end.} 20 | #' \item{AccumulateCallBack}{ 21 | #' Callback function that accumulates a single result. 
Requires the parameter `acc` to specify 22 | #' the initial value of the accumulator. The parameter `acc` is `NULL` by default. 23 | #' } 24 | #' } 25 | #' @usage NULL 26 | #' @format NULL 27 | #' @name callback 28 | #' @keywords internal 29 | #' @family chunked 30 | #' @export 31 | ChunkCallback <- R6::R6Class("ChunkCallback", 32 | private = list( 33 | callback = NULL 34 | ), 35 | public = list( 36 | initialize = function(callback) NULL, 37 | receive = function(data, index) NULL, 38 | continue = function() TRUE, 39 | result = function() NULL, 40 | finally = function() NULL 41 | ) 42 | ) 43 | 44 | #' @usage NULL 45 | #' @format NULL 46 | #' @rdname callback 47 | #' @export 48 | SideEffectChunkCallback <- R6::R6Class("SideEffectChunkCallback", 49 | inherit = ChunkCallback, 50 | private = list( 51 | cancel = FALSE 52 | ), 53 | public = list( 54 | initialize = function(callback) { 55 | check_callback_fun(callback) 56 | private$callback <- callback 57 | }, 58 | receive = function(data, index) { 59 | result <- private$callback(data, index) 60 | private$cancel <- identical(result, FALSE) 61 | }, 62 | continue = function() { 63 | !private$cancel 64 | } 65 | ) 66 | ) 67 | 68 | #' @usage NULL 69 | #' @format NULL 70 | #' @rdname callback 71 | #' @export 72 | DataFrameCallback <- R6::R6Class("DataFrameCallback", 73 | inherit = ChunkCallback, 74 | private = list( 75 | results = list() 76 | ), 77 | public = list( 78 | initialize = function(callback) { 79 | private$callback <- callback 80 | }, 81 | receive = function(data, index) { 82 | result <- private$callback(data, index) 83 | private$results <- c(private$results, list(result)) 84 | }, 85 | result = function() { 86 | do.call(`rbind`, private$results) 87 | }, 88 | finally = function() { 89 | private$results <- list() 90 | } 91 | ) 92 | ) 93 | 94 | #' @usage NULL 95 | #' @format NULL 96 | #' @rdname callback 97 | #' @export 98 | ListCallback <- R6::R6Class("ListCallback", 99 | inherit = ChunkCallback, 100 | private = list( 101 | results = list() 102 | ), 103 | public = list( 104 | initialize = function(callback) { 105 | private$callback <- callback 106 | }, 107 | receive = function(data, index) { 108 | result <- private$callback(data, index) 109 | private$results <- c(private$results, list(result)) 110 | }, 111 | result = function() { 112 | private$results 113 | }, 114 | finally = function() { 115 | private$results <- list() 116 | } 117 | ) 118 | ) 119 | 120 | #' @usage NULL 121 | #' @format NULL 122 | #' @rdname callback 123 | #' @export 124 | AccumulateCallback <- R6::R6Class("AccumulateCallback", 125 | inherit = ChunkCallback, 126 | private = list( 127 | acc = NULL 128 | ), 129 | public = list( 130 | initialize = function(callback, acc = NULL) { 131 | check_callback_fun(callback, 132 | req_args = 3, 133 | message = "`callback` must have three or more arguments" 134 | ) 135 | private$acc <- acc 136 | private$callback <- callback 137 | }, 138 | receive = function(data, index) { 139 | private$acc <- private$callback(data, index, private$acc) 140 | }, 141 | result = function() { 142 | private$acc 143 | } 144 | ) 145 | ) 146 | 147 | check_callback_fun <- function(callback, req_args = 2, message = NULL) { 148 | if (is.null(message)) { 149 | message <- "`callback` must have two or more arguments" 150 | } 151 | n_args <- length(formals(callback)) 152 | if (n_args < req_args) { 153 | stop(message, call. 
= FALSE) 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /R/cpp11.R: -------------------------------------------------------------------------------- 1 | # Generated by cpp11: do not edit by hand 2 | 3 | collectorGuess <- function(input, locale_, guessInteger) { 4 | .Call(`_meltr_collectorGuess`, input, locale_, guessInteger) 5 | } 6 | 7 | read_connection_ <- function(con, filename, chunk_size) { 8 | .Call(`_meltr_read_connection_`, con, filename, chunk_size) 9 | } 10 | 11 | read_file_ <- function(sourceSpec, locale_) { 12 | .Call(`_meltr_read_file_`, sourceSpec, locale_) 13 | } 14 | 15 | read_file_raw_ <- function(sourceSpec) { 16 | .Call(`_meltr_read_file_raw_`, sourceSpec) 17 | } 18 | 19 | melt_tokens_ <- function(sourceSpec, tokenizerSpec, colSpecs, locale_, n_max, progress) { 20 | .Call(`_meltr_melt_tokens_`, sourceSpec, tokenizerSpec, colSpecs, locale_, n_max, progress) 21 | } 22 | 23 | melt_tokens_chunked_ <- function(sourceSpec, callback, chunkSize, tokenizerSpec, colSpecs, locale_, progress) { 24 | invisible(.Call(`_meltr_melt_tokens_chunked_`, sourceSpec, callback, chunkSize, tokenizerSpec, colSpecs, locale_, progress)) 25 | } 26 | 27 | whitespaceColumns <- function(sourceSpec, n, comment) { 28 | .Call(`_meltr_whitespaceColumns`, sourceSpec, n, comment) 29 | } 30 | -------------------------------------------------------------------------------- /R/date-symbols.R: -------------------------------------------------------------------------------- 1 | #' Create or retrieve date names 2 | #' 3 | #' When parsing dates, you often need to know how weekdays of the week and 4 | #' months are represented as text. This pair of functions allows you to either 5 | #' create your own, or retrieve from a standard list. The standard list is 6 | #' derived from ICU () via the stringi package. 7 | #' 8 | #' @param mon,mon_ab Full and abbreviated month names. 9 | #' @param day,day_ab Full and abbreviated week day names. Starts with Sunday. 10 | #' @param am_pm Names used for AM and PM. 11 | #' @return A date names object 12 | #' @export 13 | #' @examples 14 | #' date_names(mon = LETTERS[1:12], day = letters[1:7]) 15 | #' date_names_lang("en") 16 | #' date_names_lang("ko") 17 | #' date_names_lang("fr") 18 | date_names <- function(mon, mon_ab = mon, day, day_ab = day, 19 | am_pm = c("AM", "PM")) { 20 | stopifnot(is.character(mon), length(mon) == 12) 21 | stopifnot(is.character(mon_ab), length(mon_ab) == 12) 22 | stopifnot(is.character(day), length(day) == 7) 23 | stopifnot(is.character(day_ab), length(day_ab) == 7) 24 | 25 | structure( 26 | list( 27 | mon = enc2utf8(mon), 28 | mon_ab = enc2utf8(mon_ab), 29 | day = enc2utf8(day), 30 | day_ab = enc2utf8(day_ab), 31 | am_pm = enc2utf8(am_pm) 32 | ), 33 | class = "date_names" 34 | ) 35 | } 36 | 37 | #' @export 38 | #' @rdname date_names 39 | #' @param language A BCP 47 locale, made up of a language and a region, 40 | #' e.g. `"en_US"` for American English. See `date_names_langs()` 41 | #' for a complete list of available locales. 42 | date_names_lang <- function(language) { 43 | stopifnot(is.character(language), length(language) == 1) 44 | 45 | symbols <- date_symbols[[language]] 46 | if (is.null(symbols)) { 47 | stop("Unknown language '", language, "'", call. = FALSE) 48 | } 49 | 50 | symbols 51 | } 52 | 53 | #' @export 54 | #' @rdname date_names 55 | date_names_langs <- function() { 56 | names(date_symbols) 57 | } 58 | 59 | #' @export 60 | print.date_names <- function(x, ...) 
{ 61 | cat("\n") 62 | 63 | if (identical(x$day, x$day_ab)) { 64 | day <- paste0(x$day, collapse = ", ") 65 | } else { 66 | day <- paste0(x$day, " (", x$day_ab, ")", collapse = ", ") 67 | } 68 | 69 | if (identical(x$mon, x$mon_ab)) { 70 | mon <- paste0(x$mon, collapse = ", ") 71 | } else { 72 | mon <- paste0(x$mon, " (", x$mon_ab, ")", collapse = ", ") 73 | } 74 | am_pm <- paste0(x$am_pm, collapse = "/") 75 | 76 | cat_wrap("Days: ", day) 77 | cat_wrap("Months: ", mon) 78 | cat_wrap("AM/PM: ", am_pm) 79 | } 80 | 81 | is.date_names <- function(x) inherits(x, "date_names") 82 | 83 | cat_wrap <- function(header, body) { 84 | body <- strwrap(body, exdent = nchar(header)) 85 | cat(header, paste(body, collapse = "\n"), "\n", sep = "") 86 | } 87 | -------------------------------------------------------------------------------- /R/example.R: -------------------------------------------------------------------------------- 1 | #' Get path to meltr example 2 | #' 3 | #' meltr comes bundled with a number of sample files in its `inst/extdata` 4 | #' directory. This function make them easy to access 5 | #' 6 | #' @param file Name of file. If `NULL`, the example files will be listed. 7 | #' @return A file path or a vector of file names 8 | #' @export 9 | #' @examples 10 | #' meltr_example() 11 | #' meltr_example("mtcars.csv") 12 | meltr_example <- function(file = NULL) { 13 | if (is.null(file)) { 14 | dir(system.file("extdata", package = "meltr")) 15 | } else { 16 | system.file("extdata", file, package = "meltr", mustWork = TRUE) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /R/locale.R: -------------------------------------------------------------------------------- 1 | #' Create locales 2 | #' 3 | #' A locale object tries to capture all the defaults that can vary between 4 | #' countries. You set the locale in once, and the details are automatically 5 | #' passed on down to the columns parsers. The defaults have been chosen to 6 | #' match R (i.e. US English) as closely as possible. See 7 | #' `vignette("locales")` for more details. 8 | #' 9 | #' @param date_names Character representations of day and month names. Either 10 | #' the language code as string (passed on to [date_names_lang()]) 11 | #' or an object created by [date_names()]. 12 | #' @param date_format,time_format Default date and time formats. 13 | #' @param decimal_mark,grouping_mark Symbols used to indicate the decimal 14 | #' place, and to chunk larger numbers. Decimal mark can only be `,` or 15 | #' `.`. 16 | #' @param tz Default tz. This is used both for input (if the time zone isn't 17 | #' present in individual strings), and for output (to control the default 18 | #' display). The default is to use "UTC", a time zone that does not use 19 | #' daylight savings time (DST) and hence is typically most useful for data. 20 | #' The absence of time zones makes it approximately 50x faster to generate 21 | #' UTC times than any other time zone. 22 | #' 23 | #' Use `""` to use the system default time zone, but beware that this 24 | #' will not be reproducible across systems. 25 | #' 26 | #' For a complete list of possible time zones, see [OlsonNames()]. 27 | #' Americans, note that "EST" is a Canadian time zone that does not have 28 | #' DST. It is *not* Eastern Standard Time. It's better to use 29 | #' "US/Eastern", "US/Central" etc. 30 | #' @param encoding Default encoding. This only affects how the file is 31 | #' read - meltr always converts the output to UTF-8. 
32 | #' @return A locale object 33 | #' @export 34 | #' @examples 35 | #' locale() 36 | #' locale("fr") 37 | #' 38 | #' # South American locale 39 | #' locale("es", decimal_mark = ",") 40 | locale <- function(date_names = "en", 41 | date_format = "%AD", time_format = "%AT", 42 | decimal_mark = ".", grouping_mark = ",", 43 | tz = "UTC", encoding = "UTF-8") { 44 | if (is.character(date_names)) { 45 | date_names <- date_names_lang(date_names) 46 | } 47 | stopifnot(is.date_names(date_names)) 48 | 49 | if (missing(grouping_mark) && !missing(decimal_mark)) { 50 | grouping_mark <- if (decimal_mark == ".") "," else "." 51 | } else if (missing(decimal_mark) && !missing(grouping_mark)) { 52 | decimal_mark <- if (grouping_mark == ".") "," else "." 53 | } 54 | 55 | stopifnot(decimal_mark %in% c(".", ",")) 56 | stopifnot(is.character(grouping_mark), length(grouping_mark) == 1) 57 | if (decimal_mark == grouping_mark) { 58 | stop("`decimal_mark` and `grouping_mark` must be different", call. = FALSE) 59 | } 60 | 61 | tz <- check_tz(tz) 62 | check_encoding(encoding) 63 | 64 | structure( 65 | list( 66 | date_names = date_names, 67 | date_format = date_format, 68 | time_format = time_format, 69 | decimal_mark = decimal_mark, 70 | grouping_mark = grouping_mark, 71 | tz = tz, 72 | encoding = encoding 73 | ), 74 | class = "locale" 75 | ) 76 | } 77 | 78 | is.locale <- function(x) inherits(x, "locale") 79 | 80 | #' @export 81 | print.locale <- function(x, ...) { 82 | cat("\n") 83 | cat("Numbers: ", prettyNum(123456.78, 84 | big.mark = x$grouping_mark, 85 | decimal.mark = x$decimal_mark, digits = 8 86 | ), "\n", sep = "") 87 | cat("Formats: ", x$date_format, " / ", x$time_format, "\n", sep = "") 88 | cat("Timezone: ", x$tz, "\n", sep = "") 89 | cat("Encoding: ", x$encoding, "\n", sep = "") 90 | print(x$date_names) 91 | } 92 | 93 | #' @export 94 | #' @rdname locale 95 | default_locale <- function() { 96 | loc <- getOption("readr.default_locale") 97 | if (is.null(loc)) { 98 | loc <- locale() 99 | options("readr.default_locale" = loc) 100 | } 101 | 102 | loc 103 | } 104 | 105 | check_tz <- function(x) { 106 | stopifnot(is.character(x), length(x) == 1) 107 | 108 | if (identical(x, "")) { 109 | x <- Sys.timezone() 110 | 111 | if (identical(x, "") || identical(x, NA_character_)) { 112 | x <- "UTC" 113 | } 114 | } 115 | 116 | x 117 | } 118 | 119 | check_encoding <- function(x) { 120 | stopifnot(is.character(x), length(x) == 1) 121 | 122 | if (tolower(x) %in% tolower(iconvlist())) { 123 | return(TRUE) 124 | } 125 | 126 | stop("Unknown encoding ", x, call. = FALSE) 127 | } 128 | -------------------------------------------------------------------------------- /R/melt_delim.R: -------------------------------------------------------------------------------- 1 | #' Return melted data for each token in a delimited file (including csv & tsv) 2 | #' 3 | #' For certain non-rectangular data formats, it can be useful to parse the data 4 | #' into a melted format where each row represents a single token. 5 | #' 6 | #' `melt_csv()` and `melt_tsv()` are special cases of the general 7 | #' `melt_delim()`. They're useful for reading the most common types of 8 | #' flat file data, comma separated values and tab separated values, 9 | #' respectively. `melt_csv2()` uses `;` for the field separator and `,` for the 10 | #' decimal point. This is common in some European countries. 
11 | #' @inheritParams readr::read_delim 12 | #' @return A [tibble()] of four columns: 13 | #' * `row`, the row that the token comes from in the original file 14 | #' * `col`, the column that the token comes from in the original file 15 | #' * `data_type`, the data type of the token, e.g. `"integer"`, `"character"`, 16 | #' `"date"`, guessed in a similar way to the `guess_parser()` function. 17 | #' * `value`, the token itself as a character string, unchanged from its 18 | #' representation in the original file. 19 | #' 20 | #' If there are parsing problems, a warning tells you 21 | #' how many, and you can retrieve the details with [problems()]. 22 | #' @seealso [readr::read_delim()] for the conventional way to read rectangular data 23 | #' from delimited files. 24 | #' @export 25 | #' @examples 26 | #' # Input sources ------------------------------------------------------------- 27 | #' # Read from a path 28 | #' melt_csv(meltr_example("mtcars.csv")) 29 | #' \dontrun{ 30 | #' melt_csv("https://github.com/tidyverse/readr/raw/master/inst/extdata/mtcars.csv") 31 | #' } 32 | #' 33 | #' # Or directly from a string (must contain a newline) 34 | #' melt_csv("x,y\n1,2\n3,4") 35 | #' 36 | #' # To import empty cells as 'empty' rather than `NA` 37 | #' melt_csv("x,y\n,NA,\"\",''", na = "NA") 38 | #' 39 | #' # File types ---------------------------------------------------------------- 40 | #' melt_csv("a,b\n1.0,2.0") 41 | #' melt_csv2("a;b\n1,0;2,0") 42 | #' melt_tsv("a\tb\n1.0\t2.0") 43 | #' melt_delim("a|b\n1.0|2.0", delim = "|") 44 | #' @export 45 | melt_delim <- function(file, delim, quote = '"', 46 | escape_backslash = FALSE, escape_double = TRUE, 47 | locale = default_locale(), 48 | na = c("", "NA"), quoted_na = TRUE, 49 | comment = "", trim_ws = FALSE, 50 | skip = 0, n_max = Inf, 51 | progress = show_progress(), 52 | skip_empty_rows = FALSE) { 53 | if (!nzchar(delim)) { 54 | stop("`delim` must be at least one character, ", 55 | "use `melt_table()` for whitespace delimited input.", call. = FALSE) 56 | } 57 | tokenizer <- tokenizer_delim(delim, quote = quote, 58 | escape_backslash = escape_backslash, escape_double = escape_double, 59 | na = na, quoted_na = quoted_na, comment = comment, trim_ws = trim_ws, 60 | skip_empty_rows = skip_empty_rows) 61 | melt_delimited(file, tokenizer, locale = locale, skip = skip, 62 | skip_empty_rows = skip_empty_rows, comment = comment, 63 | n_max = n_max, progress = progress) 64 | } 65 | 66 | #' @rdname melt_delim 67 | #' @export 68 | melt_csv <- function(file, locale = default_locale(), na = c("", "NA"), 69 | quoted_na = TRUE, quote = "\"", comment = "", 70 | trim_ws = TRUE, skip = 0, n_max = Inf, 71 | progress = show_progress(), 72 | skip_empty_rows = FALSE) { 73 | tokenizer <- tokenizer_csv(na = na, quoted_na = quoted_na, quote = quote, 74 | comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows) 75 | melt_delimited(file, tokenizer, locale = locale, skip = skip, 76 | skip_empty_rows = skip_empty_rows, comment = comment, n_max = n_max, 77 | progress = progress) 78 | } 79 | 80 | #' @rdname melt_delim 81 | #' @export 82 | melt_csv2 <- function(file, locale = default_locale(), na = c("", "NA"), 83 | quoted_na = TRUE, quote = "\"", comment = "", 84 | trim_ws = TRUE, skip = 0, n_max = Inf, 85 | progress = show_progress(), 86 | skip_empty_rows = FALSE) { 87 | 88 | if (locale$decimal_mark == ".") { 89 | cli::cli_alert_info("Using {.val ','} as decimal and {.val '.'} as grouping mark. 
Use {.fn melt_delim} for more control.") 90 | locale$decimal_mark <- "," 91 | locale$grouping_mark <- "." 92 | } 93 | tokenizer <- tokenizer_delim(delim = ";", na = na, quoted_na = quoted_na, 94 | quote = quote, comment = comment, trim_ws = trim_ws, 95 | skip_empty_rows = skip_empty_rows) 96 | melt_delimited(file, tokenizer, locale = locale, skip = skip, 97 | skip_empty_rows = skip_empty_rows, comment = comment, n_max = n_max, 98 | progress = progress) 99 | } 100 | 101 | 102 | #' @rdname melt_delim 103 | #' @export 104 | melt_tsv <- function(file, locale = default_locale(), na = c("", "NA"), 105 | quoted_na = TRUE, quote = "\"", comment = "", 106 | trim_ws = TRUE, skip = 0, n_max = Inf, 107 | progress = show_progress(), 108 | skip_empty_rows = FALSE) { 109 | tokenizer <- tokenizer_tsv(na = na, quoted_na = quoted_na, quote = quote, 110 | comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows) 111 | melt_delimited(file, tokenizer, locale = locale, skip = skip, 112 | skip_empty_rows = skip_empty_rows, comment = comment, n_max = n_max, 113 | progress = progress) 114 | } 115 | 116 | # Helper functions for reading from delimited files ---------------------------- 117 | col_spec_melt <- 118 | structure(list(row = structure(list(), 119 | class = c("collector_double", 120 | "collector")), 121 | col = structure(list(), 122 | class = c("collector_double", 123 | "collector")), 124 | data_type = structure(list(), 125 | class = c("collector_character", 126 | "collector")), 127 | value = structure(list(), 128 | class = c("collector_character", 129 | "collector"))), 130 | .Names = c("row", "col", "data_type", "value")) 131 | 132 | melt_tokens <- function(data, tokenizer, locale_, n_max, progress) { 133 | if (n_max == Inf) { 134 | n_max <- -1 135 | } 136 | melt_tokens_(data, tokenizer, col_spec_melt, locale_, n_max, progress) 137 | } 138 | 139 | melt_delimited <- function(file, tokenizer, locale = default_locale(), 140 | skip = 0, skip_empty_rows = FALSE, comment = "", n_max = Inf, 141 | progress = show_progress()) { 142 | name <- source_name(file) 143 | # If connection needed, read once. 144 | file <- standardise_path(file) 145 | if (is.connection(file)) { 146 | data <- datasource_connection(file, skip, skip_empty_rows = skip_empty_rows, comment) 147 | } else { 148 | if (empty_file(file)) { 149 | return(tibble::tibble(row = double(), col = double(), 150 | data_type = character(), value = character())) 151 | } 152 | if (is.character(file) && identical(locale$encoding, "UTF-8")) { 153 | # When locale is not set, file is probablly marked as its correct encoding. 154 | # As default_locale() assumes file is UTF-8, file should be encoded as UTF-8 for non-UTF-8 MBCS locales. 
155 | data <- enc2utf8(file) 156 | } else { 157 | data <- file 158 | } 159 | } 160 | ds <- datasource(data, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment) 161 | out <- melt_tokens(ds, tokenizer, locale_ = locale, n_max = n_max, 162 | progress = progress) 163 | warn_problems(out) 164 | } 165 | -------------------------------------------------------------------------------- /R/melt_delim_chunked.R: -------------------------------------------------------------------------------- 1 | # Generates the chunked definition from the melt_* definition 2 | generate_melt_chunked_fun <- function(x) { # nocov start 3 | args <- formals(x) 4 | 5 | # Remove n_max argument 6 | args <- args[names(args) != "n_max"] 7 | 8 | args <- append(args, alist(callback = , chunk_size = 10000), 1) 9 | 10 | b <- as.list(body(x)) 11 | 12 | # Change melt_delimited to melt_delimited_chunked 13 | b[[length(b)]][[1]] <- quote(melt_delimited_chunked) 14 | 15 | call_args <- as.list(b[[length(b)]]) 16 | 17 | # Remove the n_max argument 18 | call_args <- call_args[!names(call_args) == "n_max"] 19 | 20 | # add the callback and chunk_size arguments 21 | b[[length(b)]] <- as.call(append(call_args, alist(callback = callback, chunk_size = chunk_size), 2)) 22 | 23 | body(x) <- as.call(b) 24 | 25 | formals(x) <- args 26 | 27 | x 28 | } # nocov end 29 | 30 | # Generates the modified melt_delimited function 31 | generate_melt_delimited_chunked <- function(x) { # nocov start 32 | args <- formals(x) 33 | args <- args[names(args) != "n_max"] 34 | args <- append(args, alist(callback = , chunk_size = 10000), 1) 35 | 36 | b <- as.list(body(x)) 37 | 38 | for (i in seq_along(b)) { 39 | if (is.call(b[[i]]) && identical(b[[i]][[1]], as.symbol("<-")) && 40 | is.call(b[[i]][[3]]) && identical(b[[i]][[3]][[1]], quote(melt_tokens))) { 41 | 42 | # Change melt_tokens() to melt_tokens_chunked 43 | b[[i]][[3]][[1]] <- quote(melt_tokens_chunked) 44 | chunked_call <- as.list(b[[i]][[3]]) 45 | 46 | # Remove the n_max argument 47 | chunked_call <- chunked_call[!names(chunked_call) == "n_max"] 48 | 49 | # Add the callback and chunk_size arguments 50 | b[[i]] <- as.call(append(chunked_call, alist(callback = callback, chunk_size = chunk_size), 2)) 51 | 52 | # Remove additional calls 53 | b <- b[-seq(i + 1, length(b))] 54 | body(x) <- as.call(b) 55 | formals(x) <- args 56 | return(x) 57 | } 58 | } 59 | 60 | x 61 | } # nocov end 62 | 63 | melt_tokens_chunked <- function(data, callback, chunk_size, tokenizer, locale_, progress) { 64 | callback <- as_chunk_callback(callback) 65 | on.exit(callback$finally(), add = TRUE) 66 | 67 | melt_tokens_chunked_( 68 | data, callback, chunk_size, tokenizer, col_spec_melt, 69 | locale_, progress 70 | ) 71 | 72 | return(callback$result()) 73 | } 74 | 75 | melt_delimited_chunked <- generate_melt_delimited_chunked(melt_delimited) 76 | 77 | #' Melt a delimited file by chunks 78 | #' 79 | #' For certain non-rectangular data formats, it can be useful to parse the data 80 | #' into a melted format where each row represents a single token. 81 | #' 82 | #' `melt_delim_chunked()` and the specialisations `melt_csv_chunked()`, 83 | #' `melt_csv2_chunked()` and `melt_tsv_chunked()` read files by a chunk of rows 84 | #' at a time, executing a given function on one chunk before reading the next. 
85 | #' 86 | #' @inheritParams readr::read_delim_chunked 87 | #' @param callback A callback function to call on each chunk 88 | #' @param chunk_size The number of rows to include in each chunk 89 | #' @return A [tibble()] of four columns: 90 | #' * `row`, the row that the token comes from in the original file 91 | #' * `col`, the column that the token comes from in the original file 92 | #' * `data_type`, the data type of the token, e.g. `"integer"`, `"character"`, 93 | #' `"date"`, guessed in a similar way to the `guess_parser()` function. 94 | #' * `value`, the token itself as a character string, unchanged from its 95 | #' representation in the original file. 96 | #' 97 | #' If there are parsing problems, a warning tells you 98 | #' how many, and you can retrieve the details with [problems()]. 99 | #' @keywords internal 100 | #' @family chunked 101 | #' @export 102 | #' @examples 103 | #' # Cars with 3 gears 104 | #' f <- function(x, pos) subset(x, data_type == "integer") 105 | #' melt_csv_chunked(meltr_example("mtcars.csv"), DataFrameCallback$new(f), chunk_size = 5) 106 | melt_delim_chunked <- generate_melt_chunked_fun(melt_delim) 107 | 108 | #' @rdname melt_delim_chunked 109 | #' @export 110 | melt_csv_chunked <- generate_melt_chunked_fun(melt_csv) 111 | 112 | #' @rdname melt_delim_chunked 113 | #' @export 114 | melt_csv2_chunked <- generate_melt_chunked_fun(melt_csv2) 115 | 116 | #' @rdname melt_delim_chunked 117 | #' @export 118 | melt_tsv_chunked <- generate_melt_chunked_fun(melt_tsv) 119 | 120 | utils::globalVariables(c("callback", "chunk_size")) 121 | -------------------------------------------------------------------------------- /R/melt_fwf.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Return melted data for each token in a fixed width file 4 | #' 5 | #' For certain non-rectangular data formats, it can be useful to parse the data 6 | #' into a melted format where each row represents a single token. 7 | #' 8 | #' `melt_fwf()` parses each token of a fixed width file into a single row, but 9 | #' it still requires that each field is in the same in every row of the 10 | #' source file. 11 | #' 12 | #' @seealso [melt_table()] to melt fixed width files where each 13 | #' column is separated by whitespace, and [melt_fwf()] for the conventional 14 | #' way to read rectangular data from fixed width files. 15 | #' @inheritParams readr::read_fwf 16 | #' @param col_positions Column positions, as created by [fwf_empty()], 17 | #' [fwf_widths()] or [fwf_positions()]. To read in only selected fields, 18 | #' use [fwf_positions()]. If the width of the last column is variable (a 19 | #' ragged fwf file), supply the last end position as NA. 20 | #' @return A [tibble()] of four columns: 21 | #' * `row`, the row that the token comes from in the original file 22 | #' * `col`, the column that the token comes from in the original file 23 | #' * `data_type`, the data type of the token, e.g. `"integer"`, `"character"`, 24 | #' `"date"`, guessed in a similar way to the `guess_parser()` function. 25 | #' * `value`, the token itself as a character string, unchanged from its 26 | #' representation in the original file. 27 | #' 28 | #' If there are parsing problems, a warning tells you 29 | #' how many, and you can retrieve the details with [problems()]. 
30 | #' @export 31 | #' @examples 32 | #' fwf_sample <- meltr_example("fwf-sample.txt") 33 | #' writeLines(readLines(fwf_sample)) 34 | #' 35 | #' # You can specify column positions in several ways: 36 | #' # 1. Guess based on position of empty columns 37 | #' melt_fwf(fwf_sample, fwf_empty(fwf_sample, col_names = c("first", "last", "state", "ssn"))) 38 | #' # 2. A vector of field widths 39 | #' melt_fwf(fwf_sample, fwf_widths(c(20, 10, 12), c("name", "state", "ssn"))) 40 | #' # 3. Paired vectors of start and end positions 41 | #' melt_fwf(fwf_sample, fwf_positions(c(1, 30), c(10, 42), c("name", "ssn"))) 42 | #' # 4. Named arguments with start and end positions 43 | #' melt_fwf(fwf_sample, fwf_cols(name = c(1, 10), ssn = c(30, 42))) 44 | #' # 5. Named arguments with column widths 45 | #' melt_fwf(fwf_sample, fwf_cols(name = 20, state = 10, ssn = 12)) 46 | melt_fwf <- function(file, col_positions, 47 | locale = default_locale(), na = c("", "NA"), 48 | comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, 49 | progress = show_progress(), 50 | skip_empty_rows = FALSE) { 51 | ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows) 52 | if (inherits(ds, "source_file") && empty_file(file)) { 53 | return(tibble::tibble( 54 | row = double(), col = double(), 55 | data_type = character(), value = character() 56 | )) 57 | } 58 | tokenizer <- tokenizer_fwf(as.integer(col_positions$begin), as.integer(col_positions$end), 59 | na = na, 60 | comment = comment, trim_ws = trim_ws, 61 | skip_empty_rows = skip_empty_rows 62 | ) 63 | out <- melt_tokens(ds, tokenizer, 64 | locale_ = locale, 65 | n_max = if (n_max == Inf) -1 else n_max, progress = progress 66 | ) 67 | warn_problems(out) 68 | } 69 | 70 | #' @rdname melt_fwf 71 | #' @export 72 | #' @param n Number of lines the tokenizer will read to determine file structure. By default 73 | #' it is set to 100. 74 | fwf_empty <- function(file, skip = 0, skip_empty_rows = FALSE, col_names = NULL, comment = "", n = 100L) { 75 | ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows) 76 | 77 | out <- whitespaceColumns(ds, comment = comment, n = n) 78 | out$end[length(out$end)] <- NA 79 | 80 | col_names <- fwf_col_names(col_names, length(out$begin)) 81 | out$col_names <- col_names 82 | out 83 | } 84 | 85 | #' @rdname melt_fwf 86 | #' @export 87 | #' @param widths Width of each field. Use NA as width of last field when 88 | #' reading a ragged fwf file. 89 | #' @param col_names Either NULL, or a character vector column names. 90 | fwf_widths <- function(widths, col_names = NULL) { 91 | pos <- cumsum(c(1L, abs(widths))) 92 | fwf_positions(pos[-length(pos)], pos[-1] - 1L, col_names) 93 | } 94 | 95 | #' @rdname melt_fwf 96 | #' @export 97 | #' @param start,end Starting and ending (inclusive) positions of each field. 98 | #' Use NA as last end field when reading a ragged fwf file. 99 | fwf_positions <- function(start, end = NULL, col_names = NULL) { 100 | stopifnot(length(start) == length(end)) 101 | col_names <- fwf_col_names(col_names, length(start)) 102 | 103 | tibble( 104 | begin = start - 1L, 105 | end = end, # -1 to change to 0 offset, +1 to be exclusive, 106 | col_names = as.character(col_names) 107 | ) 108 | } 109 | 110 | 111 | #' @rdname melt_fwf 112 | #' @export 113 | #' @param ... If the first element is a data frame, 114 | #' then it must have all numeric columns and either one or two rows. 115 | #' The column names are the variable names. 
The column values are the 116 | #' variable widths if a length one vector, and if length two, variable start and end 117 | #' positions. The elements of `...` are used to construct a data frame 118 | #' with or or two rows as above. 119 | fwf_cols <- function(...) { 120 | x <- lapply(list(...), as.integer) 121 | names(x) <- fwf_col_names(names(x), length(x)) 122 | x <- tibble::as_tibble(x) 123 | if (nrow(x) == 2) { 124 | res <- fwf_positions(as.integer(x[1, ]), as.integer(x[2, ]), names(x)) 125 | } else if (nrow(x) == 1) { 126 | res <- fwf_widths(as.integer(x[1, ]), names(x)) 127 | } else { 128 | stop("All variables must have either one (width) two (start, end) values.", 129 | call. = FALSE 130 | ) 131 | } 132 | res 133 | } 134 | 135 | fwf_col_names <- function(nm, n) { 136 | nm <- nm %||% rep("", n) 137 | nm_empty <- (nm == "") 138 | nm[nm_empty] <- paste0("X", seq_len(n))[nm_empty] 139 | nm 140 | } 141 | -------------------------------------------------------------------------------- /R/melt_table.R: -------------------------------------------------------------------------------- 1 | #' Return melted data for each token in a whitespace-separated file 2 | #' 3 | #' @description 4 | #' 5 | #' For certain non-rectangular data formats, it can be useful to parse the data 6 | #' into a melted format where each row represents a single token. 7 | #' 8 | #' `melt_table()` and `melt_table2()` are designed to read the type of textual 9 | #' data where each column is separated by one (or more) columns of space. 10 | #' 11 | #' `melt_table2()` allows any number of whitespace characters between columns, 12 | #' and the lines can be of different lengths. 13 | #' 14 | #' `melt_table()` is more strict, each line must be the same length, 15 | #' and each field is in the same position in every line. It first finds empty 16 | #' columns and then parses like a fixed width file. 17 | #' 18 | #' @seealso [melt_fwf()] to melt fixed width files where each column 19 | #' is not separated by whitespace. `melt_fwf()` is also useful for reading 20 | #' tabular data with non-standard formatting. [readr::read_table()] is the 21 | #' conventional way to read tabular data from whitespace-separated files. 22 | #' @inheritParams readr::read_table 23 | #' @return A [tibble()] of four columns: 24 | #' * `row`, the row that the token comes from in the original file 25 | #' * `col`, the column that the token comes from in the original file 26 | #' * `data_type`, the data type of the token, e.g. `"integer"`, `"character"`, 27 | #' `"date"`, guessed in a similar way to the `guess_parser()` function. 28 | #' * `value`, the token itself as a character string, unchanged from its 29 | #' representation in the original file. 30 | #' 31 | #' If there are parsing problems, a warning tells you 32 | #' how many, and you can retrieve the details with [problems()]. 
33 | #' @export 34 | #' @examples 35 | #' # One corner from http://www.masseyratings.com/cf/compare.htm 36 | #' massey <- meltr_example("massey-rating.txt") 37 | #' cat(readLines(massey)) 38 | #' melt_table(massey) 39 | #' 40 | #' # Sample of 1978 fuel economy data from 41 | #' # http://www.fueleconomy.gov/feg/epadata/78data.zip 42 | #' epa <- meltr_example("epa78.txt") 43 | #' writeLines(readLines(epa)) 44 | #' melt_table(epa) 45 | melt_table <- function(file, locale = default_locale(), na = "NA", skip = 0, 46 | n_max = Inf, guess_max = min(n_max, 1000), 47 | progress = show_progress(), comment = "", 48 | skip_empty_rows = FALSE) { 49 | ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows) 50 | if (inherits(ds, "source_file") && empty_file(file)) { 51 | return(tibble::tibble( 52 | row = double(), col = double(), 53 | data_type = character(), value = character() 54 | )) 55 | } 56 | 57 | columns <- fwf_empty(ds, skip = skip, skip_empty_rows = skip_empty_rows, n = guess_max, comment = comment) 58 | tokenizer <- tokenizer_fwf(columns$begin, columns$end, 59 | na = na, 60 | comment = comment, 61 | skip_empty_rows = skip_empty_rows 62 | ) 63 | 64 | ds <- datasource(file = ds, skip = skip, skip_empty_rows = skip_empty_rows) 65 | out <- melt_tokens(ds, tokenizer, 66 | locale_ = locale, n_max = n_max, 67 | progress = progress 68 | ) 69 | warn_problems(out) 70 | } 71 | 72 | #' @rdname melt_table 73 | #' @export 74 | melt_table2 <- function(file, locale = default_locale(), na = "NA", skip = 0, 75 | n_max = Inf, progress = show_progress(), comment = "", 76 | skip_empty_rows = FALSE) { 77 | ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows) 78 | if (inherits(ds, "source_file") && empty_file(file)) { 79 | return(tibble::tibble( 80 | row = double(), col = double(), 81 | data_type = character(), value = character() 82 | )) 83 | } 84 | tokenizer <- tokenizer_ws( 85 | na = na, comment = comment, 86 | skip_empty_rows = skip_empty_rows 87 | ) 88 | 89 | ds <- datasource(file = ds, skip = skip, skip_empty_rows = skip_empty_rows) 90 | melt_delimited(ds, tokenizer, 91 | locale = locale, skip = skip, 92 | comment = comment, n_max = n_max, progress = progress 93 | ) 94 | } 95 | -------------------------------------------------------------------------------- /R/meltr-package.R: -------------------------------------------------------------------------------- 1 | ## usethis namespace: start 2 | #' @useDynLib meltr, .registration = TRUE 3 | ## usethis namespace: end 4 | NULL 5 | -------------------------------------------------------------------------------- /R/problems.R: -------------------------------------------------------------------------------- 1 | #' Retrieve parsing problems 2 | #' 3 | #' Readr functions will only throw an error if parsing fails in an unrecoverable 4 | #' way. However, there are lots of potential problems that you might want to 5 | #' know about - these are stored in the `problems` attribute of the 6 | #' output, which you can easily access with this function. 7 | #' `stop_for_problems()` will throw an error if there are any parsing 8 | #' problems: this is useful for automated scripts where you want to throw 9 | #' an error as soon as you encounter a problem. 10 | #' 11 | #' @param x An data frame (from `read_*()`) or a vector 12 | #' (from `parse_*()`). 
13 | #' @return A data frame with one row for each problem and four columns: 14 | #' \item{row,col}{Row and column of problem} 15 | #' \item{expected}{What readr expected to find} 16 | #' \item{actual}{What it actually got} 17 | #' @export 18 | #' @examples 19 | #' if (requireNamespace("readr")) { 20 | #' x <- readr::parse_integer(c("1X", "blah", "3")) 21 | #' problems(x) 22 | #' 23 | #' y <- readr::parse_integer(c("1", "2", "3")) 24 | #' problems(y) 25 | #' } 26 | problems <- local({ 27 | no_problems <- tibble::tibble( 28 | row = integer(), 29 | col = integer(), 30 | expected = character(), 31 | actual = character() 32 | ) 33 | 34 | function(x = .Last.value) { 35 | problems <- probs(x) 36 | 37 | if (is.null(problems)) { 38 | return(invisible(no_problems)) 39 | } 40 | 41 | problems 42 | } 43 | }) 44 | 45 | #' @export 46 | #' @rdname problems 47 | stop_for_problems <- function(x) { 48 | n <- n_problems(x) 49 | if (n == 0) { 50 | return(invisible(x)) 51 | } 52 | 53 | stop(n, " parsing failure", if (n > 1) "s", call. = FALSE) 54 | } 55 | 56 | probs <- function(x) { 57 | attr(suppressWarnings(x), "problems") 58 | } 59 | 60 | n_problems <- function(x) { 61 | probs <- problems(x) 62 | if (is.null(probs)) 0 else nrow(probs) 63 | } 64 | 65 | problem_rows <- function(x) { 66 | if (n_problems(x) == 0) { 67 | return(x[0, , drop = FALSE]) 68 | } 69 | 70 | probs <- problems(x) 71 | x[unique(probs$row), , drop = FALSE] 72 | } 73 | 74 | warn_problems <- function(x) { 75 | n <- n_problems(x) 76 | if (n == 0) { 77 | return(x) 78 | } 79 | 80 | probs <- as.data.frame(attr(x, "problems")) 81 | many_problems <- nrow(probs) > 5 82 | 83 | probs_f <- format(utils::head(probs, 5), justify = "left") 84 | probs_f[probs_f == "NA"] <- "--" 85 | probs_f <- rbind(names(probs), probs_f) 86 | probs_f <- lapply(probs_f, format, justify = "right") 87 | 88 | if (many_problems) { 89 | # nchar fails with non-ascii characters, so encode characters beforehand. 90 | width <- vapply(probs_f, function(x) max(nchar(encodeString(x))), integer(1)) 91 | dots <- vapply(width, function(i) paste(rep(".", i), collapse = ""), 92 | FUN.VALUE = character(1) 93 | ) 94 | 95 | probs_f <- Map(c, probs_f, dots) 96 | } 97 | 98 | probs_f <- do.call(paste, c(probs_f, list(sep = " ", collapse = "\n"))) 99 | warning(n, " parsing failure", if (n > 1) "s", ".\n", 100 | probs_f, "\n", 101 | if (many_problems) "See problems(...) for more details.\n", 102 | call. = FALSE, immediate. = TRUE, noBreaks. = TRUE 103 | ) 104 | 105 | x 106 | } 107 | 108 | name_problems <- function(x, all_colnames, name = "input") { 109 | if (n_problems(x) == 0) { 110 | return(x) 111 | } 112 | 113 | problems <- problems(x) 114 | problems$file <- name 115 | problems$col <- all_colnames[problems$col] 116 | attr(x, "problems") <- problems 117 | 118 | x 119 | } 120 | -------------------------------------------------------------------------------- /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-lib/meltr/38c5a720afe794d1fd2f36e5bb552dd9a8ca8b47/R/sysdata.rda -------------------------------------------------------------------------------- /R/tokenizer.R: -------------------------------------------------------------------------------- 1 | #' Tokenizers. 2 | #' 3 | #' Explicitly create tokenizer objects. Usually you will not call these 4 | #' function, but will instead use one of the use friendly wrappers like 5 | #' [readr::read_csv()]. 
6 | #' 7 | #' @keywords internal 8 | #' @name Tokenizers 9 | #' @examples 10 | #' tokenizer_csv() 11 | NULL 12 | 13 | #' @export 14 | #' @rdname Tokenizers 15 | #' @param comment A string used to identify comments. Any text after the 16 | #' comment characters will be silently ignored. 17 | #' @param na Character vector of strings to interpret as missing values. Set this 18 | #' option to `character()` to indicate no missing values. 19 | #' @param quoted_na Should missing values inside quotes be treated as missing 20 | #' values (the default) or strings. 21 | #' @param delim Single character used to separate fields within a record. 22 | #' @param quote Single character used to quote strings. 23 | #' @param trim_ws Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from 24 | #' each field before parsing it? 25 | #' @param escape_double Does the file escape quotes by doubling them? 26 | #' i.e. If this option is `TRUE`, the value `""""` represents 27 | #' a single quote, `\"`. 28 | #' @param escape_backslash Does the file use backslashes to escape special 29 | #' characters? This is more general than `escape_double` as backslashes 30 | #' can be used to escape the delimiter character, the quote character, or 31 | #' to add special characters like `\\n`. 32 | #' @param skip_empty_rows Should blank rows be ignored altogether? i.e. If this 33 | #' option is `TRUE` then blank rows will not be represented at all. If it is 34 | #' `FALSE` then they will be represented by `NA` values in all the columns. 35 | #' @return A tokeenizer object 36 | #' @examples 37 | #' tokenizer_delim(",") 38 | tokenizer_delim <- function(delim, quote = '"', na = "NA", quoted_na = TRUE, comment = "", 39 | trim_ws = TRUE, 40 | escape_double = TRUE, 41 | escape_backslash = FALSE, 42 | skip_empty_rows = TRUE) { 43 | structure( 44 | list( 45 | delim = delim, 46 | quote = quote, 47 | na = na, 48 | quoted_na = quoted_na, 49 | comment = comment, 50 | trim_ws = trim_ws, 51 | escape_double = escape_double, 52 | escape_backslash = escape_backslash, 53 | skip_empty_rows = skip_empty_rows 54 | ), 55 | class = "tokenizer_delim" 56 | ) 57 | } 58 | 59 | #' @export 60 | #' @rdname Tokenizers 61 | tokenizer_csv <- function(na = "NA", quoted_na = TRUE, quote = "\"", 62 | comment = "", trim_ws = TRUE, 63 | skip_empty_rows = TRUE) { 64 | tokenizer_delim( 65 | delim = ",", 66 | na = na, 67 | quoted_na = quoted_na, 68 | quote = quote, 69 | comment = comment, 70 | trim_ws = trim_ws, 71 | escape_double = TRUE, 72 | escape_backslash = FALSE, 73 | skip_empty_rows = skip_empty_rows 74 | ) 75 | } 76 | 77 | #' @export 78 | #' @rdname Tokenizers 79 | tokenizer_tsv <- function(na = "NA", quoted_na = TRUE, quote = "\"", 80 | comment = "", trim_ws = TRUE, 81 | skip_empty_rows = TRUE) { 82 | tokenizer_delim( 83 | delim = "\t", 84 | na = na, 85 | quoted_na = quoted_na, 86 | quote = quote, 87 | comment = comment, 88 | trim_ws = trim_ws, 89 | escape_double = TRUE, 90 | escape_backslash = FALSE, 91 | skip_empty_rows = skip_empty_rows 92 | ) 93 | } 94 | 95 | #' @export 96 | #' @rdname Tokenizers 97 | tokenizer_line <- function(na = character(), skip_empty_rows = TRUE) { 98 | structure(list(na = na, skip_empty_rows = skip_empty_rows), 99 | class = "tokenizer_line" 100 | ) 101 | } 102 | 103 | #' @export 104 | #' @rdname Tokenizers 105 | tokenizer_log <- function(trim_ws) { 106 | structure(list(trim_ws = trim_ws), class = "tokenizer_log") 107 | } 108 | 109 | 110 | #' @export 111 | #' @rdname Tokenizers 112 | #' @param begin,end Begin and 
end offsets for each file. These are C++ 113 | #' offsets so the first column is column zero, and the ranges are 114 | #' [begin, end) (i.e. inclusive-exclusive). 115 | tokenizer_fwf <- function(begin, end, na = "NA", comment = "", trim_ws = TRUE, 116 | skip_empty_rows = TRUE) { 117 | structure(list( 118 | begin = as.integer(begin), end = as.integer(end), na = na, comment = comment, 119 | trim_ws = trim_ws, skip_empty_rows = skip_empty_rows 120 | ), 121 | class = "tokenizer_fwf" 122 | ) 123 | } 124 | 125 | #' @export 126 | #' @rdname Tokenizers 127 | tokenizer_ws <- function(na = "NA", comment = "", skip_empty_rows = TRUE) { 128 | structure(list(na = na, comment = comment, skip_empty_rows = skip_empty_rows), 129 | class = "tokenizer_ws" 130 | ) 131 | } 132 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | # Silence R CMD check note 2 | #' @importFrom tibble tibble 3 | NULL 4 | 5 | is.connection <- function(x) inherits(x, "connection") 6 | 7 | `%||%` <- function(a, b) if (is.null(a)) b else a 8 | 9 | #' Determine whether progress bars should be shown 10 | #' 11 | #' Progress bars are shown _unless_ one of the following is `TRUE` 12 | #' - The bar is explicitly disabled by setting `options(readr.show_progress = FALSE)` 13 | #' - The code is run in a non-interactive session (`interactive()` is `FALSE`). 14 | #' - The code is run in an RStudio notebook chunk. 15 | #' - The code is run by knitr / rmarkdown. 16 | #' 17 | #' @return A logical value 18 | #' @export 19 | #' @examples 20 | #' show_progress() 21 | show_progress <- function() { 22 | isTRUE(getOption("readr.show_progress")) && # progress bar enabled by the user 23 | interactive() && # an interactive session 24 | !isTRUE(getOption("rstudio.notebook.executing")) && # Not running in an RStudio notebook chunk 25 | !isTRUE(getOption("knitr.in.progress")) # Not actively knitting a document 26 | } 27 | 28 | #' @importFrom tibble as_tibble 29 | #' @export 30 | as_tibble.meltr_spec_tbl_df <- function(x, ...) { 31 | attr(x, "spec") <- NULL 32 | attr(x, "problems") <- NULL 33 | class(x) <- setdiff(class(x), "meltr_spec_tbl_df") 34 | NextMethod("as_tibble") 35 | } 36 | 37 | #' @export 38 | as.data.frame.meltr_spec_tbl_df <- function(x, ...) { 39 | attr(x, "spec") <- NULL 40 | attr(x, "problems") <- NULL 41 | class(x) <- setdiff(class(x), "meltr_spec_tbl_df") 42 | NextMethod("as.data.frame") 43 | } 44 | 45 | #' @export 46 | `[.meltr_spec_tbl_df` <- function(x, ...) { 47 | attr(x, "spec") <- NULL 48 | attr(x, "problems") <- NULL 49 | class(x) <- setdiff(class(x), "meltr_spec_tbl_df") 50 | NextMethod(`[`) 51 | } 52 | 53 | #' @importFrom methods setOldClass 54 | setOldClass(c("meltr_spec_tbl_df", "tbl_df", "tbl", "data.frame")) 55 | 56 | # @export 57 | compare.meltr_spec_tbl_df <- function(x, y, ...) { 58 | attr(x, "spec") <- NULL 59 | attr(x, "problems") <- NULL 60 | 61 | attr(y, "spec") <- NULL 62 | attr(y, "problems") <- NULL 63 | 64 | NextMethod("compare") 65 | } 66 | 67 | # @export 68 | compare_proxy.meltr_spec_tbl_df <- function(x) { 69 | attr(x, "spec") <- NULL 70 | attr(x, "problems") <- NULL 71 | x 72 | } 73 | 74 | is_named <- function(x) { 75 | nms <- names(x) 76 | 77 | if (is.null(nms)) { 78 | return(FALSE) 79 | } 80 | 81 | all(nms != "" & !is.na(nms)) 82 | } 83 | 84 | .onLoad <- function(...)
{ 85 | register_s3_method("testthat", "compare", "meltr_spec_tbl_df") 86 | register_s3_method("waldo", "compare_proxy", "meltr_spec_tbl_df") 87 | } 88 | 89 | register_s3_method <- function(pkg, generic, class, fun = NULL) { 90 | stopifnot(is.character(pkg), length(pkg) == 1) 91 | stopifnot(is.character(generic), length(generic) == 1) 92 | stopifnot(is.character(class), length(class) == 1) 93 | 94 | if (is.null(fun)) { 95 | fun <- get(paste0(generic, ".", class), envir = parent.frame()) 96 | } else { 97 | stopifnot(is.function(fun)) 98 | } 99 | 100 | if (pkg %in% loadedNamespaces()) { 101 | registerS3method(generic, class, fun, envir = asNamespace(pkg)) 102 | } 103 | 104 | # Always register hook in case package is later unloaded & reloaded 105 | setHook( 106 | packageEvent(pkg, "onLoad"), 107 | function(...) { 108 | registerS3method(generic, class, fun, envir = asNamespace(pkg)) 109 | } 110 | ) 111 | } 112 | 113 | # Silence R CMD check note 114 | # Namespaces in Imports field not imported from: 115 | # ‘R6’ ‘rlang’ 116 | # All declared Imports should be used. 117 | # See https://github.com/hadley/r-pkgs/issues/828 118 | fake_function_1 <- function() R6::R6Class 119 | fake_function_2 <- function() rlang::int 120 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | 16 | # meltr 17 | 18 | 19 | [![R-CMD-check](https://github.com/r-lib/meltr/workflows/R-CMD-check/badge.svg)](https://github.com/r-lib/meltr/actions) 20 | [![Codecov test coverage](https://codecov.io/gh/r-lib/meltr/branch/main/graph/badge.svg)](https://app.codecov.io/gh/r-lib/meltr?branch=main) 21 | 22 | 23 |

24 | [logo image: The wicked witch of the west saying 'I'm Melting, Melting!!!!!'] 25 |

26 | 27 | The goal of 'meltr' is to provide a fast and friendly way to read 28 | non-rectangular data (like ragged forms of 'csv', 'tsv', and 'fwf'). 29 | 30 | Standard tools like [`readr::read_csv()`](https://readr.tidyverse.org/reference/read_delim.html) can cope to some extent with unusual inputs, like files with empty rows or newlines embedded in strings. 31 | But some files are so wacky that standard tools don't work at all, and instead you have to take the file to pieces and reassemble it to get structured data you can work with. 32 | 33 | The meltr package provides tools to do this. 34 | 35 | ## Installation 36 | 37 | You can install the released version of meltr from CRAN with: 38 | 39 | ``` r 40 | install.packages("meltr") 41 | ``` 42 | 43 | Or you can install the development version with: 44 | 45 | ```r 46 | # install.packages("devtools") 47 | devtools::install_github("r-lib/meltr") 48 | ``` 49 | 50 | ## The problem with non-rectangular data 51 | 52 | Here's a contrived example that breaks two assumptions made by common tools like `readr::read_csv()`. 53 | 54 | 1. There are more cells in some rows than others. 55 | 2. There are mixed data types within each column. 56 | 57 | In contrast, the `melt_csv()` function reads the file one cell at a time, importing each cell of the file into a whole row of the final data frame. 58 | 59 | ```{r} 60 | writeLines("Help,,007,I'm 61 | 1960-09-30,FALSE,trapped in,7,1.21 62 | non-rectangular,data,NA", "messy.csv") 63 | 64 | library(meltr) 65 | 66 | melt_csv("messy.csv") 67 | ``` 68 | 69 | The output of `melt_csv()` gives us: 70 | 71 | - A data frame of results – structured data about un-structured data! 72 | - Rows of data corresponding to cells of the input data. 73 | - Empty cells such as the cell on row 1, but not missing cells at the ends of rows 1 and 3. 74 | - The raw, unconverted data: no data type conversion is attempted – every value is imported as a string, and the `data_type` column merely gives meltr's best guess of what the data types ought to be. 75 | 76 | What are some ways you can use this? 77 | To begin with, you can do some simple manipulations with ordinary functions. 78 | 79 | For example, you could extract the words. 80 | 81 | ```{r} 82 | library(dplyr) 83 | 84 | data <- melt_csv("messy.csv") 85 | 86 | data %>% 87 | filter(data_type == "character") 88 | ``` 89 | 90 | Or find if there are missing entries. The `row` and `col` columns pinpoint each empty cell in the original file. 91 | 92 | ```{r} 93 | data %>% 94 | filter(data_type == "missing") 95 | ``` 96 | 97 | ```{r, include = FALSE} 98 | unlink("messy.csv") 99 | ``` 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # meltr 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/r-lib/meltr/workflows/R-CMD-check/badge.svg)](https://github.com/r-lib/meltr/actions) 9 | [![Codecov test 10 | coverage](https://codecov.io/gh/r-lib/meltr/branch/main/graph/badge.svg)](https://app.codecov.io/gh/r-lib/meltr?branch=main) 11 | 12 | 13 |

14 | [logo image: The wicked witch of the west saying 'I'm Melting, Melting!!!!!'] 15 |

16 | 17 | The goal of ‘meltr’ is to provide a fast and friendly way to read 18 | non-rectangular data (like ragged forms of ‘csv’, ‘tsv’, and ‘fwf’). 19 | 20 | Standard tools like 21 | [`readr::read_csv()`](https://readr.tidyverse.org/reference/read_delim.html) 22 | can cope to some extent with unusual inputs, like files with empty rows 23 | or newlines embedded in strings. But some files are so wacky that 24 | standard tools don’t work at all, and instead you have to take the file 25 | to pieces and reassemble it to get structured data you can work with. 26 | 27 | The meltr package provides tools to do this. 28 | 29 | ## Installation 30 | 31 | You can install the released version of meltr from CRAN with: 32 | 33 | ``` r 34 | install.packages("meltr") 35 | ``` 36 | 37 | Or you can install the development version with: 38 | 39 | ``` r 40 | # install.packages("devtools") 41 | devtools::install_github("r-lib/meltr") 42 | ``` 43 | 44 | ## The problem with non-rectangular data 45 | 46 | Here’s a contrived example that breaks two assumptions made by common 47 | tools like `readr::read_csv()`. 48 | 49 | 1. There are more cells in some rows than others. 50 | 2. There are mixed data types within each column. 51 | 52 | In contrast, the `melt_csv()` function reads the file one cell at a 53 | time, importing each cell of the file into a whole row of the final data 54 | frame. 55 | 56 | ``` r 57 | writeLines("Help,,007,I'm 58 | 1960-09-30,FALSE,trapped in,7,1.21 59 | non-rectangular,data,NA", "messy.csv") 60 | 61 | library(meltr) 62 | 63 | melt_csv("messy.csv") 64 | #> # A tibble: 12 × 4 65 | #> row col data_type value 66 | #> 67 | #> 1 1 1 character Help 68 | #> 2 1 2 missing 69 | #> 3 1 3 character 007 70 | #> 4 1 4 character I'm 71 | #> 5 2 1 date 1960-09-30 72 | #> 6 2 2 logical FALSE 73 | #> 7 2 3 character trapped in 74 | #> 8 2 4 integer 7 75 | #> 9 2 5 double 1.21 76 | #> 10 3 1 character non-rectangular 77 | #> 11 3 2 character data 78 | #> 12 3 3 missing 79 | ``` 80 | 81 | The output of `melt_csv()` gives us: 82 | 83 | - A data frame of results – structured data about un-structured data! 84 | - Rows of data corresponding to cells of the input data. 85 | - Empty cells such as the cell on row 1, but not missing cells at the 86 | ends of rows 1 and 3. 87 | - The raw, unconverted data: no data type conversion is attempted – 88 | every value is imported as a string, and the `data_type` column merely 89 | gives meltr’s best guess of what the data types ought to be. 90 | 91 | What are some ways you can use this? To begin with, you can do some 92 | simple manipulations with ordinary functions. 93 | 94 | For example, you could extract the words. 95 | 96 | ``` r 97 | library(dplyr) 98 | #> 99 | #> Attaching package: 'dplyr' 100 | #> The following objects are masked from 'package:stats': 101 | #> 102 | #> filter, lag 103 | #> The following objects are masked from 'package:base': 104 | #> 105 | #> intersect, setdiff, setequal, union 106 | 107 | data <- melt_csv("messy.csv") 108 | 109 | data %>% 110 | filter(data_type == "character") 111 | #> # A tibble: 6 × 4 112 | #> row col data_type value 113 | #> 114 | #> 1 1 1 character Help 115 | #> 2 1 3 character 007 116 | #> 3 1 4 character I'm 117 | #> 4 2 3 character trapped in 118 | #> 5 3 1 character non-rectangular 119 | #> 6 3 2 character data 120 | ``` 121 | 122 | Or find if there are missing entries.
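The `row` and `col` columns pinpoint each empty cell in the original file.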
123 | 124 | ``` r 125 | data %>% 126 | filter(data_type == "missing") 127 | #> # A tibble: 2 × 4 128 | #> row col data_type value 129 | #> 130 | #> 1 1 2 missing 131 | #> 2 3 3 missing 132 | ``` 133 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## R CMD check results 2 | 3 | 0 errors, 0 warnings, 0 notes 4 | -------------------------------------------------------------------------------- /data-raw/date-symbols.R: -------------------------------------------------------------------------------- 1 | library(stringi) 2 | 3 | locs <- stri_locale_list() 4 | base <- unique(stri_split_fixed(locs, "_", n = 2, simplify = TRUE)[, 1]) 5 | 6 | locale_info <- function(x) { 7 | full <- stri_datetime_symbols(x, context = "format", width = "wide") 8 | abbr <- stri_datetime_symbols(x, context = "format", width = "abbreviated") 9 | 10 | date_names( 11 | mon = full$Month, 12 | mon_ab = abbr$Month, 13 | day = full$Weekday, 14 | day_ab = abbr$Weekday, 15 | am_pm = full$AmPm 16 | ) 17 | } 18 | 19 | date_symbols <- lapply(base, locale_info) 20 | names(date_symbols) <- base 21 | 22 | usethis::use_data(date_symbols, internal = TRUE, overwrite = TRUE) 23 | -------------------------------------------------------------------------------- /inst/extdata/epa78.txt: -------------------------------------------------------------------------------- 1 | ALFA ROMEO ALFA ROMEO 78010003 2 | ALFETTA 03 81 8 74 7 89 9 ALFETTA 78010053 3 | SPIDER 2000 01 SPIDER 2000 78010103 4 | AMC AMC 78020002 5 | GREMLIN 03 79 9 79 9 GREMLIN 78020053 6 | PACER 04 89 11 89 11 PACER 78020103 7 | PACER WAGON 07 90 26 91 26 PACER WAGON 78020153 8 | CONCORD 04 88 12 90 11 90 11 83 16 CONCORD 78020203 9 | CONCORD WAGON 07 91 30 91 30 CONCORD WAGON 78020253 10 | MATADOR COUPE 05 97 14 97 14 MATADOR COUPE 78020303 11 | MATADOR SEDAN 06 110 20 110 20 MATADOR SEDAN 78020353 12 | MATADOR WAGON 09 112 50 112 50 MATADOR WAGON 78020403 13 | ASTON MARTIN ASTON MARTIN 78040002 14 | ASTON MARTIN ASTON MARTIN 78040053 15 | AUDI AUDI 78050002 16 | FOX 03 84 11 84 11 84 11 FOX 78050053 17 | FOX WAGON 07 83 40 83 40 FOX WAGON 78050103 18 | 5000 04 90 15 90 15 5000 78050153 19 | AVANTI AVANTI 78065002 20 | AVANTI II 02 75 8 75 8 AVANTI II 78065053 21 | -------------------------------------------------------------------------------- /inst/extdata/fwf-sample.txt: -------------------------------------------------------------------------------- 1 | John Smith WA 418-Y11-4111 2 | Mary Hartford CA 319-Z19-4341 3 | Evan Nolan IL 219-532-c301 4 | -------------------------------------------------------------------------------- /inst/extdata/massey-rating.txt: -------------------------------------------------------------------------------- 1 | UCC PAY LAZ KPK RT COF BIH DII ENG ACU Rank Team Conf 2 | 1 1 1 1 1 1 1 1 1 1 1 Ohio St B10 3 | 2 2 2 2 2 2 2 2 4 2 2 Oregon P12 4 | 3 4 3 4 3 4 3 4 2 3 3 Alabama SEC 5 | 4 3 4 3 4 3 5 3 3 4 4 TCU B12 6 | 6 6 6 5 5 7 6 5 6 11 5 Michigan St B10 7 | 7 7 7 6 7 6 11 8 7 8 6 Georgia SEC 8 | 5 5 5 7 6 8 4 6 5 
5 7 Florida St ACC 9 | 8 8 9 9 10 5 7 7 10 7 8 Baylor B12 10 | 9 11 8 13 11 11 12 9 14 9 9 Georgia Tech ACC 11 | 13 10 13 11 8 9 10 11 9 10 10 Mississippi SEC 12 | -------------------------------------------------------------------------------- /inst/extdata/mtcars.csv: -------------------------------------------------------------------------------- 1 | "mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb" 2 | 21,6,160,110,3.9,2.62,16.46,0,1,4,4 3 | 21,6,160,110,3.9,2.875,17.02,0,1,4,4 4 | 22.8,4,108,93,3.85,2.32,18.61,1,1,4,1 5 | 21.4,6,258,110,3.08,3.215,19.44,1,0,3,1 6 | 18.7,8,360,175,3.15,3.44,17.02,0,0,3,2 7 | 18.1,6,225,105,2.76,3.46,20.22,1,0,3,1 8 | 14.3,8,360,245,3.21,3.57,15.84,0,0,3,4 9 | 24.4,4,146.7,62,3.69,3.19,20,1,0,4,2 10 | 22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2 11 | 19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4 12 | 17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4 13 | 16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3 14 | 17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3 15 | 15.2,8,275.8,180,3.07,3.78,18,0,0,3,3 16 | 10.4,8,472,205,2.93,5.25,17.98,0,0,3,4 17 | 10.4,8,460,215,3,5.424,17.82,0,0,3,4 18 | 14.7,8,440,230,3.23,5.345,17.42,0,0,3,4 19 | 32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1 20 | 30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2 21 | 33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1 22 | 21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1 23 | 15.5,8,318,150,2.76,3.52,16.87,0,0,3,2 24 | 15.2,8,304,150,3.15,3.435,17.3,0,0,3,2 25 | 13.3,8,350,245,3.73,3.84,15.41,0,0,3,4 26 | 19.2,8,400,175,3.08,3.845,17.05,0,0,3,2 27 | 27.3,4,79,66,4.08,1.935,18.9,1,1,4,1 28 | 26,4,120.3,91,4.43,2.14,16.7,0,1,5,2 29 | 30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2 30 | 15.8,8,351,264,4.22,3.17,14.5,0,1,5,4 31 | 19.7,6,145,175,3.62,2.77,15.5,0,1,5,6 32 | 15,8,301,335,3.54,3.57,14.6,0,1,5,8 33 | 21.4,4,121,109,4.11,2.78,18.6,1,1,4,2 34 | -------------------------------------------------------------------------------- /man/Tokenizers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenizer.R 3 | \name{Tokenizers} 4 | \alias{Tokenizers} 5 | \alias{tokenizer_delim} 6 | \alias{tokenizer_csv} 7 | \alias{tokenizer_tsv} 8 | \alias{tokenizer_line} 9 | \alias{tokenizer_log} 10 | \alias{tokenizer_fwf} 11 | \alias{tokenizer_ws} 12 | \title{Tokenizers.} 13 | \usage{ 14 | tokenizer_delim( 15 | delim, 16 | quote = "\\"", 17 | na = "NA", 18 | quoted_na = TRUE, 19 | comment = "", 20 | trim_ws = TRUE, 21 | escape_double = TRUE, 22 | escape_backslash = FALSE, 23 | skip_empty_rows = TRUE 24 | ) 25 | 26 | tokenizer_csv( 27 | na = "NA", 28 | quoted_na = TRUE, 29 | quote = "\\"", 30 | comment = "", 31 | trim_ws = TRUE, 32 | skip_empty_rows = TRUE 33 | ) 34 | 35 | tokenizer_tsv( 36 | na = "NA", 37 | quoted_na = TRUE, 38 | quote = "\\"", 39 | comment = "", 40 | trim_ws = TRUE, 41 | skip_empty_rows = TRUE 42 | ) 43 | 44 | tokenizer_line(na = character(), skip_empty_rows = TRUE) 45 | 46 | tokenizer_log(trim_ws) 47 | 48 | tokenizer_fwf( 49 | begin, 50 | end, 51 | na = "NA", 52 | comment = "", 53 | trim_ws = TRUE, 54 | skip_empty_rows = TRUE 55 | ) 56 | 57 | tokenizer_ws(na = "NA", comment = "", skip_empty_rows = TRUE) 58 | } 59 | \arguments{ 60 | \item{delim}{Single character used to separate fields within a record.} 61 | 62 | \item{quote}{Single character used to quote strings.} 63 | 64 | \item{na}{Character vector of strings to interpret as missing values. 
Set this 65 | option to \code{character()} to indicate no missing values.} 66 | 67 | \item{quoted_na}{Should missing values inside quotes be treated as missing 68 | values (the default) or strings.} 69 | 70 | \item{comment}{A string used to identify comments. Any text after the 71 | comment characters will be silently ignored.} 72 | 73 | \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from 74 | each field before parsing it?} 75 | 76 | \item{escape_double}{Does the file escape quotes by doubling them? 77 | i.e. If this option is \code{TRUE}, the value \verb{""""} represents 78 | a single quote, \verb{\\"}.} 79 | 80 | \item{escape_backslash}{Does the file use backslashes to escape special 81 | characters? This is more general than \code{escape_double} as backslashes 82 | can be used to escape the delimiter character, the quote character, or 83 | to add special characters like \verb{\\\\n}.} 84 | 85 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this 86 | option is \code{TRUE} then blank rows will not be represented at all. If it is 87 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.} 88 | 89 | \item{begin, end}{Begin and end offsets for each file. These are C++ 90 | offsets so the first column is column zero, and the ranges are 91 | [begin, end) (i.e. inclusive-exclusive).} 92 | } 93 | \value{ 94 | A tokenizer object 95 | } 96 | \description{ 97 | Explicitly create tokenizer objects. Usually you will not call these 98 | functions, but will instead use one of the user-friendly wrappers like 99 | \code{\link[readr:read_delim]{readr::read_csv()}}. 100 | } 101 | \examples{ 102 | tokenizer_csv() 103 | tokenizer_delim(",") 104 | } 105 | \keyword{internal} 106 | -------------------------------------------------------------------------------- /man/clipboard.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/source.R 3 | \name{clipboard} 4 | \alias{clipboard} 5 | \title{Returns values from the clipboard} 6 | \usage{ 7 | clipboard() 8 | } 9 | \description{ 10 | This is useful in the \code{\link[readr:read_delim]{readr::read_delim()}} functions to read from the clipboard. 11 | } 12 | \examples{ 13 | \dontrun{ 14 | clipboard() 15 | } 16 | } 17 | \seealso{ 18 | readr::read_delim 19 | } 20 | -------------------------------------------------------------------------------- /man/datasource.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/source.R 3 | \name{datasource} 4 | \alias{datasource} 5 | \title{Create a source object.} 6 | \usage{ 7 | datasource( 8 | file, 9 | skip = 0, 10 | skip_empty_rows = FALSE, 11 | comment = "", 12 | skip_quote = TRUE 13 | ) 14 | } 15 | \arguments{ 16 | \item{file}{Either a path to a file, a connection, or literal data 17 | (either a single string or a raw vector). 18 | 19 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will 20 | be automatically uncompressed. Files starting with \verb{http://}, 21 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically 22 | downloaded. Remote gz files can also be automatically downloaded and 23 | decompressed. 24 | 25 | Literal data is most useful for examples and tests.
It must contain at 26 | least one new line to be recognised as data (instead of a path) or be a 27 | vector of length greater than 1. 28 | 29 | Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} 30 | 31 | \item{skip}{Number of lines to skip before reading data.} 32 | } 33 | \value{ 34 | A source object 35 | } 36 | \description{ 37 | Create a source object. 38 | } 39 | \examples{ 40 | # Literal csv 41 | datasource("a,b,c\n1,2,3") 42 | datasource(charToRaw("a,b,c\n1,2,3")) 43 | 44 | # Strings 45 | datasource(meltr_example("mtcars.csv")) 46 | \dontrun{ 47 | datasource("https://github.com/tidyverse/readr/raw/master/inst/extdata/mtcars.csv") 48 | } 49 | 50 | # Connection 51 | con <- rawConnection(charToRaw("abc\n123")) 52 | datasource(con) 53 | close(con) 54 | } 55 | \keyword{internal} 56 | -------------------------------------------------------------------------------- /man/date_names.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/date-symbols.R 3 | \name{date_names} 4 | \alias{date_names} 5 | \alias{date_names_lang} 6 | \alias{date_names_langs} 7 | \title{Create or retrieve date names} 8 | \usage{ 9 | date_names(mon, mon_ab = mon, day, day_ab = day, am_pm = c("AM", "PM")) 10 | 11 | date_names_lang(language) 12 | 13 | date_names_langs() 14 | } 15 | \arguments{ 16 | \item{mon, mon_ab}{Full and abbreviated month names.} 17 | 18 | \item{day, day_ab}{Full and abbreviated week day names. Starts with Sunday.} 19 | 20 | \item{am_pm}{Names used for AM and PM.} 21 | 22 | \item{language}{A BCP 47 locale, made up of a language and a region, 23 | e.g. \code{"en_US"} for American English. See \code{date_names_langs()} 24 | for a complete list of available locales.} 25 | } 26 | \value{ 27 | A date names object 28 | } 29 | \description{ 30 | When parsing dates, you often need to know how days of the week and 31 | months are represented as text. This pair of functions allows you to either 32 | create your own, or retrieve from a standard list. The standard list is 33 | derived from ICU (\url{https://icu.unicode.org/}) via the stringi package. 34 | } 35 | \examples{ 36 | date_names(mon = LETTERS[1:12], day = letters[1:7]) 37 | date_names_lang("en") 38 | date_names_lang("ko") 39 | date_names_lang("fr") 40 | } 41 | -------------------------------------------------------------------------------- /man/locale.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/locale.R 3 | \name{locale} 4 | \alias{locale} 5 | \alias{default_locale} 6 | \title{Create locales} 7 | \usage{ 8 | locale( 9 | date_names = "en", 10 | date_format = "\%AD", 11 | time_format = "\%AT", 12 | decimal_mark = ".", 13 | grouping_mark = ",", 14 | tz = "UTC", 15 | encoding = "UTF-8" 16 | ) 17 | 18 | default_locale() 19 | } 20 | \arguments{ 21 | \item{date_names}{Character representations of day and month names. Either 22 | the language code as string (passed on to \code{\link[=date_names_lang]{date_names_lang()}}) 23 | or an object created by \code{\link[=date_names]{date_names()}}.} 24 | 25 | \item{date_format, time_format}{Default date and time formats.} 26 | 27 | \item{decimal_mark, grouping_mark}{Symbols used to indicate the decimal 28 | place, and to chunk larger numbers. Decimal mark can only be \verb{,} or 29 | \code{.}.} 30 | 31 | \item{tz}{Default tz.
This is used both for input (if the time zone isn't 32 | present in individual strings), and for output (to control the default 33 | display). The default is to use "UTC", a time zone that does not use 34 | daylight savings time (DST) and hence is typically most useful for data. 35 | The absence of time zones makes it approximately 50x faster to generate 36 | UTC times than any other time zone. 37 | 38 | Use \code{""} to use the system default time zone, but beware that this 39 | will not be reproducible across systems. 40 | 41 | For a complete list of possible time zones, see \code{\link[=OlsonNames]{OlsonNames()}}. 42 | Americans, note that "EST" is a Canadian time zone that does not have 43 | DST. It is \emph{not} Eastern Standard Time. It's better to use 44 | "US/Eastern", "US/Central" etc.} 45 | 46 | \item{encoding}{Default encoding. This only affects how the file is 47 | read - meltr always converts the output to UTF-8.} 48 | } 49 | \value{ 50 | A locale object 51 | } 52 | \description{ 53 | A locale object tries to capture all the defaults that can vary between 54 | countries. You set the locale once, and the details are automatically 55 | passed down to the column parsers. The defaults have been chosen to 56 | match R (i.e. US English) as closely as possible. See 57 | \code{vignette("locales")} for more details. 58 | } 59 | \examples{ 60 | locale() 61 | locale("fr") 62 | 63 | # South American locale 64 | locale("es", decimal_mark = ",") 65 | } 66 | -------------------------------------------------------------------------------- /man/melt_delim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/melt_delim.R 3 | \name{melt_delim} 4 | \alias{melt_delim} 5 | \alias{melt_csv} 6 | \alias{melt_csv2} 7 | \alias{melt_tsv} 8 | \title{Return melted data for each token in a delimited file (including csv & tsv)} 9 | \usage{ 10 | melt_delim( 11 | file, 12 | delim, 13 | quote = "\\"", 14 | escape_backslash = FALSE, 15 | escape_double = TRUE, 16 | locale = default_locale(), 17 | na = c("", "NA"), 18 | quoted_na = TRUE, 19 | comment = "", 20 | trim_ws = FALSE, 21 | skip = 0, 22 | n_max = Inf, 23 | progress = show_progress(), 24 | skip_empty_rows = FALSE 25 | ) 26 | 27 | melt_csv( 28 | file, 29 | locale = default_locale(), 30 | na = c("", "NA"), 31 | quoted_na = TRUE, 32 | quote = "\\"", 33 | comment = "", 34 | trim_ws = TRUE, 35 | skip = 0, 36 | n_max = Inf, 37 | progress = show_progress(), 38 | skip_empty_rows = FALSE 39 | ) 40 | 41 | melt_csv2( 42 | file, 43 | locale = default_locale(), 44 | na = c("", "NA"), 45 | quoted_na = TRUE, 46 | quote = "\\"", 47 | comment = "", 48 | trim_ws = TRUE, 49 | skip = 0, 50 | n_max = Inf, 51 | progress = show_progress(), 52 | skip_empty_rows = FALSE 53 | ) 54 | 55 | melt_tsv( 56 | file, 57 | locale = default_locale(), 58 | na = c("", "NA"), 59 | quoted_na = TRUE, 60 | quote = "\\"", 61 | comment = "", 62 | trim_ws = TRUE, 63 | skip = 0, 64 | n_max = Inf, 65 | progress = show_progress(), 66 | skip_empty_rows = FALSE 67 | ) 68 | } 69 | \arguments{ 70 | \item{file}{Either a path to a file, a connection, or literal data 71 | (either a single string or a raw vector). 72 | 73 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will 74 | be automatically uncompressed. Files starting with \verb{http://}, 75 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically 76 | downloaded.
Remote gz files can also be automatically downloaded and 77 | decompressed. 78 | 79 | Literal data is most useful for examples and tests. To be recognised as 80 | literal data, the input must be either wrapped with \code{I()}, be a string 81 | containing at least one new line, or be a vector containing at least one 82 | string with a new line. 83 | 84 | Using a value of \code{\link[readr:clipboard]{clipboard()}} will read from the system clipboard.} 85 | 86 | \item{delim}{Single character used to separate fields within a record.} 87 | 88 | \item{quote}{Single character used to quote strings.} 89 | 90 | \item{escape_backslash}{Does the file use backslashes to escape special 91 | characters? This is more general than \code{escape_double} as backslashes 92 | can be used to escape the delimiter character, the quote character, or 93 | to add special characters like \verb{\\\\n}.} 94 | 95 | \item{escape_double}{Does the file escape quotes by doubling them? 96 | i.e. If this option is \code{TRUE}, the value \verb{""""} represents 97 | a single quote, \verb{\\"}.} 98 | 99 | \item{locale}{The locale controls defaults that vary from place to place. 100 | The default locale is US-centric (like R), but you can use 101 | \code{\link[readr:locale]{locale()}} to create your own locale that controls things like 102 | the default time zone, encoding, decimal mark, big mark, and day/month 103 | names.} 104 | 105 | \item{na}{Character vector of strings to interpret as missing values. Set this 106 | option to \code{character()} to indicate no missing values.} 107 | 108 | \item{quoted_na}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Should missing values 109 | inside quotes be treated as missing values (the default) or strings. This 110 | parameter is soft deprecated as of readr 2.0.0.} 111 | 112 | \item{comment}{A string used to identify comments. Any text after the 113 | comment characters will be silently ignored.} 114 | 115 | \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from 116 | each field before parsing it?} 117 | 118 | \item{skip}{Number of lines to skip before reading data. If \code{comment} is 119 | supplied any commented lines are ignored \emph{after} skipping.} 120 | 121 | \item{n_max}{Maximum number of lines to read.} 122 | 123 | \item{progress}{Display a progress bar? By default it will only display 124 | in an interactive session and not while knitting a document. The automatic 125 | progress bar can be disabled by setting option \code{readr.show_progress} to 126 | \code{FALSE}.} 127 | 128 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this 129 | option is \code{TRUE} then blank rows will not be represented at all. If it is 130 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.} 131 | } 132 | \value{ 133 | A \code{\link[=tibble]{tibble()}} of four columns: 134 | \itemize{ 135 | \item \code{row}, the row that the token comes from in the original file 136 | \item \code{col}, the column that the token comes from in the original file 137 | \item \code{data_type}, the data type of the token, e.g. \code{"integer"}, \code{"character"}, 138 | \code{"date"}, guessed in a similar way to the \code{guess_parser()} function. 139 | \item \code{value}, the token itself as a character string, unchanged from its 140 | representation in the original file. 
141 | } 142 | 143 | If there are parsing problems, a warning tells you 144 | how many, and you can retrieve the details with \code{\link[=problems]{problems()}}. 145 | } 146 | \description{ 147 | For certain non-rectangular data formats, it can be useful to parse the data 148 | into a melted format where each row represents a single token. 149 | } 150 | \details{ 151 | \code{melt_csv()} and \code{melt_tsv()} are special cases of the general 152 | \code{melt_delim()}. They're useful for reading the most common types of 153 | flat file data, comma separated values and tab separated values, 154 | respectively. \code{melt_csv2()} uses \verb{;} for the field separator and \verb{,} for the 155 | decimal point. This is common in some European countries. 156 | } 157 | \examples{ 158 | # Input sources ------------------------------------------------------------- 159 | # Read from a path 160 | melt_csv(meltr_example("mtcars.csv")) 161 | \dontrun{ 162 | melt_csv("https://github.com/tidyverse/readr/raw/master/inst/extdata/mtcars.csv") 163 | } 164 | 165 | # Or directly from a string (must contain a newline) 166 | melt_csv("x,y\n1,2\n3,4") 167 | 168 | # To import empty cells as 'empty' rather than `NA` 169 | melt_csv("x,y\n,NA,\"\",''", na = "NA") 170 | 171 | # File types ---------------------------------------------------------------- 172 | melt_csv("a,b\n1.0,2.0") 173 | melt_csv2("a;b\n1,0;2,0") 174 | melt_tsv("a\tb\n1.0\t2.0") 175 | melt_delim("a|b\n1.0|2.0", delim = "|") 176 | } 177 | \seealso{ 178 | \code{\link[readr:read_delim]{readr::read_delim()}} for the conventional way to read rectangular data 179 | from delimited files. 180 | } 181 | -------------------------------------------------------------------------------- /man/melt_delim_chunked.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/melt_delim_chunked.R 3 | \name{melt_delim_chunked} 4 | \alias{melt_delim_chunked} 5 | \alias{melt_csv_chunked} 6 | \alias{melt_csv2_chunked} 7 | \alias{melt_tsv_chunked} 8 | \title{Melt a delimited file by chunks} 9 | \usage{ 10 | melt_delim_chunked( 11 | file, 12 | callback, 13 | chunk_size = 10000, 14 | delim, 15 | quote = "\\"", 16 | escape_backslash = FALSE, 17 | escape_double = TRUE, 18 | locale = default_locale(), 19 | na = c("", "NA"), 20 | quoted_na = TRUE, 21 | comment = "", 22 | trim_ws = FALSE, 23 | skip = 0, 24 | progress = show_progress(), 25 | skip_empty_rows = FALSE 26 | ) 27 | 28 | melt_csv_chunked( 29 | file, 30 | callback, 31 | chunk_size = 10000, 32 | locale = default_locale(), 33 | na = c("", "NA"), 34 | quoted_na = TRUE, 35 | quote = "\\"", 36 | comment = "", 37 | trim_ws = TRUE, 38 | skip = 0, 39 | progress = show_progress(), 40 | skip_empty_rows = FALSE 41 | ) 42 | 43 | melt_csv2_chunked( 44 | file, 45 | callback, 46 | chunk_size = 10000, 47 | locale = default_locale(), 48 | na = c("", "NA"), 49 | quoted_na = TRUE, 50 | quote = "\\"", 51 | comment = "", 52 | trim_ws = TRUE, 53 | skip = 0, 54 | progress = show_progress(), 55 | skip_empty_rows = FALSE 56 | ) 57 | 58 | melt_tsv_chunked( 59 | file, 60 | callback, 61 | chunk_size = 10000, 62 | locale = default_locale(), 63 | na = c("", "NA"), 64 | quoted_na = TRUE, 65 | quote = "\\"", 66 | comment = "", 67 | trim_ws = TRUE, 68 | skip = 0, 69 | progress = show_progress(), 70 | skip_empty_rows = FALSE 71 | ) 72 | } 73 | \arguments{ 74 | \item{file}{Either a path to a file, a connection, or literal data 75 | (either a single 
string or a raw vector). 76 | 77 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will 78 | be automatically uncompressed. Files starting with \verb{http://}, 79 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically 80 | downloaded. Remote gz files can also be automatically downloaded and 81 | decompressed. 82 | 83 | Literal data is most useful for examples and tests. To be recognised as 84 | literal data, the input must be either wrapped with \code{I()}, be a string 85 | containing at least one new line, or be a vector containing at least one 86 | string with a new line. 87 | 88 | Using a value of \code{\link[readr:clipboard]{clipboard()}} will read from the system clipboard.} 89 | 90 | \item{callback}{A callback function to call on each chunk} 91 | 92 | \item{chunk_size}{The number of rows to include in each chunk} 93 | 94 | \item{delim}{Single character used to separate fields within a record.} 95 | 96 | \item{quote}{Single character used to quote strings.} 97 | 98 | \item{escape_backslash}{Does the file use backslashes to escape special 99 | characters? This is more general than \code{escape_double} as backslashes 100 | can be used to escape the delimiter character, the quote character, or 101 | to add special characters like \verb{\\\\n}.} 102 | 103 | \item{escape_double}{Does the file escape quotes by doubling them? 104 | i.e. If this option is \code{TRUE}, the value \verb{""""} represents 105 | a single quote, \verb{\\"}.} 106 | 107 | \item{locale}{The locale controls defaults that vary from place to place. 108 | The default locale is US-centric (like R), but you can use 109 | \code{\link[readr:locale]{locale()}} to create your own locale that controls things like 110 | the default time zone, encoding, decimal mark, big mark, and day/month 111 | names.} 112 | 113 | \item{na}{Character vector of strings to interpret as missing values. Set this 114 | option to \code{character()} to indicate no missing values.} 115 | 116 | \item{quoted_na}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Should missing values 117 | inside quotes be treated as missing values (the default) or strings. This 118 | parameter is soft deprecated as of readr 2.0.0.} 119 | 120 | \item{comment}{A string used to identify comments. Any text after the 121 | comment characters will be silently ignored.} 122 | 123 | \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from 124 | each field before parsing it?} 125 | 126 | \item{skip}{Number of lines to skip before reading data. If \code{comment} is 127 | supplied any commented lines are ignored \emph{after} skipping.} 128 | 129 | \item{progress}{Display a progress bar? By default it will only display 130 | in an interactive session and not while knitting a document. The automatic 131 | progress bar can be disabled by setting option \code{readr.show_progress} to 132 | \code{FALSE}.} 133 | 134 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this 135 | option is \code{TRUE} then blank rows will not be represented at all. 
If it is 136 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.} 137 | } 138 | \value{ 139 | A \code{\link[=tibble]{tibble()}} of four columns: 140 | \itemize{ 141 | \item \code{row}, the row that the token comes from in the original file 142 | \item \code{col}, the column that the token comes from in the original file 143 | \item \code{data_type}, the data type of the token, e.g. \code{"integer"}, \code{"character"}, 144 | \code{"date"}, guessed in a similar way to the \code{guess_parser()} function. 145 | \item \code{value}, the token itself as a character string, unchanged from its 146 | representation in the original file. 147 | } 148 | 149 | If there are parsing problems, a warning tells you 150 | how many, and you can retrieve the details with \code{\link[=problems]{problems()}}. 151 | } 152 | \description{ 153 | For certain non-rectangular data formats, it can be useful to parse the data 154 | into a melted format where each row represents a single token. 155 | } 156 | \details{ 157 | \code{melt_delim_chunked()} and the specialisations \code{melt_csv_chunked()}, 158 | \code{melt_csv2_chunked()} and \code{melt_tsv_chunked()} read files by a chunk of rows 159 | at a time, executing a given function on one chunk before reading the next. 160 | } 161 | \examples{ 162 | # Keep just the integer tokens from each chunk 163 | f <- function(x, pos) subset(x, data_type == "integer") 164 | melt_csv_chunked(meltr_example("mtcars.csv"), DataFrameCallback$new(f), chunk_size = 5) 165 | } 166 | \seealso{ 167 | Other chunked: 168 | \code{\link{callback}} 169 | } 170 | \concept{chunked} 171 | \keyword{internal} 172 | -------------------------------------------------------------------------------- /man/melt_fwf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/melt_fwf.R 3 | \name{melt_fwf} 4 | \alias{melt_fwf} 5 | \alias{fwf_empty} 6 | \alias{fwf_widths} 7 | \alias{fwf_positions} 8 | \alias{fwf_cols} 9 | \title{Return melted data for each token in a fixed width file} 10 | \usage{ 11 | melt_fwf( 12 | file, 13 | col_positions, 14 | locale = default_locale(), 15 | na = c("", "NA"), 16 | comment = "", 17 | trim_ws = TRUE, 18 | skip = 0, 19 | n_max = Inf, 20 | progress = show_progress(), 21 | skip_empty_rows = FALSE 22 | ) 23 | 24 | fwf_empty( 25 | file, 26 | skip = 0, 27 | skip_empty_rows = FALSE, 28 | col_names = NULL, 29 | comment = "", 30 | n = 100L 31 | ) 32 | 33 | fwf_widths(widths, col_names = NULL) 34 | 35 | fwf_positions(start, end = NULL, col_names = NULL) 36 | 37 | fwf_cols(...) 38 | } 39 | \arguments{ 40 | \item{file}{Either a path to a file, a connection, or literal data 41 | (either a single string or a raw vector). 42 | 43 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will 44 | be automatically uncompressed. Files starting with \verb{http://}, 45 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically 46 | downloaded. Remote gz files can also be automatically downloaded and 47 | decompressed. 48 | 49 | Literal data is most useful for examples and tests. To be recognised as 50 | literal data, the input must be either wrapped with \code{I()}, be a string 51 | containing at least one new line, or be a vector containing at least one 52 | string with a new line.
53 | 54 | Using a value of \code{\link[readr:clipboard]{clipboard()}} will read from the system clipboard.} 55 | 56 | \item{col_positions}{Column positions, as created by \code{\link[=fwf_empty]{fwf_empty()}}, 57 | \code{\link[=fwf_widths]{fwf_widths()}} or \code{\link[=fwf_positions]{fwf_positions()}}. To read in only selected fields, 58 | use \code{\link[=fwf_positions]{fwf_positions()}}. If the width of the last column is variable (a 59 | ragged fwf file), supply the last end position as NA.} 60 | 61 | \item{locale}{The locale controls defaults that vary from place to place. 62 | The default locale is US-centric (like R), but you can use 63 | \code{\link[readr:locale]{locale()}} to create your own locale that controls things like 64 | the default time zone, encoding, decimal mark, big mark, and day/month 65 | names.} 66 | 67 | \item{na}{Character vector of strings to interpret as missing values. Set this 68 | option to \code{character()} to indicate no missing values.} 69 | 70 | \item{comment}{A string used to identify comments. Any text after the 71 | comment characters will be silently ignored.} 72 | 73 | \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from 74 | each field before parsing it?} 75 | 76 | \item{skip}{Number of lines to skip before reading data.} 77 | 78 | \item{n_max}{Maximum number of lines to read.} 79 | 80 | \item{progress}{Display a progress bar? By default it will only display 81 | in an interactive session and not while knitting a document. The automatic 82 | progress bar can be disabled by setting option \code{readr.show_progress} to 83 | \code{FALSE}.} 84 | 85 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this 86 | option is \code{TRUE} then blank rows will not be represented at all. If it is 87 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.} 88 | 89 | \item{col_names}{Either NULL, or a character vector of column names.} 90 | 91 | \item{n}{Number of lines the tokenizer will read to determine file structure. By default 92 | it is set to 100.} 93 | 94 | \item{widths}{Width of each field. Use NA as width of last field when 95 | reading a ragged fwf file.} 96 | 97 | \item{start, end}{Starting and ending (inclusive) positions of each field. 98 | Use NA as last end field when reading a ragged fwf file.} 99 | 100 | \item{...}{If the first element is a data frame, 101 | then it must have all numeric columns and either one or two rows. 102 | The column names are the variable names. The column values are the 103 | variable widths if a length one vector, and if length two, variable start and end 104 | positions. The elements of \code{...} are used to construct a data frame 105 | with one or two rows as above.} 106 | } 107 | \value{ 108 | A \code{\link[=tibble]{tibble()}} of four columns: 109 | \itemize{ 110 | \item \code{row}, the row that the token comes from in the original file 111 | \item \code{col}, the column that the token comes from in the original file 112 | \item \code{data_type}, the data type of the token, e.g. \code{"integer"}, \code{"character"}, 113 | \code{"date"}, guessed in a similar way to the \code{guess_parser()} function. 114 | \item \code{value}, the token itself as a character string, unchanged from its 115 | representation in the original file. 116 | } 117 | 118 | If there are parsing problems, a warning tells you 119 | how many, and you can retrieve the details with \code{\link[=problems]{problems()}}.
120 | } 121 | \description{ 122 | For certain non-rectangular data formats, it can be useful to parse the data 123 | into a melted format where each row represents a single token. 124 | } 125 | \details{ 126 | \code{melt_fwf()} parses each token of a fixed width file into a single row, but 127 | it still requires that each field is in the same position in every row of the 128 | source file. 129 | } 130 | \examples{ 131 | fwf_sample <- meltr_example("fwf-sample.txt") 132 | writeLines(readLines(fwf_sample)) 133 | 134 | # You can specify column positions in several ways: 135 | # 1. Guess based on position of empty columns 136 | melt_fwf(fwf_sample, fwf_empty(fwf_sample, col_names = c("first", "last", "state", "ssn"))) 137 | # 2. A vector of field widths 138 | melt_fwf(fwf_sample, fwf_widths(c(20, 10, 12), c("name", "state", "ssn"))) 139 | # 3. Paired vectors of start and end positions 140 | melt_fwf(fwf_sample, fwf_positions(c(1, 30), c(10, 42), c("name", "ssn"))) 141 | # 4. Named arguments with start and end positions 142 | melt_fwf(fwf_sample, fwf_cols(name = c(1, 10), ssn = c(30, 42))) 143 | # 5. Named arguments with column widths 144 | melt_fwf(fwf_sample, fwf_cols(name = 20, state = 10, ssn = 12)) 145 | } 146 | \seealso{ 147 | \code{\link[=melt_table]{melt_table()}} to melt fixed width files where each 148 | column is separated by whitespace, and \code{\link[readr:read_fwf]{readr::read_fwf()}} for the conventional 149 | way to read rectangular data from fixed width files. 150 | } 151 | -------------------------------------------------------------------------------- /man/melt_table.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/melt_table.R 3 | \name{melt_table} 4 | \alias{melt_table} 5 | \alias{melt_table2} 6 | \title{Return melted data for each token in a whitespace-separated file} 7 | \usage{ 8 | melt_table( 9 | file, 10 | locale = default_locale(), 11 | na = "NA", 12 | skip = 0, 13 | n_max = Inf, 14 | guess_max = min(n_max, 1000), 15 | progress = show_progress(), 16 | comment = "", 17 | skip_empty_rows = FALSE 18 | ) 19 | 20 | melt_table2( 21 | file, 22 | locale = default_locale(), 23 | na = "NA", 24 | skip = 0, 25 | n_max = Inf, 26 | progress = show_progress(), 27 | comment = "", 28 | skip_empty_rows = FALSE 29 | ) 30 | } 31 | \arguments{ 32 | \item{file}{Either a path to a file, a connection, or literal data 33 | (either a single string or a raw vector). 34 | 35 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will 36 | be automatically uncompressed. Files starting with \verb{http://}, 37 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically 38 | downloaded. Remote gz files can also be automatically downloaded and 39 | decompressed. 40 | 41 | Literal data is most useful for examples and tests. To be recognised as 42 | literal data, the input must be either wrapped with \code{I()}, be a string 43 | containing at least one new line, or be a vector containing at least one 44 | string with a new line. 45 | 46 | Using a value of \code{\link[readr:clipboard]{clipboard()}} will read from the system clipboard.} 47 | 48 | \item{locale}{The locale controls defaults that vary from place to place.
49 | The default locale is US-centric (like R), but you can use 50 | \code{\link[readr:locale]{locale()}} to create your own locale that controls things like 51 | the default time zone, encoding, decimal mark, big mark, and day/month 52 | names.} 53 | 54 | \item{na}{Character vector of strings to interpret as missing values. Set this 55 | option to \code{character()} to indicate no missing values.} 56 | 57 | \item{skip}{Number of lines to skip before reading data.} 58 | 59 | \item{n_max}{Maximum number of lines to read.} 60 | 61 | \item{guess_max}{Maximum number of lines to use for guessing column types. 62 | See \code{vignette("column-types", package = "readr")} for more details.} 63 | 64 | \item{progress}{Display a progress bar? By default it will only display 65 | in an interactive session and not while knitting a document. The automatic 66 | progress bar can be disabled by setting option \code{readr.show_progress} to 67 | \code{FALSE}.} 68 | 69 | \item{comment}{A string used to identify comments. Any text after the 70 | comment characters will be silently ignored.} 71 | 72 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this 73 | option is \code{TRUE} then blank rows will not be represented at all. If it is 74 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.} 75 | } 76 | \value{ 77 | A \code{\link[=tibble]{tibble()}} of four columns: 78 | \itemize{ 79 | \item \code{row}, the row that the token comes from in the original file 80 | \item \code{col}, the column that the token comes from in the original file 81 | \item \code{data_type}, the data type of the token, e.g. \code{"integer"}, \code{"character"}, 82 | \code{"date"}, guessed in a similar way to the \code{guess_parser()} function. 83 | \item \code{value}, the token itself as a character string, unchanged from its 84 | representation in the original file. 85 | } 86 | 87 | If there are parsing problems, a warning tells you 88 | how many, and you can retrieve the details with \code{\link[=problems]{problems()}}. 89 | } 90 | \description{ 91 | For certain non-rectangular data formats, it can be useful to parse the data 92 | into a melted format where each row represents a single token. 93 | 94 | \code{melt_table()} and \code{melt_table2()} are designed to read the type of textual 95 | data where each column is separated by one (or more) columns of space. 96 | 97 | \code{melt_table2()} allows any number of whitespace characters between columns, 98 | and the lines can be of different lengths. 99 | 100 | \code{melt_table()} is more strict: each line must be the same length, 101 | and each field is in the same position in every line. It first finds empty 102 | columns and then parses like a fixed width file. 103 | } 104 | \examples{ 105 | # One corner from http://www.masseyratings.com/cf/compare.htm 106 | massey <- meltr_example("massey-rating.txt") 107 | cat(readLines(massey)) 108 | melt_table(massey) 109 | 110 | # Sample of 1978 fuel economy data from 111 | # http://www.fueleconomy.gov/feg/epadata/78data.zip 112 | epa <- meltr_example("epa78.txt") 113 | writeLines(readLines(epa)) 114 | melt_table(epa) 115 | } 116 | \seealso{ 117 | \code{\link[=melt_fwf]{melt_fwf()}} to melt fixed width files where each column 118 | is not separated by whitespace. \code{melt_fwf()} is also useful for reading 119 | tabular data with non-standard formatting. \code{\link[readr:read_table]{readr::read_table()}} is the 120 | conventional way to read tabular data from whitespace-separated files.
121 | } 122 | -------------------------------------------------------------------------------- /man/meltr_example.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/example.R 3 | \name{meltr_example} 4 | \alias{meltr_example} 5 | \title{Get path to meltr example} 6 | \usage{ 7 | meltr_example(file = NULL) 8 | } 9 | \arguments{ 10 | \item{file}{Name of file. If \code{NULL}, the example files will be listed.} 11 | } 12 | \value{ 13 | A file path or a vector of file names 14 | } 15 | \description{ 16 | meltr comes bundled with a number of sample files in its \code{inst/extdata} 17 | directory. This function makes them easy to access. 18 | } 19 | \examples{ 20 | meltr_example() 21 | meltr_example("mtcars.csv") 22 | } 23 | -------------------------------------------------------------------------------- /man/problems.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/problems.R 3 | \name{problems} 4 | \alias{problems} 5 | \alias{stop_for_problems} 6 | \title{Retrieve parsing problems} 7 | \usage{ 8 | problems(x = .Last.value) 9 | 10 | stop_for_problems(x) 11 | } 12 | \arguments{ 13 | \item{x}{A data frame (from \verb{read_*()}) or a vector 14 | (from \verb{parse_*()}).} 15 | } 16 | \value{ 17 | A data frame with one row for each problem and four columns: 18 | \item{row,col}{Row and column of problem} 19 | \item{expected}{What readr expected to find} 20 | \item{actual}{What it actually got} 21 | } 22 | \description{ 23 | Readr functions will only throw an error if parsing fails in an unrecoverable 24 | way. However, there are lots of potential problems that you might want to 25 | know about - these are stored in the \code{problems} attribute of the 26 | output, which you can easily access with this function. 27 | \code{stop_for_problems()} will throw an error if there are any parsing 28 | problems: this is useful for automated scripts where you want to throw 29 | an error as soon as you encounter a problem. 30 | } 31 | \examples{ 32 | if (requireNamespace("readr")) { 33 | x <- readr::parse_integer(c("1X", "blah", "3")) 34 | problems(x) 35 | 36 | y <- readr::parse_integer(c("1", "2", "3")) 37 | problems(y) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /man/show_progress.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{show_progress} 4 | \alias{show_progress} 5 | \title{Determine whether progress bars should be shown} 6 | \usage{ 7 | show_progress() 8 | } 9 | \value{ 10 | A logical value 11 | } 12 | \description{ 13 | Progress bars are shown \emph{unless} one of the following is \code{TRUE} 14 | \itemize{ 15 | \item The bar is explicitly disabled by setting \code{options(readr.show_progress = FALSE)} 16 | \item The code is run in a non-interactive session (\code{interactive()} is \code{FALSE}). 17 | \item The code is run in an RStudio notebook chunk. 18 | \item The code is run by knitr / rmarkdown.
19 | } 20 | } 21 | \examples{ 22 | show_progress() 23 | } 24 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /src/Collector.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/list.hpp" 2 | 3 | #include "Collector.h" 4 | #include "LocaleInfo.h" 5 | #include "QiParsers.h" 6 | #include "utils.h" 7 | 8 | CollectorPtr Collector::create(const cpp11::list& spec, LocaleInfo* pLocale) { 9 | std::string subclass(cpp11::as_cpp<cpp11::strings>(spec.attr("class"))[0]); 10 | 11 | 12 | if (subclass == "collector_double") { 13 | return CollectorPtr(new CollectorDouble(pLocale->decimalMark_)); 14 | } 15 | if (subclass == "collector_character") { 16 | return CollectorPtr(new CollectorCharacter(&pLocale->encoder_)); 17 | } 18 | 19 | cpp11::stop("Unsupported column type '%s'", subclass.c_str()); 20 | return CollectorPtr(new CollectorSkip()); 21 | } 22 | 23 | std::vector<CollectorPtr> 24 | collectorsCreate(const cpp11::list& specs, LocaleInfo* pLocale) { 25 | std::vector<CollectorPtr> collectors; 26 | for (auto spec : specs) { 27 | CollectorPtr col(Collector::create(SEXP(spec), pLocale)); 28 | collectors.push_back(col); 29 | } 30 | 31 | return collectors; 32 | } 33 | 34 | // Implementations ------------------------------------------------------------ 35 | 36 | void CollectorCharacter::setValue(int i, const Token& t) { 37 | switch (t.type()) { 38 | case TOKEN_STRING: { 39 | std::string buffer; 40 | SourceIterators string = t.getString(&buffer); 41 | 42 | if (t.hasNull()) { 43 | warn(t.row(), t.col(), "", "embedded null"); 44 | } 45 | 46 | SET_STRING_ELT( 47 | column_, 48 | i, 49 | pEncoder_->makeSEXP(string.first, string.second, t.hasNull())); 50 | break; 51 | }; 52 | case TOKEN_MISSING: 53 | SET_STRING_ELT(column_, i, NA_STRING); 54 | break; 55 | case TOKEN_EMPTY: 56 | SET_STRING_ELT(column_, i, Rf_mkCharCE("", CE_UTF8)); 57 | break; 58 | case TOKEN_EOF: 59 | cpp11::stop("Invalid token"); 60 | } 61 | } 62 | 63 | void CollectorCharacter::setValue(int i, const std::string& s) { 64 | SET_STRING_ELT(column_, i, Rf_mkCharCE(s.c_str(), CE_UTF8)); 65 | } 66 | 67 | void CollectorDouble::setValue(int i, size_t st) { REAL(column_)[i] = st; } 68 | -------------------------------------------------------------------------------- /src/Collector.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_COLLECTOR_H_ 2 | #define MELTR_COLLECTOR_H_ 3 | 4 | #include "cpp11/doubles.hpp" 5 | #include "cpp11/integers.hpp" 6 | #include "cpp11/list.hpp" 7 | #include "cpp11/logicals.hpp" 8 | #include "cpp11/strings.hpp" 9 | 10 | #include "DateTimeParser.h" 11 | #include "Iconv.h" 12 | #include "LocaleInfo.h" 13 | #include "Token.h" 14 | #include "Warnings.h" 15 | #include <map> 16 | #include <memory> 17 | 18 | class Collector; 19 | typedef std::shared_ptr<Collector> CollectorPtr; 20 | 21 | class Collector { 22 | protected: 23 | cpp11::sexp column_; 24 | Warnings* pWarnings_; 25 | 26 | int n_; 27 | 28 | public: 29 | Collector(SEXP column, Warnings* pWarnings = NULL) 30 | : column_(column), pWarnings_(pWarnings), n_(0) {} 31 | 32 | virtual ~Collector(){}; 33 | 34 | virtual void setValue(int i, const Token& t) = 0; 35 | virtual void setValue(int /* unused */, const std::string& /* unused */){}; // nocov 36 | virtual void setValue(int /* unused */, size_t /* unused */
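/* size_t overload, implemented by CollectorDouble in Collector.cpp: writes a raw size_t value straight into a double column */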
){}; // nocov 37 | 38 | virtual cpp11::sexp vector() { return column_; }; 39 | 40 | virtual bool skip() { return false; } 41 | 42 | int size() { return n_; } 43 | 44 | void resize(int n) { 45 | if (n == n_) 46 | return; 47 | 48 | if (column_ == R_NilValue) 49 | return; 50 | 51 | #if R_VERSION >= R_Version(3, 4, 0) 52 | if (n > 0 && n < n_) { 53 | SET_TRUELENGTH(column_, n_); 54 | SETLENGTH(column_, n); 55 | SET_GROWABLE_BIT(column_); 56 | } else { 57 | column_ = Rf_lengthgets(column_, n); 58 | } 59 | #else 60 | column_ = Rf_lengthgets(column_, n); 61 | #endif 62 | 63 | n_ = n; 64 | } 65 | 66 | void clear() { resize(0); } 67 | 68 | void setWarnings(Warnings* pWarnings) { pWarnings_ = pWarnings; } 69 | 70 | inline void warn(int row, int col, std::string expected, std::string actual) { 71 | if (pWarnings_ == NULL) { 72 | cpp11::warning( 73 | "[%i, %i]: expected %s, but got '%s'", 74 | row + 1, 75 | col + 1, 76 | expected.c_str(), 77 | actual.c_str()); 78 | return; 79 | } 80 | 81 | pWarnings_->addWarning(row, col, expected, actual); 82 | } 83 | inline void 84 | warn(int row, int col, std::string expected, SourceIterators actual) { 85 | warn(row, col, expected, std::string(actual.first, actual.second)); 86 | } 87 | 88 | static CollectorPtr create(const cpp11::list& spec, LocaleInfo* pLocale); 89 | }; 90 | 91 | // Character ------------------------------------------------------------------- 92 | 93 | class CollectorCharacter : public Collector { 94 | Iconv* pEncoder_; 95 | 96 | public: 97 | CollectorCharacter(Iconv* pEncoder) 98 | : Collector(cpp11::writable::strings(R_xlen_t(0))), pEncoder_(pEncoder) {} 99 | void setValue(int i, const Token& t); 100 | void setValue(int i, const std::string& s); 101 | }; 102 | 103 | // Date ------------------------------------------------------------------------ 104 | 105 | class CollectorDate : public Collector { 106 | std::string format_; 107 | DateTimeParser parser_; 108 | 109 | public: 110 | CollectorDate(LocaleInfo* pLocale, const std::string& format) 111 | : Collector(cpp11::writable::doubles(R_xlen_t(0))), 112 | format_(format), 113 | parser_(pLocale) {} 114 | 115 | void setValue(int i, const Token& t); 116 | 117 | cpp11::sexp vector() { 118 | column_.attr("class") = "Date"; 119 | return column_; 120 | }; 121 | }; 122 | 123 | // Date time ------------------------------------------------------------------- 124 | 125 | class CollectorDateTime : public Collector { 126 | std::string format_; 127 | DateTimeParser parser_; 128 | std::string tz_; 129 | 130 | public: 131 | CollectorDateTime(LocaleInfo* pLocale, const std::string& format) 132 | : Collector(cpp11::writable::doubles(R_xlen_t(0))), 133 | format_(format), 134 | parser_(pLocale), 135 | tz_(pLocale->tz_) {} 136 | 137 | void setValue(int i, const Token& t); 138 | 139 | cpp11::sexp vector() { 140 | column_.attr("class") = {"POSIXct", "POSIXt"}; 141 | column_.attr("tzone") = tz_; 142 | return column_; 143 | }; 144 | }; 145 | 146 | class CollectorDouble : public Collector { 147 | 148 | public: 149 | CollectorDouble(char /* unused */) 150 | : Collector(cpp11::writable::doubles(R_xlen_t(0))) {} 151 | void setValue(int /* unused */, const Token& /* unused */) { /* unused */ }; 152 | void setValue(int i, size_t st); 153 | }; 154 | 155 | class CollectorFactor : public Collector { 156 | Iconv* pEncoder_; 157 | std::vector levels_; 158 | std::map levelset_; 159 | bool ordered_, implicitLevels_, includeNa_; 160 | std::string buffer_; 161 | 162 | void insert(int i, const cpp11::r_string& str, const Token& t); 163 | 
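// [Editor's sketch] CollectorFactor interns each parsed string as an integer
// code through the levels_/levelset_ members above. The insert() helper is
// defined in Collector.cpp and is not shown in this header; assuming implicit
// levels, the core interning pattern is roughly the following (a hedged
// reconstruction from the declarations above, not the verbatim implementation):
//
//   auto it = levelset_.find(str);
//   if (it == levelset_.end()) {              // first occurrence: new level
//     int code = static_cast<int>(levels_.size());
//     levels_.push_back(str);
//     levelset_.insert(std::make_pair(str, code));
//     INTEGER(column_)[i] = code + 1;         // R factor codes are 1-based
//   } else {
//     INTEGER(column_)[i] = it->second + 1;   // existing level
//   }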
164 | public: 165 | CollectorFactor( 166 | Iconv* pEncoder, cpp11::sexp levels, bool ordered, bool includeNa) 167 | : Collector(cpp11::writable::integers(R_xlen_t(0))), 168 | pEncoder_(pEncoder), 169 | ordered_(ordered), 170 | includeNa_(includeNa) { 171 | implicitLevels_ = levels == R_NilValue; 172 | if (!implicitLevels_) { 173 | cpp11::strings lvls(levels); 174 | int n = lvls.size(); 175 | 176 | for (int i = 0; i < n; ++i) { 177 | cpp11::r_string std_level; 178 | if (STRING_ELT(lvls, i) != NA_STRING) { 179 | const char* level = Rf_translateCharUTF8(STRING_ELT(lvls, i)); 180 | std_level = level; 181 | } else { 182 | std_level = NA_STRING; 183 | } 184 | levels_.push_back(std_level); 185 | levelset_.insert(std::make_pair(std_level, i)); 186 | } 187 | } 188 | } 189 | void setValue(int i, const Token& t); 190 | 191 | cpp11::sexp vector() { 192 | if (ordered_) { 193 | column_.attr("class") = {"ordered", "factor"}; 194 | } else { 195 | column_.attr("class") = "factor"; 196 | } 197 | 198 | int n = levels_.size(); 199 | cpp11::writable::strings levels(n); 200 | for (int i = 0; i < n; ++i) { 201 | levels[i] = levels_[i]; 202 | } 203 | 204 | column_.attr("levels") = levels; 205 | return column_; 206 | }; 207 | }; 208 | 209 | class CollectorInteger : public Collector { 210 | public: 211 | CollectorInteger() : Collector(cpp11::writable::integers(R_xlen_t(0))) {} 212 | void setValue(int i, const Token& t); 213 | }; 214 | 215 | class CollectorLogical : public Collector { 216 | public: 217 | CollectorLogical() : Collector(cpp11::writable::logicals(R_xlen_t(0))) {} 218 | void setValue(int i, const Token& t); 219 | }; 220 | 221 | class CollectorNumeric : public Collector { 222 | char decimalMark_, groupingMark_; 223 | 224 | public: 225 | CollectorNumeric(char decimalMark, char groupingMark) 226 | : Collector(cpp11::writable::doubles(R_xlen_t(0))), 227 | decimalMark_(decimalMark), 228 | groupingMark_(groupingMark) {} 229 | void setValue(int i, const Token& t); 230 | bool isNum(char c); 231 | }; 232 | 233 | // Time --------------------------------------------------------------------- 234 | 235 | class CollectorTime : public Collector { 236 | std::string format_; 237 | DateTimeParser parser_; 238 | 239 | public: 240 | CollectorTime(LocaleInfo* pLocale, const std::string& format) 241 | : Collector(cpp11::writable::doubles(R_xlen_t(0))), 242 | format_(format), 243 | parser_(pLocale) {} 244 | 245 | void setValue(int i, const Token& t); 246 | 247 | cpp11::sexp vector() { 248 | column_.attr("class") = {"hms", "difftime"}; 249 | column_.attr("units") = "secs"; 250 | return column_; 251 | }; 252 | }; 253 | 254 | // Skip --------------------------------------------------------------------- 255 | 256 | class CollectorSkip : public Collector { 257 | public: 258 | CollectorSkip() : Collector(R_NilValue) {} 259 | void setValue(int /* unused */, const Token& /* unused */) {} 260 | bool skip() { return true; } 261 | }; 262 | 263 | // Raw ------------------------------------------------------------------------- 264 | class CollectorRaw : public Collector { 265 | public: 266 | CollectorRaw() : Collector(cpp11::writable::list(static_cast(0))) {} 267 | void setValue(int i, const Token& t); 268 | }; 269 | 270 | // Helpers --------------------------------------------------------------------- 271 | 272 | std::vector 273 | collectorsCreate(const cpp11::list& specs, LocaleInfo* pLocale); 274 | void collectorsResize(std::vector& collectors, int n); 275 | void collectorsClear(std::vector& collectors); 276 | std::string 
collectorGuess( 277 | const cpp11::strings& input, 278 | const cpp11::list& locale_, 279 | bool guessInteger = false); 280 | 281 | #endif 282 | -------------------------------------------------------------------------------- /src/CollectorGuess.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/R.hpp" 2 | #include "cpp11/list.hpp" 3 | #include "cpp11/strings.hpp" 4 | 5 | #include "DateTimeParser.h" 6 | #include "LocaleInfo.h" 7 | #include "QiParsers.h" 8 | #include "utils.h" 9 | 10 | typedef bool (*canParseFun)(const std::string&, LocaleInfo* pLocale); 11 | 12 | bool canParse( 13 | const cpp11::strings& x, const canParseFun& canParse, LocaleInfo* pLocale) { 14 | for (const auto & i : x) { 15 | if (i == NA_STRING) { 16 | continue; 17 | } 18 | 19 | if (i.size() == 0) { 20 | continue; 21 | } 22 | 23 | if (!canParse(std::string(i), pLocale)) { 24 | return false; 25 | } 26 | } 27 | return true; 28 | } 29 | 30 | bool allMissing(const cpp11::strings& x) { 31 | for (const auto & i : x) { 32 | if (i != NA_STRING && i.size() > 0) { 33 | return false; 34 | } 35 | } 36 | return true; 37 | } 38 | 39 | bool isLogical(const std::string& x, LocaleInfo* /*unused*/) { 40 | const char* const str = x.data(); 41 | bool res = isLogical(str, str + x.size()); 42 | return res; 43 | } 44 | 45 | bool isNumber(const std::string& x, LocaleInfo* pLocale) { 46 | // Leading zero not followed by decimal mark 47 | if (x[0] == '0' && x.size() > 1 && x[1] != pLocale->decimalMark_) { 48 | return false; 49 | } 50 | 51 | double res = 0; 52 | std::string::const_iterator begin = x.begin(); 53 | 54 | std::string::const_iterator end = x.end(); 55 | 56 | bool ok = parseNumber( 57 | pLocale->decimalMark_, pLocale->groupingMark_, begin, end, res); 58 | return ok && begin == x.begin() && end == x.end(); 59 | } 60 | 61 | bool isInteger(const std::string& x, LocaleInfo* /*unused*/) { 62 | // Leading zero 63 | if (x[0] == '0' && x.size() > 1) { 64 | return false; 65 | } 66 | 67 | double res = 0; 68 | std::string::const_iterator begin = x.begin(); 69 | 70 | std::string::const_iterator end = x.end(); 71 | 72 | return parseInt(begin, end, res) && begin == end; 73 | } 74 | 75 | bool isDouble(const std::string& x, LocaleInfo* pLocale) { 76 | // Leading zero not followed by decimal mark 77 | if (x[0] == '0' && x.size() > 1 && x[1] != pLocale->decimalMark_) { 78 | return false; 79 | } 80 | 81 | double res = 0; 82 | const char* begin = x.c_str(); 83 | const char* end = begin + x.size(); 84 | 85 | return parseDouble(pLocale->decimalMark_, begin, end, res) && 86 | end == begin + x.size(); 87 | } 88 | 89 | bool isTime(const std::string& x, LocaleInfo* pLocale) { 90 | DateTimeParser parser(pLocale); 91 | 92 | parser.setDate(x.c_str()); 93 | return parser.parseLocaleTime(); 94 | } 95 | 96 | bool isDate(const std::string& x, LocaleInfo* pLocale) { 97 | DateTimeParser parser(pLocale); 98 | 99 | parser.setDate(x.c_str()); 100 | return parser.parseLocaleDate(); 101 | } 102 | 103 | static bool isDateTime(const std::string& x, LocaleInfo* pLocale) { 104 | DateTimeParser parser(pLocale); 105 | 106 | parser.setDate(x.c_str()); 107 | bool ok = parser.parseISO8601(); 108 | 109 | if (!ok) { 110 | return false; 111 | } 112 | 113 | if (!parser.compactDate()) { 114 | return true; 115 | } 116 | 117 | // Values like 00014567 are unlikely to be dates, so don't guess 118 | return parser.year() > 999; 119 | } 120 | 121 | [[cpp11::register]] std::string collectorGuess( 122 | const cpp11::strings& input, 123 | const 
cpp11::list& locale_, 124 | bool guessInteger) { 125 | LocaleInfo locale(static_cast(locale_)); 126 | 127 | if (input.size() == 0) { 128 | return "character"; 129 | } 130 | 131 | if (allMissing(input)) { 132 | return "logical"; 133 | } 134 | 135 | // Work from strictest to most flexible 136 | if (canParse(input, isLogical, &locale)) { 137 | return "logical"; 138 | } 139 | if (guessInteger && canParse(input, isInteger, &locale)) { 140 | return "integer"; 141 | } 142 | if (canParse(input, isDouble, &locale)) { 143 | return "double"; 144 | } 145 | if (canParse(input, isNumber, &locale)) { 146 | return "number"; 147 | } 148 | if (canParse(input, isTime, &locale)) { 149 | return "time"; 150 | } 151 | if (canParse(input, isDate, &locale)) { 152 | return "date"; 153 | } 154 | if (canParse(input, isDateTime, &locale)) { 155 | return "datetime"; 156 | } 157 | 158 | // Otherwise can always parse as a character 159 | return "character"; 160 | } 161 | -------------------------------------------------------------------------------- /src/Iconv.cpp: -------------------------------------------------------------------------------- 1 | #include "Iconv.h" 2 | #include "cpp11/protect.hpp" 3 | #include 4 | 5 | Iconv::Iconv(const std::string& from, const std::string& to) { 6 | if (from == "UTF-8") { 7 | cd_ = nullptr; 8 | } else { 9 | cd_ = Riconv_open(to.c_str(), from.c_str()); 10 | if (cd_ == (void*)-1) { 11 | if (errno == EINVAL) { 12 | cpp11::stop("Can't convert from %s to %s", from.c_str(), to.c_str()); 13 | } else { 14 | cpp11::stop("Iconv initialisation failed"); 15 | } 16 | } 17 | 18 | // Allocate space in buffer 19 | buffer_.resize(1024); 20 | } 21 | } 22 | 23 | Iconv::~Iconv() { 24 | if (cd_ != nullptr) { 25 | Riconv_close(cd_); 26 | cd_ = nullptr; 27 | } 28 | } 29 | 30 | size_t Iconv::convert(const char* start, const char* end) { 31 | size_t n = end - start; 32 | 33 | // Ensure buffer is big enough: one input byte can never generate 34 | // more than 4 output bytes 35 | size_t max_size = n * 4; 36 | if (buffer_.size() < max_size) { 37 | buffer_.resize(max_size); 38 | } 39 | 40 | char* outbuf = &buffer_[0]; 41 | size_t inbytesleft = n; 42 | 43 | size_t outbytesleft = max_size; 44 | size_t res = Riconv(cd_, &start, &inbytesleft, &outbuf, &outbytesleft); 45 | 46 | if (res == (size_t)-1) { 47 | switch (errno) { 48 | case EILSEQ: 49 | cpp11::stop("Invalid multibyte sequence"); 50 | case EINVAL: 51 | cpp11::stop("Incomplete multibyte sequence"); 52 | case E2BIG: 53 | cpp11::stop("Iconv buffer too small"); 54 | default: 55 | cpp11::stop("Iconv failed to convert for unknown reason"); 56 | } 57 | } 58 | 59 | return max_size - outbytesleft; 60 | } 61 | 62 | int my_strnlen(const char* s, int maxlen) { 63 | for (int n = 0; n < maxlen; ++n) { 64 | if (s[n] == '\0') { 65 | return n; 66 | } 67 | } 68 | return maxlen; 69 | } 70 | 71 | #if defined(__sun) 72 | #define meltr_strnlen my_strnlen 73 | #else 74 | #define meltr_strnlen strnlen 75 | #endif 76 | 77 | // To be safe, we need to check for nulls - this also needs to emit 78 | // a warning, but this behaviour is better than crashing 79 | SEXP safeMakeChar(const char* start, size_t n, bool hasNull) { 80 | size_t m = hasNull ? 
meltr_strnlen(start, n) : n; 81 | if (m > INT_MAX) { 82 | cpp11::stop("R character strings are limited to 2^31-1 bytes"); 83 | } 84 | return Rf_mkCharLenCE(start, m, CE_UTF8); 85 | } 86 | 87 | SEXP Iconv::makeSEXP(const char* start, const char* end, bool hasNull) { 88 | if (cd_ == nullptr) { 89 | return safeMakeChar(start, end - start, hasNull); 90 | } 91 | 92 | int n = convert(start, end); 93 | return safeMakeChar(&buffer_[0], n, hasNull); 94 | } 95 | 96 | std::string Iconv::makeString(const char* start, const char* end) { 97 | if (cd_ == nullptr) { 98 | return std::string(start, end); 99 | } 100 | 101 | int n = convert(start, end); 102 | return std::string(&buffer_[0], n); 103 | } 104 | -------------------------------------------------------------------------------- /src/Iconv.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_ICONV_H_ 2 | #define MELTR_ICONV_H_ 3 | 4 | #include "cpp11/R.hpp" 5 | #include 6 | 7 | #include "R_ext/Riconv.h" 8 | #include 9 | 10 | class Iconv { 11 | void* cd_; 12 | std::string buffer_; 13 | 14 | public: 15 | Iconv(const std::string& from, const std::string& to = "UTF-8"); 16 | virtual ~Iconv(); 17 | 18 | SEXP makeSEXP(const char* start, const char* end, bool hasNull = true); 19 | std::string makeString(const char* start, const char* end); 20 | 21 | private: 22 | // Returns number of characters in buffer 23 | size_t convert(const char* start, const char* end); 24 | }; 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /src/LocaleInfo.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/as.hpp" 2 | #include "cpp11/list.hpp" 3 | #include "cpp11/strings.hpp" 4 | #include 5 | #include 6 | 7 | #include "LocaleInfo.h" 8 | 9 | LocaleInfo::LocaleInfo(const cpp11::list& x) 10 | : encoding_(cpp11::as_cpp(x["encoding"])), 11 | encoder_(Iconv(encoding_)) { 12 | std::string klass = cpp11::as_cpp(x.attr("class")); 13 | if (klass != "locale") { 14 | cpp11::stop("Invalid input: must be of class locale"); 15 | } 16 | 17 | cpp11::list date_names(x["date_names"]); 18 | mon_ = cpp11::as_cpp>(date_names["mon"]); 19 | monAb_ = cpp11::as_cpp>(date_names["mon_ab"]); 20 | day_ = cpp11::as_cpp>(date_names["day"]); 21 | dayAb_ = cpp11::as_cpp>(date_names["day_ab"]); 22 | amPm_ = cpp11::as_cpp>(date_names["am_pm"]); 23 | 24 | decimalMark_ = cpp11::as_cpp(x["decimal_mark"]); 25 | groupingMark_ = cpp11::as_cpp(x["grouping_mark"]); 26 | 27 | dateFormat_ = cpp11::as_cpp(x["date_format"]); 28 | timeFormat_ = cpp11::as_cpp(x["time_format"]); 29 | 30 | tz_ = cpp11::as_cpp(x["tz"]); 31 | } 32 | -------------------------------------------------------------------------------- /src/LocaleInfo.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_LOCALINFO_H_ 2 | #define MELTR_LOCALINFO_H_ 3 | 4 | #include "Iconv.h" 5 | 6 | #include "cpp11/list.hpp" 7 | #include 8 | #include 9 | 10 | class LocaleInfo { 11 | 12 | public: 13 | // LC_TIME 14 | std::vector mon_, monAb_, day_, dayAb_, amPm_; 15 | std::string dateFormat_, timeFormat_; 16 | 17 | // LC_NUMERIC 18 | char decimalMark_, groupingMark_; 19 | 20 | // LC_MISC 21 | std::string tz_; 22 | std::string encoding_; 23 | Iconv encoder_; 24 | 25 | LocaleInfo(const cpp11::list& x); 26 | }; 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /src/Progress.h: 
-------------------------------------------------------------------------------- 1 | #ifndef MELTR_PROGRESS_H_ 2 | #define MELTR_PROGRESS_H_ 3 | 4 | #include "cpp11/R.hpp" 5 | #include <ctime> 6 | #include <iomanip> 7 | #include <sstream> 8 | 9 | inline int now() { return clock() / CLOCKS_PER_SEC; } 10 | 11 | inline std::string clearLine(int width = 50) { 12 | return "\r" + std::string(width, ' ') + "\r"; 13 | } 14 | 15 | inline std::string showTime(int x) { 16 | std::stringstream ss; 17 | if (x < 60) { 18 | ss << x << " s"; 19 | return ss.str(); 20 | } else if (x < 60 * 60) { 21 | ss << x / 60 << " m"; 22 | return ss.str(); 23 | } else { 24 | ss << x / (60 * 60) << " h"; 25 | return ss.str(); 26 | } 27 | } 28 | 29 | class Progress { 30 | int timeMin_, timeInit_, timeStop_, width_; 31 | bool show_, stopped_; 32 | 33 | public: 34 | Progress(int min = 5, int width = Rf_GetOptionWidth()) 35 | : timeMin_(min), 36 | timeInit_(now()), 37 | timeStop_(now()), 38 | width_(width), 39 | show_(false), 40 | stopped_(false) {} 41 | 42 | void stop() { 43 | timeStop_ = now(); 44 | stopped_ = true; 45 | } 46 | 47 | void show(std::pair<double, size_t> progress) { 48 | double prop = progress.first, size = progress.second / (1024 * 1024); 49 | 50 | double est = (now() - timeInit_) / prop; 51 | if (!show_) { 52 | if (est > timeMin_) { 53 | show_ = true; 54 | } else { 55 | return; 56 | } 57 | } 58 | 59 | std::stringstream labelStream; 60 | labelStream << std::setprecision(2) << std::fixed << " " 61 | << (int)(prop * 100) << "%"; 62 | if (size > 0) { 63 | labelStream << " " << std::setprecision(0) << size << " MB"; 64 | } 65 | 66 | std::string label = labelStream.str(); 67 | 68 | int barSize = width_ - label.size() - 2; 69 | if (barSize < 0) { 70 | return; 71 | } 72 | int nbars = prop * barSize; 73 | int nspaces = (1 - prop) * barSize; 74 | std::string bars(nbars, '='), spaces(nspaces, ' '); 75 | Rprintf("\r|%s%s|%s", bars.c_str(), spaces.c_str(), label.c_str()); 76 | } 77 | 78 | ~Progress() { 79 | try { 80 | if (!show_) 81 | return; 82 | 83 | if (!stopped_) 84 | timeStop_ = now(); 85 | Rprintf("\n"); 86 | 87 | } catch (...)
{ 88 | } 89 | } 90 | }; 91 | 92 | #endif 93 | -------------------------------------------------------------------------------- /src/Reader.cpp: -------------------------------------------------------------------------------- 1 | #include "Reader.h" 2 | 3 | #include "cpp11/function.hpp" 4 | #include "cpp11/list.hpp" 5 | 6 | #include 7 | #include 8 | 9 | Reader::Reader( 10 | SourcePtr source, 11 | TokenizerPtr tokenizer, 12 | std::vector collectors, 13 | bool progress, 14 | const cpp11::strings& colNames) 15 | : source_(std::move(source)), 16 | tokenizer_(std::move(tokenizer)), 17 | collectors_(std::move(collectors)), 18 | progress_(progress), 19 | begun_(false) { 20 | init(colNames); 21 | } 22 | 23 | void Reader::init(const cpp11::strings& colNames) { 24 | tokenizer_->tokenize(source_->begin(), source_->end()); 25 | tokenizer_->setWarnings(&warnings_); 26 | 27 | // Work out which output columns we are keeping and set warnings for each 28 | // collector 29 | size_t p = collectors_.size(); 30 | for (size_t j = 0; j < p; ++j) { 31 | if (!collectors_[j]->skip()) { 32 | keptColumns_.push_back(j); 33 | collectors_[j]->setWarnings(&warnings_); 34 | } 35 | } 36 | 37 | if (colNames.size() > 0) { 38 | outNames_ = cpp11::writable::strings(keptColumns_.size()); 39 | int i = 0; 40 | for (int keptColumn : keptColumns_) { 41 | outNames_[i++] = colNames[keptColumn]; 42 | } 43 | } 44 | } 45 | 46 | void Reader::collectorsResize(R_xlen_t n) { 47 | for (auto & collector : collectors_) { 48 | collector->resize(n); 49 | } 50 | } 51 | 52 | void Reader::collectorsClear() { 53 | for (auto & collector : collectors_) { 54 | collector->clear(); 55 | } 56 | } 57 | 58 | cpp11::sexp 59 | Reader::meltToDataFrame(const cpp11::list& locale_, R_xlen_t lines) { 60 | melt(locale_, lines); 61 | 62 | // Save individual columns into a data frame 63 | cpp11::writable::list out(4); 64 | out[0] = collectors_[0]->vector(); 65 | out[1] = collectors_[1]->vector(); 66 | out[2] = collectors_[2]->vector(); 67 | out[3] = collectors_[3]->vector(); 68 | 69 | out.attr("names") = {"row", "col", "data_type", "value"}; 70 | cpp11::sexp out2(warnings_.addAsAttribute(static_cast(out))); 71 | 72 | collectorsClear(); 73 | warnings_.clear(); 74 | 75 | out.attr("names") = {"row", "col", "data_type", "value"}; 76 | 77 | static cpp11::function as_tibble = cpp11::package("tibble")["as_tibble"]; 78 | return as_tibble(out); 79 | } 80 | 81 | R_xlen_t Reader::melt(const cpp11::list& locale_, R_xlen_t lines) { 82 | 83 | if (t_.type() == TOKEN_EOF) { 84 | return (-1); 85 | } 86 | 87 | R_xlen_t n = (lines < 0) ? 
10000 : lines * 10; // Start with 10 cells per line 88 | 89 | collectorsResize(n); 90 | 91 | R_xlen_t last_row = -1; 92 | 93 | R_xlen_t cells = 0; 94 | R_xlen_t first_row; 95 | if (!begun_) { 96 | t_ = tokenizer_->nextToken(); 97 | begun_ = true; 98 | first_row = 0; 99 | } else { 100 | first_row = t_.row(); 101 | } 102 | 103 | while (t_.type() != TOKEN_EOF) { 104 | ++cells; 105 | 106 | if (progress_ && cells % progressStep_ == 0) { 107 | progressBar_.show(tokenizer_->progress()); 108 | } 109 | 110 | if (lines >= 0 && static_cast(t_.row()) - first_row >= lines) { 111 | --cells; 112 | break; 113 | } 114 | 115 | if (cells >= n) { 116 | // Estimate rows in full dataset and resize collectors 117 | n = (cells / tokenizer_->progress().first) * 1.1; 118 | collectorsResize(n); 119 | } 120 | 121 | collectors_[0]->setValue(cells - 1, t_.row() + 1); 122 | collectors_[1]->setValue(cells - 1, t_.col() + 1); 123 | collectors_[3]->setValue(cells - 1, t_); 124 | 125 | switch (t_.type()) { 126 | case TOKEN_STRING: { 127 | cpp11::sexp str(cpp11::as_sexp(t_.asString())); 128 | collectors_[2]->setValue( 129 | cells - 1, collectorGuess(SEXP(str), locale_, true)); 130 | break; 131 | }; 132 | case TOKEN_MISSING: 133 | collectors_[2]->setValue(cells - 1, "missing"); 134 | break; 135 | case TOKEN_EMPTY: 136 | collectors_[2]->setValue(cells - 1, "empty"); 137 | break; 138 | case TOKEN_EOF: 139 | cpp11::stop("Invalid token"); 140 | } 141 | 142 | last_row = t_.row(); 143 | t_ = tokenizer_->nextToken(); 144 | } 145 | 146 | if (progress_) { 147 | progressBar_.show(tokenizer_->progress()); 148 | } 149 | 150 | progressBar_.stop(); 151 | 152 | // Resize the collectors to the final size (if it is not already at that 153 | // size) 154 | if (last_row == -1) { 155 | collectorsResize(0); 156 | } else if (cells < (n - 1)) { 157 | collectorsResize(cells); 158 | } 159 | 160 | return cells - 1; 161 | } 162 | -------------------------------------------------------------------------------- /src/Reader.h: -------------------------------------------------------------------------------- 1 | #include "Collector.h" 2 | #include "Progress.h" 3 | #include "Source.h" 4 | 5 | #include "cpp11/list.hpp" 6 | #include "cpp11/strings.hpp" 7 | 8 | class Reader { 9 | public: 10 | Reader( 11 | SourcePtr source, 12 | TokenizerPtr tokenizer, 13 | std::vector collectors, 14 | bool progress, 15 | const cpp11::strings& colNames = cpp11::strings()); 16 | 17 | cpp11::sexp meltToDataFrame(const cpp11::list& locale_, R_xlen_t lines = -1); 18 | 19 | private: 20 | Warnings warnings_; 21 | SourcePtr source_; 22 | TokenizerPtr tokenizer_; 23 | std::vector collectors_; 24 | bool progress_; 25 | Progress progressBar_; 26 | std::vector keptColumns_; 27 | cpp11::writable::strings outNames_; 28 | bool begun_; 29 | Token t_; 30 | 31 | const static R_xlen_t progressStep_ = 10000; 32 | 33 | void init(const cpp11::strings& colNames); 34 | R_xlen_t melt(const cpp11::list& locale_, R_xlen_t lines = -1); 35 | 36 | void collectorsResize(R_xlen_t n); 37 | void collectorsClear(); 38 | }; 39 | -------------------------------------------------------------------------------- /src/Source.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/list.hpp" 2 | #include "cpp11/strings.hpp" 3 | 4 | #include "Source.h" 5 | #include "SourceFile.h" 6 | #include "SourceRaw.h" 7 | #include "SourceString.h" 8 | 9 | SourcePtr Source::create(const cpp11::list& spec) { 10 | std::string subclass(cpp11::as_cpp(spec.attr("class"))[0]); 11 | 12 | int 
skip = cpp11::as_cpp(spec["skip"]); 13 | bool skipEmptyRows = cpp11::as_cpp(spec["skip_empty_rows"]); 14 | std::string comment = cpp11::as_cpp(spec["comment"]); 15 | bool skipQuote = cpp11::as_cpp(spec["skip_quote"]); 16 | 17 | if (subclass == "source_raw") { 18 | return SourcePtr( 19 | new SourceRaw(spec[0], skip, skipEmptyRows, comment, skipQuote)); 20 | } 21 | 22 | if (subclass == "source_string") { 23 | return SourcePtr( 24 | new SourceString(spec[0], skip, skipEmptyRows, comment, skipQuote)); 25 | } 26 | 27 | if (subclass == "source_file") { 28 | cpp11::strings path(spec[0]); 29 | return SourcePtr(new SourceFile(Rf_translateCharUTF8(path[0]), skip, skipEmptyRows, comment, skipQuote)); 30 | } 31 | 32 | cpp11::stop("Unknown source type"); 33 | return SourcePtr(); 34 | } 35 | 36 | const char* Source::skipLines( 37 | const char* begin, 38 | const char* end, 39 | int n, 40 | bool skipEmptyRows, 41 | const std::string& comment, 42 | bool skipQuote) { 43 | bool hasComment = !comment.empty(); 44 | bool isComment = false; 45 | 46 | const char* cur = begin; 47 | 48 | while (cur < end && n > 0) { 49 | cur = skipLine( 50 | cur, end, hasComment && inComment(cur, end, comment), skipQuote); 51 | --n; 52 | ++skippedRows_; 53 | } 54 | 55 | // Skip any more trailing empty rows or comments 56 | while (cur < end && 57 | ((skipEmptyRows && (*cur == '\n' || *cur == '\r')) || 58 | (isComment = hasComment && inComment(cur, end, comment)))) { 59 | cur = skipLine(cur, end, isComment, skipQuote); 60 | ++skippedRows_; 61 | } 62 | 63 | return cur; 64 | } 65 | 66 | const char* Source::skipLine( 67 | const char* begin, const char* end, bool isComment, bool skipQuote) { 68 | const char* cur = begin; 69 | // skip the rest of the line until the newline 70 | while (cur < end && !(*cur == '\n' || *cur == '\r')) { 71 | if (!isComment && skipQuote && *cur == '"') { 72 | cur = skipDoubleQuoted(cur, end); 73 | } else { 74 | advanceForLF(&cur, end); 75 | ++cur; 76 | } 77 | } 78 | 79 | advanceForLF(&cur, end); 80 | 81 | // skip the actual newline char 82 | if (cur < end) { 83 | ++cur; 84 | } 85 | 86 | return cur; 87 | } 88 | 89 | const char* Source::skipDoubleQuoted(const char* begin, const char* end) { 90 | const char* cur = begin; 91 | 92 | // This doesn't handle escaped quotes or more sophisticated things, but 93 | // will work for simple cases. 
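// [Editor's note] Concretely, with the cursor on the opening quote, the scan
// below behaves as follows (illustrative inputs, not tests from this repo):
//
//   "abc",next  -> returns an iterator at ',' (one past the closing quote)
//   "a""b",...  -> stops at the first quote of the "" pair and returns one
//                  past it, i.e. the doubled-quote escape is read as a close
//                  immediately followed by a new field character
//   "abc<EOF>   -> unterminated quote: returns end
//
// The tokenizers, not this helper, are responsible for full quote handling.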
94 | // Opening quote 95 | ++cur; 96 | 97 | while (cur < end && *cur != '"') { 98 | ++cur; 99 | } 100 | 101 | // Closing quote 102 | if (cur < end) { 103 | ++cur; 104 | } 105 | 106 | return cur; 107 | } 108 | 109 | const char* Source::skipBom(const char* begin, const char* end) { 110 | 111 | /* Unicode Byte Order Marks 112 | https://en.wikipedia.org/wiki/Byte_order_mark#Representations_of_byte_order_marks_by_encoding 113 | 114 | 00 00 FE FF: UTF-32BE 115 | FF FE 00 00: UTF-32LE 116 | FE FF: UTF-16BE 117 | FF FE: UTF-16LE 118 | EF BB BF: UTF-8 119 | */ 120 | 121 | switch (begin[0]) { 122 | // UTF-32BE 123 | case '\x00': 124 | if (end - begin >= 4 && begin[1] == '\x00' && begin[2] == '\xFE' && 125 | begin[3] == '\xFF') { 126 | return begin + 4; 127 | } 128 | break; 129 | 130 | // UTF-8 131 | case '\xEF': 132 | if (end - begin >= 3 && begin[1] == '\xBB' && begin[2] == '\xBF') { 133 | return begin + 3; 134 | } 135 | break; 136 | 137 | // UTF-16BE 138 | case '\xfe': 139 | if (end - begin >= 2 && begin[1] == '\xff') { 140 | return begin + 2; 141 | } 142 | break; 143 | 144 | case '\xff': 145 | if (end - begin >= 2 && begin[1] == '\xfe') { 146 | 147 | // UTF-32 LE 148 | if (end - begin >= 4 && begin[2] == '\x00' && begin[3] == '\x00') { 149 | return begin + 4; 150 | } 151 | 152 | // UTF-16 LE 153 | return begin + 2; 154 | } 155 | break; 156 | } 157 | return begin; 158 | } 159 | -------------------------------------------------------------------------------- /src/Source.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_SOURCE_H_ 2 | #define MELTR_SOURCE_H_ 3 | 4 | #include "cpp11/list.hpp" 5 | #include "utils.h" 6 | 7 | #include 8 | 9 | class Source; 10 | typedef std::shared_ptr SourcePtr; 11 | 12 | class Source { 13 | public: 14 | Source() : skippedRows_(0) {} 15 | virtual ~Source() {} 16 | 17 | virtual const char* begin() = 0; 18 | virtual const char* end() = 0; 19 | 20 | const char* skipLines( 21 | const char* begin, 22 | const char* end, 23 | int n, 24 | bool skipEmptyRows = true, 25 | const std::string& comment = "", 26 | bool skipQuote = true); 27 | 28 | static const char* 29 | skipLine(const char* begin, const char* end, bool isComment, bool skipQuote); 30 | 31 | static const char* skipDoubleQuoted(const char* begin, const char* end); 32 | 33 | size_t skippedRows() { return skippedRows_; } 34 | 35 | static const char* skipBom(const char* begin, const char* end); 36 | 37 | static SourcePtr create(const cpp11::list& spec); 38 | 39 | private: 40 | static bool 41 | inComment(const char* cur, const char* end, const std::string& comment) { 42 | return starts_with_comment(cur, end, comment); 43 | } 44 | 45 | size_t skippedRows_; 46 | }; 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /src/SourceFile.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_SOURCEFILE_H_ 2 | #define MELTR_SOURCEFILE_H_ 3 | 4 | #include "Source.h" 5 | #include "cpp11/protect.hpp" 6 | 7 | #include "unicode_fopen.h" 8 | 9 | class SourceFile : public Source { 10 | mio::mmap_source source_; 11 | 12 | const char* begin_; 13 | const char* end_; 14 | 15 | public: 16 | SourceFile( 17 | const std::string& path, 18 | int skip = 0, 19 | bool skipEmptyRows = true, 20 | const std::string& comment = "", 21 | bool skipQuotes = true) { 22 | 23 | std::error_code error; 24 | source_ = make_mmap_source(path.c_str(), error); 25 | 26 | if (error) { 27 | cpp11::stop("Cannot read file %s: 
%s", error.message().c_str()); 28 | } 29 | 30 | begin_ = source_.begin(); 31 | end_ = begin_ + source_.size(); 32 | 33 | // Skip byte order mark, if needed 34 | begin_ = skipBom(begin_, end_); 35 | 36 | // Skip lines, if needed 37 | begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes); 38 | } 39 | 40 | const char* begin() { return begin_; } 41 | 42 | const char* end() { return end_; } 43 | }; 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/SourceRaw.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_SOURCERAW_H_ 2 | #define MELTR_SOURCERAW_H_ 3 | 4 | #include "Source.h" 5 | #include "cpp11/raws.hpp" 6 | 7 | class SourceRaw : public Source { 8 | cpp11::raws x_; 9 | const char* begin_; 10 | const char* end_; 11 | 12 | public: 13 | SourceRaw( 14 | cpp11::raws x, 15 | int skip = 0, 16 | bool skipEmptyRows = true, 17 | const std::string& comment = "", 18 | bool skipQuotes = true) 19 | : x_(x) { 20 | begin_ = (const char*)RAW(x); 21 | end_ = (const char*)RAW(x) + Rf_xlength(x); 22 | 23 | // Skip byte order mark, if needed 24 | begin_ = skipBom(begin_, end_); 25 | 26 | // Skip lines, if needed 27 | begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes); 28 | } 29 | 30 | const char* begin() { return begin_; } 31 | 32 | const char* end() { return end_; } 33 | }; 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /src/SourceString.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_SOURCESTRING_H_ 2 | #define MELTR_SOURCESTRING_H_ 3 | 4 | #include "cpp11/strings.hpp" 5 | 6 | #include "Source.h" 7 | 8 | class SourceString : public Source { 9 | cpp11::sexp string_; 10 | 11 | const char* begin_; 12 | const char* end_; 13 | 14 | public: 15 | SourceString( 16 | cpp11::strings x, 17 | int skip = 0, 18 | bool skipEmptyRows = true, 19 | const std::string& comment = "", 20 | bool skipQuotes = true) 21 | : string_(static_cast(x[0])) { 22 | 23 | begin_ = CHAR(string_); 24 | end_ = begin_ + Rf_xlength(string_); 25 | 26 | // Skip byte order mark, if needed 27 | begin_ = skipBom(begin_, end_); 28 | 29 | // Skip lines, if needed 30 | begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes); 31 | } 32 | 33 | const char* begin() { return begin_; } 34 | 35 | const char* end() { return end_; } 36 | }; 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/Token.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_TOKEN_H_ 2 | #define MELTR_TOKEN_H_ 3 | 4 | #include "cpp11/raws.hpp" 5 | 6 | #include "Iconv.h" 7 | #include "Source.h" 8 | #include "Tokenizer.h" 9 | #include 10 | 11 | enum TokenType { 12 | TOKEN_STRING, // a sequence of characters 13 | TOKEN_MISSING, // an missing value 14 | TOKEN_EMPTY, // an empty value 15 | TOKEN_EOF // end of file 16 | }; 17 | 18 | class Token { 19 | TokenType type_; 20 | SourceIterator begin_, end_; 21 | size_t row_, col_; 22 | bool hasNull_; 23 | 24 | Tokenizer* pTokenizer_; 25 | 26 | public: 27 | Token() 28 | : type_(TOKEN_EMPTY), 29 | begin_(0), 30 | end_(0), 31 | row_(0), 32 | col_(0), 33 | hasNull_(false), 34 | pTokenizer_(nullptr) {} 35 | Token(TokenType type, int row, int col) 36 | : type_(type), 37 | begin_(0), 38 | end_(0), 39 | row_(row), 40 | col_(col), 41 | hasNull_(false), 42 | pTokenizer_(nullptr) {} 
43 | Token( 44 | SourceIterator begin, 45 | SourceIterator end, 46 | int row, 47 | int col, 48 | bool hasNull, 49 | Tokenizer* pTokenizer = NULL) 50 | : type_(TOKEN_STRING), 51 | begin_(begin), 52 | end_(end), 53 | row_(row), 54 | col_(col), 55 | hasNull_(hasNull), 56 | pTokenizer_(pTokenizer) { 57 | if (begin_ == end_) 58 | type_ = TOKEN_EMPTY; 59 | } 60 | 61 | std::string asString() const { 62 | switch (type_) { 63 | case TOKEN_STRING: { 64 | std::string buffer; 65 | SourceIterators string = getString(&buffer); 66 | 67 | return std::string(string.first, string.second); 68 | } 69 | case TOKEN_MISSING: 70 | return "[MISSING]"; 71 | case TOKEN_EMPTY: 72 | return "[EMPTY]"; 73 | case TOKEN_EOF: 74 | return "[EOF]"; 75 | } 76 | 77 | return ""; 78 | } 79 | 80 | SEXP asRaw() const { 81 | size_t n = (type_ == TOKEN_STRING) ? end_ - begin_ : 0; 82 | cpp11::writable::raws out(n); 83 | 84 | if (n > 0) 85 | memcpy(RAW(out), begin_, n); 86 | 87 | return out; 88 | } 89 | 90 | SEXP asSEXP(Iconv* pEncoder) const { 91 | switch (type_) { 92 | case TOKEN_STRING: { 93 | std::string buffer; 94 | SourceIterators string = getString(&buffer); 95 | 96 | return pEncoder->makeSEXP(string.first, string.second, hasNull_); 97 | } 98 | default: 99 | return NA_STRING; 100 | } 101 | } 102 | 103 | TokenType type() const { return type_; } 104 | 105 | SourceIterators getString(std::string* pOut) const { 106 | if (pTokenizer_ == NULL) 107 | return std::make_pair(begin_, end_); 108 | 109 | pTokenizer_->unescape(begin_, end_, pOut); 110 | return std::make_pair(pOut->data(), pOut->data() + pOut->size()); 111 | } 112 | 113 | size_t row() const { return row_; } 114 | size_t col() const { return col_; } 115 | 116 | bool hasNull() const { return hasNull_; } 117 | 118 | Token& trim() { 119 | while (begin_ != end_ && (*begin_ == ' ' || *begin_ == '\t')) 120 | begin_++; 121 | while (end_ != begin_ && (*(end_ - 1) == ' ' || *(end_ - 1) == '\t')) 122 | end_--; 123 | 124 | if (begin_ == end_) 125 | type_ = TOKEN_EMPTY; 126 | 127 | return *this; 128 | } 129 | 130 | Token& flagNA(const std::vector& NA) { 131 | 132 | std::vector::const_iterator it; 133 | for (it = NA.begin(); it != NA.end(); ++it) { 134 | if ((size_t)(end_ - begin_) != it->size()) 135 | continue; 136 | 137 | if (strncmp(begin_, it->data(), it->size()) == 0) { 138 | type_ = TOKEN_MISSING; 139 | break; 140 | } 141 | } 142 | 143 | return *this; 144 | } 145 | }; 146 | 147 | #endif 148 | -------------------------------------------------------------------------------- /src/Tokenizer.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/as.hpp" 2 | #include "cpp11/integers.hpp" 3 | #include "cpp11/list.hpp" 4 | 5 | #include "Tokenizer.h" 6 | #include "TokenizerDelim.h" 7 | #include "TokenizerFwf.h" 8 | #include "TokenizerWs.h" 9 | 10 | TokenizerPtr Tokenizer::create(const cpp11::list& spec) { 11 | std::string subclass(cpp11::strings(spec.attr("class"))[0]); 12 | 13 | if (subclass == "tokenizer_delim") { 14 | char delim = cpp11::as_cpp(spec["delim"]); 15 | char quote = cpp11::as_cpp(spec["quote"]); 16 | std::vector na = 17 | cpp11::as_cpp>(spec["na"]); 18 | std::string comment = cpp11::as_cpp(spec["comment"]); 19 | bool trimWs = cpp11::as_cpp(spec["trim_ws"]); 20 | bool escapeDouble = cpp11::as_cpp(spec["escape_double"]); 21 | bool escapeBackslash = cpp11::as_cpp(spec["escape_backslash"]); 22 | bool quotedNA = cpp11::as_cpp(spec["quoted_na"]); 23 | bool skipEmptyRows = cpp11::as_cpp(spec["skip_empty_rows"]); 24 | 25 | return 
TokenizerPtr(new TokenizerDelim( 26 | delim, 27 | quote, 28 | na, 29 | comment, 30 | trimWs, 31 | escapeBackslash, 32 | escapeDouble, 33 | quotedNA, 34 | skipEmptyRows)); 35 | } 36 | 37 | if (subclass == "tokenizer_fwf") { 38 | std::vector begin = cpp11::as_cpp>(spec["begin"]); 39 | std::vector end = cpp11::as_cpp>(spec["end"]); 40 | std::vector na = 41 | cpp11::as_cpp>(spec["na"]); 42 | std::string comment = cpp11::as_cpp(spec["comment"]); 43 | bool trimWs = cpp11::as_cpp(spec["trim_ws"]); 44 | bool skipEmptyRows = cpp11::as_cpp(spec["skip_empty_rows"]); 45 | return TokenizerPtr( 46 | new TokenizerFwf(begin, end, na, comment, trimWs, skipEmptyRows)); 47 | } 48 | 49 | if (subclass == "tokenizer_ws") { 50 | std::vector na = 51 | cpp11::as_cpp>(spec["na"]); 52 | std::string comment = cpp11::as_cpp(spec["comment"]); 53 | bool skipEmptyRows = cpp11::as_cpp(spec["skip_empty_rows"]); 54 | return TokenizerPtr(new TokenizerWs(na, comment, skipEmptyRows)); 55 | } 56 | 57 | cpp11::stop("Unknown tokenizer type"); 58 | return TokenizerPtr(); 59 | } 60 | -------------------------------------------------------------------------------- /src/Tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_TOKENIZER_H_ 2 | #define MELTR_TOKENIZER_H_ 3 | 4 | #include "cpp11/R.hpp" 5 | #include "cpp11/list.hpp" 6 | #include "cpp11/protect.hpp" 7 | 8 | #include "Warnings.h" 9 | #include 10 | 11 | class Token; 12 | 13 | typedef const char* SourceIterator; 14 | typedef std::pair SourceIterators; 15 | typedef void (*UnescapeFun)(SourceIterator, SourceIterator, std::string*); 16 | 17 | class Tokenizer; 18 | typedef std::shared_ptr TokenizerPtr; 19 | 20 | class Tokenizer { 21 | Warnings* pWarnings_; 22 | 23 | public: 24 | Tokenizer() : pWarnings_(NULL) {} 25 | virtual ~Tokenizer() {} 26 | 27 | virtual void tokenize(SourceIterator begin, SourceIterator end) = 0; 28 | virtual Token nextToken() = 0; 29 | // Percentage & bytes 30 | virtual std::pair progress() = 0; 31 | 32 | virtual void 33 | unescape(SourceIterator begin, SourceIterator end, std::string* pOut) { 34 | pOut->reserve(end - begin); 35 | for (SourceIterator cur = begin; cur != end; ++cur) 36 | pOut->push_back(*cur); 37 | } 38 | 39 | void setWarnings(Warnings* pWarnings) { pWarnings_ = pWarnings; } 40 | 41 | inline void warn( 42 | int row, 43 | int col, 44 | const std::string& expected, 45 | const std::string& actual = "") { 46 | if (pWarnings_ == NULL) { 47 | cpp11::warning( 48 | "[%i, %i]: expected %s", row + 1, col + 1, expected.c_str()); 49 | return; 50 | } 51 | pWarnings_->addWarning(row, col, expected, actual); 52 | } 53 | 54 | static TokenizerPtr create(const cpp11::list& spec); 55 | }; 56 | 57 | // ----------------------------------------------------------------------------- 58 | // Helper class for parsers - ensures iterator always advanced no matter 59 | // how loop is exited 60 | 61 | class Advance { 62 | SourceIterator* pIter_; 63 | 64 | public: 65 | Advance(SourceIterator* pIter) : pIter_(pIter) {} 66 | Advance(const Advance&) = delete; 67 | Advance& operator=(const Advance&) = delete; 68 | ~Advance() { (*pIter_)++; } 69 | }; 70 | 71 | #endif 72 | -------------------------------------------------------------------------------- /src/TokenizerDelim.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_TOKENIZEDELIM_H_ 2 | #define MELTR_TOKENIZEDELIM_H_ 3 | #include "cpp11/R.hpp" 4 | 5 | #include "Token.h" 6 | #include "Tokenizer.h" 7 | #include 
"utils.h" 8 | 9 | enum DelimState { 10 | STATE_DELIM, 11 | STATE_FIELD, 12 | STATE_STRING, 13 | STATE_QUOTE, 14 | STATE_ESCAPE_S, 15 | STATE_ESCAPE_F, 16 | STATE_STRING_END, 17 | STATE_COMMENT 18 | }; 19 | 20 | class TokenizerDelim : public Tokenizer { 21 | char delim_, quote_; 22 | std::vector NA_; 23 | std::string comment_; 24 | 25 | bool hasComment_, trimWS_, escapeBackslash_, escapeDouble_, quotedNA_, 26 | hasEmptyNA_; 27 | 28 | SourceIterator begin_, cur_, end_; 29 | DelimState state_; 30 | int row_, col_; 31 | bool moreTokens_; 32 | bool skipEmptyRows_; 33 | 34 | public: 35 | TokenizerDelim( 36 | char delim = ',', 37 | char quote = '"', 38 | std::vector NA = std::vector(1, "NA"), 39 | const std::string& comment = "", 40 | bool trimWS = true, 41 | bool escapeBackslash = false, 42 | bool escapeDouble = true, 43 | bool quotedNA = true, 44 | bool skipEmptyRows = true); 45 | 46 | void tokenize(SourceIterator begin, SourceIterator end); 47 | 48 | std::pair progress(); 49 | 50 | Token nextToken(); 51 | 52 | void unescape(SourceIterator begin, SourceIterator end, std::string* pOut); 53 | 54 | private: 55 | bool isComment(const char* cur) const; 56 | 57 | void newField(); 58 | 59 | void newRecord(); 60 | 61 | Token emptyToken(int row, int col) const; 62 | 63 | Token fieldToken( 64 | SourceIterator begin, 65 | SourceIterator end, 66 | bool hasEscapeB, 67 | bool hasNull, 68 | int row, 69 | int col); 70 | 71 | Token stringToken( 72 | SourceIterator begin, 73 | SourceIterator end, 74 | bool hasEscapeB, 75 | bool hasEscapeD, 76 | bool hasNull, 77 | int row, 78 | int col); 79 | 80 | void unescapeBackslash( 81 | SourceIterator begin, SourceIterator end, std::string* pOut); 82 | 83 | void 84 | unescapeDouble(SourceIterator begin, SourceIterator end, std::string* pOut) const; 85 | }; 86 | #endif 87 | -------------------------------------------------------------------------------- /src/TokenizerFwf.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/list.hpp" 2 | #include "cpp11/protect.hpp" 3 | 4 | #include "Tokenizer.h" 5 | #include "TokenizerFwf.h" 6 | #include "utils.h" 7 | 8 | #include "Source.h" 9 | 10 | #include 11 | #include 12 | 13 | struct skip_t { 14 | SourceIterator begin; 15 | int lines; 16 | }; 17 | 18 | skip_t skip_comments( 19 | SourceIterator begin, SourceIterator end, const std::string& comment = "") { 20 | skip_t out; 21 | if (comment.length() == 0) { 22 | out.begin = begin; 23 | out.lines = 0; 24 | return out; 25 | } 26 | 27 | SourceIterator cur = begin; 28 | int skip = 0; 29 | while (starts_with_comment(cur, end, comment)) { 30 | // Skip rest of line 31 | while (cur != end && *cur != '\n' && *cur != '\r') { 32 | ++cur; 33 | } 34 | 35 | advanceForLF(&cur, end); 36 | ++cur; 37 | ++skip; 38 | } 39 | 40 | out.begin = cur; 41 | out.lines = skip; 42 | return out; 43 | } 44 | 45 | std::vector 46 | emptyCols_(SourceIterator begin, SourceIterator end, size_t n = 100) { 47 | 48 | std::vector is_white; 49 | 50 | size_t row = 0; 51 | 52 | size_t col = 0; 53 | for (SourceIterator cur = begin; cur != end; ++cur) { 54 | if (row > n) { 55 | break; 56 | } 57 | 58 | switch (*cur) { 59 | case '\n': 60 | case '\r': 61 | advanceForLF(&cur, end); 62 | col = 0; 63 | row++; 64 | break; 65 | case ' ': 66 | col++; 67 | break; 68 | default: 69 | // Make sure there's enough room 70 | if (col >= is_white.size()) { 71 | is_white.resize(col + 1, true); 72 | } 73 | is_white[col] = false; 74 | col++; 75 | } 76 | } 77 | 78 | return is_white; 79 | } 80 | 81 | 
[[cpp11::register]] cpp11::list 82 | whitespaceColumns(const cpp11::list& sourceSpec, int n, std::string comment) { 83 | SourcePtr source = Source::create(sourceSpec); 84 | 85 | skip_t s = skip_comments(source->begin(), source->end(), std::move(comment)); 86 | 87 | std::vector empty = emptyCols_(s.begin, source->end(), n); 88 | std::vector begin; 89 | 90 | std::vector end; 91 | 92 | bool in_col = false; 93 | 94 | for (size_t i = 0; i < empty.size(); ++i) { 95 | if (in_col && empty[i]) { 96 | end.push_back(i); 97 | in_col = false; 98 | } else if (!in_col && !empty[i]) { 99 | begin.push_back(i); 100 | in_col = true; 101 | } 102 | } 103 | 104 | if (in_col) { 105 | end.push_back(empty.size()); 106 | } 107 | 108 | using namespace cpp11::literals; 109 | return cpp11::writable::list( 110 | {"begin"_nm = begin, "end"_nm = end, "skip"_nm = s.lines}); 111 | } 112 | 113 | // TokenizerFwf -------------------------------------------------------------- 114 | 115 | #include "TokenizerFwf.h" 116 | 117 | TokenizerFwf::TokenizerFwf( 118 | const std::vector& beginOffset, 119 | const std::vector& endOffset, 120 | std::vector NA, 121 | const std::string& comment, 122 | bool trimWS, 123 | bool skipEmptyRows) 124 | : beginOffset_(beginOffset), 125 | endOffset_(endOffset), 126 | NA_(std::move(NA)), 127 | cols_(beginOffset.size()), 128 | comment_(comment), 129 | moreTokens_(false), 130 | hasComment_(!comment.empty()), 131 | trimWS_(trimWS), 132 | skipEmptyRows_(skipEmptyRows) { 133 | if (beginOffset_.size() != endOffset_.size()) { 134 | cpp11::stop( 135 | "Begin (%i) and end (%i) specifications must have equal length", 136 | beginOffset_.size(), 137 | endOffset_.size()); 138 | } 139 | 140 | if (beginOffset_.empty()) { 141 | cpp11::stop("Zero-length begin and end specifications not supported"); 142 | } 143 | 144 | // File is assumed to be ragged (last column can have variable width) 145 | // when the last element of endOffset_ is NA 146 | isRagged_ = endOffset_[endOffset_.size() - 1L] == NA_INTEGER; 147 | 148 | max_ = 0; 149 | for (int j = 0; j < (cols_ - static_cast(isRagged_)); ++j) { 150 | if (endOffset_[j] <= beginOffset_[j]) { 151 | cpp11::stop( 152 | "Begin offset (%i) must be smaller than end offset (%i)", 153 | beginOffset_[j], 154 | endOffset_[j]); 155 | } 156 | 157 | if (beginOffset_[j] < 0) { 158 | cpp11::stop("Begin offset (%i) must be greater than 0", beginOffset_[j]); 159 | } 160 | 161 | if (endOffset_[j] < 0) { 162 | cpp11::stop("End offset (%i) must be greater than 0", endOffset_[j]); 163 | } 164 | 165 | if (endOffset_[j] > max_) { 166 | max_ = endOffset_[j]; 167 | } 168 | } 169 | } 170 | 171 | void TokenizerFwf::tokenize(SourceIterator begin, SourceIterator end) { 172 | cur_ = begin; 173 | curLine_ = begin; 174 | 175 | begin_ = begin; 176 | end_ = end; 177 | 178 | row_ = 0; 179 | col_ = 0; 180 | moreTokens_ = true; 181 | } 182 | 183 | std::pair TokenizerFwf::progress() { 184 | size_t bytes = cur_ - begin_; 185 | return std::make_pair(bytes / (double)(end_ - begin_), bytes); 186 | } 187 | 188 | Token TokenizerFwf::nextToken() { 189 | if (!moreTokens_) { 190 | return {TOKEN_EOF, 0, 0}; 191 | } 192 | 193 | // Check for comments only at start of line 194 | while (cur_ != end_ && col_ == 0 && 195 | (isComment(cur_) || (isEmpty() && skipEmptyRows_))) { 196 | // Skip rest of line 197 | while (cur_ != end_ && *cur_ != '\n' && *cur_ != '\r') { 198 | ++cur_; 199 | } 200 | advanceForLF(&cur_, end_); 201 | if (cur_ != end_) { 202 | ++cur_; 203 | } 204 | curLine_ = cur_; 205 | } 206 | 207 | // Find start of 
field 208 | SourceIterator fieldBegin = cur_; 209 | findBeginning: 210 | int skip = beginOffset_[col_] - (cur_ - curLine_); 211 | if (skip < 0) { // overlapping case 212 | fieldBegin += skip; 213 | } else if (skip > 0) { // skipped columns case 214 | for (int i = 0; i < skip; ++i) { 215 | if (fieldBegin == end_) { 216 | break; 217 | } 218 | 219 | if (*fieldBegin == '\n' || *fieldBegin == '\r') { 220 | std::stringstream ss1; 221 | ss1 << skip << " chars between fields"; 222 | std::stringstream ss2; 223 | ss2 << skip << " chars until end of line"; 224 | warn(row_, col_, ss1.str(), ss2.str()); 225 | 226 | row_++; 227 | col_ = 0; 228 | 229 | advanceForLF(&fieldBegin, end_); 230 | if (fieldBegin != end_) { 231 | fieldBegin++; 232 | } 233 | cur_ = curLine_ = fieldBegin; 234 | goto findBeginning; 235 | } 236 | fieldBegin++; 237 | } 238 | } 239 | 240 | if (fieldBegin == end_) { 241 | // need to warn here if col != 0/cols - 1 242 | moreTokens_ = false; 243 | return {TOKEN_EOF, 0, 0}; 244 | } 245 | 246 | // Find end of field 247 | SourceIterator fieldEnd = fieldBegin; 248 | bool lastCol = (col_ == cols_ - 1); 249 | 250 | bool tooShort = false; 251 | 252 | bool hasNull = false; 253 | 254 | if (lastCol && isRagged_) { 255 | // Last column is ragged, so read until end of line (ignoring width) 256 | while (fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') { 257 | if (*fieldEnd == '\0') { 258 | hasNull = true; 259 | } 260 | fieldEnd++; 261 | } 262 | } else { 263 | int width = endOffset_[col_] - beginOffset_[col_]; 264 | // Find the end of the field, stopping for newlines 265 | for (int i = 0; i < width; ++i) { 266 | if (fieldEnd == end_ || *fieldEnd == '\n' || *fieldEnd == '\r') { 267 | if (!(col_ == 0 && !skipEmptyRows_)) { 268 | std::stringstream ss1; 269 | ss1 << i << " chars"; 270 | std::stringstream ss2; 271 | ss2 << i; 272 | warn(row_, col_, ss1.str(), ss2.str()); 273 | } 274 | 275 | tooShort = true; 276 | break; 277 | } 278 | if (*fieldEnd == '\0') { 279 | hasNull = true; 280 | } 281 | 282 | fieldEnd++; 283 | } 284 | } 285 | 286 | Token t = fieldToken(fieldBegin, fieldEnd, hasNull); 287 | 288 | if (lastCol || tooShort) { 289 | row_++; 290 | col_ = 0; 291 | 292 | if (!(tooShort || isRagged_)) { 293 | // Advance to the end of the line in case we are not already there. 294 | // This is needed when the last column in the file is not being read.
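// [Editor's note] Example of why this matters: if the declared widths end at
// offset 5 but the physical line is "abcdefgh", the loop below walks past
// "fgh" so that cur_/curLine_ start cleanly on the next row instead of
// mid-line (hypothetical input, not from this repo's tests).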
295 | while (fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') { 296 | fieldEnd++; 297 | } 298 | } 299 | 300 | curLine_ = fieldEnd; 301 | advanceForLF(&curLine_, end_); 302 | if (curLine_ != end_) { 303 | curLine_++; 304 | } 305 | cur_ = curLine_; 306 | } else { 307 | col_++; 308 | cur_ = fieldEnd; 309 | } 310 | 311 | return t; 312 | } 313 | 314 | Token TokenizerFwf::fieldToken( 315 | SourceIterator begin, SourceIterator end, bool hasNull) { 316 | if (begin == end) { 317 | return {TOKEN_MISSING, row_, col_}; 318 | } 319 | 320 | Token t = Token(begin, end, row_, col_, hasNull); 321 | if (trimWS_) { 322 | t.trim(); 323 | } 324 | t.flagNA(NA_); 325 | 326 | return t; 327 | } 328 | 329 | bool TokenizerFwf::isComment(const char* cur) const { 330 | if (!hasComment_) { 331 | return false; 332 | } 333 | 334 | return starts_with_comment(cur, end_, comment_); 335 | } 336 | 337 | bool TokenizerFwf::isEmpty() const { 338 | return cur_ == end_ || *cur_ == '\r' || *cur_ == '\n'; 339 | } 340 | -------------------------------------------------------------------------------- /src/TokenizerFwf.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_TOKENIZERFWF_H_ 2 | #define MELTR_TOKENIZERFWF_H_ 3 | 4 | #include "Token.h" 5 | #include "Tokenizer.h" 6 | #include "utils.h" 7 | 8 | class TokenizerFwf : public Tokenizer { 9 | std::vector beginOffset_; 10 | std::vector endOffset_; 11 | std::vector NA_; 12 | 13 | SourceIterator begin_, cur_, curLine_, end_; 14 | int row_, col_, cols_, max_; 15 | std::string comment_; 16 | bool moreTokens_, isRagged_, hasComment_, trimWS_; 17 | bool skipEmptyRows_; 18 | 19 | public: 20 | TokenizerFwf( 21 | const std::vector& beginOffset, 22 | const std::vector& endOffset, 23 | std::vector NA = std::vector(1, "NA"), 24 | const std::string& comment = "", 25 | bool trimWS = true, 26 | bool skipEmptyRows = true); 27 | 28 | void tokenize(SourceIterator begin, SourceIterator end); 29 | 30 | std::pair progress(); 31 | 32 | Token nextToken(); 33 | 34 | private: 35 | Token fieldToken(SourceIterator begin, SourceIterator end, bool hasNull); 36 | 37 | bool isComment(const char* cur) const; 38 | bool isEmpty() const; 39 | }; 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /src/TokenizerWs.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/R.hpp" 2 | 3 | #include "Tokenizer.h" 4 | #include "TokenizerFwf.h" 5 | #include "TokenizerWs.h" 6 | #include "utils.h" 7 | 8 | #include "Source.h" 9 | 10 | // TokenizerWs 11 | // -------------------------------------------------------------------- 12 | 13 | #include 14 | #include 15 | 16 | TokenizerWs::TokenizerWs( 17 | std::vector NA, const std::string& comment, bool skipEmptyRows) 18 | : NA_(std::move(NA)), 19 | comment_(comment), 20 | moreTokens_(false), 21 | hasComment_(!comment.empty()), 22 | skipEmptyRows_(skipEmptyRows) {} 23 | 24 | void TokenizerWs::tokenize(SourceIterator begin, SourceIterator end) { 25 | cur_ = begin; 26 | curLine_ = begin; 27 | 28 | begin_ = begin; 29 | end_ = end; 30 | 31 | row_ = 0; 32 | col_ = 0; 33 | moreTokens_ = true; 34 | } 35 | 36 | std::pair TokenizerWs::progress() { 37 | size_t bytes = cur_ - begin_; 38 | return std::make_pair(bytes / (double)(end_ - begin_), bytes); 39 | } 40 | 41 | Token TokenizerWs::nextToken() { 42 | // Check for comments and empty lines at the start of a line 43 | while (cur_ != end_ && col_ == 0 && 44 | (isComment(cur_) || 
(skipEmptyRows_ && isEmpty()))) { 45 | ignoreLine(); 46 | } 47 | 48 | if (cur_ == end_) { 49 | return {TOKEN_EOF, 0, 0}; 50 | } 51 | 52 | // Find start of field 53 | SourceIterator fieldBegin = cur_; 54 | while (fieldBegin != end_ && (isblank(*fieldBegin) != 0)) { 55 | ++fieldBegin; 56 | } 57 | 58 | // Make sure we are not at the start of a comment 59 | if (isComment(fieldBegin)) { 60 | ignoreLine(); 61 | row_++; 62 | col_ = 0; 63 | return nextToken(); 64 | } 65 | 66 | SourceIterator fieldEnd = fieldBegin; 67 | while (fieldEnd != end_ && (isspace(*fieldEnd) == 0)) { 68 | ++fieldEnd; 69 | } 70 | bool hasNull = fieldEnd != end_ && *fieldEnd == '\0'; 71 | Token t = fieldToken(fieldBegin, fieldEnd, hasNull); 72 | cur_ = fieldEnd; 73 | ++col_; 74 | if (cur_ != end_ && (*cur_ == '\r' || *cur_ == '\n')) { 75 | advanceForLF(&cur_, end_); 76 | ++cur_; 77 | row_++; 78 | col_ = 0; 79 | } 80 | return t; 81 | } 82 | 83 | Token TokenizerWs::fieldToken( 84 | SourceIterator begin, SourceIterator end, bool hasNull) { 85 | if (begin == end) { 86 | return {TOKEN_MISSING, row_, col_}; 87 | } 88 | 89 | Token t = Token(begin, end, row_, col_, hasNull); 90 | t.trim(); 91 | t.flagNA(NA_); 92 | 93 | return t; 94 | } 95 | 96 | bool TokenizerWs::isComment(const char* cur) const { 97 | if (!hasComment_) { 98 | return false; 99 | } 100 | 101 | return starts_with_comment(cur, end_, comment_); 102 | } 103 | 104 | bool TokenizerWs::isEmpty() const { 105 | return cur_ == end_ || *cur_ == '\r' || *cur_ == '\n'; 106 | } 107 | 108 | void TokenizerWs::ignoreLine() { 109 | // Skip rest of line 110 | while (cur_ != end_ && *cur_ != '\n' && *cur_ != '\r') { 111 | ++cur_; 112 | } 113 | advanceForLF(&cur_, end_); 114 | if (cur_ != end_) { 115 | ++cur_; 116 | } 117 | curLine_ = cur_; 118 | } 119 | -------------------------------------------------------------------------------- /src/TokenizerWs.h: -------------------------------------------------------------------------------- 1 | #ifndef READR_TOKENIZERWS_H_ 2 | #define READR_TOKENIZERWS_H_ 3 | 4 | #include "Token.h" 5 | #include "Tokenizer.h" 6 | #include "utils.h" 7 | 8 | class TokenizerWs : public Tokenizer { 9 | std::vector NA_; 10 | 11 | SourceIterator begin_, cur_, curLine_, end_; 12 | int row_, col_; 13 | std::string comment_; 14 | bool moreTokens_, hasComment_; 15 | bool skipEmptyRows_; 16 | 17 | public: 18 | TokenizerWs( 19 | std::vector NA = std::vector(1, "NA"), 20 | const std::string& comment = "", 21 | bool skipEmptyRows = true); 22 | 23 | void tokenize(SourceIterator begin, SourceIterator end); 24 | 25 | std::pair progress(); 26 | 27 | Token nextToken(); 28 | 29 | private: 30 | Token fieldToken(SourceIterator begin, SourceIterator end, bool hasNull); 31 | 32 | bool isComment(const char* cur) const; 33 | bool isEmpty() const; 34 | void ignoreLine(); 35 | }; 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/Warnings.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_WARNINGS_H_ 2 | #define MELTR_WARNINGS_H_ 3 | 4 | #include "cpp11/data_frame.hpp" 5 | #include "cpp11/sexp.hpp" 6 | #include "cpp11/strings.hpp" 7 | #include 8 | #include 9 | 10 | class Warnings { 11 | std::vector row_, col_; 12 | std::vector expected_, actual_; 13 | 14 | public: 15 | Warnings() {} 16 | 17 | // row and col should be zero-indexed. 
-------------------------------------------------------------------------------- /src/connection.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11/R.hpp" 2 | #include "cpp11/function.hpp" 3 | #include "cpp11/raws.hpp" 4 | #include "cpp11/strings.hpp" 5 | 6 | #include <fstream> 7 | 8 | // Wrapper around R's readBin function 9 | SEXP read_bin(const cpp11::sexp& con, int bytes) { 10 | static auto readBin = cpp11::package("base")["readBin"]; 11 | 12 | return readBin(con, "raw", bytes); 13 | } 14 | 15 | // Read data from a connection in chunks, writing each chunk to `filename`; 16 | // returns the path of the file that now holds the connection's bytes. 17 | // 18 | [[cpp11::register]] std::string 19 | read_connection_(const cpp11::sexp& con, std::string filename, int chunk_size) { 20 | 21 | std::ofstream out(filename.c_str(), std::fstream::out | std::fstream::binary); 22 | 23 | SEXP chunk = read_bin(con, chunk_size); 24 | R_xlen_t chunk_len = Rf_xlength(chunk); 25 | 26 | while (chunk_len > 0) { 27 | std::copy( 28 | RAW(chunk), 29 | RAW(chunk) + Rf_xlength(chunk), 30 | std::ostream_iterator<char>(out)); 31 | chunk = read_bin(con, chunk_size); 32 | chunk_len = Rf_xlength(chunk); 33 | } 34 | 35 | return filename; 36 | } 37 |
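The same chunked copy loop sketched standalone, with std::ifstream standing in for the R connection (illustrative; the helper name is made up and is not part of the package):

#include <cstddef>
#include <fstream>
#include <vector>

void copy_in_chunks(const char* src, const char* dst, std::size_t chunk_size) {
  std::ifstream in(src, std::ios::binary);
  std::ofstream out(dst, std::ios::binary);
  std::vector<char> buf(chunk_size);
  // Mirror read_connection_: pull a chunk, write what was actually read,
  // and stop once a read yields zero bytes.
  while (in.read(buf.data(), buf.size()) || in.gcount() > 0) {
    out.write(buf.data(), in.gcount());
  }
}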
-------------------------------------------------------------------------------- /src/cpp11.cpp: -------------------------------------------------------------------------------- 1 | // Generated by cpp11: do not edit by hand 2 | // clang-format off 3 | 4 | 5 | #include "cpp11/declarations.hpp" 6 | #include <R_ext/Visibility.h> 7 | 8 | // CollectorGuess.cpp 9 | std::string collectorGuess(const cpp11::strings& input, const cpp11::list& locale_, bool guessInteger); 10 | extern "C" SEXP _meltr_collectorGuess(SEXP input, SEXP locale_, SEXP guessInteger) { 11 | BEGIN_CPP11 12 | return cpp11::as_sexp(collectorGuess(cpp11::as_cpp<cpp11::decay_t<const cpp11::strings&>>(input), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(locale_), cpp11::as_cpp<cpp11::decay_t<bool>>(guessInteger))); 13 | END_CPP11 14 | } 15 | // connection.cpp 16 | std::string read_connection_(const cpp11::sexp& con, std::string filename, int chunk_size); 17 | extern "C" SEXP _meltr_read_connection_(SEXP con, SEXP filename, SEXP chunk_size) { 18 | BEGIN_CPP11 19 | return cpp11::as_sexp(read_connection_(cpp11::as_cpp<cpp11::decay_t<const cpp11::sexp&>>(con), cpp11::as_cpp<cpp11::decay_t<std::string>>(filename), cpp11::as_cpp<cpp11::decay_t<int>>(chunk_size))); 20 | END_CPP11 21 | } 22 | // read.cpp 23 | cpp11::strings read_file_(const cpp11::list& sourceSpec, const cpp11::list& locale_); 24 | extern "C" SEXP _meltr_read_file_(SEXP sourceSpec, SEXP locale_) { 25 | BEGIN_CPP11 26 | return cpp11::as_sexp(read_file_(cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(sourceSpec), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(locale_))); 27 | END_CPP11 28 | } 29 | // read.cpp 30 | cpp11::raws read_file_raw_(const cpp11::list& sourceSpec); 31 | extern "C" SEXP _meltr_read_file_raw_(SEXP sourceSpec) { 32 | BEGIN_CPP11 33 | return cpp11::as_sexp(read_file_raw_(cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(sourceSpec))); 34 | END_CPP11 35 | } 36 | // read.cpp 37 | cpp11::sexp melt_tokens_(const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec, const cpp11::list& colSpecs, const cpp11::list& locale_, int n_max, bool progress); 38 | extern "C" SEXP _meltr_melt_tokens_(SEXP sourceSpec, SEXP tokenizerSpec, SEXP colSpecs, SEXP locale_, SEXP n_max, SEXP progress) { 39 | BEGIN_CPP11 40 | return cpp11::as_sexp(melt_tokens_(cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(sourceSpec), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(tokenizerSpec), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(colSpecs), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(locale_), cpp11::as_cpp<cpp11::decay_t<int>>(n_max), cpp11::as_cpp<cpp11::decay_t<bool>>(progress))); 41 | END_CPP11 42 | } 43 | // read.cpp 44 | void melt_tokens_chunked_(const cpp11::list& sourceSpec, const cpp11::environment& callback, int chunkSize, const cpp11::list& tokenizerSpec, const cpp11::list& colSpecs, const cpp11::list& locale_, bool progress); 45 | extern "C" SEXP _meltr_melt_tokens_chunked_(SEXP sourceSpec, SEXP callback, SEXP chunkSize, SEXP tokenizerSpec, SEXP colSpecs, SEXP locale_, SEXP progress) { 46 | BEGIN_CPP11 47 | melt_tokens_chunked_(cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(sourceSpec), cpp11::as_cpp<cpp11::decay_t<const cpp11::environment&>>(callback), cpp11::as_cpp<cpp11::decay_t<int>>(chunkSize), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(tokenizerSpec), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(colSpecs), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(locale_), cpp11::as_cpp<cpp11::decay_t<bool>>(progress)); 48 | return R_NilValue; 49 | END_CPP11 50 | } 51 | // TokenizerFwf.cpp 52 | cpp11::list whitespaceColumns(const cpp11::list& sourceSpec, int n, std::string comment); 53 | extern "C" SEXP _meltr_whitespaceColumns(SEXP sourceSpec, SEXP n, SEXP comment) { 54 | BEGIN_CPP11 55 | return cpp11::as_sexp(whitespaceColumns(cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(sourceSpec), cpp11::as_cpp<cpp11::decay_t<int>>(n), cpp11::as_cpp<cpp11::decay_t<std::string>>(comment))); 56 | END_CPP11 57 | } 58 | 59 | extern "C" { 60 | static const R_CallMethodDef CallEntries[] = { 61 | {"_meltr_collectorGuess", (DL_FUNC) &_meltr_collectorGuess, 3}, 62 | {"_meltr_melt_tokens_", (DL_FUNC) &_meltr_melt_tokens_, 6}, 63 | {"_meltr_melt_tokens_chunked_", (DL_FUNC) &_meltr_melt_tokens_chunked_, 7}, 64 | {"_meltr_read_connection_", (DL_FUNC) &_meltr_read_connection_, 3}, 65 | {"_meltr_read_file_", (DL_FUNC) &_meltr_read_file_, 2}, 66 | {"_meltr_read_file_raw_", (DL_FUNC) &_meltr_read_file_raw_, 1}, 67 | {"_meltr_whitespaceColumns", (DL_FUNC) &_meltr_whitespaceColumns, 3}, 68 | {NULL, NULL, 0} 69 | }; 70 | } 71 | 72 | extern "C" attribute_visible void R_init_meltr(DllInfo* dll){ 73 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 74 | R_useDynamicSymbols(dll, FALSE); 75 | R_forceSymbols(dll, TRUE); 76 | } 77 |
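Every generated wrapper above has the same shape: plain SEXP in and out at the C boundary, BEGIN_CPP11/END_CPP11 translating C++ exceptions into R errors, and as_cpp()/as_sexp() performing the conversions. A sketch of that pattern for a hypothetical registered function f (not part of meltr):

#include "cpp11/declarations.hpp"

double f(int n);  // the C++ function that [[cpp11::register]] would expose

extern "C" SEXP _pkg_f(SEXP n) {
  BEGIN_CPP11
  // as_cpp unwraps the incoming SEXP; as_sexp wraps the C++ return value.
  return cpp11::as_sexp(f(cpp11::as_cpp<cpp11::decay_t<int>>(n)));
  END_CPP11
}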
-------------------------------------------------------------------------------- /src/read.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "cpp11/environment.hpp" 4 | #include "cpp11/function.hpp" 5 | #include "cpp11/list.hpp" 6 | #include "cpp11/strings.hpp" 7 | 8 | #include "Collector.h" 9 | #include "LocaleInfo.h" 10 | #include "Progress.h" 11 | #include "Reader.h" 12 | #include "Source.h" 13 | #include "Tokenizer.h" 14 | #include "Warnings.h" 15 | 16 | [[cpp11::register]] cpp11::strings 17 | read_file_(const cpp11::list& sourceSpec, const cpp11::list& locale_) { 18 | SourcePtr source = Source::create(sourceSpec); 19 | LocaleInfo locale(locale_); 20 | 21 | return cpp11::writable::strings( 22 | locale.encoder_.makeSEXP(source->begin(), source->end())); 23 | } 24 | 25 | [[cpp11::register]] cpp11::raws read_file_raw_(const cpp11::list& sourceSpec) { 26 | SourcePtr source = Source::create(sourceSpec); 27 | 28 | cpp11::writable::raws res( 29 | static_cast<R_xlen_t>(source->end() - source->begin())); 30 | std::copy(source->begin(), source->end(), RAW(res)); 31 | return SEXP(res); 32 | } 33 | 34 | cpp11::function 35 | R6method(const cpp11::environment& env, const std::string& method) { 36 | return static_cast<SEXP>(env[method.c_str()]); 37 | } 38 | bool isTrue(SEXP x) { 39 | if (!(TYPEOF(x) == LGLSXP && Rf_length(x) == 1)) { 40 | cpp11::stop("`continue()` must return a length 1 logical vector"); 41 | } 42 | return LOGICAL(x)[0] == TRUE; 43 | } 44 | 45 | typedef std::vector<CollectorPtr>::iterator CollectorItr; 46 | 47 | [[cpp11::register]] cpp11::sexp melt_tokens_( 48 | const cpp11::list& sourceSpec, 49 | const cpp11::list& tokenizerSpec, 50 | const cpp11::list& colSpecs, 51 | const cpp11::list& locale_, 52 | int n_max, 53 | bool progress) { 54 | 55 | LocaleInfo l(locale_); 56 | Reader r( 57 | Source::create(sourceSpec), 58 | Tokenizer::create(tokenizerSpec), 59 | collectorsCreate(colSpecs, &l), 60 | progress); 61 | 62 | return r.meltToDataFrame(cpp11::list(locale_), n_max); 63 | } 64 | 65 | [[cpp11::register]] void melt_tokens_chunked_( 66 | const cpp11::list& sourceSpec, 67 | const cpp11::environment& callback, 68 | int chunkSize, 69 | const cpp11::list& tokenizerSpec, 70 | const cpp11::list& colSpecs, 71 | const cpp11::list& locale_, 72 | bool progress) { 73 | 74 | LocaleInfo l(locale_); 75 | Reader r( 76 | Source::create(sourceSpec), 77 | Tokenizer::create(tokenizerSpec), 78 | collectorsCreate(colSpecs, &l), 79 | progress); 80 | 81 | int pos = 1; 82 | while (isTrue(R6method(callback, "continue")())) { 83 | cpp11::data_frame out( 84 | r.meltToDataFrame(static_cast<SEXP>(locale_), chunkSize)); 85 | if (out.nrow() == 0) { 86 | return; 87 | } 88 | R6method(callback, "receive")(out, pos); 89 | pos += out.nrow(); 90 | } 91 | } 92 |
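The chunked interface is a pull loop over two R6 methods: continue() gates each iteration and receive() is handed the chunk together with the 1-based row position where it starts. A standalone mock of that contract, with std::function standing in for the R6 calls (all names here are illustrative):

#include <functional>

struct ChunkResult { int nrow; };  // stand-in for the melted data frame

void melt_chunked_mock(
    const std::function<bool()>& keep_going,               // continue()
    const std::function<void(ChunkResult, int)>& receive,  // receive(data, pos)
    const std::function<ChunkResult(int)>& next_chunk,
    int chunk_size) {
  int pos = 1;
  while (keep_going()) {
    ChunkResult out = next_chunk(chunk_size);
    if (out.nrow == 0) {
      return;  // input exhausted: stop without invoking receive()
    }
    receive(out, pos);
    pos += out.nrow;  // pos now points at the first row of the next chunk
  }
}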
-------------------------------------------------------------------------------- /src/unicode_fopen.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <cstdio> 4 | // clang-format off 5 | #ifdef __clang__ 6 | # pragma clang diagnostic push 7 | # pragma clang diagnostic ignored "-Wsign-compare" 8 | #include "mio.h" 9 | # pragma clang diagnostic pop 10 | #else 11 | #include "mio.h" 12 | #endif 13 | // clang-format on 14 | 15 | #ifdef _WIN32 16 | #include <windows.h> 17 | #include <R.h> 18 | #endif 19 | 20 | // This is needed to support wide character paths on windows 21 | inline FILE* unicode_fopen(const char* path, const char* mode) { 22 | FILE* out; 23 | #ifdef _WIN32 24 | // First convert the mode to the wide equivalent 25 | // Only usage is 2 characters so max 8 bytes + 2 byte null. 26 | wchar_t mode_w[10]; 27 | MultiByteToWideChar(CP_UTF8, 0, mode, -1, mode_w, 9); 28 | 29 | // Then convert the path 30 | wchar_t* buf; 31 | size_t len = MultiByteToWideChar(CP_UTF8, 0, path, -1, NULL, 0); 32 | if (len <= 0) { 33 | Rf_error("Cannot convert file to Unicode: %s", path); 34 | } 35 | buf = (wchar_t*)R_alloc(len, sizeof(wchar_t)); 36 | if (buf == NULL) { 37 | Rf_error("Could not allocate buffer of size: %zu", len); 38 | } 39 | 40 | MultiByteToWideChar(CP_UTF8, 0, path, -1, buf, len); 41 | out = _wfopen(buf, mode_w); 42 | #else 43 | out = fopen(path, mode); 44 | #endif 45 | 46 | return out; 47 | } 48 | 49 | inline mio::mmap_source 50 | make_mmap_source(const char* file, std::error_code& error) { 51 | #ifdef _WIN32 52 | wchar_t* buf; 53 | size_t len = MultiByteToWideChar(CP_UTF8, 0, file, -1, NULL, 0); 54 | if (len <= 0) { 55 | Rf_error("Cannot convert file to Unicode: %s", file); 56 | } 57 | buf = (wchar_t*)malloc(len * sizeof(wchar_t)); 58 | if (buf == NULL) { 59 | Rf_error("Could not allocate buffer of size: %zu", len); 60 | } 61 | 62 | MultiByteToWideChar(CP_UTF8, 0, file, -1, buf, len); 63 | mio::mmap_source out = mio::make_mmap_source(buf, error); 64 | free(buf); 65 | return out; 66 | #else 67 | return mio::make_mmap_source(file, error); 68 | #endif 69 | } 70 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef MELTR_UTILS_H_ 2 | #define MELTR_UTILS_H_ 3 | 4 | #include <cstring> 5 | #include <locale> 6 | #include <string> 7 | 8 | // If *pBegin points at a CR that is followed by a LF, advances *pBegin onto the LF. 9 | // Returns an iterator to the end of the current line (the original position). 10 | template <typename Iter> inline Iter advanceForLF(Iter* pBegin, Iter end) { 11 | Iter cur = *pBegin; 12 | if (cur == end) { 13 | return cur; 14 | } 15 | if (*cur == '\r' && (cur + 1 != end) && *(cur + 1) == '\n') 16 | (*pBegin)++; 17 | 18 | return cur; 19 | } 20 | 21 | const static char* const true_values[] = { 22 | "T", "t", "True", "TRUE", "true", (char*)NULL}; 23 | const static char* const false_values[] = { 24 | "F", "f", "False", "FALSE", "false", (char*)NULL}; 25 | 26 | inline bool isTrue(const char* start, const char* end) { 27 | size_t len = end - start; 28 | 29 | for (int i = 0; true_values[i]; i++) { 30 | size_t true_len = strlen(true_values[i]); 31 | if (true_len == len && strncmp(start, true_values[i], len) == 0) { 32 | return true; 33 | } 34 | } 35 | return false; 36 | } 37 | inline bool isFalse(const char* start, const char* end) { 38 | size_t len = end - start; 39 | 40 | for (int i = 0; false_values[i]; i++) { 41 | if (strlen(false_values[i]) == len && 42 | strncmp(start, false_values[i], len) == 0) { 43 | return true; 44 | } 45 | } 46 | return false; 47 | } 48 | 49 | inline bool isLogical(const char* start, const char* end) { 50 | return isTrue(start, end) || isFalse(start, end); 51 | } 52 | 53 | inline bool istarts_with(const std::string& input, const std::string& test) { 54 | if (test.size() > input.size()) { 55 | return false; 56 | } 57 | 58 | auto test_it = test.cbegin(); 59 | auto input_it = input.cbegin(); 60 | auto test_end = test.cend(); 61 | auto locale = std::locale(); 62 | while (test_it != test_end) { 63 | if (std::toupper(*test_it++, locale) != std::toupper(*input_it++, locale)) { 64 | return false; 65 | } 66 | } 67 | return true; 68 | } 69 | 70 | inline bool starts_with_comment( 71 | const char* cur, const char* end, const std::string& comment) { 72 | // If the comment is bigger than what we are testing, it cannot start with it.
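  //   e.g. comment = "#" against cur -> "# note" yields true; against "42", false.
  //   (The byte-by-byte comparison below never reads past end, thanks to the size check.)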
73 | if ((long)comment.size() > (end - cur)) { 74 | return false; 75 | } 76 | for (auto c : comment) { 77 | if (*cur++ != c) { 78 | return false; 79 | } 80 | } 81 | return true; 82 | } 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(meltr) 3 | 4 | test_check("meltr") 5 | -------------------------------------------------------------------------------- /tests/testthat/basic-df-singlequote.csv: -------------------------------------------------------------------------------- 1 | a,b,c,d 2 | TRUE,7,0.181526642525569,'m' 3 | TRUE,2,0.833227441413328,'z' 4 | TRUE,8,0.926790483295918,'r' 5 | FALSE,10,0.375270307529718,'s' 6 | TRUE,6,0.420266286935657,'g' 7 | TRUE,3,0.435449987649918,'h' 8 | TRUE,5,0.0210941969417036,'w' 9 | FALSE,9,0.0915570755023509,'u' 10 | FALSE,1,0.756106866057962,'l' 11 | FALSE,4,0.353530979715288,NA 12 | -------------------------------------------------------------------------------- /tests/testthat/basic-df.csv: -------------------------------------------------------------------------------- 1 | a,b,c,d 2 | TRUE,7,0.181526642525569,"m" 3 | TRUE,2,0.833227441413328,"z" 4 | TRUE,8,0.926790483295918,"r" 5 | FALSE,10,0.375270307529718,"s" 6 | TRUE,6,0.420266286935657,"g" 7 | TRUE,3,0.435449987649918,"h" 8 | TRUE,5,0.0210941969417036,"w" 9 | FALSE,9,0.0915570755023509,"u" 10 | FALSE,1,0.756106866057962,"l" 11 | FALSE,4,0.353530979715288,NA 12 | -------------------------------------------------------------------------------- /tests/testthat/empty-file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-lib/meltr/38c5a720afe794d1fd2f36e5bb552dd9a8ca8b47/tests/testthat/empty-file -------------------------------------------------------------------------------- /tests/testthat/enc-iso-8859-1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-lib/meltr/38c5a720afe794d1fd2f36e5bb552dd9a8ca8b47/tests/testthat/enc-iso-8859-1.txt -------------------------------------------------------------------------------- /tests/testthat/fwf-trailing.txt: -------------------------------------------------------------------------------- 1 | 123 123 2 | 123 123 3 | -------------------------------------------------------------------------------- /tests/testthat/helper.R: -------------------------------------------------------------------------------- 1 | # Provide helper overriding tibble::all.equal.tbl_df as it requires dplyr 2 | # https://github.com/tidyverse/readr/pull/577 3 | # Using this helper allows us to avoid Suggesting dplyr 4 | all.equal.tbl_df <- function(target, current, ..., check.attributes = FALSE) { 5 | all.equal.list(target, current, ..., check.attributes = check.attributes) 6 | } 7 | 8 | is_bz2_file <- function(x) { 9 | 10 | # Magic number for bz2 is "BZh" in ASCII 11 | # https://en.wikipedia.org/wiki/Bzip2#File_format 12 | identical(charToRaw("BZh"), readBin(x, n = 3, what = "raw")) 13 | } 14 | 15 | encoded <- function(x, encoding) { 16 | Encoding(x) <- encoding 17 | x 18 | } 19 | 20 | skip_if_no_clipboard <- function() { 21 | if (!clipr::clipr_available()) { 22 | testthat::skip("System clipboard is not available - skipping test.") 23 | } 24 | return(invisible(TRUE)) 25 | } 26 | 27 | with_crayon <- function(expr) { 28 | old <- options(crayon.enabled = TRUE, 
crayon.colors = 16) 29 | crayon::num_colors(forget = TRUE) 30 | on.exit({ 31 | options(old) 32 | crayon::num_colors(forget = TRUE) 33 | }) 34 | 35 | force(expr) 36 | } 37 | -------------------------------------------------------------------------------- /tests/testthat/non-tabular.csv: -------------------------------------------------------------------------------- 1 | a,"b",'c' 2 | ,,NA,"NA", 3 | a,1,1.0,1.1,1e3 4 | -------------------------------------------------------------------------------- /tests/testthat/raw.csv: -------------------------------------------------------------------------------- 1 | abc,def 2 | abc,def 3 | -------------------------------------------------------------------------------- /tests/testthat/table-crash: -------------------------------------------------------------------------------- 1 | 3.5022800E+05 2.1990000E+02 1.7455317E-03 5.0152367E+00 1.0200010E+00 0.0000000E+00 1.0360000E+03 2 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 3 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 4 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 5 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 6 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 7 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 8 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 9 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 10 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 11 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 12 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 13 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 14 | 1.3231179E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 6.9944140E-03 15 | -9.9920000E+02 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 16 | -9.9920000E+02 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 17 | -9.9920000E+02 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 18 | -9.9920000E+02 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 19 | 2.5980995E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 7.0062219E-03 20 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 21 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 22 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 23 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 24 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 25 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 26 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 27 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 
0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 28 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 29 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 30 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 31 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 32 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 33 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 34 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 35 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 36 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 37 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 38 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 39 | 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.00000 -------------------------------------------------------------------------------- /tests/testthat/test-melt-chunked.R: -------------------------------------------------------------------------------- 1 | test_that("melt_delim_chunked", { 2 | file <- meltr_example("mtcars.csv") 3 | unchunked <- melt_csv(file) 4 | 5 | get_dims <- function(data, pos) dims[[length(dims) + 1]] <<- dim(data) 6 | 7 | # Full file in one chunk 8 | dims <- list() 9 | melt_csv_chunked(file, get_dims) 10 | expect_equal(dim(unchunked), dims[[1]]) 11 | 12 | # Each line separately 13 | dims <- list() 14 | melt_csv_chunked(file, get_dims, chunk_size = 1) 15 | expect_true(all(vapply(dims[1:6], identical, logical(1), c(11L, 4L)))) 16 | expect_equal(nrow(unchunked) / 11L, length(dims)) 17 | 18 | # In chunks of 5 19 | dims <- list() 20 | melt_csv_chunked(file, get_dims, chunk_size = 5) 21 | expect_true(all(vapply(dims[1:6], identical, logical(1), c(55L, 4L)))) 22 | expect_true(identical(dims[[7]], c(33L, 4L))) 23 | 24 | # Halting early 25 | get_dims_stop <- function(data, pos) { 26 | dims[[length(dims) + 1]] <<- dim(data) 27 | if (pos >= 5) { 28 | return(FALSE) 29 | } 30 | } 31 | dims <- list() 32 | melt_csv_chunked(file, get_dims_stop, chunk_size = 5) 33 | expect_true(length(dims) == 2) 34 | expect_true(all(vapply(dims[1:2], identical, logical(1), c(55L, 4L)))) 35 | }) 36 | 37 | test_that("DataFrameCallback works as intended", { 38 | f <- meltr_example("mtcars.csv") 39 | out0 <- subset(melt_csv(f), data_type == "integer") 40 | fun3 <- DataFrameCallback$new(function(x, pos) { 41 | subset(x, data_type == "integer") 42 | }) 43 | 44 | out1 <- melt_csv_chunked(f, fun3) 45 | out2 <- melt_csv_chunked(f, fun3, chunk_size = 1) 46 | out3 <- melt_csv_chunked(f, fun3, chunk_size = 10) 47 | 48 | expect_true(all.equal(out0, out1)) 49 | expect_true(all.equal(out0, out2)) 50 | expect_true(all.equal(out0, out3)) 51 | 52 | 53 | # The same filter again, with a freshly constructed callback 54 | out0 <- subset(melt_csv(f), data_type == "integer") 55 | 56 | fun5 <- DataFrameCallback$new(function(x, pos) subset(x, data_type == "integer")) 57 | 58 | out1 <- melt_csv_chunked(f, fun5) 59 | 60 | # A chunk_size of 1 feeds the callback one line of the file at a time 61 | out2 <- melt_csv_chunked(f, fun5, chunk_size = 1) 62 | 63 | out3 <- melt_csv_chunked(f, fun5,
chunk_size = 10) 64 | 65 | expect_true(all.equal(out0, out1)) 66 | expect_true(all.equal(out0, out2)) 67 | expect_true(all.equal(out0, out3)) 68 | }) 69 | 70 | test_that("ListCallback works as intended", { 71 | f <- meltr_example("mtcars.csv") 72 | out0 <- melt_csv(f) 73 | 74 | fun <- ListCallback$new(function(x, pos) x[["value"]]) 75 | out1 <- melt_csv_chunked(f, fun, chunk_size = 10) 76 | 77 | expect_equal(out0[["value"]], unlist(out1)) 78 | }) 79 | -------------------------------------------------------------------------------- /tests/testthat/test-melt-csv.R: -------------------------------------------------------------------------------- 1 | test_that("melt_csv type imputation and NA detection works", { 2 | skip_on_os("windows") 3 | melt_data <- melt_csv("non-tabular.csv", na = "NA") 4 | expect_equal( 5 | melt_data$data_type[7:11], 6 | c("missing", "empty", "character", "integer", "double") 7 | ) 8 | }) 9 | 10 | test_that("melt_tsv works on a simple file", { 11 | melt_data <- melt_tsv("a\tb\n1\t2") 12 | expect_equal(melt_data$data_type, rep(c("character", "integer"), each = 2)) 13 | }) 14 | 15 | test_that("melt_csv's 'NA' option genuinely changes the NA values", { 16 | expect_equal(melt_csv("z\n", na = "z")$data_type, "missing") 17 | }) 18 | 19 | test_that("melt_csv's 'NA' option works with multiple NA values", { 20 | expect_equal( 21 | melt_csv("NA\nmiss\n13", na = c("13", "miss"))$data_type, 22 | c("character", "missing", "missing") 23 | ) 24 | }) 25 | 26 | test_that('passing character() to melt_csv\'s "NA" option reads "" correctly', { 27 | expect_equal(melt_csv("foo\n", na = character())$value, "foo") 28 | }) 29 | 30 | test_that("passing \"\" to melt_csv's 'NA' option reads \"\" correctly", { 31 | expect_equal( 32 | melt_csv("foo,bar\nfoo,\n", na = "")$value, 33 | c("foo", "bar", "foo", NA) 34 | ) 35 | }) 36 | 37 | test_that("changing melt_csv's 'quote' argument works correctly", { 38 | test_data <- melt_csv("basic-df.csv") 39 | test_data_singlequote <- melt_csv("basic-df-singlequote.csv", quote = "'") 40 | expect_identical(test_data, test_data_singlequote) 41 | }) 42 | 43 | test_that("melt_csv's 'skip' option allows for skipping", { 44 | test_data <- melt_csv("basic-df.csv", skip = 1) 45 | expect_equal(nrow(test_data), 40) 46 | }) 47 | 48 | test_that("melt_csv's 'n_max' allows for a maximum number of records and does not corrupt any", { 49 | test_data <- melt_csv("basic-df.csv", n_max = 7) 50 | expect_equal(nrow(test_data), 28) 51 | expect_equal(sum(test_data$data_type == "missing"), 0) 52 | }) 53 | 54 | test_that("can read more than 100 columns", { 55 | set.seed(2015 - 3 - 13) 56 | x <- as.data.frame(matrix(rbinom(300, 2, .5), nrow = 2)) 57 | f <- tempfile() 58 | on.exit(unlink(f)) 59 | write.csv(x, f, row.names = FALSE) 60 | expect_equal(max(melt_csv(f)$col), 150) 61 | }) 62 | 63 | test_that("encoding affects text", { 64 | x <- melt_csv("enc-iso-8859-1.txt", locale = locale(encoding = "ISO-8859-1")) 65 | expect_identical(x$value[2], "\u00e9l\u00e8ve") 66 | }) 67 | 68 | test_that("nuls are dropped with a warning", { 69 | expect_warning(x <- melt_csv("raw.csv")) 70 | expect_equal(n_problems(x), 1) 71 | expect_equal(x$value[3], "ab") 72 | }) 73 | 74 | test_that("can read from the clipboard", { 75 | skip_on_cran() 76 | skip_if_no_clipboard() 77 | clipr::write_clip("a,b,c\n1,2,3") 78 | expect_identical(melt_csv(clipboard()), melt_csv("a,b,c\n1,2,3")) 79 | }) 80 | 81 | test_that("can read from a multi-line character vector", { 82 | expect_identical(max(melt_csv(c("a,b,c",
"1,2,3"))$row), 2) 83 | }) 84 | 85 | # Column warnings --------------------------------------------------------- 86 | 87 | test_that("missing lines are not skipped", { 88 | # first 89 | expect_equal(max(melt_csv("a,b\n\n\n1,2")$row), 4) 90 | 91 | # middle 92 | expect_equal(max(melt_csv("a,b\n1,2\n\n\n2,3\n")$row), 5) 93 | 94 | # last (trailing \n is ignored) 95 | expect_equal(max(melt_csv("a,b\n1,2\n\n\n")$row), 4) 96 | }) 97 | 98 | # melt_csv2 --------------------------------------------------------------- 99 | 100 | test_that("decimal mark automatically set to ,", { 101 | expect_message( 102 | x <- melt_csv2("x\n1,23"), 103 | if (default_locale()$decimal_mark == ".") "decimal .*grouping .*mark" else NA 104 | ) 105 | expect_equal(x$data_type[2], "double") 106 | }) 107 | 108 | # Zero rows --------------------------------------------------------------- 109 | 110 | test_that("n_max 0 gives zero row data frame", { 111 | x <- melt_csv("a,b\n1,2", n_max = 0) 112 | expect_equal(dim(x), c(0, 4)) 113 | }) 114 | 115 | # Comments ---------------------------------------------------------------- 116 | 117 | test_that("comments are ignored regardless of where they appear", { 118 | out1 <- melt_csv("x\n1#comment", comment = "#") 119 | out2 <- melt_csv("x\n1#comment\n#comment", comment = "#") 120 | out3 <- melt_csv('x\n"1"#comment', comment = "#") 121 | 122 | chk1 <- tibble::tibble( 123 | row = c(1, 2), 124 | col = c(1, 1), 125 | data_type = c("character", "integer"), 126 | value = c("x", "1") 127 | ) 128 | 129 | expect_true(all.equal(chk1, out1)) 130 | expect_true(all.equal(chk1, out2)) 131 | expect_true(all.equal(chk1, out3)) 132 | 133 | out5 <- melt_csv("x1,x2,x3\nA2,B2,C2\nA3#,B2,C2\nA4,A5,A6", comment = "#") 134 | out6 <- melt_csv("x1,x2,x3\nA2,B2,C2\nA3,#B2,C2\nA4,A5,A6", comment = "#") 135 | out7 <- melt_csv("x1,x2,x3\nA2,B2,C2\nA3,#B2,C2\n#comment\nA4,A5,A6", comment = "#") 136 | 137 | chk2 <- tibble::tibble( 138 | row = c(1, 1, 1, 2, 2, 2, 3, 4, 4, 4), 139 | col = c(1, 2, 3, 1, 2, 3, 1, 1, 2, 3), 140 | data_type = "character", 141 | value = c("x1", "x2", "x3", "A2", "B2", "C2", "A3", "A4", "A5", "A6") 142 | ) 143 | 144 | expect_true(all.equal(chk2, out5)) 145 | expect_true(all.equal(chk2, out6)) 146 | expect_true(all.equal(chk2, out7)) 147 | }) 148 | 149 | test_that("escaped/quoted comments are ignored", { 150 | out1 <- melt_delim("x\n\\#", 151 | comment = "#", delim = ",", 152 | escape_backslash = TRUE, escape_double = FALSE 153 | ) 154 | out2 <- melt_csv('x\n"#"', comment = "#") 155 | 156 | expect_equal(out1$value[2], "#") 157 | expect_equal(out2$value[2], "#") 158 | }) 159 | 160 | test_that("leading comments are ignored", { 161 | out <- melt_csv("#a\n#b\nx\n1", comment = "#") 162 | 163 | expect_equal(nrow(out), 2) 164 | expect_equal(out$value[2], "1") 165 | }) 166 | 167 | test_that("skip respects comments", { 168 | melt_x <- function(...) 
{ 169 | melt_csv("#a\nb\nc", ...)$value 170 | } 171 | 172 | expect_equal(melt_x(), c("#a", "b", "c")) 173 | expect_equal(melt_x(skip = 1), c("b", "c")) 174 | expect_equal(melt_x(comment = "#"), c("b", "c")) 175 | expect_equal(melt_x(comment = "#", skip = 2), c("c")) 176 | }) 177 | 178 | test_that("melt_csv returns a four-col zero-row data.frame on an empty file", { 179 | expect_equal(dim(melt_csv("empty-file")), c(0, 4)) 180 | }) 181 | 182 | test_that("melt_delim errors on length 0 delimiter", { 183 | expect_error( 184 | melt_delim("a b\n1 2\n", delim = ""), 185 | "`delim` must be at least one character, use `melt_table\\(\\)` for whitespace delimited input\\." 186 | ) 187 | }) 188 | 189 | test_that("melt_csv handles whitespace between delimiters and quoted fields", { 190 | x <- melt_csv('1, \"hi,there\"\n3,4') 191 | expect_equal(x$value[2:3], c("hi,there", "3")) 192 | }) 193 | 194 | test_that("melt_csv works with raw inputs", { 195 | x <- melt_csv(as.raw(charToRaw("a,b\n1,2"))) 196 | expect_equal(x, 197 | tibble::tibble( 198 | row = c(1, 1, 2, 2), 199 | col = c(1, 2, 1, 2), 200 | data_type = c("character", "character", "integer", "integer"), 201 | value = c("a", "b", "1", "2") 202 | ) 203 | ) 204 | }) 205 | 206 | 207 | test_that("melt_csv works with dates and datetimes", { 208 | x <- melt_csv('a\n2020-01-01,2021-01-01 10:01:00') 209 | expect_equal(x, 210 | tibble::tibble( 211 | row = c(1, 2, 2), 212 | col = c(1, 1, 2), 213 | data_type = c("character", "date", "datetime"), 214 | value = c("a", "2020-01-01", "2021-01-01 10:01:00") 215 | ) 216 | ) 217 | }) 218 | -------------------------------------------------------------------------------- /tests/testthat/test-melt-fwf.R: -------------------------------------------------------------------------------- 1 | test_that("trailing spaces omitted", { 2 | withr::local_options(lifecycle_verbosity = "quiet") 3 | spec <- fwf_empty("fwf-trailing.txt") 4 | expect_equal(spec$begin, c(0, 4)) 5 | expect_equal(spec$end, c(3, NA)) 6 | 7 | df <- melt_fwf("fwf-trailing.txt", spec) 8 | expect_true(all(df$value == "123")) 9 | }) 10 | 11 | test_that("respects the trim_ws argument", { 12 | withr::local_options(lifecycle_verbosity = "quiet") 13 | x <- "a11 b22 c33\nd e f " 14 | out1 <- melt_fwf(x, fwf_empty(I(x)), trim_ws = FALSE) 15 | expect_equal(out1$value, c("a11", "b22", "c33", "d ", "e ", "f ")) 16 | 17 | out2 <- melt_fwf(x, fwf_empty(I(x)), trim_ws = TRUE) 18 | expect_equal(out2$value, c("a11", "b22", "c33", "d", "e", "f")) 19 | }) 20 | 21 | test_that("respects the trim_ws argument with empty fields", { 22 | withr::local_options(lifecycle_verbosity = "quiet") 23 | x <- "a11 b22 c33\nd f " 24 | out1 <- melt_fwf(x, fwf_empty(I(x)), trim_ws = FALSE) 25 | expect_equal(out1$value, c("a11", "b22", "c33", "d ", " ", "f ")) 26 | 27 | out2 <- melt_fwf(x, fwf_empty(I(x)), trim_ws = TRUE, na = "NA") 28 | expect_equal(out2$value, c("a11", "b22", "c33", "d", "", "f")) 29 | }) 30 | 31 | test_that("fwf_empty can skip comments", { 32 | withr::local_options(lifecycle_verbosity = "quiet") 33 | x <- "COMMENT\n1 2 3\n4 5 6" 34 | 35 | out1 <- melt_fwf(x, fwf_empty(I(x), comment = "COMMENT"), comment = "COMMENT") 36 | expect_equal(dim(out1), c(6, 4)) 37 | }) 38 | 39 | test_that("missing lines are not skipped", { 40 | withr::local_options(lifecycle_verbosity = "quiet") 41 | # first 42 | x <- "a b\n\n\n1 2" 43 | expect_equal(max(melt_fwf(x, fwf_empty(I(x)))$row), 4) 44 | 45 | # middle 46 | x <- "a b\n1 2\n\n\n2 3" 47 | expect_equal(max(melt_fwf(x, fwf_empty(I(x)))$row), 5) 48 |
49 | # last (trailing \n is ignored) 50 | x <- "a b\n1 2\n\n\n" 51 | expect_equal(max(melt_fwf(x, fwf_empty(I(x)))$row), 4) 52 | }) 53 | 54 | test_that("passing \"\" to melt_fwf's 'na' option", { 55 | withr::local_options(lifecycle_verbosity = "quiet") 56 | expect_equal( 57 | melt_fwf("foobar\nfoo ", fwf_widths(c(3, 3)), na = "")$value, 58 | c("foo", "bar", "foo", NA) 59 | ) 60 | }) 61 | 62 | test_that("ragged last column expanded with NA", { 63 | withr::local_options(lifecycle_verbosity = "quiet") 64 | x <- melt_fwf("1a\n2ab\n3abc", fwf_widths(c(1, NA))) 65 | expect_equal(x$value[c(2, 4, 6)], c("a", "ab", "abc")) 66 | expect_equal(n_problems(x), 0) 67 | }) 68 | 69 | test_that("ragged last column shrunk with warning", { 70 | withr::local_options(lifecycle_verbosity = "quiet") 71 | expect_warning(x <- melt_fwf("1a\n2ab\n3abc", fwf_widths(c(1, 3)))) 72 | expect_equal(x$value[c(2, 4, 6)], c("a", "ab", "abc")) 73 | expect_equal(n_problems(x), 2) 74 | }) 75 | 76 | test_that("melt all columns with positions, non ragged", { 77 | withr::local_options(lifecycle_verbosity = "quiet") 78 | col_pos <- fwf_positions(c(1, 3, 6), c(2, 5, 6)) 79 | x <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos) 80 | expect_equal(x$value[c(3, 6, 9)], c("A", "B", "C")) 81 | expect_equal(n_problems(x), 0) 82 | }) 83 | 84 | test_that("melt subset columns with positions", { 85 | withr::local_options(lifecycle_verbosity = "quiet") 86 | col_pos <- fwf_positions(c(1, 3), c(2, 5)) 87 | x <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos) 88 | expect_equal(x$value[c(1, 3, 5)], as.character(c(12, 67, 54))) 89 | expect_equal(x$value[c(2, 4, 6)], as.character(c(345, 890, 321))) 90 | expect_equal(n_problems(x), 0) 91 | }) 92 | 93 | test_that("melt columns with positions, ragged", { 94 | withr::local_options(lifecycle_verbosity = "quiet") 95 | col_pos <- fwf_positions(c(1, 3, 6), c(2, 5, NA)) 96 | x <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos) 97 | expect_equal(x$value[c(1, 4, 7)], as.character(c(12, 67, 54))) 98 | expect_equal(x$value[c(2, 5, 8)], as.character(c(345, 890, 321))) 99 | expect_equal(x$value[c(3, 6, 9)], c("A", "BBBBBBBBB", "C")) 100 | expect_equal(n_problems(x), 0) 101 | }) 102 | 103 | test_that("melt columns with width, ragged", { 104 | withr::local_options(lifecycle_verbosity = "quiet") 105 | col_pos <- fwf_widths(c(2, 3, NA)) 106 | x <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos) 107 | expect_equal(x$value[c(1, 4, 7)], as.character(c(12, 67, 54))) 108 | expect_equal(x$value[c(2, 5, 8)], as.character(c(345, 890, 321))) 109 | expect_equal(x$value[c(3, 6, 9)], c("A", "BBBBBBBBB", "C")) 110 | expect_equal(n_problems(x), 0) 111 | }) 112 | 113 | test_that("melt_fwf returns an empty data.frame on an empty file", { 114 | withr::local_options(lifecycle_verbosity = "quiet") 115 | empty_df <- tibble::tibble( 116 | row = double(), col = double(), 117 | data_type = character(), value = character() 118 | ) 119 | expect_true(all.equal(melt_fwf("empty-file"), empty_df)) 120 | }) 121 | 122 | test_that("check for line breaks in between widths", { 123 | withr::local_options(lifecycle_verbosity = "quiet") 124 | txt1 <- paste( 125 | "1 1", 126 | "2", 127 | "1 1 ", 128 | sep = "\n" 129 | ) 130 | expect_warning(out1 <- melt_fwf(txt1, fwf_empty(I(txt1)))) 131 | expect_equal(n_problems(out1), 1) 132 | 133 | txt2 <- paste( 134 | " 1 1", 135 | " 2", 136 | " 1 1 ", 137 | sep = "\n" 138 | ) 139 | expect_warning(out2 <- melt_fwf(txt2, fwf_empty(I(txt2)))) 
140 | expect_equal(n_problems(out2), 1) 141 | 142 | exp <- tibble::tibble( 143 | row = c(1, 1, 2, 3, 3), 144 | col = c(1, 2, 1, 1, 2), 145 | data_type = "integer", 146 | value = as.character(c(1, 1, 2, 1, 1)) 147 | ) 148 | expect_true(all.equal(out1, exp, check.attributes = FALSE)) 149 | expect_true(all.equal(out2, exp, check.attributes = FALSE)) 150 | }) 151 | 152 | test_that("ignore commented lines anywhere in file", { 153 | withr::local_options(lifecycle_verbosity = "quiet") 154 | col_pos <- fwf_positions(c(1, 3, 6), c(2, 5, 6)) 155 | x1 <- melt_fwf("COMMENT\n12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos, comment = "COMMENT") 156 | x2 <- melt_fwf("12345A\n67890BBBBBBBBB\nCOMMENT\n54321C", col_positions = col_pos, comment = "COMMENT") 157 | x3 <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C\nCOMMENT", col_positions = col_pos, comment = "COMMENT") 158 | x4 <- melt_fwf("COMMENT\n12345A\nCOMMENT\n67890BBBBBBBBB\n54321C\nCOMMENT", col_positions = col_pos, comment = "COMMENT") 159 | 160 | expect_identical(x1, x2) 161 | expect_identical(x1, x3) 162 | expect_identical(x1, x4) 163 | 164 | expect_equal(x1$value[c(3, 6, 9)], c("A", "B", "C")) 165 | expect_equal(n_problems(x1), 0) 166 | }) 167 | 168 | test_that("error on empty spec", { 169 | withr::local_options(lifecycle_verbosity = "quiet") 170 | txt <- "foo\n" 171 | pos <- fwf_positions(start = numeric(0), end = numeric(0)) 172 | expect_error(melt_fwf(txt, pos), "Zero-length.*specifications not supported") 173 | }) 174 | -------------------------------------------------------------------------------- /tests/testthat/test-melt-table.R: -------------------------------------------------------------------------------- 1 | # melt_table ------------------------------------------------------------------- 2 | 3 | test_that("melt_table silently reads ragged last column", { 4 | x <- melt_table("foo bar\n1 2\n3 4\n5 6\n") 5 | expect_equal(x$value[-1:-2], as.character(1:6)) 6 | }) 7 | 8 | test_that("melt_table skips all comment lines", { 9 | x <- melt_table("foo bar\n1 2\n3 4\n5 6\n") 10 | 11 | y <- melt_table("#comment1\n#comment2\nfoo bar\n1 2\n3 4\n5 6\n", comment = "#") 12 | 13 | expect_equal(x, y) 14 | }) 15 | 16 | test_that("missing lines are not skipped", { 17 | # first 18 | expect_equal(max(melt_table("a b\n\n\n12 34")$row), 4) 19 | 20 | # middle 21 | expect_equal(max(melt_table("a b\n12 34\n\n\n23 45")$row), 5) 22 | 23 | # last (trailing \n is ignored) 24 | expect_equal(max(melt_table("a b\n12 34\n\n\n")$row), 4) 25 | }) 26 | 27 | test_that("melt_table can read from a pipe", { 28 | x <- melt_table(pipe("echo a b c && echo 1 2 3 && echo 4 5 6")) 29 | expect_equal(x$value[-1:-3], as.character(1:6)) 30 | }) 31 | 32 | test_that("melt_table can read a truncated file without crashing", { 33 | expect_warning(expect_error(melt_table("table-crash"), NA)) 34 | }) 35 | 36 | test_that("melt_table returns an empty data.frame on an empty file", { 37 | empty_df <- tibble::tibble( 38 | row = double(), col = double(), 39 | data_type = character(), value = character() 40 | ) 41 | expect_true(all.equal(melt_table("empty-file"), empty_df)) 42 | }) 43 | 44 | # melt_table2 ------------------------------------------------------------------- 45 | 46 | test_that("melt_table2 silently reads ragged columns", { 47 | x <- melt_table2("foo bar\n1 2\n3 4\n5 6\n") 48 | expect_equal(x$value[-1:-2], as.character(1:6)) 49 | }) 50 | 51 | test_that("melt_table2 skips all comment lines", { 52 | x <- melt_table2("foo bar\n1 2\n3 4\n5 6\n") 53 | 54 | y <- 
melt_table2("#comment1\n#comment2\nfoo bar\n1 2\n3 4\n5 6\n", comment = "#") 55 | 56 | expect_equal(x, y) 57 | }) 58 | 59 | test_that("melt_table2 can read from a pipe", { 60 | x <- melt_table2(pipe("echo a b c&& echo 1 2 3&& echo 4 5 6")) 61 | expect_equal(x$value[-1:-3], as.character(1:6)) 62 | }) 63 | 64 | test_that("melt_table2 does not duplicate header rows for leading whitespace", { 65 | x <- melt_table2("foo bar\n1 2\n") 66 | expect_equal(nrow(x), 4L) 67 | expect_equal(x$value[-1:-2], as.character(1:2)) 68 | }) 69 | 70 | test_that("melt_table2 ignores blank lines at the end of a file", { 71 | expect_warning(x <- melt_table2("x y\n1 2\n\n"), NA) 72 | expect_equal(nrow(x), 5L) 73 | expect_equal(x$value[3:4], as.character(1:2)) 74 | }) 75 | 76 | test_that("melt_table2 returns an empty data.frame on an empty file", { 77 | empty_df <- tibble::tibble( 78 | row = double(), col = double(), 79 | data_type = character(), value = character() 80 | ) 81 | expect_true(all.equal(melt_table2("empty-file"), empty_df)) 82 | }) 83 | --------------------------------------------------------------------------------