├── .Rbuildignore
├── .covrignore
├── .github
├── .gitignore
└── workflows
│ ├── R-CMD-check.yaml
│ ├── pkgdown.yaml
│ ├── pr-commands.yaml
│ └── test-coverage.yaml
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R
├── callback.R
├── cpp11.R
├── date-symbols.R
├── example.R
├── locale.R
├── melt_delim.R
├── melt_delim_chunked.R
├── melt_fwf.R
├── melt_table.R
├── meltr-package.R
├── problems.R
├── source.R
├── sysdata.rda
├── tokenizer.R
└── utils.R
├── README.Rmd
├── README.md
├── codecov.yml
├── cran-comments.md
├── data-raw
└── date-symbols.R
├── inst
└── extdata
│ ├── epa78.txt
│ ├── fwf-sample.txt
│ ├── massey-rating.txt
│ └── mtcars.csv
├── man
├── Tokenizers.Rd
├── callback.Rd
├── clipboard.Rd
├── datasource.Rd
├── date_names.Rd
├── locale.Rd
├── melt_delim.Rd
├── melt_delim_chunked.Rd
├── melt_fwf.Rd
├── melt_table.Rd
├── meltr_example.Rd
├── problems.Rd
└── show_progress.Rd
├── src
├── .gitignore
├── Collector.cpp
├── Collector.h
├── CollectorGuess.cpp
├── DateTimeParser.h
├── Iconv.cpp
├── Iconv.h
├── LocaleInfo.cpp
├── LocaleInfo.h
├── Progress.h
├── QiParsers.h
├── Reader.cpp
├── Reader.h
├── Source.cpp
├── Source.h
├── SourceFile.h
├── SourceRaw.h
├── SourceString.h
├── Token.h
├── Tokenizer.cpp
├── Tokenizer.h
├── TokenizerDelim.cpp
├── TokenizerDelim.h
├── TokenizerFwf.cpp
├── TokenizerFwf.h
├── TokenizerWs.cpp
├── TokenizerWs.h
├── Warnings.h
├── connection.cpp
├── cpp11.cpp
├── mio.h
├── read.cpp
├── unicode_fopen.h
└── utils.h
└── tests
├── testthat.R
└── testthat
├── basic-df-singlequote.csv
├── basic-df.csv
├── empty-file
├── enc-iso-8859-1.txt
├── fwf-trailing.txt
├── helper.R
├── non-tabular.csv
├── raw.csv
├── table-crash
├── test-melt-chunked.R
├── test-melt-csv.R
├── test-melt-fwf.R
└── test-melt-table.R
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^LICENSE\.md$
2 | ^data-raw$
3 | ^codecov\.yml$
4 | ^\.github$
5 | ^README\.Rmd$
6 | ^\.covrignore$
7 | ^cran-comments\.md$
8 | ^CRAN-SUBMISSION$
9 |
--------------------------------------------------------------------------------
/.covrignore:
--------------------------------------------------------------------------------
1 | src/mio.h
2 |
--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | #
4 | # NOTE: This workflow is overkill for most R packages and
5 | # check-standard.yaml is likely a better choice.
6 | # usethis::use_github_action("check-standard") will install it.
7 | on:
8 | push:
9 | branches: [main, master]
10 | pull_request:
11 | branches: [main, master]
12 |
13 | name: R-CMD-check
14 |
15 | jobs:
16 | R-CMD-check:
17 | runs-on: ${{ matrix.config.os }}
18 |
19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }})
20 |
21 | strategy:
22 | fail-fast: false
23 | matrix:
24 | config:
25 | - {os: macOS-latest, r: 'release'}
26 |
27 | - {os: windows-latest, r: 'release'}
28 | # Use 3.6 to trigger usage of RTools35
29 | - {os: windows-latest, r: '3.6'}
30 |
31 | # Use older ubuntu to maximise backward compatibility
32 | - {os: ubuntu-18.04, r: 'devel', http-user-agent: 'release'}
33 | - {os: ubuntu-18.04, r: 'release'}
34 | - {os: ubuntu-18.04, r: 'oldrel-1'}
35 | - {os: ubuntu-18.04, r: 'oldrel-2'}
36 | - {os: ubuntu-18.04, r: 'oldrel-3'}
37 | - {os: ubuntu-18.04, r: 'oldrel-4'}
38 |
39 | env:
40 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
41 | R_KEEP_PKG_SOURCE: yes
42 |
43 | steps:
44 | - uses: actions/checkout@v3
45 |
46 | - uses: r-lib/actions/setup-pandoc@v2
47 |
48 | - uses: r-lib/actions/setup-r@v2
49 | with:
50 | r-version: ${{ matrix.config.r }}
51 | http-user-agent: ${{ matrix.config.http-user-agent }}
52 | use-public-rspm: true
53 |
54 | - uses: r-lib/actions/setup-r-dependencies@v2
55 | with:
56 | extra-packages: rcmdcheck
57 |
58 | - uses: r-lib/actions/check-r-package@v2
59 |
60 | - name: Show testthat output
61 | if: always()
62 | run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true
63 | shell: bash
64 |
65 | - name: Upload check results
66 | if: failure()
67 | uses: actions/upload-artifact@main
68 | with:
69 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results
70 | path: check
71 |
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | tags: ['*']
7 |
8 | name: pkgdown
9 |
10 | jobs:
11 | pkgdown:
12 | runs-on: ubuntu-latest
13 | env:
14 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
15 | steps:
16 | - uses: actions/checkout@v3
17 |
18 | - uses: r-lib/actions/setup-pandoc@v1
19 |
20 | - uses: r-lib/actions/setup-r@v2
21 | with:
22 | use-public-rspm: true
23 |
24 | - uses: r-lib/actions/setup-r-dependencies@v1
25 | with:
26 | extra-packages: pkgdown
27 | needs: website
28 |
29 | - name: Deploy package
30 | run: |
31 | git config --local user.name "$GITHUB_ACTOR"
32 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
33 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)'
34 |
--------------------------------------------------------------------------------
/.github/workflows/pr-commands.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | issue_comment:
5 | types: [created]
6 |
7 | name: Commands
8 |
9 | jobs:
10 | document:
11 | if: startsWith(github.event.comment.body, '/document')
12 | name: document
13 | runs-on: ubuntu-latest
14 | env:
15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 | steps:
17 | - uses: actions/checkout@v3
18 |
19 | - uses: r-lib/actions/pr-fetch@v2
20 | with:
21 | repo-token: ${{ secrets.GITHUB_TOKEN }}
22 |
23 | - uses: r-lib/actions/setup-r@v2
24 | with:
25 | use-public-rspm: true
26 |
27 | - uses: r-lib/actions/setup-r-dependencies@v2
28 | with:
29 | extra-packages: roxygen2
30 |
31 | - name: Document
32 | run: Rscript -e 'roxygen2::roxygenise()'
33 |
34 | - name: commit
35 | run: |
36 | git config --local user.name "$GITHUB_ACTOR"
37 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
38 | git add man/\* NAMESPACE
39 | git commit -m 'Document'
40 |
41 | - uses: r-lib/actions/pr-push@v2
42 | with:
43 | repo-token: ${{ secrets.GITHUB_TOKEN }}
44 |
45 | style:
46 | if: startsWith(github.event.comment.body, '/style')
47 | name: style
48 | runs-on: ubuntu-latest
49 | env:
50 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
51 | steps:
52 | - uses: actions/checkout@v3
53 |
54 | - uses: r-lib/actions/pr-fetch@v2
55 | with:
56 | repo-token: ${{ secrets.GITHUB_TOKEN }}
57 |
58 | - uses: r-lib/actions/setup-r@v2
59 |
60 | - name: Install dependencies
61 | run: Rscript -e 'install.packages("styler")'
62 |
63 | - name: Style
64 | run: Rscript -e 'styler::style_pkg()'
65 |
66 | - name: commit
67 | run: |
68 | git config --local user.name "$GITHUB_ACTOR"
69 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
70 | git add \*.R
71 | git commit -m 'Style'
72 |
73 | - uses: r-lib/actions/pr-push@v2
74 | with:
75 | repo-token: ${{ secrets.GITHUB_TOKEN }}
76 |
--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 | branches: [main, master]
8 |
9 | name: test-coverage
10 |
11 | jobs:
12 | test-coverage:
13 | runs-on: ubuntu-latest
14 | env:
15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 |
17 | steps:
18 | - uses: actions/checkout@v3
19 |
20 | - uses: r-lib/actions/setup-r@v2
21 | with:
22 | use-public-rspm: true
23 |
24 | - uses: r-lib/actions/setup-r-dependencies@v2
25 | with:
26 | extra-packages: covr
27 |
28 | - name: Test coverage
29 | run: covr::codecov()
30 | shell: Rscript {0}
31 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: meltr
2 | Title: Read Non-Rectangular Text Data
3 | Version: 1.0.2
4 | Authors@R:
5 | c(person(given = "Hadley",
6 | family = "Wickham",
7 | role = "aut",
8 | email = "hadley@rstudio.com"),
9 | person(given = "Duncan",
10 | family = "Garmonsway",
11 | role = c("aut", "cre"),
12 | email = "nacnudus@gmail.com",
13 | comment = "@nacnudus"),
14 | person(given = "Jim",
15 | family = "Hester",
16 | role = "aut",
17 | email = "jim.hester@rstudio.com",
18 | comment = c(ORCID = "0000-0002-2739-7082")),
19 | person(given = "RStudio",
20 | role = c("cph", "fnd")),
21 | person(given = "https://github.com/mandreyel/",
22 | role = "cph",
23 | comment = "mio library"))
24 | Description: The goal of 'meltr' is to provide a fast and friendly way to
25 | read non-rectangular data, such as ragged forms of csv (comma-separated
26 | values), tsv (tab-separated values), and fwf (fixed-width format) files.
27 | License: MIT + file LICENSE
28 | URL: https://r-lib.github.io/meltr/,
29 | https://github.com/r-lib/meltr
30 | BugReports: https://github.com/r-lib/meltr/issues
31 | Depends:
32 | R (>= 2.10)
33 | Imports:
34 | cli,
35 | methods,
36 | R6,
37 | rlang,
38 | tibble
39 | Suggests:
40 | clipr,
41 | covr,
42 | crayon,
43 | curl,
44 | readr,
45 | testthat (>= 3.0.0),
46 | withr
47 | LinkingTo:
48 | cpp11
49 | Config/testthat/edition: 3
50 | Config/Needs/website: dplyr
51 | Encoding: UTF-8
52 | LazyData: true
53 | Roxygen: list(markdown = TRUE)
54 | RoxygenNote: 7.2.1
55 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2021
2 | COPYRIGHT HOLDER: meltr authors
3 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | Copyright (c) 2021 meltr authors
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method("[",meltr_spec_tbl_df)
4 | S3method(as.data.frame,meltr_spec_tbl_df)
5 | S3method(as_tibble,meltr_spec_tbl_df)
6 | S3method(print,date_names)
7 | S3method(print,locale)
8 | export(AccumulateCallback)
9 | export(ChunkCallback)
10 | export(DataFrameCallback)
11 | export(ListCallback)
12 | export(SideEffectChunkCallback)
13 | export(clipboard)
14 | export(datasource)
15 | export(date_names)
16 | export(date_names_lang)
17 | export(date_names_langs)
18 | export(default_locale)
19 | export(fwf_cols)
20 | export(fwf_empty)
21 | export(fwf_positions)
22 | export(fwf_widths)
23 | export(locale)
24 | export(melt_csv)
25 | export(melt_csv2)
26 | export(melt_csv2_chunked)
27 | export(melt_csv_chunked)
28 | export(melt_delim)
29 | export(melt_delim_chunked)
30 | export(melt_fwf)
31 | export(melt_table)
32 | export(melt_table2)
33 | export(melt_tsv)
34 | export(melt_tsv_chunked)
35 | export(meltr_example)
36 | export(problems)
37 | export(show_progress)
38 | export(stop_for_problems)
39 | export(tokenizer_csv)
40 | export(tokenizer_delim)
41 | export(tokenizer_fwf)
42 | export(tokenizer_line)
43 | export(tokenizer_log)
44 | export(tokenizer_tsv)
45 | export(tokenizer_ws)
46 | importFrom(methods,setOldClass)
47 | importFrom(tibble,as_tibble)
48 | importFrom(tibble,tibble)
49 | useDynLib(meltr, .registration = TRUE)
50 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # meltr 1.0.2
2 |
3 | * Fix CRAN warnings
4 |
5 | # meltr 1.0.1
6 |
7 | * Fix buffer overflow when trying to parse a field that is over 64 characters long (#10)
8 |
9 | # meltr 1.0.0
10 |
11 | * Added a `NEWS.md` file to track changes to the package.
12 |
--------------------------------------------------------------------------------
/R/callback.R:
--------------------------------------------------------------------------------
1 | as_chunk_callback <- function(x) UseMethod("as_chunk_callback")
2 | as_chunk_callback.function <- function(x) {
3 | SideEffectChunkCallback$new(x)
4 | }
5 | as_chunk_callback.R6ClassGenerator <- function(x) {
6 | as_chunk_callback(x$new())
7 | }
8 | as_chunk_callback.ChunkCallback <- function(x) {
9 | x
10 | }
11 |
12 | #' Callback classes
13 | #'
14 | #' These classes are used to define callback behaviors.
15 | #'
16 | #' \describe{
17 | #' \item{ChunkCallback}{Callback interface definition, all callback functions should inherit from this class.}
18 | #' \item{SideEffectChunkCallback}{Callback function that is used only for side effects, no results are returned.}
19 | #' \item{DataFrameCallback}{Callback function that combines each result together at the end.}
20 | #' \item{AccumulateCallback}{
21 | #' Callback function that accumulates a single result. Requires the parameter `acc` to specify
22 | #' the initial value of the accumulator. The parameter `acc` is `NULL` by default.
23 | #' }
24 | #' }
25 | #' @usage NULL
26 | #' @format NULL
27 | #' @name callback
28 | #' @keywords internal
29 | #' @family chunked
30 | #' @export
31 | ChunkCallback <- R6::R6Class("ChunkCallback",
32 | private = list(
33 | callback = NULL
34 | ),
35 | public = list(
36 | initialize = function(callback) NULL,
37 | receive = function(data, index) NULL,
38 | continue = function() TRUE,
39 | result = function() NULL,
40 | finally = function() NULL
41 | )
42 | )
43 |
44 | #' @usage NULL
45 | #' @format NULL
46 | #' @rdname callback
47 | #' @export
48 | SideEffectChunkCallback <- R6::R6Class("SideEffectChunkCallback",
49 | inherit = ChunkCallback,
50 | private = list(
51 | cancel = FALSE
52 | ),
53 | public = list(
54 | initialize = function(callback) {
55 | check_callback_fun(callback)
56 | private$callback <- callback
57 | },
58 | receive = function(data, index) {
59 | result <- private$callback(data, index)
60 | private$cancel <- identical(result, FALSE)
61 | },
62 | continue = function() {
63 | !private$cancel
64 | }
65 | )
66 | )
67 |
68 | #' @usage NULL
69 | #' @format NULL
70 | #' @rdname callback
71 | #' @export
72 | DataFrameCallback <- R6::R6Class("DataFrameCallback",
73 | inherit = ChunkCallback,
74 | private = list(
75 | results = list()
76 | ),
77 | public = list(
78 | initialize = function(callback) {
79 | private$callback <- callback
80 | },
81 | receive = function(data, index) {
82 | result <- private$callback(data, index)
83 | private$results <- c(private$results, list(result))
84 | },
85 | result = function() {
86 | do.call(`rbind`, private$results)
87 | },
88 | finally = function() {
89 | private$results <- list()
90 | }
91 | )
92 | )
93 |
94 | #' @usage NULL
95 | #' @format NULL
96 | #' @rdname callback
97 | #' @export
98 | ListCallback <- R6::R6Class("ListCallback",
99 | inherit = ChunkCallback,
100 | private = list(
101 | results = list()
102 | ),
103 | public = list(
104 | initialize = function(callback) {
105 | private$callback <- callback
106 | },
107 | receive = function(data, index) {
108 | result <- private$callback(data, index)
109 | private$results <- c(private$results, list(result))
110 | },
111 | result = function() {
112 | private$results
113 | },
114 | finally = function() {
115 | private$results <- list()
116 | }
117 | )
118 | )
119 |
120 | #' @usage NULL
121 | #' @format NULL
122 | #' @rdname callback
123 | #' @export
124 | AccumulateCallback <- R6::R6Class("AccumulateCallback",
125 | inherit = ChunkCallback,
126 | private = list(
127 | acc = NULL
128 | ),
129 | public = list(
130 | initialize = function(callback, acc = NULL) {
131 | check_callback_fun(callback,
132 | req_args = 3,
133 | message = "`callback` must have three or more arguments"
134 | )
135 | private$acc <- acc
136 | private$callback <- callback
137 | },
138 | receive = function(data, index) {
139 | private$acc <- private$callback(data, index, private$acc)
140 | },
141 | result = function() {
142 | private$acc
143 | }
144 | )
145 | )
146 |
147 | check_callback_fun <- function(callback, req_args = 2, message = NULL) {
148 | if (is.null(message)) {
149 | message <- "`callback` must have two or more arguments"
150 | }
151 | n_args <- length(formals(callback))
152 | if (n_args < req_args) {
153 | stop(message, call. = FALSE)
154 | }
155 | }
156 |
--------------------------------------------------------------------------------
/R/cpp11.R:
--------------------------------------------------------------------------------
1 | # Generated by cpp11: do not edit by hand
2 |
3 | collectorGuess <- function(input, locale_, guessInteger) {
4 | .Call(`_meltr_collectorGuess`, input, locale_, guessInteger)
5 | }
6 |
7 | read_connection_ <- function(con, filename, chunk_size) {
8 | .Call(`_meltr_read_connection_`, con, filename, chunk_size)
9 | }
10 |
11 | read_file_ <- function(sourceSpec, locale_) {
12 | .Call(`_meltr_read_file_`, sourceSpec, locale_)
13 | }
14 |
15 | read_file_raw_ <- function(sourceSpec) {
16 | .Call(`_meltr_read_file_raw_`, sourceSpec)
17 | }
18 |
19 | melt_tokens_ <- function(sourceSpec, tokenizerSpec, colSpecs, locale_, n_max, progress) {
20 | .Call(`_meltr_melt_tokens_`, sourceSpec, tokenizerSpec, colSpecs, locale_, n_max, progress)
21 | }
22 |
23 | melt_tokens_chunked_ <- function(sourceSpec, callback, chunkSize, tokenizerSpec, colSpecs, locale_, progress) {
24 | invisible(.Call(`_meltr_melt_tokens_chunked_`, sourceSpec, callback, chunkSize, tokenizerSpec, colSpecs, locale_, progress))
25 | }
26 |
27 | whitespaceColumns <- function(sourceSpec, n, comment) {
28 | .Call(`_meltr_whitespaceColumns`, sourceSpec, n, comment)
29 | }
30 |
--------------------------------------------------------------------------------
/R/date-symbols.R:
--------------------------------------------------------------------------------
1 | #' Create or retrieve date names
2 | #'
3 | #' When parsing dates, you often need to know how days of the week and
4 | #' months are represented as text. This pair of functions allows you to either
5 | #' create your own, or retrieve from a standard list. The standard list is
6 | #' derived from ICU (<https://unicode-org.github.io/icu/>) via the stringi package.
7 | #'
8 | #' @param mon,mon_ab Full and abbreviated month names.
9 | #' @param day,day_ab Full and abbreviated week day names. Starts with Sunday.
10 | #' @param am_pm Names used for AM and PM.
11 | #' @return A date names object
12 | #' @export
13 | #' @examples
14 | #' date_names(mon = LETTERS[1:12], day = letters[1:7])
15 | #' date_names_lang("en")
16 | #' date_names_lang("ko")
17 | #' date_names_lang("fr")
18 | date_names <- function(mon, mon_ab = mon, day, day_ab = day,
19 | am_pm = c("AM", "PM")) {
20 | stopifnot(is.character(mon), length(mon) == 12)
21 | stopifnot(is.character(mon_ab), length(mon_ab) == 12)
22 | stopifnot(is.character(day), length(day) == 7)
23 | stopifnot(is.character(day_ab), length(day_ab) == 7)
24 |
25 | structure(
26 | list(
27 | mon = enc2utf8(mon),
28 | mon_ab = enc2utf8(mon_ab),
29 | day = enc2utf8(day),
30 | day_ab = enc2utf8(day_ab),
31 | am_pm = enc2utf8(am_pm)
32 | ),
33 | class = "date_names"
34 | )
35 | }
36 |
37 | #' @export
38 | #' @rdname date_names
39 | #' @param language A BCP 47 locale, made up of a language and a region,
40 | #' e.g. `"en_US"` for American English. See `date_names_langs()`
41 | #' for a complete list of available locales.
42 | date_names_lang <- function(language) {
43 | stopifnot(is.character(language), length(language) == 1)
44 |
45 | symbols <- date_symbols[[language]]
46 | if (is.null(symbols)) {
47 | stop("Unknown language '", language, "'", call. = FALSE)
48 | }
49 |
50 | symbols
51 | }
52 |
53 | #' @export
54 | #' @rdname date_names
55 | date_names_langs <- function() {
56 | names(date_symbols)
57 | }
58 |
59 | #' @export
60 | print.date_names <- function(x, ...) {
61 | cat("\n")
62 |
63 | if (identical(x$day, x$day_ab)) {
64 | day <- paste0(x$day, collapse = ", ")
65 | } else {
66 | day <- paste0(x$day, " (", x$day_ab, ")", collapse = ", ")
67 | }
68 |
69 | if (identical(x$mon, x$mon_ab)) {
70 | mon <- paste0(x$mon, collapse = ", ")
71 | } else {
72 | mon <- paste0(x$mon, " (", x$mon_ab, ")", collapse = ", ")
73 | }
74 | am_pm <- paste0(x$am_pm, collapse = "/")
75 |
76 | cat_wrap("Days: ", day)
77 | cat_wrap("Months: ", mon)
78 | cat_wrap("AM/PM: ", am_pm)
79 | }
80 |
81 | is.date_names <- function(x) inherits(x, "date_names")
82 |
83 | cat_wrap <- function(header, body) {
84 | body <- strwrap(body, exdent = nchar(header))
85 | cat(header, paste(body, collapse = "\n"), "\n", sep = "")
86 | }
87 |
--------------------------------------------------------------------------------
/R/example.R:
--------------------------------------------------------------------------------
1 | #' Get path to meltr example
2 | #'
3 | #' meltr comes bundled with a number of sample files in its `inst/extdata`
4 | #' directory. This function makes them easy to access
5 | #'
6 | #' @param file Name of file. If `NULL`, the example files will be listed.
7 | #' @return A file path or a vector of file names
8 | #' @export
9 | #' @examples
10 | #' meltr_example()
11 | #' meltr_example("mtcars.csv")
12 | meltr_example <- function(file = NULL) {
13 | if (is.null(file)) {
14 | dir(system.file("extdata", package = "meltr"))
15 | } else {
16 | system.file("extdata", file, package = "meltr", mustWork = TRUE)
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/R/locale.R:
--------------------------------------------------------------------------------
1 | #' Create locales
2 | #'
3 | #' A locale object tries to capture all the defaults that can vary between
4 | #' countries. You set the locale once, and the details are automatically
5 | #' passed down to the column parsers.
6 | #' match R (i.e. US English) as closely as possible. See
7 | #' `vignette("locales")` for more details.
8 | #'
9 | #' @param date_names Character representations of day and month names. Either
10 | #' the language code as string (passed on to [date_names_lang()])
11 | #' or an object created by [date_names()].
12 | #' @param date_format,time_format Default date and time formats.
13 | #' @param decimal_mark,grouping_mark Symbols used to indicate the decimal
14 | #' place, and to chunk larger numbers. Decimal mark can only be `,` or
15 | #' `.`.
16 | #' @param tz Default tz. This is used both for input (if the time zone isn't
17 | #' present in individual strings), and for output (to control the default
18 | #' display). The default is to use "UTC", a time zone that does not use
19 | #' daylight savings time (DST) and hence is typically most useful for data.
20 | #' The absence of time zones makes it approximately 50x faster to generate
21 | #' UTC times than any other time zone.
22 | #'
23 | #' Use `""` to use the system default time zone, but beware that this
24 | #' will not be reproducible across systems.
25 | #'
26 | #' For a complete list of possible time zones, see [OlsonNames()].
27 | #' Americans, note that "EST" is a Canadian time zone that does not have
28 | #' DST. It is *not* Eastern Standard Time. It's better to use
29 | #' "US/Eastern", "US/Central" etc.
30 | #' @param encoding Default encoding. This only affects how the file is
31 | #' read - meltr always converts the output to UTF-8.
32 | #' @return A locale object
33 | #' @export
34 | #' @examples
35 | #' locale()
36 | #' locale("fr")
37 | #'
38 | #' # South American locale
39 | #' locale("es", decimal_mark = ",")
40 | locale <- function(date_names = "en",
41 | date_format = "%AD", time_format = "%AT",
42 | decimal_mark = ".", grouping_mark = ",",
43 | tz = "UTC", encoding = "UTF-8") {
44 | if (is.character(date_names)) {
45 | date_names <- date_names_lang(date_names)
46 | }
47 | stopifnot(is.date_names(date_names))
48 |
49 | if (missing(grouping_mark) && !missing(decimal_mark)) {
50 | grouping_mark <- if (decimal_mark == ".") "," else "."
51 | } else if (missing(decimal_mark) && !missing(grouping_mark)) {
52 | decimal_mark <- if (grouping_mark == ".") "," else "."
53 | }
54 |
55 | stopifnot(decimal_mark %in% c(".", ","))
56 | stopifnot(is.character(grouping_mark), length(grouping_mark) == 1)
57 | if (decimal_mark == grouping_mark) {
58 | stop("`decimal_mark` and `grouping_mark` must be different", call. = FALSE)
59 | }
60 |
61 | tz <- check_tz(tz)
62 | check_encoding(encoding)
63 |
64 | structure(
65 | list(
66 | date_names = date_names,
67 | date_format = date_format,
68 | time_format = time_format,
69 | decimal_mark = decimal_mark,
70 | grouping_mark = grouping_mark,
71 | tz = tz,
72 | encoding = encoding
73 | ),
74 | class = "locale"
75 | )
76 | }
77 |
78 | is.locale <- function(x) inherits(x, "locale")
79 |
80 | #' @export
81 | print.locale <- function(x, ...) {
82 | cat("\n")
83 | cat("Numbers: ", prettyNum(123456.78,
84 | big.mark = x$grouping_mark,
85 | decimal.mark = x$decimal_mark, digits = 8
86 | ), "\n", sep = "")
87 | cat("Formats: ", x$date_format, " / ", x$time_format, "\n", sep = "")
88 | cat("Timezone: ", x$tz, "\n", sep = "")
89 | cat("Encoding: ", x$encoding, "\n", sep = "")
90 | print(x$date_names)
91 | }
92 |
93 | #' @export
94 | #' @rdname locale
95 | default_locale <- function() {
96 | loc <- getOption("readr.default_locale")
97 | if (is.null(loc)) {
98 | loc <- locale()
99 | options("readr.default_locale" = loc)
100 | }
101 |
102 | loc
103 | }
104 |
105 | check_tz <- function(x) {
106 | stopifnot(is.character(x), length(x) == 1)
107 |
108 | if (identical(x, "")) {
109 | x <- Sys.timezone()
110 |
111 | if (identical(x, "") || identical(x, NA_character_)) {
112 | x <- "UTC"
113 | }
114 | }
115 |
116 | x
117 | }
118 |
119 | check_encoding <- function(x) {
120 | stopifnot(is.character(x), length(x) == 1)
121 |
122 | if (tolower(x) %in% tolower(iconvlist())) {
123 | return(TRUE)
124 | }
125 |
126 | stop("Unknown encoding ", x, call. = FALSE)
127 | }
128 |
--------------------------------------------------------------------------------
/R/melt_delim.R:
--------------------------------------------------------------------------------
1 | #' Return melted data for each token in a delimited file (including csv & tsv)
2 | #'
3 | #' For certain non-rectangular data formats, it can be useful to parse the data
4 | #' into a melted format where each row represents a single token.
5 | #'
6 | #' `melt_csv()` and `melt_tsv()` are special cases of the general
7 | #' `melt_delim()`. They're useful for reading the most common types of
8 | #' flat file data, comma separated values and tab separated values,
9 | #' respectively. `melt_csv2()` uses `;` for the field separator and `,` for the
10 | #' decimal point. This is common in some European countries.
11 | #' @inheritParams readr::read_delim
12 | #' @return A [tibble()] of four columns:
13 | #' * `row`, the row that the token comes from in the original file
14 | #' * `col`, the column that the token comes from in the original file
15 | #' * `data_type`, the data type of the token, e.g. `"integer"`, `"character"`,
16 | #' `"date"`, guessed in a similar way to the `guess_parser()` function.
17 | #' * `value`, the token itself as a character string, unchanged from its
18 | #' representation in the original file.
19 | #'
20 | #' If there are parsing problems, a warning tells you
21 | #' how many, and you can retrieve the details with [problems()].
22 | #' @seealso [readr::read_delim()] for the conventional way to read rectangular data
23 | #' from delimited files.
24 | #' @export
25 | #' @examples
26 | #' # Input sources -------------------------------------------------------------
27 | #' # Read from a path
28 | #' melt_csv(meltr_example("mtcars.csv"))
29 | #' \dontrun{
30 | #' melt_csv("https://github.com/tidyverse/readr/raw/master/inst/extdata/mtcars.csv")
31 | #' }
32 | #'
33 | #' # Or directly from a string (must contain a newline)
34 | #' melt_csv("x,y\n1,2\n3,4")
35 | #'
36 | #' # To import empty cells as 'empty' rather than `NA`
37 | #' melt_csv("x,y\n,NA,\"\",''", na = "NA")
38 | #'
39 | #' # File types ----------------------------------------------------------------
40 | #' melt_csv("a,b\n1.0,2.0")
41 | #' melt_csv2("a;b\n1,0;2,0")
42 | #' melt_tsv("a\tb\n1.0\t2.0")
43 | #' melt_delim("a|b\n1.0|2.0", delim = "|")
44 | #' @export
45 | melt_delim <- function(file, delim, quote = '"',
46 | escape_backslash = FALSE, escape_double = TRUE,
47 | locale = default_locale(),
48 | na = c("", "NA"), quoted_na = TRUE,
49 | comment = "", trim_ws = FALSE,
50 | skip = 0, n_max = Inf,
51 | progress = show_progress(),
52 | skip_empty_rows = FALSE) {
53 | if (!nzchar(delim)) {
54 | stop("`delim` must be at least one character, ",
55 | "use `melt_table()` for whitespace delimited input.", call. = FALSE)
56 | }
57 | tokenizer <- tokenizer_delim(delim, quote = quote,
58 | escape_backslash = escape_backslash, escape_double = escape_double,
59 | na = na, quoted_na = quoted_na, comment = comment, trim_ws = trim_ws,
60 | skip_empty_rows = skip_empty_rows)
61 | melt_delimited(file, tokenizer, locale = locale, skip = skip,
62 | skip_empty_rows = skip_empty_rows, comment = comment,
63 | n_max = n_max, progress = progress)
64 | }
65 |
66 | #' @rdname melt_delim
67 | #' @export
melt_csv <- function(file, locale = default_locale(), na = c("", "NA"),
                     quoted_na = TRUE, quote = "\"", comment = "",
                     trim_ws = TRUE, skip = 0, n_max = Inf,
                     progress = show_progress(),
                     skip_empty_rows = FALSE) {
  # Comma-separated variant: delegate to the generic delimited melter with
  # a CSV tokenizer.
  csv_tokenizer <- tokenizer_csv(
    na = na,
    quoted_na = quoted_na,
    quote = quote,
    comment = comment,
    trim_ws = trim_ws,
    skip_empty_rows = skip_empty_rows
  )

  melt_delimited(
    file, csv_tokenizer,
    locale = locale,
    skip = skip,
    skip_empty_rows = skip_empty_rows,
    comment = comment,
    n_max = n_max,
    progress = progress
  )
}
79 |
80 | #' @rdname melt_delim
81 | #' @export
melt_csv2 <- function(file, locale = default_locale(), na = c("", "NA"),
                      quoted_na = TRUE, quote = "\"", comment = "",
                      trim_ws = TRUE, skip = 0, n_max = Inf,
                      progress = show_progress(),
                      skip_empty_rows = FALSE) {
  # European-style CSV: ";" separates fields, which frees "," to serve as
  # the decimal mark. If the caller kept the default "." mark, swap the
  # marks and tell them about it.
  if (locale$decimal_mark == ".") {
    cli::cli_alert_info("Using {.val ','} as decimal and {.val '.'} as grouping mark. Use {.fn melt_delim} for more control.")
    locale$decimal_mark <- ","
    locale$grouping_mark <- "."
  }

  semicolon_tokenizer <- tokenizer_delim(
    delim = ";",
    na = na,
    quoted_na = quoted_na,
    quote = quote,
    comment = comment,
    trim_ws = trim_ws,
    skip_empty_rows = skip_empty_rows
  )

  melt_delimited(
    file, semicolon_tokenizer,
    locale = locale,
    skip = skip,
    skip_empty_rows = skip_empty_rows,
    comment = comment,
    n_max = n_max,
    progress = progress
  )
}
100 |
101 |
102 | #' @rdname melt_delim
103 | #' @export
melt_tsv <- function(file, locale = default_locale(), na = c("", "NA"),
                     quoted_na = TRUE, quote = "\"", comment = "",
                     trim_ws = TRUE, skip = 0, n_max = Inf,
                     progress = show_progress(),
                     skip_empty_rows = FALSE) {
  # Tab-separated variant: delegate to the generic delimited melter with a
  # TSV tokenizer.
  tsv_tokenizer <- tokenizer_tsv(
    na = na,
    quoted_na = quoted_na,
    quote = quote,
    comment = comment,
    trim_ws = trim_ws,
    skip_empty_rows = skip_empty_rows
  )

  melt_delimited(
    file, tsv_tokenizer,
    locale = locale,
    skip = skip,
    skip_empty_rows = skip_empty_rows,
    comment = comment,
    n_max = n_max,
    progress = progress
  )
}
115 |
116 | # Helper functions for reading from delimited files ----------------------------
# Column specification shared by every melt_*() reader: the melted output
# always has numeric `row`/`col` positions plus character `data_type` and
# `value` columns.
col_spec_melt <- local({
  dbl_collector <- structure(list(), class = c("collector_double", "collector"))
  chr_collector <- structure(list(), class = c("collector_character", "collector"))
  spec <- list(
    row = dbl_collector,
    col = dbl_collector,
    data_type = chr_collector,
    value = chr_collector
  )
  structure(spec, .Names = c("row", "col", "data_type", "value"))
})
131 |
# Invoke the C++ melting routine. `n_max = Inf` means "read everything",
# which the C++ layer expresses as -1.
melt_tokens <- function(data, tokenizer, locale_, n_max, progress) {
  limit <- if (n_max == Inf) -1 else n_max
  melt_tokens_(data, tokenizer, col_spec_melt, locale_, limit, progress)
}
138 |
melt_delimited <- function(file, tokenizer, locale = default_locale(),
                           skip = 0, skip_empty_rows = FALSE, comment = "", n_max = Inf,
                           progress = show_progress()) {
  name <- source_name(file)
  file <- standardise_path(file)

  if (is.connection(file)) {
    # A connection can only be read once, so slurp it into memory up front.
    input <- datasource_connection(file, skip, skip_empty_rows = skip_empty_rows, comment)
  } else {
    if (empty_file(file)) {
      # Nothing to parse: return an empty melted tibble with the standard
      # four columns.
      return(tibble::tibble(
        row = double(), col = double(),
        data_type = character(), value = character()
      ))
    }
    if (is.character(file) && identical(locale$encoding, "UTF-8")) {
      # When locale is not set, file is probably marked as its correct
      # encoding. As default_locale() assumes the input is UTF-8, re-encode
      # literal text so non-UTF-8 MBCS locales behave correctly.
      input <- enc2utf8(file)
    } else {
      input <- file
    }
  }

  src <- datasource(input, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment)
  melted <- melt_tokens(src, tokenizer,
    locale_ = locale, n_max = n_max,
    progress = progress
  )
  warn_problems(melted)
}
165 |
--------------------------------------------------------------------------------
/R/melt_delim_chunked.R:
--------------------------------------------------------------------------------
# Generates the chunked definition from the melt_* definition.
#
# Metaprogramming helper: takes a melt_*() function `x` and rewrites it into
# its chunked variant by (1) dropping the `n_max` formal, (2) inserting
# `callback`/`chunk_size` formals right after `file`, and (3) redirecting the
# trailing melt_delimited() call to melt_delimited_chunked().
generate_melt_chunked_fun <- function(x) { # nocov start
  args <- formals(x)

  # Remove n_max argument: chunked reading is bounded by chunk_size instead.
  args <- args[names(args) != "n_max"]

  # Insert `callback` (required, no default) and `chunk_size` after the
  # first formal (`file`).
  args <- append(args, alist(callback = , chunk_size = 10000), 1)

  b <- as.list(body(x))

  # Change melt_delimited to melt_delimited_chunked; by convention the
  # melt_delimited() call is the last expression of every melt_*() body.
  b[[length(b)]][[1]] <- quote(melt_delimited_chunked)

  call_args <- as.list(b[[length(b)]])

  # Remove the n_max argument from the rewritten call as well.
  call_args <- call_args[!names(call_args) == "n_max"]

  # Add the callback and chunk_size arguments (after function + file).
  b[[length(b)]] <- as.call(append(call_args, alist(callback = callback, chunk_size = chunk_size), 2))

  body(x) <- as.call(b)

  formals(x) <- args

  x
} # nocov end
29 |
30 | # Generates the modified melt_delimited function
generate_melt_delimited_chunked <- function(x) { # nocov start
  args <- formals(x)
  # Drop `n_max`; chunked reading is bounded by chunk_size instead.
  args <- args[names(args) != "n_max"]
  # Insert `callback` (required) and `chunk_size` after the first formal.
  args <- append(args, alist(callback = , chunk_size = 10000), 1)

  b <- as.list(body(x))

  # Scan the body for the `out <- melt_tokens(...)` assignment.
  for (i in seq_along(b)) {
    if (is.call(b[[i]]) && identical(b[[i]][[1]], as.symbol("<-")) &&
      is.call(b[[i]][[3]]) && identical(b[[i]][[3]][[1]], quote(melt_tokens))) {

      # Change melt_tokens() to melt_tokens_chunked
      b[[i]][[3]][[1]] <- quote(melt_tokens_chunked)
      chunked_call <- as.list(b[[i]][[3]])

      # Remove the n_max argument
      chunked_call <- chunked_call[!names(chunked_call) == "n_max"]

      # Add the callback and chunk_size arguments. Note this replaces the
      # whole `out <- ...` assignment with a bare melt_tokens_chunked()
      # call: results flow through the callback, not a return value.
      b[[i]] <- as.call(append(chunked_call, alist(callback = callback, chunk_size = chunk_size), 2))

      # Remove additional calls (everything after the melt call, e.g. the
      # warn_problems() pass-through).
      b <- b[-seq(i + 1, length(b))]
      body(x) <- as.call(b)
      formals(x) <- args
      return(x)
    }
  }

  x
} # nocov end
62 |
# Chunked analogue of melt_tokens(): feeds each parsed chunk to `callback`
# and returns whatever the callback accumulated.
melt_tokens_chunked <- function(data, callback, chunk_size, tokenizer, locale_, progress) {
  cb <- as_chunk_callback(callback)
  # Guarantee the callback's cleanup hook runs even if parsing errors out.
  on.exit(cb$finally(), add = TRUE)

  melt_tokens_chunked_(
    data, cb, chunk_size, tokenizer, col_spec_melt,
    locale_, progress
  )

  cb$result()
}
74 |
# Chunked analogue of melt_delimited(), derived from its definition at load time.
melt_delimited_chunked <- generate_melt_delimited_chunked(melt_delimited)
76 |
77 | #' Melt a delimited file by chunks
78 | #'
79 | #' For certain non-rectangular data formats, it can be useful to parse the data
80 | #' into a melted format where each row represents a single token.
81 | #'
82 | #' `melt_delim_chunked()` and the specialisations `melt_csv_chunked()`,
83 | #' `melt_csv2_chunked()` and `melt_tsv_chunked()` read files by a chunk of rows
84 | #' at a time, executing a given function on one chunk before reading the next.
85 | #'
86 | #' @inheritParams readr::read_delim_chunked
87 | #' @param callback A callback function to call on each chunk
88 | #' @param chunk_size The number of rows to include in each chunk
89 | #' @return A [tibble()] of four columns:
90 | #' * `row`, the row that the token comes from in the original file
91 | #' * `col`, the column that the token comes from in the original file
92 | #' * `data_type`, the data type of the token, e.g. `"integer"`, `"character"`,
93 | #' `"date"`, guessed in a similar way to the `guess_parser()` function.
94 | #' * `value`, the token itself as a character string, unchanged from its
95 | #' representation in the original file.
96 | #'
97 | #' If there are parsing problems, a warning tells you
98 | #' how many, and you can retrieve the details with [problems()].
99 | #' @keywords internal
100 | #' @family chunked
101 | #' @export
102 | #' @examples
103 | #' # Cars with 3 gears
104 | #' f <- function(x, pos) subset(x, data_type == "integer")
105 | #' melt_csv_chunked(meltr_example("mtcars.csv"), DataFrameCallback$new(f), chunk_size = 5)
melt_delim_chunked <- generate_melt_chunked_fun(melt_delim) # generated from melt_delim()

#' @rdname melt_delim_chunked
#' @export
melt_csv_chunked <- generate_melt_chunked_fun(melt_csv) # generated from melt_csv()

#' @rdname melt_delim_chunked
#' @export
melt_csv2_chunked <- generate_melt_chunked_fun(melt_csv2) # generated from melt_csv2()

#' @rdname melt_delim_chunked
#' @export
melt_tsv_chunked <- generate_melt_chunked_fun(melt_tsv) # generated from melt_tsv()

# The generated bodies reference `callback` and `chunk_size` as free names;
# declare them so R CMD check does not flag undefined globals.
utils::globalVariables(c("callback", "chunk_size"))
121 |
--------------------------------------------------------------------------------
/R/melt_fwf.R:
--------------------------------------------------------------------------------
1 |
2 |
3 | #' Return melted data for each token in a fixed width file
4 | #'
5 | #' For certain non-rectangular data formats, it can be useful to parse the data
6 | #' into a melted format where each row represents a single token.
7 | #'
#' `melt_fwf()` parses each token of a fixed width file into a single row, but
#' it still requires that each field is in the same position in every row of
#' the source file.
11 | #'
#' @seealso [melt_table()] to melt fixed width files where each
#' column is separated by whitespace, and [readr::read_fwf()] for the
#' conventional way to read rectangular data from fixed width files.
15 | #' @inheritParams readr::read_fwf
16 | #' @param col_positions Column positions, as created by [fwf_empty()],
17 | #' [fwf_widths()] or [fwf_positions()]. To read in only selected fields,
18 | #' use [fwf_positions()]. If the width of the last column is variable (a
19 | #' ragged fwf file), supply the last end position as NA.
20 | #' @return A [tibble()] of four columns:
21 | #' * `row`, the row that the token comes from in the original file
22 | #' * `col`, the column that the token comes from in the original file
23 | #' * `data_type`, the data type of the token, e.g. `"integer"`, `"character"`,
24 | #' `"date"`, guessed in a similar way to the `guess_parser()` function.
25 | #' * `value`, the token itself as a character string, unchanged from its
26 | #' representation in the original file.
27 | #'
28 | #' If there are parsing problems, a warning tells you
29 | #' how many, and you can retrieve the details with [problems()].
30 | #' @export
31 | #' @examples
32 | #' fwf_sample <- meltr_example("fwf-sample.txt")
33 | #' writeLines(readLines(fwf_sample))
34 | #'
35 | #' # You can specify column positions in several ways:
36 | #' # 1. Guess based on position of empty columns
37 | #' melt_fwf(fwf_sample, fwf_empty(fwf_sample, col_names = c("first", "last", "state", "ssn")))
38 | #' # 2. A vector of field widths
39 | #' melt_fwf(fwf_sample, fwf_widths(c(20, 10, 12), c("name", "state", "ssn")))
40 | #' # 3. Paired vectors of start and end positions
41 | #' melt_fwf(fwf_sample, fwf_positions(c(1, 30), c(10, 42), c("name", "ssn")))
42 | #' # 4. Named arguments with start and end positions
43 | #' melt_fwf(fwf_sample, fwf_cols(name = c(1, 10), ssn = c(30, 42)))
44 | #' # 5. Named arguments with column widths
45 | #' melt_fwf(fwf_sample, fwf_cols(name = 20, state = 10, ssn = 12))
melt_fwf <- function(file, col_positions,
                     locale = default_locale(), na = c("", "NA"),
                     comment = "", trim_ws = TRUE, skip = 0, n_max = Inf,
                     progress = show_progress(),
                     skip_empty_rows = FALSE) {
  ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows)

  # Empty file: nothing to melt, return the standard zero-row shape.
  if (inherits(ds, "source_file") && empty_file(file)) {
    return(tibble::tibble(
      row = double(), col = double(),
      data_type = character(), value = character()
    ))
  }

  fwf_tokenizer <- tokenizer_fwf(
    as.integer(col_positions$begin),
    as.integer(col_positions$end),
    na = na,
    comment = comment,
    trim_ws = trim_ws,
    skip_empty_rows = skip_empty_rows
  )

  # The C++ layer expresses "no limit" as -1.
  limit <- if (n_max == Inf) -1 else n_max
  melted <- melt_tokens(ds, fwf_tokenizer,
    locale_ = locale,
    n_max = limit, progress = progress
  )
  warn_problems(melted)
}
69 |
70 | #' @rdname melt_fwf
71 | #' @export
72 | #' @param n Number of lines the tokenizer will read to determine file structure. By default
73 | #' it is set to 100.
fwf_empty <- function(file, skip = 0, skip_empty_rows = FALSE, col_names = NULL, comment = "", n = 100L) {
  ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows)

  # Guess field boundaries from runs of blank columns in the first `n` lines.
  out <- whitespaceColumns(ds, comment = comment, n = n)
  # Mark the final field as ragged: NA end means "read to end of line".
  out$end[length(out$end)] <- NA

  out$col_names <- fwf_col_names(col_names, length(out$begin))
  out
}
84 |
85 | #' @rdname melt_fwf
86 | #' @export
87 | #' @param widths Width of each field. Use NA as width of last field when
88 | #' reading a ragged fwf file.
89 | #' @param col_names Either NULL, or a character vector column names.
fwf_widths <- function(widths, col_names = NULL) {
  # Turn consecutive field widths into 1-based (start, end) position pairs.
  boundaries <- cumsum(c(1L, abs(widths)))
  starts <- boundaries[-length(boundaries)]
  ends <- boundaries[-1] - 1L
  fwf_positions(starts, ends, col_names)
}
94 |
95 | #' @rdname melt_fwf
96 | #' @export
97 | #' @param start,end Starting and ending (inclusive) positions of each field.
98 | #' Use NA as last end field when reading a ragged fwf file.
fwf_positions <- function(start, end = NULL, col_names = NULL) {
  stopifnot(length(start) == length(end))
  nms <- as.character(fwf_col_names(col_names, length(start)))

  # Shift starts to the 0-based offsets used by the C++ tokenizer. An
  # inclusive 1-based `end` is numerically equal to an exclusive 0-based
  # end, so `end` needs no adjustment.
  tibble(
    begin = start - 1L,
    end = end,
    col_names = nms
  )
}
109 |
110 |
111 | #' @rdname melt_fwf
112 | #' @export
113 | #' @param ... If the first element is a data frame,
114 | #' then it must have all numeric columns and either one or two rows.
115 | #' The column names are the variable names. The column values are the
116 | #' variable widths if a length one vector, and if length two, variable start and end
#' positions. The elements of `...` are used to construct a data frame
#' with one or two rows as above.
fwf_cols <- function(...) {
  # Coerce each argument to integer and fill in placeholder names, then
  # assemble into a data frame: one row per value supplied for each column.
  x <- lapply(list(...), as.integer)
  names(x) <- fwf_col_names(names(x), length(x))
  x <- tibble::as_tibble(x)
  if (nrow(x) == 2) {
    # Two values per variable: explicit (start, end) positions.
    res <- fwf_positions(as.integer(x[1, ]), as.integer(x[2, ]), names(x))
  } else if (nrow(x) == 1) {
    # One value per variable: field widths.
    res <- fwf_widths(as.integer(x[1, ]), names(x))
  } else {
    # Fixed: original message read "either one (width) two (start, end)",
    # dropping the "or".
    stop("All variables must have either one (width) or two (start, end) values.",
      call. = FALSE
    )
  }
  res
}
134 |
# Fill unnamed positions with X1..Xn placeholders; NULL means "no names at
# all", which is treated as every name being empty.
fwf_col_names <- function(nm, n) {
  if (is.null(nm)) {
    nm <- rep("", n)
  }
  placeholders <- paste0("X", seq_len(n))
  blank <- (nm == "")
  nm[blank] <- placeholders[blank]
  nm
}
141 |
--------------------------------------------------------------------------------
/R/melt_table.R:
--------------------------------------------------------------------------------
1 | #' Return melted data for each token in a whitespace-separated file
2 | #'
3 | #' @description
4 | #'
5 | #' For certain non-rectangular data formats, it can be useful to parse the data
6 | #' into a melted format where each row represents a single token.
7 | #'
8 | #' `melt_table()` and `melt_table2()` are designed to read the type of textual
9 | #' data where each column is separated by one (or more) columns of space.
10 | #'
11 | #' `melt_table2()` allows any number of whitespace characters between columns,
12 | #' and the lines can be of different lengths.
13 | #'
14 | #' `melt_table()` is more strict, each line must be the same length,
15 | #' and each field is in the same position in every line. It first finds empty
16 | #' columns and then parses like a fixed width file.
17 | #'
18 | #' @seealso [melt_fwf()] to melt fixed width files where each column
19 | #' is not separated by whitespace. `melt_fwf()` is also useful for reading
20 | #' tabular data with non-standard formatting. [readr::read_table()] is the
21 | #' conventional way to read tabular data from whitespace-separated files.
22 | #' @inheritParams readr::read_table
23 | #' @return A [tibble()] of four columns:
24 | #' * `row`, the row that the token comes from in the original file
25 | #' * `col`, the column that the token comes from in the original file
26 | #' * `data_type`, the data type of the token, e.g. `"integer"`, `"character"`,
27 | #' `"date"`, guessed in a similar way to the `guess_parser()` function.
28 | #' * `value`, the token itself as a character string, unchanged from its
29 | #' representation in the original file.
30 | #'
31 | #' If there are parsing problems, a warning tells you
32 | #' how many, and you can retrieve the details with [problems()].
33 | #' @export
34 | #' @examples
35 | #' # One corner from http://www.masseyratings.com/cf/compare.htm
36 | #' massey <- meltr_example("massey-rating.txt")
37 | #' cat(readLines(massey))
38 | #' melt_table(massey)
39 | #'
40 | #' # Sample of 1978 fuel economy data from
41 | #' # http://www.fueleconomy.gov/feg/epadata/78data.zip
42 | #' epa <- meltr_example("epa78.txt")
43 | #' writeLines(readLines(epa))
44 | #' melt_table(epa)
melt_table <- function(file, locale = default_locale(), na = "NA", skip = 0,
                       n_max = Inf, guess_max = min(n_max, 1000),
                       progress = show_progress(), comment = "",
                       skip_empty_rows = FALSE) {
  ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows)

  # Empty file: nothing to melt, return the standard zero-row shape.
  if (inherits(ds, "source_file") && empty_file(file)) {
    return(tibble::tibble(
      row = double(), col = double(),
      data_type = character(), value = character()
    ))
  }

  # Locate field boundaries from blank columns, then parse the input like a
  # fixed width file.
  columns <- fwf_empty(ds, skip = skip, skip_empty_rows = skip_empty_rows, n = guess_max, comment = comment)
  fwf_tok <- tokenizer_fwf(columns$begin, columns$end,
    na = na,
    comment = comment,
    skip_empty_rows = skip_empty_rows
  )

  # Rebuild the datasource from itself before the second pass (presumably
  # resets the read position consumed by the guessing step -- mirrors the
  # original implementation).
  ds <- datasource(file = ds, skip = skip, skip_empty_rows = skip_empty_rows)
  melted <- melt_tokens(ds, fwf_tok,
    locale_ = locale, n_max = n_max,
    progress = progress
  )
  warn_problems(melted)
}
71 |
72 | #' @rdname melt_table
73 | #' @export
melt_table2 <- function(file, locale = default_locale(), na = "NA", skip = 0,
                        n_max = Inf, progress = show_progress(), comment = "",
                        skip_empty_rows = FALSE) {
  ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows)

  # Empty file: nothing to melt, return the standard zero-row shape.
  if (inherits(ds, "source_file") && empty_file(file)) {
    return(tibble::tibble(
      row = double(), col = double(),
      data_type = character(), value = character()
    ))
  }

  # Any run of whitespace separates fields; lines may differ in length.
  ws_tokenizer <- tokenizer_ws(
    na = na,
    comment = comment,
    skip_empty_rows = skip_empty_rows
  )

  ds <- datasource(file = ds, skip = skip, skip_empty_rows = skip_empty_rows)
  melt_delimited(ds, ws_tokenizer,
    locale = locale, skip = skip,
    comment = comment, n_max = n_max, progress = progress
  )
}
95 |
--------------------------------------------------------------------------------
/R/meltr-package.R:
--------------------------------------------------------------------------------
1 | ## usethis namespace: start
2 | #' @useDynLib meltr, .registration = TRUE
3 | ## usethis namespace: end
4 | NULL
5 |
--------------------------------------------------------------------------------
/R/problems.R:
--------------------------------------------------------------------------------
1 | #' Retrieve parsing problems
2 | #'
3 | #' Readr functions will only throw an error if parsing fails in an unrecoverable
4 | #' way. However, there are lots of potential problems that you might want to
5 | #' know about - these are stored in the `problems` attribute of the
6 | #' output, which you can easily access with this function.
7 | #' `stop_for_problems()` will throw an error if there are any parsing
8 | #' problems: this is useful for automated scripts where you want to throw
9 | #' an error as soon as you encounter a problem.
10 | #'
11 | #' @param x An data frame (from `read_*()`) or a vector
12 | #' (from `parse_*()`).
13 | #' @return A data frame with one row for each problem and four columns:
14 | #' \item{row,col}{Row and column of problem}
15 | #' \item{expected}{What readr expected to find}
16 | #' \item{actual}{What it actually got}
17 | #' @export
18 | #' @examples
19 | #' if (requireNamespace("readr")) {
20 | #' x <- readr::parse_integer(c("1X", "blah", "3"))
21 | #' problems(x)
22 | #'
23 | #' y <- readr::parse_integer(c("1", "2", "3"))
24 | #' problems(y)
25 | #' }
problems <- local({
  # Zero-row template returned (invisibly) when `x` carries no problems
  # attribute; built once and captured in the closure.
  empty_problems <- tibble::tibble(
    row = integer(),
    col = integer(),
    expected = character(),
    actual = character()
  )

  function(x = .Last.value) {
    found <- probs(x)
    if (is.null(found)) {
      return(invisible(empty_problems))
    }
    found
  }
})
44 |
45 | #' @export
46 | #' @rdname problems
stop_for_problems <- function(x) {
  # Error out if `x` carries any parsing problems; otherwise pass `x`
  # through invisibly.
  n <- n_problems(x)
  if (n > 0) {
    stop(n, " parsing failure", if (n > 1) "s", call. = FALSE)
  }
  invisible(x)
}
55 |
# Fetch the raw "problems" attribute; warnings attached to `x` are
# suppressed so probing it does not re-emit them.
probs <- function(x) {
  quiet_x <- suppressWarnings(x)
  attr(quiet_x, "problems")
}
59 |
# Count the parsing problems attached to `x` (0 when there are none).
n_problems <- function(x) {
  p <- problems(x)
  if (is.null(p)) {
    return(0)
  }
  nrow(p)
}
64 |
# Subset `x` to just the rows that had parsing problems (zero-row slice
# when there are none).
problem_rows <- function(x) {
  if (n_problems(x) > 0) {
    bad_rows <- unique(problems(x)$row)
    return(x[bad_rows, , drop = FALSE])
  }
  x[0, , drop = FALSE]
}
73 |
# Emit a single warning summarising up to five parsing problems, rendered
# as an aligned text table, then return `x` unchanged so callers can use
# this as a pass-through.
warn_problems <- function(x) {
  n <- n_problems(x)
  if (n == 0) {
    return(x)
  }

  probs <- as.data.frame(attr(x, "problems"))
  many_problems <- nrow(probs) > 5

  # Format only the first five problems; replace literal "NA" cells with
  # "--" so missing fields read as placeholders.
  probs_f <- format(utils::head(probs, 5), justify = "left")
  probs_f[probs_f == "NA"] <- "--"
  # Prepend the column names as a header row, then right-align each column.
  probs_f <- rbind(names(probs), probs_f)
  probs_f <- lapply(probs_f, format, justify = "right")

  if (many_problems) {
    # nchar fails with non-ascii characters, so encode characters beforehand.
    width <- vapply(probs_f, function(x) max(nchar(encodeString(x))), integer(1))
    dots <- vapply(width, function(i) paste(rep(".", i), collapse = ""),
      FUN.VALUE = character(1)
    )

    # A full-width row of dots per column signals truncated output.
    probs_f <- Map(c, probs_f, dots)
  }

  # Join columns with spaces and rows with newlines into one message body.
  probs_f <- do.call(paste, c(probs_f, list(sep = " ", collapse = "\n")))
  warning(n, " parsing failure", if (n > 1) "s", ".\n",
    probs_f, "\n",
    if (many_problems) "See problems(...) for more details.\n",
    call. = FALSE, immediate. = TRUE, noBreaks. = TRUE
  )

  x
}
107 |
# Rewrite the problems attribute so columns are reported by name rather
# than index, and record which input (`name`) the problems came from.
name_problems <- function(x, all_colnames, name = "input") {
  if (n_problems(x) == 0) {
    return(x)
  }

  p <- problems(x)
  p$file <- name
  p$col <- all_colnames[p$col]
  attr(x, "problems") <- p
  x
}
120 |
--------------------------------------------------------------------------------
/R/sysdata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/r-lib/meltr/38c5a720afe794d1fd2f36e5bb552dd9a8ca8b47/R/sysdata.rda
--------------------------------------------------------------------------------
/R/tokenizer.R:
--------------------------------------------------------------------------------
1 | #' Tokenizers.
2 | #'
#' Explicitly create tokenizer objects. Usually you will not call these
#' functions, but will instead use one of the user-friendly wrappers like
#' [readr::read_csv()].
6 | #'
7 | #' @keywords internal
8 | #' @name Tokenizers
9 | #' @examples
10 | #' tokenizer_csv()
11 | NULL
12 |
13 | #' @export
14 | #' @rdname Tokenizers
15 | #' @param comment A string used to identify comments. Any text after the
16 | #' comment characters will be silently ignored.
17 | #' @param na Character vector of strings to interpret as missing values. Set this
18 | #' option to `character()` to indicate no missing values.
19 | #' @param quoted_na Should missing values inside quotes be treated as missing
20 | #' values (the default) or strings.
21 | #' @param delim Single character used to separate fields within a record.
22 | #' @param quote Single character used to quote strings.
23 | #' @param trim_ws Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from
24 | #' each field before parsing it?
25 | #' @param escape_double Does the file escape quotes by doubling them?
26 | #' i.e. If this option is `TRUE`, the value `""""` represents
27 | #' a single quote, `\"`.
28 | #' @param escape_backslash Does the file use backslashes to escape special
29 | #' characters? This is more general than `escape_double` as backslashes
30 | #' can be used to escape the delimiter character, the quote character, or
31 | #' to add special characters like `\\n`.
32 | #' @param skip_empty_rows Should blank rows be ignored altogether? i.e. If this
33 | #' option is `TRUE` then blank rows will not be represented at all. If it is
34 | #' `FALSE` then they will be represented by `NA` values in all the columns.
#' @return A tokenizer object
36 | #' @examples
37 | #' tokenizer_delim(",")
38 | tokenizer_delim <- function(delim, quote = '"', na = "NA", quoted_na = TRUE, comment = "",
39 | trim_ws = TRUE,
40 | escape_double = TRUE,
41 | escape_backslash = FALSE,
42 | skip_empty_rows = TRUE) {
43 | structure(
44 | list(
45 | delim = delim,
46 | quote = quote,
47 | na = na,
48 | quoted_na = quoted_na,
49 | comment = comment,
50 | trim_ws = trim_ws,
51 | escape_double = escape_double,
52 | escape_backslash = escape_backslash,
53 | skip_empty_rows = skip_empty_rows
54 | ),
55 | class = "tokenizer_delim"
56 | )
57 | }
58 |
59 | #' @export
60 | #' @rdname Tokenizers
61 | tokenizer_csv <- function(na = "NA", quoted_na = TRUE, quote = "\"",
62 | comment = "", trim_ws = TRUE,
63 | skip_empty_rows = TRUE) {
64 | tokenizer_delim(
65 | delim = ",",
66 | na = na,
67 | quoted_na = quoted_na,
68 | quote = quote,
69 | comment = comment,
70 | trim_ws = trim_ws,
71 | escape_double = TRUE,
72 | escape_backslash = FALSE,
73 | skip_empty_rows = skip_empty_rows
74 | )
75 | }
76 |
77 | #' @export
78 | #' @rdname Tokenizers
79 | tokenizer_tsv <- function(na = "NA", quoted_na = TRUE, quote = "\"",
80 | comment = "", trim_ws = TRUE,
81 | skip_empty_rows = TRUE) {
82 | tokenizer_delim(
83 | delim = "\t",
84 | na = na,
85 | quoted_na = quoted_na,
86 | quote = quote,
87 | comment = comment,
88 | trim_ws = trim_ws,
89 | escape_double = TRUE,
90 | escape_backslash = FALSE,
91 | skip_empty_rows = skip_empty_rows
92 | )
93 | }
94 |
95 | #' @export
96 | #' @rdname Tokenizers
97 | tokenizer_line <- function(na = character(), skip_empty_rows = TRUE) {
98 | structure(list(na = na, skip_empty_rows = skip_empty_rows),
99 | class = "tokenizer_line"
100 | )
101 | }
102 |
103 | #' @export
104 | #' @rdname Tokenizers
105 | tokenizer_log <- function(trim_ws) {
106 | structure(list(trim_ws = trim_ws), class = "tokenizer_log")
107 | }
108 |
109 |
110 | #' @export
111 | #' @rdname Tokenizers
112 | #' @param begin,end Begin and end offsets for each file. These are C++
113 | #' offsets so the first column is column zero, and the ranges are
114 | #' [begin, end) (i.e inclusive-exclusive).
115 | tokenizer_fwf <- function(begin, end, na = "NA", comment = "", trim_ws = TRUE,
116 | skip_empty_rows = TRUE) {
117 | structure(list(
118 | begin = as.integer(begin), end = as.integer(end), na = na, comment = comment,
119 | trim_ws = trim_ws, skip_empty_rows = skip_empty_rows
120 | ),
121 | class = "tokenizer_fwf"
122 | )
123 | }
124 |
125 | #' @export
126 | #' @rdname Tokenizers
127 | tokenizer_ws <- function(na = "NA", comment = "", skip_empty_rows = TRUE) {
128 | structure(list(na = na, comment = comment, skip_empty_rows = skip_empty_rows),
129 | class = "tokenizer_ws"
130 | )
131 | }
132 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
1 | # Silence R CMD check note
2 | #' @importFrom tibble tibble
3 | NULL
4 |
# Is `x` a connection object (file, url, textConnection, ...)?
is.connection <- function(x) {
  inherits(x, "connection")
}
6 |
# Null-coalescing operator: return `a` unless it is NULL, else `b`.
`%||%` <- function(a, b) {
  if (!is.null(a)) a else b
}
8 |
9 | #' Determine whether progress bars should be shown
10 | #'
11 | #' Progress bars are shown _unless_ one of the following is `TRUE`
12 | #' - The bar is explicitly disabled by setting `options(readr.show_progress = FALSE)`
13 | #' - The code is run in a non-interactive session (`interactive()` is `FALSE`).
14 | #' - The code is run in an RStudio notebook chunk.
15 | #' - The code is run by knitr / rmarkdown.
16 | #'
17 | #' @return A logical value
18 | #' @export
19 | #' @examples
20 | #' show_progress()
show_progress <- function() {
  # NOTE(review): the roxygen above says progress is shown *unless*
  # explicitly disabled, but isTRUE() of an unset option is FALSE, so this
  # actually requires `options(readr.show_progress = TRUE)` to have been
  # set somewhere -- confirm which contract is intended.
  isTRUE(getOption("readr.show_progress")) && # user disables progress bar
    interactive() && # an interactive session
    !isTRUE(getOption("rstudio.notebook.executing")) && # Not running in an RStudio notebook chunk
    !isTRUE(getOption("knitr.in.progress")) # Not actively knitting a document
}
27 |
28 | #' @importFrom tibble as_tibble
29 | #' @export
as_tibble.meltr_spec_tbl_df <- function(x, ...) {
  # Strip meltr metadata and the meltr class before delegating to the next
  # as_tibble() method.
  for (meta in c("spec", "problems")) {
    attr(x, meta) <- NULL
  }
  class(x) <- setdiff(class(x), "meltr_spec_tbl_df")
  NextMethod("as_tibble")
}
36 |
37 | #' @export
as.data.frame.meltr_spec_tbl_df <- function(x, ...) {
  # Strip meltr metadata and the meltr class before delegating to the next
  # as.data.frame() method.
  for (meta in c("spec", "problems")) {
    attr(x, meta) <- NULL
  }
  class(x) <- setdiff(class(x), "meltr_spec_tbl_df")
  NextMethod("as.data.frame")
}
44 |
45 | #' @export
`[.meltr_spec_tbl_df` <- function(x, ...) {
  # Subsetting invalidates the spec/problems metadata, so drop it along
  # with the meltr class. Fixed: the original removed "spec_tbl_df" (a
  # class name from readr that is not in this object's class vector), which
  # was a no-op and left "meltr_spec_tbl_df" attached -- inconsistent with
  # the as_tibble()/as.data.frame() methods above.
  attr(x, "spec") <- NULL
  attr(x, "problems") <- NULL
  class(x) <- setdiff(class(x), "meltr_spec_tbl_df")
  NextMethod(`[`)
}
52 |
53 | #' @importFrom methods setOldClass
setOldClass(c("meltr_spec_tbl_df", "tbl_df", "tbl", "data.frame")) # register the S3 class chain so S4 dispatch recognises it
55 |
56 | # @export
# @export
# testthat comparison: ignore meltr metadata when diffing two melted frames.
compare.meltr_spec_tbl_df <- function(x, y, ...) {
  strip_meta <- function(d) {
    attr(d, "spec") <- NULL
    attr(d, "problems") <- NULL
    d
  }
  x <- strip_meta(x)
  y <- strip_meta(y)

  NextMethod("compare")
}
66 |
67 | # @export
# @export
# waldo proxy: diff melted frames without their meltr metadata.
compare_proxy.meltr_spec_tbl_df <- function(x) {
  for (meta in c("spec", "problems")) {
    attr(x, meta) <- NULL
  }
  x
}
73 |
# TRUE only when `x` has a names attribute and every name is non-empty and
# non-NA.
is_named <- function(x) {
  nms <- names(x)
  !is.null(nms) && all(!is.na(nms) & nzchar(nms))
}
83 |
.onLoad <- function(...) {
  # Soft-register S3 methods for suggested packages so meltr does not take
  # hard dependencies on testthat or waldo.
  registrations <- list(
    c("testthat", "compare"),
    c("waldo", "compare_proxy")
  )
  for (r in registrations) {
    register_s3_method(r[[1]], r[[2]], "meltr_spec_tbl_df")
  }
}
88 |
# Dynamically register an S3 method on another package's generic, using the
# "soft registration" pattern: register immediately if the package is
# loaded, and always install an onLoad hook so the registration survives
# the package being loaded (or unloaded and reloaded) later.
register_s3_method <- function(pkg, generic, class, fun = NULL) {
  stopifnot(is.character(pkg), length(pkg) == 1)
  stopifnot(is.character(generic), length(generic) == 1)
  stopifnot(is.character(class), length(class) == 1)

  if (is.null(fun)) {
    # Default: look up `generic.class` starting from the caller's frame.
    fun <- get(paste0(generic, ".", class), envir = parent.frame())
  } else {
    stopifnot(is.function(fun))
  }

  if (pkg %in% loadedNamespaces()) {
    registerS3method(generic, class, fun, envir = asNamespace(pkg))
  }

  # Always register hook in case package is later unloaded & reloaded
  setHook(
    packageEvent(pkg, "onLoad"),
    function(...) {
      registerS3method(generic, class, fun, envir = asNamespace(pkg))
    }
  )
}
112 |
113 | # Silence R CMD check note
114 | # Namespaces in Imports field not imported from:
115 | # ‘R6’ ‘rlang’
116 | # All declared Imports should be used.
117 | # See https://github.com/hadley/r-pkgs/issues/828
fake_function_1 <- function() R6::R6Class # never called; exists only to reference R6 for R CMD check
fake_function_2 <- function() rlang::int # never called; exists only to reference rlang for R CMD check
120 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 |
5 |
6 |
7 | ```{r, include = FALSE}
8 | knitr::opts_chunk$set(
9 | collapse = TRUE,
10 | comment = "#>",
11 | fig.path = "man/figures/README-",
12 | out.width = "100%"
13 | )
14 | ```
15 |
16 | # meltr
17 |
18 |
19 | [](https://github.com/r-lib/meltr/actions)
20 | [](https://app.codecov.io/gh/r-lib/meltr?branch=main)
21 |
22 |
23 |
24 |
25 |
26 |
27 | The goal of 'meltr' is to provide a fast and friendly way to read
28 | non-rectangular data (like ragged forms of 'csv', 'tsv', and 'fwf').
29 |
30 | Standard tools like [`readr::read_csv()`](https://readr.tidyverse.org/reference/read_delim.html) can cope to some extent with unusual inputs, like files with empty rows or newlines embedded in strings.
31 | But some files are so wacky that standard tools don't work at all, and instead you have to take the file to pieces and reassemble to get structured data you can work with.
32 |
33 | The meltr package provides tools to do this.
34 |
35 | ## Installation
36 |
37 | You can install the released version of meltr from CRAN with:
38 |
39 | ``` r
40 | install.packages("meltr")
41 | ```
42 |
43 | Or you can install the development version with:
44 |
45 | ```r
46 | # install.packages("devtools")
47 | devtools::install_github("r-lib/meltr")
48 | ```
49 |
50 | ## The problem with non-rectangular data
51 |
52 | Here's a contrived example that breaks two assumptions made by common tools like `readr::read_csv()`.
53 |
54 | 1. There are more cells in some rows than others.
55 | 2. There are mixed data types within each column.
56 |
57 | In contrast, the `melt_csv()` function reads the file one cell at a time, importing each cell of the file into a whole row of the final data frame.
58 |
59 | ```{r}
60 | writeLines("Help,,007,I'm
61 | 1960-09-30,FALSE,trapped in,7,1.21
62 | non-rectangular,data,NA", "messy.csv")
63 |
64 | library(meltr)
65 |
66 | melt_csv("messy.csv")
67 | ```
68 |
69 | The output of `melt_csv()` gives us:
70 |
71 | - A data frame of results – structured data about un-structured data!
72 | - Rows of data corresponding to cells of the input data.
73 | - Empty cells such as the cell on row 1, but not missing cells at the ends of rows 1 and 3.
74 | - The raw, unconverted data, no data type conversion is attempted – every value is imported as a string, and the `data_type` column merely gives meltr's best guess of what the data types ought to be.
75 |
76 | What are some ways you can use this?
77 | To begin with, you can do some simple manipulations with ordinary functions.
78 |
79 | For example you could extract the words.
80 |
81 | ```{r}
82 | library(dplyr)
83 |
84 | data <- melt_csv("messy.csv")
85 |
86 | data %>%
87 | filter(data_type == "character")
88 | ```
89 |
90 | Or find if there are missing entries.
91 |
92 | ```{r}
93 | data %>%
94 | filter(data_type == "missing")
95 | ```
96 |
97 | ```{r, include = FALSE}
98 | unlink("messy.csv")
99 | ```
100 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # meltr
5 |
6 |
7 |
8 | [](https://github.com/r-lib/meltr/actions)
9 | [](https://app.codecov.io/gh/r-lib/meltr?branch=main)
11 |
12 |
13 |
14 |
15 |
16 |
17 | The goal of ‘meltr’ is to provide a fast and friendly way to read
18 | non-rectangular data (like ragged forms of ‘csv’, ‘tsv’, and ‘fwf’).
19 |
20 | Standard tools like
21 | [`readr::read_csv()`](https://readr.tidyverse.org/reference/read_delim.html)
22 | can cope to some extent with unusual inputs, like files with empty rows
23 | or newlines embedded in strings. But some files are so wacky that
24 | standard tools don’t work at all, and instead you have to take the file
25 | to pieces and reassemble to get structured data you can work with.
26 |
27 | The meltr package provides tools to do this.
28 |
29 | ## Installation
30 |
31 | You can install the released version of meltr from CRAN with:
32 |
33 | ``` r
34 | install.packages("meltr")
35 | ```
36 |
37 | Or you can install the development version with:
38 |
39 | ``` r
40 | # install.packages("devtools")
41 | devtools::install_github("r-lib/meltr")
42 | ```
43 |
44 | ## The problem with non-rectangular data
45 |
46 | Here’s a contrived example that breaks two assumptions made by common
47 | tools like `readr::read_csv()`.
48 |
49 | 1. There are more cells in some rows than others.
50 | 2. There are mixed data types within each column.
51 |
52 | In contrast, the `melt_csv()` function reads the file one cell at a
53 | time, importing each cell of the file into a whole row of the final data
54 | frame.
55 |
56 | ``` r
57 | writeLines("Help,,007,I'm
58 | 1960-09-30,FALSE,trapped in,7,1.21
59 | non-rectangular,data,NA", "messy.csv")
60 |
61 | library(meltr)
62 |
63 | melt_csv("messy.csv")
64 | #> # A tibble: 12 × 4
65 | #> row col data_type value
66 | #>
67 | #> 1 1 1 character Help
68 | #> 2 1 2 missing
69 | #> 3 1 3 character 007
70 | #> 4 1 4 character I'm
71 | #> 5 2 1 date 1960-09-30
72 | #> 6 2 2 logical FALSE
73 | #> 7 2 3 character trapped in
74 | #> 8 2 4 integer 7
75 | #> 9 2 5 double 1.21
76 | #> 10 3 1 character non-rectangular
77 | #> 11 3 2 character data
78 | #> 12 3 3 missing
79 | ```
80 |
81 | The output of `melt_csv()` gives us:
82 |
83 | - A data frame of results – structured data about un-structured data!
84 | - Rows of data corresponding to cells of the input data.
85 | - Empty cells such as the cell on row 1, but not missing cells at the
86 | ends of rows 1 and 3.
87 | - The raw, unconverted data, no data type conversion is attempted –
88 | every value is imported as a string, and the `data_type` column merely
89 | gives meltr’s best guess of what the data types ought to be.
90 |
91 | What are some ways you can use this? To begin with, you can do some
92 | simple manipulations with ordinary functions.
93 |
94 | For example you could extract the words.
95 |
96 | ``` r
97 | library(dplyr)
98 | #>
99 | #> Attaching package: 'dplyr'
100 | #> The following objects are masked from 'package:stats':
101 | #>
102 | #> filter, lag
103 | #> The following objects are masked from 'package:base':
104 | #>
105 | #> intersect, setdiff, setequal, union
106 |
107 | data <- melt_csv("messy.csv")
108 |
109 | data %>%
110 | filter(data_type == "character")
111 | #> # A tibble: 6 × 4
112 | #> row col data_type value
113 | #>
114 | #> 1 1 1 character Help
115 | #> 2 1 3 character 007
116 | #> 3 1 4 character I'm
117 | #> 4 2 3 character trapped in
118 | #> 5 3 1 character non-rectangular
119 | #> 6 3 2 character data
120 | ```
121 |
122 | Or find if there are missing entries.
123 |
124 | ``` r
125 | data %>%
126 | filter(data_type == "missing")
127 | #> # A tibble: 2 × 4
128 | #> row col data_type value
129 | #>
130 | #> 1 1 2 missing
131 | #> 2 3 3 missing
132 | ```
133 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | comment: false
2 |
3 | coverage:
4 | status:
5 | project:
6 | default:
7 | target: auto
8 | threshold: 1%
9 | informational: true
10 | patch:
11 | default:
12 | target: auto
13 | threshold: 1%
14 | informational: true
15 |
--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | ## R CMD check results
2 |
3 | 0 errors, 0 warnings, 0 notes
4 |
--------------------------------------------------------------------------------
/data-raw/date-symbols.R:
--------------------------------------------------------------------------------
1 | library(stringi)
2 |
3 | locs <- stri_locale_list() # All ICU locales known to stringi, e.g. "en_US", "fr_FR".
4 | base <- unique(stri_split_fixed(locs, "_", n = 2, simplify = TRUE)[, 1]) # Keep just the unique language part before the first "_" (e.g. "en", "fr").
5 |
6 | locale_info <- function(x) {
7 |   # Query ICU (via stringi) for this locale's date symbols in both widths.
8 |   wide_syms <- stri_datetime_symbols(x, context = "format", width = "wide")
9 |   abbr_syms <- stri_datetime_symbols(x, context = "format", width = "abbreviated")
10 |
11 |   # Bundle the symbols into a date_names object (named arguments, so
12 |   # the ordering here is immaterial).
13 |   date_names(
14 |     mon = wide_syms$Month,
15 |     day = wide_syms$Weekday,
16 |     am_pm = wide_syms$AmPm,
17 |     mon_ab = abbr_syms$Month,
18 |     day_ab = abbr_syms$Weekday
19 |   )
20 | }
18 |
19 | date_symbols <- lapply(base, locale_info) # One date_names object per base language.
20 | names(date_symbols) <- base # Name the list by language code for lookup, e.g. date_symbols[["fr"]].
21 |
22 | usethis::use_data(date_symbols, internal = TRUE, overwrite = TRUE) # Save to R/sysdata.rda for internal package use.
23 |
--------------------------------------------------------------------------------
/inst/extdata/epa78.txt:
--------------------------------------------------------------------------------
1 | ALFA ROMEO ALFA ROMEO 78010003
2 | ALFETTA 03 81 8 74 7 89 9 ALFETTA 78010053
3 | SPIDER 2000 01 SPIDER 2000 78010103
4 | AMC AMC 78020002
5 | GREMLIN 03 79 9 79 9 GREMLIN 78020053
6 | PACER 04 89 11 89 11 PACER 78020103
7 | PACER WAGON 07 90 26 91 26 PACER WAGON 78020153
8 | CONCORD 04 88 12 90 11 90 11 83 16 CONCORD 78020203
9 | CONCORD WAGON 07 91 30 91 30 CONCORD WAGON 78020253
10 | MATADOR COUPE 05 97 14 97 14 MATADOR COUPE 78020303
11 | MATADOR SEDAN 06 110 20 110 20 MATADOR SEDAN 78020353
12 | MATADOR WAGON 09 112 50 112 50 MATADOR WAGON 78020403
13 | ASTON MARTIN ASTON MARTIN 78040002
14 | ASTON MARTIN ASTON MARTIN 78040053
15 | AUDI AUDI 78050002
16 | FOX 03 84 11 84 11 84 11 FOX 78050053
17 | FOX WAGON 07 83 40 83 40 FOX WAGON 78050103
18 | 5000 04 90 15 90 15 5000 78050153
19 | AVANTI AVANTI 78065002
20 | AVANTI II 02 75 8 75 8 AVANTI II 78065053
21 |
--------------------------------------------------------------------------------
/inst/extdata/fwf-sample.txt:
--------------------------------------------------------------------------------
1 | John Smith WA 418-Y11-4111
2 | Mary Hartford CA 319-Z19-4341
3 | Evan Nolan IL 219-532-c301
4 |
--------------------------------------------------------------------------------
/inst/extdata/massey-rating.txt:
--------------------------------------------------------------------------------
1 | UCC PAY LAZ KPK RT COF BIH DII ENG ACU Rank Team Conf
2 | 1 1 1 1 1 1 1 1 1 1 1 Ohio St B10
3 | 2 2 2 2 2 2 2 2 4 2 2 Oregon P12
4 | 3 4 3 4 3 4 3 4 2 3 3 Alabama SEC
5 | 4 3 4 3 4 3 5 3 3 4 4 TCU B12
6 | 6 6 6 5 5 7 6 5 6 11 5 Michigan St B10
7 | 7 7 7 6 7 6 11 8 7 8 6 Georgia SEC
8 | 5 5 5 7 6 8 4 6 5 5 7 Florida St ACC
9 | 8 8 9 9 10 5 7 7 10 7 8 Baylor B12
10 | 9 11 8 13 11 11 12 9 14 9 9 Georgia Tech ACC
11 | 13 10 13 11 8 9 10 11 9 10 10 Mississippi SEC
12 |
--------------------------------------------------------------------------------
/inst/extdata/mtcars.csv:
--------------------------------------------------------------------------------
1 | "mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb"
2 | 21,6,160,110,3.9,2.62,16.46,0,1,4,4
3 | 21,6,160,110,3.9,2.875,17.02,0,1,4,4
4 | 22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
5 | 21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
6 | 18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
7 | 18.1,6,225,105,2.76,3.46,20.22,1,0,3,1
8 | 14.3,8,360,245,3.21,3.57,15.84,0,0,3,4
9 | 24.4,4,146.7,62,3.69,3.19,20,1,0,4,2
10 | 22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
11 | 19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4
12 | 17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
13 | 16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3
14 | 17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3
15 | 15.2,8,275.8,180,3.07,3.78,18,0,0,3,3
16 | 10.4,8,472,205,2.93,5.25,17.98,0,0,3,4
17 | 10.4,8,460,215,3,5.424,17.82,0,0,3,4
18 | 14.7,8,440,230,3.23,5.345,17.42,0,0,3,4
19 | 32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
20 | 30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2
21 | 33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1
22 | 21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1
23 | 15.5,8,318,150,2.76,3.52,16.87,0,0,3,2
24 | 15.2,8,304,150,3.15,3.435,17.3,0,0,3,2
25 | 13.3,8,350,245,3.73,3.84,15.41,0,0,3,4
26 | 19.2,8,400,175,3.08,3.845,17.05,0,0,3,2
27 | 27.3,4,79,66,4.08,1.935,18.9,1,1,4,1
28 | 26,4,120.3,91,4.43,2.14,16.7,0,1,5,2
29 | 30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
30 | 15.8,8,351,264,4.22,3.17,14.5,0,1,5,4
31 | 19.7,6,145,175,3.62,2.77,15.5,0,1,5,6
32 | 15,8,301,335,3.54,3.57,14.6,0,1,5,8
33 | 21.4,4,121,109,4.11,2.78,18.6,1,1,4,2
34 |
--------------------------------------------------------------------------------
/man/Tokenizers.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/tokenizer.R
3 | \name{Tokenizers}
4 | \alias{Tokenizers}
5 | \alias{tokenizer_delim}
6 | \alias{tokenizer_csv}
7 | \alias{tokenizer_tsv}
8 | \alias{tokenizer_line}
9 | \alias{tokenizer_log}
10 | \alias{tokenizer_fwf}
11 | \alias{tokenizer_ws}
12 | \title{Tokenizers.}
13 | \usage{
14 | tokenizer_delim(
15 | delim,
16 | quote = "\\"",
17 | na = "NA",
18 | quoted_na = TRUE,
19 | comment = "",
20 | trim_ws = TRUE,
21 | escape_double = TRUE,
22 | escape_backslash = FALSE,
23 | skip_empty_rows = TRUE
24 | )
25 |
26 | tokenizer_csv(
27 | na = "NA",
28 | quoted_na = TRUE,
29 | quote = "\\"",
30 | comment = "",
31 | trim_ws = TRUE,
32 | skip_empty_rows = TRUE
33 | )
34 |
35 | tokenizer_tsv(
36 | na = "NA",
37 | quoted_na = TRUE,
38 | quote = "\\"",
39 | comment = "",
40 | trim_ws = TRUE,
41 | skip_empty_rows = TRUE
42 | )
43 |
44 | tokenizer_line(na = character(), skip_empty_rows = TRUE)
45 |
46 | tokenizer_log(trim_ws)
47 |
48 | tokenizer_fwf(
49 | begin,
50 | end,
51 | na = "NA",
52 | comment = "",
53 | trim_ws = TRUE,
54 | skip_empty_rows = TRUE
55 | )
56 |
57 | tokenizer_ws(na = "NA", comment = "", skip_empty_rows = TRUE)
58 | }
59 | \arguments{
60 | \item{delim}{Single character used to separate fields within a record.}
61 |
62 | \item{quote}{Single character used to quote strings.}
63 |
64 | \item{na}{Character vector of strings to interpret as missing values. Set this
65 | option to \code{character()} to indicate no missing values.}
66 |
67 | \item{quoted_na}{Should missing values inside quotes be treated as missing
68 | values (the default) or strings.}
69 |
70 | \item{comment}{A string used to identify comments. Any text after the
71 | comment characters will be silently ignored.}
72 |
73 | \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from
74 | each field before parsing it?}
75 |
76 | \item{escape_double}{Does the file escape quotes by doubling them?
77 | i.e. If this option is \code{TRUE}, the value \verb{""""} represents
78 | a single quote, \verb{\\"}.}
79 |
80 | \item{escape_backslash}{Does the file use backslashes to escape special
81 | characters? This is more general than \code{escape_double} as backslashes
82 | can be used to escape the delimiter character, the quote character, or
83 | to add special characters like \verb{\\\\n}.}
84 |
85 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this
86 | option is \code{TRUE} then blank rows will not be represented at all. If it is
87 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.}
88 |
89 | \item{begin, end}{Begin and end offsets for each file. These are C++
90 | offsets so the first column is column zero, and the ranges are
91 | [begin, end) (i.e inclusive-exclusive).}
92 | }
93 | \value{
94 | A tokenizer object
95 | }
96 | \description{
97 | Explicitly create tokenizer objects. Usually you will not call these
98 | functions, but will instead use one of the user-friendly wrappers like
99 | \code{\link[readr:read_delim]{readr::read_csv()}}.
100 | }
101 | \examples{
102 | tokenizer_csv()
103 | tokenizer_delim(",")
104 | }
105 | \keyword{internal}
106 |
--------------------------------------------------------------------------------
/man/clipboard.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/source.R
3 | \name{clipboard}
4 | \alias{clipboard}
5 | \title{Returns values from the clipboard}
6 | \usage{
7 | clipboard()
8 | }
9 | \description{
10 | This is useful in the \code{\link[readr:read_delim]{readr::read_delim()}} functions to read from the clipboard.
11 | }
12 | \examples{
13 | \dontrun{
14 | clipboard()
15 | }
16 | }
17 | \seealso{
18 | readr::read_delim
19 | }
20 |
--------------------------------------------------------------------------------
/man/datasource.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/source.R
3 | \name{datasource}
4 | \alias{datasource}
5 | \title{Create a source object.}
6 | \usage{
7 | datasource(
8 | file,
9 | skip = 0,
10 | skip_empty_rows = FALSE,
11 | comment = "",
12 | skip_quote = TRUE
13 | )
14 | }
15 | \arguments{
16 | \item{file}{Either a path to a file, a connection, or literal data
17 | (either a single string or a raw vector).
18 |
19 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
20 | be automatically uncompressed. Files starting with \verb{http://},
21 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically
22 | downloaded. Remote gz files can also be automatically downloaded and
23 | decompressed.
24 |
25 | Literal data is most useful for examples and tests. It must contain at
26 | least one new line to be recognised as data (instead of a path) or be a
27 | vector of greater than length 1.
28 |
29 | Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.}
30 |
31 | \item{skip}{Number of lines to skip before reading data.}
32 | }
33 | \value{
34 | A source object
35 | }
36 | \description{
37 | Create a source object.
38 | }
39 | \examples{
40 | # Literal csv
41 | datasource("a,b,c\n1,2,3")
42 | datasource(charToRaw("a,b,c\n1,2,3"))
43 |
44 | # Strings
45 | datasource(meltr_example("mtcars.csv"))
46 | \dontrun{
47 | datasource("https://github.com/tidyverse/readr/raw/master/inst/extdata/mtcars.csv")
48 | }
49 |
50 | # Connection
51 | con <- rawConnection(charToRaw("abc\n123"))
52 | datasource(con)
53 | close(con)
54 | }
55 | \keyword{internal}
56 |
--------------------------------------------------------------------------------
/man/date_names.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/date-symbols.R
3 | \name{date_names}
4 | \alias{date_names}
5 | \alias{date_names_lang}
6 | \alias{date_names_langs}
7 | \title{Create or retrieve date names}
8 | \usage{
9 | date_names(mon, mon_ab = mon, day, day_ab = day, am_pm = c("AM", "PM"))
10 |
11 | date_names_lang(language)
12 |
13 | date_names_langs()
14 | }
15 | \arguments{
16 | \item{mon, mon_ab}{Full and abbreviated month names.}
17 |
18 | \item{day, day_ab}{Full and abbreviated week day names. Starts with Sunday.}
19 |
20 | \item{am_pm}{Names used for AM and PM.}
21 |
22 | \item{language}{A BCP 47 locale, made up of a language and a region,
23 | e.g. \code{"en_US"} for American English. See \code{date_names_langs()}
24 | for a complete list of available locales.}
25 | }
26 | \value{
27 | A date names object
28 | }
29 | \description{
30 | When parsing dates, you often need to know how days of the week and
31 | months are represented as text. This pair of functions allows you to either
32 | create your own, or retrieve from a standard list. The standard list is
33 | derived from ICU (\url{https://icu.unicode.org/}) via the stringi package.
34 | }
35 | \examples{
36 | date_names(mon = LETTERS[1:12], day = letters[1:7])
37 | date_names_lang("en")
38 | date_names_lang("ko")
39 | date_names_lang("fr")
40 | }
41 |
--------------------------------------------------------------------------------
/man/locale.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/locale.R
3 | \name{locale}
4 | \alias{locale}
5 | \alias{default_locale}
6 | \title{Create locales}
7 | \usage{
8 | locale(
9 | date_names = "en",
10 | date_format = "\%AD",
11 | time_format = "\%AT",
12 | decimal_mark = ".",
13 | grouping_mark = ",",
14 | tz = "UTC",
15 | encoding = "UTF-8"
16 | )
17 |
18 | default_locale()
19 | }
20 | \arguments{
21 | \item{date_names}{Character representations of day and month names. Either
22 | the language code as string (passed on to \code{\link[=date_names_lang]{date_names_lang()}})
23 | or an object created by \code{\link[=date_names]{date_names()}}.}
24 |
25 | \item{date_format, time_format}{Default date and time formats.}
26 |
27 | \item{decimal_mark, grouping_mark}{Symbols used to indicate the decimal
28 | place, and to chunk larger numbers. Decimal mark can only be \verb{,} or
29 | \code{.}.}
30 |
31 | \item{tz}{Default tz. This is used both for input (if the time zone isn't
32 | present in individual strings), and for output (to control the default
33 | display). The default is to use "UTC", a time zone that does not use
34 | daylight savings time (DST) and hence is typically most useful for data.
35 | The absence of time zones makes it approximately 50x faster to generate
36 | UTC times than any other time zone.
37 |
38 | Use \code{""} to use the system default time zone, but beware that this
39 | will not be reproducible across systems.
40 |
41 | For a complete list of possible time zones, see \code{\link[=OlsonNames]{OlsonNames()}}.
42 | Americans, note that "EST" is a Canadian time zone that does not have
43 | DST. It is \emph{not} Eastern Standard Time. It's better to use
44 | "US/Eastern", "US/Central" etc.}
45 |
46 | \item{encoding}{Default encoding. This only affects how the file is
47 | read - meltr always converts the output to UTF-8.}
48 | }
49 | \value{
50 | A locale object
51 | }
52 | \description{
53 | A locale object tries to capture all the defaults that can vary between
54 | countries. You set the locale once, and the details are automatically
55 | passed on down to the columns parsers. The defaults have been chosen to
56 | match R (i.e. US English) as closely as possible. See
57 | \code{vignette("locales")} for more details.
58 | }
59 | \examples{
60 | locale()
61 | locale("fr")
62 |
63 | # South American locale
64 | locale("es", decimal_mark = ",")
65 | }
66 |
--------------------------------------------------------------------------------
/man/melt_delim.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/melt_delim.R
3 | \name{melt_delim}
4 | \alias{melt_delim}
5 | \alias{melt_csv}
6 | \alias{melt_csv2}
7 | \alias{melt_tsv}
8 | \title{Return melted data for each token in a delimited file (including csv & tsv)}
9 | \usage{
10 | melt_delim(
11 | file,
12 | delim,
13 | quote = "\\"",
14 | escape_backslash = FALSE,
15 | escape_double = TRUE,
16 | locale = default_locale(),
17 | na = c("", "NA"),
18 | quoted_na = TRUE,
19 | comment = "",
20 | trim_ws = FALSE,
21 | skip = 0,
22 | n_max = Inf,
23 | progress = show_progress(),
24 | skip_empty_rows = FALSE
25 | )
26 |
27 | melt_csv(
28 | file,
29 | locale = default_locale(),
30 | na = c("", "NA"),
31 | quoted_na = TRUE,
32 | quote = "\\"",
33 | comment = "",
34 | trim_ws = TRUE,
35 | skip = 0,
36 | n_max = Inf,
37 | progress = show_progress(),
38 | skip_empty_rows = FALSE
39 | )
40 |
41 | melt_csv2(
42 | file,
43 | locale = default_locale(),
44 | na = c("", "NA"),
45 | quoted_na = TRUE,
46 | quote = "\\"",
47 | comment = "",
48 | trim_ws = TRUE,
49 | skip = 0,
50 | n_max = Inf,
51 | progress = show_progress(),
52 | skip_empty_rows = FALSE
53 | )
54 |
55 | melt_tsv(
56 | file,
57 | locale = default_locale(),
58 | na = c("", "NA"),
59 | quoted_na = TRUE,
60 | quote = "\\"",
61 | comment = "",
62 | trim_ws = TRUE,
63 | skip = 0,
64 | n_max = Inf,
65 | progress = show_progress(),
66 | skip_empty_rows = FALSE
67 | )
68 | }
69 | \arguments{
70 | \item{file}{Either a path to a file, a connection, or literal data
71 | (either a single string or a raw vector).
72 |
73 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
74 | be automatically uncompressed. Files starting with \verb{http://},
75 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically
76 | downloaded. Remote gz files can also be automatically downloaded and
77 | decompressed.
78 |
79 | Literal data is most useful for examples and tests. To be recognised as
80 | literal data, the input must be either wrapped with \code{I()}, be a string
81 | containing at least one new line, or be a vector containing at least one
82 | string with a new line.
83 |
84 | Using a value of \code{\link[readr:clipboard]{clipboard()}} will read from the system clipboard.}
85 |
86 | \item{delim}{Single character used to separate fields within a record.}
87 |
88 | \item{quote}{Single character used to quote strings.}
89 |
90 | \item{escape_backslash}{Does the file use backslashes to escape special
91 | characters? This is more general than \code{escape_double} as backslashes
92 | can be used to escape the delimiter character, the quote character, or
93 | to add special characters like \verb{\\\\n}.}
94 |
95 | \item{escape_double}{Does the file escape quotes by doubling them?
96 | i.e. If this option is \code{TRUE}, the value \verb{""""} represents
97 | a single quote, \verb{\\"}.}
98 |
99 | \item{locale}{The locale controls defaults that vary from place to place.
100 | The default locale is US-centric (like R), but you can use
101 | \code{\link[readr:locale]{locale()}} to create your own locale that controls things like
102 | the default time zone, encoding, decimal mark, big mark, and day/month
103 | names.}
104 |
105 | \item{na}{Character vector of strings to interpret as missing values. Set this
106 | option to \code{character()} to indicate no missing values.}
107 |
108 | \item{quoted_na}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Should missing values
109 | inside quotes be treated as missing values (the default) or strings. This
110 | parameter is soft deprecated as of readr 2.0.0.}
111 |
112 | \item{comment}{A string used to identify comments. Any text after the
113 | comment characters will be silently ignored.}
114 |
115 | \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from
116 | each field before parsing it?}
117 |
118 | \item{skip}{Number of lines to skip before reading data. If \code{comment} is
119 | supplied any commented lines are ignored \emph{after} skipping.}
120 |
121 | \item{n_max}{Maximum number of lines to read.}
122 |
123 | \item{progress}{Display a progress bar? By default it will only display
124 | in an interactive session and not while knitting a document. The automatic
125 | progress bar can be disabled by setting option \code{readr.show_progress} to
126 | \code{FALSE}.}
127 |
128 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this
129 | option is \code{TRUE} then blank rows will not be represented at all. If it is
130 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.}
131 | }
132 | \value{
133 | A \code{\link[=tibble]{tibble()}} of four columns:
134 | \itemize{
135 | \item \code{row}, the row that the token comes from in the original file
136 | \item \code{col}, the column that the token comes from in the original file
137 | \item \code{data_type}, the data type of the token, e.g. \code{"integer"}, \code{"character"},
138 | \code{"date"}, guessed in a similar way to the \code{guess_parser()} function.
139 | \item \code{value}, the token itself as a character string, unchanged from its
140 | representation in the original file.
141 | }
142 |
143 | If there are parsing problems, a warning tells you
144 | how many, and you can retrieve the details with \code{\link[=problems]{problems()}}.
145 | }
146 | \description{
147 | For certain non-rectangular data formats, it can be useful to parse the data
148 | into a melted format where each row represents a single token.
149 | }
150 | \details{
151 | \code{melt_csv()} and \code{melt_tsv()} are special cases of the general
152 | \code{melt_delim()}. They're useful for reading the most common types of
153 | flat file data, comma separated values and tab separated values,
154 | respectively. \code{melt_csv2()} uses \verb{;} for the field separator and \verb{,} for the
155 | decimal point. This is common in some European countries.
156 | }
157 | \examples{
158 | # Input sources -------------------------------------------------------------
159 | # Read from a path
160 | melt_csv(meltr_example("mtcars.csv"))
161 | \dontrun{
162 | melt_csv("https://github.com/tidyverse/readr/raw/master/inst/extdata/mtcars.csv")
163 | }
164 |
165 | # Or directly from a string (must contain a newline)
166 | melt_csv("x,y\n1,2\n3,4")
167 |
168 | # To import empty cells as 'empty' rather than `NA`
169 | melt_csv("x,y\n,NA,\"\",''", na = "NA")
170 |
171 | # File types ----------------------------------------------------------------
172 | melt_csv("a,b\n1.0,2.0")
173 | melt_csv2("a;b\n1,0;2,0")
174 | melt_tsv("a\tb\n1.0\t2.0")
175 | melt_delim("a|b\n1.0|2.0", delim = "|")
176 | }
177 | \seealso{
178 | \code{\link[readr:read_delim]{readr::read_delim()}} for the conventional way to read rectangular data
179 | from delimited files.
180 | }
181 |
--------------------------------------------------------------------------------
/man/melt_delim_chunked.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/melt_delim_chunked.R
3 | \name{melt_delim_chunked}
4 | \alias{melt_delim_chunked}
5 | \alias{melt_csv_chunked}
6 | \alias{melt_csv2_chunked}
7 | \alias{melt_tsv_chunked}
8 | \title{Melt a delimited file by chunks}
9 | \usage{
10 | melt_delim_chunked(
11 | file,
12 | callback,
13 | chunk_size = 10000,
14 | delim,
15 | quote = "\\"",
16 | escape_backslash = FALSE,
17 | escape_double = TRUE,
18 | locale = default_locale(),
19 | na = c("", "NA"),
20 | quoted_na = TRUE,
21 | comment = "",
22 | trim_ws = FALSE,
23 | skip = 0,
24 | progress = show_progress(),
25 | skip_empty_rows = FALSE
26 | )
27 |
28 | melt_csv_chunked(
29 | file,
30 | callback,
31 | chunk_size = 10000,
32 | locale = default_locale(),
33 | na = c("", "NA"),
34 | quoted_na = TRUE,
35 | quote = "\\"",
36 | comment = "",
37 | trim_ws = TRUE,
38 | skip = 0,
39 | progress = show_progress(),
40 | skip_empty_rows = FALSE
41 | )
42 |
43 | melt_csv2_chunked(
44 | file,
45 | callback,
46 | chunk_size = 10000,
47 | locale = default_locale(),
48 | na = c("", "NA"),
49 | quoted_na = TRUE,
50 | quote = "\\"",
51 | comment = "",
52 | trim_ws = TRUE,
53 | skip = 0,
54 | progress = show_progress(),
55 | skip_empty_rows = FALSE
56 | )
57 |
58 | melt_tsv_chunked(
59 | file,
60 | callback,
61 | chunk_size = 10000,
62 | locale = default_locale(),
63 | na = c("", "NA"),
64 | quoted_na = TRUE,
65 | quote = "\\"",
66 | comment = "",
67 | trim_ws = TRUE,
68 | skip = 0,
69 | progress = show_progress(),
70 | skip_empty_rows = FALSE
71 | )
72 | }
73 | \arguments{
74 | \item{file}{Either a path to a file, a connection, or literal data
75 | (either a single string or a raw vector).
76 |
77 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
78 | be automatically uncompressed. Files starting with \verb{http://},
79 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically
80 | downloaded. Remote gz files can also be automatically downloaded and
81 | decompressed.
82 |
83 | Literal data is most useful for examples and tests. To be recognised as
84 | literal data, the input must be either wrapped with \code{I()}, be a string
85 | containing at least one new line, or be a vector containing at least one
86 | string with a new line.
87 |
88 | Using a value of \code{\link[readr:clipboard]{clipboard()}} will read from the system clipboard.}
89 |
90 | \item{callback}{A callback function to call on each chunk}
91 |
92 | \item{chunk_size}{The number of rows to include in each chunk}
93 |
94 | \item{delim}{Single character used to separate fields within a record.}
95 |
96 | \item{quote}{Single character used to quote strings.}
97 |
98 | \item{escape_backslash}{Does the file use backslashes to escape special
99 | characters? This is more general than \code{escape_double} as backslashes
100 | can be used to escape the delimiter character, the quote character, or
101 | to add special characters like \verb{\\\\n}.}
102 |
103 | \item{escape_double}{Does the file escape quotes by doubling them?
104 | i.e. If this option is \code{TRUE}, the value \verb{""""} represents
105 | a single quote, \verb{\\"}.}
106 |
107 | \item{locale}{The locale controls defaults that vary from place to place.
108 | The default locale is US-centric (like R), but you can use
109 | \code{\link[readr:locale]{locale()}} to create your own locale that controls things like
110 | the default time zone, encoding, decimal mark, big mark, and day/month
111 | names.}
112 |
113 | \item{na}{Character vector of strings to interpret as missing values. Set this
114 | option to \code{character()} to indicate no missing values.}
115 |
116 | \item{quoted_na}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Should missing values
117 | inside quotes be treated as missing values (the default) or strings. This
118 | parameter is soft deprecated as of readr 2.0.0.}
119 |
120 | \item{comment}{A string used to identify comments. Any text after the
121 | comment characters will be silently ignored.}
122 |
123 | \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from
124 | each field before parsing it?}
125 |
126 | \item{skip}{Number of lines to skip before reading data. If \code{comment} is
127 | supplied any commented lines are ignored \emph{after} skipping.}
128 |
129 | \item{progress}{Display a progress bar? By default it will only display
130 | in an interactive session and not while knitting a document. The automatic
131 | progress bar can be disabled by setting option \code{readr.show_progress} to
132 | \code{FALSE}.}
133 |
134 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this
135 | option is \code{TRUE} then blank rows will not be represented at all. If it is
136 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.}
137 | }
138 | \value{
139 | A \code{\link[=tibble]{tibble()}} of four columns:
140 | \itemize{
141 | \item \code{row}, the row that the token comes from in the original file
142 | \item \code{col}, the column that the token comes from in the original file
143 | \item \code{data_type}, the data type of the token, e.g. \code{"integer"}, \code{"character"},
144 | \code{"date"}, guessed in a similar way to the \code{guess_parser()} function.
145 | \item \code{value}, the token itself as a character string, unchanged from its
146 | representation in the original file.
147 | }
148 |
149 | If there are parsing problems, a warning tells you
150 | how many, and you can retrieve the details with \code{\link[=problems]{problems()}}.
151 | }
152 | \description{
153 | For certain non-rectangular data formats, it can be useful to parse the data
154 | into a melted format where each row represents a single token.
155 | }
156 | \details{
157 | \code{melt_delim_chunked()} and the specialisations \code{melt_csv_chunked()},
158 | \code{melt_csv2_chunked()} and \code{melt_tsv_chunked()} read files by a chunk of rows
159 | at a time, executing a given function on one chunk before reading the next.
160 | }
161 | \examples{
162 | # Cars with 3 gears
163 | f <- function(x, pos) subset(x, data_type == "integer")
164 | melt_csv_chunked(meltr_example("mtcars.csv"), DataFrameCallback$new(f), chunk_size = 5)
165 | }
166 | \seealso{
167 | Other chunked:
168 | \code{\link{callback}}
169 | }
170 | \concept{chunked}
171 | \keyword{internal}
172 |
--------------------------------------------------------------------------------
/man/melt_fwf.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/melt_fwf.R
3 | \name{melt_fwf}
4 | \alias{melt_fwf}
5 | \alias{fwf_empty}
6 | \alias{fwf_widths}
7 | \alias{fwf_positions}
8 | \alias{fwf_cols}
9 | \title{Return melted data for each token in a fixed width file}
10 | \usage{
11 | melt_fwf(
12 | file,
13 | col_positions,
14 | locale = default_locale(),
15 | na = c("", "NA"),
16 | comment = "",
17 | trim_ws = TRUE,
18 | skip = 0,
19 | n_max = Inf,
20 | progress = show_progress(),
21 | skip_empty_rows = FALSE
22 | )
23 |
24 | fwf_empty(
25 | file,
26 | skip = 0,
27 | skip_empty_rows = FALSE,
28 | col_names = NULL,
29 | comment = "",
30 | n = 100L
31 | )
32 |
33 | fwf_widths(widths, col_names = NULL)
34 |
35 | fwf_positions(start, end = NULL, col_names = NULL)
36 |
37 | fwf_cols(...)
38 | }
39 | \arguments{
40 | \item{file}{Either a path to a file, a connection, or literal data
41 | (either a single string or a raw vector).
42 |
43 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
44 | be automatically uncompressed. Files starting with \verb{http://},
45 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically
46 | downloaded. Remote gz files can also be automatically downloaded and
47 | decompressed.
48 |
49 | Literal data is most useful for examples and tests. To be recognised as
50 | literal data, the input must be either wrapped with \code{I()}, be a string
51 | containing at least one new line, or be a vector containing at least one
52 | string with a new line.
53 |
54 | Using a value of \code{\link[readr:clipboard]{clipboard()}} will read from the system clipboard.}
55 |
56 | \item{col_positions}{Column positions, as created by \code{\link[=fwf_empty]{fwf_empty()}},
57 | \code{\link[=fwf_widths]{fwf_widths()}} or \code{\link[=fwf_positions]{fwf_positions()}}. To read in only selected fields,
58 | use \code{\link[=fwf_positions]{fwf_positions()}}. If the width of the last column is variable (a
59 | ragged fwf file), supply the last end position as NA.}
60 |
61 | \item{locale}{The locale controls defaults that vary from place to place.
62 | The default locale is US-centric (like R), but you can use
63 | \code{\link[readr:locale]{locale()}} to create your own locale that controls things like
64 | the default time zone, encoding, decimal mark, big mark, and day/month
65 | names.}
66 |
67 | \item{na}{Character vector of strings to interpret as missing values. Set this
68 | option to \code{character()} to indicate no missing values.}
69 |
70 | \item{comment}{A string used to identify comments. Any text after the
71 | comment characters will be silently ignored.}
72 |
73 | \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from
74 | each field before parsing it?}
75 |
76 | \item{skip}{Number of lines to skip before reading data.}
77 |
78 | \item{n_max}{Maximum number of lines to read.}
79 |
80 | \item{progress}{Display a progress bar? By default it will only display
81 | in an interactive session and not while knitting a document. The automatic
82 | progress bar can be disabled by setting option \code{readr.show_progress} to
83 | \code{FALSE}.}
84 |
85 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this
86 | option is \code{TRUE} then blank rows will not be represented at all. If it is
87 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.}
88 |
89 | \item{col_names}{Either NULL, or a character vector column names.}
90 |
91 | \item{n}{Number of lines the tokenizer will read to determine file structure. By default
92 | it is set to 100.}
93 |
94 | \item{widths}{Width of each field. Use NA as width of last field when
95 | reading a ragged fwf file.}
96 |
97 | \item{start, end}{Starting and ending (inclusive) positions of each field.
98 | Use NA as last end field when reading a ragged fwf file.}
99 |
100 | \item{...}{If the first element is a data frame,
101 | then it must have all numeric columns and either one or two rows.
102 | The column names are the variable names. The column values are the
103 | variable widths if a length one vector, and if length two, variable start and end
104 | positions. The elements of \code{...} are used to construct a data frame
105 | with one or two rows as above.}
106 | }
107 | \value{
108 | A \code{\link[=tibble]{tibble()}} of four columns:
109 | \itemize{
110 | \item \code{row}, the row that the token comes from in the original file
111 | \item \code{col}, the column that the token comes from in the original file
112 | \item \code{data_type}, the data type of the token, e.g. \code{"integer"}, \code{"character"},
113 | \code{"date"}, guessed in a similar way to the \code{guess_parser()} function.
114 | \item \code{value}, the token itself as a character string, unchanged from its
115 | representation in the original file.
116 | }
117 |
118 | If there are parsing problems, a warning tells you
119 | how many, and you can retrieve the details with \code{\link[=problems]{problems()}}.
120 | }
121 | \description{
122 | For certain non-rectangular data formats, it can be useful to parse the data
123 | into a melted format where each row represents a single token.
124 | }
125 | \details{
126 | \code{melt_fwf()} parses each token of a fixed width file into a single row, but
127 | it still requires that each field is in the same position in every row of the
128 | source file.
129 | }
130 | \examples{
131 | fwf_sample <- meltr_example("fwf-sample.txt")
132 | writeLines(readLines(fwf_sample))
133 |
134 | # You can specify column positions in several ways:
135 | # 1. Guess based on position of empty columns
136 | melt_fwf(fwf_sample, fwf_empty(fwf_sample, col_names = c("first", "last", "state", "ssn")))
137 | # 2. A vector of field widths
138 | melt_fwf(fwf_sample, fwf_widths(c(20, 10, 12), c("name", "state", "ssn")))
139 | # 3. Paired vectors of start and end positions
140 | melt_fwf(fwf_sample, fwf_positions(c(1, 30), c(10, 42), c("name", "ssn")))
141 | # 4. Named arguments with start and end positions
142 | melt_fwf(fwf_sample, fwf_cols(name = c(1, 10), ssn = c(30, 42)))
143 | # 5. Named arguments with column widths
144 | melt_fwf(fwf_sample, fwf_cols(name = 20, state = 10, ssn = 12))
145 | }
146 | \seealso{
147 | \code{\link[=melt_table]{melt_table()}} to melt fixed width files where each
148 | column is separated by whitespace, and \code{\link[readr:read_fwf]{readr::read_fwf()}} for the conventional
149 | way to read rectangular data from fixed width files.
150 | }
151 |
--------------------------------------------------------------------------------
/man/melt_table.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/melt_table.R
3 | \name{melt_table}
4 | \alias{melt_table}
5 | \alias{melt_table2}
6 | \title{Return melted data for each token in a whitespace-separated file}
7 | \usage{
8 | melt_table(
9 | file,
10 | locale = default_locale(),
11 | na = "NA",
12 | skip = 0,
13 | n_max = Inf,
14 | guess_max = min(n_max, 1000),
15 | progress = show_progress(),
16 | comment = "",
17 | skip_empty_rows = FALSE
18 | )
19 |
20 | melt_table2(
21 | file,
22 | locale = default_locale(),
23 | na = "NA",
24 | skip = 0,
25 | n_max = Inf,
26 | progress = show_progress(),
27 | comment = "",
28 | skip_empty_rows = FALSE
29 | )
30 | }
31 | \arguments{
32 | \item{file}{Either a path to a file, a connection, or literal data
33 | (either a single string or a raw vector).
34 |
35 | Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
36 | be automatically uncompressed. Files starting with \verb{http://},
37 | \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically
38 | downloaded. Remote gz files can also be automatically downloaded and
39 | decompressed.
40 |
41 | Literal data is most useful for examples and tests. To be recognised as
42 | literal data, the input must be either wrapped with \code{I()}, be a string
43 | containing at least one new line, or be a vector containing at least one
44 | string with a new line.
45 |
46 | Using a value of \code{\link[readr:clipboard]{clipboard()}} will read from the system clipboard.}
47 |
48 | \item{locale}{The locale controls defaults that vary from place to place.
49 | The default locale is US-centric (like R), but you can use
50 | \code{\link[readr:locale]{locale()}} to create your own locale that controls things like
51 | the default time zone, encoding, decimal mark, big mark, and day/month
52 | names.}
53 |
54 | \item{na}{Character vector of strings to interpret as missing values. Set this
55 | option to \code{character()} to indicate no missing values.}
56 |
57 | \item{skip}{Number of lines to skip before reading data.}
58 |
59 | \item{n_max}{Maximum number of lines to read.}
60 |
61 | \item{guess_max}{Maximum number of lines to use for guessing column types.
62 | See \code{vignette("column-types", package = "readr")} for more details.}
63 |
64 | \item{progress}{Display a progress bar? By default it will only display
65 | in an interactive session and not while knitting a document. The automatic
66 | progress bar can be disabled by setting option \code{readr.show_progress} to
67 | \code{FALSE}.}
68 |
69 | \item{comment}{A string used to identify comments. Any text after the
70 | comment characters will be silently ignored.}
71 |
72 | \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this
73 | option is \code{TRUE} then blank rows will not be represented at all. If it is
74 | \code{FALSE} then they will be represented by \code{NA} values in all the columns.}
75 | }
76 | \value{
77 | A \code{\link[=tibble]{tibble()}} of four columns:
78 | \itemize{
79 | \item \code{row}, the row that the token comes from in the original file
80 | \item \code{col}, the column that the token comes from in the original file
81 | \item \code{data_type}, the data type of the token, e.g. \code{"integer"}, \code{"character"},
82 | \code{"date"}, guessed in a similar way to the \code{guess_parser()} function.
83 | \item \code{value}, the token itself as a character string, unchanged from its
84 | representation in the original file.
85 | }
86 |
87 | If there are parsing problems, a warning tells you
88 | how many, and you can retrieve the details with \code{\link[=problems]{problems()}}.
89 | }
90 | \description{
91 | For certain non-rectangular data formats, it can be useful to parse the data
92 | into a melted format where each row represents a single token.
93 |
94 | \code{melt_table()} and \code{melt_table2()} are designed to read the type of textual
95 | data where each column is separated by one (or more) columns of space.
96 |
97 | \code{melt_table2()} allows any number of whitespace characters between columns,
98 | and the lines can be of different lengths.
99 |
100 | \code{melt_table()} is more strict, each line must be the same length,
101 | and each field is in the same position in every line. It first finds empty
102 | columns and then parses like a fixed width file.
103 | }
104 | \examples{
105 | # One corner from http://www.masseyratings.com/cf/compare.htm
106 | massey <- meltr_example("massey-rating.txt")
107 | cat(readLines(massey))
108 | melt_table(massey)
109 |
110 | # Sample of 1978 fuel economy data from
111 | # http://www.fueleconomy.gov/feg/epadata/78data.zip
112 | epa <- meltr_example("epa78.txt")
113 | writeLines(readLines(epa))
114 | melt_table(epa)
115 | }
116 | \seealso{
117 | \code{\link[=melt_fwf]{melt_fwf()}} to melt fixed width files where each column
118 | is not separated by whitespace. \code{melt_fwf()} is also useful for reading
119 | tabular data with non-standard formatting. \code{\link[readr:read_table]{readr::read_table()}} is the
120 | conventional way to read tabular data from whitespace-separated files.
121 | }
122 |
--------------------------------------------------------------------------------
/man/meltr_example.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/example.R
3 | \name{meltr_example}
4 | \alias{meltr_example}
5 | \title{Get path to meltr example}
6 | \usage{
7 | meltr_example(file = NULL)
8 | }
9 | \arguments{
10 | \item{file}{Name of file. If \code{NULL}, the example files will be listed.}
11 | }
12 | \value{
13 | A file path or a vector of file names
14 | }
15 | \description{
16 | meltr comes bundled with a number of sample files in its \code{inst/extdata}
17 | directory. This function makes them easy to access.
18 | }
19 | \examples{
20 | meltr_example()
21 | meltr_example("mtcars.csv")
22 | }
23 |
--------------------------------------------------------------------------------
/man/problems.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/problems.R
3 | \name{problems}
4 | \alias{problems}
5 | \alias{stop_for_problems}
6 | \title{Retrieve parsing problems}
7 | \usage{
8 | problems(x = .Last.value)
9 |
10 | stop_for_problems(x)
11 | }
12 | \arguments{
13 | \item{x}{A data frame (from \verb{read_*()}) or a vector
14 | (from \verb{parse_*()}).}
15 | }
16 | \value{
17 | A data frame with one row for each problem and four columns:
18 | \item{row,col}{Row and column of problem}
19 | \item{expected}{What readr expected to find}
20 | \item{actual}{What it actually got}
21 | }
22 | \description{
23 | Readr functions will only throw an error if parsing fails in an unrecoverable
24 | way. However, there are lots of potential problems that you might want to
25 | know about - these are stored in the \code{problems} attribute of the
26 | output, which you can easily access with this function.
27 | \code{stop_for_problems()} will throw an error if there are any parsing
28 | problems: this is useful for automated scripts where you want to throw
29 | an error as soon as you encounter a problem.
30 | }
31 | \examples{
32 | if (requireNamespace("readr")) {
33 | x <- readr::parse_integer(c("1X", "blah", "3"))
34 | problems(x)
35 |
36 | y <- readr::parse_integer(c("1", "2", "3"))
37 | problems(y)
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/man/show_progress.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils.R
3 | \name{show_progress}
4 | \alias{show_progress}
5 | \title{Determine whether progress bars should be shown}
6 | \usage{
7 | show_progress()
8 | }
9 | \value{
10 | A logical value
11 | }
12 | \description{
13 | Progress bars are shown \emph{unless} one of the following is \code{TRUE}
14 | \itemize{
15 | \item The bar is explicitly disabled by setting \code{options(readr.show_progress = FALSE)}
16 | \item The code is run in a non-interactive session (\code{interactive()} is \code{FALSE}).
17 | \item The code is run in an RStudio notebook chunk.
18 | \item The code is run by knitr / rmarkdown.
19 | }
20 | }
21 | \examples{
22 | show_progress()
23 | }
24 |
--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 |
--------------------------------------------------------------------------------
/src/Collector.cpp:
--------------------------------------------------------------------------------
1 | #include "cpp11/list.hpp"
2 |
3 | #include "Collector.h"
4 | #include "LocaleInfo.h"
5 | #include "QiParsers.h"
6 | #include "utils.h"
7 |
8 | CollectorPtr Collector::create(const cpp11::list& spec, LocaleInfo* pLocale) {
9 | std::string subclass(cpp11::as_cpp(spec.attr("class"))[0]);
10 |
11 |
12 | if (subclass == "collector_double") {
13 | return CollectorPtr(new CollectorDouble(pLocale->decimalMark_));
14 | }
15 | if (subclass == "collector_character") {
16 | return CollectorPtr(new CollectorCharacter(&pLocale->encoder_));
17 | }
18 |
19 | cpp11::stop("Unsupported column type '%s'", subclass.c_str());
20 | return CollectorPtr(new CollectorSkip());
21 | }
22 |
23 | std::vector
24 | collectorsCreate(const cpp11::list& specs, LocaleInfo* pLocale) {
25 | std::vector collectors;
26 | for (auto spec : specs) {
27 | CollectorPtr col(Collector::create(SEXP(spec), pLocale));
28 | collectors.push_back(col);
29 | }
30 |
31 | return collectors;
32 | }
33 |
34 | // Implementations ------------------------------------------------------------
35 |
36 | void CollectorCharacter::setValue(int i, const Token& t) {
37 | switch (t.type()) {
38 | case TOKEN_STRING: {
39 | std::string buffer;
40 | SourceIterators string = t.getString(&buffer);
41 |
42 | if (t.hasNull()) {
43 | warn(t.row(), t.col(), "", "embedded null");
44 | }
45 |
46 | SET_STRING_ELT(
47 | column_,
48 | i,
49 | pEncoder_->makeSEXP(string.first, string.second, t.hasNull()));
50 | break;
51 | };
52 | case TOKEN_MISSING:
53 | SET_STRING_ELT(column_, i, NA_STRING);
54 | break;
55 | case TOKEN_EMPTY:
56 | SET_STRING_ELT(column_, i, Rf_mkCharCE("", CE_UTF8));
57 | break;
58 | case TOKEN_EOF:
59 | cpp11::stop("Invalid token");
60 | }
61 | }
62 |
63 | void CollectorCharacter::setValue(int i, const std::string& s) {
64 | SET_STRING_ELT(column_, i, Rf_mkCharCE(s.c_str(), CE_UTF8));
65 | }
66 |
67 | void CollectorDouble::setValue(int i, size_t st) { REAL(column_)[i] = st; }
68 |
--------------------------------------------------------------------------------
/src/Collector.h:
--------------------------------------------------------------------------------
1 | #ifndef MELTR_COLLECTOR_H_
2 | #define MELTR_COLLECTOR_H_
3 |
4 | #include "cpp11/doubles.hpp"
5 | #include "cpp11/integers.hpp"
6 | #include "cpp11/list.hpp"
7 | #include "cpp11/logicals.hpp"
8 | #include "cpp11/strings.hpp"
9 |
10 | #include "DateTimeParser.h"
11 | #include "Iconv.h"
12 | #include "LocaleInfo.h"
13 | #include "Token.h"
14 | #include "Warnings.h"
15 | #include