├── air.toml
├── .github
│   ├── .gitignore
│   ├── CODEOWNERS
│   ├── workflows
│   │   ├── pkgdown.yaml
│   │   ├── test-coverage.yaml
│   │   ├── R-CMD-check.yaml
│   │   ├── rhub.yaml
│   │   └── pr-commands.yaml
│   └── CODE_OF_CONDUCT.md
├── LICENSE
├── .gitignore
├── .vscode
│   ├── extensions.json
│   └── settings.json
├── Makefile
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── test-expr_as_xml.R
│       └── test-xml_parse_data.R
├── R
│   ├── xmlparsedata-package.R
│   ├── utils.R
│   ├── expr_as_xml.R
│   └── package.R
├── NAMESPACE
├── .Rbuildignore
├── codecov.yml
├── man
│   ├── expr_as_xml.Rd
│   ├── xmlparsedata.Rd
│   ├── xml_parse_token_map.Rd
│   ├── xmlparsedata-package.Rd
│   └── xml_parse_data.Rd
├── _pkgdown.yml
├── DESCRIPTION
├── LICENSE.md
├── NEWS.md
├── README.Rmd
└── README.md
/air.toml:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2025
2 | COPYRIGHT HOLDER: xmlparsedata authors
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | /revdep
5 | docs
6 |
--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 | "recommendations": [
3 | "Posit.air-vscode"
4 | ]
5 | }
6 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 |
2 | all: README.md
3 |
4 | README.md: README.Rmd
5 | Rscript -e "library(knitr); knit('$<', output = '$@', quiet = TRUE)"
6 |
--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # CODEOWNERS for xmlparsedata
2 | # https://www.tidyverse.org/development/understudies
3 | * @gaborcsardi @jimhester
4 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "[r]": {
3 | "editor.formatOnSave": true,
4 | "editor.defaultFormatter": "Posit.air-vscode"
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(xmlparsedata)
3 | # Run the test suite only when the xml2 namespace can be loaded.
4 | if (requireNamespace("xml2", quietly = TRUE)) {
5 | test_check("xmlparsedata")
6 | }
7 |
--------------------------------------------------------------------------------
/R/xmlparsedata-package.R:
--------------------------------------------------------------------------------
1 | #' @keywords internal
2 | #' @aliases xmlparsedata-package
3 | "_PACKAGE"
4 |
5 | ## usethis namespace: start
6 | ## usethis namespace: end
7 | NULL
8 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(expr_as_xml)
4 | export(xml_parse_data)
5 | export(xml_parse_token_map)
6 | importFrom(utils,getParseData)
7 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^Makefile$
4 | ^README.Rmd$
5 | ^README\.html$
6 | ^.travis.yml$
7 | ^appveyor.yml$
8 | ^\.github$
9 | ^revdep$
10 | ^_pkgdown\.yml$
11 | ^docs$
12 | ^pkgdown$
13 | ^codecov\.yml$
14 | ^LICENSE\.md$
15 | ^[\.]?air\.toml$
16 | ^\.vscode$
17 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | comment: false
2 |
3 | coverage:
4 | status:
5 | project:
6 | default:
7 | target: auto
8 | threshold: 1%
9 | informational: true
10 | patch:
11 | default:
12 | target: auto
13 | threshold: 1%
14 | informational: true
15 |
--------------------------------------------------------------------------------
/man/expr_as_xml.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/expr_as_xml.R
3 | \name{expr_as_xml}
4 | \alias{expr_as_xml}
5 | \title{Get an XML representation of an expression}
6 | \usage{
7 | expr_as_xml(expr)
8 | }
9 | \arguments{
10 | \item{expr}{An expression.}
11 | }
12 | \description{
13 | Get an XML representation of an expression
14 | }
15 |
--------------------------------------------------------------------------------
/man/xmlparsedata.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/package.R
3 | \docType{package}
4 | \name{xmlparsedata}
5 | \alias{xmlparsedata}
6 | \title{Parse Data of R Code as an 'XML' Tree}
7 | \description{
8 | Convert the output of 'utils::getParseData()' to an 'XML' tree, that is
9 | searchable and easier to manipulate in general.
10 | }
11 |
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://r-lib.github.io/xmlparsedata/
2 |
3 | template:
4 | package: tidytemplate
5 | bootstrap: 5
6 | includes:
7 | in_header: |
8 |
9 |
10 |
11 | development:
12 | mode: auto
13 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
1 | # Extract the source text covered by each row of `pd` from `lines`.
2 | #
3 | # `pd` must provide `line1`, `col1`, `line2` and `col2` columns giving the
4 | # start and end position of every token; `lines` holds the source, one
5 | # element per line. Returns a character vector with one string per row of
6 | # `pd`; the pieces of a token that spans several lines are joined with "\n".
7 | reparse_octal <- function(pd, lines) {
8 |   token_text <- character(nrow(pd))
9 |   spans_lines <- pd$line1 != pd$line2
10 |
11 |   # Tokens confined to one line: a single vectorised substr() call.
12 |   on_one_line <- !spans_lines
13 |   token_text[on_one_line] <- substr(
14 |     lines[pd$line1[on_one_line]],
15 |     pd$col1[on_one_line],
16 |     pd$col2[on_one_line]
17 |   )
18 |
19 |   # Tokens spanning lines: tail of the first line, any whole lines in
20 |   # between, then the head of the last line.
21 |   for (idx in which(spans_lines)) {
22 |     first_line <- pd$line1[idx]
23 |     last_line <- pd$line2[idx]
24 |     opening <- substring(lines[first_line], pd$col1[idx])
25 |     interior <- if (first_line < last_line - 1L) {
26 |       lines[(first_line + 1L):(last_line - 1L)]
27 |     }
28 |     closing <- substr(lines[last_line], 1L, pd$col2[idx])
29 |     token_text[idx] <- paste(c(opening, interior, closing), collapse = "\n")
30 |   }
31 |
32 |   token_text
33 | }
34 |
--------------------------------------------------------------------------------
/R/expr_as_xml.R:
--------------------------------------------------------------------------------
1 | #' Get an XML representation of an expression
2 | #'
3 | #' @param expr An R expression. It is captured unevaluated with
4 | #'   `substitute()`, so the code itself (not its value) is converted.
5 | #' @return The object returned by `xml2::read_xml()` for the XML text that
6 | #'   [xml_parse_data()] produces from the parsed expression.
7 | #' @export
8 | expr_as_xml <- function(expr) {
9 |   # xml2 is only needed by this function, so it is checked at call time.
10 |   if (!requireNamespace("xml2", quietly = TRUE)) {
11 |     stop("'xml2' is required to return an XML object", call. = FALSE)
12 |   }
13 |   tmp_source <- tempfile()
14 |   # add = TRUE so this cleanup never replaces another registered exit handler.
15 |   on.exit(unlink(tmp_source), add = TRUE)
16 |
17 |   # Round-trip the unevaluated expression through a file and re-parse it
18 |   # with keep.source = TRUE before handing it to xml_parse_data().
19 |   # NB: deparse() approach struggles with `{` expressions
20 |   dput(substitute(expr), file = tmp_source)
21 |   parsed_expr <- parse(file = tmp_source, keep.source = TRUE)
22 |   # TODO(#28): Strip the line/column metadata which
23 |   #   is technically 'missing' for this case.
24 |   xml2::read_xml(xml_parse_data(parsed_expr))
25 | }
26 |
--------------------------------------------------------------------------------
/man/xml_parse_token_map.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/package.R
3 | \docType{data}
4 | \name{xml_parse_token_map}
5 | \alias{xml_parse_token_map}
6 | \title{Map token names of the R parser to token names in
7 | \code{\link[=xml_parse_data]{xml_parse_data()}}}
8 | \format{
9 | An object of class \code{character} of length 20.
10 | }
11 | \usage{
12 | xml_parse_token_map
13 | }
14 | \description{
15 | Some of the R token names are not valid XML tag names,
16 | so \code{\link[=xml_parse_data]{xml_parse_data()}} needs to replace them to create a
17 | valid XML file.
18 | }
19 | \seealso{
20 | \code{\link[=xml_parse_data]{xml_parse_data()}}
21 | }
22 | \keyword{datasets}
23 |
--------------------------------------------------------------------------------
/man/xmlparsedata-package.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/xmlparsedata-package.R
3 | \docType{package}
4 | \name{xmlparsedata-package}
5 | \alias{xmlparsedata-package}
6 | \alias{_PACKAGE}
7 | \title{xmlparsedata: Parse Data of 'R' Code as an 'XML' Tree}
8 | \description{
9 | Convert the output of 'utils::getParseData()' to an 'XML' tree that one can search via 'XPath' and that is easier to manipulate in general.
10 | }
11 | \seealso{
12 | Useful links:
13 | \itemize{
14 | \item \url{https://github.com/r-lib/xmlparsedata#readme}
15 | \item \url{https://r-lib.github.io/xmlparsedata/}
16 | \item Report bugs at \url{https://github.com/r-lib/xmlparsedata/issues}
17 | }
18 |
19 | }
20 | \author{
21 | \strong{Maintainer}: Gábor Csárdi \email{csardi.gabor@gmail.com}
22 |
23 | Other contributors:
24 | \itemize{
25 | \item Posit Software, PBC [copyright holder, funder]
26 | \item Mango Solutions [copyright holder, funder]
27 | }
28 |
29 | }
30 | \keyword{internal}
31 |
--------------------------------------------------------------------------------
/tests/testthat/test-expr_as_xml.R:
--------------------------------------------------------------------------------
1 | # Tests for expr_as_xml(). Every test needs xml2, both because
2 | # expr_as_xml() stops without it and because the assertions call xml2::
3 | # helpers, so each test skips itself when xml2 is not installed.
4 |
5 | test_that("XML object is returned with correct structure", {
6 |   skip_if_not_installed("xml2")
7 |
8 |   expect_silent({
9 |     expr_xml <- expr_as_xml(mtcars[, "cyl"])
10 |   })
11 |   expect_s3_class(expr_xml, "xml_document")
12 |   # Children of the top-level expression: `mtcars`, `[`, `,`, `"cyl"`, `]`
13 |   expect_identical(
14 |     vapply(
15 |       xml2::xml_children(xml2::xml_child(expr_xml)),
16 |       xml2::xml_name,
17 |       character(1L)
18 |     ),
19 |     c("expr", "OP-LEFT-BRACKET", "OP-COMMA", "expr", "OP-RIGHT-BRACKET")
20 |   )
21 | })
22 |
23 | test_that("multi-expression case also works", {
24 |   skip_if_not_installed("xml2")
25 |
26 |   expect_silent({
27 |     expr_xml <- expr_as_xml({
28 |       1 + 1
29 |       sqrt(rnorm(100))
30 |     })
31 |   })
32 |   expect_identical(xml2::xml_name(expr_xml), "exprlist")
33 |   # `{`, `1 + 1`, `sqrt(...)`, and `}`
34 |   expect_length(xml2::xml_children(xml2::xml_child(expr_xml)), 4L)
35 | })
36 |
37 | test_that("literals are also fine", {
38 |   skip_if_not_installed("xml2")
39 |
40 |   expect_silent(expr_as_xml("a b c"))
41 |   expect_silent(expr_as_xml(100L))
42 | })
43 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: xmlparsedata
2 | Title: Parse Data of 'R' Code as an 'XML' Tree
3 | Version: 1.0.5.9000
4 | Authors@R: c(
5 | person("Gábor", "Csárdi", , "csardi.gabor@gmail.com", role = c("aut", "cre")),
6 | person("Posit Software, PBC", role = c("cph", "fnd"),
7 | comment = c(ROR = "03wc8by49")),
8 | person("Mango Solutions", role = c("cph", "fnd"))
9 | )
10 | Description: Convert the output of 'utils::getParseData()' to an 'XML'
11 | tree that one can search via 'XPath' and that is easier to manipulate
12 | in general.
13 | License: MIT + file LICENSE
14 | URL: https://github.com/r-lib/xmlparsedata#readme,
15 | https://r-lib.github.io/xmlparsedata/
16 | BugReports: https://github.com/r-lib/xmlparsedata/issues
17 | Depends:
18 | R (>= 3.0.0)
19 | Suggests:
20 | covr,
21 | testthat (>= 3.0.0),
22 | xml2
23 | Config/Needs/website: tidyverse/tidytemplate
24 | Config/testthat/edition: 3
25 | Config/usethis/last-upkeep: 2025-05-07
26 | Encoding: UTF-8
27 | LazyData: true
28 | Roxygen: list(markdown = TRUE)
29 | RoxygenNote: 7.2.3
30 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | Copyright (c) 2025 xmlparsedata authors
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 | release:
8 | types: [published]
9 | workflow_dispatch:
10 |
11 | name: pkgdown.yaml
12 |
13 | permissions: read-all
14 |
15 | jobs:
16 | pkgdown:
17 | runs-on: ubuntu-latest
18 | # Only restrict concurrency for non-PR jobs
19 | concurrency:
20 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
21 | env:
22 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
23 | permissions:
24 | contents: write
25 | steps:
26 | - uses: actions/checkout@v4
27 |
28 | - uses: r-lib/actions/setup-pandoc@v2
29 |
30 | - uses: r-lib/actions/setup-r@v2
31 | with:
32 | use-public-rspm: true
33 |
34 | - uses: r-lib/actions/setup-r-dependencies@v2
35 | with:
36 | extra-packages: any::pkgdown, local::.
37 | needs: website
38 |
39 | - name: Build site
40 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
41 | shell: Rscript {0}
42 |
43 | - name: Deploy to GitHub pages 🚀
44 | if: github.event_name != 'pull_request'
45 | uses: JamesIves/github-pages-deploy-action@v4.5.0
46 | with:
47 | clean: false
48 | branch: gh-pages
49 | folder: docs
50 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 |
2 | # xmlparsedata Development version
3 |
4 | * Re-parse character literals with octal-escaped expressions of width 1 or 2,
5 | e.g. `"\1"`, to work around a bug (in R<4.3.0) in `utils::getParseData()`
6 | (#25, @michaelchirico).
7 |
8 | * New `expr_as_xml()` to get an XML representation of R expressions (#27 @MichaelChirico).
9 |
10 | # xmlparsedata 1.0.5
11 |
12 | * Translate `\` in lambda expression to `OP-LAMBDA` (#18 @renkun-ken).
13 |
14 | * Drop all control characters, except horizontal tab and newline (#19).
15 |
16 | # xmlparsedata 1.0.4
17 |
18 | * Translate `]` tokens to `OP-RIGHT-BRACKET` instead of
19 | `OP-RIGHT-BRACE` (#11 @AshesITR).
20 |
21 | * `xml_parse_data()` now works if `includeText = FALSE`
22 | (#14 @renkun-ken).
23 |
24 | # xmlparsedata 1.0.3
25 |
26 | * Ensure that closing xml-tags for code expressions that end at the same
27 | position in a file respect start-first-end-last ordering in the produced xml.
28 | Ensures that the new `equal_assign` token in `getParseData()` for R-3.6 is
29 | handled appropriately. #5 @russHyde
30 |
31 | # xmlparsedata 1.0.2
32 |
33 | * Remove control characters `\003`, `\007`, `\010`, `\027`, as they are
34 | not allowed in XML 1.0, #1 @GregoireGauriot
35 |
36 | * Always convert parsed text to UTF-8
37 |
38 | # xmlparsedata 1.0.1
39 |
40 | * Fix a bug when the input is already a `getParseData()` data frame.
41 | https://github.com/jimhester/lintr filters the parsed data to include
42 | individual functions only, but only filters the data frame, not the
43 | underlying srcrefs, so when we call `getParseData()` on the data frame
44 | again, we get the data for the whole source file. This is fixed now by
45 | noticing that the input is already a data frame.
46 |
47 | # xmlparsedata 1.0.0
48 |
49 | First public release.
50 |
--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 |
8 | name: test-coverage.yaml
9 |
10 | permissions: read-all
11 |
12 | jobs:
13 | test-coverage:
14 | runs-on: ubuntu-latest
15 | env:
16 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
17 |
18 | steps:
19 | - uses: actions/checkout@v4
20 |
21 | - uses: r-lib/actions/setup-r@v2
22 | with:
23 | use-public-rspm: true
24 |
25 | - uses: r-lib/actions/setup-r-dependencies@v2
26 | with:
27 | extra-packages: any::covr, any::xml2
28 | needs: coverage
29 |
30 | - name: Test coverage
31 | run: |
32 | cov <- covr::package_coverage(
33 | quiet = FALSE,
34 | clean = FALSE,
35 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
36 | )
37 | print(cov)
38 | covr::to_cobertura(cov)
39 | shell: Rscript {0}
40 |
41 | - uses: codecov/codecov-action@v5
42 | with:
43 | # Fail on error if not on a PR, or if on a PR and a token is given
44 | fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }}
45 | files: ./cobertura.xml
46 | plugins: noop
47 | disable_search: true
48 | token: ${{ secrets.CODECOV_TOKEN }}
49 |
50 | - name: Show testthat output
51 | if: always()
52 | run: |
53 | ## --------------------------------------------------------------------
54 | find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true
55 | shell: bash
56 |
57 | - name: Upload test results
58 | if: failure()
59 | uses: actions/upload-artifact@v4
60 | with:
61 | name: coverage-test-failures
62 | path: ${{ runner.temp }}/package
63 |
--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | #
4 | # NOTE: This workflow is overkill for most R packages and
5 | # check-standard.yaml is likely a better choice.
6 | # usethis::use_github_action("check-standard") will install it.
7 | on:
8 | push:
9 | branches: [main, master]
10 | pull_request:
11 |
12 | name: R-CMD-check.yaml
13 |
14 | permissions: read-all
15 |
16 | jobs:
17 | R-CMD-check:
18 | runs-on: ${{ matrix.config.os }}
19 |
20 | name: ${{ matrix.config.os }} (${{ matrix.config.r }})
21 |
22 | strategy:
23 | fail-fast: false
24 | matrix:
25 | config:
26 | - {os: macos-latest, r: 'release'}
27 |
28 | - {os: windows-latest, r: 'release'}
29 | # use 4.0 or 4.1 to check with rtools40's older compiler
30 | - {os: windows-latest, r: 'oldrel-4'}
31 |
32 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
33 | - {os: ubuntu-latest, r: 'release'}
34 | - {os: ubuntu-latest, r: 'oldrel-1'}
35 | - {os: ubuntu-latest, r: 'oldrel-2'}
36 | - {os: ubuntu-latest, r: 'oldrel-3'}
37 | - {os: ubuntu-latest, r: 'oldrel-4'}
38 |
39 | env:
40 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
41 | R_KEEP_PKG_SOURCE: yes
42 |
43 | steps:
44 | - uses: actions/checkout@v4
45 |
46 | - uses: r-lib/actions/setup-pandoc@v2
47 |
48 | - uses: r-lib/actions/setup-r@v2
49 | with:
50 | r-version: ${{ matrix.config.r }}
51 | http-user-agent: ${{ matrix.config.http-user-agent }}
52 | use-public-rspm: true
53 |
54 | - uses: r-lib/actions/setup-r-dependencies@v2
55 | with:
56 | extra-packages: any::rcmdcheck
57 | needs: check
58 |
59 | - uses: r-lib/actions/check-r-package@v2
60 | with:
61 | upload-snapshots: true
62 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'
63 |
--------------------------------------------------------------------------------
/.github/workflows/rhub.yaml:
--------------------------------------------------------------------------------
1 | # R-hub's generic GitHub Actions workflow file. Its canonical location is at
2 | # https://github.com/r-hub/rhub2/blob/v1/inst/workflow/rhub.yaml
3 | # You can update this file to a newer version using the rhub2 package:
4 | #
5 | # rhub2::rhub_setup()
6 | #
7 | # It is unlikely that you need to modify this file manually.
8 |
9 | name: R-hub
10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}"
11 |
12 | on:
13 | workflow_dispatch:
14 | inputs:
15 | config:
16 | description: 'A comma separated list of R-hub platforms to use.'
17 | type: string
18 | default: 'linux,windows,macos'
19 | name:
20 | description: 'Run name. You can leave this empty now.'
21 | type: string
22 | id:
23 | description: 'Unique ID. You can leave this empty now.'
24 | type: string
25 |
26 | jobs:
27 |
28 | setup:
29 | runs-on: ubuntu-latest
30 | outputs:
31 | containers: ${{ steps.rhub-setup.outputs.containers }}
32 | platforms: ${{ steps.rhub-setup.outputs.platforms }}
33 |
34 | steps:
35 | # NO NEED TO CHECKOUT HERE
36 | - uses: r-hub/rhub2/actions/rhub-setup@v1
37 | with:
38 | config: ${{ github.event.inputs.config }}
39 | id: rhub-setup
40 |
41 | linux-containers:
42 | needs: setup
43 | if: ${{ needs.setup.outputs.containers != '[]' }}
44 | runs-on: ubuntu-latest
45 | name: ${{ matrix.config.label }}
46 | strategy:
47 | fail-fast: false
48 | matrix:
49 | config: ${{ fromJson(needs.setup.outputs.containers) }}
50 | container:
51 | image: ${{ matrix.config.container }}
52 |
53 | steps:
54 | - uses: r-hub/rhub2/actions/rhub-checkout@v1
55 | - uses: r-hub/rhub2/actions/rhub-check@v1
56 | with:
57 | token: ${{ secrets.RHUB_TOKEN }}
58 | job-config: ${{ matrix.config.job-config }}
59 |
60 | other-platforms:
61 | needs: setup
62 | if: ${{ needs.setup.outputs.platforms != '[]' }}
63 | runs-on: ${{ matrix.config.os }}
64 | name: ${{ matrix.config.label }}
65 | strategy:
66 | fail-fast: false
67 | matrix:
68 | config: ${{ fromJson(needs.setup.outputs.platforms) }}
69 |
70 | steps:
71 | - uses: r-hub/rhub2/actions/rhub-checkout@v1
72 | - uses: r-hub/rhub2/actions/rhub-setup-r@v1
73 | with:
74 | job-config: ${{ matrix.config.job-config }}
75 | token: ${{ secrets.RHUB_TOKEN }}
76 | - uses: r-hub/rhub2/actions/rhub-check@v1
77 | with:
78 | job-config: ${{ matrix.config.job-config }}
79 | token: ${{ secrets.RHUB_TOKEN }}
80 |
--------------------------------------------------------------------------------
/man/xml_parse_data.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/package.R
3 | \name{xml_parse_data}
4 | \alias{xml_parse_data}
5 | \title{Convert R parse data to XML}
6 | \usage{
7 | xml_parse_data(x, includeText = NA, pretty = FALSE)
8 | }
9 | \arguments{
10 | \item{x}{
11 | an expression returned from \code{\link{parse}}, or a function or other
12 | object with source reference information
13 | }
14 |
15 | \item{includeText}{
16 | logical; whether to include the text of parsed items in the result
17 | }
18 |
19 | \item{pretty}{Whether to pretty-indent the XML output. It has a small
20 | overhead which probably only matters for very large source files.}
21 | }
22 | \value{
23 | An XML string representing the parse data. See details below.
24 | }
25 | \description{
26 | In recent R versions the parser can attach source code location
27 | information to the parsed expressions. This information is often
28 | useful for static analysis, e.g. code linting. It can be accessed
29 | via the \code{\link[utils:getParseData]{utils::getParseData()}} function.
30 | }
31 | \details{
32 | \code{xml_parse_data()} converts this information to an XML tree.
33 | The R parser's token names are preserved in the XML as much as
34 | possible, but some of them are not valid XML tag names, so they are
35 | renamed, see the \link{xml_parse_token_map} vector for the
36 | mapping.
37 |
38 | The top XML tag is \verb{<exprlist>}, which is a list of
39 | expressions; each expression is an \verb{<expr>} tag. Each tag
40 | has attributes that define the location: \code{line1}, \code{col1},
41 | \code{line2}, \code{col2}. These are from the \code{\link[=getParseData]{getParseData()}}
42 | data frame column names. Next, there are two attributes,
43 | \code{start} and \code{end}, which can be used as an ordering of
44 | expressions in the document. Note that while the values
45 | are correlated with (and in some cases may match exactly)
46 | positions in the document, this cannot be relied upon.
47 |
48 | See an example below. See also the README at
49 | \url{https://github.com/r-lib/xmlparsedata#readme}
50 | for examples on how to search the XML tree with the \code{xml2} package
51 | and XPath expressions.
52 |
53 | Note that \code{xml_parse_data()} silently drops all control characters
54 | (0x01-0x1f) from the input, except horizontal tab (0x09) and newline
55 | (0x0a), because they are invalid in XML 1.0.
56 | }
57 | \examples{
58 | code <- "function(a = 1, b = 2) {\n a + b\n}\n"
59 | expr <- parse(text = code, keep.source = TRUE)
60 |
61 | # The base R way:
62 | getParseData(expr)
63 |
64 | cat(xml_parse_data(expr, pretty = TRUE))
65 | }
66 | \seealso{
67 | \link{xml_parse_token_map} for the token names.
68 | \url{https://github.com/r-lib/xmlparsedata#readme} for more
69 | information and use cases.
70 | }
71 |
--------------------------------------------------------------------------------
/.github/workflows/pr-commands.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | issue_comment:
5 | types: [created]
6 |
7 | name: pr-commands.yaml
8 |
9 | permissions: read-all
10 |
11 | jobs:
12 | document:
13 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }}
14 | name: document
15 | runs-on: ubuntu-latest
16 | env:
17 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
18 | permissions:
19 | contents: write
20 | steps:
21 | - uses: actions/checkout@v4
22 |
23 | - uses: r-lib/actions/pr-fetch@v2
24 | with:
25 | repo-token: ${{ secrets.GITHUB_TOKEN }}
26 |
27 | - uses: r-lib/actions/setup-r@v2
28 | with:
29 | use-public-rspm: true
30 |
31 | - uses: r-lib/actions/setup-r-dependencies@v2
32 | with:
33 | extra-packages: any::roxygen2
34 | needs: pr-document
35 |
36 | - name: Document
37 | run: roxygen2::roxygenise()
38 | shell: Rscript {0}
39 |
40 | - name: commit
41 | run: |
42 | git config --local user.name "$GITHUB_ACTOR"
43 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
44 | git add man/\* NAMESPACE
45 | git commit -m 'Document'
46 |
47 | - uses: r-lib/actions/pr-push@v2
48 | with:
49 | repo-token: ${{ secrets.GITHUB_TOKEN }}
50 |
51 | style:
52 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }}
53 | name: style
54 | runs-on: ubuntu-latest
55 | env:
56 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
57 | permissions:
58 | contents: write
59 | steps:
60 | - uses: actions/checkout@v4
61 |
62 | - uses: r-lib/actions/pr-fetch@v2
63 | with:
64 | repo-token: ${{ secrets.GITHUB_TOKEN }}
65 |
66 | - uses: r-lib/actions/setup-r@v2
67 |
68 | - name: Install dependencies
69 | run: install.packages("styler")
70 | shell: Rscript {0}
71 |
72 | - name: Style
73 | run: styler::style_pkg()
74 | shell: Rscript {0}
75 |
76 | - name: commit
77 | run: |
78 | git config --local user.name "$GITHUB_ACTOR"
79 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
80 | git add \*.R
81 | git commit -m 'Style'
82 |
83 | - uses: r-lib/actions/pr-push@v2
84 | with:
85 | repo-token: ${{ secrets.GITHUB_TOKEN }}
86 |
--------------------------------------------------------------------------------
/.github/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, caste, color, religion, or sexual
10 | identity and orientation.
11 |
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 |
15 | ## Our Standards
16 |
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 |
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 | and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the overall
26 | community
27 |
28 | Examples of unacceptable behavior include:
29 |
30 | * The use of sexualized language or imagery, and sexual attention or advances of
31 | any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email address,
35 | without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 | professional setting
38 |
39 | ## Enforcement Responsibilities
40 |
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 |
46 | Community leaders have the right and responsibility to remove, edit, or reject
47 | comments, commits, code, wiki edits, issues, and other contributions that are
48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
49 | decisions when appropriate.
50 |
51 | ## Scope
52 |
53 | This Code of Conduct applies within all community spaces, and also applies when
54 | an individual is officially representing the community in public spaces.
55 | Examples of representing our community include using an official e-mail address,
56 | posting via an official social media account, or acting as an appointed
57 | representative at an online or offline event.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported to the community leaders responsible for enforcement at codeofconduct@posit.co.
63 | All complaints will be reviewed and investigated promptly and fairly.
64 |
65 | All community leaders are obligated to respect the privacy and security of the
66 | reporter of any incident.
67 |
68 | ## Enforcement Guidelines
69 |
70 | Community leaders will follow these Community Impact Guidelines in determining
71 | the consequences for any action they deem in violation of this Code of Conduct:
72 |
73 | ### 1. Correction
74 |
75 | **Community Impact**: Use of inappropriate language or other behavior deemed
76 | unprofessional or unwelcome in the community.
77 |
78 | **Consequence**: A private, written warning from community leaders, providing
79 | clarity around the nature of the violation and an explanation of why the
80 | behavior was inappropriate. A public apology may be requested.
81 |
82 | ### 2. Warning
83 |
84 | **Community Impact**: A violation through a single incident or series of
85 | actions.
86 |
87 | **Consequence**: A warning with consequences for continued behavior. No
88 | interaction with the people involved, including unsolicited interaction with
89 | those enforcing the Code of Conduct, for a specified period of time. This
90 | includes avoiding interactions in community spaces as well as external channels
91 | like social media. Violating these terms may lead to a temporary or permanent
92 | ban.
93 |
94 | ### 3. Temporary Ban
95 |
96 | **Community Impact**: A serious violation of community standards, including
97 | sustained inappropriate behavior.
98 |
99 | **Consequence**: A temporary ban from any sort of interaction or public
100 | communication with the community for a specified period of time. No public or
101 | private interaction with the people involved, including unsolicited interaction
102 | with those enforcing the Code of Conduct, is allowed during this period.
103 | Violating these terms may lead to a permanent ban.
104 |
105 | ### 4. Permanent Ban
106 |
107 | **Community Impact**: Demonstrating a pattern of violation of community
108 | standards, including sustained inappropriate behavior, harassment of an
109 | individual, or aggression toward or disparagement of classes of individuals.
110 |
111 | **Consequence**: A permanent ban from any sort of public interaction within the
112 | community.
113 |
114 | ## Attribution
115 |
116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
117 | version 2.1, available at
118 | <https://www.contributor-covenant.org/version/2/1/code_of_conduct.html>.
119 |
120 | Community Impact Guidelines were inspired by
121 | [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).
122 |
123 | For answers to common questions about this code of conduct, see the FAQ at
124 | <https://www.contributor-covenant.org/faq>. Translations are available at <https://www.contributor-covenant.org/translations>.
125 |
126 | [homepage]: https://www.contributor-covenant.org
127 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 |
5 | ```{r}
6 | #| label: setup
7 | #| echo: false
8 | #| message: false
9 | knitr::opts_chunk$set(
10 | comment = "#>",
11 | tidy = FALSE,
12 | error = FALSE
13 | )
14 | ```
15 |
16 | # xmlparsedata
17 |
18 | > Parse Data of R Code as an 'XML' Tree
19 |
20 |
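21 | [![R-CMD-check](https://github.com/r-lib/xmlparsedata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/r-lib/xmlparsedata/actions/workflows/R-CMD-check.yaml)
22 | [![CRAN status](https://www.r-pkg.org/badges/version/xmlparsedata)](https://www.r-pkg.org/pkg/xmlparsedata)
23 | [![CRAN downloads](https://cranlogs.r-pkg.org/badges/xmlparsedata)](https://www.r-pkg.org/pkg/xmlparsedata)
24 | [![Codecov test coverage](https://codecov.io/gh/r-lib/xmlparsedata/graph/badge.svg)](https://app.codecov.io/gh/r-lib/xmlparsedata)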
25 |
26 |
27 | Convert the output of 'utils::getParseData()' to an 'XML' tree, that is
28 | searchable and easier to manipulate in general.
29 |
30 | ---
31 |
32 | - [Installation](#installation)
33 | - [Usage](#usage)
34 | - [Introduction](#introduction)
35 | - [`utils::getParseData()`](#utilsgetparsedata)
36 | - [`xml_parse_data()`](#xml_parse_data)
37 | - [Renaming some tokens](#renaming-some-tokens)
38 | - [Search the parse tree with `xml2`](#search-the-parse-tree-with-xml2)
39 | - [License](#license)
40 |
41 | ## Installation
42 |
43 | Stable version:
44 |
45 | ```{r}
46 | #| eval: false
47 | install.packages("xmlparsedata")
48 | ```
49 |
50 | Development version:
51 |
52 | ```{r}
53 | #| eval: false
54 | pak::pak("r-lib/xmlparsedata")
55 | ```
56 |
57 | ## Usage
58 |
59 | ### Introduction
60 |
61 | In recent R versions the parser can attach source code location
62 | information to the parsed expressions. This information is often
63 | useful for static analysis, e.g. code linting. It can be accessed
64 | via the `utils::getParseData()` function.
65 |
66 | `xmlparsedata` converts this information to an XML tree.
67 | The R parser's token names are preserved in the XML as much as
68 | possible, but some of them are not valid XML tag names, so they are
69 | renamed, see below.
70 |
71 | ### `utils::getParseData()`
72 |
73 | `utils::getParseData()` summarizes the parse information in a data
74 | frame. The data frame has one row per expression tree node, and each
75 | node points to its parent. Here is a small example:
76 |
77 | ```{r}
78 | p <- parse(
79 | text = "function(a = 1, b = 2) { \n a + b\n}\n",
80 | keep.source = TRUE
81 | )
82 | getParseData(p)
83 | ```
84 |
85 | ### `xml_parse_data()`
86 |
87 | `xmlparsedata::xml_parse_data()` converts the parse information to
88 | an XML document. It works similarly to `getParseData()`. Specify the
89 | `pretty = TRUE` option to pretty-indent the XML output. Note that this
90 | has a small overhead, so if you are parsing large files, I suggest you
91 | omit it.
92 |
93 | ```{r}
94 | library(xmlparsedata)
95 | xml <- xml_parse_data(p, pretty = TRUE)
96 | cat(xml)
97 | ```
98 |
99 | The top XML tag is `<exprlist>`, which is a list of
100 | expressions, each expression is an `<expr>` tag. Each tag
101 | has attributes that define the location: `line1`, `col1`,
102 | `line2`, `col2`. These are from the `getParseData()`
103 | data frame column names.
104 |
105 | ### Renaming some tokens
106 |
107 | The R parser's token names are preserved in the XML as much as
108 | possible, but some of them are not valid XML tag names, so they are
109 | renamed, see the `xml_parse_token_map` vector for the mapping:
110 |
111 | ```{r}
112 | xml_parse_token_map
113 | ```
114 |
115 | ### Search the parse tree with `xml2`
116 |
117 | The `xml2` package can search XML documents using
118 | [XPath](https://en.wikipedia.org/wiki/XPath) expressions. This is often
119 | useful to search for specific code patterns.
120 |
121 | As an example we search a source file from base R for `1:nrow()`
122 | expressions, which are usually unsafe, as `nrow()` might be zero,
123 | and then the expression is equivalent to `1:0`, i.e. `c(1, 0)`, which
124 | is usually not the intended behavior.
125 |
126 | We load and parse the file directly from the R source code mirror
127 | at https://github.com/wch/r-source:
128 |
129 | ```{r}
130 | url <- paste0(
131 | "https://raw.githubusercontent.com/wch/r-source/",
132 | "4fc93819fc7401b8695ce57a948fe163d4188f47/src/library/tools/R/xgettext.R"
133 | )
134 | src <- readLines(url)
135 | p <- parse(text = src, keep.source = TRUE)
136 | ```
137 |
138 | and we convert it to an XML tree:
139 |
140 | ```{r}
141 | library(xml2)
142 | xml <- read_xml(xml_parse_data(p))
143 | ```
144 |
145 | The `1:nrow(<expr>)` expression corresponds to the following
146 | tree in R:
147 |
148 | ```
149 | <expr>
150 |   +-- <expr>
151 |     +-- NUM_CONST: 1
152 |   +-- ':'
153 |   +-- <expr>
154 |     +-- <expr>
155 |       +-- SYMBOL_FUNCTION_CALL nrow
156 |     +-- '('
157 |     +-- <expr>
158 |     +-- ')'
159 | ```
160 |
161 | ```{r}
162 | bad <- xml_parse_data(
163 | parse(text = "1:nrow(expr)", keep.source = TRUE),
164 | pretty = TRUE
165 | )
166 | cat(bad)
167 | ```
168 |
169 | This translates to the following XPath expression (ignoring
170 | the last three tokens of the `nrow(<expr>)` call):
171 |
172 | ```{r}
173 | xp <- paste0(
174 | "//expr",
175 | "[expr[NUM_CONST[text()='1']]]",
176 | "[OP-COLON]",
177 | "[expr[expr[SYMBOL_FUNCTION_CALL[text()='nrow']]]]"
178 | )
179 | ```
180 |
181 | We can search for this subtree with `xml2::xml_find_all()`:
182 |
183 | ```{r}
184 | bad_nrow <- xml_find_all(xml, xp)
185 | bad_nrow
186 | ```
187 |
188 | There is only one hit, in line 334:
189 |
190 | ```{r}
191 | cbind(332:336, src[332:336])
192 | ```
193 |
194 | ## Code of Conduct
195 |
196 | Please note that the xmlparsedata project is released with a
197 | [Contributor Code of Conduct](https://r-lib.github.io/xmlparsedata/CODE_OF_CONDUCT.html).
198 | By contributing to this project, you agree to abide by its terms.
199 |
200 | ## License
201 |
202 | MIT © Mango Solutions, RStudio
203 |
--------------------------------------------------------------------------------
/tests/testthat/test-xml_parse_data.R:
--------------------------------------------------------------------------------
test_that("empty input", {
  # Empty source text must still produce a single XML string.
  xml <- xml_parse_data(parse(text = "", keep.source = TRUE))
  expect_true(is.character(xml))
  expect_true(length(xml) == 1)
  # The root element has to be present and contain nothing but whitespace.
  # (A bare "\\s*" pattern would match any string and assert nothing.)
  expect_match(xml, "<exprlist>\\s*</exprlist>")
  expect_silent(x <- xml2::read_xml(xml))
})
8 |
test_that("trivial input", {
  # A lone comment becomes one <COMMENT> element directly under the root.
  xml <- xml_parse_data(parse(text = "# comment\n", keep.source = TRUE))
  expect_true(is.character(xml))
  expect_true(length(xml) == 1)
  expect_match(xml, "<exprlist>\\s*<COMMENT [^<]*</COMMENT>\\s*</exprlist>")
  expect_silent(x <- xml2::read_xml(xml))

  # A numeric literal is an <expr> wrapping a single <NUM_CONST>.
  xml <- xml_parse_data(parse(text = "1", keep.source = TRUE))
  expect_match(
    xml,
    paste0(
      "<exprlist>\\s*<expr [^<]*<NUM_CONST [^<]*</NUM_CONST>\\s*",
      "</expr>\\s*</exprlist>"
    )
  )
  expect_silent(x <- xml2::read_xml(xml))
})
26 |
test_that("non-trivial input", {
  # Feed the source of a large real-world function through both output
  # modes; either way the result must be readable by xml2 without noise.
  code <- deparse(utils::install.packages)

  compact <- xml_parse_data(parse(text = code, keep.source = TRUE))
  expect_silent(doc <- xml2::read_xml(compact))

  indented <- xml_parse_data(
    parse(text = code, keep.source = TRUE),
    pretty = TRUE
  )
  expect_silent(doc <- xml2::read_xml(indented))
})
39 |
test_that("UTF-8 is OK", {
  # Cut `code` down to the col1..col2 span that the XML reports for the
  # children of the root element (a single comment in the cases below).
  reported_span <- function(code) {
    doc <- xml2::read_xml(
      xml_parse_data(parse(text = code, keep.source = TRUE))
    )
    node <- xml2::xml_children(doc)
    substring(
      code,
      xml2::xml_attr(node, "col1"),
      xml2::xml_attr(node, "col2")
    )
  }

  accented <- enc2native("# comment with éápő")
  expect_equal(reported_span(accented), accented)

  cjk <- enc2native("# 現行の学校文法では、英語にあるような「目的語」「補語」")
  expect_equal(reported_span(cjk), iconv(cjk, to = "UTF-8"))

  # Non-ASCII text must also survive as element content and be findable.
  infix <- enc2native("`%ééé%` <- function(l, r) l + r")
  pretty_xml <- xml_parse_data(
    parse(text = infix, keep.source = TRUE),
    pretty = TRUE
  )
  query <- iconv(
    enc2native("/exprlist/expr/expr/SYMBOL[text()='`%ééé%`']"),
    to = "UTF-8"
  )
  hits <- xml2::xml_find_all(xml2::read_xml(pretty_xml), query)
  expect_equal(length(hits), 1)
})
79 |
test_that("data frame input", {
  parsed <- parse(text = "1 + 1", keep.source = TRUE)

  # Reduce the parse data to a bare data frame: no srcfile attribute and
  # no extra classes.
  bare <- getParseData(parsed)
  attr(bare, "srcfile") <- NULL
  class(bare) <- "data.frame"

  # Both entry points must render the same XML.
  from_data_frame <- xml_parse_data(bare)
  from_expression <- xml_parse_data(parsed)
  expect_equal(from_data_frame, from_expression)
})
92 |
93 |
test_that("Control-C character", {
  # Comments containing control characters. Each label now matches its
  # escape: Escape is \033 (not \027, which is ETB) and vertical tab is \v
  # (not \t, which is the horizontal tab).
  src <- paste0(
    "# Control-C \003\n",
    "# Bell \007\n",
    "# Escape \033\n",
    "# Form feed \f\n",
    "# Vertical tab \v\n",
    "# Horizontal tab \t\n"
  )
  xml <- xml_parse_data(parse(text = src, keep.source = TRUE))
  # The output must still be readable as an XML document.
  x <- xml2::read_xml(xml)
  expect_s3_class(x, "xml_document")
})
105 |
106 |
test_that("equal_assign is handled on R 3.6", {
  # Since R 3.6 getParseData() reports an `equal_assign` token, so `a = 1`
  # yields nested nodes with different tag names that end at the same
  # position in the source. A node that starts earlier has to be closed
  # after every node nested inside it, otherwise the XML is malformed.
  assignment <- parse(text = "a = 1", keep.source = TRUE)
  xml <- xml_parse_data(assignment)

  expect_type(xml, "character")
  expect_length(xml, 1)
  expect_silent(doc <- xml2::read_xml(xml))
})
119 |
test_that("includeText=FALSE works", {
  # With includeText = FALSE, getParseData() returns a data frame that has
  # no `text` column. xml_parse_data() has to cope with that, and the
  # document it produces must not carry any text content.
  parsed <- parse(text = "x <- 1", keep.source = TRUE)
  xml <- xml_parse_data(parsed, includeText = FALSE)

  expect_type(xml, "character")
  expect_length(xml, 1)
  expect_silent(doc <- xml2::read_xml(xml))
  expect_true(xml2::xml_text(doc) == "")
})
134 |
test_that("lambda operator works", {
  # r-devel rev 79553 introduces native pipe syntax (|>) and lambda
  # expressions (e.g. \(x) x + 1); skip on anything older.
  testthat::skip_if_not(
    getRversion() >= "4.1.0" && as.numeric(R.version[["svn rev"]]) >= 79553
  )

  lambda <- parse(text = "\\(x) x + 1", keep.source = TRUE)
  xml <- xml_parse_data(lambda)

  expect_type(xml, "character")
  expect_length(xml, 1)
  expect_silent(doc <- xml2::read_xml(xml))
  # Exactly one lambda operator element is expected in the document.
  expect_length(xml2::xml_find_all(doc, "//OP-LAMBDA"), 1)
})
146 |
test_that("narrow octal strings are parsed correctly", {
  # XML for a snippet of R source, parsed with source references kept.
  xml_of <- function(code) {
    xml_parse_data(parse(text = code, keep.source = TRUE))
  }

  # a single escape, in either quote style
  expect_match(xml_of("'\\1'"), "'\\1'", fixed = TRUE)
  expect_match(xml_of('"\\1"'), '"\\1"', fixed = TRUE)

  # multiple literals
  expect_match(xml_of("'\\1'\n'\\2'"), "'[\\]1'.*'[\\]2'")

  # multiple escapes
  expect_match(xml_of("'\\1\\2'"), "'\\1\\2'", fixed = TRUE)

  # multi-line strings
  expect_match(xml_of("'\n\\1\n'"), "'\n\\1\n'", fixed = TRUE)
  expect_match(xml_of("a <- '\\1\n\\2'"), "'\\1\n\\2'", fixed = TRUE)

  # mixed-length strings
  expect_match(
    xml_of("foo('\\1',\n '\n\\2\n')"),
    "'[\\]1'.*'\n[\\]2\n'"
  )
})
190 |
--------------------------------------------------------------------------------
/R/package.R:
--------------------------------------------------------------------------------
1 | #' Parse Data of R Code as an 'XML' Tree
2 | #'
3 | #' Convert the output of 'utils::getParseData()' to an 'XML' tree, that is
4 | #' searchable and easier to manipulate in general.
5 | #'
6 | #' @docType package
7 | #' @name xmlparsedata
8 | NULL
9 |
#' Convert R parse data to XML
#'
#' In recent R versions the parser can attach source code location
#' information to the parsed expressions. This information is often
#' useful for static analysis, e.g. code linting. It can be accessed
#' via the [utils::getParseData()] function.
#'
#' `xml_parse_data()` converts this information to an XML tree.
#' The R parser's token names are preserved in the XML as much as
#' possible, but some of them are not valid XML tag names, so they are
#' renamed, see the [xml_parse_token_map] vector for the
#' mapping.
#'
#' The top XML tag is `<exprlist>`, which is a list of
#' expressions, each expression is an `<expr>` tag. Each tag
#' has attributes that define the location: `line1`, `col1`,
#' `line2`, `col2`. These are from the [getParseData()]
#' data frame column names. Next, there are two attributes,
#' `start` and `end`, which can be used as an ordering of
#' expressions in the document. Note that while the values
#' are correlated with (and in some cases may match exactly)
#' positions in the document, this cannot be relied upon.
#'
#' See an example below. See also the README at
#' <https://github.com/r-lib/xmlparsedata#readme>
#' for examples on how to search the XML tree with the `xml2` package
#' and XPath expressions.
#'
#' Note that `xml_parse_data()` silently drops all control characters
#' (0x01-0x1f) from the input, except horizontal tab (0x09) and newline
#' (0x0a), because they are invalid in XML 1.0.
#'
#' @param pretty Whether to pretty-indent the XML output. It has a small
#'   overhead which probably only matters for very large source files.
#' @inheritParams utils::getParseData
#' @return An XML string representing the parse data. See details below.
#'
#' @export
#' @importFrom utils getParseData
#' @seealso [xml_parse_token_map] for the token names.
#'   <https://github.com/r-lib/xmlparsedata#readme> for more
#'   information and use cases.
#' @examples
#' code <- "function(a = 1, b = 2) {\n  a + b\n}\n"
#' expr <- parse(text = code, keep.source = TRUE)
#'
#' # The base R way:
#' getParseData(expr)
#'
#' cat(xml_parse_data(expr, pretty = TRUE))
xml_parse_data <- function(x, includeText = NA, pretty = FALSE) {
  ## The document is the XML declaration plus an <exprlist> root element
  ## wrapped around the generated tags.
  xml_header <- paste0(
    "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\" ?>\n",
    "<exprlist>\n"
  )
  xml_footer <- "\n</exprlist>\n"

  ## Maybe it is already a data frame, e.g. when used in lintr
  if (is.data.frame(x)) {
    pd <- x
  } else {
    pd <- getParseData(x, includeText = includeText)
    if (is.null(pd)) {
      ## No parse data is attached to `x`: write it out with dput() and
      ## parse that file again, this time keeping the source.
      tmp_source <- tempfile()
      on.exit(unlink(tmp_source), add = TRUE)
      dput(x, file = tmp_source)

      x <- parse(tmp_source, keep.source = TRUE)
      pd <- getParseData(x, includeText = includeText)
      ## The positions are blanked; presumably because they describe the
      ## temporary file rather than the caller's code.
      ## NOTE(review): with every position NA the sort keys further down are
      ## all NA, so ordering falls back to `terminal` and row order. Confirm
      ## the resulting nesting is what callers of this path expect.
      pd$line1 <- pd$line2 <- pd$col1 <- pd$col2 <- NA_integer_
    }
  }

  ## Nothing was parsed: return an empty <exprlist>
  if (!nrow(pd)) {
    return(paste0(xml_header, xml_footer))
  }

  pd <- fix_comments(pd)

  if (!is.data.frame(x)) {
    # workaround for R parser bug #18323; see #25
    str_const_mismatch <- pd$token == "STR_CONST" &
      pd$col2 - pd$col1 != nchar(pd$text) - 1L &
      # skip if there are tabs, which would require complicating the logic a lot
      !grepl("\t", pd$text, fixed = TRUE)
    if (any(str_const_mismatch)) {
      pd$text[str_const_mismatch] <- reparse_octal(
        pd[str_const_mismatch, ],
        attr(x, "srcfile")$lines
      )
    }
  }

  ## `text` is absent when the parse data was made with includeText = FALSE
  if (!is.null(pd$text)) {
    pd$text <- enc2utf8(pd$text)
  }

  ## Tags for all nodes, terminal nodes have end tags as well
  pd$token <- map_token(pd$token)

  ## Positions, to make it easy to compare what comes first
  maxcol <- max(pd$col1, pd$col2) + 1L
  pd$start <- pd$line1 * maxcol + pd$col1
  pd$end <- pd$line2 * maxcol + pd$col2

  ## Terminal nodes are closed on the same line as they are opened
  terminal_tag <- character(nrow(pd))
  terminal_tag[pd$terminal] <- paste0("</", pd$token[pd$terminal], ">")
  if (anyNA(pd$line1)) {
    ## No usable positions: emit the tags without location attributes
    pd$tag <- paste0(
      "<",
      pd$token,
      ">",
      if (!is.null(pd$text)) xml_encode(pd$text) else "",
      terminal_tag
    )
  } else {
    pd$tag <- paste0(
      "<",
      pd$token,
      " line1=\"",
      pd$line1,
      "\" col1=\"",
      pd$col1,
      "\" line2=\"",
      pd$line2,
      "\" col2=\"",
      pd$col2,
      "\" start=\"",
      pd$start,
      "\" end=\"",
      pd$end,
      "\">",
      if (!is.null(pd$text)) xml_encode(pd$text) else "",
      terminal_tag
    )
  }

  ## Add an extra terminal tag for each non-terminal one. This is the closing
  ## tag; it is placed at the end position of its node, and its own line2 and
  ## col2 are decremented so that it sorts after a node ending at that spot.
  pd2 <- pd[!pd$terminal, ]
  if (nrow(pd2)) {
    pd2$terminal <- TRUE
    pd2$parent <- -1
    pd2$line1 <- pd2$line2
    pd2$col1 <- pd2$col2
    pd2$line2 <- pd2$line2 - 1L
    pd2$col2 <- pd2$col2 - 1L
    pd2$tag <- paste0("</", pd2$token, ">")
    pd <- rbind(pd, pd2, make.row.names = FALSE)
  }

  ## Order the nodes properly
  ## - the terminal nodes from pd2 may be nested inside each other, when
  ##   this happens they will have the same line1, col1, line2, col2 and
  ##   terminal status; and 'start' is used to break ties
  ord <- order(pd$line1, pd$col1, -pd$line2, -pd$col2, pd$terminal, -pd$start)
  pd <- pd[ord, ]

  if (pretty) {
    ## Opening tags of non-terminal nodes add two spaces of indentation for
    ## what follows them; closing tags (marked by parent == -1) remove them.
    str <- !pd$terminal
    end <- pd$parent == -1
    ind <- 2L + cumsum(str * 2L + end * (-2L)) - str * 2L
    xml <- paste0(strrep(" ", ind), pd$tag, collapse = "\n")
  } else {
    xml <- paste(pd$tag, collapse = "\n")
  }

  paste0(xml_header, xml, xml_footer)
}
178 |
# Re-attach rows with a negative `parent` id to the top level (parent 0).
# Returns the parse data frame with only the `parent` column touched.
fix_comments <- function(pd) {
  pd$parent <- replace(pd$parent, pd$parent < 0, 0)
  pd
}
183 |
# Translate R parser token names to their XML tag names.
# Tokens listed in `xml_parse_token_map` are replaced by the mapped value;
# every other token is returned unchanged.
map_token <- function(token) {
  position <- match(token, names(xml_parse_token_map))
  mapped <- !is.na(position)
  token[mapped] <- xml_parse_token_map[position[mapped]]
  token
}
189 |
#' Map token names of the R parser to token names in
#' [xml_parse_data()]
#'
#' Some of the token names used by the R parser cannot be used as XML
#' tag names. [xml_parse_data()] swaps them for the names listed here,
#' so that the document it produces is valid XML.
#'
#' @format A named character vector. The names are R parser token names,
#'   the values are the XML tag names that replace them.
#' @export
#' @seealso [xml_parse_data()]
xml_parse_token_map <- c(
  "'?'"    = "OP-QUESTION",
  "'~'"    = "OP-TILDE",
  "'+'"    = "OP-PLUS",
  "'-'"    = "OP-MINUS",
  "'*'"    = "OP-STAR",
  "'/'"    = "OP-SLASH",
  "':'"    = "OP-COLON",
  "'^'"    = "OP-CARET",
  "'$'"    = "OP-DOLLAR",
  "'@'"    = "OP-AT",
  "'('"    = "OP-LEFT-PAREN",
  "'['"    = "OP-LEFT-BRACKET",
  "';'"    = "OP-SEMICOLON",
  "'{'"    = "OP-LEFT-BRACE",
  "'}'"    = "OP-RIGHT-BRACE",
  "')'"    = "OP-RIGHT-PAREN",
  "'!'"    = "OP-EXCLAMATION",
  "']'"    = "OP-RIGHT-BRACKET",
  "','"    = "OP-COMMA",
  "'\\\\'" = "OP-LAMBDA"
)
222 |
# Escape a character vector for use as XML 1.0 character data.
# The markup characters `&`, `<` and `>` are replaced by their predefined
# entities, and control characters that XML 1.0 does not allow are removed.
# Returns a character vector of the same length as `x`.
xml_encode <- function(x) {
  # `&` has to go first, otherwise the ampersands of the entities inserted
  # below would be escaped a second time.
  x <- gsub("&", "&amp;", x, fixed = TRUE)
  x <- gsub("<", "&lt;", x, fixed = TRUE)
  x <- gsub(">", "&gt;", x, fixed = TRUE)
  # most control characters are not allowed in XML, except tab and nl
  x <- gsub("[\x01-\x08\x0b-\x1f]", "", x, useBytes = TRUE)
  x
}
231 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # xmlparsedata
3 |
4 | > Parse Data of R Code as an ‘XML’ Tree
5 |
6 |
7 |
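8 | [![R-CMD-check](https://github.com/r-lib/xmlparsedata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/r-lib/xmlparsedata/actions/workflows/R-CMD-check.yaml)
9 | [![CRAN status](https://www.r-pkg.org/badges/version/xmlparsedata)](https://www.r-pkg.org/pkg/xmlparsedata)
10 | [![CRAN downloads](https://cranlogs.r-pkg.org/badges/xmlparsedata)](https://www.r-pkg.org/pkg/xmlparsedata)
12 | [![Codecov test coverage](https://codecov.io/gh/r-lib/xmlparsedata/graph/badge.svg)](https://app.codecov.io/gh/r-lib/xmlparsedata)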
14 |
15 |
16 | Convert the output of ‘utils::getParseData()’ to an ‘XML’ tree, that is
17 | searchable and easier to manipulate in general.
18 |
19 | ------------------------------------------------------------------------
20 |
21 | - [Installation](#installation)
22 | - [Usage](#usage)
23 | - [Introduction](#introduction)
24 | - [`utils::getParseData()`](#utilsgetparsedata)
25 | - [`xml_parse_data()`](#xml_parse_data)
26 | - [Renaming some tokens](#renaming-some-tokens)
27 | - [Search the parse tree with
28 | `xml2`](#search-the-parse-tree-with-xml2)
29 | - [License](#license)
30 |
31 | ## Installation
32 |
33 | Stable version:
34 |
35 | ``` r
36 | install.packages("xmlparsedata")
37 | ```
38 |
39 | Development version:
40 |
41 | ``` r
42 | pak::pak("r-lib/xmlparsedata")
43 | ```
44 |
45 | ## Usage
46 |
47 | ### Introduction
48 |
49 | In recent R versions the parser can attach source code location
50 | information to the parsed expressions. This information is often useful
51 | for static analysis, e.g. code linting. It can be accessed via the
52 | `utils::getParseData()` function.
53 |
54 | `xmlparsedata` converts this information to an XML tree. The R parser’s
55 | token names are preserved in the XML as much as possible, but some of
56 | them are not valid XML tag names, so they are renamed, see below.
57 |
58 | ### `utils::getParseData()`
59 |
60 | `utils::getParseData()` summarizes the parse information in a data
61 | frame. The data frame has one row per expression tree node, and each
62 | node points to its parent. Here is a small example:
63 |
64 | ``` r
65 | p <- parse(
66 | text = "function(a = 1, b = 2) { \n a + b\n}\n",
67 | keep.source = TRUE
68 | )
69 | getParseData(p)
70 | ```
71 |
72 | #> line1 col1 line2 col2 id parent token terminal text
73 | #> 33 1 1 3 1 33 0 expr FALSE
74 | #> 1 1 1 1 8 1 33 FUNCTION TRUE function
75 | #> 2 1 9 1 9 2 33 '(' TRUE (
76 | #> 3 1 10 1 10 3 33 SYMBOL_FORMALS TRUE a
77 | #> 4 1 12 1 12 4 33 EQ_FORMALS TRUE =
78 | #> 5 1 14 1 14 5 6 NUM_CONST TRUE 1
79 | #> 6 1 14 1 14 6 33 expr FALSE
80 | #> 7 1 15 1 15 7 33 ',' TRUE ,
81 | #> 10 1 17 1 17 10 33 SYMBOL_FORMALS TRUE b
82 | #> 11 1 19 1 19 11 33 EQ_FORMALS TRUE =
83 | #> 12 1 21 1 21 12 13 NUM_CONST TRUE 2
84 | #> 13 1 21 1 21 13 33 expr FALSE
85 | #> 14 1 22 1 22 14 33 ')' TRUE )
86 | #> 30 1 24 3 1 30 33 expr FALSE
87 | #> 17 1 24 1 24 17 30 '{' TRUE {
88 | #> 25 2 3 2 7 25 30 expr FALSE
89 | #> 19 2 3 2 3 19 21 SYMBOL TRUE a
90 | #> 21 2 3 2 3 21 25 expr FALSE
91 | #> 20 2 5 2 5 20 25 '+' TRUE +
92 | #> 22 2 7 2 7 22 24 SYMBOL TRUE b
93 | #> 24 2 7 2 7 24 25 expr FALSE
94 | #> 28 3 1 3 1 28 30 '}' TRUE }
95 |
96 | ### `xml_parse_data()`
97 |
98 | `xmlparsedata::xml_parse_data()` converts the parse information to an
99 | XML document. It works similarly to `getParseData()`. Specify the
100 | `pretty = TRUE` option to pretty-indent the XML output. Note that this
101 | has a small overhead, so if you are parsing large files, I suggest you
102 | omit it.
103 |
104 | ``` r
105 | library(xmlparsedata)
106 | xml <- xml_parse_data(p, pretty = TRUE)
107 | cat(xml)
108 | ```
109 |
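110 | #> <?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
111 | #> <exprlist>
112 | #>   <expr line1="1" col1="1" line2="3" col2="1" start="26" end="76">
113 | #>     <FUNCTION line1="1" col1="1" line2="1" col2="8" start="26" end="33">function</FUNCTION>
114 | #>     <OP-LEFT-PAREN line1="1" col1="9" line2="1" col2="9" start="34" end="34">(</OP-LEFT-PAREN>
115 | #>     <SYMBOL_FORMALS line1="1" col1="10" line2="1" col2="10" start="35" end="35">a</SYMBOL_FORMALS>
116 | #>     <EQ_FORMALS line1="1" col1="12" line2="1" col2="12" start="37" end="37">=</EQ_FORMALS>
117 | #>     <expr line1="1" col1="14" line2="1" col2="14" start="39" end="39">
118 | #>       <NUM_CONST line1="1" col1="14" line2="1" col2="14" start="39" end="39">1</NUM_CONST>
119 | #>     </expr>
120 | #>     <OP-COMMA line1="1" col1="15" line2="1" col2="15" start="40" end="40">,</OP-COMMA>
121 | #>     <SYMBOL_FORMALS line1="1" col1="17" line2="1" col2="17" start="42" end="42">b</SYMBOL_FORMALS>
122 | #>     <EQ_FORMALS line1="1" col1="19" line2="1" col2="19" start="44" end="44">=</EQ_FORMALS>
123 | #>     <expr line1="1" col1="21" line2="1" col2="21" start="46" end="46">
124 | #>       <NUM_CONST line1="1" col1="21" line2="1" col2="21" start="46" end="46">2</NUM_CONST>
125 | #>     </expr>
126 | #>     <OP-RIGHT-PAREN line1="1" col1="22" line2="1" col2="22" start="47" end="47">)</OP-RIGHT-PAREN>
127 | #>     <expr line1="1" col1="24" line2="3" col2="1" start="49" end="76">
128 | #>       <OP-LEFT-BRACE line1="1" col1="24" line2="1" col2="24" start="49" end="49">{</OP-LEFT-BRACE>
129 | #>       <expr line1="2" col1="3" line2="2" col2="7" start="53" end="57">
130 | #>         <expr line1="2" col1="3" line2="2" col2="3" start="53" end="53">
131 | #>           <SYMBOL line1="2" col1="3" line2="2" col2="3" start="53" end="53">a</SYMBOL>
132 | #>         </expr>
133 | #>         <OP-PLUS line1="2" col1="5" line2="2" col2="5" start="55" end="55">+</OP-PLUS>
134 | #>         <expr line1="2" col1="7" line2="2" col2="7" start="57" end="57">
135 | #>           <SYMBOL line1="2" col1="7" line2="2" col2="7" start="57" end="57">b</SYMBOL>
136 | #>         </expr>
137 | #>       </expr>
138 | #>       <OP-RIGHT-BRACE line1="3" col1="1" line2="3" col2="1" start="76" end="76">}</OP-RIGHT-BRACE>
139 | #>     </expr>
140 | #>   </expr>
141 | #> </exprlist>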
142 |
143 | The top XML tag is `<exprlist>`, which is a list of expressions, each
144 | expression is an `<expr>` tag. Each tag has attributes that define the
145 | location: `line1`, `col1`, `line2`, `col2`. These are from the
146 | `getParseData()` data frame column names.
147 |
148 | ### Renaming some tokens
149 |
150 | The R parser’s token names are preserved in the XML as much as possible,
151 | but some of them are not valid XML tag names, so they are renamed, see
152 | the `xml_parse_token_map` vector for the mapping:
153 |
154 | ``` r
155 | xml_parse_token_map
156 | ```
157 |
158 | #> '?' '~' '+' '-'
159 | #> "OP-QUESTION" "OP-TILDE" "OP-PLUS" "OP-MINUS"
160 | #> '*' '/' ':' '^'
161 | #> "OP-STAR" "OP-SLASH" "OP-COLON" "OP-CARET"
162 | #> '$' '@' '(' '['
163 | #> "OP-DOLLAR" "OP-AT" "OP-LEFT-PAREN" "OP-LEFT-BRACKET"
164 | #> ';' '{' '}' ')'
165 | #> "OP-SEMICOLON" "OP-LEFT-BRACE" "OP-RIGHT-BRACE" "OP-RIGHT-PAREN"
166 | #> '!' ']' ',' '\\\\'
167 | #> "OP-EXCLAMATION" "OP-RIGHT-BRACKET" "OP-COMMA" "OP-LAMBDA"
168 |
169 | ### Search the parse tree with `xml2`
170 |
171 | The `xml2` package can search XML documents using
172 | [XPath](https://en.wikipedia.org/wiki/XPath) expressions. This is often
173 | useful to search for specific code patterns.
174 |
175 | As an example we search a source file from base R for `1:nrow()`
176 | expressions, which are usually unsafe, as `nrow()` might be zero, and
177 | then the expression is equivalent to `1:0`, i.e. `c(1, 0)`, which is
178 | usually not the intended behavior.
179 |
180 | We load and parse the file directly from the R source code mirror at
181 | <https://github.com/wch/r-source>:
182 |
183 | ``` r
184 | url <- paste0(
185 | "https://raw.githubusercontent.com/wch/r-source/",
186 | "4fc93819fc7401b8695ce57a948fe163d4188f47/src/library/tools/R/xgettext.R"
187 | )
188 | src <- readLines(url)
189 | p <- parse(text = src, keep.source = TRUE)
190 | ```
191 |
192 | and we convert it to an XML tree:
193 |
194 | ``` r
195 | library(xml2)
196 | xml <- read_xml(xml_parse_data(p))
197 | ```
198 |
199 | The `1:nrow(<expr>)` expression corresponds to the following tree in R:
200 |
201 |     <expr>
202 |       +-- <expr>
203 |         +-- NUM_CONST: 1
204 |       +-- ':'
205 |       +-- <expr>
206 |         +-- <expr>
207 |           +-- SYMBOL_FUNCTION_CALL nrow
208 |         +-- '('
209 |         +-- <expr>
210 |         +-- ')'
211 |
212 | ``` r
213 | bad <- xml_parse_data(
214 | parse(text = "1:nrow(expr)", keep.source = TRUE),
215 | pretty = TRUE
216 | )
217 | cat(bad)
218 | ```
219 |
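220 | #> <?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
221 | #> <exprlist>
222 | #>   <expr line1="1" col1="1" line2="1" col2="12" start="14" end="25">
223 | #>     <expr line1="1" col1="1" line2="1" col2="1" start="14" end="14">
224 | #>       <NUM_CONST line1="1" col1="1" line2="1" col2="1" start="14" end="14">1</NUM_CONST>
225 | #>     </expr>
226 | #>     <OP-COLON line1="1" col1="2" line2="1" col2="2" start="15" end="15">:</OP-COLON>
227 | #>     <expr line1="1" col1="3" line2="1" col2="12" start="16" end="25">
228 | #>       <expr line1="1" col1="3" line2="1" col2="6" start="16" end="19">
229 | #>         <SYMBOL_FUNCTION_CALL line1="1" col1="3" line2="1" col2="6" start="16" end="19">nrow</SYMBOL_FUNCTION_CALL>
230 | #>       </expr>
231 | #>       <OP-LEFT-PAREN line1="1" col1="7" line2="1" col2="7" start="20" end="20">(</OP-LEFT-PAREN>
232 | #>       <expr line1="1" col1="8" line2="1" col2="11" start="21" end="24">
233 | #>         <SYMBOL line1="1" col1="8" line2="1" col2="11" start="21" end="24">expr</SYMBOL>
234 | #>       </expr>
235 | #>       <OP-RIGHT-PAREN line1="1" col1="12" line2="1" col2="12" start="25" end="25">)</OP-RIGHT-PAREN>
236 | #>     </expr>
237 | #>   </expr>
238 | #> </exprlist>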
239 |
240 | This translates to the following XPath expression (ignoring the last
241 | three tokens of the `nrow(<expr>)` call):
242 |
243 | ``` r
244 | xp <- paste0(
245 | "//expr",
246 | "[expr[NUM_CONST[text()='1']]]",
247 | "[OP-COLON]",
248 | "[expr[expr[SYMBOL_FUNCTION_CALL[text()='nrow']]]]"
249 | )
250 | ```
251 |
252 | We can search for this subtree with `xml2::xml_find_all()`:
253 |
254 | ``` r
255 | bad_nrow <- xml_find_all(xml, xp)
256 | bad_nrow
257 | ```
258 |
259 | #> {xml_nodeset (1)}
260 | #> [1] [2,] "333" " else"
271 | #> [3,] "334" "\tfor (i in 1:nrow(x)) {"
272 | #> [4,] "335" "\t if (is.na(x[i, 2L])) cols <- c(1L, 3:5)"
273 | #> [5,] "336" "\t else cols <- 1:5"
274 |
275 | ## Code of Conduct
276 |
277 | Please note that the xmlparsedata project is released with a
278 | [Contributor Code of
279 | Conduct](https://r-lib.github.io/xmlparsedata/CODE_OF_CONDUCT.html). By
280 | contributing to this project, you agree to abide by its terms.
281 |
282 | ## License
283 |
284 | MIT © Mango Solutions, RStudio
285 |
--------------------------------------------------------------------------------