├── .Rbuildignore ├── .github ├── .gitignore ├── CODEOWNERS ├── CODE_OF_CONDUCT.md └── workflows │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ ├── pr-commands.yaml │ ├── rhub.yaml │ └── test-coverage.yaml ├── .gitignore ├── .vscode ├── extensions.json └── settings.json ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── Makefile ├── NAMESPACE ├── NEWS.md ├── R ├── expr_as_xml.R ├── package.R ├── utils.R └── xmlparsedata-package.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── air.toml ├── codecov.yml ├── man ├── expr_as_xml.Rd ├── xml_parse_data.Rd ├── xml_parse_token_map.Rd ├── xmlparsedata-package.Rd └── xmlparsedata.Rd └── tests ├── testthat.R └── testthat ├── test-expr_as_xml.R └── test-xml_parse_data.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^Makefile$ 4 | ^README.Rmd$ 5 | ^README\.html$ 6 | ^.travis.yml$ 7 | ^appveyor.yml$ 8 | ^\.github$ 9 | ^revdep$ 10 | ^_pkgdown\.yml$ 11 | ^docs$ 12 | ^pkgdown$ 13 | ^codecov\.yml$ 14 | ^LICENSE\.md$ 15 | ^[\.]?air\.toml$ 16 | ^\.vscode$ 17 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # CODEOWNERS for xmlparsedata 2 | # https://www.tidyverse.org/development/understudies 3 | * @gaborcsardi @jimhester 4 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, 
regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at codeofconduct@posit.co. 63 | All complaints will be reviewed and investigated promptly and fairly. 64 | 65 | All community leaders are obligated to respect the privacy and security of the 66 | reporter of any incident. 67 | 68 | ## Enforcement Guidelines 69 | 70 | Community leaders will follow these Community Impact Guidelines in determining 71 | the consequences for any action they deem in violation of this Code of Conduct: 72 | 73 | ### 1. Correction 74 | 75 | **Community Impact**: Use of inappropriate language or other behavior deemed 76 | unprofessional or unwelcome in the community. 77 | 78 | **Consequence**: A private, written warning from community leaders, providing 79 | clarity around the nature of the violation and an explanation of why the 80 | behavior was inappropriate. A public apology may be requested. 81 | 82 | ### 2. Warning 83 | 84 | **Community Impact**: A violation through a single incident or series of 85 | actions. 86 | 87 | **Consequence**: A warning with consequences for continued behavior. 
No 88 | interaction with the people involved, including unsolicited interaction with 89 | those enforcing the Code of Conduct, for a specified period of time. This 90 | includes avoiding interactions in community spaces as well as external channels 91 | like social media. Violating these terms may lead to a temporary or permanent 92 | ban. 93 | 94 | ### 3. Temporary Ban 95 | 96 | **Community Impact**: A serious violation of community standards, including 97 | sustained inappropriate behavior. 98 | 99 | **Consequence**: A temporary ban from any sort of interaction or public 100 | communication with the community for a specified period of time. No public or 101 | private interaction with the people involved, including unsolicited interaction 102 | with those enforcing the Code of Conduct, is allowed during this period. 103 | Violating these terms may lead to a permanent ban. 104 | 105 | ### 4. Permanent Ban 106 | 107 | **Community Impact**: Demonstrating a pattern of violation of community 108 | standards, including sustained inappropriate behavior, harassment of an 109 | individual, or aggression toward or disparagement of classes of individuals. 110 | 111 | **Consequence**: A permanent ban from any sort of public interaction within the 112 | community. 113 | 114 | ## Attribution 115 | 116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 117 | version 2.1, available at 118 | <https://www.contributor-covenant.org/version/2/1/code_of_conduct.html>. 119 | 120 | Community Impact Guidelines were inspired by 121 | [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion). 122 | 123 | For answers to common questions about this code of conduct, see the FAQ at 124 | <https://www.contributor-covenant.org/faq>. Translations are available at <https://www.contributor-covenant.org/translations>.
125 | 126 | [homepage]: https://www.contributor-covenant.org 127 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | # 4 | # NOTE: This workflow is overkill for most R packages and 5 | # check-standard.yaml is likely a better choice. 6 | # usethis::use_github_action("check-standard") will install it. 7 | on: 8 | push: 9 | branches: [main, master] 10 | pull_request: 11 | 12 | name: R-CMD-check.yaml 13 | 14 | permissions: read-all 15 | 16 | jobs: 17 | R-CMD-check: 18 | runs-on: ${{ matrix.config.os }} 19 | 20 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 21 | 22 | strategy: 23 | fail-fast: false 24 | matrix: 25 | config: 26 | - {os: macos-latest, r: 'release'} 27 | 28 | - {os: windows-latest, r: 'release'} 29 | # use 4.0 or 4.1 to check with rtools40's older compiler 30 | - {os: windows-latest, r: 'oldrel-4'} 31 | 32 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 33 | - {os: ubuntu-latest, r: 'release'} 34 | - {os: ubuntu-latest, r: 'oldrel-1'} 35 | - {os: ubuntu-latest, r: 'oldrel-2'} 36 | - {os: ubuntu-latest, r: 'oldrel-3'} 37 | - {os: ubuntu-latest, r: 'oldrel-4'} 38 | 39 | env: 40 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 41 | R_KEEP_PKG_SOURCE: yes 42 | 43 | steps: 44 | - uses: actions/checkout@v4 45 | 46 | - uses: r-lib/actions/setup-pandoc@v2 47 | 48 | - uses: r-lib/actions/setup-r@v2 49 | with: 50 | r-version: ${{ matrix.config.r }} 51 | http-user-agent: ${{ matrix.config.http-user-agent }} 52 | use-public-rspm: true 53 | 54 | - uses: r-lib/actions/setup-r-dependencies@v2 55 | with: 56 | extra-packages: any::rcmdcheck 57 | needs: check 58 | 59 | - uses: r-lib/actions/check-r-package@v2 60 | with: 61 | 
upload-snapshots: true 62 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 63 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | release: 8 | types: [published] 9 | workflow_dispatch: 10 | 11 | name: pkgdown.yaml 12 | 13 | permissions: read-all 14 | 15 | jobs: 16 | pkgdown: 17 | runs-on: ubuntu-latest 18 | # Only restrict concurrency for non-PR jobs 19 | concurrency: 20 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 21 | env: 22 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 23 | permissions: 24 | contents: write 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | - uses: r-lib/actions/setup-pandoc@v2 29 | 30 | - uses: r-lib/actions/setup-r@v2 31 | with: 32 | use-public-rspm: true 33 | 34 | - uses: r-lib/actions/setup-r-dependencies@v2 35 | with: 36 | extra-packages: any::pkgdown, local::. 37 | needs: website 38 | 39 | - name: Build site 40 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 41 | shell: Rscript {0} 42 | 43 | - name: Deploy to GitHub pages 🚀 44 | if: github.event_name != 'pull_request' 45 | uses: JamesIves/github-pages-deploy-action@v4.5.0 46 | with: 47 | clean: false 48 | branch: gh-pages 49 | folder: docs 50 | -------------------------------------------------------------------------------- /.github/workflows/pr-commands.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | issue_comment: 5 | types: [created] 6 | 7 | name: pr-commands.yaml 8 | 9 | permissions: read-all 10 | 11 | jobs: 12 | document: 13 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }} 14 | name: document 15 | runs-on: ubuntu-latest 16 | env: 17 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 18 | permissions: 19 | contents: write 20 | steps: 21 | - uses: actions/checkout@v4 22 | 23 | - uses: r-lib/actions/pr-fetch@v2 24 | with: 25 | repo-token: ${{ secrets.GITHUB_TOKEN }} 26 | 27 | - uses: r-lib/actions/setup-r@v2 28 | with: 29 | use-public-rspm: true 30 | 31 | - uses: r-lib/actions/setup-r-dependencies@v2 32 | with: 33 | extra-packages: any::roxygen2 34 | needs: pr-document 35 | 36 | - name: Document 37 | run: roxygen2::roxygenise() 38 | shell: Rscript {0} 39 | 40 | - name: commit 41 | run: | 42 | git config --local user.name "$GITHUB_ACTOR" 43 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 44 | git add man/\* NAMESPACE 45 | git commit -m 'Document' 46 | 47 | - uses: r-lib/actions/pr-push@v2 48 | with: 49 | repo-token: ${{ secrets.GITHUB_TOKEN }} 50 | 51 | style: 52 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }} 53 | name: style 54 | runs-on: ubuntu-latest 55 | env: 56 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 57 | permissions: 58 | contents: write 59 | steps: 60 | - uses: actions/checkout@v4 61 | 62 | - uses: r-lib/actions/pr-fetch@v2 63 | with: 64 | repo-token: ${{ secrets.GITHUB_TOKEN }} 65 | 66 | - uses: r-lib/actions/setup-r@v2 67 | 68 | - name: Install dependencies 69 | run: install.packages("styler") 70 | shell: Rscript {0} 71 | 72 | - name: Style 73 | 
run: styler::style_pkg() 74 | shell: Rscript {0} 75 | 76 | - name: commit 77 | run: | 78 | git config --local user.name "$GITHUB_ACTOR" 79 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 80 | git add \*.R 81 | git commit -m 'Style' 82 | 83 | - uses: r-lib/actions/pr-push@v2 84 | with: 85 | repo-token: ${{ secrets.GITHUB_TOKEN }} 86 | -------------------------------------------------------------------------------- /.github/workflows/rhub.yaml: -------------------------------------------------------------------------------- 1 | # R-hub's generic GitHub Actions workflow file. It's canonical location is at 2 | # https://github.com/r-hub/rhub2/blob/v1/inst/workflow/rhub.yaml 3 | # You can update this file to a newer version using the rhub2 package: 4 | # 5 | # rhub2::rhub_setup() 6 | # 7 | # It is unlikely that you need to modify this file manually. 8 | 9 | name: R-hub 10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}" 11 | 12 | on: 13 | workflow_dispatch: 14 | inputs: 15 | config: 16 | description: 'A comma separated list of R-hub platforms to use.' 17 | type: string 18 | default: 'linux,windows,macos' 19 | name: 20 | description: 'Run name. You can leave this empty now.' 21 | type: string 22 | id: 23 | description: 'Unique ID. You can leave this empty now.' 
24 | type: string 25 | 26 | jobs: 27 | 28 | setup: 29 | runs-on: ubuntu-latest 30 | outputs: 31 | containers: ${{ steps.rhub-setup.outputs.containers }} 32 | platforms: ${{ steps.rhub-setup.outputs.platforms }} 33 | 34 | steps: 35 | # NO NEED TO CHECKOUT HERE 36 | - uses: r-hub/rhub2/actions/rhub-setup@v1 37 | with: 38 | config: ${{ github.event.inputs.config }} 39 | id: rhub-setup 40 | 41 | linux-containers: 42 | needs: setup 43 | if: ${{ needs.setup.outputs.containers != '[]' }} 44 | runs-on: ubuntu-latest 45 | name: ${{ matrix.config.label }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | config: ${{ fromJson(needs.setup.outputs.containers) }} 50 | container: 51 | image: ${{ matrix.config.container }} 52 | 53 | steps: 54 | - uses: r-hub/rhub2/actions/rhub-checkout@v1 55 | - uses: r-hub/rhub2/actions/rhub-check@v1 56 | with: 57 | token: ${{ secrets.RHUB_TOKEN }} 58 | job-config: ${{ matrix.config.job-config }} 59 | 60 | other-platforms: 61 | needs: setup 62 | if: ${{ needs.setup.outputs.platforms != '[]' }} 63 | runs-on: ${{ matrix.config.os }} 64 | name: ${{ matrix.config.label }} 65 | strategy: 66 | fail-fast: false 67 | matrix: 68 | config: ${{ fromJson(needs.setup.outputs.platforms) }} 69 | 70 | steps: 71 | - uses: r-hub/rhub2/actions/rhub-checkout@v1 72 | - uses: r-hub/rhub2/actions/rhub-setup-r@v1 73 | with: 74 | job-config: ${{ matrix.config.job-config }} 75 | token: ${{ secrets.RHUB_TOKEN }} 76 | - uses: r-hub/rhub2/actions/rhub-check@v1 77 | with: 78 | job-config: ${{ matrix.config.job-config }} 79 | token: ${{ secrets.RHUB_TOKEN }} 80 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: test-coverage.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | test-coverage: 14 | runs-on: ubuntu-latest 15 | env: 16 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - uses: r-lib/actions/setup-r@v2 22 | with: 23 | use-public-rspm: true 24 | 25 | - uses: r-lib/actions/setup-r-dependencies@v2 26 | with: 27 | extra-packages: any::covr, any::xml2 28 | needs: coverage 29 | 30 | - name: Test coverage 31 | run: | 32 | cov <- covr::package_coverage( 33 | quiet = FALSE, 34 | clean = FALSE, 35 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") 36 | ) 37 | print(cov) 38 | covr::to_cobertura(cov) 39 | shell: Rscript {0} 40 | 41 | - uses: codecov/codecov-action@v5 42 | with: 43 | # Fail if error if not on PR, or if on PR and token is given 44 | fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }} 45 | files: ./cobertura.xml 46 | plugins: noop 47 | disable_search: true 48 | token: ${{ secrets.CODECOV_TOKEN }} 49 | 50 | - name: Show testthat output 51 | if: always() 52 | run: | 53 | ## -------------------------------------------------------------------- 54 | find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true 55 | shell: bash 56 | 57 | - name: Upload test results 58 | if: failure() 59 | uses: actions/upload-artifact@v4 60 | with: 61 | name: coverage-test-failures 62 | path: ${{ runner.temp }}/package 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | /revdep 5 | docs 6 | -------------------------------------------------------------------------------- /.vscode/extensions.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "Posit.air-vscode" 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[r]": { 3 | "editor.formatOnSave": true, 4 | "editor.defaultFormatter": "Posit.air-vscode" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: xmlparsedata 2 | Title: Parse Data of 'R' Code as an 'XML' Tree 3 | Version: 1.0.5.9000 4 | Authors@R: c( 5 | person("Gábor", "Csárdi", , "csardi.gabor@gmail.com", role = c("aut", "cre")), 6 | person("Posit Software, PBC", role = c("cph", "fnd"), 7 | comment = c(ROR = "03wc8by49")), 8 | person("Mango Solutions", role = c("cph", "fnd")) 9 | ) 10 | Description: Convert the output of 'utils::getParseData()' to an 'XML' 11 | tree, that one can search via 'XPath', and easier to manipulate in 12 | general. 
13 | License: MIT + file LICENSE 14 | URL: https://github.com/r-lib/xmlparsedata#readme, 15 | https://r-lib.github.io/xmlparsedata/ 16 | BugReports: https://github.com/r-lib/xmlparsedata/issues 17 | Depends: 18 | R (>= 3.0.0) 19 | Suggests: 20 | covr, 21 | testthat (>= 3.0.0), 22 | xml2 23 | Config/Needs/website: tidyverse/tidytemplate 24 | Config/testthat/edition: 3 25 | Config/usethis/last-upkeep: 2025-05-07 26 | Encoding: UTF-8 27 | LazyData: true 28 | Roxygen: list(markdown = TRUE) 29 | RoxygenNote: 7.2.3 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2025 2 | COPYRIGHT HOLDER: xmlparsedata authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2025 xmlparsedata authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: README.md 3 | 4 | README.md: README.Rmd 5 | Rscript -e "library(knitr); knit('$<', output = '$@', quiet = TRUE)" 6 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(expr_as_xml) 4 | export(xml_parse_data) 5 | export(xml_parse_token_map) 6 | importFrom(utils,getParseData) 7 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | 2 | # xmlparsedata Development version 3 | 4 | * Re-parse character literals with octal-escaped expressions of width 1 or 2, 5 | e.g. `"\1"`, to work around a bug (in R<4.3.0) in `utils::getParseData()` 6 | (#25, @michaelchirico). 7 | 8 | * New `expr_as_xml()` to get an XML representation of R expressions (#27 @MichaelChirico). 9 | 10 | # xmlparsedata 1.0.5 11 | 12 | * Translate `\` in lambda expression to `OP-LAMBDA` (#18 @renkun-ken). 13 | 14 | * Drop all control characters, except horizontal tab and newline (#19). 15 | 16 | # xmlparsedata 1.0.4 17 | 18 | * Translate ] tokens to `OP-RIGHT-BRACKET` instead of 19 | `OP-RIGHT-BRACE` (#11 @AshesITR). 20 | 21 | * `xml_parse_data()` now works if `includeText = FALSE` 22 | (#14 @renkun-ken). 
23 | 24 | # xmlparsedata 1.0.3 25 | 26 | * Ensure that closing xml-tags for code expressions that end at the same 27 | position in a file respect start-first-end-last ordering in the produced xml. 28 | Ensures that the new `equal_assign` token in `getParseData()` for R-3.6 is 29 | handled appropriately. #5 @russHyde 30 | 31 | # xmlparsedata 1.0.2 32 | 33 | * Remove control characters `\003`, `\007`, `\010`, `\027`, as they are 34 | not allowed in XML 1.0, #1 @GregoireGauriot 35 | 36 | * Always convert parsed text to UTF-8 37 | 38 | # xmlparsedata 1.0.1 39 | 40 | * Fix a bug when the input is already a `getParseData()` data frame. 41 | https://github.com/jimhester/lintr filters the parsed data to include 42 | individual functions only, but only filters the data frame, not the 43 | underlying srcrefs, so when we call `getParseData()` on the data frame 44 | again, we get the data for the whole source file. This is fixed now by 45 | noticing that the input is already a data frame 46 | 47 | # xmlparsedata 1.0.0 48 | 49 | First public release. 50 | -------------------------------------------------------------------------------- /R/expr_as_xml.R: -------------------------------------------------------------------------------- 1 | #' Get an XML representation of an expression 2 | #' 3 | #' @param expr An expression. 4 | #' @export 5 | expr_as_xml <- function(expr) { 6 | if (!requireNamespace("xml2", quietly = TRUE)) { 7 | stop("'xml2' is required to return an XML object") 8 | } 9 | tmp_source <- tempfile() 10 | on.exit(unlink(tmp_source)) 11 | 12 | # NB: deparse() approach struggles with `{` expressions 13 | dput(substitute(expr), file = tmp_source) 14 | parsed_expr <- parse(tmp_source, keep.source = TRUE) 15 | # TODO(#28): Strip the line/column metadata which 16 | # is technically 'missing' for this case. 
17 | xml2::read_xml(xml_parse_data(parsed_expr)) 18 | } 19 | -------------------------------------------------------------------------------- /R/package.R: -------------------------------------------------------------------------------- 1 | #' Parse Data of R Code as an 'XML' Tree 2 | #' 3 | #' Convert the output of 'utils::getParseData()' to an 'XML' tree, that is 4 | #' searchable and easier to manipulate in general. 5 | #' 6 | #' @docType package 7 | #' @name xmlparsedata 8 | NULL 9 | 10 | #' Convert R parse data to XML 11 | #' 12 | #' In recent R versions the parser can attach source code location 13 | #' information to the parsed expressions. This information is often 14 | #' useful for static analysis, e.g. code linting. It can be accessed 15 | #' via the [utils::getParseData()] function. 16 | #' 17 | #' `xml_parse_data()` converts this information to an XML tree. 18 | #' The R parser's token names are preserved in the XML as much as 19 | #' possible, but some of them are not valid XML tag names, so they are 20 | #' renamed, see the [xml_parse_token_map] vector for the 21 | #' mapping. 22 | #' 23 | #' The top XML tag is `<exprlist>`, which is a list of 24 | #' expressions, each expression is an `<expr>` tag. Each tag 25 | #' has attributes that define the location: `line1`, `col1`, 26 | #' `line2`, `col2`. These are from the [getParseData()] 27 | #' data frame column names. Next, there are two attributes, 28 | #' `start` and `end`, which can be used as an ordering of 29 | #' expressions in the document. Note that while the values 30 | #' are correlated with (and in some cases may match exactly) 31 | #' positions in the document, this cannot be relied upon. 32 | #' 33 | #' See an example below. See also the README at 34 | #' <https://github.com/r-lib/xmlparsedata#readme> 35 | #' for examples on how to search the XML tree with the `xml2` package 36 | #' and XPath expressions.
37 | #' 38 | #' Note that `xml_parse_data()` silently drops all control characters 39 | #' (0x01-0x1f) from the input, except horizontal tab (0x09) and newline 40 | #' (0x0a), because they are invalid in XML 1.0. 41 | #' 42 | #' @param pretty Whether to pretty-indent the XML output. It has a small 43 | #' overhead which probably only matters for very large source files. 44 | #' @inheritParams utils::getParseData 45 | #' @return An XML string representing the parse data. See details below. 46 | #' 47 | #' @export 48 | #' @importFrom utils getParseData 49 | #' @seealso [xml_parse_token_map] for the token names. 50 | #' for more 51 | #' information and use cases. 52 | #' @examples 53 | #' code <- "function(a = 1, b = 2) {\n a + b\n}\n" 54 | #' expr <- parse(text = code, keep.source = TRUE) 55 | #' 56 | #' # The base R way: 57 | #' getParseData(expr) 58 | #' 59 | #' cat(xml_parse_data(expr, pretty = TRUE)) 60 | xml_parse_data <- function(x, includeText = NA, pretty = FALSE) { 61 | xml_header <- paste0( 62 | "\n\n" 64 | ) 65 | xml_footer <- "\n\n" 66 | 67 | ## Maybe it is already a data frame, e.g. 
when used in lintr 68 | if (is.data.frame(x)) { 69 | pd <- x 70 | } else { 71 | pd <- getParseData(x, includeText = includeText) 72 | if (is.null(pd)) { 73 | tmp_source <- tempfile() 74 | on.exit(unlink(tmp_source)) 75 | dput(x, file = tmp_source) 76 | 77 | x <- parse(tmp_source, keep.source = TRUE) 78 | pd <- getParseData(x, includeText = includeText) 79 | pd$line1 <- pd$line2 <- pd$col1 <- pd$col2 <- NA_integer_ 80 | } 81 | } 82 | 83 | if (!nrow(pd)) { 84 | return(paste0(xml_header, xml_footer)) 85 | } 86 | 87 | pd <- fix_comments(pd) 88 | 89 | if (!is.data.frame(x)) { 90 | # workaround for R parser bug #18323; see #25 91 | str_const_mismatch <- pd$token == "STR_CONST" & 92 | pd$col2 - pd$col1 != nchar(pd$text) - 1L & 93 | # skip if there are tabs, which would require complicating the logic a lot 94 | !grepl("\t", pd$text, fixed = TRUE) 95 | if (any(str_const_mismatch)) { 96 | pd$text[str_const_mismatch] <- reparse_octal( 97 | pd[str_const_mismatch, ], 98 | attr(x, "srcfile")$lines 99 | ) 100 | } 101 | } 102 | 103 | if (!is.null(pd$text)) { 104 | pd$text <- enc2utf8(pd$text) 105 | } 106 | 107 | ## Tags for all nodes, teminal nodes have end tags as well 108 | pd$token <- map_token(pd$token) 109 | 110 | ## Positions, to make it easy to compare what comes first 111 | maxcol <- max(pd$col1, pd$col2) + 1L 112 | pd$start <- pd$line1 * maxcol + pd$col1 113 | pd$end <- pd$line2 * maxcol + pd$col2 114 | 115 | terminal_tag <- character(nrow(pd)) 116 | terminal_tag[pd$terminal] <- paste0("") 117 | if (anyNA(pd$line1)) { 118 | pd$tag <- paste0( 119 | "<", 120 | pd$token, 121 | ">", 122 | if (!is.null(pd$text)) xml_encode(pd$text) else "", 123 | terminal_tag 124 | ) 125 | } else { 126 | pd$tag <- paste0( 127 | "<", 128 | pd$token, 129 | " line1=\"", 130 | pd$line1, 131 | "\" col1=\"", 132 | pd$col1, 133 | "\" line2=\"", 134 | pd$line2, 135 | "\" col2=\"", 136 | pd$col2, 137 | "\" start=\"", 138 | pd$start, 139 | "\" end=\"", 140 | pd$end, 141 | "\">", 142 | if 
(!is.null(pd$text)) xml_encode(pd$text) else "", 143 | terminal_tag 144 | ) 145 | } 146 | 147 | ## Add an extra terminal tag for each non-terminal one 148 | pd2 <- pd[!pd$terminal, ] 149 | if (nrow(pd2)) { 150 | pd2$terminal <- TRUE 151 | pd2$parent <- -1 152 | pd2$line1 <- pd2$line2 153 | pd2$col1 <- pd2$col2 154 | pd2$line2 <- pd2$line2 - 1L 155 | pd2$col2 <- pd2$col2 - 1L 156 | pd2$tag <- paste0("") 157 | pd <- rbind(pd, pd2, make.row.names = FALSE) 158 | } 159 | 160 | ## Order the nodes properly 161 | ## - the terminal nodes from pd2 may be nested inside each other, when 162 | ## this happens they will have the same line1, col1, line2, col2 and 163 | ## terminal status; and 'start' is used to break ties 164 | ord <- order(pd$line1, pd$col1, -pd$line2, -pd$col2, pd$terminal, -pd$start) 165 | pd <- pd[ord, ] 166 | 167 | if (pretty) { 168 | str <- !pd$terminal 169 | end <- pd$parent == -1 170 | ind <- 2L + cumsum(str * 2L + end * (-2L)) - str * 2L 171 | xml <- paste0(strrep(" ", ind), pd$tag, collapse = "\n") 172 | } else { 173 | xml <- paste(pd$tag, collapse = "\n") 174 | } 175 | 176 | paste0(xml_header, xml, xml_footer) 177 | } 178 | 179 | fix_comments <- function(pd) { 180 | pd$parent[pd$parent < 0] <- 0 181 | pd 182 | } 183 | 184 | map_token <- function(token) { 185 | needs_translation <- token %in% names(xml_parse_token_map) 186 | token[needs_translation] <- xml_parse_token_map[token[needs_translation]] 187 | token 188 | } 189 | 190 | #' Map token names of the R parser to token names in 191 | #' [xml_parse_data()] 192 | #' 193 | #' Some of the R token names are not valid XML tag names, 194 | #' so [xml_parse_data()] needs to replace them to create a 195 | #' valid XML file. 
196 | #' 197 | #' @export 198 | #' @seealso [xml_parse_data()] 199 | 200 | xml_parse_token_map <- c( 201 | "'?'" = "OP-QUESTION", 202 | "'~'" = "OP-TILDE", 203 | "'+'" = "OP-PLUS", 204 | "'-'" = "OP-MINUS", 205 | "'*'" = "OP-STAR", 206 | "'/'" = "OP-SLASH", 207 | "':'" = "OP-COLON", 208 | "'^'" = "OP-CARET", 209 | "'$'" = "OP-DOLLAR", 210 | "'@'" = "OP-AT", 211 | "'('" = "OP-LEFT-PAREN", 212 | "'['" = "OP-LEFT-BRACKET", 213 | "';'" = "OP-SEMICOLON", 214 | "'{'" = "OP-LEFT-BRACE", 215 | "'}'" = "OP-RIGHT-BRACE", 216 | "')'" = "OP-RIGHT-PAREN", 217 | "'!'" = "OP-EXCLAMATION", 218 | "']'" = "OP-RIGHT-BRACKET", 219 | "','" = "OP-COMMA", 220 | "'\\\\'" = "OP-LAMBDA" 221 | ) 222 | 223 | xml_encode <- function(x) { 224 | x <- gsub("&", "&amp;", x, fixed = TRUE) 225 | x <- gsub("<", "&lt;", x, fixed = TRUE) 226 | x <- gsub(">", "&gt;", x, fixed = TRUE) 227 | # most control characters are not allowed in XML, except tab and nl 228 | x <- gsub("[\x01-\x08\x0b-\x1f]", "", x, useBytes = TRUE) 229 | x 230 | } 231 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | reparse_octal <- function(pd, lines) { 2 | out <- character(nrow(pd)) 3 | single_line <- pd$line1 == pd$line2 4 | out[single_line] <- substr( 5 | lines[pd$line1[single_line]], 6 | pd$col1[single_line], 7 | pd$col2[single_line] 8 | ) 9 | for (ii in which(!single_line)) { 10 | out[ii] <- paste( 11 | c( 12 | substring(lines[pd$line1[ii]], pd$col1[ii]), 13 | if (pd$line1[ii] < pd$line2[ii] - 1L) 14 | lines[(pd$line1[ii] + 1L):(pd$line2[ii] - 1L)], 15 | substr(lines[pd$line2[ii]], 1L, pd$col2[ii]) 16 | ), 17 | collapse = "\n" 18 | ) 19 | } 20 | out 21 | } 22 | -------------------------------------------------------------------------------- /R/xmlparsedata-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | #' @aliases xmlparsedata-package
3 | "_PACKAGE" 4 | 5 | ## usethis namespace: start 6 | ## usethis namespace: end 7 | NULL 8 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | ```{r} 6 | #| label: setup 7 | #| echo: false 8 | #| message: false 9 | knitr::opts_chunk$set( 10 | comment = "#>", 11 | tidy = FALSE, 12 | error = FALSE 13 | ) 14 | ``` 15 | 16 | # xmlparsedata 17 | 18 | > Parse Data of R Code as an 'XML' Tree 19 | 20 | 21 | [![R-CMD-check](https://github.com/r-lib/xmlparsedata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/r-lib/xmlparsedata/actions/workflows/R-CMD-check.yaml) 22 | [![](https://www.r-pkg.org/badges/version/xmlparsedata)](https://www.r-pkg.org/pkg/xmlparsedata) 23 | [![CRAN RStudio mirror downloads](https://cranlogs.r-pkg.org/badges/xmlparsedata)](https://www.r-pkg.org/pkg/xmlparsedata) 24 | [![Codecov test coverage](https://codecov.io/gh/r-lib/xmlparsedata/graph/badge.svg)](https://app.codecov.io/gh/r-lib/xmlparsedata) 25 | 26 | 27 | Convert the output of 'utils::getParseData()' to an 'XML' tree, that is 28 | searchable and easier to manipulate in general. 
29 | 30 | --- 31 | 32 | - [Installation](#installation) 33 | - [Usage](#usage) 34 | - [Introduction](#introduction) 35 | - [`utils::getParseData()`](#utilsgetparsedata) 36 | - [`xml_parse_data()`](#xml_parse_data) 37 | - [Renaming some tokens](#renaming-some-tokens) 38 | - [Search the parse tree with `xml2`](#search-the-parse-tree-with-xml2) 39 | - [License](#license) 40 | 41 | ## Installation 42 | 43 | Stable version: 44 | 45 | ```{r} 46 | #| eval: false 47 | install.packages("xmlparsedata") 48 | ``` 49 | 50 | Development version: 51 | 52 | ```{r} 53 | #| eval: false 54 | pak::pak("r-lib/xmlparsedata") 55 | ``` 56 | 57 | ## Usage 58 | 59 | ### Introduction 60 | 61 | In recent R versions the parser can attach source code location 62 | information to the parsed expressions. This information is often 63 | useful for static analysis, e.g. code linting. It can be accessed 64 | via the `utils::getParseData()` function. 65 | 66 | `xmlparsedata` converts this information to an XML tree. 67 | The R parser's token names are preserved in the XML as much as 68 | possible, but some of them are not valid XML tag names, so they are 69 | renamed, see below. 70 | 71 | ### `utils::getParseData()` 72 | 73 | `utils::getParseData()` summarizes the parse information in a data 74 | frame. The data frame has one row per expression tree node, and each 75 | node points to its parent. Here is a small example: 76 | 77 | ```{r} 78 | p <- parse( 79 | text = "function(a = 1, b = 2) { \n a + b\n}\n", 80 | keep.source = TRUE 81 | ) 82 | getParseData(p) 83 | ``` 84 | 85 | ### `xml_parse_data()` 86 | 87 | `xmlparsedata::xml_parse_data()` converts the parse information to 88 | an XML document. It works similarly to `getParseData()`. Specify the 89 | `pretty = TRUE` option to pretty-indent the XML output. Note that this 90 | has a small overhead, so if you are parsing large files, I suggest you 91 | omit it. 
92 | 93 | ```{r} 94 | library(xmlparsedata) 95 | xml <- xml_parse_data(p, pretty = TRUE) 96 | cat(xml) 97 | ``` 98 | 99 | The top XML tag is `<exprlist>`, which is a list of 100 | expressions, each expression is an `<expr>` tag. Each tag 101 | has attributes that define the location: `line1`, `col1`, 102 | `line2`, `col2`. These correspond to the column names of the 103 | `getParseData()` data frame. 104 | 105 | ### Renaming some tokens 106 | 107 | The R parser's token names are preserved in the XML as much as 108 | possible, but some of them are not valid XML tag names, so they are 109 | renamed, see the `xml_parse_token_map` vector for the mapping: 110 | 111 | ```{r} 112 | xml_parse_token_map 113 | ``` 114 | 115 | ### Search the parse tree with `xml2` 116 | 117 | The `xml2` package can search XML documents using 118 | [XPath](https://en.wikipedia.org/wiki/XPath) expressions. This is often 119 | useful to search for specific code patterns. 120 | 121 | As an example we search a source file from base R for `1:nrow(<expr>)` 122 | expressions, which are usually unsafe, as `nrow()` might be zero, 123 | and then the expression is equivalent to `1:0`, i.e. `c(1, 0)`, which 124 | is usually not the intended behavior. 
125 | 126 | We load and parse the file directly from the R source code mirror 127 | at https://github.com/wch/r-source: 128 | 129 | ```{r} 130 | url <- paste0( 131 | "https://raw.githubusercontent.com/wch/r-source/", 132 | "4fc93819fc7401b8695ce57a948fe163d4188f47/src/library/tools/R/xgettext.R" 133 | ) 134 | src <- readLines(url) 135 | p <- parse(text = src, keep.source = TRUE) 136 | ``` 137 | 138 | and we convert it to an XML tree: 139 | 140 | ```{r} 141 | library(xml2) 142 | xml <- read_xml(xml_parse_data(p)) 143 | ``` 144 | 145 | The `1:nrow(<expr>)` expression corresponds to the following 146 | tree in R: 147 | 148 | ``` 149 | <expr> 150 | +-- <expr> 151 | +-- NUM_CONST: 1 152 | +-- ':' 153 | +-- <expr> 154 | +-- <expr> 155 | +-- SYMBOL_FUNCTION_CALL nrow 156 | +-- '(' 157 | +-- <expr> 158 | +-- ')' 159 | ``` 160 | 161 | ```{r} 162 | bad <- xml_parse_data( 163 | parse(text = "1:nrow(expr)", keep.source = TRUE), 164 | pretty = TRUE 165 | ) 166 | cat(bad) 167 | ``` 168 | 169 | This translates to the following XPath expression (ignoring 170 | the last three tokens of the `nrow(<expr>)` call): 171 | 172 | ```{r} 173 | xp <- paste0( 174 | "//expr", 175 | "[expr[NUM_CONST[text()='1']]]", 176 | "[OP-COLON]", 177 | "[expr[expr[SYMBOL_FUNCTION_CALL[text()='nrow']]]]" 178 | ) 179 | ``` 180 | 181 | We can search for this subtree with `xml2::xml_find_all()`: 182 | 183 | ```{r} 184 | bad_nrow <- xml_find_all(xml, xp) 185 | bad_nrow 186 | ``` 187 | 188 | There is only one hit, in line 334: 189 | 190 | ```{r} 191 | cbind(332:336, src[332:336]) 192 | ``` 193 | 194 | ## Code of Conduct 195 | 196 | Please note that the xmlparsedata project is released with a 197 | [Contributor Code of Conduct](https://r-lib.github.io/xmlparsedata/CODE_OF_CONDUCT.html). 198 | By contributing to this project, you agree to abide by its terms. 
199 | 200 | ## License 201 | 202 | MIT © Mango Solutions, RStudio 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # xmlparsedata 3 | 4 | > Parse Data of R Code as an ‘XML’ Tree 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/r-lib/xmlparsedata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/r-lib/xmlparsedata/actions/workflows/R-CMD-check.yaml) 9 | [![](https://www.r-pkg.org/badges/version/xmlparsedata)](https://www.r-pkg.org/pkg/xmlparsedata) 10 | [![CRAN RStudio mirror 11 | downloads](https://cranlogs.r-pkg.org/badges/xmlparsedata)](https://www.r-pkg.org/pkg/xmlparsedata) 12 | [![Codecov test 13 | coverage](https://codecov.io/gh/r-lib/xmlparsedata/graph/badge.svg)](https://app.codecov.io/gh/r-lib/xmlparsedata) 14 | 15 | 16 | Convert the output of ‘utils::getParseData()’ to an ‘XML’ tree, that is 17 | searchable and easier to manipulate in general. 18 | 19 | ------------------------------------------------------------------------ 20 | 21 | - [Installation](#installation) 22 | - [Usage](#usage) 23 | - [Introduction](#introduction) 24 | - [`utils::getParseData()`](#utilsgetparsedata) 25 | - [`xml_parse_data()`](#xml_parse_data) 26 | - [Renaming some tokens](#renaming-some-tokens) 27 | - [Search the parse tree with 28 | `xml2`](#search-the-parse-tree-with-xml2) 29 | - [License](#license) 30 | 31 | ## Installation 32 | 33 | Stable version: 34 | 35 | ``` r 36 | install.packages("xmlparsedata") 37 | ``` 38 | 39 | Development version: 40 | 41 | ``` r 42 | pak::pak("r-lib/xmlparsedata") 43 | ``` 44 | 45 | ## Usage 46 | 47 | ### Introduction 48 | 49 | In recent R versions the parser can attach source code location 50 | information to the parsed expressions. This information is often useful 51 | for static analysis, e.g. code linting. It can be accessed via the 52 | `utils::getParseData()` function. 
53 | 54 | `xmlparsedata` converts this information to an XML tree. The R parser’s 55 | token names are preserved in the XML as much as possible, but some of 56 | them are not valid XML tag names, so they are renamed, see below. 57 | 58 | ### `utils::getParseData()` 59 | 60 | `utils::getParseData()` summarizes the parse information in a data 61 | frame. The data frame has one row per expression tree node, and each 62 | node points to its parent. Here is a small example: 63 | 64 | ``` r 65 | p <- parse( 66 | text = "function(a = 1, b = 2) { \n a + b\n}\n", 67 | keep.source = TRUE 68 | ) 69 | getParseData(p) 70 | ``` 71 | 72 | #> line1 col1 line2 col2 id parent token terminal text 73 | #> 33 1 1 3 1 33 0 expr FALSE 74 | #> 1 1 1 1 8 1 33 FUNCTION TRUE function 75 | #> 2 1 9 1 9 2 33 '(' TRUE ( 76 | #> 3 1 10 1 10 3 33 SYMBOL_FORMALS TRUE a 77 | #> 4 1 12 1 12 4 33 EQ_FORMALS TRUE = 78 | #> 5 1 14 1 14 5 6 NUM_CONST TRUE 1 79 | #> 6 1 14 1 14 6 33 expr FALSE 80 | #> 7 1 15 1 15 7 33 ',' TRUE , 81 | #> 10 1 17 1 17 10 33 SYMBOL_FORMALS TRUE b 82 | #> 11 1 19 1 19 11 33 EQ_FORMALS TRUE = 83 | #> 12 1 21 1 21 12 13 NUM_CONST TRUE 2 84 | #> 13 1 21 1 21 13 33 expr FALSE 85 | #> 14 1 22 1 22 14 33 ')' TRUE ) 86 | #> 30 1 24 3 1 30 33 expr FALSE 87 | #> 17 1 24 1 24 17 30 '{' TRUE { 88 | #> 25 2 3 2 7 25 30 expr FALSE 89 | #> 19 2 3 2 3 19 21 SYMBOL TRUE a 90 | #> 21 2 3 2 3 21 25 expr FALSE 91 | #> 20 2 5 2 5 20 25 '+' TRUE + 92 | #> 22 2 7 2 7 22 24 SYMBOL TRUE b 93 | #> 24 2 7 2 7 24 25 expr FALSE 94 | #> 28 3 1 3 1 28 30 '}' TRUE } 95 | 96 | ### `xml_parse_data()` 97 | 98 | `xmlparsedata::xml_parse_data()` converts the parse information to an 99 | XML document. It works similarly to `getParseData()`. Specify the 100 | `pretty = TRUE` option to pretty-indent the XML output. Note that this 101 | has a small overhead, so if you are parsing large files, I suggest you 102 | omit it. 
103 | 104 | ``` r 105 | library(xmlparsedata) 106 | xml <- xml_parse_data(p, pretty = TRUE) 107 | cat(xml) 108 | ``` 109 | 110 | #> 111 | #> 112 | #> 113 | #> function 114 | #> ( 115 | #> a 116 | #> = 117 | #> 118 | #> 1 119 | #> 120 | #> , 121 | #> b 122 | #> = 123 | #> 124 | #> 2 125 | #> 126 | #> ) 127 | #> 128 | #> { 129 | #> 130 | #> 131 | #> a 132 | #> 133 | #> + 134 | #> 135 | #> b 136 | #> 137 | #> 138 | #> } 139 | #> 140 | #> 141 | #> 142 | 143 | The top XML tag is `<exprlist>`, which is a list of expressions, each 144 | expression is an `<expr>` tag. Each tag has attributes that define the 145 | location: `line1`, `col1`, `line2`, `col2`. These correspond to the 146 | column names of the `getParseData()` data frame. 147 | 148 | ### Renaming some tokens 149 | 150 | The R parser’s token names are preserved in the XML as much as possible, 151 | but some of them are not valid XML tag names, so they are renamed, see 152 | the `xml_parse_token_map` vector for the mapping: 153 | 154 | ``` r 155 | xml_parse_token_map 156 | ``` 157 | 158 | #> '?' '~' '+' '-' 159 | #> "OP-QUESTION" "OP-TILDE" "OP-PLUS" "OP-MINUS" 160 | #> '*' '/' ':' '^' 161 | #> "OP-STAR" "OP-SLASH" "OP-COLON" "OP-CARET" 162 | #> '$' '@' '(' '[' 163 | #> "OP-DOLLAR" "OP-AT" "OP-LEFT-PAREN" "OP-LEFT-BRACKET" 164 | #> ';' '{' '}' ')' 165 | #> "OP-SEMICOLON" "OP-LEFT-BRACE" "OP-RIGHT-BRACE" "OP-RIGHT-PAREN" 166 | #> '!' ']' ',' '\\\\' 167 | #> "OP-EXCLAMATION" "OP-RIGHT-BRACKET" "OP-COMMA" "OP-LAMBDA" 168 | 169 | ### Search the parse tree with `xml2` 170 | 171 | The `xml2` package can search XML documents using 172 | [XPath](https://en.wikipedia.org/wiki/XPath) expressions. This is often 173 | useful to search for specific code patterns. 174 | 175 | As an example we search a source file from base R for `1:nrow(<expr>)` 176 | expressions, which are usually unsafe, as `nrow()` might be zero, and 177 | then the expression is equivalent to `1:0`, i.e. `c(1, 0)`, which is 178 | usually not the intended behavior. 
179 | 180 | We load and parse the file directly from the R source code mirror at 181 | <https://github.com/wch/r-source>: 182 | 183 | ``` r 184 | url <- paste0( 185 | "https://raw.githubusercontent.com/wch/r-source/", 186 | "4fc93819fc7401b8695ce57a948fe163d4188f47/src/library/tools/R/xgettext.R" 187 | ) 188 | src <- readLines(url) 189 | p <- parse(text = src, keep.source = TRUE) 190 | ``` 191 | 192 | and we convert it to an XML tree: 193 | 194 | ``` r 195 | library(xml2) 196 | xml <- read_xml(xml_parse_data(p)) 197 | ``` 198 | 199 | The `1:nrow(<expr>)` expression corresponds to the following tree in R: 200 | 201 | <expr> 202 | +-- <expr> 203 | +-- NUM_CONST: 1 204 | +-- ':' 205 | +-- <expr> 206 | +-- <expr> 207 | +-- SYMBOL_FUNCTION_CALL nrow 208 | +-- '(' 209 | +-- <expr> 210 | +-- ')' 211 | 212 | ``` r 213 | bad <- xml_parse_data( 214 | parse(text = "1:nrow(expr)", keep.source = TRUE), 215 | pretty = TRUE 216 | ) 217 | cat(bad) 218 | ``` 219 | 220 | #> 221 | #> 222 | #> 223 | #> 224 | #> 1 225 | #> 226 | #> : 227 | #> 228 | #> 229 | #> nrow 230 | #> 231 | #> ( 232 | #> 233 | #> expr 234 | #> 235 | #> ) 236 | #> 237 | #> 238 | #> 239 | 240 | This translates to the following XPath expression (ignoring the last 241 | three tokens of the `nrow(<expr>)` call): 242 | 243 | ``` r 244 | xp <- paste0( 245 | "//expr", 246 | "[expr[NUM_CONST[text()='1']]]", 247 | "[OP-COLON]", 248 | "[expr[expr[SYMBOL_FUNCTION_CALL[text()='nrow']]]]" 249 | ) 250 | ``` 251 | 252 | We can search for this subtree with `xml2::xml_find_all()`: 253 | 254 | ``` r 255 | bad_nrow <- xml_find_all(xml, xp) 256 | bad_nrow 257 | ``` 258 | 259 | #> {xml_nodeset (1)} 260 | #> [1] [2,] "333" " else" 271 | #> [3,] "334" "\tfor (i in 1:nrow(x)) {" 272 | #> [4,] "335" "\t if (is.na(x[i, 2L])) cols <- c(1L, 3:5)" 273 | #> [5,] "336" "\t else cols <- 1:5" 274 | 275 | ## Code of Conduct 276 | 277 | Please note that the xmlparsedata project is released with a 278 | [Contributor Code of 279 | Conduct](https://r-lib.github.io/xmlparsedata/CODE_OF_CONDUCT.html). 
By 280 | contributing to this project, you agree to abide by its terms. 281 | 282 | ## License 283 | 284 | MIT © Mango Solutions, RStudio 285 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://r-lib.github.io/xmlparsedata/ 2 | 3 | template: 4 | package: tidytemplate 5 | bootstrap: 5 6 | includes: 7 | in_header: | 8 | 9 | 10 | development: 11 | mode: auto 12 | -------------------------------------------------------------------------------- /air.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-lib/xmlparsedata/e6c9977f518bb6006328c30e4cbd21f54ab00475/air.toml -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /man/expr_as_xml.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/expr_as_xml.R 3 | \name{expr_as_xml} 4 | \alias{expr_as_xml} 5 | \title{Get an XML representation of an expression} 6 | \usage{ 7 | expr_as_xml(expr) 8 | } 9 | \arguments{ 10 | \item{expr}{An expression.} 11 | } 12 | \description{ 13 | Get an XML representation of an expression 14 | } 15 | -------------------------------------------------------------------------------- /man/xml_parse_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit 
documentation in R/package.R 3 | \name{xml_parse_data} 4 | \alias{xml_parse_data} 5 | \title{Convert R parse data to XML} 6 | \usage{ 7 | xml_parse_data(x, includeText = NA, pretty = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{ 11 | an expression returned from \code{\link{parse}}, or a function or other 12 | object with source reference information 13 | } 14 | 15 | \item{includeText}{ 16 | logical; whether to include the text of parsed items in the result 17 | } 18 | 19 | \item{pretty}{Whether to pretty-indent the XML output. It has a small 20 | overhead which probably only matters for very large source files.} 21 | } 22 | \value{ 23 | An XML string representing the parse data. See details below. 24 | } 25 | \description{ 26 | In recent R versions the parser can attach source code location 27 | information to the parsed expressions. This information is often 28 | useful for static analysis, e.g. code linting. It can be accessed 29 | via the \code{\link[utils:getParseData]{utils::getParseData()}} function. 30 | } 31 | \details{ 32 | \code{xml_parse_data()} converts this information to an XML tree. 33 | The R parser's token names are preserved in the XML as much as 34 | possible, but some of them are not valid XML tag names, so they are 35 | renamed, see the \link{xml_parse_token_map} vector for the 36 | mapping. 37 | 38 | The top XML tag is \verb{<exprlist>}, which is a list of 39 | expressions, each expression is an \verb{<expr>} tag. Each tag 40 | has attributes that define the location: \code{line1}, \code{col1}, 41 | \code{line2}, \code{col2}. These are from the \code{\link[=getParseData]{getParseData()}} 42 | data frame column names. Next, there are two attributes, 43 | \code{start} and \code{end}, which can be used as an ordering of 44 | expressions in the document. Note that while the values 45 | are correlated with (and in some cases may match exactly) 46 | positions in the document, this cannot be relied upon. 47 | 48 | See an example below. 
See also the README at 49 | \url{https://github.com/r-lib/xmlparsedata#readme} 50 | for examples on how to search the XML tree with the \code{xml2} package 51 | and XPath expressions. 52 | 53 | Note that \code{xml_parse_data()} silently drops all control characters 54 | (0x01-0x1f) from the input, except horizontal tab (0x09) and newline 55 | (0x0a), because they are invalid in XML 1.0. 56 | } 57 | \examples{ 58 | code <- "function(a = 1, b = 2) {\n a + b\n}\n" 59 | expr <- parse(text = code, keep.source = TRUE) 60 | 61 | # The base R way: 62 | getParseData(expr) 63 | 64 | cat(xml_parse_data(expr, pretty = TRUE)) 65 | } 66 | \seealso{ 67 | \link{xml_parse_token_map} for the token names. 68 | \url{https://github.com/r-lib/xmlparsedata#readme} for more 69 | information and use cases. 70 | } 71 | -------------------------------------------------------------------------------- /man/xml_parse_token_map.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/package.R 3 | \docType{data} 4 | \name{xml_parse_token_map} 5 | \alias{xml_parse_token_map} 6 | \title{Map token names of the R parser to token names in 7 | \code{\link[=xml_parse_data]{xml_parse_data()}}} 8 | \format{ 9 | An object of class \code{character} of length 20. 10 | } 11 | \usage{ 12 | xml_parse_token_map 13 | } 14 | \description{ 15 | Some of the R token names are not valid XML tag names, 16 | so \code{\link[=xml_parse_data]{xml_parse_data()}} needs to replace them to create a 17 | valid XML file. 
18 | } 19 | \seealso{ 20 | \code{\link[=xml_parse_data]{xml_parse_data()}} 21 | } 22 | \keyword{datasets} 23 | -------------------------------------------------------------------------------- /man/xmlparsedata-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xmlparsedata-package.R 3 | \docType{package} 4 | \name{xmlparsedata-package} 5 | \alias{xmlparsedata-package} 6 | \alias{_PACKAGE} 7 | \title{xmlparsedata: Parse Data of 'R' Code as an 'XML' Tree} 8 | \description{ 9 | Convert the output of 'utils::getParseData()' to an 'XML' tree, that one can search via 'XPath', and easier to manipulate in general. 10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://github.com/r-lib/xmlparsedata#readme} 15 | \item \url{https://r-lib.github.io/xmlparsedata/} 16 | \item Report bugs at \url{https://github.com/r-lib/xmlparsedata/issues} 17 | } 18 | 19 | } 20 | \author{ 21 | \strong{Maintainer}: Gábor Csárdi \email{csardi.gabor@gmail.com} 22 | 23 | Other contributors: 24 | \itemize{ 25 | \item Posit Software, PBC [copyright holder, funder] 26 | \item Mango Solutions [copyright holder, funder] 27 | } 28 | 29 | } 30 | \keyword{internal} 31 | -------------------------------------------------------------------------------- /man/xmlparsedata.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/package.R 3 | \docType{package} 4 | \name{xmlparsedata} 5 | \alias{xmlparsedata} 6 | \title{Parse Data of R Code as an 'XML' Tree} 7 | \description{ 8 | Convert the output of 'utils::getParseData()' to an 'XML' tree, that is 9 | searchable and easier to manipulate in general. 
# }   <- closing brace of the preceding man/xmlparsedata.Rd listing
# --------------------------------------------------------------------------------
# /tests/testthat.R:
# --------------------------------------------------------------------------------
library(testthat)
library(xmlparsedata)

# The whole suite reads the generated XML back with xml2, so it is skipped
# when xml2 is not installed.
if (requireNamespace("xml2", quietly = TRUE)) {
  test_check("xmlparsedata")
}

# --------------------------------------------------------------------------------
# /tests/testthat/test-expr_as_xml.R:
# --------------------------------------------------------------------------------
test_that("XML object is returned with correct structure", {
  skip_if_not_installed("xml2")

  expect_silent({
    expr_xml <- expr_as_xml(mtcars[, "cyl"])
  })
  expect_s3_class(expr_xml, "xml_document")
  # Children of the single top-level <expr>, in document order.
  expect_identical(
    vapply(
      xml2::xml_children(xml2::xml_child(expr_xml)),
      xml2::xml_name,
      character(1L)
    ),
    c("expr", "OP-LEFT-BRACKET", "OP-COMMA", "expr", "OP-RIGHT-BRACKET")
  )
})

test_that("multi-expression case also works", {
  expect_silent({
    expr_xml <- expr_as_xml({
      1 + 1
      sqrt(rnorm(100))
    })
  })
  expect_identical(xml2::xml_name(expr_xml), "exprlist")
  # `{`, `1 + 1`, `sqrt(...)`, and `}`
  expect_length(xml2::xml_children(xml2::xml_child(expr_xml)), 4L)
})

test_that("literals are also fine", {
  expect_silent(expr_as_xml("a b c"))
  expect_silent(expr_as_xml(100L))
})

# --------------------------------------------------------------------------------
# /tests/testthat/test-xml_parse_data.R:
# --------------------------------------------------------------------------------
test_that("empty input", {
  xml <- xml_parse_data(parse(text = "", keep.source = TRUE))
  expect_true(is.character(xml))
  expect_true(length(xml) == 1)
  # NOTE(review): a pattern made only of `\\s*` matches any string, so this
  # expectation cannot fail. It looks as if XML tag names were lost from the
  # pattern (perhaps "<exprlist>\\s*</exprlist>") -- confirm and restore.
  expect_match(xml, "\\s*")
  expect_silent(x <- xml2::read_xml(xml))
})

test_that("trivial input", {
  xml <- xml_parse_data(parse(text = "# comment\n", keep.source = TRUE))
  expect_true(is.character(xml))
  expect_true(length(xml) == 1)
  # NOTE(review): vacuous pattern, see "empty input" above -- confirm the
  # intended tag names and restore them.
  expect_match(xml, "\\s*\\s*")
  expect_silent(x <- xml2::read_xml(xml))

  xml <- xml_parse_data(parse(text = "1", keep.source = TRUE))
  # NOTE(review): vacuous pattern, see "empty input" above.
  expect_match(
    xml,
    paste0(
      "\\s*\\s*",
      "\\s*"
    )
  )
  expect_silent(x <- xml2::read_xml(xml))
})

test_that("non-trivial input", {
  ip <- deparse(utils::install.packages)
  xml <- xml_parse_data(parse(text = ip, keep.source = TRUE))
  expect_silent(x <- xml2::read_xml(xml))

  # Same source again, this time through the pretty-printing code path.
  dp <- deparse(utils::install.packages)
  xml <- xml_parse_data(
    parse(text = dp, keep.source = TRUE),
    pretty = TRUE
  )
  expect_silent(x <- xml2::read_xml(xml))
})

test_that("UTF-8 is OK", {
  src <- enc2native("# comment with éápő")
  xml <- xml_parse_data(parse(text = src, keep.source = TRUE))
  x <- xml2::read_xml(xml)

  comment <- xml2::xml_children(x)
  col1 <- xml2::xml_attr(comment, "col1")
  col2 <- xml2::xml_attr(comment, "col2")

  # The col1/col2 attributes must cut the whole comment back out of `src`.
  expect_equal(
    substring(src, col1, col2),
    src
  )

  src <- enc2native("# 現行の学校文法では、英語にあるような「目的語」「補語」")
  xml <- xml_parse_data(parse(text = src, keep.source = TRUE))
  x <- xml2::read_xml(xml)

  comment <- xml2::xml_children(x)
  col1 <- xml2::xml_attr(comment, "col1")
  col2 <- xml2::xml_attr(comment, "col2")

  expect_equal(
    substring(src, col1, col2),
    iconv(src, to = "UTF-8")
  )

  src <- enc2native("`%ééé%` <- function(l, r) l + r")
  xml <- xml_parse_data(parse(text = src, keep.source = TRUE), pretty = TRUE)

  op <- xml2::xml_find_all(
    xml2::read_xml(xml),
    iconv(
      enc2native("/exprlist/expr/expr/SYMBOL[text()='`%ééé%`']"),
      to = "UTF-8"
    )
  )
  expect_equal(length(op), 1)
})

test_that("data frame input", {
  p <- parse(text = "1 + 1", keep.source = TRUE)

  # Strip the parse data down to a plain data frame: no srcfile attribute
  # and no extra class.
  pd <- getParseData(p)
  attr(pd, "srcfile") <- NULL
  class(pd) <- "data.frame"
  x1 <- xml_parse_data(pd)

  x2 <- xml_parse_data(p)

  expect_equal(x1, x2)
})


test_that("Control-C character", {
  # Every comment carries a control character that the encoder deletes
  # (0x03, 0x07, 0x1b, 0x0c, 0x0b), so the XML must still be readable.
  # Escape is octal \033 (the former \027 is ETB), and vertical tab is \v
  # (the former \t is a horizontal tab, which is kept rather than dropped).
  src <- paste0(
    "# Control-C \003\n",
    "# Bell \007\n",
    "# Escape \033\n",
    "# Form feed \f\n",
    "# Vertical tab \v\n"
  )
  xml <- xml_parse_data(parse(text = src, keep.source = TRUE))
  x <- xml2::read_xml(xml)
  expect_s3_class(x, "xml_document")
})


test_that("equal_assign is handled on R 3.6", {
  # `a = 1` is an example of an R statement that gets parsed into nested xml
  # nodes that have different token / tagnames (following the introduction of
  # the `equal_assign` token to getParseData() in R-3.6), but the same ending
  # position in the original code. Tokens/expressions that start before should
  # end after any nested subexpressions in the resulting xml:

  xml <- xml_parse_data(parse(text = "a = 1", keep.source = TRUE))
  expect_true(is.character(xml))
  expect_true(length(xml) == 1)
  expect_silent(x <- xml2::read_xml(xml))
})

test_that("includeText=FALSE works", {
  # getParseData(..., includeText = FALSE) returns a data.frame
  # without `text` column. xml_parse_data should handle this case
  # correctly and the resulting xml text should not contain text
  # elements.
  xml <- xml_parse_data(
    parse(text = "x <- 1", keep.source = TRUE),
    includeText = FALSE
  )
  expect_true(is.character(xml))
  expect_true(length(xml) == 1)
  expect_silent(x <- xml2::read_xml(xml))
  expect_true(xml2::xml_text(x) == "")
})

test_that("lambda operator works", {
  testthat::skip_if_not(
    getRversion() >= "4.1.0" && as.numeric(R.version[["svn rev"]]) >= 79553
  )
  # r-devel rev 79553 introduces native pipe syntax (|>) and lambda expression (e.g \(x) x + 1).
  xml <- xml_parse_data(parse(text = "\\(x) x + 1", keep.source = TRUE))
  expect_true(is.character(xml))
  expect_true(length(xml) == 1)
  expect_silent(x <- xml2::read_xml(xml))
  expect_true(length(xml2::xml_find_all(x, "//OP-LAMBDA")) == 1)
})

test_that("narrow octal strings are parsed correctly", {
  # The XML must contain each string constant exactly as it was typed.
  expect_match(
    xml_parse_data(parse(text = "'\\1'", keep.source = TRUE)),
    "'\\1'",
    fixed = TRUE
  )
  expect_match(
    xml_parse_data(parse(text = '"\\1"', keep.source = TRUE)),
    '"\\1"',
    fixed = TRUE
  )

  # multiple literals
  expect_match(
    xml_parse_data(parse(text = "'\\1'\n'\\2'", keep.source = TRUE)),
    "'[\\]1'.*'[\\]2'"
  )
  # multiple escapes
  expect_match(
    xml_parse_data(parse(text = "'\\1\\2'", keep.source = TRUE)),
    "'\\1\\2'",
    fixed = TRUE
  )
  # multi-line strings
  expect_match(
    xml_parse_data(parse(text = "'\n\\1\n'", keep.source = TRUE)),
    "'\n\\1\n'",
    fixed = TRUE
  )
  expect_match(
    xml_parse_data(parse(text = "a <- '\\1\n\\2'", keep.source = TRUE)),
    "'\\1\n\\2'",
    fixed = TRUE
  )
  # mixed-length strings
  expect_match(
    xml_parse_data(parse(
      text = "foo('\\1',\n '\n\\2\n')",
      keep.source = TRUE
    )),
    "'[\\]1'.*'\n[\\]2\n'"
  )
})
--------------------------------------------------------------------------------