├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ └── test-coverage.yaml ├── .gitignore ├── .lintr ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS.md ├── R ├── aaa.R ├── capture.R ├── character_class.R ├── counts.R ├── escape.R ├── lookarounds.R ├── match.R ├── or.R ├── rex-mode.R ├── rex.R ├── shortcuts.R ├── utils.R ├── wildcards.R └── zzz.R ├── README.md ├── _pkgdown.yml ├── codecov.yml ├── cran-comments.md ├── man ├── as.regex.Rd ├── capture.Rd ├── character_class.Rd ├── character_class_escape.Rd ├── counts.Rd ├── escape.Rd ├── group.Rd ├── lookarounds.Rd ├── not.Rd ├── or.Rd ├── re_matches.Rd ├── re_substitutes.Rd ├── regex.Rd ├── register_shortcuts.Rd ├── rex.Rd ├── rex_mode.Rd ├── shortcuts.Rd ├── single_shortcuts.Rd └── wildcards.Rd ├── revdep ├── README.md ├── check.R ├── checks.rds ├── failures.md └── problems.md ├── rex.Rproj ├── tests ├── testthat.R └── testthat │ ├── test-aaa.R │ ├── test-capture.R │ ├── test-character_class.R │ ├── test-common.R │ ├── test-counts.R │ ├── test-escape.R │ ├── test-lookarounds.R │ ├── test-match.R │ ├── test-or.R │ ├── test-print.R │ ├── test-rex.R │ ├── test-rex_mode.R │ ├── test-shortcuts.R │ └── test-wildcards.R └── vignettes ├── NASA.txt ├── log_parsing.Rmd ├── stackoverflow.R └── url_parsing.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.travis\.yml$ 4 | ^travis-tool\.sh$ 5 | ^cran-comments\.md$ 6 | ^.*\.gz$ 7 | ^\.lintr$ 8 | ^vignettes/stackoverflow\.md$ 9 | ^revdep$ 10 | ^CRAN-RELEASE$ 11 | ^\.github$ 12 | ^codecov\.yml$ 13 | ^_pkgdown\.yml$ 14 | ^docs$ 15 | ^pkgdown$ 16 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- 
/.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macOS-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v2 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | env: 18 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 19 | steps: 20 | - uses: actions/checkout@v2 21 | 22 | - uses: r-lib/actions/setup-pandoc@v2 23 | 24 | - uses: r-lib/actions/setup-r@v2 25 | with: 26 | use-public-rspm: true 27 | 28 | - uses: r-lib/actions/setup-r-dependencies@v2 29 | with: 30 | extra-packages: any::pkgdown, local::. 31 | needs: website 32 | 33 | - name: Build site 34 | run: Rscript -e 'pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)' 35 | 36 | - name: Deploy to GitHub pages 🚀 37 | if: github.event_name != 'pull_request' 38 | uses: JamesIves/github-pages-deploy-action@4.1.4 39 | with: 40 | branch: gh-pages 41 | folder: docs 42 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: covr::codecov() 31 | shell: Rscript {0} 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | inst/doc 3 | .Rhistory 4 | revdep/checks 5 | docs 6 | -------------------------------------------------------------------------------- /.lintr: -------------------------------------------------------------------------------- 1 | linters: modify_defaults(line_length_linter(120)) 2 | exclusions: c("inst/doc/url_parsing.R", "inst/doc/log_parsing.R") 3 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Type: Package 2 | Package: rex 3 | Title: Friendly Regular Expressions 4 | Version: 1.2.1.9000 5 | Authors@R: c( 6 | person("Kevin", "Ushey", , "kevinushey@gmail.com", role = c("aut", "cre")), 7 | person("Jim", "Hester", , "james.f.hester@gmail.com", role = "aut"), 8 | person("Robert", "Krzyzanowski", , "rkrzyzanowski@gmail.com", role = "aut") 9 | ) 10 | Description: A friendly interface for the construction of regular 11 | expressions. 
12 | License: MIT + file LICENSE 13 | URL: https://rex.r-lib.org, https://github.com/r-lib/rex 14 | BugReports: https://github.com/r-lib/rex/issues 15 | Suggests: 16 | covr, 17 | dplyr, 18 | ggplot2, 19 | Hmisc, 20 | knitr, 21 | magrittr, 22 | rmarkdown, 23 | roxygen2, 24 | rvest, 25 | stringr, 26 | testthat 27 | VignetteBuilder: 28 | knitr 29 | Encoding: UTF-8 30 | RoxygenNote: 7.1.2 31 | Collate: 32 | 'aaa.R' 33 | 'utils.R' 34 | 'escape.R' 35 | 'capture.R' 36 | 'character_class.R' 37 | 'counts.R' 38 | 'lookarounds.R' 39 | 'match.R' 40 | 'or.R' 41 | 'rex-mode.R' 42 | 'rex.R' 43 | 'shortcuts.R' 44 | 'wildcards.R' 45 | 'zzz.R' 46 | Config/Needs/website: r-lib/pkgdown, tidyverse/tidytemplate 47 | Imports: 48 | withr 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2014-2016 2 | COPYRIGHT HOLDER: James Hester and Kevin Ushey 3 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(as.character,regex) 4 | S3method(as.regex,default) 5 | S3method(c,regex) 6 | S3method(character_class_escape,character) 7 | S3method(character_class_escape,character_class) 8 | S3method(character_class_escape,default) 9 | S3method(character_class_escape,list) 10 | S3method(character_class_escape,regex) 11 | S3method(escape,character) 12 | S3method(escape,character_class) 13 | S3method(escape,default) 14 | S3method(escape,list) 15 | S3method(escape,regex) 16 | S3method(print,regex) 17 | export(as.regex) 18 | export(character_class) 19 | export(character_class_escape) 20 | export(escape) 21 | export(m) 22 | export(matches) 23 | export(re_matches) 24 | export(re_substitutes) 25 | export(regex) 26 | export(register_shortcuts) 27 | export(rex) 28 | export(rex_) 29 | 
export(rex_mode) 30 | export(s) 31 | export(shortcuts) 32 | export(substitutes) 33 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | ## Rex (development version) 2 | 3 | * Rex no longer changes the state of the random number generator when attached. 4 | 5 | ## Rex Version 1.2.1 ## 6 | 7 | * Kevin Ushey is now the maintainer 8 | 9 | ## Rex Version 1.2.0 ## 10 | 11 | * `%>%` is no longer imported and then re-exported from rex 12 | 13 | ## Rex Version 1.1.2 ## 14 | 15 | * Updating tests to work with testthat version 1.0.2.9000. 16 | 17 | * Add `m`, `matches` and `s`, `substitutes` aliases for `re_matches` and 18 | `re_substitutes`. 19 | 20 | ## Rex Version 1.1.1 ## 21 | 22 | * Vignette tweak for ggplot2 2.0.0 23 | * Only print startup message some of the time. 24 | * Move register for magrittr pipe to `.onLoad()` 25 | 26 | ## Rex Version 1.0.1 ## 27 | 28 | * Work around ggplot2 bug with windows fonts 29 | 30 | ## Rex Version 1.0.0 ## 31 | 32 | * Include the capture results even if `locations = TRUE` 33 | * Add `:` operator for character ranges 34 | * Remove duplicate regex function 35 | * Don't re-compute missing names 36 | * Reduce code duplication 37 | * Add examples for lookarounds 38 | 39 | ## Rex Version 0.2.0 ## 40 | 41 | ### Enhancements 42 | 43 | * Add a newline shortcut 44 | * add register_shortcuts to allow use of rex in external packages without 45 | spurious NOTES. 46 | 47 | ## Rex Version 0.1.1 ## 48 | 49 | ### Enhancements 50 | 51 | * re_matches now has a "locations" argument, which returns the start and end 52 | locations of the match or capture(s). 53 | * Simplify regular expressions generated from 'some_of' functions. 54 | 55 | ### Bug fixes 56 | 57 | * backslashes ("\\") are now properly escaped. 
58 | 59 | ### Misc 60 | 61 | * Improve Rex mode documentation (#21 @Ironholds) 62 | * Improve Log parsing Vignette copy and Title (#18, #20 @Ironholds) 63 | * Add links to GitHub and issues page in DESCRIPTION 64 | 65 | ## Rex Version 0.1.0 ## 66 | 67 | Initial release 68 | -------------------------------------------------------------------------------- /R/aaa.R: -------------------------------------------------------------------------------- 1 | .rex <- new.env(parent = emptyenv()) 2 | .rex$env <- new.env(parent = emptyenv()) 3 | .rex$mode <- FALSE 4 | 5 | register <- function(...) { 6 | names <- gsub("`", "", as.character(eval(substitute(alist(...)))), fixed = TRUE) 7 | 8 | list2env(structure(list(...), .Names = names), envir = .rex$env) 9 | } 10 | 11 | register_object <- function(object) { 12 | list2env(as.list(object), envir = .rex$env) 13 | } 14 | -------------------------------------------------------------------------------- /R/capture.R: -------------------------------------------------------------------------------- 1 | #' @include escape.R 2 | #' @include utils.R 3 | NULL 4 | 5 | #' Create a capture group 6 | #' 7 | #' Used to save the matched value within the group for use later in the regular 8 | #' expression or to extract the values captured. Both named and unnamed groups 9 | #' can later be referenced using \code{\link{capture_group}}. 10 | #' 11 | #' @param name of the group. Unnamed capture groups are numbered starting at 1 12 | #' in the order they appear in the regular expression. If two groups have the 13 | #' same name, the leftmost group is the one used in any reference. 14 | #' @param ... \code{\link{shortcuts}}, R variables, text, or other \pkg{rex} 15 | #' functions. 16 | #' @family rex 17 | #' @aliases . 18 | #' @seealso \code{\link{group}} for grouping without capturing. 
Perl 5 Capture 19 | #' Groups \url{https://perldoc.perl.org/perlre#Capture-groups} 20 | #' @examples 21 | #' 22 | #' # Match paired quotation marks 23 | #' re <- rex( 24 | #' # first quotation mark 25 | #' capture(quotes), 26 | #' 27 | #' # match all non-matching quotation marks 28 | #' zero_or_more(except(capture_group(1))), 29 | #' 30 | #' # end quotation mark (matches first) 31 | #' capture_group(1) 32 | #' ) 33 | #' 34 | #' #named capture - don't match apples to oranges 35 | #' re <- rex( 36 | #' capture(name = "fruit", or("apple", "orange")), 37 | #' "=", 38 | #' capture_group("fruit") 39 | #' ) 40 | capture <- . <- function(..., name = NULL) { 41 | if(!is.null(name)) { 42 | name <- paste0("?<", name, ">") 43 | } 44 | p( "(", name, p(escape_dots(...)), ")" ) 45 | } 46 | register(capture, .) 47 | 48 | #' @rdname capture 49 | capture_group <- function(name) { 50 | p( "\\g{", name, "}" ) 51 | } 52 | register(capture_group) 53 | 54 | #' Create a grouped expression 55 | #' 56 | #' This is similar to \code{\link{capture}} except that it does not store the 57 | #' value of the group. Best used when you want to combine several parts 58 | #' together and do not reference or extract the grouped value later. 59 | #' @inheritParams capture 60 | #' @seealso \code{\link{capture}} for grouping with capturing. Perl 5 Extended 61 | #' Patterns \url{https://perldoc.perl.org/perlre#Extended-Patterns} 62 | #' @family rex 63 | group <- function(...) { 64 | p( "(?:", p(escape_dots(...)), ")" ) 65 | } 66 | register(group) 67 | -------------------------------------------------------------------------------- /R/character_class.R: -------------------------------------------------------------------------------- 1 | #' @include utils.R 2 | #' @include escape.R 3 | NULL 4 | 5 | #' Create character classes 6 | #' 7 | #' There are multiple ways you can define a character class. 
8 | 9 | #' @inheritParams capture 10 | #' @param start beginning of character class 11 | #' @param end end of character class 12 | #' @param x text to include in the character class (must be escaped manually) 13 | #' @inheritParams wildcards 14 | #' @export 15 | #' @family rex 16 | #' @examples 17 | #' # grey = gray 18 | #' re <- rex("gr", one_of("a", "e"), "y") 19 | #' grepl(re, c("grey", "gray")) # TRUE TRUE 20 | #' 21 | #' # Match non-vowels 22 | #' re <- rex(none_of("a", "e", "i", "o", "u")) 23 | #' # They can also be in the same string 24 | #' re <- rex(none_of("aeiou")) 25 | #' grepl(re, c("k", "l", "e")) # TRUE TRUE FALSE 26 | #' 27 | #' # Match range 28 | #' re <- rex(range("a", "e")) 29 | #' grepl(re, c("b", "d", "f")) # TRUE TRUE FALSE 30 | #' 31 | #' # Explicit creation 32 | #' re <- rex(character_class("abcd\\[")) 33 | #' grepl(re, c("a", "d", "[", "]")) # TRUE TRUE TRUE FALSE 34 | #' @describeIn character_class explicitly define a character class 35 | character_class <- function(x) structure(x, class = c("character_class", "regex")) 36 | 37 | #' @describeIn character_class matches one of the specified characters. 38 | one_of <- function(...) { 39 | p( "[", p(character_class_escape_dots(...)), "]" ) 40 | } 41 | register(one_of) 42 | 43 | #' @describeIn character_class matches zero or more of the specified characters. 44 | any_of <- function(..., type = c("greedy", "lazy", "possessive")) { 45 | add_type(p(one_of(...), "*"), type) 46 | } 47 | register(any_of) 48 | 49 | #' @describeIn character_class matches one or more of the specified characters. 50 | some_of <- function(..., type = c("greedy", "lazy", "possessive")) { 51 | add_type(p(one_of(...), "+"), type) 52 | } 53 | register(some_of) 54 | 55 | #' @describeIn character_class matches anything but one of the specified characters. 56 | #' @aliases except 57 | none_of <- except <- function(...) 
{ 58 | p( "[^", p(character_class_escape_dots(...)), "]" ) 59 | } 60 | register(none_of, except) 61 | 62 | #' @describeIn character_class matches zero or more of anything but the specified characters. 63 | except_any_of <- function(..., type = c("greedy", "lazy", "possessive")) { 64 | add_type(p(none_of(...), "*"), type) 65 | } 66 | register(except_any_of) 67 | 68 | #' @describeIn character_class matches one or more of anything but the specified characters. 69 | except_some_of <- function(..., type = c("greedy", "lazy", "possessive")) { 70 | add_type(p(none_of(...), "+"), type) 71 | } 72 | register(except_some_of) 73 | 74 | #' @describeIn character_class matches one of any of the characters in the range. 75 | range <- function(start, end) { 76 | character_class(p(character_class_escape(start), "-", character_class_escape(end))) 77 | } 78 | register(range) 79 | 80 | #' @describeIn character_class matches one of any of the characters in the range. 81 | `:` <- function(start, end) { 82 | if (all(is.character(start), is.character(end))) { 83 | range(start, end) 84 | } else { 85 | .Primitive(":")(start, end) 86 | } 87 | } 88 | register(`:`) 89 | 90 | #' @describeIn character_class matches one of any of the characters except those in the range. 91 | exclude_range <- function(start, end) { 92 | character_class(p("^", character_class_escape(start), "-", character_class_escape(end))) 93 | } 94 | register(exclude_range) 95 | -------------------------------------------------------------------------------- /R/counts.R: -------------------------------------------------------------------------------- 1 | #' @include escape.R 2 | #' @include utils.R 3 | NULL 4 | 5 | #' Counts 6 | #' 7 | #' Functions to restrict a regex to a specific number 8 | #' @param x A regex pattern. 9 | #' @param n An integer number 10 | #' @param low An integer number for the lower limit. 11 | #' @param high An integer number for the upper limit. 
12 | #' @inheritParams zero_or_more 13 | #' @family rex 14 | #' @name counts 15 | NULL 16 | 17 | #' @aliases n 18 | #' @describeIn counts \code{x} must occur exactly \code{n} times. 19 | n_times <- n <- function(x, n, type = c("greedy", "lazy", "possessive")) { 20 | add_type(p("(?:", p(escape(x)), "){", n, "}"), type) 21 | } 22 | register(n_times, n) 23 | 24 | #' @describeIn counts \code{x} must occur between \code{low} and \code{high} times. 25 | between <- function(x, low, high, type = c("greedy", "lazy", "possessive")) { 26 | add_type(p("(?:", p(escape(x)), "){", low, ",", high, "}"), type) 27 | } 28 | register(between) 29 | 30 | #' @describeIn counts \code{x} must occur at least \code{n} times. 31 | at_least <- function(x, n, type = c("greedy", "lazy", "possessive")) { 32 | add_type(between(x, n, ""), type) 33 | } 34 | register(at_least) 35 | 36 | #' @describeIn counts \code{x} must occur at most \code{n} times. 37 | at_most <- function(x, n, type = c("greedy", "lazy", "possessive")) { 38 | add_type(between(x, 0, n), type) 39 | } 40 | register(at_most) 41 | -------------------------------------------------------------------------------- /R/escape.R: -------------------------------------------------------------------------------- 1 | #' @include utils.R 2 | NULL 3 | 4 | #' Escape characters for a regex 5 | #' 6 | #' @param x Object to escape. 7 | #' @export 8 | escape <- function(x) UseMethod("escape") 9 | 10 | #' @describeIn escape Objects are simply passed through unchanged. 11 | #' @export 12 | escape.regex <- function(x) x 13 | 14 | #' @describeIn escape Objects are surrounded by square brackets. 15 | #' @export 16 | escape.character_class <- function(x) { 17 | p("[", x, "]") 18 | } 19 | 20 | #' @describeIn escape Objects are properly escaped for regular expressions. 
21 | #' @export 22 | escape.character <- function(x) { 23 | chars <- c("*", ".", "?", "^", "+", "$", "|", "(", ")", "[", "]", "{", "}", "\\") 24 | regex(sanitize(x, chars)) 25 | } 26 | 27 | #' @describeIn escape default escape coerces to character and escapes. 28 | #' @export 29 | escape.default <- function(x) { 30 | escape.character(as.character(x)) 31 | } 32 | 33 | #' @describeIn escape simply call escape on all elements of the list. 34 | #' @export 35 | escape.list <- function(x) { 36 | lapply(x, escape) 37 | } 38 | 39 | escape_dots <- function(...) { 40 | unlist(escape(eval(list(...)))) 41 | } 42 | 43 | #' Character class escapes 44 | #' @inheritParams escape 45 | #' @export 46 | character_class_escape <- function(x) UseMethod("character_class_escape") 47 | 48 | #' @describeIn character_class_escape objects are passed through unchanged. 49 | #' @export 50 | character_class_escape.regex <- function(x) x 51 | 52 | #' @describeIn character_class_escape objects are passed through unchanged. 53 | #' @export 54 | character_class_escape.character_class <- character_class_escape.regex 55 | 56 | #' @describeIn character_class_escape objects properly escaped for character classes. 57 | #' @export 58 | character_class_escape.character <- function(x) { 59 | regex(sanitize(x, c("-", "^", "[", "]", "\\"))) 60 | } 61 | 62 | #' @describeIn character_class_escape call \code{character_class_escape} on all elements of the list. 63 | #' @export 64 | character_class_escape.list <- function(x) { 65 | lapply(x, character_class_escape) 66 | } 67 | 68 | #' @describeIn character_class_escape coerce to \code{character} and \code{character_class_escape}. 69 | #' @export 70 | character_class_escape.default <- function(x) { 71 | character_class_escape.character(as.character(x)) 72 | } 73 | 74 | character_class_escape_dots <- function(...) 
{ 75 | unlist(character_class_escape(eval(list(...)))) 76 | } 77 | 78 | sanitize <- function(x, chars) { 79 | gsub(paste0("([\\", paste0(collapse = "\\", chars), "])"), "\\\\\\1", x, perl = TRUE) 80 | } 81 | -------------------------------------------------------------------------------- /R/lookarounds.R: -------------------------------------------------------------------------------- 1 | #' @include escape.R 2 | #' @include utils.R 3 | NULL 4 | 5 | #' Lookarounds 6 | #' 7 | #' These functions provide an interface to perl lookarounds. 8 | #' 9 | #' Special binary functions are used to infer an ordering, since often you 10 | #' might wish to match a word / set of characters conditional on the start 11 | #' and end of that word. 12 | #' 13 | #' \itemize{ 14 | #' \item \code{\%if_next_is\%}: \code{TRUE} if x is followed by y 15 | #' \item \code{\%if_next_isnt\%}: \code{TRUE} if x is not followed by y 16 | #' \item \code{\%if_prev_is\%}: \code{TRUE} if y comes before x 17 | #' \item \code{\%if_prev_isnt\%}: \code{TRUE} if y does not come before x 18 | #' } 19 | #' @param x A regex pattern. 20 | #' @param y A regex pattern. 
21 | #' @name lookarounds 22 | #' @title Lookarounds 23 | #' @family rex 24 | #' @seealso Perl 5 Documentation \url{https://perldoc.perl.org/perlre#Extended-Patterns} 25 | #' @examples 26 | #' stopifnot(grepl(rex("crab" %if_next_is% "apple"), "crabapple", perl = TRUE)) 27 | #' stopifnot(grepl(rex("crab" %if_prev_is% "apple"), "applecrab", perl = TRUE)) 28 | #' stopifnot(grepl(rex(range("a", "e") %if_next_isnt% range("f", "g")), 29 | #' "ah", perl = TRUE)) 30 | #' stopifnot(grepl(rex(range("a", "e") %if_next_is% range("f", "i")), 31 | #' "ah", perl = TRUE)) 32 | NULL 33 | 34 | #' @rdname lookarounds 35 | `%if_next_is%` <- function(x, y) { 36 | p("(?:", escape(x), "(?=", escape(y), ")", ")") 37 | } 38 | register(`%if_next_is%`) 39 | 40 | #' @rdname lookarounds 41 | `%if_next_isnt%` <- function(x, y) { 42 | p("(?:", escape(x), "(?!", escape(y), ")", ")") 43 | } 44 | register(`%if_next_isnt%`) 45 | 46 | #' @rdname lookarounds 47 | `%if_prev_is%` <- function(x, y) { 48 | p("(?:", "(?<=", escape(y), ")", escape(x), ")") 49 | } 50 | register(`%if_prev_is%`) 51 | 52 | #' @rdname lookarounds 53 | `%if_prev_isnt%` <- function(x, y) { 54 | p("(?:", "(? 34 | #' @aliases matches m 35 | #' @export re_matches matches m 36 | re_matches <- matches <- m <- function(data, pattern, global = FALSE, options = NULL, locations = FALSE, ...) 
{ 37 | 38 | pattern <- add_options(pattern, options) 39 | 40 | process_matches <- function(match, string) { 41 | 42 | if(no_capture(match)) { 43 | 44 | # if no capture and no location just return if the regex matched 45 | if(!locations) { 46 | return(match != -1L) 47 | } 48 | 49 | # else return a data frame of the start and end locations 50 | match[ match == -1L ] <- NA_integer_ 51 | starts <- match 52 | attributes(starts) <- NULL 53 | 54 | lengths <- attr(match, "match.length") 55 | ends <- starts + lengths - 1L 56 | 57 | return(data.frame(start = starts, end = ends)) 58 | } 59 | 60 | # if a capture return a data frame with the capture results for each string 61 | starts <- attr(match, "capture.start") 62 | lengths <- attr(match, "capture.length") 63 | ends <- starts + lengths - 1L 64 | 65 | not_matched <- starts == -1L 66 | 67 | strings <- substring(string, starts, ends) 68 | 69 | strings[not_matched] <- NA_character_ 70 | 71 | res <- matrix(ncol = ncol(starts), strings) 72 | 73 | nms <- auto_name(attr(match, "capture.names")) 74 | 75 | if (!locations) { 76 | colnames(res) <- nms 77 | return(as.data.frame(res, stringsAsFactors = FALSE, check.names = FALSE)) 78 | } 79 | 80 | starts[not_matched] <- NA_integer_ 81 | 82 | ends[not_matched] <- NA_integer_ 83 | 84 | indexes <- unlist(lapply(seq_len(ncol(res)), function(x) { 85 | seq(x, by = ncol(res), length.out = 3) 86 | })) 87 | 88 | full <- data.frame(res, starts, ends, stringsAsFactors = FALSE, check.names = FALSE)[, indexes, drop = FALSE] 89 | full_names <- unlist(Map(function(name) c(name, paste(sep=".", name, c("start", "end"))), nms, USE.NAMES = FALSE)) 90 | colnames(full) <- full_names 91 | 92 | full 93 | } 94 | 95 | if(global %==% TRUE) { 96 | mapply(process_matches, gregexpr(pattern = pattern, data, perl = TRUE, ...), data, SIMPLIFY = FALSE) 97 | } 98 | else { 99 | process_matches(regexpr(pattern = pattern, data, perl = TRUE, ...), data) 100 | } 101 | } 102 | 103 | #' Substitute regular expressions in a 
string with another string. 104 | #' 105 | #' @param data character vector to substitute 106 | #' @param pattern regular expression to match 107 | #' @param replacement replacement text to use 108 | #' @param global substitute all occurrences 109 | #' @param options option flags 110 | #' @param ... options passed to sub or gsub 111 | #' @seealso \code{\link{regexp}} Section "Perl-like Regular Expressions" for a 112 | #' discussion of the supported options 113 | #' @examples 114 | #' string <- c("this is a Test", "string") 115 | #' re_substitutes(string, "test", "not a test", options = "insensitive") 116 | #' re_substitutes(string, "i", "x", global = TRUE) 117 | #' re_substitutes(string, "(test)", "not a \\1", options = "insensitive") 118 | #' @aliases substitutes s 119 | #' @export re_substitutes substitutes s 120 | re_substitutes <- substitutes <- s <- function(data, pattern, replacement, global = FALSE, options = NULL, ...) { 121 | pattern <- add_options(pattern, options) 122 | method <- if (isTRUE(global)) gsub else sub 123 | method(x = data, pattern = pattern, replacement = replacement, perl = TRUE, ...) 
124 | } 125 | 126 | add_options <- function(pattern, options) { 127 | if (!is.null(options)) { 128 | options <- match_args(options, names(option_map)) 129 | p("(?", p(option_map[options]), ")", pattern) 130 | } 131 | else { 132 | pattern 133 | } 134 | } 135 | 136 | match_args <- function(arg, choices) { 137 | matches <- pmatch(arg, choices) 138 | if (anyNA(matches)) { 139 | stop(gettextf("'arg' should be one of %s", toString(dQuote(choices))), domain = NA) 140 | } 141 | choices[matches] 142 | } 143 | 144 | option_map <- c( 145 | "insensitive" = "i", 146 | "multi-line" = "m", 147 | "single-line" = "s", 148 | "extended" = "x", 149 | "ungreedy" = "U" 150 | ) 151 | 152 | no_capture <- function(match) { 153 | is.null(attr(match, "capture.start", exact = TRUE)) 154 | } 155 | 156 | auto_name <- function(names) { 157 | missing <- names == "" 158 | if (!any(missing)) { 159 | return(names) 160 | } 161 | names[missing] <- seq_along(names)[missing] 162 | names 163 | } 164 | -------------------------------------------------------------------------------- /R/or.R: -------------------------------------------------------------------------------- 1 | #' @include utils.R 2 | #' @include escape.R 3 | NULL 4 | 5 | #' Or 6 | #' 7 | #' The special binary function \code{\%or\%} can be used to specify a set 8 | #' of optional matches. 9 | #' 10 | #' @rdname or 11 | #' @usage x \%or\% y 12 | #' @param x A string. 13 | #' @param y A string. 14 | #' @family rex 15 | #' @inheritParams capture 16 | `%or%` <- function(x, y) { 17 | group(p(escape(x)), regex("|"), p(escape(y))) 18 | } 19 | register(`%or%`) 20 | 21 | #' @describeIn or regular function can also be used, useful for more than 2 arguments. 22 | or <- function(...) 
{ 23 | group(regex(paste0(collapse = "|", unlist(escape_dots(...))))) 24 | } 25 | register(or) 26 | -------------------------------------------------------------------------------- /R/rex-mode.R: -------------------------------------------------------------------------------- 1 | #' Toggles \pkg{rex} mode. 2 | #' 3 | #' While within rex mode, functions used within the \code{\link{rex}} function 4 | #' are attached, so one can get e.g. auto-completion within editors. 5 | #' 6 | #' @export 7 | rex_mode <- function() { 8 | 9 | ## Enter rex mode 10 | if (!.rex$mode) { 11 | .rex$mode <- TRUE 12 | message("Rex functions and shortcuts attached!") 13 | ## We know what we're doing, so hide the R CMD check note 14 | suppressMessages( 15 | eval(call("attach", call("$", as.name(".rex"), as.name("env")))) 16 | ) 17 | } 18 | 19 | ## Exit rex mode 20 | else { 21 | message("Rex functions and shortcuts detached!") 22 | .rex$mode <- FALSE 23 | detach(".rex$env") 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /R/rex.R: -------------------------------------------------------------------------------- 1 | #' @include escape.R 2 | #' @include character_class.R 3 | #' @include utils.R 4 | NULL 5 | 6 | #' Generate a regular expression. 7 | #' @export 8 | #' @family rex 9 | #' @param ... \code{\link{shortcuts}}, R variables, text, or other \pkg{rex} 10 | #' functions. 11 | #' @param env environment to evaluate the rex expression in. 
12 | #' @aliases rex_ 13 | rex <- function(..., env = parent.frame()) { 14 | args <- as.list(substitute(list(...))[-1]) 15 | rex_(args, env = env) 16 | } 17 | 18 | #' @export 19 | rex_ <- function(args, env = parent.frame()) { 20 | args <- Filter(function(x) !identical(x, quote(expr = )), args) 21 | 22 | eval_env <- list2env(as.list(.rex$env), parent = env) 23 | evaled <- lapply(args, eval, envir = eval_env) 24 | 25 | p(escape(evaled)) 26 | } 27 | 28 | #' @describeIn regex coerce regex object to a character 29 | #' @export 30 | as.character.regex <- function(x, ...) x 31 | 32 | #' Coerce objects to a \code{\link{regex}}. 33 | #' @name as.regex 34 | #' @param x Object to coerce to \code{\link{regex}}. 35 | #' @param ... further arguments passed to methods. 36 | #' @export 37 | as.regex <- function(x, ...) UseMethod("as.regex") 38 | 39 | #' @export 40 | #' @describeIn as.regex Simply escape the Object. 41 | as.regex.default <- function(x, ...) escape(x) 42 | 43 | #' @export 44 | #' @describeIn regex Print regex object 45 | print.regex <- function(x, ...){ 46 | cat(paste(strwrap(x), collapse = "\n"), "\n", sep = "") 47 | } 48 | 49 | #' Regular Expression 50 | #' 51 | #' Specify an explicit regular expression. This expression must already be 52 | #' escaped. 53 | #' @param x Object 54 | #' @param ... further arguments 55 | #' @seealso \code{\link{as.regex}} to coerce to a regex object. 56 | #' @export 57 | regex <- function(x, ...) structure(x, class = "regex") 58 | 59 | #' Register the Rex shortcuts 60 | #' 61 | #' If you are using rex in another package you need to call this function to 62 | #' register all of the rex shortcuts so that spurious NOTEs about global 63 | #' variables are not generated during R CMD check. 
#' @param pkg_name the package to register the shortcuts in
#' @export
register_shortcuts <- function(pkg_name) {
  # Declare every name stored in the rex environment as a global variable of
  # `pkg_name`.
  shortcut_names <- ls(.rex$env)
  invisible(utils::globalVariables(shortcut_names, pkg_name))
}

#' @export
c.regex <- function(..., recursive = FALSE) {
  # Strip the class from each piece, flatten, and re-wrap the result as regex.
  pieces <- lapply(list(...), unclass)
  regex(c(unlist(pieces)))
}

# --- R/shortcuts.R ---

# Build an object of class "shortcut". A single argument is used directly as
# the payload; several arguments are kept together as a list.
shortcut <- function(...) {
  args <- list(...)
  payload <- if (length(args) == 1L) args[[1L]] else args
  structure(payload, class = "shortcut")
}

#' Single shortcuts
#'
#' Each of these shortcuts has both a plural (-s) and inverse (non_) form.
single_shortcuts <- shortcut(

  ## Character class shortcuts
  alnum = character_class("[:alnum:]"),
  alpha = character_class("[:alpha:]"),
  letter = character_class("[:alpha:]"),
  blank = character_class("[:blank:]"),
  cntrl = character_class("[:cntrl:]"),
  digit = character_class("[:digit:]"),
  number = character_class("[:digit:]"),
  graph = character_class("[:graph:]"),
  lower = character_class("[:lower:]"),
  print = character_class("[:print:]"),
  punct = character_class("[:punct:]"),
  space = character_class("[:space:]"),
  upper = character_class("[:upper:]"),
  xdigit = character_class("[:xdigit:]"),
  newline = regex("\\R"),

  ## Quote characters
  single_quote = character_class("'"),
  double_quote = character_class("\""),
  quote = character_class("'\"")
)

basic_shortcuts <- shortcut(

  ## Single characters and runs of arbitrary characters
  dot = escape("."),
  any = any_char <- regex("."),
  something = regex(".+"),
  anything = regex(".*"),

  ## Anchors
  start = regex("^"),
  end = regex("$"),

  ## Word boundaries
  boundary = regex("\\b"),
  non_boundary = regex("\\B")
)

# Derive the "non_<name>" variants: each value gets a leading "^" and keeps
# the class of the value it was derived from.
inverse <- function(x) {
  negate <- function(xx) {
    negated <- paste0("^", xx)
    class(negated) <- class(xx)
    negated
  }
  x[] <- lapply(x, negate)
  names(x) <- paste0("non_", names(x))
  x
}

# Derive the "<name>s" variants: the escaped value followed by "+", as regex.
plural <- function(x) {
  repeat_some <- function(xx) {
    repeated <- paste0(escape(xx), "+")
    class(repeated) <- "regex"
    repeated
  }
  x[] <- lapply(x, repeat_some)
  names(x) <- paste0(names(x), "s")
  x
}

# Derive the "any_<name>s" variants: the escaped value followed by "*", as
# regex.
multiple <- function(x) {
  repeat_any <- function(xx) {
    repeated <- paste0(escape(xx), "*")
    class(repeated) <- "regex"
    repeated
  }
  x[] <- lapply(x, repeat_any)
  names(x) <- paste0("any_", names(x), "s")
  x
}

#' Shortcuts
#'
#' Commonly used character classes and regular expressions. These shortcuts
#' are substituted inside \code{rex} calls.
#'
#' \code{names(shortcuts)} will give you the full list of available shortcuts.
#' @export
#' @family rex

shortcuts <- shortcut(c(
  basic_shortcuts,
  single_shortcuts,
  plural(single_shortcuts),
  multiple(single_shortcuts),
  inverse(single_shortcuts),
  plural(inverse(single_shortcuts)),
  multiple(inverse(single_shortcuts))
))

# Renders a shortcut object as a preformatted "name - value" listing, using
# roxygen2's internal `build_rd` and `rd` helpers.
default_data_format.shortcut <- function(x) {
  roxygen_ns <- asNamespace("roxygen2")
  build_rd <- get("build_rd", envir = roxygen_ns)
  rd <- get("rd", envir = roxygen_ns)

  entries <- paste0(names(x), " - ", x, collapse = "\n")
  build_rd(rd("\\preformatted{"), entries, rd("}"))
}

register_object(shortcuts)

# --- R/utils.R ---

# Paste all arguments together with no separator and mark the result as regex.
p <- function(...) {
  regex(paste0(..., collapse = ""))
}

# Infix shorthand for identical().
`%==%` <- function(x, y) identical(x, y)

# --- R/wildcards.R ---

#' @include escape.R
#' @include utils.R
NULL

#' Wildcards
#'
#' @inheritParams capture
#' @param type the type of match to perform.
#'
#' There are three match types
#' \enumerate{
#'   \item \code{greedy}: match the longest string. This is the default matching type.
#'   \item \code{lazy}: match the shortest string. This matches the shortest string from the same anchor point, not necessarily the shortest global string.
#'   \item \code{possessive}: match and don't allow backtracking
#' }
#' @family rex
#' @name wildcards
NULL

#' @describeIn wildcards match \code{...} zero or more times.
zero_or_more <- function(..., type = c("greedy", "lazy", "possessive")) {
  # Non-capturing group around the escaped arguments, quantified with "*".
  add_type(x = p("(?:", p(escape_dots(...)), ")*"), type = type)
}
register(zero_or_more)

#' @describeIn wildcards match \code{...} one or more times.
one_or_more <- function(..., type = c("greedy", "lazy", "possessive")) {
  # Non-capturing group around the escaped arguments, quantified with "+".
  add_type(x = p("(?:", p(escape_dots(...)), ")+"), type = type)
}
register(one_or_more)

#' @describeIn wildcards match \code{...} zero or one times.
#' @aliases zero_or_one
maybe <- zero_or_one <- function(..., type = c("greedy", "lazy", "possessive")) {
  # Route the quantified group through add_type() so that `type` is honoured
  # ("lazy" appends "?", "possessive" appends "+") and validated, exactly as
  # zero_or_more() and one_or_more() do. The default "greedy" output is
  # unchanged: "(?:...)?".
  add_type(p("(?:", p(escape_dots(...)), ")?"), type)
}
register(maybe, zero_or_one)

# Append the quantifier modifier for the requested match type to `x`.
#
# x:    an already-quantified regex.
# type: one of "greedy", "lazy" or "possessive"; resolved with match.arg(), so
#       the untouched default vector selects "greedy" and an unknown value is
#       an error.
# Returns `x` unchanged for "greedy", `x` followed by "?" for "lazy", and `x`
# followed by "+" for "possessive".
add_type <- function(x, type = c("greedy", "lazy", "possessive")) {
  type <- match.arg(type)

  switch(type,
    greedy = x,
    lazy = p(x, "?"),
    possessive = p(x, "+")
  )
}

#' Do not match
#'
#' @inheritParams capture
#' @inheritParams zero_or_more
#' @family rex
# This is slightly different than if_next_isn't because we want to match
# anything that is not the search term as well
not <- function(..., type = c("greedy", "lazy", "possessive")) {
  # Repeats "any character that is not at the start of the search term".
  # NOTE(review): unlike the other wildcard helpers, the escaped arguments are
  # not collapsed with an inner p() first. If escape_dots() returns one element
  # per argument, several arguments would presumably yield one "(?:(?!x).)*"
  # segment per argument instead of a single negated term -- confirm intended.
  add_type(p("(?:(?!", escape_dots(...), ").)*"), type = type)
}
register(not)

# --- R/zzz.R ---

# Package attach hook: occasionally shows a welcome message.
.onAttach <- function(lib, pkg) { # nolint
  # The random draw is wrapped in withr::with_preserve_seed(), so it should
  # not disturb the user's RNG state.
  withr::with_preserve_seed({
    # Stay silent in non-interactive sessions, and in interactive ones unless
    # the uniform draw is at most 0.1.
    if (!interactive() || stats::runif(1) > 0.1) return()

    packageStartupMessage("Welcome to rex, the friendly regular expression helper!\n",
      "Use 'rex_mode()' to toggle code completion for rex shortcuts and functions.")
  })
}
Friendly Regular Expressions 10 | 11 | Regular expressions are a very powerful feature; however, they are often difficult 12 | to interpret. Rex allows you to build complex regular expressions from human 13 | readable expressions. So instead of writing (and later trying to decipher) 14 | ```r 15 | r <- "^(?:(((?:[^:])+)://))?((?:(?:(?!:/).)*)+)(?:(:([[:digit:]]+)))?(?:(/.*))?$" 16 | ``` 17 | 18 | You can write 19 | 20 | ```r 21 | r <- rex( 22 | 23 | start, 24 | 25 | ## match the protocol -- may exist or may not 26 | maybe(capture( 27 | capture(except_some_of(":")), 28 | "://" 29 | )), 30 | 31 | ## match the path 32 | capture(one_or_more(not(":/"))), 33 | 34 | ## get the port 35 | maybe(capture(":", capture(numbers))), 36 | 37 | ## and the rest 38 | maybe(capture("/", anything)), 39 | 40 | end 41 | 42 | ) 43 | ``` 44 | 45 | While these expressions are a bit longer than their corresponding regular 46 | expression, they are much more readable and maintainable. 47 | 48 | ## Installation 49 | 50 | ```r 51 | install.packages("rex") 52 | ``` 53 | 54 | ## Usage 55 | 56 | The vignettes have longer form usage examples. 57 | 58 | - [URL Validation](http://rpubs.com/jimhester/rex-url_parsing) 59 | - [Webserver Log Parsing](http://rpubs.com/jimhester/rex-log_parsing) 60 | 61 | Each `rex()` function call can include a number of functions and shortcuts. 62 | For a full list of the functions available please see `?rex` and `?shortcuts`. 63 | 64 | ### Rex Mode 65 | 66 | Rex functions are not exported because they are only useful within `rex()` 67 | calls, but they can be temporarily attached using `rex_mode()` which allows 68 | them to be auto-completed. 69 | 70 | ### Using Rex in other packages 71 | 72 | Using `rex` in other packages will generate spurious NOTEs from `R CMD check` 73 | unless you include a call to `rex::register_shortcuts()` with your package name 74 | somewhere in your package source.
This function registers all of the rex 75 | shortcuts as valid variables fixing the NOTEs. 76 | 77 | ## See Also 78 | - [Regularity](https://github.com/andrewberls/regularity) - Ruby library that 79 | partially inspired `rex`. 80 | - [PCRE](http://www.pcre.org/) - Perl Compatible Regular Expressions, the 81 | engine that `rex` regular expressions use. 82 | - [Perl 5 Regular Expressions](https://perldoc.perl.org/perlre) - Perl 83 | regular expression documentation, which are nearly 100% compatible with PCRE. 84 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://rex.r-lib.org 2 | 3 | template: 4 | package: tidytemplate 5 | bootstrap: 5 6 | includes: 7 | in_header: | 8 | 9 | 10 | reference: 11 | - title: Create a regular expression 12 | contents: 13 | - rex 14 | - capture 15 | - character_class 16 | - n_times 17 | - group 18 | - "`%if_next_is%`" 19 | - not 20 | - or 21 | - zero_or_more 22 | 23 | - title: String manipulation 24 | contents: 25 | - re_matches 26 | - re_substitutes 27 | 28 | - title: For developers 29 | contents: 30 | - as.regex 31 | - as.character.regex 32 | - character_class_escape 33 | - escape 34 | - shortcuts 35 | - single_shortcuts 36 | - register_shortcuts 37 | - rex_mode 38 | 39 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Downstream dependencies 2 | 3 | * I ran R CMD 
check on all 17 downstream dependencies of rex 4 | Summary at: https://github.com/kevinushey/rex/blob/master/revdep/#readme 5 | 6 | * There were 0 ERRORs in downstream dependencies. 7 | -------------------------------------------------------------------------------- /man/as.regex.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rex.R 3 | \name{as.regex} 4 | \alias{as.regex} 5 | \alias{as.regex.default} 6 | \title{Coerce objects to a \code{\link{regex}}.} 7 | \usage{ 8 | as.regex(x, ...) 9 | 10 | \method{as.regex}{default}(x, ...) 11 | } 12 | \arguments{ 13 | \item{x}{Object to coerce to \code{\link{regex}}.} 14 | 15 | \item{...}{further arguments passed to methods.} 16 | } 17 | \description{ 18 | Coerce objects to a \code{\link{regex}}. 19 | } 20 | \section{Methods (by class)}{ 21 | \itemize{ 22 | \item \code{default}: Simply escape the Object. 23 | }} 24 | 25 | -------------------------------------------------------------------------------- /man/capture.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/capture.R 3 | \name{capture} 4 | \alias{capture} 5 | \alias{.} 6 | \alias{capture_group} 7 | \title{Create a capture group} 8 | \usage{ 9 | capture(..., name = NULL) 10 | 11 | capture_group(name) 12 | } 13 | \arguments{ 14 | \item{...}{\code{\link{shortcuts}}, R variables, text, or other \pkg{rex} 15 | functions.} 16 | 17 | \item{name}{of the group. Unnamed capture groups are numbered starting at 1 18 | in the order they appear in the regular expression. If two groups have the 19 | same name, the leftmost group is the one used in any reference.} 20 | } 21 | \description{ 22 | Used to save the matched value within the group for use later in the regular 23 | expression or to extract the values captured.
Both named and unnamed groups 24 | can later be referenced using \code{\link{capture_group}}. 25 | } 26 | \examples{ 27 | 28 | # Match paired quotation marks 29 | re <- rex( 30 | # first quotation mark 31 | capture(quotes), 32 | 33 | # match all non-matching quotation marks 34 | zero_or_more(except(capture_group(1))), 35 | 36 | # end quotation mark (matches first) 37 | capture_group(1) 38 | ) 39 | 40 | #named capture - don't match apples to oranges 41 | re <- rex( 42 | capture(name = "fruit", or("apple", "orange")), 43 | "=", 44 | capture_group("fruit") 45 | ) 46 | } 47 | \seealso{ 48 | \code{\link{group}} for grouping without capturing. Perl 5 Capture 49 | Groups \url{https://perldoc.perl.org/perlre#Capture-groups} 50 | 51 | Other rex: 52 | \code{\link{\%or\%}()}, 53 | \code{\link{character_class}()}, 54 | \code{\link{counts}}, 55 | \code{\link{group}()}, 56 | \code{\link{lookarounds}}, 57 | \code{\link{not}()}, 58 | \code{\link{rex}()}, 59 | \code{\link{shortcuts}}, 60 | \code{\link{wildcards}} 61 | } 62 | \concept{rex} 63 | -------------------------------------------------------------------------------- /man/character_class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/character_class.R 3 | \name{character_class} 4 | \alias{character_class} 5 | \alias{one_of} 6 | \alias{any_of} 7 | \alias{some_of} 8 | \alias{none_of} 9 | \alias{except} 10 | \alias{except_any_of} 11 | \alias{except_some_of} 12 | \alias{range} 13 | \alias{:} 14 | \alias{exclude_range} 15 | \title{Create character classes} 16 | \usage{ 17 | character_class(x) 18 | 19 | one_of(...) 20 | 21 | any_of(..., type = c("greedy", "lazy", "possessive")) 22 | 23 | some_of(..., type = c("greedy", "lazy", "possessive")) 24 | 25 | none_of(...) 
26 | 27 | except_any_of(..., type = c("greedy", "lazy", "possessive")) 28 | 29 | except_some_of(..., type = c("greedy", "lazy", "possessive")) 30 | 31 | range(start, end) 32 | 33 | `:`(start, end) 34 | 35 | exclude_range(start, end) 36 | } 37 | \arguments{ 38 | \item{x}{text to include in the character class (must be escaped manually)} 39 | 40 | \item{...}{\code{\link{shortcuts}}, R variables, text, or other \pkg{rex} 41 | functions.} 42 | 43 | \item{type}{the type of match to perform. 44 | 45 | There are three match types 46 | \enumerate{ 47 | \item \code{greedy}: match the longest string. This is the default matching type. 48 | \item \code{lazy}: match the shortest string. This matches the shortest string from the same anchor point, not necessarily the shortest global string. 49 | \item \code{possessive}: match and don't allow backtracking 50 | }} 51 | 52 | \item{start}{beginning of character class} 53 | 54 | \item{end}{end of character class} 55 | } 56 | \description{ 57 | There are multiple ways you can define a character class. 58 | } 59 | \section{Functions}{ 60 | \itemize{ 61 | \item \code{character_class}: explicitly define a character class 62 | 63 | \item \code{one_of}: matches one of the specified characters. 64 | 65 | \item \code{any_of}: matches zero or more of the specified characters. 66 | 67 | \item \code{some_of}: matches one or more of the specified characters. 68 | 69 | \item \code{none_of}: matches anything but one of the specified characters. 70 | 71 | \item \code{except_any_of}: matches zero or more of anything but the specified characters. 72 | 73 | \item \code{except_some_of}: matches one or more of anything but the specified characters. 74 | 75 | \item \code{range}: matches one of any of the characters in the range. 76 | 77 | \item \code{:}: matches one of any of the characters in the range. 78 | 79 | \item \code{exclude_range}: matches one of any of the characters except those in the range. 
80 | }} 81 | 82 | \examples{ 83 | # grey = gray 84 | re <- rex("gr", one_of("a", "e"), "y") 85 | grepl(re, c("grey", "gray")) # TRUE TRUE 86 | 87 | # Match non-vowels 88 | re <- rex(none_of("a", "e", "i", "o", "u")) 89 | # They can also be in the same string 90 | re <- rex(none_of("aeiou")) 91 | grepl(re, c("k", "l", "e")) # TRUE TRUE FALSE 92 | 93 | # Match range 94 | re <- rex(range("a", "e")) 95 | grepl(re, c("b", "d", "f")) # TRUE TRUE FALSE 96 | 97 | # Explicit creation 98 | re <- rex(character_class("abcd\\\\[")) 99 | grepl(re, c("a", "d", "[", "]")) # TRUE TRUE TRUE FALSE 100 | } 101 | \seealso{ 102 | Other rex: 103 | \code{\link{\%or\%}()}, 104 | \code{\link{capture}()}, 105 | \code{\link{counts}}, 106 | \code{\link{group}()}, 107 | \code{\link{lookarounds}}, 108 | \code{\link{not}()}, 109 | \code{\link{rex}()}, 110 | \code{\link{shortcuts}}, 111 | \code{\link{wildcards}} 112 | } 113 | \concept{rex} 114 | -------------------------------------------------------------------------------- /man/character_class_escape.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/escape.R 3 | \name{character_class_escape} 4 | \alias{character_class_escape} 5 | \alias{character_class_escape.regex} 6 | \alias{character_class_escape.character_class} 7 | \alias{character_class_escape.character} 8 | \alias{character_class_escape.list} 9 | \alias{character_class_escape.default} 10 | \title{Character class escapes} 11 | \usage{ 12 | character_class_escape(x) 13 | 14 | \method{character_class_escape}{regex}(x) 15 | 16 | \method{character_class_escape}{character_class}(x) 17 | 18 | \method{character_class_escape}{character}(x) 19 | 20 | \method{character_class_escape}{list}(x) 21 | 22 | \method{character_class_escape}{default}(x) 23 | } 24 | \arguments{ 25 | \item{x}{Object to escape.} 26 | } 27 | \description{ 28 | Character class escapes 29 | } 30 | 
\section{Methods (by class)}{ 31 | \itemize{ 32 | \item \code{regex}: objects are passed through unchanged. 33 | 34 | \item \code{character_class}: objects are passed through unchanged. 35 | 36 | \item \code{character}: objects properly escaped for character classes. 37 | 38 | \item \code{list}: call \code{character_class_escape} on all elements of the list. 39 | 40 | \item \code{default}: coerce to \code{character} and \code{character_class_escape}. 41 | }} 42 | 43 | -------------------------------------------------------------------------------- /man/counts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/counts.R 3 | \name{counts} 4 | \alias{counts} 5 | \alias{n_times} 6 | \alias{n} 7 | \alias{between} 8 | \alias{at_least} 9 | \alias{at_most} 10 | \title{Counts} 11 | \usage{ 12 | n_times(x, n, type = c("greedy", "lazy", "possessive")) 13 | 14 | between(x, low, high, type = c("greedy", "lazy", "possessive")) 15 | 16 | at_least(x, n, type = c("greedy", "lazy", "possessive")) 17 | 18 | at_most(x, n, type = c("greedy", "lazy", "possessive")) 19 | } 20 | \arguments{ 21 | \item{x}{A regex pattern.} 22 | 23 | \item{n}{An integer number} 24 | 25 | \item{type}{the type of match to perform. 26 | 27 | There are three match types 28 | \enumerate{ 29 | \item \code{greedy}: match the longest string. This is the default matching type. 30 | \item \code{lazy}: match the shortest string. This matches the shortest string from the same anchor point, not necessarily the shortest global string. 
31 | \item \code{possessive}: match and don't allow backtracking 32 | }} 33 | 34 | \item{low}{An integer number for the lower limit.} 35 | 36 | \item{high}{An integer number for the upper limit.} 37 | } 38 | \description{ 39 | Functions to restrict a regex to a specific number 40 | } 41 | \section{Functions}{ 42 | \itemize{ 43 | \item \code{n_times}: \code{x} must occur exactly \code{n} times. 44 | 45 | \item \code{between}: \code{x} must occur between \code{low} and \code{high} times. 46 | 47 | \item \code{at_least}: \code{x} must occur at least \code{n} times. 48 | 49 | \item \code{at_most}: \code{x} must occur at most \code{n} times. 50 | }} 51 | 52 | \seealso{ 53 | Other rex: 54 | \code{\link{\%or\%}()}, 55 | \code{\link{capture}()}, 56 | \code{\link{character_class}()}, 57 | \code{\link{group}()}, 58 | \code{\link{lookarounds}}, 59 | \code{\link{not}()}, 60 | \code{\link{rex}()}, 61 | \code{\link{shortcuts}}, 62 | \code{\link{wildcards}} 63 | } 64 | \concept{rex} 65 | -------------------------------------------------------------------------------- /man/escape.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/escape.R 3 | \name{escape} 4 | \alias{escape} 5 | \alias{escape.regex} 6 | \alias{escape.character_class} 7 | \alias{escape.character} 8 | \alias{escape.default} 9 | \alias{escape.list} 10 | \title{Escape characters for a regex} 11 | \usage{ 12 | escape(x) 13 | 14 | \method{escape}{regex}(x) 15 | 16 | \method{escape}{character_class}(x) 17 | 18 | \method{escape}{character}(x) 19 | 20 | \method{escape}{default}(x) 21 | 22 | \method{escape}{list}(x) 23 | } 24 | \arguments{ 25 | \item{x}{Object to escape.} 26 | } 27 | \description{ 28 | Escape characters for a regex 29 | } 30 | \section{Methods (by class)}{ 31 | \itemize{ 32 | \item \code{regex}: Objects are simply passed through unchanged. 
33 | 34 | \item \code{character_class}: Objects are surrounded by braces. 35 | 36 | \item \code{character}: Objects are properly escaped for regular expressions. 37 | 38 | \item \code{default}: default escape coerces to character and escapes. 39 | 40 | \item \code{list}: simply call escape on all elements of the list. 41 | }} 42 | 43 | -------------------------------------------------------------------------------- /man/group.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/capture.R 3 | \name{group} 4 | \alias{group} 5 | \title{Create a grouped expression} 6 | \usage{ 7 | group(...) 8 | } 9 | \arguments{ 10 | \item{...}{\code{\link{shortcuts}}, R variables, text, or other \pkg{rex} 11 | functions.} 12 | } 13 | \description{ 14 | This is similar to \code{\link{capture}} except that it does not store the 15 | value of the group. Best used when you want to combine several parts 16 | together and do not reference or extract the grouped value later. 17 | } 18 | \seealso{ 19 | \code{\link{capture}} for grouping with capturing. 
Perl 5 Extended 20 | Patterns \url{https://perldoc.perl.org/perlre#Extended-Patterns} 21 | 22 | Other rex: 23 | \code{\link{\%or\%}()}, 24 | \code{\link{capture}()}, 25 | \code{\link{character_class}()}, 26 | \code{\link{counts}}, 27 | \code{\link{lookarounds}}, 28 | \code{\link{not}()}, 29 | \code{\link{rex}()}, 30 | \code{\link{shortcuts}}, 31 | \code{\link{wildcards}} 32 | } 33 | \concept{rex} 34 | -------------------------------------------------------------------------------- /man/lookarounds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lookarounds.R 3 | \name{lookarounds} 4 | \alias{lookarounds} 5 | \alias{\%if_next_is\%} 6 | \alias{\%if_next_isnt\%} 7 | \alias{\%if_prev_is\%} 8 | \alias{\%if_prev_isnt\%} 9 | \title{Lookarounds} 10 | \usage{ 11 | x \%if_next_is\% y 12 | 13 | x \%if_next_isnt\% y 14 | 15 | x \%if_prev_is\% y 16 | 17 | x \%if_prev_isnt\% y 18 | } 19 | \arguments{ 20 | \item{x}{A regex pattern.} 21 | 22 | \item{y}{A regex pattern.} 23 | } 24 | \description{ 25 | Lookarounds 26 | } 27 | \details{ 28 | These functions provide an interface to perl lookarounds. 29 | 30 | Special binary functions are used to infer an ordering, since often you 31 | might wish to match a word / set of characters conditional on the start 32 | and end of that word. 
33 | 34 | \itemize{ 35 | \item \code{\%if_next_is\%}: \code{TRUE} if y follows x 36 | \item \code{\%if_next_isnt\%}: \code{TRUE} if y does not follow x 37 | \item \code{\%if_prev_is\%}: \code{TRUE} if y comes before x 38 | \item \code{\%if_prev_isnt\%}: \code{TRUE} if y does not come before x 39 | } 40 | } 41 | \examples{ 42 | stopifnot(grepl(rex("crab" \%if_next_is\% "apple"), "crabapple", perl = TRUE)) 43 | stopifnot(grepl(rex("crab" \%if_prev_is\% "apple"), "applecrab", perl = TRUE)) 44 | stopifnot(grepl(rex(range("a", "e") \%if_next_isnt\% range("f", "g")), 45 | "ah", perl = TRUE)) 46 | stopifnot(grepl(rex(range("a", "e") \%if_next_is\% range("f", "i")), 47 | "ah", perl = TRUE)) 48 | } 49 | \seealso{ 50 | Perl 5 Documentation \url{https://perldoc.perl.org/perlre#Extended-Patterns} 51 | 52 | Other rex: 53 | \code{\link{\%or\%}()}, 54 | \code{\link{capture}()}, 55 | \code{\link{character_class}()}, 56 | \code{\link{counts}}, 57 | \code{\link{group}()}, 58 | \code{\link{not}()}, 59 | \code{\link{rex}()}, 60 | \code{\link{shortcuts}}, 61 | \code{\link{wildcards}} 62 | } 63 | \concept{rex} 64 | -------------------------------------------------------------------------------- /man/not.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wildcards.R 3 | \name{not} 4 | \alias{not} 5 | \title{Do not match} 6 | \usage{ 7 | not(..., type = c("greedy", "lazy", "possessive")) 8 | } 9 | \arguments{ 10 | \item{...}{\code{\link{shortcuts}}, R variables, text, or other \pkg{rex} 11 | functions.} 12 | 13 | \item{type}{the type of match to perform. 14 | 15 | There are three match types 16 | \enumerate{ 17 | \item \code{greedy}: match the longest string. This is the default matching type. 18 | \item \code{lazy}: match the shortest string. This matches the shortest string from the same anchor point, not necessarily the shortest global string.
19 | \item \code{possessive}: match and don't allow backtracking 20 | }} 21 | } 22 | \description{ 23 | Do not match 24 | } 25 | \seealso{ 26 | Other rex: 27 | \code{\link{\%or\%}()}, 28 | \code{\link{capture}()}, 29 | \code{\link{character_class}()}, 30 | \code{\link{counts}}, 31 | \code{\link{group}()}, 32 | \code{\link{lookarounds}}, 33 | \code{\link{rex}()}, 34 | \code{\link{shortcuts}}, 35 | \code{\link{wildcards}} 36 | } 37 | \concept{rex} 38 | -------------------------------------------------------------------------------- /man/or.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/or.R 3 | \name{\%or\%} 4 | \alias{\%or\%} 5 | \alias{or} 6 | \title{Or} 7 | \usage{ 8 | x \%or\% y 9 | 10 | or(...) 11 | } 12 | \arguments{ 13 | \item{x}{A string.} 14 | 15 | \item{y}{A string.} 16 | 17 | \item{...}{\code{\link{shortcuts}}, R variables, text, or other \pkg{rex} 18 | functions.} 19 | } 20 | \description{ 21 | The special binary function \code{\%or\%} can be used to specify a set 22 | of optional matches. 23 | } 24 | \seealso{ 25 | Other rex: 26 | \code{\link{capture}()}, 27 | \code{\link{character_class}()}, 28 | \code{\link{counts}}, 29 | \code{\link{group}()}, 30 | \code{\link{lookarounds}}, 31 | \code{\link{not}()}, 32 | \code{\link{rex}()}, 33 | \code{\link{shortcuts}}, 34 | \code{\link{wildcards}} 35 | } 36 | \concept{rex} 37 | -------------------------------------------------------------------------------- /man/re_matches.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/match.R 3 | \name{re_matches} 4 | \alias{re_matches} 5 | \alias{matches} 6 | \alias{m} 7 | \title{Match function} 8 | \usage{ 9 | re_matches( 10 | data, 11 | pattern, 12 | global = FALSE, 13 | options = NULL, 14 | locations = FALSE, 15 | ... 
16 | ) 17 | } 18 | \arguments{ 19 | \item{data}{character vector to match against} 20 | 21 | \item{pattern}{regular expression to use for matching} 22 | 23 | \item{global}{use global matching} 24 | 25 | \item{options}{regular expression options} 26 | 27 | \item{locations}{rather than returning the values of the matched (or 28 | captured) string, return a \code{data.frame} of the match locations in the 29 | string.} 30 | 31 | \item{...}{options passed to regexpr or gregexpr} 32 | } 33 | \value{ 34 | if no captures, returns a logical vector the same length as the 35 | input character vector specifying if the relevant value matched or not. If 36 | there are captures in the regular expression, returns a \code{data.frame} with a 37 | column for each capture group. If \code{global} is \code{TRUE}, returns a 38 | list of \code{data.frame}s. 39 | } 40 | \description{ 41 | Match function 42 | } 43 | \examples{ 44 | string <- c("this is a", "test string") 45 | re_matches(string, rex("test")) # FALSE TRUE 46 | 47 | # named capture 48 | re_matches(string, rex(capture(alphas, name = "first_word"), space, 49 | capture(alphas, name = "second_word"))) 50 | # first_word second_word 51 | # 1 this is 52 | # 2 test string 53 | 54 | # capture returns NA when it fails to match 55 | re_matches(string, rex(capture("test"))) 56 | # 1 57 | # 1 <NA> 58 | # 2 test 59 | } 60 | \seealso{ 61 | \code{\link{regexp}} Section "Perl-like Regular Expressions" for a 62 | discussion of the supported options 63 | } 64 | -------------------------------------------------------------------------------- /man/re_substitutes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/match.R 3 | \name{re_substitutes} 4 | \alias{re_substitutes} 5 | \alias{substitutes} 6 | \alias{s} 7 | \title{Substitute regular expressions in a string with another string.} 8 | \usage{ 9 | re_substitutes(data, pattern,
replacement, global = FALSE, options = NULL, ...) 10 | } 11 | \arguments{ 12 | \item{data}{character vector to substitute} 13 | 14 | \item{pattern}{regular expression to match} 15 | 16 | \item{replacement}{replacement text to use} 17 | 18 | \item{global}{substitute all occurrences} 19 | 20 | \item{options}{option flags} 21 | 22 | \item{...}{options passed to sub or gsub} 23 | } 24 | \description{ 25 | Substitute regular expressions in a string with another string. 26 | } 27 | \examples{ 28 | string <- c("this is a Test", "string") 29 | re_substitutes(string, "test", "not a test", options = "insensitive") 30 | re_substitutes(string, "i", "x", global = TRUE) 31 | re_substitutes(string, "(test)", "not a \\\\1", options = "insensitive") 32 | } 33 | \seealso{ 34 | \code{\link{regexp}} Section "Perl-like Regular Expressions" for a 35 | discussion of the supported options 36 | } 37 | -------------------------------------------------------------------------------- /man/regex.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rex.R 3 | \name{as.character.regex} 4 | \alias{as.character.regex} 5 | \alias{print.regex} 6 | \alias{regex} 7 | \title{Regular Expression} 8 | \usage{ 9 | \method{as.character}{regex}(x, ...) 10 | 11 | \method{print}{regex}(x, ...) 12 | 13 | regex(x, ...) 14 | } 15 | \arguments{ 16 | \item{x}{Object} 17 | 18 | \item{...}{further arguments} 19 | } 20 | \description{ 21 | Specify an explicit regular expression. This expression must already be 22 | escaped. 23 | } 24 | \section{Methods (by generic)}{ 25 | \itemize{ 26 | \item \code{as.character}: coerce regex object to a character 27 | 28 | \item \code{print}: Print regex object 29 | }} 30 | 31 | \seealso{ 32 | \code{\link{as.regex}} to coerce to a regex object. 
33 | } 34 | -------------------------------------------------------------------------------- /man/register_shortcuts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rex.R 3 | \name{register_shortcuts} 4 | \alias{register_shortcuts} 5 | \title{Register the Rex shortcuts} 6 | \usage{ 7 | register_shortcuts(pkg_name) 8 | } 9 | \arguments{ 10 | \item{pkg_name}{the package to register the shortcuts in} 11 | } 12 | \description{ 13 | If you are using rex in another package you need to call this function to 14 | register all of the rex shortcuts, which prevents spurious NOTEs about global 15 | variables from being generated during R CMD check. 16 | } 17 | -------------------------------------------------------------------------------- /man/rex.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rex.R 3 | \name{rex} 4 | \alias{rex} 5 | \alias{rex_} 6 | \title{Generate a regular expression.} 7 | \usage{ 8 | rex(..., env = parent.frame()) 9 | } 10 | \arguments{ 11 | \item{...}{\code{\link{shortcuts}}, R variables, text, or other \pkg{rex} 12 | functions.} 13 | 14 | \item{env}{environment to evaluate the rex expression in.} 15 | } 16 | \description{ 17 | Generate a regular expression. 
18 | } 19 | \seealso{ 20 | Other rex: 21 | \code{\link{\%or\%}()}, 22 | \code{\link{capture}()}, 23 | \code{\link{character_class}()}, 24 | \code{\link{counts}}, 25 | \code{\link{group}()}, 26 | \code{\link{lookarounds}}, 27 | \code{\link{not}()}, 28 | \code{\link{shortcuts}}, 29 | \code{\link{wildcards}} 30 | } 31 | \concept{rex} 32 | -------------------------------------------------------------------------------- /man/rex_mode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rex-mode.R 3 | \name{rex_mode} 4 | \alias{rex_mode} 5 | \title{Toggles \pkg{rex} mode.} 6 | \usage{ 7 | rex_mode() 8 | } 9 | \description{ 10 | While within rex mode, functions used within the \code{\link{rex}} function 11 | are attached, so one can get e.g. auto-completion within editors. 12 | } 13 | -------------------------------------------------------------------------------- /man/shortcuts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/shortcuts.R 3 | \docType{data} 4 | \name{shortcuts} 5 | \alias{shortcuts} 6 | \title{Shortcuts} 7 | \format{ 8 | An object of class \code{shortcut} of length 116. 9 | } 10 | \usage{ 11 | shortcuts 12 | } 13 | \description{ 14 | Commonly used character classes and regular expressions. These shortcuts 15 | are substituted inside \code{rex} calls. 16 | } 17 | \details{ 18 | \code{names(shortcuts)} will give you the full list of available shortcuts. 
19 | } 20 | \seealso{ 21 | Other rex: 22 | \code{\link{\%or\%}()}, 23 | \code{\link{capture}()}, 24 | \code{\link{character_class}()}, 25 | \code{\link{counts}}, 26 | \code{\link{group}()}, 27 | \code{\link{lookarounds}}, 28 | \code{\link{not}()}, 29 | \code{\link{rex}()}, 30 | \code{\link{wildcards}} 31 | } 32 | \concept{rex} 33 | \keyword{datasets} 34 | -------------------------------------------------------------------------------- /man/single_shortcuts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/shortcuts.R 3 | \docType{data} 4 | \name{single_shortcuts} 5 | \alias{single_shortcuts} 6 | \title{Single shortcuts} 7 | \format{ 8 | An object of class \code{shortcut} of length 18. 9 | } 10 | \usage{ 11 | single_shortcuts 12 | } 13 | \description{ 14 | Each of these shortcuts has both a plural (-s) and inverse (non_) form. 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/wildcards.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wildcards.R 3 | \name{wildcards} 4 | \alias{wildcards} 5 | \alias{zero_or_more} 6 | \alias{one_or_more} 7 | \alias{maybe} 8 | \alias{zero_or_one} 9 | \title{Wildcards} 10 | \usage{ 11 | zero_or_more(..., type = c("greedy", "lazy", "possessive")) 12 | 13 | one_or_more(..., type = c("greedy", "lazy", "possessive")) 14 | 15 | maybe(..., type = c("greedy", "lazy", "possessive")) 16 | } 17 | \arguments{ 18 | \item{...}{\code{\link{shortcuts}}, R variables, text, or other \pkg{rex} 19 | functions.} 20 | 21 | \item{type}{the type of match to perform. 22 | 23 | There are three match types 24 | \enumerate{ 25 | \item \code{greedy}: match the longest string. This is the default matching type. 
26 | \item \code{lazy}: match the shortest string. This matches the shortest string from the same anchor point, not necessarily the shortest global string. 27 | \item \code{possessive}: match and don't allow backtracking 28 | }} 29 | } 30 | \description{ 31 | Wildcards 32 | } 33 | \section{Functions}{ 34 | \itemize{ 35 | \item \code{zero_or_more}: match \code{...} zero or more times. 36 | 37 | \item \code{one_or_more}: match \code{...} one or more times. 38 | 39 | \item \code{maybe}: match \code{...} zero or one times. 40 | }} 41 | 42 | \seealso{ 43 | Other rex: 44 | \code{\link{\%or\%}()}, 45 | \code{\link{capture}()}, 46 | \code{\link{character_class}()}, 47 | \code{\link{counts}}, 48 | \code{\link{group}()}, 49 | \code{\link{lookarounds}}, 50 | \code{\link{not}()}, 51 | \code{\link{rex}()}, 52 | \code{\link{shortcuts}} 53 | } 54 | \concept{rex} 55 | -------------------------------------------------------------------------------- /revdep/README.md: -------------------------------------------------------------------------------- 1 | # Revdeps 2 | 3 | ## All (16) 4 | 5 | |package |version |error |warning |note | 6 | |:------------------------------------|:--------|:-----|:-------|:----| 7 | |bdpar |2.0.0 | | | | 8 | |covr |3.5.0 | | | | 9 | |[datarobot](problems.md#datarobot) |2.17.0 | | |2 | 10 | |dparser |0.1.8 | | | | 11 | |gramEvol |2.1-3 | | | | 12 | |lintr |2.0.1 | | | | 13 | |[mlr](problems.md#mlr) |2.17.1 | | |1 | 14 | |mlrCPO |0.3.6 | | | | 15 | |namedCapture |2019.8.7 | | | | 16 | |[nlmixr](problems.md#nlmixr) |1.1.1-7 | | |1 | 17 | |OpenML |1.10 | | | | 18 | |ore |1.6.3 | | | | 19 | |[roxygen2md](problems.md#roxygen2md) |1.0.0 | | |1 | 20 | |[RxODE](problems.md#rxode) |0.9.2-0 |1 | | | 21 | |table.express |0.3.1 | | | | 22 | |todor |0.1.0 | | | | 23 | 24 | -------------------------------------------------------------------------------- /revdep/check.R: -------------------------------------------------------------------------------- 1 | 
library("devtools") 2 | 3 | revdep_check() 4 | revdep_check_save_summary() 5 | -------------------------------------------------------------------------------- /revdep/checks.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-lib/rex/7a8fb36a29919c3d447cc823c672af9b5cf1fd68/revdep/checks.rds -------------------------------------------------------------------------------- /revdep/failures.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. :)* -------------------------------------------------------------------------------- /revdep/problems.md: -------------------------------------------------------------------------------- 1 | # datarobot 2 | 3 |
4 | 5 | * Version: 2.17.0 6 | * Source code: https://github.com/cran/datarobot 7 | * Date/Publication: 2020-02-22 05:50:07 UTC 8 | * Number of recursive dependencies: 100 9 | 10 | Run `revdep_details(,"datarobot")` for more info 11 | 12 |
13 | 14 | ## In both 15 | 16 | * checking installed package size ... NOTE 17 | ``` 18 | installed size is 8.9Mb 19 | sub-directories of 1Mb or more: 20 | doc 2.3Mb 21 | extdata 5.5Mb 22 | ``` 23 | 24 | * checking dependencies in R code ... NOTE 25 | ``` 26 | Namespace in Imports field not imported from: ‘curl’ 27 | All declared Imports should be used. 28 | ``` 29 | 30 | # mlr 31 | 32 |
33 | 34 | * Version: 2.17.1 35 | * Source code: https://github.com/cran/mlr 36 | * URL: https://mlr.mlr-org.com, https://github.com/mlr-org/mlr 37 | * BugReports: https://github.com/mlr-org/mlr/issues 38 | * Date/Publication: 2020-03-24 10:40:02 UTC 39 | * Number of recursive dependencies: 389 40 | 41 | Run `revdep_details(,"mlr")` for more info 42 | 43 |
44 | 45 | ## In both 46 | 47 | * checking installed package size ... NOTE 48 | ``` 49 | installed size is 5.0Mb 50 | sub-directories of 1Mb or more: 51 | R 1.5Mb 52 | data 2.3Mb 53 | ``` 54 | 55 | # nlmixr 56 | 57 |
58 | 59 | * Version: 1.1.1-7 60 | * Source code: https://github.com/cran/nlmixr 61 | * URL: https://github.com/nlmixrdevelopment/nlmixr 62 | * Date/Publication: 2020-03-18 22:50:02 UTC 63 | * Number of recursive dependencies: 152 64 | 65 | Run `revdep_details(,"nlmixr")` for more info 66 | 67 |
68 | 69 | ## In both 70 | 71 | * checking installed package size ... NOTE 72 | ``` 73 | installed size is 21.3Mb 74 | sub-directories of 1Mb or more: 75 | libs 19.6Mb 76 | ``` 77 | 78 | # roxygen2md 79 | 80 |
81 | 82 | * Version: 1.0.0 83 | * Source code: https://github.com/cran/roxygen2md 84 | * URL: https://roxygen2md.r-lib.org, https://github.com/r-lib/roxygen2md 85 | * BugReports: https://github.com/r-lib/roxygen2md/issues 86 | * Date/Publication: 2019-06-17 15:40:03 UTC 87 | * Number of recursive dependencies: 76 88 | 89 | Run `revdep_details(,"roxygen2md")` for more info 90 | 91 |
92 | 93 | ## In both 94 | 95 | * checking dependencies in R code ... NOTE 96 | ``` 97 | Namespace in Imports field not imported from: ‘withr’ 98 | All declared Imports should be used. 99 | ``` 100 | 101 | # RxODE 102 | 103 |
104 | 105 | * Version: 0.9.2-0 106 | * Source code: https://github.com/cran/RxODE 107 | * URL: https://nlmixrdevelopment.github.io/RxODE/ 108 | * BugReports: https://github.com/nlmixrdevelopment/RxODE/issues 109 | * Date/Publication: 2020-03-13 07:10:14 UTC 110 | * Number of recursive dependencies: 132 111 | 112 | Run `revdep_details(,"RxODE")` for more info 113 | 114 |
115 | 116 | ## In both 117 | 118 | * checking package dependencies ... ERROR 119 | ``` 120 | Packages required but not available: 121 | 'knitr', 'PreciseSums', 'Rcpp', 'brew', 'cli', 'dparser', 'ggplot2', 122 | 'inline', 'magrittr', 'memoise', 'mvnfast', 'pillar', 'sys', 'units', 123 | 'assertthat', 'lotri', 'RcppArmadillo' 124 | 125 | Packages suggested but not available: 126 | 'DT', 'data.table', 'shiny', 'testthat', 'usethis', 'devtools', 127 | 'covr', 'rmarkdown', 'SnakeCharmR', 'dplyr', 'tidyr', 'tibble', 128 | 'curl', 'gridExtra', 'microbenchmark', 'scales', 'stringi', 129 | 'htmltools', 'reticulate', 'rlang', 'installr', 'learnr', 'remotes', 130 | 'crayon', 'xgxr', 'digest', 'vdiffr', 'ggrepel' 131 | 132 | VignetteBuilder package required for checking but not installed: ‘knitr’ 133 | 134 | The suggested packages are required for a complete check. 135 | Checking can be attempted without them by setting the environment 136 | variable _R_CHECK_FORCE_SUGGESTS_ to a false value. 137 | 138 | See section ‘The DESCRIPTION file’ in the ‘Writing R Extensions’ 139 | manual. 
140 | ``` 141 | 142 | -------------------------------------------------------------------------------- /rex.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(rex) 3 | 4 | test_check("rex") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-aaa.R: -------------------------------------------------------------------------------- 1 | context("aaa") 2 | test_that("register adds a function to the .rex$env enviornment", { 3 | a <- identity 4 | 5 | register(a) 6 | 7 | expect_true("a" %in% ls(envir=.rex$env)) 8 | 9 | rm("a", envir = .rex$env) 10 | }) 11 | 12 | test_that("register_object adds all the objects to the .rex$env environment", { 13 | b <- list(x=1, y = 2) 14 | 15 | register_object(b) 16 | 17 | expect_true("x" %in% ls(envir=.rex$env)) 18 | 19 | expect_true("y" %in% ls(envir=.rex$env)) 20 | 21 | expect_false("b" %in% ls(envir=.rex$env)) 22 | 23 | rm("x", envir = .rex$env) 24 | 25 | rm("y", envir = .rex$env) 26 | }) 27 | -------------------------------------------------------------------------------- /tests/testthat/test-capture.R: -------------------------------------------------------------------------------- 1 | context("capture") 2 | test_that("matches basic characters", { 3 | x <- "text" 4 | re <- 
rex(capture(x)) 5 | 6 | expect_equal(re, regex("(text)")) 7 | 8 | expect_true(grepl(re, x)) 9 | 10 | expect_equal(gsub(re, "\\1", x), x) 11 | 12 | expect_equal(gsub(re, "replacement", x), "replacement") 13 | }) 14 | 15 | test_that("escapes special characters", { 16 | x <- "^[x$\\" 17 | re <- rex(capture(x)) 18 | 19 | expect_equal(re, regex("(\\^\\[x\\$\\\\)")) 20 | 21 | expect_true(grepl(re, x)) 22 | 23 | expect_equal(gsub(re, "\\1", x), x) 24 | 25 | expect_equal(gsub(re, "replacement", x), "replacement") 26 | }) 27 | 28 | test_that("examples work", { 29 | re <- rex( 30 | # first quotation mark 31 | capture(quote), 32 | 33 | # match all non-matching quotation marks 34 | zero_or_more(except(capture_group(1))), 35 | 36 | # end quotation mark (matches first) 37 | capture_group(1) 38 | ) 39 | 40 | expect_equal(re, regex("(['\"])(?:[^\\g{1}])*\\g{1}")) 41 | 42 | lapply(c("\"\"", "\"'\"", "\"arst\"", "''", "'arst'", "'\"'"), 43 | function(x) { 44 | expect_true(grepl(re, x, perl = TRUE), info=x) 45 | } 46 | ) 47 | 48 | lapply(c("'a", "'asr\""), function(x) { 49 | expect_false(grepl(re, x, perl = TRUE), info = x) 50 | }) 51 | 52 | }) 53 | 54 | context("named capture") 55 | test_that("examples work", { 56 | re <- rex( 57 | capture(name = "fruit", or("apple", "orange")), 58 | "=", 59 | capture_group("fruit") 60 | ) 61 | 62 | expect_true(grepl(re, "apple=apple", perl=TRUE)) 63 | expect_true(grepl(re, "orange=orange", perl=TRUE)) 64 | expect_false(grepl(re, "apple=orange", perl=TRUE)) 65 | }) 66 | -------------------------------------------------------------------------------- /tests/testthat/test-character_class.R: -------------------------------------------------------------------------------- 1 | context("one_of") 2 | test_that("simple text is correct", { 3 | re <- rex(one_of(1:9)) 4 | 5 | expect_equal(re, regex("[123456789]")) 6 | lapply(1:9, function(x) { 7 | expect_true(grepl(re, x, perl = TRUE), info=x) 8 | }) 9 | expect_false(grepl(re, "a", perl = TRUE)) 10 | 11 | 12 | 
vals <- c("a", "b", "c", 0, 3, 5) 13 | 14 | re <- rex(one_of(vals)) 15 | 16 | expect_equal(re, regex("[abc035]")) 17 | 18 | lapply(vals, function(x) { 19 | expect_true(grepl(re, x, perl = TRUE), info = x) 20 | }) 21 | 22 | expect_false(grepl(re, "d", perl = TRUE)) 23 | }) 24 | 25 | test_that("escapes correctly", { 26 | vals <- c("[", "]") 27 | 28 | re <- rex(one_of(vals)) 29 | 30 | expect_equal(re, regex("[\\[\\]]")) 31 | 32 | lapply(vals, function(x) { 33 | expect_true(grepl(re, x, perl = TRUE), info = x) 34 | }) 35 | 36 | expect_false(grepl(re, "{", perl = TRUE)) 37 | }) 38 | 39 | tests <- c(quote("a"), 40 | quote(any), 41 | quote(quote), 42 | quote(quotes), 43 | quote(lower), 44 | quote(upper), 45 | quote(list(upper, lower)), 46 | quote(list("[", "]"))) 47 | 48 | test_that("any_of equals one_of() plus *", { 49 | lapply(tests, 50 | function(x) { 51 | re1 <- regex(paste0(rex(one_of(eval(x))), "*")) 52 | re2 <- rex(any_of(eval(x))) 53 | 54 | expect_equal(re1, re2, info = paste(sep=" : ", re1, re2)) 55 | }) 56 | }) 57 | 58 | test_that("some_of equals one_of plus +", { 59 | lapply(tests, 60 | function(x) { 61 | re1 <- regex(paste0(rex(one_of(eval(x))), "+")) 62 | re2 <- rex(some_of(eval(x))) 63 | 64 | expect_equal(re1, re2, info = paste(sep=" : ", re1, re2)) 65 | }) 66 | }) 67 | 68 | test_that("except_any equals none_of() plus *", { 69 | lapply(tests, 70 | function(x) { 71 | re1 <- regex(paste0(rex(none_of(eval(x))), "*")) 72 | re2 <- rex(except_any_of(eval(x))) 73 | 74 | expect_equal(re1, re2, info = paste(sep=" : ", re1, re2)) 75 | }) 76 | }) 77 | 78 | test_that("except_some equals none_of() plus +", { 79 | lapply(tests, 80 | function(x) { 81 | re1 <- regex(paste0(rex(none_of(eval(x))), "+")) 82 | re2 <- rex(except_some_of(eval(x))) 83 | 84 | expect_equal(re1, re2, info = paste(sep=" : ", re1, re2)) 85 | }) 86 | }) 87 | 88 | context("none_of") 89 | test_that("simple text is correct", { 90 | re <- rex(none_of(1:9)) 91 | 92 | expect_equal(re, regex("[^123456789]")) 
93 | lapply(1:9, function(x) { 94 | expect_false(grepl(re, x, perl = TRUE), info=x) 95 | }) 96 | expect_true(grepl(re, "a", perl = TRUE)) 97 | 98 | 99 | vals <- c("a", "b", "c", 0, 3, 5) 100 | 101 | re <- rex(none_of(vals)) 102 | 103 | expect_equal(re, regex("[^abc035]")) 104 | 105 | lapply(vals, function(x) { 106 | expect_false(grepl(re, x, perl = TRUE), info = x) 107 | }) 108 | 109 | expect_true(grepl(re, "d", perl = TRUE)) 110 | }) 111 | 112 | test_that("escapes correctly", { 113 | vals <- c("[", "]") 114 | 115 | re <- rex(none_of(vals)) 116 | 117 | expect_equal(re, regex("[^\\[\\]]")) 118 | 119 | lapply(vals, function(x) { 120 | expect_false(grepl(re, x, perl = TRUE), info = x) 121 | }) 122 | 123 | expect_true(grepl(re, "{", perl = TRUE)) 124 | }) 125 | 126 | context("range") 127 | test_that("matches basic characters", { 128 | re <- rex(range(1, 3)) 129 | 130 | expect_equal(re, regex("[1-3]")) 131 | 132 | lapply(1:3, function(x) { 133 | expect_true(grepl(re, x), info=x) 134 | }) 135 | 136 | lapply(4:9, function(x) { 137 | expect_false(grepl(re, x), info=x) 138 | }) 139 | 140 | }) 141 | test_that("escapes special characters", { 142 | re <- rex(range("[", "}")) 143 | 144 | expect_equal(re, regex("[\\[-}]")) 145 | 146 | lapply(c("[", "}"), function(x) { 147 | expect_true(grepl(re, x), info=x) 148 | }) 149 | 150 | }) 151 | 152 | context("exclude_range") 153 | test_that("matches basic characters", { 154 | re <- rex(exclude_range(1, 3)) 155 | 156 | expect_equal(re, regex("[^1-3]")) 157 | 158 | lapply(1:3, function(x) { 159 | expect_false(grepl(re, x, perl = TRUE), info=x) 160 | }) 161 | 162 | lapply(4:9, function(x) { 163 | expect_true(grepl(re, x, perl = TRUE), info=x) 164 | }) 165 | 166 | }) 167 | 168 | context("one_of") 169 | test_that("matches basic characters", { 170 | expect_equal(rex(one_of("a", "b", "rst")), regex("[abrst]")) 171 | }) 172 | 173 | test_that("escapes special characters", { 174 | expect_equal(rex(one_of("^", "b", "\\")), regex("[\\^b\\\\]")) 175 
| }) 176 | 177 | context("except") 178 | test_that("matches basic characters", { 179 | expect_equal(rex(except("a", "b", "rst")), regex("[^abrst]")) 180 | }) 181 | 182 | test_that("escapes special characters", { 183 | expect_equal(rex(except("^", "b")), regex("[^\\^b]")) 184 | }) 185 | 186 | test_that("none_of is the same as except", { 187 | expect_equal(rex(none_of("^", "b", 1:10)), rex(except("^", "b", 1:10))) 188 | }) 189 | 190 | context("character_class") 191 | test_that("examples are correct", { 192 | # grey = gray 193 | re <- rex("gr", one_of("a", "e"), "y") 194 | expect_equal(grepl(re, c("grey", "gray")), c(TRUE, TRUE)) # TRUE TRUE 195 | 196 | # Match non-vowels 197 | re <- rex(none_of("a", "e", "i", "o", "u")) 198 | # They can also be in the same string 199 | re2 <- rex(none_of("aeiou")) 200 | expect_identical(re, re2) 201 | expect_equal(grepl(re, c("k", "l", "e")), c(TRUE, TRUE, FALSE)) # TRUE TRUE FALSE 202 | 203 | # Match range 204 | re <- rex(range("a", "e")) 205 | expect_equal(grepl(re, c("b", "d", "f")), c(TRUE, TRUE, FALSE)) # TRUE TRUE FALSE 206 | 207 | # Explicit creation (note you have to escape manually here) 208 | re <- rex(character_class("abcd\\[")) 209 | expect_equal(grepl(re, c("a", "d", "[", "]")), c(TRUE, TRUE, TRUE, FALSE)) # TRUE TRUE TRUE FALSE 210 | }) 211 | test_that("escapes special characters", { 212 | re <- rex(exclude_range("[", "}")) 213 | 214 | expect_equal(re, regex("[^\\[-}]")) 215 | 216 | lapply(c("[", "}"), function(x) { 217 | expect_false(grepl(re, x, perl = TRUE), info=x) 218 | }) 219 | 220 | expect_true(grepl(re, "A", perl = TRUE)) 221 | }) 222 | -------------------------------------------------------------------------------- /tests/testthat/test-common.R: -------------------------------------------------------------------------------- 1 | context("rex") 2 | 3 | test_that("start works", { 4 | 5 | r <- rex( 6 | start, letter 7 | ) 8 | 9 | expect_true(grepl(r, "abcdef")) 10 | expect_false(grepl(r, "123456", perl = TRUE)) 11 
| 12 | }) 13 | 14 | test_that("end works", { 15 | 16 | r <- rex( 17 | "Z", end 18 | ) 19 | 20 | expect_true(grepl(r, "abcZ")) 21 | expect_false(grepl(r, "abc")) 22 | 23 | }) 24 | 25 | test_that("version parsing works", { 26 | 27 | r <- rex( 28 | start, 29 | capture(numbers), 30 | any, 31 | capture(numbers), 32 | any, 33 | capture(numbers), 34 | any, 35 | capture(numbers), 36 | end 37 | ) 38 | 39 | expect_identical( 40 | gsub(r, "\\1 \\2 \\3 \\4", "3.1.1-1", perl = TRUE), 41 | "3 1 1 1" 42 | ) 43 | 44 | }) 45 | 46 | test_that("verbs in rex work", { 47 | 48 | r <- rex( 49 | start, "foo", zero_or_more(any), "bar", end 50 | ) 51 | 52 | expect_true(grepl(r, "fooABCbar", perl = TRUE)) 53 | expect_true(grepl(r, "foo123\tbar", perl = TRUE)) 54 | 55 | }) 56 | 57 | test_that("Simple URL parsing works", { 58 | 59 | # TODO: get these working better again 60 | ## Decompose a URL into its components. 61 | ## Example by LT (http://www.cs.uiowa.edu/~luke/R/regexp.html). 62 | x <- "http://stat.umn.edu:80/xyz" 63 | re <- "^(?:(((?:(?:(?!:).)*)+)://))?((?:(?:(?!:/).)*)+)(?:(:((?:[[:digit:]]+)+)))?(?:(/(?:.)*))?$" 64 | #m <- regexec(re, x) 65 | #m 66 | #regmatches(x, m) 67 | ## Element 3 is the protocol, 4 is the host, 6 is the port, and 7 68 | ## is the path. 
We can use this to make a function for extracting the 69 | ## parts of a URL: 70 | URL_parts <- function(x) { 71 | m <- regexec(re, x) 72 | parts <- do.call(rbind, 73 | lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L))) 74 | colnames(parts) <- c("protocol","host","port","path") 75 | parts 76 | } 77 | #URL_parts(x) 78 | 79 | r <- rex( 80 | 81 | start, 82 | 83 | ## match the protocol -- may exist or may not 84 | maybe(capture( 85 | capture(except_some_of(":")), 86 | "://" 87 | )), 88 | 89 | ## match the path 90 | capture(one_or_more(not(":/"))), 91 | 92 | ## get the port 93 | maybe(capture(":", capture(numbers))), 94 | 95 | ## and the rest 96 | maybe(capture("/", anything)), 97 | 98 | end 99 | 100 | ) 101 | 102 | rbind(r = r, m = re) 103 | n <- gregexpr(r, x, perl = TRUE)[[1]] 104 | split_matches <- function(string, matches) { 105 | starts <- attr(matches, "capture.start") 106 | lengths <- attr(matches, "capture.length") 107 | ends <- starts + lengths - 1 108 | c(string, substring(string, starts, ends)) 109 | } 110 | 111 | #split_matches(x, n) 112 | #regmatches(x, m)[[1]] 113 | #expect_equal(regmatches(x, m)[[1]], split_matches(x, n)) 114 | 115 | }) 116 | 117 | context("URL Validation") 118 | test_that("URL Validation works", { 119 | valid_chars <- rex(except_some_of(".", "/", " ", "-")) 120 | 121 | `%>%` <- magrittr::`%>%` 122 | 123 | re <- rex( 124 | start, 125 | 126 | # protocol identifier (optional) + // 127 | group(list("http", maybe("s")) %or% "ftp", "://"), 128 | 129 | # user:pass authentication (optional) 130 | maybe(non_spaces, 131 | maybe(":", zero_or_more(non_space)), 132 | "@"), 133 | 134 | #host name 135 | group(zero_or_more(valid_chars, zero_or_more("-")), one_or_more(valid_chars)), 136 | 137 | #domain name 138 | zero_or_more(".", zero_or_more(valid_chars, zero_or_more("-")), one_or_more(valid_chars)), 139 | 140 | #TLD identifier 141 | group(".", valid_chars %>% at_least(2)), 142 | 143 | # server port number (optional) 144 | maybe(":", digit %>% 
between(2, 5)), 145 | 146 | # resource path (optional) 147 | maybe("/", non_space %>% zero_or_more()), 148 | 149 | end 150 | ) 151 | 152 | good <- c("http://foo.com/blah_blah", 153 | "http://foo.com/blah_blah/", 154 | "http://foo.com/blah_blah_(wikipedia)", 155 | "http://foo.com/blah_blah_(wikipedia)_(again)", 156 | "http://www.example.com/wpstyle/?p=364", 157 | "https://www.example.com/foo/?bar=baz&inga=42&quux", 158 | "http://✪df.ws/123", 159 | "http://userid:password@example.com:8080", 160 | "http://userid:password@example.com:8080/", 161 | "http://userid@example.com", 162 | "http://userid@example.com/", 163 | "http://userid@example.com:8080", 164 | "http://userid@example.com:8080/", 165 | "http://userid:password@example.com", 166 | "http://userid:password@example.com/", 167 | "http://➡.ws/䨹", 168 | "http://⌘.ws", 169 | "http://⌘.ws/", 170 | "http://foo.com/blah_(wikipedia)#cite-1", 171 | "http://foo.com/blah_(wikipedia)_blah#cite-1", 172 | "http://foo.com/unicode_(✪)_in_parens", 173 | "http://foo.com/(something)?after=parens", 174 | "http://☺.damowmow.com/", 175 | "http://code.google.com/events/#&product=browser", 176 | "http://j.mp", 177 | "ftp://foo.bar/baz", 178 | "http://foo.bar/?q=Test%20URL-encoded%20stuff", 179 | "http://مثال.إختبار", 180 | "http://例子.测试", 181 | "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com", 182 | "http://1337.net", 183 | "http://a.b-c.de", 184 | "http://223.255.255.254") 185 | 186 | bad <- c( 187 | "http://", 188 | "http://.", 189 | "http://..", 190 | "http://../", 191 | "http://?", 192 | "http://??", 193 | "http://??/", 194 | "http://#", 195 | "http://##", 196 | "http://##/", 197 | "http://foo.bar?q=Spaces should be encoded", 198 | "//", 199 | "//a", 200 | "///a", 201 | "///", 202 | "http:///a", 203 | "foo.com", 204 | "rdar://1234", 205 | "h://test", # nolint 206 | "http:// shouldfail.com", 207 | ":// should fail", 208 | "http://foo.bar/foo(bar)baz quux", 209 | "ftps://foo.bar/", 210 | "http://-error-.invalid/", 211 | 
"http://-a.b.co", 212 | "http://a.b-.co", 213 | "http://0.0.0.0", 214 | "http://3628126748", 215 | "http://.www.foo.bar/", 216 | "http://www.foo.bar./", 217 | "http://.www.foo.bar./") 218 | 219 | lapply(good, function(x) { 220 | expect_true(grepl(re, x, perl = TRUE), info=x) 221 | }) 222 | lapply(bad, function(x) { 223 | expect_false(grepl(re, x, perl = TRUE), info=x) 224 | }) 225 | }) 226 | 227 | context("start") 228 | test_that("matches basic characters", { 229 | expect_equal(rex(start, "f"), regex("^f")) 230 | }) 231 | 232 | test_that("escapes special characters", { 233 | expect_equal(rex(start, "."), regex("^\\.")) 234 | }) 235 | 236 | test_that("matches basic characters", { 237 | expect_equal(rex(start, "x" %>% n_times(3)), regex("^(?:x){3}")) 238 | }) 239 | 240 | test_that("matches special identifiers", { 241 | expect_equal(rex(start, number %>% n_times(2)), regex("^(?:[[:digit:]]){2}")) 242 | }) 243 | 244 | context("appending expressions") 245 | test_that("adds basic characters", { 246 | expect_equal(rex("x", "y", "z"), 247 | regex("xyz")) 248 | expect_equal(rex("x", maybe("y"), "z"), 249 | regex("x(?:y)?z")) 250 | }) 251 | 252 | test_that("escapes special characters", { 253 | expect_equal(rex(numbers %>% between(0, 2), ".", "$", end), 254 | regex("(?:[[:digit:]]+){0,2}\\.\\$$")) 255 | }) 256 | 257 | context("shortcuts - end") 258 | test_that("matches basic characters", { 259 | expect_equal(rex("x", "y", end), 260 | regex("xy$")) 261 | }) 262 | 263 | 264 | test_that("escapes special characters", { 265 | expect_equal(rex("x", "$", end), 266 | regex("x\\$$")) 267 | }) 268 | 269 | context("general regex") 270 | test_that("returns a well-formed regex", { 271 | expect_equal(rex(start, "w", "x" %or% "y", "z", end), 272 | regex("^w(?:x|y)z$")) 273 | }) 274 | 275 | context("rex examples") 276 | re <- 277 | rex(start, 278 | number %>% n_times(3), 279 | "-", 280 | letter %>% n_times(2), 281 | maybe("#"), 282 | "a" %or% "b", 283 | "c" %>% between(2, 4), 284 | "$", 285 
| end 286 | ) 287 | 288 | expect_true(grepl(re, "123-xy#accc$", perl=TRUE)) 289 | expect_true(grepl(re, "999-dfbcc$")) 290 | expect_false(grepl(re, "000-df#baccccccccc$")) 291 | expect_false(grepl(re, "444-dd3ac$")) 292 | 293 | context("issues") 294 | test_that("#11 Modifiers and named character classes", { 295 | p <- rex(none_of(alpha)) 296 | expect_true(grepl(p, "6", perl = TRUE)) 297 | }) 298 | -------------------------------------------------------------------------------- /tests/testthat/test-counts.R: -------------------------------------------------------------------------------- 1 | context("n_times") 2 | `%>%` <- magrittr::`%>%` 3 | test_that("description", { 4 | re1 <- rex("x" %>% n_times(2)) 5 | re2 <- rex("x" %>% n(2)) 6 | 7 | expect_identical(re1, re2) 8 | 9 | expect_equal(re1, 10 | regex("(?:x){2}")) 11 | 12 | expect_true(grepl(re1, "xx")) 13 | expect_false(grepl(re1, "x")) 14 | }) 15 | 16 | context("between") 17 | test_that("creates a bounded repetition", { 18 | re <- rex("x" %>% between(2, 4)) 19 | expect_equal(re, regex("(?:x){2,4}")) 20 | 21 | expect_true(grepl(re, "xxx")) 22 | expect_false(grepl(re, "x")) 23 | }) 24 | 25 | context("at_least") 26 | test_that("creates a bounded repetition", { 27 | re <- rex("x" %>% at_least(3)) 28 | expect_equal(re, regex("(?:x){3,}")) 29 | 30 | expect_true(grepl(re, "xxx")) 31 | expect_false(grepl(re, "xx")) 32 | }) 33 | 34 | context("at_most") 35 | test_that("creates a repetition of n times at most", { 36 | re <- rex(start, "x" %>% at_most(3), end) 37 | expect_equal(re, regex("^(?:x){0,3}$")) 38 | 39 | expect_true(grepl(re, "xxx")) 40 | expect_false(grepl(re, "xxxxx")) 41 | expect_true(grepl(re, "xxx", perl = TRUE)) 42 | expect_false(grepl(re, "xxxxx", perl = TRUE)) 43 | }) 44 | -------------------------------------------------------------------------------- /tests/testthat/test-escape.R: -------------------------------------------------------------------------------- 1 | context("escape") 2 | test_that("default 
escape works properly", { 3 | expect_equal(escape(1), structure("1", class="regex")) 4 | }) 5 | -------------------------------------------------------------------------------- /tests/testthat/test-lookarounds.R: -------------------------------------------------------------------------------- 1 | context("lookarounds") 2 | test_that("lookarounds work", { 3 | 4 | r <- rex( 5 | start, "foo" %if_next_isnt% "bar" 6 | ) 7 | 8 | expect_true(grepl(r, "fooba", perl = TRUE)) 9 | expect_false(grepl(r, "foobar", perl = TRUE)) 10 | }) 11 | context("if_next_is") 12 | test_that("matches basic characters", { 13 | re <- rex("a" %if_next_is% "b") 14 | expect_equal(re, regex("(?:a(?=b))")) 15 | expect_true(grepl(re, "ab", perl=TRUE)) 16 | expect_false(grepl(re, "ac", perl=TRUE)) 17 | }) 18 | test_that("escapes special characters", { 19 | re <- rex("[" %if_next_is% "?=") 20 | expect_equal(re, regex("(?:\\[(?=\\?=))")) 21 | expect_true(grepl(re, "[?=", perl=TRUE)) 22 | expect_false(grepl(re, "?=[", perl=TRUE)) 23 | }) 24 | 25 | context("if_next_isnt") 26 | test_that("matches basic characters", { 27 | re <- rex("a" %if_next_isnt% "b") 28 | expect_equal(re, regex("(?:a(?!b))")) 29 | expect_true(grepl(re, "ac", perl=TRUE)) 30 | expect_false(grepl(re, "ab", perl=TRUE)) 31 | }) 32 | test_that("escapes special characters", { 33 | re <- rex("[" %if_next_isnt% "?=") 34 | expect_equal(re, regex("(?:\\[(?!\\?=))")) 35 | expect_true(grepl(re, "?=[", perl=TRUE)) 36 | expect_false(grepl(re, "[?=", perl=TRUE)) 37 | }) 38 | 39 | context("if_prev_is") 40 | test_that("matches basic characters", { 41 | re <- rex("a" %if_prev_is% "b") 42 | expect_equal(re, regex("(?:(?<=b)a)")) 43 | expect_true(grepl(re, "ba", perl=TRUE)) 44 | expect_false(grepl(re, "ab", perl=TRUE)) 45 | }) 46 | test_that("escapes special characters", { 47 | re <- rex("[" %if_prev_is% "<=?") 48 | expect_equal(re, regex("(?:(?<=<=\\?)\\[)")) 49 | expect_true(grepl(re, "<=?[", perl=TRUE)) 50 | expect_false(grepl(re, "[b", perl=TRUE)) 51 
| }) 52 | 53 | context("if_prev_isnt") 54 | test_that("matches basic characters", { 55 | re <- rex("a" %if_prev_isnt% "b") 56 | expect_equal(re, regex("(?:(? 7 | %\VignetteIndexEntry{Server Log Parsing} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | \usepackage[utf8]{inputenc} 10 | --- 11 | 12 | Parsing server log files is a common task in server administration. 13 | [1](https://link.springer.com/article/10.1007/BF03325089),[2](https://stackoverflow.com/search?q=%22Apache+log%22) 14 | Historically R would not be well suited to this and it would be better 15 | performed using a scripting language such as perl. Rex, however, makes this 16 | easy to do and allows you to perform both the data cleaning and analysis in R! 17 | 18 | Common server logs consist of space separated fields. 19 | 20 | > 198.214.42.14 - - [21/Jul/1995:14:31:46 -0400] "GET /images/ HTTP/1.0" 200 17688 21 | 22 | > lahal.ksc.nasa.gov - - [24/Jul/1995:12:42:40 -0400] "GET /images/USA-logosmall.gif HTTP/1.0" 200 234 23 | 24 | The logs used in this vignette come from two months of all HTTP requests 25 | to the NASA Kennedy Space Center WWW server in Florida and are freely available 26 | for use. 
[3](https://web.archive.org/web/20181003084945/http://ita.ee.lbl.gov/html/contrib/NASA-HTTP.html)

```{r include = FALSE}
library(rex)
library(dplyr)
library(knitr)
library(ggplot2)
library(magrittr)
```

```{r warning = FALSE}
# Read the log with one line per element, then extract the request time and
# (when present) the requested file's extension into named columns.
parsed <- scan("NASA.txt", what = "character", sep = "\n") %>%
  re_matches(
    rex(

      # Get the time of the request
      "[",
        capture(name = "time",
          except_any_of("]")
        ),
      "]",

      space, double_quote, "GET", space,

      # Get the filetype of the request if requesting a file
      maybe(
        non_spaces, ".",
        capture(name = "filetype",
          except_some_of(space, ".", "?", double_quote)
        )
      )
    )
  ) %>%
  # Lower-case the extension and parse the timestamp text into a date-time
  mutate(filetype = tolower(filetype),
         time = as.POSIXct(time, format="%d/%b/%Y:%H:%M:%S %z"))
```

This gives us a nicely formatted data frame of the time and filetypes of the requests.
```{r echo = FALSE}
kable(head(parsed, n = 10))
```

We can also easily generate a histogram of the filetypes, or a plot of requests over time.
```{r FALSE, fig.show='hold', warning = FALSE, message = FALSE}
ggplot(na.omit(parsed), aes(x = filetype)) + stat_count()
ggplot(na.omit(parsed), aes(x = time)) + geom_histogram() + ggtitle("Requests over time")
```

--------------------------------------------------------------------------------
/vignettes/stackoverflow.R:
--------------------------------------------------------------------------------
#' ---
#' title: "Stackoverflow Usage Examples"
#' author: "Jim Hester"
#' date: "`r Sys.Date()`"
#' output: rmarkdown::html_vignette
#' vignette: >
#'   %\VignetteIndexEntry{Stackoverflow Usage Examples}
#'   %\VignetteEngine{knitr::rmarkdown}
#'   \usepackage[utf8]{inputenc}
#' ---

#+ cache=FALSE, include=FALSE
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
knitr::render_markdown(strict = TRUE)

#' ### [http://stackoverflow.com/questions/27106552][]

#' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler.

x <- c("a=1", "b=3", "a=9", "c=2", "b=4", "a=2")

#' First extract the names and values from the strings.

library(rex)
# One letter before "=", one digit after it, each in a named capture.
matches <- re_matches(
  x,
  rex(capture(name = "name", letter), "=", capture(name = "value", digit))
)
matches

#' Then tally the groups using `split()`.

# Numeric values grouped by the captured name.
groups <- with(matches, split(as.numeric(value), name))
groups

#' If we try to convert directly to a data.frame from `split()` the groups with
#' fewer members will have their members recycled rather than `NA`, so instead
#' explicitly fill with `NA`.
41 | 42 | largest_group <- max(lengths(groups)) 43 | largest_group 44 | 45 | groups <- lapply(groups, function(group) { 46 | if (length(group) < largest_group) { 47 | group[largest_group] <- NA 48 | } 49 | group 50 | }) 51 | groups 52 | 53 | #' Finally we can create the data.frame 54 | 55 | do.call("data.frame", groups) 56 | 57 | #' ### [http://stackoverflow.com/questions/14146362/][] ### 58 | 59 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler. 60 | 61 | mystrings <- c("X2/D2/F4", 62 | "X10/D9/F4", 63 | "X3/D22/F4", 64 | "X9/D22/F9") 65 | 66 | library(rex) 67 | matches <- re_matches(mystrings, 68 | rex( 69 | "/", 70 | any, 71 | capture(name = "numbers", digits) 72 | ) 73 | ) 74 | as.numeric(matches$numbers) 75 | 76 | #' ### [http://stackoverflow.com/questions/8613237/][] ## 77 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler. 78 | 79 | j <- "What kind of cheese isn't your cheese? (wonder) Nacho cheese! (groan) (Laugh)" 80 | 81 | library(rex) 82 | matches <- re_matches(j, 83 | rex( 84 | "(", 85 | capture(name = "text", except_any_of(")")), 86 | ")"), 87 | global = TRUE) 88 | matches 89 | 90 | #' ### [http://stackoverflow.com/questions/22976472][] 91 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler. 92 | 93 | txt <- "this is just a test! i'm not sure if this is O.K. or if it will work? who knows. regex is sorta new to me.. There are certain cases that I may not figure out?? sad! 
^_^" 94 | 95 | re <- rex( 96 | capture(name = "first_letter", alnum), 97 | capture(name = "sentence", 98 | any_non_puncts, 99 | zero_or_more( 100 | group( 101 | punct %if_next_isnt% space, 102 | any_non_puncts 103 | ) 104 | ), 105 | maybe(punct) 106 | ) 107 | ) 108 | 109 | re_substitutes(txt, re, "\\U\\1\\E\\2", global = TRUE) 110 | 111 | #' ### [http://stackoverflow.com/questions/27172007][] 112 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler. 113 | 114 | x <- data.frame( 115 | locationid = c( 116 | 1073744023, 117 | 1073744022, 118 | 1073744025, 119 | 1073744024, 120 | 1073744021, 121 | 1073744026 122 | ), 123 | address = c( 124 | "525 East 68th Street, New York, NY 10065, USA", 125 | "270 Park Avenue, New York, NY 10017, USA", 126 | "Rockefeller Center, 50 Rockefeller Plaza, New York, NY 10020, USA", 127 | "1251 Avenue of the Americas, New York, NY 10020, USA", 128 | "1301 Avenue of the Americas, New York, NY 10019, USA", 129 | "44 West 45th Street, New York, NY 10036, USA" 130 | )) 131 | 132 | library(rex) 133 | 134 | sep <- rex(",", spaces) 135 | 136 | re <- 137 | rex( 138 | capture(name = "address", 139 | except_some_of(",") 140 | ), 141 | sep, 142 | capture(name = "city", 143 | except_some_of(",") 144 | ), 145 | sep, 146 | capture(name = "state", 147 | uppers 148 | ), 149 | spaces, 150 | capture(name = "zip", 151 | some_of(digit, "-") 152 | ), 153 | sep, 154 | capture(name = "country", 155 | something 156 | )) 157 | 158 | re_matches(x$address, re) 159 | 160 | #' ### [http://stackoverflow.com/questions/27155297/][] 161 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler. 162 | 163 | library(rex) 164 | x <- 165 | "https://support.google.com/blogger/topic/12457 166 | https://support.google.com/blogger/topic/12457. 
167 | https://support.google.com/blogger/topic/12457] 168 | <> 169 | https://support.google.com/blogger/topic/12457, 170 | https://support.google.com/blogger/topic/12457), 171 | xxxxxxhttps://support.google.com/blogger/topic/12457),hhhththta" 172 | 173 | re <- rex( 174 | capture(name = "url", 175 | "https://support.google.com/blogger/topic/", 176 | digits 177 | )) 178 | 179 | re_matches(x, re, global = TRUE)[[1]] 180 | 181 | #' ### [http://stackoverflow.com/questions/27219421][] 182 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler. 183 | tmp <- c("Little Street","A323", "Essex Road (A43)", "M43", "Orange street", "M4", "B2045", "New Street") 184 | 185 | library(rex) 186 | classify_road <- function(x) { 187 | res <- re_matches(x, 188 | rex( 189 | capture(name = "type", 190 | upper 191 | ), 192 | digit 193 | ) 194 | ) 195 | 196 | res$type[ is.na(res$type) ] <- "Minor" 197 | paste(res$type, "Road") 198 | } 199 | 200 | classify_road(tmp) 201 | 202 | #' ### [http://stackoverflow.com/questions/22247410][] 203 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler. 204 | 205 | x <- "this is a multiline text 206 | some more test here 207 | before we get to the good stuff 208 | \\end{figure}" 209 | 210 | re <- rex("\\end{figure}") 211 | re_matches(x, re) 212 | 213 | regexpr(re, x, perl = TRUE) 214 | 215 | #' ### [http://stackoverflow.com/questions/23447261][] 216 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler. 217 | 218 | x <- structure(list(text = structure(c(4L, 6L, 1L, 2L, 5L, 3L), .Label = c("ãããæããããéãããæãããInappropriate announce:-(", 219 | "@AirAsia your direct debit (Maybank) payment gateways is not working. Is it something you are working to fix?", 220 | "@AirAsia Apart from the slight delay and shortage of food on our way back from Phuket, both flights were very smooth. 
Kudos :)", 221 | "RT @AirAsia: ØØÙØÙÙÙÙ ÙØØØ ØØØÙ ÙØØØØÙ ØØØØÙÙÙí í Now you can enjoy a #great :D breakfast onboard with our new breakfast meals! :D", 222 | "xdek ke flight @AirAsia Malaysia to LA... hahah..:p bagi la promo murah2 sikit, kompom aku beli...", 223 | "You know there is a problem when customer service asks you to wait for 103 minutes and your no is 42 in the queue. X-(" 224 | ), class = "factor"), created = structure(c(5L, 4L, 4L, 3L, 2L, 225 | 1L), .Label = c("1/2/2014 16:14", "1/2/2014 17:00", "3/2/2014 0:54", 226 | "3/2/2014 0:58", "3/2/2014 1:28"), class = "factor")), .Names = c("text", 227 | "created"), class = "data.frame", row.names = c(NA, -6L)) 228 | 229 | emots <- as.character(outer(c(":", ";", ":-", ";-"), c(")", "(", "]", "[", "D", "o", "O", "P", "p"), paste0)) 230 | 231 | re_matches(x$text, 232 | rex( 233 | capture(name = "emoticons", 234 | or(emots) 235 | ) 236 | ), 237 | global = TRUE) 238 | 239 | #' ### [http://stackoverflow.com/questions/27234040][] 240 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler. 241 | 242 | z <- " 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 |
MESA HIGH VICTORIES
TeamScore
Parkfield High Demons28 to 21
Burns High Badgers14 to 13
" 255 | 256 | re_matches(z, 257 | rex( 258 | capture(name="table", 259 | " ### 348 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler. 349 | df <- structure(list(Object = c("T00055", "T00055", "E00336", "E00336", 350 | "E00336", "E00336", "T 00054"), Coding = c("T 00055_005_<002_+", 351 | "T 00055_008_<002_+", "E 00336_041_<001_+001_+", "E 00336_041_<001_+001_+001_+", 352 | "E 00336_041_<001_+001_+002_+", "E 00336_041_<001_+001_+002_<", 353 | "T 00054_013_<003_<015_+003_<001_<"), Fn = c(2L, 2L, 3L, 4L, 354 | 4L, 4L, 4L), Remaining = c(30L, 30L, 0L, 10L, 56L, 52L, 52L)), .Names = c("Object", 355 | "Coding", "Fn", "Remaining"), row.names = c(NA, -7L), class = "data.frame") 356 | 357 | subset(df, grepl(rex(at_least(group("_+", anything), 2)), Coding)) 358 | 359 | #' ### 360 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler. 361 | 362 | ids <- c("367025001", "CT_341796001", "M13X01692-01", "13C025050901", "13C00699551") 363 | 364 | re_substitutes(ids, 365 | rex(non_digits %or% list("01", end)), 366 | "", 367 | global = TRUE) 368 | 369 | #' ### 370 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler. 371 | library("rvest") 372 | library("stringr") 373 | 374 | minimal <- html("blah

 foo") 375 | 376 | bodytext <- minimal %>% 377 | html_node("body") %>% 378 | html_text() 379 | 380 | re_substitutes(bodytext, rex(spaces), "", global = TRUE) 381 | 382 | #' ### 383 | #+ message=FALSE 384 | string <- "this\\(system) {is} [full]." 385 | library(Hmisc) 386 | gsub("\\\\(.)", "\\1", escapeRegex(string)) 387 | 388 | #' Alternatively [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler. 389 | library(rex) 390 | re_substitutes(escape(string), rex("\\", capture(any)), "\\1", global = TRUE) 391 | 392 | #' ### 393 | #' [rex](http://cran.r-project.org/web/packages/rex/) has a [vignette for parsing server logs](http://cran.r-project.org/web/packages/rex/vignettes/log_parsing.html). While the format is not exactly the same as your log you should be able to adapt it to your case fairly easily. 394 | #' As far as reading the log in assuming the file fits in memory your best bet is to read the whole file first with `readLines()`, then the following will put each field into a `data.frame` column. 
x <- "Feb 6 12:14:14 localhost haproxy[14389]: 10.0.1.2:33317 [06/Feb/2009:12:14:14.655] http-in static/srv1 10/0/30/69/109 200 2750 - - ---- 1/1/1/1/0 0/0 {1wt.eu} {} \"GET /index.html HTTP/1.1\""
library(rex)
# Pattern for one haproxy HTTP log line; every field gets its own named
# capture so that re_matches() returns one column per field.
re <- rex(

  # `alphas` (one or more letters) rather than `alpha` (exactly one letter):
  # the pattern is unanchored, so with `alpha` the capture would hold only the
  # single letter immediately before "[" instead of the whole process name.
  capture(name = "process_name", alphas),
  "[",
  capture(name = "pid", digits),
  "]:",
  spaces,
  capture(name = "client_ip", any_of(digit, ".")),
  ":",
  capture(name = "client_port", digits),
  spaces,
  "[",
  capture(name = "accept_date", except_some_of("]")),
  "]",
  spaces,
  capture(name = "frontend_name", non_spaces),
  spaces,
  capture(name = "backend_name", except_some_of("/")),
  "/",
  capture(name = "server_name", non_spaces),
  spaces,
  capture(name = "Tq", some_of("-", digit)),
  "/",
  capture(name = "Tw", some_of("-", digit)),
  "/",
  capture(name = "Tc", some_of("-", digit)),
  "/",
  capture(name = "Tr", some_of("-", digit)),
  "/",
  capture(name = "Tt", some_of("+", digit)),
  spaces,
  capture(name = "status_code", digits),
  spaces,
  capture(name = "bytes_read", some_of("+", digit)),
  spaces,
  capture(name = "captured_request_cookie", non_spaces),
  spaces,
  capture(name = "captured_response_cookie", non_spaces),
  spaces,
  capture(name = "termination_state", non_spaces),
  spaces,
  capture(name = "actconn", digits),
  "/",
  capture(name = "feconn", digits),
  "/",
  capture(name = "beconn", digits),
  "/",
  capture(name = "srv_conn", digits),
  "/",
  capture(name = "retries", some_of("+", digit)),
  spaces,
  capture(name = "srv_queue", digits),
  "/",
  capture(name = "backend_queue", digits),
  spaces,
  "{",
  capture(name = "captured_request_headers", except_any_of("}")),
  "}",
  spaces,
  "{",
  capture(name = "captured_response_headers", except_any_of("}")),
  "}",
  spaces,
  double_quote,
  capture(name = "http_request", non_quotes),
  double_quote)

re_matches(x, re)

#' ###
#' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler.

my.data <- read.table(text = "
my.string state
......... A
1........ B
112...... C
11111.... D
1111113.. E
111111111 F
111111111 G
", header = TRUE, stringsAsFactors = FALSE)

library(rex)

# Unnamed capture, so the result column is called "1".
re_matches(my.data$my.string,
  rex(capture(except(".")), "."))$"1"

#' ###
#' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler.
string <- "Shakira - Wolf - 02.Hips don't lie.mp3"

library(rex)
# The lazy quantifier stops the capture at the first run of spaces followed by "-".
re_matches(string,
  rex(capture(zero_or_more(any, type="lazy")), spaces, "-"))$"1"

#' ###
#' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler.

string <- "I t is tim e to g o"
library(rex)
# Remove a space when it is followed by a single non-space character that is
# itself followed either by a space and at least two non-spaces, or by the
# end of the string.
re_substitutes(string, rex(
  space %if_next_is%
    list(
      list(non_space, space, at_least(non_space, 2)) %or%
      list(non_space, end)
    )
  ), "", global = TRUE)

#' ###
#' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler.
508 | 509 | string <- "01:04:43.064 [12439] <2> xyz 510 | 01:04:43.067 [12439] <2> a lmn 511 | 01:04:43.068 [12439] <4> j klm 512 | x_times_wait to <3000> 513 | 01:04:43.068 [12439] <4> j klm 514 | enter_object <5000> main k" 515 | 516 | library(rex) 517 | 518 | timestamp <- rex(n(digit, 2), ":", n(digit, 2), ":", n(digit, 2), ".", n(digit, 3)) 519 | 520 | re <- rex(timestamp, space, 521 | "[", digits, "]", space, 522 | "<", digits, ">", space, 523 | capture(anything)) 524 | 525 | re_matches(string, re, global = TRUE) 526 | -------------------------------------------------------------------------------- /vignettes/url_parsing.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "URL Validation" 3 | author: "Jim Hester" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{URL Validation} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | \usepackage[utf8]{inputenc} 10 | --- 11 | 12 | Consider the task of correctly [validating a URL](https://mathiasbynens.be/demo/url-regex). 13 | From that page two conclusions can be made. 14 | 15 | 1. Validating URLs requires complex regular expressions. 16 | 2. Creating a correct regular expression is hard! (only 1 out of 13 regexes were valid for all cases). 17 | 18 | Because of this one may be tempted to simply copy the best regex you can find ([gist](https://gist.github.com/dperini/729294)). 19 | 20 | The problem with this is that while you can copy it now, what happens later when you find a case that is not handled correctly? Can you correctly interpret and modify this? 
```{r url_parsing_stock, eval=FALSE}
"^(?:(?:http(?:s)?|ftp)://)(?:\\S+(?::(?:\\S)*)?@)?(?:(?:[a-z0-9\u00a1-\uffff](?:-)*)*(?:[a-z0-9\u00a1-\uffff])+)(?:\\.(?:[a-z0-9\u00a1-\uffff](?:-)*)*(?:[a-z0-9\u00a1-\uffff])+)*(?:\\.(?:[a-z0-9\u00a1-\uffff]){2,})(?::(?:\\d){2,5})?(?:/(?:\\S)*)?$"
```

However if you re-create the regex with `rex` it is much easier to understand and modify later if needed.
```{r url_parsing_url}
library(rex)
library(magrittr)

# one or more characters that are none of ".", "/", " " or "-"
valid_chars <- rex(except_some_of(".", "/", " ", "-"))

re <- rex(
  start,

  # protocol identifier (required: http, https or ftp) + ://
  group(list("http", maybe("s")) %or% "ftp", "://"),

  # user:pass authentication (optional)
  maybe(non_spaces,
    maybe(":", zero_or_more(non_space)),
    "@"),

  # host name
  group(zero_or_more(valid_chars, zero_or_more("-")), one_or_more(valid_chars)),

  # domain name
  zero_or_more(".", zero_or_more(valid_chars, zero_or_more("-")), one_or_more(valid_chars)),

  # TLD identifier
  group(".", valid_chars %>% at_least(2)),

  # server port number (optional, 2 to 5 digits)
  maybe(":", digit %>% between(2, 5)),

  # resource path (optional)
  maybe("/", non_space %>% zero_or_more()),

  end
)
```

We can then validate that it correctly identifies both good and bad URLs.
(_IP address validation removed_)

```{r url_parsing_validate}
# URLs the pattern is expected to accept
good <- c("http://foo.com/blah_blah",
  "http://foo.com/blah_blah/",
  "http://foo.com/blah_blah_(wikipedia)",
  "http://foo.com/blah_blah_(wikipedia)_(again)",
  "http://www.example.com/wpstyle/?p=364",
  "https://www.example.com/foo/?bar=baz&inga=42&quux",
  "http://✪df.ws/123",
  "http://userid:password@example.com:8080",
  "http://userid:password@example.com:8080/",
  "http://userid@example.com",
  "http://userid@example.com/",
  "http://userid@example.com:8080",
  "http://userid@example.com:8080/",
  "http://userid:password@example.com",
  "http://userid:password@example.com/",
  "http://➡.ws/䨹",
  "http://⌘.ws",
  "http://⌘.ws/",
  "http://foo.com/blah_(wikipedia)#cite-1",
  "http://foo.com/blah_(wikipedia)_blah#cite-1",
  "http://foo.com/unicode_(✪)_in_parens",
  "http://foo.com/(something)?after=parens",
  "http://☺.damowmow.com/",
  "http://code.google.com/events/#&product=browser",
  "http://j.mp",
  "ftp://foo.bar/baz",
  "http://foo.bar/?q=Test%20URL-encoded%20stuff",
  "http://مثال.إختبار",
  "http://例子.测试",
  "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com",
  "http://1337.net",
  "http://a.b-c.de",
  "http://223.255.255.254")

# URLs the pattern is expected to reject
bad <- c(
  "http://",
  "http://.",
  "http://..",
  "http://../",
  "http://?",
  "http://??",
  "http://??/",
  "http://#",
  "http://##",
  "http://##/",
  "http://foo.bar?q=Spaces should be encoded",
  "//",
  "//a",
  "///a",
  "///",
  "http:///a",
  "foo.com",
  "rdar://1234",
  "h://test",
  "http:// shouldfail.com",
  ":// should fail",
  "http://foo.bar/foo(bar)baz quux",
  "ftps://foo.bar/",
  "http://-error-.invalid/",
  "http://-a.b.co",
  "http://a.b-.co",
  "http://0.0.0.0",
  "http://3628126748",
  "http://.www.foo.bar/",
  "http://www.foo.bar./",
  "http://.www.foo.bar./")

# grepl() already returns a logical vector, so no comparison with TRUE/FALSE
# is needed: every good URL must match ...
all(grepl(re, good))

# ... and no bad URL may match.
!any(grepl(re, bad))
```

You can now see the power and expressiveness of building regular expressions with `rex`!
--------------------------------------------------------------------------------