├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── NEWS.md
├── R
    └── core.R
├── README.Rmd
├── README.md
└── man
    ├── figures
        ├── README-unnamed-chunk-6-1.png
        ├── logo.png
        └── white.png
    └── str_capture.Rd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^README.Rmd$
4 | ^README.md$
5 | ^working$
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | .Rhistory
 3 | *.Rproj
 4 | .Rproj.user
 5 | *.swp
 6 | *.txt
 7 | inst/doc
 8 | doc
 9 | Meta
10 | pkgdown
11 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: fugly
 2 | Type: Package
 3 | Title: Extract Named Substrings Using Regular Expressions
 4 | Version: 0.1.0
 5 | Author: mikefc
 6 | Maintainer: mikefc <mikefc@coolbutuseless.com>
 7 | Description: Extract named substrings using named capture groups in regular expressions.
 8 | License: MIT + file LICENSE
 9 | Encoding: UTF-8
10 | LazyData: true
11 | RoxygenNote: 7.1.1
12 | Imports:
13 |     stringr
14 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2021 mikefc@coolbutuseless.com
 2 | 
 3 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 |     The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | export(str_capture)
4 | import(stringr)
5 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # fugly 0.1.0  2021-03-19
2 | 
3 | * Initial release
4 | 


--------------------------------------------------------------------------------
/R/core.R:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 4 | #' Named capture groups
 5 | #'
 6 | #' @param string input character vector
 7 | #' @param pattern a regex using named capture groups as used in \code{glue} and
 8 | #'        \code{unglue}
 9 | #' @param delim delimiters of the named capture groups. Note: Very litte sanity
10 | #'        checking is done here. You'll want to be able to guarantee that these
11 | #'        delims do not appear in your actual string input otherwise things
12 | #'        will not go as you want. Caveat Emptor!
13 | #'
14 | #' @return data.frame of captured groups
15 | #'
16 | #' @import stringr
17 | #' @export
18 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
19 | str_capture <- function(string, pattern, delim = c('{', '}')) {
20 | 
21 |   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
22 |   # Assert delim is sane
23 |   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 |   stopifnot(length(delim) == 2L)
25 |   stopifnot(delim[1L] != delim[2L])
26 |   stopifnot(all(nchar(delim) == 1L))
27 | 
28 |   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
29 |   # This is how the capture patterns will be extracted e.g. "<.*?>"
30 |   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31 |   capture_pattern <- paste0("(\\", delim[1L], ".*?\\", delim[2L], ")")
32 | 
33 |   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 |   # Extract the captured names + regexs from the user-supplied patterns
35 |   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
36 |   capture_groups <- stringr::str_match_all(pattern, capture_pattern)[[1L]][,-1L]
37 |   capture_groups <- stringr::str_sub(capture_groups, start = 2L,
38 |                                     end = stringr::str_length(capture_groups) - 1L)
39 | 
40 |   capture_groups <- stringr::str_split_fixed(capture_groups, '=', 2)
41 | 
42 |   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
43 |   # Sanity check the names
44 |   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
45 |   capture_names <- capture_groups[,1L]
46 |   stopifnot(length(capture_names) > 0L)
47 |   stopifnot(!anyNA(capture_names))
48 |   stopifnot(all(nchar(capture_names) > 0L))
49 |   stopifnot(!anyDuplicated(capture_names))
50 | 
51 | 
52 |   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
53 |   # massage the regexes. pay attention to backslashes
54 |   # if no regex supplied, use ".*?"
55 |   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
56 |   capture_regex <- capture_groups[,2L]
57 |   capture_regex[capture_regex == ''] <- '.*?'
58 |   capture_regex <- paste0("(", capture_regex, ")")
59 |   capture_regex <- stringr::str_replace_all(capture_regex, "\\\\", "\\\\\\\\")
60 | 
61 | 
62 |   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
63 |   # Replace the users capture pattern with just the user-supplied regex for
64 |   # this capture group
65 |   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
66 |   final_pattern <- pattern
67 |   for (new_regex in capture_regex) {
68 |     final_pattern <- stringr::str_replace(final_pattern, capture_pattern, new_regex)
69 |   }
70 | 
71 |   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
72 |   # Run the regex and create a data.frame result
73 |   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
74 |   res <- stringr::str_match(string, final_pattern)[,-1L, drop = FALSE]
75 |   res <- as.data.frame(res)
76 |   names(res) <- capture_names
77 | 
78 | 
79 |   res
80 | }
81 | 
82 | 
# Interactive scratch example for str_capture(). Wrapped in `if (FALSE)` so it
# is never executed when the file is sourced or the package is loaded.
if (FALSE) {
  string <- c(
    "information: name:greg age:27 ",
    "information: name:mary age:34 "
  )

  # Uses the default '{' / '}' delimiters, so no `delim` argument is needed.
  pattern <- "name:{name} age:{age=\\d+}"

  str_capture(string, pattern)
}
99 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: github_document
  3 | ---
  4 | 
  5 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  6 | 
  7 | ```{r, include = FALSE}
  8 | knitr::opts_chunk$set(
  9 |   collapse = FALSE,
 10 |   comment = "#>",
 11 |   fig.path = "man/figures/README-",
 12 |   out.width = "100%"
 13 | )
 14 | 
 15 | library(fugly)
 16 | ```
 17 | 
 18 | ```{r echo = FALSE, eval = FALSE}
 19 | # Quick logo generation. Borrowed heavily from Nick Tierney's Syn logo process
 20 | library(magick)
 21 | library(showtext)
 22 | font_add_google("Allura", "gf")
 23 | 
 24 | 
 25 | # pkgdown::build_site(override = list(destination = "../coolbutuseless.github.io/package/minipdf"))
 26 | ```
 27 | 
 28 | ```{r echo = FALSE, eval = FALSE}
 29 | img <- image_read("man/figures/white.png") #%>%
 30 |   # image_transparent(color = "#f9fafb", fuzz = 10) %>%
 31 |   # image_trim() %>%
 32 |   # image_threshold()
 33 | 
 34 | 
 35 | hexSticker::sticker(subplot  = img,
 36 |                     s_x      = 0.92,
 37 |                     s_y      = 1.2,
 38 |                     s_width  = 1.5,
 39 |                     s_height = 0.95,
 40 |                     package  = "fugly",
 41 |                     p_x      = 1,
 42 |                     p_y      = 1.15,
 43 |                     p_color  = "#223344",
 44 |                     p_family = "gf",
 45 |                     p_size   = 23,
 46 |                     h_size   = 1.2,
 47 |                     h_fill   = "#ffffff",
 48 |                     h_color  = "#223344",
 49 |                     filename = "man/figures/logo.png")
 50 | 
 51 | image_read("man/figures/logo.png")
 52 | ```
 53 | 
 54 | # fugly <img src="man/figures/logo.png" align="right" height="230"/>
 55 | 
 56 | <!-- badges: start -->
 57 | 
 58 | ![](https://img.shields.io/badge/cool-useless-green.svg)
 59 | 
 60 | <!-- badges: end -->
 61 | 
 62 | This package provides a single function (`str_capture`) for using named capture 
 63 | groups to extract values from strings. A key requirement for readability is that 
 64 | the names of the capture groups are specified inline as part of the regex, 
 65 | and not in an external vector or as separate names.
 66 | 
 67 | `fugly::str_capture()` is implemented as a wrapper around 
 68 | [stringr](https://cran.r-project.org/package=stringr). This is because `stringr` 
 69 | itself does not yet do named capture groups (See issues for 
 70 | [stringr](https://github.com/tidyverse/stringr/issues/71) and 
 71 | [stringi](https://github.com/gagolews/stringi/issues/153)).
 72 | 
 73 | `fugly::str_capture()` is very similar to a number of existing packages. See
 74 | table below for a comparison.
 75 | 
 76 | | Method                      | Speed    | Inline capture group naming | robust |
 77 | |-----------------------------|----------|-----------------------------|--------|
 78 | | `fugly::str_capture`        | Fast     | Yes                         | No     |
 79 | | `rr4r::rr4r_extract_groups` | Fast     | Yes                         | Yes    |
 80 | | `nc::capture_first_vec`     | Fast     | No                          | Yes    |
 81 | | `tidyr::extract`            | Fast     | No                          | Yes    |
 82 | | `utils::strcapture`         | Middling | No                          | Yes    |
 83 | | `unglue::unglue`            | Slow     | Yes                         | Yes    |
 84 | | `ore::ore_search`           | Slow     | Yes                         | Yes    |
 85 | 
 86 | ### What do I mean when I say `fugly::str_capture()` is unsafe/dodgy/non-robust?
 87 | 
 88 | -   It doesn't adhere to standard regular expression syntax for named capture groups as used in perl, python etc.
 89 | 
 90 | -   It doesn't really adhere to `glue` syntax (although it looks similar at a surface level).
 91 | 
 92 | -   If you specify delimiters which appear in your string input, then you're going to have a bad time.
 93 | 
 94 | -   It's generally only been tested on data which is:
 95 | 
 96 |     -   highly structured
 97 |     -   only ASCII
 98 |     -   non-pathological
 99 | 
100 | ### What's in the box?
101 | 
102 | -   `fugly::str_capture(string, pattern, delim)`
103 | 
104 |     -   capture named groups with regular expressions
105 |     -   returns a data.frame with all columns containing character strings
106 |     -   can mix-and-match with non-capturing regular expressions
107 |     -   if no regular expression specified for a named group then `.*?` is used.
108 |     -   does not do any type guessing/conversion.
109 | 
110 | ## Installation
111 | 
112 | You can install from [GitHub](https://github.com/coolbutuseless/fugly) with:
113 | 
114 | ``` r
115 | # install.packages('remotes')
116 | remotes::install_github('coolbutuseless/fugly')
117 | ```
118 | 
119 | ## Example 1
120 | 
121 | In the following example:
122 | 
123 | -   Input consists of multiple strings
124 | -   capture groups are delimited by `{}` by default.
125 | -   the regex for the capture group for `name` is unspecified, so `.*?` will be used
126 | -   the regex for the capture group for `age` is `\d+` i.e. match must consist of 1-or-more digits
127 | 
128 | ```{r example}
129 | library(fugly)
130 | 
131 | string <- c(
132 |   "information: Name:greg Age:27 ",
133 |   "information: Name:mary Age:34 "
134 | )
135 | 
136 | str_capture(string, pattern = "Name:{name} Age:{age=\\d+}")
137 | ```
138 | 
139 | ## Example 2
140 | 
141 | A more complicated example:
142 | 
143 | -   Note the mixture of capturing groups and a bare `.*?` in the pattern which is not returned as a result
144 | 
145 | ```{r}
146 | string <- c(
147 | '{"type":"Feature","properties":{"hash":"1348778913c0224a","number":"27","street":"BANAMBILA STREET","unit":"","city":"ARANDA","district":"","region":"ACT","postcode":"2614","id":"GAACT714851647"},"geometry":{"type":"Point","coordinates":[149.0826143,-35.2545558]}}',
148 | '{"type":"Feature","properties":{"hash":"dc776871c868bc7e","number":"139","street":"BOUVERIE STREET","unit":"UNIT 711","city":"CARLTON","district":"","region":"VIC","postcode":"3053","id":"GAVIC423944917"},"geometry":{"type":"Point","coordinates":[144.9617149,-37.8032551]}}',
149 | '{"type":"Feature","properties":{"hash":"8197f34a40ccad47","number":"6","street":"MOGRIDGE STREET","unit":"","city":"WARWICK","district":"","region":"QLD","postcode":"4370","id":"GAQLD155949502"},"geometry":{"type":"Point","coordinates":[152.0230999,-28.2230133]}}',
150 | '{"type":"Feature","properties":{"hash":"18edc96308fc1a8e","number":"22","street":"ORR STREET","unit":"UNIT 507","city":"CARLTON","district":"","region":"VIC","postcode":"3053","id":"GAVIC424282716"},"geometry":{"type":"Point","coordinates":[144.9653484,-37.8063371]}}'
151 | )
152 | 
153 | 
154 | str_capture(string, pattern = '"number":"{number}","street":"{street}".*?"coordinates":\\[{coords}\\]')
155 | 
156 | ```
157 | 
158 | ## Simple Benchmark
159 | 
160 | I acknowledge that this isn't the greatest benchmark, but it is relevant to my current use-case.
161 | 
162 | -   [nc](https://github.com/tdhock/nc) with the PCRE regex engine is the fastest named capture I could find in R.
163 | 
164 |     -   However - I'm not a huge fan of its syntax
165 | 
166 | -   For large inputs (1000+ input strings), `fugly` is significantly faster than `unglue`, `utils::strcapture` and `ore`
167 | 
168 | -   The rust regex engine [rr4r](https://github.com/yutannihilation/rr4r) is slightly faster than `fugly`
169 | 
170 | -   `unglue` is the slowest of the methods.
171 | 
172 | -   `ore` lies somewhere between `unglue` and `utils::strcapture`
173 | 
174 | -   As pointed out by [Michael Barrowman](https://twitter.com/MyKo101AB), `tidyr::extract()` will also do named capture into a data.frame.
175 | 
176 |     -   Similar to `utils::strcapture()`, the names are not specified inline with the regex, but are listed separately.
177 | 
178 | ```{r warning=FALSE, message=FALSE}
179 | # remotes::install_github("jonclayden/ore")
180 | # remotes::install_github("yutannihilation/rr4r")
181 | # remotes::install_github('qinwf/re2r') 
182 | library(ore)
183 | library(rr4r)
184 | library(unglue)
185 | library(ggplot2)
186 | library(tidyr)
187 | 
188 | # meaningless strings for benchmarking
189 | N <- 1000
190 | string <- paste0("Information name:greg age:", seq(N))
191 | 
192 | 
193 | res <- bench::mark(
194 |   `fugly::str_capture()` = fugly::str_capture(string, "name:{name} age:{age=\\d+}"),
195 |   `unglue::unglue()` = unglue::unglue_data(string, "Information name:{name} age:{age=\\d+}"),
196 |   `utils::strcapture()` = utils::strcapture("Information name:(.*?) age:(\\d+)", string, 
197 |                     proto = data.frame(name=character(), age=character())),
198 |   `ore::ore_search()` = do.call(rbind.data.frame, lapply(ore_search(ore('name:(?<name>.*?) age:(?<age>\\d+)', encoding='utf8'), string, all=TRUE), function(x) {x$groups$matches})),
199 |    `rr4r::rr4r_extract_groups()` = rr4r::rr4r_extract_groups(string, "name:(?P<name>.*?) age:(?P<age>\\d+)"),
200 |   `nc::capture_first_vec() PCRE` = nc::capture_first_vec(string, "Information name:", name=".*?", " age:", age="\\d+", engine = 'PCRE'),
201 |   `tidyr::extract()` = tidyr::extract(data.frame(x = string), x, into = c('name', 'age'), regex = 'name:(.*?) age:(\\d+)'),
202 |   check = FALSE
203 | )
204 | ```
205 | 
206 | ```{r echo=FALSE}
207 | plot(res) + 
208 |   theme_bw() + 
209 |   theme(legend.position = 'bottom')
210 | ```
211 | 
212 | ## Related Software
213 | 
214 | -   [stringr](https://cran.r-project.org/package=stringr)
215 | -   `utils::strcapture()`
216 | -   [unglue::unglue()](https://cran.r-project.org/web/packages/unglue/index.html)
217 | -   [ore](https://github.com/jonclayden/ore), [ore on CRAN](https://cran.r-project.org/package=ore)
218 | -   [namedCapture](https://cran.r-project.org/web/packages/namedCapture/index.html) Note: I couldn't get this to work sanely.
219 | -   [rr4r](https://github.com/yutannihilation/rr4r) rust regex engine
220 | -   [nc](https://github.com/tdhock/nc)
221 | 
222 | ## Acknowledgements
223 | 
224 | -   R Core for developing and maintaining the language.
225 | -   CRAN maintainers, for patiently shepherding packages onto CRAN and maintaining the repository
226 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  3 | 
  4 | # fugly <img src="man/figures/logo.png" align="right" height="230"/>
  5 | 
  6 | <!-- badges: start -->
  7 | 
  8 | ![](https://img.shields.io/badge/cool-useless-green.svg)
  9 | 
 10 | <!-- badges: end -->
 11 | 
 12 | This package provides a single function (`str_capture`) for using named
 13 | capture groups to extract values from strings. A key requirement for
 14 | readability is that the names of the capture groups are specified inline
 15 | as part of the regex, and not in an external vector or as separate
 16 | names.
 17 | 
 18 | `fugly::str_capture()` is implemented as a wrapper around
 19 | [stringr](https://cran.r-project.org/package=stringr). This is because
 20 | `stringr` itself does not yet do named capture groups (See issues for
 21 | [stringr](https://github.com/tidyverse/stringr/issues/71) and
 22 | [stringi](https://github.com/gagolews/stringi/issues/153)).
 23 | 
 24 | `fugly::str_capture()` is very similar to a number of existing packages.
 25 | See table below for a comparison.
 26 | 
 27 | | Method                      | Speed    | Inline capture group naming | robust |
 28 | |-----------------------------|----------|-----------------------------|--------|
 29 | | `fugly::str_capture`        | Fast     | Yes                         | No     |
 30 | | `rr4r::rr4r_extract_groups` | Fast     | Yes                         | Yes    |
 31 | | `nc::capture_first_vec`     | Fast     | No                          | Yes    |
 32 | | `tidyr::extract`            | Fast     | No                          | Yes    |
 33 | | `utils::strcapture`         | Middling | No                          | Yes    |
 34 | | `unglue::unglue`            | Slow     | Yes                         | Yes    |
 35 | | `ore::ore_search`           | Slow     | Yes                         | Yes    |
 36 | 
 37 | ### What do I mean when I say `fugly::str_capture()` is unsafe/dodgy/non-robust?
 38 | 
 39 | -   It doesn’t adhere to standard regular expression syntax for named
 40 |     capture groups as used in perl, python etc.
 41 | 
 42 | -   It doesn’t really adhere to `glue` syntax (although it looks similar
 43 |     at a surface level).
 44 | 
 45 | -   If you specify delimiters which appear in your string input, then
 46 |     you’re going to have a bad time.
 47 | 
 48 | -   It’s generally only been tested on data which is:
 49 | 
 50 |     -   highly structured
 51 |     -   only ASCII
 52 |     -   non-pathological
 53 | 
 54 | ### What’s in the box?
 55 | 
 56 | -   `fugly::str_capture(string, pattern, delim)`
 57 | 
 58 |     -   capture named groups with regular expressions
 59 |     -   returns a data.frame with all columns containing character
 60 |         strings
 61 |     -   can mix-and-match with non-capturing regular expressions
 62 |     -   if no regular expression specified for a named group then `.*?`
 63 |         is used.
 64 |     -   does not do any type guessing/conversion.
 65 | 
 66 | ## Installation
 67 | 
 68 | You can install from [GitHub](https://github.com/coolbutuseless/fugly)
 69 | with:
 70 | 
 71 | ``` r
 72 | # install.packages('remotes')
 73 | remotes::install_github('coolbutuseless/fugly')
 74 | ```
 75 | 
 76 | ## Example 1
 77 | 
 78 | In the following example:
 79 | 
 80 | -   Input consists of multiple strings
 81 | -   capture groups are delimited by `{}` by default.
 82 | -   the regex for the capture group for `name` is unspecified, so `.*?`
 83 |     will be used
 84 | -   the regex for the capture group for `age` is `\d+` i.e. match must
 85 |     consist of 1-or-more digits
 86 | 
 87 | ``` r
 88 | library(fugly)
 89 | 
 90 | string <- c(
 91 |   "information: Name:greg Age:27 ",
 92 |   "information: Name:mary Age:34 "
 93 | )
 94 | 
 95 | str_capture(string, pattern = "Name:{name} Age:{age=\\d+}")
 96 | ```
 97 | 
 98 |     #>   name age
 99 |     #> 1 greg  27
100 |     #> 2 mary  34
101 | 
102 | ## Example 2
103 | 
104 | A more complicated example:
105 | 
106 | -   Note the mixture of capturing groups and a bare `.*?` in the pattern
107 |     which is not returned as a result
108 | 
109 | ``` r
110 | string <- c(
111 | '{"type":"Feature","properties":{"hash":"1348778913c0224a","number":"27","street":"BANAMBILA STREET","unit":"","city":"ARANDA","district":"","region":"ACT","postcode":"2614","id":"GAACT714851647"},"geometry":{"type":"Point","coordinates":[149.0826143,-35.2545558]}}',
112 | '{"type":"Feature","properties":{"hash":"dc776871c868bc7e","number":"139","street":"BOUVERIE STREET","unit":"UNIT 711","city":"CARLTON","district":"","region":"VIC","postcode":"3053","id":"GAVIC423944917"},"geometry":{"type":"Point","coordinates":[144.9617149,-37.8032551]}}',
113 | '{"type":"Feature","properties":{"hash":"8197f34a40ccad47","number":"6","street":"MOGRIDGE STREET","unit":"","city":"WARWICK","district":"","region":"QLD","postcode":"4370","id":"GAQLD155949502"},"geometry":{"type":"Point","coordinates":[152.0230999,-28.2230133]}}',
114 | '{"type":"Feature","properties":{"hash":"18edc96308fc1a8e","number":"22","street":"ORR STREET","unit":"UNIT 507","city":"CARLTON","district":"","region":"VIC","postcode":"3053","id":"GAVIC424282716"},"geometry":{"type":"Point","coordinates":[144.9653484,-37.8063371]}}'
115 | )
116 | 
117 | 
118 | str_capture(string, pattern = '"number":"{number}","street":"{street}".*?"coordinates":\\[{coords}\\]')
119 | ```
120 | 
121 |     #>   number           street                  coords
122 |     #> 1     27 BANAMBILA STREET 149.0826143,-35.2545558
123 |     #> 2    139  BOUVERIE STREET 144.9617149,-37.8032551
124 |     #> 3      6  MOGRIDGE STREET 152.0230999,-28.2230133
125 |     #> 4     22       ORR STREET 144.9653484,-37.8063371
126 | 
127 | ## Simple Benchmark
128 | 
129 | I acknowledge that this isn’t the greatest benchmark, but it is relevant
130 | to my current use-case.
131 | 
132 | -   [nc](https://github.com/tdhock/nc) with the PCRE regex engine is the
133 |     fastest named capture I could find in R.
134 | 
135 |     -   However - I’m not a huge fan of its syntax
136 | 
137 | -   For large inputs (1000+ input strings), `fugly` is significantly
138 |     faster than `unglue`, `utils::strcapture` and `ore`
139 | 
140 | -   The rust regex engine
141 |     [rr4r](https://github.com/yutannihilation/rr4r) is slightly faster
142 |     than `fugly`
143 | 
144 | -   `unglue` is the slowest of the methods.
145 | 
146 | -   `ore` lies somewhere between `unglue` and `utils::strcapture`
147 | 
148 | -   As pointed out by [Michael
149 |     Barrowman](https://twitter.com/MyKo101AB), `tidyr::extract()` will
150 |     also do named capture into a data.frame.
151 | 
152 |     -   Similar to `utils::strcapture()`, the names are not specified
153 |         inline with the regex, but are listed separately.
154 | 
155 | ``` r
156 | # remotes::install_github("jonclayden/ore")
157 | # remotes::install_github("yutannihilation/rr4r")
158 | # remotes::install_github('qinwf/re2r') 
159 | library(ore)
160 | library(rr4r)
161 | library(unglue)
162 | library(ggplot2)
163 | library(tidyr)
164 | 
165 | # meaningless strings for benchmarking
166 | N <- 1000
167 | string <- paste0("Information name:greg age:", seq(N))
168 | 
169 | 
170 | res <- bench::mark(
171 |   `fugly::str_capture()` = fugly::str_capture(string, "name:{name} age:{age=\\d+}"),
172 |   `unglue::unglue()` = unglue::unglue_data(string, "Information name:{name} age:{age=\\d+}"),
173 |   `utils::strcapture()` = utils::strcapture("Information name:(.*?) age:(\\d+)", string, 
174 |                     proto = data.frame(name=character(), age=character())),
175 |   `ore::ore_search()` = do.call(rbind.data.frame, lapply(ore_search(ore('name:(?<name>.*?) age:(?<age>\\d+)', encoding='utf8'), string, all=TRUE), function(x) {x$groups$matches})),
176 |    `rr4r::rr4r_extract_groups()` = rr4r::rr4r_extract_groups(string, "name:(?P<name>.*?) age:(?P<age>\\d+)"),
177 |   `nc::capture_first_vec() PCRE` = nc::capture_first_vec(string, "Information name:", name=".*?", " age:", age="\\d+", engine = 'PCRE'),
178 |   `tidyr::extract()` = tidyr::extract(data.frame(x = string), x, into = c('name', 'age'), regex = 'name:(.*?) age:(\\d+)'),
179 |   check = FALSE
180 | )
181 | ```
182 | 
183 | <img src="man/figures/README-unnamed-chunk-6-1.png" width="100%" />
184 | 
185 | ## Related Software
186 | 
187 | -   [stringr](https://cran.r-project.org/package=stringr)
188 | -   `utils::strcapture()`
189 | -   [unglue::unglue()](https://cran.r-project.org/web/packages/unglue/index.html)
190 | -   [ore](https://github.com/jonclayden/ore), [ore on
191 |     CRAN](https://cran.r-project.org/package=ore)
192 | -   [namedCapture](https://cran.r-project.org/web/packages/namedCapture/index.html)
193 |     Note: I couldn’t get this to work sanely.
194 | -   [rr4r](https://github.com/yutannihilation/rr4r) rust regex engine
195 | -   [nc](https://github.com/tdhock/nc)
196 | 
197 | ## Acknowledgements
198 | 
199 | -   R Core for developing and maintaining the language.
200 | -   CRAN maintainers, for patiently shepherding packages onto CRAN and
201 |     maintaining the repository
202 | 


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolbutuseless/fugly/b28acb6ce44c426a0290e6296ffa74e212a52e0e/man/figures/README-unnamed-chunk-6-1.png


--------------------------------------------------------------------------------
/man/figures/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolbutuseless/fugly/b28acb6ce44c426a0290e6296ffa74e212a52e0e/man/figures/logo.png


--------------------------------------------------------------------------------
/man/figures/white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolbutuseless/fugly/b28acb6ce44c426a0290e6296ffa74e212a52e0e/man/figures/white.png


--------------------------------------------------------------------------------
/man/str_capture.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/core.R
 3 | \name{str_capture}
 4 | \alias{str_capture}
 5 | \title{Named capture groups}
 6 | \usage{
 7 | str_capture(string, pattern, delim = c("{", "}"))
 8 | }
 9 | \arguments{
10 | \item{string}{input character vector}
11 | 
12 | \item{pattern}{a regex using named capture groups as used in \code{glue} and
13 | \code{unglue}}
14 | 
15 | \item{delim}{delimiters of the named capture groups. Note: Very little sanity
16 | checking is done here. You'll want to be able to guarantee that these
17 | delims do not appear in your actual string input otherwise things
18 | will not go as you want. Caveat Emptor!}
19 | }
20 | \value{
21 | data.frame of captured groups
22 | }
23 | \description{
24 | Named capture groups
25 | }
26 | 


--------------------------------------------------------------------------------