├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS.md ├── R └── core.R ├── README.Rmd ├── README.md └── man ├── figures ├── README-unnamed-chunk-6-1.png ├── logo.png └── white.png └── str_capture.Rd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README.Rmd$ 4 | ^README.md$ 5 | ^working$ 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .Rhistory 3 | *.Rproj 4 | .Rproj.user 5 | *.swp 6 | *.txt 7 | inst/doc 8 | doc 9 | Meta 10 | pkgdown 11 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: fugly 2 | Type: Package 3 | Title: Extract Named Substrings Using Regular Expressions 4 | Version: 0.1.0 5 | Author: mikefc 6 | Maintainer: mikefc 7 | Description: Extract named substrings using named capture groups in regular expressions. 
8 | License: MIT + file LICENSE 9 | Encoding: UTF-8 10 | LazyData: true 11 | RoxygenNote: 7.1.1 12 | Imports: 13 | stringr 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 mikefc@coolbutuseless.com 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' Named capture groups
#'
#' Extract substrings from \code{string} using a regular expression whose
#' capture groups are named inline. A group is written as its name between the
#' delimiters, optionally followed by \code{=} and the regex for that group.
#' When no regex is given for a group, \code{.*?} is used.
#'
#' @param string input character vector
#' @param pattern a single regex using named capture groups as used in
#'        \code{glue} and \code{unglue}. Every other group in the pattern
#'        must be non-capturing, i.e. \code{(?:...)}, otherwise the captured
#'        columns could not be matched to names and an error is raised.
#' @param delim delimiters of the named capture groups. Note: Very little
#'        sanity checking is done here. You'll want to be able to guarantee
#'        that these delims do not appear in your actual string input
#'        otherwise things will not go as you want. Caveat Emptor!
#'
#' @return data.frame of captured groups: one character column per named
#'         group and one row per element of \code{string}.
#'
#' @examples
#' string <- c(
#'   "information: name:greg age:27 ",
#'   "information: name:mary age:34 "
#' )
#' str_capture(string, "name:{name} age:{age=[0-9]+}")
#'
#' @import stringr
#' @export
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
str_capture <- function(string, pattern, delim = c('{', '}')) {

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Assert pattern and delim are sane
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  stopifnot(is.character(pattern), length(pattern) == 1L, !is.na(pattern))
  stopifnot(is.character(delim), length(delim) == 2L, !anyNA(delim))
  stopifnot(delim[1L] != delim[2L])
  stopifnot(all(nchar(delim) == 1L))

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # This is how the capture patterns will be extracted e.g. "<(.*?)>"
  # Only non-alphanumeric delimiters are backslash-escaped: a backslash in
  # front of a letter or digit is a regex escape (e.g. "\d", "\b"), not a
  # literal character.
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  is_alnum        <- grepl("^[[:alnum:]]$", delim)
  escaped_delim   <- ifelse(is_alnum, delim, paste0("\\", delim))
  capture_pattern <- paste0(escaped_delim[1L], "(.*?)", escaped_delim[2L])

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Extract the captured names + regexes from the user-supplied pattern.
  # Column 2 of the match matrix is the text between the delimiters,
  # i.e. "name" or "name=regex".
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  capture_specs <- stringr::str_match_all(pattern, capture_pattern)[[1L]][, 2L]
  if (length(capture_specs) == 0L) {
    stop("`pattern` does not contain any named capture groups", call. = FALSE)
  }
  capture_groups <- stringr::str_split_fixed(capture_specs, "=", 2L)

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Sanity check the names
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  capture_names <- capture_groups[, 1L]
  stopifnot(!anyNA(capture_names))
  stopifnot(all(nchar(capture_names) > 0L))
  stopifnot(!anyDuplicated(capture_names))

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Wrap each group's regex in parentheses.
  # if no regex supplied, use ".*?"
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  capture_regex <- capture_groups[, 2L]
  capture_regex[capture_regex == ""] <- ".*?"
  capture_regex <- paste0("(", capture_regex, ")")

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Replace each of the user's capture specs with just the regex for that
  # group. The pattern is split once at the capture specs and the regexes
  # are interleaved with the surrounding literal parts, so the inserted
  # regexes are never re-scanned and need no replacement-string escaping.
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  n_groups      <- length(capture_regex)
  literal_parts <- stringr::str_split(pattern, capture_pattern)[[1L]]
  stopifnot(length(literal_parts) == n_groups + 1L)
  final_pattern <- paste0(
    c(
      rbind(literal_parts[seq_len(n_groups)], capture_regex),
      literal_parts[n_groups + 1L]
    ),
    collapse = ""
  )

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Run the regex and create a data.frame result
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  matched <- stringr::str_match(string, final_pattern)[, -1L, drop = FALSE]

  # Extra capturing groups would shift the columns relative to the names,
  # so fail loudly rather than return mislabelled data.
  if (ncol(matched) != n_groups) {
    stop(
      "`pattern` contains capturing groups other than the named groups; ",
      "use non-capturing groups `(?:...)` instead",
      call. = FALSE
    )
  }

  # stringsAsFactors = FALSE keeps the columns as character on R < 4.0 too
  res <- as.data.frame(matched, stringsAsFactors = FALSE)
  names(res) <- capture_names

  res
}


# Scratch example for interactive development; never executed.
if (FALSE) {
  delim <- c('<', '>')

  string <- c(
    "information: name:greg age:27 ",
    "information: name:mary age:34 "
  )

  pattern <- "name:{name} age:{age=\\d+}"

  str_capture(string, pattern)
}
Borrowed heavily from Nick Tierney's Syn logo process 20 | library(magick) 21 | library(showtext) 22 | font_add_google("Allura", "gf") 23 | 24 | 25 | # pkgdown::build_site(override = list(destination = "../coolbutuseless.github.io/package/minipdf")) 26 | ``` 27 | 28 | ```{r echo = FALSE, eval = FALSE} 29 | img <- image_read("man/figures/white.png") #%>% 30 | # image_transparent(color = "#f9fafb", fuzz = 10) %>% 31 | # image_trim() %>% 32 | # image_threshold() 33 | 34 | 35 | hexSticker::sticker(subplot = img, 36 | s_x = 0.92, 37 | s_y = 1.2, 38 | s_width = 1.5, 39 | s_height = 0.95, 40 | package = "fugly", 41 | p_x = 1, 42 | p_y = 1.15, 43 | p_color = "#223344", 44 | p_family = "gf", 45 | p_size = 23, 46 | h_size = 1.2, 47 | h_fill = "#ffffff", 48 | h_color = "#223344", 49 | filename = "man/figures/logo.png") 50 | 51 | image_read("man/figures/logo.png") 52 | ``` 53 | 54 | # fugly 55 | 56 | 57 | 58 | ![](https://img.shields.io/badge/cool-useless-green.svg) 59 | 60 | 61 | 62 | This package provides a single function (`str_capture`) for using named capture 63 | groups to extract values from strings. A key requirement for readability is that 64 | the names of the capture groups are specified inline as part of the regex, 65 | and not in an external vector or as separate names. 66 | 67 | `fugly::str_capture()` is implemented as a wrapper around 68 | [stringr](https://cran.r-project.org/package=stringr). This is because `stringr` 69 | itself does not yet do named capture groups (See issues for 70 | [stringr](https://github.com/tidyverse/stringr/issues/71) and 71 | [stringi](https://github.com/gagolews/stringi/issues/153)). 72 | 73 | `fugly::str_capture()` is very similar to a number of existing packages. See 74 | table below for a comparison. 
75 | 76 | | Method | Speed | Inline capture group naming | robust | 77 | |-----------------------------|----------|-----------------------------|--------| 78 | | `fugly::str_capture` | Fast | Yes | No | 79 | | `rr4r::rr4r_extract_groups` | Fast | Yes | Yes | 80 | | `nc::capture_first_vec` | Fast | No | Yes | 81 | | `tidyr::extract` | Fast | No | Yes | 82 | | `utils::strcapture` | Middling | No | Yes | 83 | | `unglue::unglue` | Slow | Yes | Yes | 84 | | `ore::ore_search` | Slow | Yes | Yes | 85 | 86 | ### What do I mean when I say `fugly::str_capture()` is unsafe/dodgy/non-robust? 87 | 88 | - It doesn't adhere to standard regular expression syntax for named capture groups as used in perl, python etc. 89 | 90 | - It doesn't really adhere to `glue` syntax (although it looks similar at a surface level). 91 | 92 | - If you specify delimiters which appear in your string input, then you're going to have a bad time. 93 | 94 | - It's generally only been tested on data which is: 95 | 96 | - highly structured 97 | - only ASCII 98 | - non-pathological 99 | 100 | ### What's in the box? 101 | 102 | - `fugly::str_capture(string, pattern, delim)` 103 | 104 | - capture named groups with regular expressions 105 | - returns a data.frame with all columns containing character strings 106 | - can mix-and-match with non-capturing regular expressions 107 | - if no regular expression specified for a named group then `.*?` is used. 108 | - does not do any type guessing/conversion. 109 | 110 | ## Installation 111 | 112 | You can install from [GitHub](https://github.com/coolbutuseless/fugly) with: 113 | 114 | ``` r 115 | # install.packages('remotes') 116 | remotes::install_github('coolbutuseless/fugly') 117 | ``` 118 | 119 | ## Example 1 120 | 121 | In the following example: 122 | 123 | - Input consists of multiple strings 124 | - capture groups are delimited by `{}` by default. 
125 | - the regex for the capture group for `name` is unspecified, so `.*?` will be used 126 | - the regex for the capture group for `age` is `\d+` i.e. match must consist of 1-or-more digits 127 | 128 | ```{r example} 129 | library(fugly) 130 | 131 | string <- c( 132 | "information: Name:greg Age:27 ", 133 | "information: Name:mary Age:34 " 134 | ) 135 | 136 | str_capture(string, pattern = "Name:{name} Age:{age=\\d+}") 137 | ``` 138 | 139 | ## Example 2 140 | 141 | A more complicated example: 142 | 143 | - Note the mixture of capturing groups and a bare `.*?` in the pattern which is not returned as a result 144 | 145 | ```{r} 146 | string <- c( 147 | '{"type":"Feature","properties":{"hash":"1348778913c0224a","number":"27","street":"BANAMBILA STREET","unit":"","city":"ARANDA","district":"","region":"ACT","postcode":"2614","id":"GAACT714851647"},"geometry":{"type":"Point","coordinates":[149.0826143,-35.2545558]}}', 148 | '{"type":"Feature","properties":{"hash":"dc776871c868bc7e","number":"139","street":"BOUVERIE STREET","unit":"UNIT 711","city":"CARLTON","district":"","region":"VIC","postcode":"3053","id":"GAVIC423944917"},"geometry":{"type":"Point","coordinates":[144.9617149,-37.8032551]}}', 149 | '{"type":"Feature","properties":{"hash":"8197f34a40ccad47","number":"6","street":"MOGRIDGE STREET","unit":"","city":"WARWICK","district":"","region":"QLD","postcode":"4370","id":"GAQLD155949502"},"geometry":{"type":"Point","coordinates":[152.0230999,-28.2230133]}}', 150 | '{"type":"Feature","properties":{"hash":"18edc96308fc1a8e","number":"22","street":"ORR STREET","unit":"UNIT 507","city":"CARLTON","district":"","region":"VIC","postcode":"3053","id":"GAVIC424282716"},"geometry":{"type":"Point","coordinates":[144.9653484,-37.8063371]}}' 151 | ) 152 | 153 | 154 | str_capture(string, pattern = '"number":"{number}","street":"{street}".*?"coordinates":\\[{coords}\\]') 155 | 156 | ``` 157 | 158 | ## Simple Benchmark 159 | 160 | I acknowledge that this isn't the greatest 
benchmark, but it is relevant to my current use-case. 161 | 162 | - [nc](https://github.com/tdhock/nc) with the PCRE regex engine is the fastest named capture I could find in R. 163 | 164 | - However - I'm not a huge fan of its syntax 165 | 166 | - For large inputs (1000+ input strings), `fugly` is significantly faster than `unglue`, `utils::strcapture` and `ore` 167 | 168 | - The rust regex engine [rr4r](https://github.com/yutannihilation/rr4r) is slightly faster than `fugly` 169 | 170 | - `unglue` is the slowest of the methods. 171 | 172 | - `ore` lies somewhere between `unglue` and `utils::strcapture` 173 | 174 | - As pointed out by [Michael Barrowman](https://twitter.com/MyKo101AB), `tidyr::extract()` will also do named capture into a data.frame. 175 | 176 | - Similar to `utils::strcapture()`, the names are not specified inline with the regex, but are listed separately. 177 | 178 | ```{r warning=FALSE, message=FALSE} 179 | # remotes::install_github("jonclayden/ore") 180 | # remotes::install_github("yutannihilation/rr4r") 181 | # remotes::install_github('qinwf/re2r') 182 | library(ore) 183 | library(rr4r) 184 | library(unglue) 185 | library(ggplot2) 186 | library(tidyr) 187 | 188 | # meaningless strings for benchmarking 189 | N <- 1000 190 | string <- paste0("Information name:greg age:", seq(N)) 191 | 192 | 193 | res <- bench::mark( 194 | `fugly::str_capture()` = fugly::str_capture(string, "name:{name} age:{age=\\d+}"), 195 | `unglue::unglue()` = unglue::unglue_data(string, "Information name:{name} age:{age=\\d+}"), 196 | `utils::strcapture()` = utils::strcapture("Information name:(.*?) age:(\\d+)", string, 197 | proto = data.frame(name=character(), age=character())), 198 | `ore::ore_search()` = do.call(rbind.data.frame, lapply(ore_search(ore('name:(?<name>.*?) age:(?<age>\\d+)', encoding='utf8'), string, all=TRUE), function(x) {x$groups$matches})), 199 | `rr4r::rr4r_extract_groups()` = rr4r::rr4r_extract_groups(string, "name:(?P<name>.*?) 
age:(?P<age>\\d+)"), 200 | `nc::capture_first_vec() PCRE` = nc::capture_first_vec(string, "Information name:", name=".*?", " age:", age="\\d+", engine = 'PCRE'), 201 | `tidyr::extract()` = tidyr::extract(data.frame(x = string), x, into = c('name', 'age'), regex = 'name:(.*?) age:(\\d+)'), 202 | check = FALSE 203 | ) 204 | ``` 205 | 206 | ```{r echo=FALSE} 207 | plot(res) + 208 | theme_bw() + 209 | theme(legend.position = 'bottom') 210 | ``` 211 | 212 | ## Related Software 213 | 214 | - [stringr](https://cran.r-project.org/package=stringr) 215 | - `utils::strcapture()` 216 | - [unglue::unglue()](https://cran.r-project.org/web/packages/unglue/index.html) 217 | - [ore](https://github.com/jonclayden/ore), [ore on CRAN](https://cran.r-project.org/package=ore) 218 | - [namedCapture](https://cran.r-project.org/web/packages/namedCapture/index.html) Note: I couldn't get this to work sanely. 219 | - [rr4r](https://github.com/yutannihilation/rr4r) rust regex engine 220 | - [nc](https://github.com/tdhock/nc) 221 | 222 | ## Acknowledgements 223 | 224 | - R Core for developing and maintaining the language. 225 | - CRAN maintainers, for patiently shepherding packages onto CRAN and maintaining the repository 226 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # fugly 5 | 6 | 7 | 8 | ![](https://img.shields.io/badge/cool-useless-green.svg) 9 | 10 | 11 | 12 | This package provides a single function (`str_capture`) for using named 13 | capture groups to extract values from strings. A key requirement for 14 | readability is that the names of the capture groups are specified inline 15 | as part of the regex, and not in an external vector or as separate 16 | names. 17 | 18 | `fugly::str_capture()` is implemented as a wrapper around 19 | [stringr](https://cran.r-project.org/package=stringr). 
This is because 20 | `stringr` itself does not yet do named capture groups (See issues for 21 | [stringr](https://github.com/tidyverse/stringr/issues/71) and 22 | [stringi](https://github.com/gagolews/stringi/issues/153)). 23 | 24 | `fugly::str_capture()` is very similar to a number of existing packages. 25 | See table below for a comparison. 26 | 27 | | Method | Speed | Inline capture group naming | robust | 28 | |-----------------------------|----------|-----------------------------|--------| 29 | | `fugly::str_capture` | Fast | Yes | No | 30 | | `rr4r::rr4r_extract_groups` | Fast | Yes | Yes | 31 | | `nc::capture_first_vec` | Fast | No | Yes | 32 | | `tidy::extract` | Fast | No | Yes | 33 | | `utils::strcapture` | Middling | No | Yes | 34 | | `unglue::unglue` | Slow | Yes | Yes | 35 | | `ore::ore_search` | Slow | Yes | Yes | 36 | 37 | ### What do I mean when I say `fugly::str_capture()` is unsafe/dodgy/non-robust? 38 | 39 | - It doesn’t adhere to standard regular expression syntax for named 40 | capture groups as used in perl, python etc. 41 | 42 | - It doesn’t really adhere to `glue` syntax (although it looks similar 43 | at a surface level). 44 | 45 | - If you specify delimiters which appear in your string input, then 46 | you’re going to have a bad time. 47 | 48 | - It’s generally only been tested on data which is: 49 | 50 | - highly structured 51 | - only ASCII 52 | - non-pathological 53 | 54 | ### What’s in the box? 55 | 56 | - `fugly::str_capture(string, pattern, delim)` 57 | 58 | - capture named groups with regular expressions 59 | - returns a data.frame with all columns containing character 60 | strings 61 | - can mix-and-match with non-capturing regular expressions 62 | - if no regular expression specified for a named group then `.*?` 63 | is used. 64 | - does not do any type guessing/conversion. 
65 | 66 | ## Installation 67 | 68 | You can install from [GitHub](https://github.com/coolbutuseless/fugly) 69 | with: 70 | 71 | ``` r 72 | # install.package('remotes') 73 | remotes::install_github('coolbutuseless/fugly') 74 | ``` 75 | 76 | ## Example 1 77 | 78 | In the following example: 79 | 80 | - Input consists of multiple strings 81 | - capture groups are delimited by `{}` by default. 82 | - the regex for the capture group for `name` is unspecified, so `.*?` 83 | will be used 84 | - the regex for the capture group for `age` is `\d+` i.e. match must 85 | consist of 1-or-more digits 86 | 87 | ``` r 88 | library(fugly) 89 | 90 | string <- c( 91 | "information: Name:greg Age:27 ", 92 | "information: Name:mary Age:34 " 93 | ) 94 | 95 | str_capture(string, pattern = "Name:{name} Age:{age=\\d+}") 96 | ``` 97 | 98 | #> name age 99 | #> 1 greg 27 100 | #> 2 mary 34 101 | 102 | ## Example 2 103 | 104 | A more complicated example: 105 | 106 | - Note the mixture of capturing groups and a bare `.*?` in the pattern 107 | which is not returned as a result 108 | 109 | ``` r 110 | string <- c( 111 | '{"type":"Feature","properties":{"hash":"1348778913c0224a","number":"27","street":"BANAMBILA STREET","unit":"","city":"ARANDA","district":"","region":"ACT","postcode":"2614","id":"GAACT714851647"},"geometry":{"type":"Point","coordinates":[149.0826143,-35.2545558]}}', 112 | '{"type":"Feature","properties":{"hash":"dc776871c868bc7e","number":"139","street":"BOUVERIE STREET","unit":"UNIT 711","city":"CARLTON","district":"","region":"VIC","postcode":"3053","id":"GAVIC423944917"},"geometry":{"type":"Point","coordinates":[144.9617149,-37.8032551]}}', 113 | '{"type":"Feature","properties":{"hash":"8197f34a40ccad47","number":"6","street":"MOGRIDGE STREET","unit":"","city":"WARWICK","district":"","region":"QLD","postcode":"4370","id":"GAQLD155949502"},"geometry":{"type":"Point","coordinates":[152.0230999,-28.2230133]}}', 114 | 
'{"type":"Feature","properties":{"hash":"18edc96308fc1a8e","number":"22","street":"ORR STREET","unit":"UNIT 507","city":"CARLTON","district":"","region":"VIC","postcode":"3053","id":"GAVIC424282716"},"geometry":{"type":"Point","coordinates":[144.9653484,-37.8063371]}}' 115 | ) 116 | 117 | 118 | str_capture(string, pattern = '"number":"{number}","street":"{street}".*?"coordinates":\\[{coords}\\]') 119 | ``` 120 | 121 | #> number street coords 122 | #> 1 27 BANAMBILA STREET 149.0826143,-35.2545558 123 | #> 2 139 BOUVERIE STREET 144.9617149,-37.8032551 124 | #> 3 6 MOGRIDGE STREET 152.0230999,-28.2230133 125 | #> 4 22 ORR STREET 144.9653484,-37.8063371 126 | 127 | ## Simple Benchmark 128 | 129 | I acknowledge that this isn’t the greatest benchmark, but it is relevant 130 | to my current use-case. 131 | 132 | - [nc](https://github.com/tdhock/nc) with the PCRE regex engine is the 133 | fastest named capture I could find in R. 134 | 135 | - However - I’m not a huge fan of its syntax 136 | 137 | - For large inputs (1000+ input strings), `fugly` is significantly 138 | faster than `unglue`, `utils::strcapture` and \``ore` 139 | 140 | - The rust regex engine 141 | [rr4r](https://github.com/yutannihilation/rr4r) is slightly faster 142 | than `fugly` 143 | 144 | - `unglue` is the slowest of the methods. 145 | 146 | - `ore` lies somewhere between `unglue` and `utils::strcapture` 147 | 148 | - As pointed out by [Michael 149 | Barrowman](https://twitter.com/MyKo101AB), `tidyr::extract()` will 150 | also do named capture into a data.frame. 151 | 152 | - Similar to `utils::strcapture()`, the names are not specified 153 | inline with the regex, but are listed separately. 
154 | 155 | ``` r 156 | # remotes::install_github("jonclayden/ore") 157 | # remotes::install_github("yutannihilation/rr4r") 158 | # remotes::install_github('qinwf/re2r') 159 | library(ore) 160 | library(rr4r) 161 | library(unglue) 162 | library(ggplot2) 163 | library(tidyr) 164 | 165 | # meaningless strings for benchmarking 166 | N <- 1000 167 | string <- paste0("Information name:greg age:", seq(N)) 168 | 169 | 170 | res <- bench::mark( 171 | `fugly::str_capture()` = fugly::str_capture(string, "name:{name} age:{age=\\d+}"), 172 | `unglue::unglue()` = unglue::unglue_data(string, "Information name:{name} age:{age=\\d+}"), 173 | `utils::strcapture()` = utils::strcapture("Information name:(.*?) age:(\\d+)", string, 174 | proto = data.frame(name=character(), age=character())), 175 | `ore::ore_search()` = do.call(rbind.data.frame, lapply(ore_search(ore('name:(?.*?) age:(?\\d+)', encoding='utf8'), string, all=TRUE), function(x) {x$groups$matches})), 176 | `rr4r::rr4r_extract_groups()` = rr4r::rr4r_extract_groups(string, "name:(?P.*?) age:(?P\\d+)"), 177 | `nc::capture_first_vec() PCRE` = nc::capture_first_vec(string, "Information name:", name=".*?", " age:", age="\\d+", engine = 'PCRE'), 178 | `tidyr::extract()` = tidyr::extract(data.frame(x = string), x, into = c('name', 'age'), regex = 'name:(.*?) age:(\\d+)'), 179 | check = FALSE 180 | ) 181 | ``` 182 | 183 | 184 | 185 | ## Related Software 186 | 187 | - [stringr](https://cran.r-project.org/package=stringr) 188 | - `utils::strcapture()` 189 | - [unglue::unglue()](%5Bunglue%5D(https://cran.r-project.org/web/packages/unglue/index.html)) 190 | - [ore](https://github.com/jonclayden/ore), [ore on 191 | CRAN](https://cran.r-project.org/package=ore) 192 | - [namedCapture](https://cran.r-project.org/web/packages/namedCapture/index.html) 193 | Note: I couldn’t get this to work sanely. 
194 | - [rr4r](https://github.com/yutannihilation/rr4r) rust regex engine 195 | - [nc](https://github.com/tdhock/nc) 196 | 197 | ## Acknowledgements 198 | 199 | - R Core for developing and maintaining the language. 200 | - CRAN maintainers, for patiently shepherding packages onto CRAN and 201 | maintaining the repository 202 | -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolbutuseless/fugly/b28acb6ce44c426a0290e6296ffa74e212a52e0e/man/figures/README-unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolbutuseless/fugly/b28acb6ce44c426a0290e6296ffa74e212a52e0e/man/figures/logo.png -------------------------------------------------------------------------------- /man/figures/white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolbutuseless/fugly/b28acb6ce44c426a0290e6296ffa74e212a52e0e/man/figures/white.png -------------------------------------------------------------------------------- /man/str_capture.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core.R 3 | \name{str_capture} 4 | \alias{str_capture} 5 | \title{Named capture groups} 6 | \usage{ 7 | str_capture(string, pattern, delim = c("{", "}")) 8 | } 9 | \arguments{ 10 | \item{string}{input character vector} 11 | 12 | \item{pattern}{a regex using named capture groups as used in \code{glue} and 13 | \code{unglue}} 14 | 15 | \item{delim}{delimiters of the named capture groups. Note: Very little sanity 16 | checking is done here. 
You'll want to be able to guarantee that these 17 | delims do not appear in your actual string input otherwise things 18 | will not go as you want. Caveat Emptor!} 19 | } 20 | \value{ 21 | data.frame of captured groups 22 | } 23 | \description{ 24 | Named capture groups 25 | } 26 | --------------------------------------------------------------------------------