├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ └── R-CMD-check.yaml ├── .gitignore ├── ChangeLog ├── DESCRIPTION ├── Makefile ├── NAMESPACE ├── R ├── RcppExports.R ├── sf_functions.r ├── zz_help_files.R └── zzz.R ├── README.md ├── cleanup ├── configure ├── configure.ac ├── inst ├── PCRE2_LICENSE.txt ├── extra_tests │ └── benchmark_test.r ├── icelandic_words_500_utf8.txt └── include │ ├── sf_external.h │ └── sf_internal.h ├── man ├── convert_to_sf.Rd ├── get_string_type.Rd ├── materialize.Rd ├── random_strings.Rd ├── sf_assign.Rd ├── sf_collapse.Rd ├── sf_compare.Rd ├── sf_concat.Rd ├── sf_ends.Rd ├── sf_grepl.Rd ├── sf_gsub.Rd ├── sf_iconv.Rd ├── sf_match.Rd ├── sf_nchar.Rd ├── sf_paste.Rd ├── sf_readLines.Rd ├── sf_split.Rd ├── sf_starts.Rd ├── sf_substr.Rd ├── sf_tolower.Rd ├── sf_toupper.Rd ├── sf_trim.Rd ├── sf_vector.Rd ├── sf_writeLines.Rd └── string_identical.Rd ├── src ├── Makevars.in ├── Makevars.win ├── PCRE2 │ ├── config.h │ ├── pcre2.h │ ├── pcre2_auto_possess.c │ ├── pcre2_chartables.c │ ├── pcre2_compile.c │ ├── pcre2_config.c │ ├── pcre2_context.c │ ├── pcre2_convert.c │ ├── pcre2_dfa_match.c │ ├── pcre2_error.c │ ├── pcre2_extuni.c │ ├── pcre2_find_bracket.c │ ├── pcre2_internal.h │ ├── pcre2_intmodedep.h │ ├── pcre2_is_bundled.c │ ├── pcre2_jit_compile.c │ ├── pcre2_jit_match.c │ ├── pcre2_jit_misc.c │ ├── pcre2_jit_neon_inc.h │ ├── pcre2_jit_simd_inc.h │ ├── pcre2_maketables.c │ ├── pcre2_match.c │ ├── pcre2_match_data.c │ ├── pcre2_newline.c │ ├── pcre2_ord2utf.c │ ├── pcre2_pattern_info.c │ ├── pcre2_printint.c │ ├── pcre2_script_run.c │ ├── pcre2_serialize.c │ ├── pcre2_string_utils.c │ ├── pcre2_study.c │ ├── pcre2_substitute.c │ ├── pcre2_substring.c │ ├── pcre2_tables.c │ ├── pcre2_ucd.c │ ├── pcre2_ucp.h │ ├── pcre2_valid_utf.c │ ├── pcre2_xclass.c │ ├── pcre2posix.c │ ├── pcre2posix.h │ └── sljit │ │ ├── sljitConfig.h │ │ ├── sljitConfigInternal.h │ │ ├── sljitExecAllocator.c │ │ ├── sljitLir.c │ │ ├── sljitLir.h │ │ ├── 
sljitNativeARM_32.c │ │ ├── sljitNativeARM_64.c │ │ ├── sljitNativeARM_T2_32.c │ │ ├── sljitNativeMIPS_32.c │ │ ├── sljitNativeMIPS_64.c │ │ ├── sljitNativeMIPS_common.c │ │ ├── sljitNativePPC_32.c │ │ ├── sljitNativePPC_64.c │ │ ├── sljitNativePPC_common.c │ │ ├── sljitNativeSPARC_32.c │ │ ├── sljitNativeSPARC_common.c │ │ ├── sljitNativeTILEGX-encoder.c │ │ ├── sljitNativeTILEGX_64.c │ │ ├── sljitNativeX86_32.c │ │ ├── sljitNativeX86_64.c │ │ ├── sljitNativeX86_common.c │ │ ├── sljitProtExecAllocator.c │ │ └── sljitUtils.c ├── PCRE2_wrapper │ ├── pcre2_wrapper.cpp │ └── pcre2_wrapper.h ├── RcppExports.cpp ├── sf_altrep.h ├── sf_disabled.h ├── sf_functions.cpp └── xxhash │ ├── xxhash.c │ └── xxhash.h ├── tests ├── tests.cpp └── tests.r └── vignettes ├── bench_v2.png ├── vignette.html └── vignette.rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | .travis.yml 4 | rebuild.sh 5 | .*\.tar\.gz 6 | ^local 7 | ^benchmark_results 8 | rticle 9 | .Rhistory 10 | src/ZSTD/LICENSCE.txt 11 | aclocal.m4 12 | Makefile 13 | ^.vscode 14 | ^\.github$ 15 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | # 4 | # NOTE: This workflow is overkill for most R packages and 5 | # check-standard.yaml is likely a better choice. 6 | # usethis::use_github_action("check-standard") will install it. 
7 | on: 8 | push: 9 | branches: [main, master] 10 | pull_request: 11 | branches: [main, master] 12 | 13 | name: R-CMD-check 14 | 15 | jobs: 16 | # rchk: 17 | # runs-on: ubuntu-latest 18 | # steps: 19 | # - uses: actions/checkout@v1 20 | # - uses: r-lib/actions/run-rchk@master 21 | R-CMD-check: 22 | runs-on: ${{ matrix.config.os }} 23 | 24 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 25 | 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | config: 30 | - {os: macOS-latest, r: 'release'} 31 | 32 | - {os: windows-latest, r: 'release'} 33 | # Use 3.6 to trigger usage of RTools35 34 | # - {os: windows-latest, r: '3.6'} 35 | 36 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 37 | - {os: ubuntu-latest, r: 'release'} 38 | - {os: ubuntu-latest, r: 'oldrel-1'} 39 | - {os: ubuntu-latest, r: 'oldrel-2'} 40 | - {os: ubuntu-latest, r: 'oldrel-3'} 41 | - {os: ubuntu-latest, r: 'oldrel-4'} 42 | 43 | env: 44 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 45 | R_KEEP_PKG_SOURCE: yes 46 | 47 | steps: 48 | - name: Windows CRLF fix 49 | run: git config --global core.autocrlf false 50 | 51 | - uses: actions/checkout@v3 52 | 53 | - uses: r-lib/actions/setup-pandoc@v2 54 | 55 | - uses: r-lib/actions/setup-r@v2 56 | with: 57 | r-version: ${{ matrix.config.r }} 58 | http-user-agent: ${{ matrix.config.http-user-agent }} 59 | use-public-rspm: true 60 | 61 | - uses: r-lib/actions/setup-r-dependencies@v2 62 | with: 63 | extra-packages: any::rcmdcheck 64 | needs: check 65 | 66 | - uses: r-lib/actions/check-r-package@v2 67 | with: 68 | upload-snapshots: true 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | src/*.a 9 | src/**/*.o 10 | src/**/*.so 11 | src/**/*.dll 12 | src/**/*.a 13 | stringfish*.tar.gz 14 | *.Rcheck 15 | ..Rcheck 16 | rebuild.sh 
17 | local 18 | .DS_Store 19 | src/ZSTD/LICENSCE.txt 20 | aclocal.m4 21 | config.log 22 | config.status 23 | autom4te.cache/* 24 | .vscode 25 | /src/.vscode 26 | src/Makevars 27 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | Version 0.16.0 (2023-11-25) 2 | * Add in ALTREP serialization 3 | 4 | Version 0.15.8 (2023-5-29) 5 | * Fix warn message in R 4.3+ "warning: function declaration isn’t a prototype [-Wstrict-prototypes]" 6 | * Remove C++11 SystemRequirements per new CRAN guidelines 7 | * Switch to `std::atomic` instead of `tbb::atomic` due to deprecation (https://github.com/traversc/stringfish/issues/19) 8 | 9 | Version 0.15.7 (2022-2-21) 10 | * Switch to using Rcpp's new `signature` attribute to define default parameters 11 | * Use more accurate language in configure script 12 | 13 | Version 0.15.6 (2021-12-8) 14 | * Bugfix to work on Linux Alpine; missing checks for TBB (https://github.com/r-hub/r-minimal/issues/37, https://github.com/traversc/stringfish/issues/11) 15 | 16 | Version 0.15.5 (2021-11-30) 17 | * Bugfix to work on R 3.4 (https://github.com/tidyverse/multidplyr/pull/129) 18 | 19 | Version 0.15.4 (2021-10-11) 20 | * Disable `sf_match` test due to error on Solaris and R 4.1.X bug (https://bugs.r-project.org/show_bug.cgi?id=18211) 21 | 22 | Version 0.15.3 (2021-10-9) 23 | * Fix to https://github.com/traversc/stringfish/issues/7; definition conflict with bundled PCRE2 24 | * Update autoconf to version 2.6.9 (autoupdate; autoreconf --warnings=obsolete) 25 | * `nthreads` parameter default is now `getOption("stringfish.nthreads", 1L)`. 
Set using `options(stringfish.nthreads = 4L)` 26 | 27 | Version 0.15.2 (2021-7-23) 28 | * Change ALTVEC_DATAPTR to DATAPTR to conform to changes in the API 29 | 30 | Version 0.15.1 (2021-3-13) 31 | * Fix PCRE2 issue on Apple M1 32 | * Fix for missing return type in src/sf_disabled.h 33 | 34 | Version 0.15.0 (2021-2-20) 35 | * Reduce requirement for R 3.5 so packages depending on `stringfish` don't require R 3.5. Most functionality will not be available in R < 3.5. 36 | * Update `xxhash` library to release 0.8.0 and use the improved XXH3 algorithm for hashing. 37 | 38 | Version 0.14.2 (2020-9-3) 39 | * Fix bug with `extract_subset` ALTREP routine (https://github.com/traversc/qs/issues/42) 40 | 41 | Version 0.14.1 (2020-7-23) 42 | * Implemented multithreading with RcppParallel 43 | * Addition of `Extract_subset` ALTREP method 44 | * Additional functions (`sf_compare`, `sf_concat`, `sf_equals`) 45 | * Various bug fixes 46 | 47 | Version 0.13.2 (2020-7-5) 48 | * Removed -mshstk flag to fix CRAN note 49 | 50 | Version 0.13.1 (2020-7-5) 51 | * Additional functions: `sf_toupper`, `sf_tolower`, `sf_toupper`, `sf_tolower`, `sf_starts`, `sf_ends`, `sf_trim`, `sf_split`, `sf_match` 52 | * Additional functions: `string_identical`, `sf_writeLines` 53 | 54 | Version 0.11.2 (2020-6-1) 55 | * Fixed compilation error on Fedora (adding -mshstk flag compile flag in configure file) 56 | * Fixed a bug in alt-rep set_string_elt method 57 | * Added additional sfstring constructor `sfstring(size_t, cetype_t)` 58 | 59 | Version 0.11.2 (2020-6-1) 60 | * Fixed compilation error on Fedora (adding -mshstk flag compile flag in configure file) 61 | * Fixed a bug in alt-rep set_string_elt method 62 | * Added additional sfstring constructor `sfstring(size_t, cetype_t)` 63 | 64 | Version 0.11 (2020-5-14) 65 | * Check for PCRE2 system installation and updated bundled version to latest (10.35) 66 | * Fix to copyright statements in DESCRIPTION file 67 | 68 | Version 0.1 (2020-5-11) 69 | * Initial CRAN 
release 70 | * An alt-rep string framework for fast and extensible processing of string data 71 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: stringfish 2 | Title: Alt String Implementation 3 | Version: 0.16.0 4 | Date: 2023-11-27 5 | Authors@R: c( 6 | person("Travers", "Ching", email = "traversc@gmail.com", role = c("aut", "cre", "cph")), 7 | person("Phillip", "Hazel", role = c("ctb"), comment = "Bundled PCRE2 code"), 8 | person("Zoltan", "Herczeg", role = c("ctb", "cph"), comment = "Bundled PCRE2 code"), 9 | person("University of Cambridge", role = c("cph"), comment = "Bundled PCRE2 code"), 10 | person("Tilera Corporation", role = c("cph"), comment = "Stack-less Just-In-Time compiler bundled with PCRE2"), 11 | person("Yann", "Collet", role = c("ctb", "cph"), comment = "Yann Collet is the author of the bundled xxHash code")) 12 | Maintainer: Travers Ching <traversc@gmail.com> 13 | Description: Provides an extendable, performant and multithreaded 'alt-string' implementation backed by 'C++' vectors and strings. 14 | License: GPL-3 15 | Biarch: true 16 | Encoding: UTF-8 17 | Depends: R (>= 3.0.2) 18 | SystemRequirements: GNU make 19 | LinkingTo: 20 | Rcpp (>= 0.12.18.3), RcppParallel (>= 5.1.4) 21 | Imports: 22 | Rcpp, RcppParallel 23 | Suggests: 24 | qs, knitr, rmarkdown, usethis, dplyr, stringr, rlang 25 | VignetteBuilder: knitr 26 | RoxygenNote: 7.2.3 27 | Copyright: Copyright for the bundled 'PCRE2' library is held by University of Cambridge, Zoltan Herczeg and Tilera Corporation (Stack-less Just-In-Time compiler); Copyright for the bundled 'xxHash' code is held by Yann Collet. 
28 | URL: https://github.com/traversc/stringfish 29 | BugReports: https://github.com/traversc/stringfish/issues 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /bin/bash 2 | PACKAGE := $(shell perl -aF: -ne 'print, exit if s/^Package:\s+//' DESCRIPTION) 3 | VERSION := $(shell perl -aF: -ne 'print, exit if s/^Version:\s+//' DESCRIPTION) 4 | BUILD := $(PACKAGE)_$(VERSION).tar.gz 5 | 6 | .PHONY: doc build install test vignette $(BUILD) 7 | 8 | check: $(BUILD) 9 | R CMD check --as-cran $< 10 | 11 | check-no-vignette: $(BUILD) 12 | R CMD check --as-cran --no-build-vignettes $< 13 | 14 | check-rhub: $(BUILD) 15 | Rscript -e 'rhub::check("$(BUILD)", platform = c("ubuntu-gcc-devel", "windows-x86_64-devel", "solaris-x86-patched", "solaris-x86-patched-ods", "macos-m1-bigsur-release"))' 16 | 17 | check-solaris: $(BUILD) 18 | Rscript -e 'rhub::check("$(BUILD)", platform = c("solaris-x86-patched", "solaris-x86-patched-ods"))' 19 | 20 | check-m1: $(BUILD) 21 | Rscript -e 'rhub::check("$(BUILD)", platform = c("macos-m1-bigsur-release"))' 22 | 23 | compile: 24 | find src/ -type f -exec chmod 644 {} \; 25 | Rscript -e "library(Rcpp); compileAttributes('.');" 26 | # Rscript -e "devtools::load_all(); roxygen2::roxygenise('.');" 27 | find . -iname "*.a" -exec rm {} \; 28 | find . -iname "*.o" -exec rm {} \; 29 | find . -iname "*.so" -exec rm {} \; 30 | 31 | build: 32 | autoconf 33 | chmod 755 cleanup 34 | chmod 755 configure 35 | find src/ -type f -exec chmod 644 {} \; 36 | chmod 644 ChangeLog DESCRIPTION Makefile NAMESPACE README.md 37 | ./configure 38 | ./cleanup 39 | Rscript -e "library(Rcpp); compileAttributes('.');" 40 | Rscript -e "devtools::load_all(); roxygen2::roxygenise('.');" 41 | # rm -f R/RcppExports.R 42 | find . -iname "*.a" -exec rm {} \; 43 | find . -iname "*.o" -exec rm {} \; 44 | find . 
-iname "*.so" -exec rm {} \; 45 | R CMD build . 46 | 47 | install: 48 | autoconf 49 | chmod 755 cleanup 50 | chmod 755 configure 51 | find src/ -type f -exec chmod 644 {} \; 52 | chmod 644 ChangeLog DESCRIPTION Makefile NAMESPACE README.md 53 | ./configure 54 | ./cleanup 55 | Rscript -e "library(Rcpp); compileAttributes('.');" 56 | Rscript -e "devtools::load_all(); roxygen2::roxygenise('.');" 57 | # rm -f R/RcppExports.R 58 | find . -iname "*.a" -exec rm {} \; 59 | find . -iname "*.o" -exec rm {} \; 60 | find . -iname "*.so" -exec rm {} \; 61 | R CMD build . # --no-build-vignettes 62 | R CMD INSTALL $(BUILD) --configure-args="--with-simd=AVX2" # --with-pcre2-force-compile" 63 | 64 | vignette: 65 | Rscript -e "rmarkdown::render(input='vignettes/vignette.rmd', output_format='html_vignette')" 66 | IS_GITHUB=Yes Rscript -e "rmarkdown::render(input='vignettes/vignette.rmd', output_file='../README.md', output_format=rmarkdown::github_document(html_preview=FALSE))"; unset IS_GITHUB 67 | 68 | test: 69 | Rscript tests/tests.R 70 | Rscript inst/extra_tests/benchmark_test.R 5 71 | 72 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | importFrom(Rcpp,sourceCpp) 2 | importFrom(RcppParallel, RcppParallelLibs) 3 | useDynLib(stringfish, .registration=TRUE) 4 | export( 5 | "materialize", 6 | "get_string_type", 7 | "convert_to_sf", "sf_convert", 8 | "sf_vector", 9 | "sf_assign", 10 | "random_strings", 11 | "string_identical", 12 | 13 | "sf_iconv", 14 | "sf_nchar", 15 | "sf_substr", 16 | "sf_paste", 17 | "sf_collapse", 18 | "sf_readLines", 19 | "sf_writeLines", 20 | "sf_grepl", 21 | "sf_gsub", 22 | "sf_toupper", 23 | "sf_tolower", 24 | "sf_starts", 25 | "sf_ends", 26 | "sf_trim", 27 | "sf_split", 28 | "sf_match", 29 | "sf_equals", "sf_compare", 30 | "sf_concat", "sfc") 31 | -------------------------------------------------------------------------------- 
/R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | set_is_utf8_locale <- function() { 5 | invisible(.Call(`_stringfish_set_is_utf8_locale`)) 6 | } 7 | 8 | unset_is_utf8_locale <- function() { 9 | invisible(.Call(`_stringfish_unset_is_utf8_locale`)) 10 | } 11 | 12 | get_is_utf8_locale <- function() { 13 | .Call(`_stringfish_get_is_utf8_locale`) 14 | } 15 | 16 | is_tbb <- function() { 17 | .Call(`_stringfish_is_tbb`) 18 | } 19 | 20 | check_simd <- function() { 21 | invisible(.Call(`_stringfish_check_simd`)) 22 | } 23 | 24 | get_pcre2_info <- function() { 25 | .Call(`_stringfish_get_pcre2_info`) 26 | } 27 | 28 | get_string_type <- function(x) { 29 | .Call(`_stringfish_get_string_type`, x) 30 | } 31 | 32 | materialize <- function(x) { 33 | .Call(`_stringfish_materialize`, x) 34 | } 35 | 36 | sf_vector <- function(len) { 37 | .Call(`_stringfish_sf_vector`, len) 38 | } 39 | 40 | sf_assign <- function(x, i, e) { 41 | invisible(.Call(`_stringfish_sf_assign`, x, i, e)) 42 | } 43 | 44 | sf_iconv <- function(x, from, to, nthreads = getOption("stringfish.nthreads", 1L)) { 45 | .Call(`_stringfish_sf_iconv`, x, from, to, nthreads) 46 | } 47 | 48 | convert_to_sf <- function(x) { 49 | .Call(`_stringfish_convert_to_sf`, x) 50 | } 51 | 52 | sf_nchar <- function(x, type = "chars", nthreads = getOption("stringfish.nthreads", 1L)) { 53 | .Call(`_stringfish_sf_nchar`, x, type, nthreads) 54 | } 55 | 56 | sf_substr <- function(x, start, stop, nthreads = getOption("stringfish.nthreads", 1L)) { 57 | .Call(`_stringfish_sf_substr`, x, start, stop, nthreads) 58 | } 59 | 60 | c_sf_paste <- function(dots, sep, nthreads = 1L) { 61 | .Call(`_stringfish_c_sf_paste`, dots, sep, nthreads) 62 | } 63 | 64 | sf_collapse <- function(x, collapse) { 65 | .Call(`_stringfish_sf_collapse`, x, collapse) 66 | } 67 | 68 | 
sf_readLines <- function(file, encoding = "UTF-8") { 69 | .Call(`_stringfish_sf_readLines`, file, encoding) 70 | } 71 | 72 | sf_writeLines <- function(text, file, sep = "\n", na_value = "NA", encode_mode = "UTF-8") { 73 | invisible(.Call(`_stringfish_sf_writeLines`, text, file, sep, na_value, encode_mode)) 74 | } 75 | 76 | sf_grepl <- function(subject, pattern, encode_mode = "auto", fixed = FALSE, nthreads = getOption("stringfish.nthreads", 1L)) { 77 | .Call(`_stringfish_sf_grepl`, subject, pattern, encode_mode, fixed, nthreads) 78 | } 79 | 80 | sf_split <- function(subject, split, encode_mode = "auto", fixed = FALSE, nthreads = getOption("stringfish.nthreads", 1L)) { 81 | .Call(`_stringfish_sf_split`, subject, split, encode_mode, fixed, nthreads) 82 | } 83 | 84 | sf_gsub <- function(subject, pattern, replacement, encode_mode = "auto", fixed = FALSE, nthreads = getOption("stringfish.nthreads", 1L)) { 85 | .Call(`_stringfish_sf_gsub`, subject, pattern, replacement, encode_mode, fixed, nthreads) 86 | } 87 | 88 | random_strings <- function(N, string_size = 50L, charset = "abcdefghijklmnopqrstuvwxyz", vector_mode = "stringfish") { 89 | .Call(`_stringfish_random_strings`, N, string_size, charset, vector_mode) 90 | } 91 | 92 | sf_tolower <- function(x) { 93 | .Call(`_stringfish_sf_tolower`, x) 94 | } 95 | 96 | sf_toupper <- function(x) { 97 | .Call(`_stringfish_sf_toupper`, x) 98 | } 99 | 100 | sf_match <- function(x, table, nthreads = getOption("stringfish.nthreads", 1L)) { 101 | .Call(`_stringfish_sf_match`, x, table, nthreads) 102 | } 103 | 104 | sf_compare <- function(x, y, nthreads = getOption("stringfish.nthreads", 1L)) { 105 | .Call(`_stringfish_sf_compare`, x, y, nthreads) 106 | } 107 | 108 | c_sf_concat <- function(x) { 109 | .Call(`_stringfish_c_sf_concat`, x) 110 | } 111 | 112 | -------------------------------------------------------------------------------- /R/sf_functions.r: -------------------------------------------------------------------------------- 1 | 
sf_paste <- function(..., sep="", nthreads = getOption("stringfish.nthreads", 1L)) { 2 | if(!is.character(sep) || length(sep) != 1) { 3 | stop("sep should be a character vector of length 1") 4 | } 5 | dots <- list(...) 6 | len <- -1 7 | for(i in seq_along(dots)) { 8 | if(!is.character(dots[[i]])) { 9 | dots[[i]] <- as.character(dots[[i]]) 10 | } 11 | li <- length(dots[[i]]) 12 | if(li == 0) stop("argument cannot be of length zero") 13 | if(li == 1) next 14 | if(len == -1) { 15 | len <- li 16 | } else { 17 | if(li != len) stop("All arguments should be the same length or length 1") 18 | } 19 | } 20 | c_sf_paste(dots, sep, nthreads) 21 | } 22 | 23 | sf_concat <- function(...) { 24 | dots <- list(...) 25 | for(i in seq_along(dots)) { 26 | if(!is.character(dots[[i]])) dots[[i]] <- as.character(dots[[i]]) 27 | } 28 | c_sf_concat(dots) 29 | } 30 | 31 | sf_starts <- function(subject, pattern, ...) { 32 | pattern <- paste0("^", pattern) 33 | sf_grepl(subject, pattern, ...) 34 | } 35 | 36 | sf_ends <- function(subject, pattern, ...) { 37 | pattern <- paste0(pattern, "$") 38 | sf_grepl(subject, pattern, ...) 39 | } 40 | 41 | sf_trim <- function(subject, which = c("both", "left", "right"), whitespace = "[ \\t\\r\\n]", ...) { 42 | which <- match.arg(which) 43 | if(which == "both") { 44 | sf_gsub(sf_gsub(subject, paste0("^", whitespace,"+"), "", ...), paste0(whitespace, "+", "$"), "", ...) 45 | } else if(which == "left") { 46 | sf_gsub(subject, paste0("^", whitespace, "+"), "", ...) 47 | } else { 48 | sf_gsub(subject, paste0(whitespace, "+", "$"), "", ...) 
49 | } 50 | } 51 | 52 | string_identical <- function(x, y) { 53 | stopifnot(is.character(x)) 54 | stopifnot(is.character(y)) 55 | if(length(x) != length(y)) return(FALSE) 56 | na_x <- is.na(x) 57 | na_y <- is.na(y) 58 | stopifnot(identical(na_x,na_y)) 59 | if(all(na_x)) return(TRUE) # correctly catches zero length as well 60 | not_na <- !na_x 61 | if(any(nchar(x[not_na]) != nchar(y[not_na]))) return(FALSE) 62 | if(!all(Encoding(x[not_na]) == Encoding(y[not_na]))) return(FALSE) 63 | if(any(x[not_na] != y[not_na])) return(FALSE) 64 | return(TRUE) 65 | } 66 | 67 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onAttach <- function(libname, pkgname) { 2 | # maybe we should check this at compile time somehow? 3 | if(identical(utils::localeToCharset()[1], "UTF-8")) set_is_utf8_locale() 4 | } 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | stringfish 2 | ================ 3 | 4 | [![R-CMD-check](https://github.com/traversc/stringfish/workflows/R-CMD-check/badge.svg)](https://github.com/traversc/stringfish/actions) 5 | [![CRAN-Status-Badge](https://www.r-pkg.org/badges/version/stringfish)](https://cran.r-project.org/package=stringfish) 6 | [![CRAN-Downloads-Badge](https://cranlogs.r-pkg.org/badges/stringfish)](https://cran.r-project.org/package=stringfish) 7 | [![CRAN-Downloads-Total-Badge](https://cranlogs.r-pkg.org/badges/grand-total/stringfish)](https://cran.r-project.org/package=stringfish) 8 | 9 | `stringfish` is a framework for performing string and sequence 10 | operations using the ALTREP system to speed up the computation of common 11 | string operations. 12 | 13 | The ultimate goal of the package is to unify ALTREP string 14 | implementations under a common framework. 
15 | 16 | The ALTREP system (new as of R 3.5.0) allows package developers to 17 | represent R objects using their own custom memory layout, completely 18 | invisible to the user. `stringfish` represents string data as a simple 19 | C++/STL vector, which is very fast and lightweight. 20 | 21 | Using normal R functions to process string data (e.g. `substr`, `gsub`, 22 | `paste`, etc.) causes “materialization” of ALTREP vectors to normal R 23 | data, which can be a slow process. Therefore, in order to take full 24 | advantage of the ALTREP framework, string processing functions need to 25 | be re-written to be ALTREP aware. This package hopes to fulfill that 26 | purpose. 27 | 28 | ## Installation 29 | 30 | ``` r 31 | install.packages("stringfish", type="source", configure.args="--with-simd=AVX2") 32 | ``` 33 | 34 | ## Benchmark 35 | 36 | The simplest way to show the utility of the ALTREP framework is through 37 | a quick benchmark comparing `stringfish` and base R. 38 | 39 | ![](vignettes/bench_v2.png "bench_v2") 40 | 41 | Yes you are reading the graph correctly: some functions in `stringfish` 42 | are more than an order of magnitude faster than vectorized base R 43 | operations (and even faster with some built-in multithreading). On large 44 | text datasets, this can turn minutes of computation into seconds. 
45 | 46 | ## Currently implemented functions 47 | 48 | A list of implemented `stringfish` functions and analogous base R 49 | functions: 50 | 51 | - `sf_iconv` (`iconv`) 52 | - `sf_nchar` (`nchar`) 53 | - `sf_substr` (`substr`) 54 | - `sf_paste` (`paste0`) 55 | - `sf_collapse` (`paste0` with `collapse`) 56 | - `sf_readLines` (`readLines`) 57 | - `sf_writeLines` (`writeLines`) 58 | - `sf_grepl` (`grepl`) 59 | - `sf_gsub` (`gsub`) 60 | - `sf_toupper` (`toupper`) 61 | - `sf_tolower` (`tolower`) 62 | - `sf_starts` (`startsWith`) 63 | - `sf_ends` (`endsWith`) 64 | - `sf_trim` (`trimws`) 65 | - `sf_split` (`strsplit`) 66 | - `sf_match` (`match` for strings only) 67 | - `sf_compare`/`sf_equals` (`==`, ALTREP-aware string equality) 68 | 69 | Utility functions: 70 | 71 | - `sf_vector` – creates a new and empty `stringfish` vector 72 | - `sf_assign` – assign strings into a `stringfish` vector in place 73 | (like `x[i] <- "mystring"`) 74 | - `sf_convert`/`convert_to_sf` – converts a character vector to a 75 | `stringfish` vector 76 | - `get_string_type` – determines string type (whether ALTREP or 77 | normal) 78 | - `materialize` – converts any ALTREP object into a normal R object 79 | - `random_strings` – creates random strings as either a `stringfish` 80 | or normal R vector 81 | - `string_identical` – like `identical` for strings but also requires 82 | identical encoding (i.e. latin1 and UTF-8 strings will not match) 83 | 84 | In addition, many R operations in base R and other packages are already 85 | ALTREP-aware (i.e. they don’t cause materialization). Functions that 86 | subset or index into string vectors generally do not materialize. 87 | 88 | - `sample` 89 | - `head` 90 | - `tail` 91 | - `[` – e.g. `x[20:30]` 92 | - `dplyr::filter` – e.g. `dplyr::filter(df, sf_starts(col, "a"))` 93 | - Etc. 94 | 95 | `stringfish` functions are not intended to exactly replicate their base 96 | R analogues. 
One difference is that `subject` parameters are always the 97 | first argument, which is easier to use with pipes (`%>%`). E.g., 98 | `gsub(pattern, replacement, subject)` becomes `sf_gsub(subject, pattern, 99 | replacement)`. 100 | 101 | ## Extensibility 102 | 103 | `stringfish` as a framework is intended to be easily extensible. 104 | Stringfish vectors can be worked into `Rcpp` scripts or even into other 105 | packages (see the `qs` package for an example). 106 | 107 | Below is a detailed `Rcpp` script that creates a function to alternate 108 | upper and lower case of strings. 109 | 110 | ``` c 111 | // [[Rcpp::plugins(cpp11)]] 112 | // [[Rcpp::depends(stringfish)]] 113 | #include <Rcpp.h> 114 | #include "sf_external.h" 115 | using namespace Rcpp; 116 | 117 | // [[Rcpp::export]] 118 | SEXP sf_alternate_case(SEXP x) { 119 | // Iterate through a character vector using the RStringIndexer class 120 | // If the input vector x is a stringfish character vector it will do so without materialization 121 | RStringIndexer r(x); 122 | size_t len = r.size(); 123 | 124 | // Create an output stringfish vector 125 | // Like all R objects, it must be protected from garbage collection 126 | SEXP output = PROTECT(sf_vector(len)); 127 | 128 | // Obtain a reference to the underlying output data 129 | sf_vec_data & output_data = sf_vec_data_ref(output); 130 | 131 | // You can use range based for loop via an iterator class that returns RStringIndexer::rstring_info e 132 | // rstring info is a struct containing const char * ptr (null terminated), int len, and cetype_t enc 133 | // a NA string is represented by a nullptr 134 | // Alternatively, access the data via the function r.getCharLenCE(i) 135 | size_t i = 0; 136 | for(auto e : r) { 137 | // check if string is NA and go to next if it is 138 | if(e.ptr == nullptr) { 139 | i++; // increment output index 140 | continue; 141 | } 142 | // create a temporary output string and process the results 143 | std::string temp(e.len, '\0'); 144 | bool 
case_switch = false; 145 | for(int j=0; j<e.len; j++) { 146 | if((e.ptr[j] >= 65) & (e.ptr[j] <= 90)) { // char j is upper case 147 | if((case_switch = !case_switch)) { // check if we should convert to lower case 148 | temp[j] = e.ptr[j] + 32; 149 | continue; 150 | } 151 | } else if((e.ptr[j] >= 97) & (e.ptr[j] <= 122)) { // char j is lower case 152 | if(!(case_switch = !case_switch)) { // check if we should convert to upper case 153 | temp[j] = e.ptr[j] - 32; 154 | continue; 155 | } 156 | } else if(e.ptr[j] == 32) { 157 | case_switch = false; 158 | } 159 | temp[j] = e.ptr[j]; 160 | } 161 | 162 | // Create a new vector element sfstring and insert the processed string into the stringfish vector 163 | // sfstring has three constructors, 1) taking a std::string and encoding, 164 | // 2) a char pointer and encoding, or 3) a CHARSXP object (e.g. sfstring(NA_STRING)) 165 | output_data[i] = sfstring(temp, e.enc); 166 | i++; // increment output index 167 | } 168 | // Finally, call unprotect and return result 169 | UNPROTECT(1); 170 | return output; 171 | } 172 | ``` 173 | 174 | Example function call: 175 | 176 | ``` r 177 | sf_alternate_case("hello world") 178 | [1] "hElLo wOrLd" 179 | ``` 180 | 181 | ## To do 182 | 183 | - Additional functions 184 | - ICU library functions 185 | -------------------------------------------------------------------------------- /cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | rm -f config.* src/Makevars src/config.h 4 | rm -rf autom4te.cache -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_INIT([stringfish],[0.13.2 traversc@gmail.com]) 2 | AC_PATH_PROG([PKGCONF],[pkg-config],[],[$PATH:/usr/local/bin:ext/bin:ext:/sw/bin:/opt/bin:/opt/local/bin]) 3 | 4 | echo "stringfish configure script" 5 | ######################################################## 6 | ### Predefined compile strings 
for different cases 7 | 8 | ADD_LIBS="" 9 | INCLUDE_PATHS="" 10 | LIBPCRE2="" 11 | PCRE2_BUNDLED="" 12 | 13 | ######################################################## 14 | ### Configure args 15 | 16 | AC_ARG_WITH([pcre2-force-compile], 17 | AS_HELP_STRING([--with-pcre2-force-compile],[Force compilation of bundled pcre2 source files]), 18 | [pcre2_force_compile="true"]) 19 | 20 | AC_ARG_WITH([pcre2-include], 21 | AS_HELP_STRING([--with-pcre2-include=INCLUDE_PATH],[the location of pcre2 header files]), 22 | [pcre2_include_path=$withval]) 23 | 24 | AC_ARG_WITH([pcre2-lib], 25 | AS_HELP_STRING([--with-pcre2-lib=LIB_PATH],[the location of pcre2 library files]), 26 | [pcre2_lib_path=$withval]) 27 | 28 | AC_ARG_WITH([simd], 29 | AS_HELP_STRING([--with-simd],[Manually select SIMD support (options: AVX2)]), 30 | [with_simd=$withval]) 31 | 32 | 33 | ######################################################## 34 | #### Version value function 35 | 36 | getVersion() 37 | { 38 | VERSION_STRING=$1 39 | MAJOR=`echo $VERSION_STRING | cut -d. -f1` 40 | MINOR=`echo $VERSION_STRING | cut -d. 
-f2` 41 | echo $(($MAJOR*1000+$MINOR)) 42 | } 43 | 44 | ######################################################## 45 | #### Check for GCC version and add -mshstk cflags for GCC 8+ 46 | #### ver 0.13.2 -- no longer in use 47 | 48 | # echo "Testing for C compiler version" 49 | # echo "R_HOME: $R_HOME" 50 | # : ${R_HOME=`R RHOME`} 51 | # echo "R_HOME: $R_HOME" 52 | # if test -z "${R_HOME}"; then 53 | # echo "could not determine R_HOME" 54 | # exit 1 55 | # fi 56 | # CC=`"${R_HOME}/bin/R" CMD config CC` 57 | # echo "C compiler command: $CC" 58 | 59 | # AC_LANG(C) 60 | # AX_CHECK_COMPILE_FLAG([-mshstk],[MSHSTK_FLAG_AVAIL=yes]) 61 | 62 | # if test xx$MSHSTK_FLAG_AVAIL = "xxyes"; then 63 | # ADD_CFLAGS="${ADD_CFLAGS} -mshstk" 64 | # fi 65 | 66 | # AC_LANG(C) 67 | # AX_COMPILER_VENDOR 68 | # AX_COMPILER_VERSION 69 | # echo "C compiler vendor: $ax_cv_c_compiler_vendor" 70 | # echo "C compiler version: $ax_cv_c_compiler_version" # note: The version is completely wrong for Mac LLVM 71 | 72 | # if test xx$ax_cv_c_compiler_vendor = "xxgnu"; then 73 | # CCVER=`${CC} -dumpversion | cut -f 1 -d "."` 74 | # echo "gcc dumpversion: $CCVER" 75 | # if test "${CCVER}" -ge 8; then 76 | # ADD_CFLAGS="${ADD_CFLAGS} -mshstk" 77 | # fi 78 | # elif test xx$ax_cv_c_compiler_vendor == "xxclang"; then 79 | # CCVER=`${CC} -dumpversion | cut -f 1 -d "."` 80 | # echo "clang dumpversion: $CCVER" 81 | # if test "${CCVER}" -ge 9; then 82 | # ADD_CFLAGS="${ADD_CFLAGS} -mshstk" 83 | # fi 84 | # fi 85 | 86 | 87 | ######################################################## 88 | #### PCRE2 library paths 89 | # Resolution order: forced bundled build, then user-supplied paths, then a pkg-config libpcre2-8 with encoded version >= 10035 (10.35), otherwise the bundled sources. 90 | if test xx$pcre2_force_compile = "xxtrue"; then 91 | echo "Compiling PCRE2 from source due to --with-pcre2-force-compile" 92 | COMPILE_PCRE2="true" 93 | elif test "xx$pcre2_include_path" != "xx"; then 94 | echo "Using user-defined pcre2 install paths" 95 | ADD_LIBS="${ADD_LIBS} -L${pcre2_lib_path}" 96 | INCLUDE_PATHS="${INCLUDE_PATHS} -I${pcre2_include_path}" 97 | COMPILE_PCRE2="false" 98 | elif test
"xx$PKGCONF" != "xx"; then 99 | if "${PKGCONF}" --exists libpcre2-8; then 100 | VERSION_STRING=`${PKGCONF} --modversion libpcre2-8` 101 | VER=`getVersion ${VERSION_STRING}` 102 | if test "${VER}" -ge 10035; then 103 | echo "PCRE2 ${VERSION_STRING} library detected -- skipping PCRE2 compilation" 104 | pcre2_lib_path=`"${PKGCONF}" --libs libpcre2-8` 105 | pcre2_include_path=`"${PKGCONF}" --cflags-only-I libpcre2-8` 106 | ADD_LIBS="${ADD_LIBS} ${pcre2_lib_path}" 107 | INCLUDE_PATHS="${INCLUDE_PATHS} ${pcre2_include_path}" 108 | COMPILE_PCRE2="false" 109 | else 110 | echo "PCRE2 ${VERSION_STRING} library detected but is lower than bundled version (10.35) -- compiling from source" 111 | COMPILE_PCRE2="true" 112 | fi 113 | else 114 | echo "PCRE2 library not detected -- compiling from source" 115 | COMPILE_PCRE2="true" 116 | fi 117 | else 118 | echo "pkg-config not detected -- compiling from source" 119 | COMPILE_PCRE2="true" 120 | fi 121 | # Bundled build: add the PCRE2 source directory to the include path, pass the literal make variable $(LIBPCRE2) through to Makevars, and define PCRE2_BUNDLED. 122 | if test xx$COMPILE_PCRE2 = "xxtrue"; then 123 | INCLUDE_PATHS="${INCLUDE_PATHS} -IPCRE2" 124 | LIBPCRE2="\$(LIBPCRE2)" 125 | PCRE2_BUNDLED="-DPCRE2_BUNDLED" 126 | fi 127 | 128 | if test xx$with_simd = "xxAVX2"; then 129 | echo "Using AVX2" 130 | INCLUDE_PATHS="$INCLUDE_PATHS -mavx2 -msse3 -msse2" 131 | # elif test xx$with_simd = "xxSSE3"; then 132 | # echo "Using SSE3" 133 | # INCLUDE_PATHS="$INCLUDE_PATHS -msse3 -msse2" 134 | fi 135 | 136 | echo $ADD_LIBS 137 | echo $INCLUDE_PATHS 138 | echo $LIBPCRE2 139 | 140 | AC_SUBST([ADD_LIBS], $ADD_LIBS) 141 | AC_SUBST([INCLUDE_PATHS], $INCLUDE_PATHS) 142 | AC_SUBST([LIBPCRE2], $LIBPCRE2) 143 | AC_SUBST([PCRE2_BUNDLED], $PCRE2_BUNDLED) 144 | AC_CONFIG_FILES([src/Makevars]) 145 | AC_OUTPUT 146 | -------------------------------------------------------------------------------- /inst/PCRE2_LICENSE.txt: -------------------------------------------------------------------------------- 1 | PCRE2 LICENCE 2 | ------------- 3 | 4 | PCRE2 is a library of functions to support regular expressions whose
syntax 5 | and semantics are as close as possible to those of the Perl 5 language. 6 | 7 | Releases 10.00 and above of PCRE2 are distributed under the terms of the "BSD" 8 | licence, as specified below, with one exemption for certain binary 9 | redistributions. The documentation for PCRE2, supplied in the "doc" directory, 10 | is distributed under the same terms as the software itself. The data in the 11 | testdata directory is not copyrighted and is in the public domain. 12 | 13 | The basic library functions are written in C and are freestanding. Also 14 | included in the distribution is a just-in-time compiler that can be used to 15 | optimize pattern matching. This is an optional feature that can be omitted when 16 | the library is built. 17 | 18 | 19 | THE BASIC LIBRARY FUNCTIONS 20 | --------------------------- 21 | 22 | Written by: Philip Hazel 23 | Email local part: ph10 24 | Email domain: cam.ac.uk 25 | 26 | University of Cambridge Computing Service, 27 | Cambridge, England. 28 | 29 | Copyright (c) 1997-2019 University of Cambridge 30 | All rights reserved. 31 | 32 | 33 | PCRE2 JUST-IN-TIME COMPILATION SUPPORT 34 | -------------------------------------- 35 | 36 | Written by: Zoltan Herczeg 37 | Email local part: hzmester 38 | Email domain: freemail.hu 39 | 40 | Copyright(c) 2010-2019 Zoltan Herczeg 41 | All rights reserved. 42 | 43 | 44 | STACK-LESS JUST-IN-TIME COMPILER 45 | -------------------------------- 46 | 47 | Written by: Zoltan Herczeg 48 | Email local part: hzmester 49 | Email domain: freemail.hu 50 | 51 | Copyright(c) 2009-2019 Zoltan Herczeg 52 | All rights reserved. 53 | 54 | 55 | THE "BSD" LICENCE 56 | ----------------- 57 | 58 | Redistribution and use in source and binary forms, with or without 59 | modification, are permitted provided that the following conditions are met: 60 | 61 | * Redistributions of source code must retain the above copyright notices, 62 | this list of conditions and the following disclaimer. 
63 | 64 | * Redistributions in binary form must reproduce the above copyright 65 | notices, this list of conditions and the following disclaimer in the 66 | documentation and/or other materials provided with the distribution. 67 | 68 | * Neither the name of the University of Cambridge nor the names of any 69 | contributors may be used to endorse or promote products derived from this 70 | software without specific prior written permission. 71 | 72 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 73 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 74 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 75 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 76 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 77 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 78 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 79 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 80 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 81 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 82 | POSSIBILITY OF SUCH DAMAGE. 83 | 84 | 85 | EXEMPTION FOR BINARY LIBRARY-LIKE PACKAGES 86 | ------------------------------------------ 87 | 88 | The second condition in the BSD licence (covering binary redistributions) does 89 | not apply all the way down a chain of software. If binary package A includes 90 | PCRE2, it must respect the condition, but if package B is software that 91 | includes package A, the condition is not imposed on package B unless it uses 92 | PCRE2 independently. 
93 | 94 | End 95 | -------------------------------------------------------------------------------- /inst/icelandic_words_500_utf8.txt: -------------------------------------------------------------------------------- 1 | ég 2 | að 3 | er 4 | það 5 | ekki 6 | í 7 | og 8 | þú 9 | við 10 | á 11 | hann 12 | þetta 13 | hvað 14 | sem 15 | mér 16 | til 17 | með 18 | þér 19 | en 20 | fyrir 21 | af 22 | um 23 | þig 24 | var 25 | mig 26 | því 27 | já 28 | hún 29 | nei 30 | allt 31 | þá 32 | ef 33 | eru 34 | bara 35 | ert 36 | svo 37 | þeir 38 | þið 39 | okkur 40 | eftir 41 | ertu 42 | eins 43 | vera 44 | hér 45 | gera 46 | lagi 47 | veit 48 | hefur 49 | nú 50 | frá 51 | þegar 52 | hvernig 53 | fara 54 | e 55 | honum 56 | út 57 | hef 58 | verður 59 | aftur 60 | upp 61 | ekkert 62 | vel 63 | þessu 64 | verið 65 | minn 66 | sé 67 | svona 68 | hver 69 | hana 70 | ykkur 71 | eða 72 | vil 73 | hverju 74 | komdu 75 | get 76 | segja 77 | úr 78 | hvar 79 | erum 80 | aldrei 81 | hafa 82 | eitthvað 83 | hérna 84 | gott 85 | maður 86 | viltu 87 | hjá 88 | þau 89 | fá 90 | getur 91 | sagði 92 | koma 93 | inn 94 | okkar 95 | þarna 96 | núna 97 | herra 98 | kannski 99 | mín 100 | þarf 101 | þar 102 | hans 103 | tala 104 | þeim 105 | þess 106 | þín 107 | takk 108 | farðu 109 | henni 110 | væri 111 | þinn 112 | fer 113 | líka 114 | sjá 115 | rétt 116 | áfram 117 | held 118 | sér 119 | kemur 120 | mjög 121 | gert 122 | verð 123 | þessi 124 | vegna 125 | saman 126 | sá 127 | enn 128 | þakka 129 | einn 130 | komið 131 | gæti 132 | allir 133 | alltaf 134 | of 135 | enginn 136 | kom 137 | skal 138 | vita 139 | yfir 140 | má 141 | farið 142 | ætla 143 | dag 144 | a 145 | hafi 146 | förum 147 | þær 148 | veistu 149 | hingað 150 | heldur 151 | niður 152 | hefði 153 | sig 154 | mitt 155 | mikið 156 | átt 157 | pabbi 158 | segðu 159 | höfum 160 | bíddu 161 | sama 162 | láttu 163 | hr 164 | einhver 165 | guð 166 | heim 167 | aðeins 168 | áður 169 | segir 170 | þessa 171 | getum 172 | fyrirgefðu 173 | 
mamma 174 | vertu 175 | tíma 176 | finnst 177 | góður 178 | meira 179 | vill 180 | vildi 181 | i 182 | gerðu 183 | þitt 184 | verðum 185 | eitt 186 | satt 187 | hefurðu 188 | stað 189 | verða 190 | taka 191 | fór 192 | jæja 193 | veist 194 | halda 195 | ao 196 | hvert 197 | ykkar 198 | alla 199 | mun 200 | annað 201 | voru 202 | gerir 203 | sjáðu 204 | leið 205 | sagt 206 | fram 207 | vilt 208 | séð 209 | hvaða 210 | láta 211 | menn 212 | gerði 213 | öll 214 | fólk 215 | kvöld 216 | elskan 217 | hvort 218 | myndi 219 | ó 220 | einu 221 | fjandinn 222 | vinur 223 | ár 224 | síðan 225 | geri 226 | hennar 227 | hélt 228 | hættu 229 | eina 230 | halló 231 | þ 232 | pú 233 | þessum 234 | ætti 235 | vissi 236 | auðvitað 237 | sinni 238 | þeirra 239 | eruð 240 | mína 241 | alveg 242 | þannig 243 | strax 244 | þína 245 | drepa 246 | reyna 247 | þennan 248 | vinna 249 | skil 250 | án 251 | finna 252 | neitt 253 | langar 254 | morgun 255 | viss 256 | undir 257 | geturðu 258 | engin 259 | taktu 260 | öllum 261 | áttu 262 | hvers 263 | hæ 264 | gengur 265 | hugsa 266 | heyrðu 267 | kem 268 | mínum 269 | einmitt 270 | frú 271 | haltu 272 | fimm 273 | elska 274 | heldurðu 275 | þangað 276 | lengi 277 | varð 278 | eigum 279 | virðist 280 | hvenær 281 | fékk 282 | varst 283 | segi 284 | ná 285 | vilja 286 | góða 287 | gaman 288 | þarft 289 | kominn 290 | komast 291 | lengur 292 | sæll 293 | lífi 294 | eiga 295 | mann 296 | gerðist 297 | burt 298 | líður 299 | petta 300 | átti 301 | jack 302 | inni 303 | þínum 304 | þurfum 305 | frábært 306 | þykir 307 | pao 308 | öllu 309 | leitt 310 | fyrst 311 | fyrsta 312 | geta 313 | handa 314 | betur 315 | ætlarðu 316 | hafði 317 | hægt 318 | jú 319 | héðan 320 | hjálpa 321 | fyrr 322 | illa 323 | hitta 324 | málið 325 | alvöru 326 | nóg 327 | góð 328 | einhvern 329 | sinn 330 | nótt 331 | èg 332 | víst 333 | áhyggjur 334 | nema 335 | ára 336 | komum 337 | daginn 338 | ferð 339 | mál 340 | ein 341 | gegn 342 | hlýtur 343 | sjálfur 344 | ú 345 
| fengið 346 | búinn 347 | r 348 | hafið 349 | langt 350 | annars 351 | leita 352 | aò 353 | tveir 354 | ættir 355 | heiti 356 | haldið 357 | heima 358 | erfitt 359 | orðið 360 | ad 361 | enga 362 | líf 363 | engar 364 | minni 365 | faðir 366 | deyja 367 | mátt 368 | gat 369 | hve 370 | à 371 | skiptir 372 | samt 373 | mínu 374 | máli 375 | mínútur 376 | vio 377 | hætta 378 | gerum 379 | sýna 380 | tekur 381 | spyrja 382 | meðan 383 | heyra 384 | tvö 385 | gefa 386 | bless 387 | skilurðu 388 | heitir 389 | vorum 390 | mínir 391 | ætlar 392 | klukkan 393 | kann 394 | hvern 395 | john 396 | fær 397 | maðurinn 398 | gerast 399 | gangi 400 | h 401 | allar 402 | fæ 403 | sagðir 404 | ganga 405 | vantar 406 | fínt 407 | tíu 408 | komst 409 | nokkuð 410 | stundum 411 | baka 412 | lítur 413 | aõ 414 | pér 415 | hjálp 416 | tók 417 | skjóta 418 | alls 419 | uppi 420 | sért 421 | sérðu 422 | góðan 423 | látið 424 | sex 425 | stendur 426 | gerist 427 | þinni 428 | fann 429 | færð 430 | heyrt 431 | nota 432 | trúi 433 | skilið 434 | allan 435 | líkar 436 | afsakið 437 | engan 438 | fleiri 439 | mömmu 440 | þótt 441 | líklega 442 | kona 443 | manni 444 | þessari 445 | segirðu 446 | sjáumst 447 | jafnvel 448 | hlustaðu 449 | úti 450 | árum 451 | verði 452 | tvær 453 | þekki 454 | bílinn 455 | man 456 | hefðir 457 | fannst 458 | hugmynd 459 | afsakaðu 460 | lokið 461 | milli 462 | daga 463 | g 464 | konan 465 | varstu 466 | kemst 467 | tekið 468 | sú 469 | yrði 470 | sonur 471 | byrja 472 | bíða 473 | tími 474 | búið 475 | tvo 476 | leyfðu 477 | hví 478 | nógu 479 | fjandans 480 | manstu 481 | besta 482 | félagi 483 | þó 484 | ö 485 | værir 486 | engu 487 | eg 488 | betra 489 | new 490 | ha 491 | eigin 492 | hafðu 493 | annan 494 | kalla 495 | næstum 496 | hátt 497 | vitum 498 | beint 499 | dálítið 500 | peninga 501 | -------------------------------------------------------------------------------- /inst/include/sf_external.h: 
-------------------------------------------------------------------------------- 1 | #ifndef SF_EXTERNAL_H 2 | #define SF_EXTERNAL_H 3 | 4 | #include <Rcpp.h> 5 | #include <R_ext/Rdynload.h> 6 | #include "sf_internal.h" 7 | using namespace Rcpp; 8 | // Client-side wrappers: each one looks up the C-callable that package "stringfish" registered under the given name via R_GetCCallable, caches the pointer in a function-local static, and forwards its arguments. They are declared inline so this header can be included from more than one translation unit without multiple-definition link errors. 9 | inline std::string get_string_type(SEXP x) {static std::string(*fun)(SEXP) = (std::string(*)(SEXP)) R_GetCCallable("stringfish", "get_string_type");return fun(x);} 10 | // Vector creation, access and conversion 11 | inline SEXP materialize(SEXP x) {static SEXP(*fun)(SEXP) = (SEXP(*)(SEXP)) R_GetCCallable("stringfish", "materialize");return fun(x);} 12 | inline SEXP sf_vector(size_t len) {static SEXP(*fun)(size_t) = (SEXP(*)(size_t)) R_GetCCallable("stringfish", "sf_vector");return fun(len);} 13 | inline sf_vec_data & sf_vec_data_ref(SEXP x) {static sf_vec_data &(*fun)(SEXP) = (sf_vec_data &(*)(SEXP)) R_GetCCallable("stringfish", "sf_vec_data_ref");return fun(x);} 14 | inline void sf_assign(SEXP x, size_t i, SEXP e) {static void(*fun)(SEXP, size_t, SEXP) = (void(*)(SEXP, size_t, SEXP)) R_GetCCallable("stringfish", "sf_assign");return fun(x, i, e);} 15 | inline SEXP sf_iconv(SEXP x, std::string from, std::string to) {static SEXP(*fun)(SEXP, std::string, std::string) = (SEXP(*)(SEXP, std::string, std::string)) R_GetCCallable("stringfish", "sf_iconv");return fun(x, from, to);} 16 | inline SEXP convert_to_sf(SEXP x) {static SEXP(*fun)(SEXP) = (SEXP(*)(SEXP)) R_GetCCallable("stringfish", "convert_to_sf");return fun(x);} 17 | // File input / output 18 | inline SEXP sf_readLines(std::string filename, std::string encoding = "UTF-8") {static SEXP(*fun)(std::string, std::string) = (SEXP(*)(std::string, std::string)) R_GetCCallable("stringfish", "sf_readLines");return fun(filename, encoding);} 19 | inline void sf_writeLines(SEXP text, const std::string file, const std::string sep = "\n", const std::string na_value = "NA", const std::string encode_mode = "UTF-8") 20 | {static void(*fun)(SEXP, const std::string, const std::string, const std::string, const std::string encode_mode) = (void(*)(SEXP, const std::string, const std::string, const std::string, const std::string
encode_mode)) R_GetCCallable("stringfish", "sf_writeLines");return fun(text, file, sep, na_value, encode_mode);} 21 | // Character counting, substrings and pasting 22 | inline IntegerVector sf_nchar(SEXP obj, std::string type = "chars") {static IntegerVector(*fun)(SEXP, std::string) = (IntegerVector(*)(SEXP, std::string)) R_GetCCallable("stringfish", "sf_nchar");return fun(obj, type);} 23 | inline sfstring sf_substr_internal(const char * x, const int len, const cetype_t type, int start, int stop) {static sfstring(*fun)(const char *, const int, const cetype_t, int, int) = (sfstring(*)(const char *, const int, const cetype_t, int, int)) R_GetCCallable("stringfish", "sf_substr_internal");return fun(x, len, type, start, stop);} 24 | inline SEXP sf_substr(SEXP x, IntegerVector start, IntegerVector stop) {static SEXP(*fun)(SEXP, IntegerVector, IntegerVector) = (SEXP(*)(SEXP, IntegerVector, IntegerVector)) R_GetCCallable("stringfish", "sf_substr");return fun(x, start, stop);} 25 | inline SEXP c_sf_paste(List dots, SEXP sep) {static SEXP(*fun)(List, SEXP) = (SEXP(*)(List, SEXP)) R_GetCCallable("stringfish", "c_sf_paste");return fun(dots, sep);} 26 | inline SEXP sf_collapse(SEXP x, SEXP collapse) {static SEXP(*fun)(SEXP, SEXP) = (SEXP(*)(SEXP, SEXP)) R_GetCCallable("stringfish", "sf_collapse");return fun(x, collapse);} 27 | // Pattern matching and miscellaneous helpers; note that random_strings resolves the callable registered as "sf_random_strings" 28 | inline LogicalVector sf_grepl(SEXP subject, SEXP pattern, const std::string encode_mode = "auto", const bool fixed = false) {static LogicalVector(*fun)(SEXP, SEXP, const std::string, const bool) = (LogicalVector(*)(SEXP, SEXP, const std::string, const bool)) R_GetCCallable("stringfish", "sf_grepl");return fun(subject, pattern, encode_mode, fixed);} 29 | inline SEXP sf_split(SEXP subject, SEXP split, const std::string encode_mode = "auto", const bool fixed = false) {static SEXP(*fun)(SEXP, SEXP, const std::string, const bool) = (SEXP(*)(SEXP, SEXP, const std::string, const bool)) R_GetCCallable("stringfish", "sf_split");return fun(subject, split, encode_mode, fixed);} 30 | inline SEXP sf_gsub(SEXP subject, SEXP pattern, SEXP replacement, const
std::string encode_mode = "auto", const bool fixed = false) {static SEXP(*fun)(SEXP, SEXP, SEXP, const std::string, const bool) = (SEXP(*)(SEXP, SEXP, SEXP, const std::string, const bool)) R_GetCCallable("stringfish", "sf_gsub");return fun(subject, pattern, replacement, encode_mode, fixed);} 31 | inline SEXP random_strings(const int N, const int string_size = 50, std::string charset = "abcdefghijklmnopqrstuvwxyz", std::string vector_mode = "stringfish") {static SEXP(*fun)(const int, const int, std::string, std::string) = (SEXP(*)(const int, const int, std::string, std::string)) R_GetCCallable("stringfish", "sf_random_strings");return fun(N, string_size, charset, vector_mode);} 32 | inline SEXP sf_toupper(SEXP x) {static SEXP(*fun)(SEXP) = (SEXP(*)(SEXP)) R_GetCCallable("stringfish", "sf_toupper");return fun(x);} 33 | inline SEXP sf_tolower(SEXP x) {static SEXP(*fun)(SEXP) = (SEXP(*)(SEXP)) R_GetCCallable("stringfish", "sf_tolower");return fun(x);} 34 | inline IntegerVector sf_match(SEXP x, SEXP table) {static IntegerVector(*fun)(SEXP, SEXP) = (IntegerVector(*)(SEXP, SEXP)) R_GetCCallable("stringfish", "sf_match");return fun(x, table);} 35 | 36 | #endif // include guard -------------------------------------------------------------------------------- /man/convert_to_sf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{convert_to_sf} 4 | \alias{convert_to_sf} 5 | \alias{sf_convert} 6 | \title{convert_to_sf} 7 | \usage{ 8 | convert_to_sf(x) 9 | 10 | sf_convert(x) 11 | } 12 | \arguments{ 13 | \item{x}{A character vector} 14 | } 15 | \value{ 16 | The converted character vector 17 | } 18 | \description{ 19 | Converts a character vector to a stringfish vector 20 | } 21 | \details{ 22 | Converts a character vector to a stringfish vector. The opposite of `materialize`.
23 | } 24 | \examples{ 25 | if(getRversion() >= "3.5.0") { 26 | x <- convert_to_sf(letters) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /man/get_string_type.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{get_string_type} 4 | \alias{get_string_type} 5 | \title{get_string_type} 6 | \usage{ 7 | get_string_type(x) 8 | } 9 | \arguments{ 10 | \item{x}{the vector} 11 | } 12 | \value{ 13 | The type of vector 14 | } 15 | \description{ 16 | Returns the type of the character vector 17 | } 18 | \details{ 19 | A function that returns the type of character vector. Possible values are "normal vector", "stringfish vector", "stringfish vector (materialized)" or "other alt-rep vector" 20 | } 21 | \examples{ 22 | if(getRversion() >= "3.5.0") { 23 | x <- sf_vector(10) 24 | get_string_type(x) # returns "stringfish vector" 25 | x <- character(10) 26 | get_string_type(x) # returns "normal vector" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /man/materialize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{materialize} 4 | \alias{materialize} 5 | \title{materialize} 6 | \usage{ 7 | materialize(x) 8 | } 9 | \arguments{ 10 | \item{x}{An alt-rep object} 11 | } 12 | \value{ 13 | x 14 | } 15 | \description{ 16 | Materializes an alt-rep object 17 | } 18 | \details{ 19 | Materializes any alt-rep object and then returns it. 20 | Note: the object is materialized regardless of whether the return value is assigned to a variable. 
21 | } 22 | \examples{ 23 | if(getRversion() >= "3.5.0") { 24 | x <- sf_vector(10) 25 | sf_assign(x, 1, "hello world") 26 | sf_assign(x, 2, "another string") 27 | x <- materialize(x) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /man/random_strings.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{random_strings} 4 | \alias{random_strings} 5 | \title{random_strings} 6 | \usage{ 7 | random_strings(N, string_size = 50, charset = "abcdefghijklmnopqrstuvwxyz", 8 | vector_mode = "stringfish") 9 | } 10 | \arguments{ 11 | \item{N}{The number of strings to generate} 12 | 13 | \item{string_size}{The length of the strings} 14 | 15 | \item{charset}{The characters used to generate the random strings (default: abcdefghijklmnopqrstuvwxyz)} 16 | 17 | \item{vector_mode}{The type of character vector to generate (either stringfish or normal, default: stringfish)} 18 | } 19 | \value{ 20 | A character vector of the random strings 21 | } 22 | \description{ 23 | A function that generates random strings 24 | } 25 | \details{ 26 | The function uses the PCRE2 library, which is also used internally by R. 27 | Note: the order of parameters is switched compared to the `gsub` base R function, with subject being first. 28 | See also: https://www.pcre.org/current/doc/html/pcre2api.html for more documentation on match syntax.
29 | } 30 | \examples{ 31 | if(getRversion() >= "3.5.0") { 32 | set.seed(1) 33 | x <- random_strings(1e6, 80, "ACGT", vector_mode = "stringfish") 34 | } 35 | } 36 | \seealso{ 37 | gsub 38 | } 39 | -------------------------------------------------------------------------------- /man/sf_assign.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_assign} 4 | \alias{sf_assign} 5 | \title{sf_assign} 6 | \usage{ 7 | sf_assign(x, i, e) 8 | } 9 | \arguments{ 10 | \item{x}{the vector} 11 | 12 | \item{i}{the index to assign to} 13 | 14 | \item{e}{the new string to replace at i in x} 15 | } 16 | \value{ 17 | No return value, the function assigns an element to an existing stringfish vector 18 | } 19 | \description{ 20 | Assigns a new string to a stringfish vector or any other character vector 21 | } 22 | \details{ 23 | A function to assign a new element to an existing character vector. If the vector is a stringfish vector, it does so without materialization. 24 | } 25 | \examples{ 26 | if(getRversion() >= "3.5.0") { 27 | x <- sf_vector(10) 28 | sf_assign(x, 1, "hello world") 29 | sf_assign(x, 2, "another string") 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /man/sf_collapse.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_collapse} 4 | \alias{sf_collapse} 5 | \title{sf_collapse} 6 | \usage{ 7 | sf_collapse(x, collapse) 8 | } 9 | \arguments{ 10 | \item{x}{A character vector} 11 | 12 | \item{collapse}{A single string} 13 | } 14 | \value{ 15 | A single string with all values in `x` pasted together, separated by `collapse`.
16 | } 17 | \description{ 18 | Pastes a series of strings together separated by the `collapse` parameter 19 | } 20 | \details{ 21 | This works the same way as `paste0(x, collapse=collapse)` 22 | } 23 | \examples{ 24 | if(getRversion() >= "3.5.0") { 25 | x <- c("hello", "\\\\xe4\\\\xb8\\\\x96\\\\xe7\\\\x95\\\\x8c") 26 | Encoding(x) <- "UTF-8" 27 | sf_collapse(x, " ") # "hello world" in Japanese 28 | sf_collapse(letters, "") # returns the alphabet 29 | } 30 | } 31 | \seealso{ 32 | paste0, paste 33 | } 34 | -------------------------------------------------------------------------------- /man/sf_compare.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_compare} 4 | \alias{sf_compare} 5 | \alias{sf_equals} 6 | \title{sf_compare} 7 | \usage{ 8 | sf_compare(x, y, nthreads = getOption("stringfish.nthreads", 1L)) 9 | 10 | sf_equals(x, y, nthreads = getOption("stringfish.nthreads", 1L)) 11 | } 12 | \arguments{ 13 | \item{x}{A character vector of length 1 or the same non-zero length as y} 14 | 15 | \item{y}{Another character vector of length 1 or the same non-zero length as x} 16 | 17 | \item{nthreads}{Number of threads to use} 18 | } 19 | \value{ 20 | A logical vector 21 | } 22 | \description{ 23 | Returns a logical vector testing equality of strings from two string vectors 24 | } 25 | \details{ 26 | Note: the function tests for both string and encoding equality 27 | } 28 | \examples{ 29 | if(getRversion() >= "3.5.0") { 30 | sf_compare(letters, "a") 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /man/sf_concat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_concat} 4 | \alias{sf_concat} 5 | \alias{sfc} 6 | \title{sf_concat} 7
| \usage{ 8 | sf_concat(...) 9 | 10 | sfc(...) 11 | } 12 | \arguments{ 13 | \item{...}{Any number of vectors, coerced to character vector if necessary} 14 | } 15 | \value{ 16 | A concatenated stringfish vector 17 | } 18 | \description{ 19 | Appends vectors together 20 | } 21 | \examples{ 22 | if(getRversion() >= "3.5.0") { 23 | sf_concat(letters, 1:5) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /man/sf_ends.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_ends} 4 | \alias{sf_ends} 5 | \title{sf_ends} 6 | \usage{ 7 | sf_ends(subject, pattern, ...) 8 | } 9 | \arguments{ 10 | \item{subject}{A character vector} 11 | 12 | \item{pattern}{A string to look for at the end} 13 | 14 | \item{...}{Parameters passed to sf_grepl} 15 | } 16 | \value{ 17 | A logical vector true if there is a match, false if no match, NA if the subject was NA 18 | } 19 | \description{ 20 | A function for detecting a pattern at the end of a string 21 | } 22 | \examples{ 23 | if(getRversion() >= "3.5.0") { 24 | x <- c("alpha", "beta", "gamma", "delta", "epsilon") 25 | sf_ends(x, "a") 26 | } 27 | } 28 | \seealso{ 29 | endsWith, sf_starts 30 | } 31 | -------------------------------------------------------------------------------- /man/sf_grepl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_grepl} 4 | \alias{sf_grepl} 5 | \title{sf_grepl} 6 | \usage{ 7 | sf_grepl(subject, pattern, encode_mode = "auto", fixed = FALSE, 8 | nthreads = getOption("stringfish.nthreads", 1L)) 9 | } 10 | \arguments{ 11 | \item{subject}{The subject character vector to search} 12 | 13 | \item{pattern}{The pattern to search for} 14 | 15 | \item{encode_mode}{"auto", "UTF-8" or
"byte". Determines whether multi-byte (UTF-8) characters or single-byte characters are used.} 16 | 17 | \item{fixed}{determines whether the pattern parameter should be interpreted literally or as a regular expression} 18 | 19 | \item{nthreads}{Number of threads to use} 20 | } 21 | \value{ 22 | A logical vector with the same length as subject 23 | } 24 | \description{ 25 | A function that matches patterns and returns a logical vector 26 | } 27 | \details{ 28 | The function uses the PCRE2 library, which is also used internally by R. 29 | The encoding is based on the pattern string (or forced via the encode_mode parameter). 30 | Note: the order of parameters is switched compared to the `grepl` base R function, with subject being first. 31 | See also: https://www.pcre.org/current/doc/html/pcre2api.html for more documentation on match syntax. 32 | } 33 | \examples{ 34 | if(getRversion() >= "3.5.0") { 35 | x <- sf_vector(10) 36 | sf_assign(x, 1, "hello world") 37 | pattern <- "^hello" 38 | sf_grepl(x, pattern) 39 | } 40 | } 41 | \seealso{ 42 | grepl 43 | } 44 | -------------------------------------------------------------------------------- /man/sf_gsub.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_gsub} 4 | \alias{sf_gsub} 5 | \title{sf_gsub} 6 | \usage{ 7 | sf_gsub(subject, pattern, replacement, encode_mode = "auto", fixed = FALSE, 8 | nthreads = getOption("stringfish.nthreads", 1L)) 9 | } 10 | \arguments{ 11 | \item{subject}{The subject character vector to search} 12 | 13 | \item{pattern}{The pattern to search for} 14 | 15 | \item{replacement}{The replacement string} 16 | 17 | \item{encode_mode}{"auto", "UTF-8" or "byte".
Determines whether multi-byte (UTF-8) characters or single-byte characters are used.} 18 | 19 | \item{fixed}{determines whether the pattern parameter should be interpreted literally or as a regular expression} 20 | 21 | \item{nthreads}{Number of threads to use} 22 | } 23 | \value{ 24 | A stringfish vector of the replacement string 25 | } 26 | \description{ 27 | A function that performs pattern substitution 28 | } 29 | \details{ 30 | The function uses the PCRE2 library, which is also used internally by R. However, syntax may be slightly different. 31 | E.g.: capture groups: "\1" in R, but "$1" in PCRE2 (as in Perl). 32 | The encoding of the output is determined by the pattern (or forced using encode_mode parameter) and encodings should be compatible. 33 | E.g: mixing ASCII and UTF-8 is okay, but not UTF-8 and latin1. 34 | Note: the order of parameters is switched compared to the `gsub` base R function, with subject being first. 35 | See also: https://www.pcre.org/current/doc/html/pcre2api.html for more documentation on match syntax.
36 | } 37 | \examples{ 38 | if(getRversion() >= "3.5.0") { 39 | x <- "hello world" 40 | pattern <- "^hello (.+)" 41 | replacement <- "goodbye $1" 42 | sf_gsub(x, pattern, replacement) 43 | } 44 | } 45 | \seealso{ 46 | gsub 47 | } 48 | -------------------------------------------------------------------------------- /man/sf_iconv.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_iconv} 4 | \alias{sf_iconv} 5 | \title{sf_iconv} 6 | \usage{ 7 | sf_iconv(x, from, to, nthreads = getOption("stringfish.nthreads", 1L)) 8 | } 9 | \arguments{ 10 | \item{x}{An alt-rep object} 11 | 12 | \item{from}{the encoding to assume of `x`} 13 | 14 | \item{nthreads}{Number of threads to use} 15 | 16 | \item{to}{the new encoding} 17 | } 18 | \value{ 19 | the converted character vector as a stringfish vector 20 | } 21 | \description{ 22 | Converts encoding of one character vector to another 23 | } 24 | \details{ 25 | This is an analogue to the base R function `iconv`. It converts a string from one encoding (e.g. 
latin1 or UTF-8) to another 26 | } 27 | \examples{ 28 | if(getRversion() >= "3.5.0") { 29 | x <- "fa\xE7ile" 30 | Encoding(x) <- "latin1" 31 | sf_iconv(x, "latin1", "UTF-8") 32 | } 33 | } 34 | \seealso{ 35 | iconv 36 | } 37 | -------------------------------------------------------------------------------- /man/sf_match.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_match} 4 | \alias{sf_match} 5 | \title{sf_match} 6 | \usage{ 7 | sf_match(x, table, nthreads = getOption("stringfish.nthreads", 1L)) 8 | } 9 | \arguments{ 10 | \item{x}{A character vector to search for in table} 11 | 12 | \item{table}{A character vector to be matched against x} 13 | 14 | \item{nthreads}{Number of threads to use} 15 | } 16 | \value{ 17 | An integer vector of the indices of each x element's position in table 18 | } 19 | \description{ 20 | Returns a vector of the positions of x in table 21 | } 22 | \details{ 23 | Note: similarly to the base R function, long "table" vectors are not supported.
This is due to the maximum integer value that can be returned (`.Machine$integer.max`) 24 | } 25 | \examples{ 26 | if(getRversion() >= "3.5.0") { 27 | sf_match("c", letters) 28 | } 29 | } 30 | \seealso{ 31 | match 32 | } 33 | -------------------------------------------------------------------------------- /man/sf_nchar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_nchar} 4 | \alias{sf_nchar} 5 | \title{sf_nchar} 6 | \usage{ 7 | sf_nchar(x, type = "chars", nthreads = getOption("stringfish.nthreads", 1L)) 8 | } 9 | \arguments{ 10 | \item{x}{A character vector} 11 | 12 | \item{type}{The type of counting to perform ("chars" or "bytes", default: "chars")} 13 | 14 | \item{nthreads}{Number of threads to use} 15 | } 16 | \value{ 17 | An integer vector of the number of characters 18 | } 19 | \description{ 20 | Counts the number of characters in a character vector 21 | } 22 | \details{ 23 | Returns the number of characters per string. The type of counting only matters for UTF-8 strings, where a character can be represented by multiple bytes. 
24 | } 25 | \examples{ 26 | if(getRversion() >= "3.5.0") { 27 | x <- "fa\xE7ile" 28 | Encoding(x) <- "latin1" 29 | x <- sf_iconv(x, "latin1", "UTF-8") 30 | } 31 | } 32 | \seealso{ 33 | nchar 34 | } 35 | -------------------------------------------------------------------------------- /man/sf_paste.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_paste} 4 | \alias{sf_paste} 5 | \title{sf_paste} 6 | \usage{ 7 | sf_paste(..., sep = "", nthreads = getOption("stringfish.nthreads", 1L)) 8 | } 9 | \arguments{ 10 | \item{...}{Any number of character vector strings} 11 | 12 | \item{sep}{The separating string between strings} 13 | 14 | \item{nthreads}{Number of threads to use} 15 | } 16 | \value{ 17 | A character vector where elements of the arguments are pasted together 18 | } 19 | \description{ 20 | Pastes a series of strings together 21 | } 22 | \details{ 23 | This works the same way as `paste(..., sep=sep)` 24 | } 25 | \examples{ 26 | if(getRversion() >= "3.5.0") { 27 | x <- letters 28 | y <- LETTERS 29 | sf_paste(x,y, sep = ":") 30 | } 31 | } 32 | \seealso{ 33 | paste0, paste 34 | } 35 | -------------------------------------------------------------------------------- /man/sf_readLines.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_readLines} 4 | \alias{sf_readLines} 5 | \title{sf_readLines} 6 | \usage{ 7 | sf_readLines(file, encoding = "UTF-8") 8 | } 9 | \arguments{ 10 | \item{file}{The file name} 11 | 12 | \item{encoding}{The encoding to use (Default: UTF-8)} 13 | } 14 | \value{ 15 | A stringfish vector of the lines in a file 16 | } 17 | \description{ 18 | A function that reads a file line by line 19 | } 20 | \details{ 21 | A function for reading in text data using 
`std::ifstream`. 22 | } 23 | \examples{ 24 | if(getRversion() >= "3.5.0") { 25 | file <- tempfile() 26 | sf_writeLines(letters, file) 27 | sf_readLines(file) 28 | } 29 | } 30 | \seealso{ 31 | readLines 32 | } 33 | -------------------------------------------------------------------------------- /man/sf_split.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_split} 4 | \alias{sf_split} 5 | \title{sf_split} 6 | \usage{ 7 | sf_split(subject, split, encode_mode = "auto", fixed = FALSE, 8 | nthreads = getOption("stringfish.nthreads", 1L)) 9 | } 10 | \arguments{ 11 | \item{subject}{A character vector} 12 | 13 | \item{split}{A delimiter to split the string by} 14 | 15 | \item{encode_mode}{"auto", "UTF-8" or "byte". Determines whether multi-byte (UTF-8) characters or single-byte characters are used.} 16 | 17 | \item{fixed}{determines whether the split parameter should be interpreted literally or as a regular expression} 18 | 19 | \item{nthreads}{Number of threads to use} 20 | } 21 | \value{ 22 | A list of stringfish character vectors 23 | } 24 | \description{ 25 | A function to split strings by a delimiter 26 | } 27 | \examples{ 28 | if(getRversion() >= "3.5.0") { 29 | sf_split(datasets::state.name, "\\\\s") # split U.S. state names by any space character 30 | } 31 | } 32 | \seealso{ 33 | strsplit 34 | } 35 | -------------------------------------------------------------------------------- /man/sf_starts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_starts} 4 | \alias{sf_starts} 5 | \title{sf_starts} 6 | \usage{ 7 | sf_starts(subject, pattern, ...) 
8 | } 9 | \arguments{ 10 | \item{subject}{A character vector} 11 | 12 | \item{pattern}{A string to look for at the start} 13 | 14 | \item{...}{Parameters passed to sf_grepl} 15 | } 16 | \value{ 17 | A logical vector true if there is a match, false if no match, NA if the subject was NA 18 | } 19 | \description{ 20 | A function for detecting a pattern at the start of a string 21 | } 22 | \examples{ 23 | if(getRversion() >= "3.5.0") { 24 | x <- c("alpha", "beta", "gamma", "delta", "epsilon") 25 | sf_starts(x, "a") 26 | } 27 | } 28 | \seealso{ 29 | startsWith, sf_ends 30 | } 31 | -------------------------------------------------------------------------------- /man/sf_substr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_substr} 4 | \alias{sf_substr} 5 | \title{sf_substr} 6 | \usage{ 7 | sf_substr(x, start, stop, nthreads = getOption("stringfish.nthreads", 1L)) 8 | } 9 | \arguments{ 10 | \item{x}{A character vector} 11 | 12 | \item{start}{The beginning to extract from} 13 | 14 | \item{stop}{The end to extract from} 15 | 16 | \item{nthreads}{Number of threads to use} 17 | } 18 | \value{ 19 | A stringfish vector of substrings 20 | } 21 | \description{ 22 | Extracts substrings from a character vector 23 | } 24 | \details{ 25 | This works the same way as `substr`, but in addition allows negative indexing. 26 | Negative indices count backwards from the end of the string, with -1 being the last character. 
27 | } 28 | \examples{ 29 | if(getRversion() >= "3.5.0") { 30 | x <- c("fa\xE7ile", "hello world") 31 | Encoding(x) <- "latin1" 32 | x <- sf_iconv(x, "latin1", "UTF-8") 33 | sf_substr(x, 4, -1) # extracts from the 4th character to the last 34 | ## [1] "ile" "lo world" 35 | } 36 | } 37 | \seealso{ 38 | substr 39 | } 40 | -------------------------------------------------------------------------------- /man/sf_tolower.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_tolower} 4 | \alias{sf_tolower} 5 | \title{sf_tolower} 6 | \usage{ 7 | sf_tolower(x) 8 | } 9 | \arguments{ 10 | \item{x}{A character vector} 11 | } 12 | \value{ 13 | A stringfish vector where all uppercase is converted to lowercase 14 | } 15 | \description{ 16 | A function converting a string to all lowercase 17 | } 18 | \details{ 19 | Note: the function only converts ASCII characters. 20 | } 21 | \examples{ 22 | if(getRversion() >= "3.5.0") { 23 | x <- LETTERS 24 | sf_tolower(x) 25 | } 26 | } 27 | \seealso{ 28 | tolower 29 | } 30 | -------------------------------------------------------------------------------- /man/sf_toupper.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_toupper} 4 | \alias{sf_toupper} 5 | \title{sf_toupper} 6 | \usage{ 7 | sf_toupper(x) 8 | } 9 | \arguments{ 10 | \item{x}{A character vector} 11 | } 12 | \value{ 13 | A stringfish vector where all lowercase is converted to uppercase 14 | } 15 | \description{ 16 | A function converting a string to all uppercase 17 | } 18 | \details{ 19 | Note: the function only converts ASCII characters. 
20 | } 21 | \examples{ 22 | if(getRversion() >= "3.5.0") { 23 | x <- letters 24 | sf_toupper(x) 25 | } 26 | } 27 | \seealso{ 28 | toupper 29 | } 30 | -------------------------------------------------------------------------------- /man/sf_trim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_trim} 4 | \alias{sf_trim} 5 | \title{sf_trim} 6 | \usage{ 7 | sf_trim(subject, which = c("both", "left", "right"), whitespace = "[ \\\\t\\\\r\\\\n]", ...) 8 | } 9 | \arguments{ 10 | \item{subject}{A character vector} 11 | 12 | \item{which}{"both", "left", or "right" determines which white space is removed} 13 | 14 | \item{whitespace}{Whitespace characters (default: "[ \\\\t\\\\r\\\\n]")} 15 | 16 | \item{...}{Parameters passed to sf_gsub} 17 | } 18 | \value{ 19 | A stringfish vector of trimmed whitespace 20 | } 21 | \description{ 22 | A function to remove leading/trailing whitespace 23 | } 24 | \examples{ 25 | if(getRversion() >= "3.5.0") { 26 | x <- c(" alpha ", " beta", " gamma ", "delta ", "epsilon ") 27 | sf_trim(x) 28 | } 29 | } 30 | \seealso{ 31 | trimws 32 | } 33 | -------------------------------------------------------------------------------- /man/sf_vector.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_vector} 4 | \alias{sf_vector} 5 | \title{sf_vector} 6 | \usage{ 7 | sf_vector(len) 8 | } 9 | \arguments{ 10 | \item{len}{length of the new vector} 11 | } 12 | \value{ 13 | A new (empty) stringfish vector 14 | } 15 | \description{ 16 | Creates a new stringfish vector 17 | } 18 | \details{ 19 | This function creates a new stringfish vector, an alt-rep character vector backed by a C++ "std::vector" as the internal memory representation. 
20 | The vector type is "sfstring", which is a simple C++ class containing a "std::string" and a single byte (uint8_t) representing the encoding. 21 | } 22 | \examples{ 23 | if(getRversion() >= "3.5.0") { 24 | x <- sf_vector(10) 25 | sf_assign(x, 1, "hello world") 26 | sf_assign(x, 2, "another string") 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /man/sf_writeLines.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{sf_writeLines} 4 | \alias{sf_writeLines} 5 | \title{sf_writeLines} 6 | \usage{ 7 | sf_writeLines(text, file, sep = "\n", na_value = "NA", encode_mode = "UTF-8") 8 | } 9 | \arguments{ 10 | \item{text}{A character vector to write to file} 11 | 12 | \item{file}{Name of the file to write to} 13 | 14 | \item{sep}{The line separator character(s)} 15 | 16 | \item{na_value}{What to write in case of an NA string} 17 | 18 | \item{encode_mode}{"UTF-8" or "byte". If "UTF-8", all strings are re-encoded as UTF-8.} 19 | } 20 | \description{ 21 | A function that writes a file line by line 22 | } 23 | \details{ 24 | A function for writing text data using `std::ofstream`. 
25 | } 26 | \examples{ 27 | if(getRversion() >= "3.5.0") { 28 | file <- tempfile() 29 | sf_writeLines(letters, file) 30 | sf_readLines(file) 31 | } 32 | } 33 | \seealso{ 34 | writeLines 35 | } 36 | -------------------------------------------------------------------------------- /man/string_identical.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zz_help_files.R 3 | \name{string_identical} 4 | \alias{string_identical} 5 | \title{string_identical} 6 | \usage{ 7 | string_identical(x, y) 8 | } 9 | \arguments{ 10 | \item{x}{A character vector} 11 | 12 | \item{y}{Another character to compare to x} 13 | } 14 | \value{ 15 | TRUE if strings are identical, including encoding 16 | } 17 | \description{ 18 | A stricter comparison of string equality 19 | } 20 | \examples{ 21 | x <- "fa\xE7ile" 22 | Encoding(x) <- "latin1" 23 | y <- iconv(x, "latin1", "UTF-8") 24 | identical(x, y) # TRUE 25 | string_identical(x, y) # FALSE 26 | } 27 | \seealso{ 28 | identical 29 | } 30 | -------------------------------------------------------------------------------- /src/Makevars.in: -------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS=-DRCPP_USE_UNWIND_PROTECT -DRCPP_NO_RTTI -DPCRE2_CODE_UNIT_WIDTH=8 -DHAVE_CONFIG_H @PCRE2_BUNDLED@ -I. @INCLUDE_PATHS@ 2 | PKG_CXXFLAGS = $(shell ${R_HOME}/bin/Rscript -e "RcppParallel::CxxFlags()") 3 | PKG_LIBS=-lpthread -L. 
-lSFPCRE2 @ADD_LIBS@ $(shell ${R_HOME}/bin/Rscript -e "RcppParallel::RcppParallelLibs()") 4 | 5 | LIBPCRE2 = PCRE2/pcre2_chartables.o \ 6 | PCRE2/pcre2_auto_possess.o \ 7 | PCRE2/pcre2_compile.o \ 8 | PCRE2/pcre2_config.o \ 9 | PCRE2/pcre2_context.o \ 10 | PCRE2/pcre2_convert.o \ 11 | PCRE2/pcre2_dfa_match.o \ 12 | PCRE2/pcre2_error.o \ 13 | PCRE2/pcre2_extuni.o \ 14 | PCRE2/pcre2_find_bracket.o \ 15 | PCRE2/pcre2_jit_compile.o \ 16 | PCRE2/pcre2_maketables.o \ 17 | PCRE2/pcre2_match.o \ 18 | PCRE2/pcre2_match_data.o \ 19 | PCRE2/pcre2_newline.o \ 20 | PCRE2/pcre2_ord2utf.o \ 21 | PCRE2/pcre2_pattern_info.o \ 22 | PCRE2/pcre2_script_run.o \ 23 | PCRE2/pcre2_serialize.o \ 24 | PCRE2/pcre2_string_utils.o \ 25 | PCRE2/pcre2_study.o \ 26 | PCRE2/pcre2_substitute.o \ 27 | PCRE2/pcre2_substring.o \ 28 | PCRE2/pcre2_tables.o \ 29 | PCRE2/pcre2_ucd.o \ 30 | PCRE2/pcre2_valid_utf.o \ 31 | PCRE2/pcre2_xclass.o \ 32 | PCRE2/pcre2_is_bundled.o 33 | 34 | 35 | PCRE2_wrapper = PCRE2_wrapper/pcre2_wrapper.o 36 | 37 | $(SHLIB): libSFPCRE2.a 38 | 39 | libSFPCRE2.a: @LIBPCRE2@ $(PCRE2_wrapper) 40 | $(AR) rcs libSFPCRE2.a @LIBPCRE2@ $(PCRE2_wrapper) 41 | 42 | clean: 43 | rm -f $(SHLIB) $(OBJECTS) @LIBPCRE2@ $(PCRE2_wrapper) libSFPCRE2.a 44 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS = -DRCPP_USE_UNWIND_PROTECT -DRCPP_NO_RTTI -DPCRE2_CODE_UNIT_WIDTH=8 -DHAVE_CONFIG_H -DPCRE2_BUNDLED -I. -IPCRE2 2 | PKG_CXXFLAGS = -DRCPP_PARALLEL_USE_TBB=1 $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "RcppParallel::CxxFlags()") 3 | PKG_LIBS = -lpthread -L. 
-lSFPCRE2 $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "RcppParallel::RcppParallelLibs()") 4 | 5 | LIBPCRE2 = PCRE2/pcre2_chartables.o \ 6 | PCRE2/pcre2_auto_possess.o \ 7 | PCRE2/pcre2_compile.o \ 8 | PCRE2/pcre2_config.o \ 9 | PCRE2/pcre2_context.o \ 10 | PCRE2/pcre2_convert.o \ 11 | PCRE2/pcre2_dfa_match.o \ 12 | PCRE2/pcre2_error.o \ 13 | PCRE2/pcre2_extuni.o \ 14 | PCRE2/pcre2_find_bracket.o \ 15 | PCRE2/pcre2_jit_compile.o \ 16 | PCRE2/pcre2_maketables.o \ 17 | PCRE2/pcre2_match.o \ 18 | PCRE2/pcre2_match_data.o \ 19 | PCRE2/pcre2_newline.o \ 20 | PCRE2/pcre2_ord2utf.o \ 21 | PCRE2/pcre2_pattern_info.o \ 22 | PCRE2/pcre2_script_run.o \ 23 | PCRE2/pcre2_serialize.o \ 24 | PCRE2/pcre2_string_utils.o \ 25 | PCRE2/pcre2_study.o \ 26 | PCRE2/pcre2_substitute.o \ 27 | PCRE2/pcre2_substring.o \ 28 | PCRE2/pcre2_tables.o \ 29 | PCRE2/pcre2_ucd.o \ 30 | PCRE2/pcre2_valid_utf.o \ 31 | PCRE2/pcre2_xclass.o \ 32 | PCRE2/pcre2_is_bundled.o 33 | 34 | PCRE2_wrapper = PCRE2_wrapper/pcre2_wrapper.o 35 | 36 | $(SHLIB): libSFPCRE2.a 37 | 38 | libSFPCRE2.a: $(LIBPCRE2) $(PCRE2_wrapper) 39 | $(AR) rcs libSFPCRE2.a $(LIBPCRE2) $(PCRE2_wrapper) 40 | 41 | clean: 42 | rm -f $(SHLIB) $(OBJECTS) $(LIBPCRE2) $(PCRE2_wrapper) libSFPCRE2.a 43 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2_chartables.c: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* This file was automatically written by the pcre2_dftables auxiliary 6 | program. It contains character tables that are used when no external 7 | tables are passed to PCRE2 by the application that calls it. The tables 8 | are used only for characters whose code values are less than 256. */ 9 | 10 | /* This set of tables was written in the C locale. 
*/ 11 | 12 | /* The pcre2_ftables program (which is distributed with PCRE2) can be used 13 | to build alternative versions of this file. This is necessary if you are 14 | running in an EBCDIC environment, or if you want to default to a different 15 | encoding, for example ISO-8859-1. When pcre2_dftables is run, it creates 16 | these tables in the "C" locale by default. This happens automatically if 17 | PCRE2 is configured with --enable-rebuild-chartables. However, you can run 18 | pcre2_dftables manually with the -L option to build tables using the LC_ALL 19 | locale. */ 20 | 21 | /* The following #include is present because without it gcc 4.x may remove 22 | the array definition from the final binary if PCRE2 is built into a static 23 | library and dead code stripping is activated. This leads to link errors. 24 | Pulling in the header ensures that the array gets flagged as "someone 25 | outside this compilation unit might reference this" and so it will always 26 | be supplied to the linker. */ 27 | 28 | #ifdef HAVE_CONFIG_H 29 | #include "config.h" 30 | #endif 31 | 32 | #include "pcre2_internal.h" 33 | 34 | const uint8_t PRIV(default_tables)[] = { 35 | 36 | /* This table is a lower casing table. 
*/ 37 | 38 | 0, 1, 2, 3, 4, 5, 6, 7, 39 | 8, 9, 10, 11, 12, 13, 14, 15, 40 | 16, 17, 18, 19, 20, 21, 22, 23, 41 | 24, 25, 26, 27, 28, 29, 30, 31, 42 | 32, 33, 34, 35, 36, 37, 38, 39, 43 | 40, 41, 42, 43, 44, 45, 46, 47, 44 | 48, 49, 50, 51, 52, 53, 54, 55, 45 | 56, 57, 58, 59, 60, 61, 62, 63, 46 | 64, 97, 98, 99,100,101,102,103, 47 | 104,105,106,107,108,109,110,111, 48 | 112,113,114,115,116,117,118,119, 49 | 120,121,122, 91, 92, 93, 94, 95, 50 | 96, 97, 98, 99,100,101,102,103, 51 | 104,105,106,107,108,109,110,111, 52 | 112,113,114,115,116,117,118,119, 53 | 120,121,122,123,124,125,126,127, 54 | 128,129,130,131,132,133,134,135, 55 | 136,137,138,139,140,141,142,143, 56 | 144,145,146,147,148,149,150,151, 57 | 152,153,154,155,156,157,158,159, 58 | 160,161,162,163,164,165,166,167, 59 | 168,169,170,171,172,173,174,175, 60 | 176,177,178,179,180,181,182,183, 61 | 184,185,186,187,188,189,190,191, 62 | 192,193,194,195,196,197,198,199, 63 | 200,201,202,203,204,205,206,207, 64 | 208,209,210,211,212,213,214,215, 65 | 216,217,218,219,220,221,222,223, 66 | 224,225,226,227,228,229,230,231, 67 | 232,233,234,235,236,237,238,239, 68 | 240,241,242,243,244,245,246,247, 69 | 248,249,250,251,252,253,254,255, 70 | 71 | /* This table is a case flipping table. 
*/ 72 | 73 | 0, 1, 2, 3, 4, 5, 6, 7, 74 | 8, 9, 10, 11, 12, 13, 14, 15, 75 | 16, 17, 18, 19, 20, 21, 22, 23, 76 | 24, 25, 26, 27, 28, 29, 30, 31, 77 | 32, 33, 34, 35, 36, 37, 38, 39, 78 | 40, 41, 42, 43, 44, 45, 46, 47, 79 | 48, 49, 50, 51, 52, 53, 54, 55, 80 | 56, 57, 58, 59, 60, 61, 62, 63, 81 | 64, 97, 98, 99,100,101,102,103, 82 | 104,105,106,107,108,109,110,111, 83 | 112,113,114,115,116,117,118,119, 84 | 120,121,122, 91, 92, 93, 94, 95, 85 | 96, 65, 66, 67, 68, 69, 70, 71, 86 | 72, 73, 74, 75, 76, 77, 78, 79, 87 | 80, 81, 82, 83, 84, 85, 86, 87, 88 | 88, 89, 90,123,124,125,126,127, 89 | 128,129,130,131,132,133,134,135, 90 | 136,137,138,139,140,141,142,143, 91 | 144,145,146,147,148,149,150,151, 92 | 152,153,154,155,156,157,158,159, 93 | 160,161,162,163,164,165,166,167, 94 | 168,169,170,171,172,173,174,175, 95 | 176,177,178,179,180,181,182,183, 96 | 184,185,186,187,188,189,190,191, 97 | 192,193,194,195,196,197,198,199, 98 | 200,201,202,203,204,205,206,207, 99 | 208,209,210,211,212,213,214,215, 100 | 216,217,218,219,220,221,222,223, 101 | 224,225,226,227,228,229,230,231, 102 | 232,233,234,235,236,237,238,239, 103 | 240,241,242,243,244,245,246,247, 104 | 248,249,250,251,252,253,254,255, 105 | 106 | /* This table contains bit maps for various character classes. Each map is 32 107 | bytes long and the bits run from the least significant end of each byte. The 108 | classes that have their own maps are: space, xdigit, digit, upper, lower, word, 109 | graph, print, punct, and cntrl. Other classes are built from combinations. 
*/ 110 | 111 | 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00, /* space */ 112 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 113 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 114 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 115 | 116 | 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* xdigit */ 117 | 0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00, 118 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 119 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 120 | 121 | 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* digit */ 122 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 123 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 124 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 125 | 126 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* upper */ 127 | 0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00, 128 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 129 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 130 | 131 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* lower */ 132 | 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07, 133 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 134 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 135 | 136 | 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* word */ 137 | 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07, 138 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 139 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 140 | 141 | 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff, /* graph */ 142 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f, 143 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 144 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 145 | 146 | 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff, /* print */ 147 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f, 148 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 149 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 150 | 151 | 0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc, /* punct */ 152 | 0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78, 153 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 154 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 155 | 156 | 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00, /* cntrl */ 157 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80, 158 | 
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 159 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 160 | 161 | /* This table identifies various classes of character by individual bits: 162 | 0x01 white space character 163 | 0x02 letter 164 | 0x04 lower case letter 165 | 0x08 decimal digit 166 | 0x10 alphanumeric or '_' 167 | */ 168 | 169 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ 170 | 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */ 171 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ 172 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ 173 | 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ 174 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ 175 | 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, /* 0 - 7 */ 176 | 0x18,0x18,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ 177 | 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* @ - G */ 178 | 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */ 179 | 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */ 180 | 0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x10, /* X - _ */ 181 | 0x00,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /* ` - g */ 182 | 0x16,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /* h - o */ 183 | 0x16,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /* p - w */ 184 | 0x16,0x16,0x16,0x00,0x00,0x00,0x00,0x00, /* x -127 */ 185 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ 186 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ 187 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ 188 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ 189 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ 190 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ 191 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ 192 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ 193 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ 194 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ 195 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ 196 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 
*/ 197 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ 198 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ 199 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ 200 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ 201 | 202 | /* End of pcre2_chartables.c */ 203 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2_config.c: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE is a library of functions to support regular expressions whose syntax 6 | and semantics are as close as possible to those of the Perl 5 language. 7 | 8 | Written by Philip Hazel 9 | Original API code Copyright (c) 1997-2012 University of Cambridge 10 | New API code Copyright (c) 2016-2020 University of Cambridge 11 | 12 | ----------------------------------------------------------------------------- 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | * Redistributions of source code must retain the above copyright notice, 17 | this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of the University of Cambridge nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 
26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 | POSSIBILITY OF SUCH DAMAGE. 38 | ----------------------------------------------------------------------------- 39 | */ 40 | 41 | #ifdef HAVE_CONFIG_H 42 | #include "config.h" 43 | #endif 44 | 45 | /* Save the configured link size, which is in bytes. In 16-bit and 32-bit modes 46 | its value gets changed by pcre2_intmodedep.h (included by pcre2_internal.h) to 47 | be in code units. */ 48 | 49 | static int configured_link_size = LINK_SIZE; 50 | 51 | #include "pcre2_internal.h" 52 | 53 | /* These macros are the standard way of turning unquoted text into C strings. 54 | They allow macros like PCRE2_MAJOR to be defined without quotes, which is 55 | convenient for user programs that want to test their values. */ 56 | 57 | #define STRING(a) # a 58 | #define XSTRING(s) STRING(s) 59 | 60 | 61 | /************************************************* 62 | * Return info about what features are configured * 63 | *************************************************/ 64 | 65 | /* If where is NULL, the length of memory required is returned: sizeof(uint32_t) for a numerical item, or strlen() + 1 for a string item.
66 | 67 | Arguments: 68 | what what information is required 69 | where where to put the information 70 | 71 | Returns: 0 if a numerical value is returned 72 | >= 0 if a string value 73 | PCRE2_ERROR_BADOPTION if "what" not recognized 74 | or JIT target requested when JIT not enabled 75 | */ 76 | 77 | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION 78 | bundled_pcre2_config(uint32_t what, void *where) 79 | { 80 | if (where == NULL) /* Requests a length */ 81 | { 82 | switch(what) 83 | { 84 | default: 85 | return PCRE2_ERROR_BADOPTION; 86 | 87 | case PCRE2_CONFIG_BSR: 88 | case PCRE2_CONFIG_COMPILED_WIDTHS: 89 | case PCRE2_CONFIG_DEPTHLIMIT: 90 | case PCRE2_CONFIG_HEAPLIMIT: 91 | case PCRE2_CONFIG_JIT: 92 | case PCRE2_CONFIG_LINKSIZE: 93 | case PCRE2_CONFIG_MATCHLIMIT: 94 | case PCRE2_CONFIG_NEVER_BACKSLASH_C: 95 | case PCRE2_CONFIG_NEWLINE: 96 | case PCRE2_CONFIG_PARENSLIMIT: 97 | case PCRE2_CONFIG_STACKRECURSE: /* Obsolete */ 98 | case PCRE2_CONFIG_TABLES_LENGTH: 99 | case PCRE2_CONFIG_UNICODE: 100 | return sizeof(uint32_t); 101 | 102 | /* These are handled below */ 103 | 104 | case PCRE2_CONFIG_JITTARGET: 105 | case PCRE2_CONFIG_UNICODE_VERSION: 106 | case PCRE2_CONFIG_VERSION: 107 | break; 108 | } 109 | } 110 | 111 | switch (what) 112 | { 113 | default: 114 | return PCRE2_ERROR_BADOPTION; 115 | 116 | case PCRE2_CONFIG_BSR: 117 | #ifdef BSR_ANYCRLF 118 | *((uint32_t *)where) = PCRE2_BSR_ANYCRLF; 119 | #else 120 | *((uint32_t *)where) = PCRE2_BSR_UNICODE; 121 | #endif 122 | break; 123 | 124 | case PCRE2_CONFIG_COMPILED_WIDTHS: 125 | *((uint32_t *)where) = 0 126 | #ifdef SUPPORT_PCRE2_8 127 | + 1 128 | #endif 129 | #ifdef SUPPORT_PCRE2_16 130 | + 2 131 | #endif 132 | #ifdef SUPPORT_PCRE2_32 133 | + 4 134 | #endif 135 | ; 136 | break; 137 | 138 | case PCRE2_CONFIG_DEPTHLIMIT: 139 | *((uint32_t *)where) = MATCH_LIMIT_DEPTH; 140 | break; 141 | 142 | case PCRE2_CONFIG_HEAPLIMIT: 143 | *((uint32_t *)where) = HEAP_LIMIT; 144 | break; 145 | 146 | case PCRE2_CONFIG_JIT: 147 |
#ifdef SUPPORT_JIT 148 | *((uint32_t *)where) = 1; 149 | #else 150 | *((uint32_t *)where) = 0; 151 | #endif 152 | break; 153 | 154 | case PCRE2_CONFIG_JITTARGET: 155 | #ifdef SUPPORT_JIT 156 | { 157 | const char *v = PRIV(jit_get_target)(); 158 | return (int)(1 + ((where == NULL)? 159 | strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v))); 160 | } 161 | #else 162 | return PCRE2_ERROR_BADOPTION; 163 | #endif 164 | 165 | case PCRE2_CONFIG_LINKSIZE: 166 | *((uint32_t *)where) = (uint32_t)configured_link_size; 167 | break; 168 | 169 | case PCRE2_CONFIG_MATCHLIMIT: 170 | *((uint32_t *)where) = MATCH_LIMIT; 171 | break; 172 | 173 | case PCRE2_CONFIG_NEWLINE: 174 | *((uint32_t *)where) = NEWLINE_DEFAULT; 175 | break; 176 | 177 | case PCRE2_CONFIG_NEVER_BACKSLASH_C: 178 | #ifdef NEVER_BACKSLASH_C 179 | *((uint32_t *)where) = 1; 180 | #else 181 | *((uint32_t *)where) = 0; 182 | #endif 183 | break; 184 | 185 | case PCRE2_CONFIG_PARENSLIMIT: 186 | *((uint32_t *)where) = PARENS_NEST_LIMIT; 187 | break; 188 | 189 | /* This is now obsolete. The stack is no longer used via recursion for 190 | handling backtracking in bundled_pcre2_match(). */ 191 | 192 | case PCRE2_CONFIG_STACKRECURSE: 193 | *((uint32_t *)where) = 0; 194 | break; 195 | 196 | case PCRE2_CONFIG_TABLES_LENGTH: 197 | *((uint32_t *)where) = TABLES_LENGTH; 198 | break; 199 | 200 | case PCRE2_CONFIG_UNICODE_VERSION: 201 | { 202 | #if defined SUPPORT_UNICODE 203 | const char *v = PRIV(unicode_version); 204 | #else 205 | const char *v = "Unicode not supported"; 206 | #endif 207 | return (int)(1 + ((where == NULL)?
208 | strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v))); 209 | } 210 | break; 211 | 212 | case PCRE2_CONFIG_UNICODE: 213 | #if defined SUPPORT_UNICODE 214 | *((uint32_t *)where) = 1; 215 | #else 216 | *((uint32_t *)where) = 0; 217 | #endif 218 | break; 219 | 220 | /* The hackery in setting "v" below is to cope with the case when 221 | PCRE2_PRERELEASE is set to an empty string (which it is for real releases). 222 | If the second alternative is used in this case, it does not leave a space 223 | before the date. On the other hand, if all four macros are put into a single 224 | XSTRING when PCRE2_PRERELEASE is not empty, an unwanted space is inserted. 225 | There are problems using an "obvious" approach like this: 226 | 227 | XSTRING(PCRE2_MAJOR) "." XSTRING(PCRE2_MINOR) 228 | XSTRING(PCRE2_PRERELEASE) " " XSTRING(PCRE2_DATE) 229 | 230 | because, when PCRE2_PRERELEASE is empty, this leads to an attempted expansion 231 | of STRING(). The C standard states: "If (before argument substitution) any 232 | argument consists of no preprocessing tokens, the behavior is undefined." It 233 | turns out that gcc treats this case as a single empty string - which is what 234 | we really want - but Visual C grumbles about the lack of an argument for the 235 | macro. Unfortunately, both are within their rights. As there seems to be no 236 | way to test for a macro's value being empty at compile time, we have to 237 | resort to a runtime test. */ 238 | 239 | case PCRE2_CONFIG_VERSION: 240 | { 241 | const char *v = (XSTRING(Z PCRE2_PRERELEASE)[1] == 0)? 242 | XSTRING(PCRE2_MAJOR.PCRE2_MINOR PCRE2_DATE) : 243 | XSTRING(PCRE2_MAJOR.PCRE2_MINOR) XSTRING(PCRE2_PRERELEASE PCRE2_DATE); 244 | return (int)(1 + ((where == NULL)?
245 | strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v))); 246 | } 247 | } 248 | 249 | return 0; 250 | } 251 | 252 | /* End of pcre2_config.c */ 253 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2_extuni.c: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE is a library of functions to support regular expressions whose syntax 6 | and semantics are as close as possible to those of the Perl 5 language. 7 | 8 | Written by Philip Hazel 9 | Original API code Copyright (c) 1997-2012 University of Cambridge 10 | New API code Copyright (c) 2016-2019 University of Cambridge 11 | 12 | ----------------------------------------------------------------------------- 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | * Redistributions of source code must retain the above copyright notice, 17 | this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of the University of Cambridge nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 | ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 | POSSIBILITY OF SUCH DAMAGE. 38 | ----------------------------------------------------------------------------- 39 | */ 40 | 41 | /* This module contains an internal function that is used to match a Unicode 42 | extended grapheme sequence. It is used by both bundled_pcre2_match() and 43 | the DFA matcher in pcre2_dfa_match.c. However, it is called only when Unicode support is being 44 | compiled. Nevertheless, we provide a dummy function when there is no Unicode 45 | support, because some compilers do not like functionless source files.
*/ 46 | 47 | 48 | #ifdef HAVE_CONFIG_H 49 | #include "config.h" 50 | #endif 51 | 52 | 53 | #include "pcre2_internal.h" 54 | 55 | 56 | /* Dummy function, compiled only when SUPPORT_UNICODE is not defined; it ignores all of its arguments and returns NULL. */ 57 | 58 | #ifndef SUPPORT_UNICODE 59 | PCRE2_SPTR 60 | PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject, 61 | PCRE2_SPTR end_subject, BOOL utf, int *xcount) 62 | { 63 | (void)c; 64 | (void)eptr; 65 | (void)start_subject; 66 | (void)end_subject; 67 | (void)utf; 68 | (void)xcount; 69 | return NULL; 70 | } 71 | #else 72 | 73 | 74 | /************************************************* 75 | * Match an extended grapheme sequence * 76 | *************************************************/ 77 | 78 | /* Starting from character c, eptr is advanced over each following character for as long as the PRIV(ucp_gbtable) entry for the preceding character's break property has the bit for the next character's break property set, subject to the two special cases commented in the body. 79 | Arguments: 80 | c the first character 81 | eptr pointer to next character 82 | start_subject pointer to start of subject 83 | end_subject pointer to end of subject 84 | utf TRUE if in UTF mode 85 | xcount pointer to count of additional characters, 86 | or NULL if count not needed (the count is incremented once per character consumed and is never reset here, so the caller must initialize it) 87 | 88 | Returns: pointer after the end of the sequence 89 | */ 90 | 91 | PCRE2_SPTR 92 | PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject, 93 | PCRE2_SPTR end_subject, BOOL utf, int *xcount) 94 | { 95 | int lgb = UCD_GRAPHBREAK(c); 96 | 97 | while (eptr < end_subject) 98 | { 99 | int rgb; 100 | int len = 1; 101 | if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 102 | rgb = UCD_GRAPHBREAK(c); 103 | if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; 104 | 105 | /* Not breaking between Regional Indicators is allowed only if there 106 | are an even number of preceding RIs.
*/ 107 | 108 | if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator) 109 | { 110 | int ricount = 0; 111 | PCRE2_SPTR bptr = eptr - 1; 112 | if (utf) BACKCHAR(bptr); 113 | 114 | /* bptr is pointing to the left-hand character; the loop below counts the Regional Indicators that precede it */ 115 | 116 | while (bptr > start_subject) 117 | { 118 | bptr--; 119 | if (utf) 120 | { 121 | BACKCHAR(bptr); 122 | GETCHAR(c, bptr); 123 | } 124 | else 125 | c = *bptr; 126 | if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) break; 127 | ricount++; 128 | } 129 | if ((ricount & 1) != 0) break; /* Grapheme break required */ 130 | } 131 | 132 | /* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this 133 | allows any number of them before a following Extended_Pictographic. */ 134 | 135 | if ((rgb != ucp_gbExtend && rgb != ucp_gbZWJ) || 136 | lgb != ucp_gbExtended_Pictographic) 137 | lgb = rgb; 138 | 139 | eptr += len; 140 | if (xcount != NULL) *xcount += 1; 141 | } 142 | 143 | return eptr; 144 | } 145 | 146 | #endif /* SUPPORT_UNICODE */ 147 | 148 | /* End of pcre2_extuni.c */ 149 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2_find_bracket.c: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE is a library of functions to support regular expressions whose syntax 6 | and semantics are as close as possible to those of the Perl 5 language.
7 | 8 | Written by Philip Hazel 9 | Original API code Copyright (c) 1997-2012 University of Cambridge 10 | New API code Copyright (c) 2016-2018 University of Cambridge 11 | 12 | ----------------------------------------------------------------------------- 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | * Redistributions of source code must retain the above copyright notice, 17 | this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of the University of Cambridge nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 | POSSIBILITY OF SUCH DAMAGE. 
38 | ----------------------------------------------------------------------------- 39 | */ 40 | 41 | 42 | /* This module contains a single function that scans through a compiled pattern 43 | until it finds a capturing bracket with the given number, or, if the number is 44 | negative, an instance of OP_REVERSE for a lookbehind. The function is called 45 | from pcre2_compile.c and also from pcre2_study.c when finding the minimum 46 | matching length. */ 47 | 48 | 49 | #ifdef HAVE_CONFIG_H 50 | #include "config.h" 51 | #endif 52 | 53 | #include "pcre2_internal.h" 54 | 55 | 56 | /************************************************* 57 | * Scan compiled regex for specific bracket * 58 | *************************************************/ 59 | 60 | /* 61 | Arguments: 62 | code points to start of expression 63 | utf TRUE in UTF mode 64 | number the required bracket number or negative to find a lookbehind 65 | 66 | Returns: pointer to the opcode for the bracket (or, when number is negative, to the first OP_REVERSE encountered), or NULL if OP_END is reached without finding it 67 | */ 68 | 69 | PCRE2_SPTR 70 | PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number) 71 | { 72 | for (;;) 73 | { 74 | PCRE2_UCHAR c = *code; 75 | 76 | if (c == OP_END) return NULL; 77 | 78 | /* XCLASS is used for classes that cannot be represented just by a bit map. 79 | This includes negated single high-valued characters. CALLOUT_STR is used for 80 | callouts with string arguments. In both cases the length in the table is 81 | zero; the actual length is stored in the compiled code.
*/ 82 | 83 | if (c == OP_XCLASS) code += GET(code, 1); 84 | else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); 85 | 86 | /* Handle lookbehind: any negative number matches the first OP_REVERSE; otherwise skip over it */ 87 | 88 | else if (c == OP_REVERSE) 89 | { 90 | if (number < 0) return (PCRE2_UCHAR *)code; 91 | code += PRIV(OP_lengths)[c]; 92 | } 93 | 94 | /* Handle capturing bracket: its number is read with GET2 at offset 1+LINK_SIZE and compared with the one requested */ 95 | 96 | else if (c == OP_CBRA || c == OP_SCBRA || 97 | c == OP_CBRAPOS || c == OP_SCBRAPOS) 98 | { 99 | int n = (int)GET2(code, 1+LINK_SIZE); 100 | if (n == number) return (PCRE2_UCHAR *)code; 101 | code += PRIV(OP_lengths)[c]; 102 | } 103 | 104 | /* Otherwise, we can get the item's length from the table, except that for 105 | repeated character types, we have to test for \p and \P, which have an extra 106 | two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we 107 | must add in its length. */ 108 | 109 | else 110 | { 111 | switch(c) 112 | { 113 | case OP_TYPESTAR: 114 | case OP_TYPEMINSTAR: 115 | case OP_TYPEPLUS: 116 | case OP_TYPEMINPLUS: 117 | case OP_TYPEQUERY: 118 | case OP_TYPEMINQUERY: 119 | case OP_TYPEPOSSTAR: 120 | case OP_TYPEPOSPLUS: 121 | case OP_TYPEPOSQUERY: 122 | if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; 123 | break; 124 | 125 | case OP_TYPEUPTO: 126 | case OP_TYPEMINUPTO: 127 | case OP_TYPEEXACT: 128 | case OP_TYPEPOSUPTO: 129 | if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) 130 | code += 2; 131 | break; 132 | 133 | case OP_MARK: 134 | case OP_COMMIT_ARG: 135 | case OP_PRUNE_ARG: 136 | case OP_SKIP_ARG: 137 | case OP_THEN_ARG: 138 | code += code[1]; 139 | break; 140 | } 141 | 142 | /* Add in the fixed length from the table */ 143 | 144 | code += PRIV(OP_lengths)[c]; 145 | 146 | /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be 147 | followed by a multi-byte character. The length in the table is a minimum, so 148 | we have to arrange to skip the extra bytes.
*/ 149 | 150 | #ifdef MAYBE_UTF_MULTI 151 | if (utf) switch(c) 152 | { 153 | case OP_CHAR: 154 | case OP_CHARI: 155 | case OP_NOT: 156 | case OP_NOTI: 157 | case OP_EXACT: 158 | case OP_EXACTI: 159 | case OP_NOTEXACT: 160 | case OP_NOTEXACTI: 161 | case OP_UPTO: 162 | case OP_UPTOI: 163 | case OP_NOTUPTO: 164 | case OP_NOTUPTOI: 165 | case OP_MINUPTO: 166 | case OP_MINUPTOI: 167 | case OP_NOTMINUPTO: 168 | case OP_NOTMINUPTOI: 169 | case OP_POSUPTO: 170 | case OP_POSUPTOI: 171 | case OP_NOTPOSUPTO: 172 | case OP_NOTPOSUPTOI: 173 | case OP_STAR: 174 | case OP_STARI: 175 | case OP_NOTSTAR: 176 | case OP_NOTSTARI: 177 | case OP_MINSTAR: 178 | case OP_MINSTARI: 179 | case OP_NOTMINSTAR: 180 | case OP_NOTMINSTARI: 181 | case OP_POSSTAR: 182 | case OP_POSSTARI: 183 | case OP_NOTPOSSTAR: 184 | case OP_NOTPOSSTARI: 185 | case OP_PLUS: 186 | case OP_PLUSI: 187 | case OP_NOTPLUS: 188 | case OP_NOTPLUSI: 189 | case OP_MINPLUS: 190 | case OP_MINPLUSI: 191 | case OP_NOTMINPLUS: 192 | case OP_NOTMINPLUSI: 193 | case OP_POSPLUS: 194 | case OP_POSPLUSI: 195 | case OP_NOTPOSPLUS: 196 | case OP_NOTPOSPLUSI: 197 | case OP_QUERY: 198 | case OP_QUERYI: 199 | case OP_NOTQUERY: 200 | case OP_NOTQUERYI: 201 | case OP_MINQUERY: 202 | case OP_MINQUERYI: 203 | case OP_NOTMINQUERY: 204 | case OP_NOTMINQUERYI: 205 | case OP_POSQUERY: 206 | case OP_POSQUERYI: 207 | case OP_NOTPOSQUERY: 208 | case OP_NOTPOSQUERYI: 209 | if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); 210 | break; 211 | } 212 | #else 213 | (void)(utf); /* Keep compiler happy by referencing function argument */ 214 | #endif /* MAYBE_UTF_MULTI */ 215 | } 216 | } 217 | } 218 | 219 | /* End of pcre2_find_bracket.c */ 220 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2_is_bundled.c: -------------------------------------------------------------------------------- 1 | #include "pcre2.h" 2 | /* Always returns 1. NOTE(review): presumably lets callers detect that the bundled copy of PCRE2 was compiled in - confirm against the callers. */ 3 | int pcre2_is_bundled(void) { 4 | return 1; 5 | } 6 |
-------------------------------------------------------------------------------- /src/PCRE2/pcre2_jit_match.c: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE is a library of functions to support regular expressions whose syntax 6 | and semantics are as close as possible to those of the Perl 5 language. 7 | 8 | Written by Philip Hazel 9 | Original API code Copyright (c) 1997-2012 University of Cambridge 10 | New API code Copyright (c) 2016-2018 University of Cambridge 11 | 12 | ----------------------------------------------------------------------------- 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | * Redistributions of source code must retain the above copyright notice, 17 | this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of the University of Cambridge nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 | POSSIBILITY OF SUCH DAMAGE. 38 | ----------------------------------------------------------------------------- 39 | */ 40 | 41 | #ifndef INCLUDED_FROM_PCRE2_JIT_COMPILE 42 | #error This file must be included from pcre2_jit_compile.c. 43 | #endif 44 | 45 | #ifdef SUPPORT_JIT 46 | /* Runs the compiled function on a temporary sljit_stack backed by a local array of MACHINE_STACK_SIZE bytes; the match function below uses it when no JIT stack has been supplied. */ 47 | static SLJIT_NOINLINE int jit_machine_stack_exec(jit_arguments *arguments, jit_function executable_func) 48 | { 49 | sljit_u8 local_space[MACHINE_STACK_SIZE]; 50 | struct sljit_stack local_stack; 51 | 52 | local_stack.min_start = local_space; 53 | local_stack.start = local_space; 54 | local_stack.end = local_space + MACHINE_STACK_SIZE; 55 | local_stack.top = local_space + MACHINE_STACK_SIZE; 56 | arguments->stack = &local_stack; 57 | return executable_func(arguments); 58 | } 59 | 60 | #endif 61 | 62 | 63 | /************************************************* 64 | * Do a JIT pattern match * 65 | *************************************************/ 66 | 67 | /* This function runs a JIT pattern match.
68 | 69 | Arguments: 70 | code points to the compiled expression 71 | subject points to the subject string 72 | length length of subject string (may contain binary zeros) 73 | start_offset where to start in the subject string 74 | options option bits 75 | match_data points to a match_data block 76 | mcontext points to a match context 77 | 78 | Returns: > 0 => success; value is the number of ovector pairs filled 79 | = 0 => success, but ovector is not big enough 80 | -1 => failed to match (PCRE2_ERROR_NOMATCH) 81 | < -1 => some kind of unexpected problem 82 | */ 83 | 84 | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION 85 | bundled_pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, 86 | PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, 87 | pcre2_match_context *mcontext) 88 | { 89 | #ifndef SUPPORT_JIT 90 | 91 | (void)code; 92 | (void)subject; 93 | (void)length; 94 | (void)start_offset; 95 | (void)options; 96 | (void)match_data; 97 | (void)mcontext; 98 | return PCRE2_ERROR_JIT_BADOPTION; 99 | 100 | #else /* SUPPORT_JIT */ 101 | 102 | pcre2_real_code *re = (pcre2_real_code *)code; 103 | executable_functions *functions = (executable_functions *)re->executable_jit; 104 | pcre2_jit_stack *jit_stack; 105 | uint32_t oveccount = match_data->oveccount; 106 | uint32_t max_oveccount; 107 | union { 108 | void *executable_func; 109 | jit_function call_executable_func; 110 | } convert_executable_func; 111 | jit_arguments arguments; 112 | int rc; 113 | int index = 0; 114 | /* Pick the compiled variant: 0 by default, 1 for PCRE2_PARTIAL_SOFT, 2 for PCRE2_PARTIAL_HARD (hard takes precedence). */ 115 | if ((options & PCRE2_PARTIAL_HARD) != 0) 116 | index = 2; 117 | else if ((options & PCRE2_PARTIAL_SOFT) != 0) 118 | index = 1; 119 | 120 | if (functions == NULL || functions->executable_funcs[index] == NULL) 121 | return PCRE2_ERROR_JIT_BADOPTION; 122 | 123 | /* Sanity checks should be handled by pcre_exec.
*/ 124 | arguments.str = subject + start_offset; 125 | arguments.begin = subject; 126 | arguments.end = subject + length; 127 | arguments.match_data = match_data; 128 | arguments.startchar_ptr = subject; 129 | arguments.mark_ptr = NULL; 130 | arguments.options = options; 131 | /* With a match context, take the callout, offset limit, the smaller of the two match limits, and the JIT stack (from the callback, or from the callback data itself when no callback is set); without one, use defaults and no JIT stack. */ 132 | if (mcontext != NULL) 133 | { 134 | arguments.callout = mcontext->callout; 135 | arguments.callout_data = mcontext->callout_data; 136 | arguments.offset_limit = mcontext->offset_limit; 137 | arguments.limit_match = (mcontext->match_limit < re->limit_match)? 138 | mcontext->match_limit : re->limit_match; 139 | if (mcontext->jit_callback != NULL) 140 | jit_stack = mcontext->jit_callback(mcontext->jit_callback_data); 141 | else 142 | jit_stack = (pcre2_jit_stack *)mcontext->jit_callback_data; 143 | } 144 | else 145 | { 146 | arguments.callout = NULL; 147 | arguments.callout_data = NULL; 148 | arguments.offset_limit = PCRE2_UNSET; 149 | arguments.limit_match = (MATCH_LIMIT < re->limit_match)? 150 | MATCH_LIMIT : re->limit_match; 151 | jit_stack = NULL; 152 | } 153 | 154 | /* Cap the pair count at functions->top_bracket; arguments.oveccount is set to twice the capped value. */ 155 | max_oveccount = functions->top_bracket; 156 | if (oveccount > max_oveccount) 157 | oveccount = max_oveccount; 158 | arguments.oveccount = oveccount << 1; 159 | 160 | /* The union turns the stored void * into a callable function pointer. */ 161 | convert_executable_func.executable_func = functions->executable_funcs[index]; 162 | if (jit_stack != NULL) 163 | { 164 | arguments.stack = (struct sljit_stack *)(jit_stack->stack); 165 | rc = convert_executable_func.call_executable_func(&arguments); 166 | } 167 | else 168 | rc = jit_machine_stack_exec(&arguments, convert_executable_func.call_executable_func); 169 | /* A result larger than the capped pair count is reported as 0 (ovector not big enough). */ 170 | if (rc > (int)oveccount) 171 | rc = 0; 172 | match_data->code = re; 173 | match_data->subject = (rc >= 0 || rc == PCRE2_ERROR_PARTIAL)?
subject : NULL; 174 | match_data->rc = rc; 175 | match_data->startchar = arguments.startchar_ptr - subject; 176 | match_data->leftchar = 0; 177 | match_data->rightchar = 0; 178 | match_data->mark = arguments.mark_ptr; 179 | match_data->matchedby = PCRE2_MATCHEDBY_JIT; 180 | 181 | return match_data->rc; 182 | 183 | #endif /* SUPPORT_JIT */ 184 | } 185 | 186 | /* End of pcre2_jit_match.c */ 187 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2_jit_misc.c: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE is a library of functions to support regular expressions whose syntax 6 | and semantics are as close as possible to those of the Perl 5 language. 7 | 8 | Written by Philip Hazel 9 | Original API code Copyright (c) 1997-2012 University of Cambridge 10 | New API code Copyright (c) 2016 University of Cambridge 11 | 12 | ----------------------------------------------------------------------------- 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | * Redistributions of source code must retain the above copyright notice, 17 | this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of the University of Cambridge nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission.
26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 | POSSIBILITY OF SUCH DAMAGE. 38 | ----------------------------------------------------------------------------- 39 | */ 40 | 41 | 42 | #ifndef INCLUDED_FROM_PCRE2_JIT_COMPILE 43 | #error This file must be included from pcre2_jit_compile.c. 
44 | #endif 45 | 46 | 47 | 48 | /************************************************* 49 | * Free JIT read-only data * 50 | *************************************************/ 51 | /* Frees every node of a singly linked list; the pointer to the next node is read from the start of each node before that node is freed. Without JIT support this does nothing. */ 52 | void 53 | PRIV(jit_free_rodata)(void *current, void *allocator_data) 54 | { 55 | #ifndef SUPPORT_JIT 56 | (void)current; 57 | (void)allocator_data; 58 | #else /* SUPPORT_JIT */ 59 | void *next; 60 | 61 | SLJIT_UNUSED_ARG(allocator_data); 62 | 63 | while (current != NULL) 64 | { 65 | next = *(void**)current; 66 | SLJIT_FREE(current, allocator_data); 67 | current = next; 68 | } 69 | 70 | #endif /* SUPPORT_JIT */ 71 | } 72 | 73 | /************************************************* 74 | * Free JIT compiled code * 75 | *************************************************/ 76 | /* For each compile mode, frees the executable code (if any) and the read-only data list, then frees the executable_functions block itself. executable_jit is dereferenced without a NULL check. */ 77 | void 78 | PRIV(jit_free)(void *executable_jit, pcre2_memctl *memctl) 79 | { 80 | #ifndef SUPPORT_JIT 81 | (void)executable_jit; 82 | (void)memctl; 83 | #else /* SUPPORT_JIT */ 84 | 85 | executable_functions *functions = (executable_functions *)executable_jit; 86 | void *allocator_data = memctl; 87 | int i; 88 | 89 | for (i = 0; i < JIT_NUMBER_OF_COMPILE_MODES; i++) 90 | { 91 | if (functions->executable_funcs[i] != NULL) 92 | sljit_free_code(functions->executable_funcs[i]); 93 | PRIV(jit_free_rodata)(functions->read_only_data_heads[i], allocator_data); 94 | } 95 | 96 | SLJIT_FREE(functions, allocator_data); 97 | 98 | #endif /* SUPPORT_JIT */ 99 | } 100 | 101 | 102 | /************************************************* 103 | * Free unused JIT memory * 104 | *************************************************/ 105 | /* gcontext is not used; with JIT support this calls sljit_free_unused_memory_exec(), otherwise it does nothing. */ 106 | PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION 107 | bundled_pcre2_jit_free_unused_memory(pcre2_general_context *gcontext) 108 | { 109 | #ifndef SUPPORT_JIT 110 | (void)gcontext; /* Suppress warning */ 111 | #else /* SUPPORT_JIT */ 112 | SLJIT_UNUSED_ARG(gcontext); 113 | sljit_free_unused_memory_exec(); 114 | #endif /* SUPPORT_JIT */ 115 | } 116 | 117 | 118 | 119 |
/************************************************* 120 | * Allocate a JIT stack * 121 | *************************************************/ 122 | /* Returns a new JIT stack, or NULL if a size is invalid, an allocation fails, or JIT is not supported. startsize is clamped to maxsize and both are rounded up to a multiple of STACK_GROWTH_RATE (the mask arithmetic assumes that value is a power of two). */ 123 | PCRE2_EXP_DEFN pcre2_jit_stack * PCRE2_CALL_CONVENTION 124 | bundled_pcre2_jit_stack_create(size_t startsize, size_t maxsize, 125 | pcre2_general_context *gcontext) 126 | { 127 | #ifndef SUPPORT_JIT 128 | 129 | (void)gcontext; 130 | (void)startsize; 131 | (void)maxsize; 132 | return NULL; 133 | 134 | #else /* SUPPORT_JIT */ 135 | 136 | pcre2_jit_stack *jit_stack; 137 | /* Reject zero sizes, and any maxsize so large that rounding it up to a multiple of STACK_GROWTH_RATE below would wrap around size_t. */ 138 | if (startsize < 1 || maxsize < 1 || maxsize > (size_t)-1 - STACK_GROWTH_RATE) 139 | return NULL; 140 | if (startsize > maxsize) 141 | startsize = maxsize; 142 | startsize = (startsize + STACK_GROWTH_RATE - 1) & ~(STACK_GROWTH_RATE - 1); 143 | maxsize = (maxsize + STACK_GROWTH_RATE - 1) & ~(STACK_GROWTH_RATE - 1); 144 | 145 | jit_stack = PRIV(memctl_malloc)(sizeof(pcre2_real_jit_stack), (pcre2_memctl *)gcontext); 146 | if (jit_stack == NULL) return NULL; 147 | jit_stack->stack = sljit_allocate_stack(startsize, maxsize, &jit_stack->memctl); 148 | if (jit_stack->stack == NULL) 149 | { 150 | jit_stack->memctl.free(jit_stack, jit_stack->memctl.memory_data); 151 | return NULL; 152 | } 153 | return jit_stack; 154 | 155 | #endif 156 | } 157 | 158 | 159 | /************************************************* 160 | * Assign a JIT stack to a pattern * 161 | *************************************************/ 162 | /* Stores the callback and its data in the match context; a NULL mcontext is ignored. When the callback is NULL, bundled_pcre2_jit_match() treats callback_data as the pcre2_jit_stack itself. */ 163 | PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION 164 | bundled_pcre2_jit_stack_assign(pcre2_match_context *mcontext, pcre2_jit_callback callback, 165 | void *callback_data) 166 | { 167 | #ifndef SUPPORT_JIT 168 | (void)mcontext; 169 | (void)callback; 170 | (void)callback_data; 171 | #else /* SUPPORT_JIT */ 172 | 173 | if (mcontext == NULL) return; 174 | mcontext->jit_callback = callback; 175 | mcontext->jit_callback_data = callback_data; 176 | 177 | #endif /* SUPPORT_JIT */ 178 | } 179 | 180 | 181 | /************************************************* 182 | * Free a JIT stack * 183 |
*************************************************/ 184 | /* Frees the sljit stack and then the wrapper block, both through the stack's own memctl; a NULL jit_stack is ignored. */ 185 | PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION 186 | bundled_pcre2_jit_stack_free(pcre2_jit_stack *jit_stack) 187 | { 188 | #ifndef SUPPORT_JIT 189 | (void)jit_stack; 190 | #else /* SUPPORT_JIT */ 191 | if (jit_stack != NULL) 192 | { 193 | sljit_free_stack((struct sljit_stack *)(jit_stack->stack), &jit_stack->memctl); 194 | jit_stack->memctl.free(jit_stack, jit_stack->memctl.memory_data); 195 | } 196 | #endif /* SUPPORT_JIT */ 197 | } 198 | 199 | 200 | /************************************************* 201 | * Get target CPU type * 202 | *************************************************/ 203 | /* Returns the string from sljit_get_platform_name(), or a fixed message when JIT support is not compiled in. */ 204 | const char* 205 | PRIV(jit_get_target)(void) 206 | { 207 | #ifndef SUPPORT_JIT 208 | return "JIT is not supported"; 209 | #else /* SUPPORT_JIT */ 210 | return sljit_get_platform_name(); 211 | #endif /* SUPPORT_JIT */ 212 | } 213 | 214 | 215 | /************************************************* 216 | * Get size of JIT code * 217 | *************************************************/ 218 | /* Returns the sum of the three executable_sizes entries (0 without JIT support); the compile-time assertion guards the assumption that there are exactly three compile modes. */ 219 | size_t 220 | PRIV(jit_get_size)(void *executable_jit) 221 | { 222 | #ifndef SUPPORT_JIT 223 | (void)executable_jit; 224 | return 0; 225 | #else /* SUPPORT_JIT */ 226 | sljit_uw *executable_sizes = ((executable_functions *)executable_jit)->executable_sizes; 227 | SLJIT_COMPILE_ASSERT(JIT_NUMBER_OF_COMPILE_MODES == 3, number_of_compile_modes_changed); 228 | return executable_sizes[0] + executable_sizes[1] + executable_sizes[2]; 229 | #endif 230 | } 231 | 232 | /* End of pcre2_jit_misc.c */ 233 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2_jit_neon_inc.h: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE is a library of functions to support regular expressions whose syntax 6 |
and semantics are as close as possible to those of the Perl 5 language. 7 | 8 | Written by Philip Hazel 9 | This module by Zoltan Herczeg and Sebastian Pop 10 | Original API code Copyright (c) 1997-2012 University of Cambridge 11 | New API code Copyright (c) 2016-2019 University of Cambridge 12 | 13 | ----------------------------------------------------------------------------- 14 | Redistribution and use in source and binary forms, with or without 15 | modification, are permitted provided that the following conditions are met: 16 | 17 | * Redistributions of source code must retain the above copyright notice, 18 | this list of conditions and the following disclaimer. 19 | 20 | * Redistributions in binary form must reproduce the above copyright 21 | notice, this list of conditions and the following disclaimer in the 22 | documentation and/or other materials provided with the distribution. 23 | 24 | * Neither the name of the University of Cambridge nor the names of its 25 | contributors may be used to endorse or promote products derived from 26 | this software without specific prior written permission. 27 | 28 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 29 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 32 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 33 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 35 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 36 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 37 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 38 | POSSIBILITY OF SUCH DAMAGE. 
39 | ----------------------------------------------------------------------------- 40 | */ 41 | 42 | # if defined(FFCS) 43 | # if defined(FF_UTF) 44 | # define FF_FUN ffcs_utf 45 | # else 46 | # define FF_FUN ffcs 47 | # endif 48 | 49 | # elif defined(FFCS_2) 50 | # if defined(FF_UTF) 51 | # define FF_FUN ffcs_2_utf 52 | # else 53 | # define FF_FUN ffcs_2 54 | # endif 55 | 56 | # elif defined(FFCS_MASK) 57 | # if defined(FF_UTF) 58 | # define FF_FUN ffcs_mask_utf 59 | # else 60 | # define FF_FUN ffcs_mask 61 | # endif 62 | 63 | # elif defined(FFCPS_0) 64 | # if defined (FF_UTF) 65 | # define FF_FUN ffcps_0_utf 66 | # else 67 | # define FF_FUN ffcps_0 68 | # endif 69 | 70 | # elif defined (FFCPS_1) 71 | # if defined (FF_UTF) 72 | # define FF_FUN ffcps_1_utf 73 | # else 74 | # define FF_FUN ffcps_1 75 | # endif 76 | 77 | # elif defined (FFCPS_DEFAULT) 78 | # if defined (FF_UTF) 79 | # define FF_FUN ffcps_default_utf 80 | # else 81 | # define FF_FUN ffcps_default 82 | # endif 83 | # endif 84 | /* Shared body of the fast-forward search routines in this include file. The includer picks one variant through the FFCS, FFCS_2, FFCS_MASK or FFCPS_* macros (optionally with FF_UTF), and FF_FUN above expands to the matching function name. The subject is scanned in 16-byte vectors starting at str_ptr; the return value is a pointer to the position found, or NULL when nothing is found before str_end. NOTE(review): FFCPS itself is tested below but not defined in this chunk - presumably the includer sets it together with FFCPS_0 / FFCPS_1 / FFCPS_DEFAULT; confirm. */ 85 | static sljit_u8* SLJIT_FUNC FF_FUN(sljit_u8 *str_end, sljit_u8 *str_ptr, sljit_uw offs1, sljit_uw offs2, sljit_uw chars) 86 | #undef FF_FUN 87 | { 88 | quad_word qw; 89 | int_char ic; 90 | ic.x = chars; 91 | 92 | #if defined(FFCS) 93 | sljit_u8 c1 = ic.c.c1; 94 | vect_t vc1 = VDUPQ(c1); 95 | 96 | #elif defined(FFCS_2) 97 | sljit_u8 c1 = ic.c.c1; 98 | vect_t vc1 = VDUPQ(c1); 99 | sljit_u8 c2 = ic.c.c2; 100 | vect_t vc2 = VDUPQ(c2); 101 | 102 | #elif defined(FFCS_MASK) 103 | sljit_u8 c1 = ic.c.c1; 104 | vect_t vc1 = VDUPQ(c1); 105 | sljit_u8 mask = ic.c.c2; 106 | vect_t vmask = VDUPQ(mask); 107 | #endif 108 | 109 | #if defined(FFCPS) 110 | compare_type compare1_type = compare_match1; 111 | compare_type compare2_type = compare_match1; 112 | vect_t cmp1a, cmp1b, cmp2a, cmp2b; 113 | const sljit_u32 diff = IN_UCHARS(offs1 - offs2); 114 | PCRE2_UCHAR char1a = ic.c.c1; 115 | PCRE2_UCHAR char2a = ic.c.c3; 116 | 117 | # ifdef FFCPS_CHAR1A2A 118 | cmp1a = VDUPQ(char1a); 119 | cmp2a = 
VDUPQ(char2a); 120 | cmp1b = VDUPQ(0); /* to avoid errors on older compilers -Werror=maybe-uninitialized */ 121 | cmp2b = VDUPQ(0); /* to avoid errors on older compilers -Werror=maybe-uninitialized */ 122 | # else 123 | PCRE2_UCHAR char1b = ic.c.c2; 124 | PCRE2_UCHAR char2b = ic.c.c4; 125 | if (char1a == char1b) 126 | { 127 | cmp1a = VDUPQ(char1a); 128 | cmp1b = VDUPQ(0); /* to avoid errors on older compilers -Werror=maybe-uninitialized */ 129 | } 130 | else 131 | { 132 | sljit_u32 bit1 = char1a ^ char1b; 133 | if (is_powerof2(bit1)) 134 | { 135 | compare1_type = compare_match1i; 136 | cmp1a = VDUPQ(char1a | bit1); 137 | cmp1b = VDUPQ(bit1); 138 | } 139 | else 140 | { 141 | compare1_type = compare_match2; 142 | cmp1a = VDUPQ(char1a); 143 | cmp1b = VDUPQ(char1b); 144 | } 145 | } 146 | 147 | if (char2a == char2b) 148 | { 149 | cmp2a = VDUPQ(char2a); 150 | cmp2b = VDUPQ(0); /* to avoid errors on older compilers -Werror=maybe-uninitialized */ 151 | } 152 | else 153 | { 154 | sljit_u32 bit2 = char2a ^ char2b; 155 | if (is_powerof2(bit2)) 156 | { 157 | compare2_type = compare_match1i; 158 | cmp2a = VDUPQ(char2a | bit2); 159 | cmp2b = VDUPQ(bit2); 160 | } 161 | else 162 | { 163 | compare2_type = compare_match2; 164 | cmp2a = VDUPQ(char2a); 165 | cmp2b = VDUPQ(char2b); 166 | } 167 | } 168 | # endif 169 | 170 | str_ptr += IN_UCHARS(offs1); 171 | #endif 172 | 173 | #if PCRE2_CODE_UNIT_WIDTH != 8 174 | vect_t char_mask = VDUPQ(0xff); 175 | #endif 176 | 177 | #if defined(FF_UTF) 178 | restart:; 179 | #endif 180 | 181 | #if defined(FFCPS) 182 | sljit_u8 *p1 = str_ptr - diff; 183 | #endif 184 | /* Round str_ptr down to a 16-byte boundary for the first vector load; align_offset keeps the number of leading bytes in that vector that lie before the original str_ptr. */ sljit_s32 align_offset = ((uint64_t)str_ptr & 0xf); 185 | str_ptr = (sljit_u8 *) ((uint64_t)str_ptr & ~0xf); 186 | vect_t data = VLD1Q(str_ptr); 187 | #if PCRE2_CODE_UNIT_WIDTH != 8 188 | data = VANDQ(data, char_mask); 189 | #endif 190 | 191 | #if defined(FFCS) 192 | vect_t eq = VCEQQ(data, vc1); 193 | 194 | #elif defined(FFCS_2) 195 | vect_t eq1 = VCEQQ(data, vc1); 196 | vect_t eq2 = 
VCEQQ(data, vc2); 197 | vect_t eq = VORRQ(eq1, eq2); 198 | 199 | #elif defined(FFCS_MASK) 200 | vect_t eq = VORRQ(data, vmask); 201 | eq = VCEQQ(eq, vc1); 202 | 203 | #elif defined(FFCPS) 204 | # if defined(FFCPS_DIFF1) 205 | vect_t prev_data = data; 206 | # endif 207 | 208 | vect_t data2; 209 | if (p1 < str_ptr) 210 | { 211 | data2 = VLD1Q(str_ptr - diff); 212 | #if PCRE2_CODE_UNIT_WIDTH != 8 213 | data2 = VANDQ(data2, char_mask); 214 | #endif 215 | } 216 | else 217 | data2 = shift_left_n_lanes(data, offs1 - offs2); 218 | 219 | if (compare1_type == compare_match1) 220 | data = VCEQQ(data, cmp1a); 221 | else 222 | data = fast_forward_char_pair_compare(compare1_type, data, cmp1a, cmp1b); 223 | 224 | if (compare2_type == compare_match1) 225 | data2 = VCEQQ(data2, cmp2a); 226 | else 227 | data2 = fast_forward_char_pair_compare(compare2_type, data2, cmp2a, cmp2b); 228 | 229 | vect_t eq = VANDQ(data, data2); 230 | #endif 231 | 232 | VST1Q(qw.mem, eq); 233 | /* Ignore matches before the first STR_PTR. 
*/ 234 | if (align_offset < 8) 235 | { 236 | qw.dw[0] >>= align_offset * 8; 237 | if (qw.dw[0]) 238 | { 239 | str_ptr += align_offset + __builtin_ctzll(qw.dw[0]) / 8; 240 | goto match; 241 | } 242 | if (qw.dw[1]) 243 | { 244 | str_ptr += 8 + __builtin_ctzll(qw.dw[1]) / 8; 245 | goto match; 246 | } 247 | } 248 | else 249 | { 250 | qw.dw[1] >>= (align_offset - 8) * 8; 251 | if (qw.dw[1]) 252 | { 253 | str_ptr += align_offset + __builtin_ctzll(qw.dw[1]) / 8; 254 | goto match; 255 | } 256 | } 257 | str_ptr += 16; 258 | /* Main loop: str_ptr is now 16-byte aligned and advances one whole vector per iteration in which nothing is found. */ 259 | while (str_ptr < str_end) 260 | { 261 | vect_t orig_data = VLD1Q(str_ptr); 262 | #if PCRE2_CODE_UNIT_WIDTH != 8 263 | orig_data = VANDQ(orig_data, char_mask); 264 | #endif 265 | data = orig_data; 266 | 267 | #if defined(FFCS) 268 | eq = VCEQQ(data, vc1); 269 | 270 | #elif defined(FFCS_2) 271 | eq1 = VCEQQ(data, vc1); 272 | eq2 = VCEQQ(data, vc2); 273 | eq = VORRQ(eq1, eq2); 274 | 275 | #elif defined(FFCS_MASK) 276 | eq = VORRQ(data, vmask); 277 | eq = VCEQQ(eq, vc1); 278 | #endif 279 | 280 | #if defined(FFCPS) 281 | # if defined (FFCPS_DIFF1) 282 | data2 = VEXTQ(prev_data, data, VECTOR_FACTOR - 1); 283 | # else 284 | data2 = VLD1Q(str_ptr - diff); 285 | # if PCRE2_CODE_UNIT_WIDTH != 8 286 | data2 = VANDQ(data2, char_mask); 287 | # endif 288 | # endif 289 | 290 | # ifdef FFCPS_CHAR1A2A 291 | data = VCEQQ(data, cmp1a); 292 | data2 = VCEQQ(data2, cmp2a); 293 | # else 294 | if (compare1_type == compare_match1) 295 | data = VCEQQ(data, cmp1a); 296 | else 297 | data = fast_forward_char_pair_compare(compare1_type, data, cmp1a, cmp1b); 298 | if (compare2_type == compare_match1) 299 | data2 = VCEQQ(data2, cmp2a); 300 | else 301 | data2 = fast_forward_char_pair_compare(compare2_type, data2, cmp2a, cmp2b); 302 | # endif 303 | 304 | eq = VANDQ(data, data2); 305 | #endif 306 | 307 | VST1Q(qw.mem, eq); 308 | if (qw.dw[0]) 309 | str_ptr += __builtin_ctzll(qw.dw[0]) / 8; 310 | else if (qw.dw[1]) 311 | str_ptr += 8 + __builtin_ctzll(qw.dw[1]) / 8; 312 | else { 313 
| str_ptr += 16; 314 | #if defined (FFCPS_DIFF1) 315 | prev_data = orig_data; 316 | #endif 317 | continue; 318 | } 319 | /* This label sits inside the loop body: it is reached by goto from the first-vector checks above and by falling through after a hit in the loop. */ 320 | match:; 321 | if (str_ptr >= str_end) 322 | /* Failed match. */ 323 | return NULL; 324 | 325 | #if defined(FF_UTF) 326 | if (utf_continue(str_ptr + IN_UCHARS(-offs1))) 327 | { 328 | /* Not a match. */ 329 | str_ptr += IN_UCHARS(1); 330 | goto restart; 331 | } 332 | #endif 333 | 334 | /* Match. */ 335 | #if defined (FFCPS) 336 | str_ptr -= IN_UCHARS(offs1); 337 | #endif 338 | return str_ptr; 339 | } 340 | 341 | /* Failed match. */ 342 | return NULL; 343 | } 344 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2_maketables.c: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE is a library of functions to support regular expressions whose syntax 6 | and semantics are as close as possible to those of the Perl 5 language. 7 | 8 | Written by Philip Hazel 9 | Original API code Copyright (c) 1997-2012 University of Cambridge 10 | New API code Copyright (c) 2016-2020 University of Cambridge 11 | 12 | ----------------------------------------------------------------------------- 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | * Redistributions of source code must retain the above copyright notice, 17 | this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution.
22 | 23 | * Neither the name of the University of Cambridge nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 | POSSIBILITY OF SUCH DAMAGE. 38 | ----------------------------------------------------------------------------- 39 | */ 40 | 41 | 42 | /* This module contains the external function bundled_pcre2_maketables(), which builds 43 | character tables for PCRE2 in the current locale. The file is compiled on its 44 | own as part of the PCRE2 library. It is also included in the compilation of 45 | pcre2_dftables.c as a freestanding program, in which case the macro 46 | PCRE2_DFTABLES is defined. */ 47 | 48 | #ifndef PCRE2_DFTABLES /* Compiling the library */ 49 | # ifdef HAVE_CONFIG_H 50 | # include "config.h" 51 | # endif 52 | # include "pcre2_internal.h" 53 | #endif 54 | 55 | 56 | 57 | /************************************************* 58 | * Create PCRE2 character tables * 59 | *************************************************/ 60 | 61 | /* This function builds a set of character tables for use by PCRE2 and returns 62 | a pointer to them. 
They are build using the ctype functions, and consequently 63 | their contents will depend upon the current locale setting. When compiled as 64 | part of the library, the store is obtained via a general context malloc, if 65 | supplied, but when PCRE2_DFTABLES is defined (when compiling the pcre2_dftables 66 | freestanding auxiliary program) malloc() is used, and the function has a 67 | different name so as not to clash with the prototype in pcre2.h. 68 | 69 | Arguments: none when PCRE2_DFTABLES is defined 70 | else a PCRE2 general context or NULL 71 | Returns: pointer to the contiguous block of data 72 | else NULL if memory allocation failed 73 | */ 74 | 75 | #ifdef PCRE2_DFTABLES /* Included in freestanding pcre2_dftables program */ 76 | static const uint8_t *maketables(void) 77 | { 78 | uint8_t *yield = (uint8_t *)malloc(TABLES_LENGTH); 79 | 80 | #else /* Not PCRE2_DFTABLES, that is, compiling the library */ 81 | PCRE2_EXP_DEFN const uint8_t * PCRE2_CALL_CONVENTION 82 | bundled_pcre2_maketables(pcre2_general_context *gcontext) 83 | { 84 | uint8_t *yield = (uint8_t *)((gcontext != NULL)? 85 | gcontext->memctl.malloc(TABLES_LENGTH, gcontext->memctl.memory_data) : 86 | malloc(TABLES_LENGTH)); 87 | #endif /* PCRE2_DFTABLES */ 88 | 89 | int i; 90 | uint8_t *p; 91 | 92 | if (yield == NULL) return NULL; 93 | p = yield; 94 | 95 | /* First comes the lower casing table */ 96 | 97 | for (i = 0; i < 256; i++) *p++ = tolower(i); 98 | 99 | /* Next the case-flipping table */ 100 | 101 | for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i); 102 | 103 | /* Then the character class tables. Don't try to be clever and save effort on 104 | exclusive ones - in some locales things may be different. 105 | 106 | Note that the table for "space" includes everything "isspace" gives, including 107 | VT in the default locale. This makes it work for the POSIX class [:space:]. 
108 | From PCRE1 release 8.34 and for all PCRE2 releases it is also correct for Perl 109 | space, because Perl added VT at release 5.18. 110 | 111 | Note also that it is possible for a character to be alnum or alpha without 112 | being lower or upper, such as "male and female ordinals" (\xAA and \xBA) in the 113 | fr_FR locale (at least under Debian Linux's locales as of 12/2005). So we must 114 | test for alnum specially. */ 115 | 116 | memset(p, 0, cbit_length); 117 | for (i = 0; i < 256; i++) 118 | { 119 | if (isdigit(i)) p[cbit_digit + i/8] |= 1u << (i&7); 120 | if (isupper(i)) p[cbit_upper + i/8] |= 1u << (i&7); 121 | if (islower(i)) p[cbit_lower + i/8] |= 1u << (i&7); 122 | if (isalnum(i)) p[cbit_word + i/8] |= 1u << (i&7); 123 | if (i == '_') p[cbit_word + i/8] |= 1u << (i&7); 124 | if (isspace(i)) p[cbit_space + i/8] |= 1u << (i&7); 125 | if (isxdigit(i)) p[cbit_xdigit + i/8] |= 1u << (i&7); 126 | if (isgraph(i)) p[cbit_graph + i/8] |= 1u << (i&7); 127 | if (isprint(i)) p[cbit_print + i/8] |= 1u << (i&7); 128 | if (ispunct(i)) p[cbit_punct + i/8] |= 1u << (i&7); 129 | if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1u << (i&7); 130 | } 131 | p += cbit_length; 132 | 133 | /* Finally, the character type table. In this, we used to exclude VT from the 134 | white space chars, because Perl didn't recognize it as such for \s and for 135 | comments within regexes. However, Perl changed at release 5.18, so PCRE1 136 | changed at release 8.34 and it's always been this way for PCRE2. 
*/ 137 | 138 | for (i = 0; i < 256; i++) 139 | { 140 | int x = 0; 141 | if (isspace(i)) x += ctype_space; 142 | if (isalpha(i)) x += ctype_letter; 143 | if (islower(i)) x += ctype_lcletter; 144 | if (isdigit(i)) x += ctype_digit; 145 | if (isalnum(i) || i == '_') x += ctype_word; 146 | *p++ = x; 147 | } 148 | 149 | return yield; 150 | } 151 | 152 | #ifndef PCRE2_DFTABLES /* Compiling the library */ 153 | PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION 154 | bundled_pcre2_maketables_free(pcre2_general_context *gcontext, const uint8_t *tables) 155 | { 156 | if (gcontext) 157 | gcontext->memctl.free((void *)tables, gcontext->memctl.memory_data); 158 | else 159 | free((void *)tables); 160 | } 161 | #endif 162 | 163 | /* End of pcre2_maketables.c */ 164 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2_match_data.c: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE is a library of functions to support regular expressions whose syntax 6 | and semantics are as close as possible to those of the Perl 5 language. 7 | 8 | Written by Philip Hazel 9 | Original API code Copyright (c) 1997-2012 University of Cambridge 10 | New API code Copyright (c) 2016-2019 University of Cambridge 11 | 12 | ----------------------------------------------------------------------------- 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | * Redistributions of source code must retain the above copyright notice, 17 | this list of conditions and the following disclaimer. 
18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of the University of Cambridge nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 | POSSIBILITY OF SUCH DAMAGE. 38 | ----------------------------------------------------------------------------- 39 | */ 40 | 41 | 42 | #ifdef HAVE_CONFIG_H 43 | #include "config.h" 44 | #endif 45 | 46 | #include "pcre2_internal.h" 47 | 48 | 49 | 50 | /************************************************* 51 | * Create a match data block given ovector size * 52 | *************************************************/ 53 | 54 | /* A minimum of 1 is imposed on the number of ovector pairs. 
*/ 55 | 56 | PCRE2_EXP_DEFN pcre2_match_data * PCRE2_CALL_CONVENTION 57 | bundled_pcre2_match_data_create(uint32_t oveccount, pcre2_general_context *gcontext) 58 | { 59 | pcre2_match_data *yield; 60 | if (oveccount < 1) oveccount = 1; 61 | yield = PRIV(memctl_malloc)( 62 | offsetof(pcre2_match_data, ovector) + 2*oveccount*sizeof(PCRE2_SIZE), 63 | (pcre2_memctl *)gcontext); 64 | if (yield == NULL) return NULL; 65 | yield->oveccount = oveccount; 66 | yield->flags = 0; 67 | return yield; 68 | } 69 | 70 | 71 | 72 | /************************************************* 73 | * Create a match data block using pattern data * 74 | *************************************************/ 75 | 76 | /* If no context is supplied, use the memory allocator from the code. */ 77 | 78 | PCRE2_EXP_DEFN pcre2_match_data * PCRE2_CALL_CONVENTION 79 | bundled_pcre2_match_data_create_from_pattern(const pcre2_code *code, 80 | pcre2_general_context *gcontext) 81 | { 82 | if (gcontext == NULL) gcontext = (pcre2_general_context *)code; 83 | return bundled_pcre2_match_data_create(((pcre2_real_code *)code)->top_bracket + 1, 84 | gcontext); 85 | } 86 | 87 | 88 | 89 | /************************************************* 90 | * Free a match data block * 91 | *************************************************/ 92 | 93 | PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION 94 | bundled_pcre2_match_data_free(pcre2_match_data *match_data) 95 | { 96 | if (match_data != NULL) 97 | { 98 | if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) 99 | match_data->memctl.free((void *)match_data->subject, 100 | match_data->memctl.memory_data); 101 | match_data->memctl.free(match_data, match_data->memctl.memory_data); 102 | } 103 | } 104 | 105 | 106 | 107 | /************************************************* 108 | * Get last mark in match * 109 | *************************************************/ 110 | 111 | PCRE2_EXP_DEFN PCRE2_SPTR PCRE2_CALL_CONVENTION 112 | bundled_pcre2_get_mark(pcre2_match_data *match_data) 113 | { 114 | return 
match_data->mark; 115 | } 116 | 117 | 118 | 119 | /************************************************* 120 | * Get pointer to ovector * 121 | *************************************************/ 122 | 123 | PCRE2_EXP_DEFN PCRE2_SIZE * PCRE2_CALL_CONVENTION 124 | bundled_pcre2_get_ovector_pointer(pcre2_match_data *match_data) 125 | { 126 | return match_data->ovector; 127 | } 128 | 129 | 130 | 131 | /************************************************* 132 | * Get number of ovector slots * 133 | *************************************************/ 134 | 135 | PCRE2_EXP_DEFN uint32_t PCRE2_CALL_CONVENTION 136 | bundled_pcre2_get_ovector_count(pcre2_match_data *match_data) 137 | { 138 | return match_data->oveccount; 139 | } 140 | 141 | 142 | 143 | /************************************************* 144 | * Get starting code unit in match * 145 | *************************************************/ 146 | 147 | PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION 148 | bundled_pcre2_get_startchar(pcre2_match_data *match_data) 149 | { 150 | return match_data->startchar; 151 | } 152 | 153 | 154 | 155 | /************************************************* 156 | * Get size of match data block * 157 | *************************************************/ 158 | 159 | PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION 160 | bundled_pcre2_get_match_data_size(pcre2_match_data *match_data) 161 | { 162 | return offsetof(pcre2_match_data, ovector) + 163 | 2 * (match_data->oveccount) * sizeof(PCRE2_SIZE); 164 | } 165 | 166 | /* End of pcre2_match_data.c */ 167 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2_newline.c: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE is a library of functions to support regular expressions whose syntax 6 | and semantics are as 
close as possible to those of the Perl 5 language. 7 | 8 | Written by Philip Hazel 9 | Original API code Copyright (c) 1997-2012 University of Cambridge 10 | New API code Copyright (c) 2016 University of Cambridge 11 | 12 | ----------------------------------------------------------------------------- 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | * Redistributions of source code must retain the above copyright notice, 17 | this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of the University of Cambridge nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 | POSSIBILITY OF SUCH DAMAGE. 
38 | ----------------------------------------------------------------------------- 39 | */ 40 | 41 | 42 | /* This module contains internal functions for testing newlines when more than 43 | one kind of newline is to be recognized. When a newline is found, its length is 44 | returned. In principle, we could implement several newline "types", each 45 | referring to a different set of newline characters. At present, PCRE2 supports 46 | only NLTYPE_FIXED, which gets handled without these functions, NLTYPE_ANYCRLF, 47 | and NLTYPE_ANY. The full list of Unicode newline characters is taken from 48 | http://unicode.org/unicode/reports/tr18/. */ 49 | 50 | 51 | #ifdef HAVE_CONFIG_H 52 | #include "config.h" 53 | #endif 54 | 55 | #include "pcre2_internal.h" 56 | 57 | 58 | 59 | /************************************************* 60 | * Check for newline at given position * 61 | *************************************************/ 62 | 63 | /* This function is called only via the IS_NEWLINE macro, which does so only 64 | when the newline type is NLTYPE_ANY or NLTYPE_ANYCRLF. The case of a fixed 65 | newline (NLTYPE_FIXED) is handled inline. It is guaranteed that the code unit 66 | pointed to by ptr is less than the end of the string. 67 | 68 | Arguments: 69 | ptr pointer to possible newline 70 | type the newline type 71 | endptr pointer to the end of the string 72 | lenptr where to return the length 73 | utf TRUE if in utf mode 74 | 75 | Returns: TRUE or FALSE 76 | */ 77 | 78 | BOOL 79 | PRIV(is_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR endptr, 80 | uint32_t *lenptr, BOOL utf) 81 | { 82 | uint32_t c; 83 | 84 | #ifdef SUPPORT_UNICODE 85 | if (utf) { GETCHAR(c, ptr); } else c = *ptr; 86 | #else 87 | (void)utf; 88 | c = *ptr; 89 | #endif /* SUPPORT_UNICODE */ 90 | 91 | if (type == NLTYPE_ANYCRLF) switch(c) 92 | { 93 | case CHAR_LF: 94 | *lenptr = 1; 95 | return TRUE; 96 | 97 | case CHAR_CR: 98 | *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 
2 : 1; 99 | return TRUE; 100 | 101 | default: 102 | return FALSE; 103 | } 104 | 105 | /* NLTYPE_ANY */ 106 | 107 | else switch(c) 108 | { 109 | #ifdef EBCDIC 110 | case CHAR_NEL: 111 | #endif 112 | case CHAR_LF: 113 | case CHAR_VT: 114 | case CHAR_FF: 115 | *lenptr = 1; 116 | return TRUE; 117 | 118 | case CHAR_CR: 119 | *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1; 120 | return TRUE; 121 | 122 | #ifndef EBCDIC 123 | #if PCRE2_CODE_UNIT_WIDTH == 8 124 | case CHAR_NEL: 125 | *lenptr = utf? 2 : 1; 126 | return TRUE; 127 | 128 | case 0x2028: /* LS */ 129 | case 0x2029: /* PS */ 130 | *lenptr = 3; 131 | return TRUE; 132 | 133 | #else /* 16-bit or 32-bit code units */ 134 | case CHAR_NEL: 135 | case 0x2028: /* LS */ 136 | case 0x2029: /* PS */ 137 | *lenptr = 1; 138 | return TRUE; 139 | #endif 140 | #endif /* Not EBCDIC */ 141 | 142 | default: 143 | return FALSE; 144 | } 145 | } 146 | 147 | 148 | 149 | /************************************************* 150 | * Check for newline at previous position * 151 | *************************************************/ 152 | 153 | /* This function is called only via the WAS_NEWLINE macro, which does so only 154 | when the newline type is NLTYPE_ANY or NLTYPE_ANYCRLF. The case of a fixed 155 | newline (NLTYPE_FIXED) is handled inline. It is guaranteed that the initial 156 | value of ptr is greater than the start of the string that is being processed. 
157 | 158 | Arguments: 159 | ptr pointer to possible newline 160 | type the newline type 161 | startptr pointer to the start of the string 162 | lenptr where to return the length 163 | utf TRUE if in utf mode 164 | 165 | Returns: TRUE or FALSE 166 | */ 167 | 168 | BOOL 169 | PRIV(was_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR startptr, 170 | uint32_t *lenptr, BOOL utf) 171 | { 172 | uint32_t c; 173 | ptr--; 174 | 175 | #ifdef SUPPORT_UNICODE 176 | if (utf) 177 | { 178 | BACKCHAR(ptr); 179 | GETCHAR(c, ptr); 180 | } 181 | else c = *ptr; 182 | #else 183 | (void)utf; 184 | c = *ptr; 185 | #endif /* SUPPORT_UNICODE */ 186 | 187 | if (type == NLTYPE_ANYCRLF) switch(c) 188 | { 189 | case CHAR_LF: 190 | *lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1; 191 | return TRUE; 192 | 193 | case CHAR_CR: 194 | *lenptr = 1; 195 | return TRUE; 196 | 197 | default: 198 | return FALSE; 199 | } 200 | 201 | /* NLTYPE_ANY */ 202 | 203 | else switch(c) 204 | { 205 | case CHAR_LF: 206 | *lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1; 207 | return TRUE; 208 | 209 | #ifdef EBCDIC 210 | case CHAR_NEL: 211 | #endif 212 | case CHAR_VT: 213 | case CHAR_FF: 214 | case CHAR_CR: 215 | *lenptr = 1; 216 | return TRUE; 217 | 218 | #ifndef EBCDIC 219 | #if PCRE2_CODE_UNIT_WIDTH == 8 220 | case CHAR_NEL: 221 | *lenptr = utf? 
2 : 1; 222 | return TRUE; 223 | 224 | case 0x2028: /* LS */ 225 | case 0x2029: /* PS */ 226 | *lenptr = 3; 227 | return TRUE; 228 | 229 | #else /* 16-bit or 32-bit code units */ 230 | case CHAR_NEL: 231 | case 0x2028: /* LS */ 232 | case 0x2029: /* PS */ 233 | *lenptr = 1; 234 | return TRUE; 235 | #endif 236 | #endif /* Not EBCDIC */ 237 | 238 | default: 239 | return FALSE; 240 | } 241 | } 242 | 243 | /* End of pcre2_newline.c */ 244 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2_ord2utf.c: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE is a library of functions to support regular expressions whose syntax 6 | and semantics are as close as possible to those of the Perl 5 language. 7 | 8 | Written by Philip Hazel 9 | Original API code Copyright (c) 1997-2012 University of Cambridge 10 | New API code Copyright (c) 2016 University of Cambridge 11 | 12 | ----------------------------------------------------------------------------- 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | * Redistributions of source code must retain the above copyright notice, 17 | this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of the University of Cambridge nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 
26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 | POSSIBILITY OF SUCH DAMAGE. 38 | ----------------------------------------------------------------------------- 39 | */ 40 | 41 | 42 | /* This file contains a function that converts a Unicode character code point 43 | into a UTF string. The behaviour is different for each code unit width. */ 44 | 45 | 46 | #ifdef HAVE_CONFIG_H 47 | #include "config.h" 48 | #endif 49 | 50 | #include "pcre2_internal.h" 51 | 52 | 53 | /* If SUPPORT_UNICODE is not defined, this function will never be called. 54 | Supply a dummy function because some compilers do not like empty source 55 | modules. 
*/ 56 | 57 | #ifndef SUPPORT_UNICODE 58 | unsigned int 59 | PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer) 60 | { 61 | (void)(cvalue); 62 | (void)(buffer); 63 | return 0; 64 | } 65 | #else /* SUPPORT_UNICODE */ 66 | 67 | 68 | /************************************************* 69 | * Convert code point to UTF * 70 | *************************************************/ 71 | 72 | /* 73 | Arguments: 74 | cvalue the character value 75 | buffer pointer to buffer for result 76 | 77 | Returns: number of code units placed in the buffer 78 | */ 79 | 80 | unsigned int 81 | PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer) 82 | { 83 | /* Convert to UTF-8 */ 84 | 85 | #if PCRE2_CODE_UNIT_WIDTH == 8 86 | int i, j; 87 | for (i = 0; i < PRIV(utf8_table1_size); i++) 88 | if ((int)cvalue <= PRIV(utf8_table1)[i]) break; 89 | buffer += i; 90 | for (j = i; j > 0; j--) 91 | { 92 | *buffer-- = 0x80 | (cvalue & 0x3f); 93 | cvalue >>= 6; 94 | } 95 | *buffer = PRIV(utf8_table2)[i] | cvalue; 96 | return i + 1; 97 | 98 | /* Convert to UTF-16 */ 99 | 100 | #elif PCRE2_CODE_UNIT_WIDTH == 16 101 | if (cvalue <= 0xffff) 102 | { 103 | *buffer = (PCRE2_UCHAR)cvalue; 104 | return 1; 105 | } 106 | cvalue -= 0x10000; 107 | *buffer++ = 0xd800 | (cvalue >> 10); 108 | *buffer = 0xdc00 | (cvalue & 0x3ff); 109 | return 2; 110 | 111 | /* Convert to UTF-32 */ 112 | 113 | #else 114 | *buffer = (PCRE2_UCHAR)cvalue; 115 | return 1; 116 | #endif 117 | } 118 | #endif /* SUPPORT_UNICODE */ 119 | 120 | /* End of pcre_ord2utf.c */ 121 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2_serialize.c: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE is a library of functions to support regular expressions whose syntax 6 | and semantics are as close as possible to those 
of the Perl 5 language. 7 | 8 | Written by Philip Hazel 9 | Original API code Copyright (c) 1997-2012 University of Cambridge 10 | New API code Copyright (c) 2016-2020 University of Cambridge 11 | 12 | ----------------------------------------------------------------------------- 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | * Redistributions of source code must retain the above copyright notice, 17 | this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of the University of Cambridge nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 | POSSIBILITY OF SUCH DAMAGE. 
38 | ----------------------------------------------------------------------------- 39 | */ 40 | 41 | /* This module contains functions for serializing and deserializing 42 | a sequence of compiled codes. */ 43 | 44 | 45 | #ifdef HAVE_CONFIG_H 46 | #include "config.h" 47 | #endif 48 | 49 | 50 | #include "pcre2_internal.h" 51 | 52 | /* Magic number to provide a small check against being handed junk. */ 53 | 54 | #define SERIALIZED_DATA_MAGIC 0x50523253u 55 | 56 | /* Deserialization is limited to the current PCRE version and 57 | character width. */ 58 | 59 | #define SERIALIZED_DATA_VERSION \ 60 | ((PCRE2_MAJOR) | ((PCRE2_MINOR) << 16)) 61 | 62 | #define SERIALIZED_DATA_CONFIG \ 63 | (sizeof(PCRE2_UCHAR) | ((sizeof(void*)) << 8) | ((sizeof(PCRE2_SIZE)) << 16)) 64 | 65 | 66 | 67 | /************************************************* 68 | * Serialize compiled patterns * 69 | *************************************************/ 70 | 71 | PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION 72 | bundled_pcre2_serialize_encode(const pcre2_code **codes, int32_t number_of_codes, 73 | uint8_t **serialized_bytes, PCRE2_SIZE *serialized_size, 74 | pcre2_general_context *gcontext) 75 | { 76 | uint8_t *bytes; 77 | uint8_t *dst_bytes; 78 | int32_t i; 79 | PCRE2_SIZE total_size; 80 | const pcre2_real_code *re; 81 | const uint8_t *tables; 82 | pcre2_serialized_data *data; 83 | 84 | const pcre2_memctl *memctl = (gcontext != NULL) ? 85 | &gcontext->memctl : &PRIV(default_compile_context).memctl; 86 | 87 | if (codes == NULL || serialized_bytes == NULL || serialized_size == NULL) 88 | return PCRE2_ERROR_NULL; 89 | 90 | if (number_of_codes <= 0) return PCRE2_ERROR_BADDATA; 91 | 92 | /* Compute total size. 
*/ 93 | total_size = sizeof(pcre2_serialized_data) + TABLES_LENGTH; 94 | tables = NULL; 95 | 96 | for (i = 0; i < number_of_codes; i++) 97 | { 98 | if (codes[i] == NULL) return PCRE2_ERROR_NULL; 99 | re = (const pcre2_real_code *)(codes[i]); 100 | if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; 101 | if (tables == NULL) 102 | tables = re->tables; 103 | else if (tables != re->tables) 104 | return PCRE2_ERROR_MIXEDTABLES; 105 | total_size += re->blocksize; 106 | } 107 | 108 | /* Initialize the byte stream. */ 109 | bytes = memctl->malloc(total_size + sizeof(pcre2_memctl), memctl->memory_data); 110 | if (bytes == NULL) return PCRE2_ERROR_NOMEMORY; 111 | 112 | /* The controller is stored as a hidden parameter. */ 113 | memcpy(bytes, memctl, sizeof(pcre2_memctl)); 114 | bytes += sizeof(pcre2_memctl); 115 | 116 | data = (pcre2_serialized_data *)bytes; 117 | data->magic = SERIALIZED_DATA_MAGIC; 118 | data->version = SERIALIZED_DATA_VERSION; 119 | data->config = SERIALIZED_DATA_CONFIG; 120 | data->number_of_codes = number_of_codes; 121 | 122 | /* Copy all compiled code data. */ 123 | dst_bytes = bytes + sizeof(pcre2_serialized_data); 124 | memcpy(dst_bytes, tables, TABLES_LENGTH); 125 | dst_bytes += TABLES_LENGTH; 126 | 127 | for (i = 0; i < number_of_codes; i++) 128 | { 129 | re = (const pcre2_real_code *)(codes[i]); 130 | (void)memcpy(dst_bytes, (char *)re, re->blocksize); 131 | 132 | /* Certain fields in the compiled code block are re-set during 133 | deserialization. In order to ensure that the serialized data stream is always 134 | the same for the same pattern, set them to zero here. We can't assume the 135 | copy of the pattern is correctly aligned for accessing the fields as part of 136 | a structure. Note the use of sizeof(void *) in the second of these, to 137 | specify the size of a pointer. If sizeof(uint8_t *) is used (tables is a 138 | pointer to uint8_t), gcc gives a warning because the first argument is also a 139 | pointer to uint8_t. 
Casting the first argument to (void *) can stop this, but 140 | it didn't stop Coverity giving the same complaint. */ 141 | 142 | (void)memset(dst_bytes + offsetof(pcre2_real_code, memctl), 0, 143 | sizeof(pcre2_memctl)); 144 | (void)memset(dst_bytes + offsetof(pcre2_real_code, tables), 0, 145 | sizeof(void *)); 146 | (void)memset(dst_bytes + offsetof(pcre2_real_code, executable_jit), 0, 147 | sizeof(void *)); 148 | 149 | dst_bytes += re->blocksize; 150 | } 151 | 152 | *serialized_bytes = bytes; 153 | *serialized_size = total_size; 154 | return number_of_codes; 155 | } 156 | 157 | 158 | /************************************************* 159 | * Deserialize compiled patterns * 160 | *************************************************/ 161 | 162 | PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION 163 | bundled_pcre2_serialize_decode(pcre2_code **codes, int32_t number_of_codes, 164 | const uint8_t *bytes, pcre2_general_context *gcontext) 165 | { 166 | const pcre2_serialized_data *data = (const pcre2_serialized_data *)bytes; 167 | const pcre2_memctl *memctl = (gcontext != NULL) ? 168 | &gcontext->memctl : &PRIV(default_compile_context).memctl; 169 | 170 | const uint8_t *src_bytes; 171 | pcre2_real_code *dst_re; 172 | uint8_t *tables; 173 | int32_t i, j; 174 | 175 | /* Sanity checks. */ 176 | 177 | if (data == NULL || codes == NULL) return PCRE2_ERROR_NULL; 178 | if (number_of_codes <= 0) return PCRE2_ERROR_BADDATA; 179 | if (data->number_of_codes <= 0) return PCRE2_ERROR_BADSERIALIZEDDATA; 180 | if (data->magic != SERIALIZED_DATA_MAGIC) return PCRE2_ERROR_BADMAGIC; 181 | if (data->version != SERIALIZED_DATA_VERSION) return PCRE2_ERROR_BADMODE; 182 | if (data->config != SERIALIZED_DATA_CONFIG) return PCRE2_ERROR_BADMODE; 183 | 184 | if (number_of_codes > data->number_of_codes) 185 | number_of_codes = data->number_of_codes; 186 | 187 | src_bytes = bytes + sizeof(pcre2_serialized_data); 188 | 189 | /* Decode tables. 
The reference count for the tables is stored immediately 190 | following them. */ 191 | 192 | tables = memctl->malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE), memctl->memory_data); 193 | if (tables == NULL) return PCRE2_ERROR_NOMEMORY; 194 | 195 | memcpy(tables, src_bytes, TABLES_LENGTH); 196 | *(PCRE2_SIZE *)(tables + TABLES_LENGTH) = number_of_codes; 197 | src_bytes += TABLES_LENGTH; 198 | 199 | /* Decode the byte stream. We must not try to read the size from the compiled 200 | code block in the stream, because it might be unaligned, which causes errors on 201 | hardware such as Sparc-64 that doesn't like unaligned memory accesses. The type 202 | of the blocksize field is given its own name to ensure that it is the same here 203 | as in the block. */ 204 | 205 | for (i = 0; i < number_of_codes; i++) 206 | { 207 | CODE_BLOCKSIZE_TYPE blocksize; 208 | memcpy(&blocksize, src_bytes + offsetof(pcre2_real_code, blocksize), 209 | sizeof(CODE_BLOCKSIZE_TYPE)); 210 | if (blocksize <= sizeof(pcre2_real_code)) 211 | return PCRE2_ERROR_BADSERIALIZEDDATA; 212 | 213 | /* The allocator provided by gcontext replaces the original one. */ 214 | 215 | dst_re = (pcre2_real_code *)PRIV(memctl_malloc)(blocksize, 216 | (pcre2_memctl *)gcontext); 217 | if (dst_re == NULL) 218 | { 219 | memctl->free(tables, memctl->memory_data); 220 | for (j = 0; j < i; j++) 221 | { 222 | memctl->free(codes[j], memctl->memory_data); 223 | codes[j] = NULL; 224 | } 225 | return PCRE2_ERROR_NOMEMORY; 226 | } 227 | 228 | /* The new allocator must be preserved. 
*/ 229 | 230 | memcpy(((uint8_t *)dst_re) + sizeof(pcre2_memctl), 231 | src_bytes + sizeof(pcre2_memctl), blocksize - sizeof(pcre2_memctl)); 232 | if (dst_re->magic_number != MAGIC_NUMBER || 233 | dst_re->name_entry_size > MAX_NAME_SIZE + IMM2_SIZE + 1 || 234 | dst_re->name_count > MAX_NAME_COUNT) 235 | { 236 | memctl->free(dst_re, memctl->memory_data); 237 | return PCRE2_ERROR_BADSERIALIZEDDATA; 238 | } 239 | 240 | /* At the moment only one table is supported. */ 241 | 242 | dst_re->tables = tables; 243 | dst_re->executable_jit = NULL; 244 | dst_re->flags |= PCRE2_DEREF_TABLES; 245 | 246 | codes[i] = dst_re; 247 | src_bytes += blocksize; 248 | } 249 | 250 | return number_of_codes; 251 | } 252 | 253 | 254 | /************************************************* 255 | * Get the number of serialized patterns * 256 | *************************************************/ 257 | 258 | PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION 259 | bundled_pcre2_serialize_get_number_of_codes(const uint8_t *bytes) 260 | { 261 | const pcre2_serialized_data *data = (const pcre2_serialized_data *)bytes; 262 | 263 | if (data == NULL) return PCRE2_ERROR_NULL; 264 | if (data->magic != SERIALIZED_DATA_MAGIC) return PCRE2_ERROR_BADMAGIC; 265 | if (data->version != SERIALIZED_DATA_VERSION) return PCRE2_ERROR_BADMODE; 266 | if (data->config != SERIALIZED_DATA_CONFIG) return PCRE2_ERROR_BADMODE; 267 | 268 | return data->number_of_codes; 269 | } 270 | 271 | 272 | /************************************************* 273 | * Free the allocated stream * 274 | *************************************************/ 275 | 276 | PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION 277 | bundled_pcre2_serialize_free(uint8_t *bytes) 278 | { 279 | if (bytes != NULL) 280 | { 281 | pcre2_memctl *memctl = (pcre2_memctl *)(bytes - sizeof(pcre2_memctl)); 282 | memctl->free(memctl, memctl->memory_data); 283 | } 284 | } 285 | 286 | /* End of pcre2_serialize.c */ 287 | 
-------------------------------------------------------------------------------- /src/PCRE2/pcre2_string_utils.c: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE is a library of functions to support regular expressions whose syntax 6 | and semantics are as close as possible to those of the Perl 5 language. 7 | 8 | Written by Philip Hazel 9 | Original API code Copyright (c) 1997-2012 University of Cambridge 10 | New API code Copyright (c) 2018 University of Cambridge 11 | 12 | ----------------------------------------------------------------------------- 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | * Redistributions of source code must retain the above copyright notice, 17 | this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of the University of Cambridge nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 | POSSIBILITY OF SUCH DAMAGE. 38 | ----------------------------------------------------------------------------- 39 | */ 40 | 41 | /* This module contains internal functions for comparing and finding the length 42 | of strings. These are used instead of strcmp() etc because the standard 43 | functions work only on 8-bit data. */ 44 | 45 | 46 | #ifdef HAVE_CONFIG_H 47 | #include "config.h" 48 | #endif 49 | 50 | #include "pcre2_internal.h" 51 | 52 | 53 | /************************************************* 54 | * Emulated memmove() for systems without it * 55 | *************************************************/ 56 | 57 | /* This function can make use of bcopy() if it is available. Otherwise do it by 58 | steam, as there some non-Unix environments that lack both memmove() and 59 | bcopy(). 
*/ 60 | 61 | #if !defined(VPCOMPAT) && !defined(HAVE_MEMMOVE) 62 | void * 63 | PRIV(memmove)(void *d, const void *s, size_t n) 64 | { 65 | #ifdef HAVE_BCOPY 66 | bcopy(s, d, n); 67 | return d; 68 | #else 69 | size_t i; 70 | unsigned char *dest = (unsigned char *)d; 71 | const unsigned char *src = (const unsigned char *)s; 72 | if (dest > src) 73 | { 74 | dest += n; 75 | src += n; 76 | for (i = 0; i < n; ++i) *(--dest) = *(--src); 77 | return (void *)dest; 78 | } 79 | else 80 | { 81 | for (i = 0; i < n; ++i) *dest++ = *src++; 82 | return (void *)(dest - n); 83 | } 84 | #endif /* not HAVE_BCOPY */ 85 | } 86 | #endif /* not VPCOMPAT && not HAVE_MEMMOVE */ 87 | 88 | 89 | /************************************************* 90 | * Compare two zero-terminated PCRE2 strings * 91 | *************************************************/ 92 | 93 | /* 94 | Arguments: 95 | str1 first string 96 | str2 second string 97 | 98 | Returns: 0, 1, or -1 99 | */ 100 | 101 | int 102 | PRIV(strcmp)(PCRE2_SPTR str1, PCRE2_SPTR str2) 103 | { 104 | PCRE2_UCHAR c1, c2; 105 | while (*str1 != '\0' || *str2 != '\0') 106 | { 107 | c1 = *str1++; 108 | c2 = *str2++; 109 | if (c1 != c2) return ((c1 > c2) << 1) - 1; 110 | } 111 | return 0; 112 | } 113 | 114 | 115 | /************************************************* 116 | * Compare zero-terminated PCRE2 & 8-bit strings * 117 | *************************************************/ 118 | 119 | /* As the 8-bit string is almost always a literal, its type is specified as 120 | const char *. 
121 | 122 | Arguments: 123 | str1 first string 124 | str2 second string 125 | 126 | Returns: 0, 1, or -1 127 | */ 128 | 129 | int 130 | PRIV(strcmp_c8)(PCRE2_SPTR str1, const char *str2) 131 | { 132 | PCRE2_UCHAR c1, c2; 133 | while (*str1 != '\0' || *str2 != '\0') 134 | { 135 | c1 = *str1++; 136 | c2 = *str2++; 137 | if (c1 != c2) return ((c1 > c2) << 1) - 1; 138 | } 139 | return 0; 140 | } 141 | 142 | 143 | /************************************************* 144 | * Compare two PCRE2 strings, given a length * 145 | *************************************************/ 146 | 147 | /* 148 | Arguments: 149 | str1 first string 150 | str2 second string 151 | len the length 152 | 153 | Returns: 0, 1, or -1 154 | */ 155 | 156 | int 157 | PRIV(strncmp)(PCRE2_SPTR str1, PCRE2_SPTR str2, size_t len) 158 | { 159 | PCRE2_UCHAR c1, c2; 160 | for (; len > 0; len--) 161 | { 162 | c1 = *str1++; 163 | c2 = *str2++; 164 | if (c1 != c2) return ((c1 > c2) << 1) - 1; 165 | } 166 | return 0; 167 | } 168 | 169 | 170 | /************************************************* 171 | * Compare PCRE2 string to 8-bit string by length * 172 | *************************************************/ 173 | 174 | /* As the 8-bit string is almost always a literal, its type is specified as 175 | const char *. 
176 | 177 | Arguments: 178 | str1 first string 179 | str2 second string 180 | len the length 181 | 182 | Returns: 0, 1, or -1 183 | */ 184 | 185 | int 186 | PRIV(strncmp_c8)(PCRE2_SPTR str1, const char *str2, size_t len) 187 | { 188 | PCRE2_UCHAR c1, c2; 189 | for (; len > 0; len--) 190 | { 191 | c1 = *str1++; 192 | c2 = *str2++; 193 | if (c1 != c2) return ((c1 > c2) << 1) - 1; 194 | } 195 | return 0; 196 | } 197 | 198 | 199 | /************************************************* 200 | * Find the length of a PCRE2 string * 201 | *************************************************/ 202 | 203 | /* 204 | Argument: the string 205 | Returns: the length 206 | */ 207 | 208 | PCRE2_SIZE 209 | PRIV(strlen)(PCRE2_SPTR str) 210 | { 211 | PCRE2_SIZE c = 0; 212 | while (*str++ != 0) c++; 213 | return c; 214 | } 215 | 216 | 217 | /************************************************* 218 | * Copy 8-bit 0-terminated string to PCRE2 string * 219 | *************************************************/ 220 | 221 | /* Arguments: 222 | str1 buffer to receive the string 223 | str2 8-bit string to be copied 224 | 225 | Returns: the number of code units used (excluding trailing zero) 226 | */ 227 | 228 | PCRE2_SIZE 229 | PRIV(strcpy_c8)(PCRE2_UCHAR *str1, const char *str2) 230 | { 231 | PCRE2_UCHAR *t = str1; 232 | while (*str2 != 0) *t++ = *str2++; 233 | *t = 0; 234 | return t - str1; 235 | } 236 | 237 | /* End of pcre2_string_utils.c */ 238 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2_ucp.h: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE is a library of functions to support regular expressions whose syntax 6 | and semantics are as close as possible to those of the Perl 5 language. 
7 | 8 | Written by Philip Hazel 9 | Original API code Copyright (c) 1997-2012 University of Cambridge 10 | New API code Copyright (c) 2016-2018 University of Cambridge 11 | 12 | ----------------------------------------------------------------------------- 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | * Redistributions of source code must retain the above copyright notice, 17 | this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of the University of Cambridge nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 | POSSIBILITY OF SUCH DAMAGE. 
38 | ----------------------------------------------------------------------------- 39 | */ 40 | 41 | 42 | #ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD 43 | #define PCRE2_UCP_H_IDEMPOTENT_GUARD 44 | 45 | /* This file contains definitions of the property values that are returned by 46 | the UCD access macros. New values that are added for new releases of Unicode 47 | should always be at the end of each enum, for backwards compatibility. 48 | 49 | IMPORTANT: Note also that the specific numeric values of the enums have to be 50 | the same as the values that are generated by the maint/MultiStage2.py script, 51 | where the equivalent property descriptive names are listed in vectors. 52 | 53 | ALSO: The specific values of the first two enums are assumed for the table 54 | called catposstab in pcre2_compile.c. */ 55 | 56 | /* These are the general character categories. This is the first of the two enums whose values the catposstab note above depends on, so the enumerators must not be reordered. */ 57 | 58 | enum { 59 | ucp_C, /* Other */ 60 | ucp_L, /* Letter */ 61 | ucp_M, /* Mark */ 62 | ucp_N, /* Number */ 63 | ucp_P, /* Punctuation */ 64 | ucp_S, /* Symbol */ 65 | ucp_Z /* Separator */ 66 | }; 67 | 68 | /* These are the particular character categories. This is the second of the two enums whose values the catposstab note above depends on, so the enumerators must not be reordered.
*/ 69 | 70 | enum { 71 | ucp_Cc, /* Control */ 72 | ucp_Cf, /* Format */ 73 | ucp_Cn, /* Unassigned */ 74 | ucp_Co, /* Private use */ 75 | ucp_Cs, /* Surrogate */ 76 | ucp_Ll, /* Lower case letter */ 77 | ucp_Lm, /* Modifier letter */ 78 | ucp_Lo, /* Other letter */ 79 | ucp_Lt, /* Title case letter */ 80 | ucp_Lu, /* Upper case letter */ 81 | ucp_Mc, /* Spacing mark */ 82 | ucp_Me, /* Enclosing mark */ 83 | ucp_Mn, /* Non-spacing mark */ 84 | ucp_Nd, /* Decimal number */ 85 | ucp_Nl, /* Letter number */ 86 | ucp_No, /* Other number */ 87 | ucp_Pc, /* Connector punctuation */ 88 | ucp_Pd, /* Dash punctuation */ 89 | ucp_Pe, /* Close punctuation */ 90 | ucp_Pf, /* Final punctuation */ 91 | ucp_Pi, /* Initial punctuation */ 92 | ucp_Po, /* Other punctuation */ 93 | ucp_Ps, /* Open punctuation */ 94 | ucp_Sc, /* Currency symbol */ 95 | ucp_Sk, /* Modifier symbol */ 96 | ucp_Sm, /* Mathematical symbol */ 97 | ucp_So, /* Other symbol */ 98 | ucp_Zl, /* Line separator */ 99 | ucp_Zp, /* Paragraph separator */ 100 | ucp_Zs /* Space separator */ 101 | }; 102 | 103 | /* These are grapheme break properties. The Extended Pictographic property 104 | comes from the emoji-data.txt file. The number in each trailing comment is the value that enumerator takes from its position in the list. */ 105 | 106 | enum { 107 | ucp_gbCR, /* 0 */ 108 | ucp_gbLF, /* 1 */ 109 | ucp_gbControl, /* 2 */ 110 | ucp_gbExtend, /* 3 */ 111 | ucp_gbPrepend, /* 4 */ 112 | ucp_gbSpacingMark, /* 5 */ 113 | ucp_gbL, /* 6 Hangul syllable type L */ 114 | ucp_gbV, /* 7 Hangul syllable type V */ 115 | ucp_gbT, /* 8 Hangul syllable type T */ 116 | ucp_gbLV, /* 9 Hangul syllable type LV */ 117 | ucp_gbLVT, /* 10 Hangul syllable type LVT */ 118 | ucp_gbRegionalIndicator, /* 11 */ 119 | ucp_gbOther, /* 12 */ 120 | ucp_gbZWJ, /* 13 */ 121 | ucp_gbExtended_Pictographic /* 14 */ 122 | }; 123 | 124 | /* These are the script identifications.
*/ /* ucp_Unknown comes first and therefore has the value 0. The entries after ucp_Yi are grouped under comments naming the Unicode release that introduced them. */ 125 | 126 | enum { 127 | ucp_Unknown, 128 | ucp_Arabic, 129 | ucp_Armenian, 130 | ucp_Bengali, 131 | ucp_Bopomofo, 132 | ucp_Braille, 133 | ucp_Buginese, 134 | ucp_Buhid, 135 | ucp_Canadian_Aboriginal, 136 | ucp_Cherokee, 137 | ucp_Common, 138 | ucp_Coptic, 139 | ucp_Cypriot, 140 | ucp_Cyrillic, 141 | ucp_Deseret, 142 | ucp_Devanagari, 143 | ucp_Ethiopic, 144 | ucp_Georgian, 145 | ucp_Glagolitic, 146 | ucp_Gothic, 147 | ucp_Greek, 148 | ucp_Gujarati, 149 | ucp_Gurmukhi, 150 | ucp_Han, 151 | ucp_Hangul, 152 | ucp_Hanunoo, 153 | ucp_Hebrew, 154 | ucp_Hiragana, 155 | ucp_Inherited, 156 | ucp_Kannada, 157 | ucp_Katakana, 158 | ucp_Kharoshthi, 159 | ucp_Khmer, 160 | ucp_Lao, 161 | ucp_Latin, 162 | ucp_Limbu, 163 | ucp_Linear_B, 164 | ucp_Malayalam, 165 | ucp_Mongolian, 166 | ucp_Myanmar, 167 | ucp_New_Tai_Lue, 168 | ucp_Ogham, 169 | ucp_Old_Italic, 170 | ucp_Old_Persian, 171 | ucp_Oriya, 172 | ucp_Osmanya, 173 | ucp_Runic, 174 | ucp_Shavian, 175 | ucp_Sinhala, 176 | ucp_Syloti_Nagri, 177 | ucp_Syriac, 178 | ucp_Tagalog, 179 | ucp_Tagbanwa, 180 | ucp_Tai_Le, 181 | ucp_Tamil, 182 | ucp_Telugu, 183 | ucp_Thaana, 184 | ucp_Thai, 185 | ucp_Tibetan, 186 | ucp_Tifinagh, 187 | ucp_Ugaritic, 188 | ucp_Yi, 189 | /* New for Unicode 5.0 */ 190 | ucp_Balinese, 191 | ucp_Cuneiform, 192 | ucp_Nko, 193 | ucp_Phags_Pa, 194 | ucp_Phoenician, 195 | /* New for Unicode 5.1 */ 196 | ucp_Carian, 197 | ucp_Cham, 198 | ucp_Kayah_Li, 199 | ucp_Lepcha, 200 | ucp_Lycian, 201 | ucp_Lydian, 202 | ucp_Ol_Chiki, 203 | ucp_Rejang, 204 | ucp_Saurashtra, 205 | ucp_Sundanese, 206 | ucp_Vai, 207 | /* New for Unicode 5.2 */ 208 | ucp_Avestan, 209 | ucp_Bamum, 210 | ucp_Egyptian_Hieroglyphs, 211 | ucp_Imperial_Aramaic, 212 | ucp_Inscriptional_Pahlavi, 213 | ucp_Inscriptional_Parthian, 214 | ucp_Javanese, 215 | ucp_Kaithi, 216 | ucp_Lisu, 217 | ucp_Meetei_Mayek, 218 | ucp_Old_South_Arabian, 219 | ucp_Old_Turkic, 220 | ucp_Samaritan, 221 | ucp_Tai_Tham, 222 | ucp_Tai_Viet, 223 | /* New for Unicode 6.0.0 */
224 | ucp_Batak, 225 | ucp_Brahmi, 226 | ucp_Mandaic, 227 | /* New for Unicode 6.1.0 */ 228 | ucp_Chakma, 229 | ucp_Meroitic_Cursive, 230 | ucp_Meroitic_Hieroglyphs, 231 | ucp_Miao, 232 | ucp_Sharada, 233 | ucp_Sora_Sompeng, 234 | ucp_Takri, 235 | /* New for Unicode 7.0.0 */ 236 | ucp_Bassa_Vah, 237 | ucp_Caucasian_Albanian, 238 | ucp_Duployan, 239 | ucp_Elbasan, 240 | ucp_Grantha, 241 | ucp_Khojki, 242 | ucp_Khudawadi, 243 | ucp_Linear_A, 244 | ucp_Mahajani, 245 | ucp_Manichaean, 246 | ucp_Mende_Kikakui, 247 | ucp_Modi, 248 | ucp_Mro, 249 | ucp_Nabataean, 250 | ucp_Old_North_Arabian, 251 | ucp_Old_Permic, 252 | ucp_Pahawh_Hmong, 253 | ucp_Palmyrene, 254 | ucp_Psalter_Pahlavi, 255 | ucp_Pau_Cin_Hau, 256 | ucp_Siddham, 257 | ucp_Tirhuta, 258 | ucp_Warang_Citi, 259 | /* New for Unicode 8.0.0 */ 260 | ucp_Ahom, 261 | ucp_Anatolian_Hieroglyphs, 262 | ucp_Hatran, 263 | ucp_Multani, 264 | ucp_Old_Hungarian, 265 | ucp_SignWriting, 266 | /* New for Unicode 10.0.0 (no update since 8.0.0) */ 267 | ucp_Adlam, 268 | ucp_Bhaiksuki, 269 | ucp_Marchen, 270 | ucp_Newa, 271 | ucp_Osage, 272 | ucp_Tangut, 273 | ucp_Masaram_Gondi, 274 | ucp_Nushu, 275 | ucp_Soyombo, 276 | ucp_Zanabazar_Square, 277 | /* New for Unicode 11.0.0 */ 278 | ucp_Dogra, 279 | ucp_Gunjala_Gondi, 280 | ucp_Hanifi_Rohingya, 281 | ucp_Makasar, 282 | ucp_Medefaidrin, 283 | ucp_Old_Sogdian, 284 | ucp_Sogdian, 285 | /* New for Unicode 12.0.0 */ 286 | ucp_Elymaic, 287 | ucp_Nandinagari, 288 | ucp_Nyiakeng_Puachue_Hmong, 289 | ucp_Wancho, 290 | /* New for Unicode 13.0.0 */ 291 | ucp_Chorasmian, 292 | ucp_Dives_Akuru, 293 | ucp_Khitan_Small_Script, 294 | ucp_Yezidi 295 | }; 296 | 297 | #endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */ 298 | 299 | /* End of pcre2_ucp.h */ 300 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2_xclass.c: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | *
Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE is a library of functions to support regular expressions whose syntax 6 | and semantics are as close as possible to those of the Perl 5 language. 7 | 8 | Written by Philip Hazel 9 | Original API code Copyright (c) 1997-2012 University of Cambridge 10 | New API code Copyright (c) 2016-2019 University of Cambridge 11 | 12 | ----------------------------------------------------------------------------- 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | * Redistributions of source code must retain the above copyright notice, 17 | this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of the University of Cambridge nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 | POSSIBILITY OF SUCH DAMAGE. 38 | ----------------------------------------------------------------------------- 39 | */ 40 | 41 | /* This module contains an internal function that is used to match an extended 42 | class. It is used by pcre2_auto_possessify() and by both bundled_pcre2_match() and 43 | pcre2_dfa_match(). */ 44 | 45 | 46 | #ifdef HAVE_CONFIG_H 47 | #include "config.h" 48 | #endif 49 | 50 | 51 | #include "pcre2_internal.h" 52 | 53 | /************************************************* 54 | * Match character against an XCLASS * 55 | *************************************************/ 56 | 57 | /* This function is called to match a character against an extended class that 58 | might contain codepoints above 255 and/or Unicode properties. 59 | 60 | Arguments: 61 | c the character 62 | data points to the flag code unit of the XCLASS data 63 | utf TRUE if in UTF mode 64 | 65 | Returns: TRUE if character matches, else FALSE 66 | */ 67 | 68 | BOOL 69 | PRIV(xclass)(uint32_t c, PCRE2_SPTR data, BOOL utf) 70 | { 71 | PCRE2_UCHAR t; 72 | BOOL negated = (*data & XCL_NOT) != 0; 73 | 74 | #if PCRE2_CODE_UNIT_WIDTH == 8 75 | /* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */ 76 | utf = TRUE; 77 | #endif 78 | 79 | /* Code points < 256 are matched against a bitmap, if one is present. If not, 80 | we still carry on, because there may be ranges that start below 256 in the 81 | additional data. 
*/ 82 | 83 | if (c < 256) 84 | { 85 | if ((*data & XCL_HASPROP) == 0) 86 | { 87 | if ((*data & XCL_MAP) == 0) return negated; 88 | return (((uint8_t *)(data + 1))[c/8] & (1u << (c&7))) != 0; 89 | } 90 | if ((*data & XCL_MAP) != 0 && 91 | (((uint8_t *)(data + 1))[c/8] & (1u << (c&7))) != 0) 92 | return !negated; /* char found */ 93 | } 94 | 95 | /* First skip the bit map if present. Then match against the list of Unicode 96 | properties or large chars or ranges that end with a large char. We won't ever 97 | encounter XCL_PROP or XCL_NOTPROP when UTF support is not compiled. */ 98 | 99 | if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(PCRE2_UCHAR); 100 | 101 | while ((t = *data++) != XCL_END) 102 | { 103 | uint32_t x, y; 104 | if (t == XCL_SINGLE) 105 | { 106 | #ifdef SUPPORT_UNICODE 107 | if (utf) 108 | { 109 | GETCHARINC(x, data); /* macro generates multiple statements */ 110 | } 111 | else 112 | #endif 113 | x = *data++; 114 | if (c == x) return !negated; 115 | } 116 | else if (t == XCL_RANGE) 117 | { 118 | #ifdef SUPPORT_UNICODE 119 | if (utf) 120 | { 121 | GETCHARINC(x, data); /* macro generates multiple statements */ 122 | GETCHARINC(y, data); /* macro generates multiple statements */ 123 | } 124 | else 125 | #endif 126 | { 127 | x = *data++; 128 | y = *data++; 129 | } 130 | if (c >= x && c <= y) return !negated; 131 | } 132 | 133 | #ifdef SUPPORT_UNICODE 134 | else /* XCL_PROP & XCL_NOTPROP */ 135 | { 136 | const ucd_record *prop = GET_UCD(c); 137 | BOOL isprop = t == XCL_PROP; 138 | 139 | switch(*data) 140 | { 141 | case PT_ANY: 142 | if (isprop) return !negated; 143 | break; 144 | 145 | case PT_LAMP: 146 | if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 147 | prop->chartype == ucp_Lt) == isprop) return !negated; 148 | break; 149 | 150 | case PT_GC: 151 | if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop) 152 | return !negated; 153 | break; 154 | 155 | case PT_PC: 156 | if ((data[1] == prop->chartype) == isprop) return !negated; 157 
| break; 158 | 159 | case PT_SC: 160 | if ((data[1] == prop->script) == isprop) return !negated; 161 | break; 162 | 163 | case PT_ALNUM: 164 | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || 165 | PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop) 166 | return !negated; 167 | break; 168 | 169 | /* Perl space used to exclude VT, but from Perl 5.18 it is included, 170 | which means that Perl space and POSIX space are now identical. PCRE 171 | was changed at release 8.34. */ 172 | 173 | case PT_SPACE: /* Perl space */ 174 | case PT_PXSPACE: /* POSIX space */ 175 | switch(c) 176 | { 177 | HSPACE_CASES: 178 | VSPACE_CASES: 179 | if (isprop) return !negated; 180 | break; 181 | 182 | default: 183 | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop) 184 | return !negated; 185 | break; 186 | } 187 | break; 188 | 189 | case PT_WORD: 190 | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || 191 | PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) 192 | == isprop) 193 | return !negated; 194 | break; 195 | 196 | case PT_UCNC: 197 | if (c < 0xa0) 198 | { 199 | if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 200 | c == CHAR_GRAVE_ACCENT) == isprop) 201 | return !negated; 202 | } 203 | else 204 | { 205 | if ((c < 0xd800 || c > 0xdfff) == isprop) 206 | return !negated; 207 | } 208 | break; 209 | 210 | /* The following three properties can occur only in an XCLASS, as there 211 | is no \p or \P coding for them. */ 212 | 213 | /* Graphic character. Implement this as not Z (space or separator) and 214 | not C (other), except for Cf (format) with a few exceptions. This seems 215 | to be what Perl does. 
The exceptional characters are: 216 | 217 | U+061C Arabic Letter Mark 218 | U+180E Mongolian Vowel Separator 219 | U+2066 - U+2069 Various "isolate"s 220 | */ 221 | 222 | case PT_PXGRAPH: 223 | if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z && 224 | (PRIV(ucp_gentype)[prop->chartype] != ucp_C || 225 | (prop->chartype == ucp_Cf && 226 | c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069)) 227 | )) == isprop) 228 | return !negated; 229 | break; 230 | 231 | /* Printable character: same as graphic, with the addition of Zs, i.e. 232 | not Zl and not Zp, and U+180E. */ 233 | 234 | case PT_PXPRINT: 235 | if ((prop->chartype != ucp_Zl && 236 | prop->chartype != ucp_Zp && 237 | (PRIV(ucp_gentype)[prop->chartype] != ucp_C || 238 | (prop->chartype == ucp_Cf && 239 | c != 0x061c && (c < 0x2066 || c > 0x2069)) 240 | )) == isprop) 241 | return !negated; 242 | break; 243 | 244 | /* Punctuation: all Unicode punctuation, plus ASCII characters that 245 | Unicode treats as symbols rather than punctuation, for Perl 246 | compatibility (these are $+<=>^`|~). */ 247 | 248 | case PT_PXPUNCT: 249 | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P || 250 | (c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop) 251 | return !negated; 252 | break; 253 | 254 | /* This should never occur, but compilers may mutter if there is no 255 | default. 
*/ 256 | 257 | default: 258 | return FALSE; 259 | } 260 | 261 | data += 2; 262 | } 263 | #else 264 | (void)utf; /* Avoid compiler warning */ 265 | #endif /* SUPPORT_UNICODE */ 266 | } 267 | 268 | return negated; /* char did not match */ 269 | } 270 | 271 | /* End of pcre2_xclass.c */ 272 | -------------------------------------------------------------------------------- /src/PCRE2/pcre2posix.h: -------------------------------------------------------------------------------- 1 | /************************************************* 2 | * Perl-Compatible Regular Expressions * 3 | *************************************************/ 4 | 5 | /* PCRE2 is a library of functions to support regular expressions whose syntax 6 | and semantics are as close as possible to those of the Perl 5 language. This is 7 | the public header file to be #included by applications that call PCRE2 via the 8 | POSIX wrapper interface. 9 | 10 | Written by Philip Hazel 11 | Original API code Copyright (c) 1997-2012 University of Cambridge 12 | New API code Copyright (c) 2016-2019 University of Cambridge 13 | 14 | ----------------------------------------------------------------------------- 15 | Redistribution and use in source and binary forms, with or without 16 | modification, are permitted provided that the following conditions are met: 17 | 18 | * Redistributions of source code must retain the above copyright notice, 19 | this list of conditions and the following disclaimer. 20 | 21 | * Redistributions in binary form must reproduce the above copyright 22 | notice, this list of conditions and the following disclaimer in the 23 | documentation and/or other materials provided with the distribution. 24 | 25 | * Neither the name of the University of Cambridge nor the names of its 26 | contributors may be used to endorse or promote products derived from 27 | this software without specific prior written permission. 
28 | 29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 30 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 31 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 32 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 33 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 34 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 35 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 36 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 37 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 38 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 39 | POSSIBILITY OF SUCH DAMAGE. 40 | ----------------------------------------------------------------------------- 41 | */ 42 | 43 | 44 | /* Have to include stdlib.h in order to ensure that size_t is defined. */ 45 | 46 | #include 47 | 48 | /* Allow for C++ users */ 49 | 50 | #ifdef __cplusplus 51 | extern "C" { 52 | #endif 53 | 54 | /* Options, mostly defined by POSIX, but with some extras. 
*/ 55 | 56 | #define REG_ICASE 0x0001 /* Maps to PCRE2_CASELESS */ 57 | #define REG_NEWLINE 0x0002 /* Maps to PCRE2_MULTILINE */ 58 | #define REG_NOTBOL 0x0004 /* Maps to PCRE2_NOTBOL */ 59 | #define REG_NOTEOL 0x0008 /* Maps to PCRE2_NOTEOL */ 60 | #define REG_DOTALL 0x0010 /* NOT defined by POSIX; maps to PCRE2_DOTALL */ 61 | #define REG_NOSUB 0x0020 /* Do not report what was matched */ 62 | #define REG_UTF 0x0040 /* NOT defined by POSIX; maps to PCRE2_UTF */ 63 | #define REG_STARTEND 0x0080 /* BSD feature: pass subject string by so,eo */ 64 | #define REG_NOTEMPTY 0x0100 /* NOT defined by POSIX; maps to PCRE2_NOTEMPTY */ 65 | #define REG_UNGREEDY 0x0200 /* NOT defined by POSIX; maps to PCRE2_UNGREEDY */ 66 | #define REG_UCP 0x0400 /* NOT defined by POSIX; maps to PCRE2_UCP */ 67 | #define REG_PEND 0x0800 /* GNU feature: pass end pattern by re_endp */ 68 | #define REG_NOSPEC 0x1000 /* Maps to PCRE2_LITERAL */ 69 | 70 | /* This is not used by PCRE2, but by defining it we make it easier 71 | to slot PCRE2 into existing programs that make POSIX calls. */ 72 | 73 | #define REG_EXTENDED 0 74 | 75 | /* Error values. Not all these are relevant or used by the wrapper. */ 76 | 77 | enum { 78 | REG_ASSERT = 1, /* internal error ? */ 79 | REG_BADBR, /* invalid repeat counts in {} */ 80 | REG_BADPAT, /* pattern error */ 81 | REG_BADRPT, /* ? * + invalid */ 82 | REG_EBRACE, /* unbalanced {} */ 83 | REG_EBRACK, /* unbalanced [] */ 84 | REG_ECOLLATE, /* collation error - not relevant */ 85 | REG_ECTYPE, /* bad class */ 86 | REG_EESCAPE, /* bad escape sequence */ 87 | REG_EMPTY, /* empty expression */ 88 | REG_EPAREN, /* unbalanced () */ 89 | REG_ERANGE, /* bad range inside [] */ 90 | REG_ESIZE, /* expression too big */ 91 | REG_ESPACE, /* failed to get memory */ 92 | REG_ESUBREG, /* bad back reference */ 93 | REG_INVARG, /* bad argument */ 94 | REG_NOMATCH /* match failed */ 95 | }; 96 | 97 | 98 | /* The structure representing a compiled regular expression. 
It is also used 99 | for passing the pattern end pointer when REG_PEND is set. */ 100 | 101 | typedef struct { 102 | void *re_pcre2_code; 103 | void *re_match_data; 104 | const char *re_endp; 105 | size_t re_nsub; 106 | size_t re_erroffset; 107 | int re_cflags; 108 | } regex_t; 109 | 110 | /* The structure in which a captured offset is returned. */ 111 | 112 | typedef int regoff_t; 113 | 114 | typedef struct { 115 | regoff_t rm_so; 116 | regoff_t rm_eo; 117 | } regmatch_t; 118 | 119 | /* When an application links to a PCRE2 DLL in Windows, the symbols that are 120 | imported have to be identified as such. When building PCRE2, the appropriate 121 | export settings are needed, and are set in pcre2posix.c before including this 122 | file. */ 123 | 124 | #if defined(_WIN32) && !defined(PCRE2_STATIC) && !defined(PCRE2POSIX_EXP_DECL) 125 | # define PCRE2POSIX_EXP_DECL extern // __declspec(dllimport) 126 | # define PCRE2POSIX_EXP_DEFN // __declspec(dllimport) 127 | #endif 128 | 129 | /* By default, we use the standard "extern" declarations. */ 130 | 131 | #ifndef PCRE2POSIX_EXP_DECL 132 | # ifdef __cplusplus 133 | # define PCRE2POSIX_EXP_DECL extern "C" 134 | # define PCRE2POSIX_EXP_DEFN extern "C" 135 | # else 136 | # define PCRE2POSIX_EXP_DECL extern 137 | # define PCRE2POSIX_EXP_DEFN extern 138 | # endif 139 | #endif 140 | 141 | /* The functions. The actual code is in functions with pcre2_xxx names for 142 | uniqueness. POSIX names are provided as macros for API compatibility with POSIX 143 | regex functions. It's done this way to ensure that they are always linked from 144 | the PCRE2 library and not by accident from elsewhere (regex_t differs in size 145 | elsewhere). 
*/ 146 | 147 | PCRE2POSIX_EXP_DECL int pcre2_regcomp(regex_t *, const char *, int); 148 | PCRE2POSIX_EXP_DECL int pcre2_regexec(const regex_t *, const char *, size_t, 149 | regmatch_t *, int); 150 | PCRE2POSIX_EXP_DECL size_t pcre2_regerror(int, const regex_t *, char *, size_t); 151 | PCRE2POSIX_EXP_DECL void pcre2_regfree(regex_t *); 152 | 153 | #define regcomp pcre2_regcomp 154 | #define regexec pcre2_regexec 155 | #define regerror pcre2_regerror 156 | #define regfree pcre2_regfree 157 | 158 | /* Debian had a patch that used different names. These are now here to save 159 | them having to maintain their own patch, but are not documented by PCRE2. */ 160 | 161 | #define PCRE2regcomp pcre2_regcomp 162 | #define PCRE2regexec pcre2_regexec 163 | #define PCRE2regerror pcre2_regerror 164 | #define PCRE2regfree pcre2_regfree 165 | 166 | #ifdef __cplusplus 167 | } /* extern "C" */ 168 | #endif 169 | 170 | /* End of pcre2posix.h */ 171 | -------------------------------------------------------------------------------- /src/PCRE2/sljit/sljitConfig.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Stack-less Just-In-Time compiler 3 | * 4 | * Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without modification, are 7 | * permitted provided that the following conditions are met: 8 | * 9 | * 1. Redistributions of source code must retain the above copyright notice, this list of 10 | * conditions and the following disclaimer. 11 | * 12 | * 2. Redistributions in binary form must reproduce the above copyright notice, this list 13 | * of conditions and the following disclaimer in the documentation and/or other materials 14 | * provided with the distribution. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY 17 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 19 | * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 20 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 21 | * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 22 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 24 | * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | */ 26 | 27 | #ifndef _SLJIT_CONFIG_H_ 28 | #define _SLJIT_CONFIG_H_ 29 | 30 | #ifdef __cplusplus 31 | extern "C" { 32 | #endif 33 | 34 | /* --------------------------------------------------------------------- */ 35 | /* Custom defines */ 36 | /* --------------------------------------------------------------------- */ 37 | 38 | /* Put your custom defines here. This empty section will never change 39 | which helps maintaining patches (with diff / patch utilities). */ 40 | 41 | /* --------------------------------------------------------------------- */ 42 | /* Architecture */ 43 | /* --------------------------------------------------------------------- */ 44 | 45 | /* Architecture selection. 
*/ 46 | /* #define SLJIT_CONFIG_X86_32 1 */ 47 | /* #define SLJIT_CONFIG_X86_64 1 */ 48 | /* #define SLJIT_CONFIG_ARM_V5 1 */ 49 | /* #define SLJIT_CONFIG_ARM_V7 1 */ 50 | /* #define SLJIT_CONFIG_ARM_THUMB2 1 */ 51 | /* #define SLJIT_CONFIG_ARM_64 1 */ 52 | /* #define SLJIT_CONFIG_PPC_32 1 */ 53 | /* #define SLJIT_CONFIG_PPC_64 1 */ 54 | /* #define SLJIT_CONFIG_MIPS_32 1 */ 55 | /* #define SLJIT_CONFIG_MIPS_64 1 */ 56 | /* #define SLJIT_CONFIG_SPARC_32 1 */ 57 | /* #define SLJIT_CONFIG_TILEGX 1 */ 58 | 59 | /* #define SLJIT_CONFIG_AUTO 1 */ 60 | /* #define SLJIT_CONFIG_UNSUPPORTED 1 */ 61 | 62 | /* --------------------------------------------------------------------- */ 63 | /* Utilities */ 64 | /* --------------------------------------------------------------------- */ 65 | 66 | /* Useful for thread-safe compiling of global functions. */ 67 | #ifndef SLJIT_UTIL_GLOBAL_LOCK 68 | /* Enabled by default */ 69 | #define SLJIT_UTIL_GLOBAL_LOCK 1 70 | #endif 71 | 72 | /* Implements a stack like data structure (by using mmap / VirtualAlloc */ 73 | /* or a custom allocator). */ 74 | #ifndef SLJIT_UTIL_STACK 75 | /* Enabled by default */ 76 | #define SLJIT_UTIL_STACK 1 77 | #endif 78 | 79 | /* Uses user provided allocator to allocate the stack (see SLJIT_UTIL_STACK) */ 80 | #ifndef SLJIT_UTIL_SIMPLE_STACK_ALLOCATION 81 | /* Disabled by default */ 82 | #define SLJIT_UTIL_SIMPLE_STACK_ALLOCATION 0 83 | #endif 84 | 85 | /* Single threaded application. Does not require any locks. */ 86 | #ifndef SLJIT_SINGLE_THREADED 87 | /* Disabled by default. */ 88 | #define SLJIT_SINGLE_THREADED 0 89 | #endif 90 | 91 | /* --------------------------------------------------------------------- */ 92 | /* Configuration */ 93 | /* --------------------------------------------------------------------- */ 94 | 95 | /* If SLJIT_STD_MACROS_DEFINED is not defined, the application should 96 | define SLJIT_MALLOC, SLJIT_FREE, SLJIT_MEMCPY, and NULL. 
*/ 97 | #ifndef SLJIT_STD_MACROS_DEFINED 98 | /* Disabled by default. */ 99 | #define SLJIT_STD_MACROS_DEFINED 0 100 | #endif 101 | 102 | /* Executable code allocation: 103 | If SLJIT_EXECUTABLE_ALLOCATOR is not defined, the application should 104 | define SLJIT_MALLOC_EXEC, SLJIT_FREE_EXEC, and SLJIT_EXEC_OFFSET. */ 105 | #ifndef SLJIT_EXECUTABLE_ALLOCATOR 106 | /* Enabled by default. */ 107 | #define SLJIT_EXECUTABLE_ALLOCATOR 1 108 | 109 | /* When SLJIT_PROT_EXECUTABLE_ALLOCATOR is enabled SLJIT uses 110 | an allocator which does not set writable and executable 111 | permission flags at the same time. The trade-off is increased 112 | memory consumption and disabled dynamic code modifications. */ 113 | #ifndef SLJIT_PROT_EXECUTABLE_ALLOCATOR 114 | /* Disabled by default. */ 115 | #define SLJIT_PROT_EXECUTABLE_ALLOCATOR 0 116 | #endif 117 | 118 | #endif 119 | 120 | /* Force cdecl calling convention even if a better calling 121 | convention (e.g. fastcall) is supported by the C compiler. 122 | If this option is disabled (this is the default), functions 123 | called from JIT should be defined with SLJIT_FUNC attribute. 124 | Standard C functions can still be called by using the 125 | SLJIT_CALL_CDECL jump type. */ 126 | #ifndef SLJIT_USE_CDECL_CALLING_CONVENTION 127 | /* Disabled by default */ 128 | #define SLJIT_USE_CDECL_CALLING_CONVENTION 0 129 | #endif 130 | 131 | /* Return with error when an invalid argument is passed. */ 132 | #ifndef SLJIT_ARGUMENT_CHECKS 133 | /* Disabled by default */ 134 | #define SLJIT_ARGUMENT_CHECKS 0 135 | #endif 136 | 137 | /* Debug checks (assertions, etc.). */ 138 | #ifndef SLJIT_DEBUG 139 | /* Enabled by default */ 140 | #define SLJIT_DEBUG 1 141 | #endif 142 | 143 | /* Verbose operations. */ 144 | #ifndef SLJIT_VERBOSE 145 | /* Enabled by default */ 146 | #define SLJIT_VERBOSE 1 147 | #endif 148 | 149 | /* 150 | SLJIT_IS_FPU_AVAILABLE 151 | The availability of the FPU can be controlled by SLJIT_IS_FPU_AVAILABLE. 
152 | zero value - FPU is NOT present. 153 | nonzero value - FPU is present. 154 | */ 155 | 156 | /* For further configurations, see the beginning of sljitConfigInternal.h */ 157 | 158 | #ifdef __cplusplus 159 | } /* extern "C" */ 160 | #endif 161 | 162 | #endif 163 | -------------------------------------------------------------------------------- /src/sf_disabled.h: -------------------------------------------------------------------------------- 1 | #ifndef SF_DISABLED_H 2 | #define SF_DISABLED_H 3 | 4 | #define NO_ALTREP_SUPPORT() throw std::runtime_error("ALTREP not supported in R < 3.5") 5 | void init_stringfish(DllInfo* dll) {(void)0;} // no op; the init attribute still gets read in sf_altrep.h. 6 | void sf_export_functions(DllInfo* dll) {(void)0;} 7 | 8 | std::string get_string_type(SEXP x) {NO_ALTREP_SUPPORT();} 9 | SEXP materialize(SEXP x) {NO_ALTREP_SUPPORT();} 10 | SEXP sf_vector(size_t len) {NO_ALTREP_SUPPORT();} 11 | void sf_assign(SEXP x, size_t i, SEXP e) {NO_ALTREP_SUPPORT();} 12 | SEXP sf_iconv(SEXP x, const std::string from, const std::string to, int nthreads=1) {NO_ALTREP_SUPPORT();} 13 | SEXP convert_to_sf(SEXP x) {NO_ALTREP_SUPPORT();} 14 | IntegerVector sf_nchar(SEXP x, const std::string type = "chars", const int nthreads = 1) {NO_ALTREP_SUPPORT();} 15 | SEXP sf_substr(SEXP x, IntegerVector start, IntegerVector stop, const int nthreads = 1) {NO_ALTREP_SUPPORT();} 16 | SEXP c_sf_paste(List dots, SEXP sep, const int nthreads = 1) {NO_ALTREP_SUPPORT();} 17 | SEXP sf_collapse(SEXP x, SEXP collapse) {NO_ALTREP_SUPPORT();} 18 | SEXP sf_readLines(const std::string file, const std::string encoding = "UTF-8") {NO_ALTREP_SUPPORT();} 19 | void sf_writeLines(SEXP text, const std::string file, const std::string sep = "\n", const std::string na_value = "NA", const std::string encode_mode = "UTF-8") {NO_ALTREP_SUPPORT();} 20 | LogicalVector sf_grepl(SEXP subject, SEXP pattern, const std::string encode_mode = "auto", const bool fixed = false ,const int 
nthreads = 1) {NO_ALTREP_SUPPORT();} 21 | SEXP sf_split(SEXP subject, SEXP split, const std::string encode_mode = "auto", const bool fixed = false, const int nthreads = 1) {NO_ALTREP_SUPPORT();} 22 | SEXP sf_gsub(SEXP subject, SEXP pattern, SEXP replacement, const std::string encode_mode = "auto", const bool fixed = false, const int nthreads = 1) {NO_ALTREP_SUPPORT();} 23 | SEXP random_strings(const int N, const int string_size = 50, 24 | std::string charset = "abcdefghijklmnopqrstuvwxyz", 25 | std::string vector_mode = "stringfish") { 26 | if(vector_mode == "stringfish") Rcpp::warning("ALTREP not supported in R < 3.5"); 27 | CharacterVector ret(N); 28 | std::string str; 29 | str.resize(string_size); 30 | for(int i=0; i r = Rcpp::as< std::vector >(Rcpp::sample(charset.size(), string_size, true, R_NilValue, false)); 32 | for(int j=0; j 4 | #include "sf_external.h" 5 | using namespace Rcpp; 6 | 7 | // [[Rcpp::export]] 8 | SEXP sf_alternate_case(SEXP x) { 9 | // Iterate through a character vector using the RStringIndexer class 10 | // If the input vector x is a stringfish character vector it will do so without materialization 11 | RStringIndexer r(x); 12 | size_t len = r.size(); 13 | 14 | // Create an output stringfish vector 15 | // Like all R objects, it must be protected from garbage collection 16 | SEXP output = PROTECT(sf_vector(len)); 17 | 18 | // Obtain a reference to the underlying output data 19 | sf_vec_data & output_data = sf_vec_data_ref(output); 20 | 21 | // You can use range based for loop via an iterator class that returns RStringIndexer::rstring_info e 22 | // rstring info is a struct containing const char * ptr (null terminated), int len, and cetype_t enc 23 | // a NA string is represented by a nullptr 24 | // Alternatively, access the data via the function r.getCharLenCE(i) 25 | size_t i = 0; 26 | for(auto e : r) { 27 | // check if string is NA and go to next if it is 28 | if(e.ptr == nullptr) { 29 | i++; // increment output index 30 | continue; 31 | 
} 32 | // create a temporary output string and process the results 33 | std::string temp(e.len, '\0'); 34 | bool case_switch = false; 35 | for(int j=0; j= 65) & (e.ptr[j] <= 90)) { // char j is upper case 37 | if((case_switch = !case_switch)) { // check if we should convert to lower case 38 | temp[j] = e.ptr[j] + 32; 39 | continue; 40 | } 41 | } else if((e.ptr[j] >= 97) & (e.ptr[j] <= 122)) { // char j is lower case 42 | if(!(case_switch = !case_switch)) { // check if we should convert to upper case 43 | temp[j] = e.ptr[j] - 32; 44 | continue; 45 | } 46 | } else if(e.ptr[j] == 32) { 47 | case_switch = false; 48 | } 49 | temp[j] = e.ptr[j]; 50 | } 51 | 52 | // Create a new vector element sfstring and insert the processed string into the stringfish vector 53 | // sfstring has three constructors, 1) taking a std::string and encoding, 54 | // 2) a char pointer and encoding, or 3) a CHARSXP object (e.g. sfstring(NA_STRING)) 55 | output_data[i] = sfstring(temp, e.enc); 56 | i++; // increment output index 57 | } 58 | // Finally, call unprotect and return result 59 | UNPROTECT(1); 60 | return output; 61 | } 62 | -------------------------------------------------------------------------------- /vignettes/bench_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/traversc/stringfish/b943b1c74b1350e6ec85a92dddfbad50eb8cca8f/vignettes/bench_v2.png -------------------------------------------------------------------------------- /vignettes/vignette.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "stringfish" 3 | output: 4 | html_vignette: 5 | keep_md: no 6 | rmarkdown::github_document: default 7 | vignette: > 8 | %\VignetteIndexEntry{stringfish} 9 | \usepackage[utf8]{inputenc} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | --- 12 | 13 | ```{r, setup, echo=FALSE} 14 | IS_GITHUB <- Sys.getenv("IS_GITHUB") != "" 15 | ``` 16 | 17 | ```{r results='asis', 
echo=FALSE, eval=IS_GITHUB} 18 | cat(' 19 | [![R-CMD-check](https://github.com/traversc/stringfish/workflows/R-CMD-check/badge.svg)](https://github.com/traversc/stringfish/actions) 20 | [![CRAN-Status-Badge](https://www.r-pkg.org/badges/version/stringfish)](https://cran.r-project.org/package=stringfish) 21 | [![CRAN-Downloads-Badge](https://cranlogs.r-pkg.org/badges/stringfish)](https://cran.r-project.org/package=stringfish) 22 | [![CRAN-Downloads-Total-Badge](https://cranlogs.r-pkg.org/badges/grand-total/stringfish)](https://cran.r-project.org/package=stringfish) 23 | ') 24 | ``` 25 | 26 | `stringfish` is a framework for performing string and sequence operations using the ALTREP system to speed up the computation of common string operations. 27 | 28 | The ultimate goal of the package is to unify ALTREP string implementations under a common framework. 29 | 30 | The ALTREP system (new as of R 3.5.0) allows package developers to represent R objects using their own custom memory layout, completely invisible to the user. `stringfish` represents string data as a simple C++/STL vector, which is very fast and lightweight. 31 | 32 | Using normal R functions to process string data (e.g. `substr`, `gsub`, `paste`, etc.) causes "materialization" of ALTREP vectors to normal R data, which can be a slow process. Therefore, in order to take full advantage of the ALTREP framework, string processing functions need to be re-written to be ALTREP aware. This package hopes to fulfill that purpose. 33 | 34 | ## Installation 35 | ```{r eval=FALSE} 36 | install.packages("stringfish", type="source", configure.args="--with-simd=AVX2") 37 | ``` 38 | 39 | ## Benchmark 40 | 41 | The simplest way to show the utility of the ALTREP framework is through a quick benchmark comparing `stringfish` and base R. 
42 | 43 | ```{r echo=FALSE, results='asis'} 44 | if(IS_GITHUB) { 45 | cat('![](vignettes/bench_v2.png "bench_v2"){width=576px}') 46 | } else { 47 | cat('![](bench_v2.png "bench_v2"){width=576px}') 48 | } 49 | ``` 50 | 51 | 52 | Yes, you are reading the graph correctly: some functions in `stringfish` are more than an order of magnitude faster than vectorized base R operations (and even faster with some built-in multithreading). On large text datasets, this can turn minutes of computation into seconds. 53 | 54 | ## Currently implemented functions 55 | 56 | A list of implemented `stringfish` functions and analogous base R functions: 57 | 58 | * `sf_iconv` (`iconv`) 59 | * `sf_nchar` (`nchar`) 60 | * `sf_substr` (`substr`) 61 | * `sf_paste` (`paste0`) 62 | * `sf_collapse` (`paste0`) 63 | * `sf_readLines` (`readLines`) 64 | * `sf_writeLines` (`writeLines`) 65 | * `sf_grepl` (`grepl`) 66 | * `sf_gsub` (`gsub`) 67 | * `sf_toupper` (`toupper`) 68 | * `sf_tolower` (`tolower`) 69 | * `sf_starts` (`startsWith`) 70 | * `sf_ends` (`endsWith`) 71 | * `sf_trim` (`trimws`) 72 | * `sf_split` (`strsplit`) 73 | * `sf_match` (`match` for strings only) 74 | * `sf_compare`/`sf_equals` (`==`, ALTREP-aware string equality) 75 | 76 | Utility functions: 77 | 78 | * `sf_vector` -- creates a new and empty `stringfish` vector 79 | * `sf_assign` -- assigns strings into a `stringfish` vector in place (like `x[i] <- "mystring"`) 80 | * `sf_convert`/`convert_to_sf` -- converts a character vector to a `stringfish` vector 81 | * `get_string_type` -- determines string type (whether ALTREP or normal) 82 | * `materialize` -- converts any ALTREP object into a normal R object 83 | * `random_strings` -- creates random strings as either a `stringfish` or normal R vector 84 | * `string_identical` -- like `identical` for strings but also requires identical encoding (i.e. latin1 and UTF-8 strings will not match) 85 | 86 | In addition, many R operations in base R and other packages are already ALTREP-aware (i.e. 
they don't cause materialization). Functions that subset or index into string vectors generally do not materialize. 87 | 88 | * `sample` 89 | * `head` 90 | * `tail` 91 | * `[` -- e.g. `x[20:30]` 92 | * `dplyr::filter` -- e.g. `dplyr::filter(df, sf_starts("a"))` 93 | * Etc. 94 | 95 | `stringfish` functions are not intended to exactly replicate their base R analogues. One difference is that `subject` parameters are always the first argument, which is easier to use with pipes (`%>%`). E.g., `gsub(pattern, replacement, subject)` becomes `sf_gsub(subject, pattern, replacement)`. 96 | 97 | ## Extensibility 98 | 99 | `stringfish` as a framework is intended to be easily extensible. Stringfish vectors can be worked into `Rcpp` scripts or even into other packages (see the `qs` package for an example). 100 | 101 | Below is a detailed `Rcpp` script that creates a function to alternate upper and lower case of strings. 102 | 103 | ```{c eval=FALSE} 104 | // [[Rcpp::plugins(cpp11)]] 105 | // [[Rcpp::depends(stringfish)]] 106 | #include <Rcpp.h> 107 | #include "sf_external.h" 108 | using namespace Rcpp; 109 | 110 | // [[Rcpp::export]] 111 | SEXP sf_alternate_case(SEXP x) { 112 | // Iterate through a character vector using the RStringIndexer class 113 | // If the input vector x is a stringfish character vector it will do so without materialization 114 | RStringIndexer r(x); 115 | size_t len = r.size(); 116 | 117 | // Create an output stringfish vector 118 | // Like all R objects, it must be protected from garbage collection 119 | SEXP output = PROTECT(sf_vector(len)); 120 | 121 | // Obtain a reference to the underlying output data 122 | sf_vec_data & output_data = sf_vec_data_ref(output); 123 | 124 | // You can use range based for loop via an iterator class that returns RStringIndexer::rstring_info e 125 | // rstring info is a struct containing const char * ptr (null terminated), int len, and cetype_t enc 126 | // a NA string is represented by a nullptr 127 | // Alternatively, access the 
data via the function r.getCharLenCE(i) 128 | size_t i = 0; 129 | for(auto e : r) { 130 | // check if string is NA and go to next if it is 131 | if(e.ptr == nullptr) { 132 | i++; // increment output index 133 | continue; 134 | } 135 | // create a temporary output string and process the results 136 | std::string temp(e.len, '\0'); 137 | bool case_switch = false; 138 | for(int j=0; j<e.len; j++) { 139 | if((e.ptr[j] >= 65) & (e.ptr[j] <= 90)) { // char j is upper case 140 | if((case_switch = !case_switch)) { // check if we should convert to lower case 141 | temp[j] = e.ptr[j] + 32; 142 | continue; 143 | } 144 | } else if((e.ptr[j] >= 97) & (e.ptr[j] <= 122)) { // char j is lower case 145 | if(!(case_switch = !case_switch)) { // check if we should convert to upper case 146 | temp[j] = e.ptr[j] - 32; 147 | continue; 148 | } 149 | } else if(e.ptr[j] == 32) { 150 | case_switch = false; 151 | } 152 | temp[j] = e.ptr[j]; 153 | } 154 | 155 | // Create a new vector element sfstring and insert the processed string into the stringfish vector 156 | // sfstring has three constructors, 1) taking a std::string and encoding, 157 | // 2) a char pointer and encoding, or 3) a CHARSXP object (e.g. sfstring(NA_STRING)) 158 | output_data[i] = sfstring(temp, e.enc); 159 | i++; // increment output index 160 | } 161 | // Finally, call unprotect and return result 162 | UNPROTECT(1); 163 | return output; 164 | } 165 | 166 | ``` 167 | 168 | Example function call: 169 | ```{r eval=FALSE} 170 | sf_alternate_case("hello world") 171 | [1] "hElLo wOrLd" 172 | ``` 173 | 174 | ## To do 175 | * Additional functions 176 | * ICU library functions 177 | --------------------------------------------------------------------------------