├── .Rbuildignore
├── .github
    ├── .gitignore
    └── workflows
    │   └── R-CMD-check.yaml
├── .gitignore
├── ChangeLog
├── DESCRIPTION
├── Makefile
├── NAMESPACE
├── R
    ├── RcppExports.R
    ├── sf_functions.r
    ├── zz_help_files.R
    └── zzz.R
├── README.md
├── cleanup
├── configure
├── configure.ac
├── inst
    ├── PCRE2_LICENSE.txt
    ├── extra_tests
    │   └── benchmark_test.r
    ├── icelandic_words_500_utf8.txt
    └── include
    │   ├── sf_external.h
    │   └── sf_internal.h
├── man
    ├── convert_to_sf.Rd
    ├── get_string_type.Rd
    ├── materialize.Rd
    ├── random_strings.Rd
    ├── sf_assign.Rd
    ├── sf_collapse.Rd
    ├── sf_compare.Rd
    ├── sf_concat.Rd
    ├── sf_ends.Rd
    ├── sf_grepl.Rd
    ├── sf_gsub.Rd
    ├── sf_iconv.Rd
    ├── sf_match.Rd
    ├── sf_nchar.Rd
    ├── sf_paste.Rd
    ├── sf_readLines.Rd
    ├── sf_split.Rd
    ├── sf_starts.Rd
    ├── sf_substr.Rd
    ├── sf_tolower.Rd
    ├── sf_toupper.Rd
    ├── sf_trim.Rd
    ├── sf_vector.Rd
    ├── sf_writeLines.Rd
    └── string_identical.Rd
├── src
    ├── Makevars.in
    ├── Makevars.win
    ├── PCRE2
    │   ├── config.h
    │   ├── pcre2.h
    │   ├── pcre2_auto_possess.c
    │   ├── pcre2_chartables.c
    │   ├── pcre2_compile.c
    │   ├── pcre2_config.c
    │   ├── pcre2_context.c
    │   ├── pcre2_convert.c
    │   ├── pcre2_dfa_match.c
    │   ├── pcre2_error.c
    │   ├── pcre2_extuni.c
    │   ├── pcre2_find_bracket.c
    │   ├── pcre2_internal.h
    │   ├── pcre2_intmodedep.h
    │   ├── pcre2_is_bundled.c
    │   ├── pcre2_jit_compile.c
    │   ├── pcre2_jit_match.c
    │   ├── pcre2_jit_misc.c
    │   ├── pcre2_jit_neon_inc.h
    │   ├── pcre2_jit_simd_inc.h
    │   ├── pcre2_maketables.c
    │   ├── pcre2_match.c
    │   ├── pcre2_match_data.c
    │   ├── pcre2_newline.c
    │   ├── pcre2_ord2utf.c
    │   ├── pcre2_pattern_info.c
    │   ├── pcre2_printint.c
    │   ├── pcre2_script_run.c
    │   ├── pcre2_serialize.c
    │   ├── pcre2_string_utils.c
    │   ├── pcre2_study.c
    │   ├── pcre2_substitute.c
    │   ├── pcre2_substring.c
    │   ├── pcre2_tables.c
    │   ├── pcre2_ucd.c
    │   ├── pcre2_ucp.h
    │   ├── pcre2_valid_utf.c
    │   ├── pcre2_xclass.c
    │   ├── pcre2posix.c
    │   ├── pcre2posix.h
    │   └── sljit
    │   │   ├── sljitConfig.h
    │   │   ├── sljitConfigInternal.h
    │   │   ├── sljitExecAllocator.c
    │   │   ├── sljitLir.c
    │   │   ├── sljitLir.h
    │   │   ├── sljitNativeARM_32.c
    │   │   ├── sljitNativeARM_64.c
    │   │   ├── sljitNativeARM_T2_32.c
    │   │   ├── sljitNativeMIPS_32.c
    │   │   ├── sljitNativeMIPS_64.c
    │   │   ├── sljitNativeMIPS_common.c
    │   │   ├── sljitNativePPC_32.c
    │   │   ├── sljitNativePPC_64.c
    │   │   ├── sljitNativePPC_common.c
    │   │   ├── sljitNativeSPARC_32.c
    │   │   ├── sljitNativeSPARC_common.c
    │   │   ├── sljitNativeTILEGX-encoder.c
    │   │   ├── sljitNativeTILEGX_64.c
    │   │   ├── sljitNativeX86_32.c
    │   │   ├── sljitNativeX86_64.c
    │   │   ├── sljitNativeX86_common.c
    │   │   ├── sljitProtExecAllocator.c
    │   │   └── sljitUtils.c
    ├── PCRE2_wrapper
    │   ├── pcre2_wrapper.cpp
    │   └── pcre2_wrapper.h
    ├── RcppExports.cpp
    ├── sf_altrep.h
    ├── sf_disabled.h
    ├── sf_functions.cpp
    └── xxhash
    │   ├── xxhash.c
    │   └── xxhash.h
├── tests
    ├── tests.cpp
    └── tests.r
└── vignettes
    ├── bench_v2.png
    ├── vignette.html
    └── vignette.rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | .travis.yml
 4 | rebuild.sh
 5 | .*\.tar\.gz
 6 | ^local
 7 | ^benchmark_results
 8 | rticle
 9 | .Rhistory
10 | src/ZSTD/LICENSCE.txt
11 | aclocal.m4
12 | Makefile
13 | ^.vscode
14 | ^\.github$
15 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | #
 4 | # NOTE: This workflow is overkill for most R packages and
 5 | # check-standard.yaml is likely a better choice.
 6 | # usethis::use_github_action("check-standard") will install it.
 7 | on:
 8 |   push:
 9 |     branches: [main, master]
10 |   pull_request:
11 |     branches: [main, master]
12 | 
13 | name: R-CMD-check
14 | 
15 | jobs:
16 |   # rchk:
17 |   #   runs-on: ubuntu-latest
18 |   #   steps:
19 |   #   - uses: actions/checkout@v1
20 |   #   - uses: r-lib/actions/run-rchk@master
21 |   R-CMD-check:
22 |     runs-on: ${{ matrix.config.os }}
23 | 
24 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
25 | 
26 |     strategy:
27 |       fail-fast: false
28 |       matrix:
29 |         config:
30 |           - {os: macOS-latest,   r: 'release'}
31 | 
32 |           - {os: windows-latest, r: 'release'}
33 |           # Use 3.6 to trigger usage of RTools35
34 |           # - {os: windows-latest, r: '3.6'}
35 | 
36 |           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
37 |           - {os: ubuntu-latest,   r: 'release'}
38 |           - {os: ubuntu-latest,   r: 'oldrel-1'}
39 |           - {os: ubuntu-latest,   r: 'oldrel-2'}
40 |           - {os: ubuntu-latest,   r: 'oldrel-3'}
41 |           - {os: ubuntu-latest,   r: 'oldrel-4'}
42 | 
43 |     env:
44 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
45 |       R_KEEP_PKG_SOURCE: "yes"  # quoted so every YAML loader reads a string, not a boolean
46 | 
47 |     steps:
48 |       - name: Windows CRLF fix
49 |         run: git config --global core.autocrlf false
50 | 
51 |       - uses: actions/checkout@v3
52 | 
53 |       - uses: r-lib/actions/setup-pandoc@v2
54 | 
55 |       - uses: r-lib/actions/setup-r@v2
56 |         with:
57 |           r-version: ${{ matrix.config.r }}
58 |           http-user-agent: ${{ matrix.config.http-user-agent }}
59 |           use-public-rspm: true
60 | 
61 |       - uses: r-lib/actions/setup-r-dependencies@v2
62 |         with:
63 |           extra-packages: any::rcmdcheck
64 |           needs: check
65 | 
66 |       - uses: r-lib/actions/check-r-package@v2
67 |         with:
68 |           upload-snapshots: true
69 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .RData
 4 | .Ruserdata
 5 | src/*.o
 6 | src/*.so
 7 | src/*.dll
 8 | src/*.a
 9 | src/**/*.o
10 | src/**/*.so
11 | src/**/*.dll
12 | src/**/*.a
13 | stringfish*.tar.gz
14 | *.Rcheck
15 | ..Rcheck
16 | rebuild.sh
17 | local
18 | .DS_Store
19 | src/ZSTD/LICENSCE.txt
20 | aclocal.m4
21 | config.log
22 | config.status
23 | autom4te.cache/*
24 | .vscode
25 | /src/.vscode
26 | src/Makevars
27 | 


--------------------------------------------------------------------------------
/ChangeLog:
--------------------------------------------------------------------------------
 1 | Version 0.16.0 (2023-11-25)
 2 |    * Add in ALTREP serialization
 3 | 
 4 | Version 0.15.8 (2023-5-29)
 5 |    * Fix warn message in R 4.3+ "warning: function declaration isn’t a prototype [-Wstrict-prototypes]"
 6 |    * Remove C++11 SystemRequirements per new CRAN guidelines
 7 |    * Switch to `std::atomic` instead of `tbb::atomic` due to deprecation (https://github.com/traversc/stringfish/issues/19)
 8 | 
 9 | Version 0.15.7 (2022-2-21)
10 |    * Switch to using Rcpp's new `signature` attribute to define default parameters
11 |    * Use more accurate language in configure script
12 | 
13 | Version 0.15.6 (2021-12-8)
14 |    * Bugfix to work on Linux Alpine; missing checks for TBB (https://github.com/r-hub/r-minimal/issues/37, https://github.com/traversc/stringfish/issues/11)
15 | 
16 | Version 0.15.5 (2021-11-30)
17 |    * Bugfix to work on R 3.4 (https://github.com/tidyverse/multidplyr/pull/129)
18 | 
19 | Version 0.15.4 (2021-10-11)
20 |    * Disable `sf_match` test due to error on Solaris and R 4.1.X bug (https://bugs.r-project.org/show_bug.cgi?id=18211)
21 | 
22 | Version 0.15.3 (2021-10-9)
23 |    * Fix to https://github.com/traversc/stringfish/issues/7; definition conflict with bundled PCRE2
24 |    * Update autoconf to version 2.69 (autoupdate; autoreconf --warnings=obsolete)
25 |    * `nthreads` parameter default is now `getOption("stringfish.nthreads", 1L)`. Set using `options(stringfish.nthreads = 4L)`
26 | 
27 | Version 0.15.2 (2021-7-23)
28 |    * Change ALTVEC_DATAPTR to DATAPTR to conform to changes in the API
29 | 
30 | Version 0.15.1 (2021-3-13)
31 |    * Fix PCRE2 issue on Apple M1
32 |    * Fix for missing return type in src/sf_disabled.h
33 | 
34 | Version 0.15.0 (2021-2-20)
35 |    * Reduce requirement for R 3.5 so packages depending on `stringfish` don't require R 3.5. Most functionality will not be available in R < 3.5.
36 |    * Update `xxhash` library to release 0.8.0 and use the improved XXH3 algorithm for hashing. 
37 | 
38 | Version 0.14.2 (2020-9-3)
39 |    * Fix bug with `extract_subset` ALTREP routine (https://github.com/traversc/qs/issues/42)
40 | 
41 | Version 0.14.1 (2020-7-23)
42 |    * Implemented multithreading with RcppParallel
43 |    * Addition of `Extract_subset` ALTREP method
44 |    * Additional functions (`sf_compare`, `sf_concat`, `sf_equals`)
45 |    * Various bug fixes
46 | 
47 | Version 0.13.2 (2020-7-5)
48 |    * Removed -mshstk flag to fix CRAN note
49 | 
50 | Version 0.13.1 (2020-7-5)
51 |    * Additional functions: `sf_toupper`, `sf_tolower`, `sf_starts`, `sf_ends`, `sf_trim`, `sf_split`, `sf_match`
52 |    * Additional functions: `string_identical`, `sf_writeLines`
53 | 
54 | Version 0.11.2 (2020-6-1)
55 |    * Fixed compilation error on Fedora (adding -mshstk flag compile flag in configure file)
56 |    * Fixed a bug in alt-rep set_string_elt method
57 |    * Added additional sfstring constructor `sfstring(size_t, cetype_t)`
58 | 
59 | Version 0.11.2 (2020-6-1)
60 |    * Fixed compilation error on Fedora (adding -mshstk flag compile flag in configure file)
61 |    * Fixed a bug in alt-rep set_string_elt method
62 |    * Added additional sfstring constructor `sfstring(size_t, cetype_t)`
63 | 
64 | Version 0.11 (2020-5-14)
65 |    * Check for PCRE2 system installation and updated bundled version to latest (10.35)
66 |    * Fix to copyright statements in DESCRIPTION file
67 | 
68 | Version 0.1 (2020-5-11)
69 |    * Initial CRAN release
70 |    * An alt-rep string framework for fast and extensible processing of string data
71 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: stringfish
 2 | Title: Alt String Implementation
 3 | Version: 0.16.0
 4 | Date: 2023-11-27
 5 | Authors@R: c(
 6 |     person("Travers", "Ching", email = "traversc@gmail.com", role = c("aut", "cre", "cph")),
 7 |     person("Philip", "Hazel", role = c("ctb"), comment = "Bundled PCRE2 code"),
 8 |     person("Zoltan", "Herczeg", role = c("ctb", "cph"), comment = "Bundled PCRE2 code"),
 9 |     person("University of Cambridge", role = c("cph"), comment = "Bundled PCRE2 code"),
10 |     person("Tilera Corporation", role = c("cph"), comment = "Stack-less Just-In-Time compiler bundled with PCRE2"),
11 |     person("Yann", "Collet", role = c("ctb", "cph"), comment = "Yann Collet is the author of the bundled xxHash code"))
12 | Maintainer: Travers Ching <traversc@gmail.com>
13 | Description: Provides an extendable, performant and multithreaded 'alt-string' implementation backed by 'C++' vectors and strings.
14 | License: GPL-3
15 | Biarch: true
16 | Encoding: UTF-8
17 | Depends: R (>= 3.0.2)
18 | SystemRequirements: GNU make
19 | LinkingTo: 
20 |     Rcpp (>= 0.12.18.3), RcppParallel (>= 5.1.4)
21 | Imports: 
22 |     Rcpp, RcppParallel
23 | Suggests:
24 |     qs, knitr, rmarkdown, usethis, dplyr, stringr, rlang
25 | VignetteBuilder: knitr
26 | RoxygenNote: 7.2.3
27 | Copyright: Copyright for the bundled 'PCRE2' library is held by University of Cambridge, Zoltan Herczeg and Tilera Corporation (Stack-less Just-In-Time compiler); Copyright for the bundled 'xxHash' code is held by Yann Collet. 
28 | URL: https://github.com/traversc/stringfish
29 | BugReports: https://github.com/traversc/stringfish/issues
30 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL   := /bin/bash
 2 | PACKAGE := $(shell perl -aF: -ne 'print, exit if s/^Package:\s+//' DESCRIPTION)
 3 | VERSION := $(shell perl -aF: -ne 'print, exit if s/^Version:\s+//' DESCRIPTION)
 4 | BUILD   := $(PACKAGE)_$(VERSION).tar.gz
 5 | 
 6 | .PHONY: doc build install test vignette compile check check-no-vignette check-rhub check-solaris check-m1 $(BUILD)
 7 | 
 8 | check: $(BUILD)
 9 | 	R CMD check --as-cran $<
10 | 
11 | check-no-vignette: $(BUILD)
12 | 	R CMD check --as-cran --no-build-vignettes $<
13 | 
14 | check-rhub: $(BUILD)
15 | 	Rscript -e 'rhub::check("$(BUILD)", platform = c("ubuntu-gcc-devel", "windows-x86_64-devel", "solaris-x86-patched", "solaris-x86-patched-ods", "macos-m1-bigsur-release"))'
16 | 
17 | check-solaris: $(BUILD)
18 | 	Rscript -e 'rhub::check("$(BUILD)", platform = c("solaris-x86-patched", "solaris-x86-patched-ods"))'
19 | 
20 | check-m1: $(BUILD)
21 | 	Rscript -e 'rhub::check("$(BUILD)", platform = c("macos-m1-bigsur-release"))'
22 | 
23 | compile:
24 | 	find src/ -type f -exec chmod 644 {} \;
25 | 	Rscript -e "library(Rcpp); compileAttributes('.');"
26 | 	# Rscript -e "devtools::load_all(); roxygen2::roxygenise('.');"
27 | 	find . -iname "*.a" -exec rm {} \;
28 | 	find . -iname "*.o" -exec rm {} \;
29 | 	find . -iname "*.so" -exec rm {} \;
30 | 
31 | build:
32 | 	autoconf
33 | 	chmod 755 cleanup
34 | 	chmod 755 configure
35 | 	find src/ -type f -exec chmod 644 {} \;
36 | 	chmod 644 ChangeLog DESCRIPTION Makefile NAMESPACE README.md
37 | 	./configure
38 | 	./cleanup
39 | 	Rscript -e "library(Rcpp); compileAttributes('.');"
40 | 	Rscript -e "devtools::load_all(); roxygen2::roxygenise('.');"
41 | 	# rm -f R/RcppExports.R
42 | 	find . -iname "*.a" -exec rm {} \;
43 | 	find . -iname "*.o" -exec rm {} \;
44 | 	find . -iname "*.so" -exec rm {} \;
45 | 	R CMD build .
46 | 
47 | install:
48 | 	autoconf
49 | 	chmod 755 cleanup
50 | 	chmod 755 configure
51 | 	find src/ -type f -exec chmod 644 {} \;
52 | 	chmod 644 ChangeLog DESCRIPTION Makefile NAMESPACE README.md
53 | 	./configure
54 | 	./cleanup
55 | 	Rscript -e "library(Rcpp); compileAttributes('.');"
56 | 	Rscript -e "devtools::load_all(); roxygen2::roxygenise('.');"
57 | 	# rm -f R/RcppExports.R
58 | 	find . -iname "*.a" -exec rm {} \;
59 | 	find . -iname "*.o" -exec rm {} \;
60 | 	find . -iname "*.so" -exec rm {} \;
61 | 	R CMD build . # --no-build-vignettes
62 | 	R CMD INSTALL $(BUILD) --configure-args="--with-simd=AVX2" # --with-pcre2-force-compile"
63 | 
64 | vignette:
65 | 	Rscript -e "rmarkdown::render(input='vignettes/vignette.rmd', output_format='html_vignette')"
66 | 	IS_GITHUB=Yes Rscript -e "rmarkdown::render(input='vignettes/vignette.rmd', output_file='../README.md', output_format=rmarkdown::github_document(html_preview=FALSE))"; unset IS_GITHUB
67 | 
68 | test: # script names must match the on-disk lowercase .r extension (case-sensitive filesystems)
69 | 	Rscript tests/tests.r
70 | 	Rscript inst/extra_tests/benchmark_test.r 5
71 | 
72 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | importFrom(Rcpp,sourceCpp)
 2 | importFrom(RcppParallel, RcppParallelLibs)
 3 | useDynLib(stringfish, .registration=TRUE)
 4 | export(
 5 | "materialize",
 6 | "get_string_type",
 7 | "convert_to_sf", "sf_convert", 
 8 | "sf_vector",
 9 | "sf_assign",
10 | "random_strings",
11 | "string_identical",
12 | 
13 | "sf_iconv",
14 | "sf_nchar",
15 | "sf_substr",
16 | "sf_paste",
17 | "sf_collapse",
18 | "sf_readLines",
19 | "sf_writeLines",
20 | "sf_grepl",
21 | "sf_gsub",
22 | "sf_toupper",
23 | "sf_tolower",
24 | "sf_starts",
25 | "sf_ends",
26 | "sf_trim",
27 | "sf_split",
28 | "sf_match",
29 | "sf_equals", "sf_compare",
30 | "sf_concat", "sfc")
31 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
  1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
  2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
  3 | 
  4 | set_is_utf8_locale <- function() {
  5 |     invisible(.Call(`_stringfish_set_is_utf8_locale`))
  6 | }
  7 | 
  8 | unset_is_utf8_locale <- function() {
  9 |     invisible(.Call(`_stringfish_unset_is_utf8_locale`))
 10 | }
 11 | 
 12 | get_is_utf8_locale <- function() {
 13 |     .Call(`_stringfish_get_is_utf8_locale`)
 14 | }
 15 | 
 16 | is_tbb <- function() {
 17 |     .Call(`_stringfish_is_tbb`)
 18 | }
 19 | 
 20 | check_simd <- function() {
 21 |     invisible(.Call(`_stringfish_check_simd`))
 22 | }
 23 | 
 24 | get_pcre2_info <- function() {
 25 |     .Call(`_stringfish_get_pcre2_info`)
 26 | }
 27 | 
 28 | get_string_type <- function(x) {
 29 |     .Call(`_stringfish_get_string_type`, x)
 30 | }
 31 | 
 32 | materialize <- function(x) {
 33 |     .Call(`_stringfish_materialize`, x)
 34 | }
 35 | 
 36 | sf_vector <- function(len) {
 37 |     .Call(`_stringfish_sf_vector`, len)
 38 | }
 39 | 
 40 | sf_assign <- function(x, i, e) {
 41 |     invisible(.Call(`_stringfish_sf_assign`, x, i, e))
 42 | }
 43 | 
 44 | sf_iconv <- function(x, from, to, nthreads = getOption("stringfish.nthreads", 1L)) {
 45 |     .Call(`_stringfish_sf_iconv`, x, from, to, nthreads)
 46 | }
 47 | 
 48 | convert_to_sf <- function(x) {
 49 |     .Call(`_stringfish_convert_to_sf`, x)
 50 | }
 51 | 
 52 | sf_nchar <- function(x, type = "chars", nthreads = getOption("stringfish.nthreads", 1L)) {
 53 |     .Call(`_stringfish_sf_nchar`, x, type, nthreads)
 54 | }
 55 | 
 56 | sf_substr <- function(x, start, stop, nthreads = getOption("stringfish.nthreads", 1L)) {
 57 |     .Call(`_stringfish_sf_substr`, x, start, stop, nthreads)
 58 | }
 59 | 
 60 | c_sf_paste <- function(dots, sep, nthreads = 1L) {
 61 |     .Call(`_stringfish_c_sf_paste`, dots, sep, nthreads)
 62 | }
 63 | 
 64 | sf_collapse <- function(x, collapse) {
 65 |     .Call(`_stringfish_sf_collapse`, x, collapse)
 66 | }
 67 | 
 68 | sf_readLines <- function(file, encoding = "UTF-8") {
 69 |     .Call(`_stringfish_sf_readLines`, file, encoding)
 70 | }
 71 | 
 72 | sf_writeLines <- function(text, file, sep = "\n", na_value = "NA", encode_mode = "UTF-8") {
 73 |     invisible(.Call(`_stringfish_sf_writeLines`, text, file, sep, na_value, encode_mode))
 74 | }
 75 | 
 76 | sf_grepl <- function(subject, pattern, encode_mode = "auto", fixed = FALSE, nthreads = getOption("stringfish.nthreads", 1L)) {
 77 |     .Call(`_stringfish_sf_grepl`, subject, pattern, encode_mode, fixed, nthreads)
 78 | }
 79 | 
 80 | sf_split <- function(subject, split, encode_mode = "auto", fixed = FALSE, nthreads = getOption("stringfish.nthreads", 1L)) {
 81 |     .Call(`_stringfish_sf_split`, subject, split, encode_mode, fixed, nthreads)
 82 | }
 83 | 
 84 | sf_gsub <- function(subject, pattern, replacement, encode_mode = "auto", fixed = FALSE, nthreads = getOption("stringfish.nthreads", 1L)) {
 85 |     .Call(`_stringfish_sf_gsub`, subject, pattern, replacement, encode_mode, fixed, nthreads)
 86 | }
 87 | 
 88 | random_strings <- function(N, string_size = 50L, charset = "abcdefghijklmnopqrstuvwxyz", vector_mode = "stringfish") {
 89 |     .Call(`_stringfish_random_strings`, N, string_size, charset, vector_mode)
 90 | }
 91 | 
 92 | sf_tolower <- function(x) {
 93 |     .Call(`_stringfish_sf_tolower`, x)
 94 | }
 95 | 
 96 | sf_toupper <- function(x) {
 97 |     .Call(`_stringfish_sf_toupper`, x)
 98 | }
 99 | 
100 | sf_match <- function(x, table, nthreads = getOption("stringfish.nthreads", 1L)) {
101 |     .Call(`_stringfish_sf_match`, x, table, nthreads)
102 | }
103 | 
104 | sf_compare <- function(x, y, nthreads = getOption("stringfish.nthreads", 1L)) {
105 |     .Call(`_stringfish_sf_compare`, x, y, nthreads)
106 | }
107 | 
108 | c_sf_concat <- function(x) {
109 |     .Call(`_stringfish_c_sf_concat`, x)
110 | }
111 | 
112 | 


--------------------------------------------------------------------------------
/R/sf_functions.r:
--------------------------------------------------------------------------------
 1 | sf_paste <- function(..., sep="", nthreads = getOption("stringfish.nthreads", 1L)) {
 2 |   # Validates `sep` and the arguments in `...` (each must have length 1 or one shared
 3 |   # length; non-character ones are coerced with as.character()), then calls c_sf_paste().
 4 |   if(!(is.character(sep) && length(sep) == 1)) {
 5 |     stop("sep should be a character vector of length 1")
 6 |   }
 7 |   pieces <- list(...)
 8 |   common_len <- NULL # length shared by the non-scalar arguments, once one is seen
 9 |   for(k in seq_along(pieces)) {
10 |     if(!is.character(pieces[[k]])) pieces[[k]] <- as.character(pieces[[k]])
11 |     n_k <- length(pieces[[k]])
12 |     if(n_k == 0) stop("argument cannot be of length zero")
13 |     if(n_k == 1) next
14 |     if(is.null(common_len)) {
15 |       common_len <- n_k
16 |     } else if(n_k != common_len) {
17 |       stop("All arguments should be the same length or length 1")
18 |     }
19 |   }
20 |   c_sf_paste(pieces, sep, nthreads)
21 | }
22 | 
23 | sf_concat <- function(...) {
24 |   # Collect the arguments into a list, coercing every non-character element with
25 |   # as.character() (character elements are passed through as they are), and hand
26 |   # the resulting list to c_sf_concat().
27 |   parts <- lapply(list(...), function(el) if(is.character(el)) el else as.character(el))
28 |   c_sf_concat(parts)
29 | }
30 | 
31 | sf_starts <- function(subject, pattern, ...) {
32 |   anchored <- paste0("^", pattern) # prefix "^" so the regex must match at the string start
33 |   sf_grepl(subject, anchored, ...) # extra arguments in ... are forwarded to sf_grepl()
34 | }
35 | 
36 | sf_ends <- function(subject, pattern, ...) {
37 |   anchored <- paste0(pattern, "$") # append "$" so the regex must match at the string end
38 |   sf_grepl(subject, anchored, ...) # extra arguments in ... are forwarded to sf_grepl()
39 | }
40 | 
41 | sf_trim <- function(subject, which = c("both", "left", "right"), whitespace = "[ \\t\\r\\n]", ...) {
42 |   # Remove runs of `whitespace` (a regex fragment) from the left, right or both ends
43 |   # of each string via sf_gsub(); extra arguments in ... are passed to every call.
44 |   side <- match.arg(which)
45 |   lead_rx <- paste0("^", whitespace, "+")
46 |   trail_rx <- paste0(whitespace, "+", "$")
47 |   if(side != "right") subject <- sf_gsub(subject, lead_rx, "", ...)
48 |   if(side != "left") subject <- sf_gsub(subject, trail_rx, "", ...)
49 |   subject
50 | }
51 | 
52 | string_identical <- function(x, y) {
53 |   # TRUE when x and y have equal length, NAs in the same positions, and equal nchar,
54 |   # Encoding and content for every non-NA element; errors if either is not character.
55 |   stopifnot(is.character(x), is.character(y))
56 |   if(length(x) != length(y)) return(FALSE)
57 |   na_x <- is.na(x)
58 |   if(any(na_x != is.na(y))) return(FALSE) # NA mismatch is a FALSE result, not an error
59 |   if(all(na_x)) return(TRUE) # correctly catches zero length as well
60 |   not_na <- !na_x
61 |   if(any(nchar(x[not_na]) != nchar(y[not_na]))) return(FALSE)
62 |   if(!all(Encoding(x[not_na]) == Encoding(y[not_na]))) return(FALSE)
63 |   if(any(x[not_na] != y[not_na])) return(FALSE)
64 |   return(TRUE)
65 | }
66 | 
67 | 


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | .onAttach <- function(libname, pkgname) {
2 |   charset <- utils::localeToCharset()[1] # maybe we should check this at compile time somehow?
3 |   if(identical(charset, "UTF-8")) set_is_utf8_locale() # only called for a UTF-8 charset
4 | }
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | stringfish
  2 | ================
  3 | 
  4 | [![R-CMD-check](https://github.com/traversc/stringfish/workflows/R-CMD-check/badge.svg)](https://github.com/traversc/stringfish/actions)
  5 | [![CRAN-Status-Badge](https://www.r-pkg.org/badges/version/stringfish)](https://cran.r-project.org/package=stringfish)
  6 | [![CRAN-Downloads-Badge](https://cranlogs.r-pkg.org/badges/stringfish)](https://cran.r-project.org/package=stringfish)
  7 | [![CRAN-Downloads-Total-Badge](https://cranlogs.r-pkg.org/badges/grand-total/stringfish)](https://cran.r-project.org/package=stringfish)
  8 | 
  9 | `stringfish` is a framework for performing string and sequence
 10 | operations using the ALTREP system to speed up the computation of common
 11 | string operations.
 12 | 
 13 | The ultimate goal of the package is to unify ALTREP string
 14 | implementations under a common framework.
 15 | 
 16 | The ALTREP system (new as of R 3.5.0) allows package developers to
 17 | represent R objects using their own custom memory layout, completely
 18 | invisible to the user. `stringfish` represents string data as a simple
 19 | C++/STL vector, which is very fast and lightweight.
 20 | 
 21 | Using normal R functions to process string data (e.g. `substr`, `gsub`,
 22 | `paste`, etc.) causes “materialization” of ALTREP vectors to normal R
 23 | data, which can be a slow process. Therefore, in order to take full
 24 | advantage of the ALTREP framework, string processing functions need to
 25 | be re-written to be ALTREP aware. This package hopes to fulfill that
 26 | purpose.
 27 | 
 28 | ## Installation
 29 | 
 30 | ``` r
 31 | install.packages("stringfish", type="source", configure.args="--with-simd=AVX2")
 32 | ```
 33 | 
 34 | ## Benchmark
 35 | 
 36 | The simplest way to show the utility of the ALTREP framework is through
 37 | a quick benchmark comparing `stringfish` and base R.
 38 | 
 39 | ![](vignettes/bench_v2.png "bench_v2")
 40 | 
 41 | Yes you are reading the graph correctly: some functions in `stringfish`
 42 | are more than an order of magnitude faster than vectorized base R
 43 | operations (and even faster with some built-in multithreading). On large
 44 | text datasets, this can turn minutes of computation into seconds.
 45 | 
 46 | ## Currently implemented functions
 47 | 
 48 | A list of implemented `stringfish` functions and analogous base R
 49 | functions:
 50 | 
 51 |   - `sf_iconv` (`iconv`)
 52 |   - `sf_nchar` (`nchar`)
 53 |   - `sf_substr` (`substr`)
 54 |   - `sf_paste` (`paste0`)
 55 |   - `sf_collapse` (`paste0`)
 56 |   - `sf_readLines` (`readLines`)
 57 |   - `sf_writeLines` (`writeLines`)
 58 |   - `sf_grepl` (`grepl`)
 59 |   - `sf_gsub` (`gsub`)
 60 |   - `sf_toupper` (`toupper`)
 61 |   - `sf_tolower` (`tolower`)
 62 |   - `sf_starts` (`startsWith`)
 63 |   - `sf_ends` (`endsWith`)
 64 |   - `sf_trim` (`trimws`)
 65 |   - `sf_split` (`strsplit`)
 66 |   - `sf_match` (`match` for strings only)
 67 |   - `sf_compare`/`sf_equals` (`==`, ALTREP-aware string equality)
 68 | 
 69 | Utility functions:
 70 | 
 71 |   - `sf_vector` – creates a new and empty `stringfish` vector
 72 |   - `sf_assign` – assign strings into a `stringfish` vector in place
 73 |     (like `x[i] <- "mystring"`)
 74 |   - `sf_convert`/`convert_to_sf` – converts a character vector to a
 75 |     `stringfish` vector
 76 |   - `get_string_type` – determines string type (whether ALTREP or
 77 |     normal)
 78 |   - `materialize` – converts any ALTREP object into a normal R object
 79 |   - `random_strings` – creates random strings as either a `stringfish`
 80 |     or normal R vector
 81 |   - `string_identical` – like `identical` for strings but also requires
 82 |     identical encoding (i.e. latin1 and UTF-8 strings will not match)
 83 | 
 84 | In addition, many R operations in base R and other packages are already
 85 | ALTREP-aware (i.e. they don’t cause materialization). Functions that
 86 | subset or index into string vectors generally do not materialize.
 87 | 
 88 |   - `sample`
 89 |   - `head`
 90 |   - `tail`
 91 |   - `[` – e.g. `x[20:30]`
 92 |   - `dplyr::filter` – e.g. `dplyr::filter(df, sf_starts(col, "a"))`
 93 |   - Etc.
 94 | 
 95 | `stringfish` functions are not intended to exactly replicate their base
 96 | R analogues. One difference is that `subject` parameters are always the
 97 | first argument, which is easier to use with pipes (`%>%`). E.g.,
 98 | `gsub(pattern, replacement, subject)` becomes `sf_gsub(subject, pattern,
 99 | replacement)`.
100 | 
101 | ## Extensibility
102 | 
103 | `stringfish` as a framework is intended to be easily extensible.
104 | Stringfish vectors can be worked into `Rcpp` scripts or even into other
105 | packages (see the `qs` package for an example).
106 | 
107 | Below is a detailed `Rcpp` script that creates a function to alternate
108 | upper and lower case of strings.
109 | 
110 | ``` c
111 | // [[Rcpp::plugins(cpp11)]]
112 | // [[Rcpp::depends(stringfish)]]
113 | #include <Rcpp.h>
114 | #include "sf_external.h"
115 | using namespace Rcpp;
116 | 
117 | // [[Rcpp::export]]
118 | SEXP sf_alternate_case(SEXP x) {
119 |   // Iterate through a character vector using the RStringIndexer class
120 |   // If the input vector x is a stringfish character vector it will do so without materialization
121 |   RStringIndexer r(x);
122 |   size_t len = r.size();
123 |   
124 |   // Create an output stringfish vector
125 |   // Like all R objects, it must be protected from garbage collection
126 |   SEXP output = PROTECT(sf_vector(len));
127 |   
128 |   // Obtain a reference to the underlying output data
129 |   sf_vec_data & output_data = sf_vec_data_ref(output);
130 |   
131 |   // You can use range based for loop via an iterator class that returns RStringIndexer::rstring_info e
132 |   // rstring info is a struct containing const char * ptr (null terminated), int len, and cetype_t enc
133 |   // a NA string is represented by a nullptr
134 |   // Alternatively, access the data via the function r.getCharLenCE(i)
135 |   size_t i = 0;
136 |   for(auto e : r) {
137 |     // check if string is NA and go to next if it is
138 |     if(e.ptr == nullptr) {
139 |       i++; // increment output index
140 |       continue;
141 |     }
142 |     // create a temporary output string and process the results
143 |     std::string temp(e.len, '\0');
144 |     bool case_switch = false;
145 |     for(int j=0; j<e.len; j++) {
146 |       if((e.ptr[j] >= 65) & (e.ptr[j] <= 90)) { // char j is upper case
147 |         if((case_switch = !case_switch)) { // check if we should convert to lower case
148 |           temp[j] = e.ptr[j] + 32;
149 |           continue;
150 |         }
151 |       } else if((e.ptr[j] >= 97) & (e.ptr[j] <= 122)) { // char j is lower case
152 |         if(!(case_switch = !case_switch)) { // check if we should convert to upper case
153 |           temp[j] = e.ptr[j] - 32;
154 |           continue;
155 |         }
156 |       } else if(e.ptr[j] == 32) {
157 |         case_switch = false;
158 |       }
159 |       temp[j] = e.ptr[j];
160 |     }
161 |     
162 |     // Create a new vector element sfstring and insert the processed string into the stringfish vector
163 |     // sfstring has three constructors, 1) taking a std::string and encoding, 
164 |     // 2) a char pointer and encoding, or 3) a CHARSXP object (e.g. sfstring(NA_STRING))
165 |     output_data[i] = sfstring(temp, e.enc);
166 |     i++; // increment output index
167 |   }
168 |   // Finally, call unprotect and return result
169 |   UNPROTECT(1);
170 |   return output;
171 | }
172 | ```
173 | 
174 | Example function call:
175 | 
176 | ``` r
177 | sf_alternate_case("hello world") 
178 | [1] "hElLo wOrLd"
179 | ```
180 | 
181 | ## To do
182 | 
183 |   - Additional functions
184 |   - ICU library functions
185 | 


--------------------------------------------------------------------------------
/cleanup:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | rm -f config.* src/Makevars src/config.h
4 | rm -rf autom4te.cache


--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
  1 | AC_INIT([stringfish],[0.13.2],[traversc@gmail.com])
  2 | AC_PATH_PROG([PKGCONF],[pkg-config],[],[$PATH:/usr/local/bin:ext/bin:ext:/sw/bin:/opt/bin:/opt/local/bin])
  3 | 
  4 | echo "stringfish configure script"
  5 | ########################################################
  6 | ### Predefined compile strings for different cases
  7 | 
  8 | ADD_LIBS=""
  9 | INCLUDE_PATHS=""
 10 | LIBPCRE2=""
 11 | PCRE2_BUNDLED=""
 12 | 
 13 | ########################################################
 14 | ### Configure args
 15 | # The action of a with-option also runs for its --without form, so withval is checked.
 16 | AC_ARG_WITH([pcre2-force-compile],
 17 |             AS_HELP_STRING([--with-pcre2-force-compile],[Force compilation of bundled pcre2 source files]),
 18 |             [AS_IF([test "x$withval" != "xno"],[pcre2_force_compile="true"])])
 19 | # Header and library directories of an existing PCRE2 install.
 20 | AC_ARG_WITH([pcre2-include],
 21 |             AS_HELP_STRING([--with-pcre2-include=INCLUDE_PATH],[the location of pcre2 header files]),
 22 |             [pcre2_include_path=$withval])
 23 | 
 24 | AC_ARG_WITH([pcre2-lib],
 25 |             AS_HELP_STRING([--with-pcre2-lib=LIB_PATH],[the location of pcre2 library files]),
 26 |             [pcre2_lib_path=$withval])
 27 | # Optional SIMD selection; the value is compared against AVX2 further down.
 28 | AC_ARG_WITH([simd],
 29 |             AS_HELP_STRING([--with-simd],[Manually select SIMD support (options: AVX2)]),
 30 |             [with_simd=$withval])
 31 |        
 32 | 
 33 | ########################################################
 34 | #### Version value function
 35 | #### Prints MAJOR*1000+MINOR for the dotted version in $1, e.g. 10.35 gives 10035.
 36 | getVersion()
 37 | {
 38 | VERSION_STRING=$1
 39 | MAJOR=`echo "$VERSION_STRING" | cut -d. -f1 | sed 's/^0*//'`
 40 | MINOR=`echo "$VERSION_STRING" | cut -d. -f2 | sed 's/^0*//'`
 41 | echo $((${MAJOR:-0}*1000+${MINOR:-0})) # leading zeros stripped above: 08 would be read as invalid octal
 42 | }
 43 | 
 44 | ########################################################
 45 | #### Check for GCC version and add -mshstk cflags for GCC 8+
 46 | #### ver 0.13.2 -- no longer in use
 47 | 
 48 | # echo "Testing for C compiler version"
 49 | # echo "R_HOME: $R_HOME"
 50 | # : ${R_HOME=`R RHOME`}
 51 | # echo "R_HOME: $R_HOME"
 52 | # if test -z "${R_HOME}"; then
 53 | #   echo "could not determine R_HOME"
 54 | #   exit 1
 55 | # fi
 56 | # CC=`"${R_HOME}/bin/R" CMD config CC`
 57 | # echo "C compiler command: $CC"
 58 | 
 59 | # AC_LANG(C)
 60 | # AX_CHECK_COMPILE_FLAG([-mshstk],[MSHSTK_FLAG_AVAIL=yes])
 61 | 
 62 | # if test xx$MSHSTK_FLAG_AVAIL = "xxyes"; then
 63 | #   ADD_CFLAGS="${ADD_CFLAGS} -mshstk"
 64 | # fi
 65 | 
 66 | # AC_LANG(C)
 67 | # AX_COMPILER_VENDOR
 68 | # AX_COMPILER_VERSION
 69 | # echo "C compiler vendor: $ax_cv_c_compiler_vendor"
 70 | # echo "C compiler version: $ax_cv_c_compiler_version" # note: The version is completely wrong for Mac LLVM
 71 | 
 72 | # if test xx$ax_cv_c_compiler_vendor = "xxgnu"; then
 73 | #   CCVER=`${CC} -dumpversion | cut -f 1 -d "."`
 74 | #   echo "gcc dumpversion: $CCVER"
 75 | #   if test "${CCVER}" -ge 8; then
 76 | #     ADD_CFLAGS="${ADD_CFLAGS} -mshstk"
 77 | #   fi
 78 | # elif test xx$ax_cv_c_compiler_vendor == "xxclang"; then
 79 | #     CCVER=`${CC} -dumpversion | cut -f 1 -d "."`
 80 | #   echo "clang dumpversion: $CCVER"
 81 | #   if test "${CCVER}" -ge 9; then
 82 | #     ADD_CFLAGS="${ADD_CFLAGS} -mshstk"
 83 | #   fi
 84 | # fi
 85 | 
 86 | 
 87 | ########################################################
 88 | #### PCRE2 library paths
 89 | #### Tried in order: forced bundled build, user-given paths, pkg-config, bundled fallback
 90 | if test "xx$pcre2_force_compile" = "xxtrue"; then
 91 |   echo "Compiling PCRE2 from source due to --with-pcre2-force-compile"
 92 |   COMPILE_PCRE2="true"
 93 | elif test "xx$pcre2_include_path" != "xx"; then
 94 |   echo "Using user-defined pcre2 install paths"
 95 |   test "xx$pcre2_lib_path" = "xx" || ADD_LIBS="${ADD_LIBS} -L${pcre2_lib_path}" # skip a bare -L when no lib path was given
 96 |   INCLUDE_PATHS="${INCLUDE_PATHS} -I${pcre2_include_path}"
 97 |   COMPILE_PCRE2="false"
 98 | elif test "xx$PKGCONF" != "xx"; then
 99 |   if "${PKGCONF}" --exists libpcre2-8; then
100 |     VERSION_STRING=`"${PKGCONF}" --modversion libpcre2-8`
101 |     VER=`getVersion "${VERSION_STRING}"`
102 |     if test "${VER}" -ge 10035; then # 10035 encodes 10.35, the bundled version
103 |       echo "PCRE2 ${VERSION_STRING} library detected -- skipping PCRE2 compilation"
104 |       pcre2_lib_path=`"${PKGCONF}" --libs libpcre2-8`
105 |       pcre2_include_path=`"${PKGCONF}" --cflags-only-I libpcre2-8`
106 |       ADD_LIBS="${ADD_LIBS} ${pcre2_lib_path}"
107 |       INCLUDE_PATHS="${INCLUDE_PATHS} ${pcre2_include_path}"
108 |       COMPILE_PCRE2="false"
109 |     else
110 |       echo "PCRE2 ${VERSION_STRING} library detected but is lower than bundled version (10.35) -- compiling from source"
111 |       COMPILE_PCRE2="true"
112 |     fi
113 |   else
114 |     echo "PCRE2 library not detected -- compiling from source"
115 |     COMPILE_PCRE2="true"
116 |   fi
117 | else
118 |   echo "pkg-config not detected -- compiling from source"
119 |   COMPILE_PCRE2="true"
120 | fi
121 | 
122 | if test "xx$COMPILE_PCRE2" = "xxtrue"; then # bundled build: add its include dir, object list and define
123 |   INCLUDE_PATHS="${INCLUDE_PATHS} -IPCRE2"
124 |   LIBPCRE2="\$(LIBPCRE2)" # literal $(LIBPCRE2); presumably a make variable resolved in src/Makevars -- confirm
125 |   PCRE2_BUNDLED="-DPCRE2_BUNDLED"
126 | fi
127 | # with_simd is user-supplied text, so it is quoted to survive values containing spaces.
128 | if test "xx$with_simd" = "xxAVX2"; then
129 |   echo "Using AVX2"
130 |   INCLUDE_PATHS="$INCLUDE_PATHS -mavx2 -msse3 -msse2"
131 | # elif test xx$with_simd = "xxSSE3"; then
132 | #   echo "Using SSE3"
133 | #   INCLUDE_PATHS="$INCLUDE_PATHS -msse3 -msse2"
134 | fi
135 | 
136 | echo $ADD_LIBS
137 | echo $INCLUDE_PATHS
138 | echo $LIBPCRE2
139 | # Substitute the variables with their current values into the template listed below.
140 | AC_SUBST([ADD_LIBS])
141 | AC_SUBST([INCLUDE_PATHS])
142 | AC_SUBST([LIBPCRE2])
143 | AC_SUBST([PCRE2_BUNDLED])
144 | AC_CONFIG_FILES([src/Makevars])
145 | AC_OUTPUT
146 | 


--------------------------------------------------------------------------------
/inst/PCRE2_LICENSE.txt:
--------------------------------------------------------------------------------
 1 | PCRE2 LICENCE
 2 | -------------
 3 | 
 4 | PCRE2 is a library of functions to support regular expressions whose syntax
 5 | and semantics are as close as possible to those of the Perl 5 language.
 6 | 
 7 | Releases 10.00 and above of PCRE2 are distributed under the terms of the "BSD"
 8 | licence, as specified below, with one exemption for certain binary
 9 | redistributions. The documentation for PCRE2, supplied in the "doc" directory,
10 | is distributed under the same terms as the software itself. The data in the
11 | testdata directory is not copyrighted and is in the public domain.
12 | 
13 | The basic library functions are written in C and are freestanding. Also
14 | included in the distribution is a just-in-time compiler that can be used to
15 | optimize pattern matching. This is an optional feature that can be omitted when
16 | the library is built.
17 | 
18 | 
19 | THE BASIC LIBRARY FUNCTIONS
20 | ---------------------------
21 | 
22 | Written by:       Philip Hazel
23 | Email local part: ph10
24 | Email domain:     cam.ac.uk
25 | 
26 | University of Cambridge Computing Service,
27 | Cambridge, England.
28 | 
29 | Copyright (c) 1997-2019 University of Cambridge
30 | All rights reserved.
31 | 
32 | 
33 | PCRE2 JUST-IN-TIME COMPILATION SUPPORT
34 | --------------------------------------
35 | 
36 | Written by:       Zoltan Herczeg
37 | Email local part: hzmester
38 | Email domain:     freemail.hu
39 | 
40 | Copyright(c) 2010-2019 Zoltan Herczeg
41 | All rights reserved.
42 | 
43 | 
44 | STACK-LESS JUST-IN-TIME COMPILER
45 | --------------------------------
46 | 
47 | Written by:       Zoltan Herczeg
48 | Email local part: hzmester
49 | Email domain:     freemail.hu
50 | 
51 | Copyright(c) 2009-2019 Zoltan Herczeg
52 | All rights reserved.
53 | 
54 | 
55 | THE "BSD" LICENCE
56 | -----------------
57 | 
58 | Redistribution and use in source and binary forms, with or without
59 | modification, are permitted provided that the following conditions are met:
60 | 
61 |     * Redistributions of source code must retain the above copyright notices,
62 |       this list of conditions and the following disclaimer.
63 | 
64 |     * Redistributions in binary form must reproduce the above copyright
65 |       notices, this list of conditions and the following disclaimer in the
66 |       documentation and/or other materials provided with the distribution.
67 | 
68 |     * Neither the name of the University of Cambridge nor the names of any
69 |       contributors may be used to endorse or promote products derived from this
70 |       software without specific prior written permission.
71 | 
72 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
73 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
74 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
75 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
76 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
77 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
78 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
79 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
80 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
81 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
82 | POSSIBILITY OF SUCH DAMAGE.
83 | 
84 | 
85 | EXEMPTION FOR BINARY LIBRARY-LIKE PACKAGES
86 | ------------------------------------------
87 | 
88 | The second condition in the BSD licence (covering binary redistributions) does
89 | not apply all the way down a chain of software. If binary package A includes
90 | PCRE2, it must respect the condition, but if package B is software that
91 | includes package A, the condition is not imposed on package B unless it uses
92 | PCRE2 independently.
93 | 
94 | End
95 | 


--------------------------------------------------------------------------------
/inst/icelandic_words_500_utf8.txt:
--------------------------------------------------------------------------------
  1 | ég
  2 | að
  3 | er
  4 | það
  5 | ekki
  6 | í
  7 | og
  8 | þú
  9 | við
 10 | á
 11 | hann
 12 | þetta
 13 | hvað
 14 | sem
 15 | mér
 16 | til
 17 | með
 18 | þér
 19 | en
 20 | fyrir
 21 | af
 22 | um
 23 | þig
 24 | var
 25 | mig
 26 | því
 27 | já
 28 | hún
 29 | nei
 30 | allt
 31 | þá
 32 | ef
 33 | eru
 34 | bara
 35 | ert
 36 | svo
 37 | þeir
 38 | þið
 39 | okkur
 40 | eftir
 41 | ertu
 42 | eins
 43 | vera
 44 | hér
 45 | gera
 46 | lagi
 47 | veit
 48 | hefur
 49 | nú
 50 | frá
 51 | þegar
 52 | hvernig
 53 | fara
 54 | e
 55 | honum
 56 | út
 57 | hef
 58 | verður
 59 | aftur
 60 | upp
 61 | ekkert
 62 | vel
 63 | þessu
 64 | verið
 65 | minn
 66 | sé
 67 | svona
 68 | hver
 69 | hana
 70 | ykkur
 71 | eða
 72 | vil
 73 | hverju
 74 | komdu
 75 | get
 76 | segja
 77 | úr
 78 | hvar
 79 | erum
 80 | aldrei
 81 | hafa
 82 | eitthvað
 83 | hérna
 84 | gott
 85 | maður
 86 | viltu
 87 | hjá
 88 | þau
 89 | fá
 90 | getur
 91 | sagði
 92 | koma
 93 | inn
 94 | okkar
 95 | þarna
 96 | núna
 97 | herra
 98 | kannski
 99 | mín
100 | þarf
101 | þar
102 | hans
103 | tala
104 | þeim
105 | þess
106 | þín
107 | takk
108 | farðu
109 | henni
110 | væri
111 | þinn
112 | fer
113 | líka
114 | sjá
115 | rétt
116 | áfram
117 | held
118 | sér
119 | kemur
120 | mjög
121 | gert
122 | verð
123 | þessi
124 | vegna
125 | saman
126 | sá
127 | enn
128 | þakka
129 | einn
130 | komið
131 | gæti
132 | allir
133 | alltaf
134 | of
135 | enginn
136 | kom
137 | skal
138 | vita
139 | yfir
140 | má
141 | farið
142 | ætla
143 | dag
144 | a
145 | hafi
146 | förum
147 | þær
148 | veistu
149 | hingað
150 | heldur
151 | niður
152 | hefði
153 | sig
154 | mitt
155 | mikið
156 | átt
157 | pabbi
158 | segðu
159 | höfum
160 | bíddu
161 | sama
162 | láttu
163 | hr
164 | einhver
165 | guð
166 | heim
167 | aðeins
168 | áður
169 | segir
170 | þessa
171 | getum
172 | fyrirgefðu
173 | mamma
174 | vertu
175 | tíma
176 | finnst
177 | góður
178 | meira
179 | vill
180 | vildi
181 | i
182 | gerðu
183 | þitt
184 | verðum
185 | eitt
186 | satt
187 | hefurðu
188 | stað
189 | verða
190 | taka
191 | fór
192 | jæja
193 | veist
194 | halda
195 | ao
196 | hvert
197 | ykkar
198 | alla
199 | mun
200 | annað
201 | voru
202 | gerir
203 | sjáðu
204 | leið
205 | sagt
206 | fram
207 | vilt
208 | séð
209 | hvaða
210 | láta
211 | menn
212 | gerði
213 | öll
214 | fólk
215 | kvöld
216 | elskan
217 | hvort
218 | myndi
219 | ó
220 | einu
221 | fjandinn
222 | vinur
223 | ár
224 | síðan
225 | geri
226 | hennar
227 | hélt
228 | hættu
229 | eina
230 | halló
231 | þ
232 | pú
233 | þessum
234 | ætti
235 | vissi
236 | auðvitað
237 | sinni
238 | þeirra
239 | eruð
240 | mína
241 | alveg
242 | þannig
243 | strax
244 | þína
245 | drepa
246 | reyna
247 | þennan
248 | vinna
249 | skil
250 | án
251 | finna
252 | neitt
253 | langar
254 | morgun
255 | viss
256 | undir
257 | geturðu
258 | engin
259 | taktu
260 | öllum
261 | áttu
262 | hvers
263 | hæ
264 | gengur
265 | hugsa
266 | heyrðu
267 | kem
268 | mínum
269 | einmitt
270 | frú
271 | haltu
272 | fimm
273 | elska
274 | heldurðu
275 | þangað
276 | lengi
277 | varð
278 | eigum
279 | virðist
280 | hvenær
281 | fékk
282 | varst
283 | segi
284 | ná
285 | vilja
286 | góða
287 | gaman
288 | þarft
289 | kominn
290 | komast
291 | lengur
292 | sæll
293 | lífi
294 | eiga
295 | mann
296 | gerðist
297 | burt
298 | líður
299 | petta
300 | átti
301 | jack
302 | inni
303 | þínum
304 | þurfum
305 | frábært
306 | þykir
307 | pao
308 | öllu
309 | leitt
310 | fyrst
311 | fyrsta
312 | geta
313 | handa
314 | betur
315 | ætlarðu
316 | hafði
317 | hægt
318 | jú
319 | héðan
320 | hjálpa
321 | fyrr
322 | illa
323 | hitta
324 | málið
325 | alvöru
326 | nóg
327 | góð
328 | einhvern
329 | sinn
330 | nótt
331 | èg
332 | víst
333 | áhyggjur
334 | nema
335 | ára
336 | komum
337 | daginn
338 | ferð
339 | mál
340 | ein
341 | gegn
342 | hlýtur
343 | sjálfur
344 | ú
345 | fengið
346 | búinn
347 | r
348 | hafið
349 | langt
350 | annars
351 | leita
352 | aò
353 | tveir
354 | ættir
355 | heiti
356 | haldið
357 | heima
358 | erfitt
359 | orðið
360 | ad
361 | enga
362 | líf
363 | engar
364 | minni
365 | faðir
366 | deyja
367 | mátt
368 | gat
369 | hve
370 | à
371 | skiptir
372 | samt
373 | mínu
374 | máli
375 | mínútur
376 | vio
377 | hætta
378 | gerum
379 | sýna
380 | tekur
381 | spyrja
382 | meðan
383 | heyra
384 | tvö
385 | gefa
386 | bless
387 | skilurðu
388 | heitir
389 | vorum
390 | mínir
391 | ætlar
392 | klukkan
393 | kann
394 | hvern
395 | john
396 | fær
397 | maðurinn
398 | gerast
399 | gangi
400 | h
401 | allar
402 | fæ
403 | sagðir
404 | ganga
405 | vantar
406 | fínt
407 | tíu
408 | komst
409 | nokkuð
410 | stundum
411 | baka
412 | lítur
413 | aõ
414 | pér
415 | hjálp
416 | tók
417 | skjóta
418 | alls
419 | uppi
420 | sért
421 | sérðu
422 | góðan
423 | látið
424 | sex
425 | stendur
426 | gerist
427 | þinni
428 | fann
429 | færð
430 | heyrt
431 | nota
432 | trúi
433 | skilið
434 | allan
435 | líkar
436 | afsakið
437 | engan
438 | fleiri
439 | mömmu
440 | þótt
441 | líklega
442 | kona
443 | manni
444 | þessari
445 | segirðu
446 | sjáumst
447 | jafnvel
448 | hlustaðu
449 | úti
450 | árum
451 | verði
452 | tvær
453 | þekki
454 | bílinn
455 | man
456 | hefðir
457 | fannst
458 | hugmynd
459 | afsakaðu
460 | lokið
461 | milli
462 | daga
463 | g
464 | konan
465 | varstu
466 | kemst
467 | tekið
468 | sú
469 | yrði
470 | sonur
471 | byrja
472 | bíða
473 | tími
474 | búið
475 | tvo
476 | leyfðu
477 | hví
478 | nógu
479 | fjandans
480 | manstu
481 | besta
482 | félagi
483 | þó
484 | ö
485 | værir
486 | engu
487 | eg
488 | betra
489 | new
490 | ha
491 | eigin
492 | hafðu
493 | annan
494 | kalla
495 | næstum
496 | hátt
497 | vitum
498 | beint
499 | dálítið
500 | peninga
501 | 


--------------------------------------------------------------------------------
/inst/include/sf_external.h:
--------------------------------------------------------------------------------
 1 | #ifndef SF_EXTERNAL_H
 2 | #define SF_EXTERNAL_H
 3 | // Client-side wrappers around the C-callable routines looked up under the "stringfish" package name.
 4 | #include <R_ext/Rdynload.h>
 5 | #include <Rcpp.h>
 6 | #include "sf_internal.h"
 7 | using namespace Rcpp;
 8 | // Each wrapper resolves its routine through R_GetCCallable once and keeps the pointer in a function-local static.
 9 | inline std::string get_string_type(SEXP x) {static std::string(*fun)(SEXP) = (std::string(*)(SEXP)) R_GetCCallable("stringfish", "get_string_type");return fun(x);}
10 | // The wrappers are inline so several translation units can include this header without duplicate-symbol link errors.
11 | inline SEXP materialize(SEXP x) {static SEXP(*fun)(SEXP) = (SEXP(*)(SEXP)) R_GetCCallable("stringfish", "materialize");return fun(x);}
12 | inline SEXP sf_vector(size_t len) {static SEXP(*fun)(size_t) = (SEXP(*)(size_t)) R_GetCCallable("stringfish", "sf_vector");return fun(len);}
13 | inline sf_vec_data & sf_vec_data_ref(SEXP x) {static sf_vec_data &(*fun)(SEXP) = (sf_vec_data &(*)(SEXP)) R_GetCCallable("stringfish", "sf_vec_data_ref");return fun(x);}
14 | inline void sf_assign(SEXP x, size_t i, SEXP e) {static void(*fun)(SEXP, size_t, SEXP) = (void(*)(SEXP, size_t, SEXP)) R_GetCCallable("stringfish", "sf_assign");return fun(x, i, e);}
15 | inline SEXP sf_iconv(SEXP x, std::string from, std::string to) {static SEXP(*fun)(SEXP, std::string, std::string) = (SEXP(*)(SEXP, std::string, std::string)) R_GetCCallable("stringfish", "sf_iconv");return fun(x, from, to);}
16 | inline SEXP convert_to_sf(SEXP x) {static SEXP(*fun)(SEXP) = (SEXP(*)(SEXP)) R_GetCCallable("stringfish", "convert_to_sf");return fun(x);}
17 | 
18 | inline SEXP sf_readLines(std::string filename, std::string encoding = "UTF-8") {static SEXP(*fun)(std::string, std::string) = (SEXP(*)(std::string, std::string)) R_GetCCallable("stringfish", "sf_readLines");return fun(filename, encoding);}
19 | inline void sf_writeLines(SEXP text, const std::string file, const std::string sep = "\n", const std::string na_value = "NA", const std::string encode_mode = "UTF-8") 
20 |   {static void(*fun)(SEXP, const std::string, const std::string, const std::string, const std::string encode_mode) = (void(*)(SEXP, const std::string, const std::string, const std::string, const std::string encode_mode)) R_GetCCallable("stringfish", "sf_writeLines");return fun(text, file, sep, na_value, encode_mode);}
21 | 
22 | inline IntegerVector sf_nchar(SEXP obj, std::string type = "chars") {static IntegerVector(*fun)(SEXP, std::string) = (IntegerVector(*)(SEXP, std::string)) R_GetCCallable("stringfish", "sf_nchar");return fun(obj, type);}
23 | inline sfstring sf_substr_internal(const char * x, const int len, const cetype_t type, int start, int stop) {static sfstring(*fun)(const char *, const int, const cetype_t, int, int) = (sfstring(*)(const char *, const int, const cetype_t, int, int)) R_GetCCallable("stringfish", "sf_substr_internal");return fun(x, len, type, start, stop);}
24 | inline SEXP sf_substr(SEXP x, IntegerVector start, IntegerVector stop) {static SEXP(*fun)(SEXP, IntegerVector, IntegerVector) = (SEXP(*)(SEXP, IntegerVector, IntegerVector)) R_GetCCallable("stringfish", "sf_substr");return fun(x, start, stop);}
25 | inline SEXP c_sf_paste(List dots, SEXP sep) {static SEXP(*fun)(List, SEXP) = (SEXP(*)(List, SEXP)) R_GetCCallable("stringfish", "c_sf_paste");return fun(dots, sep);}
26 | inline SEXP sf_collapse(SEXP x, SEXP collapse) {static SEXP(*fun)(SEXP, SEXP) = (SEXP(*)(SEXP, SEXP)) R_GetCCallable("stringfish", "sf_collapse");return fun(x, collapse);}
27 | 
28 | inline LogicalVector sf_grepl(SEXP subject, SEXP pattern, const std::string encode_mode = "auto", const bool fixed = false) {static LogicalVector(*fun)(SEXP, SEXP, const std::string, const bool) = (LogicalVector(*)(SEXP, SEXP, const std::string, const bool)) R_GetCCallable("stringfish", "sf_grepl");return fun(subject, pattern, encode_mode, fixed);}
29 | inline SEXP sf_split(SEXP subject, SEXP split, const std::string encode_mode = "auto", const bool fixed = false) {static SEXP(*fun)(SEXP, SEXP, const std::string, const bool) = (SEXP(*)(SEXP, SEXP, const std::string, const bool)) R_GetCCallable("stringfish", "sf_split");return fun(subject, split, encode_mode, fixed);}
30 | inline SEXP sf_gsub(SEXP subject, SEXP pattern, SEXP replacement, const std::string encode_mode = "auto", const bool fixed = false) {static SEXP(*fun)(SEXP, SEXP, SEXP, const std::string, const bool) = (SEXP(*)(SEXP, SEXP, SEXP, const std::string, const bool)) R_GetCCallable("stringfish", "sf_gsub");return fun(subject, pattern, replacement, encode_mode, fixed);}
31 | inline SEXP random_strings(const int N, const int string_size = 50, std::string charset = "abcdefghijklmnopqrstuvwxyz", std::string vector_mode = "stringfish") {static SEXP(*fun)(const int, const int, std::string, std::string) = (SEXP(*)(const int, const int, std::string, std::string)) R_GetCCallable("stringfish", "sf_random_strings");return fun(N, string_size, charset, vector_mode);} // NOTE(review): looks up "sf_random_strings" while the wrapper is named random_strings -- confirm the registered name
32 | inline SEXP sf_toupper(SEXP x) {static SEXP(*fun)(SEXP) = (SEXP(*)(SEXP)) R_GetCCallable("stringfish", "sf_toupper");return fun(x);}
33 | inline SEXP sf_tolower(SEXP x) {static SEXP(*fun)(SEXP) = (SEXP(*)(SEXP)) R_GetCCallable("stringfish", "sf_tolower");return fun(x);}
34 | inline IntegerVector sf_match(SEXP x, SEXP table) {static IntegerVector(*fun)(SEXP, SEXP) = (IntegerVector(*)(SEXP, SEXP)) R_GetCCallable("stringfish", "sf_match");return fun(x, table);}
35 | 
36 | #endif // include guard


--------------------------------------------------------------------------------
/man/convert_to_sf.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{convert_to_sf}
 4 | \alias{convert_to_sf}
 5 | \alias{sf_convert}
 6 | \title{convert_to_sf}
 7 | \usage{
 8 | convert_to_sf(x)
 9 | 
10 | sf_convert(x)
11 | }
12 | \arguments{
13 | \item{x}{A character vector}
14 | }
15 | \value{
16 | The converted character vector
17 | }
18 | \description{
19 | Converts a character vector to a stringfish vector
20 | }
21 | \details{
22 | Converts a character vector to a stringfish vector. The opposite of `materialize`.
23 | }
24 | \examples{
25 | if(getRversion() >= "3.5.0") {
26 | x <- convert_to_sf(letters)
27 | }
28 | }
29 | 


--------------------------------------------------------------------------------
/man/get_string_type.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{get_string_type}
 4 | \alias{get_string_type}
 5 | \title{get_string_type}
 6 | \usage{
 7 | get_string_type(x)
 8 | }
 9 | \arguments{
10 | \item{x}{the vector}
11 | }
12 | \value{
13 | The type of vector
14 | }
15 | \description{
16 | Returns the type of the character vector
17 | }
18 | \details{
19 | A function that returns the type of character vector. Possible values are "normal vector", "stringfish vector", "stringfish vector (materialized)" or "other alt-rep vector"
20 | }
21 | \examples{
22 | if(getRversion() >= "3.5.0") {
23 | x <- sf_vector(10)
24 | get_string_type(x) # returns "stringfish vector"
25 | x <- character(10)
26 | get_string_type(x) # returns "normal vector"
27 | }
28 | }
29 | 


--------------------------------------------------------------------------------
/man/materialize.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{materialize}
 4 | \alias{materialize}
 5 | \title{materialize}
 6 | \usage{
 7 | materialize(x)
 8 | }
 9 | \arguments{
10 | \item{x}{An alt-rep object}
11 | }
12 | \value{
13 | x
14 | }
15 | \description{
16 | Materializes an alt-rep object
17 | }
18 | \details{
19 | Materializes any alt-rep object and then returns it. 
20 | Note: the object is materialized regardless of whether the return value is assigned to a variable.
21 | }
22 | \examples{
23 | if(getRversion() >= "3.5.0") {
24 | x <- sf_vector(10)
25 | sf_assign(x, 1, "hello world")
26 | sf_assign(x, 2, "another string")
27 | x <- materialize(x)
28 | }
29 | }
30 | 


--------------------------------------------------------------------------------
/man/random_strings.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{random_strings}
 4 | \alias{random_strings}
 5 | \title{random_strings}
 6 | \usage{
 7 | random_strings(N, string_size = 50, charset = "abcdefghijklmnopqrstuvwxyz", 
 8 |                       vector_mode = "stringfish")
 9 | }
10 | \arguments{
11 | \item{N}{The number of strings to generate}
12 | 
13 | \item{string_size}{The length of the strings}
14 | 
15 | \item{charset}{The characters used to generate the random strings (default: abcdefghijklmnopqrstuvwxyz)}
16 | 
17 | \item{vector_mode}{The type of character vector to generate (either stringfish or normal, default: stringfish)}
18 | }
19 | \value{
20 | A character vector of the random strings
21 | }
22 | \description{
23 | A function that generates random strings
24 | }
25 | \details{
26 | The function uses the PCRE2 library, which is also used internally by R. 
 27 | Note: the order of parameters is switched compared to the `gsub` base R function, with subject being first. 
28 | See also: https://www.pcre.org/current/doc/html/pcre2api.html for more documentation on match syntax.
29 | }
30 | \examples{
31 | if(getRversion() >= "3.5.0") {
32 | set.seed(1)
33 | x <- random_strings(1e6, 80, "ACGT", vector_mode = "stringfish")
34 | }
35 | }
36 | \seealso{
37 | gsub
38 | }
39 | 


--------------------------------------------------------------------------------
/man/sf_assign.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_assign}
 4 | \alias{sf_assign}
 5 | \title{sf_assign}
 6 | \usage{
 7 | sf_assign(x, i, e)
 8 | }
 9 | \arguments{
10 | \item{x}{the vector}
11 | 
12 | \item{i}{the index to assign to}
13 | 
14 | \item{e}{the new string to replace at i in x}
15 | }
16 | \value{
17 | No return value, the function assigns an element to an existing stringfish vector
18 | }
19 | \description{
20 | Assigns a new string to a stringfish vector or any other character vector
21 | }
22 | \details{
 23 | A function to assign a new element to an existing character vector. If the vector is a stringfish vector, it does so without materialization.
24 | }
25 | \examples{
26 | if(getRversion() >= "3.5.0") {
27 | x <- sf_vector(10)
28 | sf_assign(x, 1, "hello world")
29 | sf_assign(x, 2, "another string")
30 | }
31 | }
32 | 


--------------------------------------------------------------------------------
/man/sf_collapse.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_collapse}
 4 | \alias{sf_collapse}
 5 | \title{sf_collapse}
 6 | \usage{
 7 | sf_collapse(x, collapse)
 8 | }
 9 | \arguments{
10 | \item{x}{A character vector}
11 | 
12 | \item{collapse}{A single string}
13 | }
14 | \value{
15 | A single string with all values in `x` pasted together, separated by `collapse`.
16 | }
17 | \description{
18 | Pastes a series of strings together separated by the `collapse` parameter
19 | }
20 | \details{
21 | This works the same way as `paste0(x, collapse=collapse)`
22 | }
23 | \examples{
24 | if(getRversion() >= "3.5.0") {
25 | x <- c("hello", "\\\\xe4\\\\xb8\\\\x96\\\\xe7\\\\x95\\\\x8c")
26 | Encoding(x) <- "UTF-8"
27 | sf_collapse(x, " ") # "hello world" in Japanese
28 | sf_collapse(letters, "") # returns the alphabet
29 | }
30 | }
31 | \seealso{
32 | paste0, paste
33 | }
34 | 


--------------------------------------------------------------------------------
/man/sf_compare.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_compare}
 4 | \alias{sf_compare}
 5 | \alias{sf_equals}
 6 | \title{sf_compare}
 7 | \usage{
 8 | sf_compare(x, y, nthreads = getOption("stringfish.nthreads", 1L))
 9 | 
10 | sf_equals(x, y, nthreads = getOption("stringfish.nthreads", 1L))
11 | }
12 | \arguments{
13 | \item{x}{A character vector of length 1 or the same non-zero length as y}
14 | 
 15 | \item{y}{Another character vector of length 1 or the same non-zero length as x}
16 | 
17 | \item{nthreads}{Number of threads to use}
18 | }
19 | \value{
20 | A logical vector
21 | }
22 | \description{
23 | Returns a logical vector testing equality of strings from two string vectors
24 | }
25 | \details{
26 | Note: the function tests for both string and encoding equality
27 | }
28 | \examples{
29 | if(getRversion() >= "3.5.0") {
30 | sf_compare(letters, "a")
31 | }
32 | }
33 | 


--------------------------------------------------------------------------------
/man/sf_concat.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_concat}
 4 | \alias{sf_concat}
 5 | \alias{sfc}
 6 | \title{sf_concat}
 7 | \usage{
 8 | sf_concat(...)
 9 | 
10 | sfc(...)
11 | }
12 | \arguments{
13 | \item{...}{Any number of vectors, coerced to character vector if necessary}
14 | }
15 | \value{
16 | A concatenated stringfish vector
17 | }
18 | \description{
19 | Appends vectors together
20 | }
21 | \examples{
22 | if(getRversion() >= "3.5.0") {
23 | sf_concat(letters, 1:5)
24 | }
25 | }
26 | 


--------------------------------------------------------------------------------
/man/sf_ends.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_ends}
 4 | \alias{sf_ends}
 5 | \title{sf_ends}
 6 | \usage{
 7 | sf_ends(subject, pattern, ...)
 8 | }
 9 | \arguments{
10 | \item{subject}{A character vector}
11 | 
 12 | \item{pattern}{A string to look for at the end}
13 | 
14 | \item{...}{Parameters passed to sf_grepl}
15 | }
16 | \value{
 17 | A logical vector: true if there is a match, false if there is no match, NA if the subject was NA
18 | }
19 | \description{
20 | A function for detecting a pattern at the end of a string
21 | }
22 | \examples{
23 | if(getRversion() >= "3.5.0") {
24 | x <- c("alpha", "beta", "gamma", "delta", "epsilon")
25 | sf_ends(x, "a")
26 | }
27 | }
28 | \seealso{
29 | endsWith, sf_starts
30 | }
31 | 


--------------------------------------------------------------------------------
/man/sf_grepl.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_grepl}
 4 | \alias{sf_grepl}
 5 | \title{sf_grepl}
 6 | \usage{
 7 | sf_grepl(subject, pattern, encode_mode = "auto", fixed = FALSE, 
 8 | nthreads = getOption("stringfish.nthreads", 1L))
 9 | }
10 | \arguments{
11 | \item{subject}{The subject character vector to search}
12 | 
13 | \item{pattern}{The pattern to search for}
14 | 
 15 | \item{encode_mode}{"auto", "UTF-8" or "byte". Determines whether multi-byte (UTF-8) characters or single-byte characters are used.}
16 | 
17 | \item{fixed}{determines whether the pattern parameter should be interpreted literally or as a regular expression}
18 | 
19 | \item{nthreads}{Number of threads to use}
20 | }
21 | \value{
22 | A logical vector with the same length as subject
23 | }
24 | \description{
25 | A function that matches patterns and returns a logical vector
26 | }
27 | \details{
28 | The function uses the PCRE2 library, which is also used internally by R. 
29 | The encoding is based on the pattern string (or forced via the encode_mode parameter). 
 30 | Note: the order of parameters is switched compared to the `grepl` base R function, with subject being first. 
31 | See also: https://www.pcre.org/current/doc/html/pcre2api.html for more documentation on match syntax.
32 | }
33 | \examples{
34 | if(getRversion() >= "3.5.0") {
35 | x <- sf_vector(10)
36 | sf_assign(x, 1, "hello world")
37 | pattern <- "^hello"
38 | sf_grepl(x, pattern)
39 | }
40 | }
41 | \seealso{
42 | grepl
43 | }
44 | 


--------------------------------------------------------------------------------
/man/sf_gsub.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_gsub}
 4 | \alias{sf_gsub}
 5 | \title{sf_gsub}
 6 | \usage{
 7 | sf_gsub(subject, pattern, replacement, encode_mode = "auto", fixed = FALSE, 
 8 | nthreads = getOption("stringfish.nthreads", 1L))
 9 | }
10 | \arguments{
11 | \item{subject}{The subject character vector to search}
12 | 
13 | \item{pattern}{The pattern to search for}
14 | 
15 | \item{replacement}{The replacement string}
16 | 
17 | \item{encode_mode}{"auto", "UTF-8" or "byte". Determines whether multi-byte (UTF-8) characters or single-byte characters are used.}
18 | 
19 | \item{fixed}{determines whether the pattern parameter should be interpreted literally or as a regular expression}
20 | 
21 | \item{nthreads}{Number of threads to use}
22 | }
23 | \value{
24 | A stringfish vector of the replacement string
25 | }
26 | \description{
27 | A function that performs pattern substitution
28 | }
29 | \details{
30 | The function uses the PCRE2 library, which is also used internally by R. However, syntax may be slightly different. 
31 | E.g.: capture groups: "\1" in R, but "$1" in PCRE2 (as in Perl). 
32 | The encoding of the output is determined by the pattern (or forced using encode_mode parameter) and encodings should be compatible. 
33 | E.g: mixing ASCII and UTF-8 is okay, but not UTF-8 and latin1. 
34 | Note: the order of parameters is switched compared to the `gsub` base R function, with subject being first. 
35 | See also: https://www.pcre.org/current/doc/html/pcre2api.html for more documentation on match syntax.
36 | }
37 | \examples{
38 | if(getRversion() >= "3.5.0") {
39 | x <- "hello world"
40 | pattern <- "^hello (.+)"
41 | replacement <- "goodbye $1"
42 | sf_gsub(x, pattern, replacement)
43 | }
44 | }
45 | \seealso{
46 | gsub
47 | }
48 | 


--------------------------------------------------------------------------------
/man/sf_iconv.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_iconv}
 4 | \alias{sf_iconv}
 5 | \title{sf_iconv}
 6 | \usage{
 7 | sf_iconv(x, from, to, nthreads = getOption("stringfish.nthreads", 1L))
 8 | }
 9 | \arguments{
10 | \item{x}{An alt-rep object}
11 | 
12 | \item{from}{the encoding to assume of `x`}
13 | 
14 | \item{nthreads}{Number of threads to use}
15 | 
16 | \item{to}{the new encoding}
17 | }
18 | \value{
19 | the converted character vector as a stringfish vector
20 | }
21 | \description{
22 | Converts encoding of one character vector to another
23 | }
24 | \details{
25 | This is an analogue to the base R function `iconv`. It converts a string from one encoding (e.g. latin1 or UTF-8) to another
26 | }
27 | \examples{
28 | if(getRversion() >= "3.5.0") {
29 | x <- "fa\xE7ile"
30 | Encoding(x) <- "latin1"
31 | sf_iconv(x, "latin1", "UTF-8")
32 | }
33 | }
34 | \seealso{
35 | iconv
36 | }
37 | 


--------------------------------------------------------------------------------
/man/sf_match.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_match}
 4 | \alias{sf_match}
 5 | \title{sf_match}
 6 | \usage{
 7 | sf_match(x, table, nthreads = getOption("stringfish.nthreads", 1L))
 8 | }
 9 | \arguments{
10 | \item{x}{A character vector to search for in table}
11 | 
12 | \item{table}{A character vector to be matched against x}
13 | 
14 | \item{nthreads}{Number of threads to use}
15 | }
16 | \value{
17 | An integer vector of the indices of each x element's position in table
18 | }
19 | \description{
20 | Returns a vector of the positions of x in table
21 | }
22 | \details{
23 | Note: similarly to the base R function, long "table" vectors are not supported. This is due to the maximum integer value that can be returned (`.Machine$integer.max`)
24 | }
25 | \examples{
26 | if(getRversion() >= "3.5.0") {
27 | sf_match("c", letters)
28 | }
29 | }
30 | \seealso{
31 | match
32 | }
33 | 


--------------------------------------------------------------------------------
/man/sf_nchar.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_nchar}
 4 | \alias{sf_nchar}
 5 | \title{sf_nchar}
 6 | \usage{
 7 | sf_nchar(x, type = "chars", nthreads = getOption("stringfish.nthreads", 1L))
 8 | }
 9 | \arguments{
10 | \item{x}{A character vector}
11 | 
12 | \item{type}{The type of counting to perform ("chars" or "bytes", default: "chars")}
13 | 
14 | \item{nthreads}{Number of threads to use}
15 | }
16 | \value{
17 | An integer vector of the number of characters
18 | }
19 | \description{
20 | Counts the number of characters in a character vector
21 | }
22 | \details{
23 | Returns the number of characters per string. The type of counting only matters for UTF-8 strings, where a character can be represented by multiple bytes.
24 | }
25 | \examples{
26 | if(getRversion() >= "3.5.0") {
27 | x <- "fa\xE7ile"
28 | Encoding(x) <- "latin1"
29 | x <- sf_iconv(x, "latin1", "UTF-8")
30 | }
31 | }
32 | \seealso{
33 | nchar
34 | }
35 | 


--------------------------------------------------------------------------------
/man/sf_paste.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_paste}
 4 | \alias{sf_paste}
 5 | \title{sf_paste}
 6 | \usage{
 7 | sf_paste(..., sep = "", nthreads = getOption("stringfish.nthreads", 1L))
 8 | }
 9 | \arguments{
10 | \item{...}{Any number of character vector strings}
11 | 
12 | \item{sep}{The separating string between strings}
13 | 
14 | \item{nthreads}{Number of threads to use}
15 | }
16 | \value{
17 | A character vector where elements of the arguments are pasted together
18 | }
19 | \description{
20 | Pastes a series of strings together
21 | }
22 | \details{
23 | This works the same way as `paste(..., sep=sep)`
24 | }
25 | \examples{
26 | if(getRversion() >= "3.5.0") {
27 | x <- letters
28 | y <- LETTERS
29 | sf_paste(x,y, sep = ":")
30 | }
31 | }
32 | \seealso{
33 | paste0, paste
34 | }
35 | 


--------------------------------------------------------------------------------
/man/sf_readLines.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_readLines}
 4 | \alias{sf_readLines}
 5 | \title{sf_readLines}
 6 | \usage{
 7 | sf_readLines(file, encoding = "UTF-8")
 8 | }
 9 | \arguments{
10 | \item{file}{The file name}
11 | 
12 | \item{encoding}{The encoding to use (Default: UTF-8)}
13 | }
14 | \value{
15 | A stringfish vector of the lines in a file
16 | }
17 | \description{
18 | A function that reads a file line by line
19 | }
20 | \details{
21 | A function for reading in text data using `std::ifstream`.
22 | }
23 | \examples{
24 | if(getRversion() >= "3.5.0") {
25 | file <- tempfile()
26 | sf_writeLines(letters, file)
27 | sf_readLines(file)
28 | }
29 | }
30 | \seealso{
31 | readLines
32 | }
33 | 


--------------------------------------------------------------------------------
/man/sf_split.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_split}
 4 | \alias{sf_split}
 5 | \title{sf_split}
 6 | \usage{
 7 | sf_split(subject, split, encode_mode = "auto", fixed = FALSE, 
 8 | nthreads = getOption("stringfish.nthreads", 1L))
 9 | }
10 | \arguments{
11 | \item{subject}{A character vector}
12 | 
13 | \item{split}{A delimiter to split the string by}
14 | 
15 | \item{encode_mode}{"auto", "UTF-8" or "byte". Determines whether multi-byte (UTF-8) characters or single-byte characters are used.}
16 | 
17 | \item{fixed}{determines whether the split parameter should be interpreted literally or as a regular expression}
18 | 
19 | \item{nthreads}{Number of threads to use}
20 | }
21 | \value{
22 | A list of stringfish character vectors
23 | }
24 | \description{
25 | A function to split strings by a delimiter
26 | }
27 | \examples{
28 | if(getRversion() >= "3.5.0") {
29 | sf_split(datasets::state.name, "\\\\s") # split U.S. state names by any space character
30 | }
31 | }
32 | \seealso{
33 | strsplit
34 | }
35 | 


--------------------------------------------------------------------------------
/man/sf_starts.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_starts}
 4 | \alias{sf_starts}
 5 | \title{sf_starts}
 6 | \usage{
 7 | sf_starts(subject, pattern, ...)
 8 | }
 9 | \arguments{
10 | \item{subject}{A character vector}
11 | 
12 | \item{pattern}{A string to look for at the start}
13 | 
14 | \item{...}{Parameters passed to sf_grepl}
15 | }
16 | \value{
17 | A logical vector true if there is a match, false if no match, NA if the subject was NA
18 | }
19 | \description{
20 | A function for detecting a pattern at the start of a string
21 | }
22 | \examples{
23 | if(getRversion() >= "3.5.0") {
24 | x <- c("alpha", "beta", "gamma", "delta", "epsilon")
25 | sf_starts(x, "a")
26 | }
27 | }
28 | \seealso{
29 | startsWith, sf_ends
30 | }
31 | 


--------------------------------------------------------------------------------
/man/sf_substr.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_substr}
 4 | \alias{sf_substr}
 5 | \title{sf_substr}
 6 | \usage{
 7 | sf_substr(x, start, stop, nthreads = getOption("stringfish.nthreads", 1L))
 8 | }
 9 | \arguments{
10 | \item{x}{A character vector}
11 | 
12 | \item{start}{The beginning to extract from}
13 | 
14 | \item{stop}{The end to extract from}
15 | 
16 | \item{nthreads}{Number of threads to use}
17 | }
18 | \value{
19 | A stringfish vector of substrings
20 | }
21 | \description{
22 | Extracts substrings from a character vector
23 | }
24 | \details{
25 | This works the same way as `substr`, but in addition allows negative indexing. 
26 | Negative indices count backwards from the end of the string, with -1 being the last character.
27 | }
28 | \examples{
29 | if(getRversion() >= "3.5.0") {
30 | x <- c("fa\xE7ile", "hello world")
31 | Encoding(x) <- "latin1"
32 | x <- sf_iconv(x, "latin1", "UTF-8")
33 | sf_substr(x, 4, -1) # extracts from the 4th character to the last
34 | ## [1] "ile"  "lo world"
35 | }
36 | }
37 | \seealso{
38 | substr
39 | }
40 | 


--------------------------------------------------------------------------------
/man/sf_tolower.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_tolower}
 4 | \alias{sf_tolower}
 5 | \title{sf_tolower}
 6 | \usage{
 7 | sf_tolower(x)
 8 | }
 9 | \arguments{
10 | \item{x}{A character vector}
11 | }
12 | \value{
13 | A stringfish vector where all uppercase is converted to lowercase
14 | }
15 | \description{
16 | A function converting a string to all lowercase
17 | }
18 | \details{
19 | Note: the function only converts ASCII characters.
20 | }
21 | \examples{
22 | if(getRversion() >= "3.5.0") {
23 | x <- LETTERS
24 | sf_tolower(x)
25 | }
26 | }
27 | \seealso{
28 | tolower
29 | }
30 | 


--------------------------------------------------------------------------------
/man/sf_toupper.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_toupper}
 4 | \alias{sf_toupper}
 5 | \title{sf_toupper}
 6 | \usage{
 7 | sf_toupper(x)
 8 | }
 9 | \arguments{
10 | \item{x}{A character vector}
11 | }
12 | \value{
13 | A stringfish vector where all lowercase is converted to uppercase
14 | }
15 | \description{
16 | A function converting a string to all uppercase
17 | }
18 | \details{
19 | Note: the function only converts ASCII characters.
20 | }
21 | \examples{
22 | if(getRversion() >= "3.5.0") {
23 | x <- letters
24 | sf_toupper(x)
25 | }
26 | }
27 | \seealso{
28 | toupper
29 | }
30 | 


--------------------------------------------------------------------------------
/man/sf_trim.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_trim}
 4 | \alias{sf_trim}
 5 | \title{sf_trim}
 6 | \usage{
 7 | sf_trim(subject, which = c("both", "left", "right"), whitespace = "[ \\\\t\\\\r\\\\n]", ...)
 8 | }
 9 | \arguments{
10 | \item{subject}{A character vector}
11 | 
12 | \item{which}{"both", "left", or "right" determines which white space is removed}
13 | 
14 | \item{whitespace}{Whitespace characters (default: "[ \\\\t\\\\r\\\\n]")}
15 | 
16 | \item{...}{Parameters passed to sf_gsub}
17 | }
18 | \value{
19 | A stringfish vector with leading and/or trailing whitespace removed
20 | }
21 | \description{
22 | A function to remove leading/trailing whitespace
23 | }
24 | \examples{
25 | if(getRversion() >= "3.5.0") {
26 | x <- c(" alpha ", " beta", " gamma ", "delta ", "epsilon ")
27 | sf_trim(x)
28 | }
29 | }
30 | \seealso{
31 | trimws
32 | }
33 | 


--------------------------------------------------------------------------------
/man/sf_vector.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_vector}
 4 | \alias{sf_vector}
 5 | \title{sf_vector}
 6 | \usage{
 7 | sf_vector(len)
 8 | }
 9 | \arguments{
10 | \item{len}{length of the new vector}
11 | }
12 | \value{
13 | A new (empty) stringfish vector
14 | }
15 | \description{
16 | Creates a new stringfish vector
17 | }
18 | \details{
19 | This function creates a new stringfish vector, an alt-rep character vector backed by a C++ "std::vector" as the internal memory representation. 
20 | The vector type is "sfstring", which is a simple C++ class containing a "std::string" and a single byte (uint8_t) representing the encoding.
21 | }
22 | \examples{
23 | if(getRversion() >= "3.5.0") {
24 | x <- sf_vector(10)
25 | sf_assign(x, 1, "hello world")
26 | sf_assign(x, 2, "another string")
27 | }
28 | }
29 | 


--------------------------------------------------------------------------------
/man/sf_writeLines.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{sf_writeLines}
 4 | \alias{sf_writeLines}
 5 | \title{sf_writeLines}
 6 | \usage{
 7 | sf_writeLines(text, file, sep = "\n", na_value = "NA", encode_mode = "UTF-8")
 8 | }
 9 | \arguments{
10 | \item{text}{A character vector to write to file}
11 | 
12 | \item{file}{Name of the file to write to}
13 | 
14 | \item{sep}{The line separator character(s)}
15 | 
16 | \item{na_value}{What to write in case of a NA string}
17 | 
18 | \item{encode_mode}{"UTF-8" or "byte". If "UTF-8", all strings are re-encoded as UTF-8.}
19 | }
20 | \description{
21 | A function that writes text to a file line by line
22 | }
23 | \details{
24 | A function for writing text data using `std::ofstream`.
25 | }
26 | \examples{
27 | if(getRversion() >= "3.5.0") {
28 | file <- tempfile()
29 | sf_writeLines(letters, file)
30 | sf_readLines(file)
31 | }
32 | }
33 | \seealso{
34 | writeLines
35 | }
36 | 


--------------------------------------------------------------------------------
/man/string_identical.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zz_help_files.R
 3 | \name{string_identical}
 4 | \alias{string_identical}
 5 | \title{string_identical}
 6 | \usage{
 7 | string_identical(x, y)
 8 | }
 9 | \arguments{
10 | \item{x}{A character vector}
11 | 
12 | \item{y}{Another character vector to compare to x}
13 | }
14 | \value{
15 | TRUE if strings are identical, including encoding
16 | }
17 | \description{
18 | A stricter comparison of string equality
19 | }
20 | \examples{
21 | x <- "fa\xE7ile"
22 | Encoding(x) <- "latin1"
23 | y <- iconv(x, "latin1", "UTF-8")
24 | identical(x, y) # TRUE
25 | string_identical(x, y) # FALSE
26 | }
27 | \seealso{
28 | identical
29 | }
30 | 


--------------------------------------------------------------------------------
/src/Makevars.in:
--------------------------------------------------------------------------------
 1 | # Template for the package's src/Makevars (compiled-code build settings).
 2 | # Tokens of the form @NAME@ are placeholders; presumably the package's
 3 | # configure script substitutes them -- confirm against configure.ac.
 4 | 
 5 | # Preprocessor flags: Rcpp options, 8-bit PCRE2 code units, use of config.h.
 6 | PKG_CPPFLAGS=-DRCPP_USE_UNWIND_PROTECT -DRCPP_NO_RTTI -DPCRE2_CODE_UNIT_WIDTH=8 -DHAVE_CONFIG_H @PCRE2_BUNDLED@ -I. @INCLUDE_PATHS@
 7 | # Compiler and linker flags are queried from RcppParallel at build time. The
 8 | # Rscript path is quoted because R_HOME may contain spaces.
 9 | PKG_CXXFLAGS = $(shell "${R_HOME}/bin/Rscript" -e "RcppParallel::CxxFlags()")
10 | PKG_LIBS=-lpthread -L. -lSFPCRE2 @ADD_LIBS@ $(shell "${R_HOME}/bin/Rscript" -e "RcppParallel::RcppParallelLibs()")
11 | 
12 | # Object files built from the PCRE2 sources shipped in src/PCRE2.
13 | LIBPCRE2 = PCRE2/pcre2_chartables.o \
14 | 	PCRE2/pcre2_auto_possess.o \
15 | 	PCRE2/pcre2_compile.o \
16 | 	PCRE2/pcre2_config.o \
17 | 	PCRE2/pcre2_context.o \
18 | 	PCRE2/pcre2_convert.o \
19 | 	PCRE2/pcre2_dfa_match.o \
20 | 	PCRE2/pcre2_error.o \
21 | 	PCRE2/pcre2_extuni.o \
22 | 	PCRE2/pcre2_find_bracket.o \
23 | 	PCRE2/pcre2_jit_compile.o \
24 | 	PCRE2/pcre2_maketables.o \
25 | 	PCRE2/pcre2_match.o \
26 | 	PCRE2/pcre2_match_data.o \
27 | 	PCRE2/pcre2_newline.o \
28 | 	PCRE2/pcre2_ord2utf.o \
29 | 	PCRE2/pcre2_pattern_info.o \
30 | 	PCRE2/pcre2_script_run.o \
31 | 	PCRE2/pcre2_serialize.o \
32 | 	PCRE2/pcre2_string_utils.o \
33 | 	PCRE2/pcre2_study.o \
34 | 	PCRE2/pcre2_substitute.o \
35 | 	PCRE2/pcre2_substring.o \
36 | 	PCRE2/pcre2_tables.o \
37 | 	PCRE2/pcre2_ucd.o \
38 | 	PCRE2/pcre2_valid_utf.o \
39 | 	PCRE2/pcre2_xclass.o \
40 | 	PCRE2/pcre2_is_bundled.o
41 | 
42 | # Object file of the PCRE2 wrapper.
43 | PCRE2_wrapper = PCRE2_wrapper/pcre2_wrapper.o
44 | 
45 | # PKG_LIBS links against -lSFPCRE2, so the archive must be built first.
46 | $(SHLIB): libSFPCRE2.a
47 | 
48 | # Bundle the configure-selected PCRE2 objects and the wrapper into one archive.
49 | libSFPCRE2.a: @LIBPCRE2@ $(PCRE2_wrapper)
50 | 	$(AR) rcs libSFPCRE2.a @LIBPCRE2@ $(PCRE2_wrapper)
51 | 
52 | # clean produces no file; declare it phony so a file named "clean" cannot mask it.
53 | .PHONY: clean
54 | clean:
55 | 	rm -f $(SHLIB) $(OBJECTS) @LIBPCRE2@ $(PCRE2_wrapper) libSFPCRE2.a
56 | 


--------------------------------------------------------------------------------
/src/Makevars.win:
--------------------------------------------------------------------------------
 1 | # Windows build settings for the package's compiled code. PCRE2_BUNDLED is
 2 | # always defined here and the PCRE2 objects listed below are always archived.
 3 | 
 4 | PKG_CPPFLAGS = -DRCPP_USE_UNWIND_PROTECT -DRCPP_NO_RTTI -DPCRE2_CODE_UNIT_WIDTH=8 -DHAVE_CONFIG_H -DPCRE2_BUNDLED -I. -IPCRE2
 5 | PKG_CXXFLAGS = -DRCPP_PARALLEL_USE_TBB=1 $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "RcppParallel::CxxFlags()")
 6 | PKG_LIBS     = -lpthread -L. -lSFPCRE2 $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "RcppParallel::RcppParallelLibs()")
 7 | 
 8 | # Object files built from the PCRE2 sources shipped in src/PCRE2.
 9 | LIBPCRE2 = PCRE2/pcre2_chartables.o \
10 | 	PCRE2/pcre2_auto_possess.o \
11 | 	PCRE2/pcre2_compile.o \
12 | 	PCRE2/pcre2_config.o \
13 | 	PCRE2/pcre2_context.o \
14 | 	PCRE2/pcre2_convert.o \
15 | 	PCRE2/pcre2_dfa_match.o \
16 | 	PCRE2/pcre2_error.o \
17 | 	PCRE2/pcre2_extuni.o \
18 | 	PCRE2/pcre2_find_bracket.o \
19 | 	PCRE2/pcre2_jit_compile.o \
20 | 	PCRE2/pcre2_maketables.o \
21 | 	PCRE2/pcre2_match.o \
22 | 	PCRE2/pcre2_match_data.o \
23 | 	PCRE2/pcre2_newline.o \
24 | 	PCRE2/pcre2_ord2utf.o \
25 | 	PCRE2/pcre2_pattern_info.o \
26 | 	PCRE2/pcre2_script_run.o \
27 | 	PCRE2/pcre2_serialize.o \
28 | 	PCRE2/pcre2_string_utils.o \
29 | 	PCRE2/pcre2_study.o \
30 | 	PCRE2/pcre2_substitute.o \
31 | 	PCRE2/pcre2_substring.o \
32 | 	PCRE2/pcre2_tables.o \
33 | 	PCRE2/pcre2_ucd.o \
34 | 	PCRE2/pcre2_valid_utf.o \
35 | 	PCRE2/pcre2_xclass.o \
36 | 	PCRE2/pcre2_is_bundled.o
37 | 
38 | # Object file of the PCRE2 wrapper.
39 | PCRE2_wrapper = PCRE2_wrapper/pcre2_wrapper.o
40 | 
41 | # PKG_LIBS links against -lSFPCRE2, so the archive must be built first.
42 | $(SHLIB): libSFPCRE2.a
43 | 
44 | # Bundle the PCRE2 objects and the wrapper into one static archive.
45 | libSFPCRE2.a: $(LIBPCRE2) $(PCRE2_wrapper)
46 | 	$(AR) rcs libSFPCRE2.a $(LIBPCRE2) $(PCRE2_wrapper)
47 | 
48 | # clean produces no file; declare it phony so a file named "clean" cannot mask it.
49 | .PHONY: clean
50 | clean:
51 | 	rm -f $(SHLIB) $(OBJECTS) $(LIBPCRE2) $(PCRE2_wrapper) libSFPCRE2.a
52 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_chartables.c:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* This file was automatically written by the pcre2_dftables auxiliary
  6 | program. It contains character tables that are used when no external
  7 | tables are passed to PCRE2 by the application that calls it. The tables
  8 | are used only for characters whose code values are less than 256. */
  9 | 
 10 | /* This set of tables was written in the C locale. */
 11 | 
 12 | /* The pcre2_dftables program (which is distributed with PCRE2) can be used
 13 | to build alternative versions of this file. This is necessary if you are
 14 | running in an EBCDIC environment, or if you want to default to a different
 15 | encoding, for example ISO-8859-1. When pcre2_dftables is run, it creates
 16 | these tables in the "C" locale by default. This happens automatically if
 17 | PCRE2 is configured with --enable-rebuild-chartables. However, you can run
 18 | pcre2_dftables manually with the -L option to build tables using the LC_ALL
 19 | locale. */
 20 | 
 21 | /* The following #include is present because without it gcc 4.x may remove
 22 | the array definition from the final binary if PCRE2 is built into a static
 23 | library and dead code stripping is activated. This leads to link errors.
 24 | Pulling in the header ensures that the array gets flagged as "someone
 25 | outside this compilation unit might reference this" and so it will always
 26 | be supplied to the linker. */
 27 | 
 28 | #ifdef HAVE_CONFIG_H
 29 | #include "config.h"
 30 | #endif
 31 | 
 32 | #include "pcre2_internal.h"
 33 | 
 34 | const uint8_t PRIV(default_tables)[] = {
 35 | 
 36 | /* This table is a lower casing table. */
 37 | 
 38 |     0,  1,  2,  3,  4,  5,  6,  7,
 39 |     8,  9, 10, 11, 12, 13, 14, 15,
 40 |    16, 17, 18, 19, 20, 21, 22, 23,
 41 |    24, 25, 26, 27, 28, 29, 30, 31,
 42 |    32, 33, 34, 35, 36, 37, 38, 39,
 43 |    40, 41, 42, 43, 44, 45, 46, 47,
 44 |    48, 49, 50, 51, 52, 53, 54, 55,
 45 |    56, 57, 58, 59, 60, 61, 62, 63,
 46 |    64, 97, 98, 99,100,101,102,103,
 47 |   104,105,106,107,108,109,110,111,
 48 |   112,113,114,115,116,117,118,119,
 49 |   120,121,122, 91, 92, 93, 94, 95,
 50 |    96, 97, 98, 99,100,101,102,103,
 51 |   104,105,106,107,108,109,110,111,
 52 |   112,113,114,115,116,117,118,119,
 53 |   120,121,122,123,124,125,126,127,
 54 |   128,129,130,131,132,133,134,135,
 55 |   136,137,138,139,140,141,142,143,
 56 |   144,145,146,147,148,149,150,151,
 57 |   152,153,154,155,156,157,158,159,
 58 |   160,161,162,163,164,165,166,167,
 59 |   168,169,170,171,172,173,174,175,
 60 |   176,177,178,179,180,181,182,183,
 61 |   184,185,186,187,188,189,190,191,
 62 |   192,193,194,195,196,197,198,199,
 63 |   200,201,202,203,204,205,206,207,
 64 |   208,209,210,211,212,213,214,215,
 65 |   216,217,218,219,220,221,222,223,
 66 |   224,225,226,227,228,229,230,231,
 67 |   232,233,234,235,236,237,238,239,
 68 |   240,241,242,243,244,245,246,247,
 69 |   248,249,250,251,252,253,254,255,
 70 | 
 71 | /* This table is a case flipping table. */
 72 | 
 73 |     0,  1,  2,  3,  4,  5,  6,  7,
 74 |     8,  9, 10, 11, 12, 13, 14, 15,
 75 |    16, 17, 18, 19, 20, 21, 22, 23,
 76 |    24, 25, 26, 27, 28, 29, 30, 31,
 77 |    32, 33, 34, 35, 36, 37, 38, 39,
 78 |    40, 41, 42, 43, 44, 45, 46, 47,
 79 |    48, 49, 50, 51, 52, 53, 54, 55,
 80 |    56, 57, 58, 59, 60, 61, 62, 63,
 81 |    64, 97, 98, 99,100,101,102,103,
 82 |   104,105,106,107,108,109,110,111,
 83 |   112,113,114,115,116,117,118,119,
 84 |   120,121,122, 91, 92, 93, 94, 95,
 85 |    96, 65, 66, 67, 68, 69, 70, 71,
 86 |    72, 73, 74, 75, 76, 77, 78, 79,
 87 |    80, 81, 82, 83, 84, 85, 86, 87,
 88 |    88, 89, 90,123,124,125,126,127,
 89 |   128,129,130,131,132,133,134,135,
 90 |   136,137,138,139,140,141,142,143,
 91 |   144,145,146,147,148,149,150,151,
 92 |   152,153,154,155,156,157,158,159,
 93 |   160,161,162,163,164,165,166,167,
 94 |   168,169,170,171,172,173,174,175,
 95 |   176,177,178,179,180,181,182,183,
 96 |   184,185,186,187,188,189,190,191,
 97 |   192,193,194,195,196,197,198,199,
 98 |   200,201,202,203,204,205,206,207,
 99 |   208,209,210,211,212,213,214,215,
100 |   216,217,218,219,220,221,222,223,
101 |   224,225,226,227,228,229,230,231,
102 |   232,233,234,235,236,237,238,239,
103 |   240,241,242,243,244,245,246,247,
104 |   248,249,250,251,252,253,254,255,
105 | 
106 | /* This table contains bit maps for various character classes. Each map is 32
107 | bytes long and the bits run from the least significant end of each byte. The
108 | classes that have their own maps are: space, xdigit, digit, upper, lower, word,
109 | graph, print, punct, and cntrl. Other classes are built from combinations. */
110 | 
111 |   0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,  /* space */
112 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
113 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
114 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
115 | 
116 |   0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,  /* xdigit */
117 |   0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
118 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
119 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
120 | 
121 |   0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,  /* digit */
122 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
123 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
124 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
125 | 
126 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* upper */
127 |   0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00,
128 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
129 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
130 | 
131 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* lower */
132 |   0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07,
133 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
134 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
135 | 
136 |   0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,  /* word */
137 |   0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
138 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
139 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
140 | 
141 |   0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff,  /* graph */
142 |   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
143 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
144 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
145 | 
146 |   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,  /* print */
147 |   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
148 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
149 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
150 | 
151 |   0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc,  /* punct */
152 |   0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78,
153 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
154 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
155 | 
156 |   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,  /* cntrl */
157 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,
158 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
159 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
160 | 
161 | /* This table identifies various classes of character by individual bits:
162 |   0x01   white space character
163 |   0x02   letter
164 |   0x04   lower case letter
165 |   0x08   decimal digit
166 |   0x10   alphanumeric or '_'
167 | */
168 | 
169 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
170 |   0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /*   8- 15 */
171 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
172 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
173 |   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
174 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
175 |   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, /*  0 - 7  */
176 |   0x18,0x18,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
177 |   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  @ - G  */
178 |   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  H - O  */
179 |   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  P - W  */
180 |   0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x10, /*  X - _  */
181 |   0x00,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /*  ` - g  */
182 |   0x16,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /*  h - o  */
183 |   0x16,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /*  p - w  */
184 |   0x16,0x16,0x16,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
185 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
186 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
187 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
188 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
189 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
190 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
191 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
192 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
193 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
194 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
195 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
196 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
197 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
198 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
199 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
200 |   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
201 | 
202 | /* End of pcre2_chartables.c */
203 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_config.c:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language.
  7 | 
  8 |                        Written by Philip Hazel
  9 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 10 |           New API code Copyright (c) 2016-2020 University of Cambridge
 11 | 
 12 | -----------------------------------------------------------------------------
 13 | Redistribution and use in source and binary forms, with or without
 14 | modification, are permitted provided that the following conditions are met:
 15 | 
 16 |     * Redistributions of source code must retain the above copyright notice,
 17 |       this list of conditions and the following disclaimer.
 18 | 
 19 |     * Redistributions in binary form must reproduce the above copyright
 20 |       notice, this list of conditions and the following disclaimer in the
 21 |       documentation and/or other materials provided with the distribution.
 22 | 
 23 |     * Neither the name of the University of Cambridge nor the names of its
 24 |       contributors may be used to endorse or promote products derived from
 25 |       this software without specific prior written permission.
 26 | 
 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 37 | POSSIBILITY OF SUCH DAMAGE.
 38 | -----------------------------------------------------------------------------
 39 | */
 40 | 
 41 | #ifdef HAVE_CONFIG_H
 42 | #include "config.h"
 43 | #endif
 44 | 
 45 | /* Save the configured link size, which is in bytes. In 16-bit and 32-bit modes
 46 | its value gets changed by pcre2_intmodedep.h (included by pcre2_internal.h) to
 47 | be in code units. */
 48 | 
 49 | static int configured_link_size = LINK_SIZE;
 50 | 
 51 | #include "pcre2_internal.h"
 52 | 
 53 | /* These macros are the standard way of turning unquoted text into C strings.
 54 | They allow macros like PCRE2_MAJOR to be defined without quotes, which is
 55 | convenient for user programs that want to test their values. */
 56 | 
 57 | #define STRING(a)  # a
 58 | #define XSTRING(s) STRING(s)
 59 | 
 60 | 
 61 | /*************************************************
 62 | * Return info about what features are configured *
 63 | *************************************************/
 64 | 
 65 | /* If where is NULL, the length of memory required is returned.
 66 | 
 67 | Arguments:
 68 |   what             what information is required
 69 |   where            where to put the information
 70 | 
 71 | Returns:           0 if a numerical value is returned
 72 |                    >= 0 if a string value
 73 |                    PCRE2_ERROR_BADOPTION if "where" not recognized
 74 |                      or JIT target requested when JIT not enabled
 75 | */
 76 | 
 77 | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
 78 | bundled_pcre2_config(uint32_t what, void *where)
 79 | {
 80 | if (where == NULL)  /* Requests a length */
 81 |   {
 82 |   switch(what)
 83 |     {
 84 |     default:
 85 |     return PCRE2_ERROR_BADOPTION;
 86 | 
 87 |     case PCRE2_CONFIG_BSR:
 88 |     case PCRE2_CONFIG_COMPILED_WIDTHS:
 89 |     case PCRE2_CONFIG_DEPTHLIMIT:
 90 |     case PCRE2_CONFIG_HEAPLIMIT:
 91 |     case PCRE2_CONFIG_JIT:
 92 |     case PCRE2_CONFIG_LINKSIZE:
 93 |     case PCRE2_CONFIG_MATCHLIMIT:
 94 |     case PCRE2_CONFIG_NEVER_BACKSLASH_C:
 95 |     case PCRE2_CONFIG_NEWLINE:
 96 |     case PCRE2_CONFIG_PARENSLIMIT:
 97 |     case PCRE2_CONFIG_STACKRECURSE:    /* Obsolete */
 98 |     case PCRE2_CONFIG_TABLES_LENGTH:
 99 |     case PCRE2_CONFIG_UNICODE:
100 |     return sizeof(uint32_t);
101 | 
102 |     /* These are handled below */
103 | 
104 |     case PCRE2_CONFIG_JITTARGET:
105 |     case PCRE2_CONFIG_UNICODE_VERSION:
106 |     case PCRE2_CONFIG_VERSION:
107 |     break;
108 |     }
109 |   }
110 | 
111 | switch (what)
112 |   {
113 |   default:
114 |   return PCRE2_ERROR_BADOPTION;
115 | 
116 |   case PCRE2_CONFIG_BSR:
117 | #ifdef BSR_ANYCRLF
118 |   *((uint32_t *)where) = PCRE2_BSR_ANYCRLF;
119 | #else
120 |   *((uint32_t *)where) = PCRE2_BSR_UNICODE;
121 | #endif
122 |   break;
123 | 
124 |   case PCRE2_CONFIG_COMPILED_WIDTHS:
125 |   *((uint32_t *)where) = 0
126 | #ifdef SUPPORT_PCRE2_8
127 |   + 1
128 | #endif
129 | #ifdef SUPPORT_PCRE2_16
130 |   + 2
131 | #endif
132 | #ifdef SUPPORT_PCRE2_32
133 |   + 4
134 | #endif
135 |   ;
136 |   break;
137 | 
138 |   case PCRE2_CONFIG_DEPTHLIMIT:
139 |   *((uint32_t *)where) = MATCH_LIMIT_DEPTH;
140 |   break;
141 | 
142 |   case PCRE2_CONFIG_HEAPLIMIT:
143 |   *((uint32_t *)where) = HEAP_LIMIT;
144 |   break;
145 | 
146 |   case PCRE2_CONFIG_JIT:
147 | #ifdef SUPPORT_JIT
148 |   *((uint32_t *)where) = 1;
149 | #else
150 |   *((uint32_t *)where) = 0;
151 | #endif
152 |   break;
153 | 
154 |   case PCRE2_CONFIG_JITTARGET:
155 | #ifdef SUPPORT_JIT
156 |     {
157 |     const char *v = PRIV(jit_get_target)();
158 |     return (int)(1 + ((where == NULL)?
159 |       strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v)));
160 |     }
161 | #else
162 |   return PCRE2_ERROR_BADOPTION;
163 | #endif
164 | 
165 |   case PCRE2_CONFIG_LINKSIZE:
166 |   *((uint32_t *)where) = (uint32_t)configured_link_size;
167 |   break;
168 | 
169 |   case PCRE2_CONFIG_MATCHLIMIT:
170 |   *((uint32_t *)where) = MATCH_LIMIT;
171 |   break;
172 | 
173 |   case PCRE2_CONFIG_NEWLINE:
174 |   *((uint32_t *)where) = NEWLINE_DEFAULT;
175 |   break;
176 | 
177 |   case PCRE2_CONFIG_NEVER_BACKSLASH_C:
178 | #ifdef NEVER_BACKSLASH_C
179 |   *((uint32_t *)where) = 1;
180 | #else
181 |   *((uint32_t *)where) = 0;
182 | #endif
183 |   break;
184 | 
185 |   case PCRE2_CONFIG_PARENSLIMIT:
186 |   *((uint32_t *)where) = PARENS_NEST_LIMIT;
187 |   break;
188 | 
189 |   /* This is now obsolete. The stack is no longer used via recursion for
190 |   handling backtracking in bundled_pcre2_match(). */
191 | 
192 |   case PCRE2_CONFIG_STACKRECURSE:
193 |   *((uint32_t *)where) = 0;
194 |   break;
195 | 
196 |   case PCRE2_CONFIG_TABLES_LENGTH:
197 |   *((uint32_t *)where) = TABLES_LENGTH;
198 |   break;
199 | 
200 |   case PCRE2_CONFIG_UNICODE_VERSION:
201 |     {
202 | #if defined SUPPORT_UNICODE
203 |     const char *v = PRIV(unicode_version);
204 | #else
205 |     const char *v = "Unicode not supported";
206 | #endif
207 |     return (int)(1 + ((where == NULL)?
208 |       strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v)));
209 |    }
210 |   break;
211 | 
212 |   case PCRE2_CONFIG_UNICODE:
213 | #if defined SUPPORT_UNICODE
214 |   *((uint32_t *)where) = 1;
215 | #else
216 |   *((uint32_t *)where) = 0;
217 | #endif
218 |   break;
219 | 
220 |   /* The hackery in setting "v" below is to cope with the case when
221 |   PCRE2_PRERELEASE is set to an empty string (which it is for real releases).
222 |   If the second alternative is used in this case, it does not leave a space
223 |   before the date. On the other hand, if all four macros are put into a single
224 |   XSTRING when PCRE2_PRERELEASE is not empty, an unwanted space is inserted.
225 |   There are problems using an "obvious" approach like this:
226 | 
227 |      XSTRING(PCRE2_MAJOR) "." XSTRING(PCRE_MINOR)
228 |      XSTRING(PCRE2_PRERELEASE) " " XSTRING(PCRE_DATE)
229 | 
230 |   because, when PCRE2_PRERELEASE is empty, this leads to an attempted expansion
231 |   of STRING(). The C standard states: "If (before argument substitution) any
232 |   argument consists of no preprocessing tokens, the behavior is undefined." It
233 |   turns out the gcc treats this case as a single empty string - which is what
234 |   we really want - but Visual C grumbles about the lack of an argument for the
235 |   macro. Unfortunately, both are within their rights. As there seems to be no
236 |   way to test for a macro's value being empty at compile time, we have to
237 |   resort to a runtime test. */
238 | 
239 |   case PCRE2_CONFIG_VERSION:
240 |     {
241 |     const char *v = (XSTRING(Z PCRE2_PRERELEASE)[1] == 0)?
242 |       XSTRING(PCRE2_MAJOR.PCRE2_MINOR PCRE2_DATE) :
243 |       XSTRING(PCRE2_MAJOR.PCRE2_MINOR) XSTRING(PCRE2_PRERELEASE PCRE2_DATE);
244 |     return (int)(1 + ((where == NULL)?
245 |       strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v)));
246 |     }
247 |   }
248 | 
249 | return 0;
250 | }
251 | 
252 | /* End of pcre2_config.c */
253 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_extuni.c:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language.
  7 | 
  8 |                        Written by Philip Hazel
  9 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 10 |           New API code Copyright (c) 2016-2019 University of Cambridge
 11 | 
 12 | -----------------------------------------------------------------------------
 13 | Redistribution and use in source and binary forms, with or without
 14 | modification, are permitted provided that the following conditions are met:
 15 | 
 16 |     * Redistributions of source code must retain the above copyright notice,
 17 |       this list of conditions and the following disclaimer.
 18 | 
 19 |     * Redistributions in binary form must reproduce the above copyright
 20 |       notice, this list of conditions and the following disclaimer in the
 21 |       documentation and/or other materials provided with the distribution.
 22 | 
 23 |     * Neither the name of the University of Cambridge nor the names of its
 24 |       contributors may be used to endorse or promote products derived from
 25 |       this software without specific prior written permission.
 26 | 
 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 37 | POSSIBILITY OF SUCH DAMAGE.
 38 | -----------------------------------------------------------------------------
 39 | */
 40 | 
 41 | /* This module contains an internal function that is used to match a Unicode
 42 | extended grapheme sequence. It is used by both bundled_pcre2_match() and
 43 | pcre2_dfa_match(). However, it is called only when Unicode support is being
 44 | compiled. Nevertheless, we provide a dummy function when there is no Unicode
 45 | support, because some compilers do not like functionless source files. */
 46 | 
 47 | 
 48 | #ifdef HAVE_CONFIG_H
 49 | #include "config.h"
 50 | #endif
 51 | 
 52 | 
 53 | #include "pcre2_internal.h"
 54 | 
 55 | 
 56 | /* Dummy function */
 57 | 
 58 | #ifndef SUPPORT_UNICODE
 59 | PCRE2_SPTR
 60 | PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject,
 61 |   PCRE2_SPTR end_subject, BOOL utf, int *xcount)
 62 | {
 63 | (void)c;
 64 | (void)eptr;
 65 | (void)start_subject;
 66 | (void)end_subject;
 67 | (void)utf;
 68 | (void)xcount;
 69 | return NULL;
 70 | }
 71 | #else
 72 | 
 73 | 
 74 | /*************************************************
 75 | *      Match an extended grapheme sequence       *
 76 | *************************************************/
 77 | 
 78 | /*
 79 | Arguments:
 80 |   c              the first character
 81 |   eptr           pointer to next character
 82 |   start_subject  pointer to start of subject
 83 |   end_subject    pointer to end of subject
 84 |   utf            TRUE if in UTF mode
 85 |   xcount         pointer to count of additional characters,
 86 |                    or NULL if count not needed
 87 | 
 88 | Returns:         pointer after the end of the sequence
 89 | */
 90 | 
 91 | PCRE2_SPTR
 92 | PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject,
 93 |   PCRE2_SPTR end_subject, BOOL utf, int *xcount)
 94 | {
 95 | int lgb = UCD_GRAPHBREAK(c);
 96 | 
 97 | while (eptr < end_subject)
 98 |   {
 99 |   int rgb;
100 |   int len = 1;
101 |   if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
102 |   rgb = UCD_GRAPHBREAK(c);
103 |   if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
104 | 
105 |   /* Not breaking between Regional Indicators is allowed only if there
106 |   are an even number of preceding RIs. */
107 | 
108 |   if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator)
109 |     {
110 |     int ricount = 0;
111 |     PCRE2_SPTR bptr = eptr - 1;
112 |     if (utf) BACKCHAR(bptr);
113 | 
114 |     /* bptr is pointing to the left-hand character */
115 | 
116 |     while (bptr > start_subject)
117 |       {
118 |       bptr--;
119 |       if (utf)
120 |         {
121 |         BACKCHAR(bptr);
122 |         GETCHAR(c, bptr);
123 |         }
124 |       else
125 |       c = *bptr;
126 |       if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) break;
127 |       ricount++;
128 |       }
129 |     if ((ricount & 1) != 0) break;  /* Grapheme break required */
130 |     }
131 | 
132 |   /* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this
133 |   allows any number of them before a following Extended_Pictographic. */
134 | 
135 |   if ((rgb != ucp_gbExtend && rgb != ucp_gbZWJ) ||
136 |        lgb != ucp_gbExtended_Pictographic)
137 |     lgb = rgb;
138 | 
139 |   eptr += len;
140 |   if (xcount != NULL) *xcount += 1;
141 |   }
142 | 
143 | return eptr;
144 | }
145 | 
146 | #endif  /* SUPPORT_UNICODE */
147 | 
148 | /* End of pcre2_extuni.c */
149 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_find_bracket.c:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language.
  7 | 
  8 |                        Written by Philip Hazel
  9 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 10 |           New API code Copyright (c) 2016-2018 University of Cambridge
 11 | 
 12 | -----------------------------------------------------------------------------
 13 | Redistribution and use in source and binary forms, with or without
 14 | modification, are permitted provided that the following conditions are met:
 15 | 
 16 |     * Redistributions of source code must retain the above copyright notice,
 17 |       this list of conditions and the following disclaimer.
 18 | 
 19 |     * Redistributions in binary form must reproduce the above copyright
 20 |       notice, this list of conditions and the following disclaimer in the
 21 |       documentation and/or other materials provided with the distribution.
 22 | 
 23 |     * Neither the name of the University of Cambridge nor the names of its
 24 |       contributors may be used to endorse or promote products derived from
 25 |       this software without specific prior written permission.
 26 | 
 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 37 | POSSIBILITY OF SUCH DAMAGE.
 38 | -----------------------------------------------------------------------------
 39 | */
 40 | 
 41 | 
 42 | /* This module contains a single function that scans through a compiled pattern
 43 | until it finds a capturing bracket with the given number, or, if the number is
 44 | negative, an instance of OP_REVERSE for a lookbehind. The function is called
 45 | from pcre2_compile.c and also from pcre2_study.c when finding the minimum
 46 | matching length. */
 47 | 
 48 | 
 49 | #ifdef HAVE_CONFIG_H
 50 | #include "config.h"
 51 | #endif
 52 | 
 53 | #include "pcre2_internal.h"
 54 | 
 55 | 
 56 | /*************************************************
 57 | *    Scan compiled regex for specific bracket    *
 58 | *************************************************/
 59 | 
 60 | /*
 61 | Arguments:
 62 |   code        points to start of expression
 63 |   utf         TRUE in UTF mode
 64 |   number      the required bracket number or negative to find a lookbehind
 65 | 
 66 | Returns:      pointer to the opcode for the bracket, or NULL if not found
 67 | */
 68 | 
 69 | PCRE2_SPTR
 70 | PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number)
 71 | {
 72 | for (;;)
 73 |   {
 74 |   PCRE2_UCHAR c = *code;
 75 | 
 76 |   if (c == OP_END) return NULL;
 77 | 
 78 |   /* XCLASS is used for classes that cannot be represented just by a bit map.
 79 |   This includes negated single high-valued characters. CALLOUT_STR is used for
 80 |   callouts with string arguments. In both cases the length in the table is
 81 |   zero; the actual length is stored in the compiled code. */
 82 | 
 83 |   if (c == OP_XCLASS) code += GET(code, 1);
 84 |     else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
 85 | 
 86 |   /* Handle lookbehind */
 87 | 
 88 |   else if (c == OP_REVERSE)
 89 |     {
 90 |     if (number < 0) return (PCRE2_UCHAR *)code;
 91 |     code += PRIV(OP_lengths)[c];
 92 |     }
 93 | 
 94 |   /* Handle capturing bracket */
 95 | 
 96 |   else if (c == OP_CBRA || c == OP_SCBRA ||
 97 |            c == OP_CBRAPOS || c == OP_SCBRAPOS)
 98 |     {
 99 |     int n = (int)GET2(code, 1+LINK_SIZE);
100 |     if (n == number) return (PCRE2_UCHAR *)code;
101 |     code += PRIV(OP_lengths)[c];
102 |     }
103 | 
104 |   /* Otherwise, we can get the item's length from the table, except that for
105 |   repeated character types, we have to test for \p and \P, which have an extra
106 |   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
107 |   must add in its length. */
108 | 
109 |   else
110 |     {
111 |     switch(c)
112 |       {
113 |       case OP_TYPESTAR:
114 |       case OP_TYPEMINSTAR:
115 |       case OP_TYPEPLUS:
116 |       case OP_TYPEMINPLUS:
117 |       case OP_TYPEQUERY:
118 |       case OP_TYPEMINQUERY:
119 |       case OP_TYPEPOSSTAR:
120 |       case OP_TYPEPOSPLUS:
121 |       case OP_TYPEPOSQUERY:
122 |       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
123 |       break;
124 | 
125 |       case OP_TYPEUPTO:
126 |       case OP_TYPEMINUPTO:
127 |       case OP_TYPEEXACT:
128 |       case OP_TYPEPOSUPTO:
129 |       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
130 |         code += 2;
131 |       break;
132 | 
133 |       case OP_MARK:
134 |       case OP_COMMIT_ARG:
135 |       case OP_PRUNE_ARG:
136 |       case OP_SKIP_ARG:
137 |       case OP_THEN_ARG:
138 |       code += code[1];
139 |       break;
140 |       }
141 | 
142 |     /* Add in the fixed length from the table */
143 | 
144 |     code += PRIV(OP_lengths)[c];
145 | 
146 |   /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
147 |   followed by a multi-byte character. The length in the table is a minimum, so
148 |   we have to arrange to skip the extra bytes. */
149 | 
150 | #ifdef MAYBE_UTF_MULTI
151 |     if (utf) switch(c)
152 |       {
153 |       case OP_CHAR:
154 |       case OP_CHARI:
155 |       case OP_NOT:
156 |       case OP_NOTI:
157 |       case OP_EXACT:
158 |       case OP_EXACTI:
159 |       case OP_NOTEXACT:
160 |       case OP_NOTEXACTI:
161 |       case OP_UPTO:
162 |       case OP_UPTOI:
163 |       case OP_NOTUPTO:
164 |       case OP_NOTUPTOI:
165 |       case OP_MINUPTO:
166 |       case OP_MINUPTOI:
167 |       case OP_NOTMINUPTO:
168 |       case OP_NOTMINUPTOI:
169 |       case OP_POSUPTO:
170 |       case OP_POSUPTOI:
171 |       case OP_NOTPOSUPTO:
172 |       case OP_NOTPOSUPTOI:
173 |       case OP_STAR:
174 |       case OP_STARI:
175 |       case OP_NOTSTAR:
176 |       case OP_NOTSTARI:
177 |       case OP_MINSTAR:
178 |       case OP_MINSTARI:
179 |       case OP_NOTMINSTAR:
180 |       case OP_NOTMINSTARI:
181 |       case OP_POSSTAR:
182 |       case OP_POSSTARI:
183 |       case OP_NOTPOSSTAR:
184 |       case OP_NOTPOSSTARI:
185 |       case OP_PLUS:
186 |       case OP_PLUSI:
187 |       case OP_NOTPLUS:
188 |       case OP_NOTPLUSI:
189 |       case OP_MINPLUS:
190 |       case OP_MINPLUSI:
191 |       case OP_NOTMINPLUS:
192 |       case OP_NOTMINPLUSI:
193 |       case OP_POSPLUS:
194 |       case OP_POSPLUSI:
195 |       case OP_NOTPOSPLUS:
196 |       case OP_NOTPOSPLUSI:
197 |       case OP_QUERY:
198 |       case OP_QUERYI:
199 |       case OP_NOTQUERY:
200 |       case OP_NOTQUERYI:
201 |       case OP_MINQUERY:
202 |       case OP_MINQUERYI:
203 |       case OP_NOTMINQUERY:
204 |       case OP_NOTMINQUERYI:
205 |       case OP_POSQUERY:
206 |       case OP_POSQUERYI:
207 |       case OP_NOTPOSQUERY:
208 |       case OP_NOTPOSQUERYI:
209 |       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
210 |       break;
211 |       }
212 | #else
213 |     (void)(utf);  /* Keep compiler happy by referencing function argument */
214 | #endif  /* MAYBE_UTF_MULTI */
215 |     }
216 |   }
217 | }
218 | 
219 | /* End of pcre2_find_bracket.c */
220 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_is_bundled.c:
--------------------------------------------------------------------------------
1 | #include "pcre2.h"
2 | 
/* Always returns 1. Judging by the name, this acts as a marker that the PCRE2
copy shipped in this directory was compiled in - confirm against the build
scripts that reference it. */
int pcre2_is_bundled(void)
{
  const int bundled = 1;
  return bundled;
}
6 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_jit_match.c:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language.
  7 | 
  8 |                        Written by Philip Hazel
  9 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 10 |           New API code Copyright (c) 2016-2018 University of Cambridge
 11 | 
 12 | -----------------------------------------------------------------------------
 13 | Redistribution and use in source and binary forms, with or without
 14 | modification, are permitted provided that the following conditions are met:
 15 | 
 16 |     * Redistributions of source code must retain the above copyright notice,
 17 |       this list of conditions and the following disclaimer.
 18 | 
 19 |     * Redistributions in binary form must reproduce the above copyright
 20 |       notice, this list of conditions and the following disclaimer in the
 21 |       documentation and/or other materials provided with the distribution.
 22 | 
 23 |     * Neither the name of the University of Cambridge nor the names of its
 24 |       contributors may be used to endorse or promote products derived from
 25 |       this software without specific prior written permission.
 26 | 
 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 37 | POSSIBILITY OF SUCH DAMAGE.
 38 | -----------------------------------------------------------------------------
 39 | */
 40 | 
 41 | #ifndef INCLUDED_FROM_PCRE2_JIT_COMPILE
 42 | #error This file must be included from pcre2_jit_compile.c.
 43 | #endif
 44 | 
 45 | #ifdef SUPPORT_JIT
 46 | 
 47 | static SLJIT_NOINLINE int jit_machine_stack_exec(jit_arguments *arguments, jit_function executable_func)
 48 | {
 49 | sljit_u8 local_space[MACHINE_STACK_SIZE];
 50 | struct sljit_stack local_stack;
 51 | 
 52 | local_stack.min_start = local_space;
 53 | local_stack.start = local_space;
 54 | local_stack.end = local_space + MACHINE_STACK_SIZE;
 55 | local_stack.top = local_space + MACHINE_STACK_SIZE;
 56 | arguments->stack = &local_stack;
 57 | return executable_func(arguments);
 58 | }
 59 | 
 60 | #endif
 61 | 
 62 | 
 63 | /*************************************************
 64 | *              Do a JIT pattern match            *
 65 | *************************************************/
 66 | 
 67 | /* This function runs a JIT pattern match.
 68 | 
 69 | Arguments:
 70 |   code            points to the compiled expression
 71 |   subject         points to the subject string
 72 |   length          length of subject string (may contain binary zeros)
 73 |   start_offset    where to start in the subject string
 74 |   options         option bits
 75 |   match_data      points to a match_data block
 76 |   mcontext        points to a match context
 77 | 
 78 | Returns:          > 0 => success; value is the number of ovector pairs filled
 79 |                   = 0 => success, but ovector is not big enough
 80 |                    -1 => failed to match (PCRE_ERROR_NOMATCH)
 81 |                  < -1 => some kind of unexpected problem
 82 | */
 83 | 
 84 | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
 85 | bundled_pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
 86 |   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
 87 |   pcre2_match_context *mcontext)
 88 | {
 89 | #ifndef SUPPORT_JIT
 90 | 
 91 | (void)code;
 92 | (void)subject;
 93 | (void)length;
 94 | (void)start_offset;
 95 | (void)options;
 96 | (void)match_data;
 97 | (void)mcontext;
 98 | return PCRE2_ERROR_JIT_BADOPTION;
 99 | 
100 | #else  /* SUPPORT_JIT */
101 | 
102 | pcre2_real_code *re = (pcre2_real_code *)code;
103 | executable_functions *functions = (executable_functions *)re->executable_jit;
104 | pcre2_jit_stack *jit_stack;
105 | uint32_t oveccount = match_data->oveccount;
106 | uint32_t max_oveccount;
107 | union {
108 |    void *executable_func;
109 |    jit_function call_executable_func;
110 | } convert_executable_func;
111 | jit_arguments arguments;
112 | int rc;
113 | int index = 0;
114 | 
115 | if ((options & PCRE2_PARTIAL_HARD) != 0)
116 |   index = 2;
117 | else if ((options & PCRE2_PARTIAL_SOFT) != 0)
118 |   index = 1;
119 | 
120 | if (functions == NULL || functions->executable_funcs[index] == NULL)
121 |   return PCRE2_ERROR_JIT_BADOPTION;
122 | 
123 | /* Sanity checks should be handled by pcre_exec. */
124 | arguments.str = subject + start_offset;
125 | arguments.begin = subject;
126 | arguments.end = subject + length;
127 | arguments.match_data = match_data;
128 | arguments.startchar_ptr = subject;
129 | arguments.mark_ptr = NULL;
130 | arguments.options = options;
131 | 
132 | if (mcontext != NULL)
133 |   {
134 |   arguments.callout = mcontext->callout;
135 |   arguments.callout_data = mcontext->callout_data;
136 |   arguments.offset_limit = mcontext->offset_limit;
137 |   arguments.limit_match = (mcontext->match_limit < re->limit_match)?
138 |     mcontext->match_limit : re->limit_match;
139 |   if (mcontext->jit_callback != NULL)
140 |     jit_stack = mcontext->jit_callback(mcontext->jit_callback_data);
141 |   else
142 |     jit_stack = (pcre2_jit_stack *)mcontext->jit_callback_data;
143 |   }
144 | else
145 |   {
146 |   arguments.callout = NULL;
147 |   arguments.callout_data = NULL;
148 |   arguments.offset_limit = PCRE2_UNSET;
149 |   arguments.limit_match = (MATCH_LIMIT < re->limit_match)?
150 |     MATCH_LIMIT : re->limit_match;
151 |   jit_stack = NULL;
152 |   }
153 | 
154 | 
155 | max_oveccount = functions->top_bracket;
156 | if (oveccount > max_oveccount)
157 |   oveccount = max_oveccount;
158 | arguments.oveccount = oveccount << 1;
159 | 
160 | 
161 | convert_executable_func.executable_func = functions->executable_funcs[index];
162 | if (jit_stack != NULL)
163 |   {
164 |   arguments.stack = (struct sljit_stack *)(jit_stack->stack);
165 |   rc = convert_executable_func.call_executable_func(&arguments);
166 |   }
167 | else
168 |   rc = jit_machine_stack_exec(&arguments, convert_executable_func.call_executable_func);
169 | 
170 | if (rc > (int)oveccount)
171 |   rc = 0;
172 | match_data->code = re;
173 | match_data->subject = (rc >= 0 || rc == PCRE2_ERROR_PARTIAL)? subject : NULL;
174 | match_data->rc = rc;
175 | match_data->startchar = arguments.startchar_ptr - subject;
176 | match_data->leftchar = 0;
177 | match_data->rightchar = 0;
178 | match_data->mark = arguments.mark_ptr;
179 | match_data->matchedby = PCRE2_MATCHEDBY_JIT;
180 | 
181 | return match_data->rc;
182 | 
183 | #endif  /* SUPPORT_JIT */
184 | }
185 | 
186 | /* End of pcre2_jit_match.c */
187 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_jit_misc.c:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language.
  7 | 
  8 |                        Written by Philip Hazel
  9 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 10 |          New API code Copyright (c) 2016 University of Cambridge
 11 | 
 12 | -----------------------------------------------------------------------------
 13 | Redistribution and use in source and binary forms, with or without
 14 | modification, are permitted provided that the following conditions are met:
 15 | 
 16 |     * Redistributions of source code must retain the above copyright notice,
 17 |       this list of conditions and the following disclaimer.
 18 | 
 19 |     * Redistributions in binary form must reproduce the above copyright
 20 |       notice, this list of conditions and the following disclaimer in the
 21 |       documentation and/or other materials provided with the distribution.
 22 | 
 23 |     * Neither the name of the University of Cambridge nor the names of its
 24 |       contributors may be used to endorse or promote products derived from
 25 |       this software without specific prior written permission.
 26 | 
 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 37 | POSSIBILITY OF SUCH DAMAGE.
 38 | -----------------------------------------------------------------------------
 39 | */
 40 | 
 41 | 
 42 | #ifndef INCLUDED_FROM_PCRE2_JIT_COMPILE
 43 | #error This file must be included from pcre2_jit_compile.c.
 44 | #endif
 45 | 
 46 | 
 47 | 
 48 | /*************************************************
 49 | *           Free JIT read-only data              *
 50 | *************************************************/
 51 | 
 52 | void
 53 | PRIV(jit_free_rodata)(void *current, void *allocator_data)
 54 | {
 55 | #ifndef SUPPORT_JIT
 56 | (void)current;
 57 | (void)allocator_data;
 58 | #else  /* SUPPORT_JIT */
 59 | void *next;
 60 | 
 61 | SLJIT_UNUSED_ARG(allocator_data);
 62 | 
 63 | while (current != NULL)
 64 |   {
 65 |   next = *(void**)current;
 66 |   SLJIT_FREE(current, allocator_data);
 67 |   current = next;
 68 |   }
 69 | 
 70 | #endif /* SUPPORT_JIT */
 71 | }
 72 | 
 73 | /*************************************************
 74 | *           Free JIT compiled code               *
 75 | *************************************************/
 76 | 
 77 | void
 78 | PRIV(jit_free)(void *executable_jit, pcre2_memctl *memctl)
 79 | {
 80 | #ifndef SUPPORT_JIT
 81 | (void)executable_jit;
 82 | (void)memctl;
 83 | #else  /* SUPPORT_JIT */
 84 | 
 85 | executable_functions *functions = (executable_functions *)executable_jit;
 86 | void *allocator_data = memctl;
 87 | int i;
 88 | 
 89 | for (i = 0; i < JIT_NUMBER_OF_COMPILE_MODES; i++)
 90 |   {
 91 |   if (functions->executable_funcs[i] != NULL)
 92 |     sljit_free_code(functions->executable_funcs[i]);
 93 |   PRIV(jit_free_rodata)(functions->read_only_data_heads[i], allocator_data);
 94 |   }
 95 | 
 96 | SLJIT_FREE(functions, allocator_data);
 97 | 
 98 | #endif /* SUPPORT_JIT */
 99 | }
100 | 
101 | 
102 | /*************************************************
103 | *            Free unused JIT memory              *
104 | *************************************************/
105 | 
106 | PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
107 | bundled_pcre2_jit_free_unused_memory(pcre2_general_context *gcontext)
108 | {
109 | #ifndef SUPPORT_JIT
110 | (void)gcontext;     /* Suppress warning */
111 | #else  /* SUPPORT_JIT */
112 | SLJIT_UNUSED_ARG(gcontext);
113 | sljit_free_unused_memory_exec();
114 | #endif  /* SUPPORT_JIT */
115 | }
116 | 
117 | 
118 | 
119 | /*************************************************
120 | *            Allocate a JIT stack                *
121 | *************************************************/
122 | 
123 | PCRE2_EXP_DEFN pcre2_jit_stack * PCRE2_CALL_CONVENTION
124 | bundled_pcre2_jit_stack_create(size_t startsize, size_t maxsize,
125 |   pcre2_general_context *gcontext)
126 | {
127 | #ifndef SUPPORT_JIT
128 | 
129 | (void)gcontext;
130 | (void)startsize;
131 | (void)maxsize;
132 | return NULL;
133 | 
134 | #else  /* SUPPORT_JIT */
135 | 
136 | pcre2_jit_stack *jit_stack;
137 | 
138 | if (startsize < 1 || maxsize < 1)
139 |   return NULL;
140 | if (startsize > maxsize)
141 |   startsize = maxsize;
142 | startsize = (startsize + STACK_GROWTH_RATE - 1) & ~(STACK_GROWTH_RATE - 1);
143 | maxsize = (maxsize + STACK_GROWTH_RATE - 1) & ~(STACK_GROWTH_RATE - 1);
144 | 
145 | jit_stack = PRIV(memctl_malloc)(sizeof(pcre2_real_jit_stack), (pcre2_memctl *)gcontext);
146 | if (jit_stack == NULL) return NULL;
147 | jit_stack->stack = sljit_allocate_stack(startsize, maxsize, &jit_stack->memctl);
148 | if (jit_stack->stack == NULL)
149 |   {
150 |   jit_stack->memctl.free(jit_stack, jit_stack->memctl.memory_data);
151 |   return NULL;
152 |   }
153 | return jit_stack;
154 | 
155 | #endif
156 | }
157 | 
158 | 
159 | /*************************************************
160 | *         Assign a JIT stack to a pattern        *
161 | *************************************************/
162 | 
163 | PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
164 | bundled_pcre2_jit_stack_assign(pcre2_match_context *mcontext, pcre2_jit_callback callback,
165 |   void *callback_data)
166 | {
167 | #ifndef SUPPORT_JIT
168 | (void)mcontext;
169 | (void)callback;
170 | (void)callback_data;
171 | #else  /* SUPPORT_JIT */
172 | 
173 | if (mcontext == NULL) return;
174 | mcontext->jit_callback = callback;
175 | mcontext->jit_callback_data = callback_data;
176 | 
177 | #endif  /* SUPPORT_JIT */
178 | }
179 | 
180 | 
181 | /*************************************************
182 | *               Free a JIT stack                 *
183 | *************************************************/
184 | 
185 | PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
186 | bundled_pcre2_jit_stack_free(pcre2_jit_stack *jit_stack)
187 | {
188 | #ifndef SUPPORT_JIT
189 | (void)jit_stack;
190 | #else  /* SUPPORT_JIT */
191 | if (jit_stack != NULL)
192 |   {
193 |   sljit_free_stack((struct sljit_stack *)(jit_stack->stack), &jit_stack->memctl);
194 |   jit_stack->memctl.free(jit_stack, jit_stack->memctl.memory_data);
195 |   }
196 | #endif  /* SUPPORT_JIT */
197 | }
198 | 
199 | 
200 | /*************************************************
201 | *               Get target CPU type              *
202 | *************************************************/
203 | 
204 | const char*
205 | PRIV(jit_get_target)(void)
206 | {
207 | #ifndef SUPPORT_JIT
208 | return "JIT is not supported";
209 | #else  /* SUPPORT_JIT */
210 | return sljit_get_platform_name();
211 | #endif  /* SUPPORT_JIT */
212 | }
213 | 
214 | 
215 | /*************************************************
216 | *              Get size of JIT code              *
217 | *************************************************/
218 | 
219 | size_t
220 | PRIV(jit_get_size)(void *executable_jit)
221 | {
222 | #ifndef SUPPORT_JIT
223 | (void)executable_jit;
224 | return 0;
225 | #else  /* SUPPORT_JIT */
226 | sljit_uw *executable_sizes = ((executable_functions *)executable_jit)->executable_sizes;
227 | SLJIT_COMPILE_ASSERT(JIT_NUMBER_OF_COMPILE_MODES == 3, number_of_compile_modes_changed);
228 | return executable_sizes[0] + executable_sizes[1] + executable_sizes[2];
229 | #endif
230 | }
231 | 
232 | /* End of pcre2_jit_misc.c */
233 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_jit_neon_inc.h:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language.
  7 | 
  8 |                        Written by Philip Hazel
  9 |             This module by Zoltan Herczeg and Sebastian Pop
 10 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 11 |           New API code Copyright (c) 2016-2019 University of Cambridge
 12 | 
 13 | -----------------------------------------------------------------------------
 14 | Redistribution and use in source and binary forms, with or without
 15 | modification, are permitted provided that the following conditions are met:
 16 | 
 17 |     * Redistributions of source code must retain the above copyright notice,
 18 |       this list of conditions and the following disclaimer.
 19 | 
 20 |     * Redistributions in binary form must reproduce the above copyright
 21 |       notice, this list of conditions and the following disclaimer in the
 22 |       documentation and/or other materials provided with the distribution.
 23 | 
 24 |     * Neither the name of the University of Cambridge nor the names of its
 25 |       contributors may be used to endorse or promote products derived from
 26 |       this software without specific prior written permission.
 27 | 
 28 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 29 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 30 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 31 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 32 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 33 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 35 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 36 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 37 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 38 | POSSIBILITY OF SUCH DAMAGE.
 39 | -----------------------------------------------------------------------------
 40 | */
 41 | 
/* Choose the name under which the function that follows is compiled. The name
depends on which one of FFCS, FFCS_2, FFCS_MASK, FFCPS_0, FFCPS_1 or
FFCPS_DEFAULT is defined, with a _utf suffix appended when FF_UTF is also
defined. If none of the selectors is defined, FF_FUN is left undefined.
NOTE(review): the selector macros are presumably set by the file that includes
this one, once per variant - confirm against the includer. */

# if defined(FFCS)
#  if defined(FF_UTF)
#   define FF_FUN ffcs_utf
#  else
#   define FF_FUN ffcs
#  endif

# elif defined(FFCS_2)
#  if defined(FF_UTF)
#   define FF_FUN ffcs_2_utf
#  else
#   define FF_FUN ffcs_2
#  endif

# elif defined(FFCS_MASK)
#  if defined(FF_UTF)
#   define FF_FUN ffcs_mask_utf
#  else
#   define FF_FUN ffcs_mask
#  endif

# elif defined(FFCPS_0)
#  if defined (FF_UTF)
#   define FF_FUN ffcps_0_utf
#  else
#   define FF_FUN ffcps_0
#  endif

# elif defined (FFCPS_1)
#  if defined (FF_UTF)
#   define FF_FUN ffcps_1_utf
#  else
#   define FF_FUN ffcps_1
#  endif

# elif defined (FFCPS_DEFAULT)
#  if defined (FF_UTF)
#   define FF_FUN ffcps_default_utf
#  else
#   define FF_FUN ffcps_default
#  endif
# endif
 84 | 
/* Vectorised forward search, compiled under the name selected by FF_FUN.

Starting at str_ptr, the subject is examined one 16-byte block at a time (the
pointer is first aligned down to a 16-byte boundary and then advanced by 16
per iteration) until a candidate position is found or str_end is reached.

The test applied to each block depends on the selector macro in effect:
  FFCS       lanes equal to c1
  FFCS_2     lanes equal to c1 or to c2
  FFCS_MASK  lanes for which (lane | mask) equals c1
  FFCPS      lanes matching the first character, combined (AND) with the
               lanes, taken diff bytes earlier, matching the second character

Arguments:
  str_end   the search stops once str_ptr reaches this address
  str_ptr   where the search starts
  offs1     offset of the first character of the pair (FFCPS variants; also
              used by the FF_UTF check)
  offs2     offset of the second character of the pair (FFCPS variants)
  chars     the characters to look for, unpacked through the int_char object

Returns:    pointer to the candidate position, or NULL if there is none
              before str_end

NOTE(review): quad_word, int_char, vect_t, the V* vector macros, IN_UCHARS,
utf_continue and fast_forward_char_pair_compare are all defined outside this
file - confirm their exact semantics in the including source. */

static sljit_u8* SLJIT_FUNC FF_FUN(sljit_u8 *str_end, sljit_u8 *str_ptr, sljit_uw offs1, sljit_uw offs2, sljit_uw chars)
#undef FF_FUN
{
quad_word qw;
int_char ic;
ic.x = chars;

#if defined(FFCS)
sljit_u8 c1 = ic.c.c1;
vect_t vc1 = VDUPQ(c1);

#elif defined(FFCS_2)
sljit_u8 c1 = ic.c.c1;
vect_t vc1 = VDUPQ(c1);
sljit_u8 c2 = ic.c.c2;
vect_t vc2 = VDUPQ(c2);

#elif defined(FFCS_MASK)
sljit_u8 c1 = ic.c.c1;
vect_t vc1 = VDUPQ(c1);
sljit_u8 mask = ic.c.c2;
vect_t vmask = VDUPQ(mask);
#endif

#if defined(FFCPS)
compare_type compare1_type = compare_match1;
compare_type compare2_type = compare_match1;
vect_t cmp1a, cmp1b, cmp2a, cmp2b;
const sljit_u32 diff = IN_UCHARS(offs1 - offs2);
PCRE2_UCHAR char1a = ic.c.c1;
PCRE2_UCHAR char2a = ic.c.c3;

# ifdef FFCPS_CHAR1A2A
cmp1a = VDUPQ(char1a);
cmp2a = VDUPQ(char2a);
cmp1b = VDUPQ(0); /* to avoid errors on older compilers -Werror=maybe-uninitialized */
cmp2b = VDUPQ(0); /* to avoid errors on older compilers -Werror=maybe-uninitialized */
# else
PCRE2_UCHAR char1b = ic.c.c2;
PCRE2_UCHAR char2b = ic.c.c4;
/* Pick the comparison for the first character of the pair: a plain equality
test when both alternatives are the same, a single-bit-difference test
(compare_match1i) when they differ in exactly one bit, and a two-value test
(compare_match2) otherwise. */
if (char1a == char1b)
  {
  cmp1a = VDUPQ(char1a);
  cmp1b = VDUPQ(0); /* to avoid errors on older compilers -Werror=maybe-uninitialized */
  }
else
  {
  sljit_u32 bit1 = char1a ^ char1b;
  if (is_powerof2(bit1))
    {
    compare1_type = compare_match1i;
    cmp1a = VDUPQ(char1a | bit1);
    cmp1b = VDUPQ(bit1);
    }
  else
    {
    compare1_type = compare_match2;
    cmp1a = VDUPQ(char1a);
    cmp1b = VDUPQ(char1b);
    }
  }

/* The same selection, repeated for the second character of the pair. */
if (char2a == char2b)
  {
  cmp2a = VDUPQ(char2a);
  cmp2b = VDUPQ(0); /* to avoid errors on older compilers -Werror=maybe-uninitialized */
  }
else
  {
  sljit_u32 bit2 = char2a ^ char2b;
  if (is_powerof2(bit2))
    {
    compare2_type = compare_match1i;
    cmp2a = VDUPQ(char2a | bit2);
    cmp2b = VDUPQ(bit2);
    }
  else
    {
    compare2_type = compare_match2;
    cmp2a = VDUPQ(char2a);
    cmp2b = VDUPQ(char2b);
    }
  }
# endif

/* From here on str_ptr addresses the first character of the pair; the offset
is taken off again just before a match is returned. */
str_ptr += IN_UCHARS(offs1);
#endif

#if PCRE2_CODE_UNIT_WIDTH != 8
vect_t char_mask = VDUPQ(0xff);
#endif

#if defined(FF_UTF)
restart:;
#endif

#if defined(FFCPS)
sljit_u8 *p1 = str_ptr - diff;
#endif
/* Split str_ptr into a 16-byte aligned address and the number of bytes
(align_offset) by which the real start lies beyond it, then load the whole
aligned block. */
sljit_s32 align_offset = ((uint64_t)str_ptr & 0xf);
str_ptr = (sljit_u8 *) ((uint64_t)str_ptr & ~0xf);
vect_t data = VLD1Q(str_ptr);
#if PCRE2_CODE_UNIT_WIDTH != 8
data = VANDQ(data, char_mask);
#endif
 
#if defined(FFCS)
vect_t eq = VCEQQ(data, vc1);

#elif defined(FFCS_2)
vect_t eq1 = VCEQQ(data, vc1);
vect_t eq2 = VCEQQ(data, vc2);
vect_t eq = VORRQ(eq1, eq2);    

#elif defined(FFCS_MASK)
vect_t eq = VORRQ(data, vmask);
eq = VCEQQ(eq, vc1);

#elif defined(FFCPS)
# if defined(FFCPS_DIFF1)
vect_t prev_data = data;
# endif

/* Obtain the data for the second character: loaded from memory diff bytes
before the aligned block when p1 lies below the aligned address, otherwise
derived from the block already loaded by shifting its lanes. */
vect_t data2;
if (p1 < str_ptr)
  {
  data2 = VLD1Q(str_ptr - diff);
#if PCRE2_CODE_UNIT_WIDTH != 8
  data2 = VANDQ(data2, char_mask);
#endif
  }
else
  data2 = shift_left_n_lanes(data, offs1 - offs2);
 
if (compare1_type == compare_match1)
  data = VCEQQ(data, cmp1a);
else
  data = fast_forward_char_pair_compare(compare1_type, data, cmp1a, cmp1b);

if (compare2_type == compare_match1)
  data2 = VCEQQ(data2, cmp2a);
else
  data2 = fast_forward_char_pair_compare(compare2_type, data2, cmp2a, cmp2b);

vect_t eq = VANDQ(data, data2);
#endif

/* Store the comparison result so that it can be inspected as two 64-bit
words. Dividing the trailing-zero bit count by 8 turns the position of the
lowest set bit into a byte index.
NOTE(review): this presumably relies on little-endian lane order - confirm. */
VST1Q(qw.mem, eq);
/* Ignore matches before the first STR_PTR. */
if (align_offset < 8)
  {
  qw.dw[0] >>= align_offset * 8;
  if (qw.dw[0])
    {
    str_ptr += align_offset + __builtin_ctzll(qw.dw[0]) / 8;
    goto match;
    }
  if (qw.dw[1])
    {
    str_ptr += 8 + __builtin_ctzll(qw.dw[1]) / 8;
    goto match;
    }
  }
else
  {
  qw.dw[1] >>= (align_offset - 8) * 8;
  if (qw.dw[1])
    {
    str_ptr += align_offset + __builtin_ctzll(qw.dw[1]) / 8;
    goto match;
    }
  }
str_ptr += 16;

/* Main loop: str_ptr is 16-byte aligned from now on, so every lane of each
block is a valid candidate. */
while (str_ptr < str_end)
  {
  vect_t orig_data = VLD1Q(str_ptr);
#if PCRE2_CODE_UNIT_WIDTH != 8
  orig_data = VANDQ(orig_data, char_mask);
#endif
  data = orig_data;

#if defined(FFCS)
  eq = VCEQQ(data, vc1);

#elif defined(FFCS_2)
  eq1 = VCEQQ(data, vc1);
  eq2 = VCEQQ(data, vc2);
  eq = VORRQ(eq1, eq2);    

#elif defined(FFCS_MASK)
  eq = VORRQ(data, vmask);
  eq = VCEQQ(eq, vc1);
#endif

#if defined(FFCPS)
# if defined (FFCPS_DIFF1)
  /* Build the second-character data from the previous block and the current
  one instead of loading it from memory again. */
  data2 = VEXTQ(prev_data, data, VECTOR_FACTOR - 1);
# else
  data2 = VLD1Q(str_ptr - diff);
#  if PCRE2_CODE_UNIT_WIDTH != 8
  data2 = VANDQ(data2, char_mask);
#  endif
# endif

# ifdef FFCPS_CHAR1A2A
  data = VCEQQ(data, cmp1a);
  data2 = VCEQQ(data2, cmp2a);
# else
  if (compare1_type == compare_match1)
    data = VCEQQ(data, cmp1a);
  else
    data = fast_forward_char_pair_compare(compare1_type, data, cmp1a, cmp1b);
  if (compare2_type == compare_match1)
    data2 = VCEQQ(data2, cmp2a);
  else
    data2 = fast_forward_char_pair_compare(compare2_type, data2, cmp2a, cmp2b);
# endif

  eq = VANDQ(data, data2);
#endif

  VST1Q(qw.mem, eq);
  if (qw.dw[0])
    str_ptr += __builtin_ctzll(qw.dw[0]) / 8;
  else if (qw.dw[1])
    str_ptr += 8 + __builtin_ctzll(qw.dw[1]) / 8;
  else {
    /* Nothing found in this block: move on to the next one. */
    str_ptr += 16;
#if defined (FFCPS_DIFF1)
    prev_data = orig_data;
#endif
    continue;
  }

/* Reached either by falling through from the loop body above or by a goto
from the handling of the first (possibly unaligned) block. */
match:;
  if (str_ptr >= str_end)
    /* Failed match. */
    return NULL;

#if defined(FF_UTF)
  /* When utf_continue() rejects the candidate, skip one code unit and start
  again from the alignment step.
  NOTE(review): utf_continue() is defined elsewhere; presumably it is true
  when the addressed unit is not the start of a character - confirm. */
  if (utf_continue(str_ptr + IN_UCHARS(-offs1)))
    {
    /* Not a match. */
    str_ptr += IN_UCHARS(1);
    goto restart;
    }
#endif

  /* Match. */
#if defined (FFCPS)
  str_ptr -= IN_UCHARS(offs1);
#endif
  return str_ptr;
  }

/* Failed match. */
return NULL;
}
344 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_maketables.c:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language.
  7 | 
  8 |                        Written by Philip Hazel
  9 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 10 |           New API code Copyright (c) 2016-2020 University of Cambridge
 11 | 
 12 | -----------------------------------------------------------------------------
 13 | Redistribution and use in source and binary forms, with or without
 14 | modification, are permitted provided that the following conditions are met:
 15 | 
 16 |     * Redistributions of source code must retain the above copyright notice,
 17 |       this list of conditions and the following disclaimer.
 18 | 
 19 |     * Redistributions in binary form must reproduce the above copyright
 20 |       notice, this list of conditions and the following disclaimer in the
 21 |       documentation and/or other materials provided with the distribution.
 22 | 
 23 |     * Neither the name of the University of Cambridge nor the names of its
 24 |       contributors may be used to endorse or promote products derived from
 25 |       this software without specific prior written permission.
 26 | 
 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 37 | POSSIBILITY OF SUCH DAMAGE.
 38 | -----------------------------------------------------------------------------
 39 | */
 40 | 
 41 | 
 42 | /* This module contains the external function bundled_pcre2_maketables(), which builds
 43 | character tables for PCRE2 in the current locale. The file is compiled on its
 44 | own as part of the PCRE2 library. It is also included in the compilation of
 45 | pcre2_dftables.c as a freestanding program, in which case the macro
 46 | PCRE2_DFTABLES is defined. */
 47 | 
 48 | #ifndef PCRE2_DFTABLES    /* Compiling the library */
 49 | #  ifdef HAVE_CONFIG_H
 50 | #  include "config.h"
 51 | #  endif
 52 | #  include "pcre2_internal.h"
 53 | #endif
 54 | 
 55 | 
 56 | 
 57 | /*************************************************
 58 | *           Create PCRE2 character tables        *
 59 | *************************************************/
 60 | 
 61 | /* This function builds a set of character tables for use by PCRE2 and returns
a pointer to them. They are built using the ctype functions, and consequently
 63 | their contents will depend upon the current locale setting. When compiled as
 64 | part of the library, the store is obtained via a general context malloc, if
 65 | supplied, but when PCRE2_DFTABLES is defined (when compiling the pcre2_dftables
 66 | freestanding auxiliary program) malloc() is used, and the function has a
 67 | different name so as not to clash with the prototype in pcre2.h.
 68 | 
 69 | Arguments:   none when PCRE2_DFTABLES is defined
 70 |                else a PCRE2 general context or NULL
 71 | Returns:     pointer to the contiguous block of data
 72 |                else NULL if memory allocation failed
 73 | */
 74 | 
 75 | #ifdef PCRE2_DFTABLES  /* Included in freestanding pcre2_dftables program */
 76 | static const uint8_t *maketables(void)
 77 | {
 78 | uint8_t *yield = (uint8_t *)malloc(TABLES_LENGTH);
 79 | 
 80 | #else  /* Not PCRE2_DFTABLES, that is, compiling the library */
 81 | PCRE2_EXP_DEFN const uint8_t * PCRE2_CALL_CONVENTION
 82 | bundled_pcre2_maketables(pcre2_general_context *gcontext)
 83 | {
 84 | uint8_t *yield = (uint8_t *)((gcontext != NULL)?
 85 |   gcontext->memctl.malloc(TABLES_LENGTH, gcontext->memctl.memory_data) :
 86 |   malloc(TABLES_LENGTH));
 87 | #endif  /* PCRE2_DFTABLES */
 88 | 
 89 | int i;
 90 | uint8_t *p;
 91 | 
 92 | if (yield == NULL) return NULL;
 93 | p = yield;
 94 | 
 95 | /* First comes the lower casing table */
 96 | 
 97 | for (i = 0; i < 256; i++) *p++ = tolower(i);
 98 | 
 99 | /* Next the case-flipping table */
100 | 
101 | for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i);
102 | 
103 | /* Then the character class tables. Don't try to be clever and save effort on
104 | exclusive ones - in some locales things may be different.
105 | 
106 | Note that the table for "space" includes everything "isspace" gives, including
107 | VT in the default locale. This makes it work for the POSIX class [:space:].
108 | From PCRE1 release 8.34 and for all PCRE2 releases it is also correct for Perl
109 | space, because Perl added VT at release 5.18.
110 | 
111 | Note also that it is possible for a character to be alnum or alpha without
112 | being lower or upper, such as "male and female ordinals" (\xAA and \xBA) in the
113 | fr_FR locale (at least under Debian Linux's locales as of 12/2005). So we must
114 | test for alnum specially. */
115 | 
116 | memset(p, 0, cbit_length);
117 | for (i = 0; i < 256; i++)
118 |   {
119 |   if (isdigit(i))  p[cbit_digit  + i/8] |= 1u << (i&7);
120 |   if (isupper(i))  p[cbit_upper  + i/8] |= 1u << (i&7);
121 |   if (islower(i))  p[cbit_lower  + i/8] |= 1u << (i&7);
122 |   if (isalnum(i))  p[cbit_word   + i/8] |= 1u << (i&7);
123 |   if (i == '_')    p[cbit_word   + i/8] |= 1u << (i&7);
124 |   if (isspace(i))  p[cbit_space  + i/8] |= 1u << (i&7);
125 |   if (isxdigit(i)) p[cbit_xdigit + i/8] |= 1u << (i&7);
126 |   if (isgraph(i))  p[cbit_graph  + i/8] |= 1u << (i&7);
127 |   if (isprint(i))  p[cbit_print  + i/8] |= 1u << (i&7);
128 |   if (ispunct(i))  p[cbit_punct  + i/8] |= 1u << (i&7);
129 |   if (iscntrl(i))  p[cbit_cntrl  + i/8] |= 1u << (i&7);
130 |   }
131 | p += cbit_length;
132 | 
133 | /* Finally, the character type table. In this, we used to exclude VT from the
134 | white space chars, because Perl didn't recognize it as such for \s and for
135 | comments within regexes. However, Perl changed at release 5.18, so PCRE1
136 | changed at release 8.34 and it's always been this way for PCRE2. */
137 | 
138 | for (i = 0; i < 256; i++)
139 |   {
140 |   int x = 0;
141 |   if (isspace(i)) x += ctype_space;
142 |   if (isalpha(i)) x += ctype_letter;
143 |   if (islower(i)) x += ctype_lcletter;
144 |   if (isdigit(i)) x += ctype_digit;
145 |   if (isalnum(i) || i == '_') x += ctype_word;
146 |   *p++ = x;
147 |   }
148 | 
149 | return yield;
150 | }
151 | 
152 | #ifndef PCRE2_DFTABLES   /* Compiling the library */
153 | PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
154 | bundled_pcre2_maketables_free(pcre2_general_context *gcontext, const uint8_t *tables)
155 | {
156 |   if (gcontext)
157 |     gcontext->memctl.free((void *)tables, gcontext->memctl.memory_data);
158 |   else
159 |     free((void *)tables);
160 | }
161 | #endif
162 | 
163 | /* End of pcre2_maketables.c */
164 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_match_data.c:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language.
  7 | 
  8 |                        Written by Philip Hazel
  9 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 10 |           New API code Copyright (c) 2016-2019 University of Cambridge
 11 | 
 12 | -----------------------------------------------------------------------------
 13 | Redistribution and use in source and binary forms, with or without
 14 | modification, are permitted provided that the following conditions are met:
 15 | 
 16 |     * Redistributions of source code must retain the above copyright notice,
 17 |       this list of conditions and the following disclaimer.
 18 | 
 19 |     * Redistributions in binary form must reproduce the above copyright
 20 |       notice, this list of conditions and the following disclaimer in the
 21 |       documentation and/or other materials provided with the distribution.
 22 | 
 23 |     * Neither the name of the University of Cambridge nor the names of its
 24 |       contributors may be used to endorse or promote products derived from
 25 |       this software without specific prior written permission.
 26 | 
 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 37 | POSSIBILITY OF SUCH DAMAGE.
 38 | -----------------------------------------------------------------------------
 39 | */
 40 | 
 41 | 
 42 | #ifdef HAVE_CONFIG_H
 43 | #include "config.h"
 44 | #endif
 45 | 
 46 | #include "pcre2_internal.h"
 47 | 
 48 | 
 49 | 
 50 | /*************************************************
 51 | *  Create a match data block given ovector size  *
 52 | *************************************************/
 53 | 
 54 | /* A minimum of 1 is imposed on the number of ovector pairs. */
 55 | 
 56 | PCRE2_EXP_DEFN pcre2_match_data * PCRE2_CALL_CONVENTION
 57 | bundled_pcre2_match_data_create(uint32_t oveccount, pcre2_general_context *gcontext)
 58 | {
 59 | pcre2_match_data *yield;
 60 | if (oveccount < 1) oveccount = 1;
 61 | yield = PRIV(memctl_malloc)(
 62 |   offsetof(pcre2_match_data, ovector) + 2*oveccount*sizeof(PCRE2_SIZE),
 63 |   (pcre2_memctl *)gcontext);
 64 | if (yield == NULL) return NULL;
 65 | yield->oveccount = oveccount;
 66 | yield->flags = 0;
 67 | return yield;
 68 | }
 69 | 
 70 | 
 71 | 
 72 | /*************************************************
 73 | *  Create a match data block using pattern data  *
 74 | *************************************************/
 75 | 
 76 | /* If no context is supplied, use the memory allocator from the code. */
 77 | 
 78 | PCRE2_EXP_DEFN pcre2_match_data * PCRE2_CALL_CONVENTION
 79 | bundled_pcre2_match_data_create_from_pattern(const pcre2_code *code,
 80 |   pcre2_general_context *gcontext)
 81 | {
 82 | if (gcontext == NULL) gcontext = (pcre2_general_context *)code;
 83 | return bundled_pcre2_match_data_create(((pcre2_real_code *)code)->top_bracket + 1,
 84 |   gcontext);
 85 | }
 86 | 
 87 | 
 88 | 
 89 | /*************************************************
 90 | *            Free a match data block             *
 91 | *************************************************/
 92 | 
 93 | PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
 94 | bundled_pcre2_match_data_free(pcre2_match_data *match_data)
 95 | {
 96 | if (match_data != NULL)
 97 |   {
 98 |   if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
 99 |     match_data->memctl.free((void *)match_data->subject,
100 |       match_data->memctl.memory_data);
101 |   match_data->memctl.free(match_data, match_data->memctl.memory_data);
102 |   }
103 | }
104 | 
105 | 
106 | 
107 | /*************************************************
108 | *         Get last mark in match                 *
109 | *************************************************/
110 | 
111 | PCRE2_EXP_DEFN PCRE2_SPTR PCRE2_CALL_CONVENTION
112 | bundled_pcre2_get_mark(pcre2_match_data *match_data)
113 | {
114 | return match_data->mark;
115 | }
116 | 
117 | 
118 | 
119 | /*************************************************
120 | *          Get pointer to ovector                *
121 | *************************************************/
122 | 
123 | PCRE2_EXP_DEFN PCRE2_SIZE * PCRE2_CALL_CONVENTION
124 | bundled_pcre2_get_ovector_pointer(pcre2_match_data *match_data)
125 | {
126 | return match_data->ovector;
127 | }
128 | 
129 | 
130 | 
131 | /*************************************************
132 | *          Get number of ovector slots           *
133 | *************************************************/
134 | 
135 | PCRE2_EXP_DEFN uint32_t PCRE2_CALL_CONVENTION
136 | bundled_pcre2_get_ovector_count(pcre2_match_data *match_data)
137 | {
138 | return match_data->oveccount;
139 | }
140 | 
141 | 
142 | 
143 | /*************************************************
144 | *         Get starting code unit in match        *
145 | *************************************************/
146 | 
147 | PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION
148 | bundled_pcre2_get_startchar(pcre2_match_data *match_data)
149 | {
150 | return match_data->startchar;
151 | }
152 | 
153 | 
154 | 
155 | /*************************************************
156 | *         Get size of match data block           *
157 | *************************************************/
158 | 
159 | PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION
160 | bundled_pcre2_get_match_data_size(pcre2_match_data *match_data)
161 | {
162 | return offsetof(pcre2_match_data, ovector) +
163 |   2 * (match_data->oveccount) * sizeof(PCRE2_SIZE);
164 | }
165 | 
166 | /* End of pcre2_match_data.c */
167 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_newline.c:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language.
  7 | 
  8 |                        Written by Philip Hazel
  9 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 10 |          New API code Copyright (c) 2016 University of Cambridge
 11 | 
 12 | -----------------------------------------------------------------------------
 13 | Redistribution and use in source and binary forms, with or without
 14 | modification, are permitted provided that the following conditions are met:
 15 | 
 16 |     * Redistributions of source code must retain the above copyright notice,
 17 |       this list of conditions and the following disclaimer.
 18 | 
 19 |     * Redistributions in binary form must reproduce the above copyright
 20 |       notice, this list of conditions and the following disclaimer in the
 21 |       documentation and/or other materials provided with the distribution.
 22 | 
 23 |     * Neither the name of the University of Cambridge nor the names of its
 24 |       contributors may be used to endorse or promote products derived from
 25 |       this software without specific prior written permission.
 26 | 
 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 37 | POSSIBILITY OF SUCH DAMAGE.
 38 | -----------------------------------------------------------------------------
 39 | */
 40 | 
 41 | 
 42 | /* This module contains internal functions for testing newlines when more than
 43 | one kind of newline is to be recognized. When a newline is found, its length is
 44 | returned. In principle, we could implement several newline "types", each
 45 | referring to a different set of newline characters. At present, PCRE2 supports
 46 | only NLTYPE_FIXED, which gets handled without these functions, NLTYPE_ANYCRLF,
 47 | and NLTYPE_ANY. The full list of Unicode newline characters is taken from
 48 | http://unicode.org/unicode/reports/tr18/. */
 49 | 
 50 | 
 51 | #ifdef HAVE_CONFIG_H
 52 | #include "config.h"
 53 | #endif
 54 | 
 55 | #include "pcre2_internal.h"
 56 | 
 57 | 
 58 | 
 59 | /*************************************************
 60 | *      Check for newline at given position       *
 61 | *************************************************/
 62 | 
 63 | /* This function is called only via the IS_NEWLINE macro, which does so only
 64 | when the newline type is NLTYPE_ANY or NLTYPE_ANYCRLF. The case of a fixed
 65 | newline (NLTYPE_FIXED) is handled inline. It is guaranteed that ptr points to
 66 | a code unit that lies before the end of the string.
 67 | 
 68 | Arguments:
 69 |   ptr          pointer to possible newline
 70 |   type         the newline type
 71 |   endptr       pointer to the end of the string
 72 |   lenptr       where to return the length of the newline, in code units
 73 |   utf          TRUE if in utf mode
 74 | 
 75 | Returns:       TRUE, with *lenptr set, or FALSE, with *lenptr left untouched
 76 | */
 77 | 
 78 | BOOL
 79 | PRIV(is_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR endptr,
 80 |   uint32_t *lenptr, BOOL utf)
 81 | {
 82 | uint32_t c;
 83 | /* Load the character at ptr; in UTF mode it is fetched with GETCHAR. */
 84 | #ifdef SUPPORT_UNICODE
 85 | if (utf) { GETCHAR(c, ptr); } else c = *ptr;
 86 | #else
 87 | (void)utf;
 88 | c = *ptr;
 89 | #endif  /* SUPPORT_UNICODE */
 90 | 
 91 | if (type == NLTYPE_ANYCRLF) switch(c)
 92 |   {
 93 |   case CHAR_LF:
 94 |   *lenptr = 1;
 95 |   return TRUE;
 96 | 
 97 |   case CHAR_CR:
 98 |   *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1;  /* 2 for CR LF */
 99 |   return TRUE;
100 | 
101 |   default:
102 |   return FALSE;
103 |   }
104 | 
105 | /* NLTYPE_ANY: VT, FF, NEL, LS and PS count as newlines as well as CR and LF. */
106 | 
107 | else switch(c)
108 |   {
109 | #ifdef EBCDIC
110 |   case CHAR_NEL:
111 | #endif
112 |   case CHAR_LF:
113 |   case CHAR_VT:
114 |   case CHAR_FF:
115 |   *lenptr = 1;
116 |   return TRUE;
117 | 
118 |   case CHAR_CR:
119 |   *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1;  /* 2 for CR LF */
120 |   return TRUE;
121 | 
122 | #ifndef EBCDIC
123 | #if PCRE2_CODE_UNIT_WIDTH == 8
124 |   case CHAR_NEL:
125 |   *lenptr = utf? 2 : 1;   /* two 8-bit code units when in UTF mode */
126 |   return TRUE;
127 | 
128 |   case 0x2028:   /* LS */
129 |   case 0x2029:   /* PS */
130 |   *lenptr = 3;            /* three 8-bit code units each */
131 |   return TRUE;
132 | 
133 | #else  /* 16-bit or 32-bit code units */
134 |   case CHAR_NEL:
135 |   case 0x2028:   /* LS */
136 |   case 0x2029:   /* PS */
137 |   *lenptr = 1;            /* each fits in a single wide code unit */
138 |   return TRUE;
139 | #endif
140 | #endif /* Not EBCDIC */
141 | 
142 |   default:
143 |   return FALSE;
144 |   }
145 | }
146 | 
147 | 
148 | 
149 | /*************************************************
150 | *     Check for newline at previous position     *
151 | *************************************************/
152 | 
153 | /* This function is called only via the WAS_NEWLINE macro, which does so only
154 | when the newline type is NLTYPE_ANY or NLTYPE_ANYCRLF. The case of a fixed
155 | newline (NLTYPE_FIXED) is handled inline. It is guaranteed that the initial
156 | value of ptr is greater than the start of the string that is being processed.
157 | 
158 | Arguments:
159 |   ptr          pointer just past the possible newline
160 |   type         the newline type
161 |   startptr     pointer to the start of the string
162 |   lenptr       where to return the length of the newline, in code units
163 |   utf          TRUE if in utf mode
164 | 
165 | Returns:       TRUE, with *lenptr set, or FALSE, with *lenptr left untouched
166 | */
167 | 
168 | BOOL
169 | PRIV(was_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR startptr,
170 |   uint32_t *lenptr, BOOL utf)
171 | {
172 | uint32_t c;
173 | ptr--;            /* step back to the code unit that precedes ptr */
174 | 
175 | #ifdef SUPPORT_UNICODE
176 | if (utf)
177 |   {
178 |   BACKCHAR(ptr);  /* presumably moves to the character's first unit - confirm */
179 |   GETCHAR(c, ptr);
180 |   }
181 | else c = *ptr;
182 | #else
183 | (void)utf;
184 | c = *ptr;
185 | #endif  /* SUPPORT_UNICODE */
186 | 
187 | if (type == NLTYPE_ANYCRLF) switch(c)
188 |   {
189 |   case CHAR_LF:
190 |   *lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1;  /* 2 after a CR */
191 |   return TRUE;
192 | 
193 |   case CHAR_CR:
194 |   *lenptr = 1;
195 |   return TRUE;
196 | 
197 |   default:
198 |   return FALSE;
199 |   }
200 | 
201 | /* NLTYPE_ANY: VT, FF, NEL, LS and PS count as newlines as well as CR and LF. */
202 | 
203 | else switch(c)
204 |   {
205 |   case CHAR_LF:
206 |   *lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1;  /* 2 after a CR */
207 |   return TRUE;
208 | 
209 | #ifdef EBCDIC
210 |   case CHAR_NEL:
211 | #endif
212 |   case CHAR_VT:
213 |   case CHAR_FF:
214 |   case CHAR_CR:
215 |   *lenptr = 1;
216 |   return TRUE;
217 | 
218 | #ifndef EBCDIC
219 | #if PCRE2_CODE_UNIT_WIDTH == 8
220 |   case CHAR_NEL:
221 |   *lenptr = utf? 2 : 1;   /* two 8-bit code units when in UTF mode */
222 |   return TRUE;
223 | 
224 |   case 0x2028:   /* LS */
225 |   case 0x2029:   /* PS */
226 |   *lenptr = 3;            /* three 8-bit code units each */
227 |   return TRUE;
228 | 
229 | #else /* 16-bit or 32-bit code units */
230 |   case CHAR_NEL:
231 |   case 0x2028:   /* LS */
232 |   case 0x2029:   /* PS */
233 |   *lenptr = 1;            /* each fits in a single wide code unit */
234 |   return TRUE;
235 | #endif
236 | #endif /* Not EBCDIC */
237 | 
238 |   default:
239 |   return FALSE;
240 |   }
241 | }
242 | 
243 | /* End of pcre2_newline.c */
244 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_ord2utf.c:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language.
  7 | 
  8 |                        Written by Philip Hazel
  9 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 10 |          New API code Copyright (c) 2016 University of Cambridge
 11 | 
 12 | -----------------------------------------------------------------------------
 13 | Redistribution and use in source and binary forms, with or without
 14 | modification, are permitted provided that the following conditions are met:
 15 | 
 16 |     * Redistributions of source code must retain the above copyright notice,
 17 |       this list of conditions and the following disclaimer.
 18 | 
 19 |     * Redistributions in binary form must reproduce the above copyright
 20 |       notice, this list of conditions and the following disclaimer in the
 21 |       documentation and/or other materials provided with the distribution.
 22 | 
 23 |     * Neither the name of the University of Cambridge nor the names of its
 24 |       contributors may be used to endorse or promote products derived from
 25 |       this software without specific prior written permission.
 26 | 
 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 37 | POSSIBILITY OF SUCH DAMAGE.
 38 | -----------------------------------------------------------------------------
 39 | */
 40 | 
 41 | 
 42 | /* This file contains a function that converts a Unicode character code point
 43 | into a UTF string. The behaviour is different for each code unit width. */
 44 | 
 45 | 
 46 | #ifdef HAVE_CONFIG_H
 47 | #include "config.h"
 48 | #endif
 49 | 
 50 | #include "pcre2_internal.h"
 51 | 
 52 | 
 53 | /* When SUPPORT_UNICODE is not defined, nothing calls this function. A dummy
 54 | version is still supplied, because some compilers do not like empty source
 55 | modules. */
 56 | 
 57 | #ifndef SUPPORT_UNICODE
 58 | unsigned int
 59 | PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer)
 60 | {
 61 | (void)cvalue;   /* deliberately unused */
 62 | (void)buffer;   /* deliberately unused */
 63 | return 0;
 64 | }
 65 | #else  /* SUPPORT_UNICODE */
 66 | 
 67 | 
 68 | /*************************************************
 69 | *          Convert code point to UTF             *
 70 | *************************************************/
 71 | 
 72 | /* The layout written to buffer depends on PCRE2_CODE_UNIT_WIDTH.
 73 | Arguments:
 74 |   cvalue     the code point to encode
 75 |   buffer     where the code units are written
 76 | 
 77 | Returns:     how many code units were written to buffer
 78 | */
 79 | 
 80 | unsigned int
 81 | PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer)
 82 | {
 83 | #if PCRE2_CODE_UNIT_WIDTH == 8      /* UTF-8 */
 84 | int extra = 0;
 85 | int pos;
 86 | 
 87 | /* Count the continuation units needed, fill them in from the last one
 88 | backwards, then put the lead unit in front. */
 89 | 
 90 | while (extra < PRIV(utf8_table1_size) &&
 91 |        (int)cvalue > PRIV(utf8_table1)[extra]) extra++;
 92 | for (pos = extra; pos > 0; pos--)
 93 |   {
 94 |   buffer[pos] = 0x80 | (cvalue & 0x3f);
 95 |   cvalue >>= 6;
 96 |   }
 97 | buffer[0] = PRIV(utf8_table2)[extra] | cvalue;
 98 | return extra + 1;
 99 | 
100 | #elif PCRE2_CODE_UNIT_WIDTH == 16   /* UTF-16 */
101 | if (cvalue > 0xffff)
102 |   {
103 |   /* Too big for one unit: write a high and a low surrogate. */
104 |   cvalue -= 0x10000;
105 |   buffer[0] = 0xd800 | (cvalue >> 10);
106 |   buffer[1] = 0xdc00 | (cvalue & 0x3ff);
107 |   return 2;
108 |   }
109 | buffer[0] = (PCRE2_UCHAR)cvalue;
110 | return 1;
111 | 
112 | #else                               /* UTF-32 */
113 | buffer[0] = (PCRE2_UCHAR)cvalue;
114 | return 1;
115 | 
116 | #endif
117 | }
118 | #endif  /* SUPPORT_UNICODE */
119 | 
120 | /* End of pcre2_ord2utf.c */
121 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_serialize.c:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language.
  7 | 
  8 |                        Written by Philip Hazel
  9 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 10 |           New API code Copyright (c) 2016-2020 University of Cambridge
 11 | 
 12 | -----------------------------------------------------------------------------
 13 | Redistribution and use in source and binary forms, with or without
 14 | modification, are permitted provided that the following conditions are met:
 15 | 
 16 |     * Redistributions of source code must retain the above copyright notice,
 17 |       this list of conditions and the following disclaimer.
 18 | 
 19 |     * Redistributions in binary form must reproduce the above copyright
 20 |       notice, this list of conditions and the following disclaimer in the
 21 |       documentation and/or other materials provided with the distribution.
 22 | 
 23 |     * Neither the name of the University of Cambridge nor the names of its
 24 |       contributors may be used to endorse or promote products derived from
 25 |       this software without specific prior written permission.
 26 | 
 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 37 | POSSIBILITY OF SUCH DAMAGE.
 38 | -----------------------------------------------------------------------------
 39 | */
 40 | 
 41 | /* This module contains functions for serializing and deserializing
 42 | a sequence of compiled codes. */
 43 | 
 44 | 
 45 | #ifdef HAVE_CONFIG_H
 46 | #include "config.h"
 47 | #endif
 48 | 
 49 | 
 50 | #include "pcre2_internal.h"
 51 | 
 52 | /* Magic number to provide a small check against being handed junk. */
 53 | 
 54 | #define SERIALIZED_DATA_MAGIC 0x50523253u
 55 | 
 56 | /* Deserialization is limited to the current PCRE version and
 57 | character width. */
 58 | 
 59 | #define SERIALIZED_DATA_VERSION \
 60 |   ((PCRE2_MAJOR) | ((PCRE2_MINOR) << 16))
 61 | 
 62 | #define SERIALIZED_DATA_CONFIG \
 63 |   (sizeof(PCRE2_UCHAR) | ((sizeof(void*)) << 8) | ((sizeof(PCRE2_SIZE)) << 16))
 64 | 
 65 | 
 66 | 
 67 | /*************************************************
 68 | *           Serialize compiled patterns          *
 69 | *************************************************/
 70 | /* Copy the given patterns and one copy of their tables into a new block. */
 71 | PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION
 72 | bundled_pcre2_serialize_encode(const pcre2_code **codes, int32_t number_of_codes,
 73 |    uint8_t **serialized_bytes, PCRE2_SIZE *serialized_size,
 74 |    pcre2_general_context *gcontext)
 75 | {
 76 | uint8_t *bytes;
 77 | uint8_t *dst_bytes;
 78 | int32_t i;
 79 | PCRE2_SIZE total_size;
 80 | const pcre2_real_code *re;
 81 | const uint8_t *tables;
 82 | pcre2_serialized_data *data;
 83 | 
 84 | const pcre2_memctl *memctl = (gcontext != NULL) ?
 85 |   &gcontext->memctl : &PRIV(default_compile_context).memctl;
 86 | 
 87 | if (codes == NULL || serialized_bytes == NULL || serialized_size == NULL)
 88 |   return PCRE2_ERROR_NULL;
 89 | 
 90 | if (number_of_codes <= 0) return PCRE2_ERROR_BADDATA;
 91 | 
 92 | /* Compute total size: the header, one set of tables, and every code block. */
 93 | total_size = sizeof(pcre2_serialized_data) + TABLES_LENGTH;
 94 | tables = NULL;
 95 | 
 96 | for (i = 0; i < number_of_codes; i++)
 97 |   {
 98 |   if (codes[i] == NULL) return PCRE2_ERROR_NULL;
 99 |   re = (const pcre2_real_code *)(codes[i]);
100 |   if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
101 |   if (tables == NULL)
102 |     tables = re->tables;
103 |   else if (tables != re->tables)
104 |     return PCRE2_ERROR_MIXEDTABLES;   /* every code must use the same tables */
105 |   total_size += re->blocksize;
106 |   }
107 | 
108 | /* Allocate the stream, with room for a pcre2_memctl in front of the data. */
109 | bytes = memctl->malloc(total_size + sizeof(pcre2_memctl), memctl->memory_data);
110 | if (bytes == NULL) return PCRE2_ERROR_NOMEMORY;
111 | 
112 | /* The controller is stored as a hidden parameter, ahead of the data. */
113 | memcpy(bytes, memctl, sizeof(pcre2_memctl));
114 | bytes += sizeof(pcre2_memctl);
115 | 
116 | data = (pcre2_serialized_data *)bytes;
117 | data->magic = SERIALIZED_DATA_MAGIC;
118 | data->version = SERIALIZED_DATA_VERSION;
119 | data->config = SERIALIZED_DATA_CONFIG;
120 | data->number_of_codes = number_of_codes;
121 | 
122 | /* Copy the tables once, then all compiled code data. */
123 | dst_bytes = bytes + sizeof(pcre2_serialized_data);
124 | memcpy(dst_bytes, tables, TABLES_LENGTH);
125 | dst_bytes += TABLES_LENGTH;
126 | 
127 | for (i = 0; i < number_of_codes; i++)
128 |   {
129 |   re = (const pcre2_real_code *)(codes[i]);
130 |   (void)memcpy(dst_bytes, (char *)re, re->blocksize);
131 |   
132 |   /* Certain fields in the compiled code block are re-set during
133 |   deserialization. In order to ensure that the serialized data stream is always
134 |   the same for the same pattern, set them to zero here. We can't assume the
135 |   copy of the pattern is correctly aligned for accessing the fields as part of
136 |   a structure. Note the use of sizeof(void *) in the second of these, to
137 |   specify the size of a pointer. If sizeof(uint8_t *) is used (tables is a
138 |   pointer to uint8_t), gcc gives a warning because the first argument is also a
139 |   pointer to uint8_t. Casting the first argument to (void *) can stop this, but
140 |   it didn't stop Coverity giving the same complaint. */
141 |   
142 |   (void)memset(dst_bytes + offsetof(pcre2_real_code, memctl), 0, 
143 |     sizeof(pcre2_memctl));
144 |   (void)memset(dst_bytes + offsetof(pcre2_real_code, tables), 0, 
145 |     sizeof(void *));
146 |   (void)memset(dst_bytes + offsetof(pcre2_real_code, executable_jit), 0,
147 |     sizeof(void *));        
148 |  
149 |   dst_bytes += re->blocksize;
150 |   }
151 | 
152 | *serialized_bytes = bytes;      /* points just past the hidden controller */
153 | *serialized_size = total_size;  /* does not count the hidden controller */
154 | return number_of_codes;
155 | }
156 | 
157 | 
158 | /*************************************************
159 | *          Deserialize compiled patterns         *
160 | *************************************************/
161 | 
162 | /* Rebuilds patterns from serialized data. At most number_of_codes entries of
163 | codes are filled in; the return value is that count or a PCRE2_ERROR_ value.
164 | If a block is rejected or memory runs out part-way through, the tables copy
165 | and the patterns already decoded are freed, and their codes slots are set to
166 | NULL, so a failed call leaves nothing for the caller to release. */
167 | 
168 | PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION
169 | bundled_pcre2_serialize_decode(pcre2_code **codes, int32_t number_of_codes,
170 |    const uint8_t *bytes, pcre2_general_context *gcontext)
171 | {
172 | const pcre2_serialized_data *data = (const pcre2_serialized_data *)bytes;
173 | const pcre2_memctl *memctl = (gcontext != NULL) ?
174 |   &gcontext->memctl : &PRIV(default_compile_context).memctl;
175 | const uint8_t *src_bytes;
176 | pcre2_real_code *dst_re;
177 | uint8_t *tables;
178 | int32_t i, j;
179 | int32_t rc = PCRE2_ERROR_BADSERIALIZEDDATA;  /* changed only if malloc fails */
180 | 
181 | /* Sanity checks. */
182 | if (data == NULL || codes == NULL) return PCRE2_ERROR_NULL;
183 | if (number_of_codes <= 0) return PCRE2_ERROR_BADDATA;
184 | if (data->number_of_codes <= 0) return PCRE2_ERROR_BADSERIALIZEDDATA;
185 | if (data->magic != SERIALIZED_DATA_MAGIC) return PCRE2_ERROR_BADMAGIC;
186 | if (data->version != SERIALIZED_DATA_VERSION) return PCRE2_ERROR_BADMODE;
187 | if (data->config != SERIALIZED_DATA_CONFIG) return PCRE2_ERROR_BADMODE;
188 | 
189 | if (number_of_codes > data->number_of_codes)
190 |   number_of_codes = data->number_of_codes;
191 | src_bytes = bytes + sizeof(pcre2_serialized_data);
192 | 
193 | /* Decode tables. The reference count for the tables is stored immediately
194 | following them. */
195 | tables = memctl->malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE), memctl->memory_data);
196 | if (tables == NULL) return PCRE2_ERROR_NOMEMORY;
197 | memcpy(tables, src_bytes, TABLES_LENGTH);
198 | *(PCRE2_SIZE *)(tables + TABLES_LENGTH) = number_of_codes;
199 | src_bytes += TABLES_LENGTH;
200 | 
201 | /* Decode the byte stream. We must not try to read the size from the compiled
202 | code block in the stream, because it might be unaligned, which causes errors on
203 | hardware such as Sparc-64 that doesn't like unaligned memory accesses. The type
204 | of the blocksize field is given its own name to ensure that it is the same here
205 | as in the block. A failure leaves the loop early, with i < number_of_codes. */
206 | 
207 | for (i = 0; i < number_of_codes; i++)
208 |   {
209 |   CODE_BLOCKSIZE_TYPE blocksize;
210 |   memcpy(&blocksize, src_bytes + offsetof(pcre2_real_code, blocksize),
211 |     sizeof(CODE_BLOCKSIZE_TYPE));
212 |   if (blocksize <= sizeof(pcre2_real_code)) break;
213 | 
214 |   /* The allocator provided by gcontext replaces the original one. */
215 |   dst_re = (pcre2_real_code *)PRIV(memctl_malloc)(blocksize,
216 |     (pcre2_memctl *)gcontext);
217 |   if (dst_re == NULL)
218 |     {
219 |     rc = PCRE2_ERROR_NOMEMORY;
220 |     break;
221 |     }
222 | 
223 |   /* The new allocator must be preserved. */
224 |   memcpy(((uint8_t *)dst_re) + sizeof(pcre2_memctl),
225 |     src_bytes + sizeof(pcre2_memctl), blocksize - sizeof(pcre2_memctl));
226 |   if (dst_re->magic_number != MAGIC_NUMBER ||
227 |       dst_re->name_entry_size > MAX_NAME_SIZE + IMM2_SIZE + 1 ||
228 |       dst_re->name_count > MAX_NAME_COUNT)
229 |     {
230 |     memctl->free(dst_re, memctl->memory_data);
231 |     break;
232 |     }
233 | 
234 |   /* At the moment only one table is supported. */
235 |   dst_re->tables = tables;
236 |   dst_re->executable_jit = NULL;
237 |   dst_re->flags |= PCRE2_DEREF_TABLES;
238 |   codes[i] = dst_re;
239 |   src_bytes += blocksize;
240 |   }
241 | if (i == number_of_codes) return number_of_codes;
242 | 
243 | /* Failure: release the tables and every pattern decoded before the stop. */
244 | memctl->free(tables, memctl->memory_data);
245 | for (j = 0; j < i; j++)
246 |   {
247 |   memctl->free(codes[j], memctl->memory_data);
248 |   codes[j] = NULL;
249 |   }
250 | return rc;
251 | }
252 | 
253 | 
254 | /*************************************************
255 | *    Get the number of serialized patterns       *
256 | *************************************************/
257 | /* Returns the count held in the stream header, or a PCRE2_ERROR_ value. */
258 | PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION
259 | bundled_pcre2_serialize_get_number_of_codes(const uint8_t *bytes)
260 | {
261 | const pcre2_serialized_data *header;
262 | 
263 | if (bytes == NULL) return PCRE2_ERROR_NULL;
264 | header = (const pcre2_serialized_data *)bytes;
265 | if (header->magic != SERIALIZED_DATA_MAGIC) return PCRE2_ERROR_BADMAGIC;
266 | if (header->version != SERIALIZED_DATA_VERSION ||
267 |     header->config != SERIALIZED_DATA_CONFIG) return PCRE2_ERROR_BADMODE;
268 | return header->number_of_codes;
269 | }
270 | 
271 | 
272 | /*************************************************
273 | *            Free the allocated stream           *
274 | *************************************************/
275 | /* A pcre2_memctl sits just before bytes; its free function is called on it. */
276 | PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
277 | bundled_pcre2_serialize_free(uint8_t *bytes)
278 | {
279 | pcre2_memctl *hidden;
280 | 
281 | if (bytes == NULL) return;
282 | hidden = (pcre2_memctl *)(bytes - sizeof(pcre2_memctl));
283 | hidden->free(hidden, hidden->memory_data);
284 | }
285 | 
286 | /* End of pcre2_serialize.c */
287 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_string_utils.c:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language.
  7 | 
  8 |                        Written by Philip Hazel
  9 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 10 |           New API code Copyright (c) 2018 University of Cambridge
 11 | 
 12 | -----------------------------------------------------------------------------
 13 | Redistribution and use in source and binary forms, with or without
 14 | modification, are permitted provided that the following conditions are met:
 15 | 
 16 |     * Redistributions of source code must retain the above copyright notice,
 17 |       this list of conditions and the following disclaimer.
 18 | 
 19 |     * Redistributions in binary form must reproduce the above copyright
 20 |       notice, this list of conditions and the following disclaimer in the
 21 |       documentation and/or other materials provided with the distribution.
 22 | 
 23 |     * Neither the name of the University of Cambridge nor the names of its
 24 |       contributors may be used to endorse or promote products derived from
 25 |       this software without specific prior written permission.
 26 | 
 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 37 | POSSIBILITY OF SUCH DAMAGE.
 38 | -----------------------------------------------------------------------------
 39 | */
 40 | 
 41 | /* This module contains internal functions for comparing and finding the length
 42 | of strings. These are used instead of strcmp() etc because the standard
 43 | functions work only on 8-bit data. */
 44 | 
 45 | 
 46 | #ifdef HAVE_CONFIG_H
 47 | #include "config.h"
 48 | #endif
 49 | 
 50 | #include "pcre2_internal.h"
 51 | 
 52 | 
 53 | /*************************************************
 54 | *    Emulated memmove() for systems without it   *
 55 | *************************************************/
 56 | 
 57 | /* Copies n bytes from s to d, allowing the regions to overlap, and returns d.
 58 | It uses bcopy() when available; otherwise it moves the bytes one at a time, as
 59 | there are some non-Unix environments that lack both memmove() and bcopy(). */
 60 | 
 61 | #if !defined(VPCOMPAT) && !defined(HAVE_MEMMOVE)
 62 | void *
 63 | PRIV(memmove)(void *d, const void *s, size_t n)
 64 | {
 65 | #ifdef HAVE_BCOPY
 66 | bcopy(s, d, n);
 67 | #else
 68 | unsigned char *to = (unsigned char *)d;
 69 | const unsigned char *from = (const unsigned char *)s;
 70 | size_t left = n;
 71 | if (to <= from)
 72 |   {
 73 |   /* Destination is not above the source: copy forwards. */
 74 |   while (left-- > 0) *to++ = *from++;
 75 |   }
 76 | else
 77 |   {
 78 |   /* Destination is above the source: copy backwards from the end. */
 79 |   to += n;
 80 |   from += n;
 81 |   while (left-- > 0) *(--to) = *(--from);
 82 |   }
 83 | #endif   /* not HAVE_BCOPY */
 84 | return d;
 85 | }
 86 | #endif   /* not VPCOMPAT && not HAVE_MEMMOVE */
 87 | 
 88 | 
 89 | /*************************************************
 90 | *    Compare two zero-terminated PCRE2 strings   *
 91 | *************************************************/
 92 | 
 93 | /* The strings are compared code unit by code unit, up to and including the
 94 | terminating zero.
 95 | 
 96 | Arguments:
 97 |   str1        first string
 98 |   str2        second string
 99 | 
100 | Returns:      0 if equal; at the first difference, 1 if str1 is larger, else -1
101 | */
102 | 
103 | int
104 | PRIV(strcmp)(PCRE2_SPTR str1, PCRE2_SPTR str2)
105 | {
106 | for (;; str1++, str2++)
107 |   {
108 |   const PCRE2_UCHAR left = *str1, right = *str2;
109 |   if (left != right) return (left > right)? 1 : -1;
110 |   if (left == '\0') return 0;   /* both strings ended together */
111 |   }
112 | }
113 | 
114 | 
115 | /*************************************************
116 | *  Compare zero-terminated PCRE2 & 8-bit strings *
117 | *************************************************/
118 | 
119 | /* As the 8-bit string is almost always a literal, its type is specified as
120 | const char *. Each of its characters is converted to PCRE2_UCHAR before it is
121 | compared.
122 | 
123 | Arguments:
124 |   str1        first string (PCRE2 code units)
125 |   str2        second string (8-bit)
126 | 
127 | Returns:      0, 1, or -1
128 | */
129 | 
130 | int
131 | PRIV(strcmp_c8)(PCRE2_SPTR str1, const char *str2)
132 | {
133 | for (;; str1++, str2++)
134 |   {
135 |   const PCRE2_UCHAR left = *str1;
136 |   const PCRE2_UCHAR right = *str2;
137 |   if (left != right) return (left > right)? 1 : -1;
138 |   if (left == '\0') return 0;   /* both strings ended together */
139 |   }
140 | }
141 | 
142 | 
143 | /*************************************************
144 | *    Compare two PCRE2 strings, given a length   *
145 | *************************************************/
146 | 
147 | /* Exactly len code units are examined unless a difference is found first; a
148 | zero code unit does not stop the comparison.
149 | 
150 | Arguments:
151 |   str1        first string
152 |   str2        second string
153 |   len         the number of code units to compare
154 | 
155 | Returns:      0, 1, or -1
156 | */
157 | 
158 | int
159 | PRIV(strncmp)(PCRE2_SPTR str1, PCRE2_SPTR str2, size_t len)
160 | {
161 | size_t i;
162 | for (i = 0; i < len; i++)
163 |   {
164 |   if (str1[i] != str2[i]) return (str1[i] > str2[i])? 1 : -1;
165 |   }
166 | return 0;
167 | }
168 | 
169 | 
170 | /*************************************************
171 | * Compare PCRE2 string to 8-bit string by length *
172 | *************************************************/
173 | 
174 | /* As the 8-bit string is almost always a literal, its type is specified as
175 | const char *. Its characters are converted to PCRE2_UCHAR for the comparison.
176 | 
177 | Arguments:
178 |   str1        first string (PCRE2 code units)
179 |   str2        second string (8-bit)
180 |   len         the number of characters to compare
181 | 
182 | Returns:      0, 1, or -1
183 | */
184 | 
185 | int
186 | PRIV(strncmp_c8)(PCRE2_SPTR str1, const char *str2, size_t len)
187 | {
188 | size_t i;
189 | for (i = 0; i < len; i++)
190 |   {
191 |   const PCRE2_UCHAR left = str1[i];
192 |   const PCRE2_UCHAR right = str2[i];
193 |   if (left != right) return (left > right)? 1 : -1;
194 |   }
195 | return 0;
196 | }
197 | 
198 | 
199 | /*************************************************
200 | *        Find the length of a PCRE2 string       *
201 | *************************************************/
202 | 
203 | /*
204 | Argument:    the zero-terminated string
205 | Returns:     the number of code units before the terminating zero
206 | */
207 | 
208 | PCRE2_SIZE
209 | PRIV(strlen)(PCRE2_SPTR str)
210 | {
211 | PCRE2_SPTR end = str;
212 | while (*end != 0) end++;
213 | return (PCRE2_SIZE)(end - str);
214 | }
215 | 
216 | 
217 | /*************************************************
218 | * Copy 8-bit 0-terminated string to PCRE2 string *
219 | *************************************************/
220 | 
221 | /* Arguments:
222 |   str1     buffer to receive the string, including its trailing zero
223 |   str2     8-bit zero-terminated string to be copied
224 | 
225 | Returns:   the number of code units used (excluding trailing zero)
226 | */
227 | 
228 | PCRE2_SIZE
229 | PRIV(strcpy_c8)(PCRE2_UCHAR *str1, const char *str2)
230 | {
231 | PCRE2_SIZE used = 0;
232 | for (; str2[used] != 0; used++) str1[used] = str2[used];
233 | str1[used] = 0;
234 | return used;
235 | }
236 | 
237 | /* End of pcre2_string_utils.c */
238 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_ucp.h:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language.
  7 | 
  8 |                        Written by Philip Hazel
  9 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 10 |           New API code Copyright (c) 2016-2018 University of Cambridge
 11 | 
 12 | -----------------------------------------------------------------------------
 13 | Redistribution and use in source and binary forms, with or without
 14 | modification, are permitted provided that the following conditions are met:
 15 | 
 16 |     * Redistributions of source code must retain the above copyright notice,
 17 |       this list of conditions and the following disclaimer.
 18 | 
 19 |     * Redistributions in binary form must reproduce the above copyright
 20 |       notice, this list of conditions and the following disclaimer in the
 21 |       documentation and/or other materials provided with the distribution.
 22 | 
 23 |     * Neither the name of the University of Cambridge nor the names of its
 24 |       contributors may be used to endorse or promote products derived from
 25 |       this software without specific prior written permission.
 26 | 
 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 37 | POSSIBILITY OF SUCH DAMAGE.
 38 | -----------------------------------------------------------------------------
 39 | */
 40 | 
 41 | 
 42 | #ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
 43 | #define PCRE2_UCP_H_IDEMPOTENT_GUARD
 44 | 
 45 | /* This file contains definitions of the property values that are returned by
 46 | the UCD access macros. New values that are added for new releases of Unicode
 47 | should always be at the end of each enum, for backwards compatibility.
 48 | 
 49 | IMPORTANT: Note also that the specific numeric values of the enums have to be
 50 | the same as the values that are generated by the maint/MultiStage2.py script,
 51 | where the equivalent property descriptive names are listed in vectors.
 52 | 
 53 | ALSO: The specific values of the first two enums are assumed for the table
 54 | called catposstab in pcre2_compile.c. */
 55 | 
 56 | /* These are the general character categories. */
 57 | 
 58 | enum {
 59 |   ucp_C,     /* 0 Other */
 60 |   ucp_L,     /* 1 Letter */
 61 |   ucp_M,     /* 2 Mark */
 62 |   ucp_N,     /* 3 Number */
 63 |   ucp_P,     /* 4 Punctuation */
 64 |   ucp_S,     /* 5 Symbol */
 65 |   ucp_Z      /* 6 Separator */
 66 | };
 67 | 
 68 | /* These are the particular character categories. */
 69 | 
 70 | enum {
 71 |   ucp_Cc,    /*  0 Control */
 72 |   ucp_Cf,    /*  1 Format */
 73 |   ucp_Cn,    /*  2 Unassigned */
 74 |   ucp_Co,    /*  3 Private use */
 75 |   ucp_Cs,    /*  4 Surrogate */
 76 |   ucp_Ll,    /*  5 Lower case letter */
 77 |   ucp_Lm,    /*  6 Modifier letter */
 78 |   ucp_Lo,    /*  7 Other letter */
 79 |   ucp_Lt,    /*  8 Title case letter */
 80 |   ucp_Lu,    /*  9 Upper case letter */
 81 |   ucp_Mc,    /* 10 Spacing mark */
 82 |   ucp_Me,    /* 11 Enclosing mark */
 83 |   ucp_Mn,    /* 12 Non-spacing mark */
 84 |   ucp_Nd,    /* 13 Decimal number */
 85 |   ucp_Nl,    /* 14 Letter number */
 86 |   ucp_No,    /* 15 Other number */
 87 |   ucp_Pc,    /* 16 Connector punctuation */
 88 |   ucp_Pd,    /* 17 Dash punctuation */
 89 |   ucp_Pe,    /* 18 Close punctuation */
 90 |   ucp_Pf,    /* 19 Final punctuation */
 91 |   ucp_Pi,    /* 20 Initial punctuation */
 92 |   ucp_Po,    /* 21 Other punctuation */
 93 |   ucp_Ps,    /* 22 Open punctuation */
 94 |   ucp_Sc,    /* 23 Currency symbol */
 95 |   ucp_Sk,    /* 24 Modifier symbol */
 96 |   ucp_Sm,    /* 25 Mathematical symbol */
 97 |   ucp_So,    /* 26 Other symbol */
 98 |   ucp_Zl,    /* 27 Line separator */
 99 |   ucp_Zp,    /* 28 Paragraph separator */
100 |   ucp_Zs     /* 29 Space separator */
101 | };
102 | 
103 | /* These are grapheme break properties. The Extended Pictographic property
104 | comes from the emoji-data.txt file. */
105 | 
106 | enum {
107 |   ucp_gbCR,                    /*  0 */
108 |   ucp_gbLF,                    /*  1 */
109 |   ucp_gbControl,               /*  2 */
110 |   ucp_gbExtend,                /*  3 */
111 |   ucp_gbPrepend,               /*  4 */
112 |   ucp_gbSpacingMark,           /*  5 */
113 |   ucp_gbL,                     /*  6 Hangul syllable type L */
114 |   ucp_gbV,                     /*  7 Hangul syllable type V */
115 |   ucp_gbT,                     /*  8 Hangul syllable type T */
116 |   ucp_gbLV,                    /*  9 Hangul syllable type LV */
117 |   ucp_gbLVT,                   /* 10 Hangul syllable type LVT */
118 |   ucp_gbRegionalIndicator,     /* 11 */
119 |   ucp_gbOther,                 /* 12 */
120 |   ucp_gbZWJ,                   /* 13 */
121 |   ucp_gbExtended_Pictographic  /* 14 */
122 | };
123 | 
124 | /* These are the script identifications. */
125 | 
126 | enum {
127 |   ucp_Unknown,
128 |   ucp_Arabic,
129 |   ucp_Armenian,
130 |   ucp_Bengali,
131 |   ucp_Bopomofo,
132 |   ucp_Braille,
133 |   ucp_Buginese,
134 |   ucp_Buhid,
135 |   ucp_Canadian_Aboriginal,
136 |   ucp_Cherokee,
137 |   ucp_Common,
138 |   ucp_Coptic,
139 |   ucp_Cypriot,
140 |   ucp_Cyrillic,
141 |   ucp_Deseret,
142 |   ucp_Devanagari,
143 |   ucp_Ethiopic,
144 |   ucp_Georgian,
145 |   ucp_Glagolitic,
146 |   ucp_Gothic,
147 |   ucp_Greek,
148 |   ucp_Gujarati,
149 |   ucp_Gurmukhi,
150 |   ucp_Han,
151 |   ucp_Hangul,
152 |   ucp_Hanunoo,
153 |   ucp_Hebrew,
154 |   ucp_Hiragana,
155 |   ucp_Inherited,
156 |   ucp_Kannada,
157 |   ucp_Katakana,
158 |   ucp_Kharoshthi,
159 |   ucp_Khmer,
160 |   ucp_Lao,
161 |   ucp_Latin,
162 |   ucp_Limbu,
163 |   ucp_Linear_B,
164 |   ucp_Malayalam,
165 |   ucp_Mongolian,
166 |   ucp_Myanmar,
167 |   ucp_New_Tai_Lue,
168 |   ucp_Ogham,
169 |   ucp_Old_Italic,
170 |   ucp_Old_Persian,
171 |   ucp_Oriya,
172 |   ucp_Osmanya,
173 |   ucp_Runic,
174 |   ucp_Shavian,
175 |   ucp_Sinhala,
176 |   ucp_Syloti_Nagri,
177 |   ucp_Syriac,
178 |   ucp_Tagalog,
179 |   ucp_Tagbanwa,
180 |   ucp_Tai_Le,
181 |   ucp_Tamil,
182 |   ucp_Telugu,
183 |   ucp_Thaana,
184 |   ucp_Thai,
185 |   ucp_Tibetan,
186 |   ucp_Tifinagh,
187 |   ucp_Ugaritic,
188 |   ucp_Yi,
189 |   /* New for Unicode 5.0 */
190 |   ucp_Balinese,
191 |   ucp_Cuneiform,
192 |   ucp_Nko,
193 |   ucp_Phags_Pa,
194 |   ucp_Phoenician,
195 |   /* New for Unicode 5.1 */
196 |   ucp_Carian,
197 |   ucp_Cham,
198 |   ucp_Kayah_Li,
199 |   ucp_Lepcha,
200 |   ucp_Lycian,
201 |   ucp_Lydian,
202 |   ucp_Ol_Chiki,
203 |   ucp_Rejang,
204 |   ucp_Saurashtra,
205 |   ucp_Sundanese,
206 |   ucp_Vai,
207 |   /* New for Unicode 5.2 */
208 |   ucp_Avestan,
209 |   ucp_Bamum,
210 |   ucp_Egyptian_Hieroglyphs,
211 |   ucp_Imperial_Aramaic,
212 |   ucp_Inscriptional_Pahlavi,
213 |   ucp_Inscriptional_Parthian,
214 |   ucp_Javanese,
215 |   ucp_Kaithi,
216 |   ucp_Lisu,
217 |   ucp_Meetei_Mayek,
218 |   ucp_Old_South_Arabian,
219 |   ucp_Old_Turkic,
220 |   ucp_Samaritan,
221 |   ucp_Tai_Tham,
222 |   ucp_Tai_Viet,
223 |   /* New for Unicode 6.0.0 */
224 |   ucp_Batak,
225 |   ucp_Brahmi,
226 |   ucp_Mandaic,
227 |   /* New for Unicode 6.1.0 */
228 |   ucp_Chakma,
229 |   ucp_Meroitic_Cursive,
230 |   ucp_Meroitic_Hieroglyphs,
231 |   ucp_Miao,
232 |   ucp_Sharada,
233 |   ucp_Sora_Sompeng,
234 |   ucp_Takri,
235 |   /* New for Unicode 7.0.0 */
236 |   ucp_Bassa_Vah,
237 |   ucp_Caucasian_Albanian,
238 |   ucp_Duployan,
239 |   ucp_Elbasan,
240 |   ucp_Grantha,
241 |   ucp_Khojki,
242 |   ucp_Khudawadi,
243 |   ucp_Linear_A,
244 |   ucp_Mahajani,
245 |   ucp_Manichaean,
246 |   ucp_Mende_Kikakui,
247 |   ucp_Modi,
248 |   ucp_Mro,
249 |   ucp_Nabataean,
250 |   ucp_Old_North_Arabian,
251 |   ucp_Old_Permic,
252 |   ucp_Pahawh_Hmong,
253 |   ucp_Palmyrene,
254 |   ucp_Psalter_Pahlavi,
255 |   ucp_Pau_Cin_Hau,
256 |   ucp_Siddham,
257 |   ucp_Tirhuta,
258 |   ucp_Warang_Citi,
259 |   /* New for Unicode 8.0.0 */
260 |   ucp_Ahom,
261 |   ucp_Anatolian_Hieroglyphs,
262 |   ucp_Hatran,
263 |   ucp_Multani,
264 |   ucp_Old_Hungarian,
265 |   ucp_SignWriting,
266 |   /* New for Unicode 10.0.0 (no update since 8.0.0) */
267 |   ucp_Adlam,
268 |   ucp_Bhaiksuki,
269 |   ucp_Marchen,
270 |   ucp_Newa,
271 |   ucp_Osage,
272 |   ucp_Tangut,
273 |   ucp_Masaram_Gondi,
274 |   ucp_Nushu,
275 |   ucp_Soyombo,
276 |   ucp_Zanabazar_Square,
277 |   /* New for Unicode 11.0.0 */
278 |   ucp_Dogra,
279 |   ucp_Gunjala_Gondi,
280 |   ucp_Hanifi_Rohingya,
281 |   ucp_Makasar,
282 |   ucp_Medefaidrin,
283 |   ucp_Old_Sogdian,
284 |   ucp_Sogdian,
285 |   /* New for Unicode 12.0.0 */
286 |   ucp_Elymaic,
287 |   ucp_Nandinagari,
288 |   ucp_Nyiakeng_Puachue_Hmong,
289 |   ucp_Wancho,
290 |   /* New for Unicode 13.0.0 */
291 |   ucp_Chorasmian,
292 |   ucp_Dives_Akuru,
293 |   ucp_Khitan_Small_Script,
294 |   ucp_Yezidi
295 | };
296 | 
297 | #endif  /* PCRE2_UCP_H_IDEMPOTENT_GUARD */
298 | 
299 | /* End of pcre2_ucp.h */
300 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2_xclass.c:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language.
  7 | 
  8 |                        Written by Philip Hazel
  9 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 10 |           New API code Copyright (c) 2016-2019 University of Cambridge
 11 | 
 12 | -----------------------------------------------------------------------------
 13 | Redistribution and use in source and binary forms, with or without
 14 | modification, are permitted provided that the following conditions are met:
 15 | 
 16 |     * Redistributions of source code must retain the above copyright notice,
 17 |       this list of conditions and the following disclaimer.
 18 | 
 19 |     * Redistributions in binary form must reproduce the above copyright
 20 |       notice, this list of conditions and the following disclaimer in the
 21 |       documentation and/or other materials provided with the distribution.
 22 | 
 23 |     * Neither the name of the University of Cambridge nor the names of its
 24 |       contributors may be used to endorse or promote products derived from
 25 |       this software without specific prior written permission.
 26 | 
 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 37 | POSSIBILITY OF SUCH DAMAGE.
 38 | -----------------------------------------------------------------------------
 39 | */
 40 | 
 41 | /* This module contains an internal function that is used to match an extended
 42 | class. It is used by pcre2_auto_possessify() and by both bundled_pcre2_match() and
 43 | pcre2_dfa_match(). */
 44 | 
 45 | 
 46 | #ifdef HAVE_CONFIG_H
 47 | #include "config.h"
 48 | #endif
 49 | 
 50 | 
 51 | #include "pcre2_internal.h"
 52 | 
 53 | /*************************************************
 54 | *       Match character against an XCLASS        *
 55 | *************************************************/
 56 | 
 57 | /* This function is called to match a character against an extended class that
 58 | might contain codepoints above 255 and/or Unicode properties.
 59 | 
 60 | Arguments:
 61 |   c           the character (code point) to be tested
 62 |   data        points to the flag code unit of the XCLASS data
 63 |   utf         TRUE if in UTF mode (forced to TRUE in 8-bit builds)
 64 | 
 65 | Returns:      TRUE if character matches, else FALSE
 66 | */
 67 | 
 68 | BOOL
 69 | PRIV(xclass)(uint32_t c, PCRE2_SPTR data, BOOL utf)
 70 | {
 71 | PCRE2_UCHAR t;   /* type of the current item in the XCLASS data */
 72 | BOOL negated = (*data & XCL_NOT) != 0;   /* result when nothing matches */
 73 | 
 74 | #if PCRE2_CODE_UNIT_WIDTH == 8
 75 | /* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
 76 | utf = TRUE;
 77 | #endif
 78 | 
 79 | /* Code points < 256 are matched against a bitmap, if one is present. If not,
 80 | we still carry on, because there may be ranges that start below 256 in the
 81 | additional data. */
 82 | 
 83 | if (c < 256)
 84 |   {
 85 |   if ((*data & XCL_HASPROP) == 0)   /* decided here: both paths return */
 86 |     {
 87 |     if ((*data & XCL_MAP) == 0) return negated;   /* no bitmap to consult */
 88 |     return (((uint8_t *)(data + 1))[c/8] & (1u << (c&7))) != 0;
 89 |     }
 90 |   if ((*data & XCL_MAP) != 0 &&
 91 |     (((uint8_t *)(data + 1))[c/8] & (1u << (c&7))) != 0)
 92 |     return !negated; /* char found */
 93 |   }
 94 | 
 95 | /* First skip the bit map if present. Then match against the list of Unicode
 96 | properties or large chars or ranges that end with a large char. We won't ever
 97 | encounter XCL_PROP or XCL_NOTPROP when UTF support is not compiled. */
 98 | 
 99 | if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(PCRE2_UCHAR);
100 | 
101 | while ((t = *data++) != XCL_END)
102 |   {
103 |   uint32_t x, y;
104 |   if (t == XCL_SINGLE)   /* one code point follows */
105 |     {
106 | #ifdef SUPPORT_UNICODE
107 |     if (utf)
108 |       {
109 |       GETCHARINC(x, data); /* macro generates multiple statements */
110 |       }
111 |     else
112 | #endif
113 |     x = *data++;
114 |     if (c == x) return !negated;
115 |     }
116 |   else if (t == XCL_RANGE)   /* two code points follow: inclusive range */
117 |     {
118 | #ifdef SUPPORT_UNICODE
119 |     if (utf)
120 |       {
121 |       GETCHARINC(x, data); /* macro generates multiple statements */
122 |       GETCHARINC(y, data); /* macro generates multiple statements */
123 |       }
124 |     else
125 | #endif
126 |       {
127 |       x = *data++;
128 |       y = *data++;
129 |       }
130 |     if (c >= x && c <= y) return !negated;
131 |     }
132 | 
133 | #ifdef SUPPORT_UNICODE
134 |   else  /* XCL_PROP & XCL_NOTPROP */
135 |     {
136 |     const ucd_record *prop = GET_UCD(c);
137 |     BOOL isprop = t == XCL_PROP;   /* FALSE means XCL_NOTPROP */
138 | 
139 |     switch(*data)   /* property type; data[1] is its value if used */
140 |       {
141 |       case PT_ANY:
142 |       if (isprop) return !negated;
143 |       break;
144 | 
145 |       case PT_LAMP:
146 |       if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
147 |            prop->chartype == ucp_Lt) == isprop) return !negated;
148 |       break;
149 | 
150 |       case PT_GC:
151 |       if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
152 |         return !negated;
153 |       break;
154 | 
155 |       case PT_PC:
156 |       if ((data[1] == prop->chartype) == isprop) return !negated;
157 |       break;
158 | 
159 |       case PT_SC:
160 |       if ((data[1] == prop->script) == isprop) return !negated;
161 |       break;
162 | 
163 |       case PT_ALNUM:
164 |       if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
165 |            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop)
166 |         return !negated;
167 |       break;
168 | 
169 |       /* Perl space used to exclude VT, but from Perl 5.18 it is included,
170 |       which means that Perl space and POSIX space are now identical. PCRE
171 |       was changed at release 8.34. */
172 | 
173 |       case PT_SPACE:    /* Perl space */
174 |       case PT_PXSPACE:  /* POSIX space */
175 |       switch(c)
176 |         {
177 |         HSPACE_CASES:
178 |         VSPACE_CASES:
179 |         if (isprop) return !negated;
180 |         break;
181 | 
182 |         default:
183 |         if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
184 |           return !negated;
185 |         break;
186 |         }
187 |       break;
188 | 
189 |       case PT_WORD:
190 |       if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
191 |            PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
192 |              == isprop)
193 |         return !negated;
194 |       break;
195 | 
196 |       case PT_UCNC:
197 |       if (c < 0xa0)
198 |         {
199 |         if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
200 |              c == CHAR_GRAVE_ACCENT) == isprop)
201 |           return !negated;
202 |         }
203 |       else
204 |         {
205 |         if ((c < 0xd800 || c > 0xdfff) == isprop)
206 |           return !negated;
207 |         }
208 |       break;
209 | 
210 |       /* The following three properties can occur only in an XCLASS, as there
211 |       is no \p or \P coding for them. */
212 | 
213 |       /* Graphic character. Implement this as not Z (space or separator) and
214 |       not C (other), except for Cf (format) with a few exceptions. This seems
215 |       to be what Perl does. The exceptional characters are:
216 | 
217 |       U+061C           Arabic Letter Mark
218 |       U+180E           Mongolian Vowel Separator
219 |       U+2066 - U+2069  Various "isolate"s
220 |       */
221 | 
222 |       case PT_PXGRAPH:
223 |       if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z &&
224 |             (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
225 |               (prop->chartype == ucp_Cf &&
226 |                 c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
227 |          )) == isprop)
228 |         return !negated;
229 |       break;
230 | 
231 |       /* Printable character: same as graphic, with the addition of Zs, i.e.
232 |       not Zl and not Zp, and U+180E. */
233 | 
234 |       case PT_PXPRINT:
235 |       if ((prop->chartype != ucp_Zl &&
236 |            prop->chartype != ucp_Zp &&
237 |             (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
238 |               (prop->chartype == ucp_Cf &&
239 |                 c != 0x061c && (c < 0x2066 || c > 0x2069))
240 |          )) == isprop)
241 |         return !negated;
242 |       break;
243 | 
244 |       /* Punctuation: all Unicode punctuation, plus ASCII characters that
245 |       Unicode treats as symbols rather than punctuation, for Perl
246 |       compatibility (these are $+<=>^`|~). */
247 | 
248 |       case PT_PXPUNCT:
249 |       if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
250 |             (c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
251 |         return !negated;
252 |       break;
253 | 
254 |       /* This should never occur, but compilers may mutter if there is no
255 |       default. */
256 | 
257 |       default:
258 |       return FALSE;
259 |       }
260 | 
261 |     data += 2;   /* skip the property type and value units */
262 |     }
263 | #else
264 |   (void)utf;  /* Avoid compiler warning */
265 | #endif  /* SUPPORT_UNICODE */
266 |   }
267 | 
268 | return negated;   /* char did not match */
269 | }
270 | 
271 | /* End of pcre2_xclass.c */
272 | 


--------------------------------------------------------------------------------
/src/PCRE2/pcre2posix.h:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 | *      Perl-Compatible Regular Expressions       *
  3 | *************************************************/
  4 | 
  5 | /* PCRE2 is a library of functions to support regular expressions whose syntax
  6 | and semantics are as close as possible to those of the Perl 5 language. This is
  7 | the public header file to be #included by applications that call PCRE2 via the
  8 | POSIX wrapper interface.
  9 | 
 10 |                        Written by Philip Hazel
 11 |      Original API code Copyright (c) 1997-2012 University of Cambridge
 12 |           New API code Copyright (c) 2016-2019 University of Cambridge
 13 | 
 14 | -----------------------------------------------------------------------------
 15 | Redistribution and use in source and binary forms, with or without
 16 | modification, are permitted provided that the following conditions are met:
 17 | 
 18 |     * Redistributions of source code must retain the above copyright notice,
 19 |       this list of conditions and the following disclaimer.
 20 | 
 21 |     * Redistributions in binary form must reproduce the above copyright
 22 |       notice, this list of conditions and the following disclaimer in the
 23 |       documentation and/or other materials provided with the distribution.
 24 | 
 25 |     * Neither the name of the University of Cambridge nor the names of its
 26 |       contributors may be used to endorse or promote products derived from
 27 |       this software without specific prior written permission.
 28 | 
 29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 30 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 31 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 32 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 33 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 34 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 35 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 36 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 37 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 38 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 39 | POSSIBILITY OF SUCH DAMAGE.
 40 | -----------------------------------------------------------------------------
 41 | */
 42 | 
 43 | 
 44 | /* Have to include stdlib.h in order to ensure that size_t is defined. */
 45 | 
 46 | #include <stdlib.h>
 47 | 
 48 | /* Allow for C++ users */
 49 | 
 50 | #ifdef __cplusplus
 51 | extern "C" {
 52 | #endif
 53 | 
 54 | /* Options, mostly defined by POSIX, but with some extras. */
 55 | 
 56 | #define REG_ICASE     0x0001  /* Maps to PCRE2_CASELESS */
 57 | #define REG_NEWLINE   0x0002  /* Maps to PCRE2_MULTILINE */
 58 | #define REG_NOTBOL    0x0004  /* Maps to PCRE2_NOTBOL */
 59 | #define REG_NOTEOL    0x0008  /* Maps to PCRE2_NOTEOL */
 60 | #define REG_DOTALL    0x0010  /* NOT defined by POSIX; maps to PCRE2_DOTALL */
 61 | #define REG_NOSUB     0x0020  /* Do not report what was matched */
 62 | #define REG_UTF       0x0040  /* NOT defined by POSIX; maps to PCRE2_UTF */
 63 | #define REG_STARTEND  0x0080  /* BSD feature: pass subject string by so,eo */
 64 | #define REG_NOTEMPTY  0x0100  /* NOT defined by POSIX; maps to PCRE2_NOTEMPTY */
 65 | #define REG_UNGREEDY  0x0200  /* NOT defined by POSIX; maps to PCRE2_UNGREEDY */
 66 | #define REG_UCP       0x0400  /* NOT defined by POSIX; maps to PCRE2_UCP */
 67 | #define REG_PEND      0x0800  /* GNU feature: pass end pattern by re_endp */
 68 | #define REG_NOSPEC    0x1000  /* Maps to PCRE2_LITERAL */
 69 | 
 70 | /* This is not used by PCRE2, but by defining it we make it easier
 71 | to slot PCRE2 into existing programs that make POSIX calls. */
 72 | 
 73 | #define REG_EXTENDED  0
 74 | 
 75 | /* Error values. Not all these are relevant or used by the wrapper. */
 76 | 
 77 | enum {
 78 |   REG_ASSERT = 1,  /* internal error ? */
 79 |   REG_BADBR,       /* invalid repeat counts in {} */
 80 |   REG_BADPAT,      /* pattern error */
 81 |   REG_BADRPT,      /* ? * + invalid */
 82 |   REG_EBRACE,      /* unbalanced {} */
 83 |   REG_EBRACK,      /* unbalanced [] */
 84 |   REG_ECOLLATE,    /* collation error - not relevant */
 85 |   REG_ECTYPE,      /* bad class */
 86 |   REG_EESCAPE,     /* bad escape sequence */
 87 |   REG_EMPTY,       /* empty expression */
 88 |   REG_EPAREN,      /* unbalanced () */
 89 |   REG_ERANGE,      /* bad range inside [] */
 90 |   REG_ESIZE,       /* expression too big */
 91 |   REG_ESPACE,      /* failed to get memory */
 92 |   REG_ESUBREG,     /* bad back reference */
 93 |   REG_INVARG,      /* bad argument */
 94 |   REG_NOMATCH      /* match failed */
 95 | };
 96 | 
 97 | 
 98 | /* The structure representing a compiled regular expression. It is also used
 99 | for passing the pattern end pointer when REG_PEND is set. */
100 | 
101 | typedef struct {
102 |   void *re_pcre2_code;    /* opaque; presumably the compiled pattern - TODO confirm */
103 |   void *re_match_data;    /* opaque; presumably PCRE2 match data - TODO confirm */
104 |   const char *re_endp;    /* pattern end pointer, used when REG_PEND is set */
105 |   size_t re_nsub;         /* presumably the capturing subpattern count - TODO confirm */
106 |   size_t re_erroffset;    /* presumably the offset of a pattern error - TODO confirm */
107 |   int re_cflags;          /* presumably the REG_xxx compile options - TODO confirm */
108 | } regex_t;
109 | 
110 | /* The structure in which a captured offset is returned. */
111 | 
112 | typedef int regoff_t;
113 | 
114 | typedef struct {
115 |   regoff_t rm_so;   /* presumably the start offset of the match - TODO confirm */
116 |   regoff_t rm_eo;   /* presumably the end offset of the match - TODO confirm */
117 | } regmatch_t;
118 | 
119 | /* When an application links to a PCRE2 DLL in Windows, the symbols that are
120 | imported have to be identified as such. When building PCRE2, the appropriate
121 | export settings are needed, and are set in pcre2posix.c before including this
122 | file. */
123 | 
124 | #if defined(_WIN32) && !defined(PCRE2_STATIC) && !defined(PCRE2POSIX_EXP_DECL)
125 | #  define PCRE2POSIX_EXP_DECL  extern // __declspec(dllimport)
126 | #  define PCRE2POSIX_EXP_DEFN  // __declspec(dllimport)
127 | #endif
128 | 
129 | /* By default, we use the standard "extern" declarations. */
130 | 
131 | #ifndef PCRE2POSIX_EXP_DECL
132 | #  ifdef __cplusplus
133 | #    define PCRE2POSIX_EXP_DECL  extern "C"
134 | #    define PCRE2POSIX_EXP_DEFN  extern "C"
135 | #  else
136 | #    define PCRE2POSIX_EXP_DECL  extern
137 | #    define PCRE2POSIX_EXP_DEFN  extern
138 | #  endif
139 | #endif
140 | 
141 | /* The functions. The actual code is in functions with pcre2_xxx names for
142 | uniqueness. POSIX names are provided as macros for API compatibility with POSIX
143 | regex functions. It's done this way to ensure that they are always linked from
144 | the PCRE2 library and not by accident from elsewhere (regex_t differs in size
145 | elsewhere). */
146 | 
147 | PCRE2POSIX_EXP_DECL int pcre2_regcomp(regex_t *, const char *, int);
148 | PCRE2POSIX_EXP_DECL int pcre2_regexec(const regex_t *, const char *, size_t,
149 |                      regmatch_t *, int);
150 | PCRE2POSIX_EXP_DECL size_t pcre2_regerror(int, const regex_t *, char *, size_t);
151 | PCRE2POSIX_EXP_DECL void pcre2_regfree(regex_t *);
152 | 
153 | #define regcomp  pcre2_regcomp
154 | #define regexec  pcre2_regexec
155 | #define regerror pcre2_regerror
156 | #define regfree  pcre2_regfree
157 | 
158 | /* Debian had a patch that used different names. These are now here to save
159 | them having to maintain their own patch, but are not documented by PCRE2. */
160 | 
161 | #define PCRE2regcomp  pcre2_regcomp
162 | #define PCRE2regexec  pcre2_regexec
163 | #define PCRE2regerror pcre2_regerror
164 | #define PCRE2regfree  pcre2_regfree
165 | 
166 | #ifdef __cplusplus
167 | }   /* extern "C" */
168 | #endif
169 | 
170 | /* End of pcre2posix.h */
171 | 


--------------------------------------------------------------------------------
/src/PCRE2/sljit/sljitConfig.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *    Stack-less Just-In-Time compiler
  3 |  *
  4 |  *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
  5 |  *
  6 |  * Redistribution and use in source and binary forms, with or without modification, are
  7 |  * permitted provided that the following conditions are met:
  8 |  *
  9 |  *   1. Redistributions of source code must retain the above copyright notice, this list of
 10 |  *      conditions and the following disclaimer.
 11 |  *
 12 |  *   2. Redistributions in binary form must reproduce the above copyright notice, this list
 13 |  *      of conditions and the following disclaimer in the documentation and/or other materials
 14 |  *      provided with the distribution.
 15 |  *
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
 17 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 18 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
 19 |  * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 20 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 21 |  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 22 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 23 |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 24 |  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  */
 26 | 
 27 | #ifndef _SLJIT_CONFIG_H_
 28 | #define _SLJIT_CONFIG_H_
 29 | 
 30 | #ifdef __cplusplus
 31 | extern "C" {
 32 | #endif
 33 | 
 34 | /* --------------------------------------------------------------------- */
 35 | /*  Custom defines                                                       */
 36 | /* --------------------------------------------------------------------- */
 37 | 
 38 | /* Put your custom defines here. This empty section will never change
 39 |    which helps with maintaining patches (with diff / patch utilities). */
 40 | 
 41 | /* --------------------------------------------------------------------- */
 42 | /*  Architecture                                                         */
 43 | /* --------------------------------------------------------------------- */
 44 | 
 45 | /* Architecture selection. */
 46 | /* #define SLJIT_CONFIG_X86_32 1 */
 47 | /* #define SLJIT_CONFIG_X86_64 1 */
 48 | /* #define SLJIT_CONFIG_ARM_V5 1 */
 49 | /* #define SLJIT_CONFIG_ARM_V7 1 */
 50 | /* #define SLJIT_CONFIG_ARM_THUMB2 1 */
 51 | /* #define SLJIT_CONFIG_ARM_64 1 */
 52 | /* #define SLJIT_CONFIG_PPC_32 1 */
 53 | /* #define SLJIT_CONFIG_PPC_64 1 */
 54 | /* #define SLJIT_CONFIG_MIPS_32 1 */
 55 | /* #define SLJIT_CONFIG_MIPS_64 1 */
 56 | /* #define SLJIT_CONFIG_SPARC_32 1 */
 57 | /* #define SLJIT_CONFIG_TILEGX 1 */
 58 | 
 59 | /* #define SLJIT_CONFIG_AUTO 1 */
 60 | /* #define SLJIT_CONFIG_UNSUPPORTED 1 */
 61 | 
 62 | /* --------------------------------------------------------------------- */
 63 | /*  Utilities                                                            */
 64 | /* --------------------------------------------------------------------- */
 65 | 
 66 | /* Useful for thread-safe compiling of global functions. */
 67 | #ifndef SLJIT_UTIL_GLOBAL_LOCK
 68 | /* Enabled by default */
 69 | #define SLJIT_UTIL_GLOBAL_LOCK 1
 70 | #endif
 71 | 
 72 | /* Implements a stack like data structure (by using mmap / VirtualAlloc  */
 73 | /* or a custom allocator). */
 74 | #ifndef SLJIT_UTIL_STACK
 75 | /* Enabled by default */
 76 | #define SLJIT_UTIL_STACK 1
 77 | #endif
 78 | 
 79 | /* Uses user provided allocator to allocate the stack (see SLJIT_UTIL_STACK) */
 80 | #ifndef SLJIT_UTIL_SIMPLE_STACK_ALLOCATION
 81 | /* Disabled by default */
 82 | #define SLJIT_UTIL_SIMPLE_STACK_ALLOCATION 0
 83 | #endif
 84 | 
 85 | /* Single threaded application. Does not require any locks. */
 86 | #ifndef SLJIT_SINGLE_THREADED
 87 | /* Disabled by default. */
 88 | #define SLJIT_SINGLE_THREADED 0
 89 | #endif
 90 | 
 91 | /* --------------------------------------------------------------------- */
 92 | /*  Configuration                                                        */
 93 | /* --------------------------------------------------------------------- */
 94 | 
 95 | /* If SLJIT_STD_MACROS_DEFINED is not defined, the application should
 96 |    define SLJIT_MALLOC, SLJIT_FREE, SLJIT_MEMCPY, and NULL. */
 97 | #ifndef SLJIT_STD_MACROS_DEFINED
 98 | /* Disabled by default. */
 99 | #define SLJIT_STD_MACROS_DEFINED 0
100 | #endif
101 | 
102 | /* Executable code allocation:
103 |    If SLJIT_EXECUTABLE_ALLOCATOR is not defined, the application should
104 |    define SLJIT_MALLOC_EXEC, SLJIT_FREE_EXEC, and SLJIT_EXEC_OFFSET. */
105 | #ifndef SLJIT_EXECUTABLE_ALLOCATOR
106 | /* Enabled by default. */
107 | #define SLJIT_EXECUTABLE_ALLOCATOR 1
108 | 
109 | /* When SLJIT_PROT_EXECUTABLE_ALLOCATOR is enabled SLJIT uses
110 |    an allocator which does not set writable and executable
111 |    permission flags at the same time. The trade-off is increased
112 |    memory consumption and disabled dynamic code modifications. */
113 | #ifndef SLJIT_PROT_EXECUTABLE_ALLOCATOR
114 | /* Disabled by default. */
115 | #define SLJIT_PROT_EXECUTABLE_ALLOCATOR 0
116 | #endif
117 | 
118 | #endif
119 | 
120 | /* Force cdecl calling convention even if a better calling
121 |    convention (e.g. fastcall) is supported by the C compiler.
122 |    If this option is disabled (this is the default), functions
123 |    called from JIT should be defined with SLJIT_FUNC attribute.
124 |    Standard C functions can still be called by using the
125 |    SLJIT_CALL_CDECL jump type. */
126 | #ifndef SLJIT_USE_CDECL_CALLING_CONVENTION
127 | /* Disabled by default */
128 | #define SLJIT_USE_CDECL_CALLING_CONVENTION 0
129 | #endif
130 | 
131 | /* Return with error when an invalid argument is passed. */
132 | #ifndef SLJIT_ARGUMENT_CHECKS
133 | /* Disabled by default */
134 | #define SLJIT_ARGUMENT_CHECKS 0
135 | #endif
136 | 
137 | /* Debug checks (assertions, etc.). */
138 | #ifndef SLJIT_DEBUG
139 | /* Enabled by default */
140 | #define SLJIT_DEBUG 1
141 | #endif
142 | 
143 | /* Verbose operations. */
144 | #ifndef SLJIT_VERBOSE
145 | /* Enabled by default */
146 | #define SLJIT_VERBOSE 1
147 | #endif
148 | 
149 | /*
150 |   SLJIT_IS_FPU_AVAILABLE
151 |     The availability of the FPU can be controlled by SLJIT_IS_FPU_AVAILABLE.
152 |       zero value - FPU is NOT present.
153 |       nonzero value - FPU is present.
154 | */
155 | 
156 | /* For further configurations, see the beginning of sljitConfigInternal.h */
157 | 
158 | #ifdef __cplusplus
159 | } /* extern "C" */
160 | #endif
161 | 
162 | #endif
163 | 


--------------------------------------------------------------------------------
/src/sf_disabled.h:
--------------------------------------------------------------------------------
 1 | #ifndef SF_DISABLED_H
 2 | #define SF_DISABLED_H
 3 | // Placeholder definitions whose bodies only report that ALTREP is not supported in R < 3.5.
 4 | #define NO_ALTREP_SUPPORT() throw std::runtime_error("ALTREP not supported in R < 3.5")
 5 | void init_stringfish(DllInfo* dll) {(void)0;} // no op; the init attribute still gets read in sf_altrep.h.
 6 | void sf_export_functions(DllInfo* dll) {(void)0;} // no op as well
 7 | // Each stub below ignores its arguments and throws std::runtime_error via NO_ALTREP_SUPPORT().
 8 | std::string get_string_type(SEXP x) {NO_ALTREP_SUPPORT();}
 9 | SEXP materialize(SEXP x) {NO_ALTREP_SUPPORT();}
10 | SEXP sf_vector(size_t len) {NO_ALTREP_SUPPORT();}
11 | void sf_assign(SEXP x, size_t i, SEXP e) {NO_ALTREP_SUPPORT();}
12 | SEXP sf_iconv(SEXP x, const std::string from, const std::string to, int nthreads=1) {NO_ALTREP_SUPPORT();}
13 | SEXP convert_to_sf(SEXP x) {NO_ALTREP_SUPPORT();}
14 | IntegerVector sf_nchar(SEXP x, const std::string type = "chars", const int nthreads = 1) {NO_ALTREP_SUPPORT();}
15 | SEXP sf_substr(SEXP x, IntegerVector start, IntegerVector stop, const int nthreads = 1) {NO_ALTREP_SUPPORT();}
16 | SEXP c_sf_paste(List dots, SEXP sep, const int nthreads = 1) {NO_ALTREP_SUPPORT();}
17 | SEXP sf_collapse(SEXP x, SEXP collapse) {NO_ALTREP_SUPPORT();}
18 | SEXP sf_readLines(const std::string file, const std::string encoding = "UTF-8") {NO_ALTREP_SUPPORT();}
19 | void sf_writeLines(SEXP text, const std::string file, const std::string sep = "\n", const std::string na_value = "NA", const std::string encode_mode = "UTF-8") {NO_ALTREP_SUPPORT();}
20 | LogicalVector sf_grepl(SEXP subject, SEXP pattern, const std::string encode_mode = "auto", const bool fixed = false ,const int nthreads = 1) {NO_ALTREP_SUPPORT();}
21 | SEXP sf_split(SEXP subject, SEXP split, const std::string encode_mode = "auto", const bool fixed = false, const int nthreads = 1) {NO_ALTREP_SUPPORT();}
22 | SEXP sf_gsub(SEXP subject, SEXP pattern, SEXP replacement, const std::string encode_mode = "auto", const bool fixed = false, const int nthreads = 1) {NO_ALTREP_SUPPORT();}
23 | // Returns a plain character vector of N strings, each string_size characters picked from charset; warns when "stringfish" mode is requested.
24 | SEXP random_strings(const int N, const int string_size = 50, std::string charset = "abcdefghijklmnopqrstuvwxyz",
25 |                     std::string vector_mode = "stringfish") {
26 |   if(vector_mode == "stringfish") Rcpp::warning("ALTREP not supported in R < 3.5");
27 |   CharacterVector result(N);
28 |   std::string buffer; // scratch string, overwritten for every element
29 |   buffer.resize(string_size);
30 |   for(int row = 0; row < N; ++row) {
31 |     const std::vector<int> picks = Rcpp::as< std::vector<int> >(Rcpp::sample(charset.size(), string_size, true, R_NilValue, false)); // one charset index per output character
32 |     for(int col = 0; col < string_size; ++col) buffer[col] = charset[picks[col]];
33 |     result[row] = buffer;
34 |   }
35 |   return result;
36 | }
37 | SEXP sf_tolower(SEXP x) {NO_ALTREP_SUPPORT();} // this and the stubs below only throw via NO_ALTREP_SUPPORT()
38 | SEXP sf_toupper(SEXP x) {NO_ALTREP_SUPPORT();}
39 | IntegerVector sf_match(SEXP x, SEXP table, const int nthreads = 1) {NO_ALTREP_SUPPORT();}
40 | LogicalVector sf_compare(SEXP x, SEXP y, const int nthreads = 1) {NO_ALTREP_SUPPORT();}
41 | SEXP c_sf_concat(SEXP x) {NO_ALTREP_SUPPORT();}
42 | 
43 | #endif


--------------------------------------------------------------------------------
/src/xxhash/xxhash.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * xxHash - Extremely Fast Hash algorithm
 3 |  * Copyright (C) 2012-2020 Yann Collet
 4 |  *
 5 |  * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
 6 |  *
 7 |  * Redistribution and use in source and binary forms, with or without
 8 |  * modification, are permitted provided that the following conditions are
 9 |  * met:
10 |  *
11 |  *    * Redistributions of source code must retain the above copyright
12 |  *      notice, this list of conditions and the following disclaimer.
13 |  *    * Redistributions in binary form must reproduce the above
14 |  *      copyright notice, this list of conditions and the following disclaimer
15 |  *      in the documentation and/or other materials provided with the
16 |  *      distribution.
17 |  *
18 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 |  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 |  *
30 |  * You can contact the author at:
31 |  *   - xxHash homepage: https://www.xxhash.com
32 |  *   - xxHash source repository: https://github.com/Cyan4973/xxHash
33 |  */
34 | 
35 | 
36 | /*
37 |  * xxhash.c instantiates functions defined in xxhash.h
38 |  */
39 | 
40 | #define XXH_STATIC_LINKING_ONLY   /* access advanced declarations */
41 | #define XXH_IMPLEMENTATION   /* access definitions */
42 | 
43 | #include "xxhash.h"
44 | 


--------------------------------------------------------------------------------
/tests/tests.cpp:
--------------------------------------------------------------------------------
 1 | // [[Rcpp::plugins(cpp11)]]
 2 | // [[Rcpp::depends(stringfish)]]
 3 | #include <Rcpp.h>
 4 | #include "sf_external.h"
 5 | using namespace Rcpp;
 6 | 
 7 | // Alternate the case of ASCII letters within each word of every string in x,
 8 | // e.g. "hello world" -> "hElLo wOrLd". Bytes other than A-Z / a-z are copied
 9 | // unchanged, a space restarts the pattern, and NA elements stay NA.
10 | // Returns a new stringfish vector with the same length as x.
11 | // [[Rcpp::export]]
12 | SEXP sf_alternate_case(SEXP x) {
13 |   // Iterate through a character vector using the RStringIndexer class
14 |   // If the input vector x is a stringfish character vector it will do so without materialization
15 |   RStringIndexer r(x);
16 |   size_t len = r.size();
17 |   // Create an output stringfish vector
18 |   // Like all R objects, it must be protected from garbage collection
19 |   SEXP output = PROTECT(sf_vector(len));
20 |   // Obtain a reference to the underlying output data
21 |   sf_vec_data & output_data = sf_vec_data_ref(output);
22 |   
23 |   // You can use range based for loop via an iterator class that returns RStringIndexer::rstring_info e
24 |   // rstring info is a struct containing const char * ptr (null terminated), int len, and cetype_t enc
25 |   // a NA string is represented by a nullptr
26 |   // Alternatively, access the data via the function r.getCharLenCE(i)
27 |   size_t i = 0;
28 |   for(auto e : r) {
29 |     // NA input: store an explicit NA rather than relying on the slot's initial value
30 |     if(e.ptr == nullptr) {
31 |       output_data[i++] = sfstring(NA_STRING);
32 |       continue;
33 |     }
34 |     // create a temporary output string and process the results
35 |     std::string temp(e.len, '\0');
36 |     bool case_switch = false; // toggled by every ASCII letter, reset by a space
37 |     for(int j=0; j<e.len; j++) {
38 |       const char c = e.ptr[j];
39 |       if((c >= 'A') && (c <= 'Z')) { // char j is upper case
40 |         if((case_switch = !case_switch)) { // check if we should convert to lower case
41 |           temp[j] = c + ('a' - 'A');
42 |           continue;
43 |         }
44 |       } else if((c >= 'a') && (c <= 'z')) { // char j is lower case
45 |         if(!(case_switch = !case_switch)) { // check if we should convert to upper case
46 |           temp[j] = c - ('a' - 'A');
47 |           continue;
48 |         }
49 |       } else if(c == ' ') {
50 |         case_switch = false;
51 |       }
52 |       temp[j] = c;
53 |     }
54 |     // Create a new vector element sfstring and insert the processed string into the stringfish vector
55 |     // sfstring has three constructors, 1) taking a std::string and encoding,
56 |     // 2) a char pointer and encoding, or 3) a CHARSXP object (e.g. sfstring(NA_STRING))
57 |     output_data[i++] = sfstring(temp, e.enc);
58 |   }
59 |   UNPROTECT(1); // balance the PROTECT above before returning
60 |   return output;
61 | }
62 | 


--------------------------------------------------------------------------------
/vignettes/bench_v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/traversc/stringfish/b943b1c74b1350e6ec85a92dddfbad50eb8cca8f/vignettes/bench_v2.png


--------------------------------------------------------------------------------
/vignettes/vignette.rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "stringfish"
  3 | output:
  4 |   html_vignette:
  5 |     keep_md: no
  6 |   rmarkdown::github_document: default
  7 | vignette: >
  8 |   %\VignetteIndexEntry{stringfish}
  9 |   \usepackage[utf8]{inputenc}
 10 |   %\VignetteEngine{knitr::rmarkdown}
 11 | ---
 12 | 
 13 | ```{r, setup, echo=FALSE}
 14 | IS_GITHUB <- Sys.getenv("IS_GITHUB") != ""
 15 | ```
 16 | 
 17 | ```{r results='asis', echo=FALSE, eval=IS_GITHUB}
 18 | cat('
 19 | [![R-CMD-check](https://github.com/traversc/stringfish/workflows/R-CMD-check/badge.svg)](https://github.com/traversc/stringfish/actions)
 20 | [![CRAN-Status-Badge](https://www.r-pkg.org/badges/version/stringfish)](https://cran.r-project.org/package=stringfish)
 21 | [![CRAN-Downloads-Badge](https://cranlogs.r-pkg.org/badges/stringfish)](https://cran.r-project.org/package=stringfish)
 22 | [![CRAN-Downloads-Total-Badge](https://cranlogs.r-pkg.org/badges/grand-total/stringfish)](https://cran.r-project.org/package=stringfish)
 23 | ')
 24 | ```
 25 | 
 26 | `stringfish` is a framework for performing string and sequence operations using the ALTREP system to speed up the computation of common string operations. 
 27 | 
 28 | The ultimate goal of the package is to unify ALTREP string implementations under a common framework. 
 29 | 
 30 | The ALTREP system (new as of R 3.5.0) allows package developers to represent R objects using their own custom memory layout, completely invisible to the user. `stringfish` represents string data as a simple C++/STL vector, which is very fast and lightweight. 
 31 | 
 32 | Using normal R functions to process string data (e.g. `substr`, `gsub`, `paste`, etc.) causes "materialization" of ALTREP vectors to normal R data, which can be a slow process. Therefore, in order to take full advantage of the ALTREP framework, string processing functions need to be re-written to be ALTREP aware. This package hopes to fulfill that purpose. 
 33 | 
 34 | ## Installation
 35 | ```{r eval=FALSE}
 36 | install.packages("stringfish", type="source", configure.args="--with-simd=AVX2")
 37 | ```
 38 | 
 39 | ## Benchmark
 40 | 
 41 | The simplest way to show the utility of the ALTREP framework is through a quick benchmark comparing `stringfish` and base R. 
 42 | 
 43 | ```{r echo=FALSE, results='asis'}
 44 | if(IS_GITHUB) {
 45 |   cat('![](vignettes/bench_v2.png "bench_v2"){width=576px}')
 46 | } else {
 47 |   cat('![](bench_v2.png "bench_v2"){width=576px}')
 48 | }
 49 | ```
 50 | 
 51 | 
 52 | Yes, you are reading the graph correctly: some functions in `stringfish` are more than an order of magnitude faster than vectorized base R operations (and even faster with some built-in multithreading). On large text datasets, this can turn minutes of computation into seconds. 
 53 | 
 54 | ## Currently implemented functions
 55 | 
 56 | A list of implemented `stringfish` functions and analogous base R functions:
 57 | 
 58 | * `sf_iconv` (`iconv`)
 59 | * `sf_nchar` (`nchar`)
 60 | * `sf_substr` (`substr`)
 61 | * `sf_paste` (`paste0`)
 62 | * `sf_collapse` (`paste0`)
 63 | * `sf_readLines` (`readLines`)
 64 | * `sf_writeLines` (`writeLines`)
 65 | * `sf_grepl` (`grepl`)
 66 | * `sf_gsub` (`gsub`)
 67 | * `sf_toupper` (`toupper`)
 68 | * `sf_tolower` (`tolower`)
 69 | * `sf_starts` (`startsWith`)
 70 | * `sf_ends` (`endsWith`)
 71 | * `sf_trim` (`trimws`)
 72 | * `sf_split` (`strsplit`)
 73 | * `sf_match` (`match` for strings only)
 74 | * `sf_compare`/`sf_equals` (`==`, ALTREP-aware string equality)
 75 | 
 76 | Utility functions:
 77 | 
 78 | * `sf_vector` -- creates a new and empty `stringfish` vector
 79 | * `sf_assign` -- assign strings into a `stringfish` vector in place (like `x[i] <- "mystring"`)
 80 | * `sf_convert`/`convert_to_sf` -- converts a character vector to a `stringfish` vector
 81 | * `get_string_type` -- determines string type (whether ALTREP or normal)
 82 | * `materialize` -- converts any ALTREP object into a normal R object
 83 | * `random_strings` -- creates random strings as either a `stringfish` or normal R vector
 84 | * `string_identical` -- like `identical` for strings but also requires identical encoding (i.e. latin1 and UTF-8 strings will not match)
 85 | 
 86 | In addition, many R operations in base R and other packages are already ALTREP-aware (i.e. they don't cause materialization). Functions that subset or index into string vectors generally do not materialize.
 87 | 
 88 | * `sample`
 89 | * `head`
 90 | * `tail`
 91 | * `[` -- e.g. `x[20:30]`
 92 | * `dplyr::filter` -- e.g. `dplyr::filter(df, sf_starts("a"))`
 93 | * Etc.
 94 | 
 95 | `stringfish` functions are not intended to exactly replicate their base R analogues. One difference is that `subject` parameters are always the first argument, which is easier to use with pipes (`%>%`). E.g., `gsub(pattern, replacement, subject)` becomes `sf_gsub(subject, pattern, replacement)`.
 96 | 
 97 | ## Extensibility
 98 | 
 99 | `stringfish` as a framework is intended to be easily extensible. Stringfish vectors can be worked into `Rcpp` scripts or even into other packages (see the `qs` package for an example).
100 | 
101 | Below is a detailed `Rcpp` script that creates a function to alternate upper and lower case of strings. 
102 | 
103 | ```{c eval=FALSE}
104 | // [[Rcpp::plugins(cpp11)]]
105 | // [[Rcpp::depends(stringfish)]]
106 | #include <Rcpp.h>
107 | #include "sf_external.h"
108 | using namespace Rcpp;
109 | 
110 | // Alternate the case of ASCII letters within each word of every string in x,
111 | // e.g. "hello world" -> "hElLo wOrLd". Bytes other than A-Z / a-z are copied
112 | // unchanged, a space restarts the pattern, and NA elements stay NA.
113 | // Returns a new stringfish vector with the same length as x.
114 | // [[Rcpp::export]]
115 | SEXP sf_alternate_case(SEXP x) {
116 |   // Iterate through a character vector using the RStringIndexer class
117 |   // If the input vector x is a stringfish character vector it will do so without materialization
118 |   RStringIndexer r(x);
119 |   size_t len = r.size();
120 |   // Create an output stringfish vector
121 |   // Like all R objects, it must be protected from garbage collection
122 |   SEXP output = PROTECT(sf_vector(len));
123 |   // Obtain a reference to the underlying output data
124 |   sf_vec_data & output_data = sf_vec_data_ref(output);
125 |   
126 |   // You can use range based for loop via an iterator class that returns RStringIndexer::rstring_info e
127 |   // rstring info is a struct containing const char * ptr (null terminated), int len, and cetype_t enc
128 |   // a NA string is represented by a nullptr
129 |   // Alternatively, access the data via the function r.getCharLenCE(i)
130 |   size_t i = 0;
131 |   for(auto e : r) {
132 |     // NA input: store an explicit NA rather than relying on the slot's initial value
133 |     if(e.ptr == nullptr) {
134 |       output_data[i++] = sfstring(NA_STRING);
135 |       continue;
136 |     }
137 |     // create a temporary output string and process the results
138 |     std::string temp(e.len, '\0');
139 |     bool case_switch = false; // toggled by every ASCII letter, reset by a space
140 |     for(int j=0; j<e.len; j++) {
141 |       const char c = e.ptr[j];
142 |       if((c >= 'A') && (c <= 'Z')) { // char j is upper case
143 |         if((case_switch = !case_switch)) { // check if we should convert to lower case
144 |           temp[j] = c + ('a' - 'A');
145 |           continue;
146 |         }
147 |       } else if((c >= 'a') && (c <= 'z')) { // char j is lower case
148 |         if(!(case_switch = !case_switch)) { // check if we should convert to upper case
149 |           temp[j] = c - ('a' - 'A');
150 |           continue;
151 |         }
152 |       } else if(c == ' ') {
153 |         case_switch = false;
154 |       }
155 |       temp[j] = c;
156 |     }
157 |     // Create a new vector element sfstring and insert the processed string into the stringfish vector
158 |     // sfstring has three constructors, 1) taking a std::string and encoding,
159 |     // 2) a char pointer and encoding, or 3) a CHARSXP object (e.g. sfstring(NA_STRING))
160 |     output_data[i++] = sfstring(temp, e.enc);
161 |   }
162 |   UNPROTECT(1); // balance the PROTECT above before returning
163 |   return output;
164 | }
165 | 
166 | ```
167 | 
168 | Example function call:
169 | ```{r eval=FALSE}
170 | sf_alternate_case("hello world") 
171 | [1] "hElLo wOrLd"
172 | ```
173 | 
174 | ## To do
175 | * Additional functions
176 | * ICU library functions
177 | 


--------------------------------------------------------------------------------