├── .github └── workflows │ └── semgrep.yml ├── LICENSE ├── Makefile ├── README.md ├── lua-re2.lua ├── re2_c.cxx ├── re2_c.h └── test.lua /.github/workflows/semgrep.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: {} 3 | workflow_dispatch: {} 4 | push: 5 | branches: 6 | - main 7 | - master 8 | schedule: 9 | - cron: '0 0 * * *' 10 | name: Semgrep config 11 | jobs: 12 | semgrep: 13 | name: semgrep/ci 14 | runs-on: ubuntu-latest 15 | env: 16 | SEMGREP_APP_TOKEN: ${{ secrets.SEMGREP_APP_TOKEN }} 17 | SEMGREP_URL: https://cloudflare.semgrep.dev 18 | SEMGREP_APP_URL: https://cloudflare.semgrep.dev 19 | SEMGREP_VERSION_CHECK_URL: https://cloudflare.semgrep.dev/api/check-version 20 | container: 21 | image: semgrep/semgrep 22 | steps: 23 | - uses: actions/checkout@v4 24 | - run: semgrep ci 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 CloudFlare, Inc. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of CloudFlare, Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 
# Build script for the RE2 C wrapper (static + shared library) and the
# installation of the Lua binding.
#
# NOTE: ".PHONY" must be declared as a target with prerequisites; writing it
# as a variable assignment (".PHONY = ...") declares nothing, so a stray file
# named "test", "clean" or "install" would silently disable that target.
.PHONY: all clean createdir test install

# The directory of RE2 package
RE2_INSTALL_ROOT =
RE2_INC_DIR = $(RE2_INSTALL_ROOT)/usr/local/include
RE2_LIB_DIR = $(RE2_INSTALL_ROOT)/usr/local/lib
LUA_VERSION := 5.1

# the install dir of this package
PREFIX=/usr/local
LIB_TARGET_DIR=$(PREFIX)/lib
LUA_TARGET_DIR := $(PREFIX)/share/lua/$(LUA_VERSION)

# NOTE(review): the variable name "CXXFLAGAS" looks like a typo of CXXFLAGS;
# it is kept as-is because command-line overrides may rely on it.
CXXFLAGAS = -O3 -g -Wall
BUILD_CXXFLAGS = $(CXXFLAGAS) -fvisibility=hidden -I$(RE2_INC_DIR) -MMD
AR_BUILD_CXXFLAGS = -DBUILDING_LIB
SO_BUILD_CXXFLAGS = -DBUILDING_LIB -fPIC

CXX_SRC = re2_c.cxx
CXX_OBJ = ${CXX_SRC:.cxx=.o}
AR_OBJ = $(addprefix obj/lib/, $(CXX_OBJ))
SO_OBJ = $(addprefix obj/so/, $(CXX_OBJ))

AR_NAME = libre2c.a
SO_NAME = libre2c.so

BUILD_AR_DIR = obj/lib
BUILD_SO_DIR = obj/so

AR ?= ar
CXX ?= g++

# $(RE2C_EX) is not defined in this file and expands to nothing unless it is
# supplied from the command line or the environment.
all : $(BUILD_AR_DIR) $(BUILD_SO_DIR) $(AR_NAME) $(SO_NAME) $(RE2C_EX)

$(BUILD_AR_DIR):; mkdir -p $@
$(BUILD_SO_DIR):; mkdir -p $@

createdir :
	@if [ ! -d obj/lib ] ; then mkdir -p obj/lib ; fi && \
	if [ ! -d obj/so ] ; then mkdir -p obj/so ; fi

-include ar_dep.txt
-include so_dep.txt

$(AR_NAME) : $(AR_OBJ)
	$(AR) cru $@ $(AR_OBJ)

$(SO_NAME) : $(SO_OBJ)
	$(CXX) $(BUILD_CXXFLAGS) $(SO_BUILD_CXXFLAGS) $(SO_OBJ) -shared -L$(RE2_LIB_DIR) -lre2 -lpthread -o $@
	cat $(BUILD_SO_DIR)/*.d > so_dep.txt

# The output directories are order-only prerequisites ("| dir"): they must
# exist before compiling, including under "make -j", but their timestamps must
# not trigger rebuilds.
$(AR_OBJ) : $(BUILD_AR_DIR)/%.o : %.cxx | $(BUILD_AR_DIR)
	$(CXX) -c $(BUILD_CXXFLAGS) $(AR_BUILD_CXXFLAGS) $< -o $@
	cat $(BUILD_AR_DIR)/*.d > ar_dep.txt

$(SO_OBJ) : $(BUILD_SO_DIR)/%.o : %.cxx | $(BUILD_SO_DIR)
	$(CXX) -c $(BUILD_CXXFLAGS) $(SO_BUILD_CXXFLAGS) $< -o $@

# $(PROGRAM) and $(RE2C_EX) are not defined in this file; they expand to
# nothing unless supplied externally.
clean:
	rm -rf $(PROGRAM) ${BUILD_AR_DIR}/*.[od] ${BUILD_SO_DIR}/*.[od] *.[od] \
	*dep.txt $(AR_NAME) $(SO_NAME) $(RE2C_EX) obj/

# test.lua loads libre2c.so through the FFI, so build it first.
test: $(SO_NAME)
	export LD_LIBRARY_PATH=`pwd`:$(LD_LIBRARY_PATH):$(RE2_LIB_DIR); \
	luajit test.lua

install:
	install -D -m 755 $(AR_NAME) $(DESTDIR)/$(LIB_TARGET_DIR)/$(AR_NAME)
	install -D -m 755 $(SO_NAME) $(DESTDIR)/$(LIB_TARGET_DIR)/$(SO_NAME)
	install -D -m 664 lua-re2.lua $(DESTDIR)/$(LUA_TARGET_DIR)/lua-re2.lua
18 | 19 | The default value of the parameter `max-capture` is 40. 20 | 21 | compile 22 | ------- 23 | `syntax: pattern, err_msg = compile(pattern, [options, max_mem])` 24 | 25 | Pre-compile the pattern string; on failure `pattern` is nil and `err_msg` describes the error. Additional options for the regex engine can be passed via 26 | `options` and `max_mem`; they correspond to RE2's `re2::RE2::Options`, except 27 | that `options` uses single characters, instead of a bitmask, to pass the boolean 28 | options. The correspondence between `options` and RE2's `re2::RE2::Options` is as follows: 29 | 30 | |option char|re2::RE2::Options| meaning| default value| 31 | |-----------|-----------------|--------|--------------| 32 | | u | utf8 |text and pattern are UTF-8; otherwise Latin-1 | true | 33 | | a | longest_match |search for longest match, not first match | false | 34 | | e | log_errors |log syntax and execution errors to ERROR | false (the wrapper turns logging off unless `e` is given) | 35 | | l | literal |interpret string as literal, not regexp | false | 36 | | n | never_nl |never match \n, even if it is in regexp | false | 37 | | s | dot_nl |dot matches everything including new line | false | 38 | | c | never_capture |parse all parens as non-capturing | false | 39 | | i | case_sensitive |giving `i` makes the match case-insensitive, i.e. it clears `case_sensitive` (regexp can override with (?i) unless in posix_syntax mode) | true | 40 | | m | multi-line-mode |^ matches after any newline, and $ matches before any newline | false | 41 | 42 | match 43 | ------ 44 | `syntax: captures, errmsg = match(instance, pattern, text, cap_idx)` 45 | 46 | Match the given pre-compiled `pattern` against the `text`. It returns two values: 47 | 48 | | `captures` | the specified capture(s), see below | 49 | | `errmsg` | error message if something went wrong | 50 | 51 | The input parameter `cap_idx` can take one of the following values: 52 | | -1 | return all captures in an array, i-th (i>=0) element contains i-th capture, and 0-th element is the sub-string which tightly matches the entire pattern| 53 | | 1 .. 
the-number-of-capture | return particular capture | 54 | 55 | find 56 | ----- 57 | `syntax: match_or_not = find(pattern, text)` 58 | Returns non-nil if the text matches the pattern, nil otherwise. 59 | 60 | match_r 61 | ------ 62 | `syntax: captures = match_r(instance, pattern, text)` 63 | 64 | Match the given pre-compiled `pattern` repeatedly through the full `text`. 65 | It returns all captures in an array if there is a match, nil otherwise. 66 | 67 | C Functions 68 | ========== 69 | The interface functions are self-descriptive. Please check `re2_c.h` for details. 70 | -------------------------------------------------------------------------------- /lua-re2.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | Copyright (c) 2014 CloudFlare, Inc. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above 11 | copyright notice, this list of conditions and the following disclaimer 12 | in the documentation and/or other materials provided with the 13 | distribution. 14 | * Neither the name of CloudFlare, Inc. nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 22 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | ]] 30 | 31 | --[[ 32 | This module is a thin Lua wrapper for the RE2 library. It is built on top of the RE2 33 | C wrapper (libre2c.so) which, in turn, relies on libre2.so. 34 | 35 | This module exports the following functions: 36 | ---------------------------------------- 37 | o. new(): 38 | Create an instance of this module. 39 | 40 | o. compile(pattern, options, max_mem): 41 | Compile the pattern string. Return the pre-compiled pattern on success, 42 | or nil plus an error message otherwise. 43 | 44 | o. match(self, pattern, text, cap_idx) 45 | Match the pattern against the text. Return non-nil along with the 46 | specified capture(s). See the comment to this function for details. 47 | 48 | o. find(pattern, text) 49 | Perform matching without returning captures. 
50 | 51 | Usage example: 52 | -------------- 53 | 54 | local re2 = require "lua-re2" 55 | local inst = re2.new() 56 | local pat = re2_inst.compile("the-pattern-string") 57 | local caps, errmsg = re2_inst.match(pat, text) 58 | -- print all captures 59 | if caps then 60 | for i = 1, #caps do 61 | print("capture ", i, caps[i]) 62 | end 63 | end 64 | ]] 65 | 66 | local ffi = require "ffi" 67 | 68 | local _M = {} 69 | local mt = { __index = _M } 70 | 71 | ffi.cdef [[ 72 | struct re2_pattern_t; 73 | struct re2c_match_aux; 74 | struct re2_pattern_t* re2c_compile(const char* pattern, int pat_len, 75 | const char* re2_options, 76 | char* errstr, int errstrlen, 77 | unsigned max_mem); 78 | void re2c_free(struct re2_pattern_t*); 79 | int re2c_getncap(struct re2_pattern_t*); 80 | 81 | int re2c_match(const char* text, int text_len, struct re2_pattern_t*, 82 | struct re2c_match_aux*); 83 | int re2c_find(const char* text, int text_len, struct re2_pattern_t*); 84 | 85 | const char* re2c_get_capture(struct re2c_match_aux*, unsigned idx); 86 | unsigned re2c_get_capture_len(struct re2c_match_aux*, unsigned idx); 87 | 88 | struct re2c_match_aux* re2c_alloc_aux(void); 89 | void re2c_free_aux(struct re2c_match_aux*); 90 | 91 | const char* re2c_get_errstr(struct re2c_match_aux*); 92 | 93 | void* malloc(size_t); 94 | void free(void*); 95 | int re2c_match_r(const char* text, int text_len, struct re2_pattern_t* pattern, 96 | struct re2c_match_aux* aux); 97 | unsigned re2c_get_capture_r_count(struct re2c_match_aux* aux); 98 | const char* re2c_get_capture_r(struct re2c_match_aux*, unsigned idx); 99 | unsigned re2c_get_capture_r_len(struct re2c_match_aux*, unsigned idx); 100 | ]] 101 | 102 | local ffi_string = ffi.string 103 | local ffi_malloc = ffi.C.malloc 104 | local ffi_free = ffi.C.free 105 | local ffi_cast = ffi.cast 106 | local ffi_gc = ffi.gc 107 | 108 | local char_ptr_ty = ffi.typeof("char*"); 109 | 110 | -- NOTE: re2_c_lib must be referenced by a function, or is assigned to 111 | -- 
-- NOTE: the library handle must stay reachable for as long as the functions
-- resolved from it are in use; it is therefore kept both as an upvalue and as
-- _M.re2_c_lib. Otherwise the shared object could be unloaded by the garbage
-- collector.
--
local re2_c_lib = ffi.load("libre2c.so")
_M.re2_c_lib = re2_c_lib
local re2c_compile = re2_c_lib.re2c_compile
local re2c_match = re2_c_lib.re2c_match
local re2c_find = re2_c_lib.re2c_find
local re2c_getncap = re2_c_lib.re2c_getncap
local re2c_free = re2_c_lib.re2c_free
local re2c_get_capture = re2_c_lib.re2c_get_capture
local re2c_get_capture_len = re2_c_lib.re2c_get_capture_len
local re2c_match_r = re2_c_lib.re2c_match_r
local re2c_get_capture_r_count = re2_c_lib.re2c_get_capture_r_count
local re2c_get_capture_r = re2_c_lib.re2c_get_capture_r
local re2c_get_capture_r_len = re2_c_lib.re2c_get_capture_r_len

-- Size, in bytes, of the buffer handed to re2c_compile() for its error text.
local ERR_BUF_LEN = 128
local err_buf_ty = ffi.typeof("char[?]")

-- Create an instance. The instance owns one "aux" object from the C wrapper,
-- which match() and match_r() reuse for their captures; it is released
-- through re2c_free_aux() when the cdata is garbage-collected.
--
function _M.new()
    local aux = ffi_gc(re2_c_lib.re2c_alloc_aux(), re2_c_lib.re2c_free_aux)
    return setmetatable({ aux = aux }, mt)
end

-- Compile the given pattern, it will return two values:
--   o. the precompiled pattern or nil,
--   o. error message in case it was not successful.
--
-- The "options" is a string, each char being a single-char option. See
-- re2_c.h for the list of options and their definition.
--
-- max_mem is to specify the limit of memory allocated by RE2 engine.
--
-- Both "options" and "max_mem" could be nil.
--
function _M.compile(pattern, options, max_mem)
    -- The error buffer is a GC-managed, zero-filled cdata array, so nothing
    -- has to be freed by hand and nothing leaks if the call below raises
    -- (e.g. "#pattern" on a nil pattern).
    local err_buf = err_buf_ty(ERR_BUF_LEN)

    local ptn = re2c_compile(pattern, #pattern, options, err_buf,
                             ERR_BUF_LEN, max_mem or 0)

    -- NOTE: a NULL cdata pointer compares equal to nil but is still truthy,
    -- so "ptn == nil" and "not ptn" are not equivalent in this case!
    if ptn == nil then
        return nil, ffi_string(err_buf)
    end

    -- The compiled pattern is released by re2c_free() once unreachable.
    return ffi_gc(ptn, re2c_free)
end

-- Perform pattern match. It returns two values:
--
--  o. nothing (nil) if the text doesn't match. Otherwise,
--     *) cap_idx = -1 (the default when cap_idx is nil):
--        return all captures in an array where the i-th element (i>=1)
--        corresponds to i-th capture, and 0-th element is the sub-string of
--        the input text which tightly matches the pattern.
--
--        e.g. pattern = "abc([0-1]+)([a-z]+)", text = "wtfabc012abc"
--           The first value returned by this function would be a table t
--           with t[0] = 'abc01', t[1] = '01', t[2] = 'abc'
--
--     *) cap_idx != -1:
--        return the specified capture as a string (0 is the whole match).
--
--  o. error message if something unusual took place; currently that is only
--     "capture index out of range", returned together with nil.
--
function _M.match(self, pattern, text, cap_idx)
    cap_idx = cap_idx or -1
    local ncap = re2c_getncap(pattern)
    if cap_idx < -1 or cap_idx > ncap then
        return nil, "capture index out of range"
    end

    local aux = self.aux
    if re2c_match(text, #text, pattern, aux) ~= 0 then
        -- no match: return no values, exactly like falling off the end
        return
    end

    if cap_idx ~= -1 then
        -- return particular capture as a string
        return ffi_string(re2c_get_capture(aux, cap_idx),
                          re2c_get_capture_len(aux, cap_idx))
    end

    -- return all captures in an array; slot 0 holds the whole match
    local cap_array = {}
    for i = 0, ncap do
        cap_array[i] = ffi_string(re2c_get_capture(aux, i),
                                  re2c_get_capture_len(aux, i))
    end
    return cap_array
end

-- Test whether the pre-compiled pattern matches anywhere in the text, without
-- extracting captures. Returns 1 on match and nothing (nil) otherwise.
--
function _M.find(pattern, text)
    if re2c_find(text, #text, pattern) == 0 then
        return 1
    end
end

-- Match pattern repeatedly by scanning full text.
It returns one value: 226 | -- 227 | -- o. nil if dosen't match. otherwise, 228 | -- return all captures in an array where the i-th element (i>=1) 229 | -- corresponds to i-th captures. 230 | -- 231 | -- e.g. pattern = "([^&=]+)=([^&=]*)", text = "k1=v1&k2=v2&k3=v3" 232 | -- The first value returned by this function would be 233 | -- {'k1', 'v1', 'k2', 'v2', 'k3', 'v3'} 234 | -- 235 | function _M.match_r(self, pattern, text) 236 | local aux = self.aux 237 | local ret = re2c_match_r(text, #text, pattern, aux) 238 | local ncap = re2c_get_capture_r_count(aux) 239 | if ret == 0 then 240 | -- return all captures in an array 241 | local cap_array = {} 242 | for i = 0, ncap-1 do 243 | local str = re2c_get_capture_r(aux, i) 244 | local len = re2c_get_capture_r_len(aux, i) 245 | cap_array[i+1] = ffi_string(str, len) 246 | end 247 | return cap_array 248 | end 249 | end 250 | 251 | return _M 252 | -------------------------------------------------------------------------------- /re2_c.cxx: -------------------------------------------------------------------------------- 1 | #include // for tolower() 2 | #include // for snprintf() 3 | #include 4 | #include 5 | 6 | #include "re2_c.h" 7 | 8 | using namespace std; 9 | 10 | #define likely(x) __builtin_expect((x),1) 11 | #define unlikely(x) __builtin_expect((x),0) 12 | 13 | #ifdef DEBUG 14 | // Usage examples: ASSERT(a > b), ASSERT(foo() && "Opps, foo() reutrn 0"); 15 | #define ASSERT(c) if (!(c))\ 16 | { fprintf(stderr, "%s:%d Assert: %s\n", __FILE__, __LINE__, #c); abort(); } 17 | #else 18 | #define ASSERT(c) ((void)0) 19 | #endif 20 | 21 | #define CAP_VECTOR_DEFAULT_LEN 64 22 | 23 | /* This data structure is used to return some variable-length results back to 24 | * caller. 
25 | */ 26 | struct re2c_match_aux { 27 | char* errstr; 28 | re2::StringPiece* captures; 29 | re2::StringPiece* captures_r; /* collections of all captures for Consume/FindAndConsume apis */ 30 | 31 | unsigned short errstr_buf_len; 32 | unsigned short cap_vect_len; /* the capacity of captures vector */ 33 | unsigned short ncap; /* cache of RE2::NumberOfCapturingGroups() */ 34 | unsigned short cap_r_vect_len; 35 | unsigned short cap_r_vect_max_len; 36 | }; 37 | 38 | 39 | /* Record captures per match in captures_r vector. 40 | * captures_r vector will be realloceted automatically. */ 41 | unsigned 42 | re2c_record_capture(struct re2c_match_aux* aux) { 43 | if (unlikely(!aux->captures)) 44 | return 0; 45 | 46 | if (!aux->captures_r) { 47 | aux->captures_r = new re2::StringPiece[CAP_VECTOR_DEFAULT_LEN]; 48 | if (!aux->captures_r) return 1; 49 | } 50 | if (aux->cap_r_vect_len + aux->cap_vect_len >= aux->cap_r_vect_max_len) { 51 | aux->cap_r_vect_max_len *= 2; 52 | re2::StringPiece *new_captures_r = new re2::StringPiece[aux->cap_r_vect_max_len]; 53 | if (!new_captures_r) 54 | return 1; 55 | for (int i = 0; i < aux->cap_r_vect_len; i++) { 56 | new_captures_r[i] = aux->captures_r[i]; 57 | } 58 | delete[] aux->captures_r; 59 | aux->captures_r = new_captures_r; 60 | } 61 | for (int i = 0; i < aux->ncap; i++) { 62 | aux->captures_r[aux->cap_r_vect_len] = aux->captures[i]; 63 | aux->cap_r_vect_len++; 64 | } 65 | return 0; 66 | } 67 | 68 | unsigned 69 | re2c_get_capture_r_count(struct re2c_match_aux* aux) { 70 | return aux->cap_r_vect_len; 71 | } 72 | 73 | const char* 74 | re2c_get_capture_r(struct re2c_match_aux* aux, unsigned idx) { 75 | if (unlikely(!aux->captures_r)) 76 | return 0; 77 | 78 | if (unlikely(aux->cap_r_vect_len <= idx)) 79 | return 0; 80 | 81 | return aux->captures_r[idx].data(); 82 | } 83 | 84 | unsigned 85 | re2c_get_capture_r_len(struct re2c_match_aux* aux, unsigned idx) { 86 | if (unlikely(!aux->captures_r)) 87 | return 0; 88 | 89 | if 
(unlikely(aux->cap_r_vect_len <= idx)) 90 | return 0; 91 | 92 | return aux->captures_r[idx].size(); 93 | } 94 | 95 | /* Return the "idx"-th capture. NOTE: Captures are not necessarily ended with 96 | * '\0'. 97 | */ 98 | const char* 99 | re2c_get_capture(struct re2c_match_aux* aux, unsigned idx) { 100 | if (unlikely(!aux->captures)) 101 | return 0; 102 | 103 | if (unlikely(aux->ncap <= idx)) 104 | return 0; 105 | 106 | return aux->captures[idx].data(); 107 | } 108 | 109 | unsigned 110 | re2c_get_capture_len(struct re2c_match_aux* aux, unsigned idx) { 111 | if (unlikely(!aux->captures)) 112 | return 0; 113 | 114 | if (unlikely(aux->ncap <= idx)) 115 | return 0; 116 | 117 | return aux->captures[idx].size(); 118 | } 119 | 120 | struct re2c_match_aux* 121 | re2c_alloc_aux(void) { 122 | struct re2c_match_aux* p = new struct re2c_match_aux; 123 | p->errstr = 0; 124 | p->captures = 0; 125 | p->errstr_buf_len = 0; 126 | p->cap_vect_len = 0; 127 | p->ncap = 0; 128 | p->captures_r = 0; 129 | p->cap_r_vect_len = 0; 130 | p->cap_r_vect_max_len = CAP_VECTOR_DEFAULT_LEN; 131 | return p; 132 | } 133 | 134 | void 135 | re2c_free_aux(struct re2c_match_aux* p) { 136 | delete[] p->errstr; 137 | delete[] p->captures; 138 | delete[] p->captures_r; 139 | delete p; 140 | } 141 | 142 | const char* 143 | re2c_get_errstr(struct re2c_match_aux* aux) { 144 | return aux->errstr; 145 | } 146 | 147 | static void 148 | copy_errstr(char* buffer, int buf_len, const string& src) { 149 | if (!buffer) 150 | return; 151 | 152 | int copy_len = src.size(); 153 | if (copy_len > buf_len - 1) 154 | copy_len = buf_len - 1; 155 | 156 | strncpy(buffer, src.c_str(), copy_len); 157 | buffer[copy_len] = '\0'; 158 | } 159 | 160 | struct re2_pattern_t* 161 | re2c_compile(const char* pattern, int pattern_len, const char* re2_options, 162 | char* errstr, int errstrlen, unsigned max_mem) { 163 | const char* ptn_ptr = pattern; 164 | int ptn_len = pattern_len; 165 | 166 | // Process the options 167 | re2::RE2::Options 
opts; 168 | 169 | opts.set_log_errors(false); 170 | if (re2_options) { 171 | const char* p = re2_options; 172 | 173 | bool multiline = false; 174 | opts.set_perl_classes(true); 175 | opts.set_word_boundary(true); 176 | 177 | while (char c = *p++) { 178 | bool turn_on = true; 179 | if (c >= 'A' && c <= 'Z') { 180 | turn_on = false; 181 | c = tolower(c); 182 | } 183 | 184 | switch (c) { 185 | case 'u': opts.set_utf8(turn_on); break; 186 | case 'p': opts.set_posix_syntax(turn_on); break; 187 | case 'a': opts.set_longest_match(turn_on); break; 188 | case 'e': opts.set_log_errors(turn_on); break; 189 | case 'l': opts.set_literal(turn_on); break; 190 | case 'n': opts.set_never_nl(turn_on); break; 191 | case 's': opts.set_dot_nl(turn_on); break; 192 | case 'c': opts.set_never_capture(turn_on); break; 193 | case 'i': opts.set_case_sensitive(!turn_on); break; 194 | case 'm': multiline = true; break; 195 | default: 196 | { 197 | fprintf(stderr, "unsupport flag\n"); 198 | string s = "unsupport flags "; 199 | s += c; 200 | copy_errstr(errstr, errstrlen, s); 201 | return 0; 202 | } 203 | } 204 | } 205 | 206 | if (max_mem == 0) {max_mem = 2048 * 1024; } 207 | opts.set_max_mem(max_mem); 208 | 209 | // FIXME:one-line mode is always turned on in non-posix mode. 
To 210 | // workaround the problem, we enclose the pattern with "(?m:...)" 211 | if (multiline) { 212 | const char* prefix = "(?m:"; 213 | const char* postfix = ")"; 214 | 215 | char* t; 216 | t = new char[ptn_len + strlen(prefix) + strlen(postfix) + 1]; 217 | 218 | strcpy(t, prefix); 219 | memcpy(t + strlen(prefix), pattern, ptn_len); 220 | strcpy(t + strlen(prefix) + ptn_len, postfix); 221 | 222 | ptn_ptr = t; 223 | ptn_len += strlen(prefix) + strlen(postfix); 224 | } 225 | } 226 | 227 | // Now compile the pattern 228 | RE2* pat = new RE2(re2::StringPiece(ptn_ptr, ptn_len), opts); 229 | if (ptn_ptr != pattern) 230 | delete[] ptn_ptr; 231 | 232 | if (pat && !pat->ok()) { 233 | copy_errstr(errstr, errstrlen, pat->error()); 234 | delete pat; 235 | return 0; 236 | } 237 | 238 | return (re2_pattern_t*)(void*)pat; 239 | } 240 | 241 | void 242 | re2c_free(struct re2_pattern_t* pat) { 243 | delete (RE2*)(void*)pat; 244 | } 245 | 246 | /* Return the number of captures of the given pattern */ 247 | int 248 | re2c_getncap(struct re2_pattern_t* pattern) { 249 | RE2* pat = reinterpret_cast(pattern); 250 | return pat->NumberOfCapturingGroups(); 251 | } 252 | 253 | /* Return 0 if the pattern matches the given text, 1 otherwise. */ 254 | int 255 | re2c_find(const char* text, int text_len, struct re2_pattern_t* pattern) { 256 | RE2* re2 = (RE2*)(void*)pattern; 257 | if (unlikely(!re2)) 258 | return 1; 259 | 260 | bool match = re2->Match(re2::StringPiece(text, text_len), 261 | 0 /* startpos */, text_len /* endpos*/, 262 | re2::RE2::UNANCHORED, 0, 0); 263 | 264 | return match ? 0 : 1; 265 | } 266 | 267 | /* Return 0 if the pattern matches the given text, 1 otherwise; captures are 268 | * returned via "aux". 
269 | */ 270 | int 271 | re2c_match(const char* text, int text_len, struct re2_pattern_t* pattern, 272 | struct re2c_match_aux* aux) { 273 | RE2* re2 = (RE2*)(void*)pattern; 274 | if (unlikely(!re2)) 275 | return 1; 276 | 277 | int ncap = re2->NumberOfCapturingGroups() + 1; 278 | if (!aux->cap_vect_len || aux->cap_vect_len < ncap) { 279 | delete[] aux->captures; 280 | aux->captures = new re2::StringPiece[ncap]; 281 | aux->cap_vect_len = ncap; 282 | } 283 | aux->ncap = ncap; 284 | 285 | bool match = re2->Match(re2::StringPiece(text, text_len), 286 | 0 /* startpos */, text_len /* endpos*/, 287 | re2::RE2::UNANCHORED, aux->captures, ncap); 288 | return match ? 0 : 1; 289 | } 290 | 291 | /* Return 0 if the pattern matches the given text, 1 otherwise; captures are 292 | * returned via "aux". 293 | */ 294 | int 295 | re2c_match_r(const char* text, int text_len, struct re2_pattern_t* pattern, 296 | struct re2c_match_aux* aux) { 297 | RE2* re2 = (RE2*)(void*)pattern; 298 | if (unlikely(!re2)) { 299 | return 1; 300 | } 301 | 302 | int ncap = re2->NumberOfCapturingGroups(); 303 | if (0 != aux->captures) { 304 | delete[] aux->captures; 305 | aux->cap_vect_len = 0; 306 | } 307 | aux->captures = new re2::StringPiece[ncap]; 308 | if (unlikely(!aux->captures)) { 309 | return 1; 310 | } 311 | aux->cap_vect_len = ncap; 312 | aux->ncap = ncap; 313 | 314 | if (0 != aux->captures_r) { 315 | aux->cap_r_vect_len = 0; 316 | } 317 | 318 | RE2::Arg* argv = new RE2::Arg[ncap]; 319 | if (unlikely(!argv)) { 320 | return 1; 321 | } 322 | 323 | RE2::Arg** args = new RE2::Arg* [ncap]; 324 | if (unlikely(!args)) { 325 | delete[] argv; 326 | return 1; 327 | } 328 | 329 | for (int i = 0; i < ncap; i++) { 330 | argv[i] = &aux->captures[i]; 331 | args[i] = &argv[i]; 332 | } 333 | 334 | re2::StringPiece input(text, text_len); 335 | bool match = false; 336 | while (re2->FindAndConsumeN(&input, *re2, args, ncap)) { 337 | match = true; 338 | re2c_record_capture(aux); 339 | } 340 | delete[] args; 341 | 
342 | return match ? 0 : 1; 343 | } 344 | -------------------------------------------------------------------------------- /re2_c.h: -------------------------------------------------------------------------------- 1 | #ifndef _RE2_C_H_ 2 | #define _RE2_C_H_ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #ifdef BUILDING_LIB 9 | #define RE2C_EXPORT __attribute__ ((visibility ("protected"))) 10 | #else 11 | #define RE2C_EXPORT __attribute__ ((visibility ("default"))) 12 | #endif 13 | 14 | struct re2_pattern_t; /* opaque type for compiled pattern */ 15 | struct re2c_match_aux; 16 | 17 | /**************************************************************************** 18 | * 19 | * The correspondance between the single-char flags and re2::RE2::Options 20 | * 21 | * char,re2::RE2::Options,default-val,meaning 22 | * =========================================== 23 | * u utf8 (true) text and pattern are UTF-8; otherwise Latin-1 24 | * a longest_match (false) search for longest match, not first match 25 | * e log_errors (true) log syntax and execution errors to ERROR 26 | * n/a max_mem (see below) approx. max memory footprint of RE2 27 | * l literal (false) interpret string as literal, not regexp 28 | * n never_nl (false) never match \n, even if it is in regexp 29 | * s dot_nl (false) dot matches everything including new line 30 | * c never_capture (false) parse all parens as non-capturing 31 | * i case_insensitive (false) match is case-insensitive (regexp can override 32 | * with (?i) unless in posix_syntax mode) 33 | * m multi-line-mode (false) 34 | * 35 | * The lower-case char is to turn on (i.e set value to true), while the 36 | * corresponding upper-case is to turn the flag off. 37 | * 38 | **************************************************************************** 39 | */ 40 | 41 | /* Compile the pattern. 
If it was successful, the compiled pattern is returned 42 | * if is not-NULL, it is set to be the number of submatches the 43 | * pattern has; and is not accessed or deferenced in this 44 | * case. If it was not successful, NULL is returned; in the meantime, error 45 | * message is returned via if it's non-NULL. 46 | * 47 | * RE2 options are passed via and , where 48 | * is a string, each character turning on or off an boolean option (see the 49 | * above comment). if take value 0, default value is used. 50 | */ 51 | struct re2_pattern_t* re2c_compile(const char* pattern, int pat_len, 52 | const char* re2_options, 53 | char* errstr, int errstrlen, 54 | unsigned max_mem) RE2C_EXPORT; 55 | 56 | /* Free the pre-compiled pattern */ 57 | void re2c_free(struct re2_pattern_t*) RE2C_EXPORT; 58 | 59 | /* Perform pattern match. Return 0 on success, 1 otherwise. Captures are 60 | * returned via "aux". 61 | */ 62 | int re2c_match(const char* text, int text_len, 63 | struct re2_pattern_t* pattern, 64 | struct re2c_match_aux* aux) RE2C_EXPORT; 65 | 66 | /* Similar to re2c_match() except that it dosen't return captures */ 67 | int re2c_find(const char* text, int text_len, 68 | struct re2_pattern_t* pattern) RE2C_EXPORT; 69 | 70 | /* Return the number of captures the pattern have */ 71 | int re2c_getncap(struct re2_pattern_t*) RE2C_EXPORT; 72 | 73 | /* Return the "idx"-th capture */ 74 | const char* re2c_get_capture(struct re2c_match_aux*, unsigned idx) RE2C_EXPORT; 75 | 76 | /* Return the length of the "idx"-th capture */ 77 | unsigned re2c_get_capture_len(struct re2c_match_aux*, unsigned idx) RE2C_EXPORT; 78 | 79 | const char* re2c_get_errstr(struct re2c_match_aux*) RE2C_EXPORT; 80 | 81 | struct re2c_match_aux* re2c_alloc_aux(void) RE2C_EXPORT; 82 | void re2c_free_aux(struct re2c_match_aux*) RE2C_EXPORT; 83 | 84 | int re2c_match_r(const char* text, int text_len, struct re2_pattern_t* pattern, 85 | struct re2c_match_aux* aux) RE2C_EXPORT; 86 | unsigned 
re2c_get_capture_r_count(struct re2c_match_aux* aux) RE2C_EXPORT; 87 | /* Return the "idx"-th capture */ 88 | const char* re2c_get_capture_r(struct re2c_match_aux*, unsigned idx) RE2C_EXPORT; 89 | 90 | /* Return the length of the "idx"-th capture */ 91 | unsigned re2c_get_capture_r_len(struct re2c_match_aux*, unsigned idx) RE2C_EXPORT; 92 | 93 | #ifdef __cplusplus 94 | } 95 | #endif 96 | 97 | #endif /* _RE2_C_H_ */ 98 | -------------------------------------------------------------------------------- /test.lua: -------------------------------------------------------------------------------- 1 | local re2 = require "lua-re2" 2 | local string_fmt = string.format 3 | local io_write = io.write 4 | 5 | local pat_str 6 | local text_str 7 | local capture_str 8 | 9 | local function print_result(id, r) 10 | io_write(string_fmt("Test %d %s\n", id, r and "succ" or "fail")); 11 | end 12 | 13 | -- Test match without or ignore capture. The result is supposed to be 14 | -- "not match". 15 | local function test_not_match_nocap(id, pat, compile_opt, text) 16 | local re2_inst = re2.new() 17 | local pat, err = re2_inst.compile(pat_str, compile_opt) 18 | local r 19 | if pat then 20 | r = re2_inst:match(pat, text) 21 | end 22 | 23 | print_result(id, r) 24 | return r and 1 or nil 25 | end 26 | 27 | -- Test if the particular capture is correct. 
-- Compile `pat` with `compile_opt`, match it against `text` and check that
-- capture number `cap_idx` equals `capture`. Prints the result for test `id`
-- and returns 1 on success, nil otherwise.
local function test_match_cap(id, pat, compile_opt, text, cap_idx, capture)
    local re2_inst = re2.new()
    -- use the pattern that was passed in, not the file-level pat_str
    local ptn = re2_inst.compile(pat, compile_opt)

    local res
    if ptn then
        local cap = re2_inst:match(ptn, text, cap_idx)
        if cap and cap == capture then
            res = 1
        end
    end

    print_result(id, res)
    return res and 1 or nil
end

-- Compile `pat` and check that it matches `text`, and that match() and find()
-- agree with each other. Prints the result for test `id` and returns 1 on
-- success, nil otherwise.
local function test_match_nocap(id, pat, compile_opt, text)
    local re2_inst = re2.new()
    local ptn = re2_inst.compile(pat, compile_opt)

    local res
    if ptn then
        local matched = re2_inst:match(ptn, text)
        local found = re2_inst.find(ptn, text)
        if matched then
            res = 1
            if not found then
                print("re2_inst::match() and re2_inst.find() disagree")
                res = nil
            end
        elseif found then
            print("re2_inst::match() and re2_inst.find() disagree")
        end
    end

    print_result(id, res)
    return res and 1 or nil
end

-- Number of failed tests; it decides the exit status of the script so that
-- "make test" can actually fail.
local failures = 0

local function tally(ok)
    if not ok then
        failures = failures + 1
    end
end

pat_str = [==[([a-zA-Z ]+)([0-9]*)]==]
text_str = "23456This is the source code repository for code 1234"
capture_str = "This is the source code repository for code "

tally(test_match_cap(1, pat_str, nil, text_str, 1, capture_str))
tally(test_match_cap(2, pat_str, nil, text_str, 2, "1234"))

-- test multi-line support
pat_str = [[^\d*$]]
text_str =
[[abc
12345
xyz]]
tally(test_match_nocap(3, pat_str, "m", text_str))

-- Test if the all captures are correct: run match_r() over `text` and compare
-- the returned array element by element with `captures`. Prints the result
-- for test `id` and returns 1 on success, nil otherwise.
local function test_match_r_caps(id, pat, compile_opt, text, captures)
    local re2_inst = re2.new()
    local ptn = re2_inst.compile(pat, compile_opt)

    local res
    if ptn then
        local caps = re2_inst:match_r(ptn, text)
        if caps and (#caps == #captures) then
            res = 1
            for i = 1, #caps do
                if caps[i] ~= captures[i] then
                    res = nil
                    break
                end
            end
        end
    end

    print_result(id, res)
    return res and 1 or nil
end

pat_str = "([^&=]+)=([^&=]*)"
text_str = "k1=v1&k2=v2&&k3=v3&k4="
local captures = {'k1', 'v1', 'k2', 'v2', 'k3', 'v3', 'k4', ''}
tally(test_match_r_caps(4, pat_str, nil, text_str, captures))

if failures > 0 then
    os.exit(1)
end