├── .gitignore ├── .travis.yml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── lib_py3 ├── pylib.c ├── pylib.h └── tre.py └── src ├── demo.c ├── lib ├── platform.c ├── platform.h ├── utf8_lite.c └── utf8_lite.h ├── tdebug.c ├── tdebug.h ├── tinyre.c ├── tinyre.h ├── tlexer.c ├── tlexer.h ├── tparser.c ├── tparser.h ├── tutils.h ├── tvm.c └── tvm.h /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | build/ 10 | bak/ 11 | 12 | # Translations 13 | *.mo 14 | *.pot 15 | 16 | # Django stuff: 17 | *.log 18 | 19 | # Others 20 | .gitignore~ 21 | *.[ch]~ 22 | *.un~ 23 | *.py~ 24 | *.txt~ 25 | *.swp 26 | *.md~ 27 | 28 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | 3 | script: 4 | mkdir build && cd build && cmake .. 
&& make

compiler:
  - clang
  - gcc
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------

cmake_minimum_required(VERSION 2.8)

project(tinyre)

# Build configuration. Both values are defaults only and can be overridden on
# the command line, e.g.
#   cmake -DCMAKE_BUILD_TYPE=Debug -Dbuild_target=py3lib ..
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()
set(build_target demo CACHE STRING
    "What to build: demo (executable) or py3lib (Python 3 extension module)")

# source_group_by_dir(<list-variable-name>)
#
# For MSVC builds, puts every file of the named list into an IDE source group
# that mirrors the file's directory relative to the current source directory.
# Files located directly in the source directory go to the root group.
# Does nothing for other compilers.
function(source_group_by_dir source_files)
  if(MSVC)
    set(sgbd_cur_dir "${CMAKE_CURRENT_SOURCE_DIR}")
    foreach(sgbd_file ${${source_files}})
      # Path relative to the source dir, then its directory part only.
      string(REGEX REPLACE "${sgbd_cur_dir}/(.*)" "\\1" sgbd_fpath "${sgbd_file}")
      string(REGEX REPLACE "(.*)/.*" "\\1" sgbd_group_name "${sgbd_fpath}")
      # No '/' left in the relative path means the file has no sub-directory.
      string(COMPARE EQUAL "${sgbd_fpath}" "${sgbd_group_name}" sgbd_nogroup)
      string(REPLACE "/" "\\" sgbd_group_name "${sgbd_group_name}")
      if(sgbd_nogroup)
        set(sgbd_group_name "\\")
      endif()
      source_group("${sgbd_group_name}" FILES "${sgbd_file}")
    endforeach()
  endif()
endfunction()

if(MSVC)
  # Link the static C runtime. The project is C, so the C flags must be
  # patched (the CXX flags are patched as well for consistency). Replacing
  # /MD keeps the remaining default per-configuration flags intact.
  foreach(flag_var
      CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
      CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE)
    string(REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
  endforeach()
else()
  # Append instead of overwrite so that -Wall, -std=c99 and any flags given
  # by the user are all in effect.
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -std=c99")
  set(CMAKE_C_FLAGS_DEBUG "-g")
  set(CMAKE_C_FLAGS_RELEASE "-O2")
endif()

add_definitions(-DUNICODE -D_UNICODE)

# Quoted so that an empty build type cannot break the if() syntax.
if("${CMAKE_BUILD_TYPE}" MATCHES "Debug")
  add_definitions(-DTRE_DEBUG)
endif()

file(GLOB_RECURSE project_headers src/*.h)
file(GLOB_RECURSE project_cpps src/*.c)
set(SRC_LIST ${project_headers} ${project_cpps})

source_group_by_dir(SRC_LIST)

if("${build_target}" MATCHES "demo")
  # demo executable
  add_definitions(-DDEMO)
  add_executable(tinyre ${SRC_LIST})

  if(NOT MSVC)
    target_link_libraries(tinyre m)
  endif()
elseif("${build_target}" MATCHES "py3lib")
  # Python 3 extension module
  add_definitions(-DPY3LIB)
  file(GLOB_RECURSE py3lib_headers lib_py3/*.h)
  file(GLOB_RECURSE py3lib_cfiles lib_py3/*.c)
  set(LIB_PY3_SRC_LIST ${py3lib_headers} ${py3lib_cfiles})
  source_group_by_dir(LIB_PY3_SRC_LIST)
  add_library(_tinyre SHARED ${SRC_LIST} ${LIB_PY3_SRC_LIST})
  # The module must be importable as "_tinyre", without a "lib" prefix.
  set_target_properties(_tinyre PROPERTIES PREFIX "")
else()
  message(FATAL_ERROR
      "Unknown build_target '${build_target}': expected 'demo' or 'py3lib'")
endif()

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2012 - 2016 fy

This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not
   claim that you wrote the original software. If you use this software
   in a product, an acknowledgment in the product documentation would be
   appreciated but is not required.

2. Altered source versions must be plainly marked as such, and must not be
   misrepresented as being the original software.

3. This notice may not be removed or altered from any source
   distribution.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# tinyre ver 0.9.2

[![Travis](https://travis-ci.org/fy0/tinyre.svg?branch=master)](https://travis-ci.org/fy0/tinyre)
[![Code Climate](https://codeclimate.com/github/fy0/tinyre/badges/gpa.svg)](https://codeclimate.com/github/fy0/tinyre)

A tiny regex engine.
It is planned to be compatible with "Secret Labs' Regular Expression Engine" (SRE, the engine behind Python's `re`).

**warning: the project already works fine, but it is slow**

**Features**:
* **utf-8 support**
Cheers for unicode!

* **no octal number**
\\1 means group 1, \\1-100 means group n, \\01 matches \\1, \\07 matches \\7, \\08 matches ['\\0', '8'], \\377 matches 0o377, but \\400 matches neither 0o400 nor [chr(0o40), '\\0']!
What the hell ... I choose go die! Go away octal number!

* **custom maximum number of backtracking**
An evil regex: **'a?'\*n+'a'\*n** against **'a'\*n**
For example: **'a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaaaa'** matches **'aaaaaaaaaaaaaaaaaaaaaaaaa'**
It will take a long time because of excessive backtracking. Perl/Python/PCRE require over **10^15 years** to match a 29-character string.
You can set a limit on the number of backtracking steps to avoid this situation; when the limit is reached, the match fails.

* **more than 100 groups ...**
but who cares?


**Supported**:
* "." Matches any character except a newline.
* "^" Matches the start of the string.
* "$" Matches the end of the string or just before the newline at the end of the string.
* "*" Matches 0 or more (greedy) repetitions of the preceding RE. Greedy means that it will match as many repetitions as possible.
* "+" Matches 1 or more (greedy) repetitions of the preceding RE.
* "?" Matches 0 or 1 (greedy) of the preceding RE.
* *?,+?,?? Non-greedy versions of the previous three special characters.
* {m} Matches m copies of the previous RE.
* {m,n} Matches from m to n repetitions of the preceding RE.
* {m,n}? Non-greedy version of the above.
* "\\" Either escapes special characters or signals a special sequence.
* "\\1-N" Matches the text matched earlier by the group index.
* [] Indicates a set of characters.
* [^] A "^" as the first character indicates a complementing set.
* "|" A|B, creates an RE that will match either A or B.
* (...) Matches the RE inside the parentheses. The contents can be retrieved or matched later in the string.
* (?ims) Set the I, M or S flag for the RE (see below).
* (?:...) Non-grouping version of regular parentheses.
* (?P&lt;name&gt;...) The substring matched by the group is accessible by name.
* (?P=name) Matches the text matched earlier by the group named name.
* (?#...) A comment; ignored.
* (?=...) Matches if ... matches next, but doesn't consume the string.
* (?!...) Matches if ... doesn't match next.
* (?<=...) Matches if preceded by ... (must be fixed length).
55 | * (?groupnum); 37 | 38 | for (i = 0; i < m->groupnum; i++) { 39 | PyObject* t2 = PyTuple_New(3); 40 | if (m->groups[i].name) 41 | PyTuple_SetItem(t2, 0, PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, m->groups[i].name, m->groups[i].name_len)); 42 | else { 43 | Py_INCREF(Py_None); 44 | PyTuple_SetItem(t2, 0, Py_None); 45 | } 46 | 47 | if (m->groups[i].head != -1) { 48 | PyTuple_SetItem(t2, 1, PyLong_FromLong(m->groups[i].head)); 49 | PyTuple_SetItem(t2, 2, PyLong_FromLong(m->groups[i].tail)); 50 | } else { 51 | Py_INCREF(Py_None); 52 | Py_INCREF(Py_None); 53 | PyTuple_SetItem(t2, 1, Py_None); 54 | PyTuple_SetItem(t2, 2, Py_None); 55 | } 56 | PyTuple_SetItem(t, i, t2); 57 | } 58 | 59 | tre_match_free(m); 60 | return t; 61 | } 62 | 63 | static PyObject* trepy_match(PyObject *self, PyObject* args) 64 | { 65 | tre_Pattern* pattern; 66 | PyObject* obj; 67 | char* text; 68 | int backtrack_limit; 69 | 70 | if (!PyArg_ParseTuple(args, "Osi", &obj, &text, &backtrack_limit)) 71 | return NULL; 72 | 73 | pattern = (tre_Pattern*)PyCapsule_GetPointer(obj, "_tre_pattern"); 74 | 75 | tre_Match* m = tre_match(pattern, text, backtrack_limit); 76 | if (!m->groups) { 77 | tre_match_free(m); 78 | Py_INCREF(Py_None); 79 | return Py_None; 80 | } 81 | 82 | return tre_Match_c2py(m); 83 | } 84 | 85 | static PyMethodDef tre_methods[] ={ 86 | {"_compile", trepy_compile, METH_VARARGS}, 87 | {"_match", trepy_match, METH_VARARGS}, 88 | {NULL, NULL,0,NULL} 89 | }; 90 | 91 | static struct PyModuleDef module_def ={ 92 | PyModuleDef_HEAD_INIT, 93 | "_tinyre", 94 | "Tiny Regex Engine Module", 95 | -1, 96 | tre_methods, 97 | }; 98 | 99 | 100 | static PyObject *TinyreError; 101 | 102 | 103 | PyMODINIT_FUNC PyInit__tinyre() 104 | { 105 | PyObject *m; 106 | m = PyModule_Create(&module_def); 107 | 108 | if (m == NULL) 109 | return NULL; 110 | 111 | TinyreError = PyErr_NewException("tre.error", NULL, NULL); 112 | Py_INCREF(TinyreError); 113 | PyModule_AddObject(m, "error", TinyreError); 114 | return 
m; 115 | } 116 | #endif 117 | 118 | -------------------------------------------------------------------------------- /lib_py3/pylib.h: -------------------------------------------------------------------------------- 1 | 2 | #ifdef PY3LIB 3 | 4 | #include "../src/tinyre.h" 5 | #include 6 | 7 | PyMODINIT_FUNC PyInit__tinyre(); 8 | void trepy_free_pattern(PyObject* obj); 9 | 10 | #endif 11 | 12 | -------------------------------------------------------------------------------- /lib_py3/tre.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # tinyre v0.9.0 wrapper 3 | 4 | import _tinyre 5 | 6 | # flags 7 | I = IGNORECASE = 2 8 | M = MULTILINE = 8 9 | S = DOTALL = 16 10 | 11 | class error(Exception): 12 | msg = '' 13 | 14 | class TRE_Pattern: 15 | @classmethod 16 | def __new_pattern__(self, pattern, flags=0): 17 | if flags > I|M|S: 18 | flags = (flags & I) | (flags & M) | (flags & S) 19 | ret = _tinyre._compile(pattern, flags) 20 | if type(ret) == int: 21 | raise error(ret) 22 | else: 23 | ptn = TRE_Pattern() 24 | ptn.__cpattern__ = ret 25 | ptn.pattern = pattern 26 | return ptn 27 | 28 | def match(self, text, backtrack_limit=0): 29 | ret = _tinyre._match(self.__cpattern__, text, backtrack_limit) 30 | if ret: 31 | return TRE_Match(text, ret) 32 | 33 | 34 | class TRE_Match: 35 | def __init__(self, match_text, data): 36 | groupspan = [] 37 | grouptext = [] 38 | groupdict = {} 39 | default_slots = [] 40 | for i in data: 41 | name, a, b = i 42 | span = (a, b) 43 | text = match_text[a:b] if a is not None else None 44 | if i[0]: 45 | groupdict[i[0]] = (span, text) 46 | if span[0] is None: 47 | default_slots.append(True) 48 | else: 49 | default_slots.append(False) 50 | groupspan.append(span) 51 | grouptext.append(text) 52 | self.__text__ = match_text 53 | self.__groupspan__ = tuple(groupspan) 54 | self.__grouptext__ = tuple(grouptext) 55 | self.__groupdict__ = groupdict 56 | self.__default_slots__ = default_slots 57 | 
self.lastindex = None 58 | 59 | if groupspan: 60 | index = 0 61 | for i in groupspan[1:]: 62 | if i[0]: 63 | index += 1 64 | self.lastindex = index 65 | 66 | def __get_text_by_index__(self, i): 67 | return self.__grouptext__[i] 68 | 69 | def __get_text_by_name__(self, i): 70 | if i in self.__groupdict__: 71 | return self.__groupdict__[i][1] 72 | else: 73 | return None 74 | 75 | def span(self, index=0): 76 | a, b = self.__groupspan__[index] 77 | if a is None: 78 | return -1, -1 79 | return a, b 80 | 81 | def group(self, *indices): 82 | ret = [] 83 | if len(indices) == 0: 84 | indices = {0} 85 | 86 | for i in indices: 87 | if type(i) == int: 88 | ret.append(self.__get_text_by_index__(i)) 89 | elif type(i) == str: 90 | ret.append(self.__get_text_by_name__(i)) 91 | if len(ret) == 1: 92 | return ret[0] 93 | else: 94 | return tuple(ret) 95 | 96 | def groups(self, default=None): 97 | if default is None: 98 | return self.__grouptext__[1:] 99 | else: 100 | ret = list(self.__grouptext__[1:]) 101 | for i in range(1, len(self.__default_slots__)): 102 | if self.__default_slots__[i]: 103 | ret[i-1] = default 104 | return tuple(ret) 105 | 106 | def groupdict(self): 107 | ret = {} 108 | for k, v in self.__groupdict__.items(): 109 | ret[k] = v[1] 110 | return ret 111 | 112 | def start(self, index=0): 113 | return self.span(index)[0] 114 | 115 | def end(self, index=0): 116 | return self.span(index)[1] 117 | 118 | def string(self): 119 | return self.__text__ 120 | 121 | 122 | def compile(pattern, flags=0): 123 | return TRE_Pattern.__new_pattern__(pattern, flags) 124 | 125 | 126 | def match(pattern, text, flags=0, backtrack_limit=0): 127 | if pattern.__class__ != TRE_Pattern: 128 | if pattern.__class__ == str: 129 | pattern = compile(pattern, flags) 130 | else: 131 | return None 132 | return pattern.match(text, backtrack_limit) 133 | 134 | -------------------------------------------------------------------------------- /src/demo.c: 
-------------------------------------------------------------------------------- 1 | /* 2 | * start : 2012-4-8 09:57 3 | * update: 2015-12-10 v0.9.0 4 | * 5 | * tinyre 6 | * fy, 2012-2015 7 | * 8 | */ 9 | 10 | #include "tutils.h" 11 | #include "tdebug.h" 12 | 13 | #include "tinyre.h" 14 | 15 | 16 | #ifdef DEMO 17 | 18 | int main(int argc,char* argv[]) 19 | { 20 | int i; 21 | int err_code; 22 | tre_Pattern* pattern; 23 | tre_Match* match = NULL; 24 | platform_init(); 25 | 26 | pattern = tre_compile("1(2)[3]", FLAG_DOTALL, &err_code); 27 | 28 | if (pattern) { 29 | match = tre_match(pattern, "123", 5000); 30 | 31 | if (match->groups) { 32 | putchar('\n'); 33 | for (i = 0; i < match->groupnum; i++) { 34 | printf("Group %2d: ", i); 35 | if (match->groups[i].name) { 36 | printf("("); 37 | output_str(match->groups[i].name, match->groups[i].name_len); 38 | printf(") "); 39 | } else printf("(null) "); 40 | printf("%d %d\n", match->groups[i].head, match->groups[i].tail); 41 | if (match->groups[i].head != -1) { 42 | debug_printstr(match->str, match->groups[i].head, match->groups[i].tail); 43 | } else { 44 | printf("match failed."); 45 | } 46 | printf("\n"); 47 | } 48 | } 49 | } else { 50 | tre_err(err_code); 51 | } 52 | 53 | if (pattern) { 54 | tre_pattern_free(pattern); 55 | if (match) { 56 | tre_match_free(match); 57 | } 58 | } 59 | 60 | return 0; 61 | } 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /src/lib/platform.c: -------------------------------------------------------------------------------- 1 | 2 | #include "platform.h" 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | #ifdef PLATFORM_WINDOWS 9 | #include 10 | 11 | wchar_t* _utf8_to_16(const char* str) { 12 | int nwLen = MultiByteToWideChar(CP_UTF8, 0, str, -1, NULL, 0); 13 | 14 | wchar_t *pwBuf = malloc(sizeof(wchar_t) * (nwLen + 1)); 15 | memset(pwBuf, 0, nwLen * 2 + 2); 16 | 17 | MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), pwBuf, nwLen); 18 | 19 | 
return pwBuf; 20 | } 21 | #endif 22 | 23 | 24 | void printf_u8(const char *fmt, ...) { 25 | #if defined(PLATFORM_WINDOWS) 26 | int size; 27 | 28 | va_list args; 29 | va_start(args, fmt); 30 | size = vsnprintf(NULL, 0, fmt, args); 31 | va_end(args); 32 | 33 | char *buf = malloc(sizeof(char) * (size+1)); 34 | 35 | va_start(args, fmt); 36 | vsprintf(buf, fmt, args); 37 | va_end(args); 38 | 39 | wchar_t* final_str = _utf8_to_16(buf); 40 | wprintf(final_str); 41 | 42 | free(buf); 43 | free(final_str); 44 | #else 45 | va_list args; 46 | va_start(args, fmt); 47 | vprintf(fmt, args); 48 | va_end(args); 49 | #endif 50 | } 51 | 52 | void platform_init() { 53 | setlocale(LC_CTYPE, ""); 54 | } 55 | 56 | -------------------------------------------------------------------------------- /src/lib/platform.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PLATFORM_UTILS_H 3 | #define PLATFORM_UTILS_H 4 | 5 | #if defined(_WIN32) && !defined(_WIN32_WCE) 6 | #define PLATFORM_WINDOWS /* enable goodies for regular Windows */ 7 | #endif 8 | 9 | #ifdef _MSC_VER 10 | #define _INLINE 11 | #pragma execution_character_set("utf-8") 12 | #else 13 | #define _INLINE inline 14 | #endif 15 | 16 | void printf_u8(const char *fmt, ...); 17 | 18 | void platform_init(); 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /src/lib/utf8_lite.c: -------------------------------------------------------------------------------- 1 |  2 | #include "utf8_lite.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifdef _MSC_VER 10 | #include 11 | #endif 12 | 13 | 14 | /* 15 | ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. 
16 | */ 17 | const char *utf8_decode(const char *o, int *val) { 18 | static const unsigned int limits[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF }; 19 | const unsigned char *s = (const unsigned char *)o; 20 | unsigned int c = s[0]; 21 | unsigned int res = 0; /* final result */ 22 | if (c < 0x80) /* ascii? */ 23 | res = c; 24 | else { 25 | int count = 0; /* to count number of continuation bytes */ 26 | while (c & 0x40) { /* still have continuation bytes? */ 27 | int cc = s[++count]; /* read next byte */ 28 | if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ 29 | return NULL; /* invalid byte sequence */ 30 | res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ 31 | c <<= 1; /* to test next bit */ 32 | } 33 | res |= ((c & 0x7F) << (count * 5)); /* add first byte */ 34 | if (count > 3 || res > MAXUNICODE || res <= limits[count]) 35 | return NULL; /* invalid byte sequence */ 36 | s += count; /* skip continuation bytes read */ 37 | } 38 | if (val) *val = res; 39 | return (const char *)s + 1; /* +1 to include first byte */ 40 | } 41 | 42 | 43 | int utf8_len(const char *s) { 44 | int code; 45 | int len = 0, rlen = strlen(s); 46 | const char* s_end = s + rlen + 1; 47 | 48 | for (const char *p = utf8_decode(s, &code); p != s_end; p = utf8_decode(p, &code)) { 49 | len += 1; 50 | } 51 | 52 | return len; 53 | } 54 | 55 | 56 | char* ucs4_to_utf8(int code) { 57 | const char abPrefix[] = {0, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC}; 58 | const int adwCodeUp[] = { 59 | 0x80, // U+00000000 ~ U+0000007F 60 | 0x800, // U+00000080 ~ U+000007FF 61 | 0x10000, // U+00000800 ~ U+0000FFFF 62 | 0x200000, // U+00010000 ~ U+001FFFFF 63 | 0x4000000, // U+00200000 ~ U+03FFFFFF 64 | 0x80000000 // U+04000000 ~ U+7FFFFFFF 65 | }; 66 | 67 | int i, ilen; 68 | 69 | // 根据UCS4编码范围确定对应的UTF-8编码字节数 70 | ilen = sizeof(adwCodeUp) / sizeof(uint32_t); 71 | for(i = 0; i < ilen; i++) { 72 | if( code < adwCodeUp[i] ) break; 73 | } 74 | 75 | if (i == ilen) return NULL; // 无效的UCS4编码 76 | 77 | ilen = i + 1; // 
UTF-8编码字节数 78 | char* pbUTF8 = malloc(sizeof(char) * (ilen+1)); 79 | 80 | if (pbUTF8 != NULL) { // 转换为UTF-8编码 81 | for( ; i > 0; i-- ) { 82 | pbUTF8[i] = (char)((code & 0x3F) | 0x80); 83 | code >>= 6; 84 | } 85 | 86 | pbUTF8[0] = (char)(code | abPrefix[ilen - 1]); 87 | } 88 | 89 | /*for (i = 0; i < ilen; i++) { 90 | printf("%2x ", pbUTF8[i]); 91 | }*/ 92 | pbUTF8[ilen] = 0; 93 | 94 | return pbUTF8; 95 | } 96 | 97 | uint32_t* utf8_to_ucs4_str(const char *s, int *plen) { 98 | int code; 99 | const char *p = s; 100 | int len = utf8_len(s); 101 | uint32_t *buf = malloc((len+1) * sizeof(uint32_t)); 102 | 103 | for (int i = 0; i < len;++i) { 104 | p = utf8_decode(p, &code); 105 | buf[i] = (uint32_t)code; 106 | } 107 | 108 | buf[len] = '\0'; 109 | if (plen) *plen = len; 110 | return buf; 111 | } 112 | -------------------------------------------------------------------------------- /src/lib/utf8_lite.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef UTF8_LITE_H 3 | #define UTF8_LITE_H 4 | 5 | #include 6 | 7 | #define MAXUNICODE 0x10FFFF 8 | 9 | const char *utf8_decode (const char *o, int *val); 10 | int utf8_len(const char *s); 11 | 12 | char* ucs4_to_utf8(int code); 13 | 14 | uint32_t* utf8_to_ucs4_str(const char *s, int *plen); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/tdebug.c: -------------------------------------------------------------------------------- 1 |  2 | #include "tvm.h" 3 | #include "tutils.h" 4 | #include "tlexer.h" 5 | #include "tparser.h" 6 | #include "tdebug.h" 7 | 8 | void putcode(uint32_t code) { 9 | //putwchar((wchar_t)code); 10 | if (code < 0xff) { 11 | putchar((char)code); 12 | } else { 13 | char* ret = ucs4_to_utf8(code); 14 | printf_u8("%s", ret); 15 | free(ret); 16 | } 17 | } 18 | 19 | void output_str(uint32_t *str, int len) { 20 | for (int i = 0; i < len; ++i) { 21 | putcode(str[i]); 22 | } 23 | } 24 | 25 | void 
debug_token_print(tre_Lexer *lex) { 26 | int err; 27 | uint32_t tval; 28 | 29 | printf("token list:\n"); 30 | while (true) { 31 | err = tre_lexer_next(lex); 32 | if (err) { 33 | tre_err(err); 34 | return; 35 | } 36 | tval = lex->token.value; 37 | 38 | //printf(" %12d ", tval, lex->token.extra.code); 39 | if (tval < FIRST_TOKEN) { 40 | printf("%12s ", ""); 41 | putchar(tval); 42 | switch (tval) { 43 | case '(': 44 | printf(" GroupType:%d ", lex->token.extra.group_type); 45 | if (lex->token.extra.group_type == GT_BACKREF_CONDITIONAL_INDEX) { 46 | printf("Index:%d", lex->token.extra.index); 47 | } 48 | break; 49 | case '{': 50 | printf(" {%d, %d}", lex->token.extra.code, lex->token.extra.code2); 51 | break; 52 | case '[': 53 | printf("%5d", lex->token.extra.code); 54 | break; 55 | case '-': 56 | printf(" "); 57 | putcode(lex->token.extra.code); 58 | putchar('-'); 59 | putcode(lex->token.extra.code2); 60 | break; 61 | } 62 | } else { 63 | if (tval == TK_CHAR) { 64 | printf("%12s ", ""); 65 | putcode(lex->token.extra.code); 66 | } else if (tval == TK_CHAR_SPE) { 67 | printf("%12s ", ""); 68 | if (lex->token.extra.code != '.') putchar('\\'); 69 | putcode(lex->token.extra.code); 70 | } else if (tval == TK_BACK_REF) { 71 | printf("%12s ", ""); 72 | printf("%d", lex->token.extra.code); 73 | } else if (tval == TK_COMMENT) { 74 | printf("%12s ", ""); 75 | printf("#"); 76 | } else if (tval == TK_NOP) { 77 | printf("%12s ", ""); 78 | printf("@"); 79 | } else if (tval == TK_END) { 80 | putchar('\n'); 81 | break; 82 | } 83 | } 84 | putchar('\n'); 85 | } 86 | putchar('\n'); 87 | } 88 | 89 | void debug_ins_list_print(ParserMatchGroup* groups) { 90 | int gnum = 0; 91 | 92 | for (ParserMatchGroup *g = groups; g; g = g->next) { 93 | if (gnum == 0) printf_u8("\nInstructions : Group 0\n"); 94 | else { 95 | printf_u8("\nInstructions : Group %d (%d)", gnum, g->group_type); 96 | if (g->group_type == GT_IF_PRECEDED_BY || g->group_type == GT_IF_NOT_PRECEDED_BY || g->group_type == 
GT_BACKREF_CONDITIONAL_INDEX 97 | || g->group_type == GT_BACKREF_CONDITIONAL_GROUPNAME) { 98 | printf_u8(" [%d]", g->group_extra); 99 | } 100 | putchar('\n'); 101 | } 102 | gnum++; 103 | 104 | for (INS_List* code = g->codes_start; code->next; code = code->next) { 105 | if (code->ins == INS_CMP) { 106 | printf_u8("%15s ", "CMP"); 107 | putcode(*(int*)code->data); 108 | putchar('\n'); 109 | } else if (code->ins == INS_CMP_SPE) { 110 | printf_u8("%15s ", "CMP_SPE"); 111 | putcode(*(int*)code->data); 112 | putchar('\n'); 113 | } else if (code->ins == INS_CMP_MULTI || code->ins == INS_NCMP_MULTI) { 114 | if (code->ins == INS_CMP_MULTI) printf_u8("%15s %d ", "CMP_MULTI", *(int*)code->data); 115 | else printf_u8("%15s %d ", "NCMP_MULTI", *(int*)code->data); 116 | printf_u8("%6d ", *((int*)code->data + 1)); 117 | putcode(*((int*)code->data + 2)); 118 | 119 | if (*((int*)code->data + 1) == '-') { 120 | printf(" "); 121 | putcode(*((int*)code->data + 3)); 122 | } 123 | 124 | putchar('\n'); 125 | for (int i = 1; i < *(int*)code->data; i++) { 126 | printf_u8(" %4d ", *((int*)code->data + i * 3 + 1)); 127 | putcode(*((int*)code->data + i * 3 + 2)); 128 | 129 | if (*((int*)code->data + i * 3 + 1) == '-') { 130 | printf(" "); 131 | putcode(*((int*)code->data + i * 3 + 3)); 132 | } 133 | 134 | putchar('\n'); 135 | } 136 | } else if (code->ins == INS_CMP_BACKREF) { 137 | printf_u8("%15s %d\n", "CMP_BACKREF", *(int*)code->data); 138 | } else if (code->ins == INS_CMP_GROUP) { 139 | printf_u8("%15s %d\n", "CMP_GROUP", *(int*)code->data); 140 | } else if (code->ins == INS_CHECK_POINT) { 141 | printf_u8("%15s %d %d\n", "CHECK_POINT", *(int*)code->data, *((int*)code->data + 1)); 142 | } else if (code->ins == INS_CHECK_POINT_NO_GREED) { 143 | printf_u8("%15s %d %d\n", "CHECK_POINT_NG", *(int*)code->data, *((int*)code->data + 1)); 144 | } else if (code->ins == INS_MATCH_START) { 145 | printf_u8("%15s\n", "MATCH_START"); 146 | } else if (code->ins == INS_MATCH_END) { 147 | 
printf_u8("%15s\n", "MATCH_END"); 148 | } else if (code->ins == INS_GROUP_END) { 149 | printf_u8("%15s %d\n", "GROUP_END", *(int*)code->data); 150 | } 151 | } 152 | 153 | printf_u8("\n"); 154 | } 155 | } 156 | 157 | void debug_printstr(uint32_t *str, int head, int tail) { 158 | uint32_t *p = str + head; 159 | if (tail <= head) return; 160 | 161 | while (p != str + tail) { 162 | putcode(*p++); 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /src/tdebug.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef TINYRETRE_DEBUG_H 3 | #define TINYRETRE_DEBUG_H 4 | 5 | #include "tlexer.h" 6 | 7 | //#define TRE_DEBUG 8 | 9 | struct tre_Token; 10 | struct ParserMatchGroup; 11 | 12 | void putcode(uint32_t code); 13 | void output_str(uint32_t *str, int len); 14 | 15 | void debug_token_print(tre_Lexer *lexer); 16 | void debug_ins_list_print(struct ParserMatchGroup* groups); 17 | 18 | void debug_printstr(uint32_t *str, int head, int tail); 19 | 20 | #ifdef TRE_DEBUG 21 | #define TRE_DEBUG_PRINT( ...) printf_u8(__VA_ARGS__) 22 | #else 23 | #define TRE_DEBUG_PRINT(expression, ...) 
24 | #endif 25 | 26 | #endif 27 | 28 | -------------------------------------------------------------------------------- /src/tinyre.c: -------------------------------------------------------------------------------- 1 | /* 2 | * start : 2012-4-8 09:57 3 | * update: 2015-12-10 v0.9.0 4 | * 5 | * tinyre 6 | * fy, 2012-2015 7 | * 8 | */ 9 | 10 | #include "tutils.h" 11 | #include "tlexer.h" 12 | #include "tparser.h" 13 | #include "tvm.h" 14 | #include "tdebug.h" 15 | 16 | void tre_err(int err_code) { 17 | switch (err_code) { 18 | case ERR_LEXER_UNBALANCED_PARENTHESIS: 19 | printf_u8("input error: unbalanced parenthesis.\n"); 20 | break; 21 | case ERR_LEXER_UNEXPECTED_END_OF_PATTERN: 22 | printf_u8("input error: unexpected end of pattern.\n"); 23 | break; 24 | case ERR_LEXER_UNKNOW_SPECIFIER: 25 | printf_u8("input error: unknown specifier.\n"); 26 | break; 27 | case ERR_LEXER_BAD_GROUP_NAME: 28 | printf_u8("input error: bad group name\n"); 29 | break; 30 | case ERR_LEXER_UNICODE_ESCAPE: 31 | printf_u8("input error: unicode escape failed, requires 4 chars(\\u0000).\n"); 32 | break; 33 | case ERR_LEXER_UNICODE6_ESCAPE: 34 | printf_u8("input error: unicode escape failed, requires 8 chars(\\u00000000).\n"); 35 | break; 36 | case ERR_LEXER_HEX_ESCAPE: 37 | printf_u8("input error: hex escape failed, requires 2 chars(\\x00).\n"); 38 | break; 39 | case ERR_LEXER_BAD_GROUP_NAME_IN_BACKREF: 40 | printf_u8("input error: bad group name in backref\n"); 41 | break; 42 | case ERR_LEXER_INVALID_GROUP_NAME_OR_INDEX: 43 | printf_u8("input error: invalid group name or index\n"); 44 | break; 45 | case ERR_LEXER_REDEFINITION_OF_GROUP_NAME: 46 | printf_u8("input error: redefinition of group name\n"); 47 | break; 48 | case ERR_PARSER_REQUIRES_FIXED_WIDTH_PATTERN: 49 | printf_u8("input error: look-behind requires fixed-width pattern\n"); 50 | break; 51 | case ERR_PARSER_BAD_CHARACTER_RANGE: 52 | printf_u8("input error: bad character range\n"); 53 | break; 54 | case 
ERR_PARSER_NOTHING_TO_REPEAT: 55 | printf_u8("input error: nothing to repeat\n"); 56 | break; 57 | case ERR_PARSER_IMPOSSIBLE_TOKEN: 58 | printf_u8("input error: impossible token\n"); 59 | break; 60 | case ERR_PARSER_UNKNOWN_GROUP_NAME: 61 | printf_u8("input error: unknow group name\n"); 62 | break; 63 | case ERR_PARSER_CONDITIONAL_BACKREF: 64 | printf_u8("input error: conditional backref with more than two branches\n"); 65 | break; 66 | case ERR_PARSER_INVALID_GROUP_INDEX: 67 | printf_u8("input error: invalid group index in conditional backref\n"); 68 | break; 69 | default: 70 | printf_u8("parsering falied!!!\n"); 71 | } 72 | } 73 | 74 | tre_Pattern* tre_compile(char* s, int flag, int* err_code) { 75 | int ret; 76 | tre_Pattern* groups; 77 | tre_Lexer* lexer; 78 | 79 | int len; 80 | uint32_t* buf = utf8_to_ucs4_str(s, &len); 81 | 82 | lexer = tre_lexer_new(buf, len); 83 | 84 | //#define TRE_DEBUG_LEXER 85 | #ifdef TRE_DEBUG_LEXER 86 | debug_token_print(lexer); 87 | return 0; 88 | #endif 89 | 90 | groups = tre_parser(lexer, &ret); 91 | 92 | if (groups == NULL) { 93 | *err_code = ret; 94 | } else { 95 | groups->flag = flag | lexer->extra_flag; 96 | } 97 | 98 | tre_lexer_free(lexer); 99 | free(buf); 100 | return groups; 101 | } 102 | 103 | tre_Match* tre_match(tre_Pattern* tp, const char* str, int backtrack_limit) 104 | { 105 | VMState* vms = vm_init(tp, str, backtrack_limit); 106 | tre_GroupResult* groups = vm_exec(vms); 107 | tre_Match* match = tre_new(tre_Match, 1); 108 | match->groupnum = vms->group_num; 109 | match->groups = groups; 110 | match->str = vms->input_str; 111 | vm_free(vms); 112 | return match; 113 | } 114 | 115 | void tre_pattern_free(tre_Pattern *ptn) { 116 | int i; 117 | 118 | for (i = 0; i < ptn->num_all; i++) { 119 | free(ptn->groups[i].codes); 120 | free(ptn->groups[i].name); 121 | } 122 | 123 | free(ptn->groups); 124 | free(ptn); 125 | } 126 | 127 | void tre_match_free(tre_Match *m) { 128 | int i; 129 | if (m->groups) { 130 | for (i = 0; i < 
m->groupnum; i++) { 131 | free(m->groups[i].name); 132 | } 133 | } 134 | free(m->str); 135 | free(m->groups); 136 | free(m); 137 | } 138 | -------------------------------------------------------------------------------- /src/tinyre.h: -------------------------------------------------------------------------------- 1 | /** 2 | * tinyre v0.9.0 3 | * fy, 2012-2015 4 | * 5 | */ 6 | 7 | #ifndef TINYRE_H 8 | #define TINYRE_H 9 | 10 | #include 11 | 12 | enum tre_Flag { 13 | FLAG_NONE = 0, 14 | //FLAG_TEMPLATE = 1, 15 | FLAG_IGNORECASE = 2, 16 | //FLAG_LOCALE = 4, 17 | FLAG_MULTILINE = 8, 18 | FLAG_DOTALL = 16, 19 | //FLAG_UNICODE = 32, 20 | //FLAG_VERBOSE = 64, 21 | //FLAGTRE_DEBUG = 128, 22 | 23 | //FLAG_T = 1, 24 | FLAG_I = 2, 25 | //FLAG_L = 4, 26 | FLAG_M = 8, 27 | FLAG_S = 16, 28 | //FLAG_U = 32, 29 | //FLAG_X = 64, 30 | }; 31 | 32 | /* compiled groups */ 33 | typedef struct MatchGroup { 34 | uint32_t* name; 35 | int name_len; 36 | int type; 37 | int extra; 38 | uint32_t* codes; 39 | } MatchGroup; 40 | 41 | typedef struct tre_Pattern { 42 | int num; // group num 43 | int num_all; // group num include non-grouping parentheses (?:) (?=) .. 
44 | MatchGroup* groups; 45 | int flag; 46 | } tre_Pattern; 47 | 48 | /* 匹配后返回的结果 */ 49 | 50 | typedef struct tre_GroupResult { 51 | uint32_t *name; 52 | int name_len; 53 | int head; 54 | int tail; 55 | } tre_GroupResult; 56 | 57 | 58 | typedef struct tre_Match { 59 | int groupnum; 60 | uint32_t* str; 61 | tre_GroupResult* groups; 62 | } tre_Match; 63 | 64 | /* 表达式编译和匹配 */ 65 | tre_Pattern* tre_compile(char* s, int flag, int* err_code); 66 | tre_Match* tre_match(tre_Pattern* tp, const char* str, int backtrack_limit); 67 | 68 | /* 释放内存占用 */ 69 | void tre_pattern_free(tre_Pattern *ptn); 70 | void tre_match_free(tre_Match *m); 71 | 72 | /* 其他 */ 73 | void tre_err(int err_code); 74 | 75 | #endif 76 | 77 | -------------------------------------------------------------------------------- /src/tlexer.c: -------------------------------------------------------------------------------- 1 |  2 | #include "tlexer.h" 3 | #include "tutils.h" 4 | 5 | int read_int(tre_Lexer *lex, char end_terminal, int *plen); 6 | int read_hex(tre_Lexer *lex, int len, bool *p_isok); 7 | 8 | uint32_t char_next(tre_Lexer *lex) { 9 | return lex->s[lex->scur++]; 10 | } 11 | 12 | uint32_t char_nextn(tre_Lexer *lex, int n) { 13 | uint32_t code = lex->s[lex->scur+n-1]; 14 | lex->scur+=n; 15 | return code; 16 | } 17 | 18 | uint32_t char_lookahead(tre_Lexer *lex) { 19 | return lex->s[lex->scur]; 20 | } 21 | 22 | uint32_t char_lookaheadn(tre_Lexer *lex, int n) { 23 | return lex->s[lex->scur + n - 1]; 24 | } 25 | 26 | _INLINE static 27 | bool token_check(uint32_t code) { 28 | switch (code) { 29 | case '^': case '$': case '*': case '+': case '?': 30 | case '[': case ']': case '{': case '(': case ')': 31 | case '|': 32 | return true; 33 | } 34 | return false; 35 | } 36 | 37 | _INLINE static 38 | bool is_spe_char(uint32_t code) { 39 | const char other_tokens[] = "DdWwSs"; 40 | for (const char *p = other_tokens; *p; p++) { 41 | if (code == *p) return true; 42 | } 43 | return false; 44 | } 45 | 46 | _INLINE static 
47 | int try_get_escape(uint32_t code) { 48 | const char other_tokens[] = "abfnrtv"; 49 | const int other_codes[] = { 7, 8, 12, 10, 13, 9, 11 }; 50 | for (const char *p = other_tokens; *p; p++) { 51 | if (code == *p) return other_codes[p-other_tokens]; 52 | } 53 | return code; 54 | } 55 | 56 | 57 | _INLINE static uint8_t _hex(uint32_t code) { 58 | if (code >= '0' && code <= '9') return code - '0'; 59 | else if (code >= 'A' && code <= 'F') return code - 'A' + 10; 60 | else if (code >= 'a' && code <= 'f') return code - 'a' + 10; 61 | return 255; 62 | } 63 | 64 | _INLINE static uint8_t _oct(uint32_t code) { 65 | if (code >= '0' && code <= '7') return code - '0'; 66 | return 255; 67 | } 68 | 69 | _INLINE static uint8_t _bin(uint32_t code) { 70 | if (code >= '0' && code <= '1') return code - '0'; 71 | return 255; 72 | } 73 | 74 | _INLINE static uint8_t _dec(uint32_t code) { 75 | if (code >= '0' && code <= '9') return code - '0'; 76 | return 255; 77 | } 78 | 79 | 80 | _INLINE static 81 | int _read_x_int(const uint32_t *start, const uint32_t *end, int n, uint8_t(*func)(uint32_t code), int max_size) { 82 | const uint32_t *p = start; 83 | const uint32_t *e = (max_size > 0) ? 
start + max_size : end; 84 | int ret = 0, val = (int)pow(n, e - p - 1); 85 | 86 | do { 87 | ret += (*func)(*p++) * val; 88 | val /= n; 89 | } while (p != e); 90 | 91 | return ret; 92 | } 93 | 94 | int read_int(tre_Lexer *lex, char end_terminal, int *plen) { 95 | const uint32_t *p = lex->s + lex->scur; 96 | const uint32_t *start = p; 97 | 98 | while (isdigit(*p)) ++p; 99 | 100 | if (p == start) { 101 | if (plen) *plen = 0; 102 | return -1; 103 | } 104 | 105 | int num = _read_x_int(start, p, 10, _dec, 0); 106 | if (plen) { 107 | if (end_terminal && (*(p + (p - start) - 1) != end_terminal)) { 108 | return -1; 109 | } 110 | } 111 | if (plen) *plen = p - start; 112 | return num; 113 | } 114 | 115 | int read_hex(tre_Lexer *lex, int len, bool *p_isok) { 116 | const uint32_t *p = lex->s + lex->scur; 117 | const uint32_t *start = p; 118 | int count = 0; 119 | 120 | while (_hex(*p) != 255) { 121 | ++p; 122 | if (++count == len) break; 123 | } 124 | 125 | if (count != len) { 126 | p_isok = false; 127 | return 0; 128 | } 129 | 130 | *p_isok = true; 131 | return _read_x_int(start, p, 16, _hex, 0); 132 | } 133 | 134 | 135 | _INLINE static 136 | int token_char_accept(tre_Lexer *lex, uint32_t code, bool use_back_ref) { 137 | if (code == '\\') { 138 | // 对转义字符做特殊处理 139 | if (lex->scur == lex->slen) { 140 | // 如果已经是最后一个字符,那么当作普通字符即可 141 | lex->token.extra.code = code; 142 | lex->token.value = TK_CHAR; 143 | } else { 144 | // 如果不是,读下一个字符 145 | code = char_lookahead(lex); 146 | if (is_spe_char(code)) { 147 | // 能确定为特殊匹配字符的话,读取结束 148 | lex->token.extra.code = code; 149 | lex->token.value = TK_CHAR_SPE; 150 | code = char_next(lex); 151 | } else { 152 | // 否则当做 hex/unicode 转义处理 153 | int num, len; 154 | bool is_ok = false; 155 | 156 | if (code == 'x') { 157 | code = char_next(lex); 158 | num = read_hex(lex, 2, &is_ok); 159 | if (!is_ok) return ERR_LEXER_HEX_ESCAPE; 160 | char_nextn(lex, 2); 161 | } else if (code == 'u') { 162 | code = char_next(lex); 163 | num = read_hex(lex, 4, &is_ok); 
164 | if (!is_ok) return ERR_LEXER_UNICODE_ESCAPE; 165 | char_nextn(lex, 4); 166 | } else if (code == 'U') { 167 | code = char_next(lex); 168 | num = read_hex(lex, 8, &is_ok); // unicode 6.0 \U0000000A 169 | if (!is_ok) return ERR_LEXER_UNICODE6_ESCAPE; 170 | char_nextn(lex, 8); 171 | } 172 | 173 | if (is_ok) { 174 | lex->token.value = TK_CHAR; 175 | lex->token.extra.code = num; 176 | } else { 177 | num = read_int(lex, 0, &len); 178 | if (num != -1) { 179 | // back reference or normal char 180 | if (use_back_ref) { 181 | if (num == 0) { 182 | lex->token.value = TK_CHAR; 183 | lex->token.extra.code = 0; 184 | } else { 185 | lex->token.value = TK_BACK_REF; 186 | lex->token.extra.index = num; 187 | } 188 | } else { 189 | lex->token.value = TK_CHAR; 190 | lex->token.extra.code = num; 191 | } 192 | char_nextn(lex, len); 193 | } else { 194 | // 既不是转义,也不是前向引用,只是一个字符罢了 195 | lex->token.value = TK_CHAR; 196 | lex->token.extra.code = code; 197 | char_next(lex); 198 | } 199 | } 200 | } 201 | } 202 | } else { 203 | // 若非转义字符,那么一切都很简单 204 | lex->token.extra.code = code; 205 | lex->token.value = (code == '.') ? 
TK_CHAR_SPE : TK_CHAR; 206 | } 207 | return 0; 208 | } 209 | 210 | _INLINE static 211 | int char_to_flag(uint32_t code) { 212 | if (code == 'i') return FLAG_IGNORECASE; 213 | else if (code == 'm') return FLAG_MULTILINE; 214 | else if (code == 's') return FLAG_DOTALL; 215 | return 0; 216 | } 217 | 218 | #define lex_isidentfirst(c) ((c >= 'A' && c<= 'Z') || (c >= 'a' && c<= 'z') || (c >= '_') || (c >= 128)) 219 | #define lex_isidentletter(c) ((c >= 'A' && c<= 'Z') || (c >= 'a' && c<= 'z') || (c >= '0' && c<= '9') || (c == '_') || (c >= 128)) 220 | 221 | uint32_t* read_group_name(tre_Lexer *lex, char end_terminal, int *plen) { 222 | uint32_t code; 223 | uint32_t *name; 224 | const uint32_t *p = lex->s + lex->scur; 225 | const uint32_t *start = p; 226 | 227 | code = *p++; 228 | if (!lex_isidentfirst(code)) return NULL; 229 | 230 | while (true) { 231 | if (!lex_isidentletter(code)) break; 232 | code = *p++; 233 | } 234 | 235 | if (code != end_terminal) { 236 | return NULL; 237 | } 238 | 239 | name = tre_new(uint32_t, p - start); 240 | memcpy(name, start, (p - start) * sizeof(uint32_t)); 241 | name[p - start - 1] = '\0'; 242 | 243 | if (plen) *plen = p - start - 1; 244 | return name; 245 | } 246 | 247 | int tre_lexer_next(tre_Lexer* lex) { 248 | int len; 249 | uint32_t code; 250 | uint32_t* name; 251 | if (lex->scur == lex->slen) { 252 | lex->token.value = TK_END; 253 | return 0; 254 | } 255 | code = char_next(lex); 256 | bool is_lastone = (lex->scur == lex->slen); 257 | 258 | switch (lex->state) { 259 | case 0: // NORMAL STATE 260 | if (token_check(code)) { 261 | lex->token.extra.code = 0; 262 | lex->token.value = code; // token val is it's own ascii. 
263 | 264 | switch (code) { 265 | case '[': 266 | lex->state = 1; 267 | if ((!is_lastone) && char_lookahead(lex) == '^') { 268 | lex->token.extra.code = 1; 269 | } 270 | break; 271 | case '{': { 272 | int count; 273 | int scur_bak = lex->scur; 274 | int llimit = 0, rlimit = -1; 275 | 276 | // read left limit a{1 277 | llimit = read_int(lex, 0, &count); 278 | if (count == 0) goto __bad_token; 279 | code = char_nextn(lex, count+1); 280 | 281 | // read comma a{1, 282 | if ((char)code == ',') { 283 | //char_next(lex); 284 | } else if ((char)code == '}') { 285 | rlimit = llimit; 286 | goto __write_code; 287 | } else { 288 | // falied, rollback 289 | goto __bad_token; 290 | } 291 | 292 | // read left limit a{1, 2 293 | rlimit = read_int(lex, 0, &count); 294 | code = char_nextn(lex, count+1); 295 | 296 | // read right brace a{1,2} or a{1,} 297 | if ((char)code == '}') { 298 | // ok, rlimit is -1 299 | } else { 300 | // falied, rollback 301 | goto __bad_token; 302 | } 303 | 304 | __write_code: 305 | lex->token.extra.code = llimit; 306 | lex->token.extra.code2 = rlimit; 307 | break; 308 | 309 | __bad_token: 310 | lex->token.value = TK_CHAR; 311 | lex->token.extra.code = '{'; 312 | lex->scur = scur_bak; 313 | break; 314 | } 315 | case '(': { 316 | code = char_lookahead(lex); 317 | // if next char is not ? 
318 | if (code != '?') { 319 | lex->token.extra.group_type = GT_NORMAL; 320 | lex->token.extra.group_name = NULL; 321 | break; 322 | } else { 323 | code = char_nextn(lex, 2); 324 | switch (code) { 325 | case '#': { // just comment 326 | bool is_escape = false; 327 | code = char_next(lex); 328 | while (!(!is_escape && code == ')')) { 329 | code = char_next(lex); 330 | if (is_escape) is_escape = false; 331 | if (code == '\\') is_escape = true; 332 | if (code == '\0') return ERR_LEXER_UNBALANCED_PARENTHESIS; 333 | } 334 | lex->token.value = TK_COMMENT; 335 | break; 336 | } 337 | case ':': lex->token.extra.group_type = GT_NONGROUPING; break; 338 | case '=': lex->token.extra.group_type = GT_IF_MATCH; break; 339 | case '!': lex->token.extra.group_type = GT_IF_NOT_MATCH; break; 340 | case '(': 341 | // code for conditional backref 342 | name = read_group_name(lex, ')', &len); 343 | if (name) { 344 | code = char_nextn(lex, len); 345 | lex->token.extra.group_type = GT_BACKREF_CONDITIONAL_GROUPNAME; 346 | lex->token.extra.group_name = name; 347 | lex->token.extra.group_name_len = len; 348 | } else { 349 | int i = read_int(lex, ')', &len); 350 | if (i == -1) { 351 | return ERR_LEXER_INVALID_GROUP_NAME_OR_INDEX; 352 | } else { 353 | code = char_nextn(lex, len); 354 | lex->token.extra.group_type = GT_BACKREF_CONDITIONAL_INDEX; 355 | lex->token.extra.index = i; 356 | } 357 | } 358 | code = char_next(lex); 359 | break; 360 | case 'P': 361 | // group name 362 | code = char_lookahead(lex); 363 | if (code == '<') { 364 | code = char_next(lex); 365 | name = read_group_name(lex, '>', &len); 366 | if (!name) return ERR_LEXER_BAD_GROUP_NAME; 367 | code = char_nextn(lex, len+1); // name and '>' 368 | 369 | lex->token.extra.group_type = GT_NORMAL; 370 | lex->token.extra.group_name = name; 371 | lex->token.extra.group_name_len = len; 372 | } else if (code == '=') { 373 | // code for back reference (?P=) 374 | code = char_next(lex); 375 | name = read_group_name(lex, ')', &len); 376 | if 
(!name) return ERR_LEXER_BAD_GROUP_NAME_IN_BACKREF; 377 | code = char_nextn(lex, len); // skip name 378 | 379 | lex->token.extra.group_type = GT_BACKREF; 380 | lex->token.extra.group_name = name; 381 | lex->token.extra.group_name_len = len; 382 | } else { 383 | return ERR_LEXER_UNKNOW_SPECIFIER; 384 | } 385 | break; 386 | case '<': 387 | code = char_next(lex); 388 | if (code == '=') { 389 | lex->token.extra.group_type = GT_IF_PRECEDED_BY; 390 | } else if (code == '!') { 391 | lex->token.extra.group_type = GT_IF_NOT_PRECEDED_BY; 392 | } else { 393 | return ERR_LEXER_UNKNOW_SPECIFIER; 394 | } 395 | break; 396 | default: 397 | if (char_to_flag(code)) { 398 | int flag = 0; 399 | while (true) { 400 | flag = char_to_flag(code); 401 | if (flag) lex->extra_flag |= flag; 402 | else break; 403 | code = char_next(lex); 404 | } 405 | } else { 406 | return ERR_LEXER_UNEXPECTED_END_OF_PATTERN; 407 | } 408 | lex->token.value = TK_NOP; 409 | break; 410 | } 411 | } 412 | } 413 | }; 414 | } else { 415 | int ret = token_char_accept(lex, code, true); 416 | if (ret) return ret; 417 | } 418 | break; 419 | case 1: { // [...] 
420 | bool is_escape = code == '\\'; 421 | int ret = token_char_accept(lex, code, false); 422 | if (ret) return ret; 423 | 424 | if (!is_escape && lex->token.value == TK_CHAR) { 425 | // end the state 426 | if (code == ']') { 427 | lex->state = 0; 428 | lex->token.value = ']'; 429 | break; 430 | } 431 | } 432 | 433 | // [a-z] grammar 434 | code = char_lookahead(lex); 435 | if (code == '-') { 436 | uint32_t code2 = char_lookaheadn(lex, 2); 437 | // [a-] 438 | if (code2 == ']') break; 439 | 440 | // [\s-1] -> error 441 | if (lex->token.value == TK_CHAR_SPE) { 442 | return ERR_LEXER_BAD_CHARACTER_RANGE; 443 | } 444 | 445 | // [a-z] 446 | code2 = lex->token.extra.code; 447 | code = char_nextn(lex, 2); 448 | ret = token_char_accept(lex, code, false); 449 | if (ret) return ret; 450 | 451 | // [1-\s] -> error 452 | if (lex->token.value == TK_CHAR_SPE) { 453 | return ERR_LEXER_BAD_CHARACTER_RANGE; 454 | } 455 | 456 | // [z-a] -> error 457 | if (lex->token.extra.code < code2) { 458 | return ERR_LEXER_BAD_CHARACTER_RANGE; 459 | } 460 | 461 | // everything is ok 462 | lex->token.value = '-'; 463 | lex->token.extra.code2 = lex->token.extra.code; 464 | lex->token.extra.code = code2; 465 | } 466 | break; 467 | } 468 | } 469 | return 0; 470 | } 471 | 472 | int tre_check_groups(uint32_t *s, int len) { 473 | int num = 0; 474 | for (int i = 0; i < len; ++i) { 475 | if (s[i] == '\\') i++; 476 | else if (s[i] == '(') { 477 | if (s[i + 1] == '?') { 478 | if (s[i + 2] == 'P') { 479 | if (s[i + 3] == '<') { 480 | i += 2; 481 | num++; 482 | } 483 | } 484 | else if (s[i + 2] == '(') i += 2; 485 | i++; 486 | } else num++; 487 | } else if (s[i] == '[') { 488 | while (i++) { 489 | if (s[i] == ']') break; 490 | else if (s[i] == '\0') return -1; 491 | else if (s[i] == '\\') i++; 492 | } 493 | } 494 | } 495 | return num; 496 | } 497 | 498 | tre_Lexer* tre_lexer_new(uint32_t *s, int len) { 499 | tre_Lexer* lex = tre_new(tre_Lexer, 1); 500 | lex->extra_flag = 0; 501 | lex->max_normal_group_num = 
tre_check_groups(s, len) + 1; 502 | //printf("AAAAAAAAA %d\n", lex->max_normal_group_num); 503 | lex->state = 0; 504 | 505 | if (s) { 506 | lex->s = s; 507 | lex->scur = 0; 508 | lex->slen = len; 509 | } 510 | return lex; 511 | } 512 | 513 | void tre_lexer_free(tre_Lexer *lex) { 514 | free(lex); 515 | } 516 | 517 | -------------------------------------------------------------------------------- /src/tlexer.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef TINYRE_LEXER_H 3 | #define TINYRE_LEXER_H 4 | 5 | #include "tutils.h" 6 | 7 | #define FIRST_TOKEN 128 8 | 9 | enum TOKEN_LIST { 10 | TK_CHAR = FIRST_TOKEN, 11 | TK_CHAR_SPE, 12 | TK_BACK_REF, 13 | TK_NBACK_REF, 14 | TK_EQ_REF, 15 | TK_NE_REF, 16 | TK_COMMENT, 17 | TK_NOP, 18 | TK_END 19 | }; 20 | 21 | enum GROUP_TYPE { 22 | GT_NORMAL = 0, 23 | GT_NONGROUPING = 1, 24 | GT_BACKREF, 25 | GT_IF_MATCH, 26 | GT_IF_NOT_MATCH, 27 | GT_IF_PRECEDED_BY, 28 | GT_IF_NOT_PRECEDED_BY, 29 | GT_BACKREF_CONDITIONAL_INDEX, 30 | GT_BACKREF_CONDITIONAL_GROUPNAME, 31 | }; 32 | 33 | typedef struct TokenInfo { 34 | uint32_t index; 35 | uint32_t code; 36 | uint32_t code2; 37 | uint32_t group_type; 38 | uint32_t* group_name; 39 | int group_name_len; 40 | } TokenInfo; 41 | 42 | typedef struct tre_Token { 43 | uint32_t value; 44 | TokenInfo extra; 45 | } tre_Token; 46 | 47 | 48 | typedef struct tre_Lexer { 49 | tre_Token token; 50 | int extra_flag; 51 | const uint32_t *s; 52 | int scur; 53 | int slen; 54 | int state; // 0 NOMRAL | 1 [...] 
55 | int max_normal_group_num; 56 | //TokenGroupName* group_names; 57 | } tre_Lexer; 58 | 59 | int tre_check_groups(uint32_t *s, int len); 60 | int tre_lexer_next(tre_Lexer *lex); 61 | 62 | tre_Lexer* tre_lexer_new(uint32_t *s, int len); 63 | void tre_lexer_free(tre_Lexer *lex); 64 | 65 | #define ERR_LEXER_UNBALANCED_PARENTHESIS -3 66 | #define ERR_LEXER_UNEXPECTED_END_OF_PATTERN -4 67 | #define ERR_LEXER_UNKNOW_SPECIFIER -5 68 | #define ERR_LEXER_BAD_GROUP_NAME -6 69 | #define ERR_LEXER_UNICODE_ESCAPE -7 70 | #define ERR_LEXER_UNICODE6_ESCAPE -8 71 | #define ERR_LEXER_HEX_ESCAPE -9 72 | #define ERR_LEXER_BAD_GROUP_NAME_IN_BACKREF -10 73 | #define ERR_LEXER_INVALID_GROUP_NAME_OR_INDEX -11 74 | #define ERR_LEXER_REDEFINITION_OF_GROUP_NAME -12 75 | #define ERR_LEXER_BAD_CHARACTER_RANGE -52 76 | 77 | #endif 78 | 79 | -------------------------------------------------------------------------------- /src/tparser.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fy0/tinyre/485d194331eba4c97f9d8aa46deff88939ed8910/src/tparser.c -------------------------------------------------------------------------------- /src/tparser.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef TINYRE_PARSER_H 3 | #define TINYRE_PARSER_H 4 | 5 | #include "tinyre.h" 6 | #include "tlexer.h" 7 | 8 | typedef struct INS_List { 9 | int len; 10 | uint32_t ins; 11 | uint32_t* data; 12 | struct INS_List* next; 13 | } INS_List; 14 | 15 | typedef struct OR_List { 16 | INS_List* codes; 17 | struct OR_List* next; 18 | } OR_List; 19 | 20 | typedef struct ParserMatchGroup { 21 | uint32_t* name; 22 | int name_len; 23 | 24 | INS_List* codes; 25 | INS_List* codes_start; 26 | int group_type; 27 | int group_extra; // used by (?<=) (? 
15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #define tre_new(__obj_type, __size) (__obj_type*)malloc((sizeof(__obj_type)*(__size))) 25 | 26 | typedef struct tre_Stack { 27 | void* data; 28 | int top; 29 | int len; 30 | } tre_Stack; 31 | 32 | 33 | #define stack_init(_s, _type, _len) { (_s).data = _len ? tre_new(_type, _len) : NULL; (_s).top = -1; (_s).len = _len; } 34 | #define stack_get_top(_s, _type) ((_type*)((_s).data) + (_s).top) 35 | #define stack_empty(_s) (_s.top == -1) 36 | #define stack_push(_s, _type) ((_type*)(_s.data) + ++(_s).top) 37 | #define stack_pop(_s, _type) ((_type*)((_s).data) + (_s).top--) 38 | #define stack_check(_s, _type, _step) if (_s.top == _s.len || _s.len == 0) { _s.len += _step; _s.data = _s.data ? realloc(_s.data, sizeof(_type) * _s.len) : tre_new(_type, _s.len);} 39 | #define stack_free(_s) free((_s).data); 40 | #define stack_copy(_s, _dest, _type) { (_dest).data = tre_new(_type, (_s).len);memcpy((_dest).data, (_s).data, sizeof(_type) * ((_s).top+1)); } 41 | 42 | #endif 43 | 44 | -------------------------------------------------------------------------------- /src/tvm.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fy0/tinyre/485d194331eba4c97f9d8aa46deff88939ed8910/src/tvm.c -------------------------------------------------------------------------------- /src/tvm.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fy0/tinyre/485d194331eba4c97f9d8aa46deff88939ed8910/src/tvm.h --------------------------------------------------------------------------------