├── .gitignore
├── .travis.yml
├── CMakeLists.txt
├── LICENSE
├── README.md
├── lib_py3
    ├── pylib.c
    ├── pylib.h
    └── tre.py
└── src
    ├── demo.c
    ├── lib
        ├── platform.c
        ├── platform.h
        ├── utf8_lite.c
        └── utf8_lite.h
    ├── tdebug.c
    ├── tdebug.h
    ├── tinyre.c
    ├── tinyre.h
    ├── tlexer.c
    ├── tlexer.h
    ├── tparser.c
    ├── tparser.h
    ├── tutils.h
    ├── tvm.c
    └── tvm.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | build/
10 | bak/
11 | 
12 | # Translations
13 | *.mo
14 | *.pot
15 | 
16 | # Django stuff:
17 | *.log
18 | 
19 | # Others
20 | .gitignore~
21 | *.[ch]~
22 | *.un~
23 | *.py~
24 | *.txt~
25 | *.swp
26 | *.md~
27 | 
28 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: c
2 | 
3 | script:
4 |   mkdir build && cd build && cmake .. && make
5 | 
6 | compiler:
7 |   - clang
8 |   - gcc
9 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | cmake_minimum_required(VERSION 2.8)
 3 | 
 4 | project (tinyre)
 5 | #set(CMAKE_BUILD_TYPE Debug)
 6 | set(CMAKE_BUILD_TYPE Release)
 7 | set(build_target demo)
 8 | #set(build_target py3lib)
 9 | 
10 | cmake_policy(SET CMP0015 OLD)
11 | 
12 | if (MSVC)
13 |     #set(CMAKE_GENERATOR_TOOLSET "v120_xp" CACHE STRING "Platform Toolset" FORCE) 
14 |     #set(CMAKE_VS_PLATFORM_TOOLSET "v120_xp" CACHE STRING "Platform Toolset" FORCE)
15 | endif(MSVC)
16 | 
# source_group_by_dir(<list-variable-name>)
#
# For MSVC generators, places every file of the named list into a source group
# that mirrors its directory relative to CMAKE_CURRENT_SOURCE_DIR, so the IDE
# tree looks like the on-disk layout. Files directly in the source directory go
# to the root group ("\"). Does nothing for other compilers.
#
# Example:
#   set(SRC_LIST src/a.c src/lib/b.c)
#   source_group_by_dir(SRC_LIST)
function(source_group_by_dir source_files)
    if(NOT MSVC)
        return()
    endif()

    foreach(sgbd_file ${${source_files}})
        # Compute the relative directory without building a regex from the
        # source path: a checkout path containing regex metacharacters or
        # spaces would otherwise break the match.
        file(RELATIVE_PATH sgbd_fpath "${CMAKE_CURRENT_SOURCE_DIR}" "${sgbd_file}")
        get_filename_component(sgbd_group_name "${sgbd_fpath}" PATH)
        string(LENGTH "${sgbd_group_name}" sgbd_group_len)

        if(sgbd_group_len EQUAL 0)
            # File lives directly in the source directory.
            set(sgbd_group_name "\\")
        else()
            # source_group() uses backslashes as the nesting separator.
            string(REPLACE "/" "\\" sgbd_group_name "${sgbd_group_name}")
        endif()

        source_group("${sgbd_group_name}" FILES "${sgbd_file}")
    endforeach()
endfunction()
32 | 
if(MSVC)
    # NOTE(review): these are C++ flag variables while every source here is C,
    # so they presumably have no effect on the build. CMAKE_C_FLAGS_DEBUG /
    # CMAKE_C_FLAGS_RELEASE look like what was intended - confirm before
    # switching, since that changes the CRT linkage of the produced binaries.
    set(CMAKE_CXX_FLAGS_DEBUG "/MTd")
    set(CMAKE_CXX_FLAGS_RELEASE "/MT")
else()
    # Append both options in one go. Two consecutive set() calls used to leave
    # only "-std=c99" in effect, silently dropping -Wall.
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -std=c99")
    set(CMAKE_C_FLAGS_DEBUG "-g")
    set(CMAKE_C_FLAGS_RELEASE "-O2")
endif()

add_definitions(-DUNICODE -D_UNICODE)

# Quoted so that an empty CMAKE_BUILD_TYPE is a valid (false) condition
# instead of a syntax error.
if("${CMAKE_BUILD_TYPE}" MATCHES "Debug")
    add_definitions(-DTRE_DEBUG)
endif()
50 | 
# Engine sources shared by both build targets.
file(GLOB_RECURSE project_headers src/*.h)
file(GLOB_RECURSE project_cpps src/*.c)
set(SRC_LIST ${project_headers} ${project_cpps})

source_group_by_dir(SRC_LIST)

if("${build_target}" STREQUAL "demo")
    # Stand-alone demo executable; DEMO enables main() in src/demo.c.
    add_definitions(-DDEMO)
    add_executable(tinyre ${SRC_LIST})

    if(NOT MSVC)
        target_link_libraries(tinyre m)
    endif()
elseif("${build_target}" STREQUAL "py3lib")
    # Python 3 extension module; PY3LIB enables the code in lib_py3/.
    add_definitions(-DPY3LIB)
    file(GLOB_RECURSE py3lib_headers lib_py3/*.h)
    file(GLOB_RECURSE py3lib_cfiles lib_py3/*.c)
    set(LIB_PY3_SRC_LIST ${py3lib_headers} ${py3lib_cfiles})
    source_group_by_dir(LIB_PY3_SRC_LIST)
    add_library(_tinyre SHARED ${SRC_LIST} ${LIB_PY3_SRC_LIST})
    # No "lib" prefix: the file must be named _tinyre.<ext> for `import _tinyre`.
    set_target_properties(_tinyre PROPERTIES PREFIX "")
else()
    # Previously an unrecognised value configured successfully but built nothing.
    message(FATAL_ERROR
        "Unknown build_target '${build_target}': expected 'demo' or 'py3lib'")
endif()
75 | 
76 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2012 - 2016 fy
 2 | 
 3 | This software is provided 'as-is', without any express or implied
 4 | warranty. In no event will the authors be held liable for any damages
 5 | arising from the use of this software.
 6 | 
 7 | Permission is granted to anyone to use this software for any purpose,
 8 | including commercial applications, and to alter it and redistribute it
 9 | freely, subject to the following restrictions:
10 | 
11 |    1. The origin of this software must not be misrepresented; you must not
12 |    claim that you wrote the original software. If you use this software
13 |    in a product, an acknowledgment in the product documentation would be
14 |    appreciated but is not required.
15 | 
16 |    2. Altered source versions must be plainly marked as such, and must not be
17 |    misrepresented as being the original software.
18 | 
19 |    3. This notice may not be removed or altered from any source
20 |    distribution.
21 | 
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # tinyre ver 0.9.2
  3 | 
  4 | [![Travis](https://travis-ci.org/fy0/tinyre.svg?branch=master)](https://travis-ci.org/fy0/tinyre)
  5 | [![Code Climate](https://codeclimate.com/github/fy0/tinyre/badges/gpa.svg)](https://codeclimate.com/github/fy0/tinyre)
  6 | 
  7 | A tiny regex engine.  
  8 | It is planned to be compatible with "Secret Labs' Regular Expression Engine" (SRE, for Python).  
  9 | 
 10 | **Warning: the project already works fine, but it is slow.**
 11 | 
 12 | **Features**:  
 13 | * **utf-8 support**  
 14 |   Cheers for unicode!  
 15 | 
 16 | * **no octal number**  
 17 |   \\1 means group 1, \\1-100 means group n, \\01 matches \\1, \\07 matches \\7, \\08 matches ['\\0', '8'], \\377 matches 0o377, but \\400 matches neither 0o400 nor [chr(0o40), '\\0']!  
 18 |   What the hell ... I choose go die! Go away octal number!  
 19 | 
 20 | * **custom maximum number of backtracking**  
 21 |   An evil regex: **'a?'\*n+'a'\*n** against **'a'\*n**  
 22 |   For example: **'a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaaaa'** matches **'aaaaaaaaaaaaaaaaaaaaaaaaa'**  
 23 |   This takes a very long time because of the huge amount of backtracking. Perl/Python/PCRE require over **10^15 years** to match a 29-character string.  
 24 |   You can set a limit on the number of backtracking steps to avoid this situation; when the limit is reached, the match fails.  
 25 | 
 26 | * **more than 100 groups ...**  
 27 |   but who cares?  
 28 | 
 29 | 
 30 | **Supported**:
 31 | *    "."      Matches any character except a newline.
 32 | *    "^"      Matches the start of the string.
 33 | *    "$"      Matches the end of the string or just before the newline at the end of the string.
 34 | *    "*"      Matches 0 or more (greedy) repetitions of the preceding RE. Greedy means that it will match as many repetitions as possible.
 35 | *    "+"      Matches 1 or more (greedy) repetitions of the preceding RE.
 36 | *    "?"      Matches 0 or 1 (greedy) of the preceding RE.
 37 | *    *?,+?,?? Non-greedy versions of the previous three special characters.
 38 | *    {m}      Matches m copies of the previous RE.  
 39 | *    {m,n}    Matches from m to n repetitions of the preceding RE.
 40 | *    {m,n}?   Non-greedy version of the above.
 41 | *    "\\"     Either escapes special characters or signals a special sequence.
 42 | *    "\\1-N"  Matches the text matched earlier by the group index.  
 43 | *    []       Indicates a set of characters.  
 44 | *    [^]      A "^" as the first character indicates a complementing set.  
 45 | *    "|"      A|B, creates an RE that will match either A or B.  
 46 | *    (...)    Matches the RE inside the parentheses. The contents can be retrieved or matched later in the string.  
 47 | *    (?ims)   Set the I, M or S flag for the RE (see below).  
 48 | *    (?:...)  Non-grouping version of regular parentheses.  
 49 | *    (?P<name>...) The substring matched by the group is accessible by name.  
 50 | *    (?P=name)     Matches the text matched earlier by the group named name.
 51 | *    (?#...)  A comment; ignored.  
 52 | *    (?=...)  Matches if ... matches next, but doesn't consume the string.  
 53 | *    (?!...)  Matches if ... doesn't match next.  
 54 | *    (?<=...) Matches if preceded by ... (must be fixed length).  
 55 | *    (?<!...) Matches if not preceded by ... (must be fixed length).  
 56 | *    (?(id/name)yes|no) Matches yes pattern if the group with id/name matched, the (optional) no pattern otherwise.  
 57 | *    \\d \\D \\w \\W \\s \\S  
 58 | *    Flag: DOTALL
 59 | *    Flag: IGNORECASE
 60 | *    Flag: MULTILINE
 61 | 
 62 | 
 63 | Some of the functions in this module take flags as optional parameters:
 64 | *    I  IGNORECASE  Perform case-insensitive matching.
 65 | *    M  MULTILINE   "^" matches the beginning of lines (after a newline) as well as the string. "$" matches the end of lines (before a newline) as well as the end of the string.
 66 | *    S  DOTALL      "." matches any character at all, including the newline.
 67 | 
 68 | ## Use
 69 | 
 70 | **C/C++**
 71 | ```C
 72 | #include "tinyre.h"
 73 | 
 74 | tre_Pattern* pattern;
 75 | tre_Match* match;
 76 | int err_code;
 77 | pattern = tre_compile("^(bb)*a", 0, &err_code);
 78 | match = tre_match(pattern, "bbbbabc", 0);
 79 | 
 80 | // Group  0: bbbba
 81 | // Group  1: bb
 82 | ```
 83 | 
 84 | **Python**
 85 | 
 86 | Edit CMakeLists.txt: change build_target to py3lib and make sure the Debug build type is disabled.
 87 | ```cmake
 88 | project (tinyre)
 89 | #set(CMAKE_BUILD_TYPE Debug)
 90 | #set(build_target demo)
 91 | set(build_target py3lib)
 92 | ```
 93 | 
 94 | ```bash
 95 | mkdir build
 96 | cd build && cmake .. && make
 97 | cp ./_tinyre.so ../lib_py3
 98 | 
 99 | cd ../lib_py3
100 | python3
101 | ```
102 | 
103 | ```Python
104 | import tre
105 | tre.match("^(bb)*a", "bbbbabc")
106 | ```
107 | 
108 | 
109 | ## Doc
110 | 
111 | [基础设计](https://github.com/fy0/tinyre/wiki/%E5%9F%BA%E7%A1%80%E8%AE%BE%E8%AE%A1)  
112 | [TODO列表](https://github.com/fy0/tinyre/wiki/todo-%E5%88%97%E8%A1%A8)  
113 | [更新记录](https://github.com/fy0/tinyre/wiki/%E6%9B%B4%E6%96%B0%E8%AE%B0%E5%BD%95)  
114 | 
115 | License：zlib
116 | 


--------------------------------------------------------------------------------
/lib_py3/pylib.c:
--------------------------------------------------------------------------------
  1 | 
  2 | #ifdef PY3LIB
  3 | #include "pylib.h"
  4 | #include "../src/tutils.h"
  5 | 
  6 | void trepy_pattern_free(PyObject* obj)
  7 | {
  8 |     tre_pattern_free((tre_Pattern*)PyCapsule_GetPointer(obj, "_tre_pattern"));
  9 | }
 10 | 
 11 | static PyObject* trepy_compile(PyObject *self, PyObject* args)
 12 | {
 13 |     char* re;
 14 |     int flag;
 15 |     int err_code;
 16 | 
 17 |     if(!PyArg_ParseTuple(args, "si", &re, &flag)) {
 18 |         Py_INCREF(Py_None);
 19 |         return Py_None;
 20 |     }
 21 | 
 22 |     tre_Pattern* ret = tre_compile(re, flag, &err_code);
 23 | 
 24 |     if (ret) {
 25 |         PyObject *o = PyCapsule_New(ret, "_tre_pattern", trepy_pattern_free);
 26 |         return o;
 27 |     } else {
 28 |         return PyLong_FromLong(err_code);
 29 |     }
 30 | }
 31 | 
 32 | static _INLINE
 33 | PyObject* tre_Match_c2py(tre_Match* m)
 34 | {
 35 |     int i;
 36 |     PyObject* t = PyTuple_New(m->groupnum);
 37 | 
 38 |     for (i = 0; i < m->groupnum; i++) {
 39 |         PyObject* t2 = PyTuple_New(3);
 40 |         if (m->groups[i].name)
 41 |             PyTuple_SetItem(t2, 0, PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, m->groups[i].name, m->groups[i].name_len));
 42 |         else {
 43 |             Py_INCREF(Py_None);
 44 |             PyTuple_SetItem(t2, 0, Py_None);
 45 |         }
 46 | 
 47 |         if (m->groups[i].head != -1) {
 48 |             PyTuple_SetItem(t2, 1, PyLong_FromLong(m->groups[i].head));
 49 |             PyTuple_SetItem(t2, 2, PyLong_FromLong(m->groups[i].tail));
 50 |         } else {
 51 |             Py_INCREF(Py_None);
 52 |             Py_INCREF(Py_None);
 53 |             PyTuple_SetItem(t2, 1, Py_None);
 54 |             PyTuple_SetItem(t2, 2, Py_None);
 55 |         }
 56 |         PyTuple_SetItem(t, i, t2);
 57 |     }
 58 | 
 59 |     tre_match_free(m);
 60 |     return t;
 61 | }
 62 | 
 63 | static PyObject* trepy_match(PyObject *self, PyObject* args)
 64 | {
 65 |     tre_Pattern* pattern;
 66 |     PyObject* obj;
 67 |     char* text;
 68 |     int backtrack_limit;
 69 | 
 70 |     if (!PyArg_ParseTuple(args, "Osi", &obj, &text, &backtrack_limit))
 71 |         return NULL;
 72 | 
 73 |     pattern = (tre_Pattern*)PyCapsule_GetPointer(obj, "_tre_pattern");
 74 | 
 75 |     tre_Match* m = tre_match(pattern, text, backtrack_limit);
 76 |     if (!m->groups) {
 77 |         tre_match_free(m);
 78 |         Py_INCREF(Py_None);
 79 |         return Py_None;
 80 |     }
 81 | 
 82 |     return tre_Match_c2py(m);
 83 | }
 84 | 
 85 | static PyMethodDef tre_methods[] ={
 86 |     {"_compile", trepy_compile, METH_VARARGS},
 87 |     {"_match", trepy_match, METH_VARARGS},
 88 |     {NULL, NULL,0,NULL}
 89 | };
 90 | 
 91 | static struct PyModuleDef module_def ={
 92 |     PyModuleDef_HEAD_INIT,
 93 |     "_tinyre",
 94 |     "Tiny Regex Engine Module",
 95 |     -1,
 96 |     tre_methods,
 97 | };
 98 | 
 99 | 
100 | static PyObject *TinyreError;
101 | 
102 | 
103 | PyMODINIT_FUNC PyInit__tinyre()
104 | {
105 |     PyObject *m;
106 |     m = PyModule_Create(&module_def);
107 | 
108 |     if (m == NULL)
109 |         return NULL;
110 | 
111 |     TinyreError = PyErr_NewException("tre.error", NULL, NULL);
112 |     Py_INCREF(TinyreError);
113 |     PyModule_AddObject(m, "error", TinyreError);
114 |     return m;
115 | }
116 | #endif
117 | 
118 | 


--------------------------------------------------------------------------------
/lib_py3/pylib.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifdef PY3LIB
 3 | 
 4 | #include "../src/tinyre.h"
 5 | #include <python3.4/Python.h>
 6 | 
 7 | PyMODINIT_FUNC PyInit__tinyre();
 8 | void trepy_free_pattern(PyObject* obj);
 9 | 
10 | #endif
11 | 
12 | 


--------------------------------------------------------------------------------
/lib_py3/tre.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | # tinyre v0.9.0 wrapper
  3 | 
  4 | import _tinyre
  5 | 
  6 | # flags
  7 | I = IGNORECASE = 2
  8 | M = MULTILINE = 8
  9 | S = DOTALL = 16
 10 | 
 11 | class error(Exception):
 12 |     msg = ''
 13 | 
 14 | class TRE_Pattern:
 15 |     @classmethod
 16 |     def __new_pattern__(self, pattern, flags=0):
 17 |         if flags > I|M|S:
 18 |             flags = (flags & I) | (flags & M) | (flags & S)
 19 |         ret = _tinyre._compile(pattern, flags)
 20 |         if type(ret) == int:
 21 |             raise error(ret)
 22 |         else:
 23 |             ptn = TRE_Pattern()
 24 |             ptn.__cpattern__ = ret
 25 |             ptn.pattern = pattern
 26 |             return ptn
 27 | 
 28 |     def match(self, text, backtrack_limit=0):
 29 |         ret = _tinyre._match(self.__cpattern__, text, backtrack_limit)
 30 |         if ret:
 31 |             return TRE_Match(text, ret)
 32 | 
 33 | 
 34 | class TRE_Match:
 35 |     def __init__(self, match_text, data):
 36 |         groupspan = []
 37 |         grouptext = []
 38 |         groupdict = {}
 39 |         default_slots = []
 40 |         for i in data:
 41 |             name, a, b = i
 42 |             span = (a, b)
 43 |             text = match_text[a:b] if a is not None else None
 44 |             if i[0]:
 45 |                 groupdict[i[0]] = (span, text)
 46 |             if span[0] is None:
 47 |                 default_slots.append(True)
 48 |             else:
 49 |                 default_slots.append(False)
 50 |             groupspan.append(span)
 51 |             grouptext.append(text)
 52 |         self.__text__ = match_text
 53 |         self.__groupspan__ = tuple(groupspan)
 54 |         self.__grouptext__ = tuple(grouptext)
 55 |         self.__groupdict__ = groupdict
 56 |         self.__default_slots__ = default_slots
 57 |         self.lastindex = None
 58 | 
 59 |         if groupspan:
 60 |             index = 0
 61 |             for i in groupspan[1:]:
 62 |                 if i[0]:
 63 |                     index += 1
 64 |                     self.lastindex = index
 65 | 
 66 |     def __get_text_by_index__(self, i):
 67 |         return self.__grouptext__[i]
 68 | 
 69 |     def __get_text_by_name__(self, i):
 70 |         if i in self.__groupdict__:
 71 |             return self.__groupdict__[i][1]
 72 |         else:
 73 |             return None
 74 | 
 75 |     def span(self, index=0):
 76 |         a, b = self.__groupspan__[index]
 77 |         if a is None:
 78 |             return -1, -1
 79 |         return a, b
 80 | 
 81 |     def group(self, *indices):
 82 |         ret = []
 83 |         if len(indices) == 0:
 84 |             indices = {0}
 85 | 
 86 |         for i in indices:
 87 |             if type(i) == int:
 88 |                 ret.append(self.__get_text_by_index__(i))
 89 |             elif type(i) == str:
 90 |                 ret.append(self.__get_text_by_name__(i))
 91 |         if len(ret) == 1:
 92 |             return ret[0]
 93 |         else:
 94 |             return tuple(ret)
 95 | 
 96 |     def groups(self, default=None):
 97 |         if default is None:
 98 |             return self.__grouptext__[1:]
 99 |         else:
100 |             ret = list(self.__grouptext__[1:])
101 |             for i in range(1, len(self.__default_slots__)):
102 |                 if self.__default_slots__[i]:
103 |                     ret[i-1] = default
104 |             return tuple(ret)
105 | 
106 |     def groupdict(self):
107 |         ret = {}
108 |         for k, v in self.__groupdict__.items():
109 |             ret[k] = v[1]
110 |         return ret
111 | 
112 |     def start(self, index=0):
113 |         return self.span(index)[0]
114 | 
115 |     def end(self, index=0):
116 |         return self.span(index)[1]
117 | 
118 |     def string(self):
119 |         return self.__text__
120 | 
121 | 
def compile(pattern, flags=0):
    """Compile ``pattern`` into a ``TRE_Pattern``; ``flags`` combines I, M and S."""
    compiled = TRE_Pattern.__new_pattern__(pattern, flags)
    return compiled
124 | 
125 | 
def match(pattern, text, flags=0, backtrack_limit=0):
    """Match ``text`` against ``pattern``.

    ``pattern`` may be a ``TRE_Pattern`` (``flags`` is then unused) or a str,
    which is compiled with ``flags`` first. Any other pattern type yields None.
    """
    if pattern.__class__ == TRE_Pattern:
        compiled = pattern
    elif pattern.__class__ == str:
        compiled = compile(pattern, flags)
    else:
        return None
    return compiled.match(text, backtrack_limit)
133 | 
134 | 


--------------------------------------------------------------------------------
/src/demo.c:
--------------------------------------------------------------------------------
 1 | ﻿/*
 2 |  * start : 2012-4-8 09:57
 3 |  * update: 2015-12-10 v0.9.0
 4 |  *
 5 |  * tinyre
 6 |  * fy, 2012-2015
 7 |  *
 8 |  */
 9 | 
10 | #include "tutils.h"
11 | #include "tdebug.h"
12 | 
13 | #include "tinyre.h"
14 | 
15 | 
16 | #ifdef DEMO
17 | 
18 | int main(int argc,char* argv[])
19 | {
20 |     int i;
21 |     int err_code;
22 |     tre_Pattern* pattern;
23 |     tre_Match* match = NULL;
24 |     platform_init();
25 | 
26 |     pattern = tre_compile("1(2)[3]", FLAG_DOTALL, &err_code);
27 | 
28 |     if (pattern) {
29 |         match = tre_match(pattern, "123", 5000);
30 | 
31 |         if (match->groups) {
32 |             putchar('\n');
33 |             for (i = 0; i < match->groupnum; i++) {
34 |                 printf("Group %2d: ", i);
35 |                 if (match->groups[i].name) {
36 |                     printf("(");
37 |                     output_str(match->groups[i].name, match->groups[i].name_len);
38 |                     printf(") ");
39 |                 } else printf("(null) ");
40 |                 printf("%d %d\n", match->groups[i].head, match->groups[i].tail);
41 |                 if (match->groups[i].head != -1) {
42 |                     debug_printstr(match->str, match->groups[i].head, match->groups[i].tail);
43 |                 } else {
44 |                     printf("match failed.");
45 |                 }
46 |                 printf("\n");
47 |             }
48 |         }
49 |     } else {
50 |         tre_err(err_code);
51 |     }
52 | 
53 |     if (pattern) {
54 |         tre_pattern_free(pattern);
55 |         if (match) {
56 |             tre_match_free(match);  
57 |         }
58 |     }
59 | 
60 |     return 0;
61 | }
62 | 
63 | #endif
64 | 


--------------------------------------------------------------------------------
/src/lib/platform.c:
--------------------------------------------------------------------------------
 1 | 
#include "platform.h"

#include <locale.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
 6 | 
 7 | 
 8 | #ifdef PLATFORM_WINDOWS
 9 | #include <windows.h>
10 | 
11 | wchar_t* _utf8_to_16(const char* str) {
12 |     int nwLen = MultiByteToWideChar(CP_UTF8, 0, str, -1, NULL, 0);
13 | 
14 |     wchar_t *pwBuf = malloc(sizeof(wchar_t) * (nwLen + 1));
15 |     memset(pwBuf, 0, nwLen * 2 + 2);
16 | 
17 |     MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), pwBuf, nwLen);
18 | 
19 |     return pwBuf;
20 | }
21 | #endif
22 | 
23 | 
/*
 * printf() for UTF-8 text.
 *
 * On Windows the formatted UTF-8 output is converted with _utf8_to_16() and
 * written through wprintf(); elsewhere it is passed straight to vprintf().
 * Nothing is printed if formatting or an allocation fails.
 */
void printf_u8(const char *fmt, ...) {
#if defined(PLATFORM_WINDOWS)
    va_list args;
    int size;
    char *buf;
    wchar_t *wide;

    /* First pass: measure the formatted length. */
    va_start(args, fmt);
    size = vsnprintf(NULL, 0, fmt, args);
    va_end(args);
    if (size < 0)
        return;

    buf = malloc((size_t)size + 1);
    if (buf == NULL)
        return;

    /* Second pass: format into the exactly sized buffer. */
    va_start(args, fmt);
    vsnprintf(buf, (size_t)size + 1, fmt, args);
    va_end(args);

    wide = _utf8_to_16(buf);
    if (wide != NULL) {
        /* Print as data: the text is already formatted, so any '%' in it
           must not be interpreted as a conversion specifier. */
        wprintf(L"%ls", wide);
        free(wide);
    }

    free(buf);
#else
    va_list args;
    va_start(args, fmt);
    vprintf(fmt, args);
    va_end(args);
#endif
}
51 | 
/* One-time start-up hook: selects the environment's LC_CTYPE locale. */
void platform_init() {
    const char *from_environment = "";

    setlocale(LC_CTYPE, from_environment);
}
55 | 
56 | 


--------------------------------------------------------------------------------
/src/lib/platform.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef PLATFORM_UTILS_H
 3 | #define PLATFORM_UTILS_H
 4 | 
 5 | #if defined(_WIN32) && !defined(_WIN32_WCE)
 6 | #define PLATFORM_WINDOWS  /* enable goodies for regular Windows */
 7 | #endif
 8 | 
 9 | #ifdef _MSC_VER
10 | #define _INLINE
11 | #pragma execution_character_set("utf-8")
12 | #else
13 | #define _INLINE inline
14 | #endif
15 | 
16 | void printf_u8(const char *fmt, ...);
17 | 
18 | void platform_init();
19 | 
20 | #endif
21 | 


--------------------------------------------------------------------------------
/src/lib/utf8_lite.c:
--------------------------------------------------------------------------------
  1 | ﻿
  2 | #include "utf8_lite.h"
  3 | 
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <string.h>
  7 | #include <stdint.h>
  8 | 
  9 | #ifdef _MSC_VER
 10 | #include <Windows.h>
 11 | #endif
 12 | 
 13 | 
 14 | /*
 15 | ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
 16 | */
 17 | const char *utf8_decode(const char *o, int *val) {
 18 |     static const unsigned int limits[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF };
 19 |     const unsigned char *s = (const unsigned char *)o;
 20 |     unsigned int c = s[0];
 21 |     unsigned int res = 0;  /* final result */
 22 |     if (c < 0x80)  /* ascii? */
 23 |         res = c;
 24 |     else {
 25 |         int count = 0;  /* to count number of continuation bytes */
 26 |         while (c & 0x40) {  /* still have continuation bytes? */
 27 |             int cc = s[++count];  /* read next byte */
 28 |             if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
 29 |                 return NULL;  /* invalid byte sequence */
 30 |             res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
 31 |             c <<= 1;  /* to test next bit */
 32 |         }
 33 |         res |= ((c & 0x7F) << (count * 5));  /* add first byte */
 34 |         if (count > 3 || res > MAXUNICODE || res <= limits[count])
 35 |             return NULL;  /* invalid byte sequence */
 36 |         s += count;  /* skip continuation bytes read */
 37 |     }
 38 |     if (val) *val = res;
 39 |     return (const char *)s + 1;  /* +1 to include first byte */
 40 | }
 41 | 
 42 | 
 43 | int utf8_len(const char *s) {
 44 |     int code;
 45 |     int len = 0, rlen = strlen(s);
 46 |     const char* s_end = s + rlen + 1;
 47 | 
 48 |     for (const char *p = utf8_decode(s, &code); p != s_end; p = utf8_decode(p, &code)) {
 49 |         len += 1;
 50 |     }
 51 |     
 52 |     return len;
 53 | }
 54 | 
 55 | 
 56 | char* ucs4_to_utf8(int code) {
 57 |     const char  abPrefix[] = {0, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
 58 |     const int adwCodeUp[] = {
 59 |         0x80,           // U+00000000 ～ U+0000007F
 60 |         0x800,          // U+00000080 ～ U+000007FF
 61 |         0x10000,        // U+00000800 ～ U+0000FFFF
 62 |         0x200000,       // U+00010000 ～ U+001FFFFF
 63 |         0x4000000,      // U+00200000 ～ U+03FFFFFF
 64 |         0x80000000      // U+04000000 ～ U+7FFFFFFF
 65 |     };
 66 | 
 67 |     int i, ilen;
 68 | 
 69 |     // 根据UCS4编码范围确定对应的UTF-8编码字节数
 70 |     ilen = sizeof(adwCodeUp) / sizeof(uint32_t);
 71 |     for(i = 0; i < ilen; i++) {
 72 |         if( code < adwCodeUp[i] ) break;
 73 |     }
 74 | 
 75 |     if (i == ilen) return NULL;    // 无效的UCS4编码
 76 | 
 77 |     ilen = i + 1;   // UTF-8编码字节数
 78 |     char* pbUTF8 = malloc(sizeof(char) * (ilen+1));
 79 | 
 80 |     if (pbUTF8 != NULL) {   // 转换为UTF-8编码
 81 |         for( ; i > 0; i-- ) {
 82 |             pbUTF8[i] = (char)((code & 0x3F) | 0x80);
 83 |             code >>= 6;
 84 |         }
 85 | 
 86 |         pbUTF8[0] = (char)(code | abPrefix[ilen - 1]);
 87 |     }
 88 | 
 89 |     /*for (i = 0; i < ilen; i++) {
 90 |         printf("%2x ", pbUTF8[i]);
 91 |     }*/
 92 |     pbUTF8[ilen] = 0;
 93 |     
 94 |     return pbUTF8;
 95 | }
 96 | 
 97 | uint32_t* utf8_to_ucs4_str(const char *s, int *plen) {
 98 |     int code;
 99 |     const char *p = s;
100 |     int len = utf8_len(s);
101 |     uint32_t *buf = malloc((len+1) * sizeof(uint32_t));
102 | 
103 |     for (int i = 0; i < len;++i) {
104 |         p = utf8_decode(p, &code);
105 |         buf[i] = (uint32_t)code;
106 |     }
107 | 
108 |     buf[len] = '\0';
109 |     if (plen) *plen = len;
110 |     return buf;
111 | }
112 | 


--------------------------------------------------------------------------------
/src/lib/utf8_lite.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef UTF8_LITE_H
 3 | #define UTF8_LITE_H
 4 | 
 5 | #include <stdint.h>
 6 | 
 7 | #define MAXUNICODE    0x10FFFF 
 8 | 
 9 | const char *utf8_decode (const char *o, int *val);
10 | int utf8_len(const char *s);
11 | 
12 | char* ucs4_to_utf8(int code);
13 | 
14 | uint32_t* utf8_to_ucs4_str(const char *s, int *plen);
15 | 
16 | #endif
17 | 


--------------------------------------------------------------------------------
/src/tdebug.c:
--------------------------------------------------------------------------------
  1 | ﻿
  2 | #include "tvm.h"
  3 | #include "tutils.h"
  4 | #include "tlexer.h"
  5 | #include "tparser.h"
  6 | #include "tdebug.h"
  7 | 
  8 | void putcode(uint32_t code) {
  9 |     //putwchar((wchar_t)code);
 10 |     if (code < 0xff) {
 11 |         putchar((char)code);
 12 |     } else {
 13 |         char* ret = ucs4_to_utf8(code);
 14 |         printf_u8("%s", ret);
 15 |         free(ret);
 16 |     }
 17 | }
 18 | 
 19 | void output_str(uint32_t *str, int len) {
 20 |     for (int i = 0; i < len; ++i) {
 21 |         putcode(str[i]);
 22 |     }
 23 | }
 24 | 
 25 | void debug_token_print(tre_Lexer *lex) {
 26 |     int err;
 27 |     uint32_t tval;
 28 | 
 29 |     printf("token list:\n");
 30 |     while (true) {
 31 |         err = tre_lexer_next(lex);
 32 |         if (err) {
 33 |             tre_err(err);
 34 |             return;
 35 |         }
 36 |         tval = lex->token.value;
 37 | 
 38 |         //printf("    %12d ", tval, lex->token.extra.code);
 39 |         if (tval < FIRST_TOKEN) {
 40 |             printf("%12s  ", "<SIGN>");
 41 |             putchar(tval);
 42 |             switch (tval) {
 43 |                 case '(':
 44 |                     printf("    GroupType:%d    ", lex->token.extra.group_type);
 45 |                     if (lex->token.extra.group_type == GT_BACKREF_CONDITIONAL_INDEX) {
 46 |                         printf("Index:%d", lex->token.extra.index);
 47 |                     }
 48 |                     break;
 49 |                 case '{':
 50 |                     printf("    {%d, %d}", lex->token.extra.code, lex->token.extra.code2);
 51 |                     break;
 52 |                 case '[':
 53 |                     printf("%5d", lex->token.extra.code);
 54 |                     break;
 55 |                 case '-':
 56 |                     printf("    ");
 57 |                     putcode(lex->token.extra.code);
 58 |                     putchar('-');
 59 |                     putcode(lex->token.extra.code2);
 60 |                     break;
 61 |             }
 62 |         } else {
 63 |             if (tval == TK_CHAR) {
 64 |                 printf("%12s  ", "<CHAR>");
 65 |                 putcode(lex->token.extra.code);
 66 |             } else if (tval == TK_CHAR_SPE) {
 67 |                 printf("%12s  ", "<CHAR_SPE>");
 68 |                 if (lex->token.extra.code != '.') putchar('\\');
 69 |                 putcode(lex->token.extra.code);
 70 |             } else if (tval == TK_BACK_REF) {
 71 |                 printf("%12s  ", "<BACK_REF>");
 72 |                 printf("%d", lex->token.extra.code);
 73 |             } else if (tval == TK_COMMENT) {
 74 |                 printf("%12s  ", "<COMMENT>");
 75 |                 printf("#");
 76 |             } else if (tval == TK_NOP) {
 77 |                 printf("%12s  ", "<NOP>");
 78 |                 printf("@");
 79 |             } else if (tval == TK_END) {
 80 |                 putchar('\n');
 81 |                 break;
 82 |             }
 83 |         }
 84 |         putchar('\n');
 85 |     }
 86 |     putchar('\n');
 87 | }
 88 | 
 89 | // Debug helper: prints the instruction list of every group in `groups` to stdout.
 90 | // The last node of each list (the one whose `next` is NULL) is never printed.
 91 | void debug_ins_list_print(ParserMatchGroup* groups) {
 92 |     int index = 0;
 93 |     ParserMatchGroup *grp = groups;
 94 | 
 95 |     while (grp) {
 96 |         if (index != 0) {
 97 |             // the extra value is shown only for look-behind and conditional back-reference groups
 98 |             int has_extra = grp->group_type == GT_IF_PRECEDED_BY
 99 |                     || grp->group_type == GT_IF_NOT_PRECEDED_BY
100 |                     || grp->group_type == GT_BACKREF_CONDITIONAL_INDEX
101 |                     || grp->group_type == GT_BACKREF_CONDITIONAL_GROUPNAME;
102 |             printf_u8("\nInstructions : Group %d (%d)", index, grp->group_type);
103 |             if (has_extra) printf_u8(" [%d]", grp->group_extra);
104 |             putchar('\n');
105 |         } else {
106 |             printf_u8("\nInstructions : Group 0\n");
107 |         }
108 |         index++;
109 | 
110 |         for (INS_List* node = grp->codes_start; node->next; node = node->next) {
111 |             int *args = (int*)node->data;
112 |             switch (node->ins) {
113 |                 case INS_CMP:
114 |                 case INS_CMP_SPE:
115 |                     printf_u8("%15s ", node->ins == INS_CMP ? "CMP" : "CMP_SPE");
116 |                     putcode(args[0]);
117 |                     putchar('\n');
118 |                     break;
119 |                 case INS_CMP_MULTI:
120 |                 case INS_NCMP_MULTI:
121 |                     printf_u8("%15s %d ", node->ins == INS_CMP_MULTI ? "CMP_MULTI" : "NCMP_MULTI", args[0]);
122 |                     // args[0] is the entry count, followed by 3 ints per entry;
123 |                     // the first entry is printed even when the count is below 1
124 |                     for (int i = 0; i == 0 || i < args[0]; i++) {
125 |                         int *entry = args + i * 3 + 1;
126 |                         if (i == 0) printf_u8("%6d    ", entry[0]);
127 |                         else printf_u8("                    %4d    ", entry[0]);
128 |                         putcode(entry[1]);
129 |                         if (entry[0] == '-') {
130 |                             printf(" ");
131 |                             putcode(entry[2]);
132 |                         }
133 |                         putchar('\n');
134 |                     }
135 |                     break;
136 |                 case INS_CMP_BACKREF:
137 |                     printf_u8("%15s %d\n", "CMP_BACKREF", args[0]); break;
138 |                 case INS_CMP_GROUP:
139 |                     printf_u8("%15s %d\n", "CMP_GROUP", args[0]); break;
140 |                 case INS_CHECK_POINT:
141 |                     printf_u8("%15s %d %d\n", "CHECK_POINT", args[0], args[1]); break;
142 |                 case INS_CHECK_POINT_NO_GREED:
143 |                     printf_u8("%15s %d %d\n", "CHECK_POINT_NG", args[0], args[1]); break;
144 |                 case INS_MATCH_START:
145 |                     printf_u8("%15s\n", "MATCH_START"); break;
146 |                 case INS_MATCH_END:
147 |                     printf_u8("%15s\n", "MATCH_END"); break;
148 |                 case INS_GROUP_END:
149 |                     printf_u8("%15s %d\n", "GROUP_END", args[0]); break;
150 |             }
151 |         }
152 | 
153 |         printf_u8("\n");
154 |         grp = grp->next;
155 |     }
156 | }
156 | 
157 | // Prints str[head] .. str[tail - 1] with putcode(); prints nothing when tail <= head.
158 | void debug_printstr(uint32_t *str, int head, int tail) {
159 |     if (tail <= head) return;
160 | 
161 |     for (int pos = head; pos < tail; pos++) {
162 |         putcode(str[pos]);
163 |     }
164 | }
165 | 


--------------------------------------------------------------------------------
/src/tdebug.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef TINYRETRE_DEBUG_H
 3 | #define TINYRETRE_DEBUG_H
 4 | 
 5 | #include "tlexer.h"
 6 | 
 7 | //#define TRE_DEBUG  // uncomment to make TRE_DEBUG_PRINT below forward to printf_u8
 8 | 
 9 | struct tre_Token;
10 | struct ParserMatchGroup;
11 | 
12 | void putcode(uint32_t code); // presumably writes a single code point to stdout -- definition not shown here, confirm
13 | void output_str(uint32_t *str, int len); // presumably writes `len` code points of `str` -- confirm against the definition
14 | 
15 | void debug_token_print(tre_Lexer *lexer); // prints the tokens produced by `lexer`
16 | void debug_ins_list_print(struct ParserMatchGroup* groups); // prints the instruction list of every group
17 | 
18 | void debug_printstr(uint32_t *str, int head, int tail); // prints str[head] .. str[tail - 1] via putcode(); nothing when tail <= head
19 | // TRE_DEBUG_PRINT(fmt, ...) forwards to printf_u8 when TRE_DEBUG is defined and expands to nothing otherwise
20 | #ifdef TRE_DEBUG
21 | #define TRE_DEBUG_PRINT( ...) printf_u8(__VA_ARGS__)
22 | #else
23 | #define TRE_DEBUG_PRINT(expression, ...)
24 | #endif
25 | 
26 | #endif
27 | 
28 | 


--------------------------------------------------------------------------------
/src/tinyre.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * start : 2012-4-8 09:57
  3 |  * update: 2015-12-10 v0.9.0
  4 |  *
  5 |  * tinyre
  6 |  * fy, 2012-2015
  7 |  *
  8 |  */
  9 | 
 10 | #include "tutils.h"
 11 | #include "tlexer.h"
 12 | #include "tparser.h"
 13 | #include "tvm.h"
 14 | #include "tdebug.h"
 15 | 
 16 | void tre_err(int err_code) {
 17 |     switch (err_code) {
 18 |         case ERR_LEXER_UNBALANCED_PARENTHESIS:
 19 |             printf_u8("input error: unbalanced parenthesis.\n");
 20 |             break;
 21 |         case ERR_LEXER_UNEXPECTED_END_OF_PATTERN:
 22 |             printf_u8("input error: unexpected end of pattern.\n");
 23 |             break;
 24 |         case ERR_LEXER_UNKNOW_SPECIFIER:
 25 |             printf_u8("input error: unknown specifier.\n");
 26 |             break;
 27 |         case ERR_LEXER_BAD_GROUP_NAME:
 28 |             printf_u8("input error: bad group name\n");
 29 |             break;
 30 |         case ERR_LEXER_UNICODE_ESCAPE:
 31 |             printf_u8("input error: unicode escape failed, requires 4 chars(\\u0000).\n");
 32 |             break;
 33 |         case ERR_LEXER_UNICODE6_ESCAPE:
 34 |             printf_u8("input error: unicode escape failed, requires 8 chars(\\u00000000).\n");
 35 |             break;
 36 |         case ERR_LEXER_HEX_ESCAPE:
 37 |             printf_u8("input error: hex escape failed, requires 2 chars(\\x00).\n");
 38 |             break;
 39 |         case ERR_LEXER_BAD_GROUP_NAME_IN_BACKREF:
 40 |             printf_u8("input error: bad group name in backref\n");
 41 |             break;
 42 |         case ERR_LEXER_INVALID_GROUP_NAME_OR_INDEX:
 43 |             printf_u8("input error: invalid group name or index\n");
 44 |             break;
 45 |         case ERR_LEXER_REDEFINITION_OF_GROUP_NAME:
 46 |             printf_u8("input error: redefinition of group name\n");
 47 |             break;
 48 |         case ERR_PARSER_REQUIRES_FIXED_WIDTH_PATTERN:
 49 |             printf_u8("input error: look-behind requires fixed-width pattern\n");
 50 |             break;
 51 |         case ERR_PARSER_BAD_CHARACTER_RANGE:
 52 |             printf_u8("input error: bad character range\n");
 53 |             break;
 54 |         case ERR_PARSER_NOTHING_TO_REPEAT:
 55 |             printf_u8("input error: nothing to repeat\n");
 56 |             break;
 57 |         case ERR_PARSER_IMPOSSIBLE_TOKEN:
 58 |             printf_u8("input error: impossible token\n");
 59 |             break;
 60 |         case ERR_PARSER_UNKNOWN_GROUP_NAME:
 61 |             printf_u8("input error: unknow group name\n");
 62 |             break;
 63 |         case ERR_PARSER_CONDITIONAL_BACKREF:
 64 |             printf_u8("input error: conditional backref with more than two branches\n");
 65 |             break;
 66 |         case ERR_PARSER_INVALID_GROUP_INDEX:
 67 |             printf_u8("input error: invalid group index in conditional backref\n");
 68 |             break;
 69 |         default:
 70 |             printf_u8("parsering falied!!!\n");
 71 |     }
 72 | }
 73 | 
 74 | // Compiles the pattern `s`; `flag` is OR-ed with the flags set inline in the pattern.
 75 | // Returns NULL on failure and stores the code reported by tre_parser() in *err_code
 76 | // (err_code may be NULL; it is left untouched on success).
 77 | tre_Pattern* tre_compile(char* s, int flag, int* err_code) {
 78 |     int ret, len;
 79 |     tre_Pattern* groups;
 80 |     uint32_t* buf = utf8_to_ucs4_str(s, &len);
 81 |     tre_Lexer* lexer = tre_lexer_new(buf, len);
 82 | 
 83 | //#define TRE_DEBUG_LEXER
 84 | #ifdef TRE_DEBUG_LEXER
 85 |     debug_token_print(lexer);
 86 |     tre_lexer_free(lexer); // this early return used to leak the lexer and the buffer
 87 |     free(buf);
 88 |     return 0;
 89 | #endif
 90 | 
 91 |     groups = tre_parser(lexer, &ret);
 92 |     if (groups == NULL) {
 93 |         if (err_code) *err_code = ret;
 94 |     } else {
 95 |         groups->flag = flag | lexer->extra_flag;
 96 |     }
 97 | 
 98 |     tre_lexer_free(lexer);
 99 |     free(buf);
100 |     return groups;
101 | }
102 | 
103 | tre_Match* tre_match(tre_Pattern* tp, const char* str, int backtrack_limit)
104 | { // runs the VM over `str`, then wraps its group results in a newly allocated tre_Match
105 |     VMState* state = vm_init(tp, str, backtrack_limit);
106 |     tre_GroupResult* captured = vm_exec(state);
107 |     tre_Match* result = tre_new(tre_Match, 1);
108 |     result->str = state->input_str; // taken from the VM state before the state is freed
109 |     result->groups = captured;
110 |     result->groupnum = state->group_num;
111 |     vm_free(state);
112 |     return result;
113 | }
114 | 
115 | void tre_pattern_free(tre_Pattern *ptn) { // frees a pattern and all its groups; NULL is a no-op, like free()
116 |     if (!ptn) return; // tre_compile() returns NULL on failure
117 | 
118 |     for (int i = 0; i < ptn->num_all; i++) {
119 |         free(ptn->groups[i].codes);
120 |         free(ptn->groups[i].name);
121 |     }
122 | 
123 |     free(ptn->groups);
124 |     free(ptn);
125 | }
126 | 
127 | void tre_match_free(tre_Match *m) { // frees a match, its group names and its string; NULL is a no-op, like free()
128 |     if (!m) return;
129 |     if (m->groups) { // groups may be NULL, in which case there are no names to release
130 |         for (int i = 0; i < m->groupnum; i++) {
131 |             free(m->groups[i].name);
132 |         }
133 |     }
134 |     free(m->str);
135 |     free(m->groups);
136 |     free(m);
137 | }
138 | 


--------------------------------------------------------------------------------
/src/tinyre.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * tinyre v0.9.0
 3 |  * fy, 2012-2015
 4 |  *
 5 |  */
 6 | 
 7 | #ifndef TINYRE_H
 8 | #define TINYRE_H
 9 | 
10 | #include <stdint.h>
11 | 
12 | enum tre_Flag { // bit flags; OR them together for the `flag` argument of tre_compile
13 |     FLAG_NONE = 0,
14 |     //FLAG_TEMPLATE = 1,
15 |     FLAG_IGNORECASE = 2, // can also be set inside a pattern with (?i)
16 |     //FLAG_LOCALE = 4,
17 |     FLAG_MULTILINE = 8, // can also be set inside a pattern with (?m)
18 |     FLAG_DOTALL = 16, // can also be set inside a pattern with (?s)
19 |     //FLAG_UNICODE = 32,
20 |     //FLAG_VERBOSE = 64,
21 |     //FLAGTRE_DEBUG = 128,
22 |     // single-letter aliases carrying the same values as the long names above
23 |     //FLAG_T = 1,
24 |     FLAG_I = 2,
25 |     //FLAG_L = 4,
26 |     FLAG_M = 8,
27 |     FLAG_S = 16,
28 |     //FLAG_U = 32,
29 |     //FLAG_X = 64,
30 | };
31 | 
32 | /* compiled groups */
33 | typedef struct MatchGroup {
34 |     uint32_t* name; // released with free() by tre_pattern_free
35 |     int name_len;
36 |     int type; // presumably a GROUP_TYPE value -- TODO confirm against the parser
37 |     int extra;
38 |     uint32_t* codes; // released with free() by tre_pattern_free
39 | } MatchGroup;
40 | 
41 | typedef struct tre_Pattern {
42 |     int num; // group num
43 |     int num_all; // group num include non-grouping parentheses (?:) (?=) ..
44 |     MatchGroup* groups; // array; tre_pattern_free walks num_all entries of it
45 |     int flag; // set by tre_compile: its flag argument OR-ed with the flags set inline in the pattern
46 | } tre_Pattern;
47 | 
48 | /* result returned after matching */
49 | 
50 | typedef struct tre_GroupResult {
51 |     uint32_t *name; // released with free() by tre_match_free
52 |     int name_len;
53 |     int head; // presumably the start offset of the group in tre_Match.str -- TODO confirm in tvm.c
54 |     int tail; // presumably the end offset (exclusive) -- TODO confirm in tvm.c
55 | } tre_GroupResult;
56 | 
57 | 
58 | typedef struct tre_Match {
59 |     int groupnum; // copied from the VM state's group_num by tre_match
60 |     uint32_t* str; // the VM state's input_str; released by tre_match_free
61 |     tre_GroupResult* groups; // result of vm_exec; tre_match_free accepts NULL here
62 | } tre_Match;
63 | 
64 | /* pattern compilation and matching */
65 | tre_Pattern* tre_compile(char* s, int flag, int* err_code);
66 | tre_Match* tre_match(tre_Pattern* tp, const char* str, int backtrack_limit);
67 | 
68 | /* releasing memory */
69 | void tre_pattern_free(tre_Pattern *ptn);
70 | void tre_match_free(tre_Match *m);
71 | 
72 | /* miscellaneous */
73 | void tre_err(int err_code);
74 | 
75 | #endif
76 | 
77 | 


--------------------------------------------------------------------------------
/src/tlexer.c:
--------------------------------------------------------------------------------
  1 | ﻿
  2 | #include "tlexer.h"
  3 | #include "tutils.h"
  4 | 
  5 | int read_int(tre_Lexer *lex, char end_terminal, int *plen);
  6 | int read_hex(tre_Lexer *lex, int len, bool *p_isok);
  7 | 
  8 | uint32_t char_next(tre_Lexer *lex) { // returns the code point at the cursor and advances past it
  9 |     return *(lex->s + lex->scur++);
 10 | }
 11 | 
 12 | uint32_t char_nextn(tre_Lexer *lex, int n) {
 13 |     // advance by n, then hand back the last code point that was stepped over
 14 |     lex->scur += n;
 15 |     return lex->s[lex->scur - 1];
 16 | }
 17 | 
 18 | uint32_t char_lookahead(tre_Lexer *lex) { // peeks at the cursor without consuming anything
 19 |     return *(lex->s + lex->scur);
 20 | }
 21 | 
 22 | uint32_t char_lookaheadn(tre_Lexer *lex, int n) { // peeks n - 1 positions past the cursor (n == 1 is the cursor itself)
 23 |     return *(lex->s + lex->scur + n - 1);
 24 | }
 25 | 
 26 | _INLINE static
 27 | bool token_check(uint32_t code) {
 28 |     // characters that form a token on their own in the normal lexer state
 29 |     static const char operators[] = "^$*+?[]{()|";
 30 | 
 31 |     for (const char *op = operators; *op != '\0'; op++) {
 32 |         if (code == (uint32_t)*op) return true;
 33 |     }
 34 |     return false;
 35 | }
 36 | 
 37 | _INLINE static
 38 | bool is_spe_char(uint32_t code) { // true for the letter of \d \D \w \W \s \S
 39 |     switch (code) {
 40 |         case 'D': case 'd': case 'W': case 'w': case 'S': case 's':
 41 |             return true;
 42 |     }
 43 |     return false;
 44 | }
 45 | 
 46 | _INLINE static
 47 | int try_get_escape(uint32_t code) { // control code for the letter of \a \b \f \n \r \t \v; any other code is returned unchanged
 48 |     switch (code) {
 49 |         case 'a': return 7;   case 'b': return 8;   case 'f': return 12;
 50 |         case 'n': return 10;  case 'r': return 13;  case 't': return 9;
 51 |         case 'v': return 11;
 52 |     }
 53 |     return code;
 54 | }
 55 | 
 56 | 
 57 | _INLINE static uint8_t _hex(uint32_t code) { // value of a hex digit, or 255 when `code` is not one
 58 |     if ('0' <= code && code <= '9') return code - '0';
 59 |     if ('A' <= code && code <= 'F') return 10 + (code - 'A');
 60 |     if ('a' <= code && code <= 'f') return 10 + (code - 'a');
 61 |     return 255;
 62 | }
 63 | 
 64 | _INLINE static uint8_t _oct(uint32_t code) { // value of an octal digit, or 255 when `code` is not one
 65 |     bool is_digit = ('0' <= code) && (code <= '7');
 66 |     return is_digit ? code - '0' : 255;
 67 | }
 68 | 
 69 | _INLINE static uint8_t _bin(uint32_t code) {
 70 |     // value of a binary digit, or 255 when `code` is neither '0' nor '1'
 71 |     return (code == '0' || code == '1') ? (uint8_t)(code - '0') : 255;
 72 | }
 73 | 
 74 | _INLINE static uint8_t _dec(uint32_t code) { // value of a decimal digit, or 255 when `code` is not one
 75 |     if (code < '0' || code > '9') return 255;
 76 |     return code - '0';
 77 | }
 78 | 
 79 | 
 80 | _INLINE static
 81 | int _read_x_int(const uint32_t *start, const uint32_t *end, int n, uint8_t(*func)(uint32_t code), int max_size) {
 82 |     // Base-n value of the digits in [start, end) ([start, start + max_size) when max_size > 0); `func` returns 255 for a non-digit, which stops the conversion.
 83 |     // Unsigned Horner accumulation: no pow() rounding, no signed overflow on 8 hex digits, and an empty range yields 0 without reading *start.
 84 |     const uint32_t *stop = (max_size > 0) ? start + max_size : end;
 85 |     uint32_t value = 0;
 86 |     for (const uint32_t *p = start; p < stop; p++) {
 87 |         uint8_t digit = (*func)(*p);
 88 |         if (digit == 255) break;
 89 |         value = value * (uint32_t)n + digit;
 90 |     }
 91 |     return (int)value;
 92 | }
 93 | 
 94 | // Reads a run of decimal digits at the cursor without consuming it.
 95 | // Returns the value, or -1 when there is no digit or when `end_terminal` is
 96 | // non-zero and the code point right after the digits differs from it.
 97 | // *plen (if plen is non-NULL) receives the digit count, 0 on failure.
 98 | int read_int(tre_Lexer *lex, char end_terminal, int *plen) {
 99 |     const uint32_t *start = lex->s + lex->scur;
100 |     const uint32_t *p = start;
101 | 
102 |     // explicit range test: isdigit() is undefined for code points outside the unsigned char range
103 |     while (*p >= '0' && *p <= '9') ++p;
104 | 
105 |     // the terminator sits directly after the last digit, i.e. at *p (the old offset only worked for one digit)
106 |     if (p == start || (end_terminal && *p != (uint32_t)end_terminal)) {
107 |         if (plen) *plen = 0;
108 |         return -1;
109 |     }
110 | 
111 |     if (plen) *plen = p - start;
112 |     return _read_x_int(start, p, 10, _dec, 0);
113 | }
114 | 
115 | // Checks that exactly `len` hex digits start at the cursor (nothing is consumed).
116 | // Stores the outcome in *p_isok and returns the digits' value, or 0 on failure.
117 | int read_hex(tre_Lexer *lex, int len, bool *p_isok) {
118 |     const uint32_t *start = lex->s + lex->scur;
119 |     const uint32_t *p = start;
120 |     int count = 0;
121 | 
122 |     while (count < len && _hex(*p) != 255) {
123 |         ++p;
124 |         ++count;
125 |     }
126 | 
127 |     // was `p_isok = false`, which reset the local pointer instead of the caller's flag
128 |     *p_isok = (len > 0 && count == len);
129 |     if (!*p_isok) return 0;
130 | 
131 |     return _read_x_int(start, p, 16, _hex, 0);
132 | }
133 | 
134 | 
135 | _INLINE static // fills lex->token for the already-consumed code point `code`; returns 0 or an ERR_LEXER_* code
136 | int token_char_accept(tre_Lexer *lex, uint32_t code, bool use_back_ref) { // use_back_ref: a non-zero \N becomes TK_BACK_REF instead of a char
137 |     if (code == '\\') {
138 |         // escape sequences need special handling
139 |         if (lex->scur == lex->slen) {
140 |             // the backslash is the last character: treat it as an ordinary character
141 |             lex->token.extra.code = code;
142 |             lex->token.value = TK_CHAR;
143 |         } else {
144 |             // otherwise look at the next character
145 |             code = char_lookahead(lex);
146 |             if (is_spe_char(code)) {
147 |                 // it is a special matching character (\d, \w, ...): done
148 |                 lex->token.extra.code = code;
149 |                 lex->token.value = TK_CHAR_SPE;
150 |                 code = char_next(lex);
151 |             } else {
152 |                 // otherwise try a hex/unicode escape
153 |                 int num, len;
154 |                 bool is_ok = false;
155 | 
156 |                 if (code == 'x') {
157 |                     code = char_next(lex);
158 |                     num = read_hex(lex, 2, &is_ok);
159 |                     if (!is_ok) return ERR_LEXER_HEX_ESCAPE;
160 |                     char_nextn(lex, 2);
161 |                 } else if (code == 'u') {
162 |                     code = char_next(lex);
163 |                     num = read_hex(lex, 4, &is_ok);
164 |                     if (!is_ok) return ERR_LEXER_UNICODE_ESCAPE;
165 |                     char_nextn(lex, 4);
166 |                 } else if (code == 'U') {
167 |                     code = char_next(lex);
168 |                     num = read_hex(lex, 8, &is_ok); // unicode 6.0 \U0000000A
169 |                     if (!is_ok) return ERR_LEXER_UNICODE6_ESCAPE;
170 |                     char_nextn(lex, 8);
171 |                 }
172 | 
173 |                 if (is_ok) { // only true after a successful \x, \u or \U read above
174 |                     lex->token.value = TK_CHAR;
175 |                     lex->token.extra.code = num;
176 |                 } else {
177 |                     num = read_int(lex, 0, &len);
178 |                     if (num != -1) {
179 |                         // back reference or normal char
180 |                         if (use_back_ref) {
181 |                             if (num == 0) {
182 |                                 lex->token.value = TK_CHAR;
183 |                                 lex->token.extra.code = 0;
184 |                             } else {
185 |                                 lex->token.value = TK_BACK_REF;
186 |                                 lex->token.extra.index = num;
187 |                             }
188 |                         } else {
189 |                             lex->token.value = TK_CHAR;
190 |                             lex->token.extra.code = num;
191 |                         }
192 |                         char_nextn(lex, len);
193 |                     } else {
194 |                         // neither an escape nor a back reference: just a plain character
195 |                         lex->token.value = TK_CHAR; // NOTE(review): try_get_escape() is not called here, so \n, \t, ... end up as the plain letters -- confirm intent
196 |                         lex->token.extra.code = code;
197 |                         char_next(lex);
198 |                     }
199 |                 }
200 |             }
201 |         }
202 |     } else {
203 |         // not an escape: the simple case
204 |         lex->token.extra.code = code;
205 |         lex->token.value = (code == '.') ? TK_CHAR_SPE : TK_CHAR;
206 |     }
207 |     return 0;
208 | }
209 | 
210 | _INLINE static
211 | int char_to_flag(uint32_t code) { // flag bit for an inline flag letter (i, m, s); 0 for any other code
212 |     return (code == 'i') ? FLAG_IGNORECASE
213 |          : (code == 'm') ? FLAG_MULTILINE
214 |          : (code == 's') ? FLAG_DOTALL
215 |          : 0;
216 | }
217 | 
218 | #define lex_isidentfirst(c) (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || ((c) == '_') || ((c) >= 128)) // may start a group name; was (c >= '_'), which also let ` { | } ~ through
219 | #define lex_isidentletter(c) (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || ((c) >= '0' && (c) <= '9') || ((c) == '_') || ((c) >= 128)) // may continue a group name
220 | 
221 | // Scans an identifier starting at the cursor (nothing is consumed) that must be
222 | // followed directly by `end_terminal`. Returns a newly allocated, 0-terminated
223 | // copy of the name, or NULL when the first code point cannot start a name or
224 | // the terminator is missing. On success *plen (if plen is non-NULL) receives
225 | // the name length, not counting the trailing 0.
226 | uint32_t* read_group_name(tre_Lexer *lex, char end_terminal, int *plen) {
227 |     const uint32_t *begin = lex->s + lex->scur;
228 |     uint32_t *name;
229 |     int count = 0; // name code points accepted so far
230 |     int size;
231 | 
232 |     if (!lex_isidentfirst(begin[0])) return NULL;
233 | 
234 |     while (lex_isidentletter(begin[count])) count++;
235 |     if (begin[count] != end_terminal) return NULL;
236 | 
237 |     // copy the name together with its terminator, then overwrite the terminator with 0
238 |     size = count + 1;
239 |     name = tre_new(uint32_t, size);
240 |     memcpy(name, begin, size * sizeof(uint32_t));
241 |     name[count] = '\0';
242 | 
243 |     if (plen) *plen = count;
244 |     return name;
245 | }
246 | 
247 | int tre_lexer_next(tre_Lexer* lex) { // reads one token into lex->token; returns 0 on success or an ERR_LEXER_* code
248 |     int len;
249 |     uint32_t code;
250 |     uint32_t* name;
251 |     if (lex->scur == lex->slen) {
252 |         lex->token.value = TK_END;
253 |         return 0;
254 |     }
255 |     code = char_next(lex);
256 |     bool is_lastone = (lex->scur == lex->slen); // true when `code` was the final code point of the input
257 | 
258 |     switch (lex->state) {
259 |         case 0: // NORMAL STATE
260 |             if (token_check(code)) {
261 |                 lex->token.extra.code = 0;
262 |                 lex->token.value = code; // token val is its own ascii.
263 | 
264 |                 switch (code) {
265 |                     case '[':
266 |                         lex->state = 1;
267 |                         if ((!is_lastone) && char_lookahead(lex) == '^') {
268 |                             lex->token.extra.code = 1; // marks a class opened with "[^"; the '^' itself is not consumed here
269 |                         }
270 |                         break;
271 |                     case '{': {
272 |                         int count;
273 |                         int scur_bak = lex->scur; // cursor to restore when the braces turn out not to be a repeat count
274 |                         int llimit = 0, rlimit = -1;
275 | 
276 |                         // read left limit a{1
277 |                         llimit = read_int(lex, 0, &count);
278 |                         if (count == 0) goto __bad_token;
279 |                         code = char_nextn(lex, count+1);
280 | 
281 |                         // read comma a{1,
282 |                         if ((char)code == ',') {
283 |                             //char_next(lex);
284 |                         } else if ((char)code == '}') {
285 |                             rlimit = llimit;
286 |                             goto __write_code;
287 |                         } else {
288 |                             // failed, rollback
289 |                             goto __bad_token;
290 |                         }
291 | 
292 |                         // read right limit a{1, 2
293 |                         rlimit = read_int(lex, 0, &count);
294 |                         code = char_nextn(lex, count+1);
295 | 
296 |                         // read right brace a{1,2} or a{1,}
297 |                         if ((char)code == '}') {
298 |                             // ok; rlimit is -1 in the a{1,} form
299 |                         } else {
300 |                             // failed, rollback
301 |                             goto __bad_token;
302 |                         }
303 | 
304 |                     __write_code:
305 |                         lex->token.extra.code = llimit;
306 |                         lex->token.extra.code2 = rlimit;
307 |                         break;
308 | 
309 |                     __bad_token:
310 |                         lex->token.value = TK_CHAR;
311 |                         lex->token.extra.code = '{';
312 |                         lex->scur = scur_bak;
313 |                         break;
314 |                     }
315 |                     case '(': {
316 |                         code = char_lookahead(lex);
317 |                         // if next char is not ?
318 |                         if (code != '?') {
319 |                             lex->token.extra.group_type = GT_NORMAL;
320 |                             lex->token.extra.group_name = NULL;
321 |                             break;
322 |                         } else {
323 |                             code = char_nextn(lex, 2); // skips the '?' and returns the specifier character after it
324 |                             switch (code) {
325 |                                 case '#': { // just comment
326 |                                     bool is_escape = false;
327 |                                     code = char_next(lex);
328 |                                     while (!(!is_escape && code == ')')) { // NOTE(review): is_escape is cleared before this test runs again, so an escaped ')' still ends the comment -- confirm intent
329 |                                         code = char_next(lex);
330 |                                         if (is_escape) is_escape = false;
331 |                                         if (code == '\\') is_escape = true;
332 |                                         if (code == '\0') return ERR_LEXER_UNBALANCED_PARENTHESIS;
333 |                                     }
334 |                                     lex->token.value = TK_COMMENT;
335 |                                     break;
336 |                                 }
337 |                                 case ':': lex->token.extra.group_type = GT_NONGROUPING; break;
338 |                                 case '=': lex->token.extra.group_type = GT_IF_MATCH; break;
339 |                                 case '!': lex->token.extra.group_type = GT_IF_NOT_MATCH; break;
340 |                                 case '(':
341 |                                     // code for conditional backref
342 |                                     name = read_group_name(lex, ')', &len);
343 |                                     if (name) {
344 |                                         code = char_nextn(lex, len);
345 |                                         lex->token.extra.group_type = GT_BACKREF_CONDITIONAL_GROUPNAME;
346 |                                         lex->token.extra.group_name = name;
347 |                                         lex->token.extra.group_name_len = len;
348 |                                     } else {
349 |                                         int i = read_int(lex, ')', &len);
350 |                                         if (i == -1) {
351 |                                             return ERR_LEXER_INVALID_GROUP_NAME_OR_INDEX;
352 |                                         } else {
353 |                                             code = char_nextn(lex, len);
354 |                                             lex->token.extra.group_type = GT_BACKREF_CONDITIONAL_INDEX;
355 |                                             lex->token.extra.index = i;
356 |                                         }
357 |                                     }
358 |                                     code = char_next(lex); // consumes the ')' that closes the condition
359 |                                     break;
360 |                                 case 'P':
361 |                                     // group name
362 |                                     code = char_lookahead(lex);
363 |                                     if (code == '<') {
364 |                                         code = char_next(lex);
365 |                                         name = read_group_name(lex, '>', &len);
366 |                                         if (!name) return ERR_LEXER_BAD_GROUP_NAME;
367 |                                         code = char_nextn(lex, len+1); // name and '>'
368 | 
369 |                                         lex->token.extra.group_type = GT_NORMAL;
370 |                                         lex->token.extra.group_name = name;
371 |                                         lex->token.extra.group_name_len = len;
372 |                                     } else if (code == '=') {
373 |                                         // code for back reference (?P=)
374 |                                         code = char_next(lex);
375 |                                         name = read_group_name(lex, ')', &len);
376 |                                         if (!name) return ERR_LEXER_BAD_GROUP_NAME_IN_BACKREF;
377 |                                         code = char_nextn(lex, len); // skip name
378 | 
379 |                                         lex->token.extra.group_type = GT_BACKREF;
380 |                                         lex->token.extra.group_name = name;
381 |                                         lex->token.extra.group_name_len = len;
382 |                                     } else {
383 |                                         return ERR_LEXER_UNKNOW_SPECIFIER;
384 |                                     }
385 |                                     break;
386 |                                 case '<':
387 |                                     code = char_next(lex);
388 |                                     if (code == '=') {
389 |                                         lex->token.extra.group_type = GT_IF_PRECEDED_BY;
390 |                                     } else if (code == '!') {
391 |                                         lex->token.extra.group_type = GT_IF_NOT_PRECEDED_BY;
392 |                                     } else {
393 |                                         return ERR_LEXER_UNKNOW_SPECIFIER;
394 |                                     }
395 |                                     break;
396 |                                 default:
397 |                                     if (char_to_flag(code)) { // inline flags such as (?i) or (?ims)
398 |                                         int flag = 0;
399 |                                         while (true) {
400 |                                             flag = char_to_flag(code);
401 |                                             if (flag) lex->extra_flag |= flag;
402 |                                             else break; // NOTE(review): the first non-flag character is consumed without checking that it is ')' -- confirm
403 |                                             code = char_next(lex);
404 |                                         }
405 |                                     } else {
406 |                                         return ERR_LEXER_UNEXPECTED_END_OF_PATTERN;
407 |                                     }
408 |                                     lex->token.value = TK_NOP;
409 |                                     break;
410 |                             }
411 |                         }
412 |                     }
413 |                 };
414 |             } else {
415 |                 int ret = token_char_accept(lex, code, true);
416 |                 if (ret) return ret;
417 |             }
418 |             break;
419 |         case 1: { // [...]
420 |             bool is_escape = code == '\\';
421 |             int ret = token_char_accept(lex, code, false);
422 |             if (ret) return ret;
423 | 
424 |             if (!is_escape && lex->token.value == TK_CHAR) {
425 |                 // end the state
426 |                 if (code == ']') {
427 |                     lex->state = 0;
428 |                     lex->token.value = ']';
429 |                     break;
430 |                 }
431 |             }
432 | 
433 |             // [a-z] grammar
434 |             code = char_lookahead(lex);
435 |             if (code == '-') {
436 |                 uint32_t code2 = char_lookaheadn(lex, 2);
437 |                 // [a-]
438 |                 if (code2 == ']') break;
439 | 
440 |                 // [\s-1] -> error
441 |                 if (lex->token.value == TK_CHAR_SPE) {
442 |                     return ERR_LEXER_BAD_CHARACTER_RANGE;
443 |                 }
444 | 
445 |                 // [a-z]
446 |                 code2 = lex->token.extra.code; // keep the lower bound; the token is overwritten by the next call
447 |                 code = char_nextn(lex, 2);
448 |                 ret = token_char_accept(lex, code, false);
449 |                 if (ret) return ret;
450 | 
451 |                 // [1-\s] -> error
452 |                 if (lex->token.value == TK_CHAR_SPE) {
453 |                     return ERR_LEXER_BAD_CHARACTER_RANGE;
454 |                 }
455 | 
456 |                 // [z-a] -> error
457 |                 if (lex->token.extra.code < code2) {
458 |                     return ERR_LEXER_BAD_CHARACTER_RANGE;
459 |                 }
460 | 
461 |                 // everything is ok
462 |                 lex->token.value = '-'; // range token: extra.code is the lower bound, extra.code2 the upper bound
463 |                 lex->token.extra.code2 = lex->token.extra.code;
464 |                 lex->token.extra.code = code2;
465 |             }
466 |             break;
467 |         }
468 |     }
469 |     return 0;
470 | }
471 | 
472 | // Pre-scan: counts every "(" not followed by "?" plus every "(?P<", skipping escapes and the contents of [...]; returns -1 for an unclosed class.
473 | int tre_check_groups(uint32_t *s, int len) {
474 |     int num = 0;
475 |     for (int i = 0; i < len; ++i) {
476 |         if (s[i] == '\\') i++;
477 |         else if (s[i] == '(') {
478 |             if (i + 1 < len && s[i + 1] == '?') {
479 |                 if (i + 3 < len && s[i + 2] == 'P' && s[i + 3] == '<') {
480 |                     i += 2;
481 |                     num++;
482 |                 } else if (i + 2 < len && s[i + 2] == '(') i += 2;
483 |                 i++;
484 |             } else num++;
485 |         } else if (s[i] == '[') {
486 |             bool closed = false;
487 |             while (!closed && ++i < len) { // was `while (i++)`: never ran for a class at index 0 and was not bounded by len
488 |                 if (s[i] == ']') closed = true;
489 |                 else if (s[i] == '\0') return -1;
490 |                 else if (s[i] == '\\') i++;
491 |             }
492 |             if (!closed) return -1;
493 |         }
494 |     }
495 |     return num;
496 | }
497 | 
498 | // Allocates a lexer over the code points s[0 .. len - 1]; `s` is kept by pointer, not copied.
499 | // A NULL `s` gives an empty lexer (the input used to be scanned before the NULL check).
500 | tre_Lexer* tre_lexer_new(uint32_t *s, int len) {
501 |     tre_Lexer* lex = tre_new(tre_Lexer, 1);
502 |     lex->extra_flag = 0;
503 |     lex->state = 0;
504 |     lex->scur = 0;
505 |     lex->s = s;
506 |     lex->slen = s ? len : 0;
507 |     // one more than the pre-scanned group count; 1 when there is no input
508 |     lex->max_normal_group_num = (s ? tre_check_groups(s, len) : 0) + 1;
509 | 
510 |     return lex;
511 | }
512 | 
513 | void tre_lexer_free(tre_Lexer *lex) { // releases only the lexer object; the input buffer `s` is not freed here
514 |     free(lex);
515 | }
516 | 
517 | 


--------------------------------------------------------------------------------
/src/tlexer.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef TINYRE_LEXER_H
 3 | #define TINYRE_LEXER_H
 4 | 
 5 | #include "tutils.h"
 6 | 
 7 | #define FIRST_TOKEN    128
 8 | 
 9 | enum TOKEN_LIST { // Named token ids, numbered consecutively from FIRST_TOKEN (128). NOTE(review): the per-enumerator meanings below are inferred from the names only — confirm against tlexer.c.
10 |     TK_CHAR = FIRST_TOKEN, // presumably an ordinary literal character
11 |     TK_CHAR_SPE, // presumably a special character escape — TODO confirm
12 |     TK_BACK_REF, // presumably a back-reference by group index
13 |     TK_NBACK_REF, // presumably a back-reference by group name
14 |     TK_EQ_REF,
15 |     TK_NE_REF,
16 |     TK_COMMENT, // presumably a pattern comment
17 |     TK_NOP,
18 |     TK_END // presumably end of pattern
19 | };
20 | 
21 | enum GROUP_TYPE { // Kinds of parenthesised group. NOTE(review): the syntax shown per entry is inferred from the enumerator names — confirm against tlexer.c.
22 |     GT_NORMAL = 0, // presumably a plain capturing group "(...)"
23 |     GT_NONGROUPING = 1, // presumably "(?:...)"
24 |     GT_BACKREF, // presumably "(?P=name)"
25 |     GT_IF_MATCH, // presumably look-ahead "(?=...)"
26 |     GT_IF_NOT_MATCH, // presumably negative look-ahead "(?!...)"
27 |     GT_IF_PRECEDED_BY, // presumably look-behind "(?<=...)"
28 |     GT_IF_NOT_PRECEDED_BY, // presumably negative look-behind "(?<!...)"
29 |     GT_BACKREF_CONDITIONAL_INDEX, // presumably conditional on a group index, "(?(1)...)"
30 |     GT_BACKREF_CONDITIONAL_GROUPNAME, // presumably conditional on a group name, "(?(name)...)"
31 | };
32 | 
33 | typedef struct TokenInfo { // Extra payload carried by a token; embedded in tre_Token as `extra`.
34 |     uint32_t index; // presumably a group index — TODO confirm
35 |     uint32_t code; // filled together with code2 when the lexer emits a '-' token (presumably the two ends of a character range)
36 |     uint32_t code2; // see code
37 |     uint32_t group_type; // presumably one of enum GROUP_TYPE
38 |     uint32_t* group_name; // presumably the group's name as code points; ownership is not visible here
39 |     int group_name_len; // presumably the number of elements in group_name
40 | } TokenInfo;
41 | 
42 | typedef struct tre_Token { // A single lexer token: a value plus its extra payload.
43 |     uint32_t value; // the lexer stores plain character codes such as '-' here; presumably also enum TOKEN_LIST ids (which start at 128)
44 |     TokenInfo extra; // additional data for the token, see TokenInfo
45 | } tre_Token;
46 | 
47 | 
48 | typedef struct tre_Lexer { // Lexer state over a pattern held as an array of uint32_t; created by tre_lexer_new(), released by tre_lexer_free().
49 |     tre_Token token; // presumably the most recently produced token (written by tre_lexer_next)
50 |     int extra_flag; // set to 0 by tre_lexer_new
51 |     const uint32_t *s; // pattern buffer; borrowed — tre_lexer_free() frees only this struct
52 |     int scur; // starts at 0; presumably the current read offset into s
53 |     int slen; // length of s as passed to tre_lexer_new
54 |     int state; // 0 NORMAL | 1 [...] (presumably: inside a character class)
55 |     int max_normal_group_num; // tre_check_groups(s, len) + 1, computed in tre_lexer_new
56 |     //TokenGroupName* group_names;
57 | } tre_Lexer;
58 | 
59 | int tre_check_groups(uint32_t *s, int len);
60 | int tre_lexer_next(tre_Lexer *lex);
61 | 
62 | tre_Lexer* tre_lexer_new(uint32_t *s, int len);
63 | void tre_lexer_free(tre_Lexer *lex);
64 | 
65 | #define ERR_LEXER_UNBALANCED_PARENTHESIS        -3 // Lexer error codes; all are negative. NOTE(review): where they are returned is not visible in this header — presumably tre_lexer_next.
66 | #define ERR_LEXER_UNEXPECTED_END_OF_PATTERN     -4
67 | #define ERR_LEXER_UNKNOW_SPECIFIER              -5 // sic ("UNKNOW"): spelling kept because renaming would break users of the macro
68 | #define ERR_LEXER_BAD_GROUP_NAME                -6
69 | #define ERR_LEXER_UNICODE_ESCAPE                -7
70 | #define ERR_LEXER_UNICODE6_ESCAPE               -8
71 | #define ERR_LEXER_HEX_ESCAPE                    -9
72 | #define ERR_LEXER_BAD_GROUP_NAME_IN_BACKREF     -10
73 | #define ERR_LEXER_INVALID_GROUP_NAME_OR_INDEX   -11
74 | #define ERR_LEXER_REDEFINITION_OF_GROUP_NAME    -12
75 | #define ERR_LEXER_BAD_CHARACTER_RANGE           -52 // same value as ERR_PARSER_BAD_CHARACTER_RANGE in tparser.h
76 | 
77 | #endif
78 | 
79 | 


--------------------------------------------------------------------------------
/src/tparser.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fy0/tinyre/485d194331eba4c97f9d8aa46deff88939ed8910/src/tparser.c


--------------------------------------------------------------------------------
/src/tparser.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef TINYRE_PARSER_H
 3 | #define TINYRE_PARSER_H
 4 | 
 5 | #include "tinyre.h"
 6 | #include "tlexer.h"
 7 | 
 8 | typedef struct INS_List { // Node of a singly linked list (linked through `next`); presumably one emitted instruction per node.
 9 |     int len; // presumably the number of elements in data — TODO confirm
10 |     uint32_t ins; // presumably the instruction code
11 |     uint32_t* data; // presumably the instruction's operands
12 |     struct INS_List* next; // next node in the list
13 | } INS_List;
14 | 
15 | typedef struct OR_List { // Node of a singly linked list of instruction lists; presumably one node per '|' alternative of a group.
16 |     INS_List* codes; // instruction list held by this node
17 |     struct OR_List* next; // next node in the list
18 | } OR_List;
19 | 
20 | typedef struct ParserMatchGroup { // Per-group parser record; records are chained through `next`.
21 |     uint32_t* name; // presumably the group's name as code points (unset for unnamed groups — TODO confirm)
22 |     int name_len; // presumably the number of elements in name
23 | 
24 |     INS_List* codes; // presumably the current (tail) instruction node of this group
25 |     INS_List* codes_start; // presumably the first instruction node of this group
26 |     int group_type; // presumably one of enum GROUP_TYPE
27 |     int group_extra; // used by (?<=) (?<!)
28 |     int or_num; // presumably the number of entries in or_list
29 |     OR_List* or_list; // list of instruction lists, see OR_List
30 |     struct ParserMatchGroup* next; // next group record
31 | } ParserMatchGroup;
32 | 
33 | typedef struct tre_Parser { // Parser state. NOTE(review): field roles are inferred from names and types — confirm against tparser.c.
34 |     tre_Lexer *lex; // lexer that supplies the tokens
35 |     int error_code; // presumably one of the ERR_PARSER_* / ERR_LEXER_* codes, 0 when no error
36 |     int avaliable_group; // sic ("avaliable"); presumably the next free group index
37 | 
38 |     ParserMatchGroup* m_start; // presumably the head of the group record list
39 |     ParserMatchGroup* m_cur; // presumably the group record currently being filled
40 | 
41 |     bool is_count_width; // presumably enables width counting for look-behind (see ERR_PARSER_REQUIRES_FIXED_WIDTH_PATTERN)
42 |     int match_width; // presumably the width counted so far
43 | } tre_Parser;
44 | 
45 | tre_Pattern* compact_group(ParserMatchGroup* parser_groups);
46 | tre_Pattern* tre_parser(tre_Lexer *lexer,int* perror_code);
47 | 
48 | // look-behind requires fixed-width pattern
49 | #define ERR_PARSER_REQUIRES_FIXED_WIDTH_PATTERN   -51
50 | // bad character range
51 | #define ERR_PARSER_BAD_CHARACTER_RANGE            -52
52 | // nothing to repeat
53 | #define ERR_PARSER_NOTHING_TO_REPEAT              -53
54 | // impossible token
55 | #define ERR_PARSER_IMPOSSIBLE_TOKEN               -54
56 | // unknown group name
57 | #define ERR_PARSER_UNKNOWN_GROUP_NAME             -55
58 | // conditional backref with more than two branches
59 | #define ERR_PARSER_CONDITIONAL_BACKREF            -56
60 | // invalid group index in conditional backref
61 | #define ERR_PARSER_INVALID_GROUP_INDEX            -57
62 | 
63 | #endif
64 | 


--------------------------------------------------------------------------------
/src/tutils.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * tinyre v0.9.0
 3 |  * fy, 2012-2015
 4 |  *
 5 |  */
 6 | 
 7 | #ifndef TINYRE_UTILS_H
 8 | #define TINYRE_UTILS_H
 9 | 
10 | #include "lib/platform.h"
11 | #include "lib/utf8_lite.h"
12 | #include "tinyre.h"
13 | 
14 | #include <stdio.h>
15 | #include <stdlib.h>
16 | #include <stdbool.h>
17 | #include <string.h>
18 | #include <ctype.h>
19 | #include <stdarg.h>
20 | #include <stdint.h>
21 | #include <math.h>
22 | #include <wchar.h>
23 | 
/* Allocate an uninitialised array of `count_` objects of type `type_` with
 * malloc. Evaluates to a `type_*`, or NULL when malloc fails. */
#define tre_new(type_, count_) ((type_*)malloc(sizeof(type_) * (count_)))

/* Growable stack of fixed-size elements. The element type is not stored; it is
 * passed to every stack_* macro by the caller. */
typedef struct tre_Stack {
    void* data;  // element storage, allocated by stack_init / stack_check
    int top;     // index of the top element, -1 when the stack is empty
    int len;     // capacity of data, in elements
} tre_Stack;


/* Initialise `_s` with room for `_len` elements (no allocation when _len is 0). */
#define stack_init(_s, _type, _len) { (_s).data = (_len) ? tre_new(_type, (_len)) : NULL; (_s).top = -1; (_s).len = (_len); }
/* Pointer to the top element; only meaningful when the stack is not empty. */
#define stack_get_top(_s, _type) ((_type*)((_s).data) + (_s).top)
/* Non-zero when the stack holds no element. */
#define stack_empty(_s) ((_s).top == -1)
/* Advance top and evaluate to a pointer to the new top slot. Call stack_check
 * first: this macro does not grow the storage. */
#define stack_push(_s, _type) ((_type*)((_s).data) + ++(_s).top)
/* Evaluate to a pointer to the current top element, then drop it. */
#define stack_pop(_s, _type) ((_type*)((_s).data) + (_s).top--)
/* Make sure the next stack_push has a slot, growing by `_step` elements when
 * the stack is full. The previous test (`top == len`) fired one push too late,
 * so a push wrote one element past the allocation before it grew.
 * realloc(NULL, n) behaves like malloc(n), so the empty case needs no branch.
 * NOTE(review): a failed realloc is still not detected here. */
#define stack_check(_s, _type, _step) { if ((_s).top + 1 >= (_s).len) { (_s).len += (_step); (_s).data = realloc((_s).data, sizeof(_type) * (_s).len); } }
/* Release the storage. The trailing ';' is part of the macro and is kept so
 * existing call sites written without one keep compiling. */
#define stack_free(_s) free((_s).data);
/* Allocate `_dest` storage with the capacity of `_s` and copy the live
 * elements. NOTE(review): _dest.top and _dest.len are NOT set here; the
 * caller presumably assigns them. */
#define stack_copy(_s, _dest, _type) { (_dest).data = tre_new(_type, (_s).len); memcpy((_dest).data, (_s).data, sizeof(_type) * ((_s).top+1)); }
41 | 
42 | #endif
43 | 
44 | 


--------------------------------------------------------------------------------
/src/tvm.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fy0/tinyre/485d194331eba4c97f9d8aa46deff88939ed8910/src/tvm.c


--------------------------------------------------------------------------------
/src/tvm.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fy0/tinyre/485d194331eba4c97f9d8aa46deff88939ed8910/src/tvm.h


--------------------------------------------------------------------------------