├── .github └── workflows │ └── main.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── match.c ├── test.lua └── tre.h /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: C/C++ CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: make 17 | run: make 18 | - uses: leafo/gh-actions-lua@v5 19 | - name: lua test.lua 20 | run: lua test.lua 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | match 2 | match.exe 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to <https://unlicense.org> 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS:=-O2 -Wall -Wextra -Wvla -Wsign-conversion -pedantic -std=c99 2 | APPNAME:=match 3 | 4 | ifeq ($(OS),Windows_NT) 5 | APPNAME:=$(APPNAME).exe 6 | CC:=gcc 7 | RM:=del /Q 8 | endif 9 | 10 | all: match 11 | 12 | match: match.c tre.h 13 | $(CC) $(CFLAGS) -o $@ $< 14 | 15 | clean: 16 | $(RM) $(APPNAME) 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tiny-regex-mod 2 | Single file modification of [tiny-regex-c](https://github.com/kokke/tiny-regex-c) by Kokke 3 | 4 | Adds a few features and removes some minor issues: 5 | - made it into a single file library 6 | - modified to return a pointer to the start of the match (instead of an integer) 7 | - added option to get a pointer to the end of the match 8 | - removed static use of regex object 9 | - added quantifier operator `{m,n}` (also `{m}`, `{m,}`) 10 | - added lazy quantifiers `??`, `*?`, `+?` and `{m,n}?` 11 | - merged quantifier (?,*,+,{}) matching into two functions (one for greedy, one for lazy) 12 | - added upper limits to quantifiers 13 | - (hopefully) fixed class range matching 14 | - (hopefully) fixed handling of escaped characters 15 | - (hopefully) fixed `.` matching (doesn't match `\r` or `\n`) 16 | - probably butchered print functionality 17 | -------------------------------------------------------------------------------- /match.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | //#define TRE_STATIC 4 | #define TRE_IMPLEMENTATION 5 | #include "tre.h" 6 | //#undef TRE_STATIC 7 | //#undef TRE_IMPLEMENTATION 8 | 9 | int main(int argc, char **argv) 10 | { 11 | if (argc < 3) 12 | { 13 | printf("Usage: %s pattern string\n", argv[0]); 14 | return -1; 15 | } 16 | tre_comp tregex; 17 | tre_compile(argv[1], &tregex); 18 | tre_print(&tregex); 19 | 20 | const char *string = argv[2]; 21 | const char *end; 22 | const char *start = tre_match(&tregex, string, &end); 23 | 24 | if (start) 25 | { 26 | printf("match start: %zu match end: %zu\n", start - string, end - string); 27 | return 1; 28 | } 29 | else 30 | { 31 | printf("no match\n"); 32 | return 0; 33 | } 34 | } -------------------------------------------------------------------------------- /test.lua: -------------------------------------------------------------------------------- 1 | local _sf = string.format 2 | local commandfmt = [[./match '%s' '%s']] 3 | local printfmt = [[%s : "%s", "%s", %s]] 4 | 5 | local _x = function(s) 6 | return s:gsub('\\', '\\\\'):gsub('\n', '\\n'):gsub('\r', '\\r'):gsub('\t', '\\t') 7 | end 8 | 9 | local OK = "OK" 10 | local NOK = "NOK" 11 | local exit_status = true 12 | 13 | local test_tre = function(expected, pattern, sample) 14 | local result 15 | local pipe = io.popen(string.format(commandfmt, pattern, sample)) 16 | pipe:read('*all') 17 | local _, exit, match = pipe:close() 18 | 19 | pattern, sample = _x(pattern), _x(sample) 20 | if exit == "exit" then 21 | if (match == 1 and expected == OK) or (match == 0 and expected == NOK) then 22 | result = "PASS" 23 | else 24 | result = "FAIL" 25 | exit_status = false 26 | end 27 | print(_sf(printfmt, result, pattern, sample, expected)) 28 | else 29 | exit_status = false 30 | print(_sf(printfmt, "ERROR", pattern, sample, expected)) 31 | print(_sf("\t%s code:%s", exit, match)) 32 | end 33 | end 34 | 35 | 36 | local test_vector = 
37 | { 38 | { OK, "\\d", "5" }, 39 | { OK, "\\w+", "hej" }, 40 | { OK, "\\s", "\t \n" }, 41 | { NOK, "\\S", "\t \n" }, 42 | { OK, "[\\s]", "\t \n" }, 43 | { NOK, "[\\S]", "\t \n" }, 44 | { NOK, "\\D", "5" }, 45 | { NOK, "\\W+", "hej" }, 46 | { OK, "[0-9]+", "12345" }, 47 | { OK, "\\D", "hej" }, 48 | { NOK, "\\d", "hej" }, 49 | { OK, "[^\\w]", "\\" }, 50 | { OK, "[\\W]", "\\" }, 51 | { NOK, "[\\w]", "\\" }, 52 | { OK, "[^\\d]", "d" }, 53 | { NOK, "[\\d]", "d" }, 54 | { NOK, "[^\\D]", "d" }, 55 | { OK, "[\\D]", "d" }, 56 | { OK, "^.*\\\\.*$", "c:\\Tools" }, 57 | { OK, "^[\\+-]*[\\d]+$", "+27" }, 58 | { OK, "[abc]", "1c2" }, 59 | { NOK, "[abc]", "1C2" }, 60 | { OK, "[1-5]+", "0123456789" }, 61 | { OK, "[.2]", "1C2" }, 62 | { OK, "a*$", "Xaa" }, 63 | { OK, "a*$", "Xaa" }, 64 | { OK, "[a-h]+", "abcdefghxxx" }, 65 | { NOK, "[a-h]+", "ABCDEFGH" }, 66 | { OK, "[A-H]+", "ABCDEFGH" }, 67 | { NOK, "[A-H]+", "abcdefgh" }, 68 | { OK, "[^\\s]+", "abc def" }, 69 | { OK, "[^fc]+", "abc def" }, 70 | { OK, "[^d\\sf]+", "abc def" }, 71 | { OK, "\n", "abc\ndef" }, 72 | { OK, "b.\\s*\n", "aa\r\nbb\r\ncc\r\n\r\n" }, 73 | { OK, ".*c", "abcabc" }, 74 | { OK, ".+c", "abcabc" }, 75 | { OK, "[b-z].*", "ab" }, 76 | { OK, "b[k-z]*", "ab" }, 77 | { NOK, "[0-9]", " - " }, 78 | { OK, "[^0-9]", " - " }, 79 | { OK, "0|", "0|" }, 80 | { NOK, "\\d\\d:\\d\\d:\\d\\d", "0s:00:00" }, 81 | { NOK, "\\d\\d:\\d\\d:\\d\\d", "000:00" }, 82 | { NOK, "\\d\\d:\\d\\d:\\d\\d", "00:0000" }, 83 | { NOK, "\\d\\d:\\d\\d:\\d\\d", "100:0:00" }, 84 | { NOK, "\\d\\d:\\d\\d:\\d\\d", "00:100:00" }, 85 | { NOK, "\\d\\d:\\d\\d:\\d\\d", "0:00:100" }, 86 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "0:0:0" }, 87 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "0:00:0" }, 88 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "0:0:00" }, 89 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "00:0:0" }, 90 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "00:00:0" }, 91 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "00:0:00" }, 92 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "0:00:00" }, 93 | { OK, 
"\\d\\d?:\\d\\d?:\\d\\d?", "00:00:00" }, 94 | { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world !" }, 95 | { OK, "[Hh]ello [Ww]orld\\s*[!]?", "hello world !" }, 96 | { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello World !" }, 97 | { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world! " }, 98 | { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world !" }, 99 | { OK, "[Hh]ello [Ww]orld\\s*[!]?", "hello World !" }, 100 | { NOK, "\\d\\d?:\\d\\d?:\\d\\d?", "a:0" }, 101 | --[[]] 102 | { OK, "[^\\w][^-1-4]", ")T" }, 103 | { OK, "[^\\w][^-1-4]", ")^" }, 104 | { OK, "[^\\w][^-1-4]", "*)" }, 105 | { OK, "[^\\w][^-1-4]", "!." }, 106 | { OK, "[^\\w][^-1-4]", " x" }, 107 | { OK, "[^\\w][^-1-4]", "$b" }, 108 | --[[]] 109 | { OK, ".?bar", "real_bar" }, 110 | { NOK, ".?bar", "real_foo" }, 111 | { NOK, "X?Y", "Z" }, 112 | } 113 | 114 | 115 | for i, v in pairs(test_vector) do test_tre(table.unpack(v)) end 116 | os.exit(exit_status) -------------------------------------------------------------------------------- /tre.h: -------------------------------------------------------------------------------- 1 | // Public Domain Tiny Regular Expressions Library 2 | // Forked from https://github.com/kokke/tiny-regex-c 3 | // 4 | // Supports: 5 | // --------- 6 | // '^' Start anchor, matches start of string 7 | // '$' End anchor, matches end of string 8 | // --------- 9 | // '*' Asterisk, match zero or more (greedy, *? lazy) 10 | // '+' Plus, match one or more (greedy, +? lazy) 11 | // '{m,n}' Quantifier, match min. 'm' and max. 'n' (greedy, {m,n}? lazy) 12 | // '{m}' exactly 'm' 13 | // '{m,}' match min 'm' and max. MAX_QUANT 14 | // '?' Question, match zero or one (greedy, ?? lazy) 15 | // --------- 16 | // '.' 
Dot, matches any character except newline (\r, \n) 17 | // '[abc]' Character class, match if one of {'a', 'b', 'c'} 18 | // '[^abc]' Inverted class, match if NOT one of {'a', 'b', 'c'} 19 | // '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z } 20 | // '\s' Whitespace, \t \f \r \n \v and spaces 21 | // '\S' Non-whitespace 22 | // '\w' Alphanumeric, [a-zA-Z0-9_] 23 | // '\W' Non-alphanumeric 24 | // '\d' Digits, [0-9] 25 | // '\D' Non-digits 26 | // '\X' Character itself; X in [^sSwWdD] (e.g. '\\' is '\') 27 | // --------- 28 | 29 | 30 | #ifndef TRE_H_INCLUDE 31 | #define TRE_H_INCLUDE 32 | 33 | #ifndef TRE_STATIC 34 | #define TRE_DEF extern 35 | #else 36 | #define TRE_DEF static 37 | #endif 38 | 39 | #ifdef __cplusplus 40 | extern "C" { 41 | #endif 42 | 43 | #define TRE_MAX_NODES 64 // Max number of regex nodes in expression. 44 | #define TRE_MAX_BUFLEN 128 // Max length of character-class buffer. 45 | 46 | //#define TRE_SILENT // disable inclusion of stdio and printing 47 | //#define TRE_DOTANY // dot matches anything including newline 48 | 49 | typedef struct tre_node tre_node; 50 | typedef struct tre_comp tre_comp; 51 | 52 | struct tre_node 53 | { 54 | unsigned char type; 55 | union 56 | { 57 | char ch; // character 58 | char *cc; // character class buffer 59 | unsigned short mn[2]; // {m,n} quantifier 60 | } u; 61 | }; 62 | 63 | struct tre_comp 64 | { 65 | tre_node nodes[TRE_MAX_NODES]; 66 | char buffer[TRE_MAX_BUFLEN]; 67 | }; 68 | 69 | // Compile regex string pattern as tre_comp struct tregex 70 | TRE_DEF int tre_compile(const char *pattern, tre_comp *tregex); 71 | 72 | // Match tregex in text and return the match start or null if there is no match 73 | // If end is not null set it to the match end 74 | TRE_DEF const char *tre_match(const tre_comp *tregex, const char *text, const char **end); 75 | 76 | // Same but compiles pattern then matches 77 | TRE_DEF const char *tre_compile_match(const char *pattern, const char *text, const char 
**end); 78 | 79 | // Print the pattern 80 | TRE_DEF void tre_print(const tre_comp *tregex); 81 | 82 | #ifdef __cplusplus 83 | } 84 | #endif 85 | 86 | #endif // TRE_H_INCLUDE 87 | 88 | //------------------------------------------------------------ 89 | 90 | #ifdef TRE_IMPLEMENTATION 91 | 92 | #define TRE_MAXQUANT 1024 // Max in {m,n} - must be < ushrt_max - see "struct tre_node" 93 | #define TRE_MAXPLUS 40000 // For + and * 94 | 95 | #define TRE_TYPES_X X(NONE) X(BEGIN) X(END) \ 96 | X(QUANT) X(LQUANT) X(QMARK) X(LQMARK) X(STAR) X(LSTAR) X(PLUS) X(LPLUS) \ 97 | X(DOT) X(CHAR) X(CLASS) X(NCLASS) X(DIGIT) X(NDIGIT) X(ALPHA) X(NALPHA) X(SPACE) X(NSPACE) 98 | 99 | #define X(A) TRE_##A, 100 | enum { TRE_TYPES_X }; 101 | #undef X 102 | 103 | #include "string.h" 104 | #ifndef TRE_SILENT 105 | #include "stdio.h" 106 | #include "stdarg.h" 107 | #endif 108 | 109 | static int tre_err(const char *format, ...) 110 | { 111 | #ifdef TRE_SILENT 112 | (void) format; 113 | #else 114 | fputs("Error: ", stderr); 115 | va_list args; 116 | va_start(args, format); 117 | vfprintf(stderr, format, args); 118 | va_end(args); 119 | fputs("\n", stderr); 120 | fflush(stderr); 121 | #endif 122 | return 0; 123 | } 124 | 125 | TRE_DEF const char *tre_compile_match(const char *pattern, const char *text, const char **end) 126 | { 127 | tre_comp tregex = {0}; 128 | if (!tre_compile(pattern, &tregex)) 129 | { 130 | tre_err("Compiling pattern failed"); 131 | return 0; 132 | } 133 | 134 | return tre_match(&tregex, text, end); 135 | } 136 | 137 | static const char *matchpattern(const tre_node *nodes, const char *text, const char *tend); 138 | 139 | TRE_DEF const char *tre_nmatch(const tre_comp *tregex, const char *text, unsigned tlen, const char **end) 140 | { 141 | if (!tregex || !text || !tlen) 142 | { 143 | tre_err("NULL text or tre_comp"); 144 | return 0; 145 | } 146 | 147 | const char *tend = text + tlen; 148 | const char *mend; 149 | const tre_node *nodes = tregex->nodes; 150 | 151 | if (nodes->type 
== TRE_BEGIN) 152 | { 153 | mend = matchpattern(nodes + 1, text, tend); 154 | if (mend) 155 | { 156 | if (end) 157 | *end = mend; 158 | return text; 159 | } 160 | return 0; 161 | } 162 | 163 | do 164 | { 165 | mend = matchpattern(nodes, text, tend); 166 | if (mend) 167 | { 168 | //if (!*text) //Fixme: ??? 169 | // return 0; 170 | if (end) 171 | *end = mend; 172 | return text; 173 | } 174 | } 175 | while (tend > text++); 176 | 177 | return 0; 178 | } 179 | 180 | TRE_DEF const char *tre_match(const tre_comp *tregex, const char *text, const char **end) 181 | { 182 | return tre_nmatch(tregex, text, strlen(text), end); 183 | } 184 | 185 | #define TRE_ISMETA(c) ((c=='s')||(c=='S')||(c=='w')||(c=='W')||(c=='d')||(c=='D')) 186 | 187 | TRE_DEF int tre_ncompile(const char *pattern, unsigned plen, tre_comp *tregex) 188 | { 189 | if (!tregex || !pattern || !plen) 190 | return tre_err("NULL/empty string or tre_comp"); 191 | 192 | tre_node *tnode = tregex->nodes; 193 | char *buf = tregex->buffer; 194 | unsigned buflen = sizeof tregex->buffer; 195 | char quable = 0; // is the last node quantifiable 196 | char temp; 197 | 198 | unsigned idx = 0; 199 | 200 | unsigned long val; // for parsing numbers in {m,n} 201 | unsigned i = 0; // index into pattern 202 | unsigned j = 0; // index into tnode 203 | 204 | while (i < plen && (j + 1 < TRE_MAX_NODES)) 205 | { 206 | switch (pattern[i]) 207 | { 208 | // Meta-characters 209 | case '^': quable = 0; tnode[j].type = TRE_BEGIN; break; 210 | case '$': quable = 0; tnode[j].type = TRE_END; break; 211 | case '.': quable = 1; tnode[j].type = TRE_DOT; break; 212 | case '*': 213 | if (quable == 0) 214 | return tre_err("Non-quantifiable before *"); 215 | quable = 0; 216 | tnode[j].type = (pattern[i + 1] == '?') ? (i++, TRE_LSTAR) : TRE_STAR; break; 217 | case '+': 218 | if (quable == 0) 219 | return tre_err("Non-quantifiable before +"); 220 | quable = 0; 221 | tnode[j].type = (pattern[i + 1] == '?') ? 
(i++, TRE_LPLUS) : TRE_PLUS; break; 222 | case '?': 223 | if (quable == 0) 224 | return tre_err("Non-quantifiable before ?"); 225 | quable = 0; 226 | tnode[j].type = (pattern[i + 1] == '?') ? (i++, TRE_LQMARK) : TRE_QMARK; break; 227 | 228 | // Escaped characters 229 | case '\\': 230 | { 231 | quable = 1; 232 | if (++i >= plen) 233 | return tre_err("Dangling \\"); 234 | 235 | switch (pattern[i]) 236 | { 237 | // Meta-character: 238 | case 'd': tnode[j].type = TRE_DIGIT; break; 239 | case 'D': tnode[j].type = TRE_NDIGIT; break; 240 | case 'w': tnode[j].type = TRE_ALPHA; break; 241 | case 'W': tnode[j].type = TRE_NALPHA; break; 242 | case 's': tnode[j].type = TRE_SPACE; break; 243 | case 'S': tnode[j].type = TRE_NSPACE; break; 244 | 245 | // Not in [dDwWsS] 246 | default: tnode[j].type = TRE_CHAR; tnode[j].u.ch = pattern[i]; break; 247 | } 248 | } break; 249 | 250 | // Character class 251 | case '[': 252 | { 253 | quable = 1; 254 | 255 | // Look-ahead to determine if negated 256 | tnode[j].type = (pattern[i + 1] == '^') ? (i++, TRE_NCLASS) : TRE_CLASS; 257 | tnode[j].u.cc = buf + idx; 258 | 259 | // Copy characters inside [...] to buffer 260 | while (pattern[++i] != ']' && i < plen) 261 | { 262 | temp = 0; 263 | if (pattern[i] == '\\') 264 | { 265 | if (++i >= plen) 266 | return tre_err("Dangling '\\' in class"); 267 | 268 | // Only escape metachars and escape, omit escape for others 269 | temp = TRE_ISMETA(pattern[i]); 270 | if (temp || pattern[i] == '\\') 271 | { 272 | if (idx > buflen - 2) 273 | return tre_err("Buffer overflow in in class", i - 1); 274 | buf[idx++] = '\\'; 275 | } 276 | } 277 | 278 | if (idx > buflen - 2) 279 | return tre_err("Buffer overflow in class"); 280 | buf[idx++] = pattern[i]; 281 | 282 | // Check if it is a range 283 | if (temp) 284 | continue; // metachar 285 | 286 | if (pattern[i + 1] != '-' || i + 2 >= plen || pattern[i + 2] == ']') 287 | continue; // not '-' or "-"! 
or "-]" 288 | 289 | temp = (pattern[i + 2] == '\\'); 290 | if (temp && (i + 3 >= plen || TRE_ISMETA(pattern[i + 3]))) 291 | continue; // "-\\"! or "-\\w" 292 | 293 | // Validate range 294 | temp = temp ? pattern[i + 3] : pattern[i + 2]; 295 | if (temp < pattern[i]) 296 | return tre_err("Incorrect range in class"); 297 | //if (idx > buflen - 2) 298 | // return tre_err("Buffer overflow at range - in class"); 299 | //buf[idx++] = pattern[++i]; // '-' 300 | } 301 | 302 | if (pattern[i] != ']') 303 | return tre_err("Non terminated class"); 304 | 305 | buf[idx++] = 0; 306 | } break; 307 | 308 | // Quantifier 309 | case '{': 310 | { 311 | if (quable == 0) 312 | return tre_err("Non-quantifiable before {m,n}"); 313 | quable = 0; 314 | 315 | i++; 316 | val = 0; 317 | do 318 | { 319 | if (i >= plen || pattern[i] < '0' || pattern[i] > '9') 320 | return tre_err("Non-digit min value in quantifier"); 321 | val = 10 * val + (unsigned) (pattern[i++] - '0'); 322 | } 323 | while (pattern[i] != ',' && pattern[i] != '}'); 324 | 325 | if (val > TRE_MAXQUANT) 326 | return tre_err("Min value too big in quantifier"); 327 | tnode[j].u.mn[0] = val; 328 | 329 | if (pattern[i] == ',') 330 | { 331 | if (++i >= plen) 332 | return tre_err("Dangling ',' in quantifier"); 333 | if (pattern[i] == '}') 334 | { 335 | val = TRE_MAXQUANT; 336 | } 337 | else 338 | { 339 | val = 0; 340 | while (pattern[i] != '}') 341 | { 342 | if (i >= plen || pattern[i] < '0' || pattern[i] > '9') 343 | return tre_err("Non-digit max value in quantifier"); 344 | val = 10 * val + (unsigned) (pattern[i++] - '0'); 345 | } 346 | 347 | if (val > TRE_MAXQUANT || val < tnode[j].u.mn[0]) 348 | return tre_err("Max value too big or less than min value in quantifier"); 349 | } 350 | } 351 | tnode[j].type = (i + 1 < plen && pattern[i + 1] == '?') ? 
(i++, TRE_LQUANT) : TRE_QUANT; 352 | tnode[j].u.mn[1] = val; 353 | } break; 354 | 355 | // Regular characters 356 | default: quable = 1; tnode[j].type = TRE_CHAR; tnode[j].u.ch = pattern[i]; break; 357 | } 358 | i++; 359 | j++; 360 | } 361 | // 'TRE_NONE' is a sentinel used to indicate end-of-pattern 362 | tnode[j].type = TRE_NONE; 363 | 364 | return 1; 365 | } 366 | 367 | TRE_DEF int tre_compile(const char *pattern, tre_comp *tregex) 368 | { 369 | return tre_ncompile(pattern, strlen(pattern), tregex); 370 | } 371 | 372 | #define TRE_MATCHDIGIT(c) ((c >= '0') && (c <= '9')) 373 | #define TRE_MATCHALPHA(c) ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) 374 | #define TRE_MATCHSPACE(c) ((c == ' ') || (c == '\t') || (c == '\n') || (c == '\r') || (c == '\f') || (c == '\v')) 375 | #define TRE_MATCHALNUM(c) ((c == '_') || TRE_MATCHALPHA(c) || TRE_MATCHDIGIT(c)) 376 | 377 | static int matchmetachar(char c, char mc) 378 | { 379 | switch (mc) 380 | { 381 | case 'd': return TRE_MATCHDIGIT(c); 382 | case 'D': return !TRE_MATCHDIGIT(c); 383 | case 'w': return TRE_MATCHALNUM(c); 384 | case 'W': return !TRE_MATCHALNUM(c); 385 | case 's': return TRE_MATCHSPACE(c); 386 | case 'S': return !TRE_MATCHSPACE(c); 387 | default: return (c == mc); 388 | } 389 | } 390 | 391 | // note: compiler makes sure '\\' is followed by one of 'dDwWsS\\' 392 | static int matchcharclass(char c, const char *str) 393 | { 394 | char rmax; 395 | while (*str != '\0') 396 | { 397 | if (str[0] == '\\') 398 | { 399 | if (matchmetachar(c, str[1])) 400 | return 1; 401 | str += 2; 402 | if (TRE_ISMETA(*str)) 403 | continue; 404 | } 405 | else 406 | { 407 | if (c == *str) 408 | return 1; 409 | str += 1; 410 | } 411 | 412 | if (*str != '-' || !str[1]) 413 | continue; 414 | rmax = (str[1] == '\\'); 415 | if (rmax && TRE_ISMETA(str[2])) 416 | continue; 417 | 418 | rmax = rmax ? 
str[2] : str[1]; 419 | if (c >= str[-1] && c <= rmax) 420 | return 1; 421 | str++; 422 | 423 | } 424 | 425 | return 0; 426 | } 427 | 428 | 429 | #ifndef TRE_DOTANY 430 | #define TRE_MATCHDOT(c) ((c != '\n') && (c != '\r')) 431 | #else 432 | #define TRE_MATCHDOT(c) (1) 433 | #endif 434 | 435 | static int matchone(const tre_node *tnode, char c) 436 | { 437 | switch (tnode->type) 438 | { 439 | case TRE_CHAR: return (tnode->u.ch == c); 440 | case TRE_DOT: return TRE_MATCHDOT(c); 441 | case TRE_CLASS: return matchcharclass(c, tnode->u.cc); 442 | case TRE_NCLASS: return !matchcharclass(c, tnode->u.cc); 443 | case TRE_DIGIT: return TRE_MATCHDIGIT(c); 444 | case TRE_NDIGIT: return !TRE_MATCHDIGIT(c); 445 | case TRE_ALPHA: return TRE_MATCHALNUM(c); 446 | case TRE_NALPHA: return !TRE_MATCHALNUM(c); 447 | case TRE_SPACE: return TRE_MATCHSPACE(c); 448 | case TRE_NSPACE: return !TRE_MATCHSPACE(c); 449 | default: return 0; 450 | } 451 | } 452 | 453 | #undef TRE_MATCHDIGIT 454 | #undef TRE_MATCHALPHA 455 | #undef TRE_MATCHSPACE 456 | #undef TRE_MATCHALNUM 457 | #undef TRE_MATCHDOT 458 | 459 | static const char *matchquant_lazy(const tre_node *nodes, const char *text, const char *tend, 460 | unsigned min, unsigned max) 461 | { 462 | const char *end; 463 | max = max - min; 464 | while (min && text < tend && matchone(nodes, *text)) { text++; min--; } 465 | 466 | if (min) 467 | return 0; 468 | 469 | if ((end = matchpattern(nodes + 2, text, tend))) 470 | return end; 471 | 472 | while (max && text < tend && matchone(nodes, *text)) 473 | { 474 | text++; max--; 475 | if ((end = matchpattern(nodes + 2, text, tend))) 476 | return end; 477 | } 478 | 479 | return 0; 480 | } 481 | 482 | static const char *matchquant(const tre_node *nodes, const char *text, const char *tend, 483 | unsigned min, unsigned max) 484 | { 485 | const char *end, *start = text + min; 486 | while (max && text < tend && matchone(nodes, *text)) { text++; max--; } 487 | 488 | while (text >= start) 489 | { 490 | if ((end = 
matchpattern(nodes + 2, text--, tend))) 491 | return end; 492 | } 493 | 494 | return 0; 495 | } 496 | 497 | // Iterative matching 498 | static const char *matchpattern(const tre_node *nodes, const char *text, const char *tend) 499 | { 500 | do 501 | { 502 | if (nodes[0].type == TRE_NONE) 503 | return text; 504 | 505 | if ((nodes[0].type == TRE_END) && nodes[1].type == TRE_NONE) 506 | return (text == tend) ? text : 0; 507 | 508 | switch (nodes[1].type) 509 | { 510 | case TRE_QMARK: 511 | return matchquant(nodes, text, tend, 0, 1); 512 | case TRE_LQMARK: 513 | return matchquant_lazy(nodes, text, tend, 0, 1); 514 | case TRE_QUANT: 515 | return matchquant(nodes, text, tend, nodes[1].u.mn[0], nodes[1].u.mn[1]); 516 | case TRE_LQUANT: 517 | return matchquant_lazy(nodes, text, tend, nodes[1].u.mn[0], nodes[1].u.mn[1]); 518 | case TRE_STAR: 519 | return matchquant(nodes, text, tend, 0, TRE_MAXPLUS); 520 | case TRE_LSTAR: 521 | return matchquant_lazy(nodes, text, tend, 0, TRE_MAXPLUS); 522 | case TRE_PLUS: 523 | return matchquant(nodes, text, tend, 1, TRE_MAXPLUS); 524 | case TRE_LPLUS: 525 | return matchquant_lazy(nodes, text, tend, 1, TRE_MAXPLUS); 526 | } 527 | } 528 | while (text < tend && matchone(nodes++, *text++)); 529 | 530 | return 0; 531 | } 532 | 533 | void tre_print(const tre_comp *tregex) 534 | { 535 | #ifdef TRE_SILENT 536 | (void) tregex; 537 | #else 538 | #define X(A) #A, 539 | static const char *tre_typenames[] = { TRE_TYPES_X }; 540 | #undef X 541 | 542 | if (!tregex) 543 | { 544 | printf("NULL compiled regex detected\n"); 545 | return; 546 | } 547 | 548 | const tre_node *tnode = tregex->nodes; 549 | int i; 550 | for (i = 0; i < TRE_MAX_NODES; ++i) 551 | { 552 | if (tnode[i].type == TRE_NONE) 553 | break; 554 | 555 | printf("type: %s", tre_typenames[tnode[i].type]); 556 | if (tnode[i].type == TRE_CLASS || tnode[i].type == TRE_NCLASS) 557 | { 558 | printf(" \"%s\"", tnode[i].u.cc); 559 | } 560 | else if (tnode[i].type == TRE_QUANT || tnode[i].type == 
TRE_LQUANT) 561 | { 562 | printf(" {%d,%d}", tnode[i].u.mn[0], tnode[i].u.mn[1]); 563 | } 564 | else if (tnode[i].type == TRE_CHAR) 565 | { 566 | printf(" '%c'", tnode[i].u.ch); 567 | } 568 | printf("\n"); 569 | } 570 | #endif // TRE_SILENT 571 | } 572 | 573 | #undef TRE_TYPES_X 574 | 575 | #endif // TRE_IMPLEMENTATION 576 | 577 | /* 578 | ------------------------------------------------------------------------------ 579 | This software is available under 2 licenses -- choose whichever you prefer. 580 | ------------------------------------------------------------------------------ 581 | ALTERNATIVE A - Public Domain (www.unlicense.org) 582 | This is free and unencumbered software released into the public domain. 583 | Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 584 | software, either in source code form or as a compiled binary, for any purpose, 585 | commercial or non-commercial, and by any means. 586 | In jurisdictions that recognize copyright laws, the author or authors of this 587 | software dedicate any and all copyright interest in the software to the public 588 | domain. We make this dedication for the benefit of the public at large and to 589 | the detriment of our heirs and successors. We intend this dedication to be an 590 | overt act of relinquishment in perpetuity of all present and future rights to 591 | this software under copyright law. 592 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 593 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 594 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 595 | AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 596 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 597 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
598 | ------------------------------------------------------------------------------ 599 | ALTERNATIVE B - MIT License 600 | Copyright (c) 2018 kokke, monolifed 601 | Permission is hereby granted, free of charge, to any person obtaining a copy of 602 | this software and associated documentation files (the "Software"), to deal in 603 | the Software without restriction, including without limitation the rights to 604 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 605 | of the Software, and to permit persons to whom the Software is furnished to do 606 | so, subject to the following conditions: 607 | The above copyright notice and this permission notice shall be included in all 608 | copies or substantial portions of the Software. 609 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 610 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 611 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 612 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 613 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 614 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 615 | SOFTWARE. 616 | ------------------------------------------------------------------------------ 617 | */ 618 | 619 | --------------------------------------------------------------------------------