├── my_regex_test.c ├── readme.md ├── LICENSE ├── my_regex_tests.cpp └── remimu.h /my_regex_test.c: -------------------------------------------------------------------------------- 1 | #include "remimu.h" 2 | 3 | int main(void) 4 | { 5 | RegexToken tokens[1024]; 6 | int16_t token_count = 1024; 7 | int e = regex_parse("[0-9]+\\.[0-9]+", tokens, &token_count, 0); 8 | if (e) return (puts("regex has error"), 0); 9 | print_regex_tokens(tokens); 10 | 11 | int64_t match_len = regex_match(tokens, "23.53) ", 0, 0, 0, 0); 12 | printf("########### return: %zd\n", match_len); 13 | 14 | return 0; 15 | } 16 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Remimu: Single-Header C/C++ Regex Library 2 | 3 | Compatible with C99 and C++11 and later standards. Uses backtracking and relatively standard regex syntax. 4 | 5 | #include "remimu.h" 6 | 7 | ## Functions 8 | ```c 9 | // Returns 0 on success, or -1 on invalid or unsupported regex, or -2 on not enough tokens given to parse regex. 10 | static inline int regex_parse( 11 | const char * pattern, // Regex pattern to parse. 12 | RegexToken * tokens, // Output buffer of token_count regex tokens. 13 | int16_t * token_count, // Maximum allowed number of tokens to write 14 | int32_t flags // Optional bitflags. 15 | ) 16 | 17 | // Returns match length, or -1 on no match, or -2 on out of memory, or -3 if the regex is invalid. 18 | static inline int64_t regex_match( 19 | const RegexToken * tokens, // Parsed regex to match against text. 20 | const char * text, // Text to match against tokens. 21 | size_t start_i, // index value to match at. 22 | uint16_t cap_slots, // Number of allowed capture info output slots. 23 | int64_t * cap_pos, // Capture position info output buffer. 24 | int64_t * cap_span // Capture length info output buffer. 
25 | ) 26 | 27 | static inline void print_regex_tokens( 28 | RegexToken * tokens // Regex tokens to spew to stdout, for debugging. 29 | ) 30 | ``` 31 | Remimu doesn't have a searching API. 32 | 33 | If `static inline` doesn't work for your project, define the `REMIMU_FUNC_VISIBILITY` (default `static inline`) and `REMIMU_CONST_VISIBILITY` (default `static const`) visibility prefix macros before including the header. Remimu doesn't use mutable global or mutable static variables, so no prefix macro is needed for them. 34 | 35 | ## Performance 36 | 37 | On simple cases, Remimu's match speed is similar to PCRE2. Regex parsing/compilation is also much faster (around 4x to 10x), so single-shot regexes are often faster than PCRE2. 38 | 39 | HOWEVER: Remimu is a pure backtracking engine, and has `O(2^x)` complexity on regexes with catastrophic backtracking. It can be much, much, MUCH slower than PCRE2. Beware! 40 | 41 | Remimu uses length-checked fixed memory buffers with no recursion, so memory usage is statically known. 42 | 43 | ## Features 44 | 45 | - Lowest-common-denominator common regex syntax 46 | - Based on backtracking (slow in the worst case, but fast in the best case) 47 | - 8-bit only, no utf-16 or utf-32. Use https://wareya.github.io/uniregex/ to create 8-bit versions of utf-8 regexes 48 | - Statically known memory usage (no heap allocation or recursion) 49 | - Groups with or without capture, and with or without quantifiers 50 | - Supported escapes: 51 | - - 2-digit hex: e.g. 
`\x00`, `\xFF`, or lowercase, or mixed case 52 | - - `\r`, `\n`, `\t`, `\v`, `\f` (whitespace characters) 53 | - - `\d`, `\s`, `\w`, `\D`, `\S`, `\W` (digit, space, and word character classes) 54 | - - `\b`, `\B` word boundary and non-word-boundary anchors (not fully supported in zero-size quantified groups, but even then, usually supported) 55 | - - Escaped literal characters: `{}[]-()|^$*+?:./\` 56 | - - - Escapes work in character classes, except for `b` 57 | - Character classes, including disjoint ranges, proper handling of bare `[` and trailing `-`, etc 58 | - - Dot (`.`) matches all characters, including newlines, unless `REMIMU_FLAG_DOT_NO_NEWLINES` is passed as a flag to `regex_parse` 59 | - - Dot (`.`) only matches at most one byte at a time, so matching `\r\n` requires two dots (and not using `REMIMU_FLAG_DOT_NO_NEWLINES`) 60 | - Anchors (`^` and `$`) 61 | - - Same support caveats as \b, \B apply 62 | - Basic quantifiers (`*`, `+`, `?`) 63 | - - Quantifiers are greedy by default. 64 | - Explicit quantifiers (`{2}`, `{5}`, `{5,}`, `{5,7}`) 65 | - Alternation e.g. `(asdf|foo)` 66 | - Lazy quantifiers e.g. `(asdf)*?` or `\w+?` 67 | - Possessive greedy quantifiers e.g. `(asdf)*+` or `\w++` 68 | - - NOTE: Capture groups for and inside of possessive groups return no capture information. 69 | - Atomic groups e.g. `(?>(asdf))` 70 | - - NOTE: Capture groups inside of atomic groups return no capture information. 71 | 72 | ## Not Supported 73 | 74 | - Strings with non-terminal null characters 75 | - Unicode character classes (matching single utf-8 characters works regardless) 76 | - Exact POSIX regex semantics (posix-style greediness etc) 77 | - - (note: despite being a posix thing, it would be very weird and rare to support exact posix semantics. most regex implementations are not posix regexes and posix regexes are surprising.) 
78 | - Backreferences 79 | - Lookbehind/Lookahead 80 | - Named groups 81 | - Most other weird flavor-specific regex stuff 82 | - Capture of or inside of possessive-quantified groups (still take up a capture slot, but no data is returned) 83 | 84 | ## Usage 85 | ```c 86 | // minimal: 87 | 88 | RegexToken tokens[1024]; 89 | int16_t token_count = 1024; 90 | int e = regex_parse("[0-9]+\\.[0-9]+", tokens, &token_count, 0); 91 | assert(!e); 92 | 93 | int64_t match_len = regex_match(tokens, "23.53) ", 0, 0, 0, 0); 94 | printf("########### return: %lld\n", (long long)match_len); 95 | 96 | // with captures: 97 | 98 | RegexToken tokens[256]; 99 | int16_t token_count = sizeof(tokens)/sizeof(tokens[0]); 100 | int e = regex_parse("((a)|(b))++", tokens, &token_count, 0); 101 | assert(!e); 102 | 103 | int64_t cap_pos[5]; 104 | int64_t cap_span[5]; 105 | memset(cap_pos, 0xFF, sizeof(cap_pos)); 106 | memset(cap_span, 0xFF, sizeof(cap_span)); 107 | 108 | int64_t matchlen = regex_match(tokens, "aaaaaabbbabaqa", 0, 5, cap_pos, cap_span); 109 | printf("Match length: %lld\n", (long long)matchlen); 110 | for (int i = 0; i < 5; i++) 111 | printf("Capture %d: %lld plus %lld\n", i, (long long)cap_pos[i], (long long)cap_span[i]); 112 | 113 | // for debugging 114 | print_regex_tokens(tokens); 115 | ``` 116 | ## Testing 117 | 118 | `my_regex_tests.cpp` is a C++11 program that throws a matrix of regexes and test strings into PCRE2 and validates that they're matched the same way in Remimu (for supported features). It contains a good number of gotcha regexes. 119 | 120 | ## License 121 | 122 | Creative Commons Zero, public domain. 123 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. 
CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. 
A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. 
To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. 
In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. 
Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /my_regex_tests.cpp: -------------------------------------------------------------------------------- 1 | 2 | // tests for regex engine; not actually part of BBEL 3 | // testing requires PCRE2 4 | // msys2: pacman -S mingw-w64-x86_64-pcre2 (adjust the prefix for your MSYS2 environment) 5 | // linker flag is usually -lpcre2-8 6 | 7 | //#define REGEX_VERBOSE 8 | #include "remimu.h" 9 | /* NOTE(review): the header names below were missing from the #include lines; they are restored from the names this file uses (assert, memset/strlen, std::chrono, std::string, pcre2_*) - confirm against the upstream file. */ 10 | #include <assert.h> 11 | 12 | #include <string.h> 13 | #include <chrono> 14 | #include <string> 15 | 16 | #define PCRE2_CODE_UNIT_WIDTH 8 17 | #include <pcre2.h> 18 | 19 | #define BE_QUIET 20 | /* Parses `pat` (flags 0) into a local 128-token buffer; on a non-zero return code prints a [FAIL] line to stderr and trips an assert. */ 21 | static void must_parse_ok(const char* pat) { 22 | RegexToken toks[128]; 23 | memset(toks, 0, sizeof(toks)); 24 | int16_t cap = (int16_t)(sizeof(toks)/sizeof(toks[0])); 25 | int rc = regex_parse(pat, toks, &cap, 0); 26 | if (rc != 0) { 27 | fprintf(stderr, "[FAIL] regex_parse should succeed: `%s` (rc=%d)\n", pat, rc); 28 | assert(rc == 0); 29 | } 30 | } 31 | /* Parses `pat` (flags 0) and expects a non-zero return code; prints a [FAIL] line and trips an assert if parsing succeeds. The buffer matches must_parse_ok's size so a rejection is less likely to be mere token exhaustion (-2). */ 32 | static void must_parse_fail(const char* pat) { 33 | RegexToken toks[128]; 34 | memset(toks, 0, sizeof(toks)); 35 | int16_t cap = (int16_t)(sizeof(toks)/sizeof(toks[0])); 36 | int rc = regex_parse(pat, toks, &cap, 0); 37 | if (rc == 0) { 38 | fprintf(stderr, "[FAIL] regex_parse should fail: `%s`\n", pat); 39 | assert(rc != 0); 40 | } 41 | } 42 | /* Parses `pat` into a local 256-token buffer (asserting success) and returns regex_match's result for `text` at offset 0 with no capture slots. */ 43 | static int64_t do_match(const char* pat, const char* text) { 44 | RegexToken toks[256]; 45 | memset(toks, 0, sizeof(toks)); 46 | int16_t cap = 
(int16_t)(sizeof(toks)/sizeof(toks[0])); 47 | int rc = regex_parse(pat, toks, &cap, 0); 48 | assert(rc == 0); 49 | return regex_match(toks, text, 0, 0, 0, 0); 50 | } 51 | /* Checks that do_match(pat, text) returns exactly `want_len`; on mismatch prints both values to stderr and trips an assert. */ 52 | static void expect_match_len(const char* pat, const char* text, int64_t want_len) { 53 | int64_t got = do_match(pat, text); 54 | if (got != want_len) { 55 | fprintf(stderr, "[FAIL] `%s` ~ `%s` : want %lld, got %lld\n", pat, text, (long long)want_len, (long long)got); 56 | assert(got == want_len); 57 | } 58 | } 59 | /* Checks that do_match(pat, text) returns a negative value; otherwise prints the unexpected length to stderr and trips an assert. */ 60 | static void expect_no_match(const char* pat, const char* text) { 61 | int64_t got = do_match(pat, text); 62 | if (got >= 0) { 63 | fprintf(stderr, "[FAIL] `%s` should NOT match `%s` (got len=%lld)\n", pat, text, (long long)got); 64 | assert(got < 0); 65 | } 66 | } 67 | 68 | void testify(void) 69 | { 70 | using clock = std::chrono::high_resolution_clock; 71 | 72 | static const char * regexes[] = { 73 | // ipv4 74 | "^(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$", 75 | // ipv6 76 | 
"^(?:(?:(?:[0-9a-fA-F]{1,4}):){7}(?:(?:[0-9a-fA-F]{1,4})|:)|(?:(?:[0-9a-fA-F]{1,4}):){6}(?:(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])|:(?:[0-9a-fA-F]{1,4})|:)|(?:(?:[0-9a-fA-F]{1,4}):){5}(?::(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])|(?::(?:[0-9a-fA-F]{1,4})){1,2}|:)|(?:(?:[0-9a-fA-F]{1,4}):){4}(?:(?::(?:[0-9a-fA-F]{1,4})){0,1}:(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])|(?::(?:[0-9a-fA-F]{1,4})){1,3}|:)|(?:(?:[0-9a-fA-F]{1,4}):){3}(?:(?::(?:[0-9a-fA-F]{1,4})){0,2}:(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])|(?::(?:[0-9a-fA-F]{1,4})){1,4}|:)|(?:(?:[0-9a-fA-F]{1,4}):){2}(?:(?::(?:[0-9a-fA-F]{1,4})){0,3}:(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])|(?::(?:[0-9a-fA-F]{1,4})){1,5}|:)|(?:(?:[0-9a-fA-F]{1,4}):){1}(?:(?::(?:[0-9a-fA-F]{1,4})){0,4}:(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])|(?::(?:[0-9a-fA-F]{1,4})){1,6}|:)|(?::(?:(?::(?:[0-9a-fA-F]{1,4})){0,5}:(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])|(?::(?:[0-9a-fA-F]{1,4})){1,7}|:)))(?:%[0-9a-zA-Z-.:]{1,})?$", 77 | 78 | "(b|a|as|q|)*?X", 79 | u8"((電|自転)車)+", 80 | "", 81 | "(|b|a|as|q)*X", 82 | "(b|a|as|q|)*X", 83 | "(b|a|as|q|)+X", 84 | "(b|a|as|q|)+?X", 85 | "((b|a|as|q|))*X", 86 | "((b|a|as|q|))*?X", 87 | "(b|a|as|q)*X", 88 | "(b|a|as|q)*?X", 89 | "(b|a|as|q)+X", 90 | 91 | "((a)|(b))+", 92 | "((a)|(b))++", 93 | "((a)|(b))+?", 94 | "((a)|(b))*", 95 | "((a)|(b))*+", 96 | "((a)|(b))*?", 97 | "((a)|((b)q))*", 98 | "((a)|((b)q))*+", 99 | 100 | "(|a?)+?a{10}", 101 | "(a?)*a{10}", 102 | "(a?)*?a{10}", 103 | "(a?)+?a{100}", 104 | 
"(a?)+?a{10}", 105 | "(a?)+a{10}", 106 | "(a)+a{9}", 107 | "(a)+?a{9}", 108 | "(|a)+a{9}", 109 | "(|a)+a{10}", 110 | "(|a)+a{11}", 111 | "(|a)+?a{11}", 112 | "^a(bc+|b[eh])g|.h$", 113 | "(bc+d$|ef*g.|h?i(j|k))", 114 | 115 | "(b|a|as|q)*?X", 116 | 117 | // emails and email-like things 118 | // (?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\]) 119 | "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", 120 | //"(?:\\w+(?:\\.\\w+)*)@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)", 121 | "(?:\\w+(?:\\.\\w+)*)@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)", 122 | "(\\w\\w*\\.)+", 123 | "(\\w+\\.)+", 124 | "(?:\\w+(?:\\.\\w+)*)@(?:\\w+(?:\\.\\w+)*)", 125 | "[a-z0-9\\._%+!$&*=^|~#%'`?{}/\\-]+@([a-z0-9\\-]+\\.){1,}([a-z]{2,16})", 126 | "^[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,}$", 127 | 128 | "(ab?)b", 129 | "(ab?)*b", 130 | "(ab?)*?b", 131 | 132 | "([0a-z][a-z0-9]*,)+", 133 | "([a-z][a-z0-9]*,)+", 134 | 135 | "asdf\\b", 136 | "asdf\\B", 137 | "\\basdf", 138 | 139 | "(\\ba?)*", 140 | "(\\ba?)*?", 141 | "(\\b)+?", 142 | "(\\b)+", 143 | "(\\ba?)+", 144 | "(\\ba?)+?", 145 | "(\\ba?)*a", 146 | 
"(\\ba?)*?a", 147 | "(\\ba?)+a", 148 | "(\\ba?)+?a", 149 | "a(\\b)*", 150 | "a(\\b)*?", 151 | "(\\b)*a", 152 | "(\\b)*?a", 153 | "a(\\b)+", 154 | "a(\\b)+?", 155 | "(\\b)+a", 156 | "(\\b)+?a", 157 | 158 | "^asdf$", 159 | "^asdf", 160 | "asdf$", 161 | ".*asdf", 162 | ".*asdf$", 163 | 164 | "(^(asdf)?)*", 165 | "(^(asdf)?)*(asdf)?", 166 | "((asdf)?$)*", 167 | "((asdf)?)*((asdf)?$)*", 168 | "(^(asdf)?)*?", 169 | "(^(asdf)?)*?(asdf)?", 170 | "((asdf)?$)*?", 171 | "((asdf)?)*?((asdf)?$)*?", 172 | 173 | "(a?)*a{10}", 174 | "(a?)*?a{10}", 175 | "()", 176 | "(a|)*b", 177 | "(z?)*a{10}", 178 | 179 | 180 | // possessive 181 | "(b|a|)*+", 182 | "(a|)*+b", 183 | "(?>(b|a|)*)", 184 | "(b|a|)*+b", 185 | "(b|a|as|q)*+", 186 | "(b|a|as|q)*+X", 187 | "(b|a|as|q)*", 188 | "a++ab", 189 | 190 | "[0-9]+\\.[0-9]+", 191 | "[0-9]+0\\.[0-9]+", 192 | 193 | "(a|a|ab)bc", 194 | "(ab|ab|a)bc", 195 | "[0-9]\\.[0-9]", 196 | 197 | "\\d\\.\\d", 198 | "\\d*\\.\\d*", 199 | "\\w+", 200 | "\\s+", 201 | "\\s(\\w+)", 202 | "\\w+\\s", 203 | 204 | "(\\d)*?\\.(\\d)+", 205 | "([0-9])*?\\.([0-9])+", 206 | "([0-9]){3,5}?\\.([0-9])+", 207 | "[0-9]{3,5}?\\.[0-9]+", 208 | "([0-9]){3,5}\\.([0-9])+", 209 | "[0-9]{3,5}\\.[0-9]+", 210 | "(a|ab)*b", 211 | "(ab?)*?b", 212 | "(ab?\?)*b", 213 | "(a)?\?(b|a)", 214 | "(a)*a{10}", 215 | "(a)*?a{10}", 216 | "a()a", 217 | "a(|)a", 218 | "a(|){1}?a", 219 | "a(|b)+a", 220 | "a(|b)+?a", 221 | "(a|b)*?b", 222 | "a*a*?", 223 | "a*?a*", 224 | "(b|a)*b", 225 | "(b|a)*?b", 226 | "(b|a|)*", 227 | "(b|a|)*bb", 228 | "(b|a|)*?bb", 229 | "(|a)+", 230 | "(|a)+?", 231 | "()+", 232 | "()+?", 233 | "(|)+?", 234 | "a(|)*a", 235 | "a(|)*?a", 236 | "(a|(((()))))*b", 237 | "((\\w+,?)*:)*", 238 | "((\\w+,?)*+:)*", 239 | "((\\w+,?)*+:)*+", 240 | 241 | // pathological 242 | "((a?b|a)b?)*", 243 | "(.*,){11}P", 244 | "(.*?,){11}P", 245 | 246 | "mistaken bogus regex", 247 | }; 248 | static const char * texts[] = { 249 | "asqbX", 250 | 251 | "aaaaaaaaaa", 252 | "asqb", 253 | "abh", 254 | 255 | 
u8"自転車", 256 | 257 | "0.42.42.42", 258 | "0.42.42..42", 259 | ".0.42.42.42", 260 | "0.256.42.42", 261 | "0.420.42.42", 262 | "0.0111.42.42", 263 | "254.254.254.254", 264 | "192.168.255.255", 265 | "239.51.161.175", 266 | "239.51.161.175", 267 | "0.0.0.0", 268 | "251.227.56.60", 269 | "18.45.235.138", 270 | 271 | "8db2:5802:4f78:5f2c:2dc5:33e9:8c7b:6fc4", 272 | "0995:86cd:70c9:a98a:bab6:c4b1:93e4:f839", 273 | "ff80::220:16ff:fec9:1", 274 | "fe20::150:560f:fec4:3", 275 | "fd87:403b:401f::/48", 276 | 277 | "effgz", 278 | "ij", 279 | "effg", 280 | "bcdd", 281 | "reffgz", 282 | 283 | "testacc@example.com", 284 | 285 | "aa.bb.cc.dd", 286 | "a5,b7,c9", 287 | "a5,b7,c9,", 288 | "a5,b7,c9,,", 289 | "a5,b7,c9,1", 290 | "a5,b7,c9,a", 291 | "", 292 | " ", 293 | " ", 294 | "a", 295 | "aa", 296 | "aba) ", 297 | "aaaaaaaaa", 298 | "aaaaaaaaaaaaaa", 299 | "aaaaaaaaaaaaaab", 300 | "aaaaaaaaaaaaaaba", 301 | 302 | u8"電車", 303 | u8"電車自転車", 304 | u8"自転車電車", 305 | 306 | "testacc@example.com", 307 | "test+acc@example.com", 308 | "test.acc@example.com", 309 | "test.acc.acc@sub.example.com", 310 | "loooooooo10235699ng.1g.g.g.210g01.longie.acc@sub.example.com.co.co.uk.jp.fakedomain.loooooooooooooooooooonger.com......", 311 | "test.acc@sub.example.com", 312 | "test@sub.example.com", 313 | "@example.com", 314 | "example.com", 315 | "a@", 316 | "#@%^%#$@#$@#.com", 317 | "Joe Smith ", 318 | "_______@example.com", 319 | "“email”@example.com", 320 | "email@[123.123.123.123]", 321 | "email@123.123.123.123", 322 | 323 | "abc) ", 324 | "abba) ", 325 | "abbc) ", 326 | "012.53) ", 327 | ".53) ", 328 | "5.5", 329 | "022134.53) ", 330 | "02234.53) ", 331 | "1131.53) ", 332 | "131.53) ", 333 | "11.53) ", 334 | "1.53) ", 335 | "aa", 336 | "aaaaaaaaabababab", 337 | "aaaaaaaaababababb", 338 | "aaaaabbbbbbbx", 339 | "bbbbbbb", 340 | "1,2,3,4,5,6,7,8,9,10,11,12", 341 | "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16", 342 | 343 | "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22", 344 | 
//"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25", 345 | //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25 P", 346 | //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26", 347 | //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26 P", 348 | //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27", 349 | //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28", 350 | //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28", 351 | //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30", 352 | //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31", 353 | //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32", 354 | //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33", 355 | //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34", 356 | //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35", 357 | "aaaaaababababababaabx", 358 | 359 | " ", 360 | "afd1gkage919953bd ", 361 | " x ", 362 | " ,\\1264ga0b a ", 363 | "asdf ", 364 | "asdfg", 365 | " asdf", 366 | "asdf ", 367 | " asdf ", 368 | "XXXasdf", 369 | "asdfXXX", 370 | "XXXasdfXXX", 371 | "000asdf", 372 | "asdf000", 373 | "000asdf000", 374 | "a,b,easbe_1:a,:a", 375 | 376 | "uh-uh", 377 | "words, yeah", 378 | "mistaken bogus regex", 379 | 380 | "aaaaaabbbabaqa", 381 | }; 382 | 383 | const char * slowest_my_regex = ""; 384 | double slowest_my_regex_time = 0.0; 385 | double total_my_regex_time = 0.0; 386 | const char * slowest_pcre2_regex = ""; 387 | double slowest_pcre2_regex_time = 0.0; 388 | double total_pcre2_regex_time = 0.0; 389 | 390 | for (size_t i = 0; i < sizeof(regexes) / sizeof(regexes[0]); i++) 391 | //for (size_t i = 0; 0; i++) 392 | { 393 | const char 
* regex = regexes[i]; 394 | 395 | RegexToken tokens[512]; 396 | memset(tokens, 0xFF, sizeof(tokens)); 397 | int16_t token_count = sizeof(tokens)/sizeof(tokens[0]); 398 | 399 | auto start = clock::now(); 400 | int e = regex_parse(regex, tokens, &token_count, 0); 401 | double t = std::chrono::duration_cast(clock::now() - start).count() / 1000000.0; 402 | assert(!e); 403 | 404 | total_my_regex_time += t; 405 | if (t > slowest_my_regex_time) 406 | { 407 | slowest_my_regex_time = t; 408 | slowest_my_regex = regex; 409 | } 410 | 411 | bool has_possessive = false; 412 | for (int32_t n = 0; n < token_count; n++) 413 | { 414 | if (tokens[n].mode & REMIMU_MODE_POSSESSIVE) 415 | { 416 | has_possessive = true; 417 | break; 418 | } 419 | } 420 | 421 | #ifndef BE_QUIET 422 | printf("token count: %d\n", token_count); 423 | print_regex_tokens(tokens); 424 | printf("Took %f seconds for my regex engine to parse the regex\n", t); 425 | #endif 426 | 427 | std::string regex_str = regex; 428 | 429 | int errorcode; 430 | PCRE2_SIZE erroroffset; 431 | start = clock::now(); 432 | pcre2_code * re = pcre2_compile(PCRE2_SPTR8(regex), PCRE2_ZERO_TERMINATED, 433 | PCRE2_ANCHORED | PCRE2_NO_UTF_CHECK | PCRE2_DOTALL | PCRE2_NO_AUTO_POSSESS | PCRE2_NO_DOTSTAR_ANCHOR | PCRE2_NO_START_OPTIMIZE, 434 | &errorcode, &erroroffset, NULL); 435 | t = std::chrono::duration_cast(clock::now() - start).count() / 1000000.0; 436 | #ifndef BE_QUIET 437 | printf("Took %f seconds for my PCRE2 to compile the regex\n", t); 438 | #endif 439 | if (t > slowest_pcre2_regex_time) 440 | { 441 | slowest_pcre2_regex_time = t; 442 | slowest_pcre2_regex = regex; 443 | } 444 | total_pcre2_regex_time += t; 445 | 446 | for (size_t j = 0; j < sizeof(texts) / sizeof(texts[0]); j++) 447 | { 448 | const char * text = texts[j]; 449 | std::string text_str = text; 450 | 451 | int64_t pcre2_len = -1; 452 | 453 | #ifndef BE_QUIET 454 | printf("testing PCRE2 regex `%s` on string `%s`...\n", regex, text); 455 | fflush(stdout); 456 | #endif 
457 | 458 | auto start = clock::now(); 459 | pcre2_match_data * match_data = pcre2_match_data_create_from_pattern(re, 0); 460 | int submatch_count = pcre2_match(re, PCRE2_SPTR8(text), text_str.size(), 0, PCRE2_ANCHORED | PCRE2_NO_UTF_CHECK, match_data, 0); 461 | 462 | #ifndef BE_QUIET 463 | double t = std::chrono::duration_cast(clock::now() - start).count() / 1000000.0; 464 | #endif 465 | 466 | //printf("submatch count: %d\n", submatch_count); 467 | //printf("ovector count: %d\n", pcre2_get_ovector_count(match_data)); 468 | 469 | PCRE2_SIZE * ovector = 0; 470 | if (submatch_count > 0) 471 | { 472 | ovector = pcre2_get_ovector_pointer(match_data); 473 | size_t offs = ovector[0]; 474 | if (offs == 0) 475 | pcre2_len = ovector[1] - offs; 476 | #ifndef BE_QUIET 477 | printf("pcre2 regex found match at %zd with len %zd after %f seconds\n", offs, pcre2_len, t); 478 | #endif 479 | } 480 | #ifndef BE_QUIET 481 | else 482 | printf("pcre2 regex found no match after %f seconds\n", t); 483 | 484 | printf("testing my regex `%s` on string `%s`...\n", regex, text); 485 | #endif 486 | 487 | start = clock::now(); 488 | 489 | int64_t cap_pos[16]; 490 | int64_t cap_span[16]; 491 | memset(cap_pos, 0xFF, sizeof(cap_pos)); 492 | memset(cap_span, 0xFF, sizeof(cap_span)); 493 | 494 | int64_t match_len = regex_match(tokens, text, 0, 16, cap_pos, cap_span); 495 | 496 | assert(match_len != -3); 497 | #ifndef BE_QUIET 498 | t = std::chrono::duration_cast(clock::now() - start).count() / 1000000.0; 499 | if (match_len >= 0) 500 | printf("my regex found match with len %zd after %f seconds\n", match_len, t); 501 | else if (match_len == -2) 502 | printf("my regex ran out of memory after %f seconds (note: `%s`)\n", t, regex); 503 | else 504 | printf("my regex found no match after %f seconds\n", t); 505 | #endif 506 | 507 | // we define captures differently than PCRE2 for possessives, so skip them 508 | if (!has_possessive && submatch_count > 0) 509 | { 510 | #ifndef BE_QUIET 511 | printf("comparing 
%zd to %zd...\n", match_len, pcre2_len); 512 | printf("regex `%s`, string `%s`\n", regex, text); 513 | #endif 514 | assert(match_len == pcre2_len); 515 | #ifndef BE_QUIET 516 | puts("comparing captures..."); 517 | #endif 518 | if (match_len >= 0) 519 | { 520 | for (int x = 0; x < submatch_count && x < 16; x++) 521 | { 522 | size_t where = ovector[x*2]; 523 | if (where == 0) 524 | { 525 | size_t pcre2_len = ovector[x*2+1] - where; 526 | // probably a situation of std capturing a zero-length group repetition 527 | #ifndef BE_QUIET 528 | printf("Capture %d: std (%zd,%zd) mine (%zd,%zd)\n", x, where, pcre2_len, cap_pos[x], cap_span[x]); 529 | #endif 530 | if (!(cap_pos[x] == -1 && cap_span[x] == -1 && where == 0 && pcre2_len == 0)) 531 | { 532 | assert(where == (size_t)cap_pos[x]); 533 | assert(pcre2_len == (size_t)cap_span[x]); 534 | } 535 | } 536 | } 537 | } 538 | } 539 | 540 | pcre2_match_data_free(match_data); 541 | } 542 | pcre2_code_free(re); 543 | } 544 | 545 | printf("Slowest regex for me to parse at %f seconds:\n%s\n", slowest_my_regex_time, slowest_my_regex); 546 | printf("Slowest regex for pcre2 to parse at %f seconds:\n%s\n", slowest_pcre2_regex_time, slowest_pcre2_regex); 547 | 548 | printf("Total parse time for me: %f\n", total_my_regex_time); 549 | printf("Total parse time for pcre2: %f\n", total_pcre2_regex_time); 550 | 551 | RegexToken tokens[256]; 552 | int16_t token_count = sizeof(tokens)/sizeof(tokens[0]); 553 | //int e = regex_parse("((\\w+,?)*:)*", tokens, &token_count, 0); 554 | //int e = regex_parse("((\\w+,?)*+:)*", tokens, &token_count, 0); 555 | //int e = regex_parse("((\\w+,?)*+:)*+", tokens, &token_count, 0); 556 | int e = regex_parse("((a)|(b))++", tokens, &token_count, 0); 557 | //int e = regex_parse("((\\w+,?)*:)", tokens, &token_count, 0); 558 | //int e = regex_parse("((a)|((b)q))*", tokens, &token_count, 0); 559 | assert(!e); 560 | 561 | int64_t cap_pos[5]; 562 | int64_t cap_span[5]; 563 | memset(cap_pos, 0xFF, sizeof(cap_pos)); 564 | 
memset(cap_span, 0xFF, sizeof(cap_span)); 565 | //int64_t matchlen = regex_match(tokens, "a,b,easbe_1:aaa,_,:a", 5, cap_pos, cap_span); 566 | //int64_t matchlen = regex_match(tokens, "aabqaaaaba", 5, cap_pos, cap_span); 567 | int64_t matchlen = regex_match(tokens, "aaaaaabbbabaqa", 0, 5, cap_pos, cap_span); 568 | printf("Match length: %zd\n", matchlen); 569 | for (int i = 0; i < 5; i++) 570 | printf("Capture %d: %zd plus %zd\n", i, cap_pos[i], cap_span[i]); 571 | 572 | // Correct \xHH parsing (nibble order) 573 | // Should parse and match 'A' (0x41). Also should *not* match 'B'. 574 | must_parse_ok("\\x41"); 575 | expect_match_len("\\x41", "A", 1); 576 | expect_no_match("\\x41", "B"); 577 | 578 | // Sanity check: lower-case hex -> 'z' (0x7A) 579 | must_parse_ok("\\x7a"); 580 | expect_match_len("\\x7a", "z", 1); 581 | expect_no_match("\\x7a", "Z"); 582 | 583 | // \xHH inside bracket/class 584 | must_parse_ok("[\\x41]"); 585 | expect_match_len("[\\x41]", "A", 1); 586 | expect_no_match("[\\x41]", "C"); 587 | 588 | // Multiple items in class with hex 589 | must_parse_ok("[ABC\\x7a]"); 590 | expect_match_len("[ABC\\x7a]", "z", 1); 591 | expect_match_len("[ABC\\x7a]", "A", 1); 592 | expect_no_match("[ABC\\x7a]", "q"); 593 | 594 | // Too-short hex escapes must fail parse WITHOUT reading past end 595 | must_parse_fail("\\x"); // nothing after 'x' 596 | must_parse_fail("\\x4"); // only one hex nibble 597 | must_parse_fail("[\\x]"); // same in a class 598 | must_parse_fail("[\\x4]"); // single nibble in a class 599 | 600 | // Valid hex at end-of-pattern should still be OK 601 | must_parse_ok("foo\\x41"); 602 | expect_match_len("foo\\x41", "fooA", 4); 603 | expect_no_match("foo\\x41", "foo@"); 604 | 605 | // Normal text around hex to ensure state transitions are correct 606 | must_parse_ok("X\\x41Y"); 607 | expect_match_len("X\\x41Y", "XA Y", -1); // space breaks it 608 | expect_match_len("X\\x41Y", "XAY", 3); 609 | 610 | // Bracket class mixing ranges and hex 611 | 
must_parse_ok("[A-\\x5A]"); // 'A'-'Z' 612 | expect_match_len("[A-\\x5A]", "M", 1); 613 | expect_no_match("[A-\\x5A]", "m"); 614 | 615 | print_regex_tokens(tokens); 616 | 617 | puts("All regex tests passed!"); 618 | 619 | if (1) 620 | { 621 | puts("Microbenchmark: matching `\\.\\d+|\\d+\\.\\d*` against 3.1415926535 one million times..."); 622 | 623 | RegexToken tokens[256]; 624 | int16_t token_count = sizeof(tokens)/sizeof(tokens[0]); 625 | 626 | int e = regex_parse("\\.\\d+|\\d+\\.\\d*", tokens, &token_count, 0); 627 | assert(!e); 628 | auto start = clock::now(); 629 | for (size_t i = 0; i < 1000000; i++) 630 | { 631 | int64_t matchlen = regex_match(tokens, "3.1415926535", 0, 0, 0, 0); 632 | assert(matchlen == 12); 633 | volatile int64_t a = 0; matchlen = a; // force the loop to not be optimized away 634 | } 635 | double t = std::chrono::duration_cast(clock::now() - start).count() / 1000000.0; 636 | printf("Match time for me: %f\n", t); 637 | 638 | int errorcode; 639 | PCRE2_SIZE erroroffset; 640 | pcre2_code * re = pcre2_compile(PCRE2_SPTR8("\\.\\d+|\\d+\\.\\d*"), PCRE2_ZERO_TERMINATED, 641 | PCRE2_ANCHORED | PCRE2_NO_UTF_CHECK | PCRE2_DOTALL | PCRE2_NO_AUTO_POSSESS | PCRE2_NO_DOTSTAR_ANCHOR | PCRE2_NO_START_OPTIMIZE, 642 | &errorcode, &erroroffset, NULL); 643 | pcre2_match_data * match_data = pcre2_match_data_create_from_pattern(re, 0); 644 | PCRE2_SIZE * ovector = pcre2_get_ovector_pointer(match_data); 645 | 646 | start = clock::now(); 647 | size_t size = strlen("3.1415926535"); 648 | for (size_t i = 0; i < 1000000; i++) 649 | { 650 | int submatch_count = pcre2_match(re, PCRE2_SPTR8("3.1415926535"), size, 0, PCRE2_ANCHORED | PCRE2_NO_UTF_CHECK, match_data, 0); 651 | int64_t matchlen = ovector[1] - ovector[0]; 652 | assert(submatch_count == 1 && matchlen == 12); 653 | volatile int64_t a = 0; matchlen = a; // force the loop to not be optimized away 654 | } 655 | t = std::chrono::duration_cast(clock::now() - start).count() / 1000000.0; 656 | printf("Match time for 
pcre2: %f\n", t); 657 | } 658 | } 659 | 660 | int main(void) 661 | { 662 | testify(); 663 | } 664 | -------------------------------------------------------------------------------- /remimu.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_REMIMU 2 | #define INCLUDE_REMIMU 1 3 | 4 | #ifndef REMIMU_FUNC_VISIBILITY 5 | #define REMIMU_FUNC_VISIBILITY static inline 6 | #endif 7 | 8 | #ifndef REMIMU_CONST_VISIBILITY 9 | #define REMIMU_CONST_VISIBILITY static const 10 | #endif 11 | 12 | #ifndef REMIMU_LOG_ERROR 13 | #define REMIMU_LOG_ERROR puts 14 | #endif 15 | 16 | #ifndef REMIMU_ITERATION_LIMIT 17 | #define REMIMU_ITERATION_LIMIT 0 // Set to non-zero to enable an interation limit 18 | #endif 19 | 20 | #ifndef REMIMU_ASSERT 21 | #define REMIMU_ASSERT(x) assert(x) 22 | #endif 23 | 24 | /************ 25 | 26 | REMIMU: SINGLE HEADER C/C++ REGEX LIBRARY 27 | 28 | Compatible with C99 and C++11 and later standards. Uses backtracking and relatively standard regex syntax. 29 | 30 | #include "remimu.h" 31 | 32 | FUNCTIONS 33 | 34 | // Returns 0 on success, or -1 on invalid or unsupported regex, or -2 on not enough tokens given to parse regex. 35 | int regex_parse( 36 | const char * pattern, // Regex pattern to parse. 37 | RegexToken * tokens, // Output buffer of token_count regex tokens. 38 | int16_t * token_count, // Maximum allowed number of tokens to write 39 | int32_t flags // Optional bitflags. 40 | ) 41 | 42 | // Returns match length, or -1 on no match, or -2 on out of memory, or -3 if the regex is invalid. 43 | int64_t regex_match( 44 | const RegexToken * tokens, // Parsed regex to match against text. 45 | const char * text, // Text to match against tokens. 46 | size_t start_i, // index value to match at. 47 | uint16_t cap_slots, // Number of allowed capture info output slots. 48 | int64_t * cap_pos, // Capture position info output buffer. 49 | int64_t * cap_span // Capture length info output buffer. 
50 | ) 51 | 52 | void print_regex_tokens( 53 | RegexToken * tokens // Regex tokens to spew to stdout, for debugging. 54 | ) 55 | 56 | PERFORMANCE 57 | 58 | On simple cases, Remimu's match speed is similar to PCRE2. Regex parsing/compilation is also much faster (around 4x to 10x), so single-shot regexes are often faster than PCRE2. 59 | 60 | HOWEVER: Remimu is a pure backtracking engine, and has `O(2^x)` complexity on regexes with catastrophic backtracking. It can be much, much, MUCH slower than PCRE2. Beware! 61 | 62 | Remimu uses length-checked fixed memory buffers with no recursion, so memory usage is statically known. 63 | 64 | FEATURES 65 | 66 | - Lowest-common-denominator common regex syntax 67 | - Based on backtracking (slow in the worst case, but fast in the best case) 68 | - 8-bit only, no utf-16 or utf-32 69 | - Statically known memory usage (no heap allocation or recursion) 70 | - Groups with or without capture, and with or without quantifiers 71 | - Supported escapes: 72 | - - 2-digit hex: e.g. \x00, \xFF, or lowercase, or mixed case 73 | - - \r, \n, \t, \v, \f (whitespace characters) 74 | - - \d, \s, \w, \D, \S, \W (digit, space, and word character classes) 75 | - - \b, \B word boundary and non-word-boundary anchors (not fully supported in zero-size quantified groups, but even then, usually supported) 76 | - - Escaped literal characters: {}[]-()|^$*+?:./\ 77 | - - - Escapes work in character classes, except for 'b' 78 | - Character classes, including disjoint ranges, proper handling of bare [ and trailing -, etc 79 | - - Dot (.) matches all characters, including newlines, unless REMIMU_FLAG_DOT_NO_NEWLINES is passed as a flag to regex_parse 80 | - - Dot (.) only matches at most one byte at a time, so matching \r\n requires two dots (and not using REMIMU_FLAG_DOT_NO_NEWLINES) 81 | - Anchors (^ and $) 82 | - - Same support caveats as \b, \B apply 83 | - Basic quantifiers (*, +, ?) 84 | - - Quantifiers are greedy by default. 
85 | - Explicit quantifiers ({2}, {5}, {5,}, {5,7}) 86 | - Alternation e.g. (asdf|foo) 87 | - Lazy quantifiers e.g. (asdf)*? or \w+? 88 | - Possessive greedy quantifiers e.g. (asdf)*+ or \w++ 89 | - - NOTE: Capture groups for and inside of possessive groups return no capture information. 90 | - Atomic groups e.g. (?>(asdf)) 91 | - - NOTE: Capture groups inside of atomic groups return no capture information. 92 | 93 | NOT SUPPORTED 94 | 95 | - Strings with non-terminal null characters 96 | - Unicode character classes (matching single utf-8 characters works regardless) 97 | - Exact POSIX regex semantics (posix-style greediness etc) 98 | - Backreferences 99 | - Lookbehind/Lookahead 100 | - Named groups 101 | - Most other weird flavor-specific regex stuff 102 | - Capture of or inside of possessive-quantified groups (still take up a capture slot, but no data is returned) 103 | 104 | USAGE 105 | 106 | // minimal: 107 | 108 | RegexToken tokens[1024]; 109 | int16_t token_count = 1024; 110 | int e = regex_parse("[0-9]+\\.[0-9]+", tokens, &token_count, 0); 111 | assert(!e); 112 | 113 | int64_t match_len = regex_match(tokens, "23.53) ", 0, 0, 0, 0); 114 | printf("########### return: %zd\n", match_len); 115 | 116 | // with captures: 117 | 118 | RegexToken tokens[256]; 119 | int16_t token_count = sizeof(tokens)/sizeof(tokens[0]); 120 | int e = regex_parse("((a)|(b))++", tokens, &token_count, 0); 121 | assert(!e); 122 | 123 | int64_t cap_pos[5]; 124 | int64_t cap_span[5]; 125 | memset(cap_pos, 0xFF, sizeof(cap_pos)); 126 | memset(cap_span, 0xFF, sizeof(cap_span)); 127 | 128 | int64_t matchlen = regex_match(tokens, "aaaaaabbbabaqa", 0, 5, cap_pos, cap_span); 129 | printf("Match length: %zd\n", matchlen); 130 | for (int i = 0; i < 5; i++) 131 | printf("Capture %d: %zd plus %zd\n", i, cap_pos[i], cap_span[i]); 132 | 133 | // for debugging 134 | print_regex_tokens(tokens); 135 | 136 | LICENSE 137 | 138 | Creative Commons Zero, public domain. 
// */
// (The line above terminates the documentation block comment that opens near the top of this header.)

// Standard headers for the identifiers used below: assert(), ptrdiff_t/size_t,
// fixed-width integers, puts()/printf(), and memset()/strlen().
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Fallback only; a no-op whenever the configuration section at the top of the
// header (or the including project) has already defined the prefix.
#ifndef REMIMU_CONST_VISIBILITY
#define REMIMU_CONST_VISIBILITY static const
#endif

// Flags accepted by regex_parse().
REMIMU_CONST_VISIBILITY int REMIMU_FLAG_DOT_NO_NEWLINES = 1;

// Token kinds (RegexToken.kind).
REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_NORMAL = 0; // one byte matched against the token's 256-bit mask
REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_OPEN = 1;   // ( capturing group
REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_NCOPEN = 2; // (?: non-capturing group (also used for atomic groups)
REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_CLOSE = 3;  // )
REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_OR = 4;     // |
REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_CARET = 5;  // ^
REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_DOLLAR = 6; // $
REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_BOUND = 7;  // \b
REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_NBOUND = 8; // \B
REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_END = 9;    // terminator written after the last real token

// Quantifier modes (RegexToken.mode). Greedy is the absence of both bits.
REMIMU_CONST_VISIBILITY uint8_t REMIMU_MODE_POSSESSIVE = 1;
REMIMU_CONST_VISIBILITY uint8_t REMIMU_MODE_LAZY = 2;
REMIMU_CONST_VISIBILITY uint8_t REMIMU_MODE_INVERTED = 128; // temporary; gets cleared later

// One parsed regex token.
typedef struct _RegexToken {
    uint8_t kind;      // one of the REMIMU_KIND_* values
    uint8_t mode;      // bitwise OR of REMIMU_MODE_* values
    uint16_t count_lo; // minimum repetition count
    uint16_t count_hi; // first repetition count that is NOT allowed (max + 1); 0 means no limit
    // For REMIMU_KIND_NORMAL: 256-bit set of accepted bytes; byte b is bit (b & 0xF) of mask[b >> 4].
    // For groups: mask 0 stores group-with-quantifier number (quantifiers are +, *, ?, {n}, {n,}, or {n,m}),
    // and for ( tokens mask 15 stores the distance to the next | or ) of the same group.
    uint16_t mask[16];
    // From ( or ), offset in token list to matching paren; from |, offset to the next | or ).
    // TODO: move into mask maybe
    int16_t pair_offset;
} RegexToken;

// Converts one hexadecimal digit (0-9, A-F, a-f) to its value.
// Returns 0 and stores the value (0..15) in *bin on success.
// Returns -1 and leaves *bin untouched if hex is not a hexadecimal digit.
static int remimu_nibble_hex_to_bin(char hex, uint8_t *bin)
{
    if (hex >= '0' && hex <= '9')
        *bin = (uint8_t)(hex - '0');
    else if (hex >= 'A' && hex <= 'F')
        *bin = (uint8_t)(hex - 'A' + 10);
    else if (hex >= 'a' && hex <= 'f')
        *bin = (uint8_t)(hex - 'a' + 10);
    else
        return -1; // invalid hex digit
    return 0;
}

/// Returns a negative number on failure:
/// -1: Regex string is invalid or using unsupported features or too long.
196 | /// -2: Provided buffer not long enough. Give up, or reallocate with more length and retry. 197 | /// Returns 0 on success. 198 | /// On call, token_count pointer must point to the number of tokens that can be written to the tokens buffer. 199 | /// On successful return, the number of actually used tokens is written to token_count. 200 | /// Sets token_count to zero if a regex is not created but no error happened (e.g. empty pattern). 201 | /// Flags: Not yet used. 202 | /// SAFETY: Pattern must be null-terminated. 203 | /// SAFETY: tokens buffer must have at least the input token_count number of RegexToken objects. They are allowed to be uninitialized. 204 | REMIMU_FUNC_VISIBILITY int regex_parse(const char * pattern, RegexToken * tokens, int16_t * token_count, int32_t flags) 205 | { 206 | int64_t tokens_len = *token_count; 207 | uint64_t pattern_len = strlen(pattern); 208 | if (token_count == 0) 209 | return -2; 210 | 211 | // 0: normal 212 | // 1: just saw a backslash 213 | int esc_state = 0; 214 | 215 | // 0: init 216 | // 1: normal 217 | // 2: in char class, initial state 218 | // 3: in char class, but possibly looking for a range marker 219 | // 4: in char class, but just saw a range marker 220 | // 5: immediately after quantifiable token 221 | // 6: immediately after quantifier 222 | 223 | const int STATE_NORMAL = 1; 224 | const int STATE_QUANT = 2; 225 | const int STATE_MODE = 3; 226 | const int STATE_CC_INIT = 4; 227 | const int STATE_CC_NORMAL = 5; 228 | const int STATE_CC_RANGE = 6; 229 | int state = STATE_NORMAL; 230 | 231 | int char_class_mem = -1; 232 | 233 | RegexToken token; 234 | 235 | #define _REGEX_CLEAR_TOKEN() do { \ 236 | memset(&token, 0, sizeof(RegexToken)); \ 237 | token.count_lo = 1; \ 238 | token.count_hi = 2; \ 239 | } while(0) 240 | 241 | _REGEX_CLEAR_TOKEN(); 242 | 243 | #define _REGEX_DO_INVERT() do { \ 244 | for (int n = 0; n < 16; n++) \ 245 | token.mask[n] = ~token.mask[n]; \ 246 | token.mode &= ~REMIMU_MODE_INVERTED; \ 247 | 
} while (0) 248 | 249 | int16_t k = 0; 250 | 251 | #define _REGEX_PUSH_TOKEN() do { \ 252 | if (k == 0 || tokens[k-1].kind != token.kind || (token.kind != REMIMU_KIND_BOUND && token.kind != REMIMU_KIND_NBOUND)) \ 253 | { \ 254 | if (token.mode & REMIMU_MODE_INVERTED) _REGEX_DO_INVERT(); \ 255 | if (k >= tokens_len) \ 256 | { \ 257 | REMIMU_LOG_ERROR("buffer overflow"); \ 258 | return -2; \ 259 | } \ 260 | tokens[k++] = token; \ 261 | _REGEX_CLEAR_TOKEN(); \ 262 | } \ 263 | } while (0) 264 | 265 | #define _REGEX_SET_MASK(byte) do { token.mask[((uint8_t)(byte))>>4] |= 1 << ((uint8_t)(byte) & 0xF); } while (0) 266 | #define _REGEX_SET_MASK_ALL() do { \ 267 | for (int n = 0; n < 16; n++) \ 268 | token.mask[n] = 0xFFFF; \ 269 | } while (0) 270 | 271 | // start with an invisible group specifier 272 | // (this allows the matcher to not need to have a special root-level alternation operator case) 273 | token.kind = REMIMU_KIND_OPEN; 274 | token.count_lo = 0; 275 | token.count_hi = 0; 276 | 277 | int paren_count = 0; 278 | 279 | for (uint64_t i = 0; i < pattern_len; i++) 280 | { 281 | char c = pattern[i]; 282 | if (state == STATE_QUANT) 283 | { 284 | state = STATE_MODE; 285 | if (c == '?') 286 | { 287 | token.count_lo = 0; 288 | token.count_hi = 2; // first non-allowed amount 289 | continue; 290 | } 291 | else if (c == '+') 292 | { 293 | token.count_lo = 1; 294 | token.count_hi = 0; // unlimited 295 | continue; 296 | } 297 | else if (c == '*') 298 | { 299 | token.count_lo = 0; 300 | token.count_hi = 0; // unlimited 301 | continue; 302 | } 303 | else if (c == '{') 304 | { 305 | if (pattern[i+1] == 0 || pattern[i+1] < '0' || pattern[i+1] > '9') 306 | state = STATE_NORMAL; 307 | else 308 | { 309 | i += 1; 310 | uint32_t val = 0; 311 | while (pattern[i] >= '0' && pattern[i] <= '9') 312 | { 313 | val *= 10; 314 | val += (uint32_t)(pattern[i] - '0'); 315 | if (val > 0xFFFF) 316 | { 317 | REMIMU_LOG_ERROR("quantifier range too long"); 318 | return -1; // unsupported length 319 | } 
320 | i += 1; 321 | } 322 | token.count_lo = val; 323 | token.count_hi = val + 1; 324 | if (pattern[i] == ',') 325 | { 326 | token.count_hi = 0; // unlimited 327 | i += 1; 328 | 329 | if (pattern[i] >= '0' && pattern[i] <= '9') 330 | { 331 | uint32_t val2 = 0; 332 | while (pattern[i] >= '0' && pattern[i] <= '9') 333 | { 334 | val2 *= 10; 335 | val2 += (uint32_t)(pattern[i] - '0'); 336 | if (val2 > 0xFFFF) 337 | { 338 | REMIMU_LOG_ERROR("quantifier range too long"); 339 | return -1; // unsupported length 340 | } 341 | i += 1; 342 | } 343 | if (val2 < val) 344 | { 345 | REMIMU_LOG_ERROR("quantifier range is backwards"); 346 | return -1; // unsupported length 347 | } 348 | token.count_hi = val2 + 1; 349 | } 350 | } 351 | 352 | if (pattern[i] == '}') 353 | { 354 | // quantifier range parsed successfully 355 | continue; 356 | } 357 | else 358 | { 359 | REMIMU_LOG_ERROR("quantifier range syntax broken (no terminator)"); 360 | return -1; 361 | } 362 | } 363 | } 364 | } 365 | 366 | if (state == STATE_MODE) 367 | { 368 | state = STATE_NORMAL; 369 | if (c == '?') 370 | { 371 | token.mode |= REMIMU_MODE_LAZY; 372 | continue; 373 | } 374 | else if (c == '+') 375 | { 376 | token.mode |= REMIMU_MODE_POSSESSIVE; 377 | continue; 378 | } 379 | } 380 | 381 | if (state == STATE_NORMAL) 382 | { 383 | if (esc_state == 1) 384 | { 385 | esc_state = 0; 386 | if (c == 'n') 387 | _REGEX_SET_MASK('\n'); 388 | else if (c == 'r') 389 | _REGEX_SET_MASK('\r'); 390 | else if (c == 't') 391 | _REGEX_SET_MASK('\t'); 392 | else if (c == 'v') 393 | _REGEX_SET_MASK('\v'); 394 | else if (c == 'f') 395 | _REGEX_SET_MASK('\f'); 396 | else if (c == 'x') 397 | { 398 | if (pattern[i+1] == 0 || pattern[i+2] == 0) 399 | return -1; // too-short hex pattern 400 | uint8_t n0, n1; 401 | if (remimu_nibble_hex_to_bin(pattern[i+1], &n0)) 402 | return -1; // invalid hex 403 | if (remimu_nibble_hex_to_bin(pattern[i+2], &n1)) 404 | return -1; // invalid hex 405 | _REGEX_SET_MASK((n0 << 4) | n1); 406 | i += 2; 407 | 
state = STATE_QUANT; 408 | } 409 | else if (c == '{' || c == '}' || 410 | c == '[' || c == ']' || c == '-' || 411 | c == '(' || c == ')' || 412 | c == '|' || c == '^' || c == '$' || 413 | c == '*' || c == '+' || c == '?' || c == ':' || 414 | c == '.' || c == '/' || c == '\\') 415 | { 416 | _REGEX_SET_MASK(c); 417 | state = STATE_QUANT; 418 | } 419 | else if (c == 'd' || c == 's' || c == 'w' || 420 | c == 'D' || c == 'S' || c == 'W') 421 | { 422 | uint8_t is_upper = c <= 'Z'; 423 | 424 | uint16_t m[16]; 425 | memset(m, 0, sizeof(m)); 426 | 427 | if (is_upper) 428 | c += 0x20; 429 | if (c == 'd' || c == 'w') 430 | m[3] |= 0x03FF; // 0~7 431 | if (c == 's') 432 | { 433 | m[0] |= 0x3E00; // \t-\r (includes \n, \v, and \f in the middle. 5 enabled bits.) 434 | m[2] |= 1; // ' ' 435 | } 436 | if (c == 'w') 437 | { 438 | m[4] |= 0xFFFE; // A-O 439 | m[5] |= 0x87FF; // P-Z_ 440 | m[6] |= 0xFFFE; // a-o 441 | m[7] |= 0x07FF; // p-z 442 | } 443 | 444 | for (int j = 0; j < 16; j++) 445 | token.mask[j] |= is_upper ? 
~m[j] : m[j]; 446 | 447 | token.kind = REMIMU_KIND_NORMAL; 448 | state = STATE_QUANT; 449 | } 450 | else if (c == 'b') 451 | { 452 | token.kind = REMIMU_KIND_BOUND; 453 | state = STATE_NORMAL; 454 | } 455 | else if (c == 'B') 456 | { 457 | token.kind = REMIMU_KIND_NBOUND; 458 | state = STATE_NORMAL; 459 | } 460 | else 461 | { 462 | REMIMU_LOG_ERROR("unsupported escape sequence"); 463 | return -1; // unknown/unsupported escape sequence 464 | } 465 | } 466 | else 467 | { 468 | _REGEX_PUSH_TOKEN(); 469 | if (c == '\\') 470 | { 471 | esc_state = 1; 472 | } 473 | else if (c == '[') 474 | { 475 | state = STATE_CC_INIT; 476 | char_class_mem = -1; 477 | token.kind = REMIMU_KIND_NORMAL; 478 | if (pattern[i + 1] == '^') 479 | { 480 | token.mode |= REMIMU_MODE_INVERTED; 481 | i += 1; 482 | } 483 | } 484 | else if (c == '(') 485 | { 486 | paren_count += 1; 487 | state = STATE_NORMAL; 488 | token.kind = REMIMU_KIND_OPEN; 489 | token.count_lo = 0; 490 | token.count_hi = 1; 491 | if (pattern[i + 1] == '?' && pattern[i + 2] == ':') 492 | { 493 | token.kind = REMIMU_KIND_NCOPEN; 494 | i += 2; 495 | } 496 | else if (pattern[i + 1] == '?' 
&& pattern[i + 2] == '>') 497 | { 498 | token.kind = REMIMU_KIND_NCOPEN; 499 | _REGEX_PUSH_TOKEN(); 500 | 501 | state = STATE_NORMAL; 502 | token.kind = REMIMU_KIND_NCOPEN; 503 | token.mode = REMIMU_MODE_POSSESSIVE; 504 | token.count_lo = 1; 505 | token.count_hi = 2; 506 | 507 | i += 2; 508 | } 509 | } 510 | else if (c == ')') 511 | { 512 | paren_count -= 1; 513 | if (paren_count < 0 || k == 0) 514 | return -1; // unbalanced parens 515 | token.kind = REMIMU_KIND_CLOSE; 516 | state = STATE_QUANT; 517 | 518 | int balance = 0; 519 | ptrdiff_t found = -1; 520 | for (ptrdiff_t l = k - 1; l >= 0; l--) 521 | { 522 | if (tokens[l].kind == REMIMU_KIND_NCOPEN || tokens[l].kind == REMIMU_KIND_OPEN) 523 | { 524 | if (balance == 0) 525 | { 526 | found = l; 527 | break; 528 | } 529 | else 530 | balance -= 1; 531 | } 532 | else if (tokens[l].kind == REMIMU_KIND_CLOSE) 533 | balance += 1; 534 | } 535 | if (found == -1) 536 | return -1; // unbalanced parens 537 | ptrdiff_t diff = k - found; 538 | if (diff > 32767) 539 | return -1; // too long 540 | token.pair_offset = -diff; 541 | tokens[found].pair_offset = diff; 542 | // phantom group for atomic group emulation 543 | if (tokens[found].mode == REMIMU_MODE_POSSESSIVE) 544 | { 545 | _REGEX_PUSH_TOKEN(); 546 | token.kind = REMIMU_KIND_CLOSE; 547 | token.mode = REMIMU_MODE_POSSESSIVE; 548 | token.pair_offset = -diff - 2; 549 | tokens[found - 1].pair_offset = diff + 2; 550 | } 551 | } 552 | else if (c == '?' 
|| c == '+' || c == '*' || c == '{') 553 | { 554 | REMIMU_LOG_ERROR("quantifier in non-quantifier context"); 555 | return -1; // quantifier in non-quantifier context 556 | } 557 | else if (c == '.') 558 | { 559 | //puts("setting ALL of mask..."); 560 | _REGEX_SET_MASK_ALL(); 561 | if (flags & REMIMU_FLAG_DOT_NO_NEWLINES) 562 | { 563 | token.mask[1] ^= 0x04; // \n 564 | token.mask[1] ^= 0x20; // \r 565 | } 566 | state = STATE_QUANT; 567 | } 568 | else if (c == '^') 569 | { 570 | token.kind = REMIMU_KIND_CARET; 571 | state = STATE_NORMAL; 572 | } 573 | else if (c == '$') 574 | { 575 | token.kind = REMIMU_KIND_DOLLAR; 576 | state = STATE_NORMAL; 577 | } 578 | else if (c == '|') 579 | { 580 | token.kind = REMIMU_KIND_OR; 581 | state = STATE_NORMAL; 582 | } 583 | else 584 | { 585 | _REGEX_SET_MASK(c); 586 | state = STATE_QUANT; 587 | } 588 | } 589 | } 590 | else if (state == STATE_CC_INIT || state == STATE_CC_NORMAL || state == STATE_CC_RANGE) 591 | { 592 | if (c == '\\' && esc_state == 0) 593 | { 594 | esc_state = 1; 595 | continue; 596 | } 597 | uint8_t esc_c = 0; 598 | if (esc_state == 1) 599 | { 600 | esc_state = 0; 601 | if (c == 'n') 602 | esc_c = '\n'; 603 | else if (c == 'r') 604 | esc_c = '\r'; 605 | else if (c == 't') 606 | esc_c = '\t'; 607 | else if (c == 'v') 608 | esc_c = '\v'; 609 | else if (c == 'f') 610 | esc_c = '\f'; 611 | else if (c == 'x') 612 | { 613 | if (pattern[i+1] == 0 || pattern[i+2] == 0) 614 | return -1; // too-short hex pattern 615 | uint8_t n0, n1; 616 | if (remimu_nibble_hex_to_bin(pattern[i+1], &n0)) 617 | return -1; // invalid hex 618 | if (remimu_nibble_hex_to_bin(pattern[i+2], &n1)) 619 | return -1; // invalid hex 620 | esc_c = (n0 << 4) | n1; 621 | i += 2; 622 | } 623 | else if (c == '{' || c == '}' || 624 | c == '[' || c == ']' || c == '-' || 625 | c == '(' || c == ')' || 626 | c == '|' || c == '^' || c == '$' || 627 | c == '*' || c == '+' || c == '?' || c == ':' || 628 | c == '.' 
|| c == '/' || c == '\\') 629 | { 630 | esc_c = c; 631 | } 632 | else if (c == 'd' || c == 's' || c == 'w' || 633 | c == 'D' || c == 'S' || c == 'W') 634 | { 635 | if (state == STATE_CC_RANGE) 636 | { 637 | REMIMU_LOG_ERROR("tried to use a shorthand as part of a range"); 638 | return -1; // range shorthands can't be part of a range 639 | } 640 | uint8_t is_upper = c <= 'Z'; 641 | 642 | uint16_t m[16]; 643 | memset(m, 0, sizeof(m)); 644 | 645 | if (is_upper) 646 | c += 0x20; 647 | if (c == 'd' || c == 'w') 648 | m[3] |= 0x03FF; // 0~7 649 | if (c == 's') 650 | { 651 | m[0] |= 0x3E00; // \t-\r (includes \n, \v, and \f in the middle. 5 enabled bits.) 652 | m[2] |= 1; // ' ' 653 | } 654 | if (c == 'w') 655 | { 656 | m[4] |= 0xFFFE; // A-O 657 | m[5] |= 0x87FF; // P-Z_ 658 | m[6] |= 0xFFFE; // a-o 659 | m[7] |= 0x07FF; // p-z 660 | } 661 | 662 | for (int j = 0; j < 16; j++) 663 | token.mask[j] |= is_upper ? ~m[j] : m[j]; 664 | 665 | char_class_mem = -1; // range shorthands can't be part of a range 666 | continue; 667 | } 668 | else 669 | { 670 | printf("unknown/unsupported escape sequence in character class (\\%c)\n", c); 671 | return -1; // unknown/unsupported escape sequence 672 | } 673 | } 674 | if (state == STATE_CC_INIT) 675 | { 676 | uint8_t val = esc_c ? esc_c : (uint8_t)c; 677 | char_class_mem = val; 678 | _REGEX_SET_MASK(val); 679 | state = STATE_CC_NORMAL; 680 | } 681 | else if (state == STATE_CC_NORMAL) 682 | { 683 | if (c == ']' && esc_c == 0) 684 | { 685 | char_class_mem = -1; 686 | state = STATE_QUANT; 687 | continue; 688 | } 689 | else if (c == '-' && esc_c == 0 && char_class_mem >= 0) 690 | { 691 | state = STATE_CC_RANGE; 692 | continue; 693 | } 694 | else 695 | { 696 | uint8_t val = esc_c ? 
esc_c : (uint8_t)c; 697 | char_class_mem = val; 698 | _REGEX_SET_MASK(val); 699 | state = STATE_CC_NORMAL; 700 | } 701 | } 702 | else if (state == STATE_CC_RANGE) 703 | { 704 | if (c == ']' && esc_c == 0) 705 | { 706 | char_class_mem = -1; 707 | _REGEX_SET_MASK('-'); 708 | state = STATE_QUANT; 709 | continue; 710 | } 711 | else 712 | { 713 | if (char_class_mem == -1) 714 | { 715 | REMIMU_LOG_ERROR("character class range is broken"); 716 | return -1; // probably tried to use a character class shorthand as part of a range 717 | } 718 | uint8_t rhs = esc_c ? esc_c : (uint8_t)c; 719 | if (rhs < (uint8_t)char_class_mem) 720 | { 721 | REMIMU_LOG_ERROR("character class range is misordered"); 722 | return -1; // range is in wrong order 723 | } 724 | //printf("enabling char class from %d to %d...\n", char_class_mem, c); 725 | for (uint8_t j = rhs; j > (uint8_t)char_class_mem; j--) 726 | _REGEX_SET_MASK(j); 727 | state = STATE_CC_NORMAL; 728 | char_class_mem = -1; 729 | } 730 | } 731 | } 732 | else 733 | REMIMU_ASSERT(0); 734 | } 735 | if (paren_count > 0) 736 | { 737 | REMIMU_LOG_ERROR("(paren_count > 0)"); 738 | return -1; // unbalanced parens 739 | } 740 | if (esc_state != 0) 741 | { 742 | REMIMU_LOG_ERROR("(esc_state != 0)"); 743 | return -1; // open escape sequence 744 | } 745 | if (state >= STATE_CC_INIT) 746 | { 747 | REMIMU_LOG_ERROR("(state >= STATE_CC_INIT)"); 748 | return -1; // open character class 749 | } 750 | 751 | _REGEX_PUSH_TOKEN(); 752 | 753 | // add invisible non-capturing group specifier 754 | token.kind = REMIMU_KIND_CLOSE; 755 | token.count_lo = 1; 756 | token.count_hi = 2; 757 | _REGEX_PUSH_TOKEN(); 758 | 759 | // add end token (tells matcher that it's done) 760 | token.kind = REMIMU_KIND_END; 761 | _REGEX_PUSH_TOKEN(); 762 | 763 | tokens[0].pair_offset = k - 2; 764 | tokens[k-2].pair_offset = -(k - 2); 765 | 766 | *token_count = k; 767 | 768 | // copy quantifiers from )s to (s (so (s know whether they're optional) 769 | // also take the opportunity 
to smuggle "quantified group index" into the mask field for the ) 770 | uint64_t n = 0; 771 | for (int16_t k2 = 0; k2 < k; k2++) 772 | { 773 | if (tokens[k2].kind == REMIMU_KIND_CLOSE) 774 | { 775 | tokens[k2].mask[0] = n++; 776 | 777 | int16_t k3 = k2 + tokens[k2].pair_offset; 778 | tokens[k3].count_lo = tokens[k2].count_lo; 779 | tokens[k3].count_hi = tokens[k2].count_hi; 780 | tokens[k3].mask[0] = n++; 781 | tokens[k3].mode = tokens[k2].mode; 782 | 783 | //if (n > 65535) 784 | if (n > 1024) 785 | return -1; // too many quantified groups 786 | } 787 | else if (tokens[k2].kind == REMIMU_KIND_OR || tokens[k2].kind == REMIMU_KIND_OPEN || tokens[k2].kind == REMIMU_KIND_NCOPEN) 788 | { 789 | // find next | or ) and how far away it is. store in token 790 | int balance = 0; 791 | ptrdiff_t found = -1; 792 | for (ptrdiff_t l = k2 + 1; l < k; l++) 793 | { 794 | if (tokens[l].kind == REMIMU_KIND_OR && balance == 0) 795 | { 796 | found = l; 797 | break; 798 | } 799 | else if (tokens[l].kind == REMIMU_KIND_CLOSE) 800 | { 801 | if (balance == 0) 802 | { 803 | found = l; 804 | break; 805 | } 806 | else 807 | balance -= 1; 808 | } 809 | else if (tokens[l].kind == REMIMU_KIND_NCOPEN || tokens[l].kind == REMIMU_KIND_OPEN) 810 | balance += 1; 811 | } 812 | if (found == -1) 813 | { 814 | REMIMU_LOG_ERROR("unbalanced parens..."); 815 | return -1; // unbalanced parens 816 | } 817 | ptrdiff_t diff = found - k2; 818 | if (diff > 32767) 819 | { 820 | REMIMU_LOG_ERROR("too long..."); 821 | return -1; // too long 822 | } 823 | 824 | if (tokens[k2].kind == REMIMU_KIND_OR) 825 | tokens[k2].pair_offset = diff; 826 | else 827 | tokens[k2].mask[15] = diff; 828 | } 829 | } 830 | 831 | #undef _REGEX_PUSH_TOKEN 832 | #undef _REGEX_SET_MASK 833 | #undef _REGEX_CLEAR_TOKEN 834 | 835 | return 0; 836 | } 837 | 838 | typedef struct _RegexMatcherState { 839 | uint32_t k; 840 | uint32_t group_state; // quantified group temp state (e.g. 
number of repetitions) 841 | uint32_t prev; // for )s, stack index of corresponding previous quantified state 842 | #ifdef REGEX_STACK_SMOL 843 | uint32_t i; 844 | uint32_t range_min; 845 | uint32_t range_max; 846 | #else 847 | uint64_t i; 848 | uint64_t range_min; 849 | uint64_t range_max; 850 | #endif 851 | } RegexMatcherState; 852 | 853 | // NOTE: undef'd later 854 | #define _REGEX_CHECK_MASK(K, byte) (!!(tokens[K].mask[((uint8_t)byte)>>4] & (1 << ((uint8_t)byte & 0xF)))) 855 | 856 | // Returns match length if text starts with a regex match. 857 | // Returns -1 if the text doesn't start with a regex match. 858 | // Returns -2 if the matcher ran out of memory or the regex is too complex. 859 | // Returns -3 if the regex is somehow invalid. 860 | // The first cap_slots capture positions and spans (lengths) will be written to cap_pos and cap_span. If zero, will not be written to. 861 | // SAFETY: The text variable must be null-terminated, and start_i must be the index of a character within the string or its null terminator. 862 | // SAFETY: Tokens array must be terminated by a REMIMU_KIND_END token (done by default by regex_parse). 863 | // SAFETY: Partial capture data may be written even if the match fails. 
864 | REMIMU_FUNC_VISIBILITY int64_t regex_match(const RegexToken * tokens, const char * text, size_t start_i, uint16_t cap_slots, int64_t * cap_pos, int64_t * cap_span) 865 | { 866 | (void)text; 867 | 868 | #ifdef REGEX_VERBOSE 869 | const uint8_t verbose = 1; 870 | #else 871 | const uint8_t verbose = 0; 872 | #endif 873 | 874 | #define IF_VERBOSE(X) { if (verbose) { X } } 875 | 876 | #ifdef REGEX_STACK_SMOL 877 | const uint16_t stack_size_max = 256; 878 | #else 879 | const uint16_t stack_size_max = 1024; 880 | #endif 881 | const uint16_t aux_stats_size = 1024; 882 | if (cap_slots > aux_stats_size) 883 | cap_slots = aux_stats_size; 884 | 885 | // quantified group state 886 | uint8_t q_group_accepts_zero[aux_stats_size]; 887 | uint32_t q_group_state[aux_stats_size]; // number of repetitions 888 | uint32_t q_group_stack[aux_stats_size]; // location of most recent corresponding ) on stack. 0 means nowhere 889 | 890 | uint16_t q_group_cap_index[aux_stats_size]; 891 | memset(q_group_cap_index, 0xFF, sizeof(q_group_cap_index)); 892 | 893 | uint64_t tokens_len = 0; 894 | uint32_t k = 0; 895 | uint16_t caps = 0; 896 | 897 | while (tokens[k].kind != REMIMU_KIND_END) 898 | { 899 | if (tokens[k].kind == REMIMU_KIND_OPEN && caps < cap_slots) 900 | { 901 | q_group_cap_index[tokens[k].mask[0]] = caps; 902 | q_group_cap_index[tokens[k + tokens[k].pair_offset].mask[0]] = caps; 903 | cap_pos[caps] = -1; 904 | cap_span[caps] = -1; 905 | caps += 1; 906 | } 907 | k += 1; 908 | if (tokens[k].kind == REMIMU_KIND_CLOSE || tokens[k].kind == REMIMU_KIND_OPEN || tokens[k].kind == REMIMU_KIND_NCOPEN) 909 | { 910 | if (tokens[k].mask[0] >= aux_stats_size) 911 | { 912 | REMIMU_LOG_ERROR("too many qualified groups. 
returning"); 913 | return -2; // OOM: too many quantified groups 914 | } 915 | 916 | q_group_state[tokens[k].mask[0]] = 0; 917 | q_group_stack[tokens[k].mask[0]] = 0; 918 | q_group_accepts_zero[tokens[k].mask[0]] = 0; 919 | } 920 | } 921 | 922 | tokens_len = k; 923 | 924 | RegexMatcherState rewind_stack[stack_size_max]; 925 | uint16_t stack_n = 0; 926 | 927 | uint64_t i = start_i; 928 | 929 | uint64_t range_min = 0; 930 | uint64_t range_max = 0; 931 | uint8_t just_rewinded = 0; 932 | 933 | #define _P_TEXT_HIGHLIGHTED() do { \ 934 | IF_VERBOSE(printf("\033[91m"); \ 935 | for (uint64_t q = 0; q < i; q++) printf("%c", text[q]); \ 936 | printf("\033[0m"); \ 937 | for (uint64_t q = i; text[q] != 0; q++) printf("%c", text[q]); \ 938 | printf("\n");) \ 939 | } while (0) 940 | 941 | #define _REWIND_DO_SAVE_RAW(K, ISDUMMY) do { \ 942 | if (stack_n >= stack_size_max) \ 943 | { \ 944 | REMIMU_LOG_ERROR("out of backtracking room. returning"); \ 945 | return -2; \ 946 | } \ 947 | RegexMatcherState s; \ 948 | memset(&s, 0, sizeof(RegexMatcherState)); \ 949 | s.i = i; \ 950 | s.k = (K); \ 951 | s.range_min = range_min; \ 952 | s.range_max = range_max; \ 953 | s.prev = 0; \ 954 | if (ISDUMMY) s.prev = 0xFAC7; \ 955 | else if (tokens[s.k].kind == REMIMU_KIND_CLOSE) \ 956 | { \ 957 | s.group_state = q_group_state[tokens[s.k].mask[0]]; \ 958 | s.prev = q_group_stack[tokens[s.k].mask[0]]; \ 959 | q_group_stack[tokens[s.k].mask[0]] = stack_n; \ 960 | } \ 961 | rewind_stack[stack_n++] = s; \ 962 | _P_TEXT_HIGHLIGHTED(); \ 963 | IF_VERBOSE(printf("-- saving rewind state k %u i %zd rmin %zu rmax %zd (line %d) (depth %d prev %d)\n", s.k, i, range_min, range_max, __LINE__, stack_n, s.prev);) \ 964 | } while (0) 965 | #define _REWIND_DO_SAVE_DUMMY(K) _REWIND_DO_SAVE_RAW(K, 1) 966 | #define _REWIND_DO_SAVE(K) _REWIND_DO_SAVE_RAW(K, 0) 967 | 968 | #define _REWIND_OR_ABORT() do { \ 969 | if (stack_n == 0) \ 970 | return -1; \ 971 | stack_n -= 1; \ 972 | while (stack_n > 0 && 
rewind_stack[stack_n].prev == 0xFAC7) stack_n -= 1; \ 973 | just_rewinded = 1; \ 974 | range_min = rewind_stack[stack_n].range_min; \ 975 | range_max = rewind_stack[stack_n].range_max; \ 976 | REMIMU_ASSERT(rewind_stack[stack_n].i <= i); \ 977 | i = rewind_stack[stack_n].i; \ 978 | k = rewind_stack[stack_n].k; \ 979 | if (tokens[k].kind == REMIMU_KIND_CLOSE) \ 980 | { \ 981 | q_group_state[tokens[k].mask[0]] = rewind_stack[stack_n].group_state; \ 982 | q_group_stack[tokens[k].mask[0]] = rewind_stack[stack_n].prev; \ 983 | } \ 984 | _P_TEXT_HIGHLIGHTED(); \ 985 | IF_VERBOSE(printf("-- rewound to k %u i %zd rmin %zu rmax %zd (kind %d prev %d)\n", k, i, range_min, range_max, tokens[k].kind, rewind_stack[stack_n].prev);) \ 986 | k -= 1; \ 987 | } while (0) 988 | // the -= 1 is because of the k++ in the for loop 989 | 990 | // used in boundary anchor checker 991 | uint64_t w_mask[16]; 992 | memset(w_mask, 0, sizeof(w_mask)); 993 | w_mask[3] = 0x03FF; 994 | w_mask[4] = 0xFFFE; 995 | w_mask[5] = 0x87FF; 996 | w_mask[6] = 0xFFFE; 997 | w_mask[7] = 0x07FF; 998 | #define _REGEX_CHECK_IS_W(byte) (!!(w_mask[((uint8_t)byte)>>4] & (1 << ((uint8_t)byte & 0xF)))) 999 | 1000 | int limit = REMIMU_ITERATION_LIMIT; 1001 | for (k = 0; k < tokens_len; k++) 1002 | { 1003 | if (REMIMU_ITERATION_LIMIT) 1004 | { 1005 | if (limit-- == 0) 1006 | { 1007 | REMIMU_LOG_ERROR("iteration limit exceeded. 
returning"); 1008 | return -2; 1009 | } 1010 | } 1011 | IF_VERBOSE(printf("k: %u\ti: %zu\tl: %zu\tstack_n: %d\n", k, i, limit, stack_n);) 1012 | _P_TEXT_HIGHLIGHTED(); 1013 | if (tokens[k].kind == REMIMU_KIND_CARET) 1014 | { 1015 | if (i != 0) 1016 | _REWIND_OR_ABORT(); 1017 | continue; 1018 | } 1019 | else if (tokens[k].kind == REMIMU_KIND_DOLLAR) 1020 | { 1021 | if (text[i] != 0) 1022 | _REWIND_OR_ABORT(); 1023 | continue; 1024 | } 1025 | else if (tokens[k].kind == REMIMU_KIND_BOUND) 1026 | { 1027 | if (i == 0 && !_REGEX_CHECK_IS_W(text[i])) 1028 | _REWIND_OR_ABORT(); 1029 | else if (i != 0 && text[i] == 0 && !_REGEX_CHECK_IS_W(text[i-1])) 1030 | _REWIND_OR_ABORT(); 1031 | else if (i != 0 && text[i] != 0 && _REGEX_CHECK_IS_W(text[i-1]) == _REGEX_CHECK_IS_W(text[i])) 1032 | _REWIND_OR_ABORT(); 1033 | } 1034 | else if (tokens[k].kind == REMIMU_KIND_NBOUND) 1035 | { 1036 | if (i == 0 && _REGEX_CHECK_IS_W(text[i])) 1037 | _REWIND_OR_ABORT(); 1038 | else if (i != 0 && text[i] == 0 && _REGEX_CHECK_IS_W(text[i-1])) 1039 | _REWIND_OR_ABORT(); 1040 | else if (i != 0 && text[i] != 0 && _REGEX_CHECK_IS_W(text[i-1]) != _REGEX_CHECK_IS_W(text[i])) 1041 | _REWIND_OR_ABORT(); 1042 | } 1043 | else 1044 | { 1045 | // deliberately unmatchable token (e.g. a{0}, a{0,0}) 1046 | if (tokens[k].count_hi == 1) 1047 | { 1048 | if (tokens[k].kind == REMIMU_KIND_OPEN || tokens[k].kind == REMIMU_KIND_NCOPEN) 1049 | k += tokens[k].pair_offset; 1050 | else 1051 | k += 1; 1052 | continue; 1053 | } 1054 | 1055 | if (tokens[k].kind == REMIMU_KIND_OPEN || tokens[k].kind == REMIMU_KIND_NCOPEN) 1056 | { 1057 | if (!just_rewinded) 1058 | { 1059 | IF_VERBOSE(printf("hit OPEN. 
i is %zd, depth is %d\n", i, stack_n);) 1060 | // need this to be able to detect and reject zero-size matches 1061 | //q_group_state[tokens[k].mask[0]] = i; 1062 | 1063 | // if we're lazy and the min length is 0, we need to try the non-group case first 1064 | if ((tokens[k].mode & REMIMU_MODE_LAZY) && (tokens[k].count_lo == 0 || q_group_accepts_zero[tokens[k + tokens[k].pair_offset].mask[0]])) 1065 | { 1066 | IF_VERBOSE(puts("trying non-group case first.....");) 1067 | range_min = 0; 1068 | range_max = 0; 1069 | _REWIND_DO_SAVE(k); 1070 | k += tokens[k].pair_offset; // automatic += 1 will put us past the matching ) 1071 | } 1072 | else 1073 | { 1074 | range_min = 1; 1075 | range_max = 0; 1076 | _REWIND_DO_SAVE(k); 1077 | } 1078 | } 1079 | else 1080 | { 1081 | IF_VERBOSE(printf("rewinded into OPEN. i is %zd, depth is %d\n", i, stack_n);) 1082 | just_rewinded = 0; 1083 | 1084 | uint64_t orig_k = k; 1085 | 1086 | IF_VERBOSE(printf("--- trying to try another alternation, start k is %d, rmin is %zu\n", k, range_min);) 1087 | 1088 | if (range_min != 0) 1089 | { 1090 | IF_VERBOSE(puts("rangemin is not zero. checking...");) 1091 | k += range_min; 1092 | IF_VERBOSE(printf("start kind: %d\n", tokens[k].kind);) 1093 | IF_VERBOSE(printf("before start kind: %d\n", tokens[k-1].kind);) 1094 | if (tokens[k-1].kind == REMIMU_KIND_OR) 1095 | k += tokens[k-1].pair_offset - 1; 1096 | else if (tokens[k-1].kind == REMIMU_KIND_OPEN || tokens[k-1].kind == REMIMU_KIND_NCOPEN) 1097 | k += tokens[k-1].mask[15] - 1; 1098 | 1099 | IF_VERBOSE(printf("kamakama %d %d\n", k, tokens[k].kind);) 1100 | 1101 | if (tokens[k].kind == REMIMU_KIND_END) // unbalanced parens 1102 | return -3; 1103 | 1104 | IF_VERBOSE(printf("---?!?! %d, %d\n", k, q_group_state[tokens[k].mask[0]]);) 1105 | if (tokens[k].kind == REMIMU_KIND_CLOSE) 1106 | { 1107 | IF_VERBOSE(puts("!!~!~!~~~~!!~~!~ hit CLOSE. 
rewinding");) 1108 | // do nothing and continue on if we don't need this group 1109 | if (tokens[k].count_lo == 0 || q_group_accepts_zero[tokens[k].mask[0]]) 1110 | { 1111 | IF_VERBOSE(puts("continuing because we don't need this group");) 1112 | q_group_state[tokens[k].mask[0]] = 0; 1113 | 1114 | if (!(tokens[k].mode & REMIMU_MODE_LAZY)) 1115 | q_group_stack[tokens[k].mask[0]] = 0; 1116 | 1117 | continue; 1118 | } 1119 | // otherwise go to the last point before the group 1120 | else 1121 | { 1122 | IF_VERBOSE(puts("going to last point before this group");) 1123 | _REWIND_OR_ABORT(); 1124 | continue; 1125 | } 1126 | } 1127 | 1128 | REMIMU_ASSERT(tokens[k].kind == REMIMU_KIND_OR); 1129 | } 1130 | 1131 | IF_VERBOSE(printf("--- FOUND ALTERNATION for paren at k %zd at k %d\n", orig_k, k);) 1132 | 1133 | ptrdiff_t k_diff = k - orig_k; 1134 | range_min = k_diff + 1; 1135 | 1136 | IF_VERBOSE(puts("(saving in paren after rewinding and looking for next regex token to check)");) 1137 | IF_VERBOSE(printf("%zd\n", range_min);) 1138 | _REWIND_DO_SAVE(k - k_diff); 1139 | } 1140 | } 1141 | else if (tokens[k].kind == REMIMU_KIND_CLOSE) 1142 | { 1143 | // unquantified 1144 | if (tokens[k].count_lo == 1 && tokens[k].count_hi == 2) 1145 | { 1146 | // for captures 1147 | uint16_t cap_index = q_group_cap_index[tokens[k].mask[0]]; 1148 | if (cap_index != 0xFFFF) 1149 | _REWIND_DO_SAVE_DUMMY(k); 1150 | } 1151 | // quantified 1152 | else 1153 | { 1154 | IF_VERBOSE(puts("closer test.....");) 1155 | if (!just_rewinded) 1156 | { 1157 | uint32_t prev = q_group_stack[tokens[k].mask[0]]; 1158 | 1159 | IF_VERBOSE(printf("qrqrqrqrqrqrqrq------- k %d, gs %d, gaz %d, i %zd, tklo %d, rmin %zd, tkhi %d, rmax %zd, prev %d, sn %d\n", k, q_group_state[tokens[k].mask[0]], q_group_accepts_zero[tokens[k].mask[0]], i, tokens[k].count_lo, range_min, tokens[k].count_hi, range_max, prev, stack_n);) 1160 | 1161 | range_max = tokens[k].count_hi; 1162 | range_max -= 1; 1163 | range_min = 
q_group_accepts_zero[tokens[k].mask[0]] ? 0 : tokens[k].count_lo; 1164 | //REMIMU_ASSERT(q_group_state[tokens[k + tokens[k].pair_offset].mask[0]] <= i); 1165 | //if (prev) REMIMU_ASSERT(rewind_stack[prev].i <= i); 1166 | IF_VERBOSE(printf("qzqzqzqzqzqzqzq------- rmin %zd, rmax %zd\n", range_min, range_max);) 1167 | 1168 | // minimum requirement not yet met 1169 | if (q_group_state[tokens[k].mask[0]] + 1 < range_min) 1170 | { 1171 | IF_VERBOSE(puts("continuing minimum matches for a quantified group");) 1172 | q_group_state[tokens[k].mask[0]] += 1; 1173 | _REWIND_DO_SAVE(k); 1174 | 1175 | k += tokens[k].pair_offset; // back to start of group 1176 | k -= 1; // ensure we actually hit the group node next and not the node after it 1177 | continue; 1178 | } 1179 | // maximum allowance exceeded 1180 | else if (tokens[k].count_hi != 0 && q_group_state[tokens[k].mask[0]] + 1 > range_max) 1181 | { 1182 | IF_VERBOSE(printf("hit maximum allowed instances of a quantified group %d %zd\n", q_group_state[tokens[k].mask[0]], range_max);) 1183 | range_max -= 1; 1184 | _REWIND_OR_ABORT(); 1185 | continue; 1186 | } 1187 | 1188 | // fallback case to detect zero-length matches when we backtracked into the inside of this group 1189 | // after an attempted parse of a second copy of itself 1190 | uint8_t force_zero = 0; 1191 | if (prev != 0 && rewind_stack[prev].i > i) 1192 | { 1193 | // find matching open paren 1194 | size_t n = stack_n - 1; 1195 | while (n > 0 && rewind_stack[n].k != k + tokens[k].pair_offset) 1196 | n -= 1; 1197 | REMIMU_ASSERT(n > 0); 1198 | if (rewind_stack[n].i == i) 1199 | force_zero = 1; 1200 | } 1201 | 1202 | // reject zero-length matches 1203 | if ((force_zero || (prev != 0 && rewind_stack[prev].i == i))) // && q_group_state[tokens[k].mask[0]] > 0 1204 | { 1205 | IF_VERBOSE(printf("rejecting zero-length match..... 
%d %zd %zd\n", force_zero, rewind_stack[prev].i, i);) 1206 | IF_VERBOSE(printf("%d (k: %d)\n", q_group_state[tokens[k].mask[0]], k);) 1207 | 1208 | q_group_accepts_zero[tokens[k].mask[0]] = 1; 1209 | _REWIND_OR_ABORT(); 1210 | //range_max = q_group_state[tokens[k].mask[0]]; 1211 | //range_min = 0; 1212 | } 1213 | else if (tokens[k].mode & REMIMU_MODE_LAZY) // lazy 1214 | { 1215 | IF_VERBOSE(printf("nidnfasidfnidfndifn------- %d, %d, %zd\n", q_group_state[tokens[k].mask[0]], tokens[k].count_lo, range_min);) 1216 | if (prev) 1217 | IF_VERBOSE(printf("lazy doesn't think it's zero-length. prev i %zd vs i %zd (depth %d)\n", rewind_stack[prev].i, i, stack_n);) 1218 | // continue on to past the group; group retry is in rewind state 1219 | q_group_state[tokens[k].mask[0]] += 1; 1220 | _REWIND_DO_SAVE(k); 1221 | q_group_state[tokens[k].mask[0]] = 0; 1222 | } 1223 | else // greedy 1224 | { 1225 | IF_VERBOSE(puts("wahiwahi");) 1226 | // clear unwanted memory if possessive 1227 | if ((tokens[k].mode & REMIMU_MODE_POSSESSIVE)) 1228 | { 1229 | uint32_t k2 = k; 1230 | 1231 | // special case for first, only rewind to (, not to ) 1232 | if (q_group_state[tokens[k].mask[0]] == 0) 1233 | k2 = k + tokens[k].pair_offset; 1234 | 1235 | if (stack_n == 0) 1236 | return -1; 1237 | stack_n -= 1; 1238 | 1239 | while (stack_n > 0 && rewind_stack[stack_n].k != k2) 1240 | stack_n -= 1; 1241 | 1242 | if (stack_n == 0) 1243 | return -1; 1244 | } 1245 | // continue to next match if sane 1246 | if ((uint32_t)q_group_state[tokens[k + tokens[k].pair_offset].mask[0]] < (uint32_t)i) 1247 | { 1248 | IF_VERBOSE(puts("REWINDING FROM GREEDY NON-REWIND CLOSER");) 1249 | q_group_state[tokens[k].mask[0]] += 1; 1250 | _REWIND_DO_SAVE(k); 1251 | k += tokens[k].pair_offset; // back to start of group 1252 | k -= 1; // ensure we actually hit the group node next and not the node after it 1253 | } 1254 | else 1255 | IF_VERBOSE(puts("CONTINUING FROM GREEDY NON-REWIND CLOSER");) 1256 | } 1257 | } 1258 | else 1259 | { 
1260 | IF_VERBOSE(puts("IN CLOSER REWIND!!!");) 1261 | just_rewinded = 0; 1262 | 1263 | if (tokens[k].mode & REMIMU_MODE_LAZY) 1264 | { 1265 | // lazy rewind: need to try matching the group again 1266 | _REWIND_DO_SAVE_DUMMY(k); 1267 | q_group_stack[tokens[k].mask[0]] = stack_n; 1268 | k += tokens[k].pair_offset; // back to start of group 1269 | k -= 1; // ensure we actually hit the group node next and not the node after it 1270 | } 1271 | else 1272 | { 1273 | // greedy. if we're going to go outside the acceptable range, rewind 1274 | IF_VERBOSE(printf("kufukufu %d %zd\n", tokens[k].count_lo, range_min);) 1275 | //uint64_t old_i = i; 1276 | if (q_group_state[tokens[k].mask[0]] < range_min && !q_group_accepts_zero[tokens[k].mask[0]]) 1277 | { 1278 | IF_VERBOSE(printf("rewinding from greedy group because we're going to go out of range (%d vs %zd)\n", q_group_state[tokens[k].mask[0]], range_min);) 1279 | //i = old_i; 1280 | _REWIND_OR_ABORT(); 1281 | } 1282 | // otherwise continue on to past the group 1283 | else 1284 | { 1285 | IF_VERBOSE(puts("continuing past greedy group");) 1286 | q_group_state[tokens[k].mask[0]] = 0; 1287 | 1288 | // for captures 1289 | uint16_t cap_index = q_group_cap_index[tokens[k].mask[0]]; 1290 | if (cap_index != 0xFFFF) 1291 | _REWIND_DO_SAVE_DUMMY(k); 1292 | } 1293 | } 1294 | } 1295 | } 1296 | } 1297 | else if (tokens[k].kind == REMIMU_KIND_OR) 1298 | { 1299 | IF_VERBOSE(printf("hit OR at %d. adding %d\n", k, tokens[k].pair_offset);) 1300 | k += tokens[k].pair_offset; 1301 | k -= 1; 1302 | } 1303 | else if (tokens[k].kind == REMIMU_KIND_NORMAL) 1304 | { 1305 | if (!just_rewinded) 1306 | { 1307 | uint64_t n = 0; 1308 | // do whatever the obligatory minimum amount of matching is 1309 | uint64_t old_i = i; 1310 | while (n < tokens[k].count_lo && text[i] != 0 && _REGEX_CHECK_MASK(k, text[i])) 1311 | { 1312 | i += 1; 1313 | n += 1; 1314 | } 1315 | if (n < tokens[k].count_lo) 1316 | { 1317 | IF_VERBOSE(printf("non-match A. 
rewinding (token %d)\n", k);) 1318 | i = old_i; 1319 | _REWIND_OR_ABORT(); 1320 | continue; 1321 | } 1322 | 1323 | if (tokens[k].mode & REMIMU_MODE_LAZY) 1324 | { 1325 | range_min = n; 1326 | range_max = tokens[k].count_hi - 1; 1327 | _REWIND_DO_SAVE(k); 1328 | } 1329 | else 1330 | { 1331 | uint64_t ilimit = tokens[k].count_hi; 1332 | if (ilimit == 0) 1333 | ilimit = ~ilimit; 1334 | range_min = n; 1335 | while (text[i] != 0 && _REGEX_CHECK_MASK(k, text[i]) && n + 1 < ilimit) 1336 | { 1337 | IF_VERBOSE(printf("match!! (%c)\n", text[i]);) 1338 | i += 1; 1339 | n += 1; 1340 | } 1341 | range_max = n; 1342 | IF_VERBOSE(printf("set rmin to %zd and rmax to %zd on entry into normal greedy token with k %d\n", range_min, range_max, k);) 1343 | if (!(tokens[k].mode & REMIMU_MODE_POSSESSIVE)) 1344 | _REWIND_DO_SAVE(k); 1345 | } 1346 | } 1347 | else 1348 | { 1349 | just_rewinded = 0; 1350 | 1351 | if (tokens[k].mode & REMIMU_MODE_LAZY) 1352 | { 1353 | uint64_t ilimit = range_max; 1354 | if (ilimit == 0) 1355 | ilimit = ~ilimit; 1356 | 1357 | if (_REGEX_CHECK_MASK(k, text[i]) && text[i] != 0 && range_min < ilimit) 1358 | { 1359 | IF_VERBOSE(printf("match2!! 
(%c) (k: %d)\n", text[i], k);) 1360 | i += 1; 1361 | range_min += 1; 1362 | _REWIND_DO_SAVE(k); 1363 | } 1364 | else 1365 | { 1366 | IF_VERBOSE(printf("core rewind lazy (k: %d)\n", k);) 1367 | _REWIND_OR_ABORT(); 1368 | } 1369 | } 1370 | else 1371 | { 1372 | //IF_VERBOSE(printf("comparing rmin %zd and rmax %zd token with k %d\n", range_min, range_max, k);) 1373 | if (range_max > range_min) 1374 | { 1375 | IF_VERBOSE(printf("greedy normal going back (k: %d)\n", k);) 1376 | i -= 1; 1377 | range_max -= 1; 1378 | _REWIND_DO_SAVE(k); 1379 | } 1380 | else 1381 | { 1382 | IF_VERBOSE(printf("core rewind greedy (k: %d)\n", k);) 1383 | _REWIND_OR_ABORT(); 1384 | } 1385 | } 1386 | } 1387 | } 1388 | else 1389 | { 1390 | fprintf(stderr, "unimplemented token kind %d\n", tokens[k].kind); 1391 | REMIMU_ASSERT(0); 1392 | } 1393 | } 1394 | //printf("k... %d\n", k); 1395 | } 1396 | 1397 | if (caps != 0) 1398 | { 1399 | //printf("stack_n: %d\n", stack_n); 1400 | fflush(stdout); 1401 | for (size_t n = 0; n < stack_n; n++) 1402 | { 1403 | RegexMatcherState s = rewind_stack[n]; 1404 | int kind = tokens[s.k].kind; 1405 | if (kind == REMIMU_KIND_OPEN || kind == REMIMU_KIND_CLOSE) 1406 | { 1407 | uint16_t cap_index = q_group_cap_index[tokens[s.k].mask[0]]; 1408 | if (cap_index == 0xFFFF) 1409 | continue; 1410 | if (tokens[s.k].kind == REMIMU_KIND_OPEN) 1411 | cap_pos[cap_index] = s.i; 1412 | else if (cap_pos[cap_index] >= 0) 1413 | cap_span[cap_index] = s.i - cap_pos[cap_index]; 1414 | } 1415 | } 1416 | // re-deinitialize capture positions that have no associated capture span 1417 | for (size_t n = 0; n < caps; n++) 1418 | { 1419 | if (cap_span[n] == -1) 1420 | cap_pos[n] = -1; 1421 | } 1422 | } 1423 | 1424 | #undef _REWIND_DO_SAVE 1425 | #undef _REWIND_OR_ABORT 1426 | #undef _REGEX_CHECK_IS_W 1427 | #undef _P_TEXT_HIGHLIGHTED 1428 | #undef IF_VERBOSE 1429 | 1430 | return i; 1431 | } 1432 | 1433 | REMIMU_FUNC_VISIBILITY void print_regex_tokens(RegexToken * tokens) 1434 | { 1435 | const char 
* kind_to_str[] = { 1436 | "NORMAL", 1437 | "OPEN", 1438 | "NCOPEN", 1439 | "CLOSE", 1440 | "OR", 1441 | "CARET", 1442 | "DOLLAR", 1443 | "BOUND", 1444 | "NBOUND", 1445 | "END", 1446 | }; 1447 | const char * mode_to_str[] = { 1448 | "GREEDY", 1449 | "POSSESS", 1450 | "LAZY", 1451 | }; 1452 | for (int k = 0;; k++) 1453 | { 1454 | printf("%s\t%s\t", kind_to_str[tokens[k].kind], mode_to_str[tokens[k].mode]); 1455 | 1456 | int c_old = -1; 1457 | for (int c = 0; c < (tokens[k].kind ? 0 : 256); c++) 1458 | { 1459 | #define _PRINT_C_SMART(c) { \ 1460 | if (c >= 0x20 && c <= 0x7E) \ 1461 | printf("%c", c); \ 1462 | else \ 1463 | printf("\\x%02x", c); \ 1464 | } 1465 | 1466 | if (_REGEX_CHECK_MASK(k, c)) 1467 | { 1468 | if (c_old == -1) 1469 | c_old = c; 1470 | } 1471 | else if (c_old != -1) 1472 | { 1473 | if (c - 1 == c_old) 1474 | { 1475 | _PRINT_C_SMART(c_old) 1476 | c_old = -1; 1477 | } 1478 | else if (c - 2 == c_old) 1479 | { 1480 | _PRINT_C_SMART(c_old) 1481 | _PRINT_C_SMART(c_old + 1) 1482 | c_old = -1; 1483 | } 1484 | else 1485 | { 1486 | _PRINT_C_SMART(c_old) 1487 | printf("-"); 1488 | _PRINT_C_SMART(c - 1) 1489 | c_old = -1; 1490 | } 1491 | } 1492 | } 1493 | 1494 | /* 1495 | printf("\t"); 1496 | for (int i = 0; i < 16; i++) 1497 | printf("%04x", tokens[k].mask[i]); 1498 | */ 1499 | 1500 | printf("\t{%d,%d}\t(%d)\n", tokens[k].count_lo, tokens[k].count_hi - 1, tokens[k].pair_offset); 1501 | 1502 | if (tokens[k].kind == REMIMU_KIND_END) 1503 | break; 1504 | } 1505 | } 1506 | 1507 | #undef _REGEX_CHECK_MASK 1508 | 1509 | #endif //INCLUDE_REMIMU 1510 | --------------------------------------------------------------------------------