├── my_regex_test.c
├── readme.md
├── LICENSE
├── my_regex_tests.cpp
└── remimu.h


/my_regex_test.c:
--------------------------------------------------------------------------------
 1 | #include "remimu.h"
 2 | 
 3 | int main(void)
 4 | {
 5 |     // Smoke test: parse a decimal-number regex, dump its tokens, then print
 6 |     // the match length for a sample string. Always exits with status 0.
 7 |     RegexToken tokens[1024];
 8 |     int16_t token_count = 1024;
 9 |     int e = regex_parse("[0-9]+\\.[0-9]+", tokens, &token_count, 0);
10 |     if (e) return (puts("regex has error"), 0);
11 |     print_regex_tokens(tokens);
12 |     int64_t match_len = regex_match(tokens, "23.53) ", 0, 0, 0, 0);
13 |     printf("########### return: %lld\n", (long long)match_len); // cast: %zd does not match int64_t where ssize_t is narrower
14 |     return 0;
15 | }
16 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
  1 | # Remimu: Single-Header C/C++ Regex Library
  2 | 
  3 | Compatible with C99 and C++11 and later standards. Uses backtracking and relatively standard regex syntax.
  4 | 
  5 |     #include "remimu.h"
  6 | 
  7 | ## Functions
  8 | ```c
  9 |     // Returns 0 on success, or -1 on invalid or unsupported regex, or -2 on not enough tokens given to parse regex.
 10 |     static inline int regex_parse(
 11 |         const char * pattern,       // Regex pattern to parse.
 12 |         RegexToken * tokens,        // Output buffer of token_count regex tokens.
 13 |         int16_t * token_count,      // Maximum allowed number of tokens to write
 14 |         int32_t flags               // Optional bitflags.
 15 |     )
 16 |     
 17 |     // Returns match length, or -1 on no match, or -2 on out of memory, or -3 if the regex is invalid.
 18 |     static inline int64_t regex_match(
 19 |         const RegexToken * tokens,  // Parsed regex to match against text.
 20 |         const char * text,          // Text to match against tokens.
 21 |         size_t start_i,             // index value to match at.
 22 |         uint16_t cap_slots,         // Number of allowed capture info output slots.
 23 |         int64_t * cap_pos,          // Capture position info output buffer.
 24 |         int64_t * cap_span          // Capture length info output buffer.
 25 |     ) 
 26 |     
 27 |     static inline void print_regex_tokens(
 28 |         RegexToken * tokens     // Regex tokens to spew to stdout, for debugging.
 29 |     )
 30 | ```
 31 | Remimu doesn't have a searching API.
 32 | 
 33 | If `static inline` doesn't work for your project, define the `REMIMU_FUNC_VISIBILITY` (default `static inline`) and `REMIMU_CONST_VISIBILITY` (default `static const`) visibility prefix macros before including the header. Remimu doesn't use mutable global or mutable static variables, so no prefix macro is needed for them.
 34 | 
 35 | ## Performance
 36 | 
 37 | On simple cases, Remimu's match speed is similar to PCRE2. Regex parsing/compilation is also much faster (around 4x to 10x), so single-shot regexes are often faster than PCRE2.
 38 | 
 39 | HOWEVER: Remimu is a pure backtracking engine, and has `O(2^x)` complexity on regexes with catastrophic backtracking. It can be much, much, MUCH slower than PCRE2. Beware!
 40 | 
 41 | Remimu uses length-checked fixed memory buffers with no recursion, so memory usage is statically known.
 42 | 
 43 | ## Features
 44 | 
 45 | - Lowest-common-denominator common regex syntax
 46 | - Based on backtracking (slow in the worst case, but fast in the best case)
 47 | - 8-bit only, no utf-16 or utf-32. Use https://wareya.github.io/uniregex/ to create 8-bit versions of utf-8 regexes
 48 | - Statically known memory usage (no heap allocation or recursion)
 49 | - Groups with or without capture, and with or without quantifiers
 50 | - Supported escapes:
 51 | - - 2-digit hex: e.g. `\x00`, `\xFF`, or lowercase, or mixed case
 52 | - - `\r`, `\n`, `\t`, `\v`, `\f` (whitespace characters)
 53 | - - `\d`, `\s`, `\w`, `\D`, `\S`, `\W` (digit, space, and word character classes)
 54 | - - `\b`, `\B` word boundary and non-word-boundary anchors (not fully supported in zero-size quantified groups, but even then, usually supported)
 55 | - - Escaped literal characters: `{}[]-()|^$*+?:./\`
 56 | - - - Escapes work in character classes, except for `b`
 57 | - Character classes, including disjoint ranges, proper handling of bare `[` and trailing `-`, etc
 58 | - - Dot (`.`) matches all characters, including newlines, unless `REMIMU_FLAG_DOT_NO_NEWLINES` is passed as a flag to `regex_parse`
 59 | - - Dot (`.`) only matches at most one byte at a time, so matching `\r\n` requires two dots (and not using `REMIMU_FLAG_DOT_NO_NEWLINES`)
 60 | - Anchors (`^` and `$`)
 61 | - - The same support caveats as for `\b` and `\B` apply
 62 | - Basic quantifiers (`*`, `+`, `?`)
 63 | - - Quantifiers are greedy by default.
 64 | - Explicit quantifiers (`{2}`, `{5}`, `{5,}`, `{5,7}`)
 65 | - Alternation e.g. `(asdf|foo)`
 66 | - Lazy quantifiers e.g. `(asdf)*?` or `\w+?`
 67 | - Possessive greedy quantifiers e.g. `(asdf)*+` or `\w++`
 68 | - - NOTE: Capture groups for and inside of possessive groups return no capture information.
 69 | - Atomic groups e.g. `(?>(asdf))`
 70 | - - NOTE: Capture groups inside of atomic groups return no capture information.
 71 | 
 72 | ## Not Supported
 73 | 
 74 | - Strings with non-terminal null characters
 75 | - Unicode character classes (matching single utf-8 characters works regardless)
 76 | - Exact POSIX regex semantics (posix-style greediness etc)
 77 | - - (Note: although these semantics come from POSIX, supporting them exactly would be unusual. Most regex implementations are not POSIX regexes, and POSIX matching behavior is surprising.)
 78 | - Backreferences
 79 | - Lookbehind/Lookahead
 80 | - Named groups
 81 | - Most other weird flavor-specific regex stuff
 82 | - Capture of or inside of possessive-quantified groups (still take up a capture slot, but no data is returned)
 83 | 
 84 | ## Usage
 85 | ```c
 86 |     // minimal:
 87 |     
 88 |     RegexToken tokens[1024];
 89 |     int16_t token_count = 1024;
 90 |     int e = regex_parse("[0-9]+\\.[0-9]+", tokens, &token_count, 0);
 91 |     assert(!e);
 92 |     
 93 |     int64_t match_len = regex_match(tokens, "23.53) ", 0, 0, 0, 0);
 94 |     printf("########### return: %lld\n", (long long)match_len);
 95 |     
 96 |     // with captures:
 97 |     
 98 |     RegexToken tokens[256];
 99 |     int16_t token_count = sizeof(tokens)/sizeof(tokens[0]);
100 |     int e = regex_parse("((a)|(b))++", tokens, &token_count, 0);
101 |     assert(!e);
102 |     
103 |     int64_t cap_pos[5];
104 |     int64_t cap_span[5];
105 |     memset(cap_pos, 0xFF, sizeof(cap_pos));
106 |     memset(cap_span, 0xFF, sizeof(cap_span));
107 |     
108 |     int64_t matchlen = regex_match(tokens, "aaaaaabbbabaqa", 0, 5, cap_pos, cap_span);
109 |     printf("Match length: %lld\n", (long long)matchlen);
110 |     for (int i = 0; i < 5; i++)
111 |         printf("Capture %d: %lld plus %lld\n", i, (long long)cap_pos[i], (long long)cap_span[i]);
112 |         
113 |     // for debugging
114 |     print_regex_tokens(tokens);
115 | ```
116 | ## Testing
117 | 
118 | `my_regex_tests.cpp` is a C++11 program that throws a matrix of regexes and test strings into PCRE2 and validates that they're matched the same way in Remimu (for supported features). It contains a good number of gotcha regexes.
119 | 
120 | ## License
121 | 
122 | Creative Commons Zero, public domain.
123 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Creative Commons Legal Code
  2 | 
  3 | CC0 1.0 Universal
  4 | 
  5 |     CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
  6 |     LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
  7 |     ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
  8 |     INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
  9 |     REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
 10 |     PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
 11 |     THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
 12 |     HEREUNDER.
 13 | 
 14 | Statement of Purpose
 15 | 
 16 | The laws of most jurisdictions throughout the world automatically confer
 17 | exclusive Copyright and Related Rights (defined below) upon the creator
 18 | and subsequent owner(s) (each and all, an "owner") of an original work of
 19 | authorship and/or a database (each, a "Work").
 20 | 
 21 | Certain owners wish to permanently relinquish those rights to a Work for
 22 | the purpose of contributing to a commons of creative, cultural and
 23 | scientific works ("Commons") that the public can reliably and without fear
 24 | of later claims of infringement build upon, modify, incorporate in other
 25 | works, reuse and redistribute as freely as possible in any form whatsoever
 26 | and for any purposes, including without limitation commercial purposes.
 27 | These owners may contribute to the Commons to promote the ideal of a free
 28 | culture and the further production of creative, cultural and scientific
 29 | works, or to gain reputation or greater distribution for their Work in
 30 | part through the use and efforts of others.
 31 | 
 32 | For these and/or other purposes and motivations, and without any
 33 | expectation of additional consideration or compensation, the person
 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she
 35 | is an owner of Copyright and Related Rights in the Work, voluntarily
 36 | elects to apply CC0 to the Work and publicly distribute the Work under its
 37 | terms, with knowledge of his or her Copyright and Related Rights in the
 38 | Work and the meaning and intended legal effect of CC0 on those rights.
 39 | 
 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be
 41 | protected by copyright and related or neighboring rights ("Copyright and
 42 | Related Rights"). Copyright and Related Rights include, but are not
 43 | limited to, the following:
 44 | 
 45 |   i. the right to reproduce, adapt, distribute, perform, display,
 46 |      communicate, and translate a Work;
 47 |  ii. moral rights retained by the original author(s) and/or performer(s);
 48 | iii. publicity and privacy rights pertaining to a person's image or
 49 |      likeness depicted in a Work;
 50 |  iv. rights protecting against unfair competition in regards to a Work,
 51 |      subject to the limitations in paragraph 4(a), below;
 52 |   v. rights protecting the extraction, dissemination, use and reuse of data
 53 |      in a Work;
 54 |  vi. database rights (such as those arising under Directive 96/9/EC of the
 55 |      European Parliament and of the Council of 11 March 1996 on the legal
 56 |      protection of databases, and under any national implementation
 57 |      thereof, including any amended or successor version of such
 58 |      directive); and
 59 | vii. other similar, equivalent or corresponding rights throughout the
 60 |      world based on applicable law or treaty, and any national
 61 |      implementations thereof.
 62 | 
 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention
 64 | of, applicable law, Affirmer hereby overtly, fully, permanently,
 65 | irrevocably and unconditionally waives, abandons, and surrenders all of
 66 | Affirmer's Copyright and Related Rights and associated claims and causes
 67 | of action, whether now known or unknown (including existing as well as
 68 | future claims and causes of action), in the Work (i) in all territories
 69 | worldwide, (ii) for the maximum duration provided by applicable law or
 70 | treaty (including future time extensions), (iii) in any current or future
 71 | medium and for any number of copies, and (iv) for any purpose whatsoever,
 72 | including without limitation commercial, advertising or promotional
 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
 74 | member of the public at large and to the detriment of Affirmer's heirs and
 75 | successors, fully intending that such Waiver shall not be subject to
 76 | revocation, rescission, cancellation, termination, or any other legal or
 77 | equitable action to disrupt the quiet enjoyment of the Work by the public
 78 | as contemplated by Affirmer's express Statement of Purpose.
 79 | 
 80 | 3. Public License Fallback. Should any part of the Waiver for any reason
 81 | be judged legally invalid or ineffective under applicable law, then the
 82 | Waiver shall be preserved to the maximum extent permitted taking into
 83 | account Affirmer's express Statement of Purpose. In addition, to the
 84 | extent the Waiver is so judged Affirmer hereby grants to each affected
 85 | person a royalty-free, non transferable, non sublicensable, non exclusive,
 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and
 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the
 88 | maximum duration provided by applicable law or treaty (including future
 89 | time extensions), (iii) in any current or future medium and for any number
 90 | of copies, and (iv) for any purpose whatsoever, including without
 91 | limitation commercial, advertising or promotional purposes (the
 92 | "License"). The License shall be deemed effective as of the date CC0 was
 93 | applied by Affirmer to the Work. Should any part of the License for any
 94 | reason be judged legally invalid or ineffective under applicable law, such
 95 | partial invalidity or ineffectiveness shall not invalidate the remainder
 96 | of the License, and in such case Affirmer hereby affirms that he or she
 97 | will not (i) exercise any of his or her remaining Copyright and Related
 98 | Rights in the Work or (ii) assert any associated claims and causes of
 99 | action with respect to the Work, in either case contrary to Affirmer's
100 | express Statement of Purpose.
101 | 
102 | 4. Limitations and Disclaimers.
103 | 
104 |  a. No trademark or patent rights held by Affirmer are waived, abandoned,
105 |     surrendered, licensed or otherwise affected by this document.
106 |  b. Affirmer offers the Work as-is and makes no representations or
107 |     warranties of any kind concerning the Work, express, implied,
108 |     statutory or otherwise, including without limitation warranties of
109 |     title, merchantability, fitness for a particular purpose, non
110 |     infringement, or the absence of latent or other defects, accuracy, or
111 |     the present or absence of errors, whether or not discoverable, all to
112 |     the greatest extent permissible under applicable law.
113 |  c. Affirmer disclaims responsibility for clearing rights of other persons
114 |     that may apply to the Work or any use thereof, including without
115 |     limitation any person's Copyright and Related Rights in the Work.
116 |     Further, Affirmer disclaims responsibility for obtaining any necessary
117 |     consents, permissions or other rights required for any use of the
118 |     Work.
119 |  d. Affirmer understands and acknowledges that Creative Commons is not a
120 |     party to this document and has no duty or obligation with respect to
121 |     this CC0 or use of the Work.
122 | 


--------------------------------------------------------------------------------
/my_regex_tests.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | // tests for regex engine; not actually part of BBEL
  3 | // testing requires PCRE2
  4 | // msys2: pacman -S mingw-w64-<flavor>-pcre2
  5 | // linker flag is usually -lpcre2-8
  6 | 
  7 | //#define REGEX_VERBOSE
  8 | #include "remimu.h"
  9 | 
 10 | #include <chrono>
 11 | 
 12 | #include <string>
 13 | #include <string_view>
 14 | #include <vector>
 15 | 
 16 | #define PCRE2_CODE_UNIT_WIDTH 8
 17 | #include <pcre2.h>
 18 | 
 19 | #define BE_QUIET
 20 | 
 21 | static void must_parse_ok(const char* pat) {
 22 |     // Asserts that regex_parse accepts `pat`; on failure, logs the pattern and return code to stderr first.
 23 |     RegexToken token_buf[128];
 24 |     memset(token_buf, 0, sizeof token_buf);
 25 |     int16_t capacity = (int16_t)(sizeof token_buf / sizeof *token_buf);
 26 |     const int status = regex_parse(pat, token_buf, &capacity, 0);
 27 |     if (status == 0) return;
 28 |     fprintf(stderr, "[FAIL] regex_parse should succeed: `%s` (rc=%d)\n", pat, status);
 29 |     assert(status == 0);
 30 | }
 31 | 
 32 | static void must_parse_fail(const char* pat) {
 33 |     // Asserts that regex_parse rejects `pat`, logging it to stderr first if it parses. NOTE(review): a -2 (too few tokens) result also counts as rejection.
 34 |     RegexToken token_buf[16];
 35 |     memset(token_buf, 0, sizeof token_buf);
 36 |     int16_t capacity = (int16_t)(sizeof token_buf / sizeof *token_buf);
 37 |     const int status = regex_parse(pat, token_buf, &capacity, 0);
 38 |     if (status != 0) return;
 39 |     fprintf(stderr, "[FAIL] regex_parse should fail: `%s`\n", pat);
 40 |     assert(status != 0);
 41 | }
 42 | // Parses `pat` into a local 256-token buffer (asserting success) and returns regex_match's result for `text` at index 0 with no capture slots.
 43 | static int64_t do_match(const char* pat, const char* text) {
 44 |     RegexToken token_buf[256];
 45 |     memset(token_buf, 0, sizeof token_buf);
 46 |     int16_t capacity = (int16_t)(sizeof token_buf / sizeof *token_buf);
 47 |     const int status = regex_parse(pat, token_buf, &capacity, 0);
 48 |     assert(status == 0);
 49 |     return regex_match(token_buf, text, 0, 0, 0, 0);
 50 | }
 51 | 
 52 | static void expect_match_len(const char* pat, const char* text, int64_t want_len) {
 53 |     // Asserts do_match(pat, text) == want_len, logging both to stderr first. Printed as long long: ssize_t is POSIX-only and may be 32-bit.
 54 |     const int64_t got = do_match(pat, text);
 55 |     if (got == want_len) return;
 56 |     fprintf(stderr, "[FAIL] `%s` ~ `%s` : want %lld, got %lld\n", pat, text, (long long)want_len, (long long)got);
 57 |     assert(got == want_len);
 58 | }
 59 | 
 60 | static void expect_no_match(const char* pat, const char* text) {
 61 |     // Asserts do_match(pat, text) is negative, logging the length to stderr first otherwise. Printed as long long because ssize_t is POSIX-only.
 62 |     const int64_t got = do_match(pat, text);
 63 |     if (got < 0) return;
 64 |     fprintf(stderr, "[FAIL] `%s` should NOT match `%s` (got len=%lld)\n", pat, text, (long long)got);
 65 |     assert(got < 0);
 66 | }
 67 | 
 68 | void testify(void)
 69 | {
 70 |     using clock = std::chrono::high_resolution_clock;
 71 |     
 72 |     static const char * regexes[] = {
 73 |         // ipv4
 74 |         "^(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$",
 75 |         // ipv6
 76 |         "^(?:(?:(?:[0-9a-fA-F]{1,4}):){7}(?:(?:[0-9a-fA-F]{1,4})|:)|(?:(?:[0-9a-fA-F]{1,4}):){6}(?:(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])|:(?:[0-9a-fA-F]{1,4})|:)|(?:(?:[0-9a-fA-F]{1,4}):){5}(?::(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])|(?::(?:[0-9a-fA-F]{1,4})){1,2}|:)|(?:(?:[0-9a-fA-F]{1,4}):){4}(?:(?::(?:[0-9a-fA-F]{1,4})){0,1}:(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])|(?::(?:[0-9a-fA-F]{1,4})){1,3}|:)|(?:(?:[0-9a-fA-F]{1,4}):){3}(?:(?::(?:[0-9a-fA-F]{1,4})){0,2}:(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])|(?::(?:[0-9a-fA-F]{1,4})){1,4}|:)|(?:(?:[0-9a-fA-F]{1,4}):){2}(?:(?::(?:[0-9a-fA-F]{1,4})){0,3}:(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])|(?::(?:[0-9a-fA-F]{1,4})){1,5}|:)|(?:(?:[0-9a-fA-F]{1,4}):){1}(?:(?::(?:[0-9a-fA-F]{1,4})){0,4}:(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])|(?::(?:[0-9a-fA-F]{1,4})){1,6}|:)|(?::(?:(?::(?:[0-9a-fA-F]{1,4})){0,5}:(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])|(?::(?:[0-9a-fA-F]{1,4})){1,7}|:)))(?:%[0-9a-zA-Z-.:]{1,})?$",
 77 |         
 78 |         "(b|a|as|q|)*?X",
 79 |         u8"((電|自転)車)+",
 80 |         "",
 81 |         "(|b|a|as|q)*X",
 82 |         "(b|a|as|q|)*X",
 83 |         "(b|a|as|q|)+X",
 84 |         "(b|a|as|q|)+?X",
 85 |         "((b|a|as|q|))*X",
 86 |         "((b|a|as|q|))*?X",
 87 |         "(b|a|as|q)*X",
 88 |         "(b|a|as|q)*?X",
 89 |         "(b|a|as|q)+X",
 90 |         
 91 |         "((a)|(b))+",
 92 |         "((a)|(b))++",
 93 |         "((a)|(b))+?",
 94 |         "((a)|(b))*",
 95 |         "((a)|(b))*+",
 96 |         "((a)|(b))*?",
 97 |         "((a)|((b)q))*",
 98 |         "((a)|((b)q))*+",
 99 |         
100 |         "(|a?)+?a{10}",
101 |         "(a?)*a{10}",
102 |         "(a?)*?a{10}",
103 |         "(a?)+?a{100}",
104 |         "(a?)+?a{10}",
105 |         "(a?)+a{10}",
106 |         "(a)+a{9}",
107 |         "(a)+?a{9}",
108 |         "(|a)+a{9}",
109 |         "(|a)+a{10}",
110 |         "(|a)+a{11}",
111 |         "(|a)+?a{11}",
112 |         "^a(bc+|b[eh])g|.h$",
113 |         "(bc+d$|ef*g.|h?i(j|k))",
114 |         
115 |         "(b|a|as|q)*?X",
116 |         
117 |         // emails and email-like things
118 |         // (?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])
119 |         "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])",
120 |         //"(?:\\w+(?:\\.\\w+)*)@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)",
121 |         "(?:\\w+(?:\\.\\w+)*)@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)",
122 |         "(\\w\\w*\\.)+",
123 |         "(\\w+\\.)+",
124 |         "(?:\\w+(?:\\.\\w+)*)@(?:\\w+(?:\\.\\w+)*)",
125 |         "[a-z0-9\\._%+!$&*=^|~#%'`?{}/\\-]+@([a-z0-9\\-]+\\.){1,}([a-z]{2,16})",
126 |         "^[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,}$",
127 |         
128 |         "(ab?)b",
129 |         "(ab?)*b",
130 |         "(ab?)*?b",
131 |         
132 |         "([0a-z][a-z0-9]*,)+",
133 |         "([a-z][a-z0-9]*,)+",
134 |         
135 |         "asdf\\b",
136 |         "asdf\\B",
137 |         "\\basdf",
138 |         
139 |         "(\\ba?)*",
140 |         "(\\ba?)*?",
141 |         "(\\b)+?",
142 |         "(\\b)+",
143 |         "(\\ba?)+",
144 |         "(\\ba?)+?",
145 |         "(\\ba?)*a",
146 |         "(\\ba?)*?a",
147 |         "(\\ba?)+a",
148 |         "(\\ba?)+?a",
149 |         "a(\\b)*",
150 |         "a(\\b)*?",
151 |         "(\\b)*a",
152 |         "(\\b)*?a",
153 |         "a(\\b)+",
154 |         "a(\\b)+?",
155 |         "(\\b)+a",
156 |         "(\\b)+?a",
157 |         
158 |         "^asdf$",
159 |         "^asdf",
160 |         "asdf$",
161 |         ".*asdf",
162 |         ".*asdf$",
163 |         
164 |         "(^(asdf)?)*",
165 |         "(^(asdf)?)*(asdf)?",
166 |         "((asdf)?$)*",
167 |         "((asdf)?)*((asdf)?$)*",
168 |         "(^(asdf)?)*?",
169 |         "(^(asdf)?)*?(asdf)?",
170 |         "((asdf)?$)*?",
171 |         "((asdf)?)*?((asdf)?$)*?",
172 |         
173 |         "(a?)*a{10}",
174 |         "(a?)*?a{10}",
175 |         "()",
176 |         "(a|)*b",
177 |         "(z?)*a{10}",
178 |         
179 |         
180 |         // possessive
181 |         "(b|a|)*+",
182 |         "(a|)*+b",
183 |         "(?>(b|a|)*)",
184 |         "(b|a|)*+b",
185 |         "(b|a|as|q)*+",
186 |         "(b|a|as|q)*+X",
187 |         "(b|a|as|q)*",
188 |         "a++ab",
189 |         
190 |         "[0-9]+\\.[0-9]+",
191 |         "[0-9]+0\\.[0-9]+",
192 |         
193 |         "(a|a|ab)bc",
194 |         "(ab|ab|a)bc",
195 |         "[0-9]\\.[0-9]",
196 |         
197 |         "\\d\\.\\d",
198 |         "\\d*\\.\\d*",
199 |         "\\w+",
200 |         "\\s+",
201 |         "\\s(\\w+)",
202 |         "\\w+\\s",
203 |         
204 |         "(\\d)*?\\.(\\d)+",
205 |         "([0-9])*?\\.([0-9])+",
206 |         "([0-9]){3,5}?\\.([0-9])+",
207 |         "[0-9]{3,5}?\\.[0-9]+",
208 |         "([0-9]){3,5}\\.([0-9])+",
209 |         "[0-9]{3,5}\\.[0-9]+",
210 |         "(a|ab)*b",
211 |         "(ab?)*?b",
212 |         "(ab?\?)*b",
213 |         "(a)?\?(b|a)",
214 |         "(a)*a{10}",
215 |         "(a)*?a{10}",
216 |         "a()a",
217 |         "a(|)a",
218 |         "a(|){1}?a",
219 |         "a(|b)+a",
220 |         "a(|b)+?a",
221 |         "(a|b)*?b",
222 |         "a*a*?",
223 |         "a*?a*",
224 |         "(b|a)*b",
225 |         "(b|a)*?b",
226 |         "(b|a|)*",
227 |         "(b|a|)*bb",
228 |         "(b|a|)*?bb",
229 |         "(|a)+",
230 |         "(|a)+?",
231 |         "()+",
232 |         "()+?",
233 |         "(|)+?",
234 |         "a(|)*a",
235 |         "a(|)*?a",
236 |         "(a|(((()))))*b",
237 |         "((\\w+,?)*:)*",
238 |         "((\\w+,?)*+:)*",
239 |         "((\\w+,?)*+:)*+",
240 |         
241 |         // pathological
242 |         "((a?b|a)b?)*",
243 |         "(.*,){11}P",
244 |         "(.*?,){11}P",
245 |         
246 |         "mistaken bogus regex",
247 |     };
248 |     static const char * texts[] = {
249 |         "asqbX",
250 |         
251 |         "aaaaaaaaaa",
252 |         "asqb",
253 |         "abh",
254 |         
255 |         u8"自転車",
256 |         
257 |         "0.42.42.42",
258 |         "0.42.42..42",
259 |         ".0.42.42.42",
260 |         "0.256.42.42",
261 |         "0.420.42.42",
262 |         "0.0111.42.42",
263 |         "254.254.254.254",
264 |         "192.168.255.255",
265 |         "239.51.161.175",
266 |         "239.51.161.175",
267 |         "0.0.0.0",
268 |         "251.227.56.60",
269 |         "18.45.235.138",
270 |         
271 |         "8db2:5802:4f78:5f2c:2dc5:33e9:8c7b:6fc4",
272 |         "0995:86cd:70c9:a98a:bab6:c4b1:93e4:f839",
273 |         "ff80::220:16ff:fec9:1",
274 |         "fe20::150:560f:fec4:3",
275 |         "fd87:403b:401f::/48",
276 |         
277 |         "effgz",
278 |         "ij",
279 |         "effg",
280 |         "bcdd",
281 |         "reffgz",
282 |         
283 |         "testacc@example.com",
284 |         
285 |         "aa.bb.cc.dd",
286 |         "a5,b7,c9",
287 |         "a5,b7,c9,",
288 |         "a5,b7,c9,,",
289 |         "a5,b7,c9,1",
290 |         "a5,b7,c9,a",
291 |         "",
292 |         " ",
293 |         "  ",
294 |         "a",
295 |         "aa",
296 |         "aba) ",
297 |         "aaaaaaaaa",
298 |         "aaaaaaaaaaaaaa",
299 |         "aaaaaaaaaaaaaab",
300 |         "aaaaaaaaaaaaaaba",
301 |         
302 |         u8"電車",
303 |         u8"電車自転車",
304 |         u8"自転車電車",
305 |         
306 |         "testacc@example.com",
307 |         "test+acc@example.com",
308 |         "test.acc@example.com",
309 |         "test.acc.acc@sub.example.com",
310 |         "loooooooo10235699ng.1g.g.g.210g01.longie.acc@sub.example.com.co.co.uk.jp.fakedomain.loooooooooooooooooooonger.com......",
311 |         "test.acc@sub.example.com",
312 |         "test@sub.example.com",
313 |         "@example.com",
314 |         "example.com",
315 |         "a@",
316 |         "#@%^%#$@#$@#.com",
317 |         "Joe Smith <email@example.com>",
318 |         "_______@example.com",
319 |         "“email”@example.com",
320 |         "email@[123.123.123.123]",
321 |         "email@123.123.123.123",
322 |         
323 |         "abc) ",
324 |         "abba) ",
325 |         "abbc) ",
326 |         "012.53) ",
327 |         ".53) ",
328 |         "5.5",
329 |         "022134.53) ",
330 |         "02234.53) ",
331 |         "1131.53) ",
332 |         "131.53) ",
333 |         "11.53) ",
334 |         "1.53) ",
335 |         "aa",
336 |         "aaaaaaaaabababab",
337 |         "aaaaaaaaababababb",
338 |         "aaaaabbbbbbbx",
339 |         "bbbbbbb",
340 |         "1,2,3,4,5,6,7,8,9,10,11,12",
341 |         "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16",
342 |         
343 |         "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22",
344 |         //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25",
345 |         //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25   P",
346 |         //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26",
347 |         //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26   P",
348 |         //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27",
349 |         //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28",
350 |         //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28",
351 |         //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30",
352 |         //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31",
353 |         //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32",
354 |         //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33",
355 |         //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34",
356 |         //"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35",
357 |         "aaaaaababababababaabx",
358 |         
359 |         "       ",
360 |         "afd1gkage919953bd       ",
361 |         "   x    ",
362 |         "   ,\\1264ga0b a    ",
363 |         "asdf ",
364 |         "asdfg",
365 |         "   asdf",
366 |         "asdf   ",
367 |         "   asdf   ",
368 |         "XXXasdf",
369 |         "asdfXXX",
370 |         "XXXasdfXXX",
371 |         "000asdf",
372 |         "asdf000",
373 |         "000asdf000",
374 |         "a,b,easbe_1:a,:a",
375 |         
376 |         "uh-uh",
377 |         "words, yeah",
378 |         "mistaken bogus regex",
379 |         
380 |         "aaaaaabbbabaqa",
381 |     };
382 |     
383 |     const char * slowest_my_regex = "";
384 |     double slowest_my_regex_time = 0.0;
385 |     double total_my_regex_time = 0.0;
386 |     const char * slowest_pcre2_regex = "";
387 |     double slowest_pcre2_regex_time = 0.0;
388 |     double total_pcre2_regex_time = 0.0;
389 |     
390 |     for (size_t i = 0; i < sizeof(regexes) / sizeof(regexes[0]); i++)
391 |     //for (size_t i = 0; 0; i++)
392 |     {
393 |         const char * regex = regexes[i];
394 |         
395 |         RegexToken tokens[512];
396 |         memset(tokens, 0xFF, sizeof(tokens));
397 |         int16_t token_count = sizeof(tokens)/sizeof(tokens[0]);
398 |         
399 |         auto start = clock::now();
400 |         int e = regex_parse(regex, tokens, &token_count, 0);
401 |         double t = std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - start).count() / 1000000.0;
402 |         assert(!e);
403 |         
404 |         total_my_regex_time += t;
405 |         if (t > slowest_my_regex_time)
406 |         {
407 |             slowest_my_regex_time = t;
408 |             slowest_my_regex = regex;
409 |         }
410 |         
411 |         bool has_possessive = false;
412 |         for (int32_t n = 0; n < token_count; n++)
413 |         {
414 |             if (tokens[n].mode & REMIMU_MODE_POSSESSIVE)
415 |             {
416 |                 has_possessive = true;
417 |                 break;
418 |             }
419 |         }
420 |         
421 |         #ifndef BE_QUIET
422 |         printf("token count: %d\n", token_count);
423 |         print_regex_tokens(tokens);
424 |         printf("Took %f seconds for my regex engine to parse the regex\n", t);
425 |         #endif
426 |         
427 |         std::string regex_str = regex;
428 |         
429 |         int errorcode;
430 |         PCRE2_SIZE erroroffset;
431 |         start = clock::now();
432 |         pcre2_code * re = pcre2_compile(PCRE2_SPTR8(regex), PCRE2_ZERO_TERMINATED,
433 |             PCRE2_ANCHORED | PCRE2_NO_UTF_CHECK | PCRE2_DOTALL | PCRE2_NO_AUTO_POSSESS | PCRE2_NO_DOTSTAR_ANCHOR | PCRE2_NO_START_OPTIMIZE,
434 |             &errorcode, &erroroffset, NULL);
435 |         t = std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - start).count() / 1000000.0;
436 |         #ifndef BE_QUIET
437 |         printf("Took %f seconds for my PCRE2 to compile the regex\n", t);
438 |         #endif
439 |         if (t > slowest_pcre2_regex_time)
440 |         {
441 |             slowest_pcre2_regex_time = t;
442 |             slowest_pcre2_regex = regex;
443 |         }
444 |         total_pcre2_regex_time += t;
445 |         
446 |         for (size_t j = 0; j < sizeof(texts) / sizeof(texts[0]); j++)
447 |         {
448 |             const char * text = texts[j];
449 |             std::string text_str = text;
450 |             
451 |             int64_t pcre2_len = -1;
452 |             
453 |             #ifndef BE_QUIET
454 |             printf("testing PCRE2 regex `%s` on string `%s`...\n", regex, text);
455 |             fflush(stdout);
456 |             #endif
457 |             
458 |             auto start = clock::now();
459 |             pcre2_match_data * match_data = pcre2_match_data_create_from_pattern(re, 0);
460 |             int submatch_count = pcre2_match(re, PCRE2_SPTR8(text), text_str.size(), 0, PCRE2_ANCHORED | PCRE2_NO_UTF_CHECK, match_data, 0);
461 |              
462 |             #ifndef BE_QUIET
463 |             double t = std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - start).count() / 1000000.0;
464 |             #endif
465 |             
466 |             //printf("submatch count: %d\n", submatch_count);
467 |             //printf("ovector count: %d\n", pcre2_get_ovector_count(match_data));
468 |             
469 |             PCRE2_SIZE * ovector = 0;
470 |             if (submatch_count > 0)
471 |             {
472 |                 ovector = pcre2_get_ovector_pointer(match_data);
473 |                 size_t offs = ovector[0];
474 |                 if (offs == 0)
475 |                     pcre2_len = ovector[1] - offs;
476 |                 #ifndef BE_QUIET
477 |                 printf("pcre2 regex found match at %zd with len %zd after %f seconds\n", offs, pcre2_len, t);
478 |                 #endif
479 |             }
480 |             #ifndef BE_QUIET
481 |             else
482 |                 printf("pcre2 regex found no match after %f seconds\n", t);
483 |             
484 |             printf("testing my regex `%s` on string `%s`...\n", regex, text);
485 |             #endif
486 |             
487 |             start = clock::now();
488 |             
489 |             int64_t cap_pos[16];
490 |             int64_t cap_span[16];
491 |             memset(cap_pos, 0xFF, sizeof(cap_pos));
492 |             memset(cap_span, 0xFF, sizeof(cap_span));
493 |             
494 |             int64_t match_len = regex_match(tokens, text, 0, 16, cap_pos, cap_span);
495 |             
496 |             assert(match_len != -3);
497 |             #ifndef BE_QUIET
498 |             t = std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - start).count() / 1000000.0;
499 |             if (match_len >= 0)
500 |                 printf("my regex found match with len %zd after %f seconds\n", match_len, t);
501 |             else if (match_len == -2)
502 |                 printf("my regex ran out of memory after %f seconds (note: `%s`)\n", t, regex);
503 |             else
504 |                 printf("my regex found no match after %f seconds\n", t);
505 |             #endif
506 |             
507 |             // we define captures differently than PCRE2 for possessives, so skip them
508 |             if (!has_possessive && submatch_count > 0)
509 |             {
510 |                 #ifndef BE_QUIET
511 |                 printf("comparing %zd to %zd...\n", match_len, pcre2_len);
512 |                 printf("regex `%s`, string `%s`\n", regex, text);
513 |                 #endif
514 |                 assert(match_len == pcre2_len);
515 |                 #ifndef BE_QUIET
516 |                 puts("comparing captures...");
517 |                 #endif
518 |                 if (match_len >= 0)
519 |                 {
520 |                     for (int x = 0; x < submatch_count && x < 16; x++)
521 |                     {
522 |                         size_t where = ovector[x*2];
523 |                         if (where == 0)
524 |                         {
525 |                             size_t pcre2_len = ovector[x*2+1] - where;
526 |                             // probably a situation of std capturing a zero-length group repetition
527 |                             #ifndef BE_QUIET
528 |                             printf("Capture %d: std (%zd,%zd)  mine (%zd,%zd)\n", x, where, pcre2_len, cap_pos[x], cap_span[x]);
529 |                             #endif
530 |                             if (!(cap_pos[x] == -1 && cap_span[x] == -1 && where == 0 && pcre2_len == 0))
531 |                             {
532 |                                 assert(where     == (size_t)cap_pos[x]);
533 |                                 assert(pcre2_len == (size_t)cap_span[x]);
534 |                             }
535 |                         }
536 |                     }
537 |                 }
538 |             }
539 |             
540 |             pcre2_match_data_free(match_data);
541 |         }
542 |         pcre2_code_free(re);
543 |     }
544 |     
545 |     printf("Slowest regex for me to parse at %f seconds:\n%s\n", slowest_my_regex_time, slowest_my_regex);
546 |     printf("Slowest regex for pcre2 to parse at %f seconds:\n%s\n", slowest_pcre2_regex_time, slowest_pcre2_regex);
547 |     
548 |     printf("Total parse time for me: %f\n", total_my_regex_time);
549 |     printf("Total parse time for pcre2: %f\n", total_pcre2_regex_time);
550 |     
551 |     RegexToken tokens[256];
552 |     int16_t token_count = sizeof(tokens)/sizeof(tokens[0]);
553 |     //int e = regex_parse("((\\w+,?)*:)*", tokens, &token_count, 0);
554 |     //int e = regex_parse("((\\w+,?)*+:)*", tokens, &token_count, 0);
555 |     //int e = regex_parse("((\\w+,?)*+:)*+", tokens, &token_count, 0);
556 |     int e = regex_parse("((a)|(b))++", tokens, &token_count, 0);
557 |     //int e = regex_parse("((\\w+,?)*:)", tokens, &token_count, 0);
558 |     //int e = regex_parse("((a)|((b)q))*", tokens, &token_count, 0);
559 |     assert(!e);
560 |     
561 |     int64_t cap_pos[5];
562 |     int64_t cap_span[5];
563 |     memset(cap_pos, 0xFF, sizeof(cap_pos));
564 |     memset(cap_span, 0xFF, sizeof(cap_span));
565 |     //int64_t matchlen = regex_match(tokens, "a,b,easbe_1:aaa,_,:a", 5, cap_pos, cap_span);
566 |     //int64_t matchlen = regex_match(tokens, "aabqaaaaba", 5, cap_pos, cap_span);
567 |     int64_t matchlen = regex_match(tokens, "aaaaaabbbabaqa", 0, 5, cap_pos, cap_span);
568 |     printf("Match length: %zd\n", matchlen);
569 |     for (int i = 0; i < 5; i++)
570 |         printf("Capture %d: %zd plus %zd\n", i, cap_pos[i], cap_span[i]);
571 |     
572 |     // Correct \xHH parsing (nibble order)
573 |     // Should parse and match 'A' (0x41). Also should *not* match 'B'.
574 |     must_parse_ok("\\x41");
575 |     expect_match_len("\\x41", "A", 1);
576 |     expect_no_match("\\x41", "B");
577 | 
578 |     // Sanity check: lower-case hex -> 'z' (0x7A)
579 |     must_parse_ok("\\x7a");
580 |     expect_match_len("\\x7a", "z", 1);
581 |     expect_no_match("\\x7a", "Z");
582 | 
583 |     // \xHH inside bracket/class
584 |     must_parse_ok("[\\x41]");
585 |     expect_match_len("[\\x41]", "A", 1);
586 |     expect_no_match("[\\x41]", "C");
587 | 
588 |     // Multiple items in class with hex
589 |     must_parse_ok("[ABC\\x7a]");
590 |     expect_match_len("[ABC\\x7a]", "z", 1);
591 |     expect_match_len("[ABC\\x7a]", "A", 1);
592 |     expect_no_match("[ABC\\x7a]", "q");
593 | 
594 |     // Too-short hex escapes must fail parse WITHOUT reading past end
595 |     must_parse_fail("\\x");     // nothing after 'x'
596 |     must_parse_fail("\\x4");    // only one hex nibble
597 |     must_parse_fail("[\\x]");   // same in a class
598 |     must_parse_fail("[\\x4]");  // single nibble in a class
599 | 
600 |     // Valid hex at end-of-pattern should still be OK
601 |     must_parse_ok("foo\\x41");
602 |     expect_match_len("foo\\x41", "fooA", 4);
603 |     expect_no_match("foo\\x41", "foo@");
604 | 
605 |     // Normal text around hex to ensure state transitions are correct
606 |     must_parse_ok("X\\x41Y");
607 |     expect_match_len("X\\x41Y", "XA Y", -1);  // space breaks it
608 |     expect_match_len("X\\x41Y", "XAY", 3);
609 | 
610 |     // Bracket class mixing ranges and hex
611 |     must_parse_ok("[A-\\x5A]");  // 'A'-'Z'
612 |     expect_match_len("[A-\\x5A]", "M", 1);
613 |     expect_no_match("[A-\\x5A]", "m");
614 |         
615 |     print_regex_tokens(tokens);
616 |     
617 |     puts("All regex tests passed!");
618 |     
619 |     if (1)
620 |     {
621 |         puts("Microbenchmark: matching `\\.\\d+|\\d+\\.\\d*` against 3.1415926535 one million times...");
622 |         
623 |         RegexToken tokens[256];
624 |         int16_t token_count = sizeof(tokens)/sizeof(tokens[0]);
625 |         
626 |         int e = regex_parse("\\.\\d+|\\d+\\.\\d*", tokens, &token_count, 0);
627 |         assert(!e);
628 |         auto start = clock::now();
629 |         for (size_t i = 0; i < 1000000; i++)
630 |         {
631 |             int64_t matchlen = regex_match(tokens, "3.1415926535", 0, 0, 0, 0);
632 |             assert(matchlen == 12);
633 |             volatile int64_t a = 0; matchlen = a; // force the loop to not be optimized away
634 |         }
635 |         double t = std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - start).count() / 1000000.0;
636 |         printf("Match time for me: %f\n", t);
637 |         
638 |         int errorcode;
639 |         PCRE2_SIZE erroroffset;
640 |         pcre2_code * re = pcre2_compile(PCRE2_SPTR8("\\.\\d+|\\d+\\.\\d*"), PCRE2_ZERO_TERMINATED,
641 |             PCRE2_ANCHORED | PCRE2_NO_UTF_CHECK | PCRE2_DOTALL | PCRE2_NO_AUTO_POSSESS | PCRE2_NO_DOTSTAR_ANCHOR | PCRE2_NO_START_OPTIMIZE,
642 |             &errorcode, &erroroffset, NULL);
643 |         pcre2_match_data * match_data = pcre2_match_data_create_from_pattern(re, 0);
644 |         PCRE2_SIZE * ovector = pcre2_get_ovector_pointer(match_data);
645 |         
646 |         start = clock::now();
647 |         size_t size = strlen("3.1415926535");
648 |         for (size_t i = 0; i < 1000000; i++)
649 |         {
650 |             int submatch_count = pcre2_match(re, PCRE2_SPTR8("3.1415926535"), size, 0, PCRE2_ANCHORED | PCRE2_NO_UTF_CHECK, match_data, 0);
651 |             int64_t matchlen = ovector[1] - ovector[0];
652 |             assert(submatch_count == 1 && matchlen == 12);
653 |             volatile int64_t a = 0; matchlen = a; // force the loop to not be optimized away
654 |         }
655 |         t = std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - start).count() / 1000000.0;
656 |         printf("Match time for pcre2: %f\n", t);
657 |     }
658 | }
659 | 
660 | int main(void)
661 | {
662 |     testify();
663 | }
664 | 


--------------------------------------------------------------------------------
/remimu.h:
--------------------------------------------------------------------------------
   1 | #ifndef INCLUDE_REMIMU
   2 | #define INCLUDE_REMIMU 1
   3 | 
   4 | #ifndef REMIMU_FUNC_VISIBILITY
   5 | #define REMIMU_FUNC_VISIBILITY static inline
   6 | #endif
   7 | 
   8 | #ifndef REMIMU_CONST_VISIBILITY
   9 | #define REMIMU_CONST_VISIBILITY static const
  10 | #endif
  11 | 
  12 | #ifndef REMIMU_LOG_ERROR
  13 | #define REMIMU_LOG_ERROR puts
  14 | #endif
  15 | 
  16 | #ifndef REMIMU_ITERATION_LIMIT
  17 | #define REMIMU_ITERATION_LIMIT 0 // Set to non-zero to enable an interation limit
  18 | #endif
  19 | 
  20 | #ifndef REMIMU_ASSERT
  21 | #define REMIMU_ASSERT(x) assert(x)
  22 | #endif
  23 | 
  24 | /************
  25 | 
  26 |     REMIMU: SINGLE HEADER C/C++ REGEX LIBRARY
  27 | 
  28 |     Compatible with C99 and C++11 and later standards. Uses backtracking and relatively standard regex syntax.
  29 | 
  30 |     #include "remimu.h"
  31 | 
  32 | FUNCTIONS
  33 | 
  34 |     // Returns 0 on success, or -1 on invalid or unsupported regex, or -2 on not enough tokens given to parse regex.
  35 |     int regex_parse(
  36 |         const char * pattern,       // Regex pattern to parse.
  37 |         RegexToken * tokens,        // Output buffer of token_count regex tokens.
  38 |         int16_t * token_count,      // Maximum allowed number of tokens to write
  39 |         int32_t flags               // Optional bitflags.
  40 |     )
  41 | 
  42 |     // Returns match length, or -1 on no match, or -2 on out of memory, or -3 if the regex is invalid.
  43 |     int64_t regex_match(
  44 |         const RegexToken * tokens,  // Parsed regex to match against text.
  45 |         const char * text,          // Text to match against tokens.
  46 |         size_t start_i,             // index value to match at.
  47 |         uint16_t cap_slots,         // Number of allowed capture info output slots.
  48 |         int64_t * cap_pos,          // Capture position info output buffer.
  49 |         int64_t * cap_span          // Capture length info output buffer.
  50 |     )
  51 | 
  52 |     void print_regex_tokens(
  53 |         RegexToken * tokens     // Regex tokens to spew to stdout, for debugging.
  54 |     )
  55 | 
  56 | PERFORMANCE
  57 | 
  58 |     On simple cases, Remimu's match speed is similar to PCRE2. Regex parsing/compilation is also much faster (around 4x to 10x), so single-shot regexes are often faster than PCRE2.
  59 | 
  60 |     HOWEVER: Remimu is a pure backtracking engine, and has `O(2^x)` complexity on regexes with catastrophic backtracking. It can be much, much, MUCH slower than PCRE2. Beware!
  61 | 
  62 |     Remimu uses length-checked fixed memory buffers with no recursion, so memory usage is statically known.
  63 | 
  64 | FEATURES
  65 | 
  66 |     - Lowest-common-denominator common regex syntax
  67 |     - Based on backtracking (slow in the worst case, but fast in the best case)
  68 |     - 8-bit only, no utf-16 or utf-32
  69 |     - Statically known memory usage (no heap allocation or recursion)
  70 |     - Groups with or without capture, and with or without quantifiers
  71 |     - Supported escapes:
  72 |     - - 2-digit hex: e.g. \x00, \xFF, or lowercase, or mixed case
  73 |     - - \r, \n, \t, \v, \f (whitespace characters)
  74 |     - - \d, \s, \w, \D, \S, \W (digit, space, and word character classes)
  75 |     - - \b, \B word boundary and non-word-boundary anchors (not fully supported in zero-size quantified groups, but even then, usually supported)
  76 |     - - Escaped literal characters: {}[]-()|^$*+?:./\
  77 |     - - - Escapes work in character classes, except for 'b'
  78 |     - Character classes, including disjoint ranges, proper handling of bare [ and trailing -, etc
  79 |     - - Dot (.) matches all characters, including newlines, unless REMIMU_FLAG_DOT_NO_NEWLINES is passed as a flag to regex_parse
  80 |     - - Dot (.) only matches at most one byte at a time, so matching \r\n requires two dots (and not using REMIMU_FLAG_DOT_NO_NEWLINES)
  81 |     - Anchors (^ and $)
  82 |     - - Same support caveats as \b, \B apply
  83 |     - Basic quantifiers (*, +, ?)
  84 |     - - Quantifiers are greedy by default.
  85 |     - Explicit quantifiers ({2}, {5}, {5,}, {5,7})
  86 |     - Alternation e.g. (asdf|foo)
  87 |     - Lazy quantifiers e.g. (asdf)*? or \w+?
  88 |     - Possessive greedy quantifiers e.g. (asdf)*+ or \w++
  89 |     - - NOTE: Capture groups for and inside of possessive groups return no capture information.
  90 |     - Atomic groups e.g. (?>(asdf))
  91 |     - - NOTE: Capture groups inside of atomic groups return no capture information.
  92 | 
  93 | NOT SUPPORTED
  94 | 
  95 |     - Strings with non-terminal null characters
  96 |     - Unicode character classes (matching single utf-8 characters works regardless)
  97 |     - Exact POSIX regex semantics (posix-style greediness etc)
  98 |     - Backreferences
  99 |     - Lookbehind/Lookahead
 100 |     - Named groups
 101 |     - Most other weird flavor-specific regex stuff
 102 |     - Capture of or inside of possessive-quantified groups (still take up a capture slot, but no data is returned)
 103 | 
 104 | USAGE
 105 | 
 106 |     // minimal:
 107 | 
 108 |     RegexToken tokens[1024];
 109 |     int16_t token_count = 1024;
 110 |     int e = regex_parse("[0-9]+\\.[0-9]+", tokens, &token_count, 0);
 111 |     assert(!e);
 112 | 
 113 |     int64_t match_len = regex_match(tokens, "23.53) ", 0, 0, 0, 0);
 114 |     printf("########### return: %lld\n", (long long)match_len);
 115 | 
 116 |     // with captures:
 117 | 
 118 |     RegexToken tokens[256];
 119 |     int16_t token_count = sizeof(tokens)/sizeof(tokens[0]);
 120 |     int e = regex_parse("((a)|(b))++", tokens, &token_count, 0);
 121 |     assert(!e);
 122 | 
 123 |     int64_t cap_pos[5];
 124 |     int64_t cap_span[5];
 125 |     memset(cap_pos, 0xFF, sizeof(cap_pos));
 126 |     memset(cap_span, 0xFF, sizeof(cap_span));
 127 | 
 128 |     int64_t matchlen = regex_match(tokens, "aaaaaabbbabaqa", 0, 5, cap_pos, cap_span);
 129 |     printf("Match length: %lld\n", (long long)matchlen);
 130 |     for (int i = 0; i < 5; i++)
 131 |         printf("Capture %d: %lld plus %lld\n", i, (long long)cap_pos[i], (long long)cap_span[i]);
 132 | 
 133 |     // for debugging
 134 |     print_regex_tokens(tokens);
 135 | 
 136 | LICENSE
 137 | 
 138 |     Creative Commons Zero, public domain.
 139 | 
 140 | */
 141 | 
 142 | #include <stdint.h>
 143 | #include <stdio.h>
 144 | #include <stddef.h>
 145 | #include <string.h>
 146 | #include <assert.h>
 147 | 
 148 | REMIMU_CONST_VISIBILITY int REMIMU_FLAG_DOT_NO_NEWLINES = 1;
 149 | 
 150 | REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_NORMAL      = 0;
 151 | REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_OPEN        = 1;
 152 | REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_NCOPEN      = 2;
 153 | REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_CLOSE       = 3;
 154 | REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_OR          = 4;
 155 | REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_CARET       = 5;
 156 | REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_DOLLAR      = 6;
 157 | REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_BOUND       = 7;
 158 | REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_NBOUND      = 8;
 159 | REMIMU_CONST_VISIBILITY uint8_t REMIMU_KIND_END         = 9;
 160 | 
 161 | REMIMU_CONST_VISIBILITY uint8_t REMIMU_MODE_POSSESSIVE  = 1;
 162 | REMIMU_CONST_VISIBILITY uint8_t REMIMU_MODE_LAZY        = 2;
 163 | REMIMU_CONST_VISIBILITY uint8_t REMIMU_MODE_INVERTED    = 128; // temporary; gets cleared later
 164 | 
 165 | typedef struct _RegexToken {
 166 |     uint8_t kind;
 167 |     uint8_t mode;
 168 |     uint16_t count_lo;
 169 |     uint16_t count_hi; // 0 means no limit
 170 |     uint16_t mask[16]; // for groups: mask 0 stores group-with-quantifier number (quantifiers are +, *, ?, {n}, {n,}, or {n,m})
 171 |     int16_t pair_offset; // from ( or ), offset in token list to matching paren. TODO: move into mask maybe
 172 | } RegexToken;
 173 | 
 174 | static int remimu_nibble_hex_to_bin(char hex, uint8_t *bin)
 175 | {
 176 |     if (hex >= '0' && hex <= '9')
 177 |     {
 178 |         *bin = hex - '0';
 179 |         return 0;
 180 |     }
 181 |     if (hex >= 'A' && hex <= 'F')
 182 |     {
 183 |         *bin = hex - 'A' + 10;
 184 |         return 0;
 185 |     }
 186 |     if (hex >= 'a' && hex <= 'f')
 187 |     {
 188 |         *bin = hex - 'a' + 10;
 189 |         return 0;
 190 |     }
 191 |     return -1; // invalid hex digit
 192 | }
 193 | 
 194 | /// Returns a negative number on failure:
 195 | /// -1: Regex string is invalid or using unsupported features or too long.
 196 | /// -2: Provided buffer not long enough. Give up, or reallocate with more length and retry.
 197 | /// Returns 0 on success.
 198 | /// On call, token_count pointer must point to the number of tokens that can be written to the tokens buffer.
 199 | /// On successful return, the number of actually used tokens is written to token_count.
 200 | /// Sets token_count to zero if a regex is not created but no error happened (e.g. empty pattern).
 201 | /// Flags: Optional bitflags (e.g. REMIMU_FLAG_DOT_NO_NEWLINES); pass 0 for default behavior.
 202 | /// SAFETY: Pattern must be null-terminated.
 203 | /// SAFETY: tokens buffer must have at least the input token_count number of RegexToken objects. They are allowed to be uninitialized.
 204 | REMIMU_FUNC_VISIBILITY int regex_parse(const char * pattern, RegexToken * tokens, int16_t * token_count, int32_t flags)
 205 | {
 206 |     int64_t tokens_len = *token_count;
 207 |     uint64_t pattern_len = strlen(pattern);
 208 |     if (token_count == 0)
 209 |         return -2;
 210 | 
 211 |     // 0: normal
 212 |     // 1: just saw a backslash
 213 |     int esc_state = 0;
 214 | 
 215 |     // 0: init
 216 |     // 1: normal
 217 |     // 2: in char class, initial state
 218 |     // 3: in char class, but possibly looking for a range marker
 219 |     // 4: in char class, but just saw a range marker
 220 |     // 5: immediately after quantifiable token
 221 |     // 6: immediately after quantifier
 222 | 
 223 |     const int STATE_NORMAL    = 1;
 224 |     const int STATE_QUANT     = 2;
 225 |     const int STATE_MODE      = 3;
 226 |     const int STATE_CC_INIT   = 4;
 227 |     const int STATE_CC_NORMAL = 5;
 228 |     const int STATE_CC_RANGE  = 6;
 229 |     int state = STATE_NORMAL;
 230 | 
 231 |     int char_class_mem = -1;
 232 | 
 233 |     RegexToken token;
 234 | 
 235 |     #define _REGEX_CLEAR_TOKEN() do { \
 236 |         memset(&token, 0, sizeof(RegexToken)); \
 237 |         token.count_lo = 1; \
 238 |         token.count_hi = 2; \
 239 |     } while(0)
 240 | 
 241 |     _REGEX_CLEAR_TOKEN();
 242 | 
 243 |     #define _REGEX_DO_INVERT() do { \
 244 |         for (int n = 0; n < 16; n++) \
 245 |             token.mask[n] = ~token.mask[n]; \
 246 |         token.mode &= ~REMIMU_MODE_INVERTED; \
 247 |     } while (0)
 248 | 
 249 |     int16_t k = 0;
 250 | 
 251 |     #define _REGEX_PUSH_TOKEN() do { \
 252 |         if (k == 0 || tokens[k-1].kind != token.kind || (token.kind != REMIMU_KIND_BOUND && token.kind != REMIMU_KIND_NBOUND)) \
 253 |         { \
 254 |             if (token.mode & REMIMU_MODE_INVERTED) _REGEX_DO_INVERT(); \
 255 |             if (k >= tokens_len) \
 256 |             { \
 257 |                 REMIMU_LOG_ERROR("buffer overflow"); \
 258 |                 return -2; \
 259 |             } \
 260 |             tokens[k++] = token; \
 261 |             _REGEX_CLEAR_TOKEN(); \
 262 |         } \
 263 |     } while (0)
 264 | 
 265 |     #define _REGEX_SET_MASK(byte) do { token.mask[((uint8_t)(byte))>>4] |= 1 << ((uint8_t)(byte) & 0xF); } while (0)
 266 |     #define _REGEX_SET_MASK_ALL() do { \
 267 |         for (int n = 0; n < 16; n++) \
 268 |             token.mask[n] = 0xFFFF; \
 269 |     } while (0)
 270 | 
 271 |     // start with an invisible group specifier
 272 |     // (this allows the matcher to not need to have a special root-level alternation operator case)
 273 |     token.kind = REMIMU_KIND_OPEN;
 274 |     token.count_lo = 0;
 275 |     token.count_hi = 0;
 276 | 
 277 |     int paren_count = 0;
 278 | 
 279 |     for (uint64_t i = 0; i < pattern_len; i++)
 280 |     {
 281 |         char c = pattern[i];
 282 |         if (state == STATE_QUANT)
 283 |         {
 284 |             state = STATE_MODE;
 285 |             if (c == '?')
 286 |             {
 287 |                 token.count_lo = 0;
 288 |                 token.count_hi = 2; // first non-allowed amount
 289 |                 continue;
 290 |             }
 291 |             else if (c == '+')
 292 |             {
 293 |                 token.count_lo = 1;
 294 |                 token.count_hi = 0; // unlimited
 295 |                 continue;
 296 |             }
 297 |             else if (c == '*')
 298 |             {
 299 |                 token.count_lo = 0;
 300 |                 token.count_hi = 0; // unlimited
 301 |                 continue;
 302 |             }
 303 |             else if (c == '{')
 304 |             {
 305 |                 if (pattern[i+1] == 0 || pattern[i+1] < '0' || pattern[i+1] > '9')
 306 |                     state = STATE_NORMAL;
 307 |                 else
 308 |                 {
 309 |                     i += 1;
 310 |                     uint32_t val = 0;
 311 |                     while (pattern[i] >= '0' && pattern[i] <= '9')
 312 |                     {
 313 |                         val *= 10;
 314 |                         val += (uint32_t)(pattern[i] - '0');
 315 |                         if (val > 0xFFFF)
 316 |                         {
 317 |                             REMIMU_LOG_ERROR("quantifier range too long");
 318 |                             return -1; // unsupported length
 319 |                         }
 320 |                         i += 1;
 321 |                     }
 322 |                     token.count_lo = val;
 323 |                     token.count_hi = val + 1;
 324 |                     if (pattern[i] == ',')
 325 |                     {
 326 |                         token.count_hi = 0; // unlimited
 327 |                         i += 1;
 328 | 
 329 |                         if (pattern[i] >= '0' && pattern[i] <= '9')
 330 |                         {
 331 |                             uint32_t val2 = 0;
 332 |                             while (pattern[i] >= '0' && pattern[i] <= '9')
 333 |                             {
 334 |                                 val2 *= 10;
 335 |                                 val2 += (uint32_t)(pattern[i] - '0');
 336 |                                 if (val2 > 0xFFFF)
 337 |                                 {
 338 |                                     REMIMU_LOG_ERROR("quantifier range too long");
 339 |                                     return -1; // unsupported length
 340 |                                 }
 341 |                                 i += 1;
 342 |                             }
 343 |                             if (val2 < val)
 344 |                             {
 345 |                                 REMIMU_LOG_ERROR("quantifier range is backwards");
 346 |                                 return -1; // unsupported length
 347 |                             }
 348 |                             token.count_hi = val2 + 1;
 349 |                         }
 350 |                     }
 351 | 
 352 |                     if (pattern[i] == '}')
 353 |                     {
 354 |                         // quantifier range parsed successfully
 355 |                         continue;
 356 |                     }
 357 |                     else
 358 |                     {
 359 |                         REMIMU_LOG_ERROR("quantifier range syntax broken (no terminator)");
 360 |                         return -1;
 361 |                     }
 362 |                 }
 363 |             }
 364 |         }
 365 | 
 366 |         if (state == STATE_MODE)
 367 |         {
 368 |             state = STATE_NORMAL;
 369 |             if (c == '?')
 370 |             {
 371 |                 token.mode |= REMIMU_MODE_LAZY;
 372 |                 continue;
 373 |             }
 374 |             else if (c == '+')
 375 |             {
 376 |                 token.mode |= REMIMU_MODE_POSSESSIVE;
 377 |                 continue;
 378 |             }
 379 |         }
 380 | 
 381 |         if (state == STATE_NORMAL)
 382 |         {
 383 |             if (esc_state == 1)
 384 |             {
 385 |                 esc_state = 0;
 386 |                 if (c == 'n')
 387 |                     _REGEX_SET_MASK('\n');
 388 |                 else if (c == 'r')
 389 |                     _REGEX_SET_MASK('\r');
 390 |                 else if (c == 't')
 391 |                     _REGEX_SET_MASK('\t');
 392 |                 else if (c == 'v')
 393 |                     _REGEX_SET_MASK('\v');
 394 |                 else if (c == 'f')
 395 |                     _REGEX_SET_MASK('\f');
 396 |                 else if (c == 'x')
 397 |                 {
 398 |                     if (pattern[i+1] == 0 || pattern[i+2] == 0)
 399 |                         return -1; // too-short hex pattern
 400 |                     uint8_t n0, n1;
 401 |                     if (remimu_nibble_hex_to_bin(pattern[i+1], &n0))
 402 |                         return -1; // invalid hex
 403 |                     if (remimu_nibble_hex_to_bin(pattern[i+2], &n1))
 404 |                         return -1; // invalid hex
 405 |                     _REGEX_SET_MASK((n0 << 4) | n1);
 406 |                     i += 2;
 407 |                     state = STATE_QUANT;
 408 |                 }
 409 |                 else if (c == '{' || c == '}' ||
 410 |                          c == '[' || c == ']' || c == '-' ||
 411 |                          c == '(' || c == ')' ||
 412 |                          c == '|' || c == '^' || c == '$' ||
 413 |                          c == '*' || c == '+' || c == '?' || c == ':' ||
 414 |                          c == '.' || c == '/' || c == '\\')
 415 |                 {
 416 |                     _REGEX_SET_MASK(c);
 417 |                     state = STATE_QUANT;
 418 |                 }
 419 |                 else if (c == 'd' || c == 's' || c == 'w' ||
 420 |                          c == 'D' || c == 'S' || c == 'W')
 421 |                 {
 422 |                     uint8_t is_upper = c <= 'Z';
 423 | 
 424 |                     uint16_t m[16];
 425 |                     memset(m, 0, sizeof(m));
 426 | 
 427 |                     if (is_upper)
 428 |                         c += 0x20;
 429 |                     if (c == 'd' || c == 'w')
 430 |                         m[3] |= 0x03FF; // 0~7
 431 |                     if (c == 's')
 432 |                     {
 433 |                         m[0] |= 0x3E00; // \t-\r (includes \n, \v, and \f in the middle. 5 enabled bits.)
 434 |                         m[2] |= 1; // ' '
 435 |                     }
 436 |                     if (c == 'w')
 437 |                     {
 438 |                         m[4] |= 0xFFFE; // A-O
 439 |                         m[5] |= 0x87FF; // P-Z_
 440 |                         m[6] |= 0xFFFE; // a-o
 441 |                         m[7] |= 0x07FF; // p-z
 442 |                     }
 443 | 
 444 |                     for (int j = 0; j < 16; j++)
 445 |                         token.mask[j] |= is_upper ? ~m[j] : m[j];
 446 | 
 447 |                     token.kind = REMIMU_KIND_NORMAL;
 448 |                     state = STATE_QUANT;
 449 |                 }
 450 |                 else if (c == 'b')
 451 |                 {
 452 |                     token.kind = REMIMU_KIND_BOUND;
 453 |                     state = STATE_NORMAL;
 454 |                 }
 455 |                 else if (c == 'B')
 456 |                 {
 457 |                     token.kind = REMIMU_KIND_NBOUND;
 458 |                     state = STATE_NORMAL;
 459 |                 }
 460 |                 else
 461 |                 {
 462 |                     REMIMU_LOG_ERROR("unsupported escape sequence");
 463 |                     return -1; // unknown/unsupported escape sequence
 464 |                 }
 465 |             }
 466 |             else
 467 |             {
 468 |                 _REGEX_PUSH_TOKEN();
 469 |                 if (c == '\\')
 470 |                 {
 471 |                     esc_state = 1;
 472 |                 }
 473 |                 else if (c == '[')
 474 |                 {
 475 |                     state = STATE_CC_INIT;
 476 |                     char_class_mem = -1;
 477 |                     token.kind = REMIMU_KIND_NORMAL;
 478 |                     if (pattern[i + 1] == '^')
 479 |                     {
 480 |                         token.mode |= REMIMU_MODE_INVERTED;
 481 |                         i += 1;
 482 |                     }
 483 |                 }
 484 |                 else if (c == '(')
 485 |                 {
 486 |                     paren_count += 1;
 487 |                     state = STATE_NORMAL;
 488 |                     token.kind = REMIMU_KIND_OPEN;
 489 |                     token.count_lo = 0;
 490 |                     token.count_hi = 1;
 491 |                     if (pattern[i + 1] == '?' && pattern[i + 2] == ':')
 492 |                     {
 493 |                         token.kind = REMIMU_KIND_NCOPEN;
 494 |                         i += 2;
 495 |                     }
 496 |                     else if (pattern[i + 1] == '?' && pattern[i + 2] == '>')
 497 |                     {
 498 |                         token.kind = REMIMU_KIND_NCOPEN;
 499 |                         _REGEX_PUSH_TOKEN();
 500 | 
 501 |                         state = STATE_NORMAL;
 502 |                         token.kind = REMIMU_KIND_NCOPEN;
 503 |                         token.mode = REMIMU_MODE_POSSESSIVE;
 504 |                         token.count_lo = 1;
 505 |                         token.count_hi = 2;
 506 | 
 507 |                         i += 2;
 508 |                     }
 509 |                 }
 510 |                 else if (c == ')')
 511 |                 {
 512 |                     paren_count -= 1;
 513 |                     if (paren_count < 0 || k == 0)
 514 |                         return -1; // unbalanced parens
 515 |                     token.kind = REMIMU_KIND_CLOSE;
 516 |                     state = STATE_QUANT;
 517 | 
 518 |                     int balance = 0;
 519 |                     ptrdiff_t found = -1;
 520 |                     for (ptrdiff_t l = k - 1; l >= 0; l--)
 521 |                     {
 522 |                         if (tokens[l].kind == REMIMU_KIND_NCOPEN || tokens[l].kind == REMIMU_KIND_OPEN)
 523 |                         {
 524 |                             if (balance == 0)
 525 |                             {
 526 |                                 found = l;
 527 |                                 break;
 528 |                             }
 529 |                             else
 530 |                                 balance -= 1;
 531 |                         }
 532 |                         else if (tokens[l].kind == REMIMU_KIND_CLOSE)
 533 |                             balance += 1;
 534 |                     }
 535 |                     if (found == -1)
 536 |                         return -1; // unbalanced parens
 537 |                     ptrdiff_t diff = k - found;
 538 |                     if (diff > 32767)
 539 |                         return -1; // too long
 540 |                     token.pair_offset = -diff;
 541 |                     tokens[found].pair_offset = diff;
 542 |                     // phantom group for atomic group emulation
 543 |                     if (tokens[found].mode == REMIMU_MODE_POSSESSIVE)
 544 |                     {
 545 |                         _REGEX_PUSH_TOKEN();
 546 |                         token.kind = REMIMU_KIND_CLOSE;
 547 |                         token.mode = REMIMU_MODE_POSSESSIVE;
 548 |                         token.pair_offset = -diff - 2;
 549 |                         tokens[found - 1].pair_offset = diff + 2;
 550 |                     }
 551 |                 }
 552 |                 else if (c == '?' || c == '+' || c == '*' || c == '{')
 553 |                 {
 554 |                     REMIMU_LOG_ERROR("quantifier in non-quantifier context");
 555 |                     return -1; // quantifier in non-quantifier context
 556 |                 }
 557 |                 else if (c == '.')
 558 |                 {
 559 |                     //puts("setting ALL of mask...");
 560 |                     _REGEX_SET_MASK_ALL();
 561 |                     if (flags & REMIMU_FLAG_DOT_NO_NEWLINES)
 562 |                     {
 563 |                         token.mask[1] ^= 0x04; // \n
 564 |                         token.mask[1] ^= 0x20; // \r
 565 |                     }
 566 |                     state = STATE_QUANT;
 567 |                 }
 568 |                 else if (c == '^')
 569 |                 {
 570 |                     token.kind = REMIMU_KIND_CARET;
 571 |                     state = STATE_NORMAL;
 572 |                 }
 573 |                 else if (c == '$')
 574 |                 {
 575 |                     token.kind = REMIMU_KIND_DOLLAR;
 576 |                     state = STATE_NORMAL;
 577 |                 }
 578 |                 else if (c == '|')
 579 |                 {
 580 |                     token.kind = REMIMU_KIND_OR;
 581 |                     state = STATE_NORMAL;
 582 |                 }
 583 |                 else
 584 |                 {
 585 |                     _REGEX_SET_MASK(c);
 586 |                     state = STATE_QUANT;
 587 |                 }
 588 |             }
 589 |         }
 590 |         else if (state == STATE_CC_INIT || state == STATE_CC_NORMAL || state == STATE_CC_RANGE)
 591 |         {
 592 |             if (c == '\\' && esc_state == 0)
 593 |             {
 594 |                 esc_state = 1;
 595 |                 continue;
 596 |             }
 597 |             uint8_t esc_c = 0;
 598 |             if (esc_state == 1)
 599 |             {
 600 |                 esc_state = 0;
 601 |                 if (c == 'n')
 602 |                     esc_c = '\n';
 603 |                 else if (c == 'r')
 604 |                     esc_c = '\r';
 605 |                 else if (c == 't')
 606 |                     esc_c = '\t';
 607 |                 else if (c == 'v')
 608 |                     esc_c = '\v';
 609 |                 else if (c == 'f')
 610 |                     esc_c = '\f';
 611 |                 else if (c == 'x')
 612 |                 {
 613 |                     if (pattern[i+1] == 0 || pattern[i+2] == 0)
 614 |                         return -1; // too-short hex pattern
 615 |                     uint8_t n0, n1;
 616 |                     if (remimu_nibble_hex_to_bin(pattern[i+1], &n0))
 617 |                         return -1; // invalid hex
 618 |                     if (remimu_nibble_hex_to_bin(pattern[i+2], &n1))
 619 |                         return -1; // invalid hex
 620 |                     esc_c = (n0 << 4) | n1;
 621 |                     i += 2;
 622 |                 }
 623 |                 else if (c == '{' || c == '}' ||
 624 |                          c == '[' || c == ']' || c == '-' ||
 625 |                          c == '(' || c == ')' ||
 626 |                          c == '|' || c == '^' || c == '$' ||
 627 |                          c == '*' || c == '+' || c == '?' || c == ':' ||
 628 |                          c == '.' || c == '/' || c == '\\')
 629 |                 {
 630 |                     esc_c = c;
 631 |                 }
 632 |                 else if (c == 'd' || c == 's' || c == 'w' ||
 633 |                          c == 'D' || c == 'S' || c == 'W')
 634 |                 {
 635 |                     if (state == STATE_CC_RANGE)
 636 |                     {
 637 |                         REMIMU_LOG_ERROR("tried to use a shorthand as part of a range");
 638 |                         return -1; // range shorthands can't be part of a range
 639 |                     }
 640 |                     uint8_t is_upper = c <= 'Z';
 641 | 
 642 |                     uint16_t m[16];
 643 |                     memset(m, 0, sizeof(m));
 644 | 
 645 |                     if (is_upper)
 646 |                         c += 0x20;
 647 |                     if (c == 'd' || c == 'w')
 648 |                         m[3] |= 0x03FF; // 0~7
 649 |                     if (c == 's')
 650 |                     {
 651 |                         m[0] |= 0x3E00; // \t-\r (includes \n, \v, and \f in the middle. 5 enabled bits.)
 652 |                         m[2] |= 1; // ' '
 653 |                     }
 654 |                     if (c == 'w')
 655 |                     {
 656 |                         m[4] |= 0xFFFE; // A-O
 657 |                         m[5] |= 0x87FF; // P-Z_
 658 |                         m[6] |= 0xFFFE; // a-o
 659 |                         m[7] |= 0x07FF; // p-z
 660 |                     }
 661 | 
 662 |                     for (int j = 0; j < 16; j++)
 663 |                         token.mask[j] |= is_upper ? ~m[j] : m[j];
 664 | 
 665 |                     char_class_mem = -1; // range shorthands can't be part of a range
 666 |                     continue;
 667 |                 }
 668 |                 else
 669 |                 {
 670 |                     printf("unknown/unsupported escape sequence in character class (\\%c)\n", c);
 671 |                     return -1; // unknown/unsupported escape sequence
 672 |                 }
 673 |             }
 674 |             if (state == STATE_CC_INIT)
 675 |             {
 676 |                 uint8_t val = esc_c ? esc_c : (uint8_t)c;
 677 |                 char_class_mem = val;
 678 |                 _REGEX_SET_MASK(val);
 679 |                 state = STATE_CC_NORMAL;
 680 |             }
 681 |             else if (state == STATE_CC_NORMAL)
 682 |             {
 683 |                 if (c == ']' && esc_c == 0)
 684 |                 {
 685 |                     char_class_mem = -1;
 686 |                     state = STATE_QUANT;
 687 |                     continue;
 688 |                 }
 689 |                 else if (c == '-' && esc_c == 0 && char_class_mem >= 0)
 690 |                 {
 691 |                     state = STATE_CC_RANGE;
 692 |                     continue;
 693 |                 }
 694 |                 else
 695 |                 {
 696 |                     uint8_t val = esc_c ? esc_c : (uint8_t)c;
 697 |                     char_class_mem = val;
 698 |                     _REGEX_SET_MASK(val);
 699 |                     state = STATE_CC_NORMAL;
 700 |                 }
 701 |             }
 702 |             else if (state == STATE_CC_RANGE)
 703 |             {
 704 |                 if (c == ']' && esc_c == 0)
 705 |                 {
 706 |                     char_class_mem = -1;
 707 |                     _REGEX_SET_MASK('-');
 708 |                     state = STATE_QUANT;
 709 |                     continue;
 710 |                 }
 711 |                 else
 712 |                 {
 713 |                     if (char_class_mem == -1)
 714 |                     {
 715 |                         REMIMU_LOG_ERROR("character class range is broken");
 716 |                         return -1; // probably tried to use a character class shorthand as part of a range
 717 |                     }
 718 |                     uint8_t rhs = esc_c ? esc_c : (uint8_t)c;
 719 |                     if (rhs < (uint8_t)char_class_mem)
 720 |                     {
 721 |                         REMIMU_LOG_ERROR("character class range is misordered");
 722 |                         return -1; // range is in wrong order
 723 |                     }
 724 |                     //printf("enabling char class from %d to %d...\n", char_class_mem, c);
 725 |                     for (uint8_t j = rhs; j > (uint8_t)char_class_mem; j--)
 726 |                         _REGEX_SET_MASK(j);
 727 |                     state = STATE_CC_NORMAL;
 728 |                     char_class_mem = -1;
 729 |                 }
 730 |             }
 731 |         }
 732 |         else
 733 |             REMIMU_ASSERT(0);
 734 |     }
 735 |     if (paren_count > 0)
 736 |     {
 737 |         REMIMU_LOG_ERROR("(paren_count > 0)");
 738 |         return -1; // unbalanced parens
 739 |     }
 740 |     if (esc_state != 0)
 741 |     {
 742 |         REMIMU_LOG_ERROR("(esc_state != 0)");
 743 |         return -1; // open escape sequence
 744 |     }
 745 |     if (state >= STATE_CC_INIT)
 746 |     {
 747 |         REMIMU_LOG_ERROR("(state >= STATE_CC_INIT)");
 748 |         return -1; // open character class
 749 |     }
 750 | 
 751 |     _REGEX_PUSH_TOKEN();
 752 | 
 753 |     // add invisible non-capturing group specifier
 754 |     token.kind = REMIMU_KIND_CLOSE;
 755 |     token.count_lo = 1;
 756 |     token.count_hi = 2;
 757 |     _REGEX_PUSH_TOKEN();
 758 | 
 759 |     // add end token (tells matcher that it's done)
 760 |     token.kind = REMIMU_KIND_END;
 761 |     _REGEX_PUSH_TOKEN();
 762 | 
 763 |     tokens[0].pair_offset = k - 2;
 764 |     tokens[k-2].pair_offset = -(k - 2);
 765 | 
 766 |     *token_count = k;
 767 | 
 768 |     // copy quantifiers from )s to (s (so (s know whether they're optional)
 769 |     // also take the opportunity to smuggle "quantified group index" into the mask field for the )
 770 |     uint64_t n = 0;
 771 |     for (int16_t k2 = 0; k2 < k; k2++)
 772 |     {
 773 |         if (tokens[k2].kind == REMIMU_KIND_CLOSE)
 774 |         {
 775 |             tokens[k2].mask[0] = n++;
 776 | 
 777 |             int16_t k3 = k2 + tokens[k2].pair_offset;
 778 |             tokens[k3].count_lo = tokens[k2].count_lo;
 779 |             tokens[k3].count_hi = tokens[k2].count_hi;
 780 |             tokens[k3].mask[0] = n++;
 781 |             tokens[k3].mode = tokens[k2].mode;
 782 | 
 783 |             //if (n > 65535)
 784 |             if (n > 1024)
 785 |                 return -1; // too many quantified groups
 786 |         }
 787 |         else if (tokens[k2].kind == REMIMU_KIND_OR || tokens[k2].kind == REMIMU_KIND_OPEN || tokens[k2].kind == REMIMU_KIND_NCOPEN)
 788 |         {
 789 |             // find next | or ) and how far away it is. store in token
 790 |             int balance = 0;
 791 |             ptrdiff_t found = -1;
 792 |             for (ptrdiff_t l = k2 + 1; l < k; l++)
 793 |             {
 794 |                 if (tokens[l].kind == REMIMU_KIND_OR && balance == 0)
 795 |                 {
 796 |                     found = l;
 797 |                     break;
 798 |                 }
 799 |                 else if (tokens[l].kind == REMIMU_KIND_CLOSE)
 800 |                 {
 801 |                     if (balance == 0)
 802 |                     {
 803 |                         found = l;
 804 |                         break;
 805 |                     }
 806 |                     else
 807 |                         balance -= 1;
 808 |                 }
 809 |                 else if (tokens[l].kind == REMIMU_KIND_NCOPEN || tokens[l].kind == REMIMU_KIND_OPEN)
 810 |                     balance += 1;
 811 |             }
 812 |             if (found == -1)
 813 |             {
 814 |                 REMIMU_LOG_ERROR("unbalanced parens...");
 815 |                 return -1; // unbalanced parens
 816 |             }
 817 |             ptrdiff_t diff = found - k2;
 818 |             if (diff > 32767)
 819 |             {
 820 |                 REMIMU_LOG_ERROR("too long...");
 821 |                 return -1; // too long
 822 |             }
 823 | 
 824 |             if (tokens[k2].kind == REMIMU_KIND_OR)
 825 |                 tokens[k2].pair_offset = diff;
 826 |             else
 827 |                 tokens[k2].mask[15] = diff;
 828 |         }
 829 |     }
 830 | 
 831 |     #undef _REGEX_PUSH_TOKEN
 832 |     #undef _REGEX_SET_MASK
 833 |     #undef _REGEX_CLEAR_TOKEN
 834 | 
 835 |     return 0;
 836 | }
 837 | 
 838 | typedef struct _RegexMatcherState { // one saved backtracking point; the matcher keeps these in its rewind stack
 839 |     uint32_t k; // token index stored when the state is saved; rewinding resumes matching from this token
 840 |     uint32_t group_state; // quantified group temp state (e.g. number of repetitions); only filled in when k refers to a ) token
 841 |     uint32_t prev; // for )s, stack index of corresponding previous quantified state; the value 0xFAC7 marks a dummy entry that rewinding skips over
 842 | #ifdef REGEX_STACK_SMOL // compact variant: 32-bit fields, so the matcher's 64-bit values are narrowed when stored
 843 |     uint32_t i; // text index at the time of the save
 844 |     uint32_t range_min; // saved copy of the matcher's range_min
 845 |     uint32_t range_max; // saved copy of the matcher's range_max
 846 | #else // default variant: fields are as wide as the matcher's own 64-bit variables
 847 |     uint64_t i; // text index at the time of the save
 848 |     uint64_t range_min; // saved copy of the matcher's range_min
 849 |     uint64_t range_max; // saved copy of the matcher's range_max
 850 | #endif
 851 | } RegexMatcherState;
 852 | 
 853 | // Evaluates to 1 if the bit for `byte` is set in tokens[K].mask, else 0: the high nibble of the byte selects the mask word and the low nibble selects the bit inside that word. Relies on a `tokens` array being in scope at the use site. K and byte are not parenthesized in the expansion, so pass simple expressions only. NOTE: undef'd later
 854 | #define _REGEX_CHECK_MASK(K, byte) (!!(tokens[K].mask[((uint8_t)byte)>>4] & (1 << ((uint8_t)byte & 0xF))))
 855 | 
 856 | // Returns match length if text starts with a regex match.
 857 | // Returns -1 if the text doesn't start with a regex match.
 858 | // Returns -2 if the matcher ran out of memory or the regex is too complex.
 859 | // Returns -3 if the regex is somehow invalid.
 860 | // The first cap_slots capture positions and spans (lengths) will be written to cap_pos and cap_span. If cap_slots is zero, neither array is written to.
 861 | // SAFETY: The text variable must be null-terminated, and start_i must be the index of a character within the string or its null terminator.
 862 | // SAFETY: Tokens array must be terminated by a REMIMU_KIND_END token (done by default by regex_parse).
 863 | // SAFETY: Partial capture data may be written even if the match fails.
 864 | REMIMU_FUNC_VISIBILITY int64_t regex_match(const RegexToken * tokens, const char * text, size_t start_i, uint16_t cap_slots, int64_t * cap_pos, int64_t * cap_span)
 865 | {
 866 |     (void)text;
 867 | 
 868 | #ifdef REGEX_VERBOSE
 869 |     const uint8_t verbose = 1;
 870 | #else
 871 |     const uint8_t verbose = 0;
 872 | #endif
 873 | 
 874 | #define IF_VERBOSE(X) { if (verbose) { X } }
 875 | 
 876 | #ifdef REGEX_STACK_SMOL
 877 |     const uint16_t stack_size_max = 256;
 878 | #else
 879 |     const uint16_t stack_size_max = 1024;
 880 | #endif
 881 |     const uint16_t aux_stats_size = 1024;
 882 |     if (cap_slots > aux_stats_size)
 883 |         cap_slots = aux_stats_size;
 884 | 
 885 |     // quantified group state
 886 |     uint8_t q_group_accepts_zero[aux_stats_size];
 887 |     uint32_t q_group_state[aux_stats_size]; // number of repetitions
 888 |     uint32_t q_group_stack[aux_stats_size]; // location of most recent corresponding ) on stack. 0 means nowhere
 889 | 
 890 |     uint16_t q_group_cap_index[aux_stats_size];
 891 |     memset(q_group_cap_index, 0xFF, sizeof(q_group_cap_index));
 892 | 
 893 |     uint64_t tokens_len = 0;
 894 |     uint32_t k = 0;
 895 |     uint16_t caps = 0;
 896 | 
 897 |     while (tokens[k].kind != REMIMU_KIND_END)
 898 |     {
 899 |         if (tokens[k].kind == REMIMU_KIND_OPEN && caps < cap_slots)
 900 |         {
 901 |             q_group_cap_index[tokens[k].mask[0]] = caps;
 902 |             q_group_cap_index[tokens[k + tokens[k].pair_offset].mask[0]] = caps;
 903 |             cap_pos[caps] = -1;
 904 |             cap_span[caps] = -1;
 905 |             caps += 1;
 906 |         }
 907 |         k += 1;
 908 |         if (tokens[k].kind == REMIMU_KIND_CLOSE || tokens[k].kind == REMIMU_KIND_OPEN || tokens[k].kind == REMIMU_KIND_NCOPEN)
 909 |         {
 910 |             if (tokens[k].mask[0] >= aux_stats_size)
 911 |             {
 912 |                 REMIMU_LOG_ERROR("too many qualified groups. returning");
 913 |                 return -2; // OOM: too many quantified groups
 914 |             }
 915 | 
 916 |             q_group_state[tokens[k].mask[0]] = 0;
 917 |             q_group_stack[tokens[k].mask[0]] = 0;
 918 |             q_group_accepts_zero[tokens[k].mask[0]] = 0;
 919 |         }
 920 |     }
 921 | 
 922 |     tokens_len = k;
 923 | 
 924 |     RegexMatcherState rewind_stack[stack_size_max];
 925 |     uint16_t stack_n = 0;
 926 | 
 927 |     uint64_t i = start_i;
 928 | 
 929 |     uint64_t range_min = 0;
 930 |     uint64_t range_max = 0;
 931 |     uint8_t just_rewinded = 0;
 932 | 
 933 |     #define _P_TEXT_HIGHLIGHTED() do { \
 934 |         IF_VERBOSE(printf("\033[91m"); \
 935 |         for (uint64_t q = 0; q < i; q++) printf("%c", text[q]); \
 936 |         printf("\033[0m"); \
 937 |         for (uint64_t q = i; text[q] != 0; q++) printf("%c", text[q]); \
 938 |         printf("\n");) \
 939 |     } while (0)
 940 | 
 941 |     #define _REWIND_DO_SAVE_RAW(K, ISDUMMY) do { \
 942 |         if (stack_n >= stack_size_max) \
 943 |         { \
 944 |             REMIMU_LOG_ERROR("out of backtracking room. returning"); \
 945 |             return -2; \
 946 |         } \
 947 |         RegexMatcherState s; \
 948 |         memset(&s, 0, sizeof(RegexMatcherState)); \
 949 |         s.i = i; \
 950 |         s.k = (K); \
 951 |         s.range_min = range_min; \
 952 |         s.range_max = range_max; \
 953 |         s.prev = 0; \
 954 |         if (ISDUMMY) s.prev = 0xFAC7; \
 955 |         else if (tokens[s.k].kind == REMIMU_KIND_CLOSE) \
 956 |         { \
 957 |             s.group_state = q_group_state[tokens[s.k].mask[0]]; \
 958 |             s.prev = q_group_stack[tokens[s.k].mask[0]]; \
 959 |             q_group_stack[tokens[s.k].mask[0]] = stack_n; \
 960 |         } \
 961 |         rewind_stack[stack_n++] = s; \
 962 |         _P_TEXT_HIGHLIGHTED(); \
 963 |         IF_VERBOSE(printf("-- saving rewind state k %u i %zd rmin %zu rmax %zd (line %d) (depth %d prev %d)\n", s.k, i, range_min, range_max, __LINE__, stack_n, s.prev);) \
 964 |     } while (0)
 965 |     #define _REWIND_DO_SAVE_DUMMY(K) _REWIND_DO_SAVE_RAW(K, 1)
 966 |     #define _REWIND_DO_SAVE(K) _REWIND_DO_SAVE_RAW(K, 0)
 967 | 
 968 |     #define _REWIND_OR_ABORT() do { \
 969 |         if (stack_n == 0) \
 970 |             return -1; \
 971 |         stack_n -= 1; \
 972 |         while (stack_n > 0 && rewind_stack[stack_n].prev == 0xFAC7) stack_n -= 1; \
 973 |         just_rewinded = 1; \
 974 |         range_min = rewind_stack[stack_n].range_min; \
 975 |         range_max = rewind_stack[stack_n].range_max; \
 976 |         REMIMU_ASSERT(rewind_stack[stack_n].i <= i); \
 977 |         i = rewind_stack[stack_n].i; \
 978 |         k = rewind_stack[stack_n].k; \
 979 |         if (tokens[k].kind == REMIMU_KIND_CLOSE) \
 980 |         { \
 981 |             q_group_state[tokens[k].mask[0]] = rewind_stack[stack_n].group_state; \
 982 |             q_group_stack[tokens[k].mask[0]] = rewind_stack[stack_n].prev; \
 983 |         } \
 984 |         _P_TEXT_HIGHLIGHTED(); \
 985 |         IF_VERBOSE(printf("-- rewound to k %u i %zd rmin %zu rmax %zd (kind %d prev %d)\n", k, i, range_min, range_max, tokens[k].kind, rewind_stack[stack_n].prev);) \
 986 |         k -= 1; \
 987 |     } while (0)
 988 |     // the -= 1 is because of the k++ in the for loop
 989 | 
 990 |     // used in boundary anchor checker
 991 |     uint64_t w_mask[16];
 992 |     memset(w_mask, 0, sizeof(w_mask));
 993 |     w_mask[3] = 0x03FF;
 994 |     w_mask[4] = 0xFFFE;
 995 |     w_mask[5] = 0x87FF;
 996 |     w_mask[6] = 0xFFFE;
 997 |     w_mask[7] = 0x07FF;
 998 |     #define _REGEX_CHECK_IS_W(byte) (!!(w_mask[((uint8_t)byte)>>4] & (1 << ((uint8_t)byte & 0xF))))
 999 | 
1000 |     int limit = REMIMU_ITERATION_LIMIT;
1001 |     for (k = 0; k < tokens_len; k++)
1002 |     {
1003 |         if (REMIMU_ITERATION_LIMIT)
1004 |         {
1005 |             if (limit-- == 0)
1006 |             {
1007 |                 REMIMU_LOG_ERROR("iteration limit exceeded. returning");
1008 |                 return -2;
1009 |             }
1010 |         }
1011 |         IF_VERBOSE(printf("k: %u\ti: %zu\tl: %zu\tstack_n: %d\n", k, i, limit, stack_n);)
1012 |         _P_TEXT_HIGHLIGHTED();
1013 |         if (tokens[k].kind == REMIMU_KIND_CARET)
1014 |         {
1015 |             if (i != 0)
1016 |                 _REWIND_OR_ABORT();
1017 |             continue;
1018 |         }
1019 |         else if (tokens[k].kind == REMIMU_KIND_DOLLAR)
1020 |         {
1021 |             if (text[i] != 0)
1022 |                 _REWIND_OR_ABORT();
1023 |             continue;
1024 |         }
1025 |         else if (tokens[k].kind == REMIMU_KIND_BOUND)
1026 |         {
1027 |             if (i == 0 && !_REGEX_CHECK_IS_W(text[i]))
1028 |                 _REWIND_OR_ABORT();
1029 |             else if (i != 0 && text[i] == 0 && !_REGEX_CHECK_IS_W(text[i-1]))
1030 |                 _REWIND_OR_ABORT();
1031 |             else if (i != 0 && text[i] != 0 && _REGEX_CHECK_IS_W(text[i-1]) == _REGEX_CHECK_IS_W(text[i]))
1032 |                 _REWIND_OR_ABORT();
1033 |         }
1034 |         else if (tokens[k].kind == REMIMU_KIND_NBOUND)
1035 |         {
1036 |             if (i == 0 && _REGEX_CHECK_IS_W(text[i]))
1037 |                 _REWIND_OR_ABORT();
1038 |             else if (i != 0 && text[i] == 0 && _REGEX_CHECK_IS_W(text[i-1]))
1039 |                 _REWIND_OR_ABORT();
1040 |             else if (i != 0 && text[i] != 0 && _REGEX_CHECK_IS_W(text[i-1]) != _REGEX_CHECK_IS_W(text[i]))
1041 |                 _REWIND_OR_ABORT();
1042 |         }
1043 |         else
1044 |         {
1045 |             // deliberately unmatchable token (e.g. a{0}, a{0,0})
1046 |             if (tokens[k].count_hi == 1)
1047 |             {
1048 |                 if (tokens[k].kind == REMIMU_KIND_OPEN || tokens[k].kind == REMIMU_KIND_NCOPEN)
1049 |                     k += tokens[k].pair_offset;
1050 |                 else
1051 |                     k += 1;
1052 |                 continue;
1053 |             }
1054 | 
1055 |             if (tokens[k].kind == REMIMU_KIND_OPEN || tokens[k].kind == REMIMU_KIND_NCOPEN)
1056 |             {
1057 |                 if (!just_rewinded)
1058 |                 {
1059 |                     IF_VERBOSE(printf("hit OPEN. i is %zd, depth is %d\n", i, stack_n);)
1060 |                     // need this to be able to detect and reject zero-size matches
1061 |                     //q_group_state[tokens[k].mask[0]] = i;
1062 | 
1063 |                     // if we're lazy and the min length is 0, we need to try the non-group case first
1064 |                     if ((tokens[k].mode & REMIMU_MODE_LAZY) && (tokens[k].count_lo == 0 || q_group_accepts_zero[tokens[k + tokens[k].pair_offset].mask[0]]))
1065 |                     {
1066 |                         IF_VERBOSE(puts("trying non-group case first.....");)
1067 |                         range_min = 0;
1068 |                         range_max = 0;
1069 |                         _REWIND_DO_SAVE(k);
1070 |                         k += tokens[k].pair_offset; // automatic += 1 will put us past the matching )
1071 |                     }
1072 |                     else
1073 |                     {
1074 |                         range_min = 1;
1075 |                         range_max = 0;
1076 |                         _REWIND_DO_SAVE(k);
1077 |                     }
1078 |                 }
1079 |                 else
1080 |                 {
1081 |                     IF_VERBOSE(printf("rewinded into OPEN. i is %zd, depth is %d\n", i, stack_n);)
1082 |                     just_rewinded = 0;
1083 | 
1084 |                     uint64_t orig_k = k;
1085 | 
1086 |                     IF_VERBOSE(printf("--- trying to try another alternation, start k is %d, rmin is %zu\n", k, range_min);)
1087 | 
1088 |                     if (range_min != 0)
1089 |                     {
1090 |                         IF_VERBOSE(puts("rangemin is not zero. checking...");)
1091 |                         k += range_min;
1092 |                         IF_VERBOSE(printf("start kind: %d\n", tokens[k].kind);)
1093 |                         IF_VERBOSE(printf("before start kind: %d\n", tokens[k-1].kind);)
1094 |                         if (tokens[k-1].kind == REMIMU_KIND_OR)
1095 |                             k += tokens[k-1].pair_offset - 1;
1096 |                         else if (tokens[k-1].kind == REMIMU_KIND_OPEN || tokens[k-1].kind == REMIMU_KIND_NCOPEN)
1097 |                             k += tokens[k-1].mask[15] - 1;
1098 | 
1099 |                         IF_VERBOSE(printf("kamakama %d %d\n", k, tokens[k].kind);)
1100 | 
1101 |                         if (tokens[k].kind == REMIMU_KIND_END) // unbalanced parens
1102 |                             return -3;
1103 | 
1104 |                         IF_VERBOSE(printf("---?!?!   %d, %d\n", k, q_group_state[tokens[k].mask[0]]);)
1105 |                         if (tokens[k].kind == REMIMU_KIND_CLOSE)
1106 |                         {
1107 |                             IF_VERBOSE(puts("!!~!~!~~~~!!~~!~   hit CLOSE. rewinding");)
1108 |                             // do nothing and continue on if we don't need this group
1109 |                             if (tokens[k].count_lo == 0 || q_group_accepts_zero[tokens[k].mask[0]])
1110 |                             {
1111 |                                 IF_VERBOSE(puts("continuing because we don't need this group");)
1112 |                                 q_group_state[tokens[k].mask[0]] = 0;
1113 | 
1114 |                                 if (!(tokens[k].mode & REMIMU_MODE_LAZY))
1115 |                                     q_group_stack[tokens[k].mask[0]] = 0;
1116 | 
1117 |                                 continue;
1118 |                             }
1119 |                             // otherwise go to the last point before the group
1120 |                             else
1121 |                             {
1122 |                                 IF_VERBOSE(puts("going to last point before this group");)
1123 |                                 _REWIND_OR_ABORT();
1124 |                                 continue;
1125 |                             }
1126 |                         }
1127 | 
1128 |                         REMIMU_ASSERT(tokens[k].kind == REMIMU_KIND_OR);
1129 |                     }
1130 | 
1131 |                     IF_VERBOSE(printf("--- FOUND ALTERNATION for paren at k %zd at k %d\n", orig_k, k);)
1132 | 
1133 |                     ptrdiff_t k_diff = k - orig_k;
1134 |                     range_min = k_diff + 1;
1135 | 
1136 |                     IF_VERBOSE(puts("(saving in paren after rewinding and looking for next regex token to check)");)
1137 |                     IF_VERBOSE(printf("%zd\n", range_min);)
1138 |                     _REWIND_DO_SAVE(k - k_diff);
1139 |                 }
1140 |             }
1141 |             else if (tokens[k].kind == REMIMU_KIND_CLOSE)
1142 |             {
1143 |                 // unquantified
1144 |                 if (tokens[k].count_lo == 1 && tokens[k].count_hi == 2)
1145 |                 {
1146 |                     // for captures
1147 |                     uint16_t cap_index = q_group_cap_index[tokens[k].mask[0]];
1148 |                     if (cap_index != 0xFFFF)
1149 |                         _REWIND_DO_SAVE_DUMMY(k);
1150 |                 }
1151 |                 // quantified
1152 |                 else
1153 |                 {
1154 |                     IF_VERBOSE(puts("closer test.....");)
1155 |                     if (!just_rewinded)
1156 |                     {
1157 |                         uint32_t prev = q_group_stack[tokens[k].mask[0]];
1158 | 
1159 |                         IF_VERBOSE(printf("qrqrqrqrqrqrqrq-------      k %d, gs %d, gaz %d, i %zd, tklo %d, rmin %zd, tkhi %d, rmax %zd, prev %d, sn %d\n", k, q_group_state[tokens[k].mask[0]], q_group_accepts_zero[tokens[k].mask[0]], i, tokens[k].count_lo, range_min, tokens[k].count_hi, range_max, prev, stack_n);)
1160 | 
1161 |                         range_max = tokens[k].count_hi;
1162 |                         range_max -= 1;
1163 |                         range_min = q_group_accepts_zero[tokens[k].mask[0]] ? 0 : tokens[k].count_lo;
1164 |                         //REMIMU_ASSERT(q_group_state[tokens[k + tokens[k].pair_offset].mask[0]] <= i);
1165 |                         //if (prev) REMIMU_ASSERT(rewind_stack[prev].i <= i);
1166 |                         IF_VERBOSE(printf("qzqzqzqzqzqzqzq-------      rmin %zd, rmax %zd\n", range_min, range_max);)
1167 | 
1168 |                         // minimum requirement not yet met
1169 |                         if (q_group_state[tokens[k].mask[0]] + 1 < range_min)
1170 |                         {
1171 |                             IF_VERBOSE(puts("continuing minimum matches for a quantified group");)
1172 |                             q_group_state[tokens[k].mask[0]] += 1;
1173 |                             _REWIND_DO_SAVE(k);
1174 | 
1175 |                             k += tokens[k].pair_offset; // back to start of group
1176 |                             k -= 1; // ensure we actually hit the group node next and not the node after it
1177 |                             continue;
1178 |                         }
1179 |                         // maximum allowance exceeded
1180 |                         else if (tokens[k].count_hi != 0 && q_group_state[tokens[k].mask[0]] + 1 > range_max)
1181 |                         {
1182 |                             IF_VERBOSE(printf("hit maximum allowed instances of a quantified group %d %zd\n", q_group_state[tokens[k].mask[0]], range_max);)
1183 |                             range_max -= 1;
1184 |                             _REWIND_OR_ABORT();
1185 |                             continue;
1186 |                         }
1187 | 
1188 |                         // fallback case to detect zero-length matches when we backtracked into the inside of this group
1189 |                         // after an attempted parse of a second copy of itself
1190 |                         uint8_t force_zero = 0;
1191 |                         if (prev != 0 && rewind_stack[prev].i > i)
1192 |                         {
1193 |                             // find matching open paren
1194 |                             size_t n = stack_n - 1;
1195 |                             while (n > 0 && rewind_stack[n].k != k + tokens[k].pair_offset)
1196 |                                 n -= 1;
1197 |                             REMIMU_ASSERT(n > 0);
1198 |                             if (rewind_stack[n].i == i)
1199 |                                 force_zero = 1;
1200 |                         }
1201 | 
1202 |                         // reject zero-length matches
1203 |                         if ((force_zero || (prev != 0 && rewind_stack[prev].i == i))) //  && q_group_state[tokens[k].mask[0]] > 0
1204 |                         {
1205 |                             IF_VERBOSE(printf("rejecting zero-length match..... %d %zd %zd\n", force_zero, rewind_stack[prev].i, i);)
1206 |                             IF_VERBOSE(printf("%d (k: %d)\n", q_group_state[tokens[k].mask[0]], k);)
1207 | 
1208 |                             q_group_accepts_zero[tokens[k].mask[0]] = 1;
1209 |                             _REWIND_OR_ABORT();
1210 |                             //range_max = q_group_state[tokens[k].mask[0]];
1211 |                             //range_min = 0;
1212 |                         }
1213 |                         else if (tokens[k].mode & REMIMU_MODE_LAZY) // lazy
1214 |                         {
1215 |                             IF_VERBOSE(printf("nidnfasidfnidfndifn-------      %d, %d, %zd\n", q_group_state[tokens[k].mask[0]], tokens[k].count_lo, range_min);)
1216 |                             if (prev)
1217 |                                 IF_VERBOSE(printf("lazy doesn't think it's zero-length. prev i %zd vs i %zd (depth %d)\n", rewind_stack[prev].i, i, stack_n);)
1218 |                             // continue on to past the group; group retry is in rewind state
1219 |                             q_group_state[tokens[k].mask[0]] += 1;
1220 |                             _REWIND_DO_SAVE(k);
1221 |                             q_group_state[tokens[k].mask[0]] = 0;
1222 |                         }
1223 |                         else // greedy
1224 |                         {
1225 |                             IF_VERBOSE(puts("wahiwahi");)
1226 |                             // clear unwanted memory if possessive
1227 |                             if ((tokens[k].mode & REMIMU_MODE_POSSESSIVE))
1228 |                             {
1229 |                                 uint32_t k2 = k;
1230 | 
1231 |                                 // special case for first, only rewind to (, not to )
1232 |                                 if (q_group_state[tokens[k].mask[0]] == 0)
1233 |                                     k2 = k + tokens[k].pair_offset;
1234 | 
1235 |                                 if (stack_n == 0)
1236 |                                     return -1;
1237 |                                 stack_n -= 1;
1238 | 
1239 |                                 while (stack_n > 0 && rewind_stack[stack_n].k != k2)
1240 |                                     stack_n -= 1;
1241 | 
1242 |                                 if (stack_n == 0)
1243 |                                     return -1;
1244 |                             }
1245 |                             // continue to next match if sane
1246 |                             if ((uint32_t)q_group_state[tokens[k + tokens[k].pair_offset].mask[0]] < (uint32_t)i)
1247 |                             {
1248 |                                 IF_VERBOSE(puts("REWINDING FROM GREEDY NON-REWIND CLOSER");)
1249 |                                 q_group_state[tokens[k].mask[0]] += 1;
1250 |                                 _REWIND_DO_SAVE(k);
1251 |                                 k += tokens[k].pair_offset; // back to start of group
1252 |                                 k -= 1; // ensure we actually hit the group node next and not the node after it
1253 |                             }
1254 |                             else
1255 |                                 IF_VERBOSE(puts("CONTINUING FROM GREEDY NON-REWIND CLOSER");)
1256 |                         }
1257 |                     }
1258 |                     else
1259 |                     {
1260 |                         IF_VERBOSE(puts("IN CLOSER REWIND!!!");)
1261 |                         just_rewinded = 0;
1262 | 
1263 |                         if (tokens[k].mode & REMIMU_MODE_LAZY)
1264 |                         {
1265 |                             // lazy rewind: need to try matching the group again
1266 |                             _REWIND_DO_SAVE_DUMMY(k);
1267 |                             q_group_stack[tokens[k].mask[0]] = stack_n;
1268 |                             k += tokens[k].pair_offset; // back to start of group
1269 |                             k -= 1; // ensure we actually hit the group node next and not the node after it
1270 |                         }
1271 |                         else
1272 |                         {
1273 |                             // greedy. if we're going to go outside the acceptable range, rewind
1274 |                             IF_VERBOSE(printf("kufukufu %d %zd\n", tokens[k].count_lo, range_min);)
1275 |                             //uint64_t old_i = i;
1276 |                             if (q_group_state[tokens[k].mask[0]] < range_min && !q_group_accepts_zero[tokens[k].mask[0]])
1277 |                             {
1278 |                                 IF_VERBOSE(printf("rewinding from greedy group because we're going to go out of range (%d vs %zd)\n", q_group_state[tokens[k].mask[0]], range_min);)
1279 |                                 //i = old_i;
1280 |                                 _REWIND_OR_ABORT();
1281 |                             }
1282 |                             // otherwise continue on to past the group
1283 |                             else
1284 |                             {
1285 |                                 IF_VERBOSE(puts("continuing past greedy group");)
1286 |                                 q_group_state[tokens[k].mask[0]] = 0;
1287 | 
1288 |                                 // for captures
1289 |                                 uint16_t cap_index = q_group_cap_index[tokens[k].mask[0]];
1290 |                                 if (cap_index != 0xFFFF)
1291 |                                     _REWIND_DO_SAVE_DUMMY(k);
1292 |                             }
1293 |                         }
1294 |                     }
1295 |                 }
1296 |             }
1297 |             else if (tokens[k].kind == REMIMU_KIND_OR)
1298 |             {
1299 |                 IF_VERBOSE(printf("hit OR at %d. adding %d\n", k, tokens[k].pair_offset);)
1300 |                 k += tokens[k].pair_offset;
1301 |                 k -= 1;
1302 |             }
1303 |             else if (tokens[k].kind == REMIMU_KIND_NORMAL)
1304 |             {
1305 |                 if (!just_rewinded)
1306 |                 {
1307 |                     uint64_t n = 0;
1308 |                     // do whatever the obligatory minimum amount of matching is
1309 |                     uint64_t old_i = i;
1310 |                     while (n < tokens[k].count_lo && text[i] != 0 && _REGEX_CHECK_MASK(k, text[i]))
1311 |                     {
1312 |                         i += 1;
1313 |                         n += 1;
1314 |                     }
1315 |                     if (n < tokens[k].count_lo)
1316 |                     {
1317 |                         IF_VERBOSE(printf("non-match A. rewinding (token %d)\n", k);)
1318 |                         i = old_i;
1319 |                         _REWIND_OR_ABORT();
1320 |                         continue;
1321 |                     }
1322 | 
1323 |                     if (tokens[k].mode & REMIMU_MODE_LAZY)
1324 |                     {
1325 |                         range_min = n;
1326 |                         range_max = tokens[k].count_hi - 1;
1327 |                         _REWIND_DO_SAVE(k);
1328 |                     }
1329 |                     else
1330 |                     {
1331 |                         uint64_t ilimit = tokens[k].count_hi;
1332 |                         if (ilimit == 0)
1333 |                             ilimit = ~ilimit;
1334 |                         range_min = n;
1335 |                         while (text[i] != 0 && _REGEX_CHECK_MASK(k, text[i]) && n + 1 < ilimit)
1336 |                         {
1337 |                             IF_VERBOSE(printf("match!! (%c)\n", text[i]);)
1338 |                             i += 1;
1339 |                             n += 1;
1340 |                         }
1341 |                         range_max = n;
1342 |                         IF_VERBOSE(printf("set rmin to %zd and rmax to %zd on entry into normal greedy token with k %d\n", range_min, range_max, k);)
1343 |                         if (!(tokens[k].mode & REMIMU_MODE_POSSESSIVE))
1344 |                             _REWIND_DO_SAVE(k);
1345 |                     }
1346 |                 }
1347 |                 else
1348 |                 {
1349 |                     just_rewinded = 0;
1350 | 
1351 |                     if (tokens[k].mode & REMIMU_MODE_LAZY)
1352 |                     {
1353 |                         uint64_t ilimit = range_max;
1354 |                         if (ilimit == 0)
1355 |                             ilimit = ~ilimit;
1356 | 
1357 |                         if (_REGEX_CHECK_MASK(k, text[i]) && text[i] != 0 && range_min < ilimit)
1358 |                         {
1359 |                             IF_VERBOSE(printf("match2!! (%c) (k: %d)\n", text[i], k);)
1360 |                             i += 1;
1361 |                             range_min += 1;
1362 |                             _REWIND_DO_SAVE(k);
1363 |                         }
1364 |                         else
1365 |                         {
1366 |                             IF_VERBOSE(printf("core rewind lazy (k: %d)\n", k);)
1367 |                             _REWIND_OR_ABORT();
1368 |                         }
1369 |                     }
1370 |                     else
1371 |                     {
1372 |                         //IF_VERBOSE(printf("comparing rmin %zd and rmax %zd token with k %d\n", range_min, range_max, k);)
1373 |                         if (range_max > range_min)
1374 |                         {
1375 |                             IF_VERBOSE(printf("greedy normal going back (k: %d)\n", k);)
1376 |                             i -= 1;
1377 |                             range_max -= 1;
1378 |                             _REWIND_DO_SAVE(k);
1379 |                         }
1380 |                         else
1381 |                         {
1382 |                             IF_VERBOSE(printf("core rewind greedy (k: %d)\n", k);)
1383 |                             _REWIND_OR_ABORT();
1384 |                         }
1385 |                     }
1386 |                 }
1387 |             }
1388 |             else
1389 |             {
1390 |                 fprintf(stderr, "unimplemented token kind %d\n", tokens[k].kind);
1391 |                 REMIMU_ASSERT(0);
1392 |             }
1393 |         }
1394 |         //printf("k... %d\n", k);
1395 |     }
1396 | 
1397 |     if (caps != 0)
1398 |     {
1399 |         //printf("stack_n: %d\n", stack_n);
1400 |         fflush(stdout);
1401 |         for (size_t n = 0; n < stack_n; n++)
1402 |         {
1403 |             RegexMatcherState s = rewind_stack[n];
1404 |             int kind = tokens[s.k].kind;
1405 |             if (kind == REMIMU_KIND_OPEN || kind == REMIMU_KIND_CLOSE)
1406 |             {
1407 |                 uint16_t cap_index = q_group_cap_index[tokens[s.k].mask[0]];
1408 |                 if (cap_index == 0xFFFF)
1409 |                     continue;
1410 |                 if (tokens[s.k].kind == REMIMU_KIND_OPEN)
1411 |                     cap_pos[cap_index] = s.i;
1412 |                 else if (cap_pos[cap_index] >= 0)
1413 |                     cap_span[cap_index] = s.i - cap_pos[cap_index];
1414 |             }
1415 |         }
1416 |         // re-deinitialize capture positions that have no associated capture span
1417 |         for (size_t n = 0; n < caps; n++)
1418 |         {
1419 |             if (cap_span[n] == -1)
1420 |                 cap_pos[n] = -1;
1421 |         }
1422 |     }
1423 | 
1424 |     #undef _REWIND_DO_SAVE
1425 |     #undef _REWIND_OR_ABORT
1426 |     #undef _REGEX_CHECK_IS_W
1427 |     #undef _P_TEXT_HIGHLIGHTED
1428 |     #undef IF_VERBOSE
1429 | 
1430 |     return i;
1431 | }
1432 | 
1433 | REMIMU_FUNC_VISIBILITY void print_regex_tokens(RegexToken * tokens)
1434 | {
1435 |     const char * kind_to_str[] = {
1436 |         "NORMAL",
1437 |         "OPEN",
1438 |         "NCOPEN",
1439 |         "CLOSE",
1440 |         "OR",
1441 |         "CARET",
1442 |         "DOLLAR",
1443 |         "BOUND",
1444 |         "NBOUND",
1445 |         "END",
1446 |     };
1447 |     const char * mode_to_str[] = {
1448 |         "GREEDY",
1449 |         "POSSESS",
1450 |         "LAZY",
1451 |     };
1452 |     for (int k = 0;; k++)
1453 |     {
1454 |         printf("%s\t%s\t", kind_to_str[tokens[k].kind], mode_to_str[tokens[k].mode]);
1455 | 
1456 |         int c_old = -1;
1457 |         for (int c = 0; c < (tokens[k].kind ? 0 : 256); c++)
1458 |         {
1459 |             #define _PRINT_C_SMART(c) { \
1460 |                 if (c >= 0x20 && c <= 0x7E) \
1461 |                     printf("%c", c); \
1462 |                 else \
1463 |                     printf("\\x%02x", c); \
1464 |             }
1465 | 
1466 |             if (_REGEX_CHECK_MASK(k, c))
1467 |             {
1468 |                 if (c_old == -1)
1469 |                     c_old = c;
1470 |             }
1471 |             else if (c_old != -1)
1472 |             {
1473 |                 if (c - 1 == c_old)
1474 |                 {
1475 |                     _PRINT_C_SMART(c_old)
1476 |                     c_old = -1;
1477 |                 }
1478 |                 else if (c - 2 == c_old)
1479 |                 {
1480 |                     _PRINT_C_SMART(c_old)
1481 |                     _PRINT_C_SMART(c_old + 1)
1482 |                     c_old = -1;
1483 |                 }
1484 |                 else
1485 |                 {
1486 |                     _PRINT_C_SMART(c_old)
1487 |                     printf("-");
1488 |                     _PRINT_C_SMART(c - 1)
1489 |                     c_old = -1;
1490 |                 }
1491 |             }
1492 |         }
1493 | 
1494 |         /*
1495 |         printf("\t");
1496 |         for (int i = 0; i < 16; i++)
1497 |             printf("%04x", tokens[k].mask[i]);
1498 |         */
1499 | 
1500 |         printf("\t{%d,%d}\t(%d)\n", tokens[k].count_lo, tokens[k].count_hi - 1, tokens[k].pair_offset);
1501 | 
1502 |         if (tokens[k].kind == REMIMU_KIND_END)
1503 |             break;
1504 |     }
1505 | }
1506 | 
1507 | #undef _REGEX_CHECK_MASK
1508 | 
1509 | #endif //INCLUDE_REMIMU
1510 | 


--------------------------------------------------------------------------------