├── src ├── regex_program.c ├── regex_allocators.c ├── regex_allocators.h ├── regex_character_class.h ├── regex_parse.h ├── regex_compile.h ├── regex_vm_pike.h ├── regex_node.h ├── regex_program.h ├── regex_vm_pike.c ├── regex_parse.c └── regex_compile.c ├── test ├── testdata │ ├── README.md │ ├── LICENSE │ ├── nullsubexpr.dat │ ├── repetition.dat │ └── basic.dat ├── Makefile ├── make-test.rb └── test.c ├── examples ├── regex │ ├── Makefile │ └── regex.c └── regex2dot │ ├── Makefile │ └── regex2dot.c ├── README.md ├── Makefile └── LICENSE /src/regex_program.c: -------------------------------------------------------------------------------- 1 | #include "regex_program.h" 2 | #include "regex_vm_pike.h" 3 | 4 | int regexProgramRun(const RegexProgram *program, const char *string, 5 | const char **matches, int nmatches) { 6 | return regexVmPikeRun(program, string, matches, nmatches); 7 | } 8 | -------------------------------------------------------------------------------- /test/testdata/README.md: -------------------------------------------------------------------------------- 1 | Test data was taken from the Go distribution, which was in turn taken from the 2 | testregex test suite: 3 | 4 | http://www2.research.att.com/~astopen/testregex/testregex.html 5 | 6 | The LICENSE in this directory corresponds to the LICENSE that the data was 7 | released under. 
/* Bitmap with one bit per possible `unsigned char` value. */
typedef char RegexCharacterClass[(UCHAR_MAX + CHAR_BIT - 1) / CHAR_BIT];

/* Test whether `ch` is a member of `klass`.
 *
 * `ch` is reduced to its `unsigned char` value before it is used as an index.
 * Callers commonly pass a plain `char` read from a string; where `char` is
 * signed, bytes >= 0x80 arrive as negative ints and would otherwise index
 * in front of the bitmap.
 *
 * Returns non-zero if the bit for `ch` is set, 0 otherwise. */
static inline int regexCharacterClassContains(const RegexCharacterClass klass,
                                              int ch) {
  const unsigned char byte = (unsigned char)ch;

  return klass[byte / CHAR_BIT] & (1 << byte % CHAR_BIT);
}

/* Add `ch` to `klass`.
 *
 * `ch` is reduced to its `unsigned char` value in the same way as in
 * regexCharacterClassContains(), so both functions agree on which bit a
 * given byte maps to.
 *
 * Returns `ch` exactly as it was passed in. */
static inline int regexCharacterClassAdd(RegexCharacterClass klass, int ch) {
  const unsigned char byte = (unsigned char)ch;

  klass[byte / CHAR_BIT] |= 1 << (byte % CHAR_BIT);
  return ch;
}
-------------------------------------------------------------------------------- 1 | #ifndef REGEX_PARSE_H 2 | #define REGEX_PARSE_H 3 | 4 | #include "regex_node.h" 5 | 6 | /* Parse a pattern. */ 7 | RegexNode *regexParse(const char *pattern); 8 | 9 | /* Parse a pattern (using a previously allocated buffer of at least 10 | * regexParseEstimateNodes(pattern) nodes). */ 11 | RegexNode *regexParseWithNodes(const char *pattern, RegexNode *nodes); 12 | 13 | /* Upper bound of number of nodes required to parse pattern. */ 14 | int regexParseEstimateNodes(const char *pattern); 15 | 16 | /* Free a parsed pattern. */ 17 | void regexParseFree(RegexNode *root); 18 | 19 | #endif // !REGEX_PARSE_H 20 | -------------------------------------------------------------------------------- /examples/regex/Makefile: -------------------------------------------------------------------------------- 1 | PROGRAM := regex 2 | OBJECTS := regex.o 3 | DEPENDS := $(OBJECTS:%.o=%.d) 4 | 5 | CFLAGS += -std=c11 -Wall -pedantic -I../../src 6 | 7 | .PHONY: all 8 | all: CPPFLAGS += -DNDEBUG 9 | all: CFLAGS += -O2 10 | all: $(PROGRAM) 11 | 12 | .PHONY: debug 13 | debug: CPPFLAGS += -DDEBUG 14 | debug: CFLAGS += -g 15 | debug: LDFLAGS += -g 16 | debug: $(PROGRAM) 17 | 18 | .PHONY: clean 19 | clean: 20 | $(RM) $(PROGRAM) $(OBJECTS) $(DEPENDS) 21 | 22 | -include $(DEPENDS) 23 | 24 | $(PROGRAM): $(OBJECTS) ../../libregex.a 25 | $(CC) $(LDFLAGS) -o $@ $^ ../../libregex.a 26 | 27 | ../../libregex.a: 28 | $(MAKE) -C ../.. 
libregex.a 29 | 30 | %.o: %.c 31 | $(CC) $(CPPFLAGS) $(CFLAGS) -MMD -c -o $@ $< 32 | -------------------------------------------------------------------------------- /examples/regex2dot/Makefile: -------------------------------------------------------------------------------- 1 | PROGRAM := regex2dot 2 | OBJECTS := regex2dot.o 3 | DEPENDS := $(OBJECTS:%.o=%.d) 4 | 5 | CFLAGS += -std=c11 -Wall -pedantic -I../../src 6 | 7 | .PHONY: all 8 | all: CPPFLAGS += -DNDEBUG 9 | all: CFLAGS += -O2 10 | all: $(PROGRAM) 11 | 12 | .PHONY: debug 13 | debug: CPPFLAGS += -DDEBUG 14 | debug: CFLAGS += -g 15 | debug: LDFLAGS += -g 16 | debug: $(PROGRAM) 17 | 18 | .PHONY: clean 19 | clean: 20 | $(RM) $(PROGRAM) $(OBJECTS) $(DEPENDS) 21 | 22 | -include $(DEPENDS) 23 | 24 | $(PROGRAM): $(OBJECTS) ../../libregex.a 25 | $(CC) $(LDFLAGS) -o $@ $^ ../../libregex.a 26 | 27 | ../../libregex.a: 28 | $(MAKE) -C ../.. libregex.a 29 | 30 | %.o: %.c 31 | $(CC) $(CPPFLAGS) $(CFLAGS) -MMD -c -o $@ $< 32 | -------------------------------------------------------------------------------- /test/Makefile: -------------------------------------------------------------------------------- 1 | PROGRAM := test 2 | OBJECTS := test.o 3 | 4 | CFLAGS += -std=c11 -Wall -pedantic -I../src 5 | 6 | .PHONY: all 7 | all: CPPFLAGS += -DNDEBUG 8 | all: CFLAGS += -O2 9 | all: $(PROGRAM) $(SUBDIRS) 10 | 11 | .PHONY: debug 12 | debug: CPPFLAGS += -DDEBUG 13 | debug: CFLAGS += -g 14 | debug: LDFLAGS += -g 15 | debug: $(PROGRAM) $(SUBDIRS) 16 | 17 | .PHONY: clean 18 | clean: 19 | $(RM) $(PROGRAM) $(OBJECTS) 20 | 21 | $(PROGRAM): $(OBJECTS) ../libregex.a 22 | $(CC) $(LDFLAGS) -o $@ $^ ../libregex.a 23 | 24 | ../libregex.a: 25 | $(MAKE) -C .. 
libregex.a 26 | 27 | test.c: make-test.rb testdata/basic.dat testdata/nullsubexpr.dat testdata/repetition.dat 28 | ./make-test.rb testdata/basic.dat testdata/nullsubexpr.dat testdata/repetition.dat > $@ 29 | 30 | %.o: %.c 31 | $(CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $< 32 | -------------------------------------------------------------------------------- /src/regex_compile.h: -------------------------------------------------------------------------------- 1 | #ifndef REGEX_COMPILE_H 2 | #define REGEX_COMPILE_H 3 | 4 | #include "regex_node.h" 5 | #include "regex_program.h" 6 | 7 | /* Compile a pattern. */ 8 | RegexProgram *regexCompile(const char *pattern); 9 | 10 | /* Compile a parsed pattern. */ 11 | RegexProgram *regexCompileNode(const RegexNode *root); 12 | 13 | /* Compile a parsed pattern (using a previously allocated program with at least 14 | * regexCompileEstimateInstructions(root) instructions). */ 15 | RegexProgram *regexCompileNodeWithProgram(const RegexNode *root, 16 | RegexProgram *program); 17 | 18 | /* Upper bound of number of instructions required to compile parsed pattern. */ 19 | int regexCompileEstimateInstructions(const RegexNode *root); 20 | 21 | /* Free a compiled program. */ 22 | void regexCompileFree(RegexProgram *program); 23 | 24 | #endif // !REGEX_COMPILE_H 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | tiny-regex 2 | ========== 3 | 4 | tiny-regex is a small implementation of a regular expression matching engine, 5 | written in standard C11. 
It is based on two papers by Russ Cox : 6 | 7 | * ["Regular Expression Matching Can Be Simple And Fast"](https://swtch.com/~rsc/regexp/regexp1.html) 8 | * ["Regular Expression Matching: the Virtual Machine Approach"](https://swtch.com/~rsc/regexp/regexp2.html) 9 | 10 | This library was mainly written for fun and is not intended as a full-featured 11 | general purpose regular expression matching engine. 12 | 13 | Engine features 14 | --------------- 15 | 16 | * `^` and `$` anchors 17 | * `.` match any single character 18 | * `[...]` and `[^...]` character classes 19 | * `?`, `*`, `+`, and `{x,y}` greedy quantifiers 20 | * `??`, `*?`, `+?`, and `{x,y}?` non-greedy quantifiers 21 | * `(...)` capturing groups 22 | -------------------------------------------------------------------------------- /src/regex_vm_pike.h: -------------------------------------------------------------------------------- 1 | #ifndef REGEX_VM_PIKE_H 2 | #define REGEX_VM_PIKE_H 3 | 4 | #include "regex_program.h" 5 | 6 | #define REGEX_VM_PIKE_MAX_MATCHES 20 7 | 8 | typedef struct RegexVmPikeThread { 9 | int visited; 10 | const RegexProgramInstruction *pc; 11 | const char *matches[REGEX_VM_PIKE_MAX_MATCHES]; 12 | } RegexVmPikeThread; 13 | 14 | /* Run program on string. */ 15 | int regexVmPikeRun(const RegexProgram *program, const char *string, 16 | const char **matches, int nmatches); 17 | 18 | /* Run program on string (using a previously allocated buffer of at least 19 | * regexVmPikeEstimateThreads(program) threads). */ 20 | int regexVmPikeRunWithThreads(const RegexProgram *program, const char *string, 21 | const char **matches, int nmatches, RegexVmPikeThread *threads); 22 | 23 | /* Upper bound of number of threads required to run program. 
*/ 24 | int regexVmPikeEstimateThreads(const RegexProgram *program); 25 | 26 | #endif // !REGEX_VM_PIKE_H 27 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | LIBRARY := libregex.a 2 | OBJECTS := src/regex_allocators.o \ 3 | src/regex_compile.o \ 4 | src/regex_parse.o \ 5 | src/regex_program.o \ 6 | src/regex_vm_pike.o 7 | DEPENDS := $(OBJECTS:%.o=%.d) 8 | SUBDIRS := test examples/regex examples/regex2dot 9 | 10 | CFLAGS += -std=c11 -Wall -pedantic 11 | 12 | PREFIX ?= /usr/local 13 | 14 | .PHONY: all 15 | all: CPPFLAGS += -DNDEBUG 16 | all: CFLAGS += -O2 17 | all: $(LIBRARY) $(SUBDIRS) 18 | 19 | .PHONY: debug 20 | debug: CPPFLAGS += -DDEBUG 21 | debug: CFLAGS += -g 22 | debug: LDFLAGS += -g 23 | debug: $(LIBRARY) $(SUBDIRS) 24 | 25 | .PHONY: clean 26 | clean: 27 | for subdir in $(SUBDIRS); do \ 28 | $(MAKE) -C $$subdir clean; \ 29 | done 30 | $(RM) $(LIBRARY) $(OBJECTS) $(DEPENDS) 31 | 32 | -include $(DEPENDS) 33 | 34 | $(LIBRARY): $(OBJECTS) 35 | $(AR) rcs $@ $^ 36 | 37 | $(SUBDIRS): $(LIBRARY) 38 | $(MAKE) -C $@ 39 | 40 | %.o: %.c 41 | $(CC) $(CPPFLAGS) $(CFLAGS) -MMD -c -o $@ $< 42 | 43 | .PHONY: install 44 | install: all 45 | mkdir -p $(PREFIX)/lib 46 | install $(LIBRARY) $(PREFIX)/lib 47 | -------------------------------------------------------------------------------- /test/testdata/LICENSE: -------------------------------------------------------------------------------- 1 | The following license covers testregex.c and all associated test data. 
2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a 4 | copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software 5 | without restriction, including without limitation the rights to use, 6 | copy, modify, merge, publish, distribute, and/or sell copies of the 7 | Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following disclaimer: 9 | 10 | THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED 11 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 12 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 13 | IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 14 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 15 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 16 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 17 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 18 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 19 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. 
We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /src/regex_node.h: -------------------------------------------------------------------------------- 1 | #ifndef REGEX_NODE_H 2 | #define REGEX_NODE_H 3 | 4 | typedef enum RegexNodeType { 5 | REGEX_NODE_TYPE_EPSILON = 0, 6 | /* Characters */ 7 | REGEX_NODE_TYPE_CHARACTER, 8 | REGEX_NODE_TYPE_ANY_CHARACTER, 9 | REGEX_NODE_TYPE_CHARACTER_CLASS, 10 | REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED, 11 | /* Composites */ 12 | REGEX_NODE_TYPE_CONCATENATION, 13 | REGEX_NODE_TYPE_ALTERNATION, 14 | /* Quantifiers */ 15 | REGEX_NODE_TYPE_QUANTIFIER, 16 | /* Anchors */ 17 | REGEX_NODE_TYPE_ANCHOR_BEGIN, 18 | REGEX_NODE_TYPE_ANCHOR_END, 19 | /* Captures */ 20 | REGEX_NODE_TYPE_CAPTURE 21 | } RegexNodeType; 22 | 23 | typedef struct RegexNode RegexNode; 24 | 25 | struct RegexNode { 26 | RegexNodeType type; 27 | union { 28 | /* REGEX_NODE_TYPE_CHARACTER */ 29 | struct { int ch; }; 30 | /* REGEX_NODE_TYPE_CHARACTER_CLASS, 31 | * REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED */ 32 | struct { const char *from, *to; }; 33 | /* REGEX_NODE_TYPE_QUANTIFIER */ 34 | struct { int nmin, nmax, greedy; RegexNode *quantified; }; 35 | /* REGEX_NODE_TYPE_CONCATENATION, 36 | * REGEX_NODE_TYPE_ALTERNATION */ 37 | struct { RegexNode *left, *right; }; 38 | 
/* REGEX_NODE_TYPE_CAPTURE */ 39 | struct { RegexNode *captured; }; 40 | }; 41 | }; 42 | 43 | #endif // !REGEX_NODE_H 44 | -------------------------------------------------------------------------------- /src/regex_program.h: -------------------------------------------------------------------------------- 1 | #ifndef REGEX_PROGRAM_H 2 | #define REGEX_PROGRAM_H 3 | 4 | #include "regex_character_class.h" 5 | 6 | typedef enum RegexProgramOpcode { 7 | REGEX_PROGRAM_OPCODE_MATCH = 0, 8 | /* Characters */ 9 | REGEX_PROGRAM_OPCODE_CHARACTER, 10 | REGEX_PROGRAM_OPCODE_ANY_CHARACTER, 11 | REGEX_PROGRAM_OPCODE_CHARACTER_CLASS, 12 | REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED, 13 | /* Control-flow */ 14 | REGEX_PROGRAM_OPCODE_SPLIT, 15 | REGEX_PROGRAM_OPCODE_JUMP, 16 | /* Assertions */ 17 | REGEX_PROGRAM_OPCODE_ASSERT_BEGIN, 18 | REGEX_PROGRAM_OPCODE_ASSERT_END, 19 | /* Saving */ 20 | REGEX_PROGRAM_OPCODE_SAVE 21 | } RegexProgramOpcode; 22 | 23 | typedef struct RegexProgramInstruction RegexProgramInstruction; 24 | 25 | struct RegexProgramInstruction { 26 | RegexProgramOpcode opcode; 27 | union { 28 | /* REGEX_PROGRAM_OPCODE_CHARACTER */ 29 | struct { int ch; }; 30 | /* REGEX_PROGRAM_OPCODE_CHARACTER_CLASS, 31 | * REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED */ 32 | struct { RegexCharacterClass klass; }; 33 | /* REGEX_PROGRAM_OPCODE_SPLIT */ 34 | struct { RegexProgramInstruction *first, *second; }; 35 | /* REGEX_PROGRAM_OPCODE_JUMP */ 36 | struct { RegexProgramInstruction *target; }; 37 | /* REGEX_PROGRAM_OPCODE_SAVE */ 38 | struct { int save; }; 39 | }; 40 | }; 41 | 42 | typedef struct RegexProgram { 43 | int ninstructions; 44 | RegexProgramInstruction instructions[]; 45 | } RegexProgram; 46 | 47 | /* Run program on string. 
*/ 48 | int regexProgramRun(const RegexProgram *program, const char *string, 49 | const char **matches, int nmatches); 50 | 51 | #endif // !REGEX_PROGRAM_H 52 | -------------------------------------------------------------------------------- /test/testdata/nullsubexpr.dat: -------------------------------------------------------------------------------- 1 | NOTE null subexpression matches : 2002-06-06 2 | 3 | E (a*)* a (0,1)(0,1) 4 | #E SAME x (0,0)(0,0) 5 | E SAME x (0,0)(?,?) RE2/Go 6 | E SAME aaaaaa (0,6)(0,6) 7 | E SAME aaaaaax (0,6)(0,6) 8 | E (a*)+ a (0,1)(0,1) 9 | E SAME x (0,0)(0,0) 10 | E SAME aaaaaa (0,6)(0,6) 11 | E SAME aaaaaax (0,6)(0,6) 12 | E (a+)* a (0,1)(0,1) 13 | E SAME x (0,0) 14 | E SAME aaaaaa (0,6)(0,6) 15 | E SAME aaaaaax (0,6)(0,6) 16 | E (a+)+ a (0,1)(0,1) 17 | E SAME x NOMATCH 18 | E SAME aaaaaa (0,6)(0,6) 19 | E SAME aaaaaax (0,6)(0,6) 20 | 21 | E ([a]*)* a (0,1)(0,1) 22 | #E SAME x (0,0)(0,0) 23 | E SAME x (0,0)(?,?) RE2/Go 24 | E SAME aaaaaa (0,6)(0,6) 25 | E SAME aaaaaax (0,6)(0,6) 26 | E ([a]*)+ a (0,1)(0,1) 27 | E SAME x (0,0)(0,0) 28 | E SAME aaaaaa (0,6)(0,6) 29 | E SAME aaaaaax (0,6)(0,6) 30 | E ([^b]*)* a (0,1)(0,1) 31 | #E SAME b (0,0)(0,0) 32 | E SAME b (0,0)(?,?) RE2/Go 33 | E SAME aaaaaa (0,6)(0,6) 34 | E SAME aaaaaab (0,6)(0,6) 35 | E ([ab]*)* a (0,1)(0,1) 36 | E SAME aaaaaa (0,6)(0,6) 37 | E SAME ababab (0,6)(0,6) 38 | E SAME bababa (0,6)(0,6) 39 | E SAME b (0,1)(0,1) 40 | E SAME bbbbbb (0,6)(0,6) 41 | E SAME aaaabcde (0,5)(0,5) 42 | E ([^a]*)* b (0,1)(0,1) 43 | E SAME bbbbbb (0,6)(0,6) 44 | #E SAME aaaaaa (0,0)(0,0) 45 | E SAME aaaaaa (0,0)(?,?) RE2/Go 46 | E ([^ab]*)* ccccxx (0,6)(0,6) 47 | #E SAME ababab (0,0)(0,0) 48 | E SAME ababab (0,0)(?,?) RE2/Go 49 | 50 | E ((z)+|a)* zabcde (0,2)(1,2) 51 | 52 | #{E a+? aaaaaa (0,1) no *? +? mimimal match ops 53 | #E (a) aaa (0,1)(0,1) 54 | #E (a*?) aaa (0,0)(0,0) 55 | #E (a)*? aaa (0,0) 56 | #E (a*?)*? 
aaa (0,0) 57 | #} 58 | 59 | #B \(a*\)*\(x\) x (0,1)(0,0)(0,1) not supported by tiny-regex 60 | #B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) 61 | #B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) 62 | #B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) 63 | #B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) 64 | #B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) 65 | #B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) 66 | #B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) 67 | 68 | #E (a*)*(x) x (0,1)(0,0)(0,1) 69 | E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go 70 | E (a*)*(x) ax (0,2)(0,1)(1,2) 71 | E (a*)*(x) axa (0,2)(0,1)(1,2) 72 | 73 | E (a*)+(x) x (0,1)(0,0)(0,1) 74 | E (a*)+(x) ax (0,2)(0,1)(1,2) 75 | E (a*)+(x) axa (0,2)(0,1)(1,2) 76 | 77 | E (a*){2}(x) x (0,1)(0,0)(0,1) 78 | E (a*){2}(x) ax (0,2)(1,1)(1,2) 79 | E (a*){2}(x) axa (0,2)(1,1)(1,2) 80 | -------------------------------------------------------------------------------- /examples/regex2dot/regex2dot.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | void printUsage(FILE *file, const char *program) { 8 | fprintf(file, "usage: %s pattern\n", program); 9 | } 10 | 11 | void printNode(FILE *file, const RegexNode *node) { 12 | switch (node->type) { 13 | case REGEX_NODE_TYPE_EPSILON: 14 | fprintf(file, "node%p[label=\"ε\",shape=box,fontname=\"times-italic\"];\n", 15 | (void *)node); 16 | break; 17 | 18 | /* Characters */ 19 | case REGEX_NODE_TYPE_CHARACTER: 20 | fprintf(file, "node%p[label=\"'%c'\",shape=box,fontname=\"courier\"];\n", 21 | (void *)node, node->ch); 22 | break; 23 | case REGEX_NODE_TYPE_ANY_CHARACTER: 24 | fprintf(file, "node%p[label=\"any\",shape=box" 25 | ",fontname=\"times-italic\"];\n", (void *)node); 26 | break; 27 | case REGEX_NODE_TYPE_CHARACTER_CLASS: 28 | fprintf(file, "node%p[label=\"[%.*s]\",shape=box,fontname=\"courier\"];\n", 29 | (void *)node, (int)(node->to - node->from), node->from); 30 | break; 31 | case 
REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED: 32 | fprintf(file, "node%p[label=\"[^%.*s]\",shape=box,fontname=\"courier\"];\n", 33 | (void *)node, (int)(node->to - node->from), node->from); 34 | break; 35 | 36 | /* Composites */ 37 | case REGEX_NODE_TYPE_CONCATENATION: 38 | fprintf(file, "node%p[label=\"concatenation\",shape=box,style=\"rounded\"" 39 | ",fontname=\"times-italic\"];\n", (void *)node); 40 | printNode(file, node->left); 41 | fprintf(file, "node%p->node%p;\n", (void *)node, (void *)node->left); 42 | printNode(file, node->right); 43 | fprintf(file, "node%p->node%p;\n", (void *)node, (void *)node->right); 44 | break; 45 | case REGEX_NODE_TYPE_ALTERNATION: 46 | fprintf(file, "node%p[label=\"alternation\",shape=diamond,style=\"rounded\"" 47 | ",fontname=\"times-italic\"];\n", (void *)node); 48 | printNode(file, node->left); 49 | fprintf(file, "node%p->node%p;\n", (void *)node, (void *)node->left); 50 | printNode(file, node->right); 51 | fprintf(file, "node%p->node%p;\n", (void *)node, (void *)node->right); 52 | break; 53 | 54 | /* Quantifiers */ 55 | case REGEX_NODE_TYPE_QUANTIFIER: 56 | fprintf(file, "node%p[label=\"%d..", (void *)node, node->nmin); 57 | if (node->nmax == -1) 58 | fprintf(file, "inf"); 59 | else 60 | fprintf(file, "%d", node->nmax); 61 | if (node->nmin == 0) 62 | fprintf(file, "\",shape=ellipse,style=\"dotted\"];\n"); 63 | else 64 | fprintf(file, "\",shape=ellipse];\n"); 65 | printNode(file, node->quantified); 66 | fprintf(file, "node%p->node%p;\n", (void *)node, (void *)node->quantified); 67 | break; 68 | 69 | /* Anchors */ 70 | case REGEX_NODE_TYPE_ANCHOR_BEGIN: 71 | fprintf(file, "node%p[label=\"^\",shape=circle];\n", (void *)node); 72 | break; 73 | case REGEX_NODE_TYPE_ANCHOR_END: 74 | fprintf(file, "node%p[label=\"$\",shape=circle];\n", (void *)node); 75 | break; 76 | 77 | /* Captures */ 78 | case REGEX_NODE_TYPE_CAPTURE: 79 | fprintf(file, "node%p[label=\"capture\",shape=parallelogram," 80 | 
"style=\"rounded\",fontname=\"times-italic\"];\n", (void *)node); 81 | printNode(file, node->captured); 82 | fprintf(file, "node%p->node%p;\n", (void *)node, (void *)node->captured); 83 | break; 84 | } 85 | } 86 | 87 | void printDot(FILE *file, const RegexNode *node) { 88 | fprintf(file, "digraph regex {\n"); 89 | printNode(file, node); 90 | fprintf(file, "}\n"); 91 | } 92 | 93 | int main(int argc, char *argv[]) { 94 | RegexNode *node; 95 | 96 | // process command line 97 | if (argc < 2 || argc > 3) { 98 | printUsage(stderr, argv[0]); 99 | return EXIT_FAILURE; 100 | } 101 | 102 | if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { 103 | printUsage(stdout, argv[0]); 104 | return EXIT_SUCCESS; 105 | } 106 | 107 | // parse pattern 108 | if ((node = regexParse(argv[1]))) 109 | printDot(stdout, node); 110 | else { 111 | fprintf(stderr, "%s: regexParse() failed\n", argv[0]); 112 | return EXIT_FAILURE; 113 | } 114 | 115 | regexParseFree(node); 116 | return EXIT_SUCCESS; 117 | } 118 | -------------------------------------------------------------------------------- /test/make-test.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | puts <<-END 4 | /* generated by #{$0}#{ARGV.size > 0 ? ' ' + ARGV.join(' ') : ''} */ 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | #ifdef __GNUC__ 13 | void success(const char *source, const char *format, ...) 14 | __attribute__ ((format(printf, 2, 3))); 15 | void fail(const char *source, const char *format, ...) 16 | __attribute__ ((format(printf, 2, 3))); 17 | #endif 18 | 19 | void success(const char *source, const char *format, ...) { 20 | va_list ap; 21 | va_start(ap, format); 22 | printf("%s [\\x1b[32mSUCCESS\\x1b[0m] ", source); 23 | vprintf(format, ap); 24 | printf("\\n"); 25 | va_end(ap); 26 | } 27 | 28 | void fail(const char *source, const char *format, ...) 
{ 29 | va_list ap; 30 | va_start(ap, format); 31 | printf("%s [\\x1b[31mFAIL \\x1b[0m] ", source); 32 | vprintf(format, ap); 33 | printf("\\n"); 34 | va_end(ap); 35 | } 36 | 37 | int test(const char *source, const char *pattern, const char *string, 38 | int nmatches, ...) { 39 | RegexNode *root; 40 | RegexProgram *program; 41 | const char *matches[20] = {0}; 42 | int result = 0; 43 | va_list ap; 44 | 45 | // parse pattern 46 | if (!(root = regexParse(pattern))) { 47 | fail(source, "regexParse() failed"); 48 | return -1; 49 | } 50 | 51 | // compile parsed pattern 52 | program = regexCompileNode(root); 53 | regexParseFree(root); 54 | if (!program) { 55 | fail(source, "regexCompile() failed"); 56 | return -1; 57 | } 58 | 59 | // run program on string 60 | if ((result = regexProgramRun(program, string, 61 | matches, sizeof (matches) / sizeof (matches[0]))) < 0) { 62 | fail(source, "regexProgramRun() failed"); 63 | regexCompileFree(program); 64 | return -1; 65 | } 66 | 67 | va_start(ap, nmatches); 68 | if (result > 0) { 69 | if (nmatches > 0) { 70 | success(source, "/%s/ =~ \\"%s\\"", pattern, string); 71 | result = 0; 72 | for (int i = 0; i + 1 < nmatches 73 | && i + 1 < sizeof (matches) / sizeof (matches[0]); i += 2) { 74 | int begin = va_arg(ap, int), end = va_arg(ap, int); 75 | if ((begin == -1 || begin == matches[i] - string) 76 | && (end == -1 || end == matches[i + 1] - string)) { 77 | // success(source, "(%d,%d)", begin, end); 78 | } else if (matches[i] && matches[i + 1]) { 79 | fail(source, "expected (%d,%d), got (%d,%d)", begin, end, 80 | (int)(matches[i] - string), (int)(matches[i + 1] - string)); 81 | result = -1; 82 | } else { 83 | fail(source, "expected (%d,%d), got (NULL,NULL)", begin, end); 84 | result = -1; 85 | } 86 | } 87 | } else { 88 | fail(source, "/%s/ =~ \\"%s\\"", pattern, string); 89 | result = -1; 90 | } 91 | } else if (result == 0) { 92 | if (nmatches == 0) 93 | success(source, "/%s/ !~ \\"%s\\"", pattern, string); 94 | else { 95 | 
fail(source, "/%s/ !~ \\"%s\\"", pattern, string); 96 | result = -1; 97 | } 98 | } 99 | 100 | va_end(ap); 101 | regexCompileFree(program); 102 | return result; 103 | } 104 | 105 | int main(int argc, char *argv[]) { 106 | int nerrors = 0; 107 | END 108 | 109 | filename = nil 110 | previous = nil 111 | ntests = 0 112 | 113 | ARGF.each do |line| 114 | if ARGF.filename != filename 115 | filename = ARGF.filename 116 | ARGF.lineno = 1 117 | end 118 | 119 | line = line.sub(/^:[^:]*:/, '') 120 | next unless line =~ /^[{BEASKL]+/ 121 | 122 | options, pattern, string, captures = line.chomp.split(/\t+/) 123 | string = '' if string == 'NULL' 124 | pattern = previous if pattern == 'SAME' 125 | previous = pattern 126 | pattern = pattern.gsub('\\', "\\\\\\\\") unless options.include?('$') 127 | string = string .gsub('\\', "\\\\\\\\") unless options.include?('$') 128 | captures = captures == 'NOMATCH' \ 129 | ? captures 130 | : captures 131 | .scan(/\((.*?),(.*?)\)/) 132 | .flatten 133 | .map {|offset| offset == '?' ? -1 : offset.to_i } 134 | 135 | puts <<-END 136 | nerrors += test("#{ARGF.filename}:#{'%03d' % ARGF.lineno}", "#{pattern}", "#{string}", 137 | #{captures == 'NOMATCH' ? 
0 : "#{captures.size}, #{captures.join(', ')}"}); 138 | END 139 | ntests += 1 140 | end 141 | 142 | puts <<-END 143 | printf("#{ntests} test(s), %d error(s).\\n", -nerrors); 144 | return 0; 145 | } 146 | END 147 | -------------------------------------------------------------------------------- /src/regex_vm_pike.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "regex_allocators.h" 5 | #include "regex_vm_pike.h" 6 | 7 | typedef struct RegexVmPikeContext { 8 | const char *string, *sp; 9 | } RegexVmPikeContext; 10 | 11 | typedef struct RegexVmPikeThreadList { 12 | int nthreads; 13 | RegexVmPikeThread *threads; 14 | } RegexVmPikeThreadList; 15 | 16 | // THREAD MANAGEMENT ////////////////////////////////////////////////////////// 17 | 18 | void regexVmPikeAddThread(RegexVmPikeThreadList *list, 19 | const RegexProgram *program, 20 | const RegexProgramInstruction *pc, 21 | const char *string, const char *sp, 22 | const char **matches, int nmatches) { 23 | 24 | if (list->threads[pc - program->instructions].visited == sp - string + 1) 25 | return; 26 | list->threads[pc - program->instructions].visited = sp - string + 1; 27 | 28 | switch (pc->opcode) { 29 | case REGEX_PROGRAM_OPCODE_MATCH: 30 | // fall-through 31 | 32 | /* Characters */ 33 | case REGEX_PROGRAM_OPCODE_CHARACTER: 34 | case REGEX_PROGRAM_OPCODE_ANY_CHARACTER: 35 | case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS: 36 | case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED: 37 | list->threads[list->nthreads].pc = pc; 38 | memcpy(list->threads[list->nthreads].matches, matches, 39 | sizeof (matches[0]) * ((nmatches <= REGEX_VM_PIKE_MAX_MATCHES) 40 | ? 
nmatches 41 | : REGEX_VM_PIKE_MAX_MATCHES)); 42 | ++list->nthreads; 43 | break; 44 | 45 | /* Control-flow */ 46 | case REGEX_PROGRAM_OPCODE_SPLIT: 47 | regexVmPikeAddThread(list, program, pc->first, 48 | string, sp, matches, nmatches); 49 | regexVmPikeAddThread(list, program, pc->second, 50 | string, sp, matches, nmatches); 51 | break; 52 | case REGEX_PROGRAM_OPCODE_JUMP: 53 | regexVmPikeAddThread(list, program, pc->target, 54 | string, sp, matches, nmatches); 55 | break; 56 | 57 | /* Assertions */ 58 | case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN: 59 | if (sp == string) 60 | regexVmPikeAddThread(list, program, pc + 1, 61 | string, sp, matches, nmatches); 62 | break; 63 | case REGEX_PROGRAM_OPCODE_ASSERT_END: 64 | if (!*sp) 65 | regexVmPikeAddThread(list, program, pc + 1, 66 | string, sp, matches, nmatches); 67 | break; 68 | 69 | /* Saving */ 70 | case REGEX_PROGRAM_OPCODE_SAVE: 71 | if (pc->save < nmatches && pc->save < REGEX_VM_PIKE_MAX_MATCHES) { 72 | const char *saved = matches[pc->save]; 73 | matches[pc->save] = sp; 74 | regexVmPikeAddThread(list, program, pc + 1, 75 | string, sp, matches, nmatches); 76 | matches[pc->save] = saved; 77 | } else { 78 | regexVmPikeAddThread(list, program, pc + 1, 79 | string, sp, matches, nmatches); 80 | } 81 | break; 82 | } 83 | } 84 | 85 | // PUBLIC API ///////////////////////////////////////////////////////////////// 86 | 87 | int regexVmPikeRun(const RegexProgram *program, const char *string, 88 | const char **matches, int nmatches) { 89 | const RegexAllocators *allocators = regexAllocatorsGet(); 90 | size_t size = sizeof (RegexVmPikeThread) 91 | * regexVmPikeEstimateThreads(program); 92 | RegexVmPikeThread *threads; 93 | int matched; 94 | 95 | if (!(threads = allocators->malloc(size))) 96 | return -1; 97 | 98 | matched = regexVmPikeRunWithThreads(program, string, 99 | matches, nmatches, threads); 100 | allocators->free(threads); 101 | return matched; 102 | } 103 | 104 | int regexVmPikeRunWithThreads(const RegexProgram *program, 
const char *string, 105 | const char **matches, int nmatches, RegexVmPikeThread *threads) { 106 | RegexVmPikeThreadList *current = &(RegexVmPikeThreadList){ 107 | .nthreads = 0, .threads = threads}; 108 | RegexVmPikeThreadList *next = &(RegexVmPikeThreadList){ 109 | .nthreads = 0, .threads = threads + program->ninstructions}; 110 | int matched = 0; 111 | 112 | memset(threads, 0, sizeof (RegexVmPikeThread) * program->ninstructions * 2); 113 | 114 | regexVmPikeAddThread(current, program, program->instructions, 115 | string, string, matches, nmatches); 116 | 117 | for (const char *sp = string; ; ++sp) { 118 | for (int i = 0; i < current->nthreads; ++i) { 119 | RegexVmPikeThread *thread = current->threads + i; 120 | switch (thread->pc->opcode) { 121 | case REGEX_PROGRAM_OPCODE_MATCH: 122 | matched = 1; 123 | current->nthreads = 0; 124 | memcpy(matches, thread->matches, 125 | sizeof (matches[0]) * ((nmatches <= REGEX_VM_PIKE_MAX_MATCHES) 126 | ? nmatches 127 | : REGEX_VM_PIKE_MAX_MATCHES)); 128 | continue; 129 | 130 | /* Characters */ 131 | case REGEX_PROGRAM_OPCODE_CHARACTER: 132 | if (*sp == thread->pc->ch) 133 | break; 134 | continue; 135 | case REGEX_PROGRAM_OPCODE_ANY_CHARACTER: 136 | if (*sp) 137 | break; 138 | continue; 139 | case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS: 140 | if (regexCharacterClassContains(thread->pc->klass, *sp)) 141 | break; 142 | continue; 143 | case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED: 144 | if (!regexCharacterClassContains(thread->pc->klass, *sp)) 145 | break; 146 | continue; 147 | 148 | /* Control-flow */ 149 | case REGEX_PROGRAM_OPCODE_SPLIT: 150 | case REGEX_PROGRAM_OPCODE_JUMP: 151 | // fall-through 152 | 153 | /* Assertions */ 154 | case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN: 155 | case REGEX_PROGRAM_OPCODE_ASSERT_END: 156 | // fall-through 157 | 158 | /* Saving */ 159 | case REGEX_PROGRAM_OPCODE_SAVE: 160 | // handled in regexVmPikeAddThread() 161 | abort(); 162 | } 163 | 164 | regexVmPikeAddThread(next, program, thread->pc + 1, 
165 | string, sp + 1, thread->matches, nmatches); 166 | } 167 | 168 | // swap current and next thread list 169 | RegexVmPikeThreadList *swap = current; 170 | current = next; 171 | next = swap; 172 | next->nthreads = 0; 173 | 174 | // done if no more threads are running or end of string reached 175 | if (current->nthreads == 0 || !*sp) 176 | break; 177 | } 178 | 179 | return matched; 180 | } 181 | 182 | int regexVmPikeEstimateThreads(const RegexProgram *program) { 183 | return program->ninstructions * 2; 184 | } 185 | -------------------------------------------------------------------------------- /examples/regex/regex.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "regex_compile.h" 7 | #include "regex_parse.h" 8 | 9 | void printUsage(FILE *file, const char *program) { 10 | fprintf(file, "usage: %s pattern [string...]\n", program); 11 | } 12 | 13 | void printNode(FILE *file, RegexNode *node, int depth) { 14 | switch (node->type) { 15 | case REGEX_NODE_TYPE_EPSILON: 16 | fprintf(file, "epsilon"); 17 | break; 18 | 19 | /* Characters */ 20 | case REGEX_NODE_TYPE_CHARACTER: 21 | fprintf(file, isprint(node->ch) 22 | ? 
"character('%c')" 23 | : "character(%02x)", node->ch); 24 | break; 25 | case REGEX_NODE_TYPE_ANY_CHARACTER: 26 | fprintf(file, "any_character"); 27 | break; 28 | case REGEX_NODE_TYPE_CHARACTER_CLASS: 29 | fprintf(file, "character_class(\"%.*s\")", 30 | (int)(node->to - node->from), node->from); 31 | break; 32 | case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED: 33 | fprintf(file, "character_class_negated(\"%.*s\")", 34 | (int)(node->to - node->from), node->from); 35 | break; 36 | 37 | /* Composites */ 38 | case REGEX_NODE_TYPE_CONCATENATION: 39 | fprintf(file, "concatenation("); 40 | printNode(file, node->left , depth + 1); 41 | fprintf(file, ", "); 42 | printNode(file, node->right, depth + 1); 43 | fprintf(file, ")"); 44 | break; 45 | case REGEX_NODE_TYPE_ALTERNATION: 46 | fprintf(file, "alternation("); 47 | printNode(file, node->left , depth + 1); 48 | fprintf(file, ", "); 49 | printNode(file, node->right, depth + 1); 50 | fprintf(file, ")"); 51 | break; 52 | 53 | /* Quantifiers */ 54 | case REGEX_NODE_TYPE_QUANTIFIER: 55 | fprintf(file, "quantifier("); 56 | printNode(file, node->quantified, depth + 1); 57 | fprintf(file, ", %d, %d, %s)", node->nmin, node->nmax, 58 | node->greedy ? "greedy" : "non_greedy"); 59 | break; 60 | 61 | /* Anchors */ 62 | case REGEX_NODE_TYPE_ANCHOR_BEGIN: 63 | fprintf(file, "anchor_begin"); 64 | break; 65 | case REGEX_NODE_TYPE_ANCHOR_END: 66 | fprintf(file, "anchor_end"); 67 | break; 68 | 69 | /* Captures */ 70 | case REGEX_NODE_TYPE_CAPTURE: 71 | fprintf(file, "capture("); 72 | printNode(file, node->captured, depth + 1); 73 | fprintf(file, ")"); 74 | break; 75 | } 76 | 77 | if (depth == 0) 78 | fprintf(file, "\n"); 79 | } 80 | 81 | void printCharacterClass(FILE *file, 82 | const RegexProgramInstruction *instruction) { 83 | for (int ch = 0, to; ch < UCHAR_MAX; ++ch) { 84 | if (regexCharacterClassContains(instruction->klass, ch)) { 85 | fprintf(file, isprint(ch) ? 
"%c" : "%02x", ch); 86 | for (to = ch + 1; 87 | regexCharacterClassContains(instruction->klass, to); ++to); 88 | if (to > ch + 2) { 89 | fprintf(file, isprint(to) ? "-%c" : "-%02x", to - 1); 90 | ch = to; 91 | } 92 | } 93 | } 94 | } 95 | 96 | void printInstruction(FILE *file, const RegexProgram *program, 97 | const RegexProgramInstruction *instruction) { 98 | fprintf(file, "[%04x] ", (int)(instruction - program->instructions)); 99 | 100 | switch(instruction->opcode) { 101 | case REGEX_PROGRAM_OPCODE_MATCH: 102 | fprintf(file, "MATCH\n"); 103 | break; 104 | 105 | /* Characters */ 106 | case REGEX_PROGRAM_OPCODE_CHARACTER: 107 | if (isprint(instruction->ch)) 108 | fprintf(file, "CHAR %c\n", instruction->ch); 109 | else 110 | fprintf(file, "CHAR %02x\n", instruction->ch); 111 | break; 112 | case REGEX_PROGRAM_OPCODE_ANY_CHARACTER: 113 | fprintf(file, "ANY_CHAR\n"); 114 | break; 115 | case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS: 116 | fprintf(file, "CHARACTER_CLASS ["); 117 | printCharacterClass(file, instruction); 118 | fprintf(file, "]\n"); 119 | break; 120 | case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED: 121 | fprintf(file, "CHARACTER_CLASS_NEGATED [^"); 122 | printCharacterClass(file, instruction); 123 | fprintf(file, "]\n"); 124 | break; 125 | 126 | /* Control-flow */ 127 | case REGEX_PROGRAM_OPCODE_JUMP: 128 | fprintf(file, "JUMP %04x\n", 129 | (int)(instruction->target - program->instructions)); 130 | break; 131 | case REGEX_PROGRAM_OPCODE_SPLIT: 132 | fprintf(file, "SPLIT %04x %04x\n", 133 | (int)(instruction->first - program->instructions), 134 | (int)(instruction->second - program->instructions)); 135 | break; 136 | 137 | /* Assertions */ 138 | case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN: 139 | fprintf(file, "ASSERT_BEGIN\n"); 140 | break; 141 | case REGEX_PROGRAM_OPCODE_ASSERT_END: 142 | fprintf(file, "ASSERT_END\n"); 143 | break; 144 | 145 | /* Saving */ 146 | case REGEX_PROGRAM_OPCODE_SAVE: 147 | fprintf(file, "SAVE %d\n", instruction->save); 148 | break; 149 
| } 150 | } 151 | 152 | void printProgram(FILE *file, const RegexProgram *program) { 153 | for (int i = 0; i < program->ninstructions; ++i) 154 | printInstruction(file, program, program->instructions + i); 155 | } 156 | 157 | int main(int argc, char *argv[]) { 158 | RegexNode *node; 159 | RegexProgram *program; 160 | 161 | // process command line 162 | if (argc < 2) { 163 | printUsage(stderr, argv[0]); 164 | return EXIT_FAILURE; 165 | } 166 | 167 | if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { 168 | printUsage(stdout, argv[0]); 169 | return EXIT_SUCCESS; 170 | } 171 | 172 | // parse pattern 173 | if ((node = regexParse(argv[1]))) 174 | printNode(stdout, node, 0); 175 | else { 176 | fprintf(stderr, "%s: regexParse() failed\n", argv[0]); 177 | return EXIT_FAILURE; 178 | } 179 | 180 | // compile parsed pattern 181 | program = regexCompileNode(node); 182 | regexParseFree(node); 183 | if (program) 184 | printProgram(stdout, program); 185 | else { 186 | fprintf(stderr, "%s: regexCompileNode() failed\n", argv[0]); 187 | return EXIT_FAILURE; 188 | } 189 | 190 | // run program on string(s) 191 | for (int i = 2; i < argc; ++i) { 192 | const char *matches[20] = {0}; 193 | int nmatches = 0; 194 | 195 | if (regexProgramRun(program, argv[i], matches, 20) > 0) { 196 | for (int j = 0; j < sizeof (matches) / sizeof (matches[0]); ++j) 197 | if (matches[j]) 198 | nmatches = j; 199 | 200 | printf("\"%s\": ", argv[i]); 201 | 202 | for (int j = 0; j <= nmatches; j += 2) { 203 | if (j > 0) 204 | printf(", "); 205 | if (matches[j] && matches[j + 1]) { 206 | printf("\"%.*s\"(%d,%d)", 207 | (int)(matches[j + 1] - matches[j]), matches[j], 208 | (int)(matches[j] - argv[i]), (int)(matches[j + 1] - argv[i])); 209 | } else { 210 | printf("(NULL,NULL)"); 211 | } 212 | } 213 | 214 | printf("\n"); 215 | } else { 216 | printf("\"%s\": no match\n", argv[i]); 217 | } 218 | } 219 | 220 | regexCompileFree(program); 221 | return EXIT_SUCCESS; 222 | } 223 | 
-------------------------------------------------------------------------------- /test/testdata/repetition.dat: -------------------------------------------------------------------------------- 1 | NOTE implicit vs. explicit repetitions : 2009-02-02 2 | 3 | # Glenn Fowler 4 | # conforming matches (column 4) must match one of the following BREs 5 | # NOMATCH 6 | # (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* 7 | # (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* 8 | # i.e., each 3-tuple has two identical elements and one (?,?) 9 | 10 | E ((..)|(.)) NULL NOMATCH 11 | E ((..)|(.))((..)|(.)) NULL NOMATCH 12 | E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH 13 | 14 | E ((..)|(.)){1} NULL NOMATCH 15 | E ((..)|(.)){2} NULL NOMATCH 16 | E ((..)|(.)){3} NULL NOMATCH 17 | 18 | E ((..)|(.))* NULL (0,0) 19 | 20 | E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) 21 | E ((..)|(.))((..)|(.)) a NOMATCH 22 | E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH 23 | 24 | E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) 25 | E ((..)|(.)){2} a NOMATCH 26 | E ((..)|(.)){3} a NOMATCH 27 | 28 | E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) 29 | 30 | E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) 31 | E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) 32 | E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH 33 | 34 | E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) 35 | E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) 36 | E ((..)|(.)){3} aa NOMATCH 37 | 38 | E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) 39 | 40 | E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) 41 | E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) 42 | E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) 43 | 44 | E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) 45 | #E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) 46 | E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go 47 | E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) 48 | 49 | #E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) 50 | E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go 51 | 52 | E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) 
53 | E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 54 | E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) 55 | 56 | E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) 57 | E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) 58 | #E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) 59 | E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go 60 | 61 | E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) 62 | 63 | E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) 64 | E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 65 | E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) 66 | 67 | E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) 68 | E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) 69 | #E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) 70 | E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go 71 | 72 | #E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) 73 | E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go 74 | 75 | E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) 76 | E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 77 | E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) 78 | 79 | E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) 80 | E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) 81 | E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) 82 | 83 | E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) 84 | 85 | NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 86 | 87 | # These test a bug in OS X / FreeBSD / NetBSD, and libtree. 88 | # Linux/GLIBC gets the {8,} and {8,8} wrong. 
89 | 90 | :HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) 91 | :HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) 92 | :HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) 93 | :HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) 94 | :HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) 95 | :HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) 96 | :HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) 97 | :HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) 98 | :HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) 99 | #:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) 100 | :HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go 101 | #:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) 102 | :HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go 103 | #:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) 104 | :HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go 105 | #:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) 106 | :HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go 107 | #:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) 108 | :HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go 109 | #:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) 110 | :HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go 111 | #:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) 112 | :HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go 113 | #:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) 114 | :HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go 115 | :HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) 116 | 117 | # These test a fixed bug in my regex-tdfa that did not keep the expanded 118 | # form properly grouped, so right association did the wrong thing with 119 | # these ambiguous patterns (crafted just to test my code when I became 120 | # suspicious of my implementation). The first subexpression should use 121 | # "ab" then "a" then "bcd". 122 | 123 | # OS X / FreeBSD / NetBSD badly fail many of these, with impossible 124 | # results like (0,6)(4,5)(6,6). 
125 | 126 | #:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) 127 | :HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) tiny-regex 128 | #:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) 129 | :HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) tiny-regex 130 | :HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) 131 | :HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) 132 | :HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH 133 | #:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) 134 | :HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) tiny-regex 135 | #:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) 136 | :HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) tiny-regex 137 | :HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) 138 | :HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) 139 | :HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH 140 | #:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) 141 | :HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) tiny-regex 142 | #:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) 143 | :HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) tiny-regex 144 | 145 | # The above worked on Linux/GLIBC but the following often fail. 
146 | # They also trip up OS X / FreeBSD / NetBSD: 147 | 148 | #:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) 149 | :HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 150 | #:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) 151 | :HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 152 | #:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) 153 | :HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 154 | #:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) 155 | :HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 156 | :HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH 157 | #:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) 158 | :HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 159 | #:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) 160 | :HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 161 | #:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) 162 | :HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 163 | #:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) 164 | :HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 165 | :HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH 166 | #:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) 167 | :HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 168 | #:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) 169 | :HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 170 | -------------------------------------------------------------------------------- /src/regex_parse.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "regex_allocators.h" 5 | #include "regex_parse.h" 6 | 7 | typedef struct RegexParseContext { 8 | const char *sp; 9 | RegexNode *stack, *output; 10 | } RegexParseContext; 11 | 12 | // SHUNTING YARD STACK HANDLING /////////////////////////////////////////////// 13 | 14 | static inline RegexNode 
*regexParsePush(RegexParseContext *context, 15 | const RegexNode *node) { 16 | assert(context->stack <= context->output); 17 | *context->stack = *node; 18 | return context->stack++; 19 | } 20 | 21 | static inline RegexNode *regexParseDrop(RegexParseContext *context) { 22 | return --context->stack; 23 | } 24 | 25 | static inline RegexNode *regexParseConsume(RegexParseContext *context) { 26 | *--context->output = *--context->stack; 27 | return context->output; 28 | } 29 | 30 | static inline RegexNode *regexParseConcatenate(RegexParseContext *context, 31 | const RegexNode *bottom) { 32 | if (context->stack == bottom) 33 | regexParsePush(context, &(RegexNode){.type = REGEX_NODE_TYPE_EPSILON}); 34 | else { 35 | while (context->stack - 1 > bottom) { 36 | RegexNode *right = regexParseConsume(context); 37 | RegexNode *left = regexParseConsume(context); 38 | regexParsePush(context, &(RegexNode){ 39 | .type = REGEX_NODE_TYPE_CONCATENATION, 40 | .left = left, 41 | .right = right}); 42 | } 43 | } 44 | return context->stack - 1; 45 | } 46 | 47 | // PARSING //////////////////////////////////////////////////////////////////// 48 | 49 | RegexNode *regexParseCharacterClass(RegexParseContext *context) { 50 | RegexNodeType type = (*context->sp == '^') 51 | ? 
(++context->sp, REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED) 52 | : REGEX_NODE_TYPE_CHARACTER_CLASS; 53 | const char *from = context->sp; 54 | 55 | for (;;) { 56 | int ch = *context->sp++; 57 | switch (ch) { 58 | case '\0': 59 | // premature end of character class 60 | return NULL; 61 | case ']': 62 | if (context->sp - 1 == from) 63 | goto CHARACTER; 64 | return regexParsePush(context, &(RegexNode){.type = type, 65 | .from = from, .to = context->sp - 1}); 66 | case '\\': 67 | ch = *context->sp++; 68 | // fall-through 69 | default: 70 | CHARACTER: 71 | if (*context->sp == '-' && context->sp[1] != ']') { 72 | if (context->sp[1] < ch) 73 | // empty range in character class 74 | return NULL; 75 | context->sp += 2; 76 | } 77 | break; 78 | } 79 | } 80 | } 81 | 82 | RegexNode *regexParseInterval(RegexParseContext *context) { 83 | const char *from = context->sp; 84 | int nmin, nmax; 85 | 86 | for (nmin = 0; *context->sp >= '0' && *context->sp <= '9'; ++context->sp) 87 | nmin = (nmin * 10) + (*context->sp - '0'); 88 | 89 | if (*context->sp == ',') { 90 | ++context->sp; 91 | if (*from != ',' && *context->sp == '}') 92 | nmax = -1; 93 | else { 94 | for (nmax = 0; *context->sp >= '0' && *context->sp <= '9'; ++context->sp) 95 | nmax = (nmax * 10) + (*context->sp - '0'); 96 | if (*(context->sp - 1) == ',' || *context->sp != '}' || nmax < nmin) { 97 | context->sp = from; 98 | return NULL; 99 | } 100 | } 101 | } else if (*from != '}' && *context->sp == '}') { 102 | nmax = nmin; 103 | } else { 104 | context->sp = from; 105 | return NULL; 106 | } 107 | 108 | ++context->sp; 109 | return regexParsePush(context, &(RegexNode){ 110 | .type = REGEX_NODE_TYPE_QUANTIFIER, 111 | .nmin = nmin, 112 | .nmax = nmax, 113 | .greedy = (*context->sp == '?') ? 
(++context->sp, 0) : 1, 114 | .quantified = regexParseConsume(context)}); 115 | } 116 | 117 | RegexNode *regexParseContext(RegexParseContext *context, int depth) { 118 | RegexNode *bottom = context->stack; 119 | 120 | for (;;) { 121 | RegexNode *left, *right; 122 | int ch = *context->sp++; 123 | switch (ch) { 124 | /* Characters */ 125 | case '\\': 126 | ch = *context->sp++; 127 | // fall-through 128 | default: 129 | CHARACTER: 130 | regexParsePush(context, &(RegexNode){.type = REGEX_NODE_TYPE_CHARACTER, 131 | .ch = ch}); 132 | break; 133 | case '.': 134 | regexParsePush(context, &(RegexNode){ 135 | .type = REGEX_NODE_TYPE_ANY_CHARACTER}); 136 | break; 137 | case '[': 138 | if (!regexParseCharacterClass(context)) 139 | return NULL; 140 | break; 141 | 142 | /* Composites */ 143 | case '|': 144 | left = regexParseConcatenate(context, bottom); 145 | if (!(right = regexParseContext(context, depth))) 146 | return NULL; 147 | if (left->type == REGEX_NODE_TYPE_EPSILON && right->type == left->type) { 148 | regexParseDrop(context); 149 | } else if (left->type == REGEX_NODE_TYPE_EPSILON) { 150 | right = regexParseConsume(context); 151 | regexParseDrop(context); 152 | regexParsePush(context, &(RegexNode){ 153 | .type = REGEX_NODE_TYPE_QUANTIFIER, 154 | .nmin = 0, 155 | .nmax = 1, 156 | .greedy = 1, 157 | .quantified = right}); 158 | } else if (right->type == REGEX_NODE_TYPE_EPSILON) { 159 | regexParseDrop(context); 160 | left = regexParseConsume(context); 161 | regexParsePush(context, &(RegexNode){ 162 | .type = REGEX_NODE_TYPE_QUANTIFIER, 163 | .nmin = 0, 164 | .nmax = 1, 165 | .greedy = 1, 166 | .quantified = left}); 167 | } else { 168 | right = regexParseConsume(context); 169 | left = regexParseConsume(context); 170 | regexParsePush(context, &(RegexNode){ 171 | .type = REGEX_NODE_TYPE_ALTERNATION, 172 | .left = left, 173 | .right = right}); 174 | } 175 | return bottom; 176 | 177 | /* Quantifiers */ 178 | #define QUANTIFIER(ch, min, max) \ 179 | case ch: \ 180 | if 
(context->stack == bottom) \ 181 | goto CHARACTER; \ 182 | regexParsePush(context, &(RegexNode){ \ 183 | .type = REGEX_NODE_TYPE_QUANTIFIER, \ 184 | .nmin = min, \ 185 | .nmax = max, \ 186 | .greedy = (*context->sp == '?') ? (++context->sp, 0) : 1, \ 187 | .quantified = regexParseConsume(context)}); \ 188 | break 189 | QUANTIFIER('?', 0, 1); 190 | QUANTIFIER('*', 0, -1); 191 | QUANTIFIER('+', 1, -1); 192 | #undef QUANTIFIER 193 | case '{': 194 | if ((context->stack == bottom) || !regexParseInterval(context)) 195 | goto CHARACTER; 196 | break; 197 | 198 | /* Anchors */ 199 | case '^': 200 | regexParsePush(context, &(RegexNode){ 201 | .type = REGEX_NODE_TYPE_ANCHOR_BEGIN}); 202 | break; 203 | case '$': 204 | regexParsePush(context, &(RegexNode){ 205 | .type = REGEX_NODE_TYPE_ANCHOR_END}); 206 | break; 207 | 208 | /* Captures */ 209 | case '(': 210 | if (!regexParseContext(context, depth + 1)) 211 | return NULL; 212 | regexParsePush(context, &(RegexNode){.type = REGEX_NODE_TYPE_CAPTURE, 213 | .captured = regexParseConsume(context)}); 214 | break; 215 | case ')': 216 | if (depth > 0) 217 | return regexParseConcatenate(context, bottom); 218 | else 219 | // unmatched close parenthesis 220 | return NULL; 221 | 222 | /* End of string */ 223 | case '\0': 224 | if (depth == 0) 225 | return regexParseConcatenate(context, bottom); 226 | else 227 | // unmatched open parenthesis 228 | return NULL; 229 | } 230 | } 231 | } 232 | 233 | // PUBLIC API ///////////////////////////////////////////////////////////////// 234 | 235 | RegexNode *regexParse(const char *pattern) { 236 | const RegexAllocators *allocators = regexAllocatorsGet(); 237 | size_t size = sizeof (RegexNode) * regexParseEstimateNodes(pattern); 238 | RegexNode *nodes; 239 | 240 | if (!(nodes = allocators->malloc(size))) 241 | return NULL; 242 | 243 | if (!regexParseWithNodes(pattern, nodes)) { 244 | allocators->free(nodes); 245 | return NULL; 246 | } 247 | 248 | return nodes; 249 | } 250 | 251 | RegexNode 
*regexParseWithNodes(const char *pattern, RegexNode *nodes) { 252 | RegexParseContext *context = &(RegexParseContext){.sp = pattern, 253 | .stack = nodes, .output = nodes + regexParseEstimateNodes(pattern)}; 254 | return regexParseContext(context, 0); 255 | } 256 | 257 | int regexParseEstimateNodes(const char *pattern) { 258 | return strlen(pattern) * 2; 259 | } 260 | 261 | void regexParseFree(RegexNode *root) { 262 | const RegexAllocators *allocators = regexAllocatorsGet(); 263 | allocators->free(root); 264 | } 265 | -------------------------------------------------------------------------------- /test/testdata/basic.dat: -------------------------------------------------------------------------------- 1 | NOTE all standard compliant implementations should pass these : 2002-05-31 2 | 3 | BE abracadabra$ abracadabracadabra (7,18) 4 | BE a...b abababbb (2,7) 5 | BE XXXXXX ..XXXXXX (2,8) 6 | E \) () (1,2) 7 | BE a] a]a (0,2) 8 | B } } (0,1) 9 | E \} } (0,1) 10 | BE \] ] (0,1) 11 | B ] ] (0,1) 12 | E ] ] (0,1) 13 | B { { (0,1) 14 | B } } (0,1) 15 | BE ^a ax (0,1) 16 | BE \^a a^a (1,3) 17 | BE a\^ a^ (0,2) 18 | BE a$ aa (1,2) 19 | BE a\$ a$ (0,2) 20 | BE ^$ NULL (0,0) 21 | E $^ NULL (0,0) 22 | E a($) aa (1,2)(2,2) 23 | E a*(^a) aa (0,1)(0,1) 24 | E (..)*(...)* a (0,0) 25 | E (..)*(...)* abcd (0,4)(2,4) 26 | E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) 27 | E (ab)c|abc abc (0,3)(0,2) 28 | E a{0}b ab (1,2) 29 | E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) 30 | E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) 31 | #E a{9876543210} NULL BADBR not supported by tiny-regex 32 | E ((a|a)|a) a (0,1)(0,1)(0,1) 33 | E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) 34 | E a*(a.|aa) aaaa (0,4)(2,4) 35 | E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) 36 | E (a|b)?.* b (0,1)(0,1) 37 | E (a|b)c|a(b|c) ac (0,2)(0,1) 38 | E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) 39 | E (a|b)*c|(a|ab)*c abc (0,3)(1,2) 40 | E (a|b)*c|(a|ab)*c xc (1,2) 41 | E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) 42 | E a?(ab|ba)ab abab 
(0,4)(0,2) 43 | E a?(ac{0}b|ba)ab abab (0,4)(0,2) 44 | E ab|abab abbabab (0,2) 45 | E aba|bab|bba baaabbbaba (5,8) 46 | E aba|bab baaabbbaba (6,9) 47 | E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) 48 | E (a.|.a.)*|(a|.a...) aa (0,2)(0,2) 49 | E ab|a xabc (1,3) 50 | E ab|a xxabc (2,4) 51 | #Ei (Ab|cD)* aBcD (0,4)(2,4) not supported by tiny-regex 52 | BE [^-] --a (2,3) 53 | BE [a-]* --a (0,3) 54 | BE [a-m-]* --amoma-- (0,4) 55 | E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) 56 | E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) 57 | #{E [[:upper:]] A (0,1) [[]] not supported 58 | #E [[:lower:]]+ `az{ (1,3) not supported by tiny-regex 59 | #E [[:upper:]]+ @AZ[ (1,3) not supported by tiny-regex 60 | # No collation in Go 61 | #BE [[-]] [[-]] (2,4) 62 | #BE [[.NIL.]] NULL ECOLLATE 63 | #BE [[=aleph=]] NULL ECOLLATE 64 | } 65 | BE$ \n \n (0,1) 66 | #BEn$ \n \n (0,1) not supported by tiny-regex 67 | BE$ [^a] \n (0,1) 68 | BE$ \na \na (0,2) 69 | E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) 70 | BE xxx xxx (0,3) 71 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) 72 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) 73 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) 74 | E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) 75 | E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) 76 | E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) 77 | E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) 78 | E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) 79 | E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) 80 | BE$ .* \x01\xff (0,2) 81 | E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) 82 | L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll 
XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH 83 | E a*a*a*a*a*b aaaaaaaaab (0,10) 84 | BE ^ NULL (0,0) 85 | BE $ NULL (0,0) 86 | BE ^$ NULL (0,0) 87 | BE ^a$ a (0,1) 88 | BE abc abc (0,3) 89 | BE abc xabcy (1,4) 90 | BE abc ababc (2,5) 91 | BE ab*c abc (0,3) 92 | BE ab*bc abc (0,3) 93 | BE ab*bc abbc (0,4) 94 | BE ab*bc abbbbc (0,6) 95 | E ab+bc abbc (0,4) 96 | E ab+bc abbbbc (0,6) 97 | E ab?bc abbc (0,4) 98 | E ab?bc abc (0,3) 99 | E ab?c abc (0,3) 100 | BE ^abc$ abc (0,3) 101 | BE ^abc abcc (0,3) 102 | BE abc$ aabc (1,4) 103 | BE ^ abc (0,0) 104 | BE $ abc (3,3) 105 | BE a.c abc (0,3) 106 | BE a.c axc (0,3) 107 | BE a.*c axyzc (0,5) 108 | BE a[bc]d abd (0,3) 109 | BE a[b-d]e ace (0,3) 110 | BE a[b-d] aac (1,3) 111 | BE a[-b] a- (0,2) 112 | BE a[b-] a- (0,2) 113 | BE a] a] (0,2) 114 | BE a[]]b a]b (0,3) 115 | BE a[^bc]d aed (0,3) 116 | BE a[^-b]c adc (0,3) 117 | BE a[^]b]c adc (0,3) 118 | E ab|cd abc (0,2) 119 | E ab|cd abcd (0,2) 120 | E a\(b a(b (0,3) 121 | E a\(*b ab (0,2) 122 | E a\(*b a((b (0,4) 123 | E ((a)) abc (0,1)(0,1)(0,1) 124 | E (a)b(c) abc (0,3)(0,1)(2,3) 125 | E a+b+c aabbabc (4,7) 126 | E a* aaa (0,3) 127 | #E (a*)* - (0,0)(0,0) 128 | E (a*)* - (0,0)(?,?) RE2/Go 129 | E (a*)+ - (0,0)(0,0) 130 | #E (a*|b)* - (0,0)(0,0) 131 | E (a*|b)* - (0,0)(?,?) RE2/Go 132 | E (a+|b)* ab (0,2)(1,2) 133 | E (a+|b)+ ab (0,2)(1,2) 134 | E (a+|b)? ab (0,1)(0,1) 135 | BE [^ab]* cde (0,3) 136 | #E (^)* - (0,0)(0,0) 137 | E (^)* - (0,0)(?,?) RE2/Go 138 | BE a* NULL (0,0) 139 | E ([abc])*d abbbcd (0,6)(4,5) 140 | E ([abc])*bcd abcd (0,4)(0,1) 141 | E a|b|c|d|e e (0,1) 142 | E (a|b|c|d|e)f ef (0,2)(0,1) 143 | #E ((a*|b))* - (0,0)(0,0)(0,0) 144 | E ((a*|b))* - (0,0)(?,?)(?,?) 
RE2/Go 145 | BE abcd*efg abcdefg (0,7) 146 | BE ab* xabyabbbz (1,3) 147 | BE ab* xayabbbz (1,2) 148 | E (ab|cd)e abcde (2,5)(2,4) 149 | BE [abhgefdc]ij hij (0,3) 150 | E (a|b)c*d abcd (1,4)(1,2) 151 | E (ab|ab*)bc abc (0,3)(0,1) 152 | E a([bc]*)c* abc (0,3)(1,3) 153 | E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) 154 | E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) 155 | E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) 156 | E a[bcd]*dcdcde adcdcde (0,7) 157 | E (ab|a)b*c abc (0,3)(0,2) 158 | E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) 159 | BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) 160 | E ^a(bc+|b[eh])g|.h$ abh (1,3) 161 | E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) 162 | E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) 163 | E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) 164 | E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) 165 | BE multiple words multiple words yeah (0,14) 166 | E (.*)c(.*) abcde (0,5)(0,2)(3,5) 167 | BE abcd abcd (0,4) 168 | E a(bc)d abcd (0,4)(1,3) 169 | E a[-]?c ac (0,3) 170 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) 171 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) 172 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) 173 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) 174 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) 175 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) 176 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) 177 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) 178 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) 179 | E M[ou]'?am+[ae]r 
.*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) 180 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) 181 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) 182 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) 183 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) 184 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) 185 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) 186 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) 187 | E a+(b|c)*d+ aabcdd (0,6)(3,4) 188 | E ^.+$ vivi (0,4) 189 | E ^(.+)$ vivi (0,4)(0,4) 190 | E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) 191 | E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) 192 | E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) 193 | E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) 194 | E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) 195 | E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) 196 | E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) 197 | E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) 198 | E ((foo)|bar)!bas bar!bas (0,7)(0,3) 199 | E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) 200 | E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) 201 | E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) 202 | E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) 203 | E (foo|(bar))!bas foo!bas (0,7)(0,3) 204 | E (foo|bar)!bas bar!bas (0,7)(0,3) 205 | E (foo|bar)!bas foo!bar!bas (4,11)(4,7) 206 | E (foo|bar)!bas foo!bas (0,7)(0,3) 207 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) 208 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) 209 | E 
^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) 210 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) 211 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) 212 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) 213 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) 214 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) 215 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) 216 | E .*(/XXX).* /XXX (0,4)(0,4) 217 | E .*(\\XXX).* \XXX (0,4)(0,4) 218 | E \\XXX \XXX (0,4) 219 | E .*(/000).* /000 (0,4)(0,4) 220 | E .*(\\000).* \000 (0,4)(0,4) 221 | E \\000 \000 (0,4) 222 | -------------------------------------------------------------------------------- /src/regex_compile.c: --------------------------------------------------------------------------------
#include "regex_allocators.h"
#include "regex_compile.h"
#include "regex_parse.h"

/* State threaded through a single compilation pass. */
typedef struct RegexCompileContext {
  RegexProgramInstruction *pc; /* next instruction slot to be written */
  int ncaptures;               /* capture groups numbered so far */
} RegexCompileContext;

// COMPILATION ////////////////////////////////////////////////////////////////

/*
 * Count the instructions regexCompileContext() emits for the subtree rooted
 * at `node`.
 *
 * For quantifiers, nmax == -1 is the "unbounded" marker used throughout this
 * file: a bounded {nmin,nmax} costs nmin copies of the body plus
 * (nmax - nmin) optional copies that each carry one extra SPLIT; an unbounded
 * quantifier costs one SPLIT plus either nmin copies (nmin > 0) or a single
 * copy followed by a JUMP (nmin == 0).
 *
 * Returns 0 for a node type this function does not know about.
 */
int regexCompileCountInstructions(const RegexNode *node) {
  int ninstructions;

  switch (node->type) {
  case REGEX_NODE_TYPE_EPSILON:
    return 0;

  /* Characters */
  case REGEX_NODE_TYPE_CHARACTER:
  case REGEX_NODE_TYPE_ANY_CHARACTER:
  case REGEX_NODE_TYPE_CHARACTER_CLASS:
  case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
    return 1;

  /* Composites */
  case REGEX_NODE_TYPE_CONCATENATION:
    return regexCompileCountInstructions(node->left)
         + regexCompileCountInstructions(node->right);
  case REGEX_NODE_TYPE_ALTERNATION:
    /* one SPLIT in front, one JUMP between the two branches */
    return 2 + regexCompileCountInstructions(node->left)
             + regexCompileCountInstructions(node->right);

  /* Quantifiers */
  case REGEX_NODE_TYPE_QUANTIFIER:
    ninstructions = regexCompileCountInstructions(node->quantified);
    if (node->nmax >= node->nmin)
      return node->nmin * ninstructions
           + (node->nmax - node->nmin) * (ninstructions + 1);
    else
      return 1 + (node->nmin ? node->nmin * ninstructions : ninstructions + 1);

  /* Anchors */
  case REGEX_NODE_TYPE_ANCHOR_BEGIN:
  case REGEX_NODE_TYPE_ANCHOR_END:
    return 1;

  /* Captures */
  case REGEX_NODE_TYPE_CAPTURE:
    /* one SAVE on each side of the captured subtree */
    return 2 + regexCompileCountInstructions(node->captured);
  }

  /* Unknown node type: previously control fell off the end of this non-void
   * function. regexCompileContext() emits nothing for such a node. */
  return 0;
}

/*
 * Return non-zero when every match of `node` must begin with a `^` assertion,
 * i.e. when the implicit `.*?` search prefix can be left out.
 *
 * The answer is conservative: returning 0 for a pattern that is in fact
 * anchored only costs the three prefix instructions, whereas returning 1 for
 * a pattern that is not anchored drops the prefix it needs.
 */
int regexCompileNodeIsAnchored(const RegexNode *node) {
  switch (node->type) {
  case REGEX_NODE_TYPE_EPSILON:
    return 0;

  /* Characters */
  case REGEX_NODE_TYPE_CHARACTER:
  case REGEX_NODE_TYPE_ANY_CHARACTER:
  case REGEX_NODE_TYPE_CHARACTER_CLASS:
  case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
    return 0;

  /* Composites */
  case REGEX_NODE_TYPE_CONCATENATION:
    return regexCompileNodeIsAnchored(node->left);
  case REGEX_NODE_TYPE_ALTERNATION:
    return regexCompileNodeIsAnchored(node->left)
        && regexCompileNodeIsAnchored(node->right);

  /* Quantifiers */
  case REGEX_NODE_TYPE_QUANTIFIER:
    /* With nmin == 0 the quantified subtree can be skipped entirely (e.g.
     * `(^a)?b`), so its anchor does not constrain where the match starts. */
    return node->nmin > 0 && regexCompileNodeIsAnchored(node->quantified);

  /* Anchors */
  case REGEX_NODE_TYPE_ANCHOR_BEGIN:
    return 1;
  case REGEX_NODE_TYPE_ANCHOR_END:
    return 0;

  /* Captures */
  case REGEX_NODE_TYPE_CAPTURE:
    return regexCompileNodeIsAnchored(node->captured);
  }

  /* Unknown node type: treat as unanchored (the conservative answer). */
  return 0;
}

/*
 * Copy `*instruction` into the next free slot and advance the write cursor.
 * Returns the address of the slot that was written.
 */
static inline RegexProgramInstruction *regexCompileEmit(
    RegexCompileContext *context, const RegexProgramInstruction *instruction) {
  *context->pc = *instruction;
  return context->pc++;
}

/*
 * Exchange the two targets of a SPLIT. Greedy quantifiers store the loop body
 * in `first`; non-greedy ones call this so the "skip the body" target comes
 * first instead (presumably the VM prefers `first` -- confirm against the VM).
 */
static inline void regexCompileSwapSplit(RegexProgramInstruction *split) {
  RegexProgramInstruction *swap = split->first;
  split->first = split->second;
  split->second = swap;
}

/*
 * Fill `instruction->klass` from the character-class text at `node->from`.
 *
 * Scanning stops at the closing `]`. A `]` in the very first position is a
 * literal member, `\x` adds `x` literally, and `a-z` adds the inclusive range
 * unless the `-` is directly followed by the closing `]`.
 *
 * Bytes are read as unsigned char: with a signed plain `char`, a byte >= 0x80
 * would become a negative bitmap index and range ends would compare wrongly.
 * A NUL byte also ends the scan so a class that is missing its `]` cannot
 * make the loop run past the end of the pattern.
 *
 * Returns `instruction`.
 */
RegexProgramInstruction *regexCompileCharacterClass(const RegexNode *node,
    RegexProgramInstruction *instruction) {
  const unsigned char *from = (const unsigned char *)node->from;
  const unsigned char *sp = from;

  for (;;) {
    int ch = *sp;

    if (ch == '\0')
      return instruction;
    if (ch == ']' && sp != from)
      return instruction;
    ++sp;

    /* an escape contributes the following byte verbatim */
    if (ch == '\\' && *sp != '\0')
      ch = *sp++;

    if (*sp == '-' && sp[1] != ']' && sp[1] != '\0') {
      for (; ch <= sp[1]; ++ch)
        regexCharacterClassAdd(instruction->klass, ch);
      sp += 2;
    } else {
      regexCharacterClassAdd(instruction->klass, ch);
    }
  }
}

/*
 * Emit the instructions for the subtree rooted at `node` at `context->pc`.
 *
 * Returns the address the first emitted instruction occupies (the value of
 * `context->pc` on entry, even when the node emits nothing).
 */
RegexProgramInstruction *regexCompileContext(RegexCompileContext *context,
    const RegexNode *node) {
  RegexProgramInstruction *bottom = context->pc, *split, *jump, *last = NULL;
  int ncaptures = context->ncaptures, capture;

  switch (node->type) {
  case REGEX_NODE_TYPE_EPSILON:
    break;

  /* Characters */
  case REGEX_NODE_TYPE_CHARACTER:
    regexCompileEmit(context, &(RegexProgramInstruction){
        .opcode = REGEX_PROGRAM_OPCODE_CHARACTER, .ch = node->ch});
    break;
  case REGEX_NODE_TYPE_ANY_CHARACTER:
    regexCompileEmit(context, &(RegexProgramInstruction){
        .opcode = REGEX_PROGRAM_OPCODE_ANY_CHARACTER});
    break;
  case REGEX_NODE_TYPE_CHARACTER_CLASS:
    regexCompileCharacterClass(node,
        regexCompileEmit(context, &(RegexProgramInstruction){
            .opcode = REGEX_PROGRAM_OPCODE_CHARACTER_CLASS}));
    break;
  case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
    regexCompileCharacterClass(node,
        regexCompileEmit(context, &(RegexProgramInstruction){
            .opcode = REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED}));
    break;

  /* Composites */
  case REGEX_NODE_TYPE_CONCATENATION:
    regexCompileContext(context, node->left);
    regexCompileContext(context, node->right);
    break;
  case REGEX_NODE_TYPE_ALTERNATION:
    /*   SPLIT L, R
     * L: <left>
     *    JUMP end
     * R: <right>
     * end:
     */
    split = regexCompileEmit(context, &(RegexProgramInstruction){
        .opcode = REGEX_PROGRAM_OPCODE_SPLIT});
    split->first = regexCompileContext(context, node->left);
    jump = regexCompileEmit(context, &(RegexProgramInstruction){
        .opcode = REGEX_PROGRAM_OPCODE_JUMP});
    split->second = regexCompileContext(context, node->right);
    jump->target = context->pc;
    break;

  /* Quantifiers */
  case REGEX_NODE_TYPE_QUANTIFIER:
    /* Mandatory copies. The capture counter is rewound before each copy so
     * every copy of the body numbers its groups identically. */
    for (int i = 0; i < node->nmin; ++i) {
      context->ncaptures = ncaptures;
      last = regexCompileContext(context, node->quantified);
    }
    if (node->nmax > node->nmin) {
      /* Bounded: each optional copy is guarded by its own SPLIT. */
      for (int i = 0; i < node->nmax - node->nmin; ++i) {
        context->ncaptures = ncaptures;
        split = regexCompileEmit(context, &(RegexProgramInstruction){
            .opcode = REGEX_PROGRAM_OPCODE_SPLIT});
        split->first = regexCompileContext(context, node->quantified);
        split->second = context->pc;
        if (!node->greedy)
          regexCompileSwapSplit(split);
      }
    } else if (node->nmax == -1) {
      /* Unbounded. */
      split = regexCompileEmit(context, &(RegexProgramInstruction){
          .opcode = REGEX_PROGRAM_OPCODE_SPLIT});
      if (node->nmin == 0) {
        /* L: SPLIT body, end ; body: <quantified> ; JUMP L ; end: */
        split->first = regexCompileContext(context, node->quantified);
        jump = regexCompileEmit(context, &(RegexProgramInstruction){
            .opcode = REGEX_PROGRAM_OPCODE_JUMP});
        split->second = context->pc;
        jump->target = split;
      } else {
        /* Loop back to the start of the last mandatory copy. */
        split->first = last;
        split->second = context->pc;
      }
      if (!node->greedy)
        regexCompileSwapSplit(split);
    }
    break;

  /* Anchors */
  case REGEX_NODE_TYPE_ANCHOR_BEGIN:
    regexCompileEmit(context, &(RegexProgramInstruction){
        .opcode = REGEX_PROGRAM_OPCODE_ASSERT_BEGIN});
    break;
  case REGEX_NODE_TYPE_ANCHOR_END:
    regexCompileEmit(context, &(RegexProgramInstruction){
        .opcode = REGEX_PROGRAM_OPCODE_ASSERT_END});
    break;

  /* Captures */
  case REGEX_NODE_TYPE_CAPTURE:
    /* group k saves its start in slot 2k and its end in slot 2k + 1 */
    capture = context->ncaptures++ * 2;
    regexCompileEmit(context, &(RegexProgramInstruction){
        .opcode = REGEX_PROGRAM_OPCODE_SAVE, .save = capture});
    regexCompileContext(context, node->captured);
    regexCompileEmit(context, &(RegexProgramInstruction){
        .opcode = REGEX_PROGRAM_OPCODE_SAVE, .save = capture + 1});
    break;
  }

  return bottom;
}

// PUBLIC API /////////////////////////////////////////////////////////////////

/*
 * Parse and compile `pattern`.
 *
 * Returns a program to be released with regexCompileFree(), or NULL when
 * parsing or compilation fails.
 */
RegexProgram *regexCompile(const char *pattern) {
  RegexNode *root;
  RegexProgram *program;

  if (!(root = regexParse(pattern)))
    return NULL;

  program = regexCompileNode(root);
  regexParseFree(root);
  return program;
}

/*
 * Compile an already parsed tree into a program allocated through the
 * configured allocators. The tree is not modified and stays owned by the
 * caller.
 *
 * Returns NULL when allocation or compilation fails.
 */
RegexProgram *regexCompileNode(const RegexNode *root) {
  const RegexAllocators *allocators = regexAllocatorsGet();
  size_t size = sizeof (RegexProgram) + sizeof (RegexProgramInstruction)
      * regexCompileEstimateInstructions(root);
  RegexProgram *program;

  if (!(program = allocators->malloc(size)))
    return NULL;

  if (!regexCompileNodeWithProgram(root, program)) {
    allocators->free(program);
    return NULL;
  }

  return program;
}

/*
 * Compile `root` into caller-provided storage. `program` must have room for
 * at least regexCompileEstimateInstructions(root) instructions.
 *
 * The tree is wrapped in a capture (group 0, the whole match) and, unless it
 * is anchored, prefixed with a non-greedy `.*?` so a match may start at any
 * offset.
 *
 * Returns `program`.
 */
RegexProgram *regexCompileNodeWithProgram(const RegexNode *root,
    RegexProgram *program) {
  /* The synthetic nodes are named locals at function scope on purpose: a
   * compound literal created inside the `if` below would end its lifetime
   * with that statement, leaving a dangling pointer for the compile step. */

  // capture node for the entire match
  RegexNode whole = {.type = REGEX_NODE_TYPE_CAPTURE,
                     .captured = (RegexNode *)root};

  // .*? search prefix
  RegexNode any = {.type = REGEX_NODE_TYPE_ANY_CHARACTER};
  RegexNode skip = {.type = REGEX_NODE_TYPE_QUANTIFIER,
                    .nmin = 0,
                    .nmax = -1,
                    .greedy = 0,
                    .quantified = &any};
  RegexNode search = {.type = REGEX_NODE_TYPE_CONCATENATION,
                      .left = &skip,
                      .right = &whole};

  // add .*? unless pattern starts with ^
  const RegexNode *top = regexCompileNodeIsAnchored(&whole) ? &whole : &search;

  // compile
  RegexCompileContext context = {.pc = program->instructions, .ncaptures = 0};
  regexCompileContext(&context, top);

  // emit final match instruction
  regexCompileEmit(&context, &(RegexProgramInstruction){
      .opcode = REGEX_PROGRAM_OPCODE_MATCH});

  // set total number of instructions
  program->ninstructions = context.pc - program->instructions;

  return program;
}

/*
 * Upper bound on the number of instructions regexCompileNodeWithProgram()
 * writes for `root`.
 */
int regexCompileEstimateInstructions(const RegexNode *root) {
  return regexCompileCountInstructions(root)
      /* .*? is added unless pattern starts with ^,
       * save instructions are added for beginning and end of match,
       * a final match instruction is added to the end of the program */
      + !regexCompileNodeIsAnchored(root) * 3 + 2 + 1;
}

/* Free a compiled program. */
void regexCompileFree(RegexProgram *program) {
  const RegexAllocators *allocators = regexAllocatorsGet();
  allocators->free(program);
}
-------------------------------------------------------------------------------- /test/test.c: -------------------------------------------------------------------------------- 1 | /* generated by ./make-test.rb testdata/basic.dat testdata/nullsubexpr.dat testdata/repetition.dat */ 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #ifdef __GNUC__ 10 | void success(const char *source, const char *format, ...) 11 | __attribute__ ((format(printf, 2, 3))); 12 | void fail(const char *source, const char *format, ...) 
13 | __attribute__ ((format(printf, 2, 3))); 14 | #endif 15 | 16 | void success(const char *source, const char *format, ...) { 17 | va_list ap; 18 | va_start(ap, format); 19 | printf("%s [\x1b[32mSUCCESS\x1b[0m] ", source); 20 | vprintf(format, ap); 21 | printf("\n"); 22 | va_end(ap); 23 | } 24 | 25 | void fail(const char *source, const char *format, ...) { 26 | va_list ap; 27 | va_start(ap, format); 28 | printf("%s [\x1b[31mFAIL \x1b[0m] ", source); 29 | vprintf(format, ap); 30 | printf("\n"); 31 | va_end(ap); 32 | } 33 | 34 | int test(const char *source, const char *pattern, const char *string, 35 | int nmatches, ...) { 36 | RegexNode *root; 37 | RegexProgram *program; 38 | const char *matches[20] = {0}; 39 | int result = 0; 40 | va_list ap; 41 | 42 | // parse pattern 43 | if (!(root = regexParse(pattern))) { 44 | fail(source, "regexParse() failed"); 45 | return -1; 46 | } 47 | 48 | // compile parsed pattern 49 | program = regexCompileNode(root); 50 | regexParseFree(root); 51 | if (!program) { 52 | fail(source, "regexCompile() failed"); 53 | return -1; 54 | } 55 | 56 | // run program on string 57 | if ((result = regexProgramRun(program, string, 58 | matches, sizeof (matches) / sizeof (matches[0]))) < 0) { 59 | fail(source, "regexProgramRun() failed"); 60 | regexCompileFree(program); 61 | return -1; 62 | } 63 | 64 | va_start(ap, nmatches); 65 | if (result > 0) { 66 | if (nmatches > 0) { 67 | success(source, "/%s/ =~ \"%s\"", pattern, string); 68 | result = 0; 69 | for (int i = 0; i + 1 < nmatches 70 | && i + 1 < sizeof (matches) / sizeof (matches[0]); i += 2) { 71 | int begin = va_arg(ap, int), end = va_arg(ap, int); 72 | if ((begin == -1 || begin == matches[i] - string) 73 | && (end == -1 || end == matches[i + 1] - string)) { 74 | // success(source, "(%d,%d)", begin, end); 75 | } else if (matches[i] && matches[i + 1]) { 76 | fail(source, "expected (%d,%d), got (%d,%d)", begin, end, 77 | (int)(matches[i] - string), (int)(matches[i + 1] - string)); 78 | result = 
-1; 79 | } else { 80 | fail(source, "expected (%d,%d), got (NULL,NULL)", begin, end); 81 | result = -1; 82 | } 83 | } 84 | } else { 85 | fail(source, "/%s/ =~ \"%s\"", pattern, string); 86 | result = -1; 87 | } 88 | } else if (result == 0) { 89 | if (nmatches == 0) 90 | success(source, "/%s/ !~ \"%s\"", pattern, string); 91 | else { 92 | fail(source, "/%s/ !~ \"%s\"", pattern, string); 93 | result = -1; 94 | } 95 | } 96 | 97 | va_end(ap); 98 | regexCompileFree(program); 99 | return result; 100 | } 101 | 102 | int main(int argc, char *argv[]) { 103 | int nerrors = 0; 104 | nerrors += test("testdata/basic.dat:003", "abracadabra$", "abracadabracadabra", 105 | 2, 7, 18); 106 | nerrors += test("testdata/basic.dat:004", "a...b", "abababbb", 107 | 2, 2, 7); 108 | nerrors += test("testdata/basic.dat:005", "XXXXXX", "..XXXXXX", 109 | 2, 2, 8); 110 | nerrors += test("testdata/basic.dat:006", "\\)", "()", 111 | 2, 1, 2); 112 | nerrors += test("testdata/basic.dat:007", "a]", "a]a", 113 | 2, 0, 2); 114 | nerrors += test("testdata/basic.dat:008", "}", "}", 115 | 2, 0, 1); 116 | nerrors += test("testdata/basic.dat:009", "\\}", "}", 117 | 2, 0, 1); 118 | nerrors += test("testdata/basic.dat:010", "\\]", "]", 119 | 2, 0, 1); 120 | nerrors += test("testdata/basic.dat:011", "]", "]", 121 | 2, 0, 1); 122 | nerrors += test("testdata/basic.dat:012", "]", "]", 123 | 2, 0, 1); 124 | nerrors += test("testdata/basic.dat:013", "{", "{", 125 | 2, 0, 1); 126 | nerrors += test("testdata/basic.dat:014", "}", "}", 127 | 2, 0, 1); 128 | nerrors += test("testdata/basic.dat:015", "^a", "ax", 129 | 2, 0, 1); 130 | nerrors += test("testdata/basic.dat:016", "\\^a", "a^a", 131 | 2, 1, 3); 132 | nerrors += test("testdata/basic.dat:017", "a\\^", "a^", 133 | 2, 0, 2); 134 | nerrors += test("testdata/basic.dat:018", "a$", "aa", 135 | 2, 1, 2); 136 | nerrors += test("testdata/basic.dat:019", "a\\$", "a$", 137 | 2, 0, 2); 138 | nerrors += test("testdata/basic.dat:020", "^$", "", 139 | 2, 0, 0); 140 | nerrors 
+= test("testdata/basic.dat:021", "$^", "", 141 | 2, 0, 0); 142 | nerrors += test("testdata/basic.dat:022", "a($)", "aa", 143 | 4, 1, 2, 2, 2); 144 | nerrors += test("testdata/basic.dat:023", "a*(^a)", "aa", 145 | 4, 0, 1, 0, 1); 146 | nerrors += test("testdata/basic.dat:024", "(..)*(...)*", "a", 147 | 2, 0, 0); 148 | nerrors += test("testdata/basic.dat:025", "(..)*(...)*", "abcd", 149 | 4, 0, 4, 2, 4); 150 | nerrors += test("testdata/basic.dat:026", "(ab|a)(bc|c)", "abc", 151 | 6, 0, 3, 0, 2, 2, 3); 152 | nerrors += test("testdata/basic.dat:027", "(ab)c|abc", "abc", 153 | 4, 0, 3, 0, 2); 154 | nerrors += test("testdata/basic.dat:028", "a{0}b", "ab", 155 | 2, 1, 2); 156 | nerrors += test("testdata/basic.dat:029", "(a*)(b?)(b+)b{3}", "aaabbbbbbb", 157 | 8, 0, 10, 0, 3, 3, 4, 4, 7); 158 | nerrors += test("testdata/basic.dat:030", "(a*)(b{0,1})(b{1,})b{3}", "aaabbbbbbb", 159 | 8, 0, 10, 0, 3, 3, 4, 4, 7); 160 | nerrors += test("testdata/basic.dat:032", "((a|a)|a)", "a", 161 | 6, 0, 1, 0, 1, 0, 1); 162 | nerrors += test("testdata/basic.dat:033", "(a*)(a|aa)", "aaaa", 163 | 6, 0, 4, 0, 3, 3, 4); 164 | nerrors += test("testdata/basic.dat:034", "a*(a.|aa)", "aaaa", 165 | 4, 0, 4, 2, 4); 166 | nerrors += test("testdata/basic.dat:035", "a(b)|c(d)|a(e)f", "aef", 167 | 8, 0, 3, -1, -1, -1, -1, 1, 2); 168 | nerrors += test("testdata/basic.dat:036", "(a|b)?.*", "b", 169 | 4, 0, 1, 0, 1); 170 | nerrors += test("testdata/basic.dat:037", "(a|b)c|a(b|c)", "ac", 171 | 4, 0, 2, 0, 1); 172 | nerrors += test("testdata/basic.dat:038", "(a|b)c|a(b|c)", "ab", 173 | 6, 0, 2, -1, -1, 1, 2); 174 | nerrors += test("testdata/basic.dat:039", "(a|b)*c|(a|ab)*c", "abc", 175 | 4, 0, 3, 1, 2); 176 | nerrors += test("testdata/basic.dat:040", "(a|b)*c|(a|ab)*c", "xc", 177 | 2, 1, 2); 178 | nerrors += test("testdata/basic.dat:041", "(.a|.b).*|.*(.a|.b)", "xa", 179 | 4, 0, 2, 0, 2); 180 | nerrors += test("testdata/basic.dat:042", "a?(ab|ba)ab", "abab", 181 | 4, 0, 4, 0, 2); 182 | nerrors += 
test("testdata/basic.dat:043", "a?(ac{0}b|ba)ab", "abab", 183 | 4, 0, 4, 0, 2); 184 | nerrors += test("testdata/basic.dat:044", "ab|abab", "abbabab", 185 | 2, 0, 2); 186 | nerrors += test("testdata/basic.dat:045", "aba|bab|bba", "baaabbbaba", 187 | 2, 5, 8); 188 | nerrors += test("testdata/basic.dat:046", "aba|bab", "baaabbbaba", 189 | 2, 6, 9); 190 | nerrors += test("testdata/basic.dat:047", "(aa|aaa)*|(a|aaaaa)", "aa", 191 | 4, 0, 2, 0, 2); 192 | nerrors += test("testdata/basic.dat:048", "(a.|.a.)*|(a|.a...)", "aa", 193 | 4, 0, 2, 0, 2); 194 | nerrors += test("testdata/basic.dat:049", "ab|a", "xabc", 195 | 2, 1, 3); 196 | nerrors += test("testdata/basic.dat:050", "ab|a", "xxabc", 197 | 2, 2, 4); 198 | nerrors += test("testdata/basic.dat:052", "[^-]", "--a", 199 | 2, 2, 3); 200 | nerrors += test("testdata/basic.dat:053", "[a-]*", "--a", 201 | 2, 0, 3); 202 | nerrors += test("testdata/basic.dat:054", "[a-m-]*", "--amoma--", 203 | 2, 0, 4); 204 | nerrors += test("testdata/basic.dat:055", ":::1:::0:|:::1:1:0:", ":::0:::1:::1:::0:", 205 | 2, 8, 17); 206 | nerrors += test("testdata/basic.dat:056", ":::1:::0:|:::1:1:1:", ":::0:::1:::1:::0:", 207 | 2, 8, 17); 208 | nerrors += test("testdata/basic.dat:065", "\n", "\n", 209 | 2, 0, 1); 210 | nerrors += test("testdata/basic.dat:067", "[^a]", "\n", 211 | 2, 0, 1); 212 | nerrors += test("testdata/basic.dat:068", "\na", "\na", 213 | 2, 0, 2); 214 | nerrors += test("testdata/basic.dat:069", "(a)(b)(c)", "abc", 215 | 8, 0, 3, 0, 1, 1, 2, 2, 3); 216 | nerrors += test("testdata/basic.dat:070", "xxx", "xxx", 217 | 2, 0, 3); 218 | nerrors += test("testdata/basic.dat:071", "(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\\* */?)0*[6-7]))([^0-9]|$)", "feb 6,", 219 | 2, 0, 6); 220 | nerrors += test("testdata/basic.dat:072", "(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\\* */?)0*[6-7]))([^0-9]|$)", "2/7", 221 | 2, 0, 3); 222 | nerrors += test("testdata/basic.dat:073", "(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\\* */?)0*[6-7]))([^0-9]|$)", "feb 1,Feb 6", 223 | 2, 5, 11); 
224 | nerrors += test("testdata/basic.dat:074", "((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))", "x", 225 | 6, 0, 1, 0, 1, 0, 1); 226 | nerrors += test("testdata/basic.dat:075", "((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*", "xx", 227 | 6, 0, 2, 1, 2, 1, 2); 228 | nerrors += test("testdata/basic.dat:076", "a?(ab|ba)*", "ababababababababababababababababababababababababababababababababababababababababa", 229 | 4, 0, 81, 79, 81); 230 | nerrors += test("testdata/basic.dat:077", "abaa|abbaa|abbbaa|abbbbaa", "ababbabbbabbbabbbbabbbbaa", 231 | 2, 18, 25); 232 | nerrors += test("testdata/basic.dat:078", "abaa|abbaa|abbbaa|abbbbaa", "ababbabbbabbbabbbbabaa", 233 | 2, 18, 22); 234 | nerrors += test("testdata/basic.dat:079", "aaac|aabc|abac|abbc|baac|babc|bbac|bbbc", "baaabbbabac", 235 | 2, 7, 11); 236 | nerrors += test("testdata/basic.dat:080", ".*", "\x01\xff", 237 | 2, 0, 2); 238 | nerrors += test("testdata/basic.dat:081", "aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll", "XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa", 239 | 2, 53, 57); 240 | nerrors += test("testdata/basic.dat:082", "aaaa\\nbbbb\\ncccc\\nddddd\\neeeeee\\nfffffff\\ngggg\\nhhhh\\niiiii\\njjjjj\\nkkkkk\\nllll", "XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa", 241 | 0); 242 | nerrors += test("testdata/basic.dat:083", "a*a*a*a*a*b", "aaaaaaaaab", 243 | 2, 0, 10); 244 | nerrors += test("testdata/basic.dat:084", "^", "", 245 | 2, 0, 0); 246 | nerrors += test("testdata/basic.dat:085", "$", "", 247 | 2, 0, 0); 248 | nerrors += test("testdata/basic.dat:086", "^$", "", 249 | 2, 0, 0); 250 | nerrors += test("testdata/basic.dat:087", "^a$", "a", 251 | 2, 0, 1); 252 | nerrors += test("testdata/basic.dat:088", "abc", "abc", 253 | 2, 0, 3); 254 | nerrors += test("testdata/basic.dat:089", "abc", "xabcy", 255 | 2, 1, 4); 256 | nerrors += test("testdata/basic.dat:090", "abc", "ababc", 257 | 2, 2, 5); 258 | nerrors += 
test("testdata/basic.dat:091", "ab*c", "abc", 259 | 2, 0, 3); 260 | nerrors += test("testdata/basic.dat:092", "ab*bc", "abc", 261 | 2, 0, 3); 262 | nerrors += test("testdata/basic.dat:093", "ab*bc", "abbc", 263 | 2, 0, 4); 264 | nerrors += test("testdata/basic.dat:094", "ab*bc", "abbbbc", 265 | 2, 0, 6); 266 | nerrors += test("testdata/basic.dat:095", "ab+bc", "abbc", 267 | 2, 0, 4); 268 | nerrors += test("testdata/basic.dat:096", "ab+bc", "abbbbc", 269 | 2, 0, 6); 270 | nerrors += test("testdata/basic.dat:097", "ab?bc", "abbc", 271 | 2, 0, 4); 272 | nerrors += test("testdata/basic.dat:098", "ab?bc", "abc", 273 | 2, 0, 3); 274 | nerrors += test("testdata/basic.dat:099", "ab?c", "abc", 275 | 2, 0, 3); 276 | nerrors += test("testdata/basic.dat:100", "^abc$", "abc", 277 | 2, 0, 3); 278 | nerrors += test("testdata/basic.dat:101", "^abc", "abcc", 279 | 2, 0, 3); 280 | nerrors += test("testdata/basic.dat:102", "abc$", "aabc", 281 | 2, 1, 4); 282 | nerrors += test("testdata/basic.dat:103", "^", "abc", 283 | 2, 0, 0); 284 | nerrors += test("testdata/basic.dat:104", "$", "abc", 285 | 2, 3, 3); 286 | nerrors += test("testdata/basic.dat:105", "a.c", "abc", 287 | 2, 0, 3); 288 | nerrors += test("testdata/basic.dat:106", "a.c", "axc", 289 | 2, 0, 3); 290 | nerrors += test("testdata/basic.dat:107", "a.*c", "axyzc", 291 | 2, 0, 5); 292 | nerrors += test("testdata/basic.dat:108", "a[bc]d", "abd", 293 | 2, 0, 3); 294 | nerrors += test("testdata/basic.dat:109", "a[b-d]e", "ace", 295 | 2, 0, 3); 296 | nerrors += test("testdata/basic.dat:110", "a[b-d]", "aac", 297 | 2, 1, 3); 298 | nerrors += test("testdata/basic.dat:111", "a[-b]", "a-", 299 | 2, 0, 2); 300 | nerrors += test("testdata/basic.dat:112", "a[b-]", "a-", 301 | 2, 0, 2); 302 | nerrors += test("testdata/basic.dat:113", "a]", "a]", 303 | 2, 0, 2); 304 | nerrors += test("testdata/basic.dat:114", "a[]]b", "a]b", 305 | 2, 0, 3); 306 | nerrors += test("testdata/basic.dat:115", "a[^bc]d", "aed", 307 | 2, 0, 3); 308 | nerrors += 
test("testdata/basic.dat:116", "a[^-b]c", "adc", 309 | 2, 0, 3); 310 | nerrors += test("testdata/basic.dat:117", "a[^]b]c", "adc", 311 | 2, 0, 3); 312 | nerrors += test("testdata/basic.dat:118", "ab|cd", "abc", 313 | 2, 0, 2); 314 | nerrors += test("testdata/basic.dat:119", "ab|cd", "abcd", 315 | 2, 0, 2); 316 | nerrors += test("testdata/basic.dat:120", "a\\(b", "a(b", 317 | 2, 0, 3); 318 | nerrors += test("testdata/basic.dat:121", "a\\(*b", "ab", 319 | 2, 0, 2); 320 | nerrors += test("testdata/basic.dat:122", "a\\(*b", "a((b", 321 | 2, 0, 4); 322 | nerrors += test("testdata/basic.dat:123", "((a))", "abc", 323 | 6, 0, 1, 0, 1, 0, 1); 324 | nerrors += test("testdata/basic.dat:124", "(a)b(c)", "abc", 325 | 6, 0, 3, 0, 1, 2, 3); 326 | nerrors += test("testdata/basic.dat:125", "a+b+c", "aabbabc", 327 | 2, 4, 7); 328 | nerrors += test("testdata/basic.dat:126", "a*", "aaa", 329 | 2, 0, 3); 330 | nerrors += test("testdata/basic.dat:128", "(a*)*", "-", 331 | 4, 0, 0, -1, -1); 332 | nerrors += test("testdata/basic.dat:129", "(a*)+", "-", 333 | 4, 0, 0, 0, 0); 334 | nerrors += test("testdata/basic.dat:131", "(a*|b)*", "-", 335 | 4, 0, 0, -1, -1); 336 | nerrors += test("testdata/basic.dat:132", "(a+|b)*", "ab", 337 | 4, 0, 2, 1, 2); 338 | nerrors += test("testdata/basic.dat:133", "(a+|b)+", "ab", 339 | 4, 0, 2, 1, 2); 340 | nerrors += test("testdata/basic.dat:134", "(a+|b)?", "ab", 341 | 4, 0, 1, 0, 1); 342 | nerrors += test("testdata/basic.dat:135", "[^ab]*", "cde", 343 | 2, 0, 3); 344 | nerrors += test("testdata/basic.dat:137", "(^)*", "-", 345 | 4, 0, 0, -1, -1); 346 | nerrors += test("testdata/basic.dat:138", "a*", "", 347 | 2, 0, 0); 348 | nerrors += test("testdata/basic.dat:139", "([abc])*d", "abbbcd", 349 | 4, 0, 6, 4, 5); 350 | nerrors += test("testdata/basic.dat:140", "([abc])*bcd", "abcd", 351 | 4, 0, 4, 0, 1); 352 | nerrors += test("testdata/basic.dat:141", "a|b|c|d|e", "e", 353 | 2, 0, 1); 354 | nerrors += test("testdata/basic.dat:142", "(a|b|c|d|e)f", "ef", 355 | 
4, 0, 2, 0, 1); 356 | nerrors += test("testdata/basic.dat:144", "((a*|b))*", "-", 357 | 6, 0, 0, -1, -1, -1, -1); 358 | nerrors += test("testdata/basic.dat:145", "abcd*efg", "abcdefg", 359 | 2, 0, 7); 360 | nerrors += test("testdata/basic.dat:146", "ab*", "xabyabbbz", 361 | 2, 1, 3); 362 | nerrors += test("testdata/basic.dat:147", "ab*", "xayabbbz", 363 | 2, 1, 2); 364 | nerrors += test("testdata/basic.dat:148", "(ab|cd)e", "abcde", 365 | 4, 2, 5, 2, 4); 366 | nerrors += test("testdata/basic.dat:149", "[abhgefdc]ij", "hij", 367 | 2, 0, 3); 368 | nerrors += test("testdata/basic.dat:150", "(a|b)c*d", "abcd", 369 | 4, 1, 4, 1, 2); 370 | nerrors += test("testdata/basic.dat:151", "(ab|ab*)bc", "abc", 371 | 4, 0, 3, 0, 1); 372 | nerrors += test("testdata/basic.dat:152", "a([bc]*)c*", "abc", 373 | 4, 0, 3, 1, 3); 374 | nerrors += test("testdata/basic.dat:153", "a([bc]*)(c*d)", "abcd", 375 | 6, 0, 4, 1, 3, 3, 4); 376 | nerrors += test("testdata/basic.dat:154", "a([bc]+)(c*d)", "abcd", 377 | 6, 0, 4, 1, 3, 3, 4); 378 | nerrors += test("testdata/basic.dat:155", "a([bc]*)(c+d)", "abcd", 379 | 6, 0, 4, 1, 2, 2, 4); 380 | nerrors += test("testdata/basic.dat:156", "a[bcd]*dcdcde", "adcdcde", 381 | 2, 0, 7); 382 | nerrors += test("testdata/basic.dat:157", "(ab|a)b*c", "abc", 383 | 4, 0, 3, 0, 2); 384 | nerrors += test("testdata/basic.dat:158", "((a)(b)c)(d)", "abcd", 385 | 10, 0, 4, 0, 3, 0, 1, 1, 2, 3, 4); 386 | nerrors += test("testdata/basic.dat:159", "[A-Za-z_][A-Za-z0-9_]*", "alpha", 387 | 2, 0, 5); 388 | nerrors += test("testdata/basic.dat:160", "^a(bc+|b[eh])g|.h$", "abh", 389 | 2, 1, 3); 390 | nerrors += test("testdata/basic.dat:161", "(bc+d$|ef*g.|h?i(j|k))", "effgz", 391 | 4, 0, 5, 0, 5); 392 | nerrors += test("testdata/basic.dat:162", "(bc+d$|ef*g.|h?i(j|k))", "ij", 393 | 6, 0, 2, 0, 2, 1, 2); 394 | nerrors += test("testdata/basic.dat:163", "(bc+d$|ef*g.|h?i(j|k))", "reffgz", 395 | 4, 1, 6, 1, 6); 396 | nerrors += test("testdata/basic.dat:164", "(((((((((a)))))))))", 
"a", 397 | 20, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); 398 | nerrors += test("testdata/basic.dat:165", "multiple words", "multiple words yeah", 399 | 2, 0, 14); 400 | nerrors += test("testdata/basic.dat:166", "(.*)c(.*)", "abcde", 401 | 6, 0, 5, 0, 2, 3, 5); 402 | nerrors += test("testdata/basic.dat:167", "abcd", "abcd", 403 | 2, 0, 4); 404 | nerrors += test("testdata/basic.dat:168", "a(bc)d", "abcd", 405 | 4, 0, 4, 1, 3); 406 | nerrors += test("testdata/basic.dat:169", "a[-]?c", "ac", 407 | 2, 0, 3); 408 | nerrors += test("testdata/basic.dat:170", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Muammar Qaddafi", 409 | 6, 0, 15, -1, -1, 10, 12); 410 | nerrors += test("testdata/basic.dat:171", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Mo'ammar Gadhafi", 411 | 6, 0, 16, -1, -1, 11, 13); 412 | nerrors += test("testdata/basic.dat:172", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Muammar Kaddafi", 413 | 6, 0, 15, -1, -1, 10, 12); 414 | nerrors += test("testdata/basic.dat:173", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Muammar Qadhafi", 415 | 6, 0, 15, -1, -1, 10, 12); 416 | nerrors += test("testdata/basic.dat:174", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Muammar Gadafi", 417 | 6, 0, 14, -1, -1, 10, 11); 418 | nerrors += test("testdata/basic.dat:175", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Mu'ammar Qadafi", 419 | 6, 0, 15, -1, -1, 11, 12); 420 | nerrors += test("testdata/basic.dat:176", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Moamar Gaddafi", 421 | 6, 0, 14, -1, -1, 9, 11); 422 | nerrors += test("testdata/basic.dat:177", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Mu'ammar Qadhdhafi", 423 | 6, 0, 18, -1, -1, 13, 15); 424 | nerrors += test("testdata/basic.dat:178", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", 
"Muammar Khaddafi", 425 | 6, 0, 16, -1, -1, 11, 13); 426 | nerrors += test("testdata/basic.dat:179", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Muammar Ghaddafy", 427 | 6, 0, 16, -1, -1, 11, 13); 428 | nerrors += test("testdata/basic.dat:180", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Muammar Ghadafi", 429 | 6, 0, 15, -1, -1, 11, 12); 430 | nerrors += test("testdata/basic.dat:181", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Muammar Ghaddafi", 431 | 6, 0, 16, -1, -1, 11, 13); 432 | nerrors += test("testdata/basic.dat:182", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Muamar Kaddafi", 433 | 6, 0, 14, -1, -1, 9, 11); 434 | nerrors += test("testdata/basic.dat:183", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Muammar Quathafi", 435 | 6, 0, 16, -1, -1, 11, 13); 436 | nerrors += test("testdata/basic.dat:184", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Muammar Gheddafi", 437 | 6, 0, 16, -1, -1, 11, 13); 438 | nerrors += test("testdata/basic.dat:185", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Moammar Khadafy", 439 | 6, 0, 15, -1, -1, 11, 12); 440 | nerrors += test("testdata/basic.dat:186", "M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", "Moammar Qudhafi", 441 | 6, 0, 15, -1, -1, 10, 12); 442 | nerrors += test("testdata/basic.dat:187", "a+(b|c)*d+", "aabcdd", 443 | 4, 0, 6, 3, 4); 444 | nerrors += test("testdata/basic.dat:188", "^.+$", "vivi", 445 | 2, 0, 4); 446 | nerrors += test("testdata/basic.dat:189", "^(.+)$", "vivi", 447 | 4, 0, 4, 0, 4); 448 | nerrors += test("testdata/basic.dat:190", "^([^!.]+).att.com!(.+)$", "gryphon.att.com!eby", 449 | 6, 0, 19, 0, 7, 16, 19); 450 | nerrors += test("testdata/basic.dat:191", "^([^!]+!)?([^!]+)$", "bas", 451 | 6, 0, 3, -1, -1, 0, 3); 452 | nerrors += test("testdata/basic.dat:192", "^([^!]+!)?([^!]+)$", "bar!bas", 453 | 6, 0, 
7, 0, 4, 4, 7); 454 | nerrors += test("testdata/basic.dat:193", "^([^!]+!)?([^!]+)$", "foo!bas", 455 | 6, 0, 7, 0, 4, 4, 7); 456 | nerrors += test("testdata/basic.dat:194", "^.+!([^!]+!)([^!]+)$", "foo!bar!bas", 457 | 6, 0, 11, 4, 8, 8, 11); 458 | nerrors += test("testdata/basic.dat:195", "((foo)|(bar))!bas", "bar!bas", 459 | 8, 0, 7, 0, 3, -1, -1, 0, 3); 460 | nerrors += test("testdata/basic.dat:196", "((foo)|(bar))!bas", "foo!bar!bas", 461 | 8, 4, 11, 4, 7, -1, -1, 4, 7); 462 | nerrors += test("testdata/basic.dat:197", "((foo)|(bar))!bas", "foo!bas", 463 | 6, 0, 7, 0, 3, 0, 3); 464 | nerrors += test("testdata/basic.dat:198", "((foo)|bar)!bas", "bar!bas", 465 | 4, 0, 7, 0, 3); 466 | nerrors += test("testdata/basic.dat:199", "((foo)|bar)!bas", "foo!bar!bas", 467 | 4, 4, 11, 4, 7); 468 | nerrors += test("testdata/basic.dat:200", "((foo)|bar)!bas", "foo!bas", 469 | 6, 0, 7, 0, 3, 0, 3); 470 | nerrors += test("testdata/basic.dat:201", "(foo|(bar))!bas", "bar!bas", 471 | 6, 0, 7, 0, 3, 0, 3); 472 | nerrors += test("testdata/basic.dat:202", "(foo|(bar))!bas", "foo!bar!bas", 473 | 6, 4, 11, 4, 7, 4, 7); 474 | nerrors += test("testdata/basic.dat:203", "(foo|(bar))!bas", "foo!bas", 475 | 4, 0, 7, 0, 3); 476 | nerrors += test("testdata/basic.dat:204", "(foo|bar)!bas", "bar!bas", 477 | 4, 0, 7, 0, 3); 478 | nerrors += test("testdata/basic.dat:205", "(foo|bar)!bas", "foo!bar!bas", 479 | 4, 4, 11, 4, 7); 480 | nerrors += test("testdata/basic.dat:206", "(foo|bar)!bas", "foo!bas", 481 | 4, 0, 7, 0, 3); 482 | nerrors += test("testdata/basic.dat:207", "^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", "foo!bar!bas", 483 | 12, 0, 11, 0, 11, -1, -1, -1, -1, 4, 8, 8, 11); 484 | nerrors += test("testdata/basic.dat:208", "^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", "bas", 485 | 6, 0, 3, -1, -1, 0, 3); 486 | nerrors += test("testdata/basic.dat:209", "^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", "bar!bas", 487 | 6, 0, 7, 0, 4, 4, 7); 488 | nerrors += test("testdata/basic.dat:210", 
"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", "foo!bar!bas", 489 | 10, 0, 11, -1, -1, -1, -1, 4, 8, 8, 11); 490 | nerrors += test("testdata/basic.dat:211", "^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", "foo!bas", 491 | 6, 0, 7, 0, 4, 4, 7); 492 | nerrors += test("testdata/basic.dat:212", "^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", "bas", 493 | 8, 0, 3, 0, 3, -1, -1, 0, 3); 494 | nerrors += test("testdata/basic.dat:213", "^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", "bar!bas", 495 | 8, 0, 7, 0, 7, 0, 4, 4, 7); 496 | nerrors += test("testdata/basic.dat:214", "^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", "foo!bar!bas", 497 | 12, 0, 11, 0, 11, -1, -1, -1, -1, 4, 8, 8, 11); 498 | nerrors += test("testdata/basic.dat:215", "^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", "foo!bas", 499 | 8, 0, 7, 0, 7, 0, 4, 4, 7); 500 | nerrors += test("testdata/basic.dat:216", ".*(/XXX).*", "/XXX", 501 | 4, 0, 4, 0, 4); 502 | nerrors += test("testdata/basic.dat:217", ".*(\\\\XXX).*", "\\XXX", 503 | 4, 0, 4, 0, 4); 504 | nerrors += test("testdata/basic.dat:218", "\\\\XXX", "\\XXX", 505 | 2, 0, 4); 506 | nerrors += test("testdata/basic.dat:219", ".*(/000).*", "/000", 507 | 4, 0, 4, 0, 4); 508 | nerrors += test("testdata/basic.dat:220", ".*(\\\\000).*", "\\000", 509 | 4, 0, 4, 0, 4); 510 | nerrors += test("testdata/basic.dat:221", "\\\\000", "\\000", 511 | 2, 0, 4); 512 | nerrors += test("testdata/nullsubexpr.dat:003", "(a*)*", "a", 513 | 4, 0, 1, 0, 1); 514 | nerrors += test("testdata/nullsubexpr.dat:005", "(a*)*", "x", 515 | 4, 0, 0, -1, -1); 516 | nerrors += test("testdata/nullsubexpr.dat:006", "(a*)*", "aaaaaa", 517 | 4, 0, 6, 0, 6); 518 | nerrors += test("testdata/nullsubexpr.dat:007", "(a*)*", "aaaaaax", 519 | 4, 0, 6, 0, 6); 520 | nerrors += test("testdata/nullsubexpr.dat:008", "(a*)+", "a", 521 | 4, 0, 1, 0, 1); 522 | nerrors += test("testdata/nullsubexpr.dat:009", "(a*)+", "x", 523 | 4, 0, 0, 0, 0); 524 | nerrors += test("testdata/nullsubexpr.dat:010", "(a*)+", "aaaaaa", 525 | 4, 0, 6, 0, 6); 526 | 
nerrors += test("testdata/nullsubexpr.dat:011", "(a*)+", "aaaaaax", 527 | 4, 0, 6, 0, 6); 528 | nerrors += test("testdata/nullsubexpr.dat:012", "(a+)*", "a", 529 | 4, 0, 1, 0, 1); 530 | nerrors += test("testdata/nullsubexpr.dat:013", "(a+)*", "x", 531 | 2, 0, 0); 532 | nerrors += test("testdata/nullsubexpr.dat:014", "(a+)*", "aaaaaa", 533 | 4, 0, 6, 0, 6); 534 | nerrors += test("testdata/nullsubexpr.dat:015", "(a+)*", "aaaaaax", 535 | 4, 0, 6, 0, 6); 536 | nerrors += test("testdata/nullsubexpr.dat:016", "(a+)+", "a", 537 | 4, 0, 1, 0, 1); 538 | nerrors += test("testdata/nullsubexpr.dat:017", "(a+)+", "x", 539 | 0); 540 | nerrors += test("testdata/nullsubexpr.dat:018", "(a+)+", "aaaaaa", 541 | 4, 0, 6, 0, 6); 542 | nerrors += test("testdata/nullsubexpr.dat:019", "(a+)+", "aaaaaax", 543 | 4, 0, 6, 0, 6); 544 | nerrors += test("testdata/nullsubexpr.dat:021", "([a]*)*", "a", 545 | 4, 0, 1, 0, 1); 546 | nerrors += test("testdata/nullsubexpr.dat:023", "([a]*)*", "x", 547 | 4, 0, 0, -1, -1); 548 | nerrors += test("testdata/nullsubexpr.dat:024", "([a]*)*", "aaaaaa", 549 | 4, 0, 6, 0, 6); 550 | nerrors += test("testdata/nullsubexpr.dat:025", "([a]*)*", "aaaaaax", 551 | 4, 0, 6, 0, 6); 552 | nerrors += test("testdata/nullsubexpr.dat:026", "([a]*)+", "a", 553 | 4, 0, 1, 0, 1); 554 | nerrors += test("testdata/nullsubexpr.dat:027", "([a]*)+", "x", 555 | 4, 0, 0, 0, 0); 556 | nerrors += test("testdata/nullsubexpr.dat:028", "([a]*)+", "aaaaaa", 557 | 4, 0, 6, 0, 6); 558 | nerrors += test("testdata/nullsubexpr.dat:029", "([a]*)+", "aaaaaax", 559 | 4, 0, 6, 0, 6); 560 | nerrors += test("testdata/nullsubexpr.dat:030", "([^b]*)*", "a", 561 | 4, 0, 1, 0, 1); 562 | nerrors += test("testdata/nullsubexpr.dat:032", "([^b]*)*", "b", 563 | 4, 0, 0, -1, -1); 564 | nerrors += test("testdata/nullsubexpr.dat:033", "([^b]*)*", "aaaaaa", 565 | 4, 0, 6, 0, 6); 566 | nerrors += test("testdata/nullsubexpr.dat:034", "([^b]*)*", "aaaaaab", 567 | 4, 0, 6, 0, 6); 568 | nerrors += 
test("testdata/nullsubexpr.dat:035", "([ab]*)*", "a", 569 | 4, 0, 1, 0, 1); 570 | nerrors += test("testdata/nullsubexpr.dat:036", "([ab]*)*", "aaaaaa", 571 | 4, 0, 6, 0, 6); 572 | nerrors += test("testdata/nullsubexpr.dat:037", "([ab]*)*", "ababab", 573 | 4, 0, 6, 0, 6); 574 | nerrors += test("testdata/nullsubexpr.dat:038", "([ab]*)*", "bababa", 575 | 4, 0, 6, 0, 6); 576 | nerrors += test("testdata/nullsubexpr.dat:039", "([ab]*)*", "b", 577 | 4, 0, 1, 0, 1); 578 | nerrors += test("testdata/nullsubexpr.dat:040", "([ab]*)*", "bbbbbb", 579 | 4, 0, 6, 0, 6); 580 | nerrors += test("testdata/nullsubexpr.dat:041", "([ab]*)*", "aaaabcde", 581 | 4, 0, 5, 0, 5); 582 | nerrors += test("testdata/nullsubexpr.dat:042", "([^a]*)*", "b", 583 | 4, 0, 1, 0, 1); 584 | nerrors += test("testdata/nullsubexpr.dat:043", "([^a]*)*", "bbbbbb", 585 | 4, 0, 6, 0, 6); 586 | nerrors += test("testdata/nullsubexpr.dat:045", "([^a]*)*", "aaaaaa", 587 | 4, 0, 0, -1, -1); 588 | nerrors += test("testdata/nullsubexpr.dat:046", "([^ab]*)*", "ccccxx", 589 | 4, 0, 6, 0, 6); 590 | nerrors += test("testdata/nullsubexpr.dat:048", "([^ab]*)*", "ababab", 591 | 4, 0, 0, -1, -1); 592 | nerrors += test("testdata/nullsubexpr.dat:050", "((z)+|a)*", "zabcde", 593 | 4, 0, 2, 1, 2); 594 | nerrors += test("testdata/nullsubexpr.dat:069", "(a*)*(x)", "x", 595 | 6, 0, 1, -1, -1, 0, 1); 596 | nerrors += test("testdata/nullsubexpr.dat:070", "(a*)*(x)", "ax", 597 | 6, 0, 2, 0, 1, 1, 2); 598 | nerrors += test("testdata/nullsubexpr.dat:071", "(a*)*(x)", "axa", 599 | 6, 0, 2, 0, 1, 1, 2); 600 | nerrors += test("testdata/nullsubexpr.dat:073", "(a*)+(x)", "x", 601 | 6, 0, 1, 0, 0, 0, 1); 602 | nerrors += test("testdata/nullsubexpr.dat:074", "(a*)+(x)", "ax", 603 | 6, 0, 2, 0, 1, 1, 2); 604 | nerrors += test("testdata/nullsubexpr.dat:075", "(a*)+(x)", "axa", 605 | 6, 0, 2, 0, 1, 1, 2); 606 | nerrors += test("testdata/nullsubexpr.dat:077", "(a*){2}(x)", "x", 607 | 6, 0, 1, 0, 0, 0, 1); 608 | nerrors += 
test("testdata/nullsubexpr.dat:078", "(a*){2}(x)", "ax", 609 | 6, 0, 2, 1, 1, 1, 2); 610 | nerrors += test("testdata/nullsubexpr.dat:079", "(a*){2}(x)", "axa", 611 | 6, 0, 2, 1, 1, 1, 2); 612 | nerrors += test("testdata/repetition.dat:010", "((..)|(.))", "", 613 | 0); 614 | nerrors += test("testdata/repetition.dat:011", "((..)|(.))((..)|(.))", "", 615 | 0); 616 | nerrors += test("testdata/repetition.dat:012", "((..)|(.))((..)|(.))((..)|(.))", "", 617 | 0); 618 | nerrors += test("testdata/repetition.dat:014", "((..)|(.)){1}", "", 619 | 0); 620 | nerrors += test("testdata/repetition.dat:015", "((..)|(.)){2}", "", 621 | 0); 622 | nerrors += test("testdata/repetition.dat:016", "((..)|(.)){3}", "", 623 | 0); 624 | nerrors += test("testdata/repetition.dat:018", "((..)|(.))*", "", 625 | 2, 0, 0); 626 | nerrors += test("testdata/repetition.dat:020", "((..)|(.))", "a", 627 | 8, 0, 1, 0, 1, -1, -1, 0, 1); 628 | nerrors += test("testdata/repetition.dat:021", "((..)|(.))((..)|(.))", "a", 629 | 0); 630 | nerrors += test("testdata/repetition.dat:022", "((..)|(.))((..)|(.))((..)|(.))", "a", 631 | 0); 632 | nerrors += test("testdata/repetition.dat:024", "((..)|(.)){1}", "a", 633 | 8, 0, 1, 0, 1, -1, -1, 0, 1); 634 | nerrors += test("testdata/repetition.dat:025", "((..)|(.)){2}", "a", 635 | 0); 636 | nerrors += test("testdata/repetition.dat:026", "((..)|(.)){3}", "a", 637 | 0); 638 | nerrors += test("testdata/repetition.dat:028", "((..)|(.))*", "a", 639 | 8, 0, 1, 0, 1, -1, -1, 0, 1); 640 | nerrors += test("testdata/repetition.dat:030", "((..)|(.))", "aa", 641 | 8, 0, 2, 0, 2, 0, 2, -1, -1); 642 | nerrors += test("testdata/repetition.dat:031", "((..)|(.))((..)|(.))", "aa", 643 | 14, 0, 2, 0, 1, -1, -1, 0, 1, 1, 2, -1, -1, 1, 2); 644 | nerrors += test("testdata/repetition.dat:032", "((..)|(.))((..)|(.))((..)|(.))", "aa", 645 | 0); 646 | nerrors += test("testdata/repetition.dat:034", "((..)|(.)){1}", "aa", 647 | 8, 0, 2, 0, 2, 0, 2, -1, -1); 648 | nerrors += 
test("testdata/repetition.dat:035", "((..)|(.)){2}", "aa", 649 | 8, 0, 2, 1, 2, -1, -1, 1, 2); 650 | nerrors += test("testdata/repetition.dat:036", "((..)|(.)){3}", "aa", 651 | 0); 652 | nerrors += test("testdata/repetition.dat:038", "((..)|(.))*", "aa", 653 | 8, 0, 2, 0, 2, 0, 2, -1, -1); 654 | nerrors += test("testdata/repetition.dat:040", "((..)|(.))", "aaa", 655 | 8, 0, 2, 0, 2, 0, 2, -1, -1); 656 | nerrors += test("testdata/repetition.dat:041", "((..)|(.))((..)|(.))", "aaa", 657 | 14, 0, 3, 0, 2, 0, 2, -1, -1, 2, 3, -1, -1, 2, 3); 658 | nerrors += test("testdata/repetition.dat:042", "((..)|(.))((..)|(.))((..)|(.))", "aaa", 659 | 20, 0, 3, 0, 1, -1, -1, 0, 1, 1, 2, -1, -1, 1, 2, 2, 3, -1, -1, 2, 3); 660 | nerrors += test("testdata/repetition.dat:044", "((..)|(.)){1}", "aaa", 661 | 8, 0, 2, 0, 2, 0, 2, -1, -1); 662 | nerrors += test("testdata/repetition.dat:046", "((..)|(.)){2}", "aaa", 663 | 8, 0, 3, 2, 3, 0, 2, 2, 3); 664 | nerrors += test("testdata/repetition.dat:047", "((..)|(.)){3}", "aaa", 665 | 8, 0, 3, 2, 3, -1, -1, 2, 3); 666 | nerrors += test("testdata/repetition.dat:050", "((..)|(.))*", "aaa", 667 | 8, 0, 3, 2, 3, 0, 2, 2, 3); 668 | nerrors += test("testdata/repetition.dat:052", "((..)|(.))", "aaaa", 669 | 8, 0, 2, 0, 2, 0, 2, -1, -1); 670 | nerrors += test("testdata/repetition.dat:053", "((..)|(.))((..)|(.))", "aaaa", 671 | 14, 0, 4, 0, 2, 0, 2, -1, -1, 2, 4, 2, 4, -1, -1); 672 | nerrors += test("testdata/repetition.dat:054", "((..)|(.))((..)|(.))((..)|(.))", "aaaa", 673 | 20, 0, 4, 0, 2, 0, 2, -1, -1, 2, 3, -1, -1, 2, 3, 3, 4, -1, -1, 3, 4); 674 | nerrors += test("testdata/repetition.dat:056", "((..)|(.)){1}", "aaaa", 675 | 8, 0, 2, 0, 2, 0, 2, -1, -1); 676 | nerrors += test("testdata/repetition.dat:057", "((..)|(.)){2}", "aaaa", 677 | 8, 0, 4, 2, 4, 2, 4, -1, -1); 678 | nerrors += test("testdata/repetition.dat:059", "((..)|(.)){3}", "aaaa", 679 | 8, 0, 4, 3, 4, 0, 2, 3, 4); 680 | nerrors += test("testdata/repetition.dat:061", "((..)|(.))*", "aaaa", 
681 | 8, 0, 4, 2, 4, 2, 4, -1, -1); 682 | nerrors += test("testdata/repetition.dat:063", "((..)|(.))", "aaaaa", 683 | 8, 0, 2, 0, 2, 0, 2, -1, -1); 684 | nerrors += test("testdata/repetition.dat:064", "((..)|(.))((..)|(.))", "aaaaa", 685 | 14, 0, 4, 0, 2, 0, 2, -1, -1, 2, 4, 2, 4, -1, -1); 686 | nerrors += test("testdata/repetition.dat:065", "((..)|(.))((..)|(.))((..)|(.))", "aaaaa", 687 | 20, 0, 5, 0, 2, 0, 2, -1, -1, 2, 4, 2, 4, -1, -1, 4, 5, -1, -1, 4, 5); 688 | nerrors += test("testdata/repetition.dat:067", "((..)|(.)){1}", "aaaaa", 689 | 8, 0, 2, 0, 2, 0, 2, -1, -1); 690 | nerrors += test("testdata/repetition.dat:068", "((..)|(.)){2}", "aaaaa", 691 | 8, 0, 4, 2, 4, 2, 4, -1, -1); 692 | nerrors += test("testdata/repetition.dat:070", "((..)|(.)){3}", "aaaaa", 693 | 8, 0, 5, 4, 5, 2, 4, 4, 5); 694 | nerrors += test("testdata/repetition.dat:073", "((..)|(.))*", "aaaaa", 695 | 8, 0, 5, 4, 5, 2, 4, 4, 5); 696 | nerrors += test("testdata/repetition.dat:075", "((..)|(.))", "aaaaaa", 697 | 8, 0, 2, 0, 2, 0, 2, -1, -1); 698 | nerrors += test("testdata/repetition.dat:076", "((..)|(.))((..)|(.))", "aaaaaa", 699 | 14, 0, 4, 0, 2, 0, 2, -1, -1, 2, 4, 2, 4, -1, -1); 700 | nerrors += test("testdata/repetition.dat:077", "((..)|(.))((..)|(.))((..)|(.))", "aaaaaa", 701 | 20, 0, 6, 0, 2, 0, 2, -1, -1, 2, 4, 2, 4, -1, -1, 4, 6, 4, 6, -1, -1); 702 | nerrors += test("testdata/repetition.dat:079", "((..)|(.)){1}", "aaaaaa", 703 | 8, 0, 2, 0, 2, 0, 2, -1, -1); 704 | nerrors += test("testdata/repetition.dat:080", "((..)|(.)){2}", "aaaaaa", 705 | 8, 0, 4, 2, 4, 2, 4, -1, -1); 706 | nerrors += test("testdata/repetition.dat:081", "((..)|(.)){3}", "aaaaaa", 707 | 8, 0, 6, 4, 6, 4, 6, -1, -1); 708 | nerrors += test("testdata/repetition.dat:083", "((..)|(.))*", "aaaaaa", 709 | 8, 0, 6, 4, 6, 4, 6, -1, -1); 710 | nerrors += test("testdata/repetition.dat:090", "X(.?){0,}Y", "X1234567Y", 711 | 4, 0, 9, 7, 8); 712 | nerrors += test("testdata/repetition.dat:091", "X(.?){1,}Y", "X1234567Y", 713 | 
4, 0, 9, 7, 8); 714 | nerrors += test("testdata/repetition.dat:092", "X(.?){2,}Y", "X1234567Y", 715 | 4, 0, 9, 7, 8); 716 | nerrors += test("testdata/repetition.dat:093", "X(.?){3,}Y", "X1234567Y", 717 | 4, 0, 9, 7, 8); 718 | nerrors += test("testdata/repetition.dat:094", "X(.?){4,}Y", "X1234567Y", 719 | 4, 0, 9, 7, 8); 720 | nerrors += test("testdata/repetition.dat:095", "X(.?){5,}Y", "X1234567Y", 721 | 4, 0, 9, 7, 8); 722 | nerrors += test("testdata/repetition.dat:096", "X(.?){6,}Y", "X1234567Y", 723 | 4, 0, 9, 7, 8); 724 | nerrors += test("testdata/repetition.dat:097", "X(.?){7,}Y", "X1234567Y", 725 | 4, 0, 9, 7, 8); 726 | nerrors += test("testdata/repetition.dat:098", "X(.?){8,}Y", "X1234567Y", 727 | 4, 0, 9, 8, 8); 728 | nerrors += test("testdata/repetition.dat:100", "X(.?){0,8}Y", "X1234567Y", 729 | 4, 0, 9, 8, 8); 730 | nerrors += test("testdata/repetition.dat:102", "X(.?){1,8}Y", "X1234567Y", 731 | 4, 0, 9, 8, 8); 732 | nerrors += test("testdata/repetition.dat:104", "X(.?){2,8}Y", "X1234567Y", 733 | 4, 0, 9, 8, 8); 734 | nerrors += test("testdata/repetition.dat:106", "X(.?){3,8}Y", "X1234567Y", 735 | 4, 0, 9, 8, 8); 736 | nerrors += test("testdata/repetition.dat:108", "X(.?){4,8}Y", "X1234567Y", 737 | 4, 0, 9, 8, 8); 738 | nerrors += test("testdata/repetition.dat:110", "X(.?){5,8}Y", "X1234567Y", 739 | 4, 0, 9, 8, 8); 740 | nerrors += test("testdata/repetition.dat:112", "X(.?){6,8}Y", "X1234567Y", 741 | 4, 0, 9, 8, 8); 742 | nerrors += test("testdata/repetition.dat:114", "X(.?){7,8}Y", "X1234567Y", 743 | 4, 0, 9, 8, 8); 744 | nerrors += test("testdata/repetition.dat:115", "X(.?){8,8}Y", "X1234567Y", 745 | 4, 0, 9, 8, 8); 746 | nerrors += test("testdata/repetition.dat:127", "(a|ab|c|bcd){0,}(d*)", "ababcd", 747 | 6, 0, 1, 0, 1, 1, 1); 748 | nerrors += test("testdata/repetition.dat:129", "(a|ab|c|bcd){1,}(d*)", "ababcd", 749 | 6, 0, 1, 0, 1, 1, 1); 750 | nerrors += test("testdata/repetition.dat:130", "(a|ab|c|bcd){2,}(d*)", "ababcd", 751 | 6, 0, 6, 3, 6, 6, 
6); 752 | nerrors += test("testdata/repetition.dat:131", "(a|ab|c|bcd){3,}(d*)", "ababcd", 753 | 6, 0, 6, 3, 6, 6, 6); 754 | nerrors += test("testdata/repetition.dat:132", "(a|ab|c|bcd){4,}(d*)", "ababcd", 755 | 0); 756 | nerrors += test("testdata/repetition.dat:134", "(a|ab|c|bcd){0,10}(d*)", "ababcd", 757 | 6, 0, 1, 0, 1, 1, 1); 758 | nerrors += test("testdata/repetition.dat:136", "(a|ab|c|bcd){1,10}(d*)", "ababcd", 759 | 6, 0, 1, 0, 1, 1, 1); 760 | nerrors += test("testdata/repetition.dat:137", "(a|ab|c|bcd){2,10}(d*)", "ababcd", 761 | 6, 0, 6, 3, 6, 6, 6); 762 | nerrors += test("testdata/repetition.dat:138", "(a|ab|c|bcd){3,10}(d*)", "ababcd", 763 | 6, 0, 6, 3, 6, 6, 6); 764 | nerrors += test("testdata/repetition.dat:139", "(a|ab|c|bcd){4,10}(d*)", "ababcd", 765 | 0); 766 | nerrors += test("testdata/repetition.dat:141", "(a|ab|c|bcd)*(d*)", "ababcd", 767 | 6, 0, 1, 0, 1, 1, 1); 768 | nerrors += test("testdata/repetition.dat:143", "(a|ab|c|bcd)+(d*)", "ababcd", 769 | 6, 0, 1, 0, 1, 1, 1); 770 | nerrors += test("testdata/repetition.dat:149", "(ab|a|c|bcd){0,}(d*)", "ababcd", 771 | 6, 0, 6, 4, 5, 5, 6); 772 | nerrors += test("testdata/repetition.dat:151", "(ab|a|c|bcd){1,}(d*)", "ababcd", 773 | 6, 0, 6, 4, 5, 5, 6); 774 | nerrors += test("testdata/repetition.dat:153", "(ab|a|c|bcd){2,}(d*)", "ababcd", 775 | 6, 0, 6, 4, 5, 5, 6); 776 | nerrors += test("testdata/repetition.dat:155", "(ab|a|c|bcd){3,}(d*)", "ababcd", 777 | 6, 0, 6, 4, 5, 5, 6); 778 | nerrors += test("testdata/repetition.dat:156", "(ab|a|c|bcd){4,}(d*)", "ababcd", 779 | 0); 780 | nerrors += test("testdata/repetition.dat:158", "(ab|a|c|bcd){0,10}(d*)", "ababcd", 781 | 6, 0, 6, 4, 5, 5, 6); 782 | nerrors += test("testdata/repetition.dat:160", "(ab|a|c|bcd){1,10}(d*)", "ababcd", 783 | 6, 0, 6, 4, 5, 5, 6); 784 | nerrors += test("testdata/repetition.dat:162", "(ab|a|c|bcd){2,10}(d*)", "ababcd", 785 | 6, 0, 6, 4, 5, 5, 6); 786 | nerrors += test("testdata/repetition.dat:164", "(ab|a|c|bcd){3,10}(d*)", 
"ababcd", 787 | 6, 0, 6, 4, 5, 5, 6); 788 | nerrors += test("testdata/repetition.dat:165", "(ab|a|c|bcd){4,10}(d*)", "ababcd", 789 | 0); 790 | nerrors += test("testdata/repetition.dat:167", "(ab|a|c|bcd)*(d*)", "ababcd", 791 | 6, 0, 6, 4, 5, 5, 6); 792 | nerrors += test("testdata/repetition.dat:169", "(ab|a|c|bcd)+(d*)", "ababcd", 793 | 6, 0, 6, 4, 5, 5, 6); 794 | printf("345 test(s), %d error(s).\n", -nerrors); 795 | return 0; 796 | } 797 | --------------------------------------------------------------------------------