├── .clang-format ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── include └── cregex.h ├── mk ├── common.mk └── test-data.mk ├── src ├── compile.c ├── parse.c └── vm.c └── tests ├── cli.c ├── generator.rb └── re2dot.c /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Chromium 2 | Language: Cpp 3 | MaxEmptyLinesToKeep: 3 4 | IndentCaseLabels: false 5 | AllowShortIfStatementsOnASingleLine: false 6 | AllowShortCaseLabelsOnASingleLine: false 7 | AllowShortLoopsOnASingleLine: false 8 | DerivePointerAlignment: false 9 | PointerAlignment: Right 10 | SpaceAfterCStyleCast: true 11 | TabWidth: 4 12 | UseTab: Never 13 | IndentWidth: 4 14 | BreakBeforeBraces: Linux 15 | AccessModifierOffset: -4 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tests/driver 2 | tests/driver.o 3 | tests/driver.c 4 | tests/basic.dat 5 | tests/nullsubexpr.dat 6 | tests/repetition.dat 7 | tests/cli 8 | tests/re2dot 9 | *.o 10 | *.o.d 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 National Cheng Kung University, Taiwan. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 21 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROGS := driver cli re2dot 2 | PROGS := $(addprefix tests/,$(PROGS)) 3 | 4 | OBJS := src/compile.o \ 5 | src/parse.o \ 6 | src/vm.o 7 | deps := $(OBJS:%.o=%.o.d) $(PROGS:%=%.o.d) 8 | 9 | include mk/common.mk 10 | 11 | CC ?= gcc 12 | CFLAGS += -std=c11 -Wall -pedantic 13 | CFLAGS += -Iinclude 14 | 15 | .PHONY: all 16 | all: CFLAGS += -DNDEBUG -O2 17 | all: $(OBJS) $(PROGS) 18 | 19 | .PHONY: debug 20 | debug: CFLAGS += -DDEBUG -g 21 | debug: LDFLAGS += -g 22 | debug: $(OBJS) $(PROGS) 23 | 24 | include mk/test-data.mk 25 | tests/driver.c: tests/generator.rb $(TESTDATA) 26 | $(VECHO) " GEN\t$@\n" 27 | $(Q)tests/generator.rb $(TESTDATA) > $@ 28 | check: tests/driver 29 | $(Q)$< 30 | 31 | %.o: %.c 32 | $(VECHO) " CC\t$@\n" 33 | $(Q)$(CC) $(CFLAGS) -MMD -MF $@.d -c -o $@ $< 34 | 35 | tests/%: tests/%.o $(OBJS) 36 | $(VECHO) " CC+LD\t$@\n" 37 | $(Q)$(CC) $(LDFLAGS) -o $@ $^ 38 | 39 | .PHONY: clean 40 | clean: 41 | $(RM) $(PROGS) $(PROGS:%=%.o) $(OBJS) $(deps) 42 | distclean: clean 43 | -$(RM) 
tests/driver.c $(TESTDATA) 44 | 45 | -include $(deps) 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cregex 2 | 3 | `cregex` is a compact implementation of a [regular expression](https://en.wikipedia.org/wiki/Regular_expression) 4 | (regex) matching engine in C. Its design was inspired by Rob Pike's regex-code for the book "Beautiful Code" 5 | [available online here](https://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html). 6 | It is based on two papers by Russ Cox: 7 | * [Regular Expression Matching Can Be Simple And Fast](https://swtch.com/~rsc/regexp/regexp1.html) 8 | * [Regular Expression Matching: the Virtual Machine Approach](https://swtch.com/~rsc/regexp/regexp2.html) 9 | 10 | `cregex` supports a subset of the syntax and semantics of the [POSIX Basic Regular Expressions](https://www.regular-expressions.info/posix.html). 11 | The main design goal of `cregex` is to be small, correct, self-contained and 12 | use few resources while retaining acceptable performance and feature completeness. 13 | 14 | ## Features 15 | 16 | * `^` and `$` anchors 17 | * `.` matches any single character 18 | * `[...]` and `[^...]` character classes 19 | * `?`, `*`, `+`, and `{x,y}` greedy quantifiers 20 | * `??`, `*?`, `+?`, and `{x,y}?` non-greedy quantifiers 21 | * `(...)` capturing groups 22 | 23 | ## Build and Test 24 | 25 | Simply run `make` to build the library and test programs. 26 | ```shell 27 | $ make 28 | ``` 29 | 30 | Run the tests taken from the Go distribution. 31 | ```shell 32 | $ make check 33 | ``` 34 | 35 | Visualize the regular expressions with [Graphviz](https://graphviz.org/). 36 | ```shell 37 | $ tests/re2dot "(a*)(b{0,1})(b{1,})b{3}" | dot -Tpng -o out.png 38 | ``` 39 | 40 | ## License 41 | 42 | `cregex` is freely redistributable under the BSD 2-Clause license. 
#ifndef CREGEX_H
#define CREGEX_H

/* Kinds of node in a parsed pattern tree. */
typedef enum {
    REGEX_NODE_TYPE_EPSILON = 0,
    /* Characters */
    REGEX_NODE_TYPE_CHARACTER,
    REGEX_NODE_TYPE_ANY_CHARACTER,
    REGEX_NODE_TYPE_CHARACTER_CLASS,
    REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED,
    /* Composites */
    REGEX_NODE_TYPE_CONCATENATION,
    REGEX_NODE_TYPE_ALTERNATION,
    /* Quantifiers */
    REGEX_NODE_TYPE_QUANTIFIER,
    /* Anchors */
    REGEX_NODE_TYPE_ANCHOR_BEGIN,
    REGEX_NODE_TYPE_ANCHOR_END,
    /* Captures */
    REGEX_NODE_TYPE_CAPTURE
} cregex_node_type;

/* One node of a parsed pattern. `type` selects which member of the
 * anonymous union is meaningful.
 */
typedef struct cregex_node {
    cregex_node_type type;
    union {
        /* REGEX_NODE_TYPE_CHARACTER */
        struct {
            int ch;
        };
        /* REGEX_NODE_TYPE_CHARACTER_CLASS,
         * REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED
         *
         * `from` points at the first character of the class text inside the
         * pattern string and `to` at its closing ']'.
         */
        struct {
            const char *from, *to;
        };
        /* REGEX_NODE_TYPE_QUANTIFIER
         *
         * Repeat `quantified` between nmin and nmax times; nmax == -1 means
         * "no upper bound". `greedy` is 0 for the lazy (`?`-suffixed) forms.
         */
        struct {
            int nmin, nmax, greedy;
            struct cregex_node *quantified;
        };
        /* REGEX_NODE_TYPE_CONCATENATION,
         * REGEX_NODE_TYPE_ALTERNATION
         */
        struct {
            struct cregex_node *left, *right;
        };
        /* REGEX_NODE_TYPE_CAPTURE */
        struct {
            struct cregex_node *captured;
        };
    };
} cregex_node_t;

/* Opcodes of the compiled program executed by the VM. */
typedef enum {
    REGEX_PROGRAM_OPCODE_MATCH = 0,
    /* Characters */
    REGEX_PROGRAM_OPCODE_CHARACTER,
    REGEX_PROGRAM_OPCODE_ANY_CHARACTER,
    REGEX_PROGRAM_OPCODE_CHARACTER_CLASS,
    REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED,
    /* Control-flow */
    REGEX_PROGRAM_OPCODE_SPLIT,
    REGEX_PROGRAM_OPCODE_JUMP,
    /* Assertions */
    REGEX_PROGRAM_OPCODE_ASSERT_BEGIN,
    REGEX_PROGRAM_OPCODE_ASSERT_END,
    /* Saving */
    REGEX_PROGRAM_OPCODE_SAVE
} cregex_program_opcode_t;

#include <limits.h>

/* Bitmap with one bit per `unsigned char` value. */
typedef char cregex_char_class[(UCHAR_MAX + CHAR_BIT - 1) / CHAR_BIT];

/* Test whether `ch` is a member of `klass`.
 *
 * `ch` is reduced to its `unsigned char` value first, so a byte that was read
 * through a signed `char` (and is therefore negative) addresses the same bit
 * as its unsigned counterpart instead of indexing before the array.
 *
 * Returns non-zero if the bit is set, 0 otherwise.
 */
static inline int cregex_char_class_contains(const cregex_char_class klass,
                                             int ch)
{
    unsigned char uc = (unsigned char) ch;
    return klass[uc / CHAR_BIT] & (1 << (uc % CHAR_BIT));
}

/* Add `ch` to `klass` (same `unsigned char` reduction as above).
 *
 * Returns `ch` unchanged.
 */
static inline int cregex_char_class_add(cregex_char_class klass, int ch)
{
    unsigned char uc = (unsigned char) ch;
    klass[uc / CHAR_BIT] |= 1 << (uc % CHAR_BIT);
    return ch;
}

/* One instruction of a compiled program. `opcode` selects which member of
 * the anonymous union is meaningful.
 */
typedef struct cregex_program_instr {
    cregex_program_opcode_t opcode;
    union {
        /* REGEX_PROGRAM_OPCODE_CHARACTER */
        struct {
            int ch;
        };
        /* REGEX_PROGRAM_OPCODE_CHARACTER_CLASS,
         * REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED
         */
        struct {
            cregex_char_class klass;
        };
        /* REGEX_PROGRAM_OPCODE_SPLIT */
        struct {
            struct cregex_program_instr *first, *second;
        };
        /* REGEX_PROGRAM_OPCODE_JUMP */
        struct {
            struct cregex_program_instr *target;
        };
        /* REGEX_PROGRAM_OPCODE_SAVE */
        struct {
            int save;
        };
    };
} cregex_program_instr_t;

/* A compiled program: instruction count followed by the instructions. */
typedef struct {
    int ninstructions;
    cregex_program_instr_t instructions[];
} cregex_program_t;

/* Run program on string.
 *
 * `matches` is an array of `nmatches` pointers that receives the saved
 * positions of the winning thread: slot 2k is the start of capture k and
 * slot 2k + 1 its end, with capture 0 being the whole match.
 *
 * Returns 1 on a match, 0 on no match, and -1 if the VM could not allocate
 * its working memory.
 */
int cregex_program_run(const cregex_program_t *program,
                       const char *string,
                       const char **matches,
                       int nmatches);

/* Compile a parsed pattern. Returns NULL on allocation failure; release the
 * result with cregex_compile_free().
 */
cregex_program_t *cregex_compile_node(const cregex_node_t *root);

/* Free a compiled program */
void cregex_compile_free(cregex_program_t *program);

/* Parse a pattern. Returns NULL if the pattern is malformed or memory could
 * not be allocated; release the result with cregex_parse_free(). Character
 * class nodes keep pointers into `pattern`, so it must outlive the tree.
 */
cregex_node_t *cregex_parse(const char *pattern);

/* Free a parsed pattern */
void cregex_parse_free(cregex_node_t *root);

#endif
-------------------------------------------------------------------------------- /mk/common.mk: -------------------------------------------------------------------------------- 1 | UNAME_S := $(shell uname -s) 2 | ifeq ($(UNAME_S),Darwin) 3 | PRINTF = printf 4 | else 5 | PRINTF = env printf 6 | endif 7 | 8 | # Control the build verbosity 9 | ifeq ("$(VERBOSE)","1") 10 | Q := 11 | VECHO = @true 12 | REDIR = 13 | else 14 | Q := @ 15 | VECHO = @$(PRINTF) 16 | REDIR = >/dev/null 17 | endif 18 | -------------------------------------------------------------------------------- /mk/test-data.mk: -------------------------------------------------------------------------------- 1 | TESTDATA = basic.dat nullsubexpr.dat repetition.dat 2 | TESTDATA := $(addprefix tests/,$(TESTDATA)) 3 | 4 | tests/basic.dat: 5 | $(VECHO) " Downloading $@ ...\n" 6 | $(Q)wget -q -O $@ https://golang.org/src/regexp/testdata/basic.dat?m=text 7 | # FIXME: clarify if it was an imcomplete test item 8 | $(Q)sed '/9876543210/d' $@ > tests/fixed 9 | mv -f tests/fixed $@ 10 | 11 | tests/nullsubexpr.dat: 12 | $(VECHO) " Downloading $@ ...\n" 13 | $(Q)wget -q -O $@ https://golang.org/src/regexp/testdata/nullsubexpr.dat?m=text 14 | 15 | tests/repetition.dat: 16 | $(VECHO) " Downloading $@ ...\n" 17 | $(Q)wget -q -O $@ https://golang.org/src/regexp/testdata/repetition.dat?m=text 18 | -------------------------------------------------------------------------------- /src/compile.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "cregex.h" 5 | 6 | typedef struct { 7 | cregex_program_instr_t *pc; 8 | int ncaptures; 9 | } regex_compile_context; 10 | 11 | static int count_instructions(const cregex_node_t *node) 12 | { 13 | switch (node->type) { 14 | case REGEX_NODE_TYPE_EPSILON: 15 | return 0; 16 | 17 | /* Characters */ 18 | case REGEX_NODE_TYPE_CHARACTER: 19 | case REGEX_NODE_TYPE_ANY_CHARACTER: 20 | case REGEX_NODE_TYPE_CHARACTER_CLASS: 21 
| case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED: 22 | return 1; 23 | 24 | /* Composites */ 25 | case REGEX_NODE_TYPE_CONCATENATION: 26 | return count_instructions(node->left) + count_instructions(node->right); 27 | case REGEX_NODE_TYPE_ALTERNATION: 28 | return 2 + count_instructions(node->left) + 29 | count_instructions(node->right); 30 | 31 | /* Quantifiers */ 32 | case REGEX_NODE_TYPE_QUANTIFIER: { 33 | int num = count_instructions(node->quantified); 34 | if (node->nmax >= node->nmin) 35 | return node->nmin * num + (node->nmax - node->nmin) * (num + 1); 36 | return 1 + (node->nmin ? node->nmin * num : num + 1); 37 | } 38 | 39 | /* Anchors */ 40 | case REGEX_NODE_TYPE_ANCHOR_BEGIN: 41 | case REGEX_NODE_TYPE_ANCHOR_END: 42 | return 1; 43 | 44 | /* Captures */ 45 | case REGEX_NODE_TYPE_CAPTURE: 46 | return 2 + count_instructions(node->captured); 47 | } 48 | 49 | /* should not reach here */ 50 | return 0; 51 | } 52 | 53 | static bool node_is_anchored(const cregex_node_t *node) 54 | { 55 | switch (node->type) { 56 | case REGEX_NODE_TYPE_EPSILON: 57 | return false; 58 | 59 | /* Characters */ 60 | case REGEX_NODE_TYPE_CHARACTER: 61 | case REGEX_NODE_TYPE_ANY_CHARACTER: 62 | case REGEX_NODE_TYPE_CHARACTER_CLASS: 63 | case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED: 64 | return false; 65 | 66 | /* Composites */ 67 | case REGEX_NODE_TYPE_CONCATENATION: 68 | return node_is_anchored(node->left); 69 | case REGEX_NODE_TYPE_ALTERNATION: 70 | return node_is_anchored(node->left) && node_is_anchored(node->right); 71 | 72 | /* Quantifiers */ 73 | case REGEX_NODE_TYPE_QUANTIFIER: 74 | return node_is_anchored(node->quantified); 75 | 76 | /* Anchors */ 77 | case REGEX_NODE_TYPE_ANCHOR_BEGIN: 78 | return true; 79 | case REGEX_NODE_TYPE_ANCHOR_END: 80 | return false; 81 | 82 | /* Captures */ 83 | case REGEX_NODE_TYPE_CAPTURE: 84 | return node_is_anchored(node->captured); 85 | } 86 | 87 | /* should not reach here */ 88 | return false; 89 | } 90 | 91 | static inline cregex_program_instr_t 
*emit( 92 | regex_compile_context *context, 93 | const cregex_program_instr_t *instruction) 94 | { 95 | *context->pc = *instruction; 96 | return context->pc++; 97 | } 98 | 99 | static cregex_program_instr_t *compile_char_class( 100 | const cregex_node_t *node, 101 | cregex_program_instr_t *instruction) 102 | { 103 | const char *sp = node->from; 104 | 105 | for (;;) { 106 | int ch = *sp++; 107 | switch (ch) { 108 | case ']': 109 | if (sp - 1 == node->from) 110 | goto CHARACTER; 111 | return instruction; 112 | case '\\': 113 | ch = *sp++; 114 | /* fall-through */ 115 | default: 116 | CHARACTER: 117 | if (*sp == '-' && sp[1] != ']') { 118 | for (; ch <= sp[1]; ++ch) 119 | cregex_char_class_add(instruction->klass, ch); 120 | sp += 2; 121 | } else { 122 | cregex_char_class_add(instruction->klass, ch); 123 | } 124 | break; 125 | } 126 | } 127 | } 128 | 129 | static cregex_program_instr_t *compile_context(regex_compile_context *context, 130 | const cregex_node_t *node) 131 | { 132 | cregex_program_instr_t *bottom = context->pc, *split, *jump; 133 | int ncaptures = context->ncaptures, capture; 134 | 135 | switch (node->type) { 136 | case REGEX_NODE_TYPE_EPSILON: 137 | break; 138 | 139 | /* Characters */ 140 | case REGEX_NODE_TYPE_CHARACTER: 141 | emit(context, 142 | &(cregex_program_instr_t){.opcode = REGEX_PROGRAM_OPCODE_CHARACTER, 143 | .ch = node->ch}); 144 | break; 145 | case REGEX_NODE_TYPE_ANY_CHARACTER: 146 | emit(context, &(cregex_program_instr_t){ 147 | .opcode = REGEX_PROGRAM_OPCODE_ANY_CHARACTER}); 148 | break; 149 | case REGEX_NODE_TYPE_CHARACTER_CLASS: 150 | compile_char_class( 151 | node, 152 | emit(context, &(cregex_program_instr_t){ 153 | .opcode = REGEX_PROGRAM_OPCODE_CHARACTER_CLASS})); 154 | break; 155 | case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED: 156 | compile_char_class( 157 | node, 158 | emit(context, 159 | &(cregex_program_instr_t){ 160 | .opcode = REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED})); 161 | break; 162 | 163 | /* Composites */ 164 | case 
REGEX_NODE_TYPE_CONCATENATION: 165 | compile_context(context, node->left); 166 | compile_context(context, node->right); 167 | break; 168 | case REGEX_NODE_TYPE_ALTERNATION: 169 | split = emit(context, &(cregex_program_instr_t){ 170 | .opcode = REGEX_PROGRAM_OPCODE_SPLIT}); 171 | split->first = compile_context(context, node->left); 172 | jump = emit(context, &(cregex_program_instr_t){ 173 | .opcode = REGEX_PROGRAM_OPCODE_JUMP}); 174 | split->second = compile_context(context, node->right); 175 | jump->target = context->pc; 176 | break; 177 | 178 | /* Quantifiers */ 179 | case REGEX_NODE_TYPE_QUANTIFIER: { 180 | cregex_program_instr_t *last = NULL; 181 | for (int i = 0; i < node->nmin; ++i) { 182 | context->ncaptures = ncaptures; 183 | last = compile_context(context, node->quantified); 184 | } 185 | if (node->nmax > node->nmin) { 186 | for (int i = 0; i < node->nmax - node->nmin; ++i) { 187 | context->ncaptures = ncaptures; 188 | split = 189 | emit(context, &(cregex_program_instr_t){ 190 | .opcode = REGEX_PROGRAM_OPCODE_SPLIT}); 191 | split->first = compile_context(context, node->quantified); 192 | split->second = context->pc; 193 | if (!node->greedy) { 194 | cregex_program_instr_t *swap = split->first; 195 | split->first = split->second; 196 | split->second = swap; 197 | } 198 | } 199 | } else if (node->nmax == -1) { 200 | split = emit(context, &(cregex_program_instr_t){ 201 | .opcode = REGEX_PROGRAM_OPCODE_SPLIT}); 202 | if (node->nmin == 0) { 203 | split->first = compile_context(context, node->quantified); 204 | jump = emit(context, &(cregex_program_instr_t){ 205 | .opcode = REGEX_PROGRAM_OPCODE_JUMP}); 206 | split->second = context->pc; 207 | jump->target = split; 208 | } else { 209 | split->first = last; 210 | split->second = context->pc; 211 | } 212 | if (!node->greedy) { 213 | cregex_program_instr_t *swap = split->first; 214 | split->first = split->second; 215 | split->second = swap; 216 | } 217 | } 218 | break; 219 | } 220 | 221 | /* Anchors */ 222 | case 
REGEX_NODE_TYPE_ANCHOR_BEGIN: 223 | emit(context, &(cregex_program_instr_t){ 224 | .opcode = REGEX_PROGRAM_OPCODE_ASSERT_BEGIN}); 225 | break; 226 | case REGEX_NODE_TYPE_ANCHOR_END: 227 | emit(context, &(cregex_program_instr_t){ 228 | .opcode = REGEX_PROGRAM_OPCODE_ASSERT_END}); 229 | break; 230 | 231 | /* Captures */ 232 | case REGEX_NODE_TYPE_CAPTURE: 233 | capture = context->ncaptures++ * 2; 234 | emit(context, 235 | &(cregex_program_instr_t){.opcode = REGEX_PROGRAM_OPCODE_SAVE, 236 | .save = capture}); 237 | compile_context(context, node->captured); 238 | emit(context, 239 | &(cregex_program_instr_t){.opcode = REGEX_PROGRAM_OPCODE_SAVE, 240 | .save = capture + 1}); 241 | break; 242 | } 243 | 244 | return bottom; 245 | } 246 | 247 | /* Compile a parsed pattern (using a previously allocated program with at least 248 | * estimate_instructions(root) instructions). 249 | */ 250 | static cregex_program_t *compile_node_with_program(const cregex_node_t *root, 251 | cregex_program_t *program) 252 | { 253 | /* add capture node for entire match */ 254 | root = &(cregex_node_t){.type = REGEX_NODE_TYPE_CAPTURE, 255 | .captured = (cregex_node_t *) root}; 256 | 257 | /* add .*? 
unless pattern starts with ^ */ 258 | if (!node_is_anchored(root)) 259 | root = &(cregex_node_t){ 260 | .type = REGEX_NODE_TYPE_CONCATENATION, 261 | .left = 262 | &(cregex_node_t){ 263 | .type = REGEX_NODE_TYPE_QUANTIFIER, 264 | .nmin = 0, 265 | .nmax = -1, 266 | .greedy = 0, 267 | .quantified = &( 268 | cregex_node_t){.type = REGEX_NODE_TYPE_ANY_CHARACTER}}, 269 | .right = (cregex_node_t *) root}; 270 | 271 | /* compile */ 272 | regex_compile_context *context = 273 | &(regex_compile_context){.pc = program->instructions, .ncaptures = 0}; 274 | compile_context(context, root); 275 | 276 | /* emit final match instruction */ 277 | emit(context, 278 | &(cregex_program_instr_t){.opcode = REGEX_PROGRAM_OPCODE_MATCH}); 279 | 280 | /* set total number of instructions */ 281 | program->ninstructions = context->pc - program->instructions; 282 | 283 | return program; 284 | } 285 | 286 | /* Upper bound of number of instructions required to compile parsed pattern. */ 287 | static int estimate_instructions(const cregex_node_t *root) 288 | { 289 | return count_instructions(root) 290 | /* .*? 
is added unless pattern starts with ^, 291 | * save instructions are added for beginning and end of match, 292 | * a final match instruction is added to the end of the program 293 | */ 294 | + !node_is_anchored(root) * 3 + 2 + 1; 295 | } 296 | 297 | cregex_program_t *cregex_compile_node(const cregex_node_t *root) 298 | { 299 | size_t size = sizeof(cregex_program_t) + 300 | sizeof(cregex_program_instr_t) * estimate_instructions(root); 301 | cregex_program_t *program; 302 | 303 | if (!(program = malloc(size))) 304 | return NULL; 305 | 306 | if (!compile_node_with_program(root, program)) { 307 | free(program); 308 | return NULL; 309 | } 310 | 311 | return program; 312 | } 313 | 314 | /* Free a compiled program */ 315 | void cregex_compile_free(cregex_program_t *program) 316 | { 317 | free(program); 318 | } 319 | -------------------------------------------------------------------------------- /src/parse.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "cregex.h" 6 | 7 | typedef struct { 8 | const char *sp; 9 | cregex_node_t *stack, *output; 10 | } regex_parse_context; 11 | 12 | /* Shunting-yard algorithm 13 | * See https://en.wikipedia.org/wiki/Shunting-yard_algorithm 14 | */ 15 | 16 | static inline cregex_node_t *push(regex_parse_context *context, 17 | const cregex_node_t *node) 18 | { 19 | assert(context->stack <= context->output); 20 | *context->stack = *node; 21 | return context->stack++; 22 | } 23 | 24 | static inline cregex_node_t *drop(regex_parse_context *context) 25 | { 26 | return --context->stack; 27 | } 28 | 29 | static inline cregex_node_t *consume(regex_parse_context *context) 30 | { 31 | *--context->output = *--context->stack; 32 | return context->output; 33 | } 34 | 35 | static inline cregex_node_t *concatenate(regex_parse_context *context, 36 | const cregex_node_t *bottom) 37 | { 38 | if (context->stack == bottom) 39 | push(context, &(cregex_node_t){.type = 
REGEX_NODE_TYPE_EPSILON}); 40 | else { 41 | while (context->stack - 1 > bottom) { 42 | cregex_node_t *right = consume(context); 43 | cregex_node_t *left = consume(context); 44 | push(context, 45 | &(cregex_node_t){.type = REGEX_NODE_TYPE_CONCATENATION, 46 | .left = left, 47 | .right = right}); 48 | } 49 | } 50 | return context->stack - 1; 51 | } 52 | 53 | static cregex_node_t *parse_char_class(regex_parse_context *context) 54 | { 55 | cregex_node_type type = 56 | (*context->sp == '^') 57 | ? (++context->sp, REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED) 58 | : REGEX_NODE_TYPE_CHARACTER_CLASS; 59 | const char *from = context->sp; 60 | 61 | for (;;) { 62 | int ch = *context->sp++; 63 | switch (ch) { 64 | case '\0': 65 | /* premature end of character class */ 66 | return NULL; 67 | case ']': 68 | if (context->sp - 1 == from) 69 | goto CHARACTER; 70 | return push(context, 71 | &(cregex_node_t){ 72 | .type = type, .from = from, .to = context->sp - 1}); 73 | case '\\': 74 | ch = *context->sp++; 75 | /* fall-through */ 76 | default: 77 | CHARACTER: 78 | if (*context->sp == '-' && context->sp[1] != ']') { 79 | if (context->sp[1] < ch) 80 | /* empty range in character class */ 81 | return NULL; 82 | context->sp += 2; 83 | } 84 | break; 85 | } 86 | } 87 | } 88 | 89 | static cregex_node_t *parse_interval(regex_parse_context *context) 90 | { 91 | const char *from = context->sp; 92 | int nmin, nmax; 93 | 94 | for (nmin = 0; *context->sp >= '0' && *context->sp <= '9'; ++context->sp) 95 | nmin = (nmin * 10) + (*context->sp - '0'); 96 | 97 | if (*context->sp == ',') { 98 | ++context->sp; 99 | if (*from != ',' && *context->sp == '}') 100 | nmax = -1; 101 | else { 102 | for (nmax = 0; *context->sp >= '0' && *context->sp <= '9'; 103 | ++context->sp) 104 | nmax = (nmax * 10) + (*context->sp - '0'); 105 | if (*(context->sp - 1) == ',' || *context->sp != '}' || 106 | nmax < nmin) { 107 | context->sp = from; 108 | return NULL; 109 | } 110 | } 111 | } else if (*from != '}' && *context->sp == 
'}') { 112 | nmax = nmin; 113 | } else { 114 | context->sp = from; 115 | return NULL; 116 | } 117 | 118 | ++context->sp; 119 | return push(context, 120 | &(cregex_node_t){ 121 | .type = REGEX_NODE_TYPE_QUANTIFIER, 122 | .nmin = nmin, 123 | .nmax = nmax, 124 | .greedy = (*context->sp == '?') ? (++context->sp, 0) : 1, 125 | .quantified = consume(context)}); 126 | } 127 | 128 | static cregex_node_t *parse_context(regex_parse_context *context, int depth) 129 | { 130 | cregex_node_t *bottom = context->stack; 131 | 132 | for (;;) { 133 | int ch = *context->sp++; 134 | switch (ch) { 135 | /* Characters */ 136 | case '\\': 137 | ch = *context->sp++; 138 | /* fall-through */ 139 | default: 140 | CHARACTER: 141 | push(context, 142 | &(cregex_node_t){.type = REGEX_NODE_TYPE_CHARACTER, .ch = ch}); 143 | break; 144 | case '.': 145 | push(context, 146 | &(cregex_node_t){.type = REGEX_NODE_TYPE_ANY_CHARACTER}); 147 | break; 148 | case '[': 149 | if (!parse_char_class(context)) 150 | return NULL; 151 | break; 152 | 153 | /* Composites */ 154 | case '|': { 155 | cregex_node_t *left = concatenate(context, bottom), *right; 156 | if (!(right = parse_context(context, depth))) 157 | return NULL; 158 | if (left->type == REGEX_NODE_TYPE_EPSILON && 159 | right->type == left->type) { 160 | drop(context); 161 | } else if (left->type == REGEX_NODE_TYPE_EPSILON) { 162 | right = consume(context); 163 | drop(context); 164 | push(context, 165 | &(cregex_node_t){.type = REGEX_NODE_TYPE_QUANTIFIER, 166 | .nmin = 0, 167 | .nmax = 1, 168 | .greedy = 1, 169 | .quantified = right}); 170 | } else if (right->type == REGEX_NODE_TYPE_EPSILON) { 171 | drop(context); 172 | left = consume(context); 173 | push(context, 174 | &(cregex_node_t){.type = REGEX_NODE_TYPE_QUANTIFIER, 175 | .nmin = 0, 176 | .nmax = 1, 177 | .greedy = 1, 178 | .quantified = left}); 179 | } else { 180 | right = consume(context); 181 | left = consume(context); 182 | push(context, 183 | &(cregex_node_t){.type = 
REGEX_NODE_TYPE_ALTERNATION, 184 | .left = left, 185 | .right = right}); 186 | } 187 | return bottom; 188 | } 189 | 190 | #define QUANTIFIER(ch, min, max) \ 191 | case ch: \ 192 | if (context->stack == bottom) \ 193 | goto CHARACTER; \ 194 | push(context, \ 195 | &(cregex_node_t){ \ 196 | .type = REGEX_NODE_TYPE_QUANTIFIER, \ 197 | .nmin = min, \ 198 | .nmax = max, \ 199 | .greedy = (*context->sp == '?') ? (++context->sp, 0) : 1, \ 200 | .quantified = consume(context)}); \ 201 | break 202 | 203 | /* clang-format off */ 204 | /* Quantifiers */ 205 | QUANTIFIER('?', 0, 1); 206 | QUANTIFIER('*', 0, -1); 207 | QUANTIFIER('+', 1, -1); 208 | /* clang-format on */ 209 | #undef QUANTIFIER 210 | 211 | case '{': 212 | if ((context->stack == bottom) || !parse_interval(context)) 213 | goto CHARACTER; 214 | break; 215 | 216 | /* Anchors */ 217 | case '^': 218 | push(context, 219 | &(cregex_node_t){.type = REGEX_NODE_TYPE_ANCHOR_BEGIN}); 220 | break; 221 | case '$': 222 | push(context, &(cregex_node_t){.type = REGEX_NODE_TYPE_ANCHOR_END}); 223 | break; 224 | 225 | /* Captures */ 226 | case '(': 227 | if (!parse_context(context, depth + 1)) 228 | return NULL; 229 | push(context, &(cregex_node_t){.type = REGEX_NODE_TYPE_CAPTURE, 230 | .captured = consume(context)}); 231 | break; 232 | case ')': 233 | if (depth > 0) 234 | return concatenate(context, bottom); 235 | /* unmatched close parenthesis */ 236 | return NULL; 237 | 238 | /* End of string */ 239 | case '\0': 240 | if (depth == 0) 241 | return concatenate(context, bottom); 242 | /* unmatched open parenthesis */ 243 | return NULL; 244 | } 245 | } 246 | } 247 | 248 | static inline int estimate_nodes(const char *pattern) 249 | { 250 | return strlen(pattern) * 2; 251 | } 252 | 253 | /* Parse a pattern (using a previously allocated buffer of at least 254 | * estimate_nodes(pattern) nodes). 
255 | */ 256 | static cregex_node_t *parse_with_nodes(const char *pattern, 257 | cregex_node_t *nodes) 258 | { 259 | regex_parse_context *context = 260 | &(regex_parse_context){.sp = pattern, 261 | .stack = nodes, 262 | .output = nodes + estimate_nodes(pattern)}; 263 | return parse_context(context, 0); 264 | } 265 | 266 | cregex_node_t *cregex_parse(const char *pattern) 267 | { 268 | size_t size = sizeof(cregex_node_t) * estimate_nodes(pattern); 269 | cregex_node_t *nodes = malloc(size); 270 | if (!nodes) 271 | return NULL; 272 | 273 | if (!parse_with_nodes(pattern, nodes)) { 274 | free(nodes); 275 | return NULL; 276 | } 277 | 278 | return nodes; 279 | } 280 | 281 | void cregex_parse_free(cregex_node_t *root) 282 | { 283 | free(root); 284 | } 285 | -------------------------------------------------------------------------------- /src/vm.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "cregex.h" 5 | 6 | #define REGEX_VM_MAX_MATCHES 20 7 | 8 | /* The VM executes one or more threads, each running a regular expression 9 | * program, which is just a list of regular expression instructions. Each 10 | * thread maintains two registers while it runs: a program counter (PC) and 11 | * a string pointer (SP). 
12 | */ 13 | typedef struct { 14 | int visited; 15 | const cregex_program_instr_t *pc; 16 | const char *matches[REGEX_VM_MAX_MATCHES]; 17 | } vm_thread; 18 | 19 | /* Run program on string */ 20 | static int vm_run(const cregex_program_t *program, 21 | const char *string, 22 | const char **matches, 23 | int nmatches); 24 | 25 | /* Run program on string (using a previously allocated buffer of at least 26 | * vm_estimate_threads(program) threads) 27 | */ 28 | static int vm_run_with_threads(const cregex_program_t *program, 29 | const char *string, 30 | const char **matches, 31 | int nmatches, 32 | vm_thread *threads); 33 | 34 | typedef struct { 35 | int nthreads; 36 | vm_thread *threads; 37 | } vm_thread_list; 38 | 39 | static void vm_add_thread(vm_thread_list *list, 40 | const cregex_program_t *program, 41 | const cregex_program_instr_t *pc, 42 | const char *string, 43 | const char *sp, 44 | const char **matches, 45 | int nmatches) 46 | { 47 | if (list->threads[pc - program->instructions].visited == sp - string + 1) 48 | return; 49 | list->threads[pc - program->instructions].visited = sp - string + 1; 50 | 51 | switch (pc->opcode) { 52 | case REGEX_PROGRAM_OPCODE_MATCH: 53 | /* fall-through */ 54 | 55 | /* Characters */ 56 | case REGEX_PROGRAM_OPCODE_CHARACTER: 57 | case REGEX_PROGRAM_OPCODE_ANY_CHARACTER: 58 | case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS: 59 | case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED: 60 | list->threads[list->nthreads].pc = pc; 61 | memcpy(list->threads[list->nthreads].matches, matches, 62 | sizeof(matches[0]) * ((nmatches <= REGEX_VM_MAX_MATCHES) 63 | ? 
nmatches 64 | : REGEX_VM_MAX_MATCHES)); 65 | ++list->nthreads; 66 | break; 67 | 68 | /* Control-flow */ 69 | case REGEX_PROGRAM_OPCODE_SPLIT: 70 | vm_add_thread(list, program, pc->first, string, sp, matches, nmatches); 71 | vm_add_thread(list, program, pc->second, string, sp, matches, nmatches); 72 | break; 73 | case REGEX_PROGRAM_OPCODE_JUMP: 74 | vm_add_thread(list, program, pc->target, string, sp, matches, nmatches); 75 | break; 76 | 77 | /* Assertions */ 78 | case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN: 79 | if (sp == string) 80 | vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches); 81 | break; 82 | case REGEX_PROGRAM_OPCODE_ASSERT_END: 83 | if (!*sp) 84 | vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches); 85 | break; 86 | 87 | /* Saving */ 88 | case REGEX_PROGRAM_OPCODE_SAVE: 89 | if (pc->save < nmatches && pc->save < REGEX_VM_MAX_MATCHES) { 90 | const char *saved = matches[pc->save]; 91 | matches[pc->save] = sp; 92 | vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches); 93 | matches[pc->save] = saved; 94 | } else { 95 | vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches); 96 | } 97 | break; 98 | } 99 | } 100 | 101 | /* Upper bound of number of threads required to run program */ 102 | static int vm_estimate_threads(const cregex_program_t *program) 103 | { 104 | return program->ninstructions * 2; 105 | } 106 | 107 | static int vm_run(const cregex_program_t *program, 108 | const char *string, 109 | const char **matches, 110 | int nmatches) 111 | { 112 | size_t size = sizeof(vm_thread) * vm_estimate_threads(program); 113 | vm_thread *threads; 114 | int matched; 115 | 116 | if (!(threads = malloc(size))) 117 | return -1; 118 | 119 | matched = vm_run_with_threads(program, string, matches, nmatches, threads); 120 | free(threads); 121 | return matched; 122 | } 123 | 124 | static int vm_run_with_threads(const cregex_program_t *program, 125 | const char *string, 126 | const char **matches, 127 | int nmatches, 
128 | vm_thread *threads) 129 | { 130 | vm_thread_list *current = 131 | &(vm_thread_list){.nthreads = 0, .threads = threads}; 132 | vm_thread_list *next = &(vm_thread_list){ 133 | .nthreads = 0, .threads = threads + program->ninstructions}; 134 | int matched = 0; 135 | 136 | memset(threads, 0, sizeof(vm_thread) * program->ninstructions * 2); 137 | 138 | vm_add_thread(current, program, program->instructions, string, string, 139 | matches, nmatches); 140 | 141 | for (const char *sp = string;; ++sp) { 142 | for (int i = 0; i < current->nthreads; ++i) { 143 | vm_thread *thread = current->threads + i; 144 | switch (thread->pc->opcode) { 145 | case REGEX_PROGRAM_OPCODE_MATCH: 146 | matched = 1; 147 | current->nthreads = 0; 148 | memcpy(matches, thread->matches, 149 | sizeof(matches[0]) * ((nmatches <= REGEX_VM_MAX_MATCHES) 150 | ? nmatches 151 | : REGEX_VM_MAX_MATCHES)); 152 | continue; 153 | 154 | /* Characters */ 155 | case REGEX_PROGRAM_OPCODE_CHARACTER: 156 | if (*sp == thread->pc->ch) 157 | break; 158 | continue; 159 | case REGEX_PROGRAM_OPCODE_ANY_CHARACTER: 160 | if (*sp) 161 | break; 162 | continue; 163 | case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS: 164 | if (cregex_char_class_contains(thread->pc->klass, *sp)) 165 | break; 166 | continue; 167 | case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED: 168 | if (!cregex_char_class_contains(thread->pc->klass, *sp)) 169 | break; 170 | continue; 171 | 172 | /* Control-flow */ 173 | case REGEX_PROGRAM_OPCODE_SPLIT: 174 | case REGEX_PROGRAM_OPCODE_JUMP: 175 | /* fall-through */ 176 | 177 | /* Assertions */ 178 | case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN: 179 | case REGEX_PROGRAM_OPCODE_ASSERT_END: 180 | /* fall-through */ 181 | 182 | /* Saving */ 183 | case REGEX_PROGRAM_OPCODE_SAVE: 184 | /* handled in vm_add_thread() */ 185 | abort(); 186 | } 187 | 188 | vm_add_thread(next, program, thread->pc + 1, string, sp + 1, 189 | thread->matches, nmatches); 190 | } 191 | 192 | /* swap current and next thread list */ 193 | vm_thread_list 
*swap = current; 194 | current = next; 195 | next = swap; 196 | next->nthreads = 0; 197 | 198 | /* done if no more threads are running or end of string reached */ 199 | if (current->nthreads == 0 || !*sp) 200 | break; 201 | } 202 | 203 | return matched; 204 | } 205 | 206 | int cregex_program_run(const cregex_program_t *program, 207 | const char *string, 208 | const char **matches, 209 | int nmatches) 210 | { 211 | return vm_run(program, string, matches, nmatches); 212 | } 213 | -------------------------------------------------------------------------------- /tests/cli.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | static void usage(FILE *file, const char *program) 9 | { 10 | fprintf(file, "usage: %s pattern [string...]\n", program); 11 | } 12 | 13 | static void print_node(FILE *file, cregex_node_t *node, int depth) 14 | { 15 | switch (node->type) { 16 | case REGEX_NODE_TYPE_EPSILON: 17 | fprintf(file, "epsilon"); 18 | break; 19 | 20 | /* Characters */ 21 | case REGEX_NODE_TYPE_CHARACTER: 22 | fprintf(file, isprint(node->ch) ? 
"character('%c')" : "character(%02x)", 23 | node->ch); 24 | break; 25 | case REGEX_NODE_TYPE_ANY_CHARACTER: 26 | fprintf(file, "any_character"); 27 | break; 28 | case REGEX_NODE_TYPE_CHARACTER_CLASS: 29 | fprintf(file, "character_class(\"%.*s\")", 30 | (int) (node->to - node->from), node->from); 31 | break; 32 | case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED: 33 | fprintf(file, "character_class_negated(\"%.*s\")", 34 | (int) (node->to - node->from), node->from); 35 | break; 36 | 37 | /* Composites */ 38 | case REGEX_NODE_TYPE_CONCATENATION: 39 | fprintf(file, "concatenation("); 40 | print_node(file, node->left, depth + 1); 41 | fprintf(file, ", "); 42 | print_node(file, node->right, depth + 1); 43 | fprintf(file, ")"); 44 | break; 45 | case REGEX_NODE_TYPE_ALTERNATION: 46 | fprintf(file, "alternation("); 47 | print_node(file, node->left, depth + 1); 48 | fprintf(file, ", "); 49 | print_node(file, node->right, depth + 1); 50 | fprintf(file, ")"); 51 | break; 52 | 53 | /* Quantifiers */ 54 | case REGEX_NODE_TYPE_QUANTIFIER: 55 | fprintf(file, "quantifier("); 56 | print_node(file, node->quantified, depth + 1); 57 | fprintf(file, ", %d, %d, %s)", node->nmin, node->nmax, 58 | node->greedy ? "greedy" : "non_greedy"); 59 | break; 60 | 61 | /* Anchors */ 62 | case REGEX_NODE_TYPE_ANCHOR_BEGIN: 63 | fprintf(file, "anchor_begin"); 64 | break; 65 | case REGEX_NODE_TYPE_ANCHOR_END: 66 | fprintf(file, "anchor_end"); 67 | break; 68 | 69 | /* Captures */ 70 | case REGEX_NODE_TYPE_CAPTURE: 71 | fprintf(file, "capture("); 72 | print_node(file, node->captured, depth + 1); 73 | fprintf(file, ")"); 74 | break; 75 | } 76 | 77 | if (depth == 0) 78 | fprintf(file, "\n"); 79 | } 80 | 81 | static void print_char_class(FILE *file, 82 | const cregex_program_instr_t *instruction) 83 | { 84 | for (int ch = 0, to; ch < UCHAR_MAX; ++ch) { 85 | if (cregex_char_class_contains(instruction->klass, ch)) { 86 | fprintf(file, isprint(ch) ? 
"%c" : "%02x", ch); 87 | for (to = ch + 1; 88 | cregex_char_class_contains(instruction->klass, to); ++to) 89 | ; 90 | if (to > ch + 2) { 91 | fprintf(file, isprint(to) ? "-%c" : "-%02x", to - 1); 92 | ch = to; 93 | } 94 | } 95 | } 96 | } 97 | 98 | static void print_instruction(FILE *file, 99 | const cregex_program_t *program, 100 | const cregex_program_instr_t *instruction) 101 | { 102 | fprintf(file, "[%04x] ", (int) (instruction - program->instructions)); 103 | 104 | switch (instruction->opcode) { 105 | case REGEX_PROGRAM_OPCODE_MATCH: 106 | fprintf(file, "MATCH\n"); 107 | break; 108 | 109 | /* Characters */ 110 | case REGEX_PROGRAM_OPCODE_CHARACTER: 111 | if (isprint(instruction->ch)) 112 | fprintf(file, "CHAR %c\n", instruction->ch); 113 | else 114 | fprintf(file, "CHAR %02x\n", instruction->ch); 115 | break; 116 | case REGEX_PROGRAM_OPCODE_ANY_CHARACTER: 117 | fprintf(file, "ANY_CHAR\n"); 118 | break; 119 | case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS: 120 | fprintf(file, "CHARACTER_CLASS ["); 121 | print_char_class(file, instruction); 122 | fprintf(file, "]\n"); 123 | break; 124 | case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED: 125 | fprintf(file, "CHARACTER_CLASS_NEGATED [^"); 126 | print_char_class(file, instruction); 127 | fprintf(file, "]\n"); 128 | break; 129 | 130 | /* Control-flow */ 131 | case REGEX_PROGRAM_OPCODE_JUMP: 132 | fprintf(file, "JUMP %04x\n", 133 | (int) (instruction->target - program->instructions)); 134 | break; 135 | case REGEX_PROGRAM_OPCODE_SPLIT: 136 | fprintf(file, "SPLIT %04x %04x\n", 137 | (int) (instruction->first - program->instructions), 138 | (int) (instruction->second - program->instructions)); 139 | break; 140 | 141 | /* Assertions */ 142 | case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN: 143 | fprintf(file, "ASSERT_BEGIN\n"); 144 | break; 145 | case REGEX_PROGRAM_OPCODE_ASSERT_END: 146 | fprintf(file, "ASSERT_END\n"); 147 | break; 148 | 149 | /* Saving */ 150 | case REGEX_PROGRAM_OPCODE_SAVE: 151 | fprintf(file, "SAVE %d\n", 
instruction->save); 152 | break; 153 | } 154 | } 155 | 156 | static void print_program(FILE *file, const cregex_program_t *program) 157 | { 158 | for (int i = 0; i < program->ninstructions; ++i) 159 | print_instruction(file, program, program->instructions + i); 160 | } 161 | 162 | int main(int argc, char *argv[]) 163 | { 164 | cregex_node_t *node; 165 | cregex_program_t *program; 166 | 167 | /* process command line */ 168 | if (argc < 2) { 169 | usage(stderr, argv[0]); 170 | return EXIT_FAILURE; 171 | } 172 | 173 | if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { 174 | usage(stdout, argv[0]); 175 | return EXIT_SUCCESS; 176 | } 177 | 178 | /* parse pattern */ 179 | if ((node = cregex_parse(argv[1]))) 180 | print_node(stdout, node, 0); 181 | else { 182 | fprintf(stderr, "%s: cregex_parse() failed\n", argv[0]); 183 | return EXIT_FAILURE; 184 | } 185 | 186 | /* compile parsed pattern */ 187 | program = cregex_compile_node(node); 188 | cregex_parse_free(node); 189 | if (program) 190 | print_program(stdout, program); 191 | else { 192 | fprintf(stderr, "%s: cregex_compile_node() failed\n", argv[0]); 193 | return EXIT_FAILURE; 194 | } 195 | 196 | /* run program on string(s) */ 197 | for (int i = 2; i < argc; ++i) { 198 | const char *matches[20] = {0}; 199 | 200 | if (cregex_program_run(program, argv[i], matches, 20) > 0) { 201 | int nmatches = 0; 202 | for (int j = 0; j < sizeof(matches) / sizeof(matches[0]); ++j) 203 | if (matches[j]) 204 | nmatches = j; 205 | 206 | printf("\"%s\": ", argv[i]); 207 | 208 | for (int j = 0; j <= nmatches; j += 2) { 209 | if (j > 0) 210 | printf(", "); 211 | if (matches[j] && matches[j + 1]) { 212 | printf("\"%.*s\"(%d,%d)", 213 | (int) (matches[j + 1] - matches[j]), matches[j], 214 | (int) (matches[j] - argv[i]), 215 | (int) (matches[j + 1] - argv[i])); 216 | } else { 217 | printf("(NULL,NULL)"); 218 | } 219 | } 220 | 221 | printf("\n"); 222 | } else { 223 | printf("\"%s\": no match\n", argv[i]); 224 | } 225 | } 226 | 227 
#!/usr/bin/env ruby
#
# tests/generator.rb: reads regex test-data files (named on the command line
# or piped on stdin) and writes a C test driver to stdout.
#
# Each accepted data line holds tab-separated fields:
#   options, pattern, string, captures
# and becomes one call to the generated test() helper.

# --- fixed prologue of the generated driver --------------------------------
puts <<-END
/* generated by #{$0}#{ARGV.size > 0 ? ' ' + ARGV.join(' ') : ''} */

#include <stdarg.h>
#include <stdio.h>

#include <cregex.h>

#ifdef __GNUC__
static void success(const char *source, const char *format, ...)
    __attribute__ ((format(printf, 2, 3)));
static void fail(const char *source, const char *format, ...)
    __attribute__ ((format(printf, 2, 3)));
#endif

static void success(const char *source, const char *format, ...)
{
    va_list ap;
    va_start(ap, format);
    printf("%s [\\x1b[32mSUCCESS\\x1b[0m] ", source);
    vprintf(format, ap);
    printf("\\n");
    va_end(ap);
}

static void fail(const char *source, const char *format, ...)
{
    va_list ap;
    va_start(ap, format);
    printf("%s [\\x1b[31mFAIL \\x1b[0m] ", source);
    vprintf(format, ap);
    printf("\\n");
    va_end(ap);
}

static int test(const char *source,
                const char *pattern, const char *string,
                int nmatches,
                ...)
{
    cregex_node_t *root;
    cregex_program_t *program;
    const char *matches[20] = {0};
    int result = 0;
    va_list ap;

    /* parse pattern */
    if (!(root = cregex_parse(pattern))) {
        fail(source, "cregex_parse() failed");
        return -1;
    }

    /* compile parsed pattern */
    program = cregex_compile_node(root);
    cregex_parse_free(root);
    if (!program) {
        fail(source, "cregex_compile_node() failed");
        return -1;
    }

    /* run program on string */
    if ((result = cregex_program_run(program, string, matches,
                                     sizeof (matches) / sizeof (matches[0]))) <
        0) {
        fail(source, "cregex_program_run() failed");
        cregex_compile_free(program);
        return -1;
    }

    va_start(ap, nmatches);
    if (result > 0) {
        if (nmatches > 0) {
            success(source, "/%s/ =~ \\"%s\\"", pattern, string);
            result = 0;
            for (int i = 0; i + 1 < nmatches &&
                            i + 1 < (int) (sizeof (matches) / sizeof (matches[0]));
                 i += 2) {
                int begin = va_arg(ap, int), end = va_arg(ap, int);
                if ((begin == -1 || begin == matches[i] - string) &&
                    (end == -1 || end == matches[i + 1] - string)) {
                    // success(source, "(%d,%d)", begin, end);
                } else if (matches[i] && matches[i + 1]) {
                    fail(source, "expected (%d,%d), got (%d,%d)", begin, end,
                         (int) (matches[i] - string),
                         (int) (matches[i + 1] - string));
                    result = -1;
                } else {
                    fail(source, "expected (%d,%d), got (NULL,NULL)", begin, end);
                    result = -1;
                }
            }
        } else {
            fail(source, "/%s/ =~ \\"%s\\"", pattern, string);
            result = -1;
        }
    } else if (result == 0) {
        if (nmatches == 0)
            success(source, "/%s/ !~ \\"%s\\"", pattern, string);
        else {
            fail(source, "/%s/ !~ \\"%s\\"", pattern, string);
            result = -1;
        }
    }

    va_end(ap);
    cregex_compile_free(program);
    return result;
}

int main(int argc, char *argv[])
{
    int nerrors = 0;
END

# --- one test() call per data line -----------------------------------------
filename = nil   # file currently being read, used to restart line numbering
previous = nil   # last pattern seen, substituted for the keyword SAME
ntests = 0

ARGF.each do |line|
  # ARGF counts lines across all inputs; restart at 1 for every new file so
  # the emitted "file:line" labels are per-file.
  if ARGF.filename != filename
    filename = ARGF.filename
    ARGF.lineno = 1
  end

  # drop a leading ":...:" prefix, then keep only lines whose first field
  # starts with one of the recognised option letters
  line = line.sub(/^:[^:]*:/, '')
  next unless line =~ /^[{BEASKL]+/

  options, pattern, string, captures = line.chomp.split(/\t+/)

  # A line with fewer than four fields would otherwise fail further down with
  # NoMethodError on nil; report it on stderr (stdout is the generated file).
  if captures.nil?
    warn "#{ARGF.filename}:#{ARGF.lineno}: skipping malformed test line"
    next
  end

  string = '' if string == 'NULL'
  pattern = previous if pattern == 'SAME'
  previous = pattern

  # Unless the '$' option is present, double every backslash so the text
  # survives being embedded in a C string literal.
  unless options.include?('$')
    pattern = pattern.gsub('\\', "\\\\\\\\")
    string = string.gsub('\\', "\\\\\\\\")
  end

  # Build the variadic tail of the test() call: 0 for NOMATCH, otherwise the
  # number of offsets followed by the offsets themselves ('?' becomes -1).
  # Joining count and offsets together avoids a dangling comma, and thus
  # invalid C, when the capture field contains no "(b,e)" pair.
  expected =
    if captures == 'NOMATCH'
      '0'
    else
      offsets = captures
                .scan(/\((.*?),(.*?)\)/)
                .flatten
                .map { |offset| offset == '?' ? -1 : offset.to_i }
      [offsets.size, *offsets].join(', ')
    end

  puts <<-END
    nerrors += test("#{ARGF.filename}:#{'%03d' % ARGF.lineno}", "#{pattern}", "#{string}",
                    #{expected});
  END
  ntests += 1
end

# --- fixed epilogue ---------------------------------------------------------
# test() returns -1 per failure, so nerrors is zero or negative.  The driver
# exits non-zero when any test failed so that `make check` can detect it.
puts <<-END
    printf("#{ntests} test(s), %d error(s).\\n", -nerrors);
    return nerrors ? 1 : 0;
}
END
*argv[]) 113 | { 114 | int nerrors = 0; 115 | END 116 | 117 | filename = nil 118 | previous = nil 119 | ntests = 0 120 | 121 | ARGF.each do |line| 122 | if ARGF.filename != filename 123 | filename = ARGF.filename 124 | ARGF.lineno = 1 125 | end 126 | 127 | line = line.sub(/^:[^:]*:/, '') 128 | next unless line =~ /^[{BEASKL]+/ 129 | 130 | options, pattern, string, captures = line.chomp.split(/\t+/) 131 | string = '' if string == 'NULL' 132 | pattern = previous if pattern == 'SAME' 133 | previous = pattern 134 | pattern = pattern.gsub('\\', "\\\\\\\\") unless options.include?('$') 135 | string = string .gsub('\\', "\\\\\\\\") unless options.include?('$') 136 | captures = captures == 'NOMATCH' \ 137 | ? captures 138 | : captures 139 | .scan(/\((.*?),(.*?)\)/) 140 | .flatten 141 | .map {|offset| offset == '?' ? -1 : offset.to_i } 142 | 143 | puts <<-END 144 | nerrors += test("#{ARGF.filename}:#{'%03d' % ARGF.lineno}", "#{pattern}", "#{string}", 145 | #{captures == 'NOMATCH' ? 0 : "#{captures.size}, #{captures.join(', ')}"}); 146 | END 147 | ntests += 1 148 | end 149 | 150 | puts <<-END 151 | printf("#{ntests} test(s), %d error(s).\\n", -nerrors); 152 | return 0; 153 | } 154 | END 155 | -------------------------------------------------------------------------------- /tests/re2dot.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | static void usage(FILE *file, const char *program) 8 | { 9 | fprintf(file, "usage: %s pattern\n", program); 10 | } 11 | 12 | static void print_node(FILE *file, const cregex_node_t *node) 13 | { 14 | switch (node->type) { 15 | case REGEX_NODE_TYPE_EPSILON: 16 | fprintf(file, 17 | "node%p[label=\"ε\",shape=box,fontname=\"times-italic\"];\n", 18 | (void *) node); 19 | break; 20 | 21 | /* Characters */ 22 | case REGEX_NODE_TYPE_CHARACTER: 23 | fprintf(file, 24 | "node%p[color=lightblue2,style=filled,label=\"'%c'\",shape=box," 25 | 
"fontname=\"courier\"];\n", 26 | (void *) node, node->ch); 27 | break; 28 | case REGEX_NODE_TYPE_ANY_CHARACTER: 29 | fprintf(file, 30 | "node%p[label=\"any\",shape=box" 31 | ",fontname=\"times-italic\"];\n", 32 | (void *) node); 33 | break; 34 | case REGEX_NODE_TYPE_CHARACTER_CLASS: 35 | fprintf(file, 36 | "node%p[label=\"[%.*s]\",shape=box,fontname=\"courier\"];\n", 37 | (void *) node, (int) (node->to - node->from), node->from); 38 | break; 39 | case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED: 40 | fprintf(file, 41 | "node%p[label=\"[^%.*s]\",shape=box,fontname=\"courier\"];\n", 42 | (void *) node, (int) (node->to - node->from), node->from); 43 | break; 44 | 45 | /* Composites */ 46 | case REGEX_NODE_TYPE_CONCATENATION: 47 | fprintf(file, 48 | "node%p[label=\"concatenation\",shape=box,style=\"rounded\"" 49 | ",fontname=\"times-italic\"];\n", 50 | (void *) node); 51 | print_node(file, node->left); 52 | fprintf(file, "node%p->node%p;\n", (void *) node, (void *) node->left); 53 | print_node(file, node->right); 54 | fprintf(file, "node%p->node%p;\n", (void *) node, (void *) node->right); 55 | break; 56 | case REGEX_NODE_TYPE_ALTERNATION: 57 | fprintf(file, 58 | "node%p[label=\"alternation\",shape=diamond,style=\"rounded\"" 59 | ",fontname=\"times-italic\"];\n", 60 | (void *) node); 61 | print_node(file, node->left); 62 | fprintf(file, "node%p->node%p;\n", (void *) node, (void *) node->left); 63 | print_node(file, node->right); 64 | fprintf(file, "node%p->node%p;\n", (void *) node, (void *) node->right); 65 | break; 66 | 67 | /* Quantifiers */ 68 | case REGEX_NODE_TYPE_QUANTIFIER: 69 | fprintf(file, "node%p[label=\"%d..", (void *) node, node->nmin); 70 | if (node->nmax == -1) 71 | fprintf(file, "INF"); 72 | else 73 | fprintf(file, "%d", node->nmax); 74 | if (node->nmin == 0) 75 | fprintf(file, "\",shape=ellipse,style=\"dotted\"];\n"); 76 | else 77 | fprintf(file, "\",shape=ellipse];\n"); 78 | print_node(file, node->quantified); 79 | fprintf(file, "node%p->node%p;\n", 
(void *) node, 80 | (void *) node->quantified); 81 | break; 82 | 83 | /* Anchors */ 84 | case REGEX_NODE_TYPE_ANCHOR_BEGIN: 85 | fprintf(file, "node%p[label=\"^\",shape=circle];\n", (void *) node); 86 | break; 87 | case REGEX_NODE_TYPE_ANCHOR_END: 88 | fprintf(file, "node%p[label=\"$\",shape=circle];\n", (void *) node); 89 | break; 90 | 91 | /* Captures */ 92 | case REGEX_NODE_TYPE_CAPTURE: 93 | fprintf(file, 94 | "node%p[label=\"capture\",shape=parallelogram," 95 | "style=\"rounded\",fontname=\"times-italic\"];\n", 96 | (void *) node); 97 | print_node(file, node->captured); 98 | fprintf(file, "node%p->node%p;\n", (void *) node, 99 | (void *) node->captured); 100 | break; 101 | } 102 | } 103 | 104 | static void print_dot(FILE *file, const cregex_node_t *node) 105 | { 106 | fprintf(file, "digraph cregex_ {\n"); 107 | print_node(file, node); 108 | fprintf(file, "}\n"); 109 | } 110 | 111 | int main(int argc, char *argv[]) 112 | { 113 | cregex_node_t *node; 114 | 115 | /* process command line */ 116 | if (argc < 2 || argc > 3) { 117 | usage(stderr, argv[0]); 118 | return EXIT_FAILURE; 119 | } 120 | 121 | if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { 122 | usage(stdout, argv[0]); 123 | return EXIT_SUCCESS; 124 | } 125 | 126 | /* parse pattern */ 127 | if ((node = cregex_parse(argv[1]))) 128 | print_dot(stdout, node); 129 | else { 130 | fprintf(stderr, "%s: cregex_parse() failed\n", argv[0]); 131 | return EXIT_FAILURE; 132 | } 133 | 134 | cregex_parse_free(node); 135 | return EXIT_SUCCESS; 136 | } 137 | --------------------------------------------------------------------------------