├── .clang-format
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── include
    └── cregex.h
├── mk
    ├── common.mk
    └── test-data.mk
├── src
    ├── compile.c
    ├── parse.c
    └── vm.c
└── tests
    ├── cli.c
    ├── generator.rb
    └── re2dot.c


/.clang-format:
--------------------------------------------------------------------------------
 1 | BasedOnStyle: Chromium
 2 | Language: Cpp
 3 | MaxEmptyLinesToKeep: 3
 4 | IndentCaseLabels: false
 5 | AllowShortIfStatementsOnASingleLine: false
 6 | AllowShortCaseLabelsOnASingleLine: false
 7 | AllowShortLoopsOnASingleLine: false
 8 | DerivePointerAlignment: false
 9 | PointerAlignment: Right
10 | SpaceAfterCStyleCast: true
11 | TabWidth: 4
12 | UseTab: Never
13 | IndentWidth: 4
14 | BreakBeforeBraces: Linux
15 | AccessModifierOffset: -4
16 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | tests/driver
 2 | tests/driver.o
 3 | tests/driver.c
 4 | tests/basic.dat
 5 | tests/nullsubexpr.dat
 6 | tests/repetition.dat
 7 | tests/cli
 8 | tests/re2dot
 9 | *.o
10 | *.o.d
11 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2021 National Cheng Kung University, Taiwan.
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice,
 8 |   this list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
21 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | PROGS := driver cli re2dot
 2 | PROGS := $(addprefix tests/,$(PROGS))
 3 | 
 4 | OBJS := src/compile.o \
 5 |         src/parse.o \
 6 |         src/vm.o
 7 | deps := $(OBJS:%.o=%.o.d) $(PROGS:%=%.o.d)
 8 | 
 9 | include mk/common.mk
10 | 
11 | CC      ?= gcc
12 | CFLAGS  += -std=c11 -Wall -pedantic
13 | CFLAGS  += -Iinclude 
14 | 
15 | .PHONY: all
16 | all: CFLAGS   += -DNDEBUG -O2
17 | all: $(OBJS) $(PROGS)
18 | 
19 | .PHONY: debug
20 | debug: CFLAGS   += -DDEBUG -g
21 | debug: LDFLAGS  += -g
22 | debug: $(OBJS) $(PROGS)
23 | 
24 | include mk/test-data.mk
25 | tests/driver.c: tests/generator.rb $(TESTDATA)
26 | 	$(VECHO) "  GEN\t$@\n"
27 | 	$(Q)tests/generator.rb $(TESTDATA) > $@
28 | check: tests/driver
29 | 	$(Q)$<
30 | 
31 | %.o: %.c
32 | 	$(VECHO) "  CC\t$@\n"
33 | 	$(Q)$(CC) $(CFLAGS) -MMD -MF $@.d -c -o $@ $<
34 | 
35 | tests/%: tests/%.o $(OBJS)
36 | 	$(VECHO) "  CC+LD\t$@\n"
37 | 	$(Q)$(CC) $(LDFLAGS) -o $@ $^
38 | 
39 | .PHONY: clean
40 | clean:
41 | 	$(RM) $(PROGS) $(PROGS:%=%.o) $(OBJS) $(deps)
42 | distclean: clean
43 | 	-$(RM) tests/driver.c $(TESTDATA)
44 | 
45 | -include $(deps)
46 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # cregex
 2 | 
 3 | `cregex` is a compact implementation of a [regular expression](https://en.wikipedia.org/wiki/Regular_expression)
 4 | (regex) matching engine in C. Its design was inspired by Rob Pike's regex code for the book "Beautiful Code"
 5 | [available online here](https://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html).
 6 | It is based on two papers by Russ Cox:
 7 | * [Regular Expression Matching Can Be Simple And Fast](https://swtch.com/~rsc/regexp/regexp1.html)
 8 | * [Regular Expression Matching: the Virtual Machine Approach](https://swtch.com/~rsc/regexp/regexp2.html)
 9 | 
10 | `cregex` supports a subset of the syntax and semantics of the [POSIX Basic Regular Expressions](https://www.regular-expressions.info/posix.html).
11 | The main design goal of `cregex` is to be small, correct, self contained and
12 | use few resources while retaining acceptable performance and feature completeness.
13 | 
14 | ## Features
15 | 
16 | * `^` and `$` anchors
17 | * `.` match any single character
18 | * `[...]` and `[^...]` character classes
19 | * `?`, `*`, `+`, and `{x,y}` greedy quantifiers
20 | * `??`, `*?`, `+?`, and `{x,y}?` non-greedy quantifiers
21 | * `(...)` capturing groups
22 | 
23 | ## Build and Test
24 | 
25 | Run `make` to build the library and the test programs.
26 | ```shell
27 | $ make
28 | ```
29 | 
30 | Run the test suite taken from the Go distribution.
31 | ```shell
32 | $ make check
33 | ```
34 | 
35 | Visualize the regular expressions with [Graphviz](https://graphviz.org/).
36 | ```shell
37 | $ tests/re2dot "(a*)(b{0,1})(b{1,})b{3}" | dot -Tpng -o out.png
38 | ```
39 | 
40 | ## License
41 | 
42 | `cregex` is freely redistributable under the BSD 2-Clause license.
43 | Use of this source code is governed by a BSD-style license that can be found in the `LICENSE` file.
44 | 


--------------------------------------------------------------------------------
/include/cregex.h:
--------------------------------------------------------------------------------
  1 | #ifndef CREGEX_H
  2 | #define CREGEX_H
  3 | 
/* Kinds of node in the parsed pattern tree; stored in cregex_node_t.type. */
typedef enum {
    /* Matches the empty string; the compiler emits no instruction for it. */
    REGEX_NODE_TYPE_EPSILON = 0,
    /* Characters */
    REGEX_NODE_TYPE_CHARACTER,
    REGEX_NODE_TYPE_ANY_CHARACTER,
    REGEX_NODE_TYPE_CHARACTER_CLASS,
    REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED,
    /* Composites */
    REGEX_NODE_TYPE_CONCATENATION,
    REGEX_NODE_TYPE_ALTERNATION,
    /* Quantifiers */
    REGEX_NODE_TYPE_QUANTIFIER,
    /* Anchors */
    REGEX_NODE_TYPE_ANCHOR_BEGIN,
    REGEX_NODE_TYPE_ANCHOR_END,
    /* Captures */
    REGEX_NODE_TYPE_CAPTURE
} cregex_node_type;
 22 | 
/* One node of the parsed pattern tree. `type` selects which member of the
 * anonymous union is meaningful.
 */
typedef struct cregex_node {
    cregex_node_type type;
    union {
        /* REGEX_NODE_TYPE_CHARACTER */
        struct {
            int ch; /* the literal character to match */
        };
        /* REGEX_NODE_TYPE_CHARACTER_CLASS,
         * REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED
         */
        struct {
            /* Both point into the pattern text: `from` at the first character
             * of the class body (after '[' and an optional '^'), `to` at the
             * closing ']'.
             */
            const char *from, *to;
        };
        /* REGEX_NODE_TYPE_QUANTIFIER */
        struct {
            /* Repeat `quantified` between nmin and nmax times; nmax == -1
             * means no upper bound. greedy == 0 marks a non-greedy (`?`
             * suffixed) quantifier.
             */
            int nmin, nmax, greedy;
            struct cregex_node *quantified;
        };
        /* REGEX_NODE_TYPE_CONCATENATION,
         * REGEX_NODE_TYPE_ALTERNATION
         */
        struct {
            struct cregex_node *left, *right;
        };
        /* REGEX_NODE_TYPE_CAPTURE */
        struct {
            struct cregex_node *captured; /* sub-pattern inside the group */
        };
    };
} cregex_node_t;
 53 | 
/* Opcodes of the compiled program; stored in cregex_program_instr_t.opcode. */
typedef enum {
    /* Emitted once, as the last instruction of every compiled program. */
    REGEX_PROGRAM_OPCODE_MATCH = 0,
    /* Characters */
    REGEX_PROGRAM_OPCODE_CHARACTER,
    REGEX_PROGRAM_OPCODE_ANY_CHARACTER,
    REGEX_PROGRAM_OPCODE_CHARACTER_CLASS,
    REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED,
    /* Control-flow */
    REGEX_PROGRAM_OPCODE_SPLIT,
    REGEX_PROGRAM_OPCODE_JUMP,
    /* Assertions */
    REGEX_PROGRAM_OPCODE_ASSERT_BEGIN,
    REGEX_PROGRAM_OPCODE_ASSERT_END,
    /* Saving */
    REGEX_PROGRAM_OPCODE_SAVE
} cregex_program_opcode_t;
 70 | 
 71 | #include <limits.h>
 72 | 
 73 | typedef char cregex_char_class[(UCHAR_MAX + CHAR_BIT - 1) / CHAR_BIT];
 74 | 
 75 | static inline int cregex_char_class_contains(const cregex_char_class klass,
 76 |                                              int ch)
 77 | {
 78 |     return klass[ch / CHAR_BIT] & (1 << ch % CHAR_BIT);
 79 | }
 80 | 
 81 | static inline int cregex_char_class_add(cregex_char_class klass, int ch)
 82 | {
 83 |     klass[ch / CHAR_BIT] |= 1 << (ch % CHAR_BIT);
 84 |     return ch;
 85 | }
 86 | 
/* One instruction of a compiled program. `opcode` selects which member of the
 * anonymous union is meaningful; MATCH, ANY_CHARACTER and the two assertions
 * carry no operand.
 */
typedef struct cregex_program_instr {
    cregex_program_opcode_t opcode;
    union {
        /* REGEX_PROGRAM_OPCODE_CHARACTER */
        struct {
            int ch; /* the literal character to match */
        };
        /* REGEX_PROGRAM_OPCODE_CHARACTER_CLASS,
         * REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED
         */
        struct {
            cregex_char_class klass; /* bitmap of the characters listed */
        };
        /* REGEX_PROGRAM_OPCODE_SPLIT */
        struct {
            /* The two branch targets. The compiler swaps them for non-greedy
             * quantifiers. NOTE(review): presumably `first` is the branch the
             * VM prefers -- confirm in src/vm.c.
             */
            struct cregex_program_instr *first, *second;
        };
        /* REGEX_PROGRAM_OPCODE_JUMP */
        struct {
            struct cregex_program_instr *target;
        };
        /* REGEX_PROGRAM_OPCODE_SAVE */
        struct {
            /* Slot index: 2 * group for the start of a capture group,
             * 2 * group + 1 for its end.
             */
            int save;
        };
    };
} cregex_program_instr_t;
114 | 
/* A compiled program: a header followed by its instructions in one allocation.
 * SPLIT/JUMP operands point into `instructions`.
 */
typedef struct {
    int ninstructions; /* number of instructions actually emitted */
    cregex_program_instr_t instructions[]; /* flexible array member */
} cregex_program_t;
119 | 
/* Run program on string.
 * NOTE(review): the return value and the layout of `matches` / `nmatches` are
 * defined by the implementation in src/vm.c -- confirm there before relying
 * on them.
 */
int cregex_program_run(const cregex_program_t *program,
                       const char *string,
                       const char **matches,
                       int nmatches);

/* Compile a parsed pattern.
 * Returns a newly malloc'ed program, or NULL if the allocation fails. Release
 * the result with cregex_compile_free().
 */
cregex_program_t *cregex_compile_node(const cregex_node_t *root);

/* Free a compiled program (the pointer is handed to free()). */
void cregex_compile_free(cregex_program_t *program);

/* Parse a pattern */
cregex_node_t *cregex_parse(const char *pattern);

/* Free a parsed pattern */
void cregex_parse_free(cregex_node_t *root);
137 | 
138 | #endif
139 | 


--------------------------------------------------------------------------------
/mk/common.mk:
--------------------------------------------------------------------------------
 1 | UNAME_S := $(shell uname -s)
 2 | ifeq ($(UNAME_S),Darwin)
 3 |     PRINTF = printf
 4 | else
 5 |     PRINTF = env printf
 6 | endif
 7 | 
 8 | # Control the build verbosity
 9 | ifeq ("$(VERBOSE)","1")
10 |     Q :=
11 |     VECHO = @true
12 |     REDIR =
13 | else
14 |     Q := @
15 |     VECHO = @$(PRINTF)
16 |     REDIR = >/dev/null
17 | endif
18 | 


--------------------------------------------------------------------------------
/mk/test-data.mk:
--------------------------------------------------------------------------------
 1 | TESTDATA = basic.dat nullsubexpr.dat repetition.dat
 2 | TESTDATA := $(addprefix tests/,$(TESTDATA))
 3 | 
 4 | tests/basic.dat:
 5 | 	$(VECHO) "  Downloading $@ ...\n"
 6 | 	$(Q)wget -q -O $@ https://golang.org/src/regexp/testdata/basic.dat?m=text
 7 | 	# FIXME: clarify whether this was an incomplete test item
 8 | 	$(Q)sed '/9876543210/d' $@ > tests/fixed
 9 | 	mv -f tests/fixed $@
10 | 
11 | tests/nullsubexpr.dat:
12 | 	$(VECHO) "  Downloading $@ ...\n"
13 | 	$(Q)wget -q -O $@ https://golang.org/src/regexp/testdata/nullsubexpr.dat?m=text
14 | 
15 | tests/repetition.dat:
16 | 	$(VECHO) "  Downloading $@ ...\n"
17 | 	$(Q)wget -q -O $@ https://golang.org/src/regexp/testdata/repetition.dat?m=text
18 | 


--------------------------------------------------------------------------------
/src/compile.c:
--------------------------------------------------------------------------------
  1 | #include <stdbool.h>
  2 | #include <stdlib.h>
  3 | 
  4 | #include "cregex.h"
  5 | 
/* Mutable state threaded through the recursive compiler. */
typedef struct {
    /* Next free instruction slot; emit() writes here and advances it. */
    cregex_program_instr_t *pc;
    /* Number of capture groups numbered so far; the next group gets SAVE
     * slots 2 * ncaptures and 2 * ncaptures + 1.
     */
    int ncaptures;
} regex_compile_context;
 10 | 
 11 | static int count_instructions(const cregex_node_t *node)
 12 | {
 13 |     switch (node->type) {
 14 |     case REGEX_NODE_TYPE_EPSILON:
 15 |         return 0;
 16 | 
 17 |     /* Characters */
 18 |     case REGEX_NODE_TYPE_CHARACTER:
 19 |     case REGEX_NODE_TYPE_ANY_CHARACTER:
 20 |     case REGEX_NODE_TYPE_CHARACTER_CLASS:
 21 |     case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
 22 |         return 1;
 23 | 
 24 |     /* Composites */
 25 |     case REGEX_NODE_TYPE_CONCATENATION:
 26 |         return count_instructions(node->left) + count_instructions(node->right);
 27 |     case REGEX_NODE_TYPE_ALTERNATION:
 28 |         return 2 + count_instructions(node->left) +
 29 |                count_instructions(node->right);
 30 | 
 31 |     /* Quantifiers */
 32 |     case REGEX_NODE_TYPE_QUANTIFIER: {
 33 |         int num = count_instructions(node->quantified);
 34 |         if (node->nmax >= node->nmin)
 35 |             return node->nmin * num + (node->nmax - node->nmin) * (num + 1);
 36 |         return 1 + (node->nmin ? node->nmin * num : num + 1);
 37 |     }
 38 | 
 39 |     /* Anchors */
 40 |     case REGEX_NODE_TYPE_ANCHOR_BEGIN:
 41 |     case REGEX_NODE_TYPE_ANCHOR_END:
 42 |         return 1;
 43 | 
 44 |     /* Captures */
 45 |     case REGEX_NODE_TYPE_CAPTURE:
 46 |         return 2 + count_instructions(node->captured);
 47 |     }
 48 | 
 49 |     /* should not reach here */
 50 |     return 0;
 51 | }
 52 | 
 53 | static bool node_is_anchored(const cregex_node_t *node)
 54 | {
 55 |     switch (node->type) {
 56 |     case REGEX_NODE_TYPE_EPSILON:
 57 |         return false;
 58 | 
 59 |     /* Characters */
 60 |     case REGEX_NODE_TYPE_CHARACTER:
 61 |     case REGEX_NODE_TYPE_ANY_CHARACTER:
 62 |     case REGEX_NODE_TYPE_CHARACTER_CLASS:
 63 |     case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
 64 |         return false;
 65 | 
 66 |     /* Composites */
 67 |     case REGEX_NODE_TYPE_CONCATENATION:
 68 |         return node_is_anchored(node->left);
 69 |     case REGEX_NODE_TYPE_ALTERNATION:
 70 |         return node_is_anchored(node->left) && node_is_anchored(node->right);
 71 | 
 72 |     /* Quantifiers */
 73 |     case REGEX_NODE_TYPE_QUANTIFIER:
 74 |         return node_is_anchored(node->quantified);
 75 | 
 76 |     /* Anchors */
 77 |     case REGEX_NODE_TYPE_ANCHOR_BEGIN:
 78 |         return true;
 79 |     case REGEX_NODE_TYPE_ANCHOR_END:
 80 |         return false;
 81 | 
 82 |     /* Captures */
 83 |     case REGEX_NODE_TYPE_CAPTURE:
 84 |         return node_is_anchored(node->captured);
 85 |     }
 86 | 
 87 |     /* should not reach here */
 88 |     return false;
 89 | }
 90 | 
 91 | static inline cregex_program_instr_t *emit(
 92 |     regex_compile_context *context,
 93 |     const cregex_program_instr_t *instruction)
 94 | {
 95 |     *context->pc = *instruction;
 96 |     return context->pc++;
 97 | }
 98 | 
 99 | static cregex_program_instr_t *compile_char_class(
100 |     const cregex_node_t *node,
101 |     cregex_program_instr_t *instruction)
102 | {
103 |     const char *sp = node->from;
104 | 
105 |     for (;;) {
106 |         int ch = *sp++;
107 |         switch (ch) {
108 |         case ']':
109 |             if (sp - 1 == node->from)
110 |                 goto CHARACTER;
111 |             return instruction;
112 |         case '\\':
113 |             ch = *sp++;
114 |             /* fall-through */
115 |         default:
116 |         CHARACTER:
117 |             if (*sp == '-' && sp[1] != ']') {
118 |                 for (; ch <= sp[1]; ++ch)
119 |                     cregex_char_class_add(instruction->klass, ch);
120 |                 sp += 2;
121 |             } else {
122 |                 cregex_char_class_add(instruction->klass, ch);
123 |             }
124 |             break;
125 |         }
126 |     }
127 | }
128 | 
/* Recursively emit the instructions for `node`, starting at context->pc.
 *
 * Returns the first instruction slot used for this subtree (the value of
 * context->pc on entry), which callers use as a branch target. No bounds
 * checking is done here: the instruction array must already be large enough.
 */
static cregex_program_instr_t *compile_context(regex_compile_context *context,
                                               const cregex_node_t *node)
{
    cregex_program_instr_t *bottom = context->pc, *split, *jump;
    /* capture counter on entry; restored before every repetition below so
     * that each copy of a quantified group reuses the same capture numbers
     */
    int ncaptures = context->ncaptures, capture;

    switch (node->type) {
    case REGEX_NODE_TYPE_EPSILON:
        break;

    /* Characters */
    case REGEX_NODE_TYPE_CHARACTER:
        emit(context,
             &(cregex_program_instr_t){.opcode = REGEX_PROGRAM_OPCODE_CHARACTER,
                                       .ch = node->ch});
        break;
    case REGEX_NODE_TYPE_ANY_CHARACTER:
        emit(context, &(cregex_program_instr_t){
                          .opcode = REGEX_PROGRAM_OPCODE_ANY_CHARACTER});
        break;
    case REGEX_NODE_TYPE_CHARACTER_CLASS:
        /* emit an instruction with an empty bitmap, then fill the bitmap */
        compile_char_class(
            node,
            emit(context, &(cregex_program_instr_t){
                              .opcode = REGEX_PROGRAM_OPCODE_CHARACTER_CLASS}));
        break;
    case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
        compile_char_class(
            node,
            emit(context,
                 &(cregex_program_instr_t){
                     .opcode = REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED}));
        break;

    /* Composites */
    case REGEX_NODE_TYPE_CONCATENATION:
        compile_context(context, node->left);
        compile_context(context, node->right);
        break;
    case REGEX_NODE_TYPE_ALTERNATION:
        /* layout: split L1, L2; L1: <left>; jump END; L2: <right>; END: */
        split = emit(context, &(cregex_program_instr_t){
                                  .opcode = REGEX_PROGRAM_OPCODE_SPLIT});
        split->first = compile_context(context, node->left);
        jump = emit(context, &(cregex_program_instr_t){
                                 .opcode = REGEX_PROGRAM_OPCODE_JUMP});
        split->second = compile_context(context, node->right);
        jump->target = context->pc;
        break;

    /* Quantifiers */
    case REGEX_NODE_TYPE_QUANTIFIER: {
        /* start of the most recent mandatory copy of the body */
        cregex_program_instr_t *last = NULL;
        /* mandatory part: the body is emitted nmin times back to back */
        for (int i = 0; i < node->nmin; ++i) {
            context->ncaptures = ncaptures;
            last = compile_context(context, node->quantified);
        }
        if (node->nmax > node->nmin) {
            /* bounded optional part: (nmax - nmin) times "split; <body>",
             * where each split can continue past its own copy
             */
            for (int i = 0; i < node->nmax - node->nmin; ++i) {
                context->ncaptures = ncaptures;
                split =
                    emit(context, &(cregex_program_instr_t){
                                      .opcode = REGEX_PROGRAM_OPCODE_SPLIT});
                split->first = compile_context(context, node->quantified);
                split->second = context->pc;
                /* non-greedy: exchange the two branch targets */
                if (!node->greedy) {
                    cregex_program_instr_t *swap = split->first;
                    split->first = split->second;
                    split->second = swap;
                }
            }
        } else if (node->nmax == -1) {
            /* unbounded repetition */
            split = emit(context, &(cregex_program_instr_t){
                                      .opcode = REGEX_PROGRAM_OPCODE_SPLIT});
            if (node->nmin == 0) {
                /* layout: L: split B, END; B: <body>; jump L; END: */
                split->first = compile_context(context, node->quantified);
                jump = emit(context, &(cregex_program_instr_t){
                                         .opcode = REGEX_PROGRAM_OPCODE_JUMP});
                split->second = context->pc;
                jump->target = split;
            } else {
                /* loop back to the last mandatory copy, or fall through */
                split->first = last;
                split->second = context->pc;
            }
            /* non-greedy: exchange the two branch targets */
            if (!node->greedy) {
                cregex_program_instr_t *swap = split->first;
                split->first = split->second;
                split->second = swap;
            }
        }
        break;
    }

    /* Anchors */
    case REGEX_NODE_TYPE_ANCHOR_BEGIN:
        emit(context, &(cregex_program_instr_t){
                          .opcode = REGEX_PROGRAM_OPCODE_ASSERT_BEGIN});
        break;
    case REGEX_NODE_TYPE_ANCHOR_END:
        emit(context, &(cregex_program_instr_t){
                          .opcode = REGEX_PROGRAM_OPCODE_ASSERT_END});
        break;

    /* Captures */
    case REGEX_NODE_TYPE_CAPTURE:
        /* groups are numbered in order of appearance; each owns two slots */
        capture = context->ncaptures++ * 2;
        emit(context,
             &(cregex_program_instr_t){.opcode = REGEX_PROGRAM_OPCODE_SAVE,
                                       .save = capture});
        compile_context(context, node->captured);
        emit(context,
             &(cregex_program_instr_t){.opcode = REGEX_PROGRAM_OPCODE_SAVE,
                                       .save = capture + 1});
        break;
    }

    return bottom;
}
246 | 
247 | /* Compile a parsed pattern (using a previously allocated program with at least
248 |  * estimate_instructions(root) instructions).
249 |  */
250 | static cregex_program_t *compile_node_with_program(const cregex_node_t *root,
251 |                                                    cregex_program_t *program)
252 | {
253 |     /* add capture node for entire match */
254 |     root = &(cregex_node_t){.type = REGEX_NODE_TYPE_CAPTURE,
255 |                             .captured = (cregex_node_t *) root};
256 | 
257 |     /* add .*? unless pattern starts with ^ */
258 |     if (!node_is_anchored(root))
259 |         root = &(cregex_node_t){
260 |             .type = REGEX_NODE_TYPE_CONCATENATION,
261 |             .left =
262 |                 &(cregex_node_t){
263 |                     .type = REGEX_NODE_TYPE_QUANTIFIER,
264 |                     .nmin = 0,
265 |                     .nmax = -1,
266 |                     .greedy = 0,
267 |                     .quantified = &(
268 |                         cregex_node_t){.type = REGEX_NODE_TYPE_ANY_CHARACTER}},
269 |             .right = (cregex_node_t *) root};
270 | 
271 |     /* compile */
272 |     regex_compile_context *context =
273 |         &(regex_compile_context){.pc = program->instructions, .ncaptures = 0};
274 |     compile_context(context, root);
275 | 
276 |     /* emit final match instruction */
277 |     emit(context,
278 |          &(cregex_program_instr_t){.opcode = REGEX_PROGRAM_OPCODE_MATCH});
279 | 
280 |     /* set total number of instructions */
281 |     program->ninstructions = context->pc - program->instructions;
282 | 
283 |     return program;
284 | }
285 | 
286 | /* Upper bound of number of instructions required to compile parsed pattern. */
287 | static int estimate_instructions(const cregex_node_t *root)
288 | {
289 |     return count_instructions(root)
290 |            /* .*? is added unless pattern starts with ^,
291 |             * save instructions are added for beginning and end of match,
292 |             * a final match instruction is added to the end of the program
293 |             */
294 |            + !node_is_anchored(root) * 3 + 2 + 1;
295 | }
296 | 
297 | cregex_program_t *cregex_compile_node(const cregex_node_t *root)
298 | {
299 |     size_t size = sizeof(cregex_program_t) +
300 |                   sizeof(cregex_program_instr_t) * estimate_instructions(root);
301 |     cregex_program_t *program;
302 | 
303 |     if (!(program = malloc(size)))
304 |         return NULL;
305 | 
306 |     if (!compile_node_with_program(root, program)) {
307 |         free(program);
308 |         return NULL;
309 |     }
310 | 
311 |     return program;
312 | }
313 | 
/* Free a compiled program.
 * The program is a single allocation (header plus instructions), so one
 * free() releases all of it.
 */
void cregex_compile_free(cregex_program_t *program)
{
    free(program);
}
319 | 


--------------------------------------------------------------------------------
/src/parse.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | 
  5 | #include "cregex.h"
  6 | 
/* Parser state. */
typedef struct {
    /* read position in the pattern text */
    const char *sp;
    /* `stack` is the next free slot of the working stack, which grows upward
     * (push() writes, then increments). `output` is the lowest used slot of
     * the finished nodes, which grow downward (consume() decrements, then
     * writes). push() asserts stack <= output.
     * NOTE(review): presumably both ends live in one buffer set up by
     * cregex_parse() -- confirm there.
     */
    cregex_node_t *stack, *output;
} regex_parse_context;
 11 | 
 12 | /* Shunting-yard algorithm
 13 |  * See https://en.wikipedia.org/wiki/Shunting-yard_algorithm
 14 |  */
 15 | 
 16 | static inline cregex_node_t *push(regex_parse_context *context,
 17 |                                   const cregex_node_t *node)
 18 | {
 19 |     assert(context->stack <= context->output);
 20 |     *context->stack = *node;
 21 |     return context->stack++;
 22 | }
 23 | 
 24 | static inline cregex_node_t *drop(regex_parse_context *context)
 25 | {
 26 |     return --context->stack;
 27 | }
 28 | 
 29 | static inline cregex_node_t *consume(regex_parse_context *context)
 30 | {
 31 |     *--context->output = *--context->stack;
 32 |     return context->output;
 33 | }
 34 | 
 35 | static inline cregex_node_t *concatenate(regex_parse_context *context,
 36 |                                          const cregex_node_t *bottom)
 37 | {
 38 |     if (context->stack == bottom)
 39 |         push(context, &(cregex_node_t){.type = REGEX_NODE_TYPE_EPSILON});
 40 |     else {
 41 |         while (context->stack - 1 > bottom) {
 42 |             cregex_node_t *right = consume(context);
 43 |             cregex_node_t *left = consume(context);
 44 |             push(context,
 45 |                  &(cregex_node_t){.type = REGEX_NODE_TYPE_CONCATENATION,
 46 |                                   .left = left,
 47 |                                   .right = right});
 48 |         }
 49 |     }
 50 |     return context->stack - 1;
 51 | }
 52 | 
 53 | static cregex_node_t *parse_char_class(regex_parse_context *context)
 54 | {
 55 |     cregex_node_type type =
 56 |         (*context->sp == '^')
 57 |             ? (++context->sp, REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED)
 58 |             : REGEX_NODE_TYPE_CHARACTER_CLASS;
 59 |     const char *from = context->sp;
 60 | 
 61 |     for (;;) {
 62 |         int ch = *context->sp++;
 63 |         switch (ch) {
 64 |         case '\0':
 65 |             /* premature end of character class */
 66 |             return NULL;
 67 |         case ']':
 68 |             if (context->sp - 1 == from)
 69 |                 goto CHARACTER;
 70 |             return push(context,
 71 |                         &(cregex_node_t){
 72 |                             .type = type, .from = from, .to = context->sp - 1});
 73 |         case '\\':
 74 |             ch = *context->sp++;
 75 |             /* fall-through */
 76 |         default:
 77 |         CHARACTER:
 78 |             if (*context->sp == '-' && context->sp[1] != ']') {
 79 |                 if (context->sp[1] < ch)
 80 |                     /* empty range in character class */
 81 |                     return NULL;
 82 |                 context->sp += 2;
 83 |             }
 84 |             break;
 85 |         }
 86 |     }
 87 | }
 88 | 
/* Parse an interval quantifier; context->sp points just past '{'.
 *
 * Accepted forms are {n}, {n,}, {,m} and {n,m} with n <= m, each optionally
 * followed by '?' to make the quantifier non-greedy. On success the node on
 * top of the working stack is consumed as the quantified operand and a
 * QUANTIFIER node is pushed in its place.
 *
 * Returns the pushed node, or NULL with context->sp restored to its value on
 * entry if the text is not a valid interval.
 *
 * NOTE(review): the digit loops accumulate into an int with no overflow
 * guard, so a very long run of digits overflows nmin/nmax.
 */
static cregex_node_t *parse_interval(regex_parse_context *context)
{
    const char *from = context->sp;
    int nmin, nmax;

    /* lower bound; stays 0 when there are no leading digits */
    for (nmin = 0; *context->sp >= '0' && *context->sp <= '9'; ++context->sp)
        nmin = (nmin * 10) + (*context->sp - '0');

    if (*context->sp == ',') {
        ++context->sp;
        /* "{n,}": no upper bound; requires digits before the comma */
        if (*from != ',' && *context->sp == '}')
            nmax = -1;
        else {
            for (nmax = 0; *context->sp >= '0' && *context->sp <= '9';
                 ++context->sp)
                nmax = (nmax * 10) + (*context->sp - '0');
            /* reject when no digits follow the comma, when '}' is missing,
             * or when the bounds are reversed
             */
            if (*(context->sp - 1) == ',' || *context->sp != '}' ||
                nmax < nmin) {
                context->sp = from;
                return NULL;
            }
        }
    } else if (*from != '}' && *context->sp == '}') {
        /* "{n}": exactly n repetitions */
        nmax = nmin;
    } else {
        context->sp = from;
        return NULL;
    }

    /* step over '}' */
    ++context->sp;
    return push(context,
                &(cregex_node_t){
                    .type = REGEX_NODE_TYPE_QUANTIFIER,
                    .nmin = nmin,
                    .nmax = nmax,
                    .greedy = (*context->sp == '?') ? (++context->sp, 0) : 1,
                    .quantified = consume(context)});
}
127 | 
128 | static cregex_node_t *parse_context(regex_parse_context *context, int depth)
129 | {
130 |     cregex_node_t *bottom = context->stack;
131 | 
132 |     for (;;) {
133 |         int ch = *context->sp++;
134 |         switch (ch) {
135 |         /* Characters */
136 |         case '\\':
137 |             ch = *context->sp++;
138 |             /* fall-through */
139 |         default:
140 |         CHARACTER:
141 |             push(context,
142 |                  &(cregex_node_t){.type = REGEX_NODE_TYPE_CHARACTER, .ch = ch});
143 |             break;
144 |         case '.':
145 |             push(context,
146 |                  &(cregex_node_t){.type = REGEX_NODE_TYPE_ANY_CHARACTER});
147 |             break;
148 |         case '[':
149 |             if (!parse_char_class(context))
150 |                 return NULL;
151 |             break;
152 | 
153 |         /* Composites */
154 |         case '|': {
155 |             cregex_node_t *left = concatenate(context, bottom), *right;
156 |             if (!(right = parse_context(context, depth)))
157 |                 return NULL;
158 |             if (left->type == REGEX_NODE_TYPE_EPSILON &&
159 |                 right->type == left->type) {
160 |                 drop(context);
161 |             } else if (left->type == REGEX_NODE_TYPE_EPSILON) {
162 |                 right = consume(context);
163 |                 drop(context);
164 |                 push(context,
165 |                      &(cregex_node_t){.type = REGEX_NODE_TYPE_QUANTIFIER,
166 |                                       .nmin = 0,
167 |                                       .nmax = 1,
168 |                                       .greedy = 1,
169 |                                       .quantified = right});
170 |             } else if (right->type == REGEX_NODE_TYPE_EPSILON) {
171 |                 drop(context);
172 |                 left = consume(context);
173 |                 push(context,
174 |                      &(cregex_node_t){.type = REGEX_NODE_TYPE_QUANTIFIER,
175 |                                       .nmin = 0,
176 |                                       .nmax = 1,
177 |                                       .greedy = 1,
178 |                                       .quantified = left});
179 |             } else {
180 |                 right = consume(context);
181 |                 left = consume(context);
182 |                 push(context,
183 |                      &(cregex_node_t){.type = REGEX_NODE_TYPE_ALTERNATION,
184 |                                       .left = left,
185 |                                       .right = right});
186 |             }
187 |             return bottom;
188 |         }
189 | 
190 | #define QUANTIFIER(ch, min, max)                                           \
191 |     case ch:                                                               \
192 |         if (context->stack == bottom)                                      \
193 |             goto CHARACTER;                                                \
194 |         push(context,                                                      \
195 |              &(cregex_node_t){                                             \
196 |                  .type = REGEX_NODE_TYPE_QUANTIFIER,                       \
197 |                  .nmin = min,                                              \
198 |                  .nmax = max,                                              \
199 |                  .greedy = (*context->sp == '?') ? (++context->sp, 0) : 1, \
200 |                  .quantified = consume(context)});                         \
201 |         break
202 | 
203 |             /* clang-format off */
204 |         /* Quantifiers */
205 |         QUANTIFIER('?', 0, 1);
206 |         QUANTIFIER('*', 0, -1);
207 |         QUANTIFIER('+', 1, -1);
208 |             /* clang-format on */
209 | #undef QUANTIFIER
210 | 
211 |         case '{':
212 |             if ((context->stack == bottom) || !parse_interval(context))
213 |                 goto CHARACTER;
214 |             break;
215 | 
216 |         /* Anchors */
217 |         case '^':
218 |             push(context,
219 |                  &(cregex_node_t){.type = REGEX_NODE_TYPE_ANCHOR_BEGIN});
220 |             break;
221 |         case '$':
222 |             push(context, &(cregex_node_t){.type = REGEX_NODE_TYPE_ANCHOR_END});
223 |             break;
224 | 
225 |         /* Captures */
226 |         case '(':
227 |             if (!parse_context(context, depth + 1))
228 |                 return NULL;
229 |             push(context, &(cregex_node_t){.type = REGEX_NODE_TYPE_CAPTURE,
230 |                                            .captured = consume(context)});
231 |             break;
232 |         case ')':
233 |             if (depth > 0)
234 |                 return concatenate(context, bottom);
235 |             /* unmatched close parenthesis */
236 |             return NULL;
237 | 
238 |         /* End of string */
239 |         case '\0':
240 |             if (depth == 0)
241 |                 return concatenate(context, bottom);
242 |             /* unmatched open parenthesis */
243 |             return NULL;
244 |         }
245 |     }
246 | }
247 | 
248 | static inline int estimate_nodes(const char *pattern)
249 | { /* Node budget for pattern; never 0, as cregex_parse() returns the buffer start as root */
250 |     return strlen(pattern) * 2 + 1; /* +1 keeps the empty pattern from getting malloc(0) */
251 | }
252 | 
253 | /* Run the parser over pattern using caller-provided storage; nodes must have
254 |  * room for at least estimate_nodes(pattern) entries.
255 |  */
256 | static cregex_node_t *parse_with_nodes(const char *pattern,
257 |                                        cregex_node_t *nodes)
258 | {
259 |     /* stack starts at the front of the buffer, output at its far end */
260 |     regex_parse_context ctx = {.sp = pattern,
261 |                                .stack = nodes,
262 |                                .output = nodes + estimate_nodes(pattern)};
263 |     return parse_context(&ctx, 0);
264 | }
265 | 
266 | cregex_node_t *cregex_parse(const char *pattern)
267 | {
268 |     /* One heap block holds the whole tree; cregex_parse_free() releases it. */
269 |     cregex_node_t *buffer =
270 |         malloc(sizeof(cregex_node_t) * estimate_nodes(pattern));
271 |     if (buffer == NULL)
272 |         return NULL;
273 | 
274 |     if (parse_with_nodes(pattern, buffer) != NULL)
275 |         return buffer; /* the buffer start is what callers use and later free */
276 | 
277 |     free(buffer); /* the parser rejected the pattern */
278 |     return NULL;
279 | }
280 | 
281 | void cregex_parse_free(cregex_node_t *root) /* root: value from cregex_parse() */
282 | {
283 |     free(root); /* cregex_parse() returns its single malloc'd buffer, so one free() */
284 | }
285 | 


--------------------------------------------------------------------------------
/src/vm.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <string.h>
  3 | 
  4 | #include "cregex.h"
  5 | 
  6 | #define REGEX_VM_MAX_MATCHES 20
  7 | 
  8 | /* The VM executes one or more threads, each running a regular expression
  9 |  * program, which is just a list of regular expression instructions. A thread
 10 |  * stores its program counter (pc) and its capture positions (matches); the
 11 |  * string pointer is shared by all threads of a step and kept by the run loop.
 12 |  */
 13 | typedef struct {
 14 |     int visited; /* mark written by vm_add_thread(), indexed by instruction: sp offset + 1 */
 15 |     const cregex_program_instr_t *pc;
 16 |     const char *matches[REGEX_VM_MAX_MATCHES];
 17 | } vm_thread;
 18 | 
 19 | /* Run program on string; returns 1 on match, 0 on no match, -1 if malloc fails */
 20 | static int vm_run(const cregex_program_t *program,
 21 |                   const char *string,
 22 |                   const char **matches,
 23 |                   int nmatches);
 24 | 
 25 | /* Run program on string (using a previously allocated buffer of at least
 26 |  * vm_estimate_threads(program) threads); returns 1 on a match, 0 otherwise
 27 |  */
 28 | static int vm_run_with_threads(const cregex_program_t *program,
 29 |                                const char *string,
 30 |                                const char **matches,
 31 |                                int nmatches,
 32 |                                vm_thread *threads);
 33 | 
 34 | typedef struct {
 35 |     int nthreads;       /* number of queued threads in threads[0..nthreads) */
 36 |     vm_thread *threads; /* thread slots; their visited fields double as per-instruction marks */
 37 | } vm_thread_list;
 38 | 
 39 | static void vm_add_thread(vm_thread_list *list,
 40 |                           const cregex_program_t *program,
 41 |                           const cregex_program_instr_t *pc,
 42 |                           const char *string,
 43 |                           const char *sp,
 44 |                           const char **matches,
 45 |                           int nmatches)
 46 | { /* Queue pc on list for input position sp, resolving opcodes that consume no input */
 47 |     if (list->threads[pc - program->instructions].visited == sp - string + 1)
 48 |         return; /* this instruction was already reached at this position */
 49 |     list->threads[pc - program->instructions].visited = sp - string + 1;
 50 |     /* the mark is indexed by instruction number, not by thread slot */
 51 |     switch (pc->opcode) {
 52 |     case REGEX_PROGRAM_OPCODE_MATCH:
 53 |         /* fall-through: MATCH is queued here and acted on by the run loop */
 54 | 
 55 |     /* Characters: stored as a thread; the run loop tests them against the input */
 56 |     case REGEX_PROGRAM_OPCODE_CHARACTER:
 57 |     case REGEX_PROGRAM_OPCODE_ANY_CHARACTER:
 58 |     case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS:
 59 |     case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED:
 60 |         list->threads[list->nthreads].pc = pc;
 61 |         memcpy(list->threads[list->nthreads].matches, matches,
 62 |                sizeof(matches[0]) * ((nmatches <= REGEX_VM_MAX_MATCHES)
 63 |                                          ? nmatches
 64 |                                          : REGEX_VM_MAX_MATCHES));
 65 |         ++list->nthreads; /* at most REGEX_VM_MAX_MATCHES slots were copied above */
 66 |         break;
 67 | 
 68 |     /* Control-flow: followed right away; first is queued ahead of second */
 69 |     case REGEX_PROGRAM_OPCODE_SPLIT:
 70 |         vm_add_thread(list, program, pc->first, string, sp, matches, nmatches);
 71 |         vm_add_thread(list, program, pc->second, string, sp, matches, nmatches);
 72 |         break;
 73 |     case REGEX_PROGRAM_OPCODE_JUMP:
 74 |         vm_add_thread(list, program, pc->target, string, sp, matches, nmatches);
 75 |         break;
 76 | 
 77 |     /* Assertions: continue with pc + 1 only if the position test holds */
 78 |     case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN:
 79 |         if (sp == string)
 80 |             vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches);
 81 |         break;
 82 |     case REGEX_PROGRAM_OPCODE_ASSERT_END:
 83 |         if (!*sp)
 84 |             vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches);
 85 |         break;
 86 | 
 87 |     /* Saving: slot pc->save holds sp only while the continuation is queued */
 88 |     case REGEX_PROGRAM_OPCODE_SAVE:
 89 |         if (pc->save < nmatches && pc->save < REGEX_VM_MAX_MATCHES) {
 90 |             const char *saved = matches[pc->save];
 91 |             matches[pc->save] = sp;
 92 |             vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches);
 93 |             matches[pc->save] = saved; /* restore the caller's view of the slot */
 94 |         } else {
 95 |             vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches);
 96 |         }
 97 |         break;
 98 |     }
 99 | }
100 | 
101 | /* Thread slots needed to run program: one list each for the current and next step */
102 | static int vm_estimate_threads(const cregex_program_t *program)
103 | {
104 |     return 2 * program->ninstructions;
105 | }
106 | 
107 | static int vm_run(const cregex_program_t *program,
108 |                   const char *string,
109 |                   const char **matches,
110 |                   int nmatches)
111 | {
112 |     /* Temporary thread storage, owned by this call and freed before return. */
113 |     vm_thread *pool = malloc(sizeof(vm_thread) * vm_estimate_threads(program));
114 |     int outcome;
115 | 
116 |     if (pool == NULL)
117 |         return -1; /* out of memory */
118 | 
119 |     outcome = vm_run_with_threads(program, string, matches, nmatches, pool);
120 |     free(pool);
121 |     return outcome;
122 | }
123 | 
124 | static int vm_run_with_threads(const cregex_program_t *program,
125 |                                const char *string,
126 |                                const char **matches,
127 |                                int nmatches,
128 |                                vm_thread *threads)
129 | { /* Step all threads over string in lockstep; returns 1 if a MATCH was reached, else 0 */
130 |     vm_thread_list *current =
131 |         &(vm_thread_list){.nthreads = 0, .threads = threads};
132 |     vm_thread_list *next = &(vm_thread_list){
133 |         .nthreads = 0, .threads = threads + program->ninstructions};
134 |     int matched = 0;
135 |     /* zeroing both halves also clears every visited mark */
136 |     memset(threads, 0, sizeof(vm_thread) * program->ninstructions * 2);
137 |     /* seed the first step with the first instruction at offset 0 */
138 |     vm_add_thread(current, program, program->instructions, string, string,
139 |                   matches, nmatches);
140 | 
141 |     for (const char *sp = string;; ++sp) {
142 |         for (int i = 0; i < current->nthreads; ++i) {
143 |             vm_thread *thread = current->threads + i;
144 |             switch (thread->pc->opcode) {
145 |             case REGEX_PROGRAM_OPCODE_MATCH:
146 |                 matched = 1;
147 |                 current->nthreads = 0; /* ends the inner loop: threads after this one are dropped */
148 |                 memcpy(matches, thread->matches,
149 |                        sizeof(matches[0]) * ((nmatches <= REGEX_VM_MAX_MATCHES)
150 |                                                  ? nmatches
151 |                                                  : REGEX_VM_MAX_MATCHES));
152 |                 continue;
153 |             /* NOTE(review): *sp is a plain char; confirm the class lookup accepts negative values */
154 |             /* Characters: break = *sp accepted, thread advances; continue = thread is dropped */
155 |             case REGEX_PROGRAM_OPCODE_CHARACTER:
156 |                 if (*sp == thread->pc->ch)
157 |                     break;
158 |                 continue;
159 |             case REGEX_PROGRAM_OPCODE_ANY_CHARACTER:
160 |                 if (*sp)
161 |                     break;
162 |                 continue;
163 |             case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS:
164 |                 if (cregex_char_class_contains(thread->pc->klass, *sp))
165 |                     break;
166 |                 continue;
167 |             case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED:
168 |                 if (!cregex_char_class_contains(thread->pc->klass, *sp))
169 |                     break;
170 |                 continue;
171 | 
172 |             /* Control-flow */
173 |             case REGEX_PROGRAM_OPCODE_SPLIT:
174 |             case REGEX_PROGRAM_OPCODE_JUMP:
175 |                 /* fall-through */
176 | 
177 |             /* Assertions */
178 |             case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN:
179 |             case REGEX_PROGRAM_OPCODE_ASSERT_END:
180 |                 /* fall-through */
181 | 
182 |             /* Saving */
183 |             case REGEX_PROGRAM_OPCODE_SAVE:
184 |                 /* handled in vm_add_thread(), which never queues these opcodes */
185 |                 abort();
186 |             }
187 |             /* reached only via break: the thread accepted *sp, so continue it at sp + 1 */
188 |             vm_add_thread(next, program, thread->pc + 1, string, sp + 1,
189 |                           thread->matches, nmatches);
190 |         }
191 | 
192 |         /* swap current and next thread list */
193 |         vm_thread_list *swap = current;
194 |         current = next;
195 |         next = swap;
196 |         next->nthreads = 0;
197 | 
198 |         /* done if no more threads are running or end of string reached */
199 |         if (current->nthreads == 0 || !*sp)
200 |             break;
201 |     }
202 | 
203 |     return matched;
204 | }
205 | 
206 | int cregex_program_run(const cregex_program_t *program,
207 |                        const char *string,
208 |                        const char **matches,
209 |                        int nmatches)
210 | { /* Public entry point: forwards to vm_run() and returns its result unchanged */
211 |     return vm_run(program, string, matches, nmatches);
212 | }
213 | 


--------------------------------------------------------------------------------
/tests/cli.c:
--------------------------------------------------------------------------------
  1 | #include <ctype.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <string.h>
  5 | 
  6 | #include <cregex.h>
  7 | 
  8 | static void usage(FILE *file, const char *program)
  9 | { /* Write the one-line usage text to file; program is shown as the command name */
 10 |     fprintf(file, "usage: %s pattern [string...]\n", program);
 11 | }
 12 | 
 13 | static void print_node(FILE *file, cregex_node_t *node, int depth)
 14 | { /* Print the subtree at node as nested terms; a newline follows only at depth 0 */
 15 |     switch (node->type) {
 16 |     case REGEX_NODE_TYPE_EPSILON:
 17 |         fprintf(file, "epsilon");
 18 |         break;
 19 | 
 20 |     /* Characters (ch is narrowed to unsigned char: isprint() needs 0..UCHAR_MAX or EOF) */
 21 |     case REGEX_NODE_TYPE_CHARACTER:
 22 |         fprintf(file, isprint((unsigned char) node->ch) ? "character('%c')"
 23 |                 : "character(%02x)", (unsigned char) node->ch);
 24 |         break;
 25 |     case REGEX_NODE_TYPE_ANY_CHARACTER:
 26 |         fprintf(file, "any_character");
 27 |         break;
 28 |     case REGEX_NODE_TYPE_CHARACTER_CLASS:
 29 |         fprintf(file, "character_class(\"%.*s\")",
 30 |                 (int) (node->to - node->from), node->from);
 31 |         break;
 32 |     case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
 33 |         fprintf(file, "character_class_negated(\"%.*s\")",
 34 |                 (int) (node->to - node->from), node->from);
 35 |         break;
 36 | 
 37 |     /* Composites: both children are printed recursively, left first */
 38 |     case REGEX_NODE_TYPE_CONCATENATION:
 39 |         fprintf(file, "concatenation(");
 40 |         print_node(file, node->left, depth + 1);
 41 |         fprintf(file, ", ");
 42 |         print_node(file, node->right, depth + 1);
 43 |         fprintf(file, ")");
 44 |         break;
 45 |     case REGEX_NODE_TYPE_ALTERNATION:
 46 |         fprintf(file, "alternation(");
 47 |         print_node(file, node->left, depth + 1);
 48 |         fprintf(file, ", ");
 49 |         print_node(file, node->right, depth + 1);
 50 |         fprintf(file, ")");
 51 |         break;
 52 | 
 53 |     /* Quantifiers: operand first, then nmin, nmax and the greediness flag */
 54 |     case REGEX_NODE_TYPE_QUANTIFIER:
 55 |         fprintf(file, "quantifier(");
 56 |         print_node(file, node->quantified, depth + 1);
 57 |         fprintf(file, ", %d, %d, %s)", node->nmin, node->nmax,
 58 |                 node->greedy ? "greedy" : "non_greedy");
 59 |         break;
 60 | 
 61 |     /* Anchors */
 62 |     case REGEX_NODE_TYPE_ANCHOR_BEGIN:
 63 |         fprintf(file, "anchor_begin");
 64 |         break;
 65 |     case REGEX_NODE_TYPE_ANCHOR_END:
 66 |         fprintf(file, "anchor_end");
 67 |         break;
 68 | 
 69 |     /* Captures */
 70 |     case REGEX_NODE_TYPE_CAPTURE:
 71 |         fprintf(file, "capture(");
 72 |         print_node(file, node->captured, depth + 1);
 73 |         fprintf(file, ")");
 74 |         break;
 75 |     }
 76 |     /* only the outermost call terminates the line */
 77 |     if (depth == 0)
 78 |         fprintf(file, "\n");
 79 | }
 80 | 
 81 | static void print_char_class(FILE *file,
 82 |                              const cregex_program_instr_t *instruction)
 83 | { /* Print the members of instruction->klass, folding runs of 3+ codes into "first-last" */
 84 |     for (int ch = 0, to; ch <= UCHAR_MAX; ++ch) { /* <=: code UCHAR_MAX is a candidate too */
 85 |         if (cregex_char_class_contains(instruction->klass, ch)) {
 86 |             fprintf(file, isprint(ch) ? "%c" : "%02x", ch);
 87 |             for (to = ch + 1; to <= UCHAR_MAX && /* never query past the last code */
 88 |                  cregex_char_class_contains(instruction->klass, to); ++to)
 89 |                 ;
 90 |             if (to > ch + 2) { /* to - 1 is the last member of the run, and the one printed */
 91 |                 fprintf(file, isprint(to - 1) ? "-%c" : "-%02x", to - 1);
 92 |                 ch = to; /* to itself is not a member, so ++ch may step over it */
 93 |             }
 94 |         }
 95 |     }
 96 | }
 97 | 
 98 | static void print_instruction(FILE *file,
 99 |                               const cregex_program_t *program,
100 |                               const cregex_program_instr_t *instruction)
101 | { /* Print one instruction as "[index] MNEMONIC operands" followed by a newline */
102 |     fprintf(file, "[%04x] ", (int) (instruction - program->instructions));
103 | 
104 |     switch (instruction->opcode) {
105 |     case REGEX_PROGRAM_OPCODE_MATCH:
106 |         fprintf(file, "MATCH\n");
107 |         break;
108 | 
109 |     /* Characters (cast: isprint() is only defined for unsigned char values or EOF) */
110 |     case REGEX_PROGRAM_OPCODE_CHARACTER:
111 |         if (isprint((unsigned char) instruction->ch))
112 |             fprintf(file, "CHAR %c\n", (unsigned char) instruction->ch);
113 |         else
114 |             fprintf(file, "CHAR %02x\n", (unsigned char) instruction->ch);
115 |         break;
116 |     case REGEX_PROGRAM_OPCODE_ANY_CHARACTER:
117 |         fprintf(file, "ANY_CHAR\n");
118 |         break;
119 |     case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS:
120 |         fprintf(file, "CHARACTER_CLASS [");
121 |         print_char_class(file, instruction);
122 |         fprintf(file, "]\n");
123 |         break;
124 |     case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED:
125 |         fprintf(file, "CHARACTER_CLASS_NEGATED [^");
126 |         print_char_class(file, instruction);
127 |         fprintf(file, "]\n");
128 |         break;
129 | 
130 |     /* Control-flow: targets are shown as indices into program->instructions */
131 |     case REGEX_PROGRAM_OPCODE_JUMP:
132 |         fprintf(file, "JUMP %04x\n",
133 |                 (int) (instruction->target - program->instructions));
134 |         break;
135 |     case REGEX_PROGRAM_OPCODE_SPLIT:
136 |         fprintf(file, "SPLIT %04x %04x\n",
137 |                 (int) (instruction->first - program->instructions),
138 |                 (int) (instruction->second - program->instructions));
139 |         break;
140 | 
141 |     /* Assertions */
142 |     case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN:
143 |         fprintf(file, "ASSERT_BEGIN\n");
144 |         break;
145 |     case REGEX_PROGRAM_OPCODE_ASSERT_END:
146 |         fprintf(file, "ASSERT_END\n");
147 |         break;
148 | 
149 |     /* Saving */
150 |     case REGEX_PROGRAM_OPCODE_SAVE:
151 |         fprintf(file, "SAVE %d\n", instruction->save);
152 |         break;
153 |     }
154 | }
155 | 
156 | static void print_program(FILE *file, const cregex_program_t *program)
157 | { /* Disassemble program to file, one instruction per line, in program order */
158 |     for (int index = 0; index < program->ninstructions; index++)
159 |         print_instruction(file, program, &program->instructions[index]);
160 | }
161 | 
162 | int main(int argc, char *argv[])
163 | { /* Parse argv[1], print its tree and compiled program, then match each further argument */
164 |     cregex_node_t *node;
165 |     cregex_program_t *program;
166 | 
167 |     /* process command line */
168 |     if (argc < 2) {
169 |         usage(stderr, argv[0]);
170 |         return EXIT_FAILURE;
171 |     }
172 | 
173 |     if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
174 |         usage(stdout, argv[0]);
175 |         return EXIT_SUCCESS;
176 |     }
177 | 
178 |     /* parse pattern */
179 |     if ((node = cregex_parse(argv[1])))
180 |         print_node(stdout, node, 0);
181 |     else {
182 |         fprintf(stderr, "%s: cregex_parse() failed\n", argv[0]);
183 |         return EXIT_FAILURE;
184 |     }
185 | 
186 |     /* compile parsed pattern; the tree is released right after compiling */
187 |     program = cregex_compile_node(node);
188 |     cregex_parse_free(node);
189 |     if (program)
190 |         print_program(stdout, program);
191 |     else {
192 |         fprintf(stderr, "%s: cregex_compile_node() failed\n", argv[0]);
193 |         return EXIT_FAILURE;
194 |     }
195 | 
196 |     /* run program on string(s) */
197 |     for (int i = 2; i < argc; ++i) {
198 |         const char *matches[20] = {0};
199 |         /* NOTE(review): a negative (error) result also lands in the "no match" branch */
200 |         if (cregex_program_run(program, argv[i], matches, 20) > 0) {
201 |             int nmatches = 0; /* becomes the index of the last non-NULL slot */
202 |             for (int j = 0; j < sizeof(matches) / sizeof(matches[0]); ++j)
203 |                 if (matches[j])
204 |                     nmatches = j;
205 | 
206 |             printf("\"%s\": ", argv[i]);
207 |             /* slots are consumed as (begin, end) pointer pairs into argv[i] */
208 |             for (int j = 0; j <= nmatches; j += 2) {
209 |                 if (j > 0)
210 |                     printf(", ");
211 |                 if (matches[j] && matches[j + 1]) {
212 |                     printf("\"%.*s\"(%d,%d)",
213 |                            (int) (matches[j + 1] - matches[j]), matches[j],
214 |                            (int) (matches[j] - argv[i]),
215 |                            (int) (matches[j + 1] - argv[i]));
216 |                 } else {
217 |                     printf("(NULL,NULL)");
218 |                 }
219 |             }
220 | 
221 |             printf("\n");
222 |         } else {
223 |             printf("\"%s\": no match\n", argv[i]);
224 |         }
225 |     }
226 | 
227 |     cregex_compile_free(program);
228 |     return EXIT_SUCCESS;
229 | }
230 | 


--------------------------------------------------------------------------------
/tests/generator.rb:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env ruby
  2 | 
  3 | puts <<-END
  4 | /* generated by #{$0}#{ARGV.size > 0 ? ' ' + ARGV.join(' ') : ''} */
  5 | 
  6 | #include <stdarg.h>
  7 | #include <stdio.h>
  8 | 
  9 | #include <cregex.h>
 10 | 
 11 | #ifdef __GNUC__
 12 | static void success(const char *source, const char *format, ...)
 13 |     __attribute__ ((format(printf, 2, 3)));
 14 | static void fail(const char *source, const char *format, ...)
 15 |     __attribute__ ((format(printf, 2, 3)));
 16 | #endif
 17 | 
 18 | static void success(const char *source, const char *format, ...)
 19 | {
 20 |     va_list ap;
 21 |     va_start(ap, format);
 22 |     printf("%s [\\x1b[32mSUCCESS\\x1b[0m] ", source);
 23 |     vprintf(format, ap);
 24 |     printf("\\n");
 25 |     va_end(ap);
 26 | }
 27 | 
 28 | static void fail(const char *source, const char *format, ...)
 29 | {
 30 |     va_list ap;
 31 |     va_start(ap, format);
 32 |     printf("%s [\\x1b[31mFAIL   \\x1b[0m] ", source);
 33 |     vprintf(format, ap);
 34 |     printf("\\n");
 35 |     va_end(ap);
 36 | }
 37 | 
 38 | static int test(const char *source,
 39 |                 const char *pattern, const char *string,
 40 |                 int nmatches,
 41 |                 ...)
 42 | {
 43 |     cregex_node_t *root;
 44 |     cregex_program_t *program;
 45 |     const char *matches[20] = {0};
 46 |     int result = 0;
 47 |     va_list ap;
 48 | 
 49 |     /* parse pattern */
 50 |     if (!(root = cregex_parse(pattern))) {
 51 |         fail(source, "cregex_parse() failed");
 52 |         return -1;
 53 |     }
 54 | 
 55 |     /* compile parsed pattern */
 56 |     program = cregex_compile_node(root);
 57 |     cregex_parse_free(root);
 58 |     if (!program) {
 59 |         fail(source, "cregex_compile_node() failed");
 60 |         return -1;
 61 |     }
 62 | 
 63 |     /* run program on string */
 64 |     if ((result = cregex_program_run(program, string, matches,
 65 |                                      sizeof (matches) / sizeof (matches[0]))) <
 66 |         0) {
 67 |         fail(source, "cregex_program_run() failed");
 68 |         cregex_compile_free(program);
 69 |         return -1;
 70 |     }
 71 | 
 72 |     va_start(ap, nmatches);
 73 |     if (result > 0) {
 74 |         if (nmatches > 0) {
 75 |             success(source, "/%s/ =~ \\"%s\\"", pattern, string);
 76 |             result = 0;
 77 |             for (int i = 0; i + 1 < nmatches &&
 78 |                             i + 1 < sizeof (matches) / sizeof (matches[0]);
 79 |                  i += 2) {
 80 |                 int begin = va_arg(ap, int), end = va_arg(ap, int);
 81 |                 if ((begin == -1 || begin == matches[i] - string) &&
 82 |                     (end == -1 || end == matches[i + 1] - string)) {
 83 |                     // success(source, "(%d,%d)", begin, end);
 84 |                 } else if (matches[i] && matches[i + 1]) {
 85 |                     fail(source, "expected (%d,%d), got (%d,%d)", begin, end,
 86 |                          (int) (matches[i] - string),
 87 |                          (int) (matches[i + 1] - string));
 88 |                     result = -1;
 89 |                 } else {
 90 |                     fail(source, "expected (%d,%d), got (NULL,NULL)", begin, end);
 91 |                     result = -1;
 92 |                 }
 93 |             }
 94 |         } else {
 95 |             fail(source, "/%s/ =~ \\"%s\\"", pattern, string);
 96 |             result = -1;
 97 |         }
 98 |     } else if (result == 0) {
 99 |         if (nmatches == 0)
100 |             success(source, "/%s/ !~ \\"%s\\"", pattern, string);
101 |         else {
102 |             fail(source, "/%s/ !~ \\"%s\\"", pattern, string);
103 |             result = -1;
104 |         }
105 |     }
106 | 
107 |     va_end(ap);
108 |     cregex_compile_free(program);
109 |     return result;
110 | }
111 | 
112 | int main(int argc, char *argv[])
113 | {
114 |     int nerrors = 0;
115 | END
116 | 
117 | filename = nil  # file currently being read; a change restarts the line counter
118 | previous = nil  # last pattern seen, substituted when a line says 'SAME'
119 | ntests   = 0    # number of test() calls emitted so far
120 | 
121 | ARGF.each do |line|
122 |   if ARGF.filename != filename
123 |     filename = ARGF.filename
124 |     ARGF.lineno = 1
125 |   end
126 |   # drop a leading ":...:" tag, then keep only lines starting with an option letter
127 |   line = line.sub(/^:[^:]*:/, '')
128 |   next unless line =~ /^[{BEASKL]+/
129 |   # fields are tab-separated: options, pattern, subject string, expected captures
130 |   options, pattern, string, captures = line.chomp.split(/\t+/)
131 |   string   = ''       if string  == 'NULL'
132 |   pattern  = previous if pattern == 'SAME'
133 |   previous = pattern
134 |   pattern  = pattern.gsub('\\', "\\\\\\\\") unless options.include?('$')  # double each backslash
135 |   string   = string .gsub('\\', "\\\\\\\\") unless options.include?('$')  # same for the subject
136 |   captures = captures == 'NOMATCH' \
137 |     ? captures
138 |     : captures
139 |         .scan(/\((.*?),(.*?)\)/)
140 |         .flatten
141 |         .map {|offset| offset == '?' ? -1 : offset.to_i }  # '?' becomes -1
142 |   # emit one test() call; the heredoc lines below are program output, not comments
143 |   puts <<-END
144 |   nerrors += test("#{ARGF.filename}:#{'%03d' % ARGF.lineno}", "#{pattern}", "#{string}",
145 |     #{captures == 'NOMATCH' ? 0 : "#{captures.size}, #{captures.join(', ')}"});
146 | END
147 |   ntests += 1
148 | end
149 | 
150 | puts <<-END
151 |     printf("#{ntests} test(s), %d error(s).\\n", -nerrors);
152 |     return 0;
153 | }
154 | END
155 | 


--------------------------------------------------------------------------------
/tests/re2dot.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | 
  5 | #include <cregex.h>
  6 | 
  7 | static void usage(FILE *file, const char *program)
  8 | { /* Write the one-line usage text to file; program is shown as the command name */
  9 |     fprintf(file, "usage: %s pattern\n", program);
 10 | }
 11 | 
 12 | static void print_node(FILE *file, const cregex_node_t *node)
 13 | { /* Emit a node statement for node, then recurse into children and emit the edges to them */
 14 |     switch (node->type) {
 15 |     case REGEX_NODE_TYPE_EPSILON:
 16 |         fprintf(file,
 17 |                 "node%p[label=\"ε\",shape=box,fontname=\"times-italic\"];\n",
 18 |                 (void *) node);
 19 |         break;
 20 | 
 21 |     /* Characters (NOTE(review): ch and class text go into the label unescaped) */
 22 |     case REGEX_NODE_TYPE_CHARACTER:
 23 |         fprintf(file,
 24 |                 "node%p[color=lightblue2,style=filled,label=\"'%c'\",shape=box,"
 25 |                 "fontname=\"courier\"];\n",
 26 |                 (void *) node, node->ch);
 27 |         break;
 28 |     case REGEX_NODE_TYPE_ANY_CHARACTER:
 29 |         fprintf(file,
 30 |                 "node%p[label=\"any\",shape=box"
 31 |                 ",fontname=\"times-italic\"];\n",
 32 |                 (void *) node);
 33 |         break;
 34 |     case REGEX_NODE_TYPE_CHARACTER_CLASS:
 35 |         fprintf(file,
 36 |                 "node%p[label=\"[%.*s]\",shape=box,fontname=\"courier\"];\n",
 37 |                 (void *) node, (int) (node->to - node->from), node->from);
 38 |         break;
 39 |     case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
 40 |         fprintf(file,
 41 |                 "node%p[label=\"[^%.*s]\",shape=box,fontname=\"courier\"];\n",
 42 |                 (void *) node, (int) (node->to - node->from), node->from);
 43 |         break;
 44 | 
 45 |     /* Composites: each child's statements come before the edge pointing to it */
 46 |     case REGEX_NODE_TYPE_CONCATENATION:
 47 |         fprintf(file,
 48 |                 "node%p[label=\"concatenation\",shape=box,style=\"rounded\""
 49 |                 ",fontname=\"times-italic\"];\n",
 50 |                 (void *) node);
 51 |         print_node(file, node->left);
 52 |         fprintf(file, "node%p->node%p;\n", (void *) node, (void *) node->left);
 53 |         print_node(file, node->right);
 54 |         fprintf(file, "node%p->node%p;\n", (void *) node, (void *) node->right);
 55 |         break;
 56 |     case REGEX_NODE_TYPE_ALTERNATION:
 57 |         fprintf(file,
 58 |                 "node%p[label=\"alternation\",shape=diamond,style=\"rounded\""
 59 |                 ",fontname=\"times-italic\"];\n",
 60 |                 (void *) node);
 61 |         print_node(file, node->left);
 62 |         fprintf(file, "node%p->node%p;\n", (void *) node, (void *) node->left);
 63 |         print_node(file, node->right);
 64 |         fprintf(file, "node%p->node%p;\n", (void *) node, (void *) node->right);
 65 |         break;
 66 | 
 67 |     /* Quantifiers: label is "nmin..nmax", with nmax == -1 shown as INF */
 68 |     case REGEX_NODE_TYPE_QUANTIFIER:
 69 |         fprintf(file, "node%p[label=\"%d..", (void *) node, node->nmin);
 70 |         if (node->nmax == -1)
 71 |             fprintf(file, "INF");
 72 |         else
 73 |             fprintf(file, "%d", node->nmax);
 74 |         if (node->nmin == 0) /* a zero minimum gets the dotted outline */
 75 |             fprintf(file, "\",shape=ellipse,style=\"dotted\"];\n");
 76 |         else
 77 |             fprintf(file, "\",shape=ellipse];\n");
 78 |         print_node(file, node->quantified);
 79 |         fprintf(file, "node%p->node%p;\n", (void *) node,
 80 |                 (void *) node->quantified);
 81 |         break;
 82 | 
 83 |     /* Anchors */
 84 |     case REGEX_NODE_TYPE_ANCHOR_BEGIN:
 85 |         fprintf(file, "node%p[label=\"^\",shape=circle];\n", (void *) node);
 86 |         break;
 87 |     case REGEX_NODE_TYPE_ANCHOR_END:
 88 |         fprintf(file, "node%p[label=\"$\",shape=circle];\n", (void *) node);
 89 |         break;
 90 | 
 91 |     /* Captures */
 92 |     case REGEX_NODE_TYPE_CAPTURE:
 93 |         fprintf(file,
 94 |                 "node%p[label=\"capture\",shape=parallelogram,"
 95 |                 "style=\"rounded\",fontname=\"times-italic\"];\n",
 96 |                 (void *) node);
 97 |         print_node(file, node->captured);
 98 |         fprintf(file, "node%p->node%p;\n", (void *) node,
 99 |                 (void *) node->captured);
100 |         break;
101 |     }
102 | }
103 | 
104 | static void print_dot(FILE *file, const cregex_node_t *node)
105 | { /* Wrap the statements print_node() emits for node in a "digraph cregex_ { ... }" block */
106 |     fprintf(file, "digraph cregex_ {\n");
107 |     print_node(file, node);
108 |     fprintf(file, "}\n");
109 | }
110 | 
111 | int main(int argc, char *argv[])
112 | { /* Parse argv[1] and print its tree as a digraph on stdout; fails if the parse fails */
113 |     cregex_node_t *node;
114 | 
115 |     /* process command line */
116 |     if (argc < 2 || argc > 3) { /* NOTE(review): argv[2] is accepted but never read */
117 |         usage(stderr, argv[0]);
118 |         return EXIT_FAILURE;
119 |     }
120 | 
121 |     if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
122 |         usage(stdout, argv[0]);
123 |         return EXIT_SUCCESS;
124 |     }
125 | 
126 |     /* parse pattern */
127 |     if ((node = cregex_parse(argv[1])))
128 |         print_dot(stdout, node);
129 |     else {
130 |         fprintf(stderr, "%s: cregex_parse() failed\n", argv[0]);
131 |         return EXIT_FAILURE;
132 |     }
133 | 
134 |     cregex_parse_free(node);
135 |     return EXIT_SUCCESS;
136 | }
137 | 


--------------------------------------------------------------------------------