├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── LICENSE
├── README.md
├── build.zig
├── build.zig.zon
├── example
    └── example.c
├── include
    └── regex.h
└── src
    ├── all_test.zig
    ├── c_regex.zig
    ├── compile.zig
    ├── debug.zig
    ├── exec.zig
    ├── input.zig
    ├── parse.zig
    ├── parse_test.zig
    ├── range_set.zig
    ├── regex.zig
    ├── regex_test.zig
    ├── vm_backtrack.zig
    ├── vm_pike.zig
    └── vm_test.zig


/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [master]
 6 |   pull_request:
 7 |     branches: ['*']
 8 | 
 9 | permissions:
10 |   contents: read
11 | 
12 | jobs:
13 | 
14 |   lint:
15 |     name: Lint
16 |     runs-on: ubuntu-latest
17 |     steps:
18 |       - name: Checkout code
19 |         uses: actions/checkout@v4
20 | 
21 |       - uses: goto-bus-stop/setup-zig@v2
22 |         with:
23 |           version: 0.14.0  # most recent stable
24 | 
25 |       - name: Check formatting
26 |         run: zig fmt --check .
27 | 
28 |   test:
29 |     name: Test / Zig ${{ matrix.zig-version }}
30 |     runs-on: ubuntu-latest
31 |     continue-on-error: ${{ matrix.allow-fail }}
32 | 
33 |     strategy:
34 |       matrix:
35 |         zig-version: ['0.14.0']
36 |         os: [ubuntu-latest]
37 |         allow-fail: [false]
38 |         include:
39 |           # Test against Zig master but don't break from it.
40 |           # master is a constantly moving target,
41 |           # so we'll fix issues on a best-effort basis.
42 |           - zig-version: master
43 |             os: ubuntu-latest
44 |             allow-fail: true
45 | 
46 |     steps:
47 |       - name: Checkout code
48 |         uses: actions/checkout@v4
49 | 
50 |       - name: Set up Zig
51 |         uses: goto-bus-stop/setup-zig@v2
52 |         with:
53 |           version: ${{ matrix.zig-version }}
54 | 
55 |       - name: Run tests
56 |         run: zig build test
57 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | zig-cache
2 | zig-out
3 | .zig-*
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2018 Marc Tiehuis
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | An automaton-based regex implementation for [zig](http://ziglang.org/).
 2 | 
 3 | Note: This is still a work in progress and many things still need to be done.
 4 | 
 5 |  - [x] Capture group support
 6 |  - [ ] UTF-8 support
 7 |  - [ ] More tests (plus some automated tests/fuzzing)
 8 |  - [x] Add a PikeVM implementation
 9 |  - [ ] Literal optimizations and just general performance improvements.
10 | 
11 | ## Usage
12 | 
13 | ```zig
14 | const debug = @import("std").debug;
15 | const Regex = @import("regex").Regex;
16 | 
17 | test "example" {
18 |     var re = try Regex.compile(debug.global_allocator, "\\w+");
19 | 
20 |     debug.assert(try re.match("hej") == true);
21 | }
22 | ```
23 | 
24 | ## API
25 | 
26 | ### Regex
27 | 
28 | ```zig
29 | fn compile(a: Allocator, re: []const u8) !Regex
30 | ```
31 | 
32 | Compiles a regex string, returning any errors during parsing/compiling.
33 | 
34 | ---
35 | 
36 | ```zig
37 | pub fn match(re: *Regex, input: []const u8) !bool
38 | ```
39 | 
40 | Match a compiled regex against some input. The input must be matched in its
41 | entirety and from the first index.
42 | 
43 | ---
44 | 
45 | ```zig
46 | pub fn partialMatch(re: *Regex, input: []const u8) !bool
47 | ```
48 | 
49 | Match a compiled regex against some input. Unlike `match`, this finds the
50 | leftmost match and does not have to be anchored to the start of `input`.
51 | 
52 | ---
53 | 
54 | ```zig
55 | pub fn captures(re: *Regex, input: []const u8) !?Captures
56 | ```
57 | 
58 | Match a compiled regex against some input. Returns the slices matched by each
59 | capture group, with the first (index 0) being the entire match.
60 | 
61 | If no match was found, null is returned.
62 | 
63 | ### Captures
64 | 
65 | ```zig
66 | pub fn sliceAt(captures: *const Captures, n: usize) ?[]const u8
67 | ```
68 | 
69 | Return the sub-slice for the numbered capture group. 0 refers to the entire
70 | match.
71 | 
72 | ```zig
73 | pub fn boundsAt(captures: *const Captures, n: usize) ?Span
74 | ```
75 | 
76 | Return the lower and upper byte positions for the specified capture group.
77 | 
78 | We can retrieve the sub-slice using this function:
79 | 
80 | ```zig
81 | const span = caps.boundsAt(0).?;
82 | debug.assert(mem.eql(u8, caps.sliceAt(0).?, input[span.lower..span.upper]));
83 | ```
84 | 
85 | ---
86 | 
87 | ## References
88 | 
89 | See the following useful sources:
90 |  - https://swtch.com/~rsc/regexp/
91 |  - [Rust Regex Library](https://github.com/rust-lang/regex)
92 |  - [Go Regex Library](https://github.com/golang/go/tree/master/src/regexp)
93 | 


--------------------------------------------------------------------------------
/build.zig:
--------------------------------------------------------------------------------
 1 | const std = @import("std");
 2 | 
 3 | // Declares the build graph: the `regex` module, the `test` step, the static and
 4 | // shared C libraries, and the `c-example` step for the C sample program.
 5 | pub fn build(b: *std.Build) void {
 6 |     const target = b.standardTargetOptions(.{});
 7 |     const optimize = b.standardOptimizeOption(.{});
 8 | 
 9 |     if (@hasDecl(std.Build, "CreateModuleOptions")) {
10 |         // Zig 0.11
11 |         _ = b.addModule("regex", .{
12 |             .source_file = .{ .path = "src/regex.zig" },
13 |         });
14 |     } else {
15 |         // Zig 0.12-dev.2159
16 |         _ = b.addModule("regex", .{
17 |             .root_source_file = path(b, "src/regex.zig"),
18 |         });
19 |     }
20 | 
21 |     // library tests, rooted at src/all_test.zig
22 |     const library_tests = b.addTest(.{
23 |         .root_source_file = path(b, "src/all_test.zig"),
24 |         .target = target,
25 |         .optimize = optimize,
26 |     });
27 |     const run_library_tests = b.addRunArtifact(library_tests);
28 | 
29 |     const test_step = b.step("test", "Run all tests");
30 |     test_step.dependOn(&run_library_tests.step);
31 | 
32 |     // C library: static and shared builds of src/c_regex.zig, both installed
33 |     const staticLib = b.addStaticLibrary(.{
34 |         .name = "regex",
35 |         .root_source_file = path(b, "src/c_regex.zig"),
36 |         .target = target,
37 |         .optimize = optimize,
38 |     });
39 |     staticLib.linkLibC();
40 | 
41 |     b.installArtifact(staticLib);
42 | 
43 |     const sharedLib = b.addSharedLibrary(.{
44 |         .name = "regex",
45 |         .root_source_file = path(b, "src/c_regex.zig"),
46 |         .target = target,
47 |         .optimize = optimize,
48 |     });
49 |     sharedLib.linkLibC();
50 | 
51 |     b.installArtifact(sharedLib);
52 | 
53 |     // C example: example/example.c compiled against include/ and the static library
54 |     const c_example = b.addExecutable(.{
55 |         .name = "example",
56 |         .target = target,
57 |         .optimize = optimize,
58 |     });
59 |     c_example.addCSourceFile(.{
60 |         .file = path(b, "example/example.c"),
61 |         .flags = &.{"-Wall"},
62 |     });
63 |     c_example.addIncludePath(path(b, "include"));
64 |     c_example.linkLibC();
65 |     c_example.linkLibrary(staticLib);
66 | 
67 |     // The step depends on the compile steps only; no run or install step is added here.
68 |     const c_example_step = b.step("c-example", "Example using C API");
69 |     c_example_step.dependOn(&staticLib.step);
70 |     c_example_step.dependOn(&c_example.step);
71 | 
72 |     // The default step also depends on the test step.
73 |     b.default_step.dependOn(test_step);
74 | }
71 | 
72 | // Builds a LazyPath for `sub_path`, using `std.Build.path` when the running Zig
73 | // provides it and the older `.path` union field otherwise.
74 | fn path(b: *std.Build, sub_path: []const u8) std.Build.LazyPath {
75 |     if (@hasDecl(std.Build, "path")) {
76 |         // Zig 0.13-dev.267
77 |         return b.path(sub_path);
78 |     } else {
79 |         return .{ .path = sub_path };
80 |     }
81 | }
80 | 


--------------------------------------------------------------------------------
/build.zig.zon:
--------------------------------------------------------------------------------
 1 | .{
 2 |     .name = .regex,
 3 |     .version = "0.1.2",
 4 |     .minimum_zig_version = "0.14.0",
 5 |     .paths = .{
 6 |         "src",
 7 |         "build.zig",
 8 |         "build.zig.zon",
 9 |         "LICENSE",
10 |         "README.md",
11 |     },
12 |     .fingerprint = 0x4204f8cae7b7106b,
13 | }
14 | 


--------------------------------------------------------------------------------
/example/example.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include "regex.h"
 3 | 
 4 | int main() {
 5 |   // Compile the pattern once; a NULL handle signals a compile failure.
 6 |   zre_regex *pattern = zre_compile(".*world.*");
 7 |   if (pattern == NULL) {
 8 |     printf("Regex compile error\n");
 9 |     return 1;
10 |   }
11 | 
12 |   // Only a successful match is reported.
13 |   bool matched = zre_match(pattern, "Hello world!");
14 |   if (matched) {
15 |     printf("Match!\n");
16 |   }
17 | 
18 |   zre_deinit(pattern);
19 |   return 0;
20 | }
16 | 


--------------------------------------------------------------------------------
/include/regex.h:
--------------------------------------------------------------------------------
 1 | // C API for the zig-regex library.
 2 | //
 3 | // The guard avoids identifiers containing a double underscore, which are
 4 | // reserved for the implementation.
 5 | #ifndef ZRE_REGEX_H
 6 | #define ZRE_REGEX_H
 7 | 
 8 | #include <stdbool.h>
 9 | #include <stddef.h>
10 | 
11 | #ifdef __cplusplus
12 | extern "C" {
13 | #endif
14 | 
15 | // Opaque handle to a compiled regular expression.
16 | typedef struct zre_regex zre_regex;
17 | // Opaque handle to the capture groups of one match.
18 | typedef struct zre_captures zre_captures;
19 | 
20 | // Lower and upper byte positions of a capture group.
21 | typedef struct zre_captures_span {
22 |   size_t lower;
23 |   size_t upper;
24 | } zre_captures_span;
25 | 
26 | // Compile the NUL-terminated pattern `input`. Returns NULL on failure.
27 | // Release the result with zre_deinit.
28 | extern zre_regex* zre_compile(const char* input);
29 | 
30 | // Match `input` against `re`. Returns false on no match or on an internal error.
31 | extern bool zre_match(zre_regex* re, const char* input);
32 | 
33 | // Like zre_match, but uses the library's partial (unanchored) match.
34 | extern bool zre_partial_match(zre_regex* re, const char* input);
35 | 
36 | // Destroy a regex returned by zre_compile.
37 | extern void zre_deinit(zre_regex* re);
38 | 
39 | // Match `input` and return its capture groups, or NULL when there is no match
40 | // or an error occurred. Release the result with zre_captures_deinit.
41 | extern zre_captures* zre_captures_all(zre_regex* re, const char* input);
42 | 
43 | // Number of capture groups held by `cap`.
44 | extern size_t zre_captures_len(const zre_captures* cap);
45 | 
46 | // Pointer to the text of capture group `n`, or NULL if that group has no
47 | // slice. When `len` is non-NULL and a slice exists, its length is stored in `*len`.
48 | extern const char* zre_captures_slice_at(const zre_captures* cap, size_t n, size_t* len);
49 | 
50 | // Store the bounds of capture group `n` in `*sp` and return true, or return
51 | // false (leaving `*sp` untouched) if the group has no bounds.
52 | extern bool zre_captures_bounds_at(const zre_captures* cap, zre_captures_span* sp, size_t n);
53 | 
54 | // Destroy a capture set returned by zre_captures_all.
55 | extern void zre_captures_deinit(zre_captures* cap);
56 | 
57 | #ifdef __cplusplus
58 | }
59 | #endif
60 | 
61 | #endif // ZRE_REGEX_H
34 | 


--------------------------------------------------------------------------------
/src/all_test.zig:
--------------------------------------------------------------------------------
1 | // Aggregates the library's test files. build.zig uses this file as the root of
2 | // the test artifact; each file is referenced here, presumably so that its tests
3 | // are analysed and run as part of `zig build test`.
4 | test "all" {
5 |     _ = @import("range_set.zig");
6 |     _ = @import("parse_test.zig");
7 |     _ = @import("vm_test.zig");
8 |     _ = @import("regex_test.zig");
9 | }
7 | 


--------------------------------------------------------------------------------
/src/c_regex.zig:
--------------------------------------------------------------------------------
 1 | //! C API for the zig-regex library
 2 | 
 3 | const std = @import("std");
 4 | 
 5 | const regex = @import("regex.zig");
 6 | const Regex = regex.Regex;
 7 | const Captures = regex.Captures;
 8 | 
 9 | // Opaque handle types exposed to C; the exported functions cast them to and
10 | // from the Zig `Regex` and `Captures` types.
11 | const zre_regex = opaque {};
12 | const zre_captures = opaque {};
13 | 
14 | // Span of a capture group as seen from C (two `usize` fields, `extern` layout).
15 | const zre_captures_span = extern struct {
16 |     lower: usize,
17 |     upper: usize,
18 | };
19 | 
20 | // Allocator used for every object created and destroyed through the C API.
21 | var allocator = std.heap.c_allocator;
18 | 
19 | /// Compiles the NUL-terminated pattern `input` into a heap-allocated `Regex`.
20 | ///
21 | /// Returns null if `input` is null, if allocation fails, or if the pattern
22 | /// does not compile. A non-null result must be released with `zre_deinit`.
23 | export fn zre_compile(input: ?[*:0]const u8) ?*zre_regex {
24 |     const pattern = input orelse return null;
25 |     const r = allocator.create(Regex) catch return null;
26 |     r.* = Regex.compile(allocator, std.mem.span(pattern)) catch {
27 |         // Compilation failed: free the holder instead of leaking it.
28 |         allocator.destroy(r);
29 |         return null;
30 |     };
31 |     return @ptrCast(r);
32 | }
24 | 
25 | // Runs `Regex.match` on the NUL-terminated `input`; any matcher error is
26 | // reported to the caller as `false`.
27 | export fn zre_match(re: ?*zre_regex, input: ?[*:0]const u8) bool {
28 |     const compiled: *Regex = @ptrCast(@alignCast(re));
29 |     const haystack = std.mem.span(input.?);
30 |     return compiled.match(haystack) catch false;
31 | }
29 | 
30 | // Runs `Regex.partialMatch` on the NUL-terminated `input`; any matcher error
31 | // is reported to the caller as `false`.
32 | export fn zre_partial_match(re: ?*zre_regex, input: ?[*:0]const u8) bool {
33 |     const compiled: *Regex = @ptrCast(@alignCast(re));
34 |     const haystack = std.mem.span(input.?);
35 |     return compiled.partialMatch(haystack) catch false;
36 | }
34 | 
35 | // Tears down the regex behind `re` and then frees its heap holder.
36 | export fn zre_deinit(re: ?*zre_regex) void {
37 |     const compiled: *Regex = @ptrCast(@alignCast(re));
38 |     defer allocator.destroy(compiled);
39 |     compiled.deinit();
40 | }
40 | 
41 | /// Matches `input` against `re` and returns the capture groups in a
42 | /// heap-allocated `Captures`.
43 | ///
44 | /// Returns null when the matcher fails, when there is no match, or when the
45 | /// holder cannot be allocated. A non-null result must be released with
46 | /// `zre_captures_deinit`.
47 | export fn zre_captures_all(re: ?*zre_regex, input: ?[*:0]const u8) ?*zre_captures {
48 |     const r: *Regex = @ptrCast(@alignCast(re));
49 |     // Run the match before allocating so that the error and no-match paths
50 |     // have nothing to free.
51 |     const found = (r.captures(std.mem.span(input.?)) catch return null) orelse return null;
52 |     const c = allocator.create(Captures) catch {
53 |         // The holder could not be allocated: release the match result.
54 |         var unused = found;
55 |         unused.deinit();
56 |         return null;
57 |     };
58 |     c.* = found;
59 |     return @ptrCast(c);
60 | }
47 | 
48 | // Number of capture groups: half the number of slots held by `cap`.
49 | export fn zre_captures_len(cap: ?*const zre_captures) usize {
50 |     const caps: *const Captures = @ptrCast(@alignCast(cap));
51 |     const slot_total = caps.slots.len;
52 |     return slot_total / 2;
53 | }
52 | 
53 | // Returns a pointer to the text of capture group `n`, or null if the group has
54 | // no slice. The slice length is written through `len` when `len` is non-null.
55 | export fn zre_captures_slice_at(cap: ?*const zre_captures, n: usize, len: ?*usize) ?[*]const u8 {
56 |     const caps: *const Captures = @ptrCast(@alignCast(cap));
57 |     if (caps.sliceAt(n)) |group| {
58 |         if (len != null) len.?.* = group.len;
59 |         return group.ptr;
60 |     }
61 |     return null;
62 | }
61 | 
62 | // Copies the bounds of capture group `n` into `sp` and returns true; returns
63 | // false without touching `sp` when the group has no bounds.
64 | export fn zre_captures_bounds_at(cap: ?*const zre_captures, sp: ?*zre_captures_span, n: usize) bool {
65 |     const caps: *const Captures = @ptrCast(@alignCast(cap));
66 |     const bounds = caps.boundsAt(n) orelse return false;
67 |     const out = sp.?;
68 |     out.lower = bounds.lower;
69 |     out.upper = bounds.upper;
70 |     return true;
71 | }
72 | 
73 | // Tears down the captures behind `cap` and then frees their heap holder.
74 | export fn zre_captures_deinit(cap: ?*zre_captures) void {
75 |     const caps: *Captures = @ptrCast(@alignCast(cap));
76 |     defer allocator.destroy(caps);
77 |     caps.deinit();
78 | }
78 | 


--------------------------------------------------------------------------------
/src/compile.zig:
--------------------------------------------------------------------------------
  1 | const std = @import("std");
  2 | const mem = std.mem;
  3 | const Allocator = std.mem.Allocator;
  4 | const ArrayList = std.ArrayList;
  5 | const debug = std.debug;
  6 | 
  7 | const parser = @import("parse.zig");
  8 | const Parser = parser.Parser;
  9 | const ByteClass = parser.ByteClass;
 10 | const Expr = parser.Expr;
 11 | const Assertion = parser.Assertion;
 12 | 
 13 | pub const InstructionData = union(enum) {
 14 |     // Match the specified character.
 15 |     Char: u8,
 16 |     // Match the specified character ranges.
 17 |     ByteClass: ByteClass,
 18 |     // Matches the AnyChar special cases
 19 |     AnyCharNotNL,
 20 |     // Empty match (\w assertion)
 21 |     EmptyMatch: Assertion,
 22 |     // Stop the thread, found a match
 23 |     Match,
 24 |     // Jump to the instruction stored in the owning Instruction's `out` field
 25 |     Jump,
 26 |     // Split execution, spawning a new thread and continuing in lockstep. The payload is the
 27 |     // second branch; the first branch is the owning Instruction's `out` field.
 28 |     Split: usize,
 29 |     // Slot to save position in
 30 |     Save: usize,
 31 | };
 31 | 
 32 | // A single VM instruction: an operation plus the address of its successor.
 33 | pub const Instruction = struct {
 34 |     // Address of the instruction to execute next
 35 |     out: usize,
 36 |     // Operation carried by this instruction, together with its operand
 37 |     data: InstructionData,
 38 | 
 39 |     // Convenience constructor.
 40 |     pub fn new(out: usize, data: InstructionData) Instruction {
 41 |         return .{ .out = out, .data = data };
 42 |     }
 43 | };
 46 | 
 47 | // Represents an instruction with unpatched holes.
 48 | const InstHole = union(enum) {
 49 |     // Match with an unfilled output
 50 |     Char: u8,
 51 |     // Match a character class range
 52 |     ByteClass: ByteClass,
 53 |     // Empty Match assertion
 54 |     EmptyMatch: Assertion,
 55 |     // Match any character
 56 |     AnyCharNotNL,
 57 |     // Split with neither branch filled
 58 |     Split,
 59 |     // Split with a filled first branch
 60 |     Split1: usize,
 61 |     // Split with a filled second branch
 62 |     Split2: usize,
 63 |     // Save capture
 64 |     Save: usize,
 65 | };
 66 | 
 67 | // A slot in the compiler's working buffer. While compiling, slots are a mix of finished
 68 | // instructions and holes; every slot must be in the compiled state once processing finishes.
 69 | const PartialInst = union(enum) {
 70 |     // Instruction that is completely compiled
 71 |     Compiled: Instruction,
 72 | 
 73 |     // Instruction whose back-links are not yet filled
 74 |     Uncompiled: InstHole,
 75 | 
 76 |     // Resolve this slot so that its open output points at instruction `i`.
 77 |     // A slot that is already compiled is left untouched.
 78 |     pub fn fill(s: *PartialInst, i: usize) void {
 79 |         const hole = switch (s.*) {
 80 |             .Compiled => return,
 81 |             .Uncompiled => |ih| ih,
 82 |         };
 83 | 
 84 |         const resolved: Instruction = switch (hole) {
 85 |             .Char => |ch| .{ .out = i, .data = .{ .Char = ch } },
 86 |             .ByteClass => |class| .{ .out = i, .data = .{ .ByteClass = class } },
 87 |             .EmptyMatch => |assertion| .{ .out = i, .data = .{ .EmptyMatch = assertion } },
 88 |             .AnyCharNotNL => .{ .out = i, .data = .AnyCharNotNL },
 89 |             .Save => |slot| .{ .out = i, .data = .{ .Save = slot } },
 90 |             // Both outputs of an empty split go to `i`, so it is encoded as a jump.
 91 |             .Split => .{ .out = i, .data = .Jump },
 92 |             // First branch already known: it stays in `out`, `i` becomes the second branch.
 93 |             .Split1 => |first| .{ .out = first, .data = .{ .Split = i } },
 94 |             // Second branch already known: `i` becomes the first branch.
 95 |             .Split2 => |second| .{ .out = i, .data = .{ .Split = second } },
 96 |         };
 97 | 
 98 |         s.* = .{ .Compiled = resolved };
 99 |     }
100 | };
113 | 
114 | // The compiled bytecode of an NFA, together with the metadata needed to run it.
115 | pub const Program = struct {
116 |     // Instruction sequence making up the NFA
117 |     insts: []Instruction,
118 |     // Index of the start instruction
119 |     start: usize,
120 |     // Index of the find-start instruction
121 |     find_start: usize,
122 |     // Maximum number of slots required
123 |     slot_count: usize,
124 |     // Allocator owning `insts`
125 |     allocator: Allocator,
126 | 
127 |     // Wrap an instruction slice; `start` is always 0.
128 |     pub fn init(allocator: Allocator, a: []Instruction, find_start: usize, slot_count: usize) Program {
129 |         return .{
130 |             .insts = a,
131 |             .start = 0,
132 |             .find_start = find_start,
133 |             .slot_count = slot_count,
134 |             .allocator = allocator,
135 |         };
136 |     }
137 | 
138 |     // Release every ByteClass payload, then the instruction slice itself.
139 |     pub fn deinit(p: *Program) void {
140 |         defer p.allocator.free(p.insts);
141 |         for (p.insts) |*inst| {
142 |             if (inst.data == .ByteClass) {
143 |                 inst.data.ByteClass.deinit();
144 |             }
145 |         }
146 |     }
147 | };
149 | 
150 | // A Hole represents the outgoing node of a partially compiled Fragment.
151 | //
152 | // If None, the Hole needs to be back-patched as we do not yet know which instruction this
153 | // points to.
154 | const Hole = union(enum) {
155 |     None,
156 |     One: usize,
157 |     Many: ArrayList(Hole),
158 | };
159 | 
160 | // A patch represents an unpatched output for a contiguous sequence of instructions.
161 | const Patch = struct {
162 |     // The address of the first instruction
163 |     entry: usize,
164 |     // The output hole of this instruction (to be filled to an actual address/es)
165 |     hole: Hole,
166 | };
167 | 
168 | // A Compiler compiles a regex expression into a bytecode representation of the NFA.
169 | pub const Compiler = struct {
170 |     // Stores all partial instructions
171 |     insts: ArrayList(PartialInst),
172 |     allocator: Allocator,
173 |     // Capture state
174 |     capture_index: usize,
175 | 
176 |     // Create a compiler with an empty instruction buffer backed by `a`.
177 |     pub fn init(a: Allocator) Compiler {
178 |         return .{
179 |             .allocator = a,
180 |             .insts = ArrayList(PartialInst).init(a),
181 |             .capture_index = 0,
182 |         };
183 |     }
183 | 
184 |     // Release the partial-instruction buffer.
185 |     pub fn deinit(c: *Compiler) void {
186 |         c.insts.deinit();
187 |     }
187 | 
188 |     // Reserve the next pair of capture slots and return the index of the first one.
189 |     fn nextCaptureIndex(c: *Compiler) usize {
190 |         // The increment runs after the return value has been read.
191 |         defer c.capture_index += 2;
192 |         return c.capture_index;
193 |     }
193 | 
194 |     // Compile the regex expression into a Program.
195 |     //
196 |     // The emitted layout is: Save(index), the compiled expression, Save(index + 1), Match,
197 |     // followed by a two-instruction fragment used for non-anchored searches. Panics if any
198 |     // instruction is still uncompiled once all holes have been filled.
199 |     pub fn compile(c: *Compiler, expr: *const Expr) !Program {
200 |         // surround in a full program match
201 |         const entry = c.insts.items.len;
202 |         const index = c.nextCaptureIndex();
203 |         try c.pushCompiled(Instruction.new(entry + 1, InstructionData{ .Save = index }));
204 | 
205 |         // compile the main expression
206 |         const patch = try c.compileInternal(expr);
207 | 
208 |         // not iterating over an empty correctly in backtrack
209 |         c.fillToNext(patch.hole);
210 |         const h = try c.pushHole(InstHole{ .Save = index + 1 });
211 | 
212 |         // fill any holes to end at the next instruction which will be a match
213 |         c.fillToNext(h);
214 |         try c.pushCompiled(Instruction.new(0, InstructionData.Match));
215 | 
216 |         var p = ArrayList(Instruction).init(c.allocator);
217 |         defer p.deinit();
218 | 
219 |         for (c.insts.items) |e| {
220 |             switch (e) {
221 |                 PartialInst.Compiled => |x| {
222 |                     try p.append(x);
223 |                 },
224 |                 else => |_| {
225 |                     @panic("uncompiled instruction encountered during compilation");
226 |                 },
227 |             }
228 |         }
229 | 
230 |         // To facilitate fast finding (matching non-anchored to the start) we simply append a
231 |         // .*? to the start of our instructions. We push the fragment with this set of instructions
232 |         // at the end of the compiled set. We perform an anchored search by entering normally and
233 |         // a non-anchored by jumping to this patch before starting.
234 |         //
235 |         // 1: compiled instructions
236 |         // 2: match
237 |         // ... # We add the following
238 |         // 3: split 1, 4
239 |         // 4: any 3
240 |         const fragment_start = c.insts.items.len;
241 |         const fragment = [_]Instruction{
242 |             Instruction.new(0, InstructionData{ .Split = fragment_start + 1 }),
243 |             Instruction.new(fragment_start, InstructionData.AnyCharNotNL),
244 |         };
245 |         try p.appendSlice(&fragment);
246 | 
247 |         return Program.init(p.allocator, try p.toOwnedSlice(), fragment_start, c.capture_index);
248 |     }
245 | 
246 |     // Recursively compile `expr`, appending its instructions to `c.insts`. Returns the index of
247 |     // the first instruction emitted together with the hole(s) that the caller must fill with
248 |     // the address of whatever follows.
249 |     fn compileInternal(c: *Compiler, expr: *const Expr) Allocator.Error!Patch {
250 |         switch (expr.*) {
251 |             Expr.Literal => |lit| {
252 |                 const h = try c.pushHole(InstHole{ .Char = lit });
253 |                 return Patch{ .hole = h, .entry = c.insts.items.len - 1 };
254 |             },
255 |             Expr.ByteClass => |classes| {
256 |                 // Similar, we use a special instruction.
257 |                 const h = try c.pushHole(InstHole{ .ByteClass = try classes.dupe(c.allocator) });
258 |                 return Patch{ .hole = h, .entry = c.insts.items.len - 1 };
259 |             },
260 |             Expr.AnyCharNotNL => {
261 |                 const h = try c.pushHole(InstHole.AnyCharNotNL);
262 |                 return Patch{ .hole = h, .entry = c.insts.items.len - 1 };
263 |             },
264 |             Expr.EmptyMatch => |assertion| {
265 |                 const h = try c.pushHole(InstHole{ .EmptyMatch = assertion });
266 |                 return Patch{ .hole = h, .entry = c.insts.items.len - 1 };
267 |             },
268 |             Expr.Repeat => |repeat| {
269 |                 // Case 1: *
270 |                 if (repeat.min == 0 and repeat.max == null) {
271 |                     return c.compileStar(repeat.subexpr, repeat.greedy);
272 |                 }
273 |                 // Case 2: +
274 |                 else if (repeat.min == 1 and repeat.max == null) {
275 |                     return c.compilePlus(repeat.subexpr, repeat.greedy);
276 |                 }
277 |                 // Case 3: ?
278 |                 else if (repeat.min == 0 and repeat.max != null and repeat.max.? == 1) {
279 |                     return c.compileQuestion(repeat.subexpr, repeat.greedy);
280 |                 }
281 |                 // Case 4: {m,}
282 |                 else if (repeat.max == null) {
283 |                     // e{2,} => eee*
284 |                     // fixed min concatenation
285 |                     const p = try c.compileInternal(repeat.subexpr);
286 |                     var hole = p.hole;
287 |                     const entry = p.entry;
288 | 
289 |                     var i: usize = 1;
290 |                     while (i < repeat.min) : (i += 1) {
291 |                         const new_subexpr = try repeat.subexpr.clone();
292 |                         const ep = try c.compileInternal(&new_subexpr);
293 |                         c.fill(hole, ep.entry);
294 |                         hole = ep.hole;
295 |                     }
296 | 
297 |                     // add final e* infinite capture
298 |                     var new_subexpr = try repeat.subexpr.clone();
299 |                     const st = try c.compileStar(&new_subexpr, repeat.greedy);
300 |                     c.fill(hole, st.entry);
301 | 
302 |                     return Patch{ .hole = st.hole, .entry = entry };
303 |                 }
304 |                 // Case 5: {m,n} and {m}
305 |                 else {
306 |                     // e{3,6} => eeee?e?e?
307 |                     // NOTE(review): the first copy below is compiled unconditionally, so a
308 |                     // {0,n} with n > 1 reaching this branch would require at least one match.
309 |                     // Confirm that the parser never produces that form.
310 |                     const p = try c.compileInternal(repeat.subexpr);
311 |                     var hole = p.hole;
312 |                     const entry = p.entry;
313 | 
314 |                     var i: usize = 1;
315 |                     while (i < repeat.min) : (i += 1) {
316 |                         const new_subexpr = try repeat.subexpr.clone();
317 |                         const ep = try c.compileInternal(&new_subexpr);
318 |                         c.fill(hole, ep.entry);
319 |                         hole = ep.hole;
320 |                     }
321 | 
322 |                     // repeated optional concatenations
323 |                     while (i < repeat.max.?) : (i += 1) {
324 |                         var new_subexpr = try repeat.subexpr.clone();
325 |                         const ep = try c.compileQuestion(&new_subexpr, repeat.greedy);
326 |                         c.fill(hole, ep.entry);
327 |                         hole = ep.hole;
328 |                     }
329 | 
330 |                     return Patch{ .hole = hole, .entry = entry };
331 |                 }
332 |             },
333 |             Expr.Concat => |subexprs| {
334 |                 // Compile each item in the sub-expression
335 |                 const f = subexprs.items[0];
336 | 
337 |                 // First patch
338 |                 const p = try c.compileInternal(f);
339 |                 var hole = p.hole;
340 |                 const entry = p.entry;
341 | 
342 |                 // tie together patches from concat arguments
343 |                 for (subexprs.items[1..]) |e| {
344 |                     const ep = try c.compileInternal(e);
345 |                     // fill the previous patch hole to the current entry
346 |                     c.fill(hole, ep.entry);
347 |                     // current hole is now the next fragment
348 |                     hole = ep.hole;
349 |                 }
350 | 
351 |                 return Patch{ .hole = hole, .entry = entry };
352 |             },
353 |             Expr.Capture => |subexpr| {
354 |                 // 1: save 1, 2
355 |                 // 2: subexpr
356 |                 // 3: restore 1, 4
357 |                 // ...
358 | 
359 |                 // Create a partial instruction with a hole outgoing at the current location.
360 |                 const entry = c.insts.items.len;
361 | 
362 |                 const index = c.nextCaptureIndex();
363 | 
364 |                 try c.pushCompiled(Instruction.new(entry + 1, InstructionData{ .Save = index }));
365 |                 const p = try c.compileInternal(subexpr);
366 |                 c.fillToNext(p.hole);
367 | 
368 |                 const h = try c.pushHole(InstHole{ .Save = index + 1 });
369 | 
370 |                 return Patch{ .hole = h, .entry = entry };
371 |             },
372 |             Expr.Alternate => |subexprs| {
373 |                 // Alternation with one path does not make sense
374 |                 debug.assert(subexprs.items.len >= 2);
375 | 
376 |                 // Alternates are simply a series of splits into the sub-expressions, with each
377 |                 // subexpr having the same output hole (after the final subexpr).
378 |                 //
379 |                 // 1: split 2, 4
380 |                 // 2: subexpr1
381 |                 // 3: jmp 8
382 |                 // 4: split 5, 7
383 |                 // 5: subexpr2
384 |                 // 6: jmp 8
385 |                 // 7: subexpr3
386 |                 // 8: ...
387 | 
388 |                 const entry = c.insts.items.len;
389 |                 var holes = ArrayList(Hole).init(c.allocator);
390 |                 errdefer holes.deinit();
391 | 
392 |                 // TODO: Does this need to be dynamically allocated?
393 |                 const last_hole = try c.allocator.create(Hole);
394 |                 defer c.allocator.destroy(last_hole);
395 |                 last_hole.* = .None;
396 | 
397 |                 // This compiles one branch of the split at a time.
398 |                 for (subexprs.items[0 .. subexprs.items.len - 1]) |subexpr| {
399 |                     c.fillToNext(last_hole.*);
400 | 
401 |                     // next entry will be a sub-expression
402 |                     //
403 |                     // We fill the second part of this hole on the next sub-expression.
404 |                     last_hole.* = try c.pushHole(InstHole{ .Split1 = c.insts.items.len + 1 });
405 | 
406 |                     // compile the subexpression
407 |                     const p = try c.compileInternal(subexpr);
408 | 
409 |                     // store outgoing hole for the subexpression
410 |                     try holes.append(p.hole);
411 |                 }
412 | 
413 |                 // one entry left, push a sub-expression so we end with a double-subexpression.
414 |                 const p = try c.compileInternal(subexprs.items[subexprs.items.len - 1]);
415 |                 c.fill(last_hole.*, p.entry);
416 | 
417 |                 // push the last sub-expression hole
418 |                 try holes.append(p.hole);
419 | 
420 |                 // return many holes which are all to be filled to the next instruction
421 |                 return Patch{ .hole = Hole{ .Many = holes }, .entry = entry };
422 |             },
423 |             Expr.PseudoLeftParen => {
424 |                 @panic("internal error, encountered PseudoLeftParen");
425 |             },
426 |         }
427 | 
428 |         return Patch{ .hole = Hole.None, .entry = c.insts.items.len };
429 |     }
424 | 
425 |     fn compileStar(c: *Compiler, expr: *Expr, greedy: bool) !Patch {
426 |         // Emitted layout for a starred sub-expression:
427 |         //
428 |         //   1: split 2, 4
429 |         //   2: subexpr
430 |         //   3: jmp 1
431 |         //   4: ...
432 |         //
433 |         // The length of the sub-expression is not known up front, so the branch
434 |         // of the split that leaves the loop stays open and is handed back to
435 |         // the caller as the outgoing hole of the returned patch.
436 |         const loop_head = c.insts.items.len;
437 |         const body_start = loop_head + 1;
438 | 
439 |         // `*` and `*?` only differ in which split variant is pushed; the
440 |         // matcher manages precedence of the executing threads.
441 |         const split = if (greedy)
442 |             InstHole{ .Split1 = body_start }
443 |         else
444 |             InstHole{ .Split2 = body_start };
445 |         const exit_hole = try c.pushHole(split);
446 | 
447 |         // Compile the loop body directly after the split.
448 |         const body = try c.compileInternal(expr);
449 | 
450 |         // Anything the body left open continues at the back-jump emitted next,
451 |         // which returns control to the split at the head of the loop.
452 |         c.fillToNext(body.hole);
453 |         try c.pushCompiled(Instruction.new(loop_head, InstructionData.Jump));
454 | 
455 |         // The patch enters at the split and exits through its open branch.
456 |         return Patch{ .hole = exit_hole, .entry = loop_head };
457 |     }
458 | 
459 |     fn compilePlus(c: *Compiler, expr: *Expr, greedy: bool) !Patch {
460 |         // Emitted layout for a one-or-more sub-expression:
461 |         //
462 |         //   1: subexpr
463 |         //   2: split 1, 3
464 |         //   3: ...
465 |         //
466 |         // NOTE: We can do a lookahead on non-greedy here to improve performance.
467 |         const body = try c.compileInternal(expr);
468 | 
469 |         // The split is emitted next, so the body flows straight into it.
470 |         c.fillToNext(body.hole);
471 | 
472 |         // The known branch points back at the body entry; the other one stays
473 |         // open for the caller. Non-greedy uses the swapped form: split 3, 1.
474 |         const loop_split = if (greedy)
475 |             InstHole{ .Split1 = body.entry }
476 |         else
477 |             InstHole{ .Split2 = body.entry };
478 |         const exit_hole = try c.pushHole(loop_split);
479 | 
480 |         return Patch{ .hole = exit_hole, .entry = body.entry };
481 |     }
482 | 
483 |     fn compileQuestion(c: *Compiler, expr: *Expr, greedy: bool) !Patch {
484 |         // Emitted layout for an optional sub-expression:
485 |         //
486 |         //   1: split 2, 3
487 |         //   2: subexpr
488 |         //   3: ...
489 |         const body_start = c.insts.items.len + 1;
490 |         const skip_split = if (greedy)
491 |             InstHole{ .Split1 = body_start }
492 |         else
493 |             InstHole{ .Split2 = body_start };
494 |         const skip_hole = try c.pushHole(skip_split);
495 | 
496 |         const body = try c.compileInternal(expr);
497 | 
498 |         // The open branch of the split and the end of the body both continue
499 |         // at whatever comes next, so both holes are returned to the caller.
500 |         var pending = ArrayList(Hole).init(c.allocator);
501 |         errdefer pending.deinit();
502 |         try pending.append(skip_hole);
503 |         try pending.append(body.hole);
504 | 
505 |         // The entry is the split, taken to sit directly before the body entry.
506 |         return Patch{ .hole = Hole{ .Many = pending }, .entry = body.entry - 1 };
507 |     }
508 | 
509 |     // Append an already compiled instruction to the instruction list.
510 |     fn pushCompiled(c: *Compiler, i: Instruction) !void {
511 |         return c.insts.append(PartialInst{ .Compiled = i });
512 |     }
513 | 
514 |     // Append an instruction that still has a hole and return that hole.
515 |     fn pushHole(c: *Compiler, i: InstHole) !Hole {
516 |         const index = c.insts.items.len;
517 |         try c.insts.append(PartialInst{ .Uncompiled = i });
518 |         return Hole{ .One = index };
519 |     }
520 | 
521 |     // Patch `hole` so that it targets `goto1`. A list of holes is patched
522 |     // member by member and the list itself is released afterwards.
523 |     fn fill(c: *Compiler, hole: Hole, goto1: usize) void {
524 |         switch (hole) {
525 |             .None => {},
526 |             .One => |pc| c.insts.items[pc].fill(goto1),
527 |             .Many => |*pending| {
528 |                 for (pending.items) |inner| c.fill(inner, goto1);
529 |                 pending.deinit();
530 |             },
531 |         }
532 |     }
533 | 
534 |     // Patch `hole` to target the instruction that will be pushed next.
535 |     fn fillToNext(c: *Compiler, hole: Hole) void {
536 |         return c.fill(hole, c.insts.items.len);
537 |     }
538 | };
539 | 


--------------------------------------------------------------------------------
/src/debug.zig:
--------------------------------------------------------------------------------
  1 | // AST/IR Inspection routines are in a separate compilation unit to avoid pulling in any
  2 | // dependencies on i/o output (which may not be supported in a freestanding environment).
  3 | 
  4 | const debug = @import("std").debug;
  5 | 
  6 | const parse = @import("parse.zig");
  7 | const compile = @import("compile.zig");
  8 | 
  9 | const Expr = parse.Expr;
 10 | const Instruction = compile.Instruction;
 11 | const InstructionData = compile.InstructionData;
 12 | const Program = compile.Program;
 13 | 
 14 | /// Print `ch` through `std.debug.print` in a readable form.
 15 | ///
 16 | /// Tab, carriage return and newline are written as backslash escapes,
 17 | /// printable ASCII (32...126) is written as-is and any other byte is
 18 | /// written as `0x` followed by its hexadecimal value. No trailing newline
 19 | /// is written.
 20 | pub fn printCharEscaped(ch: u8) void {
 21 |     switch (ch) {
 22 |         // common control characters
 23 |         '\t' => debug.print("\\t", .{}),
 24 |         '\r' => debug.print("\\r", .{}),
 25 |         '\n' => debug.print("\\n", .{}),
 26 | 
 27 |         // printable characters
 28 |         32...126 => debug.print("{c}", .{ch}),
 29 | 
 30 |         // everything else
 31 |         else => debug.print("0x{x}", .{ch}),
 32 |     }
 33 | }
 34 | 
 35 | pub fn dumpExpr(e: Expr) void {
 36 |     return dumpExprIndent(e, 0); // the root node is printed unindented
 37 | }
 38 | 
 39 | /// Print one node of an expression tree followed by its children.
 40 | ///
 41 | /// The node is prefixed with `indent` spaces and nested sub-expressions are
 42 | /// printed recursively with `indent + 1`. Each node shows its union tag name,
 43 | /// followed by any payload detail in parentheses.
 44 | fn dumpExprIndent(e: Expr, indent: usize) void {
 45 |     for (0..indent) |_| debug.print(" ", .{});
 46 | 
 47 |     // Every prong prints the tag name, so resolve it once.
 48 |     const tag = @tagName(e);
 49 |     switch (e) {
 50 |         // Payload-free nodes print just their tag.
 51 |         // NOTE: PseudoLeftParen shouldn't occur ever in returned output.
 52 |         .AnyCharNotNL, .PseudoLeftParen => {
 53 |             debug.print("{s}\n", .{tag});
 54 |         },
 55 |         .EmptyMatch => |assertion| {
 56 |             debug.print("{s}({s})\n", .{ tag, @tagName(assertion) });
 57 |         },
 58 |         .Literal => |lit| {
 59 |             debug.print("{s}(", .{tag});
 60 |             printCharEscaped(lit);
 61 |             debug.print(")\n", .{});
 62 |         },
 63 |         .ByteClass => |class| {
 64 |             // One `[min-max]` group per range in the class.
 65 |             debug.print("{s}(", .{tag});
 66 |             for (class.ranges.items) |r| {
 67 |                 debug.print("[", .{});
 68 |                 printCharEscaped(r.min);
 69 |                 debug.print("-", .{});
 70 |                 printCharEscaped(r.max);
 71 |                 debug.print("]", .{});
 72 |             }
 73 |             debug.print(")\n", .{});
 74 |         },
 75 |         // Single-child wrappers.
 76 |         .Capture => |subexpr| {
 77 |             debug.print("{s}\n", .{tag});
 78 |             dumpExprIndent(subexpr.*, indent + 1);
 79 |         },
 80 |         .Repeat => |repeat| {
 81 |             debug.print("{s}(min={d}, max={?d}, greedy={any})\n", .{ tag, repeat.min, repeat.max, repeat.greedy });
 82 |             dumpExprIndent(repeat.subexpr.*, indent + 1);
 83 |         },
 84 |         // Concat and Alternate share a payload type, so one prong covers both.
 85 |         .Concat, .Alternate => |subexprs| {
 86 |             debug.print("{s}\n", .{tag});
 87 |             for (subexprs.items) |child| {
 88 |                 dumpExprIndent(child.*, indent + 1);
 89 |             }
 90 |         },
 91 |     }
 92 | }
 93 | 
 94 | /// Print a single instruction on one line.
 95 | ///
 96 | /// Every form except `match` shows the instruction's `out` field in
 97 | /// parentheses, followed by the payload of the instruction, if any.
 98 | pub fn dumpInstruction(s: Instruction) void {
 99 |     const out = s.out;
100 |     switch (s.data) {
101 |         .Char => |ch| {
102 |             debug.print("char({}) '{c}'\n", .{ out, ch });
103 |         },
104 |         .EmptyMatch => |assertion| {
105 |             debug.print("empty({}) {s}\n", .{ out, @tagName(assertion) });
106 |         },
107 |         .ByteClass => |class| {
108 |             // Ranges are listed as decimal byte values.
109 |             debug.print("range({}) ", .{out});
110 |             for (class.ranges.items) |r| {
111 |                 debug.print("[{d}-{d}]", .{ r.min, r.max });
112 |             }
113 |             debug.print("\n", .{});
114 |         },
115 |         // Payload-free instructions.
116 |         .AnyCharNotNL => debug.print("any({})\n", .{out}),
117 |         .Match => debug.print("match\n", .{}),
118 |         .Jump => debug.print("jump({})\n", .{out}),
119 |         .Split => |branch| {
120 |             debug.print("split({}) {}\n", .{ out, branch });
121 |         },
122 |         .Save => |slot| debug.print("save({}), {}\n", .{ out, slot }),
123 |     }
124 | }
125 | 
126 | pub fn dumpProgram(s: Program) void {
127 |     debug.print("start: {}\n\n", .{s.start}); // entry point, then a blank line
128 |     for (s.insts, 0..) |inst, pc| { // one "L{pc}: " prefixed row per instruction
129 |         debug.print("L{}: ", .{pc});
130 |         dumpInstruction(inst);
131 |     }
132 | }
133 | 


--------------------------------------------------------------------------------
/src/exec.zig:
--------------------------------------------------------------------------------
 1 | const std = @import("std");
 2 | const Allocator = std.mem.Allocator;
 3 | const ArrayList = std.ArrayList;
 4 | const compile = @import("compile.zig");
 5 | const Program = compile.Program;
 6 | 
 7 | const VmBacktrack = @import("vm_backtrack.zig").VmBacktrack;
 8 | const VmPike = @import("vm_pike.zig").VmPike;
 9 | const Input = @import("input.zig").Input;
10 | 
11 | pub fn exec(allocator: Allocator, prog: Program, prog_start: usize, input: *Input, slots: *ArrayList(?usize)) !bool {
12 |     // The backtracking engine runs only when it accepts this program/input pair.
13 |     if (!VmBacktrack.shouldExec(prog, input)) {
14 |         var pike = VmPike.init(allocator);
15 |         return pike.exec(prog, prog_start, input, slots);
16 |     }
17 |     var backtrack = VmBacktrack.init(allocator);
18 |     return backtrack.exec(prog, prog_start, input, slots);
19 | }
20 | 


--------------------------------------------------------------------------------
/src/input.zig:
--------------------------------------------------------------------------------
  1 | // A generic iterator of some input bytes.
  2 | //
  3 | // This is intended to handle different decoding patterns. The intent is to have a Utf-8 and byte
  4 | // input abstraction. Execution engines can be generic over these two types.
  5 | //
  6 | // Technically we could encode Utf-8 into associated bytes when constructing the program. This is
  7 | // typically slower on the match however as for large unicode states many more states need to be
  8 | // traversed.
  9 | 
 10 | const Assertion = @import("parse.zig").Assertion;
 11 | 
 12 | // Cursor over the bytes being matched. The actual stepping and lookup logic
 13 | // is supplied through function pointers, so that different decoders can sit
 14 | // behind the same interface.
 15 | pub const Input = struct {
 16 |     // Bytes being matched.
 17 |     bytes: []const u8,
 18 |     // Cursor position within `bytes`.
 19 |     byte_pos: usize,
 20 | 
 21 |     // Implementation hooks; the public methods below dispatch through these.
 22 |     currentFn: *const fn (input: Input) ?u8,
 23 |     advanceFn: *const fn (input: *Input) void,
 24 |     isNextWordCharFn: *const fn (input: Input) bool,
 25 |     isPrevWordCharFn: *const fn (input: Input) bool,
 26 |     isCurrWordCharFn: *const fn (input: Input) bool,
 27 | 
 28 |     // Move the cursor forward via the implementation's `advanceFn`.
 29 |     pub fn advance(self: *Input) void {
 30 |         self.advanceFn(self);
 31 |     }
 32 | 
 33 |     // Return whatever the implementation's `currentFn` reports for the cursor.
 34 |     pub fn current(self: Input) ?u8 {
 35 |         return self.currentFn(self);
 36 |     }
 37 | 
 38 |     // Note: We extend the range here to one past the end of the input. This is done in order to
 39 |     // handle complete matches correctly.
 40 |     pub fn isConsumed(self: Input) bool {
 41 |         return self.byte_pos > self.bytes.len;
 42 |     }
 43 | 
 44 |     // Evaluate the zero-width assertion `match` at the current cursor position.
 45 |     //
 46 |     // The end checks are written as `byte_pos + 1 >= bytes.len` instead of
 47 |     // `byte_pos >= bytes.len - 1`: both agree for non-empty input, but the
 48 |     // latter underflows the unsigned length when the input is empty.
 49 |     pub fn isEmptyMatch(self: Input, match: Assertion) bool {
 50 |         return switch (match) {
 51 |             // Always holds.
 52 |             .None => true,
 53 |             // TODO: Handle different modes.
 54 |             .BeginLine, .BeginText => self.byte_pos == 0,
 55 |             // True at the last byte and anywhere past it.
 56 |             .EndLine, .EndText => self.byte_pos + 1 >= self.bytes.len,
 57 |             // A boundary exists when exactly one neighbour is a word character.
 58 |             .WordBoundaryAscii => self.isPrevWordCharFn(self) != self.isCurrWordCharFn(self),
 59 |             .NotWordBoundaryAscii => self.isPrevWordCharFn(self) == self.isCurrWordCharFn(self),
 60 |         };
 61 |     }
 62 | 
 63 |     // Create a new instance using the same interface functions.
 64 |     pub fn clone(self: Input) Input {
 65 |         return Input{
 66 |             .bytes = self.bytes,
 67 |             .byte_pos = self.byte_pos,
 68 | 
 69 |             .currentFn = self.currentFn,
 70 |             .advanceFn = self.advanceFn,
 71 |             .isNextWordCharFn = self.isNextWordCharFn,
 72 |             .isPrevWordCharFn = self.isPrevWordCharFn,
 73 |             .isCurrWordCharFn = self.isCurrWordCharFn,
 74 |         };
 75 |     }
 76 | };
 77 | 
 78 | pub const InputBytes = struct {
 79 |     input: Input,
 80 | 
 81 |     // Wrap `bytes` in an `Input` positioned at the first byte.
 82 |     pub fn init(bytes: []const u8) InputBytes {
 83 |         return InputBytes{
 84 |             .input = Input{
 85 |                 .bytes = bytes,
 86 |                 .byte_pos = 0,
 87 |                 .currentFn = current,
 88 |                 .advanceFn = advance,
 89 |                 .isNextWordCharFn = isNextWordChar,
 90 |                 .isPrevWordCharFn = isPrevWordChar,
 91 |                 .isCurrWordCharFn = isCurrWordChar,
 92 |             },
 93 |         };
 94 |     }
 95 | 
 96 |     // Byte under the cursor, or null once the cursor is at or past the end.
 97 |     fn current(self: Input) ?u8 {
 98 |         const pos = self.byte_pos;
 99 |         return if (pos < self.bytes.len) self.bytes[pos] else null;
100 |     }
101 | 
102 |     // Step one byte forward, stopping for good at `bytes.len + 1`.
103 |     fn advance(self: *Input) void {
104 |         if (self.byte_pos <= self.bytes.len) self.byte_pos += 1;
105 |     }
106 | 
107 |     // Word characters here are ASCII letters and digits only.
108 |     fn isWordChar(c: u8) bool {
109 |         return switch (c) {
110 |             '0'...'9', 'a'...'z', 'A'...'Z' => true,
111 |             else => false,
112 |         };
113 |     }
114 | 
115 |     // `byte_pos + 1 < len` rather than `byte_pos < len - 1`: same result for
116 |     // non-empty input, without underflowing when the input is empty.
117 |     fn isNextWordChar(self: Input) bool {
118 |         return (self.byte_pos + 1 < self.bytes.len) and isWordChar(self.bytes[self.byte_pos + 1]);
119 |     }
120 | 
121 |     // The upper bound covers the one-past-the-end cursor that `advance` allows.
122 |     fn isPrevWordChar(self: Input) bool {
123 |         return (self.byte_pos > 0 and self.byte_pos <= self.bytes.len) and isWordChar(self.bytes[self.byte_pos - 1]);
124 |     }
125 | 
126 |     fn isCurrWordChar(self: Input) bool {
127 |         return (self.byte_pos < self.bytes.len) and isWordChar(self.bytes[self.byte_pos]);
128 |     }
129 | };
130 | 


--------------------------------------------------------------------------------
/src/parse.zig:
--------------------------------------------------------------------------------
  1 | /// Parses a regular expression into an expression-tree. Uses a stack-based parser to avoid
  2 | /// unbounded recursion.
  3 | const std = @import("std");
  4 | const math = std.math;
  5 | const mem = std.mem;
  6 | const fmt = std.fmt;
  7 | const Allocator = std.mem.Allocator;
  8 | const ArenaAllocator = std.heap.ArenaAllocator;
  9 | const ArrayList = std.ArrayList;
 10 | const debug = std.debug;
 11 | 
 12 | const range_set = @import("range_set.zig");
 13 | const ByteClassTemplates = range_set.ByteClassTemplates;
 14 | 
 15 | /// A single class range (e.g. [a-z]).
 16 | pub const ByteRange = range_set.Range(u8);
 17 | 
 18 | /// Multiple class ranges (e.g. [a-z0-9])
 19 | pub const ByteClass = range_set.RangeSet(u8);
 20 | 
 21 | /// Repeat sequence (e.g. +, *, ?, {m,n})
 22 | pub const Repeater = struct {
 23 |     // The sub-expression to repeat
 24 |     subexpr: *Expr,
 25 |     // Minimum number of times to match
 26 |     min: usize,
 27 |     // Maximum number of times to match (null -> no upper bound)
 28 |     max: ?usize,
 29 |     // Whether this matches greedily
 30 |     greedy: bool,
 31 | };
 32 | 
 33 | /// A specific look-around assertion
 34 | pub const Assertion = enum {
 35 |     // Always true assertion
 36 |     None,
 37 |     // ^ anchor, beginning of text (or line depending on mode)
 38 |     BeginLine,
 39 |     // $ anchor, end of text (or line depending on mode)
 40 |     EndLine,
 41 |     // \A anchor, beginning of text
 42 |     BeginText,
 43 |     // \z anchor, end of text
 44 |     EndText,
 45 |     // \b anchor, word boundary ascii
 46 |     WordBoundaryAscii,
 47 |     // \B anchor, non-word boundary ascii
 48 |     NotWordBoundaryAscii,
 49 | };
 50 | 
 51 | /// A single node of an expression tree.
 52 | pub const Expr = union(enum) {
 53 |     // Empty match (zero-width assertion, e.g. ^ or $)
 54 |     EmptyMatch: Assertion,
 55 |     // A single character byte to match
 56 |     Literal: u8,
 57 |     // . character
 58 |     AnyCharNotNL,
 59 |     // Capture group
 60 |     Capture: *Expr,
 61 |     // *, +, ?
 62 |     Repeat: Repeater,
 63 |     // Character class [a-z0-9]
 64 |     ByteClass: ByteClass,
 65 |     // Concatenation
 66 |     Concat: ArrayList(*Expr),
 67 |     // |
 68 |     Alternate: ArrayList(*Expr),
 69 |     // Pseudo stack operator to define start of a capture
 70 |     PseudoLeftParen,
 71 | 
 72 |     // True for literal, byte class, any-char and capture nodes.
 73 |     pub fn isByteClass(re: *const Expr) bool {
 74 |         return switch (re.*) {
 75 |             // TODO: Don't keep capture here, but allow on repeat operators.
 76 |             .Literal, .ByteClass, .AnyCharNotNL, .Capture => true,
 77 |             else => false,
 78 |         };
 79 |     }
 80 | 
 81 |     // Copy this node; only a ByteClass payload is duplicated, the rest is copied as-is.
 82 |     pub fn clone(re: *Expr) !Expr {
 83 |         return switch (re.*) {
 84 |             .ByteClass => |*bc| Expr{ .ByteClass = try bc.clone() },
 85 |             else => re.*,
 86 |         };
 87 |     }
 88 | 
 89 |     // Release a ByteClass payload; the `else` prong keeps the switch exhaustive.
 90 |     pub fn deinit(re: *Expr) void {
 91 |         switch (re.*) {
 92 |             .ByteClass => |*bc| bc.deinit(),
 93 |             else => {},
 94 |         }
 95 |     }
 96 | };
 97 | 
 98 | // Convert an ASCII digit or letter to its numeric value in `radix`.
 99 | // Mirrors the helper of the same name that is private in fmt.
100 | fn charToDigit(c: u8, radix: u8) !u8 {
101 |     const digit: u8 = if (c >= '0' and c <= '9')
102 |         c - '0'
103 |     else if (c >= 'A' and c <= 'Z')
104 |         c - 'A' + 10
105 |     else if (c >= 'a' and c <= 'z')
106 |         c - 'a' + 10
107 |     else
108 |         return error.InvalidChar;
109 |     // Characters that are digits, but not in this radix, are rejected too.
110 |     return if (digit < radix) digit else error.InvalidChar;
111 | }
112 | 
113 | // Cursor over the bytes of a string with small lookahead helpers.
114 | //
115 | // `index` always refers to the next unread byte and never exceeds
116 | // `slice.len` through the methods below.
117 | const StringIterator = struct {
118 |     const Self = @This();
119 | 
120 |     // The string being iterated.
121 |     slice: []const u8,
122 |     // Position of the next unread byte in `slice`.
123 |     index: usize,
124 | 
125 |     // Start iterating `s` from its first byte.
126 |     pub fn init(s: []const u8) Self {
127 |         return Self{
128 |             .slice = s,
129 |             .index = 0,
130 |         };
131 |     }
132 | 
133 |     // Advance the stream and return the next token.
134 |     // Returns null, leaving the position untouched, once the end is reached.
135 |     pub fn next(it: *Self) ?u8 {
136 |         if (it.index >= it.slice.len) {
137 |             return null;
138 |         }
139 | 
140 |         const ch = it.slice[it.index];
141 |         it.index += 1;
142 |         return ch;
143 |     }
144 | 
145 |     // Advance the stream by one byte; does nothing at the end of the stream.
146 |     pub fn bump(it: *Self) void {
147 |         if (it.index >= it.slice.len) return;
148 |         it.index += 1;
149 |     }
150 | 
151 |     // Step the stream back one byte; does nothing at the start of the stream.
152 |     pub fn bumpBack(it: *Self) void {
153 |         if (it.index == 0) return;
154 |         it.index -= 1;
155 |     }
156 | 
157 |     // Look at the byte `n` positions past the next unread one without
158 |     // advancing; `n == 0` is the next unread byte itself.
159 |     fn peekAhead(it: *const Self, comptime n: usize) ?u8 {
160 |         const at = it.index + n;
161 |         if (at >= it.slice.len) {
162 |             return null;
163 |         }
164 | 
165 |         return it.slice[at];
166 |     }
167 | 
168 |     // Return true if the byte one past the next unread byte is `ch`.
169 |     pub fn peekNextIs(it: *const Self, ch: u8) bool {
170 |         const found = it.peekAhead(1) orelse return false;
171 |         return found == ch;
172 |     }
173 | 
174 |     // Look at the next character in the stream without advancing.
175 |     pub fn peek(it: *const Self) ?u8 {
176 |         return it.peekAhead(0);
177 |     }
178 | 
179 |     // Return true if the next character in the stream is `ch`.
180 |     pub fn peekIs(it: *const Self, ch: u8) bool {
181 |         const found = it.peek() orelse return false;
182 |         return found == ch;
183 |     }
184 | 
185 |     // Read an integer from the stream. Any non-digit characters stops the parsing chain.
186 |     //
187 |     // Error if no digits were read.
188 |     //
189 |     // TODO: Non character word-boundary instead?
190 |     pub fn readInt(it: *Self, comptime T: type, comptime radix: u8) !T {
191 |         return it.readIntN(T, radix, math.maxInt(usize));
192 |     }
193 | 
194 |     // Read an integer from the stream, limiting the read to N characters at most.
195 |     //
196 |     // Digits valid for `radix` are consumed first and the consumed span is
197 |     // then parsed as an unsigned `T`. Returns error.NoIntegerRead if no digit
198 |     // was consumed, or the error `fmt.parseUnsigned` reports for the span.
199 |     pub fn readIntN(it: *Self, comptime T: type, comptime radix: u8, comptime N: usize) !T {
200 |         const start = it.index;
201 | 
202 |         // Consume digits one at a time; stop at the limit, the end or a non-digit.
203 |         var taken: usize = 0;
204 |         while (taken < N) : (taken += 1) {
205 |             const ch = it.peek() orelse break;
206 |             _ = charToDigit(ch, radix) catch break;
207 |             it.bump();
208 |         }
209 | 
210 |         if (it.index == start) {
211 |             return error.NoIntegerRead;
212 |         }
213 | 
214 |         // Parse exactly the span that was consumed.
215 |         return fmt.parseUnsigned(T, it.slice[start..it.index], radix);
216 |     }
217 | 
218 |     // Skip over any run of ' ' characters at the current position.
219 |     pub fn skipSpaces(it: *Self) void {
220 |         while (it.peekIs(' ')) {
221 |             it.bump();
222 |         }
223 |     }
224 | };
225 | 
226 | pub const ParseError = error{ // errors reported while parsing a pattern
227 |     MissingRepeatOperand,
228 |     MissingRepeatArgument,
229 |     InvalidRepeatArgument,
230 |     EmptyAlternate,
231 |     UnbalancedParentheses,
232 |     UnopenedParentheses,
233 |     UnclosedParentheses,
234 |     EmptyCaptureGroup,
235 |     UnmatchedByteClass,
236 |     StackUnderflow,
237 |     InvalidRepeatRange,
238 |     UnclosedRepeat,
239 |     UnclosedBrackets,
240 |     ExcessiveRepeatCount,
241 |     OpenEscapeCode,
242 |     UnclosedHexCharacterCode,
243 |     InvalidHexDigit,
244 |     InvalidOctalDigit,
245 |     UnrecognizedEscapeCode,
246 | };
247 | 
248 | pub const ParserOptions = struct {
249 |     // Largest count accepted in a bounded repeat (e.g. {500,1000}).
250 |     // Bounded repeats are unrolled by the engine into individual branches, so
251 |     // an unbounded count would be a vector for memory exhaustion attacks.
252 |     max_repeat_length: usize,
253 | 
254 |     pub fn default() ParserOptions {
255 |         return .{ .max_repeat_length = 1000 };
256 |     }
257 | };
258 | 
259 | /// Parser manages the parsing state and converts a regular expression string into an expression tree.
260 | ///
261 | /// The resulting expression is tied to the Parser which generated it.
262 | pub const Parser = struct {
263 |     // Parse expression stack
264 |     stack: ArrayList(*Expr),
265 |     // ArenaAllocator for generating all expression nodes
266 |     arena: ArenaAllocator,
267 |     // Allocator for temporary lists/items
268 |     allocator: Allocator,
269 |     // Configurable parser options
270 |     options: ParserOptions,
271 |     // Internal execution state.
272 |     it: StringIterator,
273 | 
274 |     pub fn init(a: Allocator) Parser {
275 |         return Parser.initWithOptions(a, ParserOptions.default()); // default options
276 |     }
277 | 
278 |     pub fn initWithOptions(a: Allocator, options: ParserOptions) Parser {
279 |         return .{
280 |             .it = undefined, // assigned at the start of parse()
281 |             .options = options,
282 |             .allocator = a,
283 |             .arena = ArenaAllocator.init(a),
284 |             .stack = ArrayList(*Expr).init(a),
285 |         };
286 |     }
287 | 
288 |     pub fn deinit(p: *Parser) void {
289 |         defer p.arena.deinit(); // runs last: releases every arena-allocated node
290 |         p.stack.deinit();
291 |     }
292 | 
293 |     pub fn reset(p: *Parser) void {
294 |         // Drop all stack entries but keep the buffer for the next parse.
295 |         // (`ArrayList.shrink` is not part of the Zig 0.14 standard library.)
296 |         p.stack.clearRetainingCapacity();
297 |         // Note: A shrink or reset on the ArenaAllocator would be nice.
298 |         p.arena.deinit();
299 |         p.arena = ArenaAllocator.init(p.allocator);
300 |     }
300 | 
301 |     // Pop the top expression off the parse stack.
302 |     //
303 |     // `pop` yields null on an empty list, which is reported to the caller as
304 |     // error.StackUnderflow.
305 |     fn popStack(p: *Parser) !*Expr {
306 |         return p.stack.pop() orelse error.StackUnderflow;
307 |     }
308 | 
309 |     // Pop the top expression and require it to satisfy `Expr.isByteClass`.
310 |     //
311 |     // NOTE: the popped expression is not pushed back when the check fails.
312 |     fn popByteClass(p: *Parser) !*Expr {
313 |         const top = try p.popStack();
314 |         if (!top.isByteClass()) return error.MissingRepeatOperand;
315 |         return top;
316 |     }
317 | 
318 |     // True if `c` is one of the following characters:
319 |     //   \ . + * ? ( ) | [ ] { } ^ $ -
320 |     fn isPunctuation(c: u8) bool {
321 |         const special = "\\.+*?()|[]{}^$-";
322 |         return mem.indexOfScalar(u8, special, c) != null;
323 |     }
324 | 
325 |     fn createExpr(p: *Parser) !*Expr {
326 |         return p.arena.allocator().create(Expr); // node is owned by the parser's arena
327 |     }
328 | 
329 |     pub fn parse(p: *Parser, re: []const u8) !*Expr {
330 |         p.it = StringIterator.init(re);
331 |         // Shorter alias
332 |         var it = &p.it;
333 | 
334 |         while (it.next()) |ch| {
335 |             // TODO: Consolidate some of the same common patterns.
336 |             switch (ch) {
337 |                 '*' => {
338 |                     try p.parseRepeat(0, null);
339 |                 },
340 |                 '+' => {
341 |                     try p.parseRepeat(1, null);
342 |                 },
343 |                 '?' => {
344 |                     try p.parseRepeat(0, 1);
345 |                 },
346 |                 '{' => {
347 |                     it.skipSpaces();
348 | 
349 |                     const min = it.readInt(usize, 10) catch return error.InvalidRepeatArgument;
350 |                     var max: ?usize = min;
351 | 
352 |                     it.skipSpaces();
353 | 
354 |                     if (it.peekIs(',')) {
355 |                         it.bump();
356 |                         it.skipSpaces();
357 | 
358 |                         // {m,} case with infinite upper bound
359 |                         if (it.peekIs('}')) {
360 |                             max = null;
361 |                         }
362 |                         // {m,n} case with explicit bounds
363 |                         else {
364 |                             max = it.readInt(usize, 10) catch return error.InvalidRepeatArgument;
365 | 
366 |                             if (max.? < min) {
367 |                                 return error.InvalidRepeatRange;
368 |                             }
369 |                         }
370 |                     }
371 | 
372 |                     it.skipSpaces();
373 |                     if (!it.peekIs('}')) {
374 |                         return error.UnclosedRepeat;
375 |                     }
376 |                     it.bump();
377 | 
378 |                     // We limit repeat counts to avoid arbitrary memory blowup during compilation
379 |                     const limit = p.options.max_repeat_length;
380 |                     if (min > limit or max != null and max.? > limit) {
381 |                         return error.ExcessiveRepeatCount;
382 |                     }
383 | 
384 |                     try p.parseRepeat(min, max);
385 |                 },
386 |                 '.' => {
387 |                     const r = try p.createExpr();
388 |                     r.* = Expr{ .AnyCharNotNL = undefined };
389 |                     try p.stack.append(r);
390 |                 },
391 |                 '[' => {
392 |                     try p.parseCharClass();
393 |                 },
394 |                 // Don't handle alternation just yet, parentheses group together arguments into
395 |                 // a sub-expression only.
396 |                 '(' => {
397 |                     const r = try p.createExpr();
398 |                     r.* = Expr{ .PseudoLeftParen = undefined };
399 |                     try p.stack.append(r);
400 |                 },
401 |                 ')' => {
402 |                     // Pop the stack until.
403 |                     //
404 |                     // - Empty, error unopened parenthesis.
405 |                     // - ( pseudo operator, push a group expression of the concat
406 |                     // - '|' pop and add the concat to the alternation list. Pop one more item
407 |                     //   after which must be a opening parenthesis.
408 |                     //
409 |                     // '|' ensures there will be only one alternation on the stack here.
410 |                     var concat = ArrayList(*Expr).init(p.arena.allocator());
411 | 
412 |                     while (true) {
413 |                         // would underflow, push a new alternation
414 |                         if (p.stack.items.len == 0) {
415 |                             return error.UnopenedParentheses;
416 |                         }
417 | 
418 |                         const e = p.stack.pop().?;
419 |                         switch (e.*) {
420 |                             // Existing alternation
421 |                             .Alternate => {
422 |                                 mem.reverse(*Expr, concat.items);
423 | 
424 |                                 const ra = try p.createExpr();
425 |                                 if (concat.items.len == 1) {
426 |                                     ra.* = concat.items[0].*;
427 |                                 } else {
428 |                                     ra.* = Expr{ .Concat = concat };
429 |                                 }
430 | 
431 |                                 // append to the alternation stack
432 |                                 try e.Alternate.append(ra);
433 | 
434 |                                 if (p.stack.items.len == 0) {
435 |                                     return error.UnopenedParentheses;
436 |                                 }
437 | 
438 |                                 // pop the left parentheses that must now exist
439 |                                 debug.assert(p.stack.pop().?.* == Expr.PseudoLeftParen);
440 | 
441 |                                 const r = try p.createExpr();
442 |                                 r.* = Expr{ .Capture = e };
443 |                                 try p.stack.append(r);
444 |                                 break;
445 |                             },
446 |                             // Existing parentheses, push new alternation
447 |                             .PseudoLeftParen => {
448 |                                 mem.reverse(*Expr, concat.items);
449 | 
450 |                                 const ra = try p.createExpr();
451 |                                 ra.* = Expr{ .Concat = concat };
452 | 
453 |                                 if (concat.items.len == 0) {
454 |                                     return error.EmptyCaptureGroup;
455 |                                 } else if (concat.items.len == 1) {
456 |                                     ra.* = concat.items[0].*;
457 |                                 } else {
458 |                                     ra.* = Expr{ .Concat = concat };
459 |                                 }
460 | 
461 |                                 const r = try p.createExpr();
462 |                                 r.* = Expr{ .Capture = ra };
463 |                                 try p.stack.append(r);
464 |                                 break;
465 |                             },
466 |                             // New expression, push onto concat stack
467 |                             else => {
468 |                                 try concat.append(e);
469 |                             },
470 |                         }
471 |                     }
472 |                 },
473 |                 '|' => {
474 |                     // Pop the stack until.
475 |                     //
476 |                     // - Empty, then push the sub-expression as a concat.
477 |                     // - ( pseudo operator, leave '(' and push concat.
478 |                     // - '|' is found, pop the existing and add a new alternation to the array.
479 |                     var concat = ArrayList(*Expr).init(p.arena.allocator());
480 | 
481 |                     if (p.stack.items.len == 0 or !p.stack.items[p.stack.items.len - 1].isByteClass()) {
482 |                         return error.EmptyAlternate;
483 |                     }
484 | 
485 |                     while (true) {
486 |                         // would underflow, push a new alternation
487 |                         if (p.stack.items.len == 0) {
488 |                             // We need to create a single expr node for the alternation.
489 |                             const ra = try p.createExpr();
490 |                             mem.reverse(*Expr, concat.items);
491 | 
492 |                             if (concat.items.len == 1) {
493 |                                 ra.* = concat.items[0].*;
494 |                             } else {
495 |                                 ra.* = Expr{ .Concat = concat };
496 |                             }
497 | 
498 |                             var r = try p.createExpr();
499 |                             r.* = Expr{ .Alternate = ArrayList(*Expr).init(p.arena.allocator()) };
500 |                             try r.Alternate.append(ra);
501 |                             try p.stack.append(r);
502 |                             break;
503 |                         }
504 | 
505 |                         const e = p.stack.pop().?;
506 |                         switch (e.*) {
507 |                             // Existing alternation, combine
508 |                             .Alternate => {
509 |                                 mem.reverse(*Expr, concat.items);
510 | 
511 |                                 const ra = try p.createExpr();
512 |                                 if (concat.items.len == 1) {
513 |                                     ra.* = concat.items[0].*;
514 |                                 } else {
515 |                                     ra.* = Expr{ .Concat = concat };
516 |                                 }
517 | 
518 |                                 // use the expression itself
519 |                                 try e.Alternate.append(ra);
520 | 
521 |                                 try p.stack.append(e);
522 |                                 break;
523 |                             },
524 |                             // Existing parentheses, push new alternation
525 |                             .PseudoLeftParen => {
526 |                                 // re-push parentheses marker
527 |                                 try p.stack.append(e);
528 | 
529 |                                 mem.reverse(*Expr, concat.items);
530 | 
531 |                                 const ra = try p.createExpr();
532 |                                 if (concat.items.len == 1) {
533 |                                     ra.* = concat.items[0].*;
534 |                                 } else {
535 |                                     ra.* = Expr{ .Concat = concat };
536 |                                 }
537 | 
538 |                                 var r = try p.createExpr();
539 |                                 r.* = Expr{ .Alternate = ArrayList(*Expr).init(p.arena.allocator()) };
540 |                                 try r.Alternate.append(ra);
541 |                                 try p.stack.append(r);
542 |                                 break;
543 |                             },
544 |                             // New expression, push onto concat stack
545 |                             else => {
546 |                                 try concat.append(e);
547 |                             },
548 |                         }
549 |                     }
550 |                 },
551 |                 '\\' => {
552 |                     const r = try p.parseEscape();
553 |                     try p.stack.append(r);
554 |                 },
555 |                 '^' => {
556 |                     const r = try p.createExpr();
557 |                     r.* = Expr{ .EmptyMatch = Assertion.BeginLine };
558 |                     try p.stack.append(r);
559 |                 },
560 |                 '$' => {
561 |                     const r = try p.createExpr();
562 |                     r.* = Expr{ .EmptyMatch = Assertion.EndLine };
563 |                     try p.stack.append(r);
564 |                 },
565 |                 else => {
566 |                     try p.parseLiteral(ch);
567 |                 },
568 |             }
569 |         }
570 | 
571 |         // special case empty item
572 |         if (p.stack.items.len == 0) {
573 |             const r = try p.createExpr();
574 |             r.* = Expr{ .EmptyMatch = Assertion.None };
575 |             return r;
576 |         }
577 | 
578 |         // special case single item to avoid top-level concat for simple.
579 |         if (p.stack.items.len == 1) {
580 |             return p.stack.pop().?;
581 |         }
582 | 
583 |         // finish a concatenation result
584 |         //
585 |         // This pops items off the stack and concatenates them until:
586 |         //
587 |         // - The stack is empty (the items are concat and pushed and the single result is returned).
588 |         // - An alternation is seen, this is popped and the current concat state is pushed as an
589 |         //   alternation item.
590 |         //
591 |         // After any of these cases, the stack must be empty.
592 |         //
593 |         // There can be no parentheses left on the stack during this popping.
594 |         var concat = ArrayList(*Expr).init(p.arena.allocator());
595 | 
596 |         while (true) {
597 |             if (p.stack.items.len == 0) {
598 |                 // concat the items in reverse order and return
599 |                 mem.reverse(*Expr, concat.items);
600 | 
601 |                 const r = try p.createExpr();
602 |                 if (concat.items.len == 1) {
603 |                     r.* = concat.items[0].*;
604 |                 } else {
605 |                     r.* = Expr{ .Concat = concat };
606 |                 }
607 |                 return r;
608 |             }
609 | 
610 |             // pop an item, check if it is an alternate and not a pseudo left paren
611 |             const e = p.stack.pop().?;
612 |             switch (e.*) {
613 |                 .PseudoLeftParen => {
614 |                     return error.UnclosedParentheses;
615 |                 },
616 |                 // Alternation at top-level, push concat and return
617 |                 .Alternate => {
618 |                     mem.reverse(*Expr, concat.items);
619 | 
620 |                     const ra = try p.createExpr();
621 |                     if (concat.items.len == 1) {
622 |                         ra.* = concat.items[0].*;
623 |                     } else {
624 |                         ra.* = Expr{ .Concat = concat };
625 |                     }
626 | 
627 |                     // use the expression itself
628 |                     try e.Alternate.append(ra);
629 | 
630 |                     // if stack is not empty, this is an error
631 |                     if (p.stack.items.len != 0) {
632 |                         switch (p.stack.pop().?.*) {
633 |                             .PseudoLeftParen => return error.UnclosedParentheses,
634 |                             else => unreachable,
635 |                         }
636 |                     }
637 | 
638 |                     return e;
639 |                 },
640 |                 // New expression, push onto concat stack
641 |                 else => {
642 |                     try concat.append(e);
643 |                 },
644 |             }
645 |         }
646 |     }
647 | 
648 |     fn parseLiteral(p: *Parser, ch: u8) !void {
649 |         const node = try p.createExpr(); // new expression node
650 |         node.* = .{ .Literal = ch }; // matches exactly the byte `ch`
651 |         try p.stack.append(node); // newest entry on the parse stack
652 |     }
653 | 
654 |     /// Build a `Repeat` around the operand obtained from `popByteClass`, using the
655 |     /// bounds `min` and `max`, and push it onto the parse stack. Fails with
656 |     /// `error.MissingRepeatOperand` when no operand can be popped.
657 |     fn parseRepeat(p: *Parser, min: usize, max: ?usize) !void {
658 |         // A '?' right after the repeat operator makes it non-greedy.
659 |         const lazy = p.it.peekIs('?');
660 |         if (lazy) p.it.bump();
661 | 
662 |         // Every popByteClass failure is reported as a missing operand.
663 |         const operand = p.popByteClass() catch return error.MissingRepeatOperand;
664 | 
665 |         const node = try p.createExpr();
666 |         node.* = .{ .Repeat = .{
667 |             .subexpr = operand,
668 |             .min = min,
669 |             .max = max,
670 |             .greedy = !lazy,
671 |         } };
672 |         try p.stack.append(node);
673 |     }
674 | 
675 |     /// Parse a bracketed byte class. The iterator must sit just past the opening '['; on
676 |     /// success it is left just past the closing ']' and one ByteClass expr is pushed.
677 |     /// NOTE: Named classes such as [:alpha:] get no special handling here.
678 |     fn parseCharClass(p: *Parser) !void {
679 |         var it = &p.it;
680 | 
681 |         var class = ByteClass.init(p.arena.allocator());
682 |         errdefer class.deinit();
683 | 
684 |         var negate = false;
685 |         if (it.peekIs('^')) {
686 |             it.bump();
687 |             negate = true;
688 |         }
689 | 
690 |         // A ']' in first position is always treated as a literal. This disallows
691 |         // the empty byte-set '[]'.
692 |         if (it.peekIs(']')) {
693 |             it.bump();
694 | 
695 |             const range = ByteRange{ .min = ']', .max = ']' };
696 |             try class.addRange(range);
697 |         }
698 | 
699 |         while (!it.peekIs(']')) : (it.bump()) {
700 |             // Running out of input before the closing ']' is an error.
701 |             const chp = it.peek() orelse return error.UnclosedBrackets;
702 | 
703 |             // If this is a byte-class escape, we cannot expect a '-' range after it.
704 |             // Accept the following '-' as a literal (may be bad behaviour).
705 |             //
706 |             // Otherwise the byte may start a range and is handled below.
707 |             var range: ByteRange = undefined;
708 | 
709 |             if (chp == '\\') {
710 |                 it.bump();
711 | 
712 |                 // parseEscape returns a literal, byte class or assertion; convert it
713 |                 const r = try p.parseEscape();
714 |                 // NOTE: step back onto the escape's last byte, the loop bumps past it
715 |                 it.index -= 1;
716 |                 switch (r.*) {
717 |                     .Literal => |value| {
718 |                         range = ByteRange{ .min = value, .max = value };
719 |                     },
720 |                     .ByteClass => |*vv| {
721 |                         defer vv.deinit();
722 |                         // '-' doesn't make sense following this, merge class here
723 |                         // and continue next.
724 |                         try class.mergeClass(vv.*);
725 |                         continue;
726 |                     },
727 |                     // \b and \B yield assertions, which cannot be set members. They are
728 |                     // reachable from user input, so report an error rather than trap.
729 |                     else => return error.UnrecognizedEscapeCode,
730 |                 }
731 |             } else {
732 |                 range = ByteRange{ .min = chp, .max = chp };
733 |             }
734 | 
735 |             // is this a range?
736 |             if (it.peekNextIs('-')) {
737 |                 it.bump();
738 |                 it.bump();
739 | 
740 |                 if (it.peek() == null) {
741 |                     return error.UnclosedBrackets;
742 |                 } else if (it.peekIs(']')) {
743 |                     // Trailing '-' as in "[a-]" is a literal: rewind onto the byte before
744 |                     // it so the loop's bump lands on '-' and the next pass adds it.
745 |                     it.index -= 2;
746 |                 } else {
747 |                     range.max = it.peek().?;
748 |                 }
749 |             }
750 | 
751 |             try class.addRange(range);
752 |         }
753 |         it.bump();
754 | 
755 |         if (negate) try class.negate();
756 | 
757 |         const r = try p.createExpr();
758 |         r.* = Expr{ .ByteClass = class };
759 |         try p.stack.append(r);
760 |     }
761 | 
762 |     /// Parse one escape sequence; the iterator must sit just past the introducing '\'.
763 |     ///
764 |     /// Returns a newly created expr: a `Literal` for punctuation, control, octal and hex
765 |     /// escapes, a `ByteClass` for \s \S \w \W \d \D, or an `EmptyMatch` for \b and \B.
766 |     /// Fails with OpenEscapeCode at end of input, UnrecognizedEscapeCode for any other
767 |     /// byte, and InvalidOctalDigit / InvalidHexDigit / UnclosedHexCharacterCode for
768 |     /// malformed numeric escapes.
769 |     fn parseEscape(p: *Parser) !*Expr {
770 |         const ch = p.it.next() orelse return error.OpenEscapeCode;
771 | 
772 |         // Work out the node's value first; the node itself is only created afterwards.
773 |         const parsed: Expr = if (isPunctuation(ch)) Expr{ .Literal = ch } else switch (ch) {
774 |             // escape chars
775 |             'a' => Expr{ .Literal = '\x07' },
776 |             'f' => Expr{ .Literal = '\x0c' },
777 |             'n' => Expr{ .Literal = '\n' },
778 |             'r' => Expr{ .Literal = '\r' },
779 |             't' => Expr{ .Literal = '\t' },
780 |             'v' => Expr{ .Literal = '\x0b' },
781 |             // perl codes
782 |             's' => Expr{ .ByteClass = try ByteClassTemplates.Whitespace(p.arena.allocator()) },
783 |             'S' => Expr{ .ByteClass = try ByteClassTemplates.NonWhitespace(p.arena.allocator()) },
784 |             'w' => Expr{ .ByteClass = try ByteClassTemplates.AlphaNumeric(p.arena.allocator()) },
785 |             'W' => Expr{ .ByteClass = try ByteClassTemplates.NonAlphaNumeric(p.arena.allocator()) },
786 |             'd' => Expr{ .ByteClass = try ByteClassTemplates.Digits(p.arena.allocator()) },
787 |             'D' => Expr{ .ByteClass = try ByteClassTemplates.NonDigits(p.arena.allocator()) },
788 |             '0'...'9' => octal: {
789 |                 // presumably un-consumes `ch` so the integer reader starts at the first digit
790 |                 p.it.bumpBack();
791 | 
792 |                 // octal integer up to 3 digits
793 |                 // TODO: u32 codepoint and not u8
794 |                 const code = p.it.readIntN(u8, 8, 3) catch return error.InvalidOctalDigit;
795 |                 break :octal Expr{ .Literal = code };
796 |             },
797 |             'x' => hex: {
798 |                 p.it.skipSpaces();
799 | 
800 |                 if (!p.it.peekIs('{')) {
801 |                     // '\x23
802 |                     const short = p.it.readIntN(u8, 16, 2) catch return error.InvalidHexDigit;
803 |                     break :hex Expr{ .Literal = short };
804 |                 }
805 | 
806 |                 // '\x{2423}
807 |                 p.it.bump();
808 | 
809 |                 // TODO: u32 codepoint and not u8
810 |                 const braced = p.it.readInt(u8, 16) catch return error.InvalidHexDigit;
811 | 
812 |                 // TODO: Check range as well and if valid unicode codepoint
813 |                 if (!p.it.peekIs('}')) return error.UnclosedHexCharacterCode;
814 |                 p.it.bump();
815 | 
816 |                 break :hex Expr{ .Literal = braced };
817 |             },
818 |             'b' => Expr{ .EmptyMatch = Assertion.WordBoundaryAscii },
819 |             'B' => Expr{ .EmptyMatch = Assertion.NotWordBoundaryAscii },
820 |             else => return error.UnrecognizedEscapeCode,
821 |         };
822 | 
823 |         const node = try p.createExpr();
824 |         node.* = parsed;
825 |         return node;
826 |     }
893 | };
894 | 


--------------------------------------------------------------------------------
/src/parse_test.zig:
--------------------------------------------------------------------------------
  1 | const std = @import("std");
  2 | const debug = std.debug;
  3 | const mem = std.mem;
  4 | const FixedBufferAllocator = std.heap.FixedBufferAllocator;
  5 | 
  6 | const parse = @import("parse.zig");
  7 | const Parser = parse.Parser;
  8 | const Expr = parse.Expr;
  9 | const ParseError = parse.ParseError;
 10 | 
 11 | // Note: Switch to OutStream
 12 | var global_buffer: [2048]u8 = undefined;
 13 | 
 14 | /// Test helper that appends formatted output to a caller-supplied fixed buffer.
 15 | const StaticWriter = struct {
 16 |     /// Backing storage; never grown.
 17 |     buffer: []u8,
 18 |     /// Count of bytes of `buffer` written so far.
 19 |     last: usize,
 20 | 
 21 |     /// Start writing at the beginning of `buffer`.
 22 |     pub fn init(buffer: []u8) StaticWriter {
 23 |         return StaticWriter{
 24 |             .buffer = buffer,
 25 |             .last = 0,
 26 |         };
 27 |     }
 28 | 
 29 |     /// Append `bytes`. Output that does not fit into the remaining space is rejected
 30 |     /// with `error.OutOfMemory` rather than slicing past the end of `buffer`.
 31 |     pub fn writeFn(self: *StaticWriter, bytes: []const u8) Error!usize {
 32 |         if (bytes.len > self.buffer.len - self.last) return error.OutOfMemory;
 33 |         @memcpy(self.buffer[self.last..][0..bytes.len], bytes);
 34 |         self.last += bytes.len;
 35 |         return bytes.len;
 36 |     }
 37 | 
 38 |     pub const Error = error{OutOfMemory};
 39 |     pub const Writer = std.io.Writer(*StaticWriter, Error, writeFn);
 40 | 
 41 |     /// Adapter exposing `writeFn` through the `std.io.Writer` interface.
 42 |     pub fn writer(self: *StaticWriter) Writer {
 43 |         return .{ .context = self };
 44 |     }
 45 | 
 46 |     /// Print `ch` readably: tab, CR and LF as backslash escapes, printable ASCII
 47 |     /// unchanged and every other byte as `0x` followed by lowercase hex digits.
 48 |     pub fn printCharEscaped(self: *StaticWriter, ch: u8) !void {
 49 |         switch (ch) {
 50 |             '\t' => try self.writer().print("\\t", .{}),
 51 |             '\r' => try self.writer().print("\\r", .{}),
 52 |             '\n' => try self.writer().print("\\n", .{}),
 53 |             // printable characters
 54 |             32...126 => try self.writer().print("{c}", .{ch}),
 55 |             else => try self.writer().print("0x{x}", .{ch}),
 56 |         }
 57 |     }
 58 | };
 59 | 
 60 | // Render the expression tree into `global_buffer` and return the written prefix of it.
 61 | fn repr(e: *Expr) ![]u8 {
 62 |     var out = StaticWriter.init(&global_buffer);
 63 |     try reprIndent(&out, e, 0);
 64 |     return global_buffer[0..out.last];
 65 | }
 66 | 
 67 | // Write `e` as one line indented by `indent` spaces, then its children one level deeper.
 68 | fn reprIndent(out: *StaticWriter, e: *Expr, indent: usize) anyerror!void {
 69 |     const w = out.writer();
 70 | 
 71 |     for (0..indent) |_| try w.print(" ", .{});
 72 | 
 73 |     switch (e.*) {
 74 |         .AnyCharNotNL => try w.print("dot\n", .{}),
 75 |         .EmptyMatch => |assertion| try w.print("empty({s})\n", .{@tagName(assertion)}),
 76 |         .Literal => |lit| {
 77 |             try w.print("lit(", .{});
 78 |             try out.printCharEscaped(lit);
 79 |             try w.print(")\n", .{});
 80 |         },
 81 |         .Capture => |subexpr| {
 82 |             try w.print("cap\n", .{});
 83 |             try reprIndent(out, subexpr, indent + 1);
 84 |         },
 85 |         .Repeat => |repeat| {
 86 |             try w.print("rep(", .{});
 87 |             if (repeat.max) |upper| {
 88 |                 // bounded: {0,1} has the shorthand '?', everything else is spelled out
 89 |                 if (repeat.min == 0 and upper == 1) {
 90 |                     try w.print("?", .{});
 91 |                 } else {
 92 |                     try w.print("{{{d},", .{repeat.min});
 93 |                     try w.print("{d}", .{upper});
 94 |                     try w.print("}}", .{});
 95 |                 }
 96 |             } else switch (repeat.min) {
 97 |                 // unbounded: {0,} is '*', {1,} is '+'
 98 |                 0 => try w.print("*", .{}),
 99 |                 1 => try w.print("+", .{}),
100 |                 else => {
101 |                     try w.print("{{{d},", .{repeat.min});
102 |                     try w.print("}}", .{});
103 |                 },
104 |             }
105 | 
106 |             // non-greedy repeats get a trailing '?'
107 |             if (!repeat.greedy) try w.print("?", .{});
108 |             try w.print(")\n", .{});
109 | 
110 |             try reprIndent(out, repeat.subexpr, indent + 1);
111 |         },
112 |         .ByteClass => |class| {
113 |             try w.print("bset(", .{});
114 |             for (class.ranges.items) |r| {
115 |                 try w.print("[", .{});
116 |                 try out.printCharEscaped(r.min);
117 |                 try w.print("-", .{});
118 |                 try out.printCharEscaped(r.max);
119 |                 try w.print("]", .{});
120 |             }
121 |             try w.print(")\n", .{});
122 |         },
123 |         // TODO: Can we get better type unification on enum variants with the same type?
124 |         .Concat => |subexprs| {
125 |             try w.print("cat\n", .{});
126 |             for (subexprs.items) |s| try reprIndent(out, s, indent + 1);
127 |         },
128 |         .Alternate => |subexprs| {
129 |             try w.print("alt\n", .{});
130 |             for (subexprs.items) |s| try reprIndent(out, s, indent + 1);
131 |         },
132 |         // NOTE: Shouldn't occur ever in returned output.
133 |         .PseudoLeftParen => try w.print("{s}\n", .{@tagName(e.*)}),
134 |     }
135 | }
140 | 
141 | // Parse `re` and panic with a readable report unless the rendered tree equals
142 | // `expected_ast`; spaces and newlines at either end of both texts are ignored.
143 | fn check(re: []const u8, expected_ast: []const u8) void {
144 |     var parser = Parser.init(std.testing.allocator);
145 |     defer parser.deinit();
146 | 
147 |     const tree = parser.parse(re) catch unreachable;
148 |     const rendered = repr(tree) catch unreachable;
149 | 
150 |     const padding = " \n";
151 |     const got = mem.trim(u8, rendered, padding);
152 |     const want = mem.trim(u8, expected_ast, padding);
153 |     if (mem.eql(u8, got, want)) return;
154 | 
155 |     debug.print(
156 |         \\
157 |         \\-- parsed the regex
158 |         \\
159 |         \\{s}
160 |         \\
161 |         \\-- expected the following
162 |         \\
163 |         \\{s}
164 |         \\
165 |         \\-- but instead got
166 |         \\
167 |         \\{s}
168 |         \\
169 |     , .{ re, want, got });
170 | 
171 |     @panic("assertion failure");
172 | }
176 | 
177 | // These are taken off rust-regex for the moment.
178 | test "parse simple" { // literals, repeats, groups, alternation and dot
179 |     check( // empty pattern
180 |         \\
181 |     ,
182 |         \\empty(None)
183 |     );
184 | 
185 |     check( // single literal
186 |         \\a
187 |     ,
188 |         \\lit(a)
189 |     );
190 | 
191 |     check( // adjacent items form a concatenation
192 |         \\ab
193 |     ,
194 |         \\cat
195 |         \\ lit(a)
196 |         \\ lit(b)
197 |     );
198 | 
199 |     check( // line-start anchor
200 |         \\^a
201 |     ,
202 |         \\cat
203 |         \\ empty(BeginLine)
204 |         \\ lit(a)
205 |     );
206 | 
207 |     check( // optional
208 |         \\a?
209 |     ,
210 |         \\rep(?)
211 |         \\ lit(a)
212 |     );
213 | 
214 |     check( // a repeat applies to the preceding item only
215 |         \\ab?
216 |     ,
217 |         \\cat
218 |         \\ lit(a)
219 |         \\ rep(?)
220 |         \\  lit(b)
221 |     );
222 | 
223 |     check( // non-greedy optional
224 |         \\a??
225 |     ,
226 |         \\rep(??)
227 |         \\ lit(a)
228 |     );
229 | 
230 |     check( // one or more
231 |         \\a+
232 |     ,
233 |         \\rep(+)
234 |         \\ lit(a)
235 |     );
236 | 
237 |     check( // non-greedy one or more
238 |         \\a+?
239 |     ,
240 |         \\rep(+?)
241 |         \\ lit(a)
242 |     );
243 | 
244 |     check( // non-greedy zero or more
245 |         \\a*?
246 |     ,
247 |         \\rep(*?)
248 |         \\ lit(a)
249 |     );
250 | 
251 |     check( // exact count
252 |         \\a{5}
253 |     ,
254 |         \\rep({5,5})
255 |         \\ lit(a)
256 |     );
257 | 
258 |     check( // lower bound only
259 |         \\a{5,}
260 |     ,
261 |         \\rep({5,})
262 |         \\ lit(a)
263 |     );
264 | 
265 |     check( // lower and upper bound
266 |         \\a{5,10}
267 |     ,
268 |         \\rep({5,10})
269 |         \\ lit(a)
270 |     );
271 | 
272 |     check( // non-greedy exact count
273 |         \\a{5}?
274 |     ,
275 |         \\rep({5,5}?)
276 |         \\ lit(a)
277 |     );
278 | 
279 |     check( // non-greedy lower bound only
280 |         \\a{5,}?
281 |     ,
282 |         \\rep({5,}?)
283 |         \\ lit(a)
284 |     );
285 | 
286 |     check( // spaces inside the braces are accepted
287 |         \\a{ 5     }
288 |     ,
289 |         \\rep({5,5})
290 |         \\ lit(a)
291 |     );
292 | 
293 |     check( // capture group around one item
294 |         \\(a)
295 |     ,
296 |         \\cap
297 |         \\ lit(a)
298 |     );
299 | 
300 |     check( // capture group around a concatenation
301 |         \\(ab)
302 |     ,
303 |         \\cap
304 |         \\ cat
305 |         \\  lit(a)
306 |         \\  lit(b)
307 |     );
308 | 
309 |     check( // top-level alternation
310 |         \\a|b
311 |     ,
312 |         \\alt
313 |         \\ lit(a)
314 |         \\ lit(b)
315 |     );
316 | 
317 |     check( // three branches end up in one flat alt node
318 |         \\a|b|c
319 |     ,
320 |         \\alt
321 |         \\ lit(a)
322 |         \\ lit(b)
323 |         \\ lit(c)
324 |     );
325 | 
326 |     check( // alternation inside a group
327 |         \\(a|b)
328 |     ,
329 |         \\cap
330 |         \\ alt
331 |         \\  lit(a)
332 |         \\  lit(b)
333 |     );
334 | 
335 |     check( // three branches inside a group
336 |         \\(a|b|c)
337 |     ,
338 |         \\cap
339 |         \\ alt
340 |         \\  lit(a)
341 |         \\  lit(b)
342 |         \\  lit(c)
343 |     );
344 | 
345 |     check( // each multi-byte branch becomes its own cat
346 |         \\(ab|bc|cd)
347 |     ,
348 |         \\cap
349 |         \\ alt
350 |         \\  cat
351 |         \\   lit(a)
352 |         \\   lit(b)
353 |         \\  cat
354 |         \\   lit(b)
355 |         \\   lit(c)
356 |         \\  cat
357 |         \\   lit(c)
358 |         \\   lit(d)
359 |     );
360 | 
361 |     check( // nested groups as alternation branches
362 |         \\(ab|(bc|(cd)))
363 |     ,
364 |         \\cap
365 |         \\ alt
366 |         \\  cat
367 |         \\   lit(a)
368 |         \\   lit(b)
369 |         \\  cap
370 |         \\   alt
371 |         \\    cat
372 |         \\     lit(b)
373 |         \\     lit(c)
374 |         \\    cap
375 |         \\     cat
376 |         \\      lit(c)
377 |         \\      lit(d)
378 |     );
379 | 
380 |     check( // dot
381 |         \\.
382 |     ,
383 |         \\dot
384 |     );
385 | }
386 | 
387 | test "parse escape" { // control, punctuation, octal and hex escapes
388 |     check( // control-character escapes
389 |         \\\a\f\t\n\r\v
390 |     ,
391 |         \\cat
392 |         \\ lit(0x7)
393 |         \\ lit(0xc)
394 |         \\ lit(\t)
395 |         \\ lit(\n)
396 |         \\ lit(\r)
397 |         \\ lit(0xb)
398 |     );
399 | 
400 |     check( // escaped punctuation stands for itself
401 |         \\\\\.\+\*\?\(\)\|\[\]\{\}\^\$
402 |     ,
403 |         \\cat
404 |         \\ lit(\)
405 |         \\ lit(.)
406 |         \\ lit(+)
407 |         \\ lit(*)
408 |         \\ lit(?)
409 |         \\ lit(()
410 |         \\ lit())
411 |         \\ lit(|)
412 |         \\ lit([)
413 |         \\ lit(])
414 |         \\ lit({)
415 |         \\ lit(})
416 |         \\ lit(^)
417 |         \\ lit($)
418 |     );
419 | 
420 |     check("\\123", // three octal digits: 0o123 is 'S'
421 |         \\lit(S)
422 |     );
423 | 
424 |     check("\\1234", // a fourth digit is a separate literal
425 |         \\cat
426 |         \\ lit(S)
427 |         \\ lit(4)
428 |     );
429 | 
430 |     check("\\x53", // two hex digits: 0x53 is 'S'
431 |         \\lit(S)
432 |     );
433 | 
434 |     check("\\x534", // a third digit is a separate literal
435 |         \\cat
436 |         \\ lit(S)
437 |         \\ lit(4)
438 |     );
439 | 
440 |     check("\\x{53}", // braced hex form
441 |         \\lit(S)
442 |     );
443 | 
444 |     check("\\x{53}4", // the closing brace ends the hex code
445 |         \\cat
446 |         \\ lit(S)
447 |         \\ lit(4)
448 |     );
449 | }
450 | 
451 | test "parse character classes" { // bracket sets, negation and literal ']' / '-' placement
452 |     check( // single member
453 |         \\[a]
454 |     ,
455 |         \\bset([a-a])
456 |     );
457 | 
458 |     check( // hex escape as a member
459 |         \\[\x00]
460 |     ,
461 |         \\bset([0x0-0x0])
462 |     );
463 | 
464 |     check( // control escape as a member
465 |         \\[\n]
466 |     ,
467 |         \\bset([\n-\n])
468 |     );
469 | 
470 |     check( // negation yields the complementary ranges
471 |         \\[^a]
472 |     ,
473 |         \\bset([0x0-`][b-0xff])
474 |     );
475 | 
476 |     check( // negating the lowest byte leaves one range
477 |         \\[^\x00]
478 |     ,
479 |         \\bset([0x1-0xff])
480 |     );
481 | 
482 |     check( // negated newline
483 |         \\[^\n]
484 |     ,
485 |         \\bset([0x0-\t][0xb-0xff])
486 |     );
487 | 
488 |     check( // a ']' in first position is a literal member
489 |         \\[]]
490 |     ,
491 |         \\bset([]-]])
492 |     );
493 | 
494 |     check( // leading ']' followed by an escaped '['
495 |         \\[]\[]
496 |     ,
497 |         \\bset([[-[][]-]])
498 |     );
499 | 
500 |     check( // the first ']' after the escaped '[' closes the set
501 |         \\[\[]]
502 |     ,
503 |         \\cat
504 |         \\ bset([[-[])
505 |         \\ lit(])
506 |     );
507 | 
508 |     check( // '-' right before the closing ']' is a literal member
509 |         \\[]-]
510 |     ,
511 |         \\bset([---][]-]])
512 |     );
513 | 
514 |     check( // leading '-' is a literal; the set closes at the first ']'
515 |         \\[-]]
516 |     ,
517 |         \\cat
518 |         \\ bset([---])
519 |         \\ lit(])
520 |     );
521 | }
522 | 
523 | fn checkError(re: []const u8, expected_err: ParseError) void {
524 |     var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
525 |     defer arena.deinit();
526 |     var parser = Parser.init(arena.allocator());
527 | 
528 |     // Failure path first: the expected error returns quietly, any other error is reported.
529 |     const expr = parser.parse(re) catch |found_err| {
530 |         if (found_err == expected_err) return;
531 | 
532 |         debug.print(
533 |             \\
534 |             \\-- parsed the regex
535 |             \\
536 |             \\{s}
537 |             \\
538 |             \\-- expected the following
539 |             \\
540 |             \\{s}
541 |             \\
542 |             \\-- but instead got
543 |             \\
544 |             \\{s}
545 |             \\
546 |             \\
547 |         , .{
548 |             re,
549 |             @errorName(expected_err),
550 |             @errorName(found_err),
551 |         });
552 | 
553 |         @panic("assertion failure");
554 |     };
555 | 
556 |     // Reaching this point means the parse succeeded although an error was expected.
557 |     const dump = repr(expr) catch unreachable;
558 |     const shown = mem.trim(u8, dump, " \n");
559 | 
560 |     debug.print(
561 |         \\
562 |         \\-- parsed the regex
563 |         \\
564 |         \\{s}
565 |         \\
566 |         \\-- expected the following
567 |         \\
568 |         \\{s}
569 |         \\
570 |         \\-- but instead got
571 |         \\
572 |         \\{s}
573 |         \\
574 |         \\
575 |     , .{
576 |         re,
577 |         @errorName(expected_err),
578 |         shown,
579 |     });
580 | 
581 |     @panic("assertion failure");
582 | }
583 | 
584 | test "parse errors repeat" { // a repeat operator needs something to repeat
585 |     checkError(
586 |         \\*
587 |     , ParseError.MissingRepeatOperand);
588 |     // An opening parenthesis does not count as an operand.
589 |     checkError(
590 |         \\(*
591 |     , ParseError.MissingRepeatOperand);
592 |     // Counted repeats need an operand as well.
593 |     checkError(
594 |         \\({5}
595 |     , ParseError.MissingRepeatOperand);
596 | 
597 |     checkError(
598 |         \\{5}
599 |     , ParseError.MissingRepeatOperand);
600 |     // A repeat cannot directly follow another repeat or a '|'.
601 |     checkError(
602 |         \\a**
603 |     , ParseError.MissingRepeatOperand);
604 | 
605 |     checkError(
606 |         \\a|*
607 |     , ParseError.MissingRepeatOperand);
608 | 
609 |     checkError(
610 |         \\a*{5}
611 |     , ParseError.MissingRepeatOperand);
612 | 
613 |     checkError(
614 |         \\a|{5}
615 |     , ParseError.MissingRepeatOperand);
616 |     // Malformed or unterminated {m,n} arguments.
617 |     checkError(
618 |         \\a{}
619 |     , ParseError.InvalidRepeatArgument);
620 | 
621 |     checkError(
622 |         \\a{5
623 |     , ParseError.UnclosedRepeat);
624 | 
625 |     checkError(
626 |         \\a{xyz
627 |     , ParseError.InvalidRepeatArgument);
628 | 
629 |     checkError(
630 |         \\a{12,xyz
631 |     , ParseError.InvalidRepeatArgument);
632 |     // Oversized counts are rejected with ExcessiveRepeatCount.
633 |     checkError(
634 |         \\a{999999999999}
635 |     , ParseError.ExcessiveRepeatCount);
636 | 
637 |     checkError(
638 |         \\a{1,999999999999}
639 |     , ParseError.ExcessiveRepeatCount);
640 |     // A non-digit between a valid number and the closing brace is reported as UnclosedRepeat.
641 |     checkError(
642 |         \\a{12x}
643 |     , ParseError.UnclosedRepeat);
644 | 
645 |     checkError(
646 |         \\a{1,12x}
647 |     , ParseError.UnclosedRepeat);
648 | }
649 | 
650 | test "parse errors alternate" { // covers empty alternation branches and unbalanced parentheses
651 |     checkError(
652 |         \\|a
653 |     , ParseError.EmptyAlternate);
654 | 
655 |     checkError(
656 |         \\(|a)
657 |     , ParseError.EmptyAlternate);
658 | 
659 |     checkError(
660 |         \\a||
661 |     , ParseError.EmptyAlternate);
662 |     // A ')' without a matching '('.
663 |     checkError(
664 |         \\)
665 |     , ParseError.UnopenedParentheses);
666 | 
667 |     checkError(
668 |         \\ab)
669 |     , ParseError.UnopenedParentheses);
670 | 
671 |     checkError(
672 |         \\a|b)
673 |     , ParseError.UnopenedParentheses);
674 |     // A '(' that is never closed.
675 |     checkError(
676 |         \\(a|b
677 |     , ParseError.UnclosedParentheses);
678 |     // NOTE(review): the commented-out cases here and below are disabled; the reason is not recorded.
679 |     //checkError(
680 |     //    \\(a|)
681 |     //,
682 |     //    ParseError.UnopenedParentheses
683 |     //);
684 | 
685 |     //checkError(
686 |     //    \\()
687 |     //,
688 |     //    ParseError.UnopenedParentheses
689 |     //);
690 | 
691 |     checkError(
692 |         \\ab(xy
693 |     , ParseError.UnclosedParentheses);
694 | 
695 |     //checkError(
696 |     //    \\()
697 |     //,
698 |     //    ParseError.UnopenedParentheses
699 |     //);
700 | 
701 |     //checkError(
702 |     //    \\a|
703 |     //,
704 |     //    ParseError.UnbalancedParentheses
705 |     //);
706 | }
707 | 
708 | test "parse errors escape" {
709 |     checkError("\\", ParseError.OpenEscapeCode); // a lone trailing backslash
710 |     // An escape letter the parser does not know.
711 |     checkError("\\m", ParseError.UnrecognizedEscapeCode);
712 |     // \x with no hex digit after it.
713 |     checkError("\\x", ParseError.InvalidHexDigit);
714 | 
715 |     //checkError(
716 |     //    "\\xA"
717 |     //,
718 |     //    ParseError.UnrecognizedEscapeCode
719 |     //);
720 | 
721 |     //checkError(
722 |     //    "\\xAG"
723 |     //,
724 |     //    ParseError.UnrecognizedEscapeCode
725 |     //);
726 |     // Braced form: no digits, no closing brace, or a non-hex character before '}'.
727 |     checkError("\\x{", ParseError.InvalidHexDigit);
728 | 
729 |     checkError("\\x{A", ParseError.UnclosedHexCharacterCode);
730 | 
731 |     checkError("\\x{AG}", ParseError.UnclosedHexCharacterCode);
732 |     // 0xD800 is a UTF-16 surrogate and 0x110000 lies above U+10FFFF; both are rejected.
733 |     checkError("\\x{D800}", ParseError.InvalidHexDigit);
734 | 
735 |     checkError("\\x{110000}", ParseError.InvalidHexDigit);
736 |     // A value with many more digits is rejected with the same error.
737 |     checkError("\\x{99999999999999}", ParseError.InvalidHexDigit);
738 | }
739 | 
740 | test "parse errors character class" { // starts with classes that are never closed
741 |     checkError(
742 |         \\[
743 |     , ParseError.UnclosedBrackets);
744 | 
745 |     checkError(
746 |         \\[^
747 |     , ParseError.UnclosedBrackets);
748 | 
749 |     checkError(
750 |         \\[a
751 |     , ParseError.UnclosedBrackets);
752 | 
753 |     checkError(
754 |         \\[^a
755 |     , ParseError.UnclosedBrackets);
756 | 
757 |     checkError(
758 |         \\[a-
759 |     , ParseError.UnclosedBrackets);
760 | 
761 |     checkError(
762 |         \\[^a-
763 |     , ParseError.UnclosedBrackets);
764 | 
765 |     checkError(
766 |         \\[---
767 |     , ParseError.UnclosedBrackets);
768 |     // An unknown escape inside a class.
769 |     checkError(
770 |         \\[\A]
771 |     , ParseError.UnrecognizedEscapeCode);
772 | 
773 |     //checkError(
774 |     //    \\[a-\d]
775 |     //,
776 |     //    ParseError.UnclosedBrackets
777 |     //);
778 | 
779 |     //checkError(
780 |     //    \\[a-\A]
781 |     //,
782 |     //    ParseError.UnrecognizedEscapeCode
783 |     //);
784 |     // The same error when the unknown escape is the lower bound of a range.
785 |     checkError(
786 |         \\[\A-a]
787 |     , ParseError.UnrecognizedEscapeCode);
788 | 
789 |     //checkError(
790 |     //    \\[z-a]
791 |     //,
792 |     //    ParseError.UnclosedBrackets
793 |     //);
794 |     // With ']' directly after '[' or '[^' these inputs are reported as unclosed.
795 |     checkError(
796 |         \\[]
797 |     , ParseError.UnclosedBrackets);
798 | 
799 |     checkError(
800 |         \\[^]
801 |     , ParseError.UnclosedBrackets);
802 | 
803 |     //checkError(
804 |     //    \\[^\d\D]
805 |     //,
806 |     //    ParseError.UnclosedBrackets
807 |     //);
808 | 
809 |     //checkError(
810 |     //    \\[+--]
811 |     //,
812 |     //    ParseError.UnclosedBrackets
813 |     //);
814 | 
815 |     //checkError(
816 |     //    \\[a-a--\xFF]
817 |     //,
818 |     //    ParseError.UnclosedBrackets
819 |     //);
820 | }
821 | 


--------------------------------------------------------------------------------
/src/range_set.zig:
--------------------------------------------------------------------------------
  1 | // A set of ordered disconnected non-empty ranges. These are stored in a flat array as opposed
  2 | // to a tree structure. Insertions maintain order by rearranging as needed. Asymptotically
  3 | // worse than a tree range-set but given the size of the typical range-sets we work with this
  4 | // implementation is undoubtedly quicker.
  5 | 
  6 | const std = @import("std");
  7 | const debug = std.debug;
  8 | const mem = std.mem;
  9 | const math = std.math;
 10 | const Allocator = std.mem.Allocator;
 11 | const ArrayList = std.ArrayList;
 12 | 
 13 | // One inclusive interval [min, max] over T.
 14 | pub fn Range(comptime T: type) type {
 15 |     return struct {
 16 |         min: T,
 17 |         max: T,
 18 |         // Build [min, max]; asserts that the bounds are ordered.
 19 |         pub fn new(min: T, max: T) Range(T) {
 20 |             debug.assert(max >= min);
 21 |             return .{ .min = min, .max = max };
 22 |         }
 23 |         // Build the one-element range [item, item].
 24 |         pub fn single(item: T) Range(T) {
 25 |             return .{ .min = item, .max = item };
 26 |         }
 27 |     };
 28 | }
 29 | 
 30 | // An ordered set of disjoint inclusive ranges; handles merging on insertion and in-place negation.
 31 | pub fn RangeSet(comptime T: type) type {
 32 |     return struct {
 33 |         const Self = @This();
 34 |         const RangeType = Range(T);
 35 | 
 36 |         // for any consecutive x, y in ranges, the following hold:
 37 |         //  - x.min <= x.max
 38 |         //  - x.max < y.min
 39 |         ranges: ArrayList(RangeType),
 40 | 
 41 |         pub fn init(a: Allocator) Self {
 42 |             return Self{ .ranges = ArrayList(RangeType).init(a) };
 43 |         }
 44 | 
 45 |         pub fn deinit(self: *Self) void {
 46 |             self.ranges.deinit();
 47 |         }
 48 | 
 49 |         // Copy of this set made with the list's own clone().
 50 |         pub fn clone(self: Self) !Self {
 51 |             return Self{ .ranges = try self.ranges.clone() };
 52 |         }
 53 | 
 54 |         // Copy of this set whose storage is allocated from `a`.
 55 |         pub fn dupe(self: Self, a: Allocator) !Self {
 56 |             var cloned = try ArrayList(RangeType).initCapacity(a, self.ranges.items.len);
 57 |             cloned.appendSliceAssumeCapacity(self.ranges.items);
 58 |             return Self{ .ranges = cloned };
 59 |         }
 60 | 
 61 |         // Add a range into the current class, preserving the structure invariants.
 62 |         pub fn addRange(self: *Self, range: RangeType) !void {
 63 |             var ranges = &self.ranges;
 64 | 
 65 |             if (ranges.items.len == 0) {
 66 |                 try ranges.append(range);
 67 |                 return;
 68 |             }
 69 | 
 70 |             // Insert before the first range that does not start below the new one.
 71 |             for (ranges.items, 0..) |r, i| {
 72 |                 if (range.min <= r.min) {
 73 |                     try ranges.insert(i, range);
 74 |                     break;
 75 |                 }
 76 |             } else {
 77 |                 try ranges.append(range);
 78 |             }
 79 | 
 80 |             // Merge overlapping runs.
 81 |             var index: usize = 0;
 82 |             var merge = ranges.items[0];
 83 | 
 84 |             for (ranges.items[1..]) |r| {
 85 |                 // Overlap (or directly adjacent)
 86 |                 const upper = math.add(T, merge.max, 1) catch math.maxInt(T);
 87 |                 if (r.min <= upper) {
 88 |                     merge.max = @max(merge.max, r.max);
 89 |                 }
 90 |                 // No overlap
 91 |                 else {
 92 |                     ranges.items[index] = merge;
 93 |                     merge = r;
 94 |                     index += 1;
 95 |                 }
 96 |             }
 97 | 
 98 |             ranges.items[index] = merge;
 99 |             index += 1;
100 |             ranges.shrinkRetainingCapacity(index);
101 |         }
102 | 
103 |         // Merge two classes into one.
104 |         pub fn mergeClass(self: *Self, other: Self) !void {
105 |             for (other.ranges.items) |r| {
106 |                 try self.addRange(r);
107 |             }
108 |         }
109 | 
110 |         // Replace the set with its complement over the whole range of T, in place:
111 |         // afterwards contains(v) is true exactly where it was false before.
112 |         // The complement is appended behind the existing ranges and then moved to the
113 |         // front, which avoids building a temporary range set.
114 |         pub fn negate(self: *Self) !void {
115 |             const ranges = &self.ranges;
116 |             const old_len = ranges.items.len;
117 | 
118 |             if (old_len == 0) {
119 |                 try ranges.append(RangeType.new(math.minInt(T), math.maxInt(T)));
120 |                 return;
121 |             }
122 | 
123 |             // n ranges have at most n + 1 gaps. Reserve them all now: growing the list
124 |             // inside the loop could reallocate the buffer the loop is still reading.
125 |             try ranges.ensureUnusedCapacity(old_len + 1);
126 | 
127 |             var low: T = math.minInt(T);
128 |             for (ranges.items[0..old_len]) |r| {
129 |                 // Only the first range can start at minInt(T); it has no gap below it.
130 |                 if (r.min != math.minInt(T)) {
131 |                     ranges.appendAssumeCapacity(RangeType.new(low, r.min - 1));
132 |                 }
133 | 
134 |                 low = math.add(T, r.max, 1) catch math.maxInt(T);
135 |             }
136 | 
137 |             // Whatever lies above the last range is the final gap.
138 |             if (ranges.items[old_len - 1].max != math.maxInt(T)) {
139 |                 ranges.appendAssumeCapacity(RangeType.new(low, math.maxInt(T)));
140 |             }
141 | 
142 |             std.mem.copyForwards(RangeType, ranges.items, ranges.items[old_len..]);
143 |             ranges.shrinkRetainingCapacity(ranges.items.len - old_len);
144 |         }
145 | 
146 |         pub fn contains(self: Self, value: T) bool {
147 |             // TODO: Binary search required for large unicode sets.
148 |             for (self.ranges.items) |range| {
149 |                 if (range.min <= value and value <= range.max) {
150 |                     return true;
151 |                 }
152 |             }
153 |             return false;
154 |         }
155 |     };
156 | }
157 | 
158 | pub const ByteClassTemplates = struct {
159 |     const ByteRange = Range(u8);
160 |     const ByteClass = RangeSet(u8);
161 | 
162 |     // Build a class containing `spans`, added in the order given.
163 |     fn fromRanges(a: Allocator, spans: []const ByteRange) !ByteClass {
164 |         var class = ByteClass.init(a);
165 |         errdefer class.deinit();
166 | 
167 |         for (spans) |span| try class.addRange(span);
168 |         return class;
169 |     }
170 | 
171 |     // Take ownership of `positive` and turn it into its complement; `positive`
172 |     // is released if the negation fails.
173 |     fn complementOf(positive: ByteClass) !ByteClass {
174 |         var class = positive;
175 |         errdefer class.deinit();
176 | 
177 |         try class.negate();
178 |         return class;
179 |     }
180 | 
181 |     // \t, \n, \v, \f, \r and ' '
182 |     pub fn Whitespace(a: Allocator) !ByteClass {
183 |         return fromRanges(a, &[_]ByteRange{
184 |             ByteRange.new('\x09', '\x0D'),
185 |             ByteRange.single(' '),
186 |         });
187 |     }
188 | 
189 |     // Every byte that Whitespace does not contain.
190 |     pub fn NonWhitespace(a: Allocator) !ByteClass {
191 |         return complementOf(try Whitespace(a));
192 |     }
193 | 
194 |     // 0-9, A-Z and a-z
195 |     pub fn AlphaNumeric(a: Allocator) !ByteClass {
196 |         return fromRanges(a, &[_]ByteRange{
197 |             ByteRange.new('0', '9'),
198 |             ByteRange.new('A', 'Z'),
199 |             ByteRange.new('a', 'z'),
200 |         });
201 |     }
202 | 
203 |     // Every byte that AlphaNumeric does not contain.
204 |     pub fn NonAlphaNumeric(a: Allocator) !ByteClass {
205 |         return complementOf(try AlphaNumeric(a));
206 |     }
207 | 
208 |     // 0-9
209 |     pub fn Digits(a: Allocator) !ByteClass {
210 |         return fromRanges(a, &[_]ByteRange{ByteRange.new('0', '9')});
211 |     }
212 | 
213 |     // Every byte that Digits does not contain.
214 |     pub fn NonDigits(a: Allocator) !ByteClass {
215 |         return complementOf(try Digits(a));
216 |     }
217 | };
218 | 
219 | test "class simple" {
220 |     var a = RangeSet(u8).init(std.testing.allocator);
221 |     defer a.deinit();
222 |     try a.addRange(Range(u8).new(0, 54));
223 | 
224 |     // Both bounds are inclusive. expect() fails the test in every build mode.
225 |     try std.testing.expect(a.contains(0));
226 |     try std.testing.expect(a.contains(23));
227 |     try std.testing.expect(a.contains(54));
228 |     try std.testing.expect(!a.contains(58));
229 | }
230 | 
231 | test "class simple negate" {
232 |     var a = RangeSet(u8).init(std.testing.allocator);
233 |     defer a.deinit();
234 |     try a.addRange(Range(u8).new(0, 54));
235 | 
236 |     // Before negation the set is exactly 0..54.
237 |     try std.testing.expect(a.contains(0));
238 |     try std.testing.expect(a.contains(23));
239 |     try std.testing.expect(a.contains(54));
240 |     try std.testing.expect(!a.contains(58));
241 | 
242 |     try a.negate();
243 |     // Match the negation
244 | 
245 |     try std.testing.expect(!a.contains(0));
246 |     try std.testing.expect(!a.contains(23));
247 |     try std.testing.expect(!a.contains(54));
248 |     try std.testing.expect(a.contains(55));
249 |     try std.testing.expect(a.contains(58));
250 | 
251 |     try a.negate();
252 |     // Negating twice restores the original set
253 | 
254 |     try std.testing.expect(a.contains(0));
255 |     try std.testing.expect(a.contains(23));
256 |     try std.testing.expect(a.contains(54));
257 |     try std.testing.expect(!a.contains(58));
258 | }
259 | 
260 | test "class multiple" {
261 |     var a = RangeSet(u8).init(std.testing.allocator);
262 |     defer a.deinit();
263 |     try a.addRange(Range(u8).new(0, 20));
264 |     try a.addRange(Range(u8).new(80, 100));
265 |     try a.addRange(Range(u8).new(230, 255));
266 | 
267 |     // Probe values at and just outside the range boundaries.
268 |     try std.testing.expect(a.contains(20));
269 |     try std.testing.expect(!a.contains(21));
270 |     try std.testing.expect(!a.contains(79));
271 |     try std.testing.expect(a.contains(80));
272 |     try std.testing.expect(!a.contains(229));
273 |     try std.testing.expect(a.contains(230));
274 |     try std.testing.expect(a.contains(255));
275 | }
276 | 
277 | test "class multiple negated" {
278 |     var a = RangeSet(u8).init(std.testing.allocator);
279 |     defer a.deinit();
280 |     try a.addRange(Range(u8).new(0, 20));
281 |     try a.addRange(Range(u8).new(80, 100));
282 |     try a.addRange(Range(u8).new(230, 255));
283 | 
284 |     // The same seven probes are checked before, after one and after two negations.
285 |     try std.testing.expect(a.contains(20));
286 |     try std.testing.expect(!a.contains(21));
287 |     try std.testing.expect(!a.contains(79));
288 |     try std.testing.expect(a.contains(80));
289 |     try std.testing.expect(!a.contains(229));
290 |     try std.testing.expect(a.contains(230));
291 |     try std.testing.expect(a.contains(255));
292 | 
293 |     try a.negate();
294 | 
295 |     try std.testing.expect(!a.contains(20));
296 |     try std.testing.expect(a.contains(21));
297 |     try std.testing.expect(a.contains(79));
298 |     try std.testing.expect(!a.contains(80));
299 |     try std.testing.expect(a.contains(229));
300 |     try std.testing.expect(!a.contains(230));
301 |     try std.testing.expect(!a.contains(255));
302 | 
303 |     try a.negate();
304 | 
305 |     try std.testing.expect(a.contains(20));
306 |     try std.testing.expect(!a.contains(21));
307 |     try std.testing.expect(!a.contains(79));
308 |     try std.testing.expect(a.contains(80));
309 |     try std.testing.expect(!a.contains(229));
310 |     try std.testing.expect(a.contains(230));
311 |     try std.testing.expect(a.contains(255));
312 | }
313 | 
314 | test "class out of order" {
315 |     var a = RangeSet(u8).init(std.testing.allocator);
316 |     defer a.deinit();
317 |     try a.addRange(Range(u8).new(80, 100));
318 |     try a.addRange(Range(u8).new(20, 30));
319 | 
320 |     // The higher range is added first; membership must not depend on insertion order.
321 |     try std.testing.expect(a.contains(80));
322 |     try std.testing.expect(!a.contains(79));
323 |     try std.testing.expect(!a.contains(101));
324 |     try std.testing.expect(!a.contains(45));
325 |     try std.testing.expect(!a.contains(19));
326 | }
327 | 
328 | test "class merging" {
329 |     var a = RangeSet(u8).init(std.testing.allocator);
330 |     defer a.deinit();
331 |     try a.addRange(Range(u8).new(20, 100));
332 |     try a.addRange(Range(u8).new(50, 80));
333 |     try a.addRange(Range(u8).new(50, 140));
334 | 
335 |     // The three overlapping ranges together cover 20..140.
336 |     try std.testing.expect(!a.contains(19));
337 |     try std.testing.expect(a.contains(20));
338 |     try std.testing.expect(a.contains(80));
339 |     try std.testing.expect(a.contains(140));
340 |     try std.testing.expect(!a.contains(141));
341 | }
342 | 
343 | test "class merging boundary" {
344 |     var a = RangeSet(u8).init(std.testing.allocator);
345 |     defer a.deinit();
346 |     try a.addRange(Range(u8).new(20, 40));
347 |     try a.addRange(Range(u8).new(40, 60));
348 | 
349 |     // Ranges sharing the endpoint 40 collapse into a single stored range.
350 |     try std.testing.expectEqual(@as(usize, 1), a.ranges.items.len);
351 | }
352 | 
353 | test "class merging adjacent" {
354 |     var a = RangeSet(u8).init(std.testing.allocator);
355 |     defer a.deinit();
356 |     try a.addRange(Range(u8).new(56, 56));
357 |     try a.addRange(Range(u8).new(57, 57));
358 |     try a.addRange(Range(u8).new(58, 58));
359 | 
360 |     // Directly adjacent single-value ranges are merged into one stored range.
361 |     try std.testing.expectEqual(@as(usize, 1), a.ranges.items.len);
362 | }
363 | 


--------------------------------------------------------------------------------
/src/regex.zig:
--------------------------------------------------------------------------------
  1 | // External high-level Regex api.
  2 | //
  3 | // This hides details such as which matching engine is used internally, and merges the
  4 | // parsing and compilation stages into a single wrapper function.
  5 | 
  6 | const std = @import("std");
  7 | const Allocator = std.mem.Allocator;
  8 | const ArrayList = std.ArrayList;
  9 | const debug = std.debug;
 10 | 
 11 | const parse = @import("parse.zig");
 12 | const compile = @import("compile.zig");
 13 | const exec = @import("exec.zig");
 14 | 
 15 | const Parser = parse.Parser;
 16 | const Expr = parse.Expr;
 17 | const Compiler = compile.Compiler;
 18 | const Program = compile.Program;
 19 | const Instruction = compile.Instruction;
 20 | 
 21 | const InputBytes = @import("input.zig").InputBytes;
 22 | 
 23 | pub const Regex = struct {
 24 |     // Allocator handed to the matching engine on every search
 25 |     allocator: Allocator,
 26 |     // The program the compiler produced for this pattern
 27 |     compiled: Program,
 28 |     // Capture slots, passed to the matching engine on every search
 29 |     slots: ArrayList(?usize),
 30 |     // The pattern text exactly as given to compile (the slice is stored, not copied)
 31 |     string: []const u8,
 32 | 
 33 |     // Parse and then compile `re`, propagating the first error either stage reports.
 34 |     pub fn compile(a: Allocator, re: []const u8) !Regex {
 35 |         var parser = Parser.init(a);
 36 |         defer parser.deinit();
 37 |         const expr = try parser.parse(re);
 38 | 
 39 |         var compiler = Compiler.init(a);
 40 |         defer compiler.deinit();
 41 |         const program = try compiler.compile(expr);
 42 | 
 43 |         return .{
 44 |             .allocator = a,
 45 |             .compiled = program,
 46 |             .slots = ArrayList(?usize).init(a),
 47 |             .string = re,
 48 |         };
 49 |     }
 50 | 
 51 |     pub fn deinit(re: *Regex) void {
 52 |         re.slots.deinit();
 53 |         re.compiled.deinit();
 54 |     }
 55 | 
 56 |     // Anchored search, run from the program's `start` entry: does the regex match at the start of the string?
 57 |     pub fn match(re: *Regex, input_str: []const u8) !bool {
 58 |         var bytes = InputBytes.init(input_str);
 59 |         return exec.exec(re.allocator, re.compiled, re.compiled.start, &bytes.input, &re.slots);
 60 |     }
 61 | 
 62 |     // Unanchored search, run from the program's `find_start` entry: does the regex match anywhere?
 63 |     pub fn partialMatch(re: *Regex, input_str: []const u8) !bool {
 64 |         var bytes = InputBytes.init(input_str);
 65 |         return exec.exec(re.allocator, re.compiled, re.compiled.find_start, &bytes.input, &re.slots);
 66 |     }
 67 | 
 68 |     // Unanchored search that also reports where the regex and its capture groups matched.
 69 |     //
 70 |     // Returns null when there is no match. Capture zero is the entire match.
 71 |     pub fn captures(re: *Regex, input_str: []const u8) !?Captures {
 72 |         var bytes = InputBytes.init(input_str);
 73 |         const found = try exec.exec(re.allocator, re.compiled, re.compiled.find_start, &bytes.input, &re.slots);
 74 | 
 75 |         if (!found) {
 76 |             return null;
 77 |         }
 78 | 
 79 |         return try Captures.init(input_str, &re.slots);
 80 |     }
 81 | };
 82 | 
 83 | // A pair of bounds used to index into an associated slice (Captures.sliceAt slices input[lower..upper]).
 84 | pub const Span = struct {
 85 |     lower: usize, // index where the span starts
 86 |     upper: usize, // end index of the slice expression, i.e. one past the last element
 87 | };
 88 | 
 89 | // The capture spans produced by one successful Regex search over an input slice.
 90 | // Holds its own copy of the slots; release it with deinit.
 91 | pub const Captures = struct {
 92 |     const Self = @This();
 93 | 
 94 |     // The searched text; spans index into it.
 95 |     input: []const u8,
 96 |     // Allocator that owns `slots`.
 97 |     allocator: Allocator,
 98 |     // Two consecutive entries (lower, upper) per capture group.
 99 |     slots: []const ?usize,
100 | 
101 |     // Snapshot `slots` into a copy allocated from the list's own allocator.
102 |     pub fn init(input: []const u8, slots: *ArrayList(?usize)) !Captures {
103 |         const owner = slots.allocator;
104 |         return .{
105 |             .input = input,
106 |             .allocator = owner,
107 |             .slots = try owner.dupe(?usize, slots.items),
108 |         };
109 |     }
110 | 
111 |     // Release the slot copy made by init.
112 |     pub fn deinit(self: *Self) void {
113 |         self.allocator.free(self.slots);
114 |     }
115 | 
116 |     // Number of capture groups, counting group zero.
117 |     pub fn len(self: *const Self) usize {
118 |         return self.slots.len / 2;
119 |     }
120 | 
121 |     // The text matched by capture `n`, or null when boundsAt(n) reports no span.
122 |     pub fn sliceAt(self: *const Self, n: usize) ?[]const u8 {
123 |         const span = self.boundsAt(n) orelse return null;
124 |         return self.input[span.lower..span.upper];
125 |     }
126 | 
127 |     // Bounds of capture `n` within the input, or null when `n` is out of range
128 |     // or the group has no recorded start.
129 |     pub fn boundsAt(self: *const Self, n: usize) ?Span {
130 |         const base = 2 * n;
131 |         if (base >= self.slots.len) return null;
132 | 
133 |         const lower = self.slots[base] orelse return null;
134 |         // A recorded start is expected to come with a recorded end.
135 |         const upper = self.slots[base + 1].?;
136 |         return .{ .lower = lower, .upper = upper };
137 |     }
138 | };
139 | 
140 | 


--------------------------------------------------------------------------------
/src/regex_test.zig:
--------------------------------------------------------------------------------
  1 | const Regex = @import("regex.zig").Regex;
  2 | const debug = @import("std").debug;
  3 | const Parser = @import("parse.zig").Parser;
  4 | const re_debug = @import("debug.zig");
  5 | 
  6 | const std = @import("std");
  7 | const mem = std.mem;
  8 | 
  9 | fn check(re_input: []const u8, to_match: []const u8, expected: bool) void {
 10 |     var re = Regex.compile(std.testing.allocator, re_input) catch unreachable;
 11 |     defer re.deinit();
 12 | 
 13 |     if ((re.partialMatch(to_match) catch unreachable) == expected) return;
 14 | 
 15 |     debug.print(
 16 |         \\
 17 |         \\ -- Failure! ------------------
 18 |         \\
 19 |         \\Regex:    '{s}'
 20 |         \\String:   '{s}'
 21 |         \\Expected: {any}
 22 |         \\
 23 |     , .{
 24 |         re_input,
 25 |         to_match,
 26 |         expected,
 27 |     });
 28 | 
 29 |     // Parse the pattern again so the expression tree can be dumped next to the bytecode.
 30 |     var parser = Parser.init(std.testing.allocator);
 31 |     defer parser.deinit();
 32 |     const tree = parser.parse(re_input) catch unreachable;
 33 | 
 34 |     debug.print(
 35 |         \\
 36 |         \\ -- Expression Tree ------------
 37 |         \\
 38 |     , .{});
 39 |     re_debug.dumpExpr(tree.*);
 40 | 
 41 |     debug.print(
 42 |         \\
 43 |         \\ -- Bytecode -------------------
 44 |         \\
 45 |     , .{});
 46 |     re_debug.dumpProgram(re.compiled);
 47 | 
 48 |     debug.print(
 49 |         \\
 50 |         \\ -------------------------------
 51 |         \\
 52 |     , .{});
 53 | 
 54 |     @panic("assertion failure");
 55 | }
 56 | 
 57 | fn checkCompile(allocator: mem.Allocator, re_input: []const u8) !void {
 58 |     var compiled = try Regex.compile(allocator, re_input); // compile only; nothing is matched
 59 |     defer compiled.deinit();
 60 | }
 61 | 
 62 | test "regex sanity tests" { // check(regex, input, expected): expected is the required partialMatch result
 63 |     // Taken from tiny-regex-c
 64 |     check("\\d", "5", true);
 65 |     check("\\w+", "hej", true);
 66 |     check("\\s", "\t \n", true);
 67 |     check("\\S", "\t \n", false);
 68 |     check("[\\s]", "\t \n", true);
 69 |     check("[\\S]", "\t \n", false);
 70 |     check("\\D", "5", false);
 71 |     check("\\W+", "hej", false);
 72 |     check("[0-9]+", "12345", true);
 73 |     check("\\D", "hej", true);
 74 |     check("\\d", "hej", false);
 75 |     check("[^\\w]", "\\", true);
 76 |     check("[\\W]", "\\", true);
 77 |     check("[\\w]", "\\", false);
 78 |     check("[^\\d]", "d", true);
 79 |     check("[\\d]", "d", false);
 80 |     check("[^\\D]", "d", false);
 81 |     check("[\\D]", "d", true);
 82 |     check("^.*\\\\.*$", "c:\\Tools", true);
 83 |     check("^[\\+-]*[\\d]+$", "+27", true);
 84 |     check("[abc]", "1c2", true);
 85 |     check("[abc]", "1C2", false);
 86 |     check("[1-5]+", "0123456789", true);
 87 |     check("[.2]", "1C2", true);
 88 |     check("a*$", "Xaa", true);
 89 |     check("a*$", "Xaa", true); // NOTE(review): exact duplicate of the previous case
 90 |     check("[a-h]+", "abcdefghxxx", true);
 91 |     check("[a-h]+", "ABCDEFGH", false);
 92 |     check("[A-H]+", "ABCDEFGH", true);
 93 |     check("[A-H]+", "abcdefgh", false);
 94 |     check("[^\\s]+", "abc def", true);
 95 |     check("[^fc]+", "abc def", true);
 96 |     check("[^d\\sf]+", "abc def", true);
 97 |     check("\n", "abc\ndef", true);
 98 |     //check("b.\\s*\n", "aa\r\nbb\r\ncc\r\n\r\n", true);
 99 |     check(".*c", "abcabc", true);
100 |     check(".+c", "abcabc", true);
101 |     check("[b-z].*", "ab", true);
102 |     check("b[k-z]*", "ab", true);
103 |     check("[0-9]", "  - ", false);
104 |     check("[^0-9]", "  - ", true);
105 |     check("[Hh]ello [Ww]orld\\s*[!]?", "Hello world !", true);
106 |     check("[Hh]ello [Ww]orld\\s*[!]?", "hello world !", true);
107 |     check("[Hh]ello [Ww]orld\\s*[!]?", "Hello World !", true);
108 |     check("[Hh]ello [Ww]orld\\s*[!]?", "Hello world!   ", true);
109 |     check("[Hh]ello [Ww]orld\\s*[!]?", "Hello world  !", true);
110 |     check("[Hh]ello [Ww]orld\\s*[!]?", "hello World    !", true);
111 |     check("[^\\w][^-1-4]", ")T", true);
112 |     check("[^\\w][^-1-4]", ")^", true);
113 |     check("[^\\w][^-1-4]", "*)", true);
114 |     check("[^\\w][^-1-4]", "!.", true);
115 |     check("[^\\w][^-1-4]", " x", true);
116 |     check("[^\\w][^-1-4]", "$b", true);
117 |     check("a|b", "a", true); // alternation
118 |     check("a|b", "b", true);
119 |     check("a|b", "x", false);
120 |     check("[a-b]|[d-f]\\s+", "d ", true);
121 |     check("[a-b]|[d-f]\\s+", "b", true);
122 |     check("[a-b]|[d-f]\\s+", "c", false);
123 |     check("\\bx\\b", "x", true); // \b cases: 'x' must not touch another word character
124 |     check("\\bx\\b", " x ", true);
125 |     check("\\bx", "Ax", false);
126 |     check("x\\b", "xA", false);
127 |     check("\\Bx\\B", "x", false); // \B cases: the opposite of the \b cases above
128 |     check("\\Bx\\B", " x ", false);
129 |     check("\\Bx", "Ax", true);
130 |     check("x\\B", "xA", true);
131 | }
132 | 
133 | test "regex captures" {
134 |     var r = try Regex.compile(std.testing.allocator, "ab(\\d+)");
135 |     defer r.deinit();
136 | 
137 |     try std.testing.expect(try r.partialMatch("xxxxab0123a"));
138 | 
139 |     var caps = (try r.captures("xxxxab0123a")) orelse return error.TestUnexpectedResult;
140 |     defer caps.deinit();
141 |     // Group 0 is the whole match, group 1 the digits captured by (\d+).
142 |     try std.testing.expectEqualStrings("ab0123", caps.sliceAt(0).?);
143 |     try std.testing.expectEqualStrings("0123", caps.sliceAt(1).?);
144 | }
145 | 
146 | test "regex memory leaks" { // passes each pattern to checkCompile with std.testing.allocator
147 |     for ([_][]const u8{
148 |         "\\d",
149 |         "\\w+",
150 |         "\\s",
151 |         "\\S",
152 |         "[\\s]",
153 |         "[\\S]",
154 |         "\\D",
155 |         "\\W+",
156 |         "[0-9]+",
157 |         "[^\\w]",
158 |         "[\\W]",
159 |         "[\\w]",
160 |         "[^\\d]",
161 |         "[\\d]",
162 |         "[^\\D]",
163 |         "[\\D]",
164 |         "^.*\\\\.*$",
165 |         "^[\\+-]*[\\d]+$",
166 |         "[abc]",
167 |         "[1-5]+",
168 |         "[.2]",
169 |         "a*$",
170 |         "[a-h]+",
171 |         "[^\\s]+",
172 |         "[^fc]+",
173 |         "[^d\\sf]+",
174 |         "\n",
175 |         "b.\\s*\n",
176 |         ".*c",
177 |         ".+c",
178 |         "[b-z].*",
179 |         "b[k-z]*",
180 |         "[0-9]",
181 |         "[^0-9]",
182 |         "a?",
183 |         "[Hh]ello [Ww]orld\\s*[!]?",
184 |         "[^\\w][^-1-4]",
185 |         "[a-b]|[d-f]\\s+",
186 |         "x\\b",
187 |         "x\\B",
188 |         "[0-9]{2,}",
189 |         "[0-9]{2,3}",
190 |     }) |pattern| try checkCompile(std.testing.allocator, pattern);
191 | }
192 | 


--------------------------------------------------------------------------------
/src/vm_backtrack.zig:
--------------------------------------------------------------------------------
  1 | const std = @import("std");
  2 | const mem = std.mem;
  3 | const Allocator = std.mem.Allocator;
  4 | const ArrayList = std.ArrayList;
  5 | const debug = std.debug;
  6 | 
  7 | const parse = @import("parse.zig");
  8 | const compile = @import("compile.zig");
  9 | 
 10 | const Parser = parse.Parser;
 11 | const Expr = parse.Expr;
 12 | const Assertion = parse.Assertion;
 13 | const Compiler = compile.Compiler;
 14 | const Program = compile.Program;
 15 | const InstructionData = compile.InstructionData;
 16 | const Input = @import("input.zig").Input;
 17 | 
 18 | const SaveRestore = struct { // stack job that writes an earlier value back into a capture slot
 19 |     // index into the slots list to restore
 20 |     slot: usize,
 21 |     // value the slot held before it was overwritten
 22 |     last_pos: usize,
 23 | };
 24 | 
 25 | const Thread = struct { // a pending execution path: where to resume and with which input state
 26 |     // index of the next instruction to execute
 27 |     ip: usize,
 28 |     // copy of the input cursor taken when this thread was created
 29 |     input: Input,
 30 | };
 31 | 
 32 | const Job = union(enum) { // one entry on the backtracking stack
 33 |     Thread: Thread, // resume matching from a saved instruction/input pair
 34 |     SaveRestore: SaveRestore, // undo a capture-slot write
 35 | };
 36 | 
 37 | const ExecState = struct { // mutable state for one VmBacktrack.exec call
 38 |     const BitsetLen = 512; // number of words in `visited`
 39 |     const BitsetType = u32; // word type of `visited`
 40 | 
 41 |     // stack of pending jobs; the most recently pushed job is popped first
 42 |     jobs: ArrayList(Job),
 43 | 
 44 |     // visited-state cache; a fixed size suffices because shouldExec bounds when this engine runs
 45 |     visited: [BitsetLen]BitsetType,
 46 | 
 47 |     prog: *const Program, // program being executed
 48 | 
 49 |     slots: *ArrayList(?usize), // capture slots passed in by the caller, updated in place
 50 | };
 51 | 
 52 | // Bounded backtracking engine, only used for small regex/input pairs (see shouldExec). It is not
 53 | // quadratic since (instruction, position) states already seen are cached across threads.
 54 | pub const VmBacktrack = struct {
 55 |     const Self = @This();
 56 |     allocator: Allocator,
 57 | 
 58 |     pub fn init(allocator: Allocator) Self {
 59 |         return Self{ .allocator = allocator };
 60 |     }
 61 | 
 62 |     // True when every (instruction, position) state fits in the fixed-size visited cache.
 63 |     pub fn shouldExec(prog: Program, input: *const Input) bool {
 64 |         return (prog.insts.len + 1) * (input.bytes.len + 1) < ExecState.BitsetLen * @sizeOf(ExecState.BitsetType);
 65 |     }
 66 | 
 67 |     // Runs prog from prog_start over input, recording capture positions in slots. Returns true
 68 |     // as soon as a Match instruction is reached; allocation failures are propagated.
 69 |     pub fn exec(self: *Self, prog: Program, prog_start: usize, input: *Input, slots: *ArrayList(?usize)) !bool {
 70 |         // Should never run this without first checking shouldExec and running only if true.
 71 |         debug.assert(shouldExec(prog, input));
 72 | 
 73 |         var state = ExecState{
 74 |             .jobs = ArrayList(Job).init(self.allocator),
 75 |             // Sized from the ExecState declarations so the two can never disagree.
 76 |             .visited = [_]ExecState.BitsetType{0} ** ExecState.BitsetLen,
 77 |             .prog = &prog,
 78 |             .slots = slots,
 79 |         };
 80 |         defer state.jobs.deinit();
 81 | 
 82 |         try state.jobs.append(Job{ .Thread = Thread{ .ip = prog_start, .input = input.clone() } });
 83 | 
 84 |         while (state.jobs.pop()) |job| {
 85 |             switch (job) {
 86 |                 Job.Thread => |thread| {
 87 |                     if (try step(&state, &thread)) return true;
 88 |                 },
 89 |                 // Undo a capture write made by a thread that has since failed.
 90 |                 Job.SaveRestore => |save| {
 91 |                     if (save.slot < state.slots.items.len) {
 92 |                         state.slots.items[save.slot] = save.last_pos;
 93 |                     }
 94 |                 },
 95 |             }
 96 |         }
 97 | 
 98 |         return false;
 99 |     }
100 | 
101 |     // Runs one thread until it matches (true) or dies (false). Alternatives met on the way are
102 |     // pushed onto state.jobs for exec to try later.
103 |     fn step(state: *ExecState, thread: *const Thread) !bool {
104 |         // For linear actions, we can just modify the current thread and avoid pushing new items
105 |         // to the stack.
106 |         var input = thread.input;
107 |         var ip = thread.ip;
108 | 
109 |         while (true) {
110 |             const inst = state.prog.insts[ip];
111 |             const at = input.current();
112 | 
113 |             if (!shouldVisit(state, ip, input.byte_pos)) {
114 |                 return false;
115 |             }
116 | 
117 |             switch (inst.data) {
118 |                 InstructionData.Char => |ch| {
119 |                     if (at == null or at.? != ch) {
120 |                         return false;
121 |                     }
122 |                     input.advance();
123 |                 },
124 |                 InstructionData.EmptyMatch => |assertion| {
125 |                     if (!input.isEmptyMatch(assertion)) {
126 |                         return false;
127 |                     }
128 |                 },
129 |                 InstructionData.ByteClass => |class| {
130 |                     if (at == null or !class.contains(at.?)) {
131 |                         return false;
132 |                     }
133 |                     input.advance();
134 |                 },
135 |                 InstructionData.AnyCharNotNL => {
136 |                     if (at == null or at.? == '\n') {
137 |                         return false;
138 |                     }
139 |                     input.advance();
140 |                 },
141 |                 InstructionData.Save => |slot| {
142 |                     // Our capture array may not be long enough; pad it with null entries.
143 |                     if (state.slots.items.len <= slot) {
144 |                         try state.slots.appendNTimes(null, slot + 1 - state.slots.items.len);
145 |                     }
146 | 
147 |                     // We can save an existing match by creating a job which will run on this thread
148 |                     // failing. This will reset to the old match before any subsequent splits in
149 |                     // this thread.
150 |                     if (state.slots.items[slot]) |last_pos| {
151 |                         const job = Job{
152 |                             .SaveRestore = SaveRestore{
153 |                                 .slot = slot,
154 |                                 .last_pos = last_pos,
155 |                             },
156 |                         };
157 |                         try state.jobs.append(job);
158 |                     }
159 | 
160 |                     state.slots.items[slot] = input.byte_pos;
161 |                 },
162 |                 InstructionData.Match => {
163 |                     return true;
164 |                 },
165 |                 InstructionData.Jump => {
166 |                     // Jump at end of loop
167 |                 },
168 |                 InstructionData.Split => |split| {
169 |                     const t = Job{ .Thread = Thread{ .ip = split, .input = input.clone() } };
170 |                     try state.jobs.append(t);
171 |                 },
172 |             }
173 | 
174 |             ip = inst.out;
175 |         }
176 |     }
177 | 
178 |     // Marks state (ip, at) as visited. Returns true if it was new, false if already seen.
179 |     fn shouldVisit(state: *ExecState, ip: usize, at: usize) bool {
180 |         const BitsetType = ExecState.BitsetType;
181 |         const BitsetShiftType = std.math.Log2Int(BitsetType);
182 |         const size = @sizeOf(BitsetType);
183 |         // NOTE(review): size is in bytes, so each word holds only `size` bits; shouldExec agrees.
184 |         const n = at * (state.prog.insts.len + 1) + ip;
185 |         const bitmask = @as(BitsetType, 1) << @as(BitsetShiftType, @intCast(n & (size - 1)));
186 | 
187 |         if ((state.visited[n / size] & bitmask) != 0) {
188 |             return false;
189 |         }
190 | 
191 |         state.visited[n / size] |= bitmask;
192 |         return true;
193 |     }
194 | };
195 | 


--------------------------------------------------------------------------------
/src/vm_pike.zig:
--------------------------------------------------------------------------------
  1 | // PikeVM
  2 | //
  3 | // This is currently the default engine, except for small regexes, for which we use a caching
  4 | // backtracking engine, as this is faster in practice according to most other mature regex engines.
  5 | //
  6 | // This is a very simple version with no optimizations.
  7 | 
  8 | const std = @import("std");
  9 | const mem = std.mem;
 10 | const Allocator = std.mem.Allocator;
 11 | const ArenaAllocator = std.heap.ArenaAllocator;
 12 | const ArrayList = std.ArrayList;
 13 | 
 14 | const parse = @import("parse.zig");
 15 | const compile = @import("compile.zig");
 16 | 
 17 | const Parser = parse.Parser;
 18 | const Assertion = parse.Assertion;
 19 | const Program = compile.Program;
 20 | const InstructionData = compile.InstructionData;
 21 | const Input = @import("input.zig").Input;
 22 | 
 23 | const Thread = struct { // one pending execution path of the Pike VM
 24 |     pc: usize, // index of the next instruction to run
 25 |     // We know the maximum slot entry in advance. Therefore, we allocate the entire array up front
 26 |     // as this is easier (and probably quicker) than allocating only what we need in an ArrayList.
 27 |     slots: []?usize,
 28 | };
 29 | 
 30 | // Owns the arena that backs every slot array handed out by newSlot and cloneSlots.
 31 | const ExecState = struct {
 32 |     const Self = @This();
 33 | 
 34 |     arena: ArenaAllocator, // slot arrays are allocated here and released together by deinit
 35 |     slot_count: usize, // length of every slot array, taken from the program
 36 | 
 37 |     pub fn init(allocator: Allocator, program: Program) Self {
 38 |         return .{ .arena = ArenaAllocator.init(allocator), .slot_count = program.slot_count };
 39 |     }
 40 | 
 41 |     pub fn deinit(self: *Self) void {
 42 |         self.arena.deinit();
 43 |     }
 44 | 
 45 |     // Allocates a slot array of slot_count entries, all null.
 46 |     pub fn newSlot(self: *Self) ![]?usize {
 47 |         const fresh = try self.arena.allocator().alloc(?usize, self.slot_count);
 48 |         for (fresh) |*entry| entry.* = null;
 49 |         return fresh;
 50 |     }
 51 | 
 52 |     // Allocates a slot array of slot_count entries and copies `other` into it.
 53 |     pub fn cloneSlots(self: *Self, other: []?usize) ![]?usize {
 54 |         const copy = try self.arena.allocator().alloc(?usize, self.slot_count);
 55 |         for (copy, other) |*dst, src| dst.* = src;
 56 |         return copy;
 57 |     }
 58 | };
 59 | 
 60 | // Pike-style VM: all live threads are advanced over the input together, one position per pass.
 61 | pub const VmPike = struct {
 62 |     const Self = @This();
 63 | 
 64 |     allocator: Allocator,
 65 | 
 66 |     pub fn init(allocator: Allocator) Self {
 67 |         return .{ .allocator = allocator };
 68 |     }
 69 | 
 70 |     // Runs prog from prog_start over input. When some thread reached Match, slots is emptied and
 71 |     // refilled with the selected thread's captures and true is returned; otherwise returns false.
 72 |     pub fn exec(self: *Self, prog: Program, prog_start: usize, input: *Input, slots: *ArrayList(?usize)) !bool {
 73 |         // Threads to run at the current input position.
 74 |         var current = ArrayList(Thread).init(self.allocator);
 75 |         defer current.deinit();
 76 | 
 77 |         // Threads that consumed the current byte and resume at the next position.
 78 |         var pending = ArrayList(Thread).init(self.allocator);
 79 |         defer pending.deinit();
 80 | 
 81 |         var state = ExecState.init(self.allocator, prog);
 82 |         defer state.deinit();
 83 | 
 84 |         const first = Thread{ .pc = prog_start, .slots = try state.newSlot() };
 85 |         try current.append(first);
 86 | 
 87 |         var best: ?[]?usize = null;
 88 | 
 89 |         while (!input.isConsumed()) : (input.advance()) {
 90 |             while (current.pop()) |thread| {
 91 |                 const inst = prog.insts[thread.pc];
 92 |                 const at = input.current();
 93 |                 // This thread moved on to the instruction's `out` successor; it keeps sharing
 94 |                 // the same slot array.
 95 |                 const follow = Thread{ .pc = inst.out, .slots = thread.slots };
 96 | 
 97 |                 switch (inst.data) {
 98 |                     .Char => |ch| {
 99 |                         const hit = if (at) |c| c == ch else false;
100 |                         if (hit) try pending.append(follow);
101 |                     },
102 |                     .ByteClass => |class| {
103 |                         const hit = if (at) |c| class.contains(c) else false;
104 |                         if (hit) try pending.append(follow);
105 |                     },
106 |                     .AnyCharNotNL => {
107 |                         const hit = if (at) |c| c != '\n' else false;
108 |                         if (hit) try pending.append(follow);
109 |                     },
110 |                     .EmptyMatch => |assertion| {
111 |                         if (input.isEmptyMatch(assertion)) try current.append(follow);
112 |                     },
113 |                     .Jump => try current.append(follow),
114 |                     .Save => |slot| {
115 |                         // No deep copy is needed since we only ever advance forward, so the
116 |                         // capture stays valid for any subsequent thread sharing these slots.
117 |                         follow.slots[slot] = input.byte_pos;
118 |                         try current.append(follow);
119 |                     },
120 |                     .Split => |split| {
121 |                         // The branch is pushed first so that it runs after the current thread
122 |                         // (threads are popped from the end).
123 |                         const branch = Thread{ .pc = split, .slots = try state.cloneSlots(thread.slots) };
124 |                         try current.append(branch);
125 |                         try current.append(follow);
126 |                     },
127 |                     .Match => {
128 |                         // We always will have a complete capture in the 0, 1 index
129 |                         if (best) |last| {
130 |                             const start = thread.slots[0].?;
131 |                             const last_start = last[0].?;
132 |                             // leftmost
133 |                             if (start > last_start) continue;
134 |                             // longest
135 |                             if (thread.slots[1].? - start <= last[1].? - last_start) continue;
136 |                         }
137 | 
138 |                         best = try state.cloneSlots(thread.slots);
139 | 
140 |                         // TODO: Handle thread priority correctly so we can immediately finish all
141 |                         // threads still queued for this position (i.e. empty `current` here).
142 |                     },
143 |                 }
144 |             }
145 | 
146 |             mem.swap(ArrayList(Thread), &current, &pending);
147 |             pending.shrinkRetainingCapacity(0);
148 |         }
149 | 
150 |         const winner = best orelse return false;
151 |         slots.shrinkAndFree(0);
152 |         try slots.appendSlice(winner);
153 |         return true;
154 |     }
155 | };
189 | 


--------------------------------------------------------------------------------
/src/vm_test.zig:
--------------------------------------------------------------------------------
  1 | const exec = @import("exec.zig").exec;
  2 | const debug = @import("std").debug;
  3 | const Parser = @import("parse.zig").Parser;
  4 | const Regex = @import("regex.zig").Regex;
  5 | const InputBytes = @import("input.zig").InputBytes;
  6 | const re_debug = @import("debug.zig");
  7 | 
  8 | const std = @import("std");
  9 | const ArrayList = std.ArrayList;
 10 | const FixedBufferAllocator = std.heap.FixedBufferAllocator;
 11 | const mem = std.mem;
 12 | 
 13 | // vms to test
 14 | const VmBacktrack = @import("vm_backtrack.zig").VmBacktrack;
 15 | const VmPike = @import("vm_pike.zig").VmPike;
 16 | 
 17 | // Fixed buffer used by check(); the debug global allocator is too small for our tests
 18 | var buffer: [800000]u8 = undefined;
 19 | var fixed_allocator = FixedBufferAllocator.init(buffer[0..]);
 20 | 
 21 | // Element-wise equality for slices of optionals.
 22 | //
 23 | // Two slices are equal when they have the same length and, at every index, either both
 24 | // entries are null or both hold the same value.
 25 | fn nullableEql(comptime T: type, a: []const ?T, b: []const ?T) bool {
 26 |     if (a.len != b.len) return false;
 27 | 
 28 |     // Lengths are equal here, which the paired iteration below requires.
 29 |     for (a, b) |lhs, rhs| {
 30 |         if (lhs) |x| {
 31 |             // left is set: right must be set to the same value
 32 |             const y = rhs orelse return false;
 33 |             if (x != y) return false;
 34 |         } else if (rhs != null) {
 35 |             // left is null but right is not
 36 |             return false;
 37 |         }
 38 |     }
 39 | 
 40 |     return true;
 41 | }
 42 | 
 43 | // Runs both engines on the same regex and input and panics, after dumping diagnostics, when
 44 | // their results differ. `expected` only decides whether the capture slots are compared too.
 45 | fn check(re_input: []const u8, to_match: []const u8, expected: bool) void {
 46 |     // The regex and slot lists below are never deinitialised, so rewind the shared fixed
 47 |     // buffer on exit; otherwise every call would permanently use up part of it.
 48 |     defer fixed_allocator.reset();
 49 |     const re = Regex.compile(fixed_allocator.allocator(), re_input) catch unreachable;
 50 | 
 51 |     // This is just an engine comparison test but we should also test against fixed vectors
 52 |     var backtrack = VmBacktrack.init(re.allocator);
 53 |     var backtrack_slots = ArrayList(?usize).init(re.allocator);
 54 |     var pike = VmPike.init(re.allocator);
 55 |     var pike_slots = ArrayList(?usize).init(re.allocator);
 56 | 
 57 |     var input1 = InputBytes.init(to_match).input;
 58 |     const pike_result = pike.exec(re.compiled, re.compiled.find_start, &input1, &pike_slots) catch unreachable;
 59 |     var input2 = InputBytes.init(to_match).input;
 60 |     const backtrack_result = backtrack.exec(re.compiled, re.compiled.find_start, &input2, &backtrack_slots) catch unreachable;
 61 |     const slots_equal = nullableEql(usize, pike_slots.items, backtrack_slots.items);
 62 | 
 63 |     // Note: slot entries are invalid on non-match
 64 |     if (pike_result != backtrack_result or (expected == true and !slots_equal)) {
 65 |         debug.print(
 66 |             \\
 67 |             \\ -- Failure! ----------------
 68 |             \\
 69 |             \\
 70 |             \\pikevm:    {any}
 71 |             \\backtrack: {any}
 72 |             \\
 73 |         , .{ pike_result, backtrack_result });
 74 |         debug.print(
 75 |             \\
 76 |             \\ -- Slots -------------------
 77 |             \\
 78 |             \\pikevm
 79 |             \\
 80 |         , .{});
 81 |         for (pike_slots.items) |entry| {
 82 |             debug.print("{?d} ", .{entry});
 83 |         }
 84 |         debug.print("\n", .{});
 85 |         debug.print(
 86 |             \\
 87 |             \\
 88 |             \\backtrack
 89 |             \\
 90 |         , .{});
 91 |         for (backtrack_slots.items) |entry| {
 92 |             debug.print("{?d} ", .{entry});
 93 |         }
 94 |         debug.print("\n", .{});
 95 |         debug.print(
 96 |             \\
 97 |             \\ -- Regex ------------------
 98 |             \\
 99 |             \\Regex:    '{s}'
100 |             \\String:   '{s}'
101 |             \\Expected: {any}
102 |             \\
103 |         , .{ re_input, to_match, expected });
104 | 
105 |         // Dump expression tree and bytecode
106 |         var p = Parser.init(std.testing.allocator);
107 |         defer p.deinit();
108 |         const expr = p.parse(re_input) catch unreachable;
109 | 
110 |         debug.print(
111 |             \\
112 |             \\ -- Expression Tree ------------
113 |             \\
114 |         , .{});
115 |         re_debug.dumpExpr(expr.*);
116 | 
117 |         debug.print(
118 |             \\
119 |             \\ -- Bytecode -------------------
120 |             \\
121 |         , .{});
122 |         re_debug.dumpProgram(re.compiled);
123 | 
124 |         debug.print(
125 |             \\
126 |             \\ -------------------------------
127 |             \\
128 |         , .{});
129 | 
130 |         @panic("assertion failure");
131 |     }
132 | }
133 | 
134 | test "pikevm == backtrackvm" {
135 |     // Taken from tiny-regex-c. Each row is (regex, input, expected) and is handed to check().
136 |     const Case = struct { []const u8, []const u8, bool };
137 |     for ([_]Case{
138 |         .{ "\\d", "5", true },
139 |         .{ "\\w+", "hej", true },
140 |         .{ "\\s", "\t \n", true },
141 |         .{ "\\S", "\t \n", false },
142 |         .{ "[\\s]", "\t \n", true },
143 |         .{ "[\\S]", "\t \n", false },
144 |         .{ "\\D", "5", false },
145 |         .{ "\\W+", "hej", false },
146 |         .{ "[0-9]+", "12345", true },
147 |         .{ "\\D", "hej", true },
148 |         .{ "\\d", "hej", false },
149 |         .{ "[^\\w]", "\\", true },
150 |         .{ "[\\W]", "\\", true },
151 |         .{ "[\\w]", "\\", false },
152 |         .{ "[^\\d]", "d", true },
153 |         .{ "[\\d]", "d", false },
154 |         .{ "[^\\D]", "d", false },
155 |         .{ "[\\D]", "d", true },
156 |         .{ "^.*\\\\.*$", "c:\\Tools", true },
157 |         .{ "^[\\+-]*[\\d]+$", "+27", true },
158 |         .{ "[abc]", "1c2", true },
159 |         .{ "[abc]", "1C2", false },
160 |         .{ "[1-5]+", "0123456789", true },
161 |         .{ "[.2]", "1C2", true },
162 |         .{ "a*$", "Xaa", true },
163 |         .{ "a*$", "Xaa", true },
164 |         .{ "[a-h]+", "abcdefghxxx", true },
165 |         .{ "[a-h]+", "ABCDEFGH", false },
166 |         .{ "[A-H]+", "ABCDEFGH", true },
167 |         .{ "[A-H]+", "abcdefgh", false },
168 |         .{ "[^\\s]+", "abc def", true },
169 |         .{ "[^fc]+", "abc def", true },
170 |         .{ "[^d\\sf]+", "abc def", true },
171 |         .{ "\n", "abc\ndef", true },
172 |         //.{ "b.\\s*\n", "aa\r\nbb\r\ncc\r\n\r\n", true },
173 |         .{ ".*c", "abcabc", true },
174 |         .{ ".+c", "abcabc", true },
175 |         .{ "[b-z].*", "ab", true },
176 |         .{ "b[k-z]*", "ab", true },
177 |         .{ "[0-9]", "  - ", false },
178 |         .{ "[^0-9]", "  - ", true },
179 |         .{ "[Hh]ello [Ww]orld\\s*[!]?", "Hello world !", true },
180 |         .{ "[Hh]ello [Ww]orld\\s*[!]?", "hello world !", true },
181 |         .{ "[Hh]ello [Ww]orld\\s*[!]?", "Hello World !", true },
182 |         .{ "[Hh]ello [Ww]orld\\s*[!]?", "Hello world!   ", true },
183 |         .{ "[Hh]ello [Ww]orld\\s*[!]?", "Hello world  !", true },
184 |         .{ "[Hh]ello [Ww]orld\\s*[!]?", "hello World    !", true },
185 |         .{ "[^\\w][^-1-4]", ")T", true },
186 |         .{ "[^\\w][^-1-4]", ")^", true },
187 |         .{ "[^\\w][^-1-4]", "*)", true },
188 |         .{ "[^\\w][^-1-4]", "!.", true },
189 |         .{ "[^\\w][^-1-4]", " x", true },
190 |         .{ "[^\\w][^-1-4]", "$b", true },
191 |         .{ "a{3,}", "aaa", true },
192 |         .{ ".*emacs.*", "emacs-packages.nix", true },
193 |         .{ "[a-b]|[d-f]\\s+", "d ", true },
194 |         .{ "[a-b]|[d-f]\\s+", "b", true },
195 |         .{ "[a-b]|[d-f]\\s+", "c", false },
196 |         .{ "\\bx\\b", "x", true },
197 |         .{ "\\bx\\b", " x ", true },
198 |         .{ "\\bx", "Ax", false },
199 |         .{ "x\\b", "xA", false },
200 |         .{ "\\Bx\\B", "x", false },
201 |         .{ "\\Bx\\B", " x ", false },
202 |         .{ "\\Bx", "Ax", true },
203 |         .{ "x\\B", "xA", true },
204 |     }) |case| check(case[0], case[1], case[2]);
205 | }
203 | 


--------------------------------------------------------------------------------