├── .github └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── build.zig ├── build.zig.zon ├── example └── example.c ├── include └── regex.h └── src ├── all_test.zig ├── c_regex.zig ├── compile.zig ├── debug.zig ├── exec.zig ├── input.zig ├── parse.zig ├── parse_test.zig ├── range_set.zig ├── regex.zig ├── regex_test.zig ├── vm_backtrack.zig ├── vm_pike.zig └── vm_test.zig /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: ['*'] 8 | 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | 14 | lint: 15 | name: Lint 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v4 20 | 21 | - uses: goto-bus-stop/setup-zig@v2 22 | with: 23 | version: 0.14.0 # most recent stable 24 | 25 | - name: Check formatting 26 | run: zig fmt --check . 27 | 28 | test: 29 | name: Test / Zig ${{ matrix.zig-version }} 30 | runs-on: ubuntu-latest 31 | continue-on-error: ${{ matrix.allow-fail }} 32 | 33 | strategy: 34 | matrix: 35 | zig-version: ['0.14.0'] 36 | os: [ubuntu-latest] 37 | allow-fail: [false] 38 | include: 39 | # Test against Zig master but don't break from it. 40 | # master is a constantly moving target, 41 | # so we'll fix issues on a best-effort basis. 
42 | - zig-version: master 43 | os: ubuntu-latest 44 | allow-fail: true 45 | 46 | steps: 47 | - name: Checkout code 48 | uses: actions/checkout@v4 49 | 50 | - name: Set up Zig 51 | uses: goto-bus-stop/setup-zig@v2 52 | with: 53 | version: ${{ matrix.zig-version }} 54 | 55 | - name: Run tests 56 | run: zig build test 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | zig-cache 2 | zig-out 3 | .zig-* 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Marc Tiehuis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | An automaton-based regex implementation for [zig](http://ziglang.org/). 2 | 3 | Note: This is still a work in progress and many things still need to be done. 4 | 5 | - [x] Capture group support 6 | - [ ] UTF-8 support 7 | - [ ] More tests (plus some automated tests/fuzzing) 8 | - [x] Add a PikeVM implementation 9 | - [ ] Literal optimizations and just general performance improvements. 10 | 11 | ## Usage 12 | 13 | ```zig 14 | const std = @import("std"); 15 | const Regex = @import("regex").Regex; 16 | 17 | test "example" { 18 | var re = try Regex.compile(std.testing.allocator, "\\w+"); 19 | defer re.deinit(); 20 | try std.testing.expect(try re.match("hej")); 21 | } 22 | ``` 23 | 24 | ## Api 25 | 26 | ### Regex 27 | 28 | ```zig 29 | fn compile(a: Allocator, re: []const u8) !Regex 30 | ``` 31 | 32 | Compiles a regex string, returning any errors during parsing/compiling. 33 | 34 | --- 35 | 36 | ```zig 37 | pub fn match(re: *Regex, input: []const u8) !bool 38 | ``` 39 | 40 | Match a compiled regex against some input. The input must be matched in its 41 | entirety and from the first index. 42 | 43 | --- 44 | 45 | ```zig 46 | pub fn partialMatch(re: *Regex, input: []const u8) !bool 47 | ``` 48 | 49 | Match a compiled regex against some input. Unlike `match`, this matches the 50 | leftmost and does not have to be anchored to the start of `input`. 51 | 52 | --- 53 | 54 | ```zig 55 | pub fn captures(re: *Regex, input: []const u8) !?Captures 56 | ``` 57 | 58 | Match a compiled regex against some input. Returns a list of all matching 59 | slices in the regex with the first (0-index) being the entire regex. 60 | 61 | If no match was found, null is returned. 
62 | 63 | ### Captures 64 | 65 | ```zig 66 | pub fn sliceAt(captures: *const Captures, n: usize) ?[]const u8 67 | ``` 68 | 69 | Return the sub-slice for the numbered capture group. 0 refers to the entire 70 | match. 71 | 72 | ```zig 73 | pub fn boundsAt(captures: *const Captures, n: usize) ?Span 74 | ``` 75 | 76 | Return the lower and upper byte positions for the specified capture group. 77 | 78 | We can retrieve the sub-slice using this function: 79 | 80 | ```zig 81 | const span = caps.boundsAt(0).?; 82 | debug.assert(mem.eql(u8, caps.sliceAt(0).?, input[span.lower..span.upper])); 83 | ``` 84 | 85 | --- 86 | 87 | ## References 88 | 89 | See the following useful sources: 90 | - https://swtch.com/~rsc/regexp/ 91 | - [Rust Regex Library](https://github.com/rust-lang/regex) 92 | - [Go Regex Library](https://github.com/golang/go/tree/master/src/regexp) 93 | -------------------------------------------------------------------------------- /build.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | 3 | pub fn build(b: *std.Build) void { 4 | const target = b.standardTargetOptions(.{}); 5 | const optimize = b.standardOptimizeOption(.{}); 6 | 7 | if (@hasDecl(std.Build, "CreateModuleOptions")) { 8 | // Zig 0.11 9 | _ = b.addModule("regex", .{ 10 | .source_file = .{ .path = "src/regex.zig" }, 11 | }); 12 | } else { 13 | // Zig 0.12-dev.2159 14 | _ = b.addModule("regex", .{ 15 | .root_source_file = path(b, "src/regex.zig"), 16 | }); 17 | } 18 | 19 | // library tests 20 | const library_tests = b.addTest(.{ 21 | .root_source_file = path(b, "src/all_test.zig"), 22 | .target = target, 23 | .optimize = optimize, 24 | }); 25 | const run_library_tests = b.addRunArtifact(library_tests); 26 | 27 | const test_step = b.step("test", "Run all tests"); 28 | test_step.dependOn(&run_library_tests.step); 29 | 30 | // C library 31 | const staticLib = b.addStaticLibrary(.{ 32 | .name = "regex", 33 | .root_source_file = path(b, 
"src/c_regex.zig"), 34 | .target = target, 35 | .optimize = optimize, 36 | }); 37 | staticLib.linkLibC(); 38 | 39 | b.installArtifact(staticLib); 40 | 41 | const sharedLib = b.addSharedLibrary(.{ 42 | .name = "regex", 43 | .root_source_file = path(b, "src/c_regex.zig"), 44 | .target = target, 45 | .optimize = optimize, 46 | }); 47 | sharedLib.linkLibC(); 48 | 49 | b.installArtifact(sharedLib); 50 | 51 | // C example 52 | const c_example = b.addExecutable(.{ 53 | .name = "example", 54 | .target = target, 55 | .optimize = optimize, 56 | }); 57 | c_example.addCSourceFile(.{ 58 | .file = path(b, "example/example.c"), 59 | .flags = &.{"-Wall"}, 60 | }); 61 | c_example.addIncludePath(path(b, "include")); 62 | c_example.linkLibC(); 63 | c_example.linkLibrary(staticLib); 64 | 65 | const c_example_step = b.step("c-example", "Example using C API"); 66 | c_example_step.dependOn(&staticLib.step); 67 | c_example_step.dependOn(&c_example.step); 68 | 69 | b.default_step.dependOn(test_step); 70 | } 71 | 72 | fn path(b: *std.Build, sub_path: []const u8) std.Build.LazyPath { 73 | if (@hasDecl(std.Build, "path")) { 74 | // Zig 0.13-dev.267 75 | return b.path(sub_path); 76 | } else { 77 | return .{ .path = sub_path }; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /build.zig.zon: -------------------------------------------------------------------------------- 1 | .{ 2 | .name = .regex, 3 | .version = "0.1.2", 4 | .minimum_zig_version = "0.14.0", 5 | .paths = .{ 6 | "src", 7 | "build.zig", 8 | "build.zig.zon", 9 | "LICENSE", 10 | "README.md", 11 | }, 12 | .fingerprint = 0x4204f8cae7b7106b, 13 | } 14 | -------------------------------------------------------------------------------- /example/example.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include "regex.h" 3 | 4 | int main() { 5 | zre_regex *re = zre_compile(".*world.*"); 6 | if (!re) { 7 | printf("Regex compile error\n"); 8 
| return 1; 9 | } 10 | 11 | if (zre_match(re, "Hello world!")) 12 | printf("Match!\n"); 13 | 14 | zre_deinit(re); 15 | } 16 | -------------------------------------------------------------------------------- /include/regex.h: -------------------------------------------------------------------------------- 1 | #ifndef __ZRE_H__ 2 | #define __ZRE_H__ 3 | 4 | #include <stdbool.h> 5 | #include <stddef.h> 6 | 7 | typedef struct zre_regex zre_regex; 8 | typedef struct zre_captures zre_captures; 9 | 10 | typedef struct zre_captures_span { 11 | size_t lower; 12 | size_t upper; 13 | } zre_captures_span; 14 | 15 | extern zre_regex* zre_compile(const char* input); 16 | 17 | extern bool zre_match(zre_regex* re, const char* input); 18 | 19 | extern bool zre_partial_match(zre_regex* re, const char* input); 20 | 21 | extern void zre_deinit(zre_regex* re); 22 | 23 | extern zre_captures* zre_captures_all(zre_regex* re, const char* input); 24 | 25 | extern size_t zre_captures_len(const zre_captures* cap); 26 | 27 | extern const char* zre_captures_slice_at(const zre_captures* cap, size_t n, size_t* len); 28 | 29 | extern bool zre_captures_bounds_at(const zre_captures* cap, zre_captures_span* sp, size_t n); 30 | 31 | extern void zre_captures_deinit(zre_captures* cap); 32 | 33 | #endif // __ZRE_H__ 34 | -------------------------------------------------------------------------------- /src/all_test.zig: -------------------------------------------------------------------------------- 1 | test "all" { 2 | _ = @import("range_set.zig"); 3 | _ = @import("parse_test.zig"); 4 | _ = @import("vm_test.zig"); 5 | _ = @import("regex_test.zig"); 6 | } 7 | -------------------------------------------------------------------------------- /src/c_regex.zig: -------------------------------------------------------------------------------- 1 | //! 
C API for the zig-regex library 2 | 3 | const std = @import("std"); 4 | 5 | const regex = @import("regex.zig"); 6 | const Regex = regex.Regex; 7 | const Captures = regex.Captures; 8 | 9 | const zre_regex = opaque {}; 10 | const zre_captures = opaque {}; 11 | 12 | const zre_captures_span = extern struct { 13 | lower: usize, 14 | upper: usize, 15 | }; 16 | 17 | var allocator = std.heap.c_allocator; 18 | 19 | /// Compile `input` into a heap-allocated regex; free it with `zre_deinit`. 20 | /// Returns null for a null pattern, on allocation failure or on a compile error. 21 | export fn zre_compile(input: ?[*:0]const u8) ?*zre_regex { 22 | const pattern = input orelse return null; 23 | const r = allocator.create(Regex) catch return null; 24 | r.* = Regex.compile(allocator, std.mem.span(pattern)) catch { 25 | // Release the wrapper so a failed compile does not leak it. 26 | allocator.destroy(r); 27 | return null; 28 | }; 29 | return @ptrCast(r); 30 | } 31 | 32 | /// Run `Regex.match` on `input`. Returns false for null arguments or on error. 33 | export fn zre_match(re: ?*zre_regex, input: ?[*:0]const u8) bool { 34 | const r: *Regex = @ptrCast(@alignCast(re orelse return false)); 35 | return r.match(std.mem.span(input orelse return false)) catch return false; 36 | } 37 | 38 | /// Run `Regex.partialMatch` on `input`. Returns false for null arguments or on error. 39 | export fn zre_partial_match(re: ?*zre_regex, input: ?[*:0]const u8) bool { 40 | const r: *Regex = @ptrCast(@alignCast(re orelse return false)); 41 | return r.partialMatch(std.mem.span(input orelse return false)) catch return false; 42 | } 43 | 44 | /// Release a regex returned by `zre_compile`. A null `re` is ignored. 45 | export fn zre_deinit(re: ?*zre_regex) void { 46 | const r: *Regex = @ptrCast(@alignCast(re orelse return)); 47 | r.deinit(); 48 | allocator.destroy(r); 49 | } 50 | 51 | /// Collect the capture groups for `input`; free the result with `zre_captures_deinit`. 52 | /// Returns null for null arguments, when nothing matched, or on error. 53 | export fn zre_captures_all(re: ?*zre_regex, input: ?[*:0]const u8) ?*zre_captures { 54 | const r: *Regex = @ptrCast(@alignCast(re orelse return null)); 55 | const text = std.mem.span(input orelse return null); 56 | const c = allocator.create(Captures) catch return null; 57 | c.* = (r.captures(text) catch null) orelse { 58 | // No match or an error: release the wrapper instead of leaking it. 59 | allocator.destroy(c); 60 | return null; 61 | }; 62 | return @ptrCast(c); 63 | } 64 | 65 | /// Number of capture groups (half the slot count). Returns 0 for a null `cap`. 66 | export fn zre_captures_len(cap: ?*const zre_captures) usize { 67 | const c: *const Captures = @ptrCast(@alignCast(cap orelse return 0)); 68 | return c.slots.len / 2; 69 | } 70 | 71 | /// Pointer to the text of group `n`, with its length stored in `len` when non-null. 72 | /// Returns null when `cap` is null or `sliceAt(n)` yields nothing. 73 | export fn zre_captures_slice_at(cap: ?*const zre_captures, n: usize, len: ?*usize) ?[*]const u8 { 74 | const c: *const Captures = @ptrCast(@alignCast(cap orelse return null)); 75 | const slice = c.sliceAt(n) orelse return null; 76 | if (len) |ln| { 77 | ln.* = slice.len; 78 | } 79 | return slice.ptr; 80 | } 81 | 82 | /// Store the bounds of group `n` in `sp`. Returns false when `cap` or `sp` is 83 | /// null or `boundsAt(n)` yields nothing; `sp` is left untouched in that case. 84 | export fn zre_captures_bounds_at(cap: ?*const zre_captures, sp: ?*zre_captures_span, n: usize) bool { 85 | const c: *const Captures = @ptrCast(@alignCast(cap orelse return false)); 86 | const out = sp orelse return false; 87 | const s = c.boundsAt(n) orelse return false; 88 | out.lower = s.lower; 89 | out.upper = s.upper; 90 | return true; 91 | } 92 | 93 | /// Release captures returned by `zre_captures_all`. A null `cap` is ignored. 94 | export fn zre_captures_deinit(cap: ?*zre_captures) void { 95 | const c: *Captures = @ptrCast(@alignCast(cap orelse return)); 96 | c.deinit(); 97 | allocator.destroy(c); 98 | } 
99 | -------------------------------------------------------------------------------- /src/compile.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const mem = std.mem; 3 | const Allocator = std.mem.Allocator; 4 | const ArrayList = std.ArrayList; 5 | const debug = std.debug; 6 | 7 | const parser = @import("parse.zig"); 8 | const Parser = parser.Parser; 9 | const ByteClass = parser.ByteClass; 10 | const Expr = parser.Expr; 11 | const Assertion = parser.Assertion; 12 | 13 | pub const InstructionData = union(enum) { 14 | // Match the specified character. 15 | Char: u8, 16 | // Match the specified character ranges. 17 | ByteClass: ByteClass, 18 | // Matches the AnyChar special cases 19 | AnyCharNotNL, 20 | // Empty match (\w assertion) 21 | EmptyMatch: Assertion, 22 | // Stop the thread, found a match 23 | Match, 24 | // Jump to the instruction at address x 25 | Jump, 26 | // Split execution, spawning a new thread and continuing in lockstep 27 | Split: usize, 28 | // Slot to save position in 29 | Save: usize, 30 | }; 31 | 32 | // Represents instructions for the VM. 
33 | pub const Instruction = struct { 34 | // Next instruction to execute 35 | out: usize, 36 | // Associated data with this 37 | data: InstructionData, 38 | 39 | pub fn new(out: usize, data: InstructionData) Instruction { 40 | return Instruction{ 41 | .out = out, 42 | .data = data, 43 | }; 44 | } 45 | }; 46 | 47 | // Represents an instruction with unpatched holes. 48 | const InstHole = union(enum) { 49 | // Match with an unfilled output 50 | Char: u8, 51 | // Match a character class range 52 | ByteClass: ByteClass, 53 | // Empty Match assertion 54 | EmptyMatch: Assertion, 55 | // Match any character 56 | AnyCharNotNL, 57 | // Split with no unfilled branch 58 | Split, 59 | // Split with a filled first branch 60 | Split1: usize, 61 | // Split with a filled second branch 62 | Split2: usize, 63 | // Save capture 64 | Save: usize, 65 | }; 66 | 67 | // Represents a partial instruction. During compilation the instructions will be a mix of compiled 68 | // and un-compiled. All instructions must be in the compiled state when we finish processing. 69 | const PartialInst = union(enum) { 70 | // A completely compiled instruction 71 | Compiled: Instruction, 72 | 73 | // A partially compiled instruction, the back-links are not yet filled 74 | Uncompiled: InstHole, 75 | 76 | // Modify the current instruction to point to the specified instruction. 77 | pub fn fill(s: *PartialInst, i: usize) void { 78 | switch (s.*) { 79 | PartialInst.Uncompiled => |ih| { 80 | // Generate the corresponding compiled instruction. All simply goto the specified 81 | // instruction, except for the dual split case, in which both outgoing pointers 82 | // go to the same place. 
83 | const compiled = switch (ih) { 84 | InstHole.Char => |ch| Instruction.new(i, InstructionData{ .Char = ch }), 85 | 86 | InstHole.EmptyMatch => |assertion| Instruction.new(i, InstructionData{ .EmptyMatch = assertion }), 87 | 88 | InstHole.AnyCharNotNL => Instruction.new(i, InstructionData.AnyCharNotNL), 89 | 90 | InstHole.ByteClass => |class| Instruction.new(i, InstructionData{ .ByteClass = class }), 91 | 92 | InstHole.Split => 93 | // If we both point to the same output, we can encode as a jump 94 | Instruction.new(i, InstructionData.Jump), 95 | 96 | // 1st was already filled 97 | InstHole.Split1 => |split| Instruction.new(split, InstructionData{ .Split = i }), 98 | 99 | // 2nd was already filled 100 | InstHole.Split2 => |split| Instruction.new(i, InstructionData{ .Split = split }), 101 | 102 | InstHole.Save => |slot| Instruction.new(i, InstructionData{ .Save = slot }), 103 | }; 104 | 105 | s.* = PartialInst{ .Compiled = compiled }; 106 | }, 107 | PartialInst.Compiled => { 108 | // nothing to do, already filled 109 | }, 110 | } 111 | } 112 | }; 113 | 114 | // A program represents the compiled bytecode of an NFA. 
115 | pub const Program = struct { 116 | // Sequence of instructions representing an NFA 117 | insts: []Instruction, 118 | // Start instruction 119 | start: usize, 120 | // Find Start instruction 121 | find_start: usize, 122 | // Max number of slots required 123 | slot_count: usize, 124 | // Allocator which owns the instructions 125 | allocator: Allocator, 126 | 127 | pub fn init(allocator: Allocator, a: []Instruction, find_start: usize, slot_count: usize) Program { 128 | return Program{ 129 | .allocator = allocator, 130 | .insts = a, 131 | .start = 0, 132 | .find_start = find_start, 133 | .slot_count = slot_count, 134 | }; 135 | } 136 | 137 | pub fn deinit(p: *Program) void { 138 | for (p.insts) |*inst| { 139 | switch (inst.data) { 140 | .ByteClass => |*bc| { 141 | bc.deinit(); 142 | }, 143 | else => {}, 144 | } 145 | } 146 | p.allocator.free(p.insts); 147 | } 148 | }; 149 | 150 | // A Hole represents the outgoing node of a partially compiled Fragment. 151 | // 152 | // If None, the Hole needs to be back-patched as we do not yet know which instruction this 153 | // points to. 154 | const Hole = union(enum) { 155 | None, 156 | One: usize, 157 | Many: ArrayList(Hole), 158 | }; 159 | 160 | // A patch represents an unpatched output for a contiguous sequence of instructions. 161 | const Patch = struct { 162 | // The address of the first instruction 163 | entry: usize, 164 | // The output hole of this instruction (to be filled to an actual address/es) 165 | hole: Hole, 166 | }; 167 | 168 | // A Compiler compiles a regex expression into a bytecode representation of the NFA. 
169 | pub const Compiler = struct { 170 | // Stores all partial instructions 171 | insts: ArrayList(PartialInst), 172 | allocator: Allocator, 173 | // Capture state 174 | capture_index: usize, 175 | 176 | pub fn init(a: Allocator) Compiler { 177 | return Compiler{ 178 | .insts = ArrayList(PartialInst).init(a), 179 | .allocator = a, 180 | .capture_index = 0, 181 | }; 182 | } 183 | 184 | pub fn deinit(c: *Compiler) void { 185 | c.insts.deinit(); 186 | } 187 | 188 | fn nextCaptureIndex(c: *Compiler) usize { 189 | const s = c.capture_index; 190 | c.capture_index += 2; 191 | return s; 192 | } 193 | 194 | // Compile the regex expression 195 | pub fn compile(c: *Compiler, expr: *const Expr) !Program { 196 | // surround in a full program match 197 | const entry = c.insts.items.len; 198 | const index = c.nextCaptureIndex(); 199 | try c.pushCompiled(Instruction.new(entry + 1, InstructionData{ .Save = index })); 200 | 201 | // compile the main expression 202 | const patch = try c.compileInternal(expr); 203 | 204 | // not iterating over an empty correctly in backtrack 205 | c.fillToNext(patch.hole); 206 | const h = try c.pushHole(InstHole{ .Save = index + 1 }); 207 | 208 | // fill any holes to end at the next instruction which will be a match 209 | c.fillToNext(h); 210 | try c.pushCompiled(Instruction.new(0, InstructionData.Match)); 211 | 212 | var p = ArrayList(Instruction).init(c.allocator); 213 | defer p.deinit(); 214 | 215 | for (c.insts.items) |e| { 216 | switch (e) { 217 | PartialInst.Compiled => |x| { 218 | try p.append(x); 219 | }, 220 | else => |_| { 221 | @panic("uncompiled instruction encountered during compilation"); 222 | }, 223 | } 224 | } 225 | 226 | // To facilitate fast finding (matching non-anchored to the start) we simply append a 227 | // .*? to the start of our instructions. We push the fragment with this set of instructions 228 | // at the end of the compiled set. 
We perform an anchored search by entering normally and 229 | // a non-anchored by jumping to this patch before starting. 230 | // 231 | // 1: compiled instructions 232 | // 2: match 233 | // ... # We add the following 234 | // 3: split 1, 4 235 | // 4: any 3 236 | const fragment_start = c.insts.items.len; 237 | const fragment = [_]Instruction{ 238 | Instruction.new(0, InstructionData{ .Split = fragment_start + 1 }), 239 | Instruction.new(fragment_start, InstructionData.AnyCharNotNL), 240 | }; 241 | try p.appendSlice(&fragment); 242 | 243 | return Program.init(p.allocator, try p.toOwnedSlice(), fragment_start, c.capture_index); 244 | } 245 | 246 | fn compileInternal(c: *Compiler, expr: *const Expr) Allocator.Error!Patch { 247 | switch (expr.*) { 248 | Expr.Literal => |lit| { 249 | const h = try c.pushHole(InstHole{ .Char = lit }); 250 | return Patch{ .hole = h, .entry = c.insts.items.len - 1 }; 251 | }, 252 | Expr.ByteClass => |classes| { 253 | // Similar, we use a special instruction. 254 | const h = try c.pushHole(InstHole{ .ByteClass = try classes.dupe(c.allocator) }); 255 | return Patch{ .hole = h, .entry = c.insts.items.len - 1 }; 256 | }, 257 | Expr.AnyCharNotNL => { 258 | const h = try c.pushHole(InstHole.AnyCharNotNL); 259 | return Patch{ .hole = h, .entry = c.insts.items.len - 1 }; 260 | }, 261 | Expr.EmptyMatch => |assertion| { 262 | const h = try c.pushHole(InstHole{ .EmptyMatch = assertion }); 263 | return Patch{ .hole = h, .entry = c.insts.items.len - 1 }; 264 | }, 265 | Expr.Repeat => |repeat| { 266 | // Case 1: * 267 | if (repeat.min == 0 and repeat.max == null) { 268 | return c.compileStar(repeat.subexpr, repeat.greedy); 269 | } 270 | // Case 2: + 271 | else if (repeat.min == 1 and repeat.max == null) { 272 | return c.compilePlus(repeat.subexpr, repeat.greedy); 273 | } 274 | // Case 3: ? 275 | else if (repeat.min == 0 and repeat.max != null and repeat.max.? 
== 1) { 276 | return c.compileQuestion(repeat.subexpr, repeat.greedy); 277 | } 278 | // Case 4: {m,} 279 | else if (repeat.max == null) { 280 | // e{2,} => eee* 281 | // fixed min concatenation 282 | const p = try c.compileInternal(repeat.subexpr); 283 | var hole = p.hole; 284 | const entry = p.entry; 285 | 286 | var i: usize = 1; 287 | while (i < repeat.min) : (i += 1) { 288 | const new_subexpr = try repeat.subexpr.clone(); 289 | const ep = try c.compileInternal(&new_subexpr); 290 | c.fill(hole, ep.entry); 291 | hole = ep.hole; 292 | } 293 | 294 | // add final e* infinite capture 295 | var new_subexpr = try repeat.subexpr.clone(); 296 | const st = try c.compileStar(&new_subexpr, repeat.greedy); 297 | c.fill(hole, st.entry); 298 | 299 | return Patch{ .hole = st.hole, .entry = entry }; 300 | } 301 | // Case 5: {m,n} and {m} 302 | else { 303 | // e{3,6} => eee?e?e?e? 304 | const p = try c.compileInternal(repeat.subexpr); 305 | var hole = p.hole; 306 | const entry = p.entry; 307 | 308 | var i: usize = 1; 309 | while (i < repeat.min) : (i += 1) { 310 | const new_subexpr = try repeat.subexpr.clone(); 311 | const ep = try c.compileInternal(&new_subexpr); 312 | c.fill(hole, ep.entry); 313 | hole = ep.hole; 314 | } 315 | 316 | // repeated optional concatenations 317 | while (i < repeat.max.?) 
: (i += 1) { 318 | var new_subexpr = try repeat.subexpr.clone(); 319 | const ep = try c.compileQuestion(&new_subexpr, repeat.greedy); 320 | c.fill(hole, ep.entry); 321 | hole = ep.hole; 322 | } 323 | 324 | return Patch{ .hole = hole, .entry = entry }; 325 | } 326 | }, 327 | Expr.Concat => |subexprs| { 328 | // Compile each item in the sub-expression 329 | const f = subexprs.items[0]; 330 | 331 | // First patch 332 | const p = try c.compileInternal(f); 333 | var hole = p.hole; 334 | const entry = p.entry; 335 | 336 | // tie together patches from concat arguments 337 | for (subexprs.items[1..]) |e| { 338 | const ep = try c.compileInternal(e); 339 | // fill the previous patch hole to the current entry 340 | c.fill(hole, ep.entry); 341 | // current hole is now the next fragment 342 | hole = ep.hole; 343 | } 344 | 345 | return Patch{ .hole = hole, .entry = entry }; 346 | }, 347 | Expr.Capture => |subexpr| { 348 | // 1: save 1, 2 349 | // 2: subexpr 350 | // 3: restore 1, 4 351 | // ... 352 | 353 | // Create a partial instruction with a hole outgoing at the current location. 354 | const entry = c.insts.items.len; 355 | 356 | const index = c.nextCaptureIndex(); 357 | 358 | try c.pushCompiled(Instruction.new(entry + 1, InstructionData{ .Save = index })); 359 | const p = try c.compileInternal(subexpr); 360 | c.fillToNext(p.hole); 361 | 362 | const h = try c.pushHole(InstHole{ .Save = index + 1 }); 363 | 364 | return Patch{ .hole = h, .entry = entry }; 365 | }, 366 | Expr.Alternate => |subexprs| { 367 | // Alternation with one path does not make sense 368 | debug.assert(subexprs.items.len >= 2); 369 | 370 | // Alternates are simply a series of splits into the sub-expressions, with each 371 | // subexpr having the same output hole (after the final subexpr). 372 | // 373 | // 1: split 2, 4 374 | // 2: subexpr1 375 | // 3: jmp 8 376 | // 4: split 5, 7 377 | // 5: subexpr2 378 | // 6: jmp 8 379 | // 7: subexpr3 380 | // 8: ... 
381 | 382 | const entry = c.insts.items.len; 383 | var holes = ArrayList(Hole).init(c.allocator); 384 | errdefer holes.deinit(); 385 | 386 | // TODO: Does this need to be dynamically allocated? 387 | const last_hole = try c.allocator.create(Hole); 388 | defer c.allocator.destroy(last_hole); 389 | last_hole.* = .None; 390 | 391 | // This compiles one branch of the split at a time. 392 | for (subexprs.items[0 .. subexprs.items.len - 1]) |subexpr| { 393 | c.fillToNext(last_hole.*); 394 | 395 | // next entry will be a sub-expression 396 | // 397 | // We fill the second part of this hole on the next sub-expression. 398 | last_hole.* = try c.pushHole(InstHole{ .Split1 = c.insts.items.len + 1 }); 399 | 400 | // compile the subexpression 401 | const p = try c.compileInternal(subexpr); 402 | 403 | // store outgoing hole for the subexpression 404 | try holes.append(p.hole); 405 | } 406 | 407 | // one entry left, push a sub-expression so we end with a double-subexpression. 408 | const p = try c.compileInternal(subexprs.items[subexprs.items.len - 1]); 409 | c.fill(last_hole.*, p.entry); 410 | 411 | // push the last sub-expression hole 412 | try holes.append(p.hole); 413 | 414 | // return many holes which are all to be filled to the next instruction 415 | return Patch{ .hole = Hole{ .Many = holes }, .entry = entry }; 416 | }, 417 | Expr.PseudoLeftParen => { 418 | @panic("internal error, encountered PseudoLeftParen"); 419 | }, 420 | } 421 | 422 | return Patch{ .hole = Hole.None, .entry = c.insts.items.len }; 423 | } 424 | 425 | fn compileStar(c: *Compiler, expr: *Expr, greedy: bool) !Patch { 426 | // 1: split 2, 4 427 | // 2: subexpr 428 | // 3: jmp 1 429 | // 4: ... 430 | 431 | // We do not know where the second branch in this split will go (unsure yet of 432 | // the length of the following subexpr). Need a hole. 433 | 434 | // Create a partial instruction with a hole outgoing at the current location. 435 | const entry = c.insts.items.len; 436 | 437 | // * or *? 
variant, simply switch the branches, the matcher manages precedence 438 | // of the executing threads. 439 | const partial_inst = if (greedy) 440 | InstHole{ .Split1 = c.insts.items.len + 1 } 441 | else 442 | InstHole{ .Split2 = c.insts.items.len + 1 }; 443 | 444 | const h = try c.pushHole(partial_inst); 445 | 446 | // compile the subexpression 447 | const p = try c.compileInternal(expr); 448 | 449 | // sub-expression to jump 450 | c.fillToNext(p.hole); 451 | 452 | // Jump back to the entry split 453 | try c.pushCompiled(Instruction.new(entry, InstructionData.Jump)); 454 | 455 | // Return a filled patch set to the first split instruction. 456 | return Patch{ .hole = h, .entry = entry }; 457 | } 458 | 459 | fn compilePlus(c: *Compiler, expr: *Expr, greedy: bool) !Patch { 460 | // 1: subexpr 461 | // 2: split 1, 3 462 | // 3: ... 463 | // 464 | // NOTE: We can do a lookahead on non-greedy here to improve performance. 465 | const p = try c.compileInternal(expr); 466 | 467 | // Create the next expression in place 468 | c.fillToNext(p.hole); 469 | 470 | // split 3, 1 (non-greedy) 471 | // Point back to the upcoming next instruction (will always be filled). 472 | const partial_inst = if (greedy) 473 | InstHole{ .Split1 = p.entry } 474 | else 475 | InstHole{ .Split2 = p.entry }; 476 | 477 | const h = try c.pushHole(partial_inst); 478 | 479 | // split to the next instruction 480 | return Patch{ .hole = h, .entry = p.entry }; 481 | } 482 | 483 | fn compileQuestion(c: *Compiler, expr: *Expr, greedy: bool) !Patch { 484 | // 1: split 2, 3 485 | 486 | // 2: subexpr 487 | // 3: ... 488 | 489 | // Create a partial instruction with a hole outgoing at the current location. 
490 | const partial_inst = if (greedy) 491 | InstHole{ .Split1 = c.insts.items.len + 1 } 492 | else 493 | InstHole{ .Split2 = c.insts.items.len + 1 }; 494 | 495 | const h = try c.pushHole(partial_inst); 496 | 497 | // compile the subexpression 498 | const p = try c.compileInternal(expr); 499 | 500 | var holes = ArrayList(Hole).init(c.allocator); 501 | errdefer holes.deinit(); 502 | try holes.append(h); 503 | try holes.append(p.hole); 504 | 505 | // Return a filled patch set to the first split instruction. 506 | return Patch{ .hole = Hole{ .Many = holes }, .entry = p.entry - 1 }; 507 | } 508 | 509 | // Push a compiled instruction directly onto the stack. 510 | fn pushCompiled(c: *Compiler, i: Instruction) !void { 511 | try c.insts.append(PartialInst{ .Compiled = i }); 512 | } 513 | 514 | // Push a instruction with a hole onto the set 515 | fn pushHole(c: *Compiler, i: InstHole) !Hole { 516 | const h = c.insts.items.len; 517 | try c.insts.append(PartialInst{ .Uncompiled = i }); 518 | return Hole{ .One = h }; 519 | } 520 | 521 | // Patch an individual hole with the specified output address. 522 | fn fill(c: *Compiler, hole: Hole, goto1: usize) void { 523 | switch (hole) { 524 | Hole.None => {}, 525 | Hole.One => |pc| c.insts.items[pc].fill(goto1), 526 | Hole.Many => |*holes| { 527 | for (holes.items) |hole1| 528 | c.fill(hole1, goto1); 529 | holes.deinit(); 530 | }, 531 | } 532 | } 533 | 534 | // Patch a hole to point to the next instruction 535 | fn fillToNext(c: *Compiler, hole: Hole) void { 536 | c.fill(hole, c.insts.items.len); 537 | } 538 | }; 539 | -------------------------------------------------------------------------------- /src/debug.zig: -------------------------------------------------------------------------------- 1 | // AST/IR Inspection routines are in a separate compilation unit to avoid pulling in any 2 | // dependencies on i/o output (which may not be supported in a freestanding environment). 

const debug = @import("std").debug;

const parse = @import("parse.zig");
const compile = @import("compile.zig");

const Expr = parse.Expr;
const Instruction = compile.Instruction;
const InstructionData = compile.InstructionData;
const Program = compile.Program;

/// Print a single byte to stderr in a readable form: tab, carriage return and newline are
/// written as backslash escapes, printable ASCII (32...126) verbatim, anything else as `0x`
/// followed by its hex value.
pub fn printCharEscaped(ch: u8) void {
    switch (ch) {
        '\t' => debug.print("\\t", .{}),
        '\r' => debug.print("\\r", .{}),
        '\n' => debug.print("\\n", .{}),
        // printable characters
        32...126 => debug.print("{c}", .{ch}),
        else => debug.print("0x{x}", .{ch}),
    }
}

/// Print the expression tree rooted at `e` to stderr, one node per line, children indented
/// one step deeper than their parent.
pub fn dumpExpr(e: Expr) void {
    dumpExprIndent(e, 0);
}

// Print `e` at the given indentation depth, then recurse into its children at `indent + 1`.
fn dumpExprIndent(e: Expr, indent: usize) void {
    for (0..indent) |_| {
        debug.print(" ", .{});
    }

    switch (e) {
        .AnyCharNotNL => {
            debug.print("{s}\n", .{@tagName(e)});
        },
        .EmptyMatch => |assertion| {
            debug.print("{s}({s})\n", .{ @tagName(e), @tagName(assertion) });
        },
        .Literal => |lit| {
            debug.print("{s}(", .{@tagName(e)});
            printCharEscaped(lit);
            debug.print(")\n", .{});
        },
        .Capture => |subexpr| {
            debug.print("{s}\n", .{@tagName(e)});
            dumpExprIndent(subexpr.*, indent + 1);
        },
        .Repeat => |repeat| {
            debug.print("{s}(min={d}, max={?d}, greedy={any})\n", .{ @tagName(e), repeat.min, repeat.max, repeat.greedy });
            dumpExprIndent(repeat.subexpr.*, indent + 1);
        },
        .ByteClass => |class| {
            debug.print("{s}(", .{@tagName(e)});
            for (class.ranges.items) |r| {
                debug.print("[", .{});
                printCharEscaped(r.min);
                debug.print("-", .{});
                printCharEscaped(r.max);
                debug.print("]", .{});
            }
            debug.print(")\n", .{});
        },
        // Concat and Alternate carry the same payload type and print identically.
        .Concat, .Alternate => |subexprs| {
            debug.print("{s}\n", .{@tagName(e)});
            for (subexprs.items) |s| {
                dumpExprIndent(s.*, indent + 1);
            }
        },
        // NOTE: Shouldn't occur ever in returned output.
        .PseudoLeftParen => {
            debug.print("{s}\n", .{@tagName(e)});
        },
    }
}

/// Print one compiled instruction to stderr on a single line.
///
/// The byte of a `Char` instruction goes through `printCharEscaped`, so a newline or other
/// non-printable byte cannot break the one-instruction-per-line listing.
pub fn dumpInstruction(s: Instruction) void {
    switch (s.data) {
        .Char => |ch| {
            debug.print("char({}) '", .{s.out});
            printCharEscaped(ch);
            debug.print("'\n", .{});
        },
        .EmptyMatch => |assertion| {
            debug.print("empty({}) {s}\n", .{ s.out, @tagName(assertion) });
        },
        .ByteClass => |class| {
            debug.print("range({}) ", .{s.out});
            for (class.ranges.items) |r| {
                debug.print("[{d}-{d}]", .{ r.min, r.max });
            }
            debug.print("\n", .{});
        },
        .AnyCharNotNL => {
            debug.print("any({})\n", .{s.out});
        },
        .Match => {
            debug.print("match\n", .{});
        },
        .Jump => {
            debug.print("jump({})\n", .{s.out});
        },
        .Split => |branch| {
            debug.print("split({}) {}\n", .{ s.out, branch });
        },
        .Save => |slot| {
            debug.print("save({}), {}\n", .{ s.out, slot });
        },
    }
}

/// Print the program's start index followed by every instruction, each prefixed with its
/// index as `L<n>: `.
pub fn dumpProgram(s: Program) void {
    debug.print("start: {}\n\n", .{s.start});
    for (s.insts, 0..) |inst, i| {
        debug.print("L{}: ", .{i});
        dumpInstruction(inst);
    }
}

--------------------------------------------------------------------------------
/src/exec.zig:
--------------------------------------------------------------------------------
const std = @import("std");
const Allocator = std.mem.Allocator;
const ArrayList = std.ArrayList;
const compile = @import("compile.zig");
const Program = compile.Program;

const VmBacktrack = @import("vm_backtrack.zig").VmBacktrack;
const VmPike = @import("vm_pike.zig").VmPike;
const Input = @import("input.zig").Input;

/// Run `prog` against `input`, starting at instruction `prog_start`, and return the engine's
/// result. The backtracking engine is used when `VmBacktrack.shouldExec` accepts this
/// program/input pair; otherwise the Pike VM runs. `slots` is handed to the chosen engine.
pub fn exec(allocator: Allocator, prog: Program, prog_start: usize, input: *Input, slots: *ArrayList(?usize)) !bool {
    if (VmBacktrack.shouldExec(prog, input)) {
        var engine = VmBacktrack.init(allocator);
        return engine.exec(prog, prog_start, input, slots);
    } else {
        var engine = VmPike.init(allocator);
        return engine.exec(prog, prog_start, input, slots);
    }
}

--------------------------------------------------------------------------------
/src/input.zig:
--------------------------------------------------------------------------------
// A generic iterator of some input bytes.
//
// This is intended to handle different decoding patterns. The intent is to have a Utf-8 and byte
// input abstraction. Execution engines can be generic over these two types.
//
// Technically we could encode Utf-8 into associated bytes when constructing the program. This is
// typically slower on the match however as for large unicode states many more states need to be
// traversed.

const Assertion = @import("parse.zig").Assertion;

/// Cursor over a byte slice. Decoding behaviour is supplied through the function-pointer
/// fields so that different input flavours can share this one interface.
pub const Input = struct {
    bytes: []const u8,
    byte_pos: usize,

    currentFn: *const fn (input: Input) ?u8,
    advanceFn: *const fn (input: *Input) void,
    isNextWordCharFn: *const fn (input: Input) bool,
    isPrevWordCharFn: *const fn (input: Input) bool,
    isCurrWordCharFn: *const fn (input: Input) bool,

    /// Move the cursor forward using the installed `advanceFn`.
    pub fn advance(self: *Input) void {
        self.advanceFn(self);
    }

    /// Return the element under the cursor via the installed `currentFn`.
    pub fn current(self: Input) ?u8 {
        return self.currentFn(self);
    }

    // Note: We extend the range here to one past the end of the input. This is done in order to
    // handle complete matches correctly.
    pub fn isConsumed(self: Input) bool {
        return self.byte_pos > self.bytes.len;
    }

    /// Report whether the zero-width assertion `match` holds at the current position.
    ///
    /// The end anchors hold once at most one byte remains (`byte_pos + 1 >= bytes.len`). This
    /// is written as an addition so that an empty input cannot underflow `bytes.len - 1`; on
    /// empty input the end anchors always hold.
    pub fn isEmptyMatch(self: Input, match: Assertion) bool {
        return switch (match) {
            .None => true,
            .BeginLine => self.byte_pos == 0,
            // TODO: Handle different modes.
            .BeginText => self.byte_pos == 0,
            .EndLine, .EndText => self.byte_pos +| 1 >= self.bytes.len,
            .WordBoundaryAscii => self.isPrevWordCharFn(self) != self.isCurrWordCharFn(self),
            .NotWordBoundaryAscii => self.isPrevWordCharFn(self) == self.isCurrWordCharFn(self),
        };
    }

    // Create a new instance using the same interface functions.
    pub fn clone(self: Input) Input {
        return Input{
            .bytes = self.bytes,
            .byte_pos = self.byte_pos,

            .currentFn = self.currentFn,
            .advanceFn = self.advanceFn,
            .isNextWordCharFn = self.isNextWordCharFn,
            .isPrevWordCharFn = self.isPrevWordCharFn,
            .isCurrWordCharFn = self.isCurrWordCharFn,
        };
    }
};

/// `Input` implementation that treats the source as raw bytes.
pub const InputBytes = struct {
    input: Input,

    /// Wrap `bytes` with the cursor at position 0. The slice is stored, not copied.
    pub fn init(bytes: []const u8) InputBytes {
        return InputBytes{
            .input = Input{
                .bytes = bytes,
                .byte_pos = 0,

                .currentFn = current,
                .advanceFn = advance,
                .isNextWordCharFn = isNextWordChar,
                .isPrevWordCharFn = isPrevWordChar,
                .isCurrWordCharFn = isCurrWordChar,
            },
        };
    }

    // Byte under the cursor, or null once the cursor is at or past the end.
    fn current(self: Input) ?u8 {
        if (self.byte_pos < self.bytes.len) {
            return self.bytes[self.byte_pos];
        } else {
            return null;
        }
    }

    // Step one byte forward; the cursor may reach `bytes.len + 1` but never goes beyond it.
    fn advance(self: *Input) void {
        if (self.byte_pos <= self.bytes.len) {
            self.byte_pos += 1;
        }
    }

    // ASCII letters and digits count as word characters.
    fn isWordChar(c: u8) bool {
        return switch (c) {
            '0'...'9', 'a'...'z', 'A'...'Z' => true,
            else => false,
        };
    }

    // True when a byte exists after the current one and it is a word character. The bound is
    // checked with an addition so that empty input cannot underflow `bytes.len - 1`.
    fn isNextWordChar(self: Input) bool {
        return (self.byte_pos +| 1 < self.bytes.len) and isWordChar(self.bytes[self.byte_pos + 1]);
    }

    // True when a byte exists before the cursor and it is a word character.
    fn isPrevWordChar(self: Input) bool {
        return (self.byte_pos > 0) and isWordChar(self.bytes[self.byte_pos - 1]);
    }

    // True when the cursor is inside the input and sits on a word character.
    fn isCurrWordChar(self: Input) bool {
        return (self.byte_pos < self.bytes.len) and isWordChar(self.bytes[self.byte_pos]);
    }
};

--------------------------------------------------------------------------------
/src/parse.zig:
--------------------------------------------------------------------------------
/// Parses a regular expression into an expression-tree.
/// Uses a stack-based parser to avoid unbounded recursion.
const std = @import("std");
const math = std.math;
const mem = std.mem;
const fmt = std.fmt;
const Allocator = std.mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator;
const ArrayList = std.ArrayList;
const debug = std.debug;

const range_set = @import("range_set.zig");
const ByteClassTemplates = range_set.ByteClassTemplates;

/// A single class range (e.g. [a-z]).
pub const ByteRange = range_set.Range(u8);

/// Multiple class ranges (e.g. [a-z0-9])
pub const ByteClass = range_set.RangeSet(u8);

/// Repeat sequence (e.g. +, *, ?, {m,n})
pub const Repeater = struct {
    // The sub-expression to repeat
    subexpr: *Expr,
    // Lower number of times to match
    min: usize,
    // Upper number of times to match (null -> infinite)
    max: ?usize,
    // Whether this matches greedily
    greedy: bool,
};

/// A specific look-around assertion
pub const Assertion = enum {
    // Always true assertion
    None,
    // ^ anchor, beginning of text (or line depending on mode)
    BeginLine,
    // $ anchor, end of text (or line depending on mode)
    EndLine,
    // \A anchor, beginning of text
    BeginText,
    // \z anchor, end of text
    EndText,
    // \b anchor, word boundary ascii
    WordBoundaryAscii,
    // \B anchor, non-word boundary ascii
    NotWordBoundaryAscii,
};

/// A single node of an expression tree.
pub const Expr = union(enum) {
    // Empty match (zero-width assertion such as ^, $ or \b)
    EmptyMatch: Assertion,
    // A single character byte to match
    Literal: u8,
    // . character
    AnyCharNotNL,
    // Capture group
    Capture: *Expr,
    // *, +, ?
    Repeat: Repeater,
    // Character class [a-z0-9]
    ByteClass: ByteClass,
    // Concatenation
    Concat: ArrayList(*Expr),
    // |
    Alternate: ArrayList(*Expr),
    // Pseudo stack operator to define start of a capture
    PseudoLeftParen,

    /// True for the node kinds the parser accepts as the operand of a repeat operator.
    pub fn isByteClass(re: *const Expr) bool {
        switch (re.*) {
            .Literal,
            .ByteClass,
            .AnyCharNotNL,
            // TODO: Don't keep capture here, but allow on repeat operators.
            .Capture,
            => return true,
            else => return false,
        }
    }

    /// Copy this node. A `ByteClass` payload is duplicated through its own `clone`; every
    /// other variant is copied by value, so pointer payloads still refer to the same children.
    pub fn clone(re: *Expr) !Expr {
        return switch (re.*) {
            .ByteClass => |*bc| Expr{ .ByteClass = try bc.clone() },
            else => re.*,
        };
    }

    /// Release the storage held directly by this node. Only `ByteClass` is released here;
    /// all other variants are left untouched.
    pub fn deinit(re: *Expr) void {
        switch (re.*) {
            .ByteClass => |*bc| bc.deinit(),
            // Without this prong the switch is non-exhaustive and fails to compile as soon as
            // `deinit` is referenced.
            else => {},
        }
    }
};

// Private in fmt.
//
// Map an ASCII digit or letter to its numeric value and reject anything that is not a valid
// digit in `radix`.
fn charToDigit(c: u8, radix: u8) !u8 {
    const value = switch (c) {
        '0'...'9' => c - '0',
        'A'...'Z' => c - 'A' + 10,
        'a'...'z' => c - 'a' + 10,
        else => return error.InvalidChar,
    };

    if (value >= radix)
        return error.InvalidChar;

    return value;
}

// Byte cursor over the pattern being parsed.
const StringIterator = struct {
    const Self = @This();

    slice: []const u8,
    index: usize,

    pub fn init(s: []const u8) Self {
        return StringIterator{
            .slice = s,
            .index = 0,
        };
    }

    // Advance the stream and return the next token.
    pub fn next(it: *Self) ?u8 {
        if (it.index < it.slice.len) {
            const n = it.index;
            it.index += 1;
            return it.slice[n];
        } else {
            return null;
        }
    }

    // Advance the stream.
    pub fn bump(it: *Self) void {
        if (it.index < it.slice.len) {
            it.index += 1;
        }
    }

    // Reset the stream back one character
    pub fn bumpBack(it: *Self) void {
        if (it.index > 0) {
            it.index -= 1;
        }
    }

    // Look at the nth character in the stream without advancing.
    fn peekAhead(it: *const Self, comptime n: usize) ?u8 {
        if (it.index + n < it.slice.len) {
            return it.slice[it.index + n];
        } else {
            return null;
        }
    }

    // Return true if the character one past the next character in the stream is `ch`.
    pub fn peekNextIs(it: *const Self, ch: u8) bool {
        if (it.peekAhead(1)) |ok_ch| {
            return ok_ch == ch;
        } else {
            return false;
        }
    }

    // Look at the next character in the stream without advancing.
    pub fn peek(it: *const Self) ?u8 {
        return it.peekAhead(0);
    }

    // Return true if the next character in the stream is `ch`.
    pub fn peekIs(it: *const Self, ch: u8) bool {
        if (it.peek()) |ok_ch| {
            return ok_ch == ch;
        } else {
            return false;
        }
    }

    // Read an integer from the stream. Any non-digit characters stops the parsing chain.
    //
    // Error if no digits were read.
    //
    // TODO: Non character word-boundary instead?
    pub fn readInt(it: *Self, comptime T: type, comptime radix: u8) !T {
        return it.readIntN(T, radix, math.maxInt(usize));
    }

    // Read an integer from the stream, limiting the read to N characters at most.
    //
    // Returns error.NoIntegerRead when the stream does not start with a digit valid in
    // `radix`; errors from `fmt.parseUnsigned` (such as a value that does not fit in `T`) are
    // passed through.
    pub fn readIntN(it: *Self, comptime T: type, comptime radix: u8, comptime N: usize) !T {
        const start = it.index;

        var i: usize = 0;
        while (it.peek()) |ch| : (i += 1) {
            if (i >= N) {
                break;
            }

            if (charToDigit(ch, radix)) |_| {
                it.bump();
            } else |_| {
                break;
            }
        }

        if (start != it.index) {
            return try fmt.parseUnsigned(T, it.slice[start..it.index], radix);
        } else {
            return error.NoIntegerRead;
        }
    }

    // Skip over any run of ' ' characters at the cursor.
    pub fn skipSpaces(it: *Self) void {
        while (it.peek()) |ok| {
            if (ok != ' ')
                return;

            it.bump();
        }
    }
};

pub const ParseError = error{
    MissingRepeatOperand,
    MissingRepeatArgument,
    InvalidRepeatArgument,
    EmptyAlternate,
    UnbalancedParentheses,
    UnopenedParentheses,
    UnclosedParentheses,
    EmptyCaptureGroup,
    UnmatchedByteClass,
    StackUnderflow,
    InvalidRepeatRange,
    UnclosedRepeat,
    UnclosedBrackets,
    ExcessiveRepeatCount,
    OpenEscapeCode,
    UnclosedHexCharacterCode,
    InvalidHexDigit,
    InvalidOctalDigit,
    UnrecognizedEscapeCode,
};

pub const ParserOptions = struct {
    // Upper limit on values allowed in a bounded expression (e.g. {500,1000}).
    // This must be bounded as these are unrolled by the engine into individual branches and
    // otherwise are a vector for memory exhaustion attacks.
    max_repeat_length: usize,

    pub fn default() ParserOptions {
        return ParserOptions{ .max_repeat_length = 1000 };
    }
};

/// Parser manages the parsing state and converts a regular expression string into an expression tree.
///
/// The resulting expression is tied to the Parser which generated it.
pub const Parser = struct {
    // Parse expression stack
    stack: ArrayList(*Expr),
    // ArenaAllocator for generating all expression nodes
    arena: ArenaAllocator,
    // Allocator for temporary lists/items
    allocator: Allocator,
    // Configurable parser options
    options: ParserOptions,
    // Internal execution state.
    it: StringIterator,

    /// Create a parser with the default options.
    pub fn init(a: Allocator) Parser {
        return initWithOptions(a, ParserOptions.default());
    }

    /// Create a parser with explicit options. `a` backs both the stack and the node arena.
    pub fn initWithOptions(a: Allocator, options: ParserOptions) Parser {
        return Parser{
            .stack = ArrayList(*Expr).init(a),
            .arena = ArenaAllocator.init(a),
            .allocator = a,
            .options = options,
            .it = undefined,
        };
    }

    /// Free the stack and every node allocated from the arena.
    pub fn deinit(p: *Parser) void {
        p.stack.deinit();
        p.arena.deinit();
    }

    /// Drop all parse state so the parser can be reused. Expressions returned by earlier
    /// `parse` calls are freed together with the arena.
    pub fn reset(p: *Parser) void {
        p.stack.clearRetainingCapacity();

        // Note: A shrink or reset on the ArenaAllocator would be nice.
        p.arena.deinit();
        p.arena = ArenaAllocator.init(p.allocator);
    }

    fn popStack(p: *Parser) !*Expr {
        return p.stack.pop() orelse error.StackUnderflow;
    }

    // Pop the top of the stack, which must be usable as a repeat operand.
    fn popByteClass(p: *Parser) !*Expr {
        const re1 = try p.popStack();
        if (re1.isByteClass()) {
            return re1;
        } else {
            return error.MissingRepeatOperand;
        }
    }

    fn isPunctuation(c: u8) bool {
        return switch (c) {
            '\\', '.', '+', '*', '?', '(', ')', '|', '[', ']', '{', '}', '^', '$', '-' => true,
            else => false,
        };
    }

    fn createExpr(p: *Parser) !*Expr {
        return try p.arena.allocator().create(Expr);
    }

    // Allocate a node holding `value`.
    fn newExpr(p: *Parser, value: Expr) !*Expr {
        const r = try p.createExpr();
        r.* = value;
        return r;
    }

    // Allocate a node holding `value` and push it onto the parse stack.
    fn pushExpr(p: *Parser, value: Expr) !void {
        const r = try p.newExpr(value);
        try p.stack.append(r);
    }

    // Items are collected while popping the stack, i.e. in reverse source order. Restore the
    // order and fold them into one node: a lone item is copied as-is, anything else (including
    // an empty list) becomes a Concat.
    fn foldConcat(p: *Parser, concat: ArrayList(*Expr)) !*Expr {
        mem.reverse(*Expr, concat.items);

        const r = try p.createExpr();
        if (concat.items.len == 1) {
            r.* = concat.items[0].*;
        } else {
            r.* = Expr{ .Concat = concat };
        }
        return r;
    }

    // Push a fresh alternation whose first branch is `first`.
    fn pushAlternate(p: *Parser, first: *Expr) !void {
        const r = try p.createExpr();
        r.* = Expr{ .Alternate = ArrayList(*Expr).init(p.arena.allocator()) };
        try r.Alternate.append(first);
        try p.stack.append(r);
    }

    /// Parse `re` into an expression tree allocated from the parser's arena.
    ///
    /// The returned node stays valid until `reset` or `deinit` is called. Malformed patterns
    /// produce one of the `ParseError` values; allocation failure is passed through.
    pub fn parse(p: *Parser, re: []const u8) !*Expr {
        p.it = StringIterator.init(re);
        // Shorter alias
        const it = &p.it;

        while (it.next()) |ch| {
            switch (ch) {
                '*' => {
                    try p.parseRepeat(0, null);
                },
                '+' => {
                    try p.parseRepeat(1, null);
                },
                '?' => {
                    try p.parseRepeat(0, 1);
                },
                '{' => {
                    it.skipSpaces();

                    const min = it.readInt(usize, 10) catch return error.InvalidRepeatArgument;
                    var max: ?usize = min;

                    it.skipSpaces();

                    if (it.peekIs(',')) {
                        it.bump();
                        it.skipSpaces();

                        // {m,} case with infinite upper bound
                        if (it.peekIs('}')) {
                            max = null;
                        }
                        // {m,n} case with explicit bounds
                        else {
                            max = it.readInt(usize, 10) catch return error.InvalidRepeatArgument;

                            if (max.? < min) {
                                return error.InvalidRepeatRange;
                            }
                        }
                    }

                    it.skipSpaces();
                    if (!it.peekIs('}')) {
                        return error.UnclosedRepeat;
                    }
                    it.bump();

                    // We limit repeat counts to avoid arbitrary memory blowup during compilation
                    const limit = p.options.max_repeat_length;
                    if (min > limit or (max != null and max.? > limit)) {
                        return error.ExcessiveRepeatCount;
                    }

                    try p.parseRepeat(min, max);
                },
                '.' => {
                    try p.pushExpr(Expr{ .AnyCharNotNL = undefined });
                },
                '[' => {
                    try p.parseCharClass();
                },
                // Don't handle alternation just yet, parentheses group together arguments into
                // a sub-expression only.
                '(' => {
                    try p.pushExpr(Expr{ .PseudoLeftParen = undefined });
                },
                ')' => {
                    // Pop the stack until.
                    //
                    // - Empty, error unopened parenthesis.
                    // - ( pseudo operator, push a group expression of the concat
                    // - '|' pop and add the concat to the alternation list. Pop one more item
                    //   after which must be a opening parenthesis.
                    //
                    // '|' ensures there will be only one alternation on the stack here.
                    var concat = ArrayList(*Expr).init(p.arena.allocator());

                    while (true) {
                        // nothing left to pop: there was no matching '('
                        if (p.stack.items.len == 0) {
                            return error.UnopenedParentheses;
                        }

                        const e = p.stack.pop().?;
                        switch (e.*) {
                            // Existing alternation
                            .Alternate => {
                                const ra = try p.foldConcat(concat);

                                // append to the alternation stack
                                try e.Alternate.append(ra);

                                if (p.stack.items.len == 0) {
                                    return error.UnopenedParentheses;
                                }

                                // pop the left parentheses that must now exist; the pop is kept
                                // outside the assert so the side effect is explicit
                                const marker = p.stack.pop().?;
                                debug.assert(marker.* == Expr.PseudoLeftParen);

                                try p.pushExpr(Expr{ .Capture = e });
                                break;
                            },
                            // Existing parentheses, close the group
                            .PseudoLeftParen => {
                                if (concat.items.len == 0) {
                                    return error.EmptyCaptureGroup;
                                }

                                const ra = try p.foldConcat(concat);
                                try p.pushExpr(Expr{ .Capture = ra });
                                break;
                            },
                            // New expression, push onto concat stack
                            else => {
                                try concat.append(e);
                            },
                        }
                    }
                },
                '|' => {
                    // Pop the stack until.
                    //
                    // - Empty, then push the sub-expression as a concat.
                    // - ( pseudo operator, leave '(' and push concat.
                    // - '|' is found, pop the existing and add a new alternation to the array.
                    var concat = ArrayList(*Expr).init(p.arena.allocator());

                    if (p.stack.items.len == 0 or !p.stack.items[p.stack.items.len - 1].isByteClass()) {
                        return error.EmptyAlternate;
                    }

                    while (true) {
                        // stack exhausted: everything popped so far is the first branch
                        if (p.stack.items.len == 0) {
                            const ra = try p.foldConcat(concat);
                            try p.pushAlternate(ra);
                            break;
                        }

                        const e = p.stack.pop().?;
                        switch (e.*) {
                            // Existing alternation, combine
                            .Alternate => {
                                const ra = try p.foldConcat(concat);

                                // use the expression itself
                                try e.Alternate.append(ra);

                                try p.stack.append(e);
                                break;
                            },
                            // Existing parentheses, push new alternation
                            .PseudoLeftParen => {
                                // re-push parentheses marker
                                try p.stack.append(e);

                                const ra = try p.foldConcat(concat);
                                try p.pushAlternate(ra);
                                break;
                            },
                            // New expression, push onto concat stack
                            else => {
                                try concat.append(e);
                            },
                        }
                    }
                },
                '\\' => {
                    const r = try p.parseEscape();
                    try p.stack.append(r);
                },
                '^' => {
                    try p.pushExpr(Expr{ .EmptyMatch = Assertion.BeginLine });
                },
                '$' => {
                    try p.pushExpr(Expr{ .EmptyMatch = Assertion.EndLine });
                },
                else => {
                    try p.parseLiteral(ch);
                },
            }
        }

        // special case empty item
        if (p.stack.items.len == 0) {
            return p.newExpr(Expr{ .EmptyMatch = Assertion.None });
        }

        // special case single item to avoid top-level concat for simple.
        if (p.stack.items.len == 1) {
            const only = p.stack.pop().?;
            // A lone '(' must not leak out as a pseudo node.
            if (only.* == Expr.PseudoLeftParen) {
                return error.UnclosedParentheses;
            }
            return only;
        }

        // finish a concatenation result
        //
        // This pops items off the stack and concatenates them until:
        //
        // - The stack is empty (the items are concat and pushed and the single result is returned).
        // - An alternation is seen, this is popped and the current concat state is pushed as an
        //   alternation item.
        //
        // After any of these cases, the stack must be empty.
        //
        // There can be no parentheses left on the stack during this popping.
        var concat = ArrayList(*Expr).init(p.arena.allocator());

        while (true) {
            if (p.stack.items.len == 0) {
                // concat the items in reverse order and return
                return p.foldConcat(concat);
            }

            // pop an item, check if it is an alternate and not a pseudo left paren
            const e = p.stack.pop().?;
            switch (e.*) {
                .PseudoLeftParen => {
                    return error.UnclosedParentheses;
                },
                // Alternation at top-level, push concat and return
                .Alternate => {
                    const ra = try p.foldConcat(concat);

                    // use the expression itself
                    try e.Alternate.append(ra);

                    // if stack is not empty, this is an error
                    if (p.stack.items.len != 0) {
                        switch (p.stack.pop().?.*) {
                            .PseudoLeftParen => return error.UnclosedParentheses,
                            else => unreachable,
                        }
                    }

                    return e;
                },
                // New expression, push onto concat stack
                else => {
                    try concat.append(e);
                },
            }
        }
    }

    fn parseLiteral(p: *Parser, ch: u8) !void {
        try p.pushExpr(Expr{ .Literal = ch });
    }

    // Wrap the operand on top of the stack in a Repeat node. A '?' directly after the operator
    // makes the repeat non-greedy.
    fn parseRepeat(p: *Parser, min: usize, max: ?usize) !void {
        var greedy = true;
        if (p.it.peekIs('?')) {
            p.it.bump();
            greedy = false;
        }

        const sub_expr = p.popByteClass() catch return error.MissingRepeatOperand;

        const repeat = Repeater{
            .subexpr = sub_expr,
            .min = min,
            .max = max,
            .greedy = greedy,
        };

        try p.pushExpr(Expr{ .Repeat = repeat });
    }

    // NOTE: We don't handle nested character classes.
    fn parseCharClass(p: *Parser) !void {
        const it = &p.it;

        var class = ByteClass.init(p.arena.allocator());
        errdefer class.deinit();

        var negate = false;
        if (it.peekIs('^')) {
            it.bump();
            negate = true;
        }

        // A leading ']' in a class is always treated as a literal. This disallows
        // the empty byte-set '[]'.
        if (it.peekIs(']')) {
            it.bump();

            const range = ByteRange{ .min = ']', .max = ']' };
            try class.addRange(range);
        }

        while (!it.peekIs(']')) : (it.bump()) {
            if (it.peek() == null) {
                return error.UnclosedBrackets;
            }

            const chp = it.peek().?;

            // If this is a byte-class escape, we cannot expect an '-' range after it.
            // Accept the following - as a literal (may be bad behaviour).
            //
            // If it is not, then we can and it is fine.
            var range: ByteRange = undefined;

            if (chp == '\\') {
                it.bump();

                // parseEscape returns a literal or byteclass so reformat
                const r = try p.parseEscape();
                // NOTE: this is bumped on loop
                it.index -= 1;
                switch (r.*) {
                    .Literal => |value| {
                        range = ByteRange{ .min = value, .max = value };
                    },
                    .ByteClass => |*vv| {
                        defer vv.deinit();
                        // '-' doesn't make sense following this, merge class here
                        // and continue next.
                        try class.mergeClass(vv.*);
                        continue;
                    },
                    else => unreachable,
                }
            } else {
                range = ByteRange{ .min = chp, .max = chp };
            }

            // is this a range?
            if (it.peekNextIs('-')) {
                it.bump();
                it.bump();

                if (it.peek() == null) {
                    return error.UnclosedBrackets;
                } else if (it.peekIs(']')) {
                    // treat the '-' as a literal instead
                    it.index -= 1;
                } else {
                    range.max = it.peek().?;
                }
            }

            try class.addRange(range);
        }
        it.bump();

        if (negate) {
            try class.negate();
        }

        try p.pushExpr(Expr{ .ByteClass = class });
    }

    // Parse the escape sequence following a '\' and return the node it denotes. The node is
    // not pushed onto the stack; the caller decides what to do with it.
    fn parseEscape(p: *Parser) !*Expr {
        const ch = p.it.next() orelse return error.OpenEscapeCode;

        if (isPunctuation(ch)) {
            return p.newExpr(Expr{ .Literal = ch });
        }

        switch (ch) {
            // escape chars
            'a' => return p.newExpr(Expr{ .Literal = '\x07' }),
            'f' => return p.newExpr(Expr{ .Literal = '\x0c' }),
            'n' => return p.newExpr(Expr{ .Literal = '\n' }),
            'r' => return p.newExpr(Expr{ .Literal = '\r' }),
            't' => return p.newExpr(Expr{ .Literal = '\t' }),
            'v' => return p.newExpr(Expr{ .Literal = '\x0b' }),
            // perl codes
            's' => {
                const s = try ByteClassTemplates.Whitespace(p.arena.allocator());
                return p.newExpr(Expr{ .ByteClass = s });
            },
            'S' => {
                const s = try ByteClassTemplates.NonWhitespace(p.arena.allocator());
                return p.newExpr(Expr{ .ByteClass = s });
            },
            'w' => {
                const s = try ByteClassTemplates.AlphaNumeric(p.arena.allocator());
                return p.newExpr(Expr{ .ByteClass = s });
            },
            'W' => {
                const s = try ByteClassTemplates.NonAlphaNumeric(p.arena.allocator());
                return p.newExpr(Expr{ .ByteClass = s });
            },
            'd' => {
                const s = try ByteClassTemplates.Digits(p.arena.allocator());
                return p.newExpr(Expr{ .ByteClass = s });
            },
            'D' => {
                const s = try ByteClassTemplates.NonDigits(p.arena.allocator());
                return p.newExpr(Expr{ .ByteClass = s });
            },
            '0'...'9' => {
                p.it.bumpBack();

                // octal integer up to 3 digits; a leading '8' or '9', or a value that does not
                // fit in a u8, is reported as InvalidOctalDigit
                // TODO: u32 codepoint and not u8
                const value = p.it.readIntN(u8, 8, 3) catch return error.InvalidOctalDigit;
                return p.newExpr(Expr{ .Literal = value });
            },
            'x' => {
                p.it.skipSpaces();

                // '\x{2423}
                if (p.it.peekIs('{')) {
                    p.it.bump();

                    // TODO: u32 codepoint and not u8
                    const value = p.it.readInt(u8, 16) catch return error.InvalidHexDigit;

                    // TODO: Check range as well and if valid unicode codepoint
                    if (!p.it.peekIs('}')) {
                        return error.UnclosedHexCharacterCode;
                    }
                    p.it.bump();

                    return p.newExpr(Expr{ .Literal = value });
                }
                // '\x23
                else {
                    const value = p.it.readIntN(u8, 16, 2) catch return error.InvalidHexDigit;
                    return p.newExpr(Expr{ .Literal = value });
                }
            },
            'b' => return p.newExpr(Expr{ .EmptyMatch = Assertion.WordBoundaryAscii }),
            'B' => return p.newExpr(Expr{ .EmptyMatch = Assertion.NotWordBoundaryAscii }),
            else => {
                return error.UnrecognizedEscapeCode;
            },
        }
    }
};

--------------------------------------------------------------------------------
/src/parse_test.zig:
--------------------------------------------------------------------------------
const std = @import("std");
const debug = std.debug;
const mem = std.mem;
const FixedBufferAllocator = std.heap.FixedBufferAllocator;

const parse = @import("parse.zig");
const Parser = parse.Parser;
const Expr = parse.Expr;
const ParseError = parse.ParseError;

// Note: Switch to OutStream
var global_buffer: [2048]u8 = undefined;

// Writer that appends into a caller-supplied fixed buffer.
const StaticWriter = struct {
    buffer: []u8,
    last: usize,

    pub fn init(buffer: []u8) StaticWriter {
        return StaticWriter{
            .buffer = buffer,
            .last = 0,
        };
    }

    // Append `bytes` after what has been written so far. Fails with OutOfMemory instead of
    // writing past the end of the buffer.
    pub fn writeFn(self: *StaticWriter, bytes: []const u8) Error!usize {
        if (bytes.len > self.buffer.len - self.last) {
            return error.OutOfMemory;
        }

        @memcpy(self.buffer[self.last..][0..bytes.len], bytes);
        self.last += bytes.len;
        return bytes.len;
    }

    pub const Error = error{OutOfMemory};
    pub const Writer = std.io.Writer(*StaticWriter, Error, writeFn);

    pub fn writer(self: *StaticWriter) Writer {
        return .{ .context = self };
    }

    // Same escaping rules as debug.zig's printCharEscaped, but written into this buffer.
    pub fn printCharEscaped(self: *StaticWriter, ch: u8) !void {
        switch (ch) {
            '\t' => {
                try self.writer().print("\\t", .{});
            },
            '\r' => {
                try self.writer().print("\\r", .{});
            },
            '\n' => {
                try self.writer().print("\\n", .{});
            },
            // printable characters
            32...126 => {
                try self.writer().print("{c}", .{ch});
            },
            else => {
                try self.writer().print("0x{x}", .{ch});
            },
        }
    }
};

// Return a minimal string representation of the expression tree.
61 | fn repr(e: *Expr) ![]u8 { 62 | var stream = StaticWriter.init(global_buffer[0..]); 63 | try reprIndent(&stream, e, 0); 64 | return global_buffer[0..stream.last]; 65 | } 66 | 67 | fn reprIndent(out: *StaticWriter, e: *Expr, indent: usize) anyerror!void { 68 | var i: usize = 0; 69 | while (i < indent) : (i += 1) { 70 | try out.writer().print(" ", .{}); 71 | } 72 | 73 | switch (e.*) { 74 | Expr.AnyCharNotNL => { 75 | try out.writer().print("dot\n", .{}); 76 | }, 77 | Expr.EmptyMatch => |assertion| { 78 | try out.writer().print("empty({s})\n", .{@tagName(assertion)}); 79 | }, 80 | Expr.Literal => |lit| { 81 | try out.writer().print("lit(", .{}); 82 | try out.printCharEscaped(lit); 83 | try out.writer().print(")\n", .{}); 84 | }, 85 | Expr.Capture => |subexpr| { 86 | try out.writer().print("cap\n", .{}); 87 | try reprIndent(out, subexpr, indent + 1); 88 | }, 89 | Expr.Repeat => |repeat| { 90 | try out.writer().print("rep(", .{}); 91 | if (repeat.min == 0 and repeat.max == null) { 92 | try out.writer().print("*", .{}); 93 | } else if (repeat.min == 1 and repeat.max == null) { 94 | try out.writer().print("+", .{}); 95 | } else if (repeat.min == 0 and repeat.max != null and repeat.max.? 
== 1) { 96 | try out.writer().print("?", .{}); 97 | } else { 98 | try out.writer().print("{{{d},", .{repeat.min}); 99 | if (repeat.max) |ok| { 100 | try out.writer().print("{d}", .{ok}); 101 | } 102 | try out.writer().print("}}", .{}); 103 | } 104 | 105 | if (!repeat.greedy) { 106 | try out.writer().print("?", .{}); 107 | } 108 | try out.writer().print(")\n", .{}); 109 | 110 | try reprIndent(out, repeat.subexpr, indent + 1); 111 | }, 112 | Expr.ByteClass => |class| { 113 | try out.writer().print("bset(", .{}); 114 | for (class.ranges.items) |r| { 115 | try out.writer().print("[", .{}); 116 | try out.printCharEscaped(r.min); 117 | try out.writer().print("-", .{}); 118 | try out.printCharEscaped(r.max); 119 | try out.writer().print("]", .{}); 120 | } 121 | try out.writer().print(")\n", .{}); 122 | }, 123 | // TODO: Can we get better type unification on enum variants with the same type? 124 | Expr.Concat => |subexprs| { 125 | try out.writer().print("cat\n", .{}); 126 | for (subexprs.items) |s| 127 | try reprIndent(out, s, indent + 1); 128 | }, 129 | Expr.Alternate => |subexprs| { 130 | try out.writer().print("alt\n", .{}); 131 | for (subexprs.items) |s| 132 | try reprIndent(out, s, indent + 1); 133 | }, 134 | // NOTE: Shouldn't occur ever in returned output. 
135 | Expr.PseudoLeftParen => { 136 | try out.writer().print("{s}\n", .{@tagName(e.*)}); 137 | }, 138 | } 139 | } 140 | 141 | fn check(re: []const u8, expected_ast: []const u8) void { 142 | var p = Parser.init(std.testing.allocator); 143 | defer p.deinit(); 144 | const expr = p.parse(re) catch unreachable; 145 | 146 | const ast = repr(expr) catch unreachable; 147 | 148 | const spaces = [_]u8{ ' ', '\n' }; 149 | const trimmed_ast = mem.trim(u8, ast, &spaces); 150 | const trimmed_expected_ast = mem.trim(u8, expected_ast, &spaces); 151 | 152 | if (!mem.eql(u8, trimmed_ast, trimmed_expected_ast)) { 153 | debug.print( 154 | \\ 155 | \\-- parsed the regex 156 | \\ 157 | \\{s} 158 | \\ 159 | \\-- expected the following 160 | \\ 161 | \\{s} 162 | \\ 163 | \\-- but instead got 164 | \\ 165 | \\{s} 166 | \\ 167 | , .{ 168 | re, 169 | trimmed_expected_ast, 170 | trimmed_ast, 171 | }); 172 | 173 | @panic("assertion failure"); 174 | } 175 | } 176 | 177 | // These are taken off rust-regex for the moment. 178 | test "parse simple" { 179 | check( 180 | \\ 181 | , 182 | \\empty(None) 183 | ); 184 | 185 | check( 186 | \\a 187 | , 188 | \\lit(a) 189 | ); 190 | 191 | check( 192 | \\ab 193 | , 194 | \\cat 195 | \\ lit(a) 196 | \\ lit(b) 197 | ); 198 | 199 | check( 200 | \\^a 201 | , 202 | \\cat 203 | \\ empty(BeginLine) 204 | \\ lit(a) 205 | ); 206 | 207 | check( 208 | \\a? 209 | , 210 | \\rep(?) 211 | \\ lit(a) 212 | ); 213 | 214 | check( 215 | \\ab? 216 | , 217 | \\cat 218 | \\ lit(a) 219 | \\ rep(?) 220 | \\ lit(b) 221 | ); 222 | 223 | check( 224 | \\a?? 225 | , 226 | \\rep(??) 227 | \\ lit(a) 228 | ); 229 | 230 | check( 231 | \\a+ 232 | , 233 | \\rep(+) 234 | \\ lit(a) 235 | ); 236 | 237 | check( 238 | \\a+? 239 | , 240 | \\rep(+?) 241 | \\ lit(a) 242 | ); 243 | 244 | check( 245 | \\a*? 246 | , 247 | \\rep(*?) 
248 | \\ lit(a) 249 | ); 250 | 251 | check( 252 | \\a{5} 253 | , 254 | \\rep({5,5}) 255 | \\ lit(a) 256 | ); 257 | 258 | check( 259 | \\a{5,} 260 | , 261 | \\rep({5,}) 262 | \\ lit(a) 263 | ); 264 | 265 | check( 266 | \\a{5,10} 267 | , 268 | \\rep({5,10}) 269 | \\ lit(a) 270 | ); 271 | 272 | check( 273 | \\a{5}? 274 | , 275 | \\rep({5,5}?) 276 | \\ lit(a) 277 | ); 278 | 279 | check( 280 | \\a{5,}? 281 | , 282 | \\rep({5,}?) 283 | \\ lit(a) 284 | ); 285 | 286 | check( 287 | \\a{ 5 } 288 | , 289 | \\rep({5,5}) 290 | \\ lit(a) 291 | ); 292 | 293 | check( 294 | \\(a) 295 | , 296 | \\cap 297 | \\ lit(a) 298 | ); 299 | 300 | check( 301 | \\(ab) 302 | , 303 | \\cap 304 | \\ cat 305 | \\ lit(a) 306 | \\ lit(b) 307 | ); 308 | 309 | check( 310 | \\a|b 311 | , 312 | \\alt 313 | \\ lit(a) 314 | \\ lit(b) 315 | ); 316 | 317 | check( 318 | \\a|b|c 319 | , 320 | \\alt 321 | \\ lit(a) 322 | \\ lit(b) 323 | \\ lit(c) 324 | ); 325 | 326 | check( 327 | \\(a|b) 328 | , 329 | \\cap 330 | \\ alt 331 | \\ lit(a) 332 | \\ lit(b) 333 | ); 334 | 335 | check( 336 | \\(a|b|c) 337 | , 338 | \\cap 339 | \\ alt 340 | \\ lit(a) 341 | \\ lit(b) 342 | \\ lit(c) 343 | ); 344 | 345 | check( 346 | \\(ab|bc|cd) 347 | , 348 | \\cap 349 | \\ alt 350 | \\ cat 351 | \\ lit(a) 352 | \\ lit(b) 353 | \\ cat 354 | \\ lit(b) 355 | \\ lit(c) 356 | \\ cat 357 | \\ lit(c) 358 | \\ lit(d) 359 | ); 360 | 361 | check( 362 | \\(ab|(bc|(cd))) 363 | , 364 | \\cap 365 | \\ alt 366 | \\ cat 367 | \\ lit(a) 368 | \\ lit(b) 369 | \\ cap 370 | \\ alt 371 | \\ cat 372 | \\ lit(b) 373 | \\ lit(c) 374 | \\ cap 375 | \\ cat 376 | \\ lit(c) 377 | \\ lit(d) 378 | ); 379 | 380 | check( 381 | \\. 
382 | , 383 | \\dot 384 | ); 385 | } 386 | 387 | test "parse escape" { 388 | check( 389 | \\\a\f\t\n\r\v 390 | , 391 | \\cat 392 | \\ lit(0x7) 393 | \\ lit(0xc) 394 | \\ lit(\t) 395 | \\ lit(\n) 396 | \\ lit(\r) 397 | \\ lit(0xb) 398 | ); 399 | 400 | check( 401 | \\\\\.\+\*\?\(\)\|\[\]\{\}\^\$ 402 | , 403 | \\cat 404 | \\ lit(\) 405 | \\ lit(.) 406 | \\ lit(+) 407 | \\ lit(*) 408 | \\ lit(?) 409 | \\ lit(() 410 | \\ lit()) 411 | \\ lit(|) 412 | \\ lit([) 413 | \\ lit(]) 414 | \\ lit({) 415 | \\ lit(}) 416 | \\ lit(^) 417 | \\ lit($) 418 | ); 419 | 420 | check("\\123", 421 | \\lit(S) 422 | ); 423 | 424 | check("\\1234", 425 | \\cat 426 | \\ lit(S) 427 | \\ lit(4) 428 | ); 429 | 430 | check("\\x53", 431 | \\lit(S) 432 | ); 433 | 434 | check("\\x534", 435 | \\cat 436 | \\ lit(S) 437 | \\ lit(4) 438 | ); 439 | 440 | check("\\x{53}", 441 | \\lit(S) 442 | ); 443 | 444 | check("\\x{53}4", 445 | \\cat 446 | \\ lit(S) 447 | \\ lit(4) 448 | ); 449 | } 450 | 451 | test "parse character classes" { 452 | check( 453 | \\[a] 454 | , 455 | \\bset([a-a]) 456 | ); 457 | 458 | check( 459 | \\[\x00] 460 | , 461 | \\bset([0x0-0x0]) 462 | ); 463 | 464 | check( 465 | \\[\n] 466 | , 467 | \\bset([\n-\n]) 468 | ); 469 | 470 | check( 471 | \\[^a] 472 | , 473 | \\bset([0x0-`][b-0xff]) 474 | ); 475 | 476 | check( 477 | \\[^\x00] 478 | , 479 | \\bset([0x1-0xff]) 480 | ); 481 | 482 | check( 483 | \\[^\n] 484 | , 485 | \\bset([0x0-\t][0xb-0xff]) 486 | ); 487 | 488 | check( 489 | \\[]] 490 | , 491 | \\bset([]-]]) 492 | ); 493 | 494 | check( 495 | \\[]\[] 496 | , 497 | \\bset([[-[][]-]]) 498 | ); 499 | 500 | check( 501 | \\[\[]] 502 | , 503 | \\cat 504 | \\ bset([[-[]) 505 | \\ lit(]) 506 | ); 507 | 508 | check( 509 | \\[]-] 510 | , 511 | \\bset([---][]-]]) 512 | ); 513 | 514 | check( 515 | \\[-]] 516 | , 517 | \\cat 518 | \\ bset([---]) 519 | \\ lit(]) 520 | ); 521 | } 522 | 523 | fn checkError(re: []const u8, expected_err: ParseError) void { 524 | var a = 
std.heap.ArenaAllocator.init(std.testing.allocator); 525 | defer a.deinit(); 526 | var p = Parser.init(a.allocator()); 527 | const parse_result = p.parse(re); 528 | 529 | if (parse_result) |expr| { 530 | const ast = repr(expr) catch unreachable; 531 | const spaces = [_]u8{ ' ', '\n' }; 532 | const trimmed_ast = mem.trim(u8, ast, &spaces); 533 | 534 | debug.print( 535 | \\ 536 | \\-- parsed the regex 537 | \\ 538 | \\{s} 539 | \\ 540 | \\-- expected the following 541 | \\ 542 | \\{s} 543 | \\ 544 | \\-- but instead got 545 | \\ 546 | \\{s} 547 | \\ 548 | \\ 549 | , .{ 550 | re, 551 | @errorName(expected_err), 552 | trimmed_ast, 553 | }); 554 | 555 | @panic("assertion failure"); 556 | } else |found_err| { 557 | if (found_err != expected_err) { 558 | debug.print( 559 | \\ 560 | \\-- parsed the regex 561 | \\ 562 | \\{s} 563 | \\ 564 | \\-- expected the following 565 | \\ 566 | \\{s} 567 | \\ 568 | \\-- but instead got 569 | \\ 570 | \\{s} 571 | \\ 572 | \\ 573 | , .{ 574 | re, 575 | @errorName(expected_err), 576 | @errorName(found_err), 577 | }); 578 | 579 | @panic("assertion failure"); 580 | } 581 | } 582 | } 583 | 584 | test "parse errors repeat" { 585 | checkError( 586 | \\* 587 | , ParseError.MissingRepeatOperand); 588 | 589 | checkError( 590 | \\(* 591 | , ParseError.MissingRepeatOperand); 592 | 593 | checkError( 594 | \\({5} 595 | , ParseError.MissingRepeatOperand); 596 | 597 | checkError( 598 | \\{5} 599 | , ParseError.MissingRepeatOperand); 600 | 601 | checkError( 602 | \\a** 603 | , ParseError.MissingRepeatOperand); 604 | 605 | checkError( 606 | \\a|* 607 | , ParseError.MissingRepeatOperand); 608 | 609 | checkError( 610 | \\a*{5} 611 | , ParseError.MissingRepeatOperand); 612 | 613 | checkError( 614 | \\a|{5} 615 | , ParseError.MissingRepeatOperand); 616 | 617 | checkError( 618 | \\a{} 619 | , ParseError.InvalidRepeatArgument); 620 | 621 | checkError( 622 | \\a{5 623 | , ParseError.UnclosedRepeat); 624 | 625 | checkError( 626 | \\a{xyz 627 | , 
ParseError.InvalidRepeatArgument); 628 | 629 | checkError( 630 | \\a{12,xyz 631 | , ParseError.InvalidRepeatArgument); 632 | 633 | checkError( 634 | \\a{999999999999} 635 | , ParseError.ExcessiveRepeatCount); 636 | 637 | checkError( 638 | \\a{1,999999999999} 639 | , ParseError.ExcessiveRepeatCount); 640 | 641 | checkError( 642 | \\a{12x} 643 | , ParseError.UnclosedRepeat); 644 | 645 | checkError( 646 | \\a{1,12x} 647 | , ParseError.UnclosedRepeat); 648 | } 649 | 650 | test "parse errors alternate" { 651 | checkError( 652 | \\|a 653 | , ParseError.EmptyAlternate); 654 | 655 | checkError( 656 | \\(|a) 657 | , ParseError.EmptyAlternate); 658 | 659 | checkError( 660 | \\a|| 661 | , ParseError.EmptyAlternate); 662 | 663 | checkError( 664 | \\) 665 | , ParseError.UnopenedParentheses); 666 | 667 | checkError( 668 | \\ab) 669 | , ParseError.UnopenedParentheses); 670 | 671 | checkError( 672 | \\a|b) 673 | , ParseError.UnopenedParentheses); 674 | 675 | checkError( 676 | \\(a|b 677 | , ParseError.UnclosedParentheses); 678 | 679 | //checkError( 680 | // \\(a|) 681 | //, 682 | // ParseError.UnopenedParentheses 683 | //); 684 | 685 | //checkError( 686 | // \\() 687 | //, 688 | // ParseError.UnopenedParentheses 689 | //); 690 | 691 | checkError( 692 | \\ab(xy 693 | , ParseError.UnclosedParentheses); 694 | 695 | //checkError( 696 | // \\() 697 | //, 698 | // ParseError.UnopenedParentheses 699 | //); 700 | 701 | //checkError( 702 | // \\a| 703 | //, 704 | // ParseError.UnbalancedParentheses 705 | //); 706 | } 707 | 708 | test "parse errors escape" { 709 | checkError("\\", ParseError.OpenEscapeCode); 710 | 711 | checkError("\\m", ParseError.UnrecognizedEscapeCode); 712 | 713 | checkError("\\x", ParseError.InvalidHexDigit); 714 | 715 | //checkError( 716 | // "\\xA" 717 | //, 718 | // ParseError.UnrecognizedEscapeCode 719 | //); 720 | 721 | //checkError( 722 | // "\\xAG" 723 | //, 724 | // ParseError.UnrecognizedEscapeCode 725 | //); 726 | 727 | checkError("\\x{", 
ParseError.InvalidHexDigit); 728 | 729 | checkError("\\x{A", ParseError.UnclosedHexCharacterCode); 730 | 731 | checkError("\\x{AG}", ParseError.UnclosedHexCharacterCode); 732 | 733 | checkError("\\x{D800}", ParseError.InvalidHexDigit); 734 | 735 | checkError("\\x{110000}", ParseError.InvalidHexDigit); 736 | 737 | checkError("\\x{99999999999999}", ParseError.InvalidHexDigit); 738 | } 739 | 740 | test "parse errors character class" { 741 | checkError( 742 | \\[ 743 | , ParseError.UnclosedBrackets); 744 | 745 | checkError( 746 | \\[^ 747 | , ParseError.UnclosedBrackets); 748 | 749 | checkError( 750 | \\[a 751 | , ParseError.UnclosedBrackets); 752 | 753 | checkError( 754 | \\[^a 755 | , ParseError.UnclosedBrackets); 756 | 757 | checkError( 758 | \\[a- 759 | , ParseError.UnclosedBrackets); 760 | 761 | checkError( 762 | \\[^a- 763 | , ParseError.UnclosedBrackets); 764 | 765 | checkError( 766 | \\[--- 767 | , ParseError.UnclosedBrackets); 768 | 769 | checkError( 770 | \\[\A] 771 | , ParseError.UnrecognizedEscapeCode); 772 | 773 | //checkError( 774 | // \\[a-\d] 775 | //, 776 | // ParseError.UnclosedBrackets 777 | //); 778 | 779 | //checkError( 780 | // \\[a-\A] 781 | //, 782 | // ParseError.UnrecognizedEscapeCode 783 | //); 784 | 785 | checkError( 786 | \\[\A-a] 787 | , ParseError.UnrecognizedEscapeCode); 788 | 789 | //checkError( 790 | // \\[z-a] 791 | //, 792 | // ParseError.UnclosedBrackets 793 | //); 794 | 795 | checkError( 796 | \\[] 797 | , ParseError.UnclosedBrackets); 798 | 799 | checkError( 800 | \\[^] 801 | , ParseError.UnclosedBrackets); 802 | 803 | //checkError( 804 | // \\[^\d\D] 805 | //, 806 | // ParseError.UnclosedBrackets 807 | //); 808 | 809 | //checkError( 810 | // \\[+--] 811 | //, 812 | // ParseError.UnclosedBrackets 813 | //); 814 | 815 | //checkError( 816 | // \\[a-a--\xFF] 817 | //, 818 | // ParseError.UnclosedBrackets 819 | //); 820 | } 821 | -------------------------------------------------------------------------------- /src/range_set.zig: 
--------------------------------------------------------------------------------
// A set of ordered disconnected non-empty ranges. These are stored in a flat array as opposed
// to a tree structure. Insertions maintain order by rearranging as needed. Asymptotically
// worse than a tree range-set but given the size of the typical range-sets we work with this
// implementation is undoubtedly quicker.

const std = @import("std");
const debug = std.debug;
const mem = std.mem;
const math = std.math;
const Allocator = std.mem.Allocator;
const ArrayList = std.ArrayList;

// A single inclusive range [min, max] with min <= max.
pub fn Range(comptime T: type) type {
    return struct {
        min: T,
        max: T,

        // Build the range [min, max]. Asserts that min <= max.
        pub fn new(min: T, max: T) Range(T) {
            debug.assert(min <= max);
            return Range(T){ .min = min, .max = max };
        }

        // Build the one-element range [item, item].
        pub fn single(item: T) Range(T) {
            return Range(T){ .min = item, .max = item };
        }
    };
}

// A contiguous set of ranges which manages merging of sub-ranges and negation of the entire class.
pub fn RangeSet(comptime T: type) type {
    return struct {
        const Self = @This();
        const RangeType = Range(T);

        // for any consecutive x, y in ranges, the following hold:
        //  - x.min <= x.max
        //  - x.max < y.min
        ranges: ArrayList(RangeType),

        // Create an empty set whose range list allocates from `a`.
        pub fn init(a: Allocator) Self {
            return Self{ .ranges = ArrayList(RangeType).init(a) };
        }

        // Free the range list.
        pub fn deinit(self: *Self) void {
            self.ranges.deinit();
        }

        // Copy the set, allocating from the same allocator as the original list.
        pub fn clone(self: Self) !Self {
            return Self{ .ranges = try self.ranges.clone() };
        }

        // Copy the set into a new list owned by `a`. The copy is allocated with
        // exactly as many slots as there are ranges.
        pub fn dupe(self: Self, a: Allocator) !Self {
            var cloned = try ArrayList(RangeType).initCapacity(a, self.ranges.items.len);
            cloned.appendSliceAssumeCapacity(self.ranges.items);
            return Self{ .ranges = cloned };
        }

        // Add a range into the current class, preserving the structure invariants.
        //
        // The range is first inserted at its sorted position (by `min`), then the
        // whole list is swept once to fuse overlapping or directly adjacent runs.
        pub fn addRange(self: *Self, range: RangeType) !void {
            var ranges = &self.ranges;

            if (ranges.items.len == 0) {
                try ranges.append(range);
                return;
            }

            // Insert range. The loop stops right after the insert, so the slice
            // being iterated is never read again once the list may have moved.
            for (ranges.items, 0..) |r, i| {
                if (range.min <= r.min) {
                    try ranges.insert(i, range);
                    break;
                }
            } else {
                try ranges.append(range);
            }

            // Merge overlapping runs.
            var index: usize = 0;
            var merge = ranges.items[0];

            for (ranges.items[1..]) |r| {
                // Overlap (or directly adjacent). The add saturates so a run
                // ending at maxInt(T) does not overflow.
                const upper = math.add(T, merge.max, 1) catch math.maxInt(T);
                if (r.min <= upper) {
                    merge.max = @max(merge.max, r.max);
                }
                // No overlap
                else {
                    ranges.items[index] = merge;
                    merge = r;
                    index += 1;
                }
            }

            ranges.items[index] = merge;
            index += 1;
            ranges.shrinkRetainingCapacity(index);
        }

        // Merge two classes into one.
        pub fn mergeClass(self: *Self, other: Self) !void {
            for (other.ranges.items) |r| {
                try self.addRange(r);
            }
        }

        // Replace the set with its complement over the whole value range of T,
        // i.e. after the call contains(v) returns the opposite of what it
        // returned before, for every v.
        //
        // The negation is performed in place. Negating an empty set yields the
        // single range [minInt(T), maxInt(T)]. Fails only if the list cannot grow.
        pub fn negate(self: *Self) !void {
            const ranges = &self.ranges;
            const old_len = ranges.items.len;

            if (old_len == 0) {
                try ranges.append(RangeType.new(math.minInt(T), math.maxInt(T)));
                return;
            }

            // The complement is built behind the existing ranges in the same
            // list and then moved to the front, which avoids a temporary set.
            // The complement of n disjoint ranges has at most n + 1 ranges, so
            // reserve that much up front: the loop below reads the old ranges
            // from the list it appends to, and a reallocation in the middle of
            // the loop would leave it reading freed memory.
            try ranges.ensureUnusedCapacity(old_len + 1);

            var low: T = math.minInt(T);
            for (ranges.items[0..old_len]) |r| {
                // Every range has a gap in front of it except a first range
                // that starts at minInt(T).
                if (r.min != math.minInt(T)) {
                    ranges.appendAssumeCapacity(RangeType.new(low, r.min - 1));
                }

                low = math.add(T, r.max, 1) catch math.maxInt(T);
            }

            // Highest segment will be remaining.
            if (ranges.items[old_len - 1].max != math.maxInt(T)) {
                ranges.appendAssumeCapacity(RangeType.new(low, math.maxInt(T)));
            }

            // Slide the complement over the original ranges and drop the tail.
            std.mem.copyForwards(RangeType, ranges.items, ranges.items[old_len..]);
            ranges.shrinkRetainingCapacity(ranges.items.len - old_len);
        }

        // Report whether `value` lies inside any range of the set.
        pub fn contains(self: Self, value: T) bool {
            // TODO: Binary search required for large unicode sets.
            for (self.ranges.items) |range| {
                if (range.min <= value and value <= range.max) {
                    return true;
                }
            }
            return false;
        }
    };
}

// Ready-made byte classes for the escape sequences \s \S \w \W \d \D.
// Every function returns a new set allocated from `a`; the caller owns it.
pub const ByteClassTemplates = struct {
    const ByteRange = Range(u8);
    const ByteClass = RangeSet(u8);

    pub fn Whitespace(a: Allocator) !ByteClass {
        var rs = ByteClass.init(a);
        errdefer rs.deinit();

        // \t, \n, \v, \f, \r
        try rs.addRange(ByteRange.new('\x09', '\x0D'));
        // ' '
        try rs.addRange(ByteRange.single(' '));

        return rs;
    }

    pub fn NonWhitespace(a: Allocator) !ByteClass {
        var rs = try Whitespace(a);
        errdefer rs.deinit();

        try rs.negate();
        return rs;
    }

    // Note: only 0-9, A-Z and a-z; the underscore is not part of this class.
    pub fn AlphaNumeric(a: Allocator) !ByteClass {
        var rs = ByteClass.init(a);
        errdefer rs.deinit();

        try rs.addRange(ByteRange.new('0', '9'));
        try rs.addRange(ByteRange.new('A', 'Z'));
        try rs.addRange(ByteRange.new('a', 'z'));

        return rs;
    }

    pub fn NonAlphaNumeric(a: Allocator) !ByteClass {
        var rs = try AlphaNumeric(a);
        errdefer rs.deinit();

        try rs.negate();
        return rs;
    }

    pub fn Digits(a: Allocator) !ByteClass {
        var rs = ByteClass.init(a);
        errdefer rs.deinit();

        try rs.addRange(ByteRange.new('0', '9'));

        return rs;
    }

    pub fn NonDigits(a: Allocator) !ByteClass {
        var rs = try Digits(a);
        errdefer rs.deinit();

        try rs.negate();
        return rs;
    }
};

test "class simple" {
    const alloc = std.testing.allocator;
    var a = RangeSet(u8).init(alloc);
    defer a.deinit();
    try a.addRange(Range(u8).new(0, 54));

    debug.assert(a.contains(0));
    debug.assert(a.contains(23));
    debug.assert(a.contains(54));
    debug.assert(!a.contains(58));
}

231 | test "class
simple negate" { 232 | const alloc = std.testing.allocator; 233 | var a = RangeSet(u8).init(alloc); 234 | defer a.deinit(); 235 | try a.addRange(Range(u8).new(0, 54)); 236 | 237 | debug.assert(a.contains(0)); 238 | debug.assert(a.contains(23)); 239 | debug.assert(a.contains(54)); 240 | debug.assert(!a.contains(58)); 241 | 242 | try a.negate(); 243 | // Match the negation 244 | 245 | debug.assert(!a.contains(0)); 246 | debug.assert(!a.contains(23)); 247 | debug.assert(!a.contains(54)); 248 | debug.assert(a.contains(55)); 249 | debug.assert(a.contains(58)); 250 | 251 | try a.negate(); 252 | // negate is idempotent 253 | 254 | debug.assert(a.contains(0)); 255 | debug.assert(a.contains(23)); 256 | debug.assert(a.contains(54)); 257 | debug.assert(!a.contains(58)); 258 | } 259 | 260 | test "class multiple" { 261 | const alloc = std.testing.allocator; 262 | var a = RangeSet(u8).init(alloc); 263 | defer a.deinit(); 264 | try a.addRange(Range(u8).new(0, 20)); 265 | try a.addRange(Range(u8).new(80, 100)); 266 | try a.addRange(Range(u8).new(230, 255)); 267 | 268 | debug.assert(a.contains(20)); 269 | debug.assert(!a.contains(21)); 270 | debug.assert(!a.contains(79)); 271 | debug.assert(a.contains(80)); 272 | debug.assert(!a.contains(229)); 273 | debug.assert(a.contains(230)); 274 | debug.assert(a.contains(255)); 275 | } 276 | 277 | test "class multiple negated" { 278 | const alloc = std.testing.allocator; 279 | var a = RangeSet(u8).init(alloc); 280 | defer a.deinit(); 281 | try a.addRange(Range(u8).new(0, 20)); 282 | try a.addRange(Range(u8).new(80, 100)); 283 | try a.addRange(Range(u8).new(230, 255)); 284 | 285 | debug.assert(a.contains(20)); 286 | debug.assert(!a.contains(21)); 287 | debug.assert(!a.contains(79)); 288 | debug.assert(a.contains(80)); 289 | debug.assert(!a.contains(229)); 290 | debug.assert(a.contains(230)); 291 | debug.assert(a.contains(255)); 292 | 293 | try a.negate(); 294 | 295 | debug.assert(!a.contains(20)); 296 | debug.assert(a.contains(21)); 297 | 
debug.assert(a.contains(79)); 298 | debug.assert(!a.contains(80)); 299 | debug.assert(a.contains(229)); 300 | debug.assert(!a.contains(230)); 301 | debug.assert(!a.contains(255)); 302 | 303 | try a.negate(); 304 | 305 | debug.assert(a.contains(20)); 306 | debug.assert(!a.contains(21)); 307 | debug.assert(!a.contains(79)); 308 | debug.assert(a.contains(80)); 309 | debug.assert(!a.contains(229)); 310 | debug.assert(a.contains(230)); 311 | debug.assert(a.contains(255)); 312 | } 313 | 314 | test "class out of order" { 315 | const alloc = std.testing.allocator; 316 | var a = RangeSet(u8).init(alloc); 317 | defer a.deinit(); 318 | try a.addRange(Range(u8).new(80, 100)); 319 | try a.addRange(Range(u8).new(20, 30)); 320 | 321 | debug.assert(a.contains(80)); 322 | debug.assert(!a.contains(79)); 323 | debug.assert(!a.contains(101)); 324 | debug.assert(!a.contains(45)); 325 | debug.assert(!a.contains(19)); 326 | } 327 | 328 | test "class merging" { 329 | const alloc = std.testing.allocator; 330 | var a = RangeSet(u8).init(alloc); 331 | defer a.deinit(); 332 | try a.addRange(Range(u8).new(20, 100)); 333 | try a.addRange(Range(u8).new(50, 80)); 334 | try a.addRange(Range(u8).new(50, 140)); 335 | 336 | debug.assert(!a.contains(19)); 337 | debug.assert(a.contains(20)); 338 | debug.assert(a.contains(80)); 339 | debug.assert(a.contains(140)); 340 | debug.assert(!a.contains(141)); 341 | } 342 | 343 | test "class merging boundary" { 344 | const alloc = std.testing.allocator; 345 | var a = RangeSet(u8).init(alloc); 346 | defer a.deinit(); 347 | try a.addRange(Range(u8).new(20, 40)); 348 | try a.addRange(Range(u8).new(40, 60)); 349 | 350 | debug.assert(a.ranges.items.len == 1); 351 | } 352 | 353 | test "class merging adjacent" { 354 | const alloc = std.testing.allocator; 355 | var a = RangeSet(u8).init(alloc); 356 | defer a.deinit(); 357 | try a.addRange(Range(u8).new(56, 56)); 358 | try a.addRange(Range(u8).new(57, 57)); 359 | try a.addRange(Range(u8).new(58, 58)); 360 | 361 | 
debug.assert(a.ranges.items.len == 1); 362 | } 363 | -------------------------------------------------------------------------------- /src/regex.zig: --------------------------------------------------------------------------------
// External high-level Regex api.
//
// This hides details such as what matching engine is used internally and the parsing/compilation
// stages are merged into a single wrapper function.

const std = @import("std");
const Allocator = std.mem.Allocator;
const ArrayList = std.ArrayList;
const debug = std.debug;

const parse = @import("parse.zig");
const compile = @import("compile.zig");
const exec = @import("exec.zig");

const Parser = parse.Parser;
const Expr = parse.Expr;
const Compiler = compile.Compiler;
const Program = compile.Program;
const Instruction = compile.Instruction;

const InputBytes = @import("input.zig").InputBytes;

pub const Regex = struct {
    // Internal allocator
    allocator: Allocator,
    // A compiled set of instructions
    compiled: Program,
    // Capture slots
    slots: ArrayList(?usize),
    // Original regex string. Stored as passed to compile(), not copied.
    string: []const u8,

    // Compile a regex, possibly returning any error which occurred.
    //
    // The parser and compiler are temporaries that are torn down before
    // returning; only the compiled program and the (initially empty) slot list
    // live on in the returned value. Release it with deinit().
    pub fn compile(a: Allocator, re: []const u8) !Regex {
        var p = Parser.init(a);
        defer p.deinit();

        const expr = try p.parse(re);

        var c = Compiler.init(a);
        defer c.deinit();

        return Regex{
            .allocator = a,
            .compiled = try c.compile(expr),
            .slots = ArrayList(?usize).init(a),
            .string = re,
        };
    }

    // Free the capture slots and the compiled program.
    pub fn deinit(re: *Regex) void {
        re.slots.deinit();
        re.compiled.deinit();
    }

    // Does the regex match at the start of the string?
    // (Runs the program from its `start` entry point.)
    pub fn match(re: *Regex, input_str: []const u8) !bool {
        var input_bytes = InputBytes.init(input_str);
        return exec.exec(re.allocator, re.compiled, re.compiled.start, &input_bytes.input, &re.slots);
    }

    // Does the regex match anywhere in the string?
    // (Runs the program from its `find_start` entry point.)
    pub fn partialMatch(re: *Regex, input_str: []const u8) !bool {
        var input_bytes = InputBytes.init(input_str);
        return exec.exec(re.allocator, re.compiled, re.compiled.find_start, &input_bytes.input, &re.slots);
    }

    // Where in the string does the regex and its capture groups match?
    //
    // Zero capture is the entire match. Returns null when there is no match.
    // The returned Captures holds its own copy of the slots and must be
    // released with Captures.deinit().
    pub fn captures(re: *Regex, input_str: []const u8) !?Captures {
        var input_bytes = InputBytes.init(input_str);
        const is_match = try exec.exec(re.allocator, re.compiled, re.compiled.find_start, &input_bytes.input, &re.slots);

        if (is_match) {
            return try Captures.init(input_str, &re.slots);
        } else {
            return null;
        }
    }
};

// A pair of bounds used to index into an associated slice.
pub const Span = struct {
    lower: usize,
    upper: usize,
};

// A set of captures of a Regex on an input slice.
pub const Captures = struct {
    const Self = @This();

    // The matched input; slices returned by sliceAt() point into it.
    input: []const u8,
    // Allocator that owns `slots`.
    allocator: Allocator,
    // Capture n occupies slots[2n] (lower bound) and slots[2n + 1] (upper bound).
    slots: []const ?usize,

    // Snapshot the current slot values, allocating the copy from the slot
    // list's own allocator.
    pub fn init(input: []const u8, slots: *ArrayList(?usize)) !Captures {
        return Captures{
            .input = input,
            .allocator = slots.allocator,
            .slots = try slots.allocator.dupe(?usize, slots.items),
        };
    }

    // Free the slot snapshot.
    pub fn deinit(self: *Self) void {
        self.allocator.free(self.slots);
    }

    // Number of capture groups (complete lower/upper slot pairs).
    pub fn len(self: *const Self) usize {
        return self.slots.len / 2;
    }

    // Return the slice of the matching string for the specified capture index.
    // If the index did not participate in the capture group null is returned.
    pub fn sliceAt(self: *const Self, n: usize) ?[]const u8 {
        if (self.boundsAt(n)) |span| {
            return self.input[span.lower..span.upper];
        }

        return null;
    }

    // Return the bounds of capture `n` within the input, or null when `n` is
    // not a valid capture index or the group did not take part in the match.
    pub fn boundsAt(self: *const Self, n: usize) ?Span {
        // Check the index against the number of complete slot pairs before
        // doing any arithmetic on it: this keeps 2 * n from overflowing for
        // huge n and guarantees that both slots of the pair exist.
        if (n >= self.len()) return null;

        const base = 2 * n;
        if (self.slots[base]) |lower| {
            // Unwrapped unconditionally, as before: the code relies on the
            // upper bound being set whenever the lower bound is.
            const upper = self.slots[base + 1].?;
            return Span{
                .lower = lower,
                .upper = upper,
            };
        }

        return null;
    }
};
-------------------------------------------------------------------------------- /src/regex_test.zig: -------------------------------------------------------------------------------- 1 | const Regex = @import("regex.zig").Regex; 2 | const debug = @import("std").debug; 3 | const Parser = @import("parse.zig").Parser; 4 | const re_debug = @import("debug.zig"); 5 | 6 | const std = @import("std"); 7 | const mem = std.mem; 8 | 9 | fn check(re_input: []const u8, to_match: []const u8, expected: bool) void { 10 | var re = Regex.compile(std.testing.allocator, re_input) catch unreachable; 11 | defer re.deinit(); 12 | 13 | if ((re.partialMatch(to_match) catch unreachable) != expected) { 14 | debug.print( 15 | \\ 16 | \\ -- Failure!
------------------ 17 | \\ 18 | \\Regex: '{s}' 19 | \\String: '{s}' 20 | \\Expected: {any} 21 | \\ 22 | , .{ 23 | re_input, 24 | to_match, 25 | expected, 26 | }); 27 | 28 | // Dump expression tree and bytecode 29 | var p = Parser.init(std.testing.allocator); 30 | defer p.deinit(); 31 | const expr = p.parse(re_input) catch unreachable; 32 | 33 | debug.print( 34 | \\ 35 | \\ -- Expression Tree ------------ 36 | \\ 37 | , .{}); 38 | re_debug.dumpExpr(expr.*); 39 | 40 | debug.print( 41 | \\ 42 | \\ -- Bytecode ------------------- 43 | \\ 44 | , .{}); 45 | re_debug.dumpProgram(re.compiled); 46 | 47 | debug.print( 48 | \\ 49 | \\ ------------------------------- 50 | \\ 51 | , .{}); 52 | 53 | @panic("assertion failure"); 54 | } 55 | } 56 | 57 | fn checkCompile(allocator: mem.Allocator, re_input: []const u8) !void { 58 | var re = try Regex.compile(allocator, re_input); 59 | re.deinit(); 60 | } 61 | 62 | test "regex sanity tests" { 63 | // Taken from tiny-regex-c 64 | check("\\d", "5", true); 65 | check("\\w+", "hej", true); 66 | check("\\s", "\t \n", true); 67 | check("\\S", "\t \n", false); 68 | check("[\\s]", "\t \n", true); 69 | check("[\\S]", "\t \n", false); 70 | check("\\D", "5", false); 71 | check("\\W+", "hej", false); 72 | check("[0-9]+", "12345", true); 73 | check("\\D", "hej", true); 74 | check("\\d", "hej", false); 75 | check("[^\\w]", "\\", true); 76 | check("[\\W]", "\\", true); 77 | check("[\\w]", "\\", false); 78 | check("[^\\d]", "d", true); 79 | check("[\\d]", "d", false); 80 | check("[^\\D]", "d", false); 81 | check("[\\D]", "d", true); 82 | check("^.*\\\\.*$", "c:\\Tools", true); 83 | check("^[\\+-]*[\\d]+$", "+27", true); 84 | check("[abc]", "1c2", true); 85 | check("[abc]", "1C2", false); 86 | check("[1-5]+", "0123456789", true); 87 | check("[.2]", "1C2", true); 88 | check("a*$", "Xaa", true); 89 | check("a*$", "Xaa", true); 90 | check("[a-h]+", "abcdefghxxx", true); 91 | check("[a-h]+", "ABCDEFGH", false); 92 | check("[A-H]+", "ABCDEFGH", true); 93 | 
// Checks that a partial match is found and that capture 0 (the whole match)
// and capture 1 (the digit group) slice the expected parts of the input.
test "regex captures" {
    const haystack = "xxxxab0123a";

    var r = try Regex.compile(std.testing.allocator, "ab(\\d+)");
    defer r.deinit();

    // Use the testing helpers rather than `debug.assert` so that a failure is
    // reported by the test runner instead of aborting it.
    try std.testing.expect(try r.partialMatch(haystack));

    var caps = (try r.captures(haystack)) orelse return error.ExpectedCaptures;
    defer caps.deinit();

    const whole = caps.sliceAt(0) orelse return error.MissingCaptureGroup;
    const digits = caps.sliceAt(1) orelse return error.MissingCaptureGroup;

    try std.testing.expectEqualStrings("ab0123", whole);
    try std.testing.expectEqualStrings("0123", digits);
}
/// State shared by all jobs of a single backtracking run.
const ExecState = struct {
    /// Element type of the `visited` cache.
    const BitsetType = u32;
    /// Number of `BitsetType` words in the `visited` cache.
    const BitsetLen = 512;

    /// Program being executed.
    prog: *const Program,

    /// Capture slots written by `Save` instructions.
    slots: *ArrayList(?usize),

    /// Pending jobs, used as a stack.
    jobs: ArrayList(Job),

    /// Cache of visited (instruction, input position) pairs. Its size can be
    /// fixed because the backtracking engine is only used within a bound.
    visited: [BitsetLen]BitsetType,
};
/// Bounded backtracking matcher. Visited (instruction, input position) pairs
/// are cached in `ExecState.visited`, so each pair is stepped through at most
/// once per `exec` call.
pub const VmBacktrack = struct {
    const Self = @This();
    allocator: Allocator,

    /// Creates a matcher whose job stack is allocated from `allocator`.
    pub fn init(allocator: Allocator) Self {
        return Self{ .allocator = allocator };
    }

    /// Returns true when `(instruction count + 1) * (input length + 1)` is
    /// below the bound that `shouldVisit` can index in the visited cache.
    /// `exec` asserts this, so callers must check it first.
    pub fn shouldExec(prog: Program, input: *const Input) bool {
        return (prog.insts.len + 1) * (input.bytes.len + 1) < ExecState.BitsetLen * @sizeOf(ExecState.BitsetType);
    }

    /// Runs `prog` from instruction `prog_start` against `input`.
    ///
    /// Returns true as soon as a thread reaches a `Match` instruction. Capture
    /// positions are written into `slots`, which is grown on demand. Returns an
    /// error only if growing the job stack or `slots` fails.
    pub fn exec(self: *Self, prog: Program, prog_start: usize, input: *Input, slots: *ArrayList(?usize)) !bool {
        // Should never run this without first checking shouldExec and running only if true.
        debug.assert(shouldExec(prog, input));

        var state = ExecState{
            .jobs = ArrayList(Job).init(self.allocator),
            // Sized from the declared constants so the initialiser cannot
            // drift from the field type.
            .visited = [_]ExecState.BitsetType{0} ** ExecState.BitsetLen,
            .prog = &prog,
            .slots = slots,
        };
        defer state.jobs.deinit();

        try state.jobs.append(Job{ .Thread = Thread{ .ip = prog_start, .input = input.clone() } });

        while (state.jobs.pop()) |job| {
            switch (job) {
                Job.Thread => |thread| {
                    if (try step(&state, &thread)) {
                        return true;
                    }
                },
                Job.SaveRestore => |save| {
                    // Undo a capture written by a thread that has since failed.
                    if (save.slot < state.slots.items.len) {
                        state.slots.items[save.slot] = save.last_pos;
                    }
                },
            }
        }

        return false;
    }

    /// Runs a single thread until it reaches `Match` (returns true) or fails
    /// (returns false). Alternative branches of `Split` instructions are
    /// pushed onto the job stack for later.
    fn step(state: *ExecState, thread: *const Thread) !bool {
        // For linear actions, we can just modify the current thread and avoid pushing new items
        // to the stack.
        var input = thread.input;
        var ip = thread.ip;

        while (true) {
            const inst = state.prog.insts[ip];
            const at = input.current();

            if (!shouldVisit(state, ip, input.byte_pos)) {
                return false;
            }

            switch (inst.data) {
                InstructionData.Char => |ch| {
                    if (at == null or at.? != ch) {
                        return false;
                    }
                    input.advance();
                },
                InstructionData.EmptyMatch => |assertion| {
                    if (!input.isEmptyMatch(assertion)) {
                        return false;
                    }
                },
                InstructionData.ByteClass => |class| {
                    if (at == null or !class.contains(at.?)) {
                        return false;
                    }
                    input.advance();
                },
                InstructionData.AnyCharNotNL => {
                    if (at == null or at.? == '\n') {
                        return false;
                    }
                    input.advance();
                },
                InstructionData.Save => |slot| {
                    // Our capture array may not be long enough, extend and fill with empty
                    while (state.slots.items.len <= slot) {
                        try state.slots.append(null);
                    }

                    // We can save an existing match by creating a job which will run on this thread
                    // failing. This will reset to the old match before any subsequent splits in
                    // this thread.
                    if (state.slots.items[slot]) |last_pos| {
                        try state.jobs.append(Job{
                            .SaveRestore = SaveRestore{
                                .slot = slot,
                                .last_pos = last_pos,
                            },
                        });
                    }

                    state.slots.items[slot] = input.byte_pos;
                },
                InstructionData.Match => {
                    return true;
                },
                InstructionData.Jump => {
                    // Jump at end of loop
                },
                InstructionData.Split => |split| {
                    try state.jobs.append(Job{ .Thread = Thread{ .ip = split, .input = input.clone() } });
                },
            }

            ip = inst.out;
        }
    }

    /// Checks whether instruction `ip` has already been visited at input
    /// position `at`; if not, marks it as visited and returns true.
    fn shouldVisit(state: *ExecState, ip: usize, at: usize) bool {
        const BitsetType = ExecState.BitsetType;
        const BitsetShiftType = std.math.Log2Int(BitsetType);

        // NOTE(review): `size` is the byte size of a word, not its bit count,
        // so only the low `size` bits of every word are used. This matches the
        // bound in `shouldExec`; both must change together if the cache is to
        // be used fully.
        const size = @sizeOf(BitsetType);

        const n = at * (state.prog.insts.len + 1) + ip;
        const bitmask = @as(BitsetType, 1) << @as(BitsetShiftType, @intCast(n & (size - 1)));

        if ((state.visited[n / size] & bitmask) != 0) {
            return false;
        }

        state.visited[n / size] |= bitmask;
        return true;
    }
};
/// Pike-style matcher that advances every live thread over the input one
/// position at a time.
pub const VmPike = struct {
    const Self = @This();

    allocator: Allocator,

    /// Creates a matcher that allocates its thread lists and slot arrays from
    /// `allocator`.
    pub fn init(allocator: Allocator) Self {
        return Self{ .allocator = allocator };
    }

    /// Runs `prog` from instruction `prog_start` against `input`.
    ///
    /// On a match, `slots` is overwritten with the capture positions of the
    /// selected match and true is returned; otherwise `slots` is left
    /// untouched and false is returned. Among several matches the one starting
    /// leftmost is preferred, then the longest. Returns an error only when an
    /// allocation fails.
    pub fn exec(self: *Self, prog: Program, prog_start: usize, input: *Input, slots: *ArrayList(?usize)) !bool {
        // Threads for the current input position.
        var clist = ArrayList(Thread).init(self.allocator);
        defer clist.deinit();

        // Threads for the next input position.
        var nlist = ArrayList(Thread).init(self.allocator);
        defer nlist.deinit();

        var state = ExecState.init(self.allocator, prog);
        defer state.deinit();

        try clist.append(Thread{
            .pc = prog_start,
            .slots = try state.newSlot(),
        });

        var matched: ?[]?usize = null;

        while (!input.isConsumed()) : (input.advance()) {
            while (clist.pop()) |thread| {
                const inst = prog.insts[thread.pc];
                const at = input.current();
                const next = Thread{ .pc = inst.out, .slots = thread.slots };

                switch (inst.data) {
                    InstructionData.Char => |ch| {
                        if (at) |c| {
                            if (c == ch) try nlist.append(next);
                        }
                    },
                    InstructionData.EmptyMatch => |assertion| {
                        if (input.isEmptyMatch(assertion)) {
                            try clist.append(next);
                        }
                    },
                    InstructionData.ByteClass => |class| {
                        if (at) |c| {
                            if (class.contains(c)) try nlist.append(next);
                        }
                    },
                    InstructionData.AnyCharNotNL => {
                        if (at) |c| {
                            if (c != '\n') try nlist.append(next);
                        }
                    },
                    InstructionData.Match => {
                        // We always will have a complete capture in the 0, 1 index
                        if (matched) |last| {
                            // leftmost
                            if (thread.slots[0].? > last[0].?) {
                                continue;
                            }
                            // longest
                            if (thread.slots[1].? - thread.slots[0].? <= last[1].? - last[0].?) {
                                continue;
                            }
                        }

                        matched = try state.cloneSlots(thread.slots);

                        // TODO: Handle thread priority correctly so we can immediately finish all
                        // current threads in clist.
                    },
                    InstructionData.Save => |slot| {
                        // We don't need a deep copy here since we only ever advance forward so
                        // all future captures are valid for any subsequent threads.
                        thread.slots[slot] = input.byte_pos;
                        try clist.append(next);
                    },
                    InstructionData.Jump => {
                        try clist.append(next);
                    },
                    InstructionData.Split => |split| {
                        // Split pushed first since we want to handle the branch secondary to the
                        // current thread (popped from end).
                        try clist.append(Thread{
                            .pc = split,
                            .slots = try state.cloneSlots(thread.slots),
                        });
                        try clist.append(next);
                    },
                }
            }

            mem.swap(ArrayList(Thread), &clist, &nlist);
            nlist.clearRetainingCapacity();
        }

        if (matched) |ok_matched| {
            // Keep the caller's buffer instead of freeing it right before refilling it.
            slots.clearRetainingCapacity();
            try slots.appendSlice(ok_matched);
            return true;
        }

        return false;
    }
};
/// Compiles `re_input`, runs both the pike and the backtracking engine over
/// `to_match`, and panics with a diagnostic dump when the engines disagree on
/// the result, or when `expected` is true and their capture slots differ.
///
/// NOTE(review): `expected` is not compared against the engine results
/// themselves; confirm whether that is intended.
fn check(re_input: []const u8, to_match: []const u8, expected: bool) void {
    var re = Regex.compile(fixed_allocator.allocator(), re_input) catch |err|
        debug.panic("failed to compile regex '{s}': {s}", .{ re_input, @errorName(err) });
    defer re.deinit();

    // This is just an engine comparison test but we should also test against fixed vectors
    var backtrack = VmBacktrack.init(re.allocator);
    var backtrack_slots = ArrayList(?usize).init(re.allocator);
    defer backtrack_slots.deinit();
    var pike = VmPike.init(re.allocator);
    var pike_slots = ArrayList(?usize).init(re.allocator);
    defer pike_slots.deinit();

    var input1 = InputBytes.init(to_match).input;
    const pike_result = pike.exec(re.compiled, re.compiled.find_start, &input1, &pike_slots) catch |err|
        debug.panic("pikevm exec failed for '{s}': {s}", .{ re_input, @errorName(err) });

    var input2 = InputBytes.init(to_match).input;
    const backtrack_result = backtrack.exec(re.compiled, re.compiled.find_start, &input2, &backtrack_slots) catch |err|
        debug.panic("backtrack exec failed for '{s}': {s}", .{ re_input, @errorName(err) });

    const slots_equal = nullableEql(usize, pike_slots.items, backtrack_slots.items);

    // Note: slot entries are invalid on non-match
    if (pike_result != backtrack_result or (expected and !slots_equal)) {
        debug.print(
            \\
            \\ -- Failure! ----------------
            \\
            \\
            \\pikevm: {any}
            \\backtrack: {any}
            \\
        , .{ pike_result, backtrack_result });

        debug.print(
            \\
            \\ -- Slots -------------------
            \\
            \\pikevm
            \\
        , .{});
        for (pike_slots.items) |entry| {
            debug.print("{?d} ", .{entry});
        }
        debug.print("\n", .{});

        debug.print(
            \\
            \\
            \\backtrack
            \\
        , .{});
        for (backtrack_slots.items) |entry| {
            debug.print("{?d} ", .{entry});
        }
        debug.print("\n", .{});

        debug.print(
            \\
            \\ -- Regex ------------------
            \\
            \\Regex: '{s}'
            \\String: '{s}'
            \\Expected: {any}
            \\
        , .{ re_input, to_match, expected });

        // Dump expression tree and bytecode
        var p = Parser.init(std.testing.allocator);
        defer p.deinit();
        const expr = p.parse(re_input) catch unreachable;

        debug.print(
            \\
            \\ -- Expression Tree ------------
            \\
        , .{});
        re_debug.dumpExpr(expr.*);

        debug.print(
            \\
            \\ -- Bytecode -------------------
            \\
        , .{});
        re_debug.dumpProgram(re.compiled);

        debug.print(
            \\
            \\ -------------------------------
            \\
        , .{});

        @panic("assertion failure");
    }
}
"d", false); 153 | check("[\\D]", "d", true); 154 | check("^.*\\\\.*$", "c:\\Tools", true); 155 | check("^[\\+-]*[\\d]+$", "+27", true); 156 | check("[abc]", "1c2", true); 157 | check("[abc]", "1C2", false); 158 | check("[1-5]+", "0123456789", true); 159 | check("[.2]", "1C2", true); 160 | check("a*$", "Xaa", true); 161 | check("a*$", "Xaa", true); 162 | check("[a-h]+", "abcdefghxxx", true); 163 | check("[a-h]+", "ABCDEFGH", false); 164 | check("[A-H]+", "ABCDEFGH", true); 165 | check("[A-H]+", "abcdefgh", false); 166 | check("[^\\s]+", "abc def", true); 167 | check("[^fc]+", "abc def", true); 168 | check("[^d\\sf]+", "abc def", true); 169 | check("\n", "abc\ndef", true); 170 | //check("b.\\s*\n", "aa\r\nbb\r\ncc\r\n\r\n", true); 171 | check(".*c", "abcabc", true); 172 | check(".+c", "abcabc", true); 173 | check("[b-z].*", "ab", true); 174 | check("b[k-z]*", "ab", true); 175 | check("[0-9]", " - ", false); 176 | check("[^0-9]", " - ", true); 177 | check("[Hh]ello [Ww]orld\\s*[!]?", "Hello world !", true); 178 | check("[Hh]ello [Ww]orld\\s*[!]?", "hello world !", true); 179 | check("[Hh]ello [Ww]orld\\s*[!]?", "Hello World !", true); 180 | check("[Hh]ello [Ww]orld\\s*[!]?", "Hello world! 
", true); 181 | check("[Hh]ello [Ww]orld\\s*[!]?", "Hello world !", true); 182 | check("[Hh]ello [Ww]orld\\s*[!]?", "hello World !", true); 183 | check("[^\\w][^-1-4]", ")T", true); 184 | check("[^\\w][^-1-4]", ")^", true); 185 | check("[^\\w][^-1-4]", "*)", true); 186 | check("[^\\w][^-1-4]", "!.", true); 187 | check("[^\\w][^-1-4]", " x", true); 188 | check("[^\\w][^-1-4]", "$b", true); 189 | check("a{3,}", "aaa", true); 190 | check(".*emacs.*", "emacs-packages.nix", true); 191 | check("[a-b]|[d-f]\\s+", "d ", true); 192 | check("[a-b]|[d-f]\\s+", "b", true); 193 | check("[a-b]|[d-f]\\s+", "c", false); 194 | check("\\bx\\b", "x", true); 195 | check("\\bx\\b", " x ", true); 196 | check("\\bx", "Ax", false); 197 | check("x\\b", "xA", false); 198 | check("\\Bx\\B", "x", false); 199 | check("\\Bx\\B", " x ", false); 200 | check("\\Bx", "Ax", true); 201 | check("x\\B", "xA", true); 202 | } 203 | --------------------------------------------------------------------------------