├── .gitignore ├── README.md ├── bench.nim ├── benchdata.txt ├── copying.txt ├── ex1.nim ├── lexe.nim ├── lexim.nim ├── lexim.nims ├── listing.nim ├── nfa.nim ├── regexprs.nim ├── testa.nim ├── tests.nim ├── todo.txt └── vm.nim /.gitignore: -------------------------------------------------------------------------------- 1 | nimcache/ 2 | *.exe 3 | lexe.input 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lexim 2 | Lexer generation and regex implementation for Nim. 3 | 4 | # example 5 | Lexim requires a 'lexe' helper exe that is used by 'lexim'. 6 | Compile via ``nim c lexe`` and then you can run the example 7 | via ``nim c ex1.nim``. 8 | -------------------------------------------------------------------------------- /bench.nim: -------------------------------------------------------------------------------- 1 | 2 | import nfa, regexprs, listing 3 | from strutils import find 4 | 5 | const 6 | asRegex = ".*[Pp]leasuring" 7 | 8 | import vm 9 | from times import cpuTime 10 | 11 | var bc = vm.re(asRegex) 12 | echo "code ", bc.code.len, " data: ", bc.data.len 13 | 14 | template bench(text, doWork: untyped) = 15 | var t0 = cpuTime() 16 | doWork 17 | echo text, " took [s] ", cpuTime() - t0 18 | 19 | import re, strutils 20 | 21 | let thaRe = re.re("[Pp]leasuring", {reDotAll, reStudy}) 22 | 23 | import lexim 24 | proc lex(input: string): int = 25 | var pos = 0 26 | while pos < input.len: 27 | lexim.match input, pos: 28 | of r"[Pp]leasuring": 29 | return pos 30 | of r".": 31 | discard 32 | return -1 33 | 34 | import std/strscans 35 | proc scan(input: string): int = 36 | var pos = 0 37 | while pos < input.len: 38 | if scanp(input, pos, {'P', 'p'}, "leasuring"): 39 | return pos 40 | inc pos 41 | return -1 42 | 43 | import npeg 44 | proc pegs(input: string): int = 45 | let p = peg search: 46 | search <- @({'P', 'p'} * "leasuring") 47 | let r = p.match(input) 48 | return if r.ok: r.matchLen else: -1 49 | 50 | proc main = 51 | let inp = readFile("benchdata.txt") 52 | when true: 53 | bench "vm 1": 54 | for i in 1..100: 55 | discard vm.matchLen(inp, bc) 56 | 57 | bench "re A": 58 | for i in 1..100: 59 | discard re.find(inp, thaRe) 60 | 61 | bench "find": 62 | for i in 1..100: 63 | discard find(inp, "pleasuring") 64 | 65 | bench "lexer": 66 | for i in 1..100: 67 | discard lex(inp) 68 | 69 | bench "scanp": 70 | for i in 1..100: 71 | discard scan(inp) 72 | 73 | bench "npeg": 74 | for i in 1..100: 75 | discard pegs(inp) 76 | 77 | echo matchLen(inp, bc) 78 | echo re.find(inp, thaRe)+len"pleasuring" 79 | echo find(inp, "pleasuring")+len"pleasuring" 80 | echo lex(inp) # +len"pleasuring" 81 | echo scan(inp) # +len"pleasuring" 82 | echo pegs(inp) # +len"pleasuring" 83 | 84 | main() 85 | -------------------------------------------------------------------------------- /copying.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Andreas Rumpf 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 
| 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /ex1.nim: -------------------------------------------------------------------------------- 1 | discard """ 2 | output: ''' 3 | an identifier the## 4 | something else ## 5 | an integer 0909## 6 | something else ## 7 | an ELSE 8 | something else ## 9 | an identifier input## 10 | something else ## 11 | an ELIF 12 | something else ## 13 | an identifier elseo## 14 | something else ## 15 | an END''' 16 | """ 17 | 18 | import lexim 19 | 20 | proc main = 21 | var input = "the 0909 else input elif elseo end" 22 | var pos = 0 23 | while pos < input.len: 24 | let oldPos = pos 25 | match input, pos: 26 | of r"\d+": echo "an integer ", input.substr(oldPos, pos-1), "##" 27 | of "else": echo "an ELSE" 28 | of "elif": echo "an ELIF" 29 | of "end": echo "an END" 30 | of r"[a-zA-Z_]\w+": echo "an identifier ", input.substr(oldPos, pos-1), "##" 31 | of r".": echo "something else ", input.substr(oldPos, pos-1), "##" 32 | 33 | main() 34 | -------------------------------------------------------------------------------- /lexe.nim: -------------------------------------------------------------------------------- 1 | # 2 | # 3 | # Lexim - The Lexer Generator for Nim 4 | # (c) Copyright 2015 Andreas Rumpf 5 | # 6 | # See the file "copying.txt", included in this 7 | # distribution, for details about the copyright. 8 | # 9 | 10 | import 11 | regexprs, nfa, marshal 12 | 13 | # The part that implements lexer generation as an exe to speed up 14 | # this process. 
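# Standalone illustration (not part of lexe.nim): the protocol between the
# `match` macro in lexim.nim and this helper is plain `marshal` strings --
# the macro serialises one pattern string per `of` branch and reads a
# marshalled, optimized DFA back from the helper's output:
#
#   import marshal
#   let patterns = @[r"\d+", "else", "elif", "end", r"[a-zA-Z_]\w+", r"."]
#   let wire = $$patterns                       # what lexim.nim writes to lexe.input
#   doAssert to[seq[string]](wire) == patterns  # what the helper's main() decodes again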
15 | proc findMacro(name: string): PRegExpr = nil 16 | 17 | proc main(input: string): string = 18 | let inp = marshal.to[seq[string]](input) 19 | 20 | var bigRe: PRegExpr = nil 21 | for i in 0..= MaxChar: break 36 | inc c1 37 | 38 | proc charAt(s: string; i: int): char {.inline.} = 39 | result = if i < s.len: s[i] else: '\0' 40 | 41 | proc currChar(s, i: NimNode; isCString: bool): NimNode {.compileTime.} = 42 | result = 43 | if isCString: 44 | newTree(nnkBracketExpr, s, i) 45 | else: 46 | newCall(bindSym"charAt", s, i) 47 | 48 | proc getCmp(s, i: NimNode; x: set[char]; isCString: bool): NimNode {.compileTime.} = 49 | result = newCall(bindSym"contains", charSetLit(x), currChar(s, i, isCString)) 50 | 51 | proc getSpecial(s, i: NimNode; x: Alphabet; isCString: bool): NimNode {.compileTime.} = 52 | result = newCall(bindSym"==", currChar(s, i, isCString), newLit(x.val)) 53 | 54 | proc newVarStmt(name, typ, value: NimNode): NimNode {.compiletime.} = 55 | return newTree(nnkVarSection, newTree(nnkIdentDefs, name, typ, value)) 56 | 57 | proc nextState(i, state: NimNode; dest: int): NimNode {.compileTime.} = 58 | newStmtList(newCall(bindSym"inc", i), newAssignment(state, newLit(dest))) 59 | 60 | proc genMatcher(a: DFA; s, i, bodies: NimNode; isCString: bool): NimNode {.compileTime.} = 61 | let state = genSym(nskVar, "state") 62 | result = newStmtList() 63 | result.add newVarStmt(newTree(nnkPragmaExpr, state, 64 | newTree(nnkPragma, ident"goto")), 65 | newTree(nnkBracketExpr, bindSym"range", 66 | newRange(newLit(1), newLit(a.stateCount))), 67 | newLit(a.startState)) 68 | var caseStmt = newNimNode(nnkCaseStmt) 69 | caseStmt.add state 70 | result.add newTree(nnkWhileStmt, bindSym"true", caseStmt) 71 | for src in countup(1, a.stateCount): 72 | let rule = getRule(a, src) 73 | var ifStmt = newNimNode(nnkIfStmt) 74 | for dest in allDests(a, src): 75 | let (others, cs) = allTransitions(a, src, dest) 76 | if cs != {}: 77 | ifStmt.add newTree(nnkElifBranch, 78 | getCmp(s, i, cs, isCString), 79 | nextState(i, state, dest)) 80 | for ot in others: 81 | if ot.kind == reChar: 82 | ifStmt.add newTree(nnkElifBranch, 83 | getSpecial(s, i, ot, isCString), 84 | nextState(i, state, dest)) 85 | else: 86 | doAssert false, "not supported " & $ot.kind 87 | let actions = if rule >= 1: 88 | newStmtList(bodies[rule-1][1], newTree(nnkBreakStmt, 89 | newNimNode(nnkEmpty))) 90 | else: 91 | newTree(nnkBreakStmt, newNimNode(nnkEmpty)) 92 | if ifStmt.len == 0: 93 | caseStmt.add newTree(nnkOfBranch, newLit(src), actions) 94 | else: 95 | ifStmt.add newTree(nnkElse, actions) 96 | caseStmt.add newTree(nnkOfBranch, newLit(src), ifStmt) 97 | 98 | template `/.`(x: string): string = 99 | (when defined(posix): "./" & x else: x) 100 | 101 | macro match*(s: cstring|string; pos: int; sections: varargs[untyped]): untyped = 102 | let isCString = s.getType.typeKind == ntyCString 103 | when defined(leximSkipLexe): 104 | var bigRe: PRegExpr = nil 105 | var rule = 1 106 | for sec in sections.children: 107 | expectKind sec, nnkOfBranch 108 | expectLen sec, 2 109 | if sec[0].kind in nnkStrLit..nnkTripleStrLit: 110 | let rex = parseRegExpr(sec[0].strVal, findMacro, 111 | {reNoCaptures, reNoBackrefs}) 112 | rex.rule = rule 113 | if bigRe.isNil: bigRe = rex 114 | else: bigRe = altExpr(bigRe, rex) 115 | else: 116 | error("Expected a node of kind nnkStrLit, got " & $sec[0].kind) 117 | inc rule 118 | 119 | var n: NFA 120 | var d, o: DFA 121 | regExprToNFA(bigRe, n) 122 | let alph = fullAlphabet(n) 123 | NFA_to_DFA(n, d, alph) 124 | optimizeDFA(d, o, alph) 
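    # at this point `o` is the minimised DFA; genMatcher (above) expands it into
    # a `while true: case state` loop over a {.goto.} state variable, inlining
    # the `of`-branch bodies for the accepting rules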
125 | result = genMatcher(o, s, pos, sections, isCString) 126 | else: 127 | # use 'lexe.exe' helper program in order to speedup lexer generation 128 | var res: seq[string] = @[] 129 | for sec in sections.children: 130 | expectKind sec, nnkOfBranch 131 | expectLen sec, 2 132 | if sec[0].kind in nnkStrLit..nnkTripleStrLit: 133 | res.add sec[0].strVal 134 | else: 135 | error("Expected a node of kind nnkStrLit, got " & $sec[0].kind) 136 | 137 | let data = $$res 138 | writeFile("lexe.input", data) 139 | let o = to[DFA](staticExec(/."lexe", input="", cache=data)) 140 | result = genMatcher(o, s, pos, sections, isCString) 141 | echo repr result 142 | 143 | when isMainModule: # defined(testing): 144 | var input = "the 0909 else input elif elseo end" 145 | let asc = input.cstring 146 | var pos = 0 147 | while pos < input.len: 148 | let oldPos = pos 149 | match input, pos: 150 | of r"\d+": echo "an integer ", input.substr(oldPos, pos-1), "##" 151 | of "else": echo "an ELSE" 152 | of "elif": echo "an ELIF" 153 | of "end": echo "an END" 154 | of r"[a-zA-Z_]\w+": echo "an identifier ", input.substr(oldPos, pos-1), "##" 155 | of r".": echo "something else ", input.substr(oldPos, pos-1), "##" 156 | -------------------------------------------------------------------------------- /lexim.nims: -------------------------------------------------------------------------------- 1 | 2 | version = "1.0" 3 | author = "Andreas Rumpf" 4 | description = "Lexer generation and regex implementation for Nim." 5 | license = "MIT" 6 | 7 | requires "nim >= 0.11.3" 8 | 9 | import ospaths 10 | 11 | proc buildHelper(name: string) = 12 | if not fileExists(name.toExe): 13 | exec "nim c " & name 14 | 15 | task build, "builds Lexim and an example": 16 | buildHelper "lexe" 17 | exec "nim c ex1" 18 | setCommand "nop" 19 | 20 | task tests, "test regular expressions": 21 | exec "nim c -r tests" 22 | setCommand "nop" 23 | -------------------------------------------------------------------------------- /listing.nim: -------------------------------------------------------------------------------- 1 | # 2 | # 3 | # Lexim - The Lexer Generator for Nim 4 | # (c) Copyright 2015 Andreas Rumpf 5 | # 6 | # See the file "copying.txt", included in this 7 | # distribution, for details about the copyright. 
8 | # 9 | 10 | 11 | ## this modules contains utility functions for list generating routines: 12 | 13 | import strutils 14 | 15 | proc nchars*(cc: set[char]): int = 16 | result = 0 17 | for c in countup('\0', '\xFF'): 18 | if c in cc: inc(result) 19 | 20 | proc charStr*(c: char; reserved: set[char]): string = 21 | case c 22 | of '\b': 23 | result = "\\b" 24 | of '\t': 25 | result = "\\t" 26 | of '\C': 27 | result = "\\r" 28 | of '\L': 29 | result = "\\l" 30 | of '\v': 31 | result = "\\v" 32 | of '\f': 33 | result = "\\f" 34 | of '\e': 35 | result = "\\e" 36 | of '\a': 37 | result = "\\a" 38 | of '\\': 39 | result = "\\\\" 40 | else: 41 | if c < ' ': 42 | result = '\\' & $ord(c) 43 | elif c in reserved: 44 | result = '\\' & $c 45 | else: 46 | result = $c 47 | 48 | proc singleQuoteStr*(str: string): string = 49 | result = "'" 50 | for c in str: result.add charStr(c, {'\''}) 51 | result.add '\'' 52 | 53 | proc doubleQuoteStr*(str: string): string = 54 | result = "\"" 55 | for c in str: result.add charStr(c, {'\"'}) 56 | result.add '\"' 57 | 58 | proc charSetStrAux(cc: set[char]): string = 59 | const 60 | reserved = {'^', '-', ']'} 61 | MaxChar = '\xFF' 62 | result = "" 63 | var c1 = '\0' 64 | while true: 65 | if c1 in cc: 66 | var c2 = c1 67 | while (c2 < MaxChar) and (succ(c2) in cc): c2 = succ(c2) 68 | if c1 == c2: 69 | result.add charStr(c1, reserved) 70 | elif c2 == succ(c1): 71 | result.add charStr(c1, reserved) & charStr(c2, reserved) 72 | else: 73 | result.add charStr(c1, reserved) & '-' & charStr(c2, reserved) 74 | c1 = c2 75 | if c1 >= MaxChar: break 76 | inc(c1) 77 | 78 | proc charSetStr*(cc: set[char]): string = 79 | if cc == {'\x01'..'\xFF'} - {'\L'}: 80 | result = "." 81 | else: 82 | if nchars(cc) > 128: 83 | result = "[^" & charSetStrAux({'\0'..'\xFF'} - cc) & ']' 84 | else: 85 | result = '[' & charSetStrAux(cc) & ']' 86 | 87 | proc charSetOrCharStr*(cc: set[char]): string = 88 | var count = 0 89 | var c1 = '\0' # to avoid warnings 90 | for c in countup('\0', '\xFF'): 91 | if c in cc: 92 | c1 = c 93 | inc(count) 94 | if count > 1: 95 | result = charSetStr(cc) 96 | elif count == 1: 97 | result = charStr(c1, {'.'}) # was: singleQuoteStr(c1) 98 | else: 99 | result = "[]" 100 | 101 | -------------------------------------------------------------------------------- /nfa.nim: -------------------------------------------------------------------------------- 1 | # 2 | # 3 | # Lexim - The Lexer Generator for Nim 4 | # (c) Copyright 2015 Andreas Rumpf 5 | # 6 | # See the file "copying.txt", included in this 7 | # distribution, for details about the copyright. 8 | # 9 | 10 | import 11 | regexprs 12 | 13 | const 14 | maxLabel* = 255 15 | 16 | type 17 | Alphabet* = object 18 | kind*: RegexKind 19 | val*: char 20 | 21 | const 22 | alEpsilon* = Alphabet(kind: reEps, val: '\0') 23 | 24 | type 25 | TRuleIndex* = range[0..10_000] 26 | TLabel* = range[0..maxLabel] # 0 is an invalid label number, indicating 27 | # there is no transition 28 | TLabelSet* = set[TLabel] # max. 
size may be bigger in Nim 29 | # transition tables: if label = 0, 30 | # it is the start node 31 | DFA_Edge* = object 32 | cond*: Alphabet 33 | dest*: TLabel 34 | DFA_Trans* = array[TLabel, seq[DFA_Edge]] # transitions for DFA's 35 | # label = 1 is the start node 36 | 37 | NFA_Edge* = object 38 | cond*: Alphabet 39 | dest*: TLabelSet 40 | NFA_Trans* = array[TLabel, seq[NFA_Edge]] # transitions for NFA's 41 | # label 0 is the start node 42 | TLabelToRule* = array[TLabel, TRuleIndex] 43 | DFA* = object 44 | startState*: int # start state; for some reason it won't always be 1 45 | stateCount*: int # number of states; states are from 1 to stateCount 46 | captures*, backrefs*: int 47 | ruleCount*: int # number of rules; rule 0 means no match 48 | trans*: DFA_Trans 49 | toRules*: TLabelToRule 50 | 51 | NFA* = object 52 | captures, backrefs, stateCount: int 53 | trans*: NFA_Trans 54 | toRules*: TLabelToRule 55 | 56 | proc initNFA(a: var NFA) = discard 57 | proc initDFA(a: var DFA) = discard 58 | 59 | proc addTrans(src: var seq[NFA_Edge]; c: Alphabet; d: TLabel) = 60 | for i in 0 .. high(src): 61 | if src[i].cond == c: 62 | src[i].dest.incl d 63 | return 64 | src.add(NFA_Edge(cond: c, dest: {d})) 65 | if c.kind == reEps and src.len != 1: 66 | # make epsilon always the first transition to speed up later passes: 67 | swap(src[0], src[src.high]) 68 | 69 | proc addTrans(src: var seq[DFA_Edge]; c: Alphabet; d: TLabel) = 70 | for i in 0 .. high(src): 71 | if src[i].cond == c: 72 | src[i].dest = d 73 | return 74 | src.add(DFA_Edge(cond: c, dest: d)) 75 | 76 | proc auxRegExprToNFA(r: PRegExpr; a: var NFA; currState: int): int = 77 | # helper that is recursive; returns the new current state 78 | result = currState 79 | assert(r != nil) 80 | if r == nil: return 81 | case r.kind 82 | of reEps: 83 | addTrans(a.trans[result], alEpsilon, result + 1) 84 | inc(result) 85 | of reChar: 86 | addTrans(a.trans[result], Alphabet(kind: reChar, val: r.c), result + 1) 87 | inc(result) 88 | of reWordBoundary, reWordBoundaryNot, reBegin, reEnd: 89 | addTrans(a.trans[result], Alphabet(kind: r.kind, val: '\0'), result + 1) 90 | inc(result) 91 | of reStr: 92 | # string node 93 | for i in countup(0, len(r.s)-1): 94 | addTrans(a.trans[result], Alphabet(kind: reChar, val: r.s[i]), result + 1) 95 | inc(result) 96 | of reCat: 97 | # concatenation node 98 | result = auxRegExprToNFA(r.a, a, result) 99 | result = auxRegExprToNFA(r.b, a, result) 100 | of reCClass: 101 | addTrans(a.trans[result], alEpsilon, result + 1) 102 | inc(result) 103 | for c in countup('\0', '\xFF'): 104 | if c in r.cc[]: 105 | addTrans(a.trans[result], Alphabet(kind: reChar, val: c), result + 1) 106 | inc(result) 107 | of reStar: 108 | # star node 109 | # we draw one transition too much, which shouldn't be wrong 110 | let aa = auxRegExprToNFA(r.a, a, result) 111 | addTrans(a.trans[result], alEpsilon, aa + 1) 112 | addTrans(a.trans[aa], alEpsilon, aa + 1) 113 | addTrans(a.trans[aa + 1], alEpsilon, result) 114 | result = aa + 1 115 | of rePlus: 116 | # plus node 117 | # constructed as M M* would be: 118 | result = auxRegExprToNFA(catExpr(r.a, starExpr(r.a)), a, result) 119 | of reOpt: 120 | # option node 121 | # constructed as M | eps would be: 122 | result = auxRegExprToNFA(altExpr(r.a, epsExpr()), a, result) 123 | of reAlt: 124 | # (|) node 125 | addTrans(a.trans[result], alEpsilon, result + 1) 126 | inc(result) 127 | let oldState = result 128 | let aa = auxRegExprToNFA(r.a, a, result) 129 | let bb = auxRegExprToNFA(r.b, a, aa + 1) 130 | 
addTrans(a.trans[oldState], alEpsilon, aa + 1) 131 | addTrans(a.trans[aa], alEpsilon, bb + 1) 132 | addTrans(a.trans[bb], alEpsilon, bb + 1) 133 | result = bb + 1 134 | of reCapture, reCaptureEnd: 135 | a.captures = max(a.captures, int(r.c)) 136 | addTrans(a.trans[result], Alphabet(kind: reCapture, val: r.c), result+1) 137 | inc(result) 138 | result = auxRegExprToNFA(r.a, a, result) 139 | addTrans(a.trans[result], Alphabet(kind: reCaptureEnd, val: r.c), result+1) 140 | inc(result) 141 | of reBackref: 142 | a.backrefs = max(a.backrefs, int(r.c)) 143 | addTrans(a.trans[result], Alphabet(kind: reBackref, val: r.c), result + 1) 144 | inc(result) 145 | if r.rule != 0: a.toRules[result] = r.rule 146 | 147 | proc regExprToNFA*(r: PRegExpr; a: var NFA) = 148 | initNFA(a) 149 | a.stateCount = auxRegExprToNFA(r, a, 0) 150 | 151 | proc allTransitions*(a: DFA; source, dest: TLabel): (seq[Alphabet], set[char]) = 152 | result[0] = @[] 153 | if a.trans[source].len > 0: 154 | result[1] = {} 155 | var card = 0 156 | var lastChar = -1 157 | for x in a.trans[source]: 158 | if x.dest == dest: 159 | if x.cond.kind == reChar: 160 | inc card 161 | if lastChar < 0: lastChar = int x.cond.val 162 | result[1].incl x.cond.val 163 | else: 164 | result[0].add x.cond 165 | if card == 1: 166 | result[1] = {} 167 | result[0].add Alphabet(kind: reChar, val: char lastChar) 168 | 169 | iterator allDests*(a: DFA; source: TLabel): TLabel = 170 | if a.trans[source].len > 0: 171 | # use a set to eliminate duplicates: 172 | var dests: TLabelSet 173 | for x in a.trans[source]: dests.incl x.dest 174 | for d in dests: yield d 175 | 176 | proc getRule*(a: DFA; s: TLabel): int = a.toRules[s] 177 | 178 | proc closure(a: NFA; S: TLabelSet): TLabelSet = 179 | var res: TLabelSet 180 | result = S 181 | while true: 182 | res = result 183 | for L in countup(0, a.stateCount): 184 | if L in res: 185 | if a.trans[L].len > 0 and a.trans[L][0].cond.kind == reEps: 186 | result = result + a.trans[L][0].dest 187 | if res == result: break 188 | 189 | proc getDest(a: seq[NFA_Edge]; c: Alphabet): TLabelSet = 190 | if a.len == 0: return 191 | for t in a: 192 | if t.cond.kind == c.kind and t.cond.val == c.val: return t.dest 193 | 194 | proc getDest(a: seq[DFA_Edge]; c: Alphabet): TLabel = 195 | if a.len == 0: return 196 | for t in a: 197 | if t.cond.kind == c.kind and t.cond.val == c.val: return t.dest 198 | 199 | proc getDFAedge(a: NFA; d: TLabelSet; c: Alphabet): TLabelSet = 200 | var tmp: TLabelSet = {} 201 | for L in countup(0, a.stateCount): 202 | if L in d: 203 | tmp = tmp + getDest(a.trans[L], c) 204 | result = closure(a, tmp) 205 | 206 | proc searchInStates(states: openarray[TLabelSet]; p: int; e: TLabelSet): int = 207 | # returns -1 if not found 208 | for i in countup(0, p): 209 | if states[i] == e: return i 210 | result = -1 211 | 212 | proc fullAlphabet(captures, backrefs: int): seq[Alphabet] = 213 | result = @[] 214 | var c: Alphabet 215 | c.kind = reChar 216 | for x in '\0'..'\255': 217 | c.val = x 218 | result.add c 219 | c.kind = reBackref 220 | for x in 1..backrefs: 221 | c.val = char(x) 222 | result.add c 223 | for x in 1..captures: 224 | c.val = char(x) 225 | c.kind = reCapture 226 | result.add c 227 | c.kind = reCaptureEnd 228 | result.add c 229 | c.val = '\0' 230 | c.kind = reBegin 231 | result.add c 232 | c.kind = reEnd 233 | result.add c 234 | c.kind = reWordBoundary 235 | result.add c 236 | c.kind = reWordBoundaryNot 237 | result.add c 238 | 239 | proc fullAlphabet*(a: NFA): seq[Alphabet] = fullAlphabet(a.captures, a.backrefs) 240 
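# Overview (illustrative sketch, not executed anywhere in this module): the
# procs in this module are used in this order by vm.nim's `re` and by lexim's
# `match`, with `findMacro` standing for any MacroLookupProc (e.g. one that
# simply returns nil):
#
#   let rex = parseRegExpr("[0-9]+", findMacro, {reNoCaptures, reNoBackrefs})
#   rex.rule = 1                   # rule 0 means "no match"
#   var n: NFA
#   regExprToNFA(rex, n)
#   let alph = fullAlphabet(n)     # all chars plus capture/backref/anchor pseudo-symbols
#   var d, o: DFA
#   NFA_to_DFA(n, d, alph)
#   optimizeDFA(d, o, alph)        # o is the minimal DFA handed to the code generator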
| 241 | proc NFA_to_DFA*(a: NFA; b: var DFA; fullAlphabet: seq[Alphabet]) = 242 | # Look into 'Modern compiler implementation in Java' for reference of 243 | # this algorithm. 244 | var 245 | states: seq[TLabelSet] = @[] 246 | states.add({}) 247 | states.add closure(a, {0.TLabel}) # 0 is the start state 248 | var p = 1 249 | var j = 0 250 | while j <= p: 251 | for c in fullAlphabet: 252 | let e = getDFAedge(a, states[j], c) 253 | let i = searchInStates(states, p, e) 254 | if i >= 0: 255 | addTrans(b.trans[j], c, i) 256 | else: 257 | inc(p) 258 | assert p == states.len 259 | states.add e 260 | addTrans(b.trans[j], c, p) 261 | inc(j) 262 | for d in countup(low(TLabel), j - 1): 263 | var minRule = high(int) 264 | for i in countup(low(TLabel), high(TLabel)): 265 | if i in states[d]: 266 | if minRule > a.toRules[i] and a.toRules[i] != 0: 267 | minRule = a.toRules[i] 268 | if minRule == high(int): 269 | b.toRules[d] = 0 270 | else: 271 | b.toRules[d] = minRule 272 | if minRule > b.ruleCount: b.ruleCount = minRule 273 | b.stateCount = j - 1 274 | b.startState = 1 # for some reason this is always 1 275 | b.captures = a.captures 276 | b.backrefs = a.backrefs 277 | 278 | proc getPreds(a: DFA; s: TLabelSet; c: Alphabet): TLabelSet = 279 | # computes the set of predecessors for the set s (under the character c) 280 | result = {} 281 | let k = c.kind 282 | let v = c.val 283 | for i in countup(1, a.stateCount): 284 | for t in a.trans[i]: 285 | if t.cond.kind == k and t.cond.val == v and t.dest in s: 286 | incl(result, i) 287 | 288 | proc card(s: TLabelSet; maxState: int): int = 289 | result = 0 290 | for i in countup(1, maxState): 291 | if i in s: inc(result) 292 | 293 | proc choose(s: TLabelSet; maxState: int): TLabel = 294 | # choose an arbitrary element from s 295 | assert(s != {}) 296 | for i in countup(1, maxState): 297 | if i in s: 298 | return i 299 | result = 0 # invalid state 300 | 301 | proc optimizeDFA*(a: DFA; b: var DFA; fullAlphabet: seq[Alphabet]) = 302 | # Optimizes the DFA a to produce a minimal DFA. 303 | # We use Hopcroft's algorithm; see the paper coming with this source. 304 | # We have different types of nodes: there is a one to one correspondence 305 | # between type and matching rule. 
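  # A tiny illustrative refinement step (the numbers are made up): with the
  # partition p = {1,2,3}, {4,5} and predecessor set I = {2,3,4} for the current
  # splitter, both blocks intersect I without being contained in it, so they are
  # split into {2,3} / {1} and {4} / {5}; a block already on the worklist is
  # replaced by both halves, otherwise only the smaller half is queued.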
306 | b.captures = a.captures 307 | b.backrefs = a.backrefs 308 | # p[0], w[0] are unused 309 | # assign each state to a partition and to the worklist: 310 | # w := {F, S-F}; p := {F, S-F} 311 | var w = newSeq[TLabelSet](a.ruleCount+1) 312 | var p = newSeq[TLabelSet](a.ruleCount+1) 313 | for d in countup(1, a.stateCount): 314 | incl(w[a.toRules[d]], d) 315 | incl(p[a.toRules[d]], d) 316 | while w.len > 0: 317 | let s = w.pop 318 | for c in fullAlphabet: 319 | let I = getPreds(a, s, c) 320 | if I == {}: 321 | continue # speed things up 322 | for j in countdown(p.len - 1, 0): 323 | let R = p[j] 324 | if (R * I != {}) and not (R <= I): 325 | # partition R into x, y 326 | let x = R * I 327 | let y = R - x # replace R by x and y in P: 328 | p[j] = x 329 | p.add y 330 | let findRes = searchInStates(w, w.len - 1, R) 331 | if findRes >= 0: 332 | # R is elem of W, so replace R by x, y 333 | w[findRes] = x 334 | w.add y 335 | else: 336 | if card(x, a.stateCount) <= card(y, a.stateCount): # add y to W: 337 | w.add x 338 | else: 339 | w.add y 340 | b.stateCount = p.len # new states 341 | b.ruleCount = a.ruleCount # rule count stays the same 342 | for j in countup(0, p.len - 1): 343 | if p[j] != {}: 344 | let repr = choose(p[j], a.stateCount) # choose a representant of the set 345 | if a.startState in p[j]: b.startState = j + 1 346 | b.toRules[j + 1] = a.toRules[repr] 347 | for c in fullAlphabet: 348 | let dest = a.trans[repr].getDest(c) 349 | if dest != 0: 350 | # test to speed things up 351 | for k in countup(0, p.len - 1): 352 | if dest in p[k]: 353 | addTrans b.trans[j + 1], c, k + 1 354 | break 355 | -------------------------------------------------------------------------------- /regexprs.nim: -------------------------------------------------------------------------------- 1 | # 2 | # 3 | # Lexim - The Lexer Generator for Nim 4 | # (c) Copyright 2015 Andreas Rumpf 5 | # 6 | # See the file "copying.txt", included in this 7 | # distribution, for details about the copyright. 8 | # 9 | 10 | # This module implements a parser for regular expressions. 11 | 12 | import strutils 13 | 14 | type 15 | RegexKind* = enum ## the regex AST's kind 16 | reEps, ## epsilon node 17 | reChar, ## character node 18 | reStr, ## string node 19 | reCClass, ## character class node 20 | reStar, ## star node 21 | rePlus, ## plus node 22 | reOpt, ## option node 23 | reCat, ## concatenation node 24 | reAlt, ## alternatives node (|) 25 | reCapture, ## (capture) 26 | reCaptureEnd, ## not used by regex, but by NFA 27 | reBackref, ## \\backref 28 | reBegin, ## \\A 29 | reEnd, ## \\Z 30 | reWordBoundary, ## \\b 31 | reWordBoundaryNot ## \\B 32 | 33 | PRegExpr* = ref TRegExpr 34 | TRegExpr* = object 35 | kind*: RegexKind 36 | a*, b*: PRegExpr # some nodes have two successors 37 | c*: char 38 | s*: string 39 | cc*: ref set[char] 40 | rule*: int # if >= 0 it is a final state; 41 | # then it is the rule that was matched 42 | 43 | RegexError* = object of ValueError 44 | RegexFlag* = enum ## how regexes are parsed 45 | reExtended, ## extended syntax support 46 | reNoBackrefs, ## always process \\1 as a character literal, 47 | ## not as back reference 48 | reNoCaptures ## () is the same as (?:) 49 | 50 | MacroLookupProc* = proc (macroname: string): PRegExpr {.closure.} ## \ 51 | ## lookup proc that expands {macros}. 
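    ## For example (illustrative only; `myMacros` is not part of this module):
    ##
    ## .. code-block:: nim
    ##   proc myMacros(name: string): PRegExpr =
    ##     # names arrive upper-cased and with '_' stripped (see parseIdent)
    ##     if name == "DIGIT": result = cclassExpr({'0'..'9'})
    ##   let r = parseRegExpr("{digit}+", myMacros)
    ##   doAssert r.kind == rePlus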
52 | 53 | ReCtx = object 54 | pos: int 55 | flags: set[RegexFlag] 56 | captures: int # count the captures to give them an index 57 | findMacro: MacroLookupProc 58 | 59 | const 60 | wordChars* = {'A'..'Z', 'a'..'z', '0'..'9', '_', '\128', '\255'} 61 | whitespace* = {'\1'..'\32'} 62 | digits* = {'0'..'9'} 63 | 64 | proc newExpr(kind: RegexKind): PRegExpr = 65 | new(result) 66 | result.kind = kind 67 | 68 | proc epsExpr*(): PRegExpr = 69 | result = newExpr(reEps) 70 | 71 | proc charExpr*(c: char): PRegExpr = 72 | result = newExpr(reChar) 73 | result.c = c 74 | 75 | proc backrefExpr*(x: int): PRegExpr = 76 | result = newExpr(reBackref) 77 | result.c = char x 78 | 79 | proc strExpr*(str: string): PRegExpr = 80 | if len(str) == 1: 81 | result = charExpr(str[0]) 82 | else: 83 | result = newExpr(reStr) 84 | result.s = str 85 | 86 | proc cclassExpr*(charset: set[char]): PRegExpr = 87 | result = newExpr(reCClass) 88 | new(result.cc) 89 | result.cc[] = charset 90 | 91 | proc starExpr*(r: PRegExpr): PRegExpr = 92 | if r.kind == reStar: 93 | result = r 94 | else: 95 | result = newExpr(reStar) 96 | result.a = r 97 | 98 | proc plusExpr*(r: PRegExpr): PRegExpr = 99 | result = newExpr(rePlus) 100 | result.a = r 101 | 102 | proc optExpr*(r: PRegExpr): PRegExpr = 103 | result = newExpr(reOpt) 104 | result.a = r 105 | 106 | proc catExpr*(a, b: PRegExpr): PRegExpr = 107 | result = newExpr(reCat) 108 | result.a = a 109 | result.b = b 110 | 111 | proc altExpr*(a, b: PRegExpr): PRegExpr = 112 | result = newExpr(reAlt) 113 | result.a = a 114 | result.b = b 115 | 116 | proc altExpr*(a: varargs[PRegExpr]): PRegExpr = 117 | result = altExpr(a[0], a[1]) 118 | for i in 2 ..< a.len: 119 | result = result.altExpr(a[i]) 120 | 121 | proc mnExpr*(r: PRegExpr; m, n: int): PRegExpr = 122 | var ri: PRegExpr 123 | if m > n or n == 0: 124 | result = epsExpr() 125 | else: 126 | # construct r^m: 127 | if m == 0: 128 | ri = epsExpr() 129 | else: 130 | ri = r 131 | for i in countup(2, m): ri = catExpr(ri, r) 132 | result = ri # r{m,n} := r^m 133 | for i in countup(m + 1, n): 134 | if ri.kind == reEps: ri = r 135 | else: ri = catExpr(ri, r) 136 | result = altExpr(result, ri) # r{m,n} := r{m,n} | r^i, 137 | # i=m+1,...,n 138 | 139 | proc newCapture*(a: PRegExpr): PRegExpr = 140 | result = newExpr(reCapture) 141 | result.a = a 142 | 143 | proc getNext(buf: string; c: var ReCtx): char = 144 | if reExtended in c.flags: 145 | while c.pos < buf.len and buf[c.pos] in {' ', '\t'}: inc(c.pos) 146 | result = if c.pos < buf.len: buf[c.pos] else: '\0' 147 | 148 | proc error(msg: string) {.noinline.} = 149 | raise newException(RegexError, msg) 150 | 151 | proc getChar(buf: string; c: var ReCtx; inClass: bool): PRegExpr = 152 | var val, i: int 153 | if reExtended in c.flags and not inClass: 154 | while c.pos < buf.len and buf[c.pos] in {' ', '\t'}: inc(c.pos) 155 | if c.pos < buf.len and buf[c.pos] != '\\': 156 | result = charExpr(buf[c.pos]) 157 | inc(c.pos) 158 | else: 159 | let ch = if c.pos+1 < buf.len: buf[c.pos+1] else: '\0' 160 | case ch 161 | of 'n': 162 | result = altExpr(strExpr("\C\L"), charExpr('\L'), charExpr('\C')) 163 | inc(c.pos, 2) 164 | of 'r': 165 | result = charExpr('\r') 166 | inc(c.pos, 2) 167 | of 'l', 'L': 168 | result = charExpr('\L') 169 | inc(c.pos, 2) 170 | of 't': 171 | result = charExpr('\t') 172 | inc(c.pos, 2) 173 | of 'b': 174 | result = if inClass: charExpr('\b') else: newExpr(reWordBoundary) 175 | inc(c.pos, 2) 176 | of 'B': 177 | result = if inClass: charExpr('\b') else: newExpr(reWordBoundaryNot) 178 | 
inc(c.pos, 2) 179 | of 'e': 180 | result = charExpr('\e') 181 | inc(c.pos, 2) 182 | of 'a', 'A': 183 | result = if inClass: charExpr('\a') else: newExpr(reBegin) 184 | inc(c.pos, 2) 185 | of 'v': 186 | result = charExpr('\v') 187 | inc(c.pos, 2) 188 | of 'f': 189 | result = charExpr('\f') 190 | inc(c.pos, 2) 191 | of 'z', 'Z': 192 | if not inClass: result = newExpr(reEnd) 193 | else: error("\\Z not supported in character class") 194 | inc(c.pos, 2) 195 | of 's': 196 | result = cclassExpr(whitespace) 197 | inc(c.pos, 2) 198 | of 'S': 199 | result = cclassExpr({'\1'..'\255'} - whitespace) 200 | inc(c.pos, 2) 201 | of 'd': 202 | result = cclassExpr(digits) 203 | inc(c.pos, 2) 204 | of 'D': 205 | result = cclassExpr({'\1'..'\255'} - digits) 206 | inc(c.pos, 2) 207 | of 'w': 208 | result = cclassExpr(wordChars) 209 | inc(c.pos, 2) 210 | of 'W': 211 | result = cclassExpr({'\1'..'\255'} - wordChars) 212 | inc(c.pos, 2) 213 | of '0'..'9': 214 | let startsWithZero = ch == '0' 215 | val = ord(ch) - ord('0') 216 | inc(c.pos, 2) 217 | i = 1 218 | while (i <= 4) and c.pos < buf.len and (buf[c.pos] in {'0'..'9'}): 219 | val = val * 10 + ord(buf[c.pos]) - ord('0') 220 | inc(c.pos) 221 | inc(i) 222 | if startsWithZero or reNoBackrefs in c.flags: 223 | result = charExpr(char val) 224 | else: 225 | result = backrefExpr(val) 226 | else: 227 | if ch in {'\0'..'\x1F'}: 228 | error "invalid character #" & toHex(ch.ord, 2) 229 | else: 230 | result = charExpr(ch) 231 | inc(c.pos, 2) 232 | 233 | proc parseStr(buf: string; c: var ReCtx): PRegExpr = 234 | var s = "" 235 | inc(c.pos) # skip " 236 | while c.pos < buf.len and buf[c.pos] != '\"': 237 | if buf[c.pos] in {'\0', '\C', '\L'}: 238 | error "\" expected" 239 | let al = getChar(buf, c,false) 240 | if al.kind == reChar: s.add al.c 241 | else: error "invalid regular expression " & buf 242 | inc(c.pos) # skip " 243 | result = strExpr(s) 244 | 245 | proc parseCClass(buf: string; c: var ReCtx): PRegExpr = 246 | # scan a character class 247 | var 248 | caret: bool 249 | cc: set[char] 250 | inc(c.pos) # skip [ 251 | if c.pos < buf.len and buf[c.pos] == '^': 252 | caret = true 253 | inc(c.pos) 254 | else: 255 | caret = false 256 | while c.pos < buf.len and buf[c.pos] != ']': 257 | if buf[c.pos] in {'\0', '\C', '\L'}: 258 | error "] expected" 259 | let a = getChar(buf, c, true) 260 | if a.kind == reChar: 261 | incl(cc, a.c) 262 | if c.pos < buf.len and buf[c.pos] == '-': 263 | inc(c.pos) 264 | if c.pos < buf.len and buf[c.pos] == ']': 265 | incl(cc, '-') 266 | break 267 | let b = getChar(buf, c, true) 268 | if b.kind == reChar: 269 | cc = cc + {a.c .. 
b.c} 270 | elif b.kind == reCClass: 271 | incl(cc, '-') 272 | cc = cc + b.cc[] 273 | else: 274 | error "invalid regular expression " & buf 275 | elif a.kind == reCClass: 276 | cc = cc + a.cc[] 277 | else: 278 | error "invalid regular expression " & buf 279 | if c.pos < buf.len and buf[c.pos] == ']': inc(c.pos) 280 | else: error "] expected" 281 | if caret: result = cclassExpr({'\1'..'\xFF'} - cc) 282 | else: result = cclassExpr(cc) 283 | 284 | proc parseNum(buf: string; c: var ReCtx): int = 285 | result = 0 286 | if c.pos < buf.len and buf[c.pos] in {'0'..'9'}: 287 | while true: 288 | result = result * 10 + ord(buf[c.pos]) - ord('0') 289 | inc(c.pos) 290 | if c.pos >= buf.len or buf[c.pos] notin {'0'..'9'}: break 291 | else: 292 | error "number expected" 293 | 294 | proc parseIdent(buf: string; c: var ReCtx): string = 295 | result = "" 296 | if c.pos < buf.len and buf[c.pos] in {'a'..'z', 'A'..'Z', '_'}: 297 | while c.pos < buf.len: 298 | case buf[c.pos] 299 | of 'a'..'z', 'A'..'Z', '0'..'9': 300 | result.add toUpperAscii(buf[c.pos]) 301 | inc(c.pos) 302 | of '_': 303 | inc(c.pos) # ignore _ 304 | else: break 305 | else: 306 | error "identifier expected" 307 | 308 | proc parseMacroCall(buf: string; c: var ReCtx): PRegExpr = 309 | let name = parseIdent(buf, c) 310 | result = c.findMacro(name) 311 | if result.isNil: 312 | error "undefined macro: " & name 313 | 314 | proc parseRegExpr*(buf: string; c: var ReCtx): PRegExpr 315 | 316 | proc factor(buf: string; c: var ReCtx): PRegExpr = 317 | case getNext(buf, c) 318 | of '\"': 319 | result = parseStr(buf, c) 320 | of '[': 321 | result = parseCClass(buf, c) 322 | of '.': 323 | inc(c.pos) 324 | result = cclassExpr({'\1'..'\xFF'}) # - {'\L'}) 325 | of '(': 326 | inc(c.pos) # skip ( 327 | var isCapture = reNoCaptures notin c.flags 328 | if c.pos+1 < buf.len and buf[c.pos] == '?' 
and buf[c.pos+1] == ':': 329 | inc c.pos, 2 330 | isCapture = false 331 | result = parseRegExpr(buf, c) 332 | if getNext(buf, c) == ')': inc(c.pos) 333 | else: error ") expected" 334 | if isCapture: 335 | inc c.captures 336 | result = newCapture(result) 337 | result.c = char c.captures 338 | of '\\': 339 | result = getChar(buf, c, false) 340 | of '{': 341 | inc(c.pos) # skip { 342 | while c.pos < buf.len and buf[c.pos] in {' ', '\t'}: inc(c.pos) 343 | result = parseMacroCall(buf, c) 344 | if getNext(buf, c) == '}': inc(c.pos) 345 | else: error "} expected" 346 | of '*', '+', '?': 347 | error "escape " & buf[c.pos] & " with \\" 348 | of '$': 349 | result = newExpr(reEnd) 350 | inc(c.pos) 351 | of '^': 352 | result = newExpr(reBegin) 353 | inc(c.pos) 354 | else: 355 | result = charExpr(if c.pos < buf.len: buf[c.pos] else: '\0') 356 | inc(c.pos) 357 | while true: 358 | case getNext(buf, c) 359 | of '*': 360 | inc(c.pos) 361 | result = starExpr(result) 362 | of '+': 363 | inc(c.pos) 364 | result = plusExpr(result) 365 | of '?': 366 | inc(c.pos) 367 | result = optExpr(result) 368 | of '{': 369 | inc(c.pos) # skip { 370 | if getNext(buf, c) notin {'0'..'9'}: 371 | # a macro, but do not parse it here, but later to 372 | # keep the operator predecence: 373 | while true: # back to { 374 | # a single decrement might not do 375 | # because of skipped whitespace 376 | dec(c.pos) 377 | if buf[c.pos] == '{': break 378 | break 379 | else: 380 | var n: int 381 | let m = parseNum(buf, c) 382 | if getNext(buf, c) == ',': 383 | inc(c.pos) 384 | while c.pos < buf.len and buf[c.pos] in {' ', '\t'}: inc(c.pos) 385 | n = parseNum(buf, c) 386 | else: 387 | n = m 388 | result = mnExpr(result, m, n) 389 | if getNext(buf, c) == '}': inc(c.pos) 390 | else: error "} expected" 391 | else: break 392 | 393 | proc term(buf: string; c: var ReCtx): PRegExpr = 394 | const 395 | termDelim = {'\0', ':', '|', ')'} #,'/' 396 | if getNext(buf, c) notin termDelim: 397 | result = factor(buf, c) 398 | while getNext(buf, c) notin termDelim: 399 | result = catExpr(result, factor(buf, c)) 400 | else: 401 | result = epsExpr() 402 | 403 | proc parseRegExpr(buf: string; c: var ReCtx): PRegExpr = 404 | result = term(buf, c) 405 | while getNext(buf, c) == '|': 406 | inc(c.pos) 407 | result = altExpr(result, term(buf, c)) 408 | 409 | proc parseRegExpr*(reg: string; findMacro: MacroLookupProc; 410 | flags: set[RegexFlag] = {}): PRegExpr = 411 | var c: ReCtx 412 | c.pos = 0 413 | c.flags = flags 414 | c.findMacro = findMacro 415 | c.captures = 0 416 | result = parseRegExpr(reg, c) 417 | 418 | proc containsInvCap(r: PRegExpr; inAlt: bool): bool = 419 | if r != nil: 420 | result = containsInvCap(r.a, inAlt or r.kind == reAlt) or 421 | containsInvCap(r.b, inAlt or r.kind == reAlt) or 422 | r.kind == reCapture and inAlt 423 | 424 | proc containsInvalidCapture*(r: PRegExpr): bool = 425 | ## When the implementation uses a DFA, captures can only be supported in 426 | ## quite a limited way: (abc)|(xyz) cannot be supported. This proc checks for 427 | ## that so a nice error can be generated. 
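  ## For instance (illustrative): `(abc)|(xyz)` is flagged, while nested
  ## captures such as `((a+)b(c))` are accepted:
  ##
  ## .. code-block:: nim
  ##   doAssert containsInvalidCapture(parseRegExpr("(abc)|(xyz)", nil))
  ##   doAssert not containsInvalidCapture(parseRegExpr("((a+)b(c))", nil))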
428 | result = containsInvCap(r, false) 429 | -------------------------------------------------------------------------------- /testa.nim: -------------------------------------------------------------------------------- 1 | 2 | import nfa, regexprs, listing, codegen 3 | 4 | proc initExample2(a: var NFA) = 5 | regexprs.addMacro("IDENT", parseRegExpr("[a-zA-Z_]")) 6 | regexprs.addMacro("E", parseRegExpr("[eE][+-]?[0-9]+")) 7 | let floatPat = parseRegExpr("[0-9]+ (\\.[0-9]+{e}? | { e })") # floating point numbers 8 | let intPat = parseRegExpr("[0-9]+") # integer 9 | let identPat = parseRegExpr("{ident}[0-9 A - Z a-z _]*") 10 | let elseExpr = parseRegExpr("e l s e ") 11 | let elifExpr = parseRegExpr("e l i f ") 12 | floatPat.rule = 3 13 | intPat.rule = 4 14 | identPat.rule = 5 15 | elseExpr.rule = 1 16 | elifExpr.rule = 2 17 | regExprToNFA(altExpr(identPat, intPat, floatPat, elifExpr, elseExpr), a) 18 | 19 | const 20 | asRegex = "([a-zA-Z_][0-9A-Za-z_]*)|([0-9]+)|([0-9]+ (\\.[0-9]+([eE][+-]?[0-9]+)?|[eE][+-]?[0-9]+))|else|elif" 21 | 22 | var n: NFA 23 | var d, o: DFA 24 | 25 | initExample2(n) 26 | NFA_to_DFA(n, d) 27 | 28 | optimizeDFA(d, o) 29 | 30 | when false: 31 | var buffer = newStringOfCap(10_000) 32 | genMatcher(o, buffer) 33 | writeFile("matcher.nim", buffer) 34 | 35 | import vm, vm2 36 | from times import cpuTime 37 | 38 | var bc = vm.Bytecode(code: @[], data: @[]) 39 | vm.genBytecode(o, bc) 40 | echo "code ", bc.code.len, " data: ", bc.data.len 41 | 42 | 43 | var bc2 = vm2.Bytecode(code: @[], data: @[]) 44 | vm2.genBytecode(o, bc2) 45 | echo "code ", bc2.code.len, " data: ", bc2.data.len 46 | 47 | template bench(text, doWork: expr) = 48 | var t0 = cpuTime() 49 | doWork 50 | echo text, " took [s] ", cpuTime() - t0 51 | 52 | import re, strutils 53 | 54 | let thaRe = re(asRegex) 55 | 56 | proc main = 57 | while true: 58 | let inp = readLine(stdin) 59 | if inp.len == 0: break 60 | bench "vm 1": 61 | for i in 1..100_000: 62 | discard vm.execBytecode(bc, inp) 63 | bench "vm 2": 64 | for i in 1..100_000: 65 | discard vm2.execBytecode(bc2, inp) 66 | 67 | bench "re A": 68 | for i in 1..100_000: 69 | discard re.matchLen(inp, thaRe) 70 | 71 | bench "sets": 72 | for i in 1..100_000: 73 | discard strutils.allCharsInSet(inp, {'A'..'Z','a'..'z','0'..'9','_'}) 74 | 75 | echo execBytecode(bc, inp) 76 | echo execBytecode(bc2, inp) 77 | echo re.matchLen(inp, thaRe) 78 | 79 | main() 80 | -------------------------------------------------------------------------------- /tests.nim: -------------------------------------------------------------------------------- 1 | 2 | import vm 3 | 4 | doAssert match("(a b c)", re"\( .* \)") 5 | doAssert match("while", re("while")) 6 | 7 | doAssert "0158787".match(re"\d+") 8 | doAssert "ABC 0232".match(re"\w+\s+\d+") 9 | doAssert "ABC".match(re"\d+ | \w+") 10 | 11 | doAssert matchLen("key", re"\w+") == 3 12 | 13 | var pattern = re"[a-z0-9]+\s*=\s*[a-z0-9]+" 14 | doAssert matchLen("key1= cal9", pattern) == 11 15 | doAssert match("abc", re"\Aabc\Z") 16 | 17 | doAssert(not match("abcdef", re"^abc$")) 18 | 19 | doAssert(not match("aef", re"\A(?:abc|def)\Z")) 20 | doAssert(match("def", re"\A(?:abc|def)\Z")) 21 | doAssert(not match("deffoo", re"\A(?:abc|def)\Z")) 22 | 23 | doAssert(not match("deffoo", re"\b(?:abc|def)\b")) 24 | doAssert(match("def foo", re"\b(?:abc|def)\b")) 25 | 26 | doAssert(matchLen("def foo", re"\b(?:abc|def)\b") == 3) 27 | 28 | doAssert(matchLen("def foo\C\L", re"\bdef\sfoo\n") == 9) 29 | 30 | let complex = re"(\`|\')[^`']*\1" #re"(\w+)|(a)bcxyz" 31 | 32 | 
#echoCode(complex) 33 | if "'haha'" =~ complex: 34 | echo matches 35 | else: 36 | assert false 37 | 38 | #echo matchLen("abc", complex) # == 3 39 | 40 | when true: 41 | let complex2 = re"((a+)b(c)\2)" 42 | #echoCode complex2 43 | if "aaaaabcc" =~ complex2: 44 | echo matches 45 | #for x in matches: echo x 46 | #assert matches[1] == "abc" 47 | else: 48 | assert false 49 | 50 | when false: 51 | if "abc" =~ re"(cba)?.*": 52 | assert matches[0] == nil 53 | else: assert false 54 | 55 | if "abc" =~ re"().*": 56 | assert matches[0] == "" 57 | else: assert false 58 | -------------------------------------------------------------------------------- /todo.txt: -------------------------------------------------------------------------------- 1 | * fail at re construction time when invalid captures are used 2 | * finish 're' compatibility API 3 | -------------------------------------------------------------------------------- /vm.nim: -------------------------------------------------------------------------------- 1 | # 2 | # 3 | # Lexim - The Lexer Generator for Nim 4 | # (c) Copyright 2015 Andreas Rumpf 5 | # 6 | # See the file "copying.txt", included in this 7 | # distribution, for details about the copyright. 8 | # 9 | 10 | ## Translates the DFA into a bytecode and then runs the bytecode. 11 | import nfa, strutils, intsets, regexprs 12 | 13 | type 14 | Instr* = distinct uint32 15 | 16 | Opcode* = enum 17 | opcRet, # return with some literal 18 | opcTestSet, # test current character against bitset in data section 19 | opcTestChar, # test current character against char embedded in instr 20 | opcTJmp, # jump if comparison was true 21 | opcBegin, # \A match 22 | opcEnd, # \Z match 23 | opcWordBound, # \b match 24 | opcCaptureBegin # begin of capture '(' 25 | opcCaptureEnd # end of capture ')' 26 | opcBackref # \\1 match 27 | 28 | Bytecode* = object 29 | code*: seq[Instr] 30 | data*: seq[set[char]] 31 | startAt*, captures*: int 32 | 33 | template opcode*(x: Instr): Opcode = Opcode(x.uint32 and 0xff'u32) 34 | template regBx*(x: Instr): int = (x.uint32 shr 16'u32).int 35 | 36 | proc codeListing(c: Bytecode, result: var string, start=0; last = -1) = 37 | # first iteration: compute all necessary labels: 38 | var jumpTargets = initIntSet() 39 | let last = if last < 0: c.code.len-1 else: min(last, c.code.len-1) 40 | jumpTargets.incl(c.startAt) 41 | for i in start..last: 42 | let x = c.code[i] 43 | if x.opcode == opcTJmp: 44 | jumpTargets.incl(x.regBx) 45 | result.addf("goto L$1:\n", c.startAt) 46 | var i = start 47 | while i <= last: 48 | if i in jumpTargets: result.addf("L$1:\n", i) 49 | let x = c.code[i] 50 | result.add($i) 51 | let opc = opcode(x) 52 | case opc 53 | of opcRet: 54 | result.addf("\t$#\t$#\n", ($opc).substr(3), x.regBx) 55 | of opcTestSet: 56 | result.addf("\t$#\t$#\n", ($opc).substr(3), $c.data[x.regBx]) 57 | of opcTestChar: 58 | result.addf("\t$#\t$#\n", ($opc).substr(3), $chr(x.regBx)) 59 | of opcTJmp: 60 | result.addf("\t$#\tL$#\n", ($opc).substr(3), x.regBx) 61 | of opcCaptureBegin, opcCaptureEnd, opcBackref: 62 | result.addf("\t$#\tC$#\n", ($opc).substr(3), x.regBx) 63 | of opcBegin, opcEnd, opcWordBound: 64 | result.addf("\t$#\n", ($opc).substr(3)) 65 | inc i 66 | 67 | proc echoCode*(c: Bytecode; start=0; last = -1) {.deprecated.} = 68 | var buf = "" 69 | codeListing(c, buf, start, last) 70 | echo buf 71 | 72 | proc genInstr(opc: Opcode; bx: int): Instr = 73 | # `bx` must be signed and in the range [-32767, 32768] 74 | const a = 0 75 | doAssert bx >= -32767 and bx <= 32768 76 | result = 
(opc.uint32 or a.uint32 shl 8'u32 or 77 | bx.uint32 shl 16'u32).Instr 78 | 79 | proc gABx(c: var Bytecode; opc: Opcode; bx: int) = 80 | c.code.add(genInstr(opc, bx)) 81 | 82 | proc genData(c: var Bytecode; data: set[char]): int = 83 | assert '\0' notin data 84 | for i in 0 .. c.data.high: 85 | if c.data[i] == data: return i 86 | result = c.data.len 87 | c.data.add data 88 | 89 | proc genTest(res: var Bytecode; x: Alphabet; dest: int) = 90 | case x.kind 91 | of reChar: 92 | gABx(res, opcTestChar, int(x.val)) 93 | gABx(res, opcTJmp, dest) 94 | of reBegin: 95 | gABx(res, opcBegin, 0) 96 | gABx(res, opcTJmp, dest) 97 | of reEnd: 98 | gABx(res, opcEnd, 0) 99 | gABx(res, opcTJmp, dest) 100 | of reWordBoundary: 101 | gABx(res, opcWordBound, genData(res, wordChars)) 102 | gABx(res, opcTJmp, dest) 103 | of reWordBoundaryNot: 104 | gABx(res, opcWordBound, genData(res, {'\1'..'\255'} - wordChars)) 105 | gABx(res, opcTJmp, dest) 106 | else: discard 107 | 108 | proc genCapture(res: var Bytecode; cs: Alphabet; dest: int) = 109 | case cs.kind 110 | of reCapture: 111 | gABx(res, opcCaptureBegin, cs.val.int-1) 112 | gABx(res, opcTJmp, dest) 113 | inc res.captures 114 | of reCaptureEnd: 115 | gABx(res, opcCaptureEnd, cs.val.int-1) 116 | gABx(res, opcTJmp, dest) 117 | of reBackref: 118 | gABx(res, opcBackref, cs.val.int-1) 119 | gABx(res, opcTJmp, dest) 120 | else: discard 121 | 122 | proc genBytecode*(a: DFA; res: var Bytecode) = 123 | var stateToLabel = newSeq[int](a.stateCount) 124 | 125 | for src in countup(1, a.stateCount): 126 | stateToLabel[src-1] = res.code.len 127 | let rule = getRule(a, src) 128 | for dest in allDests(a, src): 129 | # this implements the rather strange 130 | # "match longest but only sometimes" rule that regexes seem to have: 131 | if rule == 0 or rule == getRule(a, dest): 132 | let (list, cset) = allTransitions(a, src, dest) 133 | for x in list: genCapture(res, x, dest) 134 | if cset != {}: 135 | gABx(res, opcTestSet, genData(res, cset)) 136 | gABx(res, opcTJmp, dest) 137 | for x in list: genTest(res, x, dest) 138 | if stateToLabel[src-1] != res.code.len or rule != 0: 139 | # only generate 'ret' instruction when the state is not empty: 140 | gABx(res, opcRet, rule) 141 | # Fixup the TJmp instructions: 142 | for i in 0 .. 
res.code.high: 143 | let instr = res.code[i] 144 | if opcode(instr) == opcTJmp: 145 | res.code[i] = genInstr(opcTJmp, stateToLabel[regBx(instr)-1]) 146 | res.startAt = stateToLabel[a.startState-1] 147 | 148 | type Action* = int #distinct range[1..32_000] 149 | 150 | proc backrefMatch(input: string; sp: int; capture: (int, int)): bool = 151 | var i = capture[0] 152 | var k = sp 153 | while true: 154 | if i > capture[1]: return true 155 | if k >= input.len or input[k] != input[i]: return false 156 | inc k 157 | inc i 158 | 159 | proc execBytecode*(m: Bytecode; input: string; 160 | captures: var seq[(int, int)], 161 | start=0): tuple[a: Action, endPos: int] = 162 | var pc = m.startAt 163 | var sp = start 164 | #var backtrack: seq[(int,int)] 165 | while true: 166 | let instr = m.code[pc] 167 | let opc = instr.opcode 168 | let arg = instr.regBx 169 | if opc == opcTestSet: 170 | # we *know* the next instruction is a TJmp: 171 | let next = m.code[pc+1] 172 | assert next.opcode == opcTJmp 173 | if sp < input.len and input[sp] in m.data[arg]: 174 | pc = next.regBx 175 | inc sp 176 | else: 177 | inc pc, 2 178 | elif opc == opcTestChar: 179 | # we *know* the next instruction is a TJmp: 180 | let next = m.code[pc+1] 181 | assert next.opcode == opcTJmp 182 | if sp < input.len and input[sp] == chr(arg): 183 | pc = next.regBx 184 | inc sp 185 | else: 186 | inc pc, 2 187 | elif opc == opcRet: 188 | #if arg == 0 and not backtrack.isNil and backtrack.len > 0: 189 | # let (newPc, newSp) = backtrack.pop() 190 | # pc = newPc 191 | # sp = newSp 192 | #else: 193 | return (Action(arg), sp) 194 | else: 195 | case opc 196 | of opcBegin: 197 | if sp == start: 198 | pc = m.code[pc+1].regBx 199 | else: 200 | inc pc, 2 201 | of opcEnd: 202 | if sp >= input.len: 203 | pc = m.code[pc+1].regBx 204 | else: 205 | inc pc, 2 206 | of opcWordBound: 207 | if sp >= input.len or sp == start or input[sp] notin m.data[arg]: 208 | pc = m.code[pc+1].regBx 209 | else: 210 | inc pc, 2 211 | of opcCaptureBegin: 212 | if captures.len <= arg: setLen(captures, arg+1) 213 | captures[arg][0] = sp 214 | captures[arg][1] = -2 # mark as still open 215 | #if backtrack.isNil: backtrack = @[] 216 | #backtrack.add((pc+2, sp)) 217 | pc = m.code[pc+1].regBx 218 | of opcCaptureEnd: 219 | captures[arg][1] = sp-1 220 | pc = m.code[pc+1].regBx 221 | of opcBackref: 222 | if arg < captures.len and backrefMatch(input, sp, captures[arg]): 223 | pc = m.code[pc+1].regBx 224 | inc sp, captures[arg][1] - captures[arg][0] + 1 225 | else: 226 | inc pc, 2 227 | else: assert false 228 | 229 | proc findMacro(s: string): PRegExpr = nil 230 | 231 | proc re*(regex: string; flags: set[RegexFlag] = {reExtended}): Bytecode = 232 | let r = parseRegExpr(regex, findMacro, flags) 233 | r.rule = 1 234 | var n: NFA 235 | regExprToNFA(r, n) 236 | let alph = fullAlphabet(n) 237 | 238 | var d, o: DFA 239 | NFA_to_DFA(n, d, alph) 240 | optimizeDFA(d, o, alph) 241 | 242 | result.code = @[] 243 | result.data = @[] 244 | genBytecode(o, result) 245 | 246 | proc matchLen*(input: string; r: Bytecode; 247 | captures: var seq[(int, int)], start=0): int = 248 | let (isMatch, len) = execBytecode(r, input, captures, start) 249 | result = if isMatch <= 0: -1 else: len 250 | 251 | proc match*(input: string; r: Bytecode; 252 | captures: var seq[(int, int)]; start=0): bool = 253 | let (isMatch, len) = execBytecode(r, input, captures, start) 254 | result = isMatch > 0 255 | 256 | proc matchLen*(input: string; r: Bytecode, start=0): int = 257 | var captures: seq[(int, int)] = @[] 258 | let (isMatch, 
len) = execBytecode(r, input, captures, start) 259 | result = if isMatch <= 0: -1 else: len 260 | 261 | proc match*(input: string; r: Bytecode; start=0): bool = 262 | var captures: seq[(int, int)] = @[] 263 | let (isMatch, len) = execBytecode(r, input, captures, start) 264 | result = isMatch > 0 265 | 266 | template `=~`*(s: string, pattern: Bytecode): untyped = 267 | ## This calls ``match`` with an implicit declared ``matches`` seq that 268 | ## can be used in the scope of the ``=~`` call: 269 | ## 270 | ## .. code-block:: nim 271 | ## 272 | ## if line =~ re"\s*(\w+)\s*\=\s*(\w+)": 273 | ## # matches a key=value pair: 274 | ## echo("Key: ", matches[0]) 275 | ## echo("Value: ", matches[1]) 276 | ## elif line =~ re"\s*(\#.*)": 277 | ## # matches a comment 278 | ## # note that the implicit ``matches`` array is different from the 279 | ## # ``matches`` array of the first branch 280 | ## echo("comment: ", matches[0]) 281 | ## else: 282 | ## echo("syntax error") 283 | ## 284 | var captures: seq[(int, int)] = @[] 285 | when not declaredInScope(matches): 286 | var matches {.inject.}: seq[string] = @[] 287 | let m = match(s, pattern, captures) 288 | for i in 0..high(captures): 289 | matches.add substr(s, captures[i][0], captures[i][1]) 290 | m 291 | --------------------------------------------------------------------------------
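A short usage sketch of the bytecode engine in vm.nim, mirroring tests.nim and
the `=~` documentation above (the patterns and the expected match length are
taken from there); note that `re` defaults to the extended syntax and that,
unlike the `lexim.match` macro, this path needs no 'lexe' helper executable:

import vm

doAssert matchLen("key1= cal9", re"[a-z0-9]+\s*=\s*[a-z0-9]+") == 11
doAssert "ABC 0232".match(re"\w+\s+\d+")

if "key = value" =~ re"\s*(\w+)\s*\=\s*(\w+)":
  echo "Key: ", matches[0], " Value: ", matches[1]
else:
  echo "no match"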