├── .gitignore ├── README.md ├── bench.nim ├── benchdata.txt ├── copying.txt ├── ex1.nim ├── lexe.nim ├── lexim.nim ├── lexim.nims ├── listing.nim ├── nfa.nim ├── regexprs.nim ├── testa.nim ├── tests.nim ├── todo.txt └── vm.nim /.gitignore: -------------------------------------------------------------------------------- 1 | nimcache/ 2 | *.exe 3 | lexe.input 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lexim 2 | Lexer generation and regex implementation for Nim. 3 | 4 | # example 5 | Lexim requires a 'lexe' helper exe that is used by 'lexim'. 6 | Compile via ``nim c lexe`` and then you can run the example 7 | via ``nim c ex1.nim``. 8 | -------------------------------------------------------------------------------- /bench.nim: -------------------------------------------------------------------------------- 1 | 2 | import nfa, regexprs, listing 3 | from strutils import find 4 | 5 | const 6 | asRegex = ".*[Pp]leasuring" 7 | 8 | import vm 9 | from times import cpuTime 10 | 11 | var bc = vm.re(asRegex) 12 | echo "code ", bc.code.len, " data: ", bc.data.len 13 | 14 | template bench(text, doWork: untyped) = 15 | var t0 = cpuTime() 16 | doWork 17 | echo text, " took [s] ", cpuTime() - t0 18 | 19 | import re, strutils 20 | 21 | let thaRe = re.re("[Pp]leasuring", {reDotAll, reStudy}) 22 | 23 | import lexim 24 | proc lex(input: string): int = 25 | var pos = 0 26 | while pos < input.len: 27 | lexim.match input, pos: 28 | of r"[Pp]leasuring": 29 | return pos 30 | of r".": 31 | discard 32 | return -1 33 | 34 | import std/strscans 35 | proc scan(input: string): int = 36 | var pos = 0 37 | while pos < input.len: 38 | if scanp(input, pos, {'P', 'p'}, "leasuring"): 39 | return pos 40 | inc pos 41 | return -1 42 | 43 | import npeg 44 | proc pegs(input: string): int = 45 | let p = peg search: 46 | search <- @({'P', 'p'} * "leasuring") 47 | let r = p.match(input) 48 | return if r.ok: r.matchLen else: -1 49 | 50 | proc main = 51 | let inp = readFile("benchdata.txt") 52 | when true: 53 | bench "vm 1": 54 | for i in 1..100: 55 | discard vm.matchLen(inp, bc) 56 | 57 | bench "re A": 58 | for i in 1..100: 59 | discard re.find(inp, thaRe) 60 | 61 | bench "find": 62 | for i in 1..100: 63 | discard find(inp, "pleasuring") 64 | 65 | bench "lexer": 66 | for i in 1..100: 67 | discard lex(inp) 68 | 69 | bench "scanp": 70 | for i in 1..100: 71 | discard scan(inp) 72 | 73 | bench "npeg": 74 | for i in 1..100: 75 | discard pegs(inp) 76 | 77 | echo matchLen(inp, bc) 78 | echo re.find(inp, thaRe)+len"pleasuring" 79 | echo find(inp, "pleasuring")+len"pleasuring" 80 | echo lex(inp) # +len"pleasuring" 81 | echo scan(inp) # +len"pleasuring" 82 | echo pegs(inp) # +len"pleasuring" 83 | 84 | main() 85 | -------------------------------------------------------------------------------- /copying.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Andreas Rumpf 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 
| 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /ex1.nim: -------------------------------------------------------------------------------- 1 | discard """ 2 | output: ''' 3 | an identifier the## 4 | something else ## 5 | an integer 0909## 6 | something else ## 7 | an ELSE 8 | something else ## 9 | an identifier input## 10 | something else ## 11 | an ELIF 12 | something else ## 13 | an identifier elseo## 14 | something else ## 15 | an END''' 16 | """ 17 | 18 | import lexim 19 | 20 | proc main = 21 | var input = "the 0909 else input elif elseo end" 22 | var pos = 0 23 | while pos < input.len: 24 | let oldPos = pos 25 | match input, pos: 26 | of r"\d+": echo "an integer ", input.substr(oldPos, pos-1), "##" 27 | of "else": echo "an ELSE" 28 | of "elif": echo "an ELIF" 29 | of "end": echo "an END" 30 | of r"[a-zA-Z_]\w+": echo "an identifier ", input.substr(oldPos, pos-1), "##" 31 | of r".": echo "something else ", input.substr(oldPos, pos-1), "##" 32 | 33 | main() 34 | -------------------------------------------------------------------------------- /lexe.nim: -------------------------------------------------------------------------------- 1 | # 2 | # 3 | # Lexim - The Lexer Generator for Nim 4 | # (c) Copyright 2015 Andreas Rumpf 5 | # 6 | # See the file "copying.txt", included in this 7 | # distribution, for details about the copyright. 8 | # 9 | 10 | import 11 | regexprs, nfa, marshal 12 | 13 | # The part that implements lexer generation as an exe to speed up 14 | # this process. 
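# Standalone illustration (not part of lexe.nim): the protocol between the
# `match` macro in lexim.nim and this helper is plain `marshal` strings --
# the macro serialises one pattern string per `of` branch and reads a
# marshalled, optimized DFA back from the helper's output:
#
#   import marshal
#   let patterns = @[r"\d+", "else", "elif", "end", r"[a-zA-Z_]\w+", r"."]
#   let wire = $$patterns                       # what lexim.nim writes to lexe.input
#   doAssert to[seq[string]](wire) == patterns  # what the helper's main() decodes again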
15 | proc findMacro(name: string): PRegExpr = nil 16 | 17 | proc main(input: string): string = 18 | let inp = marshal.to[seq[string]](input) 19 | 20 | var bigRe: PRegExpr = nil 21 | for i in 0..= MaxChar: break 36 | inc c1 37 | 38 | proc charAt(s: string; i: int): char {.inline.} = 39 | result = if i < s.len: s[i] else: '\0' 40 | 41 | proc currChar(s, i: NimNode; isCString: bool): NimNode {.compileTime.} = 42 | result = 43 | if isCString: 44 | newTree(nnkBracketExpr, s, i) 45 | else: 46 | newCall(bindSym"charAt", s, i) 47 | 48 | proc getCmp(s, i: NimNode; x: set[char]; isCString: bool): NimNode {.compileTime.} = 49 | result = newCall(bindSym"contains", charSetLit(x), currChar(s, i, isCString)) 50 | 51 | proc getSpecial(s, i: NimNode; x: Alphabet; isCString: bool): NimNode {.compileTime.} = 52 | result = newCall(bindSym"==", currChar(s, i, isCString), newLit(x.val)) 53 | 54 | proc newVarStmt(name, typ, value: NimNode): NimNode {.compiletime.} = 55 | return newTree(nnkVarSection, newTree(nnkIdentDefs, name, typ, value)) 56 | 57 | proc nextState(i, state: NimNode; dest: int): NimNode {.compileTime.} = 58 | newStmtList(newCall(bindSym"inc", i), newAssignment(state, newLit(dest))) 59 | 60 | proc genMatcher(a: DFA; s, i, bodies: NimNode; isCString: bool): NimNode {.compileTime.} = 61 | let state = genSym(nskVar, "state") 62 | result = newStmtList() 63 | result.add newVarStmt(newTree(nnkPragmaExpr, state, 64 | newTree(nnkPragma, ident"goto")), 65 | newTree(nnkBracketExpr, bindSym"range", 66 | newRange(newLit(1), newLit(a.stateCount))), 67 | newLit(a.startState)) 68 | var caseStmt = newNimNode(nnkCaseStmt) 69 | caseStmt.add state 70 | result.add newTree(nnkWhileStmt, bindSym"true", caseStmt) 71 | for src in countup(1, a.stateCount): 72 | let rule = getRule(a, src) 73 | var ifStmt = newNimNode(nnkIfStmt) 74 | for dest in allDests(a, src): 75 | let (others, cs) = allTransitions(a, src, dest) 76 | if cs != {}: 77 | ifStmt.add newTree(nnkElifBranch, 78 | getCmp(s, i, cs, isCString), 79 | nextState(i, state, dest)) 80 | for ot in others: 81 | if ot.kind == reChar: 82 | ifStmt.add newTree(nnkElifBranch, 83 | getSpecial(s, i, ot, isCString), 84 | nextState(i, state, dest)) 85 | else: 86 | doAssert false, "not supported " & $ot.kind 87 | let actions = if rule >= 1: 88 | newStmtList(bodies[rule-1][1], newTree(nnkBreakStmt, 89 | newNimNode(nnkEmpty))) 90 | else: 91 | newTree(nnkBreakStmt, newNimNode(nnkEmpty)) 92 | if ifStmt.len == 0: 93 | caseStmt.add newTree(nnkOfBranch, newLit(src), actions) 94 | else: 95 | ifStmt.add newTree(nnkElse, actions) 96 | caseStmt.add newTree(nnkOfBranch, newLit(src), ifStmt) 97 | 98 | template `/.`(x: string): string = 99 | (when defined(posix): "./" & x else: x) 100 | 101 | macro match*(s: cstring|string; pos: int; sections: varargs[untyped]): untyped = 102 | let isCString = s.getType.typeKind == ntyCString 103 | when defined(leximSkipLexe): 104 | var bigRe: PRegExpr = nil 105 | var rule = 1 106 | for sec in sections.children: 107 | expectKind sec, nnkOfBranch 108 | expectLen sec, 2 109 | if sec[0].kind in nnkStrLit..nnkTripleStrLit: 110 | let rex = parseRegExpr(sec[0].strVal, findMacro, 111 | {reNoCaptures, reNoBackrefs}) 112 | rex.rule = rule 113 | if bigRe.isNil: bigRe = rex 114 | else: bigRe = altExpr(bigRe, rex) 115 | else: 116 | error("Expected a node of kind nnkStrLit, got " & $sec[0].kind) 117 | inc rule 118 | 119 | var n: NFA 120 | var d, o: DFA 121 | regExprToNFA(bigRe, n) 122 | let alph = fullAlphabet(n) 123 | NFA_to_DFA(n, d, alph) 124 | optimizeDFA(d, o, alph) 
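    # at this point `o` is the minimised DFA; genMatcher (above) expands it into
    # a `while true: case state` loop over a {.goto.} state variable, inlining
    # the `of`-branch bodies for the accepting rules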
125 | result = genMatcher(o, s, pos, sections, isCString) 126 | else: 127 | # use 'lexe.exe' helper program in order to speedup lexer generation 128 | var res: seq[string] = @[] 129 | for sec in sections.children: 130 | expectKind sec, nnkOfBranch 131 | expectLen sec, 2 132 | if sec[0].kind in nnkStrLit..nnkTripleStrLit: 133 | res.add sec[0].strVal 134 | else: 135 | error("Expected a node of kind nnkStrLit, got " & $sec[0].kind) 136 | 137 | let data = $$res 138 | writeFile("lexe.input", data) 139 | let o = to[DFA](staticExec(/."lexe", input="", cache=data)) 140 | result = genMatcher(o, s, pos, sections, isCString) 141 | echo repr result 142 | 143 | when isMainModule: # defined(testing): 144 | var input = "the 0909 else input elif elseo end" 145 | let asc = input.cstring 146 | var pos = 0 147 | while pos < input.len: 148 | let oldPos = pos 149 | match input, pos: 150 | of r"\d+": echo "an integer ", input.substr(oldPos, pos-1), "##" 151 | of "else": echo "an ELSE" 152 | of "elif": echo "an ELIF" 153 | of "end": echo "an END" 154 | of r"[a-zA-Z_]\w+": echo "an identifier ", input.substr(oldPos, pos-1), "##" 155 | of r".": echo "something else ", input.substr(oldPos, pos-1), "##" 156 | -------------------------------------------------------------------------------- /lexim.nims: -------------------------------------------------------------------------------- 1 | 2 | version = "1.0" 3 | author = "Andreas Rumpf" 4 | description = "Lexer generation and regex implementation for Nim." 5 | license = "MIT" 6 | 7 | requires "nim >= 0.11.3" 8 | 9 | import ospaths 10 | 11 | proc buildHelper(name: string) = 12 | if not fileExists(name.toExe): 13 | exec "nim c " & name 14 | 15 | task build, "builds Lexim and an example": 16 | buildHelper "lexe" 17 | exec "nim c ex1" 18 | setCommand "nop" 19 | 20 | task tests, "test regular expressions": 21 | exec "nim c -r tests" 22 | setCommand "nop" 23 | -------------------------------------------------------------------------------- /listing.nim: -------------------------------------------------------------------------------- 1 | # 2 | # 3 | # Lexim - The Lexer Generator for Nim 4 | # (c) Copyright 2015 Andreas Rumpf 5 | # 6 | # See the file "copying.txt", included in this 7 | # distribution, for details about the copyright. 
8 | # 9 | 10 | 11 | ## this modules contains utility functions for list generating routines: 12 | 13 | import strutils 14 | 15 | proc nchars*(cc: set[char]): int = 16 | result = 0 17 | for c in countup('\0', '\xFF'): 18 | if c in cc: inc(result) 19 | 20 | proc charStr*(c: char; reserved: set[char]): string = 21 | case c 22 | of '\b': 23 | result = "\\b" 24 | of '\t': 25 | result = "\\t" 26 | of '\C': 27 | result = "\\r" 28 | of '\L': 29 | result = "\\l" 30 | of '\v': 31 | result = "\\v" 32 | of '\f': 33 | result = "\\f" 34 | of '\e': 35 | result = "\\e" 36 | of '\a': 37 | result = "\\a" 38 | of '\\': 39 | result = "\\\\" 40 | else: 41 | if c < ' ': 42 | result = '\\' & $ord(c) 43 | elif c in reserved: 44 | result = '\\' & $c 45 | else: 46 | result = $c 47 | 48 | proc singleQuoteStr*(str: string): string = 49 | result = "'" 50 | for c in str: result.add charStr(c, {'\''}) 51 | result.add '\'' 52 | 53 | proc doubleQuoteStr*(str: string): string = 54 | result = "\"" 55 | for c in str: result.add charStr(c, {'\"'}) 56 | result.add '\"' 57 | 58 | proc charSetStrAux(cc: set[char]): string = 59 | const 60 | reserved = {'^', '-', ']'} 61 | MaxChar = '\xFF' 62 | result = "" 63 | var c1 = '\0' 64 | while true: 65 | if c1 in cc: 66 | var c2 = c1 67 | while (c2 < MaxChar) and (succ(c2) in cc): c2 = succ(c2) 68 | if c1 == c2: 69 | result.add charStr(c1, reserved) 70 | elif c2 == succ(c1): 71 | result.add charStr(c1, reserved) & charStr(c2, reserved) 72 | else: 73 | result.add charStr(c1, reserved) & '-' & charStr(c2, reserved) 74 | c1 = c2 75 | if c1 >= MaxChar: break 76 | inc(c1) 77 | 78 | proc charSetStr*(cc: set[char]): string = 79 | if cc == {'\x01'..'\xFF'} - {'\L'}: 80 | result = "." 81 | else: 82 | if nchars(cc) > 128: 83 | result = "[^" & charSetStrAux({'\0'..'\xFF'} - cc) & ']' 84 | else: 85 | result = '[' & charSetStrAux(cc) & ']' 86 | 87 | proc charSetOrCharStr*(cc: set[char]): string = 88 | var count = 0 89 | var c1 = '\0' # to avoid warnings 90 | for c in countup('\0', '\xFF'): 91 | if c in cc: 92 | c1 = c 93 | inc(count) 94 | if count > 1: 95 | result = charSetStr(cc) 96 | elif count == 1: 97 | result = charStr(c1, {'.'}) # was: singleQuoteStr(c1) 98 | else: 99 | result = "[]" 100 | 101 | -------------------------------------------------------------------------------- /nfa.nim: -------------------------------------------------------------------------------- 1 | # 2 | # 3 | # Lexim - The Lexer Generator for Nim 4 | # (c) Copyright 2015 Andreas Rumpf 5 | # 6 | # See the file "copying.txt", included in this 7 | # distribution, for details about the copyright. 8 | # 9 | 10 | import 11 | regexprs 12 | 13 | const 14 | maxLabel* = 255 15 | 16 | type 17 | Alphabet* = object 18 | kind*: RegexKind 19 | val*: char 20 | 21 | const 22 | alEpsilon* = Alphabet(kind: reEps, val: '\0') 23 | 24 | type 25 | TRuleIndex* = range[0..10_000] 26 | TLabel* = range[0..maxLabel] # 0 is an invalid label number, indicating 27 | # there is no transition 28 | TLabelSet* = set[TLabel] # max. 
size may be bigger in Nim 29 | # transition tables: if label = 0, 30 | # it is the start node 31 | DFA_Edge* = object 32 | cond*: Alphabet 33 | dest*: TLabel 34 | DFA_Trans* = array[TLabel, seq[DFA_Edge]] # transitions for DFA's 35 | # label = 1 is the start node 36 | 37 | NFA_Edge* = object 38 | cond*: Alphabet 39 | dest*: TLabelSet 40 | NFA_Trans* = array[TLabel, seq[NFA_Edge]] # transitions for NFA's 41 | # label 0 is the start node 42 | TLabelToRule* = array[TLabel, TRuleIndex] 43 | DFA* = object 44 | startState*: int # start state; for some reason it won't always be 1 45 | stateCount*: int # number of states; states are from 1 to stateCount 46 | captures*, backrefs*: int 47 | ruleCount*: int # number of rules; rule 0 means no match 48 | trans*: DFA_Trans 49 | toRules*: TLabelToRule 50 | 51 | NFA* = object 52 | captures, backrefs, stateCount: int 53 | trans*: NFA_Trans 54 | toRules*: TLabelToRule 55 | 56 | proc initNFA(a: var NFA) = discard 57 | proc initDFA(a: var DFA) = discard 58 | 59 | proc addTrans(src: var seq[NFA_Edge]; c: Alphabet; d: TLabel) = 60 | for i in 0 .. high(src): 61 | if src[i].cond == c: 62 | src[i].dest.incl d 63 | return 64 | src.add(NFA_Edge(cond: c, dest: {d})) 65 | if c.kind == reEps and src.len != 1: 66 | # make epsilon always the first transition to speed up later passes: 67 | swap(src[0], src[src.high]) 68 | 69 | proc addTrans(src: var seq[DFA_Edge]; c: Alphabet; d: TLabel) = 70 | for i in 0 .. high(src): 71 | if src[i].cond == c: 72 | src[i].dest = d 73 | return 74 | src.add(DFA_Edge(cond: c, dest: d)) 75 | 76 | proc auxRegExprToNFA(r: PRegExpr; a: var NFA; currState: int): int = 77 | # helper that is recursive; returns the new current state 78 | result = currState 79 | assert(r != nil) 80 | if r == nil: return 81 | case r.kind 82 | of reEps: 83 | addTrans(a.trans[result], alEpsilon, result + 1) 84 | inc(result) 85 | of reChar: 86 | addTrans(a.trans[result], Alphabet(kind: reChar, val: r.c), result + 1) 87 | inc(result) 88 | of reWordBoundary, reWordBoundaryNot, reBegin, reEnd: 89 | addTrans(a.trans[result], Alphabet(kind: r.kind, val: '\0'), result + 1) 90 | inc(result) 91 | of reStr: 92 | # string node 93 | for i in countup(0, len(r.s)-1): 94 | addTrans(a.trans[result], Alphabet(kind: reChar, val: r.s[i]), result + 1) 95 | inc(result) 96 | of reCat: 97 | # concatenation node 98 | result = auxRegExprToNFA(r.a, a, result) 99 | result = auxRegExprToNFA(r.b, a, result) 100 | of reCClass: 101 | addTrans(a.trans[result], alEpsilon, result + 1) 102 | inc(result) 103 | for c in countup('\0', '\xFF'): 104 | if c in r.cc[]: 105 | addTrans(a.trans[result], Alphabet(kind: reChar, val: c), result + 1) 106 | inc(result) 107 | of reStar: 108 | # star node 109 | # we draw one transition too much, which shouldn't be wrong 110 | let aa = auxRegExprToNFA(r.a, a, result) 111 | addTrans(a.trans[result], alEpsilon, aa + 1) 112 | addTrans(a.trans[aa], alEpsilon, aa + 1) 113 | addTrans(a.trans[aa + 1], alEpsilon, result) 114 | result = aa + 1 115 | of rePlus: 116 | # plus node 117 | # constructed as M M* would be: 118 | result = auxRegExprToNFA(catExpr(r.a, starExpr(r.a)), a, result) 119 | of reOpt: 120 | # option node 121 | # constructed as M | eps would be: 122 | result = auxRegExprToNFA(altExpr(r.a, epsExpr()), a, result) 123 | of reAlt: 124 | # (|) node 125 | addTrans(a.trans[result], alEpsilon, result + 1) 126 | inc(result) 127 | let oldState = result 128 | let aa = auxRegExprToNFA(r.a, a, result) 129 | let bb = auxRegExprToNFA(r.b, a, aa + 1) 130 | 
addTrans(a.trans[oldState], alEpsilon, aa + 1) 131 | addTrans(a.trans[aa], alEpsilon, bb + 1) 132 | addTrans(a.trans[bb], alEpsilon, bb + 1) 133 | result = bb + 1 134 | of reCapture, reCaptureEnd: 135 | a.captures = max(a.captures, int(r.c)) 136 | addTrans(a.trans[result], Alphabet(kind: reCapture, val: r.c), result+1) 137 | inc(result) 138 | result = auxRegExprToNFA(r.a, a, result) 139 | addTrans(a.trans[result], Alphabet(kind: reCaptureEnd, val: r.c), result+1) 140 | inc(result) 141 | of reBackref: 142 | a.backrefs = max(a.backrefs, int(r.c)) 143 | addTrans(a.trans[result], Alphabet(kind: reBackref, val: r.c), result + 1) 144 | inc(result) 145 | if r.rule != 0: a.toRules[result] = r.rule 146 | 147 | proc regExprToNFA*(r: PRegExpr; a: var NFA) = 148 | initNFA(a) 149 | a.stateCount = auxRegExprToNFA(r, a, 0) 150 | 151 | proc allTransitions*(a: DFA; source, dest: TLabel): (seq[Alphabet], set[char]) = 152 | result[0] = @[] 153 | if a.trans[source].len > 0: 154 | result[1] = {} 155 | var card = 0 156 | var lastChar = -1 157 | for x in a.trans[source]: 158 | if x.dest == dest: 159 | if x.cond.kind == reChar: 160 | inc card 161 | if lastChar < 0: lastChar = int x.cond.val 162 | result[1].incl x.cond.val 163 | else: 164 | result[0].add x.cond 165 | if card == 1: 166 | result[1] = {} 167 | result[0].add Alphabet(kind: reChar, val: char lastChar) 168 | 169 | iterator allDests*(a: DFA; source: TLabel): TLabel = 170 | if a.trans[source].len > 0: 171 | # use a set to eliminate duplicates: 172 | var dests: TLabelSet 173 | for x in a.trans[source]: dests.incl x.dest 174 | for d in dests: yield d 175 | 176 | proc getRule*(a: DFA; s: TLabel): int = a.toRules[s] 177 | 178 | proc closure(a: NFA; S: TLabelSet): TLabelSet = 179 | var res: TLabelSet 180 | result = S 181 | while true: 182 | res = result 183 | for L in countup(0, a.stateCount): 184 | if L in res: 185 | if a.trans[L].len > 0 and a.trans[L][0].cond.kind == reEps: 186 | result = result + a.trans[L][0].dest 187 | if res == result: break 188 | 189 | proc getDest(a: seq[NFA_Edge]; c: Alphabet): TLabelSet = 190 | if a.len == 0: return 191 | for t in a: 192 | if t.cond.kind == c.kind and t.cond.val == c.val: return t.dest 193 | 194 | proc getDest(a: seq[DFA_Edge]; c: Alphabet): TLabel = 195 | if a.len == 0: return 196 | for t in a: 197 | if t.cond.kind == c.kind and t.cond.val == c.val: return t.dest 198 | 199 | proc getDFAedge(a: NFA; d: TLabelSet; c: Alphabet): TLabelSet = 200 | var tmp: TLabelSet = {} 201 | for L in countup(0, a.stateCount): 202 | if L in d: 203 | tmp = tmp + getDest(a.trans[L], c) 204 | result = closure(a, tmp) 205 | 206 | proc searchInStates(states: openarray[TLabelSet]; p: int; e: TLabelSet): int = 207 | # returns -1 if not found 208 | for i in countup(0, p): 209 | if states[i] == e: return i 210 | result = -1 211 | 212 | proc fullAlphabet(captures, backrefs: int): seq[Alphabet] = 213 | result = @[] 214 | var c: Alphabet 215 | c.kind = reChar 216 | for x in '\0'..'\255': 217 | c.val = x 218 | result.add c 219 | c.kind = reBackref 220 | for x in 1..backrefs: 221 | c.val = char(x) 222 | result.add c 223 | for x in 1..captures: 224 | c.val = char(x) 225 | c.kind = reCapture 226 | result.add c 227 | c.kind = reCaptureEnd 228 | result.add c 229 | c.val = '\0' 230 | c.kind = reBegin 231 | result.add c 232 | c.kind = reEnd 233 | result.add c 234 | c.kind = reWordBoundary 235 | result.add c 236 | c.kind = reWordBoundaryNot 237 | result.add c 238 | 239 | proc fullAlphabet*(a: NFA): seq[Alphabet] = fullAlphabet(a.captures, a.backrefs) 240 
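# Overview (illustrative sketch, not executed anywhere in this module): the
# procs in this module are used in this order by vm.nim's `re` and by lexim's
# `match`, with `findMacro` standing for any MacroLookupProc (e.g. one that
# simply returns nil):
#
#   let rex = parseRegExpr("[0-9]+", findMacro, {reNoCaptures, reNoBackrefs})
#   rex.rule = 1                   # rule 0 means "no match"
#   var n: NFA
#   regExprToNFA(rex, n)
#   let alph = fullAlphabet(n)     # all chars plus capture/backref/anchor pseudo-symbols
#   var d, o: DFA
#   NFA_to_DFA(n, d, alph)
#   optimizeDFA(d, o, alph)        # o is the minimal DFA handed to the code generator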
| 241 | proc NFA_to_DFA*(a: NFA; b: var DFA; fullAlphabet: seq[Alphabet]) = 242 | # Look into 'Modern compiler implementation in Java' for reference of 243 | # this algorithm. 244 | var 245 | states: seq[TLabelSet] = @[] 246 | states.add({}) 247 | states.add closure(a, {0.TLabel}) # 0 is the start state 248 | var p = 1 249 | var j = 0 250 | while j <= p: 251 | for c in fullAlphabet: 252 | let e = getDFAedge(a, states[j], c) 253 | let i = searchInStates(states, p, e) 254 | if i >= 0: 255 | addTrans(b.trans[j], c, i) 256 | else: 257 | inc(p) 258 | assert p == states.len 259 | states.add e 260 | addTrans(b.trans[j], c, p) 261 | inc(j) 262 | for d in countup(low(TLabel), j - 1): 263 | var minRule = high(int) 264 | for i in countup(low(TLabel), high(TLabel)): 265 | if i in states[d]: 266 | if minRule > a.toRules[i] and a.toRules[i] != 0: 267 | minRule = a.toRules[i] 268 | if minRule == high(int): 269 | b.toRules[d] = 0 270 | else: 271 | b.toRules[d] = minRule 272 | if minRule > b.ruleCount: b.ruleCount = minRule 273 | b.stateCount = j - 1 274 | b.startState = 1 # for some reason this is always 1 275 | b.captures = a.captures 276 | b.backrefs = a.backrefs 277 | 278 | proc getPreds(a: DFA; s: TLabelSet; c: Alphabet): TLabelSet = 279 | # computes the set of predecessors for the set s (under the character c) 280 | result = {} 281 | let k = c.kind 282 | let v = c.val 283 | for i in countup(1, a.stateCount): 284 | for t in a.trans[i]: 285 | if t.cond.kind == k and t.cond.val == v and t.dest in s: 286 | incl(result, i) 287 | 288 | proc card(s: TLabelSet; maxState: int): int = 289 | result = 0 290 | for i in countup(1, maxState): 291 | if i in s: inc(result) 292 | 293 | proc choose(s: TLabelSet; maxState: int): TLabel = 294 | # choose an arbitrary element from s 295 | assert(s != {}) 296 | for i in countup(1, maxState): 297 | if i in s: 298 | return i 299 | result = 0 # invalid state 300 | 301 | proc optimizeDFA*(a: DFA; b: var DFA; fullAlphabet: seq[Alphabet]) = 302 | # Optimizes the DFA a to produce a minimal DFA. 303 | # We use Hopcroft's algorithm; see the paper coming with this source. 304 | # We have different types of nodes: there is a one to one correspondence 305 | # between type and matching rule. 
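  # A tiny illustrative refinement step (the numbers are made up): with the
  # partition p = {1,2,3}, {4,5} and predecessor set I = {2,3,4} for the current
  # splitter, both blocks intersect I without being contained in it, so they are
  # split into {2,3} / {1} and {4} / {5}; a block already on the worklist is
  # replaced by both halves, otherwise only the smaller half is queued.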
306 | b.captures = a.captures 307 | b.backrefs = a.backrefs 308 | # p[0], w[0] are unused 309 | # assign each state to a partition and to the worklist: 310 | # w := {F, S-F}; p := {F, S-F} 311 | var w = newSeq[TLabelSet](a.ruleCount+1) 312 | var p = newSeq[TLabelSet](a.ruleCount+1) 313 | for d in countup(1, a.stateCount): 314 | incl(w[a.toRules[d]], d) 315 | incl(p[a.toRules[d]], d) 316 | while w.len > 0: 317 | let s = w.pop 318 | for c in fullAlphabet: 319 | let I = getPreds(a, s, c) 320 | if I == {}: 321 | continue # speed things up 322 | for j in countdown(p.len - 1, 0): 323 | let R = p[j] 324 | if (R * I != {}) and not (R <= I): 325 | # partition R into x, y 326 | let x = R * I 327 | let y = R - x # replace R by x and y in P: 328 | p[j] = x 329 | p.add y 330 | let findRes = searchInStates(w, w.len - 1, R) 331 | if findRes >= 0: 332 | # R is elem of W, so replace R by x, y 333 | w[findRes] = x 334 | w.add y 335 | else: 336 | if card(x, a.stateCount) <= card(y, a.stateCount): # add y to W: 337 | w.add x 338 | else: 339 | w.add y 340 | b.stateCount = p.len # new states 341 | b.ruleCount = a.ruleCount # rule count stays the same 342 | for j in countup(0, p.len - 1): 343 | if p[j] != {}: 344 | let repr = choose(p[j], a.stateCount) # choose a representant of the set 345 | if a.startState in p[j]: b.startState = j + 1 346 | b.toRules[j + 1] = a.toRules[repr] 347 | for c in fullAlphabet: 348 | let dest = a.trans[repr].getDest(c) 349 | if dest != 0: 350 | # test to speed things up 351 | for k in countup(0, p.len - 1): 352 | if dest in p[k]: 353 | addTrans b.trans[j + 1], c, k + 1 354 | break 355 | -------------------------------------------------------------------------------- /regexprs.nim: -------------------------------------------------------------------------------- 1 | # 2 | # 3 | # Lexim - The Lexer Generator for Nim 4 | # (c) Copyright 2015 Andreas Rumpf 5 | # 6 | # See the file "copying.txt", included in this 7 | # distribution, for details about the copyright. 8 | # 9 | 10 | # This module implements a parser for regular expressions. 11 | 12 | import strutils 13 | 14 | type 15 | RegexKind* = enum ## the regex AST's kind 16 | reEps, ## epsilon node 17 | reChar, ## character node 18 | reStr, ## string node 19 | reCClass, ## character class node 20 | reStar, ## star node 21 | rePlus, ## plus node 22 | reOpt, ## option node 23 | reCat, ## concatenation node 24 | reAlt, ## alternatives node (|) 25 | reCapture, ## (capture) 26 | reCaptureEnd, ## not used by regex, but by NFA 27 | reBackref, ## \\backref 28 | reBegin, ## \\A 29 | reEnd, ## \\Z 30 | reWordBoundary, ## \\b 31 | reWordBoundaryNot ## \\B 32 | 33 | PRegExpr* = ref TRegExpr 34 | TRegExpr* = object 35 | kind*: RegexKind 36 | a*, b*: PRegExpr # some nodes have two successors 37 | c*: char 38 | s*: string 39 | cc*: ref set[char] 40 | rule*: int # if >= 0 it is a final state; 41 | # then it is the rule that was matched 42 | 43 | RegexError* = object of ValueError 44 | RegexFlag* = enum ## how regexes are parsed 45 | reExtended, ## extended syntax support 46 | reNoBackrefs, ## always process \\1 as a character literal, 47 | ## not as back reference 48 | reNoCaptures ## () is the same as (?:) 49 | 50 | MacroLookupProc* = proc (macroname: string): PRegExpr {.closure.} ## \ 51 | ## lookup proc that expands {macros}. 
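    ## For example (illustrative only; `myMacros` is not part of this module):
    ##
    ## .. code-block:: nim
    ##   proc myMacros(name: string): PRegExpr =
    ##     # names arrive upper-cased and with '_' stripped (see parseIdent)
    ##     if name == "DIGIT": result = cclassExpr({'0'..'9'})
    ##   let r = parseRegExpr("{digit}+", myMacros)
    ##   doAssert r.kind == rePlus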
52 | 53 | ReCtx = object 54 | pos: int 55 | flags: set[RegexFlag] 56 | captures: int # count the captures to give them an index 57 | findMacro: MacroLookupProc 58 | 59 | const 60 | wordChars* = {'A'..'Z', 'a'..'z', '0'..'9', '_', '\128', '\255'} 61 | whitespace* = {'\1'..'\32'} 62 | digits* = {'0'..'9'} 63 | 64 | proc newExpr(kind: RegexKind): PRegExpr = 65 | new(result) 66 | result.kind = kind 67 | 68 | proc epsExpr*(): PRegExpr = 69 | result = newExpr(reEps) 70 | 71 | proc charExpr*(c: char): PRegExpr = 72 | result = newExpr(reChar) 73 | result.c = c 74 | 75 | proc backrefExpr*(x: int): PRegExpr = 76 | result = newExpr(reBackref) 77 | result.c = char x 78 | 79 | proc strExpr*(str: string): PRegExpr = 80 | if len(str) == 1: 81 | result = charExpr(str[0]) 82 | else: 83 | result = newExpr(reStr) 84 | result.s = str 85 | 86 | proc cclassExpr*(charset: set[char]): PRegExpr = 87 | result = newExpr(reCClass) 88 | new(result.cc) 89 | result.cc[] = charset 90 | 91 | proc starExpr*(r: PRegExpr): PRegExpr = 92 | if r.kind == reStar: 93 | result = r 94 | else: 95 | result = newExpr(reStar) 96 | result.a = r 97 | 98 | proc plusExpr*(r: PRegExpr): PRegExpr = 99 | result = newExpr(rePlus) 100 | result.a = r 101 | 102 | proc optExpr*(r: PRegExpr): PRegExpr = 103 | result = newExpr(reOpt) 104 | result.a = r 105 | 106 | proc catExpr*(a, b: PRegExpr): PRegExpr = 107 | result = newExpr(reCat) 108 | result.a = a 109 | result.b = b 110 | 111 | proc altExpr*(a, b: PRegExpr): PRegExpr = 112 | result = newExpr(reAlt) 113 | result.a = a 114 | result.b = b 115 | 116 | proc altExpr*(a: varargs[PRegExpr]): PRegExpr = 117 | result = altExpr(a[0], a[1]) 118 | for i in 2 ..< a.len: 119 | result = result.altExpr(a[i]) 120 | 121 | proc mnExpr*(r: PRegExpr; m, n: int): PRegExpr = 122 | var ri: PRegExpr 123 | if m > n or n == 0: 124 | result = epsExpr() 125 | else: 126 | # construct r^m: 127 | if m == 0: 128 | ri = epsExpr() 129 | else: 130 | ri = r 131 | for i in countup(2, m): ri = catExpr(ri, r) 132 | result = ri # r{m,n} := r^m 133 | for i in countup(m + 1, n): 134 | if ri.kind == reEps: ri = r 135 | else: ri = catExpr(ri, r) 136 | result = altExpr(result, ri) # r{m,n} := r{m,n} | r^i, 137 | # i=m+1,...,n 138 | 139 | proc newCapture*(a: PRegExpr): PRegExpr = 140 | result = newExpr(reCapture) 141 | result.a = a 142 | 143 | proc getNext(buf: string; c: var ReCtx): char = 144 | if reExtended in c.flags: 145 | while c.pos < buf.len and buf[c.pos] in {' ', '\t'}: inc(c.pos) 146 | result = if c.pos < buf.len: buf[c.pos] else: '\0' 147 | 148 | proc error(msg: string) {.noinline.} = 149 | raise newException(RegexError, msg) 150 | 151 | proc getChar(buf: string; c: var ReCtx; inClass: bool): PRegExpr = 152 | var val, i: int 153 | if reExtended in c.flags and not inClass: 154 | while c.pos < buf.len and buf[c.pos] in {' ', '\t'}: inc(c.pos) 155 | if c.pos < buf.len and buf[c.pos] != '\\': 156 | result = charExpr(buf[c.pos]) 157 | inc(c.pos) 158 | else: 159 | let ch = if c.pos+1 < buf.len: buf[c.pos+1] else: '\0' 160 | case ch 161 | of 'n': 162 | result = altExpr(strExpr("\C\L"), charExpr('\L'), charExpr('\C')) 163 | inc(c.pos, 2) 164 | of 'r': 165 | result = charExpr('\r') 166 | inc(c.pos, 2) 167 | of 'l', 'L': 168 | result = charExpr('\L') 169 | inc(c.pos, 2) 170 | of 't': 171 | result = charExpr('\t') 172 | inc(c.pos, 2) 173 | of 'b': 174 | result = if inClass: charExpr('\b') else: newExpr(reWordBoundary) 175 | inc(c.pos, 2) 176 | of 'B': 177 | result = if inClass: charExpr('\b') else: newExpr(reWordBoundaryNot) 178 | 
inc(c.pos, 2) 179 | of 'e': 180 | result = charExpr('\e') 181 | inc(c.pos, 2) 182 | of 'a', 'A': 183 | result = if inClass: charExpr('\a') else: newExpr(reBegin) 184 | inc(c.pos, 2) 185 | of 'v': 186 | result = charExpr('\v') 187 | inc(c.pos, 2) 188 | of 'f': 189 | result = charExpr('\f') 190 | inc(c.pos, 2) 191 | of 'z', 'Z': 192 | if not inClass: result = newExpr(reEnd) 193 | else: error("\\Z not supported in character class") 194 | inc(c.pos, 2) 195 | of 's': 196 | result = cclassExpr(whitespace) 197 | inc(c.pos, 2) 198 | of 'S': 199 | result = cclassExpr({'\1'..'\255'} - whitespace) 200 | inc(c.pos, 2) 201 | of 'd': 202 | result = cclassExpr(digits) 203 | inc(c.pos, 2) 204 | of 'D': 205 | result = cclassExpr({'\1'..'\255'} - digits) 206 | inc(c.pos, 2) 207 | of 'w': 208 | result = cclassExpr(wordChars) 209 | inc(c.pos, 2) 210 | of 'W': 211 | result = cclassExpr({'\1'..'\255'} - wordChars) 212 | inc(c.pos, 2) 213 | of '0'..'9': 214 | let startsWithZero = ch == '0' 215 | val = ord(ch) - ord('0') 216 | inc(c.pos, 2) 217 | i = 1 218 | while (i <= 4) and c.pos < buf.len and (buf[c.pos] in {'0'..'9'}): 219 | val = val * 10 + ord(buf[c.pos]) - ord('0') 220 | inc(c.pos) 221 | inc(i) 222 | if startsWithZero or reNoBackrefs in c.flags: 223 | result = charExpr(char val) 224 | else: 225 | result = backrefExpr(val) 226 | else: 227 | if ch in {'\0'..'\x1F'}: 228 | error "invalid character #" & toHex(ch.ord, 2) 229 | else: 230 | result = charExpr(ch) 231 | inc(c.pos, 2) 232 | 233 | proc parseStr(buf: string; c: var ReCtx): PRegExpr = 234 | var s = "" 235 | inc(c.pos) # skip " 236 | while c.pos < buf.len and buf[c.pos] != '\"': 237 | if buf[c.pos] in {'\0', '\C', '\L'}: 238 | error "\" expected" 239 | let al = getChar(buf, c,false) 240 | if al.kind == reChar: s.add al.c 241 | else: error "invalid regular expression " & buf 242 | inc(c.pos) # skip " 243 | result = strExpr(s) 244 | 245 | proc parseCClass(buf: string; c: var ReCtx): PRegExpr = 246 | # scan a character class 247 | var 248 | caret: bool 249 | cc: set[char] 250 | inc(c.pos) # skip [ 251 | if c.pos < buf.len and buf[c.pos] == '^': 252 | caret = true 253 | inc(c.pos) 254 | else: 255 | caret = false 256 | while c.pos < buf.len and buf[c.pos] != ']': 257 | if buf[c.pos] in {'\0', '\C', '\L'}: 258 | error "] expected" 259 | let a = getChar(buf, c, true) 260 | if a.kind == reChar: 261 | incl(cc, a.c) 262 | if c.pos < buf.len and buf[c.pos] == '-': 263 | inc(c.pos) 264 | if c.pos < buf.len and buf[c.pos] == ']': 265 | incl(cc, '-') 266 | break 267 | let b = getChar(buf, c, true) 268 | if b.kind == reChar: 269 | cc = cc + {a.c .. 
b.c} 270 | elif b.kind == reCClass: 271 | incl(cc, '-') 272 | cc = cc + b.cc[] 273 | else: 274 | error "invalid regular expression " & buf 275 | elif a.kind == reCClass: 276 | cc = cc + a.cc[] 277 | else: 278 | error "invalid regular expression " & buf 279 | if c.pos < buf.len and buf[c.pos] == ']': inc(c.pos) 280 | else: error "] expected" 281 | if caret: result = cclassExpr({'\1'..'\xFF'} - cc) 282 | else: result = cclassExpr(cc) 283 | 284 | proc parseNum(buf: string; c: var ReCtx): int = 285 | result = 0 286 | if c.pos < buf.len and buf[c.pos] in {'0'..'9'}: 287 | while true: 288 | result = result * 10 + ord(buf[c.pos]) - ord('0') 289 | inc(c.pos) 290 | if c.pos >= buf.len or buf[c.pos] notin {'0'..'9'}: break 291 | else: 292 | error "number expected" 293 | 294 | proc parseIdent(buf: string; c: var ReCtx): string = 295 | result = "" 296 | if c.pos < buf.len and buf[c.pos] in {'a'..'z', 'A'..'Z', '_'}: 297 | while c.pos < buf.len: 298 | case buf[c.pos] 299 | of 'a'..'z', 'A'..'Z', '0'..'9': 300 | result.add toUpperAscii(buf[c.pos]) 301 | inc(c.pos) 302 | of '_': 303 | inc(c.pos) # ignore _ 304 | else: break 305 | else: 306 | error "identifier expected" 307 | 308 | proc parseMacroCall(buf: string; c: var ReCtx): PRegExpr = 309 | let name = parseIdent(buf, c) 310 | result = c.findMacro(name) 311 | if result.isNil: 312 | error "undefined macro: " & name 313 | 314 | proc parseRegExpr*(buf: string; c: var ReCtx): PRegExpr 315 | 316 | proc factor(buf: string; c: var ReCtx): PRegExpr = 317 | case getNext(buf, c) 318 | of '\"': 319 | result = parseStr(buf, c) 320 | of '[': 321 | result = parseCClass(buf, c) 322 | of '.': 323 | inc(c.pos) 324 | result = cclassExpr({'\1'..'\xFF'}) # - {'\L'}) 325 | of '(': 326 | inc(c.pos) # skip ( 327 | var isCapture = reNoCaptures notin c.flags 328 | if c.pos+1 < buf.len and buf[c.pos] == '?' 
and buf[c.pos+1] == ':': 329 | inc c.pos, 2 330 | isCapture = false 331 | result = parseRegExpr(buf, c) 332 | if getNext(buf, c) == ')': inc(c.pos) 333 | else: error ") expected" 334 | if isCapture: 335 | inc c.captures 336 | result = newCapture(result) 337 | result.c = char c.captures 338 | of '\\': 339 | result = getChar(buf, c, false) 340 | of '{': 341 | inc(c.pos) # skip { 342 | while c.pos < buf.len and buf[c.pos] in {' ', '\t'}: inc(c.pos) 343 | result = parseMacroCall(buf, c) 344 | if getNext(buf, c) == '}': inc(c.pos) 345 | else: error "} expected" 346 | of '*', '+', '?': 347 | error "escape " & buf[c.pos] & " with \\" 348 | of '$': 349 | result = newExpr(reEnd) 350 | inc(c.pos) 351 | of '^': 352 | result = newExpr(reBegin) 353 | inc(c.pos) 354 | else: 355 | result = charExpr(if c.pos < buf.len: buf[c.pos] else: '\0') 356 | inc(c.pos) 357 | while true: 358 | case getNext(buf, c) 359 | of '*': 360 | inc(c.pos) 361 | result = starExpr(result) 362 | of '+': 363 | inc(c.pos) 364 | result = plusExpr(result) 365 | of '?': 366 | inc(c.pos) 367 | result = optExpr(result) 368 | of '{': 369 | inc(c.pos) # skip { 370 | if getNext(buf, c) notin {'0'..'9'}: 371 | # a macro, but do not parse it here, but later to 372 | # keep the operator predecence: 373 | while true: # back to { 374 | # a single decrement might not do 375 | # because of skipped whitespace 376 | dec(c.pos) 377 | if buf[c.pos] == '{': break 378 | break 379 | else: 380 | var n: int 381 | let m = parseNum(buf, c) 382 | if getNext(buf, c) == ',': 383 | inc(c.pos) 384 | while c.pos < buf.len and buf[c.pos] in {' ', '\t'}: inc(c.pos) 385 | n = parseNum(buf, c) 386 | else: 387 | n = m 388 | result = mnExpr(result, m, n) 389 | if getNext(buf, c) == '}': inc(c.pos) 390 | else: error "} expected" 391 | else: break 392 | 393 | proc term(buf: string; c: var ReCtx): PRegExpr = 394 | const 395 | termDelim = {'\0', ':', '|', ')'} #,'/' 396 | if getNext(buf, c) notin termDelim: 397 | result = factor(buf, c) 398 | while getNext(buf, c) notin termDelim: 399 | result = catExpr(result, factor(buf, c)) 400 | else: 401 | result = epsExpr() 402 | 403 | proc parseRegExpr(buf: string; c: var ReCtx): PRegExpr = 404 | result = term(buf, c) 405 | while getNext(buf, c) == '|': 406 | inc(c.pos) 407 | result = altExpr(result, term(buf, c)) 408 | 409 | proc parseRegExpr*(reg: string; findMacro: MacroLookupProc; 410 | flags: set[RegexFlag] = {}): PRegExpr = 411 | var c: ReCtx 412 | c.pos = 0 413 | c.flags = flags 414 | c.findMacro = findMacro 415 | c.captures = 0 416 | result = parseRegExpr(reg, c) 417 | 418 | proc containsInvCap(r: PRegExpr; inAlt: bool): bool = 419 | if r != nil: 420 | result = containsInvCap(r.a, inAlt or r.kind == reAlt) or 421 | containsInvCap(r.b, inAlt or r.kind == reAlt) or 422 | r.kind == reCapture and inAlt 423 | 424 | proc containsInvalidCapture*(r: PRegExpr): bool = 425 | ## When the implementation uses a DFA, captures can only be supported in 426 | ## quite a limited way: (abc)|(xyz) cannot be supported. This proc checks for 427 | ## that so a nice error can be generated. 
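  ## For instance (illustrative): `(abc)|(xyz)` is flagged, while nested
  ## captures such as `((a+)b(c))` are accepted:
  ##
  ## .. code-block:: nim
  ##   doAssert containsInvalidCapture(parseRegExpr("(abc)|(xyz)", nil))
  ##   doAssert not containsInvalidCapture(parseRegExpr("((a+)b(c))", nil))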
428 | result = containsInvCap(r, false) 429 | -------------------------------------------------------------------------------- /testa.nim: -------------------------------------------------------------------------------- 1 | 2 | import nfa, regexprs, listing, codegen 3 | 4 | proc initExample2(a: var NFA) = 5 | regexprs.addMacro("IDENT", parseRegExpr("[a-zA-Z_]")) 6 | regexprs.addMacro("E", parseRegExpr("[eE][+-]?[0-9]+")) 7 | let floatPat = parseRegExpr("[0-9]+ (\\.[0-9]+{e}? | { e })") # floating point numbers 8 | let intPat = parseRegExpr("[0-9]+") # integer 9 | let identPat = parseRegExpr("{ident}[0-9 A - Z a-z _]*") 10 | let elseExpr = parseRegExpr("e l s e ") 11 | let elifExpr = parseRegExpr("e l i f ") 12 | floatPat.rule = 3 13 | intPat.rule = 4 14 | identPat.rule = 5 15 | elseExpr.rule = 1 16 | elifExpr.rule = 2 17 | regExprToNFA(altExpr(identPat, intPat, floatPat, elifExpr, elseExpr), a) 18 | 19 | const 20 | asRegex = "([a-zA-Z_][0-9A-Za-z_]*)|([0-9]+)|([0-9]+ (\\.[0-9]+([eE][+-]?[0-9]+)?|[eE][+-]?[0-9]+))|else|elif" 21 | 22 | var n: NFA 23 | var d, o: DFA 24 | 25 | initExample2(n) 26 | NFA_to_DFA(n, d) 27 | 28 | optimizeDFA(d, o) 29 | 30 | when false: 31 | var buffer = newStringOfCap(10_000) 32 | genMatcher(o, buffer) 33 | writeFile("matcher.nim", buffer) 34 | 35 | import vm, vm2 36 | from times import cpuTime 37 | 38 | var bc = vm.Bytecode(code: @[], data: @[]) 39 | vm.genBytecode(o, bc) 40 | echo "code ", bc.code.len, " data: ", bc.data.len 41 | 42 | 43 | var bc2 = vm2.Bytecode(code: @[], data: @[]) 44 | vm2.genBytecode(o, bc2) 45 | echo "code ", bc2.code.len, " data: ", bc2.data.len 46 | 47 | template bench(text, doWork: expr) = 48 | var t0 = cpuTime() 49 | doWork 50 | echo text, " took [s] ", cpuTime() - t0 51 | 52 | import re, strutils 53 | 54 | let thaRe = re(asRegex) 55 | 56 | proc main = 57 | while true: 58 | let inp = readLine(stdin) 59 | if inp.len == 0: break 60 | bench "vm 1": 61 | for i in 1..100_000: 62 | discard vm.execBytecode(bc, inp) 63 | bench "vm 2": 64 | for i in 1..100_000: 65 | discard vm2.execBytecode(bc2, inp) 66 | 67 | bench "re A": 68 | for i in 1..100_000: 69 | discard re.matchLen(inp, thaRe) 70 | 71 | bench "sets": 72 | for i in 1..100_000: 73 | discard strutils.allCharsInSet(inp, {'A'..'Z','a'..'z','0'..'9','_'}) 74 | 75 | echo execBytecode(bc, inp) 76 | echo execBytecode(bc2, inp) 77 | echo re.matchLen(inp, thaRe) 78 | 79 | main() 80 | -------------------------------------------------------------------------------- /tests.nim: -------------------------------------------------------------------------------- 1 | 2 | import vm 3 | 4 | doAssert match("(a b c)", re"\( .* \)") 5 | doAssert match("while", re("while")) 6 | 7 | doAssert "0158787".match(re"\d+") 8 | doAssert "ABC 0232".match(re"\w+\s+\d+") 9 | doAssert "ABC".match(re"\d+ | \w+") 10 | 11 | doAssert matchLen("key", re"\w+") == 3 12 | 13 | var pattern = re"[a-z0-9]+\s*=\s*[a-z0-9]+" 14 | doAssert matchLen("key1= cal9", pattern) == 11 15 | doAssert match("abc", re"\Aabc\Z") 16 | 17 | doAssert(not match("abcdef", re"^abc$")) 18 | 19 | doAssert(not match("aef", re"\A(?:abc|def)\Z")) 20 | doAssert(match("def", re"\A(?:abc|def)\Z")) 21 | doAssert(not match("deffoo", re"\A(?:abc|def)\Z")) 22 | 23 | doAssert(not match("deffoo", re"\b(?:abc|def)\b")) 24 | doAssert(match("def foo", re"\b(?:abc|def)\b")) 25 | 26 | doAssert(matchLen("def foo", re"\b(?:abc|def)\b") == 3) 27 | 28 | doAssert(matchLen("def foo\C\L", re"\bdef\sfoo\n") == 9) 29 | 30 | let complex = re"(\`|\')[^`']*\1" #re"(\w+)|(a)bcxyz" 31 | 32 | 
#echoCode(complex) 33 | if "'haha'" =~ complex: 34 | echo matches 35 | else: 36 | assert false 37 | 38 | #echo matchLen("abc", complex) # == 3 39 | 40 | when true: 41 | let complex2 = re"((a+)b(c)\2)" 42 | #echoCode complex2 43 | if "aaaaabcc" =~ complex2: 44 | echo matches 45 | #for x in matches: echo x 46 | #assert matches[1] == "abc" 47 | else: 48 | assert false 49 | 50 | when false: 51 | if "abc" =~ re"(cba)?.*": 52 | assert matches[0] == nil 53 | else: assert false 54 | 55 | if "abc" =~ re"().*": 56 | assert matches[0] == "" 57 | else: assert false 58 | -------------------------------------------------------------------------------- /todo.txt: -------------------------------------------------------------------------------- 1 | * fail at re construction time when invalid captures are used 2 | * finish 're' compatibility API 3 | -------------------------------------------------------------------------------- /vm.nim: -------------------------------------------------------------------------------- 1 | # 2 | # 3 | # Lexim - The Lexer Generator for Nim 4 | # (c) Copyright 2015 Andreas Rumpf 5 | # 6 | # See the file "copying.txt", included in this 7 | # distribution, for details about the copyright. 8 | # 9 | 10 | ## Translates the DFA into a bytecode and then runs the bytecode. 11 | import nfa, strutils, intsets, regexprs 12 | 13 | type 14 | Instr* = distinct uint32 15 | 16 | Opcode* = enum 17 | opcRet, # return with some literal 18 | opcTestSet, # test current character against bitset in data section 19 | opcTestChar, # test current character against char embedded in instr 20 | opcTJmp, # jump if comparison was true 21 | opcBegin, # \A match 22 | opcEnd, # \Z match 23 | opcWordBound, # \b match 24 | opcCaptureBegin # begin of capture '(' 25 | opcCaptureEnd # end of capture ')' 26 | opcBackref # \\1 match 27 | 28 | Bytecode* = object 29 | code*: seq[Instr] 30 | data*: seq[set[char]] 31 | startAt*, captures*: int 32 | 33 | template opcode*(x: Instr): Opcode = Opcode(x.uint32 and 0xff'u32) 34 | template regBx*(x: Instr): int = (x.uint32 shr 16'u32).int 35 | 36 | proc codeListing(c: Bytecode, result: var string, start=0; last = -1) = 37 | # first iteration: compute all necessary labels: 38 | var jumpTargets = initIntSet() 39 | let last = if last < 0: c.code.len-1 else: min(last, c.code.len-1) 40 | jumpTargets.incl(c.startAt) 41 | for i in start..last: 42 | let x = c.code[i] 43 | if x.opcode == opcTJmp: 44 | jumpTargets.incl(x.regBx) 45 | result.addf("goto L$1:\n", c.startAt) 46 | var i = start 47 | while i <= last: 48 | if i in jumpTargets: result.addf("L$1:\n", i) 49 | let x = c.code[i] 50 | result.add($i) 51 | let opc = opcode(x) 52 | case opc 53 | of opcRet: 54 | result.addf("\t$#\t$#\n", ($opc).substr(3), x.regBx) 55 | of opcTestSet: 56 | result.addf("\t$#\t$#\n", ($opc).substr(3), $c.data[x.regBx]) 57 | of opcTestChar: 58 | result.addf("\t$#\t$#\n", ($opc).substr(3), $chr(x.regBx)) 59 | of opcTJmp: 60 | result.addf("\t$#\tL$#\n", ($opc).substr(3), x.regBx) 61 | of opcCaptureBegin, opcCaptureEnd, opcBackref: 62 | result.addf("\t$#\tC$#\n", ($opc).substr(3), x.regBx) 63 | of opcBegin, opcEnd, opcWordBound: 64 | result.addf("\t$#\n", ($opc).substr(3)) 65 | inc i 66 | 67 | proc echoCode*(c: Bytecode; start=0; last = -1) {.deprecated.} = 68 | var buf = "" 69 | codeListing(c, buf, start, last) 70 | echo buf 71 | 72 | proc genInstr(opc: Opcode; bx: int): Instr = 73 | # `bx` must be signed and in the range [-32767, 32768] 74 | const a = 0 75 | doAssert bx >= -32767 and bx <= 32768 76 | result = 
(opc.uint32 or a.uint32 shl 8'u32 or 77 | bx.uint32 shl 16'u32).Instr 78 | 79 | proc gABx(c: var Bytecode; opc: Opcode; bx: int) = 80 | c.code.add(genInstr(opc, bx)) 81 | 82 | proc genData(c: var Bytecode; data: set[char]): int = 83 | assert '\0' notin data 84 | for i in 0 .. c.data.high: 85 | if c.data[i] == data: return i 86 | result = c.data.len 87 | c.data.add data 88 | 89 | proc genTest(res: var Bytecode; x: Alphabet; dest: int) = 90 | case x.kind 91 | of reChar: 92 | gABx(res, opcTestChar, int(x.val)) 93 | gABx(res, opcTJmp, dest) 94 | of reBegin: 95 | gABx(res, opcBegin, 0) 96 | gABx(res, opcTJmp, dest) 97 | of reEnd: 98 | gABx(res, opcEnd, 0) 99 | gABx(res, opcTJmp, dest) 100 | of reWordBoundary: 101 | gABx(res, opcWordBound, genData(res, wordChars)) 102 | gABx(res, opcTJmp, dest) 103 | of reWordBoundaryNot: 104 | gABx(res, opcWordBound, genData(res, {'\1'..'\255'} - wordChars)) 105 | gABx(res, opcTJmp, dest) 106 | else: discard 107 | 108 | proc genCapture(res: var Bytecode; cs: Alphabet; dest: int) = 109 | case cs.kind 110 | of reCapture: 111 | gABx(res, opcCaptureBegin, cs.val.int-1) 112 | gABx(res, opcTJmp, dest) 113 | inc res.captures 114 | of reCaptureEnd: 115 | gABx(res, opcCaptureEnd, cs.val.int-1) 116 | gABx(res, opcTJmp, dest) 117 | of reBackref: 118 | gABx(res, opcBackref, cs.val.int-1) 119 | gABx(res, opcTJmp, dest) 120 | else: discard 121 | 122 | proc genBytecode*(a: DFA; res: var Bytecode) = 123 | var stateToLabel = newSeq[int](a.stateCount) 124 | 125 | for src in countup(1, a.stateCount): 126 | stateToLabel[src-1] = res.code.len 127 | let rule = getRule(a, src) 128 | for dest in allDests(a, src): 129 | # this implements the rather strange 130 | # "match longest but only sometimes" rule that regexes seem to have: 131 | if rule == 0 or rule == getRule(a, dest): 132 | let (list, cset) = allTransitions(a, src, dest) 133 | for x in list: genCapture(res, x, dest) 134 | if cset != {}: 135 | gABx(res, opcTestSet, genData(res, cset)) 136 | gABx(res, opcTJmp, dest) 137 | for x in list: genTest(res, x, dest) 138 | if stateToLabel[src-1] != res.code.len or rule != 0: 139 | # only generate 'ret' instruction when the state is not empty: 140 | gABx(res, opcRet, rule) 141 | # Fixup the TJmp instructions: 142 | for i in 0 .. 
res.code.high: 143 | let instr = res.code[i] 144 | if opcode(instr) == opcTJmp: 145 | res.code[i] = genInstr(opcTJmp, stateToLabel[regBx(instr)-1]) 146 | res.startAt = stateToLabel[a.startState-1] 147 | 148 | type Action* = int #distinct range[1..32_000] 149 | 150 | proc backrefMatch(input: string; sp: int; capture: (int, int)): bool = 151 | var i = capture[0] 152 | var k = sp 153 | while true: 154 | if i > capture[1]: return true 155 | if k >= input.len or input[k] != input[i]: return false 156 | inc k 157 | inc i 158 | 159 | proc execBytecode*(m: Bytecode; input: string; 160 | captures: var seq[(int, int)], 161 | start=0): tuple[a: Action, endPos: int] = 162 | var pc = m.startAt 163 | var sp = start 164 | #var backtrack: seq[(int,int)] 165 | while true: 166 | let instr = m.code[pc] 167 | let opc = instr.opcode 168 | let arg = instr.regBx 169 | if opc == opcTestSet: 170 | # we *know* the next instruction is a TJmp: 171 | let next = m.code[pc+1] 172 | assert next.opcode == opcTJmp 173 | if sp < input.len and input[sp] in m.data[arg]: 174 | pc = next.regBx 175 | inc sp 176 | else: 177 | inc pc, 2 178 | elif opc == opcTestChar: 179 | # we *know* the next instruction is a TJmp: 180 | let next = m.code[pc+1] 181 | assert next.opcode == opcTJmp 182 | if sp < input.len and input[sp] == chr(arg): 183 | pc = next.regBx 184 | inc sp 185 | else: 186 | inc pc, 2 187 | elif opc == opcRet: 188 | #if arg == 0 and not backtrack.isNil and backtrack.len > 0: 189 | # let (newPc, newSp) = backtrack.pop() 190 | # pc = newPc 191 | # sp = newSp 192 | #else: 193 | return (Action(arg), sp) 194 | else: 195 | case opc 196 | of opcBegin: 197 | if sp == start: 198 | pc = m.code[pc+1].regBx 199 | else: 200 | inc pc, 2 201 | of opcEnd: 202 | if sp >= input.len: 203 | pc = m.code[pc+1].regBx 204 | else: 205 | inc pc, 2 206 | of opcWordBound: 207 | if sp >= input.len or sp == start or input[sp] notin m.data[arg]: 208 | pc = m.code[pc+1].regBx 209 | else: 210 | inc pc, 2 211 | of opcCaptureBegin: 212 | if captures.len <= arg: setLen(captures, arg+1) 213 | captures[arg][0] = sp 214 | captures[arg][1] = -2 # mark as still open 215 | #if backtrack.isNil: backtrack = @[] 216 | #backtrack.add((pc+2, sp)) 217 | pc = m.code[pc+1].regBx 218 | of opcCaptureEnd: 219 | captures[arg][1] = sp-1 220 | pc = m.code[pc+1].regBx 221 | of opcBackref: 222 | if arg < captures.len and backrefMatch(input, sp, captures[arg]): 223 | pc = m.code[pc+1].regBx 224 | inc sp, captures[arg][1] - captures[arg][0] + 1 225 | else: 226 | inc pc, 2 227 | else: assert false 228 | 229 | proc findMacro(s: string): PRegExpr = nil 230 | 231 | proc re*(regex: string; flags: set[RegexFlag] = {reExtended}): Bytecode = 232 | let r = parseRegExpr(regex, findMacro, flags) 233 | r.rule = 1 234 | var n: NFA 235 | regExprToNFA(r, n) 236 | let alph = fullAlphabet(n) 237 | 238 | var d, o: DFA 239 | NFA_to_DFA(n, d, alph) 240 | optimizeDFA(d, o, alph) 241 | 242 | result.code = @[] 243 | result.data = @[] 244 | genBytecode(o, result) 245 | 246 | proc matchLen*(input: string; r: Bytecode; 247 | captures: var seq[(int, int)], start=0): int = 248 | let (isMatch, len) = execBytecode(r, input, captures, start) 249 | result = if isMatch <= 0: -1 else: len 250 | 251 | proc match*(input: string; r: Bytecode; 252 | captures: var seq[(int, int)]; start=0): bool = 253 | let (isMatch, len) = execBytecode(r, input, captures, start) 254 | result = isMatch > 0 255 | 256 | proc matchLen*(input: string; r: Bytecode, start=0): int = 257 | var captures: seq[(int, int)] = @[] 258 | let (isMatch, 
len) = execBytecode(r, input, captures, start) 259 | result = if isMatch <= 0: -1 else: len 260 | 261 | proc match*(input: string; r: Bytecode; start=0): bool = 262 | var captures: seq[(int, int)] = @[] 263 | let (isMatch, len) = execBytecode(r, input, captures, start) 264 | result = isMatch > 0 265 | 266 | template `=~`*(s: string, pattern: Bytecode): untyped = 267 | ## This calls ``match`` with an implicit declared ``matches`` seq that 268 | ## can be used in the scope of the ``=~`` call: 269 | ## 270 | ## .. code-block:: nim 271 | ## 272 | ## if line =~ re"\s*(\w+)\s*\=\s*(\w+)": 273 | ## # matches a key=value pair: 274 | ## echo("Key: ", matches[0]) 275 | ## echo("Value: ", matches[1]) 276 | ## elif line =~ re"\s*(\#.*)": 277 | ## # matches a comment 278 | ## # note that the implicit ``matches`` array is different from the 279 | ## # ``matches`` array of the first branch 280 | ## echo("comment: ", matches[0]) 281 | ## else: 282 | ## echo("syntax error") 283 | ## 284 | var captures: seq[(int, int)] = @[] 285 | when not declaredInScope(matches): 286 | var matches {.inject.}: seq[string] = @[] 287 | let m = match(s, pattern, captures) 288 | for i in 0..high(captures): 289 | matches.add substr(s, captures[i][0], captures[i][1]) 290 | m 291 | --------------------------------------------------------------------------------
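A short usage sketch of the bytecode engine in vm.nim, mirroring tests.nim and
the `=~` documentation above (the patterns and the expected match length are
taken from there); note that `re` defaults to the extended syntax and that,
unlike the `lexim.match` macro, this path needs no 'lexe' helper executable:

import vm

doAssert matchLen("key1= cal9", re"[a-z0-9]+\s*=\s*[a-z0-9]+") == 11
doAssert "ABC 0232".match(re"\w+\s+\d+")

if "key = value" =~ re"\s*(\w+)\s*\=\s*(\w+)":
  echo "Key: ", matches[0], " Value: ", matches[1]
else:
  echo "no match"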