├── ABOUT ├── CHANGELOG.md ├── LICENSE ├── README.md ├── TODO.md ├── doc └── USAGE.md ├── src ├── lpcap.lua ├── lpcode.lua ├── lpeglj.lua ├── lpprint.lua ├── lpvm.lua └── re.lua └── tests ├── loadtest.lua ├── streamtest.lua ├── streamtest2.lua ├── test.lua └── testlr.lua /ABOUT: -------------------------------------------------------------------------------- 1 | LPeg Parser in LuaJIT 2 | based on LPeg v1.0 - PEG pattern matching for Lua 3 | Lua.org & PUC-Rio written by Roberto Ierusalimschy 4 | http://www.inf.puc-rio.br/~roberto/lpeg/ 5 | 6 | left recursion support based on Sérgio Medeiros algorithm 7 | http://arxiv.org/abs/1207.0443 8 | 9 | The re.lua and the test.lua are taken from 10 | original LPeg distribution. 11 | 12 | Released under MIT License 13 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 1.0.0.0 : September 30, 2015 2 | - include changes and bug fixes from LPeg v1.0 3 | - added VM runtime listing (tracing) for debugging purposes 4 | 5 | 0.12.2 : July 10, 2014 6 | 7 | - added restricted memoization 8 | - stream support (infinite parsing) 9 | 10 | 0.12.1 : December 30, 2013 11 | 12 | - speed improvement 13 | - support direct and indirect left recursion based on Sérgio Medeiros algorithm (http://arxiv.org/abs/1207.0443) 14 | - loading and saving patterns 15 | 16 | 0.12 : July 14, 2013: Initial release 17 | 18 | - LPeg Parser in pure LuaJIT based on LPeg v.12 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ** Modules: 2 | ** lpcap.lua, lpcode.lua, lpeglj.lua, lpprint.lua, lpvm.lua 3 | ** testlr.lua 4 | ** Copyright (C) 2014 Rostislav Sacek. 5 | ** 6 | ** Modules: 7 | ** re.lua, test.lua 8 | ** Copyright (C) 2013 Lua.org, PUC-Rio. 
9 | ** 10 | ** Licence: 11 | ** Permission is hereby granted, free of charge, to any person obtaining 12 | ** a copy of this software and associated documentation files (the 13 | ** "Software"), to deal in the Software without restriction, including 14 | ** without limitation the rights to use, copy, modify, merge, publish, 15 | ** distribute, sublicense, and/or sell copies of the Software, and to 16 | ** permit persons to whom the Software is furnished to do so, subject to 17 | ** the following conditions: 18 | ** 19 | ** The above copyright notice and this permission notice shall be 20 | ** included in all copies or substantial portions of the Software. 21 | ** 22 | ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 23 | ** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 24 | ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 25 | ** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 26 | ** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 27 | ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 28 | ** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
29 | ** 30 | ** [ MIT license: http://www.opensource.org/licenses/mit-license.php ] 31 | 32 | ----------------------------------------------------------------------------- 33 | 34 | 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | LPegLJ v1.0 2 | ============= 3 | 4 | LPeg Parser in pure LuaJIT 5 | (straight Lua + FFI translation of LPeg C code) 6 | based on LPeg v1.0 - PEG pattern matching for Lua 7 | Lua.org & PUC-Rio written by Roberto Ierusalimschy 8 | http://www.inf.puc-rio.br/~roberto/lpeg/ 9 | 10 | left recursion support is based on Sérgio Medeiros algorithm 11 | http://arxiv.org/abs/1207.0443 12 | 13 | ### Usage: 14 | ```Lua 15 | local lpeglj = require"lpeglj" 16 | local pattern = lpeglj.P("a") 17 | -- then: 18 | lpeglj.match(pattern, "a") 19 | -- or, equivalently: 20 | pattern:match("a") 21 | ``` 22 | 23 | ### Compatibility: 24 | 25 | - full syntactical and functional backward compatibility with LPeg v1.0 26 | - works only with LuaJIT 2.x 27 | 28 | ### Differences from LPeg v1.0: 29 | 30 | Description in doc/USAGE.md 31 | 32 | - LPegLJ supports direct and indirect left recursion based on Sérgio Medeiros algorithm (http://arxiv.org/abs/1207.0443) 33 | - patterns can be saved and loaded 34 | - supports memoization (restricted) - useful for complex grammars 35 | - can be used in stream mode (infinite parsing) 36 | - VM action runtime listing (tracing) for debugging purposes 37 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | ### Grammar checking 2 | #### Check PEG and left recursion rules for right order. 3 | 4 | E <- ('a' / 'aa') 'b' 5 | 6 | E <- 'a' / E 'a' 7 | 8 | ### Left factorization 9 | 10 | ### Runtime capture 11 | Add commit and revert functions (for creating tables via Cmt). 
12 | 13 | ### Rule profiling 14 | 15 | ### Performance tests 16 | 17 | 18 | -------------------------------------------------------------------------------- /doc/USAGE.md: -------------------------------------------------------------------------------- 1 | LPegLJ 1.0.0.0LJ 2 | =========== 3 | ## New functions: 4 | ###Loading and saving patterns: 5 | ####pat:save(fname, [tree]) 6 | Save pattern to file. 7 | 8 | fname - file name for pattern 9 | 10 | tree - full pattern tree is saved - later modification is possible 11 | ####pat:dump([tree]) 12 | Dump pattern to string. 13 | 14 | tree - full pattern tree is saved - later modification is possible 15 | ####lpeg.loadfile(fname, [fsymbols]) 16 | Load pattern from file. 17 | 18 | fname - file name with pattern 19 | 20 | fsymbols - table with functions (key - symbolic name, value - function). This should be used only for functions with upvalues. 21 | 22 | ####lpeg.load(str, [fsymbols]) 23 | Load pattern from memory. 24 | 25 | str - pattern in memory (string or ffi type) 26 | 27 | fsymbols - table with functions (key - symbolic name, value - function). This should be used only for functions with upvalues. 
28 | 29 | ###Example: 30 | ```Lua 31 | local lpeglj = require"lpeglj" 32 | local pat = lpeglj.P('abc') 33 | pat:save("saved.pat") -- save only pattern code 34 | local savedpat = lpeglj.loadfile("saved.pat") 35 | ``` 36 | ###Left recursion: 37 | ####lpeglj.enableleftrecursion(set) 38 | *set* - enable left recursion 39 | ####lpeglj.V(v, p) 40 | *p* - precedence level (number 1 to n) 41 | ###Example: 42 | ```Lua 43 | local lpeglj = require"lpeglj" 44 | lpeglj.enableleftrecursion(true) 45 | local pat = lpeglj.P{ 46 | "E", 47 | E = lpeglj.V("E", 1) * '+' * lpeglj.V("E", 2) + -- left associative rule with low precedence 48 | lpeglj.V("E", 2) * '**' * lpeglj.V("E", 2) + -- right associative rule with higher precedence 49 | 'n' 50 | } 51 | pat:match("n+n+n") 52 | ``` 53 | ####using re module with precedence 54 | ```Lua 55 | local lpeglj = require"lpeglj" 56 | local re = require"re" 57 | lpeglj.enableleftrecursion(true) 58 | local pat = [[ 59 | E <- E:1 [+-] E:2 / -- left associativity 60 | E:2 [*/] E:3 / 61 | E:3 '**' E:3 / -- right associativity 62 | '-' E:4 / -- highest precedence 63 | '(' E ')' / 64 | [0-9]+ 65 | ]] 66 | re.match("-1*(6+2/4+3-1)**2", pat) 67 | ``` 68 | ###Using memoization: 69 | ####lpeglj.enablememoization(set) 70 | *set* - enable memoization (true or false) 71 | 72 | ###Using stream: 73 | 74 | In stream mode all input data are copied into internal buffers. During parsing, the algorithm discards unused buffers (those with no link from the stack or from the capture stack). 75 | Captures are generated and removed from the capture stack under this condition: the capture is not inside an unresolved alternative and the capture is not open (it must be complete). 76 | The algorithm generates only complete captures at the highest level. Nested captures are generated after the higher-level captures are completed. 77 | 78 | ####lpeglj.streammatch(pat, init, ...) 
79 | *pat* - pattern 80 | *init* - start position in stream (should be a positive number) 81 | *...* - additional parameters (same as in the lpeg.match function) 82 | 83 | Returns function **func**. This function is called with string data from the stream. 84 | 85 | ####func(str, eos) 86 | *str* - string input (string) 87 | *eos* - end of stream (boolean) 88 | Returns **status** and capture(s) (if available) or position. 89 | 90 | **Status**: 91 | 1 - more data needed 92 | -1 - parsing failed 93 | 0 - parsing finished 94 | 95 | Restrictions and differences for stream mode: 96 | 97 | - the start position in the stream should be a positive number. 98 | - the whole-string argument in match-time captures (Cmt and function) is not a string but a function. 99 | This function takes two arguments (start and end index of the string in the stream) and returns a string. 100 | 101 | ###Example: 102 | ```Lua 103 | local lpeglj = require"lpeglj" 104 | local pat = lpeglj.C("abc") * lpeglj.C("def") 105 | local fce = pat:streammatch() 106 | local st = fce("ab") -- return 1 - more data needed 107 | local st, cap = fce("c") -- return 1 , "abc" - capture, more data needed 108 | local st, cap = fce("def") -- return 0 , "def" - capture, parsing finished 109 | ``` 110 | 111 | ####lpeglj.setmaxbehind(val) 112 | *val* - max position before current position (number or nil for reset) 113 | 114 | This function sets the maximum position before the current position. The buffer containing this position cannot be deleted. 115 | This function is meaningful only for match-time captures which use the first string argument. In this case 116 | the algorithm cannot determine the range of the requested string. 117 | 118 | #### re module 119 | 120 | ####re.streammatch (pat, init) 121 | *pat* - pattern 122 | *init* - start position in stream (should be a positive number) 123 | 124 | Returns function **func**. This function is called with string data from the stream. 
125 | 126 | ####func(str, eos) 127 | *str* - string input (string) 128 | *eos* - end of stream (boolean) 129 | Returns **status** and captures or position. 130 | 131 | **Status**: 132 | 1 - need another data 133 | -1 - parsing fail 134 | 0 - parsing finished 135 | 136 | ###Runtime tracing: 137 | ####lpeg.enabletracing(set) 138 | *set* - enable tracing (true or false) 139 | 140 | **Output format:** 141 | ####Rule entry: 142 | indent '+'[typ] rulename 143 | 144 | *indent* - nesting level 145 | *typ* - type of call 146 | - 'M' - memoized rule 147 | - 'TC' - tail call 148 | *rulename* - name of rule 149 | 150 | ####Rule match: 151 | indent '='[typ] funcname [extra] subject [captures] 152 | 153 | *indent* - nesting level 154 | *typ* - type of call 155 | - 'M' - memoized rule 156 | - 'IB' - increment bound (for left recursion) 157 | *extra* - additional info for left recursion - level of IB 158 | *subject* - corresponding part of input string (or stream) 159 | *captures* - corresponding part of runtime captures 160 | 161 | ####Rule leave (fail): 162 | indent '-' rulename 163 | 164 | *indent* - nesting level 165 | *rulename* - name of rule 166 | -------------------------------------------------------------------------------- /src/lpcap.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | LPEGLJ 3 | lpcap.lua 4 | Capture functions 5 | Copyright (C) 2014 Rostislav Sacek. 
6 | based on LPeg v1.0 - PEG pattern matching for Lua 7 | Lua.org & PUC-Rio written by Roberto Ierusalimschy 8 | http://www.inf.puc-rio.br/~roberto/lpeg/ 9 | 10 | ** Permission is hereby granted, free of charge, to any person obtaining 11 | ** a copy of this software and associated documentation files (the 12 | ** "Software"), to deal in the Software without restriction, including 13 | ** without limitation the rights to use, copy, modify, merge, publish, 14 | ** distribute, sublicense, and/or sell copies of the Software, and to 15 | ** permit persons to whom the Software is furnished to do so, subject to 16 | ** the following conditions: 17 | ** 18 | ** The above copyright notice and this permission notice shall be 19 | ** included in all copies or substantial portions of the Software. 20 | ** 21 | ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 22 | ** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23 | ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 24 | ** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 25 | ** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 26 | ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 27 | ** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
**
** [ MIT license: http://www.opensource.org/licenses/mit-license.php ]
--]]
local ffi = require "ffi"

-- Capture kinds: the numeric tag compared against the 'kind' field of
-- every entry in the capture list.
local Cclose = 0
local Cposition = 1
local Cconst = 2
local Cbackref = 3
local Carg = 4
local Csimple = 5
local Ctable = 6
local Cfunction = 7
local Cquery = 8
local Cstring = 9
local Cnum = 10
local Csubst = 11
local Cfold = 12
local Cruntime = 13
local Cgroup = 14

-- Upper bound on the number of entries collected for a string capture.
local MAXSTRCAPS = 10

-- Forward declarations: both functions are assigned further down in the
-- file and are used recursively by the helpers below.
local pushcapture
local addonestring


-- Returns the text between positions 'first' and 'last' (inclusive),
-- taken from the subject string when there is one and from the stream
-- accessor function otherwise.

local function subjecttext(cs, first, last)
    if cs.s then
        return cs.s:sub(first, last)
    end
    return cs.stream(first, last)
end


-- Walks backwards from 'index' and returns the index of the open entry
-- (siz == 0) that matches the close at 'index'. Close entries met on the
-- way are counted so that their own opens are skipped.

local function findopen(cs, index)
    local pending = 0 -- closes seen whose open has not been passed yet
    while true do
        index = index - 1
        local entry = cs.ocap[index]
        if entry.kind == Cclose then
            pending = pending + 1
        elseif entry.siz == 0 then
            if pending == 0 then
                return index
            end
            pending = pending - 1
        end
    end
end


-- Same walk as 'nextcap', but bounded: returns true when the capture that
-- starts at cs.cap, including its matching close, lies within 'captop';
-- returns nothing otherwise. cs.cap is left untouched.

local function checknextcap(cs, captop)
    local pos = cs.cap
    if cs.ocap[pos].siz == 0 then
        -- an open entry: look for the close that belongs to it
        local depth = 0 -- nested opens still waiting for their close
        while true do
            pos = pos + 1
            if pos > captop then
                return
            end
            local entry = cs.ocap[pos]
            if entry.kind == Cclose then
                if depth == 0 then
                    break
                end
                depth = depth - 1
            elseif entry.siz == 0 then
                depth = depth + 1
            end
        end
    end
    -- the entry after the close (or after the single full capture) must
    -- also be inside the limit
    if pos + 1 > captop then
        return
    end
    return true
end


-- Advances cs.cap past the capture it currently points to: one entry for
-- a full capture, or everything up to and including the matching close
-- for an open one.

local function nextcap(cs)
    local pos = cs.cap
    if cs.ocap[pos].siz == 0 then
        local depth = 0 -- nested opens still waiting for their close
        while true do
            pos = pos + 1
            local entry = cs.ocap[pos]
            if entry.kind == Cclose then
                if depth == 0 then
                    break
                end
                depth = depth - 1
            elseif entry.siz == 0 then
                depth = depth + 1
            end
        end
    end
    cs.cap = pos + 1
end


-- Appends to 'out' every value produced by the captures nested inside the
-- current one and returns how many were appended. With 'addextra', or
-- when the nested captures produce nothing, the text of the whole match
-- is appended as well, so the result is never zero.

local function pushnestedvalues(cs, addextra, out, valuetable)
    local open = cs.cap
    cs.cap = open + 1
    if cs.ocap[open].siz ~= 0 then
        -- full capture: nothing nested, its only value is the matched text
        local first = cs.ocap[open].s
        local len = cs.ocap[open].siz - 1
        out.outindex = out.outindex + 1
        out.out[out.outindex] = subjecttext(cs, first, first + len - 1)
        return 1
    end
    local count = 0
    while cs.ocap[cs.cap].kind ~= Cclose do
        count = count + pushcapture(cs, out, valuetable)
    end
    if addextra or count == 0 then
        -- the match runs from the open entry to the close entry
        local first = cs.ocap[open].s
        local len = cs.ocap[cs.cap].s - cs.ocap[open].s
        out.outindex = out.outindex + 1
        out.out[out.outindex] = subjecttext(cs, first, first + len - 1)
        count = count + 1
    end
    cs.cap = cs.cap + 1 -- step over the close entry
    return count
end


-- Like 'pushnestedvalues' without the extra value, but keeps only the
-- first value produced: the others are removed from 'out' again.

local function pushonenestedvalue(cs, out, valuetable)
    local produced = pushnestedvalues(cs, false, out, valuetable)
    while produced > 1 do
        out.out[out.outindex] = nil
        out.outindex = out.outindex - 1
        produced = produced - 1
    end
end


-- Searches backwards from 'cap' for a named group capture called 'name'
-- and returns its index. Raises an error when the start of the list is
-- reached without finding one.

local function findback(cs, cap, name, valuetable)
    while cap > 0 do
        cap = cap - 1
        local enclosing = false
        if cs.ocap[cap].kind == Cclose then
            cap = findopen(cs, cap) -- jump to the open, skipping what is nested
        elseif cs.ocap[cap].siz == 0 then
            enclosing = true -- open of a capture around us: not a candidate
        end
        if not enclosing then
            local entry = cs.ocap[cap]
            if entry.kind == Cgroup and entry.idx ~= 0 then
                if name == valuetable[entry.idx] then
                    return cap
                end
            end
        end
    end
    error(("back reference '%s' not found"):format(name), 0)
end


-- Back-reference capture. Return number of values pushed.

local function backrefcap(cs, out, valuetable)
    local here = cs.cap
    local refname = valuetable[cs.ocap[here].idx] -- name being referenced
    cs.cap = findback(cs, here, refname, valuetable) -- jump to the named group
    local pushed = pushnestedvalues(cs, false, out, valuetable) -- emit its values
    cs.cap = here + 1 -- resume right after the back reference
    return pushed
end


-- Table capture: builds a new table from the nested captures. Values of
-- named groups are stored under the group name (first value only); all
-- other values are stored at consecutive integer positions. The table is
-- the single value appended to 'out'.

local function tablecap(cs, out, valuetable)
    local result = {}
    local filled = 0 -- positional slots used so far
    local open = cs.cap
    cs.cap = open + 1
    -- only an open entry (siz == 0) has nested captures to collect
    if cs.ocap[open].siz == 0 then
        while cs.ocap[cs.cap].kind ~= Cclose do
            local entry = cs.ocap[cs.cap]
            local sub = { outindex = 0, out = {} }
            if entry.kind == Cgroup and entry.idx ~= 0 then
                -- named group: its first value goes under the group name
                local key = valuetable[entry.idx]
                pushonenestedvalue(cs, sub, valuetable)
                result[key] = sub.out[1]
            else
                -- anything else: all of its values go to the array part
                local produced = pushcapture(cs, sub, valuetable)
                for i = 1, sub.outindex do
                    result[filled + i] = sub.out[i]
                end
                filled = filled + produced
            end
        end
        cs.cap = cs.cap + 1 -- step over the close entry
    end
    out.outindex = out.outindex + 1
    out.out[out.outindex] = result
    return 1
end


-- Table-query capture: uses the first nested value as a key into the
-- table attached to the capture. Appends the value found and returns 1,
-- or returns 0 when the table has no value for that key.

local function querycap(cs, out, valuetable)
    local lookup = valuetable[cs.ocap[cs.cap].idx]
    local sub = { outindex = 0, out = {} }
    pushonenestedvalue(cs, sub, valuetable)
    if lookup[sub.out[1]] == nil then
        return 0
    end
    out.outindex = out.outindex + 1
    out.out[out.outindex] = lookup[sub.out[1]]
    return 1
end


-- Fold capture: the first value of the first nested capture seeds the
-- accumulator; the folding function is then called once per remaining
-- nested capture with the accumulator followed by that capture's values.
-- Raises an error when there is no initial value.

local function foldcap(cs, out, valuetable)
    local fold = valuetable[cs.ocap[cs.cap].idx]
    local open = cs.cap
    cs.cap = open + 1
    -- a full capture, or an open immediately followed by its close, has
    -- nothing nested to start from
    if cs.ocap[open].siz ~= 0 or cs.ocap[cs.cap].kind == Cclose then
        error("no initial value for fold capture", 0)
    end
    local seed = { outindex = 0, out = {} }
    if pushcapture(cs, seed, valuetable) == 0 then
        -- the first nested capture exists but produced no value
        error("no initial value for fold capture", 0)
    end
    local acc = seed.out[1]
    while cs.ocap[cs.cap].kind ~= Cclose do
        local step = { outindex = 0, out = {} }
        pushcapture(cs, step, valuetable)
        acc = fold(acc, unpack(step.out, 1, step.outindex))
    end
    cs.cap = cs.cap + 1 -- step over the close entry
    out.outindex = out.outindex + 1
    out.out[out.outindex] = acc
    return 1
end


-- Returns the number of arguments received and a table holding them; the
-- explicit count keeps trailing nils from being lost.

local function retcount(...)
    local packed = { ... }
    return select('#', ...), packed
end


-- Function capture: calls the function attached to the capture with the
-- nested values as arguments and appends everything it returns to 'out'.
-- Returns the number of values appended.

local function functioncap(cs, out, valuetable)
    local fn = valuetable[cs.ocap[cs.cap].idx]
    local args = { outindex = 0, out = {} }
    local argc = pushnestedvalues(cs, false, args, valuetable)
    local count, results = retcount(fn(unpack(args.out, 1, argc)))
    local base = out.outindex
    for i = 1, count do
        out.out[base + i] = results[i]
    end
    out.outindex = base + count
    return count
end


-- Select capture: keeps only the nested value whose position is the
-- number attached to the capture. Position 0 produces no value at all; a
-- position beyond the available values raises an error.

local function numcap(cs, out, valuetable)
    local wanted = valuetable[cs.ocap[cs.cap].idx]
    if wanted == 0 then
        nextcap(cs) -- skip the capture without evaluating it
        return 0
    end
    local sub = { outindex = 0, out = {} }
    local available = pushnestedvalues(cs, false, sub, valuetable)
    if available < wanted then
        error(("no capture '%d'"):format(wanted), 0)
    end
    out.outindex = out.outindex + 1
    out.out[out.outindex] = sub.out[wanted]
    return 1
end


-- Calls a runtime capture. Returns number of captures removed by
-- the call, including the initial Cgroup. (Captures to be added are
-- on the Lua stack.)

-- 'close' is the index of the entry that ends the group; it is rewritten
-- here as a Cclose entry at position 's'. The user function is called with
-- the subject (or the stream accessor when there is no subject string),
-- the position 's' and the nested capture values; whatever it returns is
-- appended to 'out'.
local function runtimecap(cs, close, s, out, valuetable)
    local open = findopen(cs, close)
    assert(cs.ocap[open].kind == Cgroup)
    cs.ocap[close].kind = Cclose -- closes the group
    cs.ocap[close].s = s
    cs.cap = open
    local fce = valuetable[cs.ocap[cs.cap].idx] -- function to be called
    local subout = { outindex = 0, out = {} }
    local n = pushnestedvalues(cs, false, subout, valuetable) -- nested captures become arguments
    local count, ret = retcount(fce(cs.s or cs.stream, s, unpack(subout.out, 1, n))) -- call dynamic function
    for i = 1, count do
        out.outindex = out.outindex + 1
        out.out[out.outindex] = ret[i]
    end
    return close - open -- number of captures of all kinds removed
end


-- Returns the text between positions 'first' and 'last' (inclusive) from
-- the subject string, or from the stream accessor when there is no
-- subject string.

local function subjectslice(cs, first, last)
    if cs.s then
        return cs.s:sub(first, last)
    end
    return cs.stream(first, last)
end


-- Collect values from current capture into array 'cps'. Current
-- capture must be Cstring (first call) or Csimple (recursive calls).
-- (In first call, fills %0 with whole match for Cstring.)
-- Returns number of elements in the array that were filled.
-- Each element is either { isstring = true, startstr, endstr } for text
-- taken directly from the subject, or { isstring = false, origcap } for a
-- capture that has to be evaluated later.

local function getstrcaps(cs, cps, n)
    local k = n
    n = n + 1
    cps[k + 1].isstring = true -- get string value
    cps[k + 1].startstr = cs.ocap[cs.cap].s -- starts here
    cs.cap = cs.cap + 1
    -- nested captures?
    if cs.ocap[cs.cap - 1].siz == 0 then
        -- traverse them
        while cs.ocap[cs.cap].kind ~= Cclose do
            if n >= MAXSTRCAPS then
                -- too many captures: skip the extra ones (will not need them)
                nextcap(cs)
            elseif cs.ocap[cs.cap].kind == Csimple then
                -- simple capture: record its text range (recursively)
                n = getstrcaps(cs, cps, n)
            else
                cps[n + 1].isstring = false -- not a string
                cps[n + 1].origcap = cs.cap -- keep original capture
                nextcap(cs)
                n = n + 1
            end
        end
        cs.cap = cs.cap + 1 -- skip close
    end
    -- end position of the entry just passed (the close, or the full capture)
    cps[k + 1].endstr = cs.ocap[cs.cap - 1].s + cs.ocap[cs.cap - 1].siz - 1
    return n
end


-- String capture: add result to buffer 'b' (instead of pushing
-- it into the stack). The format string attached to the capture is copied
-- to 'b'; '%d' (a single digit) is replaced by the corresponding nested
-- capture and '%' followed by any other character yields that character.
-- Raises an error for an index with no capture or a capture with no value.

local function stringcap(cs, b, valuetable)
    local cps = {}
    for i = 1, MAXSTRCAPS do
        cps[i] = {}
    end
    local fmt = valuetable[cs.ocap[cs.cap].idx]
    local n = getstrcaps(cs, cps, 0) - 1 -- collect nested captures
    local i = 1
    while i <= #fmt do
        local c = fmt:sub(i, i)
        if c ~= '%' then
            -- not an escape: copy it
            b[#b + 1] = c
        elseif fmt:sub(i + 1, i + 1) < '0' or fmt:sub(i + 1, i + 1) > '9' then
            -- escape not followed by a digit: copy the escaped character
            i = i + 1
            b[#b + 1] = fmt:sub(i, i)
        else
            i = i + 1
            local l = tonumber(fmt:sub(i, i)) -- capture index
            if l > n then
                error(("invalid capture index (%d)"):format(l), 0)
            end
            local cp = cps[l + 1]
            if cp.isstring then
                -- 'endstr' is one past the last character of the text
                b[#b + 1] = subjectslice(cs, cp.startstr, cp.endstr - 1)
            else
                local curr = cs.cap
                cs.cap = cp.origcap -- go back to evaluate that nested capture
                if not addonestring(cs, b, "capture", valuetable) then
                    error(("no values in capture index %d"):format(l), 0)
                end
                cs.cap = curr -- continue from where it stopped
            end
        end
        i = i + 1
    end
end


-- Substitution capture: add result to buffer 'b'. The matched text is
-- copied to 'b' with the text of every nested capture replaced by that
-- capture's first value; a nested capture without a value leaves its
-- original text in place.

local function substcap(cs, b, valuetable)
    local curr = cs.ocap[cs.cap].s
    if cs.ocap[cs.cap].siz ~= 0 then
        -- no nested captures: keep original text
        b[#b + 1] = subjectslice(cs, curr, curr + cs.ocap[cs.cap].siz - 2)
    else
        cs.cap = cs.cap + 1 -- skip open entry
        -- traverse nested captures
        while cs.ocap[cs.cap].kind ~= Cclose do
            local capstart = cs.ocap[cs.cap].s
            b[#b + 1] = subjectslice(cs, curr, capstart - 1) -- add text up to capture
            if addonestring(cs, b, "replacement", valuetable) then
                -- continue after the text matched by the nested capture
                curr = cs.ocap[cs.cap - 1].s + cs.ocap[cs.cap - 1].siz - 1
            else
                -- no capture value: keep original text in final result
                curr = capstart
            end
        end
        b[#b + 1] = subjectslice(cs, curr, cs.ocap[cs.cap].s - 1) -- add last piece of text
    end
    cs.cap = cs.cap + 1 -- go to next capture
end


-- Evaluates a capture and adds its first value to buffer 'b'; returns
-- whether there was a value (a positive number when there was, nothing
-- otherwise). The value must be a string or a number; 'what' names the
-- context in the error message raised for any other type.

function addonestring(cs, b, what, valuetable)
    local tag = cs.ocap[cs.cap].kind
    if tag == Cstring then
        stringcap(cs, b, valuetable) -- add capture directly to buffer
        return 1
    elseif tag == Csubst then
        substcap(cs, b, valuetable) -- add capture directly to buffer
        return 1
    else
        local subout = { outindex = 0, out = {} }
        local n = pushcapture(cs, subout, valuetable)
        if n > 0 then
            local value = subout.out[1]
            if type(value) ~= 'string' and type(value) ~= 'number' then
                error(("invalid %s value (a %s)"):format(what, type(value)), 0)
            end
            b[#b + 1] = value
            return n
        end
    end
end


-- Push all values of the current capture into the stack; returns
-- number of values pushed. 'out' is the value stack: out.out holds the
-- values and out.outindex is the index of the last one. cs.cap is left
-- pointing just past the capture that was evaluated.

function pushcapture(cs, out, valuetable)
    local kind = cs.ocap[cs.cap].kind
    if kind == Cposition then
        out.outindex = out.outindex + 1
        out.out[out.outindex] = cs.ocap[cs.cap].s
        cs.cap = cs.cap + 1
        return 1
    elseif kind == Cconst then
        out.outindex = out.outindex + 1
        out.out[out.outindex] = valuetable[cs.ocap[cs.cap].idx]
        cs.cap = cs.cap + 1
        return 1
    elseif kind == Carg then
        local arg = valuetable[cs.ocap[cs.cap].idx]
        cs.cap = cs.cap + 1
        if arg > cs.ptopcount then
            error(("reference to absent extra argument #%d"):format(arg), 0)
        end
        out.outindex = out.outindex + 1
        out.out[out.outindex] = cs.ptop[arg]
        return 1
    elseif kind == Csimple then
        -- The whole match is pushed last; move it in front of the k - 1
        -- nested values so that it becomes the first result. The rotation
        -- is done in place: it leaves no stale copy behind and does not
        -- depend on '#', which is unreliable when a value is nil.
        local k = pushnestedvalues(cs, true, out, valuetable)
        local values = out.out
        local top = out.outindex
        local whole = values[top]
        for i = top, top - k + 2, -1 do
            values[i] = values[i - 1]
        end
        values[top - k + 1] = whole
        return k
    elseif kind == Cruntime then
        out.outindex = out.outindex + 1
        out.out[out.outindex] = valuetable[cs.ocap[cs.cap].idx]
        cs.cap = cs.cap + 1
        return 1
    elseif kind == Cstring then
        local b = {}
        stringcap(cs, b, valuetable)
        out.outindex = out.outindex + 1
        out.out[out.outindex] = table.concat(b)
        return 1
    elseif kind == Csubst then
        local b = {}
        substcap(cs, b, valuetable)
        out.outindex = out.outindex + 1
        out.out[out.outindex] = table.concat(b)
        return 1
    elseif kind == Cgroup then
        if cs.ocap[cs.cap].idx == 0 then
            -- anonymous group: add all nested values
            return pushnestedvalues(cs, false, out, valuetable)
        else
            -- named group: add no values
            nextcap(cs) -- skip capture
            return 0
        end
    elseif kind == Cbackref then
        return backrefcap(cs, out, valuetable)
    elseif kind == Ctable then
        return tablecap(cs, out, valuetable)
    elseif kind == Cfunction then
        return functioncap(cs, out, valuetable)
    elseif kind == Cnum then
        return numcap(cs, out, valuetable)
    elseif kind == Cquery then
        return querycap(cs, out, valuetable)
    elseif kind == Cfold then
        return foldcap(cs, out, valuetable)
    else
        assert(false)
    end
end


-- Prepare a CapState structure and traverse the entire list of
-- captures pushing its results. 'capture' is the capture list, 's' is the
-- subject string ('stream' is the accessor used when 's' is absent), 'r'
-- is the final position of the match and the extra arguments are the
-- values available to argument captures.
-- Returns the capture values. (If the list produces no values, returns
-- the final position 'r', or nothing when 'r' is nil or false.)

local function getcaptures(capture, s, stream, r, valuetable, ...)
    local n = 0
    local cs = { cap = 0 }
    local out = { outindex = 0, out = {} }
    -- is there any capture?
    if capture[cs.cap].kind ~= Cclose then
        cs.ocap = capture
        cs.s = s
        cs.stream = stream
        cs.ptopcount, cs.ptop = retcount(...)
        repeat -- collect their values
            n = n + pushcapture(cs, out, valuetable)
        until cs.ocap[cs.cap].kind == Cclose
    end
    -- no capture values?
    if n == 0 then
        if not r then
            return
        else
            return r
        end
    end
    assert(out.outindex < 7998, "(too many captures)")
    return unpack(out.out, 1, out.outindex)
end


-- Evaluates the complete captures found between indices 'min' and 'max'
-- of the capture list while a match is still running. After a capture has
-- been evaluated its entries are removed by copying the following entries
-- (up to 'captop') down over them, unless 'notdelete' is set or the
-- capture is a named group whose 'candelete' field is 0.
-- Returns the number of entries removed, the table of values and the
-- number of values in that table.

local function getcapturesruntime(capture, s, stream, notdelete, min, max, captop, valuetable, ...)
    local n = 0
    local cs = { cap = min }
    local out = { outindex = 0, out = {} }
    cs.ocap = capture
    cs.s = s
    cs.stream = stream
    cs.ptopcount, cs.ptop = retcount(...)
    local start = 0
    repeat -- collect their values
        -- stop at the first capture that is not complete within 'max'
        if not checknextcap(cs, max) then break end
        local keep = notdelete or capture[cs.cap].kind == Cgroup and capture[cs.cap].idx ~= 0 and capture[cs.cap].candelete == 0
        pushcapture(cs, out, valuetable)
        if keep then
            start = cs.cap
        else
            n = n + cs.cap - start
            -- move the entries that follow down over the evaluated capture
            for i = 0, captop - cs.cap - 1 do
                ffi.copy(capture + start + i, capture + cs.cap + i, ffi.sizeof('CAPTURE'))
            end
            max = max - (cs.cap - start)
            captop = captop - (cs.cap - start)
            cs.cap = start
        end
    until cs.cap == max
    assert(out.outindex < 7998, "(too many captures)")
    return n, out.out, out.outindex
end

return {
    getcaptures = getcaptures,
    runtimecap = runtimecap,
    getcapturesruntime = getcapturesruntime,
}

-------------------------------------------------------------------------------- /src/lpcode.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | LPEGLJ 3 | lpcode.lua 4 | Generating code from tree 5 | Copyright (C) 2014 Rostislav Sacek. 
6 | based on LPeg v1.0 - PEG pattern matching for Lua 7 | Lua.org & PUC-Rio written by Roberto Ierusalimschy 8 | http://www.inf.puc-rio.br/~roberto/lpeg/ 9 | 10 | ** Permission is hereby granted, free of charge, to any person obtaining 11 | ** a copy of this software and associated documentation files (the 12 | ** "Software"), to deal in the Software without restriction, including 13 | ** without limitation the rights to use, copy, modify, merge, publish, 14 | ** distribute, sublicense, and/or sell copies of the Software, and to 15 | ** permit persons to whom the Software is furnished to do so, subject to 16 | ** the following conditions: 17 | ** 18 | ** The above copyright notice and this permission notice shall be 19 | ** included in all copies or substantial portions of the Software. 20 | ** 21 | ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 22 | ** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23 | ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 24 | ** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 25 | ** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 26 | ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 27 | ** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
**
** [ MIT license: http://www.opensource.org/licenses/mit-license.php ]
--]]
local ffi = require "ffi"
require "lpvm"

-- local aliases for the bit operations used throughout this file
local band, bor, bnot, rshift, lshift = bit.band, bit.bor, bit.bnot, bit.rshift, bit.lshift

-- Tree node tags (compared against 'tree.p[i].tag')
local TChar = 0
local TSet = 1
local TAny = 2 -- standard PEG elements
local TTrue = 3
local TFalse = 4
local TRep = 5
local TSeq = 6
local TChoice = 7
local TNot = 8
local TAnd = 9
local TCall = 10
local TOpenCall = 11
local TRule = 12 -- sib1 is rule's pattern, sib2 is 'next' rule
local TGrammar = 13 -- sib1 is initial (and first) rule
local TBehind = 14 -- match behind
local TCapture = 15 -- regular capture
local TRunTime = 16 -- run-time capture


-- Virtual machine opcodes (stored in 'code.p[i].code')
local IAny = 0 -- if no char, fail
local IChar = 1 -- if char != val, fail
local ISet = 2 -- if char not in val, fail
local ITestAny = 3 -- if no char, jump to 'offset'
local ITestChar = 4 -- if char != val, jump to 'offset'
local ITestSet = 5 -- if char not in val, jump to 'offset'
local ISpan = 6 -- read a span of chars in val
local IBehind = 7 -- walk back 'val' characters (fail if not possible)
local IRet = 8 -- return from a rule
local IEnd = 9 -- end of pattern
local IChoice = 10 -- stack a choice; next fail will jump to 'offset'
local IJmp = 11 -- jump to 'offset'
local ICall = 12 -- call rule at 'offset'
local IOpenCall = 13 -- call rule number 'offset' (must be closed to a ICall)
local ICommit = 14 -- pop choice and jump to 'offset'
local IPartialCommit = 15 -- update top choice to current position and jump
local IBackCommit = 16 -- "fails" but jump to its own 'offset'
local IFailTwice = 17 -- pop one choice and then fail
local IFail = 18 -- go back to saved state on choice and jump to saved offset
local IGiveup = 19 -- internal use
local IFullCapture = 20 -- complete capture of last 'off' chars
local IOpenCapture = 21 -- start a capture
local ICloseCapture = 22
local ICloseRunTime = 23


-- Capture kinds (passed as 'cap' to addinstcap)
local Cclose = 0
local Cposition = 1
local Cconst = 2
local Cbackref = 3
local Carg = 4
local Csimple = 5
local Ctable = 6
local Cfunction = 7
local Cquery = 8
local Cstring = 9
local Cnum = 10
local Csubst = 11
local Cfold = 12
local Cruntime = 13
local Cgroup = 14


-- Properties that checkaux can test
local PEnullable = 0
local PEnofail = 1
-- flag bit tested against a rule's 'cap' field in codegrammar
-- (NOTE(review): presumably marks a left-recursive rule -- confirm)
local RuleLR = 0x10000
-- "no instruction" index; jumptothere ignores negative indexes
local NOINST = -2


-- largest fixed length for which codeand emits IBehind instead of a choice
local MAXBEHINDPREDICATE = 255
-- limit on the number of calls fixedlenx follows before giving up
local MAXRULES = 200
-- largest fixed length for which codecapture emits a single IFullCapture
local MAXOFF = 0xF

-- number of siblings for each tree
-- (indexed by 'tag + 1', because Lua arrays start at 1)
local numsiblings = {
    0, 0, 0, -- char, set, any
    0, 0, -- true, false
    1, -- rep
    2, 2, -- seq, choice
    1, 1, -- not, and
    0, 0, 2, 1, -- call, opencall, rule, grammar
    1, -- behind
    1, 1 -- capture, runtime capture
}


-- FFI types: charsets are 8 words of 32 bits; 'fullset' has every bit set
local patternelement = ffi.typeof('PATTERN_ELEMENT')
local pattern = ffi.typeof('PATTERN')
local settype = ffi.typeof('int32_t[8]')
local fullset = settype(-1, -1, -1, -1, -1, -1, -1, -1)

-- {======================================================
-- Analysis and some optimizations
-- =======================================================

-- forward declaration: the code generators below call 'codegen'
-- before it is defined
local codegen


-- Check whether a charset is empty (IFail), singleton (IChar),
-- full (IAny), or none of those (ISet).
-- 'cs' is a charset stored as 8 words of 32 bits (indexes 0..7).
-- Returns the opcode and, for IChar, the code of the single character
-- (0 for IFail and IAny; no second value for ISet).

local function charsettype(cs)
  local count = 0 -- number of characters known to be in the set
  local candidate = -1 -- index of the word holding the single character
  for i = 0, 7 do
    local word = cs[i]
    if word == 0 then
      -- empty word: harmless while the set is still empty or a singleton
      if count > 1 then
        return ISet
      end
    elseif word == -1 then
      -- full word: set is still full only if every previous word was full
      if count < i * 32 then
        return ISet
      end
      count = count + 32
    elseif band(word, word - 1) == 0 then
      -- word has exactly one bit: fine only if it is the first character
      if count > 0 then
        return ISet -- set is neither full nor empty
      end
      count = count + 1
      candidate = i
    else
      return ISet -- word is neither empty, full, nor singleton
    end
  end
  if count == 0 then
    return IFail, 0 -- empty set
  elseif count == 1 then
    -- singleton; find the character bit inside its word
    local word = cs[candidate]
    for shift = 0, 31 do
      if word == 1 then
        return IChar, candidate * 32 + shift
      end
      word = rshift(word, 1)
    end
    return IChar, candidate * 32
  elseif count == 256 then
    return IAny, 0 -- full set
  else
    assert(false) -- should have returned by now
  end
end


-- A few basic operations on Charsets

-- Complement 'cs' in place.
local function cs_complement(cs)
  for i = 0, 7 do
    cs[i] = bnot(cs[i])
  end
end


-- Returns true when both sets have the same members (nothing otherwise).
local function cs_equal(cs1, cs2)
  for i = 0, 7 do
    if cs1[i] ~= cs2[i] then
      return
    end
  end
  return true
end


-- computes whether sets st1 and st2 are disjoint
-- (returns true if they are, nothing otherwise)

local function cs_disjoint(st1, st2)
  for i = 0, 7 do
    if band(st1[i], st2[i]) ~= 0 then
      return
    end
  end
  return true
end


-- Convert a 'char' pattern (TSet, TChar, TAny) to a charset.
-- Returns a freshly allocated set, or nothing when the node at 'index'
-- is not one of those three kinds.

local function tocharset(tree, index, valuetable)
  local node = tree.p[index]
  local tag = node.tag
  if tag == TSet then
    local set = settype()
    ffi.copy(set, valuetable[node.val], ffi.sizeof(set))
    return set
  elseif tag == TChar then
    local set = settype()
    local ch = node.val
    -- only one char: set its bit (word = ch / 32, bit = ch % 32)
    set[rshift(ch, 5)] = lshift(1, band(ch, 31))
    return set
  elseif tag == TAny then
    local set = settype()
    ffi.fill(set, ffi.sizeof(set), 0xff)
    return set
  end
end


-- checks whether a pattern has captures
--
-- 'visited' is optional and only used by the recursion: it records the
-- rules already entered through a TCall. Without it, a grammar that
-- reaches itself through a call (e.g. a rule that refers to itself
-- inside a predicate, which 'fixedlenx' does not descend into) would
-- make this function recurse forever. Skipping a rule that was already
-- entered is safe because any capture found makes the whole traversal
-- return true immediately.
-- Returns true if a capture is found, nothing otherwise.

local function hascaptures(tree, index, visited)
  visited = visited or {}
  local node = tree.p[index]
  local tag = node.tag
  if tag == TCapture or tag == TRunTime then
    return true
  elseif tag == TCall then
    local rule = index + node.ps
    if visited[rule] then
      return -- rule already examined (or being examined right now)
    end
    visited[rule] = true
    return hascaptures(tree, rule, visited)
  end
  local ns = numsiblings[tag + 1]
  if ns == 0 then
    return
  elseif ns == 1 then
    return hascaptures(tree, index + 1, visited)
  elseif ns == 2 then
    if hascaptures(tree, index + 1, visited) then
      return true
    elseif tag ~= TRule then -- do not follow a rule's sibling (next rule)
      return hascaptures(tree, index + node.ps, visited)
    end
  else
    assert(false)
  end
end


-- Checks how a pattern behaves regarding the empty string,
-- in one of two different ways:
-- A pattern is *nullable* if it can match without consuming any character;
-- A pattern is *nofail* if it never fails for any string
-- (including the empty string).
-- The difference is only for predicates; for patterns without
-- predicates, the two properties are equivalent.
-- (With predicates, &'a' is nullable but not nofail. Of course,
-- nofail => nullable.)
-- These functions are all conservative in the following way:
--   p is nullable => nullable(p)
--   nofail(p) => p cannot fail
-- (The function assumes that TOpenCall and TRunTime are not nullable:
-- TOpenCall must be checked again when the grammar is fixed;
-- TRunTime is an arbitrary choice.)
--
-- 'pred' is PEnullable or PEnofail and 'index' is the position of the
-- node in 'tree.p'. 'lrcall' (optional) remembers the rules already
-- entered through calls whose 'cap' has any of its low 16 bits set, so
-- that such a rule is entered at most once.
-- Returns true when the property holds, nothing otherwise.

local function checkaux(tree, pred, index, lrcall)
  lrcall = lrcall or {}
  local node = tree.p[index]
  local tag = node.tag
  if tag == TChar or tag == TSet or tag == TAny or
     tag == TFalse or tag == TOpenCall then
    return -- not nullable
  elseif tag == TRep or tag == TTrue then
    return true -- no fail
  elseif tag == TNot or tag == TBehind then
    -- can match empty, but may fail
    if pred == PEnofail then
      return
    end
    return true -- PEnullable
  elseif tag == TAnd then
    -- can match empty; fail iff body does
    if pred == PEnullable then
      return true
    end
    return checkaux(tree, pred, index + 1, lrcall)
  elseif tag == TRunTime then
    -- can fail; match empty iff body does
    if pred == PEnofail then
      return
    end
    return checkaux(tree, pred, index + 1, lrcall)
  elseif tag == TSeq then
    -- both parts must have the property
    if checkaux(tree, pred, index + 1, lrcall) then
      return checkaux(tree, pred, index + node.ps, lrcall)
    end
    return
  elseif tag == TChoice then
    -- one alternative is enough; the second one is examined first
    if checkaux(tree, pred, index + node.ps, lrcall) then
      return true
    end
    return checkaux(tree, pred, index + 1, lrcall)
  elseif tag == TCapture or tag == TGrammar or tag == TRule then
    return checkaux(tree, pred, index + 1, lrcall)
  elseif tag == TCall then
    local rule = index + node.ps
    -- left recursive rule
    if band(node.cap, 0xffff) ~= 0 then
      if lrcall[rule] then
        return
      end
      lrcall[rule] = true
    end
    return checkaux(tree, pred, rule, lrcall)
  else
    assert(false)
  end
end


-- number of characters to match a pattern (or -1 if variable)
-- ('count' avoids infinite loops for grammars)
-- 'len' is the length accumulated so far; 'index' is the node position.

local function fixedlenx(tree, count, len, index)
  local node = tree.p[index]
  local tag = node.tag
  if tag == TChar or tag == TSet or tag == TAny then
    return len + 1
  elseif tag == TFalse or tag == TTrue or tag == TNot or tag == TAnd or tag == TBehind then
    return len
  elseif tag == TRep or tag == TRunTime or tag == TOpenCall then
    return -1
  elseif tag == TCapture or tag == TRule or tag == TGrammar then
    return fixedlenx(tree, count, len, index + 1)
  elseif tag == TCall then
    if count >= MAXRULES then
      return -1 -- may be a loop
    end
    return fixedlenx(tree, count + 1, len, index + node.ps)
  elseif tag == TSeq then
    local first = fixedlenx(tree, count, len, index + 1)
    if first < 0 then
      return -1
    end
    return fixedlenx(tree, count, first, index + node.ps)
  elseif tag == TChoice then
    -- both alternatives must have the same fixed length
    local n1 = fixedlenx(tree, count, len, index + 1)
    if n1 < 0 then
      return -1
    end
    local n2 = fixedlenx(tree, count, len, index + node.ps)
    if n1 ~= n2 then
      return -1
    end
    return n1
  else
    assert(false)
  end
end


-- Computes the 'first set' of a pattern.
-- The result is a conservative approximation:
--   match p ax -> x' for some x ==> a in first(p).
--   match p '' -> '' ==> returns 1.
-- The set 'follow' is the first set of what follows the
-- pattern (full set if nothing follows it)
--
-- Returns two values: a number 'e' and a freshly allocated charset.
-- 'e' is 0 when the set can be used for a test instruction; it has
-- bit 1 set when the pattern can accept the empty string and bit 2
-- set when the pattern contains a match-time capture.
-- 'lrcall' (optional) plays the same role as in 'checkaux'.

local function getfirst(tree, follow, index, valuetable, lrcall)
  lrcall = lrcall or {}
  local node = tree.p[index]
  local tag = node.tag
  if tag == TChar or tag == TSet or tag == TAny then
    return 0, tocharset(tree, index, valuetable)
  elseif tag == TTrue then
    local set = settype()
    ffi.copy(set, follow, ffi.sizeof(set))
    return 1, set
  elseif tag == TFalse then
    return 0, settype()
  elseif tag == TChoice then
    -- union of the first sets of both alternatives
    local e1, set = getfirst(tree, follow, index + 1, valuetable, lrcall)
    local e2, other = getfirst(tree, follow, index + node.ps, valuetable, lrcall)
    for i = 0, 7 do
      set[i] = bor(set[i], other[i])
    end
    return bor(e1, e2), set
  elseif tag == TSeq then
    if not checkaux(tree, PEnullable, index + 1) then
      return getfirst(tree, fullset, index + 1, valuetable, lrcall)
    end
    -- FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl))
    local e2, other = getfirst(tree, follow, index + node.ps, valuetable, lrcall)
    local e1, set = getfirst(tree, other, index + 1, valuetable, lrcall)
    if e1 == 0 then
      return 0, set -- 'e1' ensures that first can be used
    elseif band(bor(e1, e2), 2) == 2 then
      return 2, set -- one of the children has a match-time capture
    end
    return e2, set -- else depends on 'e2'
  elseif tag == TRep then
    local _, set = getfirst(tree, follow, index + 1, valuetable, lrcall)
    for i = 0, 7 do
      set[i] = bor(set[i], follow[i])
    end
    return 1, set -- accept the empty string
  elseif tag == TCapture or tag == TGrammar or tag == TRule then
    return getfirst(tree, follow, index + 1, valuetable, lrcall)
  elseif tag == TRunTime then
    -- function invalidates any follow info.
    local e, set = getfirst(tree, fullset, index + 1, valuetable, lrcall)
    if e ~= 0 then
      return 2, set -- function is not "protected"?
    end
    return 0, set -- pattern inside capture ensures first can be used
  elseif tag == TCall then
    local rule = index + node.ps
    -- left recursive rule
    if band(node.cap, 0xffff) ~= 0 then
      if lrcall[rule] then
        return 0, settype()
      end
      lrcall[rule] = true
    end
    return getfirst(tree, follow, rule, valuetable, lrcall)
  elseif tag == TAnd then
    local e, set = getfirst(tree, follow, index + 1, valuetable, lrcall)
    for i = 0, 7 do
      set[i] = band(set[i], follow[i])
    end
    return e, set
  elseif tag == TNot then
    local simple = tocharset(tree, index + 1, valuetable)
    if simple then
      cs_complement(simple)
      return 1, simple
    end
    -- instruction gives no new information
    local e, set = getfirst(tree, follow, index + 1, valuetable, lrcall)
    ffi.copy(set, follow, ffi.sizeof(set))
    return bor(e, 1), set -- always can accept the empty string
  elseif tag == TBehind then
    -- instruction gives no new information;
    -- call 'getfirst' to check for match-time captures
    local e, set = getfirst(tree, follow, index + 1, valuetable, lrcall)
    ffi.copy(set, follow, ffi.sizeof(set))
    return bor(e, 1), set -- always can accept the empty string
  else
    assert(false)
  end
end


-- If it returns true, then pattern can fail only depending on the next
-- character of the subject
-- (returns nothing otherwise; 'lrcall' as in 'checkaux')

local function headfail(tree, index, lrcall)
  lrcall = lrcall or {}
  local node = tree.p[index]
  local tag = node.tag
  if tag == TChar or tag == TSet or tag == TAny or tag == TFalse then
    return true
  elseif tag == TTrue or tag == TRep or tag == TRunTime or tag == TNot or tag == TBehind then
    return
  elseif tag == TCapture or tag == TGrammar or tag == TRule or tag == TAnd then
    return headfail(tree, index + 1, lrcall)
  elseif tag == TCall then
    local rule = index + node.ps
    -- left recursive rule
    if band(node.cap, 0xffff) ~= 0 then
      if lrcall[rule] then
        return true
      end
      lrcall[rule] = true
    end
    return headfail(tree, rule, lrcall)
  elseif tag == TSeq then
    -- the second part must never fail, so only the first one matters
    if checkaux(tree, PEnofail, index + node.ps) then
      return headfail(tree, index + 1, lrcall)
    end
    return
  elseif tag == TChoice then
    -- both alternatives must be headfail
    if headfail(tree, index + 1, lrcall) then
      return headfail(tree, index + node.ps, lrcall)
    end
    return
  else
    assert(false)
  end
end


-- Check whether the code generation for the given tree can benefit
-- from a follow set (to avoid computing the follow set when it is
-- not needed)
-- Returns true if it can, nothing otherwise.

local function needfollow(tree, index)
  -- walk down captures and the tail of sequences without recursion
  while true do
    local node = tree.p[index]
    local tag = node.tag
    if tag == TChoice or tag == TRep then
      return true
    elseif tag == TCapture then
      index = index + 1
    elseif tag == TSeq then
      index = index + node.ps
    elseif tag == TChar or tag == TSet or tag == TAny or tag == TFalse or tag == TTrue or
           tag == TAnd or tag == TNot or tag == TRunTime or tag == TGrammar or
           tag == TCall or tag == TBehind then
      return
    else
      assert(false)
    end
  end
end

-- ======================================================


-- {======================================================
-- Code generation
-- =======================================================


-- code generation is recursive; 'opt' indicates that the code is
-- being generated under a 'IChoice' operator jumping to its end.
-- 'tt' points to a previous test protecting this code. 'fl' is
-- the follow set of the pattern.


-- Append one instruction with opcode 'op' and value 'val' to 'code',
-- growing the buffer when it is full. Returns the index of the new
-- instruction.
local function addinstruction(code, op, val)
  local pos = code.size
  if pos >= code.allocsize then
    code:doublesize()
  end
  local inst = code.p[pos]
  inst.code = op
  inst.val = val
  code.size = pos + 1
  return pos
end


-- Store 'offset' in the instruction at index 'instruction'.
local function setoffset(code, instruction, offset)
  code.p[instruction].offset = offset
end


-- Add a capture instruction:
-- 'op' is the capture instruction; 'cap' the capture kind;
-- 'key' the key into ktable; 'aux' is optional offset
-- ('aux' is packed above the low 4 bits of the value; 'key' goes into
-- the offset field). Returns the index of the new instruction.

local function addinstcap(code, op, cap, key, aux)
  local pos = addinstruction(code, op, bor(cap, lshift(aux, 4)))
  setoffset(code, pos, key)
  return pos
end


-- Make the instruction at index 'instruction' jump to 'target'
-- (offsets are relative). Negative indexes such as NOINST are ignored.
local function jumptothere(code, instruction, target)
  if instruction < 0 then
    return
  end
  setoffset(code, instruction, target - instruction)
end


-- Make the instruction at index 'instruction' jump to the current end
-- of the code.
local function jumptohere(code, instruction)
  jumptothere(code, instruction, code.size)
end


-- Code an IChar instruction, or IAny if there is an equivalent
-- test dominating it

-- 'c' is the character code; 'tt' is the index of the test instruction
-- protecting this code (or a negative value when there is none).
local function codechar(code, c, tt)
  assert(tt ~= -1)
  local op, val = IChar, c
  if tt >= 0 then
    local test = code.p[tt]
    if test.code == ITestChar and test.val == c then
      -- the test already checked this character: just consume it
      op, val = IAny, 0
    end
  end
  addinstruction(code, op, val)
end


-- Code an ISet instruction
-- ('cs' is appended to 'valuetable' and the instruction's value is its
-- position there). Returns the index of the new instruction.

local function coderealcharset(code, cs, valuetable)
  local key = #valuetable + 1
  valuetable[key] = cs
  return addinstruction(code, ISet, key)
end


-- code a char set, optimizing unit sets for IChar, "complete"
-- sets for IAny, and empty sets for IFail; also use an IAny
-- when instruction is dominated by an equivalent test.

local function codecharset(code, cs, tt, valuetable)
  local op, c = charsettype(cs)
  if op == IChar then
    codechar(code, c, tt)
    return
  end
  if op ~= ISet then
    addinstruction(code, op, c) -- IAny or IFail
    return
  end
  assert(tt ~= -1)
  local dominated = tt >= 0 and code.p[tt].code == ITestSet and
      cs_equal(cs, valuetable[code.p[tt].val])
  if dominated then
    addinstruction(code, IAny, 0)
  else
    coderealcharset(code, cs, valuetable)
  end
end


-- code a test set, optimizing unit sets for ITestChar, "complete"
-- sets for ITestAny, and empty sets for IJmp (always fails).
-- 'e' is true iff test should accept the empty string. (Test
-- instructions in the current VM never accept the empty string.)

-- Returns the index of the test instruction, or NOINST when no test
-- was generated ('e' is compared against 0).
local function codetestset(code, cs, e, valuetable)
  if e ~= 0 then
    return NOINST -- no test
  end
  local pos = code.size
  codecharset(code, cs, NOINST, valuetable)
  -- turn the instruction just emitted into the corresponding test
  local inst = code.p[pos]
  local op = inst.code
  if op == IFail then
    inst.code = IJmp -- always jump
  elseif op == IAny then
    inst.code = ITestAny
  elseif op == IChar then
    inst.code = ITestChar
  elseif op == ISet then
    inst.code = ITestSet
  else
    assert(false)
  end
  return pos
end


-- Find the final destination of a sequence of jumps

local function finaltarget(code, i)
  local target = i
  while code.p[target].code == IJmp do
    target = target + code.p[target].offset
  end
  return target
end


-- final label (after traversing any jumps)

local function finallabel(code, i)
  local label = i + code.p[i].offset
  return finaltarget(code, label)
end


-- code for behind(p) == behind n; code(p)   (where n = fixedlen(p))
-- ('n' is read from the 'val' field of the node at 'index')

local function codebehind(code, tree, index, valuetable)
  local n = tree.p[index].val
  if n > 0 then
    addinstruction(code, IBehind, n)
  end
  codegen(code, tree, fullset, false, NOINST, index + 1, valuetable)
end


-- Choice; optimizations:
-- - when p1 is headfail
-- - when first(p1) and first(p2) are disjoint; then
-- a character not in first(p1) cannot go to p1, and a character
-- in first(p1) cannot go to p2 (as it is not in first(p2)).
-- (The optimization is not valid if p1 accepts the empty string,
-- as then there is no character at all...)
-- - when p2 is empty and opt is true; an IPartialCommit can reuse
-- the Choice already active in the stack.
-- 'p1' and 'p2' are the positions of the two alternatives in 'tree.p'.

local function codechoice(code, tree, fl, opt, p1, p2, valuetable)
  local emptyp2 = tree.p[p2].tag == TTrue
  local e1, first1 = getfirst(tree, fullset, p1, valuetable)
  local _, first2 = getfirst(tree, fl, p2, valuetable)
  local testonly = headfail(tree, p1) or (e1 == 0 and cs_disjoint(first1, first2))
  if testonly then
    -- code(p1 / p2) ==
    --   test (fail(p1)) -> L1 ; code(p1) ; jmp L2; L1: code(p2); L2:
    local test = codetestset(code, first1, 0, valuetable)
    codegen(code, tree, fl, false, test, p1, valuetable)
    local jmp = NOINST
    if not emptyp2 then
      jmp = addinstruction(code, IJmp, 0)
    end
    jumptohere(code, test)
    codegen(code, tree, fl, opt, NOINST, p2, valuetable)
    jumptohere(code, jmp)
  elseif opt and emptyp2 then
    -- p1? == IPartialCommit; code(p1)
    local pcommit = addinstruction(code, IPartialCommit, 0)
    jumptohere(code, pcommit)
    codegen(code, tree, fullset, true, NOINST, p1, valuetable)
  else
    -- code(p1 / p2) ==
    --   test(fail(p1)) -> L1; choice L1; code(p1); commit L2;
    --   L1: code(p2); L2:
    local test = codetestset(code, first1, e1, valuetable)
    local pchoice = addinstruction(code, IChoice, 0)
    codegen(code, tree, fullset, emptyp2, test, p1, valuetable)
    local pcommit = addinstruction(code, ICommit, 0)
    jumptohere(code, pchoice)
    jumptohere(code, test)
    codegen(code, tree, fl, opt, NOINST, p2, valuetable)
    jumptohere(code, pcommit)
  end
end


-- And predicate
-- optimization: fixedlen(p) = n ==> code(&p) == code(p); behind n
-- (valid only when 'p' has no captures)
-- 'index' is the position of the predicate's body.

local function codeand(code, tree, tt, index, valuetable)
  local n = fixedlenx(tree, 0, 0, index)
  local usebehind = n >= 0 and n <= MAXBEHINDPREDICATE and not hascaptures(tree, index)
  if usebehind then
    codegen(code, tree, fullset, false, tt, index, valuetable)
    if n > 0 then
      addinstruction(code, IBehind, n)
    end
    return
  end
  -- default: Choice L1; p1; BackCommit L2; L1: Fail; L2:
  local pchoice = addinstruction(code, IChoice, 0)
  codegen(code, tree, fullset, false, tt, index, valuetable)
  local pcommit = addinstruction(code, IBackCommit, 0)
  jumptohere(code, pchoice)
  addinstruction(code, IFail, 0)
  jumptohere(code, pcommit)
end


-- Captures: if pattern has fixed (and not too big) length, use
-- a single IFullCapture instruction after the match; otherwise,
-- enclose the pattern with OpenCapture - CloseCapture.
-- 'index' is the position of the capture node (its body is at index + 1).

local function codecapture(code, tree, fl, tt, index, valuetable)
  local body = index + 1
  local len = fixedlenx(tree, 0, 0, body)
  local node = tree.p[index]
  if len >= 0 and len <= MAXOFF and not hascaptures(tree, body) then
    codegen(code, tree, fl, false, tt, body, valuetable)
    addinstcap(code, IFullCapture, node.cap, node.val, len)
  else
    addinstcap(code, IOpenCapture, node.cap, node.val, 0)
    codegen(code, tree, fl, false, tt, body, valuetable)
    addinstcap(code, ICloseCapture, Cclose, 0, 0)
  end
end


-- Run-time capture: the body is enclosed by an open group capture and
-- an ICloseRunTime instruction.
local function coderuntime(code, tree, tt, index, valuetable)
  local key = tree.p[index].val
  addinstcap(code, IOpenCapture, Cgroup, key, 0)
  codegen(code, tree, fullset, false, tt, index + 1, valuetable)
  addinstcap(code, ICloseRunTime, Cclose, 0, 0)
end


-- Repetition; optimizations:
-- When pattern is a charset, can use special instruction ISpan.
-- When pattern is head fail, or if it starts with characters that
-- are disjoint from what follows the repetitions, a simple test
-- is enough (a fail inside the repetition would backtrack to fail
-- again in the following pattern, so there is no need for a choice).
-- When 'opt' is true, the repetition can reuse the Choice already
-- active in the stack.
-- 'index' is the position of the repeated pattern.

local function coderep(code, tree, opt, fl, index, valuetable)
  local span = tocharset(tree, index, valuetable)
  if span then
    -- plain charset: emit the set and turn it into a span
    local pos = coderealcharset(code, span, valuetable)
    code.p[pos].code = ISpan
    return
  end
  local e1, first = getfirst(tree, fullset, index, valuetable)
  if headfail(tree, index) or (e1 == 0 and cs_disjoint(first, fl)) then
    -- L1: test (fail(p1)) -> L2; code(p); jmp L1; L2:
    local test = codetestset(code, first, 0, valuetable)
    codegen(code, tree, fullset, false, test, index, valuetable)
    local jmp = addinstruction(code, IJmp, 0)
    jumptohere(code, test)
    jumptothere(code, jmp, test)
    return
  end
  -- test(fail(p1)) -> L2; choice L2; L1: code(p); partialcommit L1; L2:
  -- or (if 'opt'): partialcommit L1; L1: code(p); partialcommit L1;
  local test = codetestset(code, first, e1, valuetable)
  local pchoice = NOINST
  if opt then
    local pcommit = addinstruction(code, IPartialCommit, 0)
    jumptohere(code, pcommit)
  else
    pchoice = addinstruction(code, IChoice, 0)
  end
  local loopstart = code.size
  codegen(code, tree, fullset, false, NOINST, index, valuetable)
  local commit = addinstruction(code, IPartialCommit, 0)
  jumptothere(code, commit, loopstart)
  jumptohere(code, pchoice)
  jumptohere(code, test)
end


-- Not predicate; optimizations:
-- In any case, if first test fails, 'not' succeeds, so it can jump to
-- the end. If pattern is headfail, that is all (it cannot fail
-- in other parts); this case includes 'not' of simple sets. Otherwise,
-- use the default code (a choice plus a failtwice).
-- 'index' is the position of the negated pattern.

local function codenot(code, tree, index, valuetable)
  local e, first = getfirst(tree, fullset, index, valuetable)
  local test = codetestset(code, first, e, valuetable)
  if headfail(tree, index) then
    -- test (fail(p1)) -> L1; fail; L1:
    addinstruction(code, IFail, 0)
  else
    -- test(fail(p))-> L1; choice L1; code(p); failtwice; L1:
    local pchoice = addinstruction(code, IChoice, 0)
    codegen(code, tree, fullset, false, NOINST, index, valuetable)
    addinstruction(code, IFailTwice, 0)
    jumptohere(code, pchoice)
  end
  jumptohere(code, test)
end


-- change open calls to calls, using list 'positions' to find
-- correct offsets; also optimize tail calls
-- (instructions in the range [from, to) are examined)

local function correctcalls(code, positions, from, to)
  for i = from, to - 1 do
    local inst = code.p[i]
    if inst.code == IOpenCall then
      local target = positions[inst.offset] -- offset holds the rule number
      assert(target == from or code.p[target - 1].code == IRet)
      -- call; ret ?  (only when the low 16 bits of 'val' are zero)
      local tailcall = band(inst.val, 0xffff) == 0 and
          code.p[finaltarget(code, i + 1)].code == IRet
      if tailcall then
        inst.code = IJmp -- tail call
      else
        inst.code = ICall
      end
      jumptothere(code, i, target) -- call jumps to respective rule
    end
  end
end


-- Code for a grammar:
-- call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2:
-- 'index' is the position of the TGrammar node; its first rule is at
-- index + 1.

local function codegrammar(code, tree, index, valuetable)
  local positions = {} -- code position of each rule, in order
  local rule = index + 1
  assert(tree.p[rule].tag == TRule)
  local lrflag = 0
  if band(RuleLR, tree.p[rule].cap) ~= 0 then
    lrflag = 1
  end
  local firstcall = addinstruction(code, ICall, lrflag) -- call initial rule
  code.p[firstcall].aux = tree.p[rule].val
  local jumptoend = addinstruction(code, IJmp, 0) -- jump to the end
  jumptohere(code, firstcall) -- here starts the initial rule
  while tree.p[rule].tag == TRule do
    positions[#positions + 1] = code.size -- save rule position
    codegen(code, tree, fullset, false, NOINST, rule + 1, valuetable) -- code rule
    addinstruction(code, IRet, 0)
    rule = rule + tree.p[rule].ps
  end
  assert(tree.p[rule].tag == TTrue)
  jumptohere(code, jumptoend)
  correctcalls(code, positions, firstcall + 2, code.size)
end


-- Emit an open call for the TCall node at 'index'; 'val' is stored in
-- the instruction's 'aux' field and the offset temporarily holds the
-- number of the called rule (fixed later by 'correctcalls').
local function codecall(code, tree, index, val)
  local node = tree.p[index]
  local pos = addinstruction(code, IOpenCall, node.cap) -- to be corrected later
  code.p[pos].aux = val
  local rule = tree.p[index + node.ps]
  assert(rule.tag == TRule)
  setoffset(code, pos, band(rule.cap, 0x7fff)) -- offset = rule number
end


-- Sequence: code 'p1' and then 'p2'. The test 'tt' stays valid for 'p2'
-- only when 'p1' has fixed length zero.
local function codeseq(code, tree, fl, opt, tt, p1, p2, valuetable)
  local follow1 = fullset -- use 'fullset' as follow by default
  if needfollow(tree, p1) then
    local _, first2 = getfirst(tree, fl, p2, valuetable)
    follow1 = first2 -- p1 follow is p2 first
  end
  codegen(code, tree, follow1, false, tt, p1, valuetable)
  -- can p1 consume anything?
  if fixedlenx(tree, 0, 0, p1) ~= 0 then
    tt = NOINST -- invalidate test
  end
  return codegen(code, tree, fl, opt, tt, p2, valuetable)
end


-- Main code-generation function: dispatch to auxiliary functions
-- according to kind of tree

-- code generation is recursive; 'opt' indicates that the code is being
-- generated as the last thing inside an optional pattern (so, if that
-- code is optional too, it can reuse the 'IChoice' already in place for
-- the outer pattern). 'tt' points to a previous test protecting this
-- code (or NOINST). 'fl' is the follow set of the pattern.
function codegen(code, tree, fl, opt, tt, index, valuetable)
    local tag = tree.p[index].tag
    local first = index + 1 -- position of the node's first sibling

    -- leaves
    if tag == TChar then
        return codechar(code, tree.p[index].val, tt)
    end
    if tag == TAny then
        return addinstruction(code, IAny, 0)
    end
    if tag == TSet then
        return codecharset(code, valuetable[tree.p[index].val], tt, valuetable)
    end
    if tag == TTrue then
        return -- nothing to emit
    end
    if tag == TFalse then
        return addinstruction(code, IFail, 0)
    end

    -- binary nodes: second sibling is found through 'ps'
    if tag == TSeq then
        return codeseq(code, tree, fl, opt, tt, first, index + tree.p[index].ps, valuetable)
    end
    if tag == TChoice then
        return codechoice(code, tree, fl, opt, first, index + tree.p[index].ps, valuetable)
    end

    -- unary nodes
    if tag == TRep then
        return coderep(code, tree, opt, fl, first, valuetable)
    end
    if tag == TBehind then
        return codebehind(code, tree, index, valuetable)
    end
    if tag == TNot then
        return codenot(code, tree, first, valuetable)
    end
    if tag == TAnd then
        return codeand(code, tree, tt, first, valuetable)
    end
    if tag == TCapture then
        return codecapture(code, tree, fl, tt, index, valuetable)
    end
    if tag == TRunTime then
        return coderuntime(code, tree, tt, index, valuetable)
    end

    -- grammars and calls
    if tag == TGrammar then
        return codegrammar(code, tree, index, valuetable)
    end
    if tag == TCall then
        return codecall(code, tree, index, tree.p[index].val)
    end

    assert(false) -- no other tag can be compiled
end


-- Optimize jumps and other jump-like instructions.
-- * Update labels of instructions with labels to their final
-- destinations (e.g., choice L1; ...
-- L1: jmp L2: becomes
-- choice L2)
-- * Jumps to other instructions that do jumps become those
-- instructions (e.g., jump to return becomes a return; jump
-- to commit becomes a commit)

local function peephole(code)
    local elemsize = ffi.sizeof(patternelement)
    local pc = 0
    while pc < code.size do
        local op = code.p[pc].code
        if op == IChoice or op == ICall or op == ICommit or op == IPartialCommit or
                op == IBackCommit or op == ITestChar or op == ITestSet or op == ITestAny then
            -- instructions with labels
            jumptothere(code, pc, finallabel(code, pc)) -- optimize label

        elseif op == IJmp then
            local dest = finaltarget(code, pc)
            local destop = code.p[dest].code -- jumping to what?
            if destop == IRet or destop == IFail or destop == IFailTwice or destop == IEnd then
                -- instructions with unconditional implicit jumps:
                -- jump becomes that instruction
                ffi.copy(code.p + pc, code.p + dest, elemsize)
            elseif destop == ICommit or destop == IPartialCommit or destop == IBackCommit then
                -- inst. with unconditional explicit jumps
                local label = finallabel(code, dest)
                ffi.copy(code.p + pc, code.p + dest, elemsize) -- jump becomes that instruction...
                jumptothere(code, pc, label) -- but must correct its offset
                pc = pc - 1 -- stay on this instruction: reoptimize its label
            else
                jumptothere(code, pc, dest) -- optimize label
            end
        end
        pc = pc + 1
    end
end


-- Compile a pattern: generate code for the tree rooted at 'index',
-- terminate it with IEnd, run the peephole pass and store the result in
-- 'tree.code' (the previous code buffer is freed first).

local function compile(tree, index, valuetable)
    local program = pattern()
    codegen(program, tree, fullset, false, NOINST, index, valuetable)
    addinstruction(program, IEnd, 0)
    peephole(program)
    ffi.C.free(tree.code)
    tree.code = program
end

-- '__new' handler for PATTERN: allocates the header and a zero-filled
-- instruction buffer with room for at least 10 elements.

local function pat_new(ct, size)
    size = size or 0
    local capacity = size < 10 and 10 or size
    local pat = ffi.cast('PATTERN*', ffi.C.malloc(ffi.sizeof(pattern)))
    assert(pat ~= nil)
    pat.allocsize = capacity
    pat.size = size
    local bytes = ffi.sizeof(patternelement) * capacity
    pat.p = ffi.C.malloc(bytes)
    assert(pat.p ~= nil)
    ffi.fill(pat.p, bytes)
    return pat
end

-- Double the capacity of the instruction buffer; the newly added half
-- is zero-filled.

local function doublesize(ct)
    local oldcount = ct.allocsize
    local elemsize = ffi.sizeof(patternelement)
    ct.p = ffi.C.realloc(ct.p, elemsize * oldcount * 2)
    assert(ct.p ~= nil)
    ffi.fill(ct.p + oldcount, elemsize * oldcount)
    ct.allocsize = oldcount * 2
end

local pattreg = {
    doublesize = doublesize,
}

local metareg = {
    __new = pat_new,
    __index = pattreg,
}

ffi.metatype(pattern, metareg)

return {
    checkaux = checkaux,
    tocharset = tocharset,
    fixedlenx = fixedlenx,
    hascaptures = hascaptures,
    compile = compile,
}
--------------------------------------------------------------------------------
/src/lpeglj.lua:
--------------------------------------------------------------------------------
--[[
LPEGLJ
lpeglj.lua
Main module and tree generation
Copyright (C) 2014 Rostislav Sacek. 6 | based on LPeg v1.0 - PEG pattern matching for Lua 7 | Lua.org & PUC-Rio written by Roberto Ierusalimschy 8 | http://www.inf.puc-rio.br/~roberto/lpeg/ 9 | 10 | ** Permission is hereby granted, free of charge, to any person obtaining 11 | ** a copy of this software and associated documentation files (the 12 | ** "Software"), to deal in the Software without restriction, including 13 | ** without limitation the rights to use, copy, modify, merge, publish, 14 | ** distribute, sublicense, and/or sell copies of the Software, and to 15 | ** permit persons to whom the Software is furnished to do so, subject to 16 | ** the following conditions: 17 | ** 18 | ** The above copyright notice and this permission notice shall be 19 | ** included in all copies or substantial portions of the Software. 20 | ** 21 | ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 22 | ** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23 | ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 24 | ** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 25 | ** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 26 | ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 27 | ** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
**
** [ MIT license: http://www.opensource.org/licenses/mit-license.php ]
--]]

assert(jit.version_num > 20000, "Use LuaJIT v2.0.1 or higher.")

local ffi = require "ffi"
local lpcode = require "lpcode"
local lpprint = require "lpprint"
local lpvm = require "lpvm"

local band, bor, bnot, rshift, lshift = bit.band, bit.bor, bit.bnot, bit.rshift, bit.lshift

ffi.cdef [[
  int isalnum(int c);
  int isalpha(int c);
  int iscntrl(int c);
  int isdigit(int c);
  int isgraph(int c);
  int islower(int c);
  int isprint(int c);
  int ispunct(int c);
  int isspace(int c);
  int isupper(int c);
  int isxdigit(int c);
]]

local MAXBEHIND = 255
local MAXRULES = 200
local VERSION = "1.0.0.0LJ"

-- tree node tags
local TChar = 0
local TSet = 1
local TAny = 2 -- standard PEG elements
local TTrue = 3
local TFalse = 4
local TRep = 5
local TSeq = 6
local TChoice = 7
local TNot = 8
local TAnd = 9
local TCall = 10
local TOpenCall = 11
local TRule = 12 -- sib1 is rule's pattern, sib2 is 'next' rule
local TGrammar = 13 -- sib1 is initial (and first) rule
local TBehind = 14 -- match behind
local TCapture = 15 -- regular capture
local TRunTime = 16 -- run-time capture

-- virtual machine opcodes
local IAny = 0 -- if no char, fail
local IChar = 1 -- if char != val, fail
local ISet = 2 -- if char not in val, fail
local ITestAny = 3 -- if no char, jump to 'offset'
local ITestChar = 4 -- if char != val, jump to 'offset'
local ITestSet = 5 -- if char not in val, jump to 'offset'
local ISpan = 6 -- read a span of chars in val
local IBehind = 7 -- walk back 'val' characters (fail if not possible)
local IRet = 8 -- return from a rule
local IEnd = 9 -- end of pattern
local IChoice = 10 -- stack a choice; next fail will jump to 'offset'
local IJmp = 11 -- jump to 'offset'
local ICall = 12 -- call rule at 'offset'
local IOpenCall = 13 -- call rule number 'offset' (must be closed to a ICall)
local ICommit = 14 -- pop choice and jump to 'offset'
local IPartialCommit = 15 -- update top choice to current position and jump
local IBackCommit = 16 -- "fails" but jump to its own 'offset'
local IFailTwice = 17 -- pop one choice and then fail
local IFail = 18 -- go back to saved state on choice and jump to saved offset
local IGiveup = 19 -- internal use
local IFullCapture = 20 -- complete capture of last 'off' chars
local IOpenCapture = 21 -- start a capture
local ICloseCapture = 22
local ICloseRunTime = 23

-- capture kinds
local Cclose = 0
local Cposition = 1
local Cconst = 2
local Cbackref = 3
local Carg = 4
local Csimple = 5
local Ctable = 6
local Cfunction = 7
local Cquery = 8
local Cstring = 9
local Cnum = 10
local Csubst = 11
local Cfold = 12
local Cruntime = 13
local Cgroup = 14

local PEnullable = 0
local PEnofail = 1
local PEleftrecursion = 2

local newgrammar

local RuleLR = 0x10000
local Ruleused = 0x20000
local BCapcandelete = 0x30000

local LREnable = false

-- number of siblings for each tree
local numsiblings = {
    0, 0, 0, -- char, set, any
    0, 0, -- true, false
    1, -- rep
    2, 2, -- seq, choice
    1, 1, -- not, and
    0, 0, 2, 1, -- call, opencall, rule, grammar
    1, -- behind
    1, 1 -- capture, runtime capture
}



local patternid = 0
local valuetable = {}

local funcnames = setmetatable({}, { __mode = 'k' })

local treepatternelement = ffi.typeof('TREEPATTERN_ELEMENT')
local treepattern = ffi.typeof('TREEPATTERN')
local patternelement = ffi.typeof('PATTERN_ELEMENT')
local pattern = ffi.typeof('PATTERN')
local settype = ffi.typeof('int32_t[8]')
local uint32 = ffi.typeof('uint32_t[1]')

-- Fix a TOpenCall into a TCall node, using table 'postable' to
-- translate a key to its rule address in the tree. Raises an
-- error if key does not exist.
-- The target rule is flagged with 'Ruleused'.

local function fixonecall(postable, grammar, index, valuetable)
    local name = valuetable[grammar.p[index].val] -- get rule's name
    local n = postable[name] -- query name in position table
    -- no position?
    if not n then
        -- rule names may be any Lua value; '%s' rejects non-string,
        -- non-number arguments on LuaJIT 2.0, so convert explicitly
        local shown = type(name) == 'table' and '(a table)' or tostring(name)
        error(("rule '%s' undefined in given grammar"):format(shown), 0)
    end
    grammar.p[index].tag = TCall;
    grammar.p[index].ps = n - index -- position relative to node
    local rule = index + grammar.p[index].ps
    grammar.p[rule].cap = bit.bor(grammar.p[rule].cap, Ruleused)
end


-- Transform left associative constructions into right
-- associative ones, for sequence and choice; that is:
-- (t11 + t12) + t2 => t11 + (t12 + t2)
-- (t11 * t12) * t2 => t11 * (t12 * t2)
-- (that is, Op (Op t11 t12) t2 => Op t11 (Op t12 t2))

local function correctassociativity(tree, index)
    local t1 = index + 1
    assert(tree.p[index].tag == TChoice or tree.p[index].tag == TSeq)
    while tree.p[t1].tag == tree.p[index].tag do
        local n1size = tree.p[index].ps - 1; -- t1 == Op t11 t12
        local n11size = tree.p[t1].ps - 1;
        local n12size = n1size - n11size - 1
        -- move t11 one slot down; source and destination overlap, so the
        -- nodes are copied one at a time, front to back
        for i = 1, n11size do
            ffi.copy(tree.p + index + i, tree.p + t1 + i, ffi.sizeof(treepatternelement))
        end
        tree.p[index].ps = n11size + 1
        -- the freed slot becomes the new inner operator node
        tree.p[index + tree.p[index].ps].tag = tree.p[index].tag
        tree.p[index + tree.p[index].ps].ps = n12size + 1
    end
end


-- Make final adjustments in a tree.
-- Fix open calls in tree,
-- making them refer to their respective rules or raising appropriate
-- errors (if not inside a grammar). Correct associativity of associative
-- constructions (making them right associative).

local function finalfix(fix, postable, grammar, index, valuetable)
    local tag = grammar.p[index].tag
    if tag == TGrammar then
        return -- subgrammars were already fixed
    end

    if tag == TOpenCall then
        if not fix then
            -- open call outside grammar
            error(("rule '%s' used outside a grammar"):format(tostring(valuetable[grammar.p[index].val])), 0)
        end
        fixonecall(postable, grammar, index, valuetable)
    elseif tag == TSeq or tag == TChoice then
        correctassociativity(grammar, index)
    end

    -- descend into the siblings (counted from the tag read on entry)
    local siblings = numsiblings[tag + 1]
    if siblings == 1 then
        return finalfix(fix, postable, grammar, index + 1, valuetable)
    elseif siblings == 2 then
        finalfix(fix, postable, grammar, index + 1, valuetable)
        return finalfix(fix, postable, grammar, index + grammar.p[index].ps, valuetable)
    elseif siblings ~= 0 then
        assert(false)
    end
end


-- {======================================================
-- Tree generation
-- =======================================================

-- Create a one-node TSet tree whose value-table slot 1 holds a fresh
-- charset; returns the tree and that charset.

local function newcharset()
    local tree = treepattern(1)
    local set = settype()
    valuetable[tree.id] = { set }
    tree.p[0].tag = TSet
    tree.p[0].val = 1
    return tree, set
end


-- add to tree a sequence where first sibling is 'sib' (with size
-- 'sibsize')

local function seqaux(tree, sib, start, sibsize)
    local node = start
    tree.p[node].tag = TSeq
    tree.p[node].ps = sibsize + 1
    ffi.copy(tree.p + node + 1, sib.p, ffi.sizeof(treepatternelement) * sibsize)
end


-- Build a sequence of 'n' nodes, each with tag
-- 'tag' and 'val' got
-- from the array 's' (or 0 if array is NULL). (TSeq is binary, so it
-- must build a sequence of sequence of sequence...)

local function fillseq(tree, tag, start, n, s)
    -- initial n-1 copies of Seq tag; Seq ...
    for i = 1, n - 1 do
        tree.p[start].tag = TSeq
        tree.p[start].ps = 2
        tree.p[start + 1].tag = tag
        if s then
            tree.p[start + 1].val = s:byte(i)
        end
        start = start + tree.p[start].ps
    end
    tree.p[start].tag = tag -- last one does not need TSeq
    if s then
        tree.p[start].val = s:byte(n)
    end
end


-- Numbers as patterns:
-- 0 == true (always match); n == TAny repeated 'n' times;
-- -n == not (TAny repeated 'n' times)

local function numtree(n)
    if n == 0 then
        local tree = treepattern(1)
        tree.p[0].tag = TTrue
        return tree
    end
    local tree, start
    if n > 0 then
        tree = treepattern(2 * n - 1)
        start = 0
    else
        -- negative: code it as !(-n)
        n = -n
        tree = treepattern(2 * n)
        tree.p[0].tag = TNot
        start = 1
    end
    fillseq(tree, TAny, start, n) -- sequence of 'n' any's
    return tree
end


-- Convert value to a pattern.
-- Accepts strings, numbers, booleans, tables (grammars), functions
-- (run-time captures over an empty match; 'name', when a string, is
-- recorded in 'funcnames') and existing tree patterns.
-- Raises an error for any other value.

local function getpatt(val, name)
    local typ = type(val)
    if typ == 'string' then
        -- empty?
        if #val == 0 then
            local pat = treepattern(1)
            pat.p[0].tag = TTrue -- always match
            return pat
        else
            local tree = treepattern(2 * (#val - 1) + 1)
            fillseq(tree, TChar, 0, #val, val) -- sequence of '#val' chars
            return tree
        end
    elseif typ == 'number' then
        return numtree(val)
    elseif typ == 'boolean' then
        local pat = treepattern(1)
        pat.p[0].tag = val and TTrue or TFalse
        return pat
    elseif typ == 'table' then
        return newgrammar(val)
    elseif typ == 'function' then
        if name and type(name) == 'string' then
            funcnames[val] = name
        end
        local pat = treepattern(2)
        valuetable[pat.id] = { val }
        pat.p[0].tag = TRunTime
        pat.p[0].val = 1
        pat.p[1].tag = TTrue
        return pat
    elseif ffi.istype(treepattern, val) then
        assert(val.treesize > 0)
        return val
    end
    -- previously a bare assert(false); say what was wrong instead
    error(("pattern expected, got %s"):format(typ), 0)
end

-- Concatenate two value tables (either may be nil) into a new one.
-- Returns the new table and the offset to add to keys that came from
-- 'ktable2' (0 when nothing has to be shifted).

local function copykeys(ktable1, ktable2)
    local ktable, offset = {}, 0
    if ktable1 then
        for i = 1, #ktable1 do
            ktable[#ktable + 1] = ktable1[i]
        end
        offset = #ktable1
    end
    if not ktable2 then
        return ktable, 0
    end
    for i = 1, #ktable2 do
        ktable[#ktable + 1] = ktable2[i]
    end
    assert(#ktable < 65536, "too many Lua values in pattern")
    return ktable, offset
end

-- Add 'offset' to every non-zero value-table key in the subtree rooted
-- at 'index'.

local function correctkeys(tree, index, offset)
    local tag = tree.p[index].tag
    if (tag == TSet or tag == TRule or tag == TCall or tag == TRunTime or tag == TOpenCall or tag == TCapture) and
            tree.p[index].val ~= 0 then
        tree.p[index].val = tree.p[index].val + offset
    end
    local ns = numsiblings[tag + 1]
    if ns == 0 then
    elseif ns == 1 then
        return correctkeys(tree, index + 1, offset)
    elseif ns == 2 then
        correctkeys(tree, index + 1, offset)
        return correctkeys(tree, index + tree.p[index].ps, offset)
    else
        assert(false)
    end
end



-- create a new tree, with a new root and one sibling.

local function newroot1sib(tag, pat)
    local tree1 = getpatt(pat)
    local tree = treepattern(1 + tree1.treesize) -- create new tree
    valuetable[tree.id] = copykeys(valuetable[tree1.id])
    tree.p[0].tag = tag
    ffi.copy(tree.p + 1, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize)
    return tree
end


-- create a new tree, with a new root and 2 siblings.

local function newroot2sib(tag, pat1, pat2)
    local tree1 = getpatt(pat1)
    local tree2 = getpatt(pat2)
    local tree = treepattern(1 + tree1.treesize + tree2.treesize) -- create new tree
    local ktable, offset = copykeys(valuetable[tree1.id], valuetable[tree2.id])
    valuetable[tree.id] = ktable
    tree.p[0].tag = tag
    tree.p[0].ps = 1 + tree1.treesize
    ffi.copy(tree.p + 1, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize)
    ffi.copy(tree.p + 1 + tree1.treesize, tree2.p, ffi.sizeof(treepatternelement) * tree2.treesize)
    if offset > 0 then
        -- keys of the second sibling now live after those of the first
        correctkeys(tree, 1 + tree1.treesize, offset)
    end
    return tree;
end


local function lp_P(val, name)
    assert(type(val) ~= 'nil')
    return getpatt(val, name)
end


-- sequence operator; optimizations:
-- false x => false, x true => x, true x => x
-- (cannot do x . false => false because x may have runtime captures)

local function lp_seq(pat1, pat2)
    local tree1 = getpatt(pat1)
    local tree2 = getpatt(pat2)
    -- false . x == false, x . true = x
    if tree1.p[0].tag == TFalse or tree2.p[0].tag == TTrue then
        return tree1
        -- true . x = x
    elseif tree1.p[0].tag == TTrue then
        return tree2
    else
        return newroot2sib(TSeq, tree1, tree2)
    end
end


-- choice operator; optimizations:
-- charset / charset => charset
-- true / x => true, x / false => x, false / x => x
-- (x / true is not equivalent to true)

local function lp_choice(pat1, pat2)
    local tree1 = getpatt(pat1)
    local tree2 = getpatt(pat2)
    local charset1 = lpcode.tocharset(tree1, 0, valuetable[tree1.id])
    local charset2 = lpcode.tocharset(tree2, 0, valuetable[tree2.id])
    if charset1 and charset2 then
        local t, set = newcharset()
        for i = 0, 7 do
            set[i] = bor(charset1[i], charset2[i])
        end
        return t
    elseif lpcode.checkaux(tree1, PEnofail, 0) or tree2.p[0].tag == TFalse then
        return tree1 -- true / x => true, x / false => x
    elseif tree1.p[0].tag == TFalse then
        return tree2 -- false / x => x
    else
        return newroot2sib(TChoice, tree1, tree2)
    end
end


-- p^n
-- n >= 0: at least n repetitions; n < 0: at most -n repetitions.

local function lp_star(tree1, n)
    local tree
    n = tonumber(n)
    assert(type(n) == 'number')
    -- seq tree1 (seq tree1 ... (seq tree1 (rep tree1)))
    if n >= 0 then
        tree = treepattern((n + 1) * (tree1.treesize + 1))
        if lpcode.checkaux(tree1, PEnullable, 0) then
            error("loop body may accept empty string", 0)
        end
        valuetable[tree.id] = copykeys(valuetable[tree1.id])
        local start = 0
        -- repeat 'n' times
        for i = 1, n do
            seqaux(tree, tree1, start, tree1.treesize)
            start = start + tree.p[start].ps
        end
        tree.p[start].tag = TRep
        ffi.copy(tree.p + start + 1, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize)
        -- choice (seq tree1 ... choice tree1 true ...) true
    else
        n = -n;
        -- size = (choice + seq + tree1 + true) * n, but the last has no seq
        tree = treepattern(n * (tree1.treesize + 3) - 1)
        valuetable[tree.id] = copykeys(valuetable[tree1.id])
        local start = 0
        -- repeat (n - 1) times
        for i = n, 2, -1 do
            tree.p[start].tag = TChoice;
            tree.p[start].ps = i * (tree1.treesize + 3) - 2
            tree.p[start + tree.p[start].ps].tag = TTrue;
            start = start + 1
            seqaux(tree, tree1, start, tree1.treesize)
            start = start + tree.p[start].ps
        end
        tree.p[start].tag = TChoice;
        tree.p[start].ps = tree1.treesize + 1
        tree.p[start + tree.p[start].ps].tag = TTrue
        ffi.copy(tree.p + start + 1, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize)
    end
    return tree
end


-- #p == &p

local function lp_and(pat)
    return newroot1sib(TAnd, pat)
end


-- -p == !p

local function lp_not(pat)
    return newroot1sib(TNot, pat)
end


-- [t1 - t2] == Seq (Not t2) t1
-- If t1 and t2 are charsets, make their difference.

local function lp_sub(pat1, pat2)
    local tree1 = getpatt(pat1)
    local tree2 = getpatt(pat2)
    local charset1 = lpcode.tocharset(tree1, 0, valuetable[tree1.id])
    local charset2 = lpcode.tocharset(tree2, 0, valuetable[tree2.id])
    if charset1 and charset2 then
        local tree, set = newcharset()
        for i = 0, 7 do
            set[i] = band(charset1[i], bnot(charset2[i]))
        end
        return tree
    else
        local tree = treepattern(2 + tree1.treesize + tree2.treesize)
        -- tree2 comes first in the new tree, so its keys come first too
        local ktable, offset = copykeys(valuetable[tree2.id], valuetable[tree1.id])
        valuetable[tree.id] = ktable
        tree.p[0].tag = TSeq; -- sequence of...
        tree.p[0].ps = 2 + tree2.treesize
        tree.p[1].tag = TNot; -- ...not...
        ffi.copy(tree.p + 2, tree2.p, ffi.sizeof(treepatternelement) * tree2.treesize)
        ffi.copy(tree.p + tree2.treesize + 2, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize)
        if offset > 0 then
            correctkeys(tree, 2 + tree2.treesize, offset)
        end
        return tree
    end
end


-- Charset holding every byte of string 'val'.

local function lp_set(val)
    assert(type(val) == 'string')
    local tree, set = newcharset()
    for i = 1, #val do
        local b = val:byte(i)
        set[rshift(b, 5)] = bor(set[rshift(b, 5)], lshift(1, band(b, 31)))
    end
    return tree
end


-- Charset built from two-character strings "xy", each adding every
-- byte from x to y (inclusive).
-- Raises an error when an argument is not a two-character string.

local function lp_range(...)
    local tree, set = newcharset()
    -- select('#') counts nil arguments too, unlike '#' on a packed table
    for i = 1, select('#', ...) do
        local range = (select(i, ...))
        -- check the type first: '#' and '..' fail on non-string values
        if type(range) ~= 'string' or #range ~= 2 then
            error(tostring(range) .. " range must have two characters", 0)
        end
        for b = range:byte(1), range:byte(2) do
            set[rshift(b, 5)] = bor(set[rshift(b, 5)], lshift(1, band(b, 31)))
        end
    end
    return tree
end


-- Look-behind predicate

local function lp_behind(pat)
    local tree1 = getpatt(pat)
    local n = lpcode.fixedlenx(tree1, 0, 0, 0)
    assert(not lpcode.hascaptures(tree1, 0), "pattern have captures")
    assert(n >= 0, "pattern may not have fixed length")
    assert(n <= MAXBEHIND, "pattern too long to look behind")
    -- reuse the tree that was just validated instead of converting 'pat' again
    local tree = newroot1sib(TBehind, tree1)
    tree.p[0].val = n;
    return tree
end


-- Create a non-terminal

local function lp_V(val, p)
    assert(val, "non-nil value expected")
    local tree = treepattern(1)
    valuetable[tree.id] = { val }
    tree.p[0].tag = TOpenCall
    tree.p[0].val = 1
    tree.p[0].cap = p or 0
    return tree
end


-- Create a tree for a non-empty capture, with a body and
-- optionally with an associated value

local function capture_aux(cap, pat, val)
    local tree = newroot1sib(TCapture, pat)
    tree.p[0].cap = cap
    if val then
        local ind = #valuetable[tree.id] + 1
        assert(ind <= 65536, "too many Lua values in pattern" .. ind)
        valuetable[tree.id][ind] = val
        tree.p[0].val = ind
    end
    return tree
end


-- Fill a tree with an empty capture, using an empty (TTrue) sibling.

local function auxemptycap(tree, cap, par, start)
    tree.p[start].tag = TCapture;
    tree.p[start].cap = cap
    if type(par) ~= 'nil' then
        local ind = #valuetable[tree.id] + 1
        assert(ind <= 65536, "too many Lua values in pattern")
        valuetable[tree.id][ind] = par
        tree.p[start].val = ind
    end
    tree.p[start + 1].tag = TTrue;
end


-- Create a tree for an empty capture

local function newemptycap(cap, par)
    local tree = treepattern(2)
    if type(par) ~= 'nil' then valuetable[tree.id] = {} end
    auxemptycap(tree, cap, par, 0)
    return tree
end


-- Captures with syntax p / v
-- (function capture, query capture, string capture, or number capture)

local function lp_divcapture(pat, par, xxx)
    local typ = type(par)
    if typ == "function" then
        return capture_aux(Cfunction, pat, par)
    elseif typ == "table" then
        return capture_aux(Cquery, pat, par)
    elseif typ == "string" then
        return capture_aux(Cstring, pat, par)
    elseif typ == "number" then
        local tree = newroot1sib(TCapture, pat)
        assert(0 <= par and par <= 0xffff, "invalid number")
        tree.p[0].cap = Cnum;
        local ind = #valuetable[tree.id] + 1
        assert(ind <= 65536, "too many Lua values in pattern")
        valuetable[tree.id][ind] = par
        tree.p[0].val = ind
        return tree
    else
        error("invalid replacement value", 0)
    end
end


local function lp_substcapture(pat)
    return capture_aux(Csubst, pat)
end


-- Table capture; the value stored with the capture is 0.

local function lp_tablecapture(pat)
    return capture_aux(Ctable, pat, 0)
end


-- Group capture, optionally named by 'val'.

local function lp_groupcapture(pat, val)
    -- capture_aux ignores a nil/false value, so no branching is needed
    return capture_aux(Cgroup, pat, val)
end


local function lp_foldcapture(pat, fce)
    assert(type(fce) == 'function')
    return capture_aux(Cfold, pat, fce)
end


local function lp_simplecapture(pat)
    return capture_aux(Csimple, pat)
end


local function lp_poscapture()
    return newemptycap(Cposition)
end


-- Argument capture: 'val' is the index of the extra match argument.

local function lp_argcapture(val)
    assert(type(val) == 'number')
    -- validate before any tree or value-table entry is created
    assert(0 < val and val <= 0xffff, "invalid argument index")
    local tree = newemptycap(Carg, 0)
    local ind = #valuetable[tree.id] + 1
    assert(ind <= 65536, "too many Lua values in pattern")
    valuetable[tree.id][ind] = val
    tree.p[0].val = ind
    return tree
end


local function lp_backref(val)
    return newemptycap(Cbackref, val)
end


-- Constant capture

local function lp_constcapture(...)
    local tree
    local args = { ... }
    local n = select('#', ...) -- number of values
    -- no values?
    if n == 0 then
        tree = treepattern(1) -- no capture
        tree.p[0].tag = TTrue
    elseif n == 1 then
        tree = newemptycap(Cconst, args[1]) -- single constant capture
    else
        -- create a group capture with all values:
        -- group + (n - 1) * (seq, capture, true) + (capture, true)
        tree = treepattern(3 + 3 * (n - 1))
        valuetable[tree.id] = {}
        tree.p[0].tag = TCapture
        tree.p[0].cap = Cgroup
        local start = 1
        for i = 1, n - 1 do
            tree.p[start].tag = TSeq
            tree.p[start].ps = 3
            auxemptycap(tree, Cconst, args[i], start + 1)
            start = start + tree.p[start].ps
        end
        auxemptycap(tree, Cconst, args[n], start)
    end
    return tree
end


-- Match-time capture: 'fce' is stored in the value table of the new
-- tree; 'name', when a string, is recorded in 'funcnames'.

local function lp_matchtime(pat, fce, name)
    assert(type(fce) == 'function')
    if name and type(name) == 'string' then
        funcnames[fce] = name
    end
    local tree = newroot1sib(TRunTime, pat)
    local ind = #valuetable[tree.id] + 1
    assert(ind <= 65536, "too many Lua values in pattern")
    valuetable[tree.id][ind] = fce
    tree.p[0].val = ind
    return tree
end

-- ======================================================



-- ======================================================
-- Grammar - Tree generation
-- =======================================================


-- return index and the pattern for the
-- initial rule of grammar;
-- also add that index into position table.

local function getfirstrule(pat, postab)
    local key
    -- access first element
    if type(pat[1]) == 'string' then
        key = pat[1]
    else
        key = 1
    end
    local rule = pat[key]
    if not rule then
        error("grammar has no initial rule", 0)
    end
    -- initial rule not a pattern?
    if not ffi.istype(treepattern, rule) then
        error(("initial rule '%s' is not a pattern"):format(tostring(key)), 0)
    end
    postab[key] = 1
    return key, rule
end


-- traverse grammar, collect all its keys and patterns
-- into rule table. Create a new table (before all pairs key-pattern) to
-- collect all keys and their associated positions in the final tree
-- (the "position table").
-- Return the number of rules, the total size for the new tree, the
-- flat list { key1, pattern1, key2, pattern2, ... } (initial rule first)
-- and the position table.

local function collectrules(pat)
    local n = 1; -- to count number of rules
    local postab = {}
    local firstkeyrule, firstrule = getfirstrule(pat, postab)
    local rules = { firstkeyrule, firstrule }
    local size = 2 + firstrule.treesize -- TGrammar + TRule + rule
    for key, val in pairs(pat) do
        -- Skip entry 1 (the initial rule itself or its name) and the entry
        -- of the initial rule. The test is made on the KEY: comparing the
        -- stringified values would also drop any other rule that happens
        -- to be bound to the same pattern object as the initial rule.
        if key ~= 1 and key ~= firstkeyrule then
            -- value is not a pattern?
            if not ffi.istype(treepattern, val) then
                error(("rule '%s' is not a pattern"):format(tostring(key)), 0)
            end
            rules[#rules + 1] = key
            rules[#rules + 1] = val
            postab[key] = size
            size = 1 + size + val.treesize
            n = n + 1
        end
    end
    size = size + 1; -- TTrue to finish list of rules
    return n, size, rules, postab
end


-- Copy the 'n' rules listed in 'rules' into 'grammar' starting at
-- 'index'; returns the merged value table (rule names plus the values
-- of every rule body).

local function buildgrammar(grammar, rules, n, index, valuetable)
    local ktable, offset = {}, 0
    -- add each rule into new tree
    for i = 1, n do
        local rulekey = rules[i * 2 - 1]
        local ruletree = rules[i * 2]
        local size = ruletree.treesize
        grammar.p[index].tag = TRule;
        grammar.p[index].cap = i; -- rule number
        grammar.p[index].ps = size + 1; -- point to next rule
        local ind = #ktable + 1
        ktable[ind] = rulekey
        grammar.p[index].val = ind
        ffi.copy(grammar.p + index + 1, ruletree.p, ffi.sizeof(treepatternelement) * size) -- copy rule
        ktable, offset = copykeys(ktable, valuetable[ruletree.id])
        if offset > 0 then
            correctkeys(grammar, index + 1, offset)
        end
        index = index + grammar.p[index].ps; -- move to next rule
    end
    grammar.p[index].tag = TTrue; -- finish list of rules
    return ktable
end


-- Check whether a tree has potential infinite loops
-- (a repetition whose body is nullable); returns true in that case.

local function checkloops(tree, index)
    local tag = tree.p[index].tag
    if tag == TRep and lpcode.checkaux(tree, PEnullable, index + 1) then
        return true
    elseif tag == TGrammar then
        return -- sub-grammars already checked
    end
    local siblings = numsiblings[tag + 1]
    if siblings == 0 then
        return
    elseif siblings == 1 then
        return checkloops(tree, index + 1)
    elseif siblings == 2 then
        if checkloops(tree, index + 1) then
            return true
        end
        return checkloops(tree, index + tree.p[index].ps)
    else
        assert(false)
    end
end

-- Check whether a rule can be left recursive; returns PEleftrecursion in that
-- case; otherwise returns a true value iff pattern is nullable.
890 | 891 | local function verifyrule(rulename, tree, passed, nullable, index, valuetable) 892 | local tag = tree.p[index].tag 893 | if tag == TChar or tag == TSet or tag == TAny or tag == TFalse then 894 | return nullable; -- cannot pass from here 895 | elseif tag == TTrue or tag == TBehind then 896 | return true; 897 | elseif tag == TNot or tag == TAnd or tag == TRep then 898 | return verifyrule(rulename, tree, passed, true, index + 1, valuetable) 899 | elseif tag == TCapture or tag == TRunTime then 900 | return verifyrule(rulename, tree, passed, nullable, index + 1, valuetable) 901 | elseif tag == TCall then 902 | local rule = valuetable[tree.p[index].val] 903 | if rule == rulename then return PEleftrecursion end 904 | if passed[rule] and passed[rule] > MAXRULES then 905 | return nullable 906 | end 907 | return verifyrule(rulename, tree, passed, nullable, index + tree.p[index].ps, valuetable) 908 | -- only check 2nd child if first is nullable 909 | elseif tag == TSeq then 910 | local res = verifyrule(rulename, tree, passed, false, index + 1, valuetable) 911 | if res == PEleftrecursion then 912 | return res 913 | elseif not res then 914 | return nullable 915 | else 916 | return verifyrule(rulename, tree, passed, nullable, index + tree.p[index].ps, valuetable) 917 | end 918 | -- must check both children 919 | elseif tag == TChoice then 920 | nullable = verifyrule(rulename, tree, passed, nullable, index + 1, valuetable) 921 | if nullable == PEleftrecursion then return nullable end 922 | return verifyrule(rulename, tree, passed, nullable, index + tree.p[index].ps, valuetable) 923 | elseif tag == TRule then 924 | local rule = valuetable[tree.p[index].val] 925 | passed[rule] = (passed[rule] or 0) + 1 926 | return verifyrule(rulename, tree, passed, nullable, index + 1, valuetable) 927 | elseif tag == TGrammar then 928 | return lpcode.checkaux(tree, PEnullable, index) -- sub-grammar cannot be left recursive 929 | else 930 | assert(false) 931 | end 932 | end 933 | 934 | 
-- Verify a freshly built grammar rooted at node 'index' of 'rule':
--   1. detect left-recursive rules (an error unless LREnable is set) and
--      flag the matching TRule / TCall nodes;
--   2. reject rules for which 'checkloops' reports an empty loop.
-- 'valuetable' maps node 'val' fields to rule keys.

local function verifygrammar(rule, index, valuetable)
    -- check left-recursive rules
    local LR = {}
    local ind = index + 1
    while rule.p[ind].tag == TRule do
        local rulename = valuetable[rule.p[ind].val]
        -- used rule
        if rulename then
            if verifyrule(rulename, rule, {}, false, ind + 1, valuetable) == PEleftrecursion then
                if not LREnable then
                    -- tostring: a rule key need not be a string (same
                    -- treatment as the "empty loop" message below)
                    error(("rule '%s' may be left recursive"):format(tostring(rulename)), 0)
                end
                LR[rulename] = true
            end
        end
        ind = ind + rule.p[ind].ps
    end
    assert(rule.p[ind].tag == TTrue)

    for i = 0, rule.treesize - 1 do
        if rule.p[i].tag == TRule and LR[valuetable[rule.p[i].val]] then
            rule.p[i].cap = bor(rule.p[i].cap, RuleLR) --TRule can be left recursive
        end
        if rule.p[i].tag == TCall and LR[valuetable[rule.p[i].val]] then
            if rule.p[i].cap == 0 then
                rule.p[i].cap = 1 --TCall can be left recursive
            end
        end
    end

    -- check infinite loops inside rules
    ind = index + 1
    while rule.p[ind].tag == TRule do
        -- used rule
        -- NOTE(review): 'val' is an int field, so this test is always true in
        -- Lua (0 is truthy) and every rule is checked. If only used rules
        -- should be checked, this would have to be '~= 0' -- confirm first,
        -- as that changes which grammars are rejected.
        if rule.p[ind].val then
            if checkloops(rule, ind + 1) then
                error(("empty loop in rule '%s'"):format(tostring(valuetable[rule.p[ind].val])), 0)
            end
        end
        ind = ind + rule.p[ind].ps
    end
    assert(rule.p[ind].tag == TTrue)
end


-- Give a name for the initial rule if it is not referenced
-- Marks node 1 (the initial TRule) as used and, when its 'val' is 0, appends
-- 'val' (the rule key) to 'valuetable' and stores the new index in the node.

local function initialrulename(grammar, val, valuetable)
    grammar.p[1].cap = bit.bor(grammar.p[1].cap, Ruleused)
    -- initial rule is not referenced?
    if grammar.p[1].val == 0 then
        local ind = #valuetable + 1
        assert(ind <= 65536, "too many Lua values in pattern")
        valuetable[ind] = val
        grammar.p[1].val = ind
    end
end


-- Build a grammar pattern from the table 'pat' (keys -> patterns).
-- NOTE(review): declared without 'local'; presumably forward-declared as a
-- local earlier in the file -- confirm, otherwise this creates a global.

function newgrammar(pat)
    -- traverse grammar. Create a new table (before all pairs key-pattern) to
    -- collect all keys and their associated positions in the final tree
    -- (the "position table").
    -- Return new tree.

    local n, size, rules, postab = collectrules(pat)
    local grammar = treepattern(size)
    local start = 0
    grammar.p[start].tag = TGrammar
    grammar.p[start].val = n
    valuetable[grammar.id] = buildgrammar(grammar, rules, n, start + 1, valuetable)
    finalfix(true, postab, grammar, start + 1, valuetable[grammar.id])
    initialrulename(grammar, rules[1], valuetable[grammar.id])
    verifygrammar(grammar, 0, valuetable[grammar.id])
    return grammar
end

-- ======================================================

-- remove duplicity from value table
-- Rebuilds valuetable[p.id] so that each distinct value appears once and
-- rewrites every tree node / instruction field that indexes the table.

local function reducevaluetable(p)
    local vtable = valuetable[p.id]
    local value = {} -- value -> index in 'newvaluetable'
    local newvaluetable = {}

    -- map old index 'v' to its index in the reduced table (0 stays 0)
    local function check(v)
        if v > 0 then
            local ord = value[vtable[v]]
            if not ord then
                newvaluetable[#newvaluetable + 1] = vtable[v]
                ord = #newvaluetable
                value[vtable[v]] = ord
            end
            return ord
        end
        return 0
    end

    -- remap 'val' on every tree node kind that stores a value-table index
    local function itertree(p, index)
        local tag = p.p[index].tag
        if tag == TSet or tag == TCall or tag == TOpenCall or
                tag == TRule or tag == TCapture or tag == TRunTime then
            p.p[index].val = check(p.p[index].val)
        end
        local ns = numsiblings[tag + 1]
        if ns == 0 then
        elseif ns == 1 then
            return itertree(p, index + 1)
        elseif ns == 2 then
            itertree(p, index + 1)
            return itertree(p, index + p.p[index].ps)
        else
            assert(false)
        end
    end

    if p.treesize > 0 then
        itertree(p, 0)
    end
    -- compiled code keeps its value-table index in a different field
    -- depending on the opcode
    if p.code ~= nil then
        for i = 0, p.code.size - 1 do
            local code = p.code.p[i].code
            if code == ICall or code == IJmp then
                p.code.p[i].aux = check(p.code.p[i].aux)
            elseif code == ISet or code == ITestSet or code == ISpan then
                p.code.p[i].val = check(p.code.p[i].val)
            elseif code == IOpenCapture or code == IFullCapture then
                p.code.p[i].offset = check(p.code.p[i].offset)
            end
        end
    end
    valuetable[p.id] = newvaluetable
end


-- Mark group captures with the BCapcandelete flag. A Cgroup capture found
-- below a TChoice is flagged immediately; one found outside any choice is
-- remembered per rule and flagged afterwards unless that rule was recorded in
-- 'notinalternativerules' (rule number 0, or called from outside a choice).

local function checkalt(tree)
    local notchecked = {}
    local notinalternativerules = {}

    local function iter(tree, index, choice, rule)
        local tag = tree[index].tag
        if tag == TCapture and bit.band(tree[index].cap, 0xffff) == Cgroup then
            if not choice then
                if rule then
                    notchecked[rule] = index
                end
            else
                tree[index].cap = bit.bor(tree[index].cap, BCapcandelete)
            end
        elseif tag == TChoice then
            choice = true
        elseif tag == TRule then
            rule = tree[index].val
            if bit.band(tree[index].cap, 0xffff) - 1 == 0 then
                notinalternativerules[rule] = notinalternativerules[rule] or true
            end
        elseif tag == TCall then
            local r = tree[index].val
            if not choice then
                notinalternativerules[r] = notinalternativerules[r] or true
            end
        end
        local sibs = numsiblings[tree[index].tag + 1] or 0
        if sibs >= 1 then
            iter(tree, index + 1, choice, rule)
            if sibs >= 2 then
                return iter(tree, index + tree[index].ps, choice, rule)
            end
        end
    end

    iter(tree, 0)
    for k, v in pairs(notchecked) do
        if not notinalternativerules[k] then
            tree[v].cap = bit.bor(tree[v].cap, BCapcandelete)
        end
    end
end


-- Run the pre-compilation passes on pattern 'p', compile it and return the
-- generated code (p.code).

local function prepcompile(p, index)
    finalfix(false, nil, p, index, valuetable[p.id])
    checkalt(p.p)
    lpcode.compile(p, index, valuetable[p.id])
    reducevaluetable(p)
    return p.code
end


-- Print the tree of 'pat'; when 'c' is truthy 'finalfix' is applied first.

local function lp_printtree(pat, c)
    assert(pat.treesize > 0)
    if c then
        finalfix(false, nil, pat, 0, valuetable[pat.id])
    end
    lpprint.printtree(pat.p, 0, 0, valuetable[pat.id])
end


-- Print the compiled code of 'pat', compiling it first when necessary.

local function lp_printcode(pat)
    -- not compiled yet?
    if pat.code == nil then
        prepcompile(pat, 0)
    end
    lpprint.printpatt(pat.code, valuetable[pat.id])
end


-- Main match function
-- 'pat' may be a pattern or any value 'getpatt' can convert into one; the
-- pattern is compiled on first use.

local function lp_match(pat, s, init, ...)
    local p = ffi.istype(treepattern, pat) and pat or getpatt(pat)
    p.code = p.code ~= nil and p.code or prepcompile(p, 0)
    return lpvm.match(p, s, init, valuetable[p.id], ...)
end

-- Same as lp_match, but delegates to lpvm.streammatch (no subject string).

local function lp_streammatch(pat, init, ...)
    local p = ffi.istype(treepattern, pat) and pat or getpatt(pat)
    p.code = p.code ~= nil and p.code or prepcompile(p, 0)
    return lpvm.streammatch(p, init, valuetable[p.id], ...)
end

-- Only for testing purpose
-- stream emulation (send all chars from string one char after char)
local function lp_emulatestreammatch(pat, s, init, ...)
    local p = ffi.istype(treepattern, pat) and pat or getpatt(pat)
    p.code = p.code ~= nil and p.code or prepcompile(p, 0)
    return lpvm.emulatestreammatch(p, s, init, valuetable[p.id], ...)
end

-- {======================================================
-- Library creation and functions not related to matching
-- =======================================================

-- Forward 'val' to lpvm.setmax.
local function lp_setmax(val)
    lpvm.setmax(val)
end

-- Forward 'val' to lpvm.setmaxbehind.
local function lp_setmaxbehind(val)
    lpvm.setmaxbehind(val)
end

-- Enable/disable acceptance of left-recursive rules (see verifygrammar).
local function lp_enableleftrecursion(val)
    LREnable = val
end

local function lp_version()
    return VERSION
end


-- Return "pattern" when 'pat' is a pattern object, nothing otherwise.

local function lp_type(pat)
    if ffi.istype(treepattern, pat) then
        return "pattern"
    end
end


-- Build a character-set pattern holding every byte 0..255 for which
-- 'catfce' returns non-zero and store it in tab[catname].

local function createcat(tab, catname, catfce)
    local t, set = newcharset()
    for i = 0, 255 do
        if catfce(i) ~= 0 then
            -- set bit (i mod 32) of 32-bit word (i div 32)
            set[rshift(i, 5)] = bor(set[rshift(i, 5)], lshift(1, band(i, 31)))
        end
    end
    tab[catname] = t
end


-- Fill 'tab' (a new table when omitted) with one pattern per C <ctype.h>
-- character class and return it.

local function lp_locale(tab)
    tab = tab or {}
    createcat(tab, "alnum", function(c) return ffi.C.isalnum(c) end)
    createcat(tab, "alpha", function(c) return ffi.C.isalpha(c) end)
    createcat(tab, "cntrl", function(c) return ffi.C.iscntrl(c) end)
    createcat(tab, "digit", function(c) return ffi.C.isdigit(c) end)
    createcat(tab, "graph", function(c) return ffi.C.isgraph(c) end)
    createcat(tab, "lower", function(c) return ffi.C.islower(c) end)
    createcat(tab, "print", function(c) return ffi.C.isprint(c) end)
    createcat(tab, "punct", function(c) return ffi.C.ispunct(c) end)
    createcat(tab, "space", function(c) return ffi.C.isspace(c) end)
    createcat(tab, "upper", function(c) return ffi.C.isupper(c) end)
    createcat(tab, "xdigit", function(c) return ffi.C.isxdigit(c) end)
    return tab
end


-- '__new' metamethod: allocate a pattern with 'size' tree nodes and give it
-- a fresh id (the key of its entry in 'valuetable').

local function lp_new(ct, size)
    local pat = ffi.new(ct, size)
    pat.treesize = size
    patternid = patternid + 1
    pat.id = patternid
    return pat
end


-- '__gc' metamethod: drop the pattern's value table and free its compiled
-- code, if any.

local function lp_gc(ct)
    valuetable[ct.id] = nil
    if ct.code ~= nil then
        ffi.C.free(ct.code.p)
        ffi.C.free(ct.code)
    end
end

-- '__eq' metamethod: two patterns are equal when their tostring() is equal.
local function lp_eq(ct1, ct2)
    return tostring(ct1) == tostring(ct2)
end

-- Load a pattern from the string 'str' (see lp_dump) and register its
-- value table.
local function lp_load(str, fcetab)
    local pat, t = lpvm.load(str, fcetab, true)
    valuetable[pat.id] = t
    return pat
end

-- Load a pattern from file 'fname' and register its value table.
local function lp_loadfile(fname, fcetab)
    local pat, t = lpvm.loadfile(fname, fcetab, true)
    valuetable[pat.id] = t
    return pat
end

-- Serialize pattern 'ct' into a string (compiling it first when necessary).
-- Layout, in order:
--   4 bytes tree size + raw tree nodes (size 0 and no nodes unless 'tree')
--   4 bytes code size + raw instructions
--   4 bytes number of values, then one record per value:
--     'str' / 'num' : 4-byte length + text
--     'cdt'         : raw bytes of the cdata (no length field)
--     'fnc'         : 4-byte length + name, 4-byte length + string.dump output
-- Raises an error for values of any other type.

local function lp_dump(ct, tree)
    local funccount = 0
    -- not compiled yet?
    if ct.code == nil then
        prepcompile(ct, 0)
    end
    local out = {}
    if tree then
        out[#out + 1] = ffi.string(uint32(ct.treesize), 4)
        out[#out + 1] = ffi.string(ct.p, ffi.sizeof(treepatternelement) * ct.treesize)
    else
        out[#out + 1] = ffi.string(uint32(0), 4)
    end
    out[#out + 1] = ffi.string(uint32(ct.code.size), 4)
    out[#out + 1] = ffi.string(ct.code.p, ct.code.size * ffi.sizeof(patternelement))
    local t = valuetable[ct.id]
    local len = t and #t or 0
    out[#out + 1] = ffi.string(uint32(len), 4)
    if len > 0 then
        for _, val in ipairs(t) do
            local typ = type(val)
            if typ == 'string' then
                out[#out + 1] = 'str'
                out[#out + 1] = ffi.string(uint32(#val), 4)
                out[#out + 1] = val
            elseif typ == 'number' then
                local val = tostring(val)
                out[#out + 1] = 'num'
                out[#out + 1] = ffi.string(uint32(#val), 4)
                out[#out + 1] = val
            elseif typ == 'cdata' then
                out[#out + 1] = 'cdt'
                out[#out + 1] = ffi.string(val, ffi.sizeof(val))
            elseif typ == 'function' then
                out[#out + 1] = 'fnc'
                funccount = funccount + 1
                local name = funcnames[val] or ('FNAME%03d'):format(funccount)
                out[#out + 1] = ffi.string(uint32(#name), 4)
                out[#out + 1] = name
                -- upvalues are not saved by string.dump: warn unless the
                -- function is registered under a symbolic name
                if not funcnames[val] and debug.getupvalue(val, 1) then
                    -- (the original also passed a stray ', 0' here, which
                    -- printed a literal 0 after the message)
                    io.write(("Patterns function (%d) contains upvalue (%s) - use symbol name for function (%s).\n"):format(funccount, debug.getupvalue(val, 1), name))
                end
                local data = string.dump(val, true)
                out[#out + 1] = ffi.string(uint32(#data), 4)
                out[#out + 1] = data
            else
                error(("Type '%s' NYI for dump"):format(typ), 0)
            end
        end
    end
    return table.concat(out)
end

-- Write the dump of pattern 'ct' to file 'fname' ('tree' as in lp_dump).
-- Raises an error when the pattern cannot be dumped, the file cannot be
-- opened, or the write fails.

local function lp_save(ct, fname, tree)
    -- serialize first: a failing dump must not create/truncate the file
    local data = lp_dump(ct, tree)
    local file = assert(io.open(fname, 'wb'))
    local ok, err = file:write(data)
    file:close() -- close before reporting a write failure
    assert(ok, err)
end


-- Functions exported by the module; also the '__index' table of patterns.
local pattreg = {
    ["ptree"] = lp_printtree,
    ["pcode"] = lp_printcode,
    ["match"] = lp_match,
    ["streammatch"] = lp_streammatch,
    ["emulatestreammatch"] = lp_emulatestreammatch,
    ["setmaxbehind"] = lp_setmaxbehind,
    ["B"] = lp_behind,
    ["V"] = lp_V,
    ["C"] = lp_simplecapture,
    ["Cc"] = lp_constcapture,
    ["Cmt"] = lp_matchtime,
    ["Cb"] = lp_backref,
    ["Carg"] = lp_argcapture,
    ["Cp"] = lp_poscapture,
    ["Cs"] = lp_substcapture,
    ["Ct"] = lp_tablecapture,
    ["Cf"] = lp_foldcapture,
    ["Cg"] = lp_groupcapture,
    ["P"] = lp_P,
    ["S"] = lp_set,
    ["R"] = lp_range,
    ["L"] = lp_and,
    ["locale"] = lp_locale,
    ["version"] = lp_version,
    ["setmaxstack"] = lp_setmax,
    ["type"] = lp_type,
    ["enableleftrecursion"] = lp_enableleftrecursion,
    ["enablememoization"] = lpvm.enablememoization,
    ["enabletracing"] = lpvm.enabletracing,
    ["save"] = lp_save,
    ["dump"] = lp_dump,
    ["load"] = lp_load,
    ["loadfile"] = lp_loadfile,
    ["__mul"] = lp_seq,
    ["__add"] = lp_choice,
    ["__pow"] = lp_star,
    ["__len"] = lp_and,
    ["__div"] = lp_divcapture,
    ["__unm"] = lp_not,
    ["__sub"] = lp_sub,
}

-- Metatable installed on the pattern ctype.
local metareg = {
    ["__gc"] = lp_gc,
    ["__new"] = lp_new,
    ["__mul"] = lp_seq,
    ["__add"] = lp_choice,
    ["__pow"] = lp_star,
    ["__len"] = lp_and,
    ["__div"] = lp_divcapture,
    ["__unm"] = lp_not,
    ["__sub"] = lp_sub,
    ["__eq"] = lp_eq,
    ["__index"] = pattreg
}

ffi.metatype(treepattern, metareg)

return pattreg

--------------------------------------------------------------------------------
/src/lpprint.lua:
--------------------------------------------------------------------------------
--[[
LPEGLJ
lpprint.lua
Tree, code and debug print function (only for debuging)
Copyright (C) 2014 Rostislav Sacek.
based on LPeg v1.0 - PEG pattern matching for Lua
Lua.org & PUC-Rio  written by Roberto Ierusalimschy
http://www.inf.puc-rio.br/~roberto/lpeg/

** Permission is hereby granted, free of charge, to any person obtaining
** a copy of this software and associated documentation files (the
** "Software"), to deal in the Software without restriction, including
** without limitation the rights to use, copy, modify, merge, publish,
** distribute, sublicense, and/or sell copies of the Software, and to
** permit persons to whom the Software is furnished to do so, subject to
** the following conditions:
**
** The above copyright notice and this permission notice shall be
** included in all copies or substantial portions of the Software.
**
** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**
** [ MIT license: http://www.opensource.org/licenses/mit-license.php ]
--]]

local ffi = require"ffi"
local band, rshift, lshift = bit.band, bit.rshift, bit.lshift

ffi.cdef[[
  int isprint ( int c );
]]

-- flags stored in the 'cap' field of TRule nodes (above the low 16 bits)
local RuleLR = 0x10000
local Ruleused = 0x20000

-- {======================================================
-- Printing patterns (for debugging)
-- =======================================================

local TChar = 0
local TSet = 1
local TAny = 2 -- standard PEG elements
local TTrue = 3
local TFalse = 4
local TRep = 5
local TSeq = 6
local TChoice = 7
local TNot = 8
local TAnd = 9
local TCall = 10
local TOpenCall = 11
local TRule = 12 -- sib1 is rule's pattern, sib2 is 'next' rule
local TGrammar = 13 -- sib1 is initial (and first) rule
local TBehind = 14 -- match behind
local TCapture = 15 -- regular capture
local TRunTime = 16 -- run-time capture

local IAny = 0 -- if no char, fail
local IChar = 1 -- if char != aux, fail
local ISet = 2 -- if char not in val, fail
local ITestAny = 3 -- if no char, jump to 'offset'
local ITestChar = 4 -- if char != aux, jump to 'offset'
local ITestSet = 5 -- if char not in val, jump to 'offset'
local ISpan = 6 -- read a span of chars in val
local IBehind = 7 -- walk back 'aux' characters (fail if not possible)
local IRet = 8 -- return from a rule
local IEnd = 9 -- end of pattern
local IChoice = 10 -- stack a choice; next fail will jump to 'offset'
local IJmp = 11 -- jump to 'offset'
local ICall = 12 -- call rule at 'offset'
local IOpenCall = 13 -- call rule number 'offset' (must be closed to a ICall)
local ICommit = 14 -- pop choice and jump to 'offset'
local IPartialCommit = 15 -- update top choice to current position and jump
local IBackCommit = 16 -- "fails" but jump to its own 'offset'
local IFailTwice = 17 -- pop one choice and then fail
local IFail = 18 -- go back to saved state on choice and jump to saved offset
local IGiveup = 19 -- internal use
local IFullCapture = 20 -- complete capture of last 'off' chars
local IOpenCapture = 21 -- start a capture
local ICloseCapture = 22
local ICloseRunTime = 23

local Cclose = 0
local Cposition = 1
local Cconst = 2
local Cbackref = 3
local Carg = 4
local Csimple = 5
local Ctable = 6
local Cfunction = 7
local Cquery = 8
local Cstring = 9
local Cnum = 10
local Csubst = 11
local Cfold = 12
local Cruntime = 13
local Cgroup = 14


-- number of siblings for each tree
-- (indexed directly by tag; tags that are absent have no siblings)
local numsiblings = {
    [TRep] = 1,
    [TSeq] = 2,
    [TChoice] = 2,
    [TNot] = 1,
    [TAnd] = 1,
    [TRule] = 2,
    [TGrammar] = 1,
    [TBehind] = 1,
    [TCapture] = 1,
    [TRunTime] = 1,
}
-- printable name of each opcode
local names = {
    [IAny] = "any",
    [IChar] = "char",
    [ISet] = "set",
    [ITestAny] = "testany",
    [ITestChar] = "testchar",
    [ITestSet] = "testset",
    [ISpan] = "span",
    [IBehind] = "behind",
    [IRet] = "ret",
    [IEnd] = "end",
    [IChoice] = "choice",
    [IJmp] = "jmp",
    [ICall] = "call",
    [IOpenCall] = "open_call",
    [ICommit] = "commit",
    [IPartialCommit] = "partial_commit",
    [IBackCommit] = "back_commit",
    [IFailTwice] = "failtwice",
    [IFail] = "fail",
    [IGiveup] = "giveup",
    [IFullCapture] = "fullcapture",
    [IOpenCapture] = "opencapture",
    [ICloseCapture] = "closecapture",
    [ICloseRunTime] = "closeruntime"
}

-- Print the character set 'st' (256 bits stored as 32-bit words st[0..7])
-- as a list of hexadecimal ranges: "[(61)(30-39)]".

local function printcharset(st)
    io.write("[");
    local i = 0
    while i <= 255 do
        local first = i;
        -- the bound is tested first so that st[8] is never read when the
        -- last range reaches character 255
        while i <= 255 and band(st[rshift(i, 5)], lshift(1, band(i, 31))) ~= 0 do
            i = i + 1
        end
        if i - 1 == first then -- unary range?
            io.write(("(%02x)"):format(first))
        elseif i - 1 > first then -- non-empty range?
            io.write(("(%02x-%02x)"):format(first, i - 1))
        end
        i = i + 1
    end
    io.write("]")
end

-- printable name of each capture kind
local modes = {
    [Cclose] = "close",
    [Cposition] = "position",
    [Cconst] = "constant",
    [Cbackref] = "backref",
    [Carg] = "argument",
    [Csimple] = "simple",
    [Ctable] = "table",
    [Cfunction] = "function",
    [Cquery] = "query",
    [Cstring] = "string",
    [Cnum] = "num",
    [Csubst] = "substitution",
    [Cfold] = "fold",
    [Cruntime] = "runtime",
    [Cgroup] = "group"
}

-- Print the name of capture kind 'kind' ("nil" for an unknown kind).
local function printcapkind(kind)
    io.write(tostring(modes[kind]))
end

-- Print the absolute target of the jump stored in instruction 'index'.
local function printjmp(p, index)
    io.write(("-> %d"):format(index + p[index].offset))
end

-- Print the name of the rule starting at the jump target of instruction
-- 'index', when 'rulenames' knows one.
local function printrulename(p, index, rulenames)
    if rulenames and rulenames[index + p[index].offset] then
        -- tostring: a rule key need not be a string or number
        io.write(' ', tostring(rulenames[index + p[index].offset]))
    end
end

-- Print instruction 'index' of code array 'p' on one line, preceded by the
-- rule name when a rule starts at this instruction.
--   valuetable  value table of the pattern (sets, rule keys, capture values)
--   rulenames   optional table: instruction index -> rule key

local function printinst(p, index, valuetable, rulenames)
    local code = p[index].code
    if rulenames and rulenames[index] then
        io.write(tostring(rulenames[index]), '\n')
    end
    io.write(("%04d: %s "):format(index, tostring(names[code])))
    if code == IChar then
        io.write(("'%s'"):format(string.char(p[index].val)))
    elseif code == ITestChar then
        io.write(("'%s'"):format(string.char(p[index].val)))
        printjmp(p, index)
        printrulename(p, index, rulenames)
    elseif code == IFullCapture then
        -- low 4 bits of 'val': capture kind; next 4 bits: size
        printcapkind(band(p[index].val, 0x0f));
        io.write((" (size = %d)  (idx = %s)"):format(band(rshift(p[index].val, 4), 0xF), tostring(valuetable[p[index].offset])))
    elseif code == IOpenCapture then
        printcapkind(band(p[index].val, 0x0f))
        io.write((" (idx = %s)"):format(tostring(valuetable[p[index].offset])))
    elseif code == ISet then
        printcharset(valuetable[p[index].val]);
    elseif code == ITestSet then
        printcharset(valuetable[p[index].val])
        printjmp(p, index);
        printrulename(p, index, rulenames)
    elseif code == ISpan then
        printcharset(valuetable[p[index].val]);
    elseif code == IOpenCall then
        io.write(("-> %d"):format(p[index].offset))
    elseif code == IBehind then
        io.write(("%d"):format(p[index].val))
    elseif code == IJmp or code == ICall or code == ICommit or code == IChoice or
            code == IPartialCommit or code == IBackCommit or code == ITestAny then
        printjmp(p, index);
        if (code == ICall or code == IJmp) and p[index].aux > 0 then
            -- 'aux' holds the value-table index of the target rule's key
            io.write(' ', tostring(valuetable[p[index].aux]))
        else
            printrulename(p, index, rulenames)
        end
    end
    io.write("\n")
end


-- Print the whole compiled pattern 'p'. A first pass collects, for every
-- call/jump carrying a rule key in 'aux', the key of the rule at its target.

local function printpatt(p, valuetable)
    local ruleNames = {}
    for i = 0, p.size - 1 do
        local code = p.p[i].code
        if (code == ICall or code == IJmp) and p.p[i].aux > 0 then
            local index = i + p.p[i].offset
            ruleNames[index] = valuetable[p.p[i].aux]
        end
    end
    for i = 0, p.size - 1 do
        printinst(p.p, i, valuetable, ruleNames)
    end
end


-- Print capture entry 'index' of capture list 'cap'.

local function printcap(cap, index, valuetable)
    printcapkind(cap[index].kind)
    io.write((" (idx: %s - size: %d) -> %d\n"):format(tostring(valuetable[cap[index].idx]), cap[index].siz, cap[index].s))
end


-- Print the capture list 'cap', at most 'limit' entries.

local function printcaplist(cap, limit, valuetable)
    io.write(">======\n")
    local index = 0
    -- the bound is tested first so that cap[limit] is never touched
    while index < limit and cap[index].s do
        printcap(cap, index, valuetable)
        index = index + 1
    end
    io.write("=======\n")
end

-- ======================================================



-- {======================================================
-- Printing trees (for debugging)
-- =======================================================

-- printable name of each tree tag
local tagnames = {
    [TChar] = "char",
    [TSet] = "set",
    [TAny] = "any",
    [TTrue] = "true",
    [TFalse] = "false",
    [TRep] = "rep",
    [TSeq] = "seq",
    [TChoice] = "choice",
    [TNot] = "not",
    [TAnd] = "and",
    [TCall] = "call",
    [TOpenCall] = "opencall",
    [TRule] = "rule",
    [TGrammar] = "grammar",
    [TBehind] = "behind",
    [TCapture] = "capture",
    [TRunTime] = "run-time"
}


-- Print the subtree rooted at node 'index' of 'tree', one node per line,
-- indented by 'ident' spaces (children are indented two more).

local function printtree(tree, ident, index, valuetable)
    for i = 1, ident do
        io.write(" ")
    end
    local tag = tree[index].tag
    io.write(tostring(tagnames[tag]))
    if tag == TChar then
        local c = tree[index].val
        -- isprint returns a C int: compare with 0 (0 is truthy in Lua)
        if ffi.C.isprint(c) ~= 0 then
            io.write((" '%c'\n"):format(c))
        else
            io.write((" (%02X)\n"):format(c))
        end
    elseif tag == TSet then
        printcharset(valuetable[tree[index].val]);
        io.write("\n")
    elseif tag == TOpenCall or tag == TCall then
        io.write((" key: %s\n"):format(tostring(valuetable[tree[index].val])))
    elseif tag == TBehind then
        io.write((" %d\n"):format(tree[index].val))
        printtree(tree, ident + 2, index + 1, valuetable);
    elseif tag == TCapture then
        -- tostring: the capture may carry no value or a non-string value
        io.write((" cap: %s   n: %s\n"):format(tostring(modes[bit.band(tree[index].cap, 0xffff)]), tostring(valuetable[tree[index].val])))
        printtree(tree, ident + 2, index + 1, valuetable);
    elseif tag == TRule then
        local extra = bit.band(tree[index].cap, RuleLR) == RuleLR and ' left recursive' or ''
        extra = extra .. (bit.band(tree[index].cap, Ruleused) ~= Ruleused and ' not used' or '')
        io.write((" n: %d  key: %s%s\n"):format(bit.band(tree[index].cap, 0xffff) - 1, tostring(valuetable[tree[index].val]), extra))
        printtree(tree, ident + 2, index + 1, valuetable);
        -- do not print next rule as a sibling
    elseif tag == TGrammar then
        local ruleindex = index + 1
        io.write((" %d\n"):format(tree[index].val)) -- number of rules
        for i = 1, tree[index].val do
            printtree(tree, ident + 2, ruleindex, valuetable);
            ruleindex = ruleindex + tree[ruleindex].ps
        end
        assert(tree[ruleindex].tag == TTrue); -- sentinel
    else
        local sibs = numsiblings[tree[index].tag] or 0
        io.write("\n")
        if sibs >= 1 then
            printtree(tree, ident + 2, index + 1, valuetable);
            if sibs >= 2 then
                printtree(tree, ident + 2, index + tree[index].ps, valuetable)
            end
        end
    end
end

-- }====================================================== */

return {
    printtree = printtree,
    printpatt = printpatt,
    printcaplist = printcaplist,
    printinst = printinst
}

--------------------------------------------------------------------------------
/src/lpvm.lua:
--------------------------------------------------------------------------------
--[[
LPEGLJ
lpvm.lua
Virtual machine
Copyright (C) 2014 Rostislav Sacek.
6 | based on LPeg v1.0 - PEG pattern matching for Lua 7 | Lua.org & PUC-Rio written by Roberto Ierusalimschy 8 | http://www.inf.puc-rio.br/~roberto/lpeg/ 9 | 10 | ** Permission is hereby granted, free of charge, to any person obtaining 11 | ** a copy of this software and associated documentation files (the 12 | ** "Software"), to deal in the Software without restriction, including 13 | ** without limitation the rights to use, copy, modify, merge, publish, 14 | ** distribute, sublicense, and/or sell copies of the Software, and to 15 | ** permit persons to whom the Software is furnished to do so, subject to 16 | ** the following conditions: 17 | ** 18 | ** The above copyright notice and this permission notice shall be 19 | ** included in all copies or substantial portions of the Software. 20 | ** 21 | ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 22 | ** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23 | ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 24 | ** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 25 | ** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 26 | ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 27 | ** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 28 | ** 29 | ** [ MIT license: http://www.opensource.org/licenses/mit-license.php ] 30 | --]] 31 | 32 | local ffi = require "ffi" 33 | local lpcap = require "lpcap" 34 | --[[ Only for debug purpose 35 | local lpprint = require"lpprint" 36 | --]] 37 | 38 | local band, rshift, lshift = bit.band, bit.rshift, bit.lshift 39 | 40 | -- {====================================================== 41 | -- Virtual Machine 42 | -- ======================================================= 43 | 44 | -- Interpret the result of a dynamic capture: false -> fail; 45 | -- true -> keep current position; number -> next position. 46 | -- Return new subject position. 
'fr' is stack index where 47 | -- is the result; 'curr' is current subject position; 'limit' 48 | -- is subject's size. 49 | 50 | local MAXBEHINDPREDICATE = 255 -- max behind for Look-behind predicate 51 | local MAXOFF = 0xF -- maximum for full capture 52 | local MAXBEHIND = math.max(MAXBEHINDPREDICATE, MAXOFF) -- maximum before current pos 53 | local INITBACK = 400 -- default maximum size for call/backtrack stack 54 | 55 | local IAny = 0 -- if no char, fail 56 | local IChar = 1 -- if char != val, fail 57 | local ISet = 2 -- if char not in val, fail 58 | local ITestAny = 3 -- in no char, jump to 'offset' 59 | local ITestChar = 4 -- if char != val, jump to 'offset' 60 | local ITestSet = 5 -- if char not in val, jump to 'offset' 61 | local ISpan = 6 -- read a span of chars in val 62 | local IBehind = 7 -- walk back 'val' characters (fail if not possible) 63 | local IRet = 8 -- return from a rule 64 | local IEnd = 9 -- end of pattern 65 | local IChoice = 10 -- stack a choice; next fail will jump to 'offset' 66 | local IJmp = 11 -- jump to 'offset' 67 | local ICall = 12 -- call rule at 'offset' 68 | local IOpenCall = 13 -- call rule number 'offset' (must be closed to a ICall) 69 | local ICommit = 14 -- pop choice and jump to 'offset' 70 | local IPartialCommit = 15 -- update top choice to current position and jump 71 | local IBackCommit = 16 -- "fails" but jump to its own 'offset' 72 | local IFailTwice = 17 -- pop one choice and then fail 73 | local IFail = 18 -- go back to saved state on choice and jump to saved offset 74 | local IGiveup = 19 -- internal use 75 | local IFullCapture = 20 -- complete capture of last 'off' chars 76 | local IOpenCapture = 21 -- start a capture 77 | local ICloseCapture = 22 78 | local ICloseRunTime = 23 79 | 80 | local Cclose = 0 81 | local Cposition = 1 82 | local Cconst = 2 83 | local Cbackref = 3 84 | local Carg = 4 85 | local Csimple = 5 86 | local Ctable = 6 87 | local Cfunction = 7 88 | local Cquery = 8 89 | local Cstring = 9 90 | 
local Cnum = 10
local Csubst = 11
local Cfold = 12
local Cruntime = 13
local Cgroup = 14

local BCapcandelete = 0x30000 -- bits of an instruction's 'val' tested to set CAPTURE.candelete
local maxstack = INITBACK -- current limit of the backtrack stack (see setmax)
local maxcapturedefault = 100 -- initial size of every capture array
local maxmemo = 1000 -- number of recorded calls after which the memo tables rotate
local usememoization = false
local trace = false

-- Markers stored in STACK.X. Values >= 0 in STACK.X are subject positions
-- of a left-recursive call that already matched once.
local FAIL = -1
local LRFAIL = -1 -- same value as FAIL: left-recursive call without a match yet
local VOID = -2 -- STACK.memos value meaning "do not memoize this call"
local CHOICE = -3
local CALL = -4

ffi.cdef [[
typedef struct {
    int code;
    int val;
    int offset;
    int aux;
} PATTERN_ELEMENT;
typedef struct {
    int allocsize;
    int size;
    PATTERN_ELEMENT *p;
} PATTERN;
typedef struct {
    int tag;
    int val;
    int ps;
    int cap;
} TREEPATTERN_ELEMENT;
typedef struct {
    int id;
    int treesize;
    PATTERN *code;
    TREEPATTERN_ELEMENT p[?];
} TREEPATTERN;

typedef struct {
    double s;
    double X;
    double memos;
    int p;
    int caplevel;
    int pA;
    int valuetabletop;
} STACK;

typedef struct {
    double s;
    int siz;
    int idx;
    int kind;
    int candelete;
} CAPTURE;

void *malloc( size_t size );
void free( void *memblock );
void *realloc( void *memblock, size_t size );
]]

local treepatternelement = ffi.typeof('TREEPATTERN_ELEMENT')
local treepattern = ffi.typeof('TREEPATTERN')
local patternelement = ffi.typeof('PATTERN_ELEMENT')
local pattern = ffi.typeof('PATTERN')
local settype = ffi.typeof('int32_t[8]')

-- Interpret the first result of a match-time capture.
--   fr             first value returned by the capture function
--   curr           current subject position (1-based)
--   limit          last valid position (subject length + 1), nil in stream mode
--   checkstreamlen stream-mode callback used to validate the position instead
-- Returns FAIL for nil/false, 'curr' for true, otherwise the new position.
-- Raises "invalid position returned by match-time capture" when the value is
-- not a number or lies outside the subject.
local function resdyncaptures(fr, curr, limit, checkstreamlen)
    -- false value?
    if not fr then
        return FAIL -- and fail
    end
    -- true?
    if type(fr) == 'boolean' then
        return curr -- keep current position
    end
    -- Anything that is not a number is reported as an invalid position
    -- instead of failing inside the comparison below.
    local res = tonumber(fr) -- new position
    if not res
        or res < curr
        or (limit and res > limit)
        or (not limit and checkstreamlen and not checkstreamlen(res - 2)) then
        error("invalid position returned by match-time capture", 0)
    end
    return res
end


-- Add capture values returned by a dynamic capture to the capture list
-- 'base', nested inside a group capture located at 'index'. 'fd' is the
-- table of returned values (fd[1] is the position result, the capture
-- values start at fd[2]), 'n' is the number of values (at least 1).
-- The values themselves are appended to 'valuetable'.

local function adddyncaptures(s, base, index, n, fd, valuetable)
    -- Cgroup capture is already there
    assert(base[index].kind == Cgroup and base[index].siz == 0)
    base[index].idx = 0 -- make it an anonymous group
    base[index + 1] = {}
    -- add runtime captures
    for i = 1, n do
        base[index + i].kind = Cruntime
        base[index + i].siz = 1 -- mark it as closed
        local ind = #valuetable + 1
        valuetable[ind] = fd[i + 1]
        base[index + i].idx = ind -- stack index of capture value
        base[index + i].s = s
        base[index + i + 1] = {}
    end
    base[index + n + 1].kind = Cclose -- close group
    base[index + n + 1].siz = 1
    base[index + n + 1].s = s
    base[index + n + 2] = {}
end


-- Opcode interpreter
--   stream      true for stream mode (subject arrives in chunks)
--   last        stream mode: true when no further chunk will arrive
--   o           subject string (first chunk in stream mode)
--   s           start position, 1-based (kept 0-based internally)
--   op          compiled pattern (fields 'p' and 'size' are used)
--   valuetable  values referenced by instructions and captures
--   ...         extra arguments forwarded to the capture functions
-- Returns -1 when the pattern fails, or 0 followed by the results of
-- lpcap.getcaptures on success. In stream mode the function must run inside
-- a coroutine: when it needs more input it yields 1 followed by the captures
-- that can already be released, and expects to be resumed with
-- (chunk, last).

local function match(stream, last, o, s, op, valuetable, ...)
    local arg = { ... }
    local argcount = select('#', ...)
    local len = #o
    local ptr = ffi.cast('const unsigned char*', o)
    s = s - 1
    local stackptr = 0 -- point to first empty slot in stack
    local captop = 0 -- point to first empty slot in captures
    local STACK = ffi.new("STACK[?]", INITBACK)
    local CAPTURE = ffi.new("CAPTURE[?]", maxcapturedefault)
    -- one capture array per active left-recursive call; the top one is CAPTURE
    local CAPTURESTACK = { { capture = CAPTURE, captop = captop, maxcapture = maxcapturedefault } }
    local capturestackptr = #CAPTURESTACK
    local maxcapture = maxcapturedefault
    local stacklimit = INITBACK
    local L = {} -- state of left-recursive calls, keyed by pA + s * maxpointer
    local Memo1, Memo2 = {}, {} -- memoized calls, keyed the same way
    local memoind = 0
    -- power of two not smaller than the code size, so that
    -- (instruction, position) pairs map to distinct keys
    local maxpointer = 2 ^ math.ceil(math.log(op.size) / math.log(2))
    local nocapturereleased = true

    local p = 0 -- current instruction
    local streambufsize = 2 ^ 8
    local streambufsizemask = streambufsize - 1 -- faster modulo
    local streambufs = {} -- stream chunks, keyed by (offset of the chunk) + 1
    local streambufoffset = 0 -- number of stream bytes received so far
    local streamstartbuffer = 0
    local streambufferscount = 0
    local level = -1 -- nesting level used by the trace output

    -- Drop stream buffers that lie entirely before the smallest position
    -- still referenced by the backtrack stack, the captures or 's'
    -- (keeping MAXBEHIND bytes of slack).
    local function deletestreambuffers()
        local min = s
        for i = stackptr - 1, 0, -1 do
            local val = STACK[i].s
            if val >= 0 then
                min = math.min(val, min)
            end
        end

        for i = captop - 1, 0, -1 do
            local val = CAPTURE[i].s
            if val >= 0 then
                min = math.min(val, min)
            end
        end
        for i = streamstartbuffer + 1, streambufoffset - streambufsize, streambufsize do
            -- max behind for full capture and max behind for Look-behind predicate
            if i + streambufsize + MAXBEHIND < min then
                streambufs[i] = nil
                streambufferscount = streambufferscount - 1
            else
                streamstartbuffer = i - 1
                break
            end
        end
    end

    -- Append the string 's' to the stream buffers ('last' is not used here).
    local function addstreamdata(s, last)
        local len = #s
        local srcoffset = 0
        if streambufferscount > 128 then
            deletestreambuffers()
        end
        repeat
            local offset = bit.band(streambufoffset, streambufsizemask)
            if offset > 0 then
                -- fill the free tail of the last, partially used buffer
                local index = streambufoffset - offset + 1
                local count = math.min(len, streambufsize - offset)
                -- slice exactly 'count' bytes: that is all ffi.copy transfers
                ffi.copy(streambufs[index] + offset, s:sub(srcoffset + 1, srcoffset + count), count)
                len = len - count
                srcoffset = srcoffset + count
                streambufoffset = streambufoffset + count
            end
            if len > 0 then
                local index = streambufoffset - (bit.band(streambufoffset, streambufsizemask)) + 1
                local buf = ffi.new('unsigned char[?]', streambufsize)
                streambufferscount = streambufferscount + 1
                streambufs[index] = buf
                local count = math.min(len, streambufsize)
                ffi.copy(buf, s:sub(srcoffset + 1, srcoffset + count), count)
                len = len - count
                srcoffset = srcoffset + count
                streambufoffset = streambufoffset + count
            end
            if streambufoffset >= 2 ^ 47 then
                error("too big input stream", 0)
            end
        until len == 0
    end

    -- Byte at 0-based stream position 's'.
    local function getstreamchar(s)
        local offset = bit.band(s, streambufsizemask)
        local index = s - offset + 1
        return streambufs[index][offset]
    end

    local checkstreamlen

    -- Stream counterpart of string.sub(st, en) with 1-based positions.
    -- Negative positions can only be resolved once the stream has ended.
    -- Returns nil when the stream ends before position 'i' is available.
    local function getstreamstring(st, en)
        -- TODO Optimize access
        local str = {}
        local i = st >= 0 and st or 1
        local to = en >= 0 and en or math.huge
        while true do
            if i > to then break end
            if not checkstreamlen(i - 1) then return end
            if last and (st < 0 or en < 0) then
                for j = i, streambufoffset do
                    str[#str + 1] = string.char(getstreamchar(j - 1))
                end
                en = en < 0 and streambufoffset + en + 1 or en
                en = st > 0 and en - st + 1 or en
                st = st < 0 and streambufoffset + st + 1 or 1
                return table.concat(str):sub(st, en)
            else
                str[#str + 1] = string.char(getstreamchar(i - 1))
                i = i + 1
            end
        end
        return table.concat(str)
    end

    -- Make sure the 0-based stream position 'index' is available. While it is
    -- not, release the captures that no pending choice can discard, yield
    -- them to the caller and append the chunk received on resume. Returns
    -- false (after moving 's' to the end of the data) once the stream ended.
    function checkstreamlen(index)
        local str
        while true do
            if index < streambufoffset then
                return true
            else
                if last then
                    s = streambufoffset
                    return false
                end
                -- captures above the lowest choice point may still be undone
                local max = captop
                for i = stackptr - 1, 0, -1 do
                    local val = STACK[i].X == CHOICE and STACK[i].caplevel or -1
                    if val >= 0 then
                        max = math.min(val, max)
                    end
                end
                local n, out, outindex = lpcap.getcapturesruntime(CAPTURE, nil, getstreamstring, false, 0, max, captop, valuetable, unpack(arg, 1, argcount))
                if n > 0 then
                    for i = stackptr - 1, 0, -1 do
                        local val = STACK[i].caplevel
                        if val > 0 then
                            STACK[i].caplevel = STACK[i].caplevel - n
                        end
                    end
                    captop = captop - n
                end
                if outindex > 0 then
                    nocapturereleased = false
                end
                str, last = coroutine.yield(1, unpack(out, 1, outindex))
                addstreamdata(str)
            end
        end
    end

    -- Double the size of the current capture array.
    local function doublecapture()
        maxcapture = maxcapture * 2
        local NEWCAPTURE = ffi.new("CAPTURE[?]", maxcapture)
        ffi.copy(NEWCAPTURE, CAPTURE, ffi.sizeof('CAPTURE') * captop)
        CAPTURE = NEWCAPTURE
        CAPTURESTACK[capturestackptr].capture = CAPTURE
        CAPTURESTACK[capturestackptr].maxcapture = maxcapture
    end

    -- Complete the capture entry at 'captop' from the current instruction
    -- ('s' and 'siz' are set by the caller) and advance to the next one.
    local function pushcapture()
        CAPTURE[captop].idx = op.p[p].offset
        CAPTURE[captop].kind = band(op.p[p].val, 0x0f)
        CAPTURE[captop].candelete = band(op.p[p].val, BCapcandelete) ~= 0 and 1 or 0
        captop = captop + 1
        p = p + 1
        if captop >= maxcapture then
            doublecapture()
        end
    end

    -- Trace helpers (only used when tracing is enabled).
    local function traceenter(typ, par)
        level = level + (par or 0)
        io.write(('%s+%s %s\n'):format((' '):rep(level), typ, valuetable[op.p[p].aux]))
    end

    local function traceleave(inst)
        io.write(('%s- %s\n'):format((' '):rep(level), valuetable[op.p[inst].aux]))
        level = level - 1
    end

    local function tracematch(typ, start, par, from, to, inst, extra, ...)
        local n, caps, capscount = lpcap.getcapturesruntime(CAPTURE, o, getstreamstring, true, start, captop, captop, valuetable, ...)
        local capstr = {}
        for i = 1, capscount do capstr[i] = tostring(caps[i]) end
        extra = extra and '(' .. extra .. ')' or ''
        io.write(('%s=%s %s%s %s %s \n'):format((' '):rep(level), typ, valuetable[op.p[inst].aux], extra,
            o and o:sub(from, to) or getstreamstring(from, to), table.concat(capstr, " ")))
        level = level - par
    end

    -- Backtrack: pop stack entries until a choice point or a left-recursive
    -- call that already matched (X >= 0) is found; set p = FAIL when the
    -- stack runs empty.
    local function fail()
        -- pattern failed: try to backtrack
        local X
        repeat -- remove pending calls
            stackptr = stackptr - 1
            if stackptr == -1 then
                p = FAIL
                return
            end
            s = STACK[stackptr].s
            X = STACK[stackptr].X
            if usememoization and X == CALL and STACK[stackptr].memos ~= VOID then
                Memo1[STACK[stackptr].pA + STACK[stackptr].memos * maxpointer] = FAIL
                Memo2[STACK[stackptr].pA + STACK[stackptr].memos * maxpointer] = FAIL
            end
            -- lvar.2 rest
            if X == LRFAIL then
                CAPTURESTACK[capturestackptr] = nil
                capturestackptr = capturestackptr - 1
                CAPTURE = CAPTURESTACK[capturestackptr].capture
                maxcapture = CAPTURESTACK[capturestackptr].maxcapture
                L[STACK[stackptr].pA + s * maxpointer] = nil
            end
            if trace and (X == CALL or X == LRFAIL) then traceleave(STACK[stackptr].p - 1) end
        until X == CHOICE or X >= 0
        p = STACK[stackptr].p
        for i = #valuetable, STACK[stackptr].valuetabletop + 1, -1 do
            table.remove(valuetable)
        end
        -- inc.2
        if X >= 0 then
            -- the left-recursive call could not grow any further: commit the
            -- captures of its last successful iteration
            s = X
            capturestackptr = capturestackptr - 1
            CAPTURE = CAPTURESTACK[capturestackptr].capture
            captop = CAPTURESTACK[capturestackptr].captop
            maxcapture = CAPTURESTACK[capturestackptr].maxcapture
            local capture = L[STACK[stackptr].pA + STACK[stackptr].s * maxpointer].capturecommit
            while captop + capture.captop >= maxcapture do
                doublecapture()
            end
            ffi.copy(CAPTURE + captop, capture.capture, capture.captop * ffi.sizeof('CAPTURE'))
            captop = captop + capture.captop
            if trace then tracematch('', captop - capture.captop, 1, STACK[stackptr].s + 1, s, STACK[stackptr].p - 1, L[STACK[stackptr].pA + STACK[stackptr].s * maxpointer].level, unpack(arg, 1, argcount)) end
            CAPTURESTACK[capturestackptr + 1] = nil
            L[STACK[stackptr].pA + STACK[stackptr].s * maxpointer] = nil
        else
            captop = STACK[stackptr].caplevel
        end
    end

    -- Double the backtrack stack, up to 'maxstack' entries.
    local function doublestack()
        if stackptr >= maxstack then
            error(("backtrack stack overflow (current limit is %d)"):format(maxstack), 0)
        end
        stacklimit = stacklimit * 2
        stacklimit = (stacklimit > maxstack) and maxstack or stacklimit
        local NEWSTACK = ffi.new("STACK[?]", stacklimit)
        ffi.copy(NEWSTACK, STACK, ffi.sizeof('STACK') * stackptr)
        STACK = NEWSTACK
    end

    if stream then
        -- stream mode: the subject is read through the stream buffers only
        addstreamdata(o)
        len = nil
        o = nil
        ptr = nil
    end
    while true do
        --[[ Only for debug
        io.write(("s: |%s| stck:%d, caps:%d \n"):format(s + 1, stackptr, captop))
        if p ~= FAIL then
            lpprint.printinst(op.p, p, valuetable)
            lpprint.printcaplist(CAPTURE, captop, valuetable)
        end
        --]]
        if p == FAIL then return -1 end
        local code = op.p[p].code
        if code == IEnd then
            CAPTURE[captop].kind = Cclose
            CAPTURE[captop].s = -1
            return 0, lpcap.getcaptures(CAPTURE, o, getstreamstring, nocapturereleased and s + 1, valuetable, ...)
        elseif code == IRet then
            if STACK[stackptr - 1].X == CALL then
                -- return from an ordinary call
                stackptr = stackptr - 1
                if trace then tracematch('', STACK[stackptr].caplevel, 1, STACK[stackptr].s + 1, s, STACK[stackptr].p - 1, nil, ...) end
                p = STACK[stackptr].p
                if usememoization and STACK[stackptr].memos ~= VOID then
                    -- remember end position and captures produced by the call
                    local dif = captop - STACK[stackptr].caplevel
                    local caps
                    if dif > 0 then
                        caps = ffi.new("CAPTURE[?]", dif)
                        ffi.copy(caps, CAPTURE + captop - dif, dif * ffi.sizeof('CAPTURE'))
                    end
                    local val = { s, dif, caps }
                    Memo1[STACK[stackptr].pA + STACK[stackptr].memos * maxpointer] = val
                    Memo2[STACK[stackptr].pA + STACK[stackptr].memos * maxpointer] = val
                end
            else
                local X = STACK[stackptr - 1].X
                -- lvar.1 inc.1
                if X == LRFAIL or s > X then
                    -- the left-recursive rule matched more than before:
                    -- record the result and run the rule again
                    if trace then tracematch('IB', 0, 0, STACK[stackptr - 1].s + 1, s, STACK[stackptr - 1].p - 1, L[STACK[stackptr - 1].pA + STACK[stackptr - 1].s * maxpointer].level + 1, ...) end
                    STACK[stackptr - 1].X = s
                    p = STACK[stackptr - 1].pA
                    s = STACK[stackptr - 1].s
                    local lambda = L[p + s * maxpointer]
                    lambda.level = lambda.level + 1
                    lambda.X = STACK[stackptr - 1].X
                    STACK[stackptr - 1].caplevel = captop
                    STACK[stackptr - 1].valuetabletop = #valuetable
                    CAPTURESTACK[capturestackptr].captop = captop
                    lambda.capturecommit = CAPTURESTACK[capturestackptr]
                    captop = 0
                    CAPTURE = ffi.new("CAPTURE[?]", maxcapturedefault)
                    CAPTURESTACK[capturestackptr] = { capture = CAPTURE, captop = captop, maxcapture = maxcapturedefault }
                    maxcapture = maxcapturedefault
                else
                    -- inc.3
                    -- no progress: keep the previous result and leave the rule
                    stackptr = stackptr - 1
                    p = STACK[stackptr].p
                    s = STACK[stackptr].X
                    for i = #valuetable, STACK[stackptr].valuetabletop + 1, -1 do
                        table.remove(valuetable)
                    end
                    local lambda = L[STACK[stackptr].pA + STACK[stackptr].s * maxpointer]
                    capturestackptr = capturestackptr - 1
                    CAPTURE = CAPTURESTACK[capturestackptr].capture
                    captop = CAPTURESTACK[capturestackptr].captop
                    maxcapture = CAPTURESTACK[capturestackptr].maxcapture
                    local capture = lambda.capturecommit
                    while captop + capture.captop >= maxcapture do
                        doublecapture()
                    end
                    ffi.copy(CAPTURE + captop, capture.capture, capture.captop * ffi.sizeof('CAPTURE'))
                    captop = captop + capture.captop
                    if trace then tracematch('', captop - capture.captop, 1, STACK[stackptr].s + 1, s, STACK[stackptr].p - 1, L[STACK[stackptr].pA + STACK[stackptr].s * maxpointer].level, ...) end
                    CAPTURESTACK[capturestackptr + 1] = nil
                    L[STACK[stackptr].pA + STACK[stackptr].s * maxpointer] = nil
                end
            end
        elseif code == IBehind then
            local n = op.p[p].val
            if n > s then
                fail()
            else
                s = s - n
                p = p + 1
            end
        elseif code == IJmp then
            if trace and op.p[p].aux ~= 0 then traceenter('TC') end
            p = p + op.p[p].offset
        elseif code == IChoice then
            if stackptr == stacklimit then
                doublestack()
            end
            STACK[stackptr].X = CHOICE
            STACK[stackptr].p = p + op.p[p].offset
            STACK[stackptr].s = s
            STACK[stackptr].caplevel = captop
            STACK[stackptr].valuetabletop = #valuetable
            stackptr = stackptr + 1
            p = p + 1
        elseif code == ICall then
            if stackptr == stacklimit then
                doublestack()
            end
            -- k == 0: ordinary call; k ~= 0: call handled by the
            -- left-recursion machinery
            local k = bit.band(op.p[p].val, 0xffff)
            if k == 0 then
                local pA = p + op.p[p].offset
                local memo = Memo1[pA + s * maxpointer]
                if usememoization and memo then
                    -- reuse the memoized result of this rule at this position
                    if trace then traceenter('M', 1) end
                    if memo == FAIL then
                        if trace then traceleave(p) end
                        fail()
                    else
                        local dif = memo[2]
                        if dif > 0 then
                            while captop + dif >= maxcapture do
                                doublecapture()
                            end
                            local caps = memo[3]
                            ffi.copy(CAPTURE + captop, caps, dif * ffi.sizeof('CAPTURE'))
                            captop = captop + dif
                        end
                        if trace then tracematch('M', captop - dif, 1, s + 1, memo[1], p, nil, ...) end
                        s = memo[1]
                        p = p + 1
                    end
                else
                    if trace then traceenter('', 1) end
                    STACK[stackptr].X = CALL
                    STACK[stackptr].s = s
                    STACK[stackptr].p = p + 1 -- save return address
                    STACK[stackptr].pA = pA
                    STACK[stackptr].memos = s
                    STACK[stackptr].caplevel = captop
                    stackptr = stackptr + 1
                    p = pA
                    if usememoization and not memo then
                        -- rotate the memo tables after 'maxmemo' recorded calls
                        memoind = memoind + 1
                        if memoind > maxmemo then
                            memoind = 0
                            Memo1 = Memo2
                            Memo2 = {}
                        end
                    end
                end
            else
                local pA = p + op.p[p].offset
                local X = L[pA + s * maxpointer]
                -- lvar.1 lvar.2
                if not X then
                    -- first call of this rule at this position: start with a
                    -- fresh capture array
                    if trace then traceenter('', 1) end
                    CAPTURESTACK[capturestackptr].captop = captop
                    local capture = ffi.new("CAPTURE[?]", maxcapturedefault)
                    capturestackptr = capturestackptr + 1
                    CAPTURESTACK[capturestackptr] = { capture = capture, captop = captop, maxcapture = maxcapturedefault }
                    CAPTURE = capture
                    maxcapture = maxcapturedefault
                    captop = 0
                    L[pA + s * maxpointer] = { X = LRFAIL, k = k, cs = capturestackptr, level = 0 }
                    STACK[stackptr].p = p + 1
                    STACK[stackptr].pA = pA
                    STACK[stackptr].s = s
                    STACK[stackptr].X = LRFAIL
                    stackptr = stackptr + 1
                    p = pA
                elseif X.X == LRFAIL or k < X.k then
                    -- lvar.3 lvar.5
                    fail()
                else
                    -- lvar.4
                    -- reuse the result recorded by the previous iteration
                    local capture = X.capturecommit
                    while captop + capture.captop >= maxcapture do
                        doublecapture()
                    end
                    ffi.copy(CAPTURE + captop, capture.capture, capture.captop * ffi.sizeof('CAPTURE'))
                    captop = captop + capture.captop
                    p = p + 1
                    s = X.X
                end
            end
        elseif code == ICommit then
            stackptr = stackptr - 1
            p = p + op.p[p].offset
        elseif code == IPartialCommit then
            STACK[stackptr - 1].s = s
            STACK[stackptr - 1].caplevel = captop
            STACK[stackptr - 1].valuetabletop = #valuetable
            p = p + op.p[p].offset
        elseif code == IBackCommit then
            stackptr = stackptr - 1
            s = STACK[stackptr].s
            captop = STACK[stackptr].caplevel
            for i = #valuetable, STACK[stackptr].valuetabletop + 1, -1 do
                table.remove(valuetable)
            end
            p = p + op.p[p].offset
        elseif code == IFailTwice then
            stackptr = stackptr - 1
            fail()
        elseif code == IFail then
            fail()
        elseif code == ICloseRunTime then
            -- invalidate memo
            for i = 0, stackptr - 1 do
                STACK[i].memos = VOID
            end
            local cs = {}
            cs.s = o
            cs.stream = getstreamstring
            cs.ocap = CAPTURE
            cs.ptop = arg
            cs.ptopcount = argcount
            local out = { outindex = 0, out = {} }
            local n = lpcap.runtimecap(cs, captop, s + 1, out, valuetable) -- call function
            captop = captop - n
            local res = resdyncaptures(out.out[1], s + 1, len and len + 1, checkstreamlen) -- get result
            -- fail?
            if res == FAIL then
                fail()
            else
                s = res - 1 -- else update current position
                n = out.outindex - 1 -- number of new captures
                -- any new capture?
                if n > 0 then
                    captop = captop + 1
                    while captop + n + 1 >= maxcapture do
                        doublecapture()
                    end
                    captop = captop + n + 1
                    -- add new captures to 'capture' list
                    adddyncaptures(s + 1, CAPTURE, captop - n - 2, n, out.out, valuetable)
                end
                p = p + 1
            end
        elseif code == ICloseCapture then
            local s1 = s + 1
            assert(captop > 0)
            -- if possible, turn capture into a full capture
            if CAPTURE[captop - 1].siz == 0 and
                s1 - CAPTURE[captop - 1].s < 255 then
                CAPTURE[captop - 1].siz = s1 - CAPTURE[captop - 1].s + 1
                p = p + 1
            else
                CAPTURE[captop].siz = 1
                CAPTURE[captop].s = s + 1
                pushcapture()
            end
        elseif code == IOpenCapture then
            CAPTURE[captop].siz = 0
            CAPTURE[captop].s = s + 1
            pushcapture()
        elseif code == IFullCapture then
            CAPTURE[captop].siz = band(rshift(op.p[p].val, 4), 0x0F) + 1 -- save capture size
            CAPTURE[captop].s = s + 1 - band(rshift(op.p[p].val, 4), 0x0F)
            pushcapture()
        -- standard mode
        elseif o then
            if code == IAny then
                if s < len then
                    p = p + 1
                    s = s + 1
                else
                    fail()
                end
            elseif code == ITestAny then
                if s < len then
                    p = p + 1
                else
                    p = p + op.p[p].offset
                end
            elseif code == IChar then
                if s < len and ptr[s] == op.p[p].val then
                    p = p + 1
                    s = s + 1
                else
                    fail()
                end
            elseif code == ITestChar then
                if s < len and ptr[s] == op.p[p].val then
                    p = p + 1
                else
                    p = p + op.p[p].offset
                end
            elseif code == ISet then
                local c = ptr[s]
                local set = valuetable[op.p[p].val]
                if s < len and band(set[rshift(c, 5)], lshift(1, band(c, 31))) ~= 0 then
                    p = p + 1
                    s = s + 1
                else
                    fail()
                end
            elseif code == ITestSet then
                local c = ptr[s]
                local set = valuetable[op.p[p].val]
                if s < len and band(set[rshift(c, 5)], lshift(1, band(c, 31))) ~= 0 then
                    p = p + 1
                else
                    p = p + op.p[p].offset
                end
            elseif code == ISpan then
                while s < len do
                    local c = ptr[s]
                    local set = valuetable[op.p[p].val]
                    if band(set[rshift(c, 5)], lshift(1, band(c, 31))) == 0 then
                        break
                    end
                    s = s + 1
                end
                p = p + 1
            end
        else
            -- stream mode
            if code == IAny then
                if checkstreamlen(s) then
                    p = p + 1
                    s = s + 1
                else
                    fail()
                end
            elseif code == ITestAny then
                if checkstreamlen(s) then
                    p = p + 1
                else
                    p = p + op.p[p].offset
                end
            elseif code == IChar then
                if checkstreamlen(s) and getstreamchar(s) == op.p[p].val then
                    p = p + 1
                    s = s + 1
                else
                    fail()
                end
            elseif code == ITestChar then
                if checkstreamlen(s) and getstreamchar(s) == op.p[p].val then
                    p = p + 1
                else
                    p = p + op.p[p].offset
                end
            elseif code == ISet then
                local c = checkstreamlen(s) and getstreamchar(s)
                local set = valuetable[op.p[p].val]
                if c and band(set[rshift(c, 5)], lshift(1, band(c, 31))) ~= 0 then
                    p = p + 1
                    s = s + 1
                else
                    fail()
                end
            elseif code == ITestSet then
                local c = checkstreamlen(s) and getstreamchar(s)
                local set = valuetable[op.p[p].val]
                if c and band(set[rshift(c, 5)], lshift(1, band(c, 31))) ~= 0 then
                    p = p + 1
                else
                    p = p + op.p[p].offset
                end
            elseif code == ISpan then
                while checkstreamlen(s) do
                    local c = getstreamchar(s)
                    local set = valuetable[op.p[p].val]
                    if band(set[rshift(c, 5)], lshift(1, band(c, 31))) == 0 then
                        break
                    end
                    s = s + 1
                end
                p = p + 1
            end
        end
    end
end

-- Set the maximum size of the backtrack stack (never below INITBACK).
local function setmax(val)
    maxstack = val
    if maxstack < INITBACK then
        maxstack = INITBACK
    end
end

-- Set how many bytes before the current position must stay available in
-- stream mode (never below the built-in look-behind/full-capture limits).
local function setmaxbehind(val)
    MAXBEHIND = math.max(MAXBEHINDPREDICATE, MAXOFF, val or 0)
end

-- Enable or disable memoization of rule calls.
local function enablememoization(val)
    usememoization = val
end

-- Enable or disable the runtime trace listing.
local function enabletracing(val)
    trace = val
end

-- Get the initial position for the match, interpreting negative
-- values from the end of the subject

local function initposition(len, pos)
    local ii = pos or 1
    -- positive index?
    if ii > 0 then
        -- inside the string?
        if ii <= len then
            return ii - 1 -- return it (corrected to 0-base)
        else
            return len -- crop at the end
        end
    else
        -- negative index
        -- inside the string?
        if -ii <= len then
            return len + ii -- return position from the end
        else
            return 0 -- crop at the beginning
        end
    end
end

-- Match the pattern 'pat' against the string 's' starting at 'init'.
-- Returns the captures produced by the match, or nothing on failure.
local function lp_match(pat, s, init, valuetable, ...)
    local i = initposition(s:len(), init) + 1
    return select(2, match(false, true, s, i, pat.code, valuetable, ...))
end

-- Create a stream matcher: a coroutine-wrapped function that is called with
-- (chunk, last) for every piece of input and returns the status produced by
-- 'match' (1 = more data needed, 0 = matched, -1 = failed) followed by the
-- captures available so far.
local function lp_streammatch(pat, init, valuetable, ...)
    local params = { ... }
    local paramslength = select('#', ...)
    local fce = coroutine.wrap(function(s, last)
        return match(true, last, s, init or 1, pat.code, valuetable, unpack(params, 1, paramslength))
    end)
    return fce
end

-- Return the number of arguments and the arguments packed in a table.
local function retcount(...)
    return select('#', ...), { ... }
end

-- Only for testing purpose
-- stream emulation (send all chars from string one char after char)
local function lp_emulatestreammatch(pat, s, init, valuetable, ...)
    local start = initposition(s:len(), init) + 1
    local fce = lp_streammatch(pat, start, valuetable, ...)
    local ret, count = {}, 0
    for j = 1, #s do
        local pcount, pret = retcount(fce(s:sub(j, j), j == #s)) -- one char
        if pret[1] == -1 then
            return -- fail
        end
        -- collect the captures released by this step
        for i = 2, pcount do
            ret[count + i - 1] = pret[i]
        end
        count = count + pcount - 1
        if pret[1] == 0 then
            -- parsing finished
            return unpack(ret, 1, count)
        end
    end
    return select(2, fce(s, true)) -- empty string
end

-- Rebuild a pattern from the binary string 'str'.
--   fcetab   optional table name -> function; a saved function whose name is
--            found here is taken from the table instead of being loaded
--   usemeta  when true the pattern is created through the ctype constructors,
--            otherwise it is allocated with malloc and released by a finalizer
-- Returns the pattern and its value table.
-- NOTE(security): functions not supplied through 'fcetab' are rebuilt with
-- loadstring, i.e. code stored in 'str' gets compiled; load trusted data only.
local function lp_load(str, fcetab, usemeta)
    local index = 0
    assert(str)
    local ptr = ffi.cast('const char*', str)
    -- tree: element count followed by the raw elements
    local patsize = ffi.cast('uint32_t*', ptr + index)[0]
    index = index + 4
    local len = ffi.sizeof(treepatternelement) * patsize

    local pat
    if usemeta then
        pat = treepattern(patsize)
    else
        local raw = ffi.C.malloc(ffi.sizeof(treepattern, patsize))
        -- check before the memory is filled or handed to the finalizer
        assert(raw ~= nil)
        pat = ffi.gc(ffi.cast('TREEPATTERN*', raw),
            function(ct)
                if ct.code ~= nil then
                    ffi.C.free(ct.code.p)
                    ffi.C.free(ct.code)
                end
                ffi.C.free(ct)
            end)
        ffi.fill(pat, ffi.sizeof(treepattern, patsize))
        pat.treesize = patsize
        pat.id = 0
    end
    ffi.copy(pat.p, ptr + index, len)
    index = index + len
    if usemeta then
        pat.code = pattern()
    else
        pat.code = ffi.cast('PATTERN*', ffi.C.malloc(ffi.sizeof(pattern)))
        assert(pat.code ~= nil)
        pat.code.allocsize = 10
        pat.code.size = 0
        pat.code.p = ffi.C.malloc(ffi.sizeof(patternelement) * pat.code.allocsize)
        assert(pat.code.p ~= nil)
        ffi.fill(pat.code.p, ffi.sizeof(patternelement) * pat.code.allocsize)
    end
    -- code: instruction count followed by the raw instructions
    pat.code.size = ffi.cast('uint32_t*', ptr + index)[0]
    index = index + 4
    local codelen = pat.code.size * ffi.sizeof(patternelement)
    local data = ffi.string(ptr + index, codelen)
    index = index + codelen
    -- value table: entry count followed by tagged entries
    local count = ffi.cast('uint32_t*', ptr + index)[0]
    index = index + 4
    local valuetable = {}
    for i = 1, count do
        local tag = ffi.string(ptr + index, 3)
        index = index + 3
        --string
        if tag == 'str' then
            local len = ffi.cast('uint32_t*', ptr + index)[0]
            index = index + 4
            local val = ffi.string(ptr + index, len)
            index = index + len
            valuetable[#valuetable + 1] = val
        elseif tag == 'num' then
            --number
            local len = ffi.cast('uint32_t*', ptr + index)[0]
            index = index + 4
            local val = ffi.string(ptr + index, len)
            index = index + len
            valuetable[#valuetable + 1] = tonumber(val)
        elseif tag == 'cdt' then
            --ctype
            local val = settype()
            ffi.copy(val, ptr + index, ffi.sizeof(settype))
            index = index + ffi.sizeof(settype)
            valuetable[#valuetable + 1] = val
        elseif tag == 'fnc' then
            --function
            local len = ffi.cast('uint32_t*', ptr + index)[0]
            index = index + 4
            local fname = ffi.string(ptr + index, len)
            index = index + len
            len = ffi.cast('uint32_t*', ptr + index)[0]
            index = index + 4
            local val = ffi.string(ptr + index, len)
            index = index + len
            if fcetab and fcetab[fname] then
                assert(type(fcetab[fname]) == 'function', ('"%s" is not function'):format(fname))
                valuetable[#valuetable + 1] = fcetab[fname]
            else
                -- A nil result would append nothing and silently shift every
                -- following index of the value table, so fail loudly instead.
                valuetable[#valuetable + 1] = assert(loadstring(val))
            end
        end
    end
    pat.code.allocsize = pat.code.size
    pat.code.p = ffi.C.realloc(pat.code.p, ffi.sizeof(patternelement) * pat.code.allocsize)
    assert(pat.code.p ~= nil)
    ffi.copy(pat.code.p, data, ffi.sizeof(patternelement) * pat.code.allocsize)
    return pat, valuetable
end

-- Load a pattern from the file 'fname' (see lp_load for the other arguments).
-- Raises an error when the file cannot be opened or read.
local function lp_loadfile(fname, fcetab, usemeta)
    local file = assert(io.open(fname, 'rb'))
    -- close the handle before parsing so it is released even when
    -- reading or lp_load raises an error
    local data, err = file:read("*a")
    file:close()
    return lp_load(assert(data, err), fcetab, usemeta)
end

-- ======================================================

return {
    match = lp_match,
    streammatch = lp_streammatch,
    emulatestreammatch = lp_emulatestreammatch,
    load = lp_load,
    loadfile = lp_loadfile,
    setmax = setmax,
    setmaxbehind = setmaxbehind,
    enablememoization = enablememoization,
    enabletracing = enabletracing
}

--------------------------------------------------------------------------------
/src/re.lua:
--------------------------------------------------------------------------------
-- $Id: re.lua,v 1.44 2013/03/26 20:11:40 roberto Exp $
-- 2014/08/15 changes rostislav

-- imported functions and modules
local tonumber, print, error = tonumber, print, error
local setmetatable = setmetatable
local m = require"lpeglj"

-- 'm' will be used to parse expressions, and 'mm' will be used to
-- create expressions; that is, 're' runs on 'm', creating patterns
-- on 'mm'
local mm = m

-- pattern's metatable
local mt = getmetatable(mm.P(0))
mt = m.version() == "1.0.0.0LJ" and m or mt



-- No more global accesses after this point
local version = _VERSION
if version == "Lua 5.2" then _ENV = nil end


local any = m.P(1)


-- Pre-defined names
local Predef = { nl = m.P"\n" }


local mem
local fmem
local gmem


-- Refresh the predefined character classes (and their one-letter aliases and
-- complements) and restart the memoization tables.
local function updatelocale ()
  mm.locale(Predef)
  Predef.a = Predef.alpha
  Predef.c = Predef.cntrl
  Predef.d = Predef.digit
  Predef.g = Predef.graph
  Predef.l = Predef.lower
  Predef.p = Predef.punct
  Predef.s = Predef.space
  Predef.u = Predef.upper
  Predef.w = Predef.alnum
  Predef.x = Predef.xdigit
  Predef.A = any - Predef.a
  Predef.C = any - Predef.c
  Predef.D = any - Predef.d
  Predef.G = any - Predef.g
  Predef.L = any - Predef.l
  Predef.P = any - Predef.p
  Predef.S = any - Predef.s
  Predef.U = any - Predef.u
  Predef.W = any - Predef.w
  Predef.X = any - Predef.x
  mem = {}    -- restart memoization
  fmem = {}
  gmem = {}
  local mt = {__mode = "v"}
  setmetatable(mem, mt)
  setmetatable(fmem, mt)
  setmetatable(gmem, mt)
end


updatelocale()



-- debugging aid: prints the current position and the subject consumed so far
local I = m.P(function (s,i) print(i, s:sub(1, i-1)); return i end)


-- Look up 'id' in the table of definitions; raises an error when undefined.
local function getdef (id, defs)
  local c = defs and defs[id]
  if not c then error("undefined name: " .. id) end
  return c
end


-- Raise a "pattern error" quoting at most 21 characters starting at 'i'.
local function patt_error (s, i)
  local msg = (#s < i + 20) and s:sub(i)
                             or s:sub(i,i+20) .. "..."
  msg = ("pattern error near '%s'"):format(msg)
  error(msg, 2)
end

-- Pattern 'p' repeated exactly 'n' times (built by repeated squaring).
local function mult (p, n)
  local np = mm.P(true)
  while n >= 1 do
    if n%2 >= 1 then np = np * p end
    p = p * p
    n = n/2
  end
  return np
end

-- Match-time check that the subject at 'i' repeats the capture 'c';
-- in stream mode 's' is a function returning the requested substring.
local function equalcap (s, i, c)
  if type(c) ~= "string" then return nil end
  local e = #c + i
  if type(s) == 'function' then -- stream mode
    if s(i, e - 1) == c then return e else return nil end
  else
    if s:sub(i, e - 1) == c then return e else return nil end
  end
end


local S = (Predef.space + "--" * (any - Predef.nl)^0)^0

local name = m.R("AZ", "az", "__") * m.R("AZ", "az", "__", "09")^0

local arrow = S * "<-"

local seq_follow = m.P"/" + ")" + "}" + ":}" + "~}" + "|}" + (name * arrow) + -1

name = m.C(name)


-- a defined name only have meaning in a given environment
local Def = name * m.Carg(1)

local num = m.C(m.R"09"^1) * S / tonumber

local String = "'" * m.C((any - "'")^0) * "'" +
               '"' * m.C((any - '"')^0)
* '"' 129 | 130 | 131 | local defined = "%" * Def / function (c,Defs) 132 | local cat = Defs and Defs[c] or Predef[c] 133 | if not cat then error ("name '" .. c .. "' undefined") end 134 | return cat 135 | end 136 | 137 | local Range = m.Cs(any * (m.P"-"/"") * (any - "]")) / mm.R 138 | 139 | local item = defined + Range + m.C(any) 140 | 141 | local Class = 142 | "[" 143 | * (m.C(m.P"^"^-1)) -- optional complement symbol 144 | * m.Cf(item * (item - "]")^0, mt.__add) / 145 | function (c, p) return c == "^" and any - p or p end 146 | * "]" 147 | 148 | local function adddef (t, k, exp) 149 | if t[k] then 150 | error("'"..k.."' already defined as a rule") 151 | else 152 | t[k] = exp 153 | end 154 | return t 155 | end 156 | 157 | local function firstdef (n, r) return adddef({n}, n, r) end 158 | 159 | 160 | local function NT (n, b, p) 161 | if not b then 162 | error("rule '"..n.."' used outside a grammar") 163 | else return mm.V(n, p or 0) 164 | end 165 | end 166 | 167 | 168 | local exp = m.P{ "Exp", 169 | Exp = S * ( m.V"Grammar" 170 | + m.Cf(m.V"Seq" * ("/" * S * m.V"Seq")^0, mt.__add) ); 171 | Seq = m.Cf(m.Cc(m.P"") * m.V"Prefix"^0 , mt.__mul) 172 | * (#seq_follow + patt_error); 173 | Prefix = "&" * S * m.V"Prefix" / mt.__len 174 | + "!" * S * m.V"Prefix" / mt.__unm 175 | + m.V"Suffix"; 176 | Suffix = m.Cf(m.V"Primary" * S * 177 | ( ( m.P"+" * m.Cc(1, mt.__pow) 178 | + m.P"*" * m.Cc(0, mt.__pow) 179 | + m.P"?" 
* m.Cc(-1, mt.__pow) 180 | + "^" * ( m.Cg(num * m.Cc(mult)) 181 | + m.Cg(m.C(m.S"+-" * m.R"09"^1) * m.Cc(mt.__pow)) 182 | ) 183 | + "->" * S * ( m.Cg((String + num) * m.Cc(mt.__div)) 184 | + m.P"{}" * m.Cc(nil, m.Ct) 185 | + m.Cg(Def / getdef * m.Cc(mt.__div)) 186 | ) 187 | + "=>" * S * m.Cg(Def / getdef * m.Cc(m.Cmt)) 188 | ) * S 189 | )^0, function (a,b,f) return f(a,b) end ); 190 | Primary = "(" * m.V"Exp" * ")" 191 | + String / mm.P 192 | + Class 193 | + defined 194 | + "{:" * (name * ":" + m.Cc(nil)) * m.V"Exp" * ":}" / 195 | function (n, p) return mm.Cg(p, n) end 196 | + "=" * name / function (n) return mm.Cmt(mm.Cb(n), equalcap) end 197 | + m.P"{}" / mm.Cp 198 | + "{~" * m.V"Exp" * "~}" / mm.Cs 199 | + "{|" * m.V"Exp" * "|}" / mm.Ct 200 | + "{" * m.V"Exp" * "}" / mm.C 201 | + m.P"." * m.Cc(any) 202 | + (name * m.Cb("G") * (S * ":" * S * num)^-1 * -arrow + "<" * name * m.Cb("G") * (S * ":" * S * num)^-1 * ">") / NT; 203 | Definition = name * arrow * m.V"Exp"; 204 | Grammar = m.Cg(m.Cc(true), "G") * 205 | m.Cf(m.V"Definition" / firstdef * m.Cg(m.V"Definition")^0, 206 | adddef) / mm.P 207 | } 208 | 209 | local pattern = S * m.Cg(m.Cc(false), "G") * exp / mm.P * (-any + patt_error) 210 | 211 | 212 | local function compile (p, defs) 213 | if mm.type(p) == "pattern" then return p end -- already compiled 214 | local cp = pattern:match(p, 1, defs) 215 | if not cp then error("incorrect pattern", 3) end 216 | return cp 217 | end 218 | 219 | local function match (s, p, i) 220 | local cp = mem[p] 221 | if not cp then 222 | cp = compile(p) 223 | mem[p] = cp 224 | end 225 | return cp:match(s, i or 1) 226 | end 227 | 228 | local function streammatch (p, i) 229 | local cp = mem[p] 230 | if not cp then 231 | cp = compile(p) 232 | mem[p] = cp 233 | end 234 | return cp:streammatch(i or 1) 235 | end 236 | 237 | -- Only for testing purpose 238 | local function emulatestreammatch(s, p, i) 239 | local cp = mem[p] 240 | if not cp then 241 | cp = compile(p) 242 | mem[p] = cp 243 | 
end 244 | return cp:emulatestreammatch(s, i or 1) 245 | end 246 | 247 | local function find (s, p, i) 248 | local cp = fmem[p] 249 | if not cp then 250 | cp = compile(p) / 0 251 | cp = mm.P{ mm.Cp() * cp * mm.Cp() + 1 * mm.V(1) } 252 | fmem[p] = cp 253 | end 254 | local i, e = cp:match(s, i or 1) 255 | if i then return i, e - 1 256 | else return i 257 | end 258 | end 259 | 260 | local function gsub (s, p, rep) 261 | local g = gmem[p] or {} -- ensure gmem[p] is not collected while here 262 | gmem[p] = g 263 | local cp = g[rep] 264 | if not cp then 265 | cp = compile(p) 266 | cp = mm.Cs((cp / rep + 1)^0) 267 | g[rep] = cp 268 | end 269 | return cp:match(s) 270 | end 271 | 272 | 273 | -- exported names 274 | local re = { 275 | compile = compile, 276 | match = match, 277 | streammatch = streammatch, 278 | emulatestreammatch = emulatestreammatch, 279 | find = find, 280 | gsub = gsub, 281 | updatelocale = updatelocale, 282 | } 283 | 284 | if version == "Lua 5.1" then _G.re = re end 285 | 286 | return re 287 | -------------------------------------------------------------------------------- /tests/loadtest.lua: -------------------------------------------------------------------------------- 1 | local vm = require"lpvm" 2 | local m = require"lpeglj" 3 | local re = require"re" 4 | 5 | local function checkeq(x, y, p) 6 | if p then print(x, y) end 7 | if type(x) ~= "table" then assert(x == y) 8 | else 9 | for k, v in pairs(x) do checkeq(v, y[k], p) end 10 | for k, v in pairs(y) do checkeq(v, x[k], p) end 11 | end 12 | end 13 | 14 | print"Tests for LPegLJ pattern saving and loading" 15 | print("version " .. m.version()) 16 | 17 | local c = re.compile([[ 18 | s <- ({(!longstring .)+} / longstring)* 19 | longstring <- '[' {:init: '='* :} '[' close 20 | close <- ']' =init ']' / . 
close 21 | ]]) 22 | 23 | local teststring = 'data1[=[insidedata1]=]data2[==[====]==]data3[[]]' 24 | 25 | local patfile = 'test.pat' 26 | 27 | local patdata = c:dump() 28 | c:save(patfile) 29 | 30 | local pat = m.load(patdata) 31 | checkeq({ pat:match(teststring) }, { "data1", "data2", "data3" }) 32 | 33 | local pat = m.loadfile(patfile) 34 | checkeq({ pat:match(teststring) }, { "data1", "data2", "data3" }) 35 | 36 | -- use only vm module (lpvm + lpcap) 37 | local pat, valuetable = vm.load(patdata) 38 | checkeq({ vm.match(pat, teststring, 1, valuetable) }, { "data1", "data2", "data3" }) 39 | 40 | local pat, valuetable = vm.loadfile(patfile) 41 | checkeq({ vm.match(pat, teststring, 1, valuetable) }, { "data1", "data2", "data3" }) 42 | 43 | print('OK') 44 | -------------------------------------------------------------------------------- /tests/streamtest2.lua: -------------------------------------------------------------------------------- 1 | local m = require"lpeglj" 2 | local re = require"re" 3 | 4 | local function checkeq(x, y, p) 5 | if p then print(x, y) end 6 | if type(x) ~= "table" then assert(x == y) 7 | else 8 | for k, v in pairs(x) do checkeq(v, y[k], p) end 9 | for k, v in pairs(y) do checkeq(v, x[k], p) end 10 | end 11 | end 12 | 13 | local ret 14 | 15 | print"Tests for LPegLJ stream mode" 16 | 17 | assert(type(m.version()) == "string") 18 | print("version " .. 
m.version()) 19 | 20 | local pat = m.C('abcd') * m.C('x') 21 | local fce = pat:streammatch() 22 | 23 | ret = { fce("a") } 24 | checkeq(ret, { 1 }) 25 | ret = { fce("b") } 26 | checkeq(ret, { 1 }) 27 | ret = { fce("c") } 28 | checkeq(ret, { 1 }) 29 | ret = { fce("d") } 30 | checkeq(ret, { 1, "abcd" }) 31 | ret = { fce("x") } 32 | checkeq(ret, { 0, 'x' }) 33 | 34 | local pat = m.C('abcd') * m.C('x') + m.C('abcd') * m.C('y') 35 | local fce = pat:streammatch() 36 | ret = { fce("abcd") } 37 | checkeq(ret, { 1 }) 38 | ret = { fce("y") } 39 | checkeq(ret, { 0, "abcd", "y" }) 40 | 41 | local pat = m.C('abcd') ^ 0 * m.C('x') 42 | local fce = pat:streammatch() 43 | for i = 1, 1e3 do 44 | ret = { fce("ab") } 45 | checkeq(ret, { 1 }) 46 | ret = { fce("cd") } 47 | checkeq(ret, { 1, "abcd" }) 48 | end 49 | ret = { fce("x") } 50 | checkeq(ret, { 0, "x" }) 51 | 52 | local pat = (m.C('abcd') / "out") ^ 0 * m.C('x') 53 | local fce = pat:streammatch() 54 | for i = 1, 1e3 do 55 | ret = { fce("ab") } 56 | checkeq(ret, { 1 }) 57 | ret = { fce("cd") } 58 | checkeq(ret, { 1, "out" }) 59 | end 60 | ret = { fce("x") } 61 | checkeq(ret, { 0, "x" }) 62 | 63 | local pat = (m.C('abcd') / "pattern1" + m.C('efgh') / "pattern2" + (m.P(1) - 'xyz')) ^ 0 * (m.C("xyz") / "pattern3") 64 | local fce = pat:streammatch() 65 | 66 | for i = 1, 1e3 do 67 | ret = { fce("ef") } 68 | checkeq(ret, { 1 }) 69 | ret = { fce("gh") } 70 | checkeq(ret, { 1, "pattern2" }) 71 | ret = { fce("a") } 72 | checkeq(ret, { 1 }) 73 | ret = { fce("bcd") } 74 | checkeq(ret, { 1, "pattern1" }) 75 | end 76 | ret = { fce("xyz") } 77 | checkeq(ret, { 0, "pattern3" }) 78 | 79 | local pat = m.P('abcd') * -1 80 | local fce = pat:streammatch() 81 | ret = { fce("abc") } 82 | checkeq(ret, { 1 }) 83 | ret = { fce("d") } 84 | checkeq(ret, { 1 }) 85 | ret = { fce("", true) } 86 | checkeq(ret, { 0, 5 }) 87 | 88 | local field = '"' * m.Cs(((m.P(1) - '"') + m.P'""' / '"') ^ 0) * '"' + 89 | m.C((1 - m.S',\n"') ^ 0) 90 | 91 | local record = field 
* (',' * field) ^ 0 * (m.P'\n' + -1) 92 | 93 | local fce = record:streammatch() 94 | ret = { fce('ab') } 95 | checkeq(ret, { 1 }) 96 | ret = { fce('c') } 97 | checkeq(ret, { 1 }) 98 | ret = { fce(',"def",') } 99 | checkeq(ret, { 1, 'abc', 'def' }) 100 | ret = { fce('x', true) } 101 | checkeq(ret, { 0, 'x' }) 102 | 103 | record = re.compile[[ 104 | record <- field (',' field)* (%nl / !.) 105 | field <- escaped / nonescaped 106 | nonescaped <- { [^,"%nl]* } 107 | escaped <- '"' {~ ([^"] / '""' -> '"')* ~} '"' 108 | ]] 109 | 110 | local fce = record:streammatch() 111 | ret = { fce("a") } 112 | checkeq(ret, { 1 }) 113 | ret = { fce("bc,") } 114 | checkeq(ret, { 1, 'abc' }) 115 | ret = { fce("def", true) } 116 | checkeq(ret, { 0, 'def' }) 117 | 118 | local c = re.compile([[ 119 | s <- ({(!longstring .)+} / longstring)* 120 | longstring <- '[' {:init: '='* :} '[' close 121 | close <- ']' =init ']' / . close 122 | ]]) 123 | 124 | local teststring = 'data1[=[insidedata1]=]data2[==[====]==]data3[[]]' 125 | 126 | local output = { 'data1', 'data2', 'data3' } 127 | 128 | local fce = c:streammatch() 129 | 130 | local index = 1 131 | 132 | for i = 1, #output do 133 | local status, data 134 | repeat 135 | status, data = fce(teststring:sub(index, index), index == #teststring) 136 | index = index + 1 137 | until data or status ~= 1 138 | checkeq(output[i], data) 139 | end 140 | 141 | local pat = m.C('a') * m.Cg('b', 'backref1') * m.C('c') * m.Cg('d', 'backref2') * m.C('e') * m.Cg('f', 'backref3') * 142 | m.Cb('backref1') * m.C('g') * m.Cb('backref2') * m.C('h') * m.Cb('backref3') * m.C('i') 143 | local fce = pat:streammatch() 144 | 145 | ret = { fce("a") } 146 | checkeq(ret, { 1, 'a' }) 147 | ret = { fce("b") } 148 | checkeq(ret, { 1 }) 149 | ret = { fce("c") } 150 | checkeq(ret, { 1, "c" }) 151 | ret = { fce("d") } 152 | checkeq(ret, { 1, }) 153 | ret = { fce("e") } 154 | checkeq(ret, { 1, "e" }) 155 | ret = { fce("f") } 156 | checkeq(ret, { 1, "b" }) 157 | ret = { fce("g") } 158 
| checkeq(ret, { 1, "g", "d" }) 159 | ret = { fce("h") } 160 | checkeq(ret, { 1, "h", "f" }) 161 | ret = { fce("i") } 162 | checkeq(ret, { 0, "i" }) 163 | 164 | local pat = m.C('a') * (m.Cg(1, 'backref') * m.C('x1') * m.Cb('backref') + m.Cg(1, 'backref') * m.C('x2') * m.Cb('backref')) 165 | local fce = pat:streammatch() 166 | ret = { fce("a") } 167 | checkeq(ret, { 1, 'a' }) 168 | ret = { fce("x") } 169 | checkeq(ret, { 1 }) 170 | ret = { fce("x") } 171 | checkeq(ret, { 1 }) 172 | ret = { fce("2") } 173 | checkeq(ret, { 0, 'x2', 'x' }) 174 | 175 | 176 | local pat = m.C('a') * m.Ct(m.Cg('b', 'index')) * m.C('c') 177 | local fce = pat:streammatch() 178 | 179 | ret = { fce("a") } 180 | checkeq(ret, { 1, 'a' }) 181 | ret = { fce("b") } 182 | checkeq(ret, { 1, { index = 'b' } }) 183 | ret = { fce("c") } 184 | checkeq(ret, { 0, 'c' }) 185 | 186 | print('OK') 187 | 188 | -------------------------------------------------------------------------------- /tests/testlr.lua: -------------------------------------------------------------------------------- 1 | local lpeg = require"lpeglj" 2 | local re = require"re" 3 | 4 | local m = lpeg 5 | 6 | local function checkeq(x, y, p) 7 | if p then print(x, y) end 8 | if type(x) ~= "table" then assert(x == y) 9 | else 10 | for k, v in pairs(x) do checkeq(v, y[k], p) end 11 | for k, v in pairs(y) do checkeq(v, x[k], p) end 12 | end 13 | end 14 | 15 | print"Tests for LPegLJ left recursion" 16 | 17 | assert(type(m.version()) == "string") 18 | print("version " .. 
m.version()) 19 | 20 | m.enableleftrecursion(true) 21 | 22 | --[[ 23 | direct left recursion 24 | E ← E + n / n 25 | --]] 26 | 27 | local pat = m.P{ 28 | "E"; 29 | E = m.V"E" * '+' * "n" + "n", 30 | } 31 | 32 | assert(pat:match("n+n+n") == 6) 33 | 34 | --[[ 35 | indirect left recursion 36 | L ← P.x / x 37 | P ← P(n) / L 38 | --]] 39 | 40 | local pat = m.P{ 41 | "L"; 42 | L = m.V"P" * ".x" + "x", 43 | P = m.V"P" * "(n)" + m.V"L" 44 | } 45 | 46 | assert(pat:match("x(n)(n).x(n).x") == 15) 47 | 48 | --[[ 49 | left and right recursion with precedence rules 50 | E ← E1 + E2 / E1 − E2 / E2 ∗ E3 / E2 ÷ E3 / E3 ∗∗ E3 / − E4 / (E1) / n 51 | --]] 52 | 53 | 54 | local pat = m.P{ 55 | "E", 56 | E = m.V("E", 1) * m.S'+-' * m.V("E", 2) + 57 | m.V("E", 2) * m.S'*/' * m.V("E", 3) + 58 | m.V("E", 3) * '**' * m.V("E", 3) + 59 | '-' * m.V("E", 4) + 60 | '(' * m.V("E") * ')' + 61 | m.R'09' ^ 1, 62 | } 63 | 64 | assert(pat:match("-1*(6+2/4+3-1)**2") == 18) 65 | 66 | --[[ 67 | left and right recursion with precedence rules 68 | E ← E1 + E2 / E1 − E2 / E2 ∗ E3 / E2 ÷ E3 / E3 ∗∗ E3 / − E4 / (E1) / n 69 | create AST tree 70 | --]] 71 | 72 | 73 | local pat = m.P{ 74 | "E", 75 | E = m.Ct(m.V("E", 1) * m.C(m.S'+-') * m.V("E", 2) + 76 | m.V("E", 2) * m.C(m.S'*/') * m.V("E", 3) + 77 | m.V("E", 3) * m.C('**') * m.V("E", 3) + 78 | m.C('-') * m.V("E", 4) + 79 | '(' * m.V("E") * ')' + 80 | m.C(m.R'09' ^ 1)), 81 | } 82 | 83 | local ASTtree = pat:match("1+1+1") 84 | checkeq(ASTtree, { { { "1" }, "+", { "1" } }, "+", { "1" } }) 85 | 86 | local ASTtree = pat:match("-1*(6+2/4+3-1)**2") 87 | checkeq(ASTtree, { { "-", { "1" } }, "*", { { { { { { "6" }, "+", { { "2" }, "/", { "4" } } }, "+", { "3" } }, "-", { "1" } } }, "**", { "2" } } }) 88 | 89 | -- using re module with precedence (the same example as above) 90 | -- call_nonterminal : precedence_level or 91 | 92 | local pat = [[ 93 | E <- (E:1 {[+-]} E:2 / 94 | E:2 {[*/]} E:3 / 95 | E:3 {'**'} E:3 / 96 | {'-'} E:4 / 97 | '(' E ')' / 98 | {[0-9]+}) -> {} 
99 | ]] 100 | 101 | local ASTtree = re.match("-1*(6+2/4+3-1)**2", pat) 102 | checkeq(ASTtree, { { "-", { "1" } }, "*", { { { { { { "6" }, "+", { { "2" }, "/", { "4" } } }, "+", { "3" } }, "-", { "1" } } }, "**", { "2" } } }) 103 | 104 | --[[ 105 | simple evaluator 106 | E ← E1 + E2 / E1 − E2 / E2 ∗ E3 / E2 ÷ E3 / E3 ∗∗ E3 / − E4 / (E1) / n 107 | --]] 108 | 109 | local eval = function(s, i, p1, p2, p3) 110 | local res 111 | if p2 == '+' then 112 | res = p1 + p3 113 | elseif p2 == '-' then 114 | res = p1 - p3 115 | elseif p2 == '*' then 116 | res = p1 * p3 117 | elseif p2 == '/' then 118 | res = p1 / p3 119 | elseif p1 == '-' then 120 | res = -p2 121 | elseif p2 == '**' then 122 | res = p1 ^ p3 123 | else 124 | res = p1 125 | end 126 | return true, res 127 | end 128 | 129 | 130 | local pat = m.P{ 131 | "E", 132 | E = m.Cmt(m.V("E", 1) * m.C(m.S'+-') * m.V("E", 2) + 133 | m.V("E", 2) * m.C(m.S'*/') * m.V("E", 3) + 134 | m.V("E", 3) * m.C('**') * m.V("E", 3) + 135 | m.C('-') * m.V("E", 4) + 136 | '(' * m.V("E") * ')' + 137 | m.C(m.R'09' ^ 1), eval), 138 | } 139 | 140 | assert(pat:match("-1*(6+2/4+3-1)**2") == -72.25) 141 | 142 | 143 | local pat = m.P{ 144 | "E", 145 | E = m.V("E", 1) * '+' * m.V("E", 2) / function(c1, c2) return c1 + c2 end + 146 | m.V("E", 1) * '-' * m.V("E", 2) / function(c1, c2) return c1 - c2 end + 147 | m.V("E", 2) * '*' * m.V("E", 3) / function(c1, c2) return c1 * c2 end + 148 | m.V("E", 2) * '/' * m.V("E", 3) / function(c1, c2) return c1 / c2 end + 149 | m.V("E", 3) * '**' * m.V("E", 3) / function(c1, c2) return c1 ^ c2 end + 150 | '-' * m.V("E", 4) / function(c1) return -c1 end + 151 | '(' * m.V("E") * ')' + 152 | m.C(m.R'09' ^ 1), 153 | } 154 | 155 | assert(pat:match("-1*(6+2/4+3-1)**2") == -72.25) 156 | 157 | local def = { 158 | plus = function(p1, p2) return p1 + p2 end, 159 | minus = function(p1, p2) return p1 - p2 end, 160 | mult = function(p1, p2) return p1 * p2 end, 161 | div = function(p1, p2) return p1 / p2 end, 162 | pow = 
function(p1, p2) return p1 ^ p2 end, 163 | uminus = function(p1) return -p1 end, 164 | errfce = function(o, i) 165 | local errstr = o .. '\n' .. (' '):rep(i) .. '^' .. '\n' 166 | io.write(errstr) 167 | return false 168 | end, 169 | } 170 | 171 | local pat = [[ 172 | P <- E s (!. / error) 173 | s <- %s* 174 | error <- '' => errfce 175 | E <- (E:1 s'+' E:2) -> plus / 176 | (E:1 s'-' E:2) -> minus / 177 | (E:2 s'*' E:3) -> mult / 178 | (E:2 s'/' E:3) -> div / 179 | (E:3 s'**' E:3)-> pow / 180 | (s'-' E:4) -> uminus / 181 | s'(' E s')' / 182 | s{[0-9]+} / 183 | error 184 | ]] 185 | 186 | local pat = re.compile(pat, def) 187 | assert(re.match("-1 * (6 + 2 / 4 + 3 - 1)**2", pat) == -72.25) 188 | 189 | local pat = [[ 190 | A <- B "a" 191 | B <- C "b" 192 | C <- B / A / "c" 193 | ]] 194 | 195 | local pat = re.compile(pat) 196 | assert(re.match("cbbabbba", pat) == 9) 197 | 198 | local pat = [[ 199 | S <- A / B 200 | A <- A "a" / B / "a" 201 | B <- B "b" / A / "b" 202 | ]] 203 | 204 | local pat = re.compile(pat) 205 | assert(re.match("baabbaaa", pat) == 9) 206 | 207 | print"OK" 208 | --------------------------------------------------------------------------------