├── .github └── FUNDING.yml ├── LICENSE ├── README.md ├── base ├── ast.lua ├── datareader.lua ├── parser.lua └── tokenizer.lua ├── distinfo ├── grammar ├── parser.lua └── tokenizer.lua ├── load_xform.lua ├── lua ├── ast.lua ├── parser.lua └── tokenizer.lua ├── parser.lua ├── parser.rockspec ├── syntax_5.0.txt ├── syntax_5.1.txt ├── syntax_5.2.txt ├── syntax_5.3.txt ├── syntax_5.4.txt ├── syntax_ast_5.1.txt ├── syntax_grammar.txt └── tests ├── flatten.lua ├── lua_to_c.lua ├── lua_to_c_test.lua ├── parse.lua ├── parsemyself.lua ├── spantest.lua ├── strings.lua ├── validate-key.txt └── validate.lua /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: thenumbernine # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 12 | polar: # Replace with a single Polar username 13 | buy_me_a_coffee: thenumbernine # Replace with a single Buy Me a Coffee username 14 | thanks_dev: # Replace with a single thanks.dev username 15 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017-2025 Christopher E. 
Moore ( christopher.e.moore@gmail.com / http://thenumbernine.github.io ) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Donate via Stripe](https://img.shields.io/badge/Donate-Stripe-green.svg)](https://buy.stripe.com/00gbJZ0OdcNs9zi288)
2 | 3 | # Lua Parser in Lua 4 | 5 | Parses to an abstract syntax tree representation. 6 | Call tostring() on the AST to get equivalent Lua code. 7 | 8 | Works for versions ~~5.1 5.2~~ 5.3 5.4 and Luajit. I broke <=5.2 compatability when I resorted to throwing objects for parse error reporting. 9 | 10 | AST also contains some functions like flatten() for use with optimizing / auto-inlining Lua. 11 | 12 | See the tests folder for example usage. 13 | 14 | ### Reference 15 | 16 | `Parser = require 'parser'` 17 | This will return the parser class. 18 | 19 | `result, msg = Parser.parse(data[, source, version, useluajit])` 20 | This parses the code in `data` and returns an `ast._block` object. 21 | This is shorthand for `Parser(data, source, version, useluajit).tree` 22 | `version` is a string `'5.3', '5.4'`, etc., corresponding to your Lua version. 23 | The `Parser` object has a few more functions to it corresponding with internal use while parsing. 24 | `source` is a description of the source, i.e. filename, which is included in some nodes (functions) for information on where they are declared. 25 | Returns `result` in case of success. If it encounters a parse error returns `false` and `msg` as what went wrong. 26 | 27 | `ast = require 'parser.lua.ast'` 28 | This is the AST (abstract syntax tree) library, 29 | it hold a collection of AST classes, each representing a different token in the Lua syntax. 30 | 31 | 32 | `n = ast.node()` 33 | = This is the superclass of all AST classes. 34 | 35 | Each has the following properties: 36 | 37 | `n.type` = returns the type of the node, coinciding with the classname in the `ast` library with underscore removed. 38 | 39 | `n.span` = source code span information (`from` and `to` subtables each with `source`, `line` and `col` fields) 40 | 41 | `n:copy()` = returns a copy of the node. 42 | 43 | `n:flatten(func, varmap)` = flattens / inlines the contents of all function call of this function. Used for performance optimizations. 
44 | 45 | `n:toLua()` = generate Lua code. same as the node's `__tostring`. 46 | 47 | `n:serialize(apply)` = apply a to-string serialization function to the AST. 48 | 49 | ## ast.node subclasses: 50 | 51 | `n = ast._block(...)` = a block of code in Lua.
52 | `...` is a list of initial child `stmt` nodes to populate the `block` node with.
53 | `n.type == 'block'`.
54 | `n[1] ... n[#n] =` nodes of statements within the block.
55 |
56 | `n = ast._stmt()` = a statement-node parent-class.
57 |
58 | `n = ast._assign(vars, exprs)` =
59 | An assignment operation.
60 | Subclass of `_stmt`.
61 | `n.type == 'assign'`.
62 | Represents the assignment of `n.vars` to `n.exprs`.
63 |
64 | `n = ast._do(...)` =
65 | A `do ... end` block.
66 | Subclass of `_stmt`.
67 | `n.type == 'do'`.
68 | `n[1] ... n[#n] =` nodes of statements within the block.
69 |
70 | `n = ast._while(cond, ...)` =
71 | A `while cond do ... end` block.
72 | Subclass of `_stmt`.
73 | `n.type == 'while'`.
74 | `n.cond` holds the condition expression.
75 | `n[1] ... n[#n] =` nodes of statements within the block.
76 |
77 | `n = ast._repeat(cond, ...)` =
78 | A `repeat ... until cond` block.
79 | Subclass of `_stmt`.
80 | `n.type == 'repeat'`.
81 | `n.cond` holds the condition expression.
82 | `n[1] ... n[#n] =` nodes of statements within the block.
83 |
84 | `n = ast._if(cond, ...)` =
85 | A `if cond then ... elseif ... else ... end` block.
86 | Subclass of `_stmt`.
87 | `n.type == 'if'`.
88 | `n.cond` holds the condition expression of the first `if` statement.
89 | All subsequent arguments must be `ast._elseif` objects, optionally with a final `ast._else` object.
90 | `n.elseifs` holds the `ast._elseif` objects.
91 | `n.elsestmt` optionally holds the final `ast._else`.
92 |
93 | `n = ast._elseif(cond, ...)` =
94 | A `elseif cond then ...` block.
95 | Subclass of `_stmt`.
96 | `n.type == 'elseif'`.
97 | `n.cond` holds the condition expression of the `elseif` statement.
98 | `n[1] ... n[#n] =` nodes of statements within the block.
99 |
100 | `n = ast._else(...)` =
101 | A `else ...` block.
102 | `n.type == 'else'`.
103 | `n[1] ... n[#n] =` nodes of statements within the block.
104 |
105 | `n = ast._foreq(var, min, max, step, ...)` =
106 | A `for var=min,max[,step] do ... end` block.
107 | Subclass of `_stmt`.
108 | `n.type == 'foreq'`.
109 | `n.var =` the variable node.
110 | `n.min =` the min expression.
111 | `n.max =` the max expression.
112 | `n.step =` the optional step expression.
113 | `n[1] ... n[#n] =` nodes of statements within the block.
114 |
115 | `n = ast._forin(vars, iterexprs, ...)`
116 | A `for var1,...varN in expr1,...exprN do ... end` block.
117 | Subclass of `_stmt`.
118 | `n.type == 'forin'`.
119 | `n.vars = ` table of variables of the for-in loop.
120 | `n.iterexprs = ` table of iterator expressions of the for-in loop.
121 | `n[1] ... n[#n] =` nodes of statements within the block.
122 |
123 | `n = ast._function(name, args, ...)`
124 | A `function [name](arg1, ...argN) ... end` block.
125 | Subclass of `_stmt`.
126 | `n.type == 'function'`.
127 | `n.name = ` the function name. This is optional. Omit the name for this to represent a lambda function. (Which technically becomes an expression and not a statement...)
128 | `n.args = ` table of arguments. This does get modified: each argument gets assigned an `.param = true`, and an `.index =` for which index it is in the argument list.
129 | `n[1] ... n[#n] =` nodes of statements within the block.
130 |
131 | `n = ast._local(exprs)`
132 | A `local ...` statement.
133 | Subclass of `_stmt`.
134 | `n.type == 'local'`
135 | `n.exprs =` list of expressions to be declared as locals.
136 | Expects its member-expressions to be either functions or assigns.
137 |
138 | `n = ast._return(...)`
139 | A `return ...` statement.
140 | Subclass of `_stmt`.
141 | `n.type == 'return'`
142 | `n.exprs =` list of expressions to return.
143 |
144 | `n = ast._break(...)`
145 | A `break` statement.
146 | Subclass of `_stmt`.
147 | `n.type == 'break'`
148 |
149 | `n = ast._call(func, ...)`
150 | A `func(...)` function-call expression.
151 | `n.type == 'call'`
152 | `n.func =` expression of the function to call.
153 | `n.args =` list of argument expressions to pass into the function-call.
154 |
155 | `n = ast._nil()`
156 | A `nil` literal expression.
157 | `n.type == 'nil'`.
158 | `n.const == true`.
159 | 160 | `n = ast._boolean()`
161 | The parent class of the `true`/`false` AST nodes. 162 |
163 | `n = ast._true()`
164 | A `true` boolean literal expression.
165 | `n.type == 'true'`.
166 | `n.const == true`.
167 | `n.value == true`.
168 | `ast._boolean:isa(n)` evaluates to `true`
169 |
170 | `n = ast._false()`
171 | A `false` boolean literal expression.
172 | `n.type == 'false'`.
173 | `n.const == true`.
174 | `n.value == false`.
175 | `ast._boolean:isa(n)` evaluates to `true`
176 |
177 | `n = ast._number(value)`
178 | A numeric literal expression.
179 | `n.type == 'number'`.
180 | `n.value =` the numerical value.
181 |
182 | `n = ast._string(value)`
183 | A string literal expression.
184 | `n.type == 'string'`.
185 | `n.value =` the string value.
186 |
187 | `n = ast._vararg()`
188 | A vararg `...` expression.
189 | `n.type == 'vararg'`.
190 | For use within function arguments, assignment expressions, function calls, etc.
191 |
192 | `n = ast._table(...)`
193 | A table `{ ... }` expression.
194 | `n.type == 'table'`.
195 | `n[1] ... n[#n] =` expressions of the table.
196 | If the expression in `n[i]` is an `ast._assign` then an entry is added into the table as `key = value`. If it is not an `ast._assign` then it is inserted as a sequenced entry.
197 |
198 | `n = ast._var(name)`
199 | A variable reference expression.
200 | `n.type == 'var'`
201 | `n.name =` the variable name.
202 |
203 | `n = ast._par(expr)`
204 | A `( ... )` parenthesis expression.
205 | `n.type == 'par'`.
206 | `n.expr =` the expression within the parenthesis.
207 |
208 | `n = ast._index(expr, key)`
209 | An `expr[key]` expression, i.e. an `__index`-metatable operation.
210 | `n.type == 'index'`.
211 | `n.expr =` the expression to be indexed.
212 | `n.key =` the expression of the index key.
213 |
214 | `n = ast._indexself(expr, key)`
215 | An `expr:key` expression, to be used as the expression of an `ast._call` node for member-function-calls. These are Lua's shorthand insertion of `self` as the first argument.
216 | `n.type == 'indexself'`.
217 | `n.expr =` the expression to be indexed.
218 | `n.key =` the key to index. Must only be a Lua string (not an `ast._string`, but a real Lua string).
219 | 220 | Binary operations: 221 | 222 | |node type|Lua operator| | 223 | |---------|------------|------| 224 | |`_add` |`+` | | 225 | |`_sub` |`-` | | 226 | |`_mul` |`*` | | 227 | |`_div` |`/` | | 228 | |`_mod` |`%` | | 229 | |`_concat`|`..` | | 230 | |`_lt` |`<` | | 231 | |`_le` |`<=` | | 232 | |`_gt` |`>` | | 233 | |`_ge` |`>=` | | 234 | |`_eq` |`==` | | 235 | |`_ne` |`~=` | | 236 | |`_and` |`and` | | 237 | |`_or` |`or` | | 238 | |`_idiv` |`//` | 5.3+ | 239 | |`_band` |`&` | 5.3+ | 240 | |`_bxor` |`~` | 5.3+ | 241 | |`_bor` |`\|` | 5.3+ | 242 | |`_shl` |`<<` | 5.3+ | 243 | |`_shr` |`>>` | 5.3+ | 244 | 245 | `n[1] ... n[#n] =` a table of the arguments of the operation. 246 | 247 | Unary operations: 248 | 249 | |node type|Lua operator| | 250 | |---------|------------|------| 251 | |`_unm` |`-` | | 252 | |`_not` |`not` | | 253 | |`_len` |`#` | | 254 | |`_bnot` |`~` | 5.3+ | 255 | 256 | `n[1] =` the single argument of the operation. 257 | 258 | ## more extra functions: 259 | 260 | Some more useful functions in AST: 261 | - `ast.copy(node)` = equivalent of `node:copy()` 262 | - `ast.flatten(node, func, varmap)` = equivalent of `node:flatten(func, varmap)` 263 | - `ast.refreshparents` 264 | - `ast.traverse` 265 | - `ast.nodeclass(type, parent, args)` 266 | - `ast.tostringmethod` = this specifies the serialization method. It is used to look up the serializer stored in `ast.tostringmethods` 267 | - `parser.load_xform` works with my `ext.load` shim load layer to allow you to modify the AST of all subsequent loaded Lua code. 268 | 269 | 270 | ### TODO: 271 | 272 | - Option for parsing LuaJIT -i number suffixes. 273 | - Speaking of LuaJIT, it has different edge case syntax for 2.0.5, 2.1.0, and whether 5.2-compat is enabled or not. It isn't passing the `validate.lua`. 
274 | - How about flags to turn off and on each feature, then a function for auto-detect flag sets based on Lua VERSION string or by running some local `load()` tests 275 | - Make all node allocation routed through `Parser:node` to give the node a .parser field to point back to the parser - necessary for certain AST nodes that need to tell what parser keywords are allowed. I do this where necessary but I should do it always. 276 | - I've also made this keyword test optional since in some rare projects (`vec-lua` for one) I am inserting AST nodes for the sake of a portable AST that I can inject as inline'd code, but without a parser, so I don't have a proper enumeration of keywords. So for now I'm making ast node `.parser` optional and the keyword test bypassed if `.parser` isn't present. I'll probably make it a hard constraint later when I rework `vec-lua`. 277 | - It seems like a quick fix to just convert all `a.b`s into `a['b']`s ... but Lua for some reason doesn't support `a['b']:c()` as an equivalent of `a.b:c()` ... so converting everything from dot to brack index could break some regenerated Lua scripts. 278 | - To preserve spacing and comments (useful for my [`langfix`](https://github.com/thenumbernine/langfix-lua) transpiler), instead of using ast fields which are tokens, I should use token-references as fields and allow them to be replaced ... maybe ... 279 | - I'm very tempted to switch the AST index names to remove the preceding underscore. Pro of keeping it: the keywords become valid Lua names. Pro of removing it: the AST index matches the keyword that the AST node represents ... 
280 | 281 | ### Dependencies: 282 | 283 | - https://github.com/thenumbernine/lua-ext 284 | - https://github.com/thenumbernine/lua-template 285 | 286 | `validate-key.txt` originally taken from `minify_tests.txt` at https://github.com/stravant/LuaMinify 287 | -------------------------------------------------------------------------------- /base/ast.lua: -------------------------------------------------------------------------------- 1 | local table = require 'ext.table' 2 | local string = require 'ext.string' 3 | local class = require 'ext.class' 4 | 5 | local BaseAST = class() 6 | 7 | -- this is too relaxed, since concat maps to tostring maps to toLua, and I want toLua only called from external, and toLua_recursive from internal 8 | --BaseAST.__concat = string.concat 9 | 10 | function BaseAST:setspan(span) 11 | self.span = span 12 | return self 13 | end 14 | 15 | -- returns ancestors as a table, including self 16 | function BaseAST:ancestors() 17 | local n = self 18 | local t = table() 19 | repeat 20 | t:insert(n) 21 | n = n.parent 22 | until not n 23 | return t 24 | end 25 | 26 | -- TODO move traverse flatten etc here once the fields problem is sorted out 27 | 28 | return BaseAST 29 | -------------------------------------------------------------------------------- /base/datareader.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | TODO 3 | store all tokens(term?) as we go in tokenhistory 4 | then have Tokenizer keep track of the range in this array / forward it to be used as the span in AST 5 | then the AST can look into this array, (maybe also keep track of which tokens are whitespace/comments) 6 | ... and reproduce the original file exactly as-is (if so desired). 7 | 8 | TODO make sure *all* tokens are correctly stored in tokenhistory. right now it doesn't reproduce source in 100% of cases. maybe just 99%. 9 | 10 | TODO terminology ... 11 | DataReader gets chars as input, turns them into ... collections-of-chars? 
12 | Tokenizer gets collections-of-chars as input, turns them into tokens 13 | Parser gets tokens as input, turns them into AST nodes 14 | --]] 15 | local table = require 'ext.table' 16 | local class = require 'ext.class' 17 | local assert = require 'ext.assert' 18 | 19 | local DataReader = class() 20 | 21 | -- At the moment this is 100% cosmetic. 22 | -- In case someone doesn't want tracking all tokens done for whatever reason (slowdown, memory, etc) 23 | -- enable/disable this to make token-tracking optional 24 | DataReader.tracktokens = true 25 | 26 | function DataReader:init(data) 27 | self.data = data 28 | self.index = 1 29 | 30 | -- keep track of all tokens as we parse them. 31 | self.tokenhistory = table() 32 | 33 | -- TODO this isn't robust against different OS file formats. maybe switching back to determining line number offline / upon error encounter is better than trying to track it while we parse. 34 | self.line = 1 35 | self.col = 1 36 | end 37 | 38 | function DataReader:done() 39 | return self.index > #self.data 40 | end 41 | 42 | local slashNByte = ('\n'):byte() 43 | function DataReader:updatelinecol() 44 | if not self.lastUpdateLineColIndex then 45 | self.lastUpdateLineColIndex = 1 46 | else 47 | assert.ge(self.index, self.lastUpdateLineColIndex) 48 | end 49 | for i=self.lastUpdateLineColIndex,self.index do 50 | if self.data:byte(i,i) == slashNByte then 51 | self.col = 1 52 | self.line = self.line + 1 53 | else 54 | self.col = self.col + 1 55 | end 56 | end 57 | self.lastUpdateLineColIndex = self.index+1 58 | end 59 | 60 | function DataReader:setlasttoken(lasttoken, skipped) 61 | self.lasttoken = lasttoken 62 | if self.tracktokens then 63 | if skipped and #skipped > 0 then 64 | --DEBUG(@5): print('SKIPPED', require 'ext.tolua'(skipped)) 65 | self.tokenhistory:insert(skipped) 66 | end 67 | --DEBUG(@5): print('TOKEN', require 'ext.tolua'(self.lasttoken)) 68 | self.tokenhistory:insert(self.lasttoken) 69 | --DEBUG(paranoid): local sofar = 
self.tokenhistory:concat() 70 | --DEBUG(paranoid): assert.eq(self.data:sub(1,#sofar), sofar, "source vs tokenhistory") 71 | end 72 | return self.lasttoken 73 | end 74 | 75 | function DataReader:seekpast(pattern) 76 | --DEBUG(@5): print('DataReader:seekpast', require 'ext.tolua'(pattern)) 77 | local from, to = self.data:find(pattern, self.index) 78 | if not from then return end 79 | local skipped = self.data:sub(self.index, from - 1) 80 | self.index = to + 1 81 | self:updatelinecol() 82 | return self:setlasttoken(self.data:sub(from, to), skipped) 83 | end 84 | 85 | function DataReader:canbe(pattern) 86 | --DEBUG(@5): print('DataReader:canbe', require 'ext.tolua'(pattern)) 87 | return self:seekpast('^'..pattern) 88 | end 89 | 90 | function DataReader:mustbe(pattern, msg) 91 | --DEBUG(@5): print('DataReader:mustbe', require 'ext.tolua'(pattern)) 92 | if not self:canbe(pattern) then error{msg=msg or "expected "..pattern} end 93 | return self.lasttoken 94 | end 95 | 96 | return DataReader 97 | -------------------------------------------------------------------------------- /base/parser.lua: -------------------------------------------------------------------------------- 1 | local class = require 'ext.class' 2 | local table = require 'ext.table' 3 | local tolua = require 'ext.tolua' 4 | 5 | local Parser = class() 6 | 7 | -- seems redundant. does anyone need to construct a Parser without data? maybe to modify the syntax or something? just build a subclass in that case? 8 | function Parser:init(data, ...) 
9 | if data then 10 | assert(self:setData(data, ...)) 11 | end 12 | end 13 | 14 | --[[ 15 | returns 16 | true upon success 17 | nil, msg, loc upon failure 18 | --]] 19 | function Parser:setData(data, source) 20 | assert(data, "expected data") 21 | data = tostring(data) 22 | self.source = source 23 | local t = self:buildTokenizer(data) 24 | self.t = t 25 | 26 | -- default entry point for parsing data sources 27 | local parseError 28 | local result = table.pack(xpcall(function() 29 | t:start() 30 | self.tree = self:parseTree() 31 | end, function(err) 32 | -- throw an object if it's an error parsing the code 33 | if type(err) == 'table' then 34 | --DEBUG(@5):print('got parse error:', require'ext.tolua'(err)) 35 | --DEBUG(@5):print(debug.traceback()) 36 | parseError = err 37 | return 38 | else 39 | return err..'\n' 40 | ..self.t:getpos()..'\n' 41 | ..debug.traceback() 42 | end 43 | end)) 44 | if not result[1] then 45 | if not parseError then error(result[2]) end -- internal error 46 | return false, self.t:getpos()..': '..tostring(parseError.msg) -- parsed code error 47 | end 48 | 49 | -- 50 | -- now that we have the tree, build parents 51 | -- ... since I don't do that during construction ... 
52 | if self.ast 53 | and self.ast.refreshparents 54 | then 55 | self.ast.refreshparents(self.tree) 56 | end 57 | 58 | if self.t.token then 59 | return false, self.t:getpos()..": expected eof, found "..self.t.token 60 | end 61 | return true 62 | end 63 | 64 | -- TODO I don't need all these, just :getloc() 65 | function Parser:getloc() 66 | local loc = self.t:getloc() 67 | loc.source = self.source 68 | return loc 69 | end 70 | 71 | function Parser:canbe(token, tokentype) -- token is optional 72 | assert(tokentype) 73 | if (not token or token == self.t.token) 74 | and tokentype == self.t.tokentype 75 | then 76 | self.lasttoken, self.lasttokentype = self.t.token, self.t.tokentype 77 | self.t:consume() 78 | return self.lasttoken, self.lasttokentype 79 | end 80 | end 81 | 82 | function Parser:mustbe(token, tokentype, opentoken, openloc) 83 | local lasttoken, lasttokentype = self.t.token, self.t.tokentype 84 | self.lasttoken, self.lasttokentype = self:canbe(token, tokentype) 85 | if not self.lasttoken then 86 | local msg = "expected token="..tolua(token).." tokentype="..tolua(tokentype) 87 | .." but found token="..tolua(lasttoken).." type="..tolua(lasttokentype) 88 | if opentoken then 89 | msg = msg .. " to close "..tolua(opentoken).." at line="..openloc.line..' col='..openloc.col 90 | end 91 | error{msg=msg} 92 | end 93 | return self.lasttoken, self.lasttokentype 94 | end 95 | 96 | -- make new ast node, assign it back to the parser (so it can tell what version / keywords / etc are being used) 97 | function Parser:node(index, ...) 98 | --DEBUG(@5):print('Parser:node', index, ...) 99 | local node = self.ast[index](...) 100 | node.parser = self 101 | return node 102 | end 103 | 104 | -- used with parse_expr_precedenceTable 105 | function Parser:getNextRule(rules) 106 | for _, rule in pairs(rules) do 107 | -- TODO why even bother separate it in canbe() ? 
108 | local keywordOrSymbol = rule.token:match'^[_a-zA-Z][_a-zA-Z0-9]*$' and 'keyword' or 'symbol' 109 | if self:canbe(rule.token, keywordOrSymbol) then 110 | return rule 111 | end 112 | end 113 | end 114 | 115 | -- a useful tool for specifying lots of precedence level rules 116 | -- used with self.parseExprPrecedenceRulesAndClassNames 117 | -- example in parser/lua/parser.lua 118 | function Parser:parse_expr_precedenceTable(i) 119 | --DEBUG(@5):print('Parser:parse_expr_precedenceTable', i, 'of', #self.parseExprPrecedenceRulesAndClassNames, 'token=', self.t.token) 120 | local precedenceLevel = self.parseExprPrecedenceRulesAndClassNames[i] 121 | if precedenceLevel.unaryLHS then 122 | local from = self:getloc() 123 | local rule = self:getNextRule(precedenceLevel.rules) 124 | if rule then 125 | local nextLevel = i 126 | if rule.nextLevel then 127 | nextLevel = self.parseExprPrecedenceRulesAndClassNames:find(nil, function(level) 128 | return level.name == rule.nextLevel 129 | end) or error{msg="couldn't find precedence level named "..tostring(rule.nextLevel)} 130 | end 131 | local a = assert(self:parse_expr_precedenceTable(nextLevel), {msg='unexpected symbol'}) 132 | a = self:node(rule.className, a) 133 | if a.span then 134 | a:setspan{from = a.span.from, to = self:getloc()} 135 | end 136 | return a 137 | end 138 | 139 | if i < #self.parseExprPrecedenceRulesAndClassNames then 140 | return self:parse_expr_precedenceTable(i+1) 141 | else 142 | return self:parse_subexp() 143 | end 144 | else 145 | -- binary operation by default 146 | local a 147 | if i < #self.parseExprPrecedenceRulesAndClassNames then 148 | a = self:parse_expr_precedenceTable(i+1) 149 | else 150 | a = self:parse_subexp() 151 | end 152 | if not a then return end 153 | local rule = self:getNextRule(precedenceLevel.rules) 154 | if rule then 155 | local nextLevel = i 156 | if rule.nextLevel then 157 | nextLevel = self.parseExprPrecedenceRulesAndClassNames:find(nil, function(level) 158 | return level.name == 
rule.nextLevel 159 | end) or error{msg="couldn't find precedence level named "..tostring(rule.nextLevel)} 160 | end 161 | a = self:node(rule.className, a, (assert(self:parse_expr_precedenceTable(nextLevel), {msg='unexpected symbol'}))) 162 | if a.span then 163 | a:setspan{from = a.span.from, to = self:getloc()} 164 | end 165 | end 166 | return a 167 | end 168 | end 169 | 170 | 171 | 172 | return Parser 173 | -------------------------------------------------------------------------------- /base/tokenizer.lua: -------------------------------------------------------------------------------- 1 | local table = require 'ext.table' 2 | local string = require 'ext.string' 3 | local class = require 'ext.class' 4 | local assert = require 'ext.assert' 5 | local DataReader = require 'parser.base.datareader' 6 | 7 | local Tokenizer = class() 8 | 9 | function Tokenizer:initSymbolsAndKeywords(...) 10 | end 11 | 12 | function Tokenizer:init(data, ...) 13 | -- TODO move what this does to just the subclass initialization 14 | self.symbols = table(self.symbols) 15 | self.keywords = table(self.keywords):setmetatable(nil) 16 | self:initSymbolsAndKeywords(...) 17 | 18 | self.r = DataReader(data) 19 | self.gettokenthread = coroutine.create(function() 20 | local r = self.r 21 | 22 | while not r:done() do 23 | self:skipWhiteSpaces() 24 | if r:done() then break end 25 | 26 | if self:parseComment() then 27 | elseif self:parseString() then 28 | elseif self:parseName() then 29 | elseif self:parseNumber() then 30 | elseif self:parseSymbol() then 31 | else 32 | error{msg="unknown token "..r.data:sub(r.index, r.index+20)..(r.index+20 > #r.data and '...' or '')} 33 | end 34 | end 35 | end) 36 | end 37 | 38 | function Tokenizer:skipWhiteSpaces() 39 | local r = self.r 40 | r:canbe'%s+' 41 | --DEBUG(@5): if r.lasttoken then print('read space ['..(r.index-#r.lasttoken)..','..r.index..']: '..r.lasttoken) end 42 | end 43 | 44 | -- Lua-specific comments (tho changing the comment symbol is easy ...) 
45 | Tokenizer.singleLineComment = string.patescape'--' 46 | function Tokenizer:parseComment() 47 | local r = self.r 48 | 49 | -- TODO try block comments first 50 | if self:parseBlockComment() then return true end 51 | 52 | if r:canbe(self.singleLineComment) then 53 | --DEBUG(@5):local start = r.index - #r.lasttoken 54 | -- read line 55 | if not r:seekpast'\n' then 56 | r:seekpast'$' 57 | end 58 | --DEBUG(@5):local commentstr = r.data:sub(start, r.index-1) 59 | -- TODO how to insert comments into the AST? should they be their own nodes? 60 | -- should all whitespace be its own node, so the original code text can be reconstructed exactly? 61 | --coroutine.yield(commentstr, 'comment') 62 | --DEBUG(@5):print('read comment ['..start..','..(r.index-1)..']:'..commentstr) 63 | return true 64 | end 65 | end 66 | 67 | -- parse a string 68 | function Tokenizer:parseString() 69 | if self:parseQuoteString() then return true end 70 | end 71 | 72 | -- TODO this is a very lua function though it's in parser/base/ and not parser/lua/ ... 73 | -- '' or "" single-line quote-strings with escape-codes 74 | function Tokenizer:parseQuoteString() 75 | local r = self.r 76 | if r:canbe'["\']' then 77 | --DEBUG(@5): print('read quote string ['..(r.index-#r.lasttoken)..','..r.index..']: '..r.lasttoken) 78 | --DEBUG(@5): local start = r.index-#r.lasttoken 79 | local quote = r.lasttoken 80 | local s = table() 81 | while true do 82 | r:seekpast'.' 83 | if r.lasttoken == quote then break end 84 | if r:done() then error{msg="unfinished string"} end 85 | if r.lasttoken == '\\' then 86 | local esc = r:canbe'.' 87 | local escapeCodes = {a='\a', b='\b', f='\f', n='\n', r='\r', t='\t', v='\v', ['\\']='\\', ['"']='"', ["'"]="'", ['0']='\0', ['\r']='\n', ['\n']='\n'} 88 | local escapeCode = escapeCodes[esc] 89 | if escapeCode then 90 | s:insert(escapeCode) 91 | elseif esc == 'x' and self.version >= '5.2' then 92 | esc = r:mustbe'%x' .. 
r:mustbe'%x' 93 | s:insert(string.char(tonumber(esc, 16))) 94 | elseif esc == 'u' and self.version >= '5.3' then 95 | r:mustbe'{' 96 | local code = 0 97 | while true do 98 | local ch = r:canbe'%x' 99 | if not ch then break end 100 | code = code * 16 + tonumber(ch, 16) 101 | end 102 | r:mustbe'}' 103 | 104 | -- hmm, needs bit library or bit operations, which should only be present in version >= 5.3 anyways so ... 105 | local bit = bit32 or require 'bit' 106 | if code < 0x80 then 107 | s:insert(string.char(code)) -- 0xxxxxxx 108 | elseif code < 0x800 then 109 | s:insert( 110 | string.char(bit.bor(0xc0, bit.band(0x1f, bit.rshift(code, 6)))) 111 | .. string.char(bit.bor(0x80, bit.band(0x3f, code))) 112 | ) 113 | elseif code < 0x10000 then 114 | s:insert( 115 | string.char(bit.bor(0xe0, bit.band(0x0f, bit.rshift(code, 12)))) 116 | .. string.char(bit.bor(0x80, bit.band(0x3f, bit.rshift(code, 6)))) 117 | .. string.char(bit.bor(0x80, bit.band(0x3f, code))) 118 | ) 119 | else 120 | s:insert( 121 | string.char(bit.bor(0xf0, bit.band(0x07, bit.rshift(code, 18)))) 122 | .. string.char(bit.bor(0x80, bit.band(0x3f, bit.rshift(code, 12)))) 123 | .. string.char(bit.bor(0x80, bit.band(0x3f, bit.rshift(code, 6)))) 124 | .. string.char(bit.bor(0x80, bit.band(0x3f, code))) 125 | ) 126 | end 127 | elseif esc:match('%d') then 128 | -- can read up to three 129 | if r:canbe'%d' then esc = esc .. r.lasttoken end 130 | if r:canbe'%d' then esc = esc .. 
r.lasttoken end 131 | s:insert(string.char(tonumber(esc))) 132 | else 133 | if self.version >= '5.2' then 134 | -- lua5.1 doesn't care about bad escape codes 135 | error{msg="invalid escape sequence "..esc} 136 | end 137 | end 138 | else 139 | s:insert(r.lasttoken) 140 | end 141 | end 142 | --DEBUG(@5): print('read quote string ['..start..','..(r.index-#r.lasttoken)..']: '..r.data:sub(start, r.index-#r.lasttoken)) 143 | coroutine.yield(s:concat(), 'string') 144 | return true 145 | end 146 | end 147 | 148 | -- C names 149 | function Tokenizer:parseName() 150 | local r = self.r 151 | if r:canbe'[%a_][%w_]*' then -- name 152 | --DEBUG(@5): print('read name ['..(r.index-#r.lasttoken)..', '..r.index..']: '..r.lasttoken) 153 | coroutine.yield(r.lasttoken, self.keywords[r.lasttoken] and 'keyword' or 'name') 154 | return true 155 | end 156 | end 157 | 158 | function Tokenizer:parseNumber() 159 | local r = self.r 160 | if r.data:match('^[%.%d]', r.index) -- if it's a decimal or a number... 161 | and (r.data:match('^%d', r.index) -- then, if it's a number it's good 162 | or r.data:match('^%.%d', r.index)) -- or if it's a decimal then if it has a number following it then it's good ... 
163 | then -- otherwise I want it to continue to the next 'else' 164 | -- lua doesn't consider the - to be a part of the number literal 165 | -- instead, it parses it as a unary - and then possibly optimizes it into the literal during ast optimization 166 | --DEBUG(@5): local start = r.index 167 | if r:canbe'0[xX]' then 168 | self:parseHexNumber() 169 | else 170 | self:parseDecNumber() 171 | end 172 | --DEBUG(@5): print('read number ['..start..', '..r.index..']: '..r.data:sub(start, r.index-1)) 173 | return true 174 | end 175 | end 176 | 177 | function Tokenizer:parseHexNumber() 178 | local r = self.r 179 | local token = r:mustbe('[%da-fA-F]+', 'malformed number') 180 | coroutine.yield('0x'..token, 'number') 181 | end 182 | 183 | function Tokenizer:parseDecNumber() 184 | local r = self.r 185 | local token = r:canbe'[%.%d]+' 186 | assert.le(#token:gsub('[^%.]',''), 1, 'malformed number') 187 | local n = table{token} 188 | if r:canbe'e' then 189 | n:insert(r.lasttoken) 190 | n:insert(r:mustbe('[%+%-]%d+', 'malformed number')) 191 | end 192 | coroutine.yield(n:concat(), 'number') 193 | end 194 | 195 | function Tokenizer:parseSymbol() 196 | local r = self.r 197 | -- see if it matches any symbols 198 | for _,symbol in ipairs(self.symbols) do 199 | if r:canbe(string.patescape(symbol)) then 200 | --DEBUG(@5): print('read symbol ['..(r.index-#r.lasttoken)..','..r.index..']: '..r.lasttoken) 201 | coroutine.yield(r.lasttoken, 'symbol') 202 | return true 203 | end 204 | end 205 | end 206 | 207 | -- separate this in case someone has to modify the tokenizer symbols and keywords before starting 208 | function Tokenizer:start() 209 | -- TODO provide tokenizer the AST namespace and have it build the tokens (and keywords?) 
here automatically 210 | self.symbols = self.symbols:mapi(function(v,k) return true, v end):keys() 211 | -- arrange symbols from largest to smallest 212 | self.symbols:sort(function(a,b) return #a > #b end) 213 | self:consume() 214 | self:consume() 215 | end 216 | 217 | function Tokenizer:consume() 218 | -- [[ TODO store these in an array somewhere, make the history adjustable 219 | -- then in all the get[prev][2]loc's just pass an index for how far back to search 220 | self.prev2index = self.previndex 221 | self.prev2tokenIndex = self.prevtokenIndex 222 | 223 | self.previndex = self.r.index 224 | self.prevtokenIndex = #self.r.tokenhistory+1 225 | --]] 226 | 227 | self.token = self.nexttoken 228 | self.tokentype = self.nexttokentype 229 | if coroutine.status(self.gettokenthread) == 'dead' then 230 | self.nexttoken = nil 231 | self.nexttokentype = nil 232 | -- done = true 233 | return 234 | end 235 | local status, nexttoken, nexttokentype = coroutine.resume(self.gettokenthread) 236 | -- detect errors 237 | if not status then 238 | local err = nexttoken 239 | --[[ enabling this to forward errors wasn't so foolproof... 240 | if type(err) == 'table' then 241 | --]] 242 | -- then repackage it and include our parser state 243 | error{ 244 | msg = err.msg, 245 | token = self.token, 246 | tokentype = self.tokentype, 247 | pos = self:getpos(), 248 | traceback = debug.traceback(self.gettokenthread), 249 | } 250 | --[[ see above 251 | else 252 | -- internal error - just rethrow 253 | error(err) 254 | end 255 | --]] 256 | end 257 | self.nexttoken = nexttoken 258 | self.nexttokentype = nexttokentype 259 | end 260 | 261 | function Tokenizer:getpos() 262 | return 'line '..self.r.line 263 | ..' col '..self.r.col 264 | ..' 
code "'..self.r.data:sub(self.r.index):match'^[^\n]*'..'"' 265 | end 266 | 267 | -- return the span across 268 | function Tokenizer:getloc() 269 | local r = self.r 270 | local line = self.r.line 271 | local col = self.r.col 272 | 273 | return { 274 | line = line, 275 | col = col, 276 | index = self.prev2index, 277 | tokenIndex = self.prev2tokenIndex, 278 | } 279 | end 280 | 281 | return Tokenizer 282 | -------------------------------------------------------------------------------- /distinfo: -------------------------------------------------------------------------------- 1 | name = "parser" 2 | files = { 3 | ["LICENSE"] = "parser/LICENSE", 4 | ["README.md"] = "parser/README.md", 5 | ["base/ast.lua"] = "parser/base/ast.lua", 6 | ["base/datareader.lua"] = "parser/base/datareader.lua", 7 | ["base/parser.lua"] = "parser/base/parser.lua", 8 | ["base/tokenizer.lua"] = "parser/base/tokenizer.lua", 9 | ["grammar/parser.lua"] = "parser/grammar/parser.lua", 10 | ["grammar/tokenizer.lua"] = "parser/grammar/tokenizer.lua", 11 | ["load_xform.lua"] = "parser/load_xform.lua", 12 | ["lua/ast.lua"] = "parser/lua/ast.lua", 13 | ["lua/parser.lua"] = "parser/lua/parser.lua", 14 | ["lua/tokenizer.lua"] = "parser/lua/tokenizer.lua", 15 | ["parser.rockspec"] = "parser/parser.rockspec", 16 | ["parser.lua"] = "parser/parser.lua", 17 | ["syntax_5.0.txt"] = "parser/syntax_5.0.txt", 18 | ["syntax_5.1.txt"] = "parser/syntax_5.1.txt", 19 | ["syntax_5.2.txt"] = "parser/syntax_5.2.txt", 20 | ["syntax_5.3.txt"] = "parser/syntax_5.3.txt", 21 | ["syntax_5.4.txt"] = "parser/syntax_5.4.txt", 22 | ["syntax_ast_5.1.txt"] = "parser/syntax_ast_5.1.txt", 23 | ["syntax_grammar.txt"] = "parser/syntax_grammar.txt", 24 | ["tests/flatten.lua"] = "parser/tests/flatten.lua", 25 | ["tests/lua_to_c.lua"] = "parser/tests/lua_to_c.lua", 26 | ["tests/lua_to_c_test.lua"] = "parser/tests/lua_to_c_test.lua", 27 | ["tests/parse.lua"] = "parser/tests/parse.lua", 28 | ["tests/parsemyself.lua"] = 
"parser/tests/parsemyself.lua", 29 | ["tests/spantest.lua"] = "parser/tests/spantest.lua", 30 | ["tests/strings.lua"] = "parser/tests/strings.lua", 31 | ["tests/validate-key.txt"] = "parser/tests/validate-key.txt", 32 | ["tests/validate.lua"] = "parser/tests/validate.lua", 33 | } 34 | deps = { 35 | "bit", 36 | "ext", 37 | "template", 38 | } 39 | -------------------------------------------------------------------------------- /grammar/parser.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | building a parser from a grammar ... 3 | grammar syntax: 4 | 5 | ::= is used to define an AST node with the name `name` 6 | block ::= chunk 7 | 8 | | means multiple optional rules 9 | binop ::= `+` | `-` 10 | 11 | {} means zero-or-more multiple optional rules 12 | 13 | [] means a single optional rule 14 | 15 | '' means a keyword / symbol ... notice keywords are alphabetic only and symbols are non-alphabetic only. The distinction is to enforce that keywords cannot neighbor one another while symbols can, and though keywords are legal variable names (while symbols are not), variables must be checked to ensure they are not keywords. not sure how I'll mix these ... 16 | 17 | ; means end-of-expression-list 18 | I was debating a few ways to distingish rule ends. Options could be: 19 | - Wrap in { } 20 | - Use ; as a terminator 21 | - Prefix rules with "def" or something, because the end of an expression-list is either a | or a new rule. 22 | 23 | Grammar implementation: 24 | 1) scan all rules for all literal strings/keywords. sort them all by size, largest-to-smallest. 25 | 2) need to explicitly define some axiom rules. 
26 | For Lua: Name, Numeral, LiteralString 27 | 28 | --]] 29 | local path = require 'ext.path' 30 | local table = require 'ext.table' 31 | local class = require 'ext.class' 32 | local assert = require 'ext.assert' 33 | local tolua = require 'ext.tolua' 34 | local template = require 'template' 35 | local GrammarTokenizer = require 'parser.grammar.tokenizer' 36 | local Parser = require 'parser.base.parser' 37 | 38 | local function tab(s) 39 | return s:gsub('\n', '\n\t') 40 | end 41 | 42 | 43 | -- all grammar ast classes, key'd by rule-name 44 | local ast = {} 45 | 46 | -- hmm ... move this to ASTNode root eventually 47 | ast.refreshparents = require 'parser.lua.ast'.refreshparents 48 | 49 | -- TODO for these rules (and for the rules that GrammarParser code-generates) 50 | -- I might as well create AST objects and override their :getcode() instead of making all the if/else conditions in GrammarParser:getcode() 51 | 52 | local ASTNode = require 'parser.base.ast' 53 | 54 | local GrammarASTNode = ASTNode:subclass() 55 | ast._node = GrammarASTNode 56 | 57 | function GrammarASTNode:init(...) 58 | for i=1,select('#', ...) do 59 | self[i] = select(i, ...) 60 | end 61 | end 62 | 63 | GrammarASTNode.insert = table.insert 64 | GrammarASTNode.append = table.append 65 | 66 | function GrammarASTNode:getcode(parser) 67 | error("need to handle grammar type "..tolua(self.type).." "..tolua(self)) 68 | end 69 | 70 | local function nodeclass(type) 71 | local cl = GrammarASTNode:subclass() 72 | cl.type = type 73 | ast['_'..type] = cl 74 | return cl 75 | end 76 | 77 | local _rule = nodeclass'rule' 78 | --[[ how to alias? hmm, don't do this or :isa won't work ... 
79 | _rule.__index = function(self, k) 80 | if k == 'name' then return self[1] end 81 | if k == 'expr' then return self[2] end 82 | --return _rule.super.__index(self, k) 83 | return _rule.super.__index[k] 84 | end 85 | --]] 86 | function _rule:name() return self[1] end 87 | function _rule:expr() return self[2] end 88 | 89 | -- :getcode(parser) will generate the code for inserting into the current created node, 90 | -- the current created node is assumed to be named 'result' 91 | 92 | local _or = nodeclass'or' 93 | function _or:getcode(parser) 94 | return template([[ 95 | -- or 96 | -- TODO push rewind point here? 97 | repeat 98 | 100 | local oldlen = #result 101 | do 102 | 103 | end 104 | if #result == oldlen then 105 | -- nothing was added? then break. 106 | -- TODO rewind token here? 107 | break 108 | end 109 | until true]], 111 | { 112 | node = self, 113 | parser = parser, 114 | tab = tab, 115 | }) 116 | end 117 | 118 | local _optional = nodeclass'optional' 119 | -- optional is only different in that, after the optional code, we don't need any assert()'s / mustbe()'s 120 | function _optional:getcode(parser) 121 | assert.len(self, 1) 122 | return self[1]:getcode(parser) 123 | end 124 | 125 | local _multiple = nodeclass'multiple' 126 | function _multiple:getcode(parser) 127 | return template([[ 128 | -- multiple 129 | repeat 130 | 137 | local oldlen = #result 138 | do 139 | 140 | end 141 | if #result == oldlen then 142 | -- didn't get anything 143 | -- TODO a token rewind maybe? 144 | break 145 | end 146 | until false]], 149 | { 150 | node = self, 151 | parser = parser, 152 | ast = ast, 153 | tab = tab, 154 | }) 155 | end 156 | 157 | -- expr just encapsulates multiple child exprs? hmm seems it does close to nothing. 
158 | local _expr = nodeclass'expr' 159 | function _expr:getcode(parser) 160 | if #self == 1 then return self[1]:getcode(parser) end 161 | return template([[ 162 | -- expr 163 | 171 | ]], 173 | { 174 | node = self, 175 | parser = parser, 176 | ast = ast, 177 | tab = tab, 178 | }) 179 | end 180 | 181 | local _capture = nodeclass'capture' 182 | function _capture:getcode(parser) 183 | return '-- capture' 184 | end 185 | 186 | local _name = nodeclass'name' 187 | function _name:getcode(parser) 188 | assert.len(self, 1) 189 | local name = assert.type(self[1], 'string') 190 | assert.index(parser.ruleForName, name) 191 | return 'result:insert(self:parse_'..name..'())' 192 | end 193 | 194 | local _string = nodeclass'string' 195 | function _string:getcode(parser, canbe) 196 | assert.len(self, 1) 197 | local s = assert.type(self[1], 'string') 198 | -- keyword / symbol 199 | -- TODO this should be 'mustbe' unless its parent is 'optional' or 'multiple' ... 200 | -- or maybe don't make that change here, but make it in the parent node that generates this code ... 201 | local canmust = canbe and 'canbe' or 'mustbe' 202 | if parser.langKeywords[s] then 203 | return "self:"..canmust.."('"..s.."', 'keyword')" 204 | elseif parser.langSymbols[s] then 205 | return "self:"..canmust.."('"..s.."', 'symbol')" 206 | else 207 | error("found a string that wasn't a keyword or a symbol: "..tolua(s)) 208 | end 209 | end 210 | 211 | --[[ hmm why does this get errors about {"stat"} ... 
212 | _name.__index = function(self, k) 213 | if k == 'value' then return self[1] end 214 | return _name.super.__index[k] 215 | end 216 | 217 | _number.__index = function(self, k) 218 | if k == 'value' then return self[1] end 219 | return _number.super.__index[k] 220 | end 221 | 222 | _string.__index = function(self, k) 223 | if k == 'value' then return self[1] end 224 | return _string.super.__index[k] 225 | end 226 | --]] 227 | 228 | function _name:value() return self[1] end 229 | function _string:value() return self[1] end 230 | --function _number:value() return self[1] end 231 | 232 | 233 | local GrammarParser = Parser:subclass() 234 | GrammarParser.ast = ast 235 | 236 | -- static method, call with : 237 | function GrammarParser:fromFile(fn) 238 | return self(assert(path(fn):read()), fn) 239 | end 240 | 241 | function GrammarParser:buildTokenizer(data) 242 | return GrammarTokenizer(data) 243 | end 244 | 245 | function GrammarParser:setData(data, source, ...) 246 | GrammarParser.super.setData(self, data, source, ...) 247 | 248 | -- now we should have our self.tree 249 | -- from here we can convert it into a parse structure 250 | -- our first rule will be the start, i.e. 
:parseTree() 251 | -- subsequent rules become member functions 252 | 253 | self.ruleForName = {} 254 | -- builtin rules 255 | self.ruleForName.Name = true 256 | self.ruleForName.LiteralString = true 257 | self.ruleForName.Numeral = true 258 | for _,rule in ipairs(self.tree) do 259 | assert.len(rule, 2) 260 | self.ruleForName[rule:name()] = rule 261 | end 262 | 263 | -- while we're here, traverse all rules and pick out all symbols and keywords 264 | self.langKeywords = {} 265 | self.langSymbols = {} 266 | local function process(node) 267 | if ast._name:isa(node) then 268 | assert.len(node, 1) 269 | -- names in the grammar should always point to either other rules, or to builtin axiomatic rules (Name, Numeric, LiteralString) 270 | local name = assert.type(node:value(), 'string') 271 | local rule = self.ruleForName[name] 272 | if not rule then 273 | error("rule referenced but not defined: "..tolua(name)) 274 | end 275 | -- TODO replace the element in the table with the AST? that'd remove the DAG property of the AST. no more pretty `tolua()` output. 276 | elseif ast._string:isa(node) then 277 | assert.len(node, 1) 278 | local s = assert.type(node:value(), 'string') 279 | 280 | -- keywords vs symbols are parsed separately 281 | -- keywords must be space-separated, and for now are only letters -- no symbol characters used (allowed?) 
282 | -- symbols don't have to be space-separated and for now cannot be letters 283 | if s:find'%A' then 284 | assert(not s:find'%a') 285 | self.langSymbols[s] = true 286 | else 287 | self.langKeywords[s] = true 288 | end 289 | end 290 | 291 | for i,child in ipairs(node) do 292 | if type(child) == 'table' then 293 | process(child) 294 | end 295 | end 296 | end 297 | for _,rule in ipairs(self.tree) do 298 | process(rule) 299 | end 300 | 301 | local validTokenTypes = { 302 | start = true, 303 | ['end'] = true, 304 | keyword = true, -- word, unquoted, reserved token 305 | name = true, -- word, unquoted, not-reserved 306 | symbol = true, -- non-alphanumeric token 307 | number = true, -- number 308 | string = true, 309 | } 310 | 311 | local function tokenAndTypeToStr(tokenPair) 312 | return table.concat(tokenPair, ':') 313 | end 314 | 315 | --[[ construct DAG ... 316 | assert.is(self.tree[1], ast._rule) 317 | for _,nextTokenPair in ipairs(addRule({'start'}, self.tree[1])) do 318 | addEdge(nextTokenPair, {'end'}) 319 | end 320 | --]] 321 | -- [[ 322 | local function combine(...) 323 | return table():append(...):mapi(function(v) 324 | return true, v 325 | end):keys() 326 | end 327 | local function addEdges(edges, froms, tos) 328 | for _,from in ipairs(froms) do 329 | edges[from] = edges[from] or {} 330 | for _,to in ipairs(tos) do 331 | print('adding edge', from, to) 332 | edges[from][to] = true 333 | end 334 | end 335 | end 336 | local addFromsToRule 337 | local function addFromsToNode(edges, froms, node) 338 | assert.type(froms, 'table') 339 | --print('addFromsToNode', require 'ext.tolua'(froms), node.type) 340 | if ast._expr:isa(node) then 341 | -- "expr" is really "list" or "container of other nodes" 342 | for _,ch in ipairs(node) do 343 | assert.is(ch, ast._node) 344 | froms = addFromsToNode(edges, froms, ch) 345 | end 346 | return froms 347 | elseif ast._multiple:isa(node) then 348 | --[[ 349 | multiple means ... 
350 | froms -> start(node) 351 | end(node) -> start(node) 352 | end(node) -> tos 353 | --]] 354 | assert.len(node, 1) 355 | local mult = node[1] 356 | local firstfroms = froms 357 | froms = addFromsToNode(edges, froms, mult) 358 | addFromsToNode(edges, froms, mult) -- from end to beginning ... output should match 'froms' 359 | return combine(firstfroms, froms) -- combine for when there's 0 360 | elseif ast._optional:isa(node) then 361 | --[[ 362 | froms -> optional 363 | optional -> tos 364 | froms -> tos 365 | ... same as multiple without the loop back 366 | --]] 367 | assert.len(node, 1) 368 | local opt = node[1] 369 | local firstfroms = froms 370 | froms = addFromsToNode(edges, froms, opt) 371 | return combine(firstfroms, froms) -- combine for when we skip it 372 | elseif ast._or:isa(node) then 373 | --[[ 374 | froms -> start of each child of node 375 | end of each child of node -> tos 376 | --]] 377 | local tos = table() 378 | for _,ch in ipairs(node) do 379 | tos = combine(tos, addFromsToNode(edges, froms, ch)) 380 | end 381 | return tos 382 | elseif ast._name:isa(node) then 383 | -- name is a rule ... or a builtin rule 384 | local ruleName = node:value() 385 | if ruleName == 'LiteralString' 386 | or ruleName == 'Numeral' 387 | or ruleName == 'Name' 388 | then 389 | local tos = {ruleName} 390 | addEdges(edges, froms, tos) 391 | return tos 392 | else 393 | local tos = {'rule:'..ruleName} 394 | addEdges(edges, froms, tos) 395 | return tos 396 | end 397 | elseif ast._string:isa(node) then 398 | -- string == literal 399 | local to = assert.type(node:value(), 'string') 400 | -- TODO why even bother separate it in canbe() ? 401 | local keywordOrSymbol = to:match'^[_a-zA-Z][_a-zA-Z0-9]*$' and 'keyword' or 'symbol' 402 | local tos = {keywordOrSymbol..':'..to} 403 | addEdges(edges, froms, tos) 404 | return tos 405 | elseif ast._capture:isa(node) then 406 | assert.len(node, 1) 407 | -- TODO where to tell the digraph that we are capturing something ... 
408 | return addFromsToNode(edges, froms, node[1]) 409 | end 410 | error('here with type '..tostring(node.type)) 411 | end 412 | function addFromsToRule(edges, froms, rule) 413 | assert.is(rule, ast._rule) 414 | print() 415 | print('adding rule', rule:name()) 416 | return addFromsToNode(edges, froms, rule:expr()) 417 | end 418 | 419 | local edges = {} 420 | 421 | --[[ 422 | each rule gets its own edges[][] digraph 423 | whose start node is 'start' and end node is 'end' 424 | --]] 425 | for i,rule in ipairs(self.tree) do 426 | assert.is(rule, ast._rule) 427 | local froms = addFromsToRule(edges, {'start:'..rule:name(), i==1 and 'start' or nil}, rule) 428 | addEdges(edges, froms, {'end:'..rule:name(), i==1 and 'end' or nil}) 429 | end 430 | 431 | print() 432 | print'before collapse:' 433 | for from,tos in pairs(edges) do 434 | for to,v in pairs(tos) do 435 | print(from..' -> '..to) 436 | end 437 | end 438 | 439 | -- now collapse the rule parts of the graph ... 440 | for from,tos in pairs(edges) do 441 | for _,to in ipairs(table.keys(tos)) do 442 | 443 | -- send end:* to wherever rule:* goes 444 | -- mind you if nobody uses a rule then its end goes nowhere right? 445 | -- and give the DAG a value at this point to tell it to create this rule 446 | local ruleName = to:match'^end:(.*)$' 447 | if ruleName then 448 | edges[from][to] = nil 449 | for newto,v in pairs(assert.index(edges, 'rule:'..ruleName)) do 450 | -- now upon finishing a rule ... we're going to want it to pop out that node, right? 
451 | edges[from][newto] = ruleName 452 | end 453 | end 454 | end 455 | end 456 | local stillMoreToCollapse 457 | repeat 458 | stillMoreToCollapse = false 459 | local rulesStillUsed = {} 460 | for from,tos in pairs(edges) do 461 | for _,to in ipairs(table.keys(tos)) do 462 | -- if it goes to rule:* then send it to start:* 463 | local ruleName = to:match'^rule:(.*)$' 464 | if ruleName then 465 | rulesStillUsed[ruleName] = true 466 | -- TODO if a rule: points to a start: points to a rule: then we could be erasing its dest here ... 467 | -- so only erase a rule IF you know it's not used anymore ... 468 | --edges[from][to] = nil 469 | for newto,v in pairs(assert.index(edges, 'start:'..ruleName)) do 470 | -- what about rule: that points to rule: ? 471 | local ruleName2 = newto:match'^rule:(.*)$' 472 | if ruleName2 and ruleName2 ~= ruleName then 473 | -- still getting some circlees ... 474 | print(to.." goes to "..newto) 475 | stillMoreToCollapse = true 476 | end 477 | edges[from][newto] = true 478 | end 479 | end 480 | end 481 | end 482 | for from,tos in pairs(edges) do 483 | for _,to in ipairs(table.keys(tos)) do 484 | local ruleName = to:match'^rule:(.*)$' 485 | if ruleName 486 | --and not rulesStillUsed[ruleName] 487 | then 488 | edges[from][to] = nil 489 | end 490 | end 491 | end 492 | until not stillMoreToCollapse 493 | 494 | -- ... and then remove the rule starts and ends 495 | for _,from in ipairs(table.keys(edges)) do 496 | local ruleName = from:match'^start:(.*)$' or from:match'^end:(.*)$' or from:match'^rule:(.*)$' 497 | if ruleName then 498 | edges[from] = nil 499 | end 500 | end 501 | --]] 502 | 503 | --[[ list every edge 504 | for _,from in ipairs(table.keys(edges)) do 505 | local tos = edges[from] 506 | for _,to in ipairs(table.keys(tos)) do 507 | local v = tos[to] 508 | print(from..' 
-> '..to..(v ~= true and ' ['..tostring(v)..']' or '')) 509 | end 510 | end 511 | --]] 512 | -- [[ generate code 513 | 514 | local rootASTClassName = 'LuaASTNode' 515 | local tokenizerClassName = 'LuaTokenizer' 516 | local parserClassName = 'LuaParser' 517 | 518 | print(template([=[ 519 | -- generated by 'parser.grammar' using file "" 520 | local table = require 'ext.table' 521 | local ASTNode = require 'parser.base.ast' 522 | local Tokenizer = require 'parser.base.tokenizer' 523 | 524 | local ast = {} 525 | 526 | local = ASTNode:subclass() 527 | 528 | local function nodeclass(args, parent) 529 | parent = parent or 530 | local cl = parent:subclass(args) 531 | ast['_'..cl.type] = cl 532 | return cl 533 | end 534 | 535 | local _ = nodeclass{type=} 537 | 539 | 540 | local = Tokenizer:subclass() 541 | 542 | local edges = 543 | 544 | .symbols = table() 545 | .keywords = 546 | 547 | function :buildTokenizer(data) 548 | return (data) 549 | end 550 | 551 | function :parseTree() 552 | return :parse_() 553 | end 554 | ]=], { 555 | -- requires above 556 | table = table, 557 | tolua = tolua, 558 | -- self 559 | self = self, 560 | -- locals 561 | edges = edges, 562 | tab = tab, 563 | source = source, 564 | rules = self.tree, 565 | rootASTClassName = rootASTClassName, 566 | tokenizerClassName = tokenizerClassName, 567 | parserClassName = parserClassName, 568 | })) 569 | 570 | --]] 571 | end 572 | 573 | function GrammarParser:parseTree() 574 | rules = table() 575 | repeat 576 | if not self.t.token then break end -- nothing left = done 577 | 578 | local rule = self:parseRule() 579 | if not rule then break end 580 | 581 | self:canbe(';', 'symbol') 582 | assert.is(rule, ast._rule) 583 | rules:insert(rule) 584 | until false 585 | return rules 586 | end 587 | 588 | function GrammarParser:parseRule() 589 | -- can-be + capture + assign 'name' 590 | local name = self:mustbe(nil, 'name') 591 | 592 | -- must-be + ignore ... do we ever want to capture a must-be? maybe? 
593 | self:mustbe('::=', 'symbol') 594 | 595 | -- TODO i'm overusing and improperly using the term 'expr' 596 | -- can-be + capture + assign 'expr' 597 | local expr = self:parseExprOr() 598 | 599 | return ast._rule(name, expr) 600 | end 601 | 602 | function GrammarParser:parseExprOr() 603 | local expr = self:parseExprList() 604 | local orexpr 605 | 606 | if self:canbe('|', 'symbol') then 607 | local expr2 = self:parseExprOr() 608 | if not orexpr then 609 | orexpr = ast._or(expr) 610 | expr = orexpr 611 | end 612 | if ast._or:isa(expr2) then 613 | -- merge or's 614 | for i,child in ipairs(expr2) do 615 | orexpr:insert(child) 616 | end 617 | else 618 | orexpr:insert(expr2) 619 | end 620 | end 621 | return expr 622 | end 623 | 624 | function GrammarParser:parseExprList() 625 | local expr = ast._expr() 626 | repeat 627 | if self:canbe('{', 'symbol') then 628 | local expr2 = self:parseExprOr() 629 | --assert(not ast._multiple:isa(expr2)) -- no {{ }} allowed, just { } 630 | self:mustbe('}', 'symbol') 631 | expr:insert(ast._multiple(expr2)) 632 | elseif self:canbe('[', 'symbol') then 633 | local expr2 = self:parseExprOr() 634 | self:mustbe(']', 'symbol') 635 | expr:insert(ast._optional(expr2)) 636 | elseif self:canbe('(', 'symbol') then 637 | local expr2 = self:parseExprOr() 638 | self:mustbe(')', 'symbol') 639 | expr:insert(ast._capture(expr2)) 640 | elseif self:canbe(nil, 'name') then 641 | expr:insert(ast._name(self.lasttoken)) 642 | elseif self:canbe(nil, 'number') then 643 | expr:insert(ast._number(self.lasttoken)) 644 | elseif self:canbe(nil, 'string') then 645 | expr:insert(ast._string(self.lasttoken)) 646 | else 647 | break 648 | end 649 | until false 650 | -- unwrap 651 | while #expr == 1 and ast._expr:isa(expr) do 652 | expr = expr[1] 653 | end 654 | return expr 655 | end 656 | 657 | -- [[ test: 658 | local syntax51 = GrammarParser:fromFile'syntax_ast_5.1.txt' 659 | --]] 660 | 661 | return GrammarParser 662 | 
--------------------------------------------------------------------------------
/grammar/tokenizer.lua:
local Tokenizer = require 'parser.base.tokenizer'

-- Tokenizer subclass for reading the grammar-definition syntax itself
-- (the `rule ::= expr | expr ; ...` language consumed by grammar/parser.lua).
local GrammarTokenizer = Tokenizer:subclass()

-- Register the grammar language's punctuation symbols.
-- No keywords are registered: every word token is left as a plain 'name'
-- for the grammar parser to interpret as a rule reference.
function GrammarTokenizer:initSymbolsAndKeywords()
	for w in ([[ ::= | ; { } [ ] ( ) ]]):gmatch('%S+') do
		self.symbols:insert(w)
	end
end

return GrammarTokenizer
--------------------------------------------------------------------------------
/load_xform.lua:
-- parser.load_xform uses ext.load to modify the load(), loadfile() and require() functions
--DEBUG: local showcode = require 'template.showcode'
local LuaParser = require 'parser.lua.parser'

-- Module return value AND the list of user-registered AST-transform callbacks;
-- setting __index=table gives it the table-library methods (insert, remove, ...).
local callbacks = setmetatable({}, {__index=table})

-- ext.load state object whose 'xforms' chain intercepts loaded chunks.
local state = require 'ext.load'()
callbacks.state = state

-- Transform hook: parse the incoming Lua source text 'd' into an AST,
-- pass the AST to every registered callback (callbacks may mutate it in place),
-- then serialize the (possibly modified) AST back to Lua source.
-- Returns the new source string, or nil plus an error message on parse failure.
state.xforms:insert(function(d, source)
	--DEBUG: print()
	--DEBUG: print(debug.traceback())
	--DEBUG: print'!!! BEFORE PARSE !!!'
	--DEBUG: print('parser.load_xform source: '..source)
	--DEBUG: print(showcode(d))
	--DEBUG: print()

	local parser = LuaParser()
	local success, msg = parser:setData(d, source)
	if not success then
		-- prefix the tokenizer's position (line/col) onto the message when available
		if parser.t then
			msg = parser.t:getpos()..': '..msg
		end
		return nil, msg
	end
	local tree = parser.tree
	for _,cb in ipairs(callbacks) do
		cb(tree, source)
	end
	local code = tree:toLua()

	--DEBUG: print()
	--DEBUG: print(debug.traceback())
	--DEBUG: print'!!! AFTER PARSE !!!'
	--DEBUG: print('parser.load_xform source: '..source)
	--DEBUG: print(showcode(code))
	--DEBUG: print()

	return code
end)

return callbacks
--------------------------------------------------------------------------------
/lua/ast.lua:
--[[
parser.base.ast returns the BaseAST root of all AST nodes

TODO ...
... but parser.lua.ast (and maybe soon parser.grammar.ast) return a collection-of-nodes, which are key'd to the token ... hmm ...
maybe for consistency I should have parser.lua.ast return the LuaAST, which is an BaseAST child, and parent of all Lua AST nodes ...
... and give that node a member that holds a key/value map to all nodes per token ...
But using a namespace is definitely convenient, especially with all the member subclasses and methods that go in it (traverse, nodeclass, etc)
... though these can easily turn into member fields and member methods

tempting to replace the 'ast' namespace with just LuaAST itself, and keep the convention that keys beginning with `_` are subclasses...
--]]
local table = require 'ext.table'
local assert = require 'ext.assert'
local tolua = require 'ext.tolua'

local BaseAST = require 'parser.base.ast'


-- namespace table of all Lua AST nodes
-- TODO get rid of parser's dependency on this? or somehow make the relation between parser rules and ast's closer, like generate the AST from the parser-rules?
-- another TODO how about just storing subclasses as `._type` , then the 'ast' usage outside this file can be just exchanged with LuaASTNode itself, and the file can return a class, and lots of things can be simplified
local ast = {}

-- Lua-specific parent class. root of all other ast node classes in this file.
local LuaAST = BaseAST:subclass()

-- assign to 'ast.node' to define it as the Lua ast's parent-most node class
ast.node = LuaAST

--[[
Serialize this subtree to a single string by recursively invoking the method
named 'field' on each node, threading a shared 'consume' sink through the walk.

field = name of the per-node serializer method (e.g. 'toLua_recursive');
	each node's method must emit its pieces via the consume() function it is given.
args:
	maintainSpan = set to true to have the output maintain the input's span
		(pads newlines so output lines don't start before the node's recorded
		source line; assumes node.span has a .from.line field)
returns the concatenated source string.
--]]
local slashNByte = ('\n'):byte()
function LuaAST:serializeRecursiveMember(field, args)
	local maintainSpan
	if args then
		maintainSpan = args.maintainSpan
	end
	local s = ''
	-- :serialize() impl provided by child classes
	-- :serialize() should call traversal in-order of parsing (why I want to make it auto and assoc with the parser and grammar and rule-generated ast node classes)
	-- that means serialize() itself should never call serialize() but only call the consume() function passed into it (for modularity's sake)
	-- it might mean i should capture all nodes too, even those that are fixed, like keywords and symbols, for the sake of reassembling the syntax
	-- running line/col/index counters for the OUTPUT being built (1-based, like the parser's)
	local line = 1
	local col = 1
	local index = 1
	local consume
	local lastspan	-- span of the most recent table (AST node) consumed; drives maintainSpan padding
	consume = function(x)
		if type(x) == 'number' then
			x = tostring(x)
		end
		if type(x) == 'string' then
			-- here's our only string join
			-- append 'u' to the output and advance the line/col/index counters past it
			local function append(u)
				for i=1,#u do
					if u:byte(i) == slashNByte then
						col = 1
						line = line + 1
					else
						col = col + 1
					end
				end
				index = index + #u
				s = s .. u
			end

			-- TODO here if you want ... pad lines and cols until we match the original location (or exceed it)
			-- to do that, track appended strings to have a running line/col counter just like we do in parser
			-- to do that, separate the updatelinecol() in the parser to work outside datareader
			if maintainSpan and lastspan then
				while line < lastspan.from.line do
					append'\n'
				end
			end

			-- if we have a name coming in, only insert a space if we were already at a name
			-- (namelhs/namerhs are the match results — a char or nil — not booleans)
			local namelhs = s:sub(-1):match'[_%w%.]'
			local namerhs = x:sub(1,1):match'[_%w%.]'
			if namelhs and namerhs then
				append' '
			elseif not namelhs and not namerhs then
				-- TODO here for minification if you want
				-- if we have a symbol coming in, only insert a space if we were already at a symbol and the two together would make a different valid symbol
				-- you'll need to search back the # of the max length of any symbol ...
				append' '
			end
			append(x)
		elseif type(x) == 'table' then
			-- an AST node: remember its span, then recurse via its 'field' method
			lastspan = x.span
			assert.is(x, BaseAST)
			assert.index(x, field)
			x[field](x, consume)
		else
			error('here with unknown type '..type(x))
		end
	end
	self[field](self, consume)
	return s
end

-- External API: serialize this subtree back to Lua source text.
function LuaAST:toLua(args)
	return self:serializeRecursiveMember('toLua_recursive', args)
end

-- why distinguish toLua() and serialize(consume)?
-- The need for this design pops up more in subclasses.
-- serialize(consume) is used by all language-serializations
-- toLua_recursive(consume) is for Lua-specific serialization (to-be-subclassed)
-- I'm not sure if this is better than just using a fully separate table of serialization functions per node ...
-- toLua() is the external API
function LuaAST:toLua_recursive(consume)
	return self:serialize(consume)
end

-- ok maybe it's not such a good idea to use tostring and serialization for the same purpose ...
LuaAST.__tostring = string.nametostring

-- Serialize this tree to Lua source and load() it.
-- Extra args are forwarded to load() (chunk name, etc).
-- Returns the compiled function, or nil, the load error message, and the generated code.
function LuaAST:exec(...)
	local code = self:toLua()
	local f, msg = load(code, ...)
	if not f then
		return nil, msg, code
	end
	return f
end


-- TODO what's a more flexible way of iterating through all child fields?
-- and what's a more flexible way of constructing AST node subclasses and specifying their fields,
-- especially with grammar rule construction?
-- Each entry is {fieldName, kind}:
--	'one'   = the field holds a single child node
--	'many'  = the field holds an array of child nodes
--	'field' = the field holds a plain (non-node) value: copied by copy(), skipped by traversal
local fields = {
	{'name', 'field'},
	{'index', 'field'},
	{'value', 'field'},
	-- _var attrib (<const> / <close>); was missing from this list, so LuaAST.copy silently dropped it
	{'attrib', 'field'},
	{'cond', 'one'},
	{'var', 'one'},
	{'min', 'one'},
	{'max', 'one'},
	{'step', 'one'},
	{'func', 'one'},	-- should this be a _function, or a string depicting a function?
	{'arg', 'one'},
	{'key', 'one'},
	{'expr', 'one'},
	{'stmt', 'one'},
	{'args', 'many'},
	{'exprs', 'many'},
	-- _forin iterator expressions; was missing from this list, so copy() lost them and traversal skipped them
	{'iterexprs', 'many'},
	{'elseifs', 'many'},
	{'elsestmt', 'many'},	-- NOTE(review): _if stores a single _else node here, not an array -- 'many' walks its child stmts directly; confirm intended
	{'vars', 'many'},
}

ast.exec = LuaAST.exec

--[[
I need to fix this up better to handle short-circuiting, replacing, removing, etc...
parentFirstCallback is the parent-first traversal method
childFirstCallback is the child-first traversal
return what value of the callbacks you want
returning a new node at the parent callback will not traverse its subsequent new children added to the tree
--]]
local function traverseRecurse(
	node,
	parentFirstCallback,
	childFirstCallback,
	parentNode
)
	if not LuaAST:isa(node) then return node end
	if parentFirstCallback then
		local ret = parentFirstCallback(node, parentNode)
		if ret ~= node then
			return ret
		end
	end
	if type(node) == 'table' then
		-- treat the object itself like an array of many
		for i=1,#node do
			node[i] = traverseRecurse(node[i], parentFirstCallback, childFirstCallback, node)
		end
		for _,field in ipairs(fields) do
			local name = field[1]
			local howmuch = field[2]
			if node[name] then
				if howmuch == 'one' then
					node[name] = traverseRecurse(node[name], parentFirstCallback, childFirstCallback, node)
				elseif howmuch == 'many' then
					local value = node[name]
					for i=#value,1,-1 do
						value[i] = traverseRecurse(value[i], parentFirstCallback, childFirstCallback, node)
					end
				elseif howmuch == 'field' then
					-- plain value, not a node: nothing to traverse
				else
					error("unknown howmuch "..howmuch)
				end
			end
		end
	end
	if childFirstCallback then
		node = childFirstCallback(node, parentNode)
	end
	return node
end

-- Re-point each node's .parent at its actual parent in the tree.
function ast.refreshparents(node)
	traverseRecurse(node, function(node, parent)
		node.parent = parent
		return node
	end)
end

-- Traverse, then rebuild parent links (traversal can replace nodes).
local function traverse(node, ...)
	local newnode = traverseRecurse(node, ...)
	ast.refreshparents(newnode)
	return newnode
end

LuaAST.traverse = traverse
ast.traverse = traverse

-- Deep-copy a node: its integer children plus every registered field.
function LuaAST.copy(n)
	local newn = {}
	setmetatable(newn, getmetatable(n))
	for i=1,#n do
		newn[i] = LuaAST.copy(n[i])
	end
	for _,field in ipairs(fields) do
		local name = field[1]
		local howmuch = field[2]
		local value = n[name]
		if value then
			if howmuch == 'one' then
				if type(value) == 'table' then
					newn[name] = LuaAST.copy(value)
				else
					newn[name] = value
				end
			elseif howmuch == 'many' then
				local newmany = {}
				for k,v in ipairs(value) do
					if type(v) == 'table' then
						newmany[k] = LuaAST.copy(v)
					else
						newmany[k] = v
					end
				end
				newn[name] = newmany
			elseif howmuch == 'field' then
				newn[name] = value
			else
				error("unknown howmuch "..howmuch)
			end
		end
	end
	return newn
end
ast.copy = LuaAST.copy

--[[
flatten a function:
for all its calls, insert them as statements inside the function
this is only possible if the called functions are of a specific form...
varmap is the mapping from function names to _function objects to inline in the _call's place


if the nested function ends with a return ...
... then insert its declarations (for var remapping) into a statement just before the one with this call
... and wrap our return contents in parenthesis ...
or make general use of ()'s everywhere (for resolution order)

f stmt
f stmt
f stmt
return something(g(...), h(...))

becomes

f stmt
f stmt
f stmt
local g ret
g stmt
g stmt
g stmt
g ret = previous return value of h
local h ret
h stmt
h stmt
h stmt
h ret = previous return value of h
return something(g ret, h ret)

--]]
function LuaAST.flatten(f, varmap)
	f = LuaAST.copy(f)
	traverseRecurse(f, function(n)
		if type(n) == 'table'
		and ast._call:isa(n)
		then
			local funcname = n.func:toLua()	-- in case it's a var ... ?
			assert(funcname, "can't flatten a function with anonymous calls")
			local f = varmap[funcname]
			if f
			and #f == 1
			and ast._return:isa(f[1])
			then
				local retexprs = {}
				for i,e in ipairs(f[1].exprs) do
					retexprs[i] = LuaAST.copy(e)
					traverseRecurse(retexprs[i], function(v)
						-- _arg is not used by parser - externally used only - I should move flatten somewhere else ...
						if ast._arg:isa(v) then
							-- bugfix: substitute the call argument at the _arg's own
							-- parameter slot (v.index), not at the return-expression
							-- index i -- the two only coincide by accident.
							return LuaAST.copy(n.args[v.index])
						end
					end)
					retexprs[i] = ast._par(retexprs[i])
				end
				return ast._block(table.unpack(retexprs))	-- TODO exprlist, and redo assign to be based on vars and exprs
			end
		end
		return n
	end)
	return f
end
ast.flatten = LuaAST.flatten

-- call consume(x) for each x in t, with optional separator between elements
local function consumeconcat(consume, t, sep)
	for i,x in ipairs(t) do
		consume(x)
		if sep and i < #t then
			consume(sep)
		end
	end
end

local function spacesep(stmts, consume)
	consumeconcat(consume, stmts)
end

local function commasep(exprs, consume)
	consumeconcat(consume, exprs, ',')
end

-- Create a new AST node class and register it as ast._<type>.
local function nodeclass(type, parent, args)
	parent = parent or LuaAST
	local cl = parent:subclass(args)
	cl.type = type
	cl.__name = type
	ast['_'..type] = cl
	return cl
end
ast.nodeclass = nodeclass

-- helper function: is s a valid Lua identifier?
local function isLuaName(s)
	return s:match'^[_%a][_%w]*$'
end

-- True if 'key' can be serialized in name shorthand form (t.key / {key=...}).
function ast.keyIsName(key, parser)
	return ast._string:isa(key)
	-- if key is a string and has no funny chars
	and isLuaName(key.value)
	and (
		-- ... and if we don't have a .parser assigned (as is the case of some dynamic ast manipulation ... *cough* vec-lua *cough* ...)
		not parser
		-- ... or if we do have a parser and this name isn't a keyword in the parser's tokenizer
		or not parser.t.keywords[key.value]
	)
end

-- generic global stmt collection
local _block = nodeclass'block'
function _block:init(...)
	for i=1,select('#', ...) do
		self[i] = select(i, ...)
379 | end 380 | end 381 | function _block:serialize(consume) 382 | spacesep(self, consume) 383 | end 384 | 385 | --statements 386 | 387 | local _stmt = nodeclass'stmt' 388 | 389 | -- TODO 'vars' and 'exprs' should be nodes themselves ... 390 | local _assign = nodeclass('assign', _stmt) 391 | function _assign:init(vars, exprs) 392 | self.vars = table(vars) 393 | self.exprs = table(exprs) 394 | end 395 | function _assign:serialize(consume) 396 | commasep(self.vars, consume) 397 | consume'=' 398 | commasep(self.exprs, consume) 399 | end 400 | 401 | -- should we impose construction constraints _do(_block(...)) 402 | -- or should we infer? _do(...) = {type = 'do', block = {type = 'block, ...}} 403 | -- or should we do neither? _do(...) = {type = 'do', ...} 404 | -- neither for now 405 | -- but that means _do and _block are identical ... 406 | local _do = nodeclass('do', _stmt) 407 | function _do:init(...) 408 | for i=1,select('#', ...) do 409 | self[i] = select(i, ...) 410 | end 411 | end 412 | function _do:serialize(consume) 413 | consume'do' 414 | spacesep(self, consume) 415 | consume'end' 416 | end 417 | 418 | local _while = nodeclass('while', _stmt) 419 | -- TODO just make self[1] into the cond ... 420 | function _while:init(cond, ...) 421 | self.cond = cond 422 | for i=1,select('#', ...) do 423 | self[i] = select(i, ...) 424 | end 425 | end 426 | function _while:serialize(consume) 427 | consume'while' 428 | consume(self.cond) 429 | consume'do' 430 | spacesep(self, consume) 431 | consume'end' 432 | end 433 | 434 | local _repeat = nodeclass('repeat', _stmt) 435 | -- TODO just make self[1] into the cond ... 436 | function _repeat:init(cond, ...) 437 | self.cond = cond 438 | for i=1,select('#', ...) do 439 | self[i] = select(i, ...) 
440 | end 441 | end 442 | function _repeat:serialize(consume) 443 | consume'repeat' 444 | spacesep(self, consume) 445 | consume'until' 446 | consume(self.cond) 447 | end 448 | 449 | --[[ 450 | _if(_eq(a,b), 451 | _assign({a},{2}), 452 | _elseif(...), 453 | _elseif(...), 454 | _else(...)) 455 | --]] 456 | -- weird one, idk how to reformat 457 | local _if = nodeclass('if', _stmt) 458 | -- TODO maybe just assert the node types and store them as-is in self[i] 459 | function _if:init(cond,...) 460 | local elseifs = table() 461 | local elsestmt, laststmt 462 | for i=1,select('#', ...) do 463 | local stmt = select(i, ...) 464 | if ast._elseif:isa(stmt) then 465 | elseifs:insert(stmt) 466 | elseif ast._else:isa(stmt) then 467 | assert(not elsestmt) 468 | elsestmt = stmt -- and remove 469 | else 470 | if laststmt then 471 | assert(laststmt.type ~= 'elseif' and laststmt.type ~= 'else', "got a bad stmt in an if after an else: "..laststmt.type) 472 | end 473 | table.insert(self, stmt) 474 | end 475 | laststmt = stmt 476 | end 477 | self.cond = cond 478 | self.elseifs = elseifs 479 | self.elsestmt = elsestmt 480 | end 481 | function _if:serialize(consume) 482 | consume'if' 483 | consume(self.cond) 484 | consume'then' 485 | spacesep(self, consume) 486 | for _,ei in ipairs(self.elseifs) do 487 | consume(ei) 488 | end 489 | if self.elsestmt then 490 | consume(self.elsestmt) 491 | end 492 | consume'end' 493 | end 494 | 495 | -- aux for _if 496 | local _elseif = nodeclass('elseif', _stmt) 497 | -- TODO just make self[1] into the cond ... 498 | function _elseif:init(cond,...) 499 | self.cond = cond 500 | for i=1,select('#', ...) do 501 | self[i] = select(i, ...) 502 | end 503 | end 504 | function _elseif:serialize(consume) 505 | consume'elseif' 506 | consume(self.cond) 507 | consume'then' 508 | spacesep(self, consume) 509 | end 510 | 511 | -- aux for _if 512 | local _else = nodeclass('else', _stmt) 513 | function _else:init(...) 514 | for i=1,select('#', ...) 
do 515 | self[i] = select(i, ...) 516 | end 517 | end 518 | function _else:serialize(consume) 519 | consume'else' 520 | spacesep(self, consume) 521 | end 522 | 523 | local _foreq = nodeclass('foreq', _stmt) 524 | -- step is optional 525 | -- TODO just make self[1..4] into the var, min, max, step ... 526 | -- ... this means we can possibly have a nil child mid-sequence ... 527 | -- .. hmm ... 528 | -- ... which is better: 529 | -- *) requiring table.max for integer iteration instead of ipairs 530 | -- *) or using fields instead of integer indexes? 531 | function _foreq:init(var,min,max,step,...) 532 | self.var = var 533 | self.min = min 534 | self.max = max 535 | self.step = step 536 | for i=1,select('#', ...) do 537 | self[i] = select(i, ...) 538 | end 539 | end 540 | function _foreq:serialize(consume) 541 | consume'for' 542 | consume(self.var) 543 | consume'=' 544 | consume(self.min) 545 | consume',' 546 | consume(self.max) 547 | if self.step then 548 | consume',' 549 | consume(self.step) 550 | end 551 | consume'do' 552 | spacesep(self, consume) 553 | consume'end' 554 | end 555 | 556 | -- TODO 'vars' should be a node itself 557 | local _forin = nodeclass('forin', _stmt) 558 | function _forin:init(vars, iterexprs, ...) 559 | self.vars = vars 560 | self.iterexprs = iterexprs 561 | for i=1,select('#', ...) do 562 | self[i] = select(i, ...) 563 | end 564 | end 565 | function _forin:serialize(consume) 566 | consume'for' 567 | commasep(self.vars, consume) 568 | consume'in' 569 | commasep(self.iterexprs, consume) 570 | consume'do' 571 | spacesep(self, consume) 572 | consume'end' 573 | end 574 | 575 | local _function = nodeclass('function', _stmt) 576 | -- name is optional 577 | -- TODO make 'args' a node 578 | function _function:init(name, args, ...) 579 | -- prep args... 580 | for i=1,#args do 581 | args[i].index = i 582 | args[i].param = true 583 | end 584 | self.name = name 585 | self.args = args 586 | for i=1,select('#', ...) do 587 | self[i] = select(i, ...) 
588 | end 589 | end 590 | function _function:serialize(consume) 591 | consume'function' 592 | if self.name then 593 | consume(self.name) 594 | end 595 | consume'(' 596 | commasep(self.args, consume) 597 | consume')' 598 | spacesep(self, consume) 599 | consume'end' 600 | end 601 | 602 | -- aux for _function 603 | -- not used by parser - externally used only - I should get rid of it 604 | local _arg = nodeclass'arg' 605 | -- TODO just self[1] ? 606 | function _arg:init(index) 607 | self.index = index 608 | end 609 | -- params need to know what function they're in 610 | -- so they can reference the function's arg names 611 | function _arg:serialize(consume) 612 | consume('arg'..self.index) 613 | end 614 | 615 | -- _local can be an assignment of multi vars to muli exprs 616 | -- or can optionally be a declaration of multi vars with no statements 617 | -- so it will take the form of assignments 618 | -- but it can also be a single function declaration with no equals symbol ... 619 | -- the parser has to accept functions and variables as separate conditions 620 | -- I'm tempted to make them separate symbols here too ... 
621 | -- exprs is a table containing: 1) a single function 2) a single assign statement 3) a list of variables 622 | local _local = nodeclass('local', _stmt) 623 | -- TODO just self[1] instead of self.exprs[i] 624 | function _local:init(exprs) 625 | if ast._function:isa(exprs[1]) or ast._assign:isa(exprs[1]) then 626 | assert(#exprs == 1, "local functions or local assignments must be the only child") 627 | end 628 | self.exprs = table(assert(exprs)) 629 | end 630 | function _local:serialize(consume) 631 | if ast._function:isa(self.exprs[1]) or ast._assign:isa(self.exprs[1]) then 632 | consume'local' 633 | consume(self.exprs[1]) 634 | else 635 | consume'local' 636 | commasep(self.exprs, consume) 637 | end 638 | end 639 | 640 | -- control 641 | 642 | local _return = nodeclass('return', _stmt) 643 | -- TODO either 'exprs' a node of its own, or flatten it into 'return' 644 | function _return:init(...) 645 | self.exprs = {...} 646 | end 647 | function _return:serialize(consume) 648 | consume'return' 649 | commasep(self.exprs, consume) 650 | end 651 | 652 | local _break = nodeclass('break', _stmt) 653 | function _break:serialize(consume) consume'break' end 654 | 655 | local _call = nodeclass'call' 656 | -- TODO 'args' a node of its own ? or store it in self[i] ? 657 | function _call:init(func, ...) 
658 | self.func = func 659 | self.args = {...} 660 | end 661 | function _call:serialize(consume) 662 | if #self.args == 1 663 | and (ast._table:isa(self.args[1]) 664 | or ast._string:isa(self.args[1]) 665 | ) then 666 | consume(self.func) 667 | consume(self.args[1]) 668 | else 669 | consume(self.func) 670 | consume'(' 671 | commasep(self.args, consume) 672 | consume')' 673 | end 674 | end 675 | 676 | local _nil = nodeclass'nil' 677 | _nil.const = true 678 | function _nil:serialize(consume) consume'nil' end 679 | 680 | local _boolean = nodeclass'boolean' 681 | 682 | local _true = nodeclass('true', _boolean) 683 | _true.const = true 684 | _true.value = true 685 | function _true:serialize(consume) consume'true' end 686 | 687 | local _false = nodeclass('false', _boolean) 688 | _false.const = true 689 | _false.value = false 690 | function _false:serialize(consume) consume'false' end 691 | 692 | local _number = nodeclass'number' 693 | -- TODO just self[1] instead of self.value ? 694 | -- but this breaks convention with _boolean having .value as its static member value. 695 | -- I could circumvent this with _boolean subclass [1] holding the value ... 696 | function _number:init(value) self.value = value end 697 | function _number:serialize(consume) consume(tostring(self.value)) end 698 | 699 | local _string = nodeclass'string' 700 | -- TODO just self[1] instead of self.value 701 | function _string:init(value) self.value = value end 702 | function _string:serialize(consume) 703 | -- use ext.tolua's string serializer 704 | consume(tolua(self.value)) 705 | end 706 | 707 | local _vararg = nodeclass'vararg' 708 | function _vararg:serialize(consume) consume'...' end 709 | 710 | -- TODO 'args' a node, or flatten into self[i] ? 711 | local _table = nodeclass'table' -- single-element assigns 712 | function _table:init(...) 713 | for i=1,select('#', ...) do 714 | self[i] = select(i, ...) 
715 | end 716 | end 717 | function _table:serialize(consume) 718 | consume'{' 719 | for i,arg in ipairs(self) do 720 | -- if it's an assign then wrap the vars[1] with []'s 721 | if ast._assign:isa(arg) then 722 | assert.len(arg.vars, 1) 723 | assert.len(arg.exprs, 1) 724 | -- TODO if it's a string and name and not a keyword then use our shorthand 725 | -- but for this , I should put the Lua keywords in somewhere that both the AST and Tokenizer can see them 726 | -- and the Tokenizer builds separate lists depending on the version (so I guess a table per version?) 727 | if ast.keyIsName(arg.vars[1], self.parser) then 728 | consume(arg.vars[1].value) 729 | else 730 | consume'[' 731 | consume(arg.vars[1]) 732 | consume']' 733 | end 734 | consume'=' 735 | consume(arg.exprs[1]) 736 | else 737 | consume(arg) 738 | end 739 | if i < #self then 740 | consume',' 741 | end 742 | end 743 | consume'}' 744 | end 745 | 746 | -- OK here is the classic example of the benefits of fields over integers: 747 | -- extensibility. 
748 | -- attrib was added later 749 | -- as we add/remove fields, that means reordering indexes, and that means a break in compat 750 | -- one workaround to merging the two is just named functions and integer-indexed children 751 | -- another is a per-child traversal routine (like :serialize()) 752 | local _var = nodeclass'var' -- variable, lhs of ast._assign's 753 | function _var:init(name, attrib) 754 | self.name = name 755 | self.attrib = attrib 756 | end 757 | function _var:serialize(consume) 758 | consume(self.name) 759 | if self.attrib then 760 | -- the extra space is needed for assignments, otherwise lua5.4 `local x=1` chokes while `local x =1` works 761 | consume'<' 762 | consume(self.attrib) 763 | consume'>' 764 | end 765 | end 766 | 767 | local _par = nodeclass'par' 768 | ast._par = _par 769 | ast._parenthesis = nil 770 | function _par:init(expr) 771 | self.expr = expr 772 | end 773 | function _par:serialize(consume) 774 | consume'(' 775 | consume(self.expr) 776 | consume')' 777 | end 778 | 779 | local _index = nodeclass'index' 780 | function _index:init(expr,key) 781 | self.expr = expr 782 | -- helper add wrappers to some types: 783 | -- TODO or not? 784 | if type(key) == 'string' then 785 | key = ast._string(key) 786 | elseif type(key) == 'number' then 787 | key = ast._number(key) 788 | end 789 | self.key = key 790 | end 791 | function _index:serialize(consume) 792 | if ast.keyIsName(self.key, self.parser) then 793 | -- the use a .$key instead of [$key] 794 | consume(self.expr) 795 | consume'.' 
796 | consume(self.key.value) 797 | else 798 | consume(self.expr) 799 | consume'[' 800 | consume(self.key) 801 | consume']' 802 | end 803 | end 804 | 805 | -- this isn't the () call itself, this is just the : dereference 806 | -- a:b(c) is _call(_indexself(_var'a', _var'b'), _var'c') 807 | -- technically this is a string lookup, however it is only valid as a lua name, so I'm just passing the Lua string itself 808 | local _indexself = nodeclass'indexself' 809 | function _indexself:init(expr,key) 810 | self.expr = assert(expr) 811 | assert(isLuaName(key)) 812 | -- TODO compat with _index? always wrap? do this before passing in key? 813 | --key = ast._string(key) 814 | self.key = assert(key) 815 | end 816 | function _indexself:serialize(consume) 817 | consume(self.expr) 818 | consume':' 819 | consume(self.key) 820 | end 821 | 822 | local _op = nodeclass'op' 823 | -- TODO 'args' a node ... or just flatten it into this node ... 824 | function _op:init(...) 825 | for i=1,select('#', ...) do 826 | self[i] = select(i, ...) 
827 | end 828 | end 829 | function _op:serialize(consume) 830 | for i,x in ipairs(self) do 831 | consume(x) 832 | if i < #self then consume(self.op) end 833 | end 834 | end 835 | 836 | for _,info in ipairs{ 837 | {'add','+'}, 838 | {'sub','-'}, 839 | {'mul','*'}, 840 | {'div','/'}, 841 | {'pow','^'}, 842 | {'mod','%'}, 843 | {'concat','..'}, 844 | {'lt','<'}, 845 | {'le','<='}, 846 | {'gt','>'}, 847 | {'ge','>='}, 848 | {'eq','=='}, 849 | {'ne','~='}, 850 | {'and','and'}, 851 | {'or','or'}, 852 | {'idiv', '//'}, -- 5.3+ 853 | {'band', '&'}, -- 5.3+ 854 | {'bxor', '~'}, -- 5.3+ 855 | {'bor', '|'}, -- 5.3+ 856 | {'shl', '<<'}, -- 5.3+ 857 | {'shr', '>>'}, -- 5.3+ 858 | } do 859 | local op = info[2] 860 | local cl = nodeclass(info[1], _op) 861 | cl.op = op 862 | end 863 | 864 | for _,info in ipairs{ 865 | {'unm','-'}, 866 | {'not','not'}, 867 | {'len','#'}, 868 | {'bnot','~'}, -- 5.3+ 869 | } do 870 | local op = info[2] 871 | local cl = nodeclass(info[1], _op) 872 | cl.op = op 873 | function cl:init(...) 874 | for i=1,select('#', ...) do 875 | self[i] = select(i, ...) 
		end
	end
	function cl:serialize(consume)
		consume(self.op)
		consume(self[1]) -- spaces required for 'not'
	end
end

-- goto <name> (a statement in lua 5.2+)
local _goto = nodeclass('goto', _stmt)
function _goto:init(name)
	self.name = name
end
function _goto:serialize(consume)
	consume'goto'
	consume(self.name)
end

-- ::<name>:: goto target
local _label = nodeclass('label', _stmt)
function _label:init(name)
	self.name = name
end
function _label:serialize(consume)
	consume'::'
	consume(self.name)
	consume'::'
end

return ast
--------------------------------------------------------------------------------
-- /lua/parser.lua:
--------------------------------------------------------------------------------
local table = require 'ext.table'
local assert = require 'ext.assert'
local Parser = require 'parser.base.parser'

local ast = require 'parser.lua.ast'

local LuaTokenizer = require 'parser.lua.tokenizer'

local LuaParser = Parser:subclass()

-- save the namespace here, for Parser:setData()
LuaParser.ast = ast

-- static function; returns the parse tree on success, or the
-- (false, message) failure results of setData() on error
function LuaParser.parse(data, source, ...)
	local parser = LuaParser(nil, nil, ...)
	local result = table.pack(parser:setData(data, source))
	if not result[1] then return result:unpack() end
	return parser.tree
end

-- TODO instead of version and useluajit, how about parseFlags, and enable/disable them depending on the version
function LuaParser:init(data, version, source, useluajit)
	self.version = version or _VERSION:match'^Lua (.*)$'
	if useluajit == nil then
		-- I could test for _G.jit's presence, but what if luajit is compiled with jit off but still has LL language feature on ...
		-- TODO unified load shim layer , esp for lua 5.1 ...
		-- TODO TODO if langfix's load has been replaced then this will segfault...
		-- we are detecting LL / ULL suffix, but using load to do so causes some recursion problems (since in some cases I've already overridden load() via ext.load and parser.load_xform ...)
		--local _load = loadstring or load
		--useluajit = _load'return 1LL'
		-- ... so instead, for now just assume jit's presence implies luajit implies LL / ULL for parsing
		useluajit = not not _G.jit
	end
	self.useluajit = not not useluajit

	-- TODO between this and parser.grammar, make a table-based way to specify the rules
	-- TODO TODO a token DAG from the grammar would be nice ...
	-- [[ what to name this ...
	-- expression-precedence levels, loosest-binding first
	self.parseExprPrecedenceRulesAndClassNames = table{
		{
			name = 'or',
			rules = {
				{token='or', className='_or'},
			},
		},
		{
			name = 'and',
			rules = {
				{token='and', className='_and'},
			},
		},
		{
			name = 'cmp',
			rules = {
				{token='<', className='_lt'},
				{token='>', className='_gt'},
				{token='<=', className='_le'},
				{token='>=', className='_ge'},
				{token='~=', className='_ne'},
				{token='==', className='_eq'},
			},
		},
	}:append(
		-- bitwise operator levels only exist for lua 5.3+
		self.version < '5.3' and nil or table{
			{
				name = 'bor',
				rules = {
					{token='|', className='_bor'},
				},
			},
			{
				name = 'bxor',
				rules = {
					{token='~', className='_bxor'},
				},
			},
			{
				name = 'band',
				rules = {
					{token='&', className='_band'},
				},
			},
			{
				name = 'shift',
				rules = {
					{token='<<', className='_shl'},
					{token='>>', className='_shr'},
				},
			},
		}):append{
		{
			name = 'concat',
			rules = {
				{token='..', className='_concat'},
			},
		},
		{
			name = 'addsub', -- arithmetic
			rules = {
				{token='+', className='_add'},
				{token='-', className='_sub'},
			},
		},
		{
			name = 'muldivmod', -- geometric
			rules = {
				{token='*', className='_mul'},
				{token='/', className='_div'},
				{token='%', className='_mod'},
				-- if version < 5.3 then the // symbol won't be added to the tokenizer anyways...
				{token='//', className='_idiv'},
			},
		},
		{
			name = 'unary',
			unaryLHS = true,
			rules = {
				{token='not', className='_not'},
				{token='#', className='_len'}, -- only a 5.1+ token
				{token='-', className='_unm'},
				{token='~', className='_bnot'}, -- only a 5.3+ token
			},
		},
		{
			name = 'pow',
			rules = {
				{token='^', className='_pow', nextLevel='unary'},
			},
		},
	}
	--]]

	if data then
		-- can't return from init so gotta error ...
		assert(self:setData(data, source))
	end
end

-- Tokenize and parse 'data'.  Returns true on success,
-- or false plus an error message on failure.
function LuaParser:setData(data, source)
	self.gotos = {} -- keep track of all gotos
	self.labels = {} -- keep track of all labels
	self.blockStack = table()
	self.functionStack = table{'function-vararg'}

	local result = table.pack(LuaParser.super.setData(self, data))
	if not result[1] then
		return result:unpack()
	end

	-- last verify that all gotos went to all labels
	for _,g in pairs(self.gotos) do
		if not self.labels[g.name] then
			return false, "line "..g.span.to.line..": no visible label '"..g.name.."' for "
		end
	end
	return true
end

-- Build the tokenizer used by setData(); overridable by subclasses.
function LuaParser:buildTokenizer(data)
	return LuaTokenizer(data, self.version, self.useluajit)
end

-- default entry point for parsing data sources
function LuaParser:parseTree()
	return self:parse_chunk()
end

-- chunk ::= {stat [';']} [laststat [';']]
function LuaParser:parse_chunk()
	local from = self:getloc()
	local stmts = table()
	if self.version >= '5.2' or self.useluajit then
		-- preceding ;'s allowed
		while self:canbe(';', 'symbol') do end
	end
	repeat
		local stmt = self:parse_stat()
		if not stmt then break end
		stmts:insert(stmt)
		self:canbe(';', 'symbol')
	until false
	local laststat = self:parse_retstat()
	if laststat then
		stmts:insert(laststat)
		self:canbe(';', 'symbol')
	end
	return self:node('_block', table.unpack(stmts))
	:setspan{from = from, to = self:getloc()}
end

-- Parse a chunk while tracking the enclosing block kind on blockStack.
function LuaParser:parse_block(blockName)
	if blockName then self.blockStack:insert(blockName) end
	local chunk = self:parse_chunk()
	if blockName then assert.eq(self.blockStack:remove(), blockName) end
	return chunk
end

-- Parse one statement; returns nil when the next token starts no statement.
function LuaParser:parse_stat()
	local from = self:getloc()
	if self:canbe('local', 'keyword') then
		local ffrom = self:getloc()
		if self:canbe('function', 'keyword') then
			-- local function Name funcbody
			local namevar = self:parse_var()
			if not namevar then error{msg="expected name"} end
			return self:node('_local', {
				self:makeFunction(
					namevar,
					table.unpack((assert(self:parse_funcbody(), {msg="expected function body"})))
				):setspan{from = ffrom , to = self:getloc()}
			}):setspan{from = from , to = self:getloc()}
		else
			-- local attnamelist ['=' explist]
			local afrom = self:getloc()
			local namelist = assert(self:parse_attnamelist(), {msg="expected attr name list"})
			if self:canbe('=', 'symbol') then
				local explist = assert(self:parse_explist(), {msg="expected expr list"})
				local assign = self:node('_assign', namelist, explist)
				:setspan{from = ffrom, to = self:getloc()}
				return self:node('_local', {assign})
				:setspan{from = from, to = self:getloc()}
			else
				return self:node('_local', namelist)
				:setspan{from = from, to = self:getloc()}
			end
		end
	elseif self:canbe('function', 'keyword') then
		local funcname = self:parse_funcname()
		return self:makeFunction(funcname, table.unpack((assert(self:parse_funcbody(), {msg="expected function body"}))))
		:setspan{from = from , to = self:getloc()}
	elseif self:canbe('for', 'keyword') then
		local namelist = assert(self:parse_namelist(), {msg="expected name list"})
		if self:canbe('=', 'symbol') then
			-- numeric for: one name, 2..3 control expressions
			assert.eq(#namelist, 1, {msg="expected only one name in for loop"})
			local explist = assert(self:parse_explist(), {msg="expected exp list"})
			assert.ge(#explist, 2, {msg="bad for loop"})
			assert.le(#explist, 3, {msg="bad for loop"})
			local doloc = self:getloc()
			self:mustbe('do', 'keyword')
			local block = assert(self:parse_block'for =', {msg="for loop expected block"})
			self:mustbe('end', 'keyword', 'do', doloc)
			return self:node('_foreq', namelist[1], explist[1], explist[2], explist[3], table.unpack(block))
			:setspan{from = from, to = self:getloc()}
		elseif self:canbe('in', 'keyword') then
			-- generic for: namelist in explist
			local explist = assert(self:parse_explist(), {msg="expected expr list"})
			local doloc = self:getloc()
			self:mustbe('do', 'keyword')
			local block = assert(self:parse_block'for in', {msg="expected block"})
			self:mustbe('end', 'keyword', 'do', doloc)
			return self:node('_forin', namelist, explist, table.unpack(block))
			:setspan{from = from, to = self:getloc()}
		else
			error{msg="'=' or 'in' expected"}
		end
	elseif self:canbe('if', 'keyword') then
		local cond = assert(self:parse_exp(), {msg="unexpected symbol"})
		self:mustbe('then', 'keyword')
		local block = self:parse_block()
		local stmts = table(block)
		-- ...and add elseifs and else to this
		local efrom = self:getloc()
		while self:canbe('elseif', 'keyword') do
			local cond = assert(self:parse_exp(), {msg='unexpected symbol'})
			self:mustbe('then', 'keyword')
			stmts:insert(
				self:node('_elseif', cond, table.unpack((assert(self:parse_block(), {msg='expected block'}))))
				:setspan{from = efrom, to = self:getloc()}
			)
			efrom = self:getloc()
		end
		if self:canbe('else', 'keyword') then
			stmts:insert(
				self:node('_else', table.unpack((assert(self:parse_block(), {msg='expected block'}))))
				:setspan{from = efrom, to = self:getloc()}
			)
		end
		self:mustbe('end', 'keyword', 'if', from)
		return self:node('_if', cond, table.unpack(stmts))
		:setspan{from = from, to = self:getloc()}
	elseif self:canbe('repeat', 'keyword') then
		local block = assert(self:parse_block'repeat', {msg='expected block'})
		self:mustbe('until', 'keyword')
		return self:node(
			'_repeat',
			(assert(self:parse_exp(), {msg='unexpected symbol'})),
			table.unpack(block)
		):setspan{from = from, to = self:getloc()}
	elseif self:canbe('while', 'keyword') then
		local cond = assert(self:parse_exp(), {msg='unexpected symbol'})
		local doloc = self:getloc()
		self:mustbe('do', 'keyword')
		local block = assert(self:parse_block'while', {msg='expected block'})
		self:mustbe('end', 'keyword', 'do', doloc)
		return self:node('_while', cond, table.unpack(block))
		:setspan{from = from, to = self:getloc()}
	elseif self:canbe('do', 'keyword') then
		local block = assert(self:parse_block(), {msg='expected block'})
		self:mustbe('end', 'keyword', 'do', from)
		return self:node('_do', table.unpack(block))
		:setspan{from = from, to = self:getloc()}
	elseif self.version >= '5.2' then
		if self:canbe('goto', 'keyword') then
			local name = self:mustbe(nil, 'name')
			local g = self:node('_goto', name)
			:setspan{from = from, to = self:getloc()}
			self.gotos[name] = g
			return g
		-- lua5.2+ break is a statement, so you can have multiple breaks in a row with no syntax error
		elseif self:canbe('break', 'keyword') then
			return self:parse_break()
			:setspan{from = from, to = self:getloc()}
		elseif self:canbe('::', 'symbol') then
			local name = self:mustbe(nil, 'name')
			local l = self:node('_label', name)
			self.labels[name] = true
			self:mustbe('::', 'symbol')
			return l:setspan{from = from, to = self:getloc()}
		end
	end

	-- now we handle functioncall and varlist = explist rules

	--[[
	stat ::= varlist `=` explist | functioncall
	varlist ::= var {`,` var}
	var ::= Name | prefixexp `[` exp `]` | prefixexp `.` Name
	prefixexp ::= var | functioncall | `(` exp `)`
	functioncall ::= prefixexp args | prefixexp `:` Name args
	right now prefixexp is designed to process trailing args ...
	... so just use it and complain if the wrapping ast is not a _call
	likewise with var, complain if it is a call
	--]]

	local prefixexp = self:parse_prefixexp()
	if prefixexp then
		if self.ast._call:isa(prefixexp) then -- function call
			return prefixexp
		else -- varlist assignment
			local vars = table{prefixexp}
			while self:canbe(',', 'symbol') do
				local var = assert(self:parse_prefixexp(), {msg='expected expr'})
				assert.ne(var.type, 'call', {msg="syntax error"})
				vars:insert(var)
			end
			return self:parse_assign(vars, from)
		end
	end
end

-- Parse the '= explist' tail of a varlist assignment.
function LuaParser:parse_assign(vars, from)
	self:mustbe('=', 'symbol')
	return self:node('_assign', vars, (assert(self:parse_explist(), {msg='expected expr'})))
	:setspan{from = from, to = self:getloc()}
end

-- 'laststat' in 5.1, 'retstat' in 5.2+
function LuaParser:parse_retstat()
	local from = self:getloc()
	-- lua5.2+ break is a statement, so you can have multiple breaks in a row with no syntax error
	-- that means only handle 'break' here in 5.1
	if self.version <= '5.1' and self:canbe('break', 'keyword') then
		return self:parse_break()
		:setspan{from = from, to = self:getloc()}
	end
	if self:canbe('return', 'keyword') then
		local explist = self:parse_explist() or {}
		return
self:node('_return', table.unpack(explist)) 366 | :setspan{from = from, to = self:getloc()} 367 | end 368 | end 369 | 370 | -- verify we're in a loop, then return the break 371 | 372 | function LuaParser:parse_break() 373 | local from = self:getloc() 374 | if not ({['while']=1, ['repeat']=1, ['for =']=1, ['for in']=1})[self.blockStack:last()] then 375 | error{msg="break not inside loop"} 376 | end 377 | return self:node('_break') 378 | :setspan{from = from, to = self:getloc()} 379 | end 380 | 381 | 382 | function LuaParser:parse_funcname() 383 | local from = self:getloc() 384 | local name = self:parse_var() 385 | if not name then return end 386 | while self:canbe('.', 'symbol') do 387 | local sfrom = self.t:getloc() 388 | name = self:node('_index', 389 | name, 390 | self:node('_string', self:mustbe(nil, 'name')) 391 | :setspan{from = sfrom, to = self:getloc()} 392 | ):setspan{from = from, to = self:getloc()} 393 | end 394 | if self:canbe(':', 'symbol') then 395 | name = self:node('_indexself', name, self:mustbe(nil, 'name')) 396 | :setspan{from = from, to = self:getloc()} 397 | end 398 | return name 399 | end 400 | 401 | -- parses a varialbe name, without attribs, and returns it in a '_var' node 402 | function LuaParser:parse_var() 403 | local from = self:getloc() 404 | local name = self:canbe(nil, 'name') 405 | if not name then return end 406 | return self:node('_var', name) 407 | :setspan{from=from, to=self:getloc()} 408 | end 409 | 410 | function LuaParser:parse_namelist() 411 | local var = self:parse_var() 412 | if not var then return end 413 | local names = table{var} 414 | while self:canbe(',', 'symbol') do 415 | names:insert((assert(self:parse_var(), {msg="expected name"}))) 416 | end 417 | return names 418 | end 419 | 420 | -- same as above but with optional attributes 421 | 422 | function LuaParser:parse_attnamelist() 423 | local from = self:getloc() 424 | local name = self:canbe(nil, 'name') 425 | if not name then return end 426 | local attrib = 
self:parse_attrib() 427 | local names = table{ 428 | self:node('_var', name, attrib) 429 | :setspan{from = from, to = self:getloc()} 430 | } 431 | while self:canbe(',', 'symbol') do 432 | from = self:getloc() 433 | local name = self:mustbe(nil, 'name') 434 | local attrib = self:parse_attrib() 435 | names:insert( 436 | self:node('_var', name, attrib) 437 | :setspan{from = from, to = self:getloc()} 438 | ) 439 | end 440 | return names 441 | end 442 | 443 | function LuaParser:parse_attrib() 444 | if self.version < '5.4' then return end 445 | local attrib 446 | if self:canbe('<', 'symbol') then 447 | attrib = self:mustbe(nil, 'name') 448 | self:mustbe('>', 'symbol') 449 | end 450 | return attrib 451 | end 452 | 453 | function LuaParser:parse_explist() 454 | local exp = self:parse_exp() 455 | if not exp then return end 456 | local exps = table{exp} 457 | while self:canbe(',', 'symbol') do 458 | exps:insert((assert(self:parse_exp(), {msg='unexpected symbol'}))) 459 | end 460 | return exps 461 | end 462 | 463 | --[[ 464 | exp ::= nil | false | true | Numeral | LiteralString | `...` | function | prefixexp | tableconstructor | exp binop exp | unop exp 465 | ... splitting this into two ... 
exp ::= [unop] subexp {binop [unop] subexp}
subexp ::= nil | false | true | Numeral | LiteralString | `...` | function | prefixexp | tableconstructor
--]]

-- Parse a full expression, delegating operator precedence handling to
-- parse_expr_precedenceTable starting at the lowest precedence level (1).
function LuaParser:parse_exp()
	return self:parse_expr_precedenceTable(1)
end

-- Parse a 'subexp' (an operand without binary/unary operators): table constructor,
-- prefix expression, function definition, '...', string, number, true, false, or nil.
-- Returns nil if nothing matches; errors on '...' misuse.
function LuaParser:parse_subexp()
	local tableconstructor = self:parse_tableconstructor()
	if tableconstructor then return tableconstructor end

	local prefixexp = self:parse_prefixexp()
	if prefixexp then return prefixexp end

	local functiondef = self:parse_functiondef()
	if functiondef then return functiondef end

	local from = self:getloc()
	if self:canbe('...', 'symbol') then
		-- '...' as an expression is not in 5.0, and is only valid inside a vararg function
		if self.version == '5.0' then error{msg="unexpected symbol near '...'"} end
		assert.eq(self.functionStack:last(), 'function-vararg', {msg='unexpected symbol'})
		return self:node('_vararg')
			:setspan{from = from, to = self:getloc()}
	end
	if self:canbe(nil, 'string') then
		return self:node('_string', self.lasttoken)
			:setspan{from = from, to = self:getloc()}
	end
	if self:canbe(nil, 'number') then
		return self:node('_number', self.lasttoken)
			:setspan{from = from, to = self:getloc()}
	end
	if self:canbe('true', 'keyword') then
		return self:node('_true')
			:setspan{from = from, to = self:getloc()}
	end
	if self:canbe('false', 'keyword') then
		return self:node('_false')
			:setspan{from = from, to = self:getloc()}
	end
	if self:canbe('nil', 'keyword') then
		return self:node('_nil')
			:setspan{from = from, to = self:getloc()}
	end
end

--[[
prefixexp ::= var | functioncall | `(` exp `)`

functioncall ::= prefixexp args | prefixexp `:` Name args
combine...
prefixexp ::= var | prefixexp args | prefixexp `:` Name args | `(` exp `)`
var ::= Name | prefixexp `[` exp `]` | prefixexp `.` Name
combine ...
prefixexp ::= Name | prefixexp `[` exp `]` | prefixexp `.` Name | prefixexp args | prefixexp `:` Name args | `(` exp `)`
simplify ...
prefixexp ::= (Name {'[' exp ']' | `.` Name | [`:` Name] args} | `(` exp `)`) {args}
--]]

-- Parse a prefix expression per the simplified grammar above: a Name or
-- parenthesized expression followed by any number of '[exp]', '.Name',
-- ':Name args', or plain 'args' suffixes, each of which wraps the node so far.
-- Returns nil if no leading Name or '(' is found.
function LuaParser:parse_prefixexp()
	local prefixexp
	local from = self:getloc()

	if self:canbe('(', 'symbol') then
		local exp = assert(self:parse_exp(), {msg='unexpected symbol'})
		self:mustbe(')', 'symbol')
		prefixexp = self:node('_par', exp)
			:setspan{from = from, to = self:getloc()}
	else
		prefixexp = self:parse_var()
		if not prefixexp then return end
	end

	-- greedily consume trailing index / method-call / call suffixes
	while true do
		if self:canbe('[', 'symbol') then
			prefixexp = self:node('_index', prefixexp, (assert(self:parse_exp(), {msg='unexpected symbol'})))
			self:mustbe(']', 'symbol')
			prefixexp:setspan{from = from, to = self:getloc()}
		elseif self:canbe('.', 'symbol') then
			local sfrom = self:getloc()
			prefixexp = self:node('_index',
				prefixexp,
				self:node('_string', self:mustbe(nil, 'name'))
					:setspan{from = sfrom, to = self:getloc()}
			)
			:setspan{from = from, to = self:getloc()}
		elseif self:canbe(':', 'symbol') then
			-- ':Name' must be immediately followed by call arguments
			prefixexp = self:node('_indexself',
				prefixexp,
				self:mustbe(nil, 'name')
			):setspan{from = from, to = self:getloc()}
			local args = self:parse_args()
			if not args then error{msg="function arguments expected"} end
			prefixexp = self:node('_call', prefixexp, table.unpack(args))
				:setspan{from = from, to = self:getloc()}
		else
			local args = self:parse_args()
			if not args then break end

			prefixexp = self:node('_call', prefixexp, table.unpack(args))
				:setspan{from = from, to = self:getloc()}
		end
	end

	return prefixexp
end

-- returns nil on fail to match, like all functions
-- produces error on syntax error
-- returns a table of the args -- particularly an empty table if no args were found
-- Call arguments are: a single string literal, a single table constructor,
-- or a parenthesized (possibly empty) expression list.

function LuaParser:parse_args()
	local from = self:getloc()
	if self:canbe(nil, 'string') then
		return {
			self:node('_string', self.lasttoken)
				:setspan{from = from, to = self:getloc()}
		}
	end

	local tableconstructor = self:parse_tableconstructor()
	if tableconstructor then return {tableconstructor} end

	if self:canbe('(', 'symbol') then
		local explist = self:parse_explist()
		self:mustbe(')', 'symbol')
		return explist or {}
	end
end
-- helper which also includes the line and col in the function object

function LuaParser:makeFunction(...)
	return self:node('_function', ...) -- no :setspan(), this is done by the caller
end
-- 'function' in the 5.1 syntax
-- Parse an anonymous function expression: 'function' funcbody.
-- Returns nil if the next token is not the 'function' keyword.

function LuaParser:parse_functiondef()
	local from = self:getloc()
	if not self:canbe('function', 'keyword') then return end
	return self:makeFunction(nil, table.unpack((assert(self:parse_funcbody(), {msg='expected function body'}))))
		:setspan{from = from, to = self:getloc()}
end
-- returns a table of ...
-- first element is a table of args, rest of elements are the body statements

-- Parse a function body: '(' parlist ')' block 'end'.
-- Pushes 'function' or 'function-vararg' onto functionStack so nested '...'
-- expressions can be validated, and pops/asserts it afterwards.
-- Returns nil if the next token is not '('.
function LuaParser:parse_funcbody()
	local funcloc = self:getloc()
	if not self:canbe('(', 'symbol') then return end
	local args = self:parse_parlist() or table()
	local lastArg = args:last()
	-- a trailing '...' parameter marks the whole function as vararg
	local functionType = self.ast._vararg:isa(lastArg) and 'function-vararg' or 'function'
	self:mustbe(')', 'symbol')
	self.functionStack:insert(functionType)
	local block = self:parse_block(functionType)
	assert.eq(self.functionStack:remove(), functionType)
	self:mustbe('end', 'keyword', 'function', funcloc)
	return table{args, table.unpack(block)}
end

function LuaParser:parse_parlist()	-- matches namelist() with ... as a terminator
	-- Returns a table of '_var' nodes, optionally ending with a '_vararg' node;
	-- nil if no parameter is found; errors if a comma is not followed by a name or '...'.
	local from = self:getloc()
	if self:canbe('...', 'symbol') then
		return table{
			self:node('_vararg')
				:setspan{from = from, to = self:getloc()}
		}
	end

	local namevar = self:parse_var()
	if not namevar then return end
	local names = table{namevar}
	while self:canbe(',', 'symbol') do
		from = self:getloc()
		if self:canbe('...', 'symbol') then
			-- '...' terminates the parameter list
			names:insert(
				self:node('_vararg')
					:setspan{from = from, to = self:getloc()}
			)
			return names
		end
		local namevar = self:parse_var()
		if not namevar then error{msg="expected name"} end
		names:insert(namevar)
	end
	return names
end

-- Parse a table constructor '{ fieldlist }' into a '_table' node whose indexed
-- children are the parsed fields.  Returns nil if the next token is not '{'.
function LuaParser:parse_tableconstructor()
	local from = self:getloc()
	if not self:canbe('{', 'symbol') then return end
	if self.version == '5.0' then
		-- despite what the 5.0 syntax says, it looks like the 5.0 parser will parse and ignore a leading semicolon as valid: {; 1, 2, 3, 4}
		self:canbe(';', 'symbol')
	end
	local fields = self:parse_fieldlist()
	self:mustbe('}', 'symbol')
	--[[ ok design flaw I didn't foresee when trying to unify all the AST as indexed children (like my symmath project)
	-- if this _table's children are too big then you can't unpack it into the ctor args...
	local result = self:node('_table', table.unpack(fields or {}))
	--]]
	-- [[ ... so instead, manually insert them...
	-- but a later TODO might be to go back to accepting a table-of-children.
	local result = self:node'_table'
	if fields then
		for i,field in ipairs(fields) do
			result[i] = field
		end
	end
	--]]
	result:setspan{from = from, to = self:getloc()}
	return result
end

-- Parse 'field {fieldsep field} [fieldsep]'; returns a table of field nodes or
-- nil if no field is found.  A trailing separator is consumed and ignored.
function LuaParser:parse_fieldlist()
	local field = self:parse_field()
	if not field then return end
	local fields = table{field}
	while self:parse_fieldsep() do
		local field = self:parse_field()
		if not field then break end
		fields:insert(field)
	end
	self:parse_fieldsep()
	return fields
end

-- Parse one table-constructor field:
--   '[' exp ']' '=' exp   => '_assign' node
--   Name '=' exp          => '_assign' node with the name re-wrapped as a '_string' key
--   exp                   => positional value, returned as-is
-- Returns nil if nothing matches.
function LuaParser:parse_field()
	local from = self:getloc()
	if self:canbe('[', 'symbol') then
		local keyexp = assert(self:parse_exp(), {msg='unexpected symbol'})
		self:mustbe(']', 'symbol')
		self:mustbe('=', 'symbol')
		local valexp = self:parse_exp()
		if not valexp then error{msg="expected expression but found "..tostring(self.t.token)} end
		return self:node('_assign', {keyexp}, {valexp})
			:setspan{from = from, to = self:getloc()}
	end

	-- this will be Name or exp
	-- in the case that it is a Name then check for = exp
	local exp = self:parse_exp()
	if not exp then return end

	if self.ast._var:isa(exp) and self:canbe('=', 'symbol') then
		return self:node('_assign',
			{
				self:node('_string', exp.name):setspan(exp.span)
			}, {
				(assert(self:parse_exp(), {msg='unexpected symbol'}))
			}
		):setspan{from = from, to = self:getloc()}
	else
		return exp
	end
end

-- fieldsep ::= ',' | ';'
-- Returns truthy when a field separator was consumed.
function LuaParser:parse_fieldsep()
	return self:canbe(',', 'symbol') or self:canbe(';', 'symbol')
end

return LuaParser
--------------------------------------------------------------------------------
-- /lua/tokenizer.lua:
--------------------------------------------------------------------------------
local table = require 'ext.table'
local assert = require 'ext.assert'
local Tokenizer = require 'parser.base.tokenizer'

local LuaTokenizer = Tokenizer:subclass()

-- LuaJIT int64 literal suffixes.
-- If an 'LL' or 'ULL' suffix follows in reader `r`, append it to the
-- number-part list `n`.  (Shared by the hex and decimal number parsers.)
local function readIntSuffix(r, n)
	if r:canbe'LL' then
		n:insert'LL'
	elseif r:canbe'ULL' then
		n:insert'ULL'
	end
end

--[[
NOTICE this only needs to be initialized once per tokenizer, not per-data-source
however at the moment it does need to be initialized once-per-version (as the extra arg to Tokenizer)
maybe I should move it to static initialization and move version-based stuff to subclasses' static-init?

So why 'symbols' vs 'keywords' ?
'Keywords' consist of valid names (names like variables functions etc use)
while 'symbols' consist of everything else. (can symbols contain letters that names can use? at the moment they do not.)
For this reason, when parsing, keywords need separated spaces, while symbols do not (except for distinguishing between various-sized symbols, i.e. < < vs <<).
--]]
function LuaTokenizer:initSymbolsAndKeywords(version, useluajit)
	-- stored for later use by parseHexNumber / parseDecNumber
	self.version = assert(version)
	self.useluajit = useluajit

	for w in ([[... .. == ~= <= >= + - * / ^ < > = ( ) { } [ ] ; : , .]]):gmatch('%S+') do
		self.symbols:insert(w)
	end

	if version >= '5.1' then
		self.symbols:insert'#'
		self.symbols:insert'%'
	end

	for w in ([[and break do else elseif end false for function if in local nil not or repeat return then true until while]]):gmatch('%S+') do
		self.keywords[w] = true
	end

	-- TODO this will break because luajit doesn't care about versions
	-- if I use a load-test, the ext.load shim layer will break
	-- if I use a load('goto=true') test without ext.load then load() doesn't accept strings for 5.1 when the goto isn't a keyword, so I might as well just test if load can load any string ...
	-- TODO separate language features from versions and put all the language options in a ctor table somewhere
	do--if version >= '5.2' then
		self.symbols:insert'::' -- for labels .. make sure you insert it before ::
		self.keywords['goto'] = true
	end

	if version >= '5.3' then -- and not useluajit then ... setting this fixes some validation tests, but setting it breaks langfix+luajit ... TODO straighten out parser/version configuration
		self.symbols:insert'//'
		self.symbols:insert'~'
		self.symbols:insert'&'
		self.symbols:insert'|'
		self.symbols:insert'<<'
		self.symbols:insert'>>'
	end
end

function LuaTokenizer:init(...)
	LuaTokenizer.super.init(self, ...)

	-- skip past an initial '#' line (shebang), like the Lua interpreter does
	local r = self.r
	if r.data:sub(1,1) == '#' then
		if not r:seekpast'\n' then
			r:seekpast'$'
		end
	end
end

-- Lua block comments: --[====[ ... ]====]
-- Returns true when a block comment was consumed.
function LuaTokenizer:parseBlockComment()
	local r = self.r
	-- look for --[====[
	if not r:canbe'%-%-%[=*%[' then return end
	self:readRestOfBlock(r.lasttoken)
	return true
end

function LuaTokenizer:parseString()
	-- try to parse block strings
	if self:parseBlockString() then return true end

	-- try for base's quote strings
	return LuaTokenizer.super.parseString(self)
end

-- Lua-specific block strings: [====[ ... ]====]
-- Yields the string contents as a 'string' token and returns true on success.
function LuaTokenizer:parseBlockString()
	local r = self.r
	if not r:canbe'%[=*%[' then return end
	if self:readRestOfBlock(r.lasttoken) then
		--DEBUG(@5): print('read multi-line string ['..(r.index-#r.lasttoken)..','..r.index..']: '..r.lasttoken)
		coroutine.yield(r.lasttoken, 'string')
		return true
	end
end

-- Given the opening long-bracket token (e.g. '[==['), read to the matching
-- closing bracket (']==]') and leave the contents in r.lasttoken.
-- Raises {msg=...} if the closing bracket is never found.
function LuaTokenizer:readRestOfBlock(startToken)
	local r = self.r

	local eq = assert(startToken:match('%[(=*)%[$'))
	-- if the first character is a newline then skip it
	-- (matches the Lua reference manual's long-string behavior)
	r:canbe'\n'
	local start = r.index
	if not r:seekpast('%]'..eq..'%]') then
		error{msg="expected closing block"}
	end
	-- since we used seekpast, the string isn't being captured as a lasttoken ...
	--return r:setlasttoken(r.data:sub(start, r.index - #r.lasttoken - 1))
	-- ... so don't push it into the history here, just assign it.
	r.lasttoken = r.data:sub(start, r.index - #r.lasttoken - 1)
	return r.lasttoken
end

-- Parse the rest of a hexadecimal numeral (the '0x' prefix has already been
-- consumed by the caller) and yield it as a 'number' token.
function LuaTokenizer:parseHexNumber(...)
	local r = self.r
	-- 5.2+ allows hex floats: a decimal point plus a 'p' binary exponent
	if self.version >= '5.2' then
		-- mustbe (not canbe): a bare '0x' with no digits is a malformed
		-- number, not a nil-index crash
		local token = r:mustbe('[%.%da-fA-F]+', 'malformed number')
		local numdots = #token:gsub('[^%.]','')
		assert.le(numdots, 1, {msg='malformed number'})
		local n = table{'0x', token}
		if r:canbe'[pP]' then
			n:insert(r.lasttoken)
			-- fun fact, while the hex float can include hex digits, its 'p'
			-- exponent must be in decimal, and its sign is optional:
			-- 0x1p4, 0x1p+4, 0x1p-4 are all valid
			n:insert(r:mustbe('[%+%-]?%d+', 'malformed number'))
		elseif numdots == 0 and self.useluajit then
			readIntSuffix(r, n)
		end
		coroutine.yield(n:concat(), 'number')
	else
		--return LuaTokenizer.super.parseHexNumber(self, ...)
		local token = r:mustbe('[%da-fA-F]+', 'malformed number')
		local n = table{'0x', token}
		if self.useluajit then
			readIntSuffix(r, n)
		end
		coroutine.yield(n:concat(), 'number')
	end
end

-- Parse a decimal numeral (integer, float, or exponent form) and yield it
-- as a 'number' token.
function LuaTokenizer:parseDecNumber()
	local r = self.r
	local token = r:canbe'[%.%d]+'
	local numdots = #token:gsub('[^%.]','')
	assert.le(numdots, 1, {msg='malformed number'})
	local n = table{token}
	-- exponent marker may be 'e' or 'E', and its sign is optional:
	-- 1e5, 1E5, 1e+5, 1e-5 are all valid Lua numerals
	if r:canbe'[eE]' then
		n:insert(r.lasttoken)
		n:insert(r:mustbe('[%+%-]?%d+', 'malformed number'))
	elseif numdots == 0 and self.useluajit then
		readIntSuffix(r, n)
	end
	coroutine.yield(n:concat(), 'number')
end

return LuaTokenizer
--------------------------------------------------------------------------------
-- /parser.lua:
-------------------------------------------------------------------------------- 1 | -- me moving classes around 2 | -- TODO get rid of this file and rename all `require 'parser'` to `require 'parser.lua.parser'` ... or maybe ... 3 | -- ... maybe that's a bad idea, because it is more verbose ... 4 | -- maybe instead of forwarding LuaParser, I should just write some wrapper function shere, like parser.parse(...) to auto-construct a LuaParser and return its tree ... 5 | return require 'parser.lua.parser' 6 | -------------------------------------------------------------------------------- /parser.rockspec: -------------------------------------------------------------------------------- 1 | package = "parser" 2 | version = "dev-1" 3 | source = { 4 | url = "git+https://github.com/thenumbernine/lua-parser" 5 | } 6 | description = { 7 | summary = "Lua Parser in Lua", 8 | detailed = "Lua Parser in Lua", 9 | homepage = "https://github.com/thenumbernine/lua-parser", 10 | license = "MIT" 11 | } 12 | dependencies = { 13 | "lua ~> 5.3" 14 | } 15 | build = { 16 | type = "builtin", 17 | modules = { 18 | ["parser"] = "parser.lua", 19 | ["parser.load_xform"] = "load_xform.lua", 20 | ["parser.base.ast"] = "base/ast.lua", 21 | ["parser.base.datareader"] = "base/datareader.lua", 22 | ["parser.base.parser"] = "base/parser.lua", 23 | ["parser.base.tokenizer"] = "base/tokenizer.lua", 24 | ["parser.grammar.parser"] = "grammar/parser.lua", 25 | ["parser.grammar.tokenizer"] = "grammar/tokenizer.lua", 26 | ["parser.lua.ast"] = "lua/ast.lua", 27 | ["parser.lua.parser"] = "lua/parser.lua", 28 | ["parser.lua.tokenizer"] = "lua/tokenizer.lua", 29 | }, 30 | copy_directories = { 31 | "tests" 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /syntax_5.0.txt: -------------------------------------------------------------------------------- 1 | chunk ::= {stat [`;´]} ; 2 | 3 | block ::= chunk ; 4 | 5 | stat ::= varlist1 `=´ explist1 6 | | functioncall 
7 | | do block end 8 | | while exp do block end 9 | | repeat block until exp 10 | | if exp then block {elseif exp then block} [else block] end 11 | | return [explist1] 12 | | break 13 | | for Name `=´ exp `,´ exp [`,´ exp] do block end 14 | | for Name {`,´ Name} in explist1 do block end 15 | | function funcname funcbody 16 | | local function Name funcbody 17 | | local namelist [init] ; 18 | 19 | funcname ::= Name {`.´ Name} [`:´ Name] ; 20 | 21 | varlist1 ::= var {`,´ var} ; 22 | 23 | var ::= Name 24 | | prefixexp `[´ exp `]´ 25 | | prefixexp `.´ Name ; 26 | 27 | namelist ::= Name {`,´ Name} ; 28 | 29 | init ::= `=´ explist1 ; 30 | 31 | explist1 ::= {exp `,´} exp ; 32 | 33 | exp ::= nil 34 | | false 35 | | true 36 | | Number 37 | | Literal 38 | | function 39 | | prefixexp 40 | | tableconstructor 41 | | exp binop exp 42 | | unop exp ; 43 | 44 | prefixexp ::= var 45 | | functioncall 46 | | `(´ exp `)´ ; 47 | 48 | functioncall ::= prefixexp args 49 | | prefixexp `:´ Name args ; 50 | 51 | args ::= `(´ [explist1] `)´ 52 | | tableconstructor 53 | | Literal ; 54 | 55 | function ::= function funcbody ; 56 | 57 | funcbody ::= `(´ [parlist] `)´ block end ; 58 | 59 | parlist ::= Name {`,´ Name} [`,´ `...´] 60 | | `...´ ; 61 | 62 | tableconstructor ::= `{´ [fieldlist] `}´ ; 63 | 64 | fieldlist ::= field {fieldsep field} [fieldsep] ; 65 | 66 | field ::= `[´ exp `]´ `=´ exp 67 | | name `=´ exp 68 | | exp ; 69 | 70 | fieldsep ::= `,´ 71 | | `;´ ; 72 | 73 | binop ::= `+´ 74 | | `-´ 75 | | `*´ 76 | | `/´ 77 | | `^´ 78 | | `..´ 79 | | `<´ 80 | | `<=´ 81 | | `>´ 82 | | `>=´ 83 | | `==´ 84 | | `~=´ 85 | | and 86 | | or ; 87 | 88 | unop ::= `-´ 89 | | not ; 90 | -------------------------------------------------------------------------------- /syntax_5.1.txt: -------------------------------------------------------------------------------- 1 | chunk ::= {stat [';']} [laststat [';']] ; 2 | 3 | block ::= chunk ; 4 | 5 | stat ::= varlist '=' explist 6 | | functioncall 7 | | 'do' block 'end' 
8 | | 'while' exp 'do' block 'end' 9 | | 'repeat' block 'until' exp 10 | | 'if' exp 'then' block {'elseif' exp 'then' block} ['else' block] 'end' 11 | | 'for' Name '=' exp ',' exp [',' exp] 'do' block 'end' 12 | | 'for' namelist 'in' explist 'do' block 'end' 13 | | 'function' funcname funcbody 14 | | 'local' 'function' Name funcbody 15 | | 'local' namelist ['=' explist] 16 | ; 17 | 18 | laststat ::= 'return' [explist] 19 | | 'break' 20 | ; 21 | 22 | funcname ::= Name {'.' Name} [':' Name] ; 23 | 24 | varlist ::= var {',' var} ; 25 | 26 | var ::= Name 27 | | prefixexp '[' exp ']' 28 | | prefixexp '.' Name 29 | ; 30 | 31 | namelist ::= Name {',' Name} ; 32 | 33 | explist ::= {exp ','} exp ; 34 | 35 | exp ::= 'nil' 36 | | 'false' 37 | | 'true' 38 | | Numeral 39 | | LiteralString 40 | | '...' 41 | | functiondef 42 | | prefixexp 43 | | tableconstructor 44 | | exp binop exp 45 | | unop exp 46 | ; 47 | 48 | prefixexp ::= var 49 | | functioncall 50 | | '(' exp ')' 51 | ; 52 | 53 | functioncall ::= prefixexp args 54 | | prefixexp ':' Name args 55 | ; 56 | 57 | args ::= '(' [explist] ')' 58 | | tableconstructor 59 | | LiteralString 60 | ; 61 | 62 | functiondef ::= 'function' funcbody ; 63 | 64 | funcbody ::= '(' [parlist] ')' block 'end' ; 65 | 66 | parlist ::= namelist [',' '...'] 67 | | '...' 68 | ; 69 | 70 | tableconstructor ::= '{' [fieldlist] '}' ; 71 | 72 | fieldlist ::= field {fieldsep field} [fieldsep] ; 73 | 74 | field ::= '[' exp ']' '=' exp 75 | | Name '=' exp 76 | | exp 77 | ; 78 | 79 | fieldsep ::= ',' 80 | | ';' 81 | ; 82 | 83 | binop ::= '+' 84 | | '-' 85 | | '*' 86 | | '/' 87 | | '^' 88 | | '%' 89 | | '..' 90 | | '<' 91 | | '<=' 92 | | '>' 93 | | '>=' 94 | | '==' 95 | | '~=' 96 | | 'and' 97 | | 'or' 98 | ; 99 | 100 | unop ::= '-' 101 | | 'not' 102 | | '#' 103 | ; 104 | 105 | -- Name ::= ... how to define valid names ... 106 | -- Numeral ::= ... how to define numerals ... 107 | -- LiteralString ::= how to define literal strings ... 
108 | -------------------------------------------------------------------------------- /syntax_5.2.txt: -------------------------------------------------------------------------------- 1 | chunk ::= block ; 2 | 3 | block ::= {stat} [retstat] ; 4 | 5 | retstat ::= 'return' [explist] [';'] ; 6 | 7 | stat ::= ';' 8 | | varlist '=' explist 9 | | functioncall 10 | | label 11 | | 'break' 12 | | 'goto' Name 13 | | 'do' block 'end' 14 | | 'while' exp 'do' block 'end' 15 | | 'repeat' block 'until' exp 16 | | 'if' exp 'then' block {'elseif' exp 'then' block} ['else' block] 'end' 17 | | 'for' Name '=' exp ',' exp [',' exp] 'do' block 'end' 18 | | 'for' namelist 'in' explist 'do' block 'end' 19 | | 'function' funcname funcbody 20 | | 'local' 'function' Name funcbody 21 | | 'local' namelist ['=' explist] 22 | ; 23 | 24 | varlist ::= var {',' var} ; 25 | 26 | funcname ::= Name {'.' Name} [':' Name] ; 27 | 28 | label ::= '::' Name '::' ; 29 | 30 | var ::= Name 31 | | prefixexp '[' exp ']' 32 | | prefixexp '.' Name 33 | ; 34 | 35 | namelist ::= Name {',' Name} ; 36 | 37 | explist ::= exp {',' exp} ; 38 | 39 | exp ::= 'nil' 40 | | 'false' 41 | | 'true' 42 | | Numeral 43 | | LiteralString 44 | | '...' 45 | | functiondef 46 | | prefixexp 47 | | tableconstructor 48 | | exp binop exp 49 | | unop exp 50 | ; 51 | 52 | prefixexp ::= var 53 | | functioncall 54 | | '(' exp ')' 55 | ; 56 | 57 | functioncall ::= prefixexp args 58 | | prefixexp ':' Name args ; 59 | 60 | args ::= '(' [explist] ')' 61 | | tableconstructor 62 | | LiteralString 63 | ; 64 | 65 | functiondef ::= 'function' funcbody ; 66 | 67 | funcbody ::= '(' [parlist] ')' block 'end' ; 68 | 69 | parlist ::= namelist [',' '...'] 70 | | '...' 
71 | ; 72 | 73 | tableconstructor ::= '{' [fieldlist] '}' ; 74 | 75 | fieldlist ::= field {fieldsep field} [fieldsep] ; 76 | 77 | field ::= '[' exp ']' '=' exp 78 | | Name '=' exp 79 | | exp 80 | ; 81 | 82 | fieldsep ::= ',' | ';' ; 83 | 84 | binop ::= '+' 85 | | '-' 86 | | '*' 87 | | '/' 88 | | '^' 89 | | '%' 90 | | '..' 91 | | '<' 92 | | '<=' 93 | | '>' 94 | | '>=' 95 | | '==' 96 | | '~=' 97 | | 'and' 98 | | 'or' 99 | ; 100 | 101 | unop ::= '-' 102 | | 'not' 103 | | '#' 104 | ; 105 | -------------------------------------------------------------------------------- /syntax_5.3.txt: -------------------------------------------------------------------------------- 1 | chunk ::= block ; 2 | 3 | block ::= {stat} [retstat] ; 4 | 5 | stat ::= ';' 6 | | varlist '=' explist 7 | | functioncall 8 | | label 9 | | break 10 | | goto Name 11 | | do block end 12 | | while exp do block end 13 | | repeat block until exp 14 | | if exp then block {elseif exp then block} [else block] end 15 | | for Name '=' exp ',' exp [',' exp] do block end 16 | | for namelist in explist do block end 17 | | function funcname funcbody 18 | | local function Name funcbody 19 | | local namelist ['=' explist] 20 | ; 21 | 22 | retstat ::= return [explist] [';'] ; 23 | 24 | label ::= '::' Name '::' ; 25 | 26 | funcname ::= Name {'.' Name} [':' Name] ; 27 | 28 | varlist ::= var {',' var} ; 29 | 30 | var ::= Name 31 | | prefixexp '[' exp ']' 32 | | prefixexp '.' Name 33 | ; 34 | 35 | namelist ::= Name {',' Name} ; 36 | 37 | explist ::= exp {',' exp} ; 38 | 39 | exp ::= nil 40 | | false 41 | | true 42 | | Numeral 43 | | LiteralString 44 | | '...' 
45 | | functiondef 46 | | prefixexp 47 | | tableconstructor 48 | | exp binop exp 49 | | unop exp 50 | ; 51 | 52 | prefixexp ::= var 53 | | functioncall 54 | | '(' exp ')' 55 | ; 56 | 57 | functioncall ::= prefixexp args 58 | | prefixexp ':' Name args 59 | ; 60 | 61 | args ::= '(' [explist] ')' 62 | | tableconstructor 63 | | LiteralString 64 | ; 65 | 66 | functiondef ::= function funcbody ; 67 | 68 | funcbody ::= '(' [parlist] ')' block end ; 69 | 70 | parlist ::= namelist [',' '...'] 71 | | '...' 72 | ; 73 | 74 | tableconstructor ::= '{' [fieldlist] '}' ; 75 | 76 | fieldlist ::= field {fieldsep field} [fieldsep] ; 77 | 78 | field ::= '[' exp ']' '=' exp 79 | | Name '=' exp 80 | | exp 81 | ; 82 | 83 | fieldsep ::= ',' 84 | | ';' 85 | ; 86 | 87 | binop ::= '+' 88 | | '-' 89 | | '*' 90 | | '/' 91 | | '//' 92 | | '^' 93 | | '%' 94 | | '&' 95 | | '~' 96 | | '|' 97 | | '>>' 98 | | '<<' 99 | | '..' 100 | | '<' 101 | | '<=' 102 | | '>' 103 | | '>=' 104 | | '==' 105 | | '~=' 106 | | and 107 | | or 108 | ; 109 | 110 | unop ::= '-' 111 | | not 112 | | '#' 113 | | '~' 114 | ; 115 | -------------------------------------------------------------------------------- /syntax_5.4.txt: -------------------------------------------------------------------------------- 1 | chunk ::= block 2 | ; 3 | 4 | block ::= {stat} [retstat] ; 5 | 6 | stat ::= ';' 7 | | varlist '=' explist 8 | | functioncall 9 | | label 10 | | break 11 | | goto Name 12 | | do block end 13 | | while exp do block end 14 | | repeat block until exp 15 | | if exp then block {elseif exp then block} [else block] end 16 | | for Name '=' exp ',' exp [',' exp] do block end 17 | | for namelist in explist do block end 18 | | function funcname funcbody 19 | | local function Name funcbody 20 | | local attnamelist ['=' explist] 21 | ; 22 | 23 | attnamelist ::= Name attrib {',' Name attrib} ; 24 | 25 | attrib ::= ['<' Name '>'] ; 26 | 27 | retstat ::= return [explist] [';'] ; 28 | 29 | label ::= '::' Name '::' ; 30 | 31 | funcname 
::= Name {'.' Name} [':' Name] ; 32 | 33 | varlist ::= var {',' var} ; 34 | 35 | var ::= Name 36 | | prefixexp '[' exp ']' 37 | | prefixexp '.' Name 38 | ; 39 | 40 | namelist ::= Name {',' Name} ; 41 | 42 | explist ::= exp {',' exp} ; 43 | 44 | exp ::= nil 45 | | false 46 | | true 47 | | Numeral 48 | | LiteralString 49 | | '...' 50 | | functiondef 51 | | prefixexp 52 | | tableconstructor 53 | | exp binop exp 54 | | unop exp 55 | ; 56 | 57 | prefixexp ::= var 58 | | functioncall 59 | | '(' exp ')' 60 | ; 61 | 62 | functioncall ::= prefixexp args 63 | | prefixexp ':' Name args 64 | ; 65 | 66 | args ::= '(' [explist] ')' 67 | | tableconstructor 68 | | LiteralString 69 | ; 70 | 71 | functiondef ::= function funcbody ; 72 | 73 | funcbody ::= '(' [parlist] ')' block end ; 74 | 75 | parlist ::= namelist [',' '...'] 76 | | '...' 77 | ; 78 | 79 | tableconstructor ::= '{' [fieldlist] '}' ; 80 | 81 | fieldlist ::= field {fieldsep field} [fieldsep] ; 82 | 83 | field ::= '[' exp ']' '=' exp 84 | | Name '=' exp 85 | | exp 86 | ; 87 | 88 | fieldsep ::= ',' 89 | | ';' 90 | ; 91 | 92 | binop ::= '+' 93 | | '-' 94 | | '*' 95 | | '/' 96 | | '//' 97 | | '^' 98 | | '%' 99 | | '&' 100 | | '~' 101 | | '|' 102 | | '>>' 103 | | '<<' 104 | | '..' 105 | | '<' 106 | | '<=' 107 | | '>' 108 | | '>=' 109 | | '==' 110 | | '~=' 111 | | and 112 | | or 113 | ; 114 | 115 | unop ::= '-' 116 | | not 117 | | '#' 118 | | '~' 119 | ; 120 | -------------------------------------------------------------------------------- /syntax_ast_5.1.txt: -------------------------------------------------------------------------------- 1 | -- TODO declare a parent-node 'op' somehow 2 | -- one downside to this system is .. you need one rule per unique ast node ... 3 | -- TODO all these should inherit from 'op' 4 | -- another TODO ... 5 | -- ... 
all these were originally implemented as `if token found then build the node` 6 | -- but for the auto generation, I think I have to build the node, then bail if something doesn't match ... 7 | -- ... and that means no more 'mustbe' ? since instead it'll just be returning nil? 8 | -- or should I keep the old design? but that means changing the code-generation ... 9 | -- hmm but that means pushing the first matched token of each rule back into the calling rule as a ... 10 | -- `if canbe(symbol) then parse_nextrule()` 11 | -- that might mean I need to assert every rule only has "or" on its topmost, and then next, every expression starts with a keyword/symbol 12 | -- 13 | -- or how about I just generate a FSM? that seems to be trendy. 14 | 15 | -- are the captures just the rules themselves? 16 | -- should I just capture everything? 17 | -- should I tag everything with what rule created it? 18 | 19 | block ::= {(stat) [';']} [(laststat) [';']] ; 20 | 21 | stat ::= 'local' 'function' (Name) (funcbody) 22 | | 'local' (namelist) ['=' (explist)] 23 | | 'function' (funcname) (funcbody) 24 | | 'for' (Name) '=' (exp) ',' (exp) [',' (exp)] 'do' (block) 'end' 25 | | 'for' (namelist) 'in' (explist) 'do' (block) 'end' 26 | | 'if' (exp) 'then' (block) {'elseif' (exp) 'then' (block)} ['else' (block)] 'end' 27 | | 'repeat' (block) 'until' (exp) 28 | | 'while' (exp) 'do' (block) 'end' 29 | | 'do' (block) 'end' 30 | | (functioncall) 31 | | (varlist) '=' (explist) 32 | ; 33 | 34 | laststat ::= 'return' [explist] 35 | | 'break' 36 | ; 37 | 38 | funcname ::= Name {'.' Name} [':' Name] ; 39 | 40 | varlist ::= var {',' var} ; 41 | 42 | var ::= Name 43 | | prefixexp '[' exp ']' 44 | | prefixexp '.' Name 45 | ; 46 | 47 | namelist ::= Name {',' Name} ; 48 | 49 | explist ::= {exp ','} exp ; 50 | 51 | exp ::= 'nil' 52 | | 'false' 53 | | 'true' 54 | | Numeral 55 | | LiteralString 56 | | '...' 
57 | | functiondef 58 | | prefixexp 59 | | tableconstructor 60 | | exp binop exp 61 | | unop exp 62 | ; 63 | 64 | prefixexp ::= var 65 | | functioncall 66 | | '(' exp ')' 67 | ; 68 | 69 | functioncall ::= prefixexp args 70 | | prefixexp ':' Name args 71 | ; 72 | 73 | args ::= '(' [explist] ')' 74 | | tableconstructor 75 | | LiteralString 76 | ; 77 | 78 | functiondef ::= 'function' funcbody ; 79 | 80 | funcbody ::= '(' [parlist] ')' block 'end' ; 81 | 82 | parlist ::= namelist [',' '...'] 83 | | '...' 84 | ; 85 | 86 | tableconstructor ::= '{' [fieldlist] '}' ; 87 | 88 | fieldlist ::= field {fieldsep field} [fieldsep] ; 89 | 90 | field ::= '[' exp ']' '=' exp 91 | | Name '=' exp 92 | | exp 93 | ; 94 | 95 | fieldsep ::= ',' 96 | | ';' 97 | ; 98 | 99 | binop ::= add 100 | | sub 101 | | mul 102 | | div 103 | | pow 104 | | mod 105 | | concat 106 | | lt 107 | | le 108 | | gt 109 | | ge 110 | | eq 111 | | ne 112 | | and 113 | | or 114 | ; 115 | 116 | add ::= '+' ; 117 | sub ::= '-' ; 118 | mul ::= '*' ; 119 | div ::= '/' ; 120 | pow ::= '^' ; 121 | mod ::= '%' ; 122 | concat ::= '..' ; 123 | lt ::= '<' ; 124 | le ::= '<=' ; 125 | gt ::= '>' ; 126 | ge ::= '>=' ; 127 | eq ::= '==' ; 128 | ne ::= '~=' ; 129 | and ::= 'and' ; 130 | or ::= 'or' ; 131 | 132 | unop ::= unm 133 | | not 134 | | len 135 | ; 136 | 137 | unm ::= '-' ; 138 | not ::= 'not' ; 139 | len ::= '#' ; 140 | 141 | -- Name ::= ... how to define valid names ... 142 | -- Numeral ::= ... how to define numerals ... 143 | -- LiteralString ::= how to define literal strings ... 144 | -------------------------------------------------------------------------------- /syntax_grammar.txt: -------------------------------------------------------------------------------- 1 | ... 
what other projects are using this parser anyways: 2 | 3 | ./netrefl/netfield_vec.lua:local ast = require 'parser.lua.ast' 4 | ./lua-to-batch/lua_to_batch.lua:local ast = require 'parser.lua.ast' 5 | ./vec/create.lua:local ast = require 'parser.lua.ast' 6 | ./local-default/local-default.lua 7 | 8 | ./sand-attack/verify-demo.lua:local parser = require 'parser' 9 | ./dumpworld-from-2020/convert-mario-maps.lua:local parser = require 'parser' 10 | ./lua-to-batch/lua_to_batch.lua:local parser = require 'parser' 11 | ./zeta2d/convert-mario-maps.lua:local parser = require 'parser' 12 | 13 | 14 | 15 | simplest case for a grammar of grammers: 16 | 17 | rules ::= rule { ';' rule } 18 | rule ::= name '::=' expr_or ; 19 | expr_or ::= expr_list {'|' expr_list} ; 20 | expr_list ::= 21 | '{' expr_or '}' 22 | | '[' expr_or ']' 23 | | Name 24 | | Numeral 25 | | LiteralString 26 | ; 27 | 28 | ... how to also include named-captures into the grammar? 29 | 30 | rules ::= rule { ';' rule } 31 | rule ::= name=name '::=' expr=expr_or ; 32 | expr_or ::= exprs=(expr_list {'|' expr_list}); 33 | expr_list ::= 34 | type=multiple '{' expr_or '}' 35 | | type=optional '[' expr_or ']' 36 | | Name 37 | | Numeral 38 | | LiteralString 39 | ; 40 | 41 | ... which would then make the grammar more complex: 42 | Using the new rules: 43 | field=token to capture and assign a single token to field 'field' 44 | field=(token tokens...) to capture and assign multiple tokens 45 | type=whatever to specify that, for this particular '|' branch, the AST node type 46 | ... maybe instead of type=, use some other syntax, to not collide with the field= syntax 47 | ... and maybe somehow syntax to distinguish when we want to capture tokens 48 | like maybe a * means "don't capture token" 49 | or maybe simply no field= means no capture 50 | though field=( ... 
) means capture a list, and from there we might want to specify what in the list we don't want to capture 51 | 52 | 53 | What if I modeled the grammar grammar after the parser I already wrote, instead of after the grammars I wrote it after? 54 | How about `*` suffix means "don't trap as a distinct AST, forward back one level instead" 55 | And `name=` means "assign this to a named field" ? 56 | or should I even use named fields? 57 | More flexible for tree traversal if I don't ... 58 | ... and then per-class I could have member functions that return named versions of different fields, 59 | or even __index alias's? 60 | 61 | -- type=block, {stat} [return] will be assigned to self[i] as per default behavior 62 | block ::= { stat } [return] ; 63 | 64 | -- type=return, 65 | -- currently self.exprs[i] = unpack(explist) 66 | -- but maybe I should change from self.exprs[i] to self[i] ? 67 | return ::= 'return' [explist] ; 68 | 69 | -- * after rule name means forward/unpack: don't build a 'stat' node, just forward it back into block. 70 | -- ...or should only the rule-references have *'s for unpacking? 71 | -- But doing so with named fields is ambiguous ... more of an argument to get rid of all named fields. 72 | -- Should the * go on the rule or on the reference-to-rule? 73 | stat* ::= 74 | 'local' local 75 | | 'function' functionstmt 76 | ; 77 | 78 | -- type=local 79 | local ::= 'function' localfunction 80 | | localassign 81 | ; 82 | 83 | -- type=function 84 | -- in my current implementation, but maybe it's a bad idea to depend on 2 levels of AST to determine a local function vs a global function ? 85 | localfunction ::= Name funcbody ; 86 | 87 | -- type=assign 88 | -- but in my implementation I use 'assign' in a lot of places, and for this particular it is a local(assign(...)) 89 | localassign ::= namelist ['=' explist] ; 90 | 91 | -- forward, rename type to 'function' (tho we're gonna see 'function' elsewhere) ... 
92 | functionstmt* ::= funcname funcbody ; 93 | 94 | -- forward ... 95 | funcname* ::= Name {'.' Name} [':' Name] ; 96 | 97 | -- :funcbody() in my code returns a table 98 | -- whose first argument is the 'parlist' rule locally named 'args' 99 | -- and whose arguments 2...n are the statements in 'block' 100 | funcbody ::= '(' [parlist] ')' block 'end' ; 101 | 102 | -- parlist returns a table of type=var wrapping the arg name, or type=vararg 103 | parlist ::= namelist [',' '...'] | '...' ; 104 | 105 | 106 | TODO 107 | - how does specifying rule class hierarchy work? things like how 'true' and 'false' literal ast node are subclasses of 'boolean' ast node 108 | - also fix args, use indexes whenever possible, use 1:1 with AST grammar whenver possible, justify flattening whenever possible, use aliases 109 | - merge :serialize() and :traverse() 110 | - do something about implicit keywords/symbols that are read but not saved (save them? keep track of where they are?) 111 | - auto grammar -> ast node class generation 112 | - auto grammar -> parser code generation 113 | - move all this stuff into base/ast.lua 114 | 115 | 116 | -- symbols/keywords aren't captured, so ';' isn't captured 117 | -- so all the `rule` objs get put into an `ast._rule`, integer-indexed 118 | rules ::= rule { ';' rule } ; 119 | -- mind you regenerating code with optional elements means deciding where to re-insert them 120 | -- so regenerating the code means either save all the tokens, or it means ... idk what other options ... 
121 | rules ::= rule { ';'* rule } ; 122 | -- maybe I should denote them optional with a * suffix or something, and then keep two lists: one of read tokens (for regeneration) and another of indexed tokens via labels or () for capturing or something 123 | rules ::= rules+=rule { ';' rules+=rule } ; 124 | 125 | -- `name=name` means to alias the first capture as 'name' 126 | -- symbols/keywords aren't captured, so '::=' isn't captured 127 | rule ::= name=name '::='* expr_or ; 128 | 129 | expr_or ::= expr_list {'|' expr_list} ; 130 | expr_list ::= 131 | '(' expr_or ')' -- parenthesis mean capture as a separate subtable (otherwise all captured expressions go into [i]) 132 | | '{' expr_or '}' -- means multiple 133 | | '[' expr_or ']' -- means optional 134 | | Name 135 | | Numeral 136 | | LiteralString 137 | ; 138 | 139 | 140 | -------------------------------------------------------------------------------- /tests/flatten.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env lua 2 | 3 | local tolua = require 'ext.tolua' 4 | local parser = require 'parser' 5 | 6 | local gcode = [[function g() return print'hi' end]] 7 | local fcode = [[return function() g() end]] 8 | local code = gcode..'\n'..fcode 9 | 10 | print('original code') 11 | print(code) 12 | print() 13 | 14 | local ftree = parser.parse(fcode) 15 | print('f code') 16 | print(tolua(ftree)) 17 | print('f ast code (should match original code)') 18 | print(ftree:toLua()) 19 | print() 20 | 21 | local gtree = parser.parse(gcode) 22 | print('g code') 23 | print(tolua(gtree)) 24 | print('g ast code') 25 | print(gtree:toLua()) 26 | print() 27 | 28 | local fflat = ftree:flatten{ 29 | g = table.unpack(gtree), -- TODO gtree:find'g' to look for global-level definitions? 
30 | } 31 | print('flattened f ast') 32 | print(tolua(fflat)) 33 | print('flattened f code') 34 | print(fflat:toLua()) 35 | -------------------------------------------------------------------------------- /tests/lua_to_c.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env lua 2 | local parser = require 'parser' 3 | local ast = require 'parser.lua.ast' 4 | local path = require 'ext.path' 5 | local assert = require 'ext.assert' 6 | local table = require 'ext.table' 7 | 8 | local requires = table() 9 | local cobjtype = 'Object' 10 | 11 | local cppReservedWord = { 12 | 'class', 13 | } 14 | 15 | local tabs = -1 -- because everything is in one block 16 | function tab() 17 | return ('\t'):rep(tabs) 18 | end 19 | function tabblock(t, consume) 20 | tabs = tabs + 1 21 | for i,ti in ipairs(t) do 22 | consume(tab()) 23 | consume(ti) 24 | if i < #t then consume'\n' end 25 | end 26 | tabs = tabs - 1 27 | end 28 | 29 | for k,cl in pairs(ast) do 30 | if ast.node:isa(cl) then 31 | function cl:toC() 32 | local s = '' 33 | local consume 34 | consume = function(x) 35 | if type(x) == 'number' then 36 | x = tostring(x) 37 | end 38 | if type(x) == 'string' then 39 | s = s .. x 40 | elseif type(x) == 'table' then 41 | assert.is(x, ast.node) 42 | assert.index(x, 'toC_recursive') 43 | x:toC_recursive(consume) 44 | else 45 | error('here with unknown type '..type(x)) 46 | end 47 | end 48 | self:toC_recursive(consume) 49 | return s 50 | end 51 | -- weakness to this design ...i need to always keep specifying the above toC() wrapper, or I have to make a seprate member function... 
52 | function cl:toC_recursive(consume) 53 | self:serialize(consume) 54 | end 55 | end 56 | end 57 | 58 | 59 | -- make lua output the default for nodes' c outputw 60 | for _,info in ipairs{ 61 | {'concat','+'}, 62 | {'and','&&'}, 63 | {'or','||'}, 64 | {'ne','!='}, 65 | } do 66 | local name, op = table.unpack(info) 67 | -- hmm, can I override serialize but only for specific consume()'s ? 68 | -- I guess if I want to test consume == my new custom one vs otherwise call super ... 69 | ast['_'..name].toC_recursive = function(self, consume) 70 | for i,x in ipairs(self) do 71 | consume(x) 72 | if i < #self then 73 | consume' ' 74 | consume(op) 75 | consume ' ' 76 | end 77 | end 78 | end 79 | end 80 | function ast._not:toC_recursive(consume) 81 | consume'!' 82 | consume(self[1]) 83 | end 84 | function ast._len:toC_recursive(consume) 85 | consume(self[1]) 86 | consume'.size()' 87 | end 88 | function ast._assign:toC_recursive(consume) 89 | for i=1,#self.vars do 90 | if self.exprs[i] then 91 | consume(self.vars[i]) 92 | consume' = ' 93 | consume(self.exprs[i]) 94 | else 95 | consume(self.vars[i]) 96 | end 97 | if i < #self.vars then consume', ' end 98 | end 99 | end 100 | function ast._block:toC_recursive(consume) 101 | tabblock(self, consume) 102 | end 103 | function ast._call:toC_recursive(consume) 104 | consume(self.func) 105 | consume'(' 106 | for i,x in ipairs(self.args) do 107 | consume(x) 108 | if i < #self.args then consume', ' end 109 | end 110 | consume')' 111 | if self.func.name == 'require' then 112 | if self.args[1].type == 'string' then 113 | -- ok here we add the require file based on our lua path 114 | -- does this mean we need to declare the lua path up front to lua_to_c? 
115 | 			requires:insert(self.args[1].value)
116 | 		else
117 | 			consume'\n#error require arg not a string'
118 | 		end
119 | 	end
120 | end
121 | -- numeric for: Lua 'for v=min,max[,step]' includes max, so the C condition must be <=
122 | function ast._foreq:toC_recursive(consume)
123 | 	consume'for ('
124 | 	consume(cobjtype)
125 | 	consume' '
126 | 	consume(self.var)
127 | 	consume' = '
128 | 	consume(self.min)
129 | 	consume'; '
130 | 	consume(self.var)
131 | 	consume' <= '	-- was ' < ': off-by-one vs Lua's inclusive loop limit
132 | 	consume(self.max)
133 | 	consume'; '
134 | 	if self.step then
135 | 		consume(self.var)
136 | 		consume' += '
137 | 		consume(self.step)
138 | 	else
139 | 		consume'++'	-- default step of 1
140 | 		consume(self.var)
141 | 	end
142 | 	consume') {\n'
143 | 	tabblock(self, consume)
144 | 	consume(tab())
145 | 	consume'}'
146 | end
147 | function ast._forin:toC_recursive(consume)
148 | 	consume'for ('
149 | 	for i,v in ipairs(self.vars) do
150 | 		consume(v)
151 | 		if i < #self.vars then consume', ' end
152 | 	end
153 | 	consume' in '
154 | 	for i,v in ipairs(self.iterexprs) do
155 | 		consume(v)
156 | 		if i < #self.iterexprs then consume', ' end
157 | 	end
158 | 	consume') {\n'
159 | 	tabblock(self, consume)
160 | 	consume(tab())
161 | 	consume'}'
162 | end
163 | function ast._function:toC_recursive(consume)
164 | 	if self.name then
165 | 		-- global-scope def?
166 | 		--return cobjtype..' '..self.name..'('..table(self.args):mapi(function(arg) return cobjtype..' '..apply(arg) end):concat', '..') {\n' .. tabblock(self, apply) .. tab() .. '}'
167 | 		-- local-scope named function def ...
168 | 		consume(cobjtype)
169 | 		consume' '
170 | 		consume(self.name)
171 | 		consume' = []('
172 | 		for i,arg in ipairs(self.args) do
173 | 			consume(cobjtype)
174 | 			consume' '
175 | 			consume(arg)
176 | 			if i < #self.args then consume', ' end
177 | 		end
178 | 		consume') {\n'
179 | 		tabblock(self, consume)
180 | 		consume(tab())
181 | 		consume'}'
182 | 	else
183 | 		-- lambdas?
183 | 		consume'[]('
184 | 		for i,arg in ipairs(self.args) do
185 | 			consume(cobjtype)
186 | 			consume' '
187 | 			consume(arg)
188 | 			if i < #self.args then consume', ' end
189 | 		end
190 | 		consume') {\n'
191 | 		tabblock(self, consume)
192 | 		consume(tab())	-- was 'consuem(tab())': undefined global, errored on every anonymous function
193 | 		consume'}'
194 | 	end
195 | end
196 | function ast._if:toC_recursive(consume)
197 | 	consume'if ('
198 | 	consume(self.cond)
199 | 	consume') {\n'
200 | 	tabblock(self, consume)
201 | 	consume(tab()..'}')
202 | 	for _,ei in ipairs(self.elseifs) do
203 | 		consume(ei)
204 | 	end
205 | 	if self.elsestmt then consume(self.elsestmt) end
206 | end
207 | function ast._elseif:toC_recursive(consume)
208 | 	consume' else if ('
209 | 	consume(self.cond)
210 | 	consume') {\n'
211 | 	tabblock(self, consume)
212 | 	consume(tab())
213 | 	consume'}'
214 | end
215 | function ast._else:toC_recursive(consume)
216 | 	consume' else {\n'
217 | 	tabblock(self, consume)
218 | 	consume(tab())
219 | 	consume'}'
220 | end
221 | function ast._index:toC_recursive(consume)
222 | 	consume(self.expr)
223 | 	consume'['
224 | 	consume(self.key)
225 | 	consume']'
226 | end
227 | function ast._indexself:toC_recursive(consume)
228 | 	consume(self.expr)
229 | 	consume'.'
230 | 	consume(self.key)
231 | end
232 | function ast._local:toC_recursive(consume)
233 | 	if self.exprs[1].type == 'function' or self.exprs[1].type == 'assign' then
234 | 		-- if exprs[1] is a multi-assign then a 'cobjtype' needs to prefix each new declaration
235 | 		consume(cobjtype)
236 | 		consume' '
237 | 		consume(self.exprs[1])
238 | 	else
239 | 		for i=1,#self.exprs do
240 | 			consume(cobjtype)
241 | 			consume' '
242 | 			consume(self.exprs[i])
243 | 			if i < #self.exprs then consume'\n' end
244 | 		end
245 | 	end
246 | end
247 | function ast._vararg:toC_recursive(consume)
248 | 	consume'reserved_vararg' -- reserved name?
249 | end
250 | function ast._var:toC_recursive(consume)
251 | 	if cppReservedWord[self.name] then
252 | 		consume('cppreserved_' ..
self.name) 253 | else 254 | consume(self.name) 255 | end 256 | end 257 | 258 | 259 | local function addtab(s) 260 | return '\t'..(s:gsub('\n', '\n\t')) -- tab 261 | end 262 | 263 | -- also populates requires() 264 | local function luaFileToCpp(fn) 265 | assert(fn, "expected filename") 266 | local luacode = assert(path(fn):exists(), "failed to find "..tostring(fn)) 267 | local luacode = assert(path(fn):read(), "failed to find "..tostring(fn)) 268 | local tree = parser.parse(luacode) 269 | local cppcode = tree:toC() 270 | cppcode = '//file: '..fn..'\n'..cppcode 271 | cppcode = addtab(cppcode) 272 | return cppcode 273 | end 274 | 275 | 276 | 277 | print[[ 278 | 279 | #include "CxxAsLua/Object.h" 280 | using namespace CxxAsLua; 281 | 282 | // how to handle _G ... 283 | // esp wrt locals ... 284 | // if we use _G then that incurs overhead ... 285 | Object _G; 286 | 287 | // for global calls ... 288 | Object error; 289 | Object type; 290 | Object require; 291 | Object table; 292 | 293 | int main(int argc, char** argv) { 294 | _G = Object::Map(); 295 | _G["package"] = Object::Map(); 296 | _G["package"]["loaded"] = Object::Map(); 297 | 298 | error = _G["error"] = [](Object x) -> Object { 299 | throw std::runtime_error((std::string)x); 300 | }; 301 | 302 | //hmm, 'type' might be used as a global later, so i might have to remove the 'using namespace' and instead replace all Object's with Object::Object's 303 | ::type = _G["type"] = [](Object x) -> Object { 304 | if (x.is_nil()) { 305 | return "nil"; 306 | } else if (x.is_string()) { 307 | return "string"; 308 | } else if (x.is_table()) { 309 | return "table"; 310 | } else if (x.is_boolean()) { 311 | return "boolean"; 312 | } else if (x.is_function()) { 313 | return "function"; 314 | } else if (x.is_nil()) { 315 | return "nil"; 316 | } 317 | //or use getTypeIndex() 318 | // or better yet, rewrite our x.details to be a std::variant, 319 | // and map the variant index to a type, 320 | // then just store type info in that extra 
arra 321 | }; 322 | 323 | table = _G["table"] = Object::Map(); 324 | 325 | table["concat"] = [](VarArg arg) -> Object { 326 | if (!arg[1].is_table()) error("expected a table"); 327 | //TODO FINISHME 328 | // list, sep, i 329 | std::ostringstream s; 330 | std::string sep = ""; 331 | for (const Object& o : arg.objects) { 332 | std::cout << sep; 333 | std::cout << o; 334 | sep = "\t"; 335 | } 336 | std::cout << std::endl; 337 | }; 338 | 339 | require = _G["require"] = [&](std::string const & s) -> Object { 340 | Object x = _G["package"]["loaded"][s]; 341 | if (x != nil) return x; 342 | 343 | x = _G["cppmodules"][s]; 344 | if (x != nil) { 345 | x = x(); 346 | _G["package"]["loaded"][s] = x; 347 | return x; 348 | } 349 | 350 | return error(Object("idk how to load ") + s); 351 | }; 352 | 353 | _G["cppmodules"] = Object::Map(); 354 | ]] 355 | 356 | local cppcode = luaFileToCpp(... or 'lua_to_c_test.lua') 357 | 358 | for _,req in ipairs(requires) do 359 | -- ok here's where lua_to_c has to assume the same LUA_PATH as the c++ runtime 360 | print('//require: '..req) 361 | local fn = package.searchpath(req, package.path) 362 | if not fn then 363 | print("// package.searchpath couldn't find file") 364 | else 365 | print([[ 366 | _G["cppmodules"]["]]..req..[["] = []() -> Object { 367 | ]]) 368 | print(addtab(luaFileToCpp(fn))) 369 | 370 | print[[ 371 | }; 372 | ]] 373 | end 374 | end 375 | 376 | print(cppcode) 377 | 378 | print[[ 379 | } 380 | ]] 381 | -------------------------------------------------------------------------------- /tests/lua_to_c_test.lua: -------------------------------------------------------------------------------- 1 | print'hello' 2 | -------------------------------------------------------------------------------- /tests/parse.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env lua 2 | local path = require 'ext.path' 3 | local parser = require 'parser' 4 | local tree = 
assert(parser.parse(path(assert(..., "expected filename")):read())) 5 | print(tree:toLua()) 6 | -------------------------------------------------------------------------------- /tests/parsemyself.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env lua 2 | local path = require 'ext.path' 3 | local os = require 'ext.os' 4 | local LuaParser = require 'parser' 5 | 6 | -- TODO would be nice to remember who is executing you ... lua vs luajit vs whatever ... 7 | local lua = 'lua' 8 | 9 | local inceptionLevel = ... or 1 10 | inceptionLevel = assert(tonumber(inceptionLevel), "expected number") 11 | if inceptionLevel > 5 then 12 | print('nobody can survive beyond 5 inception levels') 13 | return 14 | end 15 | 16 | local dstpath = path'inception' 17 | dstpath = dstpath:abs() 18 | dstpath:mkdir() 19 | 20 | -- now parse and output a new Lua path in the dst folder ... 21 | local function rewrite(src, dst) 22 | print(src..' => '..dst) 23 | dst:getdir():mkdir(true) 24 | assert(dst:write(LuaParser.parse((assert(src:read()))):toLua())) 25 | end 26 | 27 | -- find all lua files? search the rockspec? 28 | local srcpath = path'../..' 
29 | for _,info in ipairs{ 30 | -- [[ if you want to parse *everything* and not just the parser tree 31 | {dir='ext', files={'assert.lua', 'class.lua', 'cmdline.lua', 'coroutine.lua', 'ctypes.lua', 'debug.lua', 'detect_ffi.lua', 'detect_lfs.lua', 'detect_os.lua', 'env.lua', 'ext.lua', 'fromlua.lua', 'gcmem.lua', 'io.lua', 'load.lua', 'math.lua', 'meta.lua', 'number.lua', 'op.lua', 'os.lua', 'path.lua', 'range.lua', 'reload.lua', 'require.lua', 'string.lua', 'table.lua', 'timer.lua', 'tolua.lua', 'xpcall.lua'}}, 32 | --]] 33 | {dir='parser', files={'parser.lua', 'load_xform.lua'}}, 34 | {dir='parser/base', files={'ast.lua', 'datareader.lua', 'parser.lua', 'tokenizer.lua'}}, 35 | {dir='parser/lua', files={'ast.lua', 'parser.lua', 'tokenizer.lua'}}, 36 | {dir='parser/grammar', files={'parser.lua', 'tokenizer.lua'}}, 37 | {dir='parser/tests', files={'flatten.lua', 'lua_to_c.lua', 'lua_to_c_test.lua', 'validate.lua', 'parse.lua', 'parsemyself.lua', 'spantest.lua'}}, 38 | } do 39 | for _,fn in ipairs(info.files) do 40 | rewrite(srcpath/info.dir/fn, dstpath/info.dir/fn) 41 | end 42 | end 43 | 44 | -- then chdir and run it again 45 | dstpath'parser/tests':cd() 46 | os.exec( 47 | -- [[ if you want to only use reparsed content for the second parse ... 48 | 'LUA_PATH="'..dstpath..'/?.lua;'..dstpath..'/?/?.lua" && '.. 49 | --]] 50 | lua..' 
parsemyself.lua '..(inceptionLevel+1)) 51 | -------------------------------------------------------------------------------- /tests/spantest.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env lua 2 | require 'ext' 3 | local LuaParser = require 'parser.lua.parser' 4 | 5 | --[=[ 6 | local code = [[ 7 | local result = aa and bb 8 | x = 1 9 | y = 2 10 | z = x + y 11 | function h() 12 | print'hello world' 13 | return 42 14 | end 15 | ]] 16 | --]=] 17 | -- [=[ 18 | local code = [[ 19 | function f() end 20 | function g() end 21 | function h() end 22 | ]] 23 | --]=] 24 | --[[ 25 | local code = path'../lua/parser.lua':read() 26 | --]] 27 | local parser = LuaParser(code, code) 28 | 29 | local tree = parser.tree 30 | local datareader = parser.t.r 31 | 32 | -- TODO this but for every test in minify_tests.txt 33 | -- then verify the :lua() serialized results match the source results 34 | local function printspan(x, tab) 35 | tab = tab or '' 36 | if x.type then 37 | local reconstructed = x:toLua() 38 | print(tab..'tostring():', string.trim(reconstructed)) 39 | local fromIndexSpan = code:sub(x.span.from.index, x.span.to.index) 40 | print(tab..'span substr:', tolua(fromIndexSpan)) 41 | local fromTokenSpan = datareader.tokenhistory:sub(x.span.from.tokenIndex, x.span.to.tokenIndex):concat() 42 | print(tab..'token range: '..x.span.from.tokenIndex..', '..x.span.to.tokenIndex) 43 | print(tab..'token substr:', tolua(fromTokenSpan)) 44 | print(tab..'type:', x.type) 45 | 46 | --[[ 47 | local reconstructedCode = load(reconstructed):dump() 48 | local fromIndexSpanCode = load(fromIndexSpan):dump() 49 | local fromTokenSpanCode = load(fromTokenSpan):dump() 50 | assert.eq(reconstructedCode:hexdump(), fromIndexSpanCode:hexdump()) 51 | assert.eq(reconstructedCode:hexdump(), fromTokenSpanCode:hexdump()) 52 | --]] 53 | --[[ 54 | local function reduceString(s) 55 | -- remove comments too, those will be in tokenSpan text 56 | s = 
s:gsub('%-%-[^\n]*', '') 57 | repeat 58 | local start1, start2 = s:find('%-%-%[=*%[') 59 | if not start1 then break end 60 | local eq = s:sub(start1+3, start2-1) 61 | assert(eq:match'^=*$') 62 | local finish1, finish2 = s:find('%]'..eq..'%]', start2) 63 | if not finish1 then break end 64 | s = s:sub(1, start1-1)..s:sub(finish2+1) 65 | until false 66 | s = s:gsub('%s+', ''):gsub('["\']', "'") 67 | return s 68 | end 69 | reconstructed = reduceString(reconstructed) 70 | fromIndexSpan = reduceString(fromIndexSpan) 71 | fromTokenSpan = reduceString(fromTokenSpan) 72 | assert.eq(reconstructed, fromIndexSpan) 73 | assert.eq(reconstructed, fromTokenSpan) 74 | --]] 75 | end 76 | for k,v in pairs(x) do 77 | if k == 'span' then 78 | print(tab..k..' = index range '..tostring(v.from.index)..'..'..tostring(v.to.index) 79 | ..', line/col range '..v.from.line..'/'..v.from.col..'..'..v.to.line..'/'..v.to.col) 80 | elseif k ~= 'parent' 81 | and k ~= 'span' 82 | and k ~= 'parser' 83 | then 84 | if type(v) == 'table' then 85 | print(tab..k) 86 | printspan(v, tab..' ') 87 | else 88 | print(tab..k..' = '..(v.toLua and v:toLua() or tostring(v)))--tolua(v)) 89 | end 90 | end 91 | end 92 | end 93 | 94 | printspan(tree) 95 | -------------------------------------------------------------------------------- /tests/strings.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env luajit 2 | local assert = require 'ext.assert' 3 | local Parser = require 'parser' 4 | 5 | local function test(codein, eq) 6 | local codeout = ''..Parser.parse(codein) 7 | print(codein, codeout) 8 | local s = assert(load(codeout))() -- evaluate it ... 
9 | assert.eq(s, eq) -- assert it's correct 10 | end 11 | 12 | -- parse dec escape code, since 5.1 13 | if _VERSION >= 'Lua 5.1' then 14 | test([[return '\97']], 'a') 15 | end 16 | 17 | -- parse hex escape code, since 5.2 18 | if _VERSION >= 'Lua 5.2' then 19 | test([[return '\x62']], 'b') -- don't test same as before, in case false positives 20 | test([[return '\x7a']], 'z') -- make sure to test hex chars 21 | end 22 | 23 | -- parse unicode, since 5.3 24 | if _VERSION >= 'Lua 5.3' then 25 | test([[return '\u{2200}']], '∀') 26 | test([[return '\u{2a01}']], '⨁') 27 | end 28 | -------------------------------------------------------------------------------- /tests/validate.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env lua 2 | --[[ 3 | './validate.lua' = runs through the validation tests and verifies that the parser produces the correct output for the currently-running version of lua 4 | './validate.lua all' = to test all versions at once (provided you have all built and with their names matching etc ...) 5 | './validate.lua makekey' = to regenerate the key to stdout 6 | --]] 7 | require 'ext' 8 | local luas = table{ 9 | 'lua5.0', -- lua 5.0.3 10 | 'lua5.1', -- lua 5.1.5 11 | 'lua5.2', -- lua 5.2.4 with LUA_COMPAT_ALL enabled 12 | 'lua5.3', -- lua 5.3.6 with LUA_COMPAT_5_2 enabled 13 | 'lua5.4', -- lua 5.4.7 with LUA_COMPAT_5_3 enabled 14 | 'luajit', -- luajit 2.1.x ... I think openresty variant ... with LUAJIT_ENABLE_LUA52COMPAT enabled 15 | -- (TODO luajit 2.0x vs 2.1x, vanilla vs openresty) 16 | } 17 | local tmp = path'tmp.lua' 18 | local lines = assert(path'validate-key.txt':read()):trim():split'\n' 19 | local trimmedlines = lines:mapi(function(line) -- trim comments 20 | return (line:match'^(.-)%-%-.*$' or line):trim() 21 | end) 22 | local maxline = trimmedlines:mapi(function(line) return #line end):sup() 23 | 24 | -- which to test? current version or all? 
25 | local testluas
26 | if cmdline.all then
27 | 	testluas = table(luas)
28 | else
29 | 	local version = _VERSION:match'^Lua (.*)$'
30 | 	--if version == '5.1' and jit then version = '5.2' end -- TODO more on luajit versions and COMPAT* builds and parser feature detection ...
31 | 	if jit then version = 'jit' end
32 | 	testluas = table{'lua'..version}
33 | end
34 |
35 | for i,line in ipairs(lines) do
36 |
37 | -- [[ if we're making the key ...
38 | 	if cmdline.makekey then
39 | 		-- TODO more comprehensive on with/without COMPAT flags enabled
40 | 		local verstats = {}
41 | 		for _,lua in ipairs(luas) do
42 |
43 | 			tmp:write(line)
44 | 			local results = table.pack(os.execute(lua..' -e "assert(loadfile\''..tmp..'\')" > /dev/null 2>&1')) -- load, don't run
45 | 			local luaSuccess = not not results[1]
46 |
47 | 			if not luaSuccess and results[2] == 'signal' and results[3] ~= 1 then break end -- detect ctrl+c instead of syntax error ... this is not always picking it up
48 |
49 | 			--print()
50 | 			--print(results:unpack())
51 | 			verstats[lua] = luaSuccess
52 | 			-- [[ check my old key for bugs/changes
53 | 			local version = lua:match'^lua(%d%.%d)$'	-- anchor moved outside the capture; inside '(...)' the '$' was a literal character and never matched
54 | 			if version then
55 | 				local expected = not line:find('FAIL_'..version, 1, true)	-- plain find: 'not m .. v' parsed as '(not m)..v' and concatenated a boolean; also '.' in version is a pattern magic char
56 | 				assert.eq(expected, luaSuccess)	-- was 'result', an undefined global
57 | 			end
58 | 			--]]
59 | 		end
60 | 		local line = trimmedlines[i] -- don't need comments so use the comment-less version
61 | 		print(line..(' '):rep(maxline - #line + 10)..'--\t'..luas:mapi(function(lua)
62 | 			return lua..'='..tostring(verstats[lua] and 1 or 0)
63 | 		end):concat'\t')
64 |
65 | 	else
66 | --]]
67 | -- [[ if we're testing the parser ...
68 | 		for _,testlua in ipairs(testluas) do
69 | 			-- TODO remove the 'lua' prefix and TODO make sure this is compat with whatever the parser version input is ...")
70 |
71 | 			-- determine 'version' to pass to the parser
72 | 			-- TODO more on luajit versions and COMPAT* builds and parser feature detection ...
73 | local version = testlua:match'^lua(.*)$' 74 | if version == '5.1' and jit then version = '5.2' end 75 | 76 | local keySuccess = assert( 77 | -- TODO if we don't have it then ... regenerate it from the bin ... ? and maybe even re-write it out? 78 | line:match('lua'..version..'=(%d)'), "couldn't find lua version "..version 79 | ) ~= '0' 80 | 81 | local luaSuccess 82 | if cmdline.all then 83 | tmp:write(line) 84 | local results = table.pack(os.execute(testlua..' -e "assert(loadfile\''..tmp..'\')" > /dev/null 2>&1')) -- load, don't run 85 | luaSuccess = not not results[1] 86 | else 87 | luaSuccess = not not (loadstring or load)(line) 88 | end 89 | 90 | local LuaParser = require 'parser.lua.parser' 91 | -- mannnn between parser.parse, Parser:init, Parser:setData, and parser/base/parser and parser/lua/parser, I need to clean up these function signatures 92 | local parser = LuaParser(nil, version, nil, testlua == 'luajit') 93 | local parseSuccess, errorString = parser:setData(line) 94 | parseSuccess = not not parseSuccess 95 | print('key results', keySuccess, 'parser results', parseSuccess, 'lua results', luaSuccess, 'line', line, 'version', version) 96 | if keySuccess ~= parseSuccess or parseSuccess ~= luaSuccess then 97 | error("parser failed to recreate same results. error="..errorString) 98 | end 99 | end 100 | end 101 | --]] 102 | end 103 | tmp:remove() 104 | --------------------------------------------------------------------------------