├── README.md
└── plugins
    ├── SmartJump.py
    └── SmartJump
        ├── __init__.py
        ├── lex.py
        └── yacc.py

/README.md:
--------------------------------------------------------------------------------
1 | IDA Pro plugin to enhance the JumpAsk 'g' command
2 | 
3 | 
4 | # Installation
5 | 
6 | Copy the contents of the `plugins` folder into your `IDA_DIR/plugins` folder.
7 | Edit `IDA_DIR\cfg\idagui.cfg` so that the line that by default reads:
8 | `"JumpAsk" = 'g'`
9 | instead reads:
10 | `"JumpAsk" = 0`
11 | You can append the text:
12 | `// 'g'`
13 | to the line to give a full entry of:
14 | `"JumpAsk" = 0 // 'g'`
15 | if you do not want to have to remember what the default value was.
16 | 
17 | # Usage
18 | 
19 | SmartJump is designed to improve the `g` keyboard shortcut in IDA, especially when using IDA to debug binaries. It allows a user to perform basic arithmetic operations (`-`, `+`, `/`, `*`) on values and labels in the JumpAsk window.
20 | In addition, it allows a user to use the symbols `[` and `]` to dereference memory addresses and jump to the value stored at the address.
21 | 
22 | The supported types of values that can be used in the JumpAsk window are:
23 | `here` and `here()` - these resolve to the current result of `idc.here()`
24 | `main` and `sub_123456` - you can still jump by names/labels in the binary
25 | `12ab34` and `0x12ab34` - all numbers are interpreted as hexadecimal and may optionally be preceded by `0x`. If a global name also matches a hexadecimal number pattern then the global name takes precedence
26 | `eax`, `ebx`, ... , `rax`, ... - 32-bit and 64-bit registers for x86 and x64 are supported. Using `eax` on an x64 binary will mask off the upper bits of `rax` and return the lower 32 bits
27 | 
28 | All of these values can be used in combination; a brief but non-exhaustive list of examples is below:
29 | `[eax]` - Grabs the current value in eax, goes to that memory location and attempts to read a 32-bit pointer in IDA32 or a 64-bit pointer in IDA64. If the result is a valid address then it jumps there
30 | `[here] + rsp` - Grabs the value stored at the current address, adds it to the 64-bit stack pointer and then jumps to the resulting stack location
31 | `[[ebx]*4]+[edx]` - You are starting to get the picture; you can do any jump that resolves to an address in IDA...
32 | 
33 | You can use the symbols `(` and `)` to explicitly group operations together rather than relying on the precedence assigned to the operators.
34 | E.g.
35 | `0x1200 * ([ebx] + here)` - This will grab the contents of ebx, add the current address to it, multiply the result by 0x1200 and try to jump to the result
36 | 
37 | You do not have to match `[` and `]` symbols:
38 | `[[[eax` - This will automatically have enough `]` symbols appended to the end of the query to match the opening `[` symbols
39 | The final result would be a triple dereference of eax - `[[[eax]]]`
40 | The bracket completion currently only supports the `[` symbol and will not match `(` symbols. All automatically added brackets are appended to the end of the input expression.
41 | 
--------------------------------------------------------------------------------
/plugins/SmartJump.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright 2023 PwCIL
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ''' 16 | 17 | import idc, idaapi 18 | import SmartJump.lex as lex 19 | import SmartJump.yacc as yacc 20 | import re 21 | 22 | #utilities 23 | arch_size = 32 24 | if idaapi.idainfo_is_64bit(): 25 | arch_size = 64 26 | 27 | #support x86 and x64 so check which pointers we will be accessing 28 | def get_pointer(address): 29 | if arch_size == 32: 30 | return idaapi.get_dword(address) 31 | else: 32 | return idaapi.get_qword(address) 33 | 34 | tokens = ('HERE', 'HEXADDR', 'ADDR', 'PLUS', 'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'MINUS', 'TIMES', 'DIVIDE', 35 | 'EAX', 'EBX', 'ECX', 'EDX', 'EIP', 'ESI', 'EDI', 'EBP', 'ESP', 36 | 'RAX', 'RBX', 'RCX', 'RDX', 'RIP', 'RSI', 'RDI', 'RBP', 'RSP', 'R8', 'R9', 'R10', 'R11', 'R12', 'R13', 'R14', 'R15', 'NAME', 37 | ) 38 | 39 | t_PLUS = r'\+' 40 | t_MINUS = r'-' 41 | t_LBRACKET = r'\[' 42 | t_RBRACKET = r'\]' 43 | t_TIMES = r'\*' 44 | t_DIVIDE = r'/' 45 | t_LPAREN = r'\(' 46 | t_RPAREN = r'\)' 47 | 48 | def t_HERE(t): 49 | r'here(\(\))?' 50 | t.value = idc.here() 51 | return t 52 | 53 | def t_RAX(t): 54 | r'rax' 55 | t.value = idaapi.get_reg_val("RAX") 56 | return t 57 | 58 | def t_RBX(t): 59 | r'rbx' 60 | t.value = idaapi.get_reg_val("RBX") 61 | return t 62 | 63 | def t_RCX(t): 64 | r'rcx' 65 | t.value = idaapi.get_reg_val("RCX") 66 | return t 67 | 68 | def t_RDX(t): 69 | r'rdx' 70 | t.value = idaapi.get_reg_val("RDX") 71 | return t 72 | 73 | def t_RIP(t): 74 | r'rip' 75 | t.value = idaapi.get_reg_val("RIP") 76 | return t 77 | 78 | def t_RSI(t): 79 | r'rsi' 80 | t.value = idaapi.get_reg_val("RSI") 81 | return t 82 | 83 | def t_RDI(t): 84 | r'rdi' 85 | t.value = idaapi.get_reg_val("RDI") 86 | return t 87 | 88 | def t_RBP(t): 89 | r'rbp' 90 | t.value = idaapi.get_reg_val("RBP") 91 | return t 92 | 93 | def t_RSP(t): 94 | r'rsp' 95 | t.value = idaapi.get_reg_val("RSP") 96 | return t 97 | 98 | def t_R8(t): 99 | r'r8' 100 | t.value = idaapi.get_reg_val("R8") 101 | return t 102 | 103 | def t_R9(t): 104 | r'r9' 105 | t.value = idaapi.get_reg_val("R9") 106 | return t 107 | 108 | def t_R10(t): 109 | r'r10' 110 | t.value = idaapi.get_reg_val("R10") 111 | return t 112 | 113 | def t_R11(t): 114 | r'r11' 115 | t.value = idaapi.get_reg_val("R11") 116 | return t 117 | 118 | def t_R12(t): 119 | r'r12' 120 | t.value = idaapi.get_reg_val("R12") 121 | return t 122 | 123 | def t_R13(t): 124 | r'r13' 125 | t.value = idaapi.get_reg_val("R13") 126 | return t 127 | 128 | def t_R14(t): 129 | r'r14' 130 | t.value = idaapi.get_reg_val("R14") 131 | return t 132 | 133 | def t_R15(t): 134 | r'r15' 135 | t.value = idaapi.get_reg_val("R15") 136 | return t 137 | 138 | def t_EAX(t): 139 | r'eax' 140 | if arch_size == 32: 141 | t.value = idaapi.get_reg_val("EAX") 142 | else: 143 | t.value = (idaapi.get_reg_val("RAX")) & 0xffffffff 144 | return t 145 | 146 | def t_EBX(t): 147 | r'ebx' 148 | if arch_size == 32: 149 | t.value = idaapi.get_reg_val("EBX") 150 | else: 151 | t.value = (idaapi.get_reg_val("RBX")) & 0xffffffff 152 | return t 153 | 154 | def t_ECX(t): 155 | r'ecx' 156 | if arch_size == 32: 157 | t.value = idaapi.get_reg_val("ECX") 158 | else: 159 | t.value 
= (idaapi.get_reg_val("RCX")) & 0xffffffff
160 |     return t
161 | 
162 | def t_EDX(t):
163 |     r'edx'
164 |     if arch_size == 32:
165 |         t.value = idaapi.get_reg_val("EDX")
166 |     else:
167 |         t.value = (idaapi.get_reg_val("RDX")) & 0xffffffff
168 |     return t
169 | 
170 | def t_EIP(t):
171 |     r'eip'
172 |     if arch_size == 32:
173 |         t.value = idaapi.get_reg_val("EIP")
174 |     else:
175 |         t.value = (idaapi.get_reg_val("RIP")) & 0xffffffff
176 |     return t
177 | 
178 | def t_ESI(t):
179 |     r'esi'
180 |     if arch_size == 32:
181 |         t.value = idaapi.get_reg_val("ESI")
182 |     else:
183 |         t.value = (idaapi.get_reg_val("RSI")) & 0xffffffff
184 |     return t
185 | 
186 | def t_EDI(t):
187 |     r'edi'
188 |     if arch_size == 32:
189 |         t.value = idaapi.get_reg_val("EDI")
190 |     else:
191 |         t.value = (idaapi.get_reg_val("RDI")) & 0xffffffff
192 |     return t
193 | 
194 | def t_EBP(t):
195 |     r'ebp'
196 |     if arch_size == 32:
197 |         t.value = idaapi.get_reg_val("EBP")
198 |     else:
199 |         t.value = (idaapi.get_reg_val("RBP")) & 0xffffffff
200 |     return t
201 | 
202 | def t_ESP(t):
203 |     r'esp'
204 |     if arch_size == 32:
205 |         t.value = idaapi.get_reg_val("ESP")
206 |     else:
207 |         t.value = (idaapi.get_reg_val("RSP")) & 0xffffffff
208 |     return t
209 | 
210 | def t_NAME(t):
211 |     r'[a-zA-Z_\.][a-zA-Z0-9_\.]*'
212 |     temp = idc.get_name_ea_simple(t.value)
213 |     if temp == idaapi.BADADDR:
214 |         t.value = int(t.value, 16)
215 |     else:
216 |         t.value = temp
217 |     return t
218 | 
219 | def t_HEXADDR(t):
220 |     r'0x[0-9a-f]+'
221 |     t.value = int(t.value[2:], 16)
222 |     return t
223 | 
224 | def t_ADDR(t):
225 |     r'[0-9a-f]+'
226 |     t.value = int(t.value, 16)
227 |     return t
228 | 
229 | t_ignore = " \t"
230 | 
231 | def t_newline(t):
232 |     r'\n+'
233 |     t.lexer.lineno += t.value.count("\n")
234 | 
235 | def t_error(t):
236 |     debug_out(f"Illegal character {t.value[0]!r}")
237 |     t.lexer.skip(1)
238 | 
239 | glob_lex = lex.lex(reflags=re.I)  # case-insensitive matching so registers and names can be typed in any case
240 | 
241 | precedence = (
242 |     ('left','PLUS','MINUS'),
243 |     ('left','TIMES','DIVIDE'),
244 | )
245 | 
246 | def p_expression_binop(p):
247 |     '''expression : expression PLUS expression
248 |                   | expression MINUS expression
249 |                   | expression TIMES expression
250 |                   | expression DIVIDE expression'''
251 |     if p[2] == '+': p[0] = p[1] + p[3]
252 |     elif p[2] == '-': p[0] = p[1] - p[3]
253 |     elif p[2] == '*': p[0] = p[1] * p[3]
254 |     elif p[2] == '/': p[0] = p[1] // p[3]  # floor division so the result stays an integer address
255 | 
256 | def p_expression_group(p):
257 |     'expression : LPAREN expression RPAREN'
258 |     p[0] = p[2]
259 | 
260 | def p_expression_deref(p):
261 |     'expression : LBRACKET expression RBRACKET'
262 |     p[0] = get_pointer(p[2])
263 | 
264 | def p_expression_hexaddr(p):
265 |     'expression : HEXADDR'
266 |     p[0] = p[1]
267 | 
268 | def p_expression_addr(p):
269 |     'expression : ADDR'
270 |     p[0] = p[1]
271 | 
272 | def p_expression_reg(p):
273 |     '''expression : RAX
274 |                   | RBX
275 |                   | RCX
276 |                   | RDX
277 |                   | RSI
278 |                   | RDI
279 |                   | RBP
280 |                   | RSP
281 |                   | RIP
282 |                   | R8
283 |                   | R9
284 |                   | R10
285 |                   | R11
286 |                   | R12
287 |                   | R13
288 |                   | R14
289 |                   | R15
290 |                   | EAX
291 |                   | EBX
292 |                   | ECX
293 |                   | EDX
294 |                   | ESI
295 |                   | EDI
296 |                   | EBP
297 |                   | ESP
298 |                   | EIP
299 |                   | HERE'''
300 |     p[0] = p[1]
301 | 
302 | def p_expression_name(p):
303 |     'expression : NAME'
304 |     p[0] = p[1]
305 | 
306 | def p_error(p):
307 |     debug_out(f"Syntax error at {p.value!r}" if p else "Syntax error: unexpected end of expression")
308 | 
309 | glob_parser = yacc.yacc()
310 | 
311 | def debug_out(msg):
312 |     print("[SMARTJUMPER]: %s" % msg)
313 | 
314 | class SmartJump_t(idaapi.plugin_t):
315 |     flags = 0
316 |     comment = "Smart IDA jumping"
317 |     wanted_hotkey = 'g'
318 |     help = "Runs by replacing Go command when pressing G"
319 |     wanted_name = "SmartJumper"
320 |     lexer = None
321 |     parser = None
322 | 
323 |     def init(self):
324 |         global glob_lex
325 |         global glob_parser
326 |         if idaapi.ph_get_id() != idaapi.PLFM_386:
327 |             return idaapi.PLUGIN_SKIP
328 |         debug_out("Loading Parsers")
329 | 
330 |         self.lexer = glob_lex
331 |         self.parser = glob_parser
332 |         debug_out("Loaded SmartJumper")
333 |         return idaapi.PLUGIN_KEEP
334 | 
335 |     def run(self, arg=0):
336 |         jump_str = idaapi.ask_str("", 0, "Jump expression...")
337 |         if jump_str:
338 |             try:
339 |                 open_deref = jump_str.count("[")
340 |                 close_deref = jump_str.count("]")
341 |                 if close_deref < open_deref:
342 |                     jump_str += "]" * (open_deref-close_deref)
343 |                 if open_deref < close_deref:
344 |                     debug_out("mismatched dereferences")
345 |                 else:
346 |                     result = self.parser.parse(jump_str, lexer=self.lexer)
347 |                     debug_out("resolved to %08x" % result)
348 |                     idaapi.jumpto(result)
349 |             except Exception as e:
350 |                 debug_out("problem parsing: %s" % e)
351 | 
352 |     def term(self):
353 |         return
354 | 
355 | def PLUGIN_ENTRY():
356 |     return SmartJump_t()
357 | 
--------------------------------------------------------------------------------
/plugins/SmartJump/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '4.0'
2 | __all__ = ['lex','yacc']
3 | 
--------------------------------------------------------------------------------
/plugins/SmartJump/lex.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # ply: lex.py
3 | #
4 | # Copyright (C) 2001-2020
5 | # David M. Beazley (Dabeaz LLC)
6 | # All rights reserved.
7 | #
8 | # Latest version: https://github.com/dabeaz/ply
9 | #
10 | # Redistribution and use in source and binary forms, with or without
11 | # modification, are permitted provided that the following conditions are
12 | # met:
13 | #
14 | # * Redistributions of source code must retain the above copyright notice,
15 | #   this list of conditions and the following disclaimer.
16 | # * Redistributions in binary form must reproduce the above copyright notice,
17 | #   this list of conditions and the following disclaimer in the documentation
18 | #   and/or other materials provided with the distribution.
19 | # * Neither the name of David Beazley or Dabeaz LLC may be used to
20 | #   endorse or promote products derived from this software without
21 | #   specific prior written permission.
22 | #
23 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 | # ----------------------------------------------------------------------------- 35 | 36 | import re 37 | import sys 38 | import types 39 | import copy 40 | import os 41 | import inspect 42 | 43 | # This tuple contains acceptable string types 44 | StringTypes = (str, bytes) 45 | 46 | # This regular expression is used to match valid token names 47 | _is_identifier = re.compile(r'^[a-zA-Z0-9_]+$') 48 | 49 | # Exception thrown when invalid token encountered and no default error 50 | # handler is defined. 51 | class LexError(Exception): 52 | def __init__(self, message, s): 53 | self.args = (message,) 54 | self.text = s 55 | 56 | # Token class. This class is used to represent the tokens produced. 57 | class LexToken(object): 58 | def __repr__(self): 59 | return f'LexToken({self.type},{self.value!r},{self.lineno},{self.lexpos})' 60 | 61 | # This object is a stand-in for a logging object created by the 62 | # logging module. 63 | 64 | class PlyLogger(object): 65 | def __init__(self, f): 66 | self.f = f 67 | 68 | def critical(self, msg, *args, **kwargs): 69 | self.f.write((msg % args) + '\n') 70 | 71 | def warning(self, msg, *args, **kwargs): 72 | self.f.write('WARNING: ' + (msg % args) + '\n') 73 | 74 | def error(self, msg, *args, **kwargs): 75 | self.f.write('ERROR: ' + (msg % args) + '\n') 76 | 77 | info = critical 78 | debug = critical 79 | 80 | # ----------------------------------------------------------------------------- 81 | # === Lexing Engine === 82 | # 83 | # The following Lexer class implements the lexer runtime. There are only 84 | # a few public methods and attributes: 85 | # 86 | # input() - Store a new string in the lexer 87 | # token() - Get the next token 88 | # clone() - Clone the lexer 89 | # 90 | # lineno - Current line number 91 | # lexpos - Current position in the input string 92 | # ----------------------------------------------------------------------------- 93 | 94 | class Lexer: 95 | def __init__(self): 96 | self.lexre = None # Master regular expression. 
This is a list of 97 | # tuples (re, findex) where re is a compiled 98 | # regular expression and findex is a list 99 | # mapping regex group numbers to rules 100 | self.lexretext = None # Current regular expression strings 101 | self.lexstatere = {} # Dictionary mapping lexer states to master regexs 102 | self.lexstateretext = {} # Dictionary mapping lexer states to regex strings 103 | self.lexstaterenames = {} # Dictionary mapping lexer states to symbol names 104 | self.lexstate = 'INITIAL' # Current lexer state 105 | self.lexstatestack = [] # Stack of lexer states 106 | self.lexstateinfo = None # State information 107 | self.lexstateignore = {} # Dictionary of ignored characters for each state 108 | self.lexstateerrorf = {} # Dictionary of error functions for each state 109 | self.lexstateeoff = {} # Dictionary of eof functions for each state 110 | self.lexreflags = 0 # Optional re compile flags 111 | self.lexdata = None # Actual input data (as a string) 112 | self.lexpos = 0 # Current position in input text 113 | self.lexlen = 0 # Length of the input text 114 | self.lexerrorf = None # Error rule (if any) 115 | self.lexeoff = None # EOF rule (if any) 116 | self.lextokens = None # List of valid tokens 117 | self.lexignore = '' # Ignored characters 118 | self.lexliterals = '' # Literal characters that can be passed through 119 | self.lexmodule = None # Module 120 | self.lineno = 1 # Current line number 121 | 122 | def clone(self, object=None): 123 | c = copy.copy(self) 124 | 125 | # If the object parameter has been supplied, it means we are attaching the 126 | # lexer to a new object. In this case, we have to rebind all methods in 127 | # the lexstatere and lexstateerrorf tables. 128 | 129 | if object: 130 | newtab = {} 131 | for key, ritem in self.lexstatere.items(): 132 | newre = [] 133 | for cre, findex in ritem: 134 | newfindex = [] 135 | for f in findex: 136 | if not f or not f[0]: 137 | newfindex.append(f) 138 | continue 139 | newfindex.append((getattr(object, f[0].__name__), f[1])) 140 | newre.append((cre, newfindex)) 141 | newtab[key] = newre 142 | c.lexstatere = newtab 143 | c.lexstateerrorf = {} 144 | for key, ef in self.lexstateerrorf.items(): 145 | c.lexstateerrorf[key] = getattr(object, ef.__name__) 146 | c.lexmodule = object 147 | return c 148 | 149 | # ------------------------------------------------------------ 150 | # input() - Push a new string into the lexer 151 | # ------------------------------------------------------------ 152 | def input(self, s): 153 | self.lexdata = s 154 | self.lexpos = 0 155 | self.lexlen = len(s) 156 | 157 | # ------------------------------------------------------------ 158 | # begin() - Changes the lexing state 159 | # ------------------------------------------------------------ 160 | def begin(self, state): 161 | if state not in self.lexstatere: 162 | raise ValueError(f'Undefined state {state!r}') 163 | self.lexre = self.lexstatere[state] 164 | self.lexretext = self.lexstateretext[state] 165 | self.lexignore = self.lexstateignore.get(state, '') 166 | self.lexerrorf = self.lexstateerrorf.get(state, None) 167 | self.lexeoff = self.lexstateeoff.get(state, None) 168 | self.lexstate = state 169 | 170 | # ------------------------------------------------------------ 171 | # push_state() - Changes the lexing state and saves old on stack 172 | # ------------------------------------------------------------ 173 | def push_state(self, state): 174 | self.lexstatestack.append(self.lexstate) 175 | self.begin(state) 176 | 177 | # 
------------------------------------------------------------ 178 | # pop_state() - Restores the previous state 179 | # ------------------------------------------------------------ 180 | def pop_state(self): 181 | self.begin(self.lexstatestack.pop()) 182 | 183 | # ------------------------------------------------------------ 184 | # current_state() - Returns the current lexing state 185 | # ------------------------------------------------------------ 186 | def current_state(self): 187 | return self.lexstate 188 | 189 | # ------------------------------------------------------------ 190 | # skip() - Skip ahead n characters 191 | # ------------------------------------------------------------ 192 | def skip(self, n): 193 | self.lexpos += n 194 | 195 | # ------------------------------------------------------------ 196 | # token() - Return the next token from the Lexer 197 | # 198 | # Note: This function has been carefully implemented to be as fast 199 | # as possible. Don't make changes unless you really know what 200 | # you are doing 201 | # ------------------------------------------------------------ 202 | def token(self): 203 | # Make local copies of frequently referenced attributes 204 | lexpos = self.lexpos 205 | lexlen = self.lexlen 206 | lexignore = self.lexignore 207 | lexdata = self.lexdata 208 | 209 | while lexpos < lexlen: 210 | # This code provides some short-circuit code for whitespace, tabs, and other ignored characters 211 | if lexdata[lexpos] in lexignore: 212 | lexpos += 1 213 | continue 214 | 215 | # Look for a regular expression match 216 | for lexre, lexindexfunc in self.lexre: 217 | m = lexre.match(lexdata, lexpos) 218 | if not m: 219 | continue 220 | 221 | # Create a token for return 222 | tok = LexToken() 223 | tok.value = m.group() 224 | tok.lineno = self.lineno 225 | tok.lexpos = lexpos 226 | 227 | i = m.lastindex 228 | func, tok.type = lexindexfunc[i] 229 | 230 | if not func: 231 | # If no token type was set, it's an ignored token 232 | if tok.type: 233 | self.lexpos = m.end() 234 | return tok 235 | else: 236 | lexpos = m.end() 237 | break 238 | 239 | lexpos = m.end() 240 | 241 | # If token is processed by a function, call it 242 | 243 | tok.lexer = self # Set additional attributes useful in token rules 244 | self.lexmatch = m 245 | self.lexpos = lexpos 246 | newtok = func(tok) 247 | del tok.lexer 248 | del self.lexmatch 249 | 250 | # Every function must return a token, if nothing, we just move to next token 251 | if not newtok: 252 | lexpos = self.lexpos # This is here in case user has updated lexpos. 253 | lexignore = self.lexignore # This is here in case there was a state change 254 | break 255 | return newtok 256 | else: 257 | # No match, see if in literals 258 | if lexdata[lexpos] in self.lexliterals: 259 | tok = LexToken() 260 | tok.value = lexdata[lexpos] 261 | tok.lineno = self.lineno 262 | tok.type = tok.value 263 | tok.lexpos = lexpos 264 | self.lexpos = lexpos + 1 265 | return tok 266 | 267 | # No match. Call t_error() if defined. 268 | if self.lexerrorf: 269 | tok = LexToken() 270 | tok.value = self.lexdata[lexpos:] 271 | tok.lineno = self.lineno 272 | tok.type = 'error' 273 | tok.lexer = self 274 | tok.lexpos = lexpos 275 | self.lexpos = lexpos 276 | newtok = self.lexerrorf(tok) 277 | if lexpos == self.lexpos: 278 | # Error method didn't change text position at all. This is an error. 279 | raise LexError(f"Scanning error. 
Illegal character {lexdata[lexpos]!r}", 280 | lexdata[lexpos:]) 281 | lexpos = self.lexpos 282 | if not newtok: 283 | continue 284 | return newtok 285 | 286 | self.lexpos = lexpos 287 | raise LexError(f"Illegal character {lexdata[lexpos]!r} at index {lexpos}", 288 | lexdata[lexpos:]) 289 | 290 | if self.lexeoff: 291 | tok = LexToken() 292 | tok.type = 'eof' 293 | tok.value = '' 294 | tok.lineno = self.lineno 295 | tok.lexpos = lexpos 296 | tok.lexer = self 297 | self.lexpos = lexpos 298 | newtok = self.lexeoff(tok) 299 | return newtok 300 | 301 | self.lexpos = lexpos + 1 302 | if self.lexdata is None: 303 | raise RuntimeError('No input string given with input()') 304 | return None 305 | 306 | # Iterator interface 307 | def __iter__(self): 308 | return self 309 | 310 | def __next__(self): 311 | t = self.token() 312 | if t is None: 313 | raise StopIteration 314 | return t 315 | 316 | # ----------------------------------------------------------------------------- 317 | # ==== Lex Builder === 318 | # 319 | # The functions and classes below are used to collect lexing information 320 | # and build a Lexer object from it. 321 | # ----------------------------------------------------------------------------- 322 | 323 | # ----------------------------------------------------------------------------- 324 | # _get_regex(func) 325 | # 326 | # Returns the regular expression assigned to a function either as a doc string 327 | # or as a .regex attribute attached by the @TOKEN decorator. 328 | # ----------------------------------------------------------------------------- 329 | def _get_regex(func): 330 | return getattr(func, 'regex', func.__doc__) 331 | 332 | # ----------------------------------------------------------------------------- 333 | # get_caller_module_dict() 334 | # 335 | # This function returns a dictionary containing all of the symbols defined within 336 | # a caller further down the call stack. This is used to get the environment 337 | # associated with the yacc() call if none was provided. 338 | # ----------------------------------------------------------------------------- 339 | def get_caller_module_dict(levels): 340 | f = sys._getframe(levels) 341 | return { **f.f_globals, **f.f_locals } 342 | 343 | # ----------------------------------------------------------------------------- 344 | # _form_master_re() 345 | # 346 | # This function takes a list of all of the regex components and attempts to 347 | # form the master regular expression. Given limitations in the Python re 348 | # module, it may be necessary to break the master regex into separate expressions. 
349 | # ----------------------------------------------------------------------------- 350 | def _form_master_re(relist, reflags, ldict, toknames): 351 | if not relist: 352 | return [], [], [] 353 | regex = '|'.join(relist) 354 | try: 355 | lexre = re.compile(regex, reflags) 356 | 357 | # Build the index to function map for the matching engine 358 | lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1) 359 | lexindexnames = lexindexfunc[:] 360 | 361 | for f, i in lexre.groupindex.items(): 362 | handle = ldict.get(f, None) 363 | if type(handle) in (types.FunctionType, types.MethodType): 364 | lexindexfunc[i] = (handle, toknames[f]) 365 | lexindexnames[i] = f 366 | elif handle is not None: 367 | lexindexnames[i] = f 368 | if f.find('ignore_') > 0: 369 | lexindexfunc[i] = (None, None) 370 | else: 371 | lexindexfunc[i] = (None, toknames[f]) 372 | 373 | return [(lexre, lexindexfunc)], [regex], [lexindexnames] 374 | except Exception: 375 | m = (len(relist) // 2) + 1 376 | llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames) 377 | rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames) 378 | return (llist+rlist), (lre+rre), (lnames+rnames) 379 | 380 | # ----------------------------------------------------------------------------- 381 | # def _statetoken(s,names) 382 | # 383 | # Given a declaration name s of the form "t_" and a dictionary whose keys are 384 | # state names, this function returns a tuple (states,tokenname) where states 385 | # is a tuple of state names and tokenname is the name of the token. For example, 386 | # calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM') 387 | # ----------------------------------------------------------------------------- 388 | def _statetoken(s, names): 389 | parts = s.split('_') 390 | for i, part in enumerate(parts[1:], 1): 391 | if part not in names and part != 'ANY': 392 | break 393 | 394 | if i > 1: 395 | states = tuple(parts[1:i]) 396 | else: 397 | states = ('INITIAL',) 398 | 399 | if 'ANY' in states: 400 | states = tuple(names) 401 | 402 | tokenname = '_'.join(parts[i:]) 403 | return (states, tokenname) 404 | 405 | 406 | # ----------------------------------------------------------------------------- 407 | # LexerReflect() 408 | # 409 | # This class represents information needed to build a lexer as extracted from a 410 | # user's input file. 
411 | # ----------------------------------------------------------------------------- 412 | class LexerReflect(object): 413 | def __init__(self, ldict, log=None, reflags=0): 414 | self.ldict = ldict 415 | self.error_func = None 416 | self.tokens = [] 417 | self.reflags = reflags 418 | self.stateinfo = {'INITIAL': 'inclusive'} 419 | self.modules = set() 420 | self.error = False 421 | self.log = PlyLogger(sys.stderr) if log is None else log 422 | 423 | # Get all of the basic information 424 | def get_all(self): 425 | self.get_tokens() 426 | self.get_literals() 427 | self.get_states() 428 | self.get_rules() 429 | 430 | # Validate all of the information 431 | def validate_all(self): 432 | self.validate_tokens() 433 | self.validate_literals() 434 | self.validate_rules() 435 | return self.error 436 | 437 | # Get the tokens map 438 | def get_tokens(self): 439 | tokens = self.ldict.get('tokens', None) 440 | if not tokens: 441 | self.log.error('No token list is defined') 442 | self.error = True 443 | return 444 | 445 | if not isinstance(tokens, (list, tuple)): 446 | self.log.error('tokens must be a list or tuple') 447 | self.error = True 448 | return 449 | 450 | if not tokens: 451 | self.log.error('tokens is empty') 452 | self.error = True 453 | return 454 | 455 | self.tokens = tokens 456 | 457 | # Validate the tokens 458 | def validate_tokens(self): 459 | terminals = {} 460 | for n in self.tokens: 461 | if not _is_identifier.match(n): 462 | self.log.error(f"Bad token name {n!r}") 463 | self.error = True 464 | if n in terminals: 465 | self.log.warning(f"Token {n!r} multiply defined") 466 | terminals[n] = 1 467 | 468 | # Get the literals specifier 469 | def get_literals(self): 470 | self.literals = self.ldict.get('literals', '') 471 | if not self.literals: 472 | self.literals = '' 473 | 474 | # Validate literals 475 | def validate_literals(self): 476 | try: 477 | for c in self.literals: 478 | if not isinstance(c, StringTypes) or len(c) > 1: 479 | self.log.error(f'Invalid literal {c!r}. Must be a single character') 480 | self.error = True 481 | 482 | except TypeError: 483 | self.log.error('Invalid literals specification. literals must be a sequence of characters') 484 | self.error = True 485 | 486 | def get_states(self): 487 | self.states = self.ldict.get('states', None) 488 | # Build statemap 489 | if self.states: 490 | if not isinstance(self.states, (tuple, list)): 491 | self.log.error('states must be defined as a tuple or list') 492 | self.error = True 493 | else: 494 | for s in self.states: 495 | if not isinstance(s, tuple) or len(s) != 2: 496 | self.log.error("Invalid state specifier %r. 
Must be a tuple (statename,'exclusive|inclusive')", s) 497 | self.error = True 498 | continue 499 | name, statetype = s 500 | if not isinstance(name, StringTypes): 501 | self.log.error('State name %r must be a string', name) 502 | self.error = True 503 | continue 504 | if not (statetype == 'inclusive' or statetype == 'exclusive'): 505 | self.log.error("State type for state %r must be 'inclusive' or 'exclusive'", name) 506 | self.error = True 507 | continue 508 | if name in self.stateinfo: 509 | self.log.error("State %r already defined", name) 510 | self.error = True 511 | continue 512 | self.stateinfo[name] = statetype 513 | 514 | # Get all of the symbols with a t_ prefix and sort them into various 515 | # categories (functions, strings, error functions, and ignore characters) 516 | 517 | def get_rules(self): 518 | tsymbols = [f for f in self.ldict if f[:2] == 't_'] 519 | 520 | # Now build up a list of functions and a list of strings 521 | self.toknames = {} # Mapping of symbols to token names 522 | self.funcsym = {} # Symbols defined as functions 523 | self.strsym = {} # Symbols defined as strings 524 | self.ignore = {} # Ignore strings by state 525 | self.errorf = {} # Error functions by state 526 | self.eoff = {} # EOF functions by state 527 | 528 | for s in self.stateinfo: 529 | self.funcsym[s] = [] 530 | self.strsym[s] = [] 531 | 532 | if len(tsymbols) == 0: 533 | self.log.error('No rules of the form t_rulename are defined') 534 | self.error = True 535 | return 536 | 537 | for f in tsymbols: 538 | t = self.ldict[f] 539 | states, tokname = _statetoken(f, self.stateinfo) 540 | self.toknames[f] = tokname 541 | 542 | if hasattr(t, '__call__'): 543 | if tokname == 'error': 544 | for s in states: 545 | self.errorf[s] = t 546 | elif tokname == 'eof': 547 | for s in states: 548 | self.eoff[s] = t 549 | elif tokname == 'ignore': 550 | line = t.__code__.co_firstlineno 551 | file = t.__code__.co_filename 552 | self.log.error("%s:%d: Rule %r must be defined as a string", file, line, t.__name__) 553 | self.error = True 554 | else: 555 | for s in states: 556 | self.funcsym[s].append((f, t)) 557 | elif isinstance(t, StringTypes): 558 | if tokname == 'ignore': 559 | for s in states: 560 | self.ignore[s] = t 561 | if '\\' in t: 562 | self.log.warning("%s contains a literal backslash '\\'", f) 563 | 564 | elif tokname == 'error': 565 | self.log.error("Rule %r must be defined as a function", f) 566 | self.error = True 567 | else: 568 | for s in states: 569 | self.strsym[s].append((f, t)) 570 | else: 571 | self.log.error('%s not defined as a function or string', f) 572 | self.error = True 573 | 574 | # Sort the functions by line number 575 | for f in self.funcsym.values(): 576 | f.sort(key=lambda x: x[1].__code__.co_firstlineno) 577 | 578 | # Sort the strings by regular expression length 579 | for s in self.strsym.values(): 580 | s.sort(key=lambda x: len(x[1]), reverse=True) 581 | 582 | # Validate all of the t_rules collected 583 | def validate_rules(self): 584 | for state in self.stateinfo: 585 | # Validate all rules defined by functions 586 | 587 | for fname, f in self.funcsym[state]: 588 | line = f.__code__.co_firstlineno 589 | file = f.__code__.co_filename 590 | module = inspect.getmodule(f) 591 | self.modules.add(module) 592 | 593 | tokname = self.toknames[fname] 594 | if isinstance(f, types.MethodType): 595 | reqargs = 2 596 | else: 597 | reqargs = 1 598 | nargs = f.__code__.co_argcount 599 | if nargs > reqargs: 600 | self.log.error("%s:%d: Rule %r has too many arguments", file, line, f.__name__) 
601 | self.error = True 602 | continue 603 | 604 | if nargs < reqargs: 605 | self.log.error("%s:%d: Rule %r requires an argument", file, line, f.__name__) 606 | self.error = True 607 | continue 608 | 609 | if not _get_regex(f): 610 | self.log.error("%s:%d: No regular expression defined for rule %r", file, line, f.__name__) 611 | self.error = True 612 | continue 613 | 614 | try: 615 | c = re.compile('(?P<%s>%s)' % (fname, _get_regex(f)), self.reflags) 616 | if c.match(''): 617 | self.log.error("%s:%d: Regular expression for rule %r matches empty string", file, line, f.__name__) 618 | self.error = True 619 | except re.error as e: 620 | self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e) 621 | if '#' in _get_regex(f): 622 | self.log.error("%s:%d. Make sure '#' in rule %r is escaped with '\\#'", file, line, f.__name__) 623 | self.error = True 624 | 625 | # Validate all rules defined by strings 626 | for name, r in self.strsym[state]: 627 | tokname = self.toknames[name] 628 | if tokname == 'error': 629 | self.log.error("Rule %r must be defined as a function", name) 630 | self.error = True 631 | continue 632 | 633 | if tokname not in self.tokens and tokname.find('ignore_') < 0: 634 | self.log.error("Rule %r defined for an unspecified token %s", name, tokname) 635 | self.error = True 636 | continue 637 | 638 | try: 639 | c = re.compile('(?P<%s>%s)' % (name, r), self.reflags) 640 | if (c.match('')): 641 | self.log.error("Regular expression for rule %r matches empty string", name) 642 | self.error = True 643 | except re.error as e: 644 | self.log.error("Invalid regular expression for rule %r. %s", name, e) 645 | if '#' in r: 646 | self.log.error("Make sure '#' in rule %r is escaped with '\\#'", name) 647 | self.error = True 648 | 649 | if not self.funcsym[state] and not self.strsym[state]: 650 | self.log.error("No rules defined for state %r", state) 651 | self.error = True 652 | 653 | # Validate the error function 654 | efunc = self.errorf.get(state, None) 655 | if efunc: 656 | f = efunc 657 | line = f.__code__.co_firstlineno 658 | file = f.__code__.co_filename 659 | module = inspect.getmodule(f) 660 | self.modules.add(module) 661 | 662 | if isinstance(f, types.MethodType): 663 | reqargs = 2 664 | else: 665 | reqargs = 1 666 | nargs = f.__code__.co_argcount 667 | if nargs > reqargs: 668 | self.log.error("%s:%d: Rule %r has too many arguments", file, line, f.__name__) 669 | self.error = True 670 | 671 | if nargs < reqargs: 672 | self.log.error("%s:%d: Rule %r requires an argument", file, line, f.__name__) 673 | self.error = True 674 | 675 | for module in self.modules: 676 | self.validate_module(module) 677 | 678 | # ----------------------------------------------------------------------------- 679 | # validate_module() 680 | # 681 | # This checks to see if there are duplicated t_rulename() functions or strings 682 | # in the parser input file. This is done using a simple regular expression 683 | # match on each line in the source code of the given module. 
684 | # ----------------------------------------------------------------------------- 685 | 686 | def validate_module(self, module): 687 | try: 688 | lines, linen = inspect.getsourcelines(module) 689 | except IOError: 690 | return 691 | 692 | fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(') 693 | sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=') 694 | 695 | counthash = {} 696 | linen += 1 697 | for line in lines: 698 | m = fre.match(line) 699 | if not m: 700 | m = sre.match(line) 701 | if m: 702 | name = m.group(1) 703 | prev = counthash.get(name) 704 | if not prev: 705 | counthash[name] = linen 706 | else: 707 | filename = inspect.getsourcefile(module) 708 | self.log.error('%s:%d: Rule %s redefined. Previously defined on line %d', filename, linen, name, prev) 709 | self.error = True 710 | linen += 1 711 | 712 | # ----------------------------------------------------------------------------- 713 | # lex(module) 714 | # 715 | # Build all of the regular expression rules from definitions in the supplied module 716 | # ----------------------------------------------------------------------------- 717 | def lex(*, module=None, object=None, debug=False, 718 | reflags=int(re.VERBOSE), debuglog=None, errorlog=None): 719 | 720 | global lexer 721 | 722 | ldict = None 723 | stateinfo = {'INITIAL': 'inclusive'} 724 | lexobj = Lexer() 725 | global token, input 726 | 727 | if errorlog is None: 728 | errorlog = PlyLogger(sys.stderr) 729 | 730 | if debug: 731 | if debuglog is None: 732 | debuglog = PlyLogger(sys.stderr) 733 | 734 | # Get the module dictionary used for the lexer 735 | if object: 736 | module = object 737 | 738 | # Get the module dictionary used for the parser 739 | if module: 740 | _items = [(k, getattr(module, k)) for k in dir(module)] 741 | ldict = dict(_items) 742 | # If no __file__ attribute is available, try to obtain it from the __module__ instead 743 | if '__file__' not in ldict: 744 | ldict['__file__'] = sys.modules[ldict['__module__']].__file__ 745 | else: 746 | ldict = get_caller_module_dict(2) 747 | 748 | # Collect parser information from the dictionary 749 | linfo = LexerReflect(ldict, log=errorlog, reflags=reflags) 750 | linfo.get_all() 751 | if linfo.validate_all(): 752 | raise SyntaxError("Can't build lexer") 753 | 754 | # Dump some basic debugging information 755 | if debug: 756 | debuglog.info('lex: tokens = %r', linfo.tokens) 757 | debuglog.info('lex: literals = %r', linfo.literals) 758 | debuglog.info('lex: states = %r', linfo.stateinfo) 759 | 760 | # Build a dictionary of valid token names 761 | lexobj.lextokens = set() 762 | for n in linfo.tokens: 763 | lexobj.lextokens.add(n) 764 | 765 | # Get literals specification 766 | if isinstance(linfo.literals, (list, tuple)): 767 | lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals) 768 | else: 769 | lexobj.lexliterals = linfo.literals 770 | 771 | lexobj.lextokens_all = lexobj.lextokens | set(lexobj.lexliterals) 772 | 773 | # Get the stateinfo dictionary 774 | stateinfo = linfo.stateinfo 775 | 776 | regexs = {} 777 | # Build the master regular expressions 778 | for state in stateinfo: 779 | regex_list = [] 780 | 781 | # Add rules defined by functions first 782 | for fname, f in linfo.funcsym[state]: 783 | regex_list.append('(?P<%s>%s)' % (fname, _get_regex(f))) 784 | if debug: 785 | debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, _get_regex(f), state) 786 | 787 | # Now add all of the simple rules 788 | for name, r in linfo.strsym[state]: 789 | regex_list.append('(?P<%s>%s)' % (name, r)) 790 | if 
debug: 791 | debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state) 792 | 793 | regexs[state] = regex_list 794 | 795 | # Build the master regular expressions 796 | 797 | if debug: 798 | debuglog.info('lex: ==== MASTER REGEXS FOLLOW ====') 799 | 800 | for state in regexs: 801 | lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames) 802 | lexobj.lexstatere[state] = lexre 803 | lexobj.lexstateretext[state] = re_text 804 | lexobj.lexstaterenames[state] = re_names 805 | if debug: 806 | for i, text in enumerate(re_text): 807 | debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, text) 808 | 809 | # For inclusive states, we need to add the regular expressions from the INITIAL state 810 | for state, stype in stateinfo.items(): 811 | if state != 'INITIAL' and stype == 'inclusive': 812 | lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL']) 813 | lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL']) 814 | lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL']) 815 | 816 | lexobj.lexstateinfo = stateinfo 817 | lexobj.lexre = lexobj.lexstatere['INITIAL'] 818 | lexobj.lexretext = lexobj.lexstateretext['INITIAL'] 819 | lexobj.lexreflags = reflags 820 | 821 | # Set up ignore variables 822 | lexobj.lexstateignore = linfo.ignore 823 | lexobj.lexignore = lexobj.lexstateignore.get('INITIAL', '') 824 | 825 | # Set up error functions 826 | lexobj.lexstateerrorf = linfo.errorf 827 | lexobj.lexerrorf = linfo.errorf.get('INITIAL', None) 828 | if not lexobj.lexerrorf: 829 | errorlog.warning('No t_error rule is defined') 830 | 831 | # Set up eof functions 832 | lexobj.lexstateeoff = linfo.eoff 833 | lexobj.lexeoff = linfo.eoff.get('INITIAL', None) 834 | 835 | # Check state information for ignore and error rules 836 | for s, stype in stateinfo.items(): 837 | if stype == 'exclusive': 838 | if s not in linfo.errorf: 839 | errorlog.warning("No error rule is defined for exclusive state %r", s) 840 | if s not in linfo.ignore and lexobj.lexignore: 841 | errorlog.warning("No ignore rule is defined for exclusive state %r", s) 842 | elif stype == 'inclusive': 843 | if s not in linfo.errorf: 844 | linfo.errorf[s] = linfo.errorf.get('INITIAL', None) 845 | if s not in linfo.ignore: 846 | linfo.ignore[s] = linfo.ignore.get('INITIAL', '') 847 | 848 | # Create global versions of the token() and input() functions 849 | token = lexobj.token 850 | input = lexobj.input 851 | lexer = lexobj 852 | 853 | return lexobj 854 | 855 | # ----------------------------------------------------------------------------- 856 | # runmain() 857 | # 858 | # This runs the lexer as a main program 859 | # ----------------------------------------------------------------------------- 860 | 861 | def runmain(lexer=None, data=None): 862 | if not data: 863 | try: 864 | filename = sys.argv[1] 865 | with open(filename) as f: 866 | data = f.read() 867 | except IndexError: 868 | sys.stdout.write('Reading from standard input (type EOF to end):\n') 869 | data = sys.stdin.read() 870 | 871 | if lexer: 872 | _input = lexer.input 873 | else: 874 | _input = input 875 | _input(data) 876 | if lexer: 877 | _token = lexer.token 878 | else: 879 | _token = token 880 | 881 | while True: 882 | tok = _token() 883 | if not tok: 884 | break 885 | sys.stdout.write(f'({tok.type},{tok.value!r},{tok.lineno},{tok.lexpos})\n') 886 | 887 | # ----------------------------------------------------------------------------- 888 | # @TOKEN(regex) 889 | # 890 | # This decorator function can be 
used to set the regex expression on a function 891 | # when its docstring might need to be set in an alternative way 892 | # ----------------------------------------------------------------------------- 893 | 894 | def TOKEN(r): 895 | def set_regex(f): 896 | if hasattr(r, '__call__'): 897 | f.regex = _get_regex(r) 898 | else: 899 | f.regex = r 900 | return f 901 | return set_regex 902 | -------------------------------------------------------------------------------- /plugins/SmartJump/yacc.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # ply: yacc.py 3 | # 4 | # Copyright (C) 2001-2020 5 | # David M. Beazley (Dabeaz LLC) 6 | # All rights reserved. 7 | # 8 | # Latest version: https://github.com/dabeaz/ply 9 | # 10 | # Redistribution and use in source and binary forms, with or without 11 | # modification, are permitted provided that the following conditions are 12 | # met: 13 | # 14 | # * Redistributions of source code must retain the above copyright notice, 15 | # this list of conditions and the following disclaimer. 16 | # * Redistributions in binary form must reproduce the above copyright notice, 17 | # this list of conditions and the following disclaimer in the documentation 18 | # and/or other materials provided with the distribution. 19 | # * Neither the name of David Beazley or Dabeaz LLC may be used to 20 | # endorse or promote products derived from this software without 21 | # specific prior written permission. 22 | # 23 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 27 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 28 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 29 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 30 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 31 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 32 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 | # ----------------------------------------------------------------------------- 35 | # 36 | # This implements an LR parser that is constructed from grammar rules defined 37 | # as Python functions. The grammar is specified by supplying the BNF inside 38 | # Python documentation strings. The inspiration for this technique was borrowed 39 | # from John Aycock's Spark parsing system. PLY might be viewed as cross between 40 | # Spark and the GNU bison utility. 41 | # 42 | # The current implementation is only somewhat object-oriented. The 43 | # LR parser itself is defined in terms of an object (which allows multiple 44 | # parsers to co-exist). However, most of the variables used during table 45 | # construction are defined in terms of global variables. Users shouldn't 46 | # notice unless they are trying to define multiple parsers at the same 47 | # time using threads (in which case they should have their head examined). 48 | # 49 | # This implementation supports both SLR and LALR(1) parsing. 
LALR(1) 50 | # support was originally implemented by Elias Ioup (ezioup@alumni.uchicago.edu), 51 | # using the algorithm found in Aho, Sethi, and Ullman "Compilers: Principles, 52 | # Techniques, and Tools" (The Dragon Book). LALR(1) has since been replaced 53 | # by the more efficient DeRemer and Pennello algorithm. 54 | # 55 | # :::::::: WARNING ::::::: 56 | # 57 | # Construction of LR parsing tables is fairly complicated and expensive. 58 | # To make this module run fast, a *LOT* of work has been put into 59 | # optimization---often at the expensive of readability and what might 60 | # consider to be good Python "coding style." Modify the code at your 61 | # own risk! 62 | # ---------------------------------------------------------------------------- 63 | 64 | import re 65 | import types 66 | import sys 67 | import inspect 68 | 69 | #----------------------------------------------------------------------------- 70 | # === User configurable parameters === 71 | # 72 | # Change these to modify the default behavior of yacc (if you wish) 73 | #----------------------------------------------------------------------------- 74 | 75 | yaccdebug = False # Debugging mode. If set, yacc generates a 76 | # a 'parser.out' file in the current directory 77 | 78 | debug_file = 'parser.out' # Default name of the debugging file 79 | error_count = 3 # Number of symbols that must be shifted to leave recovery mode 80 | resultlimit = 40 # Size limit of results when running in debug mode. 81 | 82 | MAXINT = sys.maxsize 83 | 84 | # This object is a stand-in for a logging object created by the 85 | # logging module. PLY will use this by default to create things 86 | # such as the parser.out file. If a user wants more detailed 87 | # information, they can create their own logging object and pass 88 | # it into PLY. 89 | 90 | class PlyLogger(object): 91 | def __init__(self, f): 92 | self.f = f 93 | 94 | def debug(self, msg, *args, **kwargs): 95 | self.f.write((msg % args) + '\n') 96 | 97 | info = debug 98 | 99 | def warning(self, msg, *args, **kwargs): 100 | self.f.write('WARNING: ' + (msg % args) + '\n') 101 | 102 | def error(self, msg, *args, **kwargs): 103 | self.f.write('ERROR: ' + (msg % args) + '\n') 104 | 105 | critical = debug 106 | 107 | # Null logger is used when no output is generated. Does nothing. 108 | class NullLogger(object): 109 | def __getattribute__(self, name): 110 | return self 111 | 112 | def __call__(self, *args, **kwargs): 113 | return self 114 | 115 | # Exception raised for yacc-related errors 116 | class YaccError(Exception): 117 | pass 118 | 119 | # Format the result message that the parser produces when running in debug mode. 120 | def format_result(r): 121 | repr_str = repr(r) 122 | if '\n' in repr_str: 123 | repr_str = repr(repr_str) 124 | if len(repr_str) > resultlimit: 125 | repr_str = repr_str[:resultlimit] + ' ...' 126 | result = '<%s @ 0x%x> (%s)' % (type(r).__name__, id(r), repr_str) 127 | return result 128 | 129 | # Format stack entries when the parser is running in debug mode 130 | def format_stack_entry(r): 131 | repr_str = repr(r) 132 | if '\n' in repr_str: 133 | repr_str = repr(repr_str) 134 | if len(repr_str) < 16: 135 | return repr_str 136 | else: 137 | return '<%s @ 0x%x>' % (type(r).__name__, id(r)) 138 | 139 | #----------------------------------------------------------------------------- 140 | # === LR Parsing Engine === 141 | # 142 | # The following classes are used for the LR parser itself. 
These are not 143 | # used during table construction and are independent of the actual LR 144 | # table generation algorithm 145 | #----------------------------------------------------------------------------- 146 | 147 | # This class is used to hold non-terminal grammar symbols during parsing. 148 | # It normally has the following attributes set: 149 | # .type = Grammar symbol type 150 | # .value = Symbol value 151 | # .lineno = Starting line number 152 | # .endlineno = Ending line number (optional, set automatically) 153 | # .lexpos = Starting lex position 154 | # .endlexpos = Ending lex position (optional, set automatically) 155 | 156 | class YaccSymbol: 157 | def __str__(self): 158 | return self.type 159 | 160 | def __repr__(self): 161 | return str(self) 162 | 163 | # This class is a wrapper around the objects actually passed to each 164 | # grammar rule. Index lookup and assignment actually assign the 165 | # .value attribute of the underlying YaccSymbol object. 166 | # The lineno() method returns the line number of a given 167 | # item (or 0 if not defined). The linespan() method returns 168 | # a tuple of (startline,endline) representing the range of lines 169 | # for a symbol. The lexspan() method returns a tuple (lexpos,endlexpos) 170 | # representing the range of positional information for a symbol. 171 | 172 | class YaccProduction: 173 | def __init__(self, s, stack=None): 174 | self.slice = s 175 | self.stack = stack 176 | self.lexer = None 177 | self.parser = None 178 | 179 | def __getitem__(self, n): 180 | if isinstance(n, slice): 181 | return [s.value for s in self.slice[n]] 182 | elif n >= 0: 183 | return self.slice[n].value 184 | else: 185 | return self.stack[n].value 186 | 187 | def __setitem__(self, n, v): 188 | self.slice[n].value = v 189 | 190 | def __getslice__(self, i, j): 191 | return [s.value for s in self.slice[i:j]] 192 | 193 | def __len__(self): 194 | return len(self.slice) 195 | 196 | def lineno(self, n): 197 | return getattr(self.slice[n], 'lineno', 0) 198 | 199 | def set_lineno(self, n, lineno): 200 | self.slice[n].lineno = lineno 201 | 202 | def linespan(self, n): 203 | startline = getattr(self.slice[n], 'lineno', 0) 204 | endline = getattr(self.slice[n], 'endlineno', startline) 205 | return startline, endline 206 | 207 | def lexpos(self, n): 208 | return getattr(self.slice[n], 'lexpos', 0) 209 | 210 | def set_lexpos(self, n, lexpos): 211 | self.slice[n].lexpos = lexpos 212 | 213 | def lexspan(self, n): 214 | startpos = getattr(self.slice[n], 'lexpos', 0) 215 | endpos = getattr(self.slice[n], 'endlexpos', startpos) 216 | return startpos, endpos 217 | 218 | def error(self): 219 | raise SyntaxError 220 | 221 | # ----------------------------------------------------------------------------- 222 | # == LRParser == 223 | # 224 | # The LR Parsing engine. 225 | # ----------------------------------------------------------------------------- 226 | 227 | class LRParser: 228 | def __init__(self, lrtab, errorf): 229 | self.productions = lrtab.lr_productions 230 | self.action = lrtab.lr_action 231 | self.goto = lrtab.lr_goto 232 | self.errorfunc = errorf 233 | self.set_defaulted_states() 234 | self.errorok = True 235 | 236 | def errok(self): 237 | self.errorok = True 238 | 239 | def restart(self): 240 | del self.statestack[:] 241 | del self.symstack[:] 242 | sym = YaccSymbol() 243 | sym.type = '$end' 244 | self.symstack.append(sym) 245 | self.statestack.append(0) 246 | 247 | # Defaulted state support. 
248 | # This method identifies parser states where there is only one possible reduction action. 249 | # For such states, the parser can make a choose to make a rule reduction without consuming 250 | # the next look-ahead token. This delayed invocation of the tokenizer can be useful in 251 | # certain kinds of advanced parsing situations where the lexer and parser interact with 252 | # each other or change states (i.e., manipulation of scope, lexer states, etc.). 253 | # 254 | # See: http://www.gnu.org/software/bison/manual/html_node/Default-Reductions.html#Default-Reductions 255 | def set_defaulted_states(self): 256 | self.defaulted_states = {} 257 | for state, actions in self.action.items(): 258 | rules = list(actions.values()) 259 | if len(rules) == 1 and rules[0] < 0: 260 | self.defaulted_states[state] = rules[0] 261 | 262 | def disable_defaulted_states(self): 263 | self.defaulted_states = {} 264 | 265 | # parse(). 266 | # 267 | # This is the core parsing engine. To operate, it requires a lexer object. 268 | # Two options are provided. The debug flag turns on debugging so that you can 269 | # see the various rule reductions and parsing steps. tracking turns on position 270 | # tracking. In this mode, symbols will record the starting/ending line number and 271 | # character index. 272 | 273 | def parse(self, input=None, lexer=None, debug=False, tracking=False): 274 | # If debugging has been specified as a flag, turn it into a logging object 275 | if isinstance(debug, int) and debug: 276 | debug = PlyLogger(sys.stderr) 277 | 278 | lookahead = None # Current lookahead symbol 279 | lookaheadstack = [] # Stack of lookahead symbols 280 | actions = self.action # Local reference to action table (to avoid lookup on self.) 281 | goto = self.goto # Local reference to goto table (to avoid lookup on self.) 282 | prod = self.productions # Local reference to production list (to avoid lookup on self.) 283 | defaulted_states = self.defaulted_states # Local reference to defaulted states 284 | pslice = YaccProduction(None) # Production object passed to grammar rules 285 | errorcount = 0 # Used during error recovery 286 | 287 | if debug: 288 | debug.info('PLY: PARSE DEBUG START') 289 | 290 | # If no lexer was given, we will try to use the lex module 291 | if not lexer: 292 | from . import lex 293 | lexer = lex.lexer 294 | 295 | # Set up the lexer and parser objects on pslice 296 | pslice.lexer = lexer 297 | pslice.parser = self 298 | 299 | # If input was supplied, pass to lexer 300 | if input is not None: 301 | lexer.input(input) 302 | 303 | # Set the token function 304 | get_token = self.token = lexer.token 305 | 306 | # Set up the state and symbol stacks 307 | statestack = self.statestack = [] # Stack of parsing states 308 | symstack = self.symstack = [] # Stack of grammar symbols 309 | pslice.stack = symstack # Put in the production 310 | errtoken = None # Err token 311 | 312 | # The start state is assumed to be (0,$end) 313 | 314 | statestack.append(0) 315 | sym = YaccSymbol() 316 | sym.type = '$end' 317 | symstack.append(sym) 318 | state = 0 319 | while True: 320 | # Get the next symbol on the input. If a lookahead symbol 321 | # is already set, we just use that. 
Otherwise, we'll pull 322 | # the next token off of the lookaheadstack or from the lexer 323 | 324 | if debug: 325 | debug.debug('State : %s', state) 326 | 327 | if state not in defaulted_states: 328 | if not lookahead: 329 | if not lookaheadstack: 330 | lookahead = get_token() # Get the next token 331 | else: 332 | lookahead = lookaheadstack.pop() 333 | if not lookahead: 334 | lookahead = YaccSymbol() 335 | lookahead.type = '$end' 336 | 337 | # Check the action table 338 | ltype = lookahead.type 339 | t = actions[state].get(ltype) 340 | else: 341 | t = defaulted_states[state] 342 | if debug: 343 | debug.debug('Defaulted state %s: Reduce using %d', state, -t) 344 | 345 | if debug: 346 | debug.debug('Stack : %s', 347 | ('%s . %s' % (' '.join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) 348 | 349 | if t is not None: 350 | if t > 0: 351 | # shift a symbol on the stack 352 | statestack.append(t) 353 | state = t 354 | 355 | if debug: 356 | debug.debug('Action : Shift and goto state %s', t) 357 | 358 | symstack.append(lookahead) 359 | lookahead = None 360 | 361 | # Decrease error count on successful shift 362 | if errorcount: 363 | errorcount -= 1 364 | continue 365 | 366 | if t < 0: 367 | # reduce a symbol on the stack, emit a production 368 | p = prod[-t] 369 | pname = p.name 370 | plen = p.len 371 | 372 | # Get production function 373 | sym = YaccSymbol() 374 | sym.type = pname # Production name 375 | sym.value = None 376 | 377 | if debug: 378 | if plen: 379 | debug.info('Action : Reduce rule [%s] with %s and goto state %d', p.str, 380 | '['+','.join([format_stack_entry(_v.value) for _v in symstack[-plen:]])+']', 381 | goto[statestack[-1-plen]][pname]) 382 | else: 383 | debug.info('Action : Reduce rule [%s] with %s and goto state %d', p.str, [], 384 | goto[statestack[-1]][pname]) 385 | 386 | if plen: 387 | targ = symstack[-plen-1:] 388 | targ[0] = sym 389 | 390 | if tracking: 391 | t1 = targ[1] 392 | sym.lineno = t1.lineno 393 | sym.lexpos = t1.lexpos 394 | t1 = targ[-1] 395 | sym.endlineno = getattr(t1, 'endlineno', t1.lineno) 396 | sym.endlexpos = getattr(t1, 'endlexpos', t1.lexpos) 397 | 398 | # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 399 | # The code enclosed in this section is duplicated 400 | # below as a performance optimization. Make sure 401 | # changes get made in both locations. 402 | 403 | pslice.slice = targ 404 | 405 | try: 406 | # Call the grammar rule with our special slice object 407 | del symstack[-plen:] 408 | self.state = state 409 | p.callable(pslice) 410 | del statestack[-plen:] 411 | if debug: 412 | debug.info('Result : %s', format_result(pslice[0])) 413 | symstack.append(sym) 414 | state = goto[statestack[-1]][pname] 415 | statestack.append(state) 416 | except SyntaxError: 417 | # If an error was set. Enter error recovery state 418 | lookaheadstack.append(lookahead) # Save the current lookahead token 419 | symstack.extend(targ[1:-1]) # Put the production slice back on the stack 420 | statestack.pop() # Pop back one state (before the reduce) 421 | state = statestack[-1] 422 | sym.type = 'error' 423 | sym.value = 'error' 424 | lookahead = sym 425 | errorcount = error_count 426 | self.errorok = False 427 | 428 | continue 429 | 430 | else: 431 | 432 | if tracking: 433 | sym.lineno = lexer.lineno 434 | sym.lexpos = lexer.lexpos 435 | 436 | targ = [sym] 437 | 438 | # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 439 | # The code enclosed in this section is duplicated 440 | # above as a performance optimization. 
Make sure 441 | # changes get made in both locations. 442 | 443 | pslice.slice = targ 444 | 445 | try: 446 | # Call the grammar rule with our special slice object 447 | self.state = state 448 | p.callable(pslice) 449 | if debug: 450 | debug.info('Result : %s', format_result(pslice[0])) 451 | symstack.append(sym) 452 | state = goto[statestack[-1]][pname] 453 | statestack.append(state) 454 | except SyntaxError: 455 | # If an error was set. Enter error recovery state 456 | lookaheadstack.append(lookahead) # Save the current lookahead token 457 | statestack.pop() # Pop back one state (before the reduce) 458 | state = statestack[-1] 459 | sym.type = 'error' 460 | sym.value = 'error' 461 | lookahead = sym 462 | errorcount = error_count 463 | self.errorok = False 464 | 465 | continue 466 | 467 | if t == 0: 468 | n = symstack[-1] 469 | result = getattr(n, 'value', None) 470 | 471 | if debug: 472 | debug.info('Done : Returning %s', format_result(result)) 473 | debug.info('PLY: PARSE DEBUG END') 474 | 475 | return result 476 | 477 | if t is None: 478 | 479 | if debug: 480 | debug.error('Error : %s', 481 | ('%s . %s' % (' '.join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) 482 | 483 | # We have some kind of parsing error here. To handle 484 | # this, we are going to push the current token onto 485 | # the tokenstack and replace it with an 'error' token. 486 | # If there are any synchronization rules, they may 487 | # catch it. 488 | # 489 | # In addition to pushing the error token, we call call 490 | # the user defined p_error() function if this is the 491 | # first syntax error. This function is only called if 492 | # errorcount == 0. 493 | if errorcount == 0 or self.errorok: 494 | errorcount = error_count 495 | self.errorok = False 496 | errtoken = lookahead 497 | if errtoken.type == '$end': 498 | errtoken = None # End of file! 499 | if self.errorfunc: 500 | if errtoken and not hasattr(errtoken, 'lexer'): 501 | errtoken.lexer = lexer 502 | self.state = state 503 | tok = self.errorfunc(errtoken) 504 | if self.errorok: 505 | # User must have done some kind of panic 506 | # mode recovery on their own. The 507 | # returned token is the next lookahead 508 | lookahead = tok 509 | errtoken = None 510 | continue 511 | else: 512 | if errtoken: 513 | if hasattr(errtoken, 'lineno'): 514 | lineno = lookahead.lineno 515 | else: 516 | lineno = 0 517 | if lineno: 518 | sys.stderr.write('yacc: Syntax error at line %d, token=%s\n' % (lineno, errtoken.type)) 519 | else: 520 | sys.stderr.write('yacc: Syntax error, token=%s' % errtoken.type) 521 | else: 522 | sys.stderr.write('yacc: Parse error in input. EOF\n') 523 | return 524 | 525 | else: 526 | errorcount = error_count 527 | 528 | # case 1: the statestack only has 1 entry on it. If we're in this state, the 529 | # entire parse has been rolled back and we're completely hosed. The token is 530 | # discarded and we just keep going. 531 | 532 | if len(statestack) <= 1 and lookahead.type != '$end': 533 | lookahead = None 534 | errtoken = None 535 | state = 0 536 | # Nuke the pushback stack 537 | del lookaheadstack[:] 538 | continue 539 | 540 | # case 2: the statestack has a couple of entries on it, but we're 541 | # at the end of the file. nuke the top entry and generate an error token 542 | 543 | # Start nuking entries on the stack 544 | if lookahead.type == '$end': 545 | # Whoa. We're really hosed here. Bail out 546 | return 547 | 548 | if lookahead.type != 'error': 549 | sym = symstack[-1] 550 | if sym.type == 'error': 551 | # Hmmm. 
Error is on top of stack, we'll just nuke input 552 | # symbol and continue 553 | if tracking: 554 | sym.endlineno = getattr(lookahead, 'lineno', sym.lineno) 555 | sym.endlexpos = getattr(lookahead, 'lexpos', sym.lexpos) 556 | lookahead = None 557 | continue 558 | 559 | # Create the error symbol for the first time and make it the new lookahead symbol 560 | t = YaccSymbol() 561 | t.type = 'error' 562 | 563 | if hasattr(lookahead, 'lineno'): 564 | t.lineno = t.endlineno = lookahead.lineno 565 | if hasattr(lookahead, 'lexpos'): 566 | t.lexpos = t.endlexpos = lookahead.lexpos 567 | t.value = lookahead 568 | lookaheadstack.append(lookahead) 569 | lookahead = t 570 | else: 571 | sym = symstack.pop() 572 | if tracking: 573 | lookahead.lineno = sym.lineno 574 | lookahead.lexpos = sym.lexpos 575 | statestack.pop() 576 | state = statestack[-1] 577 | 578 | continue 579 | 580 | # If we'r here, something really bad happened 581 | raise RuntimeError('yacc: internal parser error!!!\n') 582 | 583 | # ----------------------------------------------------------------------------- 584 | # === Grammar Representation === 585 | # 586 | # The following functions, classes, and variables are used to represent and 587 | # manipulate the rules that make up a grammar. 588 | # ----------------------------------------------------------------------------- 589 | 590 | # regex matching identifiers 591 | _is_identifier = re.compile(r'^[a-zA-Z0-9_-]+$') 592 | 593 | # ----------------------------------------------------------------------------- 594 | # class Production: 595 | # 596 | # This class stores the raw information about a single production or grammar rule. 597 | # A grammar rule refers to a specification such as this: 598 | # 599 | # expr : expr PLUS term 600 | # 601 | # Here are the basic attributes defined on all productions 602 | # 603 | # name - Name of the production. For example 'expr' 604 | # prod - A list of symbols on the right side ['expr','PLUS','term'] 605 | # prec - Production precedence level 606 | # number - Production number. 607 | # func - Function that executes on reduce 608 | # file - File where production function is defined 609 | # lineno - Line number where production function is defined 610 | # 611 | # The following attributes are defined or optional. 
612 | # 613 | # len - Length of the production (number of symbols on right hand side) 614 | # usyms - Set of unique symbols found in the production 615 | # ----------------------------------------------------------------------------- 616 | 617 | class Production(object): 618 | reduced = 0 619 | def __init__(self, number, name, prod, precedence=('right', 0), func=None, file='', line=0): 620 | self.name = name 621 | self.prod = tuple(prod) 622 | self.number = number 623 | self.func = func 624 | self.callable = None 625 | self.file = file 626 | self.line = line 627 | self.prec = precedence 628 | 629 | # Internal settings used during table construction 630 | 631 | self.len = len(self.prod) # Length of the production 632 | 633 | # Create a list of unique production symbols used in the production 634 | self.usyms = [] 635 | for s in self.prod: 636 | if s not in self.usyms: 637 | self.usyms.append(s) 638 | 639 | # List of all LR items for the production 640 | self.lr_items = [] 641 | self.lr_next = None 642 | 643 | # Create a string representation 644 | if self.prod: 645 | self.str = '%s -> %s' % (self.name, ' '.join(self.prod)) 646 | else: 647 | self.str = '%s -> ' % self.name 648 | 649 | def __str__(self): 650 | return self.str 651 | 652 | def __repr__(self): 653 | return 'Production(' + str(self) + ')' 654 | 655 | def __len__(self): 656 | return len(self.prod) 657 | 658 | def __nonzero__(self): 659 | return 1 660 | 661 | def __getitem__(self, index): 662 | return self.prod[index] 663 | 664 | # Return the nth lr_item from the production (or None if at the end) 665 | def lr_item(self, n): 666 | if n > len(self.prod): 667 | return None 668 | p = LRItem(self, n) 669 | # Precompute the list of productions immediately following. 670 | try: 671 | p.lr_after = self.Prodnames[p.prod[n+1]] 672 | except (IndexError, KeyError): 673 | p.lr_after = [] 674 | try: 675 | p.lr_before = p.prod[n-1] 676 | except IndexError: 677 | p.lr_before = None 678 | return p 679 | 680 | # Bind the production function name to a callable 681 | def bind(self, pdict): 682 | if self.func: 683 | self.callable = pdict[self.func] 684 | 685 | # ----------------------------------------------------------------------------- 686 | # class LRItem 687 | # 688 | # This class represents a specific stage of parsing a production rule. For 689 | # example: 690 | # 691 | # expr : expr . PLUS term 692 | # 693 | # In the above, the "." represents the current location of the parse. Here 694 | # basic attributes: 695 | # 696 | # name - Name of the production. For example 'expr' 697 | # prod - A list of symbols on the right side ['expr','.', 'PLUS','term'] 698 | # number - Production number. 699 | # 700 | # lr_next Next LR item. Example, if we are ' expr -> expr . PLUS term' 701 | # then lr_next refers to 'expr -> expr PLUS . term' 702 | # lr_index - LR item index (location of the ".") in the prod list. 
703 | # lookaheads - LALR lookahead symbols for this item 704 | # len - Length of the production (number of symbols on right hand side) 705 | # lr_after - List of all productions that immediately follow 706 | # lr_before - Grammar symbol immediately before 707 | # ----------------------------------------------------------------------------- 708 | 709 | class LRItem(object): 710 | def __init__(self, p, n): 711 | self.name = p.name 712 | self.prod = list(p.prod) 713 | self.number = p.number 714 | self.lr_index = n 715 | self.lookaheads = {} 716 | self.prod.insert(n, '.') 717 | self.prod = tuple(self.prod) 718 | self.len = len(self.prod) 719 | self.usyms = p.usyms 720 | 721 | def __str__(self): 722 | if self.prod: 723 | s = '%s -> %s' % (self.name, ' '.join(self.prod)) 724 | else: 725 | s = '%s -> ' % self.name 726 | return s 727 | 728 | def __repr__(self): 729 | return 'LRItem(' + str(self) + ')' 730 | 731 | # ----------------------------------------------------------------------------- 732 | # rightmost_terminal() 733 | # 734 | # Return the rightmost terminal from a list of symbols. Used in add_production() 735 | # ----------------------------------------------------------------------------- 736 | def rightmost_terminal(symbols, terminals): 737 | i = len(symbols) - 1 738 | while i >= 0: 739 | if symbols[i] in terminals: 740 | return symbols[i] 741 | i -= 1 742 | return None 743 | 744 | # ----------------------------------------------------------------------------- 745 | # === GRAMMAR CLASS === 746 | # 747 | # The following class represents the contents of the specified grammar along 748 | # with various computed properties such as first sets, follow sets, LR items, etc. 749 | # This data is used for critical parts of the table generation process later. 750 | # ----------------------------------------------------------------------------- 751 | 752 | class GrammarError(YaccError): 753 | pass 754 | 755 | class Grammar(object): 756 | def __init__(self, terminals): 757 | self.Productions = [None] # A list of all of the productions. The first 758 | # entry is always reserved for the purpose of 759 | # building an augmented grammar 760 | 761 | self.Prodnames = {} # A dictionary mapping the names of nonterminals to a list of all 762 | # productions of that nonterminal. 763 | 764 | self.Prodmap = {} # A dictionary that is only used to detect duplicate 765 | # productions. 766 | 767 | self.Terminals = {} # A dictionary mapping the names of terminal symbols to a 768 | # list of the rules where they are used. 769 | 770 | for term in terminals: 771 | self.Terminals[term] = [] 772 | 773 | self.Terminals['error'] = [] 774 | 775 | self.Nonterminals = {} # A dictionary mapping names of nonterminals to a list 776 | # of rule numbers where they are used. 777 | 778 | self.First = {} # A dictionary of precomputed FIRST(x) symbols 779 | 780 | self.Follow = {} # A dictionary of precomputed FOLLOW(x) symbols 781 | 782 | self.Precedence = {} # Precedence rules for each terminal. Contains tuples of the 783 | # form ('right',level) or ('nonassoc', level) or ('left',level) 784 | 785 | self.UsedPrecedence = set() # Precedence rules that were actually used by the grammer. 786 | # This is only used to provide error checking and to generate 787 | # a warning about unused precedence rules. 
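# As an illustration (toy rules, not the grammar defined elsewhere in this
# plugin): after adding
#     expr : expr PLUS term
#     expr : term
# Prodnames['expr'] holds both Production objects, Terminals['PLUS'] records the
# numbers of the rules that use PLUS, and Nonterminals['term'] records the rules
# in which 'term' appears on a right-hand side.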
788 | 789 | self.Start = None # Starting symbol for the grammar 790 | 791 | 792 | def __len__(self): 793 | return len(self.Productions) 794 | 795 | def __getitem__(self, index): 796 | return self.Productions[index] 797 | 798 | # ----------------------------------------------------------------------------- 799 | # set_precedence() 800 | # 801 | # Sets the precedence for a given terminal. assoc is the associativity such as 802 | # 'left','right', or 'nonassoc'. level is a numeric level. 803 | # 804 | # ----------------------------------------------------------------------------- 805 | 806 | def set_precedence(self, term, assoc, level): 807 | assert self.Productions == [None], 'Must call set_precedence() before add_production()' 808 | if term in self.Precedence: 809 | raise GrammarError('Precedence already specified for terminal %r' % term) 810 | if assoc not in ['left', 'right', 'nonassoc']: 811 | raise GrammarError("Associativity must be one of 'left','right', or 'nonassoc'") 812 | self.Precedence[term] = (assoc, level) 813 | 814 | # ----------------------------------------------------------------------------- 815 | # add_production() 816 | # 817 | # Given an action function, this function assembles a production rule and 818 | # computes its precedence level. 819 | # 820 | # The production rule is supplied as a list of symbols. For example, 821 | # a rule such as 'expr : expr PLUS term' has a production name of 'expr' and 822 | # symbols ['expr','PLUS','term']. 823 | # 824 | # Precedence is determined by the precedence of the right-most non-terminal 825 | # or the precedence of a terminal specified by %prec. 826 | # 827 | # A variety of error checks are performed to make sure production symbols 828 | # are valid and that %prec is used correctly. 829 | # ----------------------------------------------------------------------------- 830 | 831 | def add_production(self, prodname, syms, func=None, file='', line=0): 832 | 833 | if prodname in self.Terminals: 834 | raise GrammarError('%s:%d: Illegal rule name %r. Already defined as a token' % (file, line, prodname)) 835 | if prodname == 'error': 836 | raise GrammarError('%s:%d: Illegal rule name %r. error is a reserved word' % (file, line, prodname)) 837 | if not _is_identifier.match(prodname): 838 | raise GrammarError('%s:%d: Illegal rule name %r' % (file, line, prodname)) 839 | 840 | # Look for literal tokens 841 | for n, s in enumerate(syms): 842 | if s[0] in "'\"": 843 | try: 844 | c = eval(s) 845 | if (len(c) > 1): 846 | raise GrammarError('%s:%d: Literal token %s in rule %r may only be a single character' % 847 | (file, line, s, prodname)) 848 | if c not in self.Terminals: 849 | self.Terminals[c] = [] 850 | syms[n] = c 851 | continue 852 | except SyntaxError: 853 | pass 854 | if not _is_identifier.match(s) and s != '%prec': 855 | raise GrammarError('%s:%d: Illegal name %r in rule %r' % (file, line, s, prodname)) 856 | 857 | # Determine the precedence level 858 | if '%prec' in syms: 859 | if syms[-1] == '%prec': 860 | raise GrammarError('%s:%d: Syntax error. Nothing follows %%prec' % (file, line)) 861 | if syms[-2] != '%prec': 862 | raise GrammarError('%s:%d: Syntax error. 
%%prec can only appear at the end of a grammar rule' % 863 | (file, line)) 864 | precname = syms[-1] 865 | prodprec = self.Precedence.get(precname) 866 | if not prodprec: 867 | raise GrammarError('%s:%d: Nothing known about the precedence of %r' % (file, line, precname)) 868 | else: 869 | self.UsedPrecedence.add(precname) 870 | del syms[-2:] # Drop %prec from the rule 871 | else: 872 | # If no %prec, precedence is determined by the rightmost terminal symbol 873 | precname = rightmost_terminal(syms, self.Terminals) 874 | prodprec = self.Precedence.get(precname, ('right', 0)) 875 | 876 | # See if the rule is already in the rulemap 877 | map = '%s -> %s' % (prodname, syms) 878 | if map in self.Prodmap: 879 | m = self.Prodmap[map] 880 | raise GrammarError('%s:%d: Duplicate rule %s. ' % (file, line, m) + 881 | 'Previous definition at %s:%d' % (m.file, m.line)) 882 | 883 | # From this point on, everything is valid. Create a new Production instance 884 | pnumber = len(self.Productions) 885 | if prodname not in self.Nonterminals: 886 | self.Nonterminals[prodname] = [] 887 | 888 | # Add the production number to Terminals and Nonterminals 889 | for t in syms: 890 | if t in self.Terminals: 891 | self.Terminals[t].append(pnumber) 892 | else: 893 | if t not in self.Nonterminals: 894 | self.Nonterminals[t] = [] 895 | self.Nonterminals[t].append(pnumber) 896 | 897 | # Create a production and add it to the list of productions 898 | p = Production(pnumber, prodname, syms, prodprec, func, file, line) 899 | self.Productions.append(p) 900 | self.Prodmap[map] = p 901 | 902 | # Add to the global productions list 903 | try: 904 | self.Prodnames[prodname].append(p) 905 | except KeyError: 906 | self.Prodnames[prodname] = [p] 907 | 908 | # ----------------------------------------------------------------------------- 909 | # set_start() 910 | # 911 | # Sets the starting symbol and creates the augmented grammar. Production 912 | # rule 0 is S' -> start where start is the start symbol. 913 | # ----------------------------------------------------------------------------- 914 | 915 | def set_start(self, start=None): 916 | if not start: 917 | start = self.Productions[1].name 918 | if start not in self.Nonterminals: 919 | raise GrammarError('start symbol %s undefined' % start) 920 | self.Productions[0] = Production(0, "S'", [start]) 921 | self.Nonterminals[start].append(0) 922 | self.Start = start 923 | 924 | # ----------------------------------------------------------------------------- 925 | # find_unreachable() 926 | # 927 | # Find all of the nonterminal symbols that can't be reached from the starting 928 | # symbol. Returns a list of nonterminals that can't be reached. 929 | # ----------------------------------------------------------------------------- 930 | 931 | def find_unreachable(self): 932 | 933 | # Mark all symbols that are reachable from a symbol s 934 | def mark_reachable_from(s): 935 | if s in reachable: 936 | return 937 | reachable.add(s) 938 | for p in self.Prodnames.get(s, []): 939 | for r in p.prod: 940 | mark_reachable_from(r) 941 | 942 | reachable = set() 943 | mark_reachable_from(self.Productions[0].prod[0]) 944 | return [s for s in self.Nonterminals if s not in reachable] 945 | 946 | # ----------------------------------------------------------------------------- 947 | # infinite_cycles() 948 | # 949 | # This function looks at the various parsing rules and tries to detect 950 | # infinite recursion cycles (grammar rules where there is no possible way 951 | # to derive a string of only terminals). 
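# For example, a nonterminal whose only rule is
#     a : a PLUS a
# can never derive a string made up solely of terminals, so 'a' is reported by
# this check; adding a terminating alternative such as 'a : NAME' resolves it.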
952 | # ----------------------------------------------------------------------------- 953 | 954 | def infinite_cycles(self): 955 | terminates = {} 956 | 957 | # Terminals: 958 | for t in self.Terminals: 959 | terminates[t] = True 960 | 961 | terminates['$end'] = True 962 | 963 | # Nonterminals: 964 | 965 | # Initialize to false: 966 | for n in self.Nonterminals: 967 | terminates[n] = False 968 | 969 | # Then propagate termination until no change: 970 | while True: 971 | some_change = False 972 | for (n, pl) in self.Prodnames.items(): 973 | # Nonterminal n terminates iff any of its productions terminates. 974 | for p in pl: 975 | # Production p terminates iff all of its rhs symbols terminate. 976 | for s in p.prod: 977 | if not terminates[s]: 978 | # The symbol s does not terminate, 979 | # so production p does not terminate. 980 | p_terminates = False 981 | break 982 | else: 983 | # didn't break from the loop, 984 | # so every symbol s terminates 985 | # so production p terminates. 986 | p_terminates = True 987 | 988 | if p_terminates: 989 | # symbol n terminates! 990 | if not terminates[n]: 991 | terminates[n] = True 992 | some_change = True 993 | # Don't need to consider any more productions for this n. 994 | break 995 | 996 | if not some_change: 997 | break 998 | 999 | infinite = [] 1000 | for (s, term) in terminates.items(): 1001 | if not term: 1002 | if s not in self.Prodnames and s not in self.Terminals and s != 'error': 1003 | # s is used-but-not-defined, and we've already warned of that, 1004 | # so it would be overkill to say that it's also non-terminating. 1005 | pass 1006 | else: 1007 | infinite.append(s) 1008 | 1009 | return infinite 1010 | 1011 | # ----------------------------------------------------------------------------- 1012 | # undefined_symbols() 1013 | # 1014 | # Find all symbols that were used the grammar, but not defined as tokens or 1015 | # grammar rules. Returns a list of tuples (sym, prod) where sym in the symbol 1016 | # and prod is the production where the symbol was used. 1017 | # ----------------------------------------------------------------------------- 1018 | def undefined_symbols(self): 1019 | result = [] 1020 | for p in self.Productions: 1021 | if not p: 1022 | continue 1023 | 1024 | for s in p.prod: 1025 | if s not in self.Prodnames and s not in self.Terminals and s != 'error': 1026 | result.append((s, p)) 1027 | return result 1028 | 1029 | # ----------------------------------------------------------------------------- 1030 | # unused_terminals() 1031 | # 1032 | # Find all terminals that were defined, but not used by the grammar. Returns 1033 | # a list of all symbols. 1034 | # ----------------------------------------------------------------------------- 1035 | def unused_terminals(self): 1036 | unused_tok = [] 1037 | for s, v in self.Terminals.items(): 1038 | if s != 'error' and not v: 1039 | unused_tok.append(s) 1040 | 1041 | return unused_tok 1042 | 1043 | # ------------------------------------------------------------------------------ 1044 | # unused_rules() 1045 | # 1046 | # Find all grammar rules that were defined, but not used (maybe not reachable) 1047 | # Returns a list of productions. 
1048 | # ------------------------------------------------------------------------------ 1049 | 1050 | def unused_rules(self): 1051 | unused_prod = [] 1052 | for s, v in self.Nonterminals.items(): 1053 | if not v: 1054 | p = self.Prodnames[s][0] 1055 | unused_prod.append(p) 1056 | return unused_prod 1057 | 1058 | # ----------------------------------------------------------------------------- 1059 | # unused_precedence() 1060 | # 1061 | # Returns a list of tuples (term,precedence) corresponding to precedence 1062 | # rules that were never used by the grammar. term is the name of the terminal 1063 | # on which precedence was applied and precedence is a string such as 'left' or 1064 | # 'right' corresponding to the type of precedence. 1065 | # ----------------------------------------------------------------------------- 1066 | 1067 | def unused_precedence(self): 1068 | unused = [] 1069 | for termname in self.Precedence: 1070 | if not (termname in self.Terminals or termname in self.UsedPrecedence): 1071 | unused.append((termname, self.Precedence[termname][0])) 1072 | 1073 | return unused 1074 | 1075 | # ------------------------------------------------------------------------- 1076 | # _first() 1077 | # 1078 | # Compute the value of FIRST1(beta) where beta is a tuple of symbols. 1079 | # 1080 | # During execution of compute_first1, the result may be incomplete. 1081 | # Afterward (e.g., when called from compute_follow()), it will be complete. 1082 | # ------------------------------------------------------------------------- 1083 | def _first(self, beta): 1084 | 1085 | # We are computing First(x1,x2,x3,...,xn) 1086 | result = [] 1087 | for x in beta: 1088 | x_produces_empty = False 1089 | 1090 | # Add all the non- symbols of First[x] to the result. 1091 | for f in self.First[x]: 1092 | if f == '': 1093 | x_produces_empty = True 1094 | else: 1095 | if f not in result: 1096 | result.append(f) 1097 | 1098 | if x_produces_empty: 1099 | # We have to consider the next x in beta, 1100 | # i.e. stay in the loop. 1101 | pass 1102 | else: 1103 | # We don't have to consider any further symbols in beta. 1104 | break 1105 | else: 1106 | # There was no 'break' from the loop, 1107 | # so x_produces_empty was true for all x in beta, 1108 | # so beta produces empty as well. 1109 | result.append('') 1110 | 1111 | return result 1112 | 1113 | # ------------------------------------------------------------------------- 1114 | # compute_first() 1115 | # 1116 | # Compute the value of FIRST1(X) for all symbols 1117 | # ------------------------------------------------------------------------- 1118 | def compute_first(self): 1119 | if self.First: 1120 | return self.First 1121 | 1122 | # Terminals: 1123 | for t in self.Terminals: 1124 | self.First[t] = [t] 1125 | 1126 | self.First['$end'] = ['$end'] 1127 | 1128 | # Nonterminals: 1129 | 1130 | # Initialize to the empty set: 1131 | for n in self.Nonterminals: 1132 | self.First[n] = [] 1133 | 1134 | # Then propagate symbols until no change: 1135 | while True: 1136 | some_change = False 1137 | for n in self.Nonterminals: 1138 | for p in self.Prodnames[n]: 1139 | for f in self._first(p.prod): 1140 | if f not in self.First[n]: 1141 | self.First[n].append(f) 1142 | some_change = True 1143 | if not some_change: 1144 | break 1145 | 1146 | return self.First 1147 | 1148 | # --------------------------------------------------------------------- 1149 | # compute_follow() 1150 | # 1151 | # Computes all of the follow sets for every non-terminal symbol. 
The 1152 | # follow set is the set of all symbols that might follow a given 1153 | # non-terminal. See the Dragon book, 2nd Ed. p. 189. 1154 | # --------------------------------------------------------------------- 1155 | def compute_follow(self, start=None): 1156 | # If already computed, return the result 1157 | if self.Follow: 1158 | return self.Follow 1159 | 1160 | # If first sets not computed yet, do that first. 1161 | if not self.First: 1162 | self.compute_first() 1163 | 1164 | # Add '$end' to the follow list of the start symbol 1165 | for k in self.Nonterminals: 1166 | self.Follow[k] = [] 1167 | 1168 | if not start: 1169 | start = self.Productions[1].name 1170 | 1171 | self.Follow[start] = ['$end'] 1172 | 1173 | while True: 1174 | didadd = False 1175 | for p in self.Productions[1:]: 1176 | # Here is the production set 1177 | for i, B in enumerate(p.prod): 1178 | if B in self.Nonterminals: 1179 | # Okay. We got a non-terminal in a production 1180 | fst = self._first(p.prod[i+1:]) 1181 | hasempty = False 1182 | for f in fst: 1183 | if f != '' and f not in self.Follow[B]: 1184 | self.Follow[B].append(f) 1185 | didadd = True 1186 | if f == '': 1187 | hasempty = True 1188 | if hasempty or i == (len(p.prod)-1): 1189 | # Add elements of follow(a) to follow(b) 1190 | for f in self.Follow[p.name]: 1191 | if f not in self.Follow[B]: 1192 | self.Follow[B].append(f) 1193 | didadd = True 1194 | if not didadd: 1195 | break 1196 | return self.Follow 1197 | 1198 | 1199 | # ----------------------------------------------------------------------------- 1200 | # build_lritems() 1201 | # 1202 | # This function walks the list of productions and builds a complete set of the 1203 | # LR items. The LR items are stored in two ways: First, they are uniquely 1204 | # numbered and placed in the list _lritems. Second, a linked list of LR items 1205 | # is built for each production. For example: 1206 | # 1207 | # E -> E PLUS E 1208 | # 1209 | # Creates the list 1210 | # 1211 | # [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ] 1212 | # ----------------------------------------------------------------------------- 1213 | 1214 | def build_lritems(self): 1215 | for p in self.Productions: 1216 | lastlri = p 1217 | i = 0 1218 | lr_items = [] 1219 | while True: 1220 | if i > len(p): 1221 | lri = None 1222 | else: 1223 | lri = LRItem(p, i) 1224 | # Precompute the list of productions immediately following 1225 | try: 1226 | lri.lr_after = self.Prodnames[lri.prod[i+1]] 1227 | except (IndexError, KeyError): 1228 | lri.lr_after = [] 1229 | try: 1230 | lri.lr_before = lri.prod[i-1] 1231 | except IndexError: 1232 | lri.lr_before = None 1233 | 1234 | lastlri.lr_next = lri 1235 | if not lri: 1236 | break 1237 | lr_items.append(lri) 1238 | lastlri = lri 1239 | i += 1 1240 | p.lr_items = lr_items 1241 | 1242 | # ----------------------------------------------------------------------------- 1243 | # === LR Generator === 1244 | # 1245 | # The following classes and functions are used to generate LR parsing tables on 1246 | # a grammar. 
1247 | # ----------------------------------------------------------------------------- 1248 | 1249 | # ----------------------------------------------------------------------------- 1250 | # digraph() 1251 | # traverse() 1252 | # 1253 | # The following two functions are used to compute set valued functions 1254 | # of the form: 1255 | # 1256 | # F(x) = F'(x) U U{F(y) | x R y} 1257 | # 1258 | # This is used to compute the values of Read() sets as well as FOLLOW sets 1259 | # in LALR(1) generation. 1260 | # 1261 | # Inputs: X - An input set 1262 | # R - A relation 1263 | # FP - Set-valued function 1264 | # ------------------------------------------------------------------------------ 1265 | 1266 | def digraph(X, R, FP): 1267 | N = {} 1268 | for x in X: 1269 | N[x] = 0 1270 | stack = [] 1271 | F = {} 1272 | for x in X: 1273 | if N[x] == 0: 1274 | traverse(x, N, stack, F, X, R, FP) 1275 | return F 1276 | 1277 | def traverse(x, N, stack, F, X, R, FP): 1278 | stack.append(x) 1279 | d = len(stack) 1280 | N[x] = d 1281 | F[x] = FP(x) # F(X) <- F'(x) 1282 | 1283 | rel = R(x) # Get y's related to x 1284 | for y in rel: 1285 | if N[y] == 0: 1286 | traverse(y, N, stack, F, X, R, FP) 1287 | N[x] = min(N[x], N[y]) 1288 | for a in F.get(y, []): 1289 | if a not in F[x]: 1290 | F[x].append(a) 1291 | if N[x] == d: 1292 | N[stack[-1]] = MAXINT 1293 | F[stack[-1]] = F[x] 1294 | element = stack.pop() 1295 | while element != x: 1296 | N[stack[-1]] = MAXINT 1297 | F[stack[-1]] = F[x] 1298 | element = stack.pop() 1299 | 1300 | class LALRError(YaccError): 1301 | pass 1302 | 1303 | 1304 | # ----------------------------------------------------------------------------- 1305 | # == LRTable == 1306 | # 1307 | # This class implements the LR table generation algorithm. There are no 1308 | # public methods. 1309 | # ----------------------------------------------------------------------------- 1310 | 1311 | class LRTable: 1312 | def __init__(self, grammar, log=None): 1313 | self.grammar = grammar 1314 | 1315 | # Set up the logger 1316 | if not log: 1317 | log = NullLogger() 1318 | self.log = log 1319 | 1320 | # Internal attributes 1321 | self.lr_action = {} # Action table 1322 | self.lr_goto = {} # Goto table 1323 | self.lr_productions = grammar.Productions # Copy of grammar Production array 1324 | self.lr_goto_cache = {} # Cache of computed gotos 1325 | self.lr0_cidhash = {} # Cache of closures 1326 | 1327 | self._add_count = 0 # Internal counter used to detect cycles 1328 | 1329 | # Diagnostic information filled in by the table generator 1330 | self.sr_conflict = 0 1331 | self.rr_conflict = 0 1332 | self.conflicts = [] # List of conflicts 1333 | 1334 | self.sr_conflicts = [] 1335 | self.rr_conflicts = [] 1336 | 1337 | # Build the tables 1338 | self.grammar.build_lritems() 1339 | self.grammar.compute_first() 1340 | self.grammar.compute_follow() 1341 | self.lr_parse_table() 1342 | 1343 | # Bind all production function names to callable objects in pdict 1344 | def bind_callables(self, pdict): 1345 | for p in self.lr_productions: 1346 | p.bind(pdict) 1347 | 1348 | # Compute the LR(0) closure operation on I, where I is a set of LR(0) items. 
1349 | 1350 | def lr0_closure(self, I): 1351 | self._add_count += 1 1352 | 1353 | # Add everything in I to J 1354 | J = I[:] 1355 | didadd = True 1356 | while didadd: 1357 | didadd = False 1358 | for j in J: 1359 | for x in j.lr_after: 1360 | if getattr(x, 'lr0_added', 0) == self._add_count: 1361 | continue 1362 | # Add B --> .G to J 1363 | J.append(x.lr_next) 1364 | x.lr0_added = self._add_count 1365 | didadd = True 1366 | 1367 | return J 1368 | 1369 | # Compute the LR(0) goto function goto(I,X) where I is a set 1370 | # of LR(0) items and X is a grammar symbol. This function is written 1371 | # in a way that guarantees uniqueness of the generated goto sets 1372 | # (i.e. the same goto set will never be returned as two different Python 1373 | # objects). With uniqueness, we can later do fast set comparisons using 1374 | # id(obj) instead of element-wise comparison. 1375 | 1376 | def lr0_goto(self, I, x): 1377 | # First we look for a previously cached entry 1378 | g = self.lr_goto_cache.get((id(I), x)) 1379 | if g: 1380 | return g 1381 | 1382 | # Now we generate the goto set in a way that guarantees uniqueness 1383 | # of the result 1384 | 1385 | s = self.lr_goto_cache.get(x) 1386 | if not s: 1387 | s = {} 1388 | self.lr_goto_cache[x] = s 1389 | 1390 | gs = [] 1391 | for p in I: 1392 | n = p.lr_next 1393 | if n and n.lr_before == x: 1394 | s1 = s.get(id(n)) 1395 | if not s1: 1396 | s1 = {} 1397 | s[id(n)] = s1 1398 | gs.append(n) 1399 | s = s1 1400 | g = s.get('$end') 1401 | if not g: 1402 | if gs: 1403 | g = self.lr0_closure(gs) 1404 | s['$end'] = g 1405 | else: 1406 | s['$end'] = gs 1407 | self.lr_goto_cache[(id(I), x)] = g 1408 | return g 1409 | 1410 | # Compute the LR(0) sets of item function 1411 | def lr0_items(self): 1412 | C = [self.lr0_closure([self.grammar.Productions[0].lr_next])] 1413 | i = 0 1414 | for I in C: 1415 | self.lr0_cidhash[id(I)] = i 1416 | i += 1 1417 | 1418 | # Loop over the items in C and each grammar symbols 1419 | i = 0 1420 | while i < len(C): 1421 | I = C[i] 1422 | i += 1 1423 | 1424 | # Collect all of the symbols that could possibly be in the goto(I,X) sets 1425 | asyms = {} 1426 | for ii in I: 1427 | for s in ii.usyms: 1428 | asyms[s] = None 1429 | 1430 | for x in asyms: 1431 | g = self.lr0_goto(I, x) 1432 | if not g or id(g) in self.lr0_cidhash: 1433 | continue 1434 | self.lr0_cidhash[id(g)] = len(C) 1435 | C.append(g) 1436 | 1437 | return C 1438 | 1439 | # ----------------------------------------------------------------------------- 1440 | # ==== LALR(1) Parsing ==== 1441 | # 1442 | # LALR(1) parsing is almost exactly the same as SLR except that instead of 1443 | # relying upon Follow() sets when performing reductions, a more selective 1444 | # lookahead set that incorporates the state of the LR(0) machine is utilized. 1445 | # Thus, we mainly just have to focus on calculating the lookahead sets. 1446 | # 1447 | # The method used here is due to DeRemer and Pennelo (1982). 1448 | # 1449 | # DeRemer, F. L., and T. J. Pennelo: "Efficient Computation of LALR(1) 1450 | # Lookahead Sets", ACM Transactions on Programming Languages and Systems, 1451 | # Vol. 4, No. 4, Oct. 1982, pp. 615-649 1452 | # 1453 | # Further details can also be found in: 1454 | # 1455 | # J. Tremblay and P. Sorenson, "The Theory and Practice of Compiler Writing", 1456 | # McGraw-Hill Book Company, (1985). 
1457 | # 1458 | # ----------------------------------------------------------------------------- 1459 | 1460 | # ----------------------------------------------------------------------------- 1461 | # compute_nullable_nonterminals() 1462 | # 1463 | # Creates a dictionary containing all of the non-terminals that might produce 1464 | # an empty production. 1465 | # ----------------------------------------------------------------------------- 1466 | 1467 | def compute_nullable_nonterminals(self): 1468 | nullable = set() 1469 | num_nullable = 0 1470 | while True: 1471 | for p in self.grammar.Productions[1:]: 1472 | if p.len == 0: 1473 | nullable.add(p.name) 1474 | continue 1475 | for t in p.prod: 1476 | if t not in nullable: 1477 | break 1478 | else: 1479 | nullable.add(p.name) 1480 | if len(nullable) == num_nullable: 1481 | break 1482 | num_nullable = len(nullable) 1483 | return nullable 1484 | 1485 | # ----------------------------------------------------------------------------- 1486 | # find_nonterminal_trans(C) 1487 | # 1488 | # Given a set of LR(0) items, this functions finds all of the non-terminal 1489 | # transitions. These are transitions in which a dot appears immediately before 1490 | # a non-terminal. Returns a list of tuples of the form (state,N) where state 1491 | # is the state number and N is the nonterminal symbol. 1492 | # 1493 | # The input C is the set of LR(0) items. 1494 | # ----------------------------------------------------------------------------- 1495 | 1496 | def find_nonterminal_transitions(self, C): 1497 | trans = [] 1498 | for stateno, state in enumerate(C): 1499 | for p in state: 1500 | if p.lr_index < p.len - 1: 1501 | t = (stateno, p.prod[p.lr_index+1]) 1502 | if t[1] in self.grammar.Nonterminals: 1503 | if t not in trans: 1504 | trans.append(t) 1505 | return trans 1506 | 1507 | # ----------------------------------------------------------------------------- 1508 | # dr_relation() 1509 | # 1510 | # Computes the DR(p,A) relationships for non-terminal transitions. The input 1511 | # is a tuple (state,N) where state is a number and N is a nonterminal symbol. 1512 | # 1513 | # Returns a list of terminals. 1514 | # ----------------------------------------------------------------------------- 1515 | 1516 | def dr_relation(self, C, trans, nullable): 1517 | state, N = trans 1518 | terms = [] 1519 | 1520 | g = self.lr0_goto(C[state], N) 1521 | for p in g: 1522 | if p.lr_index < p.len - 1: 1523 | a = p.prod[p.lr_index+1] 1524 | if a in self.grammar.Terminals: 1525 | if a not in terms: 1526 | terms.append(a) 1527 | 1528 | # This extra bit is to handle the start state 1529 | if state == 0 and N == self.grammar.Productions[0].prod[0]: 1530 | terms.append('$end') 1531 | 1532 | return terms 1533 | 1534 | # ----------------------------------------------------------------------------- 1535 | # reads_relation() 1536 | # 1537 | # Computes the READS() relation (p,A) READS (t,C). 
1538 | # ----------------------------------------------------------------------------- 1539 | 1540 | def reads_relation(self, C, trans, empty): 1541 | # Look for empty transitions 1542 | rel = [] 1543 | state, N = trans 1544 | 1545 | g = self.lr0_goto(C[state], N) 1546 | j = self.lr0_cidhash.get(id(g), -1) 1547 | for p in g: 1548 | if p.lr_index < p.len - 1: 1549 | a = p.prod[p.lr_index + 1] 1550 | if a in empty: 1551 | rel.append((j, a)) 1552 | 1553 | return rel 1554 | 1555 | # ----------------------------------------------------------------------------- 1556 | # compute_lookback_includes() 1557 | # 1558 | # Determines the lookback and includes relations 1559 | # 1560 | # LOOKBACK: 1561 | # 1562 | # This relation is determined by running the LR(0) state machine forward. 1563 | # For example, starting with a production "N : . A B C", we run it forward 1564 | # to obtain "N : A B C ." We then build a relationship between this final 1565 | # state and the starting state. These relationships are stored in a dictionary 1566 | # lookdict. 1567 | # 1568 | # INCLUDES: 1569 | # 1570 | # Computes the INCLUDE() relation (p,A) INCLUDES (p',B). 1571 | # 1572 | # This relation is used to determine non-terminal transitions that occur 1573 | # inside of other non-terminal transition states. (p,A) INCLUDES (p', B) 1574 | # if the following holds: 1575 | # 1576 | # B -> LAT, where T -> epsilon and p' -L-> p 1577 | # 1578 | # L is essentially a prefix (which may be empty), T is a suffix that must be 1579 | # able to derive an empty string. State p' must lead to state p with the string L. 1580 | # 1581 | # ----------------------------------------------------------------------------- 1582 | 1583 | def compute_lookback_includes(self, C, trans, nullable): 1584 | lookdict = {} # Dictionary of lookback relations 1585 | includedict = {} # Dictionary of include relations 1586 | 1587 | # Make a dictionary of non-terminal transitions 1588 | dtrans = {} 1589 | for t in trans: 1590 | dtrans[t] = 1 1591 | 1592 | # Loop over all transitions and compute lookbacks and includes 1593 | for state, N in trans: 1594 | lookb = [] 1595 | includes = [] 1596 | for p in C[state]: 1597 | if p.name != N: 1598 | continue 1599 | 1600 | # Okay, we have a name match. We now follow the production all the way 1601 | # through the state machine until we get the . on the right hand side 1602 | 1603 | lr_index = p.lr_index 1604 | j = state 1605 | while lr_index < p.len - 1: 1606 | lr_index = lr_index + 1 1607 | t = p.prod[lr_index] 1608 | 1609 | # Check to see if this symbol and state are a non-terminal transition 1610 | if (j, t) in dtrans: 1611 | # Yes. Okay, there is some chance that this is an includes relation 1612 | # the only way to know for certain is whether the rest of the 1613 | # production derives empty 1614 | 1615 | li = lr_index + 1 1616 | while li < p.len: 1617 | if p.prod[li] in self.grammar.Terminals: 1618 | break # No forget it 1619 | if p.prod[li] not in nullable: 1620 | break 1621 | li = li + 1 1622 | else: 1623 | # Appears to be a relation between (j,t) and (state,N) 1624 | includes.append((j, t)) 1625 | 1626 | g = self.lr0_goto(C[j], t) # Go to next set 1627 | j = self.lr0_cidhash.get(id(g), -1) # Go to next state 1628 | 1629 | # When we get here, j is the final state, now we have to locate the production 1630 | for r in C[j]: 1631 | if r.name != p.name: 1632 | continue 1633 | if r.len != p.len: 1634 | continue 1635 | i = 0 1636 | # This look is comparing a production ". A B C" with "A B C ." 
1637 | while i < r.lr_index: 1638 | if r.prod[i] != p.prod[i+1]: 1639 | break 1640 | i = i + 1 1641 | else: 1642 | lookb.append((j, r)) 1643 | for i in includes: 1644 | if i not in includedict: 1645 | includedict[i] = [] 1646 | includedict[i].append((state, N)) 1647 | lookdict[(state, N)] = lookb 1648 | 1649 | return lookdict, includedict 1650 | 1651 | # ----------------------------------------------------------------------------- 1652 | # compute_read_sets() 1653 | # 1654 | # Given a set of LR(0) items, this function computes the read sets. 1655 | # 1656 | # Inputs: C = Set of LR(0) items 1657 | # ntrans = Set of nonterminal transitions 1658 | # nullable = Set of empty transitions 1659 | # 1660 | # Returns a set containing the read sets 1661 | # ----------------------------------------------------------------------------- 1662 | 1663 | def compute_read_sets(self, C, ntrans, nullable): 1664 | FP = lambda x: self.dr_relation(C, x, nullable) 1665 | R = lambda x: self.reads_relation(C, x, nullable) 1666 | F = digraph(ntrans, R, FP) 1667 | return F 1668 | 1669 | # ----------------------------------------------------------------------------- 1670 | # compute_follow_sets() 1671 | # 1672 | # Given a set of LR(0) items, a set of non-terminal transitions, a readset, 1673 | # and an include set, this function computes the follow sets 1674 | # 1675 | # Follow(p,A) = Read(p,A) U U {Follow(p',B) | (p,A) INCLUDES (p',B)} 1676 | # 1677 | # Inputs: 1678 | # ntrans = Set of nonterminal transitions 1679 | # readsets = Readset (previously computed) 1680 | # inclsets = Include sets (previously computed) 1681 | # 1682 | # Returns a set containing the follow sets 1683 | # ----------------------------------------------------------------------------- 1684 | 1685 | def compute_follow_sets(self, ntrans, readsets, inclsets): 1686 | FP = lambda x: readsets[x] 1687 | R = lambda x: inclsets.get(x, []) 1688 | F = digraph(ntrans, R, FP) 1689 | return F 1690 | 1691 | # ----------------------------------------------------------------------------- 1692 | # add_lookaheads() 1693 | # 1694 | # Attaches the lookahead symbols to grammar rules. 
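# For example (state number and tokens illustrative), after this step a
# production p may end up with p.lookaheads[12] == ['PLUS', '$end'], meaning p
# is reduced in state 12 only when the next token is PLUS or end of input;
# lr_parse_table() below consumes exactly these per-state sets when emitting
# reduce actions.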
1695 | # 1696 | # Inputs: lookbacks - Set of lookback relations 1697 | # followset - Computed follow set 1698 | # 1699 | # This function directly attaches the lookaheads to productions contained 1700 | # in the lookbacks set 1701 | # ----------------------------------------------------------------------------- 1702 | 1703 | def add_lookaheads(self, lookbacks, followset): 1704 | for trans, lb in lookbacks.items(): 1705 | # Loop over productions in lookback 1706 | for state, p in lb: 1707 | if state not in p.lookaheads: 1708 | p.lookaheads[state] = [] 1709 | f = followset.get(trans, []) 1710 | for a in f: 1711 | if a not in p.lookaheads[state]: 1712 | p.lookaheads[state].append(a) 1713 | 1714 | # ----------------------------------------------------------------------------- 1715 | # add_lalr_lookaheads() 1716 | # 1717 | # This function does all of the work of adding lookahead information for use 1718 | # with LALR parsing 1719 | # ----------------------------------------------------------------------------- 1720 | 1721 | def add_lalr_lookaheads(self, C): 1722 | # Determine all of the nullable nonterminals 1723 | nullable = self.compute_nullable_nonterminals() 1724 | 1725 | # Find all non-terminal transitions 1726 | trans = self.find_nonterminal_transitions(C) 1727 | 1728 | # Compute read sets 1729 | readsets = self.compute_read_sets(C, trans, nullable) 1730 | 1731 | # Compute lookback/includes relations 1732 | lookd, included = self.compute_lookback_includes(C, trans, nullable) 1733 | 1734 | # Compute LALR FOLLOW sets 1735 | followsets = self.compute_follow_sets(trans, readsets, included) 1736 | 1737 | # Add all of the lookaheads 1738 | self.add_lookaheads(lookd, followsets) 1739 | 1740 | # ----------------------------------------------------------------------------- 1741 | # lr_parse_table() 1742 | # 1743 | # This function constructs the parse tables for SLR or LALR 1744 | # ----------------------------------------------------------------------------- 1745 | def lr_parse_table(self): 1746 | Productions = self.grammar.Productions 1747 | Precedence = self.grammar.Precedence 1748 | goto = self.lr_goto # Goto array 1749 | action = self.lr_action # Action array 1750 | log = self.log # Logger for output 1751 | 1752 | actionp = {} # Action production array (temporary) 1753 | 1754 | # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items 1755 | # This determines the number of states 1756 | 1757 | C = self.lr0_items() 1758 | self.add_lalr_lookaheads(C) 1759 | 1760 | # Build the parser table, state by state 1761 | st = 0 1762 | for I in C: 1763 | # Loop over each production in I 1764 | actlist = [] # List of actions 1765 | st_action = {} 1766 | st_actionp = {} 1767 | st_goto = {} 1768 | log.info('') 1769 | log.info('state %d', st) 1770 | log.info('') 1771 | for p in I: 1772 | log.info(' (%d) %s', p.number, p) 1773 | log.info('') 1774 | 1775 | for p in I: 1776 | if p.len == p.lr_index + 1: 1777 | if p.name == "S'": 1778 | # Start symbol. Accept! 1779 | st_action['$end'] = 0 1780 | st_actionp['$end'] = p 1781 | else: 1782 | # We are at the end of a production. Reduce! 1783 | laheads = p.lookaheads[st] 1784 | for a in laheads: 1785 | actlist.append((a, p, 'reduce using rule %d (%s)' % (p.number, p))) 1786 | r = st_action.get(a) 1787 | if r is not None: 1788 | # Whoa. Have a shift/reduce or reduce/reduce conflict 1789 | if r > 0: 1790 | # Need to decide on shift or reduce here 1791 | # By default we favor shifting. Need to add 1792 | # some precedence rules here. 
1793 | 1794 | # Shift precedence comes from the token 1795 | sprec, slevel = Precedence.get(a, ('right', 0)) 1796 | 1797 | # Reduce precedence comes from rule being reduced (p) 1798 | rprec, rlevel = Productions[p.number].prec 1799 | 1800 | if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): 1801 | # We really need to reduce here. 1802 | st_action[a] = -p.number 1803 | st_actionp[a] = p 1804 | if not slevel and not rlevel: 1805 | log.info(' ! shift/reduce conflict for %s resolved as reduce', a) 1806 | self.sr_conflicts.append((st, a, 'reduce')) 1807 | Productions[p.number].reduced += 1 1808 | elif (slevel == rlevel) and (rprec == 'nonassoc'): 1809 | st_action[a] = None 1810 | else: 1811 | # Hmmm. Guess we'll keep the shift 1812 | if not rlevel: 1813 | log.info(' ! shift/reduce conflict for %s resolved as shift', a) 1814 | self.sr_conflicts.append((st, a, 'shift')) 1815 | elif r < 0: 1816 | # Reduce/reduce conflict. In this case, we favor the rule 1817 | # that was defined first in the grammar file 1818 | oldp = Productions[-r] 1819 | pp = Productions[p.number] 1820 | if oldp.line > pp.line: 1821 | st_action[a] = -p.number 1822 | st_actionp[a] = p 1823 | chosenp, rejectp = pp, oldp 1824 | Productions[p.number].reduced += 1 1825 | Productions[oldp.number].reduced -= 1 1826 | else: 1827 | chosenp, rejectp = oldp, pp 1828 | self.rr_conflicts.append((st, chosenp, rejectp)) 1829 | log.info(' ! reduce/reduce conflict for %s resolved using rule %d (%s)', 1830 | a, st_actionp[a].number, st_actionp[a]) 1831 | else: 1832 | raise LALRError('Unknown conflict in state %d' % st) 1833 | else: 1834 | st_action[a] = -p.number 1835 | st_actionp[a] = p 1836 | Productions[p.number].reduced += 1 1837 | else: 1838 | i = p.lr_index 1839 | a = p.prod[i+1] # Get symbol right after the "." 1840 | if a in self.grammar.Terminals: 1841 | g = self.lr0_goto(I, a) 1842 | j = self.lr0_cidhash.get(id(g), -1) 1843 | if j >= 0: 1844 | # We are in a shift state 1845 | actlist.append((a, p, 'shift and go to state %d' % j)) 1846 | r = st_action.get(a) 1847 | if r is not None: 1848 | # Whoa have a shift/reduce or shift/shift conflict 1849 | if r > 0: 1850 | if r != j: 1851 | raise LALRError('Shift/shift conflict in state %d' % st) 1852 | elif r < 0: 1853 | # Do a precedence check. 1854 | # - if precedence of reduce rule is higher, we reduce. 1855 | # - if precedence of reduce is same and left assoc, we reduce. 1856 | # - otherwise we shift 1857 | 1858 | # Shift precedence comes from the token 1859 | sprec, slevel = Precedence.get(a, ('right', 0)) 1860 | 1861 | # Reduce precedence comes from the rule that could have been reduced 1862 | rprec, rlevel = Productions[st_actionp[a].number].prec 1863 | 1864 | if (slevel > rlevel) or ((slevel == rlevel) and (rprec == 'right')): 1865 | # We decide to shift here... highest precedence to shift 1866 | Productions[st_actionp[a].number].reduced -= 1 1867 | st_action[a] = j 1868 | st_actionp[a] = p 1869 | if not rlevel: 1870 | log.info(' ! shift/reduce conflict for %s resolved as shift', a) 1871 | self.sr_conflicts.append((st, a, 'shift')) 1872 | elif (slevel == rlevel) and (rprec == 'nonassoc'): 1873 | st_action[a] = None 1874 | else: 1875 | # Hmmm. Guess we'll keep the reduce 1876 | if not slevel and not rlevel: 1877 | log.info(' ! 
shift/reduce conflict for %s resolved as reduce', a) 1878 | self.sr_conflicts.append((st, a, 'reduce')) 1879 | 1880 | else: 1881 | raise LALRError('Unknown conflict in state %d' % st) 1882 | else: 1883 | st_action[a] = j 1884 | st_actionp[a] = p 1885 | 1886 | # Print the actions associated with each terminal 1887 | _actprint = {} 1888 | for a, p, m in actlist: 1889 | if a in st_action: 1890 | if p is st_actionp[a]: 1891 | log.info(' %-15s %s', a, m) 1892 | _actprint[(a, m)] = 1 1893 | log.info('') 1894 | # Print the actions that were not used. (debugging) 1895 | not_used = 0 1896 | for a, p, m in actlist: 1897 | if a in st_action: 1898 | if p is not st_actionp[a]: 1899 | if not (a, m) in _actprint: 1900 | log.debug(' ! %-15s [ %s ]', a, m) 1901 | not_used = 1 1902 | _actprint[(a, m)] = 1 1903 | if not_used: 1904 | log.debug('') 1905 | 1906 | # Construct the goto table for this state 1907 | 1908 | nkeys = {} 1909 | for ii in I: 1910 | for s in ii.usyms: 1911 | if s in self.grammar.Nonterminals: 1912 | nkeys[s] = None 1913 | for n in nkeys: 1914 | g = self.lr0_goto(I, n) 1915 | j = self.lr0_cidhash.get(id(g), -1) 1916 | if j >= 0: 1917 | st_goto[n] = j 1918 | log.info(' %-30s shift and go to state %d', n, j) 1919 | 1920 | action[st] = st_action 1921 | actionp[st] = st_actionp 1922 | goto[st] = st_goto 1923 | st += 1 1924 | 1925 | # ----------------------------------------------------------------------------- 1926 | # === INTROSPECTION === 1927 | # 1928 | # The following functions and classes are used to implement the PLY 1929 | # introspection features followed by the yacc() function itself. 1930 | # ----------------------------------------------------------------------------- 1931 | 1932 | # ----------------------------------------------------------------------------- 1933 | # get_caller_module_dict() 1934 | # 1935 | # This function returns a dictionary containing all of the symbols defined within 1936 | # a caller further down the call stack. This is used to get the environment 1937 | # associated with the yacc() call if none was provided. 1938 | # ----------------------------------------------------------------------------- 1939 | 1940 | def get_caller_module_dict(levels): 1941 | f = sys._getframe(levels) 1942 | ldict = f.f_globals.copy() 1943 | if f.f_globals != f.f_locals: 1944 | ldict.update(f.f_locals) 1945 | return ldict 1946 | 1947 | # ----------------------------------------------------------------------------- 1948 | # parse_grammar() 1949 | # 1950 | # This takes a raw grammar rule string and parses it into production data 1951 | # ----------------------------------------------------------------------------- 1952 | def parse_grammar(doc, file, line): 1953 | grammar = [] 1954 | # Split the doc string into lines 1955 | pstrings = doc.splitlines() 1956 | lastp = None 1957 | dline = line 1958 | for ps in pstrings: 1959 | dline += 1 1960 | p = ps.split() 1961 | if not p: 1962 | continue 1963 | try: 1964 | if p[0] == '|': 1965 | # This is a continuation of a previous rule 1966 | if not lastp: 1967 | raise SyntaxError("%s:%d: Misplaced '|'" % (file, dline)) 1968 | prodname = lastp 1969 | syms = p[1:] 1970 | else: 1971 | prodname = p[0] 1972 | lastp = prodname 1973 | syms = p[2:] 1974 | assign = p[1] 1975 | if assign != ':' and assign != '::=': 1976 | raise SyntaxError("%s:%d: Syntax error. 
Expected ':'" % (file, dline)) 1977 | 1978 | grammar.append((file, dline, prodname, syms)) 1979 | except SyntaxError: 1980 | raise 1981 | except Exception: 1982 | raise SyntaxError('%s:%d: Syntax error in rule %r' % (file, dline, ps.strip())) 1983 | 1984 | return grammar 1985 | 1986 | # ----------------------------------------------------------------------------- 1987 | # ParserReflect() 1988 | # 1989 | # This class represents information extracted for building a parser including 1990 | # start symbol, error function, tokens, precedence list, action functions, 1991 | # etc. 1992 | # ----------------------------------------------------------------------------- 1993 | class ParserReflect(object): 1994 | def __init__(self, pdict, log=None): 1995 | self.pdict = pdict 1996 | self.start = None 1997 | self.error_func = None 1998 | self.tokens = None 1999 | self.modules = set() 2000 | self.grammar = [] 2001 | self.error = False 2002 | 2003 | if log is None: 2004 | self.log = PlyLogger(sys.stderr) 2005 | else: 2006 | self.log = log 2007 | 2008 | # Get all of the basic information 2009 | def get_all(self): 2010 | self.get_start() 2011 | self.get_error_func() 2012 | self.get_tokens() 2013 | self.get_precedence() 2014 | self.get_pfunctions() 2015 | 2016 | # Validate all of the information 2017 | def validate_all(self): 2018 | self.validate_start() 2019 | self.validate_error_func() 2020 | self.validate_tokens() 2021 | self.validate_precedence() 2022 | self.validate_pfunctions() 2023 | self.validate_modules() 2024 | return self.error 2025 | 2026 | # Compute a signature over the grammar 2027 | def signature(self): 2028 | parts = [] 2029 | try: 2030 | if self.start: 2031 | parts.append(self.start) 2032 | if self.prec: 2033 | parts.append(''.join([''.join(p) for p in self.prec])) 2034 | if self.tokens: 2035 | parts.append(' '.join(self.tokens)) 2036 | for f in self.pfuncs: 2037 | if f[3]: 2038 | parts.append(f[3]) 2039 | except (TypeError, ValueError): 2040 | pass 2041 | return ''.join(parts) 2042 | 2043 | # ----------------------------------------------------------------------------- 2044 | # validate_modules() 2045 | # 2046 | # This method checks to see if there are duplicated p_rulename() functions 2047 | # in the parser module file. Without this function, it is really easy for 2048 | # users to make mistakes by cutting and pasting code fragments (and it's a real 2049 | # bugger to try and figure out why the resulting parser doesn't work). Therefore, 2050 | # we just do a little regular expression pattern matching of def statements 2051 | # to try and detect duplicates. 2052 | # ----------------------------------------------------------------------------- 2053 | 2054 | def validate_modules(self): 2055 | # Match def p_funcname( 2056 | fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(') 2057 | 2058 | for module in self.modules: 2059 | try: 2060 | lines, linen = inspect.getsourcelines(module) 2061 | except IOError: 2062 | continue 2063 | 2064 | counthash = {} 2065 | for linen, line in enumerate(lines): 2066 | linen += 1 2067 | m = fre.match(line) 2068 | if m: 2069 | name = m.group(1) 2070 | prev = counthash.get(name) 2071 | if not prev: 2072 | counthash[name] = linen 2073 | else: 2074 | filename = inspect.getsourcefile(module) 2075 | self.log.warning('%s:%d: Function %s redefined. 
Previously defined on line %d', 2076 | filename, linen, name, prev) 2077 | 2078 | # Get the start symbol 2079 | def get_start(self): 2080 | self.start = self.pdict.get('start') 2081 | 2082 | # Validate the start symbol 2083 | def validate_start(self): 2084 | if self.start is not None: 2085 | if not isinstance(self.start, str): 2086 | self.log.error("'start' must be a string") 2087 | 2088 | # Look for error handler 2089 | def get_error_func(self): 2090 | self.error_func = self.pdict.get('p_error') 2091 | 2092 | # Validate the error function 2093 | def validate_error_func(self): 2094 | if self.error_func: 2095 | if isinstance(self.error_func, types.FunctionType): 2096 | ismethod = 0 2097 | elif isinstance(self.error_func, types.MethodType): 2098 | ismethod = 1 2099 | else: 2100 | self.log.error("'p_error' defined, but is not a function or method") 2101 | self.error = True 2102 | return 2103 | 2104 | eline = self.error_func.__code__.co_firstlineno 2105 | efile = self.error_func.__code__.co_filename 2106 | module = inspect.getmodule(self.error_func) 2107 | self.modules.add(module) 2108 | 2109 | argcount = self.error_func.__code__.co_argcount - ismethod 2110 | if argcount != 1: 2111 | self.log.error('%s:%d: p_error() requires 1 argument', efile, eline) 2112 | self.error = True 2113 | 2114 | # Get the tokens map 2115 | def get_tokens(self): 2116 | tokens = self.pdict.get('tokens') 2117 | if not tokens: 2118 | self.log.error('No token list is defined') 2119 | self.error = True 2120 | return 2121 | 2122 | if not isinstance(tokens, (list, tuple)): 2123 | self.log.error('tokens must be a list or tuple') 2124 | self.error = True 2125 | return 2126 | 2127 | if not tokens: 2128 | self.log.error('tokens is empty') 2129 | self.error = True 2130 | return 2131 | 2132 | self.tokens = sorted(tokens) 2133 | 2134 | # Validate the tokens 2135 | def validate_tokens(self): 2136 | # Validate the tokens. 2137 | if 'error' in self.tokens: 2138 | self.log.error("Illegal token name 'error'. Is a reserved word") 2139 | self.error = True 2140 | return 2141 | 2142 | terminals = set() 2143 | for n in self.tokens: 2144 | if n in terminals: 2145 | self.log.warning('Token %r multiply defined', n) 2146 | terminals.add(n) 2147 | 2148 | # Get the precedence map (if any) 2149 | def get_precedence(self): 2150 | self.prec = self.pdict.get('precedence') 2151 | 2152 | # Validate and parse the precedence map 2153 | def validate_precedence(self): 2154 | preclist = [] 2155 | if self.prec: 2156 | if not isinstance(self.prec, (list, tuple)): 2157 | self.log.error('precedence must be a list or tuple') 2158 | self.error = True 2159 | return 2160 | for level, p in enumerate(self.prec): 2161 | if not isinstance(p, (list, tuple)): 2162 | self.log.error('Bad precedence table') 2163 | self.error = True 2164 | return 2165 | 2166 | if len(p) < 2: 2167 | self.log.error('Malformed precedence entry %s. 
Must be (assoc, term, ..., term)', p) 2168 | self.error = True 2169 | return 2170 | assoc = p[0] 2171 | if not isinstance(assoc, str): 2172 | self.log.error('precedence associativity must be a string') 2173 | self.error = True 2174 | return 2175 | for term in p[1:]: 2176 | if not isinstance(term, str): 2177 | self.log.error('precedence items must be strings') 2178 | self.error = True 2179 | return 2180 | preclist.append((term, assoc, level+1)) 2181 | self.preclist = preclist 2182 | 2183 | # Get all p_functions from the grammar 2184 | def get_pfunctions(self): 2185 | p_functions = [] 2186 | for name, item in self.pdict.items(): 2187 | if not name.startswith('p_') or name == 'p_error': 2188 | continue 2189 | if isinstance(item, (types.FunctionType, types.MethodType)): 2190 | line = getattr(item, 'co_firstlineno', item.__code__.co_firstlineno) 2191 | module = inspect.getmodule(item) 2192 | p_functions.append((line, module, name, item.__doc__)) 2193 | 2194 | # Sort all of the actions by line number; make sure to stringify 2195 | # modules to make them sortable, since `line` may not uniquely sort all 2196 | # p functions 2197 | p_functions.sort(key=lambda p_function: ( 2198 | p_function[0], 2199 | str(p_function[1]), 2200 | p_function[2], 2201 | p_function[3])) 2202 | self.pfuncs = p_functions 2203 | 2204 | # Validate all of the p_functions 2205 | def validate_pfunctions(self): 2206 | grammar = [] 2207 | # Check for non-empty symbols 2208 | if len(self.pfuncs) == 0: 2209 | self.log.error('no rules of the form p_rulename are defined') 2210 | self.error = True 2211 | return 2212 | 2213 | for line, module, name, doc in self.pfuncs: 2214 | file = inspect.getsourcefile(module) 2215 | func = self.pdict[name] 2216 | if isinstance(func, types.MethodType): 2217 | reqargs = 2 2218 | else: 2219 | reqargs = 1 2220 | if func.__code__.co_argcount > reqargs: 2221 | self.log.error('%s:%d: Rule %r has too many arguments', file, line, func.__name__) 2222 | self.error = True 2223 | elif func.__code__.co_argcount < reqargs: 2224 | self.log.error('%s:%d: Rule %r requires an argument', file, line, func.__name__) 2225 | self.error = True 2226 | elif not func.__doc__: 2227 | self.log.warning('%s:%d: No documentation string specified in function %r (ignored)', 2228 | file, line, func.__name__) 2229 | else: 2230 | try: 2231 | parsed_g = parse_grammar(doc, file, line) 2232 | for g in parsed_g: 2233 | grammar.append((name, g)) 2234 | except SyntaxError as e: 2235 | self.log.error(str(e)) 2236 | self.error = True 2237 | 2238 | # Looks like a valid grammar rule 2239 | # Mark the file in which defined. 2240 | self.modules.add(module) 2241 | 2242 | # Secondary validation step that looks for p_ definitions that are not functions 2243 | # or functions that look like they might be grammar rules. 
2244 | 2245 | for n, v in self.pdict.items(): 2246 | if n.startswith('p_') and isinstance(v, (types.FunctionType, types.MethodType)): 2247 | continue 2248 | if n.startswith('t_'): 2249 | continue 2250 | if n.startswith('p_') and n != 'p_error': 2251 | self.log.warning('%r not defined as a function', n) 2252 | if ((isinstance(v, types.FunctionType) and v.__code__.co_argcount == 1) or 2253 | (isinstance(v, types.MethodType) and v.__func__.__code__.co_argcount == 2)): 2254 | if v.__doc__: 2255 | try: 2256 | doc = v.__doc__.split(' ') 2257 | if doc[1] == ':': 2258 | self.log.warning('%s:%d: Possible grammar rule %r defined without p_ prefix', 2259 | v.__code__.co_filename, v.__code__.co_firstlineno, n) 2260 | except IndexError: 2261 | pass 2262 | 2263 | self.grammar = grammar 2264 | 2265 | # ----------------------------------------------------------------------------- 2266 | # yacc(module) 2267 | # 2268 | # Build a parser 2269 | # ----------------------------------------------------------------------------- 2270 | 2271 | def yacc(*, debug=yaccdebug, module=None, start=None, 2272 | check_recursion=True, optimize=False, debugfile=debug_file, 2273 | debuglog=None, errorlog=None): 2274 | 2275 | # Reference to the parsing method of the last built parser 2276 | global parse 2277 | 2278 | if errorlog is None: 2279 | errorlog = PlyLogger(sys.stderr) 2280 | 2281 | # Get the module dictionary used for the parser 2282 | if module: 2283 | _items = [(k, getattr(module, k)) for k in dir(module)] 2284 | pdict = dict(_items) 2285 | # If no __file__ or __package__ attributes are available, try to obtain them 2286 | # from the __module__ instead 2287 | if '__file__' not in pdict: 2288 | pdict['__file__'] = sys.modules[pdict['__module__']].__file__ 2289 | if '__package__' not in pdict and '__module__' in pdict: 2290 | if hasattr(sys.modules[pdict['__module__']], '__package__'): 2291 | pdict['__package__'] = sys.modules[pdict['__module__']].__package__ 2292 | else: 2293 | pdict = get_caller_module_dict(2) 2294 | 2295 | # Set start symbol if it's specified directly using an argument 2296 | if start is not None: 2297 | pdict['start'] = start 2298 | 2299 | # Collect parser information from the dictionary 2300 | pinfo = ParserReflect(pdict, log=errorlog) 2301 | pinfo.get_all() 2302 | 2303 | if pinfo.error: 2304 | raise YaccError('Unable to build parser') 2305 | 2306 | if debuglog is None: 2307 | if debug: 2308 | try: 2309 | debuglog = PlyLogger(open(debugfile, 'w')) 2310 | except IOError as e: 2311 | errorlog.warning("Couldn't open %r. 
%s" % (debugfile, e)) 2312 | debuglog = NullLogger() 2313 | else: 2314 | debuglog = NullLogger() 2315 | 2316 | debuglog.info('Created by PLY (http://www.dabeaz.com/ply)') 2317 | 2318 | errors = False 2319 | 2320 | # Validate the parser information 2321 | if pinfo.validate_all(): 2322 | raise YaccError('Unable to build parser') 2323 | 2324 | if not pinfo.error_func: 2325 | errorlog.warning('no p_error() function is defined') 2326 | 2327 | # Create a grammar object 2328 | grammar = Grammar(pinfo.tokens) 2329 | 2330 | # Set precedence level for terminals 2331 | for term, assoc, level in pinfo.preclist: 2332 | try: 2333 | grammar.set_precedence(term, assoc, level) 2334 | except GrammarError as e: 2335 | errorlog.warning('%s', e) 2336 | 2337 | # Add productions to the grammar 2338 | for funcname, gram in pinfo.grammar: 2339 | file, line, prodname, syms = gram 2340 | try: 2341 | grammar.add_production(prodname, syms, funcname, file, line) 2342 | except GrammarError as e: 2343 | errorlog.error('%s', e) 2344 | errors = True 2345 | 2346 | # Set the grammar start symbols 2347 | try: 2348 | if start is None: 2349 | grammar.set_start(pinfo.start) 2350 | else: 2351 | grammar.set_start(start) 2352 | except GrammarError as e: 2353 | errorlog.error(str(e)) 2354 | errors = True 2355 | 2356 | if errors: 2357 | raise YaccError('Unable to build parser') 2358 | 2359 | # Verify the grammar structure 2360 | undefined_symbols = grammar.undefined_symbols() 2361 | for sym, prod in undefined_symbols: 2362 | errorlog.error('%s:%d: Symbol %r used, but not defined as a token or a rule', prod.file, prod.line, sym) 2363 | errors = True 2364 | 2365 | unused_terminals = grammar.unused_terminals() 2366 | if unused_terminals: 2367 | debuglog.info('') 2368 | debuglog.info('Unused terminals:') 2369 | debuglog.info('') 2370 | for term in unused_terminals: 2371 | errorlog.warning('Token %r defined, but not used', term) 2372 | debuglog.info(' %s', term) 2373 | 2374 | # Print out all productions to the debug log 2375 | if debug: 2376 | debuglog.info('') 2377 | debuglog.info('Grammar') 2378 | debuglog.info('') 2379 | for n, p in enumerate(grammar.Productions): 2380 | debuglog.info('Rule %-5d %s', n, p) 2381 | 2382 | # Find unused non-terminals 2383 | unused_rules = grammar.unused_rules() 2384 | for prod in unused_rules: 2385 | errorlog.warning('%s:%d: Rule %r defined, but not used', prod.file, prod.line, prod.name) 2386 | 2387 | if len(unused_terminals) == 1: 2388 | errorlog.warning('There is 1 unused token') 2389 | if len(unused_terminals) > 1: 2390 | errorlog.warning('There are %d unused tokens', len(unused_terminals)) 2391 | 2392 | if len(unused_rules) == 1: 2393 | errorlog.warning('There is 1 unused rule') 2394 | if len(unused_rules) > 1: 2395 | errorlog.warning('There are %d unused rules', len(unused_rules)) 2396 | 2397 | if debug: 2398 | debuglog.info('') 2399 | debuglog.info('Terminals, with rules where they appear') 2400 | debuglog.info('') 2401 | terms = list(grammar.Terminals) 2402 | terms.sort() 2403 | for term in terms: 2404 | debuglog.info('%-20s : %s', term, ' '.join([str(s) for s in grammar.Terminals[term]])) 2405 | 2406 | debuglog.info('') 2407 | debuglog.info('Nonterminals, with rules where they appear') 2408 | debuglog.info('') 2409 | nonterms = list(grammar.Nonterminals) 2410 | nonterms.sort() 2411 | for nonterm in nonterms: 2412 | debuglog.info('%-20s : %s', nonterm, ' '.join([str(s) for s in grammar.Nonterminals[nonterm]])) 2413 | debuglog.info('') 2414 | 2415 | if check_recursion: 2416 | unreachable = 
grammar.find_unreachable() 2417 | for u in unreachable: 2418 | errorlog.warning('Symbol %r is unreachable', u) 2419 | 2420 | infinite = grammar.infinite_cycles() 2421 | for inf in infinite: 2422 | errorlog.error('Infinite recursion detected for symbol %r', inf) 2423 | errors = True 2424 | 2425 | unused_prec = grammar.unused_precedence() 2426 | for term, assoc in unused_prec: 2427 | errorlog.error('Precedence rule %r defined for unknown symbol %r', assoc, term) 2428 | errors = True 2429 | 2430 | if errors: 2431 | raise YaccError('Unable to build parser') 2432 | 2433 | # Run the LRTable on the grammar 2434 | lr = LRTable(grammar, debuglog) 2435 | 2436 | if debug: 2437 | num_sr = len(lr.sr_conflicts) 2438 | 2439 | # Report shift/reduce and reduce/reduce conflicts 2440 | if num_sr == 1: 2441 | errorlog.warning('1 shift/reduce conflict') 2442 | elif num_sr > 1: 2443 | errorlog.warning('%d shift/reduce conflicts', num_sr) 2444 | 2445 | num_rr = len(lr.rr_conflicts) 2446 | if num_rr == 1: 2447 | errorlog.warning('1 reduce/reduce conflict') 2448 | elif num_rr > 1: 2449 | errorlog.warning('%d reduce/reduce conflicts', num_rr) 2450 | 2451 | # Write out conflicts to the output file 2452 | if debug and (lr.sr_conflicts or lr.rr_conflicts): 2453 | debuglog.warning('') 2454 | debuglog.warning('Conflicts:') 2455 | debuglog.warning('') 2456 | 2457 | for state, tok, resolution in lr.sr_conflicts: 2458 | debuglog.warning('shift/reduce conflict for %s in state %d resolved as %s', tok, state, resolution) 2459 | 2460 | already_reported = set() 2461 | for state, rule, rejected in lr.rr_conflicts: 2462 | if (state, id(rule), id(rejected)) in already_reported: 2463 | continue 2464 | debuglog.warning('reduce/reduce conflict in state %d resolved using rule (%s)', state, rule) 2465 | debuglog.warning('rejected rule (%s) in state %d', rejected, state) 2466 | errorlog.warning('reduce/reduce conflict in state %d resolved using rule (%s)', state, rule) 2467 | errorlog.warning('rejected rule (%s) in state %d', rejected, state) 2468 | already_reported.add((state, id(rule), id(rejected))) 2469 | 2470 | warned_never = [] 2471 | for state, rule, rejected in lr.rr_conflicts: 2472 | if not rejected.reduced and (rejected not in warned_never): 2473 | debuglog.warning('Rule (%s) is never reduced', rejected) 2474 | errorlog.warning('Rule (%s) is never reduced', rejected) 2475 | warned_never.append(rejected) 2476 | 2477 | # Build the parser 2478 | lr.bind_callables(pinfo.pdict) 2479 | parser = LRParser(lr, pinfo.error_func) 2480 | 2481 | parse = parser.parse 2482 | return parser 2483 | --------------------------------------------------------------------------------
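For orientation, the sketch below shows the end-to-end PLY workflow that the vendored lex.py and yacc.py implement: token rules and `p_*` docstring grammars are harvested from the caller's module by ParserReflect, parse_grammar() turns each docstring into productions, and yacc() builds the LALR tables, resolving shift/reduce conflicts with the precedence table exactly as in the conflict-resolution code above. This is a minimal toy calculator grammar for illustration only, not the plugin's actual expression grammar; every rule and token name below is hypothetical.

    # Minimal PLY sketch (illustrative only). Uses the vendored modules, but
    # stock ply.lex / ply.yacc would work identically.
    import SmartJump.lex as lex
    import SmartJump.yacc as yacc

    tokens = ('NUMBER', 'PLUS', 'TIMES', 'LPAREN', 'RPAREN')

    # Simple string token rules (compiled into the lexer's master regex).
    t_PLUS   = r'\+'
    t_TIMES  = r'\*'
    t_LPAREN = r'\('
    t_RPAREN = r'\)'
    t_ignore = ' \t'

    def t_NUMBER(t):
        r'\d+'
        t.value = int(t.value)   # decimal for this toy example
        return t

    def t_error(t):
        t.lexer.skip(1)          # skip characters the lexer cannot match

    # Precedence table consumed by ParserReflect.validate_precedence():
    # one (assoc, term, ...) entry per level, lowest precedence first.
    precedence = (
        ('left', 'PLUS'),
        ('left', 'TIMES'),
    )

    # Each p_* docstring is split into productions by parse_grammar().
    def p_expr_binop(p):
        '''expr : expr PLUS expr
                | expr TIMES expr'''
        if p[2] == '+':
            p[0] = p[1] + p[3]
        else:
            p[0] = p[1] * p[3]

    def p_expr_group(p):
        'expr : LPAREN expr RPAREN'
        p[0] = p[2]

    def p_expr_number(p):
        'expr : NUMBER'
        p[0] = p[1]

    def p_error(p):
        pass                     # silence syntax errors in this toy example

    lexer = lex.lex()
    parser = yacc.yacc(debug=False)   # runs ParserReflect + builds LALR tables
    print(parser.parse('10 + 2 * (3 + 4)', lexer=lexer))   # prints 24

If any validation step fails (missing token list, malformed precedence entry, undefined grammar symbols), yacc() raises YaccError('Unable to build parser'); passing debug=True additionally writes the grammar, parser states, and any shift/reduce or reduce/reduce conflicts to the debug log file (the debugfile argument, typically parser.out).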