├── Makefile ├── samples ├── foo_lib.c └── foo.c ├── .gitignore ├── README.md ├── c.py ├── clex.py ├── cparse.py ├── cvisitors.py ├── lex.py ├── cx86.py └── yacc.py /Makefile: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Makefile 3 | # 4 | # Atul Varma 5 | # Python C Compiler - Makefile 6 | # $Id: Makefile,v 1.8 2004/06/02 21:11:57 varmaa Exp $ 7 | # 8 | # This just makes all the sample code and lets you clean up 9 | # intermediate/output files. 10 | # --------------------------------------------------------------- 11 | 12 | FLAGS=-annotate -ast 13 | PYTHON=python 14 | 15 | compile-samples: 16 | ${PYTHON} c.py samples/foo.c samples/foo_lib.c ${FLAGS} 17 | gcc samples/foo.s samples/foo_lib.s -o samples/foo 18 | 19 | clean: 20 | rm -f parsetab.py parser.out *.pyc samples/*.ast \ 21 | samples/*.s samples/*.exe samples/foo 22 | -------------------------------------------------------------------------------- /samples/foo_lib.c: -------------------------------------------------------------------------------- 1 | /******************************************************************* 2 | * foo_lib.c 3 | * Atul Varma - 5/24/2004 4 | * CS Independent Study 5 | * $Id: foo_lib.c,v 1.1 2004/05/27 16:25:14 varmaa Exp $ 6 | * 7 | * Contains external library functions/variables for foo.c. 8 | ******************************************************************* 9 | */ 10 | 11 | /* Test global variable. */ 12 | int stuff_count; 13 | 14 | /* Test of static function definition, to make sure it 15 | doesn't conflict with fib() defined in foo.c. */ 16 | static int fib() 17 | { 18 | return stuff_count += 1; 19 | } 20 | 21 | /* Increment global variable. 
*/ 22 | int increment_stuff_count() 23 | { 24 | fib(); 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /samples/foo.c: -------------------------------------------------------------------------------- 1 | /******************************************************************* 2 | * foo.c 3 | * Atul Varma - 5/24/2004 4 | * CS Independent Study 5 | * $Id: foo.c,v 1.1 2004/05/27 16:25:14 varmaa Exp $ 6 | * 7 | * This is a simple C file that should be compiled by my mini-C 8 | * compiler. 9 | ******************************************************************* 10 | */ 11 | 12 | /* Prototypes for some standard C library functions (the code 13 | calls these directly). */ 14 | extern int printf(char *str, ...); 15 | extern char *malloc(int size); 16 | extern int free(char *ptr); 17 | 18 | /* Test of extern variable. 
How many times we've called 19 | a printf() function. */ 20 | extern int stuff_count; 21 | 22 | /* Increments this global variable. */ 23 | extern int increment_stuff_count(); 24 | 25 | /* Test of global variable. How many times we've called 26 | the fib() function. */ 27 | int fib_count; 28 | 29 | /* fibonacci function: Test of basic branching and recursion. */ 30 | static int fib(int i) 31 | { 32 | fib_count += 1; 33 | if (i == 1) { 34 | return 1; 35 | } else { 36 | if (i == 0) { 37 | return 0; 38 | } else { 39 | return fib(i-1) + fib(i-2); 40 | } 41 | } 42 | } 43 | 44 | /* Just a wrapper to easily show the results of a 45 | call to fib(). */ 46 | static int show_fib(int i) 47 | { 48 | printf("fib(%d) is %d.\n", i, fib(i)); 49 | return 0; 50 | } 51 | 52 | /* Test of pointer indirection and char type. */ 53 | static int set_a(char *c) 54 | { 55 | *c = 'a'; 56 | return 0; 57 | } 58 | 59 | /* Test of string literals and returning char *'s. */ 60 | static char *get_literal() 61 | { 62 | return "blah\n"; 63 | } 64 | 65 | /* Main program that runs the tests. */ 66 | int main(int argc, char **argv) { 67 | char c; 68 | int i; 69 | 70 | c = 'h'; 71 | 72 | /* Test of multiple assignment. */ 73 | fib_count = stuff_count = 0; 74 | 75 | /* Test of command-line argument passing, pointer 76 | indirection/array indexing, for looping. */ 77 | printf("My executable name is %s.\n", *argv); 78 | for (i = 0; i < argc; i += 1) { 79 | printf(" argv[%d] is: %s " 80 | "argv[%d][0] is: %c\n", i, argv[i], i, argv[i][0]); 81 | increment_stuff_count(); 82 | } 83 | 84 | /* Test of while looping with break/continue. */ 85 | i = 0; 86 | while (1) { 87 | show_fib(i); 88 | i += 1; 89 | if (i > 5) 90 | break; 91 | else 92 | continue; 93 | } 94 | stuff_count = stuff_count * 2; 95 | 96 | printf("fib_count is %d.\n", fib_count); 97 | printf("stuff_count is %d.\n", stuff_count); 98 | 99 | printf("before set_a(&c), c == '%c'\n", c); 100 | 101 | /* Test of address-of (&) operator. 
*/ 102 | set_a(&c); 103 | 104 | { 105 | /* Test of char-int and int-char type coercion. */ 106 | int a; 107 | char b; 108 | int c; 109 | 110 | /* Note that in two's complement arithmetic, this is 111 | a 32-bit int consisting of all 1's. 112 | 113 | (This is also a test of the '-' unary operator.) */ 114 | a = -1; 115 | 116 | /* The following line will raise a warning from the 117 | compiler, because a signed 32-bit int is being truncated 118 | to an unsigned 8-bit char. */ 119 | b = a; 120 | 121 | c = b; 122 | 123 | printf(" a = %d\n", a); 124 | printf(" b = %d\n", b); 125 | printf(" c = %d\n", c); 126 | } 127 | 128 | /* Note now that the scope of c is in the function's main 129 | scope, not the scope of the above compound statement. 130 | This test makes sure that the address and contents 131 | of c did not change during the execution of the 132 | compound statement. */ 133 | printf("after set_a(&c), c == '%c'\n", c); 134 | 135 | printf("get_literal() = %s\n", get_literal()); 136 | 137 | /* Pointer indexing via array example. */ 138 | printf("get_literal()[3] = %c\n", get_literal()[3]); 139 | 140 | { 141 | /* Test of building a string using assignment via array indexing 142 | of a char pointer. The buffer is dynamically allocated. */ 143 | char *c; 144 | 145 | c = malloc(30); 146 | c[0] = 'h'; 147 | c[1] = 'i'; 148 | c[2] = 0; 149 | printf("array-built string is: %s\n", c); 150 | free(c); 151 | } 152 | return 0; 153 | } 154 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [Atul](https://github.com/toolness)'s Mini-C Compiler 2 | June 2, 2004 3 | 4 | This is a compiler for a subset of the C programming language. It was 5 | written in Python during the spring of 2004. 6 | 7 | The lexer and parser were constructed using Dave Beazley's PLY (Python 8 | Lex-Yacc), an open-source Python implementation of GNU 9 | lex/yacc. 
Stages of compilation (symbol tree generation, type 10 | checking, flow control checking, etc) are performed using an 11 | object-oriented design pattern called a visitor (GoF 1995). The output 12 | is annotated Intel 80x86 assembly, suitable for translation to machine 13 | language using the GNU Assembler (GAS). 14 | 15 | --------------------------------------------------------------- 16 | LANGUAGE FEATURES 17 | --------------------------------------------------------------- 18 | 19 | The subset of the C language implemented here includes: 20 | 21 | * Functions, variables (local and global), and character and 22 | string literals. 23 | 24 | * Assignments (=, +=, etc), standard arithmetic binary and unary 25 | operators (+,-,*, etc), logical binary and unary operators (!, 26 | ==, <, etc). 27 | 28 | * Support for the C datatypes char and int, as well as implicit 29 | type conversion between the two (warnings are raised in 30 | situations of potential data loss). int variables are assumed to 31 | be signed, and char variables are assumed to be unsigned (this 32 | is not a violation of the ANSI C standard). 33 | 34 | * Control flow elements including while and for loops, 35 | if/then/else conditionals, and recursion. 36 | 37 | * Support for the C keywords extern for functions and variables, 38 | and static for functions. 39 | 40 | * Pointers, including pointer dereferencing (the * operator), 41 | multiple levels of indirection (double pointers, triple 42 | pointers, etc), array indexing notation, and the address-of (&) 43 | operator. 44 | 45 | --------------------------------------------------------------- 46 | FILES AND DIRECTORIES 47 | --------------------------------------------------------------- 48 | 49 | lex.py - Python Lex (this is part of PLY). 50 | yacc.py - Python Yacc (this is part of PLY). 51 | clex.py - Mini-C lexer. 52 | cparse.py - Mini-C parser. Contains yacc rules for Mini-C and 53 | defines the classes that make up the AST. 
54 | cvisitors.py - Mini-C visitors. Defines the base visitor class, 55 | and concrete visitor classes for printing the AST, 56 | doing symbol table generation, type checking, and 57 | flow control. 58 | cx86.py - Intel 80x86 assembly code generator. Defines a 59 | virtual stack machine class and the code generator 60 | visitor. 61 | c.py - Front-end to the compiler. This takes in command- 62 | line options and runs the compiler on the filenames 63 | you give it. 64 | samples/ - This directory contains foo.c and foo_lib.c, two 65 | C files that can be compiled by the mini-c 66 | compiler. foo_lib.c is intended to be used as 67 | a library that foo.c accesses, to show 68 | that mini-c generates assembly that can be linked 69 | with gcc. 70 | 71 | --------------------------------------------------------------- 72 | USING THE COMPILER 73 | --------------------------------------------------------------- 74 | 75 | The syntax for using the mini-c compiler is as follows: 76 | 77 | c.py [[source-file-2] ...] [-ast] [-annotate] 78 | 79 | Source files are the C files you want to compile into assembly (.s 80 | files). 81 | 82 | The '-ast' option generates a file with extension .ast that is a 83 | printout of the abstract syntax tree for the source file, after 84 | all stages of compilation occur. 85 | 86 | The '-annotate' option generates annotated assembly. That is, 87 | assembly is generated with comments describing what each instruction 88 | does, its relevance to the original C source code, and so forth. 89 | Additional comments are inserted to delimit functions, control 90 | structures, and so forth. 91 | 92 | --------------------------------------------------------------- 93 | THE MAKEFILE 94 | --------------------------------------------------------------- 95 | 96 | The makefile just compiles the two files in the samples/ directory and 97 | outputs an executable called 'foo' into this directory (all other 98 | output files are also placed here). 
99 | 100 | Note that while compiling this, you may receive a bunch of warnings 101 | mentioning something about an "Illegal character: ''". This is just 102 | an artifact of newline translation differences between platforms and 103 | should be ignored. 104 | -------------------------------------------------------------------------------- /c.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # c.py 3 | # 4 | # Atul Varma 5 | # Python C Compiler - Front-end 6 | # $Id: c.py,v 1.3 2004/05/27 17:52:19 varmaa Exp $ 7 | # 8 | # This is the main program for the compiler, which just parses 9 | # command-line options, figures out which source files to read 10 | # and write to, and invokes the different stages of the 11 | # compiler proper. 12 | # --------------------------------------------------------------- 13 | 14 | import yacc 15 | 16 | import cparse, cvisitors, cx86 17 | 18 | import sys 19 | 20 | class Compiler: 21 | """This object encapsulates the front-end for the compiler and 22 | serves as a facade interface to the 'meat' of the compiler 23 | underneath.""" 24 | 25 | class CompileError(Exception): 26 | """Exception raised when there's been a compilation error.""" 27 | 28 | pass 29 | 30 | def __init__(self): 31 | self.total_errors = 0 32 | self.total_warnings = 0 33 | 34 | def _parse(self): 35 | """Parses the source code.""" 36 | self.ast = yacc.parse(self.code) 37 | 38 | def _compile_phase(self, visitor): 39 | """Applies a visitor to the abstract syntax tree.""" 40 | 41 | visitor.visit(self.ast) 42 | self.total_errors += visitor.errors 43 | self.total_warnings += visitor.warnings 44 | if visitor.has_errors(): 45 | raise Compiler.CompileError() 46 | 47 | def _do_compile(self, outfile, ast_file, show_comments): 48 | """Compiles the code to the given file object. 
Enabling 49 | show_ast prints out the abstract syntax tree.""" 50 | 51 | self._parse() 52 | self._compile_phase(cvisitors.SymtabVisitor()) 53 | self._compile_phase(cvisitors.TypeCheckVisitor()) 54 | self._compile_phase(cvisitors.FlowControlVisitor()) 55 | self._compile_phase(cx86.CodeGenVisitor(outfile, 56 | show_comments)) 57 | if ast_file != None: 58 | self._compile_phase(cvisitors.ASTPrinterVisitor(ast_file)) 59 | 60 | def _print_stats(self): 61 | """Prints the total number of errors/warnings from compilation.""" 62 | 63 | print "%d errors, %d warnings." % (self.total_errors, self.total_warnings) 64 | 65 | def compile(self, code, outfile, show_ast, show_comments): 66 | """Compiles the given code string to the given file object.""" 67 | 68 | self.code = code 69 | try: 70 | self._do_compile(outfile, show_ast, show_comments) 71 | except cparse.ParseError: 72 | print "Errors encountered, bailing." 73 | return 1 74 | except Compiler.CompileError: 75 | self._print_stats() 76 | print "Errors encountered, bailing." 77 | return 1 78 | self._print_stats() 79 | print "Compile successful." 80 | return 0 81 | 82 | def run_compiler(): 83 | """Runs the command-line compiler.""" 84 | 85 | if len(sys.argv) < 2: 86 | print "Usage: c.py [[source-file-2] ...] [-ast] [-annotate]" 87 | sys.exit(1) 88 | 89 | show_ast = 0 90 | show_comments = 0 91 | 92 | params = sys.argv[1:] 93 | files = sys.argv[1:] 94 | 95 | for param in params: 96 | if param[0] == '-': 97 | if param == '-ast': 98 | show_ast = 1 99 | elif param == '-annotate': 100 | print "Annotated assembly generation enabled." 101 | show_comments = 1 102 | else: 103 | print "Unknown option: %s" % param 104 | sys.exit(1) 105 | files.remove(param) 106 | 107 | for file in files: 108 | source_filename = file 109 | dest_filename = file[:-2]+'.s' 110 | print "Compiling %s -> %s." 
% (source_filename, dest_filename) 111 | open_files = [] 112 | ast_file = None 113 | if show_ast: 114 | ast_filename = file[:-2]+'.ast' 115 | print "Outputting AST to %s." % ast_filename 116 | ast_file = open(ast_filename, 'w') 117 | open_files.append(ast_file) 118 | source = open(source_filename, 'r') 119 | code = source.read() 120 | source.close() 121 | dest = open(dest_filename, 'w') 122 | open_files.append(dest) 123 | retval = Compiler().compile(code, dest, ast_file, show_comments) 124 | for f in open_files: 125 | f.close() 126 | if retval != 0: 127 | sys.exit(retval) 128 | print 129 | 130 | sys.exit(retval) 131 | 132 | if __name__ == '__main__': 133 | run_compiler() 134 | 135 | # --------------------------------------------------------------- 136 | # End of c.py 137 | # --------------------------------------------------------------- 138 | -------------------------------------------------------------------------------- /clex.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # clex.py 3 | # 4 | # Atul Varma 5 | # Python C Compiler - Lexical Analyzer 6 | # $Id: clex.py,v 1.2 2004/06/02 21:05:45 varmaa Exp $ 7 | # --------------------------------------------------------------- 8 | 9 | import lex 10 | import re 11 | 12 | # --------------------------------------------------------------- 13 | # TOKEN LIST 14 | # --------------------------------------------------------------- 15 | 16 | tokens = ( 17 | # Reserved words 18 | 'AUTO', 19 | 'BREAK', 20 | 'CASE', 21 | 'CHAR', 22 | 'CONST', 23 | 'CONTINUE', 24 | 'DEFAULT', 25 | 'DO', 26 | 'DOUBLE', 27 | 'ELSE', 28 | 'ENUM', 29 | 'EXTERN', 30 | 'FLOAT', 31 | 'FOR', 32 | 'GOTO', 33 | 'IF', 34 | 'INT', 35 | 'LONG', 36 | 'REGISTER', 37 | 'RETURN', 38 | 'SHORT', 39 | 'SIGNED', 40 | 'SIZEOF', 41 | 'STATIC', 42 | 'STRUCT', 43 | 'SWITCH', 44 | 'TYPEDEF', 45 | 'UNION', 46 | 'UNSIGNED', 47 | 'VOID', 48 | 'VOLATILE', 49 | 
'WHILE', 50 | 51 | # Special characters 52 | 'COMMA', 53 | 'COLON', 54 | 'SEMICOLON', 55 | 'LPAREN', 56 | 'RPAREN', 57 | 'LBRACKET', 58 | 'RBRACKET', 59 | 'LBRACE', 60 | 'RBRACE', 61 | 'ASSIGN', 62 | 'GREATER', 63 | 'LESS', 64 | 'EQ', 65 | 'NOT_EQ', 66 | 'GREATER_EQ', 67 | 'LESS_EQ', 68 | 'DOUBLE_PLUS', 69 | 'DOUBLE_MINUS', 70 | 'PLUS', 71 | 'MINUS', 72 | 'TIMES', 73 | 'DIV', 74 | 'MODULO', 75 | 'DOUBLE_AMPERSAND', 76 | 'DOUBLE_PIPE', 77 | 'EXCLAMATION', 78 | 'AMPERSAND', 79 | 'PIPE', 80 | 'CARET', 81 | 'ASTERISK', 82 | 'QUESTION', 83 | 'TILDE', 84 | 'POUND', 85 | 'DOT', 86 | 'ELLIPSIS', 87 | 'ARROW', 88 | 'SHIFT_LEFT', 89 | 'SHIFT_RIGHT', 90 | 'EQ_PLUS', 91 | 'EQ_MINUS', 92 | 'EQ_TIMES', 93 | 'EQ_DIV', 94 | 'EQ_MODULO', 95 | 'EQ_PIPE', 96 | 'EQ_AMPERSAND', 97 | 'EQ_CARET', 98 | 'EQ_SHIFT_LEFT', 99 | 'EQ_SHIFT_RIGHT', 100 | 101 | # Complex tokens 102 | 'ID', 103 | 'FNUMBER', 104 | 'INUMBER', 105 | 'STRING', 106 | 'CHARACTER', 107 | ) 108 | 109 | # --------------------------------------------------------------- 110 | # RESERVED WORDS 111 | # --------------------------------------------------------------- 112 | 113 | reserved_words = { 114 | 'auto' : 'AUTO', 115 | 'break' : 'BREAK', 116 | 'case' : 'CASE', 117 | 'char' : 'CHAR', 118 | 'const' : 'CONST', 119 | 'continue' : 'CONTINUE', 120 | 'default' : 'DEFAULT', 121 | 'do' : 'DO', 122 | 'double' : 'DOUBLE', 123 | 'else' : 'ELSE', 124 | 'enum' : 'ENUM', 125 | 'extern' : 'EXTERN', 126 | 'float' : 'FLOAT', 127 | 'for' : 'FOR', 128 | 'goto' : 'GOTO', 129 | 'if' : 'IF', 130 | 'int' : 'INT', 131 | 'long' : 'LONG', 132 | 'register' : 'REGISTER', 133 | 'return' : 'RETURN', 134 | 'short' : 'SHORT', 135 | 'signed' : 'SIGNED', 136 | 'sizeof' : 'SIZEOF', 137 | 'static' : 'STATIC', 138 | 'struct' : 'STRUCT', 139 | 'switch' : 'SWITCH', 140 | 'typedef' : 'TYPEDEF', 141 | 'union' : 'UNION', 142 | 'unsigned' : 'UNSIGNED', 143 | 'void' : 'VOID', 144 | 'volatile' : 'VOLATILE', 145 | 'while' : 'WHILE' 146 | } 147 | 148 | # 
--------------------------------------------------------------- 149 | # SPECIAL CHARACTERS 150 | # --------------------------------------------------------------- 151 | 152 | t_COMMA = r',' 153 | t_COLON = r':' 154 | t_SEMICOLON = r';' 155 | t_LPAREN = r'\(' 156 | t_RPAREN = r'\)' 157 | t_LBRACKET = r'\[' 158 | t_RBRACKET = r'\]' 159 | t_LBRACE = r'{' 160 | t_RBRACE = r'}' 161 | t_ASSIGN = r'=' 162 | t_GREATER = r'>' 163 | t_LESS = r'<' 164 | t_EQ = r'==' 165 | t_NOT_EQ = r'!=' 166 | t_GREATER_EQ = r'>=' 167 | t_LESS_EQ = r'<=' 168 | t_DOUBLE_PLUS = r'\+\+' 169 | t_DOUBLE_MINUS = r'--' 170 | t_PLUS = r'\+' 171 | t_MINUS = r'-' 172 | t_TIMES = r'\*' 173 | t_DIV = r'/(?!\*)' 174 | t_MODULO = r'%' 175 | t_DOUBLE_AMPERSAND = r'&&' 176 | t_DOUBLE_PIPE = r'\|\|' 177 | t_EXCLAMATION = r'!' 178 | t_AMPERSAND = r'&' 179 | t_PIPE = r'\|' 180 | t_CARET = r'^' 181 | t_ASTERISK = r'\*' 182 | t_QUESTION = r'\?' 183 | t_TILDE = r'~' 184 | t_POUND = r'\#' 185 | t_ELLIPSIS = r'\.\.\.' 186 | t_DOT = r'\.' 187 | t_ARROW = r'->' 188 | t_SHIFT_LEFT = r'<<' 189 | t_SHIFT_RIGHT = r'>>' 190 | t_EQ_PLUS = r'\+=' 191 | t_EQ_MINUS = r'-=' 192 | t_EQ_TIMES = r'\*=' 193 | t_EQ_DIV = r'/=' 194 | t_EQ_MODULO = r'%=' 195 | t_EQ_PIPE = r'\|=' 196 | t_EQ_AMPERSAND = r'&=' 197 | t_EQ_CARET = r'\^=' 198 | t_EQ_SHIFT_LEFT = r'<<=' 199 | t_EQ_SHIFT_RIGHT = r'>>=' 200 | 201 | # --------------------------------------------------------------- 202 | # COMPLEX TOKENS 203 | # --------------------------------------------------------------- 204 | 205 | def t_ID(t): 206 | r'[A-Za-z_][\w]*' 207 | if reserved_words.has_key(t.value): 208 | t.type = reserved_words[t.value] 209 | return t 210 | 211 | def t_FNUMBER(t): 212 | r'((0(?!\d))|([1-9]\d*))((\.\d+(e[+-]?\d+)?)|(e[+-]?\d+))' 213 | return t 214 | 215 | def t_malformed_fnumber(t): 216 | r'(0\d+)((\.\d+(e[+-]?\d+)?)|(e[+-]?\d+))' 217 | print "Line %d. 
Malformed floating point number '%s'" % (t.lineno, t.value) 218 | 219 | def t_INUMBER(t): 220 | r'0(?!\d)|([1-9]\d*)' 221 | return t 222 | 223 | def t_malformed_inumber(t): 224 | r'0\d+' 225 | print "Line %d. Malformed integer '%s'" % (t.lineno, t.value) 226 | 227 | def t_CHARACTER(t): 228 | r"'\w'" 229 | return t 230 | 231 | def t_STRING(t): 232 | r'"[^\n]*?(? 0: 266 | t.skip(t.value.index('\n')) 267 | elif t.value[0:2] == '/*': 268 | print "Unterminated comment." 269 | else: 270 | print "Illegal character '%s'" % t.value[0] 271 | t.skip(1) 272 | 273 | # --------------------------------------------------------------- 274 | # MAIN LEXER FUNCTIONALITY 275 | # --------------------------------------------------------------- 276 | 277 | def run_lexer(): 278 | """This is just a debugging function that prints out a list of 279 | tokens, it's not actually called by the compiler or anything.""" 280 | 281 | import sys 282 | file = open(sys.argv[1]) 283 | lines = file.readlines() 284 | file.close() 285 | strings = "" 286 | for i in lines: 287 | strings += i 288 | lex.input(strings) 289 | while 1: 290 | token = lex.token() # Get a token 291 | if not token: break # No more tokens 292 | print "(%s,'%s',%d)" % (token.type, token.value, token.lineno) 293 | 294 | lex.lex() 295 | 296 | if __name__ == '__main__': 297 | run_lexer() 298 | 299 | # --------------------------------------------------------------- 300 | # End of clex.py 301 | # --------------------------------------------------------------- 302 | -------------------------------------------------------------------------------- /cparse.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # cparse.py 3 | # 4 | # Atul Varma 5 | # Python C Compiler - Parser 6 | # $Id: cparse.py,v 1.2 2004/05/27 16:25:08 varmaa Exp $ 7 | # --------------------------------------------------------------- 8 | 9 | import yacc 10 | 11 | from clex 
import tokens 12 | 13 | # --------------------------------------------------------------- 14 | # ABSTRACT SYNTAX TREE - NODES 15 | # --------------------------------------------------------------- 16 | 17 | class Node: 18 | "Base class for all nodes on the abstract syntax tree." 19 | 20 | def is_null(self): 21 | """Returns whether the node represents a null node.""" 22 | 23 | return 0 24 | 25 | def is_const(self): 26 | """Returns whether the node is a constant numeric number 27 | (e.g., "5").""" 28 | 29 | return 0 30 | 31 | def has_address(self): 32 | """Returns whether the node has an address (i.e., is a valid 33 | lvalue).""" 34 | 35 | return self.__dict__.has_key("has_addr") 36 | 37 | def set_has_address(self): 38 | """Tells the node that has an address (is an lvalue). 39 | Ultimately, the address of the node should be placed in the 40 | output_addr attribute.""" 41 | 42 | self.has_addr = 1 43 | self.output_addr = 0 44 | 45 | def calculate(self): 46 | """Calculates the constant numeric value of the node and 47 | its subnodes, if one exists. For instance, if a node 48 | corresponds to the expression "5+3", then this method 49 | would return 8.""" 50 | 51 | return None 52 | 53 | def accept(self, visitor): 54 | """Accept method for visitor classes (see cvisitor.py).""" 55 | 56 | return self._accept(self.__class__, visitor) 57 | 58 | def _accept(self, klass, visitor): 59 | """Accept implementation. This is actually a recursive 60 | function that dynamically figures out which visitor method to 61 | call. This is done by appending the class' name to 'v', so if 62 | the node class is called MyNode, then this method tries 63 | calling visitor.vMyNode(). 
If that node doesn't exist, then 64 | it recursively attempts to call the visitor method 65 | corresponding to the class' superclass (e.g., 66 | visitor.vNode()).""" 67 | 68 | visitor_method = getattr(visitor, "v%s" % klass.__name__, None) 69 | if visitor_method == None: 70 | bases = klass.__bases__ 71 | last = None 72 | for i in bases: 73 | last = self._accept(i, visitor) 74 | return last 75 | else: 76 | return visitor_method(self) 77 | 78 | class NullNode(Node): 79 | """A null node is like a null terminator for AST's.""" 80 | 81 | def __init__(self): 82 | self.type = 'void' 83 | 84 | def is_null(self): 85 | return 1 86 | 87 | class ArrayExpression(Node): 88 | """This is an expression with array notation, like "a[5+b]".""" 89 | 90 | def __init__(self, expr, index): 91 | self.expr = expr 92 | self.index = index 93 | 94 | class StringLiteral(Node): 95 | """A string literal, e.g. the string "Hello World" in 96 | printf("Hello World").""" 97 | 98 | def __init__(self, str): 99 | self._str = str 100 | self.type = PointerType(BaseType('char')) 101 | 102 | def append_str(self, str): 103 | self._str += str 104 | 105 | def get_str(self): 106 | return self._str 107 | 108 | def get_sanitized_str(self): 109 | """Returns a 'sanitized' version of the string, converting 110 | all carriage returns to '\n' symbols, etc.""" 111 | 112 | return self._str.replace('\n', '\\n') 113 | 114 | class Id(Node): 115 | """An identifier, which can correspond to the name of 116 | a function, variable, etc...""" 117 | 118 | def __init__(self, name, lineno): 119 | self.name = name 120 | self.lineno = lineno 121 | 122 | class Const(Node): 123 | """A numeric constant (i.e., an integral literal), such as 124 | the number 5.""" 125 | 126 | def __init__(self, value, type): 127 | self.value = value 128 | self.type = type 129 | 130 | def calculate(self): 131 | return self.value 132 | 133 | def is_const(self): 134 | return 1 135 | 136 | def _get_calculated(node): 137 | """Attempts to calculate the numeric 
value of the expression, 138 | returning a Const node if it was able to convert the expression. 139 | If the expression isn't a constant expression like "5+3", then 140 | this function just returns the node unmodified.""" 141 | 142 | result = node.calculate() 143 | if result != None: 144 | result = int(result) 145 | return Const(result, BaseType('int')) 146 | else: 147 | return node 148 | 149 | class Unaryop(Node): 150 | """Any generic unary operator. This is an abstract base class.""" 151 | 152 | def __init__(self, node): 153 | self.expr = node 154 | 155 | class Negative(Unaryop): 156 | """A negative unary operator, e.g. '-5'.""" 157 | 158 | def calculate(self): 159 | val = self.expr.calculate() 160 | if val != None: 161 | return -val 162 | return None 163 | 164 | class Pointer(Unaryop): 165 | """A pointer dereference, e.g. '*a'.""" 166 | 167 | pass 168 | 169 | class AddrOf(Unaryop): 170 | """An address-of operator, e.g. '&a'.""" 171 | 172 | pass 173 | 174 | class Binop(Node): 175 | """Any binary operator, such as that for arithmetic operations 176 | (+/-/*), assignment operations (=/+=/-=), and so forth.""" 177 | 178 | # List of assignment operators. 
179 | ASSIGN_OPS = ['=', '+=', '-='] 180 | 181 | def __init__(self, left, right, op): 182 | self.left = left 183 | self.right = right 184 | self.op = op 185 | 186 | def calculate(self): 187 | left = self.left.calculate() 188 | right = self.right.calculate() 189 | if left != None and right != None: 190 | return int(eval("%d %s %d" % (left, self.op, right))) 191 | else: 192 | return None 193 | 194 | class IfStatement(Node): 195 | """An if/then/else statement.""" 196 | 197 | def __init__(self, expr, then_stmt, else_stmt): 198 | self.expr = expr 199 | self.then_stmt = then_stmt 200 | self.else_stmt = else_stmt 201 | 202 | class BreakStatement(Node): 203 | """A break statement (used while in a loop structure to bust out 204 | of it).""" 205 | 206 | pass 207 | 208 | class ContinueStatement(Node): 209 | """A continue statement (used while in a loop structure to bust 210 | back to the beginning of it).""" 211 | 212 | pass 213 | 214 | class ReturnStatement(Node): 215 | """A return statement, used to exit a function and optionally 216 | return a value.""" 217 | 218 | def __init__(self, expr): 219 | self.expr = expr 220 | 221 | class ForLoop(Node): 222 | """A for loop.""" 223 | 224 | def __init__(self, begin_stmt, expr, end_stmt, stmt): 225 | self.expr = expr 226 | self.stmt = stmt 227 | self.begin_stmt = begin_stmt 228 | self.end_stmt = end_stmt 229 | 230 | class WhileLoop(Node): 231 | """A while loop.""" 232 | 233 | def __init__(self, expr, stmt): 234 | self.expr = expr 235 | self.stmt = stmt 236 | 237 | class NodeList(Node): 238 | """A list of nodes. This is an abstract base class.""" 239 | 240 | def __init__(self, node=None): 241 | self.nodes = [] 242 | if node != None: 243 | self.nodes.append(node) 244 | 245 | def add(self, node): 246 | self.nodes.append(node) 247 | 248 | class ArgumentList(NodeList): 249 | """A list of arguments for a function expression. 
class ParamList(NodeList):
    """A list of parameters for a function prototype, e.g. the list
    'int a, char b, char c' in 'int my_func(int a, char b, char c)'."""

    def __init__(self, node=None):
        NodeList.__init__(self, node)
        # Set to 1 when the parameter list ends in '...' (i.e., the
        # prototype is variadic).
        self.has_ellipsis = 0

class StatementList(NodeList):
    """Any list of statements.  For instance, this can be the list of
    statements in a function body."""

    pass

class TranslationUnit(NodeList):
    """A list of nodes representing the program itself."""

    pass

class DeclarationList(NodeList):
    """A list of variable declarations, such as the ones put
    at the beginning of a compound statement (e.g., the beginning
    of a function body)."""

    pass

class FunctionExpression(Node):
    """An execution of a function, e.g. 'my_func(a,b,c)'."""

    def __init__(self, function, arglist):
        # function: the expression being called (e.g., an Id node).
        # arglist: the ArgumentList of actual arguments.
        self.function = function
        self.arglist = arglist

class CompoundStatement(Node):
    """A compound statement, e.g. '{ int i; i += 1; }'."""

    def __init__(self, declaration_list, statement_list):
        self.declaration_list = declaration_list
        self.statement_list = statement_list

class FunctionDefn(Node):
    """A node representing a function definition (its declaration
    and body)."""

    def __init__(self, declaration, body):
        # Copy the declaration's attributes onto this node so that a
        # FunctionDefn can stand in for a Declaration (e.g., as a
        # symbol table entry).
        self.type = declaration.type
        self.name = declaration.name
        self.extern = declaration.extern
        self.static = declaration.static
        self.body = body

class Declaration(Node):
    """A node representing a declaration of a function or
    variable."""

    def __init__(self, name, type=None):
        if type == None:
            type = NullNode()
        self.extern = 0
        self.static = 0
        self.type = type
        self.name = name
        # Set by the symbol table visitor when the symbol is actually
        # referenced somewhere.
        self.is_used = 0

    def set_base_type(self, type):
        """Set the base (innermost) type of this declaration's
        possibly-nested type.  If no type has been set yet, the given
        type becomes the whole type."""

        if self.type.is_null():
            self.type = type
        else:
            self.type.set_base_type(type)

    def add_type(self, type):
        """Wrap the declaration's current type inside the given
        (outer) type, e.g. turning 'int' into 'pointer(int)'."""

        type.set_base_type(self.type)
        self.type = type

# ---------------------------------------------------------------
# ABSTRACT SYNTAX TREE - TYPE SYSTEM
# ---------------------------------------------------------------

class Type(Node):
    """A node representing the type of another node.  For instance,
    the Binop node representing '5 + a', where a is an int, will have
    a Type node associated with it that represents the fact that
    the result of the Binop is an int.

    Types can also be nested, so that for instance you can have
    a type like 'pointer(pointer(int))' which represents a
    double-pointer to an int.

    This is an abstract base class."""

    def __init__(self, child=None):
        if child == None:
            child = NullNode()
        self.child = child

    def set_base_type(self, type):
        """Set the base (innermost) type of a type.  For instance,
        calling this with a pointer(int) type on a pointer() type
        will give you a pointer(pointer(int))."""

        if self.child.is_null():
            self.child = type
        else:
            self.child.set_base_type(type)

    def get_string(self):
        """Return a string corresponding to the type, e.g.
        'pointer(pointer(int))'."""

        raise NotImplementedError()

    def get_outer_string(self):
        """Return only the outermost type of a type.  e.g.,
        calling this on a pointer(pointer(int)) type will
        return 'pointer'."""

        raise NotImplementedError()

    def is_function(self):
        """Returns whether or not this type represents a
        function."""

        return 0

class BaseType(Type):
    """A base type representing ints, chars, etc..."""

    def __init__(self, type_str, child=None):
        Type.__init__(self, child)
        # type_str is the keyword naming the type, e.g. 'int'/'char'.
        self.type_str = type_str

    def get_string(self):
        return self.type_str

    def get_outer_string(self):
        return self.type_str

class FunctionType(Type):
    """A type representing a function (for function prototypes and
    function calls)."""

    def __init__(self, params=None, child=None):
        Type.__init__(self, child)
        if (params == None):
            params = NullNode()
        self.params = params

    def get_string(self):
        # Build ',t1,t2,...' and strip the leading comma with [1:].
        param_str = ""
        for param in self.params.nodes:
            param_str += "," + param.type.get_string()
        return "function(%s)->%s" % (param_str[1:], self.child.get_string())

    def get_outer_string(self):
        return 'function'

    def is_function(self):
        return 1

    def get_return_type(self):
        """Returns the return type of the function.  Internally,
        this is stored as the nested type within the function."""

        return self.child

    def get_params(self):
        """Returns the list of parameters for the function."""

        return self.params
class PointerType(Type):
    """A type representing a pointer to another (nested) type."""

    def get_string(self):
        return "pointer(%s)" % self.child.get_string()

    def get_outer_string(self):
        return 'pointer'

# ---------------------------------------------------------------
# PARSER GRAMMAR / AST CONSTRUCTION
#
# The only thing the yacc grammar rules do is create an
# abstract syntax tree.  Actual symbol table generation,
# type checking, flow control checking, etc. are done by
# the visitor classes (see cvisitors.py).
# ---------------------------------------------------------------

# Precedence for ambiguous grammar elements.  'ELSE' binds to the
# nearest 'IF' (resolves the classic dangling-else conflict).
precedence = (
    ('right', 'ELSE'),
)

class ParseError(Exception):
    "Exception raised whenever a parsing error occurs."

    pass

def p_translation_unit_01(t):
    '''translation_unit : external_declaration'''
    t[0] = TranslationUnit(t[1])

def p_translation_unit_02(t):
    '''translation_unit : translation_unit external_declaration'''
    t[1].add(t[2])
    t[0] = t[1]

def p_external_declaration(t):
    '''external_declaration : function_definition
                            | declaration'''
    t[0] = t[1]

def p_function_definition_01(t):
    '''function_definition : type_specifier declarator compound_statement'''
    t[2].set_base_type(t[1])
    t[0] = FunctionDefn(t[2], t[3])

def p_function_definition_02(t):
    '''function_definition : STATIC type_specifier declarator compound_statement'''
    t[3].static = 1
    t[3].set_base_type(t[2])
    t[0] = FunctionDefn(t[3], t[4])

def p_declaration_01(t):
    '''declaration : type_specifier declarator SEMICOLON'''
    # A bare function prototype is implicitly extern.
    if isinstance(t[2].type, FunctionType):
        t[2].extern = 1
    t[2].set_base_type(t[1])
    t[0] = t[2]

def p_declaration_02(t):
    '''declaration : EXTERN type_specifier declarator SEMICOLON'''
    t[3].extern = 1
    t[3].set_base_type(t[2])
    t[0] = t[3]

def p_declaration_list_opt_01(t):
    '''declaration_list_opt : empty'''
    t[0] = NullNode()

def p_declaration_list_opt_02(t):
    '''declaration_list_opt : declaration_list'''
    t[0] = t[1]

def p_declaration_list_02(t):
    '''declaration_list : declaration'''
    t[0] = DeclarationList(t[1])

def p_declaration_list_03(t):
    '''declaration_list : declaration_list declaration'''
    t[1].add(t[2])
    t[0] = t[1]

def p_type_specifier(t):
    '''type_specifier : INT
                      | CHAR'''
    t[0] = BaseType(t[1])

def p_declarator_01(t):
    '''declarator : direct_declarator'''
    t[0] = t[1]

def p_declarator_02(t):
    '''declarator : ASTERISK declarator'''
    t[2].set_base_type(PointerType())
    t[0] = t[2]
def p_direct_declarator_01(t):
    '''direct_declarator : ID'''
    t[0] = Declaration(t[1])

def p_direct_declarator_02(t):
    '''direct_declarator : direct_declarator LPAREN parameter_type_list RPAREN'''
    t[1].add_type(FunctionType(t[3]))
    t[0] = t[1]

def p_direct_declarator_03(t):
    '''direct_declarator : direct_declarator LPAREN RPAREN'''
    # A parameterless prototype gets an empty (non-null) ParamList.
    t[1].add_type(FunctionType(ParamList()))
    t[0] = t[1]

def p_parameter_type_list_01(t):
    '''parameter_type_list : parameter_list'''
    t[0] = t[1]

def p_parameter_type_list_02(t):
    '''parameter_type_list : parameter_list COMMA ELLIPSIS'''
    t[1].has_ellipsis = 1
    t[0] = t[1]

def p_parameter_list_01(t):
    '''parameter_list : parameter_declaration'''
    t[0] = ParamList(t[1])

def p_parameter_list_02(t):
    '''parameter_list : parameter_list COMMA parameter_declaration'''
    t[1].add(t[3])
    t[0] = t[1]

def p_parameter_declaration(t):
    '''parameter_declaration : type_specifier declarator'''
    # NOTE: this is the same code as p_declaration_01!
    p_declaration_01(t)

def p_compound_statement_01(t):
    '''compound_statement : LBRACE declaration_list_opt statement_list RBRACE'''
    t[0] = CompoundStatement(t[2], t[3])

def p_compound_statement_02(t):
    '''compound_statement : LBRACE declaration_list_opt RBRACE'''
    # No statements: use a NullNode for the statement list.
    t[0] = CompoundStatement(t[2], NullNode())

def p_expression_statement(t):
    '''expression_statement : expression SEMICOLON'''
    t[0] = t[1]

def p_expression_01(t):
    '''expression : equality_expression'''
    t[0] = t[1]

def p_expression_02(t):
    '''expression : equality_expression ASSIGN expression
                  | equality_expression EQ_PLUS expression
                  | equality_expression EQ_MINUS expression'''
    # Assignments are never constant-folded, so no _get_calculated()
    # here.
    t[0] = Binop(t[1], t[3], t[2])

def p_equality_expression_01(t):
    '''equality_expression : relational_expression'''
    t[0] = t[1]

def p_equality_expression_02(t):
    '''equality_expression : equality_expression EQ relational_expression
                           | equality_expression NOT_EQ relational_expression'''
    t[0] = _get_calculated(Binop(t[1], t[3], t[2]))

def p_relational_expression_01(t):
    '''relational_expression : additive_expression'''
    t[0] = t[1]

def p_relational_expression_02(t):
    '''relational_expression : relational_expression LESS additive_expression
                             | relational_expression GREATER additive_expression
                             | relational_expression LESS_EQ additive_expression
                             | relational_expression GREATER_EQ additive_expression'''
    t[0] = _get_calculated(Binop(t[1], t[3], t[2]))

def p_postfix_expression_01(t):
    '''postfix_expression : primary_expression'''
    t[0] = t[1]

def p_postfix_expression_02(t):
    '''postfix_expression : postfix_expression LPAREN argument_expression_list RPAREN'''
    t[0] = FunctionExpression(t[1], t[3])

def p_postfix_expression_03(t):
    '''postfix_expression : postfix_expression LPAREN RPAREN'''
    t[0] = FunctionExpression(t[1], ArgumentList())

def p_postfix_expression_04(t):
    '''postfix_expression : postfix_expression LBRACKET expression RBRACKET'''
    t[0] = ArrayExpression(t[1], t[3])

def p_argument_expression_list_01(t):
    '''argument_expression_list : expression'''
    t[0] = ArgumentList(t[1])

def p_argument_expression_list_02(t):
    '''argument_expression_list : argument_expression_list COMMA expression'''
    t[1].add(t[3])
    t[0] = t[1]

def p_unary_expression_01(t):
    '''unary_expression : postfix_expression'''
    t[0] = t[1]

def p_unary_expression_02(t):
    '''unary_expression : MINUS unary_expression'''
    t[0] = _get_calculated(Negative(t[2]))

def p_unary_expression_03(t):
    '''unary_expression : PLUS unary_expression'''
    # Unary '+' is a no-op.
    t[0] = t[2]

def p_unary_expression_06(t):
    # BUGFIX: this function was originally also named
    # p_unary_expression_03, which shadowed the PLUS rule above at
    # module level and made PLY silently drop the unary '+' grammar
    # rule.  Renamed so both rules are registered.
    '''unary_expression : EXCLAMATION unary_expression'''
    # horrible hack for the '!' operator... Just insert an
    # (expr == 0) into the AST.
    t[0] = _get_calculated(Binop(t[2], Const(0, BaseType('int')), '=='))
def p_unary_expression_04(t):
    '''unary_expression : ASTERISK unary_expression'''
    t[0] = Pointer(t[2])

def p_unary_expression_05(t):
    '''unary_expression : AMPERSAND unary_expression'''
    t[0] = AddrOf(t[2])

def p_mult_expression_01(t):
    '''mult_expression : unary_expression'''
    t[0] = t[1]

def p_mult_expression_02(t):
    '''mult_expression : mult_expression ASTERISK unary_expression
                       | mult_expression DIV unary_expression
                       | mult_expression MODULO unary_expression'''
    t[0] = _get_calculated(Binop(t[1], t[3], t[2]))

def p_additive_expression_01(t):
    '''additive_expression : mult_expression'''
    t[0] = t[1]

def p_additive_expression_02(t):
    '''additive_expression : additive_expression PLUS mult_expression
                           | additive_expression MINUS mult_expression'''
    t[0] = _get_calculated(Binop(t[1], t[3], t[2]))

def p_primary_expression_01(t):
    '''primary_expression : ID'''
    t[0] = Id(t[1], t.lineno(1))

def p_primary_expression_02(t):
    '''primary_expression : INUMBER'''
    t[0] = Const(int(t[1]), BaseType('int'))

def p_primary_expression_03(t):
    '''primary_expression : FNUMBER'''
    t[0] = Const(float(t[1]), BaseType('double'))

def p_primary_expression_04(t):
    '''primary_expression : CHARACTER'''
    # NOTE(review): eval() on the token text relies on C character
    # literals being valid Python string literals; it would also
    # execute arbitrary expressions in malformed input.  Consider
    # ast.literal_eval instead.
    t[0] = Const(ord(eval(t[1])), BaseType('char'))

def p_primary_expression_05(t):
    '''primary_expression : string_literal'''
    t[0] = t[1]

def p_primary_expression_06(t):
    '''primary_expression : LPAREN expression RPAREN'''
    t[0] = t[2]

def p_string_literal_01(t):
    '''string_literal : STRING'''
    # See the eval() caveat in p_primary_expression_04.
    t[0] = StringLiteral(eval(t[1]))

def p_string_literal_02(t):
    '''string_literal : string_literal STRING'''
    # Adjacent string literals are concatenated, as in C.
    t[1].append_str(eval(t[2]))
    t[0] = t[1]

def p_statement(t):
    '''statement : compound_statement
                 | expression_statement
                 | selection_statement
                 | iteration_statement
                 | jump_statement'''
    t[0] = t[1]

def p_jump_statement_01(t):
    '''jump_statement : RETURN SEMICOLON'''
    t[0] = ReturnStatement(NullNode())

def p_jump_statement_02(t):
    '''jump_statement : RETURN expression SEMICOLON'''
    t[0] = ReturnStatement(t[2])

def p_jump_statement_03(t):
    '''jump_statement : BREAK SEMICOLON'''
    t[0] = BreakStatement()

def p_jump_statement_04(t):
    '''jump_statement : CONTINUE SEMICOLON'''
    t[0] = ContinueStatement()

def p_iteration_statement_01(t):
    '''iteration_statement : WHILE LPAREN expression RPAREN statement'''
    t[0] = WhileLoop(t[3], t[5])

def p_iteration_statement_02(t):
    '''iteration_statement : FOR LPAREN expression_statement expression_statement expression RPAREN statement'''
    t[0] = ForLoop(t[3], t[4], t[5], t[7])

def p_selection_statement_01(t):
    '''selection_statement : IF LPAREN expression RPAREN statement'''
    t[0] = IfStatement(t[3], t[5], NullNode())

def p_selection_statement_02(t):
    '''selection_statement : IF LPAREN expression RPAREN statement ELSE statement'''
    t[0] = IfStatement(t[3], t[5], t[7])

def p_statement_list_02(t):
    '''statement_list : statement'''
    t[0] = StatementList(t[1])

def p_statement_list_03(t):
    '''statement_list : statement_list statement'''
    t[1].add(t[2])
    t[0] = t[1]

def p_empty(t):
    'empty :'
    pass

def p_error(t):
    print "You've got a syntax error somewhere in your code."
    print "It could be around line %d." % t.lineno
    print "Good luck finding it."
    raise ParseError()

# Build the parser table at import time.
yacc.yacc(debug=1)

# ---------------------------------------------------------------
# End of cparse.py
# ---------------------------------------------------------------

# =============== file: cvisitors.py ===============
# ---------------------------------------------------------------
# cvisitors.py
#
# Atul Varma
# Python C Compiler - Visitors
# $Id: cvisitors.py,v 1.3 2004/05/27 17:51:47 varmaa Exp $
#
# The Visitor is a pattern outlined in "Design Patterns" by
# Gamma et al., used here to encapsulate different parts of parsing
# and compilation into separate classes via a mechanism called
# double dispatching.
#
# In this compiler, the yacc grammar rules in cparse.py just create
# the abstract syntax tree, and visitors do the bulk of parsing
# and compilation.
# ---------------------------------------------------------------

# TODO: make it so functions can return void.
# TODO: mark all statements with an 'ignore return value' flag
#       to enable some optimizations if the statement is an
#       expression.
# TODO: move extern, static indicators in functions to their
#       Type object, maybe.
#
# Possible things to do:
#   Add compilation to JVM/python bytecode/z-machine...
#   Implement arrays
#   Pass line numbers to constructors for nodes
#
# Faults so far:
#   * doesn't check for variable initialization before use.
#   * const number ranges aren't being checked.

import cparse

class Visitor:
    """The base visitor class.  This is an abstract base class."""

    def __init__(self):
        # Running counts of diagnostics emitted by this visitor.
        self.warnings = 0
        self.errors = 0

    def _visitList(self, list):
        """Visit a list of nodes.  'list' should be an actual list,
        not a cparse.NodeList object.  Returns the result of visiting
        the last node."""

        last = None
        for i in list:
            last = i.accept(self)
        return last

    def visit(self, node):
        """Visits the given node by telling the node to call the
        visitor's class-specific visitor method for that node's
        class (i.e., double dispatching)."""

        return node.accept(self)

    def warning(self, str):
        """Output a non-fatal compilation warning."""

        print "warning: %s" % str
        self.warnings += 1

    def error(self, str):
        """Output a fatal compilation error."""

        print "error: %s" % str
        self.errors += 1

    def has_errors(self):
        """Returns whether the visitor has encountered any
        errors."""

        return self.errors > 0
# ---------------------------------------------------------------
# ABSTRACT SYNTAX TREE PRINTER (for debugging)
# ---------------------------------------------------------------

class ASTPrinterVisitor(Visitor):
    """Simple visitor that outputs a textual representation of
    the abstract syntax tree, for debugging purposes, to an
    output file."""

    def __init__(self, ast_file, indent_amt=2):
        # ast_file: a writable file-like object to print the AST to.
        # indent_amt: number of spaces per nesting level.
        self.ast_file = ast_file
        Visitor.__init__(self)
        self._indent = 0
        self._indent_amt = indent_amt

    def indent(self):
        self._indent += self._indent_amt

    def unindent(self):
        self._indent -= self._indent_amt

    def p(self, str):
        """Write one line at the current indentation level.

        BUGFIX: this used to emit ' ' * (self._indent_amt *
        self._indent) spaces, but indent() already advances _indent
        by _indent_amt, so each level was indented indent_amt**2
        spaces instead of indent_amt."""

        self.ast_file.write(
            (' ' * self._indent) + str + "\n")

    def pNodeInfo(self, node):
        # Print out the name of the node's class.
        self.p('+ ' + node.__class__.__name__)

        # If the node has a type associated with it,
        # print the string of the type.
        if node.__dict__.has_key("type"):
            self.p("  Type-string: %s" % node.type.get_string())

        # Find all attributes of the node that are ints or
        # strings and aren't 'private' (i.e., don't begin with
        # '_'), and print their values.
        for key in node.__dict__.keys():
            if key[0] == '_':
                continue
            val = node.__dict__[key]
            if (isinstance(val, str) or
                isinstance(val, int)):
                self.p("  %s: %s" % (key, str(val)))

    def pSubnodeInfo(self, subnode, label):
        """Print a labeled, indented sub-tree, unless the subnode is
        the null node."""

        if not subnode.is_null():
            self.p("  %s:" % label)
            self.indent()
            subnode.accept(self)
            self.unindent()

    def vNullNode(self, node):
        self.pNodeInfo(node)

    def vArrayExpression(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.expr, "Expression")
        self.pSubnodeInfo(node.index, "Index")

    def vStringLiteral(self, node):
        self.pNodeInfo(node)
        self.p('  Value: "%s"' % node.get_sanitized_str())

    def vId(self, node):
        self.pNodeInfo(node)

    def vUnaryop(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.expr, "Expression")

    def vFunctionExpression(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.function, "Function")
        self.pSubnodeInfo(node.arglist, "Arguments")

    def vConst(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.type, "Type")

    def vBinop(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.left, "Left operand")
        self.pSubnodeInfo(node.right, "Right operand")

    def vNodeList(self, node):
        self.pNodeInfo(node)
        self.indent()
        self._visitList(node.nodes)
        self.unindent()

    def vCompoundStatement(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.declaration_list, "Declaration list")
        self.pSubnodeInfo(node.statement_list, "Statement list")

    def vBaseType(self, node):
        self.pNodeInfo(node)

    def vFunctionType(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.params, "Parameters:")
        self.pSubnodeInfo(node.child, "Child:")

    def vPointerType(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.child, "Child:")

    def vDeclaration(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.type, "Type")

    def vReturnStatement(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.expr, "Expression")

    def vFunctionDefn(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.type, "Type")
        self.pSubnodeInfo(node.body, "Body")

    def vIfStatement(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.expr, "Expression")
        self.pSubnodeInfo(node.then_stmt, "Then statement")
        self.pSubnodeInfo(node.else_stmt, "Else statement")

    def vWhileLoop(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.expr, "Expression")
        self.pSubnodeInfo(node.stmt, "Statement")

    def vForLoop(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.begin_stmt, "Begin statement")
        self.pSubnodeInfo(node.expr, "Test expression")
        self.pSubnodeInfo(node.end_stmt, "End statement")
        self.pSubnodeInfo(node.stmt, "Statement")
# ---------------------------------------------------------------
# SYMBOL TABLE GENERATION
# ---------------------------------------------------------------

class Symtab:
    """A symbol table.  This is a simple object that just keeps a
    hashtable of symbol names and the Declaration or FunctionDefn
    nodes that they refer to.

    There is a separate symbol table for each code element that
    has its own scope (for instance, each compound statement will
    have its own symbol table).  As a result, symbol tables can
    be nested if the code elements are nested, and symbol table
    lookups will recurse upwards through parents to represent
    lexical scoping rules."""

    class SymbolDefinedError(Exception):
        """Exception raised when the code tries to add a symbol
        to a table where the symbol has already been defined.
        Note that 'defined' is used in the C sense here--i.e.,
        'space has been allocated for the symbol', as opposed
        to a declaration."""

        pass

    class SymbolConflictError(Exception):
        """Exception raised when the code tries to add a
        symbol to a table where the symbol already exists
        and its type differs from the previously existing
        one."""

        pass

    def __init__(self, parent=None):
        """Creates an empty symbol table with the given
        parent symbol table."""

        self.entries = {}
        self.parent = parent
        if self.parent != None:
            self.parent.children.append(self)
        self.children = []

    def add(self, name, value):
        """Adds a symbol with the given value to the symbol table.
        The value is usually an AST node that represents the
        declaration or definition of a function/variable (e.g.,
        Declaration or FunctionDefn).

        Redefinition is only allowed when the existing entry is
        extern and has an identical type string; otherwise
        SymbolDefinedError/SymbolConflictError is raised."""

        if self.entries.has_key(name):
            if not self.entries[name].extern:
                raise Symtab.SymbolDefinedError()
            elif self.entries[name].type.get_string() != \
                 value.type.get_string():
                raise Symtab.SymbolConflictError()
        self.entries[name] = value

    def get(self, name):
        """Retrieves the symbol with the given name from the symbol
        table, recursing upwards through parent symbol tables if it is
        not found in the current one.  Returns None if the symbol
        does not exist in any enclosing scope."""

        if self.entries.has_key(name):
            return self.entries[name]
        else:
            if self.parent != None:
                return self.parent.get(name)
            else:
                return None

class SymtabVisitor(Visitor):
    """Visitor that creates and attaches symbol tables to the AST."""

    def push_symtab(self, node):
        """Pushes a new symbol table onto the visitor's symbol table
        stack and attaches this symbol table to the given node.  This
        is used whenever a new lexical scope is encountered, so the
        node is usually a CompoundStatement object."""

        self.curr_symtab = Symtab(self.curr_symtab)
        node.symtab = self.curr_symtab

    def pop_symtab(self):
        """Pops a symbol table off the visitor's symbol table stack.
        This is used whenever a new lexical scope is exited."""

        self.curr_symtab = self.curr_symtab.parent

    def vNode(self, node):
        # Default: nothing to do for nodes with no identifiers.
        pass

    def vArrayExpression(self, node):
        node.expr.accept(self)
        node.index.accept(self)

    def vFunctionExpression(self, node):
        node.function.accept(self)
        node.arglist.accept(self)

    def vId(self, node):
        # Resolve the identifier against the current scope chain and
        # link the Id node to its declaration.
        symbol = self.curr_symtab.get(node.name)
        if symbol != None:
            node.symbol = symbol
            node.symbol.is_used = 1
            node.set_has_address()
        else:
            self.error("Line %d: Unknown identifier '%s'." % (node.lineno, node.name))

    def vUnaryop(self, node):
        node.expr.accept(self)

    def vBinop(self, node):
        node.left.accept(self)
        node.right.accept(self)

    def vNodeList(self, node):
        self._visitList(node.nodes)

    def vParamList(self, node):
        # Assign a number to each parameter.  This will later be
        # useful for the code generation phase.
        #
        # TODO: might be best to just move this to the code
        # generation phase, since this doesn't have anything to
        # do with symbol table generation.
        param_num = 0
        for param in node.nodes:
            param.accept(self)
            param.param_num = param_num
            param_num += 1

    def vTranslationUnit(self, node):
        # The translation unit owns the root (global) symbol table.
        self.root_symtab = Symtab()
        self.curr_symtab = self.root_symtab
        self.vNodeList(node)
        node.symtab = self.root_symtab

    def vCompoundStatement(self, node):
        self.push_symtab(node)
        node.declaration_list.accept(self)
        node.statement_list.accept(self)
        self.pop_symtab()

    def _add_symbol(self, node):
        """Attempts to add a symbol for the given node to the current
        symbol table, catching any exceptions that occur and printing
        errors if necessary."""

        try:
            self.curr_symtab.add(node.name, node)
        except Symtab.SymbolDefinedError:
            self.error("Symbol '%s' already defined." % node.name)
        except Symtab.SymbolConflictError:
            self.error("Symbol '%s' has multiple differing declarations." % node.name)

    def vDeclaration(self, node):
        self._add_symbol(node)

    def vReturnStatement(self, node):
        node.expr.accept(self)

    def vFunctionType(self, node):
        node.params.accept(self)

    def vFunctionDefn(self, node):
        # The function name goes into the enclosing scope; its
        # parameters and body get a new nested scope.
        self._add_symbol(node)
        self.push_symtab(node)
        node.type.accept(self)
        node.body.accept(self)
        self.pop_symtab()

    def vIfStatement(self, node):
        node.expr.accept(self)
        node.then_stmt.accept(self)
        node.else_stmt.accept(self)

    def vWhileLoop(self, node):
        node.expr.accept(self)
        node.stmt.accept(self)

    def vForLoop(self, node):
        node.begin_stmt.accept(self)
        node.expr.accept(self)
        node.end_stmt.accept(self)
        node.stmt.accept(self)
# ---------------------------------------------------------------
# TYPE CHECKING
# ---------------------------------------------------------------

class TypeCheckVisitor(Visitor):
    """Visitor that performs type checking on the AST, attaching a
    Type object subclass to every eligible node and making sure these
    types don't conflict."""

    def _process_conditional(self, expr):
        """Does simple type checking for an expression that is
        supposed to be the expression for a conditional
        statement (e.g., the conditional clause of an if/then
        statement or a loop)."""

        if expr.type.get_outer_string() not in ['int', 'char']:
            self.error("Conditional expression doesn't evaluate to an int/char/etc.")

    def _coerce_consts(self, var1, var2):
        """Looks at two typed terminals to see if one of them
        is a constant integral.  If it is, then coerce it to
        the type of the other terminal.

        Note that both terminals cannot be constant integrals, or else
        they would have already been reduced to one node by the node's
        calculate() method in the parsing stage."""

        if var1.is_const():
            self._coerce_const(var1, var2.type)
        elif var2.is_const():
            self._coerce_const(var2, var1.type)

    def _coerce_const(self, var, type):
        """If the given typed terminal is a constant, coerces it to
        the given type."""

        if var.is_const() and type.get_string() in ['int', 'char']:
            var.type = type

    def _check_const_range(self, var, type):
        """Checks the given integral constant to make sure its value
        is within the bounds of the given type."""

        val = var.value
        # BUGFIX: this called the nonexistent type.get_outside_string()
        # (Type only defines get_outer_string()), which would raise an
        # AttributeError as soon as this method was used.
        type_str = type.get_outer_string()
        # TODO: implement this!
        if type_str == 'char':
            pass
        elif type_str == 'int':
            pass

    def _compare_types(self, name_str, from_type, to_type, raise_errors=1):
        """Compares the two types to see if it's possible to perform a
        binary operation on them.  If it is not, then the appropriate
        errors/warnings are raised, unless raise_errors is set to
        0, in which case the conflict level (0/WARNING/ERROR) is
        returned instead."""

        WARNING = 1
        ERROR = 2
        conflict = 0
        from_str = from_type.get_string()
        to_str = to_type.get_string()
        if (from_str != to_str):
            if from_str == 'char':
                if to_str == 'int':
                    # char -> int widening is always safe.
                    pass
                else:
                    conflict = ERROR
            elif from_str == 'int':
                if to_str == 'char':
                    # int -> char narrowing may lose data.
                    conflict = WARNING
                else:
                    conflict = ERROR
            else:
                conflict = ERROR
        if not raise_errors:
            return conflict
        if conflict == WARNING:
            self.warning("%s: Conversion from %s to %s may result in data loss." % (name_str, from_str, to_str))
        elif conflict == ERROR:
            self.error("%s: Cannot convert from %s to %s." % (name_str, from_str, to_str))

    def vNode(self, node):
        pass

    def vId(self, node):
        node.type = node.symbol.type

    def vNegative(self, node):
        node.expr.accept(self)
        node.type = node.expr.type
        # TODO: check to make sure expr is a signed type?

    def vAddrOf(self, node):
        node.expr.accept(self)
        if not node.expr.has_address():
            self.error("Address-of (&) target has no address!")
        else:
            node.expr.output_addr = 1
            node.type = cparse.PointerType(node.expr.type)

    def vPointer(self, node):
        node.expr.accept(self)
        if node.expr.type.get_outer_string() == 'pointer':
            node.type = node.expr.type.child
            node.set_has_address()
        else:
            self.error("Pointer dereference (*) target is not a pointer!")

    def vBinop(self, node):
        node.left.accept(self)
        node.right.accept(self)
        if node.op in cparse.Binop.ASSIGN_OPS:
            if not node.left.has_address():
                self.error("Invalid lvalue: not an address!")
            node.left.output_addr = 1
            self._coerce_const(node.right, node.left.type)
            # TODO: re-implement this!
            # elif node.left.symbol.is_constant:
            #     self.error("Invalid lvalue: lvalue is constant!")
            self._compare_types("Assignment", node.right.type, node.left.type)
            node.right.coerce_to_type = node.left.type
            node.type = node.left.type
        else:
            # TODO: not sure if this results in the ANSI C
            # specification for binary operand type coercion.
            self._coerce_consts(node.left, node.right)
            left_conflicts = self._compare_types("", node.right.type, node.left.type, raise_errors=0)
            right_conflicts = self._compare_types("", node.left.type, node.right.type, raise_errors=0)
            # Coerce in whichever direction produces the milder
            # conflict, then report on that direction.
            if left_conflicts < right_conflicts:
                from_node = node.right
                to_node = node.left
            else:
                from_node = node.left
                to_node = node.right
            self._compare_types("Binop '%s'" % node.op, from_node.type, to_node.type)
            from_node.coerce_to_type = to_node.type
            to_node.coerce_to_type = to_node.type
            node.type = to_node.type

    def vNodeList(self, node):
        self._visitList(node.nodes)

    def vCompoundStatement(self, node):
        node.statement_list.accept(self)

    def vReturnStatement(self, node):
        node.expr.accept(self)
        return_type = self.curr_func.type.get_return_type()
        self._coerce_const(node.expr, return_type)
        self._compare_types("Return expression", node.expr.type, return_type)
        node.expr.coerce_to_type = return_type

    def vArrayExpression(self, node):
        node.expr.accept(self)
        node.index.accept(self)
        if node.index.type.get_outer_string() not in ['int', 'char']:
            self.error("Array index is not an int or char!")
        elif node.expr.type.get_outer_string() != 'pointer':
            self.error("Array expression is not a pointer!")
        else:
            node.type = node.expr.type.child
            node.set_has_address()

    def vFunctionExpression(self, node):
        node.function.accept(self)
        if not node.function.type.is_function():
            self.error("Target of function expression is not a function!")
            # ROBUSTNESS FIX: bail out here -- the code below would
            # raise an AttributeError calling get_return_type() /
            # get_params() on a non-function type.
            return
        node.type = node.function.symbol.type.get_return_type()
        node.arglist.accept(self)
        params = node.function.symbol.type.get_params()
        num_args = len(node.arglist.nodes)
        num_params = len(params.nodes)
        if (not params.has_ellipsis) and (num_args > num_params):
            self.error("Too many arguments passed to function.")
        elif num_args < num_params:
            self.error("Too few arguments passed to function.")
        for arg, param in zip(node.arglist.nodes, params.nodes):
            self._coerce_const(arg, param.type)
            self._compare_types("Function call argument", arg.type, param.type)
            arg.coerce_to_type = param.type
        # If this function takes a variable number of args and
        # we've got more args than required parameters, we need
        # to set some of the extra arguments' field(s) properly.
        if (params.has_ellipsis) and (num_args > num_params):
            for arg in node.arglist.nodes[num_params:]:
                arg.coerce_to_type = arg.type

    def vFunctionDefn(self, node):
        # Remember the enclosing function so return statements can
        # check against its return type.
        self.curr_func = node
        node.body.accept(self)

    def vIfStatement(self, node):
        node.expr.accept(self)
        self._process_conditional(node.expr)
        node.then_stmt.accept(self)
        node.else_stmt.accept(self)

    def vWhileLoop(self, node):
        node.expr.accept(self)
        self._process_conditional(node.expr)
        node.stmt.accept(self)

    def vForLoop(self, node):
        node.begin_stmt.accept(self)
        node.expr.accept(self)
        self._process_conditional(node.expr)
        node.end_stmt.accept(self)
        node.stmt.accept(self)
This makes sure 620 | that functions return properly through all branches, that 621 | break/continue statements are only present within loops, and so 622 | forth.""" 623 | 624 | def vNode(self, node): 625 | node.has_return_stmt = 0 626 | 627 | def vStatementList(self, node): 628 | node.has_return_stmt = 0 629 | for stmt in node.nodes: 630 | if node.has_return_stmt: 631 | self.warning("Function %s has at least one unreachable statement." % self.curr_func.name) 632 | stmt.accept(self) 633 | if stmt.has_return_stmt: 634 | node.has_return_stmt = 1 635 | 636 | def vTranslationUnit(self, node): 637 | self._visitList(node.nodes) 638 | 639 | def vWhileLoop(self, node): 640 | old_in_loop = self.in_loop 641 | self.in_loop = 1 642 | node.stmt.accept(self) 643 | self.in_loop = old_in_loop 644 | node.has_return_stmt = node.stmt.has_return_stmt 645 | 646 | def vForLoop(self, node): 647 | self.vWhileLoop(node) 648 | 649 | def vBreakStatement(self, node): 650 | node.has_return_stmt = 0 651 | if not self.in_loop: 652 | self.error("Break statement outside of loop.") 653 | 654 | def vContinueStatement(self, node): 655 | node.has_return_stmt = 0 656 | if not self.in_loop: 657 | self.error("Continue statement outside of loop.") 658 | 659 | def vIfStatement(self, node): 660 | node.then_stmt.accept(self) 661 | node.else_stmt.accept(self) 662 | if node.then_stmt.has_return_stmt and node.else_stmt.has_return_stmt: 663 | node.has_return_stmt = 1 664 | else: 665 | node.has_return_stmt = 0 666 | 667 | def vFunctionDefn(self, node): 668 | self.curr_func = node 669 | self.in_loop = 0 670 | node.body.accept(self) 671 | if not node.body.has_return_stmt: 672 | self.warning("Function %s doesn't return through all branches." 
% node.name) 673 | 674 | def vReturnStatement(self, node): 675 | node.has_return_stmt = 1 676 | 677 | def vCompoundStatement(self, node): 678 | node.statement_list.accept(self) 679 | node.has_return_stmt = node.statement_list.has_return_stmt 680 | 681 | # --------------------------------------------------------------- 682 | # End of cvisitors.py 683 | # --------------------------------------------------------------- 684 | -------------------------------------------------------------------------------- /lex.py: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------------------------- 2 | # ply: lex.py 3 | # 4 | # Author: David M. Beazley (beazley@cs.uchicago.edu) 5 | # Department of Computer Science 6 | # University of Chicago 7 | # Chicago, IL 60637 8 | # 9 | # Copyright (C) 2001, David M. Beazley 10 | # 11 | # $Header: /cygdrive/c/prog/CVS/mini_c/lex.py,v 1.1.1.1 2004/05/27 06:40:38 varmaa Exp $ 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 26 | # 27 | # See the file COPYING for a complete copy of the LGPL. 
28 | # 29 | # 30 | # This module automatically constructs a lexical analysis module from regular 31 | # expression rules defined in a user-defined module. The idea is essentially the same 32 | # as that used in John Aycock's Spark framework, but the implementation works 33 | # at the module level rather than requiring the use of classes. 34 | # 35 | # This module tries to provide an interface that is closely modeled after 36 | # the traditional lex interface in Unix. It also differs from Spark 37 | # in that: 38 | # 39 | # - It provides more extensive error checking and reporting if 40 | # the user supplies a set of regular expressions that can't 41 | # be compiled or if there is any other kind of a problem in 42 | # the specification. 43 | # 44 | # - The interface is geared towards LALR(1) and LR(1) parser 45 | # generators. That is tokens are generated one at a time 46 | # rather than being generated in advanced all in one step. 47 | # 48 | # There are a few limitations of this module 49 | # 50 | # - The module interface makes it somewhat awkward to support more 51 | # than one lexer at a time. Although somewhat inelegant from a 52 | # design perspective, this is rarely a practical concern for 53 | # most compiler projects. 54 | # 55 | # - The lexer requires that the entire input text be read into 56 | # a string before scanning. I suppose that most machines have 57 | # enough memory to make this a minor issues, but it makes 58 | # the lexer somewhat difficult to use in interactive sessions 59 | # or with streaming data. 60 | # 61 | #----------------------------------------------------------------------------- 62 | 63 | r""" 64 | lex.py 65 | 66 | This module builds lex-like scanners based on regular expression rules. 
To use the module, simply write a collection of regular expression rules
and actions like this:

# lexer.py
import lex

# Define a list of valid tokens
tokens = (
    'IDENTIFIER', 'NUMBER', 'PLUS', 'MINUS'
    )

# Define tokens as functions
def t_IDENTIFIER(t):
    r' [a-zA-Z_](\w|_)* '
    return t

def t_NUMBER(t):
    r' \d+ '
    return t

# Some simple tokens with no actions
t_PLUS = r'\+'
t_MINUS = r'-'

# Initialize the lexer
lex.lex()

The tokens list is required and contains a complete list of all valid
token types that the lexer is allowed to produce.  Token types are
restricted to be valid identifiers.  This means that 'MINUS' is a valid
token type whereas '-' is not.

Rules are defined by writing a function with a name of the form
t_rulename.  Each rule must accept a single argument which is
a token object generated by the lexer. This token has the following
attributes:

    t.type   = type string of the token.  This is initially set to the
               name of the rule without the leading t_
    t.value  = The value of the lexeme.
    t.lineno = The value of the line number where the token was encountered

For example, the t_NUMBER() rule above might be called with the following:

    t.type  = 'NUMBER'
    t.value = '42'
    t.lineno = 3

Each rule returns the token object it would like to supply to the
parser.  In most cases, the token t is returned with few, if any
modifications.  To discard a token for things like whitespace or
comments, simply return nothing.  For instance:

def t_whitespace(t):
    r' \s+ '
    pass

For faster lexing, you can also define this in terms of the ignore set like this:

t_ignore = ' \t'

The characters in this string are ignored by the lexer.
Use of this feature can speed 129 | up parsing significantly since scanning will immediately proceed to the next token. 130 | 131 | lex requires that the token returned by each rule has an attribute 132 | t.type. Other than this, rules are free to return any kind of token 133 | object that they wish and may construct a new type of token object 134 | from the attributes of t (provided the new object has the required 135 | type attribute). 136 | 137 | If illegal characters are encountered, the scanner executes the 138 | function t_error(t) where t is a token representing the rest of the 139 | string that hasn't been matched. If this function isn't defined, a 140 | LexError exception is raised. The .text attribute of this exception 141 | object contains the part of the string that wasn't matched. 142 | 143 | The t.skip(n) method can be used to skip ahead n characters in the 144 | input stream. This is usually only used in the error handling rule. 145 | For instance, the following rule would print an error message and 146 | continue: 147 | 148 | def t_error(t): 149 | print "Illegal character in input %s" % t.value[0] 150 | t.skip(1) 151 | 152 | Of course, a nice scanner might wish to skip more than one character 153 | if the input looks very corrupted. 154 | 155 | The lex module defines a t.lineno attribute on each token that can be used 156 | to track the current line number in the input. The value of this 157 | variable is not modified by lex so it is up to your lexer module 158 | to correctly update its value depending on the lexical properties 159 | of the input language. To do this, you might write rules such as 160 | the following: 161 | 162 | def t_newline(t): 163 | r' \n+ ' 164 | t.lineno += t.value.count("\n") 165 | 166 | To initialize your lexer so that it can be used, simply call the lex.lex() 167 | function in your rule file. 
If there are any errors in your 168 | specification, warning messages or an exception will be generated to 169 | alert you to the problem. 170 | 171 | (dave: this needs to be rewritten) 172 | To use the newly constructed lexer from another module, simply do 173 | this: 174 | 175 | import lex 176 | import lexer 177 | plex.input("position = initial + rate*60") 178 | 179 | while 1: 180 | token = plex.token() # Get a token 181 | if not token: break # No more tokens 182 | ... do whatever ... 183 | 184 | Assuming that the module 'lexer' has initialized plex as shown 185 | above, parsing modules can safely import 'plex' without having 186 | to import the rule file or any additional imformation about the 187 | scanner you have defined. 188 | """ 189 | 190 | # ----------------------------------------------------------------------------- 191 | 192 | 193 | __version__ = "1.4" 194 | 195 | import re, types, sys, copy 196 | 197 | # Exception thrown when invalid token encountered and no default 198 | class LexError(Exception): 199 | def __init__(self,message,s): 200 | self.args = (message,) 201 | self.text = s 202 | 203 | # Token class 204 | class LexToken: 205 | def __str__(self): 206 | return "LexToken(%s,%r,%d)" % (self.type,self.value,self.lineno) 207 | def __repr__(self): 208 | return str(self) 209 | def skip(self,n): 210 | try: 211 | self._skipn += n 212 | except AttributeError: 213 | self._skipn = n 214 | 215 | # ----------------------------------------------------------------------------- 216 | # Lexer class 217 | # 218 | # input() - Store a new string in the lexer 219 | # token() - Get the next token 220 | # ----------------------------------------------------------------------------- 221 | 222 | class Lexer: 223 | def __init__(self): 224 | self.lexre = None # Master regular expression 225 | self.lexdata = None # Actual input data (as a string) 226 | self.lexpos = 0 # Current position in input text 227 | self.lexlen = 0 # Length of the input text 228 | self.lexindexfunc 
= [ ] # Reverse mapping of groups to functions and types 229 | self.lexerrorf = None # Error rule (if any) 230 | self.lextokens = None # List of valid tokens 231 | self.lexignore = None # Ignored characters 232 | self.lineno = 1 # Current line number 233 | self.debug = 0 # Debugging mode 234 | self.optimize = 0 # Optimized mode 235 | self.token = self.errtoken 236 | 237 | def __copy__(self): 238 | c = Lexer() 239 | c.lexre = self.lexre 240 | c.lexdata = self.lexdata 241 | c.lexpos = self.lexpos 242 | c.lexlen = self.lexlen 243 | c.lenindexfunc = self.lexindexfunc 244 | c.lexerrorf = self.lexerrorf 245 | c.lextokens = self.lextokens 246 | c.lexignore = self.lexignore 247 | c.lineno = self.lineno 248 | c.optimize = self.optimize 249 | c.token = c.realtoken 250 | 251 | # ------------------------------------------------------------ 252 | # input() - Push a new string into the lexer 253 | # ------------------------------------------------------------ 254 | def input(self,s): 255 | if not isinstance(s,types.StringType): 256 | raise ValueError, "Expected a string" 257 | self.lexdata = s 258 | self.lexpos = 0 259 | self.lexlen = len(s) 260 | self.token = self.realtoken 261 | 262 | # Change the token routine to point to realtoken() 263 | global token 264 | if token == self.errtoken: 265 | token = self.token 266 | 267 | # ------------------------------------------------------------ 268 | # errtoken() - Return error if token is called with no data 269 | # ------------------------------------------------------------ 270 | def errtoken(self): 271 | raise RuntimeError, "No input string given with input()" 272 | 273 | # ------------------------------------------------------------ 274 | # token() - Return the next token from the Lexer 275 | # 276 | # Note: This function has been carefully implemented to be as fast 277 | # as possible. 
Don't make changes unless you really know what 278 | # you are doing 279 | # ------------------------------------------------------------ 280 | def realtoken(self): 281 | # Make local copies of frequently referenced attributes 282 | lexpos = self.lexpos 283 | lexlen = self.lexlen 284 | lexignore = self.lexignore 285 | lexdata = self.lexdata 286 | 287 | while lexpos < lexlen: 288 | # This code provides some short-circuit code for whitespace, tabs, and other ignored characters 289 | if lexdata[lexpos] in lexignore: 290 | lexpos += 1 291 | continue 292 | 293 | # Look for a regular expression match 294 | m = self.lexre.match(lexdata,lexpos) 295 | if m: 296 | i = m.lastindex 297 | lexpos = m.end() 298 | tok = LexToken() 299 | tok.value = m.group() 300 | tok.lineno = self.lineno 301 | tok.lexer = self 302 | func,tok.type = self.lexindexfunc[i] 303 | if not func: 304 | self.lexpos = lexpos 305 | return tok 306 | 307 | # If token is processed by a function, call it 308 | self.lexpos = lexpos 309 | newtok = func(tok) 310 | self.lineno = tok.lineno # Update line number 311 | 312 | # Every function must return a token, if nothing, we just move to next token 313 | if not newtok: continue 314 | 315 | # Verify type of the token. If not in the token map, raise an error 316 | if not self.optimize: 317 | if not self.lextokens.has_key(newtok.type): 318 | raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( 319 | func.func_code.co_filename, func.func_code.co_firstlineno, 320 | func.__name__, newtok.type),lexdata[lexpos:]) 321 | 322 | return newtok 323 | 324 | # No match. Call t_error() if defined. 325 | if self.lexerrorf: 326 | tok = LexToken() 327 | tok.value = self.lexdata[lexpos:] 328 | tok.lineno = self.lineno 329 | tok.type = "error" 330 | tok.lexer = self 331 | oldpos = lexpos 332 | newtok = self.lexerrorf(tok) 333 | lexpos += getattr(tok,"_skipn",0) 334 | if oldpos == lexpos: 335 | # Error method didn't change text position at all. This is an error. 
336 | self.lexpos = lexpos 337 | raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) 338 | if not newtok: continue 339 | self.lexpos = lexpos 340 | return newtok 341 | 342 | self.lexpos = lexpos 343 | raise LexError, ("No match found", lexdata[lexpos:]) 344 | 345 | # No more input data 346 | self.lexpos = lexpos + 1 347 | return None 348 | 349 | 350 | # ----------------------------------------------------------------------------- 351 | # validate_file() 352 | # 353 | # This checks to see if there are duplicated t_rulename() functions or strings 354 | # in the parser input file. This is done using a simple regular expression 355 | # match on each line in the filename. 356 | # ----------------------------------------------------------------------------- 357 | 358 | def validate_file(filename): 359 | import os.path 360 | base,ext = os.path.splitext(filename) 361 | if ext != '.py': return 1 # No idea what the file is. Return OK 362 | 363 | try: 364 | f = open(filename) 365 | lines = f.readlines() 366 | f.close() 367 | except IOError: 368 | return 1 # Oh well 369 | 370 | fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(') 371 | sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=') 372 | counthash = { } 373 | linen = 1 374 | noerror = 1 375 | for l in lines: 376 | m = fre.match(l) 377 | if not m: 378 | m = sre.match(l) 379 | if m: 380 | name = m.group(1) 381 | prev = counthash.get(name) 382 | if not prev: 383 | counthash[name] = linen 384 | else: 385 | print "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev) 386 | noerror = 0 387 | linen += 1 388 | return noerror 389 | 390 | # ----------------------------------------------------------------------------- 391 | # _read_lextab(module) 392 | # 393 | # Reads lexer table from a lextab file instead of using introspection. 
# -----------------------------------------------------------------------------

def _read_lextab(lexer, fdict, module):
    """Populate 'lexer' from a previously generated lextab module,
    resolving rule-function names back to objects via fdict."""
    # NOTE(review): 'module' is interpolated into an exec statement --
    # must only ever be a trusted, caller-controlled module name.
    exec "import %s as lextab" % module
    lexer.lexre = re.compile(lextab._lexre, re.VERBOSE)
    lexer.lexindexfunc = lextab._lextab
    for i in range(len(lextab._lextab)):
        t = lexer.lexindexfunc[i]
        if t:
            if t[0]:
                # Table stores function *names*; swap in the live objects.
                lexer.lexindexfunc[i] = (fdict[t[0]],t[1])
    lexer.lextokens = lextab._lextokens
    lexer.lexignore = lextab._lexignore
    if lextab._lexerrorf:
        lexer.lexerrorf = fdict[lextab._lexerrorf]

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(module=None,debug=0,optimize=0,lextab="lextab"):
    ldict = None
    regex = ""
    error = 0
    files = { }
    lexer = Lexer()
    lexer.debug = debug
    lexer.optimize = optimize
    # lex() rebinds the module-level token()/input() convenience functions.
    global token,input

    if module:
        # User supplied a module object.
        if isinstance(module, types.ModuleType):
            ldict = module.__dict__
        elif isinstance(module, types.InstanceType):
            _items = [(k,getattr(module,k)) for k in dir(module)]
            ldict = { }
            for (i,v) in _items:
                ldict[i] = v
        else:
            raise ValueError,"Expected a module or instance"

    else:
        # No module given.  We might be able to get information from the caller.
        # Raise-and-catch is used purely to obtain a traceback whose frame
        # chain leads back to the caller's globals.
        try:
            raise RuntimeError
        except RuntimeError:
            e,b,t = sys.exc_info()
            f = t.tb_frame
            f = f.f_back           # Walk out to our calling function
            ldict = f.f_globals    # Grab its globals dictionary

    if optimize and lextab:
        # Fast path: reuse a previously written lextab module if present.
        try:
            _read_lextab(lexer,ldict, lextab)
            if not lexer.lexignore: lexer.lexignore = ""
            token = lexer.token
            input = lexer.input
            return lexer

        except ImportError:
            pass

    # Get the tokens map
    if (module and isinstance(module,types.InstanceType)):
        tokens = getattr(module,"tokens",None)
    else:
        try:
            tokens = ldict["tokens"]
        except KeyError:
            tokens = None

    if not tokens:
        raise SyntaxError,"lex: module does not define 'tokens'"
    if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)):
        raise SyntaxError,"lex: tokens must be a list or tuple."

    # Build a dictionary of valid token names
    lexer.lextokens = { }
    if not optimize:

        # Utility function for verifying tokens
        def is_identifier(s):
            for c in s:
                if not (c.isalnum() or c == '_'): return 0
            return 1

        for n in tokens:
            if not is_identifier(n):
                print "lex: Bad token name '%s'" % n
                error = 1
            if lexer.lextokens.has_key(n):
                print "lex: Warning. Token '%s' multiply defined." % n
            lexer.lextokens[n] = None
    else:
        for n in tokens: lexer.lextokens[n] = None


    if debug:
        print "lex: tokens = '%s'" % lexer.lextokens.keys()

    # Get a list of symbols with the t_ prefix
    tsymbols = [f for f in ldict.keys() if f[:2] == 't_']

    # Now build up a list of functions and a list of strings
    fsymbols = [ ]
    ssymbols = [ ]
    for f in tsymbols:
        if callable(ldict[f]):
            fsymbols.append(ldict[f])
        elif isinstance(ldict[f], types.StringType):
            ssymbols.append((f,ldict[f]))
        else:
            print "lex: %s not defined as a function or string" % f
            error = 1

    # Sort the functions by line number
    fsymbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno))

    # Sort the strings by regular expression length
    # (descending, so longer patterns are tried first by the master regex).
    ssymbols.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))

    # Check for non-empty symbols
    if len(fsymbols) == 0 and len(ssymbols) == 0:
        raise SyntaxError,"lex: no rules of the form t_rulename are defined."

    # Add all of the rules defined with actions first
    for f in fsymbols:

        line = f.func_code.co_firstlineno
        file = f.func_code.co_filename
        files[file] = None

        ismethod = isinstance(f, types.MethodType)

        if not optimize:
            nargs = f.func_code.co_argcount
            if ismethod:
                reqargs = 2
            else:
                reqargs = 1
            if nargs > reqargs:
                print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__)
                error = 1
                continue

            if nargs < reqargs:
                print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__)
                error = 1
                continue

            if f.__name__ == 't_ignore':
                print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__)
                error = 1
                continue

        if f.__name__ == 't_error':
            lexer.lexerrorf = f
            continue

        if f.__doc__:
            if not optimize:
                try:
                    c = re.compile(f.__doc__, re.VERBOSE)
                except re.error,e:
                    print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e)
                    error = 1
                    continue

                if debug:
                    print "lex: Adding rule %s -> '%s'" % (f.__name__,f.__doc__)

            # Okay. The regular expression seemed okay.  Let's append it to the master regular
            # expression we're building

            if (regex): regex += "|"
            regex += "(?P<%s>%s)" % (f.__name__,f.__doc__)
        else:
            print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__)

    # Now add all of the simple rules
    for name,r in ssymbols:

        if name == 't_ignore':
            lexer.lexignore = r
            continue

        if not optimize:
            if name == 't_error':
                raise SyntaxError,"lex: Rule 't_error' must be defined as a function"
                # NOTE(review): the two statements below are unreachable
                # (they follow an unconditional raise) -- dead code kept
                # as-is from the original.
                error = 1
                continue

            if not lexer.lextokens.has_key(name[2:]):
                print "lex: Rule '%s' defined for an unspecified token %s." % (name,name[2:])
                error = 1
                continue
            try:
                c = re.compile(r,re.VERBOSE)
            except re.error,e:
                print "lex: Invalid regular expression for rule '%s'. %s" % (name,e)
                error = 1
                continue
            if debug:
                print "lex: Adding rule %s -> '%s'" % (name,r)

        if regex: regex += "|"
        regex += "(?P<%s>%s)" % (name,r)

    if not optimize:
        for f in files.keys():
            if not validate_file(f):
                error = 1
    try:
        if debug:
            print "lex: regex = '%s'" % regex
        lexer.lexre = re.compile(regex, re.VERBOSE)

        # Build the index to function map for the matching engine
        lexer.lexindexfunc = [ None ] * (max(lexer.lexre.groupindex.values())+1)
        for f,i in lexer.lexre.groupindex.items():
            handle = ldict[f]
            if type(handle) in (types.FunctionType, types.MethodType):
                lexer.lexindexfunc[i] = (handle,handle.__name__[2:])
            else:
                # If rule was specified as a string, we build an anonymous
                # callback function to carry out the action
                lexer.lexindexfunc[i] = (None,f[2:])

        # If a lextab was specified, we create a file containing the precomputed
        # regular expression and index table

        if lextab and optimize:
            lt = open(lextab+".py","w")
            lt.write("# %s.py.  This file automatically created by PLY. Don't edit.\n" % lextab)
            lt.write("_lexre = %s\n" % repr(regex))
            lt.write("_lextab = [\n");
            for i in range(0,len(lexer.lexindexfunc)):
                t = lexer.lexindexfunc[i]
                if t:
                    if t[0]:
                        lt.write("  ('%s',%s),\n"% (t[0].__name__, repr(t[1])))
                    else:
                        lt.write("  (None,%s),\n" % repr(t[1]))
                else:
                    lt.write("  None,\n")

            lt.write("]\n");
            lt.write("_lextokens = %s\n" % repr(lexer.lextokens))
            lt.write("_lexignore = %s\n" % repr(lexer.lexignore))
            if (lexer.lexerrorf):
                lt.write("_lexerrorf = %s\n" % repr(lexer.lexerrorf.__name__))
            else:
                lt.write("_lexerrorf = None\n")
            lt.close()

    except re.error,e:
        print "lex: Fatal error. Unable to compile regular expression rules. %s" % e
        error = 1
    if error:
        raise SyntaxError,"lex: Unable to build lexer."
    if not lexer.lexerrorf:
        print "lex: Warning. no t_error rule is defined."

    if not lexer.lexignore: lexer.lexignore = ""

    # Create global versions of the token() and input() functions
    token = lexer.token
    input = lexer.input

    return lexer

# -----------------------------------------------------------------------------
# run()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------

def runmain(lexer=None,data=None):
    """Drive the lexer over a file (argv[1]) or stdin and print tokens."""
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            print "Reading from standard input (type EOF to end):"
            data = sys.stdin.read()

    # Prefer an explicitly supplied lexer; otherwise fall back to the
    # module-level input()/token() bound by the last lex() call.
    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok: break
        print "(%s,'%s',%d)" % (tok.type, tok.value, tok.lineno)

# ---------------------------------------------------------------
# cx86.py
#
# Atul Varma
# Python C Compiler - Intel x86 Code Generator
# $Id: cx86.py,v 1.3 2004/06/02 21:05:23 varmaa Exp $
# ---------------------------------------------------------------

import cparse
from cvisitors import Visitor

# ---------------------------------------------------------------
# CONSTANTS
# ---------------------------------------------------------------

# Size of the 'int' type.
INT_SIZE = 4

# Size of the 'char' type.
CHAR_SIZE = 1

# The machine's word size.  Note that making this different
# from INT_SIZE may cause serious problems.
WORD_SIZE = 4

# This is a strange multiplier that needs to be used in the allocation
# of global variables for the GNU Assembler.  Not sure exactly what it
# represents.
WEIRD_MULTIPLIER = 4

# ---------------------------------------------------------------
# STACK MACHINE ABSTRACTION
# ---------------------------------------------------------------

class x86Registers:
    """This class attempts to abstract the x86 registers into a stack
    machine.  Calling push() gives you a register that isn't currently
    in use by the stack machine, pop() gives you a register with the
    value of the most recently pushed element.

    Through this method the stack machine can be used to compute
    values the same way a reverse polish notation (RPN) calculator
    does.

    When push() and pop() are called, it may be the case that no
    registers are currently available; if this happens, the least
    recently used register is 'spilled' into a temporary local
    variable on the process' stack and freed for use.  Note that the
    process' stack is not to be confused with this stack machine
    abstraction--the two are completely different entities.

    Currently, push() and pop() also implement a little bit of
    implicit type conversion, so they take as parameters a cparse.Type
    object; currently conversion is done between char and int types,
    so depending on the pushed and popped types, some type conversion
    assembly code may be generated.

    Finally, an additional method, done(), should be called whenever
    the stack machine is done popping values for the current
    operation.  This is because when pop is called, the returned
    register is not immediately made 'free' for another call to pop or
    push.  If this were the case, then the following situation could
    occur:

        rightOp.calc()     # calc val of right op, put on stack
        leftOp.calc()      # calc val of left op, put on stack
        l = leftOp.pop()   # pop left val from stack
        r = rightOp.pop()  # pop right val from stack
        output('addl %s, %s' % (r, l))

    The problem with this approach is that we don't know how many
    registers will be used by leftOp's calc() method--it may use all
    the remaining registers, in which case the value that rightOp's
    calc() method put on the stack is no longer stored in a register.
    If leftOp.pop() returned register %eax and immediately marked the
    %eax register as being 'free for use', then the call to
    rightOp.pop() could very well generate code that moves rightOp's
    value from a temporary variable into %eax, thereby overwriting
    leftOp's value!

    So, instead, the pop() method places the %eax register (in this
    example) into an internal list of 'almost free' registers;
    registers that have just been returned by pop() but shouldn't be
    used by the stack machine until a call to done() is made.  The
    done() method simply moves the registers in the 'almost free' list
    over to the 'free' list."""

    def __init__(self, parent, base_fp):
        # A list of all registers on the machine.
        self.all_regs = ['%ebx','%esi','%edi','%eax','%ecx','%edx']

        # A list of the registers currently free.  Note that this
        # is a *copy* of the list of all registers on the machine.
        self.regs_free = self.all_regs[:]

        # A list of all the registers that are "almost" free
        # (see the docstring for this class).
        self.regs_almost_free = []

        # A list of all the temporary variable memory locations
        # that are currently unused.
        self.mem_free = []

        # A list corresponding to the actual stack of the stack
        # machine.  The item at the top of the stack is the
        # last element of this list.
        self.stack = []

        # A list that stores the Type objects of each corresponding
        # element on the stack machine's stack.  e.g., type_stack[0]
        # represents the type of the element at stack[0].
        self.type_stack = []

        # The location of the next memory location to be used for
        # temporary variables, relative to the current function's
        # frame pointer.
        self.next_temp = base_fp - WORD_SIZE

        # The parent CodeGenVisitor object of this stack machine.
        self.parent = parent

        # A list of the callee-save registers that have been used
        # so far by this function.  Once processing is finished,
        # these registers will be pushed onto the process' stack
        # at the beginning of the function and popped off just
        # before the function terminates.
        self.callee_save_regs_used = []

        # A list of the caller-save registers on the machine.
        self.caller_save_regs = ['%eax', '%ecx', '%edx']

        # A list of the callee-save registers on the machine.
        self.callee_save_regs = ['%ebx', '%esi', '%edi']

        # A list of the registers on the machine that have
        # sub-registers allowing access to their low-order bytes.
        self.byte_compat_regs = ['%eax', '%ebx', '%ecx', '%edx']

        # The default type of an element that is pushed onto
        # the stack machine without a 'type' object passed.
        self.default_type = cparse.BaseType('int')

    def o(self, str, comment=None):
        """Wrapper for the parent CodeGenVisitor's o() method."""

        self.parent.o(str, comment)

    def save_caller_saves(self):
        """Saves the caller-save registers, which should be done
        before the current function makes a function call, so that
        the registers don't get corrupted by the called function.

        Normally, this is done by pushing the caller-save registers
        onto the stack just before the function call is made and
        popping them off afterwards; however, due to the workings of
        this particular stack machine it's much easier to just move
        the contents of the caller-save registers, if they are
        currently being used, into temporary variables."""

        for reg in self.caller_save_regs:
            if reg not in self.regs_free:
                self._copy_reg_to_temp([reg],
                                       "Save caller-save register to temp")
                self.regs_free.append(reg)

    def save_callee_saves(self):
        """Emits code that pushes the callee-save registers used by
        the stack machine onto the process' stack."""

        for reg in self.callee_save_regs_used:
            self.o("    pushl %s" % reg,
                   "Save callee-save register")

    def load_callee_saves(self):
        """Emits code that pops the callee-save registers used by
        the stack machine off the process' stack."""

        for reg in self.callee_save_regs_used:
            self.o("    popl %s" % reg,
                   "Restore callee-save register")

    def _copy_reg_to_temp(self, valid_regs, comment_str=None):
        """Copy the least recently used register on the stack into a
        temporary variable.  The register must be in the valid_regs
        list."""

        # if no free temp variables exist,
        # create a new one.
        # NOTE(review): the body of this method continues past the end of
        # this chunk of the file dump; the remainder is not visible here.
189 | if len(self.mem_free) == 0: 190 | self.mem_free.append("%d(%%ebp)" % self.next_temp) 191 | self.next_temp -= WORD_SIZE 192 | 193 | # get an unused temp var 194 | mem = self.mem_free.pop() 195 | 196 | # find the least recently used register on the stack 197 | reg = None 198 | index = 0 199 | for i in self.stack: 200 | if i in valid_regs: 201 | reg = i 202 | break 203 | index += 1 204 | if reg == None: 205 | raise Exception("No free registers inside OR outside of stack!") 206 | 207 | # emit code to copy the register to the memory location. 208 | if comment_str == None: 209 | comment_str = "Stack machine: copy register to temp" 210 | self.o(" movl %s, %s" % (reg, mem), 211 | comment_str) 212 | 213 | # Modify the element's stack machine position to reflect 214 | # its new location. 215 | self.stack[index] = mem 216 | return reg 217 | 218 | def _get_free_reg(self, valid_regs, preferred_reg=None): 219 | """Returns a free register that is in the valid_regs list. If 220 | no registers are available, the most least-recently used 221 | eligible one is freed (by moving its contents to a temporary 222 | variable) and returned.""" 223 | 224 | # If we have a register free, return it. 225 | if len(self.regs_free) > 0: 226 | reg = None 227 | if preferred_reg != None and preferred_reg in self.regs_free: 228 | reg = preferred_reg 229 | else: 230 | for r in self.regs_free: 231 | if r in valid_regs: 232 | reg = r 233 | if reg != None: 234 | self.regs_free.remove(reg) 235 | # If this register is a callee-save register that 236 | # we haven't used before, add it to our list 237 | # of used callee-save registers. 238 | if reg in self.callee_save_regs and reg not in self.callee_save_regs_used: 239 | self.callee_save_regs_used.append(reg) 240 | return reg 241 | # copy a register into a temp var and return the register. 
242 | return self._copy_reg_to_temp(valid_regs) 243 | 244 | def _get_type_valid_regs(self, type): 245 | """Returns the valid registers that an element of the given 246 | type can occupy. For instance, 8-bit chars should only be 247 | placed in %eax/%ebx/%ecx/%edx because these are the only 248 | registers with low-order byte sub-registers 249 | (%al/%bl/%cl/%dl).""" 250 | 251 | type_str = type.get_outer_string() 252 | if type_str == 'char': 253 | return self.byte_compat_regs 254 | elif type_str in ['int', 'pointer']: 255 | return self.all_regs 256 | 257 | def push(self, type=None, preferred_reg=None, valid_regs=None): 258 | """Finds a free eligible register (or frees one if all are 259 | being used) and returns it, pushing the register onto the 260 | stack machine's stack. 261 | 262 | This method associates the stack entry with the given Type 263 | object; if none is supplied, then an 'int' type is used 264 | by default. 265 | 266 | If preferred_reg is passed, this function will try its 267 | best to return preferred_reg, if it's available.""" 268 | 269 | if type == None: 270 | type = self.default_type 271 | self.type_stack.append(type) 272 | if valid_regs == None: 273 | valid_regs = self._get_type_valid_regs(type) 274 | reg = self._get_free_reg(valid_regs, preferred_reg) 275 | self.stack.append(reg) 276 | return reg 277 | 278 | def _coerce_type(self, curr_reg, from_type, to_type): 279 | """Attempts to coerce the element in the current register 280 | from the given type to the given type.""" 281 | 282 | from_str = from_type.get_outer_string() 283 | to_str = to_type.get_outer_string() 284 | comment_str = "Implicit cast: %s -> %s" % (from_str, to_str) 285 | if from_str == to_str: 286 | return curr_reg 287 | if from_str == 'char': 288 | if to_str == 'int': 289 | return curr_reg 290 | elif from_str == 'int': 291 | if to_str == 'char': 292 | self.o(" movzbl %s, %s" % (self.lo(curr_reg), 293 | curr_reg), 294 | comment_str) 295 | return curr_reg 296 | 297 | def pop(self, 
    def _pop(self, valid_regs):
        """Pops the top element of the stack into a free register
        that is also in valid_regs and returns the register name.  If
        no registers are free, the least recently used one is first
        copied into a temporary variable and then used."""

        loc = self.stack.pop()

        # If the top of the stack is already an eligible register,
        # just park it on the almost-free list and return it.
        if loc in valid_regs:
            self.regs_almost_free.append(loc)
            return loc

        # Otherwise, copy the temp variable (or ineligible register)
        # at the top of the stack into a free eligible register,
        # possibly spilling something else to make room.
        reg = self._get_free_reg(valid_regs)
        self.o("        movl %s, %s" % (loc, reg),
               "Stack machine: copy temp to register")

        # If our old location was a register (just not an eligible
        # one), it is now free for use.
        if loc in self.all_regs:
            self.regs_free.append(loc)

        self.regs_almost_free.append(reg)
        return reg

    def peek(self):
        """Returns the top element of the stack without popping it.
        Note that this is not guaranteed to be a register; it could
        be a memory location!"""

        return self.stack[-1]

    def is_empty(self):
        """Returns whether the stack machine is empty."""

        return len(self.stack) == 0

    def done(self):
        """Frees all registers that are marked as being in
        intermediate use (i.e., have been pop()'d)."""

        self.regs_free.extend(self.regs_almost_free)
        self.regs_almost_free = []

    def get_max_fp(self):
        """Returns the lowest frame-pointer-relative offset the stack
        machine has used for temporary spill slots (<= 0)."""

        # next_temp points at the *next* slot to allocate, so the
        # deepest slot actually used is one WORD_SIZE above it.
        return self.next_temp + WORD_SIZE

    def lo(self, reg):
        """Returns the low-order byte sub-register of the given
        register, e.g. lo('%eax') == '%al'.  Immediates ('$...') pass
        through unchanged; a non-byte-compatible register raises.

        Raises an Exception for registers outside byte_compat_regs."""

        if reg[0] == '$':
            return reg
        if reg not in self.byte_compat_regs:
            raise Exception("Register %s is not byte-compatible!" % reg)
        # '%eax'[2] == 'a'  ->  '%al'
        return '%' + reg[2] + 'l'

    def force_type_change(self, type):
        """Forces a type change of the top element of the stack."""

        self.type_stack[-1] = type

# ---------------------------------------------------------------
#  CODE GENERATOR
# ---------------------------------------------------------------

class CodeGenVisitor(Visitor):
    """Visitor that generates x86 assembly code for the abstract
    syntax tree."""

    def __init__(self, file, show_comments=0):
        """Constructor.  'file' is the file object to output the
        resulting code to.  If 'show_comments' is true, then
        annotated comments are produced for the generated assembly
        code."""

        Visitor.__init__(self)

        # The current jump-label number (emitted as '.L0', '.L1'...).
        self.__label = 0

        # Current label number for generating string literal labels.
        self.__str_literal_label = 0

        # Accumulated assembly for string literal definitions,
        # appended to the output at the end of the translation unit.
        self.__str_literal_str = ""

        # Whether we should show comments or not.
        self.show_comments = show_comments

        # The file we're outputting the generated code to.
        self.file = file

        # Maps binary operators to their assembly instructions.
        # Some entries are just the 'base' instruction and get a
        # size suffix appended later ('addl' for 32-bit ints,
        # 'addb' for 8-bit ints, etc.); the code adds the
        # appropriate suffixes on its own.
        self.binop_instructions = \
            { '==' : 'sete',
              '!=' : 'setne',
              '>=' : 'setge',
              '<=' : 'setle',
              '>'  : 'setg',
              '<'  : 'setl',
              '+'  : 'add',
              '-'  : 'sub',
              '*'  : 'imul',
              '='  : 'mov'
              }

        # Windows' C linkage prepends a '_' before symbol names,
        # whereas Unix doesn't.  This is particularly critical if
        # the source file links to external libraries that we're
        # not compiling.  Figure out which one to use here.
        import sys
        if sys.platform == 'win32':
            self.symbol_prepend = "_"
        else:
            self.symbol_prepend = ""

    def new_label(self):
        """Generate a new jump label and return it."""

        label = ".L%d" % self.__label
        self.__label += 1
        return label

    def o(self, str, comment=None):
        """Output a line of assembly code to the output buffer, with
        an optional annotated comment (if comments are enabled).
        Empty strings are suppressed unless carrying a comment."""

        if self.show_comments and comment != None:
            comment = "# %s" % comment
            self.curr_str += "%-35s %s\n" % (str, comment)
        else:
            if str == "":
                return
            self.curr_str += str + "\n"

    def c(self, str, indent_amt=2):
        """Output a single-line comment to the output buffer, if
        comments are enabled."""

        indent = " " * indent_amt

        if self.show_comments:
            self.o("\n%s# %s\n" % (indent, str))

    def vNodeList(self, node):
        # Generic node list: just visit each child in order.
        self._visitList(node.nodes)

    def _empty_stack(self, node):
        """Pops the top value from the stack machine's stack and
        discards it.  Used when a statement is also an expression
        (e.g. 'a = b + 1;') whose return value was pushed but has no
        consumer.

        Raises an Exception if the stack is still non-empty after
        the discard (an internal invariant violation)."""

        # If the statement was also an expression, its return value
        # is still on the stack; throw it away.
        if not self.stack.is_empty():
            self.stack.pop(node.type)
            self.stack.done()
        if not self.stack.is_empty():
            raise Exception("PANIC! Register stack isn't empty!")
    def _accept_and_empty_stack(self, node):
        """Visit the node and then empty the stack machine of the
        node's return value, if one exists."""

        node.accept(self)
        self._empty_stack(node)

    def vStatementList(self, node):
        for n in node.nodes:
            self._accept_and_empty_stack(n)

    def _generate_global_variable_definitions(self, node):
        """Generate and return the global variable definition block
        (as a string).  Also assigns each global symbol its
        compile_loc (its linker name).  Functions and extern symbols
        get a name but no storage here."""

        globals_str = ".global_vars:\n"
        for symbol in node.symtab.entries.values():
            symbol.compile_loc = self.symbol_prepend + symbol.name
            if not symbol.type.is_function() and not symbol.extern:
                # NOTE: storage is over-allocated by WEIRD_MULTIPLIER
                # (see its definition) for the GNU Assembler.
                globals_str += "        .comm %s,%d\n" % \
                               (symbol.compile_loc, \
                                self._calc_var_size(symbol.type)*WEIRD_MULTIPLIER)
        return globals_str

    def vTranslationUnit(self, node):
        """Outputs the entire assembly source file."""

        self.curr_str = ""
        self.o("# Generated by c.py")
        self.o("# Atul Varma (Spring 2004)\n")
        self.o("        .text")

        # Assign global compile_locs *before* generating code that
        # references them.
        globals_str = self._generate_global_variable_definitions(node)

        # Generate the main code.
        self._visitList(node.nodes)

        # Append global variable definitions.
        self.o(globals_str)

        # Append string literal definitions.
        self.o(self.__str_literal_str)

        # Output the entire file.
        self.file.write(self.curr_str)

    def _calc_var_size(self, type):
        """Calculate and return the size of the given type, in
        bytes."""

        type_str = type.get_outer_string()
        if type_str == "int":
            return INT_SIZE
        elif type_str == "char":
            return CHAR_SIZE
        elif type_str == "pointer":
            return WORD_SIZE
        else:
            self.error("Unknown type: %s" % type_str)

    def _calc_var_align(self, type):
        """Calculate and return the alignment of the given type,
        in bytes.  (Alignment equals size for all supported types.)"""

        return self._calc_var_size(type)

    def _calc_function_var_addrs(self, symtab, last_fp_loc):
        """Calculate the addresses of all local variables in the
        function and attach them to their respective symbols in the
        function's symbol table(s).  Returns the lowest (most
        negative) frame-pointer offset used."""

        self._calc_function_arg_addrs(symtab)
        # children[0] is the symbol table of the function body's
        # outermost scope.
        return self._calc_local_var_addrs(symtab.children[0], last_fp_loc)

    def _calc_function_arg_addrs(self, symtab):
        """Calculate the addresses of all the arguments passed to
        the function (positive %ebp offsets past the saved %ebp and
        the return address)."""

        for symbol in symtab.entries.values():
            symbol.compile_loc = "%d(%%ebp)" % (WORD_SIZE*2+(symbol.param_num*WORD_SIZE))
            if not symbol.is_used:
                self.warning("function argument '%s' is never used." % symbol.name)

    def _calc_local_var_addrs(self, symtab, last_fp_loc):
        """Calculate the locations of all local variables defined in
        the function's body and all nested scopes therein, assigning
        each symbol a negative %ebp offset.

        This assumes a 'worst-case' scenario where all branches and
        nested scopes execute, so space for all locals is allocated
        up-front; however, sibling scopes that cannot coexist (e.g.
        the two arms of an if/else) overlap in memory.

        Returns the lowest (most negative) offset used, rounded down
        to keep the stack word-aligned."""

        for symbol in symtab.entries.values():
            if symbol.extern:
                # Externs live at their linker name, not on the stack.
                symbol.compile_loc = self.symbol_prepend + symbol.name
                continue
            last_fp_loc -= self._calc_var_size(symbol.type)

            # Adjust the location downward for alignment.
            align = self._calc_var_align(symbol.type)
            bytes_overboard = (-last_fp_loc) % align
            if bytes_overboard != 0:
                last_fp_loc -= (align - bytes_overboard)

            symbol.compile_loc = "%d(%%ebp)" % last_fp_loc
            if not symbol.is_used:
                self.warning("local variable '%s' is never used." % symbol.name)

        # Recurse into child scopes; each starts at this scope's
        # floor, and we keep the deepest extent of any of them.
        max_last_fp = last_fp_loc
        for kid in symtab.children:
            curr_last_fp = self._calc_local_var_addrs(kid, last_fp_loc)
            if curr_last_fp < max_last_fp:
                max_last_fp = curr_last_fp

        # Keep the stack aligned on a word-sized boundary.
        align = self._calc_var_align(cparse.PointerType())
        bytes_overboard = (-max_last_fp) % align
        if bytes_overboard != 0:
            max_last_fp -= (align - bytes_overboard)

        return max_last_fp
    def _fill_line(self, str, width=70):
        """Pads a string to the given width with the '-' character,
        separated by a single space."""

        extra = "-" * (width-1-len(str))
        return str + " " + extra

    def vFunctionDefn(self, node):
        """Output the assembly code for a function definition:
        prologue, body, and epilogue.

        The body is generated first into a *separate* string so that
        the frame-allocation instruction (whose size is only known
        after the body has used the stack machine) and the
        callee-save pushes can be emitted before it."""

        self.break_labels = []
        self.continue_labels = []
        self.curr_func_end_label = self.new_label() + "_function_end"

        # Base size of the stack frame (not including space for the
        # stack machine's temporary spill slots).
        stack_frame_size = self._calc_function_var_addrs(node.symtab, 0)

        line = self._fill_line("BEGIN FUNCTION: %s()" % node.name)
        self.c("%s\n"
               "#\n"
               "# Function type: %s" %
               (line, node.type.get_string()), 0)

        # Only non-static functions are visible to the linker.
        if not node.static:
            self.o("        .global %s" % node.compile_loc)
        self.o("%s:" % node.compile_loc)
        self.o("        pushl %ebp", "Save old frame pointer")
        self.o("        movl %esp, %ebp", "Set new frame pointer")

        # Create a new stack machine for this function.
        self.stack = x86Registers(self, stack_frame_size)

        # Generate the body into a separate string (see docstring).
        old_str = self.curr_str
        self.curr_str = ""

        node.body.accept(self)

        function_str = self.curr_str
        self.curr_str = old_str

        # Final frame size, now including the stack machine's temps;
        # get_max_fp() is <= 0, hence the negation.
        if self.stack.get_max_fp() != 0:
            self.o("        subl $%d, %%esp" % (-self.stack.get_max_fp()),
                   "Allocate space for local+temp vars")

        # Save any callee-save registers that may have been used.
        self.stack.save_callee_saves()

        # Splice in the previously-generated body.
        self.curr_str += function_str

        self.o("%s:" % self.curr_func_end_label)

        # Restore any callee-save registers that may have been used.
        self.stack.load_callee_saves()
        self.o("        movl %ebp, %esp", "Deallocate stack frame")
        self.o("        popl %ebp", "Restore old stack frame")
        self.o("        ret\n")

        line = self._fill_line("END FUNCTION: %s()" % node.name)
        self.c(line, 0)

    def vCompoundStatement(self, node):
        node.statement_list.accept(self)

    def vIfStatement(self, node):
        """Emit an if/else: test the condition, jump to the else
        clause (or past everything) when zero."""

        done_label = self.new_label() + "_done"
        if not node.else_stmt.is_null():
            else_label = self.new_label() + "_else"
        else:
            # No else clause: a false condition jumps straight past.
            else_label = done_label

        self.c("IF statment - begin")

        node.expr.accept(self)
        comparer = self.stack.pop()
        self.stack.done()
        self.o("        testl %s, %s" % (comparer, comparer), "Test the result")
        self.o("        jz %s" % else_label,
               "If result is zero, jump to else clause")
        self.c("IF statment - THEN clause - begin")
        self._accept_and_empty_stack(node.then_stmt)
        self.c("IF statment - THEN clause - end")
        self.o("        jmp %s" % done_label)
        if not node.else_stmt.is_null():
            self.c("IF statment - ELSE clause - begin")
            self.o("%s:" % else_label)
            self._accept_and_empty_stack(node.else_stmt)
            self.c("IF statment - ELSE clause - end")
        self.o("%s:" % done_label)

        self.c("IF statment - end")

    def _push_loop_labels(self, break_label, continue_label):
        """Pushes new values of labels to jump to for 'break' and
        'continue' statements (loops may nest)."""

        self.break_labels.append(break_label)
        self.continue_labels.append(continue_label)

    def _pop_loop_labels(self):
        """Restores old values of labels to jump to for 'break' and
        'continue' statements."""

        self.break_labels.pop()
        self.continue_labels.pop()

    def vWhileLoop(self, node):
        """Emit a while loop: test at the top, jump out when the
        condition is zero, jump back after the body."""

        test_label = self.new_label() + "_test"
        done_label = self.new_label() + "_done"

        # 'continue' re-tests the condition; 'break' exits the loop.
        self._push_loop_labels(break_label=done_label,
                               continue_label=test_label)

        self.c("WHILE loop - begin")

        self.o("%s:" % test_label)
        node.expr.accept(self)

        comparer = self.stack.pop()
        self.stack.done()
        self.o("        testl %s, %s" % (comparer, comparer), "Test the result")
        self.o("        jz %s" % done_label,
               "If result is zero, leave while loop")
        self._accept_and_empty_stack(node.stmt)
        self.o("        jmp %s" % test_label, "Jump to start of while loop")
        self.o("%s:" % done_label)

        self.c("WHILE loop - end")

        self._pop_loop_labels()
self._push_loop_labels(break_label=done_label, 758 | continue_label=test_label) 759 | 760 | self.c("WHILE loop - begin") 761 | 762 | self.o("%s:" % test_label) 763 | node.expr.accept(self) 764 | 765 | comparer = self.stack.pop() 766 | self.stack.done() 767 | self.o(" testl %s, %s" % (comparer, comparer), "Test the result") 768 | self.o(" jz %s" % done_label, 769 | "If result is zero, leave while loop") 770 | self._accept_and_empty_stack(node.stmt) 771 | self.o(" jmp %s" % test_label, "Jump to start of while loop") 772 | self.o("%s:" % done_label) 773 | 774 | self.c("WHILE loop - end") 775 | 776 | self._pop_loop_labels() 777 | 778 | def vForLoop(self, node): 779 | test_label = self.new_label() + "_test" 780 | done_label = self.new_label() + "_done" 781 | 782 | self._push_loop_labels(break_label=done_label, 783 | continue_label=test_label) 784 | 785 | self.c("FOR loop - begin") 786 | 787 | self._accept_and_empty_stack(node.begin_stmt) 788 | 789 | self.o("%s:" % test_label) 790 | node.expr.accept(self) 791 | 792 | comparer = self.stack.pop() 793 | self.stack.done() 794 | self.o(" testl %s, %s" % (comparer, comparer), "Test the result") 795 | self.o(" jz %s" % done_label, 796 | "If result is zero, leave for loop") 797 | self._accept_and_empty_stack(node.stmt) 798 | self._accept_and_empty_stack(node.end_stmt) 799 | self.o(" jmp %s" % test_label, "Jump to start of for loop") 800 | self.o("%s:" % done_label) 801 | 802 | self.c("FOR loop - end") 803 | 804 | self._pop_loop_labels() 805 | 806 | def vBreakStatement(self, node): 807 | self.o(" jmp %s" % self.break_labels[-1], 808 | "Loop: break statement") 809 | 810 | def vContinueStatement(self, node): 811 | self.o(" jmp %s" % self.continue_labels[-1], 812 | "Loop: continue statement") 813 | 814 | def _get_new_str_literal_label(self, str): 815 | """Create a new string literal label for the given string, 816 | generate (but do not yet emit) the assembly for it, and return 817 | the name of the new label.""" 818 | 819 | 
label_str = "LC%d" % self.__str_literal_label 820 | str = str.replace('\n', '\\12') 821 | self.__str_literal_str += """%s:\n .ascii "%s\\0"\n""" % (label_str, str) 822 | self.__str_literal_label += 1 823 | return label_str 824 | 825 | def vStringLiteral(self, node): 826 | label_str = self._get_new_str_literal_label(node.get_str()) 827 | 828 | # Make a little preview of the literal in the annotated 829 | # comments. 830 | COMMENT_CHARS = 7 831 | comment_label = node.get_sanitized_str() 832 | if len(comment_label) > COMMENT_CHARS: 833 | comment_label = "%s..." % comment_label[0:COMMENT_CHARS] 834 | 835 | self.o(" movl $%s, %s" % (label_str, 836 | self.stack.push(node.type)), 837 | "Get addr of string literal '%s'" % comment_label) 838 | 839 | def vConst(self, node): 840 | self.o(" movl $%d, %s" % (node.value, 841 | self.stack.push(node.type)), 842 | "Load numeric constant %d" % node.value) 843 | 844 | def vId(self, node): 845 | # If we're only supposed to push our address on the stack, not 846 | # our actual value, then do that and exit. 
847 | if node.output_addr: 848 | self.o(" leal %s, %s" % (node.symbol.compile_loc, 849 | self.stack.push()), 850 | "Get address of %s" % node.symbol.name) 851 | return 852 | type_str = node.type.get_outer_string() 853 | if type_str in ['pointer', 'int']: 854 | instr = 'movl' 855 | elif type_str == 'char': 856 | instr = 'movzbl' 857 | self.o(" %s %s, %s" % (instr, node.symbol.compile_loc, 858 | self.stack.push(node.type)), 859 | "Get value of %s" % node.symbol.name) 860 | 861 | def vArrayExpression(self, node): 862 | node.expr.accept(self) 863 | node.index.accept(self) 864 | reg_index = self.stack.pop(node.index.type) 865 | reg_expr = self.stack.pop(node.expr.type) 866 | reg_to = self.stack.push(node.type) 867 | size = self._calc_var_size(node.type) 868 | addr_str = "(%s,%s,%d)" % (reg_expr, reg_index, size) 869 | self.stack.done() 870 | if node.output_addr: 871 | self.o(" leal %s, %s" % (addr_str, reg_to), 872 | "Load addr of pointer array index") 873 | else: 874 | type_str = node.type.get_outer_string() 875 | if type_str in ['int', 'pointer']: 876 | instr = 'movl' 877 | elif type_str == 'char': 878 | instr = 'movzbl' 879 | self.o(" %s %s, %s" % (instr, addr_str, reg_to), 880 | "Pointer array index dereference") 881 | 882 | def vFunctionExpression(self, node): 883 | """Generates assembly for calling a function.""" 884 | 885 | self.c("FUNCTION CALL to %s() - begin" % 886 | node.function.symbol.name) 887 | 888 | # If we're using any caller-save registers, free them up. 889 | self.stack.save_caller_saves() 890 | 891 | # We need to temporarily reverse the order of the function's 892 | # arguments because we need to push them onto the stack 893 | # in reverse order. 
    def vReturnStatement(self, node):
        """Emit a return: move the value into %eax and jump to the
        function's shared epilogue label."""

        return_reg = self._accept_and_pop(node.expr)
        self.o("        movl %s, %%eax" % return_reg, "Set return value")
        self.o("        jmp %s" % self.curr_func_end_label, "Exit function")
        self.stack.done()

    def _accept_and_pop(self, node):
        """Accept the given node and pop its value into a register
        and return the register.  Implicit type conversion is
        performed, if necessary, by the stack machine.

        If the node is a numeric constant, the literal immediate
        (e.g. '$15') is returned instead, as an optimization."""

        if node.is_const():
            return "$%d" % node.value
        else:
            node.accept(self)
            return self.stack.pop(node.coerce_to_type)

    def _binop_assign(self, node):
        """Performs an assignment operation (=, +=, etc) on the given
        Binop node.  The left side is evaluated as an address (its
        visitor ran in output_addr mode), the right as a value."""

        node.left.accept(self)
        right_reg = self._accept_and_pop(node.right)
        left_reg = self.stack.pop()
        # op[0] maps '=' / '+=' / '-=' ... to mov/add/sub...
        instr = self.binop_instructions[node.op[0]]
        instr += self._type_suffix(node.type)

        type_str = node.type.get_outer_string()
        if type_str == 'char':
            # Byte-sized store needs the low-order sub-register.
            right_reg = self.stack.lo(right_reg)

        self.o("        %s %s, (%s)" % (instr, right_reg, left_reg),
               "Perform assignment '%s'" % node.op)

        # NOTE: Wow, this makes for insanely inefficient code,
        # especially when the result of the operation isn't being
        # used: the assigned value is re-loaded as the expression's
        # result.
        if type_str in ['int', 'pointer']:
            instr = 'movl'
        elif type_str == 'char':
            instr = 'movzbl'

        self.o("        %s (%s), %s" % (instr, left_reg,
                                        self.stack.push(node.type)),
               "Copy assignment result to register")
        self.stack.done()

    def _type_suffix(self, type):
        """Returns the assembly instruction suffix for the given
        type: 'l' for 32-bit types, 'b' for 8-bit types."""

        type_str = type.get_outer_string()
        if type_str in ['int', 'pointer']:
            return 'l'
        elif type_str == 'char':
            return 'b'

    def _binop_arith(self, node):
        """Performs an arithmetic operation (+, -, etc) on the given
        Binop node, leaving the result on the stack machine.

        Raises an Exception if the result register push doesn't hand
        back the just-popped left register (internal invariant)."""

        node.left.accept(self)
        right_reg = self._accept_and_pop(node.right)
        left_reg = self.stack.pop(node.left.coerce_to_type)

        instr = self.binop_instructions[node.op] + \
                self._type_suffix(node.type)
        type_str = node.type.get_outer_string()

        # Byte-sized arithmetic operates on the low sub-registers.
        if type_str == 'char':
            r_reg = self.stack.lo(right_reg)
            l_reg = self.stack.lo(left_reg)
        else:
            r_reg = right_reg
            l_reg = left_reg

        self.o("        %s %s, %s" % (instr, r_reg, l_reg),
               "Perform '%s'" % node.op)
        self.stack.done()

        # Here we rely on the fact that left_reg is now free from
        # the last pop(), so we should be able to push it back onto
        # the stack machine as the result.
        new_reg = self.stack.push(node.type, preferred_reg=left_reg)
        if new_reg != left_reg:
            raise Exception("PANIC! Binop push() isn't same as last pop()!")

    def _binop_compare(self, node):
        """Performs a comparison operation (>, ==, etc) on the given
        Binop node, pushing a zero-extended 0/1 boolean result."""

        node.left.accept(self)
        right_reg = self._accept_and_pop(node.right)
        left_reg = self.stack.pop(node.left.coerce_to_type)
        self.stack.done()

        self.o("        cmpl %s, %s" % (right_reg, left_reg),
               "Compare %s to %s" % (left_reg, right_reg))

        # TODO: this could cause errors, if push() generates mov
        # instructions... not sure if mov instructions change the
        # flags though, they probably shouldn't since they're not
        # arithmetic operations.
        byte_reg = self.stack.push(cparse.BaseType('char'))
        lo = self.stack.lo(byte_reg)
        self.o("        %s %s" % (self.binop_instructions[node.op],
                                  lo),
               "Perform '%s'" % node.op)
        self.o("        movzbl %s, %s" % (lo, byte_reg),
               "Zero-extend the boolean result")

    def vBinop(self, node):
        # Dispatch on operator class: assignment, arithmetic, or
        # comparison.
        if node.op in cparse.Binop.ASSIGN_OPS:
            self._binop_assign(node)
        elif node.op in ['+','-','*']:
            self._binop_arith(node)
        elif node.op in ['==', '!=', '<', '>', '<=', '>=']:
            self._binop_compare(node)

    def vNegative(self, node):
        node.expr.accept(self)
        self.o("        negl %s" % self.stack.peek(),
               "Perform unary negation")

    def vPointer(self, node):
        """Emit a pointer dereference ('*p'); in lvalue context the
        address already on the stack is simply left there."""

        node.expr.accept(self)
        if node.output_addr:
            self.o("", "(Getting pointer target addr via '*')")
            return
        reg_from = self.stack.pop(node.expr.type)
        reg_to = self.stack.push(node.type)
        type_str = node.type.get_outer_string()
        if type_str in ['int', 'pointer']:
            instr = 'movl'
        elif type_str == 'char':
            instr = 'movzbl'
        self.o("        %s (%s), %s" % (instr, reg_from, reg_to),
               "Pointer dereference")
        self.stack.done()

    # (vAddrOf's body continues beyond this chunk of the file.)
    def vAddrOf(self, node):
| node.expr.accept(self) 1074 | self.stack.force_type_change(node.type) 1075 | self.o("", "(Address-of operator '&' used here)") 1076 | 1077 | # --------------------------------------------------------------- 1078 | # End of cx86.py 1079 | # --------------------------------------------------------------- 1080 | -------------------------------------------------------------------------------- /yacc.py: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------------------------- 2 | # ply: yacc.py 3 | # 4 | # Author: David M. Beazley (beazley@cs.uchicago.edu) 5 | # Department of Computer Science 6 | # University of Chicago 7 | # Chicago, IL 60637 8 | # 9 | # Copyright (C) 2001, David M. Beazley 10 | # 11 | # $Header: /cygdrive/c/prog/CVS/mini_c/yacc.py,v 1.1.1.1 2004/05/27 06:40:38 varmaa Exp $ 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 26 | # 27 | # See the file COPYING for a complete copy of the LGPL. 28 | # 29 | # 30 | # This implements an LR parser that is constructed from grammar rules defined 31 | # as Python functions. Roughly speaking, this module is a cross between 32 | # John Aycock's Spark system and the GNU bison utility. 
33 | # 34 | # Disclaimer: This is a work in progress. SLR parsing seems to work fairly 35 | # well and there is extensive error checking. LALR(1) is in progress. The 36 | # rest of this file is a bit of a mess. Please pardon the dust. 37 | # 38 | # The current implementation is only somewhat object-oriented. The 39 | # LR parser itself is defined in terms of an object (which allows multiple 40 | # parsers to co-exist). However, most of the variables used during table 41 | # construction are defined in terms of global variables. Users shouldn't 42 | # notice unless they are trying to define multiple parsers at the same 43 | # time using threads (in which case they should have their head examined). 44 | #----------------------------------------------------------------------------- 45 | 46 | __version__ = "1.4" 47 | 48 | #----------------------------------------------------------------------------- 49 | # === User configurable parameters === 50 | # 51 | # Change these to modify the default behavior of yacc (if you wish) 52 | #----------------------------------------------------------------------------- 53 | 54 | yaccdebug = 1 # Debugging mode. If set, yacc generates a 55 | # a 'parser.out' file in the current directory 56 | 57 | debug_file = 'parser.out' # Default name of the debugging file 58 | tab_module = 'parsetab' # Default name of the table module 59 | default_lr = 'SLR' # Default LR table generation method 60 | 61 | error_count = 3 # Number of symbols that must be shifted to leave recovery mode 62 | 63 | import re, types, sys, cStringIO, md5, os.path 64 | 65 | # Exception raised for yacc-related errors 66 | class YaccError(Exception): pass 67 | 68 | #----------------------------------------------------------------------------- 69 | # === LR Parsing Engine === 70 | # 71 | # The following classes are used for the LR parser itself. 
These are not 72 | # used during table construction and are independent of the actual LR 73 | # table generation algorithm 74 | #----------------------------------------------------------------------------- 75 | 76 | # This class is used to hold non-terminal grammar symbols during parsing. 77 | # It normally has the following attributes set: 78 | # .type = Grammar symbol type 79 | # .value = Symbol value 80 | # .lineno = Starting line number 81 | # .endlineno = Ending line number (optional, set automatically) 82 | 83 | class YaccSymbol: 84 | def __str__(self): return self.type 85 | def __repr__(self): return str(self) 86 | 87 | # This class is a wrapper around the objects actually passed to each 88 | # grammar rule. Index lookup and assignment actually assign the 89 | # .value attribute of the underlying YaccSymbol object. 90 | # The lineno() method returns the line number of a given 91 | # item (or 0 if not defined). The linespan() method returns 92 | # a tuple of (startline,endline) representing the range of lines 93 | # for a symbol. 94 | 95 | class YaccSlice: 96 | def __init__(self,s): 97 | self.slice = s 98 | self.pbstack = [] 99 | 100 | def __getitem__(self,n): 101 | return self.slice[n].value 102 | 103 | def __setitem__(self,n,v): 104 | self.slice[n].value = v 105 | 106 | def lineno(self,n): 107 | return getattr(self.slice[n],"lineno",0) 108 | 109 | def linespan(self,n): 110 | startline = getattr(self.slice[n],"lineno",0) 111 | endline = getattr(self.slice[n],"endlineno",startline) 112 | return startline,endline 113 | 114 | def pushback(self,n): 115 | if n <= 0: 116 | raise ValueError, "Expected a positive value" 117 | if n > (len(self.slice)-1): 118 | raise ValueError, "Can't push %d tokens. Only %d are available." % (n,len(self.slice)-1) 119 | for i in range(0,n): 120 | self.pbstack.append(self.slice[-i-1]) 121 | 122 | # The LR Parsing engine. This is defined as a class so that multiple parsers 123 | # can exist in the same process. 
A user never instantiates this directly. 124 | # Instead, the global yacc() function should be used to create a suitable Parser 125 | # object. 126 | 127 | class Parser: 128 | def __init__(self,magic=None): 129 | 130 | # This is a hack to keep users from trying to instantiate a Parser 131 | # object directly. 132 | 133 | if magic != "xyzzy": 134 | raise YaccError, "Can't instantiate Parser. Use yacc() instead." 135 | 136 | # Reset internal state 137 | self.productions = None # List of productions 138 | self.errorfunc = None # Error handling function 139 | self.action = { } # LR Action table 140 | self.goto = { } # LR goto table 141 | self.require = { } # Attribute require table 142 | self.method = "Unknown LR" # Table construction method used 143 | 144 | def errok(self): 145 | self.errorcount = 0 146 | 147 | def restart(self): 148 | del self.statestack[:] 149 | del self.symstack[:] 150 | sym = YaccSymbol() 151 | sym.type = '$' 152 | self.symstack.append(sym) 153 | self.statestack.append(0) 154 | 155 | def parse(self,input=None,lexer=None,debug=0): 156 | lookahead = None # Current lookahead symbol 157 | lookaheadstack = [ ] # Stack of lookahead symbols 158 | actions = self.action # Local reference to action table 159 | goto = self.goto # Local reference to goto table 160 | prod = self.productions # Local reference to production list 161 | pslice = YaccSlice(None) # Slice object passed to grammar rules 162 | pslice.parser = self # Parser object 163 | self.errorcount = 0 # Used during error recovery 164 | 165 | # If no lexer was given, we will try to use the lex module 166 | if not lexer: 167 | import lex as lexer 168 | 169 | pslice.lexer = lexer 170 | 171 | # If input was supplied, pass to lexer 172 | if input: 173 | lexer.input(input) 174 | 175 | # Tokenize function 176 | get_token = lexer.token 177 | 178 | statestack = [ ] # Stack of parsing states 179 | self.statestack = statestack 180 | symstack = [ ] # Stack of grammar symbols 181 | self.symstack = symstack 182 
| 183 | errtoken = None # Err token 184 | 185 | # The start state is assumed to be (0,$) 186 | statestack.append(0) 187 | sym = YaccSymbol() 188 | sym.type = '$' 189 | symstack.append(sym) 190 | 191 | while 1: 192 | # Get the next symbol on the input. If a lookahead symbol 193 | # is already set, we just use that. Otherwise, we'll pull 194 | # the next token off of the lookaheadstack or from the lexer 195 | if not lookahead: 196 | if not lookaheadstack: 197 | lookahead = get_token() # Get the next token 198 | else: 199 | lookahead = lookaheadstack.pop() 200 | if not lookahead: 201 | lookahead = YaccSymbol() 202 | lookahead.type = '$' 203 | if debug: 204 | errorlead = ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip() 205 | 206 | # Check the action table 207 | s = statestack[-1] 208 | ltype = lookahead.type 209 | t = actions.get((s,ltype),None) 210 | 211 | if t is not None: 212 | if t > 0: 213 | # shift a symbol on the stack 214 | if ltype == '$': 215 | # Error, end of input 216 | sys.stderr.write("yacc: Parse error. 
EOF\n") 217 | return 218 | statestack.append(t) 219 | if debug > 1: 220 | sys.stderr.write("%-60s shift state %s\n" % (errorlead, t)) 221 | symstack.append(lookahead) 222 | lookahead = None 223 | 224 | # Decrease error count on successful shift 225 | if self.errorcount > 0: 226 | self.errorcount -= 1 227 | 228 | continue 229 | 230 | if t < 0: 231 | # reduce a symbol on the stack, emit a production 232 | p = prod[-t] 233 | pname = p.name 234 | plen = p.len 235 | 236 | # Get production function 237 | sym = YaccSymbol() 238 | sym.type = pname # Production name 239 | sym.value = None 240 | if debug > 1: 241 | sys.stderr.write("%-60s reduce %d\n" % (errorlead, -t)) 242 | 243 | if plen: 244 | targ = symstack[-plen-1:] 245 | targ[0] = sym 246 | try: 247 | sym.lineno = targ[1].lineno 248 | sym.endlineno = getattr(targ[-1],"endlineno",targ[-1].lineno) 249 | except AttributeError: 250 | sym.lineno = 0 251 | del symstack[-plen:] 252 | del statestack[-plen:] 253 | else: 254 | sym.lineno = 0 255 | targ = [ sym ] 256 | pslice.slice = targ 257 | pslice.pbstack = [] 258 | # Call the grammar rule with our special slice object 259 | p.func(pslice) 260 | 261 | # Validate attributes of the resulting value attribute 262 | # if require: 263 | # try: 264 | # t0 = targ[0] 265 | # r = Requires.get(t0.type,None) 266 | # t0d = t0.__dict__ 267 | # if r: 268 | # for field in r: 269 | # tn = t0 270 | # for fname in field: 271 | # try: 272 | # tf = tn.__dict__ 273 | # tn = tf.get(fname) 274 | # except StandardError: 275 | # tn = None 276 | # if not tn: 277 | # print "%s:%d: Rule %s doesn't set required attribute '%s'" % \ 278 | # (p.file,p.line,p.name,".".join(field)) 279 | # except TypeError,LookupError: 280 | # print "Bad requires directive " % r 281 | # pass 282 | 283 | 284 | # If there was a pushback, put that on the stack 285 | if pslice.pbstack: 286 | lookaheadstack.append(lookahead) 287 | for _t in pslice.pbstack: 288 | lookaheadstack.append(_t) 289 | lookahead = None 290 | 291 | 
symstack.append(sym) 292 | statestack.append(goto[statestack[-1],pname]) 293 | continue 294 | 295 | if t == 0: 296 | n = symstack[-1] 297 | return getattr(n,"value",None) 298 | sys.stderr.write(errorlead, "\n") 299 | 300 | if t == None: 301 | if debug: 302 | sys.stderr.write(errorlead + "\n") 303 | # We have some kind of parsing error here. To handle 304 | # this, we are going to push the current token onto 305 | # the tokenstack and replace it with an 'error' token. 306 | # If there are any synchronization rules, they may 307 | # catch it. 308 | # 309 | # In addition to pushing the error token, we call call 310 | # the user defined p_error() function if this is the 311 | # first syntax error. This function is only called if 312 | # errorcount == 0. 313 | if not self.errorcount: 314 | self.errorcount = error_count 315 | errtoken = lookahead 316 | if errtoken.type == '$': 317 | errtoken = None # End of file! 318 | if self.errorfunc: 319 | global errok,token,restart 320 | errok = self.errok # Set some special functions available in error recovery 321 | token = get_token 322 | restart = self.restart 323 | tok = self.errorfunc(errtoken) 324 | del errok, token, restart # Delete special functions 325 | 326 | if not self.errorcount: 327 | # User must have done some kind of panic 328 | # mode recovery on their own. The 329 | # returned token is the next lookahead 330 | lookahead = tok 331 | errtoken = None 332 | continue 333 | else: 334 | if errtoken: 335 | if hasattr(errtoken,"lineno"): lineno = lookahead.lineno 336 | else: lineno = 0 337 | if lineno: 338 | sys.stderr.write("yacc: Syntax error at line %d, token=%s\n" % (lineno, errtoken.type)) 339 | else: 340 | sys.stderr.write("yacc: Syntax error, token=%s" % errtoken.type) 341 | else: 342 | sys.stderr.write("yacc: Parse error in input. EOF\n") 343 | return 344 | 345 | else: 346 | self.errorcount = error_count 347 | 348 | # case 1: the statestack only has 1 entry on it. 
If we're in this state, the 349 | # entire parse has been rolled back and we're completely hosed. The token is 350 | # discarded and we just keep going. 351 | 352 | if len(statestack) <= 1 and lookahead.type != '$': 353 | lookahead = None 354 | errtoken = None 355 | # Nuke the pushback stack 356 | del lookaheadstack[:] 357 | continue 358 | 359 | # case 2: the statestack has a couple of entries on it, but we're 360 | # at the end of the file. nuke the top entry and generate an error token 361 | 362 | # Start nuking entries on the stack 363 | if lookahead.type == '$': 364 | # Whoa. We're really hosed here. Bail out 365 | return 366 | 367 | if lookahead.type != 'error': 368 | sym = symstack[-1] 369 | if sym.type == 'error': 370 | # Hmmm. Error is on top of stack, we'll just nuke input 371 | # symbol and continue 372 | lookahead = None 373 | continue 374 | t = YaccSymbol() 375 | t.type = 'error' 376 | if hasattr(lookahead,"lineno"): 377 | t.lineno = lookahead.lineno 378 | t.value = lookahead 379 | lookaheadstack.append(lookahead) 380 | lookahead = t 381 | else: 382 | symstack.pop() 383 | statestack.pop() 384 | 385 | continue 386 | 387 | # Call an error function here 388 | raise RuntimeError, "yacc: internal parser error!!!\n" 389 | 390 | # ----------------------------------------------------------------------------- 391 | # === Parser Construction === 392 | # 393 | # The following functions and variables are used to implement the yacc() function 394 | # itself. This is pretty hairy stuff involving lots of error checking, 395 | # construction of LR items, kernels, and so forth. Although a lot of 396 | # this work is done using global variables, the resulting Parser object 397 | # is completely self contained--meaning that it is safe to repeatedly 398 | # call yacc() with different grammars in the same application. 
399 | # ----------------------------------------------------------------------------- 400 | 401 | # ----------------------------------------------------------------------------- 402 | # validate_file() 403 | # 404 | # This function checks to see if there are duplicated p_rulename() functions 405 | # in the parser module file. Without this function, it is really easy for 406 | # users to make mistakes by cutting and pasting code fragments (and it's a real 407 | # bugger to try and figure out why the resulting parser doesn't work). Therefore, 408 | # we just do a little regular expression pattern matching of def statements 409 | # to try and detect duplicates. 410 | # ----------------------------------------------------------------------------- 411 | 412 | def validate_file(filename): 413 | base,ext = os.path.splitext(filename) 414 | if ext != '.py': return 1 # No idea. Assume it's okay. 415 | 416 | try: 417 | f = open(filename) 418 | lines = f.readlines() 419 | f.close() 420 | except IOError: 421 | return 1 # Oh well 422 | 423 | # Match def p_funcname( 424 | fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(') 425 | counthash = { } 426 | linen = 1 427 | noerror = 1 428 | for l in lines: 429 | m = fre.match(l) 430 | if m: 431 | name = m.group(1) 432 | prev = counthash.get(name) 433 | if not prev: 434 | counthash[name] = linen 435 | else: 436 | sys.stderr.write("%s:%d: Function %s redefined. Previously defined on line %d\n" % (filename,linen,name,prev)) 437 | noerror = 0 438 | linen += 1 439 | return noerror 440 | 441 | # This function looks for functions that might be grammar rules, but which don't have the proper p_suffix. 442 | def validate_dict(d): 443 | for n,v in d.items(): 444 | if n[0:2] == 'p_' and type(v) in (types.FunctionType, types.MethodType): continue 445 | if n[0:2] == 't_': continue 446 | 447 | if n[0:2] == 'p_': 448 | sys.stderr.write("yacc: Warning. 
'%s' not defined as a function\n" % n) 449 | if 1 and isinstance(v,types.FunctionType) and v.func_code.co_argcount == 1: 450 | try: 451 | doc = v.__doc__.split(" ") 452 | if doc[1] == ':': 453 | sys.stderr.write("%s:%d: Warning. Possible grammar rule '%s' defined without p_ prefix.\n" % (v.func_code.co_filename, v.func_code.co_firstlineno,n)) 454 | except StandardError: 455 | pass 456 | 457 | # ----------------------------------------------------------------------------- 458 | # === GRAMMAR FUNCTIONS === 459 | # 460 | # The following global variables and functions are used to store, manipulate, 461 | # and verify the grammar rules specified by the user. 462 | # ----------------------------------------------------------------------------- 463 | 464 | # Initialize all of the global variables used during grammar construction 465 | def initialize_vars(): 466 | global Productions, Prodnames, Prodmap, Terminals 467 | global Nonterminals, First, Follow, Precedence, LRitems 468 | global Errorfunc, Signature, Requires 469 | 470 | Productions = [None] # A list of all of the productions. The first 471 | # entry is always reserved for the purpose of 472 | # building an augmented grammar 473 | 474 | Prodnames = { } # A dictionary mapping the names of nonterminals to a list of all 475 | # productions of that nonterminal. 476 | 477 | Prodmap = { } # A dictionary that is only used to detect duplicate 478 | # productions. 479 | 480 | Terminals = { } # A dictionary mapping the names of terminal symbols to a 481 | # list of the rules where they are used. 482 | 483 | Nonterminals = { } # A dictionary mapping names of nonterminals to a list 484 | # of rule numbers where they are used. 485 | 486 | First = { } # A dictionary of precomputed FIRST(x) symbols 487 | 488 | Follow = { } # A dictionary of precomputed FOLLOW(x) symbols 489 | 490 | Precedence = { } # Precedence rules for each terminal. 
Contains tuples of the 491 | # form ('right',level) or ('nonassoc', level) or ('left',level) 492 | 493 | LRitems = [ ] # A list of all LR items for the grammar. These are the 494 | # productions with the "dot" like E -> E . PLUS E 495 | 496 | Errorfunc = None # User defined error handler 497 | 498 | Signature = md5.new() # Digital signature of the grammar rules, precedence 499 | # and other information. Used to determined when a 500 | # parsing table needs to be regenerated. 501 | 502 | Requires = { } # Requires list 503 | 504 | # File objects used when creating the parser.out debugging file 505 | global _vf, _vfc 506 | _vf = cStringIO.StringIO() 507 | _vfc = cStringIO.StringIO() 508 | 509 | # ----------------------------------------------------------------------------- 510 | # class Production: 511 | # 512 | # This class stores the raw information about a single production or grammar rule. 513 | # It has a few required attributes: 514 | # 515 | # name - Name of the production (nonterminal) 516 | # prod - A list of symbols making up its production 517 | # number - Production number. 518 | # 519 | # In addition, a few additional attributes are used to help with debugging or 520 | # optimization of table generation. 521 | # 522 | # file - File where production action is defined. 523 | # lineno - Line number where action is defined 524 | # func - Action function 525 | # prec - Precedence level 526 | # lr_next - Next LR item. Example, if we are ' E -> E . PLUS E' 527 | # then lr_next refers to 'E -> E PLUS . E' 528 | # lr_index - LR item index (location of the ".") in the prod list. 
529 | # len - Length of the production (number of symbols on right hand side) 530 | # ----------------------------------------------------------------------------- 531 | 532 | class Production: 533 | def __init__(self,**kw): 534 | for k,v in kw.items(): 535 | setattr(self,k,v) 536 | self.lr_index = -1 537 | self.lr0_added = 0 # Flag indicating whether or not added to LR0 closure 538 | self.usyms = [ ] 539 | 540 | def __str__(self): 541 | if self.prod: 542 | s = "%s -> %s" % (self.name," ".join(self.prod)) 543 | else: 544 | s = "%s -> " % self.name 545 | return s 546 | 547 | def __repr__(self): 548 | return str(self) 549 | 550 | # Compute lr_items from the production 551 | def lr_item(self,n): 552 | if n > len(self.prod): return None 553 | p = Production() 554 | p.name = self.name 555 | p.prod = list(self.prod) 556 | p.number = self.number 557 | p.lr_index = n 558 | p.prod.insert(n,".") 559 | p.prod = tuple(p.prod) 560 | p.len = len(p.prod) 561 | p.usyms = self.usyms 562 | 563 | # Precompute list of productions immediately following 564 | try: 565 | p.lrafter = Prodnames[p.prod[n+1]] 566 | except (IndexError,KeyError),e: 567 | p.lrafter = [] 568 | try: 569 | p.lrbefore = p.prod[n-1] 570 | except IndexError: 571 | p.lrbefore = None 572 | 573 | return p 574 | 575 | class MiniProduction: 576 | pass 577 | 578 | # Utility function 579 | def is_identifier(s): 580 | for c in s: 581 | if not (c.isalnum() or c == '_'): return 0 582 | return 1 583 | 584 | # ----------------------------------------------------------------------------- 585 | # add_production() 586 | # 587 | # Given an action function, this function assembles a production rule. 588 | # The production rule is assumed to be found in the function's docstring. 589 | # This rule has the general syntax: 590 | # 591 | # name1 ::= production1 592 | # | production2 593 | # | production3 594 | # ... 595 | # | productionn 596 | # name2 ::= production1 597 | # | production2 598 | # ... 
599 | # ----------------------------------------------------------------------------- 600 | 601 | def add_production(f,file,line,prodname,syms): 602 | 603 | if Terminals.has_key(prodname): 604 | sys.stderr.write("%s:%d: Illegal rule name '%s'. Already defined as a token.\n" % (file,line,prodname)) 605 | return -1 606 | if prodname == 'error': 607 | sys.stderr.write("%s:%d: Illegal rule name '%s'. error is a reserved word.\n" % (file,line,prodname)) 608 | return -1 609 | 610 | if not is_identifier(prodname): 611 | sys.stderr.write("%s:%d: Illegal rule name '%s'\n" % (file,line,prodname)) 612 | return -1 613 | 614 | for s in syms: 615 | if not is_identifier(s) and s != '%prec': 616 | sys.stderr.write("%s:%d: Illegal name '%s' in rule '%s'\n" % (file,line,s, prodname)) 617 | return -1 618 | 619 | # See if the rule is already in the rulemap 620 | map = "%s -> %s" % (prodname,syms) 621 | if Prodmap.has_key(map): 622 | m = Prodmap[map] 623 | sys.stderr.write("%s:%d: Duplicate rule %s.\n" % (file,line, m)) 624 | sys.stderr.write("%s:%d: Previous definition at %s:%d\n" % (file,line, m.file, m.line)) 625 | return -1 626 | 627 | p = Production() 628 | p.name = prodname 629 | p.prod = syms 630 | p.file = file 631 | p.line = line 632 | p.func = f 633 | p.number = len(Productions) 634 | 635 | 636 | Productions.append(p) 637 | Prodmap[map] = p 638 | if not Nonterminals.has_key(prodname): 639 | Nonterminals[prodname] = [ ] 640 | 641 | # Add all terminals to Terminals 642 | i = 0 643 | while i < len(p.prod): 644 | t = p.prod[i] 645 | if t == '%prec': 646 | try: 647 | precname = p.prod[i+1] 648 | except IndexError: 649 | sys.stderr.write("%s:%d: Syntax error. 
Nothing follows %%prec.\n" % (p.file,p.line)) 650 | return -1 651 | 652 | prec = Precedence.get(precname,None) 653 | if not prec: 654 | sys.stderr.write("%s:%d: Nothing known about the precedence of '%s'\n" % (p.file,p.line,precname)) 655 | return -1 656 | else: 657 | p.prec = prec 658 | del p.prod[i] 659 | del p.prod[i] 660 | continue 661 | 662 | if Terminals.has_key(t): 663 | Terminals[t].append(p.number) 664 | # Is a terminal. We'll assign a precedence to p based on this 665 | if not hasattr(p,"prec"): 666 | p.prec = Precedence.get(t,('right',0)) 667 | else: 668 | if not Nonterminals.has_key(t): 669 | Nonterminals[t] = [ ] 670 | Nonterminals[t].append(p.number) 671 | i += 1 672 | 673 | if not hasattr(p,"prec"): 674 | p.prec = ('right',0) 675 | 676 | # Set final length of productions 677 | p.len = len(p.prod) 678 | p.prod = tuple(p.prod) 679 | 680 | # Calculate unique syms in the production 681 | p.usyms = [ ] 682 | for s in p.prod: 683 | if s not in p.usyms: 684 | p.usyms.append(s) 685 | 686 | # Add to the global productions list 687 | try: 688 | Prodnames[p.name].append(p) 689 | except KeyError: 690 | Prodnames[p.name] = [ p ] 691 | return 0 692 | 693 | # Given a raw rule function, this function rips out its doc string 694 | # and adds rules to the grammar 695 | 696 | def add_function(f): 697 | line = f.func_code.co_firstlineno 698 | file = f.func_code.co_filename 699 | error = 0 700 | 701 | if isinstance(f,types.MethodType): 702 | reqdargs = 2 703 | else: 704 | reqdargs = 1 705 | 706 | if f.func_code.co_argcount > reqdargs: 707 | sys.stderr.write("%s:%d: Rule '%s' has too many arguments.\n" % (file,line,f.__name__)) 708 | return -1 709 | 710 | if f.func_code.co_argcount < reqdargs: 711 | sys.stderr.write("%s:%d: Rule '%s' requires an argument.\n" % (file,line,f.__name__)) 712 | return -1 713 | 714 | if f.__doc__: 715 | # Split the doc string into lines 716 | pstrings = f.__doc__.splitlines() 717 | lastp = None 718 | dline = line 719 | for ps in pstrings: 720 | 
dline += 1 721 | p = ps.split() 722 | if not p: continue 723 | try: 724 | if p[0] == '|': 725 | # This is a continuation of a previous rule 726 | if not lastp: 727 | sys.stderr.write("%s:%d: Misplaced '|'.\n" % (file,dline)) 728 | return -1 729 | prodname = lastp 730 | if len(p) > 1: 731 | syms = p[1:] 732 | else: 733 | syms = [ ] 734 | else: 735 | prodname = p[0] 736 | lastp = prodname 737 | assign = p[1] 738 | if len(p) > 2: 739 | syms = p[2:] 740 | else: 741 | syms = [ ] 742 | if assign != ':' and assign != '::=': 743 | sys.stderr.write("%s:%d: Syntax error. Expected ':'\n" % (file,dline)) 744 | return -1 745 | e = add_production(f,file,dline,prodname,syms) 746 | error += e 747 | except StandardError: 748 | sys.stderr.write("%s:%d: Syntax error in rule '%s'\n" % (file,dline,ps)) 749 | error -= 1 750 | else: 751 | sys.stderr.write("%s:%d: No documentation string specified in function '%s'\n" % (file,line,f.__name__)) 752 | return error 753 | 754 | 755 | # Cycle checking code (Michael Dyck) 756 | 757 | def compute_reachable(): 758 | ''' 759 | Find each symbol that can be reached from the start symbol. 760 | Print a warning for any nonterminals that can't be reached. 761 | (Unused terminals have already had their warning.) 762 | ''' 763 | Reachable = { } 764 | for s in Terminals.keys() + Nonterminals.keys(): 765 | Reachable[s] = 0 766 | 767 | mark_reachable_from( Productions[0].prod[0], Reachable ) 768 | 769 | for s in Nonterminals.keys(): 770 | if not Reachable[s]: 771 | sys.stderr.write("yacc: Symbol '%s' is unreachable.\n" % s) 772 | 773 | def mark_reachable_from(s, Reachable): 774 | ''' 775 | Mark all symbols that are reachable from symbol s. 776 | ''' 777 | if Reachable[s]: 778 | # We've already reached symbol s. 
        return
    Reachable[s] = 1
    for p in Prodnames.get(s,[]):
        for r in p.prod:
            mark_reachable_from(r, Reachable)

# -----------------------------------------------------------------------------
# compute_terminates()
#
# This function looks at the various parsing rules and tries to detect
# infinite recursion cycles (grammar rules where there is no possible way
# to derive a string of only terminals).
# -----------------------------------------------------------------------------
def compute_terminates():
    '''
    Raise an error for any symbols that don't terminate.

    Returns 1 if some defined symbol can never derive a string consisting
    only of terminals (i.e. the grammar contains infinite recursion),
    0 otherwise.  Reads the module-level Terminals, Nonterminals and
    Prodnames tables; reports problems on stderr.
    '''
    # Terminates[s] is 1 once we know symbol s can derive a terminal string.
    Terminates = {}

    # Terminals trivially terminate:
    for t in Terminals.keys():
        Terminates[t] = 1

    # '$' (end-of-input marker) counts as a terminal here.
    Terminates['$'] = 1

    # Nonterminals:

    # Initialize to false:
    for n in Nonterminals.keys():
        Terminates[n] = 0

    # Then propagate termination until no change (fixed-point iteration):
    while 1:
        some_change = 0
        for (n,pl) in Prodnames.items():
            # Nonterminal n terminates iff any of its productions terminates.
            for p in pl:
                # Production p terminates iff all of its rhs symbols terminate.
                for s in p.prod:
                    if not Terminates[s]:
                        # The symbol s does not terminate,
                        # so production p does not terminate.
                        p_terminates = 0
                        break
                else:
                    # didn't break from the loop,
                    # so every symbol s terminates
                    # so production p terminates.
                    # (An empty production also lands here and terminates.)
                    p_terminates = 1

                if p_terminates:
                    # symbol n terminates!
                    if not Terminates[n]:
                        Terminates[n] = 1
                        some_change = 1
                    # Don't need to consider any more productions for this n.
                    break

        if not some_change:
            break

    some_error = 0
    for (s,terminates) in Terminates.items():
        if not terminates:
            if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error':
                # s is used-but-not-defined, and we've already warned of that,
                # so it would be overkill to say that it's also non-terminating.
                pass
            else:
                sys.stderr.write("yacc: Infinite recursion detected for symbol '%s'.\n" % s)
                some_error = 1

    return some_error

# -----------------------------------------------------------------------------
# verify_productions()
#
# This function examines all of the supplied rules to see if they seem valid.
# -----------------------------------------------------------------------------
def verify_productions(cycle_check=1):
    '''
    Sanity-check the grammar: undefined symbols, unused tokens/rules and
    (optionally, when cycle_check is true) unreachable and non-terminating
    symbols.  Returns a nonzero error count on fatal problems; warnings go
    to stderr and, when yaccdebug is set, to the _vf debug log.
    '''
    error = 0
    for p in Productions:
        if not p: continue

        for s in p.prod:
            if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error':
                sys.stderr.write("%s:%d: Symbol '%s' used, but not defined as a token or a rule.\n" % (p.file,p.line,s))
                error = 1
                # NOTE(review): this 'continue' only advances the inner
                # symbol loop; it reads as if it were meant to skip to the
                # next production -- confirm intent.
                continue

    unused_tok = 0
    # Now verify all of the tokens
    if yaccdebug:
        _vf.write("Unused terminals:\n\n")
    for s,v in Terminals.items():
        # v is the list of rule numbers where terminal s appears; an empty
        # list means the token was declared but never used.
        if s != 'error' and not v:
            sys.stderr.write("yacc: Warning. Token '%s' defined, but not used.\n" % s)
            if yaccdebug: _vf.write("   %s\n"% s)
            unused_tok += 1

    # Print out all of the productions
    if yaccdebug:
        _vf.write("\nGrammar\n\n")
        for i in range(1,len(Productions)):
            _vf.write("Rule %-5d %s\n" % (i, Productions[i]))

    unused_prod = 0
    # Verify the use of all productions
    for s,v in Nonterminals.items():
        if not v:
            p = Prodnames[s][0]
            sys.stderr.write("%s:%d: Warning. Rule '%s' defined, but not used.\n" % (p.file,p.line, s))
            unused_prod += 1


    if unused_tok == 1:
        sys.stderr.write("yacc: Warning. There is 1 unused token.\n")
    if unused_tok > 1:
        sys.stderr.write("yacc: Warning. There are %d unused tokens.\n" % unused_tok)

    if unused_prod == 1:
        sys.stderr.write("yacc: Warning. There is 1 unused rule.\n")
    if unused_prod > 1:
        sys.stderr.write("yacc: Warning. There are %d unused rules.\n" % unused_prod)

    if yaccdebug:
        _vf.write("\nTerminals, with rules where they appear\n\n")
        ks = Terminals.keys()
        ks.sort()
        for k in ks:
            _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Terminals[k]])))
        _vf.write("\nNonterminals, with rules where they appear\n\n")
        ks = Nonterminals.keys()
        ks.sort()
        for k in ks:
            _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Nonterminals[k]])))

    if (cycle_check):
        compute_reachable()
        error += compute_terminates()
#       error += check_cycles()
    return error
# -----------------------------------------------------------------------------
# build_lritems()
#
# This function walks the list of productions and builds a complete set of the
# LR items.  The LR items are stored in two ways:  First, they are uniquely
# numbered and placed in the list _lritems.  Second, a linked list of LR items
# is built for each production.  For example:
#
#   E -> E PLUS E
#
# Creates the list
#
#  [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ]
# -----------------------------------------------------------------------------

def build_lritems():
    '''
    Number every LR item and chain the items of each production together
    through the lr_next attribute (the production object itself heads the
    chain).  Appends each item to the module-level LRitems list and stamps
    it with its index in lr_num.
    '''
    for p in Productions:
        lastlri = p
        i = 0
        while 1:
            # p.lr_item(i) returns None once the dot has moved past the end.
            lri = p.lr_item(i)
            lastlri.lr_next = lri
            if not lri: break
            lri.lr_num = len(LRitems)
            LRitems.append(lri)
            lastlri = lri
            i += 1

    # In order for the rest of the parser generator to work, we need to
    # guarantee that no more lritems are generated.  Therefore, we nuke
    # the p.lr_item method.  (Only used in debugging)
    # Production.lr_item = None

# -----------------------------------------------------------------------------
# add_precedence()
#
# Given a list of precedence rules, add to the precedence table.
# -----------------------------------------------------------------------------

def add_precedence(plist):
    '''
    Install user-supplied precedence declarations into the module-level
    Precedence table.

    plist is a sequence of tuples ('left'|'right'|'nonassoc', tok, tok, ...);
    each tuple gets the next (1-based) precedence level.  Returns -1 on an
    invalid associativity keyword, otherwise the number of non-fatal errors
    (duplicate tokens, malformed entries).
    '''
    plevel = 0
    error = 0
    for p in plist:
        plevel += 1
        try:
            prec = p[0]
            terms = p[1:]
            if prec not in ('left', 'right', 'nonassoc'):
                sys.stderr.write("yacc: Invalid precedence '%s'\n" % prec)
                return -1
            for t in terms:
                if t in Precedence:
                    sys.stderr.write("yacc: Precedence already specified for terminal '%s'\n" % t)
                    error += 1
                    continue
                Precedence[t] = (prec,plevel)
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # are not swallowed; matches the file's other handlers.
        except StandardError:
            sys.stderr.write("yacc: Invalid precedence table.\n")
            error += 1

    return error

# -----------------------------------------------------------------------------
# augment_grammar()
#
# Compute the augmented grammar.  This is just a rule S' -> start where start
# is the starting symbol.
# -----------------------------------------------------------------------------

def augment_grammar(start=None):
    '''
    Augment the grammar with the synthetic rule S' -> start, stored in
    Productions[0].  If start is not given, the first user rule is used.
    '''
    if not start:
        start = Productions[1].name
    root = Production(name="S'",prod=[start],number=0,len=1,prec=('right',0),func=None)
    root.usyms = [ start ]
    Productions[0] = root
    # Record that the start symbol is referenced by rule 0.
    Nonterminals[start].append(0)


# -------------------------------------------------------------------------
# first()
#
# Compute the value of FIRST1(beta) where beta is a tuple of symbols.
#
# During execution of compute_first1, the result may be incomplete.
# Afterward (e.g., when called from compute_follow()), it will be complete.
# -------------------------------------------------------------------------
def first(beta):
    '''
    Return FIRST(beta) as a list of terminals, where beta is a tuple of
    grammar symbols.  The empty string '' appears in the result iff every
    symbol of beta can derive the empty string (so beta itself can).
    '''
    firsts = [ ]
    for symbol in beta:
        derives_empty = 0
        # Fold the known FIRST set of this symbol into the result.
        for f in First[symbol]:
            if f == '':
                derives_empty = 1
            elif f not in firsts:
                firsts.append(f)
        if not derives_empty:
            # This symbol always yields at least one terminal, so later
            # symbols of beta can contribute nothing more.
            break
    else:
        # No break: every symbol may vanish, so beta derives empty too.
        firsts.append('')

    return firsts


# FOLLOW(x)
# Given a non-terminal.  This function computes the set of all symbols
# that might follow it.  Dragon book, p. 189.

def compute_follow(start=None):
    '''
    Compute FOLLOW(B) for every nonterminal B into the module-level Follow
    table, using the already-computed First sets (compute_first1 must have
    run).  '$' seeds the follow set of the start symbol.
    '''
    # Add '$' to the follow list of the start symbol
    for k in Nonterminals.keys():
        Follow[k] = [ ]

    if not start:
        start = Productions[1].name

    Follow[start] = [ '$' ]

    # Fixed-point iteration: keep propagating until nothing changes.
    while 1:
        didadd = 0
        for p in Productions[1:]:
            # Here is the production set
            for i in range(len(p.prod)):
                B = p.prod[i]
                if Nonterminals.has_key(B):
                    # Okay. We got a non-terminal in a production
                    # Everything in FIRST(of what follows B) except ''
                    # belongs to FOLLOW(B).
                    fst = first(p.prod[i+1:])
                    hasempty = 0
                    for f in fst:
                        if f != '' and f not in Follow[B]:
                            Follow[B].append(f)
                            didadd = 1
                        if f == '':
                            hasempty = 1
                    if hasempty or i == (len(p.prod)-1):
                        # The tail after B can vanish (or B is last), so
                        # everything that follows the lhs follows B too.
                        # Add elements of follow(a) to follow(b)
                        for f in Follow[p.name]:
                            if f not in Follow[B]:
                                Follow[B].append(f)
                                didadd = 1
        if not didadd: break

    # Disabled debug dump (flip the 0 to re-enable).
    if 0 and yaccdebug:
        _vf.write('\nFollow:\n')
        for k in Nonterminals.keys():
            _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Follow[k]])))

# -------------------------------------------------------------------------
# compute_first1()
#
# Compute the value of FIRST1(X) for all symbols
# -------------------------------------------------------------------------
def compute_first1():
    '''
    Fill the module-level First table: First[t] = [t] for terminals, and
    for each nonterminal the set of terminals (plus '' when it can derive
    empty) that can begin one of its derivations.
    '''
    # Terminals:
    for t in Terminals.keys():
        First[t] = [t]

    First['$'] = ['$']
    First['#'] = ['#'] # what's this for?

    # Nonterminals:

    # Initialize to the empty set:
    for n in Nonterminals.keys():
        First[n] = []

    # Then propagate symbols until no change (first() reads the partially
    # built First table, so iteration converges to the fixed point):
    while 1:
        some_change = 0
        for n in Nonterminals.keys():
            for p in Prodnames[n]:
                for f in first(p.prod):
                    if f not in First[n]:
                        First[n].append( f )
                        some_change = 1
        if not some_change:
            break

    # Disabled debug dump (flip the 0 to re-enable).
    if 0 and yaccdebug:
        _vf.write('\nFirst:\n')
        for k in Nonterminals.keys():
            _vf.write("%-20s : %s\n" %
                (k, " ".join([str(s) for s in First[k]])))

# -----------------------------------------------------------------------------
# === SLR Generation ===
#
# The following functions are used to construct SLR (Simple LR) parsing tables
# as described on p.221-229 of the dragon book.
# -----------------------------------------------------------------------------

# Global variables for the LR parsing engine
def lr_init_vars():
    '''Reset the module-level LR table state to empty.'''
    global _lr_action, _lr_goto, _lr_method
    global _lr_goto_cache

    _lr_action       = { }        # Action table
    _lr_goto         = { }        # Goto table
    _lr_method       = "Unknown"  # LR method used
    _lr_goto_cache   = { }

# Compute the LR(0) closure operation on I, where I is a set of LR(0) items.
# prodlist is a list of productions.

_add_count = 0       # Counter used to detect cycles

def lr0_closure(I):
    '''
    Return the LR(0) closure of the item set I: I plus, transitively, the
    initial item of every production whose lhs appears just after a dot.
    Uses the per-item lr0_added stamp (compared against _add_count) so each
    item is added at most once per call.
    '''
    global _add_count

    _add_count += 1
    prodlist = Productions

    # Add everything in I to J
    J = I[:]
    didadd = 1
    while didadd:
        didadd = 0
        for j in J:
            for x in j.lrafter:
                if x.lr0_added == _add_count: continue
                # Add B --> .G to J
                J.append(x.lr_next)
                x.lr0_added = _add_count
                didadd = 1

    return J

# Compute the LR(0) goto function goto(I,X) where I is a set
# of LR(0) items and X is a grammar symbol.   This function is written
# in a way that guarantees uniqueness of the generated goto sets
# (i.e. the same goto set will never be returned as two different Python
# objects).  With uniqueness, we can later do fast set comparisons using
# id(obj) instead of element-wise comparison.

def lr0_goto(I,x):
    '''
    Return goto(I,x) as a closed item set, memoized so that equal goto
    sets are always the *same* list object (callers compare with id()).
    '''
    # First we look for a previously cached entry
    g = _lr_goto_cache.get((id(I),x),None)
    if g: return g

    # Now we generate the goto set in a way that guarantees uniqueness
    # of the result

    # _lr_goto_cache[x] is a trie keyed by the id()s of the items moved
    # over; the terminal key '$' holds the finished goto set, so the same
    # sequence of kernel items always maps to one shared result object.
    s = _lr_goto_cache.get(x,None)
    if not s:
        s = { }
        _lr_goto_cache[x] = s

    gs = [ ]
    for p in I:
        n = p.lr_next
        if n and n.lrbefore == x:
            s1 = s.get(id(n),None)
            if not s1:
                s1 = { }
                s[id(n)] = s1
            gs.append(n)
            s = s1
    g = s.get('$',None)
    if not g:
        if gs:
            g = lr0_closure(gs)
            s['$'] = g
        else:
            # Empty goto: cache the empty list itself.
            s['$'] = gs
    _lr_goto_cache[(id(I),x)] = g
    return g

# Compute the kernel of a set of LR(0) items
def lr0_kernel(I):
    '''
    Return the kernel items of I: the start item (lhs S') plus every item
    whose dot is not at the far left, plus items of empty productions.
    '''
    KI = [ ]
    for p in I:
        if p.name == "S'" or p.lr_index > 0 or p.len == 0:
            KI.append(p)

    return KI

# Maps id(item set) -> state number for the sets built by lr0_items().
_lr0_cidhash = { }
# Compute the LR(0) sets-of-items (the canonical collection).
def lr0_items():
    '''
    Return the canonical collection C of LR(0) item sets, starting from
    the closure of the augmented start item.  Also records each set's
    state number in _lr0_cidhash keyed by id() of the set.
    '''
    C = [ lr0_closure([Productions[0].lr_next]) ]
    i = 0
    for I in C:
        _lr0_cidhash[id(I)] = i
        i += 1

    # Loop over the items in C and each grammar symbols
    # (C grows while we walk it, hence the index-based while loop).
    i = 0
    while i < len(C):
        I = C[i]
        i += 1

        # Collect all of the symbols that could possibly be in the goto(I,X) sets
        asyms = { }
        for ii in I:
            for s in ii.usyms:
                asyms[s] = None

        for x in asyms.keys():
            g = lr0_goto(I,x)
            if not g:  continue
            if _lr0_cidhash.has_key(id(g)): continue
            _lr0_cidhash[id(g)] = len(C)
            C.append(g)

    return C

# -----------------------------------------------------------------------------
# slr_parse_table()
#
# This function constructs an SLR table.
# -----------------------------------------------------------------------------
def slr_parse_table():
    '''
    Fill the module-level _lr_action and _lr_goto tables from the LR(0)
    item sets and the Follow sets (SLR construction, dragon book
    p.221-229).  Action encoding: 0 = accept, j > 0 = shift to state j,
    -n = reduce by rule n, None = nonassoc error.  Shift/reduce conflicts
    are resolved with the Precedence table (default: shift); conflict
    counts are reported and logged to the _vf/_vfc debug streams.
    '''
    global _lr_method
    goto = _lr_goto           # Goto array
    action = _lr_action       # Action array
    actionp = { }             # Action production array (temporary)

    _lr_method = "SLR"

    n_srconflict = 0
    n_rrconflict = 0

    if yaccdebug:
        sys.stderr.write("yacc: Generating SLR parsing table...\n")
        _vf.write("\n\nParsing method: SLR\n\n")

    # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items
    # This determines the number of states

    C = lr0_items()

    # Build the parser table, state by state
    st = 0
    for I in C:
        # Loop over each production in I
        actlist = [ ]              # List of actions

        if yaccdebug:
            _vf.write("\nstate %d\n\n" % st)
            for p in I:
                _vf.write("    (%d) %s\n" % (p.number, str(p)))
            _vf.write("\n")

        for p in I:
            try:
                if p.prod[-1] == ".":
                    # Dot at the far right: this item calls for a reduce
                    # (or accept, for the augmented start rule).
                    if p.name == "S'":
                        # Start symbol. Accept!
                        action[st,"$"] = 0
                        actionp[st,"$"] = p
                    else:
                        # We are at the end of a production.  Reduce!
                        for a in Follow[p.name]:
                            actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p)))
                            r = action.get((st,a),None)
                            if r is not None:
                                # Whoa. Have a shift/reduce or reduce/reduce conflict
                                if r > 0:
                                    # Need to decide on shift or reduce here
                                    # By default we favor shifting. Need to add
                                    # some precedence rules here.
                                    sprec,slevel = Productions[actionp[st,a].number].prec
                                    rprec,rlevel = Precedence.get(a,('right',0))
                                    if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')):
                                        # We really need to reduce here.
                                        action[st,a] = -p.number
                                        actionp[st,a] = p
                                        if not slevel and not rlevel:
                                            _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st)
                                            _vf.write("  ! shift/reduce conflict for %s resolved as reduce.\n" % a)
                                            n_srconflict += 1
                                    elif (slevel == rlevel) and (rprec == 'nonassoc'):
                                        # Nonassociative at equal level: error entry.
                                        action[st,a] = None
                                    else:
                                        # Hmmm. Guess we'll keep the shift
                                        if not slevel and not rlevel:
                                            _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st)
                                            _vf.write("  ! shift/reduce conflict for %s resolved as shift.\n" % a)
                                            n_srconflict +=1
                                elif r < 0:
                                    # Reduce/reduce conflict.   In this case, we favor the rule
                                    # that was defined first in the grammar file
                                    oldp = Productions[-r]
                                    pp = Productions[p.number]
                                    if oldp.line > pp.line:
                                        action[st,a] = -p.number
                                        actionp[st,a] = p
                                    # sys.stderr.write("Reduce/reduce conflict in state %d\n" % st)
                                    n_rrconflict += 1
                                    _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, actionp[st,a].number, actionp[st,a]))
                                    _vf.write("  ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,actionp[st,a].number, actionp[st,a]))
                                else:
                                    sys.stderr.write("Unknown conflict in state %d\n" % st)
                            else:
                                action[st,a] = -p.number
                                actionp[st,a] = p
                else:
                    i = p.lr_index
                    a = p.prod[i+1]       # Get symbol right after the "."
                    if Terminals.has_key(a):
                        g = lr0_goto(I,a)
                        j = _lr0_cidhash.get(id(g),-1)
                        if j >= 0:
                            # We are in a shift state
                            actlist.append((a,p,"shift and go to state %d" % j))
                            r = action.get((st,a),None)
                            if r is not None:
                                # Whoa have a shift/reduce or shift/shift conflict
                                if r > 0:
                                    if r != j:
                                        sys.stderr.write("Shift/shift conflict in state %d\n" % st)
                                elif r < 0:
                                    # Do a precedence check.
                                    #   -  if precedence of reduce rule is higher, we reduce.
                                    #   -  if precedence of reduce is same and left assoc, we reduce.
                                    #   -  otherwise we shift
                                    rprec,rlevel = Productions[actionp[st,a].number].prec
                                    sprec,slevel = Precedence.get(a,('right',0))
                                    if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')):
                                        # We decide to shift here... highest precedence to shift
                                        action[st,a] = j
                                        actionp[st,a] = p
                                        if not slevel and not rlevel:
                                            n_srconflict += 1
                                            _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st)
                                            _vf.write("  ! shift/reduce conflict for %s resolved as shift.\n" % a)
                                    elif (slevel == rlevel) and (rprec == 'nonassoc'):
                                        action[st,a] = None
                                    else:
                                        # Hmmm. Guess we'll keep the reduce
                                        if not slevel and not rlevel:
                                            n_srconflict +=1
                                            _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st)
                                            _vf.write("  ! shift/reduce conflict for %s resolved as reduce.\n" % a)

                                else:
                                    sys.stderr.write("Unknown conflict in state %d\n" % st)
                            else:
                                action[st,a] = j
                                actionp[st,a] = p

            # NOTE(review): the 3-expression raise expects a *traceback* as
            # its third item; passing the exception object e here looks
            # wrong -- confirm before relying on this error path.
            except StandardError,e:
                raise YaccError, "Hosed in slr_parse_table", e

        # Print the actions associated with each terminal
        if yaccdebug:
            for a,p,m in actlist:
                if action.has_key((st,a)):
                    if p is actionp[st,a]:
                        _vf.write("    %-15s %s\n" % (a,m))
            _vf.write("\n")
            # Losing entries (overridden by conflict resolution) are
            # listed separately, flagged with "!".
            for a,p,m in actlist:
                if action.has_key((st,a)):
                    if p is not actionp[st,a]:
                        _vf.write("  ! %-15s [ %s ]\n" % (a,m))

        # Construct the goto table for this state
        if yaccdebug:
            _vf.write("\n")
        nkeys = { }
        for ii in I:
            for s in ii.usyms:
                if Nonterminals.has_key(s):
                    nkeys[s] = None
        for n in nkeys.keys():
            g = lr0_goto(I,n)
            j = _lr0_cidhash.get(id(g),-1)
            if j >= 0:
                goto[st,n] = j
                if yaccdebug:
                    _vf.write("    %-15s shift and go to state %d\n" % (n,j))

        st += 1

    if yaccdebug:
        if n_srconflict == 1:
            sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict)
        if n_srconflict > 1:
            sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict)
        if n_rrconflict == 1:
            sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict)
        if n_rrconflict > 1:
            sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict)


# -----------------------------------------------------------------------------
#                            ==== LALR(1) Parsing ====
# **** UNFINISHED!  6/16/01
# -----------------------------------------------------------------------------


# Compute the lr1_closure of a set I.  I is a list of tuples (p,a) where
# p is a LR0 item and a is a terminal

_lr1_add_count = 0

def lr1_closure(I):
    '''
    Return the LR(1) closure of I, a list of (item, lookahead) tuples.
    Uses each item's lr_added stamp (per lookahead) against
    _lr1_add_count to avoid duplicates.  (Part of the UNFINISHED LALR
    machinery.)
    '''
    global _lr1_add_count

    _lr1_add_count += 1

    J = I[:]

    # Loop over items (p,a) in I.
    ji = 0
    while ji < len(J):
        p,a = J[ji]
        #  p = [ A -> alpha . B beta]

        #  For each production B -> gamma
        for B in p.lr1_after:
            f = tuple(p.lr1_beta + (a,))

            # For each terminal b in first(Beta a)
            for b in first(f):
                # Check if (B -> . gamma, b) is in J
                # Only way this can happen is if the add count mismatches
                pn = B.lr_next
                if pn.lr_added.get(b,0) == _lr1_add_count: continue
                pn.lr_added[b] = _lr1_add_count
                J.append((pn,b))
        ji += 1

    return J

def lalr_parse_table():
    '''
    UNFINISHED LALR(1) construction: currently only annotates LR items
    with lr1_after/lr1_beta, builds the LR(0) kernels and prints them.
    Does NOT fill _lr_action/_lr_goto.
    '''
    # Compute some lr1 information about all of the productions
    for p in LRitems:
        try:
            after = p.prod[p.lr_index + 1]
            p.lr1_after = Prodnames[after]
            p.lr1_beta = p.prod[p.lr_index + 2:]
        except LookupError:
            # Dot at the end of the production: nothing follows it.
            p.lr1_after = [ ]
            p.lr1_beta = [ ]
        p.lr_added = { }

    # Compute the LR(0) items
    C = lr0_items()
    CK = []
    for I in C:
        CK.append(lr0_kernel(I))

    # Debug output only -- the construction stops here.
    print CK

# -----------------------------------------------------------------------------
#                          ==== LR Utility functions ====
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# _lr_write_tables()
#
# This function writes the LR parsing tables to a file
# -----------------------------------------------------------------------------

def lr_write_tables(modulename=tab_module):
    '''
    Write the computed tables to an importable module <modulename>.py:
    _lr_method, _lr_signature (grammar digest used by lr_read_tables to
    detect staleness), _lr_action, _lr_goto and _lr_productions.  I/O
    failures are reported and ignored (best effort).
    '''
    filename = modulename + ".py"
    try:
        f = open(filename,"w")

        f.write("""
# %s
# This file is automatically generated. Do not edit.

_lr_method = %s

_lr_signature = %s
""" % (filename, repr(_lr_method), repr(Signature.digest())))

        # Change smaller to 0 to go back to original tables
        smaller = 1

        # Factor out names to try and make smaller
        # (group entries by token so each token name is emitted once).
        if smaller:
            items = { }

            for k,v in _lr_action.items():
                i = items.get(k[1])
                if not i:
                    i = ([],[])
                    items[k[1]] = i
                i[0].append(k[0])
                i[1].append(v)

            f.write("\n_lr_action_items = {")
            for k,v in items.items():
                f.write("%r:([" % k)
                for i in v[0]:
                    f.write("%r," % i)
                f.write("],[")
                for i in v[1]:
                    f.write("%r," % i)

                f.write("]),")
            f.write("}\n")

            # The generated module rebuilds the flat (state,token) dict
            # from the grouped form at import time.
            f.write("""
_lr_action = { }
for _k, _v in _lr_action_items.items():
   for _x,_y in zip(_v[0],_v[1]):
       _lr_action[(_x,_k)] = _y
del _lr_action_items
""")

        else:
            f.write("\n_lr_action = { ");
            for k,v in _lr_action.items():
                f.write("(%r,%r):%r," % (k[0],k[1],v))
            f.write("}\n");

        if smaller:
            # Factor out names to try and make smaller
            items = { }

            for k,v in _lr_goto.items():
                i = items.get(k[1])
                if not i:
                    i = ([],[])
                    items[k[1]] = i
                i[0].append(k[0])
                i[1].append(v)

            f.write("\n_lr_goto_items = {")
            for k,v in items.items():
                f.write("%r:([" % k)
                for i in v[0]:
                    f.write("%r," % i)
                f.write("],[")
                for i in v[1]:
                    f.write("%r," % i)

                f.write("]),")
            f.write("}\n")

            f.write("""
_lr_goto = { }
for _k, _v in _lr_goto_items.items():
   for _x,_y in zip(_v[0],_v[1]):
       _lr_goto[(_x,_k)] = _y
del _lr_goto_items
""")
        else:
            f.write("\n_lr_goto = { ");
            for k,v in _lr_goto.items():
                f.write("(%r,%r):%r," % (k[0],k[1],v))
            f.write("}\n");

        # Write production table
        f.write("_lr_productions = [\n")
        for p in Productions:
            if p:
                if (p.func):
                    f.write("  (%r,%d,%r,%r,%d),\n" % (p.name, p.len, p.func.__name__,p.file,p.line))
                else:
                    f.write("  (%r,%d,None,None,None),\n" % (p.name, p.len))
            else:
                f.write("  None,\n")
        f.write("]\n")
        f.close()

    except IOError,e:
        print "Unable to create '%s'" % filename
        print e
        return

def lr_read_tables(module=tab_module,optimize=0):
    '''
    Try to load previously written tables from <module>.  Returns 1 and
    installs them into the module-level _lr_* variables when the stored
    signature matches the current grammar digest (or optimize is set);
    returns 0 when missing or stale.
    '''
    global _lr_action, _lr_goto, _lr_productions, _lr_method
    try:
        exec "import %s as parsetab" % module

        if (optimize) or (Signature.digest() == parsetab._lr_signature):
            _lr_action = parsetab._lr_action
            _lr_goto   = parsetab._lr_goto
            _lr_productions = parsetab._lr_productions
            _lr_method = parsetab._lr_method
            return 1
        else:
            return 0

    except (ImportError,AttributeError):
        return 0

# -----------------------------------------------------------------------------
# yacc(module)
#
# Build the parser module
# -----------------------------------------------------------------------------

def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, start=None, check_recursion=1, optimize=0,write_tables=1,debugfile=debug_file):
    '''
    Build a parser from the p_* rule functions found in 'module' (or the
    caller's globals) and return a Parser object; also rebinds the
    module-level parse() to the new parser.  Cached tables in 'tabmodule'
    are reused when their signature matches.  Raises YaccError on any
    grammar problem.
    '''
    global yaccdebug
    yaccdebug = debug

    initialize_vars()
    files = { }
    error = 0

    # Add starting symbol to signature
    if start:
        Signature.update(start)

    # If a "module" parameter was supplied, extract its dictionary.
    # Note: a module may in fact be an instance as well.

    if module:
        # User supplied a module object.
        if isinstance(module, types.ModuleType):
            ldict = module.__dict__
        elif isinstance(module, types.InstanceType):
            # Old-style class instance: copy its attributes into a dict.
            _items = [(k,getattr(module,k)) for k in dir(module)]
            ldict = { }
            for i in _items:
                ldict[i[0]] = i[1]
        else:
            raise ValueError,"Expected a module"

    else:
        # No module given.  We might be able to get information from the caller.
        # Throw an exception and unwind the traceback to get the globals

        try:
            raise RuntimeError
        except RuntimeError:
            e,b,t = sys.exc_info()
            f = t.tb_frame
            f = f.f_back           # Walk out to our calling function
            ldict = f.f_globals    # Grab its globals dictionary

    # If running in optimized mode.  We're going to
    # trust the cached tables and skip all validation.

    if (optimize and lr_read_tables(tabmodule,1)):
        # Read parse table
        del Productions[:]
        for p in _lr_productions:
            if not p:
                Productions.append(None)
            else:
                # Rebuild lightweight productions from the stored tuples
                # (name, len, funcname, file, line).
                m = MiniProduction()
                m.name = p[0]
                m.len = p[1]
                m.file = p[3]
                m.line = p[4]
                if p[2]:
                    m.func = ldict[p[2]]
                Productions.append(m)

    else:
        # Get the tokens map
        if (module and isinstance(module,types.InstanceType)):
            tokens = getattr(module,"tokens",None)
        else:
            tokens = ldict.get("tokens",None)

        if not tokens:
            raise YaccError,"module does not define a list 'tokens'"
        if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)):
            raise YaccError,"tokens must be a list or tuple."

        # Check to see if a requires dictionary is defined.
        requires = ldict.get("require",None)
        if requires:
            if not (isinstance(requires,types.DictType)):
                raise YaccError,"require must be a dictionary."

            for r,v in requires.items():
                try:
                    if not (isinstance(v,types.ListType)):
                        raise TypeError
                    v1 = [x.split(".") for x in v]
                    Requires[r] = v1
                except StandardError:
                    print "Invalid specification for rule '%s' in require. Expected a list of strings" % r


        # Build the dictionary of terminals.  We a record a 0 in the
        # dictionary to track whether or not a terminal is actually
        # used in the grammar

        if 'error' in tokens:
            print "yacc: Illegal token 'error'.  Is a reserved word."
            raise YaccError,"Illegal token name"

        for n in tokens:
            if Terminals.has_key(n):
                print "yacc: Warning. Token '%s' multiply defined." % n
            Terminals[n] = [ ]

        Terminals['error'] = [ ]

        # Get the precedence map (if any)
        prec = ldict.get("precedence",None)
        if prec:
            if not (isinstance(prec,types.ListType) or isinstance(prec,types.TupleType)):
                raise YaccError,"precedence must be a list or tuple."
            add_precedence(prec)
            Signature.update(repr(prec))

        for n in tokens:
            if not Precedence.has_key(n):
                Precedence[n] = ('right',0)         # Default, right associative, 0 precedence

        # Look for error handler
        ef = ldict.get('p_error',None)
        if ef:
            if isinstance(ef,types.FunctionType):
                ismethod = 0
            elif isinstance(ef, types.MethodType):
                ismethod = 1
            else:
                raise YaccError,"'p_error' defined, but is not a function or method."
            eline = ef.func_code.co_firstlineno
            efile = ef.func_code.co_filename
            files[efile] = None

            if (ef.func_code.co_argcount != 1+ismethod):
                raise YaccError,"%s:%d: p_error() requires 1 argument." % (efile,eline)
            global Errorfunc
            Errorfunc = ef
        else:
            print "yacc: Warning. no p_error() function is defined."

        # Get the list of built-in functions with p_ prefix
        symbols = [ldict[f] for f in ldict.keys()
               if (type(ldict[f]) in (types.FunctionType, types.MethodType) and ldict[f].__name__[:2] == 'p_'
                   and ldict[f].__name__ != 'p_error')]

        # Check for non-empty symbols
        if len(symbols) == 0:
            raise YaccError,"no rules of the form p_rulename are defined."

        # Sort the symbols by line number so rules are numbered in source
        # order (reduce/reduce resolution depends on this).
        symbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno))

        # Add all of the symbols to the grammar
        for f in symbols:
            if (add_function(f)) < 0:
                error += 1
            else:
                files[f.func_code.co_filename] = None

        # Make a signature of the docstrings
        for f in symbols:
            if f.__doc__:
                Signature.update(f.__doc__)

        lr_init_vars()

        if error:
            raise YaccError,"Unable to construct parser."

        if not lr_read_tables(tabmodule):

            # Validate files
            for filename in files.keys():
                if not validate_file(filename):
                    error = 1

            # Validate dictionary
            validate_dict(ldict)

            if start and not Prodnames.has_key(start):
                raise YaccError,"Bad starting symbol '%s'" % start

            augment_grammar(start)
            error = verify_productions(cycle_check=check_recursion)
            # NOTE(review): type(f) below tests the dictionary *key*
            # (a string), never FunctionType/MethodType, so otherfunc is
            # always empty -- looks like a latent bug; confirm intent.
            otherfunc = [ldict[f] for f in ldict.keys()
               if (type(f) in (types.FunctionType,types.MethodType) and ldict[f].__name__[:2] != 'p_')]

            if error:
                raise YaccError,"Unable to construct parser."

            build_lritems()
            compute_first1()
            compute_follow(start)

            if method == 'SLR':
                slr_parse_table()
            elif method == 'LALR1':
                # NOTE(review): the LALR path returns early -- no tables
                # are written and no Parser object is produced (the LALR
                # code above is marked UNFINISHED).
                lalr_parse_table()
                return
            else:
                raise YaccError, "Unknown parsing method '%s'" % method

            if write_tables:
                lr_write_tables(tabmodule)

            if yaccdebug:
                try:
                    f = open(debugfile,"w")
                    f.write(_vfc.getvalue())
                    f.write("\n\n")
                    f.write(_vf.getvalue())
                    f.close()
                except IOError,e:
                    print "yacc: can't create '%s'" % debugfile,e

    # Made it here.   Create a parser object and set up its internal state.
    # Set global parse() method to bound method of parser object.

    p = Parser("xyzzy")
    p.productions = Productions
    p.errorfunc = Errorfunc
    p.action = _lr_action
    p.goto = _lr_goto
    p.method = _lr_method
    p.require = Requires

    global parse
    parse = p.parse

    # Clean up all of the globals we created
    if (not optimize):
        yacc_cleanup()
    return p
Delete all of the global variables 1853 | # used during table construction 1854 | 1855 | def yacc_cleanup(): 1856 | global _lr_action, _lr_goto, _lr_method, _lr_goto_cache 1857 | del _lr_action, _lr_goto, _lr_method, _lr_goto_cache 1858 | 1859 | global Productions, Prodnames, Prodmap, Terminals 1860 | global Nonterminals, First, Follow, Precedence, LRitems 1861 | global Errorfunc, Signature, Requires 1862 | 1863 | del Productions, Prodnames, Prodmap, Terminals 1864 | del Nonterminals, First, Follow, Precedence, LRitems 1865 | del Errorfunc, Signature, Requires 1866 | 1867 | global _vf, _vfc 1868 | del _vf, _vfc 1869 | 1870 | 1871 | # Stub that raises an error if parsing is attempted without first calling yacc() 1872 | def parse(*args,**kwargs): 1873 | raise YaccError, "yacc: No parser built with yacc()" 1874 | 1875 | --------------------------------------------------------------------------------