├── Makefile ├── samples ├── foo_lib.c └── foo.c ├── .gitignore ├── README.md ├── c.py ├── clex.py ├── cparse.py ├── cvisitors.py ├── lex.py ├── cx86.py └── yacc.py /Makefile: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Makefile 3 | # 4 | # Atul Varma 5 | # Python C Compiler - Makefile 6 | # $Id: Makefile,v 1.8 2004/06/02 21:11:57 varmaa Exp $ 7 | # 8 | # This just makes all the sample code and lets you clean up 9 | # intermediate/output files. 10 | # --------------------------------------------------------------- 11 | 12 | FLAGS=-annotate -ast 13 | PYTHON=python 14 | 15 | compile-samples: 16 | ${PYTHON} c.py samples/foo.c samples/foo_lib.c ${FLAGS} 17 | gcc samples/foo.s samples/foo_lib.s -o samples/foo 18 | 19 | clean: 20 | rm -f parsetab.py parser.out *.pyc samples/*.ast \ 21 | samples/*.s samples/*.exe samples/foo 22 | -------------------------------------------------------------------------------- /samples/foo_lib.c: -------------------------------------------------------------------------------- 1 | /******************************************************************* 2 | * foo_lib.c 3 | * Atul Varma - 5/24/2004 4 | * CS Independent Study 5 | * $Id: foo_lib.c,v 1.1 2004/05/27 16:25:14 varmaa Exp $ 6 | * 7 | * Contains external library functions/variables for foo.c. 8 | ******************************************************************* 9 | */ 10 | 11 | /* Test global variable. */ 12 | int stuff_count; 13 | 14 | /* Test of static function definition, to make sure it 15 | doesn't conflict with fib() defined in foo.c. */ 16 | static int fib() 17 | { 18 | return stuff_count += 1; 19 | } 20 | 21 | /* Increment global variable. 
*/ 22 | int increment_stuff_count() 23 | { 24 | fib(); 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /samples/foo.c: -------------------------------------------------------------------------------- 1 | /******************************************************************* 2 | * foo.c 3 | * Atul Varma - 5/24/2004 4 | * CS Independent Study 5 | * $Id: foo.c,v 1.1 2004/05/27 16:25:14 varmaa Exp $ 6 | * 7 | * This is a simple C file that should be compiled by my mini-C 8 | * compiler. 9 | ******************************************************************* 10 | */ 11 | 12 | /* Prototypes for some standard C library functions (the code 13 | calls these directly). */ 14 | extern int printf(char *str, ...); 15 | extern char *malloc(int size); 16 | extern int free(char *ptr); 17 | 18 | /* Test of extern variable. 
How many times we've called 19 | a printf() function. */ 20 | extern int stuff_count; 21 | 22 | /* Increments this global variable. */ 23 | extern int increment_stuff_count(); 24 | 25 | /* Test of global variable. How many times we've called 26 | the fib() function. */ 27 | int fib_count; 28 | 29 | /* fibonacci function: Test of basic branching and recursion. */ 30 | static int fib(int i) 31 | { 32 | fib_count += 1; 33 | if (i == 1) { 34 | return 1; 35 | } else { 36 | if (i == 0) { 37 | return 0; 38 | } else { 39 | return fib(i-1) + fib(i-2); 40 | } 41 | } 42 | } 43 | 44 | /* Just a wrapper to easily show the results of a 45 | call to fib(). */ 46 | static int show_fib(int i) 47 | { 48 | printf("fib(%d) is %d.\n", i, fib(i)); 49 | return 0; 50 | } 51 | 52 | /* Test of pointer indirection and char type. */ 53 | static int set_a(char *c) 54 | { 55 | *c = 'a'; 56 | return 0; 57 | } 58 | 59 | /* Test of string literals and returning char *'s. */ 60 | static char *get_literal() 61 | { 62 | return "blah\n"; 63 | } 64 | 65 | /* Main program that runs the tests. */ 66 | int main(int argc, char **argv) { 67 | char c; 68 | int i; 69 | 70 | c = 'h'; 71 | 72 | /* Test of multiple assignment. */ 73 | fib_count = stuff_count = 0; 74 | 75 | /* Test of command-line argument passing, pointer 76 | indirection/array indexing, for looping. */ 77 | printf("My executable name is %s.\n", *argv); 78 | for (i = 0; i < argc; i += 1) { 79 | printf(" argv[%d] is: %s " 80 | "argv[%d][0] is: %c\n", i, argv[i], i, argv[i][0]); 81 | increment_stuff_count(); 82 | } 83 | 84 | /* Test of while looping with break/continue. */ 85 | i = 0; 86 | while (1) { 87 | show_fib(i); 88 | i += 1; 89 | if (i > 5) 90 | break; 91 | else 92 | continue; 93 | } 94 | stuff_count = stuff_count * 2; 95 | 96 | printf("fib_count is %d.\n", fib_count); 97 | printf("stuff_count is %d.\n", stuff_count); 98 | 99 | printf("before set_a(&c), c == '%c'\n", c); 100 | 101 | /* Test of address-of (&) operator. 
*/ 102 | set_a(&c); 103 | 104 | { 105 | /* Test of char-int and int-char type coercion. */ 106 | int a; 107 | char b; 108 | int c; 109 | 110 | /* Note that in two's complement arithmetic, this is 111 | a 32-bit int consisting of all 1's. 112 | 113 | (This is also a test of the '-' unary operator.) */ 114 | a = -1; 115 | 116 | /* The following line will raise a warning from the 117 | compiler, because a signed 32-bit int is being truncated 118 | to an unsigned 8-bit char. */ 119 | b = a; 120 | 121 | c = b; 122 | 123 | printf(" a = %d\n", a); 124 | printf(" b = %d\n", b); 125 | printf(" c = %d\n", c); 126 | } 127 | 128 | /* Note now that the scope of c is in the function's main 129 | scope, not the scope of the above compound statement. 130 | This test makes sure that the address and contents 131 | of c did not change during the execution of the 132 | compound statement. */ 133 | printf("after set_a(&c), c == '%c'\n", c); 134 | 135 | printf("get_literal() = %s\n", get_literal()); 136 | 137 | /* Pointer indexing via array example. */ 138 | printf("get_literal()[3] = %c\n", get_literal()[3]); 139 | 140 | { 141 | /* Test of building a string using assignment via array indexing 142 | of a char pointer. The buffer is dynamically allocated. */ 143 | char *c; 144 | 145 | c = malloc(30); 146 | c[0] = 'h'; 147 | c[1] = 'i'; 148 | c[2] = 0; 149 | printf("array-built string is: %s\n", c); 150 | free(c); 151 | } 152 | return 0; 153 | } 154 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [Atul](https://github.com/toolness)'s Mini-C Compiler 2 | June 2, 2004 3 | 4 | This is a compiler for a subset of the C programming language. It was 5 | written in Python during the spring of 2004. 6 | 7 | The lexer and parser were constructed using Dave Beazley's PLY (Python 8 | Lex-Yacc), an open-source Python implementation of GNU 9 | lex/yacc. 
Stages of compilation (symbol tree generation, type 10 | checking, flow control checking, etc) are performed using an 11 | object-oriented design pattern called a visitor (GoF 1995). The output 12 | is annotated Intel 80x86 assembly, suitable for translation to machine 13 | language using the GNU Assembler (GAS). 14 | 15 | --------------------------------------------------------------- 16 | LANGUAGE FEATURES 17 | --------------------------------------------------------------- 18 | 19 | The subset of the C language implemented here includes: 20 | 21 | * Functions, variables (local and global), and character and 22 | string literals. 23 | 24 | * Assignments (=, +=, etc), standard arithmetic binary and unary 25 | operators (+,-,*, etc), logical binary and unary operators (!, 26 | ==, <, etc). 27 | 28 | * Support for the C datatypes char and int, as well as implicit 29 | type conversion between the two (warnings are raised in 30 | situations of potential data loss). int variables are assumed to 31 | be signed, and char variables are assumed to be unsigned (this 32 | is not a violation of the ANSI C standard). 33 | 34 | * Control flow elements including while and for loops, 35 | if/then/else conditionals, and recursion. 36 | 37 | * Support for the C keywords extern for functions and variables, 38 | and static for functions. 39 | 40 | * Pointers, including pointer dereferencing (the * operator), 41 | multiple levels of indirection (double pointers, triple 42 | pointers, etc), array indexing notation, and the address-of (&) 43 | operator. 44 | 45 | --------------------------------------------------------------- 46 | FILES AND DIRECTORIES 47 | --------------------------------------------------------------- 48 | 49 | lex.py - Python Lex (this is part of PLY). 50 | yacc.py - Python Yacc (this is part of PLY). 51 | clex.py - Mini-C lexer. 52 | cparse.py - Mini-C parser. Contains yacc rules for Mini-C and 53 | defines the classes that make up the AST. 
54 | cvisitors.py - Mini-C visitors. Defines the base visitor class, 55 | and concrete visitor classes for printing the AST, 56 | doing symbol table generation, type checking, and 57 | flow control. 58 | cx86.py - Intel 80x86 assembly code generator. Defines a 59 | virtual stack machine class and the code generator 60 | visitor. 61 | c.py - Front-end to the compiler. This takes in command- 62 | line options and runs the compiler on the filenames 63 | you give it. 64 | samples/ - This directory contains foo.c and foo_lib.c, two 65 | C files that can be compiled by the mini-c 66 | compiler. foo_lib.c is intended to be used as 67 | a library that foo.c accesses, to show 68 | that mini-c generates assembly that can be linked 69 | with gcc. 70 | 71 | --------------------------------------------------------------- 72 | USING THE COMPILER 73 | --------------------------------------------------------------- 74 | 75 | The syntax for using the mini-c compiler is as follows: 76 | 77 | c.py [[source-file-2] ...] [-ast] [-annotate] 78 | 79 | Source files are the C files you want to compile into assembly (.s 80 | files). 81 | 82 | The '-ast' option generates a file with extension .ast that is a 83 | printout of the abstract syntax tree for the source file, after 84 | all stages of compilation occur. 85 | 86 | The '-annotate' option generates annotated assembly. That is, 87 | assembly is generated with comments describing what each instruction 88 | does, its relevance to the original C source code, and so forth. 89 | Additional comments are inserted to delimit functions, control 90 | structures, and so forth. 91 | 92 | --------------------------------------------------------------- 93 | THE MAKEFILE 94 | --------------------------------------------------------------- 95 | 96 | The makefile just compiles the two files in the samples/ directory and 97 | outputs an executable called 'foo' into this directory (all other 98 | output files are also placed here). 
99 | 100 | Note that while compiling this, you may receive a bunch of warnings 101 | mentioning something about an "Illegal character: ''". This is just 102 | an artifact of newline translation differences between platforms and 103 | should be ignored. 104 | -------------------------------------------------------------------------------- /c.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # c.py 3 | # 4 | # Atul Varma 5 | # Python C Compiler - Front-end 6 | # $Id: c.py,v 1.3 2004/05/27 17:52:19 varmaa Exp $ 7 | # 8 | # This is the main program for the compiler, which just parses 9 | # command-line options, figures out which source files to read 10 | # and write to, and invokes the different stages of the 11 | # compiler proper. 12 | # --------------------------------------------------------------- 13 | 14 | import yacc 15 | 16 | import cparse, cvisitors, cx86 17 | 18 | import sys 19 | 20 | class Compiler: 21 | """This object encapsulates the front-end for the compiler and 22 | serves as a facade interface to the 'meat' of the compiler 23 | underneath.""" 24 | 25 | class CompileError(Exception): 26 | """Exception raised when there's been a compilation error.""" 27 | 28 | pass 29 | 30 | def __init__(self): 31 | self.total_errors = 0 32 | self.total_warnings = 0 33 | 34 | def _parse(self): 35 | """Parses the source code.""" 36 | self.ast = yacc.parse(self.code) 37 | 38 | def _compile_phase(self, visitor): 39 | """Applies a visitor to the abstract syntax tree.""" 40 | 41 | visitor.visit(self.ast) 42 | self.total_errors += visitor.errors 43 | self.total_warnings += visitor.warnings 44 | if visitor.has_errors(): 45 | raise Compiler.CompileError() 46 | 47 | def _do_compile(self, outfile, ast_file, show_comments): 48 | """Compiles the code to the given file object. 
Enabling 49 | show_ast prints out the abstract syntax tree.""" 50 | 51 | self._parse() 52 | self._compile_phase(cvisitors.SymtabVisitor()) 53 | self._compile_phase(cvisitors.TypeCheckVisitor()) 54 | self._compile_phase(cvisitors.FlowControlVisitor()) 55 | self._compile_phase(cx86.CodeGenVisitor(outfile, 56 | show_comments)) 57 | if ast_file != None: 58 | self._compile_phase(cvisitors.ASTPrinterVisitor(ast_file)) 59 | 60 | def _print_stats(self): 61 | """Prints the total number of errors/warnings from compilation.""" 62 | 63 | print "%d errors, %d warnings." % (self.total_errors, self.total_warnings) 64 | 65 | def compile(self, code, outfile, show_ast, show_comments): 66 | """Compiles the given code string to the given file object.""" 67 | 68 | self.code = code 69 | try: 70 | self._do_compile(outfile, show_ast, show_comments) 71 | except cparse.ParseError: 72 | print "Errors encountered, bailing." 73 | return 1 74 | except Compiler.CompileError: 75 | self._print_stats() 76 | print "Errors encountered, bailing." 77 | return 1 78 | self._print_stats() 79 | print "Compile successful." 80 | return 0 81 | 82 | def run_compiler(): 83 | """Runs the command-line compiler.""" 84 | 85 | if len(sys.argv) < 2: 86 | print "Usage: c.py [[source-file-2] ...] [-ast] [-annotate]" 87 | sys.exit(1) 88 | 89 | show_ast = 0 90 | show_comments = 0 91 | 92 | params = sys.argv[1:] 93 | files = sys.argv[1:] 94 | 95 | for param in params: 96 | if param[0] == '-': 97 | if param == '-ast': 98 | show_ast = 1 99 | elif param == '-annotate': 100 | print "Annotated assembly generation enabled." 101 | show_comments = 1 102 | else: 103 | print "Unknown option: %s" % param 104 | sys.exit(1) 105 | files.remove(param) 106 | 107 | for file in files: 108 | source_filename = file 109 | dest_filename = file[:-2]+'.s' 110 | print "Compiling %s -> %s." 
% (source_filename, dest_filename) 111 | open_files = [] 112 | ast_file = None 113 | if show_ast: 114 | ast_filename = file[:-2]+'.ast' 115 | print "Outputting AST to %s." % ast_filename 116 | ast_file = open(ast_filename, 'w') 117 | open_files.append(ast_file) 118 | source = open(source_filename, 'r') 119 | code = source.read() 120 | source.close() 121 | dest = open(dest_filename, 'w') 122 | open_files.append(dest) 123 | retval = Compiler().compile(code, dest, ast_file, show_comments) 124 | for f in open_files: 125 | f.close() 126 | if retval != 0: 127 | sys.exit(retval) 128 | print 129 | 130 | sys.exit(retval) 131 | 132 | if __name__ == '__main__': 133 | run_compiler() 134 | 135 | # --------------------------------------------------------------- 136 | # End of c.py 137 | # --------------------------------------------------------------- 138 | -------------------------------------------------------------------------------- /clex.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # clex.py 3 | # 4 | # Atul Varma 5 | # Python C Compiler - Lexical Analyzer 6 | # $Id: clex.py,v 1.2 2004/06/02 21:05:45 varmaa Exp $ 7 | # --------------------------------------------------------------- 8 | 9 | import lex 10 | import re 11 | 12 | # --------------------------------------------------------------- 13 | # TOKEN LIST 14 | # --------------------------------------------------------------- 15 | 16 | tokens = ( 17 | # Reserved words 18 | 'AUTO', 19 | 'BREAK', 20 | 'CASE', 21 | 'CHAR', 22 | 'CONST', 23 | 'CONTINUE', 24 | 'DEFAULT', 25 | 'DO', 26 | 'DOUBLE', 27 | 'ELSE', 28 | 'ENUM', 29 | 'EXTERN', 30 | 'FLOAT', 31 | 'FOR', 32 | 'GOTO', 33 | 'IF', 34 | 'INT', 35 | 'LONG', 36 | 'REGISTER', 37 | 'RETURN', 38 | 'SHORT', 39 | 'SIGNED', 40 | 'SIZEOF', 41 | 'STATIC', 42 | 'STRUCT', 43 | 'SWITCH', 44 | 'TYPEDEF', 45 | 'UNION', 46 | 'UNSIGNED', 47 | 'VOID', 48 | 'VOLATILE', 49 | 
'WHILE', 50 | 51 | # Special characters 52 | 'COMMA', 53 | 'COLON', 54 | 'SEMICOLON', 55 | 'LPAREN', 56 | 'RPAREN', 57 | 'LBRACKET', 58 | 'RBRACKET', 59 | 'LBRACE', 60 | 'RBRACE', 61 | 'ASSIGN', 62 | 'GREATER', 63 | 'LESS', 64 | 'EQ', 65 | 'NOT_EQ', 66 | 'GREATER_EQ', 67 | 'LESS_EQ', 68 | 'DOUBLE_PLUS', 69 | 'DOUBLE_MINUS', 70 | 'PLUS', 71 | 'MINUS', 72 | 'TIMES', 73 | 'DIV', 74 | 'MODULO', 75 | 'DOUBLE_AMPERSAND', 76 | 'DOUBLE_PIPE', 77 | 'EXCLAMATION', 78 | 'AMPERSAND', 79 | 'PIPE', 80 | 'CARET', 81 | 'ASTERISK', 82 | 'QUESTION', 83 | 'TILDE', 84 | 'POUND', 85 | 'DOT', 86 | 'ELLIPSIS', 87 | 'ARROW', 88 | 'SHIFT_LEFT', 89 | 'SHIFT_RIGHT', 90 | 'EQ_PLUS', 91 | 'EQ_MINUS', 92 | 'EQ_TIMES', 93 | 'EQ_DIV', 94 | 'EQ_MODULO', 95 | 'EQ_PIPE', 96 | 'EQ_AMPERSAND', 97 | 'EQ_CARET', 98 | 'EQ_SHIFT_LEFT', 99 | 'EQ_SHIFT_RIGHT', 100 | 101 | # Complex tokens 102 | 'ID', 103 | 'FNUMBER', 104 | 'INUMBER', 105 | 'STRING', 106 | 'CHARACTER', 107 | ) 108 | 109 | # --------------------------------------------------------------- 110 | # RESERVED WORDS 111 | # --------------------------------------------------------------- 112 | 113 | reserved_words = { 114 | 'auto' : 'AUTO', 115 | 'break' : 'BREAK', 116 | 'case' : 'CASE', 117 | 'char' : 'CHAR', 118 | 'const' : 'CONST', 119 | 'continue' : 'CONTINUE', 120 | 'default' : 'DEFAULT', 121 | 'do' : 'DO', 122 | 'double' : 'DOUBLE', 123 | 'else' : 'ELSE', 124 | 'enum' : 'ENUM', 125 | 'extern' : 'EXTERN', 126 | 'float' : 'FLOAT', 127 | 'for' : 'FOR', 128 | 'goto' : 'GOTO', 129 | 'if' : 'IF', 130 | 'int' : 'INT', 131 | 'long' : 'LONG', 132 | 'register' : 'REGISTER', 133 | 'return' : 'RETURN', 134 | 'short' : 'SHORT', 135 | 'signed' : 'SIGNED', 136 | 'sizeof' : 'SIZEOF', 137 | 'static' : 'STATIC', 138 | 'struct' : 'STRUCT', 139 | 'switch' : 'SWITCH', 140 | 'typedef' : 'TYPEDEF', 141 | 'union' : 'UNION', 142 | 'unsigned' : 'UNSIGNED', 143 | 'void' : 'VOID', 144 | 'volatile' : 'VOLATILE', 145 | 'while' : 'WHILE' 146 | } 147 | 148 | # 
--------------------------------------------------------------- 149 | # SPECIAL CHARACTERS 150 | # --------------------------------------------------------------- 151 | 152 | t_COMMA = r',' 153 | t_COLON = r':' 154 | t_SEMICOLON = r';' 155 | t_LPAREN = r'\(' 156 | t_RPAREN = r'\)' 157 | t_LBRACKET = r'\[' 158 | t_RBRACKET = r'\]' 159 | t_LBRACE = r'{' 160 | t_RBRACE = r'}' 161 | t_ASSIGN = r'=' 162 | t_GREATER = r'>' 163 | t_LESS = r'<' 164 | t_EQ = r'==' 165 | t_NOT_EQ = r'!=' 166 | t_GREATER_EQ = r'>=' 167 | t_LESS_EQ = r'<=' 168 | t_DOUBLE_PLUS = r'\+\+' 169 | t_DOUBLE_MINUS = r'--' 170 | t_PLUS = r'\+' 171 | t_MINUS = r'-' 172 | t_TIMES = r'\*' 173 | t_DIV = r'/(?!\*)' 174 | t_MODULO = r'%' 175 | t_DOUBLE_AMPERSAND = r'&&' 176 | t_DOUBLE_PIPE = r'\|\|' 177 | t_EXCLAMATION = r'!' 178 | t_AMPERSAND = r'&' 179 | t_PIPE = r'\|' 180 | t_CARET = r'^' 181 | t_ASTERISK = r'\*' 182 | t_QUESTION = r'\?' 183 | t_TILDE = r'~' 184 | t_POUND = r'\#' 185 | t_ELLIPSIS = r'\.\.\.' 186 | t_DOT = r'\.' 187 | t_ARROW = r'->' 188 | t_SHIFT_LEFT = r'<<' 189 | t_SHIFT_RIGHT = r'>>' 190 | t_EQ_PLUS = r'\+=' 191 | t_EQ_MINUS = r'-=' 192 | t_EQ_TIMES = r'\*=' 193 | t_EQ_DIV = r'/=' 194 | t_EQ_MODULO = r'%=' 195 | t_EQ_PIPE = r'\|=' 196 | t_EQ_AMPERSAND = r'&=' 197 | t_EQ_CARET = r'\^=' 198 | t_EQ_SHIFT_LEFT = r'<<=' 199 | t_EQ_SHIFT_RIGHT = r'>>=' 200 | 201 | # --------------------------------------------------------------- 202 | # COMPLEX TOKENS 203 | # --------------------------------------------------------------- 204 | 205 | def t_ID(t): 206 | r'[A-Za-z_][\w]*' 207 | if reserved_words.has_key(t.value): 208 | t.type = reserved_words[t.value] 209 | return t 210 | 211 | def t_FNUMBER(t): 212 | r'((0(?!\d))|([1-9]\d*))((\.\d+(e[+-]?\d+)?)|(e[+-]?\d+))' 213 | return t 214 | 215 | def t_malformed_fnumber(t): 216 | r'(0\d+)((\.\d+(e[+-]?\d+)?)|(e[+-]?\d+))' 217 | print "Line %d. 
Malformed floating point number '%s'" % (t.lineno, t.value) 218 | 219 | def t_INUMBER(t): 220 | r'0(?!\d)|([1-9]\d*)' 221 | return t 222 | 223 | def t_malformed_inumber(t): 224 | r'0\d+' 225 | print "Line %d. Malformed integer '%s'" % (t.lineno, t.value) 226 | 227 | def t_CHARACTER(t): 228 | r"'\w'" 229 | return t 230 | 231 | def t_STRING(t): 232 | r'"[^\n]*?(? 0: 266 | t.skip(t.value.index('\n')) 267 | elif t.value[0:2] == '/*': 268 | print "Unterminated comment." 269 | else: 270 | print "Illegal character '%s'" % t.value[0] 271 | t.skip(1) 272 | 273 | # --------------------------------------------------------------- 274 | # MAIN LEXER FUNCTIONALITY 275 | # --------------------------------------------------------------- 276 | 277 | def run_lexer(): 278 | """This is just a debugging function that prints out a list of 279 | tokens, it's not actually called by the compiler or anything.""" 280 | 281 | import sys 282 | file = open(sys.argv[1]) 283 | lines = file.readlines() 284 | file.close() 285 | strings = "" 286 | for i in lines: 287 | strings += i 288 | lex.input(strings) 289 | while 1: 290 | token = lex.token() # Get a token 291 | if not token: break # No more tokens 292 | print "(%s,'%s',%d)" % (token.type, token.value, token.lineno) 293 | 294 | lex.lex() 295 | 296 | if __name__ == '__main__': 297 | run_lexer() 298 | 299 | # --------------------------------------------------------------- 300 | # End of clex.py 301 | # --------------------------------------------------------------- 302 | -------------------------------------------------------------------------------- /cparse.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # cparse.py 3 | # 4 | # Atul Varma 5 | # Python C Compiler - Parser 6 | # $Id: cparse.py,v 1.2 2004/05/27 16:25:08 varmaa Exp $ 7 | # --------------------------------------------------------------- 8 | 9 | import yacc 10 | 11 | from clex 
import tokens 12 | 13 | # --------------------------------------------------------------- 14 | # ABSTRACT SYNTAX TREE - NODES 15 | # --------------------------------------------------------------- 16 | 17 | class Node: 18 | "Base class for all nodes on the abstract syntax tree." 19 | 20 | def is_null(self): 21 | """Returns whether the node represents a null node.""" 22 | 23 | return 0 24 | 25 | def is_const(self): 26 | """Returns whether the node is a constant numeric number 27 | (e.g., "5").""" 28 | 29 | return 0 30 | 31 | def has_address(self): 32 | """Returns whether the node has an address (i.e., is a valid 33 | lvalue).""" 34 | 35 | return self.__dict__.has_key("has_addr") 36 | 37 | def set_has_address(self): 38 | """Tells the node that has an address (is an lvalue). 39 | Ultimately, the address of the node should be placed in the 40 | output_addr attribute.""" 41 | 42 | self.has_addr = 1 43 | self.output_addr = 0 44 | 45 | def calculate(self): 46 | """Calculates the constant numeric value of the node and 47 | its subnodes, if one exists. For instance, if a node 48 | corresponds to the expression "5+3", then this method 49 | would return 8.""" 50 | 51 | return None 52 | 53 | def accept(self, visitor): 54 | """Accept method for visitor classes (see cvisitor.py).""" 55 | 56 | return self._accept(self.__class__, visitor) 57 | 58 | def _accept(self, klass, visitor): 59 | """Accept implementation. This is actually a recursive 60 | function that dynamically figures out which visitor method to 61 | call. This is done by appending the class' name to 'v', so if 62 | the node class is called MyNode, then this method tries 63 | calling visitor.vMyNode(). 
If that node doesn't exist, then 64 | it recursively attempts to call the visitor method 65 | corresponding to the class' superclass (e.g., 66 | visitor.vNode()).""" 67 | 68 | visitor_method = getattr(visitor, "v%s" % klass.__name__, None) 69 | if visitor_method == None: 70 | bases = klass.__bases__ 71 | last = None 72 | for i in bases: 73 | last = self._accept(i, visitor) 74 | return last 75 | else: 76 | return visitor_method(self) 77 | 78 | class NullNode(Node): 79 | """A null node is like a null terminator for AST's.""" 80 | 81 | def __init__(self): 82 | self.type = 'void' 83 | 84 | def is_null(self): 85 | return 1 86 | 87 | class ArrayExpression(Node): 88 | """This is an expression with array notation, like "a[5+b]".""" 89 | 90 | def __init__(self, expr, index): 91 | self.expr = expr 92 | self.index = index 93 | 94 | class StringLiteral(Node): 95 | """A string literal, e.g. the string "Hello World" in 96 | printf("Hello World").""" 97 | 98 | def __init__(self, str): 99 | self._str = str 100 | self.type = PointerType(BaseType('char')) 101 | 102 | def append_str(self, str): 103 | self._str += str 104 | 105 | def get_str(self): 106 | return self._str 107 | 108 | def get_sanitized_str(self): 109 | """Returns a 'sanitized' version of the string, converting 110 | all carriage returns to '\n' symbols, etc.""" 111 | 112 | return self._str.replace('\n', '\\n') 113 | 114 | class Id(Node): 115 | """An identifier, which can correspond to the name of 116 | a function, variable, etc...""" 117 | 118 | def __init__(self, name, lineno): 119 | self.name = name 120 | self.lineno = lineno 121 | 122 | class Const(Node): 123 | """A numeric constant (i.e., an integral literal), such as 124 | the number 5.""" 125 | 126 | def __init__(self, value, type): 127 | self.value = value 128 | self.type = type 129 | 130 | def calculate(self): 131 | return self.value 132 | 133 | def is_const(self): 134 | return 1 135 | 136 | def _get_calculated(node): 137 | """Attempts to calculate the numeric 
value of the expression, 138 | returning a Const node if it was able to convert the expression. 139 | If the expression isn't a constant expression like "5+3", then 140 | this function just returns the node unmodified.""" 141 | 142 | result = node.calculate() 143 | if result != None: 144 | result = int(result) 145 | return Const(result, BaseType('int')) 146 | else: 147 | return node 148 | 149 | class Unaryop(Node): 150 | """Any generic unary operator. This is an abstract base class.""" 151 | 152 | def __init__(self, node): 153 | self.expr = node 154 | 155 | class Negative(Unaryop): 156 | """A negative unary operator, e.g. '-5'.""" 157 | 158 | def calculate(self): 159 | val = self.expr.calculate() 160 | if val != None: 161 | return -val 162 | return None 163 | 164 | class Pointer(Unaryop): 165 | """A pointer dereference, e.g. '*a'.""" 166 | 167 | pass 168 | 169 | class AddrOf(Unaryop): 170 | """An address-of operator, e.g. '&a'.""" 171 | 172 | pass 173 | 174 | class Binop(Node): 175 | """Any binary operator, such as that for arithmetic operations 176 | (+/-/*), assignment operations (=/+=/-=), and so forth.""" 177 | 178 | # List of assignment operators. 
179 | ASSIGN_OPS = ['=', '+=', '-='] 180 | 181 | def __init__(self, left, right, op): 182 | self.left = left 183 | self.right = right 184 | self.op = op 185 | 186 | def calculate(self): 187 | left = self.left.calculate() 188 | right = self.right.calculate() 189 | if left != None and right != None: 190 | return int(eval("%d %s %d" % (left, self.op, right))) 191 | else: 192 | return None 193 | 194 | class IfStatement(Node): 195 | """An if/then/else statement.""" 196 | 197 | def __init__(self, expr, then_stmt, else_stmt): 198 | self.expr = expr 199 | self.then_stmt = then_stmt 200 | self.else_stmt = else_stmt 201 | 202 | class BreakStatement(Node): 203 | """A break statement (used while in a loop structure to bust out 204 | of it).""" 205 | 206 | pass 207 | 208 | class ContinueStatement(Node): 209 | """A continue statement (used while in a loop structure to bust 210 | back to the beginning of it).""" 211 | 212 | pass 213 | 214 | class ReturnStatement(Node): 215 | """A return statement, used to exit a function and optionally 216 | return a value.""" 217 | 218 | def __init__(self, expr): 219 | self.expr = expr 220 | 221 | class ForLoop(Node): 222 | """A for loop.""" 223 | 224 | def __init__(self, begin_stmt, expr, end_stmt, stmt): 225 | self.expr = expr 226 | self.stmt = stmt 227 | self.begin_stmt = begin_stmt 228 | self.end_stmt = end_stmt 229 | 230 | class WhileLoop(Node): 231 | """A while loop.""" 232 | 233 | def __init__(self, expr, stmt): 234 | self.expr = expr 235 | self.stmt = stmt 236 | 237 | class NodeList(Node): 238 | """A list of nodes. This is an abstract base class.""" 239 | 240 | def __init__(self, node=None): 241 | self.nodes = [] 242 | if node != None: 243 | self.nodes.append(node) 244 | 245 | def add(self, node): 246 | self.nodes.append(node) 247 | 248 | class ArgumentList(NodeList): 249 | """A list of arguments for a function expression. 
class ParamList(NodeList):
    """A list of parameters for a function prototype, e.g. the list
    'int a, char b, char c' in 'int my_func(int a, char b, char c)'."""

    def __init__(self, node=None):
        NodeList.__init__(self, node)
        # Set to 1 when the parameter list ends in '...' (i.e., the
        # prototype is variadic).
        self.has_ellipsis = 0

class StatementList(NodeList):
    """Any list of statements.  For instance, this can be the list of
    statements in a function body."""

    pass

class TranslationUnit(NodeList):
    """A list of nodes representing the program itself."""

    pass

class DeclarationList(NodeList):
    """A list of variable declarations, such as the ones put
    at the beginning of a compound statement (e.g., the beginning
    of a function body)."""

    pass

class FunctionExpression(Node):
    """An execution of a function, e.g. 'my_func(a,b,c)'."""

    def __init__(self, function, arglist):
        # function: the expression being called (e.g., an Id node).
        # arglist: the ArgumentList of actual arguments.
        self.function = function
        self.arglist = arglist

class CompoundStatement(Node):
    """A compound statement, e.g. '{ int i; i += 1; }'."""

    def __init__(self, declaration_list, statement_list):
        self.declaration_list = declaration_list
        self.statement_list = statement_list

class FunctionDefn(Node):
    """A node representing a function definition (its declaration
    and body)."""

    def __init__(self, declaration, body):
        # Copy the declaration's attributes onto this node so that a
        # FunctionDefn can stand in for a Declaration (e.g., as a
        # symbol table entry).
        self.type = declaration.type
        self.name = declaration.name
        self.extern = declaration.extern
        self.static = declaration.static
        self.body = body

class Declaration(Node):
    """A node representing a declaration of a function or
    variable."""

    def __init__(self, name, type=None):
        if type == None:
            type = NullNode()
        self.extern = 0
        self.static = 0
        self.type = type
        self.name = name
        # Set by the symbol table visitor when the symbol is actually
        # referenced somewhere.
        self.is_used = 0

    def set_base_type(self, type):
        """Set the base (innermost) type of this declaration's
        possibly-nested type.  If no type has been set yet, the given
        type becomes the whole type."""

        if self.type.is_null():
            self.type = type
        else:
            self.type.set_base_type(type)

    def add_type(self, type):
        """Wrap the declaration's current type inside the given
        (outer) type, e.g. turning 'int' into 'pointer(int)'."""

        type.set_base_type(self.type)
        self.type = type

# ---------------------------------------------------------------
# ABSTRACT SYNTAX TREE - TYPE SYSTEM
# ---------------------------------------------------------------

class Type(Node):
    """A node representing the type of another node.  For instance,
    the Binop node representing '5 + a', where a is an int, will have
    a Type node associated with it that represents the fact that
    the result of the Binop is an int.

    Types can also be nested, so that for instance you can have
    a type like 'pointer(pointer(int))' which represents a
    double-pointer to an int.

    This is an abstract base class."""

    def __init__(self, child=None):
        if child == None:
            child = NullNode()
        self.child = child

    def set_base_type(self, type):
        """Set the base (innermost) type of a type.  For instance,
        calling this with a pointer(int) type on a pointer() type
        will give you a pointer(pointer(int))."""

        if self.child.is_null():
            self.child = type
        else:
            self.child.set_base_type(type)

    def get_string(self):
        """Return a string corresponding to the type, e.g.
        'pointer(pointer(int))'."""

        raise NotImplementedError()

    def get_outer_string(self):
        """Return only the outermost type of a type.  e.g.,
        calling this on a pointer(pointer(int)) type will
        return 'pointer'."""

        raise NotImplementedError()

    def is_function(self):
        """Returns whether or not this type represents a
        function."""

        return 0

class BaseType(Type):
    """A base type representing ints, chars, etc..."""

    def __init__(self, type_str, child=None):
        Type.__init__(self, child)
        # type_str is the keyword naming the type, e.g. 'int'/'char'.
        self.type_str = type_str

    def get_string(self):
        return self.type_str

    def get_outer_string(self):
        return self.type_str

class FunctionType(Type):
    """A type representing a function (for function prototypes and
    function calls)."""

    def __init__(self, params=None, child=None):
        Type.__init__(self, child)
        if (params == None):
            params = NullNode()
        self.params = params

    def get_string(self):
        # Build ',t1,t2,...' and strip the leading comma with [1:].
        param_str = ""
        for param in self.params.nodes:
            param_str += "," + param.type.get_string()
        return "function(%s)->%s" % (param_str[1:], self.child.get_string())

    def get_outer_string(self):
        return 'function'

    def is_function(self):
        return 1

    def get_return_type(self):
        """Returns the return type of the function.  Internally,
        this is stored as the nested type within the function."""

        return self.child

    def get_params(self):
        """Returns the list of parameters for the function."""

        return self.params
class PointerType(Type):
    """A type representing a pointer to another (nested) type."""

    def get_string(self):
        return "pointer(%s)" % self.child.get_string()

    def get_outer_string(self):
        return 'pointer'

# ---------------------------------------------------------------
# PARSER GRAMMAR / AST CONSTRUCTION
#
# The only thing the yacc grammar rules do is create an
# abstract syntax tree.  Actual symbol table generation,
# type checking, flow control checking, etc. are done by
# the visitor classes (see cvisitors.py).
# ---------------------------------------------------------------

# Precedence for ambiguous grammar elements.  'ELSE' binds to the
# nearest 'IF' (resolves the classic dangling-else conflict).
precedence = (
    ('right', 'ELSE'),
)

class ParseError(Exception):
    "Exception raised whenever a parsing error occurs."

    pass

def p_translation_unit_01(t):
    '''translation_unit : external_declaration'''
    t[0] = TranslationUnit(t[1])

def p_translation_unit_02(t):
    '''translation_unit : translation_unit external_declaration'''
    t[1].add(t[2])
    t[0] = t[1]

def p_external_declaration(t):
    '''external_declaration : function_definition
                            | declaration'''
    t[0] = t[1]

def p_function_definition_01(t):
    '''function_definition : type_specifier declarator compound_statement'''
    t[2].set_base_type(t[1])
    t[0] = FunctionDefn(t[2], t[3])

def p_function_definition_02(t):
    '''function_definition : STATIC type_specifier declarator compound_statement'''
    t[3].static = 1
    t[3].set_base_type(t[2])
    t[0] = FunctionDefn(t[3], t[4])

def p_declaration_01(t):
    '''declaration : type_specifier declarator SEMICOLON'''
    # A bare function prototype is implicitly extern.
    if isinstance(t[2].type, FunctionType):
        t[2].extern = 1
    t[2].set_base_type(t[1])
    t[0] = t[2]

def p_declaration_02(t):
    '''declaration : EXTERN type_specifier declarator SEMICOLON'''
    t[3].extern = 1
    t[3].set_base_type(t[2])
    t[0] = t[3]

def p_declaration_list_opt_01(t):
    '''declaration_list_opt : empty'''
    t[0] = NullNode()

def p_declaration_list_opt_02(t):
    '''declaration_list_opt : declaration_list'''
    t[0] = t[1]

def p_declaration_list_02(t):
    '''declaration_list : declaration'''
    t[0] = DeclarationList(t[1])

def p_declaration_list_03(t):
    '''declaration_list : declaration_list declaration'''
    t[1].add(t[2])
    t[0] = t[1]

def p_type_specifier(t):
    '''type_specifier : INT
                      | CHAR'''
    t[0] = BaseType(t[1])

def p_declarator_01(t):
    '''declarator : direct_declarator'''
    t[0] = t[1]

def p_declarator_02(t):
    '''declarator : ASTERISK declarator'''
    t[2].set_base_type(PointerType())
    t[0] = t[2]
def p_direct_declarator_01(t):
    '''direct_declarator : ID'''
    t[0] = Declaration(t[1])

def p_direct_declarator_02(t):
    '''direct_declarator : direct_declarator LPAREN parameter_type_list RPAREN'''
    t[1].add_type(FunctionType(t[3]))
    t[0] = t[1]

def p_direct_declarator_03(t):
    '''direct_declarator : direct_declarator LPAREN RPAREN'''
    # A parameterless prototype gets an empty (non-null) ParamList.
    t[1].add_type(FunctionType(ParamList()))
    t[0] = t[1]

def p_parameter_type_list_01(t):
    '''parameter_type_list : parameter_list'''
    t[0] = t[1]

def p_parameter_type_list_02(t):
    '''parameter_type_list : parameter_list COMMA ELLIPSIS'''
    t[1].has_ellipsis = 1
    t[0] = t[1]

def p_parameter_list_01(t):
    '''parameter_list : parameter_declaration'''
    t[0] = ParamList(t[1])

def p_parameter_list_02(t):
    '''parameter_list : parameter_list COMMA parameter_declaration'''
    t[1].add(t[3])
    t[0] = t[1]

def p_parameter_declaration(t):
    '''parameter_declaration : type_specifier declarator'''
    # NOTE: this is the same code as p_declaration_01!
    p_declaration_01(t)

def p_compound_statement_01(t):
    '''compound_statement : LBRACE declaration_list_opt statement_list RBRACE'''
    t[0] = CompoundStatement(t[2], t[3])

def p_compound_statement_02(t):
    '''compound_statement : LBRACE declaration_list_opt RBRACE'''
    # No statements: use a NullNode for the statement list.
    t[0] = CompoundStatement(t[2], NullNode())

def p_expression_statement(t):
    '''expression_statement : expression SEMICOLON'''
    t[0] = t[1]

def p_expression_01(t):
    '''expression : equality_expression'''
    t[0] = t[1]

def p_expression_02(t):
    '''expression : equality_expression ASSIGN expression
                  | equality_expression EQ_PLUS expression
                  | equality_expression EQ_MINUS expression'''
    # Assignments are never constant-folded, so no _get_calculated()
    # here.
    t[0] = Binop(t[1], t[3], t[2])

def p_equality_expression_01(t):
    '''equality_expression : relational_expression'''
    t[0] = t[1]

def p_equality_expression_02(t):
    '''equality_expression : equality_expression EQ relational_expression
                           | equality_expression NOT_EQ relational_expression'''
    t[0] = _get_calculated(Binop(t[1], t[3], t[2]))

def p_relational_expression_01(t):
    '''relational_expression : additive_expression'''
    t[0] = t[1]

def p_relational_expression_02(t):
    '''relational_expression : relational_expression LESS additive_expression
                             | relational_expression GREATER additive_expression
                             | relational_expression LESS_EQ additive_expression
                             | relational_expression GREATER_EQ additive_expression'''
    t[0] = _get_calculated(Binop(t[1], t[3], t[2]))

def p_postfix_expression_01(t):
    '''postfix_expression : primary_expression'''
    t[0] = t[1]

def p_postfix_expression_02(t):
    '''postfix_expression : postfix_expression LPAREN argument_expression_list RPAREN'''
    t[0] = FunctionExpression(t[1], t[3])

def p_postfix_expression_03(t):
    '''postfix_expression : postfix_expression LPAREN RPAREN'''
    t[0] = FunctionExpression(t[1], ArgumentList())

def p_postfix_expression_04(t):
    '''postfix_expression : postfix_expression LBRACKET expression RBRACKET'''
    t[0] = ArrayExpression(t[1], t[3])

def p_argument_expression_list_01(t):
    '''argument_expression_list : expression'''
    t[0] = ArgumentList(t[1])

def p_argument_expression_list_02(t):
    '''argument_expression_list : argument_expression_list COMMA expression'''
    t[1].add(t[3])
    t[0] = t[1]

def p_unary_expression_01(t):
    '''unary_expression : postfix_expression'''
    t[0] = t[1]

def p_unary_expression_02(t):
    '''unary_expression : MINUS unary_expression'''
    t[0] = _get_calculated(Negative(t[2]))

def p_unary_expression_03(t):
    '''unary_expression : PLUS unary_expression'''
    # Unary '+' is a no-op.
    t[0] = t[2]

def p_unary_expression_06(t):
    # BUGFIX: this function was originally also named
    # p_unary_expression_03, which shadowed the PLUS rule above at
    # module level and made PLY silently drop the unary '+' grammar
    # rule.  Renamed so both rules are registered.
    '''unary_expression : EXCLAMATION unary_expression'''
    # horrible hack for the '!' operator... Just insert an
    # (expr == 0) into the AST.
    t[0] = _get_calculated(Binop(t[2], Const(0, BaseType('int')), '=='))
def p_unary_expression_04(t):
    '''unary_expression : ASTERISK unary_expression'''
    t[0] = Pointer(t[2])

def p_unary_expression_05(t):
    '''unary_expression : AMPERSAND unary_expression'''
    t[0] = AddrOf(t[2])

def p_mult_expression_01(t):
    '''mult_expression : unary_expression'''
    t[0] = t[1]

def p_mult_expression_02(t):
    '''mult_expression : mult_expression ASTERISK unary_expression
                       | mult_expression DIV unary_expression
                       | mult_expression MODULO unary_expression'''
    t[0] = _get_calculated(Binop(t[1], t[3], t[2]))

def p_additive_expression_01(t):
    '''additive_expression : mult_expression'''
    t[0] = t[1]

def p_additive_expression_02(t):
    '''additive_expression : additive_expression PLUS mult_expression
                           | additive_expression MINUS mult_expression'''
    t[0] = _get_calculated(Binop(t[1], t[3], t[2]))

def p_primary_expression_01(t):
    '''primary_expression : ID'''
    t[0] = Id(t[1], t.lineno(1))

def p_primary_expression_02(t):
    '''primary_expression : INUMBER'''
    t[0] = Const(int(t[1]), BaseType('int'))

def p_primary_expression_03(t):
    '''primary_expression : FNUMBER'''
    t[0] = Const(float(t[1]), BaseType('double'))

def p_primary_expression_04(t):
    '''primary_expression : CHARACTER'''
    # NOTE(review): eval() on the token text relies on C character
    # literals being valid Python string literals; it would also
    # execute arbitrary expressions in malformed input.  Consider
    # ast.literal_eval instead.
    t[0] = Const(ord(eval(t[1])), BaseType('char'))

def p_primary_expression_05(t):
    '''primary_expression : string_literal'''
    t[0] = t[1]

def p_primary_expression_06(t):
    '''primary_expression : LPAREN expression RPAREN'''
    t[0] = t[2]

def p_string_literal_01(t):
    '''string_literal : STRING'''
    # See the eval() caveat in p_primary_expression_04.
    t[0] = StringLiteral(eval(t[1]))

def p_string_literal_02(t):
    '''string_literal : string_literal STRING'''
    # Adjacent string literals are concatenated, as in C.
    t[1].append_str(eval(t[2]))
    t[0] = t[1]

def p_statement(t):
    '''statement : compound_statement
                 | expression_statement
                 | selection_statement
                 | iteration_statement
                 | jump_statement'''
    t[0] = t[1]

def p_jump_statement_01(t):
    '''jump_statement : RETURN SEMICOLON'''
    t[0] = ReturnStatement(NullNode())

def p_jump_statement_02(t):
    '''jump_statement : RETURN expression SEMICOLON'''
    t[0] = ReturnStatement(t[2])

def p_jump_statement_03(t):
    '''jump_statement : BREAK SEMICOLON'''
    t[0] = BreakStatement()

def p_jump_statement_04(t):
    '''jump_statement : CONTINUE SEMICOLON'''
    t[0] = ContinueStatement()

def p_iteration_statement_01(t):
    '''iteration_statement : WHILE LPAREN expression RPAREN statement'''
    t[0] = WhileLoop(t[3], t[5])

def p_iteration_statement_02(t):
    '''iteration_statement : FOR LPAREN expression_statement expression_statement expression RPAREN statement'''
    t[0] = ForLoop(t[3], t[4], t[5], t[7])

def p_selection_statement_01(t):
    '''selection_statement : IF LPAREN expression RPAREN statement'''
    t[0] = IfStatement(t[3], t[5], NullNode())

def p_selection_statement_02(t):
    '''selection_statement : IF LPAREN expression RPAREN statement ELSE statement'''
    t[0] = IfStatement(t[3], t[5], t[7])

def p_statement_list_02(t):
    '''statement_list : statement'''
    t[0] = StatementList(t[1])

def p_statement_list_03(t):
    '''statement_list : statement_list statement'''
    t[1].add(t[2])
    t[0] = t[1]

def p_empty(t):
    'empty :'
    pass

def p_error(t):
    print "You've got a syntax error somewhere in your code."
    print "It could be around line %d." % t.lineno
    print "Good luck finding it."
    raise ParseError()

# Build the parser table at import time.
yacc.yacc(debug=1)

# ---------------------------------------------------------------
# End of cparse.py
# ---------------------------------------------------------------

# =============== file: cvisitors.py ===============
# ---------------------------------------------------------------
# cvisitors.py
#
# Atul Varma
# Python C Compiler - Visitors
# $Id: cvisitors.py,v 1.3 2004/05/27 17:51:47 varmaa Exp $
#
# The Visitor is a pattern outlined in "Design Patterns" by
# Gamma et al., used here to encapsulate different parts of parsing
# and compilation into separate classes via a mechanism called
# double dispatching.
#
# In this compiler, the yacc grammar rules in cparse.py just create
# the abstract syntax tree, and visitors do the bulk of parsing
# and compilation.
# ---------------------------------------------------------------

# TODO: make it so functions can return void.
# TODO: mark all statements with an 'ignore return value' flag
#       to enable some optimizations if the statement is an
#       expression.
# TODO: move extern, static indicators in functions to their
#       Type object, maybe.
#
# Possible things to do:
#   Add compilation to JVM/python bytecode/z-machine...
#   Implement arrays
#   Pass line numbers to constructors for nodes
#
# Faults so far:
#   * doesn't check for variable initialization before use.
#   * const number ranges aren't being checked.

import cparse

class Visitor:
    """The base visitor class.  This is an abstract base class."""

    def __init__(self):
        # Running counts of diagnostics emitted by this visitor.
        self.warnings = 0
        self.errors = 0

    def _visitList(self, list):
        """Visit a list of nodes.  'list' should be an actual list,
        not a cparse.NodeList object.  Returns the result of visiting
        the last node."""

        last = None
        for i in list:
            last = i.accept(self)
        return last

    def visit(self, node):
        """Visits the given node by telling the node to call the
        visitor's class-specific visitor method for that node's
        class (i.e., double dispatching)."""

        return node.accept(self)

    def warning(self, str):
        """Output a non-fatal compilation warning."""

        print "warning: %s" % str
        self.warnings += 1

    def error(self, str):
        """Output a fatal compilation error."""

        print "error: %s" % str
        self.errors += 1

    def has_errors(self):
        """Returns whether the visitor has encountered any
        errors."""

        return self.errors > 0
# ---------------------------------------------------------------
# ABSTRACT SYNTAX TREE PRINTER (for debugging)
# ---------------------------------------------------------------

class ASTPrinterVisitor(Visitor):
    """Simple visitor that outputs a textual representation of
    the abstract syntax tree, for debugging purposes, to an
    output file."""

    def __init__(self, ast_file, indent_amt=2):
        # ast_file: a writable file-like object to print the AST to.
        # indent_amt: number of spaces per nesting level.
        self.ast_file = ast_file
        Visitor.__init__(self)
        self._indent = 0
        self._indent_amt = indent_amt

    def indent(self):
        self._indent += self._indent_amt

    def unindent(self):
        self._indent -= self._indent_amt

    def p(self, str):
        """Write one line at the current indentation level.

        BUGFIX: this used to emit ' ' * (self._indent_amt *
        self._indent) spaces, but indent() already advances _indent
        by _indent_amt, so each level was indented indent_amt**2
        spaces instead of indent_amt."""

        self.ast_file.write(
            (' ' * self._indent) + str + "\n")

    def pNodeInfo(self, node):
        # Print out the name of the node's class.
        self.p('+ ' + node.__class__.__name__)

        # If the node has a type associated with it,
        # print the string of the type.
        if node.__dict__.has_key("type"):
            self.p("  Type-string: %s" % node.type.get_string())

        # Find all attributes of the node that are ints or
        # strings and aren't 'private' (i.e., don't begin with
        # '_'), and print their values.
        for key in node.__dict__.keys():
            if key[0] == '_':
                continue
            val = node.__dict__[key]
            if (isinstance(val, str) or
                isinstance(val, int)):
                self.p("  %s: %s" % (key, str(val)))

    def pSubnodeInfo(self, subnode, label):
        """Print a labeled, indented sub-tree, unless the subnode is
        the null node."""

        if not subnode.is_null():
            self.p("  %s:" % label)
            self.indent()
            subnode.accept(self)
            self.unindent()

    def vNullNode(self, node):
        self.pNodeInfo(node)

    def vArrayExpression(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.expr, "Expression")
        self.pSubnodeInfo(node.index, "Index")

    def vStringLiteral(self, node):
        self.pNodeInfo(node)
        self.p('  Value: "%s"' % node.get_sanitized_str())

    def vId(self, node):
        self.pNodeInfo(node)

    def vUnaryop(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.expr, "Expression")

    def vFunctionExpression(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.function, "Function")
        self.pSubnodeInfo(node.arglist, "Arguments")

    def vConst(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.type, "Type")

    def vBinop(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.left, "Left operand")
        self.pSubnodeInfo(node.right, "Right operand")

    def vNodeList(self, node):
        self.pNodeInfo(node)
        self.indent()
        self._visitList(node.nodes)
        self.unindent()

    def vCompoundStatement(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.declaration_list, "Declaration list")
        self.pSubnodeInfo(node.statement_list, "Statement list")

    def vBaseType(self, node):
        self.pNodeInfo(node)

    def vFunctionType(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.params, "Parameters:")
        self.pSubnodeInfo(node.child, "Child:")

    def vPointerType(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.child, "Child:")

    def vDeclaration(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.type, "Type")

    def vReturnStatement(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.expr, "Expression")

    def vFunctionDefn(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.type, "Type")
        self.pSubnodeInfo(node.body, "Body")

    def vIfStatement(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.expr, "Expression")
        self.pSubnodeInfo(node.then_stmt, "Then statement")
        self.pSubnodeInfo(node.else_stmt, "Else statement")

    def vWhileLoop(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.expr, "Expression")
        self.pSubnodeInfo(node.stmt, "Statement")

    def vForLoop(self, node):
        self.pNodeInfo(node)
        self.pSubnodeInfo(node.begin_stmt, "Begin statement")
        self.pSubnodeInfo(node.expr, "Test expression")
        self.pSubnodeInfo(node.end_stmt, "End statement")
        self.pSubnodeInfo(node.stmt, "Statement")
# ---------------------------------------------------------------
# SYMBOL TABLE GENERATION
# ---------------------------------------------------------------

class Symtab:
    """A symbol table.  This is a simple object that just keeps a
    hashtable of symbol names and the Declaration or FunctionDefn
    nodes that they refer to.

    There is a separate symbol table for each code element that
    has its own scope (for instance, each compound statement will
    have its own symbol table).  As a result, symbol tables can
    be nested if the code elements are nested, and symbol table
    lookups will recurse upwards through parents to represent
    lexical scoping rules."""

    class SymbolDefinedError(Exception):
        """Exception raised when the code tries to add a symbol
        to a table where the symbol has already been defined.
        Note that 'defined' is used in the C sense here--i.e.,
        'space has been allocated for the symbol', as opposed
        to a declaration."""

        pass

    class SymbolConflictError(Exception):
        """Exception raised when the code tries to add a
        symbol to a table where the symbol already exists
        and its type differs from the previously existing
        one."""

        pass

    def __init__(self, parent=None):
        """Creates an empty symbol table with the given
        parent symbol table."""

        self.entries = {}
        self.parent = parent
        if self.parent != None:
            self.parent.children.append(self)
        self.children = []

    def add(self, name, value):
        """Adds a symbol with the given value to the symbol table.
        The value is usually an AST node that represents the
        declaration or definition of a function/variable (e.g.,
        Declaration or FunctionDefn).

        Redefinition is only allowed when the existing entry is
        extern and has an identical type string; otherwise
        SymbolDefinedError/SymbolConflictError is raised."""

        if self.entries.has_key(name):
            if not self.entries[name].extern:
                raise Symtab.SymbolDefinedError()
            elif self.entries[name].type.get_string() != \
                 value.type.get_string():
                raise Symtab.SymbolConflictError()
        self.entries[name] = value

    def get(self, name):
        """Retrieves the symbol with the given name from the symbol
        table, recursing upwards through parent symbol tables if it is
        not found in the current one.  Returns None if the symbol
        does not exist in any enclosing scope."""

        if self.entries.has_key(name):
            return self.entries[name]
        else:
            if self.parent != None:
                return self.parent.get(name)
            else:
                return None

class SymtabVisitor(Visitor):
    """Visitor that creates and attaches symbol tables to the AST."""

    def push_symtab(self, node):
        """Pushes a new symbol table onto the visitor's symbol table
        stack and attaches this symbol table to the given node.  This
        is used whenever a new lexical scope is encountered, so the
        node is usually a CompoundStatement object."""

        self.curr_symtab = Symtab(self.curr_symtab)
        node.symtab = self.curr_symtab

    def pop_symtab(self):
        """Pops a symbol table off the visitor's symbol table stack.
        This is used whenever a new lexical scope is exited."""

        self.curr_symtab = self.curr_symtab.parent

    def vNode(self, node):
        # Default: nothing to do for nodes with no identifiers.
        pass

    def vArrayExpression(self, node):
        node.expr.accept(self)
        node.index.accept(self)

    def vFunctionExpression(self, node):
        node.function.accept(self)
        node.arglist.accept(self)

    def vId(self, node):
        # Resolve the identifier against the current scope chain and
        # link the Id node to its declaration.
        symbol = self.curr_symtab.get(node.name)
        if symbol != None:
            node.symbol = symbol
            node.symbol.is_used = 1
            node.set_has_address()
        else:
            self.error("Line %d: Unknown identifier '%s'." % (node.lineno, node.name))

    def vUnaryop(self, node):
        node.expr.accept(self)

    def vBinop(self, node):
        node.left.accept(self)
        node.right.accept(self)

    def vNodeList(self, node):
        self._visitList(node.nodes)

    def vParamList(self, node):
        # Assign a number to each parameter.  This will later be
        # useful for the code generation phase.
        #
        # TODO: might be best to just move this to the code
        # generation phase, since this doesn't have anything to
        # do with symbol table generation.
        param_num = 0
        for param in node.nodes:
            param.accept(self)
            param.param_num = param_num
            param_num += 1

    def vTranslationUnit(self, node):
        # The translation unit owns the root (global) symbol table.
        self.root_symtab = Symtab()
        self.curr_symtab = self.root_symtab
        self.vNodeList(node)
        node.symtab = self.root_symtab

    def vCompoundStatement(self, node):
        self.push_symtab(node)
        node.declaration_list.accept(self)
        node.statement_list.accept(self)
        self.pop_symtab()

    def _add_symbol(self, node):
        """Attempts to add a symbol for the given node to the current
        symbol table, catching any exceptions that occur and printing
        errors if necessary."""

        try:
            self.curr_symtab.add(node.name, node)
        except Symtab.SymbolDefinedError:
            self.error("Symbol '%s' already defined." % node.name)
        except Symtab.SymbolConflictError:
            self.error("Symbol '%s' has multiple differing declarations." % node.name)

    def vDeclaration(self, node):
        self._add_symbol(node)

    def vReturnStatement(self, node):
        node.expr.accept(self)

    def vFunctionType(self, node):
        node.params.accept(self)

    def vFunctionDefn(self, node):
        # The function name goes into the enclosing scope; its
        # parameters and body get a new nested scope.
        self._add_symbol(node)
        self.push_symtab(node)
        node.type.accept(self)
        node.body.accept(self)
        self.pop_symtab()

    def vIfStatement(self, node):
        node.expr.accept(self)
        node.then_stmt.accept(self)
        node.else_stmt.accept(self)

    def vWhileLoop(self, node):
        node.expr.accept(self)
        node.stmt.accept(self)

    def vForLoop(self, node):
        node.begin_stmt.accept(self)
        node.expr.accept(self)
        node.end_stmt.accept(self)
        node.stmt.accept(self)
# ---------------------------------------------------------------
# TYPE CHECKING
# ---------------------------------------------------------------

class TypeCheckVisitor(Visitor):
    """Visitor that performs type checking on the AST, attaching a
    Type object subclass to every eligible node and making sure these
    types don't conflict."""

    def _process_conditional(self, expr):
        """Does simple type checking for an expression that is
        supposed to be the expression for a conditional
        statement (e.g., the conditional clause of an if/then
        statement or a loop)."""

        if expr.type.get_outer_string() not in ['int', 'char']:
            self.error("Conditional expression doesn't evaluate to an int/char/etc.")

    def _coerce_consts(self, var1, var2):
        """Looks at two typed terminals to see if one of them
        is a constant integral.  If it is, then coerce it to
        the type of the other terminal.

        Note that both terminals cannot be constant integrals, or else
        they would have already been reduced to one node by the node's
        calculate() method in the parsing stage."""

        if var1.is_const():
            self._coerce_const(var1, var2.type)
        elif var2.is_const():
            self._coerce_const(var2, var1.type)

    def _coerce_const(self, var, type):
        """If the given typed terminal is a constant, coerces it to
        the given type."""

        if var.is_const() and type.get_string() in ['int', 'char']:
            var.type = type

    def _check_const_range(self, var, type):
        """Checks the given integral constant to make sure its value
        is within the bounds of the given type."""

        val = var.value
        # BUGFIX: this called the nonexistent type.get_outside_string()
        # (Type only defines get_outer_string()), which would raise an
        # AttributeError as soon as this method was used.
        type_str = type.get_outer_string()
        # TODO: implement this!
        if type_str == 'char':
            pass
        elif type_str == 'int':
            pass

    def _compare_types(self, name_str, from_type, to_type, raise_errors=1):
        """Compares the two types to see if it's possible to perform a
        binary operation on them.  If it is not, then the appropriate
        errors/warnings are raised, unless raise_errors is set to
        0, in which case the conflict level (0/WARNING/ERROR) is
        returned instead."""

        WARNING = 1
        ERROR = 2
        conflict = 0
        from_str = from_type.get_string()
        to_str = to_type.get_string()
        if (from_str != to_str):
            if from_str == 'char':
                if to_str == 'int':
                    # char -> int widening is always safe.
                    pass
                else:
                    conflict = ERROR
            elif from_str == 'int':
                if to_str == 'char':
                    # int -> char narrowing may lose data.
                    conflict = WARNING
                else:
                    conflict = ERROR
            else:
                conflict = ERROR
        if not raise_errors:
            return conflict
        if conflict == WARNING:
            self.warning("%s: Conversion from %s to %s may result in data loss." % (name_str, from_str, to_str))
        elif conflict == ERROR:
            self.error("%s: Cannot convert from %s to %s." % (name_str, from_str, to_str))

    def vNode(self, node):
        pass

    def vId(self, node):
        node.type = node.symbol.type

    def vNegative(self, node):
        node.expr.accept(self)
        node.type = node.expr.type
        # TODO: check to make sure expr is a signed type?

    def vAddrOf(self, node):
        node.expr.accept(self)
        if not node.expr.has_address():
            self.error("Address-of (&) target has no address!")
        else:
            node.expr.output_addr = 1
            node.type = cparse.PointerType(node.expr.type)

    def vPointer(self, node):
        node.expr.accept(self)
        if node.expr.type.get_outer_string() == 'pointer':
            node.type = node.expr.type.child
            node.set_has_address()
        else:
            self.error("Pointer dereference (*) target is not a pointer!")

    def vBinop(self, node):
        node.left.accept(self)
        node.right.accept(self)
        if node.op in cparse.Binop.ASSIGN_OPS:
            if not node.left.has_address():
                self.error("Invalid lvalue: not an address!")
            node.left.output_addr = 1
            self._coerce_const(node.right, node.left.type)
            # TODO: re-implement this!
            # elif node.left.symbol.is_constant:
            #     self.error("Invalid lvalue: lvalue is constant!")
            self._compare_types("Assignment", node.right.type, node.left.type)
            node.right.coerce_to_type = node.left.type
            node.type = node.left.type
        else:
            # TODO: not sure if this results in the ANSI C
            # specification for binary operand type coercion.
            self._coerce_consts(node.left, node.right)
            left_conflicts = self._compare_types("", node.right.type, node.left.type, raise_errors=0)
            right_conflicts = self._compare_types("", node.left.type, node.right.type, raise_errors=0)
            # Coerce in whichever direction produces the milder
            # conflict, then report on that direction.
            if left_conflicts < right_conflicts:
                from_node = node.right
                to_node = node.left
            else:
                from_node = node.left
                to_node = node.right
            self._compare_types("Binop '%s'" % node.op, from_node.type, to_node.type)
            from_node.coerce_to_type = to_node.type
            to_node.coerce_to_type = to_node.type
            node.type = to_node.type

    def vNodeList(self, node):
        self._visitList(node.nodes)

    def vCompoundStatement(self, node):
        node.statement_list.accept(self)

    def vReturnStatement(self, node):
        node.expr.accept(self)
        return_type = self.curr_func.type.get_return_type()
        self._coerce_const(node.expr, return_type)
        self._compare_types("Return expression", node.expr.type, return_type)
        node.expr.coerce_to_type = return_type

    def vArrayExpression(self, node):
        node.expr.accept(self)
        node.index.accept(self)
        if node.index.type.get_outer_string() not in ['int', 'char']:
            self.error("Array index is not an int or char!")
        elif node.expr.type.get_outer_string() != 'pointer':
            self.error("Array expression is not a pointer!")
        else:
            node.type = node.expr.type.child
            node.set_has_address()

    def vFunctionExpression(self, node):
        node.function.accept(self)
        if not node.function.type.is_function():
            self.error("Target of function expression is not a function!")
            # ROBUSTNESS FIX: bail out here -- the code below would
            # raise an AttributeError calling get_return_type() /
            # get_params() on a non-function type.
            return
        node.type = node.function.symbol.type.get_return_type()
        node.arglist.accept(self)
        params = node.function.symbol.type.get_params()
        num_args = len(node.arglist.nodes)
        num_params = len(params.nodes)
        if (not params.has_ellipsis) and (num_args > num_params):
            self.error("Too many arguments passed to function.")
        elif num_args < num_params:
            self.error("Too few arguments passed to function.")
        for arg, param in zip(node.arglist.nodes, params.nodes):
            self._coerce_const(arg, param.type)
            self._compare_types("Function call argument", arg.type, param.type)
            arg.coerce_to_type = param.type
        # If this function takes a variable number of args and
        # we've got more args than required parameters, we need
        # to set some of the extra arguments' field(s) properly.
        if (params.has_ellipsis) and (num_args > num_params):
            for arg in node.arglist.nodes[num_params:]:
                arg.coerce_to_type = arg.type

    def vFunctionDefn(self, node):
        # Remember the enclosing function so return statements can
        # check against its return type.
        self.curr_func = node
        node.body.accept(self)

    def vIfStatement(self, node):
        node.expr.accept(self)
        self._process_conditional(node.expr)
        node.then_stmt.accept(self)
        node.else_stmt.accept(self)

    def vWhileLoop(self, node):
        node.expr.accept(self)
        self._process_conditional(node.expr)
        node.stmt.accept(self)

    def vForLoop(self, node):
        node.begin_stmt.accept(self)
        node.expr.accept(self)
        self._process_conditional(node.expr)
        node.end_stmt.accept(self)
        node.stmt.accept(self)
This makes sure 620 | that functions return properly through all branches, that 621 | break/continue statements are only present within loops, and so 622 | forth.""" 623 | 624 | def vNode(self, node): 625 | node.has_return_stmt = 0 626 | 627 | def vStatementList(self, node): 628 | node.has_return_stmt = 0 629 | for stmt in node.nodes: 630 | if node.has_return_stmt: 631 | self.warning("Function %s has at least one unreachable statement." % self.curr_func.name) 632 | stmt.accept(self) 633 | if stmt.has_return_stmt: 634 | node.has_return_stmt = 1 635 | 636 | def vTranslationUnit(self, node): 637 | self._visitList(node.nodes) 638 | 639 | def vWhileLoop(self, node): 640 | old_in_loop = self.in_loop 641 | self.in_loop = 1 642 | node.stmt.accept(self) 643 | self.in_loop = old_in_loop 644 | node.has_return_stmt = node.stmt.has_return_stmt 645 | 646 | def vForLoop(self, node): 647 | self.vWhileLoop(node) 648 | 649 | def vBreakStatement(self, node): 650 | node.has_return_stmt = 0 651 | if not self.in_loop: 652 | self.error("Break statement outside of loop.") 653 | 654 | def vContinueStatement(self, node): 655 | node.has_return_stmt = 0 656 | if not self.in_loop: 657 | self.error("Continue statement outside of loop.") 658 | 659 | def vIfStatement(self, node): 660 | node.then_stmt.accept(self) 661 | node.else_stmt.accept(self) 662 | if node.then_stmt.has_return_stmt and node.else_stmt.has_return_stmt: 663 | node.has_return_stmt = 1 664 | else: 665 | node.has_return_stmt = 0 666 | 667 | def vFunctionDefn(self, node): 668 | self.curr_func = node 669 | self.in_loop = 0 670 | node.body.accept(self) 671 | if not node.body.has_return_stmt: 672 | self.warning("Function %s doesn't return through all branches." 
% node.name) 673 | 674 | def vReturnStatement(self, node): 675 | node.has_return_stmt = 1 676 | 677 | def vCompoundStatement(self, node): 678 | node.statement_list.accept(self) 679 | node.has_return_stmt = node.statement_list.has_return_stmt 680 | 681 | # --------------------------------------------------------------- 682 | # End of cvisitors.py 683 | # --------------------------------------------------------------- 684 | -------------------------------------------------------------------------------- /lex.py: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------------------------- 2 | # ply: lex.py 3 | # 4 | # Author: David M. Beazley (beazley@cs.uchicago.edu) 5 | # Department of Computer Science 6 | # University of Chicago 7 | # Chicago, IL 60637 8 | # 9 | # Copyright (C) 2001, David M. Beazley 10 | # 11 | # $Header: /cygdrive/c/prog/CVS/mini_c/lex.py,v 1.1.1.1 2004/05/27 06:40:38 varmaa Exp $ 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 26 | # 27 | # See the file COPYING for a complete copy of the LGPL. 
28 | # 29 | # 30 | # This module automatically constructs a lexical analysis module from regular 31 | # expression rules defined in a user-defined module. The idea is essentially the same 32 | # as that used in John Aycock's Spark framework, but the implementation works 33 | # at the module level rather than requiring the use of classes. 34 | # 35 | # This module tries to provide an interface that is closely modeled after 36 | # the traditional lex interface in Unix. It also differs from Spark 37 | # in that: 38 | # 39 | # - It provides more extensive error checking and reporting if 40 | # the user supplies a set of regular expressions that can't 41 | # be compiled or if there is any other kind of a problem in 42 | # the specification. 43 | # 44 | # - The interface is geared towards LALR(1) and LR(1) parser 45 | # generators. That is tokens are generated one at a time 46 | # rather than being generated in advanced all in one step. 47 | # 48 | # There are a few limitations of this module 49 | # 50 | # - The module interface makes it somewhat awkward to support more 51 | # than one lexer at a time. Although somewhat inelegant from a 52 | # design perspective, this is rarely a practical concern for 53 | # most compiler projects. 54 | # 55 | # - The lexer requires that the entire input text be read into 56 | # a string before scanning. I suppose that most machines have 57 | # enough memory to make this a minor issues, but it makes 58 | # the lexer somewhat difficult to use in interactive sessions 59 | # or with streaming data. 60 | # 61 | #----------------------------------------------------------------------------- 62 | 63 | r""" 64 | lex.py 65 | 66 | This module builds lex-like scanners based on regular expression rules. 
To use the module, simply write a collection of regular expression rules
and actions like this:

# lexer.py
import lex

# Define a list of valid tokens
tokens = (
    'IDENTIFIER', 'NUMBER', 'PLUS', 'MINUS'
    )

# Define tokens as functions
def t_IDENTIFIER(t):
    r' [a-zA-Z_](\w|_)* '
    return t

def t_NUMBER(t):
    r' \d+ '
    return t

# Some simple tokens with no actions
t_PLUS = r'\+'
t_MINUS = r'-'

# Initialize the lexer
lex.lex()

The tokens list is required and contains a complete list of all valid
token types that the lexer is allowed to produce.  Token types are
restricted to be valid identifiers.  This means that 'MINUS' is a valid
token type whereas '-' is not.

Rules are defined by writing a function with a name of the form
t_rulename.  Each rule must accept a single argument which is
a token object generated by the lexer. This token has the following
attributes:

    t.type   = type string of the token.  This is initially set to the
               name of the rule without the leading t_
    t.value  = The value of the lexeme.
    t.lineno = The value of the line number where the token was encountered

For example, the t_NUMBER() rule above might be called with the following:

    t.type  = 'NUMBER'
    t.value = '42'
    t.lineno = 3

Each rule returns the token object it would like to supply to the
parser.  In most cases, the token t is returned with few, if any
modifications.  To discard a token for things like whitespace or
comments, simply return nothing.  For instance:

def t_whitespace(t):
    r' \s+ '
    pass

For faster lexing, you can also define this in terms of the ignore set like this:

t_ignore = ' \t'

The characters in this string are ignored by the lexer.
Use of this feature can speed 129 | up parsing significantly since scanning will immediately proceed to the next token. 130 | 131 | lex requires that the token returned by each rule has an attribute 132 | t.type. Other than this, rules are free to return any kind of token 133 | object that they wish and may construct a new type of token object 134 | from the attributes of t (provided the new object has the required 135 | type attribute). 136 | 137 | If illegal characters are encountered, the scanner executes the 138 | function t_error(t) where t is a token representing the rest of the 139 | string that hasn't been matched. If this function isn't defined, a 140 | LexError exception is raised. The .text attribute of this exception 141 | object contains the part of the string that wasn't matched. 142 | 143 | The t.skip(n) method can be used to skip ahead n characters in the 144 | input stream. This is usually only used in the error handling rule. 145 | For instance, the following rule would print an error message and 146 | continue: 147 | 148 | def t_error(t): 149 | print "Illegal character in input %s" % t.value[0] 150 | t.skip(1) 151 | 152 | Of course, a nice scanner might wish to skip more than one character 153 | if the input looks very corrupted. 154 | 155 | The lex module defines a t.lineno attribute on each token that can be used 156 | to track the current line number in the input. The value of this 157 | variable is not modified by lex so it is up to your lexer module 158 | to correctly update its value depending on the lexical properties 159 | of the input language. To do this, you might write rules such as 160 | the following: 161 | 162 | def t_newline(t): 163 | r' \n+ ' 164 | t.lineno += t.value.count("\n") 165 | 166 | To initialize your lexer so that it can be used, simply call the lex.lex() 167 | function in your rule file. 
If there are any errors in your 168 | specification, warning messages or an exception will be generated to 169 | alert you to the problem. 170 | 171 | (dave: this needs to be rewritten) 172 | To use the newly constructed lexer from another module, simply do 173 | this: 174 | 175 | import lex 176 | import lexer 177 | plex.input("position = initial + rate*60") 178 | 179 | while 1: 180 | token = plex.token() # Get a token 181 | if not token: break # No more tokens 182 | ... do whatever ... 183 | 184 | Assuming that the module 'lexer' has initialized plex as shown 185 | above, parsing modules can safely import 'plex' without having 186 | to import the rule file or any additional imformation about the 187 | scanner you have defined. 188 | """ 189 | 190 | # ----------------------------------------------------------------------------- 191 | 192 | 193 | __version__ = "1.4" 194 | 195 | import re, types, sys, copy 196 | 197 | # Exception thrown when invalid token encountered and no default 198 | class LexError(Exception): 199 | def __init__(self,message,s): 200 | self.args = (message,) 201 | self.text = s 202 | 203 | # Token class 204 | class LexToken: 205 | def __str__(self): 206 | return "LexToken(%s,%r,%d)" % (self.type,self.value,self.lineno) 207 | def __repr__(self): 208 | return str(self) 209 | def skip(self,n): 210 | try: 211 | self._skipn += n 212 | except AttributeError: 213 | self._skipn = n 214 | 215 | # ----------------------------------------------------------------------------- 216 | # Lexer class 217 | # 218 | # input() - Store a new string in the lexer 219 | # token() - Get the next token 220 | # ----------------------------------------------------------------------------- 221 | 222 | class Lexer: 223 | def __init__(self): 224 | self.lexre = None # Master regular expression 225 | self.lexdata = None # Actual input data (as a string) 226 | self.lexpos = 0 # Current position in input text 227 | self.lexlen = 0 # Length of the input text 228 | self.lexindexfunc 
= [ ] # Reverse mapping of groups to functions and types 229 | self.lexerrorf = None # Error rule (if any) 230 | self.lextokens = None # List of valid tokens 231 | self.lexignore = None # Ignored characters 232 | self.lineno = 1 # Current line number 233 | self.debug = 0 # Debugging mode 234 | self.optimize = 0 # Optimized mode 235 | self.token = self.errtoken 236 | 237 | def __copy__(self): 238 | c = Lexer() 239 | c.lexre = self.lexre 240 | c.lexdata = self.lexdata 241 | c.lexpos = self.lexpos 242 | c.lexlen = self.lexlen 243 | c.lenindexfunc = self.lexindexfunc 244 | c.lexerrorf = self.lexerrorf 245 | c.lextokens = self.lextokens 246 | c.lexignore = self.lexignore 247 | c.lineno = self.lineno 248 | c.optimize = self.optimize 249 | c.token = c.realtoken 250 | 251 | # ------------------------------------------------------------ 252 | # input() - Push a new string into the lexer 253 | # ------------------------------------------------------------ 254 | def input(self,s): 255 | if not isinstance(s,types.StringType): 256 | raise ValueError, "Expected a string" 257 | self.lexdata = s 258 | self.lexpos = 0 259 | self.lexlen = len(s) 260 | self.token = self.realtoken 261 | 262 | # Change the token routine to point to realtoken() 263 | global token 264 | if token == self.errtoken: 265 | token = self.token 266 | 267 | # ------------------------------------------------------------ 268 | # errtoken() - Return error if token is called with no data 269 | # ------------------------------------------------------------ 270 | def errtoken(self): 271 | raise RuntimeError, "No input string given with input()" 272 | 273 | # ------------------------------------------------------------ 274 | # token() - Return the next token from the Lexer 275 | # 276 | # Note: This function has been carefully implemented to be as fast 277 | # as possible. 
Don't make changes unless you really know what 278 | # you are doing 279 | # ------------------------------------------------------------ 280 | def realtoken(self): 281 | # Make local copies of frequently referenced attributes 282 | lexpos = self.lexpos 283 | lexlen = self.lexlen 284 | lexignore = self.lexignore 285 | lexdata = self.lexdata 286 | 287 | while lexpos < lexlen: 288 | # This code provides some short-circuit code for whitespace, tabs, and other ignored characters 289 | if lexdata[lexpos] in lexignore: 290 | lexpos += 1 291 | continue 292 | 293 | # Look for a regular expression match 294 | m = self.lexre.match(lexdata,lexpos) 295 | if m: 296 | i = m.lastindex 297 | lexpos = m.end() 298 | tok = LexToken() 299 | tok.value = m.group() 300 | tok.lineno = self.lineno 301 | tok.lexer = self 302 | func,tok.type = self.lexindexfunc[i] 303 | if not func: 304 | self.lexpos = lexpos 305 | return tok 306 | 307 | # If token is processed by a function, call it 308 | self.lexpos = lexpos 309 | newtok = func(tok) 310 | self.lineno = tok.lineno # Update line number 311 | 312 | # Every function must return a token, if nothing, we just move to next token 313 | if not newtok: continue 314 | 315 | # Verify type of the token. If not in the token map, raise an error 316 | if not self.optimize: 317 | if not self.lextokens.has_key(newtok.type): 318 | raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( 319 | func.func_code.co_filename, func.func_code.co_firstlineno, 320 | func.__name__, newtok.type),lexdata[lexpos:]) 321 | 322 | return newtok 323 | 324 | # No match. Call t_error() if defined. 325 | if self.lexerrorf: 326 | tok = LexToken() 327 | tok.value = self.lexdata[lexpos:] 328 | tok.lineno = self.lineno 329 | tok.type = "error" 330 | tok.lexer = self 331 | oldpos = lexpos 332 | newtok = self.lexerrorf(tok) 333 | lexpos += getattr(tok,"_skipn",0) 334 | if oldpos == lexpos: 335 | # Error method didn't change text position at all. This is an error. 
336 | self.lexpos = lexpos 337 | raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) 338 | if not newtok: continue 339 | self.lexpos = lexpos 340 | return newtok 341 | 342 | self.lexpos = lexpos 343 | raise LexError, ("No match found", lexdata[lexpos:]) 344 | 345 | # No more input data 346 | self.lexpos = lexpos + 1 347 | return None 348 | 349 | 350 | # ----------------------------------------------------------------------------- 351 | # validate_file() 352 | # 353 | # This checks to see if there are duplicated t_rulename() functions or strings 354 | # in the parser input file. This is done using a simple regular expression 355 | # match on each line in the filename. 356 | # ----------------------------------------------------------------------------- 357 | 358 | def validate_file(filename): 359 | import os.path 360 | base,ext = os.path.splitext(filename) 361 | if ext != '.py': return 1 # No idea what the file is. Return OK 362 | 363 | try: 364 | f = open(filename) 365 | lines = f.readlines() 366 | f.close() 367 | except IOError: 368 | return 1 # Oh well 369 | 370 | fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(') 371 | sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=') 372 | counthash = { } 373 | linen = 1 374 | noerror = 1 375 | for l in lines: 376 | m = fre.match(l) 377 | if not m: 378 | m = sre.match(l) 379 | if m: 380 | name = m.group(1) 381 | prev = counthash.get(name) 382 | if not prev: 383 | counthash[name] = linen 384 | else: 385 | print "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev) 386 | noerror = 0 387 | linen += 1 388 | return noerror 389 | 390 | # ----------------------------------------------------------------------------- 391 | # _read_lextab(module) 392 | # 393 | # Reads lexer table from a lextab file instead of using introspection. 
# -----------------------------------------------------------------------------

def _read_lextab(lexer, fdict, module):
    """Populate 'lexer' from a previously generated lextab module,
    resolving rule-function names back to objects via fdict."""
    # NOTE(review): 'module' is interpolated into an exec statement --
    # must only ever be a trusted, caller-controlled module name.
    exec "import %s as lextab" % module
    lexer.lexre = re.compile(lextab._lexre, re.VERBOSE)
    lexer.lexindexfunc = lextab._lextab
    for i in range(len(lextab._lextab)):
        t = lexer.lexindexfunc[i]
        if t:
            if t[0]:
                # Table stores function *names*; swap in the live objects.
                lexer.lexindexfunc[i] = (fdict[t[0]],t[1])
    lexer.lextokens = lextab._lextokens
    lexer.lexignore = lextab._lexignore
    if lextab._lexerrorf:
        lexer.lexerrorf = fdict[lextab._lexerrorf]

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(module=None,debug=0,optimize=0,lextab="lextab"):
    ldict = None
    regex = ""
    error = 0
    files = { }
    lexer = Lexer()
    lexer.debug = debug
    lexer.optimize = optimize
    # lex() rebinds the module-level token()/input() convenience functions.
    global token,input

    if module:
        # User supplied a module object.
        if isinstance(module, types.ModuleType):
            ldict = module.__dict__
        elif isinstance(module, types.InstanceType):
            _items = [(k,getattr(module,k)) for k in dir(module)]
            ldict = { }
            for (i,v) in _items:
                ldict[i] = v
        else:
            raise ValueError,"Expected a module or instance"

    else:
        # No module given.  We might be able to get information from the caller.
        # Raise-and-catch is used purely to obtain a traceback whose frame
        # chain leads back to the caller's globals.
        try:
            raise RuntimeError
        except RuntimeError:
            e,b,t = sys.exc_info()
            f = t.tb_frame
            f = f.f_back           # Walk out to our calling function
            ldict = f.f_globals    # Grab its globals dictionary

    if optimize and lextab:
        # Fast path: reuse a previously written lextab module if present.
        try:
            _read_lextab(lexer,ldict, lextab)
            if not lexer.lexignore: lexer.lexignore = ""
            token = lexer.token
            input = lexer.input
            return lexer

        except ImportError:
            pass

    # Get the tokens map
    if (module and isinstance(module,types.InstanceType)):
        tokens = getattr(module,"tokens",None)
    else:
        try:
            tokens = ldict["tokens"]
        except KeyError:
            tokens = None

    if not tokens:
        raise SyntaxError,"lex: module does not define 'tokens'"
    if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)):
        raise SyntaxError,"lex: tokens must be a list or tuple."

    # Build a dictionary of valid token names
    lexer.lextokens = { }
    if not optimize:

        # Utility function for verifying tokens
        def is_identifier(s):
            for c in s:
                if not (c.isalnum() or c == '_'): return 0
            return 1

        for n in tokens:
            if not is_identifier(n):
                print "lex: Bad token name '%s'" % n
                error = 1
            if lexer.lextokens.has_key(n):
                print "lex: Warning. Token '%s' multiply defined." % n
            lexer.lextokens[n] = None
    else:
        for n in tokens: lexer.lextokens[n] = None


    if debug:
        print "lex: tokens = '%s'" % lexer.lextokens.keys()

    # Get a list of symbols with the t_ prefix
    tsymbols = [f for f in ldict.keys() if f[:2] == 't_']

    # Now build up a list of functions and a list of strings
    fsymbols = [ ]
    ssymbols = [ ]
    for f in tsymbols:
        if callable(ldict[f]):
            fsymbols.append(ldict[f])
        elif isinstance(ldict[f], types.StringType):
            ssymbols.append((f,ldict[f]))
        else:
            print "lex: %s not defined as a function or string" % f
            error = 1

    # Sort the functions by line number
    fsymbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno))

    # Sort the strings by regular expression length
    # (descending, so longer patterns are tried first by the master regex).
    ssymbols.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))

    # Check for non-empty symbols
    if len(fsymbols) == 0 and len(ssymbols) == 0:
        raise SyntaxError,"lex: no rules of the form t_rulename are defined."

    # Add all of the rules defined with actions first
    for f in fsymbols:

        line = f.func_code.co_firstlineno
        file = f.func_code.co_filename
        files[file] = None

        ismethod = isinstance(f, types.MethodType)

        if not optimize:
            nargs = f.func_code.co_argcount
            if ismethod:
                reqargs = 2
            else:
                reqargs = 1
            if nargs > reqargs:
                print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__)
                error = 1
                continue

            if nargs < reqargs:
                print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__)
                error = 1
                continue

            if f.__name__ == 't_ignore':
                print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__)
                error = 1
                continue

        if f.__name__ == 't_error':
            lexer.lexerrorf = f
            continue

        if f.__doc__:
            if not optimize:
                try:
                    c = re.compile(f.__doc__, re.VERBOSE)
                except re.error,e:
                    print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e)
                    error = 1
                    continue

                if debug:
                    print "lex: Adding rule %s -> '%s'" % (f.__name__,f.__doc__)

            # Okay. The regular expression seemed okay.  Let's append it to the master regular
            # expression we're building

            if (regex): regex += "|"
            regex += "(?P<%s>%s)" % (f.__name__,f.__doc__)
        else:
            print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__)

    # Now add all of the simple rules
    for name,r in ssymbols:

        if name == 't_ignore':
            lexer.lexignore = r
            continue

        if not optimize:
            if name == 't_error':
                raise SyntaxError,"lex: Rule 't_error' must be defined as a function"
                # NOTE(review): the two statements below are unreachable
                # (they follow an unconditional raise) -- dead code kept
                # as-is from the original.
                error = 1
                continue

            if not lexer.lextokens.has_key(name[2:]):
                print "lex: Rule '%s' defined for an unspecified token %s." % (name,name[2:])
                error = 1
                continue
            try:
                c = re.compile(r,re.VERBOSE)
            except re.error,e:
                print "lex: Invalid regular expression for rule '%s'. %s" % (name,e)
                error = 1
                continue
            if debug:
                print "lex: Adding rule %s -> '%s'" % (name,r)

        if regex: regex += "|"
        regex += "(?P<%s>%s)" % (name,r)

    if not optimize:
        for f in files.keys():
            if not validate_file(f):
                error = 1
    try:
        if debug:
            print "lex: regex = '%s'" % regex
        lexer.lexre = re.compile(regex, re.VERBOSE)

        # Build the index to function map for the matching engine
        lexer.lexindexfunc = [ None ] * (max(lexer.lexre.groupindex.values())+1)
        for f,i in lexer.lexre.groupindex.items():
            handle = ldict[f]
            if type(handle) in (types.FunctionType, types.MethodType):
                lexer.lexindexfunc[i] = (handle,handle.__name__[2:])
            else:
                # If rule was specified as a string, we build an anonymous
                # callback function to carry out the action
                lexer.lexindexfunc[i] = (None,f[2:])

        # If a lextab was specified, we create a file containing the precomputed
        # regular expression and index table

        if lextab and optimize:
            lt = open(lextab+".py","w")
            lt.write("# %s.py.  This file automatically created by PLY. Don't edit.\n" % lextab)
            lt.write("_lexre = %s\n" % repr(regex))
            lt.write("_lextab = [\n");
            for i in range(0,len(lexer.lexindexfunc)):
                t = lexer.lexindexfunc[i]
                if t:
                    if t[0]:
                        lt.write("  ('%s',%s),\n"% (t[0].__name__, repr(t[1])))
                    else:
                        lt.write("  (None,%s),\n" % repr(t[1]))
                else:
                    lt.write("  None,\n")

            lt.write("]\n");
            lt.write("_lextokens = %s\n" % repr(lexer.lextokens))
            lt.write("_lexignore = %s\n" % repr(lexer.lexignore))
            if (lexer.lexerrorf):
                lt.write("_lexerrorf = %s\n" % repr(lexer.lexerrorf.__name__))
            else:
                lt.write("_lexerrorf = None\n")
            lt.close()

    except re.error,e:
        print "lex: Fatal error. Unable to compile regular expression rules. %s" % e
        error = 1
    if error:
        raise SyntaxError,"lex: Unable to build lexer."
    if not lexer.lexerrorf:
        print "lex: Warning. no t_error rule is defined."

    if not lexer.lexignore: lexer.lexignore = ""

    # Create global versions of the token() and input() functions
    token = lexer.token
    input = lexer.input

    return lexer

# -----------------------------------------------------------------------------
# run()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------

def runmain(lexer=None,data=None):
    """Drive the lexer over a file (argv[1]) or stdin and print tokens."""
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            print "Reading from standard input (type EOF to end):"
            data = sys.stdin.read()

    # Prefer an explicitly supplied lexer; otherwise fall back to the
    # module-level input()/token() bound by the last lex() call.
    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok: break
        print "(%s,'%s',%d)" % (tok.type, tok.value, tok.lineno)

# ---------------------------------------------------------------
# cx86.py
#
# Atul Varma
# Python C Compiler - Intel x86 Code Generator
# $Id: cx86.py,v 1.3 2004/06/02 21:05:23 varmaa Exp $
# ---------------------------------------------------------------

import cparse
from cvisitors import Visitor

# ---------------------------------------------------------------
# CONSTANTS
# ---------------------------------------------------------------

# Size of the 'int' type.
INT_SIZE = 4

# Size of the 'char' type.
CHAR_SIZE = 1

# The machine's word size.  Note that making this different
# from INT_SIZE may cause serious problems.
WORD_SIZE = 4

# This is a strange multiplier that needs to be used in the allocation
# of global variables for the GNU Assembler.  Not sure exactly what it
# represents.
WEIRD_MULTIPLIER = 4

# ---------------------------------------------------------------
# STACK MACHINE ABSTRACTION
# ---------------------------------------------------------------

class x86Registers:
    """This class attempts to abstract the x86 registers into a stack
    machine.  Calling push() gives you a register that isn't currently
    in use by the stack machine, pop() gives you a register with the
    value of the most recently pushed element.

    Through this method the stack machine can be used to compute
    values the same way a reverse polish notation (RPN) calculator
    does.

    When push() and pop() are called, it may be the case that no
    registers are currently available; if this happens, the least
    recently used register is 'spilled' into a temporary local
    variable on the process' stack and freed for use.  Note that the
    process' stack is not to be confused with this stack machine
    abstraction--the two are completely different entities.

    Currently, push() and pop() also implement a little bit of
    implicit type conversion, so they take as parameters a cparse.Type
    object; currently conversion is done between char and int types,
    so depending on the pushed and popped types, some type conversion
    assembly code may be generated.

    Finally, an additional method, done(), should be called whenever
    the stack machine is done popping values for the current
    operation.  This is because when pop is called, the returned
    register is not immediately made 'free' for another call to pop or
    push.  If this were the case, then the following situation could
    occur:

        rightOp.calc()     # calc val of right op, put on stack
        leftOp.calc()      # calc val of left op, put on stack
        l = leftOp.pop()   # pop left val from stack
        r = rightOp.pop()  # pop right val from stack
        output('addl %s, %s' % (r, l))

    The problem with this approach is that we don't know how many
    registers will be used by leftOp's calc() method--it may use all
    the remaining registers, in which case the value that rightOp's
    calc() method put on the stack is no longer stored in a register.
    If leftOp.pop() returned register %eax and immediately marked the
    %eax register as being 'free for use', then the call to
    rightOp.pop() could very well generate code that moves rightOp's
    value from a temporary variable into %eax, thereby overwriting
    leftOp's value!

    So, instead, the pop() method places the %eax register (in this
    example) into an internal list of 'almost free' registers;
    registers that have just been returned by pop() but shouldn't be
    used by the stack machine until a call to done() is made.  The
    done() method simply moves the registers in the 'almost free' list
    over to the 'free' list."""

    def __init__(self, parent, base_fp):
        # A list of all registers on the machine.
        self.all_regs = ['%ebx','%esi','%edi','%eax','%ecx','%edx']

        # A list of the registers currently free.  Note that this
        # is a *copy* of the list of all registers on the machine.
        self.regs_free = self.all_regs[:]

        # A list of all the registers that are "almost" free
        # (see the docstring for this class).
        self.regs_almost_free = []

        # A list of all the temporary variable memory locations
        # that are currently unused.
        self.mem_free = []

        # A list corresponding to the actual stack of the stack
        # machine.  The item at the top of the stack is the
        # last element of this list.
        self.stack = []

        # A list that stores the Type objects of each corresponding
        # element on the stack machine's stack.  e.g., type_stack[0]
        # represents the type of the element at stack[0].
        self.type_stack = []

        # The location of the next memory location to be used for
        # temporary variables, relative to the current function's
        # frame pointer.
        self.next_temp = base_fp - WORD_SIZE

        # The parent CodeGenVisitor object of this stack machine.
        self.parent = parent

        # A list of the callee-save registers that have been used
        # so far by this function.  Once processing is finished,
        # these registers will be pushed onto the process' stack
        # at the beginning of the function and popped off just
        # before the function terminates.
        self.callee_save_regs_used = []

        # A list of the caller-save registers on the machine.
        self.caller_save_regs = ['%eax', '%ecx', '%edx']

        # A list of the callee-save registers on the machine.
        self.callee_save_regs = ['%ebx', '%esi', '%edi']

        # A list of the registers on the machine that have
        # sub-registers allowing access to their low-order bytes.
        self.byte_compat_regs = ['%eax', '%ebx', '%ecx', '%edx']

        # The default type of an element that is pushed onto
        # the stack machine without a 'type' object passed.
        self.default_type = cparse.BaseType('int')

    def o(self, str, comment=None):
        """Wrapper for the parent CodeGenVisitor's o() method."""

        self.parent.o(str, comment)

    def save_caller_saves(self):
        """Saves the caller-save registers, which should be done
        before the current function makes a function call, so that
        the registers don't get corrupted by the called function.

        Normally, this is done by pushing the caller-save registers
        onto the stack just before the function call is made and
        popping them off afterwards; however, due to the workings of
        this particular stack machine it's much easier to just move
        the contents of the caller-save registers, if they are
        currently being used, into temporary variables."""

        for reg in self.caller_save_regs:
            if reg not in self.regs_free:
                self._copy_reg_to_temp([reg],
                                       "Save caller-save register to temp")
                self.regs_free.append(reg)

    def save_callee_saves(self):
        """Emits code that pushes the callee-save registers used by
        the stack machine onto the process' stack."""

        for reg in self.callee_save_regs_used:
            self.o("    pushl %s" % reg,
                   "Save callee-save register")

    def load_callee_saves(self):
        """Emits code that pops the callee-save registers used by
        the stack machine off the process' stack."""

        for reg in self.callee_save_regs_used:
            self.o("    popl %s" % reg,
                   "Restore callee-save register")

    def _copy_reg_to_temp(self, valid_regs, comment_str=None):
        """Copy the least recently used register on the stack into a
        temporary variable.  The register must be in the valid_regs
        list."""

        # if no free temp variables exist,
        # create a new one.
        # NOTE(review): the body of this method continues past the end of
        # this chunk of the file dump; the remainder is not visible here.
189 | if len(self.mem_free) == 0: 190 | self.mem_free.append("%d(%%ebp)" % self.next_temp) 191 | self.next_temp -= WORD_SIZE 192 | 193 | # get an unused temp var 194 | mem = self.mem_free.pop() 195 | 196 | # find the least recently used register on the stack 197 | reg = None 198 | index = 0 199 | for i in self.stack: 200 | if i in valid_regs: 201 | reg = i 202 | break 203 | index += 1 204 | if reg == None: 205 | raise Exception("No free registers inside OR outside of stack!") 206 | 207 | # emit code to copy the register to the memory location. 208 | if comment_str == None: 209 | comment_str = "Stack machine: copy register to temp" 210 | self.o(" movl %s, %s" % (reg, mem), 211 | comment_str) 212 | 213 | # Modify the element's stack machine position to reflect 214 | # its new location. 215 | self.stack[index] = mem 216 | return reg 217 | 218 | def _get_free_reg(self, valid_regs, preferred_reg=None): 219 | """Returns a free register that is in the valid_regs list. If 220 | no registers are available, the most least-recently used 221 | eligible one is freed (by moving its contents to a temporary 222 | variable) and returned.""" 223 | 224 | # If we have a register free, return it. 225 | if len(self.regs_free) > 0: 226 | reg = None 227 | if preferred_reg != None and preferred_reg in self.regs_free: 228 | reg = preferred_reg 229 | else: 230 | for r in self.regs_free: 231 | if r in valid_regs: 232 | reg = r 233 | if reg != None: 234 | self.regs_free.remove(reg) 235 | # If this register is a callee-save register that 236 | # we haven't used before, add it to our list 237 | # of used callee-save registers. 238 | if reg in self.callee_save_regs and reg not in self.callee_save_regs_used: 239 | self.callee_save_regs_used.append(reg) 240 | return reg 241 | # copy a register into a temp var and return the register. 
242 | return self._copy_reg_to_temp(valid_regs) 243 | 244 | def _get_type_valid_regs(self, type): 245 | """Returns the valid registers that an element of the given 246 | type can occupy. For instance, 8-bit chars should only be 247 | placed in %eax/%ebx/%ecx/%edx because these are the only 248 | registers with low-order byte sub-registers 249 | (%al/%bl/%cl/%dl).""" 250 | 251 | type_str = type.get_outer_string() 252 | if type_str == 'char': 253 | return self.byte_compat_regs 254 | elif type_str in ['int', 'pointer']: 255 | return self.all_regs 256 | 257 | def push(self, type=None, preferred_reg=None, valid_regs=None): 258 | """Finds a free eligible register (or frees one if all are 259 | being used) and returns it, pushing the register onto the 260 | stack machine's stack. 261 | 262 | This method associates the stack entry with the given Type 263 | object; if none is supplied, then an 'int' type is used 264 | by default. 265 | 266 | If preferred_reg is passed, this function will try its 267 | best to return preferred_reg, if it's available.""" 268 | 269 | if type == None: 270 | type = self.default_type 271 | self.type_stack.append(type) 272 | if valid_regs == None: 273 | valid_regs = self._get_type_valid_regs(type) 274 | reg = self._get_free_reg(valid_regs, preferred_reg) 275 | self.stack.append(reg) 276 | return reg 277 | 278 | def _coerce_type(self, curr_reg, from_type, to_type): 279 | """Attempts to coerce the element in the current register 280 | from the given type to the given type.""" 281 | 282 | from_str = from_type.get_outer_string() 283 | to_str = to_type.get_outer_string() 284 | comment_str = "Implicit cast: %s -> %s" % (from_str, to_str) 285 | if from_str == to_str: 286 | return curr_reg 287 | if from_str == 'char': 288 | if to_str == 'int': 289 | return curr_reg 290 | elif from_str == 'int': 291 | if to_str == 'char': 292 | self.o(" movzbl %s, %s" % (self.lo(curr_reg), 293 | curr_reg), 294 | comment_str) 295 | return curr_reg 296 | 297 | def pop(self, 
    def _pop(self, valid_regs):
        """Pops the top element of the stack into a free register
        that is also in valid_regs and returns the register name.  If
        no registers are free, the least recently used one is first
        copied into a temporary variable and then used."""

        loc = self.stack.pop()

        # If the top of the stack is already an eligible register,
        # just park it on the almost-free list and return it.
        if loc in valid_regs:
            self.regs_almost_free.append(loc)
            return loc

        # Otherwise, copy the temp variable (or ineligible register)
        # at the top of the stack into a free eligible register,
        # possibly spilling something else to make room.
        reg = self._get_free_reg(valid_regs)
        self.o("        movl %s, %s" % (loc, reg),
               "Stack machine: copy temp to register")

        # If our old location was a register (just not an eligible
        # one), it is now free for use.
        if loc in self.all_regs:
            self.regs_free.append(loc)

        self.regs_almost_free.append(reg)
        return reg

    def peek(self):
        """Returns the top element of the stack without popping it.
        Note that this is not guaranteed to be a register; it could
        be a memory location!"""

        return self.stack[-1]

    def is_empty(self):
        """Returns whether the stack machine is empty."""

        return len(self.stack) == 0

    def done(self):
        """Frees all registers that are marked as being in
        intermediate use (i.e., have been pop()'d)."""

        self.regs_free.extend(self.regs_almost_free)
        self.regs_almost_free = []

    def get_max_fp(self):
        """Returns the lowest frame-pointer-relative offset the stack
        machine has used for temporary spill slots (<= 0)."""

        # next_temp points at the *next* slot to allocate, so the
        # deepest slot actually used is one WORD_SIZE above it.
        return self.next_temp + WORD_SIZE

    def lo(self, reg):
        """Returns the low-order byte sub-register of the given
        register, e.g. lo('%eax') == '%al'.  Immediates ('$...') pass
        through unchanged; a non-byte-compatible register raises.

        Raises an Exception for registers outside byte_compat_regs."""

        if reg[0] == '$':
            return reg
        if reg not in self.byte_compat_regs:
            raise Exception("Register %s is not byte-compatible!" % reg)
        # '%eax'[2] == 'a'  ->  '%al'
        return '%' + reg[2] + 'l'

    def force_type_change(self, type):
        """Forces a type change of the top element of the stack."""

        self.type_stack[-1] = type

# ---------------------------------------------------------------
#  CODE GENERATOR
# ---------------------------------------------------------------

class CodeGenVisitor(Visitor):
    """Visitor that generates x86 assembly code for the abstract
    syntax tree."""

    def __init__(self, file, show_comments=0):
        """Constructor.  'file' is the file object to output the
        resulting code to.  If 'show_comments' is true, then
        annotated comments are produced for the generated assembly
        code."""

        Visitor.__init__(self)

        # The current jump-label number (emitted as '.L0', '.L1'...).
        self.__label = 0

        # Current label number for generating string literal labels.
        self.__str_literal_label = 0

        # Accumulated assembly for string literal definitions,
        # appended to the output at the end of the translation unit.
        self.__str_literal_str = ""

        # Whether we should show comments or not.
        self.show_comments = show_comments

        # The file we're outputting the generated code to.
        self.file = file

        # Maps binary operators to their assembly instructions.
        # Some entries are just the 'base' instruction and get a
        # size suffix appended later ('addl' for 32-bit ints,
        # 'addb' for 8-bit ints, etc.); the code adds the
        # appropriate suffixes on its own.
        self.binop_instructions = \
            { '==' : 'sete',
              '!=' : 'setne',
              '>=' : 'setge',
              '<=' : 'setle',
              '>'  : 'setg',
              '<'  : 'setl',
              '+'  : 'add',
              '-'  : 'sub',
              '*'  : 'imul',
              '='  : 'mov'
              }

        # Windows' C linkage prepends a '_' before symbol names,
        # whereas Unix doesn't.  This is particularly critical if
        # the source file links to external libraries that we're
        # not compiling.  Figure out which one to use here.
        import sys
        if sys.platform == 'win32':
            self.symbol_prepend = "_"
        else:
            self.symbol_prepend = ""

    def new_label(self):
        """Generate a new jump label and return it."""

        label = ".L%d" % self.__label
        self.__label += 1
        return label

    def o(self, str, comment=None):
        """Output a line of assembly code to the output buffer, with
        an optional annotated comment (if comments are enabled).
        Empty strings are suppressed unless carrying a comment."""

        if self.show_comments and comment != None:
            comment = "# %s" % comment
            self.curr_str += "%-35s %s\n" % (str, comment)
        else:
            if str == "":
                return
            self.curr_str += str + "\n"

    def c(self, str, indent_amt=2):
        """Output a single-line comment to the output buffer, if
        comments are enabled."""

        indent = " " * indent_amt

        if self.show_comments:
            self.o("\n%s# %s\n" % (indent, str))

    def vNodeList(self, node):
        # Generic node list: just visit each child in order.
        self._visitList(node.nodes)

    def _empty_stack(self, node):
        """Pops the top value from the stack machine's stack and
        discards it.  Used when a statement is also an expression
        (e.g. 'a = b + 1;') whose return value was pushed but has no
        consumer.

        Raises an Exception if the stack is still non-empty after
        the discard (an internal invariant violation)."""

        # If the statement was also an expression, its return value
        # is still on the stack; throw it away.
        if not self.stack.is_empty():
            self.stack.pop(node.type)
            self.stack.done()
        if not self.stack.is_empty():
            raise Exception("PANIC! Register stack isn't empty!")
    def _accept_and_empty_stack(self, node):
        """Visit the node and then empty the stack machine of the
        node's return value, if one exists."""

        node.accept(self)
        self._empty_stack(node)

    def vStatementList(self, node):
        for n in node.nodes:
            self._accept_and_empty_stack(n)

    def _generate_global_variable_definitions(self, node):
        """Generate and return the global variable definition block
        (as a string).  Also assigns each global symbol its
        compile_loc (its linker name).  Functions and extern symbols
        get a name but no storage here."""

        globals_str = ".global_vars:\n"
        for symbol in node.symtab.entries.values():
            symbol.compile_loc = self.symbol_prepend + symbol.name
            if not symbol.type.is_function() and not symbol.extern:
                # NOTE: storage is over-allocated by WEIRD_MULTIPLIER
                # (see its definition) for the GNU Assembler.
                globals_str += "        .comm %s,%d\n" % \
                               (symbol.compile_loc, \
                                self._calc_var_size(symbol.type)*WEIRD_MULTIPLIER)
        return globals_str

    def vTranslationUnit(self, node):
        """Outputs the entire assembly source file."""

        self.curr_str = ""
        self.o("# Generated by c.py")
        self.o("# Atul Varma (Spring 2004)\n")
        self.o("        .text")

        # Assign global compile_locs *before* generating code that
        # references them.
        globals_str = self._generate_global_variable_definitions(node)

        # Generate the main code.
        self._visitList(node.nodes)

        # Append global variable definitions.
        self.o(globals_str)

        # Append string literal definitions.
        self.o(self.__str_literal_str)

        # Output the entire file.
        self.file.write(self.curr_str)

    def _calc_var_size(self, type):
        """Calculate and return the size of the given type, in
        bytes."""

        type_str = type.get_outer_string()
        if type_str == "int":
            return INT_SIZE
        elif type_str == "char":
            return CHAR_SIZE
        elif type_str == "pointer":
            return WORD_SIZE
        else:
            self.error("Unknown type: %s" % type_str)

    def _calc_var_align(self, type):
        """Calculate and return the alignment of the given type,
        in bytes.  (Alignment equals size for all supported types.)"""

        return self._calc_var_size(type)

    def _calc_function_var_addrs(self, symtab, last_fp_loc):
        """Calculate the addresses of all local variables in the
        function and attach them to their respective symbols in the
        function's symbol table(s).  Returns the lowest (most
        negative) frame-pointer offset used."""

        self._calc_function_arg_addrs(symtab)
        # children[0] is the symbol table of the function body's
        # outermost scope.
        return self._calc_local_var_addrs(symtab.children[0], last_fp_loc)

    def _calc_function_arg_addrs(self, symtab):
        """Calculate the addresses of all the arguments passed to
        the function (positive %ebp offsets past the saved %ebp and
        the return address)."""

        for symbol in symtab.entries.values():
            symbol.compile_loc = "%d(%%ebp)" % (WORD_SIZE*2+(symbol.param_num*WORD_SIZE))
            if not symbol.is_used:
                self.warning("function argument '%s' is never used." % symbol.name)

    def _calc_local_var_addrs(self, symtab, last_fp_loc):
        """Calculate the locations of all local variables defined in
        the function's body and all nested scopes therein, assigning
        each symbol a negative %ebp offset.

        This assumes a 'worst-case' scenario where all branches and
        nested scopes execute, so space for all locals is allocated
        up-front; however, sibling scopes that cannot coexist (e.g.
        the two arms of an if/else) overlap in memory.

        Returns the lowest (most negative) offset used, rounded down
        to keep the stack word-aligned."""

        for symbol in symtab.entries.values():
            if symbol.extern:
                # Externs live at their linker name, not on the stack.
                symbol.compile_loc = self.symbol_prepend + symbol.name
                continue
            last_fp_loc -= self._calc_var_size(symbol.type)

            # Adjust the location downward for alignment.
            align = self._calc_var_align(symbol.type)
            bytes_overboard = (-last_fp_loc) % align
            if bytes_overboard != 0:
                last_fp_loc -= (align - bytes_overboard)

            symbol.compile_loc = "%d(%%ebp)" % last_fp_loc
            if not symbol.is_used:
                self.warning("local variable '%s' is never used." % symbol.name)

        # Recurse into child scopes; each starts at this scope's
        # floor, and we keep the deepest extent of any of them.
        max_last_fp = last_fp_loc
        for kid in symtab.children:
            curr_last_fp = self._calc_local_var_addrs(kid, last_fp_loc)
            if curr_last_fp < max_last_fp:
                max_last_fp = curr_last_fp

        # Keep the stack aligned on a word-sized boundary.
        align = self._calc_var_align(cparse.PointerType())
        bytes_overboard = (-max_last_fp) % align
        if bytes_overboard != 0:
            max_last_fp -= (align - bytes_overboard)

        return max_last_fp
    def _fill_line(self, str, width=70):
        """Pads a string to the given width with the '-' character,
        separated by a single space."""

        extra = "-" * (width-1-len(str))
        return str + " " + extra

    def vFunctionDefn(self, node):
        """Output the assembly code for a function definition:
        prologue, body, and epilogue.

        The body is generated first into a *separate* string so that
        the frame-allocation instruction (whose size is only known
        after the body has used the stack machine) and the
        callee-save pushes can be emitted before it."""

        self.break_labels = []
        self.continue_labels = []
        self.curr_func_end_label = self.new_label() + "_function_end"

        # Base size of the stack frame (not including space for the
        # stack machine's temporary spill slots).
        stack_frame_size = self._calc_function_var_addrs(node.symtab, 0)

        line = self._fill_line("BEGIN FUNCTION: %s()" % node.name)
        self.c("%s\n"
               "#\n"
               "# Function type: %s" %
               (line, node.type.get_string()), 0)

        # Only non-static functions are visible to the linker.
        if not node.static:
            self.o("        .global %s" % node.compile_loc)
        self.o("%s:" % node.compile_loc)
        self.o("        pushl %ebp", "Save old frame pointer")
        self.o("        movl %esp, %ebp", "Set new frame pointer")

        # Create a new stack machine for this function.
        self.stack = x86Registers(self, stack_frame_size)

        # Generate the body into a separate string (see docstring).
        old_str = self.curr_str
        self.curr_str = ""

        node.body.accept(self)

        function_str = self.curr_str
        self.curr_str = old_str

        # Final frame size, now including the stack machine's temps;
        # get_max_fp() is <= 0, hence the negation.
        if self.stack.get_max_fp() != 0:
            self.o("        subl $%d, %%esp" % (-self.stack.get_max_fp()),
                   "Allocate space for local+temp vars")

        # Save any callee-save registers that may have been used.
        self.stack.save_callee_saves()

        # Splice in the previously-generated body.
        self.curr_str += function_str

        self.o("%s:" % self.curr_func_end_label)

        # Restore any callee-save registers that may have been used.
        self.stack.load_callee_saves()
        self.o("        movl %ebp, %esp", "Deallocate stack frame")
        self.o("        popl %ebp", "Restore old stack frame")
        self.o("        ret\n")

        line = self._fill_line("END FUNCTION: %s()" % node.name)
        self.c(line, 0)

    def vCompoundStatement(self, node):
        node.statement_list.accept(self)

    def vIfStatement(self, node):
        """Emit an if/else: test the condition, jump to the else
        clause (or past everything) when zero."""

        done_label = self.new_label() + "_done"
        if not node.else_stmt.is_null():
            else_label = self.new_label() + "_else"
        else:
            # No else clause: a false condition jumps straight past.
            else_label = done_label

        self.c("IF statment - begin")

        node.expr.accept(self)
        comparer = self.stack.pop()
        self.stack.done()
        self.o("        testl %s, %s" % (comparer, comparer), "Test the result")
        self.o("        jz %s" % else_label,
               "If result is zero, jump to else clause")
        self.c("IF statment - THEN clause - begin")
        self._accept_and_empty_stack(node.then_stmt)
        self.c("IF statment - THEN clause - end")
        self.o("        jmp %s" % done_label)
        if not node.else_stmt.is_null():
            self.c("IF statment - ELSE clause - begin")
            self.o("%s:" % else_label)
            self._accept_and_empty_stack(node.else_stmt)
            self.c("IF statment - ELSE clause - end")
        self.o("%s:" % done_label)

        self.c("IF statment - end")

    def _push_loop_labels(self, break_label, continue_label):
        """Pushes new values of labels to jump to for 'break' and
        'continue' statements (loops may nest)."""

        self.break_labels.append(break_label)
        self.continue_labels.append(continue_label)

    def _pop_loop_labels(self):
        """Restores old values of labels to jump to for 'break' and
        'continue' statements."""

        self.break_labels.pop()
        self.continue_labels.pop()

    def vWhileLoop(self, node):
        """Emit a while loop: test at the top, jump out when the
        condition is zero, jump back after the body."""

        test_label = self.new_label() + "_test"
        done_label = self.new_label() + "_done"

        # 'continue' re-tests the condition; 'break' exits the loop.
        self._push_loop_labels(break_label=done_label,
                               continue_label=test_label)

        self.c("WHILE loop - begin")

        self.o("%s:" % test_label)
        node.expr.accept(self)

        comparer = self.stack.pop()
        self.stack.done()
        self.o("        testl %s, %s" % (comparer, comparer), "Test the result")
        self.o("        jz %s" % done_label,
               "If result is zero, leave while loop")
        self._accept_and_empty_stack(node.stmt)
        self.o("        jmp %s" % test_label, "Jump to start of while loop")
        self.o("%s:" % done_label)

        self.c("WHILE loop - end")

        self._pop_loop_labels()
self._push_loop_labels(break_label=done_label, 758 | continue_label=test_label) 759 | 760 | self.c("WHILE loop - begin") 761 | 762 | self.o("%s:" % test_label) 763 | node.expr.accept(self) 764 | 765 | comparer = self.stack.pop() 766 | self.stack.done() 767 | self.o(" testl %s, %s" % (comparer, comparer), "Test the result") 768 | self.o(" jz %s" % done_label, 769 | "If result is zero, leave while loop") 770 | self._accept_and_empty_stack(node.stmt) 771 | self.o(" jmp %s" % test_label, "Jump to start of while loop") 772 | self.o("%s:" % done_label) 773 | 774 | self.c("WHILE loop - end") 775 | 776 | self._pop_loop_labels() 777 | 778 | def vForLoop(self, node): 779 | test_label = self.new_label() + "_test" 780 | done_label = self.new_label() + "_done" 781 | 782 | self._push_loop_labels(break_label=done_label, 783 | continue_label=test_label) 784 | 785 | self.c("FOR loop - begin") 786 | 787 | self._accept_and_empty_stack(node.begin_stmt) 788 | 789 | self.o("%s:" % test_label) 790 | node.expr.accept(self) 791 | 792 | comparer = self.stack.pop() 793 | self.stack.done() 794 | self.o(" testl %s, %s" % (comparer, comparer), "Test the result") 795 | self.o(" jz %s" % done_label, 796 | "If result is zero, leave for loop") 797 | self._accept_and_empty_stack(node.stmt) 798 | self._accept_and_empty_stack(node.end_stmt) 799 | self.o(" jmp %s" % test_label, "Jump to start of for loop") 800 | self.o("%s:" % done_label) 801 | 802 | self.c("FOR loop - end") 803 | 804 | self._pop_loop_labels() 805 | 806 | def vBreakStatement(self, node): 807 | self.o(" jmp %s" % self.break_labels[-1], 808 | "Loop: break statement") 809 | 810 | def vContinueStatement(self, node): 811 | self.o(" jmp %s" % self.continue_labels[-1], 812 | "Loop: continue statement") 813 | 814 | def _get_new_str_literal_label(self, str): 815 | """Create a new string literal label for the given string, 816 | generate (but do not yet emit) the assembly for it, and return 817 | the name of the new label.""" 818 | 819 | 
label_str = "LC%d" % self.__str_literal_label 820 | str = str.replace('\n', '\\12') 821 | self.__str_literal_str += """%s:\n .ascii "%s\\0"\n""" % (label_str, str) 822 | self.__str_literal_label += 1 823 | return label_str 824 | 825 | def vStringLiteral(self, node): 826 | label_str = self._get_new_str_literal_label(node.get_str()) 827 | 828 | # Make a little preview of the literal in the annotated 829 | # comments. 830 | COMMENT_CHARS = 7 831 | comment_label = node.get_sanitized_str() 832 | if len(comment_label) > COMMENT_CHARS: 833 | comment_label = "%s..." % comment_label[0:COMMENT_CHARS] 834 | 835 | self.o(" movl $%s, %s" % (label_str, 836 | self.stack.push(node.type)), 837 | "Get addr of string literal '%s'" % comment_label) 838 | 839 | def vConst(self, node): 840 | self.o(" movl $%d, %s" % (node.value, 841 | self.stack.push(node.type)), 842 | "Load numeric constant %d" % node.value) 843 | 844 | def vId(self, node): 845 | # If we're only supposed to push our address on the stack, not 846 | # our actual value, then do that and exit. 
847 | if node.output_addr: 848 | self.o(" leal %s, %s" % (node.symbol.compile_loc, 849 | self.stack.push()), 850 | "Get address of %s" % node.symbol.name) 851 | return 852 | type_str = node.type.get_outer_string() 853 | if type_str in ['pointer', 'int']: 854 | instr = 'movl' 855 | elif type_str == 'char': 856 | instr = 'movzbl' 857 | self.o(" %s %s, %s" % (instr, node.symbol.compile_loc, 858 | self.stack.push(node.type)), 859 | "Get value of %s" % node.symbol.name) 860 | 861 | def vArrayExpression(self, node): 862 | node.expr.accept(self) 863 | node.index.accept(self) 864 | reg_index = self.stack.pop(node.index.type) 865 | reg_expr = self.stack.pop(node.expr.type) 866 | reg_to = self.stack.push(node.type) 867 | size = self._calc_var_size(node.type) 868 | addr_str = "(%s,%s,%d)" % (reg_expr, reg_index, size) 869 | self.stack.done() 870 | if node.output_addr: 871 | self.o(" leal %s, %s" % (addr_str, reg_to), 872 | "Load addr of pointer array index") 873 | else: 874 | type_str = node.type.get_outer_string() 875 | if type_str in ['int', 'pointer']: 876 | instr = 'movl' 877 | elif type_str == 'char': 878 | instr = 'movzbl' 879 | self.o(" %s %s, %s" % (instr, addr_str, reg_to), 880 | "Pointer array index dereference") 881 | 882 | def vFunctionExpression(self, node): 883 | """Generates assembly for calling a function.""" 884 | 885 | self.c("FUNCTION CALL to %s() - begin" % 886 | node.function.symbol.name) 887 | 888 | # If we're using any caller-save registers, free them up. 889 | self.stack.save_caller_saves() 890 | 891 | # We need to temporarily reverse the order of the function's 892 | # arguments because we need to push them onto the stack 893 | # in reverse order. 
    def vReturnStatement(self, node):
        """Emit a return: move the value into %eax and jump to the
        function's shared epilogue label."""

        return_reg = self._accept_and_pop(node.expr)
        self.o("        movl %s, %%eax" % return_reg, "Set return value")
        self.o("        jmp %s" % self.curr_func_end_label, "Exit function")
        self.stack.done()

    def _accept_and_pop(self, node):
        """Accept the given node and pop its value into a register
        and return the register.  Implicit type conversion is
        performed, if necessary, by the stack machine.

        If the node is a numeric constant, the literal immediate
        (e.g. '$15') is returned instead, as an optimization."""

        if node.is_const():
            return "$%d" % node.value
        else:
            node.accept(self)
            return self.stack.pop(node.coerce_to_type)

    def _binop_assign(self, node):
        """Performs an assignment operation (=, +=, etc) on the given
        Binop node.  The left side is evaluated as an address (its
        visitor ran in output_addr mode), the right as a value."""

        node.left.accept(self)
        right_reg = self._accept_and_pop(node.right)
        left_reg = self.stack.pop()
        # op[0] maps '=' / '+=' / '-=' ... to mov/add/sub...
        instr = self.binop_instructions[node.op[0]]
        instr += self._type_suffix(node.type)

        type_str = node.type.get_outer_string()
        if type_str == 'char':
            # Byte-sized store needs the low-order sub-register.
            right_reg = self.stack.lo(right_reg)

        self.o("        %s %s, (%s)" % (instr, right_reg, left_reg),
               "Perform assignment '%s'" % node.op)

        # NOTE: Wow, this makes for insanely inefficient code,
        # especially when the result of the operation isn't being
        # used: the assigned value is re-loaded as the expression's
        # result.
        if type_str in ['int', 'pointer']:
            instr = 'movl'
        elif type_str == 'char':
            instr = 'movzbl'

        self.o("        %s (%s), %s" % (instr, left_reg,
                                        self.stack.push(node.type)),
               "Copy assignment result to register")
        self.stack.done()

    def _type_suffix(self, type):
        """Returns the assembly instruction suffix for the given
        type: 'l' for 32-bit types, 'b' for 8-bit types."""

        type_str = type.get_outer_string()
        if type_str in ['int', 'pointer']:
            return 'l'
        elif type_str == 'char':
            return 'b'

    def _binop_arith(self, node):
        """Performs an arithmetic operation (+, -, etc) on the given
        Binop node, leaving the result on the stack machine.

        Raises an Exception if the result register push doesn't hand
        back the just-popped left register (internal invariant)."""

        node.left.accept(self)
        right_reg = self._accept_and_pop(node.right)
        left_reg = self.stack.pop(node.left.coerce_to_type)

        instr = self.binop_instructions[node.op] + \
                self._type_suffix(node.type)
        type_str = node.type.get_outer_string()

        # Byte-sized arithmetic operates on the low sub-registers.
        if type_str == 'char':
            r_reg = self.stack.lo(right_reg)
            l_reg = self.stack.lo(left_reg)
        else:
            r_reg = right_reg
            l_reg = left_reg

        self.o("        %s %s, %s" % (instr, r_reg, l_reg),
               "Perform '%s'" % node.op)
        self.stack.done()

        # Here we rely on the fact that left_reg is now free from
        # the last pop(), so we should be able to push it back onto
        # the stack machine as the result.
        new_reg = self.stack.push(node.type, preferred_reg=left_reg)
        if new_reg != left_reg:
            raise Exception("PANIC! Binop push() isn't same as last pop()!")

    def _binop_compare(self, node):
        """Performs a comparison operation (>, ==, etc) on the given
        Binop node, pushing a zero-extended 0/1 boolean result."""

        node.left.accept(self)
        right_reg = self._accept_and_pop(node.right)
        left_reg = self.stack.pop(node.left.coerce_to_type)
        self.stack.done()

        self.o("        cmpl %s, %s" % (right_reg, left_reg),
               "Compare %s to %s" % (left_reg, right_reg))

        # TODO: this could cause errors, if push() generates mov
        # instructions... not sure if mov instructions change the
        # flags though, they probably shouldn't since they're not
        # arithmetic operations.
        byte_reg = self.stack.push(cparse.BaseType('char'))
        lo = self.stack.lo(byte_reg)
        self.o("        %s %s" % (self.binop_instructions[node.op],
                                  lo),
               "Perform '%s'" % node.op)
        self.o("        movzbl %s, %s" % (lo, byte_reg),
               "Zero-extend the boolean result")

    def vBinop(self, node):
        # Dispatch on operator class: assignment, arithmetic, or
        # comparison.
        if node.op in cparse.Binop.ASSIGN_OPS:
            self._binop_assign(node)
        elif node.op in ['+','-','*']:
            self._binop_arith(node)
        elif node.op in ['==', '!=', '<', '>', '<=', '>=']:
            self._binop_compare(node)

    def vNegative(self, node):
        node.expr.accept(self)
        self.o("        negl %s" % self.stack.peek(),
               "Perform unary negation")

    def vPointer(self, node):
        """Emit a pointer dereference ('*p'); in lvalue context the
        address already on the stack is simply left there."""

        node.expr.accept(self)
        if node.output_addr:
            self.o("", "(Getting pointer target addr via '*')")
            return
        reg_from = self.stack.pop(node.expr.type)
        reg_to = self.stack.push(node.type)
        type_str = node.type.get_outer_string()
        if type_str in ['int', 'pointer']:
            instr = 'movl'
        elif type_str == 'char':
            instr = 'movzbl'
        self.o("        %s (%s), %s" % (instr, reg_from, reg_to),
               "Pointer dereference")
        self.stack.done()

    # (vAddrOf's body continues beyond this chunk of the file.)
    def vAddrOf(self, node):
| node.expr.accept(self) 1074 | self.stack.force_type_change(node.type) 1075 | self.o("", "(Address-of operator '&' used here)") 1076 | 1077 | # --------------------------------------------------------------- 1078 | # End of cx86.py 1079 | # --------------------------------------------------------------- 1080 | -------------------------------------------------------------------------------- /yacc.py: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------------------------- 2 | # ply: yacc.py 3 | # 4 | # Author: David M. Beazley (beazley@cs.uchicago.edu) 5 | # Department of Computer Science 6 | # University of Chicago 7 | # Chicago, IL 60637 8 | # 9 | # Copyright (C) 2001, David M. Beazley 10 | # 11 | # $Header: /cygdrive/c/prog/CVS/mini_c/yacc.py,v 1.1.1.1 2004/05/27 06:40:38 varmaa Exp $ 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 26 | # 27 | # See the file COPYING for a complete copy of the LGPL. 28 | # 29 | # 30 | # This implements an LR parser that is constructed from grammar rules defined 31 | # as Python functions. Roughly speaking, this module is a cross between 32 | # John Aycock's Spark system and the GNU bison utility. 
33 | # 34 | # Disclaimer: This is a work in progress. SLR parsing seems to work fairly 35 | # well and there is extensive error checking. LALR(1) is in progress. The 36 | # rest of this file is a bit of a mess. Please pardon the dust. 37 | # 38 | # The current implementation is only somewhat object-oriented. The 39 | # LR parser itself is defined in terms of an object (which allows multiple 40 | # parsers to co-exist). However, most of the variables used during table 41 | # construction are defined in terms of global variables. Users shouldn't 42 | # notice unless they are trying to define multiple parsers at the same 43 | # time using threads (in which case they should have their head examined). 44 | #----------------------------------------------------------------------------- 45 | 46 | __version__ = "1.4" 47 | 48 | #----------------------------------------------------------------------------- 49 | # === User configurable parameters === 50 | # 51 | # Change these to modify the default behavior of yacc (if you wish) 52 | #----------------------------------------------------------------------------- 53 | 54 | yaccdebug = 1 # Debugging mode. If set, yacc generates a 55 | # a 'parser.out' file in the current directory 56 | 57 | debug_file = 'parser.out' # Default name of the debugging file 58 | tab_module = 'parsetab' # Default name of the table module 59 | default_lr = 'SLR' # Default LR table generation method 60 | 61 | error_count = 3 # Number of symbols that must be shifted to leave recovery mode 62 | 63 | import re, types, sys, cStringIO, md5, os.path 64 | 65 | # Exception raised for yacc-related errors 66 | class YaccError(Exception): pass 67 | 68 | #----------------------------------------------------------------------------- 69 | # === LR Parsing Engine === 70 | # 71 | # The following classes are used for the LR parser itself. 
These are not 72 | # used during table construction and are independent of the actual LR 73 | # table generation algorithm 74 | #----------------------------------------------------------------------------- 75 | 76 | # This class is used to hold non-terminal grammar symbols during parsing. 77 | # It normally has the following attributes set: 78 | # .type = Grammar symbol type 79 | # .value = Symbol value 80 | # .lineno = Starting line number 81 | # .endlineno = Ending line number (optional, set automatically) 82 | 83 | class YaccSymbol: 84 | def __str__(self): return self.type 85 | def __repr__(self): return str(self) 86 | 87 | # This class is a wrapper around the objects actually passed to each 88 | # grammar rule. Index lookup and assignment actually assign the 89 | # .value attribute of the underlying YaccSymbol object. 90 | # The lineno() method returns the line number of a given 91 | # item (or 0 if not defined). The linespan() method returns 92 | # a tuple of (startline,endline) representing the range of lines 93 | # for a symbol. 94 | 95 | class YaccSlice: 96 | def __init__(self,s): 97 | self.slice = s 98 | self.pbstack = [] 99 | 100 | def __getitem__(self,n): 101 | return self.slice[n].value 102 | 103 | def __setitem__(self,n,v): 104 | self.slice[n].value = v 105 | 106 | def lineno(self,n): 107 | return getattr(self.slice[n],"lineno",0) 108 | 109 | def linespan(self,n): 110 | startline = getattr(self.slice[n],"lineno",0) 111 | endline = getattr(self.slice[n],"endlineno",startline) 112 | return startline,endline 113 | 114 | def pushback(self,n): 115 | if n <= 0: 116 | raise ValueError, "Expected a positive value" 117 | if n > (len(self.slice)-1): 118 | raise ValueError, "Can't push %d tokens. Only %d are available." % (n,len(self.slice)-1) 119 | for i in range(0,n): 120 | self.pbstack.append(self.slice[-i-1]) 121 | 122 | # The LR Parsing engine. This is defined as a class so that multiple parsers 123 | # can exist in the same process. 
A user never instantiates this directly. 124 | # Instead, the global yacc() function should be used to create a suitable Parser 125 | # object. 126 | 127 | class Parser: 128 | def __init__(self,magic=None): 129 | 130 | # This is a hack to keep users from trying to instantiate a Parser 131 | # object directly. 132 | 133 | if magic != "xyzzy": 134 | raise YaccError, "Can't instantiate Parser. Use yacc() instead." 135 | 136 | # Reset internal state 137 | self.productions = None # List of productions 138 | self.errorfunc = None # Error handling function 139 | self.action = { } # LR Action table 140 | self.goto = { } # LR goto table 141 | self.require = { } # Attribute require table 142 | self.method = "Unknown LR" # Table construction method used 143 | 144 | def errok(self): 145 | self.errorcount = 0 146 | 147 | def restart(self): 148 | del self.statestack[:] 149 | del self.symstack[:] 150 | sym = YaccSymbol() 151 | sym.type = '$' 152 | self.symstack.append(sym) 153 | self.statestack.append(0) 154 | 155 | def parse(self,input=None,lexer=None,debug=0): 156 | lookahead = None # Current lookahead symbol 157 | lookaheadstack = [ ] # Stack of lookahead symbols 158 | actions = self.action # Local reference to action table 159 | goto = self.goto # Local reference to goto table 160 | prod = self.productions # Local reference to production list 161 | pslice = YaccSlice(None) # Slice object passed to grammar rules 162 | pslice.parser = self # Parser object 163 | self.errorcount = 0 # Used during error recovery 164 | 165 | # If no lexer was given, we will try to use the lex module 166 | if not lexer: 167 | import lex as lexer 168 | 169 | pslice.lexer = lexer 170 | 171 | # If input was supplied, pass to lexer 172 | if input: 173 | lexer.input(input) 174 | 175 | # Tokenize function 176 | get_token = lexer.token 177 | 178 | statestack = [ ] # Stack of parsing states 179 | self.statestack = statestack 180 | symstack = [ ] # Stack of grammar symbols 181 | self.symstack = symstack 182 
| 183 | errtoken = None # Err token 184 | 185 | # The start state is assumed to be (0,$) 186 | statestack.append(0) 187 | sym = YaccSymbol() 188 | sym.type = '$' 189 | symstack.append(sym) 190 | 191 | while 1: 192 | # Get the next symbol on the input. If a lookahead symbol 193 | # is already set, we just use that. Otherwise, we'll pull 194 | # the next token off of the lookaheadstack or from the lexer 195 | if not lookahead: 196 | if not lookaheadstack: 197 | lookahead = get_token() # Get the next token 198 | else: 199 | lookahead = lookaheadstack.pop() 200 | if not lookahead: 201 | lookahead = YaccSymbol() 202 | lookahead.type = '$' 203 | if debug: 204 | errorlead = ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip() 205 | 206 | # Check the action table 207 | s = statestack[-1] 208 | ltype = lookahead.type 209 | t = actions.get((s,ltype),None) 210 | 211 | if t is not None: 212 | if t > 0: 213 | # shift a symbol on the stack 214 | if ltype == '$': 215 | # Error, end of input 216 | sys.stderr.write("yacc: Parse error. 
EOF\n") 217 | return 218 | statestack.append(t) 219 | if debug > 1: 220 | sys.stderr.write("%-60s shift state %s\n" % (errorlead, t)) 221 | symstack.append(lookahead) 222 | lookahead = None 223 | 224 | # Decrease error count on successful shift 225 | if self.errorcount > 0: 226 | self.errorcount -= 1 227 | 228 | continue 229 | 230 | if t < 0: 231 | # reduce a symbol on the stack, emit a production 232 | p = prod[-t] 233 | pname = p.name 234 | plen = p.len 235 | 236 | # Get production function 237 | sym = YaccSymbol() 238 | sym.type = pname # Production name 239 | sym.value = None 240 | if debug > 1: 241 | sys.stderr.write("%-60s reduce %d\n" % (errorlead, -t)) 242 | 243 | if plen: 244 | targ = symstack[-plen-1:] 245 | targ[0] = sym 246 | try: 247 | sym.lineno = targ[1].lineno 248 | sym.endlineno = getattr(targ[-1],"endlineno",targ[-1].lineno) 249 | except AttributeError: 250 | sym.lineno = 0 251 | del symstack[-plen:] 252 | del statestack[-plen:] 253 | else: 254 | sym.lineno = 0 255 | targ = [ sym ] 256 | pslice.slice = targ 257 | pslice.pbstack = [] 258 | # Call the grammar rule with our special slice object 259 | p.func(pslice) 260 | 261 | # Validate attributes of the resulting value attribute 262 | # if require: 263 | # try: 264 | # t0 = targ[0] 265 | # r = Requires.get(t0.type,None) 266 | # t0d = t0.__dict__ 267 | # if r: 268 | # for field in r: 269 | # tn = t0 270 | # for fname in field: 271 | # try: 272 | # tf = tn.__dict__ 273 | # tn = tf.get(fname) 274 | # except StandardError: 275 | # tn = None 276 | # if not tn: 277 | # print "%s:%d: Rule %s doesn't set required attribute '%s'" % \ 278 | # (p.file,p.line,p.name,".".join(field)) 279 | # except TypeError,LookupError: 280 | # print "Bad requires directive " % r 281 | # pass 282 | 283 | 284 | # If there was a pushback, put that on the stack 285 | if pslice.pbstack: 286 | lookaheadstack.append(lookahead) 287 | for _t in pslice.pbstack: 288 | lookaheadstack.append(_t) 289 | lookahead = None 290 | 291 | 
symstack.append(sym) 292 | statestack.append(goto[statestack[-1],pname]) 293 | continue 294 | 295 | if t == 0: 296 | n = symstack[-1] 297 | return getattr(n,"value",None) 298 | sys.stderr.write(errorlead, "\n") 299 | 300 | if t == None: 301 | if debug: 302 | sys.stderr.write(errorlead + "\n") 303 | # We have some kind of parsing error here. To handle 304 | # this, we are going to push the current token onto 305 | # the tokenstack and replace it with an 'error' token. 306 | # If there are any synchronization rules, they may 307 | # catch it. 308 | # 309 | # In addition to pushing the error token, we call call 310 | # the user defined p_error() function if this is the 311 | # first syntax error. This function is only called if 312 | # errorcount == 0. 313 | if not self.errorcount: 314 | self.errorcount = error_count 315 | errtoken = lookahead 316 | if errtoken.type == '$': 317 | errtoken = None # End of file! 318 | if self.errorfunc: 319 | global errok,token,restart 320 | errok = self.errok # Set some special functions available in error recovery 321 | token = get_token 322 | restart = self.restart 323 | tok = self.errorfunc(errtoken) 324 | del errok, token, restart # Delete special functions 325 | 326 | if not self.errorcount: 327 | # User must have done some kind of panic 328 | # mode recovery on their own. The 329 | # returned token is the next lookahead 330 | lookahead = tok 331 | errtoken = None 332 | continue 333 | else: 334 | if errtoken: 335 | if hasattr(errtoken,"lineno"): lineno = lookahead.lineno 336 | else: lineno = 0 337 | if lineno: 338 | sys.stderr.write("yacc: Syntax error at line %d, token=%s\n" % (lineno, errtoken.type)) 339 | else: 340 | sys.stderr.write("yacc: Syntax error, token=%s" % errtoken.type) 341 | else: 342 | sys.stderr.write("yacc: Parse error in input. EOF\n") 343 | return 344 | 345 | else: 346 | self.errorcount = error_count 347 | 348 | # case 1: the statestack only has 1 entry on it. 
If we're in this state, the 349 | # entire parse has been rolled back and we're completely hosed. The token is 350 | # discarded and we just keep going. 351 | 352 | if len(statestack) <= 1 and lookahead.type != '$': 353 | lookahead = None 354 | errtoken = None 355 | # Nuke the pushback stack 356 | del lookaheadstack[:] 357 | continue 358 | 359 | # case 2: the statestack has a couple of entries on it, but we're 360 | # at the end of the file. nuke the top entry and generate an error token 361 | 362 | # Start nuking entries on the stack 363 | if lookahead.type == '$': 364 | # Whoa. We're really hosed here. Bail out 365 | return 366 | 367 | if lookahead.type != 'error': 368 | sym = symstack[-1] 369 | if sym.type == 'error': 370 | # Hmmm. Error is on top of stack, we'll just nuke input 371 | # symbol and continue 372 | lookahead = None 373 | continue 374 | t = YaccSymbol() 375 | t.type = 'error' 376 | if hasattr(lookahead,"lineno"): 377 | t.lineno = lookahead.lineno 378 | t.value = lookahead 379 | lookaheadstack.append(lookahead) 380 | lookahead = t 381 | else: 382 | symstack.pop() 383 | statestack.pop() 384 | 385 | continue 386 | 387 | # Call an error function here 388 | raise RuntimeError, "yacc: internal parser error!!!\n" 389 | 390 | # ----------------------------------------------------------------------------- 391 | # === Parser Construction === 392 | # 393 | # The following functions and variables are used to implement the yacc() function 394 | # itself. This is pretty hairy stuff involving lots of error checking, 395 | # construction of LR items, kernels, and so forth. Although a lot of 396 | # this work is done using global variables, the resulting Parser object 397 | # is completely self contained--meaning that it is safe to repeatedly 398 | # call yacc() with different grammars in the same application. 
399 | # ----------------------------------------------------------------------------- 400 | 401 | # ----------------------------------------------------------------------------- 402 | # validate_file() 403 | # 404 | # This function checks to see if there are duplicated p_rulename() functions 405 | # in the parser module file. Without this function, it is really easy for 406 | # users to make mistakes by cutting and pasting code fragments (and it's a real 407 | # bugger to try and figure out why the resulting parser doesn't work). Therefore, 408 | # we just do a little regular expression pattern matching of def statements 409 | # to try and detect duplicates. 410 | # ----------------------------------------------------------------------------- 411 | 412 | def validate_file(filename): 413 | base,ext = os.path.splitext(filename) 414 | if ext != '.py': return 1 # No idea. Assume it's okay. 415 | 416 | try: 417 | f = open(filename) 418 | lines = f.readlines() 419 | f.close() 420 | except IOError: 421 | return 1 # Oh well 422 | 423 | # Match def p_funcname( 424 | fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(') 425 | counthash = { } 426 | linen = 1 427 | noerror = 1 428 | for l in lines: 429 | m = fre.match(l) 430 | if m: 431 | name = m.group(1) 432 | prev = counthash.get(name) 433 | if not prev: 434 | counthash[name] = linen 435 | else: 436 | sys.stderr.write("%s:%d: Function %s redefined. Previously defined on line %d\n" % (filename,linen,name,prev)) 437 | noerror = 0 438 | linen += 1 439 | return noerror 440 | 441 | # This function looks for functions that might be grammar rules, but which don't have the proper p_suffix. 442 | def validate_dict(d): 443 | for n,v in d.items(): 444 | if n[0:2] == 'p_' and type(v) in (types.FunctionType, types.MethodType): continue 445 | if n[0:2] == 't_': continue 446 | 447 | if n[0:2] == 'p_': 448 | sys.stderr.write("yacc: Warning. 
'%s' not defined as a function\n" % n) 449 | if 1 and isinstance(v,types.FunctionType) and v.func_code.co_argcount == 1: 450 | try: 451 | doc = v.__doc__.split(" ") 452 | if doc[1] == ':': 453 | sys.stderr.write("%s:%d: Warning. Possible grammar rule '%s' defined without p_ prefix.\n" % (v.func_code.co_filename, v.func_code.co_firstlineno,n)) 454 | except StandardError: 455 | pass 456 | 457 | # ----------------------------------------------------------------------------- 458 | # === GRAMMAR FUNCTIONS === 459 | # 460 | # The following global variables and functions are used to store, manipulate, 461 | # and verify the grammar rules specified by the user. 462 | # ----------------------------------------------------------------------------- 463 | 464 | # Initialize all of the global variables used during grammar construction 465 | def initialize_vars(): 466 | global Productions, Prodnames, Prodmap, Terminals 467 | global Nonterminals, First, Follow, Precedence, LRitems 468 | global Errorfunc, Signature, Requires 469 | 470 | Productions = [None] # A list of all of the productions. The first 471 | # entry is always reserved for the purpose of 472 | # building an augmented grammar 473 | 474 | Prodnames = { } # A dictionary mapping the names of nonterminals to a list of all 475 | # productions of that nonterminal. 476 | 477 | Prodmap = { } # A dictionary that is only used to detect duplicate 478 | # productions. 479 | 480 | Terminals = { } # A dictionary mapping the names of terminal symbols to a 481 | # list of the rules where they are used. 482 | 483 | Nonterminals = { } # A dictionary mapping names of nonterminals to a list 484 | # of rule numbers where they are used. 485 | 486 | First = { } # A dictionary of precomputed FIRST(x) symbols 487 | 488 | Follow = { } # A dictionary of precomputed FOLLOW(x) symbols 489 | 490 | Precedence = { } # Precedence rules for each terminal. 
Contains tuples of the 491 | # form ('right',level) or ('nonassoc', level) or ('left',level) 492 | 493 | LRitems = [ ] # A list of all LR items for the grammar. These are the 494 | # productions with the "dot" like E -> E . PLUS E 495 | 496 | Errorfunc = None # User defined error handler 497 | 498 | Signature = md5.new() # Digital signature of the grammar rules, precedence 499 | # and other information. Used to determined when a 500 | # parsing table needs to be regenerated. 501 | 502 | Requires = { } # Requires list 503 | 504 | # File objects used when creating the parser.out debugging file 505 | global _vf, _vfc 506 | _vf = cStringIO.StringIO() 507 | _vfc = cStringIO.StringIO() 508 | 509 | # ----------------------------------------------------------------------------- 510 | # class Production: 511 | # 512 | # This class stores the raw information about a single production or grammar rule. 513 | # It has a few required attributes: 514 | # 515 | # name - Name of the production (nonterminal) 516 | # prod - A list of symbols making up its production 517 | # number - Production number. 518 | # 519 | # In addition, a few additional attributes are used to help with debugging or 520 | # optimization of table generation. 521 | # 522 | # file - File where production action is defined. 523 | # lineno - Line number where action is defined 524 | # func - Action function 525 | # prec - Precedence level 526 | # lr_next - Next LR item. Example, if we are ' E -> E . PLUS E' 527 | # then lr_next refers to 'E -> E PLUS . E' 528 | # lr_index - LR item index (location of the ".") in the prod list. 
529 | # len - Length of the production (number of symbols on right hand side) 530 | # ----------------------------------------------------------------------------- 531 | 532 | class Production: 533 | def __init__(self,**kw): 534 | for k,v in kw.items(): 535 | setattr(self,k,v) 536 | self.lr_index = -1 537 | self.lr0_added = 0 # Flag indicating whether or not added to LR0 closure 538 | self.usyms = [ ] 539 | 540 | def __str__(self): 541 | if self.prod: 542 | s = "%s -> %s" % (self.name," ".join(self.prod)) 543 | else: 544 | s = "%s -> " % self.name 545 | return s 546 | 547 | def __repr__(self): 548 | return str(self) 549 | 550 | # Compute lr_items from the production 551 | def lr_item(self,n): 552 | if n > len(self.prod): return None 553 | p = Production() 554 | p.name = self.name 555 | p.prod = list(self.prod) 556 | p.number = self.number 557 | p.lr_index = n 558 | p.prod.insert(n,".") 559 | p.prod = tuple(p.prod) 560 | p.len = len(p.prod) 561 | p.usyms = self.usyms 562 | 563 | # Precompute list of productions immediately following 564 | try: 565 | p.lrafter = Prodnames[p.prod[n+1]] 566 | except (IndexError,KeyError),e: 567 | p.lrafter = [] 568 | try: 569 | p.lrbefore = p.prod[n-1] 570 | except IndexError: 571 | p.lrbefore = None 572 | 573 | return p 574 | 575 | class MiniProduction: 576 | pass 577 | 578 | # Utility function 579 | def is_identifier(s): 580 | for c in s: 581 | if not (c.isalnum() or c == '_'): return 0 582 | return 1 583 | 584 | # ----------------------------------------------------------------------------- 585 | # add_production() 586 | # 587 | # Given an action function, this function assembles a production rule. 588 | # The production rule is assumed to be found in the function's docstring. 589 | # This rule has the general syntax: 590 | # 591 | # name1 ::= production1 592 | # | production2 593 | # | production3 594 | # ... 595 | # | productionn 596 | # name2 ::= production1 597 | # | production2 598 | # ... 
599 | # ----------------------------------------------------------------------------- 600 | 601 | def add_production(f,file,line,prodname,syms): 602 | 603 | if Terminals.has_key(prodname): 604 | sys.stderr.write("%s:%d: Illegal rule name '%s'. Already defined as a token.\n" % (file,line,prodname)) 605 | return -1 606 | if prodname == 'error': 607 | sys.stderr.write("%s:%d: Illegal rule name '%s'. error is a reserved word.\n" % (file,line,prodname)) 608 | return -1 609 | 610 | if not is_identifier(prodname): 611 | sys.stderr.write("%s:%d: Illegal rule name '%s'\n" % (file,line,prodname)) 612 | return -1 613 | 614 | for s in syms: 615 | if not is_identifier(s) and s != '%prec': 616 | sys.stderr.write("%s:%d: Illegal name '%s' in rule '%s'\n" % (file,line,s, prodname)) 617 | return -1 618 | 619 | # See if the rule is already in the rulemap 620 | map = "%s -> %s" % (prodname,syms) 621 | if Prodmap.has_key(map): 622 | m = Prodmap[map] 623 | sys.stderr.write("%s:%d: Duplicate rule %s.\n" % (file,line, m)) 624 | sys.stderr.write("%s:%d: Previous definition at %s:%d\n" % (file,line, m.file, m.line)) 625 | return -1 626 | 627 | p = Production() 628 | p.name = prodname 629 | p.prod = syms 630 | p.file = file 631 | p.line = line 632 | p.func = f 633 | p.number = len(Productions) 634 | 635 | 636 | Productions.append(p) 637 | Prodmap[map] = p 638 | if not Nonterminals.has_key(prodname): 639 | Nonterminals[prodname] = [ ] 640 | 641 | # Add all terminals to Terminals 642 | i = 0 643 | while i < len(p.prod): 644 | t = p.prod[i] 645 | if t == '%prec': 646 | try: 647 | precname = p.prod[i+1] 648 | except IndexError: 649 | sys.stderr.write("%s:%d: Syntax error. 
Nothing follows %%prec.\n" % (p.file,p.line)) 650 | return -1 651 | 652 | prec = Precedence.get(precname,None) 653 | if not prec: 654 | sys.stderr.write("%s:%d: Nothing known about the precedence of '%s'\n" % (p.file,p.line,precname)) 655 | return -1 656 | else: 657 | p.prec = prec 658 | del p.prod[i] 659 | del p.prod[i] 660 | continue 661 | 662 | if Terminals.has_key(t): 663 | Terminals[t].append(p.number) 664 | # Is a terminal. We'll assign a precedence to p based on this 665 | if not hasattr(p,"prec"): 666 | p.prec = Precedence.get(t,('right',0)) 667 | else: 668 | if not Nonterminals.has_key(t): 669 | Nonterminals[t] = [ ] 670 | Nonterminals[t].append(p.number) 671 | i += 1 672 | 673 | if not hasattr(p,"prec"): 674 | p.prec = ('right',0) 675 | 676 | # Set final length of productions 677 | p.len = len(p.prod) 678 | p.prod = tuple(p.prod) 679 | 680 | # Calculate unique syms in the production 681 | p.usyms = [ ] 682 | for s in p.prod: 683 | if s not in p.usyms: 684 | p.usyms.append(s) 685 | 686 | # Add to the global productions list 687 | try: 688 | Prodnames[p.name].append(p) 689 | except KeyError: 690 | Prodnames[p.name] = [ p ] 691 | return 0 692 | 693 | # Given a raw rule function, this function rips out its doc string 694 | # and adds rules to the grammar 695 | 696 | def add_function(f): 697 | line = f.func_code.co_firstlineno 698 | file = f.func_code.co_filename 699 | error = 0 700 | 701 | if isinstance(f,types.MethodType): 702 | reqdargs = 2 703 | else: 704 | reqdargs = 1 705 | 706 | if f.func_code.co_argcount > reqdargs: 707 | sys.stderr.write("%s:%d: Rule '%s' has too many arguments.\n" % (file,line,f.__name__)) 708 | return -1 709 | 710 | if f.func_code.co_argcount < reqdargs: 711 | sys.stderr.write("%s:%d: Rule '%s' requires an argument.\n" % (file,line,f.__name__)) 712 | return -1 713 | 714 | if f.__doc__: 715 | # Split the doc string into lines 716 | pstrings = f.__doc__.splitlines() 717 | lastp = None 718 | dline = line 719 | for ps in pstrings: 720 | 
dline += 1 721 | p = ps.split() 722 | if not p: continue 723 | try: 724 | if p[0] == '|': 725 | # This is a continuation of a previous rule 726 | if not lastp: 727 | sys.stderr.write("%s:%d: Misplaced '|'.\n" % (file,dline)) 728 | return -1 729 | prodname = lastp 730 | if len(p) > 1: 731 | syms = p[1:] 732 | else: 733 | syms = [ ] 734 | else: 735 | prodname = p[0] 736 | lastp = prodname 737 | assign = p[1] 738 | if len(p) > 2: 739 | syms = p[2:] 740 | else: 741 | syms = [ ] 742 | if assign != ':' and assign != '::=': 743 | sys.stderr.write("%s:%d: Syntax error. Expected ':'\n" % (file,dline)) 744 | return -1 745 | e = add_production(f,file,dline,prodname,syms) 746 | error += e 747 | except StandardError: 748 | sys.stderr.write("%s:%d: Syntax error in rule '%s'\n" % (file,dline,ps)) 749 | error -= 1 750 | else: 751 | sys.stderr.write("%s:%d: No documentation string specified in function '%s'\n" % (file,line,f.__name__)) 752 | return error 753 | 754 | 755 | # Cycle checking code (Michael Dyck) 756 | 757 | def compute_reachable(): 758 | ''' 759 | Find each symbol that can be reached from the start symbol. 760 | Print a warning for any nonterminals that can't be reached. 761 | (Unused terminals have already had their warning.) 762 | ''' 763 | Reachable = { } 764 | for s in Terminals.keys() + Nonterminals.keys(): 765 | Reachable[s] = 0 766 | 767 | mark_reachable_from( Productions[0].prod[0], Reachable ) 768 | 769 | for s in Nonterminals.keys(): 770 | if not Reachable[s]: 771 | sys.stderr.write("yacc: Symbol '%s' is unreachable.\n" % s) 772 | 773 | def mark_reachable_from(s, Reachable): 774 | ''' 775 | Mark all symbols that are reachable from symbol s. 776 | ''' 777 | if Reachable[s]: 778 | # We've already reached symbol s. 
        return
    Reachable[s] = 1
    for p in Prodnames.get(s,[]):
        for r in p.prod:
            mark_reachable_from(r, Reachable)

# -----------------------------------------------------------------------------
# compute_terminates()
#
# This function looks at the various parsing rules and tries to detect
# infinite recursion cycles (grammar rules where there is no possible way
# to derive a string of only terminals).
# -----------------------------------------------------------------------------
def compute_terminates():
    '''
    Raise an error for any symbols that don't terminate.

    Returns 1 if some defined symbol can never derive a string consisting
    only of terminals (i.e. the grammar contains infinite recursion),
    0 otherwise.  Reads the module-level Terminals, Nonterminals and
    Prodnames tables; reports problems on stderr.
    '''
    # Terminates[s] is 1 once we know symbol s can derive a terminal string.
    Terminates = {}

    # Terminals trivially terminate:
    for t in Terminals.keys():
        Terminates[t] = 1

    # '$' (end-of-input marker) counts as a terminal here.
    Terminates['$'] = 1

    # Nonterminals:

    # Initialize to false:
    for n in Nonterminals.keys():
        Terminates[n] = 0

    # Then propagate termination until no change (fixed-point iteration):
    while 1:
        some_change = 0
        for (n,pl) in Prodnames.items():
            # Nonterminal n terminates iff any of its productions terminates.
            for p in pl:
                # Production p terminates iff all of its rhs symbols terminate.
                for s in p.prod:
                    if not Terminates[s]:
                        # The symbol s does not terminate,
                        # so production p does not terminate.
                        p_terminates = 0
                        break
                else:
                    # didn't break from the loop,
                    # so every symbol s terminates
                    # so production p terminates.
                    # (An empty production also lands here and terminates.)
                    p_terminates = 1

                if p_terminates:
                    # symbol n terminates!
                    if not Terminates[n]:
                        Terminates[n] = 1
                        some_change = 1
                    # Don't need to consider any more productions for this n.
                    break

        if not some_change:
            break

    some_error = 0
    for (s,terminates) in Terminates.items():
        if not terminates:
            if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error':
                # s is used-but-not-defined, and we've already warned of that,
                # so it would be overkill to say that it's also non-terminating.
                pass
            else:
                sys.stderr.write("yacc: Infinite recursion detected for symbol '%s'.\n" % s)
                some_error = 1

    return some_error

# -----------------------------------------------------------------------------
# verify_productions()
#
# This function examines all of the supplied rules to see if they seem valid.
# -----------------------------------------------------------------------------
def verify_productions(cycle_check=1):
    '''
    Sanity-check the grammar: undefined symbols, unused tokens/rules and
    (optionally, when cycle_check is true) unreachable and non-terminating
    symbols.  Returns a nonzero error count on fatal problems; warnings go
    to stderr and, when yaccdebug is set, to the _vf debug log.
    '''
    error = 0
    for p in Productions:
        if not p: continue

        for s in p.prod:
            if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error':
                sys.stderr.write("%s:%d: Symbol '%s' used, but not defined as a token or a rule.\n" % (p.file,p.line,s))
                error = 1
                # NOTE(review): this 'continue' only advances the inner
                # symbol loop; it reads as if it were meant to skip to the
                # next production -- confirm intent.
                continue

    unused_tok = 0
    # Now verify all of the tokens
    if yaccdebug:
        _vf.write("Unused terminals:\n\n")
    for s,v in Terminals.items():
        # v is the list of rule numbers where terminal s appears; an empty
        # list means the token was declared but never used.
        if s != 'error' and not v:
            sys.stderr.write("yacc: Warning. Token '%s' defined, but not used.\n" % s)
            if yaccdebug: _vf.write("   %s\n"% s)
            unused_tok += 1

    # Print out all of the productions
    if yaccdebug:
        _vf.write("\nGrammar\n\n")
        for i in range(1,len(Productions)):
            _vf.write("Rule %-5d %s\n" % (i, Productions[i]))

    unused_prod = 0
    # Verify the use of all productions
    for s,v in Nonterminals.items():
        if not v:
            p = Prodnames[s][0]
            sys.stderr.write("%s:%d: Warning. Rule '%s' defined, but not used.\n" % (p.file,p.line, s))
            unused_prod += 1


    if unused_tok == 1:
        sys.stderr.write("yacc: Warning. There is 1 unused token.\n")
    if unused_tok > 1:
        sys.stderr.write("yacc: Warning. There are %d unused tokens.\n" % unused_tok)

    if unused_prod == 1:
        sys.stderr.write("yacc: Warning. There is 1 unused rule.\n")
    if unused_prod > 1:
        sys.stderr.write("yacc: Warning. There are %d unused rules.\n" % unused_prod)

    if yaccdebug:
        _vf.write("\nTerminals, with rules where they appear\n\n")
        ks = Terminals.keys()
        ks.sort()
        for k in ks:
            _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Terminals[k]])))
        _vf.write("\nNonterminals, with rules where they appear\n\n")
        ks = Nonterminals.keys()
        ks.sort()
        for k in ks:
            _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Nonterminals[k]])))

    if (cycle_check):
        compute_reachable()
        error += compute_terminates()
#       error += check_cycles()
    return error
# -----------------------------------------------------------------------------
# build_lritems()
#
# This function walks the list of productions and builds a complete set of the
# LR items.  The LR items are stored in two ways:  First, they are uniquely
# numbered and placed in the list _lritems.  Second, a linked list of LR items
# is built for each production.  For example:
#
#   E -> E PLUS E
#
# Creates the list
#
#  [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ]
# -----------------------------------------------------------------------------

def build_lritems():
    '''
    Number every LR item and chain the items of each production together
    through the lr_next attribute (the production object itself heads the
    chain).  Appends each item to the module-level LRitems list and stamps
    it with its index in lr_num.
    '''
    for p in Productions:
        lastlri = p
        i = 0
        while 1:
            # p.lr_item(i) returns None once the dot has moved past the end.
            lri = p.lr_item(i)
            lastlri.lr_next = lri
            if not lri: break
            lri.lr_num = len(LRitems)
            LRitems.append(lri)
            lastlri = lri
            i += 1

    # In order for the rest of the parser generator to work, we need to
    # guarantee that no more lritems are generated.  Therefore, we nuke
    # the p.lr_item method.  (Only used in debugging)
    # Production.lr_item = None

# -----------------------------------------------------------------------------
# add_precedence()
#
# Given a list of precedence rules, add to the precedence table.
# -----------------------------------------------------------------------------

def add_precedence(plist):
    '''
    Install user-supplied precedence declarations into the module-level
    Precedence table.

    plist is a sequence of tuples ('left'|'right'|'nonassoc', tok, tok, ...);
    each tuple gets the next (1-based) precedence level.  Returns -1 on an
    invalid associativity keyword, otherwise the number of non-fatal errors
    (duplicate tokens, malformed entries).
    '''
    plevel = 0
    error = 0
    for p in plist:
        plevel += 1
        try:
            prec = p[0]
            terms = p[1:]
            if prec not in ('left', 'right', 'nonassoc'):
                sys.stderr.write("yacc: Invalid precedence '%s'\n" % prec)
                return -1
            for t in terms:
                if t in Precedence:
                    sys.stderr.write("yacc: Precedence already specified for terminal '%s'\n" % t)
                    error += 1
                    continue
                Precedence[t] = (prec,plevel)
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # are not swallowed; matches the file's other handlers.
        except StandardError:
            sys.stderr.write("yacc: Invalid precedence table.\n")
            error += 1

    return error

# -----------------------------------------------------------------------------
# augment_grammar()
#
# Compute the augmented grammar.  This is just a rule S' -> start where start
# is the starting symbol.
# -----------------------------------------------------------------------------

def augment_grammar(start=None):
    '''
    Augment the grammar with the synthetic rule S' -> start, stored in
    Productions[0].  If start is not given, the first user rule is used.
    '''
    if not start:
        start = Productions[1].name
    root = Production(name="S'",prod=[start],number=0,len=1,prec=('right',0),func=None)
    root.usyms = [ start ]
    Productions[0] = root
    # Record that the start symbol is referenced by rule 0.
    Nonterminals[start].append(0)


# -------------------------------------------------------------------------
# first()
#
# Compute the value of FIRST1(beta) where beta is a tuple of symbols.
#
# During execution of compute_first1, the result may be incomplete.
# Afterward (e.g., when called from compute_follow()), it will be complete.
# -------------------------------------------------------------------------
def first(beta):
    '''
    Return FIRST(beta) as a list of terminals, where beta is a tuple of
    grammar symbols.  The empty string '' appears in the result iff every
    symbol of beta can derive the empty string (so beta itself can).
    '''
    firsts = [ ]
    for symbol in beta:
        derives_empty = 0
        # Fold the known FIRST set of this symbol into the result.
        for f in First[symbol]:
            if f == '':
                derives_empty = 1
            elif f not in firsts:
                firsts.append(f)
        if not derives_empty:
            # This symbol always yields at least one terminal, so later
            # symbols of beta can contribute nothing more.
            break
    else:
        # No break: every symbol may vanish, so beta derives empty too.
        firsts.append('')

    return firsts


# FOLLOW(x)
# Given a non-terminal.  This function computes the set of all symbols
# that might follow it.  Dragon book, p. 189.

def compute_follow(start=None):
    '''
    Compute FOLLOW(B) for every nonterminal B into the module-level Follow
    table, using the already-computed First sets (compute_first1 must have
    run).  '$' seeds the follow set of the start symbol.
    '''
    # Add '$' to the follow list of the start symbol
    for k in Nonterminals.keys():
        Follow[k] = [ ]

    if not start:
        start = Productions[1].name

    Follow[start] = [ '$' ]

    # Fixed-point iteration: keep propagating until nothing changes.
    while 1:
        didadd = 0
        for p in Productions[1:]:
            # Here is the production set
            for i in range(len(p.prod)):
                B = p.prod[i]
                if Nonterminals.has_key(B):
                    # Okay. We got a non-terminal in a production
                    # Everything in FIRST(of what follows B) except ''
                    # belongs to FOLLOW(B).
                    fst = first(p.prod[i+1:])
                    hasempty = 0
                    for f in fst:
                        if f != '' and f not in Follow[B]:
                            Follow[B].append(f)
                            didadd = 1
                        if f == '':
                            hasempty = 1
                    if hasempty or i == (len(p.prod)-1):
                        # The tail after B can vanish (or B is last), so
                        # everything that follows the lhs follows B too.
                        # Add elements of follow(a) to follow(b)
                        for f in Follow[p.name]:
                            if f not in Follow[B]:
                                Follow[B].append(f)
                                didadd = 1
        if not didadd: break

    # Disabled debug dump (flip the 0 to re-enable).
    if 0 and yaccdebug:
        _vf.write('\nFollow:\n')
        for k in Nonterminals.keys():
            _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Follow[k]])))

# -------------------------------------------------------------------------
# compute_first1()
#
# Compute the value of FIRST1(X) for all symbols
# -------------------------------------------------------------------------
def compute_first1():
    '''
    Fill the module-level First table: First[t] = [t] for terminals, and
    for each nonterminal the set of terminals (plus '' when it can derive
    empty) that can begin one of its derivations.
    '''
    # Terminals:
    for t in Terminals.keys():
        First[t] = [t]

    First['$'] = ['$']
    First['#'] = ['#'] # what's this for?

    # Nonterminals:

    # Initialize to the empty set:
    for n in Nonterminals.keys():
        First[n] = []

    # Then propagate symbols until no change (first() reads the partially
    # built First table, so iteration converges to the fixed point):
    while 1:
        some_change = 0
        for n in Nonterminals.keys():
            for p in Prodnames[n]:
                for f in first(p.prod):
                    if f not in First[n]:
                        First[n].append( f )
                        some_change = 1
        if not some_change:
            break

    # Disabled debug dump (flip the 0 to re-enable).
    if 0 and yaccdebug:
        _vf.write('\nFirst:\n')
        for k in Nonterminals.keys():
            _vf.write("%-20s : %s\n" %
                (k, " ".join([str(s) for s in First[k]])))

# -----------------------------------------------------------------------------
# === SLR Generation ===
#
# The following functions are used to construct SLR (Simple LR) parsing tables
# as described on p.221-229 of the dragon book.
# -----------------------------------------------------------------------------

# Global variables for the LR parsing engine
def lr_init_vars():
    '''Reset the module-level LR table state to empty.'''
    global _lr_action, _lr_goto, _lr_method
    global _lr_goto_cache

    _lr_action       = { }        # Action table
    _lr_goto         = { }        # Goto table
    _lr_method       = "Unknown"  # LR method used
    _lr_goto_cache   = { }

# Compute the LR(0) closure operation on I, where I is a set of LR(0) items.
# prodlist is a list of productions.

_add_count = 0       # Counter used to detect cycles

def lr0_closure(I):
    '''
    Return the LR(0) closure of the item set I: I plus, transitively, the
    initial item of every production whose lhs appears just after a dot.
    Uses the per-item lr0_added stamp (compared against _add_count) so each
    item is added at most once per call.
    '''
    global _add_count

    _add_count += 1
    prodlist = Productions

    # Add everything in I to J
    J = I[:]
    didadd = 1
    while didadd:
        didadd = 0
        for j in J:
            for x in j.lrafter:
                if x.lr0_added == _add_count: continue
                # Add B --> .G to J
                J.append(x.lr_next)
                x.lr0_added = _add_count
                didadd = 1

    return J

# Compute the LR(0) goto function goto(I,X) where I is a set
# of LR(0) items and X is a grammar symbol.   This function is written
# in a way that guarantees uniqueness of the generated goto sets
# (i.e. the same goto set will never be returned as two different Python
# objects).  With uniqueness, we can later do fast set comparisons using
# id(obj) instead of element-wise comparison.

def lr0_goto(I,x):
    '''
    Return goto(I,x) as a closed item set, memoized so that equal goto
    sets are always the *same* list object (callers compare with id()).
    '''
    # First we look for a previously cached entry
    g = _lr_goto_cache.get((id(I),x),None)
    if g: return g

    # Now we generate the goto set in a way that guarantees uniqueness
    # of the result

    # _lr_goto_cache[x] is a trie keyed by the id()s of the items moved
    # over; the terminal key '$' holds the finished goto set, so the same
    # sequence of kernel items always maps to one shared result object.
    s = _lr_goto_cache.get(x,None)
    if not s:
        s = { }
        _lr_goto_cache[x] = s

    gs = [ ]
    for p in I:
        n = p.lr_next
        if n and n.lrbefore == x:
            s1 = s.get(id(n),None)
            if not s1:
                s1 = { }
                s[id(n)] = s1
            gs.append(n)
            s = s1
    g = s.get('$',None)
    if not g:
        if gs:
            g = lr0_closure(gs)
            s['$'] = g
        else:
            # Empty goto: cache the empty list itself.
            s['$'] = gs
    _lr_goto_cache[(id(I),x)] = g
    return g

# Compute the kernel of a set of LR(0) items
def lr0_kernel(I):
    '''
    Return the kernel items of I: the start item (lhs S') plus every item
    whose dot is not at the far left, plus items of empty productions.
    '''
    KI = [ ]
    for p in I:
        if p.name == "S'" or p.lr_index > 0 or p.len == 0:
            KI.append(p)

    return KI

# Maps id(item set) -> state number for the sets built by lr0_items().
_lr0_cidhash = { }
# Compute the LR(0) sets-of-items (the canonical collection).
def lr0_items():
    '''
    Return the canonical collection C of LR(0) item sets, starting from
    the closure of the augmented start item.  Also records each set's
    state number in _lr0_cidhash keyed by id() of the set.
    '''
    C = [ lr0_closure([Productions[0].lr_next]) ]
    i = 0
    for I in C:
        _lr0_cidhash[id(I)] = i
        i += 1

    # Loop over the items in C and each grammar symbols
    # (C grows while we walk it, hence the index-based while loop).
    i = 0
    while i < len(C):
        I = C[i]
        i += 1

        # Collect all of the symbols that could possibly be in the goto(I,X) sets
        asyms = { }
        for ii in I:
            for s in ii.usyms:
                asyms[s] = None

        for x in asyms.keys():
            g = lr0_goto(I,x)
            if not g:  continue
            if _lr0_cidhash.has_key(id(g)): continue
            _lr0_cidhash[id(g)] = len(C)
            C.append(g)

    return C

# -----------------------------------------------------------------------------
# slr_parse_table()
#
# This function constructs an SLR table.
# -----------------------------------------------------------------------------
def slr_parse_table():
    '''
    Fill the module-level _lr_action and _lr_goto tables from the LR(0)
    item sets and the Follow sets (SLR construction, dragon book
    p.221-229).  Action encoding: 0 = accept, j > 0 = shift to state j,
    -n = reduce by rule n, None = nonassoc error.  Shift/reduce conflicts
    are resolved with the Precedence table (default: shift); conflict
    counts are reported and logged to the _vf/_vfc debug streams.
    '''
    global _lr_method
    goto = _lr_goto           # Goto array
    action = _lr_action       # Action array
    actionp = { }             # Action production array (temporary)

    _lr_method = "SLR"

    n_srconflict = 0
    n_rrconflict = 0

    if yaccdebug:
        sys.stderr.write("yacc: Generating SLR parsing table...\n")
        _vf.write("\n\nParsing method: SLR\n\n")

    # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items
    # This determines the number of states

    C = lr0_items()

    # Build the parser table, state by state
    st = 0
    for I in C:
        # Loop over each production in I
        actlist = [ ]              # List of actions

        if yaccdebug:
            _vf.write("\nstate %d\n\n" % st)
            for p in I:
                _vf.write("    (%d) %s\n" % (p.number, str(p)))
            _vf.write("\n")

        for p in I:
            try:
                if p.prod[-1] == ".":
                    # Dot at the far right: this item calls for a reduce
                    # (or accept, for the augmented start rule).
                    if p.name == "S'":
                        # Start symbol. Accept!
                        action[st,"$"] = 0
                        actionp[st,"$"] = p
                    else:
                        # We are at the end of a production.  Reduce!
                        for a in Follow[p.name]:
                            actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p)))
                            r = action.get((st,a),None)
                            if r is not None:
                                # Whoa. Have a shift/reduce or reduce/reduce conflict
                                if r > 0:
                                    # Need to decide on shift or reduce here
                                    # By default we favor shifting. Need to add
                                    # some precedence rules here.
                                    sprec,slevel = Productions[actionp[st,a].number].prec
                                    rprec,rlevel = Precedence.get(a,('right',0))
                                    if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')):
                                        # We really need to reduce here.
                                        action[st,a] = -p.number
                                        actionp[st,a] = p
                                        if not slevel and not rlevel:
                                            _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st)
                                            _vf.write("  ! shift/reduce conflict for %s resolved as reduce.\n" % a)
                                            n_srconflict += 1
                                    elif (slevel == rlevel) and (rprec == 'nonassoc'):
                                        # Nonassociative at equal level: error entry.
                                        action[st,a] = None
                                    else:
                                        # Hmmm. Guess we'll keep the shift
                                        if not slevel and not rlevel:
                                            _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st)
                                            _vf.write("  ! shift/reduce conflict for %s resolved as shift.\n" % a)
                                            n_srconflict +=1
                                elif r < 0:
                                    # Reduce/reduce conflict.   In this case, we favor the rule
                                    # that was defined first in the grammar file
                                    oldp = Productions[-r]
                                    pp = Productions[p.number]
                                    if oldp.line > pp.line:
                                        action[st,a] = -p.number
                                        actionp[st,a] = p
                                    # sys.stderr.write("Reduce/reduce conflict in state %d\n" % st)
                                    n_rrconflict += 1
                                    _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, actionp[st,a].number, actionp[st,a]))
                                    _vf.write("  ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,actionp[st,a].number, actionp[st,a]))
                                else:
                                    sys.stderr.write("Unknown conflict in state %d\n" % st)
                            else:
                                action[st,a] = -p.number
                                actionp[st,a] = p
                else:
                    i = p.lr_index
                    a = p.prod[i+1]       # Get symbol right after the "."
                    if Terminals.has_key(a):
                        g = lr0_goto(I,a)
                        j = _lr0_cidhash.get(id(g),-1)
                        if j >= 0:
                            # We are in a shift state
                            actlist.append((a,p,"shift and go to state %d" % j))
                            r = action.get((st,a),None)
                            if r is not None:
                                # Whoa have a shift/reduce or shift/shift conflict
                                if r > 0:
                                    if r != j:
                                        sys.stderr.write("Shift/shift conflict in state %d\n" % st)
                                elif r < 0:
                                    # Do a precedence check.
                                    #   -  if precedence of reduce rule is higher, we reduce.
                                    #   -  if precedence of reduce is same and left assoc, we reduce.
                                    #   -  otherwise we shift
                                    rprec,rlevel = Productions[actionp[st,a].number].prec
                                    sprec,slevel = Precedence.get(a,('right',0))
                                    if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')):
                                        # We decide to shift here... highest precedence to shift
                                        action[st,a] = j
                                        actionp[st,a] = p
                                        if not slevel and not rlevel:
                                            n_srconflict += 1
                                            _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st)
                                            _vf.write("  ! shift/reduce conflict for %s resolved as shift.\n" % a)
                                    elif (slevel == rlevel) and (rprec == 'nonassoc'):
                                        action[st,a] = None
                                    else:
                                        # Hmmm. Guess we'll keep the reduce
                                        if not slevel and not rlevel:
                                            n_srconflict +=1
                                            _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st)
                                            _vf.write("  ! shift/reduce conflict for %s resolved as reduce.\n" % a)

                                else:
                                    sys.stderr.write("Unknown conflict in state %d\n" % st)
                            else:
                                action[st,a] = j
                                actionp[st,a] = p

            # NOTE(review): the 3-expression raise expects a *traceback* as
            # its third item; passing the exception object e here looks
            # wrong -- confirm before relying on this error path.
            except StandardError,e:
                raise YaccError, "Hosed in slr_parse_table", e

        # Print the actions associated with each terminal
        if yaccdebug:
            for a,p,m in actlist:
                if action.has_key((st,a)):
                    if p is actionp[st,a]:
                        _vf.write("    %-15s %s\n" % (a,m))
            _vf.write("\n")
            # Losing entries (overridden by conflict resolution) are
            # listed separately, flagged with "!".
            for a,p,m in actlist:
                if action.has_key((st,a)):
                    if p is not actionp[st,a]:
                        _vf.write("  ! %-15s [ %s ]\n" % (a,m))

        # Construct the goto table for this state
        if yaccdebug:
            _vf.write("\n")
        nkeys = { }
        for ii in I:
            for s in ii.usyms:
                if Nonterminals.has_key(s):
                    nkeys[s] = None
        for n in nkeys.keys():
            g = lr0_goto(I,n)
            j = _lr0_cidhash.get(id(g),-1)
            if j >= 0:
                goto[st,n] = j
                if yaccdebug:
                    _vf.write("    %-15s shift and go to state %d\n" % (n,j))

        st += 1

    if yaccdebug:
        if n_srconflict == 1:
            sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict)
        if n_srconflict > 1:
            sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict)
        if n_rrconflict == 1:
            sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict)
        if n_rrconflict > 1:
            sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict)


# -----------------------------------------------------------------------------
#                            ==== LALR(1) Parsing ====
# **** UNFINISHED!  6/16/01
# -----------------------------------------------------------------------------


# Compute the lr1_closure of a set I.  I is a list of tuples (p,a) where
# p is a LR0 item and a is a terminal

_lr1_add_count = 0

def lr1_closure(I):
    '''
    Return the LR(1) closure of I, a list of (item, lookahead) tuples.
    Uses each item's lr_added stamp (per lookahead) against
    _lr1_add_count to avoid duplicates.  (Part of the UNFINISHED LALR
    machinery.)
    '''
    global _lr1_add_count

    _lr1_add_count += 1

    J = I[:]

    # Loop over items (p,a) in I.
    ji = 0
    while ji < len(J):
        p,a = J[ji]
        #  p = [ A -> alpha . B beta]

        #  For each production B -> gamma
        for B in p.lr1_after:
            f = tuple(p.lr1_beta + (a,))

            # For each terminal b in first(Beta a)
            for b in first(f):
                # Check if (B -> . gamma, b) is in J
                # Only way this can happen is if the add count mismatches
                pn = B.lr_next
                if pn.lr_added.get(b,0) == _lr1_add_count: continue
                pn.lr_added[b] = _lr1_add_count
                J.append((pn,b))
        ji += 1

    return J

def lalr_parse_table():
    '''
    UNFINISHED LALR(1) construction: currently only annotates LR items
    with lr1_after/lr1_beta, builds the LR(0) kernels and prints them.
    Does NOT fill _lr_action/_lr_goto.
    '''
    # Compute some lr1 information about all of the productions
    for p in LRitems:
        try:
            after = p.prod[p.lr_index + 1]
            p.lr1_after = Prodnames[after]
            p.lr1_beta = p.prod[p.lr_index + 2:]
        except LookupError:
            # Dot at the end of the production: nothing follows it.
            p.lr1_after = [ ]
            p.lr1_beta = [ ]
        p.lr_added = { }

    # Compute the LR(0) items
    C = lr0_items()
    CK = []
    for I in C:
        CK.append(lr0_kernel(I))

    # Debug output only -- the construction stops here.
    print CK

# -----------------------------------------------------------------------------
#                          ==== LR Utility functions ====
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# _lr_write_tables()
#
# This function writes the LR parsing tables to a file
# -----------------------------------------------------------------------------

def lr_write_tables(modulename=tab_module):
    '''
    Write the computed tables to an importable module <modulename>.py:
    _lr_method, _lr_signature (grammar digest used by lr_read_tables to
    detect staleness), _lr_action, _lr_goto and _lr_productions.  I/O
    failures are reported and ignored (best effort).
    '''
    filename = modulename + ".py"
    try:
        f = open(filename,"w")

        f.write("""
# %s
# This file is automatically generated. Do not edit.

_lr_method = %s

_lr_signature = %s
""" % (filename, repr(_lr_method), repr(Signature.digest())))

        # Change smaller to 0 to go back to original tables
        smaller = 1

        # Factor out names to try and make smaller
        # (group entries by token so each token name is emitted once).
        if smaller:
            items = { }

            for k,v in _lr_action.items():
                i = items.get(k[1])
                if not i:
                    i = ([],[])
                    items[k[1]] = i
                i[0].append(k[0])
                i[1].append(v)

            f.write("\n_lr_action_items = {")
            for k,v in items.items():
                f.write("%r:([" % k)
                for i in v[0]:
                    f.write("%r," % i)
                f.write("],[")
                for i in v[1]:
                    f.write("%r," % i)

                f.write("]),")
            f.write("}\n")

            # The generated module rebuilds the flat (state,token) dict
            # from the grouped form at import time.
            f.write("""
_lr_action = { }
for _k, _v in _lr_action_items.items():
   for _x,_y in zip(_v[0],_v[1]):
       _lr_action[(_x,_k)] = _y
del _lr_action_items
""")

        else:
            f.write("\n_lr_action = { ");
            for k,v in _lr_action.items():
                f.write("(%r,%r):%r," % (k[0],k[1],v))
            f.write("}\n");

        if smaller:
            # Factor out names to try and make smaller
            items = { }

            for k,v in _lr_goto.items():
                i = items.get(k[1])
                if not i:
                    i = ([],[])
                    items[k[1]] = i
                i[0].append(k[0])
                i[1].append(v)

            f.write("\n_lr_goto_items = {")
            for k,v in items.items():
                f.write("%r:([" % k)
                for i in v[0]:
                    f.write("%r," % i)
                f.write("],[")
                for i in v[1]:
                    f.write("%r," % i)

                f.write("]),")
            f.write("}\n")

            f.write("""
_lr_goto = { }
for _k, _v in _lr_goto_items.items():
   for _x,_y in zip(_v[0],_v[1]):
       _lr_goto[(_x,_k)] = _y
del _lr_goto_items
""")
        else:
            f.write("\n_lr_goto = { ");
            for k,v in _lr_goto.items():
                f.write("(%r,%r):%r," % (k[0],k[1],v))
            f.write("}\n");

        # Write production table
        f.write("_lr_productions = [\n")
        for p in Productions:
            if p:
                if (p.func):
                    f.write("  (%r,%d,%r,%r,%d),\n" % (p.name, p.len, p.func.__name__,p.file,p.line))
                else:
                    f.write("  (%r,%d,None,None,None),\n" % (p.name, p.len))
            else:
                f.write("  None,\n")
        f.write("]\n")
        f.close()

    except IOError,e:
        print "Unable to create '%s'" % filename
        print e
        return

def lr_read_tables(module=tab_module,optimize=0):
    '''
    Try to load previously written tables from <module>.  Returns 1 and
    installs them into the module-level _lr_* variables when the stored
    signature matches the current grammar digest (or optimize is set);
    returns 0 when missing or stale.
    '''
    global _lr_action, _lr_goto, _lr_productions, _lr_method
    try:
        exec "import %s as parsetab" % module

        if (optimize) or (Signature.digest() == parsetab._lr_signature):
            _lr_action = parsetab._lr_action
            _lr_goto   = parsetab._lr_goto
            _lr_productions = parsetab._lr_productions
            _lr_method = parsetab._lr_method
            return 1
        else:
            return 0

    except (ImportError,AttributeError):
        return 0

# -----------------------------------------------------------------------------
# yacc(module)
#
# Build the parser module
# -----------------------------------------------------------------------------

def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, start=None, check_recursion=1, optimize=0,write_tables=1,debugfile=debug_file):
    '''
    Build a parser from the p_* rule functions found in 'module' (or the
    caller's globals) and return a Parser object; also rebinds the
    module-level parse() to the new parser.  Cached tables in 'tabmodule'
    are reused when their signature matches.  Raises YaccError on any
    grammar problem.
    '''
    global yaccdebug
    yaccdebug = debug

    initialize_vars()
    files = { }
    error = 0

    # Add starting symbol to signature
    if start:
        Signature.update(start)

    # If a "module" parameter was supplied, extract its dictionary.
    # Note: a module may in fact be an instance as well.

    if module:
        # User supplied a module object.
        if isinstance(module, types.ModuleType):
            ldict = module.__dict__
        elif isinstance(module, types.InstanceType):
            # Old-style class instance: copy its attributes into a dict.
            _items = [(k,getattr(module,k)) for k in dir(module)]
            ldict = { }
            for i in _items:
                ldict[i[0]] = i[1]
        else:
            raise ValueError,"Expected a module"

    else:
        # No module given.  We might be able to get information from the caller.
        # Throw an exception and unwind the traceback to get the globals

        try:
            raise RuntimeError
        except RuntimeError:
            e,b,t = sys.exc_info()
            f = t.tb_frame
            f = f.f_back           # Walk out to our calling function
            ldict = f.f_globals    # Grab its globals dictionary

    # If running in optimized mode.  We're going to
    # trust the cached tables and skip all validation.

    if (optimize and lr_read_tables(tabmodule,1)):
        # Read parse table
        del Productions[:]
        for p in _lr_productions:
            if not p:
                Productions.append(None)
            else:
                # Rebuild lightweight productions from the stored tuples
                # (name, len, funcname, file, line).
                m = MiniProduction()
                m.name = p[0]
                m.len = p[1]
                m.file = p[3]
                m.line = p[4]
                if p[2]:
                    m.func = ldict[p[2]]
                Productions.append(m)

    else:
        # Get the tokens map
        if (module and isinstance(module,types.InstanceType)):
            tokens = getattr(module,"tokens",None)
        else:
            tokens = ldict.get("tokens",None)

        if not tokens:
            raise YaccError,"module does not define a list 'tokens'"
        if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)):
            raise YaccError,"tokens must be a list or tuple."

        # Check to see if a requires dictionary is defined.
        requires = ldict.get("require",None)
        if requires:
            if not (isinstance(requires,types.DictType)):
                raise YaccError,"require must be a dictionary."

            for r,v in requires.items():
                try:
                    if not (isinstance(v,types.ListType)):
                        raise TypeError
                    v1 = [x.split(".") for x in v]
                    Requires[r] = v1
                except StandardError:
                    print "Invalid specification for rule '%s' in require. Expected a list of strings" % r


        # Build the dictionary of terminals.  We a record a 0 in the
        # dictionary to track whether or not a terminal is actually
        # used in the grammar

        if 'error' in tokens:
            print "yacc: Illegal token 'error'.  Is a reserved word."
            raise YaccError,"Illegal token name"

        for n in tokens:
            if Terminals.has_key(n):
                print "yacc: Warning. Token '%s' multiply defined." % n
            Terminals[n] = [ ]

        Terminals['error'] = [ ]

        # Get the precedence map (if any)
        prec = ldict.get("precedence",None)
        if prec:
            if not (isinstance(prec,types.ListType) or isinstance(prec,types.TupleType)):
                raise YaccError,"precedence must be a list or tuple."
            add_precedence(prec)
            Signature.update(repr(prec))

        for n in tokens:
            if not Precedence.has_key(n):
                Precedence[n] = ('right',0)         # Default, right associative, 0 precedence

        # Look for error handler
        ef = ldict.get('p_error',None)
        if ef:
            if isinstance(ef,types.FunctionType):
                ismethod = 0
            elif isinstance(ef, types.MethodType):
                ismethod = 1
            else:
                raise YaccError,"'p_error' defined, but is not a function or method."
            eline = ef.func_code.co_firstlineno
            efile = ef.func_code.co_filename
            files[efile] = None

            if (ef.func_code.co_argcount != 1+ismethod):
                raise YaccError,"%s:%d: p_error() requires 1 argument." % (efile,eline)
            global Errorfunc
            Errorfunc = ef
        else:
            print "yacc: Warning. no p_error() function is defined."

        # Get the list of built-in functions with p_ prefix
        symbols = [ldict[f] for f in ldict.keys()
               if (type(ldict[f]) in (types.FunctionType, types.MethodType) and ldict[f].__name__[:2] == 'p_'
                   and ldict[f].__name__ != 'p_error')]

        # Check for non-empty symbols
        if len(symbols) == 0:
            raise YaccError,"no rules of the form p_rulename are defined."

        # Sort the symbols by line number so rules are numbered in source
        # order (reduce/reduce resolution depends on this).
        symbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno))

        # Add all of the symbols to the grammar
        for f in symbols:
            if (add_function(f)) < 0:
                error += 1
            else:
                files[f.func_code.co_filename] = None

        # Make a signature of the docstrings
        for f in symbols:
            if f.__doc__:
                Signature.update(f.__doc__)

        lr_init_vars()

        if error:
            raise YaccError,"Unable to construct parser."

        if not lr_read_tables(tabmodule):

            # Validate files
            for filename in files.keys():
                if not validate_file(filename):
                    error = 1

            # Validate dictionary
            validate_dict(ldict)

            if start and not Prodnames.has_key(start):
                raise YaccError,"Bad starting symbol '%s'" % start

            augment_grammar(start)
            error = verify_productions(cycle_check=check_recursion)
            # NOTE(review): type(f) below tests the dictionary *key*
            # (a string), never FunctionType/MethodType, so otherfunc is
            # always empty -- looks like a latent bug; confirm intent.
            otherfunc = [ldict[f] for f in ldict.keys()
               if (type(f) in (types.FunctionType,types.MethodType) and ldict[f].__name__[:2] != 'p_')]

            if error:
                raise YaccError,"Unable to construct parser."

            build_lritems()
            compute_first1()
            compute_follow(start)

            if method == 'SLR':
                slr_parse_table()
            elif method == 'LALR1':
                # NOTE(review): the LALR path returns early -- no tables
                # are written and no Parser object is produced (the LALR
                # code above is marked UNFINISHED).
                lalr_parse_table()
                return
            else:
                raise YaccError, "Unknown parsing method '%s'" % method

            if write_tables:
                lr_write_tables(tabmodule)

            if yaccdebug:
                try:
                    f = open(debugfile,"w")
                    f.write(_vfc.getvalue())
                    f.write("\n\n")
                    f.write(_vf.getvalue())
                    f.close()
                except IOError,e:
                    print "yacc: can't create '%s'" % debugfile,e

    # Made it here.   Create a parser object and set up its internal state.
    # Set global parse() method to bound method of parser object.

    p = Parser("xyzzy")
    p.productions = Productions
    p.errorfunc = Errorfunc
    p.action = _lr_action
    p.goto = _lr_goto
    p.method = _lr_method
    p.require = Requires

    global parse
    parse = p.parse

    # Clean up all of the globals we created
    if (not optimize):
        yacc_cleanup()
    return p
Delete all of the global variables 1853 | # used during table construction 1854 | 1855 | def yacc_cleanup(): 1856 | global _lr_action, _lr_goto, _lr_method, _lr_goto_cache 1857 | del _lr_action, _lr_goto, _lr_method, _lr_goto_cache 1858 | 1859 | global Productions, Prodnames, Prodmap, Terminals 1860 | global Nonterminals, First, Follow, Precedence, LRitems 1861 | global Errorfunc, Signature, Requires 1862 | 1863 | del Productions, Prodnames, Prodmap, Terminals 1864 | del Nonterminals, First, Follow, Precedence, LRitems 1865 | del Errorfunc, Signature, Requires 1866 | 1867 | global _vf, _vfc 1868 | del _vf, _vfc 1869 | 1870 | 1871 | # Stub that raises an error if parsing is attempted without first calling yacc() 1872 | def parse(*args,**kwargs): 1873 | raise YaccError, "yacc: No parser built with yacc()" 1874 | 1875 | --------------------------------------------------------------------------------