├── CH11
│   ├── a.out
│   └── ap1.s
├── .gitignore
├── CH10
│   ├── ex1.in
│   ├── test1.in
│   ├── test2.in
│   ├── p1.in
│   └── h1.py
├── CH7
│   ├── p0701a.in
│   ├── p0701b.in
│   ├── p0701c.in
│   ├── t1.in
│   ├── p1.in
│   ├── ch7_p1.py
│   └── p1.py
├── sample
├── CH5
│   ├── .ch5_p1.py.swp
│   └── ch5_p1.py
├── CH6
│   ├── t1.in
│   ├── __pycache__
│   │   └── tokenizer_1.cpython-38.pyc
│   ├── README.md
│   ├── t1.py
│   └── tokenizer_1.py
├── CH8
│   ├── p1.in
│   └── p1.py
├── README.md
├── CH4
│   ├── ch4_p9.py
│   ├── ch4_p11.py
│   ├── ch4_p2.py
│   ├── ch4_p1.py
│   ├── ch4_p4.py
│   ├── sp.py
│   └── ch4_p3.py
└── CH9
    └── p1.py

/CH11/a.out: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/.gitignore: -------------------------------------------------------------------------------- 1 | env 2 | --------------------------------------------------------------------------------
/CH10/ex1.in: -------------------------------------------------------------------------------- 1 | 2 + 3 2 | --------------------------------------------------------------------------------
/CH7/p0701a.in: -------------------------------------------------------------------------------- 1 | a = 2 | --------------------------------------------------------------------------------
/CH7/p0701b.in: -------------------------------------------------------------------------------- 1 | printf(3) 2 | --------------------------------------------------------------------------------
/CH7/p0701c.in: -------------------------------------------------------------------------------- 1 | print(3)) 2 | --------------------------------------------------------------------------------
/sample: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spak9/WICRPI/HEAD/sample --------------------------------------------------------------------------------
/CH10/test1.in: -------------------------------------------------------------------------------- 1 | x = 5 + 5 2 | print(x) 3 | y = x + 5 4 | print(y) 5 | --------------------------------------------------------------------------------
/CH10/test2.in: -------------------------------------------------------------------------------- 1 | x = (10 + (10 + 10)) 2 | y = (x * 10) 3 | print(x) 4 | print(y) 5 | --------------------------------------------------------------------------------
/CH5/.ch5_p1.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spak9/WICRPI/HEAD/CH5/.ch5_p1.py.swp --------------------------------------------------------------------------------
/CH6/t1.in: -------------------------------------------------------------------------------- 1 | print(-59 + 20*3) 2 | a = 2 3 | bb_1 = -a + 12 4 | print(a*bb_1 + a*3*(-1 + -1 + -1)) 5 | --------------------------------------------------------------------------------
/CH7/t1.in: -------------------------------------------------------------------------------- 1 | print(-59 + 20*3) 2 | a = 2 3 | bb_1 = -a + 12 4 | print(a*bb_1 + a*3*(-1 + -1 + -1)) 5 | --------------------------------------------------------------------------------
/CH10/p1.in: -------------------------------------------------------------------------------- 1 | print(-59 + 20*3) 2 | a = 2 3 | bb_1 = -(a) + 12 4 | print(a*bb_1 + a*3*(-1 + -1 + -1)) 5 | 6 | --------------------------------------------------------------------------------
/CH7/p1.in: -------------------------------------------------------------------------------- 1 | print(-59 + 20*3) 2 | a = 2 3 | bb_1 = -a + 12 4 | print(a*bb_1 + a*3*(-1 + -1 + -1)) 5 | 6 | --------------------------------------------------------------------------------
/CH8/p1.in: -------------------------------------------------------------------------------- 1 | -(-2+3) 2 | print(-59 + 20*3) 3 | a = 2 4 | bb_1 = -a + 12 5 | print(a*bb_1 + a*3*(-1 + -1 + -1)) 6 | 7 | --------------------------------------------------------------------------------
/CH6/__pycache__/tokenizer_1.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spak9/WICRPI/HEAD/CH6/__pycache__/tokenizer_1.cpython-38.pyc --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # WICRP 2 | A repository for working through Anthony J. Dos Reis's "Writing Interpreters and Compilers for the Raspberry Pi Using Python", 2nd Edition 3 | --------------------------------------------------------------------------------
/CH6/README.md: -------------------------------------------------------------------------------- 1 | # CH 6: Basic Tokenizer for a Python Subset 2 | 3 | ## Tokenizer 4 | The original tokenizer can be found in the software 5 | package from Reis, but I reconstructed the code 6 | purely for reinvention's sake. 7 | --------------------------------------------------------------------------------
/CH4/ch4_p9.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author Steven Pak 3 | Reis's Writing Interpreters and Compilers 2nd Edition 4 | 5 | ch 4 p9 6 | 7 | Grammar: 8 | <S> -> 'a' <A> 9 | <A> -> <B> | <C> 10 | <B> -> 'b' 'b' 'b' 11 | <C> -> 'c' 'c' 'c' 12 | ''' 13 | --------------------------------------------------------------------------------
/CH4/ch4_p11.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author Steven Pak 3 | Reis's Writing Interpreters and Compilers 2nd edition 4 | 5 | ch4 p11 6 | 7 | The current grammar of: 8 | <S> -> <S> 'a' 9 | <S> -> 'b' 10 | doesn't work well with top-down parsers because 11 | of left-recursion. 12 | 13 | The new grammar is: 14 | <S> -> 'b' <A> 15 | <A> -> 'a' <A> | lambda 16 | ''' 17 | --------------------------------------------------------------------------------
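Editor's note: ch4_p11.py states the rewritten grammar but contains no code. Below is a minimal sketch (not from Reis's package; recognize is a hypothetical helper) of a recognizer for the right-recursive form, assuming single-character tokens like the other CH4 parsers:

# sketch: recognizer for <S> -> 'b' <A>,  <A> -> 'a' <A> | lambda
def recognize(s):
    if not s or s[0] != 'b':            # <S> must start with 'b'
        raise RuntimeError("Expecting: b")
    rest = s[1:]
    while rest and rest[0] == 'a':      # <A> -> 'a' <A>, unrolled into a loop
        rest = rest[1:]
    if rest:                            # anything left over is garbage
        raise RuntimeError('Garbage following <S>-string')

recognize('baaa')   # accepted silently, like the other CH4 recognizers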
/CH11/ap1.s: -------------------------------------------------------------------------------- 1 | @ ap1.s 2 | .text @ start of read-only segment 3 | .global _start 4 | _start: 5 | ldr r0,x @ load r0 from x 6 | mov r7, #1 @ mov 1 into r7 7 | svc 0 @ supervisor call to terminate program 8 | 9 | x: .word 14 @ the variable x 10 | 11 | --------------------------------------------------------------------------------
/CH7/ch7_p1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The error with 'a =' is that the parser expects an <expr> but gets a 3 | NEWLINE. 4 | 5 | The error with 'printf(3)' is that, when tokenizing, the tokenizer views 6 | 'printf' as a NAME; Reis is using it as if it were a function (a C-style 7 | printf), which our grammar has no production for, so the parser expects 8 | an ASSIGNOP to follow the NAME token. 9 | 10 | The error with 'print(3))' is that nothing in our grammar supports 11 | unbalanced parentheses: after <printstmt> consumes 'print(3)', stmt() 12 | expects a NEWLINE but finds the extra ')', so the parser reports 13 | 'Expecting NEWLINE'. 14 | ''' 15 | --------------------------------------------------------------------------------
/CH4/ch4_p2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author Steven Pak 3 | Reis's Writing Interpreters & Compilers 2nd Edition 4 | 5 | Ch4 problem 2 solution 6 | 7 | Grammar: 8 | <S> -> 'a' <B> 'd' 9 | <B> -> ('b' 'b')* ['c'] 10 | ''' 11 | 12 | import sys 13 | 14 | tokenindex = -1 15 | token = '' 16 | 17 | def main(): 18 | try: 19 | parser() 20 | except RuntimeError as emsg: 21 | print(emsg) 22 | 23 | def advance(): 24 | global tokenindex, token 25 | tokenindex += 1 # increment index 26 | # check if we're at the end of string or given no input string 27 | if (len(sys.argv) < 2 or tokenindex >= len(sys.argv[1])): 28 | token = '' # the end 29 | else: 30 | token = sys.argv[1][tokenindex] # advance to next token (character) 31 | 32 | def consume(expected): 33 | if (expected == token): 34 | advance() 35 | else: 36 | raise RuntimeError(f'Expecting: {expected}') 37 | 38 | def parser(): 39 | # prime token with first token 40 | advance() 41 | S() 42 | 43 | 44 | def S(): 45 | consume('a') 46 | B() 47 | consume('d') 48 | 49 | def B(): 50 | # pairs of b's 51 | while (token == 'b'): 52 | advance() 53 | consume('b') 54 | 55 | # optional 'c' 56 | if (token == 'c'): 57 | advance() 58 | 59 | main() 60 | --------------------------------------------------------------------------------
/CH4/ch4_p1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author Steven Pak 3 | Reis's Writing Interpreters & Compilers 2nd Edition 4 | 5 | Ch4 problem 1 solution 6 | 7 | Grammar: 8 | <S> -> 'a' <S> 'b' 9 | <S> -> 'c' 10 | ''' 11 | 12 | import sys 13 | 14 | tokenindex = -1 15 | token = '' 16 | 17 | def main(): 18 | try: 19 | parser() 20 | except RuntimeError as emsg: 21 | print(emsg) 22 | 23 | def advance(): 24 | global tokenindex, token 25 | tokenindex += 1 # increment index 26 | # check if we're at the end of string or given no input string 27 | if (len(sys.argv) < 2 or tokenindex >= len(sys.argv[1])): 28 | token = '' # the end 29 | else: 30 | token = sys.argv[1][tokenindex] # advance to next token (character) 31 | 32 | def consume(expected): 33 | if (expected == token): 34 | advance() 35 | else: 36 | raise RuntimeError(f'Expecting: {expected}') 37 | 38 | def parser(): 39 | # prime token with first token 40 | advance() 41 | S() 42 | 43 | def S(): 44 | if (token == 'a'): 45 | # consuming would cause another check 46 | advance() 47 | S() 48 | consume('b') 49 | elif (token == 'c'): 50 | advance() 51 | else: 52 | raise RuntimeError('Expecting an a or c') 53 | 54 | 55 | # begin the program 56 | main() 57 | --------------------------------------------------------------------------------
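Editor's note: unlike sp.py and ch4_p4.py below, ch4_p1.py's parser() never checks that the input is exhausted after S() returns, so a string such as 'aacbbx' is accepted silently. A minimal sketch of the usual end-of-input check, mirroring the parsers that follow (parser_checked is a hypothetical drop-in for parser() above):

def parser_checked():
    advance()                 # prime token with first token
    S()
    if (token != ''):         # S() matched a valid <S>-string; anything left is garbage
        print('Garbage following <S>-string')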
/CH4/ch4_p4.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author Steven Pak 3 | Reis's Writing Interpreters & Compilers 2nd Edition 4 | 5 | Ch4 problem 4 solution 6 | 7 | Grammar: 8 | <S> -> 'a' <B> <S> 9 | <S> -> 'e' 10 | <B> -> 'b' 'c' 'd' 11 | ''' 12 | 13 | import sys 14 | 15 | tokenindex = -1 16 | token = '' 17 | 18 | def main(): 19 | try: 20 | parser() 21 | except RuntimeError as emsg: 22 | print(emsg) 23 | 24 | def advance(): 25 | global tokenindex, token 26 | tokenindex += 1 # increment index 27 | # check if we're at the end of string or given no input string 28 | if (len(sys.argv) < 2 or tokenindex >= len(sys.argv[1])): 29 | token = '' # the end 30 | else: 31 | token = sys.argv[1][tokenindex] # advance to next token (character) 32 | 33 | def consume(expected): 34 | if (expected == token): 35 | advance() 36 | else: 37 | raise RuntimeError(f'Expecting: {expected}') 38 | 39 | def parser(): 40 | # prime token with first token 41 | advance() 42 | S() 43 | 44 | # check if we're at the end of the string 45 | if (token != ''): 46 | print('Garbage within the <S>-string') 47 | else: 48 | print('<S>-string valid') 49 | 50 | 51 | def S(): 52 | if (token == 'a'): 53 | advance() 54 | B() 55 | S() 56 | elif (token == 'e'): 57 | advance() 58 | else: 59 | raise RuntimeError('Expecting a or e') 60 | def B(): 61 | consume('b') 62 | consume('c') 63 | consume('d') 64 | 65 | main() 66 | --------------------------------------------------------------------------------
/CH4/sp.py: -------------------------------------------------------------------------------- 1 | # From Reis's Writing Compilers 2nd Edition 2 | 3 | # Grammar: 4 | # S -> AC 5 | # A -> ab 6 | # C -> cC 7 | # C -> d 8 | 9 | import sys 10 | 11 | tokenindex = -1 12 | token = '' 13 | 14 | def main(): 15 | try: 16 | parser() 17 | except RuntimeError as emsg: 18 | print(emsg) 19 | 20 | def advance(): 21 | global tokenindex, token 22 | tokenindex += 1 # increment index 23 | # check if we're at the end of string or given no input string 24 | if (len(sys.argv) < 2 or tokenindex >= len(sys.argv[1])): 25 | token = '' # the end 26 | else: 27 | token = sys.argv[1][tokenindex] # advance to next token (character) 28 | 29 | def consume(expected): 30 | if (expected == token): 31 | advance() 32 | else: 33 | raise RuntimeError(f'Expecting: {expected}') 34 | 35 | def parser(): 36 | # prime token with first token 37 | advance() 38 | S() 39 | 40 | # check if we've finished the input string: S() chains calls to 41 | # the end, so if a token remains after S() returns, the input 42 | # didn't end with 'd' 43 | 44 | if token != '': 45 | print('Garbage following <S>-string') 46 | 47 | def S(): 48 | A() 49 | C() 50 | 51 | def A(): 52 | consume('a') 53 | consume('b') 54 | 55 | def C(): 56 | if (token == 'c'): 57 | advance() 58 | C() 59 | # if we reach 'd' token, then we've come to the end of the grammar and input string 60 | elif token == 'd': 61 | advance() 62 | else: 63 | raise RuntimeError('Expecting c or d') 64 | 65 | main() 66 | --------------------------------------------------------------------------------
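Editor's note: C()'s right-recursion (C -> cC | d) does no work after the recursive call, so it can equally be written as a loop. A minimal sketch (C_iterative is a hypothetical drop-in for C() in sp.py):

def C_iterative():
    while (token == 'c'):     # unroll C -> cC into iteration
        advance()
    consume('d')              # C -> d ends the chain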
/CH4/ch4_p3.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author Steven Pak 3 | Reis's Writing Interpreters & Compilers 2nd Edition 4 | 5 | Ch4 problem 3 solution 6 | 7 | Grammar: 8 | <S> -> 'a'* <B> 9 | <B> -> 'b'* <C> 10 | <C> -> 'c' ['d'|'e'] 'f' 11 | ''' 12 | 13 | import sys 14 | 15 | tokenindex = -1 16 | token = '' 17 | 18 | def main(): 19 | try: 20 | parser() 21 | except RuntimeError as emsg: 22 | print(emsg) 23 | 24 | def advance(): 25 | global tokenindex, token 26 | tokenindex += 1 # increment index 27 | # check if we're at the end of string or given no input string 28 | if (len(sys.argv) < 2 or tokenindex >= len(sys.argv[1])): 29 | token = '' # the end 30 | else: 31 | token = sys.argv[1][tokenindex] # advance to next token (character) 32 | 33 | def consume(expected): 34 | if (expected == token): 35 | advance() 36 | else: 37 | raise RuntimeError(f'Expecting: {expected}') 38 | 39 | def parser(): 40 | # prime token with first token 41 | advance() 42 | S() 43 | 44 | # check if we're at the end of the string 45 | if (token != ''): 46 | print('Garbage within the <S>-string') 47 | else: 48 | print('<S>-string valid') 49 | def S(): 50 | # loop while token is 'a' 51 | while (token == 'a'): 52 | advance() 53 | B() 54 | 55 | def B(): 56 | # loop while token is 'b' 57 | while (token == 'b'): 58 | advance() 59 | C() 60 | 61 | def C(): 62 | consume('c') 63 | # check between the optional tokens 64 | if (token == 'd'): 65 | advance() 66 | elif (token == 'e'): 67 | advance() 68 | consume('f') 69 | 70 | main() 71 | --------------------------------------------------------------------------------
/CH5/ch5_p1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author Steven Pak 3 | Reis's Writing Interpreters & Compilers 2nd Edition 4 | 5 | Ch5 problem 1 solution 6 | 7 | Grammar: 8 | <S> -> <A> <B> <C> 9 | <A> -> 'a' 10 | <A> -> '' 11 | <B> -> 'b' 12 | <B> -> '' 13 | <C> -> 'c' 14 | <C> -> '' 15 | ''' 16 | 17 | import sys 18 | 19 | tokenindex = -1 20 | token = '' 21 | 22 | def main(): 23 | try: 24 | parser() 25 | except RuntimeError as emsg: 26 | print(emsg) 27 | 28 | def advance(): 29 | global tokenindex, token 30 | tokenindex += 1 # increment index 31 | # check if we're at the end of string or given no input string 32 | if (len(sys.argv) < 2 or tokenindex >= len(sys.argv[1])): 33 | token = '' # the end 34 | else: 35 | token = sys.argv[1][tokenindex] # advance to next token (character) 36 | 37 | def consume(expected): 38 | if (expected == token): 39 | advance() 40 | else: 41 | raise RuntimeError(f'Expecting: {expected}') 42 | 43 | def parser(): 44 | # prime token with first token 45 | advance() 46 | S() 47 | 48 | if (token != ''): 49 | print('Garbage following <S>-string') 50 | else: 51 | print('pass') 52 | 53 | def S(): 54 | # Only 1 production; '' is included because <A>, <B>, and <C> 55 | # can all derive lambda, so the empty string is a valid <S>-string 56 | if (token in ['a', 'b', 'c', '']): 57 | A() 58 | B() 59 | C() 60 | else: 61 | raise RuntimeError('Expecting a, b, or c') 62 | 63 | def A(): 64 | # 1st production 65 | if (token == 'a'): 66 | advance() 67 | 68 | # 2nd production that is lambda, therefore 69 | # the current token is already on the next 70 | # production's token 71 | elif (token in ['b', 'c', '']): 72 | pass 73 | else: 74 | raise RuntimeError('Expecting a or lambda') 75 | 76 | def B(): 77 | if (token == 'b'): 78 | advance() 79 | elif (token in ['c', '']): 80 | pass 81 | else: 82 | raise RuntimeError('Expecting b or lambda') 83 | 84 | def C(): 85 | if (token == 'c'): 86 | advance() 87 | elif (token == ''): 88 | pass 89 | else: 90 | raise RuntimeError('Expecting c or lambda') 91 | 92 | main() 93 | --------------------------------------------------------------------------------
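Editor's note: the lambda branches in A(), B(), and C() above accept exactly the FOLLOW set of each nonterminal — the tokens that may legally come next when the production derives the empty string. A small sketch of those sets for this grammar, computed by hand ('' stands for end of input):

# FOLLOW sets for <S> -> <A> <B> <C>, where <A>/<B>/<C> -> 'a'/'b'/'c' | lambda
FOLLOW = {
    'A': {'b', 'c', ''},   # after <A>: first of <B>, first of <C>, or end
    'B': {'c', ''},        # after <B>: first of <C>, or end
    'C': {''},             # <C> is last, so only end of input
}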
/CH6/t1.py: -------------------------------------------------------------------------------- 1 | # t1.py tokenizer 2 | import sys # sys needed to access cmd line args and sys.exit() 3 | 4 | class Token: 5 | def __init__(self, line, column, category, lexeme): 6 | self.line = line # source prog line number of the token 7 | self.column = column # source prog col in which token starts 8 | self.category = category # category of the token 9 | self.lexeme = lexeme # token in string form 10 | 11 | # global variables 12 | trace = True # controls token trace 13 | source = '' # receives entire source program 14 | sourceindex = 0 # index into source 15 | line = 0 # current line number 16 | column = 0 # current column number 17 | tokenlist = [] # list of tokens created by tokenizer 18 | prevchar = '\n' # '\n' in prevchar signals start of new line 19 | blankline = True # reset to False if line is not blank 20 | 21 | # constants that represent token categories 22 | EOF = 0 # end of file 23 | PRINT = 1 # 'print' keyword 24 | UNSIGNEDINT = 2 # integer 25 | NAME = 3 # identifier that is not a keyword 26 | ASSIGNOP = 4 # '=' assignment operator 27 | LEFTPAREN = 5 # '(' 28 | RIGHTPAREN = 6 # ')' 29 | PLUS = 7 # '+' 30 | MINUS = 8 # '-' 31 | TIMES = 9 # '*' 32 | NEWLINE = 10 # newline character 33 | ERROR = 11 # if not any of the above, then error 34 | 35 | # displayable names for each token category 36 | catnames = ['EOF', 'PRINT', 'UNSIGNEDINT', 'NAME', 'ASSIGNOP', 37 | 'LEFTPAREN', 'RIGHTPAREN', 'PLUS', 'MINUS', 38 | 'TIMES', 'NEWLINE','ERROR'] 39 | 40 | # keywords and their token categories 41 | keywords = {'print': PRINT} 42 | 43 | # one-character tokens and their token categories 44 | smalltokens = {'=':ASSIGNOP, '(':LEFTPAREN, ')':RIGHTPAREN, 45 | '+':PLUS, '-':MINUS, '*':TIMES, '\n':NEWLINE, '':EOF} 46 | 47 | # main() reads input file and calls tokenizer() 48 | def main(): 49 | global source 50 | 51 | if len(sys.argv) == 2: # check if correct number of cmd line args 52 | try: 53 | infile = open(sys.argv[1], 'r') 54 | source = infile.read() # read source program 55 | except IOError: 56 | print('Cannot read input file ' + sys.argv[1]) 57 | sys.exit(1) 58 | else: 59 | print('Wrong number of command line arguments') 60 | print('format: python t1.py <infile>') 61 | sys.exit(1) 62 | 63 | if source[-1] != '\n': # add newline to end if missing 64 | source = source + '\n' 65 | 66 | if trace: # for token trace 67 | print('Line Col Category Lexeme\n') 68 | 69 | try: 70 | tokenizer() # tokenize source code in source 71 | except RuntimeError as emsg: 72 | # output slash n in place of newline 73 | lexeme = token.lexeme.replace('\n', '\\n') 74 | print('\nError on '+ "'" + lexeme + "'" + ' line ' + 75 | str(token.line) + ' column ' + str(token.column)) 76 | print(emsg) # message from RuntimeError object 77 | sys.exit(1) # 1 return code indicates an error has occurred 78 | 79 | # tokenizer tokenizes tokens in source code and appends them to tokens 80 | def tokenizer(): 81 | global token 82 | curchar = ' ' # prime curchar with space 83 | 84 | while True: 85 | # skip whitespace but not newlines 86 | while curchar != '\n' and curchar.isspace(): 87 | curchar = getchar() # get next char from source program 88 | 89 | # construct and initialize a new token 90 | token = Token(line, column, None, '') 91 | 92 | if curchar.isdigit(): # start of unsigned int? 93 | token.category = UNSIGNEDINT # save category of token 94 | while True: 95 | token.lexeme += curchar # append curchar to lexeme 96 | curchar = getchar() # get next character 97 | if not curchar.isdigit(): # break if not a digit 98 | break 99 | 100 | elif curchar.isalpha() or curchar == '_': # start of name?
101 | while True: 102 | token.lexeme += curchar # append curchar to lexeme 103 | curchar = getchar() # get next character 104 | # break if not letter, '_', or digit 105 | if not (curchar.isalnum() or curchar == '_'): 106 | break 107 | 108 | # determine if lexeme is a keyword or name of variable 109 | if token.lexeme in keywords: 110 | token.category = keywords[token.lexeme] 111 | else: 112 | token.category = NAME 113 | 114 | elif curchar in smalltokens: 115 | token.category = smalltokens[curchar] # get category 116 | token.lexeme = curchar 117 | curchar = getchar() # move to first char after token 118 | 119 | else: 120 | token.category = ERROR # invalid token 121 | token.lexeme = curchar # save lexeme 122 | raise RuntimeError('Invalid token') 123 | 124 | tokenlist.append(token) # append token to tokens list 125 | if trace: # display token if trace is True 126 | print("%3s %4s %-14s %s" % (str(token.line), 127 | str(token.column), catnames[token.category], token.lexeme)) 128 | 129 | if token.category == EOF: # finished tokenizing? 130 | break 131 | 132 | # getchar() gets next char from source and adjusts line and column 133 | def getchar(): 134 | global sourceindex, column, line, prevchar, blankline 135 | 136 | # check if starting a new line 137 | if prevchar == '\n': # '\n' signals start of a new line 138 | line += 1 # increment line number 139 | column = 0 # reset column number 140 | blankline = True # initialize blankline 141 | 142 | if sourceindex >= len(source): # at end of source code? 143 | column = 1 # set EOF column to 1 144 | prevchar = '' # save current char for next call 145 | return '' # null str signals end of source 146 | 147 | c = source[sourceindex] # get next char in the source program 148 | sourceindex += 1 # increment sourceindex to next character 149 | column += 1 # increment column number 150 | if not c.isspace(): # if c not whitespace then line not blank 151 | blankline = False # indicate line not blank 152 | prevchar = c # save current char for next call 153 | 154 | # if at end of blank line, return space in place of '\n' 155 | if c == '\n' and blankline: 156 | return ' ' 157 | else: 158 | return c # return character to tokenizer() 159 | 160 | main() # call main function 161 | 162 | --------------------------------------------------------------------------------
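Editor's note: a quick way to see t1.py in action — for a one-line program such as x = 5 (prog.in is a hypothetical file name), the trace printed by the loop above should look roughly like this; the NEWLINE lexeme prints as a literal newline, and EOF is reported at line 2, column 1:

python t1.py prog.in

Line Col Category       Lexeme

  1    1 NAME           x
  1    3 ASSIGNOP       =
  1    5 UNSIGNEDINT    5
  1    6 NEWLINE
  2    1 EOF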
/CH6/tokenizer_1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The 1st version of the Python tokenizer from 3 | Reis's Compilers book. Original version can be 4 | found in his software package and this 5 | version is simply for reinvention's sake. 6 | ''' 7 | 8 | import sys # cmd line args 9 | 10 | class Token: 11 | ''' The only class the tokenizer needs: a token, which consists of 12 | the literal lexeme, the token type (category), and position 13 | properties for error reporting 14 | ''' 15 | def __init__(self, line, column, category, lexeme): 16 | self.line = line # source program line number 17 | self.column = column # the column/index within the line 18 | self.category = category # token type 19 | self.lexeme = lexeme # the literal string 20 | 21 | ''' Global variables ''' 22 | trace = True 23 | source = '' # The whole source program 24 | sourceindex = 0 # the index for source 25 | line = 0 # the actual line 26 | column = 0 # the character index within a line 27 | tokenlist = [] # list holding ALL tokens; for parser 28 | prevchar = '\n' # '\n' signals start of new line 29 | blankline = True # False if line is not blank 30 | 31 | ''' Token Categories ''' 32 | EOF = 0 # end of file 33 | PRINT = 1 # 'print' keyword 34 | UNSIGNEDINT = 2 # integer 35 | NAME = 3 # identifier that is not a keyword 36 | ASSIGNOP = 4 # '=' assignment operator 37 | LEFTPAREN = 5 # '(' 38 | RIGHTPAREN = 6 # ')' 39 | PLUS = 7 # '+' 40 | MINUS = 8 # '-' 41 | TIMES = 9 # '*' 42 | NEWLINE = 10 # newline character 43 | ERROR = 11 # if not any of the above, then error 44 | 45 | # displayable names for each token category, 46 | # indices match up with category values 47 | catnames = ['EOF', 'PRINT', 'UNSIGNEDINT', 'NAME', 'ASSIGNOP', 48 | 'LEFTPAREN', 'RIGHTPAREN', 'PLUS', 'MINUS', 49 | 'TIMES', 'NEWLINE','ERROR'] 50 | 51 | # keywords and their token categories 52 | keywords = {'print': PRINT} 53 | 54 | # one-character tokens and their token categories 55 | smalltokens = {'=':ASSIGNOP, '(':LEFTPAREN, ')':RIGHTPAREN, 56 | '+':PLUS, '-':MINUS, '*':TIMES, '\n':NEWLINE, '':EOF} 57 | 58 | # reads in source program file & calls tokenizer, 59 | # eventually returning with an Exception or a full list of tokens 60 | def main(): 61 | global source # source program 62 | 63 | # correct no. of cmd line args
64 | if (len(sys.argv) == 2): 65 | with open(sys.argv[1]) as f: 66 | source = f.read() # returns the whole file as a string 67 | 68 | else: 69 | print('Incorrect number of cmd-line args') 70 | print('format: python tokenizer_1.py <infile>') 71 | sys.exit(1) 72 | 73 | # for text editors that DON'T end the file with '\n': 74 | # if the trailing newline is missing, add one 75 | if (source[-1] != '\n'): 76 | source = source + '\n' 77 | 78 | # run the tokenizer 79 | try: 80 | tokenizer() 81 | except RuntimeError as emsg: 82 | print(emsg) 83 | sys.exit(1) 84 | 85 | def tokenizer(): 86 | ''' Tokenizes tokens in source code and appends them to 'tokenlist' ''' 87 | global token 88 | curchar = ' ' # prime curchar with a space so the skip loop below fetches the first char 89 | 90 | # tokenize through the whole source program 91 | while True: 92 | # skip the white spaces, but not \n 93 | while (curchar != '\n' and curchar.isspace()): 94 | curchar = getchar() # get next char 95 | 96 | # create a new token; category & lexeme are tbd 97 | token = Token(line, column, None, '') 98 | 99 | # Now we tokenize based on the current character we can see 100 | 101 | # case 1: unsigned ints 102 | if (curchar.isdigit()): 103 | token.category = UNSIGNEDINT 104 | # get the whole string of numbers 105 | while True: 106 | token.lexeme += curchar # append 107 | curchar = getchar() # update char 108 | if not curchar.isdigit(): # break if no longer digit 109 | break 110 | 111 | # case 2: keywords or identifier 112 | elif (curchar.isalpha() or curchar == '_'): 113 | while True: 114 | token.lexeme += curchar # append 115 | curchar = getchar() # get next 116 | if not (curchar.isalnum() or curchar == '_'): 117 | break 118 | 119 | # check if lexeme is a keyword or identifier 120 | if (token.lexeme in keywords): 121 | token.category = keywords[token.lexeme] # PRINT 122 | else: 123 | token.category = NAME # else, it's an identifier 124 | 125 | # case 3: operators/small tokens 126 | elif (curchar in smalltokens): 127 | token.category = smalltokens[curchar] # get category 128 | token.lexeme = curchar 129 | curchar = getchar() # move to first char after token 130 | 131 | # case 4: not a valid token 132 | else: 133 | token.category = ERROR 134 | token.lexeme = curchar 135 | raise RuntimeError('Invalid Token') 136 | 137 | # append to tokenlist 138 | tokenlist.append(token) 139 | 140 | if (trace): 141 | print("%3s %4s %-14s %s" % (str(token.line), 142 | str(token.column), catnames[token.category], token.lexeme)) 143 | 144 | if (token.category == EOF): # end of tokenizing 145 | break 146 | 147 | def getchar(): 148 | ''' returns the next character in the source program and 149 | adjusts 'line' and 'column' globals if needed.
150 | It also returns '' at EOF and ' ' for a blank line's newline 151 | ''' 152 | global sourceindex, column, line, prevchar, blankline 153 | 154 | # if we're starting a new line, then we must update the global 155 | # properties for where we are in the source program 156 | 157 | # if the prevchar is a \n, then we must be at a new line, 158 | # but we'll check for blank line soon 159 | if (prevchar == '\n'): # saying that we've seen a new line 160 | line += 1 161 | column = 0 162 | blankline = True # first assume that we've reached a blank line 163 | 164 | # check if we're at end of source 165 | if (sourceindex >= len(source)): 166 | column = 1 167 | prevchar = '' 168 | return '' # empty string signals EOF 169 | 170 | # get c 171 | c = source[sourceindex] 172 | sourceindex += 1 173 | column += 1 174 | # if 'c' is NOT a whitespace, then it must not be a blank line 175 | if (not c.isspace()): 176 | blankline = False 177 | prevchar = c # update prevchar 178 | 179 | # if at end of blank line (just 1 \n), return space instead 180 | if (c == '\n' and blankline): 181 | return ' ' 182 | else: 183 | return c # return character to tokenizer() 184 | 185 | main() 186 | --------------------------------------------------------------------------------
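Editor's note on the blank-line handling above: because getchar() returns ' ' instead of '\n' for a line containing only whitespace, blank lines never produce NEWLINE tokens. A hypothetical three-line source, replayed by hand:

# source = '\n\nx\n'  ->  tokens: NAME 'x', NEWLINE, EOF
# the two leading '\n' come back as ' ' (blankline is still True),
# so the whitespace-skip loop in tokenizer() swallows them and the
# parser never sees an empty statement.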
/CH7/p1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 1st basic parser of a simple subset of the Python 3 | language. This is mostly a reinvention from 4 | Reis's Writing Compilers 5 | ''' 6 | 7 | import sys # sys needed to access cmd line args and sys.exit() 8 | 9 | class Token: 10 | def __init__(self, line, column, category, lexeme): 11 | self.line = line # srce program line number of the token 12 | self.column = column # srce program col in which token starts 13 | self.category = category # category of the token 14 | self.lexeme = lexeme # token in string form 15 | 16 | # globals 17 | trace = True # controls token trace 18 | source = '' # receives entire source program 19 | sourceindex = 0 # index into the source code in source 20 | line = 0 # current line number 21 | column = 0 # current column number 22 | tokenlist = [] # list of tokens created by tokenizer 23 | tokenindex = -1 # index of current token in tokens 24 | token = None # current token 25 | prevchar = '\n' # '\n' in prevchar signals start of new line 26 | blankline = True # reset to False if line is not blank 27 | 28 | # constants that represent token categories 29 | EOF = 0 # end of file 30 | PRINT = 1 # 'print' keyword 31 | UNSIGNEDINT = 2 # unsigned integer 32 | NAME = 3 # identifier that is not a keyword 33 | ASSIGNOP = 4 # '=' assignment operator 34 | LEFTPAREN = 5 # '(' 35 | RIGHTPAREN = 6 # ')' 36 | PLUS = 7 # '+' 37 | MINUS = 8 # '-' 38 | TIMES = 9 # '*' 39 | NEWLINE = 10 # end of line 40 | ERROR = 11 # if not any of the above, then error 41 | 42 | # displayable names for each token category 43 | catnames = ['EOF', 'PRINT', 'UNSIGNEDINT', 'NAME', 'ASSIGNOP', 44 | 'LEFTPAREN', 'RIGHTPAREN', 'PLUS', 'MINUS', 45 | 'TIMES', 'NEWLINE','ERROR'] 46 | 47 | # keywords and their token categories 48 | keywords = {'print': PRINT} 49 | 50 | # one-character tokens and their token categories 51 | smalltokens = {'=':ASSIGNOP, '(':LEFTPAREN, ')':RIGHTPAREN, 52 | '+':PLUS, '-':MINUS, '*':TIMES, '\n':NEWLINE, '':EOF} 53 | 54 | ################# 55 | # main function # 56 | ################# 57 | # main() reads input file and calls tokenizer() 58 | def main(): 59 | global source 60 | 61 | if len(sys.argv) == 2: # check if correct number of cmd line args 62 | try: 63 | infile = open(sys.argv[1], 'r') 64 | source = infile.read() # read source program 65 | except IOError: 66 | print('Cannot read input file ' + sys.argv[1]) 67 | sys.exit(1) 68 | else: 69 | print('Wrong number of command line arguments') 70 | print('Format: python p1.py <infile>') 71 | sys.exit(1) 72 | 73 | if source[-1] != '\n': # add newline to end if missing 74 | source = source + '\n' 75 | 76 | if trace: 77 | print('------------------------------------------- Token trace') 78 | print('Line Col Category Lexeme\n') 79 | 80 | 81 | try: 82 | tokenizer() 83 | parser() 84 | 85 | # on an error, display an error message 86 | # token is the token object on which the error was detected 87 | except RuntimeError as emsg: 88 | # output slash n in place of newline 89 | lexeme = token.lexeme.replace('\n', '\\n') 90 | print('\nError on '+ "'" + lexeme + "'" + ' line ' + 91 | str(token.line) + ' column ' + str(token.column)) 92 | print(emsg) # message from RuntimeError object 93 | sys.exit(1) 94 | 95 | #################### 96 | # tokenizer # 97 | #################### 98 | def tokenizer(): 99 | global token 100 | curchar = ' ' # prime curchar with space 101 | 102 | while True: 103 | # skip whitespace but not newlines 104 | while curchar != '\n' and curchar.isspace(): 105 | curchar = getchar() # get next char from source program 106 | 107 | # construct and initialize token 108 | token = Token(line, column, None, '') 109 | 110 | if curchar.isdigit(): # start of unsigned int? 111 | token.category = UNSIGNEDINT # save category of token 112 | while True: 113 | token.lexeme += curchar # append curchar to lexeme 114 | curchar = getchar() # get next character 115 | if not curchar.isdigit(): # break if not a digit 116 | break 117 | 118 | elif curchar.isalpha() or curchar == '_': # start of name? 119 | while True: 120 | token.lexeme += curchar # append curchar to lexeme 121 | curchar = getchar() # get next character 122 | # break if not letter, '_', or digit 123 | if not (curchar.isalnum() or curchar == '_'): 124 | break 125 | 126 | # determine if lexeme is a keyword or name of variable 127 | if token.lexeme in keywords: 128 | token.category = keywords[token.lexeme] 129 | else: 130 | token.category = NAME 131 | 132 | elif curchar in smalltokens: 133 | token.category = smalltokens[curchar] # get category 134 | token.lexeme = curchar 135 | curchar = getchar() # move to first char after the token 136 | 137 | else: 138 | token.category = ERROR # invalid token 139 | token.lexeme = curchar 140 | raise RuntimeError('Invalid token') 141 | 142 | tokenlist.append(token) # append token to tokens list 143 | if trace: # display token if trace is True 144 | print("%3s %4s %-14s %s" % (str(token.line), 145 | str(token.column), catnames[token.category], token.lexeme)) 146 | 147 | if token.category == EOF: # finished tokenizing? 148 | break 149 | 150 | # getchar() gets next char from source and adjusts line and column 151 | def getchar(): 152 | global sourceindex, column, line, prevchar, blankline 153 | 154 | # check if starting a new line 155 | if prevchar == '\n': # '\n' signals start of a new line 156 | line += 1 # increment line number 157 | column = 0 # reset column number 158 | blankline = True # initialize blankline 159 | 160 | if sourceindex >= len(source): # at end of source code?
161 | column = 1 # set EOF column to 1 162 | prevchar = '' # save current char for next call 163 | return '' # null str signals end of source 164 | 165 | c = source[sourceindex] # get next char in the source program 166 | sourceindex += 1 # increment sourceindex to next character 167 | column += 1 # increment column number 168 | if not c.isspace(): # if c not whitespace then line not blank 169 | blankline = False # indicate line not blank 170 | prevchar = c # save current character 171 | 172 | # if at end of blank line, return space in place of '\n' 173 | if c == '\n' and blankline: 174 | return ' ' 175 | else: 176 | return c # return character to tokenizer() 177 | 178 | #################### 179 | # Simple Parser # 180 | #################### 181 | 182 | # begin the parser, starting with the 1st token in tokenlist 183 | def parser(): 184 | advance() 185 | program() 186 | 187 | # major function 1: advance() 188 | def advance(): 189 | ''' update the global token to the next token 190 | from tokenlist ''' 191 | global token, tokenindex 192 | tokenindex += 1 # move to next token 193 | if (tokenindex >= len(tokenlist)): # reached the end 194 | raise RuntimeError('Unexpected EOF') 195 | token = tokenlist[tokenindex] 196 | 197 | # major function 2: consume() 198 | def consume(expectedcat): 199 | # check current token with expected 200 | if (token.category == expectedcat): 201 | advance() # get next token 202 | else: 203 | raise RuntimeError('Expecting ' + catnames[expectedcat]) 204 | 205 | # <program> -> <stmt>* EOF 206 | def program(): 207 | print('parsing has started') 208 | # although the grammar just says <stmt>*, note that semantically, this means 209 | # that a program consists of 0 or more statements that 210 | # all begin with some 'NAME' or 'PRINT' token 211 | while (token.category in [NAME, PRINT]): 212 | print('entering stmt loop') 213 | stmt() 214 | if (token.category != EOF): 215 | raise RuntimeError('Expecting EOF') 216 | print(token.category) 217 | 218 | 219 | # <stmt> -> <simplestmt> NEWLINE 220 | def stmt(): 221 | print('stmt()') 222 | # note that we don't consume a 'simplestmt', but rather just call it 223 | # this is because simplestmt is a non-terminal, and not a token (terminal) 224 | simplestmt() 225 | # NEWLINE is a token, therefore we'll consume 226 | consume(NEWLINE) 227 | 228 | # <simplestmt> -> <assignmentstmt> | <printstmt> 229 | def simplestmt(): 230 | print('simple') 231 | # this is where FIRST sets come in 232 | if (token.category == NAME): 233 | assignmentstmt() 234 | elif (token.category == PRINT): 235 | printstmt() 236 | else: 237 | raise RuntimeError('Expecting NAME or PRINT') 238 | 239 | # <assignmentstmt> -> NAME '=' <expr> 240 | def assignmentstmt(): 241 | print('assignmentstmt') 242 | consume(NAME) 243 | consume(ASSIGNOP) 244 | expr() 245 | 246 | # <printstmt> -> 'print' '(' <expr> ')' 247 | def printstmt(): 248 | print('printstmt') 249 | consume(PRINT) 250 | consume(LEFTPAREN) 251 | expr() 252 | consume(RIGHTPAREN) 253 | 254 | # <expr> -> <term> ('+' <term>)* 255 | def expr(): 256 | print('expr') 257 | term() 258 | # loop for ('+' <term>) 259 | while (token.category == PLUS): 260 | # consume wastes another check, just advance() 261 | advance() 262 | term() 263 | # when term() returns, if it sees another + 264 | # in the token stream, it will loop again 265 | 266 | # <term> -> <factor> ('*' <factor>)* 267 | def term(): 268 | print('term') 269 | factor() 270 | # loop for ('*' <factor>) 271 | while (token.category == TIMES): 272 | advance() 273 | factor() 274 | 275 | # <factor> -> '+' <factor> | '-' <factor> | UNSIGNEDINT | NAME | '(' <expr> ')' 276 | def factor(): 277 | print('factor') 278 | # a lot of cases, all disjoint 279 | if (token.category == PLUS or token.category == MINUS): 280 |
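# (editor's note: in this chapter the parser only checks syntax, so a
# unary '+' or '-' is skipped without recording the sign; the CH8
# interpreter adds a 'sign' global at this exact spot to evaluate it)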
advance() 281 | factor() 282 | elif (token.category == UNSIGNEDINT or token.category == NAME): 283 | advance() 284 | elif (token.category == LEFTPAREN): 285 | advance() 286 | expr() 287 | consume(RIGHTPAREN) 288 | else: 289 | raise RuntimeError('Expecting a factor') 290 | 291 | 292 | 293 | 294 | 295 | main() 296 | --------------------------------------------------------------------------------
/CH8/p1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 1st basic parser of a simple subset of the Python 3 | language. This is mostly a reinvention from 4 | Reis's Writing Compilers 5 | 6 | This version is the basic interpreter 7 | ''' 8 | 9 | import sys # sys needed to access cmd line args and sys.exit() 10 | 11 | class Token: 12 | def __init__(self, line, column, category, lexeme): 13 | self.line = line # srce program line number of the token 14 | self.column = column # srce program col in which token starts 15 | self.category = category # category of the token 16 | self.lexeme = lexeme # token in string form 17 | 18 | # globals 19 | trace = True # controls token trace 20 | grade = False 21 | source = '' # receives entire source program 22 | sourceindex = 0 # index into the source code in source 23 | line = 0 # current line number 24 | column = 0 # current column number 25 | tokenlist = [] # list of tokens created by tokenizer 26 | tokenindex = -1 # index of current token in tokens 27 | token = None # current token 28 | prevchar = '\n' # '\n' in prevchar signals start of new line 29 | blankline = True # reset to False if line is not blank 30 | symtab = {} # symbol table mapping variable names to their values 31 | operandstack = [] # stack holding intermediate operand values during evaluation 32 | sign = 0 # +1/-1 multiplier recording the parity of unary minuses 33 | 34 | # constants that represent token categories 35 | EOF = 0 # end of file 36 | PRINT = 1 # 'print' keyword 37 | UNSIGNEDINT = 2 # unsigned integer 38 | NAME = 3 # identifier that is not a keyword 39 | ASSIGNOP = 4 # '=' assignment operator 40 | LEFTPAREN = 5 # '(' 41 | RIGHTPAREN = 6 # ')' 42 | PLUS = 7 # '+' 43 | MINUS = 8 # '-' 44 | TIMES = 9 # '*' 45 | NEWLINE = 10 # end of line 46 | ERROR = 11 # if not any of the above, then error 47 | 48 | # displayable names for each token category 49 | catnames = ['EOF', 'PRINT', 'UNSIGNEDINT', 'NAME', 'ASSIGNOP', 50 | 'LEFTPAREN', 'RIGHTPAREN', 'PLUS', 'MINUS', 51 | 'TIMES', 'NEWLINE','ERROR'] 52 | 53 | # keywords and their token categories 54 | keywords = {'print': PRINT} 55 | 56 | # one-character tokens and their token categories 57 | smalltokens = {'=':ASSIGNOP, '(':LEFTPAREN, ')':RIGHTPAREN, 58 | '+':PLUS, '-':MINUS, '*':TIMES, '\n':NEWLINE, '':EOF} 59 | 60 | ################# 61 | # main function # 62 | ################# 63 | # main() reads input file and calls tokenizer() 64 | def main(): 65 | global source 66 | 67 | if len(sys.argv) == 2: # check if correct number of cmd line args 68 | try: 69 | infile = open(sys.argv[1], 'r') 70 | source = infile.read() # read source program 71 | except IOError: 72 | print('Cannot read input file ' + sys.argv[1]) 73 | sys.exit(1) 74 | else: 75 | print('Wrong number of command line arguments') 76 | print('Format: python p1.py <infile>') 77 | sys.exit(1) 78 | 79 | if source[-1] != '\n': # add newline to end if missing 80 | source = source + '\n' 81 | 82 | if trace: 83 | print('------------------------------------------- Token trace') 84 | print('Line Col Category Lexeme\n') 85 | 86 | 87 | try: 88 |
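# (two passes: tokenize the entire source into tokenlist first, then parse it)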
tokenizer() 89 | parser() 90 | 91 | # on an error, display an error message 92 | # token is the token object on which the error was detected 93 | except RuntimeError as emsg: 94 | # output slash n in place of newline 95 | lexeme = token.lexeme.replace('\n', '\\n') 96 | print('\nError on '+ "'" + lexeme + "'" + ' line ' + 97 | str(token.line) + ' column ' + str(token.column)) 98 | print(emsg) # message from RuntimeError object 99 | sys.exit(1) 100 | 101 | #################### 102 | # tokenizer # 103 | #################### 104 | def tokenizer(): 105 | global token 106 | curchar = ' ' # prime curchar with space 107 | 108 | while True: 109 | # skip whitespace but not newlines 110 | while curchar != '\n' and curchar.isspace(): 111 | curchar = getchar() # get next char from source program 112 | 113 | # construct and initialize token 114 | token = Token(line, column, None, '') 115 | 116 | if curchar.isdigit(): # start of unsigned int? 117 | token.category = UNSIGNEDINT # save category of token 118 | while True: 119 | token.lexeme += curchar # append curchar to lexeme 120 | curchar = getchar() # get next character 121 | if not curchar.isdigit(): # break if not a digit 122 | break 123 | 124 | elif curchar.isalpha() or curchar == '_': # start of name? 125 | while True: 126 | token.lexeme += curchar # append curchar to lexeme 127 | curchar = getchar() # get next character 128 | # break if not letter, '_', or digit 129 | if not (curchar.isalnum() or curchar == '_'): 130 | break 131 | 132 | # determine if lexeme is a keyword or name of variable 133 | if token.lexeme in keywords: 134 | token.category = keywords[token.lexeme] 135 | else: 136 | token.category = NAME 137 | 138 | elif curchar in smalltokens: 139 | token.category = smalltokens[curchar] # get category 140 | token.lexeme = curchar 141 | curchar = getchar() # move to first char after the token 142 | 143 | else: 144 | token.category = ERROR # invalid token 145 | token.lexeme = curchar 146 | raise RuntimeError('Invalid token') 147 | 148 | tokenlist.append(token) # append token to tokens list 149 | if trace: # display token if trace is True 150 | print("%3s %4s %-14s %s" % (str(token.line), 151 | str(token.column), catnames[token.category], token.lexeme)) 152 | 153 | if token.category == EOF: # finished tokenizing? 154 | break 155 | 156 | # getchar() gets next char from source and adjusts line and column 157 | def getchar(): 158 | global sourceindex, column, line, prevchar, blankline 159 | 160 | # check if starting a new line 161 | if prevchar == '\n': # '\n' signals start of a new line 162 | line += 1 # increment line number 163 | column = 0 # reset column number 164 | blankline = True # initialize blankline 165 | 166 | if sourceindex >= len(source): # at end of source code? 
167 | column = 1 # set EOF column to 1 168 | prevchar = '' # save current char for next call 169 | return '' # null str signals end of source 170 | 171 | c = source[sourceindex] # get next char in the source program 172 | sourceindex += 1 # increment sourceindex to next character 173 | column += 1 # increment column number 174 | if not c.isspace(): # if c not whitespace then line not blank 175 | blankline = False # indicate line not blank 176 | prevchar = c # save current character 177 | 178 | # if at end of blank line, return space in place of '\n' 179 | if c == '\n' and blankline: 180 | return ' ' 181 | else: 182 | return c # return character to tokenizer() 183 | 184 | #################### 185 | # Simple Parser # 186 | #################### 187 | 188 | # begin the parser, starting with the 1st token in tokenlist 189 | def parser(): 190 | advance() 191 | program() 192 | 193 | # major function 1: advance() 194 | def advance(): 195 | ''' update the global token to the next token 196 | from tokenlist ''' 197 | global token, tokenindex 198 | tokenindex += 1 # move to next token 199 | if (tokenindex >= len(tokenlist)): # reached the end 200 | raise RuntimeError('Unexpected EOF') 201 | token = tokenlist[tokenindex] 202 | 203 | # major function 2: consume() 204 | def consume(expectedcat): 205 | # check current token with expected 206 | if (token.category == expectedcat): 207 | advance() # get next token 208 | else: 209 | raise RuntimeError('Expecting ' + catnames[expectedcat]) 210 | 211 | # <program> -> <stmt>* EOF 212 | def program(): 213 | # although the grammar just says <stmt>*, note that semantically, this means 214 | # that a program consists of 0 or more statements that 215 | # all begin with some 'NAME' or 'PRINT' token 216 | while (token.category in [NAME, PRINT]): 217 | stmt() 218 | if (token.category != EOF): 219 | raise RuntimeError('Expecting EOF') 220 | print(token.category) 221 | 222 | 223 | # <stmt> -> <simplestmt> NEWLINE 224 | def stmt(): 225 | # note that we don't consume a 'simplestmt', but rather just call it 226 | # this is because simplestmt is a non-terminal, and not a token (terminal) 227 | simplestmt() 228 | # NEWLINE is a token, therefore we'll consume 229 | consume(NEWLINE) 230 | 231 | # <simplestmt> -> <assignmentstmt> | <printstmt> 232 | def simplestmt(): 233 | # this is where FIRST sets come in 234 | if (token.category == NAME): 235 | assignmentstmt() 236 | elif (token.category == PRINT): 237 | printstmt() 238 | else: 239 | raise RuntimeError('Expecting NAME or PRINT') 240 | 241 | # <assignmentstmt> -> NAME '=' <expr> 242 | def assignmentstmt(): 243 | left = token.lexeme # will be the key into the symbol table 244 | consume(NAME) 245 | consume(ASSIGNOP) 246 | expr() 247 | 248 | # after expr() returns, it will have pushed <expr>'s value 249 | # to the top 250 | symtab[left] = operandstack.pop() 251 | 252 | # <printstmt> -> 'print' '(' <expr> ')' 253 | def printstmt(): 254 | consume(PRINT) 255 | consume(LEFTPAREN) 256 | # expr() will have pushed <expr>'s value onto the stack 257 | expr() 258 | print(operandstack.pop()) 259 | consume(RIGHTPAREN) 260 | 261 | # <expr> -> <term> ('+' <term>)* 262 | def expr(): 263 | term() # pushes value of term on top of stack 264 | # loop for ('+' <term>) 265 | while (token.category == PLUS): 266 | # consume wastes another check, just advance() 267 | advance() 268 | term() # pushes value of term on top of stack 269 | rightoperand = operandstack.pop() 270 | leftoperand = operandstack.pop() 271 | operandstack.append(leftoperand + rightoperand) 272 | # when term() returns, if it sees another + 273 | # in the token stream, it will loop again 274 | 275 | # <term> -> <factor> ('*' <factor>)* 276 | def term(): 277 | global sign 278 | sign = 1 279 | factor()
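# (each '*' below is folded as soon as its right operand is parsed,
# so evaluation is left-associative)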
280 | # loop for ('*' <factor>) 281 | while (token.category == TIMES): 282 | advance() 283 | sign = 1 # reset sign before every factor() call because <factor> is the only production that handles MINUS 284 | factor() 285 | rightoperand = operandstack.pop() 286 | leftoperand = operandstack.pop() 287 | operandstack.append(leftoperand * rightoperand) 288 | 289 | # <factor> -> '+' <factor> | '-' <factor> | UNSIGNEDINT | NAME | '(' <expr> ')' 290 | def factor(): 291 | global sign 292 | # a lot of cases, all disjoint 293 | if (token.category == PLUS): 294 | advance() 295 | factor() 296 | elif (token.category == MINUS): 297 | sign = -sign 298 | advance() 299 | factor() 300 | elif (token.category == UNSIGNEDINT): 301 | operandstack.append(sign * int(token.lexeme)) 302 | advance() 303 | elif (token.category == NAME): 304 | if (token.lexeme in symtab): 305 | operandstack.append(sign * symtab[token.lexeme]) 306 | else: 307 | raise RuntimeError(f'Name: {token.lexeme} is not defined') 308 | advance() 309 | elif (token.category == LEFTPAREN): 310 | advance() 311 | # need to save sign because expr() will cause global one to change 312 | savesign = sign 313 | expr() 314 | # if our current factor() call's sign was negative, then make our expression negative 315 | if savesign == -1: 316 | operandstack[-1] = -operandstack[-1] 317 | consume(RIGHTPAREN) 318 | else: 319 | raise RuntimeError('Expecting a factor') 320 | 321 | 322 | 323 | 324 | 325 | main() 326 | if grade: 327 | # display interpreter source code 328 | print('------------------------------------------- ' + sys.argv[0]) 329 | print(open(sys.argv[0]).read()) 330 | --------------------------------------------------------------------------------
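Editor's note: a worked trace (not from the book) of how the interpreter above evaluates -(-2+3), the first line of CH8/p1.in. The snippet replays the operand-stack operations by hand:

operandstack = []
# factor() sees '-': sign flips to -1, then recurses and sees '('
savesign = -1                              # factor() saves the pending unary minus
# inside the parens, expr() evaluates -2 + 3:
operandstack.append(-2)                    # factor(): MINUS flips sign, UNSIGNEDINT pushes -1 * 2
operandstack.append(3)                     # term() after '+': pushes +3
right = operandstack.pop()
left = operandstack.pop()
operandstack.append(left + right)          # expr() folds: -2 + 3 = 1
if savesign == -1:
    operandstack[-1] = -operandstack[-1]   # apply the saved outer minus
print(operandstack[-1])                    # prints -1, matching Python's own evaluation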
/CH9/p1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 1st basic parser of a simple subset of the Python 3 | language. This is mostly a reinvention from 4 | Reis's Writing Compilers 5 | 6 | This version is the basic interpreter 7 | ''' 8 | 9 | import sys # sys needed to access cmd line args and sys.exit() 10 | 11 | class Token: 12 | def __init__(self, line, column, category, lexeme): 13 | self.line = line # srce program line number of the token 14 | self.column = column # srce program col in which token starts 15 | self.category = category # category of the token 16 | self.lexeme = lexeme # token in string form 17 | 18 | # globals 19 | trace = True # controls token trace 20 | grade = False 21 | source = '' # receives entire source program 22 | sourceindex = 0 # index into the source code in source 23 | line = 0 # current line number 24 | column = 0 # current column number 25 | tokenlist = [] # list of tokens created by tokenizer 26 | tokenindex = -1 # index of current token in tokens 27 | token = None # current token 28 | prevchar = '\n' # '\n' in prevchar signals start of new line 29 | blankline = True # reset to False if line is not blank 30 | symtab = {} # symbol table mapping variable names to their values 31 | operandstack = [] # stack holding intermediate operand values during evaluation 32 | sign = 0 # +1/-1 multiplier recording the parity of unary minuses 33 | 34 | # constants that represent token categories 35 | EOF = 0 # end of file 36 | PRINT = 1 # 'print' keyword 37 | UNSIGNEDINT = 2 # unsigned integer 38 | NAME = 3 # identifier that is not a keyword 39 | ASSIGNOP = 4 # '=' assignment operator 40 | LEFTPAREN = 5 # '(' 41 | RIGHTPAREN = 6 # ')' 42 | PLUS = 7 # '+' 43 | MINUS = 8 # '-' 44 | TIMES = 9 # '*' 45 | NEWLINE = 10 # end of line 46 | ERROR = 11 # if not any of the above, then error 47 | 48 | # displayable names for each token category 49 | catnames = ['EOF', 'PRINT', 'UNSIGNEDINT', 'NAME', 'ASSIGNOP', 50 | 'LEFTPAREN', 'RIGHTPAREN', 'PLUS', 'MINUS', 51 | 'TIMES', 'NEWLINE','ERROR'] 52 | 53 | # keywords and their token categories 54 | keywords = {'print': PRINT} 55 | 56 | # one-character tokens and their token categories 57 | smalltokens = {'=':ASSIGNOP, '(':LEFTPAREN, ')':RIGHTPAREN, 58 | '+':PLUS, '-':MINUS, '*':TIMES, '\n':NEWLINE, '':EOF} 59 | 60 | ################# 61 | # main function # 62 | ################# 63 | # main() reads input file and calls tokenizer() 64 | def main(): 65 | global source 66 | 67 | if len(sys.argv) == 2: # check if correct number of cmd line args 68 | try: 69 | infile = open(sys.argv[1], 'r') 70 | source = infile.read() # read source program 71 | except IOError: 72 | print('Cannot read input file ' + sys.argv[1]) 73 | sys.exit(1) 74 | else: 75 | print('Wrong number of command line arguments') 76 | print('Format: python p1.py <infile>') 77 | sys.exit(1) 78 | 79 | if source[-1] != '\n': # add newline to end if missing 80 | source = source + '\n' 81 | 82 | if trace: 83 | print('------------------------------------------- Token trace') 84 | print('Line Col Category Lexeme\n') 85 | 86 | 87 | try: 88 | tokenizer() 89 | parser() 90 | 91 | # on an error, display an error message 92 | # token is the token object on which the error was detected 93 | except RuntimeError as emsg: 94 | # output slash n in place of newline 95 | lexeme = token.lexeme.replace('\n', '\\n') 96 | print('\nError on '+ "'" + lexeme + "'" + ' line ' + 97 | str(token.line) + ' column ' + str(token.column)) 98 | print(emsg) # message from RuntimeError object 99 | sys.exit(1) 100 | 101 | #################### 102 | # tokenizer # 103 | #################### 104 | def tokenizer(): 105 | global token 106 |
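# ('\n' is deliberately not treated as skippable whitespace in the loop
# below: NEWLINE terminates each <stmt>, so it must reach the parser as a token)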
curchar = ' ' # prime curchar with space 107 | 108 | while True: 109 | # skip whitespace but not newlines 110 | while curchar != '\n' and curchar.isspace(): 111 | curchar = getchar() # get next char from source program 112 | 113 | # construct and initialize token 114 | token = Token(line, column, None, '') 115 | 116 | if curchar.isdigit(): # start of unsigned int? 117 | token.category = UNSIGNEDINT # save category of token 118 | while True: 119 | token.lexeme += curchar # append curchar to lexeme 120 | curchar = getchar() # get next character 121 | if not curchar.isdigit(): # break if not a digit 122 | break 123 | 124 | elif curchar.isalpha() or curchar == '_': # start of name? 125 | while True: 126 | token.lexeme += curchar # append curchar to lexeme 127 | curchar = getchar() # get next character 128 | # break if not letter, '_', or digit 129 | if not (curchar.isalnum() or curchar == '_'): 130 | break 131 | 132 | # determine if lexeme is a keyword or name of variable 133 | if token.lexeme in keywords: 134 | token.category = keywords[token.lexeme] 135 | else: 136 | token.category = NAME 137 | 138 | elif curchar in smalltokens: 139 | token.category = smalltokens[curchar] # get category 140 | token.lexeme = curchar 141 | curchar = getchar() # move to first char after the token 142 | 143 | else: 144 | token.category = ERROR # invalid token 145 | token.lexeme = curchar 146 | raise RuntimeError('Invalid token') 147 | 148 | tokenlist.append(token) # append token to tokens list 149 | if trace: # display token if trace is True 150 | print("%3s %4s %-14s %s" % (str(token.line), 151 | str(token.column), catnames[token.category], token.lexeme)) 152 | 153 | if token.category == EOF: # finished tokenizing? 154 | break 155 | 156 | # getchar() gets next char from source and adjusts line and column 157 | def getchar(): 158 | global sourceindex, column, line, prevchar, blankline 159 | 160 | # check if starting a new line 161 | if prevchar == '\n': # '\n' signals start of a new line 162 | line += 1 # increment line number 163 | column = 0 # reset column number 164 | blankline = True # initialize blankline 165 | 166 | if sourceindex >= len(source): # at end of source code? 
167 | column = 1 # set EOF column to 1 168 | prevchar = '' # save current char for next call 169 | return '' # null str signals end of source 170 | 171 | c = source[sourceindex] # get next char in the source program 172 | sourceindex += 1 # increment sourceindex to next character 173 | column += 1 # increment column number 174 | if not c.isspace(): # if c not whitespace then line not blank 175 | blankline = False # indicate line not blank 176 | prevchar = c # save current character 177 | 178 | # if at end of blank line, return space in place of '\n' 179 | if c == '\n' and blankline: 180 | return ' ' 181 | else: 182 | return c # return character to tokenizer() 183 | 184 | #################### 185 | # Simple Parser # 186 | #################### 187 | 188 | # begin the parser, starting with the 1st token in tokenlist 189 | def parser(): 190 | advance() 191 | program() 192 | 193 | # major function 1: advance() 194 | def advance(): 195 | ''' update the global token to the next token 196 | from tokenlist ''' 197 | global token, tokenindex 198 | tokenindex += 1 # move to next token 199 | if (tokenindex >= len(tokenlist)): # reached the end 200 | raise RuntimeError('Unexpected EOF') 201 | token = tokenlist[tokenindex] 202 | 203 | # major function 2: consume() 204 | def consume(expectedcat): 205 | # check current token with expected 206 | if (token.category == expectedcat): 207 | advance() # get next token 208 | else: 209 | raise RuntimeError('Expecting ' + catnames[expectedcat]) 210 | 211 | # <program> -> <stmt>* EOF 212 | def program(): 213 | # although the grammar just says <stmt>*, note that semantically, this means 214 | # that a program consists of 0 or more statements that 215 | # all begin with some 'NAME' or 'PRINT' token 216 | while (token.category in [NAME, PRINT]): 217 | stmt() 218 | if (token.category != EOF): 219 | raise RuntimeError('Expecting EOF') 220 | print(token.category) 221 | 222 | 223 | # <stmt> -> <simplestmt> NEWLINE 224 | def stmt(): 225 | # note that we don't consume a 'simplestmt', but rather just call it 226 | # this is because simplestmt is a non-terminal, and not a token (terminal) 227 | simplestmt() 228 | # NEWLINE is a token, therefore we'll consume 229 | consume(NEWLINE) 230 | 231 | # <simplestmt> -> <assignmentstmt> | <printstmt> 232 | def simplestmt(): 233 | # this is where FIRST sets come in 234 | if (token.category == NAME): 235 | assignmentstmt() 236 | elif (token.category == PRINT): 237 | printstmt() 238 | else: 239 | raise RuntimeError('Expecting NAME or PRINT') 240 | 241 | # <assignmentstmt> -> NAME '=' <expr> 242 | def assignmentstmt(): 243 | left = token.lexeme # will be the key into the symbol table 244 | consume(NAME) 245 | consume(ASSIGNOP) 246 | expr() 247 | 248 | # after expr() returns, it will have pushed <expr>'s value 249 | # to the top 250 | symtab[left] = operandstack.pop() 251 | 252 | # <printstmt> -> 'print' '(' <expr> ')' 253 | def printstmt(): 254 | consume(PRINT) 255 | consume(LEFTPAREN) 256 | # expr() will have pushed <expr>'s value onto the stack 257 | expr() 258 | print(operandstack.pop()) 259 | consume(RIGHTPAREN) 260 | 261 | # <expr> -> <term> ('+' <term>)* 262 | def expr(): 263 | term() # pushes value of term on top of stack 264 | # loop for ('+' <term>) 265 | while (token.category == PLUS): 266 | # consume wastes another check, just advance() 267 | advance() 268 | term() # pushes value of term on top of stack 269 | rightoperand = operandstack.pop() 270 | leftoperand = operandstack.pop() 271 | operandstack.append(leftoperand + rightoperand) 272 | # when term() returns, if it sees another + 273 | # in the token stream, it will loop again 274 | 275 | # <term> -> <factor> ('*' <factor>)* 276 | def term(): 277 | global sign 278 | sign = 1 279 | factor()
# <term> -> <factor> ('*' <factor>)*
def term():
    global sign
    sign = 1
    factor()
    # loop for ('*' <factor>)
    while (token.category == TIMES):
        advance()
        sign = 1   # reset sign before every factor call; <factor> is the only production with MINUS
        factor()
        rightoperand = operandstack.pop()
        leftoperand = operandstack.pop()
        operandstack.append(leftoperand * rightoperand)

# <factor> -> '+' <factor> | '-' <factor> | UNSIGNEDINT | NAME | '(' <expr> ')'
def factor():
    global sign
    # a lot of cases, all disjoint
    if (token.category == PLUS):
        advance()
        factor()
    elif (token.category == MINUS):
        sign = -sign
        advance()
        factor()
    elif (token.category == UNSIGNEDINT):
        operandstack.append(sign * int(token.lexeme))
        advance()
    elif (token.category == NAME):
        if (token.lexeme in symtab):
            operandstack.append(sign * symtab[token.lexeme])
        else:
            raise RuntimeError(f'Name: {token.lexeme} is not defined')
        advance()
    elif (token.category == LEFTPAREN):
        advance()
        # need to save sign because expr() will cause the global one to change
        savesign = sign
        expr()
        # if our current factor() call's sign was negative, then negate the expression
        if savesign == -1:
            operandstack[-1] = -operandstack[-1]
        consume(RIGHTPAREN)
    else:
        raise RuntimeError('Expecting a factor')


main()
if grade:
    # display interpreter source code
    print('------------------------------------------- ' + sys.argv[0])
    print(open(sys.argv[0]).read())
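# Usage sketch (a hypothetical invocation; assumes this file is saved as
# p1.py and run against a test program such as CH7/t1.in):
#
#   $ python p1.py t1.in
#
# For t1.in the two print statements should output 1 and 2:
# -59 + 20*3 = 1, and with a = 2 and bb_1 = -a + 12 = 10, the second
# print is 2*10 + 2*3*(-3) = 20 - 18 = 2.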
--------------------------------------------------------------------------------
/CH10/h1.py:
--------------------------------------------------------------------------------
'''
1st basic hybrid interpreter of a simple subset of the Python
language. For this interpreter, we're only going to utilize a small
subset of the bytecode instructions you can see below as well as
only a few parts for our virtual stack machine (ex: co_names, etc...)

This is mostly a reimplementation from
Reis's Writing Compilers

This version is the basic hybrid interpreter
'''

import sys   # needed to access cmd line args and sys.exit()

class Token:
    def __init__(self, line, column, category, lexeme):
        self.line = line          # source program line number of the token
        self.column = column      # source program col in which token starts
        self.category = category  # category of the token
        self.lexeme = lexeme      # token in string form

# globals

###############################
# hybrid interpreter specific #
###############################
co_code = []     # table for bytecode instructions
co_names = []    # table for names of the 'global' variables
co_consts = []   # table for all the constants

trace = True          # controls token trace
grade = False
source = ''           # receives entire source program
sourceindex = 0       # index into the source code in source
line = 0              # current line number
column = 0            # current column number
tokenlist = []        # list of tokens created by tokenizer
tokenindex = -1       # index of current token in tokens
token = None          # current token
prevchar = '\n'       # '\n' in prevchar signals start of new line
blankline = True      # reset to False if line is not blank
symtab = {}           # symbol table (kept from the source-level interpreter; unused here)
operandstack = []     # operand stack (kept from the source-level interpreter; unused here)
sign = 0              # sign flag (+1 or -1) that tracks the parity of unary minuses

# constants that represent token categories
EOF = 0           # end of file
PRINT = 1         # 'print' keyword
UNSIGNEDINT = 2   # unsigned integer
NAME = 3          # identifier that is not a keyword
ASSIGNOP = 4      # '=' assignment operator
LEFTPAREN = 5     # '('
RIGHTPAREN = 6    # ')'
PLUS = 7          # '+'
MINUS = 8         # '-'
TIMES = 9         # '*'
NEWLINE = 10      # end of line
ERROR = 11        # if not any of the above, then error

# bytecode opcodes (subset)
UNARY_NEGATIVE = 11
BINARY_MULTIPLY = 20
BINARY_ADD = 23
PRINT_EXPR = 70
# be wary of 71 & 72: PRINT_ITEM and PRINT_NEWLINE are legacy (Python 2) opcodes
PRINT_ITEM = 71
PRINT_NEWLINE = 72
STORE_NAME = 90
LOAD_CONST = 100
LOAD_NAME = 101

# displayable names for each token category
catnames = ['EOF', 'print', 'UNSIGNEDINT', 'NAME', 'ASSIGNOP',
            'LEFTPAREN', 'RIGHTPAREN', 'PLUS', 'MINUS',
            'TIMES', 'NEWLINE', 'ERROR']

# keywords and their token categories
keywords = {'print': PRINT}

# one-character tokens and their token categories
smalltokens = {'=':ASSIGNOP, '(':LEFTPAREN, ')':RIGHTPAREN,
               '+':PLUS, '-':MINUS, '*':TIMES, '\n':NEWLINE, '':EOF}
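# The opcode numbers above can be sanity-checked against a real CPython
# build that still has these opcodes (e.g. CPython 3.8); a sketch of an
# interactive session, not part of the interpreter itself:
#
#   >>> import dis
#   >>> dis.opmap['BINARY_ADD'], dis.opmap['STORE_NAME'], dis.opmap['LOAD_CONST']
#   (23, 90, 100)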
#################
# main function #
#################
# main() reads input file and calls tokenizer()
def main():
    global source

    if len(sys.argv) == 2:   # check if correct number of cmd line args
        try:
            infile = open(sys.argv[1], 'r')
            source = infile.read()   # read source program
        except IOError:
            print('Cannot read input file ' + sys.argv[1])
            sys.exit(1)
    else:
        print('Wrong number of command line arguments')
        print('Format: python h1.py <infile>')
        sys.exit(1)

    if source[-1] != '\n':   # add newline to end if missing
        source = source + '\n'

    if trace:
        print('------------------------------------------- Token trace')
        print('Line Col Category Lexeme\n')

    try:
        tokenizer()
        parser()
        print('Nice! Seems like everything was parsed!')
        interpreter()

    # on an error, display an error message
    # token is the token object on which the error was detected
    except RuntimeError as emsg:
        # output slash n in place of newline
        lexeme = token.lexeme.replace('\n', '\\n')
        print('\nError on ' + "'" + lexeme + "'" + ' line ' +
              str(token.line) + ' column ' + str(token.column))
        print(emsg)   # message from RuntimeError object
        sys.exit(1)

####################
#    tokenizer     #
####################
def tokenizer():
    global token
    curchar = ' '   # prime curchar with space

    while True:
        # skip whitespace but not newlines
        while curchar != '\n' and curchar.isspace():
            curchar = getchar()         # get next char from source program

        # construct and initialize token
        token = Token(line, column, None, '')

        if curchar.isdigit():                   # start of unsigned int?
            token.category = UNSIGNEDINT        # save category of token
            while True:
                token.lexeme += curchar         # append curchar to lexeme
                curchar = getchar()             # get next character
                if not curchar.isdigit():       # break if not a digit
                    break

        elif curchar.isalpha() or curchar == '_':   # start of name?
            while True:
                token.lexeme += curchar         # append curchar to lexeme
                curchar = getchar()             # get next character
                # break if not letter, '_', or digit
                if not (curchar.isalnum() or curchar == '_'):
                    break

            # determine if lexeme is a keyword or name of variable
            if token.lexeme in keywords:
                token.category = keywords[token.lexeme]
            else:
                token.category = NAME

        elif curchar in smalltokens:
            token.category = smalltokens[curchar]   # get category
            token.lexeme = curchar
            curchar = getchar()         # move to first char after the token

        else:
            token.category = ERROR      # invalid token
            token.lexeme = curchar
            raise RuntimeError('Invalid token')

        tokenlist.append(token)         # append token to tokens list
        if trace:                       # display token if trace is True
            print("%3s %4s %-14s %s" % (str(token.line),
                str(token.column), catnames[token.category], token.lexeme))

        if token.category == EOF:       # finished tokenizing?
            break
# getchar() gets next char from source and adjusts line and column
def getchar():
    global sourceindex, column, line, prevchar, blankline

    # check if starting a new line
    if prevchar == '\n':        # '\n' signals start of a new line
        line += 1               # increment line number
        column = 0              # reset column number
        blankline = True        # initialize blankline

    if sourceindex >= len(source):   # at end of source code?
        column = 1              # set EOF column to 1
        prevchar = ''           # save current char for next call
        return ''               # null str signals end of source

    c = source[sourceindex]     # get next char in the source program
    sourceindex += 1            # increment sourceindex to next character
    column += 1                 # increment column number
    if not c.isspace():         # if c not whitespace then line not blank
        blankline = False       # indicate line not blank
    prevchar = c                # save current character

    # if at end of blank line, return space in place of '\n'
    if c == '\n' and blankline:
        return ' '
    else:
        return c                # return character to tokenizer()

################################
#   Simple Parser/Generator    #
################################

# begin the parser, starting with the 1st token in tokenlist
def parser():
    advance()
    program()

# major function 1: advance()
def advance():
    ''' update the global token to the next token
        from tokenlist '''
    global token, tokenindex
    tokenindex += 1                     # move to next token
    if (tokenindex >= len(tokenlist)):  # reached the end
        raise RuntimeError('Unexpected EOF')
    token = tokenlist[tokenindex]
    # print(f'Current Token: {token.lexeme}, Cat: {token.category} ')

# major function 2: consume()
def consume(expectedcat):
    # check current token with expected
    if (token.category == expectedcat):
        advance()   # get next token
    else:
        raise RuntimeError('Expecting ' + catnames[expectedcat])

# <program> -> <stmt>* EOF
def program():
    # although <stmt>* is stated, note that semantically this means
    # that a program consists of 0 or more statements that
    # all begin with some 'NAME' or 'PRINT' token
    while (token.category in [NAME, PRINT]):
        stmt()
    if (token.category != EOF):
        raise RuntimeError('Expecting EOF')
    # print(token.category)


# <stmt> -> <simplestmt> NEWLINE
def stmt():
    # note that we don't consume a 'simplestmt', but rather just call it
    # this is because simplestmt is a non-terminal, and not a token (terminal)
    simplestmt()
    # NEWLINE is a token, therefore we'll consume
    consume(NEWLINE)

# <simplestmt> -> <assignmentstmt> | <printstmt>
def simplestmt():
    # this is where FIRST sets come in
    if (token.category == NAME):
        assignmentstmt()
    elif (token.category == PRINT):
        printstmt()
    else:
        raise RuntimeError('Expecting NAME or PRINT')

# <assignmentstmt> -> NAME '=' <expr>
def assignmentstmt():
    # check if NAME exists in program
    if token.lexeme in co_names:
        index = co_names.index(token.lexeme)

    # first time seeing the variable
    else:
        index = len(co_names)
        co_names.append(token.lexeme)

    advance()
    consume(ASSIGNOP)
    expr()   # will push expr() value

    # generate bytecode - STORE_NAME
    co_code.append(STORE_NAME)   # pops TOS and stores in co_values[index]
    co_code.append(index)


# <printstmt> -> 'print' '(' <expr> ')'
def printstmt():
    advance()
    consume(LEFTPAREN)
    # expr() will generate its bytecode and push its 'value'
    expr()

    # printstmt() needs to pop the value from expr() and print it
    # note: book says use PRINT_ITEM & PRINT_NEWLINE, but we're
    # going to try to keep up to date: use PRINT_EXPR (70 dec)
    co_code.append(PRINT_EXPR)
    consume(RIGHTPAREN)
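# To make the code generation concrete, a sketch of what the two-line
# program 'x = 5' / 'print(x)' should produce with the scheme above
# (opcodes shown by name; co_code actually stores their numeric values):
#
#   co_names  = ['x']
#   co_consts = [5]
#   co_code   = [LOAD_CONST, 0,    # push co_consts[0]
#                STORE_NAME, 0,    # pop into x
#                LOAD_NAME,  0,    # push x's value
#                PRINT_EXPR]       # pop and print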
# <expr> -> <term> ('+' <term>)*
def expr():
    term()   # pushes value of term on top of stack
    # loop for ('+' <term>)
    while (token.category == PLUS):
        # consume wastes another check, just advance()
        advance()
        term()   # pushes value of term on top of stack

        # when our 2nd term returns, we'll need to add both <term>'s
        co_code.append(BINARY_ADD)

    # when term() returns, if it sees another +
    # in the token stream, it will loop again

# <term> -> <factor> ('*' <factor>)*
def term():
    global sign
    sign = 1
    factor()
    # loop for ('*' <factor>)
    while (token.category == TIMES):
        advance()
        sign = 1   # reset sign before every factor call; <factor> is the only production with MINUS
        factor()
        # after our 2nd factor returns, we need to multiply the two factors
        co_code.append(BINARY_MULTIPLY)

# <factor> -> '+' <factor> | '-' <factor> | UNSIGNEDINT | NAME | '(' <expr> ')'
def factor():
    global sign
    # a lot of cases, all disjoint
    if (token.category == PLUS):
        advance()
        factor()
    elif (token.category == MINUS):
        sign = -sign
        advance()
        factor()

    # UNSIGNEDINT needs to save our const within co_consts
    # and append the appropriate bytecode instructions
    elif (token.category == UNSIGNEDINT):
        val = sign * int(token.lexeme)   # get our value
        # don't waste space with multiple copies; just reuse the same instance
        if val in co_consts:
            index = co_consts.index(val)
        else:
            # first time seeing the constant
            index = len(co_consts)
            co_consts.append(val)
        # generate bytecode: LOAD_CONST consti
        co_code.append(LOAD_CONST)
        co_code.append(index)
        advance()

    # NAME needs to generate LOAD_NAME after checking whether name
    # exists within co_names
    elif (token.category == NAME):
        # check if name has been declared
        if token.lexeme in co_names:
            index = co_names.index(token.lexeme)
        else:
            raise RuntimeError(f'Name: {token.lexeme} is not defined')
        # generate code
        co_code.append(LOAD_NAME)
        co_code.append(index)
        # check if we're a negative op
        if sign == -1:
            co_code.append(UNARY_NEGATIVE)
        advance()

    elif (token.category == LEFTPAREN):
        advance()
        # expr() will call term(), which resets our global sign of negation;
        # keep a local copy of this function call's negative state
        savesign = sign
        expr()
        if savesign == -1:   # 'savesign' is the sign outside '(' <expr> ')', not inside
            co_code.append(UNARY_NEGATIVE)
        consume(RIGHTPAREN)

    else:
        raise RuntimeError('Expecting a factor')
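# Before reading interpreter() below, it may help to trace the bytecode
# from the sketch above by hand (stack shown after each instruction):
#
#   LOAD_CONST 0   -> stack [5]
#   STORE_NAME 0   -> stack [],  co_values[0] = 5
#   LOAD_NAME  0   -> stack [5]
#   PRINT_EXPR     -> stack [],  prints 5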
###################
# The interpreter #
###################

# Read bytecode instructions from co_code and execute each one;
# loop until there are no more instructions (or an error).
# Note: since Python 3.6, CPython uses fixed 2-byte instructions, but
# here we mix 1-slot and 2-slot instructions, as older versions did.

def interpreter():
    stack = []
    pc = 0
    co_values = [None] * len(co_names)   # values correspond to variables
    # print(f'co_code: {co_code}')

    while pc < len(co_code):
        opcode = co_code[pc]   # get opcode (each instruction takes 1 or 2 slots)
        pc += 1                # increment pc
        # print(f'Loop Count: {pc}')

        # 1 byte instructions
        if opcode == UNARY_NEGATIVE:
            stack[-1] = -stack[-1]
        elif opcode == BINARY_MULTIPLY:
            operand1 = stack.pop()
            operand2 = stack.pop()
            stack.append(operand1 * operand2)
        elif opcode == BINARY_ADD:
            operand1 = stack.pop()
            operand2 = stack.pop()
            stack.append(operand1 + operand2)
        # no POP_TOP needed: PRINT_EXPR itself pops the value
        elif opcode == PRINT_EXPR:
            print(stack.pop())

        # 2 byte instructions
        elif opcode == STORE_NAME:
            index = co_code[pc]   # get the index for the value
            pc += 1
            operand = stack.pop()
            co_values[index] = operand   # update the table with the value
        elif opcode == LOAD_CONST:
            index = co_code[pc]
            pc += 1
            value = co_consts[index]
            stack.append(value)
        elif opcode == LOAD_NAME:
            index = co_code[pc]   # get the index of the variable
            pc += 1
            value = co_values[index]
            if value is None:     # variable was never assigned
                print(f'No value for {co_names[index]}')
                sys.exit(1)
            stack.append(value)
        else:
            break   # unknown opcode: stop executing

# Call main()

main()
--------------------------------------------------------------------------------
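Usage sketch for the hybrid interpreter above (a hypothetical invocation,
assuming it is saved as CH10/h1.py and run against CH10/test1.in):

    $ python h1.py test1.in

After the token trace and the parse confirmation message, the generated
bytecode runs and the two print statements should output 10 and 15.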