├── CH11
│   ├── a.out
│   └── ap1.s
├── .gitignore
├── CH10
│   ├── ex1.in
│   ├── test1.in
│   ├── test2.in
│   ├── p1.in
│   └── h1.py
├── CH7
│   ├── p0701a.in
│   ├── p0701b.in
│   ├── p0701c.in
│   ├── t1.in
│   ├── p1.in
│   ├── ch7_p1.py
│   └── p1.py
├── sample
├── CH5
│   ├── .ch5_p1.py.swp
│   └── ch5_p1.py
├── CH6
│   ├── t1.in
│   ├── __pycache__
│   │   └── tokenizer_1.cpython-38.pyc
│   ├── README.md
│   ├── t1.py
│   └── tokenizer_1.py
├── CH8
│   ├── p1.in
│   └── p1.py
├── README.md
├── CH4
│   ├── ch4_p9.py
│   ├── ch4_p11.py
│   ├── ch4_p2.py
│   ├── ch4_p1.py
│   ├── ch4_p4.py
│   ├── sp.py
│   └── ch4_p3.py
└── CH9
    └── p1.py

/CH11/a.out: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/.gitignore: -------------------------------------------------------------------------------- 1 | env 2 | --------------------------------------------------------------------------------
/CH10/ex1.in: -------------------------------------------------------------------------------- 1 | 2 + 3 2 | --------------------------------------------------------------------------------
/CH7/p0701a.in: -------------------------------------------------------------------------------- 1 | a = 2 | --------------------------------------------------------------------------------
/CH7/p0701b.in: -------------------------------------------------------------------------------- 1 | printf(3) 2 | --------------------------------------------------------------------------------
/CH7/p0701c.in: -------------------------------------------------------------------------------- 1 | print(3)) 2 | --------------------------------------------------------------------------------
/sample: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spak9/WICRPI/HEAD/sample --------------------------------------------------------------------------------
/CH10/test1.in: -------------------------------------------------------------------------------- 1 | x = 5 + 5 2 | print(x) 3 | y = x + 5 4 | print(y) 5 | --------------------------------------------------------------------------------
/CH10/test2.in: -------------------------------------------------------------------------------- 1 | x = (10 + (10 + 10)) 2 | y = (x * 10) 3 | print(x) 4 | print(y) 5 | --------------------------------------------------------------------------------
/CH5/.ch5_p1.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spak9/WICRPI/HEAD/CH5/.ch5_p1.py.swp --------------------------------------------------------------------------------
/CH6/t1.in: -------------------------------------------------------------------------------- 1 | print(-59 + 20*3) 2 | a = 2 3 | bb_1 = -a + 12 4 | print(a*bb_1 + a*3*(-1 + -1 + -1)) 5 | --------------------------------------------------------------------------------
/CH7/t1.in: -------------------------------------------------------------------------------- 1 | print(-59 + 20*3) 2 | a = 2 3 | bb_1 = -a + 12 4 | print(a*bb_1 + a*3*(-1 + -1 + -1)) 5 | --------------------------------------------------------------------------------
/CH10/p1.in: -------------------------------------------------------------------------------- 1 | print(-59 + 20*3) 2 | a = 2 3 | bb_1 = -(a) + 12 4 | print(a*bb_1 + a*3*(-1 + -1 + -1)) 5 | 6 | --------------------------------------------------------------------------------
/CH7/p1.in: -------------------------------------------------------------------------------- 1 | print(-59 + 20*3) 2 | a = 2 3 | bb_1 = -a + 12 4 | print(a*bb_1 + a*3*(-1 + -1 + -1)) 5 | 6 | --------------------------------------------------------------------------------
/CH8/p1.in: -------------------------------------------------------------------------------- 1 | -(-2+3) 2 | print(-59 + 20*3) 3 | a = 2 4 | bb_1 = -a + 12 5 | print(a*bb_1 + a*3*(-1 + -1 + -1)) 6 | 7 | --------------------------------------------------------------------------------
/CH6/__pycache__/tokenizer_1.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spak9/WICRPI/HEAD/CH6/__pycache__/tokenizer_1.cpython-38.pyc --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # WICRP 2 | A repository for working through Anthony J. Dos Reis's "Writing Interpreters and Compilers for the Raspberry Pi Using Python", 2nd Edition 3 | --------------------------------------------------------------------------------
/CH6/README.md: -------------------------------------------------------------------------------- 1 | # CH 6: Basic Tokenizer for a Python Subset 2 | 3 | ## Tokenizer 4 | The original tokenizer can be found in the software 5 | package from Reis, but I reconstructed the code 6 | purely for reinvention's sake. 7 | --------------------------------------------------------------------------------
/CH4/ch4_p9.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author Steven Pak 3 | Reis's Writing Interpreters and Compilers 2nd Edition 4 | 5 | ch 4 p9 6 | 7 | Grammar: 8 | <S> -> 'a' <A> 9 | <A> -> <B> | <C> 10 | <B> -> 'b' 'b' 'b' 11 | <C> -> 'c' 'c' 'c' 12 | ''' 13 | --------------------------------------------------------------------------------
/CH4/ch4_p11.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author Steven Pak 3 | Reis's Writing Interpreters and Compilers 2nd edition 4 | 5 | ch4 p11 6 | 7 | The current grammar of: 8 | <S> -> <S> 'a' 9 | <S> -> 'b' 10 | doesn't work well with top-down parsers because 11 | of left-recursion. 12 | 13 | The new grammar is: 14 | <S> -> 'b' <A> 15 | <A> -> 'a' <A> | lambda 16 | ''' 17 | --------------------------------------------------------------------------------
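Editor's note: ch4_p11.py states the rewritten grammar but contains no code. Below is a minimal sketch (not from Reis's package; recognize is a hypothetical helper) of a recognizer for the right-recursive form, assuming single-character tokens like the other CH4 parsers:

# sketch: recognizer for <S> -> 'b' <A>,  <A> -> 'a' <A> | lambda
def recognize(s):
    if not s or s[0] != 'b':            # <S> must start with 'b'
        raise RuntimeError("Expecting: b")
    rest = s[1:]
    while rest and rest[0] == 'a':      # <A> -> 'a' <A>, unrolled into a loop
        rest = rest[1:]
    if rest:                            # anything left over is garbage
        raise RuntimeError('Garbage following <S>-string')

recognize('baaa')   # accepted silently, like the other CH4 recognizers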
/CH11/ap1.s: -------------------------------------------------------------------------------- 1 | @ ap1.s 2 | .text @ start of read-only segment 3 | .global _start 4 | _start: 5 | ldr r0,x @ load r0 from x 6 | mov r7, #1 @ mov 1 into r7 7 | svc 0 @ supervisor call to terminate program 8 | 9 | x: .word 14 @ the variable x 10 | 11 | --------------------------------------------------------------------------------
/CH7/ch7_p1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The error with 'a =' is that the parser expects an <expr> but gets a 3 | NEWLINE. 4 | 5 | The error with 'printf(3)' is that, when tokenizing, the tokenizer views 6 | 'printf' as a NAME; Reis is using it as if it were a function (a C-style 7 | printf), which our grammar has no production for, so the parser expects 8 | an ASSIGNOP to follow the NAME token. 9 | 10 | The error with 'print(3))' is that nothing in our grammar supports 11 | unbalanced parentheses: after <printstmt> consumes 'print(3)', stmt() 12 | expects a NEWLINE but finds the extra ')', so the parser reports 13 | 'Expecting NEWLINE'. 14 | ''' 15 | --------------------------------------------------------------------------------
/CH4/ch4_p2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author Steven Pak 3 | Reis's Writing Interpreters & Compilers 2nd Edition 4 | 5 | Ch4 problem 2 solution 6 | 7 | Grammar: 8 | <S> -> 'a' <B> 'd' 9 | <B> -> ('b' 'b')* ['c'] 10 | ''' 11 | 12 | import sys 13 | 14 | tokenindex = -1 15 | token = '' 16 | 17 | def main(): 18 | try: 19 | parser() 20 | except RuntimeError as emsg: 21 | print(emsg) 22 | 23 | def advance(): 24 | global tokenindex, token 25 | tokenindex += 1 # increment index 26 | # check if we're at the end of string or given no input string 27 | if (len(sys.argv) < 2 or tokenindex >= len(sys.argv[1])): 28 | token = '' # the end 29 | else: 30 | token = sys.argv[1][tokenindex] # advance to next token (character) 31 | 32 | def consume(expected): 33 | if (expected == token): 34 | advance() 35 | else: 36 | raise RuntimeError(f'Expecting: {expected}') 37 | 38 | def parser(): 39 | # prime token with first token 40 | advance() 41 | S() 42 | 43 | 44 | def S(): 45 | consume('a') 46 | B() 47 | consume('d') 48 | 49 | def B(): 50 | # pairs of b's 51 | while (token == 'b'): 52 | advance() 53 | consume('b') 54 | 55 | # optional 'c' 56 | if (token == 'c'): 57 | advance() 58 | 59 | main() 60 | --------------------------------------------------------------------------------
/CH4/ch4_p1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author Steven Pak 3 | Reis's Writing Interpreters & Compilers 2nd Edition 4 | 5 | Ch4 problem 1 solution 6 | 7 | Grammar: 8 | <S> -> 'a' <S> 'b' 9 | <S> -> 'c' 10 | ''' 11 | 12 | import sys 13 | 14 | tokenindex = -1 15 | token = '' 16 | 17 | def main(): 18 | try: 19 | parser() 20 | except RuntimeError as emsg: 21 | print(emsg) 22 | 23 | def advance(): 24 | global tokenindex, token 25 | tokenindex += 1 # increment index 26 | # check if we're at the end of string or given no input string 27 | if (len(sys.argv) < 2 or tokenindex >= len(sys.argv[1])): 28 | token = '' # the end 29 | else: 30 | token = sys.argv[1][tokenindex] # advance to next token (character) 31 | 32 | def consume(expected): 33 | if (expected == token): 34 | advance() 35 | else: 36 | raise RuntimeError(f'Expecting: {expected}') 37 | 38 | def parser(): 39 | # prime token with first token 40 | advance() 41 | S() 42 | 43 | def S(): 44 | if (token == 'a'): 45 | # consuming would cause another check 46 | advance() 47 | S() 48 | consume('b') 49 | elif (token == 'c'): 50 | advance() 51 | else: 52 | raise RuntimeError('Expecting an a or c') 53 | 54 | 55 | # begin the program 56 | main() 57 | --------------------------------------------------------------------------------
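Editor's note: unlike sp.py and ch4_p4.py below, ch4_p1.py's parser() never checks that the input is exhausted after S() returns, so a string such as 'aacbbx' is accepted silently. A minimal sketch of the usual end-of-input check, mirroring the parsers that follow (parser_checked is a hypothetical drop-in for parser() above):

def parser_checked():
    advance()                 # prime token with first token
    S()
    if (token != ''):         # S() matched a valid <S>-string; anything left is garbage
        print('Garbage following <S>-string')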
/CH4/ch4_p4.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author Steven Pak 3 | Reis's Writing Interpreters & Compilers 2nd Edition 4 | 5 | Ch4 problem 4 solution 6 | 7 | Grammar: 8 | <S> -> 'a' <B> <S> 9 | <S> -> 'e' 10 | <B> -> 'b' 'c' 'd' 11 | ''' 12 | 13 | import sys 14 | 15 | tokenindex = -1 16 | token = '' 17 | 18 | def main(): 19 | try: 20 | parser() 21 | except RuntimeError as emsg: 22 | print(emsg) 23 | 24 | def advance(): 25 | global tokenindex, token 26 | tokenindex += 1 # increment index 27 | # check if we're at the end of string or given no input string 28 | if (len(sys.argv) < 2 or tokenindex >= len(sys.argv[1])): 29 | token = '' # the end 30 | else: 31 | token = sys.argv[1][tokenindex] # advance to next token (character) 32 | 33 | def consume(expected): 34 | if (expected == token): 35 | advance() 36 | else: 37 | raise RuntimeError(f'Expecting: {expected}') 38 | 39 | def parser(): 40 | # prime token with first token 41 | advance() 42 | S() 43 | 44 | # check if we're at the end of the string 45 | if (token != ''): 46 | print('Garbage within the <S>-string') 47 | else: 48 | print('<S>-string valid') 49 | 50 | 51 | def S(): 52 | if (token == 'a'): 53 | advance() 54 | B() 55 | S() 56 | elif (token == 'e'): 57 | advance() 58 | else: 59 | raise RuntimeError('Expecting a or e') 60 | def B(): 61 | consume('b') 62 | consume('c') 63 | consume('d') 64 | 65 | main() 66 | --------------------------------------------------------------------------------
/CH4/sp.py: -------------------------------------------------------------------------------- 1 | # From Reis's Writing Compilers 2nd Edition 2 | 3 | # Grammar: 4 | # S -> AC 5 | # A -> ab 6 | # C -> cC 7 | # C -> d 8 | 9 | import sys 10 | 11 | tokenindex = -1 12 | token = '' 13 | 14 | def main(): 15 | try: 16 | parser() 17 | except RuntimeError as emsg: 18 | print(emsg) 19 | 20 | def advance(): 21 | global tokenindex, token 22 | tokenindex += 1 # increment index 23 | # check if we're at the end of string or given no input string 24 | if (len(sys.argv) < 2 or tokenindex >= len(sys.argv[1])): 25 | token = '' # the end 26 | else: 27 | token = sys.argv[1][tokenindex] # advance to next token (character) 28 | 29 | def consume(expected): 30 | if (expected == token): 31 | advance() 32 | else: 33 | raise RuntimeError(f'Expecting: {expected}') 34 | 35 | def parser(): 36 | # prime token with first token 37 | advance() 38 | S() 39 | 40 | # check if we've finished the input string: S() chains calls to 41 | # the end, so if a token remains after S() returns, the input 42 | # didn't end with 'd' 43 | 44 | if token != '': 45 | print('Garbage following <S>-string') 46 | 47 | def S(): 48 | A() 49 | C() 50 | 51 | def A(): 52 | consume('a') 53 | consume('b') 54 | 55 | def C(): 56 | if (token == 'c'): 57 | advance() 58 | C() 59 | # if we reach 'd' token, then we've come to the end of the grammar and input string 60 | elif token == 'd': 61 | advance() 62 | else: 63 | raise RuntimeError('Expecting c or d') 64 | 65 | main() 66 | --------------------------------------------------------------------------------
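Editor's note: C()'s right-recursion (C -> cC | d) does no work after the recursive call, so it can equally be written as a loop. A minimal sketch (C_iterative is a hypothetical drop-in for C() in sp.py):

def C_iterative():
    while (token == 'c'):     # unroll C -> cC into iteration
        advance()
    consume('d')              # C -> d ends the chain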
/CH4/ch4_p3.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author Steven Pak 3 | Reis's Writing Interpreters & Compilers 2nd Edition 4 | 5 | Ch4 problem 3 solution 6 | 7 | Grammar: 8 | <S> -> 'a'* <B> 9 | <B> -> 'b'* <C> 10 | <C> -> 'c' ['d'|'e'] 'f' 11 | ''' 12 | 13 | import sys 14 | 15 | tokenindex = -1 16 | token = '' 17 | 18 | def main(): 19 | try: 20 | parser() 21 | except RuntimeError as emsg: 22 | print(emsg) 23 | 24 | def advance(): 25 | global tokenindex, token 26 | tokenindex += 1 # increment index 27 | # check if we're at the end of string or given no input string 28 | if (len(sys.argv) < 2 or tokenindex >= len(sys.argv[1])): 29 | token = '' # the end 30 | else: 31 | token = sys.argv[1][tokenindex] # advance to next token (character) 32 | 33 | def consume(expected): 34 | if (expected == token): 35 | advance() 36 | else: 37 | raise RuntimeError(f'Expecting: {expected}') 38 | 39 | def parser(): 40 | # prime token with first token 41 | advance() 42 | S() 43 | 44 | # check if we're at the end of the string 45 | if (token != ''): 46 | print('Garbage within the <S>-string') 47 | else: 48 | print('<S>-string valid') 49 | def S(): 50 | # loop while token is 'a' 51 | while (token == 'a'): 52 | advance() 53 | B() 54 | 55 | def B(): 56 | # loop while token is 'b' 57 | while (token == 'b'): 58 | advance() 59 | C() 60 | 61 | def C(): 62 | consume('c') 63 | # check between the optional tokens 64 | if (token == 'd'): 65 | advance() 66 | elif (token == 'e'): 67 | advance() 68 | consume('f') 69 | 70 | main() 71 | --------------------------------------------------------------------------------
/CH5/ch5_p1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author Steven Pak 3 | Reis's Writing Interpreters & Compilers 2nd Edition 4 | 5 | Ch5 problem 1 solution 6 | 7 | Grammar: 8 | <S> -> <A> <B> <C> 9 | <A> -> 'a' 10 | <A> -> '' 11 | <B> -> 'b' 12 | <B> -> '' 13 | <C> -> 'c' 14 | <C> -> '' 15 | ''' 16 | 17 | import sys 18 | 19 | tokenindex = -1 20 | token = '' 21 | 22 | def main(): 23 | try: 24 | parser() 25 | except RuntimeError as emsg: 26 | print(emsg) 27 | 28 | def advance(): 29 | global tokenindex, token 30 | tokenindex += 1 # increment index 31 | # check if we're at the end of string or given no input string 32 | if (len(sys.argv) < 2 or tokenindex >= len(sys.argv[1])): 33 | token = '' # the end 34 | else: 35 | token = sys.argv[1][tokenindex] # advance to next token (character) 36 | 37 | def consume(expected): 38 | if (expected == token): 39 | advance() 40 | else: 41 | raise RuntimeError(f'Expecting: {expected}') 42 | 43 | def parser(): 44 | # prime token with first token 45 | advance() 46 | S() 47 | 48 | if (token != ''): 49 | print('Garbage following <S>-string') 50 | else: 51 | print('pass') 52 | 53 | def S(): 54 | # Only 1 production; '' is included because <A>, <B>, and <C> 55 | # can all derive lambda, so the empty string is a valid <S>-string 56 | if (token in ['a', 'b', 'c', '']): 57 | A() 58 | B() 59 | C() 60 | else: 61 | raise RuntimeError('Expecting a, b, or c') 62 | 63 | def A(): 64 | # 1st production 65 | if (token == 'a'): 66 | advance() 67 | 68 | # 2nd production that is lambda, therefore 69 | # the current token is already on the next 70 | # production's token 71 | elif (token in ['b', 'c', '']): 72 | pass 73 | else: 74 | raise RuntimeError('Expecting a or lambda') 75 | 76 | def B(): 77 | if (token == 'b'): 78 | advance() 79 | elif (token in ['c', '']): 80 | pass 81 | else: 82 | raise RuntimeError('Expecting b or lambda') 83 | 84 | def C(): 85 | if (token == 'c'): 86 | advance() 87 | elif (token == ''): 88 | pass 89 | else: 90 | raise RuntimeError('Expecting c or lambda') 91 | 92 | main() 93 | --------------------------------------------------------------------------------
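Editor's note: the lambda branches in A(), B(), and C() above accept exactly the FOLLOW set of each nonterminal — the tokens that may legally come next when the production derives the empty string. A small sketch of those sets for this grammar, computed by hand ('' stands for end of input):

# FOLLOW sets for <S> -> <A> <B> <C>, where <A>/<B>/<C> -> 'a'/'b'/'c' | lambda
FOLLOW = {
    'A': {'b', 'c', ''},   # after <A>: first of <B>, first of <C>, or end
    'B': {'c', ''},        # after <B>: first of <C>, or end
    'C': {''},             # <C> is last, so only end of input
}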
/CH6/t1.py: -------------------------------------------------------------------------------- 1 | # t1.py tokenizer 2 | import sys # sys needed to access cmd line args and sys.exit() 3 | 4 | class Token: 5 | def __init__(self, line, column, category, lexeme): 6 | self.line = line # source prog line number of the token 7 | self.column = column # source prog col in which token starts 8 | self.category = category # category of the token 9 | self.lexeme = lexeme # token in string form 10 | 11 | # global variables 12 | trace = True # controls token trace 13 | source = '' # receives entire source program 14 | sourceindex = 0 # index into source 15 | line = 0 # current line number 16 | column = 0 # current column number 17 | tokenlist = [] # list of tokens created by tokenizer 18 | prevchar = '\n' # '\n' in prevchar signals start of new line 19 | blankline = True # reset to False if line is not blank 20 | 21 | # constants that represent token categories 22 | EOF = 0 # end of file 23 | PRINT = 1 # 'print' keyword 24 | UNSIGNEDINT = 2 # integer 25 | NAME = 3 # identifier that is not a keyword 26 | ASSIGNOP = 4 # '=' assignment operator 27 | LEFTPAREN = 5 # '(' 28 | RIGHTPAREN = 6 # ')' 29 | PLUS = 7 # '+' 30 | MINUS = 8 # '-' 31 | TIMES = 9 # '*' 32 | NEWLINE = 10 # newline character 33 | ERROR = 11 # if not any of the above, then error 34 | 35 | # displayable names for each token category 36 | catnames = ['EOF', 'PRINT', 'UNSIGNEDINT', 'NAME', 'ASSIGNOP', 37 | 'LEFTPAREN', 'RIGHTPAREN', 'PLUS', 'MINUS', 38 | 'TIMES', 'NEWLINE','ERROR'] 39 | 40 | # keywords and their token categories 41 | keywords = {'print': PRINT} 42 | 43 | # one-character tokens and their token categories 44 | smalltokens = {'=':ASSIGNOP, '(':LEFTPAREN, ')':RIGHTPAREN, 45 | '+':PLUS, '-':MINUS, '*':TIMES, '\n':NEWLINE, '':EOF} 46 | 47 | # main() reads input file and calls tokenizer() 48 | def main(): 49 | global source 50 | 51 | if len(sys.argv) == 2: # check if correct number of cmd line args 52 | try: 53 | infile = open(sys.argv[1], 'r') 54 | source = infile.read() # read source program 55 | except IOError: 56 | print('Cannot read input file ' + sys.argv[1]) 57 | sys.exit(1) 58 | else: 59 | print('Wrong number of command line arguments') 60 | print('format: python t1.py <infile>') 61 | sys.exit(1) 62 | 63 | if source[-1] != '\n': # add newline to end if missing 64 | source = source + '\n' 65 | 66 | if trace: # for token trace 67 | print('Line Col Category Lexeme\n') 68 | 69 | try: 70 | tokenizer() # tokenize source code in source 71 | except RuntimeError as emsg: 72 | # output slash n in place of newline 73 | lexeme = token.lexeme.replace('\n', '\\n') 74 | print('\nError on '+ "'" + lexeme + "'" + ' line ' + 75 | str(token.line) + ' column ' + str(token.column)) 76 | print(emsg) # message from RuntimeError object 77 | sys.exit(1) # 1 return code indicates an error has occurred 78 | 79 | # tokenizer tokenizes tokens in source code and appends them to tokens 80 | def tokenizer(): 81 | global token 82 | curchar = ' ' # prime curchar with space 83 | 84 | while True: 85 | # skip whitespace but not newlines 86 | while curchar != '\n' and curchar.isspace(): 87 | curchar = getchar() # get next char from source program 88 | 89 | # construct and initialize a new token 90 | token = Token(line, column, None, '') 91 | 92 | if curchar.isdigit(): # start of unsigned int? 93 | token.category = UNSIGNEDINT # save category of token 94 | while True: 95 | token.lexeme += curchar # append curchar to lexeme 96 | curchar = getchar() # get next character 97 | if not curchar.isdigit(): # break if not a digit 98 | break 99 | 100 | elif curchar.isalpha() or curchar == '_': # start of name?
101 | while True: 102 | token.lexeme += curchar # append curchar to lexeme 103 | curchar = getchar() # get next character 104 | # break if not letter, '_', or digit 105 | if not (curchar.isalnum() or curchar == '_'): 106 | break 107 | 108 | # determine if lexeme is a keyword or name of variable 109 | if token.lexeme in keywords: 110 | token.category = keywords[token.lexeme] 111 | else: 112 | token.category = NAME 113 | 114 | elif curchar in smalltokens: 115 | token.category = smalltokens[curchar] # get category 116 | token.lexeme = curchar 117 | curchar = getchar() # move to first char after token 118 | 119 | else: 120 | token.category = ERROR # invalid token 121 | token.lexeme = curchar # save lexeme 122 | raise RuntimeError('Invalid token') 123 | 124 | tokenlist.append(token) # append token to tokens list 125 | if trace: # display token if trace is True 126 | print("%3s %4s %-14s %s" % (str(token.line), 127 | str(token.column), catnames[token.category], token.lexeme)) 128 | 129 | if token.category == EOF: # finished tokenizing? 130 | break 131 | 132 | # getchar() gets next char from source and adjusts line and column 133 | def getchar(): 134 | global sourceindex, column, line, prevchar, blankline 135 | 136 | # check if starting a new line 137 | if prevchar == '\n': # '\n' signals start of a new line 138 | line += 1 # increment line number 139 | column = 0 # reset column number 140 | blankline = True # initialize blankline 141 | 142 | if sourceindex >= len(source): # at end of source code? 143 | column = 1 # set EOF column to 1 144 | prevchar = '' # save current char for next call 145 | return '' # null str signals end of source 146 | 147 | c = source[sourceindex] # get next char in the source program 148 | sourceindex += 1 # increment sourceindex to next character 149 | column += 1 # increment column number 150 | if not c.isspace(): # if c not whitespace then line not blank 151 | blankline = False # indicate line not blank 152 | prevchar = c # save current char for next call 153 | 154 | # if at end of blank line, return space in place of '\n' 155 | if c == '\n' and blankline: 156 | return ' ' 157 | else: 158 | return c # return character to tokenizer() 159 | 160 | main() # call main function 161 | 162 | --------------------------------------------------------------------------------
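Editor's note: a quick way to see t1.py in action — for a one-line program such as x = 5 (prog.in is a hypothetical file name), the trace printed by the loop above should look roughly like this; the NEWLINE lexeme prints as a literal newline, and EOF is reported at line 2, column 1:

python t1.py prog.in

Line Col Category       Lexeme

  1    1 NAME           x
  1    3 ASSIGNOP       =
  1    5 UNSIGNEDINT    5
  1    6 NEWLINE
  2    1 EOF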
/CH6/tokenizer_1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The 1st version of the Python tokenizer from 3 | Reis's Compilers book. Original version can be 4 | found in his software package and this 5 | version is simply for reinvention's sake. 6 | ''' 7 | 8 | import sys # cmd line args 9 | 10 | class Token: 11 | ''' The only class the tokenizer needs: a token, which consists of 12 | the literal lexeme, the token type (category), and position 13 | properties for error reporting 14 | ''' 15 | def __init__(self, line, column, category, lexeme): 16 | self.line = line # source program line number 17 | self.column = column # the column/index within the line 18 | self.category = category # token type 19 | self.lexeme = lexeme # the literal string 20 | 21 | ''' Global variables ''' 22 | trace = True 23 | source = '' # The whole source program 24 | sourceindex = 0 # the index for source 25 | line = 0 # the actual line 26 | column = 0 # the character index within a line 27 | tokenlist = [] # list holding ALL tokens; for parser 28 | prevchar = '\n' # '\n' signals start of new line 29 | blankline = True # False if line is not blank 30 | 31 | ''' Token Categories ''' 32 | EOF = 0 # end of file 33 | PRINT = 1 # 'print' keyword 34 | UNSIGNEDINT = 2 # integer 35 | NAME = 3 # identifier that is not a keyword 36 | ASSIGNOP = 4 # '=' assignment operator 37 | LEFTPAREN = 5 # '(' 38 | RIGHTPAREN = 6 # ')' 39 | PLUS = 7 # '+' 40 | MINUS = 8 # '-' 41 | TIMES = 9 # '*' 42 | NEWLINE = 10 # newline character 43 | ERROR = 11 # if not any of the above, then error 44 | 45 | # displayable names for each token category, 46 | # indices match up with category values 47 | catnames = ['EOF', 'PRINT', 'UNSIGNEDINT', 'NAME', 'ASSIGNOP', 48 | 'LEFTPAREN', 'RIGHTPAREN', 'PLUS', 'MINUS', 49 | 'TIMES', 'NEWLINE','ERROR'] 50 | 51 | # keywords and their token categories 52 | keywords = {'print': PRINT} 53 | 54 | # one-character tokens and their token categories 55 | smalltokens = {'=':ASSIGNOP, '(':LEFTPAREN, ')':RIGHTPAREN, 56 | '+':PLUS, '-':MINUS, '*':TIMES, '\n':NEWLINE, '':EOF} 57 | 58 | # reads in source program file & calls tokenizer, 59 | # eventually returning with an Exception or a full list of tokens 60 | def main(): 61 | global source # source program 62 | 63 | # correct no. of cmd line args
64 | if (len(sys.argv) == 2): 65 | with open(sys.argv[1]) as f: 66 | source = f.read() # returns the whole file as a string 67 | 68 | else: 69 | print('Incorrect number of cmd-line args') 70 | print('format: python tokenizer_1.py <infile>') 71 | sys.exit(1) 72 | 73 | # for text editors that DON'T end the file with '\n': 74 | # if the trailing newline is missing, add one 75 | if (source[-1] != '\n'): 76 | source = source + '\n' 77 | 78 | # run the tokenizer 79 | try: 80 | tokenizer() 81 | except RuntimeError as emsg: 82 | print(emsg) 83 | sys.exit(1) 84 | 85 | def tokenizer(): 86 | ''' Tokenizes tokens in source code and appends them to 'tokenlist' ''' 87 | global token 88 | curchar = ' ' # prime curchar with a space so the skip loop below fetches the first char 89 | 90 | # tokenize through the whole source program 91 | while True: 92 | # skip the white spaces, but not \n 93 | while (curchar != '\n' and curchar.isspace()): 94 | curchar = getchar() # get next char 95 | 96 | # create a new token; category & lexeme are tbd 97 | token = Token(line, column, None, '') 98 | 99 | # Now we tokenize based on the current character we can see 100 | 101 | # case 1: unsigned ints 102 | if (curchar.isdigit()): 103 | token.category = UNSIGNEDINT 104 | # get the whole string of numbers 105 | while True: 106 | token.lexeme += curchar # append 107 | curchar = getchar() # update char 108 | if not curchar.isdigit(): # break if no longer digit 109 | break 110 | 111 | # case 2: keywords or identifier 112 | elif (curchar.isalpha() or curchar == '_'): 113 | while True: 114 | token.lexeme += curchar # append 115 | curchar = getchar() # get next 116 | if not (curchar.isalnum() or curchar == '_'): 117 | break 118 | 119 | # check if lexeme is a keyword or identifier 120 | if (token.lexeme in keywords): 121 | token.category = keywords[token.lexeme] # PRINT 122 | else: 123 | token.category = NAME # else, it's an identifier 124 | 125 | # case 3: operators/small tokens 126 | elif (curchar in smalltokens): 127 | token.category = smalltokens[curchar] # get category 128 | token.lexeme = curchar 129 | curchar = getchar() # move to first char after token 130 | 131 | # case 4: not a valid token 132 | else: 133 | token.category = ERROR 134 | token.lexeme = curchar 135 | raise RuntimeError('Invalid Token') 136 | 137 | # append to tokenlist 138 | tokenlist.append(token) 139 | 140 | if (trace): 141 | print("%3s %4s %-14s %s" % (str(token.line), 142 | str(token.column), catnames[token.category], token.lexeme)) 143 | 144 | if (token.category == EOF): # end of tokenizing 145 | break 146 | 147 | def getchar(): 148 | ''' returns the next character in the source program and 149 | adjusts 'line' and 'column' globals if needed.
150 | It also returns '' at EOF and ' ' for a blank line's newline 151 | ''' 152 | global sourceindex, column, line, prevchar, blankline 153 | 154 | # if we're starting a new line, then we must update the global 155 | # properties for where we are in the source program 156 | 157 | # if the prevchar is a \n, then we must be at a new line, 158 | # but we'll check for blank line soon 159 | if (prevchar == '\n'): # saying that we've seen a new line 160 | line += 1 161 | column = 0 162 | blankline = True # first assume that we've reached a blank line 163 | 164 | # check if we're at end of source 165 | if (sourceindex >= len(source)): 166 | column = 1 167 | prevchar = '' 168 | return '' # empty string signals EOF 169 | 170 | # get c 171 | c = source[sourceindex] 172 | sourceindex += 1 173 | column += 1 174 | # if 'c' is NOT a whitespace, then it must not be a blank line 175 | if (not c.isspace()): 176 | blankline = False 177 | prevchar = c # update prevchar 178 | 179 | # if at end of blank line (just 1 \n), return space instead 180 | if (c == '\n' and blankline): 181 | return ' ' 182 | else: 183 | return c # return character to tokenizer() 184 | 185 | main() 186 | --------------------------------------------------------------------------------
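Editor's note on the blank-line handling above: because getchar() returns ' ' instead of '\n' for a line containing only whitespace, blank lines never produce NEWLINE tokens. A hypothetical three-line source, replayed by hand:

# source = '\n\nx\n'  ->  tokens: NAME 'x', NEWLINE, EOF
# the two leading '\n' come back as ' ' (blankline is still True),
# so the whitespace-skip loop in tokenizer() swallows them and the
# parser never sees an empty statement.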
/CH7/p1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 1st basic parser of a simple subset of the Python 3 | language. This is mostly a reinvention from 4 | Reis's Writing Compilers 5 | ''' 6 | 7 | import sys # sys needed to access cmd line args and sys.exit() 8 | 9 | class Token: 10 | def __init__(self, line, column, category, lexeme): 11 | self.line = line # srce program line number of the token 12 | self.column = column # srce program col in which token starts 13 | self.category = category # category of the token 14 | self.lexeme = lexeme # token in string form 15 | 16 | # globals 17 | trace = True # controls token trace 18 | source = '' # receives entire source program 19 | sourceindex = 0 # index into the source code in source 20 | line = 0 # current line number 21 | column = 0 # current column number 22 | tokenlist = [] # list of tokens created by tokenizer 23 | tokenindex = -1 # index of current token in tokens 24 | token = None # current token 25 | prevchar = '\n' # '\n' in prevchar signals start of new line 26 | blankline = True # reset to False if line is not blank 27 | 28 | # constants that represent token categories 29 | EOF = 0 # end of file 30 | PRINT = 1 # 'print' keyword 31 | UNSIGNEDINT = 2 # unsigned integer 32 | NAME = 3 # identifier that is not a keyword 33 | ASSIGNOP = 4 # '=' assignment operator 34 | LEFTPAREN = 5 # '(' 35 | RIGHTPAREN = 6 # ')' 36 | PLUS = 7 # '+' 37 | MINUS = 8 # '-' 38 | TIMES = 9 # '*' 39 | NEWLINE = 10 # end of line 40 | ERROR = 11 # if not any of the above, then error 41 | 42 | # displayable names for each token category 43 | catnames = ['EOF', 'PRINT', 'UNSIGNEDINT', 'NAME', 'ASSIGNOP', 44 | 'LEFTPAREN', 'RIGHTPAREN', 'PLUS', 'MINUS', 45 | 'TIMES', 'NEWLINE','ERROR'] 46 | 47 | # keywords and their token categories 48 | keywords = {'print': PRINT} 49 | 50 | # one-character tokens and their token categories 51 | smalltokens = {'=':ASSIGNOP, '(':LEFTPAREN, ')':RIGHTPAREN, 52 | '+':PLUS, '-':MINUS, '*':TIMES, '\n':NEWLINE, '':EOF} 53 | 54 | ################# 55 | # main function # 56 | ################# 57 | # main() reads input file and calls tokenizer() 58 | def main(): 59 | global source 60 | 61 | if len(sys.argv) == 2: # check if correct number of cmd line args 62 | try: 63 | infile = open(sys.argv[1], 'r') 64 | source = infile.read() # read source program 65 | except IOError: 66 | print('Cannot read input file ' + sys.argv[1]) 67 | sys.exit(1) 68 | else: 69 | print('Wrong number of command line arguments') 70 | print('Format: python p1.py <infile>') 71 | sys.exit(1) 72 | 73 | if source[-1] != '\n': # add newline to end if missing 74 | source = source + '\n' 75 | 76 | if trace: 77 | print('------------------------------------------- Token trace') 78 | print('Line Col Category Lexeme\n') 79 | 80 | 81 | try: 82 | tokenizer() 83 | parser() 84 | 85 | # on an error, display an error message 86 | # token is the token object on which the error was detected 87 | except RuntimeError as emsg: 88 | # output slash n in place of newline 89 | lexeme = token.lexeme.replace('\n', '\\n') 90 | print('\nError on '+ "'" + lexeme + "'" + ' line ' + 91 | str(token.line) + ' column ' + str(token.column)) 92 | print(emsg) # message from RuntimeError object 93 | sys.exit(1) 94 | 95 | #################### 96 | # tokenizer # 97 | #################### 98 | def tokenizer(): 99 | global token 100 | curchar = ' ' # prime curchar with space 101 | 102 | while True: 103 | # skip whitespace but not newlines 104 | while curchar != '\n' and curchar.isspace(): 105 | curchar = getchar() # get next char from source program 106 | 107 | # construct and initialize token 108 | token = Token(line, column, None, '') 109 | 110 | if curchar.isdigit(): # start of unsigned int? 111 | token.category = UNSIGNEDINT # save category of token 112 | while True: 113 | token.lexeme += curchar # append curchar to lexeme 114 | curchar = getchar() # get next character 115 | if not curchar.isdigit(): # break if not a digit 116 | break 117 | 118 | elif curchar.isalpha() or curchar == '_': # start of name? 119 | while True: 120 | token.lexeme += curchar # append curchar to lexeme 121 | curchar = getchar() # get next character 122 | # break if not letter, '_', or digit 123 | if not (curchar.isalnum() or curchar == '_'): 124 | break 125 | 126 | # determine if lexeme is a keyword or name of variable 127 | if token.lexeme in keywords: 128 | token.category = keywords[token.lexeme] 129 | else: 130 | token.category = NAME 131 | 132 | elif curchar in smalltokens: 133 | token.category = smalltokens[curchar] # get category 134 | token.lexeme = curchar 135 | curchar = getchar() # move to first char after the token 136 | 137 | else: 138 | token.category = ERROR # invalid token 139 | token.lexeme = curchar 140 | raise RuntimeError('Invalid token') 141 | 142 | tokenlist.append(token) # append token to tokens list 143 | if trace: # display token if trace is True 144 | print("%3s %4s %-14s %s" % (str(token.line), 145 | str(token.column), catnames[token.category], token.lexeme)) 146 | 147 | if token.category == EOF: # finished tokenizing? 148 | break 149 | 150 | # getchar() gets next char from source and adjusts line and column 151 | def getchar(): 152 | global sourceindex, column, line, prevchar, blankline 153 | 154 | # check if starting a new line 155 | if prevchar == '\n': # '\n' signals start of a new line 156 | line += 1 # increment line number 157 | column = 0 # reset column number 158 | blankline = True # initialize blankline 159 | 160 | if sourceindex >= len(source): # at end of source code?
161 | column = 1 # set EOF column to 1 162 | prevchar = '' # save current char for next call 163 | return '' # null str signals end of source 164 | 165 | c = source[sourceindex] # get next char in the source program 166 | sourceindex += 1 # increment sourceindex to next character 167 | column += 1 # increment column number 168 | if not c.isspace(): # if c not whitespace then line not blank 169 | blankline = False # indicate line not blank 170 | prevchar = c # save current character 171 | 172 | # if at end of blank line, return space in place of '\n' 173 | if c == '\n' and blankline: 174 | return ' ' 175 | else: 176 | return c # return character to tokenizer() 177 | 178 | #################### 179 | # Simple Parser # 180 | #################### 181 | 182 | # begin the parser, starting with the 1st token in tokenlist 183 | def parser(): 184 | advance() 185 | program() 186 | 187 | # major function 1: advance() 188 | def advance(): 189 | ''' update the global token to the next token 190 | from tokenlist ''' 191 | global token, tokenindex 192 | tokenindex += 1 # move to next token 193 | if (tokenindex >= len(tokenlist)): # reached the end 194 | raise RuntimeError('Unexpected EOF') 195 | token = tokenlist[tokenindex] 196 | 197 | # major function 2: consume() 198 | def consume(expectedcat): 199 | # check current token with expected 200 | if (token.category == expectedcat): 201 | advance() # get next token 202 | else: 203 | raise RuntimeError('Expecting ' + catnames[expectedcat]) 204 | 205 | # <program> -> <stmt>* EOF 206 | def program(): 207 | print('parsing has started') 208 | # although the grammar just says <stmt>*, note that semantically, this means 209 | # that a program consists of 0 or more statements that 210 | # all begin with some 'NAME' or 'PRINT' token 211 | while (token.category in [NAME, PRINT]): 212 | print('entering stmt loop') 213 | stmt() 214 | if (token.category != EOF): 215 | raise RuntimeError('Expecting EOF') 216 | print(token.category) 217 | 218 | 219 | # <stmt> -> <simplestmt> NEWLINE 220 | def stmt(): 221 | print('stmt()') 222 | # note that we don't consume a 'simplestmt', but rather just call it 223 | # this is because simplestmt is a non-terminal, and not a token (terminal) 224 | simplestmt() 225 | # NEWLINE is a token, therefore we'll consume 226 | consume(NEWLINE) 227 | 228 | # <simplestmt> -> <assignmentstmt> | <printstmt> 229 | def simplestmt(): 230 | print('simple') 231 | # this is where FIRST sets come in 232 | if (token.category == NAME): 233 | assignmentstmt() 234 | elif (token.category == PRINT): 235 | printstmt() 236 | else: 237 | raise RuntimeError('Expecting NAME or PRINT') 238 | 239 | # <assignmentstmt> -> NAME '=' <expr> 240 | def assignmentstmt(): 241 | print('assignmentstmt') 242 | consume(NAME) 243 | consume(ASSIGNOP) 244 | expr() 245 | 246 | # <printstmt> -> 'print' '(' <expr> ')' 247 | def printstmt(): 248 | print('printstmt') 249 | consume(PRINT) 250 | consume(LEFTPAREN) 251 | expr() 252 | consume(RIGHTPAREN) 253 | 254 | # <expr> -> <term> ('+' <term>)* 255 | def expr(): 256 | print('expr') 257 | term() 258 | # loop for ('+' <term>) 259 | while (token.category == PLUS): 260 | # consume wastes another check, just advance() 261 | advance() 262 | term() 263 | # when term() returns, if it sees another + 264 | # in the token stream, it will loop again 265 | 266 | # <term> -> <factor> ('*' <factor>)* 267 | def term(): 268 | print('term') 269 | factor() 270 | # loop for ('*' <factor>) 271 | while (token.category == TIMES): 272 | advance() 273 | factor() 274 | 275 | # <factor> -> '+' <factor> | '-' <factor> | UNSIGNEDINT | NAME | '(' <expr> ')' 276 | def factor(): 277 | print('factor') 278 | # a lot of cases, all disjoint 279 | if (token.category == PLUS or token.category == MINUS): 280 |
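# (editor's note: in this chapter the parser only checks syntax, so a
# unary '+' or '-' is skipped without recording the sign; the CH8
# interpreter adds a 'sign' global at this exact spot to evaluate it)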
advance() 281 | factor() 282 | elif (token.category == UNSIGNEDINT or token.category == NAME): 283 | advance() 284 | elif (token.category == LEFTPAREN): 285 | advance() 286 | expr() 287 | consume(RIGHTPAREN) 288 | else: 289 | raise RuntimeError('Expecting a factor') 290 | 291 | 292 | 293 | 294 | 295 | main() 296 | --------------------------------------------------------------------------------
/CH8/p1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 1st basic parser of a simple subset of the Python 3 | language. This is mostly a reinvention from 4 | Reis's Writing Compilers 5 | 6 | This version is the basic interpreter 7 | ''' 8 | 9 | import sys # sys needed to access cmd line args and sys.exit() 10 | 11 | class Token: 12 | def __init__(self, line, column, category, lexeme): 13 | self.line = line # srce program line number of the token 14 | self.column = column # srce program col in which token starts 15 | self.category = category # category of the token 16 | self.lexeme = lexeme # token in string form 17 | 18 | # globals 19 | trace = True # controls token trace 20 | grade = False 21 | source = '' # receives entire source program 22 | sourceindex = 0 # index into the source code in source 23 | line = 0 # current line number 24 | column = 0 # current column number 25 | tokenlist = [] # list of tokens created by tokenizer 26 | tokenindex = -1 # index of current token in tokens 27 | token = None # current token 28 | prevchar = '\n' # '\n' in prevchar signals start of new line 29 | blankline = True # reset to False if line is not blank 30 | symtab = {} # symbol table mapping variable names to their values 31 | operandstack = [] # stack holding intermediate operand values during evaluation 32 | sign = 0 # +1/-1 multiplier recording the parity of unary minuses 33 | 34 | # constants that represent token categories 35 | EOF = 0 # end of file 36 | PRINT = 1 # 'print' keyword 37 | UNSIGNEDINT = 2 # unsigned integer 38 | NAME = 3 # identifier that is not a keyword 39 | ASSIGNOP = 4 # '=' assignment operator 40 | LEFTPAREN = 5 # '(' 41 | RIGHTPAREN = 6 # ')' 42 | PLUS = 7 # '+' 43 | MINUS = 8 # '-' 44 | TIMES = 9 # '*' 45 | NEWLINE = 10 # end of line 46 | ERROR = 11 # if not any of the above, then error 47 | 48 | # displayable names for each token category 49 | catnames = ['EOF', 'PRINT', 'UNSIGNEDINT', 'NAME', 'ASSIGNOP', 50 | 'LEFTPAREN', 'RIGHTPAREN', 'PLUS', 'MINUS', 51 | 'TIMES', 'NEWLINE','ERROR'] 52 | 53 | # keywords and their token categories 54 | keywords = {'print': PRINT} 55 | 56 | # one-character tokens and their token categories 57 | smalltokens = {'=':ASSIGNOP, '(':LEFTPAREN, ')':RIGHTPAREN, 58 | '+':PLUS, '-':MINUS, '*':TIMES, '\n':NEWLINE, '':EOF} 59 | 60 | ################# 61 | # main function # 62 | ################# 63 | # main() reads input file and calls tokenizer() 64 | def main(): 65 | global source 66 | 67 | if len(sys.argv) == 2: # check if correct number of cmd line args 68 | try: 69 | infile = open(sys.argv[1], 'r') 70 | source = infile.read() # read source program 71 | except IOError: 72 | print('Cannot read input file ' + sys.argv[1]) 73 | sys.exit(1) 74 | else: 75 | print('Wrong number of command line arguments') 76 | print('Format: python p1.py <infile>') 77 | sys.exit(1) 78 | 79 | if source[-1] != '\n': # add newline to end if missing 80 | source = source + '\n' 81 | 82 | if trace: 83 | print('------------------------------------------- Token trace') 84 | print('Line Col Category Lexeme\n') 85 | 86 | 87 | try: 88 |
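# (two passes: tokenize the entire source into tokenlist first, then parse it)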
tokenizer() 89 | parser() 90 | 91 | # on an error, display an error message 92 | # token is the token object on which the error was detected 93 | except RuntimeError as emsg: 94 | # output slash n in place of newline 95 | lexeme = token.lexeme.replace('\n', '\\n') 96 | print('\nError on '+ "'" + lexeme + "'" + ' line ' + 97 | str(token.line) + ' column ' + str(token.column)) 98 | print(emsg) # message from RuntimeError object 99 | sys.exit(1) 100 | 101 | #################### 102 | # tokenizer # 103 | #################### 104 | def tokenizer(): 105 | global token 106 | curchar = ' ' # prime curchar with space 107 | 108 | while True: 109 | # skip whitespace but not newlines 110 | while curchar != '\n' and curchar.isspace(): 111 | curchar = getchar() # get next char from source program 112 | 113 | # construct and initialize token 114 | token = Token(line, column, None, '') 115 | 116 | if curchar.isdigit(): # start of unsigned int? 117 | token.category = UNSIGNEDINT # save category of token 118 | while True: 119 | token.lexeme += curchar # append curchar to lexeme 120 | curchar = getchar() # get next character 121 | if not curchar.isdigit(): # break if not a digit 122 | break 123 | 124 | elif curchar.isalpha() or curchar == '_': # start of name? 125 | while True: 126 | token.lexeme += curchar # append curchar to lexeme 127 | curchar = getchar() # get next character 128 | # break if not letter, '_', or digit 129 | if not (curchar.isalnum() or curchar == '_'): 130 | break 131 | 132 | # determine if lexeme is a keyword or name of variable 133 | if token.lexeme in keywords: 134 | token.category = keywords[token.lexeme] 135 | else: 136 | token.category = NAME 137 | 138 | elif curchar in smalltokens: 139 | token.category = smalltokens[curchar] # get category 140 | token.lexeme = curchar 141 | curchar = getchar() # move to first char after the token 142 | 143 | else: 144 | token.category = ERROR # invalid token 145 | token.lexeme = curchar 146 | raise RuntimeError('Invalid token') 147 | 148 | tokenlist.append(token) # append token to tokens list 149 | if trace: # display token if trace is True 150 | print("%3s %4s %-14s %s" % (str(token.line), 151 | str(token.column), catnames[token.category], token.lexeme)) 152 | 153 | if token.category == EOF: # finished tokenizing? 154 | break 155 | 156 | # getchar() gets next char from source and adjusts line and column 157 | def getchar(): 158 | global sourceindex, column, line, prevchar, blankline 159 | 160 | # check if starting a new line 161 | if prevchar == '\n': # '\n' signals start of a new line 162 | line += 1 # increment line number 163 | column = 0 # reset column number 164 | blankline = True # initialize blankline 165 | 166 | if sourceindex >= len(source): # at end of source code? 
167 | column = 1 # set EOF column to 1 168 | prevchar = '' # save current char for next call 169 | return '' # null str signals end of source 170 | 171 | c = source[sourceindex] # get next char in the source program 172 | sourceindex += 1 # increment sourceindex to next character 173 | column += 1 # increment column number 174 | if not c.isspace(): # if c not whitespace then line not blank 175 | blankline = False # indicate line not blank 176 | prevchar = c # save current character 177 | 178 | # if at end of blank line, return space in place of '\n' 179 | if c == '\n' and blankline: 180 | return ' ' 181 | else: 182 | return c # return character to tokenizer() 183 | 184 | #################### 185 | # Simple Parser # 186 | #################### 187 | 188 | # begin the parser, starting with the 1st token in tokenlist 189 | def parser(): 190 | advance() 191 | program() 192 | 193 | # major function 1: advance() 194 | def advance(): 195 | ''' update the global token to the next token 196 | from tokenlist ''' 197 | global token, tokenindex 198 | tokenindex += 1 # move to next token 199 | if (tokenindex >= len(tokenlist)): # reached the end 200 | raise RuntimeError('Unexpected EOF') 201 | token = tokenlist[tokenindex] 202 | 203 | # major function 2: consume() 204 | def consume(expectedcat): 205 | # check current token with expected 206 | if (token.category == expectedcat): 207 | advance() # get next token 208 | else: 209 | raise RuntimeError('Expecting ' + catnames[expectedcat]) 210 | 211 | # <program> -> <stmt>* EOF 212 | def program(): 213 | # although the grammar just says <stmt>*, note that semantically, this means 214 | # that a program consists of 0 or more statements that 215 | # all begin with some 'NAME' or 'PRINT' token 216 | while (token.category in [NAME, PRINT]): 217 | stmt() 218 | if (token.category != EOF): 219 | raise RuntimeError('Expecting EOF') 220 | print(token.category) 221 | 222 | 223 | # <stmt> -> <simplestmt> NEWLINE 224 | def stmt(): 225 | # note that we don't consume a 'simplestmt', but rather just call it 226 | # this is because simplestmt is a non-terminal, and not a token (terminal) 227 | simplestmt() 228 | # NEWLINE is a token, therefore we'll consume 229 | consume(NEWLINE) 230 | 231 | # <simplestmt> -> <assignmentstmt> | <printstmt> 232 | def simplestmt(): 233 | # this is where FIRST sets come in 234 | if (token.category == NAME): 235 | assignmentstmt() 236 | elif (token.category == PRINT): 237 | printstmt() 238 | else: 239 | raise RuntimeError('Expecting NAME or PRINT') 240 | 241 | # <assignmentstmt> -> NAME '=' <expr> 242 | def assignmentstmt(): 243 | left = token.lexeme # will be the key into the symbol table 244 | consume(NAME) 245 | consume(ASSIGNOP) 246 | expr() 247 | 248 | # after expr() returns, it will have pushed <expr>'s value 249 | # to the top 250 | symtab[left] = operandstack.pop() 251 | 252 | # <printstmt> -> 'print' '(' <expr> ')' 253 | def printstmt(): 254 | consume(PRINT) 255 | consume(LEFTPAREN) 256 | # expr() will have pushed <expr>'s value onto the stack 257 | expr() 258 | print(operandstack.pop()) 259 | consume(RIGHTPAREN) 260 | 261 | # <expr> -> <term> ('+' <term>)* 262 | def expr(): 263 | term() # pushes value of term on top of stack 264 | # loop for ('+' <term>) 265 | while (token.category == PLUS): 266 | # consume wastes another check, just advance() 267 | advance() 268 | term() # pushes value of term on top of stack 269 | rightoperand = operandstack.pop() 270 | leftoperand = operandstack.pop() 271 | operandstack.append(leftoperand + rightoperand) 272 | # when term() returns, if it sees another + 273 | # in the token stream, it will loop again 274 | 275 | # <term> -> <factor> ('*' <factor>)* 276 | def term(): 277 | global sign 278 | sign = 1 279 | factor()
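# (each '*' below is folded as soon as its right operand is parsed,
# so evaluation is left-associative)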
280 | # loop for ('*' <factor>) 281 | while (token.category == TIMES): 282 | advance() 283 | sign = 1 # reset sign before every factor() call because <factor> is the only production that handles MINUS 284 | factor() 285 | rightoperand = operandstack.pop() 286 | leftoperand = operandstack.pop() 287 | operandstack.append(leftoperand * rightoperand) 288 | 289 | # <factor> -> '+' <factor> | '-' <factor> | UNSIGNEDINT | NAME | '(' <expr> ')' 290 | def factor(): 291 | global sign 292 | # a lot of cases, all disjoint 293 | if (token.category == PLUS): 294 | advance() 295 | factor() 296 | elif (token.category == MINUS): 297 | sign = -sign 298 | advance() 299 | factor() 300 | elif (token.category == UNSIGNEDINT): 301 | operandstack.append(sign * int(token.lexeme)) 302 | advance() 303 | elif (token.category == NAME): 304 | if (token.lexeme in symtab): 305 | operandstack.append(sign * symtab[token.lexeme]) 306 | else: 307 | raise RuntimeError(f'Name: {token.lexeme} is not defined') 308 | advance() 309 | elif (token.category == LEFTPAREN): 310 | advance() 311 | # need to save sign because expr() will cause global one to change 312 | savesign = sign 313 | expr() 314 | # if our current factor() call's sign was negative, then make our expression negative 315 | if savesign == -1: 316 | operandstack[-1] = -operandstack[-1] 317 | consume(RIGHTPAREN) 318 | else: 319 | raise RuntimeError('Expecting a factor') 320 | 321 | 322 | 323 | 324 | 325 | main() 326 | if grade: 327 | # display interpreter source code 328 | print('------------------------------------------- ' + sys.argv[0]) 329 | print(open(sys.argv[0]).read()) 330 | --------------------------------------------------------------------------------
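Editor's note: a worked trace (not from the book) of how the interpreter above evaluates -(-2+3), the first line of CH8/p1.in. The snippet replays the operand-stack operations by hand:

operandstack = []
# factor() sees '-': sign flips to -1, then recurses and sees '('
savesign = -1                              # factor() saves the pending unary minus
# inside the parens, expr() evaluates -2 + 3:
operandstack.append(-2)                    # factor(): MINUS flips sign, UNSIGNEDINT pushes -1 * 2
operandstack.append(3)                     # term() after '+': pushes +3
right = operandstack.pop()
left = operandstack.pop()
operandstack.append(left + right)          # expr() folds: -2 + 3 = 1
if savesign == -1:
    operandstack[-1] = -operandstack[-1]   # apply the saved outer minus
print(operandstack[-1])                    # prints -1, matching Python's own evaluation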
/CH9/p1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 1st basic parser of a simple subset of the Python 3 | language. This is mostly a reinvention from 4 | Reis's Writing Compilers 5 | 6 | This version is the basic interpreter 7 | ''' 8 | 9 | import sys # sys needed to access cmd line args and sys.exit() 10 | 11 | class Token: 12 | def __init__(self, line, column, category, lexeme): 13 | self.line = line # srce program line number of the token 14 | self.column = column # srce program col in which token starts 15 | self.category = category # category of the token 16 | self.lexeme = lexeme # token in string form 17 | 18 | # globals 19 | trace = True # controls token trace 20 | grade = False 21 | source = '' # receives entire source program 22 | sourceindex = 0 # index into the source code in source 23 | line = 0 # current line number 24 | column = 0 # current column number 25 | tokenlist = [] # list of tokens created by tokenizer 26 | tokenindex = -1 # index of current token in tokens 27 | token = None # current token 28 | prevchar = '\n' # '\n' in prevchar signals start of new line 29 | blankline = True # reset to False if line is not blank 30 | symtab = {} # symbol table mapping variable names to their values 31 | operandstack = [] # stack holding intermediate operand values during evaluation 32 | sign = 0 # +1/-1 multiplier recording the parity of unary minuses 33 | 34 | # constants that represent token categories 35 | EOF = 0 # end of file 36 | PRINT = 1 # 'print' keyword 37 | UNSIGNEDINT = 2 # unsigned integer 38 | NAME = 3 # identifier that is not a keyword 39 | ASSIGNOP = 4 # '=' assignment operator 40 | LEFTPAREN = 5 # '(' 41 | RIGHTPAREN = 6 # ')' 42 | PLUS = 7 # '+' 43 | MINUS = 8 # '-' 44 | TIMES = 9 # '*' 45 | NEWLINE = 10 # end of line 46 | ERROR = 11 # if not any of the above, then error 47 | 48 | # displayable names for each token category 49 | catnames = ['EOF', 'PRINT', 'UNSIGNEDINT', 'NAME', 'ASSIGNOP', 50 | 'LEFTPAREN', 'RIGHTPAREN', 'PLUS', 'MINUS', 51 | 'TIMES', 'NEWLINE','ERROR'] 52 | 53 | # keywords and their token categories 54 | keywords = {'print': PRINT} 55 | 56 | # one-character tokens and their token categories 57 | smalltokens = {'=':ASSIGNOP, '(':LEFTPAREN, ')':RIGHTPAREN, 58 | '+':PLUS, '-':MINUS, '*':TIMES, '\n':NEWLINE, '':EOF} 59 | 60 | ################# 61 | # main function # 62 | ################# 63 | # main() reads input file and calls tokenizer() 64 | def main(): 65 | global source 66 | 67 | if len(sys.argv) == 2: # check if correct number of cmd line args 68 | try: 69 | infile = open(sys.argv[1], 'r') 70 | source = infile.read() # read source program 71 | except IOError: 72 | print('Cannot read input file ' + sys.argv[1]) 73 | sys.exit(1) 74 | else: 75 | print('Wrong number of command line arguments') 76 | print('Format: python p1.py <infile>') 77 | sys.exit(1) 78 | 79 | if source[-1] != '\n': # add newline to end if missing 80 | source = source + '\n' 81 | 82 | if trace: 83 | print('------------------------------------------- Token trace') 84 | print('Line Col Category Lexeme\n') 85 | 86 | 87 | try: 88 | tokenizer() 89 | parser() 90 | 91 | # on an error, display an error message 92 | # token is the token object on which the error was detected 93 | except RuntimeError as emsg: 94 | # output slash n in place of newline 95 | lexeme = token.lexeme.replace('\n', '\\n') 96 | print('\nError on '+ "'" + lexeme + "'" + ' line ' + 97 | str(token.line) + ' column ' + str(token.column)) 98 | print(emsg) # message from RuntimeError object 99 | sys.exit(1) 100 | 101 | #################### 102 | # tokenizer # 103 | #################### 104 | def tokenizer(): 105 | global token 106 |
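# ('\n' is deliberately not treated as skippable whitespace in the loop
# below: NEWLINE terminates each <stmt>, so it must reach the parser as a token)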
curchar = ' ' # prime curchar with space 107 | 108 | while True: 109 | # skip whitespace but not newlines 110 | while curchar != '\n' and curchar.isspace(): 111 | curchar = getchar() # get next char from source program 112 | 113 | # construct and initialize token 114 | token = Token(line, column, None, '') 115 | 116 | if curchar.isdigit(): # start of unsigned int? 117 | token.category = UNSIGNEDINT # save category of token 118 | while True: 119 | token.lexeme += curchar # append curchar to lexeme 120 | curchar = getchar() # get next character 121 | if not curchar.isdigit(): # break if not a digit 122 | break 123 | 124 | elif curchar.isalpha() or curchar == '_': # start of name? 125 | while True: 126 | token.lexeme += curchar # append curchar to lexeme 127 | curchar = getchar() # get next character 128 | # break if not letter, '_', or digit 129 | if not (curchar.isalnum() or curchar == '_'): 130 | break 131 | 132 | # determine if lexeme is a keyword or name of variable 133 | if token.lexeme in keywords: 134 | token.category = keywords[token.lexeme] 135 | else: 136 | token.category = NAME 137 | 138 | elif curchar in smalltokens: 139 | token.category = smalltokens[curchar] # get category 140 | token.lexeme = curchar 141 | curchar = getchar() # move to first char after the token 142 | 143 | else: 144 | token.category = ERROR # invalid token 145 | token.lexeme = curchar 146 | raise RuntimeError('Invalid token') 147 | 148 | tokenlist.append(token) # append token to tokens list 149 | if trace: # display token if trace is True 150 | print("%3s %4s %-14s %s" % (str(token.line), 151 | str(token.column), catnames[token.category], token.lexeme)) 152 | 153 | if token.category == EOF: # finished tokenizing? 154 | break 155 | 156 | # getchar() gets next char from source and adjusts line and column 157 | def getchar(): 158 | global sourceindex, column, line, prevchar, blankline 159 | 160 | # check if starting a new line 161 | if prevchar == '\n': # '\n' signals start of a new line 162 | line += 1 # increment line number 163 | column = 0 # reset column number 164 | blankline = True # initialize blankline 165 | 166 | if sourceindex >= len(source): # at end of source code? 
167 | column = 1 # set EOF column to 1 168 | prevchar = '' # save current char for next call 169 | return '' # null str signals end of source 170 | 171 | c = source[sourceindex] # get next char in the source program 172 | sourceindex += 1 # increment sourceindex to next character 173 | column += 1 # increment column number 174 | if not c.isspace(): # if c not whitespace then line not blank 175 | blankline = False # indicate line not blank 176 | prevchar = c # save current character 177 | 178 | # if at end of blank line, return space in place of '\n' 179 | if c == '\n' and blankline: 180 | return ' ' 181 | else: 182 | return c # return character to tokenizer() 183 | 184 | #################### 185 | # Simple Parser # 186 | #################### 187 | 188 | # begin the parser, starting with the 1st token in tokenlist 189 | def parser(): 190 | advance() 191 | program() 192 | 193 | # major function 1: advance() 194 | def advance(): 195 | ''' update the global token to the next token 196 | from tokenlist ''' 197 | global token, tokenindex 198 | tokenindex += 1 # move to next token 199 | if (tokenindex >= len(tokenlist)): # reached the end 200 | raise RuntimeError('Unexpected EOF') 201 | token = tokenlist[tokenindex] 202 | 203 | # major function 2: consume() 204 | def consume(expectedcat): 205 | # check current token with expected 206 | if (token.category == expectedcat): 207 | advance() # get next token 208 | else: 209 | raise RuntimeError('Expecting ' + catnames[expectedcat]) 210 | 211 | # <program> -> <stmt>* EOF 212 | def program(): 213 | # although the grammar just says <stmt>*, note that semantically, this means 214 | # that a program consists of 0 or more statements that 215 | # all begin with some 'NAME' or 'PRINT' token 216 | while (token.category in [NAME, PRINT]): 217 | stmt() 218 | if (token.category != EOF): 219 | raise RuntimeError('Expecting EOF') 220 | print(token.category) 221 | 222 | 223 | # <stmt> -> <simplestmt> NEWLINE 224 | def stmt(): 225 | # note that we don't consume a 'simplestmt', but rather just call it 226 | # this is because simplestmt is a non-terminal, and not a token (terminal) 227 | simplestmt() 228 | # NEWLINE is a token, therefore we'll consume 229 | consume(NEWLINE) 230 | 231 | # <simplestmt> -> <assignmentstmt> | <printstmt> 232 | def simplestmt(): 233 | # this is where FIRST sets come in 234 | if (token.category == NAME): 235 | assignmentstmt() 236 | elif (token.category == PRINT): 237 | printstmt() 238 | else: 239 | raise RuntimeError('Expecting NAME or PRINT') 240 | 241 | # <assignmentstmt> -> NAME '=' <expr> 242 | def assignmentstmt(): 243 | left = token.lexeme # will be the key into the symbol table 244 | consume(NAME) 245 | consume(ASSIGNOP) 246 | expr() 247 | 248 | # after expr() returns, it will have pushed <expr>'s value 249 | # to the top 250 | symtab[left] = operandstack.pop() 251 | 252 | # <printstmt> -> 'print' '(' <expr> ')' 253 | def printstmt(): 254 | consume(PRINT) 255 | consume(LEFTPAREN) 256 | # expr() will have pushed <expr>'s value onto the stack 257 | expr() 258 | print(operandstack.pop()) 259 | consume(RIGHTPAREN) 260 | 261 | # <expr> -> <term> ('+' <term>)* 262 | def expr(): 263 | term() # pushes value of term on top of stack 264 | # loop for ('+' <term>) 265 | while (token.category == PLUS): 266 | # consume wastes another check, just advance() 267 | advance() 268 | term() # pushes value of term on top of stack 269 | rightoperand = operandstack.pop() 270 | leftoperand = operandstack.pop() 271 | operandstack.append(leftoperand + rightoperand) 272 | # when term() returns, if it sees another + 273 | # in the token stream, it will loop again 274 | 275 | # <term> -> <factor> ('*' <factor>)* 276 | def term(): 277 | global sign 278 | sign = 1 279 | factor()
# <term> -> <factor> ('*' <factor>)*
def term():
    global sign
    sign = 1
    factor()
    # loop for ('*' <factor>)
    while (token.category == TIMES):
        advance()
        sign = 1   # reset sign before every factor call; <factor> is the only production with MINUS
        factor()
        rightoperand = operandstack.pop()
        leftoperand = operandstack.pop()
        operandstack.append(leftoperand * rightoperand)

# <factor> -> '+' <factor> | '-' <factor> | UNSIGNEDINT | NAME | '(' <expr> ')'
def factor():
    global sign
    # a lot of cases, all disjoint
    if (token.category == PLUS):
        advance()
        factor()
    elif (token.category == MINUS):
        sign = -sign
        advance()
        factor()
    elif (token.category == UNSIGNEDINT):
        operandstack.append(sign * int(token.lexeme))
        advance()
    elif (token.category == NAME):
        if (token.lexeme in symtab):
            operandstack.append(sign * symtab[token.lexeme])
        else:
            raise RuntimeError(f'Name: {token.lexeme} is not defined')
        advance()
    elif (token.category == LEFTPAREN):
        advance()
        # need to save sign because expr() will cause the global one to change
        savesign = sign
        expr()
        # if our current factor() call's sign was negative, then negate the expression
        if savesign == -1:
            operandstack[-1] = -operandstack[-1]
        consume(RIGHTPAREN)
    else:
        raise RuntimeError('Expecting a factor')


main()
if grade:
    # display interpreter source code
    print('------------------------------------------- ' + sys.argv[0])
    print(open(sys.argv[0]).read())
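# Usage sketch (a hypothetical invocation; assumes this file is saved as
# p1.py and run against a test program such as CH7/t1.in):
#
#   $ python p1.py t1.in
#
# For t1.in the two print statements should output 1 and 2:
# -59 + 20*3 = 1, and with a = 2 and bb_1 = -a + 12 = 10, the second
# print is 2*10 + 2*3*(-3) = 20 - 18 = 2.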
--------------------------------------------------------------------------------
/CH10/h1.py:
--------------------------------------------------------------------------------
'''
1st basic hybrid interpreter of a simple subset of the Python
language. For this interpreter, we're only going to utilize a small
subset of the bytecode instructions you can see below as well as
only a few parts for our virtual stack machine (ex: co_names, etc...)

This is mostly a reimplementation from
Reis's Writing Compilers

This version is the basic hybrid interpreter
'''

import sys   # needed to access cmd line args and sys.exit()

class Token:
    def __init__(self, line, column, category, lexeme):
        self.line = line          # source program line number of the token
        self.column = column      # source program col in which token starts
        self.category = category  # category of the token
        self.lexeme = lexeme      # token in string form

# globals

###############################
# hybrid interpreter specific #
###############################
co_code = []     # table for bytecode instructions
co_names = []    # table for names of the 'global' variables
co_consts = []   # table for all the constants

trace = True          # controls token trace
grade = False
source = ''           # receives entire source program
sourceindex = 0       # index into the source code in source
line = 0              # current line number
column = 0            # current column number
tokenlist = []        # list of tokens created by tokenizer
tokenindex = -1       # index of current token in tokens
token = None          # current token
prevchar = '\n'       # '\n' in prevchar signals start of new line
blankline = True      # reset to False if line is not blank
symtab = {}           # symbol table (kept from the source-level interpreter; unused here)
operandstack = []     # operand stack (kept from the source-level interpreter; unused here)
sign = 0              # sign flag (+1 or -1) that tracks the parity of unary minuses

# constants that represent token categories
EOF = 0           # end of file
PRINT = 1         # 'print' keyword
UNSIGNEDINT = 2   # unsigned integer
NAME = 3          # identifier that is not a keyword
ASSIGNOP = 4      # '=' assignment operator
LEFTPAREN = 5     # '('
RIGHTPAREN = 6    # ')'
PLUS = 7          # '+'
MINUS = 8         # '-'
TIMES = 9         # '*'
NEWLINE = 10      # end of line
ERROR = 11        # if not any of the above, then error

# bytecode opcodes (subset)
UNARY_NEGATIVE = 11
BINARY_MULTIPLY = 20
BINARY_ADD = 23
PRINT_EXPR = 70
# be wary of 71 & 72: PRINT_ITEM and PRINT_NEWLINE are legacy (Python 2) opcodes
PRINT_ITEM = 71
PRINT_NEWLINE = 72
STORE_NAME = 90
LOAD_CONST = 100
LOAD_NAME = 101

# displayable names for each token category
catnames = ['EOF', 'print', 'UNSIGNEDINT', 'NAME', 'ASSIGNOP',
            'LEFTPAREN', 'RIGHTPAREN', 'PLUS', 'MINUS',
            'TIMES', 'NEWLINE', 'ERROR']

# keywords and their token categories
keywords = {'print': PRINT}

# one-character tokens and their token categories
smalltokens = {'=':ASSIGNOP, '(':LEFTPAREN, ')':RIGHTPAREN,
               '+':PLUS, '-':MINUS, '*':TIMES, '\n':NEWLINE, '':EOF}
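# The opcode numbers above can be sanity-checked against a real CPython
# build that still has these opcodes (e.g. CPython 3.8); a sketch of an
# interactive session, not part of the interpreter itself:
#
#   >>> import dis
#   >>> dis.opmap['BINARY_ADD'], dis.opmap['STORE_NAME'], dis.opmap['LOAD_CONST']
#   (23, 90, 100)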
#################
# main function #
#################
# main() reads input file and calls tokenizer()
def main():
    global source

    if len(sys.argv) == 2:   # check if correct number of cmd line args
        try:
            infile = open(sys.argv[1], 'r')
            source = infile.read()   # read source program
        except IOError:
            print('Cannot read input file ' + sys.argv[1])
            sys.exit(1)
    else:
        print('Wrong number of command line arguments')
        print('Format: python h1.py <infile>')
        sys.exit(1)

    if source[-1] != '\n':   # add newline to end if missing
        source = source + '\n'

    if trace:
        print('------------------------------------------- Token trace')
        print('Line Col Category Lexeme\n')

    try:
        tokenizer()
        parser()
        print('Nice! Seems like everything was parsed!')
        interpreter()

    # on an error, display an error message
    # token is the token object on which the error was detected
    except RuntimeError as emsg:
        # output slash n in place of newline
        lexeme = token.lexeme.replace('\n', '\\n')
        print('\nError on ' + "'" + lexeme + "'" + ' line ' +
              str(token.line) + ' column ' + str(token.column))
        print(emsg)   # message from RuntimeError object
        sys.exit(1)

####################
#    tokenizer     #
####################
def tokenizer():
    global token
    curchar = ' '   # prime curchar with space

    while True:
        # skip whitespace but not newlines
        while curchar != '\n' and curchar.isspace():
            curchar = getchar()         # get next char from source program

        # construct and initialize token
        token = Token(line, column, None, '')

        if curchar.isdigit():                   # start of unsigned int?
            token.category = UNSIGNEDINT        # save category of token
            while True:
                token.lexeme += curchar         # append curchar to lexeme
                curchar = getchar()             # get next character
                if not curchar.isdigit():       # break if not a digit
                    break

        elif curchar.isalpha() or curchar == '_':   # start of name?
            while True:
                token.lexeme += curchar         # append curchar to lexeme
                curchar = getchar()             # get next character
                # break if not letter, '_', or digit
                if not (curchar.isalnum() or curchar == '_'):
                    break

            # determine if lexeme is a keyword or name of variable
            if token.lexeme in keywords:
                token.category = keywords[token.lexeme]
            else:
                token.category = NAME

        elif curchar in smalltokens:
            token.category = smalltokens[curchar]   # get category
            token.lexeme = curchar
            curchar = getchar()         # move to first char after the token

        else:
            token.category = ERROR      # invalid token
            token.lexeme = curchar
            raise RuntimeError('Invalid token')

        tokenlist.append(token)         # append token to tokens list
        if trace:                       # display token if trace is True
            print("%3s %4s %-14s %s" % (str(token.line),
                str(token.column), catnames[token.category], token.lexeme))

        if token.category == EOF:       # finished tokenizing?
            break
# getchar() gets next char from source and adjusts line and column
def getchar():
    global sourceindex, column, line, prevchar, blankline

    # check if starting a new line
    if prevchar == '\n':        # '\n' signals start of a new line
        line += 1               # increment line number
        column = 0              # reset column number
        blankline = True        # initialize blankline

    if sourceindex >= len(source):   # at end of source code?
        column = 1              # set EOF column to 1
        prevchar = ''           # save current char for next call
        return ''               # null str signals end of source

    c = source[sourceindex]     # get next char in the source program
    sourceindex += 1            # increment sourceindex to next character
    column += 1                 # increment column number
    if not c.isspace():         # if c not whitespace then line not blank
        blankline = False       # indicate line not blank
    prevchar = c                # save current character

    # if at end of blank line, return space in place of '\n'
    if c == '\n' and blankline:
        return ' '
    else:
        return c                # return character to tokenizer()

################################
#   Simple Parser/Generator    #
################################

# begin the parser, starting with the 1st token in tokenlist
def parser():
    advance()
    program()

# major function 1: advance()
def advance():
    ''' update the global token to the next token
        from tokenlist '''
    global token, tokenindex
    tokenindex += 1                     # move to next token
    if (tokenindex >= len(tokenlist)):  # reached the end
        raise RuntimeError('Unexpected EOF')
    token = tokenlist[tokenindex]
    # print(f'Current Token: {token.lexeme}, Cat: {token.category} ')

# major function 2: consume()
def consume(expectedcat):
    # check current token with expected
    if (token.category == expectedcat):
        advance()   # get next token
    else:
        raise RuntimeError('Expecting ' + catnames[expectedcat])

# <program> -> <stmt>* EOF
def program():
    # although <stmt>* is stated, note that semantically this means
    # that a program consists of 0 or more statements that
    # all begin with some 'NAME' or 'PRINT' token
    while (token.category in [NAME, PRINT]):
        stmt()
    if (token.category != EOF):
        raise RuntimeError('Expecting EOF')
    # print(token.category)


# <stmt> -> <simplestmt> NEWLINE
def stmt():
    # note that we don't consume a 'simplestmt', but rather just call it
    # this is because simplestmt is a non-terminal, and not a token (terminal)
    simplestmt()
    # NEWLINE is a token, therefore we'll consume
    consume(NEWLINE)

# <simplestmt> -> <assignmentstmt> | <printstmt>
def simplestmt():
    # this is where FIRST sets come in
    if (token.category == NAME):
        assignmentstmt()
    elif (token.category == PRINT):
        printstmt()
    else:
        raise RuntimeError('Expecting NAME or PRINT')

# <assignmentstmt> -> NAME '=' <expr>
def assignmentstmt():
    # check if NAME exists in program
    if token.lexeme in co_names:
        index = co_names.index(token.lexeme)

    # first time seeing the variable
    else:
        index = len(co_names)
        co_names.append(token.lexeme)

    advance()
    consume(ASSIGNOP)
    expr()   # will push expr() value

    # generate bytecode - STORE_NAME
    co_code.append(STORE_NAME)   # pops TOS and stores in co_values[index]
    co_code.append(index)


# <printstmt> -> 'print' '(' <expr> ')'
def printstmt():
    advance()
    consume(LEFTPAREN)
    # expr() will generate its bytecode and push its 'value'
    expr()

    # printstmt() needs to pop the value from expr() and print it
    # note: book says use PRINT_ITEM & PRINT_NEWLINE, but we're
    # going to try to keep up to date: use PRINT_EXPR (70 dec)
    co_code.append(PRINT_EXPR)
    consume(RIGHTPAREN)
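# To make the code generation concrete, a sketch of what the two-line
# program 'x = 5' / 'print(x)' should produce with the scheme above
# (opcodes shown by name; co_code actually stores their numeric values):
#
#   co_names  = ['x']
#   co_consts = [5]
#   co_code   = [LOAD_CONST, 0,    # push co_consts[0]
#                STORE_NAME, 0,    # pop into x
#                LOAD_NAME,  0,    # push x's value
#                PRINT_EXPR]       # pop and print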
# <expr> -> <term> ('+' <term>)*
def expr():
    term()   # pushes value of term on top of stack
    # loop for ('+' <term>)
    while (token.category == PLUS):
        # consume wastes another check, just advance()
        advance()
        term()   # pushes value of term on top of stack

        # when our 2nd term returns, we'll need to add both <term>'s
        co_code.append(BINARY_ADD)

    # when term() returns, if it sees another +
    # in the token stream, it will loop again

# <term> -> <factor> ('*' <factor>)*
def term():
    global sign
    sign = 1
    factor()
    # loop for ('*' <factor>)
    while (token.category == TIMES):
        advance()
        sign = 1   # reset sign before every factor call; <factor> is the only production with MINUS
        factor()
        # after our 2nd factor returns, we need to multiply the two factors
        co_code.append(BINARY_MULTIPLY)

# <factor> -> '+' <factor> | '-' <factor> | UNSIGNEDINT | NAME | '(' <expr> ')'
def factor():
    global sign
    # a lot of cases, all disjoint
    if (token.category == PLUS):
        advance()
        factor()
    elif (token.category == MINUS):
        sign = -sign
        advance()
        factor()

    # UNSIGNEDINT needs to save our const within co_consts
    # and append the appropriate bytecode instructions
    elif (token.category == UNSIGNEDINT):
        val = sign * int(token.lexeme)   # get our value
        # don't waste space with multiple copies; just reuse the same instance
        if val in co_consts:
            index = co_consts.index(val)
        else:
            # first time seeing the constant
            index = len(co_consts)
            co_consts.append(val)
        # generate bytecode: LOAD_CONST consti
        co_code.append(LOAD_CONST)
        co_code.append(index)
        advance()

    # NAME needs to generate LOAD_NAME after checking whether name
    # exists within co_names
    elif (token.category == NAME):
        # check if name has been declared
        if token.lexeme in co_names:
            index = co_names.index(token.lexeme)
        else:
            raise RuntimeError(f'Name: {token.lexeme} is not defined')
        # generate code
        co_code.append(LOAD_NAME)
        co_code.append(index)
        # check if we're a negative op
        if sign == -1:
            co_code.append(UNARY_NEGATIVE)
        advance()

    elif (token.category == LEFTPAREN):
        advance()
        # expr() will call term(), which resets our global sign of negation;
        # keep a local copy of this function call's negative state
        savesign = sign
        expr()
        if savesign == -1:   # 'savesign' is the sign outside '(' <expr> ')', not inside
            co_code.append(UNARY_NEGATIVE)
        consume(RIGHTPAREN)

    else:
        raise RuntimeError('Expecting a factor')
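# Before reading interpreter() below, it may help to trace the bytecode
# from the sketch above by hand (stack shown after each instruction):
#
#   LOAD_CONST 0   -> stack [5]
#   STORE_NAME 0   -> stack [],  co_values[0] = 5
#   LOAD_NAME  0   -> stack [5]
#   PRINT_EXPR     -> stack [],  prints 5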
###################
# The interpreter #
###################

# Read bytecode instructions from co_code and execute each one;
# loop until there are no more instructions (or an error).
# Note: since Python 3.6, CPython uses fixed 2-byte instructions, but
# here we mix 1-slot and 2-slot instructions, as older versions did.

def interpreter():
    stack = []
    pc = 0
    co_values = [None] * len(co_names)   # values correspond to variables
    # print(f'co_code: {co_code}')

    while pc < len(co_code):
        opcode = co_code[pc]   # get opcode (each instruction takes 1 or 2 slots)
        pc += 1                # increment pc
        # print(f'Loop Count: {pc}')

        # 1 byte instructions
        if opcode == UNARY_NEGATIVE:
            stack[-1] = -stack[-1]
        elif opcode == BINARY_MULTIPLY:
            operand1 = stack.pop()
            operand2 = stack.pop()
            stack.append(operand1 * operand2)
        elif opcode == BINARY_ADD:
            operand1 = stack.pop()
            operand2 = stack.pop()
            stack.append(operand1 + operand2)
        # no POP_TOP needed: PRINT_EXPR itself pops the value
        elif opcode == PRINT_EXPR:
            print(stack.pop())

        # 2 byte instructions
        elif opcode == STORE_NAME:
            index = co_code[pc]   # get the index for the value
            pc += 1
            operand = stack.pop()
            co_values[index] = operand   # update the table with the value
        elif opcode == LOAD_CONST:
            index = co_code[pc]
            pc += 1
            value = co_consts[index]
            stack.append(value)
        elif opcode == LOAD_NAME:
            index = co_code[pc]   # get the index of the variable
            pc += 1
            value = co_values[index]
            if value is None:     # variable was never assigned
                print(f'No value for {co_names[index]}')
                sys.exit(1)
            stack.append(value)
        else:
            break   # unknown opcode: stop executing

# Call main()

main()
--------------------------------------------------------------------------------
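Usage sketch for the hybrid interpreter above (a hypothetical invocation,
assuming it is saved as CH10/h1.py and run against CH10/test1.in):

    $ python h1.py test1.in

After the token trace and the parse confirmation message, the generated
bytecode runs and the two print statements should output 10 and 15.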