├── .gitignore ├── 1.c ├── README.md ├── grammer.txt ├── lexer.py ├── parser.py ├── sema.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .idea/ 3 | -------------------------------------------------------------------------------- /1.c: -------------------------------------------------------------------------------- 1 | int main() { 2 | int a; 3 | float b; 4 | int c; 5 | float e; 6 | c=10; 7 | if(c) { 8 | a = 1 + 10; 9 | b = 10.9 + 8.9; 10 | } 11 | b = 1.11 * 8.9; 12 | while(a) { 13 | b = 10.44; 14 | e = 990.45; 15 | c = 90; 16 | } 17 | c = 80; 18 | } 19 | 20 | int func1 () { 21 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 哈工大软件学院编译原理实验 2 | ========================== 3 | 4 | 这是我用Python实现的版本,没有图形界面,有很多Bug,其中```lexer.py```, ```parser.py```, ```sema.py```,分别是词法分析,句法分析和语义分析。可以参考,但是不推荐直接使用。 5 | 6 | 推荐使用下面学长实现的版本,都是有图形界面的: 7 | 8 | **PS:真的可以直接用,不骗你们的!自己写的话肯定收获会比较大,但是得分真的不一定有抄的高!自己写的话,要想得分高,也一定要有图形界面!看你自己选择了少年,学长们只能帮你到这儿了...** 9 | 10 | **Update:说可以直接用,你们不能真的就一点都不改拿来就用吧。。。学长没想到你们还是 too young,这么 naive 啊。。。变量名,程序结构啥的改改再用啊,代码混淆懂不懂啊亲。。。** 11 | 12 | [Macroszh](https://github.com/marcoszh) 13 | 14 | * https://github.com/marcoszh/Compilation1 15 | * https://github.com/marcoszh/Compilation2 16 | * https://github.com/marcoszh/Compilation3 17 | 18 | 19 | [Winlandiano](https://github.com/winlandiano) 20 | 21 | * https://github.com/winlandiano/CompilerExp1_lexical_analysis (这个没有图形界面,下面两个有) 22 | * https://github.com/winlandiano/CompilerExp2_Syntax_Analysis 23 | * https://github.com/winlandiano/CompilerExp3_Semantic_Analysis- 24 | 25 | [LeechanX](https://github.com/LeechanX) 26 | 27 | * https://github.com/LeechanX/My-Compiler-Designer--final-version- 28 | 29 | 30 | [zsy112371](https://github.com/zsy112371) 31 | 32 | * https://github.com/zsy112371/bianyi 33 | 34 | [MaybeMercy](https://github.com/MaybeMercy) 35 | 36 | * https://github.com/MaybeMercy/Compiler_Experiment 37 | 38 | -------------------------------------------------------------------------------- /grammer.txt: -------------------------------------------------------------------------------- 1 | *terminals 2 | ID 3 | VOID 4 | INT 5 | CHAR 6 | FLOAT 7 | LONG 8 | DOUBLE 9 | SHORT 10 | STRING_LITERAL 11 | ( 12 | ) 13 | [ 14 | ] 15 | , 16 | ; 17 | { 18 | } 19 | = 20 | : 21 | > 22 | < 23 | >= 24 | <= 25 | != 26 | == 27 | = 28 | += 29 | -= 30 | *= 31 | /= 32 | %= 33 | + 34 | - 35 | * 36 | / 37 | % 38 | & 39 | ~ 40 | ++ 41 | -- 42 | ! 43 | # 44 | int 45 | float 46 | double 47 | short 48 | long 49 | while 50 | if 51 | else 52 | *productions 53 | ::= 54 | ::= 55 | ::= 56 | ::= 57 | ::= P21 ; 58 | ::= 59 | ::= 60 | ::= 61 | ::= { } 62 | ::= , 63 | ::= 64 | ::= 65 | ::= P22 66 | ::= ID P31 67 | ::= ( ) 68 | ::= 69 | ::= 70 | ::= 71 | ::= 72 | ::= 73 | ::= 74 | ::= 75 | ::= 76 | ::= 77 | ::= 78 | ::= ; 79 | ::= ; 80 | ::= { } 81 | ::= { } 82 | ::= 83 | ::= 84 | ::= while ( ) P91 P92 85 | ::= if ( ) P81 P82 86 | ::= int P11 87 | ::= float P12 88 | ::= double P13 89 | ::= short P14 90 | ::= long P15 91 | ::= INT P41 92 | ::= FLOAT P42 93 | ::= SHORT P43 94 | ::= LONG P44 95 | ::= > 96 | ::= < 97 | ::= == 98 | ::= >= 99 | ::= <= 100 | ::= == 101 | ::= != 102 | ::= & 103 | ::= * 104 | ::= + 105 | ::= - 106 | ::= ~ 107 | ::= ! 108 | ::= = 109 | ::= -= 110 | ::= += 111 | ::= *= 112 | ::= /= 113 | ::= %= 114 | ::= ID = P62 P61 115 | ::= 116 | ::= + P101 117 | ::= * P102 118 | ::= 119 | ::= += 120 | ::= -= 121 | ::= ID P51 122 | ::= P52 123 | ::= STRING_LITERAL 124 | ::= ( ) 125 | ::= P71 126 | ::= ++ P72 127 | ::= -- P73 128 | ::= 129 | *end -------------------------------------------------------------------------------- /lexer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | KEYWORD_LIST = [ 6 | "if", 7 | "else", 8 | "while", 9 | "break", 10 | "continue", 11 | "for", 12 | "double", 13 | "int", 14 | "float", 15 | "long", 16 | "short", 17 | "switch", 18 | "case", 19 | "return", 20 | "void", 21 | ] 22 | 23 | SEPARATOR_LIST = ["{", "}", "[", "]", "(", ")", "~", ",", ";", ".", "?", ":"] 24 | 25 | OPERATOR_LIST = [ 26 | "+", 27 | "++", 28 | "-", 29 | "--", 30 | "+=", 31 | "-=", 32 | "*", 33 | "*=", 34 | "%", 35 | "%=", 36 | "->", 37 | "|", 38 | "||", 39 | "|=", 40 | "/", 41 | "/=", 42 | ">", 43 | "<", 44 | ">=", 45 | "<=", 46 | "=", 47 | "==", 48 | "!=", 49 | "!", 50 | ] 51 | 52 | CATEGORY_DICT = { 53 | "double": 265, 54 | "int": 266, 55 | "break": 268, 56 | "else": 269, 57 | "switch": 271, 58 | "case": 272, 59 | "char": 276, 60 | "return": 278, 61 | "float": 281, 62 | "continue": 284, 63 | "for": 285, 64 | "void": 287, 65 | "do": 292, 66 | "if": 293, 67 | "while": 294, 68 | "static": 295, 69 | "{": 299, 70 | "}": 300, 71 | "[": 301, 72 | "]": 302, 73 | "(": 303, 74 | ")": 304, 75 | "~": 305, 76 | ",": 306, 77 | ";": 307, 78 | "?": 310, 79 | ":": 311, 80 | "<": 314, 81 | "<=": 315, 82 | ">": 316, 83 | ">=": 317, 84 | "=": 318, 85 | "==": 319, 86 | "|": 320, 87 | "||": 321, 88 | "|=": 322, 89 | "^": 323, 90 | "^=": 324, 91 | "&": 325, 92 | "&&": 326, 93 | "&=": 327, 94 | "%": 328, 95 | "%=": 329, 96 | "+": 330, 97 | "++": 331, 98 | "+=": 332, 99 | "-": 333, 100 | "--": 334, 101 | "-=": 335, 102 | "->": 336, 103 | "/": 337, 104 | "/=": 338, 105 | "*": 339, 106 | "*=": 340, 107 | "!": 341, 108 | "!=": 342, 109 | "ID": 256, 110 | "INT10": 346, 111 | "FLOAT": 347, 112 | "STRING": 351, 113 | } 114 | 115 | current_row = -1 116 | current_line = 0 117 | input_str = [] 118 | 119 | 120 | def is_keyword(s): 121 | return s in KEYWORD_LIST 122 | 123 | 124 | def is_separator(s): 125 | return s in SEPARATOR_LIST 126 | 127 | 128 | def is_operator(s): 129 | return s in OPERATOR_LIST 130 | 131 | 132 | def get_cate_id(s): 133 | return CATEGORY_DICT[s] 134 | 135 | 136 | def getchar(): 137 | global current_row 138 | global current_line 139 | current_row += 1 140 | 141 | if current_row == len(input_str[current_line]): 142 | current_line += 1 143 | current_row = 0 144 | 145 | if current_line == len(input_str): 146 | return "SCANEOF" 147 | 148 | return input_str[current_line][current_row] 149 | 150 | 151 | def ungetc(): 152 | global current_row 153 | global current_line 154 | current_row = current_row - 1 155 | if current_row < 0: 156 | current_line = current_line - 1 157 | current_row = len(input_str[current_row]) - 1 158 | return input_str[current_line][current_row] 159 | 160 | 161 | def read_source_file(file): 162 | global input_str 163 | f = open(file, "r") 164 | input_str = f.readlines() 165 | f.close() 166 | 167 | 168 | def lexical_error(msg, line=None, row=None): 169 | if line is None: 170 | line = current_line + 1 171 | if row is None: 172 | row = current_row + 1 173 | print(str(line) + ":" + str(row) + " Lexical error: " + msg) 174 | 175 | 176 | def scanner(): 177 | current_char = getchar() 178 | if current_char == "SCANEOF": 179 | return ("SCANEOF", "", "") 180 | if current_char.strip() == "": 181 | return 182 | if current_char.isdigit(): 183 | int_value = 0 184 | while current_char.isdigit(): 185 | int_value = int_value * 10 + int(current_char) 186 | current_char = getchar() 187 | 188 | if current_char != ".": 189 | ungetc() 190 | return ("INT", int_value, get_cate_id("INT10")) 191 | 192 | float_value = str(int_value) + "." 193 | current_char = getchar() 194 | while current_char.isdigit(): 195 | float_value += current_char 196 | current_char = getchar() 197 | ungetc() 198 | return ("FLOAT", float_value, get_cate_id("FLOAT")) 199 | if current_char.isalpha() or current_char == "_": 200 | string = "" 201 | while current_char.isalpha() or current_char.isdigit() or current_char == "_": 202 | string += current_char 203 | current_char = getchar() 204 | if current_char == "SCANEOF": 205 | break 206 | 207 | ungetc() 208 | if is_keyword(string): 209 | return (string, "", get_cate_id(string)) 210 | else: 211 | return ("ID", string, get_cate_id("ID")) 212 | 213 | if current_char == '"': 214 | str_literal = "" 215 | global current_line 216 | global current_row 217 | line = current_line + 1 218 | row = current_row + 1 219 | 220 | current_char = getchar() 221 | while current_char != '"': 222 | str_literal += current_char 223 | current_char = getchar() 224 | if current_char == "SCANEOF": 225 | lexical_error('missing terminating "', line, row) 226 | 227 | current_line = line 228 | current_row = row 229 | return ("SCANEOF", "", "") 230 | return ("STRING_LITERAL", str_literal, get_cate_id("STRING")) 231 | 232 | if current_char == "/": 233 | next_char = getchar() 234 | line = int(current_line) + 1 235 | row = int(current_row) + 1 236 | if next_char == "*": 237 | comment = "" 238 | next_char = getchar() 239 | while True: 240 | if next_char == "SCANEOF": 241 | lexical_error("unteminated /* comment", line, row) 242 | return ("SCANEOF", "", "") 243 | if next_char == "*": 244 | end_char = getchar() 245 | if end_char == "/": 246 | # Comment, return None to ignore it. 247 | return None 248 | if end_char == "SCANEOF": 249 | lexical_error("unteminated /* comment", line, row) 250 | return ("SCANEOF", "", "") 251 | comment += next_char 252 | next_char = getchar() 253 | else: 254 | ungetc() 255 | op = current_char 256 | current_char = getchar() 257 | if is_operator(current_char): 258 | op += current_char 259 | else: 260 | ungetc() 261 | return ("OP", op, get_cate_id(op)) 262 | 263 | if is_separator(current_char): 264 | return ("SEP", current_char, get_cate_id(current_char)) 265 | 266 | if is_operator(current_char): 267 | op = current_char 268 | current_char = getchar() 269 | if is_operator(current_char): 270 | op += current_char 271 | else: 272 | ungetc() 273 | return ("OP", op, get_cate_id(op)) 274 | else: 275 | lexical_error("unknown character: " + current_char) 276 | 277 | 278 | def main(): 279 | file_name = sys.argv[1] 280 | read_source_file(file_name) 281 | while True: 282 | r = scanner() 283 | if r is not None: 284 | if r[0] == "SCANEOF": 285 | break 286 | print(r) 287 | 288 | 289 | if __name__ == "__main__": 290 | main() 291 | -------------------------------------------------------------------------------- /parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import lexer 4 | 5 | from util import Production, Symbol 6 | 7 | 8 | TERMINAL_SET = set() 9 | 10 | NON_TERMINAL_SET = set() 11 | 12 | SYMBOL_DICT = {} 13 | 14 | PRODUCTION_LIST = [] 15 | 16 | PARSING_TABLE = {} 17 | 18 | SYMBOL_STACK = [] 19 | 20 | SYMBOL_TABLE = {} 21 | 22 | LAST_STACK_TOP_SYMBOL = None 23 | 24 | 25 | def symbol_for_str(string): 26 | return SYMBOL_DICT[string] 27 | 28 | 29 | def is_terminal(string): 30 | return string in TERMINAL_SET 31 | 32 | 33 | def syntax_error(msg, line=None, row=None): 34 | if line is None: 35 | line = lexer.current_line + 1 36 | if row is None: 37 | row = lexer.current_row + 1 38 | print(str(line) + ":" + str(row) + " Syntax error: " + msg) 39 | 40 | 41 | def prepare_symbols_and_productions(): 42 | f = open("grammer.txt", "r") 43 | lines = f.readlines() 44 | terminal = False 45 | production = False 46 | for l in lines: 47 | if l.strip() == "*terminals": 48 | terminal = True 49 | production = False 50 | continue 51 | if l.strip() == "*productions": 52 | terminal = False 53 | production = True 54 | continue 55 | if l.strip() == "*end": 56 | break 57 | if terminal: 58 | TERMINAL_SET.update([l.strip()]) 59 | if production: 60 | left = l.split("::=")[0].strip() 61 | NON_TERMINAL_SET.update([left]) 62 | 63 | try: 64 | right = l.split("::=")[1].strip() 65 | if right == "": 66 | raise IndexError 67 | p = Production(left, right.split(" ")) 68 | except IndexError: 69 | p = Production(left, ["null"]) 70 | 71 | PRODUCTION_LIST.append(p) 72 | 73 | for s in TERMINAL_SET: 74 | sym = Symbol(s, sym_type="T") 75 | SYMBOL_DICT[s] = sym 76 | 77 | for s in NON_TERMINAL_SET: 78 | sym = Symbol(s, sym_type="N") 79 | SYMBOL_DICT[s] = sym 80 | 81 | 82 | def get_nullable(): 83 | """ 84 | Calculate and mark non-terminals found that is nullable(can derive null). 85 | We do this first, so we can use the result when calculating First and Follow. 86 | """ 87 | changes = True 88 | while changes: 89 | changes = False 90 | for p in PRODUCTION_LIST: 91 | if not symbol_for_str(p.left).is_nullable: 92 | if p.right[0] == "null": 93 | symbol_for_str(p.left).is_nullable = True 94 | changes = True 95 | continue 96 | else: 97 | right_is_nullable = symbol_for_str(p.right[0]).is_nullable 98 | # For X -> Y1 ... YN, Nullable(X) = Nullable(Y1) & 99 | # Nullable(Y2) ... & Nullable(YN) 100 | for r in p.right[1:]: 101 | right_is_nullable = ( 102 | right_is_nullable & symbol_for_str(r).is_nullable 103 | ) 104 | 105 | if right_is_nullable: 106 | changes = True 107 | symbol_for_str(p.left).is_nullable = True 108 | 109 | 110 | def get_first(): 111 | """ 112 | Calculate First set of each symbol. 113 | """ 114 | for s in TERMINAL_SET: 115 | # For each terminal, initialize First with itself. 116 | sym = SYMBOL_DICT[s] 117 | sym.first_set = set([s]) 118 | 119 | for s in NON_TERMINAL_SET: 120 | sym = SYMBOL_DICT[s] 121 | if sym.is_nullable: 122 | sym.first_set = set(["null"]) 123 | else: 124 | sym.first_set = set() 125 | 126 | while True: 127 | first_set_is_stable = True 128 | for p in PRODUCTION_LIST: 129 | sym_left = symbol_for_str(p.left) 130 | if p.right[0] == "null": 131 | sym_left.first_set.update(set(["null"])) 132 | continue 133 | previous_first_set = set(sym_left.first_set) 134 | 135 | for s in p.right: 136 | # For X -> Y..., First(X) = First(X) U First(Y) 137 | sym_right = symbol_for_str(s) 138 | sym_left.first_set.update(sym_right.first_set) 139 | # For X -> Y1 Y2 ... Yi-1 , if Y1...Yi-1 is all nullable 140 | # Then First(X) = First(X) U First(Y1) U First(Y2) ... 141 | if sym_right.is_nullable: 142 | continue 143 | else: 144 | break 145 | 146 | if previous_first_set != sym_left.first_set: 147 | first_set_is_stable = False 148 | 149 | if first_set_is_stable: 150 | break 151 | 152 | 153 | def get_follow(): 154 | """ 155 | Calculate Follow set of each symbol. 156 | """ 157 | for s in NON_TERMINAL_SET: 158 | sym = symbol_for_str(s) 159 | sym.follow_set = set() 160 | 161 | symbol_for_str("").follow_set.update(set(["#"])) 162 | 163 | while True: 164 | follow_set_is_stable = True 165 | for p in PRODUCTION_LIST: 166 | sym_left = symbol_for_str(p.left) 167 | if sym_left.is_terminal(): 168 | continue 169 | for s in p.right: 170 | if s == "null": 171 | continue 172 | if symbol_for_str(s).is_terminal(): 173 | continue 174 | current_symbol = symbol_for_str(s) 175 | previous_follow_set = set(current_symbol.follow_set) 176 | next_is_nullable = True 177 | for s2 in p.right[p.right.index(s) + 1 :]: 178 | # For X -> sYt, Follow(Y) = Follow(Y) U First(t) 179 | next_symbol = symbol_for_str(s2) 180 | current_symbol.follow_set.update(next_symbol.first_set) 181 | if next_symbol.is_nullable: 182 | continue 183 | else: 184 | next_is_nullable = False 185 | break 186 | if next_is_nullable: 187 | # For X -> sYt, if t is nullable, Follow(Y) = Follow(Y) U 188 | # Follow(X) 189 | current_symbol.follow_set.update(sym_left.follow_set) 190 | 191 | if current_symbol.follow_set != previous_follow_set: 192 | follow_set_is_stable = False 193 | 194 | if follow_set_is_stable: 195 | break 196 | 197 | 198 | def get_select(): 199 | """ 200 | Calculate Select set for each production. 201 | """ 202 | while True: 203 | select_set_is_stable = True 204 | for p in PRODUCTION_LIST: 205 | sym_left = symbol_for_str(p.left) 206 | previous_select = set(p.select) 207 | if p.right[0] == "null": 208 | # For A -> a, if a is null, Select(i) = Follow(A) 209 | p.select.update(sym_left.follow_set) 210 | continue 211 | sym_right = symbol_for_str(p.right[0]) 212 | # Otherwise, Select(i) = First(a) 213 | p.select.update(sym_right.first_set) 214 | # If a is nullable, Select(i) = First(a) U Follow(A) 215 | if sym_right.is_nullable: 216 | p.select.update(sym_right.first_set.union(sym_left.follow_set)) 217 | if previous_select != p.select: 218 | select_set_is_stable = False 219 | if select_set_is_stable: 220 | break 221 | 222 | 223 | def get_parsing_table(): 224 | """ 225 | Calculate parsing table. 226 | """ 227 | global PARSING_TABLE 228 | for non_terminal in NON_TERMINAL_SET: 229 | PARSING_TABLE[non_terminal] = {} 230 | for p in PRODUCTION_LIST: 231 | if non_terminal == p.left: 232 | for symbol in p.select: 233 | PARSING_TABLE[non_terminal][symbol] = p 234 | # Calculate SYNC 235 | for symbol in symbol_for_str(non_terminal).follow_set: 236 | if is_terminal(symbol): 237 | try: 238 | p = PARSING_TABLE[non_terminal][symbol] 239 | except KeyError: 240 | PARSING_TABLE[non_terminal][symbol] = "SYNC" 241 | 242 | for symbol in symbol_for_str(non_terminal).first_set: 243 | if is_terminal(symbol): 244 | try: 245 | p = PARSING_TABLE[non_terminal][symbol] 246 | except KeyError: 247 | PARSING_TABLE[non_terminal][symbol] = "SYNC" 248 | 249 | # prettyprint_parsing_table() 250 | 251 | 252 | def prettyprint_parsing_table(): 253 | for non_terminal in PARSING_TABLE.keys(): 254 | symbol_to_production_list = [] 255 | for symbol in PARSING_TABLE[non_terminal]: 256 | p = PARSING_TABLE[non_terminal][symbol] 257 | symbol_to_production = str(symbol) + ":" + str(p) 258 | symbol_to_production_list.append(symbol_to_production) 259 | 260 | print(non_terminal) 261 | print(symbol_to_production_list) 262 | 263 | 264 | def print_symbol_table(): 265 | for t in SYMBOL_TABLE: 266 | print(t) 267 | 268 | 269 | def next_token(): 270 | r = lexer.scanner() 271 | while r is None: 272 | r = lexer.scanner() 273 | return r 274 | 275 | 276 | def prepare_grammar(): 277 | prepare_symbols_and_productions() 278 | get_nullable() 279 | get_first() 280 | get_follow() 281 | get_select() 282 | get_parsing_table() 283 | 284 | 285 | def do_parsing(): 286 | SYMBOL_STACK.append("#") 287 | SYMBOL_STACK.append("") 288 | 289 | token_tuple = next_token() 290 | productions = open("productions.txt", "w") 291 | stack = open("stack.txt", "w") 292 | while len(SYMBOL_STACK) > 0: 293 | stack_top_symbol = SYMBOL_STACK[-1] 294 | current_token = token_tuple[0] 295 | if current_token == "OP" or current_token == "SEP": 296 | current_token = token_tuple[1] 297 | 298 | if current_token == "SCANEOF": 299 | current_token = "#" 300 | 301 | if stack_top_symbol == "null": 302 | LAST_STACK_TOP_SYMBOL = SYMBOL_STACK.pop() 303 | continue 304 | 305 | if stack_top_symbol == "#": 306 | break 307 | 308 | if not is_terminal(stack_top_symbol): 309 | try: 310 | p = PARSING_TABLE[stack_top_symbol][current_token] 311 | except KeyError: 312 | # Stack top symbol unmatched, ignore it 313 | syntax_error("unmatched") 314 | token_tuple = next_token() 315 | continue 316 | 317 | if p == "SYNC": 318 | # SYNC recognized, pop Stack 319 | syntax_error("sync symbol, recovering") 320 | LAST_STACK_TOP_SYMBOL = SYMBOL_STACK.pop() 321 | stack.write(str(SYMBOL_STACK) + "\n") 322 | productions.write(str(p) + "\n") 323 | continue 324 | 325 | stack.write(str(SYMBOL_STACK) + "\n") 326 | productions.write(str(p) + "\n") 327 | LAST_STACK_TOP_SYMBOL = SYMBOL_STACK.pop() 328 | SYMBOL_STACK.extend(reversed(p.right)) 329 | 330 | else: 331 | SYMBOL_STACK.pop() 332 | token_tuple = next_token() 333 | 334 | productions.close() 335 | stack.close() 336 | 337 | 338 | def main(): 339 | prepare_grammar() 340 | lexer.read_source_file("1.c") 341 | do_parsing() 342 | print_symbol_table() 343 | 344 | 345 | if __name__ == "__main__": 346 | main() 347 | -------------------------------------------------------------------------------- /sema.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import lexer 4 | 5 | from util import Production, Symbol, Entry 6 | 7 | 8 | TERMINAL_SET = set() 9 | 10 | NON_TERMINAL_SET = set() 11 | 12 | SYMBOL_DICT = {} 13 | 14 | PRODUCTION_LIST = [] 15 | 16 | PARSING_TABLE = {} 17 | 18 | SEMA_ACTION_TABLE = {} 19 | 20 | SYMBOL_STACK = [] 21 | 22 | SYMBOL_TABLE = [] 23 | 24 | LAST_STACK_TOP_SYMBOL = None 25 | 26 | CODE_SIZE = 0 27 | 28 | CODE_RESULT = [] 29 | 30 | current_symbol_table_pos = 0 31 | current_symbol_index = 0 32 | 33 | CURRENT_CONDITION_NODE = None 34 | 35 | 36 | def P11(): 37 | symbol_for_str(LAST_STACK_TOP_SYMBOL).father.attr["type"] = "int" 38 | symbol_for_str(LAST_STACK_TOP_SYMBOL).father.attr["length"] = 4 39 | 40 | 41 | def P12(): 42 | symbol_for_str(LAST_STACK_TOP_SYMBOL).father.attr["type"] = "float" 43 | symbol_for_str(LAST_STACK_TOP_SYMBOL).father.attr["length"] = 4 44 | 45 | 46 | def P13(): 47 | symbol_for_str(LAST_STACK_TOP_SYMBOL).father.attr["type"] = "double" 48 | symbol_for_str(LAST_STACK_TOP_SYMBOL).father.attr["length"] = 8 49 | 50 | 51 | def P14(): 52 | symbol_for_str(LAST_STACK_TOP_SYMBOL).father.attr["type"] = "short" 53 | symbol_for_str(LAST_STACK_TOP_SYMBOL).father.attr["length"] = 2 54 | 55 | 56 | def P15(): 57 | symbol_for_str(LAST_STACK_TOP_SYMBOL).father.attr["type"] = "long" 58 | symbol_for_str(LAST_STACK_TOP_SYMBOL).father.attr["length"] = 4 59 | 60 | 61 | def P21(): 62 | symbol_for_str(LAST_STACK_TOP_SYMBOL).father.attr["type"] = ( 63 | symbol_for_str(LAST_STACK_TOP_SYMBOL).father.children[0].attr["type"] 64 | ) 65 | symbol_for_str(LAST_STACK_TOP_SYMBOL).father.attr["length"] = ( 66 | symbol_for_str(LAST_STACK_TOP_SYMBOL).father.children[0].attr["length"] 67 | ) 68 | 69 | 70 | def P22(): 71 | global current_symbol_table_pos 72 | global current_symbol_index 73 | s = symbol_for_str(LAST_STACK_TOP_SYMBOL).father.children[0] 74 | SYMBOL_TABLE.append(Entry(s.attr["type"], s.attr["length"], s.attr["name"])) 75 | current_symbol_index += 1 76 | current_symbol_table_pos += s.attr["length"] 77 | 78 | 79 | def P31(): 80 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father 81 | f.attr["name"] = f.children[1].lexical_value 82 | 83 | 84 | def P41(): 85 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father 86 | f.attr["type"] = "int" 87 | f.attr["value"] = f.children[0].lexical_value 88 | 89 | 90 | def P42(): 91 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father 92 | f.attr["type"] = "float" 93 | f.attr["value"] = float(f.children[0].lexical_value) 94 | 95 | 96 | def P43(): 97 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father 98 | f.attr["type"] = "short" 99 | f.attr["value"] = f.children[0].lexical_value 100 | 101 | 102 | def P44(): 103 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father 104 | f.attr["type"] = "long" 105 | f.attr["value"] = f.children[0].lexical_value 106 | 107 | 108 | def P51(): 109 | pass 110 | 111 | 112 | def P52(): 113 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father.father 114 | f.attr["type"] = f.children[0].attr["type"] 115 | f.attr["value"] = f.children[0].attr["value"] 116 | 117 | 118 | def P61(): 119 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father 120 | if len(f.children) < 3: 121 | f = f.father.father.father.father 122 | 123 | l = f.children[0] 124 | r = f.children[2] 125 | 126 | fac = f.children[4] 127 | 128 | lv = search_for_symbol(l.lexical_value) 129 | if lv is None: 130 | syntax_error("undefined " + l.lexical_value) 131 | return 132 | 133 | if lv.type != r.attr["type"]: 134 | syntax_error("type mismatch") 135 | return 136 | 137 | result = None 138 | if "op" in fac.attr: 139 | if fac.attr["op"] == "+": 140 | result = f.attr["value"] + fac.attr["factor"] 141 | 142 | if fac.attr["op"] == "*": 143 | result = f.attr["value"] * fac.attr["factor"] 144 | else: 145 | result = r.attr["value"] 146 | fac.attr = {} 147 | 148 | code_output(lv.name + " := " + str(result)) 149 | 150 | 151 | def P62(): 152 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father.father.father.father 153 | f.attr["type"] = f.children[2].attr["type"] 154 | f.attr["value"] = f.children[2].attr["value"] 155 | 156 | 157 | def P71(): 158 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father.father.father 159 | f.attr["type"] = f.children[0].attr["type"] 160 | f.attr["value"] = f.children[0].attr["value"] 161 | 162 | 163 | def P72(): 164 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father.father.father 165 | f.attr["type"] = f.children[0].attr["type"] 166 | f.attr["value"] = f.children[0].attr["value"] + 1 167 | 168 | 169 | def P73(): 170 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father.father.father 171 | f.attr["type"] = f.children[0].attr["type"] 172 | f.attr["value"] = f.children[0].attr["value"] - 1 173 | 174 | 175 | def P81(): 176 | global CURRENT_CONDITION_NODE 177 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father 178 | CURRENT_CONDITION_NODE = f 179 | e = f.children[2] 180 | code_output("IF " + str(e.attr["value"]) + " GOTO " + str(CODE_SIZE + 2)) 181 | code_output(None) 182 | f.attr["back"] = CODE_SIZE - 1 183 | 184 | 185 | def P82(): 186 | prev = CURRENT_CONDITION_NODE.attr["back"] 187 | CODE_RESULT[prev] = "GOTO " + str(CODE_SIZE) 188 | 189 | 190 | def P91(): 191 | global CURRENT_CONDITION_NODE 192 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father 193 | CURRENT_CONDITION_NODE = f 194 | e = f.children[2] 195 | code_output("IF " + str(e.attr["value"]) + " GOTO " + str(CODE_SIZE + 2)) 196 | code_output(None) 197 | f.attr["back"] = CODE_SIZE - 1 198 | 199 | 200 | def P92(): 201 | prev = CURRENT_CONDITION_NODE.attr["back"] 202 | CODE_RESULT[prev] = "GOTO " + str(CODE_SIZE + 1) 203 | code_output("GOTO " + str(prev - 1)) 204 | 205 | 206 | def P101(): 207 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father.father.father.father 208 | f.attr["op"] = f.children[0].lexical_value 209 | f.attr["factor"] = f.children[1].attr["value"] 210 | 211 | 212 | def P102(): 213 | f = symbol_for_str(LAST_STACK_TOP_SYMBOL).father.father.father.father 214 | f.attr["op"] = f.children[0].lexical_value 215 | f.attr["factor"] = f.children[1].attr["value"] 216 | 217 | 218 | def no_action(): 219 | pass 220 | 221 | 222 | SEMA_ACTION_TABLE["P11"] = P11 223 | SEMA_ACTION_TABLE["P12"] = P12 224 | SEMA_ACTION_TABLE["P13"] = P13 225 | SEMA_ACTION_TABLE["P14"] = P14 226 | SEMA_ACTION_TABLE["P15"] = P15 227 | SEMA_ACTION_TABLE["P21"] = P21 228 | SEMA_ACTION_TABLE["P22"] = P22 229 | SEMA_ACTION_TABLE["P31"] = P31 230 | SEMA_ACTION_TABLE["P41"] = P41 231 | SEMA_ACTION_TABLE["P42"] = P42 232 | SEMA_ACTION_TABLE["P43"] = P43 233 | SEMA_ACTION_TABLE["P44"] = P44 234 | SEMA_ACTION_TABLE["P51"] = P51 235 | SEMA_ACTION_TABLE["P52"] = P52 236 | SEMA_ACTION_TABLE["P61"] = P61 237 | SEMA_ACTION_TABLE["P62"] = P62 238 | SEMA_ACTION_TABLE["P71"] = P71 239 | SEMA_ACTION_TABLE["P72"] = P72 240 | SEMA_ACTION_TABLE["P73"] = P73 241 | SEMA_ACTION_TABLE["P81"] = P81 242 | SEMA_ACTION_TABLE["P82"] = P82 243 | SEMA_ACTION_TABLE["P91"] = P91 244 | SEMA_ACTION_TABLE["P92"] = P92 245 | SEMA_ACTION_TABLE["P101"] = P101 246 | SEMA_ACTION_TABLE["P102"] = P102 247 | 248 | SEMA_ACTION_TABLE["null"] = no_action 249 | 250 | 251 | def symbol_for_str(string): 252 | return SYMBOL_DICT[string] 253 | 254 | 255 | def is_terminal(string): 256 | return string in TERMINAL_SET 257 | 258 | 259 | def syntax_error(msg, line=None, row=None): 260 | if line is None: 261 | line = lexer.current_line + 1 262 | if row is None: 263 | row = lexer.current_row + 1 264 | print(str(line) + ":" + str(row) + " Syntax error: " + msg) 265 | 266 | 267 | def code_output(code): 268 | global CODE_SIZE 269 | CODE_SIZE += 1 270 | CODE_RESULT.append(code) 271 | 272 | 273 | def search_for_symbol(name): 274 | for e in SYMBOL_TABLE: 275 | if e.name == name: 276 | return e 277 | 278 | 279 | def prepare_symbols_and_productions(): 280 | f = open("grammer.txt", "r") 281 | lines = f.readlines() 282 | terminal = False 283 | production = False 284 | for l in lines: 285 | if l.strip() == "*terminals": 286 | terminal = True 287 | production = False 288 | continue 289 | if l.strip() == "*productions": 290 | terminal = False 291 | production = True 292 | continue 293 | if l.strip() == "*end": 294 | break 295 | if terminal: 296 | TERMINAL_SET.update([l.strip()]) 297 | if production: 298 | left = l.split("::=")[0].strip() 299 | NON_TERMINAL_SET.update([left]) 300 | 301 | try: 302 | right = l.split("::=")[1].strip() 303 | if right == "": 304 | raise IndexError 305 | p = Production(left, right.split(" ")) 306 | except IndexError: 307 | p = Production(left, ["null"]) 308 | 309 | PRODUCTION_LIST.append(p) 310 | 311 | for s in TERMINAL_SET: 312 | sym = Symbol(s, sym_type="T") 313 | SYMBOL_DICT[s] = sym 314 | 315 | for s in NON_TERMINAL_SET: 316 | sym = Symbol(s, sym_type="N") 317 | SYMBOL_DICT[s] = sym 318 | 319 | 320 | def get_nullable(): 321 | """ 322 | Calculate and mark non-terminals found that is nullable(can derive null). 323 | We do this first, so we can use the result when calculating First and Follow. 324 | """ 325 | changes = True 326 | while changes: 327 | changes = False 328 | for p in PRODUCTION_LIST: 329 | if not symbol_for_str(p.left).is_nullable: 330 | if p.right[0] == "null": 331 | symbol_for_str(p.left).is_nullable = True 332 | changes = True 333 | continue 334 | else: 335 | right_is_nullable = symbol_for_str(p.right[0]).is_nullable 336 | # For X -> Y1 ... YN, Nullable(X) = Nullable(Y1) & 337 | # Nullable(Y2) ... & Nullable(YN) 338 | for r in p.right[1:]: 339 | if r.startswith("P"): 340 | continue 341 | right_is_nullable = ( 342 | right_is_nullable & symbol_for_str(r).is_nullable 343 | ) 344 | 345 | if right_is_nullable: 346 | changes = True 347 | symbol_for_str(p.left).is_nullable = True 348 | 349 | 350 | def get_first(): 351 | """ 352 | Calculate First set of each symbol. 353 | """ 354 | for s in TERMINAL_SET: 355 | # For each terminal, initialize First with itself. 356 | sym = SYMBOL_DICT[s] 357 | sym.first_set = set([s]) 358 | 359 | for s in NON_TERMINAL_SET: 360 | sym = SYMBOL_DICT[s] 361 | if sym.is_nullable: 362 | sym.first_set = set(["null"]) 363 | else: 364 | sym.first_set = set() 365 | 366 | while True: 367 | first_set_is_stable = True 368 | for p in PRODUCTION_LIST: 369 | sym_left = symbol_for_str(p.left) 370 | if p.right[0] == "null": 371 | sym_left.first_set.update(set(["null"])) 372 | continue 373 | previous_first_set = set(sym_left.first_set) 374 | 375 | for s in p.right: 376 | # For X -> Y..., First(X) = First(X) U First(Y) 377 | sym_right = symbol_for_str(s) 378 | sym_left.first_set.update(sym_right.first_set) 379 | # For X -> Y1 Y2 ... Yi-1 , if Y1...Yi-1 is all nullable 380 | # Then First(X) = First(X) U First(Y1) U First(Y2) ... 381 | if sym_right.is_nullable: 382 | continue 383 | else: 384 | break 385 | 386 | if previous_first_set != sym_left.first_set: 387 | first_set_is_stable = False 388 | 389 | if first_set_is_stable: 390 | break 391 | 392 | 393 | def get_follow(): 394 | """ 395 | Calculate Follow set of each symbol. 396 | """ 397 | for s in NON_TERMINAL_SET: 398 | sym = symbol_for_str(s) 399 | sym.follow_set = set() 400 | 401 | symbol_for_str("").follow_set.update(set(["#"])) 402 | 403 | while True: 404 | follow_set_is_stable = True 405 | for p in PRODUCTION_LIST: 406 | sym_left = symbol_for_str(p.left) 407 | if sym_left.is_terminal(): 408 | continue 409 | for s in p.right: 410 | if s == "null": 411 | continue 412 | if s.startswith("P"): 413 | continue 414 | if symbol_for_str(s).is_terminal(): 415 | continue 416 | current_symbol = symbol_for_str(s) 417 | previous_follow_set = set(current_symbol.follow_set) 418 | next_is_nullable = True 419 | for s2 in p.right[p.right.index(s) + 1 :]: 420 | if s2.startswith("P"): 421 | continue 422 | # For X -> sYt, Follow(Y) = Follow(Y) U First(t) 423 | next_symbol = symbol_for_str(s2) 424 | current_symbol.follow_set.update(next_symbol.first_set) 425 | if next_symbol.is_nullable: 426 | continue 427 | else: 428 | next_is_nullable = False 429 | break 430 | if next_is_nullable: 431 | # For X -> sYt, if t is nullable, Follow(Y) = Follow(Y) U 432 | # Follow(X) 433 | current_symbol.follow_set.update(sym_left.follow_set) 434 | 435 | if current_symbol.follow_set != previous_follow_set: 436 | follow_set_is_stable = False 437 | 438 | if follow_set_is_stable: 439 | break 440 | 441 | 442 | def get_select(): 443 | """ 444 | Calculate Select set for each production. 445 | """ 446 | while True: 447 | select_set_is_stable = True 448 | for p in PRODUCTION_LIST: 449 | sym_left = symbol_for_str(p.left) 450 | previous_select = set(p.select) 451 | if p.right[0] == "null": 452 | # For A -> a, if a is null, Select(i) = Follow(A) 453 | p.select.update(sym_left.follow_set) 454 | continue 455 | sym_right = symbol_for_str(p.right[0]) 456 | # Otherwise, Select(i) = First(a) 457 | p.select.update(sym_right.first_set) 458 | # If a is nullable, Select(i) = First(a) U Follow(A) 459 | if sym_right.is_nullable: 460 | p.select.update(sym_right.first_set.union(sym_left.follow_set)) 461 | if previous_select != p.select: 462 | select_set_is_stable = False 463 | if select_set_is_stable: 464 | break 465 | 466 | 467 | def get_parsing_table(): 468 | """ 469 | Calculate parsing table. 470 | """ 471 | global PARSING_TABLE 472 | for non_terminal in NON_TERMINAL_SET: 473 | if non_terminal.startswith("P"): 474 | continue 475 | PARSING_TABLE[non_terminal] = {} 476 | for p in PRODUCTION_LIST: 477 | if non_terminal == p.left: 478 | for symbol in p.select: 479 | PARSING_TABLE[non_terminal][symbol] = p 480 | # Calculate SYNC 481 | for symbol in symbol_for_str(non_terminal).follow_set: 482 | if is_terminal(symbol): 483 | try: 484 | p = PARSING_TABLE[non_terminal][symbol] 485 | except KeyError: 486 | PARSING_TABLE[non_terminal][symbol] = "SYNC" 487 | 488 | for symbol in symbol_for_str(non_terminal).first_set: 489 | if is_terminal(symbol): 490 | try: 491 | p = PARSING_TABLE[non_terminal][symbol] 492 | except KeyError: 493 | PARSING_TABLE[non_terminal][symbol] = "SYNC" 494 | 495 | # prettyprint_parsing_table() 496 | 497 | 498 | def prettyprint_parsing_table(): 499 | for non_terminal in PARSING_TABLE.keys(): 500 | symbol_to_production_list = [] 501 | for symbol in PARSING_TABLE[non_terminal]: 502 | p = PARSING_TABLE[non_terminal][symbol] 503 | symbol_to_production = str(symbol) + ":" + str(p) 504 | symbol_to_production_list.append(symbol_to_production) 505 | 506 | print(non_terminal) 507 | print(symbol_to_production_list) 508 | 509 | 510 | def print_symbol_table(): 511 | for t in SYMBOL_TABLE: 512 | print(t) 513 | 514 | 515 | def print_code_result(): 516 | for r in CODE_RESULT: 517 | print(str(CODE_RESULT.index(r)) + ": " + r) 518 | 519 | 520 | def next_token(): 521 | r = lexer.scanner() 522 | while r is None: 523 | r = lexer.scanner() 524 | return r 525 | 526 | 527 | def prepare_grammar(): 528 | prepare_symbols_and_productions() 529 | get_nullable() 530 | get_first() 531 | get_follow() 532 | get_select() 533 | get_parsing_table() 534 | 535 | 536 | def do_sema_actions(symbol): 537 | SEMA_ACTION_TABLE[symbol]() 538 | 539 | 540 | def do_parsing(): 541 | global LAST_STACK_TOP_SYMBOL 542 | SYMBOL_STACK.append("#") 543 | SYMBOL_STACK.append("") 544 | 545 | token_tuple = next_token() 546 | productions = open("productions.txt", "w") 547 | stack = open("stack.txt", "w") 548 | while len(SYMBOL_STACK) > 0: 549 | stack_top_symbol = SYMBOL_STACK[-1] 550 | while stack_top_symbol == "null": 551 | SYMBOL_STACK.pop() 552 | stack_top_symbol = SYMBOL_STACK[-1] 553 | 554 | if stack_top_symbol.startswith("P"): 555 | do_sema_actions(stack_top_symbol) 556 | SYMBOL_STACK.pop() 557 | stack.write(str(SYMBOL_STACK) + "\n") 558 | continue 559 | current_token = token_tuple[0] 560 | if current_token == "OP" or current_token == "SEP": 561 | current_token = token_tuple[1] 562 | 563 | if current_token == "SCANEOF": 564 | current_token = "#" 565 | 566 | if stack_top_symbol == "null": 567 | LAST_STACK_TOP_SYMBOL = SYMBOL_STACK.pop() 568 | continue 569 | 570 | if stack_top_symbol == "#": 571 | break 572 | 573 | if not is_terminal(stack_top_symbol): 574 | try: 575 | p = PARSING_TABLE[stack_top_symbol][current_token] 576 | except KeyError: 577 | # Stack top symbol unmatched, ignore it 578 | syntax_error("unmatched") 579 | token_tuple = next_token() 580 | continue 581 | 582 | if p == "SYNC": 583 | # SYNC recognized, pop Stack 584 | syntax_error("sync symbol, recovering") 585 | LAST_STACK_TOP_SYMBOL = SYMBOL_STACK.pop() 586 | stack.write(str(SYMBOL_STACK) + "\n") 587 | productions.write(str(p) + "\n") 588 | continue 589 | 590 | stack.write(str(SYMBOL_STACK) + "\n") 591 | productions.write(str(p) + "\n") 592 | LAST_STACK_TOP_SYMBOL = SYMBOL_STACK.pop() 593 | SYMBOL_STACK.extend(reversed(p.right)) 594 | symbol_for_str((LAST_STACK_TOP_SYMBOL)).children = [] 595 | for symbol in p.right: 596 | if symbol.startswith("P"): 597 | symbol_for_str(LAST_STACK_TOP_SYMBOL).children.append(symbol) 598 | continue 599 | 600 | if symbol == "null": 601 | continue 602 | t = symbol_for_str(symbol) 603 | symbol_for_str(LAST_STACK_TOP_SYMBOL).children.append(t) 604 | t.father = symbol_for_str(LAST_STACK_TOP_SYMBOL) 605 | 606 | else: 607 | symbol_for_str(stack_top_symbol).lexical_value = token_tuple[1] 608 | LAST_STACK_TOP_SYMBOL = SYMBOL_STACK.pop() 609 | stack.write(str(SYMBOL_STACK) + "\n") 610 | token_tuple = next_token() 611 | 612 | productions.close() 613 | stack.close() 614 | 615 | 616 | def main(): 617 | prepare_grammar() 618 | lexer.read_source_file("1.c") 619 | do_parsing() 620 | print("SYMBOL TABLE") 621 | print("------------") 622 | print_symbol_table() 623 | print("\n") 624 | print("CODE") 625 | print("------------") 626 | print_code_result() 627 | 628 | 629 | if __name__ == "__main__": 630 | main() 631 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | class Production(object): 2 | def __init__(self, left, right, select=None): 3 | self.left = left 4 | self.right = right 5 | self.select = set() 6 | 7 | def __str__(self): 8 | return self.left + " -> " + str(self.right) + " Select: " + str(self.select) 9 | 10 | 11 | class Symbol(object): 12 | def __init__(self, symbol, first_set=None, follow_set=None, sym_type="N"): 13 | self.symbol = symbol 14 | self.first_set = first_set 15 | self.follow_set = follow_set 16 | self.sym_type = sym_type 17 | self.is_nullable = False 18 | self.attr = {} 19 | self.father = None 20 | self.children = [] 21 | self.lexical_value = None 22 | 23 | def __str__(self): 24 | return ( 25 | self.symbol 26 | + " Derive_empty:" 27 | + str(self.is_nullable) 28 | + " First:" 29 | + str(self.first_set) 30 | + " Follow:" 31 | + str(self.follow_set) 32 | ) 33 | 34 | def is_terminal(self): 35 | return self.sym_type == "T" 36 | 37 | 38 | class Entry(object): 39 | def __init__(self, type, length, name): 40 | self.type = type 41 | self.length = length 42 | self.name = name 43 | 44 | def __str__(self): 45 | return self.name + " " + self.type + " " + str(self.length) 46 | --------------------------------------------------------------------------------