├── .gitignore ├── README.md ├── lex_grammar.txt ├── lexical_analyze.py ├── nfa_and_dfa.py ├── sample_syn_grammar.txt ├── sample_token_table.txt ├── source.cc ├── syn_grammar.txt ├── syntax_analyze.py └── token_table.data /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | .idea/ 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Compiler 2 | ------- 3 | A Python-based lexical analyzer and LR(1) parser 4 | 5 | ## Overview 6 | * Language: Python 2.7.11 7 | * Platform: Ubuntu 16.04 8 | * Editor: Sublime 9 | * What is implemented: a lexical analyzer driven by a type-3 (regular) grammar and an LR(1) syntax analyzer driven by a type-2 (context-free) grammar. 10 | * Test grammar: a grammar for a C-like language 11 | * Test program: a program written in that C-like language. 12 | 13 | ## Files 14 | The project involves the following seven files: 15 | 16 | lexical_analyze.py    the lexical analyzer 17 | 18 | syntax_analyze.py    the syntax analyzer 19 | 20 | nfa_and_dfa.py    defines the NFA and DFA classes and their node types 21 | 22 | lex_grammar.txt    the lexical grammar 23 | 24 | syn_grammar.txt    the syntax grammar 25 | 26 | source.cc    the C-like program to be analyzed 27 | 28 | token_table.data    the token table produced by the lexical analyzer 29 | 30 | ## Lexical grammar 31 | I wrote a type-3 (regular) grammar as the lexical grammar the program reads in. To simplify reading the grammar file, the textbook's derivation symbol '->' is replaced by ':'; the left of ':' is the left-hand side of a production, the right is its right-hand side, and epsilon on the right-hand side of an empty production is written as '$'. 32 | Tokens fall into five broad categories (identifier, limiter, operator, number, and string), and each category has its own expressions and derivations. 33 | 34 | ## Syntax grammar 35 | Only once I started writing the grammar did I realize that the hardest part of this course project is constructing the grammar itself. After a long time I still could not come up with a grammar that was both usable and satisfying, so I consulted http://www.nongnu.org/hcb/ and eventually wrote a type-2 (context-free) grammar. 36 | As with the lexical grammar, ':' replaces the textbook's '->' and '$' stands for epsilon as the right-hand side of an empty production. 37 | 38 | ## Lexical analyzer 39 | The lexical analyzer reads a type-3 grammar and first determines its terminals and nonterminals: every symbol that appears on the left-hand side of a production is a nonterminal, and the terminals are the difference between the set of all symbols and the set of nonterminals. It then builds an NFA from the grammar following the textbook algorithm and converts the NFA to a DFA, again following the textbook algorithm; at that point the grammar processing is finished. 40 | Next it scans the program to be analyzed: characters are fed into the DFA one at a time, and when the DFA cannot accept the next character it checks whether the current state is an accepting state. If it is, a token has been recognized; otherwise lexical analysis fails. 41 | 42 | ## Syntax analyzer 43 | The syntax analyzer first reads the type-2 grammar and computes its terminals and nonterminals in the same way as above. It then computes the FIRST set of every grammar symbol: the FIRST set of a terminal is the terminal itself, and the FIRST sets of nonterminals are computed by a memoized recursive search. The grammar is augmented with S' -> S, #; expanding this item yields the initial item set I0, the remaining item sets are derived from I0, and the LR(1) parsing table is built at the same time. 44 | With the LR(1) table in hand, parsing only has to read the tokens produced by the lexical analyzer one by one and look up the shift or reduce action in the table. If the final state is acc (accept), the input conforms to the grammar; if acc is never reached, or no matching action exists in the table, there is a syntax error. 45 |
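To make the maximal-munch scan described in the lexical analyzer section concrete, here is a minimal sketch of running a DFA over an input line. It is an illustration only, not the project's `run_on_dfa`: the project builds its DFA from lex_grammar.txt, whereas the tiny DFA, the character classes, and the names `edges`, `accepting`, and `next_token` below are hand-written for this example.

```python
import string

# Hand-written DFA for the illustration: it recognizes identifiers
# ([A-Za-z_][A-Za-z0-9_]*) and integers ([0-9]+).
def char_class(c):
    if c in string.ascii_letters + '_':
        return 'alpha'
    if c in string.digits:
        return 'digit'
    return None

# edges maps (state, character class) -> next state; 'id' and 'num' accept.
edges = {
    ('start', 'alpha'): 'id', ('id', 'alpha'): 'id', ('id', 'digit'): 'id',
    ('start', 'digit'): 'num', ('num', 'digit'): 'num',
}
accepting = {'id': 'identifier', 'num': 'number'}

def next_token(line, pos):
    """Maximal munch: consume characters until the DFA gets stuck,
    then succeed only if the state reached is an accepting state."""
    state, cur = 'start', pos
    while cur < len(line) and (state, char_class(line[cur])) in edges:
        state = edges[(state, char_class(line[cur]))]
        cur += 1
    if state in accepting:
        return cur, accepting[state], line[pos:cur]
    return pos, None, ''        # lexical error at this position

assert next_token('abc1 = 5', 0) == (4, 'identifier', 'abc1')
assert next_token('abc1 = 5', 7) == (8, 'number', '5')
```

Likewise, to make the table-driven shift/reduce loop of the syntax analyzer concrete, here is a minimal sketch of an LR(1) driver. Again this is illustration only, not the project's `run_on_lr_dfa`: the toy grammar (`S -> a S | b`, with the augmented start symbol written `S1`, mirroring `start1` in sample_syn_grammar.txt), its hand-written ACTION/GOTO table, and the names `productions`, `table`, and `parse` are invented for this example.

```python
# Toy grammar:  0: S1 -> S    1: S -> a S    2: S -> b
# productions[i] = (left-hand side, length of the right-hand side)
productions = [('S1', 1), ('S', 2), ('S', 1)]

# table[state][symbol]: ('s', n) shift to state n, ('r', i) reduce by
# production i, ('g', n) goto state n after reducing to a nonterminal.
table = {
    0: {'a': ('s', 1), 'b': ('s', 2), 'S': ('g', 3)},
    1: {'a': ('s', 1), 'b': ('s', 2), 'S': ('g', 4)},
    2: {'#': ('r', 2)},
    3: {'#': ('r', 0)},   # reducing by the augmented production means accept
    4: {'#': ('r', 1)},
}

def parse(tokens):
    tokens = tokens + ['#']       # end marker, as in token_table handling
    states = [0]
    pos = 0
    while True:
        action = table[states[-1]].get(tokens[pos])
        if action is None:
            return False          # no table entry: syntax error
        kind, arg = action
        if kind == 's':           # shift: push the new state, consume the token
            states.append(arg)
            pos += 1
        else:                     # reduce by production arg
            if arg == 0:
                return True       # reduced by S1 -> S: accept
            left, rhs_len = productions[arg]
            del states[-rhs_len:]                      # pop |rhs| states
            states.append(table[states[-1]][left][1])  # goto on the nonterminal

assert parse(['a', 'a', 'b'])     # S => a S => a a S => a a b
assert not parse(['a', 'a'])      # missing the final b: rejected
```

The project's `run_on_lr_dfa` follows the same scheme, except that on a reduce it pushes the reduced nonterminal back onto the remaining input and lets the next table lookup perform the goto, and it handles epsilon ('$') productions by popping nothing.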
46 | ## Code description 47 | ### Utility classes (nfa_and_dfa.py) 48 | ```python 49 | class NFANode(object) # node structure of the NFA 50 | def __init__(self, name, _type) # constructor; takes a name and a type 51 | def add_edge(self, alpha, target) # add an edge to the node 52 | 53 | class NFA(object) # the NFA structure 54 | def __init__(self, alphabets) # constructor; takes the alphabet set 55 | def get_target(self, cur_status, alpha) # given the current state and an input character, return the next state 56 | 57 | class DFANode(object) # node structure of the DFA 58 | def __init__(self, name, _type) # constructor; takes a name and a type 59 | def add_edge(self, alpha, target) # add an edge to the node 60 | 61 | class LRDFANode(object) # node structure of the LR(1) DFA 62 | def __init__(self, set_id) # constructor; takes the node's item-set id 63 | def add_object_set(self, id, left, right, index, tail) # add a production to the item set 64 | def add_object_set_by_set(self, object_set) # add a whole set of productions to the item set 65 | 66 | class DFA(object) # the DFA structure 67 | def __init__(self, alphabets) # constructor; takes the alphabet set 68 | def get_target(self, cur_status, alpha) # given the current state and an input character, return the next state 69 | ``` 70 | ### Lexical analyzer (lexical_analyze.py) 71 | ```python 72 | class LexicalAnalyze(object) # operations for lexical analysis 73 | def read_lex_grammar(self, file_name) # read the lexical grammar; the argument is the path of the grammar file 74 | def create_nfa(self) # build the NFA from the grammar that was read 75 | def get_create_nfa_node(name, _type) # create a new node or return an existing one 76 | def nfa_to_dfa(self) # convert the NFA to a DFA 77 | def get_create_dfaNode(name, _type) # create a new node or return an existing one 78 | def run_on_dfa(self, line, pos) # run a line of input on the DFA to produce a token 79 | def read_and_analyze(self, file_name) # read the input to be analyzed and generate the token_table 80 | def main() # entry point: create a LexicalAnalyze object and perform the whole lexical analysis 81 | ``` 82 | ### Syntax analyzer (syntax_analyze.py) 83 | ```python 84 | class SyntaxAnalyze(object) # operations for syntax analysis 85 | def __init__(self) # constructor 86 | def read_syntax_grammar(self, file_name) # read the grammar needed for parsing; takes the file name 87 | def get_terminate_noterminate(self) # compute the grammar's terminals and nonterminals 88 | def __get_first_set(self, cur_status, all_elem) # recursively compute the FIRST set of a nonterminal 89 | def init_first_set(self) # initialize the FIRST sets of all symbols 90 | def create_lr_dfa(self) # build the item-set DFA and construct the parsing table at the same time 91 | def create_get_lr_dfa_node(set_id) # create a new node or return an existing one 92 | def expand_production(self, cur_production, ex_object_set) # compute the item set (closure) generated by a production 93 | def run_on_lr_dfa(self, tokens) # parse the token_table and report the result 94 | def read_and_analyze(self, file_name) # read the token_table 95 | def main() # entry point: create a SyntaxAnalyze object, run all the steps, and print the result 96 | ``` 97 | ### Run 98 | ```shell 99 | python lexical_analyze.py # generate token_table 100 | python syntax_analyze.py # print the result of syntax analysis 101 | ``` 102 | -------------------------------------------------------------------------------- /lex_grammar.txt: -------------------------------------------------------------------------------- 1 | digit:0|1|2|3|4|5|6|7|8|9 2 | nozero_digit:1|2|3|4|5|6|7|8|9 3 | alphabet:a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z 4 | key_word:int|double|char|float|const|auto|break|case|continue|do|while|if|else|switch|enum|for|do|long|short|static|struct|typedef|unsigned|void|return|break|scanf|print|function 5 | start:$ identifier 6 | start:$ limiter 7 | start:$ operator 8 | start:$ string 9 | start:$ number 10 | limiter:, 11 | limiter:; 12 | limiter:[ 13 | limiter:] 14 | limiter:( 15 | limiter:) 16 | limiter:{ 17 | limiter:} 18 | operator:+ 19 | operator:- 20 | operator:* 21 | operator:/ 22 | operator:! 
23 | operator:% 24 | operator:^ 25 | operator:& 26 | operator:= 27 | operator:~ 28 | operator:" 29 | operator:<< 30 | operator:>> 31 | operator:++ 32 | operator:-- 33 | operator:> 34 | operator:< 35 | identifier:_ 36 | identifier:alphabet 37 | identifier:_ identifier_tail 38 | identifier:alphabet identifier_tail 39 | identifier_tail:_ 40 | identifier_tail:digit 41 | identifier_tail:alphabet 42 | identifier_tail:_ identifier_tail 43 | identifier_tail:digit identifier_tail 44 | identifier_tail:alphabet identifier_tail 45 | number:digit 46 | number:nozero_digit number_tail 47 | number_tail:digit 48 | number_tail:digit number_tail 49 | number_tail:e number_tail 50 | string:" string_tail 51 | string_tail:, string_tail 52 | string_tail:; string_tail 53 | string_tail:[ string_tail 54 | string_tail:] string_tail 55 | string_tail:( string_tail 56 | string_tail:) string_tail 57 | string_tail:{ string_tail 58 | string_tail:} string_tail 59 | string_tail:. string_tail 60 | string_tail:+ string_tail 61 | string_tail:- string_tail 62 | string_tail:* string_tail 63 | string_tail:/ string_tail 64 | string_tail:! string_tail 65 | string_tail:% string_tail 66 | string_tail:^ string_tail 67 | string_tail:& string_tail 68 | string_tail:= string_tail 69 | string_tail:~ string_tail 70 | string_tail:" string_tail 71 | string_tail:" string_tail 72 | string_tail:< string_tail 73 | string_tail:> string_tail 74 | string_tail:_ string_tail 75 | string_tail:# string_tail 76 | string_tail:digit string_tail 77 | string_tail:alphabet string_tail 78 | string_tail:" -------------------------------------------------------------------------------- /lexical_analyze.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from nfa_and_dfa import NFA, DFA, NFANode, DFANode 5 | 6 | 7 | class LexicalAnalyze(object): 8 | def __init__(self): 9 | super(LexicalAnalyze, self).__init__() 10 | self.productions = [] 11 | self.key_words = {} 12 | self.tool_set = {} 13 | self.NFA = None 14 | self.DFA = None 15 | 16 | def read_lex_grammar(self, file_name): 17 | cur_left = None 18 | cur_right = [] 19 | line_num = 0 20 | for line in open(file_name, 'r'): 21 | line = line.split('\n')[0] 22 | index = line.find(':') 23 | cur_left = line[0:index] 24 | cur_right = line[index + 1:len(line)] 25 | line_num += 1 26 | if line_num < 4: 27 | self.tool_set[cur_left] = set(cur_right.split('|')) 28 | continue 29 | elif line_num == 4: 30 | for word in set(cur_right.split('|')): 31 | self.key_words[word] = cur_left 32 | continue 33 | production = {} 34 | production['left'] = cur_left 35 | index = cur_right.find(' ') 36 | if index != -1: 37 | production['input'] = cur_right[0:index] 38 | production['right'] = cur_right[index + 1:len(cur_right)] 39 | else: 40 | production['input'] = cur_right 41 | production['right'] = None 42 | self.productions.append(production) 43 | 44 | def create_nfa(self): 45 | all_status = {} 46 | 47 | def get_create_nfa_node(name, _type): 48 | if name in all_status: 49 | node = all_status[name] 50 | else: 51 | node = NFANode(name=name, _type=_type) 52 | return node 53 | 54 | start_node = get_create_nfa_node('start', 0) 55 | end_node = get_create_nfa_node('end', 1) 56 | all_status['start'] = start_node 57 | all_status['end'] = end_node 58 | for produ in self.productions: 59 | name = produ['left'] 60 | alpha = produ['input'] 61 | right = produ['right'] 62 | node = get_create_nfa_node(name, 0) 63 | if right is not None: 64 | target_node = 
get_create_nfa_node(right, 0) 65 | if alpha not in self.tool_set.keys(): 66 | if right is None: 67 | node.add_edge(alpha, 'end') 68 | else: 69 | if right in self.tool_set: 70 | for val in self.tool_set[right]: 71 | node.add_edge(alpha, val) 72 | else: 73 | node.add_edge(alpha, right) 74 | else: 75 | for val in self.tool_set[alpha]: 76 | if right is None: 77 | node.add_edge(val, 'end') 78 | else: 79 | if right in self.tool_set: 80 | for val in self.tool_set[right]: 81 | node.add_edge(alpha, val) 82 | else: 83 | node.add_edge(alpha, right) 84 | node.add_edge(val, right) 85 | all_status[name] = node 86 | if right is not None: 87 | all_status[right] = target_node 88 | 89 | alphabets = set() 90 | for i in range(ord(' '), ord('~') + 1): 91 | alphabets.add(chr(i)) 92 | self.NFA = NFA(alphabets) 93 | self.NFA.status = all_status 94 | 95 | def nfa_to_dfa(self): 96 | all_status = {} 97 | 98 | def get_create_dfaNode(name, _type): 99 | if name in all_status: 100 | return all_status[name] 101 | else: 102 | node = DFANode(name, _type) 103 | return node 104 | for node_name in self.NFA.status['start'].edge['$']: 105 | start_node = get_create_dfaNode('start', 0) 106 | dfa_node = get_create_dfaNode(node_name, 0) 107 | start_node.add_edge('$', node_name) 108 | all_status['start'] = start_node 109 | all_status[node_name] = dfa_node 110 | is_visit = set() 111 | queue = list() 112 | nfa_node_set = set() 113 | nfa_node_set.add(node_name) 114 | queue.append((nfa_node_set, node_name)) 115 | while queue: 116 | node_name = queue.pop(0) 117 | top_node_name = node_name[0] 118 | dfa_node_name = node_name[1] 119 | # print 'to =', top_node_name, ', df =', dfa_node_name 120 | dfa_node = get_create_dfaNode(dfa_node_name, 0) 121 | for alpha in self.NFA.alphabets: 122 | target_set = set() 123 | for nfa_node_name in top_node_name: 124 | nfa_name = self.NFA.status[nfa_node_name] 125 | if alpha in nfa_name.edge.keys(): 126 | for name in nfa_name.edge[alpha]: 127 | target_set.add(name) 128 | if not target_set: 129 | continue 130 | dfa_new_node_name = '' 131 | _type = 0 132 | tmp_list = list(target_set) 133 | target_list = sorted(tmp_list) 134 | for tar in target_list: 135 | dfa_new_node_name = '%s$%s' % (dfa_new_node_name, tar) 136 | _type += int(self.NFA.status[tar]._type) 137 | if _type > 0: 138 | _type = 1 139 | dfa_new_node = get_create_dfaNode(dfa_new_node_name, _type) 140 | dfa_node.add_edge(alpha, dfa_new_node_name) 141 | all_status[dfa_node_name] = dfa_node 142 | all_status[dfa_new_node_name] = dfa_new_node 143 | if dfa_new_node_name in is_visit: 144 | continue 145 | else: 146 | is_visit.add(dfa_new_node_name) 147 | queue.append((target_set, dfa_new_node_name)) 148 | alphabets = set() 149 | for i in range(ord(' '), ord('~') + 1): 150 | alphabets.add(chr(i)) 151 | self.DFA = DFA(alphabets) 152 | self.DFA.status = all_status 153 | 154 | def run_on_dfa(self, line, pos): 155 | for dfa_name in self.DFA.status['start'].edge['$']: 156 | cur_pos = pos 157 | token = '' 158 | token_type = dfa_name 159 | c_node = self.DFA.status[dfa_name] 160 | while cur_pos < len(line) and line[cur_pos] in c_node.edge.keys(): 161 | token += line[cur_pos] 162 | c_node = self.DFA.status[list(c_node.edge[line[cur_pos]])[0]] 163 | cur_pos += 1 164 | if c_node._type > 0: 165 | if token in self.key_words.keys(): 166 | token_type = token 167 | return cur_pos - 1, token_type, token 168 | return pos, None, '' 169 | 170 | def read_and_analyze(self, file_name): 171 | line_num = 0 172 | lex_error = False 173 | token_table = [] 174 | for line in 
open(file_name, 'r'): 175 | pos = 0 176 | line_num += 1 177 | line = line.split('\n')[0] 178 | while pos < len(line) and not lex_error: 179 | while pos < len(line) and line[pos] in ['\t', '\n', ' ', '\r']: 180 | pos += 1 181 | if pos < len(line): 182 | pos, token_type, token = self.run_on_dfa(line, pos) 183 | if token_type is None: 184 | print 'Lexical error at line %s, column %s' % ( 185 | (str(line_num), str(pos))) 186 | lex_error = True 187 | break 188 | else: 189 | token_table.append((token_type, token)) 190 | print '(\'%s\'\t, \'%s\')' % (token_type, token) 191 | pos += 1 192 | if not lex_error: 193 | output = open('token_table.data', 'w+') 194 | for token_type, token in token_table: 195 | type_of_token = token 196 | if token_type == 'identifier' or token_type == 'number': 197 | type_of_token = token_type 198 | output.write('%s %s\n' % (type_of_token, token)) 199 | output.close() 200 | return True 201 | return False 202 | 203 | 204 | def main(): 205 | lex_ana = LexicalAnalyze() 206 | lex_ana.read_lex_grammar('lex_grammar.txt') 207 | lex_ana.create_nfa() 208 | lex_ana.nfa_to_dfa() 209 | lex_ana.read_and_analyze('source.cc') 210 | 211 | if __name__ == '__main__': 212 | main() 213 | -------------------------------------------------------------------------------- /nfa_and_dfa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | class NFANode(object): 6 | 7 | def __init__(self, name=None, _type=0): 8 | super(NFANode, self).__init__() 9 | self.name = name 10 | self._type = _type 11 | self.edge = {} 12 | 13 | def add_edge(self, alpha, target): 14 | if alpha not in self.edge: 15 | targets = set() 16 | targets.add(target) 17 | self.edge[alpha] = targets 18 | else: 19 | self.edge[alpha].add(target) 20 | 21 | 22 | class NFA(object): 23 | def __init__(self, alphabets): 24 | super(NFA, self).__init__() 25 | self.status = {} 26 | self.alphabets = alphabets 27 | 28 | def get_target(self, cur_status, alpha): 29 | if cur_status in self.status: 30 | if alpha in self.status[cur_status]: 31 | return self.status[cur_status][alpha] 32 | return None 33 | 34 | 35 | class DFANode(object): 36 | def __init__(self, name, _type=None): 37 | super(DFANode, self).__init__() 38 | self.name = name 39 | self._type = _type 40 | self.edge = {} 41 | 42 | def add_edge(self, alpha, target): 43 | if alpha not in self.edge: 44 | targets = set() 45 | targets.add(target) 46 | self.edge[alpha] = targets 47 | else: 48 | self.edge[alpha].add(target) 49 | 50 | 51 | class LRDFANode(object): 52 | 53 | def __init__(self, set_id): 54 | self.set_id = set_id 55 | self.object_set = set() 56 | self.edge = {} 57 | 58 | def add_object_set(self, id, left, right, index, tail): 59 | tmp = (id, left, right, index, tail) 60 | if tmp not in self.object_set: 61 | self.object_set.add(tmp) 62 | 63 | def add_object_set_by_set(self, object_set): 64 | self.object_set |= object_set 65 | 66 | 67 | class DFA(object): 68 | def __init__(self, alphabets): 69 | super(DFA, self).__init__() 70 | self.status = {} 71 | self.alphabets = alphabets 72 | -------------------------------------------------------------------------------- /sample_syn_grammar.txt: -------------------------------------------------------------------------------- 1 | start1:start 2 | start:a A 3 | start:b B 4 | A:c A 5 | A:d 6 | B:c B 7 | B:d 8 | -------------------------------------------------------------------------------- /sample_token_table.txt: 
-------------------------------------------------------------------------------- 1 | b b 2 | c c 3 | c c 4 | d d -------------------------------------------------------------------------------- /source.cc: -------------------------------------------------------------------------------- 1 | int a = 1, b = 234 , c = 2e4; 2 | 3 | function int max(int a,int b){ 4 | if ( a>b ) return a; 5 | else return b; 6 | } 7 | function double min(int A,int B){ 8 | if ( A 14 | arithmetic_expression:operator 15 | arithmetic_expression:primary_expression arithmetic_expression 16 | arithmetic_expression:operator primary_expression arithmetic_expression 17 | arithmetic_expression:$ 18 | constant_expression:primary_expression arithmetic_expression 19 | assignment_operator:= 20 | assignment_operator:+ = 21 | assignment_operator:- = 22 | assignment_operator:* = 23 | assignment_operator:/ = 24 | assignment_operator:% = 25 | assignment_expression:identifier assignment_operator expression 26 | assignment_expression_profix:, assignment_expression assignment_expression_profix 27 | assignment_expression_profix:$ 28 | assignment_expression_list:assignment_expression assignment_expression_profix 29 | assignment_expression_list:$ 30 | function_expression:function identifier ( expression_list ) 31 | expression:constant_expression 32 | expression:function_expression 33 | expression_profix:, expression expression_profix 34 | expression_profix:$ 35 | expression_list:expression expression_profix 36 | expression_list:$ 37 | type_specifier:char 38 | type_specifier:int 39 | type_specifier:double 40 | declaration_assign:= expression 41 | declaration_assign:$ 42 | declaration_init:identifier declaration_assign 43 | declaration_init_list:, declaration_init declaration_init_list 44 | declaration_init_list:$ 45 | declaration:type_specifier declaration_init declaration_init_list ; 46 | function_declaration:type_specifier identifier 47 | function_declaration_suffix:, function_declaration function_declaration_suffix 48 | function_declaration_suffix:$ 49 | function_declaration_list:function_declaration function_declaration_suffix 50 | function_declaration_list:$ 51 | function_definition:function type_specifier identifier ( function_declaration_list ) compound_statement 52 | statement:expression_statement 53 | statement:jump_statement 54 | statement:selection_statement 55 | statement:iteration_statement 56 | statement:compound_statement 57 | statement:declaration 58 | statement_list:statement statement_list 59 | statement_list:$ 60 | expression_statement:assignment_expression_list ; 61 | expression_statement:print ( expression ) ; 62 | expression_statement:scanf ( identifier ) ; 63 | jump_statement:continue ; 64 | jump_statement:break ; 65 | jump_statement:return expression ; 66 | selection_statement:if ( expression ) statement else statement 67 | iteration_statement:while ( expression ) statement 68 | iteration_statement:for ( declaration expression ; assignment_expression ) statement 69 | compound_statement:{ statement_list } 70 | external_declaration:function_definition 71 | external_declaration:declaration 72 | -------------------------------------------------------------------------------- /syntax_analyze.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from nfa_and_dfa import DFA, LRDFANode 5 | 6 | 7 | class SyntaxAnalyze(object): 8 | 9 | def __init__(self): 10 | super(SyntaxAnalyze, self).__init__() 11 | self.first_set = {} 12 | 
self.productions = [] 13 | self.all_elem = set() 14 | self.terminate = set() 15 | self.noterminate = set() 16 | self.productions_dict = {} 17 | self.lr_analyze_table = {} 18 | 19 | def read_syntax_grammar(self, file_name): 20 | for line in open(file_name, 'r'): 21 | line = line[:-1] 22 | cur_left = line.split(':')[0] 23 | cur_right = line.split(':')[1] 24 | right_list = [] 25 | if cur_right.find(' ') != -1: 26 | right_list = cur_right.split(' ') 27 | else: 28 | right_list.append(cur_right) 29 | production = {cur_left: right_list} 30 | self.productions.append(production) 31 | 32 | def get_terminate_noterminate(self): 33 | for production in self.productions: 34 | for left in production.keys(): 35 | if left not in self.productions_dict: 36 | self.productions_dict[left] = [] 37 | self.productions_dict[left].append(( 38 | tuple(production[left]), 39 | self.productions.index(production))) 40 | self.all_elem.add(left) 41 | self.noterminate.add(left) 42 | for right in production[left]: 43 | self.all_elem.add(right) 44 | self.terminate = self.all_elem - self.noterminate 45 | 46 | def __get_first_set(self, cur_status, all_elem): 47 | if cur_status in self.first_set: 48 | return self.first_set[cur_status] 49 | all_elem.add(cur_status) 50 | cur_status_set = set() 51 | for right_list in self.productions_dict[cur_status]: 52 | for right in right_list[0]: 53 | right_set = None 54 | if right in all_elem: 55 | continue 56 | if right in self.first_set: 57 | right_set = self.first_set[right] 58 | else: 59 | right_set = self.__get_first_set(right, all_elem) 60 | cur_status_set |= right_set 61 | if '$' not in right_set: 62 | break 63 | return cur_status_set 64 | 65 | def init_first_set(self): 66 | for terminate in self.terminate: 67 | self.first_set[terminate] = set([terminate]) 68 | for noterminate in self.noterminate: 69 | self.first_set[noterminate] = self.__get_first_set( 70 | noterminate, set()) 71 | 72 | def create_lr_dfa(self): 73 | all_status = {} 74 | all_object_set = {} 75 | self.DFA = DFA(set()) 76 | 77 | def create_get_lr_dfa_node(set_id): 78 | if set_id in all_status: 79 | return all_status[set_id] 80 | return LRDFANode(set_id=set_id) 81 | 82 | def expand_production(self, cur_production, ex_object_set): 83 | ex_object_set.add(cur_production) 84 | right = cur_production[2] 85 | point_index = cur_production[3] 86 | tail_set = cur_production[4] 87 | if point_index < len(right) and\ 88 | (right[point_index] in self.noterminate): 89 | for pro_right in self.productions_dict[right[point_index]]: 90 | new_tail_set = set() 91 | flag = True 92 | for i in range(point_index + 1, len(right)): 93 | cur_first_set = self.first_set[right[i]] 94 | if '$' in cur_first_set: 95 | new_tail_set = tuple( 96 | set(new_tail_set) | (cur_first_set - set('$'))) 97 | else: 98 | flag = False 99 | new_tail_set = tuple( 100 | set(new_tail_set) | cur_first_set) 101 | break 102 | if flag: 103 | new_tail_set = tuple(set(new_tail_set) | set(tail_set)) 104 | ex_new_production = ( 105 | pro_right[1], 106 | right[point_index], pro_right[0], 0, new_tail_set) 107 | if ex_new_production not in ex_object_set: 108 | ex_object_set |= expand_production( 109 | self, ex_new_production, ex_object_set) 110 | new_ex_object_set = {} 111 | for eos in ex_object_set: 112 | pro_key = (eos[0], eos[1], eos[2], eos[3]) 113 | if tuple(pro_key) not in new_ex_object_set: 114 | new_ex_object_set[tuple(pro_key)] = set() 115 | new_ex_object_set[pro_key] |= set(eos[4]) 116 | ex_object_set = set() 117 | for key in new_ex_object_set: 118 | production = (key[0], 
key[1], key[2], key[ 119 | 3], tuple(new_ex_object_set[key])) 120 | ex_object_set.add(tuple(production)) 121 | return ex_object_set 122 | 123 | set_id = 0 124 | new_node = create_get_lr_dfa_node(set_id) 125 | object_set = expand_production( 126 | self, (0, 'start1', ('start',), 0, '#'), set()) 127 | new_node.add_object_set_by_set(object_set) 128 | all_object_set[tuple(object_set)] = set_id 129 | all_status[set_id] = new_node 130 | object_set_queue = list() 131 | object_set_queue.append(new_node) 132 | while object_set_queue: 133 | top_object_node = object_set_queue.pop(0) 134 | old_set = top_object_node.object_set 135 | old_set_id = top_object_node.set_id 136 | # print 'object_set_id =', old_set_id 137 | for cur_production in old_set: 138 | # print cur_production 139 | pro_id = cur_production[0] 140 | left = cur_production[1] 141 | right = cur_production[2] 142 | point_index = cur_production[3] 143 | tail_set = cur_production[4] 144 | if point_index >= len(right) or '$' in right: 145 | if old_set_id not in self.lr_analyze_table: 146 | self.lr_analyze_table[old_set_id] = {} 147 | for tail in tail_set: 148 | if tail in self.lr_analyze_table[old_set_id]: 149 | print 'the grammar is not a LR(1) grammar!!!' 150 | return 151 | self.lr_analyze_table[old_set_id][tail] = ('r', pro_id) 152 | else: 153 | tar_set_id = 0 154 | new_production = (pro_id, left, right, 155 | point_index + 1, tail_set) 156 | new_object_set = expand_production( 157 | self, new_production, set()) 158 | if tuple(new_object_set) in all_object_set.keys(): 159 | tar_set_id = all_object_set[tuple(new_object_set)] 160 | else: 161 | set_id += 1 162 | tar_set_id = set_id 163 | all_object_set[tuple(new_object_set)] = set_id 164 | new_node = create_get_lr_dfa_node(tar_set_id) 165 | new_node.add_object_set_by_set(new_object_set) 166 | all_status[tar_set_id] = new_node 167 | object_set_queue.append(new_node) 168 | if old_set_id not in self.lr_analyze_table: 169 | self.lr_analyze_table[old_set_id] = {} 170 | if right[point_index] in self.terminate: 171 | self.lr_analyze_table[old_set_id][ 172 | right[point_index]] = ('s', tar_set_id) 173 | else: 174 | self.lr_analyze_table[old_set_id][ 175 | right[point_index]] = ('g', tar_set_id) 176 | self.DFA.status = all_status 177 | 178 | def run_on_lr_dfa(self, tokens): 179 | status_stack = [0] 180 | symbol_stack = ['#'] 181 | top = 0 182 | success = False 183 | tokens.reverse() 184 | while not success: 185 | top = status_stack[-1] 186 | print 'token =', tokens[-1] 187 | # print symbol_stack 188 | print symbol_stack 189 | if tokens[-1] in self.lr_analyze_table[top]: 190 | action = self.lr_analyze_table[top][tokens[-1]] 191 | if action[0] == 's': 192 | status_stack.append(action[1]) 193 | symbol_stack.append(tokens[-1]) 194 | tokens = tokens[:-1] 195 | elif action[0] == 'r': 196 | if action[1] == 0: 197 | print 'Syntax anaysis successfully!' 
198 | success = True 199 | break 200 | production = self.productions[action[1]] 201 | left = production.keys()[0] 202 | right_len = len(production[left]) 203 | tokens.append(left) 204 | if production[left] == ['$']: 205 | continue 206 | status_stack = status_stack[:-right_len] 207 | symbol_stack = symbol_stack[:-right_len] 208 | else: 209 | status_stack.append(action[1]) 210 | symbol_stack.append(tokens[-1]) 211 | tokens = tokens[:-1] 212 | # print status_stack, symbol_stack 213 | else: 214 | print self.lr_analyze_table[top] 215 | print 'Syntax error!\n' 216 | break 217 | 218 | def read_and_analyze(self, fileName): 219 | token_table = open(fileName, 'r') 220 | tokens = [] 221 | for line in token_table: 222 | line = line[:-1] 223 | tokens.append(line.split(' ')[0]) 224 | tokens.append('#') 225 | self.run_on_lr_dfa(tokens) 226 | 227 | 228 | def main(): 229 | syn_ana = SyntaxAnalyze() 230 | # syn_ana.read_syntax_grammar('sample_syn_grammar.txt') 231 | syn_ana.read_syntax_grammar('syn_grammar.txt') 232 | syn_ana.get_terminate_noterminate() 233 | syn_ana.init_first_set() 234 | syn_ana.create_lr_dfa() 235 | syn_ana.read_and_analyze('token_table.data') 236 | # syn_ana.read_and_analyze('sample_token_table.txt') 237 | # for key in syn_ana.lr_analyze_table: 238 | # print key, ': ', syn_ana.lr_analyze_table[key] 239 | # for pro in syn_ana.productions: 240 | # print syn_ana.productions.index(pro), pro 241 | # for key in syn_ana.first_set.keys(): 242 | # print 'key =', key, '\n', 'first =', syn_ana.first_set[key] 243 | # print syn_ana.productions 244 | # print '\n' 245 | # for left in syn_ana.productions_dict: 246 | # print left, ':', syn_ana.productions_dict[left] 247 | # print syn_ana.terminate 248 | # print syn_ana.noterminate 249 | 250 | if __name__ == '__main__': 251 | main() 252 | -------------------------------------------------------------------------------- /token_table.data: -------------------------------------------------------------------------------- 1 | int int 2 | identifier a 3 | = = 4 | number 1 5 | , , 6 | identifier b 7 | = = 8 | number 234 9 | , , 10 | identifier c 11 | = = 12 | number 2e4 13 | ; ; 14 | function function 15 | int int 16 | identifier max 17 | ( ( 18 | int int 19 | identifier a 20 | , , 21 | int int 22 | identifier b 23 | ) ) 24 | { { 25 | if if 26 | ( ( 27 | identifier a 28 | > > 29 | identifier b 30 | ) ) 31 | return return 32 | identifier a 33 | ; ; 34 | else else 35 | return return 36 | identifier b 37 | ; ; 38 | } } 39 | function function 40 | double double 41 | identifier min 42 | ( ( 43 | int int 44 | identifier A 45 | , , 46 | int int 47 | identifier B 48 | ) ) 49 | { { 50 | if if 51 | ( ( 52 | identifier A 53 | < < 54 | identifier B 55 | ) ) 56 | return return 57 | identifier A 58 | ; ; 59 | else else 60 | return return 61 | identifier B 62 | ; ; 63 | } } 64 | function function 65 | int int 66 | identifier main 67 | ( ( 68 | ) ) 69 | { { 70 | double double 71 | identifier sum_1_to_50 72 | = = 73 | number 1 74 | ; ; 75 | for for 76 | ( ( 77 | int int 78 | identifier i 79 | = = 80 | number 1 81 | ; ; 82 | identifier i 83 | < < 84 | number 100 85 | ; ; 86 | identifier i 87 | + + 88 | = = 89 | number 1 90 | ) ) 91 | { { 92 | if if 93 | ( ( 94 | identifier i 95 | < < 96 | number 50 97 | ) ) 98 | break break 99 | ; ; 100 | else else 101 | identifier sum_1_to_50 102 | + + 103 | = = 104 | identifier i 105 | ; ; 106 | } } 107 | int int 108 | identifier k 109 | = = 110 | number 0 111 | , , 112 | identifier s 113 | = = 114 | ( ( 115 | ( ( 116 | number 534 117 | - - 118 
| number 23 119 | ) ) 120 | + + 121 | number 423 122 | ) ) 123 | * * 124 | number 23 125 | ; ; 126 | while while 127 | ( ( 128 | identifier k 129 | < < 130 | number 40 131 | ) ) 132 | scanf scanf 133 | ( ( 134 | identifier s 135 | ) ) 136 | ; ; 137 | int int 138 | identifier A 139 | = = 140 | number 50 141 | , , 142 | identifier B 143 | = = 144 | number 23 145 | , , 146 | identifier C 147 | ; ; 148 | identifier C 149 | = = 150 | function function 151 | identifier max 152 | ( ( 153 | identifier A 154 | , , 155 | identifier B 156 | ) ) 157 | ; ; 158 | print print 159 | ( ( 160 | identifier C 161 | ) ) 162 | ; ; 163 | print print 164 | ( ( 165 | identifier A 166 | + + 167 | identifier B 168 | * * 169 | identifier C 170 | ) ) 171 | ; ; 172 | return return 173 | number 0 174 | ; ; 175 | } } 176 | --------------------------------------------------------------------------------