├── .gitignore ├── README.md ├── lex_grammar.txt ├── lexical_analyze.py ├── nfa_and_dfa.py ├── sample_syn_grammar.txt ├── sample_token_table.txt ├── source.cc ├── syn_grammar.txt ├── syntax_analyze.py └── token_table.data /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | .idea/ 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Compiler 2 | ------- 3 | A Python-based lexical analyzer and LR(1) parser 4 | 5 | ## Overview 6 | * Language: Python 2.7.11 7 | * Platform: Ubuntu 16.04 8 | * Editor: Sublime 9 | * What is implemented: a lexical analyzer driven by a type-3 (regular) grammar and an LR(1) syntax analyzer driven by a type-2 (context-free) grammar. 10 | * Test grammar: a grammar for a C-like language 11 | * Test program: a program written in that C-like language. 12 | 13 | ## Files 14 | The project involves the following seven files: 15 | 16 | lexical_analyze.py    the lexical analyzer 17 | 18 | syntax_analyze.py    the syntax analyzer 19 | 20 | nfa_and_dfa.py    defines the NFA and DFA classes and their node types 21 | 22 | lex_grammar.txt    the lexical grammar 23 | 24 | syn_grammar.txt    the syntax grammar 25 | 26 | source.cc    the C-like program to be analyzed 27 | 28 | token_table.data    the token table produced by the lexical analyzer 29 | 30 | ## Lexical grammar 31 | I wrote a type-3 (regular) grammar as the lexical grammar the program reads in. To simplify reading the grammar file, the textbook's derivation symbol '->' is replaced by ':'; the left of ':' is the left-hand side of a production, the right is its right-hand side, and epsilon on the right-hand side of an empty production is written as '$'. 32 | Tokens fall into five broad categories (identifier, limiter, operator, number, and string), and each category has its own expressions and derivations. 33 | 34 | ## Syntax grammar 35 | Only once I started writing the grammar did I realize that the hardest part of this course project is constructing the grammar itself. After a long time I still could not come up with a grammar that was both usable and satisfying, so I consulted http://www.nongnu.org/hcb/ and eventually wrote a type-2 (context-free) grammar. 36 | As with the lexical grammar, ':' replaces the textbook's '->' and '$' stands for epsilon as the right-hand side of an empty production. 37 | 38 | ## Lexical analyzer 39 | The lexical analyzer reads a type-3 grammar and first determines its terminals and nonterminals: every symbol that appears on the left-hand side of a production is a nonterminal, and the terminals are the difference between the set of all symbols and the set of nonterminals. It then builds an NFA from the grammar following the textbook algorithm and converts the NFA to a DFA, again following the textbook algorithm; at that point the grammar processing is finished. 40 | Next it scans the program to be analyzed: characters are fed into the DFA one at a time, and when the DFA cannot accept the next character it checks whether the current state is an accepting state. If it is, a token has been recognized; otherwise lexical analysis fails. 41 | 42 | ## Syntax analyzer 43 | The syntax analyzer first reads the type-2 grammar and computes its terminals and nonterminals in the same way as above. It then computes the FIRST set of every grammar symbol: the FIRST set of a terminal is the terminal itself, and the FIRST sets of nonterminals are computed by a memoized recursive search. The grammar is augmented with S' -> S, #; expanding this item yields the initial item set I0, the remaining item sets are derived from I0, and the LR(1) parsing table is built at the same time. 44 | With the LR(1) table in hand, parsing only has to read the tokens produced by the lexical analyzer one by one and look up the shift or reduce action in the table. If the final state is acc (accept), the input conforms to the grammar; if acc is never reached, or no matching action exists in the table, there is a syntax error. 45 |
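To make the maximal-munch scan described in the lexical analyzer section concrete, here is a minimal sketch of running a DFA over an input line. It is an illustration only, not the project's `run_on_dfa`: the project builds its DFA from lex_grammar.txt, whereas the tiny DFA, the character classes, and the names `edges`, `accepting`, and `next_token` below are hand-written for this example.

```python
import string

# Hand-written DFA for the illustration: it recognizes identifiers
# ([A-Za-z_][A-Za-z0-9_]*) and integers ([0-9]+).
def char_class(c):
    if c in string.ascii_letters + '_':
        return 'alpha'
    if c in string.digits:
        return 'digit'
    return None

# edges maps (state, character class) -> next state; 'id' and 'num' accept.
edges = {
    ('start', 'alpha'): 'id', ('id', 'alpha'): 'id', ('id', 'digit'): 'id',
    ('start', 'digit'): 'num', ('num', 'digit'): 'num',
}
accepting = {'id': 'identifier', 'num': 'number'}

def next_token(line, pos):
    """Maximal munch: consume characters until the DFA gets stuck,
    then succeed only if the state reached is an accepting state."""
    state, cur = 'start', pos
    while cur < len(line) and (state, char_class(line[cur])) in edges:
        state = edges[(state, char_class(line[cur]))]
        cur += 1
    if state in accepting:
        return cur, accepting[state], line[pos:cur]
    return pos, None, ''        # lexical error at this position

assert next_token('abc1 = 5', 0) == (4, 'identifier', 'abc1')
assert next_token('abc1 = 5', 7) == (8, 'number', '5')
```

Likewise, to make the table-driven shift/reduce loop of the syntax analyzer concrete, here is a minimal sketch of an LR(1) driver. Again this is illustration only, not the project's `run_on_lr_dfa`: the toy grammar (`S -> a S | b`, with the augmented start symbol written `S1`, mirroring `start1` in sample_syn_grammar.txt), its hand-written ACTION/GOTO table, and the names `productions`, `table`, and `parse` are invented for this example.

```python
# Toy grammar:  0: S1 -> S    1: S -> a S    2: S -> b
# productions[i] = (left-hand side, length of the right-hand side)
productions = [('S1', 1), ('S', 2), ('S', 1)]

# table[state][symbol]: ('s', n) shift to state n, ('r', i) reduce by
# production i, ('g', n) goto state n after reducing to a nonterminal.
table = {
    0: {'a': ('s', 1), 'b': ('s', 2), 'S': ('g', 3)},
    1: {'a': ('s', 1), 'b': ('s', 2), 'S': ('g', 4)},
    2: {'#': ('r', 2)},
    3: {'#': ('r', 0)},   # reducing by the augmented production means accept
    4: {'#': ('r', 1)},
}

def parse(tokens):
    tokens = tokens + ['#']       # end marker, as in token_table handling
    states = [0]
    pos = 0
    while True:
        action = table[states[-1]].get(tokens[pos])
        if action is None:
            return False          # no table entry: syntax error
        kind, arg = action
        if kind == 's':           # shift: push the new state, consume the token
            states.append(arg)
            pos += 1
        else:                     # reduce by production arg
            if arg == 0:
                return True       # reduced by S1 -> S: accept
            left, rhs_len = productions[arg]
            del states[-rhs_len:]                      # pop |rhs| states
            states.append(table[states[-1]][left][1])  # goto on the nonterminal

assert parse(['a', 'a', 'b'])     # S => a S => a a S => a a b
assert not parse(['a', 'a'])      # missing the final b: rejected
```

The project's `run_on_lr_dfa` follows the same scheme, except that on a reduce it pushes the reduced nonterminal back onto the remaining input and lets the next table lookup perform the goto, and it handles epsilon ('$') productions by popping nothing.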
46 | ## Code description 47 | ### Utility classes (nfa_and_dfa.py) 48 | ```python 49 | class NFANode(object) # node structure of the NFA 50 | def __init__(self, name, _type) # constructor; takes a name and a type 51 | def add_edge(self, alpha, target) # add an edge to the node 52 | 53 | class NFA(object) # the NFA structure 54 | def __init__(self, alphabets) # constructor; takes the alphabet set 55 | def get_target(self, cur_status, alpha) # given the current state and an input character, return the next state 56 | 57 | class DFANode(object) # node structure of the DFA 58 | def __init__(self, name, _type) # constructor; takes a name and a type 59 | def add_edge(self, alpha, target) # add an edge to the node 60 | 61 | class LRDFANode(object) # node structure of the LR(1) DFA 62 | def __init__(self, set_id) # constructor; takes the node's item-set id 63 | def add_object_set(self, id, left, right, index, tail) # add a production to the item set 64 | def add_object_set_by_set(self, object_set) # add a whole set of productions to the item set 65 | 66 | class DFA(object) # the DFA structure 67 | def __init__(self, alphabets) # constructor; takes the alphabet set 68 | def get_target(self, cur_status, alpha) # given the current state and an input character, return the next state 69 | ``` 70 | ### Lexical analyzer (lexical_analyze.py) 71 | ```python 72 | class LexicalAnalyze(object) # operations for lexical analysis 73 | def read_lex_grammar(self, file_name) # read the lexical grammar; the argument is the path of the grammar file 74 | def create_nfa(self) # build the NFA from the grammar that was read 75 | def get_create_nfa_node(name, _type) # create a new node or return an existing one 76 | def nfa_to_dfa(self) # convert the NFA to a DFA 77 | def get_create_dfaNode(name, _type) # create a new node or return an existing one 78 | def run_on_dfa(self, line, pos) # run a line of input on the DFA to produce a token 79 | def read_and_analyze(self, file_name) # read the input to be analyzed and generate the token_table 80 | def main() # entry point: create a LexicalAnalyze object and perform the whole lexical analysis 81 | ``` 82 | ### Syntax analyzer (syntax_analyze.py) 83 | ```python 84 | class SyntaxAnalyze(object) # operations for syntax analysis 85 | def __init__(self) # constructor 86 | def read_syntax_grammar(self, file_name) # read the grammar needed for parsing; takes the file name 87 | def get_terminate_noterminate(self) # compute the grammar's terminals and nonterminals 88 | def __get_first_set(self, cur_status, all_elem) # recursively compute the FIRST set of a nonterminal 89 | def init_first_set(self) # initialize the FIRST sets of all symbols 90 | def create_lr_dfa(self) # build the item-set DFA and construct the parsing table at the same time 91 | def create_get_lr_dfa_node(set_id) # create a new node or return an existing one 92 | def expand_production(self, cur_production, ex_object_set) # compute the item set (closure) generated by a production 93 | def run_on_lr_dfa(self, tokens) # parse the token_table and report the result 94 | def read_and_analyze(self, file_name) # read the token_table 95 | def main() # entry point: create a SyntaxAnalyze object, run all the steps, and print the result 96 | ``` 97 | ### Run 98 | ```shell 99 | python lexical_analyze.py # generate token_table 100 | python syntax_analyze.py # print the result of syntax analysis 101 | ``` 102 | -------------------------------------------------------------------------------- /lex_grammar.txt: -------------------------------------------------------------------------------- 1 | digit:0|1|2|3|4|5|6|7|8|9 2 | nozero_digit:1|2|3|4|5|6|7|8|9 3 | alphabet:a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z 4 | key_word:int|double|char|float|const|auto|break|case|continue|do|while|if|else|switch|enum|for|do|long|short|static|struct|typedef|unsigned|void|return|break|scanf|print|function 5 | start:$ identifier 6 | start:$ limiter 7 | start:$ operator 8 | start:$ string 9 | start:$ number 10 | limiter:, 11 | limiter:; 12 | limiter:[ 13 | limiter:] 14 | limiter:( 15 | limiter:) 16 | limiter:{ 17 | limiter:} 18 | operator:+ 19 | operator:- 20 | operator:* 21 | operator:/ 22 | operator:! 
23 | operator:% 24 | operator:^ 25 | operator:& 26 | operator:= 27 | operator:~ 28 | operator:" 29 | operator:<< 30 | operator:>> 31 | operator:++ 32 | operator:-- 33 | operator:> 34 | operator:< 35 | identifier:_ 36 | identifier:alphabet 37 | identifier:_ identifier_tail 38 | identifier:alphabet identifier_tail 39 | identifier_tail:_ 40 | identifier_tail:digit 41 | identifier_tail:alphabet 42 | identifier_tail:_ identifier_tail 43 | identifier_tail:digit identifier_tail 44 | identifier_tail:alphabet identifier_tail 45 | number:digit 46 | number:nozero_digit number_tail 47 | number_tail:digit 48 | number_tail:digit number_tail 49 | number_tail:e number_tail 50 | string:" string_tail 51 | string_tail:, string_tail 52 | string_tail:; string_tail 53 | string_tail:[ string_tail 54 | string_tail:] string_tail 55 | string_tail:( string_tail 56 | string_tail:) string_tail 57 | string_tail:{ string_tail 58 | string_tail:} string_tail 59 | string_tail:. string_tail 60 | string_tail:+ string_tail 61 | string_tail:- string_tail 62 | string_tail:* string_tail 63 | string_tail:/ string_tail 64 | string_tail:! string_tail 65 | string_tail:% string_tail 66 | string_tail:^ string_tail 67 | string_tail:& string_tail 68 | string_tail:= string_tail 69 | string_tail:~ string_tail 70 | string_tail:" string_tail 71 | string_tail:" string_tail 72 | string_tail:< string_tail 73 | string_tail:> string_tail 74 | string_tail:_ string_tail 75 | string_tail:# string_tail 76 | string_tail:digit string_tail 77 | string_tail:alphabet string_tail 78 | string_tail:" -------------------------------------------------------------------------------- /lexical_analyze.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from nfa_and_dfa import NFA, DFA, NFANode, DFANode 5 | 6 | 7 | class LexicalAnalyze(object): 8 | def __init__(self): 9 | super(LexicalAnalyze, self).__init__() 10 | self.productions = [] 11 | self.key_words = {} 12 | self.tool_set = {} 13 | self.NFA = None 14 | self.DFA = None 15 | 16 | def read_lex_grammar(self, file_name): 17 | cur_left = None 18 | cur_right = [] 19 | line_num = 0 20 | for line in open(file_name, 'r'): 21 | line = line.split('\n')[0] 22 | index = line.find(':') 23 | cur_left = line[0:index] 24 | cur_right = line[index + 1:len(line)] 25 | line_num += 1 26 | if line_num < 4: 27 | self.tool_set[cur_left] = set(cur_right.split('|')) 28 | continue 29 | elif line_num == 4: 30 | for word in set(cur_right.split('|')): 31 | self.key_words[word] = cur_left 32 | continue 33 | production = {} 34 | production['left'] = cur_left 35 | index = cur_right.find(' ') 36 | if index != -1: 37 | production['input'] = cur_right[0:index] 38 | production['right'] = cur_right[index + 1:len(cur_right)] 39 | else: 40 | production['input'] = cur_right 41 | production['right'] = None 42 | self.productions.append(production) 43 | 44 | def create_nfa(self): 45 | all_status = {} 46 | 47 | def get_create_nfa_node(name, _type): 48 | if name in all_status: 49 | node = all_status[name] 50 | else: 51 | node = NFANode(name=name, _type=_type) 52 | return node 53 | 54 | start_node = get_create_nfa_node('start', 0) 55 | end_node = get_create_nfa_node('end', 1) 56 | all_status['start'] = start_node 57 | all_status['end'] = end_node 58 | for produ in self.productions: 59 | name = produ['left'] 60 | alpha = produ['input'] 61 | right = produ['right'] 62 | node = get_create_nfa_node(name, 0) 63 | if right is not None: 64 | target_node = 
get_create_nfa_node(right, 0) 65 | if alpha not in self.tool_set.keys(): 66 | if right is None: 67 | node.add_edge(alpha, 'end') 68 | else: 69 | if right in self.tool_set: 70 | for val in self.tool_set[right]: 71 | node.add_edge(alpha, val) 72 | else: 73 | node.add_edge(alpha, right) 74 | else: 75 | for val in self.tool_set[alpha]: 76 | if right is None: 77 | node.add_edge(val, 'end') 78 | else: 79 | if right in self.tool_set: 80 | for val in self.tool_set[right]: 81 | node.add_edge(alpha, val) 82 | else: 83 | node.add_edge(alpha, right) 84 | node.add_edge(val, right) 85 | all_status[name] = node 86 | if right is not None: 87 | all_status[right] = target_node 88 | 89 | alphabets = set() 90 | for i in range(ord(' '), ord('~') + 1): 91 | alphabets.add(chr(i)) 92 | self.NFA = NFA(alphabets) 93 | self.NFA.status = all_status 94 | 95 | def nfa_to_dfa(self): 96 | all_status = {} 97 | 98 | def get_create_dfaNode(name, _type): 99 | if name in all_status: 100 | return all_status[name] 101 | else: 102 | node = DFANode(name, _type) 103 | return node 104 | for node_name in self.NFA.status['start'].edge['$']: 105 | start_node = get_create_dfaNode('start', 0) 106 | dfa_node = get_create_dfaNode(node_name, 0) 107 | start_node.add_edge('$', node_name) 108 | all_status['start'] = start_node 109 | all_status[node_name] = dfa_node 110 | is_visit = set() 111 | queue = list() 112 | nfa_node_set = set() 113 | nfa_node_set.add(node_name) 114 | queue.append((nfa_node_set, node_name)) 115 | while queue: 116 | node_name = queue.pop(0) 117 | top_node_name = node_name[0] 118 | dfa_node_name = node_name[1] 119 | # print 'to =', top_node_name, ', df =', dfa_node_name 120 | dfa_node = get_create_dfaNode(dfa_node_name, 0) 121 | for alpha in self.NFA.alphabets: 122 | target_set = set() 123 | for nfa_node_name in top_node_name: 124 | nfa_name = self.NFA.status[nfa_node_name] 125 | if alpha in nfa_name.edge.keys(): 126 | for name in nfa_name.edge[alpha]: 127 | target_set.add(name) 128 | if not target_set: 129 | continue 130 | dfa_new_node_name = '' 131 | _type = 0 132 | tmp_list = list(target_set) 133 | target_list = sorted(tmp_list) 134 | for tar in target_list: 135 | dfa_new_node_name = '%s$%s' % (dfa_new_node_name, tar) 136 | _type += int(self.NFA.status[tar]._type) 137 | if _type > 0: 138 | _type = 1 139 | dfa_new_node = get_create_dfaNode(dfa_new_node_name, _type) 140 | dfa_node.add_edge(alpha, dfa_new_node_name) 141 | all_status[dfa_node_name] = dfa_node 142 | all_status[dfa_new_node_name] = dfa_new_node 143 | if dfa_new_node_name in is_visit: 144 | continue 145 | else: 146 | is_visit.add(dfa_new_node_name) 147 | queue.append((target_set, dfa_new_node_name)) 148 | alphabets = set() 149 | for i in range(ord(' '), ord('~') + 1): 150 | alphabets.add(chr(i)) 151 | self.DFA = DFA(alphabets) 152 | self.DFA.status = all_status 153 | 154 | def run_on_dfa(self, line, pos): 155 | for dfa_name in self.DFA.status['start'].edge['$']: 156 | cur_pos = pos 157 | token = '' 158 | token_type = dfa_name 159 | c_node = self.DFA.status[dfa_name] 160 | while cur_pos < len(line) and line[cur_pos] in c_node.edge.keys(): 161 | token += line[cur_pos] 162 | c_node = self.DFA.status[list(c_node.edge[line[cur_pos]])[0]] 163 | cur_pos += 1 164 | if c_node._type > 0: 165 | if token in self.key_words.keys(): 166 | token_type = token 167 | return cur_pos - 1, token_type, token 168 | return pos, None, '' 169 | 170 | def read_and_analyze(self, file_name): 171 | line_num = 0 172 | lex_error = False 173 | token_table = [] 174 | for line in 
open(file_name, 'r'): 175 | pos = 0 176 | line_num += 1 177 | line = line.split('\n')[0] 178 | while pos < len(line) and not lex_error: 179 | while pos < len(line) and line[pos] in ['\t', '\n', ' ', '\r']: 180 | pos += 1 181 | if pos < len(line): 182 | pos, token_type, token = self.run_on_dfa(line, pos) 183 | if token_type is None: 184 | print 'Lexical error at line %s, column %s' % ( 185 | (str(line_num), str(pos))) 186 | lex_error = True 187 | break 188 | else: 189 | token_table.append((token_type, token)) 190 | print '(\'%s\'\t, \'%s\')' % (token_type, token) 191 | pos += 1 192 | if not lex_error: 193 | output = open('token_table.data', 'w+') 194 | for token_type, token in token_table: 195 | type_of_token = token 196 | if token_type == 'identifier' or token_type == 'number': 197 | type_of_token = token_type 198 | output.write('%s %s\n' % (type_of_token, token)) 199 | output.close() 200 | return True 201 | return False 202 | 203 | 204 | def main(): 205 | lex_ana = LexicalAnalyze() 206 | lex_ana.read_lex_grammar('lex_grammar.txt') 207 | lex_ana.create_nfa() 208 | lex_ana.nfa_to_dfa() 209 | lex_ana.read_and_analyze('source.cc') 210 | 211 | if __name__ == '__main__': 212 | main() 213 | -------------------------------------------------------------------------------- /nfa_and_dfa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | class NFANode(object): 6 | 7 | def __init__(self, name=None, _type=0): 8 | super(NFANode, self).__init__() 9 | self.name = name 10 | self._type = _type 11 | self.edge = {} 12 | 13 | def add_edge(self, alpha, target): 14 | if alpha not in self.edge: 15 | targets = set() 16 | targets.add(target) 17 | self.edge[alpha] = targets 18 | else: 19 | self.edge[alpha].add(target) 20 | 21 | 22 | class NFA(object): 23 | def __init__(self, alphabets): 24 | super(NFA, self).__init__() 25 | self.status = {} 26 | self.alphabets = alphabets 27 | 28 | def get_target(self, cur_status, alpha): 29 | if cur_status in self.status: 30 | if alpha in self.status[cur_status]: 31 | return self.status[cur_status][alpha] 32 | return None 33 | 34 | 35 | class DFANode(object): 36 | def __init__(self, name, _type=None): 37 | super(DFANode, self).__init__() 38 | self.name = name 39 | self._type = _type 40 | self.edge = {} 41 | 42 | def add_edge(self, alpha, target): 43 | if alpha not in self.edge: 44 | targets = set() 45 | targets.add(target) 46 | self.edge[alpha] = targets 47 | else: 48 | self.edge[alpha].add(target) 49 | 50 | 51 | class LRDFANode(object): 52 | 53 | def __init__(self, set_id): 54 | self.set_id = set_id 55 | self.object_set = set() 56 | self.edge = {} 57 | 58 | def add_object_set(self, id, left, right, index, tail): 59 | tmp = (id, left, right, index, tail) 60 | if tmp not in self.object_set: 61 | self.object_set.add(tmp) 62 | 63 | def add_object_set_by_set(self, object_set): 64 | self.object_set |= object_set 65 | 66 | 67 | class DFA(object): 68 | def __init__(self, alphabets): 69 | super(DFA, self).__init__() 70 | self.status = {} 71 | self.alphabets = alphabets 72 | -------------------------------------------------------------------------------- /sample_syn_grammar.txt: -------------------------------------------------------------------------------- 1 | start1:start 2 | start:a A 3 | start:b B 4 | A:c A 5 | A:d 6 | B:c B 7 | B:d 8 | -------------------------------------------------------------------------------- /sample_token_table.txt: 
-------------------------------------------------------------------------------- 1 | b b 2 | c c 3 | c c 4 | d d -------------------------------------------------------------------------------- /source.cc: -------------------------------------------------------------------------------- 1 | int a = 1, b = 234 , c = 2e4; 2 | 3 | function int max(int a,int b){ 4 | if ( a>b ) return a; 5 | else return b; 6 | } 7 | function double min(int A,int B){ 8 | if ( A 14 | arithmetic_expression:operator 15 | arithmetic_expression:primary_expression arithmetic_expression 16 | arithmetic_expression:operator primary_expression arithmetic_expression 17 | arithmetic_expression:$ 18 | constant_expression:primary_expression arithmetic_expression 19 | assignment_operator:= 20 | assignment_operator:+ = 21 | assignment_operator:- = 22 | assignment_operator:* = 23 | assignment_operator:/ = 24 | assignment_operator:% = 25 | assignment_expression:identifier assignment_operator expression 26 | assignment_expression_profix:, assignment_expression assignment_expression_profix 27 | assignment_expression_profix:$ 28 | assignment_expression_list:assignment_expression assignment_expression_profix 29 | assignment_expression_list:$ 30 | function_expression:function identifier ( expression_list ) 31 | expression:constant_expression 32 | expression:function_expression 33 | expression_profix:, expression expression_profix 34 | expression_profix:$ 35 | expression_list:expression expression_profix 36 | expression_list:$ 37 | type_specifier:char 38 | type_specifier:int 39 | type_specifier:double 40 | declaration_assign:= expression 41 | declaration_assign:$ 42 | declaration_init:identifier declaration_assign 43 | declaration_init_list:, declaration_init declaration_init_list 44 | declaration_init_list:$ 45 | declaration:type_specifier declaration_init declaration_init_list ; 46 | function_declaration:type_specifier identifier 47 | function_declaration_suffix:, function_declaration function_declaration_suffix 48 | function_declaration_suffix:$ 49 | function_declaration_list:function_declaration function_declaration_suffix 50 | function_declaration_list:$ 51 | function_definition:function type_specifier identifier ( function_declaration_list ) compound_statement 52 | statement:expression_statement 53 | statement:jump_statement 54 | statement:selection_statement 55 | statement:iteration_statement 56 | statement:compound_statement 57 | statement:declaration 58 | statement_list:statement statement_list 59 | statement_list:$ 60 | expression_statement:assignment_expression_list ; 61 | expression_statement:print ( expression ) ; 62 | expression_statement:scanf ( identifier ) ; 63 | jump_statement:continue ; 64 | jump_statement:break ; 65 | jump_statement:return expression ; 66 | selection_statement:if ( expression ) statement else statement 67 | iteration_statement:while ( expression ) statement 68 | iteration_statement:for ( declaration expression ; assignment_expression ) statement 69 | compound_statement:{ statement_list } 70 | external_declaration:function_definition 71 | external_declaration:declaration 72 | -------------------------------------------------------------------------------- /syntax_analyze.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from nfa_and_dfa import DFA, LRDFANode 5 | 6 | 7 | class SyntaxAnalyze(object): 8 | 9 | def __init__(self): 10 | super(SyntaxAnalyze, self).__init__() 11 | self.first_set = {} 12 | 
self.productions = [] 13 | self.all_elem = set() 14 | self.terminate = set() 15 | self.noterminate = set() 16 | self.productions_dict = {} 17 | self.lr_analyze_table = {} 18 | 19 | def read_syntax_grammar(self, file_name): 20 | for line in open(file_name, 'r'): 21 | line = line[:-1] 22 | cur_left = line.split(':')[0] 23 | cur_right = line.split(':')[1] 24 | right_list = [] 25 | if cur_right.find(' ') != -1: 26 | right_list = cur_right.split(' ') 27 | else: 28 | right_list.append(cur_right) 29 | production = {cur_left: right_list} 30 | self.productions.append(production) 31 | 32 | def get_terminate_noterminate(self): 33 | for production in self.productions: 34 | for left in production.keys(): 35 | if left not in self.productions_dict: 36 | self.productions_dict[left] = [] 37 | self.productions_dict[left].append(( 38 | tuple(production[left]), 39 | self.productions.index(production))) 40 | self.all_elem.add(left) 41 | self.noterminate.add(left) 42 | for right in production[left]: 43 | self.all_elem.add(right) 44 | self.terminate = self.all_elem - self.noterminate 45 | 46 | def __get_first_set(self, cur_status, all_elem): 47 | if cur_status in self.first_set: 48 | return self.first_set[cur_status] 49 | all_elem.add(cur_status) 50 | cur_status_set = set() 51 | for right_list in self.productions_dict[cur_status]: 52 | for right in right_list[0]: 53 | right_set = None 54 | if right in all_elem: 55 | continue 56 | if right in self.first_set: 57 | right_set = self.first_set[right] 58 | else: 59 | right_set = self.__get_first_set(right, all_elem) 60 | cur_status_set |= right_set 61 | if '$' not in right_set: 62 | break 63 | return cur_status_set 64 | 65 | def init_first_set(self): 66 | for terminate in self.terminate: 67 | self.first_set[terminate] = set([terminate]) 68 | for noterminate in self.noterminate: 69 | self.first_set[noterminate] = self.__get_first_set( 70 | noterminate, set()) 71 | 72 | def create_lr_dfa(self): 73 | all_status = {} 74 | all_object_set = {} 75 | self.DFA = DFA(set()) 76 | 77 | def create_get_lr_dfa_node(set_id): 78 | if set_id in all_status: 79 | return all_status[set_id] 80 | return LRDFANode(set_id=set_id) 81 | 82 | def expand_production(self, cur_production, ex_object_set): 83 | ex_object_set.add(cur_production) 84 | right = cur_production[2] 85 | point_index = cur_production[3] 86 | tail_set = cur_production[4] 87 | if point_index < len(right) and\ 88 | (right[point_index] in self.noterminate): 89 | for pro_right in self.productions_dict[right[point_index]]: 90 | new_tail_set = set() 91 | flag = True 92 | for i in range(point_index + 1, len(right)): 93 | cur_first_set = self.first_set[right[i]] 94 | if '$' in cur_first_set: 95 | new_tail_set = tuple( 96 | set(new_tail_set) | (cur_first_set - set('$'))) 97 | else: 98 | flag = False 99 | new_tail_set = tuple( 100 | set(new_tail_set) | cur_first_set) 101 | break 102 | if flag: 103 | new_tail_set = tuple(set(new_tail_set) | set(tail_set)) 104 | ex_new_production = ( 105 | pro_right[1], 106 | right[point_index], pro_right[0], 0, new_tail_set) 107 | if ex_new_production not in ex_object_set: 108 | ex_object_set |= expand_production( 109 | self, ex_new_production, ex_object_set) 110 | new_ex_object_set = {} 111 | for eos in ex_object_set: 112 | pro_key = (eos[0], eos[1], eos[2], eos[3]) 113 | if tuple(pro_key) not in new_ex_object_set: 114 | new_ex_object_set[tuple(pro_key)] = set() 115 | new_ex_object_set[pro_key] |= set(eos[4]) 116 | ex_object_set = set() 117 | for key in new_ex_object_set: 118 | production = (key[0], 
key[1], key[2], key[ 119 | 3], tuple(new_ex_object_set[key])) 120 | ex_object_set.add(tuple(production)) 121 | return ex_object_set 122 | 123 | set_id = 0 124 | new_node = create_get_lr_dfa_node(set_id) 125 | object_set = expand_production( 126 | self, (0, 'start1', ('start',), 0, '#'), set()) 127 | new_node.add_object_set_by_set(object_set) 128 | all_object_set[tuple(object_set)] = set_id 129 | all_status[set_id] = new_node 130 | object_set_queue = list() 131 | object_set_queue.append(new_node) 132 | while object_set_queue: 133 | top_object_node = object_set_queue.pop(0) 134 | old_set = top_object_node.object_set 135 | old_set_id = top_object_node.set_id 136 | # print 'object_set_id =', old_set_id 137 | for cur_production in old_set: 138 | # print cur_production 139 | pro_id = cur_production[0] 140 | left = cur_production[1] 141 | right = cur_production[2] 142 | point_index = cur_production[3] 143 | tail_set = cur_production[4] 144 | if point_index >= len(right) or '$' in right: 145 | if old_set_id not in self.lr_analyze_table: 146 | self.lr_analyze_table[old_set_id] = {} 147 | for tail in tail_set: 148 | if tail in self.lr_analyze_table[old_set_id]: 149 | print 'the grammar is not a LR(1) grammar!!!' 150 | return 151 | self.lr_analyze_table[old_set_id][tail] = ('r', pro_id) 152 | else: 153 | tar_set_id = 0 154 | new_production = (pro_id, left, right, 155 | point_index + 1, tail_set) 156 | new_object_set = expand_production( 157 | self, new_production, set()) 158 | if tuple(new_object_set) in all_object_set.keys(): 159 | tar_set_id = all_object_set[tuple(new_object_set)] 160 | else: 161 | set_id += 1 162 | tar_set_id = set_id 163 | all_object_set[tuple(new_object_set)] = set_id 164 | new_node = create_get_lr_dfa_node(tar_set_id) 165 | new_node.add_object_set_by_set(new_object_set) 166 | all_status[tar_set_id] = new_node 167 | object_set_queue.append(new_node) 168 | if old_set_id not in self.lr_analyze_table: 169 | self.lr_analyze_table[old_set_id] = {} 170 | if right[point_index] in self.terminate: 171 | self.lr_analyze_table[old_set_id][ 172 | right[point_index]] = ('s', tar_set_id) 173 | else: 174 | self.lr_analyze_table[old_set_id][ 175 | right[point_index]] = ('g', tar_set_id) 176 | self.DFA.status = all_status 177 | 178 | def run_on_lr_dfa(self, tokens): 179 | status_stack = [0] 180 | symbol_stack = ['#'] 181 | top = 0 182 | success = False 183 | tokens.reverse() 184 | while not success: 185 | top = status_stack[-1] 186 | print 'token =', tokens[-1] 187 | # print symbol_stack 188 | print symbol_stack 189 | if tokens[-1] in self.lr_analyze_table[top]: 190 | action = self.lr_analyze_table[top][tokens[-1]] 191 | if action[0] == 's': 192 | status_stack.append(action[1]) 193 | symbol_stack.append(tokens[-1]) 194 | tokens = tokens[:-1] 195 | elif action[0] == 'r': 196 | if action[1] == 0: 197 | print 'Syntax anaysis successfully!' 
198 | success = True 199 | break 200 | production = self.productions[action[1]] 201 | left = production.keys()[0] 202 | right_len = len(production[left]) 203 | tokens.append(left) 204 | if production[left] == ['$']: 205 | continue 206 | status_stack = status_stack[:-right_len] 207 | symbol_stack = symbol_stack[:-right_len] 208 | else: 209 | status_stack.append(action[1]) 210 | symbol_stack.append(tokens[-1]) 211 | tokens = tokens[:-1] 212 | # print status_stack, symbol_stack 213 | else: 214 | print self.lr_analyze_table[top] 215 | print 'Syntax error!\n' 216 | break 217 | 218 | def read_and_analyze(self, fileName): 219 | token_table = open(fileName, 'r') 220 | tokens = [] 221 | for line in token_table: 222 | line = line[:-1] 223 | tokens.append(line.split(' ')[0]) 224 | tokens.append('#') 225 | self.run_on_lr_dfa(tokens) 226 | 227 | 228 | def main(): 229 | syn_ana = SyntaxAnalyze() 230 | # syn_ana.read_syntax_grammar('sample_syn_grammar.txt') 231 | syn_ana.read_syntax_grammar('syn_grammar.txt') 232 | syn_ana.get_terminate_noterminate() 233 | syn_ana.init_first_set() 234 | syn_ana.create_lr_dfa() 235 | syn_ana.read_and_analyze('token_table.data') 236 | # syn_ana.read_and_analyze('sample_token_table.txt') 237 | # for key in syn_ana.lr_analyze_table: 238 | # print key, ': ', syn_ana.lr_analyze_table[key] 239 | # for pro in syn_ana.productions: 240 | # print syn_ana.productions.index(pro), pro 241 | # for key in syn_ana.first_set.keys(): 242 | # print 'key =', key, '\n', 'first =', syn_ana.first_set[key] 243 | # print syn_ana.productions 244 | # print '\n' 245 | # for left in syn_ana.productions_dict: 246 | # print left, ':', syn_ana.productions_dict[left] 247 | # print syn_ana.terminate 248 | # print syn_ana.noterminate 249 | 250 | if __name__ == '__main__': 251 | main() 252 | -------------------------------------------------------------------------------- /token_table.data: -------------------------------------------------------------------------------- 1 | int int 2 | identifier a 3 | = = 4 | number 1 5 | , , 6 | identifier b 7 | = = 8 | number 234 9 | , , 10 | identifier c 11 | = = 12 | number 2e4 13 | ; ; 14 | function function 15 | int int 16 | identifier max 17 | ( ( 18 | int int 19 | identifier a 20 | , , 21 | int int 22 | identifier b 23 | ) ) 24 | { { 25 | if if 26 | ( ( 27 | identifier a 28 | > > 29 | identifier b 30 | ) ) 31 | return return 32 | identifier a 33 | ; ; 34 | else else 35 | return return 36 | identifier b 37 | ; ; 38 | } } 39 | function function 40 | double double 41 | identifier min 42 | ( ( 43 | int int 44 | identifier A 45 | , , 46 | int int 47 | identifier B 48 | ) ) 49 | { { 50 | if if 51 | ( ( 52 | identifier A 53 | < < 54 | identifier B 55 | ) ) 56 | return return 57 | identifier A 58 | ; ; 59 | else else 60 | return return 61 | identifier B 62 | ; ; 63 | } } 64 | function function 65 | int int 66 | identifier main 67 | ( ( 68 | ) ) 69 | { { 70 | double double 71 | identifier sum_1_to_50 72 | = = 73 | number 1 74 | ; ; 75 | for for 76 | ( ( 77 | int int 78 | identifier i 79 | = = 80 | number 1 81 | ; ; 82 | identifier i 83 | < < 84 | number 100 85 | ; ; 86 | identifier i 87 | + + 88 | = = 89 | number 1 90 | ) ) 91 | { { 92 | if if 93 | ( ( 94 | identifier i 95 | < < 96 | number 50 97 | ) ) 98 | break break 99 | ; ; 100 | else else 101 | identifier sum_1_to_50 102 | + + 103 | = = 104 | identifier i 105 | ; ; 106 | } } 107 | int int 108 | identifier k 109 | = = 110 | number 0 111 | , , 112 | identifier s 113 | = = 114 | ( ( 115 | ( ( 116 | number 534 117 | - - 118 
| number 23 119 | ) ) 120 | + + 121 | number 423 122 | ) ) 123 | * * 124 | number 23 125 | ; ; 126 | while while 127 | ( ( 128 | identifier k 129 | < < 130 | number 40 131 | ) ) 132 | scanf scanf 133 | ( ( 134 | identifier s 135 | ) ) 136 | ; ; 137 | int int 138 | identifier A 139 | = = 140 | number 50 141 | , , 142 | identifier B 143 | = = 144 | number 23 145 | , , 146 | identifier C 147 | ; ; 148 | identifier C 149 | = = 150 | function function 151 | identifier max 152 | ( ( 153 | identifier A 154 | , , 155 | identifier B 156 | ) ) 157 | ; ; 158 | print print 159 | ( ( 160 | identifier C 161 | ) ) 162 | ; ; 163 | print print 164 | ( ( 165 | identifier A 166 | + + 167 | identifier B 168 | * * 169 | identifier C 170 | ) ) 171 | ; ; 172 | return return 173 | number 0 174 | ; ; 175 | } } 176 | --------------------------------------------------------------------------------