├── README.md └── parser_gen.py /README.md: -------------------------------------------------------------------------------- 1 | # parser_gen 2 | 3 | 一个简单的`LR(1)/LALR(1)`解析器生成工具,适用于`C++17`或更高。 4 | 5 | Working in progress. 6 | 7 | ## TODO 8 | 9 | - [x] 完成LR(1)支持 10 | - [x] 冲突解决 11 | - [ ] 完成LALR支持 12 | - [ ] 人可读报错 13 | 14 | ## 快速开始 15 | 16 | Calculator.p: 17 | 18 | ``` 19 | term Plus assoc(left) prec(1); // + 20 | term Minus assoc(left) prec(1); // - 21 | term Multiply assoc(left) prec(2); // * 22 | term Division assoc(left) prec(2); // / 23 | term LeftParen; // ( 24 | term RightParen; // ) 25 | term LiteralNumber {% int %}; 26 | 27 | nonterm exp {% int %}; 28 | 29 | grammar { 30 | exp -> LiteralNumber(value) {% return value; %}; 31 | exp -> LeftParen exp(exp) RightParen {% return exp; %}; 32 | exp -> Minus exp(rhs) prec(10) {% return -rhs; %}; 33 | exp -> exp(lhs) Plus exp(rhs) {% return lhs + rhs; %}; 34 | exp -> exp(lhs) Minus exp(rhs) {% return lhs - rhs; %}; 35 | exp -> exp(lhs) Multiply exp(rhs) {% return lhs * rhs; %}; 36 | exp -> exp(lhs) Division exp(rhs) {% return lhs / rhs; %}; 37 | }; 38 | 39 | generator {% 40 | { 41 | "class_name": "CalculatorParser" 42 | } 43 | %}; 44 | ``` 45 | 46 | Main.cpp: 47 | 48 | ```c++ 49 | #include 50 | #include 51 | #include 52 | 53 | #include "CalculatorParser.hpp" 54 | 55 | class Tokenizer { 56 | public: 57 | Tokenizer(const char* buffer) 58 | : m_pBuffer(buffer) {} 59 | public: 60 | std::tuple Advance() { 61 | using TokenTypes = CalculatorParser::TokenTypes; 62 | using TokenValues = CalculatorParser::TokenValues; 63 | while (true) { 64 | if (*m_pBuffer == '\0') 65 | return { TokenTypes::_, TokenValues {} }; 66 | 67 | char c; 68 | switch (c = *(m_pBuffer++)) { 69 | case '+': return { TokenTypes::Plus, TokenValues {} }; 70 | case '-': return { TokenTypes::Minus, TokenValues {} }; 71 | case '*': return { TokenTypes::Multiply, TokenValues {} }; 72 | case '/': return { TokenTypes::Division, TokenValues {} }; 73 | case '(': 
return { TokenTypes::LeftParen, TokenValues {} }; 74 | case ')': return { TokenTypes::RightParen, TokenValues {} }; 75 | case ' ': 76 | case '\t': 77 | case '\n': 78 | case '\r': 79 | continue; 80 | default: 81 | if (c >= '0' && c <= '9') { 82 | int ret = (c - '0'); 83 | while (*m_pBuffer >= '0' && *m_pBuffer <= '9') 84 | ret = ret * 10 + (*(m_pBuffer++) - '0'); 85 | return { TokenTypes::LiteralNumber, TokenValues { ret } }; 86 | } 87 | else 88 | throw std::runtime_error("Bad input"); 89 | } 90 | } 91 | } 92 | private: 93 | const char* m_pBuffer; 94 | }; 95 | 96 | int main() { 97 | try { 98 | while (std::cin) { 99 | std::string input; 100 | std::getline(std::cin, input); 101 | 102 | Tokenizer tokenizer(input.c_str()); 103 | CalculatorParser parser; 104 | while (true) { 105 | auto [t, v] = tokenizer.Advance(); 106 | 107 | auto ret = parser(t, v); 108 | if (ret == CalculatorParser::ParseResult::Rejected) 109 | throw std::runtime_error("Parse error"); 110 | else if (ret == CalculatorParser::ParseResult::Accepted) 111 | std::cout << parser.Result() << std::endl; 112 | 113 | if (t == CalculatorParser::TokenTypes::_) 114 | break; 115 | } 116 | }; 117 | } 118 | catch (const std::exception& ex) { 119 | std::cerr << ex.what() << std::endl; 120 | return 1; 121 | } 122 | return 0; 123 | } 124 | ``` 125 | 126 | Build: 127 | 128 | ```bash 129 | ./parser_gen.py --header-file CalculatorParser.hpp --source-file CalculatorParser.cpp Calculator.p 130 | g++ CalculatorParser.cpp Main.cpp -std=c++17 -o calculator 131 | ``` 132 | 133 | Run it: 134 | 135 | ```bash 136 | ./calculator 137 | ``` 138 | 139 | ## 特性 140 | 141 | - 生成可重入代码 142 | - 不污染命名空间 143 | - 用户驱动接口 144 | 145 | ## 语法规则文件 146 | 147 | 语法规则文件由四部分声明构成: 148 | - 终结符 149 | - 非终结符 150 | - 规则 151 | - 生成器参数 152 | 153 | ### 终结符 154 | 155 | 终结符使用下述方式声明: 156 | 157 | ``` 158 | term 标识符 {% 替换 %} ; 159 | ``` 160 | 161 | 其中,标识符用于指定终结符的名称,可以由非数字开头的若干数字、字母或者下划线构成(下同),需要注意的是单独的`_`会被识别为关键词。 162 | 163 | 
替换部分应当填写一个C/C++类型,当语法制导翻译遇到一个标识符时可以给出对应的C/C++类型的值供用户代码使用。 164 | 165 | 若替换部分留空,则该标识符的值不可在翻译过程中被使用。 166 | 167 | 此外,为了支撑算符优先冲突解决规则,可以在标识符后面使用关键字`assoc`和`prec`来指定左结合或右结合以及对应的优先级,例如: 168 | 169 | ``` 170 | term minus assoc(left) prec(1) {% Tokenizer::Token %}; 171 | ``` 172 | 173 | 其中`assoc`可以接`left`、`right`或者`none`,表明左结合、右结合或者无结合性。 174 | 175 | 其中`prec`用于指定算符优先级,算符优先级高的表达式会在`移进/规约`冲突中被优先选择。 176 | 177 | 在解决冲突时,如果发现算符无结合性则会产生错误,若解决冲突的任意一方不指定结合性或优先级,则会按照其他规约规则自动解决冲突。 178 | 179 | 此外,算符优先冲突解决规则仅适用于诸如:`Exp op Exp`的表达式,其中`op`是一个终结符。 180 | 181 | ### 非终结符 182 | 183 | 非终结符使用下述方式声明: 184 | 185 | ``` 186 | nonterm 标识符 {% 替换 %}; 187 | ``` 188 | 189 | 具体规则和终结符一致,但是不可以声明结合性或者优先级,其他内容不再赘述。 190 | 191 | ### 语法规则 192 | 193 | 声明完终结符和非终结符后可以声明语法规则,举例如下: 194 | 195 | ``` 196 | grammar { 197 | Exp -> Exp(lhs) plus Exp(rhs) {% return Ast::BinExp(lhs, rhs, Ast::BinOp::Plus); %}; 198 | Exp -> Exp(lhs) minus Exp(rhs) {% return Ast::BinExp(lhs, rhs, Ast::BinOp::Minus); %}; 199 | } 200 | ``` 201 | 202 | 语法规则定义在`grammar`块中,一个产生式具备下述形式: 203 | 204 | ``` 205 | 非终结符 -> 符号1 ( 标识符1 ) 符号2 ( 标识符2 ) ... {% 替换 %} ; 206 | ``` 207 | 208 | 其中,非终结符指示产生式由哪个非终结符推导而来,整个产生式在规约后将会具备该非终结符对应的类型。 209 | 210 | `符号1..n`指示产生式的构成,每个符号可以接一个标识符,将会在生成代码中使用符号对应的类型捕获值给解析器代码使用。 211 | 212 | 需要注意,首条规则被作为入口规则产生文法。此外如果产生式不规约任何符号,需要使用特殊的语法来声明: 213 | 214 | ``` 215 | 非终结符 -> _ {% 替换 %}; 216 | ``` 217 | 218 | 另外,为了支持单目运算符的特殊优先级,产生式本身可以指定一个独立的优先级,例如: 219 | 220 | ``` 221 | grammar { 222 | UnaryExp -> minus Exp(rhs) prec(10) {% ... 
%}; 223 | } 224 | ``` 225 | 226 | 此时,`prec`必须在产生式末尾,当生成器在解决`BinExp`和`UnaryExp`的冲突时会优先匹配`UnaryExp`。 227 | 228 | ### 代码生成参数 229 | 230 | 在完成上述定义后,你可以使用 Json 来向代码生成器传递参数,这些参数会被用于在模板中替换对应的变量: 231 | 232 | ``` 233 | generator {% 234 | { 235 | "namespace": "Test", 236 | "class_name": "MyParser", 237 | "includes": [ 238 | "Ast.hpp" 239 | ] 240 | } 241 | %} 242 | ``` 243 | 244 | ### 附录:关键词表 245 | 246 | ``` 247 | _ term nonterm grammar generator assoc prec left right none 248 | ``` 249 | 250 | ### 附录:规约/移进冲突解决规则 251 | 252 | - 下述规则被依次用于解决规约/移进冲突: 253 | - 尝试使用算符优先和结合性规则进行解决; 254 | - 采取移进规则解决; 255 | - 下述规则被依次用于解决规约/规约冲突: 256 | - 依照生成式的定义顺序解决,先定义的生成式会先被用于解决冲突; 257 | 258 | ## 生成代码接口 259 | 260 | 生成器将会依据模板产生下述样式的入口: 261 | 262 | ```c++ 263 | class Parser 264 | { 265 | public: 266 | enum class ParseResult 267 | { 268 | Undecided = 0, 269 | Accepted = 1, 270 | Rejected = 2, 271 | }; 272 | 273 | enum class TokenTypes 274 | { 275 | _ = 0, 276 | Division = 1, 277 | LeftParen = 2, 278 | LiteralNumber = 3, 279 | Minus = 4, 280 | Multiply = 5, 281 | Plus = 6, 282 | RightParen = 7, 283 | }; 284 | 285 | using TokenValues = std::variant; 286 | using ProductionValues = std::variant; 287 | using UnionValues = std::variant; 288 | 289 | public: 290 | Parser(); 291 | 292 | public: 293 | ParseResult operator()(TokenTypes token, const TokenValues& value); 294 | void Reset()noexcept; 295 | const int& Result()const noexcept { return m_stResult; } 296 | int& Result()noexcept { return m_stResult; } 297 | 298 | private: 299 | std::vector m_stStack; 300 | std::vector m_stValueStack; 301 | 302 | int m_stResult {}; 303 | }; 304 | ``` 305 | 306 | - Parser::TokenTypes 307 | 308 | 存放所有终结符的枚举表示,`Tokenizer`可以利用这里的`TokenTypes`向`Parser`传递下一个符号。 309 | 310 | - Parser::TokenValues 311 | 312 | 存放所有终结符的值表示,将会传递给用户定义的驱动函数使用。 313 | 314 | - Parser::ProductionValues 315 | 316 | 存放所有非终结符的值表示,将会在计算过程中被使用。 317 | 318 | - Parser::operator() 319 | 320 | 
通过解析器的`operator()`向解析器喂一个`Token`。如果解析失败,返回`ParseResult::Rejected`;如果解析成功,返回`ParseResult::Accepted`,并且`Parser::Result()`可以访问存储的解析结果。 321 | 322 | 若内部抛出异常,需要手动执行`Reset()`重置状态,否则行为是未定义的。 323 | 324 | - Parser::Reset() 325 | 326 | 重置状态。 327 | 328 | - Parser::Result() 329 | 330 | 获取解析结果,对应第一个产生式的非终结符类型。 331 | 332 | ## License 333 | 334 | MIT License 335 | -------------------------------------------------------------------------------- /parser_gen.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # parser_gen 5 | # 一个 LR(1)/LALR 语法解析器生成工具。 6 | # 7 | # Copyright (C) 2020 Chen Chu<1871361697@qq.com> 8 | # 9 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 10 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation the 11 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit 12 | # persons to whom the Software is furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the 15 | # Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 18 | # WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 19 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 | # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import os
import sys
import json
import argparse
import datetime
from typing import List, Set, Dict, Tuple, Optional


# ---------------------------------------- Grammar file parser ----------------------------------------
# Parses a grammar description file. A grammar file declares the terminals, the
# non-terminals, the productions and the code-generator arguments.
#
# Terminal declaration:
#     term IDENT {% replacement %};
# IDENT is a C-style identifier (digits, letters or underscores, not starting with a
# digit); note that a lone "_" is a keyword, not an identifier. The replacement is a
# C/C++ type: during syntax-directed translation the token carries a value of that
# type for user actions. If the replacement is omitted, the token's value cannot be
# referenced in actions.
# Operator terminals may additionally declare associativity and precedence, used for
# shift/reduce conflict resolution, e.g.:
#     term minus assoc(left) prec(1) {% Tokenizer::Token %};
# assoc accepts left, right or none; an operator declared assoc(none) that takes part
# in a conflict is an error. If associativity is unspecified, the default resolution
# rules below apply. prec gives the operator precedence; the higher-precedence side
# wins a shift/reduce conflict.
#
# Non-terminal declaration (same rules, but no assoc/prec allowed):
#     nonterm IDENT {% replacement %};
#
# Productions are declared inside a grammar block, e.g.:
#     grammar {
#         BinExp -> Exp(lhs) BinOp(op) Exp(rhs) {% return Ast::BinExp(lhs, rhs, op); %};
#         BinOp -> minus {% return Ast::BinOp::Minus; %};
#         BinOp -> plus {% return Ast::BinOp::Plus; %};
#     }
# Each production has the form
#     NONTERM -> sym1 ( name1 ) sym2 ( name2 ) ... {% replacement %} ;
# The left-hand non-terminal gives the type the production has after reduction; each
# right-hand symbol may bind its value to a name usable in the generated action code.
# The first production is the entry rule of the grammar. An epsilon production uses
# the special form:
#     NONTERM -> _ {% replacement %};
# A production may also carry its own precedence (it must appear at the end of the
# right-hand side), e.g. to give unary operators a higher precedence:
#     grammar {
#         UnaryExp -> minus Exp(rhs) prec(10) {% ... %};
#     }
# so that the generator prefers UnaryExp when resolving the conflict with BinExp.
#
# Finally, code-generation arguments are passed to the generator as JSON; they replace
# the matching variables in the output template:
#     generator {%
#         {
#             "namespace": "Test",
#             "class_name": "MyParser"
#         }
#     %}
#
# Appendix - keywords:
#     _ term nonterm grammar generator assoc prec left right none
#
# Appendix - conflict resolution:
#     shift/reduce conflicts are resolved by, in order:
#         1. operator precedence and associativity;
#         2. preferring the shift;
#     reduce/reduce conflicts are resolved by:
#         1. definition order - the production defined first wins.
#


class Symbol:
    """
    A grammar symbol: one terminal or non-terminal.

    __eq__/__hash__ are intentionally not overridden: each symbol is expected
    to be a unique instance within a document, so identity semantics suffice.
    """
    def __init__(self, t: int, id: str, replace: Optional[str] = None, assoc: int = 0, prec: int = 0, line: int=0):
        self._type = t
        self._id = id
        self._replace = None if replace is None else replace.strip()
        self._assoc = assoc
        self._prec = prec
        self._line = line

    def __repr__(self):
        return self._id

    def type(self) -> int:
        """
        Get the symbol type (one of the SYMBOL_* constants below).
        :return: symbol type
        """
        return self._type

    def id(self) -> str:
        """
        Get the identifier.
        :return: identifier
        """
        return self._id

    def replace(self) -> Optional[str]:
        """
        Get the replacement text (the C/C++ value type), stripped; None if absent.
        :return: replacement text
        """
        return self._replace

    def associativity(self) -> int:
        """
        Get the associativity (an ASSOC_* constant; 0 means unspecified).
        :return: associativity
        """
        return self._assoc

    def precedence(self) -> int:
        """
        Get the operator precedence (0 means unspecified).
        :return: precedence
        """
        return self._prec

    def line_defined(self) -> int:
        """
        Get the line in the grammar source where this symbol was defined.
        :return: line number
        """
        return self._line


SYMBOL_TESTER = -2  # special symbol '#', for generating LALR parser
SYMBOL_ENTRY = -1  # special symbol '@', for extending the grammar
SYMBOL_EOF = 0  # special symbol '$'
SYMBOL_TERMINAL = 1  # terminal symbol
SYMBOL_NON_TERMINAL = 2  # non-terminal symbol
155 | 156 | ASSOC_UNDEF = 0 157 | ASSOC_LEFT = 1 158 | ASSOC_RIGHT = 2 159 | ASSOC_NONE = 3 160 | 161 | kEofSymbol = Symbol(SYMBOL_EOF, "$", "") 162 | kEntrySymbol = Symbol(SYMBOL_ENTRY, "@", "") 163 | kTesterSymbol = Symbol(SYMBOL_TESTER, "#", "") 164 | 165 | 166 | class Production: 167 | """ 168 | 产生式 169 | 170 | 由一系列符号构成。 171 | """ 172 | def __init__(self, left: Symbol, right: List[Symbol], binding: Dict[int, str], replace: Optional[str] = None, 173 | prec: int = 0, line: int = -1, index: int = -1): 174 | self._left = left 175 | self._right = right 176 | self._binding = binding 177 | self._replace = replace 178 | self._prec = prec 179 | self._line = line 180 | self._index = index 181 | 182 | def __repr__(self): 183 | if self._prec != 0: 184 | return "%s -> %s prec(%d)" % (repr(self._left), " ".join([repr(x) for x in self._right]), self._prec) 185 | return "%s -> %s" % (repr(self._left), " ".join([repr(x) for x in self._right])) 186 | 187 | def __len__(self): 188 | return len(self._right) 189 | 190 | def __getitem__(self, item): 191 | assert isinstance(item, int) 192 | return self._right[item] 193 | 194 | def __eq__(self, obj) -> bool: # binding, replace, prec, line 不参与比较 195 | if not isinstance(obj, Production): 196 | return False 197 | if self._left != obj._left: 198 | return False 199 | if len(self._right) != len(obj._right): 200 | return False 201 | for i in range(0, len(self._right)): 202 | if self._right[i] != obj._right[i]: 203 | return False 204 | return True 205 | 206 | def __ne__(self, obj) -> bool: 207 | return not self == obj 208 | 209 | def __hash__(self) -> int: 210 | ret = hash(self._left) 211 | for i in range(0, len(self._right)): 212 | ret = ret ^ hash(self._right[i]) 213 | return ret 214 | 215 | def left(self) -> Symbol: 216 | """ 217 | 获取产生式对应的非终结符 218 | :return: 非终结符 219 | """ 220 | return self._left 221 | 222 | def binding(self) -> Dict[int, str]: 223 | """ 224 | 获取绑定参数名的映射情况 225 | :return: 绑定参数映射表 226 | """ 227 | return self._binding 228 | 
229 | def replace(self) -> Optional[str]: 230 | """ 231 | 获取产生式对应的替代文本 232 | :return: 替代文本 233 | """ 234 | return self._replace 235 | 236 | def precedence(self) -> int: 237 | """ 238 | 获取优先级 239 | :return: 优先级 240 | """ 241 | return self._prec 242 | 243 | def line_defined(self) -> int: 244 | """ 245 | 获取符号在源码中定义的行号 246 | :return: 行号 247 | """ 248 | return self._line 249 | 250 | def index(self) -> int: 251 | """ 252 | 获取产生式在源码中的索引 253 | :return: 索引 254 | """ 255 | return self._index 256 | 257 | 258 | class ParseError(Exception): 259 | """ 260 | 解析错误 261 | """ 262 | def __init__(self, message: str, line: int, col: Optional[int] = None): 263 | Exception.__init__(self, message) 264 | self._message = message 265 | self._line = line 266 | self._col = col 267 | 268 | def __str__(self): 269 | if self._col is not None: 270 | return f"{self._message} (line {self._line}, col {self._col})" 271 | return f"{self._message} (line {self._line})" 272 | 273 | def message(self): 274 | return self._message 275 | 276 | def line(self): 277 | return self._line 278 | 279 | def col(self): 280 | return self._col 281 | 282 | 283 | class SourceReader: 284 | """ 285 | 源代码读取器 286 | """ 287 | def __init__(self, filename): 288 | with open(filename, "r", encoding="utf-8") as f: 289 | self._content = f.read() 290 | self._pos = 0 291 | self._line = 1 292 | self._col = 0 293 | 294 | def pos(self): 295 | return self._pos 296 | 297 | def line(self): 298 | return self._line 299 | 300 | def col(self): 301 | return self._col 302 | 303 | def peek(self): 304 | if self._pos >= len(self._content): 305 | return '\0' 306 | return self._content[self._pos] 307 | 308 | def read(self): 309 | ch = self.peek() 310 | if ch == '\0': 311 | return ch 312 | self._pos = self._pos + 1 313 | self._col = self._col + 1 314 | if ch == '\n': 315 | self._line = self._line + 1 316 | self._col = 0 317 | return ch 318 | 319 | def raise_error(self, msg): 320 | raise ParseError(msg, self._line, self._col) 321 | 322 | 323 | TOKEN_EOF = 
0 324 | TOKEN_IDENTIFIER = 1 # 标识符 325 | TOKEN_LITERAL = 2 # 替换用文本 326 | TOKEN_INTEGER = 3 # 整数 327 | TOKEN_EOD = 4 # 分号 ; 328 | TOKEN_DEDUCE = 5 # 推导符号 -> 329 | TOKEN_BEGIN_BLOCK = 6 # { 330 | TOKEN_END_BLOCK = 7 # } 331 | TOKEN_BEGIN_ARG = 8 # ( 332 | TOKEN_END_ARG = 9 # ) 333 | TOKEN_EMPTY = 10 # 关键词 _ 334 | TOKEN_TERM = 11 # 关键词 term 335 | TOKEN_NON_TERM = 12 # 关键词 nonterm 336 | TOKEN_GRAMMAR = 13 # 关键词 grammar 337 | TOKEN_GENERATOR = 14 # 关键词 generator 338 | TOKEN_ASSOC = 15 # 关键词 assoc 339 | TOKEN_PREC = 16 # 关键词 prec 340 | TOKEN_LEFT = 17 # 关键词 left 341 | TOKEN_RIGHT = 18 # 关键词 right 342 | TOKEN_NONE = 19 # 关键词 none 343 | 344 | 345 | class GrammarDocument: 346 | """ 347 | 语法文件 348 | 349 | 存储语法文件内容并提供解析功能。 350 | 使用手写的递归下降来实现解析。 351 | 352 | @mq 353 | - 没有parser gen,要怎么解析语法文件 354 | - 写parser啊 355 | - 没有parser gen怎么写parser 356 | - 那就写parser gen 357 | - 写parser gen怎么解析语法规则!! 358 | - 写parser!!! 359 | """ 360 | def __init__(self): 361 | self._productions = [] # type: List[Production] 362 | self._symbols = set() # type: Set[Symbol] 363 | self._terminals = set() # type: Set[Symbol] 364 | self._non_terminals = set() # type: Set[Symbol] 365 | self._generator_args = None # type: Optional[Dict] 366 | 367 | def clear(self): 368 | self._productions = [] # type: List[Production] 369 | self._symbols = set() # type: Set[Symbol] 370 | self._terminals = set() # type: Set[Symbol] 371 | self._non_terminals = set() # type: Set[Symbol] 372 | self._generator_args = None # type: Optional[Dict] 373 | 374 | def productions(self) -> List[Production]: 375 | """ 376 | 获取所有产生式 377 | :return: 产生式列表 378 | """ 379 | return self._productions 380 | 381 | def symbols(self) -> Set[Symbol]: 382 | """ 383 | 获取所有符号 384 | :return: 符号集合 385 | """ 386 | return self._symbols 387 | 388 | def terminals(self) -> Set[Symbol]: 389 | """ 390 | 获取终结符号 391 | :return: 终结符号集合 392 | """ 393 | return self._terminals 394 | 395 | def non_terminals(self) -> Set[Symbol]: 396 | """ 397 | 获取非终结符号 398 | :return: 非终结符号集合 
399 | """ 400 | return self._non_terminals 401 | 402 | def generator_args(self) -> Optional[Dict]: 403 | """ 404 | 获取生成器参数 405 | :return: 参数 406 | """ 407 | return self._generator_args 408 | 409 | @staticmethod 410 | def _advance(reader: SourceReader): 411 | while True: 412 | if reader.peek() == '\0': 413 | return TOKEN_EOF, None, reader.line() 414 | 415 | # 跳过空白 416 | if reader.peek().isspace(): 417 | while reader.peek().isspace(): 418 | reader.read() 419 | continue 420 | 421 | # 跳过注释 422 | if reader.peek() == '/': 423 | reader.read() 424 | if reader.peek() != '/': # 当前语法只有'//'的可能 425 | reader.raise_error(f"'/' expected, but found {repr(reader.peek())}") 426 | reader.read() 427 | while reader.peek() != '\0' and reader.peek() != '\n': # 读到末尾 428 | reader.read() 429 | continue 430 | 431 | # 符号 432 | if reader.peek() == ';': 433 | line = reader.line() 434 | reader.read() 435 | return TOKEN_EOD, None, line 436 | elif reader.peek() == '-': 437 | line = reader.line() 438 | reader.read() 439 | if reader.peek() != '>': # 当前语法只有'->'可能 440 | reader.raise_error(f"'>' expected, but found {repr(reader.peek())}") 441 | reader.read() 442 | return TOKEN_DEDUCE, None, line 443 | elif reader.peek() == '{': 444 | line = reader.line() 445 | reader.read() 446 | if reader.peek() == '%': 447 | reader.read() 448 | content = [] 449 | while True: 450 | if reader.peek() == '%': 451 | reader.read() 452 | if reader.peek() == '}': 453 | reader.read() 454 | break 455 | elif reader.peek() == '%': 456 | reader.read() 457 | content.append('%') 458 | else: 459 | reader.raise_error(f"'%' or '}}' expected, but found {repr(reader.peek())}") 460 | elif reader.peek() == '\0': 461 | reader.raise_error("Unexpected eof") 462 | else: 463 | content.append(reader.read()) 464 | return TOKEN_LITERAL, "".join(content), line 465 | else: 466 | return TOKEN_BEGIN_BLOCK, None, line 467 | elif reader.peek() == '}': 468 | line = reader.line() 469 | reader.read() 470 | return TOKEN_END_BLOCK, None, line 471 | elif 
reader.peek() == '(': 472 | line = reader.line() 473 | reader.read() 474 | return TOKEN_BEGIN_ARG, None, line 475 | elif reader.peek() == ')': 476 | line = reader.line() 477 | reader.read() 478 | return TOKEN_END_ARG, None, line 479 | 480 | # 关键词/Identifier/数字 481 | content = [] 482 | if reader.peek().isalpha() or reader.peek() == '_': 483 | line = reader.line() 484 | while reader.peek().isalnum() or reader.peek() == '_': 485 | content.append(reader.read()) 486 | identifier = "".join(content) 487 | if identifier == "_": 488 | return TOKEN_EMPTY, identifier, line 489 | elif identifier == "term": 490 | return TOKEN_TERM, identifier, line 491 | elif identifier == "nonterm": 492 | return TOKEN_NON_TERM, identifier, line 493 | elif identifier == "grammar": 494 | return TOKEN_GRAMMAR, identifier, line 495 | elif identifier == "generator": 496 | return TOKEN_GENERATOR, identifier, line 497 | elif identifier == "assoc": 498 | return TOKEN_ASSOC, identifier, line 499 | elif identifier == "prec": 500 | return TOKEN_PREC, identifier, line 501 | elif identifier == "left": 502 | return TOKEN_LEFT, identifier, line 503 | elif identifier == "right": 504 | return TOKEN_RIGHT, identifier, line 505 | elif identifier == "none": 506 | return TOKEN_NONE, identifier, line 507 | return TOKEN_IDENTIFIER, identifier, line 508 | if reader.peek().isnumeric(): 509 | line = reader.line() 510 | while reader.peek().isnumeric(): 511 | content.append(reader.read()) 512 | return TOKEN_INTEGER, int("".join(content)), line 513 | reader.raise_error(f"Unexpected character '{repr(reader.peek())}'") 514 | 515 | def parse(self, filename): 516 | reader = SourceReader(filename) 517 | symbols = {} 518 | productions = [] 519 | production_set = set() 520 | generator_args = None 521 | while True: 522 | token, value, line = GrammarDocument._advance(reader) 523 | if token == TOKEN_EOF: 524 | break 525 | elif token == TOKEN_TERM: 526 | # read identifier 527 | token, identifier, line = 
GrammarDocument._advance(reader) 528 | if token != TOKEN_IDENTIFIER: 529 | raise ParseError("Identifier required parsing term statement", line) 530 | if identifier in symbols: 531 | raise ParseError(f"Terminated symbol \"{identifier}\" redefined", line) 532 | replace = None 533 | def_line = line 534 | # read assoc or prec 535 | assoc = None 536 | prec = None 537 | while True: 538 | token, value, line = GrammarDocument._advance(reader) 539 | if token == TOKEN_ASSOC: 540 | if assoc is not None: 541 | raise ParseError("Associate type redefined", line) 542 | token, _, line = GrammarDocument._advance(reader) 543 | if token != TOKEN_BEGIN_ARG: 544 | raise ParseError("'(' expected parsing associate type", line) 545 | token, _, line = GrammarDocument._advance(reader) 546 | if token == TOKEN_LEFT: 547 | assoc = ASSOC_LEFT 548 | elif token == TOKEN_RIGHT: 549 | assoc = ASSOC_RIGHT 550 | elif token == TOKEN_NONE: 551 | assoc = ASSOC_NONE 552 | else: 553 | raise ParseError("'left', 'right' or 'none' expected parsing associate type", line) 554 | token, _, line = GrammarDocument._advance(reader) 555 | if token != TOKEN_END_ARG: 556 | raise ParseError("')' expected parsing associate type", line) 557 | elif token == TOKEN_PREC: 558 | if prec is not None: 559 | raise ParseError("Precedence redefined", line) 560 | token, _, line = GrammarDocument._advance(reader) 561 | if token != TOKEN_BEGIN_ARG: 562 | raise ParseError("'(' expected parsing precedence", line) 563 | token, prec, line = GrammarDocument._advance(reader) 564 | if token != TOKEN_INTEGER: 565 | raise ParseError("Integer expected parsing precedence", line) 566 | if prec == 0: 567 | raise ParseError("Precedence must large than zero", line) 568 | token, _, line = GrammarDocument._advance(reader) 569 | if token != TOKEN_END_ARG: 570 | raise ParseError("')' expected parsing associate type", line) 571 | else: 572 | break 573 | # replace 574 | if token == TOKEN_LITERAL: 575 | replace = value 576 | token, _, line = 
GrammarDocument._advance(reader) 577 | if token != TOKEN_EOD: 578 | raise ParseError("End of definition required", line) 579 | if (assoc is not None) and (prec is None): 580 | raise ParseError("Precedence must be defined while associativity defined", def_line) 581 | symbols[identifier] = Symbol(SYMBOL_TERMINAL, identifier, replace, 582 | ASSOC_UNDEF if assoc is None else assoc, 583 | 0 if prec is None else prec, 584 | def_line) 585 | elif token == TOKEN_NON_TERM: 586 | # read identifier 587 | token, identifier, line = GrammarDocument._advance(reader) 588 | if token != TOKEN_IDENTIFIER: 589 | raise ParseError("Identifier required parsing term statement", line) 590 | if identifier in symbols: 591 | raise ParseError(f"Non-terminated symbol \"{identifier}\" redefined", line) 592 | replace = None 593 | def_line = line 594 | # replace 595 | token, value, line = GrammarDocument._advance(reader) 596 | if token == TOKEN_LITERAL: 597 | replace = value 598 | token, _, line = GrammarDocument._advance(reader) 599 | if token != TOKEN_EOD: 600 | raise ParseError("End of definition required", line) 601 | symbols[identifier] = Symbol(SYMBOL_NON_TERMINAL, identifier, replace, ASSOC_UNDEF, 0, def_line) 602 | elif token == TOKEN_GRAMMAR: 603 | token, _, line = GrammarDocument._advance(reader) 604 | if token != TOKEN_BEGIN_BLOCK: 605 | raise ParseError("'{' required parsing grammar block", line) 606 | while True: 607 | token, identifier, line = GrammarDocument._advance(reader) 608 | if token == TOKEN_END_BLOCK: # } 609 | break 610 | elif token != TOKEN_IDENTIFIER: 611 | raise ParseError("Identifier required parsing production expression", line) 612 | 613 | # identifier 614 | if identifier not in symbols: 615 | raise ParseError(f"Undefined symbol \"{identifier}\" parsing production expression", line) 616 | # -> 617 | token, _, line = GrammarDocument._advance(reader) 618 | if token != TOKEN_DEDUCE: 619 | raise ParseError("Deduce operator required parsing production expression", line) 620 
| right = [] 621 | replace = None 622 | prec = None 623 | empty_production = False 624 | def_line = line 625 | binding = {} 626 | while True: 627 | token, value, line = GrammarDocument._advance(reader) 628 | if token == TOKEN_EOD: # ; 629 | if not empty_production and len(right) == 0: 630 | raise ParseError("Symbol expected but found ';' parsing production expression", line) 631 | break 632 | elif token == TOKEN_LITERAL: 633 | if not empty_production and len(right) == 0: 634 | raise ParseError("Symbol expected but found replacement literal", line) 635 | replace = value 636 | token, _, line = GrammarDocument._advance(reader) 637 | if token != TOKEN_EOD: 638 | raise ParseError("End of definition required parsing production expression", line) 639 | break 640 | elif token == TOKEN_EMPTY: 641 | if len(right) != 0 or (prec is not None): 642 | raise ParseError("Epsilon symbol cannot be placed here parsing production expression", 643 | line) 644 | empty_production = True 645 | elif token == TOKEN_PREC: 646 | token, _, line = GrammarDocument._advance(reader) 647 | if token != TOKEN_BEGIN_ARG: 648 | raise ParseError("'(' required parsing precedence", line) 649 | token, prec, line = GrammarDocument._advance(reader) 650 | if token != TOKEN_INTEGER: 651 | raise ParseError("Integer expected parsing precedence", line) 652 | if prec == 0: 653 | raise ParseError("Precedence must large than zero", line) 654 | token, _, line = GrammarDocument._advance(reader) 655 | if token != TOKEN_END_ARG: 656 | raise ParseError("')' required parsing precedence", line) 657 | elif token == TOKEN_IDENTIFIER: 658 | if empty_production or (prec is not None): 659 | raise ParseError("Identifier cannot be placed here", line) 660 | if value not in symbols: 661 | raise ParseError(f"Undefined symbol \"{value}\"", line) 662 | right.append(symbols[value]) 663 | elif token == TOKEN_BEGIN_ARG: 664 | if len(right) == 0: 665 | raise ParseError("Symbol required for binding argument name", line) 666 | if 
right[len(right) - 1].replace() is None: 667 | raise ParseError("Symbol don't have type for binding", line) 668 | token, arg_id, line = GrammarDocument._advance(reader) 669 | if token != TOKEN_IDENTIFIER: 670 | raise ParseError("Identifier required parsing binding argument", line) 671 | token, _, line = GrammarDocument._advance(reader) 672 | if token != TOKEN_END_ARG: 673 | raise ParseError("')' expected parsing binding argument", line) 674 | binding[len(right) - 1] = arg_id 675 | else: 676 | raise ParseError("Unexpected token", line) 677 | assert len(right) > 0 or empty_production 678 | # calc prec if user not defined 679 | if prec is None: 680 | for e in reversed(right): 681 | if e.type() == SYMBOL_TERMINAL: 682 | prec = e.precedence() 683 | if prec is None: 684 | prec = 0 685 | production = Production(symbols[identifier], right, binding, replace, prec, def_line, 686 | len(productions)) 687 | if production in production_set: 688 | raise ParseError(f"Production \"{production}\" redefined", def_line) 689 | if (production.left().replace() is not None) and (production.replace() is None): 690 | raise ParseError(f"Action body expected for production \"{production}\"", def_line) 691 | productions.append(production) 692 | production_set.add(production) 693 | token, _, line = GrammarDocument._advance(reader) 694 | if token != TOKEN_EOD: 695 | raise ParseError("End of definition required parsing grammar block", line) 696 | elif token == TOKEN_GENERATOR: 697 | if generator_args is not None: 698 | raise ParseError("Generator arguments is redefined", line) 699 | try: 700 | token, json_args, line = GrammarDocument._advance(reader) 701 | except Exception as ex: 702 | raise ParseError(f"Parsing json error parsing generator block: {ex}", line) 703 | if token != TOKEN_LITERAL: 704 | raise ParseError("String literal required parsing generator block", line) 705 | token, _, line = GrammarDocument._advance(reader) 706 | if token != TOKEN_EOD: 707 | raise ParseError("';' expected 
parsing generator block", line) 708 | generator_args = json.loads(json_args) 709 | else: 710 | raise ParseError("Unexpected token", line) 711 | self._productions = productions 712 | self._symbols = set([symbols[s] for s in symbols]) 713 | self._terminals = set([s for s in self._symbols if s.type() == SYMBOL_TERMINAL]) 714 | self._non_terminals = set([s for s in self._symbols if s.type() == SYMBOL_NON_TERMINAL]) 715 | self._generator_args = generator_args 716 | 717 | # ---------------------------------------- LR(1)/LALR分析器部分 ---------------------------------------- 718 | # LR(1)/LALR分析器用于解算状态转移矩阵。 719 | # 通过对文法进行LR分析,可以得到类似下图的转换矩阵: 720 | # x opt eq $ | S E V 721 | # 0 s2 s4 | 722 | # 1 a | 723 | # 2 r3 r3 | 724 | # 3 s2 s4 | g8 g7 725 | # ……下略 726 | # 其中,表头表示向前看符号,每一行代表一个解析器状态,每一个格表明在看到下一个输入符号时需要进行的动作: 727 | # sX 表明一个移进操作,在移入下一个符号后跳转到状态X 728 | # rX 表明一个规约操作,在看到当前符号时按照产生式X进行规约,弹出解析栈顶部的|X|个元素 729 | # gX 表明在规约操作后,在看到栈顶符号为这个格子对应的符号时,转移状态到X状态 730 | # 同时分析器会依据之前的规则对 SR冲突、RR冲突 进行解决 731 | 732 | 733 | class ExtendProduction: 734 | """ 735 | 扩展生成式(项) 736 | 737 | 增加当前位置和向前看符号来计算闭包。 738 | """ 739 | def __init__(self, raw: Production, pos: int, lookahead: Set[Symbol]): 740 | assert len(raw) >= pos 741 | self._production = raw 742 | self._pos = pos 743 | self._lookahead = lookahead 744 | 745 | def __repr__(self): 746 | right = [repr(x) for x in self._production] 747 | right.insert(self._pos, "·") 748 | return "(%s -> %s, %s)" % (repr(self._production.left()), " ".join(right), self._lookahead) 749 | 750 | def __len__(self): 751 | return len(self._production) 752 | 753 | def __getitem__(self, item): 754 | assert isinstance(item, int) 755 | return self._production[item] 756 | 757 | def __eq__(self, obj) -> bool: 758 | if not isinstance(obj, ExtendProduction): 759 | return False 760 | if self._pos != obj._pos: 761 | return False 762 | if self._production != obj._production: 763 | return False 764 | if self._lookahead != obj._lookahead: 765 | return False 766 | return True 767 | 768 | 
    def __ne__(self, obj) -> bool:
        return not self == obj

    def __hash__(self) -> int:
        # XOR-fold the lookahead hashes so the result is independent of set
        # iteration order, then mix in the dot position and the raw production.
        ret = hash(self._pos)
        for x in self._lookahead:
            ret = ret ^ hash(x)
        ret = ret ^ hash(self._production)
        return ret

    def production(self) -> Production:
        """
        Get the underlying raw production.
        :return: the production
        """
        return self._production

    def pos(self) -> int:
        """
        Get the current parse (dot) position inside the production.
        :return: the position
        """
        return self._pos

    def lookahead(self) -> Set[Symbol]:
        """
        Get the associated lookahead symbols.
        :return: the lookahead symbol set
        """
        return self._lookahead


class ExtendProductionSet:
    """
    Set of extended productions (an LR item set).
    """
    def __init__(self, s: Set[ExtendProduction], state_id: Optional[int]):
        self._set = s
        self._state = state_id

    def __len__(self):
        return len(self._set)

    def __eq__(self, obj) -> bool:  # state_id does not take part in comparison
        if not isinstance(obj, ExtendProductionSet):
            return False
        return self._set == obj._set

    def __ne__(self, obj) -> bool:
        return not self == obj

    def __hash__(self) -> int:
        # Order-independent hash: XOR-fold the element hashes.
        ret = 0
        for x in self._set:
            ret = ret ^ hash(x)
        return ret

    def __iter__(self):
        return iter(self._set)

    def __repr__(self):
        ret = ["["]
        for e in self._set:
            ret.append(f" {repr(e)}")
        ret.append("]")
        return "\n".join(ret)

    def state(self):
        """
        Get the state ID (not assigned yet when -1/None).
        :return: the state ID
        """
        return self._state

    def set_state(self, state):
        """
        Set the state ID.
        """
        self._state = state

    def add(self, x: ExtendProduction):
        """
        Add an item to this set.
        :param x: the item
        """
        self._set.add(x)

    def union(self, x):
        """
        Merge another collection into this set in place.
        :param x: a plain set of items or another ExtendProductionSet
        """
        if isinstance(x, set):
            self._set = self._set.union(x)
        else:
            assert isinstance(x, ExtendProductionSet)
            self._set = self._set.union(x._set)

    def clone(self):
        """
        Create a copy (the element set is copied, items themselves are shared).
        :return: the cloned item set
        """
        return ExtendProductionSet(set(self._set), self._state)


ACTION_ACCEPT = 1
ACTION_GOTO = 2  # Shift and Goto share one action kind: non-terminals always Goto, terminals always Shift
ACTION_REDUCE = 3  # reduce action


class Action:
    """
    A parser action (one cell of the ACTION/GOTO table).
    """
    def __init__(self, action: int, arg, ref_state: ExtendProductionSet, ref_symbol: Symbol,
                 ref_prod: Optional[ExtendProduction]):
        self._action = action
        self._arg = arg
        self._ref_state = ref_state
        self._ref_symbol = ref_symbol
        self._ref_prod = ref_prod

        # argument sanity checks
        if action == ACTION_GOTO:
            assert isinstance(arg, ExtendProductionSet)
        elif action == ACTION_REDUCE:
            assert isinstance(arg, Production)
            assert arg.index() >= 0

    def __repr__(self):
        if self._action == ACTION_ACCEPT:
            return "a"
        elif self._action == ACTION_GOTO:
            assert isinstance(self._arg, ExtendProductionSet)
            if self._ref_symbol.type() == SYMBOL_NON_TERMINAL:
                return f"g{self._arg.state()}"
            else:
                return f"s{self._arg.state()}"
        elif self._action == ACTION_REDUCE:
            assert isinstance(self._arg, Production)
            return f"r{self._arg.index()}"
        return ""

    def action(self) -> int:
        """
        Get the action kind (one of the ACTION_* constants).
        :return: the action kind
        """
        return self._action

    def arg(self):
        """
        Get the action argument (target item set for Goto, production for Reduce).
        :return: the argument
        """
        return self._arg

    def ref_state(self) -> ExtendProductionSet:
        """
        Get the item set (state) this action originates from.
        :return: the item set
        """
        return self._ref_state

    def ref_symbol(self) -> Symbol:
        """
        Get the symbol this action is keyed on.
        :return: the symbol
        """
        return self._ref_symbol

    def ref_prod(self) -> Optional[ExtendProduction]:
        """
        Get the associated item (extended production).

        Shift actions have no associated item.
        :return: the item
""" 947 | return self._ref_prod 948 | 949 | 950 | class GrammarError(Exception): 951 | """ 952 | 解析错误 953 | """ 954 | def __init__(self, message: str): 955 | Exception.__init__(self, message) 956 | self._message = message 957 | 958 | def message(self): 959 | return self._message 960 | 961 | 962 | GRAMMAR_MODE_LR1 = 0 963 | GRAMMAR_MODE_LALR = 1 964 | 965 | 966 | class GrammarAnalyzer: 967 | def __init__(self, document: GrammarDocument): 968 | self._doc = document 969 | 970 | # 初始化 NullableSet、FirstSet 和 FollowSet 并计算 971 | # 注意这个 Set 会包含 kEofSymbol 972 | self._nullable_set = {} # type: Dict[Symbol, bool] 973 | self._first_set = {} # type: Dict[Symbol, Set[Symbol]] 974 | self._follow_set = {} # type: Dict[Symbol, Set[Symbol]] 975 | self._analyze_nullable_first_follow_set() 976 | 977 | # 初始化扩展符号表 978 | self._extend_symbols = set(self._doc.symbols()) # type: Set[Symbol] 979 | self._extend_symbols.add(kEofSymbol) 980 | 981 | # 初始化分析动作表 982 | self._actions = {} # type: Dict[Symbol, Dict[int, Action]] 983 | self._max_state = 0 # 最大的状态ID 984 | self._resolve_rr_conflict = 0 # 解决Reduce/Reduce冲突的次数 985 | self._resolve_sr_conflict_by_prec = 0 # 解决Reduce/Shift冲突的次数(通过算符优先) 986 | self._resolve_sr_conflict_by_shift = 0 # 解决Reduce/Shift冲突的次数(通过Shift优先) 987 | self._reset_actions() 988 | 989 | def _analyze_nullable_first_follow_set(self): 990 | # 对所有产生式执行拓扑排序的计算,并按照出度从小到大排序 991 | toposort_states = {} # type: Dict[Symbol, Dict] 992 | toposort_results = [] # type: List[Production] 993 | 994 | # 初始化数据集 995 | for s in self._doc.non_terminals(): 996 | toposort_states[s] = { 997 | "out": 0, # 出度 998 | "from": [], # 入度 999 | "visited": False, # 是否已处理 1000 | "productions": [], # 从当前非终结符号导出的产生式 1001 | } 1002 | for p in self._doc.productions(): 1003 | toposort_states[p.left()]["productions"].append(p) 1004 | for i in range(0, len(p)): 1005 | if p[i].type() == SYMBOL_NON_TERMINAL: 1006 | toposort_states[p.left()]["out"] += 1 1007 | 
toposort_states[p[i]]["from"].append(toposort_states[p.left()]) 1008 | 1009 | # 迭代进行拓扑排序直到集合为空 1010 | while len(toposort_results) < len(self._doc.productions()): 1011 | refs_min = None 1012 | for k in toposort_states: # 寻找一个出度最小节点 1013 | state = toposort_states[k] 1014 | if state["visited"]: 1015 | continue 1016 | if refs_min is None or state["out"] < refs_min["out"]: 1017 | refs_min = state 1018 | assert refs_min is not None 1019 | toposort_results.extend(refs_min["productions"]) # 将当前节点的产生式放入队列 1020 | # 从集合中隐藏当前节点 1021 | refs_min["visited"] = True 1022 | for e in refs_min["from"]: 1023 | e["out"] -= 1 1024 | assert e["out"] >= 0 1025 | assert len(toposort_results) == len(self._doc.productions()) 1026 | 1027 | # 初始化集合 1028 | nullable_set = {kEofSymbol: False} # type: Dict[Symbol, bool] 1029 | first_set = {kEofSymbol: {kEofSymbol}} # type: Dict[Symbol, Set[Symbol]] 1030 | follow_set = {kEofSymbol: set()} # type: Dict[Symbol, Set[Symbol]] 1031 | for s in self._doc.symbols(): 1032 | nullable_set[s] = False 1033 | first_set[s] = {s} if s.type() == SYMBOL_TERMINAL else set() 1034 | follow_set[s] = set() 1035 | 1036 | # 迭代到不动点计算NULLABLE、FIRST集合和FOLLOW集合 1037 | while True: 1038 | stopped = True 1039 | for p in toposort_results: 1040 | s = p.left() 1041 | 1042 | # 检查产生式是否可空,即产生式中所有项都可空能推导出当前的非终结符可空 1043 | if not nullable_set[s]: # 对于已经认为可空的永远不会变为非可空 1044 | nullable = True 1045 | for i in range(0, len(p)): 1046 | if not nullable_set[p[i]]: # 非空 1047 | nullable = False 1048 | break 1049 | if nullable_set[s] != nullable: 1050 | nullable_set[s] = nullable 1051 | stopped = False 1052 | 1053 | # 计算FIRST集 1054 | first = set(first_set[s]) 1055 | for i in range(0, len(p)): 1056 | # 若 p[0..i] 都可空,那么 first[s] = first[s] ∪ first[p[i]] 1057 | prefix_nullable = True 1058 | for j in range(0, i): 1059 | if not nullable_set[p[j]]: 1060 | prefix_nullable = False 1061 | break 1062 | if prefix_nullable: 1063 | first = first.union(first_set[p[i]]) 1064 | else: 1065 | break # 
如果中间出现过不可空的,则无需继续看 1066 | if first_set[s] != first: 1067 | first_set[s] = first 1068 | stopped = False 1069 | 1070 | # 计算FOLLOW集 1071 | for i in range(0, len(p)): 1072 | x = p[i] # 注意此时计算的目标是产生式中的每个项 1073 | follow = set(follow_set[x]) # copy 1074 | # 若 p[i+1..len(p)] 都可空,那么 follow[x] = follow[x] ∪ follow[s] 1075 | postfix_nullable = True 1076 | for j in range(i + 1, len(p)): 1077 | if not nullable_set[p[j]]: 1078 | postfix_nullable = False 1079 | break 1080 | if postfix_nullable: 1081 | follow = follow.union(follow_set[s]) 1082 | # 若 p[i+1..j] 都可空,那么 follow[x] = follow[x] ∪ first[j] 1083 | for j in range(i + 1, len(p)): 1084 | inner_nullable = True 1085 | for k in range(i + 1, j): 1086 | if not nullable_set[p[k]]: 1087 | inner_nullable = False 1088 | break 1089 | if inner_nullable: 1090 | follow = follow.union(first_set[p[j]]) 1091 | if follow_set[x] != follow: 1092 | follow_set[x] = follow 1093 | stopped = False 1094 | if stopped: 1095 | break 1096 | self._nullable_set = nullable_set 1097 | self._first_set = first_set 1098 | self._follow_set = follow_set 1099 | 1100 | def _reset_actions(self): 1101 | for s in self._extend_symbols: 1102 | self._actions[s] = {} 1103 | self._max_state = 0 1104 | self._resolve_rr_conflict = 0 1105 | self._resolve_sr_conflict_by_prec = 0 1106 | self._resolve_sr_conflict_by_shift = 0 1107 | 1108 | def _closure(self, org: ExtendProductionSet) -> ExtendProductionSet: 1109 | """ 1110 | 求项集的闭包 1111 | :param org: 原始项集 1112 | :return: 项集的闭包 1113 | """ 1114 | ret = org.clone() # copy 1115 | ret.set_state(-1) # 需要外部重新赋予状态ID 1116 | add = set() 1117 | while True: 1118 | for e in ret: 1119 | if e.pos() >= len(e.production()): 1120 | continue 1121 | 1122 | x = e.production()[e.pos()] 1123 | if x.type() == SYMBOL_TERMINAL: 1124 | continue 1125 | if x.type() == SYMBOL_EOF: 1126 | assert (len(e.lookahead()) == 0) 1127 | continue 1128 | assert(x.type() != SYMBOL_ENTRY) 1129 | 1130 | # 计算 FIRST 集 1131 | first = set() 1132 | for i in range(e.pos() + 1, 
len(e.production()) + 1): 1133 | # 若 p[cur+1..i] 都可空,那么 first[X] = first[X] ∪ first[p[i]] 1134 | prefix_nullable = True 1135 | for j in range(e.pos() + 1, i): 1136 | if not self._nullable_set[e.production()[j]]: 1137 | prefix_nullable = False 1138 | break 1139 | if prefix_nullable: 1140 | if i == len(e.production()): 1141 | first = first.union(e.lookahead()) 1142 | else: 1143 | first = first.union(self._first_set[e.production()[i]]) 1144 | else: 1145 | break # 如果中间出现过不可空的,则无需继续看 1146 | 1147 | # 展开终结符 1148 | for p in self._doc.productions(): 1149 | if p.left() == x: 1150 | for w in first: 1151 | item = ExtendProduction(p, 0, {w}) 1152 | if item not in ret and item not in add: 1153 | add.add(item) 1154 | 1155 | if len(add) == 0: 1156 | break 1157 | ret.union(add) 1158 | add.clear() 1159 | return ret 1160 | 1161 | def _goto(self, org: ExtendProductionSet, x: Symbol) -> ExtendProductionSet: 1162 | """ 1163 | 求项集在符号 X 下可以转移到的状态 1164 | :param org: 原始项集 1165 | :param x: 转移符号 1166 | :return: 输出状态 1167 | """ 1168 | ret = set() 1169 | for e in org: 1170 | if e.pos() >= len(e.production()): 1171 | continue 1172 | s = e.production()[e.pos()] 1173 | if s != x: 1174 | continue 1175 | p = ExtendProduction(e.production(), e.pos() + 1, e.lookahead()) 1176 | if p not in ret: 1177 | ret.add(p) 1178 | return self._closure(ExtendProductionSet(ret, -1)) # 需要外部重新赋予状态ID 1179 | 1180 | def _populate_action(self, s: Symbol, state: int, act: Action): 1181 | if state in self._actions[s]: # 冲突解决 1182 | raise_error = True 1183 | conflict_type = 0 # 0: unknown 1: shift/shift冲突 2:shift/reduce冲突 3:reduce/reduce冲突 1184 | conflict_args = () 1185 | org_action = self._actions[s][state] 1186 | assert state == org_action.ref_state().state() 1187 | 1188 | # 如果存在Shift/Shift冲突,则抛出错误 1189 | if org_action.action() == ACTION_GOTO and act.action() == ACTION_GOTO: 1190 | assert isinstance(org_action.arg(), ExtendProductionSet) 1191 | assert isinstance(act.arg(), ExtendProductionSet) 1192 | conflict_type = 1 1193 
                conflict_args = (s, org_action.ref_state().state(), org_action.arg(), act.arg())

            # Reduce/reduce conflict: keep the production that appears first
            # in the grammar file
            if org_action.action() == ACTION_REDUCE and act.action() == ACTION_REDUCE:
                assert isinstance(org_action.arg(), Production)
                assert isinstance(act.arg(), Production)
                assert org_action.arg().index() != act.arg().index()
                conflict_type = 3
                conflict_args = (s, org_action.ref_state().state(), org_action.arg(), act.arg())
                raise_error = False
                self._resolve_rr_conflict += 1
                if act.arg().index() > org_action.arg().index():
                    return  # reject the later production (keep the earlier one)

            # Shift/reduce conflict
            if (org_action.action() == ACTION_REDUCE and act.action() == ACTION_GOTO) or \
                    (org_action.action() == ACTION_GOTO and act.action() == ACTION_REDUCE):
                if org_action.action() == ACTION_REDUCE:
                    reduce_action = org_action
                    shift_action = act
                else:
                    reduce_action = act
                    shift_action = org_action
                reduce_production = reduce_action.arg()  # type: Production
                shift_state = shift_action.arg()  # type: ExtendProductionSet
                assert isinstance(reduce_production, Production)
                assert isinstance(shift_state, ExtendProductionSet)
                assert shift_action.ref_symbol() == s
                assert s.type() != SYMBOL_NON_TERMINAL  # an SR conflict can never occur on a non-terminal
                conflict_type = 2
                conflict_args = (s, org_action.ref_state().state(), reduce_production)

                accept_reduce = None
                raise_error = False

                # First try operator precedence.
                # The grammar rules guarantee that whenever associativity is
                # defined an operator precedence is defined as well; symbols or
                # productions without a precedence are never resolved this way.
                if s.type() == SYMBOL_TERMINAL and s.precedence() > 0 and reduce_production.precedence() > 0:
                    # Equal precedence: decide by associativity
                    if s.precedence() == reduce_production.precedence():
                        # locate the rightmost terminal of the reduce
                        # production to read its associativity
                        reduce_symbol = None
                        for i in range(len(reduce_production) - 1, -1, -1):
                            if reduce_production[i].type() == SYMBOL_TERMINAL:
                                reduce_symbol = reduce_production[i]
                                break
                        assert reduce_symbol is not None

                        if reduce_symbol.associativity() == ASSOC_NONE or s.associativity() == ASSOC_NONE:
                            # explicitly non-associative: report an error
                            raise_error = True
                        elif reduce_symbol.associativity() == ASSOC_UNDEF or s.associativity() == ASSOC_UNDEF:
                            # associativity undefined: fall back to the prefer-shift rule
                            pass
                        elif reduce_symbol.associativity() != s.associativity():
                            # inconsistent associativity: report an error
                            raise_error = True
                        else:
                            # consistent associativity resolves the SR conflict
                            assert reduce_symbol.associativity() == s.associativity()

                            # left-associative: reduce; right-associative: shift
                            if s.associativity() == ASSOC_LEFT:
                                accept_reduce = True
                            else:
                                assert s.associativity() == ASSOC_RIGHT
                                accept_reduce = False
                            self._resolve_sr_conflict_by_prec += 1
                    else:  # different precedence: the higher one decides reduce/shift
                        if reduce_production.precedence() > s.precedence():
                            accept_reduce = True
                        else:
                            accept_reduce = False
                        self._resolve_sr_conflict_by_prec += 1

                # If operator precedence did not decide, prefer shift
                if (accept_reduce is None) and (not raise_error):
                    accept_reduce = False
                    self._resolve_sr_conflict_by_shift += 1

                # Finally decide whether the new action overrides the old one:
                # if the winning side is already in the table, keep it
                if accept_reduce is not None:
                    assert not raise_error
                    if accept_reduce and reduce_action == org_action:
                        return
                    elif not accept_reduce and reduce_action == act:
                        return

            assert conflict_type != 0
            if raise_error:  # the conflict could not be resolved
                if conflict_type == 1:
                    raise GrammarError(f"Shift/shift conflict detected, symbol {repr(conflict_args[0])}, state: "
                                       f"{repr(conflict_args[1])}, shift state 1: {repr(conflict_args[2])}, "
                                       f"shift state 2: {repr(conflict_args[3])}")
                elif conflict_type == 2:
                    raise GrammarError(f"Shift/reduce conflict detected, state: {repr(conflict_args[1])}, "
                                       f"shift symbol: {repr(conflict_args[0])}, reduce production: "
                                       f"{repr(conflict_args[2])}")
                elif conflict_type == 3:
                    assert False  # reduce/reduce conflicts are always resolved above
        self._actions[s][state] = act  # overwrite the slot

    def _process_lr1(self):
        """
        Build the canonical LR(1) action table by breadth-first expansion of
        item sets starting from the augmented entry production.
        """
        # Use the first production as the entry rule
        entry_production = Production(kEntrySymbol, [self._doc.productions()[0].left(), kEofSymbol], {})
        entry_production_ex = ExtendProduction(entry_production, 0, set())
        entry_item_set = self._closure(ExtendProductionSet({entry_production_ex}, -1))
        entry_item_set.set_state(0)  # initial state

        # Initialize the state collection and work queue
        next_state = 1
        states = {entry_item_set: entry_item_set.state()}  # type: Dict[ExtendProductionSet, int]
        q = [entry_item_set]  # type: List[ExtendProductionSet]

        # Compute the action table
        while len(q) > 0:
            state = q.pop(0)
            assert states[state] == state.state()

            # Fill in reduce actions for completed items
            for p in state:
                if p.pos() >= len(p.production()):
                    for x in p.lookahead():
                        action = Action(ACTION_REDUCE, p.production(), state, x, p)
                        self._populate_action(x, state.state(), action)

            # Compute shift/goto/accept actions
            for x in self._extend_symbols:
                goto = self._goto(state, x)
                if len(goto) == 0:
                    continue
                if x == kEofSymbol:
                    for p in goto:
                        if p.pos() >= len(p.production()):
                            action = Action(ACTION_ACCEPT, None, state, x, p)
                            self._populate_action(x, state.state(), action)
                        else:
                            assert False  # a state reached via EOF can only reduce, nothing else is possible
                else:
                    if goto in states:
                        goto.set_state(states[goto])
                    else:
                        goto.set_state(next_state)
                        next_state += 1
                        states[goto] = goto.state()
                        q.append(goto)
                    assert goto.state() != -1
                    action = Action(ACTION_GOTO, goto, state, x, None)
                    self._populate_action(x, state.state(), action)
        self._max_state = next_state - 1

    def document(self):
        """
        Get the original grammar document.
        :return: the document object
        """
        return self._doc

    def actions(self):
        """
        Get the computed action table.
        :return: the action transition table
        """
        return self._actions

    def max_state(self):
        """
        Get the largest state ID.
        """
        return self._max_state

    def printable_actions(self) -> str:
        """
        Get a human-readable rendering of the action table.
        :return: the formatted string
        """
        ret = []
        header = [None]  # table header
        for s in self._doc.terminals():
            header.append(s)
        header.append(kEofSymbol)
        for s in self._doc.non_terminals():
            header.append(s)
        min_width = len(str(self._max_state)) + 1
        header_width = [max(min_width, len(s.id()) if s is not None else 0) for s in header]

        # print the header row
        ret.append(" | ".join([header[i].id().rjust(header_width[i]) if header[i] is not None else
                               "".rjust(header_width[i]) for i in range(0, len(header))]))

        # print every (non-empty) state row
        for s in range(0, self._max_state + 1):
            empty = True
            data = []
            for i in range(0, len(header)):
                if i == 0:
                    data.append(str(s).rjust(header_width[i]))
                else:
                    if s in self._actions[header[i]]:
                        data.append(repr(self._actions[header[i]][s]).rjust(header_width[i]))
                        empty = False
                    else:
                        data.append("".rjust(header_width[i]))
            if not empty:
                ret.append(" | ".join(data))
        return "\n".join(ret)

    def process(self, mode):
        """
        Run the grammar analysis.
        :param mode: the grammar mode (GRAMMAR_MODE_LR1 or GRAMMAR_MODE_LALR)
        :raises NotImplementedError: LALR mode is not implemented yet
        """
        self._reset_actions()
        if mode == GRAMMAR_MODE_LR1:
            self._process_lr1()
        else:
            assert mode == GRAMMAR_MODE_LALR
            # TODO
            raise NotImplementedError()

    def resolve_stat(self) -> Tuple[int, int, int]:
        # (reduce/reduce, shift/reduce by precedence, shift/reduce by shift-preference)
        return self._resolve_rr_conflict, self._resolve_sr_conflict_by_prec, self._resolve_sr_conflict_by_shift

# ---------------------------------------- Template renderer ----------------------------------------
# See https://github.com/9chu/et-py

_TEMPLATE_FOR_MISSING = object()  # sentinel: loop variable was absent from the context before the loop


class TemplateNode:
    """Base class for template AST nodes."""
    def __init__(self, parent):
        self.parent = parent
        self.nodes = []  # child nodes

    def render(self, context):
        pass


class TemplateForNode(TemplateNode):
    """'for <identifier> in <expression>' block node."""
    def __init__(self, parent, identifier, expression):
        TemplateNode.__init__(self, parent)
        self.identifier = identifier
        self.expression = expression

    def render(self, context):
        # NOTE: the expression is evaluated with eval(); template input must
        # come from a trusted source.
        result = eval(self.expression, None, context)
        # Remember the previous binding of the loop variable so it can be
        # restored after the loop. The sentinel distinguishes "absent" from
        # any real value, including falsy ones such as 0/""/None — the old
        # `if origin:` check failed to restore falsy values and leaked the
        # loop variable when it had not existed before.
        origin = context.get(self.identifier, _TEMPLATE_FOR_MISSING)
        for i in result:
            context[self.identifier] = i
            yield iter(self.nodes)
        if origin is _TEMPLATE_FOR_MISSING:
            context.pop(self.identifier, None)  # do not leak the loop variable
        else:
            context[self.identifier] = origin


class TemplateIfNode(TemplateNode):
    """'if <expression>' block node (children form the true branch)."""
    def __init__(self, parent, expression):
        TemplateNode.__init__(self, parent)
        self.expression = expression
        self.true_branch = self.nodes

    def render(self, context):
        test = eval(self.expression, None, context)
        if test:
            yield iter(self.true_branch)


class TemplateIfElseNode(TemplateNode):
    """'if/else' block node, upgraded from an existing TemplateIfNode."""
    def __init__(self, parent, if_node):  # extends the given IfNode
        TemplateNode.__init__(self, parent)
        self.expression = if_node.expression
        self.true_branch = if_node.true_branch
        self.false_branch = self.nodes

    def render(self, context):
        test = eval(self.expression, None, context)
        if test:
            yield iter(self.true_branch)
        else:
            yield iter(self.false_branch)


class TemplateExpressionNode(TemplateNode):
    """Inline expression node: renders to the evaluated expression value."""
    def __init__(self, parent, expression):
        TemplateNode.__init__(self, parent)
        self.expression = expression

    def render(self, context):
        return eval(self.expression, None, context)


class TextConsumer:
    def __init__(self, text):
        self._text = text
        self._len = len(text)
self._pos = 0 1486 | self._line = 1 1487 | self._row = 0 1488 | 1489 | def get_pos(self): 1490 | return self._pos 1491 | 1492 | def get_line(self): 1493 | return self._line 1494 | 1495 | def get_row(self): 1496 | return self._row 1497 | 1498 | def read(self): 1499 | if self._pos >= self._len: 1500 | return '\0' 1501 | ch = self._text[self._pos] 1502 | self._pos += 1 1503 | self._row += 1 1504 | if ch == '\n': 1505 | self._line += 1 1506 | self._row = 0 1507 | return ch 1508 | 1509 | def peek(self, advance=0): 1510 | if self._pos + advance >= self._len: 1511 | return '\0' 1512 | return self._text[self._pos + advance] 1513 | 1514 | def substr(self, begin, end): 1515 | return self._text[begin:end] 1516 | 1517 | 1518 | class TemplateParser: 1519 | OUTER_TOKEN_LITERAL = 1 1520 | OUTER_TOKEN_EXPRESS = 2 1521 | 1522 | RESERVED = ["and", "as", "assert", "break", "class", "continue", "def", "del", "elif", "else", "except", "exec", 1523 | "finally", "for", "from", "global", "if", "import", "in", "is", "lambda", "not", "or", "pass", "print", 1524 | "raise", "return", "try", "while", "with", "yield"] 1525 | 1526 | def __init__(self, text): 1527 | self._text = text 1528 | self._consumer = TextConsumer(text) 1529 | 1530 | @staticmethod 1531 | def _is_starting_by_new_line(text): 1532 | for i in range(0, len(text)): 1533 | ch = text[i:i + 1] 1534 | if ch == '\n': 1535 | return True 1536 | elif not ch.isspace(): 1537 | break 1538 | return False 1539 | 1540 | @staticmethod 1541 | def _is_ending_by_new_line(text): 1542 | for i in range(len(text) - 1, -1, -1): 1543 | ch = text[i:i + 1] 1544 | if ch == '\n': 1545 | return True 1546 | elif not ch.isspace(): 1547 | break 1548 | return False 1549 | 1550 | @staticmethod 1551 | def _trim_left_until_new_line(text): 1552 | for i in range(0, len(text)): 1553 | ch = text[i:i+1] 1554 | if ch == '\n': 1555 | return text[i+1:] 1556 | elif not ch.isspace(): 1557 | break 1558 | return text 1559 | 1560 | @staticmethod 1561 | def 
_trim_right_until_new_line(text): 1562 | for i in range(len(text) - 1, -1, -1): 1563 | ch = text[i:i+1] 1564 | if ch == '\n': 1565 | return text[0:i+1] # save right \n 1566 | elif not ch.isspace(): 1567 | break 1568 | return text 1569 | 1570 | @staticmethod 1571 | def _parse_blank(consumer): 1572 | while consumer.peek().isspace(): # 跳过所有空白 1573 | consumer.read() 1574 | 1575 | @staticmethod 1576 | def _parse_identifier(consumer): 1577 | ch = consumer.peek() 1578 | if not (ch.isalpha() or ch == '_'): 1579 | return "" 1580 | chars = [consumer.read()] # ch 1581 | ch = consumer.peek() 1582 | while ch.isalnum() or ch == '_': 1583 | chars.append(consumer.read()) # ch 1584 | ch = consumer.peek() 1585 | return "".join(chars) 1586 | 1587 | @staticmethod 1588 | def _parse_inner(content, line, row): 1589 | """内层解析函数 1590 | 考虑到表达式解析非常费力不讨好,这里采用偷懒方式进行。 1591 | 表达式全部交由python自行解决,匹配仅匹配开头,此外不处理注释(意味着不能在表达式中包含注释内容)。 1592 | 当满足 for in <...> 时产生 for节点 1593 | 当满足 if <...> 时产生 if节点 1594 | 当满足 elif <...> 时产生 elif节点 1595 | 当满足 else 时产生 else节点 1596 | 当满足 end 时产生 end节点 1597 | :param content: 内层内容 1598 | :param line: 起始行 1599 | :param row: 起始列 1600 | :return: 节点名称, 表达式部分, 可选的Identifier 1601 | """ 1602 | consumer = TextConsumer(content) 1603 | TemplateParser._parse_blank(consumer) 1604 | operator = TemplateParser._parse_identifier(consumer) 1605 | identifier = None 1606 | if operator == "for": 1607 | TemplateParser._parse_blank(consumer) 1608 | identifier = TemplateParser._parse_identifier(consumer) 1609 | if identifier == "" or (identifier in TemplateParser.RESERVED): 1610 | raise ParseError("Identifier expected", consumer.get_line() + line - 1, 1611 | consumer.get_row() + row if consumer.get_line() == 1 else consumer.get_row()) 1612 | TemplateParser._parse_blank(consumer) 1613 | if TemplateParser._parse_identifier(consumer) != "in": 1614 | raise ParseError("Keyword 'in' expected", consumer.get_line() + line - 1, 1615 | consumer.get_row() + row if consumer.get_line() == 1 else 
consumer.get_row()) 1616 | TemplateParser._parse_blank(consumer) 1617 | expression = content[consumer.get_pos():] 1618 | if expression == "": 1619 | raise ParseError("Expression expected", consumer.get_line() + line - 1, 1620 | consumer.get_row() + row if consumer.get_line() == 1 else consumer.get_row()) 1621 | elif operator == "if" or operator == "elif": 1622 | TemplateParser._parse_blank(consumer) 1623 | expression = content[consumer.get_pos():] 1624 | if expression == "": 1625 | raise ParseError("Expression expected", consumer.get_line() + line - 1, 1626 | consumer.get_row() + row if consumer.get_line() == 1 else consumer.get_row()) 1627 | elif operator == "end" or operator == "else": 1628 | TemplateParser._parse_blank(consumer) 1629 | expression = content[consumer.get_pos():] 1630 | if expression != '': 1631 | raise ParseError("Unexpected content", consumer.get_line() + line - 1, 1632 | consumer.get_row() + row if consumer.get_line() == 1 else consumer.get_row()) 1633 | else: 1634 | operator = "" 1635 | expression = content 1636 | return operator, expression.strip(), identifier 1637 | 1638 | def _parse_outer(self): 1639 | """外层解析函数 1640 | 将输入拆分成字符串(Literal)和表达式(Expression)两个组成。 1641 | 遇到'{%'开始解析Expression,在解析Expression时允许使用'%%'转义,即'%%'->'%',这使得'%%>'->'%>'而不会结束表达式。 1642 | :return: 类型, 内容, 起始行, 起始列 1643 | """ 1644 | begin = self._consumer.get_pos() 1645 | end = begin # [begin, end) 1646 | begin_line = self._consumer.get_line() 1647 | begin_row = self._consumer.get_row() 1648 | ch = self._consumer.peek() 1649 | while ch != '\0': 1650 | if ch == '{': 1651 | ahead = self._consumer.peek(1) 1652 | if ahead == '%': 1653 | if begin != end: 1654 | return TemplateParser.OUTER_TOKEN_LITERAL, self._consumer.substr(begin, end), begin_line, \ 1655 | begin_row 1656 | self._consumer.read() # { 1657 | self._consumer.read() # % 1658 | begin_line = self._consumer.get_line() 1659 | begin_row = self._consumer.get_row() 1660 | chars = [] 1661 | while True: 1662 | ch = 
self._consumer.read() 1663 | if ch == '\0': 1664 | raise ParseError("Unexpected eof", self._consumer.get_line(), self._consumer.get_row()) 1665 | elif ch == '%': 1666 | if self._consumer.peek() == '}': # '%}' 1667 | self._consumer.read() 1668 | return TemplateParser.OUTER_TOKEN_EXPRESS, "".join(chars), begin_line, begin_row 1669 | elif self._consumer.peek() == '%': # '%%' -> '%' 1670 | self._consumer.read() 1671 | chars.append(ch) 1672 | self._consumer.read() 1673 | ch = self._consumer.peek() 1674 | end = self._consumer.get_pos() 1675 | return TemplateParser.OUTER_TOKEN_LITERAL, self._consumer.substr(begin, end), begin_line, begin_row 1676 | 1677 | @staticmethod 1678 | def _trim_empty_line(result): 1679 | state = 0 1680 | left = None # 需要剔除右边的元素 1681 | for i in range(0, len(result)): 1682 | cur = result[i] 1683 | p = result[i - 1] if i != 0 else None 1684 | n = result[i + 1] if i != len(result) - 1 else None 1685 | if state == 0: 1686 | # 当前是表达式,且上一个是文本 1687 | if cur[0] == TemplateParser.OUTER_TOKEN_EXPRESS: 1688 | if p is None or (p[0] == TemplateParser.OUTER_TOKEN_LITERAL and 1689 | TemplateParser._is_ending_by_new_line(p[1])): 1690 | left = i - 1 if p else None 1691 | state = 1 1692 | if state == 1: 1693 | if n is None or (n[0] == TemplateParser.OUTER_TOKEN_LITERAL and 1694 | TemplateParser._is_starting_by_new_line(n[1])): 1695 | right = i + 1 if n else None 1696 | if left is not None: 1697 | result[left] = (result[left][0], 1698 | TemplateParser._trim_right_until_new_line(result[left][1]), 1699 | result[left][2], 1700 | result[left][3]) 1701 | if right is not None: 1702 | result[right] = (result[right][0], 1703 | TemplateParser._trim_left_until_new_line(result[right][1]), 1704 | result[right][2], 1705 | result[right][3]) 1706 | state = 0 1707 | elif cur[0] != TemplateParser.OUTER_TOKEN_EXPRESS: # 行中有其他文本,不进行剔除 1708 | state = 0 1709 | 1710 | def process(self): 1711 | root = [] # 根 1712 | nodes = [] # 未闭合节点队列 1713 | outer_results = [] 1714 | while True: # 
为了剔除空行,需要先解析完所有的根元素做预处理 1715 | ret = self._parse_outer() 1716 | if ret[0] == TemplateParser.OUTER_TOKEN_LITERAL and ret[1] == "": # EOF 1717 | break 1718 | outer_results.append(ret) 1719 | TemplateParser._trim_empty_line(outer_results) 1720 | for i in outer_results: 1721 | (t, content, line, row) = i 1722 | back = None if len(nodes) == 0 else nodes[len(nodes) - 1] 1723 | if t == TemplateParser.OUTER_TOKEN_LITERAL: 1724 | root.append(content) if back is None else back.nodes.append(content) 1725 | else: 1726 | assert t == TemplateParser.OUTER_TOKEN_EXPRESS 1727 | (operator, expression, identifier) = self._parse_inner(content, line, row) 1728 | if operator == "for": 1729 | node = TemplateForNode(back, identifier, expression) 1730 | root.append(node) if back is None else back.nodes.append(node) 1731 | nodes.append(node) 1732 | elif operator == "if": 1733 | node = TemplateIfNode(back, expression) 1734 | root.append(node) if back is None else back.nodes.append(node) 1735 | nodes.append(node) 1736 | elif operator == "else": 1737 | if not isinstance(back, TemplateIfNode): 1738 | raise ParseError("Unexpected else branch", line, row) 1739 | node = TemplateIfElseNode(back.parent, back) 1740 | # 从root或者父节点中删除back 1741 | if back.parent is None: 1742 | assert root[len(root) - 1] == back 1743 | root.pop() 1744 | root.append(node) 1745 | else: 1746 | parent_nodes = back.parent.nodes 1747 | assert parent_nodes[len(parent_nodes) - 1] == back 1748 | parent_nodes.pop() 1749 | parent_nodes.append(node) 1750 | # 升级并取代 1751 | nodes.pop() 1752 | nodes.append(node) 1753 | elif operator == "elif": 1754 | if not isinstance(back, TemplateIfNode): 1755 | raise ParseError("Unexpected elif branch", line, row) 1756 | closed_else = TemplateIfElseNode(back.parent, back) 1757 | # 从root或者父节点中删除back 1758 | if back.parent is None: 1759 | assert root[len(root) - 1] == back 1760 | root.pop() 1761 | root.append(closed_else) 1762 | else: 1763 | parent_nodes = back.parent.nodes 1764 | assert 
parent_nodes[len(parent_nodes) - 1] == back 1765 | parent_nodes.pop() 1766 | parent_nodes.append(closed_else) 1767 | node = TemplateIfNode(closed_else, expression) 1768 | closed_else.nodes.append(node) 1769 | # 取代 1770 | nodes.pop() 1771 | nodes.append(node) 1772 | elif operator == "end": 1773 | if back is None: 1774 | raise ParseError("Unexpected block end", line, row) 1775 | nodes.pop() # 完成一个节点 1776 | else: 1777 | assert operator == "" 1778 | node = TemplateExpressionNode(back, expression) 1779 | root.append(node) if back is None else back.nodes.append(node) 1780 | if len(nodes) != 0: 1781 | raise ParseError("Unclosed block", self._consumer.get_line(), self._consumer.get_row()) 1782 | return root 1783 | 1784 | 1785 | def render_template(template, **context): 1786 | p = TemplateParser(template) 1787 | root = p.process() 1788 | output = [] 1789 | stack = [iter(root)] 1790 | while stack: 1791 | node = stack.pop() 1792 | if isinstance(node, str): 1793 | output.append(node) 1794 | elif isinstance(node, TemplateExpressionNode): 1795 | output.append(str(node.render(context))) 1796 | elif isinstance(node, TemplateNode): 1797 | stack.append(node.render(context)) 1798 | else: 1799 | new_node = next(node, None) 1800 | if new_node is not None: 1801 | stack.append(node) 1802 | stack.append(new_node) 1803 | return "".join(output) 1804 | 1805 | 1806 | # ---------------------------------------- 代码生成 ---------------------------------------- 1807 | 1808 | 1809 | def generate_code(header_template: str, source_template: str, analyzer: GrammarAnalyzer, header_filename: str): 1810 | # 对所有符号进行整理,下标即最终的符号ID 1811 | symbols = [kEofSymbol] 1812 | tmp = list(analyzer.document().terminals()) 1813 | tmp.sort(key=lambda s: s.id()) 1814 | symbols.extend(tmp) 1815 | token_cnt = len(symbols) 1816 | tmp = list(analyzer.document().non_terminals()) 1817 | tmp.sort(key=lambda s: s.id()) 1818 | symbols.extend(tmp) 1819 | 1820 | # 生成token信息 1821 | token_info = [] 1822 | for i in range(0, token_cnt): 
def _collect_variant_types(symbols):
    # Collect the distinct C++ value types (``replace()`` strings) declared on
    # the given symbols, sorted for deterministic output.  ``std::monostate``
    # is prepended when any symbol carries no type -- or when no type exists
    # at all -- so the generated std::variant is always well-formed.
    types = []
    need_monostate = False
    for s in symbols:
        replace = s.replace()
        if replace is None:
            need_monostate = True
        else:
            assert replace.strip() == replace
            assert replace != "std::monostate"
            if replace not in types:
                types.append(replace)
    types.sort()
    if need_monostate or len(types) == 0:
        types.insert(0, "std::monostate")
    return types


def generate_code(header_template: str, source_template: str, analyzer: GrammarAnalyzer, header_filename: str):
    """Render the C++ header and source for ``analyzer``'s grammar.

    :param header_template: template text for the generated header file
    :param source_template: template text for the generated source file
    :param analyzer: processed grammar analyzer providing symbols and actions
    :param header_filename: filename emitted into the source's ``#include``
    :return: tuple ``(header_src, source_src)`` of rendered file contents
    """
    # Arrange all symbols: EOF first, then terminals and non-terminals, each
    # group sorted by id.  The index into this list becomes the final symbol
    # ID used by the generated tables.
    symbols = [kEofSymbol]
    symbols.extend(sorted(analyzer.document().terminals(), key=lambda s: s.id()))
    token_cnt = len(symbols)
    symbols.extend(sorted(analyzer.document().non_terminals(), key=lambda s: s.id()))

    # Token descriptors for the generated TokenTypes enum; EOF is exposed to
    # C++ under the reserved name "_".
    token_info = []
    for i in range(token_cnt):
        assert symbols[i].type() in (SYMBOL_TERMINAL, SYMBOL_EOF)
        token_info.append({
            "id": i,
            "c_name": "_" if symbols[i] == kEofSymbol else symbols[i].id(),
            "raw": symbols[i],
        })

    # Map each symbol object back to its final ID.
    symbol_mapping = {s: i for i, s in enumerate(symbols)}

    # Production descriptors: left-hand symbol ID and body length.
    production_info = []
    for i, p in enumerate(analyzer.document().productions()):
        assert i == p.index()
        production_info.append({
            "symbol": symbol_mapping[p.left()],
            "count": len(p),
            "raw": p,
        })

    # Build the action table.  States with no action at all are dropped and
    # the remaining states renumbered densely; the two maps translate between
    # original state IDs and the compacted IDs.
    state_remap_id_to_state_id = {}
    state_id_to_state_remap_id = {}
    offset = 0
    for i in range(analyzer.max_state() + 1):
        empty_state = (i not in analyzer.actions()[kEofSymbol] and
                       not any(i in analyzer.actions()[s] for s in analyzer.document().symbols()))
        if empty_state:
            offset += 1
        else:
            assert i not in state_id_to_state_remap_id
            assert (i - offset) not in state_remap_id_to_state_id
            state_id_to_state_remap_id[i] = i - offset
            state_remap_id_to_state_id[i - offset] = i
    state_cnt = len(state_remap_id_to_state_id)

    actions = []
    for i in range(state_cnt):
        state = state_remap_id_to_state_id[i]  # invariant per row, hoisted
        action = []
        for s in symbols:
            one_action = [0, 0]  # [action kind, argument]
            if state in analyzer.actions()[s]:
                act = analyzer.actions()[s][state]
                one_action[0] = act.action()
                if act.action() == ACTION_GOTO:
                    # GOTO targets are expressed in compacted state IDs.
                    one_action[1] = state_id_to_state_remap_id[act.arg().state()]
                elif act.action() == ACTION_REDUCE:
                    assert analyzer.document().productions()[act.arg().index()] == act.arg()
                    one_action[1] = act.arg().index()
            action.append(one_action)
        actions.append(action)

    # C++ std::variant member types for tokens and productions.
    token_types = _collect_variant_types(analyzer.document().terminals())
    production_types = _collect_variant_types(analyzer.document().non_terminals())

    # generate the context
    args = analyzer.document().generator_args() or {}
    context = {
        "namespace": args.get("namespace", None),
        "class_name": args.get("class_name", "Parser"),
        "includes": args.get("includes", []),
        "symbols": symbols,
        "token_info": token_info,
        "token_types": token_types,
        "production_info": production_info,
        "production_types": production_types,
        "actions": actions,
        "header_filename": header_filename,
    }

    header_src = render_template(header_template, **context)
    source_src = render_template(source_template, **context)
    return header_src, source_src
# ---------------------------------------- Main ----------------------------------------


# Default template for the generated C++ header (overridable via
# --header-template).  The string is runtime data consumed by
# render_template(); its contents are emitted verbatim into user files.
CPP_HEADER_TPL = """/**
 * @file
 * @date {% datetime.date.today() %}
 *
 * Auto generated code by 9chu/parser_gen.
 */
#pragma once
#include 
#include 
#include 

{% for f in includes %}
#include "{% f %}"
{% end %}

{% if namespace is None %}
// namespace {
{% else %}
namespace {% namespace %}
{
{% end %}
class {% class_name %}
{
public:
    enum class ParseResult
    {
        Undecided = 0,
        Accepted = 1,
        Rejected = 2,
    };

    enum class TokenTypes
    {
        {% for t in token_info %}
        {% t["c_name"] %} = {% t["id"] %},
        {% end %}
    };

    using TokenValues = std::variant<
        {% for i in range(0, len(token_types)) %}
        {% token_types[i] %}{% if i != len(token_types) - 1 %},{% end %}
        {% end %}
    >;

    using ProductionValues = std::variant<
        {% for i in range(0, len(production_types)) %}
        {% production_types[i] %}{% if i != len(production_types) - 1 %},{% end %}
        {% end %}
    >;

    using UnionValues = std::variant;

public:
    {% class_name %}();

public:
    ParseResult operator()(TokenTypes token, const TokenValues& value);
    void Reset()noexcept;

    {% if production_info[0]["raw"].left().replace() is not None %}
    const {% production_info[0]["raw"].left().replace() %}& Result()const noexcept { return m_stResult; }
    {% production_info[0]["raw"].left().replace() %}& Result()noexcept { return m_stResult; }
    {% end %}

private:
    std::vector m_stStack;
    std::vector m_stValueStack;

    {% if production_info[0]["raw"].left().replace() is not None %}
    {% production_info[0]["raw"].left().replace() %} m_stResult {};
    {% end %}
};
{% if namespace is None %}
// }
{% else %}
}
{% end %}
"""

# Default template for the generated C++ source (overridable via
# --source-template).  Also runtime data -- do not edit casually.
CPP_SOURCE_TPL = """/**
 * @file
 * @date {% datetime.date.today() %}
 *
 * Auto generated code by 9chu/parser_gen.
 */
#include "{% header_filename %}"

#include 

{% if namespace is not None %}
using namespace {% namespace %};
{% end %}

#define ACTION_ERROR 0
#define ACTION_ACCEPT 1
#define ACTION_GOTO 2
#define ACTION_REDUCE 3

namespace {
{% for idx in range(0, len(production_info)) %}
{% class_name %}::ProductionValues Reduce{% idx %}(std::vector<{% class_name %}::UnionValues>& stack_)
{
    // binding values
    assert(stack_.size() >= {% len(production_info[idx]["raw"]) %});
    {% for pos in production_info[idx]["raw"].binding() %}
    auto {% production_info[idx]["raw"].binding()[pos] %} =
        {% if production_info[idx]["raw"][pos].type() == 2 %}
        std::move(std::get<{% production_info[idx]["raw"][pos].replace() %}>(
            std::get<{% class_name %}::ProductionValues>(
                std::move(stack_[stack_.size() - {% len(production_info[idx]["raw"]) - pos %}])
            )
        ));
        {% else %}
        std::move(std::get<{% production_info[idx]["raw"][pos].replace() %}>(
            std::get<{% class_name %}::TokenValues>(
                std::move(stack_[stack_.size() - {% len(production_info[idx]["raw"]) - pos %}])
            )
        ));{% end %}
    {% end %}

    // user code
    {% if production_info[idx]["raw"].left().replace() is not None %}
    auto ret = [&]() {
        {% production_info[idx]["raw"].replace().strip() %}
    }();
    return {% class_name %}::ProductionValues { std::move(ret) };
    {% else %}
    {% if production_info[idx]["raw"].replace() is not None %}
    {% production_info[idx]["raw"].replace() %}
    {% end %}
    return {% class_name %}::ProductionValues {};
    {% end %}
}

{% end %}
}

using ReduceFunction = {% class_name %}::ProductionValues(*)(std::vector<{% class_name %}::UnionValues>&);

struct ProductionInfo
{
    uint32_t NonTerminal;
    uint32_t SymbolCount;
    ReduceFunction Callback;
};

struct ActionInfo
{
    uint8_t Action;
    uint32_t Arg;
};

static const ProductionInfo kProductions[{% len(production_info) %}] = {
    {% for i in range(0, len(production_info)) %}
    { {% production_info[i]["symbol"] %}, {% production_info[i]["count"] %}, ::Reduce{% i %} },
    {% end %}
};

static const ActionInfo kActions[{% len(actions) %}][{% len(symbols) %}] = {
    {% for action in actions %}
    { {% for act in action %}{ {% act[0] %}, {% act[1] %} },{% end %} },
    {% end %}
};

{% class_name %}::{% class_name %}()
{
    Reset();
}

{% class_name %}::ParseResult {% class_name %}::operator()(TokenTypes token, const TokenValues& value)
{
    while (true)
    {
        assert(!m_stStack.empty());
        assert(static_cast(token) < {% len(token_info) %});

        const ActionInfo& act = kActions[m_stStack.back()][static_cast(token)];
        if (act.Action == ACTION_ACCEPT)
        {
            {% if production_info[0]["raw"].left().replace() is not None %}
            // store the result
            assert(!m_stValueStack.empty());
            m_stResult =
                std::move(std::get<{% production_info[0]["raw"].left().replace() %}>(
                    std::get(std::move(m_stValueStack.back()))
                ));
            {% end %}

            Reset();
            return ParseResult::Accepted;
        }
        else if (act.Action == ACTION_ERROR)
        {
            Reset();
            return ParseResult::Rejected;
        }
        else if (act.Action == ACTION_GOTO)
        {
            m_stStack.push_back(static_cast(token));
            m_stStack.push_back(act.Arg);
            assert(m_stStack.back() < {% len(actions) %});

            m_stValueStack.push_back(value);
        }
        else
        {
            assert(act.Action == ACTION_REDUCE);
            assert(act.Arg < {% len(production_info) %});

            const ProductionInfo& info = kProductions[act.Arg];
            auto val = info.Callback(m_stValueStack);

            assert(m_stStack.size() >= info.SymbolCount * 2);
            m_stStack.resize(m_stStack.size() - info.SymbolCount * 2);

            assert(m_stValueStack.size() >= info.SymbolCount);
            m_stValueStack.resize(m_stValueStack.size() - info.SymbolCount);

            m_stValueStack.emplace_back(std::move(val));
            assert(!m_stStack.empty());

            const ActionInfo& act2 = kActions[m_stStack.back()][info.NonTerminal];
            if (act2.Action == ACTION_GOTO)
            {
                m_stStack.push_back(info.NonTerminal);
                m_stStack.push_back(act2.Arg);
            }
            else
            {
                assert(false);
                Reset();
                return ParseResult::Rejected;
            }

            continue;
        }
        break;
    }

    return ParseResult::Undecided;
}

void {% class_name %}::Reset()noexcept
{
    m_stStack.clear();
    m_stValueStack.clear();

    // initial state
    m_stStack.push_back(0);
}
"""


def main():
    """CLI entry point: parse the grammar file and write the C++ header/source."""
    parser = argparse.ArgumentParser(description="A LR(1)/LALR(1) parser generator for C++17.")
    parser.add_argument("--header-file", type=str, help="Output header filename", default="Parser.hpp")
    parser.add_argument("--source-file", type=str, help="Output source filename", default="Parser.cpp")
    parser.add_argument("-o", "--output-dir", type=str, help="Output directory", default="./")
    parser.add_argument("--header-template", type=str, help="User defined header template", default="")
    parser.add_argument("--source-template", type=str, help="User defined source template", default="")
    parser.add_argument("--lalr", action="store_true", help="Set to LALR(1) mode", default=False)
    parser.add_argument("--print-actions", action="store_true", help="Print action table", default=False)
    parser.add_argument("grammar", help="Grammar filename")
    args = parser.parse_args()

    # Parse the grammar description and run the LR(1)/LALR(1) analysis.
    doc = GrammarDocument()
    doc.parse(args.grammar)

    analyzer = GrammarAnalyzer(doc)
    analyzer.process(GRAMMAR_MODE_LALR if args.lalr else GRAMMAR_MODE_LR1)

    if args.print_actions:
        print(analyzer.printable_actions())

    # Conflict-resolution statistics go to stderr so stdout stays clean for
    # --print-actions output.
    resolve_rr_cnt, resolve_sr_by_prec_cnt, resolve_sr_by_shift_cnt = analyzer.resolve_stat()
    sys.stderr.write(f"Reduce/Reduce conflict resolved count: {resolve_rr_cnt}\n")
    sys.stderr.write(f"Shift/Reduce conflict resolved count (by Operator Precedence): {resolve_sr_by_prec_cnt}\n")
    sys.stderr.write(f"Shift/Reduce conflict resolved count (by Shift Priority): {resolve_sr_by_shift_cnt}\n")

    # User-supplied templates override the built-in defaults.
    header_tpl_content = CPP_HEADER_TPL
    source_tpl_content = CPP_SOURCE_TPL
    if args.header_template != "":
        with open(args.header_template, "r", encoding="utf-8") as f:
            header_tpl_content = f.read()
    if args.source_template != "":
        with open(args.source_template, "r", encoding="utf-8") as f:
            source_tpl_content = f.read()
    header_output, source_output = generate_code(header_tpl_content, source_tpl_content, analyzer, args.header_file)
    with open(os.path.join(args.output_dir, args.header_file), "w", encoding="utf-8") as f:
        f.write(header_output)
    with open(os.path.join(args.output_dir, args.source_file), "w", encoding="utf-8") as f:
        f.write(source_output)


if __name__ == "__main__":
    main()