├── tests ├── __init__.py ├── tests_error_handling │ ├── __init__.py │ └── test_language_errors.py ├── run.bat ├── test_trivial.py ├── test_church.py ├── test_non_lalr.py ├── test_paren.py ├── test_verbose_reader.py ├── test_first.py ├── test_with_reader.py ├── test_resolve.py ├── test_lrval_dumpsloads.py ├── test_grammar.py ├── test_generalized.py ├── test_conflict.py ├── test_prece.py ├── test_arith.py ├── test_pystruct.py ├── test_courses.py ├── test_basic.py └── sexp_dump.py ├── MANIFEST ├── examples ├── preamble.py ├── eg_online.py ├── eg_ambig.py ├── eg_direct.py ├── eg_precedence.py ├── eg_dangling.py ├── eg_dumps.py ├── eg_logic.py ├── eg_demo_py2.py ├── eg_lisp.py ├── eg_demo_py3.py ├── eg_pystructs.py ├── eg_read_ebnf.py ├── eg_read_yacc.py ├── eg_demo.py ├── eg_demo_dump.py ├── eg_dumps_file.py ├── eg_func_lang.py ├── sexp_dump.py └── eg_dumps_direct_use.py ├── experiments ├── preamble.py ├── frontend_styles.py ├── only_syntax.py ├── LL.py ├── peg.py ├── meta_dumps.py └── meta_dumps_standalone.py ├── .gitignore ├── setup.py ├── .gitattributes ├── README.md └── metaparse.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/tests_error_handling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/run.bat: -------------------------------------------------------------------------------- 1 | python -m unittest discover . -v 2 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | metaparse.py 3 | setup.py 4 | -------------------------------------------------------------------------------- /examples/preamble.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # Include parent path for testing 5 | sys.path.append( 6 | os.path.abspath( 7 | os.path.join( 8 | os.path.dirname(__file__), 9 | os.pardir 10 | ))) 11 | -------------------------------------------------------------------------------- /experiments/preamble.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # Include parent path for testing 5 | sys.path.append( 6 | os.path.abspath( 7 | os.path.join( 8 | os.path.dirname(__file__), 9 | os.pardir 10 | ))) 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.sqlite 3 | *.stats 4 | *.html 5 | *.py# 6 | *.cache 7 | *_raw.py 8 | .ipython 9 | metaparse.egg-info 10 | dist 11 | *.ipython 12 | *.python_history 13 | __pycache__ 14 | *.grip 15 | 16 | venv 17 | .coverage 18 | coverage_html 19 | 20 | experiments* -------------------------------------------------------------------------------- /examples/eg_online.py: -------------------------------------------------------------------------------- 1 | from eg_demo import * 2 | 3 | # Prepare a parsing routine 4 | p = pCalc.prepare() 5 | 6 | # Start this routine 7 | next(p) 8 | 9 | # Send tokens one-by-one 10 | for token in pCalc.lexer.tokenize('bar = 1 + 2 + + 3', with_end=True): 11 | print("Sends: ", token) 12 | r = p.send(token) 
13 | print("Got: ", r) 14 | print() 15 | 16 | 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from distutils.core import setup 4 | 5 | import metaparse 6 | 7 | setup(name='metaparse', 8 | version='0.1', 9 | description='A tool for powerful instant parsing supported by optional algorithms.', 10 | author='Xuelei Li', 11 | author_email='lixuelei86@gmail.com', 12 | url='https://github.com/Shellay/metaparse', 13 | py_modules=['metaparse'], 14 | ) 15 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /tests/test_trivial.py: -------------------------------------------------------------------------------- 1 | from metaparse import * 2 | from pprint import pprint 3 | 4 | class S(metaclass=LALR.meta): 5 | # class S(metaclass=GLR.meta): 6 | a, b, c = 'abc' 7 | def S(A, B, C): return (A, *B, C) 8 | def A(a): return a 9 | def A(): return () 10 | def B(): return () 11 | def B(B, b): return B + (b,) 12 | def C(c): return c 13 | 14 | 15 | # pprint([*p.lexer.tokenize('abbbc', True)]) 16 | 17 | from unittest import main, TestCase 18 | 19 | class Test(TestCase): 20 | def test(self): 21 | r = S.interpret('abbbbc') 22 | self.assertEqual(r, ('a', 'b', 'b', 'b', 'b', 'c')) 23 | 24 | 25 | # pprint(p.__dict__) 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /tests/test_church.py: -------------------------------------------------------------------------------- 1 | from metaparse import LALR 2 | 3 | class LangChurch(metaclass=LALR.meta): 4 | 5 | """ 6 | Grammar for interpreting Church-Numerals. 7 | """ 8 | 9 | ZERO = r'zero' 10 | SUCC = r'succ' 11 | 12 | def num(ZERO): 13 | return 0 14 | 15 | def num(SUCC, num): 16 | return num + 1 17 | 18 | 19 | import unittest 20 | class Test(unittest.TestCase): 21 | def test_church(self): 22 | self.assertEqual(LangChurch.interpret('zero') , 0) 23 | self.assertEqual(LangChurch.interpret('succ zero') , 1) 24 | self.assertEqual(LangChurch.interpret('succ succ zero') , 2) 25 | self.assertEqual(LangChurch.interpret('succ succ succ zero') , 3) 26 | 27 | if __name__ == '__main__': 28 | unittest.main() 29 | -------------------------------------------------------------------------------- /tests/test_non_lalr.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from metaparse import * 3 | 4 | 5 | class TestConflicts(unittest.TestCase): 6 | 7 | def test_conflicts(self): 8 | 9 | with self.assertRaises(LanguageError): 10 | 11 | class G(metaclass=LALR.meta): 12 | 'A Grammar.meta which is LR(1) but not LALR(1).' 
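            # Why this grammar is LR(1) but not LALR(1) (explanatory note, not
            # part of the original file): the canonical LR(1) automaton keeps
            # separate states for the item sets reached via 'a' and via 'b', so
            # the reduction A -> c . has lookahead {d} in one state and {e} in
            # the other (and symmetrically for B -> c .). LALR(1) merges those
            # states, both reductions end up with lookaheads {d, e}, and a
            # reduce/reduce conflict appears; that conflict is the LanguageError
            # this test expects LALR.meta to raise.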
13 | 14 | a = r'a' 15 | b = r'b' 16 | c = r'c' 17 | d = r'd' 18 | e = r'e' 19 | 20 | def S(a, A, d): return 21 | def S(b, B, d): return 22 | def S(a, B, e): return 23 | def S(b, A, e): return 24 | 25 | def A(c): return c 26 | def B(c): return c 27 | 28 | 29 | if __name__ == '__main__': 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /tests/test_paren.py: -------------------------------------------------------------------------------- 1 | from metaparse import * 2 | 3 | class LangParen(metaclass=LALR.meta): 4 | 5 | """ 6 | Grammar for matching arbitrary paired parenthesises. 7 | """ 8 | 9 | END = r'\$' 10 | LEFT = r'\(' 11 | RIGHT = r'\)' 12 | 13 | 14 | def top(pair): 15 | return pair 16 | 17 | 18 | def pair(LEFT, pair_1, RIGHT, pair_2): 19 | return '<' + pair_1 + '>' + pair_2 20 | 21 | 22 | def pair(): 23 | return '' 24 | 25 | 26 | from unittest import main, TestCase 27 | 28 | class Test(TestCase): 29 | 30 | def test_paren(self): 31 | 32 | assert LangParen.interpret('()') == '<>' 33 | assert LangParen.interpret('( ( ) )') == '<<>>' 34 | assert LangParen.interpret('( ( ) ) ( )') == '<<>><>' 35 | 36 | if __name__ == '__main__': 37 | # import pprint as pp 38 | # s = LangParen.parse('( ( ) ) ( )') 39 | # pp.pprint(s) 40 | main() 41 | -------------------------------------------------------------------------------- /examples/eg_ambig.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import * 3 | 4 | 5 | class pExpr(metaclass=GLR.meta): 6 | 7 | 'An ambigious grammar for arithmetic expressions.' 8 | 9 | def plus(lex: r'\+'): 10 | return lex 11 | 12 | def times(lex: r'\*'): 13 | return lex 14 | 15 | def number(lex: r'\d+'): 16 | return int(lex) 17 | 18 | 19 | def expr(expr, plus, expr_1): 20 | return expr + expr_1 21 | 22 | def expr(expr, times, expr_1): 23 | return expr * expr_1 24 | 25 | def expr(number): 26 | return number 27 | 28 | 29 | inp = '2 + 1 * 3' 30 | 31 | tks = list(pExpr.lexer.tokenize(inp)) 32 | 33 | from pprint import pprint 34 | 35 | pprint(tks) 36 | 37 | r = pExpr.prepare_generalized() 38 | next(r) 39 | 40 | for tk in tks: 41 | x = r.send(tk) 42 | pprint(x) 43 | else: 44 | x = r.send(END_TOKEN) 45 | pprint(x) 46 | 47 | # Keep sending further tokens! 48 | tks = list(pExpr.lexer.tokenize(' + + 1')) 49 | for tk in tks: 50 | rs = r.send(tk) 51 | for e in rs: 52 | if isinstance(e, ParseError): 53 | pprint(e.args) 54 | else: break 55 | else: 56 | pprint(rs) 57 | else: 58 | x = r.send(END_TOKEN) 59 | pprint(x) 60 | 61 | -------------------------------------------------------------------------------- /examples/eg_direct.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import LALR 3 | 4 | pCalc = LALR() 5 | 6 | lex = pCalc.lexer 7 | rule = pCalc.rule 8 | 9 | # lex( = ) 10 | lex(IGNORED = r'\s+') 11 | lex(NUM = r'[0-9]+') 12 | lex(EQ = r'=') 13 | lex(ID = r'[_a-zA-Z]\w*') 14 | 15 | # lex(... , p = ) 16 | lex(POW = r'\*\*', p=3) 17 | lex(POW = r'\^') # No need to give the precedence twice for POW. 
18 | lex(MUL = r'\*' , p=2) 19 | lex(ADD = r'\+' , p=1) 20 | 21 | # @rule 22 | # def ( ): 23 | # 24 | @rule 25 | def assign(ID, EQ, expr): 26 | context[ID] = expr 27 | return expr 28 | 29 | @rule 30 | def expr(ID): 31 | return context[ID] 32 | 33 | @rule 34 | def expr(NUM): 35 | return int(NUM) 36 | 37 | @rule 38 | def expr(expr_1, ADD, expr_2): 39 | return expr_1 + expr_2 40 | 41 | @rule 42 | def expr(expr, MUL, expr_1): 43 | return expr * expr_1 44 | 45 | @rule 46 | def expr(expr, POW, expr_1): 47 | return expr ** expr_1 48 | 49 | # Complete making the parser after collecting things! 50 | pCalc.make() 51 | 52 | context = {} 53 | pCalc.interpret("x = 3") 54 | pCalc.interpret("y = x ^ 2") 55 | pCalc.interpret("z = x + y + 1") 56 | 57 | from pprint import pprint 58 | print(context) 59 | print(pCalc.precedence) 60 | -------------------------------------------------------------------------------- /examples/eg_precedence.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import LALR 3 | 4 | class E(metaclass=LALR.meta): 5 | 6 | num = r'\d+' 7 | 8 | pow = r'\*\*', 3 # r'\*\*' is matched before r'\*' 9 | 10 | mul = r'\*', 2 11 | div = r'\/', 2 12 | 13 | add = r'\+', 1 14 | mns = r'-', 1 15 | 16 | l = r'\(' 17 | r = r'\)' 18 | 19 | def E(E, add, E_1): 20 | return '({} + {})'.format(E, E_1) 21 | def E(E, mns, E_1): 22 | return '({} - {})'.format(E, E_1) 23 | def E(E, mul, E_1): 24 | return '({} * {})'.format(E, E_1) 25 | def E(E, div, E_1): 26 | return '({} / {})'.format(E, E_1) 27 | def E(E, pow, E_1): 28 | return '({} ** {})'.format(E, E_1) 29 | def E(num): 30 | return num 31 | def E(l, E, r): 32 | return E 33 | 34 | 35 | import pprint as pp 36 | 37 | # pp.pprint(E.parse_many('3 + 2 * 7')) 38 | # pp.pprint(E.parse_many('3 + 2 * 7 + 1')) 39 | # pp.pprint(E.interpret_many('3 + 2 * 7 + 1')) 40 | 41 | print(E) 42 | pp.pprint(E.precedence) 43 | psr = (E) 44 | # print(psr.table.__len__()) 45 | # pp.pprint([*zip(psr.Ks, psr.ACTION)]) 46 | 47 | # print(psr.interpret('3 + 2 * 7')) 48 | # print(psr.interpret('3 * 2 + 7')) 49 | print(psr.interpret('3 + 2 * 7 / 5 - 1')) 50 | print(psr.interpret('3 + 2 * 7 ** 2 * 5')) 51 | -------------------------------------------------------------------------------- /examples/eg_dangling.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import LALR, GLR 3 | 4 | class pIfThenElse(metaclass=GLR.meta): 5 | 6 | IF = r'if' 7 | THEN = r'then' 8 | ELSE = r'else' 9 | EXPR = r'\d+' 10 | SINGLE = r'[_a-zA-Z]+' 11 | 12 | def stmt(ifstmt): 13 | return ifstmt 14 | 15 | def stmt(SINGLE): 16 | return SINGLE 17 | 18 | def ifstmt(IF, EXPR, THEN, stmt_1, ELSE, stmt_2): 19 | return ('ite', EXPR, stmt_1, stmt_2) 20 | 21 | def ifstmt(IF, EXPR, THEN, stmt): 22 | return ('it', EXPR, stmt) 23 | 24 | from pprint import pprint 25 | 26 | res = pIfThenElse.interpret_generalized('if 1 then if 2 then if 3 then a else b else c') 27 | pprint(res) 28 | 29 | 30 | 31 | class pIfThenElse(metaclass=LALR.meta): 32 | 33 | IF = r'if' 34 | THEN = r'then', 1 35 | ELSE = r'else', 2 36 | EXPR = r'\d+' 37 | SINGLE = r'[_a-zA-Z]+' 38 | 39 | def stmt(ifstmt): 40 | return ifstmt 41 | 42 | def stmt(SINGLE): 43 | return SINGLE 44 | 45 | def ifstmt(IF, EXPR, THEN, stmt_1, ELSE, stmt_2): 46 | return ('ite', EXPR, stmt_1, stmt_2) 47 | 48 | def ifstmt(IF, EXPR, THEN, stmt): 49 | return ('it', EXPR, stmt) 50 | 51 | res = pIfThenElse.interpret('if 1 then if 2 then if 3 then a else b 
else c') 52 | pprint(res) 53 | -------------------------------------------------------------------------------- /tests/test_verbose_reader.py: -------------------------------------------------------------------------------- 1 | from metaparse import LALR 2 | 3 | @LALR.verbose 4 | def calc(lex, rule): 5 | 6 | lex(IGNORED = r'\s+') 7 | 8 | @lex(NUM = r'[0-9]+') 9 | def NUM(val): 10 | return int(val) 11 | 12 | lex(EQ = r'=') 13 | lex(ID = r'[_a-zA-Z]\w*') 14 | 15 | lex(POW = r'\*\*', p = 3) 16 | lex(MUL = r'\*', p = 2) 17 | lex(ADD = r'\+', p = 1) 18 | lex(SUB = r'\-', p = 1) 19 | 20 | @rule 21 | def assign(ID, EQ, expr): 22 | table[ID] = expr 23 | return expr 24 | 25 | @rule 26 | def expr(ID): 27 | return table[ID] 28 | 29 | @rule 30 | def expr(NUM): 31 | return int(NUM) 32 | 33 | @rule 34 | def expr(expr_1, ADD, expr_2): 35 | return expr_1 + expr_2 36 | 37 | @rule 38 | def expr(expr_1, SUB, expr_2): 39 | return expr_1 - expr_2 40 | 41 | @rule 42 | def expr(expr, MUL, expr_1): 43 | return expr * expr_1 44 | 45 | @rule 46 | def expr(expr, POW, expr_1): 47 | return expr ** expr_1 48 | 49 | 50 | from pprint import pprint 51 | # pprint(lex) 52 | # pprint(rule) 53 | 54 | # 55 | table = {} 56 | 57 | calc.interpret('x = 8') 58 | calc.interpret('y = x - 6 ') 59 | calc.interpret('z = x ** y ') 60 | 61 | 62 | import unittest 63 | 64 | class Test(unittest.TestCase): 65 | 66 | def test(self): 67 | self.assertEqual(table, dict(x=8, y=2, z=64)) 68 | 69 | 70 | if __name__ == '__main__': 71 | unittest.main() 72 | 73 | -------------------------------------------------------------------------------- /tests/test_first.py: -------------------------------------------------------------------------------- 1 | from metaparse import * 2 | 3 | 4 | if __name__ == '__main__': 5 | 6 | rs = ([ 7 | Rule('S', ('A', 'B', 'C')), 8 | Rule('S', ('D',)), 9 | Rule('A', ('a', 'A')), 10 | Rule('A', ()), 11 | Rule('B', ('B', 'b')), 12 | Rule('B', ()), 13 | Rule('C', ('c',)), 14 | Rule('C', ('D',)), 15 | Rule('D', ('d', 'D')), 16 | Rule('D', ('E',)), 17 | Rule('E', ('D',)), 18 | Rule('E', ('B',)), 19 | ]) 20 | g = Grammar(rs) 21 | 22 | rs1 = [ 23 | Rule('expr', ['expr', '+', 'term']), 24 | Rule('expr', ['term']), 25 | Rule('term', ['term', '*', 'factor']), 26 | Rule('term', ['factor']), 27 | Rule('factor', ['ID']), 28 | Rule('factor', ['(', 'expr', ')']), 29 | ] 30 | e = Grammar(rs1) 31 | 32 | import unittest 33 | 34 | class TestGrammar(unittest.TestCase): 35 | 36 | def test_first_0(self): 37 | self.assertEqual(g.FIRST['S'], {'a', 'b', 'c', 'd', 'EPSILON'}) 38 | self.assertEqual(g.FIRST['E'], {'b', 'd', 'EPSILON'}) 39 | 40 | def test_first_1(self): 41 | self.assertEqual(e.FIRST['expr'], {'ID', '('}) 42 | self.assertEqual(e.FIRST['term'], {'ID', '('}) 43 | self.assertEqual(e.FIRST['factor'], {'ID', '('}) 44 | 45 | def test_nullalbe(self): 46 | self.assertEqual(set(g.NULLABLE), {'S', 'A', 'B', 'C', 'D', 'E'}) 47 | 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /examples/eg_dumps.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | import ast 3 | 4 | from metaparse import * 5 | 6 | class G_Calc(metaclass=LALR.meta): 7 | 8 | IGNORED = r'\s+' 9 | 10 | EQ = r'=' 11 | 12 | def NUM(lex: r'[1-9]\d*'): 13 | return float(lex) 14 | 15 | ID = r'[_a-zA-Z]\w*' 16 | POW = r'\*\*', 3 17 | MUL = r'\*' , 2 18 | ADD = r'\+' , 1 19 | 20 | def assign(ID, EQ, expr): 21 | table[ID] = expr 22 | 23 | def expr(NUM): 24 | return NUM 25 | 26 | def 
expr(ID): 27 | return table[ID] 28 | 29 | def expr(expr_1, ADD, expr_2): 30 | return expr_1 + expr_2 31 | 32 | def expr(expr, MUL, expr_1): 33 | return expr * expr_1 34 | 35 | def expr(expr, POW, expr_1): 36 | return expr ** expr_1 37 | 38 | # assert 0 39 | 40 | p = (G_Calc) 41 | 42 | from pprint import pprint 43 | 44 | # with open('eg_dumps_file.py', 'w') as o: 45 | # psr_fl = p.dumps() 46 | # o.write(psr_fl) 47 | 48 | # with open('eg_dumps_file.py', 'r') as o: 49 | # s = o.read() 50 | # p = LALR.loads(s, globals()) 51 | 52 | p.dump('eg_dumps_file.py') 53 | p.load('eg_dumps_file.py', globals()) 54 | 55 | # pprint(p.__dict__) 56 | # pprint(ctx) 57 | 58 | # timeit LALR.loads(s, globals()) 59 | # timeit p = LALR(G_Calc) 60 | 61 | s1 = p.dumps() 62 | p1 = LALR.loads(s1, globals()) 63 | s2 = p1.dumps() 64 | p2 = LALR.loads(s2, globals()) 65 | 66 | table = {} 67 | p2.interpret('x = 3') 68 | p2.interpret('y = x ** 2 * 2 + 1') 69 | pprint(table) 70 | -------------------------------------------------------------------------------- /tests/test_with_reader.py: -------------------------------------------------------------------------------- 1 | from metaparse import LALR 2 | 3 | calc = LALR() 4 | 5 | 6 | with calc as (lex, rule): 7 | 8 | lex(IGNORED = r'\s+') 9 | 10 | @lex(NUM = r'[0-9]+') 11 | def NUM(val): 12 | return int(val) 13 | 14 | lex(EQ = r'=') 15 | lex(ID = r'[_a-zA-Z]\w*') 16 | 17 | lex(POW = r'\*\*', p = 3) 18 | lex(MUL = r'\*', p = 2) 19 | lex(ADD = r'\+', p = 1) 20 | lex(SUB = r'\-', p = 1) 21 | 22 | @rule 23 | def assign(ID, EQ, expr): 24 | table[ID] = expr 25 | return expr 26 | 27 | @rule 28 | def expr(ID): 29 | return table[ID] 30 | 31 | @rule 32 | def expr(NUM): 33 | return int(NUM) 34 | 35 | @rule 36 | def expr(expr_1, ADD, expr_2): 37 | return expr_1 + expr_2 38 | 39 | @rule 40 | def expr(expr_1, SUB, expr_2): 41 | return expr_1 - expr_2 42 | 43 | @rule 44 | def expr(expr, MUL, expr_1): 45 | return expr * expr_1 46 | 47 | @rule 48 | def expr(expr, POW, expr_1): 49 | return expr ** expr_1 50 | 51 | 52 | from pprint import pprint 53 | # pprint(lex) 54 | # pprint(rule) 55 | 56 | # 57 | table = {} 58 | 59 | calc.interpret('x = 8') 60 | calc.interpret('y = x - 6 ') 61 | calc.interpret('z = x ** y ') 62 | 63 | 64 | import unittest 65 | 66 | class Test(unittest.TestCase): 67 | 68 | def test(self): 69 | self.assertEqual(table, dict(x=8, y=2, z=64)) 70 | 71 | 72 | if __name__ == '__main__': 73 | unittest.main() 74 | 75 | -------------------------------------------------------------------------------- /tests/test_resolve.py: -------------------------------------------------------------------------------- 1 | from metaparse import LALR 2 | from pprint import pprint 3 | from unittest import TestCase, main 4 | 5 | 6 | class LangIfThenElse(metaclass=LALR.meta): 7 | 8 | 'Dangling else grammar with ambiguity resolved by precedence.' 
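    # Note (added comment, not in the original file): the dangling-else
    # shift/reduce conflict is resolved here by giving ELSE (precedence 2)
    # a higher precedence than THEN (precedence 1), so the parser prefers
    # shifting ELSE over reducing the shorter `ifstmt` rule; each `else`
    # therefore binds to the nearest unmatched `if ... then`, which is what
    # the expected result in the test below reflects.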
9 | 10 | IGNORED = r'[ \(\)]' 11 | IF = r'if' 12 | THEN = r'then', 1 13 | def ELSE(lex: r'else') -> 2: 14 | return lex 15 | 16 | EXPR = r'e' 17 | SINGLE = r's' 18 | 19 | def stmt(ifstmt): 20 | return ifstmt 21 | 22 | def stmt(SINGLE): 23 | return SINGLE 24 | 25 | def ifstmt(IF, EXPR, THEN, stmt_1, ELSE, stmt_2): 26 | return ('ite', stmt_1, stmt_2) 27 | 28 | def ifstmt(IF, EXPR, THEN, stmt): 29 | return ('it', stmt) 30 | 31 | 32 | 33 | class Test(TestCase): 34 | 35 | def test_parse(self): 36 | 37 | inp = 'if e then (if e then (if e then s else s) else s)' 38 | res = LangIfThenElse.interpret(inp) 39 | 40 | self.assertEqual(res, ('it', ('ite', ('ite', 's', 's'), 's'))) 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | 46 | # inp = 'if e then else (if e then (if e then s else s) else s)' 47 | # r = LangIfThenElse.prepare_generalized() 48 | # l = LangIfThenElse.lexer.tokenize(inp) 49 | # next(r) 50 | # for t in l: 51 | # print('feeding: ', t) 52 | # res = r.send(t) 53 | # print(res) 54 | # res = r.send(None) 55 | # print(res) 56 | 57 | # t = LangIfThenElse.parse_generalized(inp) 58 | -------------------------------------------------------------------------------- /examples/eg_logic.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | 3 | from metaparse import LALR 4 | from collections import namedtuple 5 | 6 | 7 | class PropLogic(metaclass=LALR.meta): 8 | 9 | T = r'True' 10 | F = r'False' 11 | W = r'[A-Z]\w*' 12 | 13 | L = r'\(' ; R = r'\)' 14 | LL = r'\['; RR = r'\]' 15 | 16 | NEG = r'!' , 5 17 | CON = r'&' , 4 18 | DIS = r'\|' , 3 19 | IMP = r'->' , 2 20 | IFF = r'<=>' , 1 21 | 22 | def Sentence(Atomic): 23 | return Atomic 24 | def Sentence(Complex): 25 | return Complex 26 | 27 | def Atomic(T): 28 | return True 29 | def Atomic(F): 30 | return False 31 | def Atomic(W): 32 | return table[W] 33 | 34 | def Complex(L, Sentence, R): 35 | return Sentence 36 | def Complex(LL, Sentence, RR): 37 | return Sentence 38 | def Complex(NEG, Sentence): 39 | return not Sentence 40 | def Complex(Sentence, CON, Sentence_1): 41 | return Sentence and Sentence_1 42 | def Complex(Sentence, DIS, Sentence_1): 43 | return Sentence or Sentence_1 44 | def Complex(Sentence, IMP, Sentence_1): 45 | return not Sentence or Sentence_1 46 | def Complex(Sentence, IFF, Sentence_1): 47 | return Sentence == Sentence_1 48 | 49 | 50 | inp = """ 51 | (P & Q | R & !S) 52 | """ 53 | 54 | table = dict( 55 | P=True, 56 | Q=False, 57 | R=True, 58 | S=False, 59 | ) 60 | 61 | t = PropLogic.parse(inp) 62 | r = PropLogic.interpret(inp) 63 | 64 | from pprint import pprint 65 | 66 | pprint(t) 67 | pprint(r) 68 | 69 | # pprint(PropLogic.__dict__) 70 | -------------------------------------------------------------------------------- /tests/test_lrval_dumpsloads.py: -------------------------------------------------------------------------------- 1 | from metaparse import * 2 | 3 | table = [] 4 | refs = 0 5 | 6 | class G(metaclass=LALR.meta): 7 | 8 | EQ = r'=' 9 | 10 | def STAR(lex: r'\*'): 11 | global refs 12 | refs += 1 13 | return lex 14 | 15 | def ID(lex: r'[_a-zA-Z]\w*'): 16 | table.append(lex) 17 | return lex 18 | 19 | def S(L, EQ, R): 20 | return ('assign', L, R) 21 | 22 | def S(R): 23 | return ('expr', R) 24 | 25 | def L(STAR, R): 26 | return ('deref', R) 27 | 28 | def L(ID): 29 | return ID 30 | 31 | def R(L): 32 | return L 33 | 34 | 35 | import unittest 36 | 37 | class TestDumpLoad(unittest.TestCase): 38 | 39 | def test_dumpload(self): 40 | 41 | inp = '*a = **b' 42 | 43 | 
import pprint as pp 44 | 45 | p1 = G 46 | 47 | s1 = p1.dumps() 48 | p1 = LALR.loads(s1, globals()) 49 | s1 = p1.dumps() 50 | p1 = LALR.loads(s1, globals()) 51 | 52 | r = p1.interpret(inp) 53 | r = p1.interpret(inp) 54 | 55 | self.assertEqual(r, ('assign', 56 | ('deref', 'a'), 57 | ('deref', ('deref', 'b')))) 58 | 59 | self.assertEqual(table, ['a', 'b', 'a', 'b']) 60 | self.assertEqual(refs, 6) 61 | 62 | # s = p1.lexer.dumps() 63 | # lexer = Lexer.loads(s, globals()) 64 | # xs = list(lexer.tokenize(inp, True)) 65 | # # pp.pprint(xs) 66 | 67 | # self.assertEqual(refs, 9) 68 | # # pp.pprint(p1) 69 | # # pp.pprint(p1.__dict__) 70 | 71 | 72 | if __name__ == '__main__': 73 | 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /examples/eg_demo_py2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import preamble 4 | 5 | from metaparse import LALR 6 | 7 | # Global stuff 8 | table = {} 9 | 10 | @LALR.verbose 11 | def calc(lex, rule): 12 | 13 | lex(IGNORED = r'\s+') 14 | 15 | @lex(NUM = r'[0-9]+') 16 | def NUM(val): 17 | return int(val) 18 | 19 | lex(LEFT = r'\(') 20 | lex(RIGHT = r'\)') 21 | 22 | lex(EQ = r'=') 23 | lex(ID = r'[_a-zA-Z]\w*') 24 | 25 | lex(POW = r'\*\*', p = 3) 26 | lex(MUL = r'\*', p = 2) 27 | lex(ADD = r'\+', p = 1) 28 | lex(SUB = r'\-', p = 1) 29 | 30 | @rule 31 | def stmt(assign): 32 | return assign 33 | @rule 34 | def stmt(expr): 35 | return expr 36 | 37 | @rule 38 | def assign(ID, EQ, expr): 39 | table[ID] = expr 40 | return expr 41 | 42 | @rule 43 | def expr(ID): 44 | return table[ID] 45 | @rule 46 | def expr(NUM): 47 | return int(NUM) 48 | @rule 49 | def expr(LEFT, expr, RIGHT): 50 | return expr 51 | 52 | @rule 53 | def expr(expr_1, ADD, expr_2): 54 | return expr_1 + expr_2 55 | @rule 56 | def expr(expr_1, SUB, expr_2): 57 | return expr_1 - expr_2 58 | @rule 59 | def expr(expr, MUL, expr_1): 60 | return expr * expr_1 61 | @rule 62 | def expr(expr, POW, expr_1): 63 | return expr ** expr_1 64 | 65 | 66 | from pprint import pprint 67 | 68 | table = {} 69 | 70 | calc.interpret('x = 8') 71 | calc.interpret('y = x - 6 ') 72 | calc.interpret('z = x ** y ') 73 | 74 | calc.interpret(' (3) ') 75 | calc.interpret(' x = 03 ') 76 | calc.interpret(' y = 4 * x ** (2 + 1) * 2') 77 | 78 | print(table) 79 | 80 | # print(calc.dumps()) 81 | calc1 = LALR.loads(calc.dumps(), globals()) 82 | 83 | calc1.interpret(' w = x + 1') 84 | 85 | print(table) 86 | -------------------------------------------------------------------------------- /examples/eg_lisp.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | import metaparse 3 | from metaparse import LALR 4 | 5 | 6 | class ListParser(metaclass=LALR.meta): 7 | """A tiny grammar for lists.""" 8 | IGNORED = r'\s' 9 | SYMBOL = r'\w+' 10 | def list(list, SYMBOL): 11 | list.append(SYMBOL) 12 | return list 13 | def list(): 14 | return [] 15 | 16 | 17 | class LISP(metaclass=LALR.meta): 18 | """A parser for scheme-like grammar. Should be easy to describe and 19 | parse. 
20 | 21 | """ 22 | 23 | LAMBDA = r'\(\s*lambda' 24 | LEFT = r'\(' 25 | RIGHT = r'\)' 26 | SYMBOL = r'[^\(\)\s]+' 27 | 28 | # _env = {} 29 | # def _unify(): 30 | # pass 31 | 32 | def sexp(var): 33 | return var 34 | def sexp(abst): 35 | return abst 36 | def sexp(appl): 37 | return appl 38 | 39 | def var(SYMBOL): 40 | return SYMBOL 41 | def abst(LAMBDA, LEFT, parlist, RIGHT_1, sexp, RIGHT_2): 42 | return ('LAMBDA', parlist, sexp) 43 | def appl(LEFT, sexp, sexps, RIGHT): 44 | return [sexp, sexps] 45 | 46 | def parlist(SYMBOL, parlist): 47 | return [SYMBOL] + parlist 48 | # def parlist(parlist, SYMBOL): 49 | # return parlist + [SYMBOL] 50 | def parlist(): 51 | return [] 52 | 53 | def sexps(sexps, sexp): 54 | return sexps + [sexp] 55 | # def sexps(sexp, sexps): 56 | # return sexps + [sexp] 57 | def sexps(): 58 | return [] 59 | 60 | 61 | p_lisp = (LISP) 62 | 63 | lx = p_lisp.lexer 64 | p = p_lisp.prepare(True) 65 | next(p) 66 | 67 | inp = '(+ (+ 1 2) 3 ))' 68 | tks = list(lx.tokenize(inp, True)) 69 | 70 | 71 | from pprint import pprint 72 | 73 | # pprint(tks) 74 | 75 | for tk in tks: 76 | res = p.send(tk) 77 | pprint(res) 78 | 79 | # res = p_lisp.interpret('(lambda (x y) (+ x y) ))') 80 | # print(res) 81 | 82 | for tk in tks: 83 | res = p.send(tk) 84 | pprint(res) 85 | -------------------------------------------------------------------------------- /tests/test_grammar.py: -------------------------------------------------------------------------------- 1 | """This file tests the fundamental checking mechanism of the class 2 | Grammar.meta.""" 3 | 4 | import warnings 5 | import unittest 6 | 7 | from metaparse import * 8 | 9 | w = [] 10 | 11 | # with warnings.catch_warnings(record=True) as ws: 12 | if 1: 13 | 14 | class G(metaclass=Grammar.meta): 15 | 16 | def S(A, B, C): pass 17 | def S(D): pass 18 | def A(a, A): pass 19 | def A(): pass 20 | def B(B, b): pass 21 | def B(): pass 22 | def C(c): pass 23 | def C(D): pass 24 | def D(d, D): pass 25 | def D(E): pass 26 | def E(D): pass 27 | def E(B): pass 28 | 29 | # assert len(ws) == 1 30 | 31 | # pprint.pprint(G.terminals) 32 | # pprint.pprint(G.nonterminals) 33 | 34 | 35 | class TestGrammar(unittest.TestCase): 36 | 37 | def test_first_all(self): 38 | self.assertEqual(G.first_of_seq(['A', 'B', 'C'], '#'), {'a', 'b', 'c', 'd', '#'}) 39 | 40 | def test_nullalbe(self): 41 | self.assertEqual(set(G.NULLABLE), {'S', 'A', 'B', 'C', 'D', 'E'}) 42 | 43 | # def test_warn_loop(self): 44 | # with warnings.catch_warnings(record=True) as ws: 45 | # # Same as `G` above. 46 | # class F(metaclass=cfg): 47 | # a, b, c, d = r'abcd' 48 | # def S(A, B, C): pass 49 | # def S(D): pass 50 | # def A(a, A): pass 51 | # def A(): pass 52 | # def B(B, b): pass 53 | # def B(): pass 54 | # def C(c): pass 55 | # def C(D): pass 56 | # def D(d, D): pass 57 | # def D(E): pass 58 | # def E(D): pass 59 | # def E(B): pass 60 | # # Now raised warnings get captured into `ws`. 61 | # self.assertEqual(len(ws), 1) 62 | # # print(ws) 63 | 64 | 65 | if __name__ == '__main__': 66 | unittest.main() 67 | # pass 68 | -------------------------------------------------------------------------------- /examples/eg_demo_py3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import preamble 3 | 4 | import pprint as pp 5 | from metaparse import LALR 6 | 7 | # Global stuff 8 | table = {} 9 | 10 | class G_Calc(metaclass=LALR.meta): 11 | 12 | # ===== Lexical patterns / Terminals ===== 13 | # - A pattern is defined by Python regex literal. 
14 | # - Patterns will be matched in given order when tokenizing. 15 | 16 | IGNORED = r' ' # Special token ignored by tokenizer. 17 | IGNORED = r'\t' # Can add alternative patterns. 18 | 19 | POW = r'\*\*', 3 # Precedence of token (for LALR) 20 | MUL = r'\*' , 2 21 | ADD = r'\+' , 1 22 | 23 | EQ = r'=' # Precedence is 0 by default. 24 | 25 | NUM = r'[1-9]\d*' 26 | def NUM(lex): # Handler for translating token value. 27 | return int(lex) 28 | 29 | ID = r'[_a-zA-Z]\w*' # Unhandled token yields literal value. 30 | 31 | # === Optional error handling for tokenizer === 32 | # - If handler defined, token ERROR is ignored when tokenizing. 33 | # - Otherwise token ERROR is yielded. 34 | ERROR = r'#' 35 | def ERROR(lex): 36 | print("Error literal '{}'".format(lex)) 37 | 38 | # ===== Syntactic/Semantic rules in SDT-style ===== 39 | 40 | def assign(ID, EQ, expr): # May rely on side-effect... 41 | table[ID] = expr 42 | 43 | def expr(NUM): # or return local results for purity 44 | return NUM 45 | 46 | def expr(ID): 47 | return table[ID] 48 | 49 | def expr(expr_1, ADD, expr_2): # With TeX-subscripts, meaning (expr → expr₁ + expr₂) 50 | return expr_1 + expr_2 51 | 52 | def expr(expr, MUL, expr_1): # Can ignore one of the subscripts. 53 | return expr * expr_1 54 | 55 | def expr(expr, POW, expr_1): 56 | return expr ** expr_1 57 | 58 | 59 | pCalc = G_Calc 60 | 61 | from pprint import pprint 62 | # parse and tree 63 | t = pCalc.parse("x = 1 + 4 * 3 ** 2 + 5") 64 | pprint(t) 65 | 66 | -------------------------------------------------------------------------------- /tests/test_generalized.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase, main 2 | from metaparse import * 3 | 4 | 5 | class LangExpr(metaclass=GLR.meta): 6 | 7 | 'An ambigious grammar for arithmetic expressions.' 8 | 9 | def plus(lex: r'\+'): 10 | return lex 11 | 12 | def times(lex: r'\*'): 13 | return lex 14 | 15 | def number(lex: r'\d+'): 16 | return int(lex) 17 | 18 | 19 | def expr(expr, plus, expr_1): 20 | return expr + expr_1 21 | 22 | def expr(expr, times, expr_1): 23 | return expr * expr_1 24 | 25 | def expr(number): 26 | return number 27 | 28 | 29 | class Test(TestCase): 30 | 31 | def test_send(self): 32 | 33 | p = LangExpr.prepare_generalized() 34 | 35 | inp = '1 + 2 * 3 + 4' 36 | x = list(LangExpr.lexer.tokenize(inp)) 37 | 38 | next(p) 39 | for tk in x: 40 | r = p.send(tk) 41 | else: 42 | r = p.send(END_TOKEN) 43 | # 5 combinations for association! 44 | # 45 | # How to calc the number of combinations? 46 | # 47 | # <==> 48 | # 49 | # Given i operators, how many binary trees can they form 50 | # with the same infix-order? 51 | # 52 | # - choose each one as a subtree root 53 | # - divide by the root, calc recursively 54 | # 55 | # B(0) == 1 56 | # B(1) == 1 57 | # B(2) == B(1) + B(1) == 2 58 | # B(3) == B(2) + B(1)*B(1) + B(2) == 2 + 1 + 2 == 5 59 | # B(4) == B(3) + B(1)B(2) + B(2)B(1) + B(3) == 5 + 2 + 2 + 5 == 14 60 | # ... 
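        # (Added note: these B(n) are the Catalan numbers 1, 1, 2, 5, 14, ...,
        #  taking B(0) == 1 for an empty operand side; this matches the
        #  assertions in this file: 5 results for three operators, 14 for four.)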
61 | # B(n) == sum(B(i)B(n-1-i) for i in [1..n-1]) 62 | self.assertEqual(len(r), 2 + 1 + 2) 63 | 64 | def test_send_more(self): 65 | inp = '1 + 2 * 3 + 4 * 5' 66 | y = LangExpr.interpret_generalized(inp) 67 | self.assertEqual(len(y), 14) 68 | 69 | def test_parse(self): 70 | y = LangExpr.interpret_generalized('1 + 2 * 3') 71 | self.assertEqual(y, [9, 7]) 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | 77 | -------------------------------------------------------------------------------- /tests/test_conflict.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from metaparse import LanguageError, LALR, GLR 4 | 5 | from pprint import pprint 6 | 7 | 8 | class TestLRGrammar(unittest.TestCase): 9 | 10 | def test_LALR_report(self): 11 | """LALR parser should report conflicts for ambiguous Grammar.meta! """ 12 | with self.assertRaises(LanguageError) as caught: 13 | 14 | class LangIfThenElse(metaclass=LALR.meta): 15 | 16 | IF = r'if' 17 | THEN = r'then' 18 | ELSE = r'else' 19 | EXPR = r'e' 20 | SINGLE = r's' 21 | 22 | def stmt(ifstmt): 23 | return ifstmt 24 | 25 | def stmt(SINGLE): 26 | return SINGLE 27 | 28 | def ifstmt(IF, EXPR, THEN, stmt_1, ELSE, stmt_2): 29 | return ('ite', EXPR, stmt_1, stmt_2) 30 | 31 | def ifstmt(IF, EXPR, THEN, stmt): 32 | return ('it', EXPR, stmt) 33 | 34 | self.assertIn( 35 | 'Conflict on lookahead: ELSE', 36 | caught.exception.message) 37 | 38 | def test_many(self): 39 | 40 | class LangIfThenElse(metaclass=GLR.meta): 41 | 42 | IF = r'if' 43 | THEN = r'then' 44 | ELSE = r'else' 45 | EXPR = r'\d' 46 | SINGLE = r'[xyz]' 47 | 48 | def stmt(ifstmt): 49 | return ifstmt 50 | 51 | def stmt(SINGLE): 52 | return SINGLE 53 | 54 | def ifstmt(IF, EXPR, THEN, stmt_1, ELSE, stmt_2): 55 | return ('ite', EXPR, stmt_1, stmt_2) 56 | 57 | def ifstmt(IF, EXPR, THEN, stmt): 58 | return ('it', EXPR, stmt) 59 | 60 | results = LangIfThenElse.interpret_generalized('if 1 then if 2 then x else y') 61 | self.assertEqual(len(results), 2) 62 | self.assertIn( 63 | ('it', '1', ('ite', '2', 'x', 'y')), 64 | results) 65 | self.assertIn( 66 | ('ite', '1', ('it', '2', 'x'), 'y'), 67 | results) 68 | 69 | 70 | if __name__ == '__main__': 71 | unittest.main() 72 | -------------------------------------------------------------------------------- /experiments/frontend_styles.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | 3 | from metaparse import * 4 | 5 | table = {} 6 | 7 | # Clean style 8 | class G_Calc(metaclass=cfg): 9 | 10 | IGNORED = r'\s+' 11 | 12 | EQ = r'=' 13 | NUM = r'[0-9]+' 14 | ID = r'[_a-zA-Z]\w*' 15 | POW = r'\*\*', 3 16 | MUL = r'\*' , 2 17 | ADD = r'\+' , 1 18 | 19 | # ERROR handler? 
20 | 21 | def assign(ID, EQ, expr): 22 | table[ID] = expr 23 | 24 | def expr(NUM): 25 | return int(NUM) 26 | 27 | def expr(ID): 28 | return table[ID] 29 | 30 | def expr(expr_1, ADD, expr_2): 31 | return expr_1 + expr_2 32 | 33 | def expr(expr, MUL, expr_1): 34 | return expr * expr_1 35 | 36 | def expr(expr, POW, expr_1): 37 | return expr ** expr_1 38 | 39 | 40 | # Handler style 41 | class G_Calc(): 42 | 43 | def IGNORED(lex: r'\v'): 44 | pass 45 | def IGNORED(lex: r'\\'): 46 | pass 47 | 48 | def ERROR(lex: r'\t'): 49 | print('ERROR') 50 | 51 | def UNRECOGNIZED(lex: r'.'): 52 | pass 53 | 54 | # Terminals 55 | def NUM(lex: r'\d+'): 56 | return int(lex) 57 | 58 | def ID(lex: r'[_a-zA-Z]\w*'): 59 | return lex 60 | 61 | def L(lex: r'\('): 62 | return lex 63 | def R(lex: r'\)'): 64 | return lex 65 | 66 | L2 = r'\[' 67 | R2 = r'\]' 68 | 69 | def PLUS(lex: r'\+') -> 1: 70 | return lex 71 | def POW(lex: r'\*\*') -> 3: 72 | return lex 73 | def TIMES(lex: r'\*') -> 2: 74 | return lex 75 | 76 | # Nonterminals 77 | def assign(ID: r'[_a-zA-Z]\w*', 78 | EQ: '=', 79 | expr): 80 | table[ID] = expr 81 | 82 | def expr(NUM): 83 | return NUM 84 | 85 | def expr(expr_1, ADD: r'\+', expr_2): 86 | return expr_1 + expr_2 87 | 88 | 89 | # Decorator style 90 | def lex(pat, p=0): 91 | def _(func): 92 | return (func.__name__, pat, p, func) 93 | return _ 94 | 95 | class G_Calc(): 96 | 97 | @lex(r'\s+', 3) 98 | def IGNORED(val): 99 | pass 100 | 101 | @lex(r'\t', 2) 102 | def ERROR(val): 103 | print('ERROR!') 104 | 105 | -------------------------------------------------------------------------------- /tests/test_prece.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pprint as pp 4 | from metaparse import LALR 5 | from unittest import main, TestCase 6 | 7 | # Global stuff 8 | table = {} 9 | 10 | class LangCalc(metaclass=LALR.meta): 11 | 12 | # ===== Lexical patterns / Terminals ===== 13 | # - A pattern is defined by Python regex literal. 14 | # - Patterns will be matched in given order when tokenizing. 15 | 16 | IGNORED = r' ' # Special token ignored by tokenizer. 17 | IGNORED = r'\t' # Can add alternative patterns. 18 | 19 | POW = r'\*\*', 3 # Precedence of token (for LALR) 20 | MUL = r'\*' , 2 21 | ADD = r'\+' , 1 22 | 23 | EQ = r'=' # Precedence is 0 by default. 24 | 25 | def NUM(lex: r'[1-9]\d*'): # Handler for translating token value. 26 | return int(lex) 27 | 28 | ID = r'[_a-zA-Z]\w*' # Unhandled token yields literal value. 29 | 30 | # === Optional error handling for tokenizer === 31 | # - If handler defined, token ERROR is ignored when tokenizing. 32 | # - Otherwise token ERROR is yielded. 33 | def ERROR(lex: r'#'): 34 | print("Error literal '{}'".format(lex)) 35 | 36 | # ===== Syntactic/Semantic rules in SDT-style ===== 37 | 38 | def assign(ID, EQ, expr): # May rely on side-effect... 39 | table[ID] = expr 40 | 41 | def expr(NUM): # or return local results for purity 42 | return NUM 43 | 44 | def expr(ID): 45 | return table[ID] 46 | 47 | def expr(expr_1, ADD, expr_2): # With TeX-subscripts, meaning (expr → expr₁ + expr₂) 48 | return expr_1 + expr_2 49 | 50 | def expr(expr, MUL, expr_1): # Can ignore one of the subscripts. 
51 | return expr * expr_1 52 | 53 | def expr(expr, POW, expr_1): 54 | return expr ** expr_1 55 | 56 | 57 | 58 | 59 | class Test(TestCase): 60 | 61 | def test_interp(self): 62 | t = LangCalc.interpret("x = 1 + 4 * 3 ** 2 + 5") 63 | assert table == {'x': 42} 64 | LangCalc.interpret("y = 5 + x * 2") 65 | assert table == {'x': 42, 'y': 5 + 42 * 2} 66 | LangCalc.interpret("z = 99") 67 | assert table == {'x': 42, 'y': 5 + 42 * 2, 'z': 99} 68 | 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /tests/test_arith.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from metaparse import Grammar, LALR 4 | 5 | class GArith(metaclass=LALR.meta): 6 | 7 | 'Textbook Grammar.meta for simple arithmetics.' 8 | 9 | # E -> E + T 10 | # E -> T 11 | # T -> T * F 12 | # T -> F 13 | # F -> NUMBER 14 | # F -> ( E ) 15 | 16 | IGNORED = r' ' 17 | 18 | plus = r'\+' 19 | times = r'\*' 20 | 21 | def number(lex: r'\d+'): 22 | return int(lex) 23 | 24 | left = r'\(' 25 | right = r'\)' 26 | 27 | 28 | def Expr(Expr, plus, Term): 29 | return Expr + Term 30 | def Expr(Term): 31 | return Term 32 | 33 | def Term(Term, times, Factor): 34 | return Term * Factor 35 | def Term(Factor): 36 | return Factor 37 | 38 | def Factor(number): 39 | return number 40 | def Factor(left, Expr, right): 41 | return Expr 42 | 43 | # def Atom(number): 44 | # return int(number) 45 | 46 | g = Grammar(GArith.rules) 47 | p = GArith 48 | 49 | # l = p.lexer 50 | # print(list(l.tokenize('1 2'))) 51 | # assert 0 52 | 53 | class TestArithParser(unittest.TestCase): 54 | 55 | def test_FIRST(self): 56 | self.assertEqual(g.FIRST['Expr'], {'left', 'number'}) 57 | self.assertEqual(g.FIRST['Term'], {'left', 'number'}) 58 | self.assertEqual(g.FIRST['Factor'], {'left', 'number'}) 59 | self.assertEqual(g.FIRST['number'], {'number'}) 60 | 61 | def test_single(self): 62 | inp = '0' 63 | self.assertEqual(eval(inp), p.interpret(inp)) 64 | 65 | def test_normal(self): 66 | inp = '3 + 2 * (5 + 11) * 2 + 3' 67 | self.assertEqual(eval(inp), p.interpret(inp)) 68 | 69 | def test_tough(self): 70 | inp = '3 + 2 * (5 + 11)' 71 | tough_inp = ' + '.join(inp for _ in range(100)) 72 | self.assertEqual(eval(inp), p.interpret(inp)) 73 | 74 | 75 | if __name__ == '__main__': 76 | 77 | unittest.main() 78 | 79 | # For debugging 80 | # t = TestArithParser() 81 | # t.test_normal() 82 | 83 | # tough = ' + '.join(['(2 * (1 + 1) + 2 * 2)'] * 1000) 84 | # %timeit ari_LALR.meta.interpret(tough) 85 | # 1 loops, best of 3: 347 ms per loop 86 | 87 | # with open('C:/Users/Shellay/Desktop/ari.psr', 'wb') as o: 88 | # o.write(ari_LALR.meta.dumps()) 89 | -------------------------------------------------------------------------------- /examples/eg_pystructs.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | import unittest 3 | 4 | from metaparse import * 5 | # from earley import earley 6 | 7 | # class PyStructReader(metaclass=earley): 8 | class PyStructReader(metaclass=LALR.meta): 9 | 10 | """ 11 | Grammar for python object and built-in container types. 
12 | """ 13 | 14 | l1 = r'\(' 15 | r1 = r',?\s*\)' 16 | l2 = r'\[' 17 | r2 = r',?\s*\]' 18 | l3 = r'\{' 19 | r3 = r',?\s*\}' 20 | comma = r',' 21 | colon = r':' 22 | id = r'[A-Za-z_]\w*' 23 | 24 | def Obj(id) : return ('Sym', id) 25 | 26 | def Obj(Lst) : return Lst 27 | def Obj(Tpl) : return Tpl 28 | def Obj(Dic) : return Dic 29 | def Obj(Set) : return Set 30 | 31 | def Tpl(l1, Objs, r1) : return ('Tpl', Objs) 32 | def Lst(l2, Objs, r2) : return ('Lst', Objs) 33 | def Set(l3, Obj, Objs, r3) : return ('Set', [Obj] + Objs) # 'Set' contains at least one object 34 | def Dic(l3, DTerms, r3) : return ('Dic', DTerms) 35 | 36 | def Objs(Objs, comma, Obj) : return Objs + [Obj] 37 | def Objs(Obj) : return [Obj] 38 | def Objs() : return [] 39 | 40 | def DTerms(DTerms, comma, DTerm) : return DTerms + [DTerm] 41 | def DTerms(DTerm) : return [DTerm] 42 | def DTerms() : return [] 43 | 44 | def DTerm(Obj_1, colon, Obj_2) : return (Obj_1, Obj_2) 45 | 46 | 47 | target = PyStructReader.interpret 48 | 49 | class TestPyStructParser(unittest.TestCase): 50 | 51 | def test_empty_list(self): 52 | r = target('[]') 53 | self.assertEqual(r, ('Lst', [])) 54 | 55 | def test_empty_dict(self): 56 | r = target('{}') 57 | self.assertEqual(r, ('Dic', [])) 58 | 59 | def test_symbol(self): 60 | self.assertEqual(target('a'), ('Sym', 'a')) 61 | 62 | def test_normal_set(self): 63 | self.assertEqual( 64 | target('{(a, b), c, {e, f}}'), 65 | ('Set', [('Tpl', [('Sym', 'a'), ('Sym', 'b')]), 66 | ('Sym', 'c'), 67 | ('Set', [('Sym', 'e'), ('Sym', 'f')])])) 68 | 69 | def test_normal_dict(self): 70 | self.assertEqual( 71 | target('[{a: b}, {c}, {x: y, z: [a]}]'), 72 | ('Lst', [('Dic', [(('Sym', 'a'), ('Sym', 'b'))]), 73 | ('Set', [('Sym', 'c')]), 74 | ('Dic', [(('Sym', 'x'), ('Sym', 'y')), 75 | (('Sym', 'z'), ('Lst', [('Sym', 'a')]))])])) 76 | 77 | 78 | if __name__ == '__main__': 79 | unittest.main() 80 | -------------------------------------------------------------------------------- /tests/test_pystruct.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from metaparse import * 4 | 5 | class LangPyStruct(metaclass=LALR.meta): 6 | 7 | """ 8 | Grammar for python object and built-in container types. 
9 | """ 10 | 11 | l1 = r'\(' 12 | r1 = r',?\s*\)' 13 | l2 = r'\[' 14 | r2 = r',?\s*\]' 15 | l3 = r'\{' 16 | r3 = r',?\s*\}' 17 | comma = r',' 18 | colon = r':' 19 | id = r'[A-Za-z_]\w*' 20 | 21 | def Obj(id) : return ('Sym', id) 22 | 23 | def Obj(Lst) : return Lst 24 | def Obj(Tpl) : return Tpl 25 | def Obj(Dic) : return Dic 26 | def Obj(Set) : return Set 27 | 28 | def Tpl(l1, Objs, r1) : return ('Tpl', Objs) 29 | def Lst(l2, Objs, r2) : return ('Lst', Objs) 30 | def Set(l3, Obj, Objs, r3) : return ('Set', [Obj] + Objs) # 'Set' contains at least one object 31 | def Dic(l3, DTerms, r3) : return ('Dic', DTerms) 32 | 33 | def Objs(Objs, comma, Obj) : return Objs + [Obj] 34 | def Objs(Obj) : return [Obj] 35 | def Objs() : return [] 36 | 37 | def DTerms(DTerms, comma, DTerm) : return DTerms + [DTerm] 38 | def DTerms(DTerm) : return [DTerm] 39 | def DTerms() : return [] 40 | 41 | def DTerm(Obj_1, colon, Obj_2) : return (Obj_1, Obj_2) 42 | 43 | 44 | target = LangPyStruct.interpret 45 | 46 | GrammarPyStruct = Grammar(LangPyStruct.rules) 47 | 48 | 49 | class TestPyStructParser(unittest.TestCase): 50 | 51 | def test_first(self): 52 | self.assertEqual(GrammarPyStruct.FIRST['Obj'], {'l1', 'id', 'l2', 'l3'}) 53 | 54 | def test_empty_list(self): 55 | self.assertEqual(target('[]'), ('Lst', [])) 56 | 57 | def test_empty_dict(self): 58 | self.assertEqual(target('{}'), ('Dic', [])) 59 | 60 | def test_symbol(self): 61 | self.assertEqual(target('a'), ('Sym', 'a')) 62 | 63 | def test_normal_set(self): 64 | self.assertEqual( 65 | target('{(a, b), c, {e, f}}'), 66 | ('Set', [('Tpl', [('Sym', 'a'), ('Sym', 'b')]), 67 | ('Sym', 'c'), 68 | ('Set', [('Sym', 'e'), ('Sym', 'f')])])) 69 | 70 | def test_normal_dict(self): 71 | self.assertEqual( 72 | target('[{a: b}, {c}, {x: y, z: [a]}]'), 73 | ('Lst', [('Dic', [(('Sym', 'a'), ('Sym', 'b'))]), 74 | ('Set', [('Sym', 'c')]), 75 | ('Dic', [(('Sym', 'x'), ('Sym', 'y')), 76 | (('Sym', 'z'), ('Lst', [('Sym', 'a')]))])])) 77 | 78 | 79 | if __name__ == '__main__': 80 | unittest.main() 81 | -------------------------------------------------------------------------------- /experiments/only_syntax.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import Symbol, Rule, Grammar, LALR 3 | from pprint import pprint 4 | 5 | class read(type): 6 | 7 | class gs(object): 8 | def __init__(self): 9 | self.lexes = [] 10 | self.pats = [] 11 | self.rules = [] 12 | self.prece = {} 13 | def __setitem__(self, k, v): 14 | if not k.startswith('__'): 15 | # lexical tuple 16 | if isinstance(v, tuple): 17 | assert len(v) == 2 18 | l, p = v 19 | self.lexes.append(k) 20 | self.pats.append(l) 21 | self.prece[k] = p 22 | # lexical str 23 | elif isinstance(v, str): 24 | self.lexes.append(k) 25 | self.pats.append(v) 26 | # alternatives 27 | elif isinstance(v, (list, set)): 28 | for alt in v: 29 | if not isinstance(alt, (list, tuple)): 30 | alt = (alt,) 31 | rhs = [] 32 | for x in alt: 33 | if isinstance(x, Symbol): 34 | rhs.append(str(x)) 35 | elif isinstance(x, str): 36 | self.lexes.append(x) 37 | self.pats.append(None) 38 | rhs.append(x) 39 | self.rules.append(Rule(k, rhs)) 40 | # 41 | elif callable(v): 42 | pass 43 | def __getitem__(self, k0): 44 | return Symbol(k0) 45 | 46 | @classmethod 47 | def __prepare__(mcls, n, bs, **kw): 48 | return read.gs() 49 | def __new__(mcls, n, bs, gs): 50 | return Grammar(gs.lexes, gs.pats, gs.rules, prece=gs.prece) 51 | 52 | 53 | class E(metaclass=read): 54 | 55 | # IGNORED = r'\s+' 56 | 57 | NEG = 
r'!' , 5 58 | CON = r'&' , 4 59 | DIS = r'\|' , 3 60 | IMP = r'->' , 2 61 | IFF = r'<=>' , 1 62 | 63 | W = r'[A-Z]\w*' 64 | 65 | Sentence = [ 66 | Atomic, 67 | Complex, 68 | ] 69 | 70 | Atomic = [ 71 | 'True', 72 | 'False', 73 | W, 74 | ] 75 | 76 | Complex = [ 77 | ('(', Sentence, ')'), 78 | ('[', Sentence, ']'), 79 | (NEG, Sentence), 80 | (Sentence, CON, Sentence), 81 | (Sentence, DIS, Sentence), 82 | (Sentence, IMP, Sentence), 83 | (Sentence, IFF, Sentence), 84 | ] 85 | 86 | 87 | # pprint(E) 88 | 89 | # g = Grammar(*E) 90 | # pprint(g) 91 | pprint(E.lex2pats) 92 | p = LALR(E) 93 | 94 | # pprint([*p.lexer.tokenize('True & False', True)]) 95 | # pprint(p.parse('P & Q | R & !S')) 96 | 97 | s = p.dump('meta_dumps.py') 98 | p1 = LALR.load('meta_dumps.py', globals()) 99 | 100 | # print(s) 101 | 102 | pprint(p1.parse('P & Q | R & !S')) 103 | -------------------------------------------------------------------------------- /examples/eg_read_ebnf.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import Token, Rule, Lexer 3 | from metaparse import LALR 4 | from collections import namedtuple 5 | from pprint import pprint 6 | 7 | Seq = namedtuple('Seq', 'exprs') 8 | Rep = namedtuple('Rep', 'expr') 9 | Opt = namedtuple('Opt', 'expr') 10 | Alts = namedtuple('Alts', 'exprs') 11 | 12 | 13 | class Symbol(str): 14 | def __repr__(self): 15 | return self 16 | 17 | 18 | class EBNF(metaclass=LALR.meta): 19 | 20 | ID = r'[a-zA-Z]\w+' 21 | TERM1 = r'\'[^\']*\'' 22 | TERM2 = r'\"[^\"]*\"' 23 | 24 | DRV = r'=' 25 | ALT = r'\|' 26 | CON = r',' 27 | SEMI = r';' 28 | 29 | L = r'\(' ; R = r'\)' 30 | Lb = r'\[' ; Rb = r'\]' 31 | LB = r'\{' ; RB = r'\}' 32 | 33 | def grammar(rules): 34 | return rules 35 | 36 | def rules(rules, rule): 37 | rules.append(rule) 38 | return rules 39 | def rules(): 40 | return [] 41 | 42 | def rule(lhs, DRV, rhs, SEMI): 43 | return (Symbol(lhs), rhs) 44 | 45 | def lhs(ID): 46 | return ID 47 | 48 | def rhs(alts): 49 | return Alts(alts) 50 | 51 | def alts(alts, ALT, seq): 52 | alts.append(Seq(seq)) 53 | return alts 54 | def alts(seq): 55 | return [Seq(seq)] 56 | 57 | def seq(seq, CON, expr): 58 | return seq + (expr,) 59 | def seq(expr): 60 | return (expr,) 61 | 62 | def expr(ID): return Symbol(ID) 63 | def expr(term): return term[1:-1] 64 | def expr(opt): return Opt(opt) 65 | def expr(rep): return Rep(rep) 66 | def expr(grp): return grp 67 | 68 | def term(TERM1): return TERM1 69 | def term(TERM2): return TERM2 70 | 71 | def grp(L, alts, R): return (alts) 72 | def opt(Lb, alts, Rb): return (alts) 73 | def rep(LB, alts, RB): return (alts) 74 | 75 | 76 | inp = """ 77 | letter = "A" | "B" | "C" | "D" | "E" | "F" | "G" 78 | | "H" | "I" | "J" | "K" | "L" | "M" | "N" 79 | | "O" | "P" | "Q" | "R" | "S" | "T" | "U" 80 | | "V" | "W" | "X" | "Y" | "Z" | "a" | "b" 81 | | "c" | "d" | "e" | "f" | "g" | "h" | "i" 82 | | "j" | "k" | "l" | "m" | "n" | "o" | "p" 83 | | "q" | "r" | "s" | "t" | "u" | "v" | "w" 84 | | "x" | "y" | "z" ; 85 | digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ; 86 | symbol = "[" | "]" | "{" | "}" | "(" | ")" | "<" | ">" 87 | | "'" | '"' | "=" | "|" | "." 
| "," | ";" ; 88 | character = letter | digit | symbol | "_" ; 89 | 90 | identifier = letter , { letter | digit | "_" } ; 91 | terminal = "'" , character , { character } , "'" 92 | | '"' , character , { character } , '"' ; 93 | 94 | lhs = identifier ; 95 | rhs = identifier 96 | | terminal 97 | | "[" , rhs , "]" 98 | | "{" , rhs , "}" 99 | | "(" , rhs , ")" 100 | | rhs , "|" , rhs 101 | | rhs , "," , rhs ; 102 | 103 | rule = lhs , "=" , rhs , ";" ; 104 | grammar = { rule } ; 105 | """ 106 | 107 | pprint(EBNF) 108 | 109 | tr = EBNF.parse(inp) 110 | e = EBNF.interpret(inp) 111 | 112 | # pprint(tr) 113 | pprint(e) 114 | -------------------------------------------------------------------------------- /tests/test_courses.py: -------------------------------------------------------------------------------- 1 | from metaparse import LALR 2 | 3 | 4 | def fappend(l, x): 5 | l.append(x) 6 | return l 7 | 8 | class LangCourses(metaclass=LALR.meta): 9 | 10 | """ 11 | Grammar to assign multiple numbers to precedend course name. 12 | Examples: 13 | 14 | "CS 2110" => ("CS", 2110) # 0 15 | 16 | "CS 2110 and INFO 3300" => [("CS", 2110), ("INFO", 3300)] # 1 17 | "CS 2110, INFO 3300" => [("CS", 2110), ("INFO", 3300)] # 1 18 | "CS 2110, 3300, 3140" => [("CS", 2110), ("CS", 3300), ("CS", 3140)] # 1 19 | 20 | "CS 2110 or INFO 3300" => [[("CS", 2110)], [("INFO", 3300)]] # 2 21 | 22 | "MATH 2210, 2230, 2310, or 2940" => [[("MATH", 2210), ("MATH", 2230), ("MATH", 2310)], [("MATH", 2940)]] # 3 23 | 24 | """ 25 | 26 | IGNORED = r'[ \t]+|(,)|(and)' 27 | NAME = r'[A-Z]+' 28 | NUMBER = r'\d{4}' 29 | OR = r'or' 30 | 31 | 32 | # info -> headed 33 | def info(headed): return headed 34 | 35 | # info -> conj 36 | def info(conj): return conj 37 | 38 | # info -> disj 39 | def info(disj): return disj 40 | 41 | # headed -> NAME nums 42 | def headed(NAME, nums): return [(NAME, x) for x in nums] 43 | 44 | # nums -> nums NUMBER 45 | def nums(nums, NUMBER): return fappend(nums , NUMBER) 46 | # def nums(nums, NUMBER): return nums + [NUMBER] 47 | 48 | # nums -> NUMBER 49 | def nums(NUMBER): return [NUMBER] 50 | 51 | # conj -> headed headed 52 | def conj(headed_1, headed_2): return headed_1 + headed_2 53 | 54 | # disj -> headed OR headed 55 | def disj(headed_1, OR, headed_2): return [headed_1, headed_2] 56 | 57 | # disj -> headed OR nums 58 | def disj(headed, OR, nums): return [headed, [(headed[0][0], n) for n in nums]] 59 | 60 | import pprint as pp 61 | from unittest import main, TestCase 62 | 63 | gcrs = LangCourses 64 | 65 | class Test(TestCase): 66 | def test_match(self): 67 | assert gcrs.interpret('CS 2110') == \ 68 | [('CS', '2110')] 69 | assert gcrs.interpret('CS 2110 and INFO 3300') == \ 70 | [('CS', '2110'), ('INFO', '3300')] 71 | assert gcrs.interpret('CS 2110, INFO 3300') == \ 72 | [('CS', '2110'), ('INFO', '3300')] 73 | assert gcrs.interpret('CS 2110, 3300, 3140') == \ 74 | [('CS', '2110'), ('CS', '3300'), ('CS', '3140')] 75 | assert gcrs.interpret('CS 2110 or INFO 3300') == \ 76 | [[('CS', '2110')], [('INFO', '3300')]] 77 | 78 | # Compare forms with same semantics... 
79 | inp = "MATH 2210, 2230, 2310 or 2940" 80 | s1 = gcrs.parse(inp) 81 | v1 = gcrs.interpret(inp) 82 | 83 | inp = "MATH 2210, 2230, 2310, or 2940" 84 | s2 = gcrs.parse(inp) 85 | v2 = gcrs.interpret(inp) 86 | 87 | # assert s1 == s2 88 | from pprint import pprint 89 | # pprint(s1) 90 | # pprint(s2) 91 | # self.assertEqual((s1), (s2)) 92 | self.assertEqual(str(s1), str(s2)) 93 | self.assertEqual(v1, v2) 94 | 95 | 96 | if __name__ == '__main__': 97 | main() 98 | -------------------------------------------------------------------------------- /tests/tests_error_handling/test_language_errors.py: -------------------------------------------------------------------------------- 1 | from metaparse import LanguageError, LALR 2 | import unittest 3 | 4 | 5 | class TestLangError(unittest.TestCase): 6 | 7 | def test_missing_symbol(self): 8 | with self.assertRaises(LanguageError) as excCtx: 9 | 10 | class ExprLang(metaclass=LALR.meta): 11 | 12 | NUM = '\d+' 13 | PLUS = '\+' 14 | # TIMES = '\*' 15 | 16 | def expr(expr, PLUS, term): 17 | return expr + term 18 | 19 | def expr(expr, TIMES, term): 20 | return expr * term 21 | 22 | def expr(term): 23 | return term 24 | 25 | def term(NUM): 26 | return int(NUM) 27 | 28 | def factor(NUM): 29 | return int(NUM) 30 | 31 | self.assertIn( 32 | 'No lexical pattern provided for terminal symbol: TIMES', 33 | excCtx.exception.message) 34 | 35 | def test_unreachable_rule(self): 36 | with self.assertRaises(LanguageError) as excCtx: 37 | 38 | class ExprLang(metaclass=LALR.meta): 39 | 40 | NUM = '\d+' 41 | PLUS = '\+' 42 | TIMES = '\*' 43 | 44 | def expr(expr, PLUS, term): 45 | return expr + term 46 | 47 | def expr(expr, TIMES, term): 48 | return expr * term 49 | 50 | def expr(term): 51 | return term 52 | 53 | def term(NUM): 54 | return int(NUM) 55 | 56 | def factor(NUM): 57 | return int(NUM) 58 | 59 | self.assertIn( 60 | "There are unreachable nonterminals at 5th rule: {'factor'}.", 61 | excCtx.exception.message) 62 | 63 | 64 | class TestLangErrorApi2(unittest.TestCase): 65 | 66 | def test_missing_symbol(self): 67 | with self.assertRaises(LanguageError) as excCtx: 68 | p = LALR() 69 | with p as (lex, rule): 70 | lex(a = 'a') 71 | lex(b = 'b') 72 | @rule 73 | def S(a, S, b): pass 74 | @rule 75 | def S(): pass 76 | @rule 77 | def S(c): pass 78 | self.assertIn( 79 | 'No lexical pattern provided for terminal symbol: c', 80 | excCtx.exception.message) 81 | 82 | def test_unreachable_rule(self): 83 | with self.assertRaises(LanguageError) as excCtx: 84 | p = LALR() 85 | with p as (l, r): 86 | l(a = 'a') 87 | l(b = 'b') 88 | @r 89 | def S(a, S, b): pass 90 | @r 91 | def S(): pass 92 | @r 93 | def B(a): pass 94 | @r 95 | def B(b): pass 96 | 97 | self.assertIn( 98 | "There are unreachable nonterminals at 3th rule: {'B'}.", 99 | excCtx.exception.message) 100 | 101 | 102 | if __name__ == '__main__': 103 | unittest.main() 104 | -------------------------------------------------------------------------------- /examples/eg_read_yacc.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import * 3 | from collections import OrderedDict 4 | from pprint import pprint 5 | 6 | 7 | class Symbol(str): 8 | def __repr__(self): 9 | return self 10 | 11 | 12 | class Helper: 13 | 14 | terms = OrderedDict() 15 | _c = -1 16 | 17 | def reset(): 18 | Helper._c = -1 19 | Helper.terms = OrderedDict() 20 | 21 | def get_term(lit): 22 | Helper._c += 1 23 | if lit not in Helper.terms: 24 | term = Symbol('TM{}'.format(Helper._c)) 25 | 
Helper.terms[lit] = term 26 | return Helper.terms[lit] 27 | 28 | 29 | class YACC(metaclass=LALR.meta): 30 | 31 | IGNORED = r'\s+' 32 | IGNORED = r'\/\*[^(\*/)]*\*\/' 33 | # IGNORED = r'\{[^\}]*\}' 34 | 35 | ALT = r'\|' 36 | DRV = r':' 37 | SEMI = r';' 38 | 39 | BODY = r'\{[^\}]*\}' 40 | 41 | ID = r'[_a-zA-Z]\w*' 42 | TERM1 = r"\'[^\']*\'" 43 | TERM2 = r'\"[^\"]*\"' 44 | 45 | def grammar(rules): 46 | terms = [' {} = r{}'.format( 47 | tok, 48 | repr(pat[1:-1])) for pat, tok in Helper.terms.items()] 49 | gen = '\n'.join([ 50 | 'from metaparse import LALR', 51 | '', 52 | 'class G(metaclass=LALR.meta):', 53 | '', 54 | *terms, 55 | '', 56 | *rules, 57 | ]) 58 | return gen 59 | 60 | def rules(): return [] 61 | def rules(rules, rule): 62 | return rules + rule 63 | 64 | def term(TERM1): 65 | return Helper.get_term(TERM1) 66 | def term(TERM2): 67 | return Helper.get_term(TERM2) 68 | 69 | def rule(ID, DRV, alts, SEMI): 70 | r_defs = [] 71 | for seq, bdy in alts: 72 | r_def = ' def {}{}:\n r"""{}"""\n'.format( 73 | ID, 74 | seq, 75 | repr(bdy), 76 | ) 77 | r_defs.append(r_def) 78 | return r_defs 79 | 80 | def alts(alts, ALT, alt): 81 | alts.append(alt) 82 | return alts 83 | def alts(alt): 84 | return [alt] 85 | 86 | def alt(seq): 87 | return (seq, '') 88 | def alt(seq, BODY): 89 | return (seq, BODY) 90 | 91 | def seq(seq, symbol): 92 | return seq + (symbol,) 93 | def seq(): 94 | return () 95 | 96 | def symbol(term): 97 | return term 98 | def symbol(ID): 99 | return Symbol(ID) 100 | 101 | 102 | eg = """ 103 | input: /* empty */ 104 | | input line 105 | ; 106 | 107 | line: '\n' 108 | | exp '\n' { printf ("\t%.10g\n", $1); } 109 | ; 110 | 111 | exp: NUM { $$ = $1; } 112 | | exp exp '+' { $$ = $1 + $2; } 113 | | exp exp '-' { $$ = $1 - $2; } 114 | | exp exp '*' { $$ = $1 * $2; } 115 | | exp exp '/' { $$ = $1 / $2; } 116 | /* Exponentiation */ 117 | | exp exp '^' { $$ = pow ($1, $2); } 118 | /* Unary minus */ 119 | | exp '-' { $$ = -$1; } 120 | ; 121 | """ 122 | 123 | # pprint([*YACC.tokenize(eg, True)]) 124 | 125 | yacc = YACC 126 | tr = yacc.parse(eg) 127 | res = yacc.interpret(eg) 128 | 129 | # pprint(yacc.grammar.lexers) 130 | # pprint(yacc) 131 | # pprint(tr) 132 | print() 133 | print(res) 134 | 135 | 136 | r_plus = Rule('exp', ['exp', 'exp', '+']) 137 | print(r_plus) 138 | -------------------------------------------------------------------------------- /examples/eg_demo.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import LALR 3 | 4 | # Global context/environment for language semantics. 5 | context = {} 6 | 7 | class pCalc(metaclass=LALR.meta): 8 | 9 | "A language for calculating expressions." 10 | 11 | # ===== Lexical patterns / Terminals ===== 12 | # - Patterns specified with regular expressions 13 | # - Patterns will be tested in declaration order during tokenizing 14 | 15 | IGNORED = r'\s+' # Special pattern to be ignored. 16 | 17 | EQ = r'=' 18 | POW = r'\*\*', 3 # Can specify precedence of token (for LALR conflict resolution) 19 | POW = r'\^' , 3 # Alternative patterns can share the same name 20 | MUL = r'\*' , 2 21 | ADD = r'\+' , 1 22 | 23 | ID = r'[_a-zA-Z]\w*' 24 | NUM = r'[1-9][0-9]*' 25 | def NUM(value): # Can specify handler for lexical pattern! 26 | return int(value) 27 | 28 | # ===== Syntactic/Semantic rules in SDT-style ===== 29 | 30 | def assign(ID, EQ, expr): # May access global context. 31 | context[ID] = expr 32 | return expr 33 | 34 | def expr(NUM): # May compute result purely. 
35 | return NUM # NUM is passed as int due to the handler! 36 | 37 | def expr(ID): 38 | return context[ID] 39 | 40 | def expr(expr_1, ADD, expr_2): # With TeX-subscripts, meaning (expr → expr₁ + expr₂) 41 | return expr_1 + expr_2 42 | 43 | def expr(expr, MUL, expr_1): # Can ignore one of the subscripts. 44 | return expr * expr_1 45 | 46 | def expr(expr, POW, expr_1): 47 | return expr ** expr_1 48 | 49 | 50 | from pprint import pprint 51 | 52 | print (type(pCalc)) 53 | 54 | print (pCalc.interpret("x = 1 + 4 * 3 ** 2 + 5")) 55 | # 42 56 | print (pCalc.interpret("y = 5 + x * 2")) # Here `x` is extracted from the context `context` 57 | # 89 58 | print (pCalc.interpret("z = 9 ^ 2")) 59 | # 81 60 | 61 | print (context) 62 | 63 | 64 | tr = pCalc.parse(" w = 1 + 2 * 3 ** 4 + 5 ") 65 | 66 | # pprint(tr) 67 | print (pCalc.lexer) 68 | 69 | for token in pCalc.lexer.tokenize(" foo = 1 + bar * 2"): 70 | print(token.pos, 71 | token.end, 72 | token.symbol, 73 | repr(token.lexeme), # (lexeme) is something literal. 74 | repr(token.value)) # (value) is something computed by handler, if exists. 75 | 76 | # 1 2 ID 'w' 77 | # 4 5 EQ '=' 78 | # 6 7 NUM '1' 79 | # 8 9 ADD '+' 80 | # 10 11 ID 'x' 81 | # 12 13 MUL '*' 82 | # 14 15 NUM '2' 83 | 84 | ('assign', 85 | [('ID', 'w'), 86 | ('EQ', '='), 87 | ('expr', 88 | [('expr', 89 | [('expr', [('NUM', '1')]), 90 | ('ADD', '+'), 91 | ('expr', 92 | [('expr', [('NUM', '2')]), 93 | ('MUL', '*'), 94 | ('expr', 95 | [('expr', [('NUM', '3')]), 96 | ('POW', '**'), 97 | ('expr', [('NUM', '4')])])])]), 98 | ('ADD', '+'), 99 | ('expr', [('NUM', '5')])])]) 100 | 101 | 102 | # s = pCalc.dumps() 103 | # print(s) 104 | # pCalc.dump('./eg_demo_dump.py') 105 | 106 | 107 | # Let loaded parser be able to access current runtime env `globals()`. 108 | # qCalc = LALR.load('./eg_demo_dump.py', globals()) 109 | 110 | # Context instance to be accessed by the loaded parser 111 | # context = {} 112 | 113 | # qCalc.interpret('foo = 1 + 9') 114 | 115 | # print (context) 116 | # {'foo': 10} 117 | 118 | 119 | 120 | # context = {} 121 | # pCalc.interpret("bar = 10 ^ 3") 122 | # # pCalc1.interpret("bar = 99 + 1") 123 | # print(context) 124 | -------------------------------------------------------------------------------- /tests/test_basic.py: -------------------------------------------------------------------------------- 1 | import metaparse as mp 2 | from metaparse import LALR, END_TOKEN 3 | 4 | p = LALR() 5 | 6 | p.lexer.more( 7 | IGNORED=' ', 8 | PLUS='\+', 9 | TIMES='\*', 10 | LEFT='\(', 11 | RIGHT='\)' 12 | ) 13 | 14 | @p.lexer(NUMBER='\d+') 15 | def _(val): 16 | return int(val) 17 | 18 | @p.rule 19 | def expr(expr, PLUS, term): 20 | return expr + term 21 | 22 | @p.rule 23 | def expr(term): 24 | return term 25 | 26 | @p.rule 27 | def term(term, TIMES, factor): 28 | return term * factor 29 | 30 | @p.rule 31 | def term(factor): 32 | return factor 33 | 34 | 35 | with p as (lexer, rule): 36 | 37 | @rule 38 | def factor(NUMBER): 39 | return NUMBER 40 | 41 | @rule 42 | def factor(LEFT, expr, RIGHT): 43 | return expr 44 | 45 | # exit and make! 
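    # Leaving the `with` block above appears to build the parser tables
    # automatically (hence "exit and make!"), which is presumably why the
    # explicit p.make() call below is left commented out.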
46 | 47 | # p.make() 48 | 49 | 50 | from pprint import pprint 51 | # pprint(p.grammar) 52 | # p.inspect_ACTION 53 | # t = p.parse('123') 54 | # pprint(t) 55 | tkns = (p.lexer.tokenize('123 + 8')) 56 | q = p.prepare() 57 | next(q) 58 | q.send(next(tkns)) 59 | q.send(next(tkns)) 60 | q.send(next(tkns)) 61 | t = q.send(END_TOKEN) 62 | assert t == mp.Just(131) 63 | 64 | t = p.parse('123 + 8') 65 | assert p.interpret('123 + 8') == 131 66 | t = p.parse('123 + 2 * 1') 67 | assert p.interpret('123 + 2 * 1') == 125 68 | assert p.interpret('123 + 2 * (1 + 2)') == 129 69 | 70 | tough = ' + '.join(['(2 * (1 + (1)) + 2 * 2 + (3))'] * 100) 71 | assert p.interpret(tough) == eval(tough) 72 | 73 | # if replication is 10000 74 | # %timeit p.interpret(tough) 75 | # 1 loops, best of 3: 346 ms per loop 76 | 77 | p_sexp = LALR() 78 | 79 | with p_sexp as (lex, rule): 80 | 81 | # # Order??? 82 | # lex.word( 83 | # IGNORED=' ', 84 | # LEFT='(', 85 | # RIGHT=')', 86 | # COMMA=',', 87 | # ) 88 | # lex.re( 89 | # NUMBER='\d+(\.\d*)?', 90 | # SYMBOL='\w+', 91 | # UNKNOWN='%', 92 | # ) 93 | lex.more( 94 | IGNORED='%', 95 | LEFT='\(', 96 | RIGHT='\)', 97 | COMMA=',', 98 | ) 99 | lex(IGNORED='\s+') 100 | lex(SYMBOL='[_a-zA-Z]\w*') 101 | lex(UNKNOWN='&') 102 | 103 | @lex(NUMBER='[1-9]\d*(\.\d*)?') 104 | def _(val): 105 | return int(val) 106 | 107 | @rule 108 | def sexp(atom): 109 | return atom 110 | @rule 111 | def sexp(LEFT, slist, RIGHT): 112 | return slist 113 | 114 | 115 | @rule 116 | def slist(): 117 | return [] 118 | @rule 119 | def slist(slist, sexp): 120 | slist.append(sexp) 121 | return slist 122 | 123 | @rule 124 | def atom(NUMBER): 125 | return NUMBER 126 | @rule 127 | def atom(SYMBOL): 128 | return SYMBOL 129 | 130 | # p_sexp.inspect_ACTION 131 | # p_sexp.inspect_GOTO 132 | 133 | # debug p_sexp.make() 134 | 135 | # s = p_sexp.parse('123') 136 | # pprint(s) 137 | # pprint(list(p_sexp.lexer.tokenize('(a b (c d))'))) 138 | # pprint(p_sexp.lexer) 139 | 140 | # ds = (p_sexp.dumps()) 141 | # ctx = {} 142 | # exec(ds, {}, ctx) 143 | # pprint(ctx) 144 | 145 | # lx_dp = p_sexp.lexer.dumps() 146 | # print(lx_dp) 147 | # lexer1 = Lexer.loads(lx_dp, globals()) 148 | 149 | # print(list(lexer1.tokenize(' 123 99 '))) 150 | 151 | 152 | import warnings 153 | 154 | with warnings.catch_warnings(record=True) as w: 155 | s = p_sexp.interpret('(a 123 (c (d)) % & e)') 156 | assert len(w) == 1 157 | 158 | assert s == ['a', 123, ['c', ['d']], 'e'], s 159 | 160 | 161 | sexp_dp = p_sexp.dumps() 162 | 163 | with open('sexp_dump.py', 'w') as o: 164 | o.write(sexp_dp) 165 | 166 | # print(sexp_dp) 167 | p_sexp1 = LALR.loads(sexp_dp, globals()) 168 | 169 | with warnings.catch_warnings(record=True) as w: 170 | s = p_sexp.interpret('(a & 123 (c (d)) % & e)') 171 | assert len(w) == 2 172 | 173 | 174 | assert s == ['a', 123, ['c', ['d']], 'e'], s 175 | 176 | -------------------------------------------------------------------------------- /examples/eg_demo_dump.py: -------------------------------------------------------------------------------- 1 | lex2pats = [('IGNORED', '\\s+'), 2 | ('EQ', '='), 3 | ('NUM', '[1-9][0-9]*'), 4 | ('ID', '[_a-zA-Z]\\w*'), 5 | ('POW', '\\*\\*'), 6 | ('MUL', '\\*'), 7 | ('ADD', '\\+')] 8 | 9 | handlers = [None, None, None, None, None, None, None] 10 | 11 | rules = [('assign^', ('assign',)), 12 | ('assign', ('ID', 'EQ', 'expr')), 13 | ('expr', ('NUM',)), 14 | ('expr', ('ID',)), 15 | ('expr', ('expr', 'ADD', 'expr')), 16 | ('expr', ('expr', 'MUL', 'expr')), 17 | ('expr', ('expr', 'POW', 'expr'))] 18 | 19 | ACTION = 
[{'ID': ('shift', 2)}, 20 | {'\x03': ('accept', 0)}, 21 | {'EQ': ('shift', 3)}, 22 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 23 | {'\x03': ('reduce', 1), 24 | 'ADD': ('shift', 7), 25 | 'MUL': ('shift', 8), 26 | 'POW': ('shift', 9)}, 27 | {'\x03': ('reduce', 2), 28 | 'ADD': ('reduce', 2), 29 | 'MUL': ('reduce', 2), 30 | 'POW': ('reduce', 2)}, 31 | {'\x03': ('reduce', 3), 32 | 'ADD': ('reduce', 3), 33 | 'MUL': ('reduce', 3), 34 | 'POW': ('reduce', 3)}, 35 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 36 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 37 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 38 | {'\x03': ('reduce', 4), 39 | 'ADD': ('reduce', 4), 40 | 'MUL': ('shift', 8), 41 | 'POW': ('shift', 9)}, 42 | {'\x03': ('reduce', 5), 43 | 'ADD': ('reduce', 5), 44 | 'MUL': ('reduce', 5), 45 | 'POW': ('shift', 9)}, 46 | {'\x03': ('reduce', 6), 47 | 'ADD': ('reduce', 6), 48 | 'MUL': ('reduce', 6), 49 | 'POW': ('reduce', 6)}] 50 | 51 | GOTO = [{'ID': 2, 'assign': 1}, 52 | {}, 53 | {'EQ': 3}, 54 | {'ID': 6, 'NUM': 5, 'expr': 4}, 55 | {'ADD': 7, 'MUL': 8, 'POW': 9}, 56 | {}, 57 | {}, 58 | {'ID': 6, 'NUM': 5, 'expr': 10}, 59 | {'ID': 6, 'NUM': 5, 'expr': 11}, 60 | {'ID': 6, 'NUM': 5, 'expr': 12}, 61 | {'ADD': 7, 'MUL': 8, 'POW': 9}, 62 | {'ADD': 7, 'MUL': 8, 'POW': 9}, 63 | {'ADD': 7, 'MUL': 8, 'POW': 9}] 64 | 65 | semans = [b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 66 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x01x' 67 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfa8c:\\Users\\Shellay\\Documents\\GitHu' 68 | b'b\\metaparse\\metaparse.py\xda\x08identity1\x00\x00\x00s\x02\x00\x00\x00\x00' 69 | b'\x01', 70 | b'\xe3\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00' 71 | b'\x00C\x00\x00\x00s\x0e\x00\x00\x00|\x02\x00t\x00\x00|\x00\x00<|\x02\x00S' 72 | b')\x01N)\x01\xda\x07context)\x03\xda\x02ID\xda\x02EQ\xda\x04expr\xa9\x00' 73 | b'r\x05\x00\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/metaparse/examples/' 74 | b'eg_demo.py\xda\x06assign\x19\x00\x00\x00s\x04\x00\x00\x00\x00\x01\n\x01', 75 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00' 76 | b'\x00C\x00\x00\x00s\n\x00\x00\x00t\x00\x00|\x00\x00\x83\x01\x00S)\x01N)' 77 | b'\x01\xda\x03int)\x01\xda\x03NUM\xa9\x00r\x03\x00\x00\x00\xfa?c:/Users/Shell' 78 | b'ay/Documents/GitHub/metaparse/examples/eg_demo.py\xda\x04expr\x1d' 79 | b'\x00\x00\x00s\x02\x00\x00\x00\x00\x01', 80 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00' 81 | b'\x00C\x00\x00\x00s\x08\x00\x00\x00t\x00\x00|\x00\x00\x19S)\x01N)\x01\xda' 82 | b'\x07context)\x01\xda\x02ID\xa9\x00r\x03\x00\x00\x00\xfa?c:/Users/Shellay/' 83 | b'Documents/GitHub/metaparse/examples/eg_demo.py\xda\x04expr \x00\x00\x00' 84 | b's\x02\x00\x00\x00\x00\x01', 85 | b'\xe3\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00' 86 | b'\x00C\x00\x00\x00s\x08\x00\x00\x00|\x00\x00|\x02\x00\x17S)\x01N\xa9\x00)' 87 | b'\x03\xda\x06expr_1\xda\x03ADD\xda\x06expr_2r\x01\x00\x00\x00r' 88 | b'\x01\x00\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/metaparse/examples/e' 89 | b'g_demo.py\xda\x04expr#\x00\x00\x00s\x02\x00\x00\x00\x00\x01', 90 | b'\xe3\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00' 91 | b'\x00C\x00\x00\x00s\x08\x00\x00\x00|\x00\x00|\x02\x00\x14S)\x01N\xa9\x00)' 92 | b'\x03\xda\x04expr\xda\x03MUL\xda\x06expr_1r\x01\x00\x00\x00r\x01\x00' 93 | b'\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/metaparse/examples/eg_demo' 94 | b'.pyr\x02\x00\x00\x00&\x00\x00\x00s\x02\x00\x00\x00\x00\x01', 95 | 
b'\xe3\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00' 96 | b'\x00C\x00\x00\x00s\x08\x00\x00\x00|\x00\x00|\x02\x00\x13S)\x01N\xa9\x00)' 97 | b'\x03\xda\x04expr\xda\x03POW\xda\x06expr_1r\x01\x00\x00\x00r\x01\x00' 98 | b'\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/metaparse/examples/eg_demo' 99 | b'.pyr\x02\x00\x00\x00)\x00\x00\x00s\x02\x00\x00\x00\x00\x01'] 100 | -------------------------------------------------------------------------------- /tests/sexp_dump.py: -------------------------------------------------------------------------------- 1 | lex2pats = [('RIGHT', '\\)'), 2 | ('COMMA', ','), 3 | ('IGNORED', '%'), 4 | ('LEFT', '\\('), 5 | ('IGNORED', '\\s+'), 6 | ('SYMBOL', '[_a-zA-Z]\\w*'), 7 | ('UNKNOWN', '&'), 8 | ('NUMBER', '[1-9]\\d*(\\.\\d*)?')] 9 | 10 | handlers = [None, 11 | None, 12 | None, 13 | None, 14 | None, 15 | None, 16 | None, 17 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00' 18 | b'\x00C\x00\x00\x00s\n\x00\x00\x00t\x00\x00|\x00\x00\x83\x01\x00S)\x01N)' 19 | b'\x01\xda\x03int)\x01\xda\x03val\xa9\x00r\x03\x00\x00\x00\xfa?c:/Users/Shell' 20 | b'ay/Documents/GitHub/metaparse/tests/test_basic.py\xda\x01_h\x00\x00\x00' 21 | b's\x02\x00\x00\x00\x00\x02'] 22 | 23 | rules = [('sexp^', ('sexp',)), 24 | ('sexp', ('atom',)), 25 | ('sexp', ('LEFT', 'slist', 'RIGHT')), 26 | ('slist', ()), 27 | ('slist', ('slist', 'sexp')), 28 | ('atom', ('NUMBER',)), 29 | ('atom', ('SYMBOL',))] 30 | 31 | ACTION = [{'LEFT': ('shift', 3), 'NUMBER': ('shift', 4), 'SYMBOL': ('shift', 5)}, 32 | {'\x03': ('accept', 0)}, 33 | {'\x03': ('reduce', 1), 34 | 'LEFT': ('reduce', 1), 35 | 'NUMBER': ('reduce', 1), 36 | 'RIGHT': ('reduce', 1), 37 | 'SYMBOL': ('reduce', 1)}, 38 | {'LEFT': ('reduce', 3), 39 | 'NUMBER': ('reduce', 3), 40 | 'RIGHT': ('reduce', 3), 41 | 'SYMBOL': ('reduce', 3)}, 42 | {'\x03': ('reduce', 5), 43 | 'LEFT': ('reduce', 5), 44 | 'NUMBER': ('reduce', 5), 45 | 'RIGHT': ('reduce', 5), 46 | 'SYMBOL': ('reduce', 5)}, 47 | {'\x03': ('reduce', 6), 48 | 'LEFT': ('reduce', 6), 49 | 'NUMBER': ('reduce', 6), 50 | 'RIGHT': ('reduce', 6), 51 | 'SYMBOL': ('reduce', 6)}, 52 | {'LEFT': ('shift', 3), 53 | 'NUMBER': ('shift', 4), 54 | 'RIGHT': ('shift', 7), 55 | 'SYMBOL': ('shift', 5)}, 56 | {'\x03': ('reduce', 2), 57 | 'LEFT': ('reduce', 2), 58 | 'NUMBER': ('reduce', 2), 59 | 'RIGHT': ('reduce', 2), 60 | 'SYMBOL': ('reduce', 2)}, 61 | {'LEFT': ('reduce', 4), 62 | 'NUMBER': ('reduce', 4), 63 | 'RIGHT': ('reduce', 4), 64 | 'SYMBOL': ('reduce', 4)}] 65 | 66 | GOTO = [{'LEFT': 3, 'NUMBER': 4, 'SYMBOL': 5, 'atom': 2, 'sexp': 1}, 67 | {}, 68 | {}, 69 | {'slist': 6}, 70 | {}, 71 | {}, 72 | {'LEFT': 3, 'NUMBER': 4, 'RIGHT': 7, 'SYMBOL': 5, 'atom': 2, 'sexp': 8}, 73 | {}, 74 | {}] 75 | 76 | semans = [b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 77 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x01x' 78 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfa8c:\\Users\\Shellay\\Documents\\GitHu' 79 | b'b\\metaparse\\metaparse.py\xda\x08identity1\x00\x00\x00s\x02\x00\x00\x00\x00' 80 | b'\x01', 81 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 82 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x04atomr' 83 | b'\x01\x00\x00\x00r\x01\x00\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/met' 84 | b'aparse/tests/test_basic.py\xda\x04sexpl\x00\x00\x00s\x02\x00\x00' 85 | b'\x00\x00\x02', 86 | b'\xe3\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x01\x00\x00' 87 | 
b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x01\x00S)\x01N\xa9\x00)\x03\xda\x04L' 88 | b'EFT\xda\x05slist\xda\x05RIGHTr\x01\x00\x00\x00r\x01\x00\x00\x00\xfa?c:/User' 89 | b's/Shellay/Documents/GitHub/metaparse/tests/test_basic.py\xda\x04sexpo\x00' 90 | b'\x00\x00s\x02\x00\x00\x00\x00\x02', 91 | b'\xe3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00' 92 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00g\x00\x00S)\x01N\xa9\x00r\x01\x00\x00\x00' 93 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/me' 94 | b'taparse/tests/test_basic.py\xda\x05slistt\x00\x00\x00s\x02\x00\x00\x00\x00' 95 | b'\x02', 96 | b'\xe3\x02\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00' 97 | b'\x00C\x00\x00\x00s\x11\x00\x00\x00|\x00\x00j\x00\x00|\x01\x00\x83' 98 | b'\x01\x00\x01|\x00\x00S)\x01N)\x01\xda\x06append)\x02\xda\x05slist\xda\x04s' 99 | b'exp\xa9\x00r\x04\x00\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/metapa' 100 | b'rse/tests/test_basic.pyr\x02\x00\x00\x00w\x00\x00\x00s\x04\x00\x00' 101 | b'\x00\x00\x02\r\x01', 102 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 103 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x06NUMBE' 104 | b'Rr\x01\x00\x00\x00r\x01\x00\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/m' 105 | b'etaparse/tests/test_basic.py\xda\x04atom|\x00\x00\x00s\x02\x00\x00\x00\x00' 106 | b'\x02', 107 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 108 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x06SYMBO' 109 | b'Lr\x01\x00\x00\x00r\x01\x00\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/m' 110 | b'etaparse/tests/test_basic.py\xda\x04atom\x7f\x00\x00\x00s\x02' 111 | b'\x00\x00\x00\x00\x02'] 112 | -------------------------------------------------------------------------------- /examples/eg_dumps_file.py: -------------------------------------------------------------------------------- 1 | lex2pats = [('IGNORED', '\\s+'), 2 | ('EQ', '='), 3 | ('NUM', '[1-9]\\d*'), 4 | ('ID', '[_a-zA-Z]\\w*'), 5 | ('POW', '\\*\\*'), 6 | ('MUL', '\\*'), 7 | ('ADD', '\\+')] 8 | 9 | handlers = [None, 10 | None, 11 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00' 12 | b'\x00C\x00\x00\x00s\n\x00\x00\x00t\x00\x00|\x00\x00\x83\x01\x00S)\x01N)' 13 | b'\x01\xda\x05float)\x01\xda\x03lex\xa9\x00r\x03\x00\x00\x00\xfaVc:/Users/She' 14 | b'llay/Documents/GitHub/metaparse/experiments/lessparse/examples/eg_dumps.' 
15 | b'py\xda\x03NUM\x0c\x00\x00\x00s\x02\x00\x00\x00\x00\x01', 16 | None, 17 | None, 18 | None, 19 | None] 20 | 21 | rules = [('assign^', ('assign',)), 22 | ('assign', ('ID', 'EQ', 'expr')), 23 | ('expr', ('NUM',)), 24 | ('expr', ('ID',)), 25 | ('expr', ('expr', 'ADD', 'expr')), 26 | ('expr', ('expr', 'MUL', 'expr')), 27 | ('expr', ('expr', 'POW', 'expr'))] 28 | 29 | ACTION1 = [{'ID': ('shift', 2)}, 30 | {'\x03': ('reduce', 0)}, 31 | {'EQ': ('shift', 3)}, 32 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 33 | {'\x03': ('reduce', 1), 34 | 'ADD': ('shift', 7), 35 | 'MUL': ('shift', 8), 36 | 'POW': ('shift', 9)}, 37 | {'\x03': ('reduce', 2), 38 | 'ADD': ('reduce', 2), 39 | 'MUL': ('reduce', 2), 40 | 'POW': ('reduce', 2)}, 41 | {'\x03': ('reduce', 3), 42 | 'ADD': ('reduce', 3), 43 | 'MUL': ('reduce', 3), 44 | 'POW': ('reduce', 3)}, 45 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 46 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 47 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 48 | {'\x03': ('reduce', 4), 49 | 'ADD': ('reduce', 4), 50 | 'MUL': ('shift', 8), 51 | 'POW': ('shift', 9)}, 52 | {'\x03': ('reduce', 5), 53 | 'ADD': ('reduce', 5), 54 | 'MUL': ('reduce', 5), 55 | 'POW': ('shift', 9)}, 56 | {'\x03': ('reduce', 6), 57 | 'ADD': ('reduce', 6), 58 | 'MUL': ('reduce', 6), 59 | 'POW': ('reduce', 6)}] 60 | 61 | GOTO = [{'ID': 2, 'assign': 1}, 62 | {}, 63 | {'EQ': 3}, 64 | {'ID': 6, 'NUM': 5, 'expr': 4}, 65 | {'ADD': 7, 'MUL': 8, 'POW': 9}, 66 | {}, 67 | {}, 68 | {'ID': 6, 'NUM': 5, 'expr': 10}, 69 | {'ID': 6, 'NUM': 5, 'expr': 11}, 70 | {'ID': 6, 'NUM': 5, 'expr': 12}, 71 | {'ADD': 7, 'MUL': 8, 'POW': 9}, 72 | {'ADD': 7, 'MUL': 8, 'POW': 9}, 73 | {'ADD': 7, 'MUL': 8, 'POW': 9}] 74 | 75 | semans = [b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 76 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x01x' 77 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfaNc:\\Users\\Shellay\\Documents\\GitHu' 78 | b'b\\metaparse\\experiments\\lessparse\\metaparse.py\xda\x08identity' 79 | b'+\x00\x00\x00s\x02\x00\x00\x00\x00\x01', 80 | b'\xe3\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00' 81 | b'\x00C\x00\x00\x00s\x0e\x00\x00\x00|\x02\x00t\x00\x00|\x00\x00' # + TWN 51 | COMMA = r',' # + TW 52 | SEMI = r';' # + TW 53 | L1 = r'\(' # + TW 54 | R1 = r'\)' # + TW 55 | 56 | VALUE = r'\d+' # + TW 57 | VAR = r'[_a-z]\w*' # + TW 58 | CONS = r'[A-Z]\w*' # + TW 59 | 60 | INFIX = r'[\+\-\*\/]' # + TW 61 | 62 | def prog(binds): 63 | return binds 64 | 65 | # Stand-alone expression 66 | def exprx(expr): 67 | return expr 68 | def exprx(let): 69 | return let 70 | def exprx(abst): 71 | return abst 72 | def exprx(appl): 73 | return appl 74 | 75 | # Atomic expression 76 | def expr(VALUE): 77 | return float(VALUE) 78 | def expr(VAR): 79 | return VAR 80 | def expr(L1, exprx, R1): 81 | return exprx 82 | 83 | # Pattern 84 | def pat(VAR): 85 | return VAR 86 | def pat(CONS, arglist): 87 | return (CONS, arglist) 88 | def arglist(arglist, expr): 89 | return arglist + (expr,) 90 | def arglist(): 91 | return () 92 | 93 | # Application (Curried) 94 | def appl(expr_1, expr_2): 95 | return Appl(expr_1, expr_2) 96 | def appl(appl, expr): 97 | return Appl(appl, expr) 98 | def appl(expr_1, INFIX, expr_2): 99 | return Appl(INFIX, expr_1, expr_2) 100 | 101 | # Lambda-Abstraction (also Curried) 102 | def abst(LAMBDA, parlist, ARROW, exprx): 103 | tar = exprx 104 | for par in reversed(parlist): 105 | tar = Abst(par, tar) 106 | return tar 107 | def parlist(VAR): 108 | return [VAR] 109 | def parlist(parlist, 
COMMA, VAR): 110 | return [*parlist, VAR] 111 | 112 | # Let-expression with environmental bindings 113 | def let(LET, binds, IN, exprx): 114 | return Let(binds, exprx) 115 | def bind(pat, EQ, exprx): 116 | return {pat: exprx} 117 | def binds(bind): 118 | return bind 119 | def binds(binds, SEMI, bind): 120 | return {**binds, **bind} 121 | 122 | # def _env(): 123 | # print('Env!') 124 | 125 | # def _unify(): 126 | # print('Unify!') 127 | 128 | 129 | # Test whether the grammar is LALR to exclude potential ambiguity 130 | # and prepare for better performance 131 | psr_lalr = Lam 132 | 133 | 134 | inp = """ 135 | 136 | k = let 137 | a = 3 ; 138 | P p q = u v 139 | in 140 | map (\c, d -> f c d) xs ys ; 141 | 142 | l = 3 ; 143 | m = 4 144 | """ 145 | 146 | # r = psr_gll.parse_many(inp) 147 | # r = psr_glr.parse_many(inp) 148 | # print(r) 149 | 150 | # assert 0 151 | 152 | inp = """ 153 | k = let a = 3 ; 154 | P q = u v # 155 | !in $$ 156 | map (\c, d -> f c d) xs ys 157 | """ 158 | 159 | 160 | # print(Lam) 161 | # psr_gll.interpret(inp) # LEFT-RECURSION!!!! 162 | # psr_glr.interpret(inp) 163 | # psr_lalr.interpret(inp) 164 | 165 | psr = psr_lalr 166 | # psr = psr_glr 167 | # psr = psr_ear 168 | 169 | tough_inp = ' ;\n'.join([inp for _ in range(10)]) 170 | # tough_inp = ' ;\n'.join([inp for _ in range(100)]) 171 | 172 | # pp.pprint(list(psr.grammar.tokenize(inp, False))) 173 | # pp.pprint(psr.interpret_many(inp)) 174 | # print(len(psr.ACTION)) 175 | # pp.pprint(psr.ACTION) 176 | 177 | pp.pprint(psr.interpret(tough_inp)) 178 | 179 | s = psr.dumps() 180 | psr1 = psr.loads(s, globals()) 181 | # timeit psr.loads(s, globals()) 182 | # timeit LALR(Lam) 183 | 184 | pp.pprint(psr1.interpret(tough_inp)) 185 | 186 | 187 | # assert psr_glr.interpret_many(tough_inp)[0] == psr1.interpret(tough_inp) 188 | assert psr.interpret(tough_inp) == psr1.interpret(tough_inp) 189 | -------------------------------------------------------------------------------- /experiments/LL.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import * 3 | 4 | @meta 5 | class WLL1(ParserDeterm): 6 | """Weak-LL(1)-Parser. 7 | 8 | Since 'strong'-LL(1) grammar parser includes the usage of FOLLOW 9 | set, which is only heuristically helpful for the recognitive 10 | capability when handling NULLABLE rules, this parser suppress the 11 | need of FOLLOW. 12 | 13 | When deducing a NULLABLE nonterminal A with some lookahead a, if a 14 | does not belong to any FIRST of A's alternatives, then the NULL 15 | alternative is chosen. In other words, all terminals not in 16 | FIRST(A) leads to the prediction (as well as immediate reduction) 17 | of (A -> ε) in the predictive table. 18 | 19 | This variation allows predicting (A -> ε) even when lookahead a is 20 | not in FOLLOW, which means this parser will postpone the 21 | recognition error compared to strong-LL(1) parser. 22 | 23 | """ 24 | 25 | def __init__(self, grammar): 26 | self.grammar = grammar 27 | self.lexer = Lexer.from_grammar(grammar) 28 | self.semans = grammar.semans 29 | self._calc_ll1_table() 30 | 31 | def _calc_ll1_table(self): 32 | G = self.grammar 33 | table = self.table = {} 34 | for r, rule in enumerate(G.rules): 35 | lhs, rhs = rule 36 | if lhs not in table: 37 | table[lhs] = {} 38 | # NON-NULL rule 39 | if rhs: 40 | for a in G.first_of_seq(rhs, EPSILON): 41 | if a is EPSILON: 42 | pass 43 | elif a in table[lhs]: 44 | raise GrammarError('Not simple LL(1) grammar! 
') 45 | else: 46 | table[lhs][a] = rule 47 | # NULL rule 48 | # This rule tends to be tried when 49 | # the lookahead doesn't appear in 50 | # other sibling rules. 51 | else: 52 | pass 53 | 54 | def parse(self, inp, interp=False): 55 | """The process is exactly the `translate' process of a ParseTree. 56 | 57 | """ 58 | # Backtracking is yet supported 59 | # Each choice should be deterministic 60 | push = list.append 61 | pop = list.pop 62 | G = self.grammar 63 | pstack = self.pstack = [] 64 | table = self.table 65 | toker = enumerate(self.lexer.tokenize(inp, with_end=True)) 66 | pstack.append(G.rules[0].lhs) 67 | argstack = [] 68 | try: 69 | k, tok = next(toker) 70 | while pstack: 71 | actor = pop(pstack) 72 | at, look, tokval = tok 73 | # Reduction 74 | if isinstance(actor, Rule): 75 | args = [] 76 | # Pop the size of args, conclude subtree 77 | # for prediction made before 78 | for _ in actor.rhs: 79 | args.insert(0, pop(argstack)) 80 | if interp: 81 | arg1 = actor.seman(*args) 82 | else: 83 | arg1 = ParseTree(actor, args) 84 | # Finish - no prediction in stack 85 | # Should declare end-of-input 86 | if not pstack: 87 | return arg1 88 | else: 89 | push(argstack, arg1) 90 | # Make prediction on nonterminal 91 | elif actor in G.nonterminals: 92 | if look in table[actor]: 93 | pred = table[actor][look] 94 | # Singal for reduction 95 | push(pstack, pred) 96 | # Push symbols into prediction-stack, 97 | # last symbol first in. 98 | for x in reversed(pred.rhs): 99 | push(pstack, x) 100 | # !!! Heuristically do epsilon-reduction when no 101 | # viable lookahead found 102 | elif actor in G.NULLABLE: 103 | for r0 in G.rules: 104 | if r0.lhs == actor and not r0.rhs: 105 | if interp: 106 | argstack.append(r0.seman()) 107 | else: 108 | argstack.append(ParseTree(r0, [])) 109 | # Recognition failed, ignore 110 | else: 111 | raise ParserError('No production found.') 112 | # Try match terminal 113 | else: 114 | if actor == look: 115 | if interp: 116 | argstack.append(tokval) 117 | else: 118 | argstack.append(tok) 119 | k, tok = next(toker) 120 | 121 | except StopIteration: 122 | raise ParserError('No enough tokens to complete parsing.') 123 | 124 | -------------------------------------------------------------------------------- /examples/sexp_dump.py: -------------------------------------------------------------------------------- 1 | lex2pats = \ 2 | [('COMMA', ','), 3 | ('IGNORED', '%'), 4 | ('RIGHT', '\\)'), 5 | ('LEFT', '\\('), 6 | ('IGNORED', '\\s+'), 7 | ('SYMBOL', '[_a-zA-Z]\\w*'), 8 | ('UNKNOWN', '&'), 9 | ('NUMBER', '[1-9]\\d*(\\.\\d*)?')] 10 | 11 | handlers = \ 12 | [None, 13 | None, 14 | None, 15 | None, 16 | None, 17 | None, 18 | None, 19 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00' 20 | b'\x00C\x00\x00\x00s\n\x00\x00\x00t\x00\x00|\x00\x00\x83\x01\x00S)\x01N)' 21 | b'\x01\xda\x03int)\x01\xda\x03val\xa9\x00r\x03\x00\x00\x00\xfaOc:\\Users\\S' 22 | b'hellay\\Documents\\GitHub\\metaparse\\experiments\\lessparse\\test_basic.p' 23 | b'y\xda\x01_g\x00\x00\x00s\x02\x00\x00\x00\x00\x02'] 24 | 25 | rules = \ 26 | [('sexp^', ('sexp',)), 27 | ('sexp', ('atom',)), 28 | ('sexp', ('LEFT', 'slist', 'RIGHT')), 29 | ('slist', ()), 30 | ('slist', ('slist', 'sexp')), 31 | ('atom', ('NUMBER',)), 32 | ('atom', ('SYMBOL',))] 33 | 34 | ACTION1 = \ 35 | [{'LEFT': ('shift', 3), 'NUMBER': ('shift', 4), 'SYMBOL': ('shift', 5)}, 36 | {'\x03': ('reduce', 0)}, 37 | {'\x03': ('reduce', 1), 38 | 'LEFT': ('reduce', 1), 39 | 'NUMBER': ('reduce', 1), 40 | 'RIGHT': ('reduce', 1), 41 | 'SYMBOL': 
('reduce', 1)}, 42 | {'LEFT': ('reduce', 3), 43 | 'NUMBER': ('reduce', 3), 44 | 'RIGHT': ('reduce', 3), 45 | 'SYMBOL': ('reduce', 3)}, 46 | {'\x03': ('reduce', 5), 47 | 'LEFT': ('reduce', 5), 48 | 'NUMBER': ('reduce', 5), 49 | 'RIGHT': ('reduce', 5), 50 | 'SYMBOL': ('reduce', 5)}, 51 | {'\x03': ('reduce', 6), 52 | 'LEFT': ('reduce', 6), 53 | 'NUMBER': ('reduce', 6), 54 | 'RIGHT': ('reduce', 6), 55 | 'SYMBOL': ('reduce', 6)}, 56 | {'LEFT': ('shift', 3), 57 | 'NUMBER': ('shift', 4), 58 | 'RIGHT': ('shift', 7), 59 | 'SYMBOL': ('shift', 5)}, 60 | {'\x03': ('reduce', 2), 61 | 'LEFT': ('reduce', 2), 62 | 'NUMBER': ('reduce', 2), 63 | 'RIGHT': ('reduce', 2), 64 | 'SYMBOL': ('reduce', 2)}, 65 | {'LEFT': ('reduce', 4), 66 | 'NUMBER': ('reduce', 4), 67 | 'RIGHT': ('reduce', 4), 68 | 'SYMBOL': ('reduce', 4)}] 69 | 70 | GOTO = \ 71 | [{'LEFT': 3, 'NUMBER': 4, 'SYMBOL': 5, 'atom': 2, 'sexp': 1}, 72 | {}, 73 | {}, 74 | {'slist': 6}, 75 | {}, 76 | {}, 77 | {'LEFT': 3, 'NUMBER': 4, 'RIGHT': 7, 'SYMBOL': 5, 'atom': 2, 'sexp': 8}, 78 | {}, 79 | {}] 80 | 81 | semans = \ 82 | [b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 83 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x01x' 84 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfaNc:\\Users\\Shellay\\Documents\\GitHu' 85 | b'b\\metaparse\\experiments\\lessparse\\metaparse.py\xda\x08identity' 86 | b'*\x00\x00\x00s\x02\x00\x00\x00\x00\x01', 87 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 88 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x04atomr' 89 | b'\x01\x00\x00\x00r\x01\x00\x00\x00\xfaOc:\\Users\\Shellay\\Documents\\GitHub' 90 | b'\\metaparse\\experiments\\lessparse\\test_basic.py\xda\x04sexpk\x00\x00\x00' 91 | b's\x02\x00\x00\x00\x00\x02', 92 | b'\xe3\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x01\x00\x00' 93 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x01\x00S)\x01N\xa9\x00)\x03\xda\x04L' 94 | b'EFT\xda\x05slist\xda\x05RIGHTr\x01\x00\x00\x00r\x01\x00\x00\x00\xfaOc:\\User' 95 | b's\\Shellay\\Documents\\GitHub\\metaparse\\experiments\\lessparse\\test_basi' 96 | b'c.py\xda\x04sexpn\x00\x00\x00s\x02\x00\x00\x00\x00\x02', 97 | b'\xe3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00' 98 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00g\x00\x00S)\x01N\xa9\x00r\x01\x00\x00\x00' 99 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfaOc:\\Users\\Shellay\\Documents\\GitHu' 100 | b'b\\metaparse\\experiments\\lessparse\\test_basic.py\xda\x05slists\x00' 101 | b'\x00\x00s\x02\x00\x00\x00\x00\x02', 102 | b'\xe3\x02\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00' 103 | b'\x00C\x00\x00\x00s\x11\x00\x00\x00|\x00\x00j\x00\x00|\x01\x00\x83' 104 | b'\x01\x00\x01|\x00\x00S)\x01N)\x01\xda\x06append)\x02\xda\x05slist\xda\x04s' 105 | b'exp\xa9\x00r\x04\x00\x00\x00\xfaOc:\\Users\\Shellay\\Documents\\GitHub\\me' 106 | b'taparse\\experiments\\lessparse\\test_basic.pyr\x02\x00\x00\x00v\x00\x00\x00' 107 | b's\x04\x00\x00\x00\x00\x02\r\x01', 108 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 109 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x06NUMBE' 110 | b'Rr\x01\x00\x00\x00r\x01\x00\x00\x00\xfaOc:\\Users\\Shellay\\Documents\\GitH' 111 | b'ub\\metaparse\\experiments\\lessparse\\test_basic.py\xda\x04atom{\x00' 112 | b'\x00\x00s\x02\x00\x00\x00\x00\x02', 113 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 114 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x06SYMBO' 115 | 
b'Lr\x01\x00\x00\x00r\x01\x00\x00\x00\xfaOc:\\Users\\Shellay\\Documents\\GitH' 116 | b'ub\\metaparse\\experiments\\lessparse\\test_basic.py\xda\x04atom~\x00' 117 | b'\x00\x00s\x02\x00\x00\x00\x00\x02'] 118 | -------------------------------------------------------------------------------- /examples/eg_dumps_direct_use.py: -------------------------------------------------------------------------------- 1 | from eg_dumps_file import * 2 | 3 | import re 4 | import types 5 | import marshal 6 | import warnings 7 | import pprint as pp 8 | 9 | from collections import namedtuple, deque 10 | 11 | lex2rgxs = [(lex, re.compile(pat)) for lex, pat in lex2pats] 12 | 13 | lex_handlers = { 14 | name: types.FunctionType(marshal.loads(src), globals()) 15 | for name, src in lex_handler_sources.items() 16 | } 17 | 18 | semans = [ 19 | types.FunctionType(marshal.loads(src), globals()) 20 | for src in seman_sources 21 | ] 22 | 23 | Rule = namedtuple('Rule', 'lhs rhs') 24 | Rule.__repr__ = lambda s: '({} = {})'.format(s.lhs, ' '.join(s.rhs)) 25 | 26 | Item = namedtuple('Item', 'rule pos') 27 | Item.__repr__ = lambda s: '({} = {}.{})'.format(s.rule.lhs, 28 | ' '.join(s.rule.rhs[:s.pos]), 29 | ' '.join(s.rule.rhs[s.pos:])) 30 | 31 | rules = [Rule(l, r) for l, r in rules] 32 | Ks = [[Item(rules[r], pos) for r, pos in K] for K in Ks] 33 | 34 | 35 | Token = namedtuple('Token', 'at symbol lexeme value') 36 | Token.__repr__ = lambda s: '({} = {})'.format(s.symbol, repr(s.value)) 37 | 38 | 39 | 40 | def tokenize(inp, with_end=True): 41 | 42 | pos = 0 43 | while pos < len(inp): 44 | # raw string match 45 | raw_match = False 46 | # re match 47 | n = None 48 | m = None 49 | for cat, rgx in lex2rgxs: 50 | # raw 51 | if rgx is None: 52 | if inp.startswith(cat, pos): 53 | yield Token(pos, cat, cat, cat) 54 | pos += len(cat) 55 | raw_match = True 56 | break 57 | # re 58 | else: 59 | m = rgx.match(inp, pos=pos) 60 | # The first match with non-zero length is yielded. 61 | if m and len(m.group()) > 0: 62 | n = cat 63 | break 64 | if raw_match: 65 | continue 66 | elif m: 67 | assert isinstance(n, str) 68 | if n == 'IGNORED': 69 | # Need IGNORED handler? 70 | at, pos = m.span() 71 | elif n == 'ERROR': 72 | # Call ERROR handler! 73 | at, pos = m.span() 74 | lxm = m.group() 75 | if 'ERROR' in lex_handlers: 76 | # Suppress error token and call handler. 77 | lex_handlers[ERROR](lxm) 78 | # yield Token(at, ERROR, lxm, h(lxm)) 79 | else: 80 | # Yield error token when no handler available. 81 | yield Token(at, ERROR, lxm, lxm) 82 | else: 83 | at, pos = m.span() 84 | lxm = m.group() 85 | if n in lex_handlers: 86 | # Call normal token handler. 87 | h = lex_handlers[n] 88 | # Bind semantic value. 89 | yield Token(at, n, lxm, h(lxm)) 90 | else: 91 | yield Token(at, n, lxm, lxm) 92 | else: 93 | # Report unrecognized Token here! 
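            # None of the raw or regex patterns matched at the current
            # position, so tokenization cannot proceed; the message below
            # echoes the input consumed so far to help locate the offending
            # character before raising.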
94 | msg = '\n'.join([ 95 | '', 96 | '=========================', 97 | 'No defined pattern starts with char `{}` @{}'.format(inp[pos], pos), 98 | '', 99 | '* Consumed input: ', 100 | repr(inp[:pos]), 101 | '=========================', 102 | '', 103 | ]) 104 | raise GrammarError(msg) 105 | if with_end: 106 | yield Token(pos, 'END', None, None) 107 | 108 | 109 | def parse(inp, interp=False, n_warns=5): 110 | 111 | trees = [] 112 | sstack = [0] 113 | 114 | toker = tokenize(inp, with_end=True) # Use END to force finishing by ACCEPT 115 | tok = next(toker) 116 | warns = [] 117 | 118 | try: 119 | while 1: 120 | 121 | # Peek state 122 | s = sstack[-1] 123 | 124 | if tok.symbol not in ACTION[s]: 125 | msg = '\n'.join([ 126 | '', 127 | 'WARNING: ', 128 | 'LALR - Ignoring syntax error reading Token {}'.format(tok), 129 | '- Current kernel derivation stack:', 130 | pp.pformat([Ks[i] for i in sstack]), 131 | '- Expecting tokens and actions:', 132 | pp.pformat(ACTION[s]), 133 | '- But got: \n{}'.format(tok), 134 | '', 135 | ]) 136 | warnings.warn(msg) 137 | warns.append(msg) 138 | if len(warns) == n_warns: 139 | raise ValueError( 140 | 'Warning tolerance {} reached. Parsing exited.'.format(n_warns)) 141 | else: 142 | tok = next(toker) 143 | 144 | else: 145 | act, arg = ACTION[s][tok.symbol] 146 | 147 | # SHIFT 148 | if act == 'SHIFT': 149 | if interp: 150 | trees.append(tok.value) 151 | else: 152 | trees.append(tok) 153 | sstack.append(GOTO[s][tok.symbol]) 154 | # Go on scanning 155 | tok = next(toker) 156 | 157 | # REDUCE 158 | elif act == 'REDUCE': 159 | assert isinstance(arg, int) 160 | rule = lhs, rhs = rules[arg] 161 | seman = semans[arg] 162 | subts = deque() 163 | for _ in rhs: 164 | subt = trees.pop() 165 | subts.appendleft(subt) 166 | sstack.pop() 167 | if interp: 168 | tree = seman(*subts) 169 | else: 170 | tree = ((rule, seman), list(subts)) 171 | trees.append(tree) 172 | sstack.append(GOTO[sstack[-1]][lhs]) 173 | 174 | # ACCEPT 175 | elif act == 'ACCEPT': 176 | # Reduce the top semantics. 177 | assert isinstance(arg, int), arg 178 | rule = rules[arg] 179 | seman = semans[arg] 180 | if interp: 181 | return seman(*trees) 182 | else: 183 | assert len(trees) == 1 184 | return trees[0] 185 | else: 186 | raise ValueError('Invalid action {} on {}'.format(act, arg)) 187 | 188 | except StopIteration: 189 | raise ValueError('No enough tokens for completing the parse. ') 190 | 191 | 192 | def interpret(inp): 193 | return parse(inp, interp=True) 194 | 195 | table = {} 196 | 197 | inp = 'x = 1 + 2 7 ** 3 * 5 + 9' 198 | 199 | ts = list(tokenize(inp)) 200 | pp.pprint(ts) 201 | 202 | r = interpret(inp) 203 | 204 | pp.pprint(table) 205 | pp.pprint(r) 206 | -------------------------------------------------------------------------------- /experiments/peg.py: -------------------------------------------------------------------------------- 1 | # Experimental implementation for Parser Expression Grammar, 2 | # represented by EBNF-like notation. 
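#
# A rough usage sketch, based on the definitions below: a grammar is a
# plain dict mapping nonterminal names to expression objects (Terminal,
# Seq, Alt, Star, Opt), and parse(G, start, inp) returns a pair
# (parse_tree, remaining_input), or FAIL = (None, None) when nothing
# matches.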
3 | 4 | import re 5 | 6 | from collections import namedtuple as data 7 | 8 | Rule = data('Rule', 'lhs rhs') 9 | 10 | # Expression is a superclass, which can be subclassed into 11 | # - Terminal 12 | # - Nonterminal 13 | # - Alternatves 14 | # - Sequence 15 | # - Repeated/Star 16 | # - Optional/Opt 17 | 18 | Expr = data('Symbol', 'symbol') 19 | 20 | Nonterminal = data('Nonterminal', 'symb') 21 | Nonterminal = str 22 | Terminal = data('Terminal', 'symb regexp') 23 | Star = data('Star', 'sub') 24 | Opt = data('Opt', 'sub') 25 | Plus = data('Plus', 'sub') 26 | Seq = data('Seq', 'subs') # Using python list rather than CONS structure. 27 | Alt = data('Alt', 'subs') # Using python list rather than CONS structure. 28 | Nil = None 29 | 30 | is_a = isinstance 31 | 32 | # Notes: 33 | 34 | # To allow parsing expressions to include Sequence and Kleene Closure, 35 | # there must be a corresponding sequenctial structure behaving as a 36 | # primitive construction of a parse tree's subtrees. Theoretical it 37 | # can be described as a monoid, which defines Unit(the empty) and 38 | # Append(operation of accumulating). 39 | 40 | # To represent the parsing result more simply, a parse result is 41 | # either a ([Tree], Inp) or a (Tree, inp), whereas the latter`s first 42 | # component can be represented as a singleton list. 43 | 44 | # data Result = ([Tree], String) | (Tree, String) | FAIL 45 | 46 | FAIL = (None, None) 47 | 48 | def parse(G, x, inp): 49 | if is_a(x, Terminal): 50 | return parse_terminal(G, x, inp) 51 | elif is_a(x, Nonterminal): 52 | sub, inp1 = parse(G, G[x], inp) 53 | if (sub, inp1) == FAIL: 54 | return FAIL 55 | else: 56 | # Make a parse tree of 1 Nonterminal. 57 | # return (x.symb, sub), inp1 58 | return (x, sub), inp1 59 | elif is_a(x, Alt): 60 | return parse_alts(G, x.subs, inp) 61 | elif is_a(x, Seq): 62 | return parse_seq(G, x.subs, inp) 63 | elif is_a(x, Star): 64 | return parse_star(G, x.sub, inp) 65 | elif is_a(x, Opt): 66 | return parse_opt(G, x.sub, inp) 67 | else: 68 | raise TypeError('{} is not an expression.'.format(x)) 69 | 70 | def parse_terminal(G, x: Terminal, inp: str): 71 | if not inp: 72 | return FAIL 73 | else: 74 | m = re.match(x.regexp, inp, re.MULTILINE) # Matching MULTILINE activated. 75 | if not m: 76 | return FAIL 77 | else: 78 | _, end = m.span() 79 | tokval = re.sub(r'\s+', '', inp[:end]) 80 | return (x.symb, tokval), inp[end:] 81 | 82 | def parse_alts(G, alts: [Expr], inp: str) -> (tuple, str): 83 | """May return a OR-tree here. Recall each parse tree is an AND-OR 84 | tree. 85 | 86 | """ 87 | pf = [] 88 | for a in alts: 89 | t, inp1 = parse(G, a, inp) 90 | if (t, inp1) != FAIL: 91 | return (t, inp1) 92 | return FAIL 93 | 94 | def parse_seq(G, subs: [Expr], inp: str) -> (tuple, str): 95 | ss = [] 96 | for sub in subs: 97 | (t1, inp1) = parse(G, sub, inp) 98 | if (t1, inp1) != FAIL: 99 | # See whether the result is list or atom. 100 | if isinstance(t1, list): 101 | # For parse_star, parse_opt, parse_seq the result is a 102 | # list. 103 | ss.extend(t1) 104 | else: 105 | # For parse_terminal the result is an atom. It is a 106 | # singleton list as parse forest per se!!! For parse, 107 | # parse_alts the result maybe either. 108 | ss.append(t1) 109 | inp = inp1 110 | else: 111 | return FAIL 112 | # May convert singleton list to single node. 113 | if len(ss) == 1: 114 | return ss[0], inp 115 | else: 116 | return ss, inp 117 | 118 | 119 | # Extended monoidic expressional structures. 
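# (parse_star and parse_opt return a plain, possibly empty Python list of
# subtrees rather than a single node; parse_seq above splices such list
# results into the enclosing sequence via extend, which roughly plays the
# role of the "Append" operation of the monoid described in the notes
# above.)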
120 | 121 | def parse_star(G, sub: Expr, inp: str) -> (tuple, str): 122 | 'sub is the expression enclosed by Star.' 123 | rep = [] 124 | while 1 and inp: 125 | t1, inp1 = parse(G, sub, inp) 126 | if (t1, inp1) != FAIL: 127 | rep.append(t1) 128 | inp = inp1 129 | else: 130 | break 131 | return rep, inp 132 | 133 | def parse_opt(G, sub: Expr, inp: str) -> (tuple, str): 134 | opt = [] 135 | t1, inp1 = parse(G, sub, inp) 136 | if (t1, inp1) != FAIL: 137 | opt.append(t1) 138 | inp = inp1 139 | return [], inp 140 | 141 | 142 | # i1 = Seq([Terminal('NUM', r'\d+'), Terminal('SPC', r'\s+'), Terminal('NUM', r'\d+')]) 143 | # i2 = Seq([Terminal('NUM', r'\d+'), Terminal('SPC', r'\s+'), Terminal('NUM', r'[A-Za-z_]\w+')]) 144 | # parse_seq(None, i1.subs, '123 456') 145 | # parse_seq(None, i2.subs, '123 456') 146 | 147 | # parse(None, i1, '123 456') 148 | # parse(None, i2, '123 456') 149 | G1 = {Nonterminal('E'): Seq([Nonterminal('T'), 150 | Star(Seq([Terminal('PLUS', r'\+'), Nonterminal('T')]))]), 151 | Nonterminal('T'): Seq([Nonterminal('F'), 152 | Star(Seq([Terminal('TIMES', r'\*'), Nonterminal('F')]))]), 153 | Nonterminal('F'): Terminal('NUM', r'\d+'), 154 | } 155 | 156 | 157 | # Bootstrapping grammar. 158 | SPCS = r'\s*' 159 | 160 | p_QUAL = r'[\?\*\+]' 161 | 162 | p_HEAD = r'^' + SPCS 163 | p_LEFT = r'\(' + SPCS 164 | p_RIGHT = r'\)' + SPCS 165 | p_SEMI = r';' + SPCS 166 | p_ALT1 = r'/' + SPCS 167 | p_ALT2 = r'\|' + SPCS 168 | p_ALT = r'[/\|]' + SPCS 169 | p_ARROW = r'(->|::=)' + SPCS 170 | p_SYMBOL = r'[^;/\(\)\|\?\*\+\s]+' + SPCS 171 | 172 | p_RIGHTQ = p_RIGHT + p_QUAL + r'?' + SPCS 173 | p_SYMBOLQ= p_SYMBOL + p_QUAL + r'?' + SPCS 174 | 175 | t_HEAD = Terminal("HEAD" , p_HEAD) 176 | t_LEFT = Terminal("LEFT" , p_LEFT) 177 | t_RIGHT = Terminal("RIGHT" , p_RIGHT) 178 | t_QUAL = Terminal("QUAL" , p_QUAL) 179 | t_SEMI = Terminal("SEMI" , p_SEMI) 180 | t_ALT1 = Terminal("ALT1" , p_ALT1) 181 | t_ALT2 = Terminal("ALT2" , p_ALT2) 182 | t_ALT = Terminal("ALT" , p_ALT) 183 | t_ARROW = Terminal("ARROW" , p_ARROW) 184 | t_SYMBOL = Terminal("SYMBOL", p_SYMBOL) 185 | t_RIGHTQ = Terminal("RIGHTQ", p_RIGHTQ) 186 | t_SYMBOLQ= Terminal("SYMBOLQ", p_SYMBOLQ) 187 | 188 | EBNF = { 189 | 'Rules': Star('Rule'), 190 | 'Rule': Seq(['LHS', t_ARROW, 'RHS']), 191 | 'LHS': t_SYMBOL, 192 | 'RHS': Seq(['Sequence', 193 | Star(Seq([t_ALT, 'Sequence'])), 194 | t_SEMI]), 195 | 'Sequence': Star('Expr'), 196 | 'Expr': Alt([t_SYMBOLQ, 197 | Seq([t_LEFT, 'Sequence', t_RIGHTQ])]), 198 | } 199 | 200 | parse(EBNF, t_SYMBOL, 'ab') 201 | parse(EBNF, t_SYMBOL, 'ab*') 202 | parse(EBNF, t_SYMBOLQ, 'ab') 203 | parse(EBNF, t_SYMBOLQ, 'ab*') 204 | parse(EBNF, t_SYMBOLQ, 'ab +') 205 | parse(EBNF, ('Expr'), 'ab*') 206 | parse(EBNF, ('Expr'), 'ab') 207 | parse(EBNF, ('Expr'), 'ab*') 208 | parse(EBNF, ('Expr'), 'ab +') 209 | parse(EBNF, ('Expr'), 'ab +;') 210 | parse(EBNF, ('Expr'), "(plus E)") 211 | parse(EBNF, ('Sequence'), 'ab + bc?;') 212 | parse(EBNF, ('RHS'), "T (+ E) ;") # Error, using preserved symbol '+' 213 | parse(EBNF, ('RHS'), "T (\+ E) ;") 214 | parse(EBNF, ('RHS'), "T (plus E) ;") 215 | parse(EBNF, ('RHS'), """T (plus E) | T; """) 216 | parse(EBNF, ('RHS'), "a \+ b | a \* b | a? - b; ") 217 | parse(EBNF, ('RHS'), "T plus E;") 218 | parse(EBNF, ('RHS'), "T (plus E) ;") 219 | parse(EBNF, ('Rule'), " -> a \+ b | a \* b | a? 
- b; ") 220 | parse(EBNF, ('Rule'), "E -> T (plus T);") 221 | res = parse(EBNF, Nonterminal('Rules'), """E -> T (plus T)*; 222 | T -> F (times F)*; 223 | F -> id;""") 224 | res = parse(EBNF, Nonterminal('Rules'), """Expr -> atom | left Expr* right; 225 | atom -> id; 226 | """) 227 | 228 | 229 | # Further functionalities: 230 | 231 | # - Detect left factor: Test whether two alternatives of a rule share 232 | # an identical FIRST token. 233 | 234 | # - Detect left recursion: Test whether cycle exits after exploring 235 | # derivation path. 236 | if __name__ == '__main__': 237 | import pprint as pp 238 | pp.pprint(res) 239 | -------------------------------------------------------------------------------- /experiments/meta_dumps.py: -------------------------------------------------------------------------------- 1 | ## This file is generated. Do not modify. 2 | 3 | ## Lexer$BEGIN 4 | 5 | lex2pats = \ 6 | [('NEG', '!'), 7 | ('CON', '&'), 8 | ('DIS', '\\|'), 9 | ('IMP', '->'), 10 | ('IFF', '<=>'), 11 | ('W', '[A-Z]\\w*'), 12 | ('True', None), 13 | ('False', None), 14 | ('(', None), 15 | (')', None), 16 | ('[', None), 17 | (']', None), 18 | ('IGNORED', '[ \\t\\n]'), 19 | ('ERROR', '.')] 20 | 21 | lex_handler_sources = \ 22 | {} 23 | 24 | ## Lexer$END 25 | 26 | 27 | ## Parser$BEGIN 28 | 29 | precedence = \ 30 | {'CON': 4, 'DIS': 3, 'IFF': 1, 'IMP': 2, 'NEG': 5} 31 | 32 | rules = \ 33 | [('Sentence^', ('Sentence',)), 34 | ('Sentence', ['Atomic']), 35 | ('Sentence', ['Complex']), 36 | ('Atomic', ['True']), 37 | ('Atomic', ['False']), 38 | ('Atomic', ['W']), 39 | ('Complex', ['(', 'Sentence', ')']), 40 | ('Complex', ['[', 'Sentence', ']']), 41 | ('Complex', ['NEG', 'Sentence']), 42 | ('Complex', ['Sentence', 'CON', 'Sentence']), 43 | ('Complex', ['Sentence', 'DIS', 'Sentence']), 44 | ('Complex', ['Sentence', 'IMP', 'Sentence']), 45 | ('Complex', ['Sentence', 'IFF', 'Sentence'])] 46 | 47 | seman_sources = \ 48 | [b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 49 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x01x' 50 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfa8c:\\users\\shellay\\documents\\githu' 51 | b'b\\metaparse\\metaparse.py\xda\x07id_func\x90\x00\x00\x00s\x02\x00' 52 | b'\x00\x00\x00\x01', 53 | None, 54 | None, 55 | None, 56 | None, 57 | None, 58 | None, 59 | None, 60 | None, 61 | None, 62 | None, 63 | None, 64 | None] 65 | 66 | Ks = \ 67 | [[(0, 0)], 68 | [(0, 1), (9, 1), (10, 1), (11, 1), (12, 1)], 69 | [(1, 1)], 70 | [(2, 1)], 71 | [(3, 1)], 72 | [(4, 1)], 73 | [(5, 1)], 74 | [(6, 1)], 75 | [(7, 1)], 76 | [(8, 1)], 77 | [(9, 2)], 78 | [(10, 2)], 79 | [(11, 2)], 80 | [(12, 2)], 81 | [(6, 2), (9, 1), (10, 1), (11, 1), (12, 1)], 82 | [(7, 2), (9, 1), (10, 1), (11, 1), (12, 1)], 83 | [(8, 2), (9, 1), (10, 1), (11, 1), (12, 1)], 84 | [(9, 1), (9, 3), (10, 1), (11, 1), (12, 1)], 85 | [(9, 1), (10, 1), (10, 3), (11, 1), (12, 1)], 86 | [(9, 1), (10, 1), (11, 1), (11, 3), (12, 1)], 87 | [(9, 1), (10, 1), (11, 1), (12, 1), (12, 3)], 88 | [(6, 3)], 89 | [(7, 3)]] 90 | 91 | ACTION = \ 92 | [{'(': ('SHIFT', 7), 93 | 'False': ('SHIFT', 5), 94 | 'NEG': ('SHIFT', 9), 95 | 'True': ('SHIFT', 4), 96 | 'W': ('SHIFT', 6), 97 | '[': ('SHIFT', 8)}, 98 | {'CON': ('SHIFT', 10), 99 | 'DIS': ('SHIFT', 11), 100 | 'END': ('ACCEPT', 0), 101 | 'IFF': ('SHIFT', 13), 102 | 'IMP': ('SHIFT', 12)}, 103 | {')': ('REDUCE', 1), 104 | 'CON': ('REDUCE', 1), 105 | 'DIS': ('REDUCE', 1), 106 | 'END': ('REDUCE', 1), 107 | 'IFF': ('REDUCE', 1), 108 | 'IMP': ('REDUCE', 1), 109 | ']': 
('REDUCE', 1)}, 110 | {')': ('REDUCE', 2), 111 | 'CON': ('REDUCE', 2), 112 | 'DIS': ('REDUCE', 2), 113 | 'END': ('REDUCE', 2), 114 | 'IFF': ('REDUCE', 2), 115 | 'IMP': ('REDUCE', 2), 116 | ']': ('REDUCE', 2)}, 117 | {')': ('REDUCE', 3), 118 | 'CON': ('REDUCE', 3), 119 | 'DIS': ('REDUCE', 3), 120 | 'END': ('REDUCE', 3), 121 | 'IFF': ('REDUCE', 3), 122 | 'IMP': ('REDUCE', 3), 123 | ']': ('REDUCE', 3)}, 124 | {')': ('REDUCE', 4), 125 | 'CON': ('REDUCE', 4), 126 | 'DIS': ('REDUCE', 4), 127 | 'END': ('REDUCE', 4), 128 | 'IFF': ('REDUCE', 4), 129 | 'IMP': ('REDUCE', 4), 130 | ']': ('REDUCE', 4)}, 131 | {')': ('REDUCE', 5), 132 | 'CON': ('REDUCE', 5), 133 | 'DIS': ('REDUCE', 5), 134 | 'END': ('REDUCE', 5), 135 | 'IFF': ('REDUCE', 5), 136 | 'IMP': ('REDUCE', 5), 137 | ']': ('REDUCE', 5)}, 138 | {'(': ('SHIFT', 7), 139 | 'False': ('SHIFT', 5), 140 | 'NEG': ('SHIFT', 9), 141 | 'True': ('SHIFT', 4), 142 | 'W': ('SHIFT', 6), 143 | '[': ('SHIFT', 8)}, 144 | {'(': ('SHIFT', 7), 145 | 'False': ('SHIFT', 5), 146 | 'NEG': ('SHIFT', 9), 147 | 'True': ('SHIFT', 4), 148 | 'W': ('SHIFT', 6), 149 | '[': ('SHIFT', 8)}, 150 | {'(': ('SHIFT', 7), 151 | 'False': ('SHIFT', 5), 152 | 'NEG': ('SHIFT', 9), 153 | 'True': ('SHIFT', 4), 154 | 'W': ('SHIFT', 6), 155 | '[': ('SHIFT', 8)}, 156 | {'(': ('SHIFT', 7), 157 | 'False': ('SHIFT', 5), 158 | 'NEG': ('SHIFT', 9), 159 | 'True': ('SHIFT', 4), 160 | 'W': ('SHIFT', 6), 161 | '[': ('SHIFT', 8)}, 162 | {'(': ('SHIFT', 7), 163 | 'False': ('SHIFT', 5), 164 | 'NEG': ('SHIFT', 9), 165 | 'True': ('SHIFT', 4), 166 | 'W': ('SHIFT', 6), 167 | '[': ('SHIFT', 8)}, 168 | {'(': ('SHIFT', 7), 169 | 'False': ('SHIFT', 5), 170 | 'NEG': ('SHIFT', 9), 171 | 'True': ('SHIFT', 4), 172 | 'W': ('SHIFT', 6), 173 | '[': ('SHIFT', 8)}, 174 | {'(': ('SHIFT', 7), 175 | 'False': ('SHIFT', 5), 176 | 'NEG': ('SHIFT', 9), 177 | 'True': ('SHIFT', 4), 178 | 'W': ('SHIFT', 6), 179 | '[': ('SHIFT', 8)}, 180 | {')': ('SHIFT', 21), 181 | 'CON': ('SHIFT', 10), 182 | 'DIS': ('SHIFT', 11), 183 | 'IFF': ('SHIFT', 13), 184 | 'IMP': ('SHIFT', 12)}, 185 | {'CON': ('SHIFT', 10), 186 | 'DIS': ('SHIFT', 11), 187 | 'IFF': ('SHIFT', 13), 188 | 'IMP': ('SHIFT', 12), 189 | ']': ('SHIFT', 22)}, 190 | {')': ('REDUCE', 8), 191 | 'CON': ('REDUCE', 8), 192 | 'DIS': ('REDUCE', 8), 193 | 'END': ('REDUCE', 8), 194 | 'IFF': ('REDUCE', 8), 195 | 'IMP': ('REDUCE', 8), 196 | ']': ('REDUCE', 8)}, 197 | {')': ('REDUCE', 9), 198 | 'CON': ('REDUCE', 9), 199 | 'DIS': ('REDUCE', 9), 200 | 'END': ('REDUCE', 9), 201 | 'IFF': ('REDUCE', 9), 202 | 'IMP': ('REDUCE', 9), 203 | ']': ('REDUCE', 9)}, 204 | {')': ('REDUCE', 10), 205 | 'CON': ('SHIFT', 10), 206 | 'DIS': ('REDUCE', 10), 207 | 'END': ('REDUCE', 10), 208 | 'IFF': ('REDUCE', 10), 209 | 'IMP': ('REDUCE', 10), 210 | ']': ('REDUCE', 10)}, 211 | {')': ('REDUCE', 11), 212 | 'CON': ('SHIFT', 10), 213 | 'DIS': ('SHIFT', 11), 214 | 'END': ('REDUCE', 11), 215 | 'IFF': ('REDUCE', 11), 216 | 'IMP': ('REDUCE', 11), 217 | ']': ('REDUCE', 11)}, 218 | {')': ('REDUCE', 12), 219 | 'CON': ('SHIFT', 10), 220 | 'DIS': ('SHIFT', 11), 221 | 'END': ('REDUCE', 12), 222 | 'IFF': ('REDUCE', 12), 223 | 'IMP': ('SHIFT', 12), 224 | ']': ('REDUCE', 12)}, 225 | {')': ('REDUCE', 6), 226 | 'CON': ('REDUCE', 6), 227 | 'DIS': ('REDUCE', 6), 228 | 'END': ('REDUCE', 6), 229 | 'IFF': ('REDUCE', 6), 230 | 'IMP': ('REDUCE', 6), 231 | ']': ('REDUCE', 6)}, 232 | {')': ('REDUCE', 7), 233 | 'CON': ('REDUCE', 7), 234 | 'DIS': ('REDUCE', 7), 235 | 'END': ('REDUCE', 7), 236 | 'IFF': ('REDUCE', 7), 237 | 'IMP': ('REDUCE', 7), 238 | 
']': ('REDUCE', 7)}] 239 | 240 | GOTO = \ 241 | [{'(': 7, 242 | 'Atomic': 2, 243 | 'Complex': 3, 244 | 'False': 5, 245 | 'NEG': 9, 246 | 'Sentence': 1, 247 | 'True': 4, 248 | 'W': 6, 249 | '[': 8}, 250 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 251 | {}, 252 | {}, 253 | {}, 254 | {}, 255 | {}, 256 | {'(': 7, 257 | 'Atomic': 2, 258 | 'Complex': 3, 259 | 'False': 5, 260 | 'NEG': 9, 261 | 'Sentence': 14, 262 | 'True': 4, 263 | 'W': 6, 264 | '[': 8}, 265 | {'(': 7, 266 | 'Atomic': 2, 267 | 'Complex': 3, 268 | 'False': 5, 269 | 'NEG': 9, 270 | 'Sentence': 15, 271 | 'True': 4, 272 | 'W': 6, 273 | '[': 8}, 274 | {'(': 7, 275 | 'Atomic': 2, 276 | 'Complex': 3, 277 | 'False': 5, 278 | 'NEG': 9, 279 | 'Sentence': 16, 280 | 'True': 4, 281 | 'W': 6, 282 | '[': 8}, 283 | {'(': 7, 284 | 'Atomic': 2, 285 | 'Complex': 3, 286 | 'False': 5, 287 | 'NEG': 9, 288 | 'Sentence': 17, 289 | 'True': 4, 290 | 'W': 6, 291 | '[': 8}, 292 | {'(': 7, 293 | 'Atomic': 2, 294 | 'Complex': 3, 295 | 'False': 5, 296 | 'NEG': 9, 297 | 'Sentence': 18, 298 | 'True': 4, 299 | 'W': 6, 300 | '[': 8}, 301 | {'(': 7, 302 | 'Atomic': 2, 303 | 'Complex': 3, 304 | 'False': 5, 305 | 'NEG': 9, 306 | 'Sentence': 19, 307 | 'True': 4, 308 | 'W': 6, 309 | '[': 8}, 310 | {'(': 7, 311 | 'Atomic': 2, 312 | 'Complex': 3, 313 | 'False': 5, 314 | 'NEG': 9, 315 | 'Sentence': 20, 316 | 'True': 4, 317 | 'W': 6, 318 | '[': 8}, 319 | {')': 21, 'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 320 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12, ']': 22}, 321 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 322 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 323 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 324 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 325 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 326 | {}, 327 | {}] 328 | 329 | ## Parser$END 330 | -------------------------------------------------------------------------------- /experiments/meta_dumps_standalone.py: -------------------------------------------------------------------------------- 1 | ## This file is generated. Do not modify. 
2 | 3 | ## Lexer$BEGIN 4 | 5 | lex2pats = \ 6 | [('NEG', '!'), 7 | ('CON', '&'), 8 | ('DIS', '\\|'), 9 | ('IMP', '->'), 10 | ('IFF', '<=>'), 11 | ('W', '[A-Z]\\w*'), 12 | ('True', None), 13 | ('False', None), 14 | ('(', None), 15 | (')', None), 16 | ('[', None), 17 | (']', None), 18 | ('IGNORED', '[ \\t\\n]'), 19 | ('ERROR', '.')] 20 | 21 | lex_handler_sources = \ 22 | {} 23 | 24 | ## Lexer$END 25 | 26 | 27 | ## Parser$BEGIN 28 | 29 | precedence = \ 30 | {'CON': 4, 'DIS': 3, 'IFF': 1, 'IMP': 2, 'NEG': 5} 31 | 32 | rules = \ 33 | [('Sentence^', ('Sentence',)), 34 | ('Sentence', ['Atomic']), 35 | ('Sentence', ['Complex']), 36 | ('Atomic', ['True']), 37 | ('Atomic', ['False']), 38 | ('Atomic', ['W']), 39 | ('Complex', ['(', 'Sentence', ')']), 40 | ('Complex', ['[', 'Sentence', ']']), 41 | ('Complex', ['NEG', 'Sentence']), 42 | ('Complex', ['Sentence', 'CON', 'Sentence']), 43 | ('Complex', ['Sentence', 'DIS', 'Sentence']), 44 | ('Complex', ['Sentence', 'IMP', 'Sentence']), 45 | ('Complex', ['Sentence', 'IFF', 'Sentence'])] 46 | 47 | seman_sources = \ 48 | [b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 49 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x01x' 50 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfa8c:\\users\\shellay\\documents\\githu' 51 | b'b\\metaparse\\metaparse.py\xda\x07id_func\x90\x00\x00\x00s\x02\x00' 52 | b'\x00\x00\x00\x01', 53 | None, 54 | None, 55 | None, 56 | None, 57 | None, 58 | None, 59 | None, 60 | None, 61 | None, 62 | None, 63 | None, 64 | None] 65 | 66 | Ks = \ 67 | [[(0, 0)], 68 | [(0, 1), (9, 1), (10, 1), (11, 1), (12, 1)], 69 | [(1, 1)], 70 | [(2, 1)], 71 | [(3, 1)], 72 | [(4, 1)], 73 | [(5, 1)], 74 | [(6, 1)], 75 | [(7, 1)], 76 | [(8, 1)], 77 | [(9, 2)], 78 | [(10, 2)], 79 | [(11, 2)], 80 | [(12, 2)], 81 | [(6, 2), (9, 1), (10, 1), (11, 1), (12, 1)], 82 | [(7, 2), (9, 1), (10, 1), (11, 1), (12, 1)], 83 | [(8, 2), (9, 1), (10, 1), (11, 1), (12, 1)], 84 | [(9, 1), (9, 3), (10, 1), (11, 1), (12, 1)], 85 | [(9, 1), (10, 1), (10, 3), (11, 1), (12, 1)], 86 | [(9, 1), (10, 1), (11, 1), (11, 3), (12, 1)], 87 | [(9, 1), (10, 1), (11, 1), (12, 1), (12, 3)], 88 | [(6, 3)], 89 | [(7, 3)]] 90 | 91 | ACTION = \ 92 | [{'(': ('SHIFT', 7), 93 | 'False': ('SHIFT', 5), 94 | 'NEG': ('SHIFT', 9), 95 | 'True': ('SHIFT', 4), 96 | 'W': ('SHIFT', 6), 97 | '[': ('SHIFT', 8)}, 98 | {'CON': ('SHIFT', 10), 99 | 'DIS': ('SHIFT', 11), 100 | 'END': ('ACCEPT', 0), 101 | 'IFF': ('SHIFT', 13), 102 | 'IMP': ('SHIFT', 12)}, 103 | {')': ('REDUCE', 1), 104 | 'CON': ('REDUCE', 1), 105 | 'DIS': ('REDUCE', 1), 106 | 'END': ('REDUCE', 1), 107 | 'IFF': ('REDUCE', 1), 108 | 'IMP': ('REDUCE', 1), 109 | ']': ('REDUCE', 1)}, 110 | {')': ('REDUCE', 2), 111 | 'CON': ('REDUCE', 2), 112 | 'DIS': ('REDUCE', 2), 113 | 'END': ('REDUCE', 2), 114 | 'IFF': ('REDUCE', 2), 115 | 'IMP': ('REDUCE', 2), 116 | ']': ('REDUCE', 2)}, 117 | {')': ('REDUCE', 3), 118 | 'CON': ('REDUCE', 3), 119 | 'DIS': ('REDUCE', 3), 120 | 'END': ('REDUCE', 3), 121 | 'IFF': ('REDUCE', 3), 122 | 'IMP': ('REDUCE', 3), 123 | ']': ('REDUCE', 3)}, 124 | {')': ('REDUCE', 4), 125 | 'CON': ('REDUCE', 4), 126 | 'DIS': ('REDUCE', 4), 127 | 'END': ('REDUCE', 4), 128 | 'IFF': ('REDUCE', 4), 129 | 'IMP': ('REDUCE', 4), 130 | ']': ('REDUCE', 4)}, 131 | {')': ('REDUCE', 5), 132 | 'CON': ('REDUCE', 5), 133 | 'DIS': ('REDUCE', 5), 134 | 'END': ('REDUCE', 5), 135 | 'IFF': ('REDUCE', 5), 136 | 'IMP': ('REDUCE', 5), 137 | ']': ('REDUCE', 5)}, 138 | {'(': ('SHIFT', 7), 139 | 'False': ('SHIFT', 5), 140 | 'NEG': ('SHIFT', 9), 
141 | 'True': ('SHIFT', 4), 142 | 'W': ('SHIFT', 6), 143 | '[': ('SHIFT', 8)}, 144 | {'(': ('SHIFT', 7), 145 | 'False': ('SHIFT', 5), 146 | 'NEG': ('SHIFT', 9), 147 | 'True': ('SHIFT', 4), 148 | 'W': ('SHIFT', 6), 149 | '[': ('SHIFT', 8)}, 150 | {'(': ('SHIFT', 7), 151 | 'False': ('SHIFT', 5), 152 | 'NEG': ('SHIFT', 9), 153 | 'True': ('SHIFT', 4), 154 | 'W': ('SHIFT', 6), 155 | '[': ('SHIFT', 8)}, 156 | {'(': ('SHIFT', 7), 157 | 'False': ('SHIFT', 5), 158 | 'NEG': ('SHIFT', 9), 159 | 'True': ('SHIFT', 4), 160 | 'W': ('SHIFT', 6), 161 | '[': ('SHIFT', 8)}, 162 | {'(': ('SHIFT', 7), 163 | 'False': ('SHIFT', 5), 164 | 'NEG': ('SHIFT', 9), 165 | 'True': ('SHIFT', 4), 166 | 'W': ('SHIFT', 6), 167 | '[': ('SHIFT', 8)}, 168 | {'(': ('SHIFT', 7), 169 | 'False': ('SHIFT', 5), 170 | 'NEG': ('SHIFT', 9), 171 | 'True': ('SHIFT', 4), 172 | 'W': ('SHIFT', 6), 173 | '[': ('SHIFT', 8)}, 174 | {'(': ('SHIFT', 7), 175 | 'False': ('SHIFT', 5), 176 | 'NEG': ('SHIFT', 9), 177 | 'True': ('SHIFT', 4), 178 | 'W': ('SHIFT', 6), 179 | '[': ('SHIFT', 8)}, 180 | {')': ('SHIFT', 21), 181 | 'CON': ('SHIFT', 10), 182 | 'DIS': ('SHIFT', 11), 183 | 'IFF': ('SHIFT', 13), 184 | 'IMP': ('SHIFT', 12)}, 185 | {'CON': ('SHIFT', 10), 186 | 'DIS': ('SHIFT', 11), 187 | 'IFF': ('SHIFT', 13), 188 | 'IMP': ('SHIFT', 12), 189 | ']': ('SHIFT', 22)}, 190 | {')': ('REDUCE', 8), 191 | 'CON': ('REDUCE', 8), 192 | 'DIS': ('REDUCE', 8), 193 | 'END': ('REDUCE', 8), 194 | 'IFF': ('REDUCE', 8), 195 | 'IMP': ('REDUCE', 8), 196 | ']': ('REDUCE', 8)}, 197 | {')': ('REDUCE', 9), 198 | 'CON': ('REDUCE', 9), 199 | 'DIS': ('REDUCE', 9), 200 | 'END': ('REDUCE', 9), 201 | 'IFF': ('REDUCE', 9), 202 | 'IMP': ('REDUCE', 9), 203 | ']': ('REDUCE', 9)}, 204 | {')': ('REDUCE', 10), 205 | 'CON': ('SHIFT', 10), 206 | 'DIS': ('REDUCE', 10), 207 | 'END': ('REDUCE', 10), 208 | 'IFF': ('REDUCE', 10), 209 | 'IMP': ('REDUCE', 10), 210 | ']': ('REDUCE', 10)}, 211 | {')': ('REDUCE', 11), 212 | 'CON': ('SHIFT', 10), 213 | 'DIS': ('SHIFT', 11), 214 | 'END': ('REDUCE', 11), 215 | 'IFF': ('REDUCE', 11), 216 | 'IMP': ('REDUCE', 11), 217 | ']': ('REDUCE', 11)}, 218 | {')': ('REDUCE', 12), 219 | 'CON': ('SHIFT', 10), 220 | 'DIS': ('SHIFT', 11), 221 | 'END': ('REDUCE', 12), 222 | 'IFF': ('REDUCE', 12), 223 | 'IMP': ('SHIFT', 12), 224 | ']': ('REDUCE', 12)}, 225 | {')': ('REDUCE', 6), 226 | 'CON': ('REDUCE', 6), 227 | 'DIS': ('REDUCE', 6), 228 | 'END': ('REDUCE', 6), 229 | 'IFF': ('REDUCE', 6), 230 | 'IMP': ('REDUCE', 6), 231 | ']': ('REDUCE', 6)}, 232 | {')': ('REDUCE', 7), 233 | 'CON': ('REDUCE', 7), 234 | 'DIS': ('REDUCE', 7), 235 | 'END': ('REDUCE', 7), 236 | 'IFF': ('REDUCE', 7), 237 | 'IMP': ('REDUCE', 7), 238 | ']': ('REDUCE', 7)}] 239 | 240 | GOTO = \ 241 | [{'(': 7, 242 | 'Atomic': 2, 243 | 'Complex': 3, 244 | 'False': 5, 245 | 'NEG': 9, 246 | 'Sentence': 1, 247 | 'True': 4, 248 | 'W': 6, 249 | '[': 8}, 250 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 251 | {}, 252 | {}, 253 | {}, 254 | {}, 255 | {}, 256 | {'(': 7, 257 | 'Atomic': 2, 258 | 'Complex': 3, 259 | 'False': 5, 260 | 'NEG': 9, 261 | 'Sentence': 14, 262 | 'True': 4, 263 | 'W': 6, 264 | '[': 8}, 265 | {'(': 7, 266 | 'Atomic': 2, 267 | 'Complex': 3, 268 | 'False': 5, 269 | 'NEG': 9, 270 | 'Sentence': 15, 271 | 'True': 4, 272 | 'W': 6, 273 | '[': 8}, 274 | {'(': 7, 275 | 'Atomic': 2, 276 | 'Complex': 3, 277 | 'False': 5, 278 | 'NEG': 9, 279 | 'Sentence': 16, 280 | 'True': 4, 281 | 'W': 6, 282 | '[': 8}, 283 | {'(': 7, 284 | 'Atomic': 2, 285 | 'Complex': 3, 286 | 'False': 5, 287 | 'NEG': 9, 288 | 'Sentence': 17, 
289 |   'True': 4,
290 |   'W': 6,
291 |   '[': 8},
292 |  {'(': 7,
293 |   'Atomic': 2,
294 |   'Complex': 3,
295 |   'False': 5,
296 |   'NEG': 9,
297 |   'Sentence': 18,
298 |   'True': 4,
299 |   'W': 6,
300 |   '[': 8},
301 |  {'(': 7,
302 |   'Atomic': 2,
303 |   'Complex': 3,
304 |   'False': 5,
305 |   'NEG': 9,
306 |   'Sentence': 19,
307 |   'True': 4,
308 |   'W': 6,
309 |   '[': 8},
310 |  {'(': 7,
311 |   'Atomic': 2,
312 |   'Complex': 3,
313 |   'False': 5,
314 |   'NEG': 9,
315 |   'Sentence': 20,
316 |   'True': 4,
317 |   'W': 6,
318 |   '[': 8},
319 |  {')': 21, 'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12},
320 |  {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12, ']': 22},
321 |  {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12},
322 |  {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12},
323 |  {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12},
324 |  {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12},
325 |  {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12},
326 |  {},
327 |  {}]
328 | 
329 | ## Parser$END
330 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | metaparse
2 | =====
3 | 
4 | This is a tool that lets you handle instant parsing and language-design tasks
5 | while enjoying the elegance of pure Python[1].
6 | With this tool, creating a Python class is sufficient
7 | to define a language, which includes
8 | 
9 | * lexical patterns
10 | * syntactic rules
11 | * semantic actions (i.e. interpretation/translation)
12 | 
13 | On top of this class, a parser/interpreter is automatically generated.
14 | You can then use it to parse strings directly by calling its `parse`
15 | or `interpret` method.
16 | 
17 | [1]. This module is motivated by [instaparse][] in [Clojure][], but goes another way, more like [PLY][].
18 | 
19 | 20 | # Table of Contents 21 | 1. [Quick Example](#quick-example) 22 | 1. [Design and Usage](#design-and-usage) 23 | 1. [Generalized LALR Parsing](#generalized-lalr-and-dealing-with-ambiguity) 24 | 1. [API](#api) 25 | 26 | 27 | # Quick Example 28 | 29 | In `metaparse`, language syntax and semantics can be simply defined 30 | as methods of a class. To illustrate this, we create a tiny 31 | calculator grammar which can read basic arithmetic expressions and 32 | register variable bindings in a global dictionary. 33 | 34 | At first, we conceptually design the grammar on a paper, as seen from the 35 | textbooks, 36 | 37 | ``` 38 | assign → ID = expr 39 | expr → NUM 40 | expr → ID 41 | expr → expr₁ + expr₂ 42 | expr → expr₁ * expr₂ 43 | expr → expr₁ ** expr₂ 44 | ``` 45 | 46 | then we map them to method declarations in Python: 47 | ``` python 48 | def assign(ID, EQ, expr): ... 49 | def expr(NUM): ... 50 | def expr(ID): ... 51 | def expr(expr_1, ADD, expr_2): ... 52 | def expr(expr_1, MUL, expr_2): ... 53 | def expr(expr_1, POW, expr_2): ... 54 | ``` 55 | 56 | and finally we write down the semantic rules as method bodies, 57 | in a [SDT][]-style (cf. [Yacc][]). The method parameters are bound 58 | to the parse result of the sub-tree when a rule is being executed 59 | (i.e. being reduced after its sub-rules or tokens have been 60 | successfully processed). 61 | 62 | ``` python 63 | from metaparse import LALR 64 | 65 | # Global context/environment for language semantics. 66 | context = {} 67 | 68 | class LangArith(metaclass=LALR.meta): 69 | 70 | "A language for calculating expressions." 71 | 72 | # ===== Lexical patterns / Terminals ===== 73 | # - Patterns are specified via regular expressions 74 | # - Patterns will be checked with the same order as declared during tokenizing 75 | 76 | IGNORED = r'\s+' # Special pattern to be ignored. 77 | 78 | EQ = r'=' 79 | POW = r'\*\*', 3 # Can include precedence of token using a number (for LALR conflict resolution) 80 | POW = r'\^' , 3 # Alternative patterns can share the same name 81 | MUL = r'\*' , 2 82 | ADD = r'\+' , 1 83 | 84 | ID = r'[_a-zA-Z]\w*' 85 | NUM = r'[1-9][0-9]*' 86 | def NUM(value): # Can specify translator for certain lexical patterns! 87 | return int(value) 88 | 89 | # ===== Syntactic/Semantic rules in SDT-style ===== 90 | 91 | def assign(ID, EQ, expr): # Can access global context in Python environment. 92 | context[ID] = expr 93 | return expr 94 | 95 | def expr(NUM): # Normally computing result without side-effects would be better. 96 | return NUM # NUM is passed as (int) since there is a NUM handler! 97 | 98 | def expr(ID): 99 | return context[ID] 100 | 101 | def expr(expr_1, ADD, expr_2): # TeX style subscripts used for identifying expression instances, like (expr → expr₁ + expr₂) 102 | return expr_1 + expr_2 103 | 104 | def expr(expr, MUL, expr_1): # Can ignore one of the subscripts. 105 | return expr * expr_1 106 | 107 | def expr(expr, POW, expr_1): 108 | return expr ** expr_1 109 | ``` 110 | 111 | Then we get a `LALR` parser object: 112 | 113 | ``` python 114 | >>> type(LangArith) 115 | 116 | ``` 117 | 118 | Now we are **done** and it's quite straightforward trying it out. 119 | 120 | ``` python 121 | >>> LangArith.interpret("x = 1 + 4 * 3 ** 2 + 5") 122 | 42 123 | >>> LangArith.interpret("y = 5 + x * 2") 124 | 89 125 | >>> LangArith.interpret("z = 9 ^ 2") 126 | 81 127 | 128 | >>> context 129 | {'y': 89, 'x': 42, 'z': 81} 130 | ``` 131 | 132 | IMO, tools under state-of-the-art could hardly get more handy than 133 | this. 
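
Since the generated parser is just a Python object, you can keep calling it on
further input. The following is a small usage sketch added for illustration
(the `program` string and the `context.clear()` call are not part of the
original examples); it interprets a multi-line program statement by statement
and accumulates the bindings in `context`:

``` python
# A usage sketch (assumption: LangArith and `context` are defined as above).
program = """
x = 1 + 2
y = x * 3
z = y ** 2
"""

context.clear()                      # start from an empty environment
for line in program.strip().splitlines():
    LangArith.interpret(line)        # each statement updates `context`

print(context)
# expected: {'x': 3, 'y': 9, 'z': 81}
```

No per-call setup is involved here: the LALR tables were built once when the
class was created, and `interpret` merely runs them over the new token stream.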
134 | 135 | Note `metaclass=LALR.meta` only works in Python 3. There is an 136 | [alternative](#verbose-style) way which works in Python 2. 137 | Directly using the APIs without all syntactic sugars is also possible. 138 | 139 | 140 | # Design and Usage 141 | 142 | The design of this module targets "**native** parsing" (like [instaparse][] and [Parsec][]). Highlights are 143 | 144 | * native structure representing grammar rules 145 | - like `def E(E, plus, T)`, `def T(F)` ... 146 | - rather than **literal string notations** like `"E = E + T"`, `"T = F"` ... 147 | * language translation implemented in *pure* Python, 148 | * easy to play with (e.g. in REPL), 149 | * no need to generate a program before use 150 | * but you can generate one and save it for future use (via dump/load APIs) 151 | * does not feel too much like a DSL (maybe?), 152 | * no dependencies, 153 | * optional precedence specification (for LALR), 154 | * nice error reporting, 155 | * and etc. 156 | 157 | 158 | 160 | 161 | Though this slim module does not intend to replace full-fledged tools 162 | like [Bison][] and [ANTLR][], it is still very handy and its 163 | integration into existing Python project is seamless. 164 | 165 | The following sections explains more details about the core utilities 166 | . Feel free to skip them since you already see from above how it is 167 | used. 168 | 169 | 170 | ## Retrieving the Parse Tree 171 | 172 | Continuing the first example, if only the parse tree is needed rather 173 | than the translation result, use method `parse` instead of 174 | `interpret`: 175 | 176 | ``` python 177 | tr = LangArith.parse(" w = 1 + 2 * 3 ** 4 + 5 ") 178 | 179 | >>> tr 180 | ('assign', 181 | [('ID', 'w'), 182 | ('EQ', '='), 183 | ('expr', 184 | [('expr', 185 | [('expr', [('NUM', '1')]), 186 | ('ADD', '+'), 187 | ('expr', 188 | [('expr', [('NUM', '2')]), 189 | ('MUL', '*'), 190 | ('expr', 191 | [('expr', [('NUM', '3')]), 192 | ('POW', '**'), 193 | ('expr', [('NUM', '4')])])])]), 194 | ('ADD', '+'), 195 | ('expr', [('NUM', '5')])])]) 196 | ``` 197 | 198 | The result is a `ParseTree` object with tuple representation. A parse 199 | leaf is just a `Token` object represented as ```(, 200 | '')```. 201 | 202 | 203 | ## Save generated parser object 204 | 205 | It can be time consuming when `metaparse` converts your language into 206 | a parser/interpreter, depending on the size of the language. You might 207 | not want to re-generate the parser each time you starts a Python 208 | process. So `metaparse` allows you to serialize your parser (which is 209 | no much more than a dictionary encoding the state machine under the 210 | hood). The API is `dumps/loads` or `dump/load`. 211 | 212 | ``` python 213 | LangArith.dumps('./eg_demo_dump.py') 214 | ``` 215 | 216 | Since our parser is created given access to a global variable named 217 | `context`, which makes `globals` and `context` dependencies of your 218 | translation scheme, you have to pass it to `load` when loading the 219 | parser and define the `context` object in the global scope to allow 220 | your translation to be still functional (for sure, a better way is to 221 | define your context object dedicatedly instead of using `globals`): 222 | 223 | ``` python 224 | # Another file using the parser 225 | 226 | from metaparse import LALR 227 | 228 | # Let loaded parser be able to access current runtime env `globals()`. 
229 | arith_parser = LALR.load('./eg_demo_dump.py', globals()) 230 | 231 | # Context instance to be accessed by the loaded parser 232 | context = {} 233 | 234 | arith_parser.interpret('foo = 1 + 9') 235 | 236 | print (context) 237 | # {'foo': 10} 238 | ``` 239 | 240 | You might wonder why passing `globals` can work - It's due to that in 241 | Python the `__code__` object can be evaluated given whatever context 242 | and that's what `metaparse` does internally. (more basic details see 243 | the documents for `exec` and `code` object). 244 | 245 | 246 | ## Error Reporting 247 | 248 | During designing a language, it's very easy to make inconsistent 249 | rules. `metaparse` provides sensible error reporting for such cases - 250 | for example, executing the following 251 | 252 | ``` python 253 | from metaparse import LALR 254 | 255 | class ExprLang(metaclass=LALR.meta): 256 | 257 | NUM = '\d+' 258 | PLUS = '\+' 259 | 260 | def expr(expr, PLUS, term): 261 | return expr + term 262 | 263 | def expr(expr, TIMES, term): 264 | return expr * term 265 | 266 | def expr(term): 267 | return term 268 | 269 | def term(NUM): 270 | return int(NUM) 271 | 272 | def factor(NUM): 273 | return int(NUM) 274 | ``` 275 | 276 | would result in error report: 277 | 278 | ``` python-traceback 279 | metaparse.LanguageError: No lexical pattern provided for terminal symbol: TIMES 280 | - in 2th rule (expr = expr TIMES term) 281 | - with helping traceback (if available): 282 | File "test_make_error.py", line 21, in expr 283 | 284 | - declared lexes: Lexer{ 285 | [('NUM', re.compile('\\d+')), 286 | ('PLUS', re.compile('\\+')), 287 | ('IGNORED', re.compile('\\s+'))]} 288 | ``` 289 | 290 | After providing the missing terminal symbol `TIMES`, another error is 291 | detected during re-run: 292 | 293 | ``` python-traceback 294 | metaparse.LanguageError: There are unreachable nonterminal at 5th rule: {'factor'}. 295 | - with helping traceback: 296 | File "test_make_error.py", line 30, in factor 297 | ``` 298 | 299 | The error information is formulated within Python *traceback* and 300 | should be precise enough and guide you or editors to the exact place 301 | where correction is needed. 302 | 303 | 304 | # Generalized LALR and Dealing with Ambiguity 305 | 306 | `metaparse` supplies an interesting extension: the `GLR` parser with 307 | look-ahead, which can parse ambiguous grammars and help you figure out 308 | why a grammar is ambiguous and fails to be LALR(1). 309 | 310 | Given the famous ambiguous [Dangling-Else][] grammar: 311 | 312 | ``` 313 | selection-statement = ... 
314 |     | IF expression THEN statement
315 |     | IF expression THEN statement ELSE statement
316 | ```
317 | 
318 | let's build it
319 | using `LALR`:
320 | 
321 | ``` python
322 | from metaparse import GLR, LALR
323 | 
324 | class LangIfThenElse(metaclass=LALR.meta):
325 | 
326 |     IF = r'if'
327 |     THEN = r'then'
328 |     ELSE = r'else'
329 |     EXPR = r'\d+'
330 |     SINGLE = r'[_a-zA-Z]+'
331 | 
332 |     def stmt(ifstmt):
333 |         return ifstmt
334 | 
335 |     def stmt(SINGLE):
336 |         return SINGLE
337 | 
338 |     def ifstmt(IF, EXPR, THEN, stmt_1, ELSE, stmt_2):
339 |         return ('ite', EXPR, stmt_1, stmt_2)
340 | 
341 |     def ifstmt(IF, EXPR, THEN, stmt):
342 |         return ('it', EXPR, stmt)
343 | ```
344 | 
345 | would result in a *shift/reduce* conflict on the token `ELSE` with error hints:
346 | 
347 | ``` python-traceback
348 | Handling item set: 
349 | ['(ifstmt = IF EXPR THEN stmt.ELSE stmt)', '(ifstmt = IF EXPR THEN stmt.)']
350 | Conflict on lookahead: ELSE 
351 | - ('reduce', (ifstmt = IF EXPR THEN stmt))
352 | - ('shift', ['(ifstmt = IF EXPR THEN stmt ELSE.stmt)'])
353 | ```
354 | 
355 | Using `GLR.meta` instead of `LALR.meta`, and calling `interpret_generalized` instead of `interpret`:
356 | 
357 | ``` python
358 | >>> LangIfThenElse.interpret_generalized('if 1 then if 2 then if 3 then a else b else c')
359 | [('ite', '1', ('ite', '2', ('it', '3', 'a'), 'b'), 'c'),
360 |  ('ite', '1', ('it', '2', ('ite', '3', 'a', 'b')), 'c'),
361 |  ('it', '1', ('ite', '2', ('ite', '3', 'a', 'b'), 'c'))]
362 | ```
363 | 
364 | the parser delivers all alternative parse results which cannot be
365 | handled properly by LALR(1). From these results you can gather more
366 | insight into why the grammar is ambiguous.
367 | 
368 | Note that interpreting an ambiguous grammar is error-prone if
369 | side effects are involved, since the translator function of each
370 | alternative result is executed and it is hard to foresee how these
371 | executions may interfere with each other. **(It is generally advised to use
372 | side-effect-free translation when using GLR parsers!)**
373 | 
374 | 
375 | ## Using Token Precedence to Resolve Conflicts
376 | 
377 | Though GLR is powerful, in practice we usually do not want to keep
378 | ambiguity and eventually prefer `LALR` for the sake of clarity and
379 | performance. Very likely, ambiguity is not what you really want,
380 | and you can resolve it by specifying the precedence of
381 | certain tokens.
382 | 
383 | Taking the Dangling-Else example, we associate with `ELSE` a higher
384 | precedence than `THEN` (just like the operators in the arithmetic
385 | grammar example). When a `stmt` has been read between `THEN` and
386 | `ELSE`, i.e. when the conflicting rules meet an `ELSE` lookahead, the
387 | alternative containing `ELSE` has higher precedence and will be chosen:
388 | 
389 | ``` python
390 | class LangIfThenElse(metaclass=LALR.meta):
391 |     ...
392 |     THEN = r'then', 1
393 |     ELSE = r'else', 2
394 |     ...
395 | ```
396 | 
397 | With this conflict resolution, the LALR parser can be constructed
398 | successfully and parsing delivers
399 | 
400 | ```
401 | >>> LangIfThenElse.interpret('if 1 then if 2 then if 3 then a else b else c')
402 | ('it', '1', ('ite', '2', ('ite', '3', 'a', 'b'), 'c'))
403 | ```
404 | 
405 | However, in practice, precedence specifications can get highly
406 | complicated and the intended behavior becomes much less explicit. It is
407 | advisable to avoid precedence altogether whenever you can find a more
408 | explicit and straightforward alternative, such as the grammar rewrite sketched below.
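
As an illustration of such an alternative, here is a hedged sketch (not taken
from the original examples; the class name `LangIfThenElseExplicit` and the
nonterminals `matched`/`open` are invented for this sketch) of the classic
rewrite of the Dangling-Else grammar: `stmt` is split into *matched*
statements, in which every `IF ... THEN` already has its `ELSE`, and *open*
statements, which still lack one. This textbook refactoring is unambiguous,
so no precedence declarations should be needed.

``` python
from metaparse import LALR

class LangIfThenElseExplicit(metaclass=LALR.meta):
    """Dangling-Else resolved by the grammar itself (sketch)."""

    IF     = r'if'
    THEN   = r'then'
    ELSE   = r'else'
    EXPR   = r'\d+'
    SINGLE = r'[_a-zA-Z]+'

    # A statement is either fully matched or still open.
    def stmt(matched): return matched
    def stmt(open):    return open

    # Matched: every THEN is paired with an ELSE, or it is a plain statement.
    def matched(IF, EXPR, THEN, matched_1, ELSE, matched_2):
        return ('ite', EXPR, matched_1, matched_2)
    def matched(SINGLE):
        return SINGLE

    # Open: at least one THEN is still waiting for its ELSE.
    def open(IF, EXPR, THEN, stmt):
        return ('it', EXPR, stmt)
    def open(IF, EXPR, THEN, matched, ELSE, open):
        return ('ite', EXPR, matched, open)
```

Interpreting the same input as above with this grammar should bind each `ELSE`
to the nearest `THEN` and yield the same nesting as the precedence-resolved
parser, without any precedence annotations.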
409 | 410 | 411 | # API 412 | 413 | The following contents give more details about the underlying utilities. 414 | 415 | ## Explicitly Registering Lexical Patterns and Syntactic Rules 416 | 417 | The following APIs for defining the language in [the very first 418 | example](#quick-example) works for both Python 2 and Python 3, with 419 | the more verbose but more explicit style, heavily relying on using 420 | decorators. 421 | 422 | ``` python 423 | from metaparse import LALR 424 | 425 | LangArith = LALR() 426 | 427 | lex = LangArith.lexer 428 | rule = LangArith.rule 429 | 430 | # lex( = ) 431 | lex(IGNORED = r'\s+') 432 | lex(NUM = r'[0-9]+') 433 | lex(EQ = r'=') 434 | lex(ID = r'[_a-zA-Z]\w*') 435 | 436 | # lex(... , p = ) 437 | lex(POW = r'\*\*', p=3) 438 | lex(POW = r'\^') # No need to give the precedence twice for POW. 439 | lex(MUL = r'\*' , p=2) 440 | lex(ADD = r'\+' , p=1) 441 | 442 | # @rule 443 | # def ( ): 444 | # 445 | @rule 446 | def assign(ID, EQ, expr): 447 | context[ID] = expr 448 | return expr 449 | 450 | @rule 451 | def expr(ID): 452 | return context[ID] 453 | 454 | @rule 455 | def expr(NUM): 456 | return int(NUM) 457 | 458 | @rule 459 | def expr(expr_1, ADD, expr_2): 460 | return expr_1 + expr_2 461 | 462 | @rule 463 | def expr(expr, MUL, expr_1): 464 | return expr * expr_1 465 | 466 | @rule 467 | def expr(expr, POW, expr_1): 468 | return expr ** expr_1 469 | 470 | # Complete making the parser after collecting things! 471 | LangArith.make() 472 | ``` 473 | 474 | Explanation in short: 475 | 476 | * `lex` is the `Lexer` instance associated with `LangArith`, which is also 477 | able to collect definition of lexical patterns. 478 | 479 | * `rule` is a decorator which extracts syntactic rule information from 480 | the function signature and register the function itself as translator 481 | for this rule. 482 | 483 | ## The Underlying Lexical Analyzer 484 | 485 | After declaring the language like above, `metaparse` internally 486 | creates a lexical analyzer as a component used by the internal parser. 487 | Lexical analyzer maintains a list of terminal symbols of the language 488 | defined, preserving the order they appear in the code. 489 | 490 | ``` python 491 | >>> LangArith.lexer 492 | Lexer{ 493 | [('IGNORED', re.compile('\\s+')), 494 | ('EQ', re.compile('=')), 495 | ('NUM', re.compile('[1-9][0-9]*')), 496 | ('ID', re.compile('[_a-zA-Z]\\w*')), 497 | ('POW', re.compile('\\*\\*')), 498 | ('MUL', re.compile('\\*')), 499 | ('ADD', re.compile('\\+'))]} 500 | ``` 501 | 502 | It runs when method `tokenize` is called and generates tokens carrying 503 | attributes. During tokenizing, the patterns are checked respecting the 504 | order in the list. 505 | 506 | Note there is a pre-defined special lexical element `IGNORED`: 507 | 508 | * When `Lexer` reads a string matching the pattern associating 509 | `IGNORED`, no token is generated for the matching part of the 510 | string; 511 | 512 | * If `IGNORED` is not explicitly overriden in the user's language 513 | definition, it will have the default value `r'\s+'`. 514 | 515 | We can print out the tracing of lexcial analyzing process: 516 | 517 | ``` python 518 | >>> for token in LangArith.lexer.tokenize(" foo = 1 + bar * 2"): 519 | ... print(token.pos, 520 | ... token.end, 521 | ... token.symbol, 522 | ... repr(token.lexeme), # (lexeme) is something literal. 523 | ... repr(token.value)) # (value) is something computed by handler, if exists. 
524 | 525 | 1 4 ID 'foo' 'foo' 526 | 6 7 EQ '=' '=' 527 | 8 9 NUM '1' 1 528 | 10 11 ADD '+' '+' 529 | 12 15 ID 'bar' 'bar' 530 | 16 17 MUL '*' '*' 531 | 18 19 NUM '2' 2 532 | 533 | ``` 534 | 535 | Moreover, it is OK to declare more lexical patterns under the same 536 | name: 537 | 538 | ``` python 539 | class LangArith(metaclass=LALR.meta): 540 | ... 541 | IGNORED = r' ' 542 | IGNORED = r'\t' 543 | IGNORED = r'#' 544 | ... 545 | POW = r'\*\*' 546 | POW = r'\^' 547 | ... 548 | ``` 549 | 550 | which avoids clustering alternative sub-patterns in one `re` expression. 551 | 552 | In practical use, you might not need to call `Lexer` at all. 553 | 554 | 555 | ## Online-Parsing behind the Scene 556 | 557 | The `parse` and `interpret` methods are implemented internally based 558 | on generators, which is a sort of *online-processing* behavior, i.e. 559 | 560 | ``` 561 | —→ —→ 562 | ``` 563 | 564 | The following block of code calls the routine directly, starts it, and 565 | traces the intermediate states: 566 | 567 | ``` python 568 | # Prepare a parsing routine 569 | p = LangArith.prepare() 570 | 571 | # Start this routine 572 | next(p) 573 | 574 | # Send tokens one-by-one 575 | for token in LangArith.lexer.tokenize('bar = 1 + 2 + + 3', with_end=True): 576 | print("Sends: ", token) 577 | r = p.send(token) 578 | print("Got: ", r) 579 | print() 580 | ``` 581 | 582 | that is, via sending tokens to the parser one-by-one for 583 | interpretation, an internal interpretation stack is maintained and 584 | updated. The top element of the stack is returned wrapped in a `Just` 585 | structure as a response to each token (which can be a reduced result 586 | from a sequence of elements perfectly matching the rule). When token 587 | fails processing a `ParseError` containing useful information is 588 | returned (rather than thrown). 589 | 590 | ``` python-traceback 591 | Sends: ('ID', 'bar') 592 | Got: Just(result=('ID', 'bar')) 593 | 594 | Sends: ('EQ', '=') 595 | Got: Just(result=('EQ', '=')) 596 | 597 | Sends: ('NUM', '1') 598 | Got: Just(result=('NUM', '1')) 599 | 600 | Sends: ('ADD', '+') 601 | Got: Just(result=('ADD', '+')) 602 | 603 | Sends: ('NUM', '2') 604 | Got: Just(result=('NUM', '2')) 605 | 606 | Sends: ('ADD', '+') 607 | Got: Just(result=('ADD', '+')) 608 | 609 | Sends: ('ADD', '+') 610 | Got: Unexpected token ('ADD', '+') at (14:15) 611 | while expecting actions 612 | {'ID': ('shift', 5), 'NUM': ('shift', 6)} 613 | with state stack 614 | [['(assign^ = .assign)'], 615 | ['(assign = ID.EQ expr)'], 616 | ['(assign = ID EQ.expr)'], 617 | ['(assign = ID EQ expr.)', 618 | '(expr = expr.ADD expr)', 619 | '(expr = expr.MUL expr)', 620 | '(expr = expr.POW expr)'], 621 | ['(expr = expr ADD.expr)']] 622 | and subtree stack 623 | ['bar', '=', 3, '+'] 624 | 625 | 626 | Sends: ('NUM', '3') 627 | Got: Just(result=('NUM', '3')) 628 | 629 | Sends: ('\x03', None) 630 | Got: Just(result=6) 631 | ``` 632 | 633 | 634 | # Limitations 635 | 636 | Though this module provides advantageous features, there are also limitations: 637 | 638 | * Parsing grammars with **loops** is not supported. For example, the 639 | grammar 640 | 641 | ``` 642 | P → Q | a 643 | Q → P 644 | ``` 645 | 646 | is *infinitely ambiguous*, which has infinite number of derivations 647 | while processing only finite input, e.g. `"a"`: 648 | 649 | ``` 650 | P ⇒ a 651 | P ⇒ Q ⇒ P ⇒ a 652 | ... 653 | P ⇒ Q ⇒ ... ⇒ P ⇒ a 654 | ``` 655 | 656 | where each derivation corresponds to a parse tree. 
Eager generation
657 | of these trees leads to non-termination during parsing.
658 | 
659 | * Only **legal Python identifiers** can be used as grammar symbols;
660 |   non-alphabetic or reserved names (like `==`, `raise`, etc.) cannot
661 |   (which seems not a serious restriction).
662 | 
663 | * Parsing algorithms are implemented in pure Python, but a speed-up via
664 |   Cython should be possible in the future.
665 | 
666 | 
667 | [Parsing]: https://en.wikipedia.org/wiki/Parsing "Parsing"
668 | [Interpreting]: https://en.wikipedia.org/wiki/Interpreter_(computing) "Interpreter"
669 | [DSL]: https://en.wikipedia.org/wiki/Domain-specific_language "Domain-specific Language"
670 | [BNF]: https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_Form "Backus-Naur Form"
671 | [Earley]: https://en.wikipedia.org/wiki/Earley_parser "Earley"
672 | [LL]: https://en.wikipedia.org/wiki/LL_parser "Left-to-right, Leftmost-derivation"
673 | [GLL]: http://dotat.at/tmp/gll.pdf "General Left-to-right, Leftmost-derivation"
674 | [GLR]: https://en.wikipedia.org/wiki/GLR_parser "General Left-to-right, Rightmost derivation"
675 | [LALR]: https://en.wikipedia.org/wiki/LALR_parser "Look-Ahead Left-to-right, Rightmost-derivation"
676 | [CFG]: https://en.wikipedia.org/wiki/Context-free_grammar "Context-free Grammar"
677 | [Yacc]: https://en.wikipedia.org/wiki/Yacc "Yet Another Compiler Compiler"
678 | [Bison]: https://en.wikipedia.org/wiki/GNU_bison "Bison"
679 | [Parsec]: http://book.realworldhaskell.org/read/using-parsec.html "Parsec"
680 | [instaparse]: https://github.com/Engelberg/instaparse "Instaparse"
681 | [SDT]: https://en.wikipedia.org/wiki/Syntax-directed_translation "Syntax-directed Translation"
682 | [LF]: http://www.csd.uwo.ca/~moreno//CS447/Lectures/Syntax.html/node9.html "Left-factoring"
683 | [ANTLR]: http://www.antlr.org/ "ANother Tool for Language Recognition"
684 | [clojure]: https://clojure.org/ "Clojure"
685 | [PLY]: http://www.dabeaz.com/ply/ "PLY"
686 | [Dangling-Else]: https://en.wikipedia.org/wiki/Dangling_else "Dangling-Else"
687 | 
--------------------------------------------------------------------------------
/metaparse.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import re
4 | import pprint
5 | import warnings
6 | import marshal
7 | import types
8 | import traceback
9 | 
10 | from pprint import pformat
11 | 
12 | from collections import deque
13 | from collections import namedtuple
14 | from collections import OrderedDict as odict
15 | 
16 | 
17 | class Token(namedtuple('Token', 'pos symbol lexeme value')):
18 | 
19 |     @property
20 |     def end(self):
21 |         return self.pos + len(self.lexeme)
22 | 
23 |     def __repr__(self):
24 |         return "({}, {})".format(
25 |             repr(self.symbol), repr(self.value))
26 | 
27 | 
28 | class Rule(namedtuple('Rule', 'lhs rhs')):
29 | 
30 |     def __repr__(self):
31 |         return '({} = {})'.format(
32 |             self.lhs, ' '.join(self.rhs))
33 | 
34 |     @staticmethod
35 |     def from_func(func):
36 |         "Construct a rule object from a function's signature. "
37 |         lhs = func.__name__
38 |         rhs = []
39 |         ac = func.__code__.co_argcount
40 |         vs = func.__code__.co_varnames
41 |         for x in vs[:ac]:
42 |             # Cut trailing digital subscript like xxx_4.
43 |             s = re.search(r'_(\d+)$', x)
44 |             if s:
45 |                 x = x[:s.start()]
46 |             rhs.append(x)
47 |         # Use immutable.
48 | rhs = tuple(rhs) 49 | return Rule(lhs, rhs) 50 | 51 | 52 | class ParseTree(namedtuple('ParseTree', 'node subs')): 53 | 54 | def __repr__(self): 55 | return tuple.__repr__(self) 56 | 57 | @property 58 | def pos(self): 59 | return self.subs[0].pos 60 | 61 | @property 62 | def end(self): 63 | return self.subs[-1].pos 64 | 65 | 66 | def identity(x): 67 | return x 68 | 69 | 70 | # Special token to be delivered by the tokenizer. 71 | END_TOKEN = Token(-1, '\x03', None, None) 72 | 73 | 74 | class Lexer(object): 75 | 76 | class Error(Exception): 77 | pass 78 | 79 | def __init__(self, names=None, patterns=None, handlers=None): 80 | """The Lexer object bookkeeps 3 same-sized parallel lists: 81 | 82 | :names: 83 | 84 | The names of the patterns, which are supposed to be 85 | consistent with dependent `Grammar` object, where they 86 | are terminal symbols. 87 | 88 | :patterns: 89 | 90 | The patterns corresponding to the names with same 91 | indexing. Each pattern is a /compiled/ regular 92 | expression object. 93 | 94 | :handlers: 95 | 96 | The handlers corresponding to the named patterns, 97 | called when successfully tokenizing the named pattern. 98 | 99 | """ 100 | 101 | self.names = names if names else [] 102 | self.patterns = patterns if patterns else [] 103 | self.handlers = handlers if handlers else [] 104 | self.precedence = {} 105 | 106 | def __call__(self, **kw): 107 | """Supporting registering lexical pattern like: 108 | :: 109 | 110 | @my_lexer(INTEGER = r'[1-9][0-9]*') 111 | def handler(value): 112 | return int(value) 113 | 114 | """ 115 | assert ('p' in kw and len(kw) == 2) or len(kw) == 1 116 | prece = kw.pop('p') if 'p' in kw else None 117 | name, pattern = kw.popitem() 118 | if prece: 119 | self.precedence[name] = prece 120 | self.names.append(name) 121 | self.patterns.append(re.compile(pattern)) 122 | self.handlers.append(None) 123 | assert len(self.names) == len(self.patterns) == len(self.handlers) 124 | 125 | def z(func): 126 | 'Swap the last handler with the decorated function.' 127 | self.handlers[-1] = func 128 | 129 | return z 130 | 131 | def __repr__(self): 132 | return 'Lexer{{\n{}}}'.format( 133 | pformat(list(zip(self.names, self.patterns)))) 134 | 135 | def more(self, **kw): 136 | """Register more lexcial name-patterns with one call like:: 137 | 138 | my_lexer.more( 139 | ADD = r'\+', 140 | SUB = r'-', 141 | TIMES = r'\*', 142 | ... 143 | ) 144 | 145 | * Note: 146 | In this case the /order/ of these name-patterns are not preserved! 147 | """ 148 | for name, pat in kw.items(): 149 | self.names.append(name) 150 | self.patterns.append(re.compile(pat)) 151 | self.handlers.append(None) 152 | 153 | def register(self, name, pattern, handler=None, precedence=None): 154 | """Registers lexical pattern directly.""" 155 | self.names.append(name) 156 | self.patterns.append(re.compile(pattern)) 157 | self.handlers.append(handler) 158 | if precedence is not None: 159 | self.precedence[name] = precedence 160 | 161 | def tokenize(self, inp, with_end=False): 162 | """Prepares a generator object, which iteratively finds possible 163 | lexical patterns given input. 164 | 165 | :with_end: 166 | 167 | means delivering the END_TOKEN after reading over the input. 
168 | 169 | """ 170 | names = self.names 171 | patterns = self.patterns 172 | handlers = self.handlers 173 | pos = 0 174 | while pos < len(inp): 175 | match = None 176 | name = None 177 | handler = None 178 | for nm, rgx, hdl in zip(names, patterns, handlers): 179 | match = rgx.match(inp, pos=pos) 180 | if match: 181 | name = nm 182 | handler = hdl 183 | break 184 | else: 185 | raise Lexer.Error( 186 | "No pattern for unrecognized: {}th char in input: '{}'\n" 187 | .format(pos, inp[pos])) 188 | lxm = match.group() 189 | if name == 'IGNORED': 190 | # IGNORED should be associated with no handler. 191 | pass 192 | elif name == 'ERROR': 193 | # ERROR must have a handler, whilst not yielded as a token. 194 | assert handler, 'Each ERROR token must have a handler!' 195 | handler(lxm) 196 | else: 197 | val = handler(lxm) if handler else lxm 198 | yield Token(pos, name, lxm, val) 199 | pos = match.end() 200 | if with_end: 201 | yield END_TOKEN 202 | 203 | 204 | class Grammar(object): 205 | 206 | def __init__(self, rules, precedence=None): 207 | """A `Grammar` object has these attributes: 208 | 209 | Core attributes: 210 | 211 | :start: 212 | The starting syntactic rule of the grammar. 213 | :rules: 214 | All syntactic rules of the grammar. 215 | :nonterminals: 216 | Non-terminal symbols. 217 | :terminals: 218 | Terminal symbols. 219 | :precedence: 220 | Precedence of symbols to resolve LR-conflicts. 221 | 222 | Auxiliary attributes: 223 | 224 | :group: dict 225 | Lookup rules grouped by the same LHS. 226 | :unreachable: set 227 | Unreachable non-terminal symbols by deriving from start rule. 228 | :NULLABLE: set 229 | Nullable rules in the grammar. 230 | :FIRST: dict 231 | The FIRST set of terminal symbols of each non-terminal symbol. 232 | 233 | All these attributes are necessary for performing the 234 | CLOSURE-algorithm, including: 235 | 236 | :closure: 237 | 238 | :closure1_with_lookahead: 239 | 240 | closure with or without lookahead. 241 | 242 | """ 243 | 244 | if not precedence: 245 | precedence = {} 246 | 247 | # Augmented grammar with singleton/non-alternated start-rule. 248 | self.start = rules[0].lhs 249 | self.rules = rules 250 | 251 | # Conclude nonterminals/terminals. 252 | self.nonterminals = set() 253 | self.symbols = set() 254 | for lhs, rhs in rules: 255 | self.nonterminals.add(lhs) 256 | self.symbols.update(rhs) 257 | self.terminals = self.symbols - self.nonterminals 258 | 259 | # Group by LHS 260 | self.group = {nt: [] for nt in self.nonterminals} 261 | for i, (lhs, rhs) in enumerate(rules): 262 | self.group[lhs].append(i) 263 | # Collect unreachable nonterminal from start symbol. 264 | reachable = {self.start} 265 | while 1: 266 | news = set() 267 | for X in reachable: 268 | for j in self.group[X]: 269 | for Y in self.rules[j].rhs: 270 | if Y in self.nonterminals: 271 | if Y not in reachable: 272 | news.add(Y) 273 | if news: 274 | reachable.update(news) 275 | else: 276 | break 277 | self.unreachable = self.nonterminals - reachable 278 | 279 | # precedence is not only specifiable for tokens, but also for 280 | # symbols. 
281 | self.precedence = precedence 282 | 283 | # Calc NULLABLE 284 | self.NULLABLE = NULLABLE = set() 285 | while 1: 286 | has_new = False 287 | for lhs, rhs in rules: 288 | if all(x in NULLABLE for x in rhs): 289 | if lhs not in NULLABLE: 290 | NULLABLE.add(lhs) 291 | has_new = True 292 | if not has_new: 293 | break 294 | 295 | # Calc FIRST 296 | self.FIRST = FIRST = {} 297 | for t in self.terminals: 298 | FIRST[t] = {t} 299 | for nt in self.nonterminals: 300 | FIRST[nt] = set() 301 | if nt in NULLABLE: 302 | FIRST[nt].add('EPSILON') 303 | while 1: 304 | has_new = False 305 | for lhs, rhs in rules: 306 | # Use the FIRST[rhs] to update FIRST[lhs]. 307 | for Y in rhs: 308 | for a in FIRST[Y]: 309 | if a not in FIRST[lhs]: 310 | FIRST[lhs].add(a) 311 | has_new = True 312 | if Y not in NULLABLE: 313 | break 314 | if not has_new: 315 | break 316 | 317 | def __repr__(self): 318 | return pprint.pformat(self.rules) 319 | 320 | def first(self, X): 321 | if X in self.FIRST: 322 | return self.FIRST[X] 323 | else: 324 | return {X} 325 | 326 | def first_of_seq(self, seq, tail): 327 | assert tail != 'EPSILON' 328 | s = set() 329 | # `for-else` structure: do-and-find sth, if not found, run `else`. 330 | for Y in seq: 331 | s.update(self.first(Y)) 332 | if Y not in self.NULLABLE: 333 | break 334 | else: 335 | # `else` is executed only when `for` is not broken out. 336 | s.add(tail) 337 | s.discard('EPSILON') 338 | return s 339 | 340 | def closure(self, I): 341 | """Naive closure algorithm on item set :I:.""" 342 | G = self 343 | C = I[:] 344 | z = 0 345 | while z < len(C): 346 | (i, p) = C[z] 347 | if p < len(G.rules[i].rhs): 348 | X = G.rules[i].rhs[p] 349 | if X in G.nonterminals: 350 | for j in G.group[X]: 351 | if (j, 0) not in C: 352 | C.append((j, 0)) 353 | z += 1 354 | return C 355 | 356 | def closure1_with_lookahead(self, item, a): 357 | """Lookahead closure algorithm on item set [(:item:, :a:)].""" 358 | G = self 359 | C = [(item, a)] 360 | z = 0 361 | while z < len(C): 362 | (i, p), a = C[z] 363 | if p < len(G.rules[i].rhs): 364 | X = G.rules[i].rhs[p] 365 | if X in G.nonterminals: 366 | for j in G.group[X]: 367 | for b in G.first_of_seq(G.rules[i].rhs[p+1:], a): 368 | if ((j, 0), b) not in C: 369 | C.append(((j, 0), b)) 370 | z += 1 371 | return C 372 | 373 | class meta(type): 374 | 375 | class Reader(list): 376 | 377 | def __getitem__(self, k): 378 | raise KeyError() 379 | 380 | def __setitem__(self, k, v): 381 | if callable(v): 382 | self.append(Rule.from_func(v)) 383 | else: 384 | pass 385 | 386 | @classmethod 387 | def __prepare__(mcls, name, bases, *a, **kw): 388 | return Grammar.meta.Reader() 389 | 390 | def __new__(mcls, n, b, r): 391 | return Grammar(list(r)) 392 | 393 | 394 | def augment(rules, semans): 395 | """Augment language (rules, semantics) with a top rule and a top 396 | semantics. 397 | 398 | """ 399 | assert len(rules) == len(semans) 400 | start = rules[0].lhs 401 | rules = [Rule(start+'^', (start,))] + rules 402 | semans = [identity] + semans 403 | assert len(rules) == len(semans) 404 | return rules, semans 405 | 406 | 407 | class GSS(namedtuple('GSS', 'tail head')): 408 | 409 | """Graph Structured Stack: a memory-friendly structure for forking 410 | states during generalized parsing, which is identical to CONS 411 | structure in LISP. """ 412 | 413 | def to_list(self): 414 | 'Stack safety.' 
415 | gss = self 416 | l = deque() 417 | while gss is not Nil: 418 | l.appendleft(gss.head) 419 | gss = gss.tail 420 | return l 421 | 422 | def __repr__(self): 423 | return repr(self.to_list()) 424 | 425 | Nil = GSS(None, None) 426 | 427 | 428 | # In order to supply an API, syntax error during parsing may be 429 | # returned as object containing error information. 430 | # class MetaparseSyntaxError(SyntaxError): 431 | # def __init__(self, *a, lineno=None, offset=None): 432 | # super(MetaparseSyntaxError, self).__init__(*a) 433 | # self.lineno = lineno 434 | # self.offset = offset 435 | Just = namedtuple('Just', 'result') 436 | 437 | 438 | class LanguageError(Exception): 439 | # FIXME: Should contain some data attributes? 440 | def __init__(self, message): 441 | self.message = message 442 | 443 | 444 | class ParseError(Exception): 445 | 446 | def __init__(self, token, action, stack, tree_stack): 447 | """Record for syntactic error information during parsing. 448 | - thrown/returned during parsing? 449 | - handler? 450 | 451 | - May need to associate syntax error handler to the parser! 452 | - How to define such a handler? 453 | - For each rule? 454 | - Error correction? 455 | 456 | - Or even semantic error handler? 457 | - A handler defined to check the whole content of argument stack! 458 | - translation (i.e. applying semantics to arguments in arg-stack) 459 | only available after such check. 460 | - To be thrown in the rule-seman-body 461 | - To be catched and reported by the parsing routine 462 | """ 463 | 464 | """Which information to be included? 465 | 466 | - The syntax tree being constructed -- exactly the active item 467 | in the current state (top of stack), as well as the expected 468 | token. 469 | 470 | - The range of input text corresponding to the syntax tree? 471 | 472 | """ 473 | msg = ('Unexpected token {} at ({}:{})\n' 474 | 'while expecting actions \n{}\n' 475 | 'with state stack \n{}\n' 476 | 'and subtree stack \n{}\n' 477 | .format( 478 | token, 479 | token.pos, token.end, 480 | pformat(action), 481 | pformat(stack), 482 | pformat(tree_stack))) 483 | 484 | super(ParseError, self).__init__(msg) 485 | # self.tree = tree 486 | self.token = token 487 | self.action = action 488 | self.stack = stack 489 | 490 | 491 | class GLR(object): 492 | 493 | """Generalized LR parser with lookahead. 494 | 495 | - It is the generalized version of LALR parser, thus being 496 | slightly more powerful than typical GLR(0) parser due to 497 | utilization of lookhead. 498 | 499 | """ 500 | 501 | def __init__(self, lexer=None, rules=None, precedence=None): 502 | self.rules = rules if rules else [] 503 | self.precedence = precedence if precedence else {} 504 | self.lexer = lexer if lexer else Lexer() 505 | self.semans = [] 506 | 507 | assert isinstance(self.lexer, Lexer) 508 | assert isinstance(self.precedence, dict) 509 | assert isinstance(self.rules, list) 510 | assert isinstance(self.semans, list) 511 | 512 | def rule(self, func): 513 | rule = Rule.from_func(func) 514 | self.rules.append(rule) 515 | self.semans.append(func) 516 | 517 | def make(self): 518 | 519 | # Augmented lexer - ignoring spaces by default. 520 | lexes = set(self.lexer.names) 521 | if 'IGNORED' not in lexes: 522 | self.lexer.register('IGNORED', r'\s+') 523 | 524 | # Augmented grammar - top semantics 525 | self.rules, self.semans = augment(self.rules, self.semans) 526 | 527 | # Propagate precedence from lexer. 
528 | if self.lexer.precedence: 529 | self.precedence.update(self.lexer.precedence) 530 | 531 | # Prepare Grammar object to use closure algorithms. 532 | G = Grammar(self.rules, self.precedence) 533 | 534 | # if 'ERROR' not in self.lexer.handler: 535 | # warnings.warn( 536 | # "No ERROR handler available. " 537 | # "Lexer will fail when reading unrecognized character.") 538 | 539 | # Check coverage of Lexer. 540 | # - Each terminal should have its corresponding lexical pattern. 541 | for r, rule in enumerate(G.rules): 542 | for y in rule.rhs: 543 | if y in G.terminals and y not in lexes: 544 | msg = ('No lexical pattern provided ' 545 | 'for terminal symbol: {}\n' 546 | '- in {}th rule {}\n' 547 | ).format(y, r, rule) 548 | seman = self.semans[r] 549 | trc = traceback.format_list([ 550 | (seman.__code__.co_filename, 551 | seman.__code__.co_firstlineno, 552 | seman.__name__, 553 | '')])[0] 554 | trc_msg = ('- with helping traceback (if available): \n' 555 | '{}\n').format(trc) 556 | lex_msg = ('- declared lexes: {}\n').format(self.lexer) 557 | raise LanguageError(msg + trc_msg + lex_msg) 558 | 559 | # Report soundness of grammar (unreachable, loops, etc). 560 | for X in G.unreachable: 561 | for i in G.group[X]: 562 | seman = self.semans[i] 563 | trc = traceback.format_list([ 564 | (seman.__code__.co_filename, 565 | seman.__code__.co_firstlineno, 566 | seman.__name__, 567 | '')])[0] 568 | msg = ('There are unreachable nonterminals at {}th rule: {}.\n' 569 | '- with helping traceback: \n{}\n' 570 | ).format(i, G.unreachable, trc) 571 | # warnings.warn(msg) 572 | raise LanguageError(msg) 573 | 574 | # Kernel sets and corresponding GOTO 575 | self.Ks = Ks = [[(0, 0)]] 576 | self.GOTO = GOTO = [] 577 | 578 | # Make LR(0) kernel sets Ks and GOTO, incrementally. 579 | i = 0 580 | while i < len(Ks): 581 | I = Ks[i] 582 | igotoset = odict() 583 | for (nk, p) in G.closure(I): 584 | if p < len(G.rules[nk].rhs): 585 | X = G.rules[nk].rhs[p] 586 | if X not in igotoset: 587 | igotoset[X] = [] 588 | if (nk, p+1) not in igotoset[X]: 589 | # (nk, p+1) is the shifted item of (nk, p) 590 | igotoset[X].append((nk, p+1)) 591 | igoto = {} 592 | for X, J in igotoset.items(): 593 | J.sort() 594 | if J in Ks: 595 | igoto[X] = Ks.index(J) 596 | else: 597 | igoto[X] = len(Ks) 598 | Ks.append(J) 599 | GOTO.append(igoto) 600 | i += 1 601 | 602 | # Lookahead set corresponding to item set 603 | self.Ls = Ls = [[set() for _ in K] for K in Ks] 604 | 605 | Ls[0][0] = {'\x03'} 606 | # Ls[0][0] = {'$'} 607 | 608 | DUMMY = '\x00' 609 | propa = [] 610 | for i, K in enumerate(Ks): 611 | for ii, itm in enumerate(K): 612 | C = G.closure1_with_lookahead(itm, DUMMY) 613 | # for each non-kernel nk 614 | for (nk, p), a in C: 615 | # active 616 | if p < len(G.rules[nk].rhs): 617 | # actor 618 | X = G.rules[nk].rhs[p] 619 | # target item 620 | j = GOTO[i][X] 621 | jj = Ks[j].index((nk, p+1)) 622 | # spontaneous 623 | if a != DUMMY: 624 | Ls[j][jj].add(a) 625 | # propagated 626 | else: 627 | propa.append(( 628 | # from K[i], ii'th item 629 | (i, ii), 630 | # to K[j], jj'th item 631 | (j, jj), 632 | )) 633 | else: 634 | # Handle ended item here? 635 | # 636 | # No. The item to be reduced should share the 637 | # set of lookaheads of kernel item whilst this 638 | # set is yet to be accomplished. 
639 | pass 640 | 641 | # Propagation till fix-point 642 | self.propa = propa 643 | while 1: 644 | has_new = False 645 | for (i, ii), (j, jj) in propa: 646 | for a in Ls[i][ii]: 647 | if a not in Ls[j][jj]: 648 | Ls[j][jj].add(a) 649 | has_new = True 650 | if not has_new: 651 | break 652 | 653 | # Conclude lookahead actions allowing conflicts on identical 654 | # lookaheads. 655 | # self.ACTION = ACTION = [set() for _ in Ks] 656 | self.ACTION = ACTION = [{} for _ in Ks] 657 | for A, Xto in zip(ACTION, GOTO): 658 | for X, j in Xto.items(): 659 | if X in G.terminals: 660 | if X not in A: 661 | A[X] = set() 662 | A[X].add(('shift', j)) 663 | for K, L, A in zip(Ks, Ls, ACTION): 664 | for k, l in zip(K, L): 665 | for (c, q), b in G.closure1_with_lookahead(k, DUMMY): 666 | # Accept state. 667 | if c == 0 and q == 1: 668 | if '\x03' not in A: 669 | A['\x03'] = {('accept', 0)} 670 | # IMPORTANT: kernel/non-kernels which are ended! 671 | elif q == len(G.rules[c].rhs): 672 | # Spontaneous reduction 673 | if b != DUMMY: 674 | if b not in A: 675 | A[b] = set() 676 | A[b].add(('reduce', c)) 677 | # Propagated from lookaheads of kernel item 678 | # being closed 679 | else: 680 | for a in l: 681 | if a not in A: 682 | A[a] = set() 683 | A[a].add(('reduce', c)) 684 | 685 | # TODO: Resolving conflicts with symbol precedence 686 | # - Resolution can filter some invalid actions in ACTION 687 | # for GLR. 688 | # - Use phantom-precedence to decide! 689 | # - decider for shift: the left neighbor of item actor symbol 690 | # - decider for reduce: the lookahead symbol 691 | # - For any action in ACTION[i], i.e. A: 692 | # - if the decider has no precedence, it must be preserved; 693 | # - if the decider has highest precedence among A, it must be 694 | # preserved; 695 | # - otherwise, it gets excluded. 696 | # if self.precedence: 697 | # def prsv(i, look, action): 698 | # if Ks[i] 699 | # act, arg = action 700 | # if act == 'reduce': 701 | return 702 | 703 | def prepare_generalized(self, interpret=True): 704 | """Prepare a parsing coroutine which accepts tokens.""" 705 | agenda = deque() 706 | agenda.append((GSS(Nil, 0), Nil)) 707 | tokens = [] 708 | # results = ddict(list) 709 | 710 | token = yield None 711 | tokens.append(token) 712 | while 1: 713 | 714 | agenda_bak = deque(agenda) 715 | agenda_new = deque() 716 | 717 | # Dead states for error reporting. 718 | dead = [] 719 | 720 | while agenda: 721 | 722 | sstk, tstk = agenda.popleft() 723 | s = sstk.head 724 | 725 | if token.symbol in self.ACTION[s]: 726 | 727 | for act, arg in self.ACTION[s][token.symbol]: 728 | 729 | sstk1, tstk1 = sstk, tstk 730 | 731 | if act == 'reduce': 732 | tar_rule = self.rules[arg] 733 | subs = deque() 734 | for _ in tar_rule.rhs: 735 | # Pop from GSS 736 | sstk1 = sstk1.tail 737 | tstk1, sub = tstk1.tail, tstk1.head 738 | subs.appendleft(sub) 739 | if interpret: 740 | tree = self.semans[arg](*subs) 741 | else: 742 | tree = ParseTree(tar_rule.lhs, 743 | list(subs)) 744 | 745 | # NOTE: 746 | # 747 | # - Each state during cascaded reduction 748 | # should be added to the forks! 749 | # 750 | # - Intermediate reduction items may or 751 | # may not have a GOTO target! If no, 752 | # such items are denoted as "dead" - 753 | # they show possible expectations. 
754 | if tar_rule.lhs in self.GOTO[sstk1.head]: 755 | tar_trans = self.GOTO[sstk1.head][tar_rule.lhs] 756 | agenda.append( 757 | # Push into GSS 758 | (GSS(sstk1, tar_trans), 759 | GSS(tstk1, tree))) 760 | else: 761 | dead.append( 762 | (sstk1, tstk1)) 763 | 764 | elif act == 'accept': 765 | agenda_new.append( 766 | (sstk1, tstk1)) 767 | 768 | elif act == 'shift': 769 | agenda_new.append( 770 | (GSS(sstk1, arg), 771 | GSS(tstk1, 772 | token.value if interpret else token))) 773 | else: 774 | dead.append((sstk, tstk)) 775 | 776 | if not agenda_new: 777 | token = yield [ 778 | ParseError(token, self.ACTION[ss.head], ss, aa) 779 | for ss, aa in dead 780 | ] 781 | agenda = agenda_bak 782 | else: 783 | token = yield [Just(ts) for ss, ts in agenda_new] 784 | tokens.append(token) 785 | agenda = agenda_new 786 | 787 | def parse_generalized(self, inp, interpret=False): 788 | assert hasattr(self, 'ACTION'), \ 789 | 'Call your_parser.make() to build the parser first!' 790 | p = self.prepare_generalized(interpret) 791 | next(p) 792 | for token in self.lexer.tokenize(inp, False): 793 | rs = p.send(token) 794 | else: 795 | rs = p.send(END_TOKEN) 796 | return [r.result[-1] for r in rs] 797 | 798 | def interpret_generalized(self, inp): 799 | return self.parse_generalized(inp, True) 800 | 801 | def dumps(self): 802 | 'Dump this parser instance to readable Python code string.' 803 | 804 | tar = odict() 805 | 806 | tar['names'] = self.lexer.names 807 | tar['patterns'] = [ 808 | rgx.pattern for rgx in self.lexer.patterns 809 | ] 810 | tar['handlers'] = [ 811 | marshal.dumps(h.__code__) if h else None 812 | for h in self.lexer.handlers 813 | ] 814 | 815 | tar['rules'] = [tuple(rl) for rl in self.rules] 816 | tar['ACTION'] = self.ACTION 817 | tar['GOTO'] = self.GOTO 818 | 819 | tar['semans'] = [ 820 | marshal.dumps(f.__code__) 821 | for f in self.semans 822 | ] 823 | 824 | return '\n'.join( 825 | '{} = {}\n'.format(k, pformat(v)) 826 | for k, v in tar.items()) 827 | 828 | def dump(self, filename): 829 | with open(filename, 'w') as o: 830 | o.write(self.dumps()) 831 | 832 | @staticmethod 833 | def loads(src, env=globals()): 834 | 'Load a dumped code string and make a usable parse instance.' 835 | ctx = {} 836 | exec(src, env, ctx) 837 | 838 | names = ctx.pop('names') 839 | patterns = [re.compile(pat) 840 | for pat in ctx.pop('patterns')] 841 | handlers = [ 842 | types.FunctionType(marshal.loads(co), env) if co else None 843 | for co in ctx.pop('handlers') 844 | ] 845 | p = LALR(Lexer(names, patterns, handlers)) 846 | 847 | p.rules = [Rule(*rl) for rl in ctx.pop('rules')] 848 | p.semans = [ 849 | types.FunctionType(marshal.loads(co), env) 850 | for co in ctx.pop('semans') 851 | ] 852 | p.ACTION = ctx.pop('ACTION') 853 | p.GOTO = ctx.pop('GOTO') 854 | return p 855 | 856 | @staticmethod 857 | def load(filename, env=globals()): 858 | with open(filename, 'r') as o: 859 | return LALR.loads(o.read(), env=env) 860 | 861 | # Helper for easy reading/tracing/debugging. 862 | def show_item(self, item): 863 | i, p = item 864 | lhs, rhs = self.rules[i] 865 | return '({} = {}.{})'.format(lhs, 866 | ' '.join(rhs[:p]), 867 | ' '.join(rhs[p:])) 868 | 869 | def show_itemset(self, i): 870 | return ([self.show_item(tm) for tm in self.Ks[i]]) 871 | 872 | def show_action(self, action): 873 | act, arg = action 874 | if act == 'shift': 875 | return (act, self.show_itemset(arg)) 876 | else: 877 | return (act, self.rules[arg]) 878 | 879 | # Various style of declaration. 
880 | def __getitem__(self, k): 881 | raise KeyError() 882 | 883 | def __setitem__(self, k, v): 884 | 'This method is used to register attributes.' 885 | 886 | # Docstring of instance. 887 | if k == '__doc__': 888 | self.__doc__ = v 889 | 890 | # Built-in attributes ignored. 891 | elif k.startswith('__') and k.endswith('__'): 892 | pass 893 | 894 | # Lexical element. 895 | elif isinstance(v, str): 896 | self.lexer.register(k, v) 897 | 898 | # Lexical element with precedence. 899 | elif isinstance(v, tuple): 900 | assert len(v) == 2 901 | pat, prece = v 902 | self.lexer.register(k, pat) 903 | if prece in self.precedence: 904 | raise ValueError( 905 | 'Repeated specifying the precedence ' 906 | 'of symbol: {}'.format(k)) 907 | else: 908 | self.precedence[k] = prece 909 | 910 | # Method as handler... 911 | elif callable(v): 912 | parlist = v.__code__.co_varnames[:v.__code__.co_argcount] 913 | # for new lexical element. 914 | if len(parlist) == 1 and parlist[0] in ('lex', 'LEX'): 915 | for prm, pat in v.__annotations__.items(): 916 | if prm == 'return': 917 | self.precedence[k] = pat 918 | else: 919 | self.lexer.register(k, pat, v) 920 | # for existing lexical element 921 | elif any(k == lx for lx in self.lexer.names): 922 | assert len(parlist) == 1 923 | for i, lx in reversed( 924 | list(enumerate(self.lexer.names))): 925 | if lx == k: 926 | self.lexer.handlers[i] = v 927 | # for syntax rule, i.e. semantics. 928 | else: 929 | self.rule(v) 930 | 931 | def __enter__(self): 932 | return self.lexer, self.rule 933 | 934 | def __exit__(self, *a, **kw): 935 | self.make() 936 | 937 | class meta(type): 938 | 939 | @classmethod 940 | def __prepare__(mcls, name, bases, *a, **kw): 941 | return GLR(*a, **kw) 942 | 943 | def __new__(mcls, m, bs, p, **kw): 944 | p.make() 945 | return p 946 | 947 | @classmethod 948 | def verbose(cls, func_def): 949 | "Polymorphic class method which tends to be overriden." 950 | assert func_def.__code__.co_argcount == 2 951 | p = cls() 952 | func_def(p.lexer, p.rule) 953 | p.make() 954 | return p 955 | 956 | 957 | class LALR(GLR): 958 | 959 | """LookAhead LR parser. 960 | 961 | - Can use precedence of tokens to resolve conflicts. 962 | 963 | """ 964 | 965 | def make(self): 966 | # Make GLALR(1) automaton. 967 | super(LALR, self).make() 968 | # Resolve conflicts with precedence. 969 | Ks = self.Ks 970 | ACTION = self.ACTION 971 | ACTION1 = [{} for _ in Ks] 972 | for i, A in enumerate(ACTION): 973 | A1 = ACTION1[i] 974 | # Try add (act, arg) into A1. 975 | for a, actargs in A.items(): 976 | for act, arg in actargs: 977 | # It is assured that 'shift' is added earlier than 'reduce' 978 | if a in A1: 979 | # Conflict resolver here! 
980 | act0, arg0 = A1[a] 981 | if {act0, act} == {'shift', 'reduce'}: 982 | if act0 == 'reduce': 983 | s, s_i = act, arg 984 | r, r_r = act0, arg0 985 | else: 986 | s, s_i = act0, arg0 987 | r, r_r = act, arg 988 | redu = self.rules[r_r] 989 | if a in self.precedence: 990 | if len(redu.rhs) > 1 and \ 991 | redu.rhs[-2] in self.precedence: 992 | lft = redu.rhs[-2] 993 | rgt = a 994 | if self.precedence[lft] >= \ 995 | self.precedence[rgt]: 996 | A1[a] = (r, r_r) 997 | else: 998 | A1[a] = (s, s_i) 999 | continue 1000 | # Unable to resolve 1001 | msg = ("\n" 1002 | "Handling item set: \n" "{}\n" 1003 | "Conflict on lookahead: {} \n" 1004 | "- {}\n" "- {}\n" 1005 | ).format( 1006 | self.show_itemset(i), 1007 | a, 1008 | self.show_action(A1[a]), 1009 | self.show_action((act, arg))) 1010 | raise LanguageError(msg) 1011 | else: 1012 | A1[a] = (act, arg) 1013 | 1014 | self.ACTION = ACTION1 1015 | 1016 | def prepare(self, interpret=True): 1017 | """Prepare a parsing coroutine which accepts tokens.""" 1018 | sstk = [0] # state stack 1019 | tstk = [] # subtree stack 1020 | token = yield Just(None) 1021 | 1022 | while 1: 1023 | 1024 | if token.symbol in self.ACTION[sstk[-1]]: 1025 | act, arg = self.ACTION[sstk[-1]][token.symbol] 1026 | # Active tree set default to token. 1027 | tree = token 1028 | 1029 | # Reduce (no new token fetched during reduction) 1030 | if act == 'reduce': 1031 | subs = deque() 1032 | for _ in self.rules[arg].rhs: 1033 | sstk.pop() 1034 | subs.appendleft(tstk.pop()) 1035 | 1036 | if interpret: 1037 | tree = self.semans[arg](*subs) 1038 | else: 1039 | tree = ParseTree(self.rules[arg].lhs, list(subs)) 1040 | 1041 | # Transfer with reduced symbol. 1042 | sstk.append(self.GOTO[sstk[-1]][self.rules[arg].lhs]) 1043 | tstk.append(tree) 1044 | 1045 | # Accept 1046 | if act == 'accept': 1047 | assert sstk.pop() == 1 1048 | tree = tstk.pop() 1049 | assert sstk == [0], sstk 1050 | assert tstk == [], tstk 1051 | # Now parsing routine is identical to the initial 1052 | # state and can start a new round, thus no need 1053 | # to create new routines for more parsing tasks. 1054 | token = yield Just(tree) 1055 | 1056 | # Shift 1057 | elif act == 'shift': 1058 | sstk.append(arg) 1059 | tstk.append(token.value if interpret else token) 1060 | token = yield Just(tree) 1061 | 1062 | else: 1063 | token = yield ParseError( 1064 | token, 1065 | self.ACTION[sstk[-1]], 1066 | [self.show_itemset(s) for s in sstk], 1067 | tstk) 1068 | 1069 | def parse(self, inp, interpret=False): 1070 | assert hasattr(self, 'ACTION'), \ 1071 | 'Call yourparser.make() to build the parser first!' 1072 | rtn = self.prepare(interpret) 1073 | next(rtn) 1074 | for token in self.lexer.tokenize(inp, False): 1075 | opt = rtn.send(token) 1076 | if isinstance(opt, ParseError): 1077 | warnings.warn(opt) 1078 | just = rtn.send(END_TOKEN) 1079 | return just.result 1080 | 1081 | def interpret(self, inp): 1082 | assert self.semans, 'Must have semantics to interpret.' 1083 | return self.parse(inp, True) 1084 | 1085 | class meta(type): 1086 | 1087 | def __prepare__(mcls, *a, **kw): 1088 | return LALR(*a, **kw) 1089 | 1090 | def __new__(mcls, m, bs, p, **kw): 1091 | p.make() 1092 | return p 1093 | 1094 | 1095 | class Inspector(LALR): 1096 | 1097 | """Collection of methods for inspecting LALR parser object's 1098 | attributes. 1099 | 1100 | - Since the representation of structures in LALR are raw 1101 | integers/pairs as indices, these methods help inspect indexed 1102 | objects. 1103 | 1104 | - They are organized here to avoid clustering. 
1105 | 1106 | """ 1107 | 1108 | def inspect_Ks(self): 1109 | pprint.pprint([(k, [self.show_item(itm) for itm in K]) 1110 | for k, K in enumerate(self.Ks)]) 1111 | 1112 | def inspect_lkhs(self): 1113 | pprint.pprint([ 1114 | [(i, self.show_item(self.Ks[i][ii])), 1115 | (j, self.show_item(self.Ks[j][jj]))] 1116 | for (i, ii), (j, jj) in self.propa 1117 | ]) 1118 | 1119 | def inspect_propa(self): 1120 | pprint.pprint([ 1121 | [(i, self.show_item(self.Ks[i][ii])), 1122 | (j, self.show_item(self.Ks[j][jj]))] 1123 | for (i, ii), (j, jj) in self.propa 1124 | ]) 1125 | 1126 | def inspect_Ls(self): 1127 | pprint.pprint([ 1128 | (i, [(self.show_item(itm), lkhs) 1129 | for itm, lkhs in zip(K, self.Ls[i])]) 1130 | for i, K in enumerate(self.Ks) 1131 | ]) 1132 | 1133 | def inspect_ACTION(self): 1134 | pprint.pprint([ 1135 | (i, self.show_itemset(i), self.ACTION[i]) 1136 | for i, K in enumerate(self.Ks) 1137 | ]) 1138 | 1139 | def inspect_GOTO(self): 1140 | pprint.pprint([ 1141 | (i, self.show_itemset(i), self.GOTO[i]) 1142 | for i, K in enumerate(self.Ks) 1143 | ]) 1144 | 1145 | --------------------------------------------------------------------------------