├── tests ├── __init__.py ├── tests_error_handling │ ├── __init__.py │ └── test_language_errors.py ├── run.bat ├── test_trivial.py ├── test_church.py ├── test_non_lalr.py ├── test_paren.py ├── test_verbose_reader.py ├── test_first.py ├── test_with_reader.py ├── test_resolve.py ├── test_lrval_dumpsloads.py ├── test_grammar.py ├── test_generalized.py ├── test_conflict.py ├── test_prece.py ├── test_arith.py ├── test_pystruct.py ├── test_courses.py ├── test_basic.py └── sexp_dump.py ├── MANIFEST ├── examples ├── preamble.py ├── eg_online.py ├── eg_ambig.py ├── eg_direct.py ├── eg_precedence.py ├── eg_dangling.py ├── eg_dumps.py ├── eg_logic.py ├── eg_demo_py2.py ├── eg_lisp.py ├── eg_demo_py3.py ├── eg_pystructs.py ├── eg_read_ebnf.py ├── eg_read_yacc.py ├── eg_demo.py ├── eg_demo_dump.py ├── eg_dumps_file.py ├── eg_func_lang.py ├── sexp_dump.py └── eg_dumps_direct_use.py ├── experiments ├── preamble.py ├── frontend_styles.py ├── only_syntax.py ├── LL.py ├── peg.py ├── meta_dumps.py └── meta_dumps_standalone.py ├── .gitignore ├── setup.py ├── .gitattributes ├── README.md └── metaparse.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/tests_error_handling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/run.bat: -------------------------------------------------------------------------------- 1 | python -m unittest discover . -v 2 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | metaparse.py 3 | setup.py 4 | -------------------------------------------------------------------------------- /examples/preamble.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # Include parent path for testing 5 | sys.path.append( 6 | os.path.abspath( 7 | os.path.join( 8 | os.path.dirname(__file__), 9 | os.pardir 10 | ))) 11 | -------------------------------------------------------------------------------- /experiments/preamble.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # Include parent path for testing 5 | sys.path.append( 6 | os.path.abspath( 7 | os.path.join( 8 | os.path.dirname(__file__), 9 | os.pardir 10 | ))) 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.sqlite 3 | *.stats 4 | *.html 5 | *.py# 6 | *.cache 7 | *_raw.py 8 | .ipython 9 | metaparse.egg-info 10 | dist 11 | *.ipython 12 | *.python_history 13 | __pycache__ 14 | *.grip 15 | 16 | venv 17 | .coverage 18 | coverage_html 19 | 20 | experiments* -------------------------------------------------------------------------------- /examples/eg_online.py: -------------------------------------------------------------------------------- 1 | from eg_demo import * 2 | 3 | # Prepare a parsing routine 4 | p = pCalc.prepare() 5 | 6 | # Start this routine 7 | next(p) 8 | 9 | # Send tokens one-by-one 10 | for token in pCalc.lexer.tokenize('bar = 1 + 2 + + 3', with_end=True): 11 | print("Sends: ", token) 12 | r = p.send(token) 
13 | print("Got: ", r) 14 | print() 15 | 16 | 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from distutils.core import setup 4 | 5 | import metaparse 6 | 7 | setup(name='metaparse', 8 | version='0.1', 9 | description='A tool for powerful instant parsing supported by optional algorithms.', 10 | author='Xuelei Li', 11 | author_email='lixuelei86@gmail.com', 12 | url='https://github.com/Shellay/metaparse', 13 | py_modules=['metaparse'], 14 | ) 15 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /tests/test_trivial.py: -------------------------------------------------------------------------------- 1 | from metaparse import * 2 | from pprint import pprint 3 | 4 | class S(metaclass=LALR.meta): 5 | # class S(metaclass=GLR.meta): 6 | a, b, c = 'abc' 7 | def S(A, B, C): return (A, *B, C) 8 | def A(a): return a 9 | def A(): return () 10 | def B(): return () 11 | def B(B, b): return B + (b,) 12 | def C(c): return c 13 | 14 | 15 | # pprint([*p.lexer.tokenize('abbbc', True)]) 16 | 17 | from unittest import main, TestCase 18 | 19 | class Test(TestCase): 20 | def test(self): 21 | r = S.interpret('abbbbc') 22 | self.assertEqual(r, ('a', 'b', 'b', 'b', 'b', 'c')) 23 | 24 | 25 | # pprint(p.__dict__) 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /tests/test_church.py: -------------------------------------------------------------------------------- 1 | from metaparse import LALR 2 | 3 | class LangChurch(metaclass=LALR.meta): 4 | 5 | """ 6 | Grammar for interpreting Church-Numerals. 7 | """ 8 | 9 | ZERO = r'zero' 10 | SUCC = r'succ' 11 | 12 | def num(ZERO): 13 | return 0 14 | 15 | def num(SUCC, num): 16 | return num + 1 17 | 18 | 19 | import unittest 20 | class Test(unittest.TestCase): 21 | def test_church(self): 22 | self.assertEqual(LangChurch.interpret('zero') , 0) 23 | self.assertEqual(LangChurch.interpret('succ zero') , 1) 24 | self.assertEqual(LangChurch.interpret('succ succ zero') , 2) 25 | self.assertEqual(LangChurch.interpret('succ succ succ zero') , 3) 26 | 27 | if __name__ == '__main__': 28 | unittest.main() 29 | -------------------------------------------------------------------------------- /tests/test_non_lalr.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from metaparse import * 3 | 4 | 5 | class TestConflicts(unittest.TestCase): 6 | 7 | def test_conflicts(self): 8 | 9 | with self.assertRaises(LanguageError): 10 | 11 | class G(metaclass=LALR.meta): 12 | 'A Grammar.meta which is LR(1) but not LALR(1).' 
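            # Why this grammar is LR(1) but not LALR(1) (explanatory note, not
            # part of the original file): the canonical LR(1) automaton keeps
            # separate states for the item sets reached via 'a' and via 'b', so
            # the reduction A -> c . has lookahead {d} in one state and {e} in
            # the other (and symmetrically for B -> c .). LALR(1) merges those
            # states, both reductions end up with lookaheads {d, e}, and a
            # reduce/reduce conflict appears; that conflict is the LanguageError
            # this test expects LALR.meta to raise.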
13 | 14 | a = r'a' 15 | b = r'b' 16 | c = r'c' 17 | d = r'd' 18 | e = r'e' 19 | 20 | def S(a, A, d): return 21 | def S(b, B, d): return 22 | def S(a, B, e): return 23 | def S(b, A, e): return 24 | 25 | def A(c): return c 26 | def B(c): return c 27 | 28 | 29 | if __name__ == '__main__': 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /tests/test_paren.py: -------------------------------------------------------------------------------- 1 | from metaparse import * 2 | 3 | class LangParen(metaclass=LALR.meta): 4 | 5 | """ 6 | Grammar for matching arbitrary paired parenthesises. 7 | """ 8 | 9 | END = r'\$' 10 | LEFT = r'\(' 11 | RIGHT = r'\)' 12 | 13 | 14 | def top(pair): 15 | return pair 16 | 17 | 18 | def pair(LEFT, pair_1, RIGHT, pair_2): 19 | return '<' + pair_1 + '>' + pair_2 20 | 21 | 22 | def pair(): 23 | return '' 24 | 25 | 26 | from unittest import main, TestCase 27 | 28 | class Test(TestCase): 29 | 30 | def test_paren(self): 31 | 32 | assert LangParen.interpret('()') == '<>' 33 | assert LangParen.interpret('( ( ) )') == '<<>>' 34 | assert LangParen.interpret('( ( ) ) ( )') == '<<>><>' 35 | 36 | if __name__ == '__main__': 37 | # import pprint as pp 38 | # s = LangParen.parse('( ( ) ) ( )') 39 | # pp.pprint(s) 40 | main() 41 | -------------------------------------------------------------------------------- /examples/eg_ambig.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import * 3 | 4 | 5 | class pExpr(metaclass=GLR.meta): 6 | 7 | 'An ambigious grammar for arithmetic expressions.' 8 | 9 | def plus(lex: r'\+'): 10 | return lex 11 | 12 | def times(lex: r'\*'): 13 | return lex 14 | 15 | def number(lex: r'\d+'): 16 | return int(lex) 17 | 18 | 19 | def expr(expr, plus, expr_1): 20 | return expr + expr_1 21 | 22 | def expr(expr, times, expr_1): 23 | return expr * expr_1 24 | 25 | def expr(number): 26 | return number 27 | 28 | 29 | inp = '2 + 1 * 3' 30 | 31 | tks = list(pExpr.lexer.tokenize(inp)) 32 | 33 | from pprint import pprint 34 | 35 | pprint(tks) 36 | 37 | r = pExpr.prepare_generalized() 38 | next(r) 39 | 40 | for tk in tks: 41 | x = r.send(tk) 42 | pprint(x) 43 | else: 44 | x = r.send(END_TOKEN) 45 | pprint(x) 46 | 47 | # Keep sending further tokens! 48 | tks = list(pExpr.lexer.tokenize(' + + 1')) 49 | for tk in tks: 50 | rs = r.send(tk) 51 | for e in rs: 52 | if isinstance(e, ParseError): 53 | pprint(e.args) 54 | else: break 55 | else: 56 | pprint(rs) 57 | else: 58 | x = r.send(END_TOKEN) 59 | pprint(x) 60 | 61 | -------------------------------------------------------------------------------- /examples/eg_direct.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import LALR 3 | 4 | pCalc = LALR() 5 | 6 | lex = pCalc.lexer 7 | rule = pCalc.rule 8 | 9 | # lex( = ) 10 | lex(IGNORED = r'\s+') 11 | lex(NUM = r'[0-9]+') 12 | lex(EQ = r'=') 13 | lex(ID = r'[_a-zA-Z]\w*') 14 | 15 | # lex(... , p = ) 16 | lex(POW = r'\*\*', p=3) 17 | lex(POW = r'\^') # No need to give the precedence twice for POW. 
18 | lex(MUL = r'\*' , p=2) 19 | lex(ADD = r'\+' , p=1) 20 | 21 | # @rule 22 | # def ( ): 23 | # 24 | @rule 25 | def assign(ID, EQ, expr): 26 | context[ID] = expr 27 | return expr 28 | 29 | @rule 30 | def expr(ID): 31 | return context[ID] 32 | 33 | @rule 34 | def expr(NUM): 35 | return int(NUM) 36 | 37 | @rule 38 | def expr(expr_1, ADD, expr_2): 39 | return expr_1 + expr_2 40 | 41 | @rule 42 | def expr(expr, MUL, expr_1): 43 | return expr * expr_1 44 | 45 | @rule 46 | def expr(expr, POW, expr_1): 47 | return expr ** expr_1 48 | 49 | # Complete making the parser after collecting things! 50 | pCalc.make() 51 | 52 | context = {} 53 | pCalc.interpret("x = 3") 54 | pCalc.interpret("y = x ^ 2") 55 | pCalc.interpret("z = x + y + 1") 56 | 57 | from pprint import pprint 58 | print(context) 59 | print(pCalc.precedence) 60 | -------------------------------------------------------------------------------- /examples/eg_precedence.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import LALR 3 | 4 | class E(metaclass=LALR.meta): 5 | 6 | num = r'\d+' 7 | 8 | pow = r'\*\*', 3 # r'\*\*' is matched before r'\*' 9 | 10 | mul = r'\*', 2 11 | div = r'\/', 2 12 | 13 | add = r'\+', 1 14 | mns = r'-', 1 15 | 16 | l = r'\(' 17 | r = r'\)' 18 | 19 | def E(E, add, E_1): 20 | return '({} + {})'.format(E, E_1) 21 | def E(E, mns, E_1): 22 | return '({} - {})'.format(E, E_1) 23 | def E(E, mul, E_1): 24 | return '({} * {})'.format(E, E_1) 25 | def E(E, div, E_1): 26 | return '({} / {})'.format(E, E_1) 27 | def E(E, pow, E_1): 28 | return '({} ** {})'.format(E, E_1) 29 | def E(num): 30 | return num 31 | def E(l, E, r): 32 | return E 33 | 34 | 35 | import pprint as pp 36 | 37 | # pp.pprint(E.parse_many('3 + 2 * 7')) 38 | # pp.pprint(E.parse_many('3 + 2 * 7 + 1')) 39 | # pp.pprint(E.interpret_many('3 + 2 * 7 + 1')) 40 | 41 | print(E) 42 | pp.pprint(E.precedence) 43 | psr = (E) 44 | # print(psr.table.__len__()) 45 | # pp.pprint([*zip(psr.Ks, psr.ACTION)]) 46 | 47 | # print(psr.interpret('3 + 2 * 7')) 48 | # print(psr.interpret('3 * 2 + 7')) 49 | print(psr.interpret('3 + 2 * 7 / 5 - 1')) 50 | print(psr.interpret('3 + 2 * 7 ** 2 * 5')) 51 | -------------------------------------------------------------------------------- /examples/eg_dangling.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import LALR, GLR 3 | 4 | class pIfThenElse(metaclass=GLR.meta): 5 | 6 | IF = r'if' 7 | THEN = r'then' 8 | ELSE = r'else' 9 | EXPR = r'\d+' 10 | SINGLE = r'[_a-zA-Z]+' 11 | 12 | def stmt(ifstmt): 13 | return ifstmt 14 | 15 | def stmt(SINGLE): 16 | return SINGLE 17 | 18 | def ifstmt(IF, EXPR, THEN, stmt_1, ELSE, stmt_2): 19 | return ('ite', EXPR, stmt_1, stmt_2) 20 | 21 | def ifstmt(IF, EXPR, THEN, stmt): 22 | return ('it', EXPR, stmt) 23 | 24 | from pprint import pprint 25 | 26 | res = pIfThenElse.interpret_generalized('if 1 then if 2 then if 3 then a else b else c') 27 | pprint(res) 28 | 29 | 30 | 31 | class pIfThenElse(metaclass=LALR.meta): 32 | 33 | IF = r'if' 34 | THEN = r'then', 1 35 | ELSE = r'else', 2 36 | EXPR = r'\d+' 37 | SINGLE = r'[_a-zA-Z]+' 38 | 39 | def stmt(ifstmt): 40 | return ifstmt 41 | 42 | def stmt(SINGLE): 43 | return SINGLE 44 | 45 | def ifstmt(IF, EXPR, THEN, stmt_1, ELSE, stmt_2): 46 | return ('ite', EXPR, stmt_1, stmt_2) 47 | 48 | def ifstmt(IF, EXPR, THEN, stmt): 49 | return ('it', EXPR, stmt) 50 | 51 | res = pIfThenElse.interpret('if 1 then if 2 then if 3 then a else b 
else c') 52 | pprint(res) 53 | -------------------------------------------------------------------------------- /tests/test_verbose_reader.py: -------------------------------------------------------------------------------- 1 | from metaparse import LALR 2 | 3 | @LALR.verbose 4 | def calc(lex, rule): 5 | 6 | lex(IGNORED = r'\s+') 7 | 8 | @lex(NUM = r'[0-9]+') 9 | def NUM(val): 10 | return int(val) 11 | 12 | lex(EQ = r'=') 13 | lex(ID = r'[_a-zA-Z]\w*') 14 | 15 | lex(POW = r'\*\*', p = 3) 16 | lex(MUL = r'\*', p = 2) 17 | lex(ADD = r'\+', p = 1) 18 | lex(SUB = r'\-', p = 1) 19 | 20 | @rule 21 | def assign(ID, EQ, expr): 22 | table[ID] = expr 23 | return expr 24 | 25 | @rule 26 | def expr(ID): 27 | return table[ID] 28 | 29 | @rule 30 | def expr(NUM): 31 | return int(NUM) 32 | 33 | @rule 34 | def expr(expr_1, ADD, expr_2): 35 | return expr_1 + expr_2 36 | 37 | @rule 38 | def expr(expr_1, SUB, expr_2): 39 | return expr_1 - expr_2 40 | 41 | @rule 42 | def expr(expr, MUL, expr_1): 43 | return expr * expr_1 44 | 45 | @rule 46 | def expr(expr, POW, expr_1): 47 | return expr ** expr_1 48 | 49 | 50 | from pprint import pprint 51 | # pprint(lex) 52 | # pprint(rule) 53 | 54 | # 55 | table = {} 56 | 57 | calc.interpret('x = 8') 58 | calc.interpret('y = x - 6 ') 59 | calc.interpret('z = x ** y ') 60 | 61 | 62 | import unittest 63 | 64 | class Test(unittest.TestCase): 65 | 66 | def test(self): 67 | self.assertEqual(table, dict(x=8, y=2, z=64)) 68 | 69 | 70 | if __name__ == '__main__': 71 | unittest.main() 72 | 73 | -------------------------------------------------------------------------------- /tests/test_first.py: -------------------------------------------------------------------------------- 1 | from metaparse import * 2 | 3 | 4 | if __name__ == '__main__': 5 | 6 | rs = ([ 7 | Rule('S', ('A', 'B', 'C')), 8 | Rule('S', ('D',)), 9 | Rule('A', ('a', 'A')), 10 | Rule('A', ()), 11 | Rule('B', ('B', 'b')), 12 | Rule('B', ()), 13 | Rule('C', ('c',)), 14 | Rule('C', ('D',)), 15 | Rule('D', ('d', 'D')), 16 | Rule('D', ('E',)), 17 | Rule('E', ('D',)), 18 | Rule('E', ('B',)), 19 | ]) 20 | g = Grammar(rs) 21 | 22 | rs1 = [ 23 | Rule('expr', ['expr', '+', 'term']), 24 | Rule('expr', ['term']), 25 | Rule('term', ['term', '*', 'factor']), 26 | Rule('term', ['factor']), 27 | Rule('factor', ['ID']), 28 | Rule('factor', ['(', 'expr', ')']), 29 | ] 30 | e = Grammar(rs1) 31 | 32 | import unittest 33 | 34 | class TestGrammar(unittest.TestCase): 35 | 36 | def test_first_0(self): 37 | self.assertEqual(g.FIRST['S'], {'a', 'b', 'c', 'd', 'EPSILON'}) 38 | self.assertEqual(g.FIRST['E'], {'b', 'd', 'EPSILON'}) 39 | 40 | def test_first_1(self): 41 | self.assertEqual(e.FIRST['expr'], {'ID', '('}) 42 | self.assertEqual(e.FIRST['term'], {'ID', '('}) 43 | self.assertEqual(e.FIRST['factor'], {'ID', '('}) 44 | 45 | def test_nullalbe(self): 46 | self.assertEqual(set(g.NULLABLE), {'S', 'A', 'B', 'C', 'D', 'E'}) 47 | 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /examples/eg_dumps.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | import ast 3 | 4 | from metaparse import * 5 | 6 | class G_Calc(metaclass=LALR.meta): 7 | 8 | IGNORED = r'\s+' 9 | 10 | EQ = r'=' 11 | 12 | def NUM(lex: r'[1-9]\d*'): 13 | return float(lex) 14 | 15 | ID = r'[_a-zA-Z]\w*' 16 | POW = r'\*\*', 3 17 | MUL = r'\*' , 2 18 | ADD = r'\+' , 1 19 | 20 | def assign(ID, EQ, expr): 21 | table[ID] = expr 22 | 23 | def expr(NUM): 24 | return NUM 25 | 26 | def 
expr(ID): 27 | return table[ID] 28 | 29 | def expr(expr_1, ADD, expr_2): 30 | return expr_1 + expr_2 31 | 32 | def expr(expr, MUL, expr_1): 33 | return expr * expr_1 34 | 35 | def expr(expr, POW, expr_1): 36 | return expr ** expr_1 37 | 38 | # assert 0 39 | 40 | p = (G_Calc) 41 | 42 | from pprint import pprint 43 | 44 | # with open('eg_dumps_file.py', 'w') as o: 45 | # psr_fl = p.dumps() 46 | # o.write(psr_fl) 47 | 48 | # with open('eg_dumps_file.py', 'r') as o: 49 | # s = o.read() 50 | # p = LALR.loads(s, globals()) 51 | 52 | p.dump('eg_dumps_file.py') 53 | p.load('eg_dumps_file.py', globals()) 54 | 55 | # pprint(p.__dict__) 56 | # pprint(ctx) 57 | 58 | # timeit LALR.loads(s, globals()) 59 | # timeit p = LALR(G_Calc) 60 | 61 | s1 = p.dumps() 62 | p1 = LALR.loads(s1, globals()) 63 | s2 = p1.dumps() 64 | p2 = LALR.loads(s2, globals()) 65 | 66 | table = {} 67 | p2.interpret('x = 3') 68 | p2.interpret('y = x ** 2 * 2 + 1') 69 | pprint(table) 70 | -------------------------------------------------------------------------------- /tests/test_with_reader.py: -------------------------------------------------------------------------------- 1 | from metaparse import LALR 2 | 3 | calc = LALR() 4 | 5 | 6 | with calc as (lex, rule): 7 | 8 | lex(IGNORED = r'\s+') 9 | 10 | @lex(NUM = r'[0-9]+') 11 | def NUM(val): 12 | return int(val) 13 | 14 | lex(EQ = r'=') 15 | lex(ID = r'[_a-zA-Z]\w*') 16 | 17 | lex(POW = r'\*\*', p = 3) 18 | lex(MUL = r'\*', p = 2) 19 | lex(ADD = r'\+', p = 1) 20 | lex(SUB = r'\-', p = 1) 21 | 22 | @rule 23 | def assign(ID, EQ, expr): 24 | table[ID] = expr 25 | return expr 26 | 27 | @rule 28 | def expr(ID): 29 | return table[ID] 30 | 31 | @rule 32 | def expr(NUM): 33 | return int(NUM) 34 | 35 | @rule 36 | def expr(expr_1, ADD, expr_2): 37 | return expr_1 + expr_2 38 | 39 | @rule 40 | def expr(expr_1, SUB, expr_2): 41 | return expr_1 - expr_2 42 | 43 | @rule 44 | def expr(expr, MUL, expr_1): 45 | return expr * expr_1 46 | 47 | @rule 48 | def expr(expr, POW, expr_1): 49 | return expr ** expr_1 50 | 51 | 52 | from pprint import pprint 53 | # pprint(lex) 54 | # pprint(rule) 55 | 56 | # 57 | table = {} 58 | 59 | calc.interpret('x = 8') 60 | calc.interpret('y = x - 6 ') 61 | calc.interpret('z = x ** y ') 62 | 63 | 64 | import unittest 65 | 66 | class Test(unittest.TestCase): 67 | 68 | def test(self): 69 | self.assertEqual(table, dict(x=8, y=2, z=64)) 70 | 71 | 72 | if __name__ == '__main__': 73 | unittest.main() 74 | 75 | -------------------------------------------------------------------------------- /tests/test_resolve.py: -------------------------------------------------------------------------------- 1 | from metaparse import LALR 2 | from pprint import pprint 3 | from unittest import TestCase, main 4 | 5 | 6 | class LangIfThenElse(metaclass=LALR.meta): 7 | 8 | 'Dangling else grammar with ambiguity resolved by precedence.' 
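    # Note (added comment, not in the original file): the dangling-else
    # shift/reduce conflict is resolved here by giving ELSE (precedence 2)
    # a higher precedence than THEN (precedence 1), so the parser prefers
    # shifting ELSE over reducing the shorter `ifstmt` rule; each `else`
    # therefore binds to the nearest unmatched `if ... then`, which is what
    # the expected result in the test below reflects.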
9 | 10 | IGNORED = r'[ \(\)]' 11 | IF = r'if' 12 | THEN = r'then', 1 13 | def ELSE(lex: r'else') -> 2: 14 | return lex 15 | 16 | EXPR = r'e' 17 | SINGLE = r's' 18 | 19 | def stmt(ifstmt): 20 | return ifstmt 21 | 22 | def stmt(SINGLE): 23 | return SINGLE 24 | 25 | def ifstmt(IF, EXPR, THEN, stmt_1, ELSE, stmt_2): 26 | return ('ite', stmt_1, stmt_2) 27 | 28 | def ifstmt(IF, EXPR, THEN, stmt): 29 | return ('it', stmt) 30 | 31 | 32 | 33 | class Test(TestCase): 34 | 35 | def test_parse(self): 36 | 37 | inp = 'if e then (if e then (if e then s else s) else s)' 38 | res = LangIfThenElse.interpret(inp) 39 | 40 | self.assertEqual(res, ('it', ('ite', ('ite', 's', 's'), 's'))) 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | 46 | # inp = 'if e then else (if e then (if e then s else s) else s)' 47 | # r = LangIfThenElse.prepare_generalized() 48 | # l = LangIfThenElse.lexer.tokenize(inp) 49 | # next(r) 50 | # for t in l: 51 | # print('feeding: ', t) 52 | # res = r.send(t) 53 | # print(res) 54 | # res = r.send(None) 55 | # print(res) 56 | 57 | # t = LangIfThenElse.parse_generalized(inp) 58 | -------------------------------------------------------------------------------- /examples/eg_logic.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | 3 | from metaparse import LALR 4 | from collections import namedtuple 5 | 6 | 7 | class PropLogic(metaclass=LALR.meta): 8 | 9 | T = r'True' 10 | F = r'False' 11 | W = r'[A-Z]\w*' 12 | 13 | L = r'\(' ; R = r'\)' 14 | LL = r'\['; RR = r'\]' 15 | 16 | NEG = r'!' , 5 17 | CON = r'&' , 4 18 | DIS = r'\|' , 3 19 | IMP = r'->' , 2 20 | IFF = r'<=>' , 1 21 | 22 | def Sentence(Atomic): 23 | return Atomic 24 | def Sentence(Complex): 25 | return Complex 26 | 27 | def Atomic(T): 28 | return True 29 | def Atomic(F): 30 | return False 31 | def Atomic(W): 32 | return table[W] 33 | 34 | def Complex(L, Sentence, R): 35 | return Sentence 36 | def Complex(LL, Sentence, RR): 37 | return Sentence 38 | def Complex(NEG, Sentence): 39 | return not Sentence 40 | def Complex(Sentence, CON, Sentence_1): 41 | return Sentence and Sentence_1 42 | def Complex(Sentence, DIS, Sentence_1): 43 | return Sentence or Sentence_1 44 | def Complex(Sentence, IMP, Sentence_1): 45 | return not Sentence or Sentence_1 46 | def Complex(Sentence, IFF, Sentence_1): 47 | return Sentence == Sentence_1 48 | 49 | 50 | inp = """ 51 | (P & Q | R & !S) 52 | """ 53 | 54 | table = dict( 55 | P=True, 56 | Q=False, 57 | R=True, 58 | S=False, 59 | ) 60 | 61 | t = PropLogic.parse(inp) 62 | r = PropLogic.interpret(inp) 63 | 64 | from pprint import pprint 65 | 66 | pprint(t) 67 | pprint(r) 68 | 69 | # pprint(PropLogic.__dict__) 70 | -------------------------------------------------------------------------------- /tests/test_lrval_dumpsloads.py: -------------------------------------------------------------------------------- 1 | from metaparse import * 2 | 3 | table = [] 4 | refs = 0 5 | 6 | class G(metaclass=LALR.meta): 7 | 8 | EQ = r'=' 9 | 10 | def STAR(lex: r'\*'): 11 | global refs 12 | refs += 1 13 | return lex 14 | 15 | def ID(lex: r'[_a-zA-Z]\w*'): 16 | table.append(lex) 17 | return lex 18 | 19 | def S(L, EQ, R): 20 | return ('assign', L, R) 21 | 22 | def S(R): 23 | return ('expr', R) 24 | 25 | def L(STAR, R): 26 | return ('deref', R) 27 | 28 | def L(ID): 29 | return ID 30 | 31 | def R(L): 32 | return L 33 | 34 | 35 | import unittest 36 | 37 | class TestDumpLoad(unittest.TestCase): 38 | 39 | def test_dumpload(self): 40 | 41 | inp = '*a = **b' 42 | 43 | 
import pprint as pp 44 | 45 | p1 = G 46 | 47 | s1 = p1.dumps() 48 | p1 = LALR.loads(s1, globals()) 49 | s1 = p1.dumps() 50 | p1 = LALR.loads(s1, globals()) 51 | 52 | r = p1.interpret(inp) 53 | r = p1.interpret(inp) 54 | 55 | self.assertEqual(r, ('assign', 56 | ('deref', 'a'), 57 | ('deref', ('deref', 'b')))) 58 | 59 | self.assertEqual(table, ['a', 'b', 'a', 'b']) 60 | self.assertEqual(refs, 6) 61 | 62 | # s = p1.lexer.dumps() 63 | # lexer = Lexer.loads(s, globals()) 64 | # xs = list(lexer.tokenize(inp, True)) 65 | # # pp.pprint(xs) 66 | 67 | # self.assertEqual(refs, 9) 68 | # # pp.pprint(p1) 69 | # # pp.pprint(p1.__dict__) 70 | 71 | 72 | if __name__ == '__main__': 73 | 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /examples/eg_demo_py2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import preamble 4 | 5 | from metaparse import LALR 6 | 7 | # Global stuff 8 | table = {} 9 | 10 | @LALR.verbose 11 | def calc(lex, rule): 12 | 13 | lex(IGNORED = r'\s+') 14 | 15 | @lex(NUM = r'[0-9]+') 16 | def NUM(val): 17 | return int(val) 18 | 19 | lex(LEFT = r'\(') 20 | lex(RIGHT = r'\)') 21 | 22 | lex(EQ = r'=') 23 | lex(ID = r'[_a-zA-Z]\w*') 24 | 25 | lex(POW = r'\*\*', p = 3) 26 | lex(MUL = r'\*', p = 2) 27 | lex(ADD = r'\+', p = 1) 28 | lex(SUB = r'\-', p = 1) 29 | 30 | @rule 31 | def stmt(assign): 32 | return assign 33 | @rule 34 | def stmt(expr): 35 | return expr 36 | 37 | @rule 38 | def assign(ID, EQ, expr): 39 | table[ID] = expr 40 | return expr 41 | 42 | @rule 43 | def expr(ID): 44 | return table[ID] 45 | @rule 46 | def expr(NUM): 47 | return int(NUM) 48 | @rule 49 | def expr(LEFT, expr, RIGHT): 50 | return expr 51 | 52 | @rule 53 | def expr(expr_1, ADD, expr_2): 54 | return expr_1 + expr_2 55 | @rule 56 | def expr(expr_1, SUB, expr_2): 57 | return expr_1 - expr_2 58 | @rule 59 | def expr(expr, MUL, expr_1): 60 | return expr * expr_1 61 | @rule 62 | def expr(expr, POW, expr_1): 63 | return expr ** expr_1 64 | 65 | 66 | from pprint import pprint 67 | 68 | table = {} 69 | 70 | calc.interpret('x = 8') 71 | calc.interpret('y = x - 6 ') 72 | calc.interpret('z = x ** y ') 73 | 74 | calc.interpret(' (3) ') 75 | calc.interpret(' x = 03 ') 76 | calc.interpret(' y = 4 * x ** (2 + 1) * 2') 77 | 78 | print(table) 79 | 80 | # print(calc.dumps()) 81 | calc1 = LALR.loads(calc.dumps(), globals()) 82 | 83 | calc1.interpret(' w = x + 1') 84 | 85 | print(table) 86 | -------------------------------------------------------------------------------- /examples/eg_lisp.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | import metaparse 3 | from metaparse import LALR 4 | 5 | 6 | class ListParser(metaclass=LALR.meta): 7 | """A tiny grammar for lists.""" 8 | IGNORED = r'\s' 9 | SYMBOL = r'\w+' 10 | def list(list, SYMBOL): 11 | list.append(SYMBOL) 12 | return list 13 | def list(): 14 | return [] 15 | 16 | 17 | class LISP(metaclass=LALR.meta): 18 | """A parser for scheme-like grammar. Should be easy to describe and 19 | parse. 
20 | 21 | """ 22 | 23 | LAMBDA = r'\(\s*lambda' 24 | LEFT = r'\(' 25 | RIGHT = r'\)' 26 | SYMBOL = r'[^\(\)\s]+' 27 | 28 | # _env = {} 29 | # def _unify(): 30 | # pass 31 | 32 | def sexp(var): 33 | return var 34 | def sexp(abst): 35 | return abst 36 | def sexp(appl): 37 | return appl 38 | 39 | def var(SYMBOL): 40 | return SYMBOL 41 | def abst(LAMBDA, LEFT, parlist, RIGHT_1, sexp, RIGHT_2): 42 | return ('LAMBDA', parlist, sexp) 43 | def appl(LEFT, sexp, sexps, RIGHT): 44 | return [sexp, sexps] 45 | 46 | def parlist(SYMBOL, parlist): 47 | return [SYMBOL] + parlist 48 | # def parlist(parlist, SYMBOL): 49 | # return parlist + [SYMBOL] 50 | def parlist(): 51 | return [] 52 | 53 | def sexps(sexps, sexp): 54 | return sexps + [sexp] 55 | # def sexps(sexp, sexps): 56 | # return sexps + [sexp] 57 | def sexps(): 58 | return [] 59 | 60 | 61 | p_lisp = (LISP) 62 | 63 | lx = p_lisp.lexer 64 | p = p_lisp.prepare(True) 65 | next(p) 66 | 67 | inp = '(+ (+ 1 2) 3 ))' 68 | tks = list(lx.tokenize(inp, True)) 69 | 70 | 71 | from pprint import pprint 72 | 73 | # pprint(tks) 74 | 75 | for tk in tks: 76 | res = p.send(tk) 77 | pprint(res) 78 | 79 | # res = p_lisp.interpret('(lambda (x y) (+ x y) ))') 80 | # print(res) 81 | 82 | for tk in tks: 83 | res = p.send(tk) 84 | pprint(res) 85 | -------------------------------------------------------------------------------- /tests/test_grammar.py: -------------------------------------------------------------------------------- 1 | """This file tests the fundamental checking mechanism of the class 2 | Grammar.meta.""" 3 | 4 | import warnings 5 | import unittest 6 | 7 | from metaparse import * 8 | 9 | w = [] 10 | 11 | # with warnings.catch_warnings(record=True) as ws: 12 | if 1: 13 | 14 | class G(metaclass=Grammar.meta): 15 | 16 | def S(A, B, C): pass 17 | def S(D): pass 18 | def A(a, A): pass 19 | def A(): pass 20 | def B(B, b): pass 21 | def B(): pass 22 | def C(c): pass 23 | def C(D): pass 24 | def D(d, D): pass 25 | def D(E): pass 26 | def E(D): pass 27 | def E(B): pass 28 | 29 | # assert len(ws) == 1 30 | 31 | # pprint.pprint(G.terminals) 32 | # pprint.pprint(G.nonterminals) 33 | 34 | 35 | class TestGrammar(unittest.TestCase): 36 | 37 | def test_first_all(self): 38 | self.assertEqual(G.first_of_seq(['A', 'B', 'C'], '#'), {'a', 'b', 'c', 'd', '#'}) 39 | 40 | def test_nullalbe(self): 41 | self.assertEqual(set(G.NULLABLE), {'S', 'A', 'B', 'C', 'D', 'E'}) 42 | 43 | # def test_warn_loop(self): 44 | # with warnings.catch_warnings(record=True) as ws: 45 | # # Same as `G` above. 46 | # class F(metaclass=cfg): 47 | # a, b, c, d = r'abcd' 48 | # def S(A, B, C): pass 49 | # def S(D): pass 50 | # def A(a, A): pass 51 | # def A(): pass 52 | # def B(B, b): pass 53 | # def B(): pass 54 | # def C(c): pass 55 | # def C(D): pass 56 | # def D(d, D): pass 57 | # def D(E): pass 58 | # def E(D): pass 59 | # def E(B): pass 60 | # # Now raised warnings get captured into `ws`. 61 | # self.assertEqual(len(ws), 1) 62 | # # print(ws) 63 | 64 | 65 | if __name__ == '__main__': 66 | unittest.main() 67 | # pass 68 | -------------------------------------------------------------------------------- /examples/eg_demo_py3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import preamble 3 | 4 | import pprint as pp 5 | from metaparse import LALR 6 | 7 | # Global stuff 8 | table = {} 9 | 10 | class G_Calc(metaclass=LALR.meta): 11 | 12 | # ===== Lexical patterns / Terminals ===== 13 | # - A pattern is defined by Python regex literal. 
14 | # - Patterns will be matched in given order when tokenizing. 15 | 16 | IGNORED = r' ' # Special token ignored by tokenizer. 17 | IGNORED = r'\t' # Can add alternative patterns. 18 | 19 | POW = r'\*\*', 3 # Precedence of token (for LALR) 20 | MUL = r'\*' , 2 21 | ADD = r'\+' , 1 22 | 23 | EQ = r'=' # Precedence is 0 by default. 24 | 25 | NUM = r'[1-9]\d*' 26 | def NUM(lex): # Handler for translating token value. 27 | return int(lex) 28 | 29 | ID = r'[_a-zA-Z]\w*' # Unhandled token yields literal value. 30 | 31 | # === Optional error handling for tokenizer === 32 | # - If handler defined, token ERROR is ignored when tokenizing. 33 | # - Otherwise token ERROR is yielded. 34 | ERROR = r'#' 35 | def ERROR(lex): 36 | print("Error literal '{}'".format(lex)) 37 | 38 | # ===== Syntactic/Semantic rules in SDT-style ===== 39 | 40 | def assign(ID, EQ, expr): # May rely on side-effect... 41 | table[ID] = expr 42 | 43 | def expr(NUM): # or return local results for purity 44 | return NUM 45 | 46 | def expr(ID): 47 | return table[ID] 48 | 49 | def expr(expr_1, ADD, expr_2): # With TeX-subscripts, meaning (expr → expr₁ + expr₂) 50 | return expr_1 + expr_2 51 | 52 | def expr(expr, MUL, expr_1): # Can ignore one of the subscripts. 53 | return expr * expr_1 54 | 55 | def expr(expr, POW, expr_1): 56 | return expr ** expr_1 57 | 58 | 59 | pCalc = G_Calc 60 | 61 | from pprint import pprint 62 | # parse and tree 63 | t = pCalc.parse("x = 1 + 4 * 3 ** 2 + 5") 64 | pprint(t) 65 | 66 | -------------------------------------------------------------------------------- /tests/test_generalized.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase, main 2 | from metaparse import * 3 | 4 | 5 | class LangExpr(metaclass=GLR.meta): 6 | 7 | 'An ambigious grammar for arithmetic expressions.' 8 | 9 | def plus(lex: r'\+'): 10 | return lex 11 | 12 | def times(lex: r'\*'): 13 | return lex 14 | 15 | def number(lex: r'\d+'): 16 | return int(lex) 17 | 18 | 19 | def expr(expr, plus, expr_1): 20 | return expr + expr_1 21 | 22 | def expr(expr, times, expr_1): 23 | return expr * expr_1 24 | 25 | def expr(number): 26 | return number 27 | 28 | 29 | class Test(TestCase): 30 | 31 | def test_send(self): 32 | 33 | p = LangExpr.prepare_generalized() 34 | 35 | inp = '1 + 2 * 3 + 4' 36 | x = list(LangExpr.lexer.tokenize(inp)) 37 | 38 | next(p) 39 | for tk in x: 40 | r = p.send(tk) 41 | else: 42 | r = p.send(END_TOKEN) 43 | # 5 combinations for association! 44 | # 45 | # How to calc the number of combinations? 46 | # 47 | # <==> 48 | # 49 | # Given i operators, how many binary trees can they form 50 | # with the same infix-order? 51 | # 52 | # - choose each one as a subtree root 53 | # - divide by the root, calc recursively 54 | # 55 | # B(0) == 1 56 | # B(1) == 1 57 | # B(2) == B(1) + B(1) == 2 58 | # B(3) == B(2) + B(1)*B(1) + B(2) == 2 + 1 + 2 == 5 59 | # B(4) == B(3) + B(1)B(2) + B(2)B(1) + B(3) == 5 + 2 + 2 + 5 == 14 60 | # ... 
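        # (Added note: these B(n) are the Catalan numbers 1, 1, 2, 5, 14, ...,
        #  taking B(0) == 1 for an empty operand side; this matches the
        #  assertions in this file: 5 results for three operators, 14 for four.)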
61 | # B(n) == sum(B(i)B(n-1-i) for i in [1..n-1]) 62 | self.assertEqual(len(r), 2 + 1 + 2) 63 | 64 | def test_send_more(self): 65 | inp = '1 + 2 * 3 + 4 * 5' 66 | y = LangExpr.interpret_generalized(inp) 67 | self.assertEqual(len(y), 14) 68 | 69 | def test_parse(self): 70 | y = LangExpr.interpret_generalized('1 + 2 * 3') 71 | self.assertEqual(y, [9, 7]) 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | 77 | -------------------------------------------------------------------------------- /tests/test_conflict.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from metaparse import LanguageError, LALR, GLR 4 | 5 | from pprint import pprint 6 | 7 | 8 | class TestLRGrammar(unittest.TestCase): 9 | 10 | def test_LALR_report(self): 11 | """LALR parser should report conflicts for ambiguous Grammar.meta! """ 12 | with self.assertRaises(LanguageError) as caught: 13 | 14 | class LangIfThenElse(metaclass=LALR.meta): 15 | 16 | IF = r'if' 17 | THEN = r'then' 18 | ELSE = r'else' 19 | EXPR = r'e' 20 | SINGLE = r's' 21 | 22 | def stmt(ifstmt): 23 | return ifstmt 24 | 25 | def stmt(SINGLE): 26 | return SINGLE 27 | 28 | def ifstmt(IF, EXPR, THEN, stmt_1, ELSE, stmt_2): 29 | return ('ite', EXPR, stmt_1, stmt_2) 30 | 31 | def ifstmt(IF, EXPR, THEN, stmt): 32 | return ('it', EXPR, stmt) 33 | 34 | self.assertIn( 35 | 'Conflict on lookahead: ELSE', 36 | caught.exception.message) 37 | 38 | def test_many(self): 39 | 40 | class LangIfThenElse(metaclass=GLR.meta): 41 | 42 | IF = r'if' 43 | THEN = r'then' 44 | ELSE = r'else' 45 | EXPR = r'\d' 46 | SINGLE = r'[xyz]' 47 | 48 | def stmt(ifstmt): 49 | return ifstmt 50 | 51 | def stmt(SINGLE): 52 | return SINGLE 53 | 54 | def ifstmt(IF, EXPR, THEN, stmt_1, ELSE, stmt_2): 55 | return ('ite', EXPR, stmt_1, stmt_2) 56 | 57 | def ifstmt(IF, EXPR, THEN, stmt): 58 | return ('it', EXPR, stmt) 59 | 60 | results = LangIfThenElse.interpret_generalized('if 1 then if 2 then x else y') 61 | self.assertEqual(len(results), 2) 62 | self.assertIn( 63 | ('it', '1', ('ite', '2', 'x', 'y')), 64 | results) 65 | self.assertIn( 66 | ('ite', '1', ('it', '2', 'x'), 'y'), 67 | results) 68 | 69 | 70 | if __name__ == '__main__': 71 | unittest.main() 72 | -------------------------------------------------------------------------------- /experiments/frontend_styles.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | 3 | from metaparse import * 4 | 5 | table = {} 6 | 7 | # Clean style 8 | class G_Calc(metaclass=cfg): 9 | 10 | IGNORED = r'\s+' 11 | 12 | EQ = r'=' 13 | NUM = r'[0-9]+' 14 | ID = r'[_a-zA-Z]\w*' 15 | POW = r'\*\*', 3 16 | MUL = r'\*' , 2 17 | ADD = r'\+' , 1 18 | 19 | # ERROR handler? 
20 | 21 | def assign(ID, EQ, expr): 22 | table[ID] = expr 23 | 24 | def expr(NUM): 25 | return int(NUM) 26 | 27 | def expr(ID): 28 | return table[ID] 29 | 30 | def expr(expr_1, ADD, expr_2): 31 | return expr_1 + expr_2 32 | 33 | def expr(expr, MUL, expr_1): 34 | return expr * expr_1 35 | 36 | def expr(expr, POW, expr_1): 37 | return expr ** expr_1 38 | 39 | 40 | # Handler style 41 | class G_Calc(): 42 | 43 | def IGNORED(lex: r'\v'): 44 | pass 45 | def IGNORED(lex: r'\\'): 46 | pass 47 | 48 | def ERROR(lex: r'\t'): 49 | print('ERROR') 50 | 51 | def UNRECOGNIZED(lex: r'.'): 52 | pass 53 | 54 | # Terminals 55 | def NUM(lex: r'\d+'): 56 | return int(lex) 57 | 58 | def ID(lex: r'[_a-zA-Z]\w*'): 59 | return lex 60 | 61 | def L(lex: r'\('): 62 | return lex 63 | def R(lex: r'\)'): 64 | return lex 65 | 66 | L2 = r'\[' 67 | R2 = r'\]' 68 | 69 | def PLUS(lex: r'\+') -> 1: 70 | return lex 71 | def POW(lex: r'\*\*') -> 3: 72 | return lex 73 | def TIMES(lex: r'\*') -> 2: 74 | return lex 75 | 76 | # Nonterminals 77 | def assign(ID: r'[_a-zA-Z]\w*', 78 | EQ: '=', 79 | expr): 80 | table[ID] = expr 81 | 82 | def expr(NUM): 83 | return NUM 84 | 85 | def expr(expr_1, ADD: r'\+', expr_2): 86 | return expr_1 + expr_2 87 | 88 | 89 | # Decorator style 90 | def lex(pat, p=0): 91 | def _(func): 92 | return (func.__name__, pat, p, func) 93 | return _ 94 | 95 | class G_Calc(): 96 | 97 | @lex(r'\s+', 3) 98 | def IGNORED(val): 99 | pass 100 | 101 | @lex(r'\t', 2) 102 | def ERROR(val): 103 | print('ERROR!') 104 | 105 | -------------------------------------------------------------------------------- /tests/test_prece.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pprint as pp 4 | from metaparse import LALR 5 | from unittest import main, TestCase 6 | 7 | # Global stuff 8 | table = {} 9 | 10 | class LangCalc(metaclass=LALR.meta): 11 | 12 | # ===== Lexical patterns / Terminals ===== 13 | # - A pattern is defined by Python regex literal. 14 | # - Patterns will be matched in given order when tokenizing. 15 | 16 | IGNORED = r' ' # Special token ignored by tokenizer. 17 | IGNORED = r'\t' # Can add alternative patterns. 18 | 19 | POW = r'\*\*', 3 # Precedence of token (for LALR) 20 | MUL = r'\*' , 2 21 | ADD = r'\+' , 1 22 | 23 | EQ = r'=' # Precedence is 0 by default. 24 | 25 | def NUM(lex: r'[1-9]\d*'): # Handler for translating token value. 26 | return int(lex) 27 | 28 | ID = r'[_a-zA-Z]\w*' # Unhandled token yields literal value. 29 | 30 | # === Optional error handling for tokenizer === 31 | # - If handler defined, token ERROR is ignored when tokenizing. 32 | # - Otherwise token ERROR is yielded. 33 | def ERROR(lex: r'#'): 34 | print("Error literal '{}'".format(lex)) 35 | 36 | # ===== Syntactic/Semantic rules in SDT-style ===== 37 | 38 | def assign(ID, EQ, expr): # May rely on side-effect... 39 | table[ID] = expr 40 | 41 | def expr(NUM): # or return local results for purity 42 | return NUM 43 | 44 | def expr(ID): 45 | return table[ID] 46 | 47 | def expr(expr_1, ADD, expr_2): # With TeX-subscripts, meaning (expr → expr₁ + expr₂) 48 | return expr_1 + expr_2 49 | 50 | def expr(expr, MUL, expr_1): # Can ignore one of the subscripts. 
51 | return expr * expr_1 52 | 53 | def expr(expr, POW, expr_1): 54 | return expr ** expr_1 55 | 56 | 57 | 58 | 59 | class Test(TestCase): 60 | 61 | def test_interp(self): 62 | t = LangCalc.interpret("x = 1 + 4 * 3 ** 2 + 5") 63 | assert table == {'x': 42} 64 | LangCalc.interpret("y = 5 + x * 2") 65 | assert table == {'x': 42, 'y': 5 + 42 * 2} 66 | LangCalc.interpret("z = 99") 67 | assert table == {'x': 42, 'y': 5 + 42 * 2, 'z': 99} 68 | 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /tests/test_arith.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from metaparse import Grammar, LALR 4 | 5 | class GArith(metaclass=LALR.meta): 6 | 7 | 'Textbook Grammar.meta for simple arithmetics.' 8 | 9 | # E -> E + T 10 | # E -> T 11 | # T -> T * F 12 | # T -> F 13 | # F -> NUMBER 14 | # F -> ( E ) 15 | 16 | IGNORED = r' ' 17 | 18 | plus = r'\+' 19 | times = r'\*' 20 | 21 | def number(lex: r'\d+'): 22 | return int(lex) 23 | 24 | left = r'\(' 25 | right = r'\)' 26 | 27 | 28 | def Expr(Expr, plus, Term): 29 | return Expr + Term 30 | def Expr(Term): 31 | return Term 32 | 33 | def Term(Term, times, Factor): 34 | return Term * Factor 35 | def Term(Factor): 36 | return Factor 37 | 38 | def Factor(number): 39 | return number 40 | def Factor(left, Expr, right): 41 | return Expr 42 | 43 | # def Atom(number): 44 | # return int(number) 45 | 46 | g = Grammar(GArith.rules) 47 | p = GArith 48 | 49 | # l = p.lexer 50 | # print(list(l.tokenize('1 2'))) 51 | # assert 0 52 | 53 | class TestArithParser(unittest.TestCase): 54 | 55 | def test_FIRST(self): 56 | self.assertEqual(g.FIRST['Expr'], {'left', 'number'}) 57 | self.assertEqual(g.FIRST['Term'], {'left', 'number'}) 58 | self.assertEqual(g.FIRST['Factor'], {'left', 'number'}) 59 | self.assertEqual(g.FIRST['number'], {'number'}) 60 | 61 | def test_single(self): 62 | inp = '0' 63 | self.assertEqual(eval(inp), p.interpret(inp)) 64 | 65 | def test_normal(self): 66 | inp = '3 + 2 * (5 + 11) * 2 + 3' 67 | self.assertEqual(eval(inp), p.interpret(inp)) 68 | 69 | def test_tough(self): 70 | inp = '3 + 2 * (5 + 11)' 71 | tough_inp = ' + '.join(inp for _ in range(100)) 72 | self.assertEqual(eval(inp), p.interpret(inp)) 73 | 74 | 75 | if __name__ == '__main__': 76 | 77 | unittest.main() 78 | 79 | # For debugging 80 | # t = TestArithParser() 81 | # t.test_normal() 82 | 83 | # tough = ' + '.join(['(2 * (1 + 1) + 2 * 2)'] * 1000) 84 | # %timeit ari_LALR.meta.interpret(tough) 85 | # 1 loops, best of 3: 347 ms per loop 86 | 87 | # with open('C:/Users/Shellay/Desktop/ari.psr', 'wb') as o: 88 | # o.write(ari_LALR.meta.dumps()) 89 | -------------------------------------------------------------------------------- /examples/eg_pystructs.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | import unittest 3 | 4 | from metaparse import * 5 | # from earley import earley 6 | 7 | # class PyStructReader(metaclass=earley): 8 | class PyStructReader(metaclass=LALR.meta): 9 | 10 | """ 11 | Grammar for python object and built-in container types. 
12 | """ 13 | 14 | l1 = r'\(' 15 | r1 = r',?\s*\)' 16 | l2 = r'\[' 17 | r2 = r',?\s*\]' 18 | l3 = r'\{' 19 | r3 = r',?\s*\}' 20 | comma = r',' 21 | colon = r':' 22 | id = r'[A-Za-z_]\w*' 23 | 24 | def Obj(id) : return ('Sym', id) 25 | 26 | def Obj(Lst) : return Lst 27 | def Obj(Tpl) : return Tpl 28 | def Obj(Dic) : return Dic 29 | def Obj(Set) : return Set 30 | 31 | def Tpl(l1, Objs, r1) : return ('Tpl', Objs) 32 | def Lst(l2, Objs, r2) : return ('Lst', Objs) 33 | def Set(l3, Obj, Objs, r3) : return ('Set', [Obj] + Objs) # 'Set' contains at least one object 34 | def Dic(l3, DTerms, r3) : return ('Dic', DTerms) 35 | 36 | def Objs(Objs, comma, Obj) : return Objs + [Obj] 37 | def Objs(Obj) : return [Obj] 38 | def Objs() : return [] 39 | 40 | def DTerms(DTerms, comma, DTerm) : return DTerms + [DTerm] 41 | def DTerms(DTerm) : return [DTerm] 42 | def DTerms() : return [] 43 | 44 | def DTerm(Obj_1, colon, Obj_2) : return (Obj_1, Obj_2) 45 | 46 | 47 | target = PyStructReader.interpret 48 | 49 | class TestPyStructParser(unittest.TestCase): 50 | 51 | def test_empty_list(self): 52 | r = target('[]') 53 | self.assertEqual(r, ('Lst', [])) 54 | 55 | def test_empty_dict(self): 56 | r = target('{}') 57 | self.assertEqual(r, ('Dic', [])) 58 | 59 | def test_symbol(self): 60 | self.assertEqual(target('a'), ('Sym', 'a')) 61 | 62 | def test_normal_set(self): 63 | self.assertEqual( 64 | target('{(a, b), c, {e, f}}'), 65 | ('Set', [('Tpl', [('Sym', 'a'), ('Sym', 'b')]), 66 | ('Sym', 'c'), 67 | ('Set', [('Sym', 'e'), ('Sym', 'f')])])) 68 | 69 | def test_normal_dict(self): 70 | self.assertEqual( 71 | target('[{a: b}, {c}, {x: y, z: [a]}]'), 72 | ('Lst', [('Dic', [(('Sym', 'a'), ('Sym', 'b'))]), 73 | ('Set', [('Sym', 'c')]), 74 | ('Dic', [(('Sym', 'x'), ('Sym', 'y')), 75 | (('Sym', 'z'), ('Lst', [('Sym', 'a')]))])])) 76 | 77 | 78 | if __name__ == '__main__': 79 | unittest.main() 80 | -------------------------------------------------------------------------------- /tests/test_pystruct.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from metaparse import * 4 | 5 | class LangPyStruct(metaclass=LALR.meta): 6 | 7 | """ 8 | Grammar for python object and built-in container types. 
9 | """ 10 | 11 | l1 = r'\(' 12 | r1 = r',?\s*\)' 13 | l2 = r'\[' 14 | r2 = r',?\s*\]' 15 | l3 = r'\{' 16 | r3 = r',?\s*\}' 17 | comma = r',' 18 | colon = r':' 19 | id = r'[A-Za-z_]\w*' 20 | 21 | def Obj(id) : return ('Sym', id) 22 | 23 | def Obj(Lst) : return Lst 24 | def Obj(Tpl) : return Tpl 25 | def Obj(Dic) : return Dic 26 | def Obj(Set) : return Set 27 | 28 | def Tpl(l1, Objs, r1) : return ('Tpl', Objs) 29 | def Lst(l2, Objs, r2) : return ('Lst', Objs) 30 | def Set(l3, Obj, Objs, r3) : return ('Set', [Obj] + Objs) # 'Set' contains at least one object 31 | def Dic(l3, DTerms, r3) : return ('Dic', DTerms) 32 | 33 | def Objs(Objs, comma, Obj) : return Objs + [Obj] 34 | def Objs(Obj) : return [Obj] 35 | def Objs() : return [] 36 | 37 | def DTerms(DTerms, comma, DTerm) : return DTerms + [DTerm] 38 | def DTerms(DTerm) : return [DTerm] 39 | def DTerms() : return [] 40 | 41 | def DTerm(Obj_1, colon, Obj_2) : return (Obj_1, Obj_2) 42 | 43 | 44 | target = LangPyStruct.interpret 45 | 46 | GrammarPyStruct = Grammar(LangPyStruct.rules) 47 | 48 | 49 | class TestPyStructParser(unittest.TestCase): 50 | 51 | def test_first(self): 52 | self.assertEqual(GrammarPyStruct.FIRST['Obj'], {'l1', 'id', 'l2', 'l3'}) 53 | 54 | def test_empty_list(self): 55 | self.assertEqual(target('[]'), ('Lst', [])) 56 | 57 | def test_empty_dict(self): 58 | self.assertEqual(target('{}'), ('Dic', [])) 59 | 60 | def test_symbol(self): 61 | self.assertEqual(target('a'), ('Sym', 'a')) 62 | 63 | def test_normal_set(self): 64 | self.assertEqual( 65 | target('{(a, b), c, {e, f}}'), 66 | ('Set', [('Tpl', [('Sym', 'a'), ('Sym', 'b')]), 67 | ('Sym', 'c'), 68 | ('Set', [('Sym', 'e'), ('Sym', 'f')])])) 69 | 70 | def test_normal_dict(self): 71 | self.assertEqual( 72 | target('[{a: b}, {c}, {x: y, z: [a]}]'), 73 | ('Lst', [('Dic', [(('Sym', 'a'), ('Sym', 'b'))]), 74 | ('Set', [('Sym', 'c')]), 75 | ('Dic', [(('Sym', 'x'), ('Sym', 'y')), 76 | (('Sym', 'z'), ('Lst', [('Sym', 'a')]))])])) 77 | 78 | 79 | if __name__ == '__main__': 80 | unittest.main() 81 | -------------------------------------------------------------------------------- /experiments/only_syntax.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import Symbol, Rule, Grammar, LALR 3 | from pprint import pprint 4 | 5 | class read(type): 6 | 7 | class gs(object): 8 | def __init__(self): 9 | self.lexes = [] 10 | self.pats = [] 11 | self.rules = [] 12 | self.prece = {} 13 | def __setitem__(self, k, v): 14 | if not k.startswith('__'): 15 | # lexical tuple 16 | if isinstance(v, tuple): 17 | assert len(v) == 2 18 | l, p = v 19 | self.lexes.append(k) 20 | self.pats.append(l) 21 | self.prece[k] = p 22 | # lexical str 23 | elif isinstance(v, str): 24 | self.lexes.append(k) 25 | self.pats.append(v) 26 | # alternatives 27 | elif isinstance(v, (list, set)): 28 | for alt in v: 29 | if not isinstance(alt, (list, tuple)): 30 | alt = (alt,) 31 | rhs = [] 32 | for x in alt: 33 | if isinstance(x, Symbol): 34 | rhs.append(str(x)) 35 | elif isinstance(x, str): 36 | self.lexes.append(x) 37 | self.pats.append(None) 38 | rhs.append(x) 39 | self.rules.append(Rule(k, rhs)) 40 | # 41 | elif callable(v): 42 | pass 43 | def __getitem__(self, k0): 44 | return Symbol(k0) 45 | 46 | @classmethod 47 | def __prepare__(mcls, n, bs, **kw): 48 | return read.gs() 49 | def __new__(mcls, n, bs, gs): 50 | return Grammar(gs.lexes, gs.pats, gs.rules, prece=gs.prece) 51 | 52 | 53 | class E(metaclass=read): 54 | 55 | # IGNORED = r'\s+' 56 | 57 | NEG = 
r'!' , 5 58 | CON = r'&' , 4 59 | DIS = r'\|' , 3 60 | IMP = r'->' , 2 61 | IFF = r'<=>' , 1 62 | 63 | W = r'[A-Z]\w*' 64 | 65 | Sentence = [ 66 | Atomic, 67 | Complex, 68 | ] 69 | 70 | Atomic = [ 71 | 'True', 72 | 'False', 73 | W, 74 | ] 75 | 76 | Complex = [ 77 | ('(', Sentence, ')'), 78 | ('[', Sentence, ']'), 79 | (NEG, Sentence), 80 | (Sentence, CON, Sentence), 81 | (Sentence, DIS, Sentence), 82 | (Sentence, IMP, Sentence), 83 | (Sentence, IFF, Sentence), 84 | ] 85 | 86 | 87 | # pprint(E) 88 | 89 | # g = Grammar(*E) 90 | # pprint(g) 91 | pprint(E.lex2pats) 92 | p = LALR(E) 93 | 94 | # pprint([*p.lexer.tokenize('True & False', True)]) 95 | # pprint(p.parse('P & Q | R & !S')) 96 | 97 | s = p.dump('meta_dumps.py') 98 | p1 = LALR.load('meta_dumps.py', globals()) 99 | 100 | # print(s) 101 | 102 | pprint(p1.parse('P & Q | R & !S')) 103 | -------------------------------------------------------------------------------- /examples/eg_read_ebnf.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import Token, Rule, Lexer 3 | from metaparse import LALR 4 | from collections import namedtuple 5 | from pprint import pprint 6 | 7 | Seq = namedtuple('Seq', 'exprs') 8 | Rep = namedtuple('Rep', 'expr') 9 | Opt = namedtuple('Opt', 'expr') 10 | Alts = namedtuple('Alts', 'exprs') 11 | 12 | 13 | class Symbol(str): 14 | def __repr__(self): 15 | return self 16 | 17 | 18 | class EBNF(metaclass=LALR.meta): 19 | 20 | ID = r'[a-zA-Z]\w+' 21 | TERM1 = r'\'[^\']*\'' 22 | TERM2 = r'\"[^\"]*\"' 23 | 24 | DRV = r'=' 25 | ALT = r'\|' 26 | CON = r',' 27 | SEMI = r';' 28 | 29 | L = r'\(' ; R = r'\)' 30 | Lb = r'\[' ; Rb = r'\]' 31 | LB = r'\{' ; RB = r'\}' 32 | 33 | def grammar(rules): 34 | return rules 35 | 36 | def rules(rules, rule): 37 | rules.append(rule) 38 | return rules 39 | def rules(): 40 | return [] 41 | 42 | def rule(lhs, DRV, rhs, SEMI): 43 | return (Symbol(lhs), rhs) 44 | 45 | def lhs(ID): 46 | return ID 47 | 48 | def rhs(alts): 49 | return Alts(alts) 50 | 51 | def alts(alts, ALT, seq): 52 | alts.append(Seq(seq)) 53 | return alts 54 | def alts(seq): 55 | return [Seq(seq)] 56 | 57 | def seq(seq, CON, expr): 58 | return seq + (expr,) 59 | def seq(expr): 60 | return (expr,) 61 | 62 | def expr(ID): return Symbol(ID) 63 | def expr(term): return term[1:-1] 64 | def expr(opt): return Opt(opt) 65 | def expr(rep): return Rep(rep) 66 | def expr(grp): return grp 67 | 68 | def term(TERM1): return TERM1 69 | def term(TERM2): return TERM2 70 | 71 | def grp(L, alts, R): return (alts) 72 | def opt(Lb, alts, Rb): return (alts) 73 | def rep(LB, alts, RB): return (alts) 74 | 75 | 76 | inp = """ 77 | letter = "A" | "B" | "C" | "D" | "E" | "F" | "G" 78 | | "H" | "I" | "J" | "K" | "L" | "M" | "N" 79 | | "O" | "P" | "Q" | "R" | "S" | "T" | "U" 80 | | "V" | "W" | "X" | "Y" | "Z" | "a" | "b" 81 | | "c" | "d" | "e" | "f" | "g" | "h" | "i" 82 | | "j" | "k" | "l" | "m" | "n" | "o" | "p" 83 | | "q" | "r" | "s" | "t" | "u" | "v" | "w" 84 | | "x" | "y" | "z" ; 85 | digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ; 86 | symbol = "[" | "]" | "{" | "}" | "(" | ")" | "<" | ">" 87 | | "'" | '"' | "=" | "|" | "." 
| "," | ";" ; 88 | character = letter | digit | symbol | "_" ; 89 | 90 | identifier = letter , { letter | digit | "_" } ; 91 | terminal = "'" , character , { character } , "'" 92 | | '"' , character , { character } , '"' ; 93 | 94 | lhs = identifier ; 95 | rhs = identifier 96 | | terminal 97 | | "[" , rhs , "]" 98 | | "{" , rhs , "}" 99 | | "(" , rhs , ")" 100 | | rhs , "|" , rhs 101 | | rhs , "," , rhs ; 102 | 103 | rule = lhs , "=" , rhs , ";" ; 104 | grammar = { rule } ; 105 | """ 106 | 107 | pprint(EBNF) 108 | 109 | tr = EBNF.parse(inp) 110 | e = EBNF.interpret(inp) 111 | 112 | # pprint(tr) 113 | pprint(e) 114 | -------------------------------------------------------------------------------- /tests/test_courses.py: -------------------------------------------------------------------------------- 1 | from metaparse import LALR 2 | 3 | 4 | def fappend(l, x): 5 | l.append(x) 6 | return l 7 | 8 | class LangCourses(metaclass=LALR.meta): 9 | 10 | """ 11 | Grammar to assign multiple numbers to precedend course name. 12 | Examples: 13 | 14 | "CS 2110" => ("CS", 2110) # 0 15 | 16 | "CS 2110 and INFO 3300" => [("CS", 2110), ("INFO", 3300)] # 1 17 | "CS 2110, INFO 3300" => [("CS", 2110), ("INFO", 3300)] # 1 18 | "CS 2110, 3300, 3140" => [("CS", 2110), ("CS", 3300), ("CS", 3140)] # 1 19 | 20 | "CS 2110 or INFO 3300" => [[("CS", 2110)], [("INFO", 3300)]] # 2 21 | 22 | "MATH 2210, 2230, 2310, or 2940" => [[("MATH", 2210), ("MATH", 2230), ("MATH", 2310)], [("MATH", 2940)]] # 3 23 | 24 | """ 25 | 26 | IGNORED = r'[ \t]+|(,)|(and)' 27 | NAME = r'[A-Z]+' 28 | NUMBER = r'\d{4}' 29 | OR = r'or' 30 | 31 | 32 | # info -> headed 33 | def info(headed): return headed 34 | 35 | # info -> conj 36 | def info(conj): return conj 37 | 38 | # info -> disj 39 | def info(disj): return disj 40 | 41 | # headed -> NAME nums 42 | def headed(NAME, nums): return [(NAME, x) for x in nums] 43 | 44 | # nums -> nums NUMBER 45 | def nums(nums, NUMBER): return fappend(nums , NUMBER) 46 | # def nums(nums, NUMBER): return nums + [NUMBER] 47 | 48 | # nums -> NUMBER 49 | def nums(NUMBER): return [NUMBER] 50 | 51 | # conj -> headed headed 52 | def conj(headed_1, headed_2): return headed_1 + headed_2 53 | 54 | # disj -> headed OR headed 55 | def disj(headed_1, OR, headed_2): return [headed_1, headed_2] 56 | 57 | # disj -> headed OR nums 58 | def disj(headed, OR, nums): return [headed, [(headed[0][0], n) for n in nums]] 59 | 60 | import pprint as pp 61 | from unittest import main, TestCase 62 | 63 | gcrs = LangCourses 64 | 65 | class Test(TestCase): 66 | def test_match(self): 67 | assert gcrs.interpret('CS 2110') == \ 68 | [('CS', '2110')] 69 | assert gcrs.interpret('CS 2110 and INFO 3300') == \ 70 | [('CS', '2110'), ('INFO', '3300')] 71 | assert gcrs.interpret('CS 2110, INFO 3300') == \ 72 | [('CS', '2110'), ('INFO', '3300')] 73 | assert gcrs.interpret('CS 2110, 3300, 3140') == \ 74 | [('CS', '2110'), ('CS', '3300'), ('CS', '3140')] 75 | assert gcrs.interpret('CS 2110 or INFO 3300') == \ 76 | [[('CS', '2110')], [('INFO', '3300')]] 77 | 78 | # Compare forms with same semantics... 
79 | inp = "MATH 2210, 2230, 2310 or 2940" 80 | s1 = gcrs.parse(inp) 81 | v1 = gcrs.interpret(inp) 82 | 83 | inp = "MATH 2210, 2230, 2310, or 2940" 84 | s2 = gcrs.parse(inp) 85 | v2 = gcrs.interpret(inp) 86 | 87 | # assert s1 == s2 88 | from pprint import pprint 89 | # pprint(s1) 90 | # pprint(s2) 91 | # self.assertEqual((s1), (s2)) 92 | self.assertEqual(str(s1), str(s2)) 93 | self.assertEqual(v1, v2) 94 | 95 | 96 | if __name__ == '__main__': 97 | main() 98 | -------------------------------------------------------------------------------- /tests/tests_error_handling/test_language_errors.py: -------------------------------------------------------------------------------- 1 | from metaparse import LanguageError, LALR 2 | import unittest 3 | 4 | 5 | class TestLangError(unittest.TestCase): 6 | 7 | def test_missing_symbol(self): 8 | with self.assertRaises(LanguageError) as excCtx: 9 | 10 | class ExprLang(metaclass=LALR.meta): 11 | 12 | NUM = '\d+' 13 | PLUS = '\+' 14 | # TIMES = '\*' 15 | 16 | def expr(expr, PLUS, term): 17 | return expr + term 18 | 19 | def expr(expr, TIMES, term): 20 | return expr * term 21 | 22 | def expr(term): 23 | return term 24 | 25 | def term(NUM): 26 | return int(NUM) 27 | 28 | def factor(NUM): 29 | return int(NUM) 30 | 31 | self.assertIn( 32 | 'No lexical pattern provided for terminal symbol: TIMES', 33 | excCtx.exception.message) 34 | 35 | def test_unreachable_rule(self): 36 | with self.assertRaises(LanguageError) as excCtx: 37 | 38 | class ExprLang(metaclass=LALR.meta): 39 | 40 | NUM = '\d+' 41 | PLUS = '\+' 42 | TIMES = '\*' 43 | 44 | def expr(expr, PLUS, term): 45 | return expr + term 46 | 47 | def expr(expr, TIMES, term): 48 | return expr * term 49 | 50 | def expr(term): 51 | return term 52 | 53 | def term(NUM): 54 | return int(NUM) 55 | 56 | def factor(NUM): 57 | return int(NUM) 58 | 59 | self.assertIn( 60 | "There are unreachable nonterminals at 5th rule: {'factor'}.", 61 | excCtx.exception.message) 62 | 63 | 64 | class TestLangErrorApi2(unittest.TestCase): 65 | 66 | def test_missing_symbol(self): 67 | with self.assertRaises(LanguageError) as excCtx: 68 | p = LALR() 69 | with p as (lex, rule): 70 | lex(a = 'a') 71 | lex(b = 'b') 72 | @rule 73 | def S(a, S, b): pass 74 | @rule 75 | def S(): pass 76 | @rule 77 | def S(c): pass 78 | self.assertIn( 79 | 'No lexical pattern provided for terminal symbol: c', 80 | excCtx.exception.message) 81 | 82 | def test_unreachable_rule(self): 83 | with self.assertRaises(LanguageError) as excCtx: 84 | p = LALR() 85 | with p as (l, r): 86 | l(a = 'a') 87 | l(b = 'b') 88 | @r 89 | def S(a, S, b): pass 90 | @r 91 | def S(): pass 92 | @r 93 | def B(a): pass 94 | @r 95 | def B(b): pass 96 | 97 | self.assertIn( 98 | "There are unreachable nonterminals at 3th rule: {'B'}.", 99 | excCtx.exception.message) 100 | 101 | 102 | if __name__ == '__main__': 103 | unittest.main() 104 | -------------------------------------------------------------------------------- /examples/eg_read_yacc.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import * 3 | from collections import OrderedDict 4 | from pprint import pprint 5 | 6 | 7 | class Symbol(str): 8 | def __repr__(self): 9 | return self 10 | 11 | 12 | class Helper: 13 | 14 | terms = OrderedDict() 15 | _c = -1 16 | 17 | def reset(): 18 | Helper._c = -1 19 | Helper.terms = OrderedDict() 20 | 21 | def get_term(lit): 22 | Helper._c += 1 23 | if lit not in Helper.terms: 24 | term = Symbol('TM{}'.format(Helper._c)) 25 | 
Helper.terms[lit] = term 26 | return Helper.terms[lit] 27 | 28 | 29 | class YACC(metaclass=LALR.meta): 30 | 31 | IGNORED = r'\s+' 32 | IGNORED = r'\/\*[^(\*/)]*\*\/' 33 | # IGNORED = r'\{[^\}]*\}' 34 | 35 | ALT = r'\|' 36 | DRV = r':' 37 | SEMI = r';' 38 | 39 | BODY = r'\{[^\}]*\}' 40 | 41 | ID = r'[_a-zA-Z]\w*' 42 | TERM1 = r"\'[^\']*\'" 43 | TERM2 = r'\"[^\"]*\"' 44 | 45 | def grammar(rules): 46 | terms = [' {} = r{}'.format( 47 | tok, 48 | repr(pat[1:-1])) for pat, tok in Helper.terms.items()] 49 | gen = '\n'.join([ 50 | 'from metaparse import LALR', 51 | '', 52 | 'class G(metaclass=LALR.meta):', 53 | '', 54 | *terms, 55 | '', 56 | *rules, 57 | ]) 58 | return gen 59 | 60 | def rules(): return [] 61 | def rules(rules, rule): 62 | return rules + rule 63 | 64 | def term(TERM1): 65 | return Helper.get_term(TERM1) 66 | def term(TERM2): 67 | return Helper.get_term(TERM2) 68 | 69 | def rule(ID, DRV, alts, SEMI): 70 | r_defs = [] 71 | for seq, bdy in alts: 72 | r_def = ' def {}{}:\n r"""{}"""\n'.format( 73 | ID, 74 | seq, 75 | repr(bdy), 76 | ) 77 | r_defs.append(r_def) 78 | return r_defs 79 | 80 | def alts(alts, ALT, alt): 81 | alts.append(alt) 82 | return alts 83 | def alts(alt): 84 | return [alt] 85 | 86 | def alt(seq): 87 | return (seq, '') 88 | def alt(seq, BODY): 89 | return (seq, BODY) 90 | 91 | def seq(seq, symbol): 92 | return seq + (symbol,) 93 | def seq(): 94 | return () 95 | 96 | def symbol(term): 97 | return term 98 | def symbol(ID): 99 | return Symbol(ID) 100 | 101 | 102 | eg = """ 103 | input: /* empty */ 104 | | input line 105 | ; 106 | 107 | line: '\n' 108 | | exp '\n' { printf ("\t%.10g\n", $1); } 109 | ; 110 | 111 | exp: NUM { $$ = $1; } 112 | | exp exp '+' { $$ = $1 + $2; } 113 | | exp exp '-' { $$ = $1 - $2; } 114 | | exp exp '*' { $$ = $1 * $2; } 115 | | exp exp '/' { $$ = $1 / $2; } 116 | /* Exponentiation */ 117 | | exp exp '^' { $$ = pow ($1, $2); } 118 | /* Unary minus */ 119 | | exp '-' { $$ = -$1; } 120 | ; 121 | """ 122 | 123 | # pprint([*YACC.tokenize(eg, True)]) 124 | 125 | yacc = YACC 126 | tr = yacc.parse(eg) 127 | res = yacc.interpret(eg) 128 | 129 | # pprint(yacc.grammar.lexers) 130 | # pprint(yacc) 131 | # pprint(tr) 132 | print() 133 | print(res) 134 | 135 | 136 | r_plus = Rule('exp', ['exp', 'exp', '+']) 137 | print(r_plus) 138 | -------------------------------------------------------------------------------- /examples/eg_demo.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import LALR 3 | 4 | # Global context/environment for language semantics. 5 | context = {} 6 | 7 | class pCalc(metaclass=LALR.meta): 8 | 9 | "A language for calculating expressions." 10 | 11 | # ===== Lexical patterns / Terminals ===== 12 | # - Patterns specified with regular expressions 13 | # - Patterns will be tested in declaration order during tokenizing 14 | 15 | IGNORED = r'\s+' # Special pattern to be ignored. 16 | 17 | EQ = r'=' 18 | POW = r'\*\*', 3 # Can specify precedence of token (for LALR conflict resolution) 19 | POW = r'\^' , 3 # Alternative patterns can share the same name 20 | MUL = r'\*' , 2 21 | ADD = r'\+' , 1 22 | 23 | ID = r'[_a-zA-Z]\w*' 24 | NUM = r'[1-9][0-9]*' 25 | def NUM(value): # Can specify handler for lexical pattern! 26 | return int(value) 27 | 28 | # ===== Syntactic/Semantic rules in SDT-style ===== 29 | 30 | def assign(ID, EQ, expr): # May access global context. 31 | context[ID] = expr 32 | return expr 33 | 34 | def expr(NUM): # May compute result purely. 
35 | return NUM # NUM is passed as int due to the handler! 36 | 37 | def expr(ID): 38 | return context[ID] 39 | 40 | def expr(expr_1, ADD, expr_2): # With TeX-subscripts, meaning (expr → expr₁ + expr₂) 41 | return expr_1 + expr_2 42 | 43 | def expr(expr, MUL, expr_1): # Can ignore one of the subscripts. 44 | return expr * expr_1 45 | 46 | def expr(expr, POW, expr_1): 47 | return expr ** expr_1 48 | 49 | 50 | from pprint import pprint 51 | 52 | print (type(pCalc)) 53 | 54 | print (pCalc.interpret("x = 1 + 4 * 3 ** 2 + 5")) 55 | # 42 56 | print (pCalc.interpret("y = 5 + x * 2")) # Here `x` is extracted from the context `context` 57 | # 89 58 | print (pCalc.interpret("z = 9 ^ 2")) 59 | # 81 60 | 61 | print (context) 62 | 63 | 64 | tr = pCalc.parse(" w = 1 + 2 * 3 ** 4 + 5 ") 65 | 66 | # pprint(tr) 67 | print (pCalc.lexer) 68 | 69 | for token in pCalc.lexer.tokenize(" foo = 1 + bar * 2"): 70 | print(token.pos, 71 | token.end, 72 | token.symbol, 73 | repr(token.lexeme), # (lexeme) is something literal. 74 | repr(token.value)) # (value) is something computed by handler, if exists. 75 | 76 | # 1 2 ID 'w' 77 | # 4 5 EQ '=' 78 | # 6 7 NUM '1' 79 | # 8 9 ADD '+' 80 | # 10 11 ID 'x' 81 | # 12 13 MUL '*' 82 | # 14 15 NUM '2' 83 | 84 | ('assign', 85 | [('ID', 'w'), 86 | ('EQ', '='), 87 | ('expr', 88 | [('expr', 89 | [('expr', [('NUM', '1')]), 90 | ('ADD', '+'), 91 | ('expr', 92 | [('expr', [('NUM', '2')]), 93 | ('MUL', '*'), 94 | ('expr', 95 | [('expr', [('NUM', '3')]), 96 | ('POW', '**'), 97 | ('expr', [('NUM', '4')])])])]), 98 | ('ADD', '+'), 99 | ('expr', [('NUM', '5')])])]) 100 | 101 | 102 | # s = pCalc.dumps() 103 | # print(s) 104 | # pCalc.dump('./eg_demo_dump.py') 105 | 106 | 107 | # Let loaded parser be able to access current runtime env `globals()`. 108 | # qCalc = LALR.load('./eg_demo_dump.py', globals()) 109 | 110 | # Context instance to be accessed by the loaded parser 111 | # context = {} 112 | 113 | # qCalc.interpret('foo = 1 + 9') 114 | 115 | # print (context) 116 | # {'foo': 10} 117 | 118 | 119 | 120 | # context = {} 121 | # pCalc.interpret("bar = 10 ^ 3") 122 | # # pCalc1.interpret("bar = 99 + 1") 123 | # print(context) 124 | -------------------------------------------------------------------------------- /tests/test_basic.py: -------------------------------------------------------------------------------- 1 | import metaparse as mp 2 | from metaparse import LALR, END_TOKEN 3 | 4 | p = LALR() 5 | 6 | p.lexer.more( 7 | IGNORED=' ', 8 | PLUS='\+', 9 | TIMES='\*', 10 | LEFT='\(', 11 | RIGHT='\)' 12 | ) 13 | 14 | @p.lexer(NUMBER='\d+') 15 | def _(val): 16 | return int(val) 17 | 18 | @p.rule 19 | def expr(expr, PLUS, term): 20 | return expr + term 21 | 22 | @p.rule 23 | def expr(term): 24 | return term 25 | 26 | @p.rule 27 | def term(term, TIMES, factor): 28 | return term * factor 29 | 30 | @p.rule 31 | def term(factor): 32 | return factor 33 | 34 | 35 | with p as (lexer, rule): 36 | 37 | @rule 38 | def factor(NUMBER): 39 | return NUMBER 40 | 41 | @rule 42 | def factor(LEFT, expr, RIGHT): 43 | return expr 44 | 45 | # exit and make! 
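    # Leaving the `with` block above appears to build the parser tables
    # automatically (hence "exit and make!"), which is presumably why the
    # explicit p.make() call below is left commented out.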
46 | 47 | # p.make() 48 | 49 | 50 | from pprint import pprint 51 | # pprint(p.grammar) 52 | # p.inspect_ACTION 53 | # t = p.parse('123') 54 | # pprint(t) 55 | tkns = (p.lexer.tokenize('123 + 8')) 56 | q = p.prepare() 57 | next(q) 58 | q.send(next(tkns)) 59 | q.send(next(tkns)) 60 | q.send(next(tkns)) 61 | t = q.send(END_TOKEN) 62 | assert t == mp.Just(131) 63 | 64 | t = p.parse('123 + 8') 65 | assert p.interpret('123 + 8') == 131 66 | t = p.parse('123 + 2 * 1') 67 | assert p.interpret('123 + 2 * 1') == 125 68 | assert p.interpret('123 + 2 * (1 + 2)') == 129 69 | 70 | tough = ' + '.join(['(2 * (1 + (1)) + 2 * 2 + (3))'] * 100) 71 | assert p.interpret(tough) == eval(tough) 72 | 73 | # if replication is 10000 74 | # %timeit p.interpret(tough) 75 | # 1 loops, best of 3: 346 ms per loop 76 | 77 | p_sexp = LALR() 78 | 79 | with p_sexp as (lex, rule): 80 | 81 | # # Order??? 82 | # lex.word( 83 | # IGNORED=' ', 84 | # LEFT='(', 85 | # RIGHT=')', 86 | # COMMA=',', 87 | # ) 88 | # lex.re( 89 | # NUMBER='\d+(\.\d*)?', 90 | # SYMBOL='\w+', 91 | # UNKNOWN='%', 92 | # ) 93 | lex.more( 94 | IGNORED='%', 95 | LEFT='\(', 96 | RIGHT='\)', 97 | COMMA=',', 98 | ) 99 | lex(IGNORED='\s+') 100 | lex(SYMBOL='[_a-zA-Z]\w*') 101 | lex(UNKNOWN='&') 102 | 103 | @lex(NUMBER='[1-9]\d*(\.\d*)?') 104 | def _(val): 105 | return int(val) 106 | 107 | @rule 108 | def sexp(atom): 109 | return atom 110 | @rule 111 | def sexp(LEFT, slist, RIGHT): 112 | return slist 113 | 114 | 115 | @rule 116 | def slist(): 117 | return [] 118 | @rule 119 | def slist(slist, sexp): 120 | slist.append(sexp) 121 | return slist 122 | 123 | @rule 124 | def atom(NUMBER): 125 | return NUMBER 126 | @rule 127 | def atom(SYMBOL): 128 | return SYMBOL 129 | 130 | # p_sexp.inspect_ACTION 131 | # p_sexp.inspect_GOTO 132 | 133 | # debug p_sexp.make() 134 | 135 | # s = p_sexp.parse('123') 136 | # pprint(s) 137 | # pprint(list(p_sexp.lexer.tokenize('(a b (c d))'))) 138 | # pprint(p_sexp.lexer) 139 | 140 | # ds = (p_sexp.dumps()) 141 | # ctx = {} 142 | # exec(ds, {}, ctx) 143 | # pprint(ctx) 144 | 145 | # lx_dp = p_sexp.lexer.dumps() 146 | # print(lx_dp) 147 | # lexer1 = Lexer.loads(lx_dp, globals()) 148 | 149 | # print(list(lexer1.tokenize(' 123 99 '))) 150 | 151 | 152 | import warnings 153 | 154 | with warnings.catch_warnings(record=True) as w: 155 | s = p_sexp.interpret('(a 123 (c (d)) % & e)') 156 | assert len(w) == 1 157 | 158 | assert s == ['a', 123, ['c', ['d']], 'e'], s 159 | 160 | 161 | sexp_dp = p_sexp.dumps() 162 | 163 | with open('sexp_dump.py', 'w') as o: 164 | o.write(sexp_dp) 165 | 166 | # print(sexp_dp) 167 | p_sexp1 = LALR.loads(sexp_dp, globals()) 168 | 169 | with warnings.catch_warnings(record=True) as w: 170 | s = p_sexp.interpret('(a & 123 (c (d)) % & e)') 171 | assert len(w) == 2 172 | 173 | 174 | assert s == ['a', 123, ['c', ['d']], 'e'], s 175 | 176 | -------------------------------------------------------------------------------- /examples/eg_demo_dump.py: -------------------------------------------------------------------------------- 1 | lex2pats = [('IGNORED', '\\s+'), 2 | ('EQ', '='), 3 | ('NUM', '[1-9][0-9]*'), 4 | ('ID', '[_a-zA-Z]\\w*'), 5 | ('POW', '\\*\\*'), 6 | ('MUL', '\\*'), 7 | ('ADD', '\\+')] 8 | 9 | handlers = [None, None, None, None, None, None, None] 10 | 11 | rules = [('assign^', ('assign',)), 12 | ('assign', ('ID', 'EQ', 'expr')), 13 | ('expr', ('NUM',)), 14 | ('expr', ('ID',)), 15 | ('expr', ('expr', 'ADD', 'expr')), 16 | ('expr', ('expr', 'MUL', 'expr')), 17 | ('expr', ('expr', 'POW', 'expr'))] 18 | 19 | ACTION = 
[{'ID': ('shift', 2)}, 20 | {'\x03': ('accept', 0)}, 21 | {'EQ': ('shift', 3)}, 22 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 23 | {'\x03': ('reduce', 1), 24 | 'ADD': ('shift', 7), 25 | 'MUL': ('shift', 8), 26 | 'POW': ('shift', 9)}, 27 | {'\x03': ('reduce', 2), 28 | 'ADD': ('reduce', 2), 29 | 'MUL': ('reduce', 2), 30 | 'POW': ('reduce', 2)}, 31 | {'\x03': ('reduce', 3), 32 | 'ADD': ('reduce', 3), 33 | 'MUL': ('reduce', 3), 34 | 'POW': ('reduce', 3)}, 35 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 36 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 37 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 38 | {'\x03': ('reduce', 4), 39 | 'ADD': ('reduce', 4), 40 | 'MUL': ('shift', 8), 41 | 'POW': ('shift', 9)}, 42 | {'\x03': ('reduce', 5), 43 | 'ADD': ('reduce', 5), 44 | 'MUL': ('reduce', 5), 45 | 'POW': ('shift', 9)}, 46 | {'\x03': ('reduce', 6), 47 | 'ADD': ('reduce', 6), 48 | 'MUL': ('reduce', 6), 49 | 'POW': ('reduce', 6)}] 50 | 51 | GOTO = [{'ID': 2, 'assign': 1}, 52 | {}, 53 | {'EQ': 3}, 54 | {'ID': 6, 'NUM': 5, 'expr': 4}, 55 | {'ADD': 7, 'MUL': 8, 'POW': 9}, 56 | {}, 57 | {}, 58 | {'ID': 6, 'NUM': 5, 'expr': 10}, 59 | {'ID': 6, 'NUM': 5, 'expr': 11}, 60 | {'ID': 6, 'NUM': 5, 'expr': 12}, 61 | {'ADD': 7, 'MUL': 8, 'POW': 9}, 62 | {'ADD': 7, 'MUL': 8, 'POW': 9}, 63 | {'ADD': 7, 'MUL': 8, 'POW': 9}] 64 | 65 | semans = [b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 66 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x01x' 67 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfa8c:\\Users\\Shellay\\Documents\\GitHu' 68 | b'b\\metaparse\\metaparse.py\xda\x08identity1\x00\x00\x00s\x02\x00\x00\x00\x00' 69 | b'\x01', 70 | b'\xe3\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00' 71 | b'\x00C\x00\x00\x00s\x0e\x00\x00\x00|\x02\x00t\x00\x00|\x00\x00<|\x02\x00S' 72 | b')\x01N)\x01\xda\x07context)\x03\xda\x02ID\xda\x02EQ\xda\x04expr\xa9\x00' 73 | b'r\x05\x00\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/metaparse/examples/' 74 | b'eg_demo.py\xda\x06assign\x19\x00\x00\x00s\x04\x00\x00\x00\x00\x01\n\x01', 75 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00' 76 | b'\x00C\x00\x00\x00s\n\x00\x00\x00t\x00\x00|\x00\x00\x83\x01\x00S)\x01N)' 77 | b'\x01\xda\x03int)\x01\xda\x03NUM\xa9\x00r\x03\x00\x00\x00\xfa?c:/Users/Shell' 78 | b'ay/Documents/GitHub/metaparse/examples/eg_demo.py\xda\x04expr\x1d' 79 | b'\x00\x00\x00s\x02\x00\x00\x00\x00\x01', 80 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00' 81 | b'\x00C\x00\x00\x00s\x08\x00\x00\x00t\x00\x00|\x00\x00\x19S)\x01N)\x01\xda' 82 | b'\x07context)\x01\xda\x02ID\xa9\x00r\x03\x00\x00\x00\xfa?c:/Users/Shellay/' 83 | b'Documents/GitHub/metaparse/examples/eg_demo.py\xda\x04expr \x00\x00\x00' 84 | b's\x02\x00\x00\x00\x00\x01', 85 | b'\xe3\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00' 86 | b'\x00C\x00\x00\x00s\x08\x00\x00\x00|\x00\x00|\x02\x00\x17S)\x01N\xa9\x00)' 87 | b'\x03\xda\x06expr_1\xda\x03ADD\xda\x06expr_2r\x01\x00\x00\x00r' 88 | b'\x01\x00\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/metaparse/examples/e' 89 | b'g_demo.py\xda\x04expr#\x00\x00\x00s\x02\x00\x00\x00\x00\x01', 90 | b'\xe3\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00' 91 | b'\x00C\x00\x00\x00s\x08\x00\x00\x00|\x00\x00|\x02\x00\x14S)\x01N\xa9\x00)' 92 | b'\x03\xda\x04expr\xda\x03MUL\xda\x06expr_1r\x01\x00\x00\x00r\x01\x00' 93 | b'\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/metaparse/examples/eg_demo' 94 | b'.pyr\x02\x00\x00\x00&\x00\x00\x00s\x02\x00\x00\x00\x00\x01', 95 | 
b'\xe3\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00' 96 | b'\x00C\x00\x00\x00s\x08\x00\x00\x00|\x00\x00|\x02\x00\x13S)\x01N\xa9\x00)' 97 | b'\x03\xda\x04expr\xda\x03POW\xda\x06expr_1r\x01\x00\x00\x00r\x01\x00' 98 | b'\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/metaparse/examples/eg_demo' 99 | b'.pyr\x02\x00\x00\x00)\x00\x00\x00s\x02\x00\x00\x00\x00\x01'] 100 | -------------------------------------------------------------------------------- /tests/sexp_dump.py: -------------------------------------------------------------------------------- 1 | lex2pats = [('RIGHT', '\\)'), 2 | ('COMMA', ','), 3 | ('IGNORED', '%'), 4 | ('LEFT', '\\('), 5 | ('IGNORED', '\\s+'), 6 | ('SYMBOL', '[_a-zA-Z]\\w*'), 7 | ('UNKNOWN', '&'), 8 | ('NUMBER', '[1-9]\\d*(\\.\\d*)?')] 9 | 10 | handlers = [None, 11 | None, 12 | None, 13 | None, 14 | None, 15 | None, 16 | None, 17 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00' 18 | b'\x00C\x00\x00\x00s\n\x00\x00\x00t\x00\x00|\x00\x00\x83\x01\x00S)\x01N)' 19 | b'\x01\xda\x03int)\x01\xda\x03val\xa9\x00r\x03\x00\x00\x00\xfa?c:/Users/Shell' 20 | b'ay/Documents/GitHub/metaparse/tests/test_basic.py\xda\x01_h\x00\x00\x00' 21 | b's\x02\x00\x00\x00\x00\x02'] 22 | 23 | rules = [('sexp^', ('sexp',)), 24 | ('sexp', ('atom',)), 25 | ('sexp', ('LEFT', 'slist', 'RIGHT')), 26 | ('slist', ()), 27 | ('slist', ('slist', 'sexp')), 28 | ('atom', ('NUMBER',)), 29 | ('atom', ('SYMBOL',))] 30 | 31 | ACTION = [{'LEFT': ('shift', 3), 'NUMBER': ('shift', 4), 'SYMBOL': ('shift', 5)}, 32 | {'\x03': ('accept', 0)}, 33 | {'\x03': ('reduce', 1), 34 | 'LEFT': ('reduce', 1), 35 | 'NUMBER': ('reduce', 1), 36 | 'RIGHT': ('reduce', 1), 37 | 'SYMBOL': ('reduce', 1)}, 38 | {'LEFT': ('reduce', 3), 39 | 'NUMBER': ('reduce', 3), 40 | 'RIGHT': ('reduce', 3), 41 | 'SYMBOL': ('reduce', 3)}, 42 | {'\x03': ('reduce', 5), 43 | 'LEFT': ('reduce', 5), 44 | 'NUMBER': ('reduce', 5), 45 | 'RIGHT': ('reduce', 5), 46 | 'SYMBOL': ('reduce', 5)}, 47 | {'\x03': ('reduce', 6), 48 | 'LEFT': ('reduce', 6), 49 | 'NUMBER': ('reduce', 6), 50 | 'RIGHT': ('reduce', 6), 51 | 'SYMBOL': ('reduce', 6)}, 52 | {'LEFT': ('shift', 3), 53 | 'NUMBER': ('shift', 4), 54 | 'RIGHT': ('shift', 7), 55 | 'SYMBOL': ('shift', 5)}, 56 | {'\x03': ('reduce', 2), 57 | 'LEFT': ('reduce', 2), 58 | 'NUMBER': ('reduce', 2), 59 | 'RIGHT': ('reduce', 2), 60 | 'SYMBOL': ('reduce', 2)}, 61 | {'LEFT': ('reduce', 4), 62 | 'NUMBER': ('reduce', 4), 63 | 'RIGHT': ('reduce', 4), 64 | 'SYMBOL': ('reduce', 4)}] 65 | 66 | GOTO = [{'LEFT': 3, 'NUMBER': 4, 'SYMBOL': 5, 'atom': 2, 'sexp': 1}, 67 | {}, 68 | {}, 69 | {'slist': 6}, 70 | {}, 71 | {}, 72 | {'LEFT': 3, 'NUMBER': 4, 'RIGHT': 7, 'SYMBOL': 5, 'atom': 2, 'sexp': 8}, 73 | {}, 74 | {}] 75 | 76 | semans = [b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 77 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x01x' 78 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfa8c:\\Users\\Shellay\\Documents\\GitHu' 79 | b'b\\metaparse\\metaparse.py\xda\x08identity1\x00\x00\x00s\x02\x00\x00\x00\x00' 80 | b'\x01', 81 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 82 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x04atomr' 83 | b'\x01\x00\x00\x00r\x01\x00\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/met' 84 | b'aparse/tests/test_basic.py\xda\x04sexpl\x00\x00\x00s\x02\x00\x00' 85 | b'\x00\x00\x02', 86 | b'\xe3\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x01\x00\x00' 87 | 
b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x01\x00S)\x01N\xa9\x00)\x03\xda\x04L' 88 | b'EFT\xda\x05slist\xda\x05RIGHTr\x01\x00\x00\x00r\x01\x00\x00\x00\xfa?c:/User' 89 | b's/Shellay/Documents/GitHub/metaparse/tests/test_basic.py\xda\x04sexpo\x00' 90 | b'\x00\x00s\x02\x00\x00\x00\x00\x02', 91 | b'\xe3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00' 92 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00g\x00\x00S)\x01N\xa9\x00r\x01\x00\x00\x00' 93 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/me' 94 | b'taparse/tests/test_basic.py\xda\x05slistt\x00\x00\x00s\x02\x00\x00\x00\x00' 95 | b'\x02', 96 | b'\xe3\x02\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00' 97 | b'\x00C\x00\x00\x00s\x11\x00\x00\x00|\x00\x00j\x00\x00|\x01\x00\x83' 98 | b'\x01\x00\x01|\x00\x00S)\x01N)\x01\xda\x06append)\x02\xda\x05slist\xda\x04s' 99 | b'exp\xa9\x00r\x04\x00\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/metapa' 100 | b'rse/tests/test_basic.pyr\x02\x00\x00\x00w\x00\x00\x00s\x04\x00\x00' 101 | b'\x00\x00\x02\r\x01', 102 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 103 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x06NUMBE' 104 | b'Rr\x01\x00\x00\x00r\x01\x00\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/m' 105 | b'etaparse/tests/test_basic.py\xda\x04atom|\x00\x00\x00s\x02\x00\x00\x00\x00' 106 | b'\x02', 107 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 108 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x06SYMBO' 109 | b'Lr\x01\x00\x00\x00r\x01\x00\x00\x00\xfa?c:/Users/Shellay/Documents/GitHub/m' 110 | b'etaparse/tests/test_basic.py\xda\x04atom\x7f\x00\x00\x00s\x02' 111 | b'\x00\x00\x00\x00\x02'] 112 | -------------------------------------------------------------------------------- /examples/eg_dumps_file.py: -------------------------------------------------------------------------------- 1 | lex2pats = [('IGNORED', '\\s+'), 2 | ('EQ', '='), 3 | ('NUM', '[1-9]\\d*'), 4 | ('ID', '[_a-zA-Z]\\w*'), 5 | ('POW', '\\*\\*'), 6 | ('MUL', '\\*'), 7 | ('ADD', '\\+')] 8 | 9 | handlers = [None, 10 | None, 11 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00' 12 | b'\x00C\x00\x00\x00s\n\x00\x00\x00t\x00\x00|\x00\x00\x83\x01\x00S)\x01N)' 13 | b'\x01\xda\x05float)\x01\xda\x03lex\xa9\x00r\x03\x00\x00\x00\xfaVc:/Users/She' 14 | b'llay/Documents/GitHub/metaparse/experiments/lessparse/examples/eg_dumps.' 
15 | b'py\xda\x03NUM\x0c\x00\x00\x00s\x02\x00\x00\x00\x00\x01', 16 | None, 17 | None, 18 | None, 19 | None] 20 | 21 | rules = [('assign^', ('assign',)), 22 | ('assign', ('ID', 'EQ', 'expr')), 23 | ('expr', ('NUM',)), 24 | ('expr', ('ID',)), 25 | ('expr', ('expr', 'ADD', 'expr')), 26 | ('expr', ('expr', 'MUL', 'expr')), 27 | ('expr', ('expr', 'POW', 'expr'))] 28 | 29 | ACTION1 = [{'ID': ('shift', 2)}, 30 | {'\x03': ('reduce', 0)}, 31 | {'EQ': ('shift', 3)}, 32 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 33 | {'\x03': ('reduce', 1), 34 | 'ADD': ('shift', 7), 35 | 'MUL': ('shift', 8), 36 | 'POW': ('shift', 9)}, 37 | {'\x03': ('reduce', 2), 38 | 'ADD': ('reduce', 2), 39 | 'MUL': ('reduce', 2), 40 | 'POW': ('reduce', 2)}, 41 | {'\x03': ('reduce', 3), 42 | 'ADD': ('reduce', 3), 43 | 'MUL': ('reduce', 3), 44 | 'POW': ('reduce', 3)}, 45 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 46 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 47 | {'ID': ('shift', 6), 'NUM': ('shift', 5)}, 48 | {'\x03': ('reduce', 4), 49 | 'ADD': ('reduce', 4), 50 | 'MUL': ('shift', 8), 51 | 'POW': ('shift', 9)}, 52 | {'\x03': ('reduce', 5), 53 | 'ADD': ('reduce', 5), 54 | 'MUL': ('reduce', 5), 55 | 'POW': ('shift', 9)}, 56 | {'\x03': ('reduce', 6), 57 | 'ADD': ('reduce', 6), 58 | 'MUL': ('reduce', 6), 59 | 'POW': ('reduce', 6)}] 60 | 61 | GOTO = [{'ID': 2, 'assign': 1}, 62 | {}, 63 | {'EQ': 3}, 64 | {'ID': 6, 'NUM': 5, 'expr': 4}, 65 | {'ADD': 7, 'MUL': 8, 'POW': 9}, 66 | {}, 67 | {}, 68 | {'ID': 6, 'NUM': 5, 'expr': 10}, 69 | {'ID': 6, 'NUM': 5, 'expr': 11}, 70 | {'ID': 6, 'NUM': 5, 'expr': 12}, 71 | {'ADD': 7, 'MUL': 8, 'POW': 9}, 72 | {'ADD': 7, 'MUL': 8, 'POW': 9}, 73 | {'ADD': 7, 'MUL': 8, 'POW': 9}] 74 | 75 | semans = [b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 76 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x01x' 77 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfaNc:\\Users\\Shellay\\Documents\\GitHu' 78 | b'b\\metaparse\\experiments\\lessparse\\metaparse.py\xda\x08identity' 79 | b'+\x00\x00\x00s\x02\x00\x00\x00\x00\x01', 80 | b'\xe3\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00' 81 | b'\x00C\x00\x00\x00s\x0e\x00\x00\x00|\x02\x00t\x00\x00|\x00\x00' # + TWN 51 | COMMA = r',' # + TW 52 | SEMI = r';' # + TW 53 | L1 = r'\(' # + TW 54 | R1 = r'\)' # + TW 55 | 56 | VALUE = r'\d+' # + TW 57 | VAR = r'[_a-z]\w*' # + TW 58 | CONS = r'[A-Z]\w*' # + TW 59 | 60 | INFIX = r'[\+\-\*\/]' # + TW 61 | 62 | def prog(binds): 63 | return binds 64 | 65 | # Stand-alone expression 66 | def exprx(expr): 67 | return expr 68 | def exprx(let): 69 | return let 70 | def exprx(abst): 71 | return abst 72 | def exprx(appl): 73 | return appl 74 | 75 | # Atomic expression 76 | def expr(VALUE): 77 | return float(VALUE) 78 | def expr(VAR): 79 | return VAR 80 | def expr(L1, exprx, R1): 81 | return exprx 82 | 83 | # Pattern 84 | def pat(VAR): 85 | return VAR 86 | def pat(CONS, arglist): 87 | return (CONS, arglist) 88 | def arglist(arglist, expr): 89 | return arglist + (expr,) 90 | def arglist(): 91 | return () 92 | 93 | # Application (Curried) 94 | def appl(expr_1, expr_2): 95 | return Appl(expr_1, expr_2) 96 | def appl(appl, expr): 97 | return Appl(appl, expr) 98 | def appl(expr_1, INFIX, expr_2): 99 | return Appl(INFIX, expr_1, expr_2) 100 | 101 | # Lambda-Abstraction (also Curried) 102 | def abst(LAMBDA, parlist, ARROW, exprx): 103 | tar = exprx 104 | for par in reversed(parlist): 105 | tar = Abst(par, tar) 106 | return tar 107 | def parlist(VAR): 108 | return [VAR] 109 | def parlist(parlist, 
COMMA, VAR): 110 | return [*parlist, VAR] 111 | 112 | # Let-expression with environmental bindings 113 | def let(LET, binds, IN, exprx): 114 | return Let(binds, exprx) 115 | def bind(pat, EQ, exprx): 116 | return {pat: exprx} 117 | def binds(bind): 118 | return bind 119 | def binds(binds, SEMI, bind): 120 | return {**binds, **bind} 121 | 122 | # def _env(): 123 | # print('Env!') 124 | 125 | # def _unify(): 126 | # print('Unify!') 127 | 128 | 129 | # Test whether the grammar is LALR to exclude potential ambiguity 130 | # and prepare for better performance 131 | psr_lalr = Lam 132 | 133 | 134 | inp = """ 135 | 136 | k = let 137 | a = 3 ; 138 | P p q = u v 139 | in 140 | map (\c, d -> f c d) xs ys ; 141 | 142 | l = 3 ; 143 | m = 4 144 | """ 145 | 146 | # r = psr_gll.parse_many(inp) 147 | # r = psr_glr.parse_many(inp) 148 | # print(r) 149 | 150 | # assert 0 151 | 152 | inp = """ 153 | k = let a = 3 ; 154 | P q = u v # 155 | !in $$ 156 | map (\c, d -> f c d) xs ys 157 | """ 158 | 159 | 160 | # print(Lam) 161 | # psr_gll.interpret(inp) # LEFT-RECURSION!!!! 162 | # psr_glr.interpret(inp) 163 | # psr_lalr.interpret(inp) 164 | 165 | psr = psr_lalr 166 | # psr = psr_glr 167 | # psr = psr_ear 168 | 169 | tough_inp = ' ;\n'.join([inp for _ in range(10)]) 170 | # tough_inp = ' ;\n'.join([inp for _ in range(100)]) 171 | 172 | # pp.pprint(list(psr.grammar.tokenize(inp, False))) 173 | # pp.pprint(psr.interpret_many(inp)) 174 | # print(len(psr.ACTION)) 175 | # pp.pprint(psr.ACTION) 176 | 177 | pp.pprint(psr.interpret(tough_inp)) 178 | 179 | s = psr.dumps() 180 | psr1 = psr.loads(s, globals()) 181 | # timeit psr.loads(s, globals()) 182 | # timeit LALR(Lam) 183 | 184 | pp.pprint(psr1.interpret(tough_inp)) 185 | 186 | 187 | # assert psr_glr.interpret_many(tough_inp)[0] == psr1.interpret(tough_inp) 188 | assert psr.interpret(tough_inp) == psr1.interpret(tough_inp) 189 | -------------------------------------------------------------------------------- /experiments/LL.py: -------------------------------------------------------------------------------- 1 | import preamble 2 | from metaparse import * 3 | 4 | @meta 5 | class WLL1(ParserDeterm): 6 | """Weak-LL(1)-Parser. 7 | 8 | Since 'strong'-LL(1) grammar parser includes the usage of FOLLOW 9 | set, which is only heuristically helpful for the recognitive 10 | capability when handling NULLABLE rules, this parser suppress the 11 | need of FOLLOW. 12 | 13 | When deducing a NULLABLE nonterminal A with some lookahead a, if a 14 | does not belong to any FIRST of A's alternatives, then the NULL 15 | alternative is chosen. In other words, all terminals not in 16 | FIRST(A) leads to the prediction (as well as immediate reduction) 17 | of (A -> ε) in the predictive table. 18 | 19 | This variation allows predicting (A -> ε) even when lookahead a is 20 | not in FOLLOW, which means this parser will postpone the 21 | recognition error compared to strong-LL(1) parser. 22 | 23 | """ 24 | 25 | def __init__(self, grammar): 26 | self.grammar = grammar 27 | self.lexer = Lexer.from_grammar(grammar) 28 | self.semans = grammar.semans 29 | self._calc_ll1_table() 30 | 31 | def _calc_ll1_table(self): 32 | G = self.grammar 33 | table = self.table = {} 34 | for r, rule in enumerate(G.rules): 35 | lhs, rhs = rule 36 | if lhs not in table: 37 | table[lhs] = {} 38 | # NON-NULL rule 39 | if rhs: 40 | for a in G.first_of_seq(rhs, EPSILON): 41 | if a is EPSILON: 42 | pass 43 | elif a in table[lhs]: 44 | raise GrammarError('Not simple LL(1) grammar! 
') 45 | else: 46 | table[lhs][a] = rule 47 | # NULL rule 48 | # This rule tends to be tried when 49 | # the lookahead doesn't appear in 50 | # other sibling rules. 51 | else: 52 | pass 53 | 54 | def parse(self, inp, interp=False): 55 | """The process is exactly the `translate' process of a ParseTree. 56 | 57 | """ 58 | # Backtracking is yet supported 59 | # Each choice should be deterministic 60 | push = list.append 61 | pop = list.pop 62 | G = self.grammar 63 | pstack = self.pstack = [] 64 | table = self.table 65 | toker = enumerate(self.lexer.tokenize(inp, with_end=True)) 66 | pstack.append(G.rules[0].lhs) 67 | argstack = [] 68 | try: 69 | k, tok = next(toker) 70 | while pstack: 71 | actor = pop(pstack) 72 | at, look, tokval = tok 73 | # Reduction 74 | if isinstance(actor, Rule): 75 | args = [] 76 | # Pop the size of args, conclude subtree 77 | # for prediction made before 78 | for _ in actor.rhs: 79 | args.insert(0, pop(argstack)) 80 | if interp: 81 | arg1 = actor.seman(*args) 82 | else: 83 | arg1 = ParseTree(actor, args) 84 | # Finish - no prediction in stack 85 | # Should declare end-of-input 86 | if not pstack: 87 | return arg1 88 | else: 89 | push(argstack, arg1) 90 | # Make prediction on nonterminal 91 | elif actor in G.nonterminals: 92 | if look in table[actor]: 93 | pred = table[actor][look] 94 | # Singal for reduction 95 | push(pstack, pred) 96 | # Push symbols into prediction-stack, 97 | # last symbol first in. 98 | for x in reversed(pred.rhs): 99 | push(pstack, x) 100 | # !!! Heuristically do epsilon-reduction when no 101 | # viable lookahead found 102 | elif actor in G.NULLABLE: 103 | for r0 in G.rules: 104 | if r0.lhs == actor and not r0.rhs: 105 | if interp: 106 | argstack.append(r0.seman()) 107 | else: 108 | argstack.append(ParseTree(r0, [])) 109 | # Recognition failed, ignore 110 | else: 111 | raise ParserError('No production found.') 112 | # Try match terminal 113 | else: 114 | if actor == look: 115 | if interp: 116 | argstack.append(tokval) 117 | else: 118 | argstack.append(tok) 119 | k, tok = next(toker) 120 | 121 | except StopIteration: 122 | raise ParserError('No enough tokens to complete parsing.') 123 | 124 | -------------------------------------------------------------------------------- /examples/sexp_dump.py: -------------------------------------------------------------------------------- 1 | lex2pats = \ 2 | [('COMMA', ','), 3 | ('IGNORED', '%'), 4 | ('RIGHT', '\\)'), 5 | ('LEFT', '\\('), 6 | ('IGNORED', '\\s+'), 7 | ('SYMBOL', '[_a-zA-Z]\\w*'), 8 | ('UNKNOWN', '&'), 9 | ('NUMBER', '[1-9]\\d*(\\.\\d*)?')] 10 | 11 | handlers = \ 12 | [None, 13 | None, 14 | None, 15 | None, 16 | None, 17 | None, 18 | None, 19 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00' 20 | b'\x00C\x00\x00\x00s\n\x00\x00\x00t\x00\x00|\x00\x00\x83\x01\x00S)\x01N)' 21 | b'\x01\xda\x03int)\x01\xda\x03val\xa9\x00r\x03\x00\x00\x00\xfaOc:\\Users\\S' 22 | b'hellay\\Documents\\GitHub\\metaparse\\experiments\\lessparse\\test_basic.p' 23 | b'y\xda\x01_g\x00\x00\x00s\x02\x00\x00\x00\x00\x02'] 24 | 25 | rules = \ 26 | [('sexp^', ('sexp',)), 27 | ('sexp', ('atom',)), 28 | ('sexp', ('LEFT', 'slist', 'RIGHT')), 29 | ('slist', ()), 30 | ('slist', ('slist', 'sexp')), 31 | ('atom', ('NUMBER',)), 32 | ('atom', ('SYMBOL',))] 33 | 34 | ACTION1 = \ 35 | [{'LEFT': ('shift', 3), 'NUMBER': ('shift', 4), 'SYMBOL': ('shift', 5)}, 36 | {'\x03': ('reduce', 0)}, 37 | {'\x03': ('reduce', 1), 38 | 'LEFT': ('reduce', 1), 39 | 'NUMBER': ('reduce', 1), 40 | 'RIGHT': ('reduce', 1), 41 | 'SYMBOL': 
('reduce', 1)}, 42 | {'LEFT': ('reduce', 3), 43 | 'NUMBER': ('reduce', 3), 44 | 'RIGHT': ('reduce', 3), 45 | 'SYMBOL': ('reduce', 3)}, 46 | {'\x03': ('reduce', 5), 47 | 'LEFT': ('reduce', 5), 48 | 'NUMBER': ('reduce', 5), 49 | 'RIGHT': ('reduce', 5), 50 | 'SYMBOL': ('reduce', 5)}, 51 | {'\x03': ('reduce', 6), 52 | 'LEFT': ('reduce', 6), 53 | 'NUMBER': ('reduce', 6), 54 | 'RIGHT': ('reduce', 6), 55 | 'SYMBOL': ('reduce', 6)}, 56 | {'LEFT': ('shift', 3), 57 | 'NUMBER': ('shift', 4), 58 | 'RIGHT': ('shift', 7), 59 | 'SYMBOL': ('shift', 5)}, 60 | {'\x03': ('reduce', 2), 61 | 'LEFT': ('reduce', 2), 62 | 'NUMBER': ('reduce', 2), 63 | 'RIGHT': ('reduce', 2), 64 | 'SYMBOL': ('reduce', 2)}, 65 | {'LEFT': ('reduce', 4), 66 | 'NUMBER': ('reduce', 4), 67 | 'RIGHT': ('reduce', 4), 68 | 'SYMBOL': ('reduce', 4)}] 69 | 70 | GOTO = \ 71 | [{'LEFT': 3, 'NUMBER': 4, 'SYMBOL': 5, 'atom': 2, 'sexp': 1}, 72 | {}, 73 | {}, 74 | {'slist': 6}, 75 | {}, 76 | {}, 77 | {'LEFT': 3, 'NUMBER': 4, 'RIGHT': 7, 'SYMBOL': 5, 'atom': 2, 'sexp': 8}, 78 | {}, 79 | {}] 80 | 81 | semans = \ 82 | [b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 83 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x01x' 84 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfaNc:\\Users\\Shellay\\Documents\\GitHu' 85 | b'b\\metaparse\\experiments\\lessparse\\metaparse.py\xda\x08identity' 86 | b'*\x00\x00\x00s\x02\x00\x00\x00\x00\x01', 87 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 88 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x04atomr' 89 | b'\x01\x00\x00\x00r\x01\x00\x00\x00\xfaOc:\\Users\\Shellay\\Documents\\GitHub' 90 | b'\\metaparse\\experiments\\lessparse\\test_basic.py\xda\x04sexpk\x00\x00\x00' 91 | b's\x02\x00\x00\x00\x00\x02', 92 | b'\xe3\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x01\x00\x00' 93 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x01\x00S)\x01N\xa9\x00)\x03\xda\x04L' 94 | b'EFT\xda\x05slist\xda\x05RIGHTr\x01\x00\x00\x00r\x01\x00\x00\x00\xfaOc:\\User' 95 | b's\\Shellay\\Documents\\GitHub\\metaparse\\experiments\\lessparse\\test_basi' 96 | b'c.py\xda\x04sexpn\x00\x00\x00s\x02\x00\x00\x00\x00\x02', 97 | b'\xe3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00' 98 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00g\x00\x00S)\x01N\xa9\x00r\x01\x00\x00\x00' 99 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfaOc:\\Users\\Shellay\\Documents\\GitHu' 100 | b'b\\metaparse\\experiments\\lessparse\\test_basic.py\xda\x05slists\x00' 101 | b'\x00\x00s\x02\x00\x00\x00\x00\x02', 102 | b'\xe3\x02\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00' 103 | b'\x00C\x00\x00\x00s\x11\x00\x00\x00|\x00\x00j\x00\x00|\x01\x00\x83' 104 | b'\x01\x00\x01|\x00\x00S)\x01N)\x01\xda\x06append)\x02\xda\x05slist\xda\x04s' 105 | b'exp\xa9\x00r\x04\x00\x00\x00\xfaOc:\\Users\\Shellay\\Documents\\GitHub\\me' 106 | b'taparse\\experiments\\lessparse\\test_basic.pyr\x02\x00\x00\x00v\x00\x00\x00' 107 | b's\x04\x00\x00\x00\x00\x02\r\x01', 108 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 109 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x06NUMBE' 110 | b'Rr\x01\x00\x00\x00r\x01\x00\x00\x00\xfaOc:\\Users\\Shellay\\Documents\\GitH' 111 | b'ub\\metaparse\\experiments\\lessparse\\test_basic.py\xda\x04atom{\x00' 112 | b'\x00\x00s\x02\x00\x00\x00\x00\x02', 113 | b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 114 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x06SYMBO' 115 | 
b'Lr\x01\x00\x00\x00r\x01\x00\x00\x00\xfaOc:\\Users\\Shellay\\Documents\\GitH' 116 | b'ub\\metaparse\\experiments\\lessparse\\test_basic.py\xda\x04atom~\x00' 117 | b'\x00\x00s\x02\x00\x00\x00\x00\x02'] 118 | -------------------------------------------------------------------------------- /examples/eg_dumps_direct_use.py: -------------------------------------------------------------------------------- 1 | from eg_dumps_file import * 2 | 3 | import re 4 | import types 5 | import marshal 6 | import warnings 7 | import pprint as pp 8 | 9 | from collections import namedtuple, deque 10 | 11 | lex2rgxs = [(lex, re.compile(pat)) for lex, pat in lex2pats] 12 | 13 | lex_handlers = { 14 | name: types.FunctionType(marshal.loads(src), globals()) 15 | for name, src in lex_handler_sources.items() 16 | } 17 | 18 | semans = [ 19 | types.FunctionType(marshal.loads(src), globals()) 20 | for src in seman_sources 21 | ] 22 | 23 | Rule = namedtuple('Rule', 'lhs rhs') 24 | Rule.__repr__ = lambda s: '({} = {})'.format(s.lhs, ' '.join(s.rhs)) 25 | 26 | Item = namedtuple('Item', 'rule pos') 27 | Item.__repr__ = lambda s: '({} = {}.{})'.format(s.rule.lhs, 28 | ' '.join(s.rule.rhs[:s.pos]), 29 | ' '.join(s.rule.rhs[s.pos:])) 30 | 31 | rules = [Rule(l, r) for l, r in rules] 32 | Ks = [[Item(rules[r], pos) for r, pos in K] for K in Ks] 33 | 34 | 35 | Token = namedtuple('Token', 'at symbol lexeme value') 36 | Token.__repr__ = lambda s: '({} = {})'.format(s.symbol, repr(s.value)) 37 | 38 | 39 | 40 | def tokenize(inp, with_end=True): 41 | 42 | pos = 0 43 | while pos < len(inp): 44 | # raw string match 45 | raw_match = False 46 | # re match 47 | n = None 48 | m = None 49 | for cat, rgx in lex2rgxs: 50 | # raw 51 | if rgx is None: 52 | if inp.startswith(cat, pos): 53 | yield Token(pos, cat, cat, cat) 54 | pos += len(cat) 55 | raw_match = True 56 | break 57 | # re 58 | else: 59 | m = rgx.match(inp, pos=pos) 60 | # The first match with non-zero length is yielded. 61 | if m and len(m.group()) > 0: 62 | n = cat 63 | break 64 | if raw_match: 65 | continue 66 | elif m: 67 | assert isinstance(n, str) 68 | if n == 'IGNORED': 69 | # Need IGNORED handler? 70 | at, pos = m.span() 71 | elif n == 'ERROR': 72 | # Call ERROR handler! 73 | at, pos = m.span() 74 | lxm = m.group() 75 | if 'ERROR' in lex_handlers: 76 | # Suppress error token and call handler. 77 | lex_handlers[ERROR](lxm) 78 | # yield Token(at, ERROR, lxm, h(lxm)) 79 | else: 80 | # Yield error token when no handler available. 81 | yield Token(at, ERROR, lxm, lxm) 82 | else: 83 | at, pos = m.span() 84 | lxm = m.group() 85 | if n in lex_handlers: 86 | # Call normal token handler. 87 | h = lex_handlers[n] 88 | # Bind semantic value. 89 | yield Token(at, n, lxm, h(lxm)) 90 | else: 91 | yield Token(at, n, lxm, lxm) 92 | else: 93 | # Report unrecognized Token here! 
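            # None of the raw or regex patterns matched at the current
            # position, so tokenization cannot proceed; the message below
            # echoes the input consumed so far to help locate the offending
            # character before raising.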
94 | msg = '\n'.join([ 95 | '', 96 | '=========================', 97 | 'No defined pattern starts with char `{}` @{}'.format(inp[pos], pos), 98 | '', 99 | '* Consumed input: ', 100 | repr(inp[:pos]), 101 | '=========================', 102 | '', 103 | ]) 104 | raise GrammarError(msg) 105 | if with_end: 106 | yield Token(pos, 'END', None, None) 107 | 108 | 109 | def parse(inp, interp=False, n_warns=5): 110 | 111 | trees = [] 112 | sstack = [0] 113 | 114 | toker = tokenize(inp, with_end=True) # Use END to force finishing by ACCEPT 115 | tok = next(toker) 116 | warns = [] 117 | 118 | try: 119 | while 1: 120 | 121 | # Peek state 122 | s = sstack[-1] 123 | 124 | if tok.symbol not in ACTION[s]: 125 | msg = '\n'.join([ 126 | '', 127 | 'WARNING: ', 128 | 'LALR - Ignoring syntax error reading Token {}'.format(tok), 129 | '- Current kernel derivation stack:', 130 | pp.pformat([Ks[i] for i in sstack]), 131 | '- Expecting tokens and actions:', 132 | pp.pformat(ACTION[s]), 133 | '- But got: \n{}'.format(tok), 134 | '', 135 | ]) 136 | warnings.warn(msg) 137 | warns.append(msg) 138 | if len(warns) == n_warns: 139 | raise ValueError( 140 | 'Warning tolerance {} reached. Parsing exited.'.format(n_warns)) 141 | else: 142 | tok = next(toker) 143 | 144 | else: 145 | act, arg = ACTION[s][tok.symbol] 146 | 147 | # SHIFT 148 | if act == 'SHIFT': 149 | if interp: 150 | trees.append(tok.value) 151 | else: 152 | trees.append(tok) 153 | sstack.append(GOTO[s][tok.symbol]) 154 | # Go on scanning 155 | tok = next(toker) 156 | 157 | # REDUCE 158 | elif act == 'REDUCE': 159 | assert isinstance(arg, int) 160 | rule = lhs, rhs = rules[arg] 161 | seman = semans[arg] 162 | subts = deque() 163 | for _ in rhs: 164 | subt = trees.pop() 165 | subts.appendleft(subt) 166 | sstack.pop() 167 | if interp: 168 | tree = seman(*subts) 169 | else: 170 | tree = ((rule, seman), list(subts)) 171 | trees.append(tree) 172 | sstack.append(GOTO[sstack[-1]][lhs]) 173 | 174 | # ACCEPT 175 | elif act == 'ACCEPT': 176 | # Reduce the top semantics. 177 | assert isinstance(arg, int), arg 178 | rule = rules[arg] 179 | seman = semans[arg] 180 | if interp: 181 | return seman(*trees) 182 | else: 183 | assert len(trees) == 1 184 | return trees[0] 185 | else: 186 | raise ValueError('Invalid action {} on {}'.format(act, arg)) 187 | 188 | except StopIteration: 189 | raise ValueError('No enough tokens for completing the parse. ') 190 | 191 | 192 | def interpret(inp): 193 | return parse(inp, interp=True) 194 | 195 | table = {} 196 | 197 | inp = 'x = 1 + 2 7 ** 3 * 5 + 9' 198 | 199 | ts = list(tokenize(inp)) 200 | pp.pprint(ts) 201 | 202 | r = interpret(inp) 203 | 204 | pp.pprint(table) 205 | pp.pprint(r) 206 | -------------------------------------------------------------------------------- /experiments/peg.py: -------------------------------------------------------------------------------- 1 | # Experimental implementation for Parser Expression Grammar, 2 | # represented by EBNF-like notation. 
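#
# A rough usage sketch, based on the definitions below: a grammar is a
# plain dict mapping nonterminal names to expression objects (Terminal,
# Seq, Alt, Star, Opt), and parse(G, start, inp) returns a pair
# (parse_tree, remaining_input), or FAIL = (None, None) when nothing
# matches.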
3 | 4 | import re 5 | 6 | from collections import namedtuple as data 7 | 8 | Rule = data('Rule', 'lhs rhs') 9 | 10 | # Expression is a superclass, which can be subclassed into 11 | # - Terminal 12 | # - Nonterminal 13 | # - Alternatves 14 | # - Sequence 15 | # - Repeated/Star 16 | # - Optional/Opt 17 | 18 | Expr = data('Symbol', 'symbol') 19 | 20 | Nonterminal = data('Nonterminal', 'symb') 21 | Nonterminal = str 22 | Terminal = data('Terminal', 'symb regexp') 23 | Star = data('Star', 'sub') 24 | Opt = data('Opt', 'sub') 25 | Plus = data('Plus', 'sub') 26 | Seq = data('Seq', 'subs') # Using python list rather than CONS structure. 27 | Alt = data('Alt', 'subs') # Using python list rather than CONS structure. 28 | Nil = None 29 | 30 | is_a = isinstance 31 | 32 | # Notes: 33 | 34 | # To allow parsing expressions to include Sequence and Kleene Closure, 35 | # there must be a corresponding sequenctial structure behaving as a 36 | # primitive construction of a parse tree's subtrees. Theoretical it 37 | # can be described as a monoid, which defines Unit(the empty) and 38 | # Append(operation of accumulating). 39 | 40 | # To represent the parsing result more simply, a parse result is 41 | # either a ([Tree], Inp) or a (Tree, inp), whereas the latter`s first 42 | # component can be represented as a singleton list. 43 | 44 | # data Result = ([Tree], String) | (Tree, String) | FAIL 45 | 46 | FAIL = (None, None) 47 | 48 | def parse(G, x, inp): 49 | if is_a(x, Terminal): 50 | return parse_terminal(G, x, inp) 51 | elif is_a(x, Nonterminal): 52 | sub, inp1 = parse(G, G[x], inp) 53 | if (sub, inp1) == FAIL: 54 | return FAIL 55 | else: 56 | # Make a parse tree of 1 Nonterminal. 57 | # return (x.symb, sub), inp1 58 | return (x, sub), inp1 59 | elif is_a(x, Alt): 60 | return parse_alts(G, x.subs, inp) 61 | elif is_a(x, Seq): 62 | return parse_seq(G, x.subs, inp) 63 | elif is_a(x, Star): 64 | return parse_star(G, x.sub, inp) 65 | elif is_a(x, Opt): 66 | return parse_opt(G, x.sub, inp) 67 | else: 68 | raise TypeError('{} is not an expression.'.format(x)) 69 | 70 | def parse_terminal(G, x: Terminal, inp: str): 71 | if not inp: 72 | return FAIL 73 | else: 74 | m = re.match(x.regexp, inp, re.MULTILINE) # Matching MULTILINE activated. 75 | if not m: 76 | return FAIL 77 | else: 78 | _, end = m.span() 79 | tokval = re.sub(r'\s+', '', inp[:end]) 80 | return (x.symb, tokval), inp[end:] 81 | 82 | def parse_alts(G, alts: [Expr], inp: str) -> (tuple, str): 83 | """May return a OR-tree here. Recall each parse tree is an AND-OR 84 | tree. 85 | 86 | """ 87 | pf = [] 88 | for a in alts: 89 | t, inp1 = parse(G, a, inp) 90 | if (t, inp1) != FAIL: 91 | return (t, inp1) 92 | return FAIL 93 | 94 | def parse_seq(G, subs: [Expr], inp: str) -> (tuple, str): 95 | ss = [] 96 | for sub in subs: 97 | (t1, inp1) = parse(G, sub, inp) 98 | if (t1, inp1) != FAIL: 99 | # See whether the result is list or atom. 100 | if isinstance(t1, list): 101 | # For parse_star, parse_opt, parse_seq the result is a 102 | # list. 103 | ss.extend(t1) 104 | else: 105 | # For parse_terminal the result is an atom. It is a 106 | # singleton list as parse forest per se!!! For parse, 107 | # parse_alts the result maybe either. 108 | ss.append(t1) 109 | inp = inp1 110 | else: 111 | return FAIL 112 | # May convert singleton list to single node. 113 | if len(ss) == 1: 114 | return ss[0], inp 115 | else: 116 | return ss, inp 117 | 118 | 119 | # Extended monoidic expressional structures. 
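# (parse_star and parse_opt return a plain, possibly empty Python list of
# subtrees rather than a single node; parse_seq above splices such list
# results into the enclosing sequence via extend, which roughly plays the
# role of the "Append" operation of the monoid described in the notes
# above.)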
120 | 121 | def parse_star(G, sub: Expr, inp: str) -> (tuple, str): 122 | 'sub is the expression enclosed by Star.' 123 | rep = [] 124 | while 1 and inp: 125 | t1, inp1 = parse(G, sub, inp) 126 | if (t1, inp1) != FAIL: 127 | rep.append(t1) 128 | inp = inp1 129 | else: 130 | break 131 | return rep, inp 132 | 133 | def parse_opt(G, sub: Expr, inp: str) -> (tuple, str): 134 | opt = [] 135 | t1, inp1 = parse(G, sub, inp) 136 | if (t1, inp1) != FAIL: 137 | opt.append(t1) 138 | inp = inp1 139 | return [], inp 140 | 141 | 142 | # i1 = Seq([Terminal('NUM', r'\d+'), Terminal('SPC', r'\s+'), Terminal('NUM', r'\d+')]) 143 | # i2 = Seq([Terminal('NUM', r'\d+'), Terminal('SPC', r'\s+'), Terminal('NUM', r'[A-Za-z_]\w+')]) 144 | # parse_seq(None, i1.subs, '123 456') 145 | # parse_seq(None, i2.subs, '123 456') 146 | 147 | # parse(None, i1, '123 456') 148 | # parse(None, i2, '123 456') 149 | G1 = {Nonterminal('E'): Seq([Nonterminal('T'), 150 | Star(Seq([Terminal('PLUS', r'\+'), Nonterminal('T')]))]), 151 | Nonterminal('T'): Seq([Nonterminal('F'), 152 | Star(Seq([Terminal('TIMES', r'\*'), Nonterminal('F')]))]), 153 | Nonterminal('F'): Terminal('NUM', r'\d+'), 154 | } 155 | 156 | 157 | # Bootstrapping grammar. 158 | SPCS = r'\s*' 159 | 160 | p_QUAL = r'[\?\*\+]' 161 | 162 | p_HEAD = r'^' + SPCS 163 | p_LEFT = r'\(' + SPCS 164 | p_RIGHT = r'\)' + SPCS 165 | p_SEMI = r';' + SPCS 166 | p_ALT1 = r'/' + SPCS 167 | p_ALT2 = r'\|' + SPCS 168 | p_ALT = r'[/\|]' + SPCS 169 | p_ARROW = r'(->|::=)' + SPCS 170 | p_SYMBOL = r'[^;/\(\)\|\?\*\+\s]+' + SPCS 171 | 172 | p_RIGHTQ = p_RIGHT + p_QUAL + r'?' + SPCS 173 | p_SYMBOLQ= p_SYMBOL + p_QUAL + r'?' + SPCS 174 | 175 | t_HEAD = Terminal("HEAD" , p_HEAD) 176 | t_LEFT = Terminal("LEFT" , p_LEFT) 177 | t_RIGHT = Terminal("RIGHT" , p_RIGHT) 178 | t_QUAL = Terminal("QUAL" , p_QUAL) 179 | t_SEMI = Terminal("SEMI" , p_SEMI) 180 | t_ALT1 = Terminal("ALT1" , p_ALT1) 181 | t_ALT2 = Terminal("ALT2" , p_ALT2) 182 | t_ALT = Terminal("ALT" , p_ALT) 183 | t_ARROW = Terminal("ARROW" , p_ARROW) 184 | t_SYMBOL = Terminal("SYMBOL", p_SYMBOL) 185 | t_RIGHTQ = Terminal("RIGHTQ", p_RIGHTQ) 186 | t_SYMBOLQ= Terminal("SYMBOLQ", p_SYMBOLQ) 187 | 188 | EBNF = { 189 | 'Rules': Star('Rule'), 190 | 'Rule': Seq(['LHS', t_ARROW, 'RHS']), 191 | 'LHS': t_SYMBOL, 192 | 'RHS': Seq(['Sequence', 193 | Star(Seq([t_ALT, 'Sequence'])), 194 | t_SEMI]), 195 | 'Sequence': Star('Expr'), 196 | 'Expr': Alt([t_SYMBOLQ, 197 | Seq([t_LEFT, 'Sequence', t_RIGHTQ])]), 198 | } 199 | 200 | parse(EBNF, t_SYMBOL, 'ab') 201 | parse(EBNF, t_SYMBOL, 'ab*') 202 | parse(EBNF, t_SYMBOLQ, 'ab') 203 | parse(EBNF, t_SYMBOLQ, 'ab*') 204 | parse(EBNF, t_SYMBOLQ, 'ab +') 205 | parse(EBNF, ('Expr'), 'ab*') 206 | parse(EBNF, ('Expr'), 'ab') 207 | parse(EBNF, ('Expr'), 'ab*') 208 | parse(EBNF, ('Expr'), 'ab +') 209 | parse(EBNF, ('Expr'), 'ab +;') 210 | parse(EBNF, ('Expr'), "(plus E)") 211 | parse(EBNF, ('Sequence'), 'ab + bc?;') 212 | parse(EBNF, ('RHS'), "T (+ E) ;") # Error, using preserved symbol '+' 213 | parse(EBNF, ('RHS'), "T (\+ E) ;") 214 | parse(EBNF, ('RHS'), "T (plus E) ;") 215 | parse(EBNF, ('RHS'), """T (plus E) | T; """) 216 | parse(EBNF, ('RHS'), "a \+ b | a \* b | a? - b; ") 217 | parse(EBNF, ('RHS'), "T plus E;") 218 | parse(EBNF, ('RHS'), "T (plus E) ;") 219 | parse(EBNF, ('Rule'), " -> a \+ b | a \* b | a? 
- b; ") 220 | parse(EBNF, ('Rule'), "E -> T (plus T);") 221 | res = parse(EBNF, Nonterminal('Rules'), """E -> T (plus T)*; 222 | T -> F (times F)*; 223 | F -> id;""") 224 | res = parse(EBNF, Nonterminal('Rules'), """Expr -> atom | left Expr* right; 225 | atom -> id; 226 | """) 227 | 228 | 229 | # Further functionalities: 230 | 231 | # - Detect left factor: Test whether two alternatives of a rule share 232 | # an identical FIRST token. 233 | 234 | # - Detect left recursion: Test whether cycle exits after exploring 235 | # derivation path. 236 | if __name__ == '__main__': 237 | import pprint as pp 238 | pp.pprint(res) 239 | -------------------------------------------------------------------------------- /experiments/meta_dumps.py: -------------------------------------------------------------------------------- 1 | ## This file is generated. Do not modify. 2 | 3 | ## Lexer$BEGIN 4 | 5 | lex2pats = \ 6 | [('NEG', '!'), 7 | ('CON', '&'), 8 | ('DIS', '\\|'), 9 | ('IMP', '->'), 10 | ('IFF', '<=>'), 11 | ('W', '[A-Z]\\w*'), 12 | ('True', None), 13 | ('False', None), 14 | ('(', None), 15 | (')', None), 16 | ('[', None), 17 | (']', None), 18 | ('IGNORED', '[ \\t\\n]'), 19 | ('ERROR', '.')] 20 | 21 | lex_handler_sources = \ 22 | {} 23 | 24 | ## Lexer$END 25 | 26 | 27 | ## Parser$BEGIN 28 | 29 | precedence = \ 30 | {'CON': 4, 'DIS': 3, 'IFF': 1, 'IMP': 2, 'NEG': 5} 31 | 32 | rules = \ 33 | [('Sentence^', ('Sentence',)), 34 | ('Sentence', ['Atomic']), 35 | ('Sentence', ['Complex']), 36 | ('Atomic', ['True']), 37 | ('Atomic', ['False']), 38 | ('Atomic', ['W']), 39 | ('Complex', ['(', 'Sentence', ')']), 40 | ('Complex', ['[', 'Sentence', ']']), 41 | ('Complex', ['NEG', 'Sentence']), 42 | ('Complex', ['Sentence', 'CON', 'Sentence']), 43 | ('Complex', ['Sentence', 'DIS', 'Sentence']), 44 | ('Complex', ['Sentence', 'IMP', 'Sentence']), 45 | ('Complex', ['Sentence', 'IFF', 'Sentence'])] 46 | 47 | seman_sources = \ 48 | [b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 49 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x01x' 50 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfa8c:\\users\\shellay\\documents\\githu' 51 | b'b\\metaparse\\metaparse.py\xda\x07id_func\x90\x00\x00\x00s\x02\x00' 52 | b'\x00\x00\x00\x01', 53 | None, 54 | None, 55 | None, 56 | None, 57 | None, 58 | None, 59 | None, 60 | None, 61 | None, 62 | None, 63 | None, 64 | None] 65 | 66 | Ks = \ 67 | [[(0, 0)], 68 | [(0, 1), (9, 1), (10, 1), (11, 1), (12, 1)], 69 | [(1, 1)], 70 | [(2, 1)], 71 | [(3, 1)], 72 | [(4, 1)], 73 | [(5, 1)], 74 | [(6, 1)], 75 | [(7, 1)], 76 | [(8, 1)], 77 | [(9, 2)], 78 | [(10, 2)], 79 | [(11, 2)], 80 | [(12, 2)], 81 | [(6, 2), (9, 1), (10, 1), (11, 1), (12, 1)], 82 | [(7, 2), (9, 1), (10, 1), (11, 1), (12, 1)], 83 | [(8, 2), (9, 1), (10, 1), (11, 1), (12, 1)], 84 | [(9, 1), (9, 3), (10, 1), (11, 1), (12, 1)], 85 | [(9, 1), (10, 1), (10, 3), (11, 1), (12, 1)], 86 | [(9, 1), (10, 1), (11, 1), (11, 3), (12, 1)], 87 | [(9, 1), (10, 1), (11, 1), (12, 1), (12, 3)], 88 | [(6, 3)], 89 | [(7, 3)]] 90 | 91 | ACTION = \ 92 | [{'(': ('SHIFT', 7), 93 | 'False': ('SHIFT', 5), 94 | 'NEG': ('SHIFT', 9), 95 | 'True': ('SHIFT', 4), 96 | 'W': ('SHIFT', 6), 97 | '[': ('SHIFT', 8)}, 98 | {'CON': ('SHIFT', 10), 99 | 'DIS': ('SHIFT', 11), 100 | 'END': ('ACCEPT', 0), 101 | 'IFF': ('SHIFT', 13), 102 | 'IMP': ('SHIFT', 12)}, 103 | {')': ('REDUCE', 1), 104 | 'CON': ('REDUCE', 1), 105 | 'DIS': ('REDUCE', 1), 106 | 'END': ('REDUCE', 1), 107 | 'IFF': ('REDUCE', 1), 108 | 'IMP': ('REDUCE', 1), 109 | ']': 
('REDUCE', 1)}, 110 | {')': ('REDUCE', 2), 111 | 'CON': ('REDUCE', 2), 112 | 'DIS': ('REDUCE', 2), 113 | 'END': ('REDUCE', 2), 114 | 'IFF': ('REDUCE', 2), 115 | 'IMP': ('REDUCE', 2), 116 | ']': ('REDUCE', 2)}, 117 | {')': ('REDUCE', 3), 118 | 'CON': ('REDUCE', 3), 119 | 'DIS': ('REDUCE', 3), 120 | 'END': ('REDUCE', 3), 121 | 'IFF': ('REDUCE', 3), 122 | 'IMP': ('REDUCE', 3), 123 | ']': ('REDUCE', 3)}, 124 | {')': ('REDUCE', 4), 125 | 'CON': ('REDUCE', 4), 126 | 'DIS': ('REDUCE', 4), 127 | 'END': ('REDUCE', 4), 128 | 'IFF': ('REDUCE', 4), 129 | 'IMP': ('REDUCE', 4), 130 | ']': ('REDUCE', 4)}, 131 | {')': ('REDUCE', 5), 132 | 'CON': ('REDUCE', 5), 133 | 'DIS': ('REDUCE', 5), 134 | 'END': ('REDUCE', 5), 135 | 'IFF': ('REDUCE', 5), 136 | 'IMP': ('REDUCE', 5), 137 | ']': ('REDUCE', 5)}, 138 | {'(': ('SHIFT', 7), 139 | 'False': ('SHIFT', 5), 140 | 'NEG': ('SHIFT', 9), 141 | 'True': ('SHIFT', 4), 142 | 'W': ('SHIFT', 6), 143 | '[': ('SHIFT', 8)}, 144 | {'(': ('SHIFT', 7), 145 | 'False': ('SHIFT', 5), 146 | 'NEG': ('SHIFT', 9), 147 | 'True': ('SHIFT', 4), 148 | 'W': ('SHIFT', 6), 149 | '[': ('SHIFT', 8)}, 150 | {'(': ('SHIFT', 7), 151 | 'False': ('SHIFT', 5), 152 | 'NEG': ('SHIFT', 9), 153 | 'True': ('SHIFT', 4), 154 | 'W': ('SHIFT', 6), 155 | '[': ('SHIFT', 8)}, 156 | {'(': ('SHIFT', 7), 157 | 'False': ('SHIFT', 5), 158 | 'NEG': ('SHIFT', 9), 159 | 'True': ('SHIFT', 4), 160 | 'W': ('SHIFT', 6), 161 | '[': ('SHIFT', 8)}, 162 | {'(': ('SHIFT', 7), 163 | 'False': ('SHIFT', 5), 164 | 'NEG': ('SHIFT', 9), 165 | 'True': ('SHIFT', 4), 166 | 'W': ('SHIFT', 6), 167 | '[': ('SHIFT', 8)}, 168 | {'(': ('SHIFT', 7), 169 | 'False': ('SHIFT', 5), 170 | 'NEG': ('SHIFT', 9), 171 | 'True': ('SHIFT', 4), 172 | 'W': ('SHIFT', 6), 173 | '[': ('SHIFT', 8)}, 174 | {'(': ('SHIFT', 7), 175 | 'False': ('SHIFT', 5), 176 | 'NEG': ('SHIFT', 9), 177 | 'True': ('SHIFT', 4), 178 | 'W': ('SHIFT', 6), 179 | '[': ('SHIFT', 8)}, 180 | {')': ('SHIFT', 21), 181 | 'CON': ('SHIFT', 10), 182 | 'DIS': ('SHIFT', 11), 183 | 'IFF': ('SHIFT', 13), 184 | 'IMP': ('SHIFT', 12)}, 185 | {'CON': ('SHIFT', 10), 186 | 'DIS': ('SHIFT', 11), 187 | 'IFF': ('SHIFT', 13), 188 | 'IMP': ('SHIFT', 12), 189 | ']': ('SHIFT', 22)}, 190 | {')': ('REDUCE', 8), 191 | 'CON': ('REDUCE', 8), 192 | 'DIS': ('REDUCE', 8), 193 | 'END': ('REDUCE', 8), 194 | 'IFF': ('REDUCE', 8), 195 | 'IMP': ('REDUCE', 8), 196 | ']': ('REDUCE', 8)}, 197 | {')': ('REDUCE', 9), 198 | 'CON': ('REDUCE', 9), 199 | 'DIS': ('REDUCE', 9), 200 | 'END': ('REDUCE', 9), 201 | 'IFF': ('REDUCE', 9), 202 | 'IMP': ('REDUCE', 9), 203 | ']': ('REDUCE', 9)}, 204 | {')': ('REDUCE', 10), 205 | 'CON': ('SHIFT', 10), 206 | 'DIS': ('REDUCE', 10), 207 | 'END': ('REDUCE', 10), 208 | 'IFF': ('REDUCE', 10), 209 | 'IMP': ('REDUCE', 10), 210 | ']': ('REDUCE', 10)}, 211 | {')': ('REDUCE', 11), 212 | 'CON': ('SHIFT', 10), 213 | 'DIS': ('SHIFT', 11), 214 | 'END': ('REDUCE', 11), 215 | 'IFF': ('REDUCE', 11), 216 | 'IMP': ('REDUCE', 11), 217 | ']': ('REDUCE', 11)}, 218 | {')': ('REDUCE', 12), 219 | 'CON': ('SHIFT', 10), 220 | 'DIS': ('SHIFT', 11), 221 | 'END': ('REDUCE', 12), 222 | 'IFF': ('REDUCE', 12), 223 | 'IMP': ('SHIFT', 12), 224 | ']': ('REDUCE', 12)}, 225 | {')': ('REDUCE', 6), 226 | 'CON': ('REDUCE', 6), 227 | 'DIS': ('REDUCE', 6), 228 | 'END': ('REDUCE', 6), 229 | 'IFF': ('REDUCE', 6), 230 | 'IMP': ('REDUCE', 6), 231 | ']': ('REDUCE', 6)}, 232 | {')': ('REDUCE', 7), 233 | 'CON': ('REDUCE', 7), 234 | 'DIS': ('REDUCE', 7), 235 | 'END': ('REDUCE', 7), 236 | 'IFF': ('REDUCE', 7), 237 | 'IMP': ('REDUCE', 7), 238 | 
']': ('REDUCE', 7)}] 239 | 240 | GOTO = \ 241 | [{'(': 7, 242 | 'Atomic': 2, 243 | 'Complex': 3, 244 | 'False': 5, 245 | 'NEG': 9, 246 | 'Sentence': 1, 247 | 'True': 4, 248 | 'W': 6, 249 | '[': 8}, 250 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 251 | {}, 252 | {}, 253 | {}, 254 | {}, 255 | {}, 256 | {'(': 7, 257 | 'Atomic': 2, 258 | 'Complex': 3, 259 | 'False': 5, 260 | 'NEG': 9, 261 | 'Sentence': 14, 262 | 'True': 4, 263 | 'W': 6, 264 | '[': 8}, 265 | {'(': 7, 266 | 'Atomic': 2, 267 | 'Complex': 3, 268 | 'False': 5, 269 | 'NEG': 9, 270 | 'Sentence': 15, 271 | 'True': 4, 272 | 'W': 6, 273 | '[': 8}, 274 | {'(': 7, 275 | 'Atomic': 2, 276 | 'Complex': 3, 277 | 'False': 5, 278 | 'NEG': 9, 279 | 'Sentence': 16, 280 | 'True': 4, 281 | 'W': 6, 282 | '[': 8}, 283 | {'(': 7, 284 | 'Atomic': 2, 285 | 'Complex': 3, 286 | 'False': 5, 287 | 'NEG': 9, 288 | 'Sentence': 17, 289 | 'True': 4, 290 | 'W': 6, 291 | '[': 8}, 292 | {'(': 7, 293 | 'Atomic': 2, 294 | 'Complex': 3, 295 | 'False': 5, 296 | 'NEG': 9, 297 | 'Sentence': 18, 298 | 'True': 4, 299 | 'W': 6, 300 | '[': 8}, 301 | {'(': 7, 302 | 'Atomic': 2, 303 | 'Complex': 3, 304 | 'False': 5, 305 | 'NEG': 9, 306 | 'Sentence': 19, 307 | 'True': 4, 308 | 'W': 6, 309 | '[': 8}, 310 | {'(': 7, 311 | 'Atomic': 2, 312 | 'Complex': 3, 313 | 'False': 5, 314 | 'NEG': 9, 315 | 'Sentence': 20, 316 | 'True': 4, 317 | 'W': 6, 318 | '[': 8}, 319 | {')': 21, 'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 320 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12, ']': 22}, 321 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 322 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 323 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 324 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 325 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 326 | {}, 327 | {}] 328 | 329 | ## Parser$END 330 | -------------------------------------------------------------------------------- /experiments/meta_dumps_standalone.py: -------------------------------------------------------------------------------- 1 | ## This file is generated. Do not modify. 
2 | 3 | ## Lexer$BEGIN 4 | 5 | lex2pats = \ 6 | [('NEG', '!'), 7 | ('CON', '&'), 8 | ('DIS', '\\|'), 9 | ('IMP', '->'), 10 | ('IFF', '<=>'), 11 | ('W', '[A-Z]\\w*'), 12 | ('True', None), 13 | ('False', None), 14 | ('(', None), 15 | (')', None), 16 | ('[', None), 17 | (']', None), 18 | ('IGNORED', '[ \\t\\n]'), 19 | ('ERROR', '.')] 20 | 21 | lex_handler_sources = \ 22 | {} 23 | 24 | ## Lexer$END 25 | 26 | 27 | ## Parser$BEGIN 28 | 29 | precedence = \ 30 | {'CON': 4, 'DIS': 3, 'IFF': 1, 'IMP': 2, 'NEG': 5} 31 | 32 | rules = \ 33 | [('Sentence^', ('Sentence',)), 34 | ('Sentence', ['Atomic']), 35 | ('Sentence', ['Complex']), 36 | ('Atomic', ['True']), 37 | ('Atomic', ['False']), 38 | ('Atomic', ['W']), 39 | ('Complex', ['(', 'Sentence', ')']), 40 | ('Complex', ['[', 'Sentence', ']']), 41 | ('Complex', ['NEG', 'Sentence']), 42 | ('Complex', ['Sentence', 'CON', 'Sentence']), 43 | ('Complex', ['Sentence', 'DIS', 'Sentence']), 44 | ('Complex', ['Sentence', 'IMP', 'Sentence']), 45 | ('Complex', ['Sentence', 'IFF', 'Sentence'])] 46 | 47 | seman_sources = \ 48 | [b'\xe3\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00' 49 | b'\x00C\x00\x00\x00s\x04\x00\x00\x00|\x00\x00S)\x01N\xa9\x00)\x01\xda\x01x' 50 | b'r\x01\x00\x00\x00r\x01\x00\x00\x00\xfa8c:\\users\\shellay\\documents\\githu' 51 | b'b\\metaparse\\metaparse.py\xda\x07id_func\x90\x00\x00\x00s\x02\x00' 52 | b'\x00\x00\x00\x01', 53 | None, 54 | None, 55 | None, 56 | None, 57 | None, 58 | None, 59 | None, 60 | None, 61 | None, 62 | None, 63 | None, 64 | None] 65 | 66 | Ks = \ 67 | [[(0, 0)], 68 | [(0, 1), (9, 1), (10, 1), (11, 1), (12, 1)], 69 | [(1, 1)], 70 | [(2, 1)], 71 | [(3, 1)], 72 | [(4, 1)], 73 | [(5, 1)], 74 | [(6, 1)], 75 | [(7, 1)], 76 | [(8, 1)], 77 | [(9, 2)], 78 | [(10, 2)], 79 | [(11, 2)], 80 | [(12, 2)], 81 | [(6, 2), (9, 1), (10, 1), (11, 1), (12, 1)], 82 | [(7, 2), (9, 1), (10, 1), (11, 1), (12, 1)], 83 | [(8, 2), (9, 1), (10, 1), (11, 1), (12, 1)], 84 | [(9, 1), (9, 3), (10, 1), (11, 1), (12, 1)], 85 | [(9, 1), (10, 1), (10, 3), (11, 1), (12, 1)], 86 | [(9, 1), (10, 1), (11, 1), (11, 3), (12, 1)], 87 | [(9, 1), (10, 1), (11, 1), (12, 1), (12, 3)], 88 | [(6, 3)], 89 | [(7, 3)]] 90 | 91 | ACTION = \ 92 | [{'(': ('SHIFT', 7), 93 | 'False': ('SHIFT', 5), 94 | 'NEG': ('SHIFT', 9), 95 | 'True': ('SHIFT', 4), 96 | 'W': ('SHIFT', 6), 97 | '[': ('SHIFT', 8)}, 98 | {'CON': ('SHIFT', 10), 99 | 'DIS': ('SHIFT', 11), 100 | 'END': ('ACCEPT', 0), 101 | 'IFF': ('SHIFT', 13), 102 | 'IMP': ('SHIFT', 12)}, 103 | {')': ('REDUCE', 1), 104 | 'CON': ('REDUCE', 1), 105 | 'DIS': ('REDUCE', 1), 106 | 'END': ('REDUCE', 1), 107 | 'IFF': ('REDUCE', 1), 108 | 'IMP': ('REDUCE', 1), 109 | ']': ('REDUCE', 1)}, 110 | {')': ('REDUCE', 2), 111 | 'CON': ('REDUCE', 2), 112 | 'DIS': ('REDUCE', 2), 113 | 'END': ('REDUCE', 2), 114 | 'IFF': ('REDUCE', 2), 115 | 'IMP': ('REDUCE', 2), 116 | ']': ('REDUCE', 2)}, 117 | {')': ('REDUCE', 3), 118 | 'CON': ('REDUCE', 3), 119 | 'DIS': ('REDUCE', 3), 120 | 'END': ('REDUCE', 3), 121 | 'IFF': ('REDUCE', 3), 122 | 'IMP': ('REDUCE', 3), 123 | ']': ('REDUCE', 3)}, 124 | {')': ('REDUCE', 4), 125 | 'CON': ('REDUCE', 4), 126 | 'DIS': ('REDUCE', 4), 127 | 'END': ('REDUCE', 4), 128 | 'IFF': ('REDUCE', 4), 129 | 'IMP': ('REDUCE', 4), 130 | ']': ('REDUCE', 4)}, 131 | {')': ('REDUCE', 5), 132 | 'CON': ('REDUCE', 5), 133 | 'DIS': ('REDUCE', 5), 134 | 'END': ('REDUCE', 5), 135 | 'IFF': ('REDUCE', 5), 136 | 'IMP': ('REDUCE', 5), 137 | ']': ('REDUCE', 5)}, 138 | {'(': ('SHIFT', 7), 139 | 'False': ('SHIFT', 5), 140 | 'NEG': ('SHIFT', 9), 
141 | 'True': ('SHIFT', 4), 142 | 'W': ('SHIFT', 6), 143 | '[': ('SHIFT', 8)}, 144 | {'(': ('SHIFT', 7), 145 | 'False': ('SHIFT', 5), 146 | 'NEG': ('SHIFT', 9), 147 | 'True': ('SHIFT', 4), 148 | 'W': ('SHIFT', 6), 149 | '[': ('SHIFT', 8)}, 150 | {'(': ('SHIFT', 7), 151 | 'False': ('SHIFT', 5), 152 | 'NEG': ('SHIFT', 9), 153 | 'True': ('SHIFT', 4), 154 | 'W': ('SHIFT', 6), 155 | '[': ('SHIFT', 8)}, 156 | {'(': ('SHIFT', 7), 157 | 'False': ('SHIFT', 5), 158 | 'NEG': ('SHIFT', 9), 159 | 'True': ('SHIFT', 4), 160 | 'W': ('SHIFT', 6), 161 | '[': ('SHIFT', 8)}, 162 | {'(': ('SHIFT', 7), 163 | 'False': ('SHIFT', 5), 164 | 'NEG': ('SHIFT', 9), 165 | 'True': ('SHIFT', 4), 166 | 'W': ('SHIFT', 6), 167 | '[': ('SHIFT', 8)}, 168 | {'(': ('SHIFT', 7), 169 | 'False': ('SHIFT', 5), 170 | 'NEG': ('SHIFT', 9), 171 | 'True': ('SHIFT', 4), 172 | 'W': ('SHIFT', 6), 173 | '[': ('SHIFT', 8)}, 174 | {'(': ('SHIFT', 7), 175 | 'False': ('SHIFT', 5), 176 | 'NEG': ('SHIFT', 9), 177 | 'True': ('SHIFT', 4), 178 | 'W': ('SHIFT', 6), 179 | '[': ('SHIFT', 8)}, 180 | {')': ('SHIFT', 21), 181 | 'CON': ('SHIFT', 10), 182 | 'DIS': ('SHIFT', 11), 183 | 'IFF': ('SHIFT', 13), 184 | 'IMP': ('SHIFT', 12)}, 185 | {'CON': ('SHIFT', 10), 186 | 'DIS': ('SHIFT', 11), 187 | 'IFF': ('SHIFT', 13), 188 | 'IMP': ('SHIFT', 12), 189 | ']': ('SHIFT', 22)}, 190 | {')': ('REDUCE', 8), 191 | 'CON': ('REDUCE', 8), 192 | 'DIS': ('REDUCE', 8), 193 | 'END': ('REDUCE', 8), 194 | 'IFF': ('REDUCE', 8), 195 | 'IMP': ('REDUCE', 8), 196 | ']': ('REDUCE', 8)}, 197 | {')': ('REDUCE', 9), 198 | 'CON': ('REDUCE', 9), 199 | 'DIS': ('REDUCE', 9), 200 | 'END': ('REDUCE', 9), 201 | 'IFF': ('REDUCE', 9), 202 | 'IMP': ('REDUCE', 9), 203 | ']': ('REDUCE', 9)}, 204 | {')': ('REDUCE', 10), 205 | 'CON': ('SHIFT', 10), 206 | 'DIS': ('REDUCE', 10), 207 | 'END': ('REDUCE', 10), 208 | 'IFF': ('REDUCE', 10), 209 | 'IMP': ('REDUCE', 10), 210 | ']': ('REDUCE', 10)}, 211 | {')': ('REDUCE', 11), 212 | 'CON': ('SHIFT', 10), 213 | 'DIS': ('SHIFT', 11), 214 | 'END': ('REDUCE', 11), 215 | 'IFF': ('REDUCE', 11), 216 | 'IMP': ('REDUCE', 11), 217 | ']': ('REDUCE', 11)}, 218 | {')': ('REDUCE', 12), 219 | 'CON': ('SHIFT', 10), 220 | 'DIS': ('SHIFT', 11), 221 | 'END': ('REDUCE', 12), 222 | 'IFF': ('REDUCE', 12), 223 | 'IMP': ('SHIFT', 12), 224 | ']': ('REDUCE', 12)}, 225 | {')': ('REDUCE', 6), 226 | 'CON': ('REDUCE', 6), 227 | 'DIS': ('REDUCE', 6), 228 | 'END': ('REDUCE', 6), 229 | 'IFF': ('REDUCE', 6), 230 | 'IMP': ('REDUCE', 6), 231 | ']': ('REDUCE', 6)}, 232 | {')': ('REDUCE', 7), 233 | 'CON': ('REDUCE', 7), 234 | 'DIS': ('REDUCE', 7), 235 | 'END': ('REDUCE', 7), 236 | 'IFF': ('REDUCE', 7), 237 | 'IMP': ('REDUCE', 7), 238 | ']': ('REDUCE', 7)}] 239 | 240 | GOTO = \ 241 | [{'(': 7, 242 | 'Atomic': 2, 243 | 'Complex': 3, 244 | 'False': 5, 245 | 'NEG': 9, 246 | 'Sentence': 1, 247 | 'True': 4, 248 | 'W': 6, 249 | '[': 8}, 250 | {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12}, 251 | {}, 252 | {}, 253 | {}, 254 | {}, 255 | {}, 256 | {'(': 7, 257 | 'Atomic': 2, 258 | 'Complex': 3, 259 | 'False': 5, 260 | 'NEG': 9, 261 | 'Sentence': 14, 262 | 'True': 4, 263 | 'W': 6, 264 | '[': 8}, 265 | {'(': 7, 266 | 'Atomic': 2, 267 | 'Complex': 3, 268 | 'False': 5, 269 | 'NEG': 9, 270 | 'Sentence': 15, 271 | 'True': 4, 272 | 'W': 6, 273 | '[': 8}, 274 | {'(': 7, 275 | 'Atomic': 2, 276 | 'Complex': 3, 277 | 'False': 5, 278 | 'NEG': 9, 279 | 'Sentence': 16, 280 | 'True': 4, 281 | 'W': 6, 282 | '[': 8}, 283 | {'(': 7, 284 | 'Atomic': 2, 285 | 'Complex': 3, 286 | 'False': 5, 287 | 'NEG': 9, 288 | 'Sentence': 17, 
289 |   'True': 4,
290 |   'W': 6,
291 |   '[': 8},
292 |  {'(': 7,
293 |   'Atomic': 2,
294 |   'Complex': 3,
295 |   'False': 5,
296 |   'NEG': 9,
297 |   'Sentence': 18,
298 |   'True': 4,
299 |   'W': 6,
300 |   '[': 8},
301 |  {'(': 7,
302 |   'Atomic': 2,
303 |   'Complex': 3,
304 |   'False': 5,
305 |   'NEG': 9,
306 |   'Sentence': 19,
307 |   'True': 4,
308 |   'W': 6,
309 |   '[': 8},
310 |  {'(': 7,
311 |   'Atomic': 2,
312 |   'Complex': 3,
313 |   'False': 5,
314 |   'NEG': 9,
315 |   'Sentence': 20,
316 |   'True': 4,
317 |   'W': 6,
318 |   '[': 8},
319 |  {')': 21, 'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12},
320 |  {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12, ']': 22},
321 |  {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12},
322 |  {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12},
323 |  {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12},
324 |  {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12},
325 |  {'CON': 10, 'DIS': 11, 'IFF': 13, 'IMP': 12},
326 |  {},
327 |  {}]
328 | 
329 | ## Parser$END
330 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | metaparse
2 | =====
3 | 
4 | This is a tool that lets you handle instant parsing and language-design tasks
5 | while enjoying the elegance of pure Python[1].
6 | With this tool, creating a Python class is sufficient
7 | to define a language, which includes
8 | 
9 | * lexical patterns
10 | * syntactic rules
11 | * semantic actions (i.e. interpretation/translation)
12 | 
13 | On top of this class, a parser/interpreter is automatically generated.
14 | You can then use it to parse strings directly by calling its `parse`
15 | or `interpret` method.
16 | 
17 | [1]. This module is motivated by [instaparse][] in [Clojure][], but goes another way, more like [PLY][].
18 | 
19 | 20 | # Table of Contents 21 | 1. [Quick Example](#quick-example) 22 | 1. [Design and Usage](#design-and-usage) 23 | 1. [Generalized LALR Parsing](#generalized-lalr-and-dealing-with-ambiguity) 24 | 1. [API](#api) 25 | 26 | 27 | # Quick Example 28 | 29 | In `metaparse`, language syntax and semantics can be simply defined 30 | as methods of a class. To illustrate this, we create a tiny 31 | calculator grammar which can read basic arithmetic expressions and 32 | register variable bindings in a global dictionary. 33 | 34 | At first, we conceptually design the grammar on a paper, as seen from the 35 | textbooks, 36 | 37 | ``` 38 | assign → ID = expr 39 | expr → NUM 40 | expr → ID 41 | expr → expr₁ + expr₂ 42 | expr → expr₁ * expr₂ 43 | expr → expr₁ ** expr₂ 44 | ``` 45 | 46 | then we map them to method declarations in Python: 47 | ``` python 48 | def assign(ID, EQ, expr): ... 49 | def expr(NUM): ... 50 | def expr(ID): ... 51 | def expr(expr_1, ADD, expr_2): ... 52 | def expr(expr_1, MUL, expr_2): ... 53 | def expr(expr_1, POW, expr_2): ... 54 | ``` 55 | 56 | and finally we write down the semantic rules as method bodies, 57 | in a [SDT][]-style (cf. [Yacc][]). The method parameters are bound 58 | to the parse result of the sub-tree when a rule is being executed 59 | (i.e. being reduced after its sub-rules or tokens have been 60 | successfully processed). 61 | 62 | ``` python 63 | from metaparse import LALR 64 | 65 | # Global context/environment for language semantics. 66 | context = {} 67 | 68 | class LangArith(metaclass=LALR.meta): 69 | 70 | "A language for calculating expressions." 71 | 72 | # ===== Lexical patterns / Terminals ===== 73 | # - Patterns are specified via regular expressions 74 | # - Patterns will be checked with the same order as declared during tokenizing 75 | 76 | IGNORED = r'\s+' # Special pattern to be ignored. 77 | 78 | EQ = r'=' 79 | POW = r'\*\*', 3 # Can include precedence of token using a number (for LALR conflict resolution) 80 | POW = r'\^' , 3 # Alternative patterns can share the same name 81 | MUL = r'\*' , 2 82 | ADD = r'\+' , 1 83 | 84 | ID = r'[_a-zA-Z]\w*' 85 | NUM = r'[1-9][0-9]*' 86 | def NUM(value): # Can specify translator for certain lexical patterns! 87 | return int(value) 88 | 89 | # ===== Syntactic/Semantic rules in SDT-style ===== 90 | 91 | def assign(ID, EQ, expr): # Can access global context in Python environment. 92 | context[ID] = expr 93 | return expr 94 | 95 | def expr(NUM): # Normally computing result without side-effects would be better. 96 | return NUM # NUM is passed as (int) since there is a NUM handler! 97 | 98 | def expr(ID): 99 | return context[ID] 100 | 101 | def expr(expr_1, ADD, expr_2): # TeX style subscripts used for identifying expression instances, like (expr → expr₁ + expr₂) 102 | return expr_1 + expr_2 103 | 104 | def expr(expr, MUL, expr_1): # Can ignore one of the subscripts. 105 | return expr * expr_1 106 | 107 | def expr(expr, POW, expr_1): 108 | return expr ** expr_1 109 | ``` 110 | 111 | Then we get a `LALR` parser object: 112 | 113 | ``` python 114 | >>> type(LangArith) 115 | 116 | ``` 117 | 118 | Now we are **done** and it's quite straightforward trying it out. 119 | 120 | ``` python 121 | >>> LangArith.interpret("x = 1 + 4 * 3 ** 2 + 5") 122 | 42 123 | >>> LangArith.interpret("y = 5 + x * 2") 124 | 89 125 | >>> LangArith.interpret("z = 9 ^ 2") 126 | 81 127 | 128 | >>> context 129 | {'y': 89, 'x': 42, 'z': 81} 130 | ``` 131 | 132 | IMO, tools under state-of-the-art could hardly get more handy than 133 | this. 
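
Since the generated parser is just a Python object, you can keep calling it on
further input. The following is a small usage sketch added for illustration
(the `program` string and the `context.clear()` call are not part of the
original examples); it interprets a multi-line program statement by statement
and accumulates the bindings in `context`:

``` python
# A usage sketch (assumption: LangArith and `context` are defined as above).
program = """
x = 1 + 2
y = x * 3
z = y ** 2
"""

context.clear()                      # start from an empty environment
for line in program.strip().splitlines():
    LangArith.interpret(line)        # each statement updates `context`

print(context)
# expected: {'x': 3, 'y': 9, 'z': 81}
```

No per-call setup is involved here: the LALR tables were built once when the
class was created, and `interpret` merely runs them over the new token stream.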
134 | 135 | Note `metaclass=LALR.meta` only works in Python 3. There is an 136 | [alternative](#verbose-style) way which works in Python 2. 137 | Directly using the APIs without all syntactic sugars is also possible. 138 | 139 | 140 | # Design and Usage 141 | 142 | The design of this module targets "**native** parsing" (like [instaparse][] and [Parsec][]). Highlights are 143 | 144 | * native structure representing grammar rules 145 | - like `def E(E, plus, T)`, `def T(F)` ... 146 | - rather than **literal string notations** like `"E = E + T"`, `"T = F"` ... 147 | * language translation implemented in *pure* Python, 148 | * easy to play with (e.g. in REPL), 149 | * no need to generate a program before use 150 | * but you can generate one and save it for future use (via dump/load APIs) 151 | * does not feel too much like a DSL (maybe?), 152 | * no dependencies, 153 | * optional precedence specification (for LALR), 154 | * nice error reporting, 155 | * and etc. 156 | 157 | 158 | 160 | 161 | Though this slim module does not intend to replace full-fledged tools 162 | like [Bison][] and [ANTLR][], it is still very handy and its 163 | integration into existing Python project is seamless. 164 | 165 | The following sections explains more details about the core utilities 166 | . Feel free to skip them since you already see from above how it is 167 | used. 168 | 169 | 170 | ## Retrieving the Parse Tree 171 | 172 | Continuing the first example, if only the parse tree is needed rather 173 | than the translation result, use method `parse` instead of 174 | `interpret`: 175 | 176 | ``` python 177 | tr = LangArith.parse(" w = 1 + 2 * 3 ** 4 + 5 ") 178 | 179 | >>> tr 180 | ('assign', 181 | [('ID', 'w'), 182 | ('EQ', '='), 183 | ('expr', 184 | [('expr', 185 | [('expr', [('NUM', '1')]), 186 | ('ADD', '+'), 187 | ('expr', 188 | [('expr', [('NUM', '2')]), 189 | ('MUL', '*'), 190 | ('expr', 191 | [('expr', [('NUM', '3')]), 192 | ('POW', '**'), 193 | ('expr', [('NUM', '4')])])])]), 194 | ('ADD', '+'), 195 | ('expr', [('NUM', '5')])])]) 196 | ``` 197 | 198 | The result is a `ParseTree` object with tuple representation. A parse 199 | leaf is just a `Token` object represented as ```(, 200 | '')```. 201 | 202 | 203 | ## Save generated parser object 204 | 205 | It can be time consuming when `metaparse` converts your language into 206 | a parser/interpreter, depending on the size of the language. You might 207 | not want to re-generate the parser each time you starts a Python 208 | process. So `metaparse` allows you to serialize your parser (which is 209 | no much more than a dictionary encoding the state machine under the 210 | hood). The API is `dumps/loads` or `dump/load`. 211 | 212 | ``` python 213 | LangArith.dumps('./eg_demo_dump.py') 214 | ``` 215 | 216 | Since our parser is created given access to a global variable named 217 | `context`, which makes `globals` and `context` dependencies of your 218 | translation scheme, you have to pass it to `load` when loading the 219 | parser and define the `context` object in the global scope to allow 220 | your translation to be still functional (for sure, a better way is to 221 | define your context object dedicatedly instead of using `globals`): 222 | 223 | ``` python 224 | # Another file using the parser 225 | 226 | from metaparse import LALR 227 | 228 | # Let loaded parser be able to access current runtime env `globals()`. 
229 | arith_parser = LALR.load('./eg_demo_dump.py', globals()) 230 | 231 | # Context instance to be accessed by the loaded parser 232 | context = {} 233 | 234 | arith_parser.interpret('foo = 1 + 9') 235 | 236 | print (context) 237 | # {'foo': 10} 238 | ``` 239 | 240 | You might wonder why passing `globals` can work - It's due to that in 241 | Python the `__code__` object can be evaluated given whatever context 242 | and that's what `metaparse` does internally. (more basic details see 243 | the documents for `exec` and `code` object). 244 | 245 | 246 | ## Error Reporting 247 | 248 | During designing a language, it's very easy to make inconsistent 249 | rules. `metaparse` provides sensible error reporting for such cases - 250 | for example, executing the following 251 | 252 | ``` python 253 | from metaparse import LALR 254 | 255 | class ExprLang(metaclass=LALR.meta): 256 | 257 | NUM = '\d+' 258 | PLUS = '\+' 259 | 260 | def expr(expr, PLUS, term): 261 | return expr + term 262 | 263 | def expr(expr, TIMES, term): 264 | return expr * term 265 | 266 | def expr(term): 267 | return term 268 | 269 | def term(NUM): 270 | return int(NUM) 271 | 272 | def factor(NUM): 273 | return int(NUM) 274 | ``` 275 | 276 | would result in error report: 277 | 278 | ``` python-traceback 279 | metaparse.LanguageError: No lexical pattern provided for terminal symbol: TIMES 280 | - in 2th rule (expr = expr TIMES term) 281 | - with helping traceback (if available): 282 | File "test_make_error.py", line 21, in expr 283 | 284 | - declared lexes: Lexer{ 285 | [('NUM', re.compile('\\d+')), 286 | ('PLUS', re.compile('\\+')), 287 | ('IGNORED', re.compile('\\s+'))]} 288 | ``` 289 | 290 | After providing the missing terminal symbol `TIMES`, another error is 291 | detected during re-run: 292 | 293 | ``` python-traceback 294 | metaparse.LanguageError: There are unreachable nonterminal at 5th rule: {'factor'}. 295 | - with helping traceback: 296 | File "test_make_error.py", line 30, in factor 297 | ``` 298 | 299 | The error information is formulated within Python *traceback* and 300 | should be precise enough and guide you or editors to the exact place 301 | where correction is needed. 302 | 303 | 304 | # Generalized LALR and Dealing with Ambiguity 305 | 306 | `metaparse` supplies an interesting extension: the `GLR` parser with 307 | look-ahead, which can parse ambiguous grammars and help you figure out 308 | why a grammar is ambiguous and fails to be LALR(1). 309 | 310 | Given the famous ambiguous [Dangling-Else][] grammar: 311 | 312 | ``` 313 | selection-statement = ... 
314 |     | IF expression THEN statement
315 |     | IF expression THEN statement ELSE statement
316 | ```
317 | 
318 | let's build it
319 | using `LALR`:
320 | 
321 | ``` python
322 | from metaparse import GLR, LALR
323 | 
324 | class LangIfThenElse(metaclass=LALR.meta):
325 | 
326 |     IF = r'if'
327 |     THEN = r'then'
328 |     ELSE = r'else'
329 |     EXPR = r'\d+'
330 |     SINGLE = r'[_a-zA-Z]+'
331 | 
332 |     def stmt(ifstmt):
333 |         return ifstmt
334 | 
335 |     def stmt(SINGLE):
336 |         return SINGLE
337 | 
338 |     def ifstmt(IF, EXPR, THEN, stmt_1, ELSE, stmt_2):
339 |         return ('ite', EXPR, stmt_1, stmt_2)
340 | 
341 |     def ifstmt(IF, EXPR, THEN, stmt):
342 |         return ('it', EXPR, stmt)
343 | ```
344 | 
345 | would result in a *shift/reduce* conflict on the token `ELSE` with error hints:
346 | 
347 | ``` python-traceback
348 | Handling item set: 
349 | ['(ifstmt = IF EXPR THEN stmt.ELSE stmt)', '(ifstmt = IF EXPR THEN stmt.)']
350 | Conflict on lookahead: ELSE 
351 | - ('reduce', (ifstmt = IF EXPR THEN stmt))
352 | - ('shift', ['(ifstmt = IF EXPR THEN stmt ELSE.stmt)'])
353 | ```
354 | 
355 | Using `GLR.meta` instead of `LALR.meta`, and calling `interpret_generalized` instead of `interpret`:
356 | 
357 | ``` python
358 | >>> LangIfThenElse.interpret_generalized('if 1 then if 2 then if 3 then a else b else c')
359 | [('ite', '1', ('ite', '2', ('it', '3', 'a'), 'b'), 'c'),
360 |  ('ite', '1', ('it', '2', ('ite', '3', 'a', 'b')), 'c'),
361 |  ('it', '1', ('ite', '2', ('ite', '3', 'a', 'b'), 'c'))]
362 | ```
363 | 
364 | the parser delivers all alternative parse results which cannot be
365 | handled properly by LALR(1). From these results you can gather more
366 | insight into why the grammar is ambiguous.
367 | 
368 | Note that interpreting an ambiguous grammar is error-prone if
369 | side effects are involved, since the translator function of each
370 | alternative result is executed and it is hard to foresee how these
371 | executions may interfere with each other. **(It is generally advised to use
372 | side-effect-free translation when using GLR parsers!)**
373 | 
374 | 
375 | ## Using Token Precedence to Resolve Conflicts
376 | 
377 | Though GLR is powerful, in practice we usually do not want to keep
378 | ambiguity and eventually prefer `LALR` for the sake of clarity and
379 | performance. Very likely, ambiguity is not what you really want,
380 | and you can resolve it by specifying the precedence of
381 | certain tokens.
382 | 
383 | Taking the Dangling-Else example, we associate with `ELSE` a higher
384 | precedence than `THEN` (just like the operators in the arithmetic
385 | grammar example). When a `stmt` has been read between `THEN` and
386 | `ELSE`, i.e. when the conflicting rules meet an `ELSE` lookahead, the
387 | alternative containing `ELSE` has higher precedence and will be chosen:
388 | 
389 | ``` python
390 | class LangIfThenElse(metaclass=LALR.meta):
391 |     ...
392 |     THEN = r'then', 1
393 |     ELSE = r'else', 2
394 |     ...
395 | ```
396 | 
397 | With this conflict resolution, the LALR parser can be constructed
398 | successfully and parsing delivers
399 | 
400 | ```
401 | >>> LangIfThenElse.interpret('if 1 then if 2 then if 3 then a else b else c')
402 | ('it', '1', ('ite', '2', ('ite', '3', 'a', 'b'), 'c'))
403 | ```
404 | 
405 | However, in practice, precedence specifications can get highly
406 | complicated and the intended behavior becomes much less explicit. It is
407 | advisable to avoid precedence altogether whenever you can find a more
408 | explicit and straightforward alternative, such as the grammar rewrite sketched below.
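
As an illustration of such an alternative, here is a hedged sketch (not taken
from the original examples; the class name `LangIfThenElseExplicit` and the
nonterminals `matched`/`open` are invented for this sketch) of the classic
rewrite of the Dangling-Else grammar: `stmt` is split into *matched*
statements, in which every `IF ... THEN` already has its `ELSE`, and *open*
statements, which still lack one. This textbook refactoring is unambiguous,
so no precedence declarations should be needed.

``` python
from metaparse import LALR

class LangIfThenElseExplicit(metaclass=LALR.meta):
    """Dangling-Else resolved by the grammar itself (sketch)."""

    IF     = r'if'
    THEN   = r'then'
    ELSE   = r'else'
    EXPR   = r'\d+'
    SINGLE = r'[_a-zA-Z]+'

    # A statement is either fully matched or still open.
    def stmt(matched): return matched
    def stmt(open):    return open

    # Matched: every THEN is paired with an ELSE, or it is a plain statement.
    def matched(IF, EXPR, THEN, matched_1, ELSE, matched_2):
        return ('ite', EXPR, matched_1, matched_2)
    def matched(SINGLE):
        return SINGLE

    # Open: at least one THEN is still waiting for its ELSE.
    def open(IF, EXPR, THEN, stmt):
        return ('it', EXPR, stmt)
    def open(IF, EXPR, THEN, matched, ELSE, open):
        return ('ite', EXPR, matched, open)
```

Interpreting the same input as above with this grammar should bind each `ELSE`
to the nearest `THEN` and yield the same nesting as the precedence-resolved
parser, without any precedence annotations.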
409 | 410 | 411 | # API 412 | 413 | The following contents give more details about the underlying utilities. 414 | 415 | ## Explicitly Registering Lexical Patterns and Syntactic Rules 416 | 417 | The following APIs for defining the language in [the very first 418 | example](#quick-example) works for both Python 2 and Python 3, with 419 | the more verbose but more explicit style, heavily relying on using 420 | decorators. 421 | 422 | ``` python 423 | from metaparse import LALR 424 | 425 | LangArith = LALR() 426 | 427 | lex = LangArith.lexer 428 | rule = LangArith.rule 429 | 430 | # lex( = ) 431 | lex(IGNORED = r'\s+') 432 | lex(NUM = r'[0-9]+') 433 | lex(EQ = r'=') 434 | lex(ID = r'[_a-zA-Z]\w*') 435 | 436 | # lex(... , p = ) 437 | lex(POW = r'\*\*', p=3) 438 | lex(POW = r'\^') # No need to give the precedence twice for POW. 439 | lex(MUL = r'\*' , p=2) 440 | lex(ADD = r'\+' , p=1) 441 | 442 | # @rule 443 | # def ( ): 444 | # 445 | @rule 446 | def assign(ID, EQ, expr): 447 | context[ID] = expr 448 | return expr 449 | 450 | @rule 451 | def expr(ID): 452 | return context[ID] 453 | 454 | @rule 455 | def expr(NUM): 456 | return int(NUM) 457 | 458 | @rule 459 | def expr(expr_1, ADD, expr_2): 460 | return expr_1 + expr_2 461 | 462 | @rule 463 | def expr(expr, MUL, expr_1): 464 | return expr * expr_1 465 | 466 | @rule 467 | def expr(expr, POW, expr_1): 468 | return expr ** expr_1 469 | 470 | # Complete making the parser after collecting things! 471 | LangArith.make() 472 | ``` 473 | 474 | Explanation in short: 475 | 476 | * `lex` is the `Lexer` instance associated with `LangArith`, which is also 477 | able to collect definition of lexical patterns. 478 | 479 | * `rule` is a decorator which extracts syntactic rule information from 480 | the function signature and register the function itself as translator 481 | for this rule. 482 | 483 | ## The Underlying Lexical Analyzer 484 | 485 | After declaring the language like above, `metaparse` internally 486 | creates a lexical analyzer as a component used by the internal parser. 487 | Lexical analyzer maintains a list of terminal symbols of the language 488 | defined, preserving the order they appear in the code. 489 | 490 | ``` python 491 | >>> LangArith.lexer 492 | Lexer{ 493 | [('IGNORED', re.compile('\\s+')), 494 | ('EQ', re.compile('=')), 495 | ('NUM', re.compile('[1-9][0-9]*')), 496 | ('ID', re.compile('[_a-zA-Z]\\w*')), 497 | ('POW', re.compile('\\*\\*')), 498 | ('MUL', re.compile('\\*')), 499 | ('ADD', re.compile('\\+'))]} 500 | ``` 501 | 502 | It runs when method `tokenize` is called and generates tokens carrying 503 | attributes. During tokenizing, the patterns are checked respecting the 504 | order in the list. 505 | 506 | Note there is a pre-defined special lexical element `IGNORED`: 507 | 508 | * When `Lexer` reads a string matching the pattern associating 509 | `IGNORED`, no token is generated for the matching part of the 510 | string; 511 | 512 | * If `IGNORED` is not explicitly overriden in the user's language 513 | definition, it will have the default value `r'\s+'`. 514 | 515 | We can print out the tracing of lexcial analyzing process: 516 | 517 | ``` python 518 | >>> for token in LangArith.lexer.tokenize(" foo = 1 + bar * 2"): 519 | ... print(token.pos, 520 | ... token.end, 521 | ... token.symbol, 522 | ... repr(token.lexeme), # (lexeme) is something literal. 523 | ... repr(token.value)) # (value) is something computed by handler, if exists. 
524 | 525 | 1 4 ID 'foo' 'foo' 526 | 6 7 EQ '=' '=' 527 | 8 9 NUM '1' 1 528 | 10 11 ADD '+' '+' 529 | 12 15 ID 'bar' 'bar' 530 | 16 17 MUL '*' '*' 531 | 18 19 NUM '2' 2 532 | 533 | ``` 534 | 535 | Moreover, it is OK to declare more lexical patterns under the same 536 | name: 537 | 538 | ``` python 539 | class LangArith(metaclass=LALR.meta): 540 | ... 541 | IGNORED = r' ' 542 | IGNORED = r'\t' 543 | IGNORED = r'#' 544 | ... 545 | POW = r'\*\*' 546 | POW = r'\^' 547 | ... 548 | ``` 549 | 550 | which avoids clustering alternative sub-patterns in one `re` expression. 551 | 552 | In practical use, you might not need to call `Lexer` at all. 553 | 554 | 555 | ## Online-Parsing behind the Scene 556 | 557 | The `parse` and `interpret` methods are implemented internally based 558 | on generators, which is a sort of *online-processing* behavior, i.e. 559 | 560 | ``` 561 | —→ —→ 562 | ``` 563 | 564 | The following block of code calls the routine directly, starts it, and 565 | traces the intermediate states: 566 | 567 | ``` python 568 | # Prepare a parsing routine 569 | p = LangArith.prepare() 570 | 571 | # Start this routine 572 | next(p) 573 | 574 | # Send tokens one-by-one 575 | for token in LangArith.lexer.tokenize('bar = 1 + 2 + + 3', with_end=True): 576 | print("Sends: ", token) 577 | r = p.send(token) 578 | print("Got: ", r) 579 | print() 580 | ``` 581 | 582 | that is, via sending tokens to the parser one-by-one for 583 | interpretation, an internal interpretation stack is maintained and 584 | updated. The top element of the stack is returned wrapped in a `Just` 585 | structure as a response to each token (which can be a reduced result 586 | from a sequence of elements perfectly matching the rule). When token 587 | fails processing a `ParseError` containing useful information is 588 | returned (rather than thrown). 589 | 590 | ``` python-traceback 591 | Sends: ('ID', 'bar') 592 | Got: Just(result=('ID', 'bar')) 593 | 594 | Sends: ('EQ', '=') 595 | Got: Just(result=('EQ', '=')) 596 | 597 | Sends: ('NUM', '1') 598 | Got: Just(result=('NUM', '1')) 599 | 600 | Sends: ('ADD', '+') 601 | Got: Just(result=('ADD', '+')) 602 | 603 | Sends: ('NUM', '2') 604 | Got: Just(result=('NUM', '2')) 605 | 606 | Sends: ('ADD', '+') 607 | Got: Just(result=('ADD', '+')) 608 | 609 | Sends: ('ADD', '+') 610 | Got: Unexpected token ('ADD', '+') at (14:15) 611 | while expecting actions 612 | {'ID': ('shift', 5), 'NUM': ('shift', 6)} 613 | with state stack 614 | [['(assign^ = .assign)'], 615 | ['(assign = ID.EQ expr)'], 616 | ['(assign = ID EQ.expr)'], 617 | ['(assign = ID EQ expr.)', 618 | '(expr = expr.ADD expr)', 619 | '(expr = expr.MUL expr)', 620 | '(expr = expr.POW expr)'], 621 | ['(expr = expr ADD.expr)']] 622 | and subtree stack 623 | ['bar', '=', 3, '+'] 624 | 625 | 626 | Sends: ('NUM', '3') 627 | Got: Just(result=('NUM', '3')) 628 | 629 | Sends: ('\x03', None) 630 | Got: Just(result=6) 631 | ``` 632 | 633 | 634 | # Limitations 635 | 636 | Though this module provides advantageous features, there are also limitations: 637 | 638 | * Parsing grammars with **loops** is not supported. For example, the 639 | grammar 640 | 641 | ``` 642 | P → Q | a 643 | Q → P 644 | ``` 645 | 646 | is *infinitely ambiguous*, which has infinite number of derivations 647 | while processing only finite input, e.g. `"a"`: 648 | 649 | ``` 650 | P ⇒ a 651 | P ⇒ Q ⇒ P ⇒ a 652 | ... 653 | P ⇒ Q ⇒ ... ⇒ P ⇒ a 654 | ``` 655 | 656 | where each derivation corresponds to a parse tree. 
Eager generation
657 | of these trees leads to non-termination during parsing.
658 | 
659 | * Only **legal Python identifiers** can be used as grammar symbols;
660 |   non-alphabetic or reserved names (like `==`, `raise`, etc.) cannot
661 |   (which seems not a serious restriction).
662 | 
663 | * Parsing algorithms are implemented in pure Python, but a speed-up via
664 |   Cython should be possible in the future.
665 | 
666 | 
667 | [Parsing]: https://en.wikipedia.org/wiki/Parsing "Parsing"
668 | [Interpreting]: https://en.wikipedia.org/wiki/Interpreter_(computing) "Interpreter"
669 | [DSL]: https://en.wikipedia.org/wiki/Domain-specific_language "Domain-specific Language"
670 | [BNF]: https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_Form "Backus-Naur Form"
671 | [Earley]: https://en.wikipedia.org/wiki/Earley_parser "Earley"
672 | [LL]: https://en.wikipedia.org/wiki/LL_parser "Left-to-right, Leftmost-derivation"
673 | [GLL]: http://dotat.at/tmp/gll.pdf "General Left-to-right, Leftmost-derivation"
674 | [GLR]: https://en.wikipedia.org/wiki/GLR_parser "General Left-to-right, Rightmost derivation"
675 | [LALR]: https://en.wikipedia.org/wiki/LALR_parser "Look-Ahead Left-to-right, Rightmost-derivation"
676 | [CFG]: https://en.wikipedia.org/wiki/Context-free_grammar "Context-free Grammar"
677 | [Yacc]: https://en.wikipedia.org/wiki/Yacc "Yet Another Compiler Compiler"
678 | [Bison]: https://en.wikipedia.org/wiki/GNU_bison "Bison"
679 | [Parsec]: http://book.realworldhaskell.org/read/using-parsec.html "Parsec"
680 | [instaparse]: https://github.com/Engelberg/instaparse "Instaparse"
681 | [SDT]: https://en.wikipedia.org/wiki/Syntax-directed_translation "Syntax-directed Translation"
682 | [LF]: http://www.csd.uwo.ca/~moreno//CS447/Lectures/Syntax.html/node9.html "Left-factoring"
683 | [ANTLR]: http://www.antlr.org/ "ANother Tool for Language Recognition"
684 | [clojure]: https://clojure.org/ "Clojure"
685 | [PLY]: http://www.dabeaz.com/ply/ "PLY"
686 | [Dangling-Else]: https://en.wikipedia.org/wiki/Dangling_else "Dangling-Else"
687 | 
--------------------------------------------------------------------------------
/metaparse.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import re
4 | import pprint
5 | import warnings
6 | import marshal
7 | import types
8 | import traceback
9 | 
10 | from pprint import pformat
11 | 
12 | from collections import deque
13 | from collections import namedtuple
14 | from collections import OrderedDict as odict
15 | 
16 | 
17 | class Token(namedtuple('Token', 'pos symbol lexeme value')):
18 | 
19 |     @property
20 |     def end(self):
21 |         return self.pos + len(self.lexeme)
22 | 
23 |     def __repr__(self):
24 |         return "({}, {})".format(
25 |             repr(self.symbol), repr(self.value))
26 | 
27 | 
28 | class Rule(namedtuple('Rule', 'lhs rhs')):
29 | 
30 |     def __repr__(self):
31 |         return '({} = {})'.format(
32 |             self.lhs, ' '.join(self.rhs))
33 | 
34 |     @staticmethod
35 |     def from_func(func):
36 |         "Construct a rule object from a function's signature. "
37 |         lhs = func.__name__
38 |         rhs = []
39 |         ac = func.__code__.co_argcount
40 |         vs = func.__code__.co_varnames
41 |         for x in vs[:ac]:
42 |             # Cut trailing digital subscript like xxx_4.
43 |             s = re.search(r'_(\d+)$', x)
44 |             if s:
45 |                 x = x[:s.start()]
46 |             rhs.append(x)
47 |         # Use immutable.
48 | rhs = tuple(rhs) 49 | return Rule(lhs, rhs) 50 | 51 | 52 | class ParseTree(namedtuple('ParseTree', 'node subs')): 53 | 54 | def __repr__(self): 55 | return tuple.__repr__(self) 56 | 57 | @property 58 | def pos(self): 59 | return self.subs[0].pos 60 | 61 | @property 62 | def end(self): 63 | return self.subs[-1].pos 64 | 65 | 66 | def identity(x): 67 | return x 68 | 69 | 70 | # Special token to be delivered by the tokenizer. 71 | END_TOKEN = Token(-1, '\x03', None, None) 72 | 73 | 74 | class Lexer(object): 75 | 76 | class Error(Exception): 77 | pass 78 | 79 | def __init__(self, names=None, patterns=None, handlers=None): 80 | """The Lexer object bookkeeps 3 same-sized parallel lists: 81 | 82 | :names: 83 | 84 | The names of the patterns, which are supposed to be 85 | consistent with dependent `Grammar` object, where they 86 | are terminal symbols. 87 | 88 | :patterns: 89 | 90 | The patterns corresponding to the names with same 91 | indexing. Each pattern is a /compiled/ regular 92 | expression object. 93 | 94 | :handlers: 95 | 96 | The handlers corresponding to the named patterns, 97 | called when successfully tokenizing the named pattern. 98 | 99 | """ 100 | 101 | self.names = names if names else [] 102 | self.patterns = patterns if patterns else [] 103 | self.handlers = handlers if handlers else [] 104 | self.precedence = {} 105 | 106 | def __call__(self, **kw): 107 | """Supporting registering lexical pattern like: 108 | :: 109 | 110 | @my_lexer(INTEGER = r'[1-9][0-9]*') 111 | def handler(value): 112 | return int(value) 113 | 114 | """ 115 | assert ('p' in kw and len(kw) == 2) or len(kw) == 1 116 | prece = kw.pop('p') if 'p' in kw else None 117 | name, pattern = kw.popitem() 118 | if prece: 119 | self.precedence[name] = prece 120 | self.names.append(name) 121 | self.patterns.append(re.compile(pattern)) 122 | self.handlers.append(None) 123 | assert len(self.names) == len(self.patterns) == len(self.handlers) 124 | 125 | def z(func): 126 | 'Swap the last handler with the decorated function.' 127 | self.handlers[-1] = func 128 | 129 | return z 130 | 131 | def __repr__(self): 132 | return 'Lexer{{\n{}}}'.format( 133 | pformat(list(zip(self.names, self.patterns)))) 134 | 135 | def more(self, **kw): 136 | """Register more lexcial name-patterns with one call like:: 137 | 138 | my_lexer.more( 139 | ADD = r'\+', 140 | SUB = r'-', 141 | TIMES = r'\*', 142 | ... 143 | ) 144 | 145 | * Note: 146 | In this case the /order/ of these name-patterns are not preserved! 147 | """ 148 | for name, pat in kw.items(): 149 | self.names.append(name) 150 | self.patterns.append(re.compile(pat)) 151 | self.handlers.append(None) 152 | 153 | def register(self, name, pattern, handler=None, precedence=None): 154 | """Registers lexical pattern directly.""" 155 | self.names.append(name) 156 | self.patterns.append(re.compile(pattern)) 157 | self.handlers.append(handler) 158 | if precedence is not None: 159 | self.precedence[name] = precedence 160 | 161 | def tokenize(self, inp, with_end=False): 162 | """Prepares a generator object, which iteratively finds possible 163 | lexical patterns given input. 164 | 165 | :with_end: 166 | 167 | means delivering the END_TOKEN after reading over the input. 
168 | 169 | """ 170 | names = self.names 171 | patterns = self.patterns 172 | handlers = self.handlers 173 | pos = 0 174 | while pos < len(inp): 175 | match = None 176 | name = None 177 | handler = None 178 | for nm, rgx, hdl in zip(names, patterns, handlers): 179 | match = rgx.match(inp, pos=pos) 180 | if match: 181 | name = nm 182 | handler = hdl 183 | break 184 | else: 185 | raise Lexer.Error( 186 | "No pattern for unrecognized: {}th char in input: '{}'\n" 187 | .format(pos, inp[pos])) 188 | lxm = match.group() 189 | if name == 'IGNORED': 190 | # IGNORED should be associated with no handler. 191 | pass 192 | elif name == 'ERROR': 193 | # ERROR must have a handler, whilst not yielded as a token. 194 | assert handler, 'Each ERROR token must have a handler!' 195 | handler(lxm) 196 | else: 197 | val = handler(lxm) if handler else lxm 198 | yield Token(pos, name, lxm, val) 199 | pos = match.end() 200 | if with_end: 201 | yield END_TOKEN 202 | 203 | 204 | class Grammar(object): 205 | 206 | def __init__(self, rules, precedence=None): 207 | """A `Grammar` object has these attributes: 208 | 209 | Core attributes: 210 | 211 | :start: 212 | The starting syntactic rule of the grammar. 213 | :rules: 214 | All syntactic rules of the grammar. 215 | :nonterminals: 216 | Non-terminal symbols. 217 | :terminals: 218 | Terminal symbols. 219 | :precedence: 220 | Precedence of symbols to resolve LR-conflicts. 221 | 222 | Auxiliary attributes: 223 | 224 | :group: dict 225 | Lookup rules grouped by the same LHS. 226 | :unreachable: set 227 | Unreachable non-terminal symbols by deriving from start rule. 228 | :NULLABLE: set 229 | Nullable rules in the grammar. 230 | :FIRST: dict 231 | The FIRST set of terminal symbols of each non-terminal symbol. 232 | 233 | All these attributes are necessary for performing the 234 | CLOSURE-algorithm, including: 235 | 236 | :closure: 237 | 238 | :closure1_with_lookahead: 239 | 240 | closure with or without lookahead. 241 | 242 | """ 243 | 244 | if not precedence: 245 | precedence = {} 246 | 247 | # Augmented grammar with singleton/non-alternated start-rule. 248 | self.start = rules[0].lhs 249 | self.rules = rules 250 | 251 | # Conclude nonterminals/terminals. 252 | self.nonterminals = set() 253 | self.symbols = set() 254 | for lhs, rhs in rules: 255 | self.nonterminals.add(lhs) 256 | self.symbols.update(rhs) 257 | self.terminals = self.symbols - self.nonterminals 258 | 259 | # Group by LHS 260 | self.group = {nt: [] for nt in self.nonterminals} 261 | for i, (lhs, rhs) in enumerate(rules): 262 | self.group[lhs].append(i) 263 | # Collect unreachable nonterminal from start symbol. 264 | reachable = {self.start} 265 | while 1: 266 | news = set() 267 | for X in reachable: 268 | for j in self.group[X]: 269 | for Y in self.rules[j].rhs: 270 | if Y in self.nonterminals: 271 | if Y not in reachable: 272 | news.add(Y) 273 | if news: 274 | reachable.update(news) 275 | else: 276 | break 277 | self.unreachable = self.nonterminals - reachable 278 | 279 | # precedence is not only specifiable for tokens, but also for 280 | # symbols. 
281 | self.precedence = precedence 282 | 283 | # Calc NULLABLE 284 | self.NULLABLE = NULLABLE = set() 285 | while 1: 286 | has_new = False 287 | for lhs, rhs in rules: 288 | if all(x in NULLABLE for x in rhs): 289 | if lhs not in NULLABLE: 290 | NULLABLE.add(lhs) 291 | has_new = True 292 | if not has_new: 293 | break 294 | 295 | # Calc FIRST 296 | self.FIRST = FIRST = {} 297 | for t in self.terminals: 298 | FIRST[t] = {t} 299 | for nt in self.nonterminals: 300 | FIRST[nt] = set() 301 | if nt in NULLABLE: 302 | FIRST[nt].add('EPSILON') 303 | while 1: 304 | has_new = False 305 | for lhs, rhs in rules: 306 | # Use the FIRST[rhs] to update FIRST[lhs]. 307 | for Y in rhs: 308 | for a in FIRST[Y]: 309 | if a not in FIRST[lhs]: 310 | FIRST[lhs].add(a) 311 | has_new = True 312 | if Y not in NULLABLE: 313 | break 314 | if not has_new: 315 | break 316 | 317 | def __repr__(self): 318 | return pprint.pformat(self.rules) 319 | 320 | def first(self, X): 321 | if X in self.FIRST: 322 | return self.FIRST[X] 323 | else: 324 | return {X} 325 | 326 | def first_of_seq(self, seq, tail): 327 | assert tail != 'EPSILON' 328 | s = set() 329 | # `for-else` structure: do-and-find sth, if not found, run `else`. 330 | for Y in seq: 331 | s.update(self.first(Y)) 332 | if Y not in self.NULLABLE: 333 | break 334 | else: 335 | # `else` is executed only when `for` is not broken out. 336 | s.add(tail) 337 | s.discard('EPSILON') 338 | return s 339 | 340 | def closure(self, I): 341 | """Naive closure algorithm on item set :I:.""" 342 | G = self 343 | C = I[:] 344 | z = 0 345 | while z < len(C): 346 | (i, p) = C[z] 347 | if p < len(G.rules[i].rhs): 348 | X = G.rules[i].rhs[p] 349 | if X in G.nonterminals: 350 | for j in G.group[X]: 351 | if (j, 0) not in C: 352 | C.append((j, 0)) 353 | z += 1 354 | return C 355 | 356 | def closure1_with_lookahead(self, item, a): 357 | """Lookahead closure algorithm on item set [(:item:, :a:)].""" 358 | G = self 359 | C = [(item, a)] 360 | z = 0 361 | while z < len(C): 362 | (i, p), a = C[z] 363 | if p < len(G.rules[i].rhs): 364 | X = G.rules[i].rhs[p] 365 | if X in G.nonterminals: 366 | for j in G.group[X]: 367 | for b in G.first_of_seq(G.rules[i].rhs[p+1:], a): 368 | if ((j, 0), b) not in C: 369 | C.append(((j, 0), b)) 370 | z += 1 371 | return C 372 | 373 | class meta(type): 374 | 375 | class Reader(list): 376 | 377 | def __getitem__(self, k): 378 | raise KeyError() 379 | 380 | def __setitem__(self, k, v): 381 | if callable(v): 382 | self.append(Rule.from_func(v)) 383 | else: 384 | pass 385 | 386 | @classmethod 387 | def __prepare__(mcls, name, bases, *a, **kw): 388 | return Grammar.meta.Reader() 389 | 390 | def __new__(mcls, n, b, r): 391 | return Grammar(list(r)) 392 | 393 | 394 | def augment(rules, semans): 395 | """Augment language (rules, semantics) with a top rule and a top 396 | semantics. 397 | 398 | """ 399 | assert len(rules) == len(semans) 400 | start = rules[0].lhs 401 | rules = [Rule(start+'^', (start,))] + rules 402 | semans = [identity] + semans 403 | assert len(rules) == len(semans) 404 | return rules, semans 405 | 406 | 407 | class GSS(namedtuple('GSS', 'tail head')): 408 | 409 | """Graph Structured Stack: a memory-friendly structure for forking 410 | states during generalized parsing, which is identical to CONS 411 | structure in LISP. """ 412 | 413 | def to_list(self): 414 | 'Stack safety.' 
415 | gss = self 416 | l = deque() 417 | while gss is not Nil: 418 | l.appendleft(gss.head) 419 | gss = gss.tail 420 | return l 421 | 422 | def __repr__(self): 423 | return repr(self.to_list()) 424 | 425 | Nil = GSS(None, None) 426 | 427 | 428 | # In order to supply an API, syntax error during parsing may be 429 | # returned as object containing error information. 430 | # class MetaparseSyntaxError(SyntaxError): 431 | # def __init__(self, *a, lineno=None, offset=None): 432 | # super(MetaparseSyntaxError, self).__init__(*a) 433 | # self.lineno = lineno 434 | # self.offset = offset 435 | Just = namedtuple('Just', 'result') 436 | 437 | 438 | class LanguageError(Exception): 439 | # FIXME: Should contain some data attributes? 440 | def __init__(self, message): 441 | self.message = message 442 | 443 | 444 | class ParseError(Exception): 445 | 446 | def __init__(self, token, action, stack, tree_stack): 447 | """Record for syntactic error information during parsing. 448 | - thrown/returned during parsing? 449 | - handler? 450 | 451 | - May need to associate syntax error handler to the parser! 452 | - How to define such a handler? 453 | - For each rule? 454 | - Error correction? 455 | 456 | - Or even semantic error handler? 457 | - A handler defined to check the whole content of argument stack! 458 | - translation (i.e. applying semantics to arguments in arg-stack) 459 | only available after such check. 460 | - To be thrown in the rule-seman-body 461 | - To be catched and reported by the parsing routine 462 | """ 463 | 464 | """Which information to be included? 465 | 466 | - The syntax tree being constructed -- exactly the active item 467 | in the current state (top of stack), as well as the expected 468 | token. 469 | 470 | - The range of input text corresponding to the syntax tree? 471 | 472 | """ 473 | msg = ('Unexpected token {} at ({}:{})\n' 474 | 'while expecting actions \n{}\n' 475 | 'with state stack \n{}\n' 476 | 'and subtree stack \n{}\n' 477 | .format( 478 | token, 479 | token.pos, token.end, 480 | pformat(action), 481 | pformat(stack), 482 | pformat(tree_stack))) 483 | 484 | super(ParseError, self).__init__(msg) 485 | # self.tree = tree 486 | self.token = token 487 | self.action = action 488 | self.stack = stack 489 | 490 | 491 | class GLR(object): 492 | 493 | """Generalized LR parser with lookahead. 494 | 495 | - It is the generalized version of LALR parser, thus being 496 | slightly more powerful than typical GLR(0) parser due to 497 | utilization of lookhead. 498 | 499 | """ 500 | 501 | def __init__(self, lexer=None, rules=None, precedence=None): 502 | self.rules = rules if rules else [] 503 | self.precedence = precedence if precedence else {} 504 | self.lexer = lexer if lexer else Lexer() 505 | self.semans = [] 506 | 507 | assert isinstance(self.lexer, Lexer) 508 | assert isinstance(self.precedence, dict) 509 | assert isinstance(self.rules, list) 510 | assert isinstance(self.semans, list) 511 | 512 | def rule(self, func): 513 | rule = Rule.from_func(func) 514 | self.rules.append(rule) 515 | self.semans.append(func) 516 | 517 | def make(self): 518 | 519 | # Augmented lexer - ignoring spaces by default. 520 | lexes = set(self.lexer.names) 521 | if 'IGNORED' not in lexes: 522 | self.lexer.register('IGNORED', r'\s+') 523 | 524 | # Augmented grammar - top semantics 525 | self.rules, self.semans = augment(self.rules, self.semans) 526 | 527 | # Propagate precedence from lexer. 
528 | if self.lexer.precedence: 529 | self.precedence.update(self.lexer.precedence) 530 | 531 | # Prepare Grammar object to use closure algorithms. 532 | G = Grammar(self.rules, self.precedence) 533 | 534 | # if 'ERROR' not in self.lexer.handler: 535 | # warnings.warn( 536 | # "No ERROR handler available. " 537 | # "Lexer will fail when reading unrecognized character.") 538 | 539 | # Check coverage of Lexer. 540 | # - Each terminal should have its corresponding lexical pattern. 541 | for r, rule in enumerate(G.rules): 542 | for y in rule.rhs: 543 | if y in G.terminals and y not in lexes: 544 | msg = ('No lexical pattern provided ' 545 | 'for terminal symbol: {}\n' 546 | '- in {}th rule {}\n' 547 | ).format(y, r, rule) 548 | seman = self.semans[r] 549 | trc = traceback.format_list([ 550 | (seman.__code__.co_filename, 551 | seman.__code__.co_firstlineno, 552 | seman.__name__, 553 | '')])[0] 554 | trc_msg = ('- with helping traceback (if available): \n' 555 | '{}\n').format(trc) 556 | lex_msg = ('- declared lexes: {}\n').format(self.lexer) 557 | raise LanguageError(msg + trc_msg + lex_msg) 558 | 559 | # Report soundness of grammar (unreachable, loops, etc). 560 | for X in G.unreachable: 561 | for i in G.group[X]: 562 | seman = self.semans[i] 563 | trc = traceback.format_list([ 564 | (seman.__code__.co_filename, 565 | seman.__code__.co_firstlineno, 566 | seman.__name__, 567 | '')])[0] 568 | msg = ('There are unreachable nonterminals at {}th rule: {}.\n' 569 | '- with helping traceback: \n{}\n' 570 | ).format(i, G.unreachable, trc) 571 | # warnings.warn(msg) 572 | raise LanguageError(msg) 573 | 574 | # Kernel sets and corresponding GOTO 575 | self.Ks = Ks = [[(0, 0)]] 576 | self.GOTO = GOTO = [] 577 | 578 | # Make LR(0) kernel sets Ks and GOTO, incrementally. 579 | i = 0 580 | while i < len(Ks): 581 | I = Ks[i] 582 | igotoset = odict() 583 | for (nk, p) in G.closure(I): 584 | if p < len(G.rules[nk].rhs): 585 | X = G.rules[nk].rhs[p] 586 | if X not in igotoset: 587 | igotoset[X] = [] 588 | if (nk, p+1) not in igotoset[X]: 589 | # (nk, p+1) is the shifted item of (nk, p) 590 | igotoset[X].append((nk, p+1)) 591 | igoto = {} 592 | for X, J in igotoset.items(): 593 | J.sort() 594 | if J in Ks: 595 | igoto[X] = Ks.index(J) 596 | else: 597 | igoto[X] = len(Ks) 598 | Ks.append(J) 599 | GOTO.append(igoto) 600 | i += 1 601 | 602 | # Lookahead set corresponding to item set 603 | self.Ls = Ls = [[set() for _ in K] for K in Ks] 604 | 605 | Ls[0][0] = {'\x03'} 606 | # Ls[0][0] = {'$'} 607 | 608 | DUMMY = '\x00' 609 | propa = [] 610 | for i, K in enumerate(Ks): 611 | for ii, itm in enumerate(K): 612 | C = G.closure1_with_lookahead(itm, DUMMY) 613 | # for each non-kernel nk 614 | for (nk, p), a in C: 615 | # active 616 | if p < len(G.rules[nk].rhs): 617 | # actor 618 | X = G.rules[nk].rhs[p] 619 | # target item 620 | j = GOTO[i][X] 621 | jj = Ks[j].index((nk, p+1)) 622 | # spontaneous 623 | if a != DUMMY: 624 | Ls[j][jj].add(a) 625 | # propagated 626 | else: 627 | propa.append(( 628 | # from K[i], ii'th item 629 | (i, ii), 630 | # to K[j], jj'th item 631 | (j, jj), 632 | )) 633 | else: 634 | # Handle ended item here? 635 | # 636 | # No. The item to be reduced should share the 637 | # set of lookaheads of kernel item whilst this 638 | # set is yet to be accomplished. 
639 | pass 640 | 641 | # Propagation till fix-point 642 | self.propa = propa 643 | while 1: 644 | has_new = False 645 | for (i, ii), (j, jj) in propa: 646 | for a in Ls[i][ii]: 647 | if a not in Ls[j][jj]: 648 | Ls[j][jj].add(a) 649 | has_new = True 650 | if not has_new: 651 | break 652 | 653 | # Conclude lookahead actions allowing conflicts on identical 654 | # lookaheads. 655 | # self.ACTION = ACTION = [set() for _ in Ks] 656 | self.ACTION = ACTION = [{} for _ in Ks] 657 | for A, Xto in zip(ACTION, GOTO): 658 | for X, j in Xto.items(): 659 | if X in G.terminals: 660 | if X not in A: 661 | A[X] = set() 662 | A[X].add(('shift', j)) 663 | for K, L, A in zip(Ks, Ls, ACTION): 664 | for k, l in zip(K, L): 665 | for (c, q), b in G.closure1_with_lookahead(k, DUMMY): 666 | # Accept state. 667 | if c == 0 and q == 1: 668 | if '\x03' not in A: 669 | A['\x03'] = {('accept', 0)} 670 | # IMPORTANT: kernel/non-kernels which are ended! 671 | elif q == len(G.rules[c].rhs): 672 | # Spontaneous reduction 673 | if b != DUMMY: 674 | if b not in A: 675 | A[b] = set() 676 | A[b].add(('reduce', c)) 677 | # Propagated from lookaheads of kernel item 678 | # being closed 679 | else: 680 | for a in l: 681 | if a not in A: 682 | A[a] = set() 683 | A[a].add(('reduce', c)) 684 | 685 | # TODO: Resolving conflicts with symbol precedence 686 | # - Resolution can filter some invalid actions in ACTION 687 | # for GLR. 688 | # - Use phantom-precedence to decide! 689 | # - decider for shift: the left neighbor of item actor symbol 690 | # - decider for reduce: the lookahead symbol 691 | # - For any action in ACTION[i], i.e. A: 692 | # - if the decider has no precedence, it must be preserved; 693 | # - if the decider has highest precedence among A, it must be 694 | # preserved; 695 | # - otherwise, it gets excluded. 696 | # if self.precedence: 697 | # def prsv(i, look, action): 698 | # if Ks[i] 699 | # act, arg = action 700 | # if act == 'reduce': 701 | return 702 | 703 | def prepare_generalized(self, interpret=True): 704 | """Prepare a parsing coroutine which accepts tokens.""" 705 | agenda = deque() 706 | agenda.append((GSS(Nil, 0), Nil)) 707 | tokens = [] 708 | # results = ddict(list) 709 | 710 | token = yield None 711 | tokens.append(token) 712 | while 1: 713 | 714 | agenda_bak = deque(agenda) 715 | agenda_new = deque() 716 | 717 | # Dead states for error reporting. 718 | dead = [] 719 | 720 | while agenda: 721 | 722 | sstk, tstk = agenda.popleft() 723 | s = sstk.head 724 | 725 | if token.symbol in self.ACTION[s]: 726 | 727 | for act, arg in self.ACTION[s][token.symbol]: 728 | 729 | sstk1, tstk1 = sstk, tstk 730 | 731 | if act == 'reduce': 732 | tar_rule = self.rules[arg] 733 | subs = deque() 734 | for _ in tar_rule.rhs: 735 | # Pop from GSS 736 | sstk1 = sstk1.tail 737 | tstk1, sub = tstk1.tail, tstk1.head 738 | subs.appendleft(sub) 739 | if interpret: 740 | tree = self.semans[arg](*subs) 741 | else: 742 | tree = ParseTree(tar_rule.lhs, 743 | list(subs)) 744 | 745 | # NOTE: 746 | # 747 | # - Each state during cascaded reduction 748 | # should be added to the forks! 749 | # 750 | # - Intermediate reduction items may or 751 | # may not have a GOTO target! If no, 752 | # such items are denoted as "dead" - 753 | # they show possible expectations. 
754 | if tar_rule.lhs in self.GOTO[sstk1.head]: 755 | tar_trans = self.GOTO[sstk1.head][tar_rule.lhs] 756 | agenda.append( 757 | # Push into GSS 758 | (GSS(sstk1, tar_trans), 759 | GSS(tstk1, tree))) 760 | else: 761 | dead.append( 762 | (sstk1, tstk1)) 763 | 764 | elif act == 'accept': 765 | agenda_new.append( 766 | (sstk1, tstk1)) 767 | 768 | elif act == 'shift': 769 | agenda_new.append( 770 | (GSS(sstk1, arg), 771 | GSS(tstk1, 772 | token.value if interpret else token))) 773 | else: 774 | dead.append((sstk, tstk)) 775 | 776 | if not agenda_new: 777 | token = yield [ 778 | ParseError(token, self.ACTION[ss.head], ss, aa) 779 | for ss, aa in dead 780 | ] 781 | agenda = agenda_bak 782 | else: 783 | token = yield [Just(ts) for ss, ts in agenda_new] 784 | tokens.append(token) 785 | agenda = agenda_new 786 | 787 | def parse_generalized(self, inp, interpret=False): 788 | assert hasattr(self, 'ACTION'), \ 789 | 'Call your_parser.make() to build the parser first!' 790 | p = self.prepare_generalized(interpret) 791 | next(p) 792 | for token in self.lexer.tokenize(inp, False): 793 | rs = p.send(token) 794 | else: 795 | rs = p.send(END_TOKEN) 796 | return [r.result[-1] for r in rs] 797 | 798 | def interpret_generalized(self, inp): 799 | return self.parse_generalized(inp, True) 800 | 801 | def dumps(self): 802 | 'Dump this parser instance to readable Python code string.' 803 | 804 | tar = odict() 805 | 806 | tar['names'] = self.lexer.names 807 | tar['patterns'] = [ 808 | rgx.pattern for rgx in self.lexer.patterns 809 | ] 810 | tar['handlers'] = [ 811 | marshal.dumps(h.__code__) if h else None 812 | for h in self.lexer.handlers 813 | ] 814 | 815 | tar['rules'] = [tuple(rl) for rl in self.rules] 816 | tar['ACTION'] = self.ACTION 817 | tar['GOTO'] = self.GOTO 818 | 819 | tar['semans'] = [ 820 | marshal.dumps(f.__code__) 821 | for f in self.semans 822 | ] 823 | 824 | return '\n'.join( 825 | '{} = {}\n'.format(k, pformat(v)) 826 | for k, v in tar.items()) 827 | 828 | def dump(self, filename): 829 | with open(filename, 'w') as o: 830 | o.write(self.dumps()) 831 | 832 | @staticmethod 833 | def loads(src, env=globals()): 834 | 'Load a dumped code string and make a usable parse instance.' 835 | ctx = {} 836 | exec(src, env, ctx) 837 | 838 | names = ctx.pop('names') 839 | patterns = [re.compile(pat) 840 | for pat in ctx.pop('patterns')] 841 | handlers = [ 842 | types.FunctionType(marshal.loads(co), env) if co else None 843 | for co in ctx.pop('handlers') 844 | ] 845 | p = LALR(Lexer(names, patterns, handlers)) 846 | 847 | p.rules = [Rule(*rl) for rl in ctx.pop('rules')] 848 | p.semans = [ 849 | types.FunctionType(marshal.loads(co), env) 850 | for co in ctx.pop('semans') 851 | ] 852 | p.ACTION = ctx.pop('ACTION') 853 | p.GOTO = ctx.pop('GOTO') 854 | return p 855 | 856 | @staticmethod 857 | def load(filename, env=globals()): 858 | with open(filename, 'r') as o: 859 | return LALR.loads(o.read(), env=env) 860 | 861 | # Helper for easy reading/tracing/debugging. 862 | def show_item(self, item): 863 | i, p = item 864 | lhs, rhs = self.rules[i] 865 | return '({} = {}.{})'.format(lhs, 866 | ' '.join(rhs[:p]), 867 | ' '.join(rhs[p:])) 868 | 869 | def show_itemset(self, i): 870 | return ([self.show_item(tm) for tm in self.Ks[i]]) 871 | 872 | def show_action(self, action): 873 | act, arg = action 874 | if act == 'shift': 875 | return (act, self.show_itemset(arg)) 876 | else: 877 | return (act, self.rules[arg]) 878 | 879 | # Various style of declaration. 
880 | def __getitem__(self, k): 881 | raise KeyError() 882 | 883 | def __setitem__(self, k, v): 884 | 'This method is used to register attributes.' 885 | 886 | # Docstring of instance. 887 | if k == '__doc__': 888 | self.__doc__ = v 889 | 890 | # Built-in attributes ignored. 891 | elif k.startswith('__') and k.endswith('__'): 892 | pass 893 | 894 | # Lexical element. 895 | elif isinstance(v, str): 896 | self.lexer.register(k, v) 897 | 898 | # Lexical element with precedence. 899 | elif isinstance(v, tuple): 900 | assert len(v) == 2 901 | pat, prece = v 902 | self.lexer.register(k, pat) 903 | if prece in self.precedence: 904 | raise ValueError( 905 | 'Repeated specifying the precedence ' 906 | 'of symbol: {}'.format(k)) 907 | else: 908 | self.precedence[k] = prece 909 | 910 | # Method as handler... 911 | elif callable(v): 912 | parlist = v.__code__.co_varnames[:v.__code__.co_argcount] 913 | # for new lexical element. 914 | if len(parlist) == 1 and parlist[0] in ('lex', 'LEX'): 915 | for prm, pat in v.__annotations__.items(): 916 | if prm == 'return': 917 | self.precedence[k] = pat 918 | else: 919 | self.lexer.register(k, pat, v) 920 | # for existing lexical element 921 | elif any(k == lx for lx in self.lexer.names): 922 | assert len(parlist) == 1 923 | for i, lx in reversed( 924 | list(enumerate(self.lexer.names))): 925 | if lx == k: 926 | self.lexer.handlers[i] = v 927 | # for syntax rule, i.e. semantics. 928 | else: 929 | self.rule(v) 930 | 931 | def __enter__(self): 932 | return self.lexer, self.rule 933 | 934 | def __exit__(self, *a, **kw): 935 | self.make() 936 | 937 | class meta(type): 938 | 939 | @classmethod 940 | def __prepare__(mcls, name, bases, *a, **kw): 941 | return GLR(*a, **kw) 942 | 943 | def __new__(mcls, m, bs, p, **kw): 944 | p.make() 945 | return p 946 | 947 | @classmethod 948 | def verbose(cls, func_def): 949 | "Polymorphic class method which tends to be overriden." 950 | assert func_def.__code__.co_argcount == 2 951 | p = cls() 952 | func_def(p.lexer, p.rule) 953 | p.make() 954 | return p 955 | 956 | 957 | class LALR(GLR): 958 | 959 | """LookAhead LR parser. 960 | 961 | - Can use precedence of tokens to resolve conflicts. 962 | 963 | """ 964 | 965 | def make(self): 966 | # Make GLALR(1) automaton. 967 | super(LALR, self).make() 968 | # Resolve conflicts with precedence. 969 | Ks = self.Ks 970 | ACTION = self.ACTION 971 | ACTION1 = [{} for _ in Ks] 972 | for i, A in enumerate(ACTION): 973 | A1 = ACTION1[i] 974 | # Try add (act, arg) into A1. 975 | for a, actargs in A.items(): 976 | for act, arg in actargs: 977 | # It is assured that 'shift' is added earlier than 'reduce' 978 | if a in A1: 979 | # Conflict resolver here! 
980 | act0, arg0 = A1[a] 981 | if {act0, act} == {'shift', 'reduce'}: 982 | if act0 == 'reduce': 983 | s, s_i = act, arg 984 | r, r_r = act0, arg0 985 | else: 986 | s, s_i = act0, arg0 987 | r, r_r = act, arg 988 | redu = self.rules[r_r] 989 | if a in self.precedence: 990 | if len(redu.rhs) > 1 and \ 991 | redu.rhs[-2] in self.precedence: 992 | lft = redu.rhs[-2] 993 | rgt = a 994 | if self.precedence[lft] >= \ 995 | self.precedence[rgt]: 996 | A1[a] = (r, r_r) 997 | else: 998 | A1[a] = (s, s_i) 999 | continue 1000 | # Unable to resolve 1001 | msg = ("\n" 1002 | "Handling item set: \n" "{}\n" 1003 | "Conflict on lookahead: {} \n" 1004 | "- {}\n" "- {}\n" 1005 | ).format( 1006 | self.show_itemset(i), 1007 | a, 1008 | self.show_action(A1[a]), 1009 | self.show_action((act, arg))) 1010 | raise LanguageError(msg) 1011 | else: 1012 | A1[a] = (act, arg) 1013 | 1014 | self.ACTION = ACTION1 1015 | 1016 | def prepare(self, interpret=True): 1017 | """Prepare a parsing coroutine which accepts tokens.""" 1018 | sstk = [0] # state stack 1019 | tstk = [] # subtree stack 1020 | token = yield Just(None) 1021 | 1022 | while 1: 1023 | 1024 | if token.symbol in self.ACTION[sstk[-1]]: 1025 | act, arg = self.ACTION[sstk[-1]][token.symbol] 1026 | # Active tree set default to token. 1027 | tree = token 1028 | 1029 | # Reduce (no new token fetched during reduction) 1030 | if act == 'reduce': 1031 | subs = deque() 1032 | for _ in self.rules[arg].rhs: 1033 | sstk.pop() 1034 | subs.appendleft(tstk.pop()) 1035 | 1036 | if interpret: 1037 | tree = self.semans[arg](*subs) 1038 | else: 1039 | tree = ParseTree(self.rules[arg].lhs, list(subs)) 1040 | 1041 | # Transfer with reduced symbol. 1042 | sstk.append(self.GOTO[sstk[-1]][self.rules[arg].lhs]) 1043 | tstk.append(tree) 1044 | 1045 | # Accept 1046 | if act == 'accept': 1047 | assert sstk.pop() == 1 1048 | tree = tstk.pop() 1049 | assert sstk == [0], sstk 1050 | assert tstk == [], tstk 1051 | # Now parsing routine is identical to the initial 1052 | # state and can start a new round, thus no need 1053 | # to create new routines for more parsing tasks. 1054 | token = yield Just(tree) 1055 | 1056 | # Shift 1057 | elif act == 'shift': 1058 | sstk.append(arg) 1059 | tstk.append(token.value if interpret else token) 1060 | token = yield Just(tree) 1061 | 1062 | else: 1063 | token = yield ParseError( 1064 | token, 1065 | self.ACTION[sstk[-1]], 1066 | [self.show_itemset(s) for s in sstk], 1067 | tstk) 1068 | 1069 | def parse(self, inp, interpret=False): 1070 | assert hasattr(self, 'ACTION'), \ 1071 | 'Call yourparser.make() to build the parser first!' 1072 | rtn = self.prepare(interpret) 1073 | next(rtn) 1074 | for token in self.lexer.tokenize(inp, False): 1075 | opt = rtn.send(token) 1076 | if isinstance(opt, ParseError): 1077 | warnings.warn(opt) 1078 | just = rtn.send(END_TOKEN) 1079 | return just.result 1080 | 1081 | def interpret(self, inp): 1082 | assert self.semans, 'Must have semantics to interpret.' 1083 | return self.parse(inp, True) 1084 | 1085 | class meta(type): 1086 | 1087 | def __prepare__(mcls, *a, **kw): 1088 | return LALR(*a, **kw) 1089 | 1090 | def __new__(mcls, m, bs, p, **kw): 1091 | p.make() 1092 | return p 1093 | 1094 | 1095 | class Inspector(LALR): 1096 | 1097 | """Collection of methods for inspecting LALR parser object's 1098 | attributes. 1099 | 1100 | - Since the representation of structures in LALR are raw 1101 | integers/pairs as indices, these methods help inspect indexed 1102 | objects. 1103 | 1104 | - They are organized here to avoid clustering. 
1105 | 1106 | """ 1107 | 1108 | def inspect_Ks(self): 1109 | pprint.pprint([(k, [self.show_item(itm) for itm in K]) 1110 | for k, K in enumerate(self.Ks)]) 1111 | 1112 | def inspect_lkhs(self): 1113 | pprint.pprint([ 1114 | [(i, self.show_item(self.Ks[i][ii])), 1115 | (j, self.show_item(self.Ks[j][jj]))] 1116 | for (i, ii), (j, jj) in self.propa 1117 | ]) 1118 | 1119 | def inspect_propa(self): 1120 | pprint.pprint([ 1121 | [(i, self.show_item(self.Ks[i][ii])), 1122 | (j, self.show_item(self.Ks[j][jj]))] 1123 | for (i, ii), (j, jj) in self.propa 1124 | ]) 1125 | 1126 | def inspect_Ls(self): 1127 | pprint.pprint([ 1128 | (i, [(self.show_item(itm), lkhs) 1129 | for itm, lkhs in zip(K, self.Ls[i])]) 1130 | for i, K in enumerate(self.Ks) 1131 | ]) 1132 | 1133 | def inspect_ACTION(self): 1134 | pprint.pprint([ 1135 | (i, self.show_itemset(i), self.ACTION[i]) 1136 | for i, K in enumerate(self.Ks) 1137 | ]) 1138 | 1139 | def inspect_GOTO(self): 1140 | pprint.pprint([ 1141 | (i, self.show_itemset(i), self.GOTO[i]) 1142 | for i, K in enumerate(self.Ks) 1143 | ]) 1144 | 1145 | --------------------------------------------------------------------------------