├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── python_parser ├── __init__.py ├── examples │ ├── __init__.py │ ├── calc.py │ ├── json_parser.py │ └── output.png ├── parser.py └── util.py ├── setup.py └── tests ├── test_calc.py ├── test_json_loader.py └── test_parser.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | .idea/* 91 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | language: python 2 | python: '3.5' 3 | install: 4 | - travis_retry pip install pytest 5 | - travis_retry python setup.py install 6 | script: 7 | - pytest . -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 George Fortunatov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python-parser 2 | A recursive descent parser generator 3 | 4 | [](https://travis-ci.org/qweeze/python-parser) 5 | 6 | ### Usage 7 | To create a parser you should provide a sequence of tokens and a BNF-like grammar. 
Here's a simple example of parsing an expression with nested parentheses: 8 | ```python 9 | from python_parser import Parser, a, anyof, someof, maybe, skip, to_dot 10 | 11 | tokens = ( 12 | ('\(', 'L_PAR'), 13 | ('\)', 'R_PAR'), 14 | ('\,', 'SEP'), 15 | ('\d+', 'NUM'), 16 | ('"\w+"', 'STR') 17 | ) 18 | grammar = { 19 | 'EXPR': a( 20 | skip('L_PAR'), 21 | 'VALUE', maybe(someof(skip('SEP'), 'VALUE')), 22 | skip('R_PAR') 23 | ), 24 | 'VALUE': anyof('STR', 'NUM', 'EXPR') 25 | } 26 | string_to_parse = '(1, 2, ("test", ((3), 4)))' 27 | 28 | parser = Parser(tokens, grammar) 29 | ast = parser.parse('EXPR', string_to_parse) 30 | 31 | with open('ast.dot', 'w') as f: 32 | f.write(to_dot(ast)) 33 | ``` 34 | 35 | After running this code `ast.dot` should contain the following graph: 36 |
37 | ![AST graph](python_parser/examples/output.png)
38 |
39 | ### More examples
40 | [Calculator](python_parser/examples/calc.py)
41 |
42 | > (((23 + -3.2)) / 34+(2*((3)) *(3.0 - .1))) 43 | 17.98235294117647 44 | > 1/0 45 | float division by zero 46 | > a = 11 47 | > b = -6 48 | > a * b 49 |52 | [JSON parser](python_parser/examples/json_parser.py) 53 | ```python 54 | >>> from python_parser.examples.json_parser import load as json_loads 55 | >>> json_string = '{"first": [1, 2, 3, {"5": 6}, true], "key": "value", "1": 2}' 56 | >>> result = json_loads(json_string) 57 | >>> type(result) 58 | builtins.dict 59 | >>> result['first'][3]['5'] 60 | 6 61 | ``` 62 | -------------------------------------------------------------------------------- /python_parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .parser import Parser, a, anyof, someof, maybe, skip 2 | from .util import to_dot 3 | -------------------------------------------------------------------------------- /python_parser/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qweeze/python-parser/e7761d219d2b2035f03f914905c75240dbf18b96/python_parser/examples/__init__.py -------------------------------------------------------------------------------- /python_parser/examples/calc.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | import readline 3 | import operator as op 4 | 5 | from python_parser import Parser, a, anyof, maybe, someof 6 | 7 | 8 | class InterpreterError(Exception): 9 | pass 10 | 11 | class Interpreter(object): 12 | 13 | def __init__(self): 14 | self.un_ops = { 15 | '-': op.neg, 16 | '+': op.pos 17 | } 18 | self.bin_ops = { 19 | '*': op.mul, 20 | '/': op.truediv, 21 | '+': op.add, 22 | '-': op.sub 23 | } 24 | self.vars = {} 25 | 26 | def expr(self, items): 27 | result = self.visit(next(items)) 28 | op = next(items, None) 29 | while op is not None: 30 | result = self.bin_ops[op.value](result, self.visit(next(items))) 31 | op = next(items, 
None) 32 | return result 33 | 34 | def term(self, items): 35 | return self.expr(items) 36 | 37 | def factor(self, items): 38 | item = next(items) 39 | if item.name == 'L_PAR': 40 | result = self.visit(next(items)) 41 | elif item.name in ('ADD', 'SUB'): 42 | result = self.un_ops[item.value](self.visit(next(items))) 43 | elif item.name == 'NAME': 44 | if item.value not in self.vars: 45 | raise InterpreterError( 46 | 'Variable {} is not defined'.format(item.value)) 47 | result = self.vars[item.value] 48 | else: 49 | result = float(item.value) 50 | next(items, None) 51 | return result 52 | 53 | def defn(self, items): 54 | name = next(items).value.split('=')[0].rstrip() 55 | self.vars[name] = self.visit(next(items)) 56 | 57 | def skip(self, items): 58 | return self.visit(next(items)) 59 | 60 | def visit(self, node): 61 | return getattr(self, node.name.lower(), self.skip)(iter(node.items)) 62 | 63 | 64 | tokens = ( 65 | ('(\d*\.\d+)|(\d+\.\d*)', 'FLOAT'), 66 | ('\d+', 'INT'), 67 | ('\+', 'ADD'), 68 | ('-', 'SUB'), 69 | ('\*', 'MUL'), 70 | ('/', 'DIV'), 71 | ('\)', 'R_PAR'), 72 | ('\(', 'L_PAR'), 73 | ('\w+\s*=', 'SET'), 74 | ('\w+', 'NAME'), 75 | ('=', 'EQ') 76 | ) 77 | 78 | grammar = { 79 | 'FACTOR': anyof( 80 | 'FLOAT', 'INT', 'NAME', 81 | a(anyof('ADD', 'SUB'), 'FACTOR'), 82 | a('L_PAR', 'EXPR', 'R_PAR')), 83 | 'TERM': a('FACTOR', maybe(someof(anyof('DIV', 'MUL'), 'FACTOR'))), 84 | 'DEFN': a('SET', 'EXPR'), 85 | 'EXPR': a('TERM', maybe(someof(anyof('ADD', 'SUB'), 'TERM'))), 86 | 'PROGRAM': anyof('EXPR', 'DEFN') 87 | } 88 | 89 | parser = Parser(tokens, grammar) 90 | interpreter = Interpreter() 91 | 92 | def calc_eval(text): 93 | ast = parser.parse('PROGRAM', text) 94 | return interpreter.visit(ast) 95 | 96 | 97 | _bold = '\033[;1m{}\033[0;0m'.format 98 | _red = '\033[1;31m{}\033[0;0m'.format 99 | 100 | if __name__ == '__main__': 101 | while True: 102 | try: 103 | text = input(_bold('> ')) 104 | if not text: 105 | continue 106 | rv = calc_eval(text) 107 | if rv is not 
None: 108 | print(' ', _bold(rv)) 109 | except (KeyboardInterrupt, EOFError): 110 | exit(0) 111 | except SyntaxError as exc: 112 | msg = traceback.format_exception_only(type(exc), exc) 113 | print(_red(''.join(msg[3:] + msg[1:3])), end='') 114 | except (ArithmeticError, InterpreterError) as exc: 115 | print(_red(exc)) 116 | -------------------------------------------------------------------------------- /python_parser/examples/json_parser.py: -------------------------------------------------------------------------------- 1 | from python_parser import Parser, a, anyof, maybe, skip, someof 2 | 3 | tokens = ( 4 | ('[+-]?(\\d+(\\.\\d*)?|\\.\\d+)', 'NUM'), 5 | ('"\w+"', 'STR'), 6 | (':', 'COL'), 7 | ('\[', 'L_BR'), 8 | ('\]', 'R_BR'), 9 | ('{', 'L_PAR'), 10 | ('}', 'R_PAR'), 11 | ('\,', 'SEP'), 12 | ('true', 'TRUE'), 13 | ('false', 'FALSE'), 14 | ('null', 'NULL') 15 | ) 16 | grammar = { 17 | 'ROOT': anyof('ARR', 'OBJ'), 18 | 'VAL': anyof('STR', 'NUM', 'ARR', 'OBJ', 'TRUE', 'FALSE', 'NULL'), 19 | 'ARR': a( 20 | skip('L_BR'), 21 | maybe('VAL', maybe(someof(skip('SEP'), 'VAL'))), 22 | skip('R_BR') 23 | ), 24 | 'OBJ': a( 25 | skip('L_PAR'), 26 | maybe('PAIR', maybe(someof(skip('SEP'), 'PAIR'))), 27 | skip('R_PAR') 28 | ), 29 | 'PAIR': a('STR', skip('COL'), 'VAL') 30 | } 31 | 32 | parser = Parser(tokens, grammar) 33 | 34 | 35 | def load(text): 36 | 37 | def arr(node): 38 | return list(map(val, node.items)) 39 | 40 | def obj(node): 41 | return dict(map(pair, node.items)) 42 | 43 | def val(node): 44 | node = node.items[0] 45 | return { 46 | 'STR': lambda t: t.value[1: -1], 47 | 'NUM': lambda t: float(t.value), 48 | 'TRUE': lambda _: True, 49 | 'FALSE': lambda _: False, 50 | 'NULL': lambda _: None, 51 | 'ARR': arr, 52 | 'OBJ': obj 53 | }[node.name](node) 54 | 55 | def pair(node): 56 | key, value = node.items 57 | return val(node), val(value) 58 | 59 | ast = parser.parse('ROOT', text) 60 | root = ast.items[0] 61 | return locals()[root.name.lower()](root) 62 | 
-------------------------------------------------------------------------------- /python_parser/examples/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qweeze/python-parser/e7761d219d2b2035f03f914905c75240dbf18b96/python_parser/examples/output.png -------------------------------------------------------------------------------- /python_parser/parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import namedtuple 3 | 4 | 5 | Token = namedtuple('Token', ('name', 'value')) 6 | Node = namedtuple('Node', ('name', 'items')) 7 | 8 | 9 | class Lexer(object): 10 | """ 11 | A tokenizer that takes a string and produces a sequence of 12 | `Token` instances. If no match found a SyntaxError is raised. 13 | """ 14 | def __init__(self, patterns): 15 | """ 16 | :param patterns: A sequence of (regex_pattern, token_name) tuples. 17 | Patterns are order dependent: first match wins 18 | """ 19 | self.patterns = [ 20 | (re.compile(bytes(p, 'utf8')), name) for p, name in patterns] 21 | 22 | def lex(self, raw, ignore_spaces=True): 23 | """ 24 | :param raw: an input string 25 | :param ignore_spaces: if True, all whitespace characters are skipped 26 | :return: generator of tokens 27 | """ 28 | self.raw = bytearray(raw, 'utf8') 29 | self.pos = 0 30 | endpos = len(self.raw) 31 | 32 | while self.pos != endpos: 33 | if ignore_spaces and self.raw[self.pos: self.pos + 1].isspace(): 34 | self.pos += 1 35 | continue 36 | for p, name in self.patterns: 37 | m = p.match(self.raw[self.pos:]) 38 | if m is not None: 39 | val, offset = m.group(), m.end() 40 | yield Token(name, str(val, 'utf8')) 41 | self.pos += offset 42 | break 43 | else: 44 | self.error('Illegal character') 45 | yield Token('EOF', None) 46 | 47 | def error(self, message): 48 | raise SyntaxError(message, self.get_debug_info()) 49 | 50 | def get_debug_info(self, f_name=None): 51 | pos = 
self.pos + 1 52 | raw = self.raw 53 | line_no = raw[:pos].count(b'\n') 54 | line_start = max(raw.rfind(b'\n'), 0) 55 | line_end = max(raw.find(b'\n'), len(raw)) 56 | line = str(raw[line_start:line_end], 'utf-8') 57 | offset = pos - line_start 58 | return (f_name, line_no, offset, line) 59 | 60 | 61 | class Parser(object): 62 | def __init__(self, tokens, grammar): 63 | self.lexer = Lexer(tokens) 64 | self.grammar = grammar 65 | self.count = 0 66 | 67 | def step(self): 68 | self.cur_token = next(self.token_generator, None) 69 | if self.cur_token is not None: 70 | self.count += 1 71 | 72 | def error(self): 73 | if self.cur_token.value is not None: 74 | message = 'Unexpected token {}'.format(self.cur_token.value) 75 | else: 76 | message = 'Unexpected EOF' 77 | self.lexer.error(message) 78 | 79 | def eat(self, token_name): 80 | if self.cur_token is not None and self.cur_token.name == token_name: 81 | token = self.cur_token 82 | self.step() 83 | return token 84 | 85 | def parse_rule(self, rule): 86 | return Node(name=rule, items=self.grammar[rule](self)) 87 | 88 | def parse(self, rule, text, ignore_spaces=True, check_eof=True): 89 | self.token_generator = self.lexer.lex(text, ignore_spaces) 90 | self.cur_token = None 91 | self.step() 92 | try: 93 | result = self.parse_rule(rule) 94 | if check_eof: 95 | a('EOF')(self) 96 | except ParserError: 97 | self.error() 98 | else: 99 | return result 100 | 101 | 102 | class ParserError(Exception): 103 | pass 104 | 105 | 106 | def unify(*args): 107 | args = (arg if callable(arg) else a(arg) for arg in args) 108 | return a(*args) 109 | 110 | 111 | def just(token_name): 112 | def inner(parser): 113 | token = parser.eat(token_name) 114 | if token is None: 115 | raise ParserError 116 | return token 117 | return inner 118 | 119 | 120 | def maybe(*args): 121 | def inner(parser): 122 | cnt = parser.count 123 | try: 124 | return unify(*args)(parser) 125 | except ParserError: 126 | if parser.count != cnt: 127 | raise ParserError 128 | return 
inner 129 | 130 | 131 | def skip(*args): 132 | def inner(parser): 133 | unify(*args)(parser) 134 | return inner 135 | 136 | 137 | def anyof(*args): 138 | def inner(parser): 139 | for arg in args: 140 | result = maybe(arg)(parser) 141 | if result: 142 | return result 143 | raise ParserError 144 | return inner 145 | 146 | 147 | def someof(*args): 148 | def inner(parser): 149 | result = unify(*args)(parser) 150 | while True: 151 | part = maybe(unify(*args))(parser) 152 | if part: 153 | result.extend(part) 154 | else: 155 | break 156 | return result 157 | return inner 158 | 159 | 160 | def a(*args): 161 | def inner(parser): 162 | result = [] 163 | for arg in args: 164 | if arg in parser.grammar: 165 | arg = parser.parse_rule(arg) 166 | if isinstance(arg, str): 167 | arg = just(arg) 168 | if callable(arg): 169 | arg = arg(parser) 170 | if arg is not None: 171 | if not isinstance(arg, list): 172 | result.append(arg) 173 | else: 174 | result.extend(arg) 175 | return result 176 | return inner 177 | -------------------------------------------------------------------------------- /python_parser/util.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from .parser import Token, Node 3 | 4 | 5 | def to_dot(ast): 6 | result = [ 7 | 'strict digraph "AST" {', 8 | 'size="16,14"; ratio = fill;' 9 | ] 10 | _escape = lambda s: s.replace('"', r'\"') 11 | 12 | def format_node(node, uid): 13 | if isinstance(node, Token): 14 | label = '{} [{}]'.format(*map(_escape, (node.name, node.value))) 15 | elif isinstance(node, Node): 16 | label = '{}'.format(*map(_escape, (node.name,))) 17 | else: 18 | raise ValueError("Can't format node {}".format(node)) 19 | return '"{}" [label="{}"];'.format(uid, label) 20 | 21 | def walk(node, uid): 22 | result.append(format_node(node, uid)) 23 | if isinstance(node, Node): 24 | for i in node.items: 25 | child_uid = uuid.uuid4().hex 26 | walk(i, child_uid) 27 | result.append('"{}" -> "{}";'.format(uid, 
child_uid)) 28 | 29 | uid = uuid.uuid4().hex 30 | walk(ast, uid) 31 | result.append('}') 32 | return '\n'.join(result) 33 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='python-parser', 5 | author='qweeze', 6 | author_email='qweeeze@gmail.com', 7 | description='A recursive descent parser', 8 | packages=find_packages('.') 9 | ) 10 | -------------------------------------------------------------------------------- /tests/test_calc.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from python_parser.examples.calc import parser, interpreter, InterpreterError 3 | 4 | 5 | def test_calc(): 6 | 7 | interpreter.visit(parser.parse('PROGRAM', 'a = 11')) 8 | interpreter.visit(parser.parse('PROGRAM', 'b = -13')) 9 | a, b = 11, -13 10 | 11 | samples = ( 12 | '2+2', 13 | '(((23 + -3.2)) / 34+(2*((3)) *(3.0 - .1)))', 14 | '0.000', 15 | 'a * b - 123' 16 | ) 17 | 18 | for i in samples: 19 | assert interpreter.visit(parser.parse('PROGRAM', i)) == eval(i) 20 | 21 | with pytest.raises(ZeroDivisionError): 22 | interpreter.visit(parser.parse('PROGRAM', '1 / 0')) 23 | 24 | with pytest.raises(InterpreterError): 25 | interpreter.visit(parser.parse('PROGRAM', '1 + undefined')) 26 | -------------------------------------------------------------------------------- /tests/test_json_loader.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from python_parser.examples.json_parser import load as json_loads 4 | 5 | 6 | def test_json_loader(): 7 | samples = ( 8 | { 9 | 'first': [1, 2, 3, {'5': 6}, True], 10 | 'key': 'value', 11 | '1': 2, 12 | }, 13 | [ 14 | {'test': 'test'}, 15 | None, 16 | {'test': False}, 17 | -321.123 18 | ], 19 | {}, 20 | [] 21 | ) 22 | for sample in samples: 23 | assert 
json_loads(json.dumps(sample)) == sample 24 | -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from python_parser.parser import Lexer, Token, Node 3 | from python_parser import * 4 | 5 | 6 | def test_lexer(): 7 | patterns = ( 8 | ('\d+', 'NUM'), 9 | ('\w+', 'STR') 10 | ) 11 | lexer = Lexer(patterns) 12 | 13 | assert list(lexer.lex( '123 abc ')) == ( 14 | [Token('NUM', '123'), Token('STR', 'abc'), Token('EOF', None)] 15 | ) 16 | assert list(lexer.lex('')) == [Token('EOF', None)] 17 | 18 | with pytest.raises(SyntaxError): 19 | list(lexer.lex('123 abc ?')) 20 | 21 | 22 | def test_parser(): 23 | patterns = ( 24 | ('\(', 'L_PAR'), 25 | ('\)', 'R_PAR'), 26 | ('\,', 'SEP'), 27 | ('\d+', 'NUM'), 28 | ('"\w+"', 'STR') 29 | ) 30 | grammar = { 31 | 'EXPR': a( 32 | skip('L_PAR'), 33 | 'VALUE', maybe(someof(skip('SEP'), 'VALUE')), 34 | skip('R_PAR') 35 | ), 36 | 'VALUE': anyof('STR', 'NUM', 'EXPR') 37 | } 38 | parser = Parser(patterns, grammar) 39 | ast = parser.parse('EXPR', '(1, 2, ("test", ((3), 4)))') 40 | 41 | 42 | result = Node('EXPR', items=[ 43 | Node('VALUE', [Token('NUM', '1')]), 44 | Node('VALUE', [Token('NUM', '2')]), 45 | Node('VALUE', [Node('EXPR', items=[ 46 | Node('VALUE', [Token('STR', '"test"')]), 47 | Node('VALUE', items=[ 48 | Node('EXPR', items=[ 49 | Node('VALUE', items=[ 50 | Node('EXPR', items=[ 51 | Node('VALUE', [Token('NUM', '3')]) 52 | ]) 53 | ]), 54 | Node('VALUE', [Token('NUM', '4')]) 55 | ]) 56 | ]) 57 | ])]) 58 | ]) 59 | assert ast == result 60 | 61 | with pytest.raises(SyntaxError): 62 | parser.parse('EXPR', '((1, 2, ("test", (3, 4)))') ---------------------------------------------------------------------------------66.0 50 | > 51 |