├── parsimonious
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_benchmarks.py
│   │   ├── benchmarks.py
│   │   ├── test_nodes.py
│   │   ├── test_expressions.py
│   │   └── test_grammar.py
│   ├── __init__.py
│   ├── utils.py
│   ├── exceptions.py
│   ├── nodes.py
│   ├── expressions.py
│   └── grammar.py
├── MANIFEST.in
├── .gitignore
├── .travis.yml
├── tox.ini
├── LICENSE
├── setup.py
└── README.rst

/parsimonious/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include LICENSE
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .tox
2 | *.egg-info
3 | *.egg
4 | *.pyc
5 | build
6 | dist
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - "2.6"
4 |   - "2.7"
5 |   - "3.3"
6 |   - "3.4"
7 |   - "3.5"
8 | install:
9 |   - pip install tox tox-travis
10 | script:
11 |   - tox
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py26, py27, py33, py34, py35
3 | 
4 | [tox:travis]
5 | 2.6 = py26
6 | 2.7 = py27
7 | 3.3 = py33
8 | 3.4 = py34
9 | 3.5 = py35
10 | 
11 | [testenv]
12 | usedevelop = True
13 | commands = nosetests parsimonious
14 | deps = nose
15 | # So Python 3 doesn't pick up incompatible, un-2to3'd source from the cwd:
16 | changedir = .tox
--------------------------------------------------------------------------------
/parsimonious/__init__.py:
--------------------------------------------------------------------------------
1 | """Parsimonious's public API. Import from here.
2 | 
3 | Things may move around in modules deeper than this one.
4 | 
5 | """
6 | from parsimonious.exceptions import (ParseError, IncompleteParseError,
7 |                                      VisitationError, UndefinedLabel)
8 | from parsimonious.grammar import Grammar, TokenGrammar
9 | from parsimonious.nodes import NodeVisitor, rule
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012 Erik Rose
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
7 | of the Software, and to permit persons to whom the Software is furnished to do
8 | so, subject to the following conditions:
9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 
--------------------------------------------------------------------------------
/parsimonious/utils.py:
--------------------------------------------------------------------------------
1 | """General tools which don't depend on other parts of Parsimonious"""
2 | 
3 | import ast
4 | from sys import version_info
5 | 
6 | from six import python_2_unicode_compatible
7 | 
8 | 
9 | class StrAndRepr(object):
10 |     """Mix-in to add a ``__repr__`` which returns the value of ``__str__``
11 |     (UTF-8-encoded on Python 2)"""
12 | 
13 |     if version_info >= (3,):
14 |         # Don't return the "bytes" type from Python 3's __repr__:
15 |         def __repr__(self):
16 |             return self.__str__()
17 |     else:
18 |         def __repr__(self):
19 |             return self.__str__().encode('utf-8')
20 | 
21 | 
22 | def evaluate_string(string):
23 |     """Piggyback on Python's string support so we can have backslash escaping
24 |     and niceties like \n, \t, etc. string.decode('string_escape') would have
25 |     been a lower-level possibility.
26 | 
27 |     """
28 |     return ast.literal_eval(string)
29 | 
30 | 
31 | @python_2_unicode_compatible
32 | class Token(StrAndRepr):
33 |     """A class to represent tokens, for use with TokenGrammars
34 | 
35 |     You will likely want to subclass this to hold additional information, like
36 |     the characters that you lexed to create this token. Alternately, feel free
37 |     to create your own class from scratch. The only contract is that tokens
38 |     must have a ``type`` attr.
39 | 
40 |     """
41 |     __slots__ = ['type']
42 | 
43 |     def __init__(self, type):
44 |         self.type = type
45 | 
46 |     def __str__(self):
47 |         return u'<Token "%s">' % (self.type,)
48 | 
49 |     def __eq__(self, other):
50 |         return self.type == other.type
--------------------------------------------------------------------------------
/parsimonious/tests/test_benchmarks.py:
--------------------------------------------------------------------------------
1 | """Tests to show that the benchmarks we based our speed optimizations on are
2 | still valid"""
3 | 
4 | from functools import partial
5 | from timeit import timeit
6 | 
7 | from nose.tools import ok_
8 | 
9 | 
10 | timeit = partial(timeit, number=500000)
11 | 
12 | 
13 | def test_lists_vs_dicts():
14 |     """See what's faster at int key lookup: dicts or lists."""
15 |     list_time = timeit('item = l[9000]', 'l = [0] * 10000')
16 |     dict_time = timeit('item = d[9000]', 'd = dict((x, 0) for x in range(10000))')
17 | 
18 |     # Dicts take about 1.6x as long as lists in Python 2.6 and 2.7.
19 |     ok_(list_time < dict_time, '%s < %s' % (list_time, dict_time))
20 | 
21 | 
22 | def test_call_vs_inline():
23 |     """How bad is the calling penalty?"""
24 |     no_call = timeit('l[0] += 1', 'l = [0]')
25 |     call = timeit('add(); l[0] += 1', 'l = [0]\n'
26 |                                       'def add():\n'
27 |                                       '    pass')
28 | 
29 |     # Calling a function is pretty fast; it takes just 1.2x as long as the
30 |     # global var access and addition in l[0] += 1.
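    # (Ratios like the 1.2x above vary by interpreter and hardware; the
    # assertion below checks only the ordering, not the magnitude.)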
31 |     ok_(no_call < call, '%s (no call) < %s (call)' % (no_call, call))
32 | 
33 | 
34 | def test_startswith_vs_regex():
35 |     """Can I beat the speed of regexes by special-casing literals?"""
36 |     re_time = timeit(
37 |         'r.match(t, 19)',
38 |         'import re\n'
39 |         "r = re.compile('hello')\n"
40 |         "t = 'this is the finest hello ever'")
41 |     startswith_time = timeit("t.startswith('hello', 19)",
42 |                              "t = 'this is the finest hello ever'")
43 | 
44 |     # Regexes take 2.24x as long as simple string matching.
45 |     ok_(startswith_time < re_time,
46 |         '%s (startswith) < %s (re)' % (startswith_time, re_time))
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from sys import version_info
2 | 
3 | # Prevent spurious errors during `python setup.py test` in 2.6, a la
4 | # http://www.eby-sarna.com/pipermail/peak/2010-May/003357.html:
5 | try:
6 |     import multiprocessing
7 | except ImportError:
8 |     pass
9 | 
10 | from io import open
11 | from setuptools import setup, find_packages
12 | 
13 | long_description = open('README.rst', 'r', encoding='utf8').read()
14 | 
15 | setup(
16 |     name='parsimonious',
17 |     version='0.7.0',
18 |     description='(Soon to be) the fastest pure-Python PEG parser I could muster',
19 |     long_description=long_description,
20 |     author='Erik Rose',
21 |     author_email='erikrose@grinchcentral.com',
22 |     license='MIT',
23 |     packages=find_packages(exclude=['ez_setup']),
24 |     tests_require=['nose'],
25 |     test_suite='nose.collector',
26 |     url='https://github.com/erikrose/parsimonious',
27 |     include_package_data=True,
28 |     install_requires=['six'],
29 |     classifiers=[
30 |         'Intended Audience :: Developers',
31 |         'Natural Language :: English',
32 |         'Development Status :: 3 - Alpha',
33 |         'License :: OSI Approved :: MIT License',
34 |         'Operating System :: OS Independent',
35 |         'Programming Language :: Python :: 2',
36 |         'Programming Language :: Python :: 2.6',
37 |         'Programming Language :: Python :: 2.7',
38 |         'Programming Language :: Python :: 3',
39 |         'Programming Language :: Python :: 3.3',
40 |         'Programming Language :: Python :: 3.4',
41 |         'Programming Language :: Python :: 3.5',
42 |         'Topic :: Scientific/Engineering :: Information Analysis',
43 |         'Topic :: Software Development :: Libraries',
44 |         'Topic :: Text Processing :: General'],
45 |     keywords=['parse', 'parser', 'parsing', 'peg', 'packrat', 'grammar', 'language'],
46 | )
47 | 
--------------------------------------------------------------------------------
/parsimonious/tests/benchmarks.py:
--------------------------------------------------------------------------------
1 | """Benchmarks for Parsimonious
2 | 
3 | Run these with ``nosetests parsimonious/tests/benchmarks.py``. They don't run
4 | during normal test runs because they're not tests--they don't assert anything.
5 | Also, they're a bit slow.
6 | 
7 | These differ from the ones in test_benchmarks in that these are meant to be
8 | compared from revision to revision of Parsimonious to make sure we're not
9 | getting slower. test_benchmarks simply makes sure our choices among
10 | implementation alternatives remain valid.
11 | 
12 | """
13 | # These aren't really tests, as they don't assert anything, but I found myself
14 | # rewriting nose's discovery and selection bits, so why not just use nose?
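# A quick way to time a single grammar by hand (a sketch; this one-rule
# grammar is a stand-in, not part of this module):
#
#     from timeit import repeat
#     from parsimonious.grammar import Grammar
#     g = Grammar('greeting = "hi"')
#     print(min(repeat(lambda: g.parse('hi'), repeat=5, number=1000)))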
15 | 
16 | import gc
17 | from timeit import repeat
18 | 
19 | from parsimonious.grammar import Grammar
20 | 
21 | 
22 | def test_not_really_json_parsing():
23 |     """As a baseline for speed, parse some JSON.
24 | 
25 |     I have no reason to believe that JSON is a particularly representative or
26 |     revealing grammar to test with. Also, this is a naive, unoptimized,
27 |     incorrect grammar, so don't use it as a basis for comparison with other
28 |     parsers. It's just meant to compare across versions of Parsimonious.
29 | 
30 |     """
31 |     father = """{
32 |         "id" : 1,
33 |         "married" : true,
34 |         "name" : "Larry Lopez",
35 |         "sons" : null,
36 |         "daughters" : [
37 |           {
38 |             "age" : 26,
39 |             "name" : "Sandra"
40 |           },
41 |           {
42 |             "age" : 25,
43 |             "name" : "Margaret"
44 |           },
45 |           {
46 |             "age" : 6,
47 |             "name" : "Mary"
48 |           }
49 |         ]
50 |       }"""
51 |     more_fathers = ','.join([father] * 60)
52 |     json = '{"fathers" : [' + more_fathers + ']}'
53 |     grammar = Grammar(r"""
54 |         value = space (string / number / object / array / true_false_null)
55 |                 space
56 | 
57 |         object = "{" members "}"
58 |         members = (pair ("," pair)*)?
59 |         pair = string ":" value
60 |         array = "[" elements "]"
61 |         elements = (value ("," value)*)?
62 |         true_false_null = "true" / "false" / "null"
63 | 
64 |         string = space "\"" chars "\"" space
65 |         chars = ~"[^\"]*"  # TODO implement the real thing
66 |         number = (int frac exp) / (int exp) / (int frac) / int
67 |         int = "-"? ((digit1to9 digits) / digit)
68 |         frac = "." digits
69 |         exp = e digits
70 |         digits = digit+
71 |         e = "e+" / "e-" / "e" / "E+" / "E-" / "E"
72 | 
73 |         digit1to9 = ~"[1-9]"
74 |         digit = ~"[0-9]"
75 |         space = ~"\s*"
76 |     """)
77 | 
78 |     # These number and repetition values seem to keep results within 5% of the
79 |     # difference between min and max. We get more consistent results running a
80 |     # bunch of single-parse tests and taking the min rather than upping the
81 |     # NUMBER and trying to stomp out the outliers with averaging.
82 |     NUMBER = 1
83 |     REPEAT = 5
84 |     total_seconds = min(repeat(lambda: grammar.parse(json),
85 |                                lambda: gc.enable(),  # so we take into account how we treat the GC
86 |                                repeat=REPEAT,
87 |                                number=NUMBER))
88 |     seconds_each = total_seconds / NUMBER
89 | 
90 |     kb = len(json) / 1024.0
91 |     print('Took %.3fs to parse %.1fKB: %.0fKB/s.' % (seconds_each,
92 |                                                      kb,
93 |                                                      kb / seconds_each))
94 | 
--------------------------------------------------------------------------------
/parsimonious/exceptions.py:
--------------------------------------------------------------------------------
1 | from six import text_type, python_2_unicode_compatible
2 | 
3 | from parsimonious.utils import StrAndRepr
4 | 
5 | 
6 | @python_2_unicode_compatible
7 | class ParseError(StrAndRepr, Exception):
8 |     """A call to ``Expression.parse()`` or ``match()`` didn't match."""
9 | 
10 |     def __init__(self, text, pos=-1, expr=None):
11 |         # It would be nice to use self.args, but I don't want to pay a penalty
12 |         # to call descriptors or have the confusion of numerical indices in
13 |         # Expression.match_core().
14 |         self.text = text
15 |         self.pos = pos
16 |         self.expr = expr
17 | 
18 |     def __str__(self):
19 |         rule_name = ((u"'%s'" % self.expr.name) if self.expr.name else
20 |                      text_type(self.expr))
21 |         return u"Rule %s didn't match at '%s' (line %s, column %s)." % (
22 |             rule_name,
23 |             self.text[self.pos:self.pos + 20],
24 |             self.line(),
25 |             self.column())
26 | 
27 |     # TODO: Add line, col, and separated-out error message so callers can build
28 |     # their own presentation.
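    # For example (a sketch; the one-rule grammar is hypothetical):
    #
    #   try:
    #       Grammar('greeting = "hi"').parse('yo')
    #   except ParseError as error:
    #       error.pos, error.line(), error.column()  # 0, 1, 1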
29 | 
30 |     def line(self):
31 |         """Return the 1-based line number where the expression ceased to
32 |         match."""
33 |         # This is a method rather than a property in case we ever wanted to
34 |         # pass in which line endings we want to use.
35 |         return self.text.count('\n', 0, self.pos) + 1
36 | 
37 |     def column(self):
38 |         """Return the 1-based column where the expression ceased to match."""
39 |         # We choose 1-based because that's what Python does with SyntaxErrors.
40 |         try:
41 |             return self.pos - self.text.rindex('\n', 0, self.pos)
42 |         except ValueError:
43 |             return self.pos + 1
44 | 
45 | 
46 | @python_2_unicode_compatible
47 | class IncompleteParseError(ParseError):
48 |     """A call to ``parse()`` matched a whole Expression but did not consume the
49 |     entire text."""
50 | 
51 |     def __str__(self):
52 |         return u"Rule '%s' matched in its entirety, but it didn't consume all the text. The non-matching portion of the text begins with '%s' (line %s, column %s)." % (
53 |             self.expr.name,
54 |             self.text[self.pos:self.pos + 20],
55 |             self.line(),
56 |             self.column())
57 | 
58 | 
59 | class VisitationError(Exception):
60 |     """Something went wrong while traversing a parse tree.
61 | 
62 |     This exception exists to augment an underlying exception with information
63 |     about where in the parse tree the error occurred. Otherwise, it could be
64 |     tiresome to figure out what went wrong; you'd have to play back the whole
65 |     tree traversal in your head.
66 | 
67 |     """
68 |     # TODO: Make sure this is pickleable. Probably use @property pattern. Make
69 |     # the original exc and node available on it if they don't cause a whole
70 |     # raft of stack frames to be retained.
71 |     def __init__(self, exc, exc_class, node):
72 |         """Construct.
73 | 
74 |         :arg exc: What went wrong. We wrap this and add more info.
75 |         :arg node: The node at which the error occurred
76 | 
77 |         """
78 |         self.original_class = exc_class
79 |         super(VisitationError, self).__init__(
80 |             '%s: %s\n\n'
81 |             'Parse tree:\n'
82 |             '%s' %
83 |             (exc_class.__name__,
84 |              exc,
85 |              node.prettily(error=node)))
86 | 
87 | 
88 | class BadGrammar(StrAndRepr, Exception):
89 |     """Something was wrong with the definition of a grammar.
90 | 
91 |     Note that a ParseError might be raised instead if the error is in the
92 |     grammar definition syntax.
93 | 
94 |     """
95 | 
96 | 
97 | @python_2_unicode_compatible
98 | class UndefinedLabel(BadGrammar):
99 |     """A rule referenced in a grammar was never defined.
100 | 
101 |     Circular references and forward references are okay, but you have to define
102 |     stuff at some point.
103 | 
104 |     """
105 |     def __init__(self, label):
106 |         self.label = label
107 | 
108 |     def __str__(self):
109 |         return u'The label "%s" was never defined.' % self.label
110 | 
--------------------------------------------------------------------------------
/parsimonious/tests/test_nodes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from nose import SkipTest
3 | from nose.tools import eq_, ok_, assert_raises
4 | 
5 | from parsimonious import Grammar, NodeVisitor, VisitationError, rule
6 | from parsimonious.nodes import Node
7 | 
8 | 
9 | class HtmlFormatter(NodeVisitor):
10 |     """Visitor that turns a parse tree into HTML fragments"""
11 | 
12 |     grammar = Grammar("""bold_open = '(('""")  # just partial
13 | 
14 |     def visit_bold_open(self, node, visited_children):
15 |         return '<b>'
16 | 
17 |     def visit_bold_close(self, node, visited_children):
18 |         return '</b>'
19 | 
20 |     def visit_text(self, node, visited_children):
21 |         """Return the text verbatim."""
22 |         return node.text
23 | 
24 |     def visit_bold_text(self, node, visited_children):
25 |         return ''.join(visited_children)
26 | 
27 | 
28 | class ExplosiveFormatter(NodeVisitor):
29 |     """Visitor which raises exceptions"""
30 | 
31 |     def visit_boom(self, node, visited_children):
32 |         raise ValueError
33 | 
34 | 
35 | def test_visitor():
36 |     """Assert a tree gets visited correctly.
37 | 
38 |     We start with a tree from applying this grammar... ::
39 | 
40 |         bold_text  = bold_open text bold_close
41 |         text       = ~'[a-zA-Z 0-9]*'
42 |         bold_open  = '(('
43 |         bold_close = '))'
44 | 
45 |     ...to this text::
46 | 
47 |         ((o hai))
48 | 
49 |     """
50 |     text = '((o hai))'
51 |     tree = Node('bold_text', text, 0, 9,
52 |                 [Node('bold_open', text, 0, 2),
53 |                  Node('text', text, 2, 7),
54 |                  Node('bold_close', text, 7, 9)])
55 |     result = HtmlFormatter().visit(tree)
56 |     eq_(result, '<b>o hai</b>')
57 | 
58 | 
59 | def test_visitation_exception():
60 |     assert_raises(VisitationError,
61 |                   ExplosiveFormatter().visit,
62 |                   Node('boom', '', 0, 0))
63 | 
64 | 
65 | def test_str():
66 |     """Test str and unicode of ``Node``."""
67 |     n = Node('text', 'o hai', 0, 5)
68 |     good = '<Node called "text" matching "o hai">'
69 |     eq_(str(n), good)
70 | 
71 | 
72 | def test_repr():
73 |     """Test repr of ``Node``."""
74 |     s = u'hai ö'
75 |     boogie = u'böogie'
76 |     n = Node(boogie, s, 0, 3, children=[
77 |         Node('', s, 3, 4), Node('', s, 4, 5)])
78 |     eq_(repr(n), """s = {hai_o}\nNode({boogie}, s, 0, 3, children=[Node('', s, 3, 4), Node('', s, 4, 5)])""".format(hai_o=repr(s), boogie=repr(boogie)))
79 | 
80 | 
81 | def test_parse_shortcut():
82 |     """Exercise the simple case in which the visitor takes care of parsing."""
83 |     eq_(HtmlFormatter().parse('(('), '<b>')
84 | 
85 | 
86 | def test_match_shortcut():
87 |     """Exercise the simple case in which the visitor takes care of matching."""
88 |     eq_(HtmlFormatter().match('((other things'), '<b>')
89 | 
90 | 
91 | class CoupledFormatter(NodeVisitor):
92 |     @rule('bold_open text bold_close')
93 |     def visit_bold_text(self, node, visited_children):
94 |         return ''.join(visited_children)
95 | 
96 |     @rule('"(("')
97 |     def visit_bold_open(self, node, visited_children):
98 |         return '<b>'
99 | 
100 |     @rule('"))"')
101 |     def visit_bold_close(self, node, visited_children):
102 |         return '</b>'
103 | 
104 |     @rule('~"[a-zA-Z 0-9]*"')
105 |     def visit_text(self, node, visited_children):
106 |         """Return the text verbatim."""
107 |         return node.text
108 | 
109 | 
110 | def test_rule_decorator():
111 |     """Make sure the @rule decorator works."""
112 |     eq_(CoupledFormatter().parse('((hi))'), '<b>hi</b>')
113 | 
114 | 
115 | def test_rule_decorator_subclassing():
116 |     """Make sure we can subclass and override visitor methods without blowing
117 |     away the rules attached to them."""
118 |     class OverridingFormatter(CoupledFormatter):
119 |         def visit_text(self, node, visited_children):
120 |             """Return the text capitalized."""
121 |             return node.text.upper()
122 | 
123 |         @rule('"not used"')
124 |         def visit_useless(self, node, visited_children):
125 |             """Get in the way. Tempt the metaclass to pave over the
126 |             superclass's grammar with a new one."""
127 | 
128 |     raise SkipTest("I haven't got around to making this work yet.")
129 |     eq_(OverridingFormatter().parse('((hi))'), '<b>HI</b>')
130 | 
131 | 
132 | class PrimalScream(Exception):
133 |     pass
134 | 
135 | 
136 | def test_unwrapped_exceptions():
137 |     class Screamer(NodeVisitor):
138 |         grammar = Grammar("""greeting = 'howdy'""")
139 |         unwrapped_exceptions = (PrimalScream,)
140 | 
141 |         def visit_greeting(self, thing, visited_children):
142 |             raise PrimalScream('This should percolate up!')
143 | 
144 |     assert_raises(PrimalScream, Screamer().parse, 'howdy')
145 | 
146 | 
147 | def test_node_inequality():
148 |     node = Node('text', 'o hai', 0, 5)
149 |     ok_(node != 5)
150 |     ok_(node != None)
151 | 
--------------------------------------------------------------------------------
/parsimonious/tests/test_expressions.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | from unittest import TestCase
3 | 
4 | from nose.tools import eq_, ok_, assert_raises
5 | from six import text_type
6 | 
7 | from parsimonious.exceptions import ParseError, IncompleteParseError
8 | from parsimonious.expressions import (Literal, Regex, Sequence, OneOf, Not,
9 |                                       Optional, ZeroOrMore, OneOrMore, Expression)
10 | from parsimonious.grammar import Grammar, rule_grammar
11 | from parsimonious.nodes import Node
12 | 
13 | 
14 | def len_eq(node, length):
15 |     """Assert that the match length of ``node`` is ``length``.
16 | 
17 |     Makes tests shorter and lets them omit positional stuff they don't care
18 |     about.
19 | 
20 |     """
21 |     node_length = None if node is None else node.end - node.start
22 |     assert node_length == length
23 | 
24 | 
25 | class LengthTests(TestCase):
26 |     """Tests for returning the right lengths
27 | 
28 |     I wrote these before parse tree generation was implemented. They're
29 |     partially redundant with TreeTests.
30 | 31 | """ 32 | def test_regex(self): 33 | len_eq(Literal('hello').match('ehello', 1), 5) # simple 34 | len_eq(Regex('hello*').match('hellooo'), 7) # * 35 | assert_raises(ParseError, Regex('hello*').match, 'goodbye') # no match 36 | len_eq(Regex('hello', ignore_case=True).match('HELLO'), 5) 37 | 38 | def test_sequence(self): 39 | len_eq(Sequence(Regex('hi*'), Literal('lo'), Regex('.ingo')).match('hiiiilobingo1234'), 40 | 12) # succeed 41 | assert_raises(ParseError, Sequence(Regex('hi*'), Literal('lo'), Regex('.ingo')).match, 'hiiiilobing') # don't 42 | len_eq(Sequence(Regex('hi*')).match('>hiiii', 1), 43 | 5) # non-0 pos 44 | 45 | def test_one_of(self): 46 | len_eq(OneOf(Literal('aaa'), Literal('bb')).match('aaa'), 3) # first alternative 47 | len_eq(OneOf(Literal('aaa'), Literal('bb')).match('bbaaa'), 2) # second 48 | assert_raises(ParseError, OneOf(Literal('aaa'), Literal('bb')).match, 'aa') # no match 49 | 50 | def test_not(self): 51 | len_eq(Not(Regex('.')).match(''), 0) # match 52 | assert_raises(ParseError, Not(Regex('.')).match, 'Hi') # don't 53 | 54 | def test_optional(self): 55 | len_eq(Sequence(Optional(Literal('a')), Literal('b')).match('b'), 1) # contained expr fails 56 | len_eq(Sequence(Optional(Literal('a')), Literal('b')).match('ab'), 2) # contained expr succeeds 57 | 58 | def test_zero_or_more(self): 59 | len_eq(ZeroOrMore(Literal('b')).match(''), 0) # zero 60 | len_eq(ZeroOrMore(Literal('b')).match('bbb'), 3) # more 61 | 62 | len_eq(Regex('^').match(''), 0) # Validate the next test. 63 | 64 | # Try to make it loop infinitely using a zero-length contained expression: 65 | len_eq(ZeroOrMore(Regex('^')).match(''), 0) 66 | 67 | def test_one_or_more(self): 68 | len_eq(OneOrMore(Literal('b')).match('b'), 1) # one 69 | len_eq(OneOrMore(Literal('b')).match('bbb'), 3) # more 70 | len_eq(OneOrMore(Literal('b'), min=3).match('bbb'), 3) # with custom min; success 71 | assert_raises(ParseError, OneOrMore(Literal('b'), min=3).match, 'bb') # with custom min; failure 72 | len_eq(OneOrMore(Regex('^')).match('bb'), 0) # attempt infinite loop 73 | 74 | 75 | class TreeTests(TestCase): 76 | """Tests for building the right trees 77 | 78 | We have only to test successes here; failures (None-returning cases) are 79 | covered above. 
80 | 81 | """ 82 | def test_simple_node(self): 83 | """Test that leaf expressions like ``Literal`` make the right nodes.""" 84 | h = Literal('hello', name='greeting') 85 | eq_(h.match('hello'), Node('greeting', 'hello', 0, 5)) 86 | 87 | def test_sequence_nodes(self): 88 | """Assert that ``Sequence`` produces nodes with the right children.""" 89 | s = Sequence(Literal('heigh', name='greeting1'), 90 | Literal('ho', name='greeting2'), name='dwarf') 91 | text = 'heighho' 92 | eq_(s.match(text), Node('dwarf', text, 0, 7, children= 93 | [Node('greeting1', text, 0, 5), 94 | Node('greeting2', text, 5, 7)])) 95 | 96 | def test_one_of(self): 97 | """``OneOf`` should return its own node, wrapping the child that succeeds.""" 98 | o = OneOf(Literal('a', name='lit'), name='one_of') 99 | text = 'aa' 100 | eq_(o.match(text), Node('one_of', text, 0, 1, children=[ 101 | Node('lit', text, 0, 1)])) 102 | 103 | def test_optional(self): 104 | """``Optional`` should return its own node wrapping the succeeded child.""" 105 | expr = Optional(Literal('a', name='lit'), name='opt') 106 | 107 | text = 'a' 108 | eq_(expr.match(text), Node('opt', text, 0, 1, children=[ 109 | Node('lit', text, 0, 1)])) 110 | 111 | # Test failure of the Literal inside the Optional; the 112 | # LengthTests.test_optional is ambiguous for that. 113 | text = '' 114 | eq_(expr.match(text), Node('opt', text, 0, 0)) 115 | 116 | def test_zero_or_more_zero(self): 117 | """Test the 0 case of ``ZeroOrMore``; it should still return a node.""" 118 | expr = ZeroOrMore(Literal('a'), name='zero') 119 | text = '' 120 | eq_(expr.match(text), Node('zero', text, 0, 0)) 121 | 122 | def test_one_or_more_one(self): 123 | """Test the 1 case of ``OneOrMore``; it should return a node with a child.""" 124 | expr = OneOrMore(Literal('a', name='lit'), name='one') 125 | text = 'a' 126 | eq_(expr.match(text), Node('one', text, 0, 1, children=[ 127 | Node('lit', text, 0, 1)])) 128 | 129 | # Things added since Grammar got implemented are covered in integration 130 | # tests in test_grammar. 131 | 132 | 133 | class ParseTests(TestCase): 134 | """Tests for the ``parse()`` method""" 135 | 136 | def test_parse_success(self): 137 | """Make sure ``parse()`` returns the tree on success. 138 | 139 | There's not much more than that to test that we haven't already vetted 140 | above. 141 | 142 | """ 143 | expr = OneOrMore(Literal('a', name='lit'), name='more') 144 | text = 'aa' 145 | eq_(expr.parse(text), Node('more', text, 0, 2, children=[ 146 | Node('lit', text, 0, 1), 147 | Node('lit', text, 1, 2)])) 148 | 149 | 150 | class ErrorReportingTests(TestCase): 151 | """Tests for reporting parse errors""" 152 | 153 | def test_inner_rule_succeeding(self): 154 | """Make sure ``parse()`` fails and blames the 155 | rightward-progressing-most named Expression when an Expression isn't 156 | satisfied. 157 | 158 | Make sure ParseErrors have nice Unicode representations. 159 | 160 | """ 161 | grammar = Grammar(""" 162 | bold_text = open_parens text close_parens 163 | open_parens = "((" 164 | text = ~"[a-zA-Z]+" 165 | close_parens = "))" 166 | """) 167 | text = '((fred!!' 168 | try: 169 | grammar.parse(text) 170 | except ParseError as error: 171 | eq_(error.pos, 6) 172 | eq_(error.expr, grammar['close_parens']) 173 | eq_(error.text, text) 174 | eq_(text_type(error), "Rule 'close_parens' didn't match at '!!' 
(line 1, column 7).")
175 | 
176 |     def test_rewinding(self):
177 |         """Make sure rewinding the stack and trying an alternative (which
178 |         progresses farther) from a higher-level rule can blame an expression
179 |         within the alternative on failure.
180 | 
181 |         There's no particular reason I suspect this wouldn't work, but it's a
182 |         more real-world example than the no-alternative cases already tested.
183 | 
184 |         """
185 |         grammar = Grammar("""
186 |             formatted_text = bold_text / weird_text
187 |             bold_text = open_parens text close_parens
188 |             weird_text = open_parens text "!!" bork
189 |             bork = "bork"
190 |             open_parens = "(("
191 |             text = ~"[a-zA-Z]+"
192 |             close_parens = "))"
193 |             """)
194 |         text = '((fred!!'
195 |         try:
196 |             grammar.parse(text)
197 |         except ParseError as error:
198 |             eq_(error.pos, 8)
199 |             eq_(error.expr, grammar['bork'])
200 |             eq_(error.text, text)
201 | 
202 |     def test_no_named_rule_succeeding(self):
203 |         """Make sure ParseErrors have sane printable representations even if we
204 |         never succeeded in matching any named expressions."""
205 |         grammar = Grammar('''bork = "bork"''')
206 |         try:
207 |             grammar.parse('snork')
208 |         except ParseError as error:
209 |             eq_(error.pos, 0)
210 |             eq_(error.expr, grammar['bork'])
211 |             eq_(error.text, 'snork')
212 | 
213 |     def test_parse_with_leftovers(self):
214 |         """Make sure ``parse()`` reports where we started failing to match,
215 |         even if a partial match was successful."""
216 |         grammar = Grammar(r'''sequence = "chitty" (" " "bang")+''')
217 |         try:
218 |             grammar.parse('chitty bangbang')
219 |         except IncompleteParseError as error:
220 |             eq_(text_type(error), u"Rule 'sequence' matched in its entirety, but it didn't consume all the text. The non-matching portion of the text begins with 'bang' (line 1, column 12).")
221 | 
222 |     def test_favoring_named_rules(self):
223 |         """Named rules should be used in error messages in favor of anonymous
224 |         ones, even if those are rightward-progressing-more, and even if the
225 |         failure starts at position 0."""
226 |         grammar = Grammar(r'''starts_with_a = &"a" ~"[a-z]+"''')
227 |         try:
228 |             grammar.parse('burp')
229 |         except ParseError as error:
230 |             eq_(text_type(error), u"Rule 'starts_with_a' didn't match at 'burp' (line 1, column 1).")
231 | 
232 |     def test_line_and_column(self):
233 |         """Make sure we got the line and column computation right."""
234 |         grammar = Grammar(r"""
235 |             whee_lah = whee "\n" lah "\n"
236 |             whee = "whee"
237 |             lah = "lah"
238 |             """)
239 |         try:
240 |             grammar.parse('whee\nlahGOO')
241 |         except ParseError as error:
242 |             # TODO: Right now, this says "Rule <Literal "\n" at 0x...>
243 |             # didn't match". That's not the greatest. Fix that, then fix this.
244 |             ok_(text_type(error).endswith(r"""didn't match at 'GOO' (line 2, column 4)."""))
245 | 
246 | 
247 | class RepresentationTests(TestCase):
248 |     """Tests for str(), unicode(), and repr() of expressions"""
249 | 
250 |     def test_unicode_crash(self):
251 |         """Make sure matched unicode strings don't crash ``__str__``."""
252 |         grammar = Grammar(r'string = ~r"\S+"u')
253 |         str(grammar.parse(u'中文'))
254 | 
255 |     def test_unicode(self):
256 |         """Smoke-test the conversion of expressions to bits of rules.
257 | 
258 |         A slightly more comprehensive test of the actual values is in
259 |         ``GrammarTests.test_unicode``.
260 | 
261 |         """
262 |         text_type(rule_grammar)
263 | 
264 |     def test_unicode_keep_parens(self):
265 |         """Make sure converting an expression to unicode doesn't strip
266 |         parentheses.
267 | 268 | """ 269 | # ZeroOrMore 270 | eq_(text_type(Grammar('foo = "bar" ("baz" "eggs")* "spam"')), 271 | u'foo = "bar" ("baz" "eggs")* "spam"') 272 | 273 | # OneOf 274 | eq_(text_type(Grammar('foo = "bar" ("baz" / "eggs") "spam"')), 275 | u'foo = "bar" ("baz" / "eggs") "spam"') 276 | 277 | # Lookahead 278 | eq_(text_type(Grammar('foo = "bar" &("baz" "eggs") "spam"')), 279 | u'foo = "bar" &("baz" "eggs") "spam"') 280 | 281 | # Multiple sequences 282 | eq_(text_type(Grammar('foo = ("bar" "baz") / ("baff" "bam")')), 283 | u'foo = ("bar" "baz") / ("baff" "bam")') 284 | 285 | def test_unicode_surrounding_parens(self): 286 | """ 287 | Make sure there are no surrounding parens around the entire 288 | right-hand side of an expression (as they're unnecessary). 289 | 290 | """ 291 | eq_(text_type(Grammar('foo = ("foo" ("bar" "baz"))')), 292 | u'foo = "foo" ("bar" "baz")') 293 | 294 | 295 | class SlotsTests(TestCase): 296 | """Tests to do with __slots__""" 297 | 298 | def test_subclassing(self): 299 | """Make sure a subclass of a __slots__-less class can introduce new 300 | slots itself. 301 | 302 | This isn't supposed to work, according to the language docs: 303 | 304 | When inheriting from a class without __slots__, the __dict__ 305 | attribute of that class will always be accessible, so a __slots__ 306 | definition in the subclass is meaningless. 307 | 308 | But it does. 309 | 310 | """ 311 | class Smoo(Optional): 312 | __slots__ = ['smoo'] 313 | 314 | def __init__(self): 315 | self.smoo = 'smoo' 316 | 317 | smoo = Smoo() 318 | eq_(smoo.__dict__, {}) # has a __dict__ but with no smoo in it 319 | eq_(smoo.smoo, 'smoo') # The smoo attr ended up in a slot. 320 | -------------------------------------------------------------------------------- /parsimonious/nodes.py: -------------------------------------------------------------------------------- 1 | """Nodes that make up parse trees 2 | 3 | Parsing spits out a tree of these, which you can then tell to walk itself and 4 | spit out a useful value. Or you can walk it yourself; the structural attributes 5 | are public. 6 | 7 | """ 8 | # TODO: If this is slow, think about using cElementTree or something. 9 | from inspect import isfunction 10 | from sys import version_info, exc_info 11 | 12 | from six import reraise, python_2_unicode_compatible, with_metaclass, \ 13 | iteritems 14 | 15 | from parsimonious.exceptions import VisitationError, UndefinedLabel 16 | from parsimonious.utils import StrAndRepr 17 | 18 | 19 | @python_2_unicode_compatible 20 | class Node(StrAndRepr): 21 | """A parse tree node 22 | 23 | Consider these immutable once constructed. As a side effect of a 24 | memory-saving strategy in the cache, multiple references to a single 25 | ``Node`` might be returned in a single parse tree. So, if you start 26 | messing with one, you'll see surprising parallel changes pop up elsewhere. 27 | 28 | My philosophy is that parse trees (and their nodes) should be 29 | representation-agnostic. That is, they shouldn't get all mixed up with what 30 | the final rendered form of a wiki page (or the intermediate representation 31 | of a programming language, or whatever) is going to be: you should be able 32 | to parse once and render several representations from the tree, one after 33 | another. 34 | 35 | """ 36 | # I tried making this subclass list, but it got ugly. I had to construct 37 | # invalid ones and patch them up later, and there were other problems. 
38 | __slots__ = ['expr_name', # The name of the expression that generated me 39 | 'full_text', # The full text fed to the parser 40 | 'start', # The position in the text where that expr started matching 41 | 'end', # The position after start where the expr first didn't 42 | # match. [start:end] follow Python slice conventions. 43 | 'children'] # List of child parse tree nodes 44 | 45 | def __init__(self, expr_name, full_text, start, end, children=None): 46 | self.expr_name = expr_name 47 | self.full_text = full_text 48 | self.start = start 49 | self.end = end 50 | self.children = children or [] 51 | 52 | def __iter__(self): 53 | """Support looping over my children and doing tuple unpacks on me. 54 | 55 | It can be very handy to unpack nodes in arg lists; see 56 | :class:`PegVisitor` for an example. 57 | 58 | """ 59 | return iter(self.children) 60 | 61 | @property 62 | def text(self): 63 | """Return the text this node matched.""" 64 | return self.full_text[self.start:self.end] 65 | 66 | # From here down is just stuff for testing and debugging. 67 | 68 | def prettily(self, error=None): 69 | """Return a unicode, pretty-printed representation of me. 70 | 71 | :arg error: The node to highlight because an error occurred there 72 | 73 | """ 74 | # TODO: If a Node appears multiple times in the tree, we'll point to 75 | # them all. Whoops. 76 | def indent(text): 77 | return '\n'.join((' ' + line) for line in text.splitlines()) 78 | ret = [u'<%s%s matching "%s">%s' % ( 79 | self.__class__.__name__, 80 | (' called "%s"' % self.expr_name) if self.expr_name else '', 81 | self.text, 82 | ' <-- *** We were here. ***' if error is self else '')] 83 | for n in self: 84 | ret.append(indent(n.prettily(error=error))) 85 | return '\n'.join(ret) 86 | 87 | def __str__(self): 88 | """Return a compact, human-readable representation of me.""" 89 | return self.prettily() 90 | 91 | def __eq__(self, other): 92 | """Support by-value deep comparison with other nodes for testing.""" 93 | if not isinstance(other, Node): 94 | return NotImplemented 95 | 96 | return (self.expr_name == other.expr_name and 97 | self.full_text == other.full_text and 98 | self.start == other.start and 99 | self.end == other.end and 100 | self.children == other.children) 101 | 102 | def __ne__(self, other): 103 | return not self == other 104 | 105 | def __repr__(self, top_level=True): 106 | """Return a bit of code (though not an expression) that will recreate 107 | me.""" 108 | # repr() of unicode flattens everything out to ASCII, so we don't need 109 | # to explicitly encode things afterward. 110 | ret = ["s = %r" % self.full_text] if top_level else [] 111 | ret.append("%s(%r, s, %s, %s%s)" % ( 112 | self.__class__.__name__, 113 | self.expr_name, 114 | self.start, 115 | self.end, 116 | (', children=[%s]' % 117 | ', '.join([c.__repr__(top_level=False) for c in self.children])) 118 | if self.children else '')) 119 | return '\n'.join(ret) 120 | 121 | 122 | class RegexNode(Node): 123 | """Node returned from a ``Regex`` expression 124 | 125 | Grants access to the ``re.Match`` object, in case you want to access 126 | capturing groups, etc. 
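
    For example (a sketch)::

        node = Regex(r'(?P<word>[a-z]+)', name='word').match('hello!')
        node.match.group('word')  # 'hello'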
127 | 
128 |     """
129 |     __slots__ = ['match']
130 | 
131 | 
132 | class RuleDecoratorMeta(type):
133 |     def __new__(metaclass, name, bases, namespace):
134 |         def unvisit(name):
135 |             """Remove any leading "visit_" from a method name."""
136 |             return name[6:] if name.startswith('visit_') else name
137 | 
138 |         methods = [v for k, v in iteritems(namespace) if
139 |                    hasattr(v, '_rule') and isfunction(v)]
140 |         if methods:
141 |             from parsimonious.grammar import Grammar  # circular import dodge
142 | 
143 |             methods.sort(key=(lambda x: x.func_code.co_firstlineno)
144 |                              if version_info[0] < 3 else
145 |                              (lambda x: x.__code__.co_firstlineno))
146 |             # Possible enhancement: once we get the Grammar extensibility story
147 |             # solidified, we can have @rules *add* to the default grammar
148 |             # rather than pave over it.
149 |             namespace['grammar'] = Grammar(
150 |                 '\n'.join('{name} = {expr}'.format(name=unvisit(m.__name__),
151 |                                                    expr=m._rule)
152 |                           for m in methods))
153 |         return super(RuleDecoratorMeta,
154 |                      metaclass).__new__(metaclass, name, bases, namespace)
155 | 
156 | 
157 | class NodeVisitor(with_metaclass(RuleDecoratorMeta, object)):
158 |     """A shell for writing things that turn parse trees into something useful
159 | 
160 |     Performs a depth-first traversal of an AST. Subclass this, add methods for
161 |     each expr you care about, instantiate, and call
162 |     ``visit(top_node_of_parse_tree)``. It'll return the useful stuff. This API
163 |     is very similar to that of ``ast.NodeVisitor``.
164 | 
165 |     These could easily all be static methods, but that would add at least as
166 |     much weirdness at the call site as the ``()`` for instantiation. And this
167 |     way, we support subclasses that require state: options, for example, or a
168 |     symbol table constructed from a programming language's AST.
169 | 
170 |     We never transform the parse tree in place, because...
171 | 
172 |     * There are likely multiple references to the same ``Node`` object in a
173 |       parse tree, and changes to one reference would surprise you elsewhere.
174 |     * It makes it impossible to report errors: you'd end up with the "error"
175 |       arrow pointing someplace in a half-transformed mishmash of nodes--and
176 |       that's assuming you're even transforming the tree into another tree.
177 |       Heaven forbid you're making it into a string or something else.
178 | 
179 |     """
180 | 
181 |     #: The :term:`default grammar`: the one recommended for use with this
182 |     #: visitor. If you populate this, you will be able to call
183 |     #: :meth:`NodeVisitor.parse()` as a shortcut.
184 |     grammar = None
185 | 
186 |     #: Classes of exceptions you actually intend to raise during visitation
187 |     #: and which should propagate out of the visitor. These will not be
188 |     #: wrapped in a VisitationError when they arise.
189 |     unwrapped_exceptions = ()
190 | 
191 |     # TODO: If we need to optimize this, we can go back to putting subclasses
192 |     # in charge of visiting children; they know when not to bother. Or we can
193 |     # mark nodes as not descent-worthy in the grammar.
194 |     def visit(self, node):
195 |         """Walk a parse tree, transforming it into another representation.
196 | 
197 |         Recursively descend a parse tree, dispatching to the method named after
198 |         the rule in the :class:`~parsimonious.grammar.Grammar` that produced
199 |         each node. If, for example, a rule was... ::
200 | 
201 |             bold = '<b>'
202 | 
203 |         ...the ``visit_bold()`` method would be called. It is your
204 |         responsibility to subclass :class:`NodeVisitor` and implement those
205 |         methods.
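
        A minimal sketch (the one-rule grammar and visitor here are
        illustrative, not part of this module)::

            class Shouter(NodeVisitor):
                grammar = Grammar('greeting = "hi"')

                def visit_greeting(self, node, visited_children):
                    return node.text.upper()

            Shouter().parse('hi')  # returns 'HI'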
206 | 207 | """ 208 | method = getattr(self, 'visit_' + node.expr_name, self.generic_visit) 209 | 210 | # Call that method, and show where in the tree it failed if it blows 211 | # up. 212 | try: 213 | return method(node, [self.visit(n) for n in node]) 214 | except (VisitationError, UndefinedLabel): 215 | # Don't catch and re-wrap already-wrapped exceptions. 216 | raise 217 | except self.unwrapped_exceptions: 218 | raise 219 | except Exception: 220 | # Catch any exception, and tack on a parse tree so it's easier to 221 | # see where it went wrong. 222 | exc_class, exc, tb = exc_info() 223 | reraise(VisitationError, VisitationError(exc, exc_class, node), tb) 224 | 225 | def generic_visit(self, node, visited_children): 226 | """Default visitor method 227 | 228 | :arg node: The node we're visiting 229 | :arg visited_children: The results of visiting the children of that 230 | node, in a list 231 | 232 | I'm not sure there's an implementation of this that makes sense across 233 | all (or even most) use cases, so we leave it to subclasses to implement 234 | for now. 235 | 236 | """ 237 | raise NotImplementedError("No visitor method was defined for %s." % 238 | node.expr_name) 239 | 240 | # Convenience methods: 241 | 242 | def parse(self, text, pos=0): 243 | """Parse some text with this Visitor's default grammar. 244 | 245 | ``SomeVisitor().parse('some_string')`` is a shortcut for 246 | ``SomeVisitor().visit(some_grammar.parse('some_string'))``. 247 | 248 | """ 249 | return self._parse_or_match(text, pos, 'parse') 250 | 251 | def match(self, text, pos=0): 252 | """Parse some text with this Visitor's default grammar, but don't 253 | insist on parsing all the way to the end. 254 | 255 | ``SomeVisitor().match('some_string')`` is a shortcut for 256 | ``SomeVisitor().visit(some_grammar.match('some_string'))``. 257 | 258 | """ 259 | return self._parse_or_match(text, pos, 'match') 260 | 261 | # Internal convenience methods to help you write your own visitors: 262 | 263 | def lift_child(self, node, children): 264 | """Lift the sole child of ``node`` up to replace the node.""" 265 | first_child, = children 266 | return first_child 267 | 268 | # Private methods: 269 | 270 | def _parse_or_match(self, text, pos, method_name): 271 | """Execute a parse or match on the default grammar, followed by a 272 | visitation. 273 | 274 | Raise RuntimeError if there is no default grammar specified. 275 | 276 | """ 277 | if not self.grammar: 278 | raise RuntimeError( 279 | "The {cls}.{method}() shortcut won't work because {cls} was " 280 | "never associated with a specific " "grammar. Fill out its " 281 | "`grammar` attribute, and try again.".format( 282 | cls=self.__class__.__name__, 283 | method=method_name)) 284 | return self.visit(getattr(self.grammar, method_name)(text, pos=pos)) 285 | 286 | 287 | def rule(rule_string): 288 | """Decorate a NodeVisitor ``visit_*`` method to tie a grammar rule to it. 289 | 290 | The following will arrange for the ``visit_digit`` method to receive the 291 | results of the ``~"[0-9]"`` parse rule:: 292 | 293 | @rule('~"[0-9]"') 294 | def visit_digit(self, node, visited_children): 295 | ... 296 | 297 | Notice that there is no "digit = " as part of the rule; that gets inferred 298 | from the method name. 299 | 300 | In cases where there is only one kind of visitor interested in a grammar, 301 | using ``@rule`` saves you having to look back and forth between the visitor 302 | and the grammar definition. 
303 | 
304 |     On an implementation level, all ``@rule`` rules get stitched together into
305 |     a :class:`~parsimonious.Grammar` that becomes the NodeVisitor's
306 |     :term:`default grammar`.
307 | 
308 |     Typically, the choice of a default rule for this grammar is simple: whatever
309 |     ``@rule`` comes first in the class is the default. But the choice may become
310 |     surprising if you divide the ``@rule`` calls among subclasses. At the
311 |     moment, which method "comes first" is decided simply by comparing line
312 |     numbers, so whatever method is on the smallest-numbered line will be the
313 |     default. In a future release, this will change to pick the
314 |     first ``@rule`` call on the basemost class that has one. That way, a
315 |     subclass which does not override the default rule's ``visit_*`` method
316 |     won't unintentionally change which rule is the default.
317 | 
318 |     """
319 |     def decorator(method):
320 |         method._rule = rule_string  # XXX: Maybe register them on a class var instead so we can just override a @rule'd visitor method on a subclass without blowing away the rule string that comes with it.
321 |         return method
322 |     return decorator
323 | 
--------------------------------------------------------------------------------
/parsimonious/expressions.py:
--------------------------------------------------------------------------------
1 | """Subexpressions that make up a parsed grammar
2 | 
3 | These do the parsing.
4 | 
5 | """
6 | # TODO: Make sure all symbol refs are local--not class lookups or
7 | # anything--for speed. And kill all the dots.
8 | 
9 | from inspect import getargspec
10 | import re
11 | 
12 | from six import integer_types, python_2_unicode_compatible
13 | from six.moves import range
14 | 
15 | from parsimonious.exceptions import ParseError, IncompleteParseError
16 | from parsimonious.nodes import Node, RegexNode
17 | from parsimonious.utils import StrAndRepr
18 | 
19 | MARKER = object()
20 | 
21 | 
22 | def expression(callable, rule_name, grammar):
23 |     """Turn a plain callable into an Expression.
24 | 
25 |     The callable can be of this simple form::
26 | 
27 |         def foo(text, pos):
28 |             '''If this custom expression matches starting at text[pos], return
29 |             the index where it stops matching. Otherwise, return None.'''
30 |             if the expression matched:
31 |                 return end_pos
32 | 
33 |     If there are child nodes to return, return a tuple::
34 | 
35 |         return end_pos, children
36 | 
37 |     If the expression doesn't match at the given ``pos`` at all... ::
38 | 
39 |         return None
40 | 
41 |     If your callable needs to make sub-calls to other rules in the grammar or
42 |     do error reporting, it can take this form, gaining additional arguments::
43 | 
44 |         def foo(text, pos, cache, error, grammar):
45 |             # Call out to other rules:
46 |             node = grammar['another_rule'].match_core(text, pos, cache, error)
47 |             ...
48 |             # Return values as above.
49 | 
50 |     The return value of the callable, if an int or a tuple, will be
51 |     automatically transmuted into a :class:`~parsimonious.Node`. If it returns
52 |     a Node-like class directly, it will be passed through unchanged.
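
    For example, a sketch of hooking such a callable into a grammar --
    assuming the keyword-argument form of ``Grammar`` for custom rules;
    ``parens`` is a hypothetical rule::

        def parens(text, pos):
            if text.startswith('()', pos):
                return pos + 2

        grammar = Grammar('pair = parens "!"', parens=parens)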
53 | 54 | :arg rule_name: The rule name to attach to the resulting 55 | :class:`~parsimonious.Expression` 56 | :arg grammar: The :class:`~parsimonious.Grammar` this expression will be a 57 | part of, to make delegating to other rules possible 58 | 59 | """ 60 | num_args = len(getargspec(callable).args) 61 | if num_args == 2: 62 | is_simple = True 63 | elif num_args == 5: 64 | is_simple = False 65 | else: 66 | raise RuntimeError("Custom rule functions must take either 2 or 5 " 67 | "arguments, not %s." % num_args) 68 | 69 | class AdHocExpression(Expression): 70 | def _uncached_match(self, text, pos, cache, error): 71 | result = (callable(text, pos) if is_simple else 72 | callable(text, pos, cache, error, grammar)) 73 | 74 | if isinstance(result, integer_types): 75 | end, children = result, None 76 | elif isinstance(result, tuple): 77 | end, children = result 78 | else: 79 | # Node or None 80 | return result 81 | return Node(self.name, text, pos, end, children=children) 82 | 83 | def _as_rhs(self): 84 | return '{custom function "%s"}' % callable.__name__ 85 | 86 | return AdHocExpression(name=rule_name) 87 | 88 | 89 | @python_2_unicode_compatible 90 | class Expression(StrAndRepr): 91 | """A thing that can be matched against a piece of text""" 92 | 93 | # Slots are about twice as fast as __dict__-based attributes: 94 | # http://stackoverflow.com/questions/1336791/dictionary-vs-object-which-is-more-efficient-and-why 95 | 96 | # Top-level expressions--rules--have names. Subexpressions are named ''. 97 | __slots__ = ['name'] 98 | 99 | def __init__(self, name=''): 100 | self.name = name 101 | 102 | def parse(self, text, pos=0): 103 | """Return a parse tree of ``text``. 104 | 105 | Raise ``ParseError`` if the expression wasn't satisfied. Raise 106 | ``IncompleteParseError`` if the expression was satisfied but didn't 107 | consume the full string. 108 | 109 | """ 110 | node = self.match(text, pos=pos) 111 | if node.end < len(text): 112 | raise IncompleteParseError(text, node.end, self) 113 | return node 114 | 115 | def match(self, text, pos=0): 116 | """Return the parse tree matching this expression at the given 117 | position, not necessarily extending all the way to the end of ``text``. 118 | 119 | Raise ``ParseError`` if there is no match there. 120 | 121 | :arg pos: The index at which to start matching 122 | 123 | """ 124 | error = ParseError(text) 125 | node = self.match_core(text, pos, {}, error) 126 | if node is None: 127 | raise error 128 | return node 129 | 130 | def match_core(self, text, pos, cache, error): 131 | """Internal guts of ``match()`` 132 | 133 | This is appropriate to call only from custom rules or Expression 134 | subclasses. 135 | 136 | :arg cache: The packrat cache:: 137 | 138 | {(oid, pos): Node tree matched by object `oid` at index `pos` ...} 139 | 140 | :arg error: A ParseError instance with ``text`` already filled in but 141 | otherwise blank. We update the error reporting info on this object 142 | as we go. (Sticking references on an existing instance is faster 143 | than allocating a new one for each expression that fails.) We 144 | return None rather than raising and catching ParseErrors because 145 | catching is slow. 146 | 147 | """ 148 | # TODO: Optimize. Probably a hot spot. 149 | # 150 | # Is there a way of looking up cached stuff that's faster than hashing 151 | # this id-pos pair? 152 | # 153 | # If this is slow, think about the array module. It might (or might 154 | # not!) use more RAM, but it'll likely be faster than hashing things 155 | # all the time. 
Also, can we move all the allocs up front? 156 | # 157 | # To save space, we have lots of choices: (0) Quit caching whole Node 158 | # objects. Cache just what you need to reconstitute them. (1) Cache 159 | # only the results of entire rules, not subexpressions (probably a 160 | # horrible idea for rules that need to backtrack internally a lot). (2) 161 | # Age stuff out of the cache somehow. LRU? (3) Cuts. 162 | expr_id = id(self) 163 | node = cache.get((expr_id, pos), MARKER) # TODO: Change to setdefault to prevent infinite recursion in left-recursive rules. 164 | if node is MARKER: 165 | node = cache[(expr_id, pos)] = self._uncached_match(text, 166 | pos, 167 | cache, 168 | error) 169 | 170 | # Record progress for error reporting: 171 | if node is None and pos >= error.pos and ( 172 | self.name or getattr(error.expr, 'name', None) is None): 173 | # Don't bother reporting on unnamed expressions (unless that's all 174 | # we've seen so far), as they're hard to track down for a human. 175 | # Perhaps we could include the unnamed subexpressions later as 176 | # auxiliary info. 177 | error.expr = self 178 | error.pos = pos 179 | 180 | return node 181 | 182 | def __str__(self): 183 | return u'<%s %s at 0x%s>' % ( 184 | self.__class__.__name__, 185 | self.as_rule(), 186 | id(self)) 187 | 188 | def as_rule(self): 189 | """Return the left- and right-hand sides of a rule that represents me. 190 | 191 | Return unicode. If I have no ``name``, omit the left-hand side. 192 | 193 | """ 194 | rhs = self._as_rhs().strip() 195 | if rhs.startswith('(') and rhs.endswith(')'): 196 | rhs = rhs[1:-1] 197 | 198 | return (u'%s = %s' % (self.name, rhs)) if self.name else rhs 199 | 200 | def _unicode_members(self): 201 | """Return an iterable of my unicode-represented children, stopping 202 | descent when we hit a named node so the returned value resembles the 203 | input rule.""" 204 | return [(m.name or m._as_rhs()) for m in self.members] 205 | 206 | def _as_rhs(self): 207 | """Return the right-hand side of a rule that represents me. 208 | 209 | Implemented by subclasses. 210 | 211 | """ 212 | raise NotImplementedError 213 | 214 | 215 | class Literal(Expression): 216 | """A string literal 217 | 218 | Use these if you can; they're the fastest. 219 | 220 | """ 221 | __slots__ = ['literal'] 222 | 223 | def __init__(self, literal, name=''): 224 | super(Literal, self).__init__(name) 225 | self.literal = literal 226 | 227 | def _uncached_match(self, text, pos, cache, error): 228 | if text.startswith(self.literal, pos): 229 | return Node(self.name, text, pos, pos + len(self.literal)) 230 | 231 | def _as_rhs(self): 232 | # TODO: Get backslash escaping right. 233 | return '"%s"' % self.literal 234 | 235 | 236 | class TokenMatcher(Literal): 237 | """An expression matching a single token of a given type 238 | 239 | This is for use only with TokenGrammars. 240 | 241 | """ 242 | def _uncached_match(self, token_list, pos, cache, error): 243 | if token_list[pos].type == self.literal: 244 | return Node(self.name, token_list, pos, pos + 1) 245 | 246 | 247 | class Regex(Expression): 248 | """An expression that matches what a regex does. 249 | 250 | Use these as much as you can and jam as much into each one as you can; 251 | they're fast. 
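
    For example (a sketch)::

        Regex(r'[0-9]+', name='number').match('123abc').text  # '123'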
252 | 253 | """ 254 | __slots__ = ['re'] 255 | 256 | def __init__(self, pattern, name='', ignore_case=False, locale=False, 257 | multiline=False, dot_all=False, unicode=False, verbose=False): 258 | super(Regex, self).__init__(name) 259 | self.re = re.compile(pattern, (ignore_case and re.I) | 260 | (locale and re.L) | 261 | (multiline and re.M) | 262 | (dot_all and re.S) | 263 | (unicode and re.U) | 264 | (verbose and re.X)) 265 | 266 | def _uncached_match(self, text, pos, cache, error): 267 | """Return length of match, ``None`` if no match.""" 268 | m = self.re.match(text, pos) 269 | if m is not None: 270 | span = m.span() 271 | node = RegexNode(self.name, text, pos, pos + span[1] - span[0]) 272 | node.match = m # TODO: A terrible idea for cache size? 273 | return node 274 | 275 | def _regex_flags_from_bits(self, bits): 276 | """Return the textual equivalent of numerically encoded regex flags.""" 277 | flags = 'ilmsux' 278 | return ''.join(flags[i - 1] if (1 << i) & bits else '' for i in range(1, len(flags) + 1)) 279 | 280 | def _as_rhs(self): 281 | # TODO: Get backslash escaping right. 282 | return '~"%s"%s' % (self.re.pattern, 283 | self._regex_flags_from_bits(self.re.flags)) 284 | 285 | 286 | class Compound(Expression): 287 | """An abstract expression which contains other expressions""" 288 | 289 | __slots__ = ['members'] 290 | 291 | def __init__(self, *members, **kwargs): 292 | """``members`` is a sequence of expressions.""" 293 | super(Compound, self).__init__(kwargs.get('name', '')) 294 | self.members = members 295 | 296 | 297 | class Sequence(Compound): 298 | """A series of expressions that must match contiguous, ordered pieces of 299 | the text 300 | 301 | In other words, it's a concatenation operator: each piece has to match, one 302 | after another. 303 | 304 | """ 305 | def _uncached_match(self, text, pos, cache, error): 306 | new_pos = pos 307 | length_of_sequence = 0 308 | children = [] 309 | for m in self.members: 310 | node = m.match_core(text, new_pos, cache, error) 311 | if node is None: 312 | return None 313 | children.append(node) 314 | length = node.end - node.start 315 | new_pos += length 316 | length_of_sequence += length 317 | # Hooray! We got through all the members! 318 | return Node(self.name, text, pos, pos + length_of_sequence, children) 319 | 320 | def _as_rhs(self): 321 | return u'({0})'.format(u' '.join(self._unicode_members())) 322 | 323 | 324 | class OneOf(Compound): 325 | """A series of expressions, one of which must match 326 | 327 | Expressions are tested in order from first to last. The first to succeed 328 | wins. 329 | 330 | """ 331 | def _uncached_match(self, text, pos, cache, error): 332 | for m in self.members: 333 | node = m.match_core(text, pos, cache, error) 334 | if node is not None: 335 | # Wrap the succeeding child in a node representing the OneOf: 336 | return Node(self.name, text, pos, node.end, children=[node]) 337 | 338 | def _as_rhs(self): 339 | return u'({0})'.format(u' / '.join(self._unicode_members())) 340 | 341 | 342 | class Lookahead(Compound): 343 | """An expression which consumes nothing, even if its contained expression 344 | succeeds""" 345 | 346 | # TODO: Merge this and Not for better cache hit ratios and less code. 347 | # Downside: pretty-printed grammars might be spelled differently than what 348 | # went in. That doesn't bother me. 
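    # For example (a sketch): in Sequence(Lookahead(Literal('a')),
    # Regex('[a-z]+')), the lookahead just checks that an 'a' comes next; it
    # consumes nothing, so the regex starts at the same position and consumes
    # the 'a' itself.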
349 | 350 | def _uncached_match(self, text, pos, cache, error): 351 | node = self.members[0].match_core(text, pos, cache, error) 352 | if node is not None: 353 | return Node(self.name, text, pos, pos) 354 | 355 | def _as_rhs(self): 356 | return u'&%s' % self._unicode_members()[0] 357 | 358 | 359 | class Not(Compound): 360 | """An expression that succeeds only if the expression within it doesn't 361 | 362 | In any case, it never consumes any characters; it's a negative lookahead. 363 | 364 | """ 365 | def _uncached_match(self, text, pos, cache, error): 366 | # FWIW, the implementation in Parsing Techniques in Figure 15.29 does 367 | # not bother to cache NOTs directly. 368 | node = self.members[0].match_core(text, pos, cache, error) 369 | if node is None: 370 | return Node(self.name, text, pos, pos) 371 | 372 | def _as_rhs(self): 373 | # TODO: Make sure this parenthesizes the member properly if it's an OR 374 | # or AND. 375 | return u'!%s' % self._unicode_members()[0] 376 | 377 | 378 | # Quantifiers. None of these is strictly necessary, but they're darn handy. 379 | 380 | class Optional(Compound): 381 | """An expression that succeeds whether or not the contained one does 382 | 383 | If the contained expression succeeds, it goes ahead and consumes what it 384 | consumes. Otherwise, it consumes nothing. 385 | 386 | """ 387 | def _uncached_match(self, text, pos, cache, error): 388 | node = self.members[0].match_core(text, pos, cache, error) 389 | return (Node(self.name, text, pos, pos) if node is None else 390 | Node(self.name, text, pos, node.end, children=[node])) 391 | 392 | def _as_rhs(self): 393 | return u'%s?' % self._unicode_members()[0] 394 | 395 | 396 | # TODO: Merge with OneOrMore. 397 | class ZeroOrMore(Compound): 398 | """An expression wrapper like the * quantifier in regexes.""" 399 | 400 | def _uncached_match(self, text, pos, cache, error): 401 | new_pos = pos 402 | children = [] 403 | while True: 404 | node = self.members[0].match_core(text, new_pos, cache, error) 405 | if node is None or not (node.end - node.start): 406 | # Node was None or 0 length. 0 would otherwise loop infinitely. 407 | return Node(self.name, text, pos, new_pos, children) 408 | children.append(node) 409 | new_pos += node.end - node.start 410 | 411 | def _as_rhs(self): 412 | return u'%s*' % self._unicode_members()[0] 413 | 414 | 415 | class OneOrMore(Compound): 416 | """An expression wrapper like the + quantifier in regexes. 417 | 418 | You can also pass in an alternate minimum to make this behave like "2 or 419 | more", "3 or more", etc. 420 | 421 | """ 422 | __slots__ = ['min'] 423 | 424 | # TODO: Add max. It should probably succeed if there are more than the max 425 | # --just not consume them. 426 | 427 | def __init__(self, member, name='', min=1): 428 | super(OneOrMore, self).__init__(member, name=name) 429 | self.min = min 430 | 431 | def _uncached_match(self, text, pos, cache, error): 432 | new_pos = pos 433 | children = [] 434 | while True: 435 | node = self.members[0].match_core(text, new_pos, cache, error) 436 | if node is None: 437 | break 438 | children.append(node) 439 | length = node.end - node.start 440 | if length == 0: # Don't loop infinitely. 
441 | break 442 | new_pos += length 443 | if len(children) >= self.min: 444 | return Node(self.name, text, pos, new_pos, children) 445 | 446 | def _as_rhs(self): 447 | return u'%s+' % self._unicode_members()[0] 448 | -------------------------------------------------------------------------------- /parsimonious/tests/test_grammar.py: -------------------------------------------------------------------------------- 1 | from sys import version_info 2 | from unittest import TestCase 3 | 4 | from nose import SkipTest 5 | from nose.tools import eq_, assert_raises, ok_ 6 | from six import text_type 7 | 8 | from parsimonious.exceptions import UndefinedLabel, ParseError 9 | from parsimonious.expressions import Sequence 10 | from parsimonious.grammar import rule_grammar, RuleVisitor, Grammar, TokenGrammar, LazyReference 11 | from parsimonious.nodes import Node 12 | from parsimonious.utils import Token 13 | 14 | 15 | class BootstrappingGrammarTests(TestCase): 16 | """Tests for the expressions in the grammar that parses the grammar 17 | definition syntax""" 18 | 19 | def test_quantifier(self): 20 | text = '*' 21 | eq_(rule_grammar['quantifier'].parse(text), 22 | Node('quantifier', text, 0, 1, children=[ 23 | Node('', text, 0, 1), Node('_', text, 1, 1)])) 24 | text = '?' 25 | eq_(rule_grammar['quantifier'].parse(text), 26 | Node('quantifier', text, 0, 1, children=[ 27 | Node('', text, 0, 1), Node('_', text, 1, 1)])) 28 | text = '+' 29 | eq_(rule_grammar['quantifier'].parse(text), 30 | Node('quantifier', text, 0, 1, children=[ 31 | Node('', text, 0, 1), Node('_', text, 1, 1)])) 32 | 33 | def test_spaceless_literal(self): 34 | text = '"anything but quotes#$*&^"' 35 | eq_(rule_grammar['spaceless_literal'].parse(text), 36 | Node('spaceless_literal', text, 0, len(text), children=[ 37 | Node('', text, 0, len(text))])) 38 | text = r'''r"\""''' 39 | eq_(rule_grammar['spaceless_literal'].parse(text), 40 | Node('spaceless_literal', text, 0, 5, children=[ 41 | Node('', text, 0, 5)])) 42 | 43 | def test_regex(self): 44 | text = '~"[a-zA-Z_][a-zA-Z_0-9]*"LI' 45 | eq_(rule_grammar['regex'].parse(text), 46 | Node('regex', text, 0, len(text), children=[ 47 | Node('', text, 0, 1), 48 | Node('spaceless_literal', text, 1, 25, children=[ 49 | Node('', text, 1, 25)]), 50 | Node('', text, 25, 27), 51 | Node('_', text, 27, 27)])) 52 | 53 | def test_successes(self): 54 | """Make sure the PEG recognition grammar succeeds on various inputs.""" 55 | ok_(rule_grammar['label'].parse('_')) 56 | ok_(rule_grammar['label'].parse('jeff')) 57 | ok_(rule_grammar['label'].parse('_THIS_THING')) 58 | 59 | ok_(rule_grammar['atom'].parse('some_label')) 60 | ok_(rule_grammar['atom'].parse('"some literal"')) 61 | ok_(rule_grammar['atom'].parse('~"some regex"i')) 62 | 63 | ok_(rule_grammar['quantified'].parse('~"some regex"i*')) 64 | ok_(rule_grammar['quantified'].parse('thing+')) 65 | ok_(rule_grammar['quantified'].parse('"hi"?')) 66 | 67 | ok_(rule_grammar['term'].parse('this')) 68 | ok_(rule_grammar['term'].parse('that+')) 69 | 70 | ok_(rule_grammar['sequence'].parse('this that? other')) 71 | 72 | ok_(rule_grammar['ored'].parse('this / that+ / "other"')) 73 | 74 | # + is higher precedence than &, so 'anded' should match the whole 75 | # thing: 76 | ok_(rule_grammar['lookahead_term'].parse('&this+')) 77 | 78 | ok_(rule_grammar['expression'].parse('this')) 79 | ok_(rule_grammar['expression'].parse('this? that other*')) 80 | ok_(rule_grammar['expression'].parse('&this / that+ / "other"')) 81 | ok_(rule_grammar['expression'].parse('this / that? 
/ "other"+')) 82 | ok_(rule_grammar['expression'].parse('this? that other*')) 83 | 84 | ok_(rule_grammar['rule'].parse('this = that\r')) 85 | ok_(rule_grammar['rule'].parse('this = the? that other* \t\r')) 86 | ok_(rule_grammar['rule'].parse('the=~"hi*"\n')) 87 | 88 | ok_(rule_grammar.parse(''' 89 | this = the? that other* 90 | that = "thing" 91 | the=~"hi*" 92 | other = "ahoy hoy" 93 | ''')) 94 | 95 | 96 | class RuleVisitorTests(TestCase): 97 | """Tests for ``RuleVisitor`` 98 | 99 | As I write these, Grammar is not yet fully implemented. Normally, there'd 100 | be no reason to use ``RuleVisitor`` directly. 101 | 102 | """ 103 | def test_round_trip(self): 104 | """Test a simple round trip. 105 | 106 | Parse a simple grammar, turn the parse tree into a map of expressions, 107 | and use that to parse another piece of text. 108 | 109 | Not everything was implemented yet, but it was a big milestone and a 110 | proof of concept. 111 | 112 | """ 113 | tree = rule_grammar.parse('''number = ~"[0-9]+"\n''') 114 | rules, default_rule = RuleVisitor().visit(tree) 115 | 116 | text = '98' 117 | eq_(default_rule.parse(text), Node('number', text, 0, 2)) 118 | 119 | def test_undefined_rule(self): 120 | """Make sure we throw the right exception on undefined rules.""" 121 | tree = rule_grammar.parse('boy = howdy\n') 122 | assert_raises(UndefinedLabel, RuleVisitor().visit, tree) 123 | 124 | def test_optional(self): 125 | tree = rule_grammar.parse('boy = "howdy"?\n') 126 | rules, default_rule = RuleVisitor().visit(tree) 127 | 128 | howdy = 'howdy' 129 | 130 | # It should turn into a Node from the Optional and another from the 131 | # Literal within. 132 | eq_(default_rule.parse(howdy), Node('boy', howdy, 0, 5, children=[ 133 | Node('', howdy, 0, 5)])) 134 | 135 | 136 | class GrammarTests(TestCase): 137 | """Integration-test ``Grammar``: feed it a PEG and see if it works.""" 138 | 139 | def test_expressions_from_rules(self): 140 | """Test the ``Grammar`` base class's ability to compile an expression 141 | tree from rules. 142 | 143 | That the correct ``Expression`` tree is built is already tested in 144 | ``RuleGrammarTests``. This tests only that the ``Grammar`` base class's 145 | ``_expressions_from_rules`` works. 
146 | 147 | """ 148 | greeting_grammar = Grammar('greeting = "hi" / "howdy"') 149 | tree = greeting_grammar.parse('hi') 150 | eq_(tree, Node('greeting', 'hi', 0, 2, children=[ 151 | Node('', 'hi', 0, 2)])) 152 | 153 | def test_unicode(self): 154 | """Assert that a ``Grammar`` can convert into a string-formatted series 155 | of rules.""" 156 | grammar = Grammar(r""" 157 | bold_text = bold_open text bold_close 158 | text = ~"[A-Z 0-9]*"i 159 | bold_open = "((" 160 | bold_close = "))" 161 | """) 162 | lines = text_type(grammar).splitlines() 163 | eq_(lines[0], 'bold_text = bold_open text bold_close') 164 | ok_('text = ~"[A-Z 0-9]*"i%s' % ('u' if version_info >= (3,) else '') 165 | in lines) 166 | ok_('bold_open = "(("' in lines) 167 | ok_('bold_close = "))"' in lines) 168 | eq_(len(lines), 4) 169 | 170 | def test_match(self): 171 | """Make sure partial-matching (with pos) works.""" 172 | grammar = Grammar(r""" 173 | bold_text = bold_open text bold_close 174 | text = ~"[A-Z 0-9]*"i 175 | bold_open = "((" 176 | bold_close = "))" 177 | """) 178 | s = ' ((boo))yah' 179 | eq_(grammar.match(s, pos=1), Node('bold_text', s, 1, 8, children=[ 180 | Node('bold_open', s, 1, 3), 181 | Node('text', s, 3, 6), 182 | Node('bold_close', s, 6, 8)])) 183 | 184 | def test_bad_grammar(self): 185 | """Constructing a Grammar with bad rules should raise ParseError.""" 186 | assert_raises(ParseError, Grammar, 'just a bunch of junk') 187 | 188 | def test_comments(self): 189 | """Test tolerance of comments and blank lines in and around rules.""" 190 | grammar = Grammar(r"""# This is a grammar. 191 | 192 | # It sure is. 193 | bold_text = stars text stars # nice 194 | text = ~"[A-Z 0-9]*"i #dude 195 | 196 | 197 | stars = "**" 198 | # Pretty good 199 | #Oh yeah.#""") # Make sure a comment doesn't need a 200 | # \n or \r to end. 201 | eq_(list(sorted(str(grammar).splitlines())), 202 | ['''bold_text = stars text stars''', 203 | # TODO: Unicode flag is on by default in Python 3. I wonder if we 204 | # should turn it on all the time in Parsimonious. 
205 | '''stars = "**"''', 206 | '''text = ~"[A-Z 0-9]*"i%s''' % ('u' if version_info >= (3,) 207 | else '')]) 208 | 209 | def test_multi_line(self): 210 | """Make sure we tolerate all sorts of crazy line breaks and comments in 211 | the middle of rules.""" 212 | grammar = Grammar(""" 213 | bold_text = bold_open # commenty comment 214 | text # more comment 215 | bold_close 216 | text = ~"[A-Z 0-9]*"i 217 | bold_open = "((" bold_close = "))" 218 | """) 219 | ok_(grammar.parse('((booyah))') is not None) 220 | 221 | def test_not(self): 222 | """Make sure "not" predicates get parsed and work properly.""" 223 | grammar = Grammar(r'''not_arp = !"arp" ~"[a-z]+"''') 224 | assert_raises(ParseError, grammar.parse, 'arp') 225 | ok_(grammar.parse('argle') is not None) 226 | 227 | def test_lookahead(self): 228 | grammar = Grammar(r'''starts_with_a = &"a" ~"[a-z]+"''') 229 | assert_raises(ParseError, grammar.parse, 'burp') 230 | 231 | s = 'arp' 232 | eq_(grammar.parse('arp'), Node('starts_with_a', s, 0, 3, children=[ 233 | Node('', s, 0, 0), 234 | Node('', s, 0, 3)])) 235 | 236 | def test_parens(self): 237 | grammar = Grammar(r'''sequence = "chitty" (" " "bang")+''') 238 | # Make sure it's not as if the parens aren't there: 239 | assert_raises(ParseError, grammar.parse, 'chitty bangbang') 240 | 241 | s = 'chitty bang bang' 242 | eq_(str(grammar.parse(s)), 243 | """ 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | """) 252 | 253 | def test_resolve_refs_order(self): 254 | """Smoke-test a circumstance where lazy references don't get resolved.""" 255 | grammar = Grammar(""" 256 | expression = "(" terms ")" 257 | terms = term+ 258 | term = number 259 | number = ~r"[0-9]+" 260 | """) 261 | grammar.parse('(34)') 262 | 263 | def test_infinite_loop(self): 264 | """Smoke-test a grammar that was causing infinite loops while building. 265 | 266 | This was going awry because the "int" rule was never getting marked as 267 | resolved, so it would just keep trying to resolve it over and over. 268 | 269 | """ 270 | Grammar(""" 271 | digits = digit+ 272 | int = digits 273 | digit = ~"[0-9]" 274 | number = int 275 | main = number 276 | """) 277 | 278 | def test_right_recursive(self): 279 | """Right-recursive refs should resolve.""" 280 | grammar = Grammar(""" 281 | digits = digit digits? 
282 | digit = ~r"[0-9]" 283 | """) 284 | ok_(grammar.parse('12') is not None) 285 | 286 | def test_badly_circular(self): 287 | """Uselessly circular references should be detected by the grammar 288 | compiler.""" 289 | raise SkipTest('We have yet to make the grammar compiler detect these.') 290 | grammar = Grammar(""" 291 | foo = bar 292 | bar = foo 293 | """) 294 | 295 | def test_parens_with_leading_whitespace(self): 296 | """Make sure a parenthesized expression is allowed to have leading 297 | whitespace when nested directly inside another.""" 298 | Grammar("""foo = ( ("c") )""").parse('c') 299 | 300 | def test_single_quoted_literals(self): 301 | Grammar("""foo = 'a' '"'""").parse('a"') 302 | 303 | def test_simple_custom_rules(self): 304 | """Run 2-arg custom-coded rules through their paces.""" 305 | grammar = Grammar(""" 306 | bracketed_digit = start digit end 307 | start = '[' 308 | end = ']'""", 309 | digit=lambda text, pos: 310 | (pos + 1) if text[pos].isdigit() else None) 311 | s = '[6]' 312 | eq_(grammar.parse(s), 313 | Node('bracketed_digit', s, 0, 3, children=[ 314 | Node('start', s, 0, 1), 315 | Node('digit', s, 1, 2), 316 | Node('end', s, 2, 3)])) 317 | 318 | def test_complex_custom_rules(self): 319 | """Run 5-arg custom rules through their paces. 320 | 321 | Incidentally tests returning an actual Node from the custom rule. 322 | 323 | """ 324 | grammar = Grammar(""" 325 | bracketed_digit = start digit end 326 | start = '[' 327 | end = ']' 328 | real_digit = '6'""", 329 | # In this particular implementation of the digit rule, no node is 330 | # generated for `digit`; it falls right through to `real_digit`. 331 | # I'm not sure if this could lead to problems; I can't think of 332 | # any, but it's probably not a great idea. 333 | digit=lambda text, pos, cache, error, grammar: 334 | grammar['real_digit'].match_core(text, pos, cache, error)) 335 | s = '[6]' 336 | eq_(grammar.parse(s), 337 | Node('bracketed_digit', s, 0, 3, children=[ 338 | Node('start', s, 0, 1), 339 | Node('real_digit', s, 1, 2), 340 | Node('end', s, 2, 3)])) 341 | 342 | def test_lazy_custom_rules(self): 343 | """Make sure LazyReferences manually shoved into custom rules are 344 | resolved. 345 | 346 | Incidentally test passing full-on Expressions as custom rules and 347 | having a custom rule as the default one. 348 | 349 | """ 350 | grammar = Grammar(""" 351 | four = '4' 352 | five = '5'""", 353 | forty_five=Sequence(LazyReference('four'), 354 | LazyReference('five'), 355 | name='forty_five')).default('forty_five') 356 | s = '45' 357 | eq_(grammar.parse(s), 358 | Node('forty_five', s, 0, 2, children=[ 359 | Node('four', s, 0, 1), 360 | Node('five', s, 1, 2)])) 361 | 362 | def test_unconnected_custom_rules(self): 363 | """Make sure custom rules that aren't hooked to any other rules still 364 | get included in the grammar and that lone ones get set as the 365 | default. 366 | 367 | Incidentally test Grammar's `rules` default arg. 368 | 369 | """ 370 | grammar = Grammar(one_char=lambda text, pos: pos + 1).default('one_char') 371 | s = '4' 372 | eq_(grammar.parse(s), 373 | Node('one_char', s, 0, 1)) 374 | 375 | def test_lazy_default_rule(self): 376 | """Make sure we get an actual rule set as our default rule, even when 377 | the first rule has forward references and is thus a LazyReference at 378 | some point during grammar compilation. 
379 | 380 | """ 381 | grammar = Grammar(r""" 382 | styled_text = text 383 | text = "hi" 384 | """) 385 | eq_(grammar.parse('hi'), Node('text', 'hi', 0, 2)) 386 | 387 | def test_immutable_grammar(self): 388 | """Make sure that a Grammar is immutable after being created.""" 389 | grammar = Grammar(r""" 390 | foo = 'bar' 391 | """) 392 | 393 | def mod_grammar(grammar): 394 | grammar['foo'] = 1 395 | assert_raises(TypeError, mod_grammar, [grammar]) 396 | 397 | def mod_grammar(grammar): 398 | new_grammar = Grammar(r""" 399 | baz = 'biff' 400 | """) 401 | grammar.update(new_grammar) 402 | assert_raises(AttributeError, mod_grammar, [grammar]) 403 | 404 | def test_repr(self): 405 | self.assertTrue(repr(Grammar(r'foo = "a"'))) 406 | 407 | 408 | class TokenGrammarTests(TestCase): 409 | """Tests for the TokenGrammar class and associated machinery""" 410 | 411 | def test_parse_success(self): 412 | """Token literals should work.""" 413 | s = [Token('token1'), Token('token2')] 414 | grammar = TokenGrammar(""" 415 | foo = token1 "token2" 416 | token1 = "token1" 417 | """) 418 | eq_(grammar.parse(s), 419 | Node('foo', s, 0, 2, children=[ 420 | Node('token1', s, 0, 1), 421 | Node('', s, 1, 2)])) 422 | 423 | def test_parse_failure(self): 424 | """Parse failures should work normally with token literals.""" 425 | grammar = TokenGrammar(""" 426 | foo = "token1" "token2" 427 | """) 428 | assert_raises(ParseError, 429 | grammar.parse, 430 | [Token('tokenBOO'), Token('token2')]) 431 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Parsimonious 3 | ============ 4 | 5 | Parsimonious aims to be the fastest arbitrary-lookahead parser written in pure 6 | Python—and the most usable. It's based on parsing expression grammars (PEGs), 7 | which means you feed it a simplified sort of EBNF notation. Parsimonious was 8 | designed to undergird a MediaWiki parser that wouldn't take 5 seconds or a GB 9 | of RAM to do one page, but it's applicable to all sorts of languages. 10 | 11 | 12 | Goals 13 | ===== 14 | 15 | * Speed 16 | * Frugal RAM use 17 | * Minimalistic, understandable, idiomatic Python code 18 | * Readable grammars 19 | * Extensible grammars 20 | * Complete test coverage 21 | * Separation of concerns. Some Python parsing kits mix recognition with 22 | instructions about how to turn the resulting tree into some kind of other 23 | representation. This is limiting when you want to do several different things 24 | with a tree: for example, render wiki markup to HTML *or* to text. 25 | * Good error reporting. I want the parser to work *with* me as I develop a 26 | grammar. 27 | 28 | 29 | Example Usage 30 | ============= 31 | 32 | Here's how to build a simple grammar: 33 | 34 | .. code:: python 35 | 36 | >>> from parsimonious.grammar import Grammar 37 | >>> grammar = Grammar( 38 | ... """ 39 | ... bold_text = bold_open text bold_close 40 | ... text = ~"[A-Z 0-9]*"i 41 | ... bold_open = "((" 42 | ... bold_close = "))" 43 | ... """) 44 | 45 | You can have forward references and even right recursion; it's all taken care 46 | of by the grammar compiler. The first rule is taken to be the default start 47 | symbol, but you can override that. 48 | 49 | Next, let's parse something and get an abstract syntax tree: 50 | 51 | .. 
code:: python 52 | 53 | >>> print grammar.parse('((bold stuff))') 54 | <Node called "bold_text" matching "((bold stuff))"> 55 | <Node called "bold_open" matching "(("> 56 | <RegexNode called "text" matching "bold stuff"> 57 | <Node called "bold_close" matching "))"> 58 | 59 | You'd typically then use a ``nodes.NodeVisitor`` subclass (see below) to walk 60 | the tree and do something useful with it. 61 | 62 | 63 | Status 64 | ====== 65 | 66 | * Everything that exists works. Test coverage is good. 67 | * I don't plan on making any backward-incompatible changes to the rule syntax 68 | in the future, so you can write grammars with confidence. 69 | * It may be slow and use a lot of RAM; I haven't measured either yet. However, 70 | I have yet to begin optimizing in earnest. 71 | * Error reporting is now in place. ``repr`` methods of expressions, grammars, 72 | and nodes are clear and helpful as well. The ``Grammar`` ones are 73 | even round-trippable! 74 | * The grammar extensibility story is underdeveloped at the moment. You should 75 | be able to extend a grammar by simply concatenating more rules onto the 76 | existing ones; later rules of the same name should override previous ones. 77 | However, this is untested and may not be the final story. 78 | * Sphinx docs are coming, but the docstrings are quite useful now. 79 | * Note that there may be API changes until we get to 1.0, so be sure to pin to 80 | the version you're using. 81 | 82 | Coming Soon 83 | ----------- 84 | 85 | * Optimizations to make Parsimonious worthy of its name 86 | * Tighter RAM use 87 | * Better-thought-out grammar extensibility story 88 | * Amazing grammar debugging 89 | 90 | 91 | A Little About PEG Parsers 92 | ========================== 93 | 94 | PEG parsers don't draw a distinction between lexing and parsing; everything is 95 | done at once. As a result, there is no lookahead limit, as there is with, for 96 | instance, Yacc. And, due to both of these properties, PEG grammars are easier 97 | to write: they're basically just a more practical dialect of EBNF. With 98 | caching, they take O(grammar size * text length) memory (though I plan to do 99 | better), but they run in O(text length) time. 100 | 101 | More Technically 102 | ---------------- 103 | 104 | PEGs can describe a superset of *LL(k)* languages, any deterministic *LR(k)* 105 | language, and many others—including some that aren't context-free 106 | (http://www.brynosaurus.com/pub/lang/peg.pdf). They can also deal with what 107 | would be ambiguous languages if described in canonical EBNF. They do this by 108 | trading the ``|`` alternation operator for the ``/`` operator, which works the 109 | same except that it makes priority explicit: ``a / b / c`` first tries matching 110 | ``a``. If that fails, it tries ``b``, and, failing that, moves on to ``c``. 111 | Thus, ambiguity is resolved by always yielding the first successful recognition. 112 | 113 | 114 | Writing Grammars 115 | ================ 116 | 117 | Grammars are defined by a series of rules. The syntax should be familiar to 118 | anyone who uses regexes or reads programming language manuals. An example will 119 | serve best: 120 | 121 | .. code:: python 122 | 123 | my_grammar = Grammar(r""" 124 | styled_text = bold_text / italic_text 125 | bold_text = "((" text "))" 126 | italic_text = "''" text "''" 127 | text = ~"[A-Z 0-9]*"i 128 | """) 129 | 130 | You can wrap a rule across multiple lines if you like; the syntax is very 131 | forgiving. 132 | 133 | 134 | Syntax Reference 135 | ---------------- 136 | 137 | ==================== ======================================================== 138 | ``"some literal"`` Used to quote literals.
Backslash escaping and Python 139 | conventions for "raw" and Unicode strings help support 140 | fiddly characters. 141 | 142 | [space] Sequences are made out of space- or tab-delimited 143 | things. ``a b c`` matches spots where those 3 144 | terms appear in that order. 145 | 146 | ``a / b / c`` Alternatives. The first to succeed of ``a / b / c`` 147 | wins. 148 | 149 | ``thing?`` An optional expression. This is greedy, always consuming 150 | ``thing`` if it exists. 151 | 152 | ``&thing`` A lookahead assertion. Ensures ``thing`` matches at the 153 | current position but does not consume it. 154 | 155 | ``!thing`` A negative lookahead assertion. Matches if ``thing`` 156 | isn't found here. Doesn't consume any text. 157 | 158 | ``things*`` Zero or more things. This is greedy, always consuming as 159 | many repetitions as it can. 160 | 161 | ``things+`` One or more things. This is greedy, always consuming as 162 | many repetitions as it can. 163 | 164 | ``~r"regex"ilmsux`` Regexes have ``~`` in front and are quoted like 165 | literals. Any flags follow the end quotes as single 166 | chars. Regexes are good for representing character 167 | classes (``[a-z0-9]``) and optimizing for speed. The 168 | downside is that they won't be able to take advantage 169 | of our fancy debugging, once we get that working. 170 | Ultimately, I'd like to deprecate explicit regexes and 171 | instead have Parsimonious dynamically build them out of 172 | simpler primitives. 173 | 174 | ``(things)`` Parentheses are used for grouping, like in every other 175 | language. 176 | ==================== ======================================================== 177 | 178 | 179 | Optimizing Grammars 180 | =================== 181 | 182 | Don't Repeat Expressions 183 | ------------------------ 184 | 185 | If you need a ``~"[a-z0-9]"i`` at two points in your grammar, don't type it 186 | twice. Make it a rule of its own, and reference it from wherever you need it. 187 | You'll get the most out of the caching this way, since cache lookups are by 188 | expression object identity (for speed). 189 | 190 | Even if you have an expression that's very simple, not repeating it will 191 | save RAM, as there can, at worst, be a cached int for every char in the text 192 | you're parsing. In the future, we may identify repeated subexpressions 193 | automatically and factor them up while building the grammar. 194 | 195 | How much should you shove into one regex, versus how much should you break them 196 | up to not repeat yourself? That's a fine balance and worthy of benchmarking. 197 | More stuff jammed into a regex will execute faster, because it doesn't have to 198 | run any Python between pieces, but a broken-up one will give better cache 199 | performance if the individual pieces are re-used elsewhere. If the pieces of a 200 | regex aren't used anywhere else, by all means keep the whole thing together. 201 | 202 | 203 | Quantifiers 204 | ----------- 205 | 206 | Bring your ``?`` and ``*`` quantifiers up to the highest level you 207 | can. Otherwise, lower-level patterns could succeed but be empty and put a bunch 208 | of useless nodes in your tree that didn't really match anything. 209 | 210 | 211 | Processing Parse Trees 212 | ====================== 213 | 214 | A parse tree has a node for each expression matched, even if it matched a 215 | zero-length string, like ``"thing"?`` might. 
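For instance (an illustrative sketch; the exact node ``repr`` text may vary):

.. code:: python

    >>> grammar = Grammar('greeting = "hi" ", world"?')
    >>> print grammar.parse('hi')
    <Node called "greeting" matching "hi">
        <Node matching "hi">
        <Node matching "">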
216 | 217 | The ``NodeVisitor`` class provides an inversion-of-control framework for 218 | walking a tree and returning a new construct (tree, string, or whatever) based 219 | on it. For now, have a look at its docstrings for more detail. There's also a 220 | good example in ``grammar.RuleVisitor``. Notice how we take advantage of nodes' 221 | iterability by using tuple unpacks in the formal parameter lists: 222 | 223 | .. code:: python 224 | 225 | def visit_or_term(self, or_term, (slash, _, term)): 226 | ... 227 | 228 | For reference, here is the production the above unpacks:: 229 | 230 | or_term = "/" _ term 231 | 232 | When something goes wrong in your visitor, you get a nice error like this:: 233 | 234 | [normal traceback here...] 235 | VisitationException: 'Node' object has no attribute 'foo' 236 | 237 | Parse tree: 238 | <-- *** We were here. *** 239 | 240 | 241 | 242 | 243 | 244 | 245 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 258 | 259 | 260 | The parse tree is tacked onto the exception, and the node whose visitor method 261 | raised the error is pointed out. 262 | 263 | Why No Streaming Tree Processing? 264 | --------------------------------- 265 | 266 | Some have asked why we don't process the tree as we go, SAX-style. There are 267 | two main reasons: 268 | 269 | 1. It wouldn't work. With a PEG parser, no parsing decision is final until the 270 | whole text is parsed. If we had to change a decision, we'd have to backtrack 271 | and redo the SAX-style interpretation as well, which would involve 272 | reconstituting part of the AST and quite possibly scuttling whatever you 273 | were doing with the streaming output. (Note that some bursty SAX-style 274 | processing may be possible in the future if we use cuts.) 275 | 276 | 2. It interferes with the ability to derive multiple representations from the 277 | AST: for example, turning wiki markup into first HTML and then text. 278 | 279 | 280 | Future Directions 281 | ================= 282 | 283 | Rule Syntax Changes 284 | ------------------- 285 | 286 | * Maybe support left-recursive rules like PyMeta, if anybody cares. 287 | * Ultimately, I'd like to get rid of explicit regexes and break them into more 288 | atomic things like character classes. Then we can dynamically compile bits 289 | of the grammar into regexes as necessary to boost speed. 290 | 291 | Optimizations 292 | ------------- 293 | 294 | * Make RAM use almost constant by automatically inserting "cuts", as described 295 | in 296 | http://ialab.cs.tsukuba.ac.jp/~mizusima/publications/paste513-mizushima.pdf. 297 | This would also improve error reporting, as we wouldn't backtrack out of 298 | everything informative before finally failing. 299 | * Find all the distinct subexpressions, and unify duplicates for a better cache 300 | hit ratio. 301 | * Think about having the user (optionally) provide some representative input 302 | along with a grammar. We can then profile against it, see which expressions 303 | are worth caching, and annotate the grammar. Perhaps there will even be 304 | positions at which a given expression is more worth caching. Or we could keep 305 | a count of how many times each cache entry has been used and evict the most 306 | useless ones as RAM use grows. 307 | * We could possibly compile the grammar into VM instructions, like in "A 308 | parsing machine for PEGs" by Medeiros. 309 | * If the recursion gets too deep in practice, use trampolining to dodge it. 
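To round out the ``NodeVisitor`` discussion under "Processing Parse Trees"
above, here is a minimal end-to-end sketch (the grammar and ``SumVisitor`` are
invented for illustration, not part of the library):

.. code:: python

    from parsimonious.grammar import Grammar
    from parsimonious.nodes import NodeVisitor

    grammar = Grammar(r"""
        sum      = int plus_int*
        plus_int = "+" int
        int      = ~"[0-9]+"
        """)

    class SumVisitor(NodeVisitor):
        def visit_int(self, node, visited_children):
            return int(node.text)

        def visit_plus_int(self, node, visited_children):
            plus, value = visited_children  # Discard the "+" literal's node.
            return value

        def visit_sum(self, node, visited_children):
            first, rest = visited_children  # rest is the list of plus_int values.
            return first + sum(rest)

        def generic_visit(self, node, visited_children):
            return visited_children or node

    SumVisitor().visit(grammar.parse('1+2+3'))  # -> 6

Here ``generic_visit`` simply passes children through, the same idiom
``grammar.RuleVisitor`` uses internally.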
310 | 311 | Niceties 312 | -------- 313 | 314 | * Pijnu has a raft of tree manipulators. I don't think I want all of them, but 315 | a judicious subset might be nice. Don't get into mixing formatting with tree 316 | manipulation. 317 | https://github.com/erikrose/pijnu/blob/master/library/node.py#L333. PyPy's 318 | parsing lib exposes a sane subset: 319 | http://doc.pypy.org/en/latest/rlib.html#tree-transformations. 320 | 321 | 322 | Version History 323 | =============== 324 | 325 | 0.7.0 326 | * Add experimental token-based parsing, via TokenGrammar class, for those 327 | operating on pre-lexed streams of tokens. This can, for example, help parse 328 | indentation-sensitive languages that use the "off-side rule", like Python. 329 | (Erik Rose) 330 | * Common codebase for Python 2 and 3: no more 2to3 translation step (Mattias 331 | Urlichs, Lucas Wiman) 332 | * Drop Python 3.1 and 3.2 support. 333 | * Fix a bug in ``Grammar.__repr__`` which fails to work on Python 3 since the 334 | string_escape codec is gone in Python 3. (Lucas Wiman) 335 | * Don't lose parentheses when printing representations of expressions. 336 | (Michael Kelly) 337 | * Make Grammar an immutable mapping (until we add automatic recompilation). 338 | (Michael Kelly) 339 | 340 | 0.6.2 341 | * Make grammar compilation 100x faster. Thanks to dmoisset for the initial 342 | patch. 343 | 344 | 0.6.1 345 | * Fix bug which made the default rule of a grammar invalid when it 346 | contained a forward reference. 347 | 348 | 0.6 349 | .. warning:: 350 | 351 | This release makes backward-incompatible changes: 352 | 353 | * The ``default_rule`` arg to Grammar's constructor has been replaced 354 | with a method, ``some_grammar.default('rule_name')``, which returns a 355 | new grammar just like the old except with its default rule changed. 356 | This is to free up the constructor kwargs for custom rules. 357 | * ``UndefinedLabel`` is no longer a subclass of ``VisitationError``. This 358 | matters only in the unlikely case that you were catching 359 | ``VisitationError`` exceptions and expecting to thus also catch 360 | ``UndefinedLabel``. 361 | 362 | * Add support for "custom rules" in Grammars. These provide a hook for simple 363 | custom parsing hooks spelled as Python lambdas. For heavy-duty needs, 364 | you can put in Compound Expressions with LazyReferences as subexpressions, 365 | and the Grammar will hook them up for optimal efficiency--no calling 366 | ``__getitem__`` on Grammar at parse time. 367 | * Allow grammars without a default rule (in cases where there are no string 368 | rules), which leads to also allowing empty grammars. Perhaps someone 369 | building up grammars dynamically will find that useful. 370 | * Add ``@rule`` decorator, allowing grammars to be constructed out of 371 | notations on ``NodeVisitor`` methods. This saves looking back and forth 372 | between the visitor and the grammar when there is only one visitor per 373 | grammar. 374 | * Add ``parse()`` and ``match()`` convenience methods to ``NodeVisitor``. 375 | This makes the common case of parsing a string and applying exactly one 376 | visitor to the AST shorter and simpler. 377 | * Improve exception message when you forget to declare a visitor method. 378 | * Add ``unwrapped_exceptions`` attribute to ``NodeVisitor``, letting you 379 | name certain exceptions which propagate out of visitors without being 380 | wrapped by ``VisitationError`` exceptions. 381 | * Expose much more of the library in ``__init__``, making your imports 382 | shorter. 
383 | * Drastically simplify reference resolution machinery. (Vladimir Keleshev) 384 | 385 | 0.5 386 | .. warning:: 387 | 388 | This release makes some backward-incompatible changes. See below. 389 | 390 | * Add alpha-quality error reporting. Now, rather than returning ``None``, 391 | ``parse()`` and ``match()`` raise ``ParseError`` if they don't succeed. 392 | This makes more sense, since you'd rarely attempt to parse something and 393 | not care if it succeeds. It was too easy before to forget to check for a 394 | ``None`` result. ``ParseError`` gives you a human-readable unicode 395 | representation as well as some attributes that let you construct your own 396 | custom presentation. 397 | * Grammar construction now raises ``ParseError`` rather than ``BadGrammar`` 398 | if it can't parse your rules. 399 | * ``parse()`` now takes an optional ``pos`` argument, like ``match()``. 400 | * Make the ``_str__()`` method of ``UndefinedLabel`` return the right type. 401 | * Support splitting rules across multiple lines, interleaving comments, 402 | putting multiple rules on one line (but don't do that) and all sorts of 403 | other horrific behavior. 404 | * Tolerate whitespace after opening parens. 405 | * Add support for single-quoted literals. 406 | 407 | 0.4 408 | * Support Python 3. 409 | * Fix ``import *`` for ``parsimonious.expressions``. 410 | * Rewrite grammar compiler so right-recursive rules can be compiled and 411 | parsing no longer fails in some cases with forward rule references. 412 | 413 | 0.3 414 | * Support comments, the ``!`` ("not") operator, and parentheses in grammar 415 | definition syntax. 416 | * Change the ``&`` operator to a prefix operator to conform to the original 417 | PEG syntax. The version in Parsing Techniques was infix, and that's what I 418 | used as a reference. However, the unary version is more convenient, as it 419 | lets you spell ``AB & A`` as simply ``A &B``. 420 | * Take the ``print`` statements out of the benchmark tests. 421 | * Give Node an evaluate-able ``__repr__``. 422 | 423 | 0.2 424 | * Support matching of prefixes and other not-to-the-end slices of strings by 425 | making ``match()`` public and able to initialize a new cache. Add 426 | ``match()`` callthrough method to ``Grammar``. 427 | * Report a ``BadGrammar`` exception (rather than crashing) when there are 428 | mistakes in a grammar definition. 429 | * Simplify grammar compilation internals: get rid of superfluous visitor 430 | methods and factor up repetitive ones. Simplify rule grammar as well. 431 | * Add ``NodeVisitor.lift_child`` convenience method. 432 | * Rename ``VisitationException`` to ``VisitationError`` for consistency with 433 | the standard Python exception hierarchy. 434 | * Rework ``repr`` and ``str`` values for grammars and expressions. Now they 435 | both look like rule syntax. Grammars are even round-trippable! This fixes a 436 | unicode encoding error when printing nodes that had parsed unicode text. 437 | * Add tox for testing. Stop advertising Python 2.5 support, which never 438 | worked (and won't unless somebody cares a lot, since it makes Python 3 439 | support harder). 440 | * Settle (hopefully) on the term "rule" to mean "the string representation of 441 | a production". Get rid of the vague, mysterious "DSL". 442 | 443 | 0.1 444 | * A rough but useable preview release 445 | 446 | Thanks to Wiki Loves Monuments Panama for showing their support with a generous 447 | gift. 
448 | -------------------------------------------------------------------------------- /parsimonious/grammar.py: -------------------------------------------------------------------------------- 1 | """A convenience which constructs expression trees from an easy-to-read syntax 2 | 3 | Use this unless you have a compelling reason not to; it performs some 4 | optimizations that would be tedious to do when constructing an expression tree 5 | by hand. 6 | 7 | """ 8 | from collections import Mapping 9 | from inspect import isfunction, ismethod 10 | 11 | from six import (text_type, iterkeys, itervalues, iteritems, 12 | python_2_unicode_compatible, PY2) 13 | 14 | from parsimonious.exceptions import BadGrammar, UndefinedLabel 15 | from parsimonious.expressions import (Literal, Regex, Sequence, OneOf, 16 | Lookahead, Optional, ZeroOrMore, OneOrMore, Not, TokenMatcher, 17 | expression) 18 | from parsimonious.nodes import NodeVisitor 19 | from parsimonious.utils import StrAndRepr, evaluate_string 20 | 21 | @python_2_unicode_compatible 22 | class Grammar(StrAndRepr, Mapping): 23 | """A collection of rules that describe a language 24 | 25 | You can start parsing from the default rule by calling ``parse()`` 26 | directly on the ``Grammar`` object:: 27 | 28 | g = Grammar(''' 29 | polite_greeting = greeting ", my good " title 30 | greeting = "Hi" / "Hello" 31 | title = "madam" / "sir" 32 | ''') 33 | g.parse('Hello, my good sir') 34 | 35 | Or start parsing from any of the other rules; you can pull them out of the 36 | grammar as if it were a dictionary:: 37 | 38 | g['title'].parse('sir') 39 | 40 | You could also just construct a bunch of ``Expression`` objects yourself 41 | and stitch them together into a language, but using a ``Grammar`` has some 42 | important advantages: 43 | 44 | * Languages are much easier to define in the nice syntax it provides. 45 | * Circular references aren't a pain. 46 | * It does all kinds of whizzy space- and time-saving optimizations, like 47 | factoring up repeated subexpressions into a single object, which should 48 | increase cache hit ratio. [Is this implemented yet?] 49 | 50 | """ 51 | def __init__(self, rules='', **more_rules): 52 | """Construct a grammar. 53 | 54 | :arg rules: A string of production rules, one per line. 55 | :arg default_rule: The name of the rule invoked when you call 56 | :meth:`parse()` or :meth:`match()` on the grammar. Defaults to the 57 | first rule. Falls back to None if there are no string-based rules 58 | in this grammar. 59 | :arg more_rules: Additional kwargs whose names are rule names and 60 | values are Expressions or custom-coded callables which accomplish 61 | things the built-in rule syntax cannot. These take precedence over 62 | ``rules`` in case of naming conflicts. 
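        An illustrative sketch of a 2-arg custom rule (``digit`` is a made-up
        rule name here; the same form is exercised in the tests)::

            g = Grammar('''
                bracketed_digit = "[" digit "]"
                ''',
                digit=lambda text, pos:
                    (pos + 1) if text[pos].isdigit() else None)
            g.parse('[6]')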
63 | 64 | """ 65 | decorated_custom_rules = dict( 66 | (k, expression(v, k, self) if isfunction(v) or 67 | ismethod(v) else 68 | v) for k, v in iteritems(more_rules)) 69 | 70 | self._expressions, first = self._expressions_from_rules(rules, decorated_custom_rules) 71 | self.default_rule = first # may be None 72 | 73 | def __getitem__(self, rule_name): 74 | return self._expressions[rule_name] 75 | 76 | def __iter__(self): 77 | return iterkeys(self._expressions) 78 | 79 | def __len__(self): 80 | return len(self._expressions) 81 | 82 | def default(self, rule_name): 83 | """Return a new Grammar whose :term:`default rule` is ``rule_name``.""" 84 | new = self._copy() 85 | new.default_rule = new[rule_name] 86 | return new 87 | 88 | def _copy(self): 89 | """Return a shallow copy of myself. 90 | 91 | Deep is unnecessary, since Expression trees are immutable. Subgrammars 92 | recreate all the Expressions from scratch, and AbstractGrammars have 93 | no Expressions. 94 | 95 | """ 96 | new = Grammar(**self._expressions) 97 | new.default_rule = self.default_rule 98 | return new 99 | 100 | def _expressions_from_rules(self, rules, custom_rules): 101 | """Return a 2-tuple: a dict of rule names pointing to their 102 | expressions, and then the first rule. 103 | 104 | It's a web of expressions, all referencing each other. Typically, 105 | there's a single root to the web of references, and that root is the 106 | starting symbol for parsing, but there's nothing saying you can't have 107 | multiple roots. 108 | 109 | :arg custom_rules: A map of rule names to custom-coded rules: 110 | Expressions 111 | 112 | """ 113 | tree = rule_grammar.parse(rules) 114 | return RuleVisitor(custom_rules).visit(tree) 115 | 116 | def parse(self, text, pos=0): 117 | """Parse some text with the :term:`default rule`. 118 | 119 | :arg pos: The index at which to start parsing 120 | 121 | """ 122 | self._check_default_rule() 123 | return self.default_rule.parse(text, pos=pos) 124 | 125 | def match(self, text, pos=0): 126 | """Parse some text with the :term:`default rule` but not necessarily 127 | all the way to the end. 128 | 129 | :arg pos: The index at which to start parsing 130 | 131 | """ 132 | self._check_default_rule() 133 | return self.default_rule.match(text, pos=pos) 134 | 135 | def _check_default_rule(self): 136 | """Raise RuntimeError if there is no default rule defined.""" 137 | if not self.default_rule: 138 | raise RuntimeError("Can't call parse() on a Grammar that has no " 139 | "default rule. Choose a specific rule instead, " 140 | "like some_grammar['some_rule'].parse(...).") 141 | 142 | def __str__(self): 143 | """Return a rule string that, when passed to the constructor, would 144 | reconstitute the grammar.""" 145 | exprs = [self.default_rule] if self.default_rule else [] 146 | exprs.extend(expr for expr in itervalues(self) if 147 | expr is not self.default_rule) 148 | return '\n'.join(expr.as_rule() for expr in exprs) 149 | 150 | def __repr__(self): 151 | """Return an expression that will reconstitute the grammar.""" 152 | codec = 'string_escape' if PY2 else 'unicode_escape' 153 | return "Grammar('%s')" % str(self).encode(codec) 154 | 155 | 156 | class TokenGrammar(Grammar): 157 | """A Grammar which takes a list of pre-lexed tokens instead of text 158 | 159 | This is useful if you want to do the lexing yourself, as a separate pass: 160 | for example, to implement indentation-based languages. 
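    An illustrative sketch (mirroring the TokenGrammar tests)::

        from parsimonious.utils import Token

        grammar = TokenGrammar('''
            foo = "token1" "token2"
            ''')
        grammar.parse([Token('token1'), Token('token2')])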
161 | 162 | """ 163 | def _expressions_from_rules(self, rules, custom_rules): 164 | tree = rule_grammar.parse(rules) 165 | return TokenRuleVisitor(custom_rules).visit(tree) 166 | 167 | 168 | class BootstrappingGrammar(Grammar): 169 | """The grammar used to recognize the textual rules that describe other 170 | grammars 171 | 172 | This grammar gets its start from some hard-coded Expressions and claws its 173 | way from there to an expression tree that describes how to parse the 174 | grammar description syntax. 175 | 176 | """ 177 | def _expressions_from_rules(self, rule_syntax, custom_rules): 178 | """Return the rules for parsing the grammar definition syntax. 179 | 180 | Return a 2-tuple: a dict of rule names pointing to their expressions, 181 | and then the top-level expression for the first rule. 182 | 183 | """ 184 | # Hard-code enough of the rules to parse the grammar that describes the 185 | # grammar description language, to bootstrap: 186 | comment = Regex(r'#[^\r\n]*', name='comment') 187 | meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness') 188 | _ = ZeroOrMore(meaninglessness, name='_') 189 | equals = Sequence(Literal('='), _, name='equals') 190 | label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label') 191 | reference = Sequence(label, Not(equals), name='reference') 192 | quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier') 193 | # This pattern supports empty literals. TODO: A problem? 194 | spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"', 195 | ignore_case=True, 196 | dot_all=True, 197 | name='spaceless_literal') 198 | literal = Sequence(spaceless_literal, _, name='literal') 199 | regex = Sequence(Literal('~'), 200 | literal, 201 | Regex('[ilmsux]*', ignore_case=True), 202 | _, 203 | name='regex') 204 | atom = OneOf(reference, literal, regex, name='atom') 205 | quantified = Sequence(atom, quantifier, name='quantified') 206 | 207 | term = OneOf(quantified, atom, name='term') 208 | not_term = Sequence(Literal('!'), term, _, name='not_term') 209 | term.members = (not_term,) + term.members 210 | 211 | sequence = Sequence(term, OneOrMore(term), name='sequence') 212 | or_term = Sequence(Literal('/'), _, term, name='or_term') 213 | ored = Sequence(term, OneOrMore(or_term), name='ored') 214 | expression = OneOf(ored, sequence, term, name='expression') 215 | rule = Sequence(label, equals, expression, name='rule') 216 | rules = Sequence(_, OneOrMore(rule), name='rules') 217 | 218 | # Use those hard-coded rules to parse the (more extensive) rule syntax. 219 | # (For example, unless I start using parentheses in the rule language 220 | # definition itself, I should never have to hard-code expressions for 221 | # those above.) 222 | 223 | rule_tree = rules.parse(rule_syntax) 224 | 225 | # Turn the parse tree into a map of expressions: 226 | return RuleVisitor().visit(rule_tree) 227 | 228 | 229 | # The grammar for parsing PEG grammar definitions: 230 | # This is a nice, simple grammar. We may someday add to it, but it's a safe bet 231 | # that the future will always be a superset of this. 232 | rule_syntax = (r''' 233 | # Ignored things (represented by _) are typically hung off the end of the 234 | # leafmost kinds of nodes. Literals like "/" count as leaves. 235 | 236 | rules = _ rule* 237 | rule = label equals expression 238 | equals = "=" _ 239 | literal = spaceless_literal _ 240 | 241 | # So you can't spell a regex like `~"..." 
ilm`: 242 | spaceless_literal = ~"u?r?\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""is / 243 | ~"u?r?'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'"is 244 | 245 | expression = ored / sequence / term 246 | or_term = "/" _ term 247 | ored = term or_term+ 248 | sequence = term term+ 249 | not_term = "!" term _ 250 | lookahead_term = "&" term _ 251 | term = not_term / lookahead_term / quantified / atom 252 | quantified = atom quantifier 253 | atom = reference / literal / regex / parenthesized 254 | regex = "~" spaceless_literal ~"[ilmsux]*"i _ 255 | parenthesized = "(" _ expression ")" _ 256 | quantifier = ~"[*+?]" _ 257 | reference = label !equals 258 | 259 | # A subsequent equal sign is the only thing that distinguishes a label 260 | # (which begins a new rule) from a reference (which is just a pointer to a 261 | # rule defined somewhere else): 262 | label = ~"[a-zA-Z_][a-zA-Z_0-9]*" _ 263 | 264 | # _ = ~r"\s*(?:#[^\r\n]*)?\s*" 265 | _ = meaninglessness* 266 | meaninglessness = ~r"\s+" / comment 267 | comment = ~r"#[^\r\n]*" 268 | ''') 269 | 270 | 271 | class LazyReference(text_type): 272 | """A lazy reference to a rule, which we resolve after grokking all the 273 | rules""" 274 | 275 | name = u'' 276 | 277 | # Just for debugging: 278 | def _as_rhs(self): 279 | return u'' % self 280 | 281 | 282 | class RuleVisitor(NodeVisitor): 283 | """Turns a parse tree of a grammar definition into a map of ``Expression`` 284 | objects 285 | 286 | This is the magic piece that breathes life into a parsed bunch of parse 287 | rules, allowing them to go forth and parse other things. 288 | 289 | """ 290 | quantifier_classes = {'?': Optional, '*': ZeroOrMore, '+': OneOrMore} 291 | 292 | visit_expression = visit_term = visit_atom = NodeVisitor.lift_child 293 | 294 | def __init__(self, custom_rules=None): 295 | """Construct. 296 | 297 | :arg custom_rules: A dict of {rule name: expression} holding custom 298 | rules which will take precedence over the others 299 | 300 | """ 301 | self.custom_rules = custom_rules or {} 302 | 303 | def visit_parenthesized(self, node, parenthesized): 304 | """Treat a parenthesized subexpression as just its contents. 305 | 306 | Its position in the tree suffices to maintain its grouping semantics. 307 | 308 | """ 309 | left_paren, _, expression, right_paren, _ = parenthesized 310 | return expression 311 | 312 | def visit_quantifier(self, node, quantifier): 313 | """Turn a quantifier into just its symbol-matching node.""" 314 | symbol, _ = quantifier 315 | return symbol 316 | 317 | def visit_quantified(self, node, quantified): 318 | atom, quantifier = quantified 319 | return self.quantifier_classes[quantifier.text](atom) 320 | 321 | def visit_lookahead_term(self, node, lookahead_term): 322 | ampersand, term, _ = lookahead_term 323 | return Lookahead(term) 324 | 325 | def visit_not_term(self, node, not_term): 326 | exclamation, term, _ = not_term 327 | return Not(term) 328 | 329 | def visit_rule(self, node, rule): 330 | """Assign a name to the Expression and return it.""" 331 | label, equals, expression = rule 332 | expression.name = label # Assign a name to the expr. 333 | return expression 334 | 335 | def visit_sequence(self, node, sequence): 336 | """A parsed Sequence looks like [term node, OneOrMore node of 337 | ``another_term``s]. 
Flatten it out.""" 338 | term, other_terms = sequence 339 | return Sequence(term, *other_terms) 340 | 341 | def visit_ored(self, node, ored): 342 | first_term, other_terms = ored 343 | return OneOf(first_term, *other_terms) 344 | 345 | def visit_or_term(self, node, or_term): 346 | """Return just the term from an ``or_term``. 347 | 348 | We already know it's going to be ored, from the containing ``ored``. 349 | 350 | """ 351 | slash, _, term = or_term 352 | return term 353 | 354 | def visit_label(self, node, label): 355 | """Turn a label into a unicode string.""" 356 | name, _ = label 357 | return name.text 358 | 359 | def visit_reference(self, node, reference): 360 | """Stick a :class:`LazyReference` in the tree as a placeholder. 361 | 362 | We resolve them all later. 363 | 364 | """ 365 | label, not_equals = reference 366 | return LazyReference(label) 367 | 368 | def visit_regex(self, node, regex): 369 | """Return a ``Regex`` expression.""" 370 | tilde, literal, flags, _ = regex 371 | flags = flags.text.upper() 372 | pattern = literal.literal # Pull the string back out of the Literal 373 | # object. 374 | return Regex(pattern, ignore_case='I' in flags, 375 | locale='L' in flags, 376 | multiline='M' in flags, 377 | dot_all='S' in flags, 378 | unicode='U' in flags, 379 | verbose='X' in flags) 380 | 381 | def visit_spaceless_literal(self, spaceless_literal, visited_children): 382 | """Turn a string literal into a ``Literal`` that recognizes it.""" 383 | return Literal(evaluate_string(spaceless_literal.text)) 384 | 385 | def visit_literal(self, node, literal): 386 | """Pick just the literal out of a literal-and-junk combo.""" 387 | spaceless_literal, _ = literal 388 | return spaceless_literal 389 | 390 | def generic_visit(self, node, visited_children): 391 | """Replace childbearing nodes with a list of their children; keep 392 | others untouched. 393 | 394 | For our case, if a node has children, only the children are important. 395 | Otherwise, keep the node around for (for example) the flags of the 396 | regex rule. Most of these kept-around nodes are subsequently thrown 397 | away by the other visitor methods. 398 | 399 | We can't simply hang the visited children off the original node; that 400 | would be disastrous if the node occurred in more than one place in the 401 | tree. 402 | 403 | """ 404 | return visited_children or node # should semantically be a tuple 405 | 406 | def _resolve_refs(self, rule_map, expr, done): 407 | """Return an expression with all its lazy references recursively 408 | resolved. 409 | 410 | Resolve any lazy references in the expression ``expr``, recursing into 411 | all subexpressions. 412 | 413 | :arg done: The set of Expressions that have already been or are 414 | currently being resolved, to ward off redundant work and prevent 415 | infinite recursion for circular refs 416 | 417 | """ 418 | if isinstance(expr, LazyReference): 419 | label = text_type(expr) 420 | try: 421 | reffed_expr = rule_map[label] 422 | except KeyError: 423 | raise UndefinedLabel(expr) 424 | return self._resolve_refs(rule_map, reffed_expr, done) 425 | else: 426 | if getattr(expr, 'members', ()) and expr not in done: 427 | # Prevents infinite recursion for circular refs. At worst, one 428 | # of `expr.members` can refer back to `expr`, but it can't go 429 | # any farther. 
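                # E.g., with `pair = item item` and `item = "x"` (illustrative
                # rule names, not from the codebase): `pair` is a Sequence
                # whose members start out as two LazyReferences; each gets
                # looked up in rule_map and replaced by the Literal for
                # `item`, which has no members of its own and is returned
                # unchanged.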
430 | done.add(expr) 431 | expr.members = [self._resolve_refs(rule_map, member, done) 432 | for member in expr.members] 433 | return expr 434 | 435 | def visit_rules(self, node, rules_list): 436 | """Collate all the rules into a map. Return (map, default rule). 437 | 438 | The default rule is the first one. Or, if you have more than one rule 439 | of that name, it's the last-occurring rule of that name. (This lets you 440 | override the default rule when you extend a grammar.) If there are no 441 | string-based rules, the default rule is None, because the custom rules, 442 | due to being kwarg-based, are unordered. 443 | 444 | """ 445 | _, rules = rules_list 446 | 447 | # Map each rule's name to its Expression. Later rules of the same name 448 | # override earlier ones. This lets us define rules multiple times and 449 | # have the last declaration win, so you can extend grammars by 450 | # concatenation. 451 | rule_map = dict((expr.name, expr) for expr in rules) 452 | 453 | # And custom rules override string-based rules. This is the least 454 | # surprising choice when you compare the dict constructor: 455 | # dict({'x': 5}, x=6). 456 | rule_map.update(self.custom_rules) 457 | 458 | # Resolve references. This tolerates forward references. 459 | done = set() 460 | rule_map = dict((expr.name, self._resolve_refs(rule_map, expr, done)) 461 | for expr in itervalues(rule_map)) 462 | 463 | # isinstance() is a temporary hack around the fact that * rules don't 464 | # always get transformed into lists by NodeVisitor. We should fix that; 465 | # it's surprising and requires writing lame branches like this. 466 | return rule_map, (rule_map[rules[0].name] 467 | if isinstance(rules, list) and rules else None) 468 | 469 | 470 | class TokenRuleVisitor(RuleVisitor): 471 | """A visitor which builds expression trees meant to work on sequences of 472 | pre-lexed tokens rather than strings""" 473 | 474 | def visit_spaceless_literal(self, spaceless_literal, visited_children): 475 | """Turn a string literal into a ``TokenMatcher`` that matches 476 | ``Token`` objects by their ``type`` attributes.""" 477 | return TokenMatcher(evaluate_string(spaceless_literal.text)) 478 | 479 | def visit_regex(self, node, regex): 480 | tilde, literal, flags, _ = regex 481 | raise BadGrammar('Regexes do not make sense in TokenGrammars, since ' 482 | 'TokenGrammars operate on pre-lexed tokens rather ' 483 | 'than characters.') 484 | 485 | 486 | # Bootstrap to level 1... 487 | rule_grammar = BootstrappingGrammar(rule_syntax) 488 | # ...and then to level 2. This establishes that the node tree of our rule 489 | # syntax is built by the same machinery that will build trees of our users' 490 | # grammars. And the correctness of that tree is tested, indirectly, in 491 | # test_grammar. 492 | rule_grammar = Grammar(rule_syntax) 493 | 494 | 495 | # TODO: Teach Expression trees how to spit out Python representations of 496 | # themselves. Then we can just paste that in above, and we won't have to 497 | # bootstrap on import. Though it'll be a little less DRY. [Ah, but this is not 498 | # so clean, because it would have to output multiple statements to get multiple 499 | # refs to a single expression hooked up.] 500 | --------------------------------------------------------------------------------
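A final illustrative sketch (not one of the repository files above): as the
``RuleVisitor.visit_rules`` docstring explains, later rules of a given name
override earlier ones, so a grammar can be extended by concatenating new rules
onto an existing rule string:

    from parsimonious.grammar import Grammar

    base_rules = '''
        greeting = "Hi"
    '''

    # The later `greeting` wins, and the default rule (named after the first
    # rule parsed) resolves to the overriding two-alternative version:
    extended = Grammar(base_rules + '''
        greeting = "Hi" / "Hello"
    ''')
    extended.parse('Hello')  # Matches via the added alternative.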