├── parsimonious
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_benchmarks.py
│   │   ├── benchmarks.py
│   │   ├── test_nodes.py
│   │   ├── test_expressions.py
│   │   └── test_grammar.py
│   ├── __init__.py
│   ├── utils.py
│   ├── exceptions.py
│   ├── nodes.py
│   ├── expressions.py
│   └── grammar.py
├── MANIFEST.in
├── .gitignore
├── .travis.yml
├── tox.ini
├── LICENSE
├── setup.py
└── README.rst

/parsimonious/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include LICENSE
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .tox
2 | *.egg-info
3 | *.egg
4 | *.pyc
5 | build
6 | dist
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - "2.6"
4 |   - "2.7"
5 |   - "3.3"
6 |   - "3.4"
7 |   - "3.5"
8 | install:
9 |   - pip install tox tox-travis
10 | script:
11 |   - tox
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py26, py27, py33, py34, py35
3 | 
4 | [tox:travis]
5 | 2.6 = py26
6 | 2.7 = py27
7 | 3.3 = py33
8 | 3.4 = py34
9 | 3.5 = py35
10 | 
11 | [testenv]
12 | usedevelop = True
13 | commands = nosetests parsimonious
14 | deps = nose
15 | # So Python 3 doesn't pick up incompatible, un-2to3'd source from the cwd:
16 | changedir = .tox
--------------------------------------------------------------------------------
/parsimonious/__init__.py:
--------------------------------------------------------------------------------
1 | """Parsimonious's public API. Import from here.
2 | 
3 | Things may move around in modules deeper than this one.
4 | 
5 | """
6 | from parsimonious.exceptions import (ParseError, IncompleteParseError,
7 |                                      VisitationError, UndefinedLabel)
8 | from parsimonious.grammar import Grammar, TokenGrammar
9 | from parsimonious.nodes import NodeVisitor, rule
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012 Erik Rose
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
7 | of the Software, and to permit persons to whom the Software is furnished to do
8 | so, subject to the following conditions:
9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 
--------------------------------------------------------------------------------
/parsimonious/utils.py:
--------------------------------------------------------------------------------
1 | """General tools which don't depend on other parts of Parsimonious"""
2 | 
3 | import ast
4 | from sys import version_info
5 | 
6 | from six import python_2_unicode_compatible
7 | 
8 | 
9 | class StrAndRepr(object):
10 |     """Mix-in to add a ``__repr__`` which returns the value of ``__str__``
11 |     (UTF-8-encoded on Python 2)"""
12 | 
13 |     if version_info >= (3,):
14 |         # Don't return the "bytes" type from Python 3's __repr__:
15 |         def __repr__(self):
16 |             return self.__str__()
17 |     else:
18 |         def __repr__(self):
19 |             return self.__str__().encode('utf-8')
20 | 
21 | 
22 | def evaluate_string(string):
23 |     """Piggyback on Python's string support so we can have backslash escaping
24 |     and niceties like \n, \t, etc. string.decode('string_escape') would have
25 |     been a lower-level possibility.
26 | 
27 |     """
28 |     return ast.literal_eval(string)
29 | 
30 | 
31 | @python_2_unicode_compatible
32 | class Token(StrAndRepr):
33 |     """A class to represent tokens, for use with TokenGrammars
34 | 
35 |     You will likely want to subclass this to hold additional information, like
36 |     the characters that you lexed to create this token. Alternately, feel free
37 |     to create your own class from scratch. The only contract is that tokens
38 |     must have a ``type`` attr.
39 | 
40 |     """
41 |     __slots__ = ['type']
42 | 
43 |     def __init__(self, type):
44 |         self.type = type
45 | 
46 |     def __str__(self):
47 |         return u'<Token "%s">' % (self.type,)
48 | 
49 |     def __eq__(self, other):
50 |         return self.type == other.type
--------------------------------------------------------------------------------
/parsimonious/tests/test_benchmarks.py:
--------------------------------------------------------------------------------
1 | """Tests to show that the benchmarks we based our speed optimizations on are
2 | still valid"""
3 | 
4 | from functools import partial
5 | from timeit import timeit
6 | 
7 | from nose.tools import ok_
8 | 
9 | 
10 | timeit = partial(timeit, number=500000)
11 | 
12 | 
13 | def test_lists_vs_dicts():
14 |     """See what's faster at int key lookup: dicts or lists."""
15 |     list_time = timeit('item = l[9000]', 'l = [0] * 10000')
16 |     dict_time = timeit('item = d[9000]', 'd = dict((x, 0) for x in range(10000))')
17 | 
18 |     # Dicts take about 1.6x as long as lists in Python 2.6 and 2.7.
19 |     ok_(list_time < dict_time, '%s < %s' % (list_time, dict_time))
20 | 
21 | 
22 | def test_call_vs_inline():
23 |     """How bad is the calling penalty?"""
24 |     no_call = timeit('l[0] += 1', 'l = [0]')
25 |     call = timeit('add(); l[0] += 1', 'l = [0]\n'
26 |                                       'def add():\n'
27 |                                       '    pass')
28 | 
29 |     # Calling a function is pretty fast; it takes just 1.2x as long as the
30 |     # global var access and addition in l[0] += 1.
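    # (Ratios like the 1.2x above vary by interpreter and hardware; the
    # assertion below checks only the ordering, not the magnitude.)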
31 |     ok_(no_call < call, '%s (no call) < %s (call)' % (no_call, call))
32 | 
33 | 
34 | def test_startswith_vs_regex():
35 |     """Can I beat the speed of regexes by special-casing literals?"""
36 |     re_time = timeit(
37 |         'r.match(t, 19)',
38 |         'import re\n'
39 |         "r = re.compile('hello')\n"
40 |         "t = 'this is the finest hello ever'")
41 |     startswith_time = timeit("t.startswith('hello', 19)",
42 |                              "t = 'this is the finest hello ever'")
43 | 
44 |     # Regexes take 2.24x as long as simple string matching.
45 |     ok_(startswith_time < re_time,
46 |         '%s (startswith) < %s (re)' % (startswith_time, re_time))
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from sys import version_info
2 | 
3 | # Prevent spurious errors during `python setup.py test` in 2.6, a la
4 | # http://www.eby-sarna.com/pipermail/peak/2010-May/003357.html:
5 | try:
6 |     import multiprocessing
7 | except ImportError:
8 |     pass
9 | 
10 | from io import open
11 | from setuptools import setup, find_packages
12 | 
13 | long_description = open('README.rst', 'r', encoding='utf8').read()
14 | 
15 | setup(
16 |     name='parsimonious',
17 |     version='0.7.0',
18 |     description='(Soon to be) the fastest pure-Python PEG parser I could muster',
19 |     long_description=long_description,
20 |     author='Erik Rose',
21 |     author_email='erikrose@grinchcentral.com',
22 |     license='MIT',
23 |     packages=find_packages(exclude=['ez_setup']),
24 |     tests_require=['nose'],
25 |     test_suite='nose.collector',
26 |     url='https://github.com/erikrose/parsimonious',
27 |     include_package_data=True,
28 |     install_requires=['six'],
29 |     classifiers=[
30 |         'Intended Audience :: Developers',
31 |         'Natural Language :: English',
32 |         'Development Status :: 3 - Alpha',
33 |         'License :: OSI Approved :: MIT License',
34 |         'Operating System :: OS Independent',
35 |         'Programming Language :: Python :: 2',
36 |         'Programming Language :: Python :: 2.6',
37 |         'Programming Language :: Python :: 2.7',
38 |         'Programming Language :: Python :: 3',
39 |         'Programming Language :: Python :: 3.3',
40 |         'Programming Language :: Python :: 3.4',
41 |         'Programming Language :: Python :: 3.5',
42 |         'Topic :: Scientific/Engineering :: Information Analysis',
43 |         'Topic :: Software Development :: Libraries',
44 |         'Topic :: Text Processing :: General'],
45 |     keywords=['parse', 'parser', 'parsing', 'peg', 'packrat', 'grammar', 'language'],
46 | )
47 | 
--------------------------------------------------------------------------------
/parsimonious/tests/benchmarks.py:
--------------------------------------------------------------------------------
1 | """Benchmarks for Parsimonious
2 | 
3 | Run these with ``nosetests parsimonious/tests/benchmarks.py``. They don't run
4 | during normal test runs because they're not tests--they don't assert anything.
5 | Also, they're a bit slow.
6 | 
7 | These differ from the ones in test_benchmarks in that these are meant to be
8 | compared from revision to revision of Parsimonious to make sure we're not
9 | getting slower. test_benchmarks simply makes sure our choices among
10 | implementation alternatives remain valid.
11 | 
12 | """
13 | # These aren't really tests, as they don't assert anything, but I found myself
14 | # rewriting nose's discovery and selection bits, so why not just use nose?
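# A quick way to time a single grammar by hand (a sketch; this one-rule
# grammar is a stand-in, not part of this module):
#
#     from timeit import repeat
#     from parsimonious.grammar import Grammar
#     g = Grammar('greeting = "hi"')
#     print(min(repeat(lambda: g.parse('hi'), repeat=5, number=1000)))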
15 | 
16 | import gc
17 | from timeit import repeat
18 | 
19 | from parsimonious.grammar import Grammar
20 | 
21 | 
22 | def test_not_really_json_parsing():
23 |     """As a baseline for speed, parse some JSON.
24 | 
25 |     I have no reason to believe that JSON is a particularly representative or
26 |     revealing grammar to test with. Also, this is a naive, unoptimized,
27 |     incorrect grammar, so don't use it as a basis for comparison with other
28 |     parsers. It's just meant to compare across versions of Parsimonious.
29 | 
30 |     """
31 |     father = """{
32 |         "id" : 1,
33 |         "married" : true,
34 |         "name" : "Larry Lopez",
35 |         "sons" : null,
36 |         "daughters" : [
37 |           {
38 |             "age" : 26,
39 |             "name" : "Sandra"
40 |           },
41 |           {
42 |             "age" : 25,
43 |             "name" : "Margaret"
44 |           },
45 |           {
46 |             "age" : 6,
47 |             "name" : "Mary"
48 |           }
49 |         ]
50 |       }"""
51 |     more_fathers = ','.join([father] * 60)
52 |     json = '{"fathers" : [' + more_fathers + ']}'
53 |     grammar = Grammar(r"""
54 |         value = space (string / number / object / array / true_false_null)
55 |                 space
56 | 
57 |         object = "{" members "}"
58 |         members = (pair ("," pair)*)?
59 |         pair = string ":" value
60 |         array = "[" elements "]"
61 |         elements = (value ("," value)*)?
62 |         true_false_null = "true" / "false" / "null"
63 | 
64 |         string = space "\"" chars "\"" space
65 |         chars = ~"[^\"]*"  # TODO implement the real thing
66 |         number = (int frac exp) / (int exp) / (int frac) / int
67 |         int = "-"? ((digit1to9 digits) / digit)
68 |         frac = "." digits
69 |         exp = e digits
70 |         digits = digit+
71 |         e = "e+" / "e-" / "e" / "E+" / "E-" / "E"
72 | 
73 |         digit1to9 = ~"[1-9]"
74 |         digit = ~"[0-9]"
75 |         space = ~"\s*"
76 |     """)
77 | 
78 |     # These number and repetition values seem to keep results within 5% of the
79 |     # difference between min and max. We get more consistent results running a
80 |     # bunch of single-parse tests and taking the min rather than upping the
81 |     # NUMBER and trying to stomp out the outliers with averaging.
82 |     NUMBER = 1
83 |     REPEAT = 5
84 |     total_seconds = min(repeat(lambda: grammar.parse(json),
85 |                                lambda: gc.enable(),  # so we take into account how we treat the GC
86 |                                repeat=REPEAT,
87 |                                number=NUMBER))
88 |     seconds_each = total_seconds / NUMBER
89 | 
90 |     kb = len(json) / 1024.0
91 |     print('Took %.3fs to parse %.1fKB: %.0fKB/s.' % (seconds_each,
92 |                                                      kb,
93 |                                                      kb / seconds_each))
94 | 
--------------------------------------------------------------------------------
/parsimonious/exceptions.py:
--------------------------------------------------------------------------------
1 | from six import text_type, python_2_unicode_compatible
2 | 
3 | from parsimonious.utils import StrAndRepr
4 | 
5 | 
6 | @python_2_unicode_compatible
7 | class ParseError(StrAndRepr, Exception):
8 |     """A call to ``Expression.parse()`` or ``match()`` didn't match."""
9 | 
10 |     def __init__(self, text, pos=-1, expr=None):
11 |         # It would be nice to use self.args, but I don't want to pay a penalty
12 |         # to call descriptors or have the confusion of numerical indices in
13 |         # Expression.match_core().
14 |         self.text = text
15 |         self.pos = pos
16 |         self.expr = expr
17 | 
18 |     def __str__(self):
19 |         rule_name = ((u"'%s'" % self.expr.name) if self.expr.name else
20 |                      text_type(self.expr))
21 |         return u"Rule %s didn't match at '%s' (line %s, column %s)." % (
22 |             rule_name,
23 |             self.text[self.pos:self.pos + 20],
24 |             self.line(),
25 |             self.column())
26 | 
27 |     # TODO: Add line, col, and separated-out error message so callers can build
28 |     # their own presentation.
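    # For example (a sketch; the one-rule grammar is hypothetical):
    #
    #   try:
    #       Grammar('greeting = "hi"').parse('yo')
    #   except ParseError as error:
    #       error.pos, error.line(), error.column()  # 0, 1, 1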
29 | 
30 |     def line(self):
31 |         """Return the 1-based line number where the expression ceased to
32 |         match."""
33 |         # This is a method rather than a property in case we ever wanted to
34 |         # pass in which line endings we want to use.
35 |         return self.text.count('\n', 0, self.pos) + 1
36 | 
37 |     def column(self):
38 |         """Return the 1-based column where the expression ceased to match."""
39 |         # We choose 1-based because that's what Python does with SyntaxErrors.
40 |         try:
41 |             return self.pos - self.text.rindex('\n', 0, self.pos)
42 |         except ValueError:
43 |             return self.pos + 1
44 | 
45 | 
46 | @python_2_unicode_compatible
47 | class IncompleteParseError(ParseError):
48 |     """A call to ``parse()`` matched a whole Expression but did not consume the
49 |     entire text."""
50 | 
51 |     def __str__(self):
52 |         return u"Rule '%s' matched in its entirety, but it didn't consume all the text. The non-matching portion of the text begins with '%s' (line %s, column %s)." % (
53 |             self.expr.name,
54 |             self.text[self.pos:self.pos + 20],
55 |             self.line(),
56 |             self.column())
57 | 
58 | 
59 | class VisitationError(Exception):
60 |     """Something went wrong while traversing a parse tree.
61 | 
62 |     This exception exists to augment an underlying exception with information
63 |     about where in the parse tree the error occurred. Otherwise, it could be
64 |     tiresome to figure out what went wrong; you'd have to play back the whole
65 |     tree traversal in your head.
66 | 
67 |     """
68 |     # TODO: Make sure this is pickleable. Probably use @property pattern. Make
69 |     # the original exc and node available on it if they don't cause a whole
70 |     # raft of stack frames to be retained.
71 |     def __init__(self, exc, exc_class, node):
72 |         """Construct.
73 | 
74 |         :arg exc: What went wrong. We wrap this and add more info.
75 |         :arg node: The node at which the error occurred
76 | 
77 |         """
78 |         self.original_class = exc_class
79 |         super(VisitationError, self).__init__(
80 |             '%s: %s\n\n'
81 |             'Parse tree:\n'
82 |             '%s' %
83 |             (exc_class.__name__,
84 |              exc,
85 |              node.prettily(error=node)))
86 | 
87 | 
88 | class BadGrammar(StrAndRepr, Exception):
89 |     """Something was wrong with the definition of a grammar.
90 | 
91 |     Note that a ParseError might be raised instead if the error is in the
92 |     grammar definition syntax.
93 | 
94 |     """
95 | 
96 | 
97 | @python_2_unicode_compatible
98 | class UndefinedLabel(BadGrammar):
99 |     """A rule referenced in a grammar was never defined.
100 | 
101 |     Circular references and forward references are okay, but you have to define
102 |     stuff at some point.
103 | 
104 |     """
105 |     def __init__(self, label):
106 |         self.label = label
107 | 
108 |     def __str__(self):
109 |         return u'The label "%s" was never defined.' % self.label
110 | 
--------------------------------------------------------------------------------
/parsimonious/tests/test_nodes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from nose import SkipTest
3 | from nose.tools import eq_, ok_, assert_raises
4 | 
5 | from parsimonious import Grammar, NodeVisitor, VisitationError, rule
6 | from parsimonious.nodes import Node
7 | 
8 | 
9 | class HtmlFormatter(NodeVisitor):
10 |     """Visitor that turns a parse tree into HTML fragments"""
11 | 
12 |     grammar = Grammar("""bold_open = '(('""")  # just partial
13 | 
14 |     def visit_bold_open(self, node, visited_children):
15 |         return '<b>'
16 | 
17 |     def visit_bold_close(self, node, visited_children):
18 |         return '</b>'
19 | 
20 |     def visit_text(self, node, visited_children):
21 |         """Return the text verbatim."""
22 |         return node.text
23 | 
24 |     def visit_bold_text(self, node, visited_children):
25 |         return ''.join(visited_children)
26 | 
27 | 
28 | class ExplosiveFormatter(NodeVisitor):
29 |     """Visitor which raises exceptions"""
30 | 
31 |     def visit_boom(self, node, visited_children):
32 |         raise ValueError
33 | 
34 | 
35 | def test_visitor():
36 |     """Assert a tree gets visited correctly.
37 | 
38 |     We start with a tree from applying this grammar... ::
39 | 
40 |         bold_text  = bold_open text bold_close
41 |         text       = ~'[a-zA-Z 0-9]*'
42 |         bold_open  = '(('
43 |         bold_close = '))'
44 | 
45 |     ...to this text::
46 | 
47 |         ((o hai))
48 | 
49 |     """
50 |     text = '((o hai))'
51 |     tree = Node('bold_text', text, 0, 9,
52 |                 [Node('bold_open', text, 0, 2),
53 |                  Node('text', text, 2, 7),
54 |                  Node('bold_close', text, 7, 9)])
55 |     result = HtmlFormatter().visit(tree)
56 |     eq_(result, '<b>o hai</b>')
57 | 
58 | 
59 | def test_visitation_exception():
60 |     assert_raises(VisitationError,
61 |                   ExplosiveFormatter().visit,
62 |                   Node('boom', '', 0, 0))
63 | 
64 | 
65 | def test_str():
66 |     """Test str and unicode of ``Node``."""
67 |     n = Node('text', 'o hai', 0, 5)
68 |     good = '<Node called "text" matching "o hai">'
69 |     eq_(str(n), good)
70 | 
71 | 
72 | def test_repr():
73 |     """Test repr of ``Node``."""
74 |     s = u'hai ö'
75 |     boogie = u'böogie'
76 |     n = Node(boogie, s, 0, 3, children=[
77 |         Node('', s, 3, 4), Node('', s, 4, 5)])
78 |     eq_(repr(n), """s = {hai_o}\nNode({boogie}, s, 0, 3, children=[Node('', s, 3, 4), Node('', s, 4, 5)])""".format(hai_o=repr(s), boogie=repr(boogie)))
79 | 
80 | 
81 | def test_parse_shortcut():
82 |     """Exercise the simple case in which the visitor takes care of parsing."""
83 |     eq_(HtmlFormatter().parse('(('), '<b>')
84 | 
85 | 
86 | def test_match_shortcut():
87 |     """Exercise the simple case in which the visitor takes care of matching."""
88 |     eq_(HtmlFormatter().match('((other things'), '<b>')
89 | 
90 | 
91 | class CoupledFormatter(NodeVisitor):
92 |     @rule('bold_open text bold_close')
93 |     def visit_bold_text(self, node, visited_children):
94 |         return ''.join(visited_children)
95 | 
96 |     @rule('"(("')
97 |     def visit_bold_open(self, node, visited_children):
98 |         return '<b>'
99 | 
100 |     @rule('"))"')
101 |     def visit_bold_close(self, node, visited_children):
102 |         return '</b>'
103 | 
104 |     @rule('~"[a-zA-Z 0-9]*"')
105 |     def visit_text(self, node, visited_children):
106 |         """Return the text verbatim."""
107 |         return node.text
108 | 
109 | 
110 | def test_rule_decorator():
111 |     """Make sure the @rule decorator works."""
112 |     eq_(CoupledFormatter().parse('((hi))'), '<b>hi</b>')
113 | 
114 | 
115 | def test_rule_decorator_subclassing():
116 |     """Make sure we can subclass and override visitor methods without blowing
117 |     away the rules attached to them."""
118 |     class OverridingFormatter(CoupledFormatter):
119 |         def visit_text(self, node, visited_children):
120 |             """Return the text capitalized."""
121 |             return node.text.upper()
122 | 
123 |         @rule('"not used"')
124 |         def visit_useless(self, node, visited_children):
125 |             """Get in the way. Tempt the metaclass to pave over the
126 |             superclass's grammar with a new one."""
127 | 
128 |     raise SkipTest("I haven't got around to making this work yet.")
129 |     eq_(OverridingFormatter().parse('((hi))'), '<b>HI</b>')
130 | 
131 | 
132 | class PrimalScream(Exception):
133 |     pass
134 | 
135 | 
136 | def test_unwrapped_exceptions():
137 |     class Screamer(NodeVisitor):
138 |         grammar = Grammar("""greeting = 'howdy'""")
139 |         unwrapped_exceptions = (PrimalScream,)
140 | 
141 |         def visit_greeting(self, thing, visited_children):
142 |             raise PrimalScream('This should percolate up!')
143 | 
144 |     assert_raises(PrimalScream, Screamer().parse, 'howdy')
145 | 
146 | 
147 | def test_node_inequality():
148 |     node = Node('text', 'o hai', 0, 5)
149 |     ok_(node != 5)
150 |     ok_(node != None)
151 | 
--------------------------------------------------------------------------------
/parsimonious/tests/test_expressions.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | from unittest import TestCase
3 | 
4 | from nose.tools import eq_, ok_, assert_raises
5 | from six import text_type
6 | 
7 | from parsimonious.exceptions import ParseError, IncompleteParseError
8 | from parsimonious.expressions import (Literal, Regex, Sequence, OneOf, Not,
9 |                                       Optional, ZeroOrMore, OneOrMore, Expression)
10 | from parsimonious.grammar import Grammar, rule_grammar
11 | from parsimonious.nodes import Node
12 | 
13 | 
14 | def len_eq(node, length):
15 |     """Assert that the match length of ``node`` is ``length``.
16 | 
17 |     Makes tests shorter and lets them omit positional stuff they don't care
18 |     about.
19 | 
20 |     """
21 |     node_length = None if node is None else node.end - node.start
22 |     assert node_length == length
23 | 
24 | 
25 | class LengthTests(TestCase):
26 |     """Tests for returning the right lengths
27 | 
28 |     I wrote these before parse tree generation was implemented. They're
29 |     partially redundant with TreeTests.
30 | 31 | """ 32 | def test_regex(self): 33 | len_eq(Literal('hello').match('ehello', 1), 5) # simple 34 | len_eq(Regex('hello*').match('hellooo'), 7) # * 35 | assert_raises(ParseError, Regex('hello*').match, 'goodbye') # no match 36 | len_eq(Regex('hello', ignore_case=True).match('HELLO'), 5) 37 | 38 | def test_sequence(self): 39 | len_eq(Sequence(Regex('hi*'), Literal('lo'), Regex('.ingo')).match('hiiiilobingo1234'), 40 | 12) # succeed 41 | assert_raises(ParseError, Sequence(Regex('hi*'), Literal('lo'), Regex('.ingo')).match, 'hiiiilobing') # don't 42 | len_eq(Sequence(Regex('hi*')).match('>hiiii', 1), 43 | 5) # non-0 pos 44 | 45 | def test_one_of(self): 46 | len_eq(OneOf(Literal('aaa'), Literal('bb')).match('aaa'), 3) # first alternative 47 | len_eq(OneOf(Literal('aaa'), Literal('bb')).match('bbaaa'), 2) # second 48 | assert_raises(ParseError, OneOf(Literal('aaa'), Literal('bb')).match, 'aa') # no match 49 | 50 | def test_not(self): 51 | len_eq(Not(Regex('.')).match(''), 0) # match 52 | assert_raises(ParseError, Not(Regex('.')).match, 'Hi') # don't 53 | 54 | def test_optional(self): 55 | len_eq(Sequence(Optional(Literal('a')), Literal('b')).match('b'), 1) # contained expr fails 56 | len_eq(Sequence(Optional(Literal('a')), Literal('b')).match('ab'), 2) # contained expr succeeds 57 | 58 | def test_zero_or_more(self): 59 | len_eq(ZeroOrMore(Literal('b')).match(''), 0) # zero 60 | len_eq(ZeroOrMore(Literal('b')).match('bbb'), 3) # more 61 | 62 | len_eq(Regex('^').match(''), 0) # Validate the next test. 63 | 64 | # Try to make it loop infinitely using a zero-length contained expression: 65 | len_eq(ZeroOrMore(Regex('^')).match(''), 0) 66 | 67 | def test_one_or_more(self): 68 | len_eq(OneOrMore(Literal('b')).match('b'), 1) # one 69 | len_eq(OneOrMore(Literal('b')).match('bbb'), 3) # more 70 | len_eq(OneOrMore(Literal('b'), min=3).match('bbb'), 3) # with custom min; success 71 | assert_raises(ParseError, OneOrMore(Literal('b'), min=3).match, 'bb') # with custom min; failure 72 | len_eq(OneOrMore(Regex('^')).match('bb'), 0) # attempt infinite loop 73 | 74 | 75 | class TreeTests(TestCase): 76 | """Tests for building the right trees 77 | 78 | We have only to test successes here; failures (None-returning cases) are 79 | covered above. 
80 | 81 | """ 82 | def test_simple_node(self): 83 | """Test that leaf expressions like ``Literal`` make the right nodes.""" 84 | h = Literal('hello', name='greeting') 85 | eq_(h.match('hello'), Node('greeting', 'hello', 0, 5)) 86 | 87 | def test_sequence_nodes(self): 88 | """Assert that ``Sequence`` produces nodes with the right children.""" 89 | s = Sequence(Literal('heigh', name='greeting1'), 90 | Literal('ho', name='greeting2'), name='dwarf') 91 | text = 'heighho' 92 | eq_(s.match(text), Node('dwarf', text, 0, 7, children= 93 | [Node('greeting1', text, 0, 5), 94 | Node('greeting2', text, 5, 7)])) 95 | 96 | def test_one_of(self): 97 | """``OneOf`` should return its own node, wrapping the child that succeeds.""" 98 | o = OneOf(Literal('a', name='lit'), name='one_of') 99 | text = 'aa' 100 | eq_(o.match(text), Node('one_of', text, 0, 1, children=[ 101 | Node('lit', text, 0, 1)])) 102 | 103 | def test_optional(self): 104 | """``Optional`` should return its own node wrapping the succeeded child.""" 105 | expr = Optional(Literal('a', name='lit'), name='opt') 106 | 107 | text = 'a' 108 | eq_(expr.match(text), Node('opt', text, 0, 1, children=[ 109 | Node('lit', text, 0, 1)])) 110 | 111 | # Test failure of the Literal inside the Optional; the 112 | # LengthTests.test_optional is ambiguous for that. 113 | text = '' 114 | eq_(expr.match(text), Node('opt', text, 0, 0)) 115 | 116 | def test_zero_or_more_zero(self): 117 | """Test the 0 case of ``ZeroOrMore``; it should still return a node.""" 118 | expr = ZeroOrMore(Literal('a'), name='zero') 119 | text = '' 120 | eq_(expr.match(text), Node('zero', text, 0, 0)) 121 | 122 | def test_one_or_more_one(self): 123 | """Test the 1 case of ``OneOrMore``; it should return a node with a child.""" 124 | expr = OneOrMore(Literal('a', name='lit'), name='one') 125 | text = 'a' 126 | eq_(expr.match(text), Node('one', text, 0, 1, children=[ 127 | Node('lit', text, 0, 1)])) 128 | 129 | # Things added since Grammar got implemented are covered in integration 130 | # tests in test_grammar. 131 | 132 | 133 | class ParseTests(TestCase): 134 | """Tests for the ``parse()`` method""" 135 | 136 | def test_parse_success(self): 137 | """Make sure ``parse()`` returns the tree on success. 138 | 139 | There's not much more than that to test that we haven't already vetted 140 | above. 141 | 142 | """ 143 | expr = OneOrMore(Literal('a', name='lit'), name='more') 144 | text = 'aa' 145 | eq_(expr.parse(text), Node('more', text, 0, 2, children=[ 146 | Node('lit', text, 0, 1), 147 | Node('lit', text, 1, 2)])) 148 | 149 | 150 | class ErrorReportingTests(TestCase): 151 | """Tests for reporting parse errors""" 152 | 153 | def test_inner_rule_succeeding(self): 154 | """Make sure ``parse()`` fails and blames the 155 | rightward-progressing-most named Expression when an Expression isn't 156 | satisfied. 157 | 158 | Make sure ParseErrors have nice Unicode representations. 159 | 160 | """ 161 | grammar = Grammar(""" 162 | bold_text = open_parens text close_parens 163 | open_parens = "((" 164 | text = ~"[a-zA-Z]+" 165 | close_parens = "))" 166 | """) 167 | text = '((fred!!' 168 | try: 169 | grammar.parse(text) 170 | except ParseError as error: 171 | eq_(error.pos, 6) 172 | eq_(error.expr, grammar['close_parens']) 173 | eq_(error.text, text) 174 | eq_(text_type(error), "Rule 'close_parens' didn't match at '!!' 
(line 1, column 7).")
175 | 
176 |     def test_rewinding(self):
177 |         """Make sure rewinding the stack and trying an alternative (which
178 |         progresses farther) from a higher-level rule can blame an expression
179 |         within the alternative on failure.
180 | 
181 |         There's no particular reason I suspect this wouldn't work, but it's a
182 |         more real-world example than the no-alternative cases already tested.
183 | 
184 |         """
185 |         grammar = Grammar("""
186 |             formatted_text = bold_text / weird_text
187 |             bold_text = open_parens text close_parens
188 |             weird_text = open_parens text "!!" bork
189 |             bork = "bork"
190 |             open_parens = "(("
191 |             text = ~"[a-zA-Z]+"
192 |             close_parens = "))"
193 |             """)
194 |         text = '((fred!!'
195 |         try:
196 |             grammar.parse(text)
197 |         except ParseError as error:
198 |             eq_(error.pos, 8)
199 |             eq_(error.expr, grammar['bork'])
200 |             eq_(error.text, text)
201 | 
202 |     def test_no_named_rule_succeeding(self):
203 |         """Make sure ParseErrors have sane printable representations even if we
204 |         never succeeded in matching any named expressions."""
205 |         grammar = Grammar('''bork = "bork"''')
206 |         try:
207 |             grammar.parse('snork')
208 |         except ParseError as error:
209 |             eq_(error.pos, 0)
210 |             eq_(error.expr, grammar['bork'])
211 |             eq_(error.text, 'snork')
212 | 
213 |     def test_parse_with_leftovers(self):
214 |         """Make sure ``parse()`` reports where we started failing to match,
215 |         even if a partial match was successful."""
216 |         grammar = Grammar(r'''sequence = "chitty" (" " "bang")+''')
217 |         try:
218 |             grammar.parse('chitty bangbang')
219 |         except IncompleteParseError as error:
220 |             eq_(text_type(error), u"Rule 'sequence' matched in its entirety, but it didn't consume all the text. The non-matching portion of the text begins with 'bang' (line 1, column 12).")
221 | 
222 |     def test_favoring_named_rules(self):
223 |         """Named rules should be used in error messages in favor of anonymous
224 |         ones, even if those are rightward-progressing-more, and even if the
225 |         failure starts at position 0."""
226 |         grammar = Grammar(r'''starts_with_a = &"a" ~"[a-z]+"''')
227 |         try:
228 |             grammar.parse('burp')
229 |         except ParseError as error:
230 |             eq_(text_type(error), u"Rule 'starts_with_a' didn't match at 'burp' (line 1, column 1).")
231 | 
232 |     def test_line_and_column(self):
233 |         """Make sure we got the line and column computation right."""
234 |         grammar = Grammar(r"""
235 |             whee_lah = whee "\n" lah "\n"
236 |             whee = "whee"
237 |             lah = "lah"
238 |             """)
239 |         try:
240 |             grammar.parse('whee\nlahGOO')
241 |         except ParseError as error:
242 |             # TODO: Right now, this says "Rule <Literal "\n" at 0x...>
243 |             # didn't match". That's not the greatest. Fix that, then fix this.
244 |             ok_(text_type(error).endswith(r"""didn't match at 'GOO' (line 2, column 4)."""))
245 | 
246 | 
247 | class RepresentationTests(TestCase):
248 |     """Tests for str(), unicode(), and repr() of expressions"""
249 | 
250 |     def test_unicode_crash(self):
251 |         """Make sure matched unicode strings don't crash ``__str__``."""
252 |         grammar = Grammar(r'string = ~r"\S+"u')
253 |         str(grammar.parse(u'中文'))
254 | 
255 |     def test_unicode(self):
256 |         """Smoke-test the conversion of expressions to bits of rules.
257 | 
258 |         A slightly more comprehensive test of the actual values is in
259 |         ``GrammarTests.test_unicode``.
260 | 
261 |         """
262 |         text_type(rule_grammar)
263 | 
264 |     def test_unicode_keep_parens(self):
265 |         """Make sure converting an expression to unicode doesn't strip
266 |         parentheses.
267 | 268 | """ 269 | # ZeroOrMore 270 | eq_(text_type(Grammar('foo = "bar" ("baz" "eggs")* "spam"')), 271 | u'foo = "bar" ("baz" "eggs")* "spam"') 272 | 273 | # OneOf 274 | eq_(text_type(Grammar('foo = "bar" ("baz" / "eggs") "spam"')), 275 | u'foo = "bar" ("baz" / "eggs") "spam"') 276 | 277 | # Lookahead 278 | eq_(text_type(Grammar('foo = "bar" &("baz" "eggs") "spam"')), 279 | u'foo = "bar" &("baz" "eggs") "spam"') 280 | 281 | # Multiple sequences 282 | eq_(text_type(Grammar('foo = ("bar" "baz") / ("baff" "bam")')), 283 | u'foo = ("bar" "baz") / ("baff" "bam")') 284 | 285 | def test_unicode_surrounding_parens(self): 286 | """ 287 | Make sure there are no surrounding parens around the entire 288 | right-hand side of an expression (as they're unnecessary). 289 | 290 | """ 291 | eq_(text_type(Grammar('foo = ("foo" ("bar" "baz"))')), 292 | u'foo = "foo" ("bar" "baz")') 293 | 294 | 295 | class SlotsTests(TestCase): 296 | """Tests to do with __slots__""" 297 | 298 | def test_subclassing(self): 299 | """Make sure a subclass of a __slots__-less class can introduce new 300 | slots itself. 301 | 302 | This isn't supposed to work, according to the language docs: 303 | 304 | When inheriting from a class without __slots__, the __dict__ 305 | attribute of that class will always be accessible, so a __slots__ 306 | definition in the subclass is meaningless. 307 | 308 | But it does. 309 | 310 | """ 311 | class Smoo(Optional): 312 | __slots__ = ['smoo'] 313 | 314 | def __init__(self): 315 | self.smoo = 'smoo' 316 | 317 | smoo = Smoo() 318 | eq_(smoo.__dict__, {}) # has a __dict__ but with no smoo in it 319 | eq_(smoo.smoo, 'smoo') # The smoo attr ended up in a slot. 320 | -------------------------------------------------------------------------------- /parsimonious/nodes.py: -------------------------------------------------------------------------------- 1 | """Nodes that make up parse trees 2 | 3 | Parsing spits out a tree of these, which you can then tell to walk itself and 4 | spit out a useful value. Or you can walk it yourself; the structural attributes 5 | are public. 6 | 7 | """ 8 | # TODO: If this is slow, think about using cElementTree or something. 9 | from inspect import isfunction 10 | from sys import version_info, exc_info 11 | 12 | from six import reraise, python_2_unicode_compatible, with_metaclass, \ 13 | iteritems 14 | 15 | from parsimonious.exceptions import VisitationError, UndefinedLabel 16 | from parsimonious.utils import StrAndRepr 17 | 18 | 19 | @python_2_unicode_compatible 20 | class Node(StrAndRepr): 21 | """A parse tree node 22 | 23 | Consider these immutable once constructed. As a side effect of a 24 | memory-saving strategy in the cache, multiple references to a single 25 | ``Node`` might be returned in a single parse tree. So, if you start 26 | messing with one, you'll see surprising parallel changes pop up elsewhere. 27 | 28 | My philosophy is that parse trees (and their nodes) should be 29 | representation-agnostic. That is, they shouldn't get all mixed up with what 30 | the final rendered form of a wiki page (or the intermediate representation 31 | of a programming language, or whatever) is going to be: you should be able 32 | to parse once and render several representations from the tree, one after 33 | another. 34 | 35 | """ 36 | # I tried making this subclass list, but it got ugly. I had to construct 37 | # invalid ones and patch them up later, and there were other problems. 
38 | __slots__ = ['expr_name', # The name of the expression that generated me 39 | 'full_text', # The full text fed to the parser 40 | 'start', # The position in the text where that expr started matching 41 | 'end', # The position after start where the expr first didn't 42 | # match. [start:end] follow Python slice conventions. 43 | 'children'] # List of child parse tree nodes 44 | 45 | def __init__(self, expr_name, full_text, start, end, children=None): 46 | self.expr_name = expr_name 47 | self.full_text = full_text 48 | self.start = start 49 | self.end = end 50 | self.children = children or [] 51 | 52 | def __iter__(self): 53 | """Support looping over my children and doing tuple unpacks on me. 54 | 55 | It can be very handy to unpack nodes in arg lists; see 56 | :class:`PegVisitor` for an example. 57 | 58 | """ 59 | return iter(self.children) 60 | 61 | @property 62 | def text(self): 63 | """Return the text this node matched.""" 64 | return self.full_text[self.start:self.end] 65 | 66 | # From here down is just stuff for testing and debugging. 67 | 68 | def prettily(self, error=None): 69 | """Return a unicode, pretty-printed representation of me. 70 | 71 | :arg error: The node to highlight because an error occurred there 72 | 73 | """ 74 | # TODO: If a Node appears multiple times in the tree, we'll point to 75 | # them all. Whoops. 76 | def indent(text): 77 | return '\n'.join((' ' + line) for line in text.splitlines()) 78 | ret = [u'<%s%s matching "%s">%s' % ( 79 | self.__class__.__name__, 80 | (' called "%s"' % self.expr_name) if self.expr_name else '', 81 | self.text, 82 | ' <-- *** We were here. ***' if error is self else '')] 83 | for n in self: 84 | ret.append(indent(n.prettily(error=error))) 85 | return '\n'.join(ret) 86 | 87 | def __str__(self): 88 | """Return a compact, human-readable representation of me.""" 89 | return self.prettily() 90 | 91 | def __eq__(self, other): 92 | """Support by-value deep comparison with other nodes for testing.""" 93 | if not isinstance(other, Node): 94 | return NotImplemented 95 | 96 | return (self.expr_name == other.expr_name and 97 | self.full_text == other.full_text and 98 | self.start == other.start and 99 | self.end == other.end and 100 | self.children == other.children) 101 | 102 | def __ne__(self, other): 103 | return not self == other 104 | 105 | def __repr__(self, top_level=True): 106 | """Return a bit of code (though not an expression) that will recreate 107 | me.""" 108 | # repr() of unicode flattens everything out to ASCII, so we don't need 109 | # to explicitly encode things afterward. 110 | ret = ["s = %r" % self.full_text] if top_level else [] 111 | ret.append("%s(%r, s, %s, %s%s)" % ( 112 | self.__class__.__name__, 113 | self.expr_name, 114 | self.start, 115 | self.end, 116 | (', children=[%s]' % 117 | ', '.join([c.__repr__(top_level=False) for c in self.children])) 118 | if self.children else '')) 119 | return '\n'.join(ret) 120 | 121 | 122 | class RegexNode(Node): 123 | """Node returned from a ``Regex`` expression 124 | 125 | Grants access to the ``re.Match`` object, in case you want to access 126 | capturing groups, etc. 
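
    For example (a sketch)::

        node = Regex(r'(?P<word>[a-z]+)', name='word').match('hello!')
        node.match.group('word')  # 'hello'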
127 | 
128 |     """
129 |     __slots__ = ['match']
130 | 
131 | 
132 | class RuleDecoratorMeta(type):
133 |     def __new__(metaclass, name, bases, namespace):
134 |         def unvisit(name):
135 |             """Remove any leading "visit_" from a method name."""
136 |             return name[6:] if name.startswith('visit_') else name
137 | 
138 |         methods = [v for k, v in iteritems(namespace) if
139 |                    hasattr(v, '_rule') and isfunction(v)]
140 |         if methods:
141 |             from parsimonious.grammar import Grammar  # circular import dodge
142 | 
143 |             methods.sort(key=(lambda x: x.func_code.co_firstlineno)
144 |                              if version_info[0] < 3 else
145 |                              (lambda x: x.__code__.co_firstlineno))
146 |             # Possible enhancement: once we get the Grammar extensibility story
147 |             # solidified, we can have @rules *add* to the default grammar
148 |             # rather than pave over it.
149 |             namespace['grammar'] = Grammar(
150 |                 '\n'.join('{name} = {expr}'.format(name=unvisit(m.__name__),
151 |                                                    expr=m._rule)
152 |                           for m in methods))
153 |         return super(RuleDecoratorMeta,
154 |                      metaclass).__new__(metaclass, name, bases, namespace)
155 | 
156 | 
157 | class NodeVisitor(with_metaclass(RuleDecoratorMeta, object)):
158 |     """A shell for writing things that turn parse trees into something useful
159 | 
160 |     Performs a depth-first traversal of an AST. Subclass this, add methods for
161 |     each expr you care about, instantiate, and call
162 |     ``visit(top_node_of_parse_tree)``. It'll return the useful stuff. This API
163 |     is very similar to that of ``ast.NodeVisitor``.
164 | 
165 |     These could easily all be static methods, but that would add at least as
166 |     much weirdness at the call site as the ``()`` for instantiation. And this
167 |     way, we support subclasses that require state: options, for example, or a
168 |     symbol table constructed from a programming language's AST.
169 | 
170 |     We never transform the parse tree in place, because...
171 | 
172 |     * There are likely multiple references to the same ``Node`` object in a
173 |       parse tree, and changes to one reference would surprise you elsewhere.
174 |     * It makes it impossible to report errors: you'd end up with the "error"
175 |       arrow pointing someplace in a half-transformed mishmash of nodes--and
176 |       that's assuming you're even transforming the tree into another tree.
177 |       Heaven forbid you're making it into a string or something else.
178 | 
179 |     """
180 | 
181 |     #: The :term:`default grammar`: the one recommended for use with this
182 |     #: visitor. If you populate this, you will be able to call
183 |     #: :meth:`NodeVisitor.parse()` as a shortcut.
184 |     grammar = None
185 | 
186 |     #: Classes of exceptions you actually intend to raise during visitation
187 |     #: and which should propagate out of the visitor. These will not be
188 |     #: wrapped in a VisitationError when they arise.
189 |     unwrapped_exceptions = ()
190 | 
191 |     # TODO: If we need to optimize this, we can go back to putting subclasses
192 |     # in charge of visiting children; they know when not to bother. Or we can
193 |     # mark nodes as not descent-worthy in the grammar.
194 |     def visit(self, node):
195 |         """Walk a parse tree, transforming it into another representation.
196 | 
197 |         Recursively descend a parse tree, dispatching to the method named after
198 |         the rule in the :class:`~parsimonious.grammar.Grammar` that produced
199 |         each node. If, for example, a rule was... ::
200 | 
201 |             bold = '<b>'
202 | 
203 |         ...the ``visit_bold()`` method would be called. It is your
204 |         responsibility to subclass :class:`NodeVisitor` and implement those
205 |         methods.
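
        A minimal sketch (the one-rule grammar and visitor here are
        illustrative, not part of this module)::

            class Shouter(NodeVisitor):
                grammar = Grammar('greeting = "hi"')

                def visit_greeting(self, node, visited_children):
                    return node.text.upper()

            Shouter().parse('hi')  # returns 'HI'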
206 | 207 | """ 208 | method = getattr(self, 'visit_' + node.expr_name, self.generic_visit) 209 | 210 | # Call that method, and show where in the tree it failed if it blows 211 | # up. 212 | try: 213 | return method(node, [self.visit(n) for n in node]) 214 | except (VisitationError, UndefinedLabel): 215 | # Don't catch and re-wrap already-wrapped exceptions. 216 | raise 217 | except self.unwrapped_exceptions: 218 | raise 219 | except Exception: 220 | # Catch any exception, and tack on a parse tree so it's easier to 221 | # see where it went wrong. 222 | exc_class, exc, tb = exc_info() 223 | reraise(VisitationError, VisitationError(exc, exc_class, node), tb) 224 | 225 | def generic_visit(self, node, visited_children): 226 | """Default visitor method 227 | 228 | :arg node: The node we're visiting 229 | :arg visited_children: The results of visiting the children of that 230 | node, in a list 231 | 232 | I'm not sure there's an implementation of this that makes sense across 233 | all (or even most) use cases, so we leave it to subclasses to implement 234 | for now. 235 | 236 | """ 237 | raise NotImplementedError("No visitor method was defined for %s." % 238 | node.expr_name) 239 | 240 | # Convenience methods: 241 | 242 | def parse(self, text, pos=0): 243 | """Parse some text with this Visitor's default grammar. 244 | 245 | ``SomeVisitor().parse('some_string')`` is a shortcut for 246 | ``SomeVisitor().visit(some_grammar.parse('some_string'))``. 247 | 248 | """ 249 | return self._parse_or_match(text, pos, 'parse') 250 | 251 | def match(self, text, pos=0): 252 | """Parse some text with this Visitor's default grammar, but don't 253 | insist on parsing all the way to the end. 254 | 255 | ``SomeVisitor().match('some_string')`` is a shortcut for 256 | ``SomeVisitor().visit(some_grammar.match('some_string'))``. 257 | 258 | """ 259 | return self._parse_or_match(text, pos, 'match') 260 | 261 | # Internal convenience methods to help you write your own visitors: 262 | 263 | def lift_child(self, node, children): 264 | """Lift the sole child of ``node`` up to replace the node.""" 265 | first_child, = children 266 | return first_child 267 | 268 | # Private methods: 269 | 270 | def _parse_or_match(self, text, pos, method_name): 271 | """Execute a parse or match on the default grammar, followed by a 272 | visitation. 273 | 274 | Raise RuntimeError if there is no default grammar specified. 275 | 276 | """ 277 | if not self.grammar: 278 | raise RuntimeError( 279 | "The {cls}.{method}() shortcut won't work because {cls} was " 280 | "never associated with a specific " "grammar. Fill out its " 281 | "`grammar` attribute, and try again.".format( 282 | cls=self.__class__.__name__, 283 | method=method_name)) 284 | return self.visit(getattr(self.grammar, method_name)(text, pos=pos)) 285 | 286 | 287 | def rule(rule_string): 288 | """Decorate a NodeVisitor ``visit_*`` method to tie a grammar rule to it. 289 | 290 | The following will arrange for the ``visit_digit`` method to receive the 291 | results of the ``~"[0-9]"`` parse rule:: 292 | 293 | @rule('~"[0-9]"') 294 | def visit_digit(self, node, visited_children): 295 | ... 296 | 297 | Notice that there is no "digit = " as part of the rule; that gets inferred 298 | from the method name. 299 | 300 | In cases where there is only one kind of visitor interested in a grammar, 301 | using ``@rule`` saves you having to look back and forth between the visitor 302 | and the grammar definition. 
303 | 
304 |     On an implementation level, all ``@rule`` rules get stitched together into
305 |     a :class:`~parsimonious.Grammar` that becomes the NodeVisitor's
306 |     :term:`default grammar`.
307 | 
308 |     Typically, the choice of a default rule for this grammar is simple: whatever
309 |     ``@rule`` comes first in the class is the default. But the choice may become
310 |     surprising if you divide the ``@rule`` calls among subclasses. At the
311 |     moment, which method "comes first" is decided simply by comparing line
312 |     numbers, so whatever method is on the smallest-numbered line will be the
313 |     default. In a future release, this will change to pick the
314 |     first ``@rule`` call on the basemost class that has one. That way, a
315 |     subclass which does not override the default rule's ``visit_*`` method
316 |     won't unintentionally change which rule is the default.
317 | 
318 |     """
319 |     def decorator(method):
320 |         method._rule = rule_string  # XXX: Maybe register them on a class var instead so we can just override a @rule'd visitor method on a subclass without blowing away the rule string that comes with it.
321 |         return method
322 |     return decorator
323 | 
--------------------------------------------------------------------------------
/parsimonious/expressions.py:
--------------------------------------------------------------------------------
1 | """Subexpressions that make up a parsed grammar
2 | 
3 | These do the parsing.
4 | 
5 | """
6 | # TODO: Make sure all symbol refs are local--not class lookups or
7 | # anything--for speed. And kill all the dots.
8 | 
9 | from inspect import getargspec
10 | import re
11 | 
12 | from six import integer_types, python_2_unicode_compatible
13 | from six.moves import range
14 | 
15 | from parsimonious.exceptions import ParseError, IncompleteParseError
16 | from parsimonious.nodes import Node, RegexNode
17 | from parsimonious.utils import StrAndRepr
18 | 
19 | MARKER = object()
20 | 
21 | 
22 | def expression(callable, rule_name, grammar):
23 |     """Turn a plain callable into an Expression.
24 | 
25 |     The callable can be of this simple form::
26 | 
27 |         def foo(text, pos):
28 |             '''If this custom expression matches starting at text[pos], return
29 |             the index where it stops matching. Otherwise, return None.'''
30 |             if the expression matched:
31 |                 return end_pos
32 | 
33 |     If there are child nodes to return, return a tuple::
34 | 
35 |         return end_pos, children
36 | 
37 |     If the expression doesn't match at the given ``pos`` at all... ::
38 | 
39 |         return None
40 | 
41 |     If your callable needs to make sub-calls to other rules in the grammar or
42 |     do error reporting, it can take this form, gaining additional arguments::
43 | 
44 |         def foo(text, pos, cache, error, grammar):
45 |             # Call out to other rules:
46 |             node = grammar['another_rule'].match_core(text, pos, cache, error)
47 |             ...
48 |             # Return values as above.
49 | 
50 |     The return value of the callable, if an int or a tuple, will be
51 |     automatically transmuted into a :class:`~parsimonious.Node`. If it returns
52 |     a Node-like class directly, it will be passed through unchanged.
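
    For example, a sketch of hooking such a callable into a grammar --
    assuming the keyword-argument form of ``Grammar`` for custom rules;
    ``parens`` is a hypothetical rule::

        def parens(text, pos):
            if text.startswith('()', pos):
                return pos + 2

        grammar = Grammar('pair = parens "!"', parens=parens)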
53 | 54 | :arg rule_name: The rule name to attach to the resulting 55 | :class:`~parsimonious.Expression` 56 | :arg grammar: The :class:`~parsimonious.Grammar` this expression will be a 57 | part of, to make delegating to other rules possible 58 | 59 | """ 60 | num_args = len(getargspec(callable).args) 61 | if num_args == 2: 62 | is_simple = True 63 | elif num_args == 5: 64 | is_simple = False 65 | else: 66 | raise RuntimeError("Custom rule functions must take either 2 or 5 " 67 | "arguments, not %s." % num_args) 68 | 69 | class AdHocExpression(Expression): 70 | def _uncached_match(self, text, pos, cache, error): 71 | result = (callable(text, pos) if is_simple else 72 | callable(text, pos, cache, error, grammar)) 73 | 74 | if isinstance(result, integer_types): 75 | end, children = result, None 76 | elif isinstance(result, tuple): 77 | end, children = result 78 | else: 79 | # Node or None 80 | return result 81 | return Node(self.name, text, pos, end, children=children) 82 | 83 | def _as_rhs(self): 84 | return '{custom function "%s"}' % callable.__name__ 85 | 86 | return AdHocExpression(name=rule_name) 87 | 88 | 89 | @python_2_unicode_compatible 90 | class Expression(StrAndRepr): 91 | """A thing that can be matched against a piece of text""" 92 | 93 | # Slots are about twice as fast as __dict__-based attributes: 94 | # http://stackoverflow.com/questions/1336791/dictionary-vs-object-which-is-more-efficient-and-why 95 | 96 | # Top-level expressions--rules--have names. Subexpressions are named ''. 97 | __slots__ = ['name'] 98 | 99 | def __init__(self, name=''): 100 | self.name = name 101 | 102 | def parse(self, text, pos=0): 103 | """Return a parse tree of ``text``. 104 | 105 | Raise ``ParseError`` if the expression wasn't satisfied. Raise 106 | ``IncompleteParseError`` if the expression was satisfied but didn't 107 | consume the full string. 108 | 109 | """ 110 | node = self.match(text, pos=pos) 111 | if node.end < len(text): 112 | raise IncompleteParseError(text, node.end, self) 113 | return node 114 | 115 | def match(self, text, pos=0): 116 | """Return the parse tree matching this expression at the given 117 | position, not necessarily extending all the way to the end of ``text``. 118 | 119 | Raise ``ParseError`` if there is no match there. 120 | 121 | :arg pos: The index at which to start matching 122 | 123 | """ 124 | error = ParseError(text) 125 | node = self.match_core(text, pos, {}, error) 126 | if node is None: 127 | raise error 128 | return node 129 | 130 | def match_core(self, text, pos, cache, error): 131 | """Internal guts of ``match()`` 132 | 133 | This is appropriate to call only from custom rules or Expression 134 | subclasses. 135 | 136 | :arg cache: The packrat cache:: 137 | 138 | {(oid, pos): Node tree matched by object `oid` at index `pos` ...} 139 | 140 | :arg error: A ParseError instance with ``text`` already filled in but 141 | otherwise blank. We update the error reporting info on this object 142 | as we go. (Sticking references on an existing instance is faster 143 | than allocating a new one for each expression that fails.) We 144 | return None rather than raising and catching ParseErrors because 145 | catching is slow. 146 | 147 | """ 148 | # TODO: Optimize. Probably a hot spot. 149 | # 150 | # Is there a way of looking up cached stuff that's faster than hashing 151 | # this id-pos pair? 152 | # 153 | # If this is slow, think about the array module. It might (or might 154 | # not!) use more RAM, but it'll likely be faster than hashing things 155 | # all the time. 
Also, can we move all the allocs up front? 156 | # 157 | # To save space, we have lots of choices: (0) Quit caching whole Node 158 | # objects. Cache just what you need to reconstitute them. (1) Cache 159 | # only the results of entire rules, not subexpressions (probably a 160 | # horrible idea for rules that need to backtrack internally a lot). (2) 161 | # Age stuff out of the cache somehow. LRU? (3) Cuts. 162 | expr_id = id(self) 163 | node = cache.get((expr_id, pos), MARKER) # TODO: Change to setdefault to prevent infinite recursion in left-recursive rules. 164 | if node is MARKER: 165 | node = cache[(expr_id, pos)] = self._uncached_match(text, 166 | pos, 167 | cache, 168 | error) 169 | 170 | # Record progress for error reporting: 171 | if node is None and pos >= error.pos and ( 172 | self.name or getattr(error.expr, 'name', None) is None): 173 | # Don't bother reporting on unnamed expressions (unless that's all 174 | # we've seen so far), as they're hard to track down for a human. 175 | # Perhaps we could include the unnamed subexpressions later as 176 | # auxiliary info. 177 | error.expr = self 178 | error.pos = pos 179 | 180 | return node 181 | 182 | def __str__(self): 183 | return u'<%s %s at 0x%s>' % ( 184 | self.__class__.__name__, 185 | self.as_rule(), 186 | id(self)) 187 | 188 | def as_rule(self): 189 | """Return the left- and right-hand sides of a rule that represents me. 190 | 191 | Return unicode. If I have no ``name``, omit the left-hand side. 192 | 193 | """ 194 | rhs = self._as_rhs().strip() 195 | if rhs.startswith('(') and rhs.endswith(')'): 196 | rhs = rhs[1:-1] 197 | 198 | return (u'%s = %s' % (self.name, rhs)) if self.name else rhs 199 | 200 | def _unicode_members(self): 201 | """Return an iterable of my unicode-represented children, stopping 202 | descent when we hit a named node so the returned value resembles the 203 | input rule.""" 204 | return [(m.name or m._as_rhs()) for m in self.members] 205 | 206 | def _as_rhs(self): 207 | """Return the right-hand side of a rule that represents me. 208 | 209 | Implemented by subclasses. 210 | 211 | """ 212 | raise NotImplementedError 213 | 214 | 215 | class Literal(Expression): 216 | """A string literal 217 | 218 | Use these if you can; they're the fastest. 219 | 220 | """ 221 | __slots__ = ['literal'] 222 | 223 | def __init__(self, literal, name=''): 224 | super(Literal, self).__init__(name) 225 | self.literal = literal 226 | 227 | def _uncached_match(self, text, pos, cache, error): 228 | if text.startswith(self.literal, pos): 229 | return Node(self.name, text, pos, pos + len(self.literal)) 230 | 231 | def _as_rhs(self): 232 | # TODO: Get backslash escaping right. 233 | return '"%s"' % self.literal 234 | 235 | 236 | class TokenMatcher(Literal): 237 | """An expression matching a single token of a given type 238 | 239 | This is for use only with TokenGrammars. 240 | 241 | """ 242 | def _uncached_match(self, token_list, pos, cache, error): 243 | if token_list[pos].type == self.literal: 244 | return Node(self.name, token_list, pos, pos + 1) 245 | 246 | 247 | class Regex(Expression): 248 | """An expression that matches what a regex does. 249 | 250 | Use these as much as you can and jam as much into each one as you can; 251 | they're fast. 
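
    For example (a sketch)::

        Regex(r'[0-9]+', name='number').match('123abc').text  # '123'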
252 | 253 | """ 254 | __slots__ = ['re'] 255 | 256 | def __init__(self, pattern, name='', ignore_case=False, locale=False, 257 | multiline=False, dot_all=False, unicode=False, verbose=False): 258 | super(Regex, self).__init__(name) 259 | self.re = re.compile(pattern, (ignore_case and re.I) | 260 | (locale and re.L) | 261 | (multiline and re.M) | 262 | (dot_all and re.S) | 263 | (unicode and re.U) | 264 | (verbose and re.X)) 265 | 266 | def _uncached_match(self, text, pos, cache, error): 267 | """Return length of match, ``None`` if no match.""" 268 | m = self.re.match(text, pos) 269 | if m is not None: 270 | span = m.span() 271 | node = RegexNode(self.name, text, pos, pos + span[1] - span[0]) 272 | node.match = m # TODO: A terrible idea for cache size? 273 | return node 274 | 275 | def _regex_flags_from_bits(self, bits): 276 | """Return the textual equivalent of numerically encoded regex flags.""" 277 | flags = 'ilmsux' 278 | return ''.join(flags[i - 1] if (1 << i) & bits else '' for i in range(1, len(flags) + 1)) 279 | 280 | def _as_rhs(self): 281 | # TODO: Get backslash escaping right. 282 | return '~"%s"%s' % (self.re.pattern, 283 | self._regex_flags_from_bits(self.re.flags)) 284 | 285 | 286 | class Compound(Expression): 287 | """An abstract expression which contains other expressions""" 288 | 289 | __slots__ = ['members'] 290 | 291 | def __init__(self, *members, **kwargs): 292 | """``members`` is a sequence of expressions.""" 293 | super(Compound, self).__init__(kwargs.get('name', '')) 294 | self.members = members 295 | 296 | 297 | class Sequence(Compound): 298 | """A series of expressions that must match contiguous, ordered pieces of 299 | the text 300 | 301 | In other words, it's a concatenation operator: each piece has to match, one 302 | after another. 303 | 304 | """ 305 | def _uncached_match(self, text, pos, cache, error): 306 | new_pos = pos 307 | length_of_sequence = 0 308 | children = [] 309 | for m in self.members: 310 | node = m.match_core(text, new_pos, cache, error) 311 | if node is None: 312 | return None 313 | children.append(node) 314 | length = node.end - node.start 315 | new_pos += length 316 | length_of_sequence += length 317 | # Hooray! We got through all the members! 318 | return Node(self.name, text, pos, pos + length_of_sequence, children) 319 | 320 | def _as_rhs(self): 321 | return u'({0})'.format(u' '.join(self._unicode_members())) 322 | 323 | 324 | class OneOf(Compound): 325 | """A series of expressions, one of which must match 326 | 327 | Expressions are tested in order from first to last. The first to succeed 328 | wins. 329 | 330 | """ 331 | def _uncached_match(self, text, pos, cache, error): 332 | for m in self.members: 333 | node = m.match_core(text, pos, cache, error) 334 | if node is not None: 335 | # Wrap the succeeding child in a node representing the OneOf: 336 | return Node(self.name, text, pos, node.end, children=[node]) 337 | 338 | def _as_rhs(self): 339 | return u'({0})'.format(u' / '.join(self._unicode_members())) 340 | 341 | 342 | class Lookahead(Compound): 343 | """An expression which consumes nothing, even if its contained expression 344 | succeeds""" 345 | 346 | # TODO: Merge this and Not for better cache hit ratios and less code. 347 | # Downside: pretty-printed grammars might be spelled differently than what 348 | # went in. That doesn't bother me. 
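    # For example (a sketch): in Sequence(Lookahead(Literal('a')),
    # Regex('[a-z]+')), the lookahead just checks that an 'a' comes next; it
    # consumes nothing, so the regex starts at the same position and consumes
    # the 'a' itself.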
349 | 350 | def _uncached_match(self, text, pos, cache, error): 351 | node = self.members[0].match_core(text, pos, cache, error) 352 | if node is not None: 353 | return Node(self.name, text, pos, pos) 354 | 355 | def _as_rhs(self): 356 | return u'&%s' % self._unicode_members()[0] 357 | 358 | 359 | class Not(Compound): 360 | """An expression that succeeds only if the expression within it doesn't 361 | 362 | In any case, it never consumes any characters; it's a negative lookahead. 363 | 364 | """ 365 | def _uncached_match(self, text, pos, cache, error): 366 | # FWIW, the implementation in Parsing Techniques in Figure 15.29 does 367 | # not bother to cache NOTs directly. 368 | node = self.members[0].match_core(text, pos, cache, error) 369 | if node is None: 370 | return Node(self.name, text, pos, pos) 371 | 372 | def _as_rhs(self): 373 | # TODO: Make sure this parenthesizes the member properly if it's an OR 374 | # or AND. 375 | return u'!%s' % self._unicode_members()[0] 376 | 377 | 378 | # Quantifiers. None of these is strictly necessary, but they're darn handy. 379 | 380 | class Optional(Compound): 381 | """An expression that succeeds whether or not the contained one does 382 | 383 | If the contained expression succeeds, it goes ahead and consumes what it 384 | consumes. Otherwise, it consumes nothing. 385 | 386 | """ 387 | def _uncached_match(self, text, pos, cache, error): 388 | node = self.members[0].match_core(text, pos, cache, error) 389 | return (Node(self.name, text, pos, pos) if node is None else 390 | Node(self.name, text, pos, node.end, children=[node])) 391 | 392 | def _as_rhs(self): 393 | return u'%s?' % self._unicode_members()[0] 394 | 395 | 396 | # TODO: Merge with OneOrMore. 397 | class ZeroOrMore(Compound): 398 | """An expression wrapper like the * quantifier in regexes.""" 399 | 400 | def _uncached_match(self, text, pos, cache, error): 401 | new_pos = pos 402 | children = [] 403 | while True: 404 | node = self.members[0].match_core(text, new_pos, cache, error) 405 | if node is None or not (node.end - node.start): 406 | # Node was None or 0 length. 0 would otherwise loop infinitely. 407 | return Node(self.name, text, pos, new_pos, children) 408 | children.append(node) 409 | new_pos += node.end - node.start 410 | 411 | def _as_rhs(self): 412 | return u'%s*' % self._unicode_members()[0] 413 | 414 | 415 | class OneOrMore(Compound): 416 | """An expression wrapper like the + quantifier in regexes. 417 | 418 | You can also pass in an alternate minimum to make this behave like "2 or 419 | more", "3 or more", etc. 420 | 421 | """ 422 | __slots__ = ['min'] 423 | 424 | # TODO: Add max. It should probably succeed if there are more than the max 425 | # --just not consume them. 426 | 427 | def __init__(self, member, name='', min=1): 428 | super(OneOrMore, self).__init__(member, name=name) 429 | self.min = min 430 | 431 | def _uncached_match(self, text, pos, cache, error): 432 | new_pos = pos 433 | children = [] 434 | while True: 435 | node = self.members[0].match_core(text, new_pos, cache, error) 436 | if node is None: 437 | break 438 | children.append(node) 439 | length = node.end - node.start 440 | if length == 0: # Don't loop infinitely. 
441 | break 442 | new_pos += length 443 | if len(children) >= self.min: 444 | return Node(self.name, text, pos, new_pos, children) 445 | 446 | def _as_rhs(self): 447 | return u'%s+' % self._unicode_members()[0] 448 | -------------------------------------------------------------------------------- /parsimonious/tests/test_grammar.py: -------------------------------------------------------------------------------- 1 | from sys import version_info 2 | from unittest import TestCase 3 | 4 | from nose import SkipTest 5 | from nose.tools import eq_, assert_raises, ok_ 6 | from six import text_type 7 | 8 | from parsimonious.exceptions import UndefinedLabel, ParseError 9 | from parsimonious.expressions import Sequence 10 | from parsimonious.grammar import rule_grammar, RuleVisitor, Grammar, TokenGrammar, LazyReference 11 | from parsimonious.nodes import Node 12 | from parsimonious.utils import Token 13 | 14 | 15 | class BootstrappingGrammarTests(TestCase): 16 | """Tests for the expressions in the grammar that parses the grammar 17 | definition syntax""" 18 | 19 | def test_quantifier(self): 20 | text = '*' 21 | eq_(rule_grammar['quantifier'].parse(text), 22 | Node('quantifier', text, 0, 1, children=[ 23 | Node('', text, 0, 1), Node('_', text, 1, 1)])) 24 | text = '?' 25 | eq_(rule_grammar['quantifier'].parse(text), 26 | Node('quantifier', text, 0, 1, children=[ 27 | Node('', text, 0, 1), Node('_', text, 1, 1)])) 28 | text = '+' 29 | eq_(rule_grammar['quantifier'].parse(text), 30 | Node('quantifier', text, 0, 1, children=[ 31 | Node('', text, 0, 1), Node('_', text, 1, 1)])) 32 | 33 | def test_spaceless_literal(self): 34 | text = '"anything but quotes#$*&^"' 35 | eq_(rule_grammar['spaceless_literal'].parse(text), 36 | Node('spaceless_literal', text, 0, len(text), children=[ 37 | Node('', text, 0, len(text))])) 38 | text = r'''r"\""''' 39 | eq_(rule_grammar['spaceless_literal'].parse(text), 40 | Node('spaceless_literal', text, 0, 5, children=[ 41 | Node('', text, 0, 5)])) 42 | 43 | def test_regex(self): 44 | text = '~"[a-zA-Z_][a-zA-Z_0-9]*"LI' 45 | eq_(rule_grammar['regex'].parse(text), 46 | Node('regex', text, 0, len(text), children=[ 47 | Node('', text, 0, 1), 48 | Node('spaceless_literal', text, 1, 25, children=[ 49 | Node('', text, 1, 25)]), 50 | Node('', text, 25, 27), 51 | Node('_', text, 27, 27)])) 52 | 53 | def test_successes(self): 54 | """Make sure the PEG recognition grammar succeeds on various inputs.""" 55 | ok_(rule_grammar['label'].parse('_')) 56 | ok_(rule_grammar['label'].parse('jeff')) 57 | ok_(rule_grammar['label'].parse('_THIS_THING')) 58 | 59 | ok_(rule_grammar['atom'].parse('some_label')) 60 | ok_(rule_grammar['atom'].parse('"some literal"')) 61 | ok_(rule_grammar['atom'].parse('~"some regex"i')) 62 | 63 | ok_(rule_grammar['quantified'].parse('~"some regex"i*')) 64 | ok_(rule_grammar['quantified'].parse('thing+')) 65 | ok_(rule_grammar['quantified'].parse('"hi"?')) 66 | 67 | ok_(rule_grammar['term'].parse('this')) 68 | ok_(rule_grammar['term'].parse('that+')) 69 | 70 | ok_(rule_grammar['sequence'].parse('this that? other')) 71 | 72 | ok_(rule_grammar['ored'].parse('this / that+ / "other"')) 73 | 74 | # + is higher precedence than &, so 'anded' should match the whole 75 | # thing: 76 | ok_(rule_grammar['lookahead_term'].parse('&this+')) 77 | 78 | ok_(rule_grammar['expression'].parse('this')) 79 | ok_(rule_grammar['expression'].parse('this? that other*')) 80 | ok_(rule_grammar['expression'].parse('&this / that+ / "other"')) 81 | ok_(rule_grammar['expression'].parse('this / that? 
/ "other"+')) 82 | ok_(rule_grammar['expression'].parse('this? that other*')) 83 | 84 | ok_(rule_grammar['rule'].parse('this = that\r')) 85 | ok_(rule_grammar['rule'].parse('this = the? that other* \t\r')) 86 | ok_(rule_grammar['rule'].parse('the=~"hi*"\n')) 87 | 88 | ok_(rule_grammar.parse(''' 89 | this = the? that other* 90 | that = "thing" 91 | the=~"hi*" 92 | other = "ahoy hoy" 93 | ''')) 94 | 95 | 96 | class RuleVisitorTests(TestCase): 97 | """Tests for ``RuleVisitor`` 98 | 99 | As I write these, Grammar is not yet fully implemented. Normally, there'd 100 | be no reason to use ``RuleVisitor`` directly. 101 | 102 | """ 103 | def test_round_trip(self): 104 | """Test a simple round trip. 105 | 106 | Parse a simple grammar, turn the parse tree into a map of expressions, 107 | and use that to parse another piece of text. 108 | 109 | Not everything was implemented yet, but it was a big milestone and a 110 | proof of concept. 111 | 112 | """ 113 | tree = rule_grammar.parse('''number = ~"[0-9]+"\n''') 114 | rules, default_rule = RuleVisitor().visit(tree) 115 | 116 | text = '98' 117 | eq_(default_rule.parse(text), Node('number', text, 0, 2)) 118 | 119 | def test_undefined_rule(self): 120 | """Make sure we throw the right exception on undefined rules.""" 121 | tree = rule_grammar.parse('boy = howdy\n') 122 | assert_raises(UndefinedLabel, RuleVisitor().visit, tree) 123 | 124 | def test_optional(self): 125 | tree = rule_grammar.parse('boy = "howdy"?\n') 126 | rules, default_rule = RuleVisitor().visit(tree) 127 | 128 | howdy = 'howdy' 129 | 130 | # It should turn into a Node from the Optional and another from the 131 | # Literal within. 132 | eq_(default_rule.parse(howdy), Node('boy', howdy, 0, 5, children=[ 133 | Node('', howdy, 0, 5)])) 134 | 135 | 136 | class GrammarTests(TestCase): 137 | """Integration-test ``Grammar``: feed it a PEG and see if it works.""" 138 | 139 | def test_expressions_from_rules(self): 140 | """Test the ``Grammar`` base class's ability to compile an expression 141 | tree from rules. 142 | 143 | That the correct ``Expression`` tree is built is already tested in 144 | ``RuleGrammarTests``. This tests only that the ``Grammar`` base class's 145 | ``_expressions_from_rules`` works. 
146 | 147 | """ 148 | greeting_grammar = Grammar('greeting = "hi" / "howdy"') 149 | tree = greeting_grammar.parse('hi') 150 | eq_(tree, Node('greeting', 'hi', 0, 2, children=[ 151 | Node('', 'hi', 0, 2)])) 152 | 153 | def test_unicode(self): 154 | """Assert that a ``Grammar`` can convert into a string-formatted series 155 | of rules.""" 156 | grammar = Grammar(r""" 157 | bold_text = bold_open text bold_close 158 | text = ~"[A-Z 0-9]*"i 159 | bold_open = "((" 160 | bold_close = "))" 161 | """) 162 | lines = text_type(grammar).splitlines() 163 | eq_(lines[0], 'bold_text = bold_open text bold_close') 164 | ok_('text = ~"[A-Z 0-9]*"i%s' % ('u' if version_info >= (3,) else '') 165 | in lines) 166 | ok_('bold_open = "(("' in lines) 167 | ok_('bold_close = "))"' in lines) 168 | eq_(len(lines), 4) 169 | 170 | def test_match(self): 171 | """Make sure partial-matching (with pos) works.""" 172 | grammar = Grammar(r""" 173 | bold_text = bold_open text bold_close 174 | text = ~"[A-Z 0-9]*"i 175 | bold_open = "((" 176 | bold_close = "))" 177 | """) 178 | s = ' ((boo))yah' 179 | eq_(grammar.match(s, pos=1), Node('bold_text', s, 1, 8, children=[ 180 | Node('bold_open', s, 1, 3), 181 | Node('text', s, 3, 6), 182 | Node('bold_close', s, 6, 8)])) 183 | 184 | def test_bad_grammar(self): 185 | """Constructing a Grammar with bad rules should raise ParseError.""" 186 | assert_raises(ParseError, Grammar, 'just a bunch of junk') 187 | 188 | def test_comments(self): 189 | """Test tolerance of comments and blank lines in and around rules.""" 190 | grammar = Grammar(r"""# This is a grammar. 191 | 192 | # It sure is. 193 | bold_text = stars text stars # nice 194 | text = ~"[A-Z 0-9]*"i #dude 195 | 196 | 197 | stars = "**" 198 | # Pretty good 199 | #Oh yeah.#""") # Make sure a comment doesn't need a 200 | # \n or \r to end. 201 | eq_(list(sorted(str(grammar).splitlines())), 202 | ['''bold_text = stars text stars''', 203 | # TODO: Unicode flag is on by default in Python 3. I wonder if we 204 | # should turn it on all the time in Parsimonious. 
205 | '''stars = "**"''', 206 | '''text = ~"[A-Z 0-9]*"i%s''' % ('u' if version_info >= (3,) 207 | else '')]) 208 | 209 | def test_multi_line(self): 210 | """Make sure we tolerate all sorts of crazy line breaks and comments in 211 | the middle of rules.""" 212 | grammar = Grammar(""" 213 | bold_text = bold_open # commenty comment 214 | text # more comment 215 | bold_close 216 | text = ~"[A-Z 0-9]*"i 217 | bold_open = "((" bold_close = "))" 218 | """) 219 | ok_(grammar.parse('((booyah))') is not None) 220 | 221 | def test_not(self): 222 | """Make sure "not" predicates get parsed and work properly.""" 223 | grammar = Grammar(r'''not_arp = !"arp" ~"[a-z]+"''') 224 | assert_raises(ParseError, grammar.parse, 'arp') 225 | ok_(grammar.parse('argle') is not None) 226 | 227 | def test_lookahead(self): 228 | grammar = Grammar(r'''starts_with_a = &"a" ~"[a-z]+"''') 229 | assert_raises(ParseError, grammar.parse, 'burp') 230 | 231 | s = 'arp' 232 | eq_(grammar.parse('arp'), Node('starts_with_a', s, 0, 3, children=[ 233 | Node('', s, 0, 0), 234 | Node('', s, 0, 3)])) 235 | 236 | def test_parens(self): 237 | grammar = Grammar(r'''sequence = "chitty" (" " "bang")+''') 238 | # Make sure it's not as if the parens aren't there: 239 | assert_raises(ParseError, grammar.parse, 'chitty bangbang') 240 | 241 | s = 'chitty bang bang' 242 | eq_(str(grammar.parse(s)), 243 | """ 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | """) 252 | 253 | def test_resolve_refs_order(self): 254 | """Smoke-test a circumstance where lazy references don't get resolved.""" 255 | grammar = Grammar(""" 256 | expression = "(" terms ")" 257 | terms = term+ 258 | term = number 259 | number = ~r"[0-9]+" 260 | """) 261 | grammar.parse('(34)') 262 | 263 | def test_infinite_loop(self): 264 | """Smoke-test a grammar that was causing infinite loops while building. 265 | 266 | This was going awry because the "int" rule was never getting marked as 267 | resolved, so it would just keep trying to resolve it over and over. 268 | 269 | """ 270 | Grammar(""" 271 | digits = digit+ 272 | int = digits 273 | digit = ~"[0-9]" 274 | number = int 275 | main = number 276 | """) 277 | 278 | def test_right_recursive(self): 279 | """Right-recursive refs should resolve.""" 280 | grammar = Grammar(""" 281 | digits = digit digits? 
282 | digit = ~r"[0-9]" 283 | """) 284 | ok_(grammar.parse('12') is not None) 285 | 286 | def test_badly_circular(self): 287 | """Uselessly circular references should be detected by the grammar 288 | compiler.""" 289 | raise SkipTest('We have yet to make the grammar compiler detect these.') 290 | grammar = Grammar(""" 291 | foo = bar 292 | bar = foo 293 | """) 294 | 295 | def test_parens_with_leading_whitespace(self): 296 | """Make sure a parenthesized expression is allowed to have leading 297 | whitespace when nested directly inside another.""" 298 | Grammar("""foo = ( ("c") )""").parse('c') 299 | 300 | def test_single_quoted_literals(self): 301 | Grammar("""foo = 'a' '"'""").parse('a"') 302 | 303 | def test_simple_custom_rules(self): 304 | """Run 2-arg custom-coded rules through their paces.""" 305 | grammar = Grammar(""" 306 | bracketed_digit = start digit end 307 | start = '[' 308 | end = ']'""", 309 | digit=lambda text, pos: 310 | (pos + 1) if text[pos].isdigit() else None) 311 | s = '[6]' 312 | eq_(grammar.parse(s), 313 | Node('bracketed_digit', s, 0, 3, children=[ 314 | Node('start', s, 0, 1), 315 | Node('digit', s, 1, 2), 316 | Node('end', s, 2, 3)])) 317 | 318 | def test_complex_custom_rules(self): 319 | """Run 5-arg custom rules through their paces. 320 | 321 | Incidentally tests returning an actual Node from the custom rule. 322 | 323 | """ 324 | grammar = Grammar(""" 325 | bracketed_digit = start digit end 326 | start = '[' 327 | end = ']' 328 | real_digit = '6'""", 329 | # In this particular implementation of the digit rule, no node is 330 | # generated for `digit`; it falls right through to `real_digit`. 331 | # I'm not sure if this could lead to problems; I can't think of 332 | # any, but it's probably not a great idea. 333 | digit=lambda text, pos, cache, error, grammar: 334 | grammar['real_digit'].match_core(text, pos, cache, error)) 335 | s = '[6]' 336 | eq_(grammar.parse(s), 337 | Node('bracketed_digit', s, 0, 3, children=[ 338 | Node('start', s, 0, 1), 339 | Node('real_digit', s, 1, 2), 340 | Node('end', s, 2, 3)])) 341 | 342 | def test_lazy_custom_rules(self): 343 | """Make sure LazyReferences manually shoved into custom rules are 344 | resolved. 345 | 346 | Incidentally test passing full-on Expressions as custom rules and 347 | having a custom rule as the default one. 348 | 349 | """ 350 | grammar = Grammar(""" 351 | four = '4' 352 | five = '5'""", 353 | forty_five=Sequence(LazyReference('four'), 354 | LazyReference('five'), 355 | name='forty_five')).default('forty_five') 356 | s = '45' 357 | eq_(grammar.parse(s), 358 | Node('forty_five', s, 0, 2, children=[ 359 | Node('four', s, 0, 1), 360 | Node('five', s, 1, 2)])) 361 | 362 | def test_unconnected_custom_rules(self): 363 | """Make sure custom rules that aren't hooked to any other rules still 364 | get included in the grammar and that lone ones get set as the 365 | default. 366 | 367 | Incidentally test Grammar's `rules` default arg. 368 | 369 | """ 370 | grammar = Grammar(one_char=lambda text, pos: pos + 1).default('one_char') 371 | s = '4' 372 | eq_(grammar.parse(s), 373 | Node('one_char', s, 0, 1)) 374 | 375 | def test_lazy_default_rule(self): 376 | """Make sure we get an actual rule set as our default rule, even when 377 | the first rule has forward references and is thus a LazyReference at 378 | some point during grammar compilation. 
379 | 380 | """ 381 | grammar = Grammar(r""" 382 | styled_text = text 383 | text = "hi" 384 | """) 385 | eq_(grammar.parse('hi'), Node('text', 'hi', 0, 2)) 386 | 387 | def test_immutable_grammar(self): 388 | """Make sure that a Grammar is immutable after being created.""" 389 | grammar = Grammar(r""" 390 | foo = 'bar' 391 | """) 392 | 393 | def mod_grammar(grammar): 394 | grammar['foo'] = 1 395 | assert_raises(TypeError, mod_grammar, [grammar]) 396 | 397 | def mod_grammar(grammar): 398 | new_grammar = Grammar(r""" 399 | baz = 'biff' 400 | """) 401 | grammar.update(new_grammar) 402 | assert_raises(AttributeError, mod_grammar, [grammar]) 403 | 404 | def test_repr(self): 405 | self.assertTrue(repr(Grammar(r'foo = "a"'))) 406 | 407 | 408 | class TokenGrammarTests(TestCase): 409 | """Tests for the TokenGrammar class and associated machinery""" 410 | 411 | def test_parse_success(self): 412 | """Token literals should work.""" 413 | s = [Token('token1'), Token('token2')] 414 | grammar = TokenGrammar(""" 415 | foo = token1 "token2" 416 | token1 = "token1" 417 | """) 418 | eq_(grammar.parse(s), 419 | Node('foo', s, 0, 2, children=[ 420 | Node('token1', s, 0, 1), 421 | Node('', s, 1, 2)])) 422 | 423 | def test_parse_failure(self): 424 | """Parse failures should work normally with token literals.""" 425 | grammar = TokenGrammar(""" 426 | foo = "token1" "token2" 427 | """) 428 | assert_raises(ParseError, 429 | grammar.parse, 430 | [Token('tokenBOO'), Token('token2')]) 431 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Parsimonious 3 | ============ 4 | 5 | Parsimonious aims to be the fastest arbitrary-lookahead parser written in pure 6 | Python—and the most usable. It's based on parsing expression grammars (PEGs), 7 | which means you feed it a simplified sort of EBNF notation. Parsimonious was 8 | designed to undergird a MediaWiki parser that wouldn't take 5 seconds or a GB 9 | of RAM to do one page, but it's applicable to all sorts of languages. 10 | 11 | 12 | Goals 13 | ===== 14 | 15 | * Speed 16 | * Frugal RAM use 17 | * Minimalistic, understandable, idiomatic Python code 18 | * Readable grammars 19 | * Extensible grammars 20 | * Complete test coverage 21 | * Separation of concerns. Some Python parsing kits mix recognition with 22 | instructions about how to turn the resulting tree into some kind of other 23 | representation. This is limiting when you want to do several different things 24 | with a tree: for example, render wiki markup to HTML *or* to text. 25 | * Good error reporting. I want the parser to work *with* me as I develop a 26 | grammar. 27 | 28 | 29 | Example Usage 30 | ============= 31 | 32 | Here's how to build a simple grammar: 33 | 34 | .. code:: python 35 | 36 | >>> from parsimonious.grammar import Grammar 37 | >>> grammar = Grammar( 38 | ... """ 39 | ... bold_text = bold_open text bold_close 40 | ... text = ~"[A-Z 0-9]*"i 41 | ... bold_open = "((" 42 | ... bold_close = "))" 43 | ... """) 44 | 45 | You can have forward references and even right recursion; it's all taken care 46 | of by the grammar compiler. The first rule is taken to be the default start 47 | symbol, but you can override that. 48 | 49 | Next, let's parse something and get an abstract syntax tree: 50 | 51 | .. 
code:: python 52 | 53 | >>> print grammar.parse('((bold stuff))') 54 | <Node called "bold_text" matching "((bold stuff))"> 55 | <Node called "bold_open" matching "(("> 56 | <RegexNode called "text" matching "bold stuff"> 57 | <Node called "bold_close" matching "))"> 58 | 59 | You'd typically then use a ``nodes.NodeVisitor`` subclass (see below) to walk 60 | the tree and do something useful with it. 61 | 62 | 63 | Status 64 | ====== 65 | 66 | * Everything that exists works. Test coverage is good. 67 | * I don't plan on making any backward-incompatible changes to the rule syntax 68 | in the future, so you can write grammars with confidence. 69 | * It may be slow and use a lot of RAM; I haven't measured either yet. However, 70 | I have yet to begin optimizing in earnest. 71 | * Error reporting is now in place. ``repr`` methods of expressions, grammars, 72 | and nodes are clear and helpful as well. The ``Grammar`` ones are 73 | even round-trippable! 74 | * The grammar extensibility story is underdeveloped at the moment. You should 75 | be able to extend a grammar by simply concatenating more rules onto the 76 | existing ones; later rules of the same name should override previous ones. 77 | However, this is untested and may not be the final story. 78 | * Sphinx docs are coming, but the docstrings are quite useful now. 79 | * Note that there may be API changes until we get to 1.0, so be sure to pin to 80 | the version you're using. 81 | 82 | Coming Soon 83 | ----------- 84 | 85 | * Optimizations to make Parsimonious worthy of its name 86 | * Tighter RAM use 87 | * Better-thought-out grammar extensibility story 88 | * Amazing grammar debugging 89 | 90 | 91 | A Little About PEG Parsers 92 | ========================== 93 | 94 | PEG parsers don't draw a distinction between lexing and parsing; everything is 95 | done at once. As a result, there is no lookahead limit, as there is with, for 96 | instance, Yacc. And, due to both of these properties, PEG grammars are easier 97 | to write: they're basically just a more practical dialect of EBNF. With 98 | caching, they take O(grammar size * text length) memory (though I plan to do 99 | better), but they run in O(text length) time. 100 | 101 | More Technically 102 | ---------------- 103 | 104 | PEGs can describe a superset of *LL(k)* languages, any deterministic *LR(k)* 105 | language, and many others—including some that aren't context-free 106 | (http://www.brynosaurus.com/pub/lang/peg.pdf). They can also deal with what 107 | would be ambiguous languages if described in canonical EBNF. They do this by 108 | trading the ``|`` alternation operator for the ``/`` operator, which works the 109 | same except that it makes priority explicit: ``a / b / c`` first tries matching 110 | ``a``. If that fails, it tries ``b``, and, failing that, moves on to ``c``. 111 | Thus, ambiguity is resolved by always yielding the first successful recognition. 112 | 113 | 114 | Writing Grammars 115 | ================ 116 | 117 | Grammars are defined by a series of rules. The syntax should be familiar to 118 | anyone who uses regexes or reads programming language manuals. An example will 119 | serve best: 120 | 121 | .. code:: python 122 | 123 | my_grammar = Grammar(r""" 124 | styled_text = bold_text / italic_text 125 | bold_text = "((" text "))" 126 | italic_text = "''" text "''" 127 | text = ~"[A-Z 0-9]*"i 128 | """) 129 | 130 | You can wrap a rule across multiple lines if you like; the syntax is very 131 | forgiving. 132 | 133 | 134 | Syntax Reference 135 | ---------------- 136 | 137 | ==================== ======================================================== 138 | ``"some literal"`` Used to quote literals.
Backslash escaping and Python 139 | conventions for "raw" and Unicode strings help support 140 | fiddly characters. 141 | 142 | [space] Sequences are made out of space- or tab-delimited 143 | things. ``a b c`` matches spots where those 3 144 | terms appear in that order. 145 | 146 | ``a / b / c`` Alternatives. The first to succeed of ``a / b / c`` 147 | wins. 148 | 149 | ``thing?`` An optional expression. This is greedy, always consuming 150 | ``thing`` if it exists. 151 | 152 | ``&thing`` A lookahead assertion. Ensures ``thing`` matches at the 153 | current position but does not consume it. 154 | 155 | ``!thing`` A negative lookahead assertion. Matches if ``thing`` 156 | isn't found here. Doesn't consume any text. 157 | 158 | ``things*`` Zero or more things. This is greedy, always consuming as 159 | many repetitions as it can. 160 | 161 | ``things+`` One or more things. This is greedy, always consuming as 162 | many repetitions as it can. 163 | 164 | ``~r"regex"ilmsux`` Regexes have ``~`` in front and are quoted like 165 | literals. Any flags follow the end quotes as single 166 | chars. Regexes are good for representing character 167 | classes (``[a-z0-9]``) and optimizing for speed. The 168 | downside is that they won't be able to take advantage 169 | of our fancy debugging, once we get that working. 170 | Ultimately, I'd like to deprecate explicit regexes and 171 | instead have Parsimonious dynamically build them out of 172 | simpler primitives. 173 | 174 | ``(things)`` Parentheses are used for grouping, like in every other 175 | language. 176 | ==================== ======================================================== 177 | 178 | 179 | Optimizing Grammars 180 | =================== 181 | 182 | Don't Repeat Expressions 183 | ------------------------ 184 | 185 | If you need a ``~"[a-z0-9]"i`` at two points in your grammar, don't type it 186 | twice. Make it a rule of its own, and reference it from wherever you need it. 187 | You'll get the most out of the caching this way, since cache lookups are by 188 | expression object identity (for speed). 189 | 190 | Even if you have an expression that's very simple, not repeating it will 191 | save RAM, as there can, at worst, be a cached int for every char in the text 192 | you're parsing. In the future, we may identify repeated subexpressions 193 | automatically and factor them up while building the grammar. 194 | 195 | How much should you shove into one regex, versus how much should you break them 196 | up to not repeat yourself? That's a fine balance and worthy of benchmarking. 197 | More stuff jammed into a regex will execute faster, because it doesn't have to 198 | run any Python between pieces, but a broken-up one will give better cache 199 | performance if the individual pieces are re-used elsewhere. If the pieces of a 200 | regex aren't used anywhere else, by all means keep the whole thing together. 201 | 202 | 203 | Quantifiers 204 | ----------- 205 | 206 | Bring your ``?`` and ``*`` quantifiers up to the highest level you 207 | can. Otherwise, lower-level patterns could succeed but be empty and put a bunch 208 | of useless nodes in your tree that didn't really match anything. 209 | 210 | 211 | Processing Parse Trees 212 | ====================== 213 | 214 | A parse tree has a node for each expression matched, even if it matched a 215 | zero-length string, like ``"thing"?`` might. 
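For instance (an illustrative sketch; the exact node ``repr`` text may vary):

.. code:: python

    >>> grammar = Grammar('greeting = "hi" ", world"?')
    >>> print grammar.parse('hi')
    <Node called "greeting" matching "hi">
        <Node matching "hi">
        <Node matching "">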
216 | 217 | The ``NodeVisitor`` class provides an inversion-of-control framework for 218 | walking a tree and returning a new construct (tree, string, or whatever) based 219 | on it. For now, have a look at its docstrings for more detail. There's also a 220 | good example in ``grammar.RuleVisitor``. Notice how we take advantage of nodes' 221 | iterability by using tuple unpacks in the formal parameter lists: 222 | 223 | .. code:: python 224 | 225 | def visit_or_term(self, or_term, (slash, _, term)): 226 | ... 227 | 228 | For reference, here is the production the above unpacks:: 229 | 230 | or_term = "/" _ term 231 | 232 | When something goes wrong in your visitor, you get a nice error like this:: 233 | 234 | [normal traceback here...] 235 | VisitationException: 'Node' object has no attribute 'foo' 236 | 237 | Parse tree: 238 | <-- *** We were here. *** 239 | 240 | 241 | 242 | 243 | 244 | 245 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 258 | 259 | 260 | The parse tree is tacked onto the exception, and the node whose visitor method 261 | raised the error is pointed out. 262 | 263 | Why No Streaming Tree Processing? 264 | --------------------------------- 265 | 266 | Some have asked why we don't process the tree as we go, SAX-style. There are 267 | two main reasons: 268 | 269 | 1. It wouldn't work. With a PEG parser, no parsing decision is final until the 270 | whole text is parsed. If we had to change a decision, we'd have to backtrack 271 | and redo the SAX-style interpretation as well, which would involve 272 | reconstituting part of the AST and quite possibly scuttling whatever you 273 | were doing with the streaming output. (Note that some bursty SAX-style 274 | processing may be possible in the future if we use cuts.) 275 | 276 | 2. It interferes with the ability to derive multiple representations from the 277 | AST: for example, turning wiki markup into first HTML and then text. 278 | 279 | 280 | Future Directions 281 | ================= 282 | 283 | Rule Syntax Changes 284 | ------------------- 285 | 286 | * Maybe support left-recursive rules like PyMeta, if anybody cares. 287 | * Ultimately, I'd like to get rid of explicit regexes and break them into more 288 | atomic things like character classes. Then we can dynamically compile bits 289 | of the grammar into regexes as necessary to boost speed. 290 | 291 | Optimizations 292 | ------------- 293 | 294 | * Make RAM use almost constant by automatically inserting "cuts", as described 295 | in 296 | http://ialab.cs.tsukuba.ac.jp/~mizusima/publications/paste513-mizushima.pdf. 297 | This would also improve error reporting, as we wouldn't backtrack out of 298 | everything informative before finally failing. 299 | * Find all the distinct subexpressions, and unify duplicates for a better cache 300 | hit ratio. 301 | * Think about having the user (optionally) provide some representative input 302 | along with a grammar. We can then profile against it, see which expressions 303 | are worth caching, and annotate the grammar. Perhaps there will even be 304 | positions at which a given expression is more worth caching. Or we could keep 305 | a count of how many times each cache entry has been used and evict the most 306 | useless ones as RAM use grows. 307 | * We could possibly compile the grammar into VM instructions, like in "A 308 | parsing machine for PEGs" by Medeiros. 309 | * If the recursion gets too deep in practice, use trampolining to dodge it. 
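To round out the ``NodeVisitor`` discussion under "Processing Parse Trees"
above, here is a minimal end-to-end sketch (the grammar and ``SumVisitor`` are
invented for illustration, not part of the library):

.. code:: python

    from parsimonious.grammar import Grammar
    from parsimonious.nodes import NodeVisitor

    grammar = Grammar(r"""
        sum      = int plus_int*
        plus_int = "+" int
        int      = ~"[0-9]+"
        """)

    class SumVisitor(NodeVisitor):
        def visit_int(self, node, visited_children):
            return int(node.text)

        def visit_plus_int(self, node, visited_children):
            plus, value = visited_children  # Discard the "+" literal's node.
            return value

        def visit_sum(self, node, visited_children):
            first, rest = visited_children  # rest is the list of plus_int values.
            return first + sum(rest)

        def generic_visit(self, node, visited_children):
            return visited_children or node

    SumVisitor().visit(grammar.parse('1+2+3'))  # -> 6

Here ``generic_visit`` simply passes children through, the same idiom
``grammar.RuleVisitor`` uses internally.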
310 | 311 | Niceties 312 | -------- 313 | 314 | * Pijnu has a raft of tree manipulators. I don't think I want all of them, but 315 | a judicious subset might be nice. Don't get into mixing formatting with tree 316 | manipulation. 317 | https://github.com/erikrose/pijnu/blob/master/library/node.py#L333. PyPy's 318 | parsing lib exposes a sane subset: 319 | http://doc.pypy.org/en/latest/rlib.html#tree-transformations. 320 | 321 | 322 | Version History 323 | =============== 324 | 325 | 0.7.0 326 | * Add experimental token-based parsing, via TokenGrammar class, for those 327 | operating on pre-lexed streams of tokens. This can, for example, help parse 328 | indentation-sensitive languages that use the "off-side rule", like Python. 329 | (Erik Rose) 330 | * Common codebase for Python 2 and 3: no more 2to3 translation step (Mattias 331 | Urlichs, Lucas Wiman) 332 | * Drop Python 3.1 and 3.2 support. 333 | * Fix a bug in ``Grammar.__repr__`` which fails to work on Python 3 since the 334 | string_escape codec is gone in Python 3. (Lucas Wiman) 335 | * Don't lose parentheses when printing representations of expressions. 336 | (Michael Kelly) 337 | * Make Grammar an immutable mapping (until we add automatic recompilation). 338 | (Michael Kelly) 339 | 340 | 0.6.2 341 | * Make grammar compilation 100x faster. Thanks to dmoisset for the initial 342 | patch. 343 | 344 | 0.6.1 345 | * Fix bug which made the default rule of a grammar invalid when it 346 | contained a forward reference. 347 | 348 | 0.6 349 | .. warning:: 350 | 351 | This release makes backward-incompatible changes: 352 | 353 | * The ``default_rule`` arg to Grammar's constructor has been replaced 354 | with a method, ``some_grammar.default('rule_name')``, which returns a 355 | new grammar just like the old except with its default rule changed. 356 | This is to free up the constructor kwargs for custom rules. 357 | * ``UndefinedLabel`` is no longer a subclass of ``VisitationError``. This 358 | matters only in the unlikely case that you were catching 359 | ``VisitationError`` exceptions and expecting to thus also catch 360 | ``UndefinedLabel``. 361 | 362 | * Add support for "custom rules" in Grammars. These provide a hook for simple 363 | custom parsing hooks spelled as Python lambdas. For heavy-duty needs, 364 | you can put in Compound Expressions with LazyReferences as subexpressions, 365 | and the Grammar will hook them up for optimal efficiency--no calling 366 | ``__getitem__`` on Grammar at parse time. 367 | * Allow grammars without a default rule (in cases where there are no string 368 | rules), which leads to also allowing empty grammars. Perhaps someone 369 | building up grammars dynamically will find that useful. 370 | * Add ``@rule`` decorator, allowing grammars to be constructed out of 371 | notations on ``NodeVisitor`` methods. This saves looking back and forth 372 | between the visitor and the grammar when there is only one visitor per 373 | grammar. 374 | * Add ``parse()`` and ``match()`` convenience methods to ``NodeVisitor``. 375 | This makes the common case of parsing a string and applying exactly one 376 | visitor to the AST shorter and simpler. 377 | * Improve exception message when you forget to declare a visitor method. 378 | * Add ``unwrapped_exceptions`` attribute to ``NodeVisitor``, letting you 379 | name certain exceptions which propagate out of visitors without being 380 | wrapped by ``VisitationError`` exceptions. 381 | * Expose much more of the library in ``__init__``, making your imports 382 | shorter. 
383 | * Drastically simplify reference resolution machinery. (Vladimir Keleshev) 384 | 385 | 0.5 386 | .. warning:: 387 | 388 | This release makes some backward-incompatible changes. See below. 389 | 390 | * Add alpha-quality error reporting. Now, rather than returning ``None``, 391 | ``parse()`` and ``match()`` raise ``ParseError`` if they don't succeed. 392 | This makes more sense, since you'd rarely attempt to parse something and 393 | not care if it succeeds. It was too easy before to forget to check for a 394 | ``None`` result. ``ParseError`` gives you a human-readable unicode 395 | representation as well as some attributes that let you construct your own 396 | custom presentation. 397 | * Grammar construction now raises ``ParseError`` rather than ``BadGrammar`` 398 | if it can't parse your rules. 399 | * ``parse()`` now takes an optional ``pos`` argument, like ``match()``. 400 | * Make the ``_str__()`` method of ``UndefinedLabel`` return the right type. 401 | * Support splitting rules across multiple lines, interleaving comments, 402 | putting multiple rules on one line (but don't do that) and all sorts of 403 | other horrific behavior. 404 | * Tolerate whitespace after opening parens. 405 | * Add support for single-quoted literals. 406 | 407 | 0.4 408 | * Support Python 3. 409 | * Fix ``import *`` for ``parsimonious.expressions``. 410 | * Rewrite grammar compiler so right-recursive rules can be compiled and 411 | parsing no longer fails in some cases with forward rule references. 412 | 413 | 0.3 414 | * Support comments, the ``!`` ("not") operator, and parentheses in grammar 415 | definition syntax. 416 | * Change the ``&`` operator to a prefix operator to conform to the original 417 | PEG syntax. The version in Parsing Techniques was infix, and that's what I 418 | used as a reference. However, the unary version is more convenient, as it 419 | lets you spell ``AB & A`` as simply ``A &B``. 420 | * Take the ``print`` statements out of the benchmark tests. 421 | * Give Node an evaluate-able ``__repr__``. 422 | 423 | 0.2 424 | * Support matching of prefixes and other not-to-the-end slices of strings by 425 | making ``match()`` public and able to initialize a new cache. Add 426 | ``match()`` callthrough method to ``Grammar``. 427 | * Report a ``BadGrammar`` exception (rather than crashing) when there are 428 | mistakes in a grammar definition. 429 | * Simplify grammar compilation internals: get rid of superfluous visitor 430 | methods and factor up repetitive ones. Simplify rule grammar as well. 431 | * Add ``NodeVisitor.lift_child`` convenience method. 432 | * Rename ``VisitationException`` to ``VisitationError`` for consistency with 433 | the standard Python exception hierarchy. 434 | * Rework ``repr`` and ``str`` values for grammars and expressions. Now they 435 | both look like rule syntax. Grammars are even round-trippable! This fixes a 436 | unicode encoding error when printing nodes that had parsed unicode text. 437 | * Add tox for testing. Stop advertising Python 2.5 support, which never 438 | worked (and won't unless somebody cares a lot, since it makes Python 3 439 | support harder). 440 | * Settle (hopefully) on the term "rule" to mean "the string representation of 441 | a production". Get rid of the vague, mysterious "DSL". 442 | 443 | 0.1 444 | * A rough but useable preview release 445 | 446 | Thanks to Wiki Loves Monuments Panama for showing their support with a generous 447 | gift. 
448 | -------------------------------------------------------------------------------- /parsimonious/grammar.py: -------------------------------------------------------------------------------- 1 | """A convenience which constructs expression trees from an easy-to-read syntax 2 | 3 | Use this unless you have a compelling reason not to; it performs some 4 | optimizations that would be tedious to do when constructing an expression tree 5 | by hand. 6 | 7 | """ 8 | from collections import Mapping 9 | from inspect import isfunction, ismethod 10 | 11 | from six import (text_type, iterkeys, itervalues, iteritems, 12 | python_2_unicode_compatible, PY2) 13 | 14 | from parsimonious.exceptions import BadGrammar, UndefinedLabel 15 | from parsimonious.expressions import (Literal, Regex, Sequence, OneOf, 16 | Lookahead, Optional, ZeroOrMore, OneOrMore, Not, TokenMatcher, 17 | expression) 18 | from parsimonious.nodes import NodeVisitor 19 | from parsimonious.utils import StrAndRepr, evaluate_string 20 | 21 | @python_2_unicode_compatible 22 | class Grammar(StrAndRepr, Mapping): 23 | """A collection of rules that describe a language 24 | 25 | You can start parsing from the default rule by calling ``parse()`` 26 | directly on the ``Grammar`` object:: 27 | 28 | g = Grammar(''' 29 | polite_greeting = greeting ", my good " title 30 | greeting = "Hi" / "Hello" 31 | title = "madam" / "sir" 32 | ''') 33 | g.parse('Hello, my good sir') 34 | 35 | Or start parsing from any of the other rules; you can pull them out of the 36 | grammar as if it were a dictionary:: 37 | 38 | g['title'].parse('sir') 39 | 40 | You could also just construct a bunch of ``Expression`` objects yourself 41 | and stitch them together into a language, but using a ``Grammar`` has some 42 | important advantages: 43 | 44 | * Languages are much easier to define in the nice syntax it provides. 45 | * Circular references aren't a pain. 46 | * It does all kinds of whizzy space- and time-saving optimizations, like 47 | factoring up repeated subexpressions into a single object, which should 48 | increase cache hit ratio. [Is this implemented yet?] 49 | 50 | """ 51 | def __init__(self, rules='', **more_rules): 52 | """Construct a grammar. 53 | 54 | :arg rules: A string of production rules, one per line. 55 | :arg default_rule: The name of the rule invoked when you call 56 | :meth:`parse()` or :meth:`match()` on the grammar. Defaults to the 57 | first rule. Falls back to None if there are no string-based rules 58 | in this grammar. 59 | :arg more_rules: Additional kwargs whose names are rule names and 60 | values are Expressions or custom-coded callables which accomplish 61 | things the built-in rule syntax cannot. These take precedence over 62 | ``rules`` in case of naming conflicts. 
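        An illustrative sketch of a 2-arg custom rule (``digit`` is a made-up
        rule name here; the same form is exercised in the tests)::

            g = Grammar('''
                bracketed_digit = "[" digit "]"
                ''',
                digit=lambda text, pos:
                    (pos + 1) if text[pos].isdigit() else None)
            g.parse('[6]')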
63 | 64 | """ 65 | decorated_custom_rules = dict( 66 | (k, expression(v, k, self) if isfunction(v) or 67 | ismethod(v) else 68 | v) for k, v in iteritems(more_rules)) 69 | 70 | self._expressions, first = self._expressions_from_rules(rules, decorated_custom_rules) 71 | self.default_rule = first # may be None 72 | 73 | def __getitem__(self, rule_name): 74 | return self._expressions[rule_name] 75 | 76 | def __iter__(self): 77 | return iterkeys(self._expressions) 78 | 79 | def __len__(self): 80 | return len(self._expressions) 81 | 82 | def default(self, rule_name): 83 | """Return a new Grammar whose :term:`default rule` is ``rule_name``.""" 84 | new = self._copy() 85 | new.default_rule = new[rule_name] 86 | return new 87 | 88 | def _copy(self): 89 | """Return a shallow copy of myself. 90 | 91 | Deep is unnecessary, since Expression trees are immutable. Subgrammars 92 | recreate all the Expressions from scratch, and AbstractGrammars have 93 | no Expressions. 94 | 95 | """ 96 | new = Grammar(**self._expressions) 97 | new.default_rule = self.default_rule 98 | return new 99 | 100 | def _expressions_from_rules(self, rules, custom_rules): 101 | """Return a 2-tuple: a dict of rule names pointing to their 102 | expressions, and then the first rule. 103 | 104 | It's a web of expressions, all referencing each other. Typically, 105 | there's a single root to the web of references, and that root is the 106 | starting symbol for parsing, but there's nothing saying you can't have 107 | multiple roots. 108 | 109 | :arg custom_rules: A map of rule names to custom-coded rules: 110 | Expressions 111 | 112 | """ 113 | tree = rule_grammar.parse(rules) 114 | return RuleVisitor(custom_rules).visit(tree) 115 | 116 | def parse(self, text, pos=0): 117 | """Parse some text with the :term:`default rule`. 118 | 119 | :arg pos: The index at which to start parsing 120 | 121 | """ 122 | self._check_default_rule() 123 | return self.default_rule.parse(text, pos=pos) 124 | 125 | def match(self, text, pos=0): 126 | """Parse some text with the :term:`default rule` but not necessarily 127 | all the way to the end. 128 | 129 | :arg pos: The index at which to start parsing 130 | 131 | """ 132 | self._check_default_rule() 133 | return self.default_rule.match(text, pos=pos) 134 | 135 | def _check_default_rule(self): 136 | """Raise RuntimeError if there is no default rule defined.""" 137 | if not self.default_rule: 138 | raise RuntimeError("Can't call parse() on a Grammar that has no " 139 | "default rule. Choose a specific rule instead, " 140 | "like some_grammar['some_rule'].parse(...).") 141 | 142 | def __str__(self): 143 | """Return a rule string that, when passed to the constructor, would 144 | reconstitute the grammar.""" 145 | exprs = [self.default_rule] if self.default_rule else [] 146 | exprs.extend(expr for expr in itervalues(self) if 147 | expr is not self.default_rule) 148 | return '\n'.join(expr.as_rule() for expr in exprs) 149 | 150 | def __repr__(self): 151 | """Return an expression that will reconstitute the grammar.""" 152 | codec = 'string_escape' if PY2 else 'unicode_escape' 153 | return "Grammar('%s')" % str(self).encode(codec) 154 | 155 | 156 | class TokenGrammar(Grammar): 157 | """A Grammar which takes a list of pre-lexed tokens instead of text 158 | 159 | This is useful if you want to do the lexing yourself, as a separate pass: 160 | for example, to implement indentation-based languages. 
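    An illustrative sketch (mirroring the TokenGrammar tests)::

        from parsimonious.utils import Token

        grammar = TokenGrammar('''
            foo = "token1" "token2"
            ''')
        grammar.parse([Token('token1'), Token('token2')])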
161 | 162 | """ 163 | def _expressions_from_rules(self, rules, custom_rules): 164 | tree = rule_grammar.parse(rules) 165 | return TokenRuleVisitor(custom_rules).visit(tree) 166 | 167 | 168 | class BootstrappingGrammar(Grammar): 169 | """The grammar used to recognize the textual rules that describe other 170 | grammars 171 | 172 | This grammar gets its start from some hard-coded Expressions and claws its 173 | way from there to an expression tree that describes how to parse the 174 | grammar description syntax. 175 | 176 | """ 177 | def _expressions_from_rules(self, rule_syntax, custom_rules): 178 | """Return the rules for parsing the grammar definition syntax. 179 | 180 | Return a 2-tuple: a dict of rule names pointing to their expressions, 181 | and then the top-level expression for the first rule. 182 | 183 | """ 184 | # Hard-code enough of the rules to parse the grammar that describes the 185 | # grammar description language, to bootstrap: 186 | comment = Regex(r'#[^\r\n]*', name='comment') 187 | meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness') 188 | _ = ZeroOrMore(meaninglessness, name='_') 189 | equals = Sequence(Literal('='), _, name='equals') 190 | label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label') 191 | reference = Sequence(label, Not(equals), name='reference') 192 | quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier') 193 | # This pattern supports empty literals. TODO: A problem? 194 | spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"', 195 | ignore_case=True, 196 | dot_all=True, 197 | name='spaceless_literal') 198 | literal = Sequence(spaceless_literal, _, name='literal') 199 | regex = Sequence(Literal('~'), 200 | literal, 201 | Regex('[ilmsux]*', ignore_case=True), 202 | _, 203 | name='regex') 204 | atom = OneOf(reference, literal, regex, name='atom') 205 | quantified = Sequence(atom, quantifier, name='quantified') 206 | 207 | term = OneOf(quantified, atom, name='term') 208 | not_term = Sequence(Literal('!'), term, _, name='not_term') 209 | term.members = (not_term,) + term.members 210 | 211 | sequence = Sequence(term, OneOrMore(term), name='sequence') 212 | or_term = Sequence(Literal('/'), _, term, name='or_term') 213 | ored = Sequence(term, OneOrMore(or_term), name='ored') 214 | expression = OneOf(ored, sequence, term, name='expression') 215 | rule = Sequence(label, equals, expression, name='rule') 216 | rules = Sequence(_, OneOrMore(rule), name='rules') 217 | 218 | # Use those hard-coded rules to parse the (more extensive) rule syntax. 219 | # (For example, unless I start using parentheses in the rule language 220 | # definition itself, I should never have to hard-code expressions for 221 | # those above.) 222 | 223 | rule_tree = rules.parse(rule_syntax) 224 | 225 | # Turn the parse tree into a map of expressions: 226 | return RuleVisitor().visit(rule_tree) 227 | 228 | 229 | # The grammar for parsing PEG grammar definitions: 230 | # This is a nice, simple grammar. We may someday add to it, but it's a safe bet 231 | # that the future will always be a superset of this. 232 | rule_syntax = (r''' 233 | # Ignored things (represented by _) are typically hung off the end of the 234 | # leafmost kinds of nodes. Literals like "/" count as leaves. 235 | 236 | rules = _ rule* 237 | rule = label equals expression 238 | equals = "=" _ 239 | literal = spaceless_literal _ 240 | 241 | # So you can't spell a regex like `~"..." 
ilm`: 242 | spaceless_literal = ~"u?r?\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""is / 243 | ~"u?r?'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'"is 244 | 245 | expression = ored / sequence / term 246 | or_term = "/" _ term 247 | ored = term or_term+ 248 | sequence = term term+ 249 | not_term = "!" term _ 250 | lookahead_term = "&" term _ 251 | term = not_term / lookahead_term / quantified / atom 252 | quantified = atom quantifier 253 | atom = reference / literal / regex / parenthesized 254 | regex = "~" spaceless_literal ~"[ilmsux]*"i _ 255 | parenthesized = "(" _ expression ")" _ 256 | quantifier = ~"[*+?]" _ 257 | reference = label !equals 258 | 259 | # A subsequent equal sign is the only thing that distinguishes a label 260 | # (which begins a new rule) from a reference (which is just a pointer to a 261 | # rule defined somewhere else): 262 | label = ~"[a-zA-Z_][a-zA-Z_0-9]*" _ 263 | 264 | # _ = ~r"\s*(?:#[^\r\n]*)?\s*" 265 | _ = meaninglessness* 266 | meaninglessness = ~r"\s+" / comment 267 | comment = ~r"#[^\r\n]*" 268 | ''') 269 | 270 | 271 | class LazyReference(text_type): 272 | """A lazy reference to a rule, which we resolve after grokking all the 273 | rules""" 274 | 275 | name = u'' 276 | 277 | # Just for debugging: 278 | def _as_rhs(self): 279 | return u'' % self 280 | 281 | 282 | class RuleVisitor(NodeVisitor): 283 | """Turns a parse tree of a grammar definition into a map of ``Expression`` 284 | objects 285 | 286 | This is the magic piece that breathes life into a parsed bunch of parse 287 | rules, allowing them to go forth and parse other things. 288 | 289 | """ 290 | quantifier_classes = {'?': Optional, '*': ZeroOrMore, '+': OneOrMore} 291 | 292 | visit_expression = visit_term = visit_atom = NodeVisitor.lift_child 293 | 294 | def __init__(self, custom_rules=None): 295 | """Construct. 296 | 297 | :arg custom_rules: A dict of {rule name: expression} holding custom 298 | rules which will take precedence over the others 299 | 300 | """ 301 | self.custom_rules = custom_rules or {} 302 | 303 | def visit_parenthesized(self, node, parenthesized): 304 | """Treat a parenthesized subexpression as just its contents. 305 | 306 | Its position in the tree suffices to maintain its grouping semantics. 307 | 308 | """ 309 | left_paren, _, expression, right_paren, _ = parenthesized 310 | return expression 311 | 312 | def visit_quantifier(self, node, quantifier): 313 | """Turn a quantifier into just its symbol-matching node.""" 314 | symbol, _ = quantifier 315 | return symbol 316 | 317 | def visit_quantified(self, node, quantified): 318 | atom, quantifier = quantified 319 | return self.quantifier_classes[quantifier.text](atom) 320 | 321 | def visit_lookahead_term(self, node, lookahead_term): 322 | ampersand, term, _ = lookahead_term 323 | return Lookahead(term) 324 | 325 | def visit_not_term(self, node, not_term): 326 | exclamation, term, _ = not_term 327 | return Not(term) 328 | 329 | def visit_rule(self, node, rule): 330 | """Assign a name to the Expression and return it.""" 331 | label, equals, expression = rule 332 | expression.name = label # Assign a name to the expr. 333 | return expression 334 | 335 | def visit_sequence(self, node, sequence): 336 | """A parsed Sequence looks like [term node, OneOrMore node of 337 | ``another_term``s]. 
Flatten it out.""" 338 | term, other_terms = sequence 339 | return Sequence(term, *other_terms) 340 | 341 | def visit_ored(self, node, ored): 342 | first_term, other_terms = ored 343 | return OneOf(first_term, *other_terms) 344 | 345 | def visit_or_term(self, node, or_term): 346 | """Return just the term from an ``or_term``. 347 | 348 | We already know it's going to be ored, from the containing ``ored``. 349 | 350 | """ 351 | slash, _, term = or_term 352 | return term 353 | 354 | def visit_label(self, node, label): 355 | """Turn a label into a unicode string.""" 356 | name, _ = label 357 | return name.text 358 | 359 | def visit_reference(self, node, reference): 360 | """Stick a :class:`LazyReference` in the tree as a placeholder. 361 | 362 | We resolve them all later. 363 | 364 | """ 365 | label, not_equals = reference 366 | return LazyReference(label) 367 | 368 | def visit_regex(self, node, regex): 369 | """Return a ``Regex`` expression.""" 370 | tilde, literal, flags, _ = regex 371 | flags = flags.text.upper() 372 | pattern = literal.literal # Pull the string back out of the Literal 373 | # object. 374 | return Regex(pattern, ignore_case='I' in flags, 375 | locale='L' in flags, 376 | multiline='M' in flags, 377 | dot_all='S' in flags, 378 | unicode='U' in flags, 379 | verbose='X' in flags) 380 | 381 | def visit_spaceless_literal(self, spaceless_literal, visited_children): 382 | """Turn a string literal into a ``Literal`` that recognizes it.""" 383 | return Literal(evaluate_string(spaceless_literal.text)) 384 | 385 | def visit_literal(self, node, literal): 386 | """Pick just the literal out of a literal-and-junk combo.""" 387 | spaceless_literal, _ = literal 388 | return spaceless_literal 389 | 390 | def generic_visit(self, node, visited_children): 391 | """Replace childbearing nodes with a list of their children; keep 392 | others untouched. 393 | 394 | For our case, if a node has children, only the children are important. 395 | Otherwise, keep the node around for (for example) the flags of the 396 | regex rule. Most of these kept-around nodes are subsequently thrown 397 | away by the other visitor methods. 398 | 399 | We can't simply hang the visited children off the original node; that 400 | would be disastrous if the node occurred in more than one place in the 401 | tree. 402 | 403 | """ 404 | return visited_children or node # should semantically be a tuple 405 | 406 | def _resolve_refs(self, rule_map, expr, done): 407 | """Return an expression with all its lazy references recursively 408 | resolved. 409 | 410 | Resolve any lazy references in the expression ``expr``, recursing into 411 | all subexpressions. 412 | 413 | :arg done: The set of Expressions that have already been or are 414 | currently being resolved, to ward off redundant work and prevent 415 | infinite recursion for circular refs 416 | 417 | """ 418 | if isinstance(expr, LazyReference): 419 | label = text_type(expr) 420 | try: 421 | reffed_expr = rule_map[label] 422 | except KeyError: 423 | raise UndefinedLabel(expr) 424 | return self._resolve_refs(rule_map, reffed_expr, done) 425 | else: 426 | if getattr(expr, 'members', ()) and expr not in done: 427 | # Prevents infinite recursion for circular refs. At worst, one 428 | # of `expr.members` can refer back to `expr`, but it can't go 429 | # any farther. 
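                # E.g., with `pair = item item` and `item = "x"` (illustrative
                # rule names, not from the codebase): `pair` is a Sequence
                # whose members start out as two LazyReferences; each gets
                # looked up in rule_map and replaced by the Literal for
                # `item`, which has no members of its own and is returned
                # unchanged.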
430 | done.add(expr) 431 | expr.members = [self._resolve_refs(rule_map, member, done) 432 | for member in expr.members] 433 | return expr 434 | 435 | def visit_rules(self, node, rules_list): 436 | """Collate all the rules into a map. Return (map, default rule). 437 | 438 | The default rule is the first one. Or, if you have more than one rule 439 | of that name, it's the last-occurring rule of that name. (This lets you 440 | override the default rule when you extend a grammar.) If there are no 441 | string-based rules, the default rule is None, because the custom rules, 442 | due to being kwarg-based, are unordered. 443 | 444 | """ 445 | _, rules = rules_list 446 | 447 | # Map each rule's name to its Expression. Later rules of the same name 448 | # override earlier ones. This lets us define rules multiple times and 449 | # have the last declaration win, so you can extend grammars by 450 | # concatenation. 451 | rule_map = dict((expr.name, expr) for expr in rules) 452 | 453 | # And custom rules override string-based rules. This is the least 454 | # surprising choice when you compare the dict constructor: 455 | # dict({'x': 5}, x=6). 456 | rule_map.update(self.custom_rules) 457 | 458 | # Resolve references. This tolerates forward references. 459 | done = set() 460 | rule_map = dict((expr.name, self._resolve_refs(rule_map, expr, done)) 461 | for expr in itervalues(rule_map)) 462 | 463 | # isinstance() is a temporary hack around the fact that * rules don't 464 | # always get transformed into lists by NodeVisitor. We should fix that; 465 | # it's surprising and requires writing lame branches like this. 466 | return rule_map, (rule_map[rules[0].name] 467 | if isinstance(rules, list) and rules else None) 468 | 469 | 470 | class TokenRuleVisitor(RuleVisitor): 471 | """A visitor which builds expression trees meant to work on sequences of 472 | pre-lexed tokens rather than strings""" 473 | 474 | def visit_spaceless_literal(self, spaceless_literal, visited_children): 475 | """Turn a string literal into a ``TokenMatcher`` that matches 476 | ``Token`` objects by their ``type`` attributes.""" 477 | return TokenMatcher(evaluate_string(spaceless_literal.text)) 478 | 479 | def visit_regex(self, node, regex): 480 | tilde, literal, flags, _ = regex 481 | raise BadGrammar('Regexes do not make sense in TokenGrammars, since ' 482 | 'TokenGrammars operate on pre-lexed tokens rather ' 483 | 'than characters.') 484 | 485 | 486 | # Bootstrap to level 1... 487 | rule_grammar = BootstrappingGrammar(rule_syntax) 488 | # ...and then to level 2. This establishes that the node tree of our rule 489 | # syntax is built by the same machinery that will build trees of our users' 490 | # grammars. And the correctness of that tree is tested, indirectly, in 491 | # test_grammar. 492 | rule_grammar = Grammar(rule_syntax) 493 | 494 | 495 | # TODO: Teach Expression trees how to spit out Python representations of 496 | # themselves. Then we can just paste that in above, and we won't have to 497 | # bootstrap on import. Though it'll be a little less DRY. [Ah, but this is not 498 | # so clean, because it would have to output multiple statements to get multiple 499 | # refs to a single expression hooked up.] 500 | --------------------------------------------------------------------------------
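A final illustrative sketch (not one of the repository files above): as the
``RuleVisitor.visit_rules`` docstring explains, later rules of a given name
override earlier ones, so a grammar can be extended by concatenating new rules
onto an existing rule string:

    from parsimonious.grammar import Grammar

    base_rules = '''
        greeting = "Hi"
    '''

    # The later `greeting` wins, and the default rule (named after the first
    # rule parsed) resolves to the overriding two-alternative version:
    extended = Grammar(base_rules + '''
        greeting = "Hi" / "Hello"
    ''')
    extended.parse('Hello')  # Matches via the added alternative.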