├── .github
│   └── workflows
│       └── main.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.rst
├── parsimonious
│   ├── __init__.py
│   ├── exceptions.py
│   ├── expressions.py
│   ├── grammar.py
│   ├── nodes.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── benchmarks.py
│   │   ├── test_benchmarks.py
│   │   ├── test_expressions.py
│   │   ├── test_grammar.py
│   │   └── test_nodes.py
│   └── utils.py
├── pyproject.toml
├── setup.py
└── tox.ini

/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | name: CI
3 | 
4 | on:
5 |   push:
6 |     branches: [ master ]
7 |   pull_request:
8 |     branches: [ master ]
9 | 
10 | jobs:
11 |   build:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     strategy:
16 |       matrix:
17 |         python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13']
18 | 
19 |     name: Python ${{ matrix.python-version }}
20 |     steps:
21 |     - uses: actions/checkout@v4
22 | 
23 |     - name: Set up Python
24 |       uses: actions/setup-python@v5
25 |       with:
26 |         python-version: ${{ matrix.python-version }}
27 | 
28 |     - name: Update pip and install dev requirements
29 |       run: |
30 |         python -m pip install --upgrade pip
31 |         pip install tox tox-gh-actions
32 | 
33 |     - name: Test
34 |       run: tox
35 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .tox
2 | *.egg-info
3 | *.egg
4 | *.pyc
5 | build
6 | dist
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012 Erik Rose
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
7 | of the Software, and to permit persons to whom the Software is furnished to do
8 | so, subject to the following conditions:
9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include LICENSE
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | ============
2 | Parsimonious
3 | ============
4 | 
5 | Parsimonious aims to be the fastest arbitrary-lookahead parser written in pure
6 | Python—and the most usable. It's based on parsing expression grammars (PEGs),
7 | which means you feed it a simplified sort of EBNF notation. Parsimonious was
8 | designed to undergird a MediaWiki parser that wouldn't take 5 seconds or a GB
9 | of RAM to do one page, but it's applicable to all sorts of languages.
10 | 
11 | :Code: https://github.com/erikrose/parsimonious/
12 | :Issues: https://github.com/erikrose/parsimonious/issues
13 | :License: MIT License (MIT)
14 | :Package: https://pypi.org/project/parsimonious/
15 | 
16 | 
17 | Goals
18 | =====
19 | 
20 | * Speed
21 | * Frugal RAM use
22 | * Minimalistic, understandable, idiomatic Python code
23 | * Readable grammars
24 | * Extensible grammars
25 | * Complete test coverage
26 | * Separation of concerns. Some Python parsing kits mix recognition with
27 |   instructions about how to turn the resulting tree into some kind of other
28 |   representation. This is limiting when you want to do several different things
29 |   with a tree: for example, render wiki markup to HTML *or* to text.
30 | * Good error reporting. I want the parser to work *with* me as I develop a
31 |   grammar.
32 | 
33 | 
34 | Install
35 | =======
36 | 
37 | To install Parsimonious, run::
38 | 
39 |     $ pip install parsimonious
40 | 
41 | 
42 | Example Usage
43 | =============
44 | 
45 | Here's how to build a simple grammar:
46 | 
47 | .. code:: python
48 | 
49 |     >>> from parsimonious.grammar import Grammar
50 |     >>> grammar = Grammar(
51 |     ...     """
52 |     ...     bold_text = bold_open text bold_close
53 |     ...     text = ~"[A-Z 0-9]*"i
54 |     ...     bold_open = "(("
55 |     ...     bold_close = "))"
56 |     ...     """)
57 | 
58 | You can have forward references and even right recursion; it's all taken care
59 | of by the grammar compiler. The first rule is taken to be the default start
60 | symbol, but you can override that.
61 | 
62 | Next, let's parse something and get an abstract syntax tree:
63 | 
64 | .. code:: python
65 | 
66 |     >>> print(grammar.parse('((bold stuff))'))
67 |     <Node called "bold_text" matching "((bold stuff))">
68 |         <Node called "bold_open" matching "((">
69 |         <RegexNode called "text" matching "bold stuff">
70 |         <Node called "bold_close" matching "))">
71 | 
72 | You'd typically then use a ``nodes.NodeVisitor`` subclass (see below) to walk
73 | the tree and do something useful with it.
74 | 
75 | Another example would be to implement a parser for ``.ini`` files. Consider the following:
76 | 
77 | .. code:: python
78 | 
79 |     grammar = Grammar(
80 |         r"""
81 |         expr = (entry / emptyline)*
82 |         entry = section pair*
83 | 
84 |         section = lpar word rpar ws
85 |         pair = key equal value ws?
86 | 
87 |         key = word+
88 |         value = (word / quoted)+
89 |         word = ~r"[-\w]+"
90 |         quoted = ~'"[^\"]+"'
91 |         equal = ws? "=" ws?
92 |         lpar = "["
93 |         rpar = "]"
94 |         ws = ~r"\s*"
95 |         emptyline = ws+
96 |         """
97 |     )
98 | 
99 | 
100 | We could now implement a subclass of ``NodeVisitor`` like so:
101 | 
102 | .. code:: python
103 | 
104 |     class IniVisitor(NodeVisitor):
105 |         def visit_expr(self, node, visited_children):
106 |             """ Returns the overall output. """
107 |             output = {}
108 |             for child in visited_children:
109 |                 output.update(child[0])
110 |             return output
111 | 
112 |         def visit_entry(self, node, visited_children):
113 |             """ Makes a dict of the section (as key) and the key/value pairs. """
114 |             key, values = visited_children
115 |             return {key: dict(values)}
116 | 
117 |         def visit_section(self, node, visited_children):
118 |             """ Gets the section name. """
119 |             _, section, *_ = visited_children
120 |             return section.text
121 | 
122 |         def visit_pair(self, node, visited_children):
123 |             """ Gets each key/value pair, returns a tuple. """
124 |             key, _, value, *_ = node.children
125 |             return key.text, value.text
126 | 
127 |         def generic_visit(self, node, visited_children):
128 |             """ The generic visit method.
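            Returns the visited children if there are any; otherwise, returns
            the node itself, so rules without a bespoke visitor still pass a
            value up the tree.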
""" 129 | return visited_children or node 130 | 131 | And call it like that: 132 | 133 | .. code:: python 134 | 135 | from parsimonious.grammar import Grammar 136 | from parsimonious.nodes import NodeVisitor 137 | 138 | data = """[section] 139 | somekey = somevalue 140 | someotherkey=someothervalue 141 | 142 | [anothersection] 143 | key123 = "what the heck?" 144 | key456="yet another one here" 145 | 146 | """ 147 | 148 | tree = grammar.parse(data) 149 | 150 | iv = IniVisitor() 151 | output = iv.visit(tree) 152 | print(output) 153 | 154 | This would yield 155 | 156 | .. code:: python 157 | 158 | {'section': {'somekey': 'somevalue', 'someotherkey': 'someothervalue'}, 'anothersection': {'key123': '"what the heck?"', 'key456': '"yet another one here"'}} 159 | 160 | Status 161 | ====== 162 | 163 | * Everything that exists works. Test coverage is good. 164 | * I don't plan on making any backward-incompatible changes to the rule syntax 165 | in the future, so you can write grammars with confidence. 166 | * It may be slow and use a lot of RAM; I haven't measured either yet. However, 167 | I have yet to begin optimizing in earnest. 168 | * Error reporting is now in place. ``repr`` methods of expressions, grammars, 169 | and nodes are clear and helpful as well. The ``Grammar`` ones are 170 | even round-trippable! 171 | * The grammar extensibility story is underdeveloped at the moment. You should 172 | be able to extend a grammar by simply concatenating more rules onto the 173 | existing ones; later rules of the same name should override previous ones. 174 | However, this is untested and may not be the final story. 175 | * Sphinx docs are coming, but the docstrings are quite useful now. 176 | * Note that there may be API changes until we get to 1.0, so be sure to pin to 177 | the version you're using. 178 | 179 | Coming Soon 180 | ----------- 181 | 182 | * Optimizations to make Parsimonious worthy of its name 183 | * Tighter RAM use 184 | * Better-thought-out grammar extensibility story 185 | * Amazing grammar debugging 186 | 187 | 188 | A Little About PEG Parsers 189 | ========================== 190 | 191 | PEG parsers don't draw a distinction between lexing and parsing; everything is 192 | done at once. As a result, there is no lookahead limit, as there is with, for 193 | instance, Yacc. And, due to both of these properties, PEG grammars are easier 194 | to write: they're basically just a more practical dialect of EBNF. With 195 | caching, they take O(grammar size * text length) memory (though I plan to do 196 | better), but they run in O(text length) time. 197 | 198 | More Technically 199 | ---------------- 200 | 201 | PEGs can describe a superset of *LL(k)* languages, any deterministic *LR(k)* 202 | language, and many others—including some that aren't context-free 203 | (http://www.brynosaurus.com/pub/lang/peg.pdf). They can also deal with what 204 | would be ambiguous languages if described in canonical EBNF. They do this by 205 | trading the ``|`` alternation operator for the ``/`` operator, which works the 206 | same except that it makes priority explicit: ``a / b / c`` first tries matching 207 | ``a``. If that fails, it tries ``b``, and, failing that, moves on to ``c``. 208 | Thus, ambiguity is resolved by always yielding the first successful recognition. 209 | 210 | 211 | Writing Grammars 212 | ================ 213 | 214 | Grammars are defined by a series of rules. The syntax should be familiar to 215 | anyone who uses regexes or reads programming language manuals. 
An example will
215 | serve best:
216 | 
217 | .. code:: python
218 | 
219 |     my_grammar = Grammar(r"""
220 |         styled_text = bold_text / italic_text
221 |         bold_text   = "((" text "))"
222 |         italic_text = "''" text "''"
223 |         text        = ~"[A-Z 0-9]*"i
224 |         """)
225 | 
226 | You can wrap a rule across multiple lines if you like; the syntax is very
227 | forgiving.
228 | 
229 | 
230 | If you want to save your grammar into a separate file, you should name it using
231 | the ``.ppeg`` extension.
232 | 
233 | 
234 | Syntax Reference
235 | ----------------
236 | 
237 | ==================== ========================================================
238 | ``"some literal"``   Used to quote literals. Backslash escaping and Python
239 |                      conventions for "raw" and Unicode strings help support
240 |                      fiddly characters.
241 | 
242 | ``b"some literal"``  A bytes literal. Using bytes literals and regular
243 |                      expressions allows your grammar to parse binary files.
244 |                      Note that all literals and regular expressions must be
245 |                      of the same type within a grammar. In grammars that
246 |                      process bytestrings, you should make the grammar string
247 |                      an ``r"""string"""`` so that byte literals like ``\xff``
248 |                      work correctly.
249 | 
250 | [space]              Sequences are made out of space- or tab-delimited
251 |                      things. ``a b c`` matches spots where those 3
252 |                      terms appear in that order.
253 | 
254 | ``a / b / c``        Alternatives. The first to succeed of ``a / b / c``
255 |                      wins.
256 | 
257 | ``thing?``           An optional expression. This is greedy, always consuming
258 |                      ``thing`` if it exists.
259 | 
260 | ``&thing``           A lookahead assertion. Ensures ``thing`` matches at the
261 |                      current position but does not consume it.
262 | 
263 | ``!thing``           A negative lookahead assertion. Matches if ``thing``
264 |                      isn't found here. Doesn't consume any text.
265 | 
266 | ``things*``          Zero or more things. This is greedy, always consuming as
267 |                      many repetitions as it can.
268 | 
269 | ``things+``          One or more things. This is greedy, always consuming as
270 |                      many repetitions as it can.
271 | 
272 | ``~r"regex"ilmsuxa`` Regexes have ``~`` in front and are quoted like
273 |                      literals. Any flags_ (``ilmsuxa``) follow the end quotes
274 |                      as single chars. Regexes are good for representing
275 |                      character classes (``[a-z0-9]``) and optimizing for
276 |                      speed. The downside is that they won't be able to take
277 |                      advantage of our fancy debugging, once we get that
278 |                      working. Ultimately, I'd like to deprecate explicit
279 |                      regexes and instead have Parsimonious dynamically build
280 |                      them out of simpler primitives. Parsimonious uses the
281 |                      regex_ library instead of the built-in re module.
282 | 
283 | ``~br"regex"``       A bytes regex; required if your grammar parses
284 |                      bytestrings.
285 | 
286 | ``(things)``         Parentheses are used for grouping, like in every other
287 |                      language.
288 | 
289 | ``thing{n}``         Exactly ``n`` repetitions of ``thing``.
290 | 
291 | ``thing{n,m}``       Between ``n`` and ``m`` repetitions (inclusive).
292 | 
293 | ``thing{,m}``        At most ``m`` repetitions of ``thing``.
294 | 
295 | ``thing{n,}``        At least ``n`` repetitions of ``thing``.
296 | 
297 | ==================== ========================================================
298 | 
299 | .. _flags: https://docs.python.org/3/howto/regex.html#compilation
300 | .. _regex: https://github.com/mrabarnett/mrab-regex
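As a quick, informal illustration (a sketch of ours, not an example from the
package's documentation), here is a small grammar combining several of the
constructs above:

.. code:: python

    from parsimonious.grammar import Grammar

    toy = Grammar(r"""
        line    = word (sep word)* newline?
        word    = !keyword ~r"[a-z]+"
        keyword = "fin"
        sep     = ", " / " "
        newline = "\n"
        """)
    toy.parse('hello, brave new world')

``word`` uses a negative lookahead to refuse the reserved word ``fin``,
``sep`` relies on ordered choice, and the ``?`` and ``*`` quantifiers are
greedy, as described in the table.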
301 | 
302 | Optimizing Grammars
303 | ===================
304 | 
305 | Don't Repeat Expressions
306 | ------------------------
307 | 
308 | If you need a ``~"[a-z0-9]"i`` at two points in your grammar, don't type it
309 | twice. Make it a rule of its own, and reference it from wherever you need it.
310 | You'll get the most out of the caching this way, since cache lookups are by
311 | expression object identity (for speed).
312 | 
313 | Even if you have an expression that's very simple, not repeating it will
314 | save RAM, as there can, at worst, be a cached int for every char in the text
315 | you're parsing. In the future, we may identify repeated subexpressions
316 | automatically and factor them up while building the grammar.
317 | 
318 | How much should you shove into one regex, versus how much should you break them
319 | up to not repeat yourself? That's a fine balance and worthy of benchmarking.
320 | More stuff jammed into a regex will execute faster, because it doesn't have to
321 | run any Python between pieces, but a broken-up one will give better cache
322 | performance if the individual pieces are re-used elsewhere. If the pieces of a
323 | regex aren't used anywhere else, by all means keep the whole thing together.
324 | 
325 | 
326 | Quantifiers
327 | -----------
328 | 
329 | Bring your ``?`` and ``*`` quantifiers up to the highest level you
330 | can. Otherwise, lower-level patterns could succeed but be empty and put a bunch
331 | of useless nodes in your tree that didn't really match anything.
332 | 
333 | 
334 | Processing Parse Trees
335 | ======================
336 | 
337 | A parse tree has a node for each expression matched, even if it matched a
338 | zero-length string, like ``"thing"?`` might.
339 | 
340 | The ``NodeVisitor`` class provides an inversion-of-control framework for
341 | walking a tree and returning a new construct (tree, string, or whatever) based
342 | on it. For now, have a look at its docstrings for more detail. There's also a
343 | good example in ``grammar.RuleVisitor``. Notice how we take advantage of nodes'
344 | iterability by unpacking the visited children with a tuple assignment:
345 | 
346 | .. code:: python
347 | 
348 |     def visit_or_term(self, or_term, visited_children):
349 |         slash, _, term = visited_children
350 | 
351 | For reference, here is the production the above unpacks::
352 | 
353 |     or_term = "/" _ term
354 | 
355 | When something goes wrong in your visitor, you get a nice error like this::
356 | 
357 |     [normal traceback here...]
358 |     VisitationException: 'Node' object has no attribute 'foo'
359 | 
360 |     Parse tree:
361 |     <Node called "bold_text" matching "((bold stuff))">  <-- *** We were here. ***
362 |         <Node called "bold_open" matching "((">
363 |         <RegexNode called "text" matching "bold stuff">
364 |         <Node called "bold_close" matching "))">
365 | 
366 | 
367 | 
368 | 
369 | 
370 | 
371 | 
372 | 
373 | 
374 | 
375 | 
376 | 
377 | 
378 | 
379 | 
380 | 
381 | 
382 | 
383 | The parse tree is tacked onto the exception, and the node whose visitor method
384 | raised the error is pointed out.
385 | 
386 | Why No Streaming Tree Processing?
387 | ---------------------------------
388 | 
389 | Some have asked why we don't process the tree as we go, SAX-style. There are
390 | two main reasons:
391 | 
392 | 1. It wouldn't work. With a PEG parser, no parsing decision is final until the
393 |    whole text is parsed. If we had to change a decision, we'd have to backtrack
394 |    and redo the SAX-style interpretation as well, which would involve
395 |    reconstituting part of the AST and quite possibly scuttling whatever you
396 |    were doing with the streaming output. (Note that some bursty SAX-style
397 |    processing may be possible in the future if we use cuts.)
398 | 399 | 2. It interferes with the ability to derive multiple representations from the 400 | AST: for example, turning wiki markup into first HTML and then text. 401 | 402 | 403 | Future Directions 404 | ================= 405 | 406 | Rule Syntax Changes 407 | ------------------- 408 | 409 | * Maybe support left-recursive rules like PyMeta, if anybody cares. 410 | * Ultimately, I'd like to get rid of explicit regexes and break them into more 411 | atomic things like character classes. Then we can dynamically compile bits 412 | of the grammar into regexes as necessary to boost speed. 413 | 414 | Optimizations 415 | ------------- 416 | 417 | * Make RAM use almost constant by automatically inserting "cuts", as described 418 | in 419 | http://ialab.cs.tsukuba.ac.jp/~mizusima/publications/paste513-mizushima.pdf. 420 | This would also improve error reporting, as we wouldn't backtrack out of 421 | everything informative before finally failing. 422 | * Find all the distinct subexpressions, and unify duplicates for a better cache 423 | hit ratio. 424 | * Think about having the user (optionally) provide some representative input 425 | along with a grammar. We can then profile against it, see which expressions 426 | are worth caching, and annotate the grammar. Perhaps there will even be 427 | positions at which a given expression is more worth caching. Or we could keep 428 | a count of how many times each cache entry has been used and evict the most 429 | useless ones as RAM use grows. 430 | * We could possibly compile the grammar into VM instructions, like in "A 431 | parsing machine for PEGs" by Medeiros. 432 | * If the recursion gets too deep in practice, use trampolining to dodge it. 433 | 434 | Niceties 435 | -------- 436 | 437 | * Pijnu has a raft of tree manipulators. I don't think I want all of them, but 438 | a judicious subset might be nice. Don't get into mixing formatting with tree 439 | manipulation. 440 | https://github.com/erikrose/pijnu/blob/master/library/node.py#L333. PyPy's 441 | parsing lib exposes a sane subset: 442 | http://doc.pypy.org/en/latest/rlib.html#tree-transformations. 443 | 444 | 445 | Version History 446 | =============== 447 | (Next release) 448 | * Fix bug #238: correctly handle `/` expressions with multiple terms in a row. (lucaswiman) 449 | 450 | 0.10.0 451 | * Fix infinite recursion in __eq__ in some cases. (FelisNivalis) 452 | * Improve error message in left-recursive rules. (lucaswiman) 453 | * Add support for range ``{min,max}`` repetition expressions (righthandabacus) 454 | * Fix bug in ``*`` and ``+`` for token grammars (lucaswiman) 455 | * Add support for grammars on bytestrings (lucaswiman) 456 | * Fix LazyReference resolution bug #134 (righthandabacus) 457 | * ~15% speedup on benchmarks with a faster node cache (ethframe) 458 | 459 | .. warning:: 460 | 461 | This release makes backward-incompatible changes: 462 | 463 | * Fix precedence of string literal modifiers ``u/r/b``. 464 | This will break grammars with no spaces between a 465 | reference and a string literal. 
(lucaswiman) 466 | 467 | 468 | 0.9.0 469 | * Add support for Python 3.7, 3.8, 3.9, 3.10 (righthandabacus, Lonnen) 470 | * Drop support for Python 2.x, 3.3, 3.4 (righthandabacus, Lonnen) 471 | * Remove six and go all in on Python 3 idioms (Lonnen) 472 | * Replace re with regex for improved handling of unicode characters 473 | in regexes (Oderjunkie) 474 | * Dropped nose for unittest (swayson) 475 | * `Grammar.__repr__()` now correctly escapes backslashes (ingolemo) 476 | * Custom rules can now be class methods in addition to 477 | functions (James Addison) 478 | * Make the ascii flag available in the regex syntax (Roman Inflianskas) 479 | 480 | 0.8.1 481 | * Switch to a function-style ``print`` in the benchmark tests so we work 482 | cleanly as a dependency on Python 3. (Edward Betts) 483 | 484 | 0.8.0 485 | * Make Grammar iteration ordered, making the ``__repr__`` more like the 486 | original input. (Lucas Wiman) 487 | * Improve text representation and error messages for anonymous 488 | subexpressions. (Lucas Wiman) 489 | * Expose BadGrammar and VisitationError as top-level imports. 490 | * No longer crash when you try to compare a Node to an instance of a 491 | different class. (Esben Sonne) 492 | * Pin ``six`` at 1.9.0 to ensure we have ``python_2_unicode_compatible``. 493 | (Sam Raker) 494 | * Drop Python 2.6 support. 495 | 496 | 0.7.0 497 | * Add experimental token-based parsing, via TokenGrammar class, for those 498 | operating on pre-lexed streams of tokens. This can, for example, help parse 499 | indentation-sensitive languages that use the "off-side rule", like Python. 500 | (Erik Rose) 501 | * Common codebase for Python 2 and 3: no more 2to3 translation step (Mattias 502 | Urlichs, Lucas Wiman) 503 | * Drop Python 3.1 and 3.2 support. 504 | * Fix a bug in ``Grammar.__repr__`` which fails to work on Python 3 since the 505 | string_escape codec is gone in Python 3. (Lucas Wiman) 506 | * Don't lose parentheses when printing representations of expressions. 507 | (Michael Kelly) 508 | * Make Grammar an immutable mapping (until we add automatic recompilation). 509 | (Michael Kelly) 510 | 511 | 0.6.2 512 | * Make grammar compilation 100x faster. Thanks to dmoisset for the initial 513 | patch. 514 | 515 | 0.6.1 516 | * Fix bug which made the default rule of a grammar invalid when it 517 | contained a forward reference. 518 | 519 | 0.6 520 | .. warning:: 521 | 522 | This release makes backward-incompatible changes: 523 | 524 | * The ``default_rule`` arg to Grammar's constructor has been replaced 525 | with a method, ``some_grammar.default('rule_name')``, which returns a 526 | new grammar just like the old except with its default rule changed. 527 | This is to free up the constructor kwargs for custom rules. 528 | * ``UndefinedLabel`` is no longer a subclass of ``VisitationError``. This 529 | matters only in the unlikely case that you were catching 530 | ``VisitationError`` exceptions and expecting to thus also catch 531 | ``UndefinedLabel``. 532 | 533 | * Add support for "custom rules" in Grammars. These provide a hook for simple 534 | custom parsing hooks spelled as Python lambdas. For heavy-duty needs, 535 | you can put in Compound Expressions with LazyReferences as subexpressions, 536 | and the Grammar will hook them up for optimal efficiency--no calling 537 | ``__getitem__`` on Grammar at parse time. 538 | * Allow grammars without a default rule (in cases where there are no string 539 | rules), which leads to also allowing empty grammars. 
Perhaps someone
540 | building up grammars dynamically will find that useful.
541 |   * Add ``@rule`` decorator, allowing grammars to be constructed out of
542 |     notations on ``NodeVisitor`` methods. This saves looking back and forth
543 |     between the visitor and the grammar when there is only one visitor per
544 |     grammar.
545 |   * Add ``parse()`` and ``match()`` convenience methods to ``NodeVisitor``.
546 |     This makes the common case of parsing a string and applying exactly one
547 |     visitor to the AST shorter and simpler.
548 |   * Improve exception message when you forget to declare a visitor method.
549 |   * Add ``unwrapped_exceptions`` attribute to ``NodeVisitor``, letting you
550 |     name certain exceptions which propagate out of visitors without being
551 |     wrapped by ``VisitationError`` exceptions.
552 |   * Expose much more of the library in ``__init__``, making your imports
553 |     shorter.
554 |   * Drastically simplify reference resolution machinery. (Vladimir Keleshev)
555 | 
556 | 0.5
557 |   .. warning::
558 | 
559 |       This release makes some backward-incompatible changes. See below.
560 | 
561 |   * Add alpha-quality error reporting. Now, rather than returning ``None``,
562 |     ``parse()`` and ``match()`` raise ``ParseError`` if they don't succeed.
563 |     This makes more sense, since you'd rarely attempt to parse something and
564 |     not care if it succeeds. It was too easy before to forget to check for a
565 |     ``None`` result. ``ParseError`` gives you a human-readable unicode
566 |     representation as well as some attributes that let you construct your own
567 |     custom presentation.
568 |   * Grammar construction now raises ``ParseError`` rather than ``BadGrammar``
569 |     if it can't parse your rules.
570 |   * ``parse()`` now takes an optional ``pos`` argument, like ``match()``.
571 |   * Make the ``__str__()`` method of ``UndefinedLabel`` return the right type.
572 |   * Support splitting rules across multiple lines, interleaving comments,
573 |     putting multiple rules on one line (but don't do that) and all sorts of
574 |     other horrific behavior.
575 |   * Tolerate whitespace after opening parens.
576 |   * Add support for single-quoted literals.
577 | 
578 | 0.4
579 |   * Support Python 3.
580 |   * Fix ``import *`` for ``parsimonious.expressions``.
581 |   * Rewrite grammar compiler so right-recursive rules can be compiled and
582 |     parsing no longer fails in some cases with forward rule references.
583 | 
584 | 0.3
585 |   * Support comments, the ``!`` ("not") operator, and parentheses in grammar
586 |     definition syntax.
587 |   * Change the ``&`` operator to a prefix operator to conform to the original
588 |     PEG syntax. The version in Parsing Techniques was infix, and that's what I
589 |     used as a reference. However, the unary version is more convenient, as it
590 |     lets you spell ``AB & A`` as simply ``A &B``.
591 |   * Take the ``print`` statements out of the benchmark tests.
592 |   * Give Node an evaluate-able ``__repr__``.
593 | 
594 | 0.2
595 |   * Support matching of prefixes and other not-to-the-end slices of strings by
596 |     making ``match()`` public and able to initialize a new cache. Add
597 |     ``match()`` callthrough method to ``Grammar``.
598 |   * Report a ``BadGrammar`` exception (rather than crashing) when there are
599 |     mistakes in a grammar definition.
600 |   * Simplify grammar compilation internals: get rid of superfluous visitor
601 |     methods and factor up repetitive ones. Simplify rule grammar as well.
602 |   * Add ``NodeVisitor.lift_child`` convenience method.
603 | * Rename ``VisitationException`` to ``VisitationError`` for consistency with 604 | the standard Python exception hierarchy. 605 | * Rework ``repr`` and ``str`` values for grammars and expressions. Now they 606 | both look like rule syntax. Grammars are even round-trippable! This fixes a 607 | unicode encoding error when printing nodes that had parsed unicode text. 608 | * Add tox for testing. Stop advertising Python 2.5 support, which never 609 | worked (and won't unless somebody cares a lot, since it makes Python 3 610 | support harder). 611 | * Settle (hopefully) on the term "rule" to mean "the string representation of 612 | a production". Get rid of the vague, mysterious "DSL". 613 | 614 | 0.1 615 | * A rough but useable preview release 616 | 617 | Thanks to Wiki Loves Monuments Panama for showing their support with a generous 618 | gift. 619 | -------------------------------------------------------------------------------- /parsimonious/__init__.py: -------------------------------------------------------------------------------- 1 | """Parsimonious's public API. Import from here. 2 | 3 | Things may move around in modules deeper than this one. 4 | 5 | """ 6 | from parsimonious.exceptions import (ParseError, IncompleteParseError, 7 | VisitationError, UndefinedLabel, 8 | BadGrammar) 9 | from parsimonious.grammar import Grammar, TokenGrammar 10 | from parsimonious.nodes import NodeVisitor, VisitationError, rule 11 | -------------------------------------------------------------------------------- /parsimonious/exceptions.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from parsimonious.utils import StrAndRepr 4 | 5 | 6 | class ParsimoniousError(Exception): 7 | """A base exception class to allow library users to catch any Parsimonious error.""" 8 | pass 9 | 10 | 11 | class ParseError(StrAndRepr, ParsimoniousError): 12 | """A call to ``Expression.parse()`` or ``match()`` didn't match.""" 13 | 14 | def __init__(self, text, pos=-1, expr=None): 15 | # It would be nice to use self.args, but I don't want to pay a penalty 16 | # to call descriptors or have the confusion of numerical indices in 17 | # Expression.match_core(). 18 | self.text = text 19 | self.pos = pos 20 | self.expr = expr 21 | 22 | def __str__(self): 23 | rule_name = (("'%s'" % self.expr.name) if self.expr.name else 24 | str(self.expr)) 25 | return "Rule %s didn't match at '%s' (line %s, column %s)." % ( 26 | rule_name, 27 | self.text[self.pos:self.pos + 20], 28 | self.line(), 29 | self.column()) 30 | 31 | # TODO: Add line, col, and separated-out error message so callers can build 32 | # their own presentation. 33 | 34 | def line(self): 35 | """Return the 1-based line number where the expression ceased to 36 | match.""" 37 | # This is a method rather than a property in case we ever wanted to 38 | # pass in which line endings we want to use. 39 | if isinstance(self.text, list): # TokenGrammar 40 | return None 41 | else: 42 | return self.text.count('\n', 0, self.pos) + 1 43 | 44 | def column(self): 45 | """Return the 1-based column where the expression ceased to match.""" 46 | # We choose 1-based because that's what Python does with SyntaxErrors. 
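        # A sketch of how callers typically use this location info
        # (``grammar`` and ``text`` here are hypothetical stand-ins):
        #
        #     try:
        #         grammar.parse(text)
        #     except ParseError as error:
        #         print('Failed at line %s, column %s'
        #               % (error.line(), error.column()))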
47 | try: 48 | return self.pos - self.text.rindex('\n', 0, self.pos) 49 | except (ValueError, AttributeError): 50 | return self.pos + 1 51 | 52 | 53 | class LeftRecursionError(ParseError): 54 | def __str__(self): 55 | rule_name = self.expr.name if self.expr.name else str(self.expr) 56 | window = self.text[self.pos:self.pos + 20] 57 | return dedent(f""" 58 | Left recursion in rule {rule_name!r} at {window!r} (line {self.line()}, column {self.column()}). 59 | 60 | Parsimonious is a packrat parser, so it can't handle left recursion. 61 | See https://en.wikipedia.org/wiki/Parsing_expression_grammar#Indirect_left_recursion 62 | for how to rewrite your grammar into a rule that does not use left-recursion. 63 | """ 64 | ).strip() 65 | 66 | 67 | class IncompleteParseError(ParseError): 68 | """A call to ``parse()`` matched a whole Expression but did not consume the 69 | entire text.""" 70 | 71 | def __str__(self): 72 | return "Rule '%s' matched in its entirety, but it didn't consume all the text. The non-matching portion of the text begins with '%s' (line %s, column %s)." % ( 73 | self.expr.name, 74 | self.text[self.pos:self.pos + 20], 75 | self.line(), 76 | self.column()) 77 | 78 | 79 | class VisitationError(ParsimoniousError): 80 | """Something went wrong while traversing a parse tree. 81 | 82 | This exception exists to augment an underlying exception with information 83 | about where in the parse tree the error occurred. Otherwise, it could be 84 | tiresome to figure out what went wrong; you'd have to play back the whole 85 | tree traversal in your head. 86 | 87 | """ 88 | # TODO: Make sure this is pickleable. Probably use @property pattern. Make 89 | # the original exc and node available on it if they don't cause a whole 90 | # raft of stack frames to be retained. 91 | def __init__(self, exc, exc_class, node): 92 | """Construct. 93 | 94 | :arg exc: What went wrong. We wrap this and add more info. 95 | :arg node: The node at which the error occurred 96 | 97 | """ 98 | self.original_class = exc_class 99 | super().__init__( 100 | '%s: %s\n\n' 101 | 'Parse tree:\n' 102 | '%s' % 103 | (exc_class.__name__, 104 | exc, 105 | node.prettily(error=node))) 106 | 107 | 108 | class BadGrammar(StrAndRepr, ParsimoniousError): 109 | """Something was wrong with the definition of a grammar. 110 | 111 | Note that a ParseError might be raised instead if the error is in the 112 | grammar definition syntax. 113 | 114 | """ 115 | 116 | 117 | class UndefinedLabel(BadGrammar): 118 | """A rule referenced in a grammar was never defined. 119 | 120 | Circular references and forward references are okay, but you have to define 121 | stuff at some point. 122 | 123 | """ 124 | def __init__(self, label): 125 | self.label = label 126 | 127 | def __str__(self): 128 | return 'The label "%s" was never defined.' % self.label 129 | -------------------------------------------------------------------------------- /parsimonious/expressions.py: -------------------------------------------------------------------------------- 1 | """Subexpressions that make up a parsed grammar 2 | 3 | These do the parsing. 4 | 5 | """ 6 | # TODO: Make sure all symbol refs are local--not class lookups or 7 | # anything--for speed. And kill all the dots. 
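#
# A minimal sketch of using this module directly (the Grammar front end in
# grammar.py is the usual entry point; these names are all defined below):
#
#     from parsimonious.expressions import Literal, Sequence
#     bold = Sequence(Literal('(('), Literal('))'), name='bold')
#     bold.parse('(())')  # returns a Node tree spanning the whole string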
8 | 
9 | from collections import defaultdict
10 | from inspect import getfullargspec, isfunction, ismethod, ismethoddescriptor
11 | try:
12 |     import regex as re
13 | except ImportError:
14 |     import re  # Fallback as per https://github.com/erikrose/parsimonious/issues/231
15 | 
16 | from parsimonious.exceptions import ParseError, IncompleteParseError, LeftRecursionError
17 | from parsimonious.nodes import Node, RegexNode
18 | from parsimonious.utils import StrAndRepr
19 | 
20 | 
21 | def is_callable(value):
22 |     criteria = [isfunction, ismethod, ismethoddescriptor]
23 |     return any([criterion(value) for criterion in criteria])
24 | 
25 | 
26 | def expression(callable, rule_name, grammar):
27 |     """Turn a plain callable into an Expression.
28 | 
29 |     The callable can be of this simple form::
30 | 
31 |         def foo(text, pos):
32 |             '''If this custom expression matches starting at text[pos], return
33 |             the index where it stops matching. Otherwise, return None.'''
34 |             if the expression matched:
35 |                 return end_pos
36 | 
37 |     If there are child nodes to return, return a tuple::
38 | 
39 |         return end_pos, children
40 | 
41 |     If the expression doesn't match at the given ``pos`` at all... ::
42 | 
43 |         return None
44 | 
45 |     If your callable needs to make sub-calls to other rules in the grammar or
46 |     do error reporting, it can take this form, gaining additional arguments::
47 | 
48 |         def foo(text, pos, cache, error, grammar):
49 |             # Call out to other rules:
50 |             node = grammar['another_rule'].match_core(text, pos, cache, error)
51 |             ...
52 |             # Return values as above.
53 | 
54 |     The return value of the callable, if an int or a tuple, will be
55 |     automatically transmuted into a :class:`~parsimonious.Node`. If it returns
56 |     a Node-like class directly, it will be passed through unchanged.
57 | 
58 |     :arg rule_name: The rule name to attach to the resulting
59 |         :class:`~parsimonious.Expression`
60 |     :arg grammar: The :class:`~parsimonious.Grammar` this expression will be a
61 |         part of, to make delegating to other rules possible
62 | 
63 |     """
64 | 
65 |     # Resolve unbound methods; allows grammars to use @staticmethod custom rules
66 |     # https://stackoverflow.com/questions/41921255/staticmethod-object-is-not-callable
67 |     if ismethoddescriptor(callable) and hasattr(callable, '__func__'):
68 |         callable = callable.__func__
69 | 
70 |     num_args = len(getfullargspec(callable).args)
71 |     if ismethod(callable):
72 |         # do not count the first argument (typically 'self') for methods
73 |         num_args -= 1
74 |     if num_args == 2:
75 |         is_simple = True
76 |     elif num_args == 5:
77 |         is_simple = False
78 |     else:
79 |         raise RuntimeError("Custom rule functions must take either 2 or 5 "
80 |                            "arguments, not %s."
% num_args) 81 | 82 | class AdHocExpression(Expression): 83 | def _uncached_match(self, text, pos, cache, error): 84 | result = (callable(text, pos) if is_simple else 85 | callable(text, pos, cache, error, grammar)) 86 | 87 | if isinstance(result, int): 88 | end, children = result, None 89 | elif isinstance(result, tuple): 90 | end, children = result 91 | else: 92 | # Node or None 93 | return result 94 | return Node(self, text, pos, end, children=children) 95 | 96 | def _as_rhs(self): 97 | return '{custom function "%s"}' % callable.__name__ 98 | 99 | return AdHocExpression(name=rule_name) 100 | 101 | 102 | IN_PROGRESS = object() 103 | 104 | 105 | class Expression(StrAndRepr): 106 | """A thing that can be matched against a piece of text""" 107 | 108 | # Slots are about twice as fast as __dict__-based attributes: 109 | # http://stackoverflow.com/questions/1336791/dictionary-vs-object-which-is-more-efficient-and-why 110 | 111 | # Top-level expressions--rules--have names. Subexpressions are named ''. 112 | __slots__ = ['name', 'identity_tuple'] 113 | 114 | def __init__(self, name=''): 115 | self.name = name 116 | self.identity_tuple = (self.name, ) 117 | 118 | def __hash__(self): 119 | return hash(self.identity_tuple) 120 | 121 | def __eq__(self, other): 122 | return self._eq_check_cycles(other, set()) 123 | 124 | def __ne__(self, other): 125 | return not (self == other) 126 | 127 | def _eq_check_cycles(self, other, checked): 128 | # keep a set of all pairs that are already checked, so we won't fall into infinite recursions. 129 | checked.add((id(self), id(other))) 130 | return other.__class__ is self.__class__ and self.identity_tuple == other.identity_tuple 131 | 132 | def resolve_refs(self, rule_map): 133 | # Nothing to do on the base expression. 134 | return self 135 | 136 | def parse(self, text, pos=0): 137 | """Return a parse tree of ``text``. 138 | 139 | Raise ``ParseError`` if the expression wasn't satisfied. Raise 140 | ``IncompleteParseError`` if the expression was satisfied but didn't 141 | consume the full string. 142 | 143 | """ 144 | node = self.match(text, pos=pos) 145 | if node.end < len(text): 146 | raise IncompleteParseError(text, node.end, self) 147 | return node 148 | 149 | def match(self, text, pos=0): 150 | """Return the parse tree matching this expression at the given 151 | position, not necessarily extending all the way to the end of ``text``. 152 | 153 | Raise ``ParseError`` if there is no match there. 154 | 155 | :arg pos: The index at which to start matching 156 | 157 | """ 158 | error = ParseError(text) 159 | node = self.match_core(text, pos, defaultdict(dict), error) 160 | if node is None: 161 | raise error 162 | return node 163 | 164 | def match_core(self, text, pos, cache, error): 165 | """Internal guts of ``match()`` 166 | 167 | This is appropriate to call only from custom rules or Expression 168 | subclasses. 169 | 170 | :arg cache: The packrat cache:: 171 | 172 | {(oid, pos): Node tree matched by object `oid` at index `pos` ...} 173 | 174 | :arg error: A ParseError instance with ``text`` already filled in but 175 | otherwise blank. We update the error reporting info on this object 176 | as we go. (Sticking references on an existing instance is faster 177 | than allocating a new one for each expression that fails.) We 178 | return None rather than raising and catching ParseErrors because 179 | catching is slow. 180 | 181 | """ 182 | # TODO: Optimize. Probably a hot spot. 183 | # 184 | # Is there a faster way of looking up cached stuff? 
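        # (Concretely, per the lookups just below, the cache shape is
        # {id(expression): {pos: Node | None | IN_PROGRESS}}, so a hit costs
        # two dict probes.)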
185 | # 186 | # If this is slow, think about the array module. It might (or might 187 | # not!) use more RAM, but it'll likely be faster than hashing things 188 | # all the time. Also, can we move all the allocs up front? 189 | # 190 | # To save space, we have lots of choices: (0) Quit caching whole Node 191 | # objects. Cache just what you need to reconstitute them. (1) Cache 192 | # only the results of entire rules, not subexpressions (probably a 193 | # horrible idea for rules that need to backtrack internally a lot). (2) 194 | # Age stuff out of the cache somehow. LRU? (3) Cuts. 195 | expr_cache = cache[id(self)] 196 | if pos in expr_cache: 197 | node = expr_cache[pos] 198 | else: 199 | # TODO: Set default value to prevent infinite recursion in left-recursive rules. 200 | expr_cache[pos] = IN_PROGRESS # Mark as in progress 201 | node = expr_cache[pos] = self._uncached_match(text, pos, cache, error) 202 | if node is IN_PROGRESS: 203 | raise LeftRecursionError(text, pos=-1, expr=self) 204 | 205 | # Record progress for error reporting: 206 | if node is None and pos >= error.pos and ( 207 | self.name or getattr(error.expr, 'name', None) is None): 208 | # Don't bother reporting on unnamed expressions (unless that's all 209 | # we've seen so far), as they're hard to track down for a human. 210 | # Perhaps we could include the unnamed subexpressions later as 211 | # auxiliary info. 212 | error.expr = self 213 | error.pos = pos 214 | 215 | return node 216 | 217 | def __str__(self): 218 | return '<%s %s>' % ( 219 | self.__class__.__name__, 220 | self.as_rule()) 221 | 222 | def as_rule(self): 223 | """Return the left- and right-hand sides of a rule that represents me. 224 | 225 | Return unicode. If I have no ``name``, omit the left-hand side. 226 | 227 | """ 228 | rhs = self._as_rhs().strip() 229 | if rhs.startswith('(') and rhs.endswith(')'): 230 | rhs = rhs[1:-1] 231 | 232 | return ('%s = %s' % (self.name, rhs)) if self.name else rhs 233 | 234 | def _unicode_members(self): 235 | """Return an iterable of my unicode-represented children, stopping 236 | descent when we hit a named node so the returned value resembles the 237 | input rule.""" 238 | return [(m.name or m._as_rhs()) for m in self.members] 239 | 240 | def _as_rhs(self): 241 | """Return the right-hand side of a rule that represents me. 242 | 243 | Implemented by subclasses. 244 | 245 | """ 246 | raise NotImplementedError 247 | 248 | 249 | class Literal(Expression): 250 | """A string literal 251 | 252 | Use these if you can; they're the fastest. 253 | 254 | """ 255 | __slots__ = ['literal'] 256 | 257 | def __init__(self, literal, name=''): 258 | super().__init__(name) 259 | self.literal = literal 260 | self.identity_tuple = (name, literal) 261 | 262 | def _uncached_match(self, text, pos, cache, error): 263 | if text.startswith(self.literal, pos): 264 | return Node(self, text, pos, pos + len(self.literal)) 265 | 266 | def _as_rhs(self): 267 | return repr(self.literal) 268 | 269 | 270 | class TokenMatcher(Literal): 271 | """An expression matching a single token of a given type 272 | 273 | This is for use only with TokenGrammars. 274 | 275 | """ 276 | def _uncached_match(self, token_list, pos, cache, error): 277 | if token_list[pos].type == self.literal: 278 | return Node(self, token_list, pos, pos + 1) 279 | 280 | 281 | class Regex(Expression): 282 | """An expression that matches what a regex does. 283 | 284 | Use these as much as you can and jam as much into each one as you can; 285 | they're fast. 
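    In rule syntax these are written like ``~r"[a-z0-9]+"i``; the trailing
    flag characters come from the set ``ilmsuxa``.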
286 | 287 | """ 288 | __slots__ = ['re'] 289 | 290 | def __init__(self, pattern, name='', ignore_case=False, locale=False, 291 | multiline=False, dot_all=False, unicode=False, verbose=False, ascii=False): 292 | super().__init__(name) 293 | self.re = re.compile(pattern, (ignore_case and re.I) | 294 | (locale and re.L) | 295 | (multiline and re.M) | 296 | (dot_all and re.S) | 297 | (unicode and re.U) | 298 | (verbose and re.X) | 299 | (ascii and re.A)) 300 | self.identity_tuple = (self.name, self.re) 301 | 302 | def _uncached_match(self, text, pos, cache, error): 303 | """Return length of match, ``None`` if no match.""" 304 | m = self.re.match(text, pos) 305 | if m is not None: 306 | span = m.span() 307 | node = RegexNode(self, text, pos, pos + span[1] - span[0]) 308 | node.match = m # TODO: A terrible idea for cache size? 309 | return node 310 | 311 | def _regex_flags_from_bits(self, bits): 312 | """Return the textual equivalent of numerically encoded regex flags.""" 313 | flags = 'ilmsuxa' 314 | return ''.join(flags[i - 1] if (1 << i) & bits else '' for i in range(1, len(flags) + 1)) 315 | 316 | def _as_rhs(self): 317 | return '~{!r}{}'.format(self.re.pattern, 318 | self._regex_flags_from_bits(self.re.flags)) 319 | 320 | 321 | class Compound(Expression): 322 | """An abstract expression which contains other expressions""" 323 | 324 | __slots__ = ['members'] 325 | 326 | def __init__(self, *members, **kwargs): 327 | """``members`` is a sequence of expressions.""" 328 | super().__init__(kwargs.get('name', '')) 329 | self.members = members 330 | 331 | def resolve_refs(self, rule_map): 332 | self.members = tuple(m.resolve_refs(rule_map) for m in self.members) 333 | return self 334 | 335 | def _eq_check_cycles(self, other, checked): 336 | return ( 337 | super()._eq_check_cycles(other, checked) and 338 | len(self.members) == len(other.members) and 339 | all(m._eq_check_cycles(mo, checked) for m, mo in zip(self.members, other.members) if (id(m), id(mo)) not in checked) 340 | ) 341 | 342 | def __hash__(self): 343 | # Note we leave members out of the hash computation, since compounds can get added to 344 | # sets, then have their members mutated. See RuleVisitor._resolve_refs. 345 | # Equality should still work, but we want the rules to go into the correct hash bucket. 346 | return hash((self.__class__, self.name)) 347 | 348 | 349 | class Sequence(Compound): 350 | """A series of expressions that must match contiguous, ordered pieces of 351 | the text 352 | 353 | In other words, it's a concatenation operator: each piece has to match, one 354 | after another. 355 | 356 | """ 357 | def _uncached_match(self, text, pos, cache, error): 358 | new_pos = pos 359 | children = [] 360 | for m in self.members: 361 | node = m.match_core(text, new_pos, cache, error) 362 | if node is None: 363 | return None 364 | children.append(node) 365 | length = node.end - node.start 366 | new_pos += length 367 | # Hooray! We got through all the members! 368 | return Node(self, text, pos, new_pos, children) 369 | 370 | def _as_rhs(self): 371 | return '({0})'.format(' '.join(self._unicode_members())) 372 | 373 | 374 | class OneOf(Compound): 375 | """A series of expressions, one of which must match 376 | 377 | Expressions are tested in order from first to last. The first to succeed 378 | wins. 
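    This is the expression behind the ``/`` alternation operator in rule
    syntax.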
379 | 380 | """ 381 | def _uncached_match(self, text, pos, cache, error): 382 | for m in self.members: 383 | node = m.match_core(text, pos, cache, error) 384 | if node is not None: 385 | # Wrap the succeeding child in a node representing the OneOf: 386 | return Node(self, text, pos, node.end, children=[node]) 387 | 388 | def _as_rhs(self): 389 | return '({0})'.format(' / '.join(self._unicode_members())) 390 | 391 | 392 | class Lookahead(Compound): 393 | """An expression which consumes nothing, even if its contained expression 394 | succeeds""" 395 | 396 | __slots__ = ['negativity'] 397 | 398 | def __init__(self, member, *, negative=False, **kwargs): 399 | super().__init__(member, **kwargs) 400 | self.negativity = bool(negative) 401 | 402 | def _uncached_match(self, text, pos, cache, error): 403 | node = self.members[0].match_core(text, pos, cache, error) 404 | if (node is None) == self.negativity: # negative lookahead == match only if not found 405 | return Node(self, text, pos, pos) 406 | 407 | def _as_rhs(self): 408 | return '%s%s' % ('!' if self.negativity else '&', self._unicode_members()[0]) 409 | 410 | def _eq_check_cycles(self, other, checked): 411 | return ( 412 | super()._eq_check_cycles(other, checked) and 413 | self.negativity == other.negativity 414 | ) 415 | 416 | def Not(term): 417 | return Lookahead(term, negative=True) 418 | 419 | # Quantifiers. None of these is strictly necessary, but they're darn handy. 420 | 421 | class Quantifier(Compound): 422 | """An expression wrapper like the */+/?/{n,m} quantifier in regexes.""" 423 | 424 | __slots__ = ['min', 'max'] 425 | 426 | def __init__(self, member, *, min=0, max=float('inf'), name='', **kwargs): 427 | super().__init__(member, name=name, **kwargs) 428 | self.min = min 429 | self.max = max 430 | 431 | def _uncached_match(self, text, pos, cache, error): 432 | new_pos = pos 433 | children = [] 434 | size = len(text) 435 | while new_pos < size and len(children) < self.max: 436 | node = self.members[0].match_core(text, new_pos, cache, error) 437 | if node is None: 438 | break # no more matches 439 | children.append(node) 440 | length = node.end - node.start 441 | if len(children) >= self.min and length == 0: # Don't loop infinitely 442 | break 443 | new_pos += length 444 | if len(children) >= self.min: 445 | return Node(self, text, pos, new_pos, children) 446 | 447 | def _as_rhs(self): 448 | if self.min == 0 and self.max == 1: 449 | qualifier = '?' 
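        # (The branches below map the remaining (min, max) pairs back to the
        # *, +, {n,}, {,m}, and {n,m} spellings of the rule syntax.)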
450 | elif self.min == 0 and self.max == float('inf'): 451 | qualifier = '*' 452 | elif self.min == 1 and self.max == float('inf'): 453 | qualifier = '+' 454 | elif self.max == float('inf'): 455 | qualifier = '{%d,}' % self.min 456 | elif self.min == 0: 457 | qualifier = '{,%d}' % self.max 458 | else: 459 | qualifier = '{%d,%d}' % (self.min, self.max) 460 | return '%s%s' % (self._unicode_members()[0], qualifier) 461 | 462 | def _eq_check_cycles(self, other, checked): 463 | return ( 464 | super()._eq_check_cycles(other, checked) and 465 | self.min == other.min and 466 | self.max == other.max 467 | ) 468 | 469 | def ZeroOrMore(member, name=''): 470 | return Quantifier(member, name=name, min=0, max=float('inf')) 471 | 472 | def OneOrMore(member, name='', min=1): 473 | return Quantifier(member, name=name, min=min, max=float('inf')) 474 | 475 | def Optional(member, name=''): 476 | return Quantifier(member, name=name, min=0, max=1) 477 | -------------------------------------------------------------------------------- /parsimonious/grammar.py: -------------------------------------------------------------------------------- 1 | """A convenience which constructs expression trees from an easy-to-read syntax 2 | 3 | Use this unless you have a compelling reason not to; it performs some 4 | optimizations that would be tedious to do when constructing an expression tree 5 | by hand. 6 | 7 | """ 8 | from collections import OrderedDict 9 | from textwrap import dedent 10 | 11 | from parsimonious.exceptions import BadGrammar, UndefinedLabel 12 | from parsimonious.expressions import (Literal, Regex, Sequence, OneOf, 13 | Lookahead, Quantifier, Optional, ZeroOrMore, OneOrMore, Not, TokenMatcher, 14 | expression, is_callable) 15 | from parsimonious.nodes import NodeVisitor 16 | from parsimonious.utils import evaluate_string 17 | 18 | class Grammar(OrderedDict): 19 | """A collection of rules that describe a language 20 | 21 | You can start parsing from the default rule by calling ``parse()`` 22 | directly on the ``Grammar`` object:: 23 | 24 | g = Grammar(''' 25 | polite_greeting = greeting ", my good " title 26 | greeting = "Hi" / "Hello" 27 | title = "madam" / "sir" 28 | ''') 29 | g.parse('Hello, my good sir') 30 | 31 | Or start parsing from any of the other rules; you can pull them out of the 32 | grammar as if it were a dictionary:: 33 | 34 | g['title'].parse('sir') 35 | 36 | You could also just construct a bunch of ``Expression`` objects yourself 37 | and stitch them together into a language, but using a ``Grammar`` has some 38 | important advantages: 39 | 40 | * Languages are much easier to define in the nice syntax it provides. 41 | * Circular references aren't a pain. 42 | * It does all kinds of whizzy space- and time-saving optimizations, like 43 | factoring up repeated subexpressions into a single object, which should 44 | increase cache hit ratio. [Is this implemented yet?] 45 | 46 | """ 47 | def __init__(self, rules='', **more_rules): 48 | """Construct a grammar. 49 | 50 | :arg rules: A string of production rules, one per line. 51 | :arg default_rule: The name of the rule invoked when you call 52 | :meth:`parse()` or :meth:`match()` on the grammar. Defaults to the 53 | first rule. Falls back to None if there are no string-based rules 54 | in this grammar. 55 | :arg more_rules: Additional kwargs whose names are rule names and 56 | values are Expressions or custom-coded callables which accomplish 57 | things the built-in rule syntax cannot. 
These take precedence over 58 | ``rules`` in case of naming conflicts. 59 | 60 | """ 61 | 62 | decorated_custom_rules = { 63 | k: (expression(v, k, self) if is_callable(v) else v) 64 | for k, v in more_rules.items()} 65 | 66 | exprs, first = self._expressions_from_rules(rules, decorated_custom_rules) 67 | super().__init__(exprs.items()) 68 | self.default_rule = first # may be None 69 | 70 | def default(self, rule_name): 71 | """Return a new Grammar whose :term:`default rule` is ``rule_name``.""" 72 | new = self._copy() 73 | new.default_rule = new[rule_name] 74 | return new 75 | 76 | def _copy(self): 77 | """Return a shallow copy of myself. 78 | 79 | Deep is unnecessary, since Expression trees are immutable. Subgrammars 80 | recreate all the Expressions from scratch, and AbstractGrammars have 81 | no Expressions. 82 | 83 | """ 84 | new = Grammar.__new__(Grammar) 85 | super(Grammar, new).__init__(self.items()) 86 | new.default_rule = self.default_rule 87 | return new 88 | 89 | def _expressions_from_rules(self, rules, custom_rules): 90 | """Return a 2-tuple: a dict of rule names pointing to their 91 | expressions, and then the first rule. 92 | 93 | It's a web of expressions, all referencing each other. Typically, 94 | there's a single root to the web of references, and that root is the 95 | starting symbol for parsing, but there's nothing saying you can't have 96 | multiple roots. 97 | 98 | :arg custom_rules: A map of rule names to custom-coded rules: 99 | Expressions 100 | 101 | """ 102 | tree = rule_grammar.parse(rules) 103 | return RuleVisitor(custom_rules).visit(tree) 104 | 105 | def parse(self, text, pos=0): 106 | """Parse some text with the :term:`default rule`. 107 | 108 | :arg pos: The index at which to start parsing 109 | 110 | """ 111 | self._check_default_rule() 112 | return self.default_rule.parse(text, pos=pos) 113 | 114 | def match(self, text, pos=0): 115 | """Parse some text with the :term:`default rule` but not necessarily 116 | all the way to the end. 117 | 118 | :arg pos: The index at which to start parsing 119 | 120 | """ 121 | self._check_default_rule() 122 | return self.default_rule.match(text, pos=pos) 123 | 124 | def _check_default_rule(self): 125 | """Raise RuntimeError if there is no default rule defined.""" 126 | if not self.default_rule: 127 | raise RuntimeError("Can't call parse() on a Grammar that has no " 128 | "default rule. Choose a specific rule instead, " 129 | "like some_grammar['some_rule'].parse(...).") 130 | 131 | def __str__(self): 132 | """Return a rule string that, when passed to the constructor, would 133 | reconstitute the grammar.""" 134 | exprs = [self.default_rule] if self.default_rule else [] 135 | exprs.extend(expr for expr in self.values() if 136 | expr is not self.default_rule) 137 | return '\n'.join(expr.as_rule() for expr in exprs) 138 | 139 | def __repr__(self): 140 | """Return an expression that will reconstitute the grammar.""" 141 | return "Grammar({!r})".format(str(self)) 142 | 143 | 144 | class TokenGrammar(Grammar): 145 | """A Grammar which takes a list of pre-lexed tokens instead of text 146 | 147 | This is useful if you want to do the lexing yourself, as a separate pass: 148 | for example, to implement indentation-based languages. 
149 | 150 | """ 151 | def _expressions_from_rules(self, rules, custom_rules): 152 | tree = rule_grammar.parse(rules) 153 | return TokenRuleVisitor(custom_rules).visit(tree) 154 | 155 | 156 | class BootstrappingGrammar(Grammar): 157 | """The grammar used to recognize the textual rules that describe other 158 | grammars 159 | 160 | This grammar gets its start from some hard-coded Expressions and claws its 161 | way from there to an expression tree that describes how to parse the 162 | grammar description syntax. 163 | 164 | """ 165 | def _expressions_from_rules(self, rule_syntax, custom_rules): 166 | """Return the rules for parsing the grammar definition syntax. 167 | 168 | Return a 2-tuple: a dict of rule names pointing to their expressions, 169 | and then the top-level expression for the first rule. 170 | 171 | """ 172 | # Hard-code enough of the rules to parse the grammar that describes the 173 | # grammar description language, to bootstrap: 174 | comment = Regex(r'#[^\r\n]*', name='comment') 175 | meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness') 176 | _ = ZeroOrMore(meaninglessness, name='_') 177 | equals = Sequence(Literal('='), _, name='equals') 178 | label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label') 179 | reference = Sequence(label, Not(equals), name='reference') 180 | quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier') 181 | # This pattern supports empty literals. TODO: A problem? 182 | spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"', 183 | ignore_case=True, 184 | dot_all=True, 185 | name='spaceless_literal') 186 | literal = Sequence(spaceless_literal, _, name='literal') 187 | regex = Sequence(Literal('~'), 188 | literal, 189 | Regex('[ilmsuxa]*', ignore_case=True), 190 | _, 191 | name='regex') 192 | atom = OneOf(reference, literal, regex, name='atom') 193 | quantified = Sequence(atom, quantifier, name='quantified') 194 | 195 | term = OneOf(quantified, atom, name='term') 196 | not_term = Sequence(Literal('!'), term, _, name='not_term') 197 | term.members = (not_term,) + term.members 198 | 199 | sequence = Sequence(term, OneOrMore(term), name='sequence') 200 | or_term = Sequence(Literal('/'), _, OneOrMore(term), name='or_term') 201 | ored = Sequence(OneOrMore(term), OneOrMore(or_term), name='ored') 202 | expression = OneOf(ored, sequence, term, name='expression') 203 | rule = Sequence(label, equals, expression, name='rule') 204 | rules = Sequence(_, OneOrMore(rule), name='rules') 205 | 206 | # Use those hard-coded rules to parse the (more extensive) rule syntax. 207 | # (For example, unless I start using parentheses in the rule language 208 | # definition itself, I should never have to hard-code expressions for 209 | # those above.) 210 | 211 | rule_tree = rules.parse(rule_syntax) 212 | 213 | # Turn the parse tree into a map of expressions: 214 | return RuleVisitor().visit(rule_tree) 215 | 216 | 217 | # The grammar for parsing PEG grammar definitions: 218 | # This is a nice, simple grammar. We may someday add to it, but it's a safe bet 219 | # that the future will always be a superset of this. 220 | rule_syntax = (r''' 221 | # Ignored things (represented by _) are typically hung off the end of the 222 | # leafmost kinds of nodes. Literals like "/" count as leaves. 223 | 224 | rules = _ rule* 225 | rule = label equals expression 226 | equals = "=" _ 227 | literal = spaceless_literal _ 228 | 229 | # So you can't spell a regex like `~"..." 
ilm`:
230 | spaceless_literal = ~"u?r?b?\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""is /
231 | ~"u?r?b?'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'"is
232 | 
233 | expression = ored / sequence / term
234 | or_term = "/" _ term+
235 | ored = term+ or_term+
236 | sequence = term term+
237 | not_term = "!" term _
238 | lookahead_term = "&" term _
239 | term = not_term / lookahead_term / quantified / atom
240 | quantified = atom quantifier
241 | atom = reference / literal / regex / parenthesized
242 | regex = "~" spaceless_literal ~"[ilmsuxa]*"i _
243 | parenthesized = "(" _ expression ")" _
244 | quantifier = ~r"[*+?]|\{\d*,\d+\}|\{\d+,\d*\}|\{\d+\}" _
245 | reference = label !equals
246 | 
247 | # A subsequent equal sign is the only thing that distinguishes a label
248 | # (which begins a new rule) from a reference (which is just a pointer to a
249 | # rule defined somewhere else):
250 | label = ~"[a-zA-Z_][a-zA-Z_0-9]*(?![\"'])" _
251 | 
252 | # _ = ~r"\s*(?:#[^\r\n]*)?\s*"
253 | _ = meaninglessness*
254 | meaninglessness = ~r"\s+" / comment
255 | comment = ~r"#[^\r\n]*"
256 | ''')
257 | 
258 | 
259 | class LazyReference(str):
260 | """A lazy reference to a rule, which we resolve after grokking all the
261 | rules"""
262 | 
263 | name = ''
264 | 
265 | def resolve_refs(self, rule_map):
266 | """
267 | Traverse the rule map following top-level lazy references,
268 | until we reach a cycle (raise an error) or a concrete expression.
269 | 
270 | For example, the following is a circular reference:
271 | foo = bar
272 | baz = foo2
273 | foo2 = foo
274 | 
275 | Note that every RHS of a grammar rule _must_ be either a
276 | LazyReference or a concrete expression, so the reference chain will
277 | eventually either terminate or find a cycle.
278 | """
279 | seen = set()
280 | cur = self
281 | while True:
282 | if cur in seen:
283 | raise BadGrammar(f"Circular Reference resolving {self.name}={self}.")
284 | else:
285 | seen.add(cur)
286 | try:
287 | cur = rule_map[str(cur)]
288 | except KeyError:
289 | raise UndefinedLabel(cur)
290 | if not isinstance(cur, LazyReference):
291 | return cur
292 | 
293 | # Just for debugging:
294 | def _as_rhs(self):
295 | return '<LazyReference to %s>' % self
296 | 
297 | 
298 | class RuleVisitor(NodeVisitor):
299 | """Turns a parse tree of a grammar definition into a map of ``Expression``
300 | objects
301 | 
302 | This is the magic piece that breathes life into a parsed bunch of parse
303 | rules, allowing them to go forth and parse other things.
304 | 
305 | """
306 | quantifier_classes = {'?': Optional, '*': ZeroOrMore, '+': OneOrMore}
307 | 
308 | visit_expression = visit_term = visit_atom = NodeVisitor.lift_child
309 | 
310 | def __init__(self, custom_rules=None):
311 | """Construct.
312 | 
313 | :arg custom_rules: A dict of {rule name: expression} holding custom
314 | rules which will take precedence over the others
315 | 
316 | """
317 | self.custom_rules = custom_rules or {}
318 | self._last_literal_node_and_type = None
319 | 
320 | def visit_parenthesized(self, node, parenthesized):
321 | """Treat a parenthesized subexpression as just its contents.
322 | 
323 | Its position in the tree suffices to maintain its grouping semantics.
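        For example, in ``foo = "a" ("b" "c")+``, the parenthesized part
        compiles to an anonymous ``Sequence``, and the ``+`` quantifier then
        wraps that ``Sequence`` directly; no dedicated "parenthesized"
        expression survives into the compiled grammar.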
324 | 325 | """ 326 | left_paren, _, expression, right_paren, _ = parenthesized 327 | return expression 328 | 329 | def visit_quantifier(self, node, quantifier): 330 | """Turn a quantifier into just its symbol-matching node.""" 331 | symbol, _ = quantifier 332 | return symbol 333 | 334 | def visit_quantified(self, node, quantified): 335 | atom, quantifier = quantified 336 | try: 337 | return self.quantifier_classes[quantifier.text](atom) 338 | except KeyError: 339 | # This should pass: assert re.full_match("\{(\d*)(,(\d*))?\}", quantifier) 340 | quantifier = quantifier.text[1:-1].split(",") 341 | if len(quantifier) == 1: 342 | min_match = max_match = int(quantifier[0]) 343 | else: 344 | min_match = int(quantifier[0]) if quantifier[0] else 0 345 | max_match = int(quantifier[1]) if quantifier[1] else float('inf') 346 | return Quantifier(atom, min=min_match, max=max_match) 347 | 348 | def visit_lookahead_term(self, node, lookahead_term): 349 | ampersand, term, _ = lookahead_term 350 | return Lookahead(term) 351 | 352 | def visit_not_term(self, node, not_term): 353 | exclamation, term, _ = not_term 354 | return Not(term) 355 | 356 | def visit_rule(self, node, rule): 357 | """Assign a name to the Expression and return it.""" 358 | label, equals, expression = rule 359 | expression.name = label # Assign a name to the expr. 360 | return expression 361 | 362 | def visit_sequence(self, node, sequence): 363 | """A parsed Sequence looks like [term node, OneOrMore node of 364 | ``another_term``s]. Flatten it out.""" 365 | term, other_terms = sequence 366 | return Sequence(term, *other_terms) 367 | 368 | def visit_ored(self, node, ored): 369 | first_term, other_terms = ored 370 | if len(first_term) == 1: 371 | first_term = first_term[0] 372 | else: 373 | first_term = Sequence(*first_term) 374 | return OneOf(first_term, *other_terms) 375 | 376 | def visit_or_term(self, node, or_term): 377 | """Return just the term from an ``or_term``. 378 | 379 | We already know it's going to be ored, from the containing ``ored``. 380 | 381 | """ 382 | slash, _, terms = or_term 383 | if len(terms) == 1: 384 | return terms[0] 385 | else: 386 | return Sequence(*terms) 387 | 388 | def visit_label(self, node, label): 389 | """Turn a label into a unicode string.""" 390 | name, _ = label 391 | return name.text 392 | 393 | def visit_reference(self, node, reference): 394 | """Stick a :class:`LazyReference` in the tree as a placeholder. 395 | 396 | We resolve them all later. 397 | 398 | """ 399 | label, not_equals = reference 400 | return LazyReference(label) 401 | 402 | def visit_regex(self, node, regex): 403 | """Return a ``Regex`` expression.""" 404 | tilde, literal, flags, _ = regex 405 | flags = flags.text.upper() 406 | pattern = literal.literal # Pull the string back out of the Literal 407 | # object. 408 | return Regex(pattern, ignore_case='I' in flags, 409 | locale='L' in flags, 410 | multiline='M' in flags, 411 | dot_all='S' in flags, 412 | unicode='U' in flags, 413 | verbose='X' in flags, 414 | ascii='A' in flags) 415 | 416 | def visit_spaceless_literal(self, spaceless_literal, visited_children): 417 | """Turn a string literal into a ``Literal`` that recognizes it.""" 418 | literal_value = evaluate_string(spaceless_literal.text) 419 | if self._last_literal_node_and_type: 420 | last_node, last_type = self._last_literal_node_and_type 421 | if last_type != type(literal_value): 422 | raise BadGrammar(dedent(f"""\ 423 | Found {last_node.text} ({last_type}) and {spaceless_literal.text} ({type(literal_value)}) string literals. 
424 | All strings in a single grammar must be of the same type. 425 | """) 426 | ) 427 | 428 | self._last_literal_node_and_type = spaceless_literal, type(literal_value) 429 | 430 | return Literal(literal_value) 431 | 432 | def visit_literal(self, node, literal): 433 | """Pick just the literal out of a literal-and-junk combo.""" 434 | spaceless_literal, _ = literal 435 | return spaceless_literal 436 | 437 | def generic_visit(self, node, visited_children): 438 | """Replace childbearing nodes with a list of their children; keep 439 | others untouched. 440 | 441 | For our case, if a node has children, only the children are important. 442 | Otherwise, keep the node around for (for example) the flags of the 443 | regex rule. Most of these kept-around nodes are subsequently thrown 444 | away by the other visitor methods. 445 | 446 | We can't simply hang the visited children off the original node; that 447 | would be disastrous if the node occurred in more than one place in the 448 | tree. 449 | 450 | """ 451 | return visited_children or node # should semantically be a tuple 452 | 453 | def visit_rules(self, node, rules_list): 454 | """Collate all the rules into a map. Return (map, default rule). 455 | 456 | The default rule is the first one. Or, if you have more than one rule 457 | of that name, it's the last-occurring rule of that name. (This lets you 458 | override the default rule when you extend a grammar.) If there are no 459 | string-based rules, the default rule is None, because the custom rules, 460 | due to being kwarg-based, are unordered. 461 | 462 | """ 463 | _, rules = rules_list 464 | 465 | # Map each rule's name to its Expression. Later rules of the same name 466 | # override earlier ones. This lets us define rules multiple times and 467 | # have the last declaration win, so you can extend grammars by 468 | # concatenation. 469 | rule_map = OrderedDict((expr.name, expr) for expr in rules) 470 | 471 | # And custom rules override string-based rules. This is the least 472 | # surprising choice when you compare the dict constructor: 473 | # dict({'x': 5}, x=6). 474 | rule_map.update(self.custom_rules) 475 | 476 | # Resolve references. This tolerates forward references. 477 | for name, rule in list(rule_map.items()): 478 | if hasattr(rule, 'resolve_refs'): 479 | # Some custom rules may not define a resolve_refs method, 480 | # though anything that inherits from Expression will have it. 481 | rule_map[name] = rule.resolve_refs(rule_map) 482 | 483 | # isinstance() is a temporary hack around the fact that * rules don't 484 | # always get transformed into lists by NodeVisitor. We should fix that; 485 | # it's surprising and requires writing lame branches like this. 
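        # To illustrate the docstring above (rule names here are purely
        # illustrative):
        #
        #   Grammar('foo = "f"\nbar = "b"')     # default rule: foo
        #   Grammar('foo = "f"', bar=expr)      # default rule: still foo
        #   Grammar(bar=expr)                   # no string rules, no default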
486 | return rule_map, (rule_map[rules[0].name] 487 | if isinstance(rules, list) and rules else None) 488 | 489 | 490 | class TokenRuleVisitor(RuleVisitor): 491 | """A visitor which builds expression trees meant to work on sequences of 492 | pre-lexed tokens rather than strings""" 493 | 494 | def visit_spaceless_literal(self, spaceless_literal, visited_children): 495 | """Turn a string literal into a ``TokenMatcher`` that matches 496 | ``Token`` objects by their ``type`` attributes.""" 497 | return TokenMatcher(evaluate_string(spaceless_literal.text)) 498 | 499 | def visit_regex(self, node, regex): 500 | tilde, literal, flags, _ = regex 501 | raise BadGrammar('Regexes do not make sense in TokenGrammars, since ' 502 | 'TokenGrammars operate on pre-lexed tokens rather ' 503 | 'than characters.') 504 | 505 | 506 | # Bootstrap to level 1... 507 | rule_grammar = BootstrappingGrammar(rule_syntax) 508 | # ...and then to level 2. This establishes that the node tree of our rule 509 | # syntax is built by the same machinery that will build trees of our users' 510 | # grammars. And the correctness of that tree is tested, indirectly, in 511 | # test_grammar. 512 | rule_grammar = Grammar(rule_syntax) 513 | 514 | 515 | # TODO: Teach Expression trees how to spit out Python representations of 516 | # themselves. Then we can just paste that in above, and we won't have to 517 | # bootstrap on import. Though it'll be a little less DRY. [Ah, but this is not 518 | # so clean, because it would have to output multiple statements to get multiple 519 | # refs to a single expression hooked up.] 520 | -------------------------------------------------------------------------------- /parsimonious/nodes.py: -------------------------------------------------------------------------------- 1 | """Nodes that make up parse trees 2 | 3 | Parsing spits out a tree of these, which you can then tell to walk itself and 4 | spit out a useful value. Or you can walk it yourself; the structural attributes 5 | are public. 6 | 7 | """ 8 | # TODO: If this is slow, think about using cElementTree or something. 9 | from inspect import isfunction 10 | from sys import version_info, exc_info 11 | 12 | from parsimonious.exceptions import VisitationError, UndefinedLabel 13 | 14 | 15 | class Node(object): 16 | """A parse tree node 17 | 18 | Consider these immutable once constructed. As a side effect of a 19 | memory-saving strategy in the cache, multiple references to a single 20 | ``Node`` might be returned in a single parse tree. So, if you start 21 | messing with one, you'll see surprising parallel changes pop up elsewhere. 22 | 23 | My philosophy is that parse trees (and their nodes) should be 24 | representation-agnostic. That is, they shouldn't get all mixed up with what 25 | the final rendered form of a wiki page (or the intermediate representation 26 | of a programming language, or whatever) is going to be: you should be able 27 | to parse once and render several representations from the tree, one after 28 | another. 29 | 30 | """ 31 | # I tried making this subclass list, but it got ugly. I had to construct 32 | # invalid ones and patch them up later, and there were other problems. 33 | __slots__ = ['expr', # The expression that generated me 34 | 'full_text', # The full text fed to the parser 35 | 'start', # The position in the text where that expr started matching 36 | 'end', # The position after start where the expr first didn't 37 | # match. [start:end] follow Python slice conventions. 
38 | 'children'] # List of child parse tree nodes 39 | 40 | def __init__(self, expr, full_text, start, end, children=None): 41 | self.expr = expr 42 | self.full_text = full_text 43 | self.start = start 44 | self.end = end 45 | self.children = children or [] 46 | 47 | @property 48 | def expr_name(self): 49 | # backwards compatibility 50 | return self.expr.name 51 | 52 | def __iter__(self): 53 | """Support looping over my children and doing tuple unpacks on me. 54 | 55 | It can be very handy to unpack nodes in arg lists; see 56 | :class:`PegVisitor` for an example. 57 | 58 | """ 59 | return iter(self.children) 60 | 61 | @property 62 | def text(self): 63 | """Return the text this node matched.""" 64 | return self.full_text[self.start:self.end] 65 | 66 | # From here down is just stuff for testing and debugging. 67 | 68 | def prettily(self, error=None): 69 | """Return a unicode, pretty-printed representation of me. 70 | 71 | :arg error: The node to highlight because an error occurred there 72 | 73 | """ 74 | # TODO: If a Node appears multiple times in the tree, we'll point to 75 | # them all. Whoops. 76 | def indent(text): 77 | return '\n'.join((' ' + line) for line in text.splitlines()) 78 | ret = [u'<%s%s matching "%s">%s' % ( 79 | self.__class__.__name__, 80 | (' called "%s"' % self.expr_name) if self.expr_name else '', 81 | self.text, 82 | ' <-- *** We were here. ***' if error is self else '')] 83 | for n in self: 84 | ret.append(indent(n.prettily(error=error))) 85 | return '\n'.join(ret) 86 | 87 | def __str__(self): 88 | """Return a compact, human-readable representation of me.""" 89 | return self.prettily() 90 | 91 | def __eq__(self, other): 92 | """Support by-value deep comparison with other nodes for testing.""" 93 | if not isinstance(other, Node): 94 | return NotImplemented 95 | 96 | return (self.expr == other.expr and 97 | self.full_text == other.full_text and 98 | self.start == other.start and 99 | self.end == other.end and 100 | self.children == other.children) 101 | 102 | def __ne__(self, other): 103 | return not self == other 104 | 105 | def __repr__(self, top_level=True): 106 | """Return a bit of code (though not an expression) that will recreate 107 | me.""" 108 | # repr() of unicode flattens everything out to ASCII, so we don't need 109 | # to explicitly encode things afterward. 110 | ret = ["s = %r" % self.full_text] if top_level else [] 111 | ret.append("%s(%r, s, %s, %s%s)" % ( 112 | self.__class__.__name__, 113 | self.expr, 114 | self.start, 115 | self.end, 116 | (', children=[%s]' % 117 | ', '.join([c.__repr__(top_level=False) for c in self.children])) 118 | if self.children else '')) 119 | return '\n'.join(ret) 120 | 121 | 122 | class RegexNode(Node): 123 | """Node returned from a ``Regex`` expression 124 | 125 | Grants access to the ``re.Match`` object, in case you want to access 126 | capturing groups, etc. 
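    A sketch of pulling a group out (assuming a grammar whose default rule
    is a single regex)::

        node = Grammar(r'pair = ~"(.+)=(.+)"').parse('a=1')
        key, value = node.match.group(1), node.match.group(2)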
127 | 
128 | """
129 | __slots__ = ['match']
130 | 
131 | 
132 | class RuleDecoratorMeta(type):
133 | def __new__(metaclass, name, bases, namespace):
134 | def unvisit(name):
135 | """Remove any leading "visit_" from a method name."""
136 | return name[6:] if name.startswith('visit_') else name
137 | 
138 | methods = [v for k, v in namespace.items() if
139 | hasattr(v, '_rule') and isfunction(v)]
140 | if methods:
141 | from parsimonious.grammar import Grammar # circular import dodge
142 | 
143 | methods.sort(key=(lambda x: x.func_code.co_firstlineno)
144 | if version_info[0] < 3 else
145 | (lambda x: x.__code__.co_firstlineno))
146 | # Possible enhancement: once we get the Grammar extensibility story
147 | # solidified, we can have @rules *add* to the default grammar
148 | # rather than pave over it.
149 | namespace['grammar'] = Grammar(
150 | '\n'.join('{name} = {expr}'.format(name=unvisit(m.__name__),
151 | expr=m._rule)
152 | for m in methods))
153 | return super(RuleDecoratorMeta,
154 | metaclass).__new__(metaclass, name, bases, namespace)
155 | 
156 | 
157 | class NodeVisitor(object, metaclass=RuleDecoratorMeta):
158 | """A shell for writing things that turn parse trees into something useful
159 | 
160 | Performs a depth-first traversal of an AST. Subclass this, add methods for
161 | each expr you care about, instantiate, and call
162 | ``visit(top_node_of_parse_tree)``. It'll return the useful stuff. This API
163 | is very similar to that of ``ast.NodeVisitor``.
164 | 
165 | These could easily all be static methods, but that would add at least as
166 | much weirdness at the call site as the ``()`` for instantiation. And this
167 | way, we support subclasses that require state: options, for example, or a
168 | symbol table constructed from a programming language's AST.
169 | 
170 | We never transform the parse tree in place, because...
171 | 
172 | * There are likely multiple references to the same ``Node`` object in a
173 | parse tree, and changes to one reference would surprise you elsewhere.
174 | * It makes it impossible to report errors: you'd end up with the "error"
175 | arrow pointing someplace in a half-transformed mishmash of nodes--and
176 | that's assuming you're even transforming the tree into another tree.
177 | Heaven forbid you're making it into a string or something else.
178 | 
179 | """
180 | 
181 | #: The :term:`default grammar`: the one recommended for use with this
182 | #: visitor. If you populate this, you will be able to call
183 | #: :meth:`NodeVisitor.parse()` as a shortcut.
184 | grammar = None
185 | 
186 | #: Classes of exceptions you actually intend to raise during visitation
187 | #: and which should propagate out of the visitor. These will not be
188 | #: wrapped in a VisitationError when they arise.
189 | unwrapped_exceptions = ()
190 | 
191 | # TODO: If we need to optimize this, we can go back to putting subclasses
192 | # in charge of visiting children; they know when not to bother. Or we can
193 | # mark nodes as not descent-worthy in the grammar.
194 | def visit(self, node):
195 | """Walk a parse tree, transforming it into another representation.
196 | 
197 | Recursively descend a parse tree, dispatching to the method named after
198 | the rule in the :class:`~parsimonious.grammar.Grammar` that produced
199 | each node. If, for example, a rule was... ::
200 | 
201 | bold = '<b>'
202 | 
203 | ...the ``visit_bold()`` method would be called. It is your
204 | responsibility to subclass :class:`NodeVisitor` and implement those
205 | methods.
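        A tiny sketch: given a grammar containing ``digits = ~"[0-9]+"``,
        this visitor turns each ``digits`` node into an int::

            class DigitsVisitor(NodeVisitor):
                def visit_digits(self, node, visited_children):
                    return int(node.text)

                def generic_visit(self, node, visited_children):
                    return visited_children or node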
206 | 
207 | """
208 | method = getattr(self, 'visit_' + node.expr_name, self.generic_visit)
209 | 
210 | # Call that method, and show where in the tree it failed if it blows
211 | # up.
212 | try:
213 | return method(node, [self.visit(n) for n in node])
214 | except (VisitationError, UndefinedLabel):
215 | # Don't catch and re-wrap already-wrapped exceptions.
216 | raise
217 | except Exception as exc:
218 | # implementors may define exception classes that should not be
219 | # wrapped.
220 | if isinstance(exc, self.unwrapped_exceptions):
221 | raise
222 | # Catch any exception, and tack on a parse tree so it's easier to
223 | # see where it went wrong.
224 | exc_class = type(exc)
225 | raise VisitationError(exc, exc_class, node) from exc
226 | 
227 | def generic_visit(self, node, visited_children):
228 | """Default visitor method
229 | 
230 | :arg node: The node we're visiting
231 | :arg visited_children: The results of visiting the children of that
232 | node, in a list
233 | 
234 | I'm not sure there's an implementation of this that makes sense across
235 | all (or even most) use cases, so we leave it to subclasses to implement
236 | for now.
237 | 
238 | """
239 | raise NotImplementedError('No visitor method was defined for this expression: %s' %
240 | node.expr.as_rule())
241 | 
242 | # Convenience methods:
243 | 
244 | def parse(self, text, pos=0):
245 | """Parse some text with this Visitor's default grammar and return the
246 | result of visiting it.
247 | 
248 | ``SomeVisitor().parse('some_string')`` is a shortcut for
249 | ``SomeVisitor().visit(some_grammar.parse('some_string'))``.
250 | 
251 | """
252 | return self._parse_or_match(text, pos, 'parse')
253 | 
254 | def match(self, text, pos=0):
255 | """Parse and visit some text with this Visitor's default grammar, but
256 | don't insist on parsing all the way to the end.
257 | 
258 | ``SomeVisitor().match('some_string')`` is a shortcut for
259 | ``SomeVisitor().visit(some_grammar.match('some_string'))``.
260 | 
261 | """
262 | return self._parse_or_match(text, pos, 'match')
263 | 
264 | # Internal convenience methods to help you write your own visitors:
265 | 
266 | def lift_child(self, node, children):
267 | """Lift the sole child of ``node`` up to replace the node."""
268 | first_child, = children
269 | return first_child
270 | 
271 | # Private methods:
272 | 
273 | def _parse_or_match(self, text, pos, method_name):
274 | """Execute a parse or match on the default grammar, followed by a
275 | visitation.
276 | 
277 | Raise RuntimeError if there is no default grammar specified.
278 | 
279 | """
280 | if not self.grammar:
281 | raise RuntimeError(
282 | "The {cls}.{method}() shortcut won't work because {cls} was "
283 | "never associated with a specific " "grammar. Fill out its "
284 | "`grammar` attribute, and try again.".format(
285 | cls=self.__class__.__name__,
286 | method=method_name))
287 | return self.visit(getattr(self.grammar, method_name)(text, pos=pos))
288 | 
289 | 
290 | def rule(rule_string):
291 | """Decorate a NodeVisitor ``visit_*`` method to tie a grammar rule to it.
292 | 
293 | The following will arrange for the ``visit_digit`` method to receive the
294 | results of the ``~"[0-9]"`` parse rule::
295 | 
296 | @rule('~"[0-9]"')
297 | def visit_digit(self, node, visited_children):
298 | ...
299 | 
300 | Notice that there is no "digit = " as part of the rule; that gets inferred
301 | from the method name.
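    A fuller sketch, stitching two decorated methods into one grammar (the
    method defined first, ``visit_number`` here, supplies the default
    rule)::

        class NumberVisitor(NodeVisitor):
            @rule('digit+')
            def visit_number(self, node, visited_children):
                return int(node.text)

            @rule('~"[0-9]"')
            def visit_digit(self, node, visited_children):
                return node.text

        NumberVisitor().parse('42')  # -> 42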
302 | 303 | In cases where there is only one kind of visitor interested in a grammar, 304 | using ``@rule`` saves you having to look back and forth between the visitor 305 | and the grammar definition. 306 | 307 | On an implementation level, all ``@rule`` rules get stitched together into 308 | a :class:`~parsimonious.Grammar` that becomes the NodeVisitor's 309 | :term:`default grammar`. 310 | 311 | Typically, the choice of a default rule for this grammar is simple: whatever 312 | ``@rule`` comes first in the class is the default. But the choice may become 313 | surprising if you divide the ``@rule`` calls among subclasses. At the 314 | moment, which method "comes first" is decided simply by comparing line 315 | numbers, so whatever method is on the smallest-numbered line will be the 316 | default. In a future release, this will change to pick the 317 | first ``@rule`` call on the basemost class that has one. That way, a 318 | subclass which does not override the default rule's ``visit_*`` method 319 | won't unintentionally change which rule is the default. 320 | 321 | """ 322 | def decorator(method): 323 | method._rule = rule_string # XXX: Maybe register them on a class var instead so we can just override a @rule'd visitor method on a subclass without blowing away the rule string that comes with it. 324 | return method 325 | return decorator 326 | -------------------------------------------------------------------------------- /parsimonious/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikrose/parsimonious/63184bb79770e687cc62ab34d50c86bf383b3533/parsimonious/tests/__init__.py -------------------------------------------------------------------------------- /parsimonious/tests/benchmarks.py: -------------------------------------------------------------------------------- 1 | """Benchmarks for Parsimonious 2 | 3 | Run these with ``python parsimonious/tests/benchmarks.py``. They don't run during 4 | normal test runs because they're not tests--they don't assert anything. Also, 5 | they're a bit slow. 6 | 7 | These differ from the ones in test_benchmarks in that these are meant to be 8 | compared from revision to revision of Parsimonious to make sure we're not 9 | getting slower. test_benchmarks simply makes sure our choices among 10 | implementation alternatives remain valid. 11 | 12 | """ 13 | from __future__ import print_function 14 | import gc 15 | from timeit import repeat 16 | 17 | from parsimonious.grammar import Grammar 18 | 19 | 20 | def test_not_really_json_parsing(): 21 | """As a baseline for speed, parse some JSON. 22 | 23 | I have no reason to believe that JSON is a particularly representative or 24 | revealing grammar to test with. Also, this is a naive, unoptimized, 25 | incorrect grammar, so don't use it as a basis for comparison with other 26 | parsers. It's just meant to compare across versions of Parsimonious. 
27 | 28 | """ 29 | father = """{ 30 | "id" : 1, 31 | "married" : true, 32 | "name" : "Larry Lopez", 33 | "sons" : null, 34 | "daughters" : [ 35 | { 36 | "age" : 26, 37 | "name" : "Sandra" 38 | }, 39 | { 40 | "age" : 25, 41 | "name" : "Margaret" 42 | }, 43 | { 44 | "age" : 6, 45 | "name" : "Mary" 46 | } 47 | ] 48 | }""" 49 | more_fathers = ','.join([father] * 60) 50 | json = '{"fathers" : [' + more_fathers + ']}' 51 | grammar = Grammar(r""" 52 | value = space (string / number / object / array / true_false_null) 53 | space 54 | 55 | object = "{" members "}" 56 | members = (pair ("," pair)*)? 57 | pair = string ":" value 58 | array = "[" elements "]" 59 | elements = (value ("," value)*)? 60 | true_false_null = "true" / "false" / "null" 61 | 62 | string = space "\"" chars "\"" space 63 | chars = ~"[^\"]*" # TODO implement the real thing 64 | number = (int frac exp) / (int exp) / (int frac) / int 65 | int = "-"? ((digit1to9 digits) / digit) 66 | frac = "." digits 67 | exp = e digits 68 | digits = digit+ 69 | e = "e+" / "e-" / "e" / "E+" / "E-" / "E" 70 | 71 | digit1to9 = ~"[1-9]" 72 | digit = ~"[0-9]" 73 | space = ~"\s*" 74 | """) 75 | 76 | # These number and repetition values seem to keep results within 5% of the 77 | # difference between min and max. We get more consistent results running a 78 | # bunch of single-parse tests and taking the min rather than upping the 79 | # NUMBER and trying to stomp out the outliers with averaging. 80 | NUMBER = 1 81 | REPEAT = 5 82 | total_seconds = min(repeat(lambda: grammar.parse(json), 83 | lambda: gc.enable(), # so we take into account how we treat the GC 84 | repeat=REPEAT, 85 | number=NUMBER)) 86 | seconds_each = total_seconds / NUMBER 87 | 88 | kb = len(json) / 1024.0 89 | print('Took %.3fs to parse %.1fKB: %.0fKB/s.' % (seconds_each, 90 | kb, 91 | kb / seconds_each)) 92 | 93 | 94 | if __name__ == "__main__": 95 | test_not_really_json_parsing() -------------------------------------------------------------------------------- /parsimonious/tests/test_benchmarks.py: -------------------------------------------------------------------------------- 1 | """Tests to show that the benchmarks we based our speed optimizations on are 2 | still valid""" 3 | import unittest 4 | from functools import partial 5 | from timeit import timeit 6 | 7 | timeit = partial(timeit, number=500000) 8 | 9 | class TestBenchmarks(unittest.TestCase): 10 | def test_lists_vs_dicts(self): 11 | """See what's faster at int key lookup: dicts or lists.""" 12 | list_time = timeit('item = l[9000]', 'l = [0] * 10000') 13 | dict_time = timeit('item = d[9000]', 'd = {x: 0 for x in range(10000)}') 14 | 15 | # Dicts take about 1.6x as long as lists in Python 2.6 and 2.7. 16 | self.assertTrue(list_time < dict_time, '%s < %s' % (list_time, dict_time)) 17 | 18 | 19 | def test_call_vs_inline(self): 20 | """How bad is the calling penalty?""" 21 | no_call = timeit('l[0] += 1', 'l = [0]') 22 | call = timeit('add(); l[0] += 1', 'l = [0]\n' 23 | 'def add():\n' 24 | ' pass') 25 | 26 | # Calling a function is pretty fast; it takes just 1.2x as long as the 27 | # global var access and addition in l[0] += 1. 
28 | self.assertTrue(no_call < call, '%s (no call) < %s (call)' % (no_call, call)) 29 | 30 | 31 | def test_startswith_vs_regex(self): 32 | """Can I beat the speed of regexes by special-casing literals?""" 33 | re_time = timeit( 34 | 'r.match(t, 19)', 35 | 'import re\n' 36 | "r = re.compile('hello')\n" 37 | "t = 'this is the finest hello ever'") 38 | startswith_time = timeit("t.startswith('hello', 19)", 39 | "t = 'this is the finest hello ever'") 40 | 41 | # Regexes take 2.24x as long as simple string matching. 42 | self.assertTrue(startswith_time < re_time, 43 | '%s (startswith) < %s (re)' % (startswith_time, re_time)) -------------------------------------------------------------------------------- /parsimonious/tests/test_expressions.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from parsimonious.exceptions import ParseError, IncompleteParseError 5 | from parsimonious.expressions import (Literal, Regex, Sequence, OneOf, Not, 6 | Quantifier, Optional, ZeroOrMore, OneOrMore, Expression) 7 | from parsimonious.grammar import Grammar, rule_grammar 8 | from parsimonious.nodes import Node 9 | 10 | 11 | class LengthTests(TestCase): 12 | """Tests for returning the right lengths 13 | 14 | I wrote these before parse tree generation was implemented. They're 15 | partially redundant with TreeTests. 16 | 17 | """ 18 | 19 | def len_eq(self, node, length): 20 | """Return whether the match lengths of 2 nodes are equal. 21 | 22 | Makes tests shorter and lets them omit positional stuff they don't care 23 | about. 24 | 25 | """ 26 | node_length = None if node is None else node.end - node.start 27 | assert node_length == length 28 | 29 | def test_regex(self): 30 | self.len_eq(Literal('hello').match('ehello', 1), 5) # simple 31 | self.len_eq(Regex('hello*').match('hellooo'), 7) # * 32 | self.assertRaises(ParseError, Regex('hello*').match, 'goodbye') # no match 33 | self.len_eq(Regex('hello', ignore_case=True).match('HELLO'), 5) 34 | 35 | def test_sequence(self): 36 | self.len_eq(Sequence(Regex('hi*'), Literal('lo'), Regex('.ingo')).match('hiiiilobingo1234'), 12) # succeed 37 | self.assertRaises(ParseError, Sequence(Regex('hi*'), Literal('lo'), 38 | Regex('.ingo')).match, 'hiiiilobing') # don't 39 | self.len_eq(Sequence(Regex('hi*')).match('>hiiii', 1), 5) # non-0 pos 40 | 41 | def test_one_of(self): 42 | self.len_eq(OneOf(Literal('aaa'), Literal('bb')).match('aaa'), 3) # first alternative 43 | self.len_eq(OneOf(Literal('aaa'), Literal('bb')).match('bbaaa'), 2) # second 44 | self.assertRaises(ParseError, OneOf(Literal('aaa'), Literal('bb')).match, 'aa') # no match 45 | 46 | def test_not(self): 47 | self.len_eq(Not(Regex('.')).match(''), 0) # match 48 | self.assertRaises(ParseError, Not(Regex('.')).match, 'Hi') # don't 49 | 50 | def test_optional(self): 51 | self.len_eq(Sequence(Optional(Literal('a')), Literal('b')).match('b'), 1) # contained expr fails 52 | self.len_eq(Sequence(Optional(Literal('a')), Literal('b')).match('ab'), 2) # contained expr succeeds 53 | self.len_eq(Optional(Literal('a')).match('aa'), 1) 54 | self.len_eq(Optional(Literal('a')).match('bb'), 0) 55 | 56 | def test_zero_or_more(self): 57 | self.len_eq(ZeroOrMore(Literal('b')).match(''), 0) # zero 58 | self.len_eq(ZeroOrMore(Literal('b')).match('bbb'), 3) # more 59 | 60 | self.len_eq(Regex('^').match(''), 0) # Validate the next test. 
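        # ('^' matches zero characters, so it's a handy way to manufacture a
        # zero-length success--the kind of inner match that could send a
        # naive repetition implementation into an infinite loop.)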
61 | 62 | # Try to make it loop infinitely using a zero-length contained expression: 63 | self.len_eq(ZeroOrMore(Regex('^')).match(''), 0) 64 | 65 | def test_one_or_more(self): 66 | self.len_eq(OneOrMore(Literal('b')).match('b'), 1) # one 67 | self.len_eq(OneOrMore(Literal('b')).match('bbb'), 3) # more 68 | self.len_eq(OneOrMore(Literal('b'), min=3).match('bbb'), 3) # with custom min; success 69 | self.len_eq(Quantifier(Literal('b'), min=3, max=5).match('bbbb'), 4) # with custom min and max; success 70 | self.len_eq(Quantifier(Literal('b'), min=3, max=5).match('bbbbbb'), 5) # with custom min and max; success 71 | self.assertRaises(ParseError, OneOrMore(Literal('b'), min=3).match, 'bb') # with custom min; failure 72 | self.assertRaises(ParseError, Quantifier(Literal('b'), min=3, max=5).match, 'bb') # with custom min and max; failure 73 | self.len_eq(OneOrMore(Regex('^')).match('bb'), 0) # attempt infinite loop 74 | 75 | 76 | class TreeTests(TestCase): 77 | """Tests for building the right trees 78 | 79 | We have only to test successes here; failures (None-returning cases) are 80 | covered above. 81 | 82 | """ 83 | 84 | def test_simple_node(self): 85 | """Test that leaf expressions like ``Literal`` make the right nodes.""" 86 | h = Literal('hello', name='greeting') 87 | self.assertEqual(h.match('hello'), Node(h, 'hello', 0, 5)) 88 | 89 | def test_sequence_nodes(self): 90 | """Assert that ``Sequence`` produces nodes with the right children.""" 91 | s = Sequence(Literal('heigh', name='greeting1'), 92 | Literal('ho', name='greeting2'), name='dwarf') 93 | text = 'heighho' 94 | self.assertEqual(s.match(text), Node(s, text, 0, 7, children=[Node(s.members[0], text, 0, 5), 95 | Node(s.members[1], text, 5, 7)])) 96 | 97 | def test_one_of(self): 98 | """``OneOf`` should return its own node, wrapping the child that succeeds.""" 99 | o = OneOf(Literal('a', name='lit'), name='one_of') 100 | text = 'aa' 101 | self.assertEqual(o.match(text), Node(o, text, 0, 1, children=[ 102 | Node(o.members[0], text, 0, 1)])) 103 | 104 | def test_optional(self): 105 | """``Optional`` should return its own node wrapping the succeeded child.""" 106 | expr = Optional(Literal('a', name='lit'), name='opt') 107 | 108 | text = 'a' 109 | self.assertEqual(expr.match(text), Node(expr, text, 0, 1, children=[ 110 | Node(expr.members[0], text, 0, 1)])) 111 | 112 | # Test failure of the Literal inside the Optional; the 113 | # LengthTests.test_optional is ambiguous for that. 114 | text = '' 115 | self.assertEqual(expr.match(text), Node(expr, text, 0, 0)) 116 | 117 | def test_zero_or_more_zero(self): 118 | """Test the 0 case of ``ZeroOrMore``; it should still return a node.""" 119 | expr = ZeroOrMore(Literal('a'), name='zero') 120 | text = '' 121 | self.assertEqual(expr.match(text), Node(expr, text, 0, 0)) 122 | 123 | def test_one_or_more_one(self): 124 | """Test the 1 case of ``OneOrMore``; it should return a node with a child.""" 125 | expr = OneOrMore(Literal('a', name='lit'), name='one') 126 | text = 'a' 127 | self.assertEqual(expr.match(text), Node(expr, text, 0, 1, children=[ 128 | Node(expr.members[0], text, 0, 1)])) 129 | 130 | # Things added since Grammar got implemented are covered in integration 131 | # tests in test_grammar. 132 | 133 | 134 | class ParseTests(TestCase): 135 | """Tests for the ``parse()`` method""" 136 | 137 | def test_parse_success(self): 138 | """Make sure ``parse()`` returns the tree on success. 139 | 140 | There's not much more than that to test that we haven't already vetted 141 | above. 
142 | 143 | """ 144 | expr = OneOrMore(Literal('a', name='lit'), name='more') 145 | text = 'aa' 146 | self.assertEqual(expr.parse(text), Node(expr, text, 0, 2, children=[ 147 | Node(expr.members[0], text, 0, 1), 148 | Node(expr.members[0], text, 1, 2)])) 149 | 150 | 151 | class ErrorReportingTests(TestCase): 152 | """Tests for reporting parse errors""" 153 | 154 | def test_inner_rule_succeeding(self): 155 | """Make sure ``parse()`` fails and blames the 156 | rightward-progressing-most named Expression when an Expression isn't 157 | satisfied. 158 | 159 | Make sure ParseErrors have nice Unicode representations. 160 | 161 | """ 162 | grammar = Grammar(""" 163 | bold_text = open_parens text close_parens 164 | open_parens = "((" 165 | text = ~"[a-zA-Z]+" 166 | close_parens = "))" 167 | """) 168 | text = '((fred!!' 169 | try: 170 | grammar.parse(text) 171 | except ParseError as error: 172 | self.assertEqual(error.pos, 6) 173 | self.assertEqual(error.expr, grammar['close_parens']) 174 | self.assertEqual(error.text, text) 175 | self.assertEqual(str(error), "Rule 'close_parens' didn't match at '!!' (line 1, column 7).") 176 | 177 | def test_rewinding(self): 178 | """Make sure rewinding the stack and trying an alternative (which 179 | progresses farther) from a higher-level rule can blame an expression 180 | within the alternative on failure. 181 | 182 | There's no particular reason I suspect this wouldn't work, but it's a 183 | more real-world example than the no-alternative cases already tested. 184 | 185 | """ 186 | grammar = Grammar(""" 187 | formatted_text = bold_text / weird_text 188 | bold_text = open_parens text close_parens 189 | weird_text = open_parens text "!!" bork 190 | bork = "bork" 191 | open_parens = "((" 192 | text = ~"[a-zA-Z]+" 193 | close_parens = "))" 194 | """) 195 | text = '((fred!!' 196 | try: 197 | grammar.parse(text) 198 | except ParseError as error: 199 | self.assertEqual(error.pos, 8) 200 | self.assertEqual(error.expr, grammar['bork']) 201 | self.assertEqual(error.text, text) 202 | 203 | def test_no_named_rule_succeeding(self): 204 | """Make sure ParseErrors have sane printable representations even if we 205 | never succeeded in matching any named expressions.""" 206 | grammar = Grammar('''bork = "bork"''') 207 | try: 208 | grammar.parse('snork') 209 | except ParseError as error: 210 | self.assertEqual(error.pos, 0) 211 | self.assertEqual(error.expr, grammar['bork']) 212 | self.assertEqual(error.text, 'snork') 213 | 214 | def test_parse_with_leftovers(self): 215 | """Make sure ``parse()`` reports where we started failing to match, 216 | even if a partial match was successful.""" 217 | grammar = Grammar(r'''sequence = "chitty" (" " "bang")+''') 218 | try: 219 | grammar.parse('chitty bangbang') 220 | except IncompleteParseError as error: 221 | self.assertEqual(str( 222 | error), "Rule 'sequence' matched in its entirety, but it didn't consume all the text. 
The non-matching portion of the text begins with 'bang' (line 1, column 12).") 223 | 224 | def test_favoring_named_rules(self): 225 | """Named rules should be used in error messages in favor of anonymous 226 | ones, even if those are rightward-progressing-more, and even if the 227 | failure starts at position 0.""" 228 | grammar = Grammar(r'''starts_with_a = &"a" ~"[a-z]+"''') 229 | try: 230 | grammar.parse('burp') 231 | except ParseError as error: 232 | self.assertEqual(str(error), "Rule 'starts_with_a' didn't match at 'burp' (line 1, column 1).") 233 | 234 | def test_line_and_column(self): 235 | """Make sure we got the line and column computation right.""" 236 | grammar = Grammar(r""" 237 | whee_lah = whee "\n" lah "\n" 238 | whee = "whee" 239 | lah = "lah" 240 | """) 241 | try: 242 | grammar.parse('whee\nlahGOO') 243 | except ParseError as error: 244 | # TODO: Right now, this says "Rule 245 | # didn't match". That's not the greatest. Fix that, then fix this. 246 | self.assertTrue(str(error).endswith(r"""didn't match at 'GOO' (line 2, column 4).""")) 247 | 248 | 249 | class RepresentationTests(TestCase): 250 | """Tests for str(), unicode(), and repr() of expressions""" 251 | 252 | def test_unicode_crash(self): 253 | """Make sure matched unicode strings don't crash ``__str__``.""" 254 | grammar = Grammar(r'string = ~r"\S+"u') 255 | str(grammar.parse('中文')) 256 | 257 | def test_unicode(self): 258 | """Smoke-test the conversion of expressions to bits of rules. 259 | 260 | A slightly more comprehensive test of the actual values is in 261 | ``GrammarTests.test_unicode``. 262 | 263 | """ 264 | str(rule_grammar) 265 | 266 | def test_unicode_keep_parens(self): 267 | """Make sure converting an expression to unicode doesn't strip 268 | parenthesis. 269 | 270 | """ 271 | # ZeroOrMore 272 | self.assertEqual(str(Grammar('foo = "bar" ("baz" "eggs")* "spam"')), 273 | "foo = 'bar' ('baz' 'eggs')* 'spam'") 274 | 275 | # Quantifiers 276 | self.assertEqual(str(Grammar('foo = "bar" ("baz" "eggs"){2,4} "spam"')), 277 | "foo = 'bar' ('baz' 'eggs'){2,4} 'spam'") 278 | self.assertEqual(str(Grammar('foo = "bar" ("baz" "eggs"){2,} "spam"')), 279 | "foo = 'bar' ('baz' 'eggs'){2,} 'spam'") 280 | self.assertEqual(str(Grammar('foo = "bar" ("baz" "eggs"){1,} "spam"')), 281 | "foo = 'bar' ('baz' 'eggs')+ 'spam'") 282 | self.assertEqual(str(Grammar('foo = "bar" ("baz" "eggs"){,4} "spam"')), 283 | "foo = 'bar' ('baz' 'eggs'){,4} 'spam'") 284 | self.assertEqual(str(Grammar('foo = "bar" ("baz" "eggs"){0,1} "spam"')), 285 | "foo = 'bar' ('baz' 'eggs')? 'spam'") 286 | self.assertEqual(str(Grammar('foo = "bar" ("baz" "eggs"){0,} "spam"')), 287 | "foo = 'bar' ('baz' 'eggs')* 'spam'") 288 | 289 | # OneOf 290 | self.assertEqual(str(Grammar('foo = "bar" ("baz" / "eggs") "spam"')), 291 | "foo = 'bar' ('baz' / 'eggs') 'spam'") 292 | 293 | # Lookahead 294 | self.assertEqual(str(Grammar('foo = "bar" &("baz" "eggs") "spam"')), 295 | "foo = 'bar' &('baz' 'eggs') 'spam'") 296 | 297 | # Multiple sequences 298 | self.assertEqual(str(Grammar('foo = ("bar" "baz") / ("baff" "bam")')), 299 | "foo = ('bar' 'baz') / ('baff' 'bam')") 300 | 301 | def test_unicode_surrounding_parens(self): 302 | """ 303 | Make sure there are no surrounding parens around the entire 304 | right-hand side of an expression (as they're unnecessary). 
305 | 306 | """ 307 | self.assertEqual(str(Grammar('foo = ("foo" ("bar" "baz"))')), 308 | "foo = 'foo' ('bar' 'baz')") 309 | 310 | 311 | class SlotsTests(TestCase): 312 | """Tests to do with __slots__""" 313 | 314 | def test_subclassing(self): 315 | """Make sure a subclass of a __slots__-less class can introduce new 316 | slots itself. 317 | 318 | This isn't supposed to work, according to the language docs: 319 | 320 | When inheriting from a class without __slots__, the __dict__ 321 | attribute of that class will always be accessible, so a __slots__ 322 | definition in the subclass is meaningless. 323 | 324 | But it does. 325 | 326 | """ 327 | class Smoo(Quantifier): 328 | __slots__ = ['smoo'] 329 | 330 | def __init__(self): 331 | self.smoo = 'smoo' 332 | 333 | smoo = Smoo() 334 | self.assertEqual(smoo.__dict__, {}) # has a __dict__ but with no smoo in it 335 | self.assertEqual(smoo.smoo, 'smoo') # The smoo attr ended up in a slot. 336 | -------------------------------------------------------------------------------- /parsimonious/tests/test_grammar.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from sys import version_info 4 | from unittest import TestCase 5 | 6 | import pytest 7 | 8 | from parsimonious.exceptions import BadGrammar, LeftRecursionError, ParseError, UndefinedLabel, VisitationError 9 | from parsimonious.expressions import Literal, Lookahead, Regex, Sequence, TokenMatcher, is_callable 10 | from parsimonious.grammar import rule_grammar, rule_syntax, RuleVisitor, Grammar, TokenGrammar, LazyReference 11 | from parsimonious.nodes import Node 12 | from parsimonious.utils import Token 13 | 14 | 15 | class BootstrappingGrammarTests(TestCase): 16 | """Tests for the expressions in the grammar that parses the grammar 17 | definition syntax""" 18 | 19 | def test_quantifier(self): 20 | text = '*' 21 | quantifier = rule_grammar['quantifier'] 22 | self.assertEqual(quantifier.parse(text), 23 | Node(quantifier, text, 0, 1, children=[ 24 | Node(quantifier.members[0], text, 0, 1), Node(rule_grammar['_'], text, 1, 1)])) 25 | text = '?' 
26 | self.assertEqual(quantifier.parse(text), 27 | Node(quantifier, text, 0, 1, children=[ 28 | Node(quantifier.members[0], text, 0, 1), Node(rule_grammar['_'], text, 1, 1)])) 29 | text = '+' 30 | self.assertEqual(quantifier.parse(text), 31 | Node(quantifier, text, 0, 1, children=[ 32 | Node(quantifier.members[0], text, 0, 1), Node(rule_grammar['_'], text, 1, 1)])) 33 | 34 | def test_spaceless_literal(self): 35 | text = '"anything but quotes#$*&^"' 36 | spaceless_literal = rule_grammar['spaceless_literal'] 37 | self.assertEqual(spaceless_literal.parse(text), 38 | Node(spaceless_literal, text, 0, len(text), children=[ 39 | Node(spaceless_literal.members[0], text, 0, len(text))])) 40 | text = r'''r"\""''' 41 | self.assertEqual(spaceless_literal.parse(text), 42 | Node(spaceless_literal, text, 0, 5, children=[ 43 | Node(spaceless_literal.members[0], text, 0, 5)])) 44 | 45 | def test_regex(self): 46 | text = '~"[a-zA-Z_][a-zA-Z_0-9]*"LI' 47 | regex = rule_grammar['regex'] 48 | self.assertEqual(rule_grammar['regex'].parse(text), 49 | Node(regex, text, 0, len(text), children=[ 50 | Node(Literal('~'), text, 0, 1), 51 | Node(rule_grammar['spaceless_literal'], text, 1, 25, children=[ 52 | Node(rule_grammar['spaceless_literal'].members[0], text, 1, 25)]), 53 | Node(regex.members[2], text, 25, 27), 54 | Node(rule_grammar['_'], text, 27, 27)])) 55 | 56 | def test_successes(self): 57 | """Make sure the PEG recognition grammar succeeds on various inputs.""" 58 | self.assertTrue(rule_grammar['label'].parse('_')) 59 | self.assertTrue(rule_grammar['label'].parse('jeff')) 60 | self.assertTrue(rule_grammar['label'].parse('_THIS_THING')) 61 | 62 | self.assertTrue(rule_grammar['atom'].parse('some_label')) 63 | self.assertTrue(rule_grammar['atom'].parse('"some literal"')) 64 | self.assertTrue(rule_grammar['atom'].parse('~"some regex"i')) 65 | 66 | self.assertTrue(rule_grammar['quantified'].parse('~"some regex"i*')) 67 | self.assertTrue(rule_grammar['quantified'].parse('thing+')) 68 | self.assertTrue(rule_grammar['quantified'].parse('"hi"?')) 69 | 70 | self.assertTrue(rule_grammar['term'].parse('this')) 71 | self.assertTrue(rule_grammar['term'].parse('that+')) 72 | 73 | self.assertTrue(rule_grammar['sequence'].parse('this that? other')) 74 | 75 | self.assertTrue(rule_grammar['ored'].parse('this / that+ / "other"')) 76 | 77 | # + is higher precedence than &, so 'anded' should match the whole 78 | # thing: 79 | self.assertTrue(rule_grammar['lookahead_term'].parse('&this+')) 80 | 81 | self.assertTrue(rule_grammar['expression'].parse('this')) 82 | self.assertTrue(rule_grammar['expression'].parse('this? that other*')) 83 | self.assertTrue(rule_grammar['expression'].parse('&this / that+ / "other"')) 84 | self.assertTrue(rule_grammar['expression'].parse('this / that? / "other"+')) 85 | self.assertTrue(rule_grammar['expression'].parse('this? that other*')) 86 | 87 | self.assertTrue(rule_grammar['rule'].parse('this = that\r')) 88 | self.assertTrue(rule_grammar['rule'].parse('this = the? that other* \t\r')) 89 | self.assertTrue(rule_grammar['rule'].parse('the=~"hi*"\n')) 90 | 91 | self.assertTrue(rule_grammar.parse(''' 92 | this = the? that other* 93 | that = "thing" 94 | the=~"hi*" 95 | other = "ahoy hoy" 96 | ''')) 97 | 98 | 99 | class RuleVisitorTests(TestCase): 100 | """Tests for ``RuleVisitor`` 101 | 102 | As I write these, Grammar is not yet fully implemented. Normally, there'd 103 | be no reason to use ``RuleVisitor`` directly. 104 | 105 | """ 106 | def test_round_trip(self): 107 | """Test a simple round trip. 
108 | 109 | Parse a simple grammar, turn the parse tree into a map of expressions, 110 | and use that to parse another piece of text. 111 | 112 | Not everything was implemented yet, but it was a big milestone and a 113 | proof of concept. 114 | 115 | """ 116 | tree = rule_grammar.parse('''number = ~"[0-9]+"\n''') 117 | rules, default_rule = RuleVisitor().visit(tree) 118 | 119 | text = '98' 120 | self.assertEqual(default_rule.parse(text), Node(default_rule, text, 0, 2)) 121 | 122 | def test_undefined_rule(self): 123 | """Make sure we throw the right exception on undefined rules.""" 124 | tree = rule_grammar.parse('boy = howdy\n') 125 | self.assertRaises(UndefinedLabel, RuleVisitor().visit, tree) 126 | 127 | def test_optional(self): 128 | tree = rule_grammar.parse('boy = "howdy"?\n') 129 | rules, default_rule = RuleVisitor().visit(tree) 130 | 131 | howdy = 'howdy' 132 | 133 | # It should turn into a Node from the Optional and another from the 134 | # Literal within. 135 | self.assertEqual(default_rule.parse(howdy), Node(default_rule, howdy, 0, 5, children=[ 136 | Node(Literal("howdy"), howdy, 0, 5)])) 137 | 138 | 139 | def function_rule(text, pos): 140 | """This is an example of a grammar rule implemented as a function, and is 141 | provided as a test fixture.""" 142 | token = 'function' 143 | return pos + len(token) if text[pos:].startswith(token) else None 144 | 145 | 146 | class GrammarTests(TestCase): 147 | """Integration-test ``Grammar``: feed it a PEG and see if it works.""" 148 | 149 | def method_rule(self, text, pos): 150 | """This is an example of a grammar rule implemented as a method, and is 151 | provided as a test fixture.""" 152 | token = 'method' 153 | return pos + len(token) if text[pos:].startswith(token) else None 154 | 155 | @staticmethod 156 | def descriptor_rule(text, pos): 157 | """This is an example of a grammar rule implemented as a descriptor, 158 | and is provided as a test fixture.""" 159 | token = 'descriptor' 160 | return pos + len(token) if text[pos:].startswith(token) else None 161 | 162 | rules = {"descriptor_rule": descriptor_rule} 163 | 164 | def test_expressions_from_rules(self): 165 | """Test the ``Grammar`` base class's ability to compile an expression 166 | tree from rules. 167 | 168 | That the correct ``Expression`` tree is built is already tested in 169 | ``RuleGrammarTests``. This tests only that the ``Grammar`` base class's 170 | ``_expressions_from_rules`` works. 
171 | 
172 | """
173 | greeting_grammar = Grammar('greeting = "hi" / "howdy"')
174 | tree = greeting_grammar.parse('hi')
175 | self.assertEqual(tree, Node(greeting_grammar['greeting'], 'hi', 0, 2, children=[
176 | Node(Literal('hi'), 'hi', 0, 2)]))
177 | 
178 | def test_unicode(self):
179 | """Assert that a ``Grammar`` can convert into a string-formatted series
180 | of rules."""
181 | grammar = Grammar(r"""
182 | bold_text = bold_open text bold_close
183 | text = ~"[A-Z 0-9]*"i
184 | bold_open = "(("
185 | bold_close = "))"
186 | """)
187 | lines = str(grammar).splitlines()
188 | self.assertEqual(lines[0], 'bold_text = bold_open text bold_close')
189 | self.assertTrue("text = ~'[A-Z 0-9]*'i%s" % ('u' if version_info >= (3,) else '')
190 | in lines)
191 | self.assertTrue("bold_open = '(('" in lines)
192 | self.assertTrue("bold_close = '))'" in lines)
193 | self.assertEqual(len(lines), 4)
194 | 
195 | def test_match(self):
196 | """Make sure partial-matching (with pos) works."""
197 | grammar = Grammar(r"""
198 | bold_text = bold_open text bold_close
199 | text = ~"[A-Z 0-9]*"i
200 | bold_open = "(("
201 | bold_close = "))"
202 | """)
203 | s = ' ((boo))yah'
204 | self.assertEqual(grammar.match(s, pos=1), Node(grammar['bold_text'], s, 1, 8, children=[
205 | Node(grammar['bold_open'], s, 1, 3),
206 | Node(grammar['text'], s, 3, 6),
207 | Node(grammar['bold_close'], s, 6, 8)]))
208 | 
209 | def test_bad_grammar(self):
210 | """Constructing a Grammar with bad rules should raise ParseError."""
211 | self.assertRaises(ParseError, Grammar, 'just a bunch of junk')
212 | 
213 | def test_comments(self):
214 | """Test tolerance of comments and blank lines in and around rules."""
215 | grammar = Grammar(r"""# This is a grammar.
216 | 
217 | # It sure is.
218 | bold_text = stars text stars # nice
219 | text = ~"[A-Z 0-9]*"i #dude
220 | 
221 | 
222 | stars = "**"
223 | # Pretty good
224 | #Oh yeah.#""") # Make sure a comment doesn't need a
225 | # \n or \r to end.
226 | self.assertEqual(list(sorted(str(grammar).splitlines())),
227 | ['''bold_text = stars text stars''',
228 | # TODO: Unicode flag is on by default in Python 3. I wonder if we
229 | # should turn it on all the time in Parsimonious.
230 | """stars = '**'""",
231 | '''text = ~'[A-Z 0-9]*'i%s''' % ('u' if version_info >= (3,)
232 | else '')])
233 | 
234 | def test_multi_line(self):
235 | """Make sure we tolerate all sorts of crazy line breaks and comments in
236 | the middle of rules."""
237 | grammar = Grammar("""
238 | bold_text = bold_open # commenty comment
239 | text # more comment
240 | bold_close
241 | text = ~"[A-Z 0-9]*"i
242 | bold_open = "((" bold_close = "))"
243 | """)
244 | self.assertTrue(grammar.parse('((booyah))') is not None)
245 | 
246 | def test_not(self):
247 | """Make sure "not" predicates get parsed and work properly."""
248 | grammar = Grammar(r'''not_arp = !"arp" ~"[a-z]+"''')
249 | self.assertRaises(ParseError, grammar.parse, 'arp')
250 | self.assertTrue(grammar.parse('argle') is not None)
251 | 
252 | def test_lookahead(self):
253 | grammar = Grammar(r'''starts_with_a = &"a" ~"[a-z]+"''')
254 | self.assertRaises(ParseError, grammar.parse, 'burp')
255 | 
256 | s = 'arp'
257 | self.assertEqual(grammar.parse('arp'), Node(grammar['starts_with_a'], s, 0, 3, children=[
258 | Node(Lookahead(Literal('a')), s, 0, 0),
259 | Node(Regex(r'[a-z]+'), s, 0, 3)]))
260 | 
261 | def test_parens(self):
262 | grammar = Grammar(r'''sequence = "chitty" (" " "bang")+''')
263 | # Make sure it's not as if the parens aren't there:
264 | self.assertRaises(ParseError, grammar.parse, 'chitty bangbang')
265 | 
266 | s = 'chitty bang bang'
267 | self.assertEqual(str(grammar.parse(s)),
268 | """<Node called "sequence" matching "chitty bang bang">
269 | <Node matching "chitty">
270 | <Node matching " bang bang">
271 | <Node matching " bang">
272 | <Node matching " ">
273 | <Node matching "bang">
274 | <Node matching " bang">
275 | <Node matching " ">
276 | <Node matching "bang">""")
277 | 
278 | def test_resolve_refs_order(self):
279 | """Smoke-test a circumstance where lazy references don't get resolved."""
280 | grammar = Grammar("""
281 | expression = "(" terms ")"
282 | terms = term+
283 | term = number
284 | number = ~r"[0-9]+"
285 | """)
286 | grammar.parse('(34)')
287 | 
288 | def test_resolve_refs_completeness(self):
289 | """Smoke-test another circumstance where lazy references don't get resolved."""
290 | grammar = Grammar(r"""
291 | block = "{" _ item* "}" _
292 | 
293 | # An item is an element of a block.
294 | item = number / word / block / paren
295 | 
296 | # Parens are for delimiting subexpressions.
297 | paren = "(" _ item* ")" _
298 | 
299 | # Words are barewords, unquoted things, other than literals, that can live
300 | # in lists. We may renege on some of these chars later, especially ".". We
301 | # may add Unicode.
302 | word = spaceless_word _
303 | spaceless_word = ~r"[-a-z`~!@#$%^&*_+=|\\;<>,.?][-a-z0-9`~!@#$%^&*_+=|\\;<>,.?]*"i
304 | 
305 | number = ~r"[0-9]+" _ # There are decimals and strings and other stuff back on the "parsing" branch, once you get this working.
306 | 
307 | _ = meaninglessness*
308 | meaninglessness = whitespace
309 | whitespace = ~r"\s+"
310 | """)
311 | grammar.parse('{log (add 3 to 5)}')
312 | 
313 | def test_infinite_loop(self):
314 | """Smoke-test a grammar that was causing infinite loops while building.
315 | 
316 | This was going awry because the "int" rule was never getting marked as
317 | resolved, so it would just keep trying to resolve it over and over.
318 | 319 | """ 320 | Grammar(""" 321 | digits = digit+ 322 | int = digits 323 | digit = ~"[0-9]" 324 | number = int 325 | main = number 326 | """) 327 | 328 | def test_circular_toplevel_reference(self): 329 | with pytest.raises(VisitationError): 330 | Grammar(""" 331 | foo = bar 332 | bar = foo 333 | """) 334 | with pytest.raises(VisitationError): 335 | Grammar(""" 336 | foo = foo 337 | bar = foo 338 | """) 339 | with pytest.raises(VisitationError): 340 | Grammar(""" 341 | foo = bar 342 | bar = baz 343 | baz = foo 344 | """) 345 | 346 | def test_right_recursive(self): 347 | """Right-recursive refs should resolve.""" 348 | grammar = Grammar(""" 349 | digits = digit digits? 350 | digit = ~r"[0-9]" 351 | """) 352 | self.assertTrue(grammar.parse('12') is not None) 353 | 354 | def test_badly_circular(self): 355 | """Uselessly circular references should be detected by the grammar 356 | compiler.""" 357 | self.skipTest('We have yet to make the grammar compiler detect these.') 358 | Grammar(""" 359 | foo = bar 360 | bar = foo 361 | """) 362 | 363 | def test_parens_with_leading_whitespace(self): 364 | """Make sure a parenthesized expression is allowed to have leading 365 | whitespace when nested directly inside another.""" 366 | Grammar("""foo = ( ("c") )""").parse('c') 367 | 368 | def test_single_quoted_literals(self): 369 | Grammar("""foo = 'a' '"'""").parse('a"') 370 | 371 | def test_simple_custom_rules(self): 372 | """Run 2-arg custom-coded rules through their paces.""" 373 | grammar = Grammar(""" 374 | bracketed_digit = start digit end 375 | start = '[' 376 | end = ']'""", 377 | digit=lambda text, pos: 378 | (pos + 1) if text[pos].isdigit() else None) 379 | s = '[6]' 380 | self.assertEqual(grammar.parse(s), 381 | Node(grammar['bracketed_digit'], s, 0, 3, children=[ 382 | Node(grammar['start'], s, 0, 1), 383 | Node(grammar['digit'], s, 1, 2), 384 | Node(grammar['end'], s, 2, 3)])) 385 | 386 | def test_complex_custom_rules(self): 387 | """Run 5-arg custom rules through their paces. 388 | 389 | Incidentally tests returning an actual Node from the custom rule. 390 | 391 | """ 392 | grammar = Grammar(""" 393 | bracketed_digit = start digit end 394 | start = '[' 395 | end = ']' 396 | real_digit = '6'""", 397 | # In this particular implementation of the digit rule, no node is 398 | # generated for `digit`; it falls right through to `real_digit`. 399 | # I'm not sure if this could lead to problems; I can't think of 400 | # any, but it's probably not a great idea. 401 | digit=lambda text, pos, cache, error, grammar: 402 | grammar['real_digit'].match_core(text, pos, cache, error)) 403 | s = '[6]' 404 | self.assertEqual(grammar.parse(s), 405 | Node(grammar['bracketed_digit'], s, 0, 3, children=[ 406 | Node(grammar['start'], s, 0, 1), 407 | Node(grammar['real_digit'], s, 1, 2), 408 | Node(grammar['end'], s, 2, 3)])) 409 | 410 | def test_lazy_custom_rules(self): 411 | """Make sure LazyReferences manually shoved into custom rules are 412 | resolved. 413 | 414 | Incidentally test passing full-on Expressions as custom rules and 415 | having a custom rule as the default one. 
416 | 417 | """ 418 | grammar = Grammar(""" 419 | four = '4' 420 | five = '5'""", 421 | forty_five=Sequence(LazyReference('four'), 422 | LazyReference('five'), 423 | name='forty_five')).default('forty_five') 424 | s = '45' 425 | self.assertEqual(grammar.parse(s), 426 | Node(grammar['forty_five'], s, 0, 2, children=[ 427 | Node(grammar['four'], s, 0, 1), 428 | Node(grammar['five'], s, 1, 2)])) 429 | 430 | def test_unconnected_custom_rules(self): 431 | """Make sure custom rules that aren't hooked to any other rules still 432 | get included in the grammar and that lone ones get set as the 433 | default. 434 | 435 | Incidentally test Grammar's `rules` default arg. 436 | 437 | """ 438 | grammar = Grammar(one_char=lambda text, pos: pos + 1).default('one_char') 439 | s = '4' 440 | self.assertEqual(grammar.parse(s), 441 | Node(grammar['one_char'], s, 0, 1)) 442 | 443 | def test_callability_of_routines(self): 444 | self.assertTrue(is_callable(function_rule)) 445 | self.assertTrue(is_callable(self.method_rule)) 446 | self.assertTrue(is_callable(self.rules['descriptor_rule'])) 447 | 448 | def test_callability_custom_rules(self): 449 | """Confirms that functions, methods and method descriptors can all be 450 | used to supply custom grammar rules. 451 | """ 452 | grammar = Grammar(""" 453 | default = function method descriptor 454 | """, 455 | function=function_rule, 456 | method=self.method_rule, 457 | descriptor=self.rules['descriptor_rule'], 458 | ) 459 | result = grammar.parse('functionmethoddescriptor') 460 | rule_names = [node.expr.name for node in result.children] 461 | self.assertEqual(rule_names, ['function', 'method', 'descriptor']) 462 | 463 | def test_lazy_default_rule(self): 464 | """Make sure we get an actual rule set as our default rule, even when 465 | the first rule has forward references and is thus a LazyReference at 466 | some point during grammar compilation. 
467 | 468 | """ 469 | grammar = Grammar(r""" 470 | styled_text = text 471 | text = "hi" 472 | """) 473 | self.assertEqual(grammar.parse('hi'), Node(grammar['text'], 'hi', 0, 2)) 474 | 475 | def test_immutable_grammar(self): 476 | """Make sure that a Grammar is immutable after being created.""" 477 | grammar = Grammar(r""" 478 | foo = 'bar' 479 | """) 480 | 481 | def mod_grammar(grammar): 482 | grammar['foo'] = 1 483 | self.assertRaises(TypeError, mod_grammar, [grammar]) 484 | 485 | def mod_grammar(grammar): 486 | new_grammar = Grammar(r""" 487 | baz = 'biff' 488 | """) 489 | grammar.update(new_grammar) 490 | self.assertRaises(AttributeError, mod_grammar, [grammar]) 491 | 492 | def test_repr(self): 493 | self.assertTrue(repr(Grammar(r'foo = "a"'))) 494 | 495 | def test_rule_ordering_is_preserved(self): 496 | grammar = Grammar('\n'.join('r%s = "something"' % i for i in range(100))) 497 | self.assertEqual( 498 | list(grammar.keys()), 499 | ['r%s' % i for i in range(100)]) 500 | 501 | def test_rule_ordering_is_preserved_on_shallow_copies(self): 502 | grammar = Grammar('\n'.join('r%s = "something"' % i for i in range(100)))._copy() 503 | self.assertEqual( 504 | list(grammar.keys()), 505 | ['r%s' % i for i in range(100)]) 506 | 507 | def test_sequence_choice_bug(self): 508 | """ 509 | Regression test for https://github.com/erikrose/parsimonious/issues/238 510 | """ 511 | grammar = Grammar(r''' 512 | value = "[" "]" / "5" 513 | ''') 514 | self.assertTrue(grammar.parse('[]') is not None) 515 | self.assertTrue(grammar.parse('5') is not None) 516 | grammar2 = Grammar(r''' 517 | value = "5" / "[" "]" 518 | ''') 519 | self.assertTrue(grammar2.parse('[]') is not None) 520 | self.assertTrue(grammar2.parse('5') is not None) 521 | grammar3 = Grammar(r''' 522 | value = "4" / "[" "]" / "(" ")" / "{" "}" / "5" 523 | ''') 524 | self.assertTrue(grammar3.parse('[]') is not None) 525 | self.assertTrue(grammar3.parse('5') is not None) 526 | self.assertTrue(grammar3.parse('()') is not None) 527 | self.assertTrue(grammar3.parse('{}') is not None) 528 | self.assertTrue(grammar3.parse('4') is not None) 529 | 530 | def test_repetitions(self): 531 | grammar = Grammar(r''' 532 | left_missing = "a"{,5} 533 | right_missing = "a"{5,} 534 | exact = "a"{5} 535 | range = "a"{2,5} 536 | optional = "a"? 
537 | plus = "a"+
538 | star = "a"*
539 | ''')
540 | should_parse = [
541 | ("left_missing", ["a" * i for i in range(6)]),
542 | ("right_missing", ["a" * i for i in range(5, 8)]),
543 | ("exact", ["a" * 5]),
544 | ("range", ["a" * i for i in range(2, 6)]),
545 | ("optional", ["", "a"]),
546 | ("plus", ["a", "aa"]),
547 | ("star", ["", "a", "aa"]),
548 | ]
549 | for rule, examples in should_parse:
550 | for example in examples:
551 | assert grammar[rule].parse(example)
552 | 
553 | should_not_parse = [
554 | ("left_missing", ["a" * 6]),
555 | ("right_missing", ["a" * i for i in range(5)]),
556 | ("exact", ["a" * i for i in list(range(5)) + list(range(6, 10))]),
557 | ("range", ["a" * i for i in list(range(2)) + list(range(6, 10))]),
558 | ("optional", ["aa"]),
559 | ("plus", [""]),
560 | ("star", ["b"]),
561 | ]
562 | for rule, examples in should_not_parse:
563 | for example in examples:
564 | with pytest.raises(ParseError):
565 | grammar[rule].parse(example)
566 | 
567 | def test_equal(self):
568 | grammar_def = (r"""
569 | x = y / z / ""
570 | y = "y" x
571 | z = "z" x
572 | """)
573 | assert Grammar(grammar_def) == Grammar(grammar_def)
574 | 
575 | self.assertEqual(Grammar(rule_syntax), Grammar(rule_syntax))
576 | self.assertNotEqual(Grammar('expr = ~"[a-z]{1,3}"'), Grammar('expr = ~"[a-z]{2,3}"'))
577 | self.assertNotEqual(Grammar('expr = ~"[a-z]{1,3}"'), Grammar('expr = ~"[a-z]{1,4}"'))
578 | self.assertNotEqual(Grammar('expr = &"a"'), Grammar('expr = !"a"'))
579 | 
580 | 
581 | class TokenGrammarTests(TestCase):
582 | """Tests for the TokenGrammar class and associated machinery"""
583 | 
584 | def test_parse_success(self):
585 | """Token literals should work."""
586 | s = [Token('token1'), Token('token2')]
587 | grammar = TokenGrammar("""
588 | foo = token1 "token2"
589 | token1 = "token1"
590 | """)
591 | self.assertEqual(grammar.parse(s),
592 | Node(grammar['foo'], s, 0, 2, children=[
593 | Node(grammar['token1'], s, 0, 1),
594 | Node(TokenMatcher('token2'), s, 1, 2)]))
595 | 
596 | def test_parse_failure(self):
597 | """Parse failures should work normally with token literals."""
598 | grammar = TokenGrammar("""
599 | foo = "token1" "token2"
600 | """)
601 | with pytest.raises(ParseError) as e:
602 | grammar.parse([Token('tokenBOO'), Token('token2')])
603 | assert "Rule 'foo' didn't match at" in str(e.value)
604 | 
605 | def test_token_repr(self):
606 | t = Token('💣')
607 | self.assertTrue(isinstance(t.__repr__(), str))
608 | self.assertEqual('<Token "💣">', t.__repr__())
609 | 
610 | def test_token_star_plus_expressions(self):
611 | a = Token("a")
612 | b = Token("b")
613 | grammar = TokenGrammar("""
614 | foo = "a"*
615 | bar = "a"+
616 | """)
617 | assert grammar["foo"].parse([]) is not None
618 | assert grammar["foo"].parse([a]) is not None
619 | assert grammar["foo"].parse([a, a]) is not None
620 | 
621 | with pytest.raises(ParseError):
622 | grammar["foo"].parse([a, b])
623 | with pytest.raises(ParseError):
624 | grammar["foo"].parse([b])
625 | 
626 | assert grammar["bar"].parse([a]) is not None
627 | with pytest.raises(ParseError):
628 | grammar["bar"].parse([a, b])
629 | with pytest.raises(ParseError):
630 | grammar["bar"].parse([b])
631 | 
632 | 
633 | def test_precedence_of_string_modifiers():
634 | # r"strings", etc. should be parsed as a single literal, not r followed
635 | # by a string literal.
636 | g = Grammar(r"""
637 | escaped_bell = r"\b"
638 | r = "irrelevant"
639 | """)
640 | assert isinstance(g["escaped_bell"], Literal)
641 | assert g["escaped_bell"].literal == "\\b"
642 | with pytest.raises(ParseError):
643 | g.parse("irrelevant\b")
644 | 
645 | g2 = Grammar(r"""
646 | escaped_bell = r"\b"
647 | """)
648 | assert g2.parse("\\b")
649 | 
650 | 
651 | def test_binary_grammar():
652 | g = Grammar(r"""
653 | file = header body terminator
654 | header = b"\xFF" length b"~"
655 | length = ~rb"\d+"
656 | body = ~b"[^\xFF]*"
657 | terminator = b"\xFF"
658 | """)
659 | length = 22
660 | assert g.parse(b"\xff22~" + (b"a" * length) + b"\xff") is not None
661 | 
662 | 
663 | def test_inconsistent_string_types_in_grammar():
664 | with pytest.raises(VisitationError) as e:
665 | Grammar(r"""
666 | foo = b"foo"
667 | bar = "bar"
668 | """)
669 | assert e.value.original_class is BadGrammar
670 | with pytest.raises(VisitationError) as e:
671 | Grammar(r"""
672 | foo = ~b"foo"
673 | bar = "bar"
674 | """)
675 | assert e.value.original_class is BadGrammar
676 | 
677 | # The following should parse without errors because they use the same
678 | # string types:
679 | Grammar(r"""
680 | foo = b"foo"
681 | bar = b"bar"
682 | """)
683 | Grammar(r"""
684 | foo = "foo"
685 | bar = "bar"
686 | """)
687 | 
688 | 
689 | def test_left_associative():
690 | # Regression test for https://github.com/erikrose/parsimonious/issues/209
691 | language_grammar = r"""
692 | expression = operator_expression / non_operator_expression
693 | non_operator_expression = number_expression
694 | 
695 | operator_expression = expression "+" non_operator_expression
696 | 
697 | number_expression = ~"[0-9]+"
698 | """
699 | 
700 | grammar = Grammar(language_grammar)
701 | with pytest.raises(LeftRecursionError) as e:
702 | grammar["operator_expression"].parse("1+2")
703 | assert "Parsimonious is a packrat parser, so it can't handle left recursion." in str(e.value)
704 | 
--------------------------------------------------------------------------------
/parsimonious/tests/test_nodes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from unittest import SkipTest, TestCase
3 | from parsimonious import Grammar, NodeVisitor, VisitationError, rule
4 | from parsimonious.expressions import Literal
5 | from parsimonious.nodes import Node
6 | 
7 | 
8 | class HtmlFormatter(NodeVisitor):
9 | """Visitor that turns a parse tree into HTML fragments"""
10 | 
11 | grammar = Grammar("""bold_open = '(('""") # just partial
12 | 
13 | def visit_bold_open(self, node, visited_children):
14 | return '<b>'
15 | 
16 | def visit_bold_close(self, node, visited_children):
17 | return '</b>'
18 | 
19 | def visit_text(self, node, visited_children):
20 | """Return the text verbatim."""
21 | return node.text
22 | 
23 | def visit_bold_text(self, node, visited_children):
24 | return ''.join(visited_children)
25 | 
26 | 
27 | class ExplosiveFormatter(NodeVisitor):
28 | """Visitor which raises exceptions"""
29 | 
30 | def visit_boom(self, node, visited_children):
31 | raise ValueError
32 | 
33 | 
34 | class SimpleTests(TestCase):
35 | def test_visitor(self):
36 | """Assert a tree gets visited correctly."""
37 | grammar = Grammar(r'''
38 | bold_text = bold_open text bold_close
39 | text = ~'[a-zA-Z 0-9]*'
40 | bold_open = '(('
41 | bold_close = '))'
42 | ''')
43 | text = '((o hai))'
44 | tree = Node(grammar['bold_text'], text, 0, 9,
45 | [Node(grammar['bold_open'], text, 0, 2),
46 | Node(grammar['text'], text, 2, 7),
47 | Node(grammar['bold_close'], text, 7, 9)])
48 | self.assertEqual(grammar.parse(text), tree)
49 | result = HtmlFormatter().visit(tree)
50 | self.assertEqual(result, '<b>o hai</b>')
51 | 
52 | 
53 | def test_visitation_exception(self):
54 | self.assertRaises(VisitationError,
55 | ExplosiveFormatter().visit,
56 | Node(Literal(''), '', 0, 0))
57 | 
58 | 
59 | def test_str(self):
60 | """Test str and unicode of ``Node``."""
61 | n = Node(Literal('something', name='text'), 'o hai', 0, 5)
62 | good = '<Node called "text" matching "o hai">'
63 | self.assertEqual(str(n), good)
64 | 
65 | 
66 | def test_repr(self):
67 | """Test repr of ``Node``."""
68 | s = 'hai ö'
69 | boogie = 'böogie'
70 | n = Node(Literal(boogie), s, 0, 3, children=[
71 | Node(Literal(' '), s, 3, 4), Node(Literal('ö'), s, 4, 5)])
72 | self.assertEqual(repr(n),
73 | str("""s = {hai_o}\nNode({boogie}, s, 0, 3, children=[Node({space}, s, 3, 4), Node({o}, s, 4, 5)])""").format(
74 | hai_o=repr(s),
75 | boogie=repr(Literal(boogie)),
76 | space=repr(Literal(" ")),
77 | o=repr(Literal("ö")),
78 | )
79 | )
80 | 
81 | def test_parse_shortcut(self):
82 | """Exercise the simple case in which the visitor takes care of parsing."""
83 | self.assertEqual(HtmlFormatter().parse('(('), '<b>')
84 | 
85 | 
86 | def test_match_shortcut(self):
87 | """Exercise the simple case in which the visitor takes care of matching."""
88 | self.assertEqual(HtmlFormatter().match('((other things'), '<b>')
89 | 
90 | 
91 | class CoupledFormatter(NodeVisitor):
92 | @rule('bold_open text bold_close')
93 | def visit_bold_text(self, node, visited_children):
94 | return ''.join(visited_children)
95 | 
96 | @rule('"(("')
97 | def visit_bold_open(self, node, visited_children):
98 | return '<b>'
99 | 
100 | @rule('"))"')
101 | def visit_bold_close(self, node, visited_children):
102 | return '</b>'
103 | 
104 | @rule('~"[a-zA-Z 0-9]*"')
105 | def visit_text(self, node, visited_children):
106 | """Return the text verbatim."""
107 | return node.text
108 | 
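# A sketch for orientation (editorial, not part of the original suite): the
# @rule decorator above hangs each grammar fragment on its visitor method, and
# the metaclass on NodeVisitor knits the fragments into a single Grammar, with
# the first-defined rule as the default. Conceptually, CoupledFormatter behaves
# like a visitor that declares in one place:
#
#     grammar = Grammar('''
#         bold_text = bold_open text bold_close
#         bold_open = "(("
#         bold_close = "))"
#         text = ~"[a-zA-Z 0-9]*"
#     ''')
#
# so CoupledFormatter().parse('((hi))') both parses and renders: '<b>hi</b>'.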
109 | class DecoratorTests(TestCase):
110 | def test_rule_decorator(self):
111 | """Make sure the @rule decorator works."""
112 | self.assertEqual(CoupledFormatter().parse('((hi))'), '<b>hi</b>')
113 | 
114 | 
115 | def test_rule_decorator_subclassing(self):
116 | """Make sure we can subclass and override visitor methods without blowing
117 | away the rules attached to them."""
118 | class OverridingFormatter(CoupledFormatter):
119 | def visit_text(self, node, visited_children):
120 | """Return the text capitalized."""
121 | return node.text.upper()
122 | 
123 | @rule('"not used"')
124 | def visit_useless(self, node, visited_children):
125 | """Get in the way. Tempt the metaclass to pave over the
126 | superclass's grammar with a new one."""
127 | 
128 | raise SkipTest("I haven't got around to making this work yet.")
129 | self.assertEqual(OverridingFormatter().parse('((hi))'), '<b>HI</b>')
130 | 
131 | 
132 | class PrimalScream(Exception):
133 | pass
134 | 
135 | 
136 | class SpecialCasesTests(TestCase):
137 | def test_unwrapped_exceptions(self):
138 | class Screamer(NodeVisitor):
139 | grammar = Grammar("""greeting = 'howdy'""")
140 | unwrapped_exceptions = (PrimalScream,)
141 | 
142 | def visit_greeting(self, thing, visited_children):
143 | raise PrimalScream('This should percolate up!')
144 | 
145 | self.assertRaises(PrimalScream, Screamer().parse, 'howdy')
146 | 
147 | 
148 | def test_node_inequality(self):
149 | node = Node(Literal('12345'), 'o hai', 0, 5)
150 | self.assertTrue(node != 5)
151 | self.assertTrue(node != None)
152 | self.assertTrue(node != Node(Literal('23456'), 'o hai', 0, 5))
153 | self.assertTrue(not (node != Node(Literal('12345'), 'o hai', 0, 5)))
154 | 
155 | 
156 | def test_generic_visit_NotImplementedError_unnamed_node(self):
157 | """
158 | Test that generic_visit provides informative error messages
159 | when visitors are not defined.
160 | 
161 | Regression test for https://github.com/erikrose/parsimonious/issues/110
162 | """
163 | class MyVisitor(NodeVisitor):
164 | grammar = Grammar(r'''
165 | bar = "b" "a" "r"
166 | ''')
167 | unwrapped_exceptions = (NotImplementedError, )
168 | 
169 | with self.assertRaises(NotImplementedError) as e:
170 | MyVisitor().parse('bar')
171 | self.assertIn("No visitor method was defined for this expression: 'b'", str(e.exception))
172 | 
173 | 
174 | def test_generic_visit_NotImplementedError_named_node(self):
175 | """
176 | Test that generic_visit provides informative error messages
177 | when visitors are not defined.
178 | """
179 | class MyVisitor(NodeVisitor):
180 | grammar = Grammar(r'''
181 | bar = myrule myrule myrule
182 | myrule = ~"[bar]"
183 | ''')
184 | unwrapped_exceptions = (NotImplementedError, )
185 | 
186 | with self.assertRaises(NotImplementedError) as e:
187 | MyVisitor().parse('bar')
188 | self.assertIn("No visitor method was defined for this expression: myrule = ~'[bar]'", str(e.exception))
189 | 
--------------------------------------------------------------------------------
/parsimonious/utils.py:
--------------------------------------------------------------------------------
1 | """General tools which don't depend on other parts of Parsimonious"""
2 | 
3 | import ast
4 | 
5 | 
6 | class StrAndRepr(object):
7 | """Mix-in which gives the class the same __repr__ and __str__."""
8 | 
9 | def __repr__(self):
10 | return self.__str__()
11 | 
12 | 
13 | def evaluate_string(string):
14 | """Piggyback on Python's string support so we can have backslash escaping
15 | and niceties like \n, \t, etc.
16 | 
17 | This also supports:
18 | 1. b"strings", allowing grammars to parse bytestrings, in addition to str.
b"strings", allowing grammars to parse bytestrings, in addition to str. 19 | 2. r"strings" to simplify regexes. 20 | """ 21 | return ast.literal_eval(string) 22 | 23 | 24 | class Token(StrAndRepr): 25 | """A class to represent tokens, for use with TokenGrammars 26 | 27 | You will likely want to subclass this to hold additional information, like 28 | the characters that you lexed to create this token. Alternately, feel free 29 | to create your own class from scratch. The only contract is that tokens 30 | must have a ``type`` attr. 31 | 32 | """ 33 | __slots__ = ['type'] 34 | 35 | def __init__(self, type): 36 | self.type = type 37 | 38 | def __str__(self): 39 | return '' % (self.type,) 40 | 41 | def __eq__(self, other): 42 | return self.type == other.type 43 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.2.0", "setuptools_scm[toml]>=3.4.3"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "parsimonious" 7 | version = "0.10.0" 8 | authors = [{name = "Erik Rose", email = "erikrose@grinchcentral.com"}] 9 | license = {text = "MIT"} 10 | description = "(Soon to be) the fastest pure-Python PEG parser I could muster" 11 | keywords = [ 12 | "parse", 13 | "parser", 14 | "parsing", 15 | "peg", 16 | "packrat", 17 | "grammar", 18 | "language", 19 | ] 20 | readme = "README.rst" 21 | classifiers = [ 22 | "Intended Audience :: Developers", 23 | "Natural Language :: English", 24 | "Development Status :: 3 - Alpha", 25 | "License :: OSI Approved :: MIT License", 26 | "Operating System :: OS Independent", 27 | "Programming Language :: Python :: 3 :: Only", 28 | "Programming Language :: Python :: 3", 29 | "Programming Language :: Python :: 3.8", 30 | "Programming Language :: Python :: 3.9", 31 | "Programming Language :: Python :: 3.10", 32 | "Programming Language :: Python :: 3.11", 33 | "Programming Language :: Python :: 3.12", 34 | "Programming Language :: Python :: 3.13", 35 | "Topic :: Scientific/Engineering :: Information Analysis", 36 | "Topic :: Software Development :: Libraries", 37 | "Topic :: Text Processing :: General", 38 | ] 39 | urls = {Homepage = "https://github.com/erikrose/parsimonious"} 40 | dependencies = ["regex>=2022.3.15"] 41 | 42 | [project.optional-dependencies] 43 | testing = ["pytest"] 44 | 45 | [tool.setuptools] 46 | include-package-data = true 47 | 48 | [tool.setuptools.packages] 49 | find = {namespaces = false} 50 | 51 | [tool.setuptools_scm] 52 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | if __name__ == "__main__": 4 | setup() 5 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py38, py39, py310, py311, py312, py313 3 | 4 | [gh-actions] 5 | python = 6 | 3.8: py38 7 | 3.9: py39 8 | 3.10: py310 9 | 3.11: py311 10 | 3.12: py312 11 | 3.13: py313 12 | 13 | [testenv] 14 | usedevelop = True 15 | commands = py.test --tb=native {posargs:parsimonious} 16 | deps = 17 | pytest 18 | --------------------------------------------------------------------------------