├── src └── parsy │ ├── version.py │ └── __init__.py ├── .gitignore ├── docs ├── installation.rst ├── howto │ ├── index.rst │ ├── other_examples.rst │ └── lexing.rst ├── ref │ ├── index.rst │ ├── parser_instances.rst │ ├── generating.rst │ ├── primitives.rst │ └── methods_and_combinators.rst ├── index.rst ├── Makefile ├── make.bat ├── history.rst ├── contributing.rst ├── overview.rst ├── conf.py └── tutorial.rst ├── .editorconfig ├── MANIFEST.in ├── setup.cfg ├── travis_tests.sh ├── .travis.yml ├── tox.ini ├── examples ├── simple_logo_lexer.py ├── simple_logo_parser.py ├── json.py └── simple_eval.py ├── RELEASE.rst ├── LICENSE ├── README.rst ├── setup.py └── test ├── test_sexpr.py └── test_parsy.py /src/parsy/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.0.1-dev1' 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | /dist 3 | .tox 4 | src/parsy.egg-info 5 | docs/_build 6 | .cache 7 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | parsy can be installed with pip:: 6 | 7 | pip install parsy 8 | 9 | 10 | Python 3.3 or greater is required. 11 | 12 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # See http://editorconfig.org/ 2 | root = true 3 | 4 | [*] 5 | end_of_line = lf 6 | insert_final_newline = true 7 | charset = utf-8 8 | indent_style = space 9 | 10 | [*.py] 11 | indent_size = 4 12 | -------------------------------------------------------------------------------- /docs/howto/index.rst: -------------------------------------------------------------------------------- 1 | ================================= 2 | Howto's, cookbooks and examples 3 | ================================= 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | :caption: Contents: 8 | 9 | lexing 10 | other_examples 11 | -------------------------------------------------------------------------------- /docs/ref/index.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | API reference 3 | =============== 4 | 5 | .. 
toctree:: 6 | :maxdepth: 3 7 | :caption: Contents: 8 | 9 | primitives 10 | methods_and_combinators 11 | generating 12 | parser_instances 13 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.md 2 | include *.rst 3 | include *.sh 4 | include LICENSE 5 | include tox.ini 6 | include .editorconfig 7 | recursive-include docs *.bat 8 | recursive-include docs *.py 9 | recursive-include docs *.rst 10 | recursive-include docs Makefile 11 | recursive-include examples *.py 12 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | 3 | [isort] 4 | multi_line_output = 5 5 | line_length = 119 6 | default_section = THIRDPARTY 7 | skip = .tox,.git,docs,dist,build 8 | known_first_party = parsy 9 | 10 | [flake8] 11 | exclude = .tox,.git,docs,dist,build 12 | ignore = E731,E221,W503 13 | max-line-length = 119 14 | -------------------------------------------------------------------------------- /travis_tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | coverage run --branch --source=parsy `which py.test` || exit 1 4 | 5 | # Coveralls is flaky sometimes, especially for concurrent uploads. 6 | # https://github.com/lemurheavy/coveralls-public/issues/487 7 | # So try again if it fails first time. 8 | coveralls || { sleep $((RANDOM / 4000 + 1)); coveralls; } 9 | -------------------------------------------------------------------------------- /docs/howto/other_examples.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Other examples 3 | ============== 4 | 5 | This section has some further example parsers that you can study. There are also 6 | examples in the :doc:`/tutorial` and in :doc:`/ref/generating`. 7 | 8 | JSON parser 9 | =========== 10 | 11 | .. literalinclude:: ../../examples/json.py 12 | :language: python 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.3" 4 | - "3.4" 5 | - "3.5" 6 | - "3.6" 7 | env: SCRIPT=./travis_tests.sh 8 | matrix: 9 | include: 10 | - python: "3.5" 11 | env: SCRIPT=flake8 12 | - python: "3.5" 13 | env: SCRIPT="isort -c" 14 | - python: "3.5" 15 | env: SCRIPT=check-manifest 16 | install: 17 | - pip install pytest flake8 check-manifest isort coverage coveralls 18 | - ./setup.py develop 19 | script: 20 | - $SCRIPT 21 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to parsy's documentation! 2 | ================================= 3 | 4 | These are the docs for parsy |release|. Check the :doc:`/history` for 5 | significant changes. 6 | 7 | .. 
toctree:: 8 | :maxdepth: 4 9 | :caption: Contents: 10 | 11 | installation 12 | overview 13 | tutorial 14 | ref/index 15 | howto/index 16 | history 17 | contributing 18 | 19 | Indices and tables 20 | ================== 21 | 22 | * :ref:`genindex` 23 | * :ref:`modindex` 24 | * :ref:`search` 25 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py33,py34,py35,py36,checkmanifest,isort-check,flake8-check 3 | 4 | [testenv] 5 | deps = pytest 6 | commands = pytest 7 | 8 | 9 | [testenv:checkmanifest] 10 | basepython = python3.3 11 | deps = check-manifest 12 | commands = check-manifest 13 | 14 | [testenv:isort-check] 15 | # isort configurations are located in setup.cfg 16 | basepython = python3.3 17 | deps = isort==4.2.15 18 | commands = isort -rc -c {toxinidir} 19 | 20 | [testenv:flake8-check] 21 | basepython = python3.3 22 | deps = flake8==3.4.1 23 | commands = flake8 24 | -------------------------------------------------------------------------------- /examples/simple_logo_lexer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Stripped down logo lexer, for tokenizing Turtle Logo programs like: 3 | 4 | fd 1 5 | bk 2 6 | rt 90 7 | 8 | etc. 9 | """ 10 | 11 | from parsy import eof, regex, seq, string, string_from, whitespace 12 | 13 | command = string_from("fd", "bk", "rt", "lt") 14 | number = regex(r'[0-9]+').map(int) 15 | optional_whitespace = regex(r'\s*') 16 | eol = string("\n") 17 | line = seq(optional_whitespace >> command, 18 | whitespace >> number, 19 | (eof | eol | (whitespace >> eol)).result("\n")) 20 | flatten_list = lambda ls: sum(ls, []) 21 | lexer = line.many().map(flatten_list) 22 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = parsy 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /RELEASE.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | How to do releases 3 | ================== 4 | 5 | * Check test suite passes on all supported versions:: 6 | 7 | tox 8 | 9 | * Change docs/history.rst to remove " - unreleased" 10 | 11 | * Update the version number (removing the ``-dev1`` part): 12 | 13 | * src/parsy/version.py 14 | * docs/conf.py 15 | 16 | * Commit with "Version bump" 17 | 18 | * Release to PyPI:: 19 | 20 | ./setup.py sdist bdist_wheel upload 21 | 22 | * Tag and push:: 23 | 24 | 25 | git tag v$VERSION 26 | git push 27 | git push --tags 28 | 29 | 30 | Post release 31 | ------------ 32 | 33 | * Bump version numbers to next version, and add ``-dev1`` suffix, for example 34 | ``0.9.0-dev1`` 35 | 36 | * Add new section to docs/history.rst, with " - unreleased". 37 | 38 | * Commit and push 39 | -------------------------------------------------------------------------------- /examples/simple_logo_parser.py: -------------------------------------------------------------------------------- 1 | from parsy import generate, match_item, test_item 2 | 3 | 4 | class Command: 5 | def __init__(self, parameter): 6 | self.parameter = parameter 7 | 8 | def __repr__(self): 9 | return "{0}({1})".format(self.__class__.__name__, self.parameter) 10 | 11 | 12 | class Forward(Command): 13 | pass 14 | 15 | 16 | class Backward(Command): 17 | pass 18 | 19 | 20 | class Right(Command): 21 | pass 22 | 23 | 24 | class Left(Command): 25 | pass 26 | 27 | 28 | commands = { 29 | 'fd': Forward, 30 | 'bk': Backward, 31 | 'rt': Right, 32 | 'lt': Left, 33 | } 34 | 35 | 36 | @generate 37 | def statement(): 38 | cmd_name = yield test_item(lambda i: i in commands.keys(), "command") 39 | parameter = yield test_item(lambda i: isinstance(i, int), "number") 40 | yield match_item('\n') 41 | return commands[cmd_name](int(parameter)) 42 | 43 | 44 | program = statement.many() 45 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=parsy 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | # MIT license. 
See http://www.opensource.org/licenses/mit-license.php 2 | 3 | Copyright (c) 2013 Jeanine Adkisson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | ========================= 2 | History and release notes 3 | ========================= 4 | 5 | .. currentmodule:: parsy 6 | 7 | 1.0.1 - unreleased 8 | ------------------ 9 | 10 | 11 | 1.0.0 - 2017-10-10 12 | ------------------ 13 | 14 | * Improved parse failure messages of ``@generate`` parsers. Previously 15 | the parser was given a default description of the function name, 16 | which hides all useful internal info there might be. 17 | * Added :meth:`Parser.sep_by` 18 | * Added :func:`test_char` 19 | * Added :func:`char_from` 20 | * Added :func:`string_from` 21 | * Added :data:`any_char` 22 | * Added :data:`decimal_digit` 23 | * Added :meth:`Parser.concat` 24 | * Fixed parsy so that it can again work with tokens as well as strings, allowing it to 25 | be used as both a :doc:`lexer or parser or both `, with docs and tests. 26 | * Added :func:`test_item` 27 | * Added :func:`match_item` 28 | * Added :meth:`Parser.should_fail` 29 | 30 | 0.9.0 - 2017-09-28 31 | ------------------ 32 | 33 | * Better error reporting of failed parses. 34 | * Documentation overhaul and expansion. 35 | * Added :meth:`Parser.combine`. 36 | 37 | 0.0.4 - 2014-12-28 38 | ------------------ 39 | 40 | * See git logs for changes before this point. 41 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | parsy 2 | ===== 3 | 4 | |Documentation Status| |Build Status| |Coveralls| 5 | 6 | Parsy is an easy way to combine simple, small parsers into complex, larger 7 | parsers. If it means anything to you, it's a monadic parser combinator library 8 | for LL(infinity) grammars in the spirit of `Parsec 9 | `_, `Parsnip 10 | `_, and `Parsimmon 11 | `_. 12 | 13 | Parsy requires Python 3.3 or greater. 14 | 15 | Links: 16 | 17 | - `Documentation `_ 18 | - `History and changelog `_ 19 | - `PyPI `_ 20 | 21 | To contribute, please create a fork and submit a pull request on GitHub, 22 | after checking the "contributing" section of the docs. Thanks! 
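If you just want a quick taste of what parsy code looks like before diving into the docs, here is an illustrative snippet (not taken from the test suite) showing small parsers being combined with operators:

.. code-block:: python

    >>> from parsy import regex, string
    >>> greeting = string('Hello, ') >> regex(r'[A-Z][a-z]*') << string('!')
    >>> greeting.parse('Hello, World!')
    'World'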
23 | 24 | Parsy was originally written by `Jeanine Adkisson `_, 25 | with contributions by other people as can be found in the git commit history. 26 | 27 | .. |Documentation Status| image:: https://readthedocs.org/projects/parsy/badge/?version=latest 28 | :target: http://parsy.readthedocs.io/en/latest/?badge=latest 29 | .. |Build Status| image:: https://travis-ci.org/python-parsy/parsy.svg?branch=master 30 | :target: https://travis-ci.org/python-parsy/parsy 31 | .. |Coveralls| image:: https://coveralls.io/repos/github/python-parsy/parsy/badge.svg?branch=master 32 | :target: https://coveralls.io/github/python-parsy/parsy?branch=master 33 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os.path 4 | 5 | from setuptools import find_packages, setup 6 | 7 | # Evaluate version module without importing parsy, which could have undesirable 8 | # effects. 9 | version_file = os.path.join(os.path.dirname(__file__), 10 | "src", "parsy", "version.py") 11 | namespace = {} 12 | exec(compile(open(version_file, "rb").read(), version_file, 'exec'), 13 | globals(), namespace) 14 | version = namespace['__version__'] 15 | 16 | readme = open('README.rst').read() 17 | 18 | setup( 19 | name="parsy", 20 | version=version, 21 | description="easy-to-use parser combinators, for parsing in pure Python", 22 | long_description=readme, 23 | author="Jeanine Adkisson", 24 | author_email="jneen at jneen dot net (humans only, please)", 25 | maintainer="Luke Plant", 26 | maintainer_email="L.Plant.98@cantab.net", 27 | url="https://github.com/python-parsy/parsy", 28 | license="MIT", 29 | classifiers=[ 30 | "Development Status :: 5 - Production/Stable", 31 | "Intended Audience :: Developers", 32 | "Topic :: Software Development :: Compilers", 33 | "Topic :: Software Development :: Interpreters", 34 | "Topic :: Text Processing", 35 | "License :: OSI Approved :: MIT License", 36 | "Programming Language :: Python :: 3", 37 | "Programming Language :: Python :: 3.3", 38 | "Programming Language :: Python :: 3.4", 39 | "Programming Language :: Python :: 3.5", 40 | "Programming Language :: Python :: 3.6", 41 | ], 42 | keywords="parser parsers parsing monad combinators", 43 | packages=find_packages('src'), 44 | package_dir={'': 'src'}, 45 | ) 46 | -------------------------------------------------------------------------------- /examples/json.py: -------------------------------------------------------------------------------- 1 | from sys import stdin 2 | 3 | from parsy import generate, regex, string 4 | 5 | whitespace = regex(r'\s*') 6 | lexeme = lambda p: p << whitespace 7 | lbrace = lexeme(string('{')) 8 | rbrace = lexeme(string('}')) 9 | lbrack = lexeme(string('[')) 10 | rbrack = lexeme(string(']')) 11 | colon = lexeme(string(':')) 12 | comma = lexeme(string(',')) 13 | true = lexeme(string('true')).result(True) 14 | false = lexeme(string('false')).result(False) 15 | null = lexeme(string('null')).result(None) 16 | number = lexeme( 17 | regex(r'-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?') 18 | ).map(float) 19 | string_part = regex(r'[^"\\]+') 20 | string_esc = string('\\') >> ( 21 | string('\\') 22 | | string('/') 23 | | string('"') 24 | | string('b').result('\b') 25 | | string('f').result('\f') 26 | | string('n').result('\n') 27 | | string('r').result('\r') 28 | | string('t').result('\t') 29 | | regex(r'u[0-9a-fA-F]{4}').map(lambda s: chr(int(s[1:], 16))) 30 | ) 31 | 
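# A JSON string literal: string_part/string_esc above yield the literal chunks
# and the decoded escape characters, and concat() joins them back into one str.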
quoted = lexeme(string('"') >> (string_part | string_esc).many().concat() << string('"')) 32 | 33 | 34 | # Circular dependency between array and value means we use `generate` form here 35 | @generate 36 | def array(): 37 | yield lbrack 38 | elements = yield value.sep_by(comma) 39 | yield rbrack 40 | return elements 41 | 42 | 43 | @generate 44 | def object_pair(): 45 | key = yield quoted 46 | yield colon 47 | val = yield value 48 | return (key, val) 49 | 50 | 51 | json_object = lbrace >> object_pair.sep_by(comma).map(dict) << rbrace 52 | value = quoted | number | json_object | array | true | false | null 53 | json = whitespace >> value 54 | 55 | if __name__ == '__main__': 56 | print(repr(json.parse(stdin.read()))) 57 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | Contributing to parsy 2 | ===================== 3 | 4 | Contributions to parsy, whether code or docs, are very welcome. Please 5 | contribute by making a fork, and submitting a PR on `GitHub 6 | `_. 7 | 8 | We have a high standard in terms of quality. All contributions will need to be 9 | fully covered by unit tests and documentation. Code should be formatted 10 | according to PEP 8, and the formatting defined by the ``../.editorconfig`` file 11 | (see `EditorConfig `_). 12 | 13 | To run the test suite:: 14 | 15 | pip install pytest 16 | pytest 17 | 18 | To run the test suite on all supported Python versions, and code quality checks, 19 | first install the various Python versions, then:: 20 | 21 | pip install tox 22 | tox 23 | 24 | To build the docs, do:: 25 | 26 | pip install sphinx 27 | cd docs 28 | make html 29 | 30 | We also require that `flake8 `_, `isort 31 | `_ and check-manifest report zero 32 | errors (these are run by tox). 33 | 34 | When writing documentation, please keep in mind Daniele Procida's `great article 35 | on documentation `_. To summarise, 36 | there are 4 types of docs: 37 | 38 | * Tutorials (focus: learning, analogy: teaching a child to cook) 39 | * How-to guides (focus: goals, analogy: a recipe in a cook book) 40 | * Discussions (focus: understanding, analogy: an article on culinary history) 41 | * Reference (focus: information, analogy: encyclopedia article) 42 | 43 | We do not (yet) have documentation that fits into the "Discussions" category, 44 | but we do have the others, and when adding new features, documentation of the 45 | right sort(s) should be added. With parsy, where code is often very succinct, 46 | this often takes several times longer than writing the code.
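As a concrete (and purely hypothetical) sketch, a pull request adding a new combinator would normally include reference documentation for it plus a test written in the unittest style of the existing test suite, for example:

.. code-block:: python

    import unittest

    from parsy import ParseError, string


    class TestMyNewFeature(unittest.TestCase):
        # Hypothetical test names - cover both the success and the failure
        # path of whatever behaviour the contribution adds.
        def test_success(self):
            self.assertEqual(string('x').parse('x'), 'x')

        def test_failure(self):
            with self.assertRaises(ParseError):
                string('x').parse('y')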
47 | -------------------------------------------------------------------------------- /test/test_sexpr.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unittest 3 | 4 | from parsy import generate, regex, string 5 | 6 | whitespace = regex(r'\s+', re.MULTILINE) 7 | comment = regex(r';.*') 8 | ignore = (whitespace | comment).many() 9 | 10 | lexeme = lambda p: p << ignore 11 | 12 | lparen = lexeme(string('(')) 13 | rparen = lexeme(string(')')) 14 | number = lexeme(regex(r'\d+')).map(int) 15 | symbol = lexeme(regex(r'[\d\w_-]+')) 16 | true = lexeme(string('#t')).result(True) 17 | false = lexeme(string('#f')).result(False) 18 | 19 | atom = true | false | number | symbol 20 | 21 | 22 | @generate('a form') 23 | def form(): 24 | yield lparen 25 | els = yield expr.many() 26 | yield rparen 27 | return els 28 | 29 | 30 | @generate 31 | def quote(): 32 | yield string("'") 33 | e = yield expr 34 | return ['quote', e] 35 | 36 | 37 | expr = form | quote | atom 38 | program = ignore >> expr.many() 39 | 40 | 41 | class TestSexpr(unittest.TestCase): 42 | def test_form(self): 43 | result = program.parse('(1 2 3)') 44 | self.assertEqual(result, [[1, 2, 3]]) 45 | 46 | def test_quote(self): 47 | result = program.parse("'foo '(bar baz)") 48 | self.assertEqual(result, 49 | [['quote', 'foo'], ['quote', ['bar', 'baz']]]) 50 | 51 | def test_double_quote(self): 52 | result = program.parse("''foo") 53 | self.assertEqual(result, [['quote', ['quote', 'foo']]]) 54 | 55 | def test_boolean(self): 56 | result = program.parse('#t #f') 57 | self.assertEqual(result, [True, False]) 58 | 59 | def test_comments(self): 60 | result = program.parse( 61 | """ 62 | ; a program with a comment 63 | ( foo ; that's a foo 64 | bar ) 65 | ; some comments at the end 66 | """ 67 | ) 68 | 69 | self.assertEqual(result, [['foo', 'bar']]) 70 | 71 | 72 | if __name__ == '__main__': 73 | unittest.main() 74 | -------------------------------------------------------------------------------- /examples/simple_eval.py: -------------------------------------------------------------------------------- 1 | from parsy import digit, generate, match_item, regex, string, success, test_item 2 | 3 | 4 | def lexer(code): 5 | whitespace = regex(r'\s*') 6 | integer = digit.at_least(1).concat().map(int) 7 | float_ = ( 8 | digit.many() + string('.').result(['.']) + digit.many() 9 | ).concat().map(float) 10 | parser = whitespace >> (( 11 | float_ | integer | regex(r'[()*/+-]') 12 | ) << whitespace).many() 13 | return parser.parse(code) 14 | 15 | 16 | def eval_tokens(tokens): 17 | # This function parses and evaluates at the same time. 
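    # Operator precedence is encoded in the layering of the parsers below:
    # `additive` parses `multiplicative` terms separated by + and -,
    # `multiplicative` parses `simple` factors separated by * and /, and
    # `simple` is either a parenthesised sub-expression or a signed number,
    # so * and / bind more tightly than + and -.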
18 | 19 | lparen = match_item('(') 20 | rparen = match_item(')') 21 | 22 | @generate 23 | def additive(): 24 | res = yield multiplicative 25 | sign = match_item('+') | match_item('-') 26 | while True: 27 | operation = yield sign | success('') 28 | if not operation: 29 | break 30 | operand = yield multiplicative 31 | if operation == '+': 32 | res += operand 33 | elif operation == '-': 34 | res -= operand 35 | return res 36 | 37 | @generate 38 | def multiplicative(): 39 | res = yield simple 40 | op = match_item('*') | match_item('/') 41 | while True: 42 | operation = yield op | success('') 43 | if not operation: 44 | break 45 | operand = yield simple 46 | if operation == '*': 47 | res *= operand 48 | elif operation == '/': 49 | res /= operand 50 | return res 51 | 52 | @generate 53 | def number(): 54 | sign = yield match_item('+') | match_item('-') | success('+') 55 | value = yield test_item( 56 | lambda x: isinstance(x, (int, float)), 'number') 57 | return value if sign == '+' else -value 58 | 59 | expr = additive 60 | simple = (lparen >> expr << rparen) | number 61 | 62 | return expr.parse(tokens) 63 | 64 | 65 | def simple_eval(expr): 66 | return eval_tokens(lexer(expr)) 67 | 68 | 69 | if __name__ == '__main__': 70 | print(simple_eval(input())) 71 | -------------------------------------------------------------------------------- /docs/ref/parser_instances.rst: -------------------------------------------------------------------------------- 1 | ============================= 2 | Creating new Parser instances 3 | ============================= 4 | 5 | .. currentmodule:: parsy 6 | 7 | Normally you will create Parser instances using the provided :doc:`primitives 8 | ` and :doc:`combinators `. 9 | 10 | However it is also possible to create them manually, as below. 11 | 12 | The :class:`Parser` constructor should be passed a function that takes the 13 | string/list to be parsed, and returns a and returns a :class:`Result` object. 14 | The ``Result`` object will be created either using :meth:`Result.success` or 15 | :meth:`Result.failure` to indicate success or failure respectively. 16 | :meth:`Result.success` should be passed the next index to continue parsing with, 17 | and the value that is returned from the parsing. :meth:`Result.failure` should 18 | return the index at which failure occurred i.e. the index passed in, and a 19 | string indicating what the parser expected to find. 20 | 21 | The ``Parser`` constructor will usually be called using decorator syntax. In 22 | order to pass parameters to the ``Parser`` instance, it is typically created 23 | using a closure. In the example below, we create a parser that matches any 24 | string/list of tokens of a given length. This could also be written as something 25 | like ``any_char.times(n).concat()`` but the following will be more efficient: 26 | 27 | 28 | .. code-block:: python 29 | 30 | def consume(n): 31 | 32 | @Parser 33 | def consumer(stream, index): 34 | items = stream[index:index + n] 35 | if len(items) == n: 36 | return Result.success(index + n, items) 37 | else: 38 | return Result.failure(index, "{0} items".format(n)) 39 | 40 | return consumer 41 | 42 | 43 | .. code-block:: python 44 | 45 | >>> consume(3).many().parse('abc123def') 46 | ['abc', '123', 'def'] 47 | 48 | 49 | Result objects 50 | ============== 51 | 52 | .. class:: Result 53 | 54 | .. staticmethod:: success(next_index, value) 55 | 56 | Creates a ``Result`` object indicating parsing succeeded. 
The index to 57 | continue parsing at, and the value retrieved from the parsing, should be 58 | passed. 59 | 60 | .. staticmethod:: failure(index, expected) 61 | 62 | Creates a ``Result`` object indicating parsing failed. The index to 63 | continue parsing at, and a string representing what the parser expected to 64 | find, should be passed. 65 | -------------------------------------------------------------------------------- /docs/howto/lexing.rst: -------------------------------------------------------------------------------- 1 | ===================================== 2 | Separate lexing/tokenization phases 3 | ===================================== 4 | 5 | .. currentmodule:: parsy 6 | 7 | Most of the documentation in parsy assumes that when you call 8 | :meth:`Parser.parse` you will pass a string, and will get back your final 9 | parsed, constructed object (of whatever type you desire). 10 | 11 | A more classical approach to parsing is that you first have a 12 | lexing/tokenization phase, the result of which is a simple list of tokens. These 13 | tokens could be strings, or other objects. 14 | 15 | You then have a separate parsing phase that consumes this list of tokens, and 16 | produces your final object, which is very often a tree-like structure or other 17 | complex object. 18 | 19 | Parsy can actually work with either approach. Further, for the split 20 | lexing/parsing approach, parsy can be used either to implement the lexer, or the 21 | parser, or both! The following examples use parsy to do both lexing and parsing. 22 | 23 | Turtle Logo 24 | =========== 25 | 26 | For our first example, we'll do a very stripped down Turtle Logo parser. First, 27 | the lexer: 28 | 29 | .. literalinclude:: ../../examples/simple_logo_lexer.py 30 | :language: python 31 | 32 | 33 | We are not interested in whitespace, so our lexer removes it all, apart from 34 | newlines. We can now parse a program into the tokens we are interested in: 35 | 36 | .. code-block:: python 37 | 38 | >>> l = lexer.parse("fd 1\nbk 2") 39 | >>> l 40 | ['fd', 1, '\n', 'bk', 2, '\n'] 41 | 42 | The ``line`` parser produces a list, so after applying ``many`` which also 43 | produces a list, we applied a level of flattening so that we end up with a 44 | simple list of tokens. We also chose to convert the parameters to integers while 45 | we were at it, so in this case our list of tokens is not a list of strings, but 46 | heterogeneous. 47 | 48 | The next step is the parser. We create some classes to represent different 49 | commands, and then use parsy again to create a parser which is very simple 50 | because this is a very limited language: 51 | 52 | .. literalinclude:: ../../examples/simple_logo_parser.py 53 | :language: python 54 | 55 | To use it, we pass the the list of tokens generated above into 56 | ``program.parse``: 57 | 58 | .. code-block:: python 59 | 60 | >>> program.parse(l) 61 | [Forward(1), Backward(2)] 62 | 63 | In a real implementation, we could then have ``execute`` methods on the 64 | ``Command`` sub-classes if we wanted to implement an interpreter, for example. 65 | 66 | Calculator 67 | ========== 68 | 69 | Our second example illustrates lexing and then parsing a sequence of 70 | mathematical operations, e.g "1 + 2 * (3 - 4.5)", with precedence. 71 | 72 | In this case, while doing the parsing stage, instead of building up an AST of 73 | objects representing the operations, the parser actually evaluates the 74 | expression. 75 | 76 | .. 
literalinclude:: ../../examples/simple_eval.py 77 | :language: python 78 | -------------------------------------------------------------------------------- /docs/overview.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Overview 3 | ======== 4 | 5 | Parsy is an easy way to combine simple, small parsers into complex, larger 6 | parsers. 7 | 8 | If it means anything to you, it's a monadic parser combinator library for 9 | LL(infinity) grammars in the spirit of `Parsec 10 | `_, `Parsnip 11 | `_, and `Parsimmon 12 | `_. 13 | 14 | If that means nothing, rest assured that parsy is a very straightforward and 15 | Pythonic solution for parsing text that doesn't require knowing anything about 16 | monads. 17 | 18 | Parsy differentiates itself from other solutions with the following: 19 | 20 | * it is not a parser generator, but a combinator based parsing library. 21 | * a very clean implementation, only a few hundred lines, that borrows 22 | from the best of recent combinator libraries. 23 | * free, good quality documentation, all in one place. (Please raise an issue on 24 | GitHub if you have any problems, or find the documentation lacking in any 25 | way). 26 | * it avoids mutability, and therefore a ton of related bugs. 27 | * it has monadic binding with a :doc:`nice syntax `. In plain 28 | English: 29 | 30 | * we can easily handle cases where later parsing depends on the value of 31 | something parsed earlier e.g. Hollerith constants. 32 | * it's easy to build up complex result objects, rather than having lists of 33 | lists etc. 34 | * there is no need for things like `pyparsing's Forward class 35 | `_ . 36 | 37 | * it has a minimalist philosophy. It doesn't include built-in helpers for any 38 | specific grammars or languages, but provides building blocks for making these. 39 | 40 | Basic usage looks like this: 41 | 42 | Example 1 - parsing a set of alternatives: 43 | 44 | .. code-block:: python 45 | 46 | >>> from parsy import string 47 | >>> parser = (string('Dr.') | string('Mr.') | string('Mrs.')).desc("title") 48 | >>> parser.parse('Mrs.') 49 | 'Mrs.' 50 | >>> parser.parse('Mr.') 51 | 'Mr.' 52 | 53 | >>> parser.parse('Joe') 54 | ParseError: expected title at 0:0 55 | 56 | >>> parser.parse_partial('Dr. Who') 57 | ('Dr.', ' Who') 58 | 59 | Example 2 - Parsing a dd-mm-yy date: 60 | 61 | .. code-block:: python 62 | 63 | >>> from parsy import string, regex 64 | >>> from datetime import date 65 | >>> ddmmyy = regex(r'[0-9]{2}').map(int).sep_by(string("-"), min=3, max=3).combine( 66 | ... lambda d, m, y: date(2000 + y, m, d)) 67 | >>> ddmmyy.parse('06-05-14') 68 | datetime.date(2014, 5, 6) 69 | 70 | 71 | To learn how to use parsy, you should continue with: 72 | 73 | * the :doc:`tutorial `, especially if you are not familiar with this 74 | type of parser library. 75 | * the :doc:`parser generator decorator ` 76 | * the :doc:`builtin parser primitives ` 77 | * the :doc:`method and combinator reference ` 78 | 79 | Other Python projects 80 | ===================== 81 | 82 | * `pyparsing `_. Also a combinator approach, 83 | but in general much less cleanly implemented, and rather scattered 84 | documentation. 85 | 86 | * `funcparserlib `_ - the most 87 | similar to parsy. It differs from parsy mainly in normally using a separate 88 | tokenization phase, lacking the convenience of the :func:`generate` method for 89 | creating parsers, and documentation that relies on understanding Haskell type 90 | annotations. 91 | 92 | * `Lark `_. 
With Lark you write a grammar 93 | definition in a separate mini-language as a string, and have a parser 94 | generated for you, rather than writing the grammar in Python. It has the 95 | advantage of speed and being able to use different parsing algorithms. 96 | -------------------------------------------------------------------------------- /docs/ref/generating.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Generating a parser 3 | =================== 4 | 5 | .. currentmodule:: parsy 6 | .. function:: generate 7 | 8 | ``generate`` converts a generator function (one that uses the ``yield`` keyword) 9 | into a parser. The generator function must yield parsers. These parsers are 10 | applied successively and their results are sent back to the generator using the 11 | ``.send()`` protocol. The generator function should return the final result of 12 | the parsing. Alternatively it can return another parser, which is equivalent to 13 | applying it and returning its result. 14 | 15 | Motivation and examples 16 | ======================= 17 | 18 | Constructing parsers by using combinators and :class:`Parser` methods to make 19 | larger parsers works well for many simpler cases. However, for more complex 20 | cases the ``generate`` function decorator is both more readable and more 21 | powerful. 22 | 23 | Alternative syntax to combinators 24 | --------------------------------- 25 | 26 | The first example just shows a different way of building a parser that could 27 | have easily been using combinators: 28 | 29 | .. code:: python 30 | 31 | from parsy import generate 32 | 33 | @generate("form") 34 | def form(): 35 | """ 36 | Parse an s-expression form, like (a b c). 37 | An equivalent to lparen >> expr.many() << rparen 38 | """ 39 | yield lparen 40 | exprs = yield expr.many() 41 | yield rparen 42 | return exprs 43 | 44 | In the example above, the parser was given a string name ``"form"``, which does 45 | the same as :meth:`Parser.desc`. This is not required, as per the examples below. 46 | 47 | Note that there is no guarantee that the entire function is executed: if any of 48 | the yielded parsers fails, the function will not complete, and parsy will try to 49 | backtrack to an alternative parser if there is one. 50 | 51 | Building complex objects 52 | ------------------------ 53 | 54 | The second example shows how you can use multiple parse results to build up a 55 | complex object: 56 | 57 | .. code:: python 58 | 59 | from datetime import date 60 | 61 | from parsy import generate, regex, string 62 | 63 | @generate 64 | def date(): 65 | """ 66 | Parse a date in the format YYYY-MM-DD 67 | """ 68 | year = yield regex("[0-9]{4}").map(int) 69 | yield string("-") 70 | month = yield regex("[0-9]{2}").map(int) 71 | yield string("-") 72 | day = yield regex("[0-9]{2}").map(int) 73 | 74 | return date(year, month, day) 75 | 76 | This could also have been achieved using :func:`seq` and :meth:`Parser.combine`. 77 | 78 | Using values already parsed 79 | --------------------------- 80 | 81 | The third example shows how we can use an earlier parsed value to influence the 82 | subsequent parsing. This example parses Hollerith constants. Hollerith constants 83 | are a way of specifying an arbitrary set of characters by first writing the 84 | integer that specifies the length, followed by the character H, followed by the 85 | set of characters. For example, ``pancakes`` would be written ``8Hpancakes``. 86 | 87 | .. 
code:: python 88 | 89 | from parsy import generate, regex, string, any_char 90 | 91 | @generate 92 | def hollerith(): 93 | num = yield regex(r'[0-9]+').map(int) 94 | yield string('H') 95 | return any_char.times(num).concat() 96 | 97 | (You may want to compare this with an `implementation of Hollerith constants 98 | `_ that 99 | uses `pyparsing `_, originally by John 100 | Shipman from his `pyparsing docs 101 | `_.) 102 | 103 | There are also more complex examples in the :ref:`tutorial 104 | ` of using the ``generate`` decorator to create parsers 105 | where there is logic that is conditional upon earlier parsed values. 106 | 107 | Implementing recursive definitions 108 | ---------------------------------- 109 | 110 | A fourth examples shows how you can use this syntax for grammars that you would 111 | like to define recursively (or mutually recursively). 112 | 113 | Say we want to be able to pass an s-expression like syntax which uses 114 | parenthesis for grouping items into a tree structure, like the following:: 115 | 116 | (0 1 (2 3) (4 5 6) 7 8) 117 | 118 | A naive approach would be: 119 | 120 | .. code-block:: python 121 | 122 | simple = regex('[0-9]+').map(int) 123 | group = string('(') >> expr.sep_by(string(' ')) << string(')') 124 | expr = simple | group 125 | 126 | The problem is that the second line will get a ``NameError`` because ``expr`` is 127 | not defined yet. 128 | 129 | Using the ``@generate`` syntax will introduce a level of laziness in resolving 130 | ``expr`` that allows things to work: 131 | 132 | .. code-block:: python 133 | 134 | simple = regex('[0-9]+').map(int) 135 | 136 | @generate 137 | def group(): 138 | return (yield string('(') >> expr.sep_by(string(' ')) << string(')')) 139 | 140 | expr = simple | group 141 | 142 | .. code-block:: python 143 | 144 | >>> expr.parse("(0 1 (2 3) (4 5 6) 7 8)") 145 | [0, 1, [2, 3], [4, 5, 6], 7, 8] 146 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # parsy documentation build configuration file, created by 5 | # sphinx-quickstart on Mon Sep 25 22:24:17 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | sys.path.insert(0, os.path.abspath('../src')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ['sphinx.ext.viewcode'] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ['_templates'] 38 | 39 | # The suffix(es) of source filenames. 
40 | # You can specify multiple suffix as a list of string: 41 | # 42 | # source_suffix = ['.rst', '.md'] 43 | source_suffix = '.rst' 44 | 45 | # The master toctree document. 46 | master_doc = 'index' 47 | 48 | # General information about the project. 49 | project = 'parsy' 50 | copyright = '2017, Jeanine Adkisson, Luke Plant' 51 | author = 'Jeanine Adkisson' 52 | 53 | # The version info for the project you're documenting, acts as replacement for 54 | # |version| and |release|, also used in various other places throughout the 55 | # built documents. 56 | # 57 | # The short X.Y version. 58 | version = '1.0.1' 59 | # The full version, including alpha/beta/rc tags. 60 | release = '1.0.1-dev1' 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 64 | # 65 | # This is also used if you do content translation via gettext catalogs. 66 | # Usually you set "language" from the command line for these cases. 67 | language = None 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | # This patterns also effect to html_static_path and html_extra_path 72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 73 | 74 | # The name of the Pygments (syntax highlighting) style to use. 75 | pygments_style = 'sphinx' 76 | 77 | # If true, `todo` and `todoList` produce output, else they produce nothing. 78 | todo_include_todos = False 79 | 80 | 81 | # -- Options for HTML output ---------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | html_theme = 'default' 87 | 88 | # Theme options are theme-specific and customize the look and feel of a theme 89 | # further. For a list of options available for each theme, see the 90 | # documentation. 91 | # 92 | # html_theme_options = {} 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 97 | html_static_path = ['_static'] 98 | 99 | 100 | # -- Options for HTMLHelp output ------------------------------------------ 101 | 102 | # Output file base name for HTML help builder. 103 | htmlhelp_basename = 'parsydoc' 104 | 105 | 106 | # -- Options for LaTeX output --------------------------------------------- 107 | 108 | latex_elements = { 109 | # The paper size ('letterpaper' or 'a4paper'). 110 | # 111 | # 'papersize': 'letterpaper', 112 | 113 | # The font size ('10pt', '11pt' or '12pt'). 114 | # 115 | # 'pointsize': '10pt', 116 | 117 | # Additional stuff for the LaTeX preamble. 118 | # 119 | # 'preamble': '', 120 | 121 | # Latex figure (float) alignment 122 | # 123 | # 'figure_align': 'htbp', 124 | } 125 | 126 | # Grouping the document tree into LaTeX files. List of tuples 127 | # (source start file, target name, title, 128 | # author, documentclass [howto, manual, or own class]). 129 | latex_documents = [ 130 | (master_doc, 'parsy.tex', 'parsy Documentation', 131 | 'Jeanine Adkisson', 'manual'), 132 | ] 133 | 134 | 135 | # -- Options for manual page output --------------------------------------- 136 | 137 | # One entry per manual page. List of tuples 138 | # (source start file, name, description, authors, manual section). 
139 | man_pages = [ 140 | (master_doc, 'parsy', 'parsy Documentation', 141 | [author], 1) 142 | ] 143 | 144 | 145 | # -- Options for Texinfo output ------------------------------------------- 146 | 147 | # Grouping the document tree into Texinfo files. List of tuples 148 | # (source start file, target name, title, author, 149 | # dir menu entry, description, category) 150 | texinfo_documents = [ 151 | (master_doc, 'parsy', 'parsy Documentation', 152 | author, 'parsy', 'One line description of project.', 153 | 'Miscellaneous'), 154 | ] 155 | -------------------------------------------------------------------------------- /docs/ref/primitives.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Parsing primitives 3 | ================== 4 | 5 | These are the lowest level building blocks for creating parsers. 6 | 7 | .. module:: parsy 8 | 9 | .. function:: string(expected_string) 10 | 11 | Returns a parser that expects the ``expected_string`` and produces 12 | that string value. 13 | 14 | .. function:: regex(exp, flags=0) 15 | 16 | Returns a parser that expects the given ``exp``, and produces the 17 | matched string. ``exp`` can be a compiled regular expression, or a 18 | string which will be compiled with the given ``flags``. 19 | 20 | Using a regex parser for small building blocks, instead of building up 21 | parsers from primitives like :func:`string`, :func:`test_char` and 22 | :meth:`Parser.times` combinators etc., can have several advantages, 23 | including: 24 | 25 | * It can be more succinct e.g. compare: 26 | 27 | .. code-block:: python 28 | 29 | >>> (string('a') | string('b')).times(1, 4) 30 | >>> regex(r'[ab]{1,4}') 31 | 32 | * It will return the entire matched string as a single item, 33 | so you don't need to use :meth:`Parser.concat`. 34 | * It can be much faster. 35 | 36 | .. function:: test_char(func, description) 37 | 38 | Returns a parser that tests a single character with the callable 39 | ``func``. If ``func`` returns ``True``, the parse succeeds, otherwise 40 | the parse fails with the description ``description``. 41 | 42 | .. code-block:: python 43 | 44 | >>> ascii = test_char(lambda c: ord(c) < 128, 45 | ... 'ascii character') 46 | >>> ascii.parse('A') 47 | 'A' 48 | 49 | .. function:: test_item(func, description) 50 | 51 | Returns a parser that tests a single item from the list of items being 52 | consumed, using the callable ``func``. If ``func`` returns ``True``, the 53 | parse succeeds, otherwise the parse fails with the description 54 | ``description``. 55 | 56 | If you are parsing a string, i.e. a list of characters, you can use 57 | :func:`test_char` instead. (In fact the implementations are identical, these 58 | functions are aliases for the sake of clear code). 59 | 60 | .. code-block:: python 61 | 62 | >>> numeric = test_item(str.isnumeric, 'numeric') 63 | >>> numeric.many().parse(['123', '456']) 64 | ['123', '456'] 65 | 66 | .. function:: char_from(characters) 67 | 68 | Accepts a string and returns a parser that matches and returns one character 69 | from the string. 70 | 71 | .. code-block:: python 72 | 73 | >>> char_from('abc').parse('a') 74 | 'a' 75 | 76 | .. function:: string_from(*strings) 77 | 78 | Accepts a sequence of strings as positional arguments, and returns a parser 79 | that matches and returns one string from the list. The list is first sorted 80 | in descending length order, so that overlapping strings are handled correctly 81 | by checking the longest one first. 82 | 83 | .. 
code-block:: python 84 | 85 | >>> string_from('y', 'yes').parse('yes') 86 | 'yes' 87 | 88 | 89 | .. function:: match_item(item, description=None) 90 | 91 | Returns a parser that tests the next item (or character) from the stream (or 92 | string) for equality against the provided item. Optionally a string 93 | description can be passed. 94 | 95 | Parsing a string: 96 | 97 | >>> letter_A = match_item('A') 98 | >>> letter_A.parse_partial('ABC') 99 | ('A', 'BC') 100 | 101 | Parsing a list of tokens: 102 | 103 | >>> hello = match_item('hello') 104 | >>> hello.parse_partial(['hello', 'how', 'are', 'you']) 105 | ('hello', ['how', 'are', 'you']) 106 | 107 | .. function:: success(val) 108 | 109 | Returns a parser that does not consume any of the stream, but 110 | produces ``val``. 111 | 112 | .. function:: fail(expected) 113 | 114 | Returns a parser that always fails with the provided error message. 115 | 116 | Pre-built parsers 117 | ================= 118 | 119 | Some common, pre-built parsers (all of these are :class:`Parser` objects created 120 | using the primitives above): 121 | 122 | 123 | .. data:: any_char 124 | 125 | A parser that matches any single character. 126 | 127 | .. data:: whitespace 128 | 129 | A parser that matches and returns one or more whitespace characters. 130 | 131 | .. data:: letter 132 | 133 | A parser that matches and returns a single letter, as defined by 134 | `str.isalpha `_. 135 | 136 | .. data:: digit 137 | 138 | A parser that matches and returns a single digit, as defined by `str.isdigit 139 | `_. Note that 140 | this includes various unicode characters outside of the normal 0-9 range, 141 | such as ¹²³. 142 | 143 | .. data:: decimal_digit 144 | 145 | A parser that matches and returns a single decimal digit, one of 146 | "0123456789". 147 | 148 | .. data:: line_info 149 | 150 | A parser that consumes no input and always just returns the current line 151 | information, a tuple of (line, column), zero-indexed, where lines are 152 | terminated by ``\n``. This is normally useful when wanting to build more 153 | debugging information into parse failure error messages. 154 | 155 | .. data:: index 156 | 157 | A parser that consumes no input and always just returns the current stream 158 | index. This is normally useful when wanting to build more debugging 159 | information into parse failure error messages. 
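For example, here is an illustrative sketch (not taken from the parsy source) of a parser that records the span of the text it matched, by sampling ``index`` before and after:

.. code-block:: python

    >>> from parsy import generate, index, regex

    >>> @generate
    ... def spanned_word():
    ...     start = yield index
    ...     word = yield regex(r'[a-z]+')
    ...     end = yield index
    ...     return (start, end, word)

    >>> spanned_word.parse('hello')
    (0, 5, 'hello')

:meth:`Parser.mark` is built in essentially this way, using ``line_info`` instead of ``index``.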
160 | -------------------------------------------------------------------------------- /src/parsy/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- # 2 | 3 | import operator 4 | import re 5 | from .version import __version__ # noqa: F401 6 | from functools import wraps 7 | from collections import namedtuple 8 | 9 | 10 | def line_info_at(stream, index): 11 | if index > len(stream): 12 | raise ValueError("invalid index") 13 | line = stream.count("\n", 0, index) 14 | last_nl = stream.rfind("\n", 0, index) 15 | col = index - (last_nl + 1) 16 | return (line, col) 17 | 18 | 19 | class ParseError(RuntimeError): 20 | def __init__(self, expected, stream, index): 21 | self.expected = expected 22 | self.stream = stream 23 | self.index = index 24 | 25 | def line_info(self): 26 | try: 27 | return '{}:{}'.format(*line_info_at(self.stream, self.index)) 28 | except (TypeError, AttributeError): # not a str 29 | return str(self.index) 30 | 31 | def __str__(self): 32 | expected_list = sorted(repr(e) for e in self.expected) 33 | 34 | if len(expected_list) == 1: 35 | return 'expected {} at {}'.format(expected_list[0], self.line_info()) 36 | else: 37 | return 'expected one of {} at {}'.format(', '.join(expected_list), self.line_info()) 38 | 39 | 40 | class Result(namedtuple('Result', 'status index value furthest expected')): 41 | @staticmethod 42 | def success(index, value): 43 | return Result(True, index, value, -1, frozenset()) 44 | 45 | @staticmethod 46 | def failure(index, expected): 47 | return Result(False, -1, None, index, frozenset([expected])) 48 | 49 | # collect the furthest failure from self and other 50 | def aggregate(self, other): 51 | if not other: 52 | return self 53 | 54 | if self.furthest > other.furthest: 55 | return self 56 | elif self.furthest == other.furthest: 57 | # if we both have the same failure index, we combine the expected messages. 58 | return Result(self.status, self.index, self.value, self.furthest, self.expected | other.expected) 59 | else: 60 | return Result(self.status, self.index, self.value, other.furthest, other.expected) 61 | 62 | 63 | class Parser(object): 64 | """ 65 | A Parser is an object that wraps a function whose arguments are 66 | a string to be parsed and the index on which to begin parsing. 67 | The function should return either Result.success(next_index, value), 68 | where the next index is where to continue the parse and the value is 69 | the yielded value, or Result.failure(index, expected), where expected 70 | is a string indicating what was expected, and the index is the index 71 | of the failure. 72 | """ 73 | 74 | def __init__(self, wrapped_fn): 75 | self.wrapped_fn = wrapped_fn 76 | 77 | def __call__(self, stream, index): 78 | return self.wrapped_fn(stream, index) 79 | 80 | def parse(self, stream): 81 | """Parse a string or list of tokens and return the result or raise a ParseError.""" 82 | (result, _) = (self << eof).parse_partial(stream) 83 | return result 84 | 85 | def parse_partial(self, stream): 86 | """ 87 | Parse the longest possible prefix of a given string. 88 | Return a tuple of the result and the rest of the string, 89 | or raise a ParseError. 
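        For example, string('ab').parse_partial('abcd') gives ('ab', 'cd').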
90 | """ 91 | result = self(stream, 0) 92 | 93 | if result.status: 94 | return (result.value, stream[result.index:]) 95 | else: 96 | raise ParseError(result.expected, stream, result.furthest) 97 | 98 | def bind(self, bind_fn): 99 | @Parser 100 | def bound_parser(stream, index): 101 | result = self(stream, index) 102 | 103 | if result.status: 104 | next_parser = bind_fn(result.value) 105 | return next_parser(stream, result.index).aggregate(result) 106 | else: 107 | return result 108 | 109 | return bound_parser 110 | 111 | def map(self, map_fn): 112 | return self.bind(lambda res: success(map_fn(res))) 113 | 114 | def combine(self, combine_fn): 115 | return self.bind(lambda res: success(combine_fn(*res))) 116 | 117 | def concat(self): 118 | return self.map(''.join) 119 | 120 | def then(self, other): 121 | return seq(self, other).combine(lambda left, right: right) 122 | 123 | def skip(self, other): 124 | return seq(self, other).combine(lambda left, right: left) 125 | 126 | def result(self, res): 127 | return self >> success(res) 128 | 129 | def many(self): 130 | return self.times(0, float('inf')) 131 | 132 | def times(self, min, max=None): 133 | # max=None means exactly min 134 | # min=max=None means from 0 to infinity 135 | if max is None: 136 | max = min 137 | 138 | @Parser 139 | def times_parser(stream, index): 140 | values = [] 141 | times = 0 142 | result = None 143 | 144 | while times < max: 145 | result = self(stream, index).aggregate(result) 146 | if result.status: 147 | values.append(result.value) 148 | index = result.index 149 | times += 1 150 | elif times >= min: 151 | break 152 | else: 153 | return result 154 | 155 | return Result.success(index, values).aggregate(result) 156 | 157 | return times_parser 158 | 159 | def at_most(self, n): 160 | return self.times(0, n) 161 | 162 | def at_least(self, n): 163 | return self.times(n) + self.many() 164 | 165 | def sep_by(self, sep, *, min=0, max=float('inf')): 166 | zero_times = success([]) 167 | if max == 0: 168 | return zero_times 169 | res = self.times(1) + (sep >> self).times(min - 1, max - 1) 170 | if min == 0: 171 | res |= zero_times 172 | return res 173 | 174 | def desc(self, description): 175 | @Parser 176 | def desc_parser(stream, index): 177 | result = self(stream, index) 178 | if result.status: 179 | return result 180 | else: 181 | return Result.failure(index, description) 182 | 183 | return desc_parser 184 | 185 | def mark(self): 186 | @generate 187 | def marked(): 188 | start = yield line_info 189 | body = yield self 190 | end = yield line_info 191 | return (start, body, end) 192 | 193 | return marked 194 | 195 | def should_fail(self, description): 196 | @Parser 197 | def fail_parser(stream, index): 198 | res = self(stream, index) 199 | if res.status: 200 | return Result.failure(index, description) 201 | return Result.success(index, res) 202 | 203 | return fail_parser 204 | 205 | def __add__(self, other): 206 | return seq(self, other).combine(operator.add) 207 | 208 | def __mul__(self, other): 209 | if isinstance(other, range): 210 | return self.times(other.start, other.stop - 1) 211 | return self.times(other) 212 | 213 | def __or__(self, other): 214 | return alt(self, other) 215 | 216 | # haskelley operators, for fun # 217 | 218 | # >> 219 | def __rshift__(self, other): 220 | return self.then(other) 221 | 222 | # << 223 | def __lshift__(self, other): 224 | return self.skip(other) 225 | 226 | 227 | def alt(*parsers): 228 | if not parsers: 229 | return fail('') 230 | 231 | @Parser 232 | def alt_parser(stream, index): 233 | 
result = None 234 | for parser in parsers: 235 | result = parser(stream, index).aggregate(result) 236 | if result.status: 237 | return result 238 | 239 | return result 240 | 241 | return alt_parser 242 | 243 | 244 | def seq(*parsers): 245 | """ 246 | Takes a list of list of parsers, runs them in order, 247 | and collects their individuals results in a list 248 | """ 249 | if not parsers: 250 | return success([]) 251 | 252 | @Parser 253 | def seq_parser(stream, index): 254 | result = None 255 | values = [] 256 | for parser in parsers: 257 | result = parser(stream, index).aggregate(result) 258 | if not result.status: 259 | return result 260 | index = result.index 261 | values.append(result.value) 262 | 263 | return Result.success(index, values).aggregate(result) 264 | 265 | return seq_parser 266 | 267 | 268 | # combinator syntax 269 | def generate(fn): 270 | if isinstance(fn, str): 271 | return lambda f: generate(f).desc(fn) 272 | 273 | @Parser 274 | @wraps(fn) 275 | def generated(stream, index): 276 | # start up the generator 277 | iterator = fn() 278 | 279 | result = None 280 | value = None 281 | try: 282 | while True: 283 | next_parser = iterator.send(value) 284 | result = next_parser(stream, index).aggregate(result) 285 | if not result.status: 286 | return result 287 | value = result.value 288 | index = result.index 289 | except StopIteration as stop: 290 | returnVal = stop.value 291 | if isinstance(returnVal, Parser): 292 | return returnVal(stream, index).aggregate(result) 293 | 294 | return Result.success(index, returnVal).aggregate(result) 295 | 296 | return generated 297 | 298 | 299 | index = Parser(lambda _, index: Result.success(index, index)) 300 | line_info = Parser(lambda stream, index: Result.success(index, line_info_at(stream, index))) 301 | 302 | 303 | def success(val): 304 | return Parser(lambda _, index: Result.success(index, val)) 305 | 306 | 307 | def fail(expected): 308 | return Parser(lambda _, index: Result.failure(index, expected)) 309 | 310 | 311 | def string(s): 312 | slen = len(s) 313 | 314 | @Parser 315 | def string_parser(stream, index): 316 | if stream[index:index + slen] == s: 317 | return Result.success(index + slen, s) 318 | else: 319 | return Result.failure(index, s) 320 | 321 | return string_parser 322 | 323 | 324 | def regex(exp, flags=0): 325 | if isinstance(exp, str): 326 | exp = re.compile(exp, flags) 327 | 328 | @Parser 329 | def regex_parser(stream, index): 330 | match = exp.match(stream, index) 331 | if match: 332 | return Result.success(match.end(), match.group(0)) 333 | else: 334 | return Result.failure(index, exp.pattern) 335 | 336 | return regex_parser 337 | 338 | 339 | def test_item(func, description): 340 | @Parser 341 | def test_item_parser(stream, index): 342 | if index < len(stream): 343 | item = stream[index] 344 | if func(item): 345 | return Result.success(index + 1, item) 346 | return Result.failure(index, description) 347 | 348 | return test_item_parser 349 | 350 | 351 | def test_char(func, description): 352 | # Implementation is identical to test_item 353 | return test_item(func, description) 354 | 355 | 356 | def match_item(item, description=None): 357 | if description is None: 358 | description = str(item) 359 | return test_item(lambda i: item == i, description) 360 | 361 | 362 | def string_from(*strings): 363 | # Sort longest first, so that overlapping options work correctly 364 | return alt(*map(string, sorted(strings, key=len, reverse=True))) 365 | 366 | 367 | def char_from(string): 368 | return test_char(lambda c: c in string, "[" 
+ string + "]") 369 | 370 | 371 | any_char = test_char(lambda c: True, "any character") 372 | 373 | whitespace = regex(r'\s+') 374 | 375 | letter = test_char(lambda c: c.isalpha(), 'a letter') 376 | 377 | digit = test_char(lambda c: c.isdigit(), 'a digit') 378 | 379 | decimal_digit = char_from("0123456789") 380 | 381 | 382 | @Parser 383 | def eof(stream, index): 384 | if index >= len(stream): 385 | return Result.success(index, None) 386 | else: 387 | return Result.failure(index, 'EOF') 388 | -------------------------------------------------------------------------------- /docs/ref/methods_and_combinators.rst: -------------------------------------------------------------------------------- 1 | ========================================= 2 | Parser methods, operators and combinators 3 | ========================================= 4 | 5 | Parser methods 6 | ============== 7 | 8 | Parser objects are returned by any of the built-in parser :doc:`primitives`. They 9 | can be used and manipulated as below. 10 | 11 | .. currentmodule:: parsy 12 | 13 | .. class:: Parser 14 | 15 | .. method:: __init__(wrapped_fn) 16 | 17 | This is a low level function to create new parsers that is used internally 18 | but is rarely needed by users of the parsy library. It should be passed a 19 | parsing function, which takes two arguments - a string/list to be parsed 20 | and the current index into the list - and returns a :class:`Result` object, 21 | as described in :doc:`/ref/parser_instances`. 22 | 23 | The following methods are for actually **using** the parsers that you have 24 | created: 25 | 26 | .. method:: parse(string_or_list) 27 | 28 | Attempts to parse the given string (or list). If the parse is successful 29 | and consumes the entire string, the result is returned - otherwise, a 30 | ``ParseError`` is raised. 31 | 32 | Instead of passing a string, you can in fact pass a list of tokens. Almost 33 | all the examples assume strings for simplicity. Some of the primitives are 34 | also clearly string specific, and a few of the combinators (such as 35 | :meth:`Parser.concat`) are string specific, but most of the rest of the 36 | library will work with tokens just as well. See :doc:`/howto/lexing` for 37 | more information. 38 | 39 | .. method:: parse_partial(string_or_list) 40 | 41 | Similar to ``parse``, except that it does not require the entire 42 | string (or list) to be consumed. Returns a tuple of 43 | ``(result, remainder)``, where ``remainder`` is the part of 44 | the string (or list) that was left over. 45 | 46 | The following methods are essentially **combinators** that produce new 47 | parsers from the existing one. They are provided as methods on ``Parser`` for 48 | convenience. More combinators are documented below. 49 | 50 | .. method:: desc(string) 51 | 52 | Adds a desciption to the parser, which is used in the error message 53 | if parsing fails. 54 | 55 | >>> year = regex(r'[0-9]{4}').desc('4 digit year') 56 | >>> year.parse('123') 57 | ParseError: expected 4 digit year at 0:0 58 | 59 | .. method:: then(other_parser) 60 | 61 | Returns a parser which, if the initial parser succeeds, will continue parsing 62 | with ``other_parser``. This will produce the value produced by 63 | ``other_parser``. 64 | 65 | .. code:: python 66 | 67 | >>> string('x').then(string('y')).parse('xy') 68 | 'y' 69 | 70 | See also :ref:`parser-rshift`. 71 | 72 | .. method:: skip(other_parser) 73 | 74 | Similar to :meth:`Parser.then`, except the resulting parser will use 75 | the value produced by the first parser. 76 | 77 | .. 
code:: python 78 | 79 | >>> string('x').skip(string('y')).parse('xy') 80 | 'x' 81 | 82 | See also :ref:`parser-lshift`. 83 | 84 | .. method:: many() 85 | 86 | Returns a parser that expects the initial parser 0 or more times, and 87 | produces a list of the results. Note that this parser does not fail if 88 | nothing matches, but instead consumes nothing and produces an empty list. 89 | 90 | .. code:: python 91 | 92 | >>> parser = regex(r'[a-z]').many() 93 | >>> parser.parse('') 94 | [] 95 | >>> parser.parse('abc') 96 | ['a', 'b', 'c'] 97 | 98 | .. method:: times(min [, max=min]) 99 | 100 | Returns a parser that expects the initial parser at least ``min`` times, 101 | and at most ``max`` times, and produces a list of the results. If only one 102 | argument is given, the parser is expected exactly that number of times. 103 | 104 | .. method:: at_most(n) 105 | 106 | Returns a parser that expects the initial parser at most ``n`` times, and 107 | produces a list of the results. 108 | 109 | .. method:: at_least(n) 110 | 111 | Returns a parser that expects the initial parser at least ``n`` times, and 112 | produces a list of the results. 113 | 114 | .. method:: map(fn) 115 | 116 | Returns a parser that transforms the produced value of the initial parser 117 | with ``fn``. 118 | 119 | .. code:: python 120 | 121 | >>> regex(r'[0-9]+').map(int).parse('1234') 122 | 1234 123 | 124 | This is the simplest way to convert parsed strings into the data types 125 | that you need. 126 | 127 | .. method:: combine(fn) 128 | 129 | Returns a parser that transforms the produced values of the initial parser 130 | with ``fn``, passing the arguments using ``*args`` syntax. 131 | 132 | Where the current parser produces an iterable of values, this can be a 133 | more convenient way to combine them than :meth:`~Parser.map`. 134 | 135 | Example 1 - the argument order of our callable already matches: 136 | 137 | .. code:: python 138 | 139 | >>> from datetime import date 140 | >>> yyyymmdd = seq(regex(r'[0-9]{4}').map(int), 141 | ... regex(r'[0-9]{2}').map(int), 142 | ... regex(r'[0-9]{2}').map(int)).combine(date) 143 | >>> yyyymmdd.parse('20140506') 144 | datetime.date(2014, 5, 6) 145 | 146 | Example 2 - the argument order of our callable doesn't match, and 147 | we need to adjust a parameter, so we can fix it using a lambda. 148 | 149 | .. code:: python 150 | 151 | >>> ddmmyy = regex(r'[0-9]{2}').map(int).times(3).combine( 152 | ... lambda d, m, y: date(2000 + y, m, d)) 153 | >>> ddmmyy.parse('060514') 154 | datetime.date(2014, 5, 6) 155 | 156 | The equivalent ``lambda`` to use with ``map`` would be ``lambda res: 157 | date(2000 + res[2], res[1], res[0])``, which is less readable. The version 158 | with ``combine`` also ensures that exactly 3 items are generated by the 159 | previous parser, otherwise you get a ``TypeError``. 160 | 161 | .. method:: concat() 162 | 163 | Returns a parser that concatenates together (as a string) the previously 164 | produced values. Usually used after :meth:`~Parser.many` and similar 165 | methods that produce multiple values. 166 | 167 | .. code:: python 168 | 169 | >>> letter.at_least(1).parse("hello") 170 | ['h', 'e', 'l', 'l', 'o'] 171 | >>> letter.at_least(1).concat().parse("hello") 172 | 'hello' 173 | 174 | .. method:: result(val) 175 | 176 | Returns a parser that, if the initial parser succeeds, always produces 177 | ``val``. 178 | 179 | .. code:: python 180 | 181 | >>> string('foo').result(42).parse('foo') 182 | 42 183 | 184 | .. 
method:: should_fail(description) 185 | 186 | Returns a parser that fails when the initial parser succeeds, and succeeds 187 | when the initial parser fails (consuming no input). A description must 188 | be passed which is used in parse failure messages. 189 | 190 | This is essentially a negative lookahead: 191 | 192 | .. code:: python 193 | 194 | >>> p = letter << string(" ").should_fail("not space") 195 | >>> p.parse('A') 196 | 'A' 197 | >>> p.parse('A ') 198 | ParseError: expected 'not space' at 0:1 199 | 200 | It is also useful for implementing things like parsing repeatedly until a 201 | marker: 202 | 203 | .. code:: python 204 | 205 | >>> (string(";").should_fail("not ;") >> letter).many().concat().parse_partial('ABC;') 206 | ('ABC', ';') 207 | 208 | .. method:: bind(fn) 209 | 210 | Returns a parser which, if the initial parser is successful, passes the 211 | result to ``fn``, and continues with the parser returned from ``fn``. 212 | This is the monadic binding operation. 213 | 214 | .. method:: sep_by(sep, min=0, max=inf) 215 | 216 | Like :meth:`Parser.times`, this returns a new parser that repeats 217 | the initial parser and collects the results in a list, but in this case separated 218 | by the parser ``sep`` (whose return value is discarded). By default it 219 | repeats with no limit, but minimum and maximum values can be supplied. 220 | 221 | .. code:: python 222 | 223 | >>> csv = letter.at_least(1).concat().sep_by(string(",")) 224 | >>> csv.parse("abc,def") 225 | ['abc', 'def'] 226 | 227 | .. _operators: 228 | 229 | Parser operators 230 | ================ 231 | 232 | This section describes operators that you can use on :class:`Parser` objects to 233 | build new parsers. 234 | 235 | 236 | .. _parser-or: 237 | 238 | ``|`` operator 239 | -------------- 240 | 241 | ``parser | other_parser`` 242 | 243 | Returns a parser that tries ``parser`` and, if it fails, backtracks 244 | and tries ``other_parser``. These can be chained together. 245 | 246 | The resulting parser will produce the value produced by the first 247 | successful parser. 248 | 249 | .. code:: python 250 | 251 | >>> parser = string('x') | string('y') | string('z') 252 | >>> parser.parse('x') 253 | 'x' 254 | >>> parser.parse('y') 255 | 'y' 256 | >>> parser.parse('z') 257 | 'z' 258 | 259 | >>> (string('x') >> string('y')).parse('xy') 260 | 'y' 261 | 262 | .. _parser-lshift: 263 | 264 | ``<<`` operator 265 | --------------- 266 | 267 | ``parser << other_parser`` 268 | 269 | The same as ``parser.skip(other_parser)`` - see :meth:`Parser.skip`. 270 | 271 | (Hint - the arrows point at the important parser!) 272 | 273 | .. code:: python 274 | 275 | >>> (string('x') << string('y')).parse('xy') 276 | 'x' 277 | 278 | .. _parser-rshift: 279 | 280 | ``>>`` operator 281 | --------------- 282 | 283 | ``parser >> other_parser`` 284 | 285 | The same as ``parser.then(other_parser)`` - see :meth:`Parser.then`. 286 | 287 | (Hint - the arrows point at the important parser!) 288 | 289 | .. code-block:: python 290 | 291 | >>> (string('x') >> string('y')).parse('xy') 292 | 'y' 293 | 294 | 295 | .. _parser-plus: 296 | 297 | ``+`` operator 298 | -------------- 299 | 300 | ``parser1 + parser2`` 301 | 302 | Requires both parsers to match in order, and adds the two results together using 303 | the + operator. This will only work if the results support the plus operator 304 | (e.g. strings and lists): 305 | 306 | 307 | .. 
code-block:: python 308 | 309 | >>> (string("x") + regex("[0-9]")).parse("x1") 310 | "x1" 311 | 312 | >>> (string("x").many() + regex("[0-9]").map(int).many()).parse("xx123") 313 | ['x', 'x', 1, 2, 3] 314 | 315 | The plus operator is a convenient shortcut for: 316 | 317 | >>> seq(parser1, parser2).combine(lambda a, b: a + b) 318 | 319 | .. _parser-times: 320 | 321 | ``*`` operator 322 | -------------- 323 | 324 | ``parser1 * number`` 325 | 326 | This is a shortcut for doing :meth:`Parser.times`: 327 | 328 | .. code-block:: python 329 | 330 | >>> (string("x") * 3).parse("xxx") 331 | ["x", "x", "x"] 332 | 333 | You can also set both upper and lower bounds by multiplying by a range: 334 | 335 | .. code-block:: python 336 | 337 | >>> (string("x") * range(0, 3)).parse("xxx") 338 | ParseError: expected EOF at 0:2 339 | 340 | (Note the normal semantics of ``range`` are respected - the second number is an 341 | *exclusive* upper bound, not inclusive). 342 | 343 | Parser combinators 344 | ================== 345 | 346 | .. function:: alt(*parsers) 347 | 348 | Creates a parser from the passed in argument list of alternative parsers, 349 | which are tried in order, moving to the next one if the current one fails, as 350 | per the :ref:`parser-or` - in other words, it matches any one of the 351 | alternative parsers. 352 | 353 | Example using `*arg` syntax to pass a list of parsers that have been 354 | generated by mapping :func:`string` over a list of characters: 355 | 356 | .. code-block:: python 357 | 358 | >>> hexdigit = alt(*map(string, "0123456789abcdef")) 359 | 360 | (In this case you would be better off using :func:`char_from`) 361 | 362 | .. function:: seq(*parsers) 363 | 364 | Creates a parser that runs a sequence of parsers in order and combines 365 | their results in a list. 366 | 367 | 368 | .. code-block:: python 369 | 370 | >>> x_bottles_of_y_on_the_z = \ 371 | ... seq(regex(r"[0-9]+").map(int) << string(" bottles of "), 372 | ... regex(r"\S+") << string(" on the "), 373 | ... regex(r"\S+") 374 | ... ) 375 | >>> x_bottles_of_y_on_the_z.parse("99 bottles of beer on the wall") 376 | [99, 'beer', 'wall'] 377 | 378 | Other combinators 379 | ================= 380 | 381 | Parsy does not try to include every possible combinator - there is no reason why 382 | you cannot create your own for your needs using the built-in combinators and 383 | primitives. If you find something that is very generic and would be very useful 384 | to have as a built-in, please :doc:`submit `: as a PR! 385 | -------------------------------------------------------------------------------- /docs/tutorial.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Tutorial 3 | ======== 4 | 5 | .. currentmodule:: parsy 6 | 7 | First :doc:`install parsy `, and check that the documentation you 8 | are reading matches the version you just installed. 9 | 10 | Building an ISO 8601 parser 11 | =========================== 12 | 13 | In this tutorial, we are going to gradually build a parser for a subset of an 14 | ISO 8601 date. Specifically, we want to handle dates that look like this: 15 | ``2017-09-25``. 16 | 17 | A problem of this size could admittedly be solved fairly easily with regexes. 18 | But very quickly regexes don't scale, especially when it comes to getting the 19 | parsed data out, and for this tutorial we need to start with a simple example. 20 | 21 | With parsy, you start by breaking the problem down into the smallest components. 
22 | So we need first to match the 4 digit year at the beginning. 23 | 24 | There are various ways we can do this, but a regex works nicely, and 25 | :func:`regex` is a built-in primitive of the parsy library: 26 | 27 | .. code-block:: python 28 | 29 | >>> from parsy import regex 30 | >>> year = regex(r'[0-9]{4}') 31 | 32 | This has produced a :class:`Parser` object which has various methods. We can 33 | immediately check that it works using the :meth:`Parser.parse` method: 34 | 35 | .. code-block:: python 36 | 37 | >>> year.parse('2017') 38 | '2017' 39 | >>> year.parse('abc') 40 | ParseError: expected '[0-9]{4}' at 0:0 41 | 42 | Notice first of all that a parser consumes input (the value we pass to 43 | ``parse``), and it produces an output. In the case of ``regex``, the produced 44 | output is the string that was matched, but this doesn't have to be the case for 45 | all parsers. 46 | 47 | If there is no match, it raises a ``ParseError``. 48 | 49 | Notice as well that the parser expects to consume all the input, so if there are 50 | extra characters at the end, even if it is just whitespace, parsing will fail 51 | with a message saying it expected EOF (End Of File/Data): 52 | 53 | .. code-block:: python 54 | 55 | >>> year.parse('2017 ') 56 | ParseError: expected 'EOF' at 0:4 57 | 58 | To parse the data, we need to parse months, days, and the dash symbol, so we'll 59 | add those: 60 | 61 | .. code-block:: python 62 | 63 | >>> from parsy import string 64 | >>> month = regex('[0-9]{2}') 65 | >>> day = regex('[0-9]{2}') 66 | >>> dash = string('-') 67 | 68 | We've added use of the :func:`string` primitive here, that matches just the 69 | string passed in, and returns that string. 70 | 71 | Next we need to combine these parsers into something that will parse the whole 72 | date. The simplest way is to use the :meth:`Parser.then` method: 73 | 74 | .. code-block:: python 75 | 76 | >>> fulldate = year.then(dash).then(month).then(dash).then(day) 77 | 78 | The ``then`` method returns a new parser that requires the first parser to 79 | succeed, followed by the second parser (the argument to the method). 80 | 81 | We could also write this using the :ref:`parser-rshift` which 82 | does the same thing as :meth:`Parser.then`: 83 | 84 | .. code-block:: python 85 | 86 | >>> fulldate = year >> dash >> month >> dash >> day 87 | 88 | This parser has some problems which we need to address, but it is already useful 89 | as a basic validator: 90 | 91 | .. code-block:: python 92 | 93 | >>> fulldate.parse('2017-xx') 94 | ParseError: expected '[0-9]{2}' at 0:5 95 | >>> fulldate.parse('2017-01') 96 | ParseError: expected '-' at 0:7 97 | >>> fulldate.parse('2017-02-01') 98 | '01' 99 | 100 | If the parse doesn't succeed, we'll get ``ParseError``, otherwise it is valid 101 | (at least as far as the basic syntax checks we've added). 102 | 103 | The first problem with this parser is that it doesn't return a very useful 104 | value. Due to the way that :meth:`Parser.then` works, when it combines two 105 | parsers to produce a larger one, the value from the first parser is discarded, 106 | and the value returned by the second parser is the overall return value. So, we 107 | end up getting only the 'day' component as the result of our parse. We really 108 | want the year, month and day packaged up nicely, and converted to integers. 109 | 110 | A second problem is that our error messages are not very friendly. 111 | 112 | Our first attempt at fixing these might be to use the :ref:`parser-plus` instead 113 | of ``then``. 
This operator is defined to combine the results of the two parsers 114 | using the normal plus operator, which will work fine on strings: 115 | 116 | >>> fulldate = year + dash + month + dash + day 117 | >>> fulldate.parse('2017-02-01') 118 | '2017-02-01' 119 | 120 | However, it won't help us if we want to split our data up into a set of 121 | integers. 122 | 123 | Our first step should actually be to work on the year, month and day components 124 | using :meth:`Parser.map`, which allows us to convert the strings to other 125 | objects - in our case we want integers. 126 | 127 | We can also use the :meth:`Parser.desc` method to give nicer error messages, so 128 | our components now look like this: 129 | 130 | .. code-block:: python 131 | 132 | >>> year = regex('[0-9]{4}').map(int).desc('4 digit year') 133 | >>> month = regex('[0-9]{2}').map(int).desc('2 digit month') 134 | >>> day = regex('[0-9]{2}').map(int).desc('2 digit day') 135 | 136 | We get better error messages now: 137 | 138 | .. code-block:: python 139 | 140 | >>> year.then(dash).then(month).parse('2017-xx') 141 | ParseError: expected '2 digit month' at 0:5 142 | 143 | 144 | Notice that the ``map`` and ``desc`` methods, like all similar methods on 145 | ``Parser``, return new parser objects - they do not modify the existing one. 146 | This allows us to build up parsers with a 'fluent' interface, and avoid problems 147 | caused by mutating objects. 148 | 149 | However, we still need a way to package up the year, month and day as separate 150 | values. 151 | 152 | The :func:`seq` combinator provides one easy way to do that. It takes the 153 | parsers that are passed in as arguments, and combines their results into a 154 | list: 155 | 156 | .. code-block:: python 157 | 158 | >>> fulldate = seq(year, dash, month, dash, day) 159 | >>> fulldate.parse('2017-01-02') 160 | [2017, '-', 1, '-', 2] 161 | 162 | Now, we don't need those dashes, so we can eliminate them using the :ref:`parser-rshift` or :ref:`parser-lshift`: 163 | 164 | .. code-block:: python 165 | 166 | >>> fulldate = seq(year, dash >> month, dash >> day) 167 | >>> fulldate.parse('2017-01-02') 168 | [2017, 1, 2] 169 | 170 | At this point, we could also convert this to a date object if we wanted using 171 | :meth:`Parser.combine`: 172 | 173 | .. code-block:: python 174 | 175 | >>> from datetime import date 176 | >>> fulldate = seq(year, dash >> month, dash >> day).combine(date) 177 | 178 | We could have used :meth:`Parser.map` here, but :meth:`Parser.combine` is a bit 179 | nicer. It's especially succinct because the argument order to ``date`` matches 180 | the order of the values parsed (year, month, day), otherwise we could pass a 181 | ``lambda`` to ``combine``. 182 | 183 | .. _using-previous-values: 184 | 185 | Using previously parsed values 186 | ============================== 187 | 188 | Now, sometimes we might want to do more complex logic with the values that are 189 | collected as parse results, and do so while we are still parsing. 190 | 191 | To continue our example, the above parser has the problem that it will raise an 192 | exception if the day and month values are not valid. We'd like to be able to 193 | check this, and produce a parse error instead, which will make our parser play 194 | better with others if we want to use it to build something bigger. 195 | 196 | Also, in ISO 8601, strictly speaking you can just write the year, or the year and 197 | the month, and leave off the other parts. 
We'd like to handle that by returning 198 | a tuple for the result, and ``None`` for the missing data. 199 | 200 | To do this, we need to allow the parse to continue if the later components (with 201 | their leading dashes) are missing - that is, we need to express optional 202 | components, and we need a way to be able to test earlier values while in the 203 | middle of parsing, to see if we should continue looking for another component. 204 | 205 | The :meth:`Parser.bind` method provides one way to do it (yay monads!). You pass 206 | it a function that takes the output value from one parser as its input, and 207 | returns another parser as its output. (An example will help!) By appropriate use 208 | of closures, plus the :func:`success` primitive to return our values as a tuple, 209 | we can put together a parser. 210 | 211 | For our first attempt, we'll make a parser that is similar to the previous ones 212 | and requires the full date to be present. It might look like this: 213 | 214 | .. code-block:: python 215 | 216 | fulldate = \ 217 | year.skip(dash).bind(lambda y: 218 | month.skip(dash).bind(lambda m: 219 | day.bind(lambda d: 220 | success((y, m, d))))) 221 | 222 | That is not a pretty sight, and it will get even worse if we want to use 223 | statements that are not allowed inside a lambda, and therefore need to define 224 | the callables using ``def``. Can we do better? 225 | 226 | In Haskell, there is ``do`` notation that eliminates the lambdas. We don't have 227 | that in Python, but instead we can use generators and the ``yield`` keyword to 228 | great effect. 229 | 230 | Instead of wrangling lambdas or having to create many little functions, we 231 | use a generator function and convert it into a parser by using the 232 | :func:`generate` decorator. The idea is that you ``yield`` every parser that you 233 | want to run, and receive the result of that parser as the value of the yield 234 | expression. You can then put parsers together using any logic you like, and 235 | finally return the value. 236 | 237 | An equivalent parser to the one above can be written like this: 238 | 239 | .. code-block:: python 240 | 241 | @generate 242 | def full_date(): 243 | y = yield year 244 | yield dash # implicit skip, since we do nothing with the value 245 | m = yield month 246 | yield dash 247 | d = yield day 248 | return (y, m, d) 249 | 250 | This is much better, and provides a good starting point for our next set of 251 | requirements. 252 | 253 | First of all, we need to express optional components - that is we need to be 254 | able to handle missing dashes, and return what we've got so far rather than 255 | failing the whole parse. 256 | 257 | :class:`Parser` has a set of methods that convert parsers into ones that allow 258 | multiples of the parser - including :meth:`Parser.times`, :meth:`Parser.at_most` 259 | and :meth:`Parser.at_least`. 260 | 261 | The :meth:`Parser.at_most` method will take the initial parser and return one 262 | that succeeds if there are between zero and n repetitions of matching input. It 263 | returns a (possibly empty) list of produced values. With ``n=1`` we can get an 264 | optional dash, and then check the length of what was produced to see if 265 | a dash was present. 266 | 267 | We also need to do checking on the month and the day. We'll take a shortcut and 268 | use the built-in ``datetime.date`` class to do the validation for us. However, 269 | rather than allow exceptions to be raised, we convert the exception into a 270 | parsing failure. 
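Before the full example, here is a minimal, self-contained sketch of that technique (the parser name and the limit used here are invented purely for illustration): returning the result of ``fail()`` from a ``@generate`` function makes parsy run that always-failing parser at the current position, so a semantic check surfaces as an ordinary ``ParseError``.

.. code-block:: python

    from parsy import fail, generate, regex

    @generate
    def small_number():
        # Parse one or more digits and convert to an int.
        n = yield regex(r'[0-9]+').map(int)
        if n > 100:
            # Turn a semantic check into a parse failure.
            return fail('an integer no greater than 100')
        return n

    # small_number.parse('42')   returns 42
    # small_number.parse('123')  raises ParseError: expected 'an integer no greater than 100' at 0:3

The full date parser, which applies the same idea to validating the month and day, follows: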
271 | 272 | 273 | .. code-block:: python 274 | 275 | optional_dash = dash.at_most(1) 276 | 277 | @generate 278 | def full_or_partial_date(): 279 | d = None 280 | m = None 281 | y = yield year 282 | dash1 = yield optional_dash 283 | if len(dash1) > 0: 284 | m = yield month 285 | dash2 = yield optional_dash 286 | if len(dash2) > 0: 287 | d = yield day 288 | if m is not None: 289 | if m < 1 or m > 12: 290 | return fail("month must be in 1..12") 291 | if d is not None: 292 | try: 293 | datetime.date(y, m, d) 294 | except ValueError as e: 295 | return fail(e.args[0]) 296 | 297 | return (y, m, d) 298 | 299 | 300 | This now works as expected: 301 | 302 | .. code-block:: python 303 | 304 | >>> full_or_partial_date.parse('2017-02') 305 | (2017, 2, None) 306 | >>> full_or_partial_date.parse('2017-02-29') 307 | ParseError: expected 'day is out of range for month' at 0:10 308 | 309 | We could of course use a custom object in the final line to return a more 310 | convenient data type, if wanted. 311 | 312 | Alternatives and backtracking 313 | ============================= 314 | 315 | Suppose we are using our date parser to scrape dates off articles on a web site. 316 | We then discover that for recently published articles, instead of printing a 317 | timestamp, they write "X days ago". 318 | 319 | We want to parse this, and we'll use a timedelta object to represent the value 320 | (to easily distinguish it from other values and consume it later). We can write 321 | a parser for this easily: 322 | 323 | .. code-block:: python 324 | 325 | >>> days_ago = regex("[0-9]+").map(lambda d: timedelta(days=-int(d))) << string(" days ago") 326 | >>> days_ago.parse("5 days ago") 327 | datetime.timedelta(-5) 328 | 329 | Now we need to combine it with our date parser, and allow either to succeed. 330 | This is done using the :ref:`parser-or`, as follows: 331 | 332 | 333 | .. code-block:: python 334 | 335 | >>> flexi_date = full_or_partial_date | days_ago 336 | >>> flexi_date.parse('2012-01-05') 337 | (2012, 1, 5) 338 | >>> flexi_date.parse("2 days ago") 339 | datetime.timedelta(-2) 340 | 341 | Notice that you still get good error messages from the appropriate parser, 342 | depending on which parser got furthest before returning a failure: 343 | 344 | .. code-block:: python 345 | 346 | >>> flexi_date.parse('2012-') 347 | ParseError: expected '2 digit month' at 0:5 348 | >>> flexi_date.parse('2 years ago') 349 | ParseError: expected ' days ago' at 0:1 350 | 351 | When using backtracking, you need to understand that backtracking to the other 352 | option only occurs if the first parser fails. So, for example: 353 | 354 | .. code-block:: python 355 | 356 | >>> a = string("a") 357 | >>> ab = string("ab") 358 | >>> c = string("c") 359 | >>> a_or_ab_and_c = ((a | ab) + c) 360 | >>> a_or_ab_and_c.parse('ac') 361 | 'ac' 362 | >>> a_or_ab_and_c.parse('abc') 363 | ParseError: expected 'c' at 0:1 364 | 365 | The parse fails because the ``a`` parser succeeds, and so the ``ab`` parser is 366 | never tried. This is different from most regular expression engines, where 367 | backtracking is done over the whole regex by default. 368 | 369 | In this case we can get the parse to succeed by switching the order: 370 | 371 | .. code-block:: python 372 | 373 | >>> ((ab | a) + c).parse('abc') 374 | 'abc' 375 | 376 | >>> ((ab | a) + c).parse('ac') 377 | 'ac' 378 | 379 | We could also fix it like this: 380 | 381 | .. 
code-block:: python 382 | 383 | >>> ((a + c) | (ab + c)).parse('abc') 384 | 'abc' 385 | 386 | Learn more 387 | ========== 388 | 389 | For further topics, see the :doc:`table of contents ` for the rest of 390 | the documentation that should enable you to build parsers for your needs. 391 | -------------------------------------------------------------------------------- /test/test_parsy.py: -------------------------------------------------------------------------------- 1 | # -*- code: utf8 -*- 2 | import re 3 | import unittest 4 | 5 | from parsy import test_char as parsy_test_char # to stop pytest thinking this function is a test 6 | from parsy import test_item as parsy_test_item # to stop pytest thinking this function is a test 7 | from parsy import ( 8 | ParseError, alt, any_char, char_from, decimal_digit, digit, generate, index, letter, line_info, line_info_at, 9 | match_item, regex, seq, string, string_from, whitespace 10 | ) 11 | 12 | 13 | class TestParser(unittest.TestCase): 14 | 15 | def test_string(self): 16 | parser = string('x') 17 | self.assertEqual(parser.parse('x'), 'x') 18 | 19 | self.assertRaises(ParseError, parser.parse, 'y') 20 | 21 | def test_regex(self): 22 | parser = regex(r'[0-9]') 23 | 24 | self.assertEqual(parser.parse('1'), '1') 25 | self.assertEqual(parser.parse('4'), '4') 26 | 27 | self.assertRaises(ParseError, parser.parse, 'x') 28 | 29 | def test_regex_compiled(self): 30 | parser = regex(re.compile(r'[0-9]')) 31 | self.assertEqual(parser.parse('1'), '1') 32 | self.assertRaises(ParseError, parser.parse, 'x') 33 | 34 | def test_then(self): 35 | xy_parser = string('x') >> string('y') 36 | self.assertEqual(xy_parser.parse('xy'), 'y') 37 | 38 | self.assertRaises(ParseError, xy_parser.parse, 'y') 39 | self.assertRaises(ParseError, xy_parser.parse, 'z') 40 | 41 | def test_bind(self): 42 | piped = None 43 | 44 | def binder(x): 45 | nonlocal piped 46 | piped = x 47 | return string('y') 48 | 49 | parser = string('x').bind(binder) 50 | 51 | self.assertEqual(parser.parse('xy'), 'y') 52 | self.assertEqual(piped, 'x') 53 | 54 | self.assertRaises(ParseError, parser.parse, 'x') 55 | 56 | def test_map(self): 57 | parser = digit.map(int) 58 | self.assertEqual(parser.parse('7'), 59 | 7) 60 | 61 | def test_combine(self): 62 | parser = (seq(digit, letter) 63 | .combine(lambda d, l: (d, l))) 64 | self.assertEqual(parser.parse('1A'), 65 | ('1', 'A')) 66 | 67 | def test_concat(self): 68 | parser = letter.many().concat() 69 | self.assertEqual(parser.parse(''), '') 70 | self.assertEqual(parser.parse('abc'), 'abc') 71 | 72 | def test_generate(self): 73 | x = y = None 74 | 75 | @generate 76 | def xy(): 77 | nonlocal x 78 | nonlocal y 79 | x = yield string('x') 80 | y = yield string('y') 81 | return 3 82 | 83 | self.assertEqual(xy.parse('xy'), 3) 84 | self.assertEqual(x, 'x') 85 | self.assertEqual(y, 'y') 86 | 87 | def test_generate_return_parser(self): 88 | @generate 89 | def example(): 90 | yield string('x') 91 | return string('y') 92 | self.assertEqual(example.parse("xy"), "y") 93 | 94 | def test_mark(self): 95 | parser = (letter.many().mark() << string("\n")).many() 96 | 97 | lines = parser.parse("asdf\nqwer\n") 98 | 99 | self.assertEqual(len(lines), 2) 100 | 101 | (start, letters, end) = lines[0] 102 | self.assertEqual(start, (0, 0)) 103 | self.assertEqual(letters, ['a', 's', 'd', 'f']) 104 | self.assertEqual(end, (0, 4)) 105 | 106 | (start, letters, end) = lines[1] 107 | self.assertEqual(start, (1, 0)) 108 | self.assertEqual(letters, ['q', 'w', 'e', 'r']) 109 | self.assertEqual(end, 
(1, 4)) 110 | 111 | def test_generate_desc(self): 112 | @generate('a thing') 113 | def thing(): 114 | yield string('t') 115 | 116 | with self.assertRaises(ParseError) as err: 117 | thing.parse('x') 118 | 119 | ex = err.exception 120 | 121 | self.assertEqual(ex.expected, frozenset(['a thing'])) 122 | self.assertEqual(ex.stream, 'x') 123 | self.assertEqual(ex.index, 0) 124 | 125 | def test_generate_default_desc(self): 126 | # We shouldn't give a default desc, the messages from the internal 127 | # parsers should bubble up. 128 | @generate 129 | def thing(): 130 | yield string('a') 131 | yield string('b') 132 | 133 | with self.assertRaises(ParseError) as err: 134 | thing.parse('ax') 135 | 136 | ex = err.exception 137 | 138 | self.assertEqual(ex.expected, frozenset(['b'])) 139 | self.assertEqual(ex.stream, 'ax') 140 | self.assertEqual(ex.index, 1) 141 | 142 | self.assertIn("expected 'b' at 0:1", 143 | str(ex)) 144 | 145 | def test_multiple_failures(self): 146 | abc = string('a') | string('b') | string('c') 147 | 148 | with self.assertRaises(ParseError) as err: 149 | abc.parse('d') 150 | 151 | ex = err.exception 152 | self.assertEqual(ex.expected, frozenset(['a', 'b', 'c'])) 153 | self.assertEqual(str(ex), "expected one of 'a', 'b', 'c' at 0:0") 154 | 155 | def test_generate_backtracking(self): 156 | @generate 157 | def xy(): 158 | yield string('x') 159 | yield string('y') 160 | assert False 161 | 162 | parser = xy | string('z') 163 | # should not finish executing xy() 164 | self.assertEqual(parser.parse('z'), 'z') 165 | 166 | def test_or(self): 167 | x_or_y = string('x') | string('y') 168 | 169 | self.assertEqual(x_or_y.parse('x'), 'x') 170 | self.assertEqual(x_or_y.parse('y'), 'y') 171 | 172 | def test_or_with_then(self): 173 | parser = (string('\\') >> string('y')) | string('z') 174 | self.assertEqual(parser.parse('\\y'), 'y') 175 | self.assertEqual(parser.parse('z'), 'z') 176 | 177 | self.assertRaises(ParseError, parser.parse, '\\z') 178 | 179 | def test_many(self): 180 | letters = letter.many() 181 | self.assertEqual(letters.parse('x'), ['x']) 182 | self.assertEqual(letters.parse('xyz'), ['x', 'y', 'z']) 183 | self.assertEqual(letters.parse(''), []) 184 | 185 | self.assertRaises(ParseError, letters.parse, '1') 186 | 187 | def test_many_with_then(self): 188 | parser = string('x').many() >> string('y') 189 | self.assertEqual(parser.parse('y'), 'y') 190 | self.assertEqual(parser.parse('xy'), 'y') 191 | self.assertEqual(parser.parse('xxxxxy'), 'y') 192 | 193 | def test_times_zero(self): 194 | zero_letters = letter.times(0) 195 | self.assertEqual(zero_letters.parse(''), []) 196 | 197 | self.assertRaises(ParseError, zero_letters.parse, 'x') 198 | 199 | def test_times(self): 200 | three_letters = letter.times(3) 201 | self.assertEqual(three_letters.parse('xyz'), ['x', 'y', 'z']) 202 | 203 | self.assertRaises(ParseError, three_letters.parse, 'xy') 204 | self.assertRaises(ParseError, three_letters.parse, 'xyzw') 205 | 206 | def test_times_with_then(self): 207 | then_digit = letter.times(3) >> digit 208 | self.assertEqual(then_digit.parse('xyz1'), '1') 209 | 210 | self.assertRaises(ParseError, then_digit.parse, 'xy1') 211 | self.assertRaises(ParseError, then_digit.parse, 'xyz') 212 | self.assertRaises(ParseError, then_digit.parse, 'xyzw') 213 | 214 | def test_times_with_min_and_max(self): 215 | some_letters = letter.times(2, 4) 216 | 217 | self.assertEqual(some_letters.parse('xy'), ['x', 'y']) 218 | self.assertEqual(some_letters.parse('xyz'), ['x', 'y', 'z']) 219 | 
self.assertEqual(some_letters.parse('xyzw'), ['x', 'y', 'z', 'w']) 220 | 221 | self.assertRaises(ParseError, some_letters.parse, 'x') 222 | self.assertRaises(ParseError, some_letters.parse, 'xyzwv') 223 | 224 | def test_times_with_min_and_max_and_then(self): 225 | then_digit = letter.times(2, 4) >> digit 226 | 227 | self.assertEqual(then_digit.parse('xy1'), '1') 228 | self.assertEqual(then_digit.parse('xyz1'), '1') 229 | self.assertEqual(then_digit.parse('xyzw1'), '1') 230 | 231 | self.assertRaises(ParseError, then_digit.parse, 'xy') 232 | self.assertRaises(ParseError, then_digit.parse, 'xyzw') 233 | self.assertRaises(ParseError, then_digit.parse, 'xyzwv1') 234 | self.assertRaises(ParseError, then_digit.parse, 'x1') 235 | 236 | def test_at_most(self): 237 | ab = string("ab") 238 | self.assertEqual(ab.at_most(2).parse(""), 239 | []) 240 | self.assertEqual(ab.at_most(2).parse("ab"), 241 | ["ab"]) 242 | self.assertEqual(ab.at_most(2).parse("abab"), 243 | ["ab", "ab"]) 244 | self.assertRaises(ParseError, 245 | ab.at_most(2).parse, "ababab") 246 | 247 | def test_sep_by(self): 248 | digit_list = digit.map(int).sep_by(string(',')) 249 | 250 | self.assertEqual(digit_list.parse('1,2,3,4'), [1, 2, 3, 4]) 251 | self.assertEqual(digit_list.parse('9,0,4,7'), [9, 0, 4, 7]) 252 | self.assertEqual(digit_list.parse('3,7'), [3, 7]) 253 | self.assertEqual(digit_list.parse('8'), [8]) 254 | self.assertEqual(digit_list.parse(''), []) 255 | 256 | self.assertRaises(ParseError, digit_list.parse, '8,') 257 | self.assertRaises(ParseError, digit_list.parse, ',9') 258 | self.assertRaises(ParseError, digit_list.parse, '82') 259 | self.assertRaises(ParseError, digit_list.parse, '7.6') 260 | 261 | def test_sep_by_with_min_and_max(self): 262 | digit_list = digit.map(int).sep_by(string(','), min=2, max=4) 263 | 264 | self.assertEqual(digit_list.parse('1,2,3,4'), [1, 2, 3, 4]) 265 | self.assertEqual(digit_list.parse('9,0,4,7'), [9, 0, 4, 7]) 266 | self.assertEqual(digit_list.parse('3,7'), [3, 7]) 267 | 268 | self.assertRaises(ParseError, digit_list.parse, '8') 269 | self.assertRaises(ParseError, digit_list.parse, '') 270 | self.assertRaises(ParseError, digit_list.parse, '8,') 271 | self.assertRaises(ParseError, digit_list.parse, ',9') 272 | self.assertRaises(ParseError, digit_list.parse, '82') 273 | self.assertRaises(ParseError, digit_list.parse, '7.6') 274 | self.assertEqual(digit.sep_by(string(","), max=0).parse(''), 275 | []) 276 | 277 | def test_add(self): 278 | self.assertEqual((letter + digit).parse("a1"), 279 | "a1") 280 | 281 | def test_multiply(self): 282 | self.assertEqual((letter * 3).parse("abc"), 283 | ['a', 'b', 'c']) 284 | 285 | def test_multiply_range(self): 286 | self.assertEqual((letter * range(1, 2)).parse("a"), 287 | ["a"]) 288 | self.assertRaises(ParseError, (letter * range(1, 2)).parse, "aa") 289 | 290 | # Primitives 291 | def test_alt(self): 292 | self.assertRaises(ParseError, alt().parse, '') 293 | self.assertEqual(alt(letter, digit).parse('a'), 294 | 'a') 295 | self.assertEqual(alt(letter, digit).parse('1'), 296 | '1') 297 | self.assertRaises(ParseError, alt(letter, digit).parse, '.') 298 | 299 | def test_seq(self): 300 | self.assertEqual(seq().parse(''), 301 | []) 302 | self.assertEqual(seq(letter).parse('a'), 303 | ['a']) 304 | self.assertEqual(seq(letter, digit).parse('a1'), 305 | ['a', '1']) 306 | self.assertRaises(ParseError, seq(letter, digit).parse, '1a') 307 | 308 | def test_test_char(self): 309 | ascii = parsy_test_char(lambda c: ord(c) < 128, 310 | "ascii character") 311 | 
self.assertEqual(ascii.parse("a"), "a") 312 | with self.assertRaises(ParseError) as err: 313 | ascii.parse('☺') 314 | ex = err.exception 315 | self.assertEqual(str(ex), """expected 'ascii character' at 0:0""") 316 | 317 | with self.assertRaises(ParseError) as err: 318 | ascii.parse('') 319 | ex = err.exception 320 | self.assertEqual(str(ex), """expected 'ascii character' at 0:0""") 321 | 322 | def test_char_from(self): 323 | ab = char_from("ab") 324 | self.assertEqual(ab.parse("a"), "a") 325 | self.assertEqual(ab.parse("b"), "b") 326 | 327 | with self.assertRaises(ParseError) as err: 328 | ab.parse('x') 329 | 330 | ex = err.exception 331 | self.assertEqual(str(ex), """expected '[ab]' at 0:0""") 332 | 333 | def test_string_from(self): 334 | titles = string_from("Mr", "Mr.", "Mrs", "Mrs.") 335 | self.assertEqual(titles.parse("Mr"), "Mr") 336 | self.assertEqual(titles.parse("Mr."), "Mr.") 337 | self.assertEqual((titles + string(" Hyde")).parse("Mr. Hyde"), 338 | "Mr. Hyde") 339 | with self.assertRaises(ParseError) as err: 340 | titles.parse('foo') 341 | 342 | ex = err.exception 343 | self.assertEqual(str(ex), """expected one of 'Mr', 'Mr.', 'Mrs', 'Mrs.' at 0:0""") 344 | 345 | def test_any_char(self): 346 | self.assertEqual(any_char.parse("x"), "x") 347 | self.assertEqual(any_char.parse("\n"), "\n") 348 | self.assertRaises(ParseError, any_char.parse, "") 349 | 350 | def test_whitespace(self): 351 | self.assertEqual(whitespace.parse("\n"), "\n") 352 | self.assertEqual(whitespace.parse(" "), " ") 353 | self.assertRaises(ParseError, whitespace.parse, "x") 354 | 355 | def test_letter(self): 356 | self.assertEqual(letter.parse("a"), "a") 357 | self.assertRaises(ParseError, letter.parse, "1") 358 | 359 | def test_digit(self): 360 | self.assertEqual(digit.parse("¹"), "¹") 361 | self.assertEqual(digit.parse("2"), "2") 362 | self.assertRaises(ParseError, digit.parse, "x") 363 | 364 | def test_decimal_digit(self): 365 | self.assertEqual(decimal_digit.at_least(1).concat().parse("9876543210"), 366 | "9876543210") 367 | self.assertRaises(ParseError, decimal_digit.parse, "¹") 368 | 369 | def test_line_info(self): 370 | @generate 371 | def foo(): 372 | i = yield line_info 373 | l = yield any_char 374 | return (l, i) 375 | 376 | self.assertEqual(foo.many().parse("AB\nCD"), 377 | [("A", (0, 0)), ("B", (0, 1)), 378 | ("\n", (0, 2)), 379 | ("C", (1, 0)), ("D", (1, 1)), 380 | ]) 381 | 382 | def test_should_fail(self): 383 | not_a_digit = digit.should_fail('not a digit') >> regex(r'.*') 384 | 385 | self.assertEqual(not_a_digit.parse('a'), 'a') 386 | self.assertEqual(not_a_digit.parse('abc'), 'abc') 387 | self.assertEqual(not_a_digit.parse('a10'), 'a10') 388 | self.assertEqual(not_a_digit.parse(''), '') 389 | 390 | with self.assertRaises(ParseError) as err: 391 | not_a_digit.parse('8') 392 | self.assertEqual(str(err.exception), "expected 'not a digit' at 0:0") 393 | 394 | self.assertRaises(ParseError, not_a_digit.parse, '8ab') 395 | 396 | 397 | class TestParserTokens(unittest.TestCase): 398 | """ 399 | Tests that ensure that `.parse` can handle an arbitrary list of tokens, 400 | rather than a string. 
401 | """ 402 | # Some opaque objects we will use in our stream: 403 | START = object() 404 | STOP = object() 405 | 406 | def test_test_item(self): 407 | start_stop = parsy_test_item(lambda i: i in [self.START, self.STOP], "START/STOP") 408 | self.assertEqual(start_stop.parse([self.START]), 409 | self.START) 410 | self.assertEqual(start_stop.parse([self.STOP]), 411 | self.STOP) 412 | with self.assertRaises(ParseError) as err: 413 | start_stop.many().parse([self.START, "hello"]) 414 | 415 | ex = err.exception 416 | self.assertEqual(str(ex), 417 | "expected one of 'EOF', 'START/STOP' at 1") 418 | self.assertEqual(ex.expected, 419 | {'EOF', 'START/STOP'}) 420 | self.assertEqual(ex.index, 421 | 1) 422 | 423 | def test_match_item(self): 424 | self.assertEqual(match_item(self.START).parse([self.START]), 425 | self.START) 426 | with self.assertRaises(ParseError) as err: 427 | match_item(self.START, "START").parse([]) 428 | 429 | ex = err.exception 430 | self.assertEqual(str(ex), 431 | "expected 'START' at 0") 432 | 433 | def test_parse_tokens(self): 434 | other_vals = parsy_test_item(lambda i: i not in [self.START, self.STOP], 435 | "not START/STOP") 436 | 437 | bracketed = match_item(self.START) >> other_vals.many() << match_item(self.STOP) 438 | stream = [self.START, "hello", 1, 2, "goodbye", self.STOP] 439 | result = bracketed.parse(stream) 440 | self.assertEqual(result, ["hello", 1, 2, "goodbye"]) 441 | 442 | def test_index(self): 443 | @generate 444 | def foo(): 445 | i = yield index 446 | l = yield letter 447 | return (l, i) 448 | 449 | self.assertEqual(foo.many().parse(["A", "B"]), 450 | [("A", 0), ("B", 1)]) 451 | 452 | 453 | class TestUtils(unittest.TestCase): 454 | def test_line_info_at(self): 455 | text = "abc\ndef" 456 | self.assertEqual(line_info_at(text, 0), 457 | (0, 0)) 458 | self.assertEqual(line_info_at(text, 2), 459 | (0, 2)) 460 | self.assertEqual(line_info_at(text, 3), 461 | (0, 3)) 462 | self.assertEqual(line_info_at(text, 4), 463 | (1, 0)) 464 | self.assertEqual(line_info_at(text, 7), 465 | (1, 3)) 466 | self.assertRaises(ValueError, lambda: line_info_at(text, 8)) 467 | 468 | 469 | if __name__ == '__main__': 470 | unittest.main() 471 | --------------------------------------------------------------------------------