├── .editorconfig ├── .flake8 ├── .github └── workflows │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── .readthedocs.yaml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── RELEASE.rst ├── codecov.yml ├── conftest.py ├── docs ├── Makefile ├── conf.py ├── contributing.rst ├── history.rst ├── howto │ ├── index.rst │ ├── lexing.rst │ └── other_examples.rst ├── index.rst ├── installation.rst ├── make.bat ├── overview.rst ├── ref │ ├── generating.rst │ ├── index.rst │ ├── methods_and_combinators.rst │ ├── parser_instances.rst │ └── primitives.rst ├── requirements.txt └── tutorial.rst ├── examples ├── __init__.py ├── json.py ├── simple_eval.py ├── simple_logo_lexer.py ├── simple_logo_parser.py └── sql_select.py ├── pyproject.toml ├── pytest.ini ├── release.sh ├── src └── parsy │ └── __init__.py ├── tests ├── requirements-linters.txt ├── requirements-tests.txt ├── test_parsy.py └── test_sexpr.py └── tox.ini /.editorconfig: -------------------------------------------------------------------------------- 1 | # See http://editorconfig.org/ 2 | root = true 3 | 4 | [*] 5 | end_of_line = lf 6 | insert_final_newline = true 7 | charset = utf-8 8 | indent_style = space 9 | 10 | [*.py] 11 | indent_size = 4 12 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = .tox,.git,docs,dist,build,todo,.venv 3 | ignore = E731,E221,W503,E741,E203 4 | max-line-length = 119 5 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | # Trigger the workflow on push or pull request 4 | # events but only for the master branch: 5 | on: 6 | push: 7 | branches: [ master ] 8 | pull_request: 9 | branches: [ master ] 10 | 11 | 12 | jobs: 13 | tests: 14 | runs-on: 
ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "pypy-3.9"] 19 | 20 | env: 21 | PYTHON: ${{ matrix.python-version }} 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up Python ${{ matrix.python-version }} with uv 26 | uses: drivendataorg/setup-python-uv-action@v1.0.0 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | cache: 'packages' 30 | 31 | - name: Install dependencies 32 | run: | 33 | uv sync 34 | - name: Run tests 35 | run: | 36 | uv run pytest --cov=./ --cov-report=xml 37 | - name: Upload coverage to Codecov 38 | uses: codecov/codecov-action@v1 39 | with: 40 | file: ./coverage.xml 41 | env_vars: PYTHON 42 | fail_ci_if_error: false 43 | 44 | linters: 45 | runs-on: ubuntu-latest 46 | steps: 47 | - uses: actions/checkout@v4 48 | - name: Set up Python 3.10 with uv 49 | uses: drivendataorg/setup-python-uv-action@v1.0.0 50 | with: 51 | python-version: "3.10" 52 | cache: 'packages' 53 | 54 | - name: Install dependencies 55 | run: | 56 | uv sync 57 | - name: Run pre-commit checks 58 | run: | 59 | uv run pre-commit run --all --all-files 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | /dist 3 | /todo 4 | .tox 5 | src/parsy.egg-info 6 | docs/_build 7 | .cache 8 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v3.2.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - repo: https://github.com/pycqa/flake8 8 | rev: 3.8.4 9 | hooks: 10 | - id: flake8 11 | language_version: python3.10 12 | - repo: https://github.com/pre-commit/mirrors-isort 13 | rev: v5.6.4 14 | hooks: 15 | - id: 
isort 16 | language_version: python3.10 17 | - repo: https://github.com/ikamensh/flynt/ 18 | rev: '0.69' 19 | hooks: 20 | - id: flynt 21 | language_version: python3.10 22 | - repo: https://github.com/asottile/pyupgrade 23 | rev: v2.26.0 24 | hooks: 25 | - id: pyupgrade 26 | entry: pyupgrade --py3-plus --py36-plus --py37-plus --keep-runtime-typing 27 | language_version: python3.10 28 | - repo: https://github.com/myint/autoflake 29 | rev: 'v1.4' 30 | hooks: 31 | - id: autoflake 32 | args: ['--remove-all-unused-imports', '-i'] 33 | language_version: python3.10 34 | - repo: https://github.com/pre-commit/mirrors-autopep8 35 | rev: 'v1.5.7' 36 | hooks: 37 | - id: autopep8 38 | language_version: python3.10 39 | - repo: https://github.com/psf/black 40 | rev: 22.3.0 41 | hooks: 42 | - id: black 43 | language_version: python3.10 44 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.13 2 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-22.04" 5 | tools: 6 | python: "3.11" 7 | 8 | python: 9 | install: 10 | - requirements: docs/requirements.txt 11 | - method: pip 12 | path: . 13 | 14 | sphinx: 15 | configuration: docs/conf.py 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | # MIT license. 
See http://www.opensource.org/licenses/mit-license.php 2 | 3 | Copyright (c) 2013 Jeanine Adkisson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.md 2 | include *.rst 3 | include *.sh 4 | include *.yml 5 | include *.yaml 6 | include LICENSE 7 | include tox.ini 8 | include pytest.ini 9 | include conftest.py 10 | include .editorconfig 11 | recursive-include docs *.bat 12 | recursive-include docs *.txt 13 | recursive-include docs *.py 14 | recursive-include docs *.rst 15 | recursive-include docs Makefile 16 | recursive-include examples *.py 17 | recursive-include tests *.py *.txt 18 | prune docs/_build 19 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | parsy 2 | ===== 3 | 4 | |Documentation Status| |Build Status| |Codecov| |Downloads| 5 | 6 | Parsy is an easy and elegant way to parse text in Python by combining small 7 | parsers into complex, larger parsers. If it means anything to you, it's a 8 | monadic parser combinator library for LL(infinity) grammars in the spirit of 9 | `Parsec `_, `Parsnip 10 | `_, and `Parsimmon 11 | `_. But don't worry, it has really good 12 | documentation and it doesn't say things like that! 13 | 14 | Parsy requires Python 3.7 or greater. 15 | 16 | For a good example of the kind of clear, declarative code you can create using 17 | parsy, see the `SQL SELECT statement example 18 | `_ 19 | or `JSON parser 20 | `_. 21 | 22 | Links: 23 | 24 | - `Documentation `_ 25 | - `History and changelog `_ 26 | - `PyPI `_ 27 | 28 | To contribute, please create a fork and submit a pull request on GitHub, after 29 | checking the `contributing 30 | `_ section of the 31 | docs. Thanks! 32 | 33 | If you like parsy and think it should be better known, you could: 34 | 35 | * Star this project on GitHub. 36 | * `Vote `_ for it being included on awesome-python. 
37 | 38 | Parsy was originally written by `Jeanine Adkisson `_, 39 | with contributions by other people as can be found in the git commit history. 40 | 41 | .. |Documentation Status| image:: https://readthedocs.org/projects/parsy/badge/?version=latest 42 | :target: http://parsy.readthedocs.io/en/latest/?badge=latest 43 | .. |Build Status| image:: https://img.shields.io/github/actions/workflow/status/python-parsy/parsy/tests.yml?branch=master 44 | :target: https://github.com/python-parsy/parsy/actions?query=workflow%3A%22Tests%22+branch%3Amaster 45 | .. |Codecov| image:: https://img.shields.io/codecov/c/github/python-parsy/parsy/master.svg 46 | :target: https://codecov.io/gh/python-parsy/parsy 47 | .. |Downloads| image:: https://img.shields.io/pypi/dm/parsy 48 | :target: https://pypi.org/project/parsy/ 49 | -------------------------------------------------------------------------------- /RELEASE.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | How to do releases 3 | ================== 4 | 5 | * Make sure you are on master branch, and have pulled latest changes. 6 | 7 | * Check test suite passes on all supported versions:: 8 | 9 | tox 10 | 11 | * Change docs/history.rst to remove " - unreleased" 12 | 13 | * Update the version number (removing the ``-dev1`` part): 14 | 15 | * src/parsy/__init__.py 16 | * docs/conf.py 17 | 18 | * Commit with "Version bump" 19 | 20 | * Release to PyPI:: 21 | 22 | $ ./release.sh 23 | 24 | 25 | Post release 26 | ------------ 27 | 28 | * Bump version numbers to next version, and add ``-dev1`` suffix, for example 29 | ``0.9.0-dev1`` 30 | 31 | * Add new section to docs/history.rst, with " - unreleased". 
32 | 33 | * Commit and push 34 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | ignore: 2 | - "setup.py" 3 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/python-parsy/parsy/3b72c71bf9570d73ce50477cf503fd5544c1c4b1/conftest.py -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = parsy 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # parsy documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Sep 25 22:24:17 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 
11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | 20 | import os 21 | import sys 22 | 23 | sys.path.insert(0, os.path.abspath("../src")) 24 | 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | # 30 | # needs_sphinx = '1.0' 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = ["sphinx.ext.viewcode"] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ["_templates"] 39 | 40 | # The suffix(es) of source filenames. 41 | # You can specify multiple suffix as a list of string: 42 | # 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = ".rst" 45 | 46 | # The master toctree document. 47 | master_doc = "index" 48 | 49 | # General information about the project. 50 | project = "parsy" 51 | copyright = "2017, Jeanine Adkisson, Luke Plant" 52 | author = "Jeanine Adkisson" 53 | 54 | # The version info for the project you're documenting, acts as replacement for 55 | # |version| and |release|, also used in various other places throughout the 56 | # built documents. 57 | # 58 | # The short X.Y version. 59 | version = "2.1" 60 | # The full version, including alpha/beta/rc tags. 61 | release = "2.1" 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 
67 | # Usually you set "language" from the command line for these cases. 68 | language = None 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This patterns also effect to html_static_path and html_extra_path 73 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 74 | 75 | # The name of the Pygments (syntax highlighting) style to use. 76 | pygments_style = "sphinx" 77 | 78 | # If true, `todo` and `todoList` produce output, else they produce nothing. 79 | todo_include_todos = False 80 | 81 | 82 | # -- Options for HTML output ---------------------------------------------- 83 | 84 | # The theme to use for HTML and HTML Help pages. See the documentation for 85 | # a list of builtin themes. 86 | # 87 | html_theme = "default" 88 | 89 | # Theme options are theme-specific and customize the look and feel of a theme 90 | # further. For a list of options available for each theme, see the 91 | # documentation. 92 | # 93 | # html_theme_options = {} 94 | 95 | # Add any paths that contain custom static files (such as style sheets) here, 96 | # relative to this directory. They are copied after the builtin static files, 97 | # so a file named "default.css" will overwrite the builtin "default.css". 98 | html_static_path = ["_static"] 99 | 100 | 101 | # -- Options for HTMLHelp output ------------------------------------------ 102 | 103 | # Output file base name for HTML help builder. 104 | htmlhelp_basename = "parsydoc" 105 | 106 | 107 | # -- Options for LaTeX output --------------------------------------------- 108 | 109 | latex_elements = { 110 | # The paper size ('letterpaper' or 'a4paper'). 111 | # 112 | # 'papersize': 'letterpaper', 113 | # The font size ('10pt', '11pt' or '12pt'). 114 | # 115 | # 'pointsize': '10pt', 116 | # Additional stuff for the LaTeX preamble. 
117 | # 118 | # 'preamble': '', 119 | # Latex figure (float) alignment 120 | # 121 | # 'figure_align': 'htbp', 122 | } 123 | 124 | # Grouping the document tree into LaTeX files. List of tuples 125 | # (source start file, target name, title, 126 | # author, documentclass [howto, manual, or own class]). 127 | latex_documents = [ 128 | (master_doc, "parsy.tex", "parsy Documentation", "Jeanine Adkisson", "manual"), 129 | ] 130 | 131 | 132 | # -- Options for manual page output --------------------------------------- 133 | 134 | # One entry per manual page. List of tuples 135 | # (source start file, name, description, authors, manual section). 136 | man_pages = [(master_doc, "parsy", "parsy Documentation", [author], 1)] 137 | 138 | 139 | # -- Options for Texinfo output ------------------------------------------- 140 | 141 | # Grouping the document tree into Texinfo files. List of tuples 142 | # (source start file, target name, title, author, 143 | # dir menu entry, description, category) 144 | texinfo_documents = [ 145 | (master_doc, "parsy", "parsy Documentation", author, "parsy", "One line description of project.", "Miscellaneous"), 146 | ] 147 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | Contributing to parsy 2 | ===================== 3 | 4 | Contributions to parsy, whether code or docs, are very welcome. Please 5 | contribute by making a fork, and submitting a PR on `GitHub 6 | `_. 7 | 8 | We have a high standard in terms of quality. All contributions will need to be 9 | fully covered by unit tests and documentation. 10 | 11 | To get started you’ll need to: 12 | 13 | - Check out the repo using git, ``cd`` into the directory. 14 | 15 | - Set up a venv for development. We use `uv `_ and 16 | recommend you do the same. With uv, the setup instructions are:: 17 | 18 | uv sync 19 | 20 | This will use your default Python version. 
If you want to use a different 21 | Python version, instead of the above do this e.g.:: 22 | 23 | uv python install 3.10 24 | uv venv --python 3.10 25 | uv sync 26 | 27 | - Activate the venv:: 28 | 29 | source .venv/bin/activate 30 | 31 | (Alternatively, you can add ``uv run`` before most of the commands below) 32 | 33 | - Get test suite running:: 34 | 35 | pytest 36 | 37 | - Run tests against all versions:: 38 | 39 | tox 40 | 41 | - To build the docs, do:: 42 | 43 | cd docs 44 | make html 45 | 46 | We now have several linters and code formatters that we require use of, 47 | including `flake8 `_, `isort 48 | `_ and `black 49 | `_. These are most easily add by using `pre-commit 50 | `_: 51 | 52 | - Install `pre-commit `_ in the repo:: 53 | 54 | pre-commit install 55 | 56 | This will add Git hooks to run linters when committing, which ensures our style 57 | (black) and other things. 58 | 59 | Now all the linters will run when you commit changes. 60 | 61 | - You can also manually run these linters using:: 62 | 63 | pre-commit run --all --all-files 64 | 65 | 66 | When writing documentation, please keep in mind Daniele Procida's `great article 67 | on documentation `_. To summarise, 68 | there are 4 types of docs: 69 | 70 | * Tutorials (focus: learning, analogy: teaching a child to cook) 71 | * How-to guides (focus: goals, analogy: a recipe in a cook book) 72 | * Discussions (focus: understanding, analogy: an article on culinary history) 73 | * Reference (focus: information, analogy: encyclopedia article) 74 | 75 | We do not (yet) have documentation that fits into the "Discussions" category, 76 | but we do have the others, and when adding new features, documentation of the 77 | right sort(s) should be added. With parsy, where code is often very succinct, 78 | writing good docs often takes several times longer than writing the code. 
79 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | ========================= 2 | History and release notes 3 | ========================= 4 | 5 | .. currentmodule:: parsy 6 | 7 | 2.2 - unreleased 8 | ---------------- 9 | * Dropped support for Python 3.7, 3.8 which are past EOL 10 | 11 | 2.1 - 2023-02-22 12 | ---------------- 13 | 14 | * Dropped support for Python 3.7 15 | * Test against Python 3.11 16 | * Added docstrings and basic type hints to all primitives and main methods 17 | 18 | 19 | 2.0 - 2022-09-08 20 | ---------------- 21 | 22 | * Dropped support for Python < 3.6 23 | * Added :meth:`Parser.until`. Thanks `@mcdeoliveira `_! 24 | * :meth:`Parser.optional` now supports an optional default argument to be returned instead of ``None``. 25 | 26 | 1.4.0 - 2021-11-15 27 | ------------------ 28 | 29 | * Documentation improvements. 30 | * Added ``group`` parameter to :func:`regex` - thanks `@camerondm9 31 | `_. 32 | * Support ``bytes`` with :func:`regex` as well as ``str`` - thanks `@quack4 33 | `_. 34 | * Added :class:`forward_declaration`. 35 | 36 | 37 | 1.3.0 - 2019-08-03 38 | ------------------ 39 | 40 | * Documentation improvements. 41 | * Added :func:`peek` - thanks `@lisael `_. 42 | * Removed Python 3.3 support 43 | * Added Python 3.7 support 44 | * :meth:`Parser.combine_dict` now strips keys that start with ``_``. 45 | 46 | 47 | 1.2.0 - 2017-11-15 48 | ------------------ 49 | 50 | * Added ``transform`` argument to :func:`string` and :func:`string_from`. 51 | * Made :meth:`Parser.combine_dict` accept lists of name value pairs, 52 | and filter out keys with value ``None``. 53 | * Added :func:`from_enum`. 54 | 55 | 56 | 1.1.0 - 2017-11-05 57 | ------------------ 58 | 59 | * Added :meth:`Parser.optional`. 60 | * Added :meth:`Parser.tag`. 
61 | * Added :func:`seq` keyword argument version (Python 3.6) 62 | * Added :meth:`Parser.combine_dict`. 63 | * Documented :meth:`Parser.mark`. 64 | * Documentation improvements. 65 | 66 | 67 | 1.0.0 - 2017-10-10 68 | ------------------ 69 | 70 | * Improved parse failure messages of ``@generate`` parsers. Previously 71 | the parser was given a default description of the function name, 72 | which hides all useful internal info there might be. 73 | * Added :meth:`Parser.sep_by` 74 | * Added :func:`test_char` 75 | * Added :func:`char_from` 76 | * Added :func:`string_from` 77 | * Added :data:`any_char` 78 | * Added :data:`decimal_digit` 79 | * Added :meth:`Parser.concat` 80 | * Fixed parsy so that it can again work with tokens as well as strings, allowing it to 81 | be used as both a :doc:`lexer or parser or both `, with docs and tests. 82 | * Added :func:`test_item` 83 | * Added :func:`match_item` 84 | * Added :meth:`Parser.should_fail` 85 | 86 | 0.9.0 - 2017-09-28 87 | ------------------ 88 | 89 | * Better error reporting of failed parses. 90 | * Documentation overhaul and expansion. 91 | * Added :meth:`Parser.combine`. 92 | 93 | 0.0.4 - 2014-12-28 94 | ------------------ 95 | 96 | * See git logs for changes before this point. 97 | -------------------------------------------------------------------------------- /docs/howto/index.rst: -------------------------------------------------------------------------------- 1 | ================================= 2 | Howto's, cookbooks and examples 3 | ================================= 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | :caption: Contents: 8 | 9 | lexing 10 | other_examples 11 | -------------------------------------------------------------------------------- /docs/howto/lexing.rst: -------------------------------------------------------------------------------- 1 | ===================================== 2 | Separate lexing/tokenization phases 3 | ===================================== 4 | 5 | .. 
currentmodule:: parsy 6 | 7 | Most of the documentation in parsy assumes that when you call 8 | :meth:`Parser.parse` you will pass a string, and will get back your final 9 | parsed, constructed object (of whatever type you desire). 10 | 11 | A more classical approach to parsing is that you first have a 12 | lexing/tokenization phase, the result of which is a simple list of tokens. These 13 | tokens could be strings, or other objects. 14 | 15 | You then have a separate parsing phase that consumes this list of tokens, and 16 | produces your final object, which is very often a tree-like structure or other 17 | complex object. 18 | 19 | Parsy can actually work with either approach. Further, for the split 20 | lexing/parsing approach, parsy can be used either to implement the lexer, or the 21 | parser, or both! The following examples use parsy to do both lexing and parsing. 22 | 23 | However, parsy's features for this use case are not as developed as some other 24 | Python tools. If you are building a parser for a full language that needs the 25 | split lexing/parsing approach, you might be better off with `PLY 26 | `_. 27 | 28 | Turtle Logo 29 | =========== 30 | 31 | For our first example, we'll do a very stripped down Turtle Logo parser. First, 32 | the lexer: 33 | 34 | .. literalinclude:: ../../examples/simple_logo_lexer.py 35 | :language: python 36 | 37 | 38 | We are not interested in whitespace, so our lexer removes it all, apart from 39 | newlines. We can now parse a program into the tokens we are interested in: 40 | 41 | .. code-block:: python 42 | 43 | >>> l = lexer.parse("fd 1\nbk 2") 44 | >>> l 45 | ['fd', 1, '\n', 'bk', 2, '\n'] 46 | 47 | The ``line`` parser produces a list, so after applying ``many`` which also 48 | produces a list, we applied a level of flattening so that we end up with a 49 | simple list of tokens. 
We also chose to convert the parameters to integers while 50 | we were at it, so in this case our list of tokens is not a list of strings, but 51 | heterogeneous. 52 | 53 | The next step is the parser. We create some classes to represent different 54 | commands, and then use parsy again to create a parser which is very simple 55 | because this is a very limited language: 56 | 57 | .. literalinclude:: ../../examples/simple_logo_parser.py 58 | :language: python 59 | 60 | To use it, we pass the the list of tokens generated above into 61 | ``program.parse``: 62 | 63 | .. code-block:: python 64 | 65 | >>> program.parse(l) 66 | [Forward(1), Backward(2)] 67 | 68 | In a real implementation, we could then have ``execute`` methods on the 69 | ``Command`` sub-classes if we wanted to implement an interpreter, for example. 70 | 71 | Calculator 72 | ========== 73 | 74 | Our second example illustrates lexing and then parsing a sequence of 75 | mathematical operations, e.g "1 + 2 * (3 - 4.5)", with precedence. 76 | 77 | In this case, while doing the parsing stage, instead of building up an AST of 78 | objects representing the operations, the parser actually evaluates the 79 | expression. 80 | 81 | .. literalinclude:: ../../examples/simple_eval.py 82 | :language: python 83 | -------------------------------------------------------------------------------- /docs/howto/other_examples.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Other examples 3 | ============== 4 | 5 | .. currentmodule:: parsy 6 | 7 | This section has some further example parsers that you can study. There are also 8 | examples in the :doc:`/tutorial` and in :doc:`/ref/generating`. 
9 | 10 | SQL SELECT statement parser 11 | =========================== 12 | 13 | This shows a very simplified parser for a SQL ``SELECT`` statement, using custom 14 | data structures, and the convenient keyword argument syntax for :func:`seq`, 15 | followed by :meth:`Parser.combine_dict`. 16 | 17 | .. literalinclude:: ../../examples/sql_select.py 18 | :language: python 19 | 20 | 21 | JSON parser 22 | =========== 23 | 24 | A full parser for JSON. (This will not be competitive in terms of performance 25 | with other implementations!) 26 | 27 | This demonstrates the use of :class:`forward_declaration`, needed due to the 28 | circular definition of ``json_value``. 29 | 30 | .. literalinclude:: ../../examples/json.py 31 | :language: python 32 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to parsy's documentation! 2 | ================================= 3 | 4 | These are the docs for parsy |release|. Check the :doc:`/history` for 5 | significant changes. 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | :caption: Contents: 10 | 11 | installation 12 | overview 13 | tutorial 14 | ref/index 15 | howto/index 16 | history 17 | contributing 18 | 19 | Indices and tables 20 | ================== 21 | 22 | * :ref:`genindex` 23 | * :ref:`modindex` 24 | * :ref:`search` 25 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | parsy can be installed with pip:: 6 | 7 | pip install parsy 8 | 9 | 10 | Python 3.7 or greater is required. 
11 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=parsy 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/overview.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Overview 3 | ======== 4 | 5 | .. currentmodule:: parsy 6 | 7 | Parsy is an easy way to combine simple, small parsers into complex, larger 8 | parsers. 9 | 10 | If it means anything to you, it's a monadic parser combinator library for 11 | LL(infinity) grammars in the spirit of `Parsec 12 | `_, `Parsnip 13 | `_, and `Parsimmon 14 | `_. 15 | 16 | If that means nothing, rest assured that parsy is a very straightforward and 17 | Pythonic solution for parsing text that doesn't require knowing anything about 18 | monads. 19 | 20 | Parsy differentiates itself from other solutions with the following: 21 | 22 | * it is not a parser generator, but a combinator based parsing library. 
23 | * a very clean implementation, only a few hundred lines, that borrows 24 | from the best of recent combinator libraries. 25 | * it produces fairly terse code, with an embedded DSL feel — not too far from 26 | things like EBNF notation or Haskell’s parsec. 27 | * free, good quality documentation, all in one place. (Please raise an issue on 28 | GitHub if you have any problems, or find the documentation lacking in any 29 | way). 30 | * it avoids mutability, and therefore a ton of related bugs. 31 | * it has monadic binding with a :doc:`nice syntax `. In plain 32 | English: 33 | 34 | * we can easily handle cases where later parsing depends on the value of 35 | something parsed earlier e.g. Hollerith constants. 36 | * it's easy to build up complex result objects, rather than returning lists of 37 | lists etc. which then need to be further processed. 38 | 39 | * it has a minimalist philosophy. It doesn't include built-in helpers for any 40 | specific grammars or languages, but provides building blocks for making these. 41 | 42 | Basic usage looks like this: 43 | 44 | Example 1 - parsing a set of alternatives: 45 | 46 | .. code-block:: python 47 | 48 | >>> from parsy import string 49 | >>> title = (string('Dr.') | string('Mr.') | string('Mrs.')).desc("title") 50 | >>> title.parse('Mrs.') 51 | 'Mrs.' 52 | >>> title.parse('Mr.') 53 | 'Mr.' 54 | 55 | >>> title.parse('Joe') 56 | ParseError: expected title at 0:0 57 | 58 | >>> title.parse_partial('Dr. Who') 59 | ('Dr.', ' Who') 60 | 61 | Example 2 - Parsing a dd-mm-yy date: 62 | 63 | .. code-block:: python 64 | 65 | >>> from parsy import string, regex 66 | >>> from datetime import date 67 | >>> ddmmyy = regex(r'[0-9]{2}').map(int).sep_by(string("-"), min=3, max=3).combine( 68 | ... 
lambda d, m, y: date(2000 + y, m, d)) 69 | >>> ddmmyy.parse('06-05-14') 70 | datetime.date(2014, 5, 6) 71 | 72 | 73 | 74 | 75 | To learn how to use parsy, you should continue with: 76 | 77 | * the :doc:`tutorial `, especially if you are not familiar with this 78 | type of parser library. 79 | * the :doc:`parser generator decorator ` 80 | * the :doc:`builtin parser primitives ` 81 | * the :doc:`method and combinator reference ` 82 | 83 | Other Python projects 84 | ===================== 85 | 86 | This library isn’t for everyone or for every project. It excels at quickly 87 | writing easy-to-read parsers for relatively small languages, and it’s great if 88 | you are a relative newcomer to the subject of parsing but want something better 89 | than ``str.split``. If you have demanding needs in terms of performance, or 90 | producing good error messages, you may need to look elsewhere. Below are some 91 | other Python libraries you might consider: 92 | 93 | * `PLY `_. A pure Python implementation of 94 | the classic lex/yacc parsing tools. It is well suited to large grammars 95 | that would be found in typical programming languages. 96 | 97 | * `Lark `_. With Lark you write a grammar 98 | definition in a separate mini-language as a string, and have a parser 99 | generated for you, rather than writing the grammar in Python. It has the 100 | advantage of speed and being able to use different parsing algorithms. 101 | 102 | * `pyparsing `_. Also a combinator approach, 103 | but in general much less cleanly implemented, and rather scattered 104 | documentation, although it has more builtin functionality in terms 105 | of provided utilities for certain parsing tasks. 106 | 107 | * `funcparserlib `_ - the most 108 | similar to parsy. It differs from parsy mainly in normally using a separate 109 | tokenization phase and lacking the convenience of the :func:`generate` method 110 | for creating parsers. 
111 | -------------------------------------------------------------------------------- /docs/ref/generating.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Generating a parser 3 | =================== 4 | 5 | .. currentmodule:: parsy 6 | .. function:: generate 7 | 8 | ``generate`` converts a generator function (one that uses the ``yield`` keyword) 9 | into a parser. The generator function must yield parsers. These parsers are 10 | applied successively and their results are sent back to the generator using the 11 | ``.send()`` protocol. The generator function should return the final result of 12 | the parsing. Alternatively it can return another parser, which is equivalent to 13 | applying it and returning its result. 14 | 15 | Motivation and examples 16 | ======================= 17 | 18 | Constructing parsers by using combinators and :class:`Parser` methods to make 19 | larger parsers works well for many simpler cases. However, for more complex 20 | cases the ``generate`` function decorator is both more readable and more 21 | powerful. (For those coming from Haskell/Parsec, this method provides an 22 | acceptable substitute for ``do`` notation). 23 | 24 | Alternative syntax to combinators 25 | --------------------------------- 26 | 27 | The first example just shows a different way of building a parser that could 28 | have easily been built using combinators: 29 | 30 | .. code:: python 31 | 32 | from parsy import generate 33 | 34 | @generate("form") 35 | def form(): 36 | """ 37 | Parse an s-expression form, like (a b c). 38 | An equivalent to lparen >> expr.many() << rparen 39 | """ 40 | yield lparen 41 | exprs = yield expr.many() 42 | yield rparen 43 | return exprs 44 | 45 | In the example above, the parser was given a string name ``"form"``, which does 46 | the same as :meth:`Parser.desc`. This is not required, as per the examples below. 
47 | 48 | Note that there is no guarantee that the entire function is executed: if any of 49 | the yielded parsers fails, the function will not complete, and parsy will try to 50 | backtrack to an alternative parser if there is one. 51 | 52 | Building complex objects 53 | ------------------------ 54 | 55 | The second example shows how you can use multiple parse results to build up a 56 | complex object: 57 | 58 | .. code:: python 59 | 60 | from datetime import date 61 | 62 | from parsy import generate, regex, string 63 | 64 | @generate 65 | def date(): 66 | """ 67 | Parse a date in the format YYYY-MM-DD 68 | """ 69 | year = yield regex("[0-9]{4}").map(int) 70 | yield string("-") 71 | month = yield regex("[0-9]{2}").map(int) 72 | yield string("-") 73 | day = yield regex("[0-9]{2}").map(int) 74 | 75 | return date(year, month, day) 76 | 77 | This could also have been achieved using :func:`seq` and :meth:`Parser.combine`. 78 | 79 | Using values already parsed 80 | --------------------------- 81 | 82 | The third example shows how we can use an earlier parsed value to influence the 83 | subsequent parsing. This example parses Hollerith constants. Hollerith constants 84 | are a way of specifying an arbitrary set of characters by first writing the 85 | integer that specifies the length, followed by the character H, followed by the 86 | set of characters. For example, ``pancakes`` would be written ``8Hpancakes``. 87 | 88 | .. code:: python 89 | 90 | from parsy import generate, regex, string, any_char 91 | 92 | @generate 93 | def hollerith(): 94 | num = yield regex(r'[0-9]+').map(int) 95 | yield string('H') 96 | return any_char.times(num).concat() 97 | 98 | (You may want to compare this with an `implementation of Hollerith constants 99 | `_ that 100 | uses `pyparsing `_, originally by John 101 | Shipman from his `pyparsing docs 102 | `_.) 
103 | 104 | There are also more complex examples in the :ref:`tutorial 105 | ` of using the ``generate`` decorator to create parsers 106 | where there is logic that is conditional upon earlier parsed values. 107 | 108 | .. _recursive-definitions-with-generate: 109 | 110 | Implementing recursive definitions 111 | ---------------------------------- 112 | 113 | A fourth example shows how you can use this syntax for grammars that you would 114 | like to define recursively (or mutually recursively). 115 | 116 | Say we want to be able to parse an s-expression like syntax which uses 117 | parentheses for grouping items into a tree structure, like the following:: 118 | 119 | (0 1 (2 3) (4 5 6) 7 8) 120 | 121 | A naive approach would be: 122 | 123 | .. code-block:: python 124 | 125 | simple = regex('[0-9]+').map(int) 126 | group = string('(') >> expr.sep_by(string(' ')) << string(')') 127 | expr = simple | group 128 | 129 | The problem is that the second line will get a ``NameError`` because ``expr`` is 130 | not defined yet. 131 | 132 | One way to solve this is to use :ref:`forward-declarations`. But another uses 133 | ``@generate``. 134 | 135 | Using the ``@generate`` syntax will introduce a level of laziness in resolving 136 | ``expr`` that allows things to work: 137 | 138 | .. code-block:: python 139 | 140 | simple = regex('[0-9]+').map(int) 141 | 142 | @generate 143 | def group(): 144 | return (yield string('(') >> expr.sep_by(string(' ')) << string(')')) 145 | 146 | expr = simple | group 147 | 148 | .. code-block:: python 149 | 150 | >>> expr.parse("(0 1 (2 3) (4 5 6) 7 8)") 151 | [0, 1, [2, 3], [4, 5, 6], 7, 8] 152 | -------------------------------------------------------------------------------- /docs/ref/index.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | API reference 3 | =============== 4 | 5 | ..
toctree:: 6 | :maxdepth: 3 7 | :caption: Contents: 8 | 9 | primitives 10 | methods_and_combinators 11 | generating 12 | parser_instances 13 | -------------------------------------------------------------------------------- /docs/ref/methods_and_combinators.rst: -------------------------------------------------------------------------------- 1 | ========================================= 2 | Parser methods, operators and combinators 3 | ========================================= 4 | 5 | Parser methods 6 | ============== 7 | 8 | Parser objects are returned by any of the built-in parser :doc:`primitives`. They 9 | can be used and manipulated as below. 10 | 11 | .. currentmodule:: parsy 12 | 13 | .. class:: Parser 14 | 15 | .. method:: __init__(wrapped_fn) 16 | 17 | This is a low level function to create new parsers that is used internally 18 | but is rarely needed by users of the parsy library. It should be passed a 19 | parsing function, which takes two arguments - a string/list to be parsed 20 | and the current index into the list - and returns a :class:`Result` object, 21 | as described in :doc:`/ref/parser_instances`. 22 | 23 | The following methods are for actually **using** the parsers that you have 24 | created: 25 | 26 | .. method:: parse(string_or_list) 27 | 28 | Attempts to parse the given string (or list). If the parse is successful 29 | and consumes the entire string, the result is returned - otherwise, a 30 | ``ParseError`` is raised. 31 | 32 | Instead of passing a string, you can in fact pass a list of tokens. Almost 33 | all the examples assume strings for simplicity. Some of the primitives are 34 | also clearly string specific, and a few of the combinators (such as 35 | :meth:`Parser.concat`) are string specific, but most of the rest of the 36 | library will work with tokens just as well. See :doc:`/howto/lexing` for 37 | more information. 38 | 39 | .. 
method:: parse_partial(string_or_list) 40 | 41 | Similar to ``parse``, except that it does not require the entire 42 | string (or list) to be consumed. Returns a tuple of 43 | ``(result, remainder)``, where ``remainder`` is the part of 44 | the string (or list) that was left over. 45 | 46 | The following methods are essentially **combinators** that produce new 47 | parsers from the existing one. They are provided as methods on ``Parser`` for 48 | convenience. More combinators are documented below. 49 | 50 | .. method:: desc(string) 51 | 52 | Adds a description to the parser, which is used in the error message 53 | if parsing fails. 54 | 55 | >>> year = regex(r'[0-9]{4}').desc('4 digit year') 56 | >>> year.parse('123') 57 | ParseError: expected 4 digit year at 0:0 58 | 59 | .. method:: then(other_parser) 60 | 61 | Returns a parser which, if the initial parser succeeds, will continue parsing 62 | with ``other_parser``. This will produce the value produced by 63 | ``other_parser``. 64 | 65 | .. code:: python 66 | 67 | >>> string('x').then(string('y')).parse('xy') 68 | 'y' 69 | 70 | See also :ref:`parser-rshift`. 71 | 72 | .. method:: skip(other_parser) 73 | 74 | Similar to :meth:`Parser.then`, except the resulting parser will use 75 | the value produced by the first parser. 76 | 77 | .. code:: python 78 | 79 | >>> string('x').skip(string('y')).parse('xy') 80 | 'x' 81 | 82 | See also :ref:`parser-lshift`. 83 | 84 | .. method:: many() 85 | 86 | Returns a parser that expects the initial parser 0 or more times, and 87 | produces a list of the results. Note that this parser does not fail if 88 | nothing matches, but instead consumes nothing and produces an empty list. 89 | 90 | .. code:: python 91 | 92 | >>> parser = regex(r'[a-z]').many() 93 | >>> parser.parse('') 94 | [] 95 | >>> parser.parse('abc') 96 | ['a', 'b', 'c'] 97 | 98 | .. 
method:: times(min [, max=min]) 99 | 100 | Returns a parser that expects the initial parser at least ``min`` times, 101 | and at most ``max`` times, and produces a list of the results. If only one 102 | argument is given, the parser is expected exactly that number of times. 103 | 104 | .. method:: at_most(n) 105 | 106 | Returns a parser that expects the initial parser at most ``n`` times, and 107 | produces a list of the results. 108 | 109 | .. method:: at_least(n) 110 | 111 | Returns a parser that expects the initial parser at least ``n`` times, and 112 | produces a list of the results. 113 | 114 | .. method:: until(other_parser, [min=0, max=inf, consume_other=False]) 115 | 116 | Returns a parser that expects the initial parser followed by ``other_parser``. 117 | The initial parser is expected at least ``min`` times and at most ``max`` times. 118 | By default, it does not consume ``other_parser`` and it produces a list of the 119 | results excluding ``other_parser``. If ``consume_other`` is ``True`` then 120 | ``other_parser`` is consumed and its result is included in the list of results. 121 | 122 | .. code:: python 123 | 124 | >>> seq(string('A').until(string('B')), string('BC')).parse('AAABC') 125 | [['A','A','A'], 'BC'] 126 | >>> string('A').until(string('B')).then(string('BC')).parse('AAABC') 127 | 'BC' 128 | >>> string('A').until(string('BC'), consume_other=True).parse('AAABC') 129 | ['A', 'A', 'A', 'BC'] 130 | 131 | .. versionadded:: 2.0 132 | 133 | .. method:: optional(default=None) 134 | 135 | Returns a parser that expects the initial parser zero or once, and maps 136 | the result to a given default value in the case of no match. If no default 137 | value is given, ``None`` is used. 138 | 139 | .. code:: python 140 | 141 | >>> string('A').optional().parse('A') 142 | 'A' 143 | >>> string('A').optional().parse('') 144 | None 145 | >>> string('A').optional('Oops').parse('') 146 | 'Oops' 147 | 148 | .. 
method:: map(map_function) 149 | 150 | Returns a parser that transforms the produced value of the initial parser 151 | with ``map_function``. 152 | 153 | .. code:: python 154 | 155 | >>> regex(r'[0-9]+').map(int).parse('1234') 156 | 1234 157 | 158 | This is the simplest way to convert parsed strings into the data types 159 | that you need. See also :meth:`combine` and :meth:`combine_dict` below. 160 | 161 | .. method:: combine(combine_fn) 162 | 163 | Returns a parser that transforms the produced values of the initial parser 164 | with ``combine_fn``, passing the arguments using ``*args`` syntax. 165 | 166 | Where the current parser produces an iterable of values, this can be a 167 | more convenient way to combine them than :meth:`~Parser.map`. 168 | 169 | Example 1 - the argument order of our callable already matches: 170 | 171 | .. code:: python 172 | 173 | >>> from datetime import date 174 | >>> yyyymmdd = seq(regex(r'[0-9]{4}').map(int), 175 | ... regex(r'[0-9]{2}').map(int), 176 | ... regex(r'[0-9]{2}').map(int)).combine(date) 177 | >>> yyyymmdd.parse('20140506') 178 | datetime.date(2014, 5, 6) 179 | 180 | Example 2 - the argument order of our callable doesn't match, and 181 | we need to adjust a parameter, so we can fix it using a lambda. 182 | 183 | .. code:: python 184 | 185 | >>> ddmmyy = regex(r'[0-9]{2}').map(int).times(3).combine( 186 | ... lambda d, m, y: date(2000 + y, m, d)) 187 | >>> ddmmyy.parse('060514') 188 | datetime.date(2014, 5, 6) 189 | 190 | The equivalent ``lambda`` to use with ``map`` would be ``lambda res: 191 | date(2000 + res[2], res[1], res[0])``, which is less readable. The version 192 | with ``combine`` also ensures that exactly 3 items are generated by the 193 | previous parser, otherwise you get a ``TypeError``. 194 | 195 | .. 
method:: combine_dict(fn) 196 | 197 | Returns a parser that transforms the value produced by the initial parser 198 | using the supplied function/callable, passing the arguments using the 199 | ``**kwargs`` syntax. 200 | 201 | The value produced by the initial parser must be a mapping/dictionary from 202 | names to values, or a list of two-tuples, or something else that can be 203 | passed to the ``dict`` constructor. 204 | 205 | If ``None`` is present as a key in the dictionary it will be removed 206 | before passing to ``fn``, as will all keys starting with ``_``. 207 | 208 | **Motivation:** 209 | 210 | For building complex objects, this can be more convenient, flexible and 211 | readable than :meth:`map` or :meth:`combine`, because by avoiding 212 | positional arguments we can avoid a dependence on the order of components 213 | in the string being parsed and in the argument order of callables being 214 | used. It is especially designed to be used in conjunction with :func:`seq` 215 | and :meth:`tag`. 216 | 217 | We can make use of the ``**kwargs`` version of :func:`seq` to produce a 218 | very readable definition: 219 | 220 | .. code:: python 221 | 222 | >>> ddmmyyyy = seq( 223 | ... day=regex(r'[0-9]{2}').map(int), 224 | ... month=regex(r'[0-9]{2}').map(int), 225 | ... year=regex(r'[0-9]{4}').map(int), 226 | ... ).combine_dict(date) 227 | >>> ddmmyyyy.parse('04052003') 228 | datetime.date(2003, 5, 4) 229 | 230 | (If that is hard to understand, use a Python REPL, and examine the result 231 | of the ``parse`` call if you remove the ``combine_dict`` call). 232 | 233 | Here we used ``datetime.date`` which accepts keyword arguments. For your 234 | own parsing needs you will often use custom data types. You can create 235 | these however you like, but we suggest `dataclasses 236 | `_ (stdlib), `attrs 237 | `_ or `pydantic 238 | `_. You can also use 239 | `namedtuple 240 | `_ 241 | for simple cases. 
242 | 243 | The following example shows the use of ``_`` as a prefix to remove 244 | elements you are not interested in, and the use of ``namedtuple`` to 245 | create a simple data-structure. 246 | 247 | .. code-block:: python 248 | 249 | >>> from collections import namedtuple 250 | >>> Pair = namedtuple('Pair', ['name', 'value']) 251 | >>> name = regex("[A-Za-z]+") 252 | >>> int_value = regex("[0-9]+").map(int) 253 | >>> bool_value = string("true").result(True) | string("false").result(False) 254 | >>> pair = seq( 255 | ... name=name, 256 | ... __eq=string('='), 257 | ... value=int_value | bool_value, 258 | ... __sc=string(';'), 259 | ... ).combine_dict(Pair) 260 | >>> pair.parse("foo=123;") 261 | Pair(name='foo', value=123) 262 | >>> pair.parse("BAR=true;") 263 | Pair(name='BAR', value=True) 264 | 265 | You could also use ``<<`` or ``>>`` for the unwanted parts (but in some 266 | cases this is less convenient): 267 | 268 | .. code-block:: python 269 | 270 | >>> pair = seq( 271 | ... name=name << string('='), 272 | ... value=(int_value | bool_value) << string(';') 273 | ... ).combine_dict(Pair) 274 | 275 | .. versionchanged:: 1.2 276 | Allow lists as well as dicts to be consumed, and filter out ``None``. 277 | 278 | .. versionchanged:: 1.3 279 | Stripping of args starting with ``_`` 280 | 281 | .. method:: tag(name) 282 | 283 | Returns a parser that wraps the produced value of the initial parser in a 284 | 2 tuple containing ``(name, value)``. This provides a very simple way to 285 | label parsed components. e.g.: 286 | 287 | .. code:: python 288 | 289 | >>> day = regex(r'[0-9]+').map(int) 290 | >>> month = string_from("January", "February", "March", "April", "May", 291 | ... "June", "July", "August", "September", "October", 292 | ... "November", "December") 293 | >>> day.parse("10") 294 | 10 295 | >>> day.tag("day").parse("10") 296 | ('day', 10) 297 | 298 | >>> seq(day.tag("day") << whitespace, 299 | ... month.tag("month") 300 | ... 
).parse("10 September") 301 | [('day', 10), ('month', 'September')] 302 | 303 | It also works well when combined with ``.map(dict)`` to get a dictionary 304 | of values: 305 | 306 | .. code:: python 307 | 308 | >>> seq(day.tag("day") << whitespace, 309 | ... month.tag("month") 310 | ... ).map(dict).parse("10 September") 311 | {'day': 10, 'month': 'September'} 312 | 313 | ... and with :meth:`combine_dict` to build other objects. 314 | 315 | Usually it is better to use :func:`seq` with keyword arguments if you want 316 | to produce a dictionary. 317 | 318 | .. method:: concat() 319 | 320 | Returns a parser that concatenates together (as a string) the previously 321 | produced values. Usually used after :meth:`~Parser.many` and similar 322 | methods that produce multiple values. 323 | 324 | .. code:: python 325 | 326 | >>> letter.at_least(1).parse("hello") 327 | ['h', 'e', 'l', 'l', 'o'] 328 | >>> letter.at_least(1).concat().parse("hello") 329 | 'hello' 330 | 331 | .. method:: result(val) 332 | 333 | Returns a parser that, if the initial parser succeeds, always produces 334 | ``val``. 335 | 336 | .. code:: python 337 | 338 | >>> string('foo').result(42).parse('foo') 339 | 42 340 | 341 | .. method:: should_fail(description) 342 | 343 | Returns a parser that fails when the initial parser succeeds, and succeeds 344 | when the initial parser fails (consuming no input). A description must 345 | be passed which is used in parse failure messages. 346 | 347 | This is essentially a negative lookahead: 348 | 349 | .. code:: python 350 | 351 | >>> p = letter << string(" ").should_fail("not space") 352 | >>> p.parse('A') 353 | 'A' 354 | >>> p.parse('A ') 355 | ParseError: expected 'not space' at 0:1 356 | 357 | It is also useful for implementing things like parsing repeatedly until a 358 | marker: 359 | 360 | .. code:: python 361 | 362 | >>> (string(";").should_fail("not ;") >> letter).many().concat().parse_partial('ABC;') 363 | ('ABC', ';') 364 | 365 | ..
method:: bind(fn) 366 | 367 | Returns a parser which, if the initial parser is successful, passes the 368 | result to ``fn``, and continues with the parser returned from ``fn``. This 369 | is the monadic binding operation. However, since we don't have Haskell's 370 | ``do`` notation in Python, using this is very awkward. Instead, you should 371 | look at :doc:`/ref/generating` which provides a much nicer syntax for those 372 | cases where you would have needed ``do`` notation in Parsec. 373 | 374 | .. method:: sep_by(sep, min=0, max=inf) 375 | 376 | Like :meth:`Parser.times`, this returns a new parser that repeats 377 | the initial parser and collects the results in a list, but in this case separated 378 | by the parser ``sep`` (whose return value is discarded). By default it 379 | repeats with no limit, but minimum and maximum values can be supplied. 380 | 381 | .. code:: python 382 | 383 | >>> csv = letter.at_least(1).concat().sep_by(string(",")) 384 | >>> csv.parse("abc,def") 385 | ['abc', 'def'] 386 | 387 | .. method:: mark() 388 | 389 | Returns a parser that wraps the initial parser's result in a value 390 | containing column and line information of the match, as well as the 391 | original value. The new value is a 3-tuple: 392 | 393 | .. code:: python 394 | 395 | ((start_row, start_column), 396 | original_value, 397 | (end_row, end_column)) 398 | 399 | This is useful for being able to report problems with parsing more 400 | accurately, especially if you are using parsy as a :doc:`lexer 401 | </howto/lexing>` and want subsequent parsing of the token stream to be 402 | able to report original positions in error messages etc. 403 | 404 | .. _operators: 405 | 406 | Parser operators 407 | ================ 408 | 409 | This section describes operators that you can use on :class:`Parser` objects to 410 | build new parsers. 411 | 412 | 413 | ..
_parser-or: 414 | 415 | ``|`` operator 416 | -------------- 417 | 418 | ``parser | other_parser`` 419 | 420 | Returns a parser that tries ``parser`` and, if it fails, backtracks 421 | and tries ``other_parser``. These can be chained together. 422 | 423 | The resulting parser will produce the value produced by the first 424 | successful parser. 425 | 426 | .. code:: python 427 | 428 | >>> parser = string('x') | string('y') | string('z') 429 | >>> parser.parse('x') 430 | 'x' 431 | >>> parser.parse('y') 432 | 'y' 433 | >>> parser.parse('z') 434 | 'z' 435 | 436 | Note that ``other_parser`` will only be tried if ``parser`` cannot consume any 437 | input and fails. ``other_parser`` is not used in the case that **later** parser 438 | components fail. This means that the order of the operands matters - for 439 | example: 440 | 441 | .. code:: python 442 | 443 | >>> ((string('A') | string('AB')) + string('C')).parse('ABC') 444 | ParseError: expected 'C' at 0:1 445 | >>> ((string('AB') | string('A')) + string('C')).parse('ABC') 446 | 'ABC' 447 | >>> ((string('AB') | string('A')) + string('C')).parse('AC') 448 | 'AC' 449 | 450 | .. _parser-lshift: 451 | 452 | ``<<`` operator 453 | --------------- 454 | 455 | ``parser << other_parser`` 456 | 457 | The same as ``parser.skip(other_parser)`` - see :meth:`Parser.skip`. 458 | 459 | (Hint - the arrows point at the important parser!) 460 | 461 | .. code:: python 462 | 463 | >>> (string('x') << string('y')).parse('xy') 464 | 'x' 465 | 466 | .. _parser-rshift: 467 | 468 | ``>>`` operator 469 | --------------- 470 | 471 | ``parser >> other_parser`` 472 | 473 | The same as ``parser.then(other_parser)`` - see :meth:`Parser.then`. 474 | 475 | (Hint - the arrows point at the important parser!) 476 | 477 | .. code-block:: python 478 | 479 | >>> (string('x') >> string('y')).parse('xy') 480 | 'y' 481 | 482 | 483 | ..
_parser-plus: 484 | 485 | ``+`` operator 486 | -------------- 487 | 488 | ``parser1 + parser2`` 489 | 490 | Requires both parsers to match in order, and adds the two results together using 491 | the + operator. This will only work if the results support the plus operator 492 | (e.g. strings and lists): 493 | 494 | 495 | .. code-block:: python 496 | 497 | >>> (string("x") + regex("[0-9]")).parse("x1") 498 | "x1" 499 | 500 | >>> (string("x").many() + regex("[0-9]").map(int).many()).parse("xx123") 501 | ['x', 'x', 1, 2, 3] 502 | 503 | The plus operator is a convenient shortcut for: 504 | 505 | >>> seq(parser1, parser2).combine(lambda a, b: a + b) 506 | 507 | .. _parser-times: 508 | 509 | ``*`` operator 510 | -------------- 511 | 512 | ``parser1 * number`` 513 | 514 | This is a shortcut for doing :meth:`Parser.times`: 515 | 516 | .. code-block:: python 517 | 518 | >>> (string("x") * 3).parse("xxx") 519 | ["x", "x", "x"] 520 | 521 | You can also set both upper and lower bounds by multiplying by a range: 522 | 523 | .. code-block:: python 524 | 525 | >>> (string("x") * range(0, 3)).parse("xxx") 526 | ParseError: expected EOF at 0:2 527 | 528 | (Note the normal semantics of ``range`` are respected - the second number is an 529 | *exclusive* upper bound, not inclusive). 530 | 531 | Parser combinators 532 | ================== 533 | 534 | .. function:: alt(*parsers) 535 | 536 | Creates a parser from the passed in argument list of alternative parsers, 537 | which are tried in order, moving to the next one if the current one fails, as 538 | per the :ref:`parser-or` - in other words, it matches any one of the 539 | alternative parsers. 540 | 541 | Example using ``*args`` syntax to pass a list of parsers that have been 542 | generated by mapping :func:`string` over a list of characters: 543 | 544 | .. 
code-block:: python 545 | 546 | >>> hexdigit = alt(*map(string, "0123456789abcdef")) 547 | 548 | (In this case you would be better off using :func:`char_from`) 549 | 550 | Note that the order of arguments matters, as described in :ref:`parser-or`. 551 | 552 | .. function:: seq(*parsers, **kw_parsers) 553 | 554 | Creates a parser that runs a sequence of parsers in order and combines 555 | their results in a list. 556 | 557 | 558 | .. code-block:: python 559 | 560 | >>> x_bottles_of_y_on_the_z = \ 561 | ... seq(regex(r"[0-9]+").map(int) << string(" bottles of "), 562 | ... regex(r"\S+") << string(" on the "), 563 | ... regex(r"\S+") 564 | ... ) 565 | >>> x_bottles_of_y_on_the_z.parse("99 bottles of beer on the wall") 566 | [99, 'beer', 'wall'] 567 | 568 | 569 | You can also use :func:`seq` with keyword arguments instead of positional 570 | arguments. In this case, the produced value is a dictionary of the individual 571 | values, rather than a sequence. This can make the produced value easier to 572 | consume. 573 | 574 | .. code-block:: python 575 | 576 | >>> name = seq(first_name=regex(r"\S+") << whitespace, 577 | ... last_name=regex(r"\S+")) 578 | >>> name.parse("Jane Smith") 579 | {'first_name': 'Jane', 580 | 'last_name': 'Smith'} 581 | 582 | .. versionchanged:: 1.1 583 | Added ``**kwargs`` option. 584 | 585 | .. note:: 586 | As an alternative, see :meth:`Parser.tag` for a way of labelling parsed 587 | components and producing dictionaries. 588 | 589 | 590 | Other combinators 591 | ================= 592 | 593 | Parsy does not try to include every possible combinator - there is no reason why 594 | you cannot create your own for your needs using the built-in combinators and 595 | primitives. If you find something that is very generic and would be very useful 596 | to have as a built-in, please :doc:`submit </contributing>` as a PR!
597 | -------------------------------------------------------------------------------- /docs/ref/parser_instances.rst: -------------------------------------------------------------------------------- 1 | ============================= 2 | Creating new Parser instances 3 | ============================= 4 | 5 | .. currentmodule:: parsy 6 | 7 | Normally you will create Parser instances using the provided :doc:`primitives 8 | ` and :doc:`combinators `. 9 | 10 | However it is also possible to create them manually, as below. 11 | 12 | The :class:`Parser` constructor should be passed a function that takes the 13 | string/list to be parsed and an index into that string, and returns a 14 | :class:`Result` object. The ``Result`` object will be created either using 15 | :meth:`Result.success` or :meth:`Result.failure` to indicate success or failure 16 | respectively. :meth:`Result.success` should be passed the next index to continue 17 | parsing with, and the value that is returned from the parsing. 18 | :meth:`Result.failure` should return the index at which failure occurred i.e. 19 | the index passed in, and a string indicating what the parser expected to find. 20 | 21 | The ``Parser`` constructor will usually be called using decorator syntax. In 22 | order to pass parameters to the ``Parser`` instance, it is typically created 23 | using a closure. In the example below, we create a parser that matches any 24 | string/list of tokens of a given length. This could also be written as something 25 | like ``any_char.times(n).concat()`` but the following will be more efficient: 26 | 27 | 28 | .. code-block:: python 29 | 30 | def consume(n): 31 | 32 | @Parser 33 | def consumer(stream, index): 34 | items = stream[index:index + n] 35 | if len(items) == n: 36 | return Result.success(index + n, items) 37 | else: 38 | return Result.failure(index, "{0} items".format(n)) 39 | 40 | return consumer 41 | 42 | 43 | .. 
code-block:: python 44 | 45 | >>> consume(3).many().parse('abc123def') 46 | ['abc', '123', 'def'] 47 | 48 | 49 | Result objects 50 | ============== 51 | 52 | .. class:: Result 53 | 54 | .. staticmethod:: success(next_index, value) 55 | 56 | Creates a ``Result`` object indicating parsing succeeded. The index to 57 | continue parsing at, and the value retrieved from the parsing, should be 58 | passed. 59 | 60 | .. staticmethod:: failure(index, expected) 61 | 62 | Creates a ``Result`` object indicating parsing failed. The index to 63 | continue parsing at, and a string representing what the parser expected to 64 | find, should be passed. 65 | -------------------------------------------------------------------------------- /docs/ref/primitives.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Parsing primitives 3 | ================== 4 | 5 | These are the lowest level building blocks for creating parsers. 6 | 7 | .. module:: parsy 8 | 9 | .. function:: string(expected_string, transform=None) 10 | 11 | Returns a parser that expects the ``expected_string`` and produces 12 | that string value. 13 | 14 | Optionally, a transform function can be passed, which will be used on both 15 | the expected string and tested string. This allows things like case 16 | insensitive matches to be done. This function must not change the length of 17 | the string (as determined by ``len``). The returned value of the parser will 18 | always be ``expected_string`` in its un-transformed state. 19 | 20 | .. code-block:: python 21 | 22 | >>> parser = string("Hello", transform=lambda s: s.upper()) 23 | >>> parser.parse("Hello") 24 | 'Hello' 25 | >>> parser.parse("hello") 26 | 'Hello' 27 | >>> parser.parse("HELLO") 28 | 'Hello' 29 | 30 | .. versionchanged:: 1.2 31 | Added ``transform`` argument. 32 | 33 | .. 
function:: regex(exp, flags=0, group=0) 34 | 35 | Returns a parser that expects the given ``exp``, and produces the 36 | matched string. ``exp`` can be a compiled regular expression, or a 37 | string which will be compiled with the given ``flags``. 38 | 39 | Optionally, accepts ``group``, which is passed to `re.Match.group 40 | `_ to 41 | return the text from a capturing group in the regex instead of the 42 | entire match. 43 | 44 | Using a regex parser for small building blocks, instead of building up 45 | parsers from primitives like :func:`string`, :func:`test_char` and 46 | :meth:`Parser.times` combinators etc., can have several advantages, 47 | including: 48 | 49 | * It can be more succinct e.g. compare: 50 | 51 | .. code-block:: python 52 | 53 | >>> (string('a') | string('b')).times(1, 4) 54 | >>> regex(r'[ab]{1,4}') 55 | 56 | * It can return the entire matched string as a single item, 57 | so you don't need to use :meth:`Parser.concat`. 58 | * It can return a part of the matched string using a capturing group 59 | from the regex, so you don't need to split the string yourself. 60 | 61 | You can use named or numbered groups, just like with `re.Match.group 62 | `_. 63 | Tuples also work, and return the captured text from multiple groups. 64 | 65 | .. code-block:: python 66 | 67 | >>> regex(r'([0-9]{4})-([0-9]{2})', group=1).parse('2020-03') 68 | '2020' 69 | >>> regex(r'(?P[0-9]{4})-(?P[0-9]{2})', group='month').parse('2020-03') 70 | '03' 71 | >>> regex(r'([0-9]{4})-([0-9]{2})', group=(1,2)).parse('2020-03') 72 | ('2020', '03') 73 | 74 | * It can be much faster. 75 | 76 | .. function:: test_char(func, description) 77 | 78 | Returns a parser that tests a single character with the callable 79 | ``func``. If ``func`` returns ``True``, the parse succeeds, otherwise 80 | the parse fails with the description ``description``. 81 | 82 | .. code-block:: python 83 | 84 | >>> ascii = test_char(lambda c: ord(c) < 128, 85 | ... 
'ascii character') 86 | >>> ascii.parse('A') 87 | 'A' 88 | 89 | .. function:: test_item(func, description) 90 | 91 | Returns a parser that tests a single item from the list of items being 92 | consumed, using the callable ``func``. If ``func`` returns ``True``, the 93 | parse succeeds, otherwise the parse fails with the description 94 | ``description``. 95 | 96 | If you are parsing a string, i.e. a list of characters, you can use 97 | :func:`test_char` instead. (In fact the implementations are identical, these 98 | functions are aliases for the sake of clear code). 99 | 100 | .. code-block:: python 101 | 102 | >>> numeric = test_item(str.isnumeric, 'numeric') 103 | >>> numeric.many().parse(['123', '456']) 104 | ['123', '456'] 105 | 106 | .. function:: char_from(characters) 107 | 108 | Accepts a string and returns a parser that matches and returns one character 109 | from the string. 110 | 111 | .. code-block:: python 112 | 113 | >>> char_from('abc').parse('a') 114 | 'a' 115 | 116 | .. function:: string_from(*strings, transform=None) 117 | 118 | Accepts a sequence of strings as positional arguments, and returns a parser 119 | that matches and returns one string from the list. The list is first sorted 120 | in descending length order, so that overlapping strings are handled correctly 121 | by checking the longest one first. 122 | 123 | .. code-block:: python 124 | 125 | >>> string_from('y', 'yes').parse('yes') 126 | 'yes' 127 | 128 | Optionally accepts ``transform``, which is passed to :func:`string` (see the 129 | documentation there). 130 | 131 | .. versionchanged:: 1.2 132 | Added ``transform`` argument. 133 | 134 | 135 | .. function:: match_item(item, description=None) 136 | 137 | Returns a parser that tests the next item (or character) from the stream (or 138 | string) for equality against the provided item. Optionally a string 139 | description can be passed. 
140 | 141 | Parsing a string: 142 | 143 | >>> letter_A = match_item('A') 144 | >>> letter_A.parse_partial('ABC') 145 | ('A', 'BC') 146 | 147 | Parsing a list of tokens: 148 | 149 | >>> hello = match_item('hello') 150 | >>> hello.parse_partial(['hello', 'how', 'are', 'you']) 151 | ('hello', ['how', 'are', 'you']) 152 | 153 | .. data:: eof 154 | 155 | A parser that only succeeds if the end of the stream has been reached. 156 | 157 | >>> eof.parse_partial("") 158 | (None, '') 159 | >>> eof.parse_partial("123") 160 | Traceback (most recent call last): 161 | ... 162 | parsy.ParseError: expected 'EOF' at 0:0 163 | 164 | .. function:: success(val) 165 | 166 | Returns a parser that does not consume any of the stream, but 167 | produces ``val``. 168 | 169 | .. function:: fail(expected) 170 | 171 | Returns a parser that always fails with the provided error message. 172 | 173 | .. function:: from_enum(enum_cls, transform=None) 174 | 175 | Given a class that is an `enum.Enum 176 | `_ class, returns a parser that 177 | will parse the values (or the string representations of the values) and 178 | return the corresponding enum item. 179 | 180 | .. code-block:: python 181 | 182 | >>> from enum import Enum 183 | >>> class Pet(Enum): 184 | ... CAT = "cat" 185 | ... DOG = "dog" 186 | >>> pet = from_enum(Pet) 187 | >>> pet.parse("cat") 188 | 189 | 190 | ``str`` is first run on the values (for the case of values that are integers 191 | etc.) to create the strings which are turned into parsers using 192 | :func:`string`. 193 | 194 | If ``transform`` is provided, it is passed to :func:`string` when creating 195 | the parser (allowing for things like case insensitive parsing). 196 | 197 | .. function:: peek(parser) 198 | 199 | Returns a lookahead parser that parses the input stream without consuming 200 | chars. 201 | 202 | .. 
code-block:: python 203 | 204 | >>> peek(any_char).parse_partial("ABC") 205 | ('A', 'ABC') 206 | 207 | Pre-built parsers 208 | ================= 209 | 210 | Some common, pre-built parsers (all of these are :class:`Parser` objects created 211 | using the primitives above): 212 | 213 | 214 | .. data:: any_char 215 | 216 | A parser that matches any single character. 217 | 218 | .. data:: whitespace 219 | 220 | A parser that matches and returns one or more whitespace characters. 221 | 222 | .. data:: letter 223 | 224 | A parser that matches and returns a single letter, as defined by 225 | `str.isalpha `_. 226 | 227 | .. data:: digit 228 | 229 | A parser that matches and returns a single digit, as defined by `str.isdigit 230 | `_. Note that 231 | this includes various unicode characters outside of the normal 0-9 range, 232 | such as ¹²³. 233 | 234 | .. data:: decimal_digit 235 | 236 | A parser that matches and returns a single decimal digit, one of 237 | "0123456789". 238 | 239 | .. data:: line_info 240 | 241 | A parser that consumes no input and always just returns the current line 242 | information, a tuple of (line, column), zero-indexed, where lines are 243 | terminated by ``\n``. This is normally useful when wanting to build more 244 | debugging information into parse failure error messages. 245 | 246 | .. data:: index 247 | 248 | A parser that consumes no input and always just returns the current stream 249 | index. This is normally useful when wanting to build more debugging 250 | information into parse failure error messages. 251 | 252 | 253 | .. _forward-declarations: 254 | 255 | Forward declarations 256 | ==================== 257 | 258 | .. class:: forward_declaration 259 | 260 | When defining parsers for a recursive grammar, you may run into ``NameError`` 261 | problems with a naive approach, because you can’t refer to a Python object 262 | before you have defined it. In this case, :class:`forward_declaration` can be 263 | useful. 
264 | 265 | Say we want to be able to parse an s-expression like syntax which uses 266 | parenthesis for grouping items into a tree structure, like the following:: 267 | 268 | (0 1 (2 3) (4 5 6) 7 8) 269 | 270 | A naive approach would be: 271 | 272 | .. code-block:: python 273 | 274 | simple = regex('[0-9]+').map(int) 275 | group = string('(') >> expr.sep_by(string(' ')) << string(')') 276 | expr = simple | group 277 | 278 | The problem is that the second line will get a ``NameError`` because ``expr`` is 279 | not defined yet, and we’ll have the same problem if we put the ``expr`` 280 | definition first. 281 | 282 | We can solve it like this: 283 | 284 | .. code-block:: python 285 | 286 | from parsy import forward_declaration, regex, string 287 | 288 | expr = forward_declaration() 289 | simple = regex('[0-9]+').map(int) 290 | group = string('(') >> expr.sep_by(string(' ')) << string(')') 291 | expr.become(simple | group) 292 | 293 | 294 | You must use ``.become()`` method exactly once before attempting to use the 295 | parser. 296 | 297 | An alternative to this is to use ``generate`` as described in 298 | :ref:`recursive-definitions-with-generate`. 299 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=4.3.0 2 | sphinx-rtd-theme>=1.2.0rc3,<1.3 3 | -------------------------------------------------------------------------------- /docs/tutorial.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Tutorial 3 | ======== 4 | 5 | .. currentmodule:: parsy 6 | 7 | First :doc:`install parsy `, and check that the documentation you 8 | are reading matches the version you just installed. 9 | 10 | Building an ISO 8601 parser 11 | =========================== 12 | 13 | In this tutorial, we are going to gradually build a parser for a subset of an 14 | ISO 8601 date. 
Specifically, we want to handle dates that look like this: 15 | ``2017-09-25``. 16 | 17 | A problem of this size could admittedly be solved fairly easily with regexes. 18 | But very quickly regexes don’t scale, especially when it comes to getting the 19 | parsed data out, and for this tutorial we need to start with a simple example. 20 | 21 | With parsy, you start by breaking the problem down into the smallest components. 22 | So we need first to match the 4 digit year at the beginning. 23 | 24 | There are various ways we can do this, but a regex works nicely, and 25 | :func:`regex` is a built-in primitive of the parsy library: 26 | 27 | .. code-block:: python 28 | 29 | >>> from parsy import regex 30 | >>> year = regex(r"[0-9]{4}") 31 | 32 | (For those who don’t know regular expressions, the regex ``[0-9]{4}`` means 33 | “match any character from 0123456789 exactly 4 times”.) 34 | 35 | This has produced a :class:`Parser` object which has various methods. We can 36 | immediately check that it works using the :meth:`Parser.parse` method: 37 | 38 | .. code-block:: python 39 | 40 | >>> year.parse("2017") 41 | '2017' 42 | >>> year.parse("abc") 43 | ParseError: expected '[0-9]{4}' at 0:0 44 | 45 | Notice first of all that a parser consumes input (the value we pass to 46 | ``parse``), and it produces an output. In the case of ``regex``, the produced 47 | output is the string that was matched, but this doesn’t have to be the case for 48 | all parsers. 49 | 50 | If there is no match, it raises a ``ParseError``. 51 | 52 | Notice as well that the :meth:`Parser.parse` method expects to consume all the 53 | input, so if there are extra characters at the end, even if it is just 54 | whitespace, parsing will fail with a message saying it expected EOF (End Of 55 | File/Data): 56 | 57 | .. 
code-block:: python 58 | 59 | >>> year.parse("2017 ") 60 | ParseError: expected 'EOF' at 0:4 61 | 62 | You can use :meth:`Parser.parse_partial` if you want to just keep parsing as far 63 | as possible and not throw an exception. 64 | 65 | To parse the data, we need to parse months, days, and the dash symbol, so we’ll 66 | add those: 67 | 68 | .. code-block:: python 69 | 70 | >>> from parsy import string 71 | >>> month = regex("[0-9]{2}") 72 | >>> day = regex("[0-9]{2}") 73 | >>> dash = string("-") 74 | 75 | We’ve added use of the :func:`string` primitive here, that matches just the 76 | string passed in, and returns that string. 77 | 78 | Next we need to combine these parsers into something that will parse the whole 79 | date. The simplest way is to use the :meth:`Parser.then` method: 80 | 81 | .. code-block:: python 82 | 83 | >>> fulldate = year.then(dash).then(month).then(dash).then(day) 84 | 85 | The ``then`` method returns a new parser that requires the first parser to 86 | succeed, followed by the second parser (the argument to the method). 87 | 88 | We could also write this using the :ref:`parser-rshift` which 89 | does the same thing as :meth:`Parser.then`: 90 | 91 | .. code-block:: python 92 | 93 | >>> fulldate = year >> dash >> month >> dash >> day 94 | 95 | This parser has some problems which we need to address, but it is already useful 96 | as a basic validator: 97 | 98 | .. code-block:: python 99 | 100 | >>> fulldate.parse("2017-xx") 101 | ParseError: expected '[0-9]{2}' at 0:5 102 | >>> fulldate.parse("2017-01") 103 | ParseError: expected '-' at 0:7 104 | >>> fulldate.parse("2017-02-01") 105 | '01' 106 | 107 | If the parse doesn’t succeed, we’ll get ``ParseError``, otherwise it is valid 108 | (at least as far as the basic syntax checks we’ve added). 109 | 110 | The first problem with this parser is that it doesn’t return a very useful 111 | value. 
Due to the way that :meth:`Parser.then` works, when it combines two 112 | parsers to produce a larger one, the value from the first parser is discarded, 113 | and the value returned by the second parser is the overall return value. So, we 114 | end up getting only the 'day' component as the result of our parse. We really 115 | want the year, month and day packaged up nicely, and converted to integers. 116 | 117 | A second problem is that our error messages are not very friendly. 118 | 119 | Our first attempt at fixing these might be to use the :ref:`parser-plus` instead 120 | of ``then``. This operator is defined to combine the results of the two parsers 121 | using the normal plus operator, which will work fine on strings: 122 | 123 | >>> fulldate = year + dash + month + dash + day 124 | >>> fulldate.parse("2017-02-01") 125 | '2017-02-01' 126 | 127 | However, it won’t help us if we want to split our data up into a set of 128 | integers. 129 | 130 | Our first step should actually be to work on the year, month and day components 131 | using :meth:`Parser.map`, which allows us to convert the strings to other 132 | objects - in our case we want integers. 133 | 134 | We can also use the :meth:`Parser.desc` method to give nicer error messages, so 135 | our components now look like this: 136 | 137 | .. code-block:: python 138 | 139 | >>> year = regex("[0-9]{4}").map(int).desc("4 digit year") 140 | >>> month = regex("[0-9]{2}").map(int).desc("2 digit month") 141 | >>> day = regex("[0-9]{2}").map(int).desc("2 digit day") 142 | 143 | We get better error messages now: 144 | 145 | .. code-block:: python 146 | 147 | >>> year.then(dash).then(month).parse("2017-xx") 148 | ParseError: expected '2 digit month' at 0:5 149 | 150 | 151 | Notice that the ``map`` and ``desc`` methods, like all similar methods on 152 | ``Parser``, return new parser objects - they do not modify the existing one. 
153 | This allows us to build up parsers with a 'fluent' interface, and avoid problems 154 | caused by mutating objects. 155 | 156 | However, we still need a way to package up the year, month and day as separate 157 | values. 158 | 159 | The :func:`seq` combinator provides one easy way to do that. It takes the 160 | sequence of parsers that are passed in as arguments, and returns a parser that 161 | runs each parser in order and combines their results into a list: 162 | 163 | .. code-block:: python 164 | 165 | >>> from parsy import seq 166 | >>> fulldate = seq(year, dash, month, dash, day) 167 | >>> fulldate.parse("2017-01-02") 168 | [2017, '-', 1, '-', 2] 169 | 170 | Now, we don’t need those dashes, so we can eliminate them using the :ref:`parser-rshift` or :ref:`parser-lshift`: 171 | 172 | .. code-block:: python 173 | 174 | >>> fulldate = seq(year << dash, month << dash, day) 175 | >>> fulldate.parse("2017-01-02") 176 | [2017, 1, 2] 177 | 178 | At this point, we could also convert this to a date object if we wanted using 179 | :meth:`Parser.combine`, which passes the produced sequence to another function 180 | using ``*args`` syntax. 181 | 182 | .. code-block:: python 183 | 184 | >>> from datetime import date 185 | >>> fulldate = seq(year << dash, month << dash, day).combine(date) 186 | 187 | This works because the positional argument order of ``date`` matches the order 188 | of the values parsed i.e. (year, month, day). 189 | 190 | A slightly more readable and flexible version would use the keyword argument 191 | version of :func:`seq`, followed by :meth:`Parser.combine_dict`. Putting 192 | everything together for our final solution: 193 | 194 | .. 
code-block:: python 195 | 196 | from datetime import date 197 | from parsy import regex, seq, string 198 | 199 | year = regex("[0-9]{4}").map(int).desc("4 digit year") 200 | month = regex("[0-9]{2}").map(int).desc("2 digit month") 201 | day = regex("[0-9]{2}").map(int).desc("2 digit day") 202 | dash = string("-") 203 | 204 | fulldate = seq( 205 | year=year << dash, 206 | month=month << dash, 207 | day=day, 208 | ).combine_dict(date) 209 | 210 | Breaking that down: 211 | 212 | * for clarity, and to allow us test separately, we have defined individual 213 | parsers for the YYYY, MM and DD components. 214 | 215 | * the ``seq`` call produces a parser that parses the year, month and day 216 | components in order, discarding the dashes, to produce a dictionary like this: 217 | 218 | .. code-block:: python 219 | 220 | { 221 | "year": 2017, 222 | "month": 1, 223 | "day": 2, 224 | } 225 | 226 | * when we chain the ``combine_dict`` call, we have a parser that passes this 227 | dictionary to the ``date`` constructor using ``**kwargs`` syntax, so we end up 228 | calling ``date(year=2017, month=1, day=2)`` 229 | 230 | 231 | So now it does exactly what we want: 232 | 233 | .. code-block:: python 234 | 235 | >>> fulldate.parse("2017-02-01") 236 | datetime.date(2017, 2, 1) 237 | 238 | 239 | .. _using-previous-values: 240 | 241 | Using previously parsed values 242 | ============================== 243 | 244 | Now, sometimes we might want to do more complex logic with the values that are 245 | collected as parse results, and do so while we are still parsing. 246 | 247 | To continue our example, the above parser has a problem that it will raise an 248 | exception if the day and month values are not valid. We’d like to be able to 249 | check this, and produce a parse error instead, which will make our parser play 250 | better with others if we want to use it to build something bigger. 
251 | 252 | Also, in ISO8601, strictly speaking you can just write the year, or the year and 253 | the month, and leave off the other parts. We’d like to handle that by returning 254 | a tuple for the result, and ``None`` for the missing data. 255 | 256 | To do this, we need to allow the parse to continue if the later components (with 257 | their leading dashes) are missing - that is, we need to express optional 258 | components, and we need a way to be able to test earlier values while in the 259 | middle of parsing, to see if we should continue looking for another component. 260 | 261 | The :meth:`Parser.bind` method provides one way to do it (yay monads!). 262 | Unfortunately, it gets ugly pretty fast, and in Python we don’t have Haskell’s 263 | ``do`` notation to tidy it up. But thankfully we can use generators and the 264 | ``yield`` keyword to great effect. 265 | 266 | We use a generator function and convert it into a parser by using the 267 | :func:`generate` decorator. The idea is that you ``yield`` every parser that you 268 | want to run, and receive the result of that parser as the value of the yield 269 | expression. You can then put parsers together using any logic you like, and 270 | finally return the value. 271 | 272 | An equivalent parser to the one above can be written like this: 273 | 274 | .. code-block:: python 275 | 276 | from parsy import generate 277 | 278 | @generate 279 | def fulldate(): 280 | y = yield year 281 | yield dash # implicit skip, since we do nothing with the value 282 | m = yield month 283 | yield dash 284 | d = yield day 285 | return date(y, m, d) 286 | 287 | Notice how this follows the previous definition of ``fulldate`` using ``seq`` 288 | with keyword arguments. It’s more verbose than before, but provides a good 289 | starting point for our next set of requirements. 
290 | 291 | First of all, we need to express optional components - that is we need to be 292 | able to handle missing dashes, and return what we’ve got so far rather than 293 | failing the whole parse. 294 | 295 | :class:`Parser` has a set of methods that convert parsers into ones that allow 296 | multiples of the parser - including :meth:`Parser.many`, :meth:`Parser.times`, 297 | :meth:`Parser.at_most` and :meth:`Parser.at_least`. There is also 298 | :meth:`Parser.optional` which allows matching zero times (in which case the 299 | parser will return the default value specified or ``None`` otherwise), 300 | or exactly once - just what we need in this case. 301 | 302 | We also need to do checking on the month and the day. We’ll take a shortcut and 303 | use the built-in ``datetime.date`` class to do the validation for us. However, 304 | rather than allow exceptions to be raised, we convert the exception into a 305 | parsing failure. 306 | 307 | 308 | .. code-block:: python 309 | 310 | from parsy import fail, generate 311 | 312 | optional_dash = dash.optional() 313 | 314 | @generate 315 | def full_or_partial_date(): 316 | d = None 317 | m = None 318 | y = yield year 319 | dash1 = yield optional_dash 320 | if dash1 is not None: 321 | m = yield month 322 | dash2 = yield optional_dash 323 | if dash2 is not None: 324 | d = yield day 325 | if m is not None: 326 | if m < 1 or m > 12: 327 | return fail("month must be in 1..12") 328 | if d is not None: 329 | try: 330 | datetime.date(y, m, d) 331 | except ValueError as e: 332 | return fail(e.args[0]) 333 | 334 | return (y, m, d) 335 | 336 | 337 | This now works as expected: 338 | 339 | ..
code-block:: python 340 | 341 | >>> full_or_partial_date.parse("2017-02") 342 | (2017, 2, None) 343 | >>> full_or_partial_date.parse("2017-02-29") 344 | ParseError: expected 'day is out of range for month' at 0:10 345 | 346 | We could of course use a custom object in the final line to return a more 347 | convenient data type, if wanted. 348 | 349 | Alternatives and backtracking 350 | ============================= 351 | 352 | Suppose we are using our date parser to scrape dates off articles on a web site. 353 | We then discover that for recently published articles, instead of printing a 354 | timestamp, they write "X days ago". 355 | 356 | We want to parse this, and we’ll use a timedelta object to represent the value 357 | (to easily distinguish it from other values and consume it later). We can write 358 | a parser for this using tools we’ve seen already: 359 | 360 | .. code-block:: python 361 | 362 | >>> days_ago = regex("[0-9]+").map(lambda d: timedelta(days=-int(d))) << string(" days ago") 363 | >>> days_ago.parse("5 days ago") 364 | datetime.timedelta(-5) 365 | 366 | Now we need to combine it with our date parser, and allow either to succeed. 367 | This is done using the :ref:`parser-or`, as follows: 368 | 369 | 370 | .. code-block:: python 371 | 372 | >>> flexi_date = full_or_partial_date | days_ago 373 | >>> flexi_date.parse("2012-01-05") 374 | (2012, 1, 5) 375 | >>> flexi_date.parse("2 days ago") 376 | datetime.timedelta(-2) 377 | 378 | Notice that you still get good error messages from the appropriate parser, 379 | depending on which parser got furthest before returning a failure: 380 | 381 | .. code-block:: python 382 | 383 | >>> flexi_date.parse("2012-") 384 | ParseError: expected '2 digit month' at 0:5 385 | >>> flexi_date.parse("2 years ago") 386 | ParseError: expected ' days ago' at 0:1 387 | 388 | When using backtracking, you need to understand that backtracking to the other 389 | option only occurs if the first parser fails. 
So, for example: 390 | 391 | .. code-block:: python 392 | 393 | >>> a = string("a") 394 | >>> ab = string("ab") 395 | >>> c = string("c") 396 | >>> a_or_ab_and_c = ((a | ab) + c) 397 | >>> a_or_ab_and_c.parse("ac") 398 | 'ac' 399 | >>> a_or_ab_and_c.parse("abc") 400 | ParseError: expected 'c' at 0:1 401 | 402 | The parse fails because the ``a`` parser succeeds, and so the ``ab`` parser is 403 | never tried. This is different from most regular expression engines, where 404 | backtracking is done over the whole regex by default. 405 | 406 | In this case we can get the parse to succeed by switching the order: 407 | 408 | .. code-block:: python 409 | 410 | >>> ((ab | a) + c).parse("abc") 411 | 'abc' 412 | 413 | >>> ((ab | a) + c).parse("ac") 414 | 'ac' 415 | 416 | We could also fix it like this: 417 | 418 | .. code-block:: python 419 | 420 | >>> ((a + c) | (ab + c)).parse("abc") 421 | 'abc' 422 | 423 | 424 | Custom data structures 425 | ====================== 426 | 427 | In the example shown so far, the result of parsing has been a native Python data 428 | type, such as a integer, string, datetime or tuple. In some cases that is 429 | enough, but very quickly you will find that for your parse result to be useful, 430 | you will need to use custom data structures (rather than ending up with nested 431 | lists etc.) 432 | 433 | For defining custom data structures, you can use any method you like (e.g. 434 | simple classes). We suggest `dataclasses 435 | `_ (stdlib), `attrs 436 | `_ or `pydantic 437 | `_. You can also use `namedtuple 438 | `_ 439 | for simple cases. 440 | 441 | For combining parsed data into these data structures, you can: 442 | 443 | 1. Use :meth:`Parser.map`, :meth:`Parser.combine` and :meth:`Parser.combine_dict`, 444 | often in conjunction with :func:`seq`. 445 | 446 | See the :doc:`SQL SELECT example 447 | ` for an example of this approach. 448 | 449 | 2. 
Use the ``@generate`` decorator as above, and manually call the data 450 | structure constructor with the pieces, as in ``full_date`` or 451 | ``full_or_partial_date`` above, but with your own data structure instead of a 452 | tuple or datetime in the final line. 453 | 454 | 455 | Learn more 456 | ========== 457 | 458 | For further topics, see the :doc:`table of contents ` for the rest of 459 | the documentation that should enable you to build parsers for your needs. 460 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/python-parsy/parsy/3b72c71bf9570d73ce50477cf503fd5544c1c4b1/examples/__init__.py -------------------------------------------------------------------------------- /examples/json.py: -------------------------------------------------------------------------------- 1 | from parsy import forward_declaration, regex, seq, string 2 | 3 | # Utilities 4 | whitespace = regex(r"\s*") 5 | lexeme = lambda p: p << whitespace 6 | 7 | # Punctuation 8 | lbrace = lexeme(string("{")) 9 | rbrace = lexeme(string("}")) 10 | lbrack = lexeme(string("[")) 11 | rbrack = lexeme(string("]")) 12 | colon = lexeme(string(":")) 13 | comma = lexeme(string(",")) 14 | 15 | # Primitives 16 | true = lexeme(string("true")).result(True) 17 | false = lexeme(string("false")).result(False) 18 | null = lexeme(string("null")).result(None) 19 | number = lexeme(regex(r"-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?")).map(float) 20 | string_part = regex(r'[^"\\]+') 21 | string_esc = string("\\") >> ( 22 | string("\\") 23 | | string("/") 24 | | string('"') 25 | | string("b").result("\b") 26 | | string("f").result("\f") 27 | | string("n").result("\n") 28 | | string("r").result("\r") 29 | | string("t").result("\t") 30 | | regex(r"u[0-9a-fA-F]{4}").map(lambda s: chr(int(s[1:], 16))) 31 | ) 32 | quoted = lexeme(string('"') >> 
(string_part | string_esc).many().concat() << string('"')) 33 | 34 | # Data structures 35 | json_value = forward_declaration() 36 | object_pair = seq(quoted << colon, json_value).map(tuple) 37 | json_object = lbrace >> object_pair.sep_by(comma).map(dict) << rbrace 38 | array = lbrack >> json_value.sep_by(comma) << rbrack 39 | 40 | # Everything 41 | json_value.become(quoted | number | json_object | array | true | false | null) 42 | json_doc = whitespace >> json_value 43 | 44 | 45 | def test(): 46 | assert ( 47 | json_doc.parse( 48 | r""" 49 | { 50 | "int": 1, 51 | "string": "hello", 52 | "a list": [1, 2, 3], 53 | "escapes": "\n \u24D2", 54 | "nested": {"x": "y"}, 55 | "other": [true, false, null] 56 | } 57 | """ 58 | ) 59 | == { 60 | "int": 1, 61 | "string": "hello", 62 | "a list": [1, 2, 3], 63 | "escapes": "\n ⓒ", 64 | "nested": {"x": "y"}, 65 | "other": [True, False, None], 66 | } 67 | ) 68 | 69 | 70 | if __name__ == "__main__": 71 | from sys import stdin 72 | 73 | print(repr(json_doc.parse(stdin.read()))) 74 | -------------------------------------------------------------------------------- /examples/simple_eval.py: -------------------------------------------------------------------------------- 1 | from parsy import digit, generate, match_item, regex, string, success, test_item 2 | 3 | 4 | def lexer(code): 5 | whitespace = regex(r"\s*") 6 | integer = digit.at_least(1).concat().map(int) 7 | float_ = (digit.many() + string(".").result(["."]) + digit.many()).concat().map(float) 8 | parser = whitespace >> ((float_ | integer | regex(r"[()*/+-]")) << whitespace).many() 9 | return parser.parse(code) 10 | 11 | 12 | def eval_tokens(tokens): 13 | # This function parses and evaluates at the same time. 
14 | 15 | lparen = match_item("(") 16 | rparen = match_item(")") 17 | 18 | @generate 19 | def additive(): 20 | res = yield multiplicative 21 | sign = match_item("+") | match_item("-") 22 | while True: 23 | operation = yield sign | success("") 24 | if not operation: 25 | break 26 | operand = yield multiplicative 27 | if operation == "+": 28 | res += operand 29 | elif operation == "-": 30 | res -= operand 31 | return res 32 | 33 | @generate 34 | def multiplicative(): 35 | res = yield simple 36 | op = match_item("*") | match_item("/") 37 | while True: 38 | operation = yield op | success("") 39 | if not operation: 40 | break 41 | operand = yield simple 42 | if operation == "*": 43 | res *= operand 44 | elif operation == "/": 45 | res /= operand 46 | return res 47 | 48 | @generate 49 | def number(): 50 | sign = yield match_item("+") | match_item("-") | success("+") 51 | value = yield test_item(lambda x: isinstance(x, (int, float)), "number") 52 | return value if sign == "+" else -value 53 | 54 | expr = additive 55 | simple = (lparen >> expr << rparen) | number 56 | 57 | return expr.parse(tokens) 58 | 59 | 60 | def simple_eval(expr): 61 | return eval_tokens(lexer(expr)) 62 | 63 | 64 | import pytest # noqa isort:skip 65 | 66 | test_item = pytest.mark.skip(test_item) # This is not a test 67 | 68 | 69 | if __name__ == "__main__": 70 | print(simple_eval(input())) 71 | -------------------------------------------------------------------------------- /examples/simple_logo_lexer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Stripped down logo lexer, for tokenizing Turtle Logo programs like: 3 | 4 | fd 1 5 | bk 2 6 | rt 90 7 | 8 | etc. 
9 | """ 10 | 11 | from parsy import eof, regex, seq, string, string_from, whitespace 12 | 13 | command = string_from("fd", "bk", "rt", "lt") 14 | number = regex(r"[0-9]+").map(int) 15 | optional_whitespace = regex(r"\s*") 16 | eol = string("\n") 17 | line = seq( 18 | optional_whitespace >> command, 19 | whitespace >> number, 20 | (eof | eol | (whitespace >> eol)).result("\n"), 21 | ) 22 | flatten_list = lambda ls: sum(ls, []) 23 | lexer = line.many().map(flatten_list) 24 | -------------------------------------------------------------------------------- /examples/simple_logo_parser.py: -------------------------------------------------------------------------------- 1 | from parsy import generate, match_item, test_item 2 | 3 | 4 | class Command: 5 | def __init__(self, parameter): 6 | self.parameter = parameter 7 | 8 | def __repr__(self): 9 | return f"{self.__class__.__name__}({self.parameter})" 10 | 11 | 12 | class Forward(Command): 13 | pass 14 | 15 | 16 | class Backward(Command): 17 | pass 18 | 19 | 20 | class Right(Command): 21 | pass 22 | 23 | 24 | class Left(Command): 25 | pass 26 | 27 | 28 | commands = { 29 | "fd": Forward, 30 | "bk": Backward, 31 | "rt": Right, 32 | "lt": Left, 33 | } 34 | 35 | 36 | @generate 37 | def statement(): 38 | cmd_name = yield test_item(lambda i: i in commands.keys(), "command") 39 | parameter = yield test_item(lambda i: isinstance(i, int), "number") 40 | yield match_item("\n") 41 | return commands[cmd_name](int(parameter)) 42 | 43 | 44 | program = statement.many() 45 | 46 | 47 | import pytest # noqa isort:skip 48 | 49 | test_item = pytest.mark.skip(test_item) # This is not a test 50 | -------------------------------------------------------------------------------- /examples/sql_select.py: -------------------------------------------------------------------------------- 1 | # A very limited parser for SQL SELECT statements, 2 | # for demo purposes. Supports: 3 | # 1. A simple list of columns (or number/string literals) 4 | # 2. 
A simple table name 5 | # 3. An optional where condition, 6 | # which has the form of 'A op B' where A and B are columns, strings or number, 7 | # and op is a comparison operator 8 | # 9 | # We demonstrate the use of `map` to create AST nodes with a single arg, 10 | # and `seq` for AST nodes with more than one arg. 11 | 12 | import enum 13 | from dataclasses import dataclass 14 | from typing import List, Optional, Union 15 | 16 | from parsy import from_enum, regex, seq, string 17 | 18 | # -- AST nodes: 19 | 20 | 21 | class Operator(enum.Enum): 22 | EQ = "=" 23 | LT = "<" 24 | GT = ">" 25 | LTE = "<=" 26 | GTE = ">=" 27 | 28 | 29 | @dataclass 30 | class Number: 31 | value: int 32 | 33 | 34 | @dataclass 35 | class String: 36 | value: str 37 | 38 | 39 | @dataclass 40 | class Field: 41 | name: str 42 | 43 | 44 | @dataclass 45 | class Table: 46 | name: str 47 | 48 | 49 | ColumnExpression = Union[Field, String, Number] 50 | 51 | 52 | @dataclass 53 | class Comparison: 54 | left: ColumnExpression 55 | operator: Operator 56 | right: ColumnExpression 57 | 58 | 59 | @dataclass 60 | class Select: 61 | columns: List[ColumnExpression] 62 | table: Table 63 | where: Optional[Comparison] 64 | 65 | 66 | # -- Parsers: 67 | 68 | number_literal = regex(r"-?[0-9]+").map(int).map(Number) 69 | 70 | # We don't support ' in strings or escaping for simplicity 71 | string_literal = regex(r"'[^']*'").map(lambda s: String(s[1:-1])) 72 | 73 | identifier = regex("[a-zA-Z][a-zA-Z0-9_]*") 74 | 75 | field = identifier.map(Field) 76 | 77 | table = identifier.map(Table) 78 | 79 | space = regex(r"\s+") # non-optional whitespace 80 | padding = regex(r"\s*") # optional whitespace 81 | 82 | column_expr = field | string_literal | number_literal 83 | 84 | operator = from_enum(Operator) 85 | 86 | comparison = seq( 87 | left=column_expr << padding, 88 | operator=operator, 89 | right=padding >> column_expr, 90 | ).combine_dict(Comparison) 91 | 92 | SELECT = string("SELECT") 93 | FROM = string("FROM") 94 | 
WHERE = string("WHERE") 95 | 96 | # Here we demonstrate use of leading underscore to discard parts we don't want, 97 | # which is more readable and convenient than `<<` and `>>` sometimes. 98 | select = seq( 99 | _select=SELECT + space, 100 | columns=column_expr.sep_by(padding + string(",") + padding, min=1), 101 | _from=space + FROM + space, 102 | table=table, 103 | where=(space >> WHERE >> space >> comparison).optional(), 104 | _end=padding + string(";"), 105 | ).combine_dict(Select) 106 | 107 | 108 | # Run these tests with pytest: 109 | 110 | 111 | def test_select(): 112 | assert select.parse("SELECT thing, stuff, 123, 'hello' FROM my_table WHERE id = 1;") == Select( 113 | columns=[ 114 | Field("thing"), 115 | Field("stuff"), 116 | Number(123), 117 | String("hello"), 118 | ], 119 | table=Table("my_table"), 120 | where=Comparison( 121 | left=Field("id"), 122 | operator=Operator.EQ, 123 | right=Number(1), 124 | ), 125 | ) 126 | 127 | 128 | def test_optional_where(): 129 | assert select.parse("SELECT 1 FROM x;") == Select( 130 | columns=[Number(1)], 131 | table=Table("x"), 132 | where=None, 133 | ) 134 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "parsy" 3 | description = "Easy-to-use parser combinators, for parsing in pure Python" 4 | license = {text = "MIT"} 5 | authors = [ 6 | { name = "Jeanine Adkisson", email = "jneen@jneen.net" } 7 | ] 8 | maintainers = [ 9 | { name = "Luke Plant", email = "luke@lukeplant.me.uk" } 10 | ] 11 | classifiers = [ 12 | "Development Status :: 5 - Production/Stable", 13 | "Intended Audience :: Developers", 14 | "Topic :: Software Development :: Compilers", 15 | "Topic :: Software Development :: Interpreters", 16 | "Topic :: Text Processing", 17 | "License :: OSI Approved :: MIT License", 18 | "Programming Language :: Python :: 3", 19 | "Programming Language :: Python :: 3.9", 
20 | "Programming Language :: Python :: 3.10", 21 | "Programming Language :: Python :: 3.11", 22 | "Programming Language :: Python :: 3.12", 23 | "Programming Language :: Python :: 3.13", 24 | ] 25 | keywords = ["parser", "parsers", "parsing", "monad", "combinators"] 26 | urls = {Homepage = "https://github.com/python-parsy/parsy"} 27 | 28 | requires-python = ">=3.9" 29 | dependencies = [] 30 | 31 | dynamic = ["version"] 32 | 33 | [project.readme] 34 | file = "README.rst" 35 | content-type = "text/x-rst" 36 | 37 | [tool.setuptools.dynamic] 38 | version = {attr = "parsy.__version__"} 39 | 40 | [build-system] 41 | requires = ["setuptools>=61.2"] 42 | build-backend = "setuptools.build_meta" 43 | 44 | [dependency-groups] 45 | dev = [ 46 | "pre-commit>=4.1.0", 47 | "pytest>=8.3.4", 48 | "tox-uv>=1.20.1", 49 | "tox>=4.24.1", 50 | "pytest-cov>=6.0.0", 51 | "pre-commit-uv>=4.1.4", 52 | ] 53 | 54 | [tool.setuptools] 55 | package-dir = {"" = "src"} 56 | include-package-data = false 57 | 58 | [tool.setuptools.packages.find] 59 | where = ["src"] 60 | namespaces = false 61 | 62 | [tool.black] 63 | line-length = 119 64 | target-version = ['py310'] 65 | 66 | [tool.isort] 67 | line_length = 119 68 | profile = "black" 69 | default_section = "THIRDPARTY" 70 | skip = [".tox", ".git", "docs", "dist", "build" , "todo", ".venv"] 71 | known_first_party = "parsy" 72 | 73 | [tool.flake8] 74 | exclude = [".tox", ".git", "docs", "dist", "build", "todo"] 75 | ignore = ["E731", "E221", "W503", "E741", "E203" ] 76 | max-line-length = 119 77 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | python_files = examples/*.py tests/*.py 3 | pythonpath = src/ 4 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | pytest 
|| exit 1 4 | pre-commit run --all --all-files || exit 1 5 | 6 | umask 000 7 | rm -rf build dist 8 | git ls-tree --full-tree --name-only -r HEAD | xargs chmod ugo+r 9 | 10 | uv build --sdist --wheel || exit 1 11 | uv publish || exit 1 12 | 13 | VERSION=$(uv pip show parsy | grep 'Version: ' | cut -f 2 -d ' ' | tr -d '\n') || exit 1 14 | 15 | git tag v$VERSION || exit 1 16 | git push || exit 1 17 | git push --tags || exit 1 18 | -------------------------------------------------------------------------------- /src/parsy/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import enum 4 | import operator 5 | import re 6 | from dataclasses import dataclass 7 | from functools import wraps 8 | from typing import Any, Callable, FrozenSet 9 | 10 | __version__ = "2.1" 11 | 12 | noop = lambda x: x 13 | 14 | 15 | def line_info_at(stream, index): 16 | if index > len(stream): 17 | raise ValueError("invalid index") 18 | line = stream.count("\n", 0, index) 19 | last_nl = stream.rfind("\n", 0, index) 20 | col = index - (last_nl + 1) 21 | return (line, col) 22 | 23 | 24 | class ParseError(RuntimeError): 25 | def __init__(self, expected, stream, index): 26 | self.expected = expected 27 | self.stream = stream 28 | self.index = index 29 | 30 | def line_info(self): 31 | try: 32 | return "{}:{}".format(*line_info_at(self.stream, self.index)) 33 | except (TypeError, AttributeError): # not a str 34 | return str(self.index) 35 | 36 | def __str__(self): 37 | expected_list = sorted(repr(e) for e in self.expected) 38 | 39 | if len(expected_list) == 1: 40 | return f"expected {expected_list[0]} at {self.line_info()}" 41 | else: 42 | return f"expected one of {', '.join(expected_list)} at {self.line_info()}" 43 | 44 | 45 | @dataclass 46 | class Result: 47 | status: bool 48 | index: int 49 | value: Any 50 | furthest: int 51 | expected: FrozenSet[str] 52 | 53 | @staticmethod 54 | def success(index, value): 55 | 
return Result(True, index, value, -1, frozenset()) 56 | 57 | @staticmethod 58 | def failure(index, expected): 59 | return Result(False, -1, None, index, frozenset([expected])) 60 | 61 | # collect the furthest failure from self and other 62 | def aggregate(self, other): 63 | if not other: 64 | return self 65 | 66 | if self.furthest > other.furthest: 67 | return self 68 | elif self.furthest == other.furthest: 69 | # if we both have the same failure index, we combine the expected messages. 70 | return Result(self.status, self.index, self.value, self.furthest, self.expected | other.expected) 71 | else: 72 | return Result(self.status, self.index, self.value, other.furthest, other.expected) 73 | 74 | 75 | class Parser: 76 | """ 77 | A Parser is an object that wraps a function whose arguments are 78 | a string to be parsed and the index on which to begin parsing. 79 | The function should return either Result.success(next_index, value), 80 | where the next index is where to continue the parse and the value is 81 | the yielded value, or Result.failure(index, expected), where expected 82 | is a string indicating what was expected, and the index is the index 83 | of the failure. 84 | """ 85 | 86 | def __init__(self, wrapped_fn: Callable[[str | bytes | list, int], Result]): 87 | """ 88 | Creates a new Parser from a function that takes a stream 89 | and returns a Result. 90 | """ 91 | self.wrapped_fn = wrapped_fn 92 | 93 | def __call__(self, stream: str | bytes | list, index: int): 94 | return self.wrapped_fn(stream, index) 95 | 96 | def parse(self, stream: str | bytes | list) -> Any: 97 | """Parses a string or list of tokens and returns the result or raise a ParseError.""" 98 | (result, _) = (self << eof).parse_partial(stream) 99 | return result 100 | 101 | def parse_partial(self, stream: str | bytes | list) -> tuple[Any, str | bytes | list]: 102 | """ 103 | Parses the longest possible prefix of a given string. 
104 | Returns a tuple of the result and the unparsed remainder, 105 | or raises ParseError 106 | """ 107 | result = self(stream, 0) 108 | 109 | if result.status: 110 | return (result.value, stream[result.index :]) 111 | else: 112 | raise ParseError(result.expected, stream, result.furthest) 113 | 114 | def bind(self, bind_fn): 115 | @Parser 116 | def bound_parser(stream, index): 117 | result = self(stream, index) 118 | 119 | if result.status: 120 | next_parser = bind_fn(result.value) 121 | return next_parser(stream, result.index).aggregate(result) 122 | else: 123 | return result 124 | 125 | return bound_parser 126 | 127 | def map(self, map_function: Callable) -> Parser: 128 | """ 129 | Returns a parser that transforms the produced value of the initial parser with map_function. 130 | """ 131 | return self.bind(lambda res: success(map_function(res))) 132 | 133 | def combine(self, combine_fn: Callable) -> Parser: 134 | """ 135 | Returns a parser that transforms the produced values of the initial parser 136 | with ``combine_fn``, passing the arguments using ``*args`` syntax. 137 | 138 | The initial parser should return a list/sequence of parse results. 139 | """ 140 | return self.bind(lambda res: success(combine_fn(*res))) 141 | 142 | def combine_dict(self, combine_fn: Callable) -> Parser: 143 | """ 144 | Returns a parser that transforms the value produced by the initial parser 145 | using the supplied function/callable, passing the arguments using the 146 | ``**kwargs`` syntax. 147 | 148 | The value produced by the initial parser must be a mapping/dictionary from 149 | names to values, or a list of two-tuples, or something else that can be 150 | passed to the ``dict`` constructor. 151 | 152 | If ``None`` is present as a key in the dictionary it will be removed 153 | before passing to ``fn``, as will all keys starting with ``_``. 
154 | """ 155 | return self.bind( 156 | lambda res: success( 157 | combine_fn( 158 | **{ 159 | k: v 160 | for k, v in dict(res).items() 161 | if k is not None and not (isinstance(k, str) and k.startswith("_")) 162 | } 163 | ) 164 | ) 165 | ) 166 | 167 | def concat(self) -> Parser: 168 | """ 169 | Returns a parser that concatenates together (as a string) the previously 170 | produced values. 171 | """ 172 | return self.map("".join) 173 | 174 | def then(self, other: Parser) -> Parser: 175 | """ 176 | Returns a parser which, if the initial parser succeeds, will 177 | continue parsing with ``other``. This will produce the 178 | value produced by ``other``. 179 | 180 | """ 181 | return seq(self, other).combine(lambda left, right: right) 182 | 183 | def skip(self, other: Parser) -> Parser: 184 | """ 185 | Returns a parser which, if the initial parser succeeds, will 186 | continue parsing with ``other``. It will produce the 187 | value produced by the initial parser. 188 | """ 189 | return seq(self, other).combine(lambda left, right: left) 190 | 191 | def result(self, value: Any) -> Parser: 192 | """ 193 | Returns a parser that, if the initial parser succeeds, always produces 194 | the passed in ``value``. 195 | """ 196 | return self >> success(value) 197 | 198 | def many(self) -> Parser: 199 | """ 200 | Returns a parser that expects the initial parser 0 or more times, and 201 | produces a list of the results. 202 | """ 203 | return self.times(0, float("inf")) 204 | 205 | def times(self, min: int, max: int = None) -> Parser: 206 | """ 207 | Returns a parser that expects the initial parser at least ``min`` times, 208 | and at most ``max`` times, and produces a list of the results. If only one 209 | argument is given, the parser is expected exactly that number of times. 
210 | """ 211 | if max is None: 212 | max = min 213 | 214 | @Parser 215 | def times_parser(stream, index): 216 | values = [] 217 | times = 0 218 | result = None 219 | 220 | while times < max: 221 | result = self(stream, index).aggregate(result) 222 | if result.status: 223 | values.append(result.value) 224 | index = result.index 225 | times += 1 226 | elif times >= min: 227 | break 228 | else: 229 | return result 230 | 231 | return Result.success(index, values).aggregate(result) 232 | 233 | return times_parser 234 | 235 | def at_most(self, n: int) -> Parser: 236 | """ 237 | Returns a parser that expects the initial parser at most ``n`` times, and 238 | produces a list of the results. 239 | """ 240 | return self.times(0, n) 241 | 242 | def at_least(self, n: int) -> Parser: 243 | """ 244 | Returns a parser that expects the initial parser at least ``n`` times, and 245 | produces a list of the results. 246 | """ 247 | return self.times(n) + self.many() 248 | 249 | def optional(self, default: Any = None) -> Parser: 250 | """ 251 | Returns a parser that expects the initial parser zero or once, and maps 252 | the result to a given default value in the case of no match. If no default 253 | value is given, ``None`` is used. 254 | """ 255 | return self.times(0, 1).map(lambda v: v[0] if v else default) 256 | 257 | def until(self, other: Parser, min: int = 0, max: int = float("inf"), consume_other: bool = False) -> Parser: 258 | """ 259 | Returns a parser that expects the initial parser followed by ``other``. 260 | The initial parser is expected at least ``min`` times and at most ``max`` times. 261 | By default, it does not consume ``other`` and it produces a list of the 262 | results excluding ``other``. If ``consume_other`` is ``True`` then 263 | ``other`` is consumed and its result is included in the list of results. 
264 | """ 265 | 266 | @Parser 267 | def until_parser(stream, index): 268 | values = [] 269 | times = 0 270 | while True: 271 | 272 | # try parser first 273 | res = other(stream, index) 274 | if res.status and times >= min: 275 | if consume_other: 276 | # consume other 277 | values.append(res.value) 278 | index = res.index 279 | return Result.success(index, values) 280 | 281 | # exceeded max? 282 | if times >= max: 283 | # return failure, it matched parser more than max times 284 | return Result.failure(index, f"at most {max} items") 285 | 286 | # failed, try parser 287 | result = self(stream, index) 288 | if result.status: 289 | # consume 290 | values.append(result.value) 291 | index = result.index 292 | times += 1 293 | elif times >= min: 294 | # return failure, parser is not followed by other 295 | return Result.failure(index, "did not find other parser") 296 | else: 297 | # return failure, it did not match parser at least min times 298 | return Result.failure(index, f"at least {min} items; got {times} item(s)") 299 | 300 | return until_parser 301 | 302 | def sep_by(self, sep: Parser, *, min: int = 0, max: int = float("inf")) -> Parser: 303 | """ 304 | Returns a new parser that repeats the initial parser and 305 | collects the results in a list. Between each item, the ``sep`` parser 306 | is run (and its return value is discarded). By default it 307 | repeats with no limit, but minimum and maximum values can be supplied. 308 | """ 309 | zero_times = success([]) 310 | if max == 0: 311 | return zero_times 312 | res = self.times(1) + (sep >> self).times(min - 1, max - 1) 313 | if min == 0: 314 | res |= zero_times 315 | return res 316 | 317 | def desc(self, description: str) -> Parser: 318 | """ 319 | Returns a new parser with a description added, which is used in the error message 320 | if parsing fails. 
321 | """ 322 | 323 | @Parser 324 | def desc_parser(stream, index): 325 | result = self(stream, index) 326 | if result.status: 327 | return result 328 | else: 329 | return Result.failure(index, description) 330 | 331 | return desc_parser 332 | 333 | def mark(self) -> Parser: 334 | """ 335 | Returns a parser that wraps the initial parser's result in a value 336 | containing column and line information of the match, as well as the 337 | original value. The new value is a 3-tuple: 338 | 339 | ((start_row, start_column), 340 | original_value, 341 | (end_row, end_column)) 342 | """ 343 | 344 | @generate 345 | def marked(): 346 | start = yield line_info 347 | body = yield self 348 | end = yield line_info 349 | return (start, body, end) 350 | 351 | return marked 352 | 353 | def tag(self, name: str) -> Parser: 354 | """ 355 | Returns a parser that wraps the produced value of the initial parser in a 356 | 2 tuple containing ``(name, value)``. This provides a very simple way to 357 | label parsed components 358 | """ 359 | return self.map(lambda v: (name, v)) 360 | 361 | def should_fail(self, description: str) -> Parser: 362 | """ 363 | Returns a parser that fails when the initial parser succeeds, and succeeds 364 | when the initial parser fails (consuming no input). A description must 365 | be passed which is used in parse failure messages. 
366 | 367 | This is essentially a negative lookahead 368 | """ 369 | 370 | @Parser 371 | def fail_parser(stream, index): 372 | res = self(stream, index) 373 | if res.status: 374 | return Result.failure(index, description) 375 | return Result.success(index, res) 376 | 377 | return fail_parser 378 | 379 | def __add__(self, other: Parser) -> Parser: 380 | return seq(self, other).combine(operator.add) 381 | 382 | def __mul__(self, other: Parser) -> Parser: 383 | if isinstance(other, range): 384 | return self.times(other.start, other.stop - 1) 385 | return self.times(other) 386 | 387 | def __or__(self, other: Parser) -> Parser: 388 | return alt(self, other) 389 | 390 | # haskelley operators, for fun # 391 | 392 | # >> 393 | def __rshift__(self, other: Parser) -> Parser: 394 | return self.then(other) 395 | 396 | # << 397 | def __lshift__(self, other: Parser) -> Parser: 398 | return self.skip(other) 399 | 400 | 401 | def alt(*parsers: Parser) -> Parser: 402 | """ 403 | Creates a parser from the passed in argument list of alternative 404 | parsers, which are tried in order, moving to the next one if the 405 | current one fails. 406 | """ 407 | if not parsers: 408 | return fail("") 409 | 410 | @Parser 411 | def alt_parser(stream, index): 412 | result = None 413 | for parser in parsers: 414 | result = parser(stream, index).aggregate(result) 415 | if result.status: 416 | return result 417 | 418 | return result 419 | 420 | return alt_parser 421 | 422 | 423 | def seq(*parsers: Parser, **kw_parsers: Parser) -> Parser: 424 | """ 425 | Takes a list of parsers, runs them in order, 426 | and collects their individuals results in a list, 427 | or in a dictionary if you pass them as keyword arguments. 
428 | """ 429 | if not parsers and not kw_parsers: 430 | return success([]) 431 | 432 | if parsers and kw_parsers: 433 | raise ValueError("Use either positional arguments or keyword arguments with seq, not both") 434 | 435 | if parsers: 436 | 437 | @Parser 438 | def seq_parser(stream, index): 439 | result = None 440 | values = [] 441 | for parser in parsers: 442 | result = parser(stream, index).aggregate(result) 443 | if not result.status: 444 | return result 445 | index = result.index 446 | values.append(result.value) 447 | return Result.success(index, values).aggregate(result) 448 | 449 | return seq_parser 450 | else: 451 | 452 | @Parser 453 | def seq_kwarg_parser(stream, index): 454 | result = None 455 | values = {} 456 | for name, parser in kw_parsers.items(): 457 | result = parser(stream, index).aggregate(result) 458 | if not result.status: 459 | return result 460 | index = result.index 461 | values[name] = result.value 462 | return Result.success(index, values).aggregate(result) 463 | 464 | return seq_kwarg_parser 465 | 466 | 467 | def generate(fn) -> Parser: 468 | """ 469 | Creates a parser from a generator function 470 | """ 471 | if isinstance(fn, str): 472 | return lambda f: generate(f).desc(fn) 473 | 474 | @Parser 475 | @wraps(fn) 476 | def generated(stream, index): 477 | # start up the generator 478 | iterator = fn() 479 | 480 | result = None 481 | value = None 482 | try: 483 | while True: 484 | next_parser = iterator.send(value) 485 | result = next_parser(stream, index).aggregate(result) 486 | if not result.status: 487 | return result 488 | value = result.value 489 | index = result.index 490 | except StopIteration as stop: 491 | returnVal = stop.value 492 | if isinstance(returnVal, Parser): 493 | return returnVal(stream, index).aggregate(result) 494 | 495 | return Result.success(index, returnVal).aggregate(result) 496 | 497 | return generated 498 | 499 | 500 | index = Parser(lambda _, index: Result.success(index, index)) 501 | line_info = Parser(lambda 
stream, index: Result.success(index, line_info_at(stream, index))) 502 | 503 | 504 | def success(value: Any) -> Parser: 505 | """ 506 | Returns a parser that does not consume any of the stream, but 507 | produces ``value``. 508 | """ 509 | return Parser(lambda _, index: Result.success(index, value)) 510 | 511 | 512 | def fail(expected: str) -> Parser: 513 | """ 514 | Returns a parser that always fails with the provided error message. 515 | """ 516 | return Parser(lambda _, index: Result.failure(index, expected)) 517 | 518 | 519 | def string(expected_string: str, transform: Callable[[str], str] = noop) -> Parser: 520 | """ 521 | Returns a parser that expects the ``expected_string`` and produces 522 | that string value. 523 | 524 | Optionally, a transform function can be passed, which will be used on both 525 | the expected string and tested string. 526 | """ 527 | 528 | slen = len(expected_string) 529 | transformed_s = transform(expected_string) 530 | 531 | @Parser 532 | def string_parser(stream, index): 533 | if transform(stream[index : index + slen]) == transformed_s: 534 | return Result.success(index + slen, expected_string) 535 | else: 536 | return Result.failure(index, expected_string) 537 | 538 | return string_parser 539 | 540 | 541 | def regex(exp: str, flags=0, group: int | str | tuple = 0) -> Parser: 542 | """ 543 | Returns a parser that expects the given ``exp``, and produces the 544 | matched string. ``exp`` can be a compiled regular expression, or a 545 | string which will be compiled with the given ``flags``. 546 | 547 | Optionally, accepts ``group``, which is passed to re.Match.group 548 | https://docs.python.org/3/library/re.html#re.Match.group> to 549 | return the text from a capturing group in the regex instead of the 550 | entire match. 
551 | """ 552 | 553 | if isinstance(exp, (str, bytes)): 554 | exp = re.compile(exp, flags) 555 | if isinstance(group, (str, int)): 556 | group = (group,) 557 | 558 | @Parser 559 | def regex_parser(stream, index): 560 | match = exp.match(stream, index) 561 | if match: 562 | return Result.success(match.end(), match.group(*group)) 563 | else: 564 | return Result.failure(index, exp.pattern) 565 | 566 | return regex_parser 567 | 568 | 569 | def test_item(func: Callable[..., bool], description: str) -> Parser: 570 | """ 571 | Returns a parser that tests a single item from the list of items being 572 | consumed, using the callable ``func``. If ``func`` returns ``True``, the 573 | parse succeeds, otherwise the parse fails with the description 574 | ``description``. 575 | """ 576 | 577 | @Parser 578 | def test_item_parser(stream, index): 579 | if index < len(stream): 580 | if isinstance(stream, bytes): 581 | # Subscripting bytes with `[index]` instead of 582 | # `[index:index + 1]` returns an int 583 | item = stream[index : index + 1] 584 | else: 585 | item = stream[index] 586 | if func(item): 587 | return Result.success(index + 1, item) 588 | return Result.failure(index, description) 589 | 590 | return test_item_parser 591 | 592 | 593 | def test_char(func: Callable[..., bool], description: str) -> Parser: 594 | """ 595 | Returns a parser that tests a single character with the callable 596 | ``func``. If ``func`` returns ``True``, the parse succeeds, otherwise 597 | the parse fails with the description ``description``. 598 | """ 599 | # Implementation is identical to test_item 600 | return test_item(func, description) 601 | 602 | 603 | def match_item(item: Any, description: str = None) -> Parser: 604 | """ 605 | Returns a parser that tests the next item (or character) from the stream (or 606 | string) for equality against the provided item. Optionally a string 607 | description can be passed. 
608 | """ 609 | 610 | if description is None: 611 | description = str(item) 612 | return test_item(lambda i: item == i, description) 613 | 614 | 615 | def string_from(*strings: str, transform: Callable[[str], str] = noop): 616 | """ 617 | Accepts a sequence of strings as positional arguments, and returns a parser 618 | that matches and returns one string from the list. The list is first sorted 619 | in descending length order, so that overlapping strings are handled correctly 620 | by checking the longest one first. 621 | """ 622 | # Sort longest first, so that overlapping options work correctly 623 | return alt(*(string(s, transform) for s in sorted(strings, key=len, reverse=True))) 624 | 625 | 626 | def char_from(string: str | bytes): 627 | """ 628 | Accepts a string and returns a parser that matches and returns one character 629 | from the string. 630 | """ 631 | if isinstance(string, bytes): 632 | return test_char(lambda c: c in string, b"[" + string + b"]") 633 | else: 634 | return test_char(lambda c: c in string, "[" + string + "]") 635 | 636 | 637 | def peek(parser: Parser) -> Parser: 638 | """ 639 | Returns a lookahead parser that parses the input stream without consuming 640 | chars. 641 | """ 642 | 643 | @Parser 644 | def peek_parser(stream, index): 645 | result = parser(stream, index) 646 | if result.status: 647 | return Result.success(index, result.value) 648 | else: 649 | return result 650 | 651 | return peek_parser 652 | 653 | 654 | any_char = test_char(lambda c: True, "any character") 655 | 656 | whitespace = regex(r"\s+") 657 | 658 | letter = test_char(lambda c: c.isalpha(), "a letter") 659 | 660 | digit = test_char(lambda c: c.isdigit(), "a digit") 661 | 662 | decimal_digit = char_from("0123456789") 663 | 664 | 665 | @Parser 666 | def eof(stream, index): 667 | """ 668 | A parser that only succeeds if the end of the stream has been reached. 
669 | """ 670 | 671 | if index >= len(stream): 672 | return Result.success(index, None) 673 | else: 674 | return Result.failure(index, "EOF") 675 | 676 | 677 | def from_enum(enum_cls: type[enum.Enum], transform=noop) -> Parser: 678 | """ 679 | Given a class that is an enum.Enum class 680 | https://docs.python.org/3/library/enum.html , returns a parser that 681 | will parse the values (or the string representations of the values) 682 | and return the corresponding enum item. 683 | """ 684 | 685 | items = sorted( 686 | ((str(enum_item.value), enum_item) for enum_item in enum_cls), key=lambda t: len(t[0]), reverse=True 687 | ) 688 | return alt(*(string(value, transform=transform).result(enum_item) for value, enum_item in items)) 689 | 690 | 691 | class forward_declaration(Parser): 692 | """ 693 | An empty parser that can be used as a forward declaration, 694 | especially for parsers that need to be defined recursively. 695 | 696 | You must use `.become(parser)` before using. 697 | """ 698 | 699 | def __init__(self): 700 | pass 701 | 702 | def _raise_error(self, *args, **kwargs): 703 | raise ValueError("You must use 'become' before attempting to call `parse` or `parse_partial`") 704 | 705 | parse = _raise_error 706 | parse_partial = _raise_error 707 | 708 | def become(self, other: Parser): 709 | """ 710 | Take on the behavior of the given parser. 
711 | """ 712 | self.__dict__ = other.__dict__ 713 | self.__class__ = other.__class__ 714 | -------------------------------------------------------------------------------- /tests/requirements-linters.txt: -------------------------------------------------------------------------------- 1 | isort==5.4.2 2 | flake8==3.8.3 3 | -------------------------------------------------------------------------------- /tests/requirements-tests.txt: -------------------------------------------------------------------------------- 1 | pytest==7.1.1 2 | pytest-cov==4.0.0 3 | coverage==6.3.2 4 | -------------------------------------------------------------------------------- /tests/test_parsy.py: -------------------------------------------------------------------------------- 1 | # -*- code: utf8 -*- 2 | import enum 3 | import re 4 | import unittest 5 | from collections import namedtuple 6 | from datetime import date 7 | 8 | from parsy import ( 9 | ParseError, 10 | alt, 11 | any_char, 12 | char_from, 13 | decimal_digit, 14 | digit, 15 | forward_declaration, 16 | from_enum, 17 | generate, 18 | index, 19 | letter, 20 | line_info, 21 | line_info_at, 22 | match_item, 23 | peek, 24 | regex, 25 | seq, 26 | string, 27 | string_from, 28 | ) 29 | from parsy import test_char as parsy_test_char # to stop pytest thinking this function is a test 30 | from parsy import test_item as parsy_test_item # to stop pytest thinking this function is a test 31 | from parsy import whitespace 32 | 33 | 34 | class TestParser(unittest.TestCase): 35 | def test_string(self): 36 | parser = string("x") 37 | self.assertEqual(parser.parse("x"), "x") 38 | 39 | self.assertRaises(ParseError, parser.parse, "y") 40 | 41 | def test_string_transform(self): 42 | parser = string("x", transform=lambda s: s.lower()) 43 | self.assertEqual(parser.parse("x"), "x") 44 | self.assertEqual(parser.parse("X"), "x") 45 | 46 | self.assertRaises(ParseError, parser.parse, "y") 47 | 48 | def test_string_transform_2(self): 49 | parser = 
string("Cat", transform=lambda s: s.lower()) 50 | self.assertEqual(parser.parse("cat"), "Cat") 51 | self.assertEqual(parser.parse("CAT"), "Cat") 52 | self.assertEqual(parser.parse("CaT"), "Cat") 53 | 54 | self.assertRaises(ParseError, parser.parse, "dog") 55 | 56 | def test_regex_str(self): 57 | parser = regex(r"[0-9]") 58 | 59 | self.assertEqual(parser.parse("1"), "1") 60 | self.assertEqual(parser.parse("4"), "4") 61 | 62 | self.assertRaises(ParseError, parser.parse, "x") 63 | 64 | def test_regex_bytes(self): 65 | parser = regex(rb"[0-9]") 66 | 67 | self.assertEqual(parser.parse(b"1"), b"1") 68 | self.assertEqual(parser.parse(b"4"), b"4") 69 | 70 | self.assertRaises(ParseError, parser.parse, b"x") 71 | 72 | def test_regex_compiled(self): 73 | parser = regex(re.compile(r"[0-9]")) 74 | self.assertEqual(parser.parse("1"), "1") 75 | self.assertRaises(ParseError, parser.parse, "x") 76 | 77 | def test_regex_group_number(self): 78 | parser = regex(re.compile(r"a([0-9])b"), group=1) 79 | self.assertEqual(parser.parse("a1b"), "1") 80 | self.assertRaises(ParseError, parser.parse, "x") 81 | 82 | def test_regex_group_name(self): 83 | parser = regex(re.compile(r"a(?P[0-9])b"), group="name") 84 | self.assertEqual(parser.parse("a1b"), "1") 85 | self.assertRaises(ParseError, parser.parse, "x") 86 | 87 | def test_regex_group_tuple(self): 88 | parser = regex(re.compile(r"a([0-9])b([0-9])c"), group=(1, 2)) 89 | self.assertEqual(parser.parse("a1b2c"), ("1", "2")) 90 | self.assertRaises(ParseError, parser.parse, "x") 91 | 92 | def test_then(self): 93 | xy_parser = string("x") >> string("y") 94 | self.assertEqual(xy_parser.parse("xy"), "y") 95 | 96 | self.assertRaises(ParseError, xy_parser.parse, "y") 97 | self.assertRaises(ParseError, xy_parser.parse, "z") 98 | 99 | def test_bind(self): 100 | piped = None 101 | 102 | def binder(x): 103 | nonlocal piped 104 | piped = x 105 | return string("y") 106 | 107 | parser = string("x").bind(binder) 108 | 109 | 
self.assertEqual(parser.parse("xy"), "y") 110 | self.assertEqual(piped, "x") 111 | 112 | self.assertRaises(ParseError, parser.parse, "x") 113 | 114 | def test_map(self): 115 | parser = digit.map(int) 116 | self.assertEqual(parser.parse("7"), 7) 117 | 118 | def test_combine(self): 119 | parser = seq(digit, letter).combine(lambda d, l: (d, l)) 120 | self.assertEqual(parser.parse("1A"), ("1", "A")) 121 | 122 | def test_combine_dict(self): 123 | ddmmyyyy = ( 124 | seq( 125 | regex(r"[0-9]{2}").map(int).tag("day"), 126 | regex(r"[0-9]{2}").map(int).tag("month"), 127 | regex(r"[0-9]{4}").map(int).tag("year"), 128 | ) 129 | .map(dict) 130 | .combine_dict(date) 131 | ) 132 | self.assertEqual(ddmmyyyy.parse("05042003"), date(2003, 4, 5)) 133 | 134 | def test_combine_dict_list(self): 135 | Pair = namedtuple("Pair", ["word", "number"]) 136 | parser = seq( 137 | regex(r"[A-Z]+").tag("word"), 138 | regex(r"[0-9]+").map(int).tag("number"), 139 | ).combine_dict(Pair) 140 | self.assertEqual(parser.parse("ABC123"), Pair(word="ABC", number=123)) 141 | 142 | def test_combine_dict_skip_None(self): 143 | Pair = namedtuple("Pair", ["word", "number"]) 144 | parser = seq( 145 | regex(r"[A-Z]+").tag("word"), 146 | whitespace.tag(None), 147 | regex(r"[0-9]+").map(int).tag("number"), 148 | ).combine_dict(Pair) 149 | self.assertEqual(parser.parse("ABC 123"), Pair(word="ABC", number=123)) 150 | 151 | def test_combine_dict_skip_underscores(self): 152 | Pair = namedtuple("Pair", ["word", "number"]) 153 | parser = seq( 154 | regex(r"[A-Z]+").tag("word"), 155 | whitespace.tag("_whitespace"), 156 | regex(r"[0-9]+").map(int).tag("number"), 157 | ).combine_dict(Pair) 158 | self.assertEqual(parser.parse("ABC 123"), Pair(word="ABC", number=123)) 159 | 160 | def test_concat(self): 161 | parser = letter.many().concat() 162 | self.assertEqual(parser.parse(""), "") 163 | self.assertEqual(parser.parse("abc"), "abc") 164 | 165 | def test_concat_from_byte_stream(self): 166 | any_byte = parsy_test_item(lambda 
c: True, "any byte") 167 | parser = any_byte.map(lambda b: b.decode("ascii")).many().concat() 168 | self.assertEqual(parser.parse(b""), "") 169 | self.assertEqual(parser.parse(b"abc"), "abc") 170 | 171 | def test_generate(self): 172 | x = y = None 173 | 174 | @generate 175 | def xy(): 176 | nonlocal x 177 | nonlocal y 178 | x = yield string("x") 179 | y = yield string("y") 180 | return 3 181 | 182 | self.assertEqual(xy.parse("xy"), 3) 183 | self.assertEqual(x, "x") 184 | self.assertEqual(y, "y") 185 | 186 | def test_generate_return_parser(self): 187 | @generate 188 | def example(): 189 | yield string("x") 190 | return string("y") 191 | 192 | self.assertEqual(example.parse("xy"), "y") 193 | 194 | def test_mark(self): 195 | parser = (letter.many().mark() << string("\n")).many() 196 | 197 | lines = parser.parse("asdf\nqwer\n") 198 | 199 | self.assertEqual(len(lines), 2) 200 | 201 | (start, letters, end) = lines[0] 202 | self.assertEqual(start, (0, 0)) 203 | self.assertEqual(letters, ["a", "s", "d", "f"]) 204 | self.assertEqual(end, (0, 4)) 205 | 206 | (start, letters, end) = lines[1] 207 | self.assertEqual(start, (1, 0)) 208 | self.assertEqual(letters, ["q", "w", "e", "r"]) 209 | self.assertEqual(end, (1, 4)) 210 | 211 | def test_tag(self): 212 | parser = letter.many().concat().tag("word") 213 | self.assertEqual( 214 | parser.sep_by(string(",")).parse("this,is,a,list"), 215 | [("word", "this"), ("word", "is"), ("word", "a"), ("word", "list")], 216 | ) 217 | 218 | def test_tag_map_dict(self): 219 | parser = seq(letter.tag("first_letter"), letter.many().concat().tag("remainder")).map(dict) 220 | self.assertEqual(parser.parse("Hello"), {"first_letter": "H", "remainder": "ello"}) 221 | 222 | def test_generate_desc(self): 223 | @generate("a thing") 224 | def thing(): 225 | yield string("t") 226 | 227 | with self.assertRaises(ParseError) as err: 228 | thing.parse("x") 229 | 230 | ex = err.exception 231 | 232 | self.assertEqual(ex.expected, frozenset(["a thing"])) 233 | 
self.assertEqual(ex.stream, "x") 234 | self.assertEqual(ex.index, 0) 235 | 236 | def test_generate_default_desc(self): 237 | # We shouldn't give a default desc, the messages from the internal 238 | # parsers should bubble up. 239 | @generate 240 | def thing(): 241 | yield string("a") 242 | yield string("b") 243 | 244 | with self.assertRaises(ParseError) as err: 245 | thing.parse("ax") 246 | 247 | ex = err.exception 248 | 249 | self.assertEqual(ex.expected, frozenset(["b"])) 250 | self.assertEqual(ex.stream, "ax") 251 | self.assertEqual(ex.index, 1) 252 | 253 | self.assertIn("expected 'b' at 0:1", str(ex)) 254 | 255 | def test_multiple_failures(self): 256 | abc = string("a") | string("b") | string("c") 257 | 258 | with self.assertRaises(ParseError) as err: 259 | abc.parse("d") 260 | 261 | ex = err.exception 262 | self.assertEqual(ex.expected, frozenset(["a", "b", "c"])) 263 | self.assertEqual(str(ex), "expected one of 'a', 'b', 'c' at 0:0") 264 | 265 | def test_generate_backtracking(self): 266 | @generate 267 | def xy(): 268 | yield string("x") 269 | yield string("y") 270 | assert False 271 | 272 | parser = xy | string("z") 273 | # should not finish executing xy() 274 | self.assertEqual(parser.parse("z"), "z") 275 | 276 | def test_or(self): 277 | x_or_y = string("x") | string("y") 278 | 279 | self.assertEqual(x_or_y.parse("x"), "x") 280 | self.assertEqual(x_or_y.parse("y"), "y") 281 | 282 | def test_or_with_then(self): 283 | parser = (string("\\") >> string("y")) | string("z") 284 | self.assertEqual(parser.parse("\\y"), "y") 285 | self.assertEqual(parser.parse("z"), "z") 286 | 287 | self.assertRaises(ParseError, parser.parse, "\\z") 288 | 289 | def test_many(self): 290 | letters = letter.many() 291 | self.assertEqual(letters.parse("x"), ["x"]) 292 | self.assertEqual(letters.parse("xyz"), ["x", "y", "z"]) 293 | self.assertEqual(letters.parse(""), []) 294 | 295 | self.assertRaises(ParseError, letters.parse, "1") 296 | 297 | def test_many_with_then(self): 298 | parser 
= string("x").many() >> string("y") 299 | self.assertEqual(parser.parse("y"), "y") 300 | self.assertEqual(parser.parse("xy"), "y") 301 | self.assertEqual(parser.parse("xxxxxy"), "y") 302 | 303 | def test_times_zero(self): 304 | zero_letters = letter.times(0) 305 | self.assertEqual(zero_letters.parse(""), []) 306 | 307 | self.assertRaises(ParseError, zero_letters.parse, "x") 308 | 309 | def test_times(self): 310 | three_letters = letter.times(3) 311 | self.assertEqual(three_letters.parse("xyz"), ["x", "y", "z"]) 312 | 313 | self.assertRaises(ParseError, three_letters.parse, "xy") 314 | self.assertRaises(ParseError, three_letters.parse, "xyzw") 315 | 316 | def test_times_with_then(self): 317 | then_digit = letter.times(3) >> digit 318 | self.assertEqual(then_digit.parse("xyz1"), "1") 319 | 320 | self.assertRaises(ParseError, then_digit.parse, "xy1") 321 | self.assertRaises(ParseError, then_digit.parse, "xyz") 322 | self.assertRaises(ParseError, then_digit.parse, "xyzw") 323 | 324 | def test_times_with_min_and_max(self): 325 | some_letters = letter.times(2, 4) 326 | 327 | self.assertEqual(some_letters.parse("xy"), ["x", "y"]) 328 | self.assertEqual(some_letters.parse("xyz"), ["x", "y", "z"]) 329 | self.assertEqual(some_letters.parse("xyzw"), ["x", "y", "z", "w"]) 330 | 331 | self.assertRaises(ParseError, some_letters.parse, "x") 332 | self.assertRaises(ParseError, some_letters.parse, "xyzwv") 333 | 334 | def test_times_with_min_and_max_and_then(self): 335 | then_digit = letter.times(2, 4) >> digit 336 | 337 | self.assertEqual(then_digit.parse("xy1"), "1") 338 | self.assertEqual(then_digit.parse("xyz1"), "1") 339 | self.assertEqual(then_digit.parse("xyzw1"), "1") 340 | 341 | self.assertRaises(ParseError, then_digit.parse, "xy") 342 | self.assertRaises(ParseError, then_digit.parse, "xyzw") 343 | self.assertRaises(ParseError, then_digit.parse, "xyzwv1") 344 | self.assertRaises(ParseError, then_digit.parse, "x1") 345 | 346 | def test_at_most(self): 347 | ab = string("ab") 
348 | self.assertEqual(ab.at_most(2).parse(""), []) 349 | self.assertEqual(ab.at_most(2).parse("ab"), ["ab"]) 350 | self.assertEqual(ab.at_most(2).parse("abab"), ["ab", "ab"]) 351 | self.assertRaises(ParseError, ab.at_most(2).parse, "ababab") 352 | 353 | def test_until(self): 354 | 355 | until = string("s").until(string("x")) 356 | 357 | s = "ssssx" 358 | self.assertEqual(until.parse_partial(s), (4 * ["s"], "x")) 359 | self.assertEqual(seq(until, string("x")).parse(s), [4 * ["s"], "x"]) 360 | self.assertEqual(until.then(string("x")).parse(s), "x") 361 | 362 | s = "ssssxy" 363 | self.assertEqual(until.parse_partial(s), (4 * ["s"], "xy")) 364 | self.assertEqual(seq(until, string("x")).parse_partial(s), ([4 * ["s"], "x"], "y")) 365 | self.assertEqual(until.then(string("x")).parse_partial(s), ("x", "y")) 366 | 367 | self.assertRaises(ParseError, until.parse, "ssssy") 368 | self.assertRaises(ParseError, until.parse, "xssssxy") 369 | 370 | self.assertEqual(until.parse_partial("xxx"), ([], "xxx")) 371 | 372 | until = regex(".").until(string("x")) 373 | self.assertEqual(until.parse_partial("xxxx"), ([], "xxxx")) 374 | 375 | def test_until_with_consume_other(self): 376 | 377 | until = string("s").until(string("x"), consume_other=True) 378 | 379 | self.assertEqual(until.parse("ssssx"), 4 * ["s"] + ["x"]) 380 | self.assertEqual(until.parse_partial("ssssxy"), (4 * ["s"] + ["x"], "y")) 381 | 382 | self.assertEqual(until.parse_partial("xxx"), (["x"], "xx")) 383 | 384 | self.assertRaises(ParseError, until.parse, "ssssy") 385 | self.assertRaises(ParseError, until.parse, "xssssxy") 386 | 387 | def test_until_with_min(self): 388 | 389 | until = string("s").until(string("x"), min=3) 390 | 391 | self.assertEqual(until.parse_partial("sssx"), (3 * ["s"], "x")) 392 | self.assertEqual(until.parse_partial("sssssx"), (5 * ["s"], "x")) 393 | 394 | self.assertRaises(ParseError, until.parse_partial, "ssx") 395 | 396 | def test_until_with_max(self): 397 | 398 | # until with max 399 | until = 
string("s").until(string("x"), max=3) 400 | 401 | self.assertEqual(until.parse_partial("ssx"), (2 * ["s"], "x")) 402 | self.assertEqual(until.parse_partial("sssx"), (3 * ["s"], "x")) 403 | 404 | self.assertRaises(ParseError, until.parse_partial, "ssssx") 405 | 406 | def test_until_with_min_max(self): 407 | 408 | until = string("s").until(string("x"), min=3, max=5) 409 | 410 | self.assertEqual(until.parse_partial("sssx"), (3 * ["s"], "x")) 411 | self.assertEqual(until.parse_partial("sssssx"), (5 * ["s"], "x")) 412 | 413 | with self.assertRaises(ParseError) as cm: 414 | until.parse_partial("ssx") 415 | assert cm.exception.args[0] == frozenset({"at least 3 items; got 2 item(s)"}) 416 | with self.assertRaises(ParseError) as cm: 417 | until.parse_partial("ssssssx") 418 | assert cm.exception.args[0] == frozenset({"at most 5 items"}) 419 | 420 | def test_optional(self): 421 | p = string("a").optional() 422 | self.assertEqual(p.parse("a"), "a") 423 | self.assertEqual(p.parse(""), None) 424 | p = string("a").optional("b") 425 | self.assertEqual(p.parse("a"), "a") 426 | self.assertEqual(p.parse(""), "b") 427 | 428 | def test_sep_by(self): 429 | digit_list = digit.map(int).sep_by(string(",")) 430 | 431 | self.assertEqual(digit_list.parse("1,2,3,4"), [1, 2, 3, 4]) 432 | self.assertEqual(digit_list.parse("9,0,4,7"), [9, 0, 4, 7]) 433 | self.assertEqual(digit_list.parse("3,7"), [3, 7]) 434 | self.assertEqual(digit_list.parse("8"), [8]) 435 | self.assertEqual(digit_list.parse(""), []) 436 | 437 | self.assertRaises(ParseError, digit_list.parse, "8,") 438 | self.assertRaises(ParseError, digit_list.parse, ",9") 439 | self.assertRaises(ParseError, digit_list.parse, "82") 440 | self.assertRaises(ParseError, digit_list.parse, "7.6") 441 | 442 | def test_sep_by_with_min_and_max(self): 443 | digit_list = digit.map(int).sep_by(string(","), min=2, max=4) 444 | 445 | self.assertEqual(digit_list.parse("1,2,3,4"), [1, 2, 3, 4]) 446 | self.assertEqual(digit_list.parse("9,0,4,7"), [9, 0, 4, 7]) 
447 | self.assertEqual(digit_list.parse("3,7"), [3, 7]) 448 | 449 | self.assertRaises(ParseError, digit_list.parse, "8") 450 | self.assertRaises(ParseError, digit_list.parse, "") 451 | self.assertRaises(ParseError, digit_list.parse, "8,") 452 | self.assertRaises(ParseError, digit_list.parse, ",9") 453 | self.assertRaises(ParseError, digit_list.parse, "82") 454 | self.assertRaises(ParseError, digit_list.parse, "7.6") 455 | self.assertEqual(digit.sep_by(string(","), max=0).parse(""), []) 456 | 457 | def test_add(self): 458 | self.assertEqual((letter + digit).parse("a1"), "a1") 459 | 460 | def test_multiply(self): 461 | self.assertEqual((letter * 3).parse("abc"), ["a", "b", "c"]) 462 | 463 | def test_multiply_range(self): 464 | self.assertEqual((letter * range(1, 2)).parse("a"), ["a"]) 465 | self.assertRaises(ParseError, (letter * range(1, 2)).parse, "aa") 466 | 467 | # Primitives 468 | def test_alt(self): 469 | self.assertRaises(ParseError, alt().parse, "") 470 | self.assertEqual(alt(letter, digit).parse("a"), "a") 471 | self.assertEqual(alt(letter, digit).parse("1"), "1") 472 | self.assertRaises(ParseError, alt(letter, digit).parse, ".") 473 | 474 | def test_seq(self): 475 | self.assertEqual(seq().parse(""), []) 476 | self.assertEqual(seq(letter).parse("a"), ["a"]) 477 | self.assertEqual(seq(letter, digit).parse("a1"), ["a", "1"]) 478 | self.assertRaises(ParseError, seq(letter, digit).parse, "1a") 479 | 480 | def test_seq_kwargs(self): 481 | self.assertEqual( 482 | seq(first_name=regex(r"\S+") << whitespace, last_name=regex(r"\S+")).parse("Jane Smith"), 483 | {"first_name": "Jane", "last_name": "Smith"}, 484 | ) 485 | 486 | def test_seq_kwargs_fail(self): 487 | self.assertRaises(ParseError, seq(a=string("a")).parse, "b") 488 | 489 | def test_seq_kwargs_error(self): 490 | self.assertRaises(ValueError, lambda: seq(string("a"), b=string("b"))) 491 | 492 | def test_test_char(self): 493 | ascii = parsy_test_char(lambda c: ord(c) < 128, "ascii character") 494 | 
self.assertEqual(ascii.parse("a"), "a") 495 | with self.assertRaises(ParseError) as err: 496 | ascii.parse("☺") 497 | ex = err.exception 498 | self.assertEqual(str(ex), """expected 'ascii character' at 0:0""") 499 | 500 | with self.assertRaises(ParseError) as err: 501 | ascii.parse("") 502 | ex = err.exception 503 | self.assertEqual(str(ex), """expected 'ascii character' at 0:0""") 504 | 505 | def test_char_from_str(self): 506 | ab = char_from("ab") 507 | self.assertEqual(ab.parse("a"), "a") 508 | self.assertEqual(ab.parse("b"), "b") 509 | 510 | with self.assertRaises(ParseError) as err: 511 | ab.parse("x") 512 | 513 | ex = err.exception 514 | self.assertEqual(str(ex), """expected '[ab]' at 0:0""") 515 | 516 | def test_char_from_bytes(self): 517 | ab = char_from(b"ab") 518 | self.assertEqual(ab.parse(b"a"), b"a") 519 | self.assertEqual(ab.parse(b"b"), b"b") 520 | 521 | with self.assertRaises(ParseError) as err: 522 | ab.parse(b"x") 523 | 524 | ex = err.exception 525 | self.assertEqual(str(ex), """expected b'[ab]' at 0""") 526 | 527 | def test_string_from(self): 528 | titles = string_from("Mr", "Mr.", "Mrs", "Mrs.") 529 | self.assertEqual(titles.parse("Mr"), "Mr") 530 | self.assertEqual(titles.parse("Mr."), "Mr.") 531 | self.assertEqual((titles + string(" Hyde")).parse("Mr. Hyde"), "Mr. Hyde") 532 | with self.assertRaises(ParseError) as err: 533 | titles.parse("foo") 534 | 535 | ex = err.exception 536 | self.assertEqual(str(ex), """expected one of 'Mr', 'Mr.', 'Mrs', 'Mrs.' 
at 0:0""") 537 | 538 | def test_string_from_transform(self): 539 | titles = string_from("Mr", "Mr.", "Mrs", "Mrs.", transform=lambda s: s.lower()) 540 | self.assertEqual(titles.parse("mr"), "Mr") 541 | self.assertEqual(titles.parse("mr."), "Mr.") 542 | self.assertEqual(titles.parse("MR"), "Mr") 543 | self.assertEqual(titles.parse("MR."), "Mr.") 544 | 545 | def test_peek(self): 546 | self.assertEqual(peek(any_char).parse_partial("abc"), ("a", "abc")) 547 | with self.assertRaises(ParseError) as err: 548 | peek(digit).parse("a") 549 | self.assertEqual(str(err.exception), "expected 'a digit' at 0:0") 550 | 551 | def test_any_char(self): 552 | self.assertEqual(any_char.parse("x"), "x") 553 | self.assertEqual(any_char.parse("\n"), "\n") 554 | self.assertRaises(ParseError, any_char.parse, "") 555 | 556 | def test_whitespace(self): 557 | self.assertEqual(whitespace.parse("\n"), "\n") 558 | self.assertEqual(whitespace.parse(" "), " ") 559 | self.assertRaises(ParseError, whitespace.parse, "x") 560 | 561 | def test_letter(self): 562 | self.assertEqual(letter.parse("a"), "a") 563 | self.assertRaises(ParseError, letter.parse, "1") 564 | 565 | def test_digit(self): 566 | self.assertEqual(digit.parse("¹"), "¹") 567 | self.assertEqual(digit.parse("2"), "2") 568 | self.assertRaises(ParseError, digit.parse, "x") 569 | 570 | def test_decimal_digit(self): 571 | self.assertEqual(decimal_digit.at_least(1).concat().parse("9876543210"), "9876543210") 572 | self.assertRaises(ParseError, decimal_digit.parse, "¹") 573 | 574 | def test_line_info(self): 575 | @generate 576 | def foo(): 577 | i = yield line_info 578 | l = yield any_char 579 | return (l, i) 580 | 581 | self.assertEqual( 582 | foo.many().parse("AB\nCD"), 583 | [ 584 | ("A", (0, 0)), 585 | ("B", (0, 1)), 586 | ("\n", (0, 2)), 587 | ("C", (1, 0)), 588 | ("D", (1, 1)), 589 | ], 590 | ) 591 | 592 | def test_should_fail(self): 593 | not_a_digit = digit.should_fail("not a digit") >> regex(r".*") 594 | 595 | 
self.assertEqual(not_a_digit.parse("a"), "a") 596 | self.assertEqual(not_a_digit.parse("abc"), "abc") 597 | self.assertEqual(not_a_digit.parse("a10"), "a10") 598 | self.assertEqual(not_a_digit.parse(""), "") 599 | 600 | with self.assertRaises(ParseError) as err: 601 | not_a_digit.parse("8") 602 | self.assertEqual(str(err.exception), "expected 'not a digit' at 0:0") 603 | 604 | self.assertRaises(ParseError, not_a_digit.parse, "8ab") 605 | 606 | def test_from_enum_string(self): 607 | class Pet(enum.Enum): 608 | CAT = "cat" 609 | DOG = "dog" 610 | 611 | pet = from_enum(Pet) 612 | self.assertEqual(pet.parse("cat"), Pet.CAT) 613 | self.assertEqual(pet.parse("dog"), Pet.DOG) 614 | self.assertRaises(ParseError, pet.parse, "foo") 615 | 616 | def test_from_enum_int(self): 617 | class Position(enum.Enum): 618 | FIRST = 1 619 | SECOND = 2 620 | 621 | position = from_enum(Position) 622 | self.assertEqual(position.parse("1"), Position.FIRST) 623 | self.assertEqual(position.parse("2"), Position.SECOND) 624 | self.assertRaises(ParseError, position.parse, "foo") 625 | 626 | def test_from_enum_transform(self): 627 | class Pet(enum.Enum): 628 | CAT = "cat" 629 | DOG = "dog" 630 | 631 | pet = from_enum(Pet, transform=lambda s: s.lower()) 632 | self.assertEqual(pet.parse("cat"), Pet.CAT) 633 | self.assertEqual(pet.parse("CAT"), Pet.CAT) 634 | 635 | 636 | class TestParserTokens(unittest.TestCase): 637 | """ 638 | Tests that ensure that `.parse` can handle an arbitrary list of tokens, 639 | rather than a string. 
640 | """ 641 | 642 | # Some opaque objects we will use in our stream: 643 | START = object() 644 | STOP = object() 645 | 646 | def test_test_item(self): 647 | start_stop = parsy_test_item(lambda i: i in [self.START, self.STOP], "START/STOP") 648 | self.assertEqual(start_stop.parse([self.START]), self.START) 649 | self.assertEqual(start_stop.parse([self.STOP]), self.STOP) 650 | with self.assertRaises(ParseError) as err: 651 | start_stop.many().parse([self.START, "hello"]) 652 | 653 | ex = err.exception 654 | self.assertEqual(str(ex), "expected one of 'EOF', 'START/STOP' at 1") 655 | self.assertEqual(ex.expected, {"EOF", "START/STOP"}) 656 | self.assertEqual(ex.index, 1) 657 | 658 | def test_match_item(self): 659 | self.assertEqual(match_item(self.START).parse([self.START]), self.START) 660 | with self.assertRaises(ParseError) as err: 661 | match_item(self.START, "START").parse([]) 662 | 663 | ex = err.exception 664 | self.assertEqual(str(ex), "expected 'START' at 0") 665 | 666 | def test_parse_tokens(self): 667 | other_vals = parsy_test_item(lambda i: i not in [self.START, self.STOP], "not START/STOP") 668 | 669 | bracketed = match_item(self.START) >> other_vals.many() << match_item(self.STOP) 670 | stream = [self.START, "hello", 1, 2, "goodbye", self.STOP] 671 | result = bracketed.parse(stream) 672 | self.assertEqual(result, ["hello", 1, 2, "goodbye"]) 673 | 674 | def test_index(self): 675 | @generate 676 | def foo(): 677 | i = yield index 678 | l = yield letter 679 | return (l, i) 680 | 681 | self.assertEqual(foo.many().parse(["A", "B"]), [("A", 0), ("B", 1)]) 682 | 683 | 684 | class TestUtils(unittest.TestCase): 685 | def test_line_info_at(self): 686 | text = "abc\ndef" 687 | self.assertEqual(line_info_at(text, 0), (0, 0)) 688 | self.assertEqual(line_info_at(text, 2), (0, 2)) 689 | self.assertEqual(line_info_at(text, 3), (0, 3)) 690 | self.assertEqual(line_info_at(text, 4), (1, 0)) 691 | self.assertEqual(line_info_at(text, 7), (1, 3)) 692 | 
self.assertRaises(ValueError, lambda: line_info_at(text, 8)) 693 | 694 | 695 | class TestForwardDeclaration(unittest.TestCase): 696 | def test_forward_declaration_1(self): 697 | # This is the example from the docs 698 | expr = forward_declaration() 699 | with self.assertRaises(ValueError): 700 | expr.parse("()") 701 | 702 | with self.assertRaises(ValueError): 703 | expr.parse_partial("()") 704 | 705 | simple = regex("[0-9]+").map(int) 706 | group = string("(") >> expr.sep_by(string(" ")) << string(")") 707 | expr.become(simple | group) 708 | 709 | self.assertEqual(expr.parse("(0 1 (2 3))"), [0, 1, [2, 3]]) 710 | 711 | def test_forward_declaration_2(self): 712 | # Simplest example I could think of 713 | expr = forward_declaration() 714 | expr.become(string("A") + expr | string("Z")) 715 | 716 | self.assertEqual(expr.parse("Z"), "Z") 717 | self.assertEqual(expr.parse("AZ"), "AZ") 718 | self.assertEqual(expr.parse("AAAAAZ"), "AAAAAZ") 719 | 720 | with self.assertRaises(ParseError): 721 | expr.parse("A") 722 | 723 | with self.assertRaises(ParseError): 724 | expr.parse("B") 725 | 726 | self.assertEqual(expr.parse_partial("AAZXX"), ("AAZ", "XX")) 727 | 728 | def test_forward_declaration_cant_become_twice(self): 729 | dec = forward_declaration() 730 | other = string("X") 731 | dec.become(other) 732 | 733 | with self.assertRaises((AttributeError, TypeError)): 734 | dec.become(other) 735 | 736 | 737 | if __name__ == "__main__": 738 | unittest.main() 739 | -------------------------------------------------------------------------------- /tests/test_sexpr.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unittest 3 | 4 | from parsy import generate, regex, string 5 | 6 | whitespace = regex(r"\s+", re.MULTILINE) 7 | comment = regex(r";.*") 8 | ignore = (whitespace | comment).many() 9 | 10 | lexeme = lambda p: p << ignore 11 | 12 | lparen = lexeme(string("(")) 13 | rparen = lexeme(string(")")) 14 | number = 
lexeme(regex(r"\d+")).map(int) 15 | symbol = lexeme(regex(r"[\d\w_-]+")) 16 | true = lexeme(string("#t")).result(True) 17 | false = lexeme(string("#f")).result(False) 18 | 19 | atom = true | false | number | symbol 20 | 21 | 22 | @generate("a form") 23 | def form(): 24 | yield lparen 25 | els = yield expr.many() 26 | yield rparen 27 | return els 28 | 29 | 30 | @generate 31 | def quote(): 32 | yield string("'") 33 | e = yield expr 34 | return ["quote", e] 35 | 36 | 37 | expr = form | quote | atom 38 | program = ignore >> expr.many() 39 | 40 | 41 | class TestSexpr(unittest.TestCase): 42 | def test_form(self): 43 | result = program.parse("(1 2 3)") 44 | self.assertEqual(result, [[1, 2, 3]]) 45 | 46 | def test_quote(self): 47 | result = program.parse("'foo '(bar baz)") 48 | self.assertEqual(result, [["quote", "foo"], ["quote", ["bar", "baz"]]]) 49 | 50 | def test_double_quote(self): 51 | result = program.parse("''foo") 52 | self.assertEqual(result, [["quote", ["quote", "foo"]]]) 53 | 54 | def test_boolean(self): 55 | result = program.parse("#t #f") 56 | self.assertEqual(result, [True, False]) 57 | 58 | def test_comments(self): 59 | result = program.parse( 60 | """ 61 | ; a program with a comment 62 | ( foo ; that's a foo 63 | bar ) 64 | ; some comments at the end 65 | """ 66 | ) 67 | 68 | self.assertEqual(result, [["foo", "bar"]]) 69 | 70 | 71 | if __name__ == "__main__": 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py39,py310,py311,py312,py313,pypy39,isort-check,flake8-check 3 | 4 | [testenv] 5 | commands = pytest 6 | allowlist_externals = ["pytest"] 7 | deps = -e . 
8 | pytest 9 | 10 | [testenv:isort-check] 11 | # isort configurations are located in pyproject.toml 12 | basepython = python3.9 13 | deps = -r tests/requirements-linters.txt 14 | commands = isort -c {toxinidir} 15 | 16 | [testenv:flake8-check] 17 | basepython = python3.9 18 | deps = -r tests/requirements-linters.txt 19 | commands = flake8 20 | --------------------------------------------------------------------------------