├── .fleet └── run.json ├── .github └── workflows │ ├── gh-pages.yml │ ├── publish-to-pypi.yml │ └── python-checks.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── Makefile ├── README.md ├── docs ├── api │ ├── index.md │ ├── lexer.md │ ├── parser.md │ └── util.md ├── changes.md ├── getting-started │ ├── index.md │ ├── parse-tree.md │ ├── parsing.md │ ├── tips-and-tricks.md │ └── tokenizing.md ├── index.md └── media │ └── extra.css ├── funcparserlib ├── __init__.py ├── lexer.py ├── parser.py ├── py.typed └── util.py ├── mkdocs.yml ├── mypy.ini ├── poetry.lock ├── pyproject.toml ├── tests ├── __init__.py ├── dot.py ├── json.py ├── test_dot.py ├── test_json.py └── test_parsing.py └── tox.ini /.fleet/run.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "type": "python-tests", 5 | "name": "Unit tests", 6 | "testFramework": "unittest" 7 | }, 8 | { 9 | "type": "command", 10 | "name": "pre-commit", 11 | "program": "pre-commit", 12 | "args": ["run", "-a"] 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /.github/workflows/gh-pages.yml: -------------------------------------------------------------------------------- 1 | name: Deploy docs to GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python 3.12 14 | uses: actions/setup-python@v3 15 | with: 16 | python-version: "3.12" 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install poetry 21 | poetry install 22 | - name: Build docs with mkdocs 23 | run: | 24 | poetry run mkdocs build 25 | - name: Deploy 26 | uses: peaceiris/actions-gh-pages@v3 27 | with: 28 | github_token: ${{ secrets.GITHUB_TOKEN }} 29 | publish_dir: ./site 30 | cname: funcparserlib.pirx.ru 31 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - "*" 7 | 8 | jobs: 9 | build-n-publish: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python 3.12 14 | uses: actions/setup-python@v3 15 | with: 16 | python-version: "3.12" 17 | - name: Install build tools 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install poetry 21 | - name: Build sdist and wheel 22 | run: | 23 | poetry build --no-interaction 24 | - name: Publish distribution to PyPI 25 | uses: pypa/gh-action-pypi-publish@release/v1 26 | with: 27 | user: __token__ 28 | password: ${{ secrets.PYPI_API_TOKEN }} 29 | -------------------------------------------------------------------------------- /.github/workflows/python-checks.yml: -------------------------------------------------------------------------------- 1 | name: Python checks 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | pre-commit-checks: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: 17 | - "3.8" 18 | - "3.9" 19 | - "3.10" 20 | - "3.11" 21 | - "3.12" 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v3 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | 
python -m pip install --upgrade pip 31 | pip install poetry 32 | poetry install 33 | - name: Run pre-commit checks 34 | uses: pre-commit/action@v3.0.0 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info/ 2 | *.pyc 3 | *.swp 4 | .eggs/ 5 | .idea/ 6 | .tox/ 7 | __pycache__/ 8 | build/ 9 | dist/ 10 | site/ 11 | venv/ 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: "23.11.0" 4 | hooks: 5 | - id: black 6 | - repo: https://github.com/PyCQA/flake8 7 | rev: "6.1.0" 8 | hooks: 9 | - id: flake8 10 | args: ["--max-line-length=88"] 11 | - repo: https://github.com/pre-commit/mirrors-mypy 12 | rev: "v1.7.0" 13 | hooks: 14 | - id: mypy 15 | - repo: local 16 | hooks: 17 | - id: unittest 18 | name: unittest 19 | entry: poetry run python -m unittest discover 20 | language: system 21 | types: 22 | - python 23 | pass_filenames: false 24 | - repo: local 25 | hooks: 26 | - id: doctest 27 | name: doctest 28 | entry: poetry run python -m doctest 29 | language: system 30 | files: (^funcparserlib/|^docs/) 31 | types_or: 32 | - python 33 | - markdown 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright © 2009/2023 Andrey Vlasovskikh 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 4 | software and associated documentation files (the "Software"), to deal in the Software 5 | without restriction, including without limitation the rights to use, copy, modify, 6 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | permit persons to whom the Software is furnished to do so, subject to the following 8 | conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all copies or 11 | substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 14 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 15 | PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 16 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 17 | OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 18 | OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: default install test doctest unittest clean poetry-install tox mypy 2 | 3 | default: poetry-install 4 | poetry build 5 | 6 | poetry-install: 7 | poetry install 8 | 9 | test: unittest 10 | 11 | tox: 12 | poetry run tox 13 | 14 | clean: 15 | rm -fr build dist *.egg-info .tox 16 | find . -name '*.pyc' | xargs rm -f 17 | find . 
-name __pycache__ | xargs rm -fr 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Funcparserlib 2 | ============= 3 | 4 | Recursive descent parsing library for Python based on functional combinators. 5 | 6 | [![PyPI](https://img.shields.io/pypi/v/funcparserlib)](https://pypi.org/project/funcparserlib/) 7 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/funcparserlib)](https://pypi.org/project/funcparserlib/) 8 | 9 | 10 | Description 11 | ----------- 12 | 13 | The primary focus of `funcparserlib` is **parsing little languages** or **external DSLs** (domain specific languages). 14 | 15 | Parsers made with `funcparserlib` are pure-Python LL(\*) parsers. This means that it's **very easy to write parsers** without thinking about lookaheads and other hardcore parsing stuff. However, recursive descent parsing is a rather slow method compared to LL(k) or LR(k) algorithms. Still, parsing with `funcparserlib` is **at least twice as fast as PyParsing**, a very popular library for Python. 16 | 17 | The source code of `funcparserlib` is only 1.2K lines of code, with lots of comments. Its API is fully type hinted. It features the longest parsed prefix error reporting, as well as a tiny lexer generator for token position tracking. 18 | 19 | The idea of parser combinators used in `funcparserlib` comes from the [Introduction to Functional Programming](https://www.cl.cam.ac.uk/teaching/Lectures/funprog-jrh-1996/) course. We have converted it from ML into Python. 20 | 21 | 22 | Installation 23 | ------------ 24 | 25 | You can install `funcparserlib` from [PyPI](https://pypi.org/project/funcparserlib/): 26 | 27 | ```shell 28 | $ pip install funcparserlib 29 | ``` 30 | 31 | There are no dependencies on other libraries. 32 | 33 | 34 | Documentation 35 | ------------- 36 | 37 | * [Getting Started](https://funcparserlib.pirx.ru/getting-started/) 38 | * Your **starting point** with `funcparserlib` 39 | * [API Reference](https://funcparserlib.pirx.ru/api/) 40 | * Learn the details of the API 41 | 42 | There are several examples available in the `tests/` directory: 43 | 44 | * [GraphViz DOT parser](https://github.com/vlasovskikh/funcparserlib/blob/master/tests/dot.py) 45 | * [JSON parser](https://github.com/vlasovskikh/funcparserlib/blob/master/tests/json.py) 46 | 47 | See also [the changelog](https://funcparserlib.pirx.ru/changes/). 48 | 49 | 50 | Example 51 | ------- 52 | 53 | Let's consider a little language of **numeric expressions** with a syntax similar to Python expressions.
Here are some expression strings in this language: 54 | 55 | ``` 56 | 0 57 | 1 + 2 + 3 58 | -1 + 2 ** 32 59 | 3.1415926 * (2 + 7.18281828e-1) * 42 60 | ``` 61 | 62 | 63 | Here is **the complete source code** of the tokenizer and the parser for this language written using `funcparserlib`: 64 | 65 | ```python 66 | from typing import List, Tuple, Union 67 | from dataclasses import dataclass 68 | 69 | from funcparserlib.lexer import make_tokenizer, TokenSpec, Token 70 | from funcparserlib.parser import tok, Parser, many, forward_decl, finished 71 | 72 | 73 | @dataclass 74 | class BinaryExpr: 75 | op: str 76 | left: "Expr" 77 | right: "Expr" 78 | 79 | 80 | Expr = Union[BinaryExpr, int, float] 81 | 82 | 83 | def tokenize(s: str) -> List[Token]: 84 | specs = [ 85 | TokenSpec("whitespace", r"\s+"), 86 | TokenSpec("float", r"[+\-]?\d+\.\d*([Ee][+\-]?\d+)*"), 87 | TokenSpec("int", r"[+\-]?\d+"), 88 | TokenSpec("op", r"(\*\*)|[+\-*/()]"), 89 | ] 90 | tokenizer = make_tokenizer(specs) 91 | return [t for t in tokenizer(s) if t.type != "whitespace"] 92 | 93 | 94 | def parse(tokens: List[Token]) -> Expr: 95 | int_num = tok("int") >> int 96 | float_num = tok("float") >> float 97 | number = int_num | float_num 98 | 99 | expr: Parser[Token, Expr] = forward_decl() 100 | parenthesized = -op("(") + expr + -op(")") 101 | primary = number | parenthesized 102 | power = primary + many(op("**") + primary) >> to_expr 103 | term = power + many((op("*") | op("/")) + power) >> to_expr 104 | sum = term + many((op("+") | op("-")) + term) >> to_expr 105 | expr.define(sum) 106 | 107 | document = expr + -finished 108 | 109 | return document.parse(tokens) 110 | 111 | 112 | def op(name: str) -> Parser[Token, str]: 113 | return tok("op", name) 114 | 115 | 116 | def to_expr(args: Tuple[Expr, List[Tuple[str, Expr]]]) -> Expr: 117 | first, rest = args 118 | result = first 119 | for op, expr in rest: 120 | result = BinaryExpr(op, result, expr) 121 | return result 122 | ``` 123 | 124 | Now, consider this numeric expression: `3.1415926 * (2 + 7.18281828e-1) * 42`. 125 | 126 | Let's `tokenize()` it using the tokenizer we've created with `funcparserlib.lexer`: 127 | 128 | ``` 129 | [ 130 | Token('float', '3.1415926'), 131 | Token('op', '*'), 132 | Token('op', '('), 133 | Token('int', '2'), 134 | Token('op', '+'), 135 | Token('float', '7.18281828e-1'), 136 | Token('op', ')'), 137 | Token('op', '*'), 138 | Token('int', '42'), 139 | ] 140 | ``` 141 | 142 | Let's `parse()` these tokens into an expression tree using our parser created with `funcparserlib.parser`: 143 | 144 | ``` 145 | BinaryExpr( 146 | op='*', 147 | left=BinaryExpr( 148 | op='*', 149 | left=3.1415926, 150 | right=BinaryExpr(op='+', left=2, right=0.718281828), 151 | ), 152 | right=42, 153 | ) 154 | ``` 155 | 156 | Learn how to write this parser using `funcparserlib` in the [Getting Started](https://funcparserlib.pirx.ru/getting-started/) guide! 157 | 158 | 159 | Used By 160 | ------- 161 | 162 | Some open-source projects that use `funcparserlib` as an explicit dependency: 163 | 164 | * [Hy](https://github.com/hylang/hy), a Lisp dialect that's embedded in Python 165 | * 4.7K stars, version `~=1.0`, Python 3.8+ 166 | * [Splash](https://github.com/scrapinghub/splash), a JavaScript rendering service with HTTP API, by Scrapinghub 167 | * 3.9K stars, version `*`. 
Python 3 in Docker 168 | * [graphite-beacon](https://github.com/klen/graphite-beacon), a simple alerting system for Graphite metrics 169 | * 453 stars, version `==0.3.6`, Python 2 and 3 170 | * [blockdiag](https://github.com/blockdiag/blockdiag), generates block-diagram image file from spec-text file 171 | * 194 stars, version `>= 1.0.0a0`, Python 3.7+ 172 | * [kll](https://github.com/kiibohd/kll), Keyboard Layout Language (KLL) compiler 173 | * 113 stars, copied source code, Python 3.5+ 174 | 175 | 176 | Next 177 | ---- 178 | 179 | Read the [Getting Started](https://funcparserlib.pirx.ru/getting-started/) guide to start learning `funcparserlib`. 180 | -------------------------------------------------------------------------------- /docs/api/index.md: -------------------------------------------------------------------------------- 1 | # API Reference 2 | 3 | Funcparserlib consists of the following modules: 4 | 5 | * [`funcparserlib.lexer` — Regexp-based tokenizer](lexer.md) 6 | * [`funcparserlib.parser` — Functional parsing combinators](parser.md) 7 | * [`funcparserlib.util` — Various utilities](util.md) 8 | -------------------------------------------------------------------------------- /docs/api/lexer.md: -------------------------------------------------------------------------------- 1 | # `funcparserlib.lexer` — Regexp-based tokenizer 2 | 3 | ::: funcparserlib.lexer.make_tokenizer 4 | 5 | ::: funcparserlib.lexer.TokenSpec 6 | 7 | ::: funcparserlib.lexer.TokenSpec.__init__ 8 | rendering: 9 | heading_level: 3 10 | 11 | ::: funcparserlib.lexer.Token 12 | -------------------------------------------------------------------------------- /docs/api/parser.md: -------------------------------------------------------------------------------- 1 | # `funcparserlib.parser` — Functional parsing combinators 2 | 3 | ::: funcparserlib.parser 4 | rendering: 5 | show_root_heading: false 6 | 7 | ::: funcparserlib.parser.Parser 8 | 9 | ::: funcparserlib.parser.Parser.parse 10 | rendering: 11 | heading_level: 3 12 | 13 | ::: funcparserlib.parser.Parser.define 14 | rendering: 15 | heading_level: 3 16 | 17 | ::: funcparserlib.parser.Parser.named 18 | rendering: 19 | heading_level: 3 20 | 21 | 22 | Primitive Parsers 23 | ----------------- 24 | 25 | ::: funcparserlib.parser.tok 26 | rendering: 27 | heading_level: 3 28 | 29 | ::: funcparserlib.parser.a 30 | rendering: 31 | heading_level: 3 32 | 33 | ::: funcparserlib.parser.some 34 | rendering: 35 | heading_level: 3 36 | 37 | ::: funcparserlib.parser.forward_decl 38 | rendering: 39 | heading_level: 3 40 | 41 | ### `finished` 42 | 43 | A parser that throws an exception if there are any unparsed tokens left in the sequence. 44 | 45 | Type: `Parser[Any, None]` 46 | 47 | **Examples:** 48 | 49 | ```pycon 50 | >>> from funcparserlib.parser import a, finished 51 | >>> expr = a("x") + finished 52 | >>> expr.parse("x") 53 | ('x', None) 54 | 55 | ``` 56 | 57 | ```pycon 58 | >>> expr = a("x") + finished 59 | >>> expr.parse("xy") 60 | Traceback (most recent call last): 61 | ... 
62 | funcparserlib.parser.NoParseError: got unexpected token: 'y', expected: end of input 63 | 64 | ``` 65 | 66 | 67 | Parser Combinators 68 | ------------------ 69 | 70 | ::: funcparserlib.parser.Parser.__add__ 71 | rendering: 72 | heading_level: 3 73 | 74 | ::: funcparserlib.parser.Parser.__neg__ 75 | rendering: 76 | heading_level: 3 77 | 78 | ::: funcparserlib.parser.Parser.__or__ 79 | rendering: 80 | heading_level: 3 81 | 82 | ::: funcparserlib.parser.Parser.__rshift__ 83 | rendering: 84 | heading_level: 3 85 | 86 | ::: funcparserlib.parser.maybe 87 | rendering: 88 | heading_level: 3 89 | 90 | ::: funcparserlib.parser.many 91 | rendering: 92 | heading_level: 3 93 | 94 | ::: funcparserlib.parser.oneplus 95 | rendering: 96 | heading_level: 3 97 | 98 | ::: funcparserlib.parser.skip 99 | rendering: 100 | heading_level: 3 101 | 102 | 103 | Extra: Parser Monad 104 | ------------------- 105 | 106 | As a functional programmer, you might be pleased to know that parsers in funcparserlib 107 | form _a monad_ with `Parser.bind()` as `>>=` and `pure()` as `return`. 108 | 109 | We could have expressed other parsing combinators in terms of `bind()`, but it would be 110 | inefficient in Python: 111 | 112 | ```python 113 | # noinspection PyUnresolvedReferences 114 | class Parser: 115 | def __add__(self, other): 116 | return self.bind(lambda x: other.bind(lambda y: pure((x, y)))) 117 | 118 | def __rshift__(self, f): 119 | return self.bind(lambda x: pure(f(x))) 120 | ``` 121 | 122 | ::: funcparserlib.parser.Parser.bind 123 | rendering: 124 | heading_level: 3 125 | 126 | ::: funcparserlib.parser.pure 127 | rendering: 128 | heading_level: 3 129 | -------------------------------------------------------------------------------- /docs/api/util.md: -------------------------------------------------------------------------------- 1 | # `funcparserlib.util` — Various utilities 2 | 3 | ::: funcparserlib.util.pretty_tree 4 | -------------------------------------------------------------------------------- /docs/changes.md: -------------------------------------------------------------------------------- 1 | The Changelog 2 | ============= 3 | 4 | 2.0.0 — to be released 5 | ---------------------- 6 | 7 | Dropped support for Python 2.7 (end of life). For compatibility with Python 2.7 please 8 | use version `>=1.0,==1.*` (`~=1.0`). 9 | 10 | ### Added 11 | 12 | * Added support for Python 3.12 13 | 14 | ### Changed 15 | 16 | * Dropped support for Python 2.7 17 | * Dropped support for Python 3.7 18 | 19 | 20 | 1.0.1 — 2022-11-04 21 | ------------------ 22 | 23 | ### Added 24 | 25 | * Added support for Python 3.11 26 | 27 | 28 | 1.0.0 — 2022-05-02 29 | ------------------ 30 | 31 | The stable 1.0.0 release freezes the API of funcparserlib 0.3.6 which was released on 32 | 2013-05-02, with a few bug fixes and small features.
33 | 34 | ### Added 35 | 36 | * Added support for Python 3.10 37 | * Added support for Python 3.9 38 | ([#63](https://github.com/vlasovskikh/funcparserlib/pull/63)) 39 | (Thanks to [@pkulev](https://github.com/pkulev)) 40 | * Added support for Python 3.8 41 | * Added `-p` (the same as `skip(p)`) with more strict type hints for `-p` and `p1 + p2` 42 | * Added `tok(type[, value])` for more compact grammars, better error messages 43 | * Added `TokenSpec(type, pattern[, flags])` to simplify the use of `make_tokenizer()` 44 | * Added type hints for the public API 45 | * Added the new library homepage with the new Getting Started guide and the new API 46 | reference 47 | 48 | ### Changed 49 | 50 | * Parse exceptions now show expected tokens and grammar rules at the stopped position 51 | ([#52](https://github.com/vlasovskikh/funcparserlib/issues/52)) 52 | * Dropped support for Python 3.4, 3.5, 3.6 (end of life) 53 | * Dropped support for Python 2.5, 2.6, 3.3 (end of life), modernized code for Python 54 | 3 to run without obsolete `2to3` 55 | ([#57](https://github.com/vlasovskikh/funcparserlib/pull/57)) 56 | (Thanks to [@jdufresne](https://github.com/jdufresne)) 57 | * Removed documentation and unit tests from the distribution 58 | * Switched from setuptools to Poetry 59 | * Switched to poetry-core for lighter PEP 517 builds 60 | ([#73](https://github.com/vlasovskikh/funcparserlib/pull/73)) 61 | (Thanks to [@fabaff](https://github.com/fabaff)) 62 | * Run unit tests on GitHub Actions for all supported Pythons 63 | 64 | ### Fixed 65 | 66 | * Fixed `TypeError` in `oneplus` when applying it `parser + parser` 67 | ([#66](https://github.com/vlasovskikh/funcparserlib/issues/66)) 68 | (Thanks to [@martica](https://github.com/martica)) 69 | * Fixed `AttributeError` when comparing `Token` objects to `None` 70 | ([#58](https://github.com/vlasovskikh/funcparserlib/pull/58)) 71 | (Thanks to [@Halolegend94](https://github.com/Halolegend94)) 72 | * Fixed doctests in the tutorial 73 | ([#49](https://github.com/vlasovskikh/funcparserlib/issues/49)) 74 | * Fixed several cases of wrong expected tokens in error messages 75 | 76 | 77 | 0.3.6 — 2013-05-02 78 | ------------------ 79 | 80 | ### Changed 81 | 82 | * Python 3 compatibility 83 | * More info available in exception objects 84 | ([#14](https://github.com/vlasovskikh/funcparserlib/issues/14)) 85 | 86 | ### Fixed 87 | 88 | * Fixed `many()` that consumed too many tokens in some cases 89 | ([#31](https://github.com/vlasovskikh/funcparserlib/issues/31)) 90 | 91 | 92 | 0.3.5 — 2011-01-13 93 | ------------------ 94 | 95 | ### Changed 96 | 97 | * Python 2.4 compatibility 98 | * More readable terminal names for error reporting 99 | 100 | ### Fixed 101 | 102 | * Fixed wrong token positions in lexer error messages 103 | 104 | 105 | 0.3.4 — 2009-10-06 106 | ------------------ 107 | 108 | ### Changed 109 | 110 | * Switched from `setuptools` to `distutils` 111 | * Improved the `run-tests` utility 112 | 113 | ### Fixed 114 | 115 | * Fixed importing all symbols from `funcparserlib.lexer` 116 | 117 | 118 | 0.3.3 — 2009-08-03 119 | ------------------ 120 | 121 | ### Added 122 | 123 | * Added a FAQ question about infinite loops in parsers 124 | 125 | ### Changed 126 | 127 | * Debug rule tracing can be enabled again 128 | 129 | ### Fixed 130 | 131 | * Fixed a bug in results of skip + skip parsers 132 | 133 | 134 | 0.3.2 — 2009-07-26 135 | ------------------ 136 | 137 | ### Added 138 | 139 | * Added the Parsing Stages Illustrated page 140 | 141 | ### Fixed 142 | 143 | * Fixed some 
string and number encoding issues in examples 144 | 145 | 146 | 0.3.1 — 2009-07-26 147 | ------------------ 148 | 149 | Major optimizations (10x faster than the version 0.3). 150 | 151 | ### Added 152 | 153 | * Added the `forward_decl` function, that performs better than `with_forward_decls` 154 | * Added the `pretty_tree` function for creating pseudo-graphic trees 155 | * Added the Nested Brackets Mini-HOWTO 156 | * Added `Makefile` and this `CHANGES.md` file 157 | 158 | ### Changed 159 | 160 | * Use a single immutable input sequence in parsers 161 | * Call a wrapped parser directly using `run` (without `__call__`) 162 | * The slow `logging` is enabled only when the `debug` flag is set 163 | 164 | 165 | 0.3 — 2009-07-23 166 | ---------------- 167 | 168 | ### Added 169 | 170 | * Added `pure` and `bind` functions on `Parser`s making them monads 171 | * Added the Funcparserlib Tutorial 172 | * Added a JSON parser as an example 173 | 174 | ### Changed 175 | 176 | * Translated the docs from Russian into English 177 | 178 | 179 | 0.2 — 2009-07-07 180 | ---------------- 181 | 182 | ### Added 183 | 184 | * Added the `with_forward_decls` combinator for dealing with forward declarations 185 | 186 | ### Changed 187 | 188 | * Switched to the iterative implementation of `many` 189 | * Un-curried the parser function type in order to simplify things 190 | * Improvements to the DOT parser 191 | 192 | 193 | 0.1 — 2009-06-26 194 | ---------------- 195 | 196 | Initial release. 197 | -------------------------------------------------------------------------------- /docs/getting-started/index.md: -------------------------------------------------------------------------------- 1 | Getting Started 2 | =============== 3 | 4 | 5 | Intro 6 | ----- 7 | 8 | In this guide, we will write **a parser for a numeric expression calculator** with a syntax similar to Python expressions. Writing a calculator is a common example in articles related to parsers and parsing techniques, so it is a good starting point in learning `funcparserlib`. 9 | 10 | You will learn how to write a parser of numeric expressions using 11 | `funcparserlib`. Here are some expression strings we want to be able to parse: 12 | 13 | ``` 14 | 0 15 | 1 + 2 + 3 16 | -1 + 2 ** 32 17 | 3.1415926 * (2 + 7.18281828e-1) * 42 18 | ``` 19 | 20 | We will parse these strings into trees of objects like this one: 21 | 22 | ``` 23 | BinaryExpr('*') 24 | |-- BinaryExpr('*') 25 | | |-- 3.1415926 26 | | `-- BinaryExpr('+') 27 | | |-- 2 28 | | `-- 0.718281828 29 | `-- 42 30 | ``` 31 | 32 | 33 | Diving In 34 | --------- 35 | 36 | Here is the complete source code of the expression parser we are going to write. 37 | 38 | You are **not** supposed to understand it now. Just look at its shape and try to get some feeling about its structure. By the end of this guide, **you will fully understand this code** and will be able to write parsers for your own needs. 39 | 40 | 41 | ```pycon 42 | >>> from typing import List, Tuple, Union 43 | >>> from dataclasses import dataclass 44 | >>> from funcparserlib.lexer import make_tokenizer, TokenSpec, Token 45 | >>> from funcparserlib.parser import tok, Parser, many, forward_decl, finished 46 | 47 | 48 | >>> @dataclass 49 | ... class BinaryExpr: 50 | ... op: str 51 | ... left: "Expr" 52 | ... right: "Expr" 53 | 54 | 55 | >>> Expr = Union[BinaryExpr, int, float] 56 | 57 | 58 | >>> def tokenize(s: str) -> List[Token]: 59 | ... specs = [ 60 | ... TokenSpec("whitespace", r"\s+"), 61 | ... 
TokenSpec("float", r"[+\-]?\d+\.\d*([Ee][+\-]?\d+)*"), 62 | ... TokenSpec("int", r"[+\-]?\d+"), 63 | ... TokenSpec("op", r"(\*\*)|[+\-*/()]"), 64 | ... ] 65 | ... tokenizer = make_tokenizer(specs) 66 | ... return [t for t in tokenizer(s) if t.type != "whitespace"] 67 | 68 | 69 | >>> def parse(tokens: List[Token]) -> Expr: 70 | ... int_num = tok("int") >> int 71 | ... float_num = tok("float") >> float 72 | ... number = int_num | float_num 73 | ... 74 | ... expr: Parser[Token, Expr] = forward_decl() 75 | ... parenthesized = -op("(") + expr + -op(")") 76 | ... primary = number | parenthesized 77 | ... power = primary + many(op("**") + primary) >> to_expr 78 | ... term = power + many((op("*") | op("/")) + power) >> to_expr 79 | ... sum = term + many((op("+") | op("-")) + term) >> to_expr 80 | ... expr.define(sum) 81 | ... 82 | ... document = expr + -finished 83 | ... 84 | ... return document.parse(tokens) 85 | 86 | 87 | >>> def op(name: str) -> Parser[Token, str]: 88 | ... return tok("op", name) 89 | 90 | 91 | >>> def to_expr(args: Tuple[Expr, List[Tuple[str, Expr]]]) -> Expr: 92 | ... first, rest = args 93 | ... result = first 94 | ... for op, expr in rest: 95 | ... result = BinaryExpr(op, result, expr) 96 | ... return result 97 | 98 | ``` 99 | 100 | !!! Note 101 | 102 | The code examples in this guide are actually executable. You can clone the [funcparserlib](https://github.com/vlasovskikh/funcparserlib) repository from GitHub and run the examples from the document via `doctest`: 103 | 104 | ```sh 105 | python3 -m doctest -v docs/getting-started/*.md 106 | 107 | ``` 108 | 109 | Test the expression parser: 110 | 111 | ```pycon 112 | >>> parse(tokenize("0")) 113 | 0 114 | 115 | >>> parse(tokenize("1 + 2 + 3")) 116 | BinaryExpr(op='+', left=BinaryExpr(op='+', left=1, right=2), right=3) 117 | 118 | >>> parse(tokenize("-1 + 2 ** 32")) 119 | BinaryExpr(op='+', left=-1, right=BinaryExpr(op='**', left=2, right=32)) 120 | 121 | >>> parse(tokenize("3.1415926 * (2 + 7.18281828e-1) * 42")) 122 | BinaryExpr(op='*', left=BinaryExpr(op='*', left=3.1415926, right=BinaryExpr(op='+', left=2, right=0.718281828)), right=42) 123 | 124 | ``` 125 | 126 | 127 | Next 128 | ---- 129 | 130 | Now let's start learning how to write a numeric expression parser using `funcparserlib`. 131 | 132 | In [the next chapter](tokenizing.md) you will learn about the first step in parsing: tokenizing the input. It means splitting your input string into a sequence of tokens that are easier to parse. 133 | -------------------------------------------------------------------------------- /docs/getting-started/parse-tree.md: -------------------------------------------------------------------------------- 1 | Preparing the Parse Tree 2 | ======================== 3 | 4 | So far we have defined the parser for our calculator expressions language: 5 | 6 | 7 | ```pycon 8 | >>> from typing import List 9 | >>> from funcparserlib.lexer import make_tokenizer, TokenSpec, Token 10 | >>> from funcparserlib.parser import tok, Parser, many, forward_decl, finished 11 | 12 | 13 | >>> def tokenize(s: str) -> List[Token]: 14 | ... specs = [ 15 | ... TokenSpec("whitespace", r"\s+"), 16 | ... TokenSpec("float", r"[+\-]?\d+\.\d*([Ee][+\-]?\d+)*"), 17 | ... TokenSpec("int", r"[+\-]?\d+"), 18 | ... TokenSpec("op", r"(\*\*)|[+\-*/()]"), 19 | ... ] 20 | ... tokenizer = make_tokenizer(specs) 21 | ... return [t for t in tokenizer(s) if t.type != "whitespace"] 22 | 23 | 24 | >>> def op(name: str) -> Parser[Token, str]: 25 | ... 
return tok("op", name) 26 | 27 | 28 | >>> int_str = tok("int") 29 | >>> float_str = tok("float") 30 | >>> number = int_str | float_str 31 | >>> expr = forward_decl() 32 | >>> parenthesized = op("(") + expr + op(")") 33 | >>> primary = number | parenthesized 34 | >>> power = primary + many(op("**") + primary) 35 | >>> expr.define(power) 36 | >>> document = expr + finished 37 | 38 | ``` 39 | 40 | Here is how its parse results look so far: 41 | 42 | 43 | ```pycon 44 | >>> document.parse(tokenize("2 ** (3 ** 4)")) 45 | ('2', [('**', ('(', ('3', [('**', '4')]), ')'))], None) 46 | 47 | ``` 48 | 49 | 50 | `p >> f`: Transforming Parse Results 51 | ------------------------------------ 52 | 53 | Let's start improving our parse results by converting numbers from `str` to `int` or `float`. We will use the [`Parser.__rshift__`](../api/parser.md#funcparserlib.parser.Parser.__rshift__) combinator for that. `p >> f` takes a parser `p` and a function `f` of a single argument and returns a new parser that applies `f` to the parse result of `p`. 54 | 55 | An integer parser that returns `int` values: 56 | 57 | ```pycon 58 | >>> int_num: Parser[Token, int] = tok("int") >> int 59 | 60 | ``` 61 | 62 | !!! Note 63 | 64 | We specify the type hint for the parser only for clarity here. We wanted to highlight that `>>` here changes the output type of the parser from `str` to `int`. You may omit type hints for parsers and rely on type inference features of your text editor and type checker to get code completion and linting warnings: 65 | 66 | ```pycon 67 | >>> int_num = tok("int") >> int 68 | 69 | ``` 70 | 71 | The only combinator whose type is not inferrable is `forward_decl()`. You should specify its type explicitly to get your parser fully type checked. 72 | 73 | Try it: 74 | 75 | 76 | ```pycon 77 | >>> int_num.parse(tokenize("42")) 78 | 42 79 | 80 | ``` 81 | 82 | Let's redefine our `number` parser so that it returns either `int` or `float`: 83 | 84 | ```pycon 85 | >>> from typing import Union 86 | 87 | 88 | >>> float_num: Parser[Token, float] = tok("float") >> float 89 | >>> number: Parser[Token, Union[int, float]] = int_num | float_num 90 | 91 | ``` 92 | 93 | Test it: 94 | 95 | ```pycon 96 | >>> number.parse(tokenize("42")) 97 | 42 98 | 99 | >>> number.parse(tokenize("3.14")) 100 | 3.14 101 | 102 | ``` 103 | 104 | 105 | `-p`: Skipping Parse Results 106 | ---------------------------- 107 | 108 | Let's recall our nested parenthesized numbers example: 109 | 110 | ```pycon 111 | >>> p = forward_decl() 112 | >>> p.define(number | (op("(") + p + op(")"))) 113 | 114 | ``` 115 | 116 | Test it: 117 | 118 | ```pycon 119 | >>> p.parse(tokenize("((1))")) 120 | ('(', ('(', 1, ')'), ')') 121 | 122 | ``` 123 | 124 | We have successfully parsed numbers in nested parentheses, but we don't want to see parentheses in the parsing results. Let's skip them using the [`Parser.__neg__`](../api/parser.md#funcparserlib.parser.Parser.__neg__) combinator. It allows you to skip any parts of a sequence of parsers concatenated via `p1 + p2 + ...
+ pN` by using a unary `-p` operator on the ones you want to skip: 125 | 126 | ```pycon 127 | >>> p = forward_decl() 128 | >>> p.define(number | (-op("(") + p + -op(")"))) 129 | 130 | ``` 131 | 132 | The result is cleaner now: 133 | 134 | 135 | ```pycon 136 | >>> p.parse(tokenize("1")) 137 | 1 138 | 139 | >>> p.parse(tokenize("(1)")) 140 | 1 141 | 142 | >>> p.parse(tokenize("((1))")) 143 | 1 144 | 145 | ``` 146 | 147 | Let's re-define our grammar using the [`Parser.__neg__`](../api/parser.md#funcparserlib.parser.Parser.__neg__) combinator to get rid of extra parentheses in the parse results, as well as of extra `None` returned by `finished`: 148 | 149 | ```pycon 150 | >>> expr = forward_decl() 151 | >>> parenthesized = -op("(") + expr + -op(")") 152 | >>> primary = number | parenthesized 153 | >>> power = primary + many(op("**") + primary) 154 | >>> expr.define(power) 155 | >>> document = expr + -finished 156 | 157 | ``` 158 | 159 | Test it: 160 | 161 | ```pycon 162 | >>> document.parse(tokenize("2 ** (3 ** 4)")) 163 | (2, [('**', (3, [('**', 4)]))]) 164 | 165 | ``` 166 | 167 | User-Defined Classes for the Parse Tree 168 | --------------------------------------- 169 | 170 | We have many types of binary operators in our grammar, but we've defined only the `**` power operator so far. Let's define them for `*`, `/`, `+`, `-` as well: 171 | 172 | ```pycon 173 | >>> expr = forward_decl() 174 | >>> parenthesized = -op("(") + expr + -op(")") 175 | >>> primary = number | parenthesized 176 | >>> power = primary + many(op("**") + primary) 177 | >>> term = power + many((op("*") | op("/")) + power) 178 | >>> sum = term + many((op("+") | op("-")) + term) 179 | >>> expr.define(sum) 180 | >>> document = expr + -finished 181 | 182 | ``` 183 | 184 | Here we've introduced a hierarchy of nested parsers: `expr -> sum -> term -> power -> primary -> parenthesized -> expr -> ...` to reflect the order of calculations set by our operator priorities: `+` < `*` < `**` < `()`. 185 | 186 | Test it: 187 | 188 | 189 | ```pycon 190 | >>> document.parse(tokenize("1 * (2 + 0) ** 3")) 191 | (1, [], [('*', (2, [], [], [('+', (0, [], []))], [('**', 3)]))], []) 192 | 193 | ``` 194 | 195 | It's hard to understand the results without proper user-defined classes for our expression types. We actually have 3 expression types: 196 | 197 | * Integer numbers 198 | * Floating point numbers 199 | * Binary expressions 200 | 201 | For integers and floats we will use Python `int` and `float` classes. For binary expressions we'll introduce the `BinaryExpr` class: 202 | 203 | ```pycon 204 | >>> from dataclasses import dataclass 205 | 206 | 207 | >>> @dataclass 208 | ... class BinaryExpr: 209 | ... op: str 210 | ... left: "Expr" 211 | ... right: "Expr" 212 | 213 | ``` 214 | 215 | Since we don't use a common base class for our expressions, we have to define `Expr` as a union of possible expression types: 216 | 217 | 218 | ``` 219 | >>> Expr = Union[BinaryExpr, int, float] 220 | 221 | ``` 222 | 223 | Now let's define a function to transform the parse results of our binary operators into `BinaryExpr` objects. Take a look at our parsers of various binary expressions. You can infer that each of them returns _(expression, list of (operator, expression))_. 
We will transform these nested tuples and lists into a tree of nested expressions by defining a function `to_expr(args)` and applying `>> to_expr` to our expression parsers: 224 | 225 | ```pycon 226 | >>> from typing import Tuple 227 | 228 | 229 | >>> def to_expr(args: Tuple[Expr, List[Tuple[str, Expr]]]) -> Expr: 230 | ... first, rest = args 231 | ... result = first 232 | ... for op, expr in rest: 233 | ... result = BinaryExpr(op, result, expr) 234 | ... return result 235 | 236 | ``` 237 | 238 | Let's re-define our grammar using this transformation: 239 | 240 | 241 | ```pycon 242 | >>> expr: Parser[Token, Expr] = forward_decl() 243 | >>> parenthesized = -op("(") + expr + -op(")") 244 | >>> primary = number | parenthesized 245 | >>> power = primary + many(op("**") + primary) >> to_expr 246 | >>> term = power + many((op("*") | op("/")) + power) >> to_expr 247 | >>> sum = term + many((op("+") | op("-")) + term) >> to_expr 248 | >>> expr.define(sum) 249 | >>> document = expr + -finished 250 | 251 | ``` 252 | 253 | Test it: 254 | 255 | ```pycon 256 | >>> document.parse(tokenize("3.1415926 * (2 + 7.18281828e-1) * 42")) 257 | BinaryExpr(op='*', left=BinaryExpr(op='*', left=3.1415926, right=BinaryExpr(op='+', left=2, right=0.718281828)), right=42) 258 | 259 | ``` 260 | 261 | Let's pretty-print it using the [`pretty_tree(x, kids, show)`](../api/util.md#funcparserlib.util.pretty_tree) function: 262 | 263 | ```pycon 264 | >>> from funcparserlib.util import pretty_tree 265 | 266 | 267 | >>> def pretty_expr(expr: Expr) -> str: 268 | ... 269 | ... def kids(expr: Expr) -> List[Expr]: 270 | ... if isinstance(expr, BinaryExpr): 271 | ... return [expr.left, expr.right] 272 | ... else: 273 | ... return [] 274 | ... 275 | ... def show(expr: Expr) -> str: 276 | ... if isinstance(expr, BinaryExpr): 277 | ... return f"BinaryExpr({expr.op!r})" 278 | ... else: 279 | ... return repr(expr) 280 | ... 281 | ... return pretty_tree(expr, kids, show) 282 | 283 | ``` 284 | 285 | Test it: 286 | 287 | ```pycon 288 | >>> print(pretty_expr(document.parse(tokenize("3.1415926 * (2 + 7.18281828e-1) * 42")))) 289 | BinaryExpr('*') 290 | |-- BinaryExpr('*') 291 | | |-- 3.1415926 292 | | `-- BinaryExpr('+') 293 | | |-- 2 294 | | `-- 0.718281828 295 | `-- 42 296 | 297 | ``` 298 | 299 | 300 | 301 | Finally, we have a proper parse tree that is easy to understand and work with! 302 | 303 | 304 | Next 305 | ---- 306 | 307 | 308 | We've finished writing our numeric expressions parser. 309 | 310 | If you want to learn more, let's discuss a few tips and tricks about parsing in [the next chapter](tips-and-tricks.md). 311 | -------------------------------------------------------------------------------- /docs/getting-started/parsing.md: -------------------------------------------------------------------------------- 1 | Parsing Tokens 2 | ============== 3 | 4 | So far we have defined the tokenizer for our calculator expressions language: 5 | 6 | ```pycon 7 | >>> from typing import List 8 | >>> from funcparserlib.lexer import make_tokenizer, TokenSpec, Token 9 | 10 | 11 | >>> def tokenize(s: str) -> List[Token]: 12 | ... specs = [ 13 | ... TokenSpec("whitespace", r"\s+"), 14 | ... TokenSpec("float", r"[+\-]?\d+\.\d*([Ee][+\-]?\d+)*"), 15 | ... TokenSpec("int", r"[+\-]?\d+"), 16 | ... TokenSpec("op", r"(\*\*)|[+\-*/()]"), 17 | ... ] 18 | ... tokenizer = make_tokenizer(specs) 19 | ... 
return [t for t in tokenizer(s) if t.type != "whitespace"] 20 | 21 | ``` 22 | 23 | It results in a list of tokens which we want to parse according to our expressions grammar: 24 | 25 | ```pycon 26 | >>> from pprint import pprint 27 | 28 | 29 | >>> pprint(tokenize("3.1415926 * (2 + 7.18281828e-1) * 42")) 30 | [Token('float', '3.1415926'), 31 | Token('op', '*'), 32 | Token('op', '('), 33 | Token('int', '2'), 34 | Token('op', '+'), 35 | Token('float', '7.18281828e-1'), 36 | Token('op', ')'), 37 | Token('op', '*'), 38 | Token('int', '42')] 39 | 40 | ``` 41 | 42 | 43 | Parser Combinators 44 | ------------------ 45 | 46 | A **parser** is an object that takes input tokens and transforms them into a parse result. For example, a **primitive parser** [`tok(type, value)`](../api/parser.md#funcparserlib.parser.tok) parses a single token of a certain type and, optionally, with a certain value. 47 | 48 | Parsing a single token is not exciting at all. The interesting part comes when you start combining parsers via **parser combinators** to build bigger parsers of complex token sequences. 49 | 50 | Parsers from [`funcparserlib.parser`](../api/parser.md) have a nice layered structure that allows you to express the grammar rules of the language you want to parse: 51 | 52 | ``` 53 | ┌──────────┬──────────────────────┬───────────┐ 54 | │ │ Primitive Parsers │ │ 55 | │ ├──────────────────────┘ │ 56 | │ │ │ 57 | │ │ tok(type, value) forward_decl() │ 58 | │ │ │ 59 | │ │ a(token) some(pred) finished │ 60 | │ │ │ 61 | │ ├──────────────────────┬───────────┤ 62 | │ │ Parser Combinators │ │ 63 | │ ├──────────────────────┘ │ 64 | │ │ │ 65 | │ Parser │ p1 + p2 p1 | p2 p >> f -p │ 66 | │ objects │ │ 67 | │ │ many(p) oneplus(p) maybe(p) │ 68 | │ │ │ 69 | │ ├──────────────────────┬───────────┤ 70 | │ │ Means of Abstraction │ │ 71 | │ ├──────────────────────┘ │ 72 | │ │ │ 73 | │ │ Python assignments: = │ 74 | │ │ │ 75 | │ │ Python functions: def │ 76 | └──────────┴──────────────────────────────────┘ 77 | ``` 78 | 79 | You get a new [`Parser`](../api/parser.md#funcparserlib.parser.Parser) object each time you apply a parser combinator to your parsers. Therefore, the set of all parsers is closed under the operations defined by parser combinators. 80 | 81 | Parsers are regular Python objects of type [`Parser`](../api/parser.md#funcparserlib.parser.Parser). This means that you can write arbitrary Python code that builds parser objects: assign parsers to variables, pass parsers as call arguments, get them as the return values of calls, etc. 82 | 83 | !!! Note 84 | 85 | The type [`Parser`](../api/parser.md#funcparserlib.parser.Parser) is actually parameterized as `Parser[T, V]` where: 86 | 87 | * `T` is the type of input tokens 88 | * `V` is the type of the parse result 89 | 90 | Your text editor or type checker will provide better code completion and error checking for your parsing code based on the types defined in `funcparserlib` and their type inference capabilities. 91 | 92 | 93 | `tok()`: Parsing a Single Token 94 | ------------------------------- 95 | 96 | Let's recall the expressions we would like to be able to parse: 97 | 98 | ``` 99 | 0 100 | 1 + 2 + 3 101 | -1 + 2 ** 32 102 | 3.1415926 * (2 + 7.18281828e-1) * 42 103 | ``` 104 | 105 | It looks like our grammar should have expressions that consist of numbers or nested expressions. Let's start with just numbers. 106 | 107 | 108 | We'll use [`tok(type, value)`](../api/parser.md#funcparserlib.parser.tok) to create a primitive parser of a single integer token.
Let's import it: 109 | 110 | ```pycon 111 | >>> from funcparserlib.parser import tok 112 | 113 | ``` 114 | 115 | Here is our parser of a single integer token. The string `"int"` is the type of the integer token spec for our tokenizer: 116 | 117 | 118 | ```pycon 119 | >>> int_str = tok("int") 120 | 121 | ``` 122 | 123 | Let's try it in action. In order to invoke a parser, we should pass a sequence of tokens to its [`Parser.parse(tokens)`](../api/parser.md#funcparserlib.parser.Parser.parse) method: 124 | 125 | ```pycon 126 | >>> int_str.parse(tokenize("42")) 127 | '42' 128 | 129 | ``` 130 | 131 | !!! Note 132 | 133 | Our parser returns integer numbers as strings at the moment. We'll cover transforming parse results and creating a proper parse tree in the next chapter. 134 | 135 | If the first token in the input is _not_ of type `"int"`, our parser raises an exception: 136 | 137 | ```pycon 138 | >>> int_str.parse(tokenize("+")) # doctest: +IGNORE_EXCEPTION_DETAIL 139 | Traceback (most recent call last): 140 | ... 141 | NoParseError: 1,1-1,1: got unexpected token: '+', expected: int 142 | 143 | ``` 144 | 145 | 146 | `p1 | p2`: Parsing Alternatives 147 | ------------------------------- 148 | 149 | We want to support floating point numbers as well. We already know how to do it: 150 | 151 | ```pycon 152 | >>> float_str = tok("float") 153 | 154 | ``` 155 | 156 | Let's define our number expression as either an integer or a float number. We can parse alternatives using the [`Parser.__or__`](../api/parser.md#funcparserlib.parser.Parser.__or__) combinator: 157 | 158 | ```pycon 159 | >>> number = int_str | float_str 160 | 161 | ``` 162 | 163 | Test it: 164 | 165 | ```pycon 166 | >>> number.parse(tokenize("42")) 167 | '42' 168 | 169 | >>> number.parse(tokenize("3.14")) 170 | '3.14' 171 | 172 | >>> number.parse(tokenize("*")) # doctest: +IGNORE_EXCEPTION_DETAIL 173 | Traceback (most recent call last): 174 | ... 175 | NoParseError: 1,1-1,1: got unexpected token: '*', expected: int or float 176 | 177 | ``` 178 | 179 | 180 | `p1 + p2`: Parsing a Sequence 181 | ----------------------------- 182 | 183 | Since we can parse numbers now, let's proceed with expressions. The first expression we will parse is the power operator: 184 | 185 | ``` 186 | 2 ** 32 187 | ``` 188 | 189 | We need a new parser combinator to parse sequences of tokens. We can combine parsers sequentially using the [`Parser.__add__`](../api/parser.md#funcparserlib.parser.Parser.__add__) combinator. 190 | 191 | Let's try it on sequences of numbers first: 192 | 193 | ```pycon 194 | >>> p = number + number 195 | 196 | ``` 197 | 198 | Test it: 199 | 200 | ```pycon 201 | >>> p.parse(tokenize("1 2")) 202 | ('1', '2') 203 | 204 | ``` 205 | 206 | The sequence combinator returns its results as a tuple of the parse results of its arguments. The size of the resulting tuple depends on the number of the parsers in the sequence. Let's try it for three numbers: 207 | 208 | ```pycon 209 | >>> p = number + number + number 210 | 211 | ``` 212 | 213 | Test it: 214 | 215 | ```pycon 216 | >>> p.parse(tokenize("1 2 3")) 217 | ('1', '2', '3') 218 | 219 | ``` 220 | 221 | Back to parsing the power operator of our calculator expressions language. We will need to parse several different operator tokens besides `"**"` in our grammar, so let's define a helper function: 222 | 223 | ```pycon 224 | >>> from funcparserlib.parser import Parser 225 | 226 | 227 | >>> def op(name: str) -> Parser[Token, str]: 228 | ...
return tok("op", name) 229 | 230 | ``` 231 | 232 | Let's define the parser of the power operator expressions using our new `op(name)` helper: 233 | 234 | ```pycon 235 | >>> power = number + op("**") + number 236 | 237 | ``` 238 | 239 | Test it: 240 | 241 | ```pycon 242 | >>> power.parse(tokenize("2 ** 32")) 243 | ('2', '**', '32') 244 | 245 | ``` 246 | 247 | 248 | `many()`: Parsing Repeated Parts 249 | -------------------------------- 250 | 251 | We want to allow sequences of power operators. Let's parse the first number, followed by zero or more pairs of the power operator and a number. We'll use the [`many(p)`](../api/parser.md#funcparserlib.parser.many) combinator for that. Let's import it: 252 | 253 | ```pycon 254 | >>> from funcparserlib.parser import many 255 | 256 | ``` 257 | 258 | Here is our parser of sequences of power operators: 259 | 260 | ```pycon 261 | >>> power = number + many(op("**") + number) 262 | 263 | ``` 264 | 265 | Test it: 266 | 267 | ```pycon 268 | >>> power.parse(tokenize("2 ** 3 ** 4")) 269 | ('2', [('**', '3'), ('**', '4')]) 270 | 271 | ``` 272 | 273 | The `many(p)` combinator applies its argument parser `p` to the input sequence of tokens many times until it fails, returning a list of the results. If `p` fails to parse any tokens, `many(p)` still succeeds and returns an empty list: 274 | 275 | ```pycon 276 | >>> power.parse(tokenize("1 + 2")) 277 | ('1', []) 278 | 279 | ``` 280 | 281 | 282 | `forward_decl()`: Parsing Recursive Parts 283 | ----------------------------------------- 284 | 285 | We want to allow using parentheses to specify the order of calculations. 286 | 287 | Ideally, we would like to write a recursive assignment like this one, but the Python syntax doesn't allow it: 288 | 289 | ```pycon 290 | >>> expr = power | number | (op("(") + expr + op(")")) # doctest: +IGNORE_EXCEPTION_DETAIL 291 | Traceback (most recent call last): 292 | ... 293 | NameError: name 'expr' is not defined 294 | 295 | ``` 296 | 297 | We will use the [`forward_decl()`](../api/parser.md#funcparserlib.parser.forward_decl) parser to solve the recursive assignment problem: 298 | 299 | 1. We create a forward declaration 300 | 2. We use the declaration in other parsers 301 | 3. We define the value of the declaration 302 | 303 | Let's start with a simple example first. We'll create a parser for numbers in properly nested parentheses: 304 | 305 | ```pycon 306 | >>> from funcparserlib.parser import forward_decl 307 | >>> p = forward_decl() 308 | >>> p.define(number | (op("(") + p + op(")"))) 309 | 310 | ``` 311 | 312 | Test it: 313 | 314 | ```pycon 315 | >>> p.parse(tokenize("1")) 316 | '1' 317 | 318 | >>> p.parse(tokenize("(1)")) 319 | ('(', '1', ')') 320 | 321 | >>> p.parse(tokenize("((1))")) 322 | ('(', ('(', '1', ')'), ')') 323 | 324 | ``` 325 | 326 | Back to our recursive `expr` problem.
Let's re-write our grammar using `forward_decl()` for expressions: 327 | 328 | ```pycon 329 | >>> expr = forward_decl() 330 | >>> parenthesized = op("(") + expr + op(")") 331 | >>> primary = number | parenthesized 332 | >>> power = primary + many(op("**") + primary) 333 | >>> expr.define(power) 334 | 335 | ``` 336 | 337 | Test it: 338 | 339 | ```pycon 340 | >>> expr.parse(tokenize("2 ** 3 ** 4")) 341 | ('2', [('**', '3'), ('**', '4')]) 342 | 343 | >>> expr.parse(tokenize("2 ** (3 ** 4)")) 344 | ('2', [('**', ('(', ('3', [('**', '4')]), ')'))]) 345 | 346 | ``` 347 | 348 | 349 | `finished`: Expecting No More Input 350 | ----------------------------------- 351 | 352 | Surprisingly, our `expr` parser tolerates incomplete expressions by ignoring the incomplete parts: 353 | 354 | ```pycon 355 | >>> expr.parse(tokenize("2 ** (3 ** 4")) 356 | ('2', []) 357 | 358 | ``` 359 | 360 | The problem is that its `many(p)` part parses the input while `p` succeeds, and it doesn't look any further than that. We can make a parser expect the end of the input via the [`finished`](../api/parser.md#finished) parser. Let's define a parser for the whole input document: 361 | 362 | ```pycon 363 | >>> from funcparserlib.parser import finished 364 | >>> document = expr + finished 365 | 366 | ``` 367 | 368 | !!! Note 369 | 370 | Usually you finish the topmost parser of your grammar with `... + finished` to indicate that you expect no further input. 371 | 372 | Let's try it for our grammar: 373 | 374 | ```pycon 375 | >>> document.parse(tokenize("2 ** (3 ** 4")) # doctest: +IGNORE_EXCEPTION_DETAIL 376 | Traceback (most recent call last): 377 | ... 378 | NoParseError: got unexpected end of input, expected: ')' 379 | 380 | >>> document.parse(tokenize("2 ** (3 ** 4)")) 381 | ('2', [('**', ('(', ('3', [('**', '4')]), ')'))], None) 382 | 383 | ``` 384 | 385 | Next 386 | ---- 387 | 388 | We have created a parser for power operator expressions. Its parse results are correct, but they are hard to understand and work with: 389 | 390 | * Our integer and floating point numbers are strings, not `int` or `float` objects 391 | * The results contain `'('` and `')'` strings even though we need parentheses only temporarily to set the operator priorities 392 | * The results contain `None`, which is the parse result of [`finished`](../api/parser.md#finished), even though we don't need it 393 | * The results are lists of tuples of strings, not user-defined classes that reflect the grammar of our calculator expressions language 394 | 395 | In [the next chapter](parse-tree.md) you will learn how to transform parse results and prepare a proper, cleaned-up parse tree. 396 | -------------------------------------------------------------------------------- /docs/getting-started/tips-and-tricks.md: -------------------------------------------------------------------------------- 1 | Tips and Tricks 2 | =============== 3 | 4 | Let's use the tokenizer we have defined previously for our examples in this chapter: 5 | 6 | ```pycon 7 | >>> from typing import List 8 | >>> from funcparserlib.lexer import make_tokenizer, TokenSpec, Token 9 | >>> from funcparserlib.parser import tok, Parser, many, forward_decl, finished 10 | 11 | 12 | >>> def tokenize(s: str) -> List[Token]: 13 | ... specs = [ 14 | ... TokenSpec("whitespace", r"\s+"), 15 | ... TokenSpec("float", r"[+\-]?\d+\.\d*([Ee][+\-]?\d+)*"), 16 | ... TokenSpec("int", r"[+\-]?\d+"), 17 | ... TokenSpec("op", r"(\*\*)|[+\-*/()]"), 18 | ... ] 19 | ... tokenizer = make_tokenizer(specs) 20 | ...
return [t for t in tokenizer(s) if t.type != "whitespace"] 21 | 22 | 23 | >>> def op(name: str) -> Parser[Token, str]: 24 | ... return tok("op", name) 25 | 26 | ``` 27 | 28 | ## Name Alternative Parsers for Better Error Messages 29 | 30 | Consider the following grammar: 31 | 32 | ```pycon 33 | >>> number = (tok("int") >> int) | (tok("float") >> float) 34 | >>> paren = -op("(") + number + -op(")") 35 | >>> mul = number + op("*") + number 36 | >>> expr = paren | mul 37 | 38 | ``` 39 | 40 | When a parser fails to parse its input, it usually reports the token it expects: 41 | 42 | ```pycon 43 | >>> paren.parse(tokenize("(1")) # doctest: +IGNORE_EXCEPTION_DETAIL 44 | Traceback (most recent call last): 45 | ... 46 | NoParseError: got unexpected end of input, expected: ')' 47 | 48 | ``` 49 | 50 | If there were several parsing alternatives, the parser will report an error after the longest successfully parsed sequence: 51 | 52 | ```pycon 53 | 54 | >>> expr.parse(tokenize("1 + 2")) # doctest: +IGNORE_EXCEPTION_DETAIL 55 | Traceback (most recent call last): 56 | ... 57 | NoParseError: 1,3-1,3: got unexpected token: '+', expected: '*' 58 | 59 | ``` 60 | 61 | If there were several parsing alternatives and all of them failed to parse the current token, then the parser will report its name as the expected input: 62 | 63 | ```pycon 64 | >>> number.parse(tokenize("*")) # doctest: +IGNORE_EXCEPTION_DETAIL 65 | Traceback (most recent call last): 66 | ... 67 | NoParseError: 1,1-1,1: got unexpected token: '*', expected: int or float 68 | 69 | >>> expr.parse(tokenize("+")) # doctest: +IGNORE_EXCEPTION_DETAIL 70 | Traceback (most recent call last): 71 | ... 72 | NoParseError: 1,1-1,1: got unexpected token: '+', expected: int or float or (('(', int or float), ')') 73 | 74 | ``` 75 | 76 | Parser names are auto-generated and may be quite long and hard to understand. For better error messages you may want to name your parsers explicitly via [`Parser.named(name)`](../api/parser.md#funcparserlib.parser.Parser.named). The naming style is up to you. For example: 77 | 78 | ```pycon 79 | >>> number = ((tok("int") >> int) | (tok("float") >> float)).named("number") 80 | >>> paren = -op("(") + number + -op(")") 81 | >>> mul = number + op("*") + number 82 | >>> expr = (paren | mul).named("number or '('") 83 | 84 | ``` 85 | 86 | Test it: 87 | 88 | 89 | ```pycon 90 | >>> number.parse(tokenize("*")) # doctest: +IGNORE_EXCEPTION_DETAIL 91 | Traceback (most recent call last): 92 | ... 93 | NoParseError: 1,1-1,1: got unexpected token: '*', expected: number 94 | 95 | >>> expr.parse(tokenize("+")) # doctest: +IGNORE_EXCEPTION_DETAIL 96 | Traceback (most recent call last): 97 | ... 98 | NoParseError: 1,1-1,1: got unexpected token: '+', expected: number or '(' 99 | 100 | ``` 101 | 102 | 103 | ## How to Handle Conflicting Alternatives 104 | 105 | If one of the parsing alternatives is a subpart of another one, then you should put the longest alternative first. 
Otherwise parsing the shorter one will make another one unreachable: 106 | 107 | ```pycon 108 | >>> p = (number + number) | (number + number + number) 109 | 110 | >>> p.parse(tokenize("1 2 3")) 111 | (1, 2) 112 | 113 | ``` 114 | 115 | Parse the longest alternative first: 116 | 117 | ```pycon 118 | >>> p = (number + number + number) | (number + number) 119 | 120 | >>> p.parse(tokenize("1 2 3")) 121 | (1, 2, 3) 122 | 123 | >>> p.parse(tokenize("1 2")) 124 | (1, 2) 125 | 126 | ``` 127 | 128 | 129 | ## Watch Out for Left Recursion 130 | 131 | There are certain kinds of grammar rules you cannot use with `funcparserlib`. These are the rules that contain recursion in their leftmost parts. These rules lead to infinite recursion during parsing, which results in a `RecursionError` exception. 132 | 133 | For example, we want to define an expression `expr` as either a multiplication expression `mul` or a `number`. We also want a multiplication `mul` to be a sequence of an expression `expr`, followed by the operator `"*"`, followed by another expression `expr`: 134 | 135 | 136 | ```pycon 137 | >>> expr = forward_decl() 138 | >>> mul = expr + op("*") + expr 139 | >>> expr.define(mul | number) 140 | 141 | ``` 142 | 143 | This looks reasonable at first glance, but it contains left recursion. In order to parse the first token for `expr`, we need to parse the first token for `mul`, for that we need to parse the first token for `expr`, and so on. This left recursion in your grammar results in a stack overflow exception: 144 | 145 | ```pycon 146 | >>> expr.parse(tokenize("1 * 2")) # doctest: +IGNORE_EXCEPTION_DETAIL 147 | Traceback (most recent call last): 148 | ... 149 | RecursionError: maximum recursion depth exceeded 150 | 151 | ``` 152 | 153 | You should think about how to re-write your grammar to avoid left-recursive definitions. In our case of repeated binary operators we really want a number, followed by zero or more pairs of an operator and a number: 154 | 155 | ```pycon 156 | >>> expr = forward_decl() 157 | >>> mul = number + many(op("**") + number) 158 | >>> expr.define(mul) 159 | 160 | ``` 161 | 162 | Test it: 163 | 164 | ```pycon 165 | >>> expr.parse(tokenize("1 ** 2")) 166 | (1, [('**', 2)]) 167 | 168 | 169 | >>> expr.parse(tokenize("3")) 170 | (3, []) 171 | 172 | ``` 173 | 174 | Remember that your parsers have to consume at least one token from the input before going into recursive definitions. 175 | -------------------------------------------------------------------------------- /docs/getting-started/tokenizing.md: -------------------------------------------------------------------------------- 1 | Tokenizing Input 2 | ================ 3 | 4 | Parsing is usually split into two steps: 5 | 6 | 1. Tokenizing the input string into a sequence of tokens 7 | 2. Parsing the tokens into a parse tree 8 | 9 | 10 | ``` 11 | ┌────────────┐ ┌─────────┐ 12 | str │ │ List[Token] │ │ Expr 13 | ─────────► tokenize() ├───────────────► parse() ├─────────► 14 | │ │ │ │ 15 | └────────────┘ └─────────┘ 16 | ``` 17 | 18 | **Tokens** are larger pieces of the input text such as words, punctuation marks, spaces, etc. It's easier to parse a list of tokens than a string, since you can skip auxiliary tokens (spaces, newlines, comments) during tokenizing and focus on the main ones. Tokens usually track their position in the text, which is helpful in parsing error messages.
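To get a feel for what a token is before we generate a tokenizer, here is a minimal sketch that constructs a `Token` by hand (only an illustration of the token data structure; the tokenizer described below normally creates tokens for you and also fills in their positions):

```pycon
>>> from funcparserlib.lexer import Token
>>> t = Token("int", "42")
>>> (t.type, t.value)
('int', '42')

```

In practice you get such tokens from a tokenizer generated by `make_tokenizer()`, as shown in the rest of this chapter.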
19 | 20 | 21 | Tokenizing with `make_tokenizer()` 22 | ---------------------------------- 23 | 24 | One of the most common ways to define tokens and tokenizing rules is via regular expressions. `funcparserlib` comes with the module [`funcparserlib.lexer`](../api/lexer.md) for creating regexp-based tokenizers. 25 | 26 | !!! Note 27 | 28 | Parsers defined with `funcparserlib` can work with _any_ tokens. You can plug your custom tokenizers and token types or even parse raw strings as lists of character tokens. 29 | 30 | In this guide we will use the _recommended_ way of writing tokenizers: `make_tokenizer()` from the `funcparserlib.lexer` module. 31 | 32 | Let's identify token types in our numeric expressions language: 33 | 34 | * Whitespace 35 | * Spaces, tabs, newlines 36 | * Integer numbers 37 | * `0`, `256`, `-42`, ... 38 | * Floating point numbers 39 | * `3.1415`, `27.1828e-01`, ... 40 | * Operators 41 | * `(`, `)`, `*`, `+`, `/`, `-`, `**` 42 | 43 | We will define our token specs and pass them to `make_tokenizer()` to generate our tokenizer. We will also drop whitespace tokens from the result, since we don't need them. 44 | 45 | Some imports first: 46 | 47 | ```pycon 48 | >>> from typing import List 49 | >>> from funcparserlib.lexer import make_tokenizer, TokenSpec, Token 50 | 51 | ``` 52 | 53 | The tokenizer itself: 54 | 55 | ```pycon 56 | >>> def tokenize(s: str) -> List[Token]: 57 | ... specs = [ 58 | ... TokenSpec("whitespace", r"\s+"), 59 | ... TokenSpec("float", r"[+\-]?\d+\.\d*([Ee][+\-]?\d+)*"), 60 | ... TokenSpec("int", r"[+\-]?\d+"), 61 | ... TokenSpec("op", r"(\*\*)|[+\-*/()]"), 62 | ... ] 63 | ... tokenizer = make_tokenizer(specs) 64 | ... return [t for t in tokenizer(s) if t.type != "whitespace"] 65 | 66 | ``` 67 | 68 | !!! Warning 69 | 70 | Be careful with ordering your token specs and your regexps so that larger tokens come first before their smaller subparts. In our token specs: 71 | 72 | * _Float_ tokens should come before _int_ tokens 73 | * `**` should come before `*` 74 | 75 | Let's try our tokenizer: 76 | 77 | ```pycon 78 | >>> tokenize("42 + 1337") 79 | [Token('int', '42'), Token('op', '+'), Token('int', '1337')] 80 | 81 | ``` 82 | 83 | The `str()` form of the token shows its position in the input text, also available via `t.start` and `t.end`: 84 | 85 | ```pycon 86 | >>> [str(t) for t in tokenize("42 + 1337")] 87 | ["1,1-1,2: int '42'", "1,4-1,4: op '+'", "1,6-1,9: int '1337'"] 88 | 89 | ``` 90 | 91 | 92 | Next 93 | ---- 94 | 95 | We have tokenized an numeric expression string into a list of tokens. 96 | 97 | In [the next chapter](parsing.md) you will learn how to parse these tokens by defining a grammar for our numeric expressions language. 98 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | Funcparserlib 2 | ============= 3 | 4 | Recursive descent parsing library for Python based on functional combinators. 5 | 6 | [![PyPI](https://img.shields.io/pypi/v/funcparserlib)](https://pypi.org/project/funcparserlib/) 7 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/funcparserlib)](https://pypi.org/project/funcparserlib/) 8 | 9 | 10 | Description 11 | ----------- 12 | 13 | The primary focus of `funcparserlib` is **parsing little languages** or **external DSLs** (domain specific languages). 14 | 15 | Parsers made with `funcparserlib` are pure-Python LL(\*) parsers. 
It means that it's **very easy to write parsers** without thinking about lookaheads and other hardcore parsing stuff. However, recursive descent parsing is a rather slow method compared to LL(k) or LR(k) algorithms. Still, parsing with `funcparserlib` is **at least twice faster than PyParsing**, a very popular library for Python. 16 | 17 | The source code of `funcparserlib` is only 1.2K lines of code, with lots of comments. Its API is fully type hinted. It features the longest parsed prefix error reporting, as well as a tiny lexer generator for token position tracking. 18 | 19 | The idea of parser combinators used in `funcparserlib` comes from the [Introduction to Functional Programming](https://www.cl.cam.ac.uk/teaching/Lectures/funprog-jrh-1996/) course. We have converted it from ML into Python. 20 | 21 | 22 | Installation 23 | ------------ 24 | 25 | You can install `funcparserlib` from [PyPI](https://pypi.org/project/funcparserlib/): 26 | 27 | ```shell 28 | $ pip install funcparserlib 29 | ``` 30 | 31 | There are no dependencies on other libraries. 32 | 33 | 34 | Documentation 35 | ------------- 36 | 37 | * [Getting Started](https://funcparserlib.pirx.ru/getting-started/) 38 | * Your **starting point** with `funcparserlib` 39 | * [API Reference](https://funcparserlib.pirx.ru/api/) 40 | * Learn the details of the API 41 | 42 | There are several examples available in the `tests/` directory: 43 | 44 | * [GraphViz DOT parser](https://github.com/vlasovskikh/funcparserlib/blob/master/tests/dot.py) 45 | * [JSON parser](https://github.com/vlasovskikh/funcparserlib/blob/master/tests/json.py) 46 | 47 | See also [the changelog](https://funcparserlib.pirx.ru/changes/). 48 | 49 | 50 | Example 51 | ------- 52 | 53 | Let's consider a little language of **numeric expressions** with a syntax similar to Python expressions. 
Here are some expression strings in this language: 54 | 55 | ``` 56 | 0 57 | 1 + 2 + 3 58 | -1 + 2 ** 32 59 | 3.1415926 * (2 + 7.18281828e-1) * 42 60 | ``` 61 | 62 | 63 | Here is **the complete source code** of the tokenizer and the parser for this language written using `funcparserlib`: 64 | 65 | ```python 66 | from typing import List, Tuple, Union 67 | from dataclasses import dataclass 68 | 69 | from funcparserlib.lexer import make_tokenizer, TokenSpec, Token 70 | from funcparserlib.parser import tok, Parser, many, forward_decl, finished 71 | 72 | 73 | @dataclass 74 | class BinaryExpr: 75 | op: str 76 | left: "Expr" 77 | right: "Expr" 78 | 79 | 80 | Expr = Union[BinaryExpr, int, float] 81 | 82 | 83 | def tokenize(s: str) -> List[Token]: 84 | specs = [ 85 | TokenSpec("whitespace", r"\s+"), 86 | TokenSpec("float", r"[+\-]?\d+\.\d*([Ee][+\-]?\d+)*"), 87 | TokenSpec("int", r"[+\-]?\d+"), 88 | TokenSpec("op", r"(\*\*)|[+\-*/()]"), 89 | ] 90 | tokenizer = make_tokenizer(specs) 91 | return [t for t in tokenizer(s) if t.type != "whitespace"] 92 | 93 | 94 | def parse(tokens: List[Token]) -> Expr: 95 | int_num = tok("int") >> int 96 | float_num = tok("float") >> float 97 | number = int_num | float_num 98 | 99 | expr: Parser[Token, Expr] = forward_decl() 100 | parenthesized = -op("(") + expr + -op(")") 101 | primary = number | parenthesized 102 | power = primary + many(op("**") + primary) >> to_expr 103 | term = power + many((op("*") | op("/")) + power) >> to_expr 104 | sum = term + many((op("+") | op("-")) + term) >> to_expr 105 | expr.define(sum) 106 | 107 | document = expr + -finished 108 | 109 | return document.parse(tokens) 110 | 111 | 112 | def op(name: str) -> Parser[Token, str]: 113 | return tok("op", name) 114 | 115 | 116 | def to_expr(args: Tuple[Expr, List[Tuple[str, Expr]]]) -> Expr: 117 | first, rest = args 118 | result = first 119 | for op, expr in rest: 120 | result = BinaryExpr(op, result, expr) 121 | return result 122 | ``` 123 | 124 | Now, consider this numeric expression: `3.1415926 * (2 + 7.18281828e-1) * 42`. 125 | 126 | Let's `tokenize()` it using the tokenizer we've created with `funcparserlib.lexer`: 127 | 128 | ``` 129 | [ 130 | Token('float', '3.1415926'), 131 | Token('op', '*'), 132 | Token('op', '('), 133 | Token('int', '2'), 134 | Token('op', '+'), 135 | Token('float', '7.18281828e-1'), 136 | Token('op', ')'), 137 | Token('op', '*'), 138 | Token('int', '42'), 139 | ] 140 | ``` 141 | 142 | Let's `parse()` these tokens into an expression tree using our parser created with `funcparserlib.parser`: 143 | 144 | ``` 145 | BinaryExpr( 146 | op='*', 147 | left=BinaryExpr( 148 | op='*', 149 | left=3.1415926, 150 | right=BinaryExpr(op='+', left=2, right=0.718281828), 151 | ), 152 | right=42, 153 | ) 154 | ``` 155 | 156 | Learn how to write this parser using `funcparserlib` in the [Getting Started](https://funcparserlib.pirx.ru/getting-started/) guide! 157 | 158 | 159 | Used By 160 | ------- 161 | 162 | Some open-source projects that use `funcparserlib` as an explicit dependency: 163 | 164 | * [Hy](https://github.com/hylang/hy), a Lisp dialect that's embedded in Python 165 | * 4.7K stars, version `~=1.0`, Python 3.8+ 166 | * [Splash](https://github.com/scrapinghub/splash), a JavaScript rendering service with HTTP API, by Scrapinghub 167 | * 3.9K stars, version `*`. 
Python 3 in Docker 168 | * [graphite-beacon](https://github.com/klen/graphite-beacon), a simple alerting system for Graphite metrics 169 | * 453 stars, version `==0.3.6`, Python 2 and 3 170 | * [blockdiag](https://github.com/blockdiag/blockdiag), generates block-diagram image file from spec-text file 171 | * 194 stars, version `>= 1.0.0a0`, Python 3.7+ 172 | * [kll](https://github.com/kiibohd/kll), Keyboard Layout Language (KLL) compiler 173 | * 113 stars, copied source code, Python 3.5+ 174 | 175 | 176 | Next 177 | ---- 178 | 179 | Read the [Getting Started](https://funcparserlib.pirx.ru/getting-started/) guide to start learning `funcparserlib`. 180 | -------------------------------------------------------------------------------- /docs/media/extra.css: -------------------------------------------------------------------------------- 1 | .doc-heading code { 2 | font-weight: bold; 3 | } -------------------------------------------------------------------------------- /funcparserlib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlasovskikh/funcparserlib/18c0a99dcdb427e35226c74b7cc2617223c8e1fa/funcparserlib/__init__.py -------------------------------------------------------------------------------- /funcparserlib/lexer.py: -------------------------------------------------------------------------------- 1 | # Copyright © 2009/2023 Andrey Vlasovskikh 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 4 | # software and associated documentation files (the "Software"), to deal in the Software 5 | # without restriction, including without limitation the rights to use, copy, modify, 6 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | # permit persons to whom the Software is furnished to do so, subject to the following 8 | # conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all copies 11 | # or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 14 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 15 | # PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 16 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 17 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 18 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | __all__ = ["make_tokenizer", "TokenSpec", "Token", "LexerError"] 21 | 22 | import re 23 | from typing import Callable, Iterable, List, Tuple, Optional, Sequence, Pattern, Union 24 | 25 | 26 | _Place = Tuple[int, int] 27 | _Spec = Tuple[str, Tuple] 28 | 29 | 30 | class LexerError(Exception): 31 | def __init__(self, place: _Place, msg: str) -> None: 32 | self.place = place 33 | self.msg = msg 34 | 35 | def __str__(self) -> str: 36 | s = "cannot tokenize data" 37 | line, pos = self.place 38 | return '%s: %d,%d: "%s"' % (s, line, pos, self.msg) 39 | 40 | 41 | class TokenSpec: 42 | """A token specification for generating a lexer via `make_tokenizer()`.""" 43 | 44 | def __init__(self, type: str, pattern: str, flags: int = 0) -> None: 45 | """Initialize a `TokenSpec` object. 46 | 47 | Parameters: 48 | type (str): User-defined type of the token (e.g. 
`"name"`, `"number"`, 49 | `"operator"`) 50 | pattern (str): Regexp for matching this token type 51 | flags (int, optional): Regexp flags, the second argument of `re.compile()` 52 | """ 53 | self.type = type 54 | self.pattern = pattern 55 | self.flags = flags 56 | 57 | def __repr__(self) -> str: 58 | return "TokenSpec(%r, %r, %r)" % (self.type, self.pattern, self.flags) 59 | 60 | 61 | class Token: 62 | """A token object that represents a substring of certain type in your text. 63 | 64 | You can compare tokens for equality using the `==` operator. Tokens also define 65 | custom `repr()` and `str()`. 66 | 67 | Attributes: 68 | type (str): User-defined type of the token (e.g. `"name"`, `"number"`, 69 | `"operator"`) 70 | value (str): Text value of the token 71 | start (Optional[Tuple[int, int]]): Start position (_line_, _column_) 72 | end (Optional[Tuple[int, int]]): End position (_line_, _column_) 73 | """ 74 | 75 | def __init__( 76 | self, 77 | type: str, 78 | value: str, 79 | start: Optional[_Place] = None, 80 | end: Optional[_Place] = None, 81 | ) -> None: 82 | """Initialize a `Token` object.""" 83 | self.type = type 84 | self.value = value 85 | self.start = start 86 | self.end = end 87 | 88 | def __repr__(self) -> str: 89 | return "Token(%r, %r)" % (self.type, self.value) 90 | 91 | def __eq__(self, other: object) -> bool: 92 | # FIXME: Case sensitivity is assumed here 93 | if not isinstance(other, Token): 94 | return False 95 | else: 96 | return self.type == other.type and self.value == other.value 97 | 98 | def _pos_str(self) -> str: 99 | if self.start is None or self.end is None: 100 | return "" 101 | else: 102 | sl, sp = self.start 103 | el, ep = self.end 104 | return "%d,%d-%d,%d:" % (sl, sp, el, ep) 105 | 106 | def __str__(self) -> str: 107 | s = "%s %s '%s'" % (self._pos_str(), self.type, self.value) 108 | return s.strip() 109 | 110 | @property 111 | def name(self) -> str: 112 | return self.value 113 | 114 | def pformat(self) -> str: 115 | return "%s %s '%s'" % ( 116 | self._pos_str().ljust(20), # noqa 117 | self.type.ljust(14), 118 | self.value, 119 | ) 120 | 121 | 122 | def make_tokenizer( 123 | specs: Sequence[Union[TokenSpec, _Spec]], 124 | ) -> Callable[[str], Iterable[Token]]: 125 | # noinspection GrazieInspection 126 | """Make a function that tokenizes text based on the regexp specs. 127 | 128 | Type: `(Sequence[TokenSpec | Tuple]) -> Callable[[str], Iterable[Token]]` 129 | 130 | A token spec is `TokenSpec` instance. 131 | 132 | !!! Note 133 | 134 | For legacy reasons, a token spec may also be a tuple of (_type_, _args_), where 135 | _type_ sets the value of `Token.type` for the token, and _args_ are the 136 | positional arguments for `re.compile()`: either just (_pattern_,) or 137 | (_pattern_, _flags_). 138 | 139 | It returns a tokenizer function that takes a string and returns an iterable of 140 | `Token` objects, or raises `LexerError` if it cannot tokenize the string according 141 | to its token specs. 142 | 143 | Examples: 144 | 145 | ```pycon 146 | >>> tokenize = make_tokenizer([ 147 | ... TokenSpec("space", r"\\s+"), 148 | ... TokenSpec("id", r"\\w+"), 149 | ... TokenSpec("op", r"[,!]"), 150 | ... ]) 151 | >>> text = "Hello, World!" 152 | >>> [t for t in tokenize(text) if t.type != "space"] # noqa 153 | [Token('id', 'Hello'), Token('op', ','), Token('id', 'World'), Token('op', '!')] 154 | >>> text = "Bye?" 155 | >>> list(tokenize(text)) 156 | Traceback (most recent call last): 157 | ... 158 | lexer.LexerError: cannot tokenize data: 1,4: "Bye?" 
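>>> # An extra illustration: each token records its start and end positions
>>> [t.start for t in tokenize("Hello, World!") if t.type != "space"]
[(1, 1), (1, 6), (1, 8), (1, 13)]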
159 | 160 | ``` 161 | """ 162 | compiled: List[Tuple[str, Pattern[str]]] = [] 163 | for spec in specs: 164 | if isinstance(spec, TokenSpec): 165 | c = spec.type, re.compile(spec.pattern, spec.flags) 166 | else: 167 | name, args = spec 168 | c = name, re.compile(*args) 169 | compiled.append(c) 170 | 171 | def match_specs(s: str, i: int, position: Tuple[int, int]) -> Token: 172 | line, pos = position 173 | for type, regexp in compiled: 174 | m = regexp.match(s, i) 175 | if m is not None: 176 | value = m.group() 177 | nls = value.count("\n") 178 | n_line = line + nls 179 | if nls == 0: 180 | n_pos = pos + len(value) 181 | else: 182 | n_pos = len(value) - value.rfind("\n") - 1 183 | return Token(type, value, (line, pos + 1), (n_line, n_pos)) 184 | else: 185 | err_line = s.splitlines()[line - 1] 186 | raise LexerError((line, pos + 1), err_line) 187 | 188 | def f(s: str) -> Iterable[Token]: 189 | length = len(s) 190 | line, pos = 1, 0 191 | i = 0 192 | while i < length: 193 | t = match_specs(s, i, (line, pos)) 194 | yield t 195 | if t.end is None: 196 | raise ValueError("Token %r has no end specified", (t,)) 197 | line, pos = t.end 198 | i += len(t.value) 199 | 200 | return f 201 | 202 | 203 | # This is an example of token specs. See also [this article][1] for a 204 | # discussion of searching for multiline comments using regexps (including `*?`). 205 | # 206 | # [1]: http://ostermiller.org/findcomment.html 207 | _example_token_specs = [ 208 | TokenSpec("COMMENT", r"\(\*(.|[\r\n])*?\*\)", re.MULTILINE), 209 | TokenSpec("COMMENT", r"\{(.|[\r\n])*?\}", re.MULTILINE), 210 | TokenSpec("COMMENT", r"//.*"), 211 | TokenSpec("NL", r"[\r\n]+"), 212 | TokenSpec("SPACE", r"[ \t\r\n]+"), 213 | TokenSpec("NAME", r"[A-Za-z_][A-Za-z_0-9]*"), 214 | TokenSpec("REAL", r"[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*"), 215 | TokenSpec("INT", r"[0-9]+"), 216 | TokenSpec("INT", r"\$[0-9A-Fa-f]+"), 217 | TokenSpec("OP", r"(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/@\^]"), 218 | TokenSpec("STRING", r"'([^']|(''))*'"), 219 | TokenSpec("CHAR", r"#[0-9]+"), 220 | TokenSpec("CHAR", r"#\$[0-9A-Fa-f]+"), 221 | ] 222 | # tokenize = make_tokenizer(_example_token_specs) 223 | -------------------------------------------------------------------------------- /funcparserlib/parser.py: -------------------------------------------------------------------------------- 1 | # Copyright © 2009/2023 Andrey Vlasovskikh 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 4 | # software and associated documentation files (the "Software"), to deal in the Software 5 | # without restriction, including without limitation the rights to use, copy, modify, 6 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | # permit persons to whom the Software is furnished to do so, subject to the following 8 | # conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all copies 11 | # or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 14 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 15 | # PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 16 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 17 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 18 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
19 | 20 | """Functional parsing combinators. 21 | 22 | Parsing combinators define an internal domain-specific language (DSL) for describing 23 | the parsing rules of a grammar. The DSL allows you to start with a few primitive 24 | parsers, then combine your parsers to get more complex ones, and finally cover 25 | the whole grammar you want to parse. 26 | 27 | The structure of the language: 28 | 29 | * Class `Parser` 30 | * All the primitives and combinators of the language return `Parser` objects 31 | * It defines the main `Parser.parse(tokens)` method 32 | * Primitive parsers 33 | * `tok(type, value)`, `a(value)`, `some(pred)`, `forward_decl()`, `finished` 34 | * Parser combinators 35 | * `p1 + p2`, `p1 | p2`, `p >> f`, `-p`, `maybe(p)`, `many(p)`, `oneplus(p)`, 36 | `skip(p)` 37 | * Abstraction 38 | * Use regular Python variables `p = ... # Expression of type Parser` to define new 39 | rules (non-terminals) of your grammar 40 | 41 | Every time you apply one of the combinators, you get a new `Parser` object. In other 42 | words, the set of `Parser` objects is closed under the means of combination. 43 | 44 | !!! Note 45 | 46 | We took the parsing combinators language from the book [Introduction to Functional 47 | Programming][1] and translated it from ML into Python. 48 | 49 | [1]: https://www.cl.cam.ac.uk/teaching/Lectures/funprog-jrh-1996/ 50 | """ 51 | 52 | __all__ = [ 53 | "some", 54 | "a", 55 | "tok", 56 | "many", 57 | "pure", 58 | "finished", 59 | "maybe", 60 | "skip", 61 | "oneplus", 62 | "forward_decl", 63 | "NoParseError", 64 | "Parser", 65 | ] 66 | 67 | import logging 68 | import warnings 69 | from typing import ( 70 | Any, 71 | Callable, 72 | Generic, 73 | List, 74 | Optional, 75 | Sequence, 76 | Tuple, 77 | TypeVar, 78 | Union, 79 | cast, 80 | overload, 81 | ) 82 | 83 | from funcparserlib.lexer import Token 84 | 85 | log = logging.getLogger("funcparserlib") 86 | 87 | debug = False 88 | 89 | _A = TypeVar("_A") 90 | _B = TypeVar("_B") 91 | _C = TypeVar("_C") 92 | 93 | 94 | class Parser(Generic[_A, _B]): 95 | """A parser object that can parse a sequence of tokens or can be combined with 96 | other parsers using `+`, `|`, `>>`, `many()`, and other parsing combinators. 97 | 98 | Type: `Parser[A, B]` 99 | 100 | The generic variables in the type are: `A` — the type of the tokens in the 101 | sequence to parse,`B` — the type of the parsed value. 102 | 103 | In order to define a parser for your grammar: 104 | 105 | 1. You start with primitive parsers by calling `a(value)`, `some(pred)`, 106 | `forward_decl()`, `finished` 107 | 2. You use parsing combinators `p1 + p2`, `p1 | p2`, `p >> f`, `many(p)`, and 108 | others to combine parsers into a more complex parser 109 | 3. You can assign complex parsers to variables to define names that correspond to 110 | the rules of your grammar 111 | 112 | !!! Note 113 | 114 | The constructor `Parser.__init__()` is considered **internal** and may be 115 | changed in future versions. Use primitive parsers and parsing combinators to 116 | construct new parsers. 117 | """ 118 | 119 | def __init__( 120 | self, 121 | p: Union[ 122 | "Parser[_A, _B]", 123 | Callable[[Sequence[_A], "State"], Tuple[_B, "State"]], 124 | ], 125 | ) -> None: 126 | """Wrap the parser function `p` into a `Parser` object.""" 127 | self.name = "" 128 | self.define(p) 129 | 130 | def named(self, name: str) -> "Parser[_A, _B]": 131 | # noinspection GrazieInspection 132 | """Specify the name of the parser for easier debugging. 
133 | 134 | Type: `(str) -> Parser[A, B]` 135 | 136 | This name is used in the debug-level parsing log. You can also get it via the 137 | `Parser.name` attribute. 138 | 139 | Examples: 140 | 141 | ```pycon 142 | >>> expr = (a("x") + a("y")).named("expr") 143 | >>> expr.name 144 | 'expr' 145 | 146 | ``` 147 | 148 | ```pycon 149 | >>> expr = a("x") + a("y") 150 | >>> expr.name 151 | "('x', 'y')" 152 | 153 | ``` 154 | 155 | !!! Note 156 | 157 | You can enable the parsing log this way: 158 | 159 | ```python 160 | import logging 161 | logging.basicConfig(level=logging.DEBUG) 162 | import funcparserlib.parser 163 | funcparserlib.parser.debug = True 164 | ``` 165 | 166 | The way to enable the parsing log may be changed in future versions. 167 | """ 168 | self.name = name 169 | return self 170 | 171 | def define( 172 | self, 173 | p: Union[ 174 | "Parser[_A, _B]", 175 | Callable[[Sequence[_A], "State"], Tuple[_B, "State"]], 176 | ], 177 | ) -> None: 178 | """Define the parser created earlier as a forward declaration. 179 | 180 | Type: `(Parser[A, B]) -> None` 181 | 182 | Use `p = forward_decl()` in combination with `p.define(...)` to define 183 | recursive parsers. 184 | 185 | See the examples in the docs for `forward_decl()`. 186 | """ 187 | f = getattr(p, "run", p) 188 | if debug: 189 | setattr(self, "_run", f) 190 | else: 191 | setattr(self, "run", f) 192 | name = getattr(p, "name", p.__doc__) 193 | if name is not None: 194 | self.named(name) 195 | 196 | def run(self, tokens: Sequence[_A], s: "State") -> Tuple[_B, "State"]: 197 | """Run the parser against the tokens with the specified parsing state. 198 | 199 | Type: `(Sequence[A], State) -> Tuple[B, State]` 200 | 201 | The parsing state includes the current position in the sequence being parsed, 202 | and the position of the rightmost token that has been consumed while parsing for 203 | better error messages. 204 | 205 | If the parser fails to parse the tokens, it raises `NoParseError`. 206 | 207 | !!! Warning 208 | 209 | This is method is **internal** and may be changed in future versions. Use 210 | `Parser.parse(tokens)` instead and let the parser object take care of 211 | updating the parsing state. 212 | """ 213 | if debug: 214 | log.debug("trying %s" % self.name) 215 | return self._run(tokens, s) 216 | 217 | def _run(self, tokens: Sequence[_A], s: "State") -> Tuple[_B, "State"]: 218 | raise NotImplementedError("you must define() a parser") 219 | 220 | def parse(self, tokens: Sequence[_A]) -> _B: 221 | """Parse the sequence of tokens and return the parsed value. 222 | 223 | Type: `(Sequence[A]) -> B` 224 | 225 | It takes a sequence of tokens of arbitrary type `A` and returns the parsed value 226 | of arbitrary type `B`. 227 | 228 | If the parser fails to parse the tokens, it raises `NoParseError`. 229 | 230 | !!! Note 231 | 232 | Although `Parser.parse()` can parse sequences of any objects (including 233 | `str` which is a sequence of `str` chars), **the recommended way** is 234 | parsing sequences of `Token` objects. 235 | 236 | You **should** use a regexp-based tokenizer `make_tokenizer()` defined in 237 | `funcparserlib.lexer` to convert your text into a sequence of `Token` 238 | objects before parsing it. You will get more readable parsing error messages 239 | (as `Token` objects contain their position in the source file) and good 240 | separation of the lexical and syntactic levels of the grammar. 
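Example (a minimal sketch, parsing a prepared list of `Token` objects):

```pycon
>>> p = tok("int") >> int
>>> p.parse([Token("int", "42")])
42

```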
241 | """ 242 | try: 243 | (tree, _) = self.run(tokens, State(0, 0, None)) 244 | return tree 245 | except NoParseError as e: 246 | max = e.state.max 247 | if len(tokens) > max: 248 | t = tokens[max] 249 | if isinstance(t, Token): 250 | if t.start is None or t.end is None: 251 | loc = "" 252 | else: 253 | s_line, s_pos = t.start 254 | e_line, e_pos = t.end 255 | loc = "%d,%d-%d,%d: " % (s_line, s_pos, e_line, e_pos) 256 | msg = "%s%s: %r" % (loc, e.msg, t.value) 257 | elif isinstance(t, str): 258 | msg = "%s: %r" % (e.msg, t) 259 | else: 260 | msg = "%s: %s" % (e.msg, t) 261 | else: 262 | msg = "got unexpected end of input" 263 | e_parser = e.state.parser 264 | if isinstance(e_parser, Parser): 265 | msg = "%s, expected: %s" % (msg, e_parser.name) 266 | e.msg = msg 267 | raise 268 | 269 | @overload 270 | def __add__( # type: ignore[misc] 271 | self, other: "_IgnoredParser[_A]" 272 | ) -> "Parser[_A, _B]": 273 | pass 274 | 275 | @overload 276 | def __add__(self, other: "Parser[_A, _C]") -> "_TupleParser[_A, Tuple[_B, _C]]": 277 | pass 278 | 279 | def __add__( 280 | self, 281 | other: Union["_IgnoredParser[_A]", "Parser[_A, _C]"], 282 | ) -> Union["Parser[_A, _B]", "_TupleParser[_A, Tuple[_B, _C]]"]: 283 | """Sequential combination of parsers. It runs this parser, then the other 284 | parser. 285 | 286 | The return value of the resulting parser is a tuple of each parsed value in 287 | the sum of parsers. We merge all parsing results of `p1 + p2 + ... + pN` into a 288 | single tuple. It means that the parsing result may be a 2-tuple, a 3-tuple, 289 | a 4-tuple, etc. of parsed values. You avoid this by transforming the parsed 290 | pair into a new value using the `>>` combinator. 291 | 292 | You can also skip some parsing results in the resulting parsers by using `-p` 293 | or `skip(p)` for some parsers in your sum of parsers. It means that the parsing 294 | result might be a single value, not a tuple of parsed values. See the docs 295 | for `Parser.__neg__()` for more examples. 296 | 297 | Overloaded types (lots of them to provide stricter checking for the quite 298 | dynamic return type of this method): 299 | 300 | * `(self: Parser[A, B], _IgnoredParser[A]) -> Parser[A, B]` 301 | * `(self: Parser[A, B], Parser[A, C]) -> _TupleParser[A, Tuple[B, C]]` 302 | * `(self: _TupleParser[A, B], _IgnoredParser[A]) -> _TupleParser[A, B]` 303 | * `(self: _TupleParser[A, B], Parser[A, Any]) -> Parser[A, Any]` 304 | * `(self: _IgnoredParser[A], _IgnoredParser[A]) -> _IgnoredParser[A]` 305 | * `(self: _IgnoredParser[A], Parser[A, C]) -> Parser[A, C]` 306 | 307 | Examples: 308 | 309 | ```pycon 310 | >>> expr = a("x") + a("y") 311 | >>> expr.parse("xy") 312 | ('x', 'y') 313 | 314 | ``` 315 | 316 | ```pycon 317 | >>> expr = a("x") + a("y") + a("z") 318 | >>> expr.parse("xyz") 319 | ('x', 'y', 'z') 320 | 321 | ``` 322 | 323 | ```pycon 324 | >>> expr = a("x") + a("y") 325 | >>> expr.parse("xz") 326 | Traceback (most recent call last): 327 | ... 
328 | parser.NoParseError: got unexpected token: 'z', expected: 'y' 329 | 330 | ``` 331 | """ 332 | 333 | def magic(v1: Any, v2: Any) -> _Tuple: 334 | if isinstance(v1, _Tuple): 335 | return _Tuple(v1 + (v2,)) 336 | else: 337 | return _Tuple((v1, v2)) 338 | 339 | @_TupleParser 340 | def _add(tokens: Sequence[_A], s: State) -> Tuple[Tuple[_B, _C], State]: 341 | (v1, s2) = self.run(tokens, s) 342 | (v2, s3) = other.run(tokens, s2) 343 | return cast(Tuple[_B, _C], magic(v1, v2)), s3 344 | 345 | @Parser 346 | def ignored_right(tokens: Sequence[_A], s: State) -> Tuple[_B, State]: 347 | v, s2 = self.run(tokens, s) 348 | _, s3 = other.run(tokens, s2) 349 | return v, s3 350 | 351 | name = "(%s, %s)" % (self.name, other.name) 352 | if isinstance(other, _IgnoredParser): 353 | return ignored_right.named(name) 354 | else: 355 | _add.name = name 356 | return _add 357 | 358 | def __or__(self, other: "Parser[_A, _C]") -> "Parser[_A, Union[_B, _C]]": 359 | """Choice combination of parsers. 360 | 361 | It runs this parser and returns its result. If the parser fails, it runs the 362 | other parser. 363 | 364 | Examples: 365 | 366 | ```pycon 367 | >>> expr = a("x") | a("y") 368 | >>> expr.parse("x") 369 | 'x' 370 | >>> expr.parse("y") 371 | 'y' 372 | >>> expr.parse("z") 373 | Traceback (most recent call last): 374 | ... 375 | parser.NoParseError: got unexpected token: 'z', expected: 'x' or 'y' 376 | 377 | ``` 378 | """ 379 | 380 | @Parser 381 | def _or(tokens: Sequence[_A], s: State) -> Tuple[Union[_B, _C], State]: 382 | try: 383 | return self.run(tokens, s) 384 | except NoParseError as e: 385 | state = e.state 386 | try: 387 | return other.run(tokens, State(s.pos, state.max, state.parser)) 388 | except NoParseError as e: 389 | if s.pos == e.state.max: 390 | e.state = State(e.state.pos, e.state.max, _or) 391 | raise 392 | 393 | _or.name = "%s or %s" % (self.name, other.name) 394 | return _or 395 | 396 | def __rshift__(self, f: Callable[[_B], _C]) -> "Parser[_A, _C]": 397 | """Transform the parsing result by applying the specified function. 398 | 399 | Type: `(Callable[[B], C]) -> Parser[A, C]` 400 | 401 | You can use it for transforming the parsed value into another value before 402 | including it into the parse tree (the AST). 403 | 404 | Examples: 405 | 406 | ```pycon 407 | >>> def make_canonical_name(s): 408 | ... return s.lower() 409 | >>> expr = (a("D") | a("d")) >> make_canonical_name 410 | >>> expr.parse("D") 411 | 'd' 412 | >>> expr.parse("d") 413 | 'd' 414 | 415 | ``` 416 | """ 417 | 418 | @Parser 419 | def _shift(tokens: Sequence[_A], s: State) -> Tuple[_C, State]: 420 | (v, s2) = self.run(tokens, s) 421 | return f(v), s2 422 | 423 | return _shift.named(self.name) 424 | 425 | def bind(self, f: Callable[[_B], "Parser[_A, _C]"]) -> "Parser[_A, _C]": 426 | """Bind the parser to a monadic function that returns a new parser. 427 | 428 | Type: `(Callable[[B], Parser[A, C]]) -> Parser[A, C]` 429 | 430 | Also known as `>>=` in Haskell. 431 | 432 | !!! Note 433 | 434 | You can parse any context-free grammar without resorting to `bind`. Due 435 | to its poor performance please use it only when you really need it. 
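Example (a minimal sketch: parse a token, then require the next token to be equal to the one just parsed):

```pycon
>>> p = a("x").bind(lambda v: a(v))
>>> p.parse("xx")
'x'

```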
436 | """ 437 | 438 | @Parser 439 | def _bind(tokens: Sequence[_A], s: State) -> Tuple[_C, State]: 440 | (v, s2) = self.run(tokens, s) 441 | return f(v).run(tokens, s2) 442 | 443 | _bind.name = "(%s >>=)" % (self.name,) 444 | return _bind 445 | 446 | def __neg__(self) -> "_IgnoredParser[_A]": 447 | """Return a parser that parses the same tokens, but its parsing result is 448 | ignored by the sequential `+` combinator. 449 | 450 | Type: `(Parser[A, B]) -> _IgnoredParser[A]` 451 | 452 | You can use it for throwing away elements of concrete syntax (e.g. `","`, 453 | `";"`). 454 | 455 | Examples: 456 | 457 | ```pycon 458 | >>> expr = -a("x") + a("y") 459 | >>> expr.parse("xy") 460 | 'y' 461 | 462 | ``` 463 | 464 | ```pycon 465 | >>> expr = a("x") + -a("y") 466 | >>> expr.parse("xy") 467 | 'x' 468 | 469 | ``` 470 | 471 | ```pycon 472 | >>> expr = a("x") + -a("y") + a("z") 473 | >>> expr.parse("xyz") 474 | ('x', 'z') 475 | 476 | ``` 477 | 478 | ```pycon 479 | >>> expr = -a("x") + a("y") + -a("z") 480 | >>> expr.parse("xyz") 481 | 'y' 482 | 483 | ``` 484 | 485 | ```pycon 486 | >>> expr = -a("x") + a("y") 487 | >>> expr.parse("yz") 488 | Traceback (most recent call last): 489 | ... 490 | parser.NoParseError: got unexpected token: 'y', expected: 'x' 491 | 492 | ``` 493 | 494 | ```pycon 495 | >>> expr = a("x") + -a("y") 496 | >>> expr.parse("xz") 497 | Traceback (most recent call last): 498 | ... 499 | parser.NoParseError: got unexpected token: 'z', expected: 'y' 500 | 501 | ``` 502 | 503 | !!! Note 504 | 505 | You **should not** pass the resulting parser to any combinators other than 506 | `+`. You **should** have at least one non-skipped value in your 507 | `p1 + p2 + ... + pN`. The parsed value of `-p` is an **internal** `_Ignored` 508 | object, not intended for actual use. 509 | """ 510 | return _IgnoredParser(self) 511 | 512 | 513 | class State: 514 | """Parsing state that is maintained basically for error reporting. 515 | 516 | It consists of the current position `pos` in the sequence being parsed, and the 517 | position `max` of the rightmost token that has been consumed while parsing. 
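Example (a small illustration: `State(0, 0)` is the initial state that `Parser.parse()` starts with):

```pycon
>>> State(0, 0)
State(0, 0)

```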
518 | """ 519 | 520 | def __init__( 521 | self, 522 | pos: int, 523 | max: int, 524 | parser: Union[ 525 | Parser, 526 | Callable[[Any, "State"], Tuple[Any, "State"]], 527 | None, 528 | ] = None, 529 | ) -> None: 530 | self.pos = pos 531 | self.max = max 532 | self.parser = parser 533 | 534 | def __str__(self) -> str: 535 | return str((self.pos, self.max)) 536 | 537 | def __repr__(self) -> str: 538 | return "State(%r, %r)" % (self.pos, self.max) 539 | 540 | 541 | class NoParseError(Exception): 542 | def __init__(self, msg: str, state: State) -> None: 543 | self.msg = msg 544 | self.state = state 545 | 546 | def __str__(self) -> str: 547 | return self.msg 548 | 549 | 550 | class _Tuple(tuple): 551 | pass 552 | 553 | 554 | class _TupleParser(Parser[_A, _B], Generic[_A, _B]): 555 | @overload # type: ignore[override] 556 | def __add__(self, other: "_IgnoredParser[_A]") -> "_TupleParser[_A, _B]": 557 | pass 558 | 559 | @overload 560 | def __add__(self, other: Parser[_A, Any]) -> Parser[_A, Any]: 561 | pass 562 | 563 | def __add__( 564 | self, other: Union["_IgnoredParser[_A]", Parser[_A, Any]] 565 | ) -> Union["_TupleParser[_A, _B]", Parser[_A, Any]]: 566 | return super().__add__(other) 567 | 568 | 569 | class _Ignored: 570 | def __init__(self, value: Any) -> None: 571 | self.value = value 572 | 573 | def __repr__(self) -> str: 574 | return "_Ignored(%s)" % repr(self.value) 575 | 576 | def __eq__(self, other: object) -> bool: 577 | return isinstance(other, _Ignored) and self.value == other.value 578 | 579 | 580 | @Parser 581 | def finished(tokens: Sequence[Any], s: State) -> Tuple[None, State]: 582 | """A parser that throws an exception if there are any unparsed tokens left in the 583 | sequence.""" 584 | if s.pos >= len(tokens): 585 | return None, s 586 | else: 587 | s2 = State(s.pos, s.max, finished if s.pos == s.max else s.parser) 588 | raise NoParseError("got unexpected token", s2) 589 | 590 | 591 | finished.name = "end of input" 592 | 593 | 594 | def many(p: Parser[_A, _B]) -> Parser[_A, List[_B]]: 595 | """Return a parser that applies the parser `p` as many times as it succeeds at 596 | parsing the tokens. 597 | 598 | Return a parser that infinitely applies the parser `p` to the input sequence 599 | of tokens as long as it successfully parses them. The parsed value is a list of 600 | the sequentially parsed values. 601 | 602 | Examples: 603 | 604 | ```pycon 605 | >>> expr = many(a("x")) 606 | >>> expr.parse("x") 607 | ['x'] 608 | >>> expr.parse("xx") 609 | ['x', 'x'] 610 | >>> expr.parse("xxxy") # noqa 611 | ['x', 'x', 'x'] 612 | >>> expr.parse("y") 613 | [] 614 | 615 | ``` 616 | """ 617 | 618 | @Parser 619 | def _many(tokens: Sequence[_A], s: State) -> Tuple[List[_B], State]: 620 | res = [] 621 | try: 622 | while True: 623 | (v, s) = p.run(tokens, s) 624 | res.append(v) 625 | except NoParseError as e: 626 | s2 = State(s.pos, e.state.max, e.state.parser) 627 | if debug: 628 | log.debug( 629 | "*matched* %d instances of %s, new state = %s" 630 | % (len(res), _many.name, s2) 631 | ) 632 | return res, s2 633 | 634 | _many.name = "{ %s }" % p.name 635 | return _many 636 | 637 | 638 | def some(pred: Callable[[_A], bool]) -> Parser[_A, _A]: 639 | """Return a parser that parses a token if it satisfies the predicate `pred`. 
640 | 641 | Type: `(Callable[[A], bool]) -> Parser[A, A]` 642 | 643 | Examples: 644 | 645 | ```pycon 646 | >>> expr = some(lambda s: s.isalpha()).named('alpha') 647 | >>> expr.parse("x") 648 | 'x' 649 | >>> expr.parse("y") 650 | 'y' 651 | >>> expr.parse("1") 652 | Traceback (most recent call last): 653 | ... 654 | parser.NoParseError: got unexpected token: '1', expected: alpha 655 | 656 | ``` 657 | 658 | !!! Warning 659 | 660 | The `some()` combinator is quite slow and may be changed or removed in future 661 | versions. If you need a parser for a token by its type (e.g. any identifier) 662 | and maybe its value, use `tok(type[, value])` instead. You should use 663 | `make_tokenizer()` from `funcparserlib.lexer` to tokenize your text first. 664 | """ 665 | 666 | @Parser 667 | def _some(tokens: Sequence[_A], s: State) -> Tuple[_A, State]: 668 | if s.pos >= len(tokens): 669 | s2 = State(s.pos, s.max, _some if s.pos == s.max else s.parser) 670 | raise NoParseError("got unexpected end of input", s2) 671 | else: 672 | t = tokens[s.pos] 673 | if pred(t): 674 | pos = s.pos + 1 675 | s2 = State(pos, max(pos, s.max), s.parser) 676 | if debug: 677 | log.debug("*matched* %r, new state = %s" % (t, s2)) 678 | return t, s2 679 | else: 680 | s2 = State(s.pos, s.max, _some if s.pos == s.max else s.parser) 681 | if debug and isinstance(s2.parser, Parser): 682 | log.debug( 683 | "failed %r, state = %s, expected = %s" % (t, s2, s2.parser.name) 684 | ) 685 | raise NoParseError("got unexpected token", s2) 686 | 687 | _some.name = "some(...)" 688 | return _some 689 | 690 | 691 | def a(value: _A) -> Parser[_A, _A]: 692 | """Return a parser that parses a token if it's equal to `value`. 693 | 694 | Type: `(A) -> Parser[A, A]` 695 | 696 | Examples: 697 | 698 | ```pycon 699 | >>> expr = a("x") 700 | >>> expr.parse("x") 701 | 'x' 702 | >>> expr.parse("y") 703 | Traceback (most recent call last): 704 | ... 705 | parser.NoParseError: got unexpected token: 'y', expected: 'x' 706 | 707 | ``` 708 | 709 | !!! Note 710 | 711 | Although `Parser.parse()` can parse sequences of any objects (including 712 | `str` which is a sequence of `str` chars), **the recommended way** is 713 | parsing sequences of `Token` objects. 714 | 715 | You **should** use a regexp-based tokenizer `make_tokenizer()` defined in 716 | `funcparserlib.lexer` to convert your text into a sequence of `Token` objects 717 | before parsing it. You will get more readable parsing error messages (as `Token` 718 | objects contain their position in the source file) and good separation of the 719 | lexical and syntactic levels of the grammar. 720 | """ 721 | name = getattr(value, "name", value) 722 | 723 | def eq_value(t: _A) -> bool: 724 | return t == value 725 | 726 | return some(eq_value).named(repr(name)) 727 | 728 | 729 | def tok(type: str, value: Optional[str] = None) -> Parser[Token, str]: 730 | """Return a parser that parses a `Token` and returns the string value of the token. 731 | 732 | Type: `(str, Optional[str]) -> Parser[Token, str]` 733 | 734 | You can match any token of the specified `type` or you can match a specific token by 735 | its `type` and `value`. 736 | 737 | Examples: 738 | 739 | ```pycon 740 | >>> expr = tok("expr") 741 | >>> expr.parse([Token("expr", "foo")]) 742 | 'foo' 743 | >>> expr.parse([Token("expr", "bar")]) 744 | 'bar' 745 | >>> expr.parse([Token("op", "=")]) 746 | Traceback (most recent call last): 747 | ... 
748 | parser.NoParseError: got unexpected token: '=', expected: expr 749 | 750 | ``` 751 | 752 | ```pycon 753 | >>> expr = tok("op", "=") 754 | >>> expr.parse([Token("op", "=")]) 755 | '=' 756 | >>> expr.parse([Token("op", "+")]) 757 | Traceback (most recent call last): 758 | ... 759 | parser.NoParseError: got unexpected token: '+', expected: '=' 760 | 761 | ``` 762 | 763 | !!! Note 764 | 765 | In order to convert your text to parse into a sequence of `Token` objects, 766 | use a regexp-based tokenizer `make_tokenizer()` defined in 767 | `funcparserlib.lexer`. You will get more readable parsing error messages (as 768 | `Token` objects contain their position in the source file) and good separation 769 | of the lexical and syntactic levels of the grammar. 770 | """ 771 | 772 | def eq_type(t: Token) -> bool: 773 | return t.type == type 774 | 775 | if value is not None: 776 | p = a(Token(type, value)) 777 | else: 778 | p = some(eq_type).named(type) 779 | return (p >> (lambda t: t.value)).named(p.name) 780 | 781 | 782 | def pure(x: _A) -> Parser[Any, _A]: 783 | """Wrap any object into a parser. 784 | 785 | Type: `(A) -> Parser[A, A]` 786 | 787 | A pure parser doesn't touch the tokens sequence, it just returns its pure `x` 788 | value. 789 | 790 | Also known as `return` in Haskell. 791 | """ 792 | 793 | @Parser 794 | def _pure(_: Sequence[Any], s: State) -> Tuple[_A, State]: 795 | return x, s 796 | 797 | _pure.name = "(pure %r)" % (x,) 798 | return _pure 799 | 800 | 801 | def maybe(p: Parser[_A, _B]) -> Parser[_A, Optional[_B]]: 802 | """Return a parser that returns `None` if the parser `p` fails. 803 | 804 | Examples: 805 | 806 | ```pycon 807 | >>> expr = maybe(a("x")) 808 | >>> expr.parse("x") 809 | 'x' 810 | >>> expr.parse("y") is None 811 | True 812 | 813 | ``` 814 | """ 815 | return (p | pure(None)).named("[ %s ]" % (p.name,)) 816 | 817 | 818 | def skip(p: Parser[_A, Any]) -> "_IgnoredParser[_A]": 819 | """An alias for `-p`. 820 | 821 | See also the docs for `Parser.__neg__()`. 
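Examples (a minimal sketch, mirroring the `-p` examples in `Parser.__neg__()`):

```pycon
>>> expr = skip(a("x")) + a("y")
>>> expr.parse("xy")
'y'

```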
822 | """ 823 | return -p 824 | 825 | 826 | class _IgnoredParser(Parser[_A, Any]): 827 | def __init__( 828 | self, 829 | p: Union[ 830 | Parser[_A, Any], 831 | Callable[[Sequence[_A], "State"], Tuple[Any, "State"]], 832 | ], 833 | ) -> None: 834 | super(_IgnoredParser, self).__init__(p) 835 | run = self._run if debug else self.run 836 | 837 | def ignored(tokens: Sequence[_A], s: State) -> Tuple[Any, State]: 838 | v, s2 = run(tokens, s) 839 | return v if isinstance(v, _Ignored) else _Ignored(v), s2 840 | 841 | self.define(ignored) 842 | name = getattr(p, "name", p.__doc__) 843 | if name is not None: 844 | self.name = name 845 | 846 | @overload # type: ignore[override] 847 | def __add__(self, other: "_IgnoredParser[_A]") -> "_IgnoredParser[_A]": 848 | pass 849 | 850 | @overload 851 | def __add__(self, other: Parser[_A, _C]) -> Parser[_A, _C]: 852 | pass 853 | 854 | def __add__( 855 | self, other: Union["_IgnoredParser[_A]", Parser[_A, _C]] 856 | ) -> Union["_IgnoredParser[_A]", Parser[_A, _C]]: 857 | if isinstance(other, _IgnoredParser): 858 | 859 | @_IgnoredParser 860 | def ip(tokens: Sequence[_A], s: State) -> Tuple[Any, State]: 861 | _, s2 = self.run(tokens, s) 862 | v, s3 = other.run(tokens, s2) 863 | return v, s3 864 | 865 | ip.name = "(%s, %s)" % (self.name, other.name) 866 | return ip 867 | else: 868 | 869 | @Parser 870 | def p(tokens: Sequence[_A], s: State) -> Tuple[_C, State]: 871 | _, s2 = self.run(tokens, s) 872 | v, s3 = other.run(tokens, s2) 873 | return v, s3 874 | 875 | p.name = "(%s, %s)" % (self.name, other.name) 876 | return p 877 | 878 | 879 | def oneplus(p: Parser[_A, _B]) -> Parser[_A, List[_B]]: 880 | """Return a parser that applies the parser `p` one or more times. 881 | 882 | A similar parser combinator `many(p)` means apply `p` zero or more times, whereas 883 | `oneplus(p)` means apply `p` one or more times. 884 | 885 | Examples: 886 | 887 | ```pycon 888 | >>> expr = oneplus(a("x")) 889 | >>> expr.parse("x") 890 | ['x'] 891 | >>> expr.parse("xx") 892 | ['x', 'x'] 893 | >>> expr.parse("y") 894 | Traceback (most recent call last): 895 | ... 896 | parser.NoParseError: got unexpected token: 'y', expected: 'x' 897 | 898 | ``` 899 | """ 900 | 901 | @Parser 902 | def _oneplus(tokens: Sequence[_A], s: State) -> Tuple[List[_B], State]: 903 | (v1, s2) = p.run(tokens, s) 904 | (v2, s3) = many(p).run(tokens, s2) 905 | return [v1] + v2, s3 906 | 907 | _oneplus.name = "(%s, { %s })" % (p.name, p.name) 908 | return _oneplus 909 | 910 | 911 | def with_forward_decls(suspension: Callable[[], Parser[_A, _B]]) -> Parser[_A, _B]: 912 | warnings.warn( 913 | "Use forward_decl() instead:\n" 914 | "\n" 915 | " p = forward_decl()\n" 916 | " ...\n" 917 | " p.define(parser_value)\n", 918 | DeprecationWarning, 919 | ) 920 | 921 | @Parser 922 | def f(tokens: Sequence[_A], s: State) -> Tuple[_B, State]: 923 | return suspension().run(tokens, s) 924 | 925 | return f 926 | 927 | 928 | def forward_decl() -> Parser[Any, Any]: 929 | """Return an undefined parser that can be used as a forward declaration. 930 | 931 | Type: `Parser[Any, Any]` 932 | 933 | Use `p = forward_decl()` in combination with `p.define(...)` to define recursive 934 | parsers. 935 | 936 | 937 | Examples: 938 | 939 | ```pycon 940 | >>> expr = forward_decl() 941 | >>> expr.define(a("x") + maybe(expr) + a("y")) 942 | >>> expr.parse("xxyy") # noqa 943 | ('x', ('x', None, 'y'), 'y') 944 | >>> expr.parse("xxy") 945 | Traceback (most recent call last): 946 | ... 
947 | parser.NoParseError: got unexpected end of input, expected: 'y' 948 | 949 | ``` 950 | 951 | !!! Note 952 | 953 | If you care about static types, you should add a type hint for your forward 954 | declaration, so that your type checker can check types in `p.define(...)` later: 955 | 956 | ```python 957 | p: Parser[str, int] = forward_decl() 958 | p.define(a("x")) # Type checker error 959 | p.define(a("1") >> int) # OK 960 | ``` 961 | """ 962 | 963 | @Parser 964 | def f(_tokens: Any, _s: Any) -> Any: 965 | raise NotImplementedError("you must define() a forward_decl somewhere") 966 | 967 | f.name = "forward_decl()" 968 | return f 969 | 970 | 971 | if __name__ == "__main__": 972 | import doctest 973 | 974 | doctest.testmod() 975 | -------------------------------------------------------------------------------- /funcparserlib/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlasovskikh/funcparserlib/18c0a99dcdb427e35226c74b7cc2617223c8e1fa/funcparserlib/py.typed -------------------------------------------------------------------------------- /funcparserlib/util.py: -------------------------------------------------------------------------------- 1 | # Copyright © 2009/2023 Andrey Vlasovskikh 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 4 | # software and associated documentation files (the "Software"), to deal in the Software 5 | # without restriction, including without limitation the rights to use, copy, modify, 6 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | # permit persons to whom the Software is furnished to do so, subject to the following 8 | # conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all copies 11 | # or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 14 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 15 | # PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 16 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 17 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 18 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | from typing import TypeVar, Callable, Sequence 21 | 22 | _A = TypeVar("_A") 23 | 24 | 25 | def pretty_tree( 26 | x: _A, 27 | kids: Callable[[_A], Sequence[_A]], 28 | show: Callable[[_A], str], 29 | ) -> str: 30 | """Return a pseudo-graphic tree representation of the object `x` similar to the 31 | `tree` command in Unix. 32 | 33 | Type: `(T, Callable[[T], List[T]], Callable[[T], str]) -> str` 34 | 35 | It applies the parameter `show` (which is a function of type `(T) -> str`) to get a 36 | textual representation of the objects to show. 37 | 38 | It applies the parameter `kids` (which is a function of type `(T) -> List[T]`) to 39 | list the children of the object to show. 40 | 41 | Examples: 42 | 43 | ```pycon 44 | >>> print(pretty_tree( 45 | ... ["foo", ["bar", "baz"], "quux"], 46 | ... lambda obj: obj if isinstance(obj, list) else [], 47 | ... lambda obj: "[]" if isinstance(obj, list) else str(obj), 48 | ... 
)) 49 | [] 50 | |-- foo 51 | |-- [] 52 | | |-- bar 53 | | `-- baz 54 | `-- quux 55 | 56 | ``` 57 | """ 58 | (MID, END, CONT, LAST, ROOT) = ("|-- ", "`-- ", "| ", " ", "") 59 | 60 | def rec(obj: _A, indent: str, sym: str) -> str: 61 | line = indent + sym + show(obj) 62 | obj_kids = kids(obj) 63 | if len(obj_kids) == 0: 64 | return line 65 | else: 66 | if sym == MID: 67 | next_indent = indent + CONT 68 | elif sym == ROOT: 69 | next_indent = indent + ROOT 70 | else: 71 | next_indent = indent + LAST 72 | chars = [MID] * (len(obj_kids) - 1) + [END] 73 | lines = [rec(kid, next_indent, sym) for kid, sym in zip(obj_kids, chars)] 74 | return "\n".join([line] + lines) 75 | 76 | return rec(x, "", ROOT) 77 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: funcparserlib 2 | repo_url: https://github.com/vlasovskikh/funcparserlib 3 | repo_name: vlasovskikh/funcparserlib 4 | nav: 5 | - Home: index.md 6 | - Getting Started: 7 | - Getting Started - Intro: getting-started/index.md 8 | - Tokenizing Input: getting-started/tokenizing.md 9 | - Parsing Tokens: getting-started/parsing.md 10 | - Preparing the Parse Tree: getting-started/parse-tree.md 11 | - Tips and Tricks: getting-started/tips-and-tricks.md 12 | # - Examples: 13 | # - Nested Brackets Language: examples/brackets.md 14 | # - S-expressions Language: examples/s-exp.md 15 | # - DOT Language: examples/dot.md 16 | # - JSON Language: examples/json.md 17 | - API Reference: 18 | - API Overview: api/index.md 19 | - Lexer: api/lexer.md 20 | - Parser: api/parser.md 21 | - Utilities: api/util.md 22 | - Changelog: changes.md 23 | theme: 24 | name: material 25 | # icon: 26 | # logo: fontawesome/solid/angle-double-right 27 | features: 28 | - navigation.expand 29 | - navigation.tabs 30 | # - toc.integrate 31 | palette: 32 | - media: "(prefers-color-scheme: light)" 33 | scheme: default 34 | primary: indigo 35 | accent: indigo 36 | toggle: 37 | icon: material/weather-sunny 38 | name: Switch to dark mode 39 | - media: "(prefers-color-scheme: dark)" 40 | scheme: slate 41 | primary: blue 42 | accent: blue 43 | toggle: 44 | icon: material/weather-night 45 | name: Switch to light mode 46 | extra_css: 47 | - "media/extra.css" 48 | markdown_extensions: 49 | - pymdownx.highlight 50 | - pymdownx.superfences 51 | - admonition 52 | plugins: 53 | - search 54 | - mkdocstrings: 55 | handlers: 56 | python: 57 | options: 58 | show_root_toc_entry: false 59 | show_root_heading: true 60 | heading_level: 3 61 | show_source: false 62 | members: false 63 | watch: 64 | - funcparserlib 65 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | check_untyped_defs = True 3 | disallow_untyped_defs = True 4 | disallow_incomplete_defs = True 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "funcparserlib" 3 | version = "2.0.0a0" 4 | description = "Recursive descent parsing library based on functional combinators" 5 | authors = ["Andrey Vlasovskikh "] 6 | license = "MIT" 7 | readme = "README.md" 8 | homepage = "https://funcparserlib.pirx.ru" 9 | repository = "https://github.com/vlasovskikh/funcparserlib" 10 | classifiers = [ 11 | "Development Status :: 5 - 
Production/Stable", 12 | "Intended Audience :: Developers", 13 | "License :: OSI Approved :: MIT License", 14 | "Operating System :: OS Independent", 15 | "Programming Language :: Python :: 3.8", 16 | "Programming Language :: Python :: 3.9", 17 | "Programming Language :: Python :: 3.10", 18 | "Programming Language :: Python :: 3.11", 19 | "Programming Language :: Python :: 3.12", 20 | ] 21 | 22 | [tool.poetry.dependencies] 23 | python = "^3.8" 24 | 25 | [tool.poetry.dev-dependencies] 26 | pre-commit = {version = "^3.5.0"} 27 | tox = {version = "^4.4.6"} 28 | mkdocs = {version = "^1.4.2"} 29 | mkdocs-material = {version = "^9.1.1"} 30 | mkdocstrings = {extras = ["python"], version = "^0.24.0"} 31 | 32 | [build-system] 33 | requires = ["poetry-core>=1.5.1"] 34 | build-backend = "poetry.core.masonry.api" 35 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlasovskikh/funcparserlib/18c0a99dcdb427e35226c74b7cc2617223c8e1fa/tests/__init__.py -------------------------------------------------------------------------------- /tests/dot.py: -------------------------------------------------------------------------------- 1 | # Copyright © 2009/2023 Andrey Vlasovskikh 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 4 | # software and associated documentation files (the "Software"), to deal in the Software 5 | # without restriction, including without limitation the rights to use, copy, modify, 6 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | # permit persons to whom the Software is furnished to do so, subject to the following 8 | # conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all copies 11 | # or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 14 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 15 | # PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 16 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 17 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 18 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | """A DOT language parser using funcparserlib. 21 | 22 | The parser is based on [the DOT grammar][1]. It is pretty complete with a few 23 | not supported things: 24 | 25 | * String escapes 26 | * Ports and compass points 27 | * XML identifiers 28 | 29 | At the moment, the parser builds only a parse tree, not an abstract syntax tree 30 | (AST), or an API for dealing with DOT. 
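A quick usage sketch (illustration only):

```pycon
>>> parse(tokenize("digraph g { a -> b }"))
Graph(strict=None, type='digraph', id='g', stmts=[Edge(nodes=['a', 'b'], attrs=[])])

```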
31 | 32 | [1]: https://www.graphviz.org/doc/info/lang.html 33 | """ 34 | 35 | import os 36 | import sys 37 | from re import MULTILINE 38 | from typing import Sequence, List, TypeVar, Callable, NamedTuple, Union, Optional 39 | 40 | from funcparserlib.lexer import TokenSpec, make_tokenizer, Token, LexerError 41 | from funcparserlib.parser import ( 42 | maybe, 43 | many, 44 | finished, 45 | oneplus, 46 | forward_decl, 47 | NoParseError, 48 | Parser, 49 | tok, 50 | ) 51 | from funcparserlib.util import pretty_tree 52 | 53 | ENCODING = "UTF-8" 54 | 55 | 56 | class Graph(NamedTuple): 57 | strict: Optional[str] 58 | type: Optional[str] 59 | id: Optional[str] 60 | stmts: List["Statement"] 61 | 62 | 63 | class SubGraph(NamedTuple): 64 | id: Optional[str] 65 | stmts: List["Statement"] 66 | 67 | 68 | class Attr(NamedTuple): 69 | name: str 70 | value: Optional[str] 71 | 72 | 73 | class Node(NamedTuple): 74 | id: str 75 | attrs: List[Attr] 76 | 77 | 78 | class Edge(NamedTuple): 79 | nodes: List[Union[str, SubGraph]] 80 | attrs: List[Attr] 81 | 82 | 83 | class DefAttrs(NamedTuple): 84 | object: str 85 | attrs: List[Attr] 86 | 87 | 88 | Statement = Union[DefAttrs, Edge, SubGraph, Node] 89 | 90 | 91 | T = TypeVar("T") 92 | 93 | 94 | def tokenize(s: str) -> Sequence[Token]: 95 | specs = [ 96 | TokenSpec("Comment", r"/\*(.|[\r\n])*?\*/", MULTILINE), 97 | TokenSpec("Comment", r"//.*"), 98 | TokenSpec("NL", r"[\r\n]+"), 99 | TokenSpec("Space", r"[ \t\r\n]+"), 100 | TokenSpec("Name", r"[A-Za-z\200-\377_][A-Za-z\200-\377_0-9]*"), 101 | TokenSpec("Op", r"[{};,=\[\]]|(->)|(--)"), 102 | TokenSpec("Number", r"-?(\.[0-9]+)|([0-9]+(\.[0-9]*)?)"), 103 | TokenSpec("String", r'"[^"]*"'), # '\"' escapes are ignored 104 | ] 105 | useless = ["Comment", "NL", "Space"] 106 | t = make_tokenizer(specs) 107 | return [x for x in t(s) if x.type not in useless] 108 | 109 | 110 | def parse(tokens: Sequence[Token]) -> Graph: 111 | def un_arg(f: Callable[..., T]) -> Callable[[tuple], T]: 112 | return lambda args: f(*args) 113 | 114 | def flatten(xs: List[List[Attr]]) -> List[Attr]: 115 | return sum(xs, []) 116 | 117 | def n(s: str) -> Parser[Token, str]: 118 | return tok("Name", s) 119 | 120 | def op(s: str) -> Parser[Token, str]: 121 | return tok("Op", s) 122 | 123 | dot_id = (tok("Name") | tok("Number") | tok("String")).named("id") 124 | 125 | def make_graph_attr(args: tuple) -> DefAttrs: 126 | return DefAttrs("graph", [Attr(*args)]) 127 | 128 | def make_edge( 129 | node: Union[str, SubGraph], xs: List[Union[str, SubGraph]], attrs: List[Attr] 130 | ) -> Edge: 131 | return Edge([node] + xs, attrs) 132 | 133 | node_id = dot_id # + maybe(port) 134 | a_list = dot_id + maybe(-op("=") + dot_id) + -maybe(op(",")) >> un_arg(Attr) 135 | attr_list = many(-op("[") + many(a_list) + -op("]")) >> flatten 136 | attr_stmt = (n("graph") | n("node") | n("edge")) + attr_list >> un_arg(DefAttrs) 137 | graph_attr = dot_id + -op("=") + dot_id >> make_graph_attr 138 | node_stmt = node_id + attr_list >> un_arg(Node) 139 | # We use a forward_decl because of circular definitions like 140 | # (stmt_list -> stmt -> subgraph -> stmt_list) 141 | subgraph: Parser[Token, SubGraph] = forward_decl() 142 | edge_rhs = -(op("->") | op("--")) + (subgraph | node_id) 143 | edge_stmt = (subgraph | node_id) + oneplus(edge_rhs) + attr_list >> un_arg( 144 | make_edge 145 | ) 146 | stmt = attr_stmt | edge_stmt | subgraph | graph_attr | node_stmt 147 | stmt_list = many(stmt + -maybe(op(";"))) 148 | graph_body = -op("{") + stmt_list + -op("}") 149 | 
subgraph.define(-n("subgraph") + maybe(dot_id) + graph_body >> un_arg(SubGraph)) 150 | graph_modifiers = maybe(n("strict")) + maybe(n("graph") | n("digraph")) 151 | graph = graph_modifiers + maybe(dot_id) + graph_body >> un_arg(Graph) 152 | dotfile = graph + -finished 153 | 154 | return dotfile.parse(tokens) 155 | 156 | 157 | def pretty_parse_tree(obj: object) -> str: 158 | class NamedValues(NamedTuple): 159 | name: str 160 | values: Sequence[object] 161 | 162 | def kids(x: object) -> Sequence[object]: 163 | if isinstance(x, (Graph, SubGraph)): 164 | return [NamedValues("stmts", x.stmts)] 165 | elif isinstance(x, (Node, DefAttrs)): 166 | return [NamedValues("attrs", x.attrs)] 167 | elif isinstance(x, Edge): 168 | return [NamedValues("nodes", x.nodes), NamedValues("attrs", x.attrs)] 169 | elif isinstance(x, NamedValues): 170 | return x.values 171 | else: 172 | return [] 173 | 174 | def show(x: object) -> str: 175 | if isinstance(x, NamedValues): 176 | return x.name 177 | elif isinstance(x, Graph): 178 | return "Graph [id=%s, strict=%r, type=%s]" % ( 179 | x.id, 180 | x.strict is not None, 181 | x.type, 182 | ) 183 | elif isinstance(x, SubGraph): 184 | return "SubGraph [id=%s]" % (x.id,) 185 | elif isinstance(x, Edge): 186 | return "Edge" 187 | elif isinstance(x, Attr): 188 | return "Attr [name=%s, value=%s]" % (x.name, x.value) 189 | elif isinstance(x, DefAttrs): 190 | return "DefAttrs [object=%s]" % (x.object,) 191 | elif isinstance(x, Node): 192 | return "Node [id=%s]" % (x.id,) 193 | else: 194 | return str(x) 195 | 196 | return pretty_tree(obj, kids, show) 197 | 198 | 199 | def main() -> None: 200 | # import logging 201 | # logging.basicConfig(level=logging.DEBUG) 202 | # import funcparserlib 203 | # funcparserlib.parser.debug = True 204 | try: 205 | stdin = os.fdopen(sys.stdin.fileno(), "rb") 206 | text = stdin.read().decode(ENCODING) 207 | tree = parse(tokenize(text)) 208 | # print(pformat(tree)) 209 | print(pretty_parse_tree(tree).encode(ENCODING)) 210 | except (NoParseError, LexerError) as e: 211 | msg = ("syntax error: %s" % e).encode(ENCODING) 212 | print(msg, file=sys.stderr) 213 | sys.exit(1) 214 | 215 | 216 | if __name__ == "__main__": 217 | main() 218 | -------------------------------------------------------------------------------- /tests/json.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright © 2009/2023 Andrey Vlasovskikh 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | # software and associated documentation files (the "Software"), to deal in the Software 7 | # without restriction, including without limitation the rights to use, copy, modify, 8 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 9 | # permit persons to whom the Software is furnished to do so, subject to the following 10 | # conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all copies 13 | # or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 17 | # PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 18 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 19 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 20 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | 22 | """A JSON parser using funcparserlib. 23 | 24 | The parser is based on [the JSON grammar][1]. 25 | 26 | [1]: https://tools.ietf.org/html/rfc4627 27 | """ 28 | 29 | import re 30 | import sys 31 | from pprint import pformat 32 | from re import VERBOSE 33 | from typing import ( 34 | List, 35 | Sequence, 36 | Optional, 37 | Tuple, 38 | Any, 39 | Dict, 40 | Match, 41 | TypeVar, 42 | Callable, 43 | Text, 44 | Union, 45 | ) 46 | 47 | from funcparserlib.lexer import TokenSpec, make_tokenizer, Token, LexerError 48 | from funcparserlib.parser import ( 49 | maybe, 50 | many, 51 | finished, 52 | forward_decl, 53 | NoParseError, 54 | Parser, 55 | tok, 56 | ) 57 | 58 | ENCODING = "UTF-8" 59 | # noinspection SpellCheckingInspection 60 | regexps = { 61 | "escaped": r""" 62 | \\ # Escape 63 | ((?P<standard>["\\/bfnrt]) # Standard escapes 64 | | (u(?P<unicode>[0-9A-Fa-f]{4}))) # uXXXX 65 | """, 66 | "unescaped": r""" 67 | [^"\\] # Unescaped: avoid ["\\] 68 | """, 69 | } 70 | re_esc = re.compile(regexps["escaped"], VERBOSE) 71 | T = TypeVar("T") 72 | JsonValue = Union[None, bool, dict, list, int, float, str] 73 | JsonMember = Tuple[str, JsonValue] 74 | 75 | 76 | def tokenize(s: str) -> List[Token]: 77 | specs = [ 78 | TokenSpec("space", r"[ \t\r\n]+"), 79 | TokenSpec("string", r'"(%(unescaped)s | %(escaped)s)*"' % regexps, VERBOSE), 80 | TokenSpec( 81 | "number", 82 | r""" 83 | -? # Minus 84 | (0|([1-9][0-9]*)) # Int 85 | (\.[0-9]+)? # Frac 86 | ([Ee][+-]?[0-9]+)? # Exp 87 | """, 88 | VERBOSE, 89 | ), 90 | TokenSpec("op", r"[{}\[\]\-,:]"), 91 | TokenSpec("name", r"[A-Za-z_][A-Za-z_0-9]*"), 92 | ] 93 | useless = ["space"] 94 | t = make_tokenizer(specs) 95 | return [x for x in t(s) if x.type not in useless] 96 | 97 | 98 | def parse(tokens: Sequence[Token]) -> JsonValue: 99 | def const(x: T) -> Callable[[Any], T]: 100 | return lambda _: x 101 | 102 | def op(s: str) -> Parser[Token, str]: 103 | return tok("op", s) 104 | 105 | def n(s: str) -> Parser[Token, Text]: 106 | return tok("name", s) 107 | 108 | def make_array( 109 | values: Optional[Tuple[JsonValue, List[JsonValue]]] 110 | ) -> List[JsonValue]: 111 | if values is None: 112 | return [] 113 | else: 114 | return [values[0]] + values[1] 115 | 116 | def make_object( 117 | values: Optional[Tuple[JsonMember, List[JsonMember]]] 118 | ) -> Dict[str, Any]: 119 | if values is None: 120 | return {} 121 | else: 122 | first, rest = values 123 | k, v = first 124 | d = {k: v} 125 | d.update(rest) 126 | return d 127 | 128 | def make_number(s: str) -> Union[int, float]: 129 | try: 130 | return int(s) 131 | except ValueError: 132 | return float(s) 133 | 134 | def unescape(s: str) -> str: 135 | std = { 136 | '"': '"', 137 | "\\": "\\", 138 | "/": "/", 139 | "b": "\b", 140 | "f": "\f", 141 | "n": "\n", 142 | "r": "\r", 143 | "t": "\t", 144 | } 145 | 146 | def sub(m: Match[str]) -> str: 147 | if m.group("standard") is not None: 148 | return std[m.group("standard")] 149 | else: 150 | return chr(int(m.group("unicode"), 16)) 151 | 152 | return re_esc.sub(sub, s) 153 | 154 | def make_string(s: str) -> str: 155 | return unescape(s[1:-1]) 156 | 157 | def make_member(values: JsonMember) -> JsonMember: 158 | k, v = values 159 | return k, v 160 | 161 | null = n("null") >> const(None) 162 | true = n("true") >>
const(True) 163 | false = n("false") >> const(False) 164 | number = tok("number") >> make_number 165 | string = tok("string") >> make_string 166 | value: Parser[Token, JsonValue] = forward_decl().named("json_value") 167 | member = string + -op(":") + value >> make_member 168 | json_object = ( 169 | (-op("{") + maybe(member + many(-op(",") + member)) + -op("}")) >> make_object 170 | ).named("json_object") 171 | json_array = ( 172 | (-op("[") + maybe(value + many(-op(",") + value)) + -op("]")) >> make_array 173 | ).named("json_array") 174 | value.define(null | true | false | json_object | json_array | number | string) 175 | json_text = value + -finished 176 | 177 | return json_text.parse(tokens) 178 | 179 | 180 | def loads(s: str) -> JsonValue: 181 | return parse(tokenize(s)) 182 | 183 | 184 | def main() -> None: 185 | try: 186 | text = sys.stdin.read() 187 | tree = loads(text) 188 | print(pformat(tree)) 189 | except (NoParseError, LexerError) as e: 190 | print("syntax error: %s" % e, file=sys.stderr) 191 | sys.exit(1) 192 | 193 | 194 | if __name__ == "__main__": 195 | main() 196 | -------------------------------------------------------------------------------- /tests/test_dot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from typing import Optional 5 | 6 | from funcparserlib.parser import NoParseError 7 | from funcparserlib.lexer import LexerError 8 | from .dot import parse, tokenize, Graph, Edge, SubGraph, DefAttrs, Attr, Node 9 | 10 | 11 | class DotTest(unittest.TestCase): 12 | def t(self, data: str, expected: Optional[Graph] = None) -> None: 13 | self.assertEqual(parse(tokenize(data)), expected) 14 | 15 | def test_comments(self) -> None: 16 | self.t( 17 | """ 18 | /* комм 1 */ 19 | graph /* комм 4 */ g1 { 20 | // комм 2 /* комм 3 */ 21 | } 22 | // комм 5 23 | """, 24 | Graph(strict=None, type="graph", id="g1", stmts=[]), 25 | ) 26 | 27 | def test_connected_subgraph(self) -> None: 28 | self.t( 29 | """ 30 | digraph g1 { 31 | n1 -> n2 -> 32 | subgraph n3 { 33 | nn1 -> nn2 -> nn3; 34 | nn3 -> nn1; 35 | }; 36 | subgraph n3 {} -> n1; 37 | } 38 | """, 39 | Graph( 40 | strict=None, 41 | type="digraph", 42 | id="g1", 43 | stmts=[ 44 | Edge( 45 | nodes=[ 46 | "n1", 47 | "n2", 48 | SubGraph( 49 | id="n3", 50 | stmts=[ 51 | Edge(nodes=["nn1", "nn2", "nn3"], attrs=[]), 52 | Edge(nodes=["nn3", "nn1"], attrs=[]), 53 | ], 54 | ), 55 | ], 56 | attrs=[], 57 | ), 58 | Edge(nodes=[SubGraph(id="n3", stmts=[]), "n1"], attrs=[]), 59 | ], 60 | ), 61 | ) 62 | 63 | def test_default_attrs(self) -> None: 64 | self.t( 65 | """ 66 | digraph g1 { 67 | page="3,3"; 68 | graph [rotate=90]; 69 | node [shape=box, color="#0000ff"]; 70 | edge [style=dashed]; 71 | n1 -> n2 -> n3; 72 | n3 -> n1; 73 | } 74 | """, 75 | Graph( 76 | strict=None, 77 | type="digraph", 78 | id="g1", 79 | stmts=[ 80 | DefAttrs(object="graph", attrs=[Attr(name="page", value='"3,3"')]), 81 | DefAttrs(object="graph", attrs=[Attr(name="rotate", value="90")]), 82 | DefAttrs( 83 | object="node", 84 | attrs=[ 85 | Attr(name="shape", value="box"), 86 | Attr(name="color", value='"#0000ff"'), 87 | ], 88 | ), 89 | DefAttrs(object="edge", attrs=[Attr(name="style", value="dashed")]), 90 | Edge(nodes=["n1", "n2", "n3"], attrs=[]), 91 | Edge(nodes=["n3", "n1"], attrs=[]), 92 | ], 93 | ), 94 | ) 95 | 96 | def test_empty_graph(self) -> None: 97 | self.t( 98 | """ 99 | graph g1 {} 100 | """, 101 | Graph(strict=None, type="graph", id="g1", stmts=[]), 102 | ) 103 | 104 | def 
test_few_attrs(self) -> None: 105 | self.t( 106 | """ 107 | digraph g1 { 108 | n1 [attr1, attr2 = value2]; 109 | } 110 | """, 111 | Graph( 112 | strict=None, 113 | type="digraph", 114 | id="g1", 115 | stmts=[ 116 | Node( 117 | id="n1", 118 | attrs=[ 119 | Attr(name="attr1", value=None), 120 | Attr(name="attr2", value="value2"), 121 | ], 122 | ) 123 | ], 124 | ), 125 | ) 126 | 127 | def test_few_nodes(self) -> None: 128 | self.t( 129 | """ 130 | graph g1 { 131 | n1; 132 | n2; 133 | n3 134 | } 135 | """, 136 | Graph( 137 | strict=None, 138 | type="graph", 139 | id="g1", 140 | stmts=[ 141 | Node(id="n1", attrs=[]), 142 | Node(id="n2", attrs=[]), 143 | Node(id="n3", attrs=[]), 144 | ], 145 | ), 146 | ) 147 | 148 | def test_illegal_comma(self) -> None: 149 | try: 150 | self.t( 151 | """ 152 | graph g1 { 153 | n1; 154 | n2; 155 | n3, 156 | } 157 | """ 158 | ) 159 | except NoParseError: 160 | pass 161 | else: 162 | self.fail("must raise NoParseError") 163 | 164 | def test_null(self) -> None: 165 | try: 166 | self.t("") 167 | except NoParseError: 168 | pass 169 | else: 170 | self.fail("must raise NoParseError") 171 | 172 | def test_simple_cycle(self) -> None: 173 | self.t( 174 | """ 175 | digraph g1 { 176 | n1 -> n2 [w=5]; 177 | n2 -> n3 [w=10]; 178 | n3 -> n1 [w=7]; 179 | } 180 | """, 181 | Graph( 182 | strict=None, 183 | type="digraph", 184 | id="g1", 185 | stmts=[ 186 | Edge(nodes=["n1", "n2"], attrs=[Attr(name="w", value="5")]), 187 | Edge(nodes=["n2", "n3"], attrs=[Attr(name="w", value="10")]), 188 | Edge(nodes=["n3", "n1"], attrs=[Attr(name="w", value="7")]), 189 | ], 190 | ), 191 | ) 192 | 193 | def test_single_unicode_char(self) -> None: 194 | try: 195 | self.t("ф") 196 | except LexerError: 197 | pass 198 | else: 199 | self.fail("must raise LexerError") 200 | 201 | def test_unicode_names(self) -> None: 202 | self.t( 203 | """ 204 | digraph g1 { 205 | n1 -> "Медведь" [label="Поехали!"]; 206 | "Медведь" -> n3 [label="Добро пожаловать!"]; 207 | n3 -> n1 ["Водка"="Селёдка"]; 208 | } 209 | """, 210 | Graph( 211 | strict=None, 212 | type="digraph", 213 | id="g1", 214 | stmts=[ 215 | Edge( 216 | nodes=["n1", '"Медведь"'], 217 | attrs=[Attr(name="label", value='"Поехали!"')], 218 | ), 219 | Edge( 220 | nodes=['"Медведь"', "n3"], 221 | attrs=[Attr(name="label", value='"Добро пожаловать!"')], 222 | ), 223 | Edge( 224 | nodes=["n3", "n1"], 225 | attrs=[Attr(name='"Водка"', value='"Селёдка"')], 226 | ), 227 | ], 228 | ), 229 | ) 230 | -------------------------------------------------------------------------------- /tests/test_json.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from typing import Optional 5 | 6 | from funcparserlib.parser import NoParseError 7 | from funcparserlib.lexer import LexerError 8 | from . 
import json 9 | 10 | 11 | class JsonTest(unittest.TestCase): 12 | def t(self, data: str, expected: Optional[object] = None) -> None: 13 | self.assertEqual(json.loads(data), expected) 14 | 15 | def test_1_array(self) -> None: 16 | self.t("[1]", [1]) 17 | 18 | def test_1_object(self) -> None: 19 | self.t('{"foo": "bar"}', {"foo": "bar"}) 20 | 21 | def test_bool_and_null(self) -> None: 22 | self.t("[null, true, false]", [None, True, False]) 23 | 24 | def test_empty_array(self) -> None: 25 | self.t("[]", []) 26 | 27 | def test_empty_object(self) -> None: 28 | self.t("{}", {}) 29 | 30 | def test_many_array(self) -> None: 31 | self.t("[1, 2, [3, 4, 5], 6]", [1, 2, [3, 4, 5], 6]) 32 | 33 | def test_many_object(self) -> None: 34 | # noinspection SpellCheckingInspection 35 | self.t( 36 | """ 37 | { 38 | "foo": 1, 39 | "bar": 40 | { 41 | "baz": 2, 42 | "quux": [true, false], 43 | "{}": {} 44 | }, 45 | "spam": "eggs" 46 | } 47 | """, 48 | { 49 | "foo": 1, 50 | "bar": { 51 | "baz": 2, 52 | "quux": [True, False], 53 | "{}": {}, 54 | }, 55 | "spam": "eggs", 56 | }, 57 | ) 58 | 59 | def test_null(self) -> None: 60 | try: 61 | self.t("") 62 | except NoParseError: 63 | pass 64 | else: 65 | self.fail("must raise NoParseError") 66 | 67 | def test_numbers(self) -> None: 68 | self.t( 69 | """\ 70 | [ 71 | 0, 1, -1, 14, -14, 65536, 72 | 0.0, 3.14, -3.14, -123.456, 73 | 6.67428e-11, -1.602176e-19, 6.67428E-11 74 | ] 75 | """, 76 | [ 77 | 0, 78 | 1, 79 | -1, 80 | 14, 81 | -14, 82 | 65536, 83 | 0.0, 84 | 3.14, 85 | -3.14, 86 | -123.456, 87 | 6.67428e-11, 88 | -1.602176e-19, 89 | 6.67428e-11, 90 | ], 91 | ) 92 | 93 | def test_strings(self) -> None: 94 | # noinspection SpellCheckingInspection 95 | self.t( 96 | r""" 97 | [ 98 | ["", "hello", "hello world!"], 99 | ["привет, мир!", "λx.x"], 100 | ["\"", "\\", "\/", "\b", "\f", "\n", "\r", "\t"], 101 | ["\u0000", "\u03bb", "\uffff", "\uFFFF"], 102 | ["вот функция идентичности:\nλx.x\nили так:\n\u03bbx.x"] 103 | ] 104 | """, 105 | [ 106 | ["", "hello", "hello world!"], 107 | ["привет, мир!", "λx.x"], 108 | ['"', "\\", "/", "\x08", "\x0c", "\n", "\r", "\t"], 109 | ["\u0000", "\u03bb", "\uffff", "\uffff"], 110 | ["вот функция идентичности:\nλx.x\nили так:\n\u03bbx.x"], 111 | ], 112 | ) 113 | 114 | def test_toplevel_string(self) -> None: 115 | try: 116 | self.t("неправильно") 117 | except LexerError: 118 | pass 119 | else: 120 | self.fail("must raise LexerError") 121 | -------------------------------------------------------------------------------- /tests/test_parsing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from typing import Optional, Tuple 5 | 6 | from funcparserlib.lexer import TokenSpec, make_tokenizer, LexerError, Token 7 | from funcparserlib.parser import ( 8 | a, 9 | many, 10 | NoParseError, 11 | oneplus, 12 | Parser, 13 | maybe, 14 | _Ignored, # noqa 15 | tok, 16 | finished, 17 | forward_decl, 18 | some, 19 | ) 20 | 21 | 22 | class ParsingTest(unittest.TestCase): 23 | def test_oneplus(self) -> None: 24 | x = a("x") 25 | y = a("y") 26 | expr = oneplus(x + y) 27 | # noinspection SpellCheckingInspection 28 | self.assertEqual(expr.parse("xyxyxy"), ([("x", "y"), ("x", "y"), ("x", "y")])) 29 | 30 | # Issue 31 31 | def test_many_backtracking(self) -> None: 32 | x = a("x") 33 | y = a("y") 34 | expr = many(x + y) + x + x 35 | # noinspection SpellCheckingInspection 36 | self.assertEqual(expr.parse("xyxyxx"), ([("x", "y"), ("x", "y")], "x", "x")) 37 | 38 | # Issue 14 39 | def 
test_error_info(self) -> None: 40 | tokenize = make_tokenizer( 41 | [ 42 | TokenSpec("keyword", r"\b(is|end)\b"), 43 | TokenSpec("id", r"[a-z_]+"), 44 | ("space", (r"[ \t]+",)), # Legacy token spec 45 | TokenSpec("nl", r"[\n\r]+"), 46 | ] 47 | ) 48 | with self.assertRaises(LexerError) as ctx: 49 | list(tokenize("f is ф")) 50 | self.assertEqual(str(ctx.exception), 'cannot tokenize data: 1,6: "f is \u0444"') 51 | 52 | def make_equality(values: Tuple[str, str]) -> Tuple[str, str]: 53 | v1, v2 = values 54 | return v1, v2 55 | 56 | tok_id = tok("id") 57 | equality = tok_id + -tok("keyword", "is") + tok_id >> make_equality 58 | expr = equality + -tok("nl") 59 | file = many(expr) + tok("keyword", "end") 60 | 61 | msg = """\ 62 | spam is eggs 63 | foo is_not bar 64 | end""" 65 | tokens = [x for x in tokenize(msg) if x.type != "space"] 66 | with self.assertRaises(NoParseError) as ctx2: 67 | file.parse(tokens) 68 | self.assertEqual(ctx2.exception.state.pos, 4) 69 | self.assertEqual(ctx2.exception.state.max, 5) 70 | # May raise KeyError 71 | t = tokens[ctx2.exception.state.max] 72 | self.assertEqual(t, Token("id", "is_not")) 73 | self.assertEqual((t.start, t.end), ((2, 5), (2, 10))) 74 | self.assertEqual( 75 | ctx2.exception.msg, 76 | "2,5-2,10: got unexpected token: 'is_not', expected: 'is'", 77 | ) 78 | 79 | def test_ok_ignored(self) -> None: 80 | x = a("x") 81 | y = a("y") 82 | expr: Parser[str, str] = -x + y 83 | self.assertEqual(expr.parse("xy"), "y") 84 | 85 | def test_ignored_ok(self) -> None: 86 | x = a("x") 87 | y = a("y") 88 | expr: Parser[str, str] = x + -y 89 | self.assertEqual(expr.parse("xy"), "x") 90 | 91 | def test_ignored_ok_ok(self) -> None: 92 | x = a("x") 93 | y = a("y") 94 | expr: Parser[str, Tuple[str, str]] = -x + y + x 95 | self.assertEqual(expr.parse("xyx"), ("y", "x")) 96 | 97 | def test_ok_ignored_ok(self) -> None: 98 | x = a("x") 99 | y = a("y") 100 | expr: Parser[str, Tuple[str, str]] = x + -y + x 101 | self.assertEqual(expr.parse("xyx"), ("x", "x")) 102 | 103 | def test_ok_ok_ok(self) -> None: 104 | x = a("x") 105 | y = a("y") 106 | expr: Parser[str, Tuple[str, str]] = x + y + x 107 | self.assertEqual(expr.parse("xyx"), ("x", "y", "x")) 108 | 109 | def test_ok_ok_ignored(self) -> None: 110 | x = a("x") 111 | y = a("y") 112 | expr: Parser[str, Tuple[str, str]] = x + y + -x 113 | self.assertEqual(expr.parse("xyx"), ("x", "y")) 114 | 115 | def test_ignored_ignored_ok(self) -> None: 116 | x = a("x") 117 | y = a("y") 118 | expr: Parser[str, str] = -x + -x + y 119 | self.assertEqual(expr.parse("xxy"), "y") 120 | 121 | def test_ok_ignored_ignored(self) -> None: 122 | x = a("x") 123 | y = a("y") 124 | expr: Parser[str, str] = x + -y + -y 125 | self.assertEqual(expr.parse("xyy"), "x") 126 | 127 | def test_ignored_ignored(self) -> None: 128 | x = a("x") 129 | y = a("y") 130 | expr: Parser[str, _Ignored] = -x + -y 131 | self.assertEqual(expr.parse("xy"), _Ignored("y")) 132 | 133 | def test_ignored_ignored_ignored(self) -> None: 134 | x = a("x") 135 | y = a("y") 136 | z = a("z") 137 | expr: Parser[str, _Ignored] = -x + -y + -z 138 | self.assertEqual(expr.parse("xyz"), _Ignored("z")) 139 | 140 | def test_ignored_maybe(self) -> None: 141 | x = a("x") 142 | y = a("y") 143 | expr: Parser[str, str] = -maybe(x) + y 144 | self.assertEqual(expr.parse("xy"), "y") 145 | self.assertEqual(expr.parse("y"), "y") 146 | 147 | def test_maybe_ignored(self) -> None: 148 | x = a("x") 149 | y = a("y") 150 | expr: Parser[str, Tuple[Optional[_Ignored], str]] = maybe(-x) + y 151 | 
self.assertEqual(expr.parse("xy"), (_Ignored("x"), "y")) 152 | self.assertEqual(expr.parse("y"), (None, "y")) 153 | 154 | def test_ignored_maybe_ignored(self) -> None: 155 | x = a("x") 156 | y = a("y") 157 | expr: Parser[str, Optional[str]] = -x + maybe(y) + -x 158 | self.assertEqual(expr.parse("xyx"), "y") 159 | self.assertEqual(expr.parse("xx"), None) 160 | 161 | def test_compare_token_with_none(self) -> None: 162 | # https://github.com/vlasovskikh/funcparserlib/pull/58 163 | specs = [ 164 | ("id", (r"\w+",)), 165 | ] 166 | tokenize = make_tokenizer(specs) 167 | tokens = list(tokenize("foo")) 168 | expr = maybe(a(None)) 169 | self.assertEqual(expr.parse(tokens), None) # type: ignore 170 | 171 | def test_seq_parse_error(self) -> None: 172 | expr = a("x") + a("y") 173 | with self.assertRaises(NoParseError) as ctx: 174 | expr.parse("xz") 175 | self.assertEqual(ctx.exception.msg, "got unexpected token: 'z', expected: 'y'") 176 | 177 | def test_alt_2_parse_error(self) -> None: 178 | expr = a("x") + (a("x") | a("y")) 179 | with self.assertRaises(NoParseError) as ctx: 180 | expr.parse("xz") 181 | self.assertEqual( 182 | ctx.exception.msg, "got unexpected token: 'z', expected: 'x' or 'y'" 183 | ) 184 | 185 | def test_alt_3_parse_error(self) -> None: 186 | expr = a("x") + (a("x") | a("y") | a("z")) 187 | with self.assertRaises(NoParseError) as ctx: 188 | expr.parse("xa") 189 | self.assertEqual( 190 | ctx.exception.msg, 191 | "got unexpected token: 'a', expected: 'x' or 'y' or 'z'", 192 | ) 193 | 194 | def test_alt_3_two_steps_parse_error(self) -> None: 195 | expr = a("x") + (a("x") | (a("y") + a("a"))) 196 | with self.assertRaises(NoParseError) as ctx: 197 | expr.parse("xyz") 198 | self.assertEqual(ctx.exception.msg, "got unexpected token: 'z', expected: 'a'") 199 | 200 | def test_expected_eof_error(self) -> None: 201 | expr = a("x") + finished 202 | with self.assertRaises(NoParseError) as ctx: 203 | expr.parse("xy") 204 | self.assertEqual( 205 | ctx.exception.msg, 206 | "got unexpected token: 'y', expected: end of input", 207 | ) 208 | 209 | def test_expected_second_in_sequence_error(self) -> None: 210 | expr = a("x") + a("y") 211 | with self.assertRaises(NoParseError) as ctx: 212 | expr.parse("xz") 213 | self.assertEqual(ctx.exception.msg, "got unexpected token: 'z', expected: 'y'") 214 | 215 | def test_forward_decl_nested_matching_error(self) -> None: 216 | expr = forward_decl() 217 | expr.define(a("x") + maybe(expr) + a("y")) 218 | with self.assertRaises(NoParseError) as ctx: 219 | expr.parse("xxy") 220 | self.assertEqual( 221 | ctx.exception.msg, "got unexpected end of input, expected: 'y'" 222 | ) 223 | 224 | def test_expected_token_type_error(self) -> None: 225 | expr = tok("number") 226 | with self.assertRaises(NoParseError) as ctx: 227 | expr.parse([Token("id", "x")]) 228 | self.assertEqual( 229 | ctx.exception.msg, "got unexpected token: 'x', expected: number" 230 | ) 231 | 232 | def test_expected_exact_token_error(self) -> None: 233 | expr = tok("operator", "=") 234 | with self.assertRaises(NoParseError) as ctx: 235 | expr.parse([Token("operator", "+")]) 236 | self.assertEqual(ctx.exception.msg, "got unexpected token: '+', expected: '='") 237 | 238 | def test_unexpected_eof(self) -> None: 239 | expr = (a("x") + a("y")) | a("z") 240 | with self.assertRaises(NoParseError) as ctx: 241 | expr.parse("x") 242 | self.assertEqual( 243 | ctx.exception.msg, "got unexpected end of input, expected: 'y'" 244 | ) 245 | 246 | def test_expected_transform_parsing_results_error(self) -> None: 247 | expr 
= (a("1") >> int) | a("2") 248 | with self.assertRaises(NoParseError) as ctx: 249 | expr.parse("x") 250 | self.assertEqual( 251 | ctx.exception.msg, "got unexpected token: 'x', expected: '1' or '2'" 252 | ) 253 | 254 | def test_expected_sequence_with_skipped_parts(self) -> None: 255 | expr = (-a("x") + a("y")) | a("z") 256 | with self.assertRaises(NoParseError) as ctx: 257 | expr.parse("b") 258 | self.assertEqual( 259 | ctx.exception.msg, 260 | "got unexpected token: 'b', expected: ('x', 'y') or 'z'", 261 | ) 262 | 263 | def test_expected_some_without_name(self) -> None: 264 | def lowercase(t: str) -> bool: 265 | return t.islower() 266 | 267 | expr = some(lowercase) 268 | with self.assertRaises(NoParseError) as ctx: 269 | expr.parse("A") 270 | self.assertEqual( 271 | ctx.exception.msg, "got unexpected token: 'A', expected: some(...)" 272 | ) 273 | 274 | def test_expected_forward_decl_without_name(self) -> None: 275 | nested = forward_decl() 276 | nested.define(-a("a") + maybe(nested) + -a("z")) 277 | expr = nested | a("x") 278 | with self.assertRaises(NoParseError) as ctx: 279 | expr.parse("y") 280 | self.assertEqual( 281 | ctx.exception.msg, 282 | "got unexpected token: 'y', " 283 | "expected: (('a', [ forward_decl() ]), 'z') or 'x'", 284 | ) 285 | 286 | def test_expected_forward_decl_with_name(self) -> None: 287 | nested = forward_decl().named("nested") 288 | nested.define(-a("a") + maybe(nested) + -a("z")) 289 | expr = nested | a("x") 290 | with self.assertRaises(NoParseError) as ctx: 291 | expr.parse("y") 292 | self.assertEqual( 293 | ctx.exception.msg, 294 | "got unexpected token: 'y', expected: (('a', [ nested ]), 'z') or 'x'", 295 | ) 296 | 297 | def test_end_of_input_after_many_alternatives(self) -> None: 298 | brackets = a("[") + a("]") 299 | expr = many(a("x") | brackets) + finished 300 | with self.assertRaises(NoParseError) as ctx: 301 | expr.parse("[") 302 | self.assertEqual( 303 | ctx.exception.msg, "got unexpected end of input, expected: ']'" 304 | ) 305 | 306 | def test_parse_one_more_then_rollback_to_single(self) -> None: 307 | mul = a("x") + many(a("*") + a("y")) 308 | add = mul + many(a("+") + mul) 309 | expr = add + finished 310 | with self.assertRaises(NoParseError) as ctx: 311 | expr.parse("x*") 312 | self.assertEqual( 313 | ctx.exception.msg, "got unexpected end of input, expected: 'y'" 314 | ) 315 | 316 | def test_parse_one_more_then_rollback_to_alternative(self) -> None: 317 | mul = a("x") + many(a("*") + a("y")) 318 | addsub = mul + many((a("+") | a("-")) + mul) 319 | expr = addsub + finished 320 | with self.assertRaises(NoParseError) as ctx: 321 | expr.parse("x*") 322 | self.assertEqual( 323 | ctx.exception.msg, "got unexpected end of input, expected: 'y'" 324 | ) 325 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | isolated_build = true 3 | envlist = py{38,39,310,311,312} 4 | 5 | [testenv] 6 | commands = 7 | python -m unittest discover 8 | --------------------------------------------------------------------------------