├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   ├── feature_request.md
│   │   ├── other.md
│   │   └── question.md
│   └── workflows
│       ├── codecov.yml
│       ├── mypy.yml
│       └── tests.yml
├── .gitignore
├── .gitmodules
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs
│   ├── Makefile
│   ├── _static
│   │   ├── comparison_memory.png
│   │   ├── comparison_runtime.png
│   │   ├── lark_cheatsheet.pdf
│   │   └── sppf
│   │       ├── sppf.html
│   │       ├── sppf_111.svg
│   │       ├── sppf_abcd.svg
│   │       ├── sppf_abcd_noint.svg
│   │       └── sppf_cycle.svg
│   ├── classes.rst
│   ├── conf.py
│   ├── features.md
│   ├── forest.rst
│   ├── grammar.md
│   ├── how_to_develop.md
│   ├── how_to_use.md
│   ├── ide
│   │   ├── app.html
│   │   ├── app.js
│   │   ├── app
│   │   │   ├── app.py
│   │   │   ├── core.py
│   │   │   ├── examples.py
│   │   │   ├── ext.py
│   │   │   ├── files.json
│   │   │   ├── html5.py
│   │   │   ├── ignite.py
│   │   │   └── utils.py
│   │   ├── is-loading.gif
│   │   └── lark-logo.png
│   ├── index.rst
│   ├── json_tutorial.md
│   ├── make.bat
│   ├── parsers.md
│   ├── philosophy.md
│   ├── recipes.md
│   ├── requirements.txt
│   ├── tools.md
│   ├── tree_construction.md
│   └── visitors.rst
├── examples
│   ├── README.rst
│   ├── __init__.py
│   ├── advanced
│   │   ├── README.rst
│   │   ├── _json_parser.py
│   │   ├── conf_earley.py
│   │   ├── conf_lalr.py
│   │   ├── create_ast.py
│   │   ├── custom_lexer.py
│   │   ├── dynamic_complete.py
│   │   ├── error_handling.py
│   │   ├── error_reporting_earley.py
│   │   ├── error_reporting_lalr.py
│   │   ├── prioritizer.py
│   │   ├── py3to2.py
│   │   ├── python2.lark
│   │   ├── python_parser.py
│   │   ├── qscintilla_json.py
│   │   ├── reconstruct_json.py
│   │   ├── reconstruct_python.py
│   │   ├── template_lark.lark
│   │   ├── templates.py
│   │   └── tree_forest_transformer.py
│   ├── calc.py
│   ├── composition
│   │   ├── README.rst
│   │   ├── combined_csv_and_json.txt
│   │   ├── csv.lark
│   │   ├── eval_csv.py
│   │   ├── eval_json.py
│   │   ├── json.lark
│   │   ├── main.py
│   │   └── storage.lark
│   ├── fruitflies.png
│   ├── fruitflies.py
│   ├── grammars
│   │   ├── README.rst
│   │   └── verilog.lark
│   ├── indented_tree.py
│   ├── json_parser.py
│   ├── lark_grammar.py
│   ├── relative-imports
│   │   ├── multiple2.lark
│   │   ├── multiple3.lark
│   │   ├── multiples.lark
│   │   └── multiples.py
│   ├── standalone
│   │   ├── README.rst
│   │   ├── create_standalone.sh
│   │   ├── json.lark
│   │   └── json_parser_main.py
│   ├── tests
│   │   ├── negative_priority.lark
│   │   └── no_newline_at_end.lark
│   └── turtle_dsl.py
├── lark
│   ├── __init__.py
│   ├── __pyinstaller
│   │   ├── __init__.py
│   │   └── hook-lark.py
│   ├── ast_utils.py
│   ├── common.py
│   ├── exceptions.py
│   ├── grammar.py
│   ├── grammars
│   │   ├── __init__.py
│   │   ├── common.lark
│   │   ├── lark.lark
│   │   ├── python.lark
│   │   └── unicode.lark
│   ├── indenter.py
│   ├── lark.py
│   ├── lexer.py
│   ├── load_grammar.py
│   ├── parse_tree_builder.py
│   ├── parser_frontends.py
│   ├── parsers
│   │   ├── __init__.py
│   │   ├── cyk.py
│   │   ├── earley.py
│   │   ├── earley_common.py
│   │   ├── earley_forest.py
│   │   ├── grammar_analysis.py
│   │   ├── lalr_analysis.py
│   │   ├── lalr_interactive_parser.py
│   │   ├── lalr_parser.py
│   │   ├── lalr_parser_state.py
│   │   └── xearley.py
│   ├── py.typed
│   ├── reconstruct.py
│   ├── tools
│   │   ├── __init__.py
│   │   ├── nearley.py
│   │   ├── serialize.py
│   │   └── standalone.py
│   ├── tree.py
│   ├── tree_matcher.py
│   ├── tree_templates.py
│   ├── utils.py
│   └── visitors.py
├── pyproject.toml
├── readthedocs.yml
├── test-requirements.txt
├── tests
│   ├── __init__.py
│   ├── __main__.py
│   ├── grammars
│   │   ├── ab.lark
│   │   ├── leading_underscore_grammar.lark
│   │   ├── templates.lark
│   │   ├── test.lark
│   │   ├── test_relative_import_of_nested_grammar.lark
│   │   ├── test_relative_import_of_nested_grammar__grammar_to_import.lark
│   │   ├── test_relative_import_of_nested_grammar__nested_grammar.lark
│   │   ├── test_unicode.lark
│   │   └── three_rules_using_same_token.lark
│   ├── test_cache.py
│   ├── test_grammar.py
│   ├── test_lexer.py
│   ├── test_logger.py
│   ├── test_nearley
│   │   ├── __init__.py
│   │   ├── grammars
│   │   │   ├── include_unicode.ne
│   │   │   └── unicode.ne
│   │   └── test_nearley.py
│   ├── test_parser.py
│   ├── test_pattern_matching.py
│   ├── test_python_grammar.py
│   ├── test_reconstructor.py
│   ├── test_relative_import.lark
│   ├── test_relative_import_preserves_leading_underscore.lark
│   ├── test_relative_import_rename.lark
│   ├── test_relative_import_rules_dependencies_imported_only_once.lark
│   ├── test_relative_import_unicode.lark
│   ├── test_relative_multi_import.lark
│   ├── test_relative_rule_import.lark
│   ├── test_relative_rule_import_drop_ignore.lark
│   ├── test_relative_rule_import_rename.lark
│   ├── test_relative_rule_import_subrule.lark
│   ├── test_relative_rule_import_subrule_no_conflict.lark
│   ├── test_templates_import.lark
│   ├── test_tools.py
│   ├── test_tree_forest_transformer.py
│   ├── test_tree_templates.py
│   └── test_trees.py
└── tox.ini
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: lark-parser
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 |
12 | A clear and concise description of what the bug is, and what you expected to happen.
13 |
14 | **To Reproduce**
15 |
16 | Provide a short script that reproduces the erroneous behavior.
17 |
18 | If that is impossible, provide clear steps to reproduce the behavior.
19 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Suggestion**
11 | Provide a clear and concise description of what the problem is, and what you would like to happen.
12 |
13 | **Describe alternatives you've considered**
14 | A clear and concise description of any alternative solutions or features you've considered.
15 |
16 | **Additional context**
17 | Add any other context or screenshots about the feature request here.
18 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/other.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Other
3 | about: For any discussion that doesn't fit the templates
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/question.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Question
3 | about: Ask a question about Lark or request help
4 | title: ''
5 | labels: question
6 | assignees: ''
7 |
8 | ---
9 |
10 | **What is your question?**
11 |
12 | Try to be accurate and concise.
13 |
14 | **If you're having trouble with your code or grammar**
15 |
16 | Provide a small script that encapsulates your issue.
17 |
18 | Explain what you're trying to do, and what is obstructing your progress.
19 |
--------------------------------------------------------------------------------
/.github/workflows/codecov.yml:
--------------------------------------------------------------------------------
1 | name: Compute coverage and push to Codecov
2 | on: [push]
3 | jobs:
4 | run:
5 | runs-on: ${{ matrix.os }}
6 | strategy:
7 | matrix:
8 | os: [ubuntu-latest, macos-latest, windows-latest]
9 | env:
10 | OS: ${{ matrix.os }}
11 | PYTHON: '3.8'
12 | steps:
13 | - uses: actions/checkout@v3
14 | name: Download with submodules
15 | with:
16 | submodules: recursive
17 | - name: Setup Python
18 | uses: actions/setup-python@v3
19 | with:
20 | python-version: "3.8"
21 | - name: Install dependencies
22 | run: |
23 | python -m pip install --upgrade pip
24 | pip install -r test-requirements.txt
25 | - name: Generate coverage report
26 | run: |
27 | pip install pytest
28 | pip install pytest-cov
29 | pytest --cov=./ --cov-report=xml
30 | - name: Upload coverage to Codecov
31 | uses: codecov/codecov-action@v1
32 | with:
33 | token: ${{ secrets.CODECOV_TOKEN }}
34 | files: ./coverage.xml
35 | flags: unittests
36 | env_vars: OS,PYTHON
37 | name: codecov-umbrella
38 | fail_ci_if_error: false
39 | path_to_write_report: ./coverage/codecov_report.txt
40 | verbose: true
41 |
--------------------------------------------------------------------------------
/.github/workflows/mypy.yml:
--------------------------------------------------------------------------------
1 | name: Python type check
2 | on: [push, pull_request]
3 | jobs:
4 | type:
5 | runs-on: ubuntu-latest
6 | steps:
7 | - uses: actions/checkout@v3
8 | with:
9 | submodules: recursive
10 | - name: Lint with mypy
11 | run: pipx run tox -e type
12 |
13 | pre-commit:
14 | name: Format
15 | runs-on: ubuntu-latest
16 | steps:
17 | - uses: actions/checkout@v3
18 | - uses: actions/setup-python@v3
19 | - uses: pre-commit/action@v3.0.1
20 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 | on: [push, pull_request]
3 |
4 | jobs:
5 | build:
6 | runs-on: ubuntu-latest
7 | strategy:
8 | matrix:
9 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev", "pypy-3.10"]
10 |
11 | steps:
12 | - uses: actions/checkout@v3
13 | with:
14 | submodules: recursive
15 | - name: Set up Python ${{ matrix.python-version }}
16 | uses: actions/setup-python@v4
17 | with:
18 | python-version: ${{ matrix.python-version }}
19 | - name: Install dependencies
20 | run: |
21 | python -m pip install --upgrade pip
22 | pip install -r test-requirements.txt
23 | - name: Run tests
24 | run: |
25 | python -m tests
26 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.pyo
3 | /.tox
4 | /lark.egg-info/**
5 | /lark_parser.egg-info/**
6 | tags
7 | .vscode
8 | .idea
9 | .ropeproject
10 | .cache
11 | .mypy_cache
12 | /dist
13 | /build
14 | docs/_build
15 | docs/examples
16 | docs/sg_execution_times.rst
17 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tests/test_nearley/nearley"]
2 | path = tests/test_nearley/nearley
3 | url = https://github.com/Hardmath123/nearley
4 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # To use:
2 | #
3 | # pre-commit run -a
4 | #
5 | # Or:
6 | #
7 | # pre-commit install # (runs every time you commit in git)
8 | #
9 | # To update this file:
10 | #
11 | # pre-commit autoupdate
12 | #
13 | # See https://github.com/pre-commit/pre-commit
14 |
15 | repos:
16 | # Standard hooks
17 | - repo: https://github.com/pre-commit/pre-commit-hooks
18 | rev: "v4.4.0"
19 | hooks:
20 | - id: check-added-large-files
21 | - id: check-case-conflict
22 | - id: check-merge-conflict
23 | - id: check-symlinks
24 | - id: check-toml
25 | - id: check-yaml
26 | - id: debug-statements
27 | - id: end-of-file-fixer
28 | exclude: '(^tests/.*\.lark|\.svg)$'
29 | - id: mixed-line-ending
30 | - id: requirements-txt-fixer
31 | - id: trailing-whitespace
32 | exclude: '(^tests/.*\.lark|\.svg)$'
33 |
34 | - repo: https://github.com/codespell-project/codespell
35 | rev: v2.2.2
36 | hooks:
37 | - id: codespell
38 | args: ["-L", "nd,iif,ot,datas"]
39 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | v1.0
2 |
3 | - `maybe_placeholders` is now True by default
4 |
5 | - Renamed TraditionalLexer to BasicLexer, and 'standard' lexer option to 'basic'
6 |
7 | - Default priority is now 0, for both terminals and rules (used to be 1 for terminals)
8 |
9 | - Discard mechanism is now done by returning Discard, instead of raising it as an exception.
10 |
11 | - `use_accepts` in `UnexpectedInput.match_examples()` is now True by default
12 |
13 | - `v_args(meta=True)` now gives meta as the first argument. i.e. `(meta, children)`
14 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright © 2017 Erez Shinan
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | the Software, and to permit persons to whom the Software is furnished to do so,
8 | subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md LICENSE docs/* examples/*.py examples/*.png examples/*.lark tests/*.py tests/*.lark tests/grammars/* tests/test_nearley/*.py tests/test_nearley/grammars/*
2 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = Lark
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/_static/comparison_memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lark-parser/lark/87bb8efe0d425187159b39fd788640da33d1878e/docs/_static/comparison_memory.png
--------------------------------------------------------------------------------
/docs/_static/comparison_runtime.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lark-parser/lark/87bb8efe0d425187159b39fd788640da33d1878e/docs/_static/comparison_runtime.png
--------------------------------------------------------------------------------
/docs/_static/lark_cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lark-parser/lark/87bb8efe0d425187159b39fd788640da33d1878e/docs/_static/lark_cheatsheet.pdf
--------------------------------------------------------------------------------
/docs/classes.rst:
--------------------------------------------------------------------------------
1 | API Reference
2 | =============
3 |
4 | Lark
5 | ----
6 |
7 | .. autoclass:: lark.Lark
8 | :members: open, parse, parse_interactive, lex, save, load, get_terminal, open_from_package
9 |
10 |
11 | Using Unicode character classes with ``regex``
12 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
13 |
14 | Python's builtin ``re`` module has a few persistent known bugs and also won't parse
15 | advanced regex features such as character classes.
16 | With ``pip install lark[regex]``, the ``regex`` module will be
17 | installed alongside lark and can act as a drop-in replacement to ``re``.
18 |
19 | Any instance of Lark instantiated with ``regex=True`` will use the ``regex`` module instead of ``re``.
20 |
21 | For example, we can use character classes to match PEP-3131 compliant Python identifiers:
22 |
23 | ::
24 |
25 | >>> from lark import Lark
26 | >>> g = Lark(r"""
27 | ?start: NAME
28 | NAME: ID_START ID_CONTINUE*
29 | ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
30 | ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
31 | """, regex=True)
32 |
33 | >>> g.parse('வணக்கம்')
34 | 'வணக்கம்'
35 |
36 |
37 | Tree
38 | ----
39 |
40 | .. autoclass:: lark.Tree
41 | :members: pretty, find_pred, find_data, iter_subtrees, scan_values,
42 | iter_subtrees_topdown, __rich__
43 |
44 | Token
45 | -----
46 |
47 | .. autoclass:: lark.Token
48 |
49 | Transformer, Visitor & Interpreter
50 | ----------------------------------
51 |
52 | See :doc:`visitors`.
53 |
54 | ForestVisitor, ForestTransformer, & TreeForestTransformer
55 | -----------------------------------------------------------
56 |
57 | See :doc:`forest`.
58 |
59 | UnexpectedInput
60 | ---------------
61 |
62 | .. autoclass:: lark.exceptions.UnexpectedInput
63 | :members: get_context, match_examples
64 |
65 | .. autoclass:: lark.exceptions.UnexpectedToken
66 |
67 | .. autoclass:: lark.exceptions.UnexpectedCharacters
68 |
69 | .. autoclass:: lark.exceptions.UnexpectedEOF
70 |
71 | InteractiveParser
72 | -----------------
73 |
74 | .. autoclass:: lark.parsers.lalr_interactive_parser.InteractiveParser
75 | :members: choices, feed_token, copy, pretty, resume_parse, exhaust_lexer, accepts, as_immutable
76 |
77 | .. autoclass:: lark.parsers.lalr_interactive_parser.ImmutableInteractiveParser
78 | :members: choices, feed_token, copy, pretty, resume_parse, exhaust_lexer, accepts, as_mutable
79 |
80 |
81 | ast_utils
82 | ---------
83 |
84 | For an example of using ``ast_utils``, see `/examples/advanced/create_ast.py`_
85 |
86 | .. autoclass:: lark.ast_utils.Ast
87 |
88 | .. autoclass:: lark.ast_utils.AsList
89 |
90 | .. autofunction:: lark.ast_utils.create_transformer
91 |
92 | .. _/examples/advanced/create_ast.py: examples/advanced/create_ast.html
93 |
94 | Indenter
95 | --------
96 |
97 | .. autoclass:: lark.indenter.Indenter
98 | .. autoclass:: lark.indenter.PythonIndenter
99 |
100 | TextSlice
101 | ---------
102 |
103 | .. autoclass:: lark.utils.TextSlice
104 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Lark documentation build configuration file, created by
5 | # sphinx-quickstart on Sun Aug 16 13:09:41 2020.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | #
20 | import os
21 | import sys
22 | sys.path.insert(0, os.path.abspath('..'))
23 | autodoc_member_order = 'bysource'
24 |
25 |
26 | # -- General configuration ------------------------------------------------
27 |
28 | # If your documentation needs a minimal Sphinx version, state it here.
29 | #
30 | # needs_sphinx = '1.0'
31 |
32 | # Add any Sphinx extension module names here, as strings. They can be
33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
34 | # ones.
35 | extensions = [
36 | 'sphinx.ext.autodoc',
37 | 'sphinx.ext.napoleon',
38 | 'sphinx.ext.coverage',
39 | 'recommonmark',
40 | 'sphinx_markdown_tables',
41 | 'sphinx_gallery.gen_gallery'
42 | ]
43 |
44 | # Add any paths that contain templates here, relative to this directory.
45 | templates_path = ['_templates']
46 |
47 | # The suffix(es) of source filenames.
48 | # You can specify multiple suffix as a list of string:
49 | #
50 | # source_suffix = ['.rst', '.md']
51 | source_suffix = {
52 | '.rst': 'restructuredtext',
53 | '.md': 'markdown'
54 | }
55 |
56 |
57 | # The master toctree document.
58 | master_doc = 'index'
59 |
60 | # General information about the project.
61 | project = 'Lark'
62 | copyright = '2020, Erez Shinan'
63 | author = 'Erez Shinan'
64 |
65 | # The version info for the project you're documenting, acts as replacement for
66 | # |version| and |release|, also used in various other places throughout the
67 | # built documents.
68 | #
69 | # The short X.Y version.
70 | version = ''
71 | # The full version, including alpha/beta/rc tags.
72 | release = ''
73 |
74 | # The language for content autogenerated by Sphinx. Refer to documentation
75 | # for a list of supported languages.
76 | #
77 | # This is also used if you do content translation via gettext catalogs.
78 | # Usually you set "language" from the command line for these cases.
79 | language = 'en'
80 |
81 | # List of patterns, relative to source directory, that match files and
82 | # directories to ignore when looking for source files.
83 | # These patterns also affect html_static_path and html_extra_path
84 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
85 |
86 | # The name of the Pygments (syntax highlighting) style to use.
87 | pygments_style = 'sphinx'
88 |
89 | # If true, `todo` and `todoList` produce output, else they produce nothing.
90 | todo_include_todos = False
91 |
92 |
93 | # -- Options for HTML output ----------------------------------------------
94 |
95 | # The theme to use for HTML and HTML Help pages. See the documentation for
96 | # a list of builtin themes.
97 | #
98 | html_theme = 'sphinx_rtd_theme'
99 |
100 | # Theme options are theme-specific and customize the look and feel of a theme
101 | # further. For a list of options available for each theme, see the
102 | # documentation.
103 | #
104 | html_theme_options = {
105 | 'prev_next_buttons_location': 'both'
106 | }
107 |
108 | # Add any paths that contain custom static files (such as style sheets) here,
109 | # relative to this directory. They are copied after the builtin static files,
110 | # so a file named "default.css" will overwrite the builtin "default.css".
111 | html_static_path = ['_static']
112 |
113 | # Custom sidebar templates, must be a dictionary that maps document names
114 | # to template names.
115 | #
116 | # This is required for the alabaster theme
117 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
118 | html_sidebars = {
119 | '**': [
120 | 'relations.html', # needs 'show_related': True theme option to display
121 | 'searchbox.html',
122 | ]
123 | }
124 |
125 |
126 | # -- Options for HTMLHelp output ------------------------------------------
127 |
128 | # Output file base name for HTML help builder.
129 | htmlhelp_basename = 'Larkdoc'
130 |
131 |
132 | # -- Options for LaTeX output ---------------------------------------------
133 |
134 | latex_elements = {
135 | # The paper size ('letterpaper' or 'a4paper').
136 | #
137 | # 'papersize': 'letterpaper',
138 |
139 | # The font size ('10pt', '11pt' or '12pt').
140 | #
141 | # 'pointsize': '10pt',
142 |
143 | # Additional stuff for the LaTeX preamble.
144 | #
145 | # 'preamble': '',
146 |
147 | # Latex figure (float) alignment
148 | #
149 | # 'figure_align': 'htbp',
150 | }
151 |
152 | # Grouping the document tree into LaTeX files. List of tuples
153 | # (source start file, target name, title,
154 | # author, documentclass [howto, manual, or own class]).
155 | latex_documents = [
156 | (master_doc, 'Lark.tex', 'Lark Documentation',
157 | 'Erez Shinan', 'manual'),
158 | ]
159 |
160 |
161 | # -- Options for manual page output ---------------------------------------
162 |
163 | # One entry per manual page. List of tuples
164 | # (source start file, name, description, authors, manual section).
165 | man_pages = [
166 | (master_doc, 'lark', 'Lark Documentation',
167 | [author], 7)
168 | ]
169 |
170 |
171 | # -- Options for Texinfo output -------------------------------------------
172 |
173 | # Grouping the document tree into Texinfo files. List of tuples
174 | # (source start file, target name, title, author,
175 | # dir menu entry, description, category)
176 | texinfo_documents = [
177 | (master_doc, 'Lark', 'Lark Documentation',
178 | author, 'Lark', 'One line description of project.',
179 | 'Miscellaneous'),
180 | ]
181 |
182 | # -- Sphinx gallery config -------------------------------------------
183 |
184 | sphinx_gallery_conf = {
185 | 'examples_dirs': ['../examples'],
186 | 'gallery_dirs': ['examples'],
187 | }
188 |
--------------------------------------------------------------------------------
/docs/features.md:
--------------------------------------------------------------------------------
1 | # Features
2 |
3 | ## Main Features
4 | - Earley parser, capable of parsing any context-free grammar
5 | - Implements SPPF, for efficient parsing and storing of ambiguous grammars.
6 | - LALR(1) parser, limited in power of expression, but very efficient in space and performance (O(n)).
7 | - Implements a parse-aware lexer that provides a better power of expression than traditional LALR implementations (such as ply).
8 | - EBNF-inspired grammar, with extra features (See: [Grammar Reference](grammar.md))
9 | - Builds a parse-tree (AST) automagically based on the grammar (see the sketch below this list)
10 | - Stand-alone parser generator - create a small independent parser to embed in your project. ([read more](tools.html#stand-alone-parser))
11 | - Flexible error handling by using an interactive parser interface (LALR only)
12 | - Automatic line & column tracking (for both tokens and matched rules)
13 | - Automatic terminal collision resolution
14 | - Warns on regex collisions using the optional `interegular` library. ([read more](how_to_use.html#regex-collisions))
15 | - Grammar composition - Import terminals and rules from other grammars (see [example](https://github.com/lark-parser/lark/tree/master/examples/composition)).
16 | - Standard library of terminals (strings, numbers, names, etc.)
17 | - Unicode fully supported
18 | - Extensive test suite
19 | - Type annotations (MyPy support)
20 | - Pure-Python implementation
21 |
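For instance, tree construction requires no extra code; here is a minimal sketch (the grammar and input are illustrative, not taken from this repo):

```python
from lark import Lark

# The shape of the resulting tree comes straight from the grammar
parser = Lark(r"""
    start: greeting
    greeting: "hello" NAME
    NAME: /\w+/
    %ignore " "
""")

print(parser.parse("hello world").pretty())
```
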
22 | [Read more about the parsers](parsers.md)
23 |
24 | ## Extra features
25 | - Support for external regex module ([see here](classes.html#using-unicode-character-classes-with-regex))
26 | - Import grammars from Nearley.js ([read more](tools.html#importing-grammars-from-nearleyjs))
27 | - CYK parser
28 | - Visualize your parse trees as dot or png files ([see example](https://github.com/lark-parser/lark/blob/master/examples/fruitflies.py))
29 | - Automatic reconstruction of input from parse-tree (see [example](https://github.com/lark-parser/lark/blob/master/examples/advanced/reconstruct_json.py) and [another example](https://github.com/lark-parser/lark/blob/master/examples/advanced/reconstruct_python.py))
30 | - Use Lark grammars in [Julia](https://github.com/jamesrhester/Lerche.jl) and [Javascript](https://github.com/lark-parser/Lark.js).
31 |
--------------------------------------------------------------------------------
/docs/forest.rst:
--------------------------------------------------------------------------------
1 | Working with the SPPF
2 | =====================
3 |
4 | When parsing with Earley, Lark provides the ``ambiguity='forest'`` option
5 | to obtain the shared packed parse forest (SPPF) produced by the parser, as
6 | an alternative to having it automatically converted to a tree.
7 |
8 | Lark provides a few tools to facilitate working with the SPPF. Here are some
9 | things to consider when deciding whether or not to use the SPPF.
10 |
11 | **Pros**
12 |
13 | - Efficient storage of highly ambiguous parses
14 | - Precise handling of ambiguities
15 | - Custom rule prioritizers
16 | - Ability to handle infinite ambiguities
17 | - Directly transform forest -> object instead of forest -> tree -> object
18 |
19 | **Cons**
20 |
21 | - More complex than working with a tree
22 | - SPPF may contain nodes corresponding to rules generated internally
23 | - Loss of Lark grammar features:
24 |
25 | - Rules starting with '_' are not inlined in the SPPF
26 | - Rules starting with '?' are never inlined in the SPPF
27 | - All tokens will appear in the SPPF
28 |
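For instance, a minimal sketch of obtaining and converting the forest (the
ambiguous grammar here is illustrative, not part of the library)::

    from lark import Lark
    from lark.parsers.earley_forest import TreeForestTransformer

    parser = Lark(r"""
        start: expr
        expr: expr "+" expr | NUMBER
        %import common.NUMBER
    """, parser='earley', ambiguity='forest')

    # The result is the root SymbolNode of the SPPF, not a Tree
    forest = parser.parse("1+2+3")

    # Convert the forest into a tree; by default, ambiguities are resolved
    tree = TreeForestTransformer().transform(forest)
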
29 | SymbolNode
30 | ----------
31 |
32 | .. autoclass:: lark.parsers.earley_forest.SymbolNode
33 | :members: is_ambiguous, children
34 |
35 | PackedNode
36 | ----------
37 |
38 | .. autoclass:: lark.parsers.earley_forest.PackedNode
39 | :members: children
40 |
41 | ForestVisitor
42 | -------------
43 |
44 | .. autoclass:: lark.parsers.earley_forest.ForestVisitor
45 | :members: visit, visit_symbol_node_in, visit_symbol_node_out,
46 | visit_packed_node_in, visit_packed_node_out,
47 | visit_token_node, on_cycle, get_cycle_in_path
48 |
49 | ForestTransformer
50 | -----------------
51 |
52 | .. autoclass:: lark.parsers.earley_forest.ForestTransformer
53 | :members: transform, transform_symbol_node, transform_intermediate_node,
54 | transform_packed_node, transform_token_node
55 |
56 | TreeForestTransformer
57 | ---------------------
58 |
59 | .. autoclass:: lark.parsers.earley_forest.TreeForestTransformer
60 | :members: __default__, __default_token__, __default_ambig__
61 |
62 | handles_ambiguity
63 | -----------------
64 |
65 | .. autofunction:: lark.parsers.earley_forest.handles_ambiguity
66 |
--------------------------------------------------------------------------------
/docs/how_to_develop.md:
--------------------------------------------------------------------------------
1 | # How to develop Lark - Guide
2 |
3 | There are many ways you can help the project:
4 |
5 | * Help solve issues
6 | * Improve the documentation
7 | * Write new grammars for Lark's library
8 | * Write a blog post introducing Lark to your audience
9 | * Port Lark to another language
10 | * Help with code development
11 |
12 | If you're interested in taking one of these on, contact us on [Gitter](https://gitter.im/lark-parser/Lobby) or [GitHub Discussions](https://github.com/lark-parser/lark/discussions), and we will provide more details and assist you in the process.
13 |
14 | ## Code Style
15 |
16 | Lark does not follow a predefined code style.
17 | We accept any code style that makes sense, as long as it's Pythonic and easy to read.
18 |
19 | ## Unit Tests
20 |
21 | Lark comes with an extensive set of tests. Many of the tests will run several times, once for each parser configuration.
22 |
23 | To run the tests, just go to the lark project root, and run the command:
24 | ```bash
25 | python -m tests
26 | ```
27 |
28 | or
29 |
30 | ```bash
31 | pypy -m tests
32 | ```
33 |
34 | For a list of supported interpreters, you can consult the `tox.ini` file.
35 |
36 | You can also run a single unittest using its class and method name, for example:
37 | ```bash
38 | ## test_package test_class_name.test_function_name
39 | python -m tests TestLalrBasic.test_keep_all_tokens
40 | ```
41 |
42 | ### tox
43 |
44 | To run all Unit Tests with tox,
45 | install tox and every supported Python interpreter, from Python 2.7 up to the latest (consult the file tox.ini).
46 | Then,
47 | run the command `tox` in the root of this project (where the main setup.py file is located).
48 |
49 | For example,
50 | if you would like to run the Unit Tests for Python 2.7 only,
51 | you can run the command `tox -e py27`.
52 |
53 | ### pytest
54 |
55 | You can also run the tests using pytest:
56 |
57 | ```bash
58 | pytest tests
59 | ```
60 |
61 | ### Using setup.py
62 |
63 | Another way to run the tests is using setup.py:
64 |
65 | ```bash
66 | python setup.py test
67 | ```
68 |
69 | ## Building the Documentation
70 |
71 | To build the documentation:
72 |
73 | ```sh
74 | cd docs/
75 | pip install -r requirements.txt
76 | make html
77 | ```
78 |
79 | To review the result, open the built HTML files under `_build/html/` in your browser.
80 |
--------------------------------------------------------------------------------
/docs/ide/app.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
98 |
99 |
100 |
101 |
102 |
--------------------------------------------------------------------------------
/docs/ide/app.js:
--------------------------------------------------------------------------------
1 | class app {
2 |
3 | constructor(modules, invocation){
4 | languagePluginLoader.then(() => {
5 | // If you don't require pre-loaded Python packages, remove this promise below.
6 | window.pyodide.runPythonAsync("import setuptools, micropip").then(()=>{
7 | window.pyodide.runPythonAsync("micropip.install('lark-parser')").then(()=>{
8 | this.fetchSources(modules).then(() => {
9 | window.pyodide.runPythonAsync("import " + Object.keys(modules).join("\nimport ") + "\n" + invocation + "\n").then(() => this.initializingComplete());
10 | });
11 | });
12 | });
13 | });
14 | }
15 |
16 | loadSources(module, baseURL, files) {
17 | let promises = [];
18 |
19 | for (let f in files) {
20 | promises.push(
21 | new Promise((resolve, reject) => {
22 | let file = files[f];
23 | let url = (baseURL ? baseURL + "/" : "") + file;
24 |
25 | fetch(url, {}).then((response) => {
26 | if (response.status === 200)
27 | return response.text().then((code) => {
28 | let path = ("/lib/python3.7/site-packages/" + module + "/" + file).split("/");
29 | let lookup = "";
30 |
31 | for (let i in path) {
32 | if (!path[i]) {
33 | continue;
34 | }
35 |
36 | lookup += (lookup ? "/" : "") + path[i];
37 |
38 | if (parseInt(i) === path.length - 1) {
39 | window.pyodide._module.FS.writeFile(lookup, code);
40 | console.debug(`fetched ${lookup}`);
41 | } else {
42 | try {
43 | window.pyodide._module.FS.lookupPath(lookup);
44 | } catch {
45 | window.pyodide._module.FS.mkdir(lookup);
46 | console.debug(`created ${lookup}`);
47 | }
48 | }
49 | }
50 |
51 | resolve();
52 | });
53 | else
54 | reject();
55 | });
56 | })
57 | );
58 | }
59 |
60 | return Promise.all(promises);
61 | }
62 |
63 | fetchSources(modules) {
64 | let promises = [];
65 |
66 | for( let module of Object.keys(modules) )
67 | {
68 | promises.push(
69 | new Promise((resolve, reject) => {
70 | fetch(`${modules[module]}/files.json`, {}).then((response) => {
71 | if (response.status === 200) {
72 | response.text().then((list) => {
73 | let files = JSON.parse(list);
74 |
75 | this.loadSources(module, modules[module], files).then(() => {
76 | resolve();
77 | })
78 | })
79 | } else {
80 | reject();
81 | }
82 | })
83 | }));
84 | }
85 |
86 | return Promise.all(promises).then(() => {
87 | for( let module of Object.keys(modules) ) {
88 | window.pyodide.loadedPackages[module] = "default channel";
89 | }
90 |
91 | window.pyodide.runPython(
92 | 'import importlib as _importlib\n' +
93 | '_importlib.invalidate_caches()\n'
94 | );
95 | });
96 | }
97 |
98 | initializingComplete() {
99 | document.body.classList.remove("is-loading")
100 | }
101 | }
102 |
103 | (function () {
104 | window.top.app = new app({"app": "app"}, "import app.app; app.app.start()");
105 | })();
106 |
--------------------------------------------------------------------------------
/docs/ide/app/app.py:
--------------------------------------------------------------------------------
1 | from . import html5
2 | from .examples import examples
3 |
4 | from lark import Lark
5 | from lark.tree import Tree
6 |
7 |
8 | class App(html5.Div):
9 | def __init__(self):
10 | super().__init__("""
11 |
12 |
IDE
13 |
14 |
15 |
16 |
26 |
36 |
39 |
40 | """)
41 | self.sinkEvent("onKeyUp", "onChange")
42 |
43 | self.parser = "earley"
44 |
45 | # Pre-load examples
46 | for name, (grammar, input) in examples.items():
47 | option = html5.Option(name)
48 | option.grammar = grammar
49 | option.input = input
50 |
51 | self.examples.appendChild(option)
52 |
53 | def onChange(self, e):
54 | if html5.utils.doesEventHitWidgetOrChildren(e, self.examples):
55 | example = self.examples.children(self.examples["selectedIndex"])
56 | self.grammar["value"] = example.grammar.strip()
57 | self.input["value"] = example.input.strip()
58 | self.onKeyUp()
59 |
60 | elif html5.utils.doesEventHitWidgetOrChildren(e, self.parser):
61 | self.parser = self.parser.children(self.parser["selectedIndex"])["value"]
62 | self.onKeyUp()
63 |
64 | def onKeyUp(self, e=None):
65 | l = Lark(self.grammar["value"], parser=self.parser)
66 |
67 | try:
68 | ast = l.parse(self.input["value"])
69 | except Exception as e:
70 | self.ast.appendChild(
71 | html5.Li(str(e)), replace=True
72 | )
73 | return  # a parse error occurred; don't fall through to the tree rendering below
74 | print(ast)
75 | traverse = lambda node: html5.Li([node.data, html5.Ul([traverse(c) for c in node.children])] if isinstance(node, Tree) else node)
76 | self.ast.appendChild(traverse(ast), replace=True)
77 |
78 |
79 | def start():
80 | html5.Body().appendChild(
81 | App()
82 | )
83 |
--------------------------------------------------------------------------------
/docs/ide/app/examples.py:
--------------------------------------------------------------------------------
1 |
2 | # Examples formatted this way:
3 | # "name": ("grammar", "demo-input")
4 |
5 | examples = {
6 |
7 | # --- hello.lark ---
8 | "hello.lark": ("""
9 | start: WORD "," WORD "!"
10 |
11 | %import common.WORD // imports from terminal library
12 | %ignore " " // Disregard spaces in text
13 | """, "Hello, World!"),
14 |
15 | # --- calc.lark ---
16 | "calc.lark": ("""
17 | ?start: sum
18 | | NAME "=" sum -> assign_var
19 |
20 | ?sum: product
21 | | sum "+" product -> add
22 | | sum "-" product -> sub
23 |
24 | ?product: atom
25 | | product "*" atom -> mul
26 | | product "/" atom -> div
27 |
28 | ?atom: NUMBER -> number
29 | | "-" atom -> neg
30 | | NAME -> var
31 | | "(" sum ")"
32 |
33 | %import common.CNAME -> NAME
34 | %import common.NUMBER
35 | %import common.WS_INLINE
36 | %ignore WS_INLINE""",
37 | "1 + 2 * 3 + 4"),
38 |
39 | # --- json.lark ---
40 | "json.lark": ("""
41 | ?start: value
42 | ?value: object
43 | | array
44 | | string
45 | | SIGNED_NUMBER -> number
46 | | "true" -> true
47 | | "false" -> false
48 | | "null" -> null
49 | array : "[" [value ("," value)*] "]"
50 | object : "{" [pair ("," pair)*] "}"
51 | pair : string ":" value
52 | string : ESCAPED_STRING
53 | %import common.ESCAPED_STRING
54 | %import common.SIGNED_NUMBER
55 | %import common.WS
56 | %ignore WS""",
57 | """
58 | [
59 | {
60 | "_id": "5edb875cf3d764da55602437",
61 | "index": 0,
62 | "guid": "3dae2206-5d4d-41fe-b81d-dc8cdba7acaa",
63 | "isActive": false,
64 | "balance": "$2,872.54",
65 | "picture": "http://placehold.it/32x32",
66 | "age": 24,
67 | "eyeColor": "blue",
68 | "name": "Theresa Vargas",
69 | "gender": "female",
70 | "company": "GEEKOL",
71 | "email": "theresavargas@geekol.com",
72 | "phone": "+1 (930) 450-3445",
73 | "address": "418 Herbert Street, Sexton, Florida, 1375",
74 | "about": "Id minim deserunt laborum enim. Veniam commodo incididunt amet aute esse duis veniam occaecat nulla esse aute et deserunt eiusmod. Anim elit ullamco minim magna sint laboris. Est consequat quis deserunt excepteur in magna pariatur laborum quis eu. Ex quis tempor elit qui qui et culpa sunt sit esse mollit cupidatat. Fugiat cillum deserunt enim minim irure reprehenderit est. Voluptate nisi quis amet quis incididunt pariatur nostrud Lorem consectetur adipisicing voluptate.\\r\\n",
75 | "registered": "2016-11-19T01:02:42 -01:00",
76 | "latitude": -25.65267,
77 | "longitude": 104.19531,
78 | "tags": [
79 | "eiusmod",
80 | "reprehenderit",
81 | "anim",
82 | "sunt",
83 | "esse",
84 | "proident",
85 | "esse"
86 | ],
87 | "friends": [
88 | {
89 | "id": 0,
90 | "name": "Roth Herrera"
91 | },
92 | {
93 | "id": 1,
94 | "name": "Callie Christian"
95 | },
96 | {
97 | "id": 2,
98 | "name": "Gracie Whitfield"
99 | }
100 | ],
101 | "greeting": "Hello, Theresa Vargas! You have 6 unread messages.",
102 | "favoriteFruit": "banana"
103 | },
104 | {
105 | "_id": "5edb875c845eb08161a83e64",
106 | "index": 1,
107 | "guid": "a8ada2c1-e2c7-40d3-96b4-52c93baff7f0",
108 | "isActive": false,
109 | "balance": "$2,717.04",
110 | "picture": "http://placehold.it/32x32",
111 | "age": 23,
112 | "eyeColor": "green",
113 | "name": "Lily Ross",
114 | "gender": "female",
115 | "company": "RODEOMAD",
116 | "email": "lilyross@rodeomad.com",
117 | "phone": "+1 (941) 465-3561",
118 | "address": "525 Beekman Place, Blodgett, Marshall Islands, 3173",
119 | "about": "Aliquip duis proident excepteur eiusmod in quis officia consequat culpa eu et ut. Occaecat reprehenderit tempor mollit do eu magna qui et magna exercitation aliqua. Incididunt exercitation dolor proident eiusmod minim occaecat. Sunt et minim mollit et veniam sint ex. Duis ullamco elit aute eu excepteur reprehenderit officia.\\r\\n",
120 | "registered": "2019-11-02T04:06:42 -01:00",
121 | "latitude": 17.031701,
122 | "longitude": -42.657106,
123 | "tags": [
124 | "id",
125 | "non",
126 | "culpa",
127 | "reprehenderit",
128 | "esse",
129 | "elit",
130 | "sit"
131 | ],
132 | "friends": [
133 | {
134 | "id": 0,
135 | "name": "Ursula Maldonado"
136 | },
137 | {
138 | "id": 1,
139 | "name": "Traci Huff"
140 | },
141 | {
142 | "id": 2,
143 | "name": "Taylor Holt"
144 | }
145 | ],
146 | "greeting": "Hello, Lily Ross! You have 3 unread messages.",
147 | "favoriteFruit": "strawberry"
148 | }
149 | ]""")
150 | }
151 |
--------------------------------------------------------------------------------
/docs/ide/app/files.json:
--------------------------------------------------------------------------------
1 | [
2 | "app.py",
3 | "examples.py",
4 | "html5.py",
5 | "core.py",
6 | "ext.py",
7 | "ignite.py",
8 | "utils.py"
9 | ]
10 |
--------------------------------------------------------------------------------
/docs/ide/app/html5.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | from .core import *
4 | from . import ext, utils, ignite
5 |
--------------------------------------------------------------------------------
/docs/ide/app/ignite.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from . import core as html5
3 |
4 |
5 | @html5.tag
6 | class Label(html5.Label):
7 | _parserTagName = "ignite-label"
8 |
9 | def __init__(self, *args, **kwargs):
10 | super(Label, self).__init__(style="label ignt-label", *args, **kwargs)
11 |
12 |
13 | @html5.tag
14 | class Input(html5.Input):
15 | _parserTagName = "ignite-input"
16 |
17 | def __init__(self, *args, **kwargs):
18 | super(Input, self).__init__(style="input ignt-input", *args, **kwargs)
19 |
20 |
21 | @html5.tag
22 | class Switch(html5.Div):
23 | _parserTagName = "ignite-switch"
24 |
25 | def __init__(self, *args, **kwargs):
26 | super(Switch, self).__init__(style="switch ignt-switch", *args, **kwargs)
27 |
28 | self.input = html5.Input(style="switch-input")
29 | self.appendChild(self.input)
30 | self.input["type"] = "checkbox"
31 |
32 | switchLabel = html5.Label(forElem=self.input)
33 | switchLabel.addClass("switch-label")
34 | self.appendChild(switchLabel)
35 |
36 | def _setChecked(self, value):
37 | self.input["checked"] = bool(value)
38 |
39 | def _getChecked(self):
40 | return self.input["checked"]
41 |
42 |
43 | @html5.tag
44 | class Check(html5.Input):
45 | _parserTagName = "ignite-check"
46 |
47 | def __init__(self, *args, **kwargs):
48 | super(Check, self).__init__(style="check ignt-check", *args, **kwargs)
49 |
50 | checkInput = html5.Input()
51 | checkInput.addClass("check-input")
52 | checkInput["type"] = "checkbox"
53 | self.appendChild(checkInput)
54 |
55 | checkLabel = html5.Label(forElem=checkInput)
56 | checkLabel.addClass("check-label")
57 | self.appendChild(checkLabel)
58 |
59 |
60 | @html5.tag
61 | class Radio(html5.Div):
62 | _parserTagName = "ignite-radio"
63 |
64 | def __init__(self, *args, **kwargs):
65 | super(Radio, self).__init__(style="radio ignt-radio", *args, **kwargs)
66 |
67 | radioInput = html5.Input()
68 | radioInput.addClass("radio-input")
69 | radioInput["type"] = "radio"
70 | self.appendChild(radioInput)
71 |
72 | radioLabel = html5.Label(forElem=radioInput)
73 | radioLabel.addClass("radio-label")
74 | self.appendChild(radioLabel)
75 |
76 |
77 | @html5.tag
78 | class Select(html5.Select):
79 | _parserTagName = "ignite-select"
80 |
81 | def __init__(self, *args, **kwargs):
82 | super(Select, self).__init__(style="select ignt-select", *args, **kwargs)
83 |
84 | defaultOpt = html5.Option()
85 | defaultOpt["selected"] = True
86 | defaultOpt["disabled"] = True
87 | defaultOpt.element.innerHTML = ""
88 | self.appendChild(defaultOpt)
89 |
90 |
91 | @html5.tag
92 | class Textarea(html5.Textarea):
93 | _parserTagName = "ignite-textarea"
94 |
95 | def __init__(self, *args, **kwargs):
96 | super(Textarea, self).__init__(style="textarea ignt-textarea", *args, **kwargs)
97 |
98 |
99 | @html5.tag
100 | class Progress(html5.Progress):
101 | _parserTagName = "ignite-progress"
102 |
103 | def __init__(self, *args, **kwargs):
104 | super(Progress, self).__init__(style="progress ignt-progress", *args, **kwargs)
105 |
106 |
107 | @html5.tag
108 | class Item(html5.Div):
109 | _parserTagName = "ignite-item"
110 |
111 | def __init__(self, title=None, descr=None, className=None, *args, **kwargs):
112 | super(Item, self).__init__(style="item ignt-item", *args, **kwargs)
113 | if className:
114 | self.addClass(className)
115 |
116 | self.fromHTML("""
117 |
118 |
119 |
123 | """)
124 |
125 | if title:
126 | self.itemHeadline.appendChild(html5.TextNode(title))
127 |
128 | if descr:
129 | self.itemSubline = html5.Div()
130 | self.addClass("item-subline ignt-item-subline")
131 | self.itemSubline.appendChild(html5.TextNode(descr))
132 | self.appendChild(self.itemSubline)
133 |
134 |
135 | @html5.tag
136 | class Table(html5.Table):
137 | _parserTagName = "ignite-table"
138 |
139 | def __init__(self, *args, **kwargs):
140 | super(Table, self).__init__(*args, **kwargs)
141 | self.head.addClass("ignt-table-head")
142 | self.body.addClass("ignt-table-body")
143 |
144 | def prepareRow(self, row):
145 | assert row >= 0, "Cannot create rows with negative index"
146 |
147 | for child in self.body._children:
148 | row -= child["rowspan"]
149 | if row < 0:
150 | return
151 |
152 | while row >= 0:
153 | tableRow = html5.Tr()
154 | tableRow.addClass("ignt-table-body-row")
155 | self.body.appendChild(tableRow)
156 | row -= 1
157 |
158 | def prepareCol(self, row, col):
159 | assert col >= 0, "Cannot create cols with negative index"
160 | self.prepareRow(row)
161 |
162 | for rowChild in self.body._children:
163 | row -= rowChild["rowspan"]
164 |
165 | if row < 0:
166 | for colChild in rowChild._children:
167 | col -= colChild["colspan"]
168 | if col < 0:
169 | return
170 |
171 | while col >= 0:
172 | tableCell = html5.Td()
173 | tableCell.addClass("ignt-table-body-cell")
174 | rowChild.appendChild(tableCell)
175 | col -= 1
176 |
177 | return
178 | def fastGrid( self, rows, cols, createHidden=False ):
179 | colsstr = "".join([' | ' for i in range(0, cols)])
180 | tblstr = ''
181 |
182 | for r in range(0, rows):
183 | tblstr += '%s
' %("is-hidden" if createHidden else "",colsstr)
184 | tblstr +=""
185 |
186 | self.fromHTML(tblstr)
187 |
--------------------------------------------------------------------------------
/docs/ide/app/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from . import core as html5
3 |
4 | def unescape(val, maxLength = 0):
5 | """
6 | Unquotes several HTML-quoted characters in a string.
7 |
8 | :param val: The value to be unescaped.
9 | :type val: str
10 |
11 | :param maxLength: Cut-off after maxLength characters.
12 | A value of 0 means "unlimited". (default)
13 | :type maxLength: int
14 |
15 | :returns: The unquoted string.
16 | :rtype: str
17 | """
18 | val = val \
19 | .replace("&lt;", "<") \
20 | .replace("&gt;", ">") \
21 | .replace("&quot;", "\"") \
22 | .replace("&#39;", "'")
23 |
24 | if maxLength > 0:
25 | return val[0:maxLength]
26 |
27 | return val
28 |
29 | def doesEventHitWidgetOrParents(event, widget):
30 | """
31 | Test if event 'event' hits widget 'widget' (or *any* of its parents)
32 | """
33 | while widget:
34 | if event.target == widget.element:
35 | return widget
36 |
37 | widget = widget.parent()
38 |
39 | return None
40 |
41 | def doesEventHitWidgetOrChildren(event, widget):
42 | """
43 | Test if event 'event' hits widget 'widget' (or *any* of its children)
44 | """
45 | if event.target == widget.element:
46 | return widget
47 |
48 | for child in widget.children():
49 | if doesEventHitWidgetOrChildren(event, child):
50 | return child
51 |
52 | return None
53 |
54 | def textToHtml(node, text):
55 | """
56 | Generates html nodes from text by splitting text into content and into
57 | line breaks html5.Br.
58 |
59 | :param node: The node where the nodes are appended to.
60 | :param text: The text to be inserted.
61 | """
62 |
63 | for (i, part) in enumerate(text.split("\n")):
64 | if i > 0:
65 | node.appendChild(html5.Br())
66 |
67 | node.appendChild(html5.TextNode(part))
68 |
69 | def parseInt(s, ret = 0):
70 | """
71 | Parses a value as int
72 | """
73 | if not isinstance(s, str):
74 | return int(s)
75 | elif s:
76 | if s[0] in "+-":
77 | ts = s[1:]
78 | else:
79 | ts = s
80 |
81 | if ts and all([_ in "0123456789" for _ in ts]):
82 | return int(s)
83 |
84 | return ret
85 |
86 | def parseFloat(s, ret = 0.0):
87 | """
88 | Parses a value as float.
89 | """
90 | if not isinstance(s, str):
91 | return float(s)
92 | elif s:
93 | if s[0] in "+-":
94 | ts = s[1:]
95 | else:
96 | ts = s
97 |
98 | if ts and ts.count(".") <= 1 and all([_ in ".0123456789" for _ in ts]):
99 | return float(s)
100 |
101 | return ret
102 |
--------------------------------------------------------------------------------
/docs/ide/is-loading.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lark-parser/lark/87bb8efe0d425187159b39fd788640da33d1878e/docs/ide/is-loading.gif
--------------------------------------------------------------------------------
/docs/ide/lark-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lark-parser/lark/87bb8efe0d425187159b39fd788640da33d1878e/docs/ide/lark-logo.png
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. Lark documentation master file, created by
2 | sphinx-quickstart on Sun Aug 16 13:09:41 2020.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to Lark's documentation!
7 | ================================
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 | :caption: Overview
12 | :hidden:
13 |
14 | philosophy
15 | features
16 | parsers
17 |
18 | .. toctree::
19 | :maxdepth: 2
20 | :caption: Tutorials & Guides
21 | :hidden:
22 |
23 | json_tutorial
24 | how_to_use
25 | how_to_develop
26 | recipes
27 | examples/index
28 |
29 |
30 | .. toctree::
31 | :maxdepth: 2
32 | :caption: Reference
33 | :hidden:
34 |
35 | grammar
36 | tree_construction
37 | classes
38 | visitors
39 | forest
40 | tools
41 |
42 |
43 |
44 | Lark is a modern parsing library for Python. Lark can parse any context-free grammar.
45 |
46 | Lark provides:
47 |
48 | - Advanced grammar language, based on EBNF
49 | - Three parsing algorithms to choose from: Earley, LALR(1) and CYK
50 | - Automatic tree construction, inferred from your grammar
51 | - Fast unicode lexer with regexp support, and automatic line-counting
52 |
53 |
54 | Install Lark
55 | --------------
56 |
57 | .. code:: bash
58 |
59 | $ pip install lark
60 |
61 | Syntax Highlighting
62 | -------------------
63 |
64 | - `Sublime Text & TextMate`_
65 | - `Visual Studio Code`_ (Or install through the vscode plugin system)
66 | - `Intellij & PyCharm`_
67 | - `Vim`_
68 | - `Atom`_
69 |
70 | .. _Sublime Text & TextMate: https://github.com/lark-parser/lark_syntax
71 | .. _Visual Studio Code: https://github.com/lark-parser/vscode-lark
72 | .. _Intellij & PyCharm: https://github.com/lark-parser/intellij-syntax-highlighting
73 | .. _Vim: https://github.com/lark-parser/vim-lark-syntax
74 | .. _Atom: https://github.com/Alhadis/language-grammars
75 |
76 | Resources
77 | ---------
78 |
79 | - :doc:`philosophy`
80 | - :doc:`features`
81 | - `Examples`_
82 | - `Third-party examples`_
83 | - `Online IDE`_
84 | - Tutorials
85 |
86 | - `How to write a DSL`_ - Implements a toy LOGO-like language with
87 | an interpreter
88 | - :doc:`json_tutorial` - Teaches you how to use Lark
89 | - Unofficial
90 |
91 | - `Program Synthesis is Possible`_ - Creates a DSL for Z3
92 | - `Using Lark to Parse Text - Robin Reynolds-Haertle (PyCascades 2023) `_ (video presentation)
93 |
94 | - Guides
95 |
96 | - :doc:`how_to_use`
97 | - :doc:`how_to_develop`
98 |
99 | - Reference
100 |
101 | - :doc:`grammar`
102 | - :doc:`tree_construction`
103 | - :doc:`visitors`
104 | - :doc:`forest`
105 | - :doc:`classes`
106 | - :doc:`tools`
107 | - `Cheatsheet (PDF)`_
108 |
109 | - Discussion
110 |
111 | - `Gitter`_
112 | - `Forum (Google Groups)`_
113 |
114 |
115 | .. _Examples: https://github.com/lark-parser/lark/tree/master/examples
116 | .. _Third-party examples: https://github.com/ligurio/lark-grammars
117 | .. _Online IDE: https://lark-parser.org/ide
118 | .. _How to write a DSL: https://eshsoft.com/blog/write-dsl-in-python-with-lark
119 | .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html
120 | .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf
121 | .. _Gitter: https://gitter.im/lark-parser/Lobby
122 | .. _Forum (Google Groups): https://groups.google.com/forum/#!forum/lark-parser
123 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=Lark
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | echo.installed, then set the SPHINXBUILD environment variable to point
21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | echo.may add the Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/docs/parsers.md:
--------------------------------------------------------------------------------
1 | # Parsers
2 | Lark implements the following parsing algorithms: Earley, LALR(1), and CYK
3 |
4 | ## Earley
5 |
6 | An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser capable of parsing any context-free grammar in O(n^3), and in O(n^2) when the grammar is unambiguous. It can parse most LR grammars in O(n). Most programming languages are LR, and can be parsed in linear time.
7 |
8 | Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitly using `lexer='dynamic'`.
9 |
10 | It's possible to bypass the dynamic lexing and use the regular Earley parser with a basic lexer that tokenizes as an independent first step. Doing so provides a speed benefit, but tokenizes without Earley's ambiguity-resolution ability. So choose this only if you know why! Activate it with `lexer='basic'`.
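
For instance, here is a minimal sketch (using a toy grammar, assumed for illustration) of selecting each lexer:

```python
from lark import Lark

grammar = r"""
start: WORD+
WORD: /\w+/
%ignore " "
"""

# The default: Earley with the dynamic lexer (regexps matched during parsing)
dynamic_parser = Lark(grammar, parser='earley', lexer='dynamic')

# Bypass dynamic lexing: tokenize in an independent first step, then parse
basic_parser = Lark(grammar, parser='earley', lexer='basic')

print(dynamic_parser.parse("hello world").pretty())
print(basic_parser.parse("hello world").pretty())
```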
11 |
12 | **SPPF & Ambiguity resolution**
13 |
14 | Lark implements the Shared Packed Parse Forest data-structure for the Earley parser, in order to reduce the space and computation required to handle ambiguous grammars.
15 |
16 | You can read more about SPPF [here](https://web.archive.org/web/20191229100607/www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest)
17 |
18 | As a result, Lark can efficiently parse and store every ambiguity in the grammar, when using Earley.
19 |
20 | Lark provides the following options to combat ambiguity:
21 |
22 | 1) Lark will choose the best derivation for you (default). Users can choose between different disambiguation strategies, and can prioritize (or demote) individual rules over others, using the rule-priority syntax.
23 |
24 | 2) Users may choose to receive the set of all possible parse-trees (using ambiguity='explicit'), and choose the best derivation themselves. While simple and flexible, it comes at the cost of space and performance, and so it isn't recommended for highly ambiguous grammars, or very long inputs.
25 |
26 | 3) As an advanced feature, users may use specialized visitors to iterate the SPPF themselves. There is also [a 3rd party utility for iterating over the SPPF](https://github.com/chanicpanic/lark-ambig-tools).
27 |
28 | **lexer="dynamic_complete"**
29 |
30 | Earley's "dynamic" lexer uses regular expressions in order to tokenize the text. It tries every possible combination of terminals, but it matches each terminal exactly once, returning the longest possible match.
31 |
32 | That means, for example, that when `lexer="dynamic"` (which is the default), the terminal `/a+/`, when given the text `"aa"`, will return one result, `aa`, even though `a` would also be correct.
33 |
34 | This behavior was chosen because it is much faster, and it is usually what you would expect.
35 |
36 | Setting `lexer="dynamic_complete"` instructs the lexer to consider every possible regexp match. This ensures that the parser will consider and resolve every ambiguity, even inside the terminals themselves. This lexer provides the same capabilities as scannerless Earley, but with different performance tradeoffs.
37 |
38 | Warning: This lexer can be much slower, especially for open-ended terminals such as `/.*/`.
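
To see this in action, here is a minimal sketch (toy grammar assumed) where `dynamic_complete` surfaces a token-level ambiguity that the default lexer never produces:

```python
from lark import Lark

grammar = r"""
start: A A
A: /a+/
"""

# With the default dynamic lexer, A would greedily match all of "aaa",
# leaving nothing for the second A. With dynamic_complete, both splits
# ("a"+"aa" and "aa"+"a") are considered and reported as an ambiguity.
parser = Lark(grammar, parser='earley', lexer='dynamic_complete',
              ambiguity='explicit')

print(parser.parse("aaa").pretty())  # prints an _ambig node with both readings
```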
39 |
40 |
41 | ## LALR(1)
42 |
43 | [LALR(1)](https://www.wikiwand.com/en/LALR_parser) is a very efficient, tried-and-tested parsing algorithm. It's incredibly fast and requires very little memory. It can parse most programming languages (for example: Python and Java).
44 |
45 | LALR(1) stands for:
46 |
47 | - Left-to-right parsing order
48 |
49 | - Rightmost derivation, bottom-up
50 |
51 | - Lookahead of 1 token
52 |
53 | Lark comes with an efficient implementation that outperforms every other parsing library for Python (including PLY).
54 |
55 | Lark extends the traditional YACC-based architecture with a *contextual lexer*, which processes feedback from the parser, making the LALR(1) algorithm stronger than ever.
56 |
57 | The contextual lexer communicates with the parser, and uses the parser's lookahead prediction to narrow its choice of terminals. So at each point, the lexer only matches the subgroup of terminals that are legal at that parser state, instead of all of the terminals. It’s surprisingly effective at resolving common terminal collisions, and allows one to parse languages that LALR(1) was previously incapable of parsing.
58 |
59 | (If you're familiar with YACC, you can think of it as automatic lexer-states)
60 |
61 | This is an improvement to LALR(1) that is unique to Lark.
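
For instance, a minimal sketch of two overlapping terminals (this mirrors `examples/advanced/conf_lalr.py`):

```python
from lark import Lark

# NAME and VALUE can match the same text. A traditional lexer would have to
# pick one up front; the contextual lexer matches whichever terminal the
# LALR(1) parser expects in its current state.
parser = Lark(r"""
start: NAME "=" VALUE
NAME: /\w+/
VALUE: /.+/
""", parser='lalr')  # parser='lalr' uses the contextual lexer by default

print(parser.parse("key=some value").pretty())
```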
62 |
63 | ### Grammar constraints in LALR(1)
64 |
65 | Because it only has a lookahead of one token, LALR(1) is limited in its ability to choose between rules when both can match the input.
66 |
67 | Tips for writing a conforming grammar:
68 |
69 | - Try to avoid writing different rules that can match the same sequence of characters.
70 |
71 | - For the best performance, prefer left-recursion over right-recursion.
72 |
73 | - Consider setting terminal priority only as a last resort.
74 |
75 | For a better understanding of these constraints, it's recommended to learn how an SLR parser works. SLR is very similar to LALR(1), but much simpler.
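
For example, here is a minimal sketch (toy grammar assumed) of the first tip above: two rules that can match the same text create a reduce/reduce conflict, which Lark reports when building the parser:

```python
from lark import Lark
from lark.exceptions import GrammarError

# Both `a` and `b` match the text "x", and a single token of lookahead
# can never tell them apart.
ambiguous = r"""
start: a | b
a: "x"
b: "x"
"""

try:
    Lark(ambiguous, parser='lalr')
except GrammarError as e:
    print("LALR(1) rejects this grammar:", e)
```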
76 |
77 | ## CYK Parser
78 |
79 | A [CYK parser](https://www.wikiwand.com/en/CYK_algorithm) can parse any context-free grammar at O(n^3*|G|).
80 |
81 | It's too slow to be practical for simple grammars, but it offers good performance for highly ambiguous grammars.
82 |
--------------------------------------------------------------------------------
/docs/philosophy.md:
--------------------------------------------------------------------------------
1 | # Philosophy
2 |
3 | Parsers are innately complicated and confusing. They're difficult to understand, difficult to write, and difficult to use. Even experts on the subject can become baffled by the nuances of these complicated state-machines.
4 |
5 | Lark's mission is to make the process of writing them as simple and abstract as possible, by following these design principles:
6 |
7 | ## Design Principles
8 |
9 | 1. Readability matters
10 |
11 | 2. Keep the grammar clean and simple
12 |
13 | 3. Don't force the user to decide on things that the parser can figure out on its own
14 |
15 | 4. Usability is more important than performance
16 |
17 | 5. Performance is still very important
18 |
19 | 6. Follow the Zen of Python, whenever possible and applicable
20 |
21 |
22 | In accordance with these principles, I arrived at the following design choices:
23 |
24 | -----------
25 |
26 | ## Design Choices
27 |
28 | ### 1. Separation of code and grammar
29 |
30 | Grammars are the de-facto reference for your language, and for the structure of your parse-tree. For any non-trivial language, the conflation of code and grammar always turns out convoluted and difficult to read.
31 |
32 | The grammars in Lark are EBNF-inspired, so they are especially easy to read & work with.
33 |
34 | ### 2. Always build a parse-tree (unless told not to)
35 |
36 | Trees are always simpler to work with than state-machines.
37 |
38 | 1. Trees allow you to see the "state-machine" visually
39 |
40 | 2. Trees allow your computation to be aware of previous and future states
41 |
42 | 3. Trees allow you to process the parse in steps, instead of forcing you to do it all at once.
43 |
44 | And anyway, every parse-tree can be replayed as a state-machine, so there is no loss of information.
45 |
46 | This is discussed in more detail [here](https://github.com/erezsh/lark/issues/4).
47 |
48 | To improve performance, you can skip building the tree for LALR(1), by providing Lark with a transformer (see the [JSON example](https://github.com/erezsh/lark/blob/master/examples/json_parser.py)).
49 |
50 | ### 3. Earley is the default
51 |
52 | The Earley algorithm can accept *any* context-free grammar you throw at it (i.e. any grammar you can write in EBNF, it can parse). That makes it extremely friendly to beginners, who are not aware of the strange and arbitrary restrictions that LALR(1) places on its grammars.
53 |
54 | As the users grow to understand the structure of their grammar, the scope of their target language, and their performance requirements, they may choose to switch over to LALR(1) to gain a huge performance boost, possibly at the cost of some language features.
55 |
56 | Both Earley and LALR(1) can use the same grammar, as long as all constraints are satisfied.
57 |
58 | In short, "Premature optimization is the root of all evil."
59 |
60 | ### Other design features
61 |
62 | - Automatically resolve terminal collisions whenever possible
63 |
64 | - Automatically keep track of line & column numbers
65 |
--------------------------------------------------------------------------------
/docs/recipes.md:
--------------------------------------------------------------------------------
1 | # Recipes
2 |
3 | A collection of recipes to use Lark and its various features
4 |
5 |
6 | ## Use a transformer to parse integer tokens
7 |
8 | Transformers are the common interface for processing matched rules and tokens.
9 |
10 | They can be used during parsing for better performance.
11 |
12 | ```python
13 | from lark import Lark, Transformer
14 |
15 | class T(Transformer):
16 | def INT(self, tok):
17 | "Convert the value of `tok` from string to int, while maintaining line number & column."
18 | return tok.update(value=int(tok))
19 |
20 | parser = Lark("""
21 | start: INT*
22 | %import common.INT
23 | %ignore " "
24 | """, parser="lalr", transformer=T())
25 |
26 | print(parser.parse('3 14 159'))
27 | ```
28 |
29 | Prints out:
30 |
31 | ```python
32 | Tree(start, [Token(INT, 3), Token(INT, 14), Token(INT, 159)])
33 | ```
34 |
35 |
36 | ## Collect all comments with lexer_callbacks
37 |
38 | `lexer_callbacks` can be used to interface with the lexer as it generates tokens.
39 |
40 | It accepts a dictionary of the form
41 |
42 | {TOKEN_TYPE: callback}
43 |
44 | Where callback is of type `f(Token) -> Token`
45 |
46 | It only works with the basic and contextual lexers.
47 |
48 | This has the same effect as using a transformer, but it can also process ignored tokens.
49 |
50 | ```python
51 | from lark import Lark
52 |
53 | comments = []
54 |
55 | parser = Lark("""
56 | start: INT*
57 |
58 | COMMENT: /#.*/
59 |
60 | %import common (INT, WS)
61 | %ignore COMMENT
62 | %ignore WS
63 | """, parser="lalr", lexer_callbacks={'COMMENT': comments.append})
64 |
65 | parser.parse("""
66 | 1 2 3 # hello
67 | # world
68 | 4 5 6
69 | """)
70 |
71 | print(comments)
72 | ```
73 |
74 | Prints out:
75 |
76 | ```python
77 | [Token(COMMENT, '# hello'), Token(COMMENT, '# world')]
78 | ```
79 |
80 | *Note: We don't have to return a token, because comments are ignored*
81 |
82 |
83 | ## CollapseAmbiguities
84 |
85 | Parsing ambiguous texts with Earley and `ambiguity='explicit'` produces a single tree with `_ambig` nodes that mark where the ambiguity occurred.
86 |
87 | However, it's sometimes more convenient instead to work with a list of all possible unambiguous trees.
88 |
89 | Lark provides a utility transformer for that purpose:
90 |
91 | ```python
92 | from lark import Lark, Tree, Transformer
93 | from lark.visitors import CollapseAmbiguities
94 |
95 | grammar = """
96 | !start: x y
97 |
98 | !x: "a" "b"
99 | | "ab"
100 | | "abc"
101 |
102 | !y: "c" "d"
103 | | "cd"
104 | | "d"
105 |
106 | """
107 | parser = Lark(grammar, ambiguity='explicit')
108 |
109 | t = parser.parse('abcd')
110 | for x in CollapseAmbiguities().transform(t):
111 | print(x.pretty())
112 | ```
113 |
114 | This prints out:
115 |
116 | start
117 | x
118 | a
119 | b
120 | y
121 | c
122 | d
123 |
124 | start
125 | x ab
126 | y cd
127 |
128 | start
129 | x abc
130 | y d
131 |
132 | While convenient, this should be used carefully, as highly ambiguous trees will soon create an exponential explosion of such unambiguous derivations.
133 |
134 |
135 | ## Keeping track of parents when visiting
136 |
137 | The following visitor assigns a `parent` attribute for every node in the tree.
138 |
139 | If your tree nodes aren't unique (if there is a shared Tree instance), the assert will fail.
140 |
141 | ```python
142 | from weakref import proxy
143 | from lark import Tree, Visitor
144 |
145 | class Parent(Visitor):
143 | def __default__(self, tree):
144 | for subtree in tree.children:
145 | if isinstance(subtree, Tree):
146 | assert not hasattr(subtree, 'parent')
147 | subtree.parent = proxy(tree)
148 | ```
149 |
150 |
151 | ## Unwinding VisitError after a transformer/visitor exception
152 |
153 | Errors that happen inside visitors and transformers get wrapped inside a `VisitError` exception.
154 |
155 | This can often be inconvenient, if you wish the actual error to propagate upwards, or if you want to catch it.
156 |
157 | But, it's easy to unwrap it at the point of calling the transformer, by catching it and raising the `VisitError.orig_exc` attribute.
158 |
159 | For example:
160 | ```python
161 | from lark import Lark, Transformer
162 | from lark.visitors import VisitError
163 |
164 | tree = Lark('start: "a"').parse('a')
165 |
166 | class T(Transformer):
167 | def start(self, x):
168 | raise KeyError("Original Exception")
169 |
170 | t = T()
171 | try:
172 | print( t.transform(tree))
173 | except VisitError as e:
174 | raise e.orig_exc
175 | ```
176 |
177 |
178 | ## Adding a Progress Bar to Parsing with tqdm
179 |
180 | Parsing large files can take a long time, even with the `parser='lalr'` option. To make this process more user-friendly, it's useful to add a progress bar. One way to achieve this is to use the `InteractiveParser` to display each token as it is processed. In this example, we use [tqdm](https://github.com/tqdm/tqdm), but it should be easy to adapt to other kinds of progress bars.
181 |
182 | ```python
183 | from tqdm import tqdm
184 |
185 | from lark import Lark
184 |
185 | def parse_with_progress(parser: Lark, text: str, start=None):
186 | last = 0
187 | progress = tqdm(total=len(text))
188 | pi = parser.parse_interactive(text, start=start)
189 | for token in pi.iter_parse():
190 | if token.end_pos is not None:
191 | progress.update(token.end_pos - last)
192 | last = token.end_pos
193 |     progress.close()  # close the bar; the final token may end before len(text)
194 |     return pi.resume_parse()  # Finish up and get the result
194 | ```
195 |
196 | Keep in mind that this implementation relies on the `InteractiveParser` and, therefore, only works with the `LALR(1)` parser, and not `Earley`.
197 |
198 |
199 | ## Parsing a Language with Significant Indentation
200 |
201 | If your grammar needs to support significant indentation (e.g. Python, YAML), you will need to use
202 | the `Indenter` class. Take a look at the [indented tree example][indent] as well as the
203 | [Python grammar][python] for inspiration.
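
A minimal sketch of such an `Indenter` subclass, assuming your grammar declares the `_NL`, `_INDENT` and `_DEDENT` terminals (see the linked examples for a complete grammar):

```python
from lark.indenter import Indenter

class MyIndenter(Indenter):
    NL_type = '_NL'              # the terminal that ends a line
    OPEN_PAREN_types = ['LPAR']  # indentation is ignored inside these...
    CLOSE_PAREN_types = ['RPAR'] # ...bracket pairs (terminal names assumed)
    INDENT_type = '_INDENT'      # emitted when indentation increases
    DEDENT_type = '_DEDENT'      # emitted when it decreases
    tab_len = 8

# Pass it as a postlex stage:
# parser = Lark(grammar, parser='lalr', postlex=MyIndenter())
```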
204 |
205 | [indent]: examples/indented_tree.html
206 | [python]: https://github.com/lark-parser/lark/blob/master/lark/grammars/python.lark
207 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | # https://docs.readthedocs.io/en/stable/guides/specifying-dependencies.html#specifying-a-requirements-file
2 | pillow
3 | recommonmark
4 | requests==2.28.1
5 | sphinx-gallery
6 | sphinx_markdown_tables
7 | sphinx_rtd_theme>=1.2
8 |
--------------------------------------------------------------------------------
/docs/tools.md:
--------------------------------------------------------------------------------
1 | # Tools (Stand-alone, Nearley)
2 |
3 | ## Stand-alone parser
4 |
5 | Lark can generate a stand-alone LALR(1) parser from a grammar.
6 |
7 | The resulting module provides the same interface as Lark, but with a fixed grammar, and reduced functionality.
8 |
9 | Run using:
10 |
11 | ```bash
12 | python -m lark.tools.standalone
13 | ```
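
For example, to turn a JSON grammar into a stand-alone parser module (this mirrors `create_standalone.sh` in `examples/standalone`, where the grammar file name comes from):

```bash
python -m lark.tools.standalone json.lark > json_parser.py
```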
14 |
15 | For a play-by-play, read the [tutorial](http://blog.erezsh.com/create-a-stand-alone-lalr1-parser-in-python/)
16 |
17 |
18 | ## Importing grammars from Nearley.js
19 |
20 | Lark comes with a tool to convert grammars from [Nearley](https://github.com/Hardmath123/nearley), a popular Earley library for Javascript. It uses [Js2Py](https://github.com/PiotrDabkowski/Js2Py) to convert and run the Javascript postprocessing code segments.
21 |
22 | #### Requirements
23 |
24 | 1. Install Lark with the `nearley` component:
25 | ```bash
26 | pip install lark[nearley]
27 | ```
28 |
29 | 2. Acquire a copy of the Nearley codebase. This can be done using:
30 | ```bash
31 | git clone https://github.com/Hardmath123/nearley
32 | ```
33 |
34 | #### Usage
35 |
36 | The tool can be run using:
37 |
38 | ```bash
39 | python -m lark.tools.nearley
40 | ```
41 |
42 | Here's an example of how to import nearley's calculator example into Lark:
43 |
44 | ```bash
45 | git clone https://github.com/Hardmath123/nearley
46 | python -m lark.tools.nearley nearley/examples/calculator/arithmetic.ne main ./nearley > ncalc.py
47 | ```
48 |
49 | You can use the output as a regular Python module:
50 |
51 | ```python
52 | >>> import ncalc
53 | >>> ncalc.parse('sin(pi/4) ^ e')
54 | 0.38981434460254655
55 | ```
56 |
57 | The Nearley converter also includes experimental support for newer JavaScript (ES6+), via the `--es6` flag:
58 |
59 | ```bash
60 | git clone https://github.com/Hardmath123/nearley
61 | python -m lark.tools.nearley nearley/examples/calculator/arithmetic.ne main nearley --es6 > ncalc.py
62 | ```
63 |
64 | #### Notes
65 |
66 | - Lark currently cannot import templates from Nearley
67 |
68 | - Lark currently cannot export grammars to Nearley
69 |
70 | These might get added in the future, if enough users ask for them.
71 |
--------------------------------------------------------------------------------
/docs/tree_construction.md:
--------------------------------------------------------------------------------
1 | # Tree Construction Reference
2 |
3 |
4 | Lark builds a tree automatically based on the structure of the grammar, where each rule that is matched becomes a branch (node) in the tree, and its children are its matches, in the order of matching.
5 |
6 | For example, the rule `node: child1 child2` will create a tree node with two children. If it is matched as part of another rule (i.e. if it isn't the root), the new rule's tree node will become its parent.
7 |
8 | Using `item+` or `item*` will result in a list of items, equivalent to writing `item item item ..`.
9 |
10 | Using `item?` will return the item if it matched, or nothing.
11 |
12 | If `maybe_placeholders=True` (the default), then using `[item]` will return the item if it matched, or the value `None`, if it didn't.
13 |
14 | If `maybe_placeholders=False`, then `[]` behaves like `()?`.
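
A minimal sketch of the difference, using a toy grammar assumed for illustration:

```python
from lark import Lark

grammar = r"""
start: "a" [NUMBER]
NUMBER: /\d+/
"""

# With maybe_placeholders=True (the default), the unmatched [...] leaves None:
print(Lark(grammar).parse("a").children)  # -> [None]

# With maybe_placeholders=False, [...] behaves like (...)? and leaves nothing:
print(Lark(grammar, maybe_placeholders=False).parse("a").children)  # -> []
```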
15 |
16 | ## Terminals
17 |
18 | Terminals are always values in the tree, never branches.
19 |
20 | Lark filters out certain types of terminals by default, considering them punctuation:
21 |
22 | - Terminals that won't appear in the tree are:
23 |
24 | - Unnamed literals (like `"keyword"` or `"+"`)
25 | - Terminals whose name starts with an underscore (like `_DIGIT`)
26 |
27 | - Terminals that *will* appear in the tree are:
28 |
29 | - Unnamed regular expressions (like `/[0-9]/`)
30 | - Named terminals whose name starts with a letter (like `DIGIT`)
31 |
32 | Note: Terminals composed of literals and other terminals always include the entire match without filtering any part.
33 |
34 | **Example:**
35 | ```
36 | start: PNAME pname
37 |
38 | PNAME: "(" NAME ")"
39 | pname: "(" NAME ")"
40 |
41 | NAME: /\w+/
42 | %ignore /\s+/
43 | ```
44 | Lark will parse "(Hello) (World)" as:
45 |
46 | start
47 | (Hello)
48 | pname World
49 |
50 | Rules prefixed with `!` will retain all their literals regardless.
51 |
52 |
53 |
54 |
55 | **Example:**
56 |
57 | ```perl
58 | expr: "(" expr ")"
59 | | NAME+
60 |
61 | NAME: /\w+/
62 |
63 | %ignore " "
64 | ```
65 |
66 | Lark will parse "((hello world))" as:
67 |
68 | expr
69 | expr
70 | expr
71 | "hello"
72 | "world"
73 |
74 | The brackets do not appear in the tree by design. The words appear because they are matched by a named terminal.
75 |
76 |
77 | ## Shaping the tree
78 |
79 | Users can alter the automatic construction of the tree using a collection of grammar features.
80 |
81 | ### Inlining rules with `_`
82 |
83 | Rules whose name begins with an underscore will be inlined into their containing rule.
84 |
85 | **Example:**
86 |
87 | ```perl
88 | start: "(" _greet ")"
89 | _greet: /\w+/ /\w+/
90 | ```
91 |
92 | Lark will parse "(hello world)" as:
93 |
94 | start
95 | "hello"
96 | "world"
97 |
98 | ### Conditionally inlining rules with `?`
99 |
100 | Rules that receive a question mark (?) at the beginning of their definition will be inlined if they have a single child, after filtering.
101 |
102 | **Example:**
103 |
104 | ```ruby
105 | start: greet greet
106 | ?greet: "(" /\w+/ ")"
107 | | /\w+/ /\w+/
108 | ```
109 |
110 | Lark will parse "hello world (planet)" as:
111 |
112 | start
113 | greet
114 | "hello"
115 | "world"
116 | "planet"
117 |
118 | ### Pinning rule terminals with `!`
119 |
120 | Rules that begin with an exclamation mark will keep all their terminals (they won't get filtered).
121 |
122 | ```perl
123 | !expr: "(" expr ")"
124 | | NAME+
125 | NAME: /\w+/
126 | %ignore " "
127 | ```
128 |
129 | Will parse "((hello world))" as:
130 |
131 | expr
132 | (
133 | expr
134 | (
135 | expr
136 | hello
137 | world
138 | )
139 | )
140 |
141 | Using the `!` prefix is usually a "code smell", and may point to a flaw in your grammar design.
142 |
143 | ### Aliasing rules
144 |
145 | Aliases - options in a rule can receive an alias, which will then be used as the branch name for that option, instead of the rule name.
146 |
147 | **Example:**
148 |
149 | ```ruby
150 | start: greet greet
151 | greet: "hello"
152 | | "world" -> planet
153 | ```
154 |
155 | Lark will parse "hello world" as:
156 |
157 | start
158 | greet
159 | planet
160 |
--------------------------------------------------------------------------------
/docs/visitors.rst:
--------------------------------------------------------------------------------
1 | Transformers & Visitors
2 | =======================
3 |
4 | Transformers & Visitors provide a convenient interface to process the
5 | parse-trees that Lark returns.
6 |
7 | They are used by inheriting from the correct class (visitor or transformer),
8 | and implementing methods corresponding to the rule you wish to process. Each
9 | method accepts the children as an argument. That can be modified using the
10 | ``v_args`` decorator, which allows one to inline the arguments (akin to ``*args``),
11 | or add the tree ``meta`` property as an argument.
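
For example, a minimal sketch of both uses (the rule name ``assign`` is assumed):
::

    from lark import Transformer, v_args

    @v_args(inline=True)
    class InlineArgs(Transformer):
        def assign(self, name, value):     # children arrive inlined, like *args
            return (name, value)

    class WithMeta(Transformer):
        @v_args(meta=True)
        def assign(self, meta, children):  # meta carries line & column info
            return (meta.line, children)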
12 |
13 | See: `visitors.py`_
14 |
15 | .. _visitors.py: https://github.com/lark-parser/lark/blob/master/lark/visitors.py
16 |
17 | Visitor
18 | -------
19 |
20 | Visitors visit each node of the tree, and run the appropriate method on it according to the node's data.
21 |
22 | They work bottom-up, starting with the leaves and ending at the root of the tree.
23 |
24 | There are two classes that implement the visitor interface:
25 |
26 | - ``Visitor``: Visit every node (without recursion)
27 | - ``Visitor_Recursive``: Visit every node using recursion. Slightly faster.
28 |
29 | Example:
30 | ::
31 |
32 | class IncreaseAllNumbers(Visitor):
33 | def number(self, tree):
34 | assert tree.data == "number"
35 | tree.children[0] += 1
36 |
37 | IncreaseAllNumbers().visit(parse_tree)
38 |
39 | .. autoclass:: lark.visitors.Visitor
40 | :members: visit, visit_topdown, __default__
41 |
42 | .. autoclass:: lark.visitors.Visitor_Recursive
43 | :members: visit, visit_topdown, __default__
44 |
45 | Interpreter
46 | -----------
47 |
48 | .. autoclass:: lark.visitors.Interpreter
49 |
50 |
51 | Example:
52 | ::
53 |
54 | class IncreaseSomeOfTheNumbers(Interpreter):
55 | def number(self, tree):
56 | tree.children[0] += 1
57 |
58 | def skip(self, tree):
59 | # skip this subtree. don't change any number node inside it.
60 | pass
61 |
62 | IncreaseSomeOfTheNumbers().visit(parse_tree)
63 |
64 | Transformer
65 | -----------
66 |
67 | .. autoclass:: lark.visitors.Transformer
68 | :members: transform, __default__, __default_token__, __mul__
69 |
70 | Example:
71 | ::
72 |
73 | from lark import Tree, Transformer
74 |
75 | class EvalExpressions(Transformer):
76 | def expr(self, args):
77 | return eval(args[0])
78 |
79 | t = Tree('a', [Tree('expr', ['1+2'])])
80 | print(EvalExpressions().transform( t ))
81 |
82 | # Prints: Tree(a, [3])
83 |
84 | Example:
85 | ::
86 |
87 | class T(Transformer):
88 | INT = int
89 | NUMBER = float
90 | def NAME(self, name):
91 | return lookup_dict.get(name, name)
92 |
93 | T(visit_tokens=True).transform(tree)
94 |
95 | .. autoclass:: lark.visitors.Transformer_NonRecursive
96 |
97 | .. autoclass:: lark.visitors.Transformer_InPlace
98 |
99 | .. autoclass:: lark.visitors.Transformer_InPlaceRecursive
100 |
101 | v_args
102 | ------
103 |
104 | .. autofunction:: lark.visitors.v_args
105 |
106 | merge_transformers
107 | ------------------
108 |
109 | .. autofunction:: lark.visitors.merge_transformers
110 |
111 | Discard
112 | -------
113 |
114 | ``Discard`` is the singleton instance of ``_DiscardType``.
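
For example, a transformer can drop a subtree by returning it (a minimal sketch; the rule name ``comment`` is assumed):
::

    from lark import Transformer, Discard

    class RemoveComments(Transformer):
        def comment(self, children):
            # Returning Discard removes this subtree from the output tree
            return Discard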
115 |
116 | .. autoclass:: lark.visitors._DiscardType
117 |
118 |
119 | VisitError
120 | ----------
121 |
122 | .. autoclass:: lark.exceptions.VisitError
123 |
--------------------------------------------------------------------------------
/examples/README.rst:
--------------------------------------------------------------------------------
1 | Examples for Lark
2 | =================
3 |
4 | **How to run the examples**:
5 |
6 | After cloning the repo, open a terminal in the root directory of the
7 | project, and run the following:
8 |
9 | .. code:: bash
10 |
11 | [lark]$ python -m examples.<name_of_example>
12 |
13 | For example, the following will parse all the Python files in the
14 | standard library of your local installation:
15 |
16 | .. code:: bash
17 |
18 | [lark]$ python -m examples.advanced.python_parser
19 |
20 | Beginner Examples
21 | ~~~~~~~~~~~~~~~~~
22 |
--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lark-parser/lark/87bb8efe0d425187159b39fd788640da33d1878e/examples/__init__.py
--------------------------------------------------------------------------------
/examples/advanced/README.rst:
--------------------------------------------------------------------------------
1 | Advanced Examples
2 | ~~~~~~~~~~~~~~~~~
3 |
--------------------------------------------------------------------------------
/examples/advanced/_json_parser.py:
--------------------------------------------------------------------------------
1 | """
2 | Simple JSON Parser
3 | ==================
4 |
5 | The code is short and clear, and outperforms every other parser (that's written in Python).
6 | For an explanation, check out the JSON parser tutorial at /docs/json_tutorial.md
7 |
8 | (this is here for use by the other examples)
9 | """
10 | from lark import Lark, Transformer, v_args
11 |
12 | json_grammar = r"""
13 | ?start: value
14 |
15 | ?value: object
16 | | array
17 | | string
18 | | SIGNED_NUMBER -> number
19 | | "true" -> true
20 | | "false" -> false
21 | | "null" -> null
22 |
23 | array : "[" [value ("," value)*] "]"
24 | object : "{" [pair ("," pair)*] "}"
25 | pair : string ":" value
26 |
27 | string : ESCAPED_STRING
28 |
29 | %import common.ESCAPED_STRING
30 | %import common.SIGNED_NUMBER
31 | %import common.WS
32 |
33 | %ignore WS
34 | """
35 |
36 |
37 | class TreeToJson(Transformer):
38 | @v_args(inline=True)
39 | def string(self, s):
40 | return s[1:-1].replace('\\"', '"')
41 |
42 | array = list
43 | pair = tuple
44 | object = dict
45 | number = v_args(inline=True)(float)
46 |
47 | null = lambda self, _: None
48 | true = lambda self, _: True
49 | false = lambda self, _: False
50 |
51 |
52 | ### Create the JSON parser with Lark, using the LALR algorithm
53 | json_parser = Lark(json_grammar, parser='lalr',
54 | # Using the basic lexer isn't required, and isn't usually recommended.
55 | # But, it's good enough for JSON, and it's slightly faster.
56 | lexer='basic',
57 | # Disabling propagate_positions and placeholders slightly improves speed
58 | propagate_positions=False,
59 | maybe_placeholders=False,
60 | # Using an internal transformer is faster and more memory efficient
61 | transformer=TreeToJson())
62 |
--------------------------------------------------------------------------------
/examples/advanced/conf_earley.py:
--------------------------------------------------------------------------------
1 | """
2 | Earley’s dynamic lexer
3 | ======================
4 |
5 | Demonstrates the power of Earley’s dynamic lexer on a toy configuration language
6 |
7 | Using a lexer for configuration files is tricky, because values don't
8 | have to be surrounded by delimiters. Using a basic lexer for this just won't work.
9 |
10 | In this example we use a dynamic lexer and let the Earley parser resolve the ambiguity.
11 |
12 | Another approach is to use the contextual lexer with LALR. It is less powerful than Earley,
13 | but it can handle some ambiguity when lexing and it's much faster.
14 | See examples/conf_lalr.py for an example of that approach.
15 |
16 | """
17 | from lark import Lark
18 |
19 | parser = Lark(r"""
20 | start: _NL? section+
21 | section: "[" NAME "]" _NL item+
22 | item: NAME "=" VALUE? _NL
23 |
24 | NAME: /\w/+
25 | VALUE: /./+
26 |
27 | %import common.NEWLINE -> _NL
28 | %import common.WS_INLINE
29 | %ignore WS_INLINE
30 | """, parser="earley")
31 |
32 | def test():
33 | sample_conf = """
34 | [bla]
35 |
36 | a=Hello
37 | this="that",4
38 | empty=
39 | """
40 |
41 | r = parser.parse(sample_conf)
42 | print (r.pretty())
43 |
44 | if __name__ == '__main__':
45 | test()
46 |
--------------------------------------------------------------------------------
/examples/advanced/conf_lalr.py:
--------------------------------------------------------------------------------
1 | """
2 | LALR’s contextual lexer
3 | =======================
4 |
5 | This example demonstrates the power of LALR's contextual lexer,
6 | by parsing a toy configuration language.
7 |
8 | The terminals `NAME` and `VALUE` overlap. They can match the same input.
9 | A basic lexer would arbitrarily choose one over the other, based on priority,
10 | which would lead to a (confusing) parse error.
11 | However, due to the unambiguous structure of the grammar, Lark's LALR(1) algorithm knows
12 | which one of them to expect at each point during the parse.
13 | The lexer then only matches the tokens that the parser expects.
14 | The result is a correct parse, something that is impossible with a regular lexer.
15 |
16 | Another approach is to use the Earley algorithm.
17 | It will handle more cases than the contextual lexer, but at the cost of performance.
18 | See examples/conf_earley.py for an example of that approach.
19 | """
20 | from lark import Lark
21 |
22 | parser = Lark(r"""
23 | start: _NL? section+
24 | section: "[" NAME "]" _NL item+
25 | item: NAME "=" VALUE? _NL
26 |
27 | NAME: /\w/+
28 | VALUE: /./+
29 |
30 | %import common.NEWLINE -> _NL
31 | %import common.WS_INLINE
32 | %ignore WS_INLINE
33 | """, parser="lalr")
34 |
35 |
36 | sample_conf = """
37 | [bla]
38 | a=Hello
39 | this="that",4
40 | empty=
41 | """
42 |
43 | print(parser.parse(sample_conf).pretty())
44 |
--------------------------------------------------------------------------------
/examples/advanced/create_ast.py:
--------------------------------------------------------------------------------
1 | """
2 | Creating an AST from the parse tree
3 | ===================================
4 |
5 | This example demonstrates how to transform a parse-tree into an AST using `lark.ast_utils`.
6 |
7 | create_transformer() collects every subclass of `Ast` subclass from the module,
8 | and creates a Lark transformer that builds the AST with no extra code.
9 |
10 | This example only works with Python 3.
11 | """
12 |
13 | import sys
14 | from typing import List
15 | from dataclasses import dataclass
16 |
17 | from lark import Lark, ast_utils, Transformer, v_args
18 | from lark.tree import Meta
19 |
20 | this_module = sys.modules[__name__]
21 |
22 |
23 | #
24 | # Define AST
25 | #
26 | class _Ast(ast_utils.Ast):
27 | # This will be skipped by create_transformer(), because it starts with an underscore
28 | pass
29 |
30 | class _Statement(_Ast):
31 | # This will be skipped by create_transformer(), because it starts with an underscore
32 | pass
33 |
34 | @dataclass
35 | class Value(_Ast, ast_utils.WithMeta):
36 | "Uses WithMeta to include line-number metadata in the meta attribute"
37 | meta: Meta
38 | value: object
39 |
40 | @dataclass
41 | class Name(_Ast):
42 | name: str
43 |
44 | @dataclass
45 | class CodeBlock(_Ast, ast_utils.AsList):
46 | # Corresponds to code_block in the grammar
47 | statements: List[_Statement]
48 |
49 | @dataclass
50 | class If(_Statement):
51 | cond: Value
52 | then: CodeBlock
53 |
54 | @dataclass
55 | class SetVar(_Statement):
56 | # Corresponds to set_var in the grammar
57 | name: str
58 | value: Value
59 |
60 | @dataclass
61 | class Print(_Statement):
62 | value: Value
63 |
64 |
65 | class ToAst(Transformer):
66 | # Define extra transformation functions, for rules that don't correspond to an AST class.
67 |
68 | def STRING(self, s):
69 | # Remove quotation marks
70 | return s[1:-1]
71 |
72 | def DEC_NUMBER(self, n):
73 | return int(n)
74 |
75 | @v_args(inline=True)
76 | def start(self, x):
77 | return x
78 |
79 | #
80 | # Define Parser
81 | #
82 |
83 | parser = Lark("""
84 | start: code_block
85 |
86 | code_block: statement+
87 |
88 | ?statement: if | set_var | print
89 |
90 | if: "if" value "{" code_block "}"
91 | set_var: NAME "=" value ";"
92 | print: "print" value ";"
93 |
94 | value: name | STRING | DEC_NUMBER
95 | name: NAME
96 |
97 | %import python (NAME, STRING, DEC_NUMBER)
98 | %import common.WS
99 | %ignore WS
100 | """,
101 | parser="lalr",
102 | )
103 |
104 | transformer = ast_utils.create_transformer(this_module, ToAst())
105 |
106 | def parse(text):
107 | tree = parser.parse(text)
108 | return transformer.transform(tree)
109 |
110 | #
111 | # Test
112 | #
113 |
114 | if __name__ == '__main__':
115 | print(parse("""
116 | a = 1;
117 | if a {
118 | print "a is 1";
119 | a = 2;
120 | }
121 | """))
122 |
--------------------------------------------------------------------------------
/examples/advanced/custom_lexer.py:
--------------------------------------------------------------------------------
1 | """
2 | Custom lexer
3 | ============
4 |
5 | Demonstrates using a custom lexer to parse a non-textual stream of data
6 |
7 | You can use a custom lexer to tokenize text when the lexers offered by Lark
8 | are too slow, or not flexible enough.
9 |
10 | You can also use it (as shown in this example) to tokenize streams of objects.
11 | """
12 | from lark import Lark, Transformer, v_args
13 | from lark.lexer import Lexer, Token
14 |
15 | class TypeLexer(Lexer):
16 | def __init__(self, lexer_conf):
17 | pass
18 |
19 | def lex(self, data):
20 | for obj in data:
21 | if isinstance(obj, int):
22 | yield Token('INT', obj)
23 | elif isinstance(obj, (type(''), type(u''))):
24 | yield Token('STR', obj)
25 | else:
26 | raise TypeError(obj)
27 |
28 | parser = Lark("""
29 | start: data_item+
30 | data_item: STR INT*
31 |
32 | %declare STR INT
33 | """, parser='lalr', lexer=TypeLexer)
34 |
35 |
36 | class ParseToDict(Transformer):
37 | @v_args(inline=True)
38 | def data_item(self, name, *numbers):
39 | return name.value, [n.value for n in numbers]
40 |
41 | start = dict
42 |
43 |
44 | def test():
45 | data = ['alice', 1, 27, 3, 'bob', 4, 'carrie', 'dan', 8, 6]
46 |
47 | print(data)
48 |
49 | tree = parser.parse(data)
50 | res = ParseToDict().transform(tree)
51 |
52 | print('-->')
53 | print(res) # prints {'alice': [1, 27, 3], 'bob': [4], 'carrie': [], 'dan': [8, 6]}
54 |
55 |
56 | if __name__ == '__main__':
57 | test()
58 |
--------------------------------------------------------------------------------
/examples/advanced/dynamic_complete.py:
--------------------------------------------------------------------------------
1 | """
2 | Using lexer dynamic_complete
3 | ============================
4 |
5 | Demonstrates how to use ``lexer='dynamic_complete'`` and ``ambiguity='explicit'``
6 |
7 | Sometimes you have data that is highly ambiguous or 'broken' in some sense.
8 | When using ``parser='earley'`` and ``lexer='dynamic_complete'``, Lark will be able to
9 | parse just about anything, as long as there is a valid way to generate it from
10 | the Grammar, including looking 'into' the Regexes.
11 |
12 | This example shows how to parse a JSON input where the quotes have been
13 | replaced by underscores: ``{_foo_:{}, _bar_: [], _baz_: __}``
14 | Notice that underscores might still appear inside strings, so a potentially
15 | valid reading of the above is:
16 | ``{"foo_:{}, _bar": [], "baz": ""}``
17 | """
18 | from pprint import pprint
19 |
20 | from lark import Lark, Tree, Transformer, v_args
21 | from lark.visitors import Transformer_InPlace
22 |
23 | GRAMMAR = r"""
24 | %import common.SIGNED_NUMBER
25 | %import common.WS_INLINE
26 | %import common.NEWLINE
27 | %ignore WS_INLINE
28 |
29 | ?start: value
30 |
31 | ?value: object
32 | | array
33 | | string
34 | | SIGNED_NUMBER -> number
35 | | "true" -> true
36 | | "false" -> false
37 | | "null" -> null
38 |
39 | array : "[" (value ("," value)*)? "]"
40 | object : "{" (pair ("," pair)*)? "}"
41 | pair : string ":" value
42 |
43 | string: STRING
44 | STRING : ESCAPED_STRING
45 |
46 | ESCAPED_STRING: QUOTE_CHAR _STRING_ESC_INNER QUOTE_CHAR
47 | QUOTE_CHAR: "_"
48 |
49 | _STRING_INNER: /.*/
50 | _STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/
--------------------------------------------------------------------------------
/examples/advanced/py3to2.py:
--------------------------------------------------------------------------------
1 | """
2 | Python 3 to Python 2 converter (tree templates)
3 | ===============================================
4 |
5 | This example demonstrates how to translate between two trees using tree templates.
6 | It parses Python 3, translates it to a Python 2 AST, and then outputs the result as Python 2 code.
7 |
8 | Uses reconstruct_python.py for generating the final Python 2 code.
9 | """
10 |
11 | from lark import Lark
12 | from lark.tree_templates import TemplateConf, TemplateTranslator
13 | from lark.indenter import PythonIndenter
14 |
15 | from reconstruct_python import PythonReconstructor
16 |
17 |
18 | #
19 | # 1. Define a Python parser that also "understands" template vars in the code
20 | #
21 |
22 | TEMPLATED_PYTHON = r"""
23 | %import python (single_input, file_input, eval_input, atom, var, stmt, expr_stmt, testlist_star_expr, _NEWLINE, _INDENT, _DEDENT, COMMENT, NAME)
24 |
25 | %extend atom: TEMPLATE_NAME -> var
26 |
27 | TEMPLATE_NAME: "$" NAME
28 |
29 | ?template_start: (stmt | testlist_star_expr _NEWLINE)
30 |
31 | %ignore /[\t \f]+/ // WS
32 | %ignore /\\[\t \f]*\r?\n/ // LINE_CONT
33 | %ignore COMMENT
34 | """
35 |
36 | parser = Lark(TEMPLATED_PYTHON, parser='lalr', start=['single_input', 'file_input', 'eval_input', 'template_start'], postlex=PythonIndenter(), maybe_placeholders=False)
37 |
38 |
39 | def parse_template(s):
40 | return parser.parse(s + '\n', start='template_start')
41 |
42 | def parse_code(s):
43 | return parser.parse(s + '\n', start='file_input')
44 |
45 |
46 | #
47 | # 2. Define translations using templates (each template code is parsed to a template tree)
48 | #
49 |
50 | pytemplate = TemplateConf(parse=parse_template)
51 |
52 | translations_3to2 = {
53 | 'yield from $a':
54 | 'for _tmp in $a: yield _tmp',
55 |
56 | 'raise $e from $x':
57 | 'raise $e',
58 |
59 | '$a / $b':
60 | 'float($a) / $b',
61 | }
62 | translations_3to2 = {pytemplate(k): pytemplate(v) for k, v in translations_3to2.items()}
63 |
64 | #
65 | # 3. Translate and reconstruct Python 3 code into valid Python 2 code
66 | #
67 |
68 | python_reconstruct = PythonReconstructor(parser)
69 |
70 | def translate_py3to2(code):
71 | tree = parse_code(code)
72 | tree = TemplateTranslator(translations_3to2).translate(tree)
73 | return python_reconstruct.reconstruct(tree)
74 |
75 |
76 | #
77 | # Test Code
78 | #
79 |
80 | _TEST_CODE = '''
81 | if a / 2 > 1:
82 | yield from [1,2,3]
83 | else:
84 | raise ValueError(a) from e
85 |
86 | '''
87 |
88 | def test():
89 | print(_TEST_CODE)
90 | print(' -----> ')
91 | print(translate_py3to2(_TEST_CODE))
92 |
93 | if __name__ == '__main__':
94 | test()
95 |
--------------------------------------------------------------------------------
/examples/advanced/python_parser.py:
--------------------------------------------------------------------------------
1 | """
2 | Grammar-complete Python Parser
3 | ==============================
4 |
5 | A fully-working Python 2 & 3 parser (but not production ready yet!)
6 |
7 | This example demonstrates usage of the included Python grammars
8 | """
9 | import sys
10 | import os, os.path
11 | from io import open
12 | import glob, time
13 |
14 | from lark import Lark
15 | from lark.indenter import PythonIndenter
16 |
17 |
18 | kwargs = dict(postlex=PythonIndenter(), start='file_input')
19 |
20 | # Official Python grammar by Lark
21 | python_parser3 = Lark.open_from_package('lark', 'python.lark', ['grammars'], parser='lalr', **kwargs)
22 |
23 | # Local Python2 grammar
24 | python_parser2 = Lark.open('python2.lark', rel_to=__file__, parser='lalr', **kwargs)
25 | python_parser2_earley = Lark.open('python2.lark', rel_to=__file__, parser='earley', lexer='basic', **kwargs)
26 |
27 | try:
28 | xrange
29 | except NameError:
30 | chosen_parser = python_parser3
31 | else:
32 | chosen_parser = python_parser2
33 |
34 |
35 | def _read(fn, *args):
36 | kwargs = {'encoding': 'iso-8859-1'}
37 | with open(fn, *args, **kwargs) as f:
38 | return f.read()
39 |
40 | def _get_lib_path():
41 | if os.name == 'nt':
42 | if 'PyPy' in sys.version:
43 | return os.path.join(sys.base_prefix, 'lib-python', sys.winver)
44 | else:
45 | return os.path.join(sys.base_prefix, 'Lib')
46 | else:
47 | return [x for x in sys.path if x.endswith('%s.%s' % sys.version_info[:2])][0]
48 |
49 | def test_python_lib():
50 | path = _get_lib_path()
51 |
52 | start = time.time()
53 | files = glob.glob(path+'/*.py')
54 | total_kb = 0
55 | for f in files:
56 | r = _read(os.path.join(path, f))
57 | kb = len(r) / 1024
58 | print( '%s -\t%.1f kb' % (f, kb))
59 | chosen_parser.parse(r + '\n')
60 | total_kb += kb
61 |
62 | end = time.time()
63 | print( "test_python_lib (%d files, %.1f kb), time: %.2f secs"%(len(files), total_kb, end-start) )
64 |
65 | def test_earley_equals_lalr():
66 | path = _get_lib_path()
67 |
68 | files = glob.glob(path+'/*.py')
69 | for f in files:
70 | print( f )
71 | tree1 = python_parser2.parse(_read(os.path.join(path, f)) + '\n')
72 | tree2 = python_parser2_earley.parse(_read(os.path.join(path, f)) + '\n')
73 | assert tree1 == tree2
74 |
75 |
76 | if __name__ == '__main__':
77 | test_python_lib()
78 | # test_earley_equals_lalr()
79 | # python_parser3.parse(_read(sys.argv[1]) + '\n')
80 |
--------------------------------------------------------------------------------
/examples/advanced/reconstruct_json.py:
--------------------------------------------------------------------------------
1 | """
2 | Reconstruct a JSON
3 | ==================
4 |
5 | Demonstrates the experimental text-reconstruction feature
6 |
7 | The Reconstructor takes a parse tree (already filtered from punctuation, of course),
8 | and reconstructs it into correct text, that can be parsed correctly.
9 | It can be useful for creating "hooks" to alter data before handing it to other parsers. You can also use it to generate samples from scratch.
10 | """
11 |
12 | import json
13 |
14 | from lark import Lark
15 | from lark.reconstruct import Reconstructor
16 |
17 | from _json_parser import json_grammar
18 |
19 | test_json = '''
20 | {
21 | "empty_object" : {},
22 | "empty_array" : [],
23 | "booleans" : { "YES" : true, "NO" : false },
24 | "numbers" : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
25 | "strings" : [ "This", [ "And" , "That", "And a \\"b" ] ],
26 | "nothing" : null
27 | }
28 | '''
29 |
30 | def test_earley():
31 |
32 | json_parser = Lark(json_grammar, maybe_placeholders=False)
33 | tree = json_parser.parse(test_json)
34 |
35 | new_json = Reconstructor(json_parser).reconstruct(tree)
36 | print (new_json)
37 | print (json.loads(new_json) == json.loads(test_json))
38 |
39 |
40 | def test_lalr():
41 |
42 | json_parser = Lark(json_grammar, parser='lalr', maybe_placeholders=False)
43 | tree = json_parser.parse(test_json)
44 |
45 | new_json = Reconstructor(json_parser).reconstruct(tree)
46 | print (new_json)
47 | print (json.loads(new_json) == json.loads(test_json))
48 |
49 | test_earley()
50 | test_lalr()
51 |
--------------------------------------------------------------------------------
/examples/advanced/reconstruct_python.py:
--------------------------------------------------------------------------------
1 | """
2 | Reconstruct Python
3 | ==================
4 |
5 | Demonstrates how Lark's experimental text-reconstruction feature can recreate
6 | functional Python code from its parse-tree, using just the correct grammar and
7 | a small formatter.
8 |
9 | """
10 |
11 | from lark import Token, Lark
12 | from lark.reconstruct import Reconstructor
13 | from lark.indenter import PythonIndenter
14 |
15 | # Official Python grammar by Lark
16 | python_parser3 = Lark.open_from_package('lark', 'python.lark', ['grammars'],
17 | parser='lalr', postlex=PythonIndenter(), start='file_input',
18 | maybe_placeholders=False # Necessary for reconstructor
19 | )
20 |
21 | SPACE_AFTER = set(',+-*/~@<>="|:')
22 | SPACE_BEFORE = (SPACE_AFTER - set(',:')) | set('\'')
23 |
24 |
25 | def special(sym):
26 | return Token('SPECIAL', sym.name)
27 |
28 | def postproc(items):
29 | stack = ['\n']
30 | actions = []
31 | last_was_whitespace = True
32 | for item in items:
33 | if isinstance(item, Token) and item.type == 'SPECIAL':
34 | actions.append(item.value)
35 | else:
36 | if actions:
37 | assert actions[0] == '_NEWLINE' and '_NEWLINE' not in actions[1:], actions
38 |
39 | for a in actions[1:]:
40 | if a == '_INDENT':
41 | stack.append(stack[-1] + ' ' * 4)
42 | else:
43 | assert a == '_DEDENT'
44 | stack.pop()
45 | actions.clear()
46 | yield stack[-1]
47 | last_was_whitespace = True
48 | if not last_was_whitespace:
49 | if item[0] in SPACE_BEFORE:
50 | yield ' '
51 | yield item
52 | last_was_whitespace = item[-1].isspace()
53 | if not last_was_whitespace:
54 | if item[-1] in SPACE_AFTER:
55 | yield ' '
56 | last_was_whitespace = True
57 | yield "\n"
58 |
59 |
60 | class PythonReconstructor:
61 | def __init__(self, parser):
62 | self._recons = Reconstructor(parser, {'_NEWLINE': special, '_DEDENT': special, '_INDENT': special})
63 |
64 | def reconstruct(self, tree):
65 | return self._recons.reconstruct(tree, postproc)
66 |
67 |
68 | def test():
69 | python_reconstructor = PythonReconstructor(python_parser3)
70 |
71 | self_contents = open(__file__).read()
72 |
73 | tree = python_parser3.parse(self_contents+'\n')
74 | output = python_reconstructor.reconstruct(tree)
75 |
76 | tree_new = python_parser3.parse(output)
77 | print(tree.pretty())
78 | print(tree_new.pretty())
79 | # assert tree.pretty() == tree_new.pretty()
80 | assert tree == tree_new
81 |
82 | print(output)
83 |
84 |
85 | if __name__ == '__main__':
86 | test()
87 |
--------------------------------------------------------------------------------
/examples/advanced/template_lark.lark:
--------------------------------------------------------------------------------
1 | start: (_item | _NL)*
2 |
3 | _item: rule
4 | | token
5 | | statement
6 |
7 | _rule_or_token: RULE
8 | | TOKEN
9 | rule: RULE rule_params priority? ":" expansions{_rule_or_token} _NL
10 | token: TOKEN priority? ":" expansions{TOKEN} _NL
11 |
12 | rule_params: ["{" RULE ("," RULE)* "}"]
13 |
14 | priority: "." NUMBER
15 |
16 | statement: "%ignore" expansions{TOKEN} _NL -> ignore
17 | | "%import" import_path{_rule_or_token} ["->" _rule_or_token] _NL -> import
18 | | "%import" import_path{_rule_or_token} name_list{_rule_or_token} _NL -> multi_import
19 | | "%declare" TOKEN+ -> declare
20 |
21 | !import_path{name}: "."? name ("." name)*
22 | name_list{name}: "(" name ("," name)* ")"
23 |
24 | ?expansions{name}: alias{name} (_VBAR alias{name})*
25 |
26 | ?alias{name}: expansion{name} ["->" RULE]
27 |
28 | ?expansion{name}: expr{name}*
29 |
30 | ?expr{name}: atom{name} [OP | "~" NUMBER [".." NUMBER]]
31 |
32 | ?atom{name}: "(" expansions{name} ")"
33 | | "[" expansions{name} "]" -> maybe
34 | | value{name}
35 |
36 | ?value{name}: STRING ".." STRING -> literal_range
37 | | name
38 | | (REGEXP | STRING) -> literal
39 | | name "{" value{name} ("," value{name})* "}" -> template_usage
40 |
41 | _VBAR: _NL? "|"
42 | OP: /[+*]|[?](?![a-z])/
43 | RULE: /!?[_?]?[a-z][_a-z0-9]*/
44 | TOKEN: /_?[A-Z][_A-Z0-9]*/
45 | STRING: _STRING "i"?
46 | REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/\n])*?\/[imslux]*/
47 | _NL: /(\r?\n)+\s*/
48 |
49 | %import common.ESCAPED_STRING -> _STRING
50 | %import common.INT -> NUMBER
51 | %import common.WS_INLINE
52 |
53 | COMMENT: /\s*/ "//" /[^\n]/*
54 |
55 | %ignore WS_INLINE
56 | %ignore COMMENT
57 |
--------------------------------------------------------------------------------
/examples/advanced/templates.py:
--------------------------------------------------------------------------------
1 | """
2 | Templates
3 | =========
4 |
5 | This example shows how to use Lark's templates to achieve cleaner grammars
6 |
7 | """
8 | from lark import Lark
9 |
10 | grammar = r"""
11 | start: list | dict
12 |
13 | list: "[" _separated{atom, ","} "]"
14 | dict: "{" _separated{key_value, ","} "}"
15 | key_value: atom ":" atom
16 |
17 | _separated{x, sep}: x (sep x)* // Define a sequence of 'x sep x sep x ...'
18 |
19 | atom: NUMBER | ESCAPED_STRING
20 |
21 | %import common (NUMBER, ESCAPED_STRING, WS)
22 | %ignore WS
23 | """
24 |
25 |
26 | parser = Lark(grammar)
27 |
28 | print(parser.parse('[1, "a", 2]'))
29 | print(parser.parse('{"a": 2, "b": 6}'))
30 |
--------------------------------------------------------------------------------
/examples/advanced/tree_forest_transformer.py:
--------------------------------------------------------------------------------
1 | """
2 | Transform a Forest
3 | ==================
4 |
5 | This example demonstrates how to subclass ``TreeForestTransformer`` to
6 | directly transform a SPPF.
7 | """
8 |
9 | from lark import Lark
10 | from lark.parsers.earley_forest import TreeForestTransformer, handles_ambiguity, Discard
11 |
12 | class CustomTransformer(TreeForestTransformer):
13 |
14 | @handles_ambiguity
15 | def sentence(self, trees):
16 | return next(tree for tree in trees if tree.data == 'simple')
17 |
18 | def simple(self, children):
19 | children.append('.')
20 | return self.tree_class('simple', children)
21 |
22 | def adj(self, children):
23 | return Discard
24 |
25 | def __default_token__(self, token):
26 | return token.capitalize()
27 |
28 | grammar = """
29 | sentence: noun verb noun -> simple
30 | | noun verb "like" noun -> comparative
31 |
32 | noun: adj? NOUN
33 | verb: VERB
34 | adj: ADJ
35 |
36 | NOUN: "flies" | "bananas" | "fruit"
37 | VERB: "like" | "flies"
38 | ADJ: "fruit"
39 |
40 | %import common.WS
41 | %ignore WS
42 | """
43 |
44 | parser = Lark(grammar, start='sentence', ambiguity='forest')
45 | sentence = 'fruit flies like bananas'
46 | forest = parser.parse(sentence)
47 |
48 | tree = CustomTransformer(resolve_ambiguity=False).transform(forest)
49 | print(tree.pretty())
50 |
51 | # Output:
52 | #
53 | # simple
54 | # noun Flies
55 | # verb Like
56 | # noun Bananas
57 | # .
58 | #
59 |
--------------------------------------------------------------------------------
/examples/calc.py:
--------------------------------------------------------------------------------
1 | """
2 | Basic calculator
3 | ================
4 |
5 | A simple example of a REPL calculator
6 |
7 | This example shows how to write a basic calculator with variables.
8 | """
9 | from lark import Lark, Transformer, v_args
10 |
11 |
12 | try:
13 | input = raw_input # For Python2 compatibility
14 | except NameError:
15 | pass
16 |
17 |
18 | calc_grammar = """
19 | ?start: sum
20 | | NAME "=" sum -> assign_var
21 |
22 | ?sum: product
23 | | sum "+" product -> add
24 | | sum "-" product -> sub
25 |
26 | ?product: atom
27 | | product "*" atom -> mul
28 | | product "/" atom -> div
29 |
30 | ?atom: NUMBER -> number
31 | | "-" atom -> neg
32 | | NAME -> var
33 | | "(" sum ")"
34 |
35 | %import common.CNAME -> NAME
36 | %import common.NUMBER
37 | %import common.WS_INLINE
38 |
39 | %ignore WS_INLINE
40 | """
41 |
42 |
43 | @v_args(inline=True) # Affects the signatures of the methods
44 | class CalculateTree(Transformer):
45 | from operator import add, sub, mul, truediv as div, neg
46 | number = float
47 |
48 | def __init__(self):
49 | self.vars = {}
50 |
51 | def assign_var(self, name, value):
52 | self.vars[name] = value
53 | return value
54 |
55 | def var(self, name):
56 | try:
57 | return self.vars[name]
58 | except KeyError:
59 | raise Exception("Variable not found: %s" % name)
60 |
61 |
62 | calc_parser = Lark(calc_grammar, parser='lalr', transformer=CalculateTree())
63 | calc = calc_parser.parse
64 |
65 |
66 | def main():
67 | while True:
68 | try:
69 | s = input('> ')
70 | except EOFError:
71 | break
72 | print(calc(s))
73 |
74 |
75 | def test():
76 | print(calc("a = 1+2"))
77 | print(calc("1+a*-3"))
78 |
79 |
80 | if __name__ == '__main__':
81 | # test()
82 | main()
83 |
--------------------------------------------------------------------------------
/examples/composition/README.rst:
--------------------------------------------------------------------------------
1 | Grammar Composition
2 | ===================
3 |
4 | This example shows how to do grammar composition in Lark, by creating a new
5 | file format that allows both CSV and JSON to co-exist.
6 |
7 | We show how, by using namespaces, Lark grammars and their transformers can be fully reused -
8 | they don't need to care if their grammar is used directly, or being imported, or who is doing the importing.
9 |
10 | See `main.py`_ for more details.
11 |
12 | .. _main.py: https://github.com/lark-parser/lark/blob/master/examples/composition/main.py
13 |
--------------------------------------------------------------------------------
/examples/composition/combined_csv_and_json.txt:
--------------------------------------------------------------------------------
1 | {"header": ["this", "is", "json", 1111]}
2 | # file lines author
3 | data.json 12 Robin
4 | data.csv 30 erezsh
5 | compiler.py 123123 Megalng
6 | {"footer": "done"}
7 |
--------------------------------------------------------------------------------
/examples/composition/csv.lark:
--------------------------------------------------------------------------------
1 | start: header _NL row+
2 | header: "#" " "? (WORD _SEPARATOR?)+
3 | row: (_anything _SEPARATOR?)+ _NL
4 | _anything: INT | WORD | NON_SEPARATOR_STRING | FLOAT | SIGNED_FLOAT
5 | NON_SEPARATOR_STRING: /[a-zA-Z.;\\\/]+/
6 | _SEPARATOR: /[ ]+/
7 | | "\t"
8 | | ","
9 |
10 | %import common.NEWLINE -> _NL
11 | %import common.WORD
12 | %import common.INT
13 | %import common.FLOAT
14 | %import common.SIGNED_FLOAT
15 |
--------------------------------------------------------------------------------
/examples/composition/eval_csv.py:
--------------------------------------------------------------------------------
1 | "Transformer for evaluating csv.lark"
2 |
3 | from lark import Transformer
4 |
5 | class CsvTreeToPandasDict(Transformer):
6 | INT = int
7 | FLOAT = float
8 | SIGNED_FLOAT = float
9 | WORD = str
10 | NON_SEPARATOR_STRING = str
11 |
12 | def row(self, children):
13 | return children
14 |
15 | def start(self, children):
16 | data = {}
17 |
18 | header = children[0].children
19 | for heading in header:
20 | data[heading] = []
21 |
22 | for row in children[1:]:
23 | for i, element in enumerate(row):
24 | data[header[i]].append(element)
25 |
26 | return data
27 |
--------------------------------------------------------------------------------
/examples/composition/eval_json.py:
--------------------------------------------------------------------------------
1 | "Transformer for evaluating json.lark"
2 |
3 | from lark import Transformer, v_args
4 |
5 | class JsonTreeToJson(Transformer):
6 | @v_args(inline=True)
7 | def string(self, s):
8 | return s[1:-1].replace('\\"', '"')
9 |
10 | array = list
11 | pair = tuple
12 | object = dict
13 | number = v_args(inline=True)(float)
14 |
15 | null = lambda self, _: None
16 | true = lambda self, _: True
17 | false = lambda self, _: False
18 |
--------------------------------------------------------------------------------
/examples/composition/json.lark:
--------------------------------------------------------------------------------
1 | ?start: value
2 |
3 | ?value: object
4 | | array
5 | | string
6 | | SIGNED_NUMBER -> number
7 | | "true" -> true
8 | | "false" -> false
9 | | "null" -> null
10 |
11 | array : "[" _WS? [value ("," _WS? value)*] "]"
12 | object : "{" _WS? [pair ("," _WS? pair)*] "}"
13 | pair : string ":" _WS value
14 |
15 | string : ESCAPED_STRING
16 |
17 | %import common.ESCAPED_STRING
18 | %import common.SIGNED_NUMBER
19 | %import common.WS -> _WS
20 |
--------------------------------------------------------------------------------
/examples/composition/main.py:
--------------------------------------------------------------------------------
1 | """
2 | Grammar Composition
3 | ===================
4 |
5 | This example shows how to do grammar composition in Lark, by creating a new
6 | file format that allows both CSV and JSON to co-exist.
7 |
8 | 1) We define ``storage.lark``, which imports both ``csv.lark`` and ``json.lark``,
9 | and allows them to be used one after the other.
10 |
11 | In the generated tree, each imported rule/terminal is automatically prefixed (with ``json__`` or ``csv__``),
12 | which creates an implicit namespace and allows them to coexist without collisions.
13 |
14 | 2) We merge their respective transformers (unaware of each other) into a new base transformer.
15 | The resulting transformer can evaluate both JSON and CSV in the parse tree.
16 |
17 | The methods of each transformer are renamed into their appropriate namespace, using the given prefix.
18 | This approach allows full re-use: the transformers don't need to care whether their grammar is used directly
19 | or imported, nor who is doing the importing.
20 |
21 | """
22 | from pathlib import Path
23 | from lark import Lark
24 | from json import dumps
25 | from lark.visitors import Transformer, merge_transformers
26 |
27 | from eval_csv import CsvTreeToPandasDict
28 | from eval_json import JsonTreeToJson
29 |
30 | __dir__ = Path(__file__).parent
31 |
32 | class Storage(Transformer):
33 | def start(self, children):
34 | return children
35 |
36 | storage_transformer = merge_transformers(Storage(), csv=CsvTreeToPandasDict(), json=JsonTreeToJson())
37 |
38 | parser = Lark.open("storage.lark", rel_to=__file__)
39 |
40 | def main():
41 | json_tree = parser.parse(dumps({"test": "a", "dict": { "list": [1, 1.2] }}))
42 | res = storage_transformer.transform(json_tree)
43 | print("Just JSON: ", res)
44 |
45 | csv_json_tree = parser.parse(open(__dir__ / 'combined_csv_and_json.txt').read())
46 | res = storage_transformer.transform(csv_json_tree)
47 | print("JSON + CSV: ", dumps(res, indent=2))
48 |
49 |
50 | if __name__ == "__main__":
51 | main()
52 |
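Under the hood, ``merge_transformers`` re-attaches every public callback of
each keyword-argument transformer onto the base transformer under a prefixed
name, matching the ``csv__``/``json__`` names that the imported rules receive.
A rough, simplified sketch of that wiring, reusing the classes defined above:

    # Roughly what merge_transformers(Storage(), csv=..., json=...) does:
    base = Storage()
    for prefix, t in {"csv": CsvTreeToPandasDict(), "json": JsonTreeToJson()}.items():
        for name in dir(t):
            if not name.startswith("_") and name != "transform" and callable(getattr(t, name)):
                # e.g. CsvTreeToPandasDict.row becomes base.csv__row
                setattr(base, f"{prefix}__{name}", getattr(t, name))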
--------------------------------------------------------------------------------
/examples/composition/storage.lark:
--------------------------------------------------------------------------------
1 | start: (csv__start | json__start _NL?)+
2 |
3 | // Renaming of the import variables is required, as they receive the namespace of this file.
4 | // See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565
5 | %import .csv.start -> csv__start
6 | %import .json.start -> json__start
7 |
8 | %import .csv._NL -> _NL
9 |
--------------------------------------------------------------------------------
/examples/fruitflies.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lark-parser/lark/87bb8efe0d425187159b39fd788640da33d1878e/examples/fruitflies.png
--------------------------------------------------------------------------------
/examples/fruitflies.py:
--------------------------------------------------------------------------------
1 | """
2 | Handling Ambiguity
3 | ==================
4 |
5 | A demonstration of ambiguity
6 |
7 | This example shows how to get explicit ambiguity from Lark's Earley parser.
8 |
9 | """
10 | import sys
11 | from lark import Lark, tree
12 |
13 | grammar = """
14 | sentence: noun verb noun -> simple
15 | | noun verb "like" noun -> comparative
16 |
17 | noun: adj? NOUN
18 | verb: VERB
19 | adj: ADJ
20 |
21 | NOUN: "flies" | "bananas" | "fruit"
22 | VERB: "like" | "flies"
23 | ADJ: "fruit"
24 |
25 | %import common.WS
26 | %ignore WS
27 | """
28 |
29 | parser = Lark(grammar, start='sentence', ambiguity='explicit')
30 |
31 | sentence = 'fruit flies like bananas'
32 |
33 | def make_png(filename):
34 | tree.pydot__tree_to_png( parser.parse(sentence), filename)
35 |
36 | def make_dot(filename):
37 | tree.pydot__tree_to_dot( parser.parse(sentence), filename)
38 |
39 | if __name__ == '__main__':
40 | print(parser.parse(sentence).pretty())
41 | # make_png(sys.argv[1])
42 | # make_dot(sys.argv[1])
43 |
44 | # Output:
45 | #
46 | # _ambig
47 | # comparative
48 | # noun fruit
49 | # verb flies
50 | # noun bananas
51 | # simple
52 | # noun
53 | # fruit
54 | # flies
55 | # verb like
56 | # noun bananas
57 | #
58 | # (or view a nicer version at "./fruitflies.png")
59 |
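With ambiguity='explicit', every ambiguity appears as an "_ambig" node whose
children are the alternative derivations. To resolve it after parsing, either
enumerate all unambiguous trees with lark.visitors.CollapseAmbiguities, or
pick one alternative with a transformer. A minimal sketch of the latter:

    # Sketch: collapse each _ambig node by keeping its first alternative.
    from lark import Transformer

    class PickFirst(Transformer):
        def _ambig(self, alternatives):
            return alternatives[0]

    # unambiguous = PickFirst().transform(parser.parse(sentence))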
--------------------------------------------------------------------------------
/examples/grammars/README.rst:
--------------------------------------------------------------------------------
1 | Example Grammars
2 | ================
3 |
4 | This directory is a collection of lark grammars, taken from real world projects.
5 |
6 | - `Verilog`_ - Taken from https://github.com/circuitgraph/circuitgraph/blob/main/circuitgraph/parsing/verilog.lark
7 |
8 | .. _Verilog: https://github.com/lark-parser/lark/blob/master/examples/grammars/verilog.lark
9 |
--------------------------------------------------------------------------------
/examples/grammars/verilog.lark:
--------------------------------------------------------------------------------
1 | // Taken from https://github.com/circuitgraph/circuitgraph/blob/master/circuitgraph/parsing/verilog.lark
2 | // Following https://www.verilog.com/VerilogBNF.html
3 |
4 | // 1. Source Text
5 | start: description*
6 |
7 | ?description: module
8 |
9 | module: "module" name_of_module list_of_ports? ";" module_item* "endmodule"
10 |
11 | ?name_of_module: IDENTIFIER
12 |
13 | list_of_ports: "(" port ("," port)* ")"
14 |
15 | ?port: IDENTIFIER
16 |
17 | ?module_item: input_declaration
18 | | output_declaration
19 | | net_declaration
20 | | module_instantiation
21 | | continuous_assign
22 |
23 |
24 | // 2. Declarations
25 | input_declaration: "input" list_of_variables ";"
26 |
27 | output_declaration: "output" list_of_variables ";"
28 |
29 | net_declaration: "wire" list_of_variables ";"
30 |
31 | continuous_assign: "assign" list_of_assignments ";"
32 |
33 | list_of_variables: IDENTIFIER ("," IDENTIFIER)*
34 |
35 | list_of_assignments: assignment ("," assignment)*
36 |
37 |
38 | // 3. Primitive Instances
39 | // These are merged with module instantiations
40 |
41 | // 4. Module Instantiations
42 | module_instantiation: name_of_module module_instance ("," module_instance)* ";"
43 |
44 | module_instance: name_of_instance "(" list_of_module_connections ")"
45 |
46 | ?name_of_instance: IDENTIFIER
47 |
48 | list_of_module_connections: module_port_connection ("," module_port_connection)*
49 | | named_port_connection ("," named_port_connection)*
50 |
51 | module_port_connection: expression
52 |
53 | named_port_connection: "." IDENTIFIER "(" expression ")"
54 |
55 |
56 | // 5. Behavioral Statements
57 | assignment: lvalue "=" expression
58 |
59 |
60 | // 6. Specify Section
61 |
62 |
63 | // 7. Expressions
64 | ?lvalue: identifier
65 |
66 | expression: condition
67 |
68 | ?constant_value: constant_zero
69 | | constant_one
70 | | constant_x
71 |
72 | constant_zero: "1'b0"
73 | | "1'h0"
74 |
75 | constant_one: "1'b1"
76 | | "1'h1"
77 |
78 | constant_x: "1'bx"
79 | | "1'hx"
80 |
81 | ?condition : or
82 | | ternary
83 |
84 | ?ternary: or "?" or ":" or
85 |
86 | ?or : xor
87 | | or_gate
88 |
89 | ?or_gate: or "|" xor
90 |
91 | ?xor : and
92 | | xor_gate
93 | | xnor_gate
94 |
95 | ?xor_gate: xor "^" and
96 |
97 | ?xnor_gate: xor "~^" and
98 | | xor "^~" and
99 |
100 | ?and : unary
101 | | and_gate
102 |
103 | ?and_gate: and "&" unary
104 |
105 | ?unary : primary
106 | | not_gate
107 |
108 | not_gate: ( "!" | "~" ) primary
109 |
110 | ?primary : IDENTIFIER
111 | | constant_value
112 | | "(" or ")"
113 |
114 |
115 | // 8. General
116 | ?identifier: IDENTIFIER
117 |
118 | IDENTIFIER: CNAME
119 | | ESCAPED_IDENTIFIER
120 |
121 |
122 | // Lark
123 | ESCAPED_IDENTIFIER: /\\([^\s]+)/
124 | COMMENT: "//" /[^\n]*/ NEWLINE
125 | NEWLINE: "\n"
126 | MULTILINE_COMMENT: /\/\*(\*(?!\/)|[^*])*\*\//
127 |
128 | %import common.CNAME
129 | %import common.ESCAPED_STRING
130 | %import common.WS
131 |
132 | %ignore WS
133 | %ignore COMMENT
134 | %ignore MULTILINE_COMMENT
135 | %ignore NEWLINE
136 |
--------------------------------------------------------------------------------
/examples/indented_tree.py:
--------------------------------------------------------------------------------
1 | """
2 | Parsing Indentation
3 | ===================
4 |
5 | A demonstration of parsing indentation (“whitespace significant” language)
6 | and the usage of the ``Indenter`` class.
7 |
8 | Since indentation is context-sensitive, a postlex stage is introduced to
9 | manufacture ``INDENT``/``DEDENT`` tokens.
10 |
11 | It is crucial for the indenter that the ``NL_type`` matches the spaces (and
12 | tabs) after the newline.
13 |
14 | If your whitespace-significant grammar supports comments, then ``NL_type``
15 | must match those comments too. Otherwise, comments that appear in the middle
16 | of a line will `confuse Lark`_.
17 |
18 | .. _`confuse Lark`: https://github.com/lark-parser/lark/issues/863
19 | """
20 | from lark import Lark
21 | from lark.indenter import Indenter
22 |
23 | tree_grammar = r"""
24 | %import common.CNAME -> NAME
25 | %import common.WS_INLINE
26 | %import common.SH_COMMENT
27 | %ignore WS_INLINE
28 | %ignore SH_COMMENT
29 | %declare _INDENT _DEDENT
30 |
31 | ?start: _NL* tree
32 | tree: NAME _NL [_INDENT tree+ _DEDENT]
33 | _NL: (/\r?\n[\t ]*/ | SH_COMMENT)+
34 | """
35 |
36 | class TreeIndenter(Indenter):
37 | NL_type = '_NL'
38 | OPEN_PAREN_types = []
39 | CLOSE_PAREN_types = []
40 | INDENT_type = '_INDENT'
41 | DEDENT_type = '_DEDENT'
42 | tab_len = 8
43 |
44 | parser = Lark(tree_grammar, parser='lalr', postlex=TreeIndenter())
45 |
46 | test_tree = """
47 | a
48 | # check this comment out
49 | b
50 | c
51 | d
52 | e
53 | f
54 | g
55 | """
56 |
57 | def test():
58 | print(parser.parse(test_tree).pretty())
59 |
60 | if __name__ == '__main__':
61 | test()
62 |
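For intuition, the indenter rewrites the token stream before the parser sees
it. A sketch of inspecting that with the parser defined above (Lark.lex also
applies the postlexer); the stream shown in the comment is approximate:

    # Sketch: post-lexed token stream for a two-line input.
    for tok in parser.lex("a\n  b\n"):
        print(tok.type, repr(tok.value))
    # Roughly: NAME 'a', _NL '\n  ', _INDENT '  ', NAME 'b', _NL '\n', _DEDENT ''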
--------------------------------------------------------------------------------
/examples/json_parser.py:
--------------------------------------------------------------------------------
1 | """
2 | Simple JSON Parser
3 | ==================
4 |
5 | The code is short and clear, and outperforms every other parser (that's written in Python).
6 | For an explanation, check out the JSON parser tutorial at /docs/json_tutorial.md
7 | """
8 | import sys
9 |
10 | from lark import Lark, Transformer, v_args
11 |
12 | json_grammar = r"""
13 | ?start: value
14 |
15 | ?value: object
16 | | array
17 | | string
18 | | SIGNED_NUMBER -> number
19 | | "true" -> true
20 | | "false" -> false
21 | | "null" -> null
22 |
23 | array : "[" [value ("," value)*] "]"
24 | object : "{" [pair ("," pair)*] "}"
25 | pair : string ":" value
26 |
27 | string : ESCAPED_STRING
28 |
29 | %import common.ESCAPED_STRING
30 | %import common.SIGNED_NUMBER
31 | %import common.WS
32 |
33 | %ignore WS
34 | """
35 |
36 |
37 | class TreeToJson(Transformer):
38 | @v_args(inline=True)
39 | def string(self, s):
40 | return s[1:-1].replace('\\"', '"')
41 |
42 | array = list
43 | pair = tuple
44 | object = dict
45 | number = v_args(inline=True)(float)
46 |
47 | null = lambda self, _: None
48 | true = lambda self, _: True
49 | false = lambda self, _: False
50 |
51 |
52 | ### Create the JSON parser with Lark, using the Earley algorithm
53 | # json_parser = Lark(json_grammar, parser='earley', lexer='basic')
54 | # def parse(x):
55 | # return TreeToJson().transform(json_parser.parse(x))
56 |
57 | ### Create the JSON parser with Lark, using the LALR algorithm
58 | json_parser = Lark(json_grammar, parser='lalr',
59 | # Using the basic lexer isn't required, and isn't usually recommended.
60 | # But, it's good enough for JSON, and it's slightly faster.
61 | lexer='basic',
62 | # Disabling propagate_positions and placeholders slightly improves speed
63 | propagate_positions=False,
64 | maybe_placeholders=False,
65 | # Using an internal transformer is faster and more memory efficient
66 | transformer=TreeToJson())
67 | parse = json_parser.parse
68 |
69 |
70 | def test():
71 | test_json = '''
72 | {
73 | "empty_object" : {},
74 | "empty_array" : [],
75 | "booleans" : { "YES" : true, "NO" : false },
76 | "numbers" : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
77 | "strings" : [ "This", [ "And" , "That", "And a \\"b" ] ],
78 | "nothing" : null
79 | }
80 | '''
81 |
82 | j = parse(test_json)
83 | print(j)
84 | import json
85 | assert j == json.loads(test_json)
86 |
87 |
88 | if __name__ == '__main__':
89 | # test()
90 | with open(sys.argv[1]) as f:
91 | print(parse(f.read()))
92 |
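Because the transformer runs inside the LALR parser, parse() returns plain
Python values instead of a Tree. Two illustrative calls:

    # The internal transformer builds Python objects during the parse itself:
    assert parse('{"a": [1, 2]}') == {'a': [1.0, 2.0]}   # numbers become floats
    assert parse('"hi"') == 'hi'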
--------------------------------------------------------------------------------
/examples/lark_grammar.py:
--------------------------------------------------------------------------------
1 | """
2 | Lark Grammar
3 | ============
4 |
5 | A reference implementation of the Lark grammar (using LALR(1))
6 | """
7 | import lark
8 | from pathlib import Path
9 |
10 | examples_path = Path(__file__).parent
11 | lark_path = Path(lark.__file__).parent
12 |
13 | parser = lark.Lark.open(lark_path / 'grammars/lark.lark', rel_to=__file__, parser="lalr")
14 |
15 |
16 | grammar_files = [
17 | examples_path / 'advanced/python2.lark',
18 | examples_path / 'relative-imports/multiples.lark',
19 | examples_path / 'relative-imports/multiple2.lark',
20 | examples_path / 'relative-imports/multiple3.lark',
21 | examples_path / 'tests/no_newline_at_end.lark',
22 | examples_path / 'tests/negative_priority.lark',
23 | examples_path / 'standalone/json.lark',
24 | lark_path / 'grammars/common.lark',
25 | lark_path / 'grammars/lark.lark',
26 | lark_path / 'grammars/unicode.lark',
27 | lark_path / 'grammars/python.lark',
28 | ]
29 |
30 | def test():
31 | for grammar_file in grammar_files:
32 | tree = parser.parse(open(grammar_file).read())
33 | print("All grammars parsed successfully")
34 |
35 | if __name__ == '__main__':
36 | test()
37 |
--------------------------------------------------------------------------------
/examples/relative-imports/multiple2.lark:
--------------------------------------------------------------------------------
1 | start: ("0" | "1")* "0"
2 |
--------------------------------------------------------------------------------
/examples/relative-imports/multiple3.lark:
--------------------------------------------------------------------------------
1 | start: mod0mod0+
2 |
3 | mod0mod0: "0" | "1" mod1mod0
4 | mod1mod0: "1" | "0" mod2mod1 mod1mod0
5 | mod2mod1: "0" | "1" mod2mod1
6 |
--------------------------------------------------------------------------------
/examples/relative-imports/multiples.lark:
--------------------------------------------------------------------------------
1 | start: "2:" multiple2
2 | | "3:" multiple3
3 |
4 | %import .multiple2.start -> multiple2
5 | %import .multiple3.start -> multiple3
6 |
--------------------------------------------------------------------------------
/examples/relative-imports/multiples.py:
--------------------------------------------------------------------------------
1 | #
2 | # This example demonstrates relative imports with rule rewrite
3 | # see multiples.lark
4 | #
5 |
6 | #
7 | # if b is a number written in binary, and m is either 2 or 3,
8 | # the grammar aims to recognise m:b iff b is a multiple of m
9 | #
10 | # for example, 3:1001 is recognised
11 | # because 9 (0b1001) is a multiple of 3
12 | #
13 |
14 | from lark import Lark, UnexpectedInput
15 |
16 | parser = Lark.open('multiples.lark', rel_to=__file__, parser='lalr')
17 |
18 | def is_in_grammar(data):
19 | try:
20 | parser.parse(data)
21 | except UnexpectedInput:
22 | return False
23 | return True
24 |
25 | for n_dec in range(100):
26 | n_bin = bin(n_dec)[2:]
27 | assert is_in_grammar('2:{}'.format(n_bin)) == (n_dec % 2 == 0)
28 | assert is_in_grammar('3:{}'.format(n_bin)) == (n_dec % 3 == 0)
29 |
--------------------------------------------------------------------------------
/examples/standalone/README.rst:
--------------------------------------------------------------------------------
1 | Standalone example
2 | ==================
3 |
4 | To initialize, cd to this folder, and run:
5 |
6 | .. code-block:: bash
7 |
8 | ./create_standalone.sh
9 |
10 | Or:
11 |
12 | .. code-block:: bash
13 |
14 | python -m lark.tools.standalone json.lark > json_parser.py
15 |
16 | Then run using:
17 |
18 | .. code-block:: bash
19 |
20 | python json_parser_main.py
21 |
--------------------------------------------------------------------------------
/examples/standalone/create_standalone.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | PYTHONPATH=../.. python -m lark.tools.standalone json.lark > json_parser.py
3 |
--------------------------------------------------------------------------------
/examples/standalone/json.lark:
--------------------------------------------------------------------------------
1 | ?start: value
2 |
3 | ?value: object
4 | | array
5 | | string
6 | | SIGNED_NUMBER -> number
7 | | "true" -> true
8 | | "false" -> false
9 | | "null" -> null
10 |
11 | array : "[" [value ("," value)*] "]"
12 | object : "{" [pair ("," pair)*] "}"
13 | pair : string ":" value
14 |
15 | string : ESCAPED_STRING
16 |
17 | %import common.ESCAPED_STRING
18 | %import common.SIGNED_NUMBER
19 | %import common.WS
20 |
21 | %ignore WS
22 |
--------------------------------------------------------------------------------
/examples/standalone/json_parser_main.py:
--------------------------------------------------------------------------------
1 | """
2 | Standalone Parser
3 | ===================================
4 |
5 | This example demonstrates how to generate and use the standalone parser,
6 | using the JSON example.
7 |
8 | See README.rst for more details.
9 | """
10 |
11 | import sys
12 |
13 | from json_parser import Lark_StandAlone, Transformer, v_args
14 |
15 | inline_args = v_args(inline=True)
16 |
17 | class TreeToJson(Transformer):
18 | @inline_args
19 | def string(self, s):
20 | return s[1:-1].replace('\\"', '"')
21 |
22 | array = list
23 | pair = tuple
24 | object = dict
25 | number = inline_args(float)
26 |
27 | null = lambda self, _: None
28 | true = lambda self, _: True
29 | false = lambda self, _: False
30 |
31 |
32 | parser = Lark_StandAlone(transformer=TreeToJson())
33 |
34 | if __name__ == '__main__':
35 | with open(sys.argv[1]) as f:
36 | print(parser.parse(f.read()))
37 |
--------------------------------------------------------------------------------
/examples/tests/negative_priority.lark:
--------------------------------------------------------------------------------
1 | start: r
2 | r.-1: "a"
3 |
--------------------------------------------------------------------------------
/examples/tests/no_newline_at_end.lark:
--------------------------------------------------------------------------------
1 | start: "a"
2 |
--------------------------------------------------------------------------------
/examples/turtle_dsl.py:
--------------------------------------------------------------------------------
1 | """
2 | Turtle DSL
3 | ==========
4 |
5 | Implements a LOGO-like toy language for Python’s turtle, with interpreter.
6 | """
7 |
8 | try:
9 | input = raw_input # For Python2 compatibility
10 | except NameError:
11 | pass
12 |
13 | import turtle
14 |
15 | from lark import Lark
16 |
17 | turtle_grammar = """
18 | start: instruction+
19 |
20 | instruction: MOVEMENT NUMBER -> movement
21 | | "c" COLOR [COLOR] -> change_color
22 | | "fill" code_block -> fill
23 | | "repeat" NUMBER code_block -> repeat
24 |
25 | code_block: "{" instruction+ "}"
26 |
27 | MOVEMENT: "f"|"b"|"l"|"r"
28 | COLOR: LETTER+
29 |
30 | %import common.LETTER
31 | %import common.INT -> NUMBER
32 | %import common.WS
33 | %ignore WS
34 | """
35 |
36 | parser = Lark(turtle_grammar)
37 |
38 | def run_instruction(t):
39 | if t.data == 'change_color':
40 | turtle.color(*t.children) # We just pass the color names as-is
41 |
42 | elif t.data == 'movement':
43 | name, number = t.children
44 | { 'f': turtle.fd,
45 | 'b': turtle.bk,
46 | 'l': turtle.lt,
47 | 'r': turtle.rt, }[name](int(number))
48 |
49 | elif t.data == 'repeat':
50 | count, block = t.children
51 | for i in range(int(count)):
52 | run_instruction(block)
53 |
54 | elif t.data == 'fill':
55 | turtle.begin_fill()
56 | run_instruction(t.children[0])
57 | turtle.end_fill()
58 |
59 | elif t.data == 'code_block':
60 | for cmd in t.children:
61 | run_instruction(cmd)
62 | else:
63 | raise SyntaxError('Unknown instruction: %s' % t.data)
64 |
65 |
66 | def run_turtle(program):
67 | parse_tree = parser.parse(program)
68 | for inst in parse_tree.children:
69 | run_instruction(inst)
70 |
71 | def main():
72 | while True:
73 | code = input('> ')
74 | try:
75 | run_turtle(code)
76 | except Exception as e:
77 | print(e)
78 |
79 | def test():
80 | text = """
81 | c red yellow
82 | fill { repeat 36 {
83 | f200 l170
84 | }}
85 | """
86 | run_turtle(text)
87 |
88 | if __name__ == '__main__':
89 | # test()
90 | main()
91 |
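The run_instruction() dispatch on t.data is a hand-rolled tree interpreter.
The same shape can be written with lark.visitors.Interpreter, which dispatches
by rule name and leaves child traversal explicit, so a block's instructions
run only when visited. An abbreviated sketch (only some cases shown):

    # Sketch: the movement/repeat cases as an Interpreter subclass.
    import turtle
    from lark.visitors import Interpreter

    class TurtleInterpreter(Interpreter):
        def movement(self, t):
            name, number = t.children
            {'f': turtle.fd, 'b': turtle.bk,
             'l': turtle.lt, 'r': turtle.rt}[name](int(number))

        def repeat(self, t):
            count, block = t.children
            for _ in range(int(count)):
                self.visit(block)   # re-runs the block on each iteration

        def code_block(self, t):
            self.visit_children(t)

    # TurtleInterpreter().visit(parser.parse(program)) would replace run_turtle().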
--------------------------------------------------------------------------------
/lark/__init__.py:
--------------------------------------------------------------------------------
1 | from .exceptions import (
2 | GrammarError,
3 | LarkError,
4 | LexError,
5 | ParseError,
6 | UnexpectedCharacters,
7 | UnexpectedEOF,
8 | UnexpectedInput,
9 | UnexpectedToken,
10 | )
11 | from .lark import Lark
12 | from .lexer import Token
13 | from .tree import ParseTree, Tree
14 | from .utils import logger, TextSlice
15 | from .visitors import Discard, Transformer, Transformer_NonRecursive, Visitor, v_args
16 |
17 | __version__: str = "1.2.2"
18 |
19 | __all__ = (
20 | "GrammarError",
21 | "LarkError",
22 | "LexError",
23 | "ParseError",
24 | "UnexpectedCharacters",
25 | "UnexpectedEOF",
26 | "UnexpectedInput",
27 | "UnexpectedToken",
28 | "Lark",
29 | "Token",
30 | "ParseTree",
31 | "Tree",
32 | "logger",
33 | "Discard",
34 | "Transformer",
35 | "Transformer_NonRecursive",
36 | "TextSlice",
37 | "Visitor",
38 | "v_args",
39 | )
40 |
--------------------------------------------------------------------------------
/lark/__pyinstaller/__init__.py:
--------------------------------------------------------------------------------
1 | # For usage of lark with PyInstaller. See https://pyinstaller-sample-hook.readthedocs.io/en/latest/index.html
2 |
3 | import os
4 |
5 | def get_hook_dirs():
6 | return [os.path.dirname(__file__)]
7 |
--------------------------------------------------------------------------------
/lark/__pyinstaller/hook-lark.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------------------
2 | # Copyright (c) 2017-2020, PyInstaller Development Team.
3 | #
4 | # Distributed under the terms of the GNU General Public License (version 2
5 | # or later) with exception for distributing the bootloader.
6 | #
7 | # The full license is in the file COPYING.txt, distributed with this software.
8 | #
9 | # SPDX-License-Identifier: (GPL-2.0-or-later WITH Bootloader-exception)
10 | #-----------------------------------------------------------------------------
11 |
12 | from PyInstaller.utils.hooks import collect_data_files
13 |
14 | datas = collect_data_files('lark')
15 |
--------------------------------------------------------------------------------
/lark/ast_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Module of utilities for transforming a lark.Tree into a custom Abstract Syntax Tree (AST defined in classes)
3 | """
4 |
5 | import inspect, re
6 | import types
7 | from typing import Optional, Callable
8 |
9 | from lark import Transformer, v_args
10 |
11 | class Ast:
12 | """Abstract class
13 |
14 | Subclasses will be collected by `create_transformer()`
15 | """
16 | pass
17 |
18 | class AsList:
19 | """Abstract class
20 |
21 | Subclasses will be instantiated with the parse results as a single list, instead of as arguments.
22 | """
23 |
24 | class WithMeta:
25 | """Abstract class
26 |
27 | Subclasses will be instantiated with the Meta instance of the tree. (see ``v_args`` for more detail)
28 | """
29 | pass
30 |
31 | def camel_to_snake(name):
32 |     return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()
33 | 
34 | def create_transformer(ast_module: types.ModuleType,
35 |                        transformer: Optional[Transformer]=None,
36 |                        decorator_factory: Callable=v_args) -> Transformer:
37 | """Collects `Ast` subclasses from the given module, and creates a Lark transformer that builds the AST.
38 |
39 | For each class, we create a corresponding rule in the transformer, with a matching name.
40 | CamelCase names will be converted into snake_case. Example: "CodeBlock" -> "code_block".
41 |
42 | Classes starting with an underscore (`_`) will be skipped.
43 |
44 | Parameters:
45 | ast_module: A Python module containing all the subclasses of ``ast_utils.Ast``
46 | transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten.
47 | decorator_factory (Callable): An optional callable accepting two booleans, inline, and meta,
48 | and returning a decorator for the methods of ``transformer``. (default: ``v_args``).
49 | """
50 | t = transformer or Transformer()
51 |
52 | for name, obj in inspect.getmembers(ast_module):
53 | if not name.startswith('_') and inspect.isclass(obj):
54 | if issubclass(obj, Ast):
55 | wrapper = decorator_factory(inline=not issubclass(obj, AsList), meta=issubclass(obj, WithMeta))
56 | obj = wrapper(obj).__get__(t)
57 | setattr(t, camel_to_snake(name), obj)
58 |
59 | return t
60 |
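A minimal usage sketch (the 'name' rule and the classes here are hypothetical;
see examples/advanced/create_ast.py for a complete version):

    # Sketch: dataclasses in this module become transformer callbacks.
    import sys
    from dataclasses import dataclass
    from lark import ast_utils, Transformer

    class _Ast(ast_utils.Ast):
        pass   # underscore-prefixed, so create_transformer() skips it

    @dataclass
    class Name(_Ast):   # handles the (hypothetical) 'name' rule
        value: str

    this_module = sys.modules[__name__]
    to_ast = ast_utils.create_transformer(this_module, Transformer())
    # to_ast.transform(parser.parse(...)) now yields Name(...) instances.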
--------------------------------------------------------------------------------
/lark/common.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | import sys
3 | from types import ModuleType
4 | from typing import Callable, Collection, Dict, Optional, TYPE_CHECKING, List
5 |
6 | if TYPE_CHECKING:
7 | from .lark import PostLex
8 | from .lexer import Lexer
9 | from .grammar import Rule
10 | from typing import Union, Type
11 | from typing import Literal
12 | if sys.version_info >= (3, 10):
13 | from typing import TypeAlias
14 | else:
15 | from typing_extensions import TypeAlias
16 |
17 | from .utils import Serialize
18 | from .lexer import TerminalDef, Token
19 |
20 | ###{standalone
21 |
22 | _ParserArgType: 'TypeAlias' = 'Literal["earley", "lalr", "cyk", "auto"]'
23 | _LexerArgType: 'TypeAlias' = 'Union[Literal["auto", "basic", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]'
24 | _LexerCallback = Callable[[Token], Token]
25 | ParserCallbacks = Dict[str, Callable]
26 |
27 | class LexerConf(Serialize):
28 | __serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type'
29 | __serialize_namespace__ = TerminalDef,
30 |
31 | terminals: Collection[TerminalDef]
32 | re_module: ModuleType
33 | ignore: Collection[str]
34 | postlex: 'Optional[PostLex]'
35 | callbacks: Dict[str, _LexerCallback]
36 | g_regex_flags: int
37 | skip_validation: bool
38 | use_bytes: bool
39 | lexer_type: Optional[_LexerArgType]
40 | strict: bool
41 |
42 | def __init__(self, terminals: Collection[TerminalDef], re_module: ModuleType, ignore: Collection[str]=(), postlex: 'Optional[PostLex]'=None,
43 | callbacks: Optional[Dict[str, _LexerCallback]]=None, g_regex_flags: int=0, skip_validation: bool=False, use_bytes: bool=False, strict: bool=False):
44 | self.terminals = terminals
45 | self.terminals_by_name = {t.name: t for t in self.terminals}
46 | assert len(self.terminals) == len(self.terminals_by_name)
47 | self.ignore = ignore
48 | self.postlex = postlex
49 | self.callbacks = callbacks or {}
50 | self.g_regex_flags = g_regex_flags
51 | self.re_module = re_module
52 | self.skip_validation = skip_validation
53 | self.use_bytes = use_bytes
54 | self.strict = strict
55 | self.lexer_type = None
56 |
57 | def _deserialize(self):
58 | self.terminals_by_name = {t.name: t for t in self.terminals}
59 |
60 | def __deepcopy__(self, memo=None):
61 | return type(self)(
62 | deepcopy(self.terminals, memo),
63 | self.re_module,
64 | deepcopy(self.ignore, memo),
65 | deepcopy(self.postlex, memo),
66 | deepcopy(self.callbacks, memo),
67 | deepcopy(self.g_regex_flags, memo),
68 | deepcopy(self.skip_validation, memo),
69 | deepcopy(self.use_bytes, memo),
70 | )
71 |
72 | class ParserConf(Serialize):
73 | __serialize_fields__ = 'rules', 'start', 'parser_type'
74 |
75 | rules: List['Rule']
76 | callbacks: ParserCallbacks
77 | start: List[str]
78 | parser_type: _ParserArgType
79 |
80 | def __init__(self, rules: List['Rule'], callbacks: ParserCallbacks, start: List[str]):
81 | assert isinstance(start, list)
82 | self.rules = rules
83 | self.callbacks = callbacks
84 | self.start = start
85 |
86 | ###}
87 |
--------------------------------------------------------------------------------
/lark/grammar.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Tuple, ClassVar, Sequence
2 |
3 | from .utils import Serialize
4 |
5 | ###{standalone
6 | TOKEN_DEFAULT_PRIORITY = 0
7 |
8 |
9 | class Symbol(Serialize):
10 | __slots__ = ('name',)
11 |
12 | name: str
13 | is_term: ClassVar[bool] = NotImplemented
14 |
15 | def __init__(self, name: str) -> None:
16 | self.name = name
17 |
18 | def __eq__(self, other):
19 | if not isinstance(other, Symbol):
20 | return NotImplemented
21 | return self.is_term == other.is_term and self.name == other.name
22 |
23 | def __ne__(self, other):
24 | return not (self == other)
25 |
26 | def __hash__(self):
27 | return hash(self.name)
28 |
29 | def __repr__(self):
30 | return '%s(%r)' % (type(self).__name__, self.name)
31 |
32 | fullrepr = property(__repr__)
33 |
34 | def renamed(self, f):
35 | return type(self)(f(self.name))
36 |
37 |
38 | class Terminal(Symbol):
39 | __serialize_fields__ = 'name', 'filter_out'
40 |
41 | is_term: ClassVar[bool] = True
42 |
43 | def __init__(self, name: str, filter_out: bool = False) -> None:
44 | self.name = name
45 | self.filter_out = filter_out
46 |
47 | @property
48 | def fullrepr(self):
49 | return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out)
50 |
51 | def renamed(self, f):
52 | return type(self)(f(self.name), self.filter_out)
53 |
54 |
55 | class NonTerminal(Symbol):
56 | __serialize_fields__ = 'name',
57 |
58 | is_term: ClassVar[bool] = False
59 |
60 |
61 | class RuleOptions(Serialize):
62 | __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'template_source', 'empty_indices'
63 |
64 | keep_all_tokens: bool
65 | expand1: bool
66 | priority: Optional[int]
67 | template_source: Optional[str]
68 | empty_indices: Tuple[bool, ...]
69 |
70 | def __init__(self, keep_all_tokens: bool=False, expand1: bool=False, priority: Optional[int]=None, template_source: Optional[str]=None, empty_indices: Tuple[bool, ...]=()) -> None:
71 | self.keep_all_tokens = keep_all_tokens
72 | self.expand1 = expand1
73 | self.priority = priority
74 | self.template_source = template_source
75 | self.empty_indices = empty_indices
76 |
77 | def __repr__(self):
78 | return 'RuleOptions(%r, %r, %r, %r)' % (
79 | self.keep_all_tokens,
80 | self.expand1,
81 | self.priority,
82 | self.template_source
83 | )
84 |
85 |
86 | class Rule(Serialize):
87 | """
88 | origin : a symbol
89 | expansion : a list of symbols
90 | order : index of this expansion amongst all rules of the same name
91 | """
92 | __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash')
93 |
94 | __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options'
95 | __serialize_namespace__ = Terminal, NonTerminal, RuleOptions
96 |
97 | origin: NonTerminal
98 | expansion: Sequence[Symbol]
99 | order: int
100 | alias: Optional[str]
101 | options: RuleOptions
102 | _hash: int
103 |
104 | def __init__(self, origin: NonTerminal, expansion: Sequence[Symbol],
105 | order: int=0, alias: Optional[str]=None, options: Optional[RuleOptions]=None):
106 | self.origin = origin
107 | self.expansion = expansion
108 | self.alias = alias
109 | self.order = order
110 | self.options = options or RuleOptions()
111 | self._hash = hash((self.origin, tuple(self.expansion)))
112 |
113 | def _deserialize(self):
114 | self._hash = hash((self.origin, tuple(self.expansion)))
115 |
116 | def __str__(self):
117 | return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion))
118 |
119 | def __repr__(self):
120 | return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options)
121 |
122 | def __hash__(self):
123 | return self._hash
124 |
125 | def __eq__(self, other):
126 | if not isinstance(other, Rule):
127 | return False
128 | return self.origin == other.origin and self.expansion == other.expansion
129 |
130 |
131 | ###}
132 |
--------------------------------------------------------------------------------
/lark/grammars/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lark-parser/lark/87bb8efe0d425187159b39fd788640da33d1878e/lark/grammars/__init__.py
--------------------------------------------------------------------------------
/lark/grammars/common.lark:
--------------------------------------------------------------------------------
1 | // Basic terminals for common use
2 |
3 |
4 | //
5 | // Numbers
6 | //
7 |
8 | DIGIT: "0".."9"
9 | HEXDIGIT: "a".."f"|"A".."F"|DIGIT
10 |
11 | INT: DIGIT+
12 | SIGNED_INT: ["+"|"-"] INT
13 | DECIMAL: INT "." INT? | "." INT
14 |
15 | // float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/
16 | _EXP: ("e"|"E") SIGNED_INT
17 | FLOAT: INT _EXP | DECIMAL _EXP?
18 | SIGNED_FLOAT: ["+"|"-"] FLOAT
19 |
20 | NUMBER: FLOAT | INT
21 | SIGNED_NUMBER: ["+"|"-"] NUMBER
22 |
23 | //
24 | // Strings
25 | //
26 | _STRING_INNER: /.*?/
27 | _STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/
28 | 
29 | ESCAPED_STRING: "\"" _STRING_ESC_INNER "\""
30 | 
31 | 
32 | //
33 | // Names (Variables)
34 | //
35 | LCASE_LETTER: "a".."z"
36 | UCASE_LETTER: "A".."Z"
37 | 
38 | LETTER: UCASE_LETTER | LCASE_LETTER
39 | WORD: LETTER+
40 | 
41 | CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*
42 | 
43 | 
44 | //
45 | // Whitespace
46 | //
47 | WS_INLINE: (" "|/\t/)+
48 | WS: /[ \t\f\r\n]/+
49 | 
50 | CR : /\r/
51 | LF : /\n/
52 | NEWLINE: (CR? LF)+
53 | 
54 | 
55 | // Comments
56 | SH_COMMENT: /#[^\n]*/
57 | CPP_COMMENT: /\/\/[^\n]*/
58 | C_COMMENT: "/*" /(.|\n)*?/ "*/"
59 | SQL_COMMENT: /--[^\n]*/
60 | 
--------------------------------------------------------------------------------
/lark/grammars/lark.lark:
--------------------------------------------------------------------------------
1 | # Lark grammar of Lark's syntax
2 | # Note: Lark is not bootstrapped, its parser is implemented in load_grammar.py, using a custom parser.
3 | 
4 | start: (_item? _NL)* _item?
5 | 
6 | _item: rule
7 |      | token
8 |      | statement
9 | 
10 | rule: RULE rule_params priority? ":" expansions
11 | token: TOKEN token_params priority? ":" expansions
12 | 
13 | rule_params: ["{" RULE ("," RULE)* "}"]
14 | token_params: ["{" TOKEN ("," TOKEN)* "}"]
15 | 
16 | priority: "." NUMBER
17 | 
18 | statement: "%ignore" expansions _NL -> ignore
19 | | "%import" import_path ["->" name] -> import
20 | | "%import" import_path name_list -> multi_import
21 | | "%override" rule -> override_rule
22 | | "%declare" name+ -> declare
23 |
24 | !import_path: "."? name ("." name)*
25 | name_list: "(" name ("," name)* ")"
26 |
27 | ?expansions: alias (_VBAR alias)*
28 |
29 | ?alias: expansion ["->" RULE]
30 |
31 | ?expansion: expr*
32 |
33 | ?expr: atom [OP | "~" NUMBER [".." NUMBER]]
34 |
35 | ?atom: "(" expansions ")"
36 | | "[" expansions "]" -> maybe
37 | | value
38 |
39 | ?value: STRING ".." STRING -> literal_range
40 | | name
41 | | (REGEXP | STRING) -> literal
42 | | name "{" value ("," value)* "}" -> template_usage
43 |
44 | name: RULE
45 | | TOKEN
46 |
47 | _VBAR: _NL? "|"
48 | OP: /[+*]|[?](?![a-z])/
49 | RULE: /!?[_?]?[a-z][_a-z0-9]*/
50 | TOKEN: /_?[A-Z][_A-Z0-9]*/
51 | STRING: _STRING "i"?
52 | REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/])*?\/[imslux]*/
53 | _NL: /(\r?\n)+\s*/
54 |
55 | %import common.ESCAPED_STRING -> _STRING
56 | %import common.SIGNED_INT -> NUMBER
57 | %import common.WS_INLINE
58 |
59 | COMMENT: /\s*/ "//" /[^\n]/* | /\s*/ "#" /[^\n]/*
60 |
61 | %ignore WS_INLINE
62 | %ignore COMMENT
63 |
--------------------------------------------------------------------------------
/lark/grammars/unicode.lark:
--------------------------------------------------------------------------------
1 | // TODO: LETTER, WORD, etc.
2 |
3 | //
4 | // Whitespace
5 | //
6 | WS_INLINE: /[ \t\xa0]/+
7 | WS: /[ \t\xa0\f\r\n]/+
8 |
--------------------------------------------------------------------------------
/lark/indenter.py:
--------------------------------------------------------------------------------
1 | "Provides a post-lexer for implementing Python-style indentation."
2 |
3 | from abc import ABC, abstractmethod
4 | from typing import List, Iterator
5 |
6 | from .exceptions import LarkError
7 | from .lark import PostLex
8 | from .lexer import Token
9 |
10 | ###{standalone
11 |
12 | class DedentError(LarkError):
13 | pass
14 |
15 | class Indenter(PostLex, ABC):
16 | """This is a postlexer that "injects" indent/dedent tokens based on indentation.
17 |
18 | It keeps track of the current indentation, as well as the current level of parentheses.
19 | Inside parentheses, the indentation is ignored, and no indent/dedent tokens get generated.
20 |
21 | Note: This is an abstract class. To use it, inherit and implement all its abstract methods:
22 | - tab_len
23 | - NL_type
24 | - OPEN_PAREN_types, CLOSE_PAREN_types
25 | - INDENT_type, DEDENT_type
26 |
27 | See also: the ``postlex`` option in `Lark`.
28 | """
29 | paren_level: int
30 | indent_level: List[int]
31 |
32 | def __init__(self) -> None:
33 | self.paren_level = 0
34 | self.indent_level = [0]
35 | assert self.tab_len > 0
36 |
37 | def handle_NL(self, token: Token) -> Iterator[Token]:
38 | if self.paren_level > 0:
39 | return
40 |
41 | yield token
42 |
43 | indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces
44 | indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len
45 |
46 | if indent > self.indent_level[-1]:
47 | self.indent_level.append(indent)
48 | yield Token.new_borrow_pos(self.INDENT_type, indent_str, token)
49 | else:
50 | while indent < self.indent_level[-1]:
51 | self.indent_level.pop()
52 | yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)
53 |
54 | if indent != self.indent_level[-1]:
55 | raise DedentError('Unexpected dedent to column %s. Expected dedent to %s' % (indent, self.indent_level[-1]))
56 |
57 | def _process(self, stream):
58 | for token in stream:
59 | if token.type == self.NL_type:
60 | yield from self.handle_NL(token)
61 | else:
62 | yield token
63 |
64 | if token.type in self.OPEN_PAREN_types:
65 | self.paren_level += 1
66 | elif token.type in self.CLOSE_PAREN_types:
67 | self.paren_level -= 1
68 | assert self.paren_level >= 0
69 |
70 | while len(self.indent_level) > 1:
71 | self.indent_level.pop()
72 | yield Token(self.DEDENT_type, '')
73 |
74 | assert self.indent_level == [0], self.indent_level
75 |
76 | def process(self, stream):
77 | self.paren_level = 0
78 | self.indent_level = [0]
79 | return self._process(stream)
80 |
81 | # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
82 | @property
83 | def always_accept(self):
84 | return (self.NL_type,)
85 |
86 | @property
87 | @abstractmethod
88 | def NL_type(self) -> str:
89 | "The name of the newline token"
90 | raise NotImplementedError()
91 |
92 | @property
93 | @abstractmethod
94 | def OPEN_PAREN_types(self) -> List[str]:
95 | "The names of the tokens that open a parenthesis"
96 | raise NotImplementedError()
97 |
98 | @property
99 | @abstractmethod
100 | def CLOSE_PAREN_types(self) -> List[str]:
101 | """The names of the tokens that close a parenthesis
102 | """
103 | raise NotImplementedError()
104 |
105 | @property
106 | @abstractmethod
107 | def INDENT_type(self) -> str:
108 | """The name of the token that starts an indentation in the grammar.
109 |
110 | See also: %declare
111 | """
112 | raise NotImplementedError()
113 |
114 | @property
115 | @abstractmethod
116 | def DEDENT_type(self) -> str:
117 | """The name of the token that end an indentation in the grammar.
118 |
119 | See also: %declare
120 | """
121 | raise NotImplementedError()
122 |
123 | @property
124 | @abstractmethod
125 | def tab_len(self) -> int:
126 | """How many spaces does a tab equal"""
127 | raise NotImplementedError()
128 |
129 |
130 | class PythonIndenter(Indenter):
131 | """A postlexer that "injects" _INDENT/_DEDENT tokens based on indentation, according to the Python syntax.
132 |
133 | See also: the ``postlex`` option in `Lark`.
134 | """
135 |
136 | NL_type = '_NEWLINE'
137 | OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
138 | CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
139 | INDENT_type = '_INDENT'
140 | DEDENT_type = '_DEDENT'
141 | tab_len = 8
142 |
143 | ###}
144 |
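Typical wiring, as a sketch (the grammar is a placeholder; it must %declare
_INDENT and _DEDENT, and its _NEWLINE terminal must also consume the
indentation that follows the newline):

    from lark import Lark
    from lark.indenter import PythonIndenter

    # The postlexer runs between the lexer and the LALR parser:
    parser = Lark(python_like_grammar, parser='lalr', postlex=PythonIndenter())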
--------------------------------------------------------------------------------
/lark/parsers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lark-parser/lark/87bb8efe0d425187159b39fd788640da33d1878e/lark/parsers/__init__.py
--------------------------------------------------------------------------------
/lark/parsers/earley_common.py:
--------------------------------------------------------------------------------
1 | """This module implements useful building blocks for the Earley parser
2 | """
3 |
4 |
5 | class Item:
6 | "An Earley Item, the atom of the algorithm."
7 |
8 | __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash')
9 | def __init__(self, rule, ptr, start):
10 | self.is_complete = len(rule.expansion) == ptr
11 |         self.rule = rule    # the grammar rule being matched
12 |         self.ptr = ptr      # position of the "dot" within the rule's expansion
13 |         self.start = start  # input position where the match began ("j" in Scott's SPPF paper)
14 |         self.node = None    # SPPF node for this item ("w" in Scott's SPPF paper)
15 | if self.is_complete:
16 | self.s = rule.origin
17 | self.expect = None
18 | self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
19 | else:
20 | self.s = (rule, ptr)
21 | self.expect = rule.expansion[ptr]
22 | self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
23 | self._hash = hash((self.s, self.start, self.rule))
24 |
25 | def advance(self):
26 | return Item(self.rule, self.ptr + 1, self.start)
27 |
28 | def __eq__(self, other):
29 | return self is other or (self.s == other.s and self.start == other.start and self.rule == other.rule)
30 |
31 | def __hash__(self):
32 | return self._hash
33 |
34 | def __repr__(self):
35 | before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] )
36 | after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] )
37 | symbol = "{} ::= {}* {}".format(self.rule.origin.name, ' '.join(before), ' '.join(after))
38 | return '%s (%d)' % (symbol, self.start)
39 |
40 |
41 | # class TransitiveItem(Item):
42 | # ... # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420
43 |
--------------------------------------------------------------------------------
/lark/parsers/lalr_parser.py:
--------------------------------------------------------------------------------
1 | """This module implements a LALR(1) Parser
2 | """
3 | # Author: Erez Shinan (2017)
4 | # Email : erezshin@gmail.com
5 | from typing import Dict, Any, Optional
6 | from ..lexer import Token, LexerThread
7 | from ..utils import Serialize
8 | from ..common import ParserConf, ParserCallbacks
9 |
10 | from .lalr_analysis import LALR_Analyzer, IntParseTable, ParseTableBase
11 | from .lalr_interactive_parser import InteractiveParser
12 | from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
13 | from .lalr_parser_state import ParserState, ParseConf
14 |
15 | ###{standalone
16 |
17 | class LALR_Parser(Serialize):
18 | def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
19 | analysis = LALR_Analyzer(parser_conf, debug=debug, strict=strict)
20 | analysis.compute_lalr()
21 | callbacks = parser_conf.callbacks
22 |
23 | self._parse_table = analysis.parse_table
24 | self.parser_conf = parser_conf
25 | self.parser = _Parser(analysis.parse_table, callbacks, debug)
26 |
27 | @classmethod
28 | def deserialize(cls, data, memo, callbacks, debug=False):
29 | inst = cls.__new__(cls)
30 | inst._parse_table = IntParseTable.deserialize(data, memo)
31 | inst.parser = _Parser(inst._parse_table, callbacks, debug)
32 | return inst
33 |
34 | def serialize(self, memo: Any = None) -> Dict[str, Any]:
35 | return self._parse_table.serialize(memo)
36 |
37 | def parse_interactive(self, lexer: LexerThread, start: str):
38 | return self.parser.parse(lexer, start, start_interactive=True)
39 |
40 | def parse(self, lexer, start, on_error=None):
41 | try:
42 | return self.parser.parse(lexer, start)
43 | except UnexpectedInput as e:
44 | if on_error is None:
45 | raise
46 |
47 | while True:
48 | if isinstance(e, UnexpectedCharacters):
49 | s = e.interactive_parser.lexer_thread.state
50 | p = s.line_ctr.char_pos
51 |
52 | if not on_error(e):
53 | raise e
54 |
55 | if isinstance(e, UnexpectedCharacters):
56 | # If user didn't change the character position, then we should
57 | if p == s.line_ctr.char_pos:
58 | s.line_ctr.feed(s.text.text[p:p+1])
59 |
60 | try:
61 | return e.interactive_parser.resume_parse()
62 | except UnexpectedToken as e2:
63 | if (isinstance(e, UnexpectedToken)
64 | and e.token.type == e2.token.type == '$END'
65 | and e.interactive_parser == e2.interactive_parser):
66 | # Prevent infinite loop
67 | raise e2
68 | e = e2
69 | except UnexpectedCharacters as e2:
70 | e = e2
71 |
72 |
73 | class _Parser:
74 | parse_table: ParseTableBase
75 | callbacks: ParserCallbacks
76 | debug: bool
77 |
78 | def __init__(self, parse_table: ParseTableBase, callbacks: ParserCallbacks, debug: bool=False):
79 | self.parse_table = parse_table
80 | self.callbacks = callbacks
81 | self.debug = debug
82 |
83 | def parse(self, lexer: LexerThread, start: str, value_stack=None, state_stack=None, start_interactive=False):
84 | parse_conf = ParseConf(self.parse_table, self.callbacks, start)
85 | parser_state = ParserState(parse_conf, lexer, state_stack, value_stack)
86 | if start_interactive:
87 | return InteractiveParser(self, parser_state, parser_state.lexer)
88 | return self.parse_from_state(parser_state)
89 |
90 |
91 | def parse_from_state(self, state: ParserState, last_token: Optional[Token]=None):
92 | """Run the main LALR parser loop
93 |
94 | Parameters:
95 | state - the initial state. Changed in-place.
96 | last_token - Used only for line information in case of an empty lexer.
97 | """
98 | try:
99 | token = last_token
100 | for token in state.lexer.lex(state):
101 | assert token is not None
102 | state.feed_token(token)
103 |
104 | end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
105 | return state.feed_token(end_token, True)
106 | except UnexpectedInput as e:
107 | try:
108 | e.interactive_parser = InteractiveParser(self, state, state.lexer)
109 | except NameError:
110 | pass
111 | raise e
112 | except Exception as e:
113 | if self.debug:
114 | print("")
115 | print("STATE STACK DUMP")
116 | print("----------------")
117 | for i, s in enumerate(state.state_stack):
118 | print('%d)' % i , s)
119 | print("")
120 |
121 | raise
122 | ###}
123 |
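The on_error loop above is what backs Lark.parse(text, on_error=...) with the
LALR parser: the callback receives the UnexpectedInput instance and returns
True to resume (optionally repairing state via e.interactive_parser first) or
False to re-raise. A sketch modeled on examples/advanced/error_handling.py
(the grammar, the text, and the COMMA terminal are placeholders):

    from lark import Lark, UnexpectedToken

    parser = Lark(grammar, parser='lalr')

    def ignore_extra_commas(e):
        if isinstance(e, UnexpectedToken) and e.token.type == 'COMMA':
            return True    # drop the offending token and resume
        return False       # anything else: re-raise

    tree = parser.parse(text_with_stray_commas, on_error=ignore_extra_commas)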
--------------------------------------------------------------------------------
/lark/parsers/lalr_parser_state.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy, copy
2 | from typing import Dict, Any, Generic, List
3 | from ..lexer import Token, LexerThread
4 | from ..common import ParserCallbacks
5 |
6 | from .lalr_analysis import Shift, ParseTableBase, StateT
7 | from lark.exceptions import UnexpectedToken
8 |
9 | ###{standalone
10 |
11 | class ParseConf(Generic[StateT]):
12 | __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'
13 |
14 | parse_table: ParseTableBase[StateT]
15 | callbacks: ParserCallbacks
16 | start: str
17 |
18 | start_state: StateT
19 | end_state: StateT
20 | states: Dict[StateT, Dict[str, tuple]]
21 |
22 | def __init__(self, parse_table: ParseTableBase[StateT], callbacks: ParserCallbacks, start: str):
23 | self.parse_table = parse_table
24 |
25 | self.start_state = self.parse_table.start_states[start]
26 | self.end_state = self.parse_table.end_states[start]
27 | self.states = self.parse_table.states
28 |
29 | self.callbacks = callbacks
30 | self.start = start
31 |
32 | class ParserState(Generic[StateT]):
33 | __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'
34 |
35 | parse_conf: ParseConf[StateT]
36 | lexer: LexerThread
37 | state_stack: List[StateT]
38 | value_stack: list
39 |
40 | def __init__(self, parse_conf: ParseConf[StateT], lexer: LexerThread, state_stack=None, value_stack=None):
41 | self.parse_conf = parse_conf
42 | self.lexer = lexer
43 | self.state_stack = state_stack or [self.parse_conf.start_state]
44 | self.value_stack = value_stack or []
45 |
46 | @property
47 | def position(self) -> StateT:
48 | return self.state_stack[-1]
49 |
50 | # Necessary for match_examples() to work
51 | def __eq__(self, other) -> bool:
52 | if not isinstance(other, ParserState):
53 | return NotImplemented
54 | return len(self.state_stack) == len(other.state_stack) and self.position == other.position
55 |
56 | def __copy__(self):
57 | return self.copy()
58 |
59 | def copy(self, deepcopy_values=True) -> 'ParserState[StateT]':
60 | return type(self)(
61 | self.parse_conf,
62 | self.lexer, # XXX copy
63 | copy(self.state_stack),
64 | deepcopy(self.value_stack) if deepcopy_values else copy(self.value_stack),
65 | )
66 |
67 | def feed_token(self, token: Token, is_end=False) -> Any:
68 | state_stack = self.state_stack
69 | value_stack = self.value_stack
70 | states = self.parse_conf.states
71 | end_state = self.parse_conf.end_state
72 | callbacks = self.parse_conf.callbacks
73 |
74 | while True:
75 | state = state_stack[-1]
76 | try:
77 | action, arg = states[state][token.type]
78 | except KeyError:
79 | expected = {s for s in states[state].keys() if s.isupper()}
80 | raise UnexpectedToken(token, expected, state=self, interactive_parser=None)
81 |
82 | assert arg != end_state
83 |
84 | if action is Shift:
85 | # shift once and return
86 | assert not is_end
87 | state_stack.append(arg)
88 | value_stack.append(token if token.type not in callbacks else callbacks[token.type](token))
89 | return
90 | else:
91 | # reduce+shift as many times as necessary
92 | rule = arg
93 | size = len(rule.expansion)
94 | if size:
95 | s = value_stack[-size:]
96 | del state_stack[-size:]
97 | del value_stack[-size:]
98 | else:
99 | s = []
100 |
101 | value = callbacks[rule](s) if callbacks else s
102 |
103 | _action, new_state = states[state_stack[-1]][rule.origin.name]
104 | assert _action is Shift
105 | state_stack.append(new_state)
106 | value_stack.append(value)
107 |
108 | if is_end and state_stack[-1] == end_state:
109 | return value_stack[-1]
110 | ###}
111 |
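ParserState.feed_token() is also what the interactive parser drives, one token
at a time. A sketch of stepping it through the public API:

    # Sketch: stepping the LALR state machine by hand.
    from lark import Lark

    grammar = '''
    start: NAME "=" NAME
    %import common.CNAME -> NAME
    %ignore " "
    '''
    parser = Lark(grammar, parser='lalr')
    ip = parser.parse_interactive("x = y")
    print(ip.accepts())   # terminals the current state can shift, e.g. {'NAME'}
    ip.exhaust_lexer()    # calls feed_token() for every remaining token
    tree = ip.feed_eof()  # feeds $END, triggering the final reduces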
--------------------------------------------------------------------------------
/lark/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lark-parser/lark/87bb8efe0d425187159b39fd788640da33d1878e/lark/py.typed
--------------------------------------------------------------------------------
/lark/reconstruct.py:
--------------------------------------------------------------------------------
1 | """This is an experimental tool for reconstructing text from a shaped tree, based on a Lark grammar.
2 | """
3 |
4 | from typing import Dict, Callable, Iterable, Optional
5 |
6 | from .lark import Lark
7 | from .tree import Tree, ParseTree
8 | from .visitors import Transformer_InPlace
9 | from .lexer import Token, PatternStr, TerminalDef
10 | from .grammar import Terminal, NonTerminal, Symbol
11 |
12 | from .tree_matcher import TreeMatcher, is_discarded_terminal
13 | from .utils import is_id_continue
14 |
15 | def is_iter_empty(i):
16 | try:
17 | _ = next(i)
18 | return False
19 | except StopIteration:
20 | return True
21 |
22 |
23 | class WriteTokensTransformer(Transformer_InPlace):
24 | "Inserts discarded tokens into their correct place, according to the rules of grammar"
25 |
26 | tokens: Dict[str, TerminalDef]
27 | term_subs: Dict[str, Callable[[Symbol], str]]
28 |
29 | def __init__(self, tokens: Dict[str, TerminalDef], term_subs: Dict[str, Callable[[Symbol], str]]) -> None:
30 | self.tokens = tokens
31 | self.term_subs = term_subs
32 |
33 | def __default__(self, data, children, meta):
34 | if not getattr(meta, 'match_tree', False):
35 | return Tree(data, children)
36 |
37 | iter_args = iter(children)
38 | to_write = []
39 | for sym in meta.orig_expansion:
40 | if is_discarded_terminal(sym):
41 | try:
42 | v = self.term_subs[sym.name](sym)
43 | except KeyError:
44 | t = self.tokens[sym.name]
45 | if not isinstance(t.pattern, PatternStr):
46 | raise NotImplementedError("Reconstructing regexps not supported yet: %s" % t)
47 |
48 | v = t.pattern.value
49 | to_write.append(v)
50 | else:
51 | x = next(iter_args)
52 | if isinstance(x, list):
53 | to_write += x
54 | else:
55 | if isinstance(x, Token):
56 | assert Terminal(x.type) == sym, x
57 | else:
58 | assert NonTerminal(x.data) == sym, (sym, x)
59 | to_write.append(x)
60 |
61 | assert is_iter_empty(iter_args)
62 | return to_write
63 |
64 |
65 | class Reconstructor(TreeMatcher):
66 | """
67 | A Reconstructor that will, given a full parse Tree, generate source code.
68 |
69 | Note:
70 | The reconstructor cannot generate values from regexps. If you need to produce discarded
71 | regexes, such as newlines, use `term_subs` and provide default values for them.
72 |
73 | Parameters:
74 | parser: a Lark instance
75 | term_subs: a dictionary of [Terminal name as str] to [output text as str]
76 | """
77 |
78 | write_tokens: WriteTokensTransformer
79 |
80 | def __init__(self, parser: Lark, term_subs: Optional[Dict[str, Callable[[Symbol], str]]]=None) -> None:
81 | TreeMatcher.__init__(self, parser)
82 |
83 | self.write_tokens = WriteTokensTransformer({t.name:t for t in self.tokens}, term_subs or {})
84 |
85 | def _reconstruct(self, tree):
86 | unreduced_tree = self.match_tree(tree, tree.data)
87 |
88 | res = self.write_tokens.transform(unreduced_tree)
89 | for item in res:
90 | if isinstance(item, Tree):
91 | # TODO use orig_expansion.rulename to support templates
92 | yield from self._reconstruct(item)
93 | else:
94 | yield item
95 |
96 | def reconstruct(self, tree: ParseTree, postproc: Optional[Callable[[Iterable[str]], Iterable[str]]]=None, insert_spaces: bool=True) -> str:
97 | x = self._reconstruct(tree)
98 | if postproc:
99 | x = postproc(x)
100 | y = []
101 | prev_item = ''
102 | for item in x:
103 | if insert_spaces and prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]):
104 | y.append(' ')
105 | y.append(item)
106 | prev_item = item
107 | return ''.join(y)
108 |
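Typical round-trip usage, as a sketch (grammar and text are placeholders; the
parser must be created with maybe_placeholders=False, which the reconstructor
requires):

    from lark import Lark
    from lark.reconstruct import Reconstructor

    parser = Lark(grammar, maybe_placeholders=False)
    tree = parser.parse(original_text)
    regenerated = Reconstructor(parser).reconstruct(tree)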
--------------------------------------------------------------------------------
/lark/tools/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from argparse import ArgumentParser, FileType
3 | from textwrap import indent
4 | from logging import DEBUG, INFO, WARN, ERROR
5 | from typing import Optional
6 | import warnings
7 |
8 | from lark import Lark, logger
9 | try:
10 | from interegular import logger as interegular_logger
11 | has_interegular = True
12 | except ImportError:
13 | has_interegular = False
14 |
15 | lalr_argparser = ArgumentParser(add_help=False, epilog='Look at the Lark documentation for more info on the options')
16 |
17 | flags = [
18 | ('d', 'debug'),
19 | 'keep_all_tokens',
20 | 'regex',
21 | 'propagate_positions',
22 | 'maybe_placeholders',
23 | 'use_bytes'
24 | ]
25 |
26 | options = ['start', 'lexer']
27 |
28 | lalr_argparser.add_argument('-v', '--verbose', action='count', default=0, help="Increase Logger output level, up to three times")
29 | lalr_argparser.add_argument('-s', '--start', action='append', default=[])
30 | lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('basic', 'contextual'))
31 | lalr_argparser.add_argument('-o', '--out', type=FileType('w', encoding='utf-8'), default=sys.stdout, help='the output file (default=stdout)')
32 | lalr_argparser.add_argument('grammar_file', type=FileType('r', encoding='utf-8'), help='A valid .lark file')
33 |
34 | for flag in flags:
35 | if isinstance(flag, tuple):
36 | options.append(flag[1])
37 | lalr_argparser.add_argument('-' + flag[0], '--' + flag[1], action='store_true')
38 | elif isinstance(flag, str):
39 | options.append(flag)
40 | lalr_argparser.add_argument('--' + flag, action='store_true')
41 | else:
42 | raise NotImplementedError("flags must only contain strings or tuples of strings")
43 |
44 |
45 | def build_lalr(namespace):
46 | logger.setLevel((ERROR, WARN, INFO, DEBUG)[min(namespace.verbose, 3)])
47 | if has_interegular:
48 | interegular_logger.setLevel(logger.getEffectiveLevel())
49 | if len(namespace.start) == 0:
50 | namespace.start.append('start')
51 | kwargs = {n: getattr(namespace, n) for n in options}
52 | return Lark(namespace.grammar_file, parser='lalr', **kwargs), namespace.out
53 |
54 |
55 | def showwarning_as_comment(message, category, filename, lineno, file=None, line=None):
56 | # Based on warnings._showwarnmsg_impl
57 | text = warnings.formatwarning(message, category, filename, lineno, line)
58 | text = indent(text, '# ')
59 | if file is None:
60 | file = sys.stderr
61 | if file is None:
62 | return
63 | try:
64 | file.write(text)
65 | except OSError:
66 | pass
67 |
68 |
69 | def make_warnings_comments():
70 | warnings.showwarning = showwarning_as_comment
71 |
--------------------------------------------------------------------------------
/lark/tools/serialize.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 |
4 | from lark.grammar import Rule
5 | from lark.lexer import TerminalDef
6 | from lark.tools import lalr_argparser, build_lalr
7 |
8 | import argparse
9 |
10 | argparser = argparse.ArgumentParser(prog='python -m lark.tools.serialize', parents=[lalr_argparser],
11 | description="Lark Serialization Tool - Stores Lark's internal state & LALR analysis as a JSON file",
12 | epilog='Look at the Lark documentation for more info on the options')
13 |
14 |
15 | def serialize(lark_inst, outfile):
16 | data, memo = lark_inst.memo_serialize([TerminalDef, Rule])
17 | outfile.write('{\n')
18 | outfile.write(' "data": %s,\n' % json.dumps(data))
19 | outfile.write(' "memo": %s\n' % json.dumps(memo))
20 | outfile.write('}\n')
21 |
22 |
23 | def main():
24 | if len(sys.argv)==1:
25 | argparser.print_help(sys.stderr)
26 | sys.exit(1)
27 | ns = argparser.parse_args()
28 | serialize(*build_lalr(ns))
29 |
30 |
31 | if __name__ == '__main__':
32 | main()
33 |
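
The same output can be produced programmatically; a short sketch (the inline grammar is made up for illustration):

    import io
    import json
    from lark import Lark
    from lark.tools.serialize import serialize

    parser = Lark('start: "a"+', parser='lalr')  # serialize() expects a LALR parser
    buf = io.StringIO()
    serialize(parser, buf)
    payload = json.loads(buf.getvalue())
    assert set(payload) == {'data', 'memo'}

From the shell, `python -m lark.tools.serialize grammar.lark -o out.json` writes the same structure, with `-o` coming from `lalr_argparser` above.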
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.2.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "lark"
7 | authors = [{name = "Erez Shinan", email = "erezshin@gmail.com"}]
8 | license = {text = "MIT"}
9 | description = "a modern parsing library"
10 | keywords = ["Earley", "LALR", "parser", "parsing", "ast"]
11 | classifiers = [
12 | "Development Status :: 5 - Production/Stable",
13 | "Intended Audience :: Developers",
14 | "Programming Language :: Python :: 3",
15 | "Topic :: Software Development :: Libraries :: Python Modules",
16 | "Topic :: Text Processing :: General",
17 | "Topic :: Text Processing :: Linguistic",
18 | "License :: OSI Approved :: MIT License",
19 | ]
20 | requires-python = ">=3.8"
21 | dependencies = []
22 | dynamic = ["version"]
23 |
24 | [project.readme]
25 | text = """
26 | Lark is a modern general-purpose parsing library for Python.
27 | With Lark, you can parse any context-free grammar, efficiently, with very little code.
28 | Main Features:
29 | - Builds a parse-tree (AST) automagically, based on the structure of the grammar
30 | - Earley parser
31 | - Can parse all context-free grammars
32 | - Full support for ambiguous grammars
33 | - LALR(1) parser
34 | - Fast and light, competitive with PLY
35 | - Can generate a stand-alone parser
36 | - CYK parser, for highly ambiguous grammars
37 | - EBNF grammar
38 | - Unicode fully supported
39 | - Automatic line & column tracking
40 | - Standard library of terminals (strings, numbers, names, etc.)
41 | - Import grammars from Nearley.js
42 | - Extensive test suite
43 | - And much more!
44 | Since version 1.2, only Python versions 3.8 and up are supported."""
45 | content-type = "text/markdown"
46 |
47 | [project.urls]
48 | Homepage = "https://github.com/lark-parser/lark"
49 | Download = "https://github.com/lark-parser/lark/tarball/master"
50 |
51 | [project.entry-points.pyinstaller40]
52 | hook-dirs = "lark.__pyinstaller:get_hook_dirs"
53 |
54 | [project.optional-dependencies]
55 | regex = ["regex"]
56 | nearley = ["js2py"]
57 | atomic_cache = ["atomicwrites"]
58 | interegular = ["interegular>=0.3.1,<0.4.0"]
59 |
60 | [tool.setuptools]
61 | packages = [
62 | "lark",
63 | "lark.parsers",
64 | "lark.tools",
65 | "lark.grammars",
66 | "lark.__pyinstaller",
67 | ]
68 | include-package-data = true
69 |
70 | [tool.setuptools.package-data]
71 | "*" = ["*.lark"]
72 | lark = ["py.typed"]
73 |
74 | [tool.setuptools.dynamic]
75 | version = {attr = "lark.__version__"}
76 |
77 | [tool.mypy]
78 | files = "lark"
79 | python_version = "3.8"
80 | show_error_codes = true
81 | enable_error_code = ["ignore-without-code", "unused-ignore"]
82 | exclude = [
83 | "^lark/__pyinstaller",
84 | ]
85 |
86 | # Ignore missing imports or control per-module/per-file settings here
87 | [[tool.mypy.overrides]]
88 | module = [ "js2py" ]
89 | ignore_missing_imports = true
90 |
91 | [tool.coverage.report]
92 | exclude_lines = [
93 | "pragma: no cover",
94 | "if TYPE_CHECKING:"
95 | ]
96 | [tool.pyright]
97 | include = ["lark"]
98 |
99 | [tool.pytest.ini_options]
100 | minversion = "6.0"
101 | addopts = "-ra -q"
102 | testpaths = [
103 | "tests"
104 | ]
105 | python_files = "__main__.py"
106 |
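
A note on the optional dependencies: each group above is a pip extra, e.g. `pip install "lark[interegular]"` pulls in the terminal-collision checker exercised by tests/test_logger.py, and `pip install "lark[regex]"` enables the `regex` module as a drop-in for `re`. The core package itself declares no runtime dependencies (`dependencies = []`).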
--------------------------------------------------------------------------------
/readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | formats: all
4 |
5 | build:
6 | os: ubuntu-22.04
7 | tools:
8 |     python: "3.8"
9 |
10 | python:
11 |
12 | install:
13 | - requirements: docs/requirements.txt
14 |
15 | # Build documentation in the docs/ directory with Sphinx
16 | sphinx:
17 | configuration: docs/conf.py
18 |
--------------------------------------------------------------------------------
/test-requirements.txt:
--------------------------------------------------------------------------------
1 | interegular>=0.3.1,<0.4.0
2 | Js2Py==0.68
3 | regex
4 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lark-parser/lark/87bb8efe0d425187159b39fd788640da33d1878e/tests/__init__.py
--------------------------------------------------------------------------------
/tests/__main__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, print_function
2 |
3 | import unittest
4 | import logging
5 | import sys
6 | from lark import logger
7 |
8 | from .test_trees import TestTrees
9 | from .test_tools import TestStandalone
10 | from .test_cache import TestCache
11 | from .test_grammar import TestGrammar
12 | from .test_reconstructor import TestReconstructor
13 | from .test_tree_forest_transformer import TestTreeForestTransformer
14 | from .test_lexer import TestLexer
15 | from .test_python_grammar import TestPythonParser
16 | from .test_tree_templates import * # We define __all__ to list which TestSuites to run
17 |
18 | try:
19 | from .test_nearley.test_nearley import TestNearley
20 | except ImportError:
21 | logger.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)")
22 |
23 | # from .test_selectors import TestSelectors
24 | # from .test_grammars import TestPythonG, TestConfigG
25 |
26 | from .test_logger import Testlogger
27 |
28 | from .test_parser import * # We define __all__ to list which TestSuites to run
29 |
30 | if sys.version_info >= (3, 10):
31 | from .test_pattern_matching import TestPatternMatching
32 |
33 | logger.setLevel(logging.INFO)
34 |
35 | if __name__ == '__main__':
36 | unittest.main()
37 |
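
With this module in place, the whole suite runs as `python -m tests` (which is what tox invokes below). Since `unittest.main()` reads `sys.argv`, a single case should also be selectable by name, e.g. `python -m tests Testlogger.test_debug`, resolved against the classes imported above.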
--------------------------------------------------------------------------------
/tests/grammars/ab.lark:
--------------------------------------------------------------------------------
1 | startab: expr
2 |
3 | expr: A B
4 | | A expr B
5 |
6 | A: "a"
7 | B: "b"
8 |
9 | %import common.WS
10 | %ignore WS
11 |
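
For reference, this grammar accepts the language a^n b^n (n >= 1). A quick sketch of loading it directly (path relative to the repository root; note the non-default start rule):

    from lark import Lark

    with open('tests/grammars/ab.lark') as f:
        p = Lark(f.read(), start='startab')

    p.parse('a a b b')  # ok: expr -> A expr B -> A (A B) B
    # p.parse('aab')    # would raise: the a's and b's must be balanced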
--------------------------------------------------------------------------------
/tests/grammars/leading_underscore_grammar.lark:
--------------------------------------------------------------------------------
1 | A: "A"
2 |
3 | _SEP: "x"
4 | _a: A
5 |
6 | c: _a _SEP
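
A sketch of what the leading underscores do here, assuming Lark's usual conventions (`_`-prefixed rules are inlined and `_`-prefixed terminals are filtered out of the tree):

    from lark import Lark

    with open('tests/grammars/leading_underscore_grammar.lark') as f:
        p = Lark(f.read(), start='c')

    print(p.parse('Ax'))  # Tree('c', [Token('A', 'A')]) -- no _SEP token, no _a node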
--------------------------------------------------------------------------------
/tests/grammars/templates.lark:
--------------------------------------------------------------------------------
1 | sep{item, delim}: item (delim item)*
--------------------------------------------------------------------------------
/tests/grammars/test.lark:
--------------------------------------------------------------------------------
1 | %import common.NUMBER
2 | %import common.WORD
3 | %import common.WS
4 |
--------------------------------------------------------------------------------
/tests/grammars/test_relative_import_of_nested_grammar.lark:
--------------------------------------------------------------------------------
1 |
2 | start: rule_to_import
3 |
4 | %import .test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import
--------------------------------------------------------------------------------
/tests/grammars/test_relative_import_of_nested_grammar__grammar_to_import.lark:
--------------------------------------------------------------------------------
1 |
2 | rule_to_import: NESTED_TERMINAL
3 |
4 | %import .test_relative_import_of_nested_grammar__nested_grammar.NESTED_TERMINAL
5 |
--------------------------------------------------------------------------------
/tests/grammars/test_relative_import_of_nested_grammar__nested_grammar.lark:
--------------------------------------------------------------------------------
1 | NESTED_TERMINAL: "N"
2 |
--------------------------------------------------------------------------------
/tests/grammars/test_unicode.lark:
--------------------------------------------------------------------------------
1 | UNICODE : /[a-zØ-öø-ÿ]/
--------------------------------------------------------------------------------
/tests/grammars/three_rules_using_same_token.lark:
--------------------------------------------------------------------------------
1 | %import common.INT
2 |
3 | a: A
4 | b: A
5 | c: A
6 |
7 | A: "A"
--------------------------------------------------------------------------------
/tests/test_lexer.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase, main
2 |
3 | from lark import Lark, Tree, TextSlice
4 |
5 |
6 | class TestLexer(TestCase):
7 | def setUp(self):
8 | pass
9 |
10 | def test_basic(self):
11 | p = Lark("""
12 | start: "a" "b" "c" "d"
13 | %ignore " "
14 | """)
15 |
16 | res = list(p.lex("abc cba dd"))
17 | assert res == list('abccbadd')
18 |
19 | res = list(p.lex("abc cba dd", dont_ignore=True))
20 | assert res == list('abc cba dd')
21 |
22 | def test_subset_lex(self):
23 | p = Lark("""
24 | start: "a" "b" "c" "d"
25 | %ignore " "
26 | """)
27 |
28 | res = list(p.lex(TextSlice("xxxabc cba ddxx", 3, -2)))
29 | assert res == list('abccbadd')
30 |
31 | res = list(p.lex(TextSlice("aaaabc cba dddd", 3, -2)))
32 | assert res == list('abccbadd')
33 |
34 |
35 | if __name__ == '__main__':
36 | main()
37 |
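
A note on `test_subset_lex`: `TextSlice(text, 3, -2)` restricts lexing to `text[3:-2]` without copying the string, with negative indices following Python slice conventions. Both calls therefore tokenize exactly "abc cba dd", regardless of the padding characters, and token positions are reported relative to the full original string.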
--------------------------------------------------------------------------------
/tests/test_logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from contextlib import contextmanager
3 | from lark import Lark, logger
4 | from unittest import TestCase, main, skipIf
5 |
6 | from io import StringIO
7 |
8 | try:
9 | import interegular
10 | except ImportError:
11 | interegular = None
12 |
13 | @contextmanager
14 | def capture_log():
15 | stream = StringIO()
16 | orig_handler = logger.handlers[0]
17 | del logger.handlers[:]
18 | logger.addHandler(logging.StreamHandler(stream))
19 | yield stream
20 | del logger.handlers[:]
21 | logger.addHandler(orig_handler)
22 |
23 | class Testlogger(TestCase):
24 |
25 | def test_debug(self):
26 | logger.setLevel(logging.DEBUG)
27 | collision_grammar = '''
28 | start: as as
29 | as: a*
30 | a: "a"
31 | '''
32 | with capture_log() as log:
33 | Lark(collision_grammar, parser='lalr', debug=True)
34 |
35 | log = log.getvalue()
36 |         # since there are conflicts involving A,
37 |         # the symbol A should appear in the log message as a hint
38 | self.assertIn("A", log)
39 |
40 | def test_non_debug(self):
41 | logger.setLevel(logging.WARNING)
42 | collision_grammar = '''
43 | start: as as
44 | as: a*
45 | a: "a"
46 | '''
47 | with capture_log() as log:
48 | Lark(collision_grammar, parser='lalr', debug=False)
49 | log = log.getvalue()
50 | # no log message
51 | self.assertEqual(log, "")
52 |
53 | def test_loglevel_higher(self):
54 | logger.setLevel(logging.ERROR)
55 | collision_grammar = '''
56 | start: as as
57 | as: a*
58 | a: "a"
59 | '''
60 | with capture_log() as log:
61 | Lark(collision_grammar, parser='lalr', debug=True)
62 | log = log.getvalue()
63 | # no log message
64 | self.assertEqual(len(log), 0)
65 |
66 | @skipIf(interegular is None, "interegular is not installed, can't test regex collisions")
67 | def test_regex_collision(self):
68 | logger.setLevel(logging.WARNING)
69 | collision_grammar = '''
70 | start: A | B
71 | A: /a+/
72 | B: /(a|b)+/
73 | '''
74 | with capture_log() as log:
75 | Lark(collision_grammar, parser='lalr')
76 |
77 | log = log.getvalue()
78 | # since there are conflicts between A and B
79 | # symbols A and B should appear in the log message
80 | self.assertIn("A", log)
81 | self.assertIn("B", log)
82 |
83 | @skipIf(interegular is None, "interegular is not installed, can't test regex collisions")
84 | def test_no_regex_collision(self):
85 | logger.setLevel(logging.WARNING)
86 | collision_grammar = '''
87 | start: A " " B
88 | A: /a+/
89 | B: /(a|b)+/
90 | '''
91 | with capture_log() as log:
92 | Lark(collision_grammar, parser='lalr')
93 |
94 | log = log.getvalue()
95 | self.assertEqual(log, "")
96 |
97 |
98 | if __name__ == '__main__':
99 | main()
100 |
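
One caveat with `capture_log` above: if the body raises, the original handler is never restored. A slightly more defensive variant (a sketch; identical on the success path):

    import logging
    from contextlib import contextmanager
    from io import StringIO
    from lark import logger

    @contextmanager
    def capture_log():
        stream = StringIO()
        orig_handler = logger.handlers[0]
        del logger.handlers[:]
        logger.addHandler(logging.StreamHandler(stream))
        try:
            yield stream
        finally:
            # restore the original handler even if the test body raised
            del logger.handlers[:]
            logger.addHandler(orig_handler)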
--------------------------------------------------------------------------------
/tests/test_nearley/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lark-parser/lark/87bb8efe0d425187159b39fd788640da33d1878e/tests/test_nearley/__init__.py
--------------------------------------------------------------------------------
/tests/test_nearley/grammars/include_unicode.ne:
--------------------------------------------------------------------------------
1 | @include "unicode.ne"
2 |
3 | main -> x
4 |
--------------------------------------------------------------------------------
/tests/test_nearley/grammars/unicode.ne:
--------------------------------------------------------------------------------
1 | x -> "±a"
2 |
--------------------------------------------------------------------------------
/tests/test_nearley/test_nearley.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 |
4 | import unittest
5 | import logging
6 | import os
7 | import codecs
8 |
9 | from lark import logger
10 | from lark.tools.nearley import create_code_for_nearley_grammar, main as nearley_tool_main
11 |
12 | logger.setLevel(logging.INFO)
13 |
14 | TEST_PATH = os.path.abspath(os.path.dirname(__file__))
15 | NEARLEY_PATH = os.path.join(TEST_PATH, 'nearley')
16 | BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin')
17 |
18 | if not os.path.exists(BUILTIN_PATH):
19 |     logger.warning("Nearley not included. Skipping Nearley tests! (run `git submodule update --init` to add it)")
20 | raise ImportError("Skipping Nearley tests!")
21 |
22 | try:
23 | import js2py # Ensures that js2py exists, to avoid failing tests
24 | except RuntimeError as e:
25 | if "python version" in str(e):
26 | raise ImportError("js2py does not support this python version")
27 | raise
28 |
29 |
30 | class TestNearley(unittest.TestCase):
31 | def test_css(self):
32 | fn = os.path.join(NEARLEY_PATH, 'examples/csscolor.ne')
33 | with open(fn) as f:
34 | grammar = f.read()
35 |
36 | code = create_code_for_nearley_grammar(grammar, 'csscolor', BUILTIN_PATH, os.path.dirname(fn))
37 | d = {}
38 |         exec(code, d)
39 | parse = d['parse']
40 |
41 | c = parse('#a199ff')
42 | assert c['r'] == 161
43 | assert c['g'] == 153
44 | assert c['b'] == 255
45 |
46 | c = parse('rgb(255, 70%, 3)')
47 | assert c['r'] == 255
48 | assert c['g'] == 178
49 | assert c['b'] == 3
50 |
51 | def test_include(self):
52 | fn = os.path.join(NEARLEY_PATH, 'test/grammars/folder-test.ne')
53 | with open(fn) as f:
54 | grammar = f.read()
55 |
56 | code = create_code_for_nearley_grammar(grammar, 'main', BUILTIN_PATH, os.path.dirname(fn))
57 | d = {}
58 |         exec(code, d)
59 | parse = d['parse']
60 |
61 | parse('a')
62 | parse('b')
63 |
64 | def test_multi_include(self):
65 | fn = os.path.join(NEARLEY_PATH, 'test/grammars/multi-include-test.ne')
66 | with open(fn) as f:
67 | grammar = f.read()
68 |
69 | code = create_code_for_nearley_grammar(grammar, 'main', BUILTIN_PATH, os.path.dirname(fn))
70 | d = {}
71 |         exec(code, d)
72 | parse = d['parse']
73 |
74 | parse('a')
75 | parse('b')
76 | parse('c')
77 |
78 | def test_utf8(self):
79 | grammar = u'main -> "±a"'
80 | code = create_code_for_nearley_grammar(grammar, 'main', BUILTIN_PATH, './')
81 | d = {}
82 |         exec(code, d)
83 | parse = d['parse']
84 |
85 | parse(u'±a')
86 |
87 | def test_backslash(self):
88 | grammar = r'main -> "\""'
89 | code = create_code_for_nearley_grammar(grammar, 'main', BUILTIN_PATH, './')
90 | d = {}
91 |         exec(code, d)
92 | parse = d['parse']
93 | parse(u'"')
94 |
95 | def test_null(self):
96 | grammar = r'main -> "a" | null'
97 | code = create_code_for_nearley_grammar(grammar, 'main', BUILTIN_PATH, './')
98 | d = {}
99 |         exec(code, d)
100 | parse = d['parse']
101 | parse('a')
102 | parse('')
103 |
104 | def test_utf8_2(self):
105 | fn = os.path.join(TEST_PATH, 'grammars/unicode.ne')
106 | nearley_tool_main(fn, 'x', NEARLEY_PATH)
107 |
108 | def test_include_utf8(self):
109 | fn = os.path.join(TEST_PATH, 'grammars/include_unicode.ne')
110 | nearley_tool_main(fn, 'main', NEARLEY_PATH)
111 |
112 |
113 | if __name__ == '__main__':
114 | unittest.main()
115 |
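
A minimal self-contained sketch of the conversion API these tests exercise (the nearley checkout path is a placeholder):

    from lark.tools.nearley import create_code_for_nearley_grammar

    BUILTINS = '/path/to/nearley/builtin'  # placeholder: a nearley git checkout
    code = create_code_for_nearley_grammar('main -> "hi"', 'main', BUILTINS, './')
    ns = {}
    exec(code, ns)
    ns['parse']('hi')

The same conversion is also exposed on the command line via `python -m lark.tools.nearley` (see `main` in lark/tools/nearley.py for the exact arguments).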
--------------------------------------------------------------------------------
/tests/test_pattern_matching.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase, main
2 |
3 | from lark import Token, Tree
4 |
5 |
6 | class TestPatternMatching(TestCase):
7 | token = Token('A', 'a')
8 |
9 | def setUp(self):
10 | pass
11 |
12 | def test_matches_with_string(self):
13 | match self.token:
14 | case 'a':
15 | pass
16 | case _:
17 | assert False
18 |
19 | def test_matches_with_str_positional_arg(self):
20 | match self.token:
21 | case str('a'):
22 | pass
23 | case _:
24 | assert False
25 |
26 | def test_matches_with_token_positional_arg(self):
27 | match self.token:
28 | case Token('a'):
29 | assert False
30 | case Token('A'):
31 | pass
32 | case _:
33 | assert False
34 |
35 | def test_matches_with_token_kwarg_type(self):
36 | match self.token:
37 | case Token(type='A'):
38 | pass
39 | case _:
40 | assert False
41 |
42 | def test_matches_with_bad_token_type(self):
43 | match self.token:
44 | case Token(type='B'):
45 | assert False
46 | case _:
47 | pass
48 |
49 | def test_match_on_tree(self):
50 | tree1 = Tree('a', [Tree(x, y) for x, y in zip('bcd', 'xyz')])
51 | tree2 = Tree('a', [
52 | Tree('b', [Token('T', 'x')]),
53 | Tree('c', [Token('T', 'y')]),
54 | Tree('d', [Tree('z', [Token('T', 'zz'), Tree('zzz', 'zzz')])]),
55 | ])
56 |
57 | match tree1:
58 | case Tree('X', []):
59 | assert False
60 | case Tree('a', []):
61 | assert False
62 | case Tree(_, 'b'):
63 | assert False
64 | case Tree('X', _):
65 | assert False
66 | tree = Tree('q', [Token('T', 'x')])
67 | match tree:
68 | case Tree('q', [Token('T', 'x')]):
69 | pass
70 | case _:
71 | assert False
72 | tr = Tree('a', [Tree('b', [Token('T', 'a')])])
73 | match tr:
74 | case Tree('a', [Tree('b', [Token('T', 'a')])]):
75 | pass
76 | case _:
77 | assert False
78 | # test nested trees
79 | match tree2:
80 | case Tree('a', [
81 | Tree('b', [Token('T', 'x')]),
82 | Tree('c', [Token('T', 'y')]),
83 | Tree('d', [
84 | Tree('z', [
85 | Token('T', 'zz'),
86 | Tree('zzz', 'zzz')
87 | ])
88 | ])
89 | ]):
90 | pass
91 | case _:
92 | assert False
93 |
94 |
95 |
96 | if __name__ == '__main__':
97 | main()
98 |
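
A note on why `case 'a'` matches in test_matches_with_string: `Token` subclasses `str`, so string patterns and `str(...)` patterns match against the token's value, while `Token(...)` positional patterns match against its `type` first, as test_matches_with_token_positional_arg shows.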
--------------------------------------------------------------------------------
/tests/test_reconstructor.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import json
4 | import sys
5 | import unittest
6 | from itertools import product
7 | from unittest import TestCase
8 |
9 | from lark import Lark
10 | from lark.reconstruct import Reconstructor
11 |
12 | common = """
13 | %import common (WS_INLINE, NUMBER, WORD)
14 | %ignore WS_INLINE
15 | """
16 |
17 |
18 | def _remove_ws(s):
19 | return s.replace(' ', '').replace('\n', '')
20 |
21 |
22 | class TestReconstructor(TestCase):
23 |
24 | def assert_reconstruct(self, grammar, code, **options):
25 | parser = Lark(grammar, parser='lalr', maybe_placeholders=False, **options)
26 | tree = parser.parse(code)
27 | new = Reconstructor(parser).reconstruct(tree)
28 | self.assertEqual(_remove_ws(code), _remove_ws(new))
29 |
30 | def test_starred_rule(self):
31 | g = """
32 | start: item*
33 | item: NL
34 | | rule
35 | rule: WORD ":" NUMBER
36 | NL: /(\\r?\\n)+\\s*/
37 | """ + common
38 |
39 | code = """
40 | Elephants: 12
41 | """
42 |
43 | self.assert_reconstruct(g, code)
44 |
45 | def test_starred_group(self):
46 | g = """
47 | start: (rule | NL)*
48 | rule: WORD ":" NUMBER
49 | NL: /(\\r?\\n)+\\s*/
50 | """ + common
51 |
52 | code = """
53 | Elephants: 12
54 | """
55 |
56 | self.assert_reconstruct(g, code)
57 |
58 | def test_alias(self):
59 | g = """
60 | start: line*
61 | line: NL
62 | | rule
63 | | "hello" -> hi
64 | rule: WORD ":" NUMBER
65 | NL: /(\\r?\\n)+\\s*/
66 | """ + common
67 |
68 | code = """
69 | Elephants: 12
70 | hello
71 | """
72 |
73 | self.assert_reconstruct(g, code)
74 |
75 | def test_keep_tokens(self):
76 | g = """
77 | start: (NL | stmt)*
78 | stmt: var op var
79 | !op: ("+" | "-" | "*" | "/")
80 | var: WORD
81 | NL: /(\\r?\\n)+\\s*/
82 | """ + common
83 |
84 | code = """
85 | a+b
86 | """
87 |
88 | self.assert_reconstruct(g, code)
89 |
90 | def test_expand_rule(self):
91 | g = """
92 | ?start: (NL | mult_stmt)*
93 | ?mult_stmt: sum_stmt ["*" sum_stmt]
94 | ?sum_stmt: var ["+" var]
95 | var: WORD
96 | NL: /(\\r?\\n)+\\s*/
97 | """ + common
98 |
99 | code = ['a', 'a*b', 'a+b', 'a*b+c', 'a+b*c', 'a+b*c+d']
100 |
101 | for c in code:
102 | self.assert_reconstruct(g, c)
103 |
104 | def test_json_example(self):
105 | test_json = '''
106 | {
107 | "empty_object" : {},
108 | "empty_array" : [],
109 | "booleans" : { "YES" : true, "NO" : false },
110 | "numbers" : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
111 | "strings" : [ "This", [ "And" , "That", "And a \\"b" ] ],
112 | "nothing" : null
113 | }
114 | '''
115 |
116 | json_grammar = r"""
117 | ?start: value
118 |
119 | ?value: object
120 | | array
121 | | string
122 | | SIGNED_NUMBER -> number
123 | | "true" -> true
124 | | "false" -> false
125 | | "null" -> null
126 |
127 | array : "[" [value ("," value)*] "]"
128 | object : "{" [pair ("," pair)*] "}"
129 | pair : string ":" value
130 |
131 | string : ESCAPED_STRING
132 |
133 | %import common.ESCAPED_STRING
134 | %import common.SIGNED_NUMBER
135 | %import common.WS
136 |
137 | %ignore WS
138 | """
139 |
140 | json_parser = Lark(json_grammar, parser='lalr', maybe_placeholders=False)
141 | tree = json_parser.parse(test_json)
142 |
143 | new_json = Reconstructor(json_parser).reconstruct(tree)
144 | self.assertEqual(json.loads(new_json), json.loads(test_json))
145 |
146 | def test_keep_all_tokens(self):
147 | g = """
148 | start: "a"? _B? c? _d?
149 | _B: "b"
150 | c: "c"
151 | _d: "d"
152 | """
153 | examples = list(map(''.join, product(('', 'a'), ('', 'b'), ('', 'c'), ('', 'd'), )))
154 | for code in examples:
155 | self.assert_reconstruct(g, code, keep_all_tokens=True)
156 |
157 | def test_switch_grammar_unicode_terminal(self):
158 | """
159 |         This test checks that a parse tree built with a grammar containing only ASCII characters can be reconstructed
160 |         with a grammar that has Unicode rules (or vice versa). The original bug assigned ANON terminals to Unicode
161 |         keywords, which offset the ANON terminal count in the Unicode grammar and caused subsequent identical ANON
162 |         tokens (e.g., `+=`) to mismatch between the two grammars.
163 | """
164 |
165 | g1 = """
166 | start: (NL | stmt)*
167 | stmt: "keyword" var op var
168 | !op: ("+=" | "-=" | "*=" | "/=")
169 | var: WORD
170 | NL: /(\\r?\\n)+\\s*/
171 | """ + common
172 |
173 | g2 = """
174 | start: (NL | stmt)*
175 | stmt: "குறிப்பு" var op var
176 | !op: ("+=" | "-=" | "*=" | "/=")
177 | var: WORD
178 | NL: /(\\r?\\n)+\\s*/
179 | """ + common
180 |
181 | code = """
182 | keyword x += y
183 | """
184 |
185 | l1 = Lark(g1, parser='lalr', maybe_placeholders=False)
186 | l2 = Lark(g2, parser='lalr', maybe_placeholders=False)
187 | r = Reconstructor(l2)
188 |
189 | tree = l1.parse(code)
190 | code2 = r.reconstruct(tree)
191 | assert l2.parse(code2) == tree
192 |
193 |
194 | if __name__ == '__main__':
195 | unittest.main()
196 |
--------------------------------------------------------------------------------
/tests/test_relative_import.lark:
--------------------------------------------------------------------------------
1 | start: NUMBER WORD
2 |
3 | %import .grammars.test.NUMBER
4 | %import common.WORD
5 | %import common.WS
6 | %ignore WS
7 |
8 |
--------------------------------------------------------------------------------
/tests/test_relative_import_preserves_leading_underscore.lark:
--------------------------------------------------------------------------------
1 | start: c
2 |
3 | %import .grammars.leading_underscore_grammar.c
--------------------------------------------------------------------------------
/tests/test_relative_import_rename.lark:
--------------------------------------------------------------------------------
1 | start: N WORD
2 |
3 | %import .grammars.test.NUMBER -> N
4 | %import common.WORD
5 | %import common.WS
6 | %ignore WS
7 |
8 |
--------------------------------------------------------------------------------
/tests/test_relative_import_rules_dependencies_imported_only_once.lark:
--------------------------------------------------------------------------------
1 | %import .grammars.three_rules_using_same_token.a
2 | %import .grammars.three_rules_using_same_token.b
3 | %import .grammars.three_rules_using_same_token.c -> d
4 |
5 | start: a b d
6 |
--------------------------------------------------------------------------------
/tests/test_relative_import_unicode.lark:
--------------------------------------------------------------------------------
1 | start: UNICODE
2 |
3 | %import .grammars.test_unicode.UNICODE
--------------------------------------------------------------------------------
/tests/test_relative_multi_import.lark:
--------------------------------------------------------------------------------
1 | start: NUMBER WORD
2 |
3 | %import .grammars.test (NUMBER, WORD, WS)
4 | %ignore WS
5 |
--------------------------------------------------------------------------------
/tests/test_relative_rule_import.lark:
--------------------------------------------------------------------------------
1 | start: X expr Y
2 |
3 | X: "x"
4 | Y: "y"
5 |
6 | %import .grammars.ab.expr
7 |
8 |
--------------------------------------------------------------------------------
/tests/test_relative_rule_import_drop_ignore.lark:
--------------------------------------------------------------------------------
1 | start: X expr Y
2 |
3 | X: "x"
4 | Y: "y"
5 |
6 | %import .grammars.ab.expr
7 |
8 |
--------------------------------------------------------------------------------
/tests/test_relative_rule_import_rename.lark:
--------------------------------------------------------------------------------
1 | start: X ab Y
2 |
3 | X: "x"
4 | Y: "y"
5 |
6 | %import .grammars.ab.expr -> ab
7 |
8 |
--------------------------------------------------------------------------------
/tests/test_relative_rule_import_subrule.lark:
--------------------------------------------------------------------------------
1 | start: X startab Y
2 |
3 | X: "x"
4 | Y: "y"
5 |
6 | %import .grammars.ab.startab
7 |
8 |
--------------------------------------------------------------------------------
/tests/test_relative_rule_import_subrule_no_conflict.lark:
--------------------------------------------------------------------------------
1 | start: expr
2 |
3 | expr: X startab Y
4 |
5 | X: "x"
6 | Y: "y"
7 |
8 | %import .grammars.ab.startab
9 |
10 |
--------------------------------------------------------------------------------
/tests/test_templates_import.lark:
--------------------------------------------------------------------------------
1 | start: "[" sep{NUMBER, ","} "]"
2 | NUMBER: /\d+/
3 | %ignore " "
4 | %import .grammars.templates.sep
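
Instantiated, `sep{NUMBER, ","}` behaves like `NUMBER ("," NUMBER)*`. A self-contained sketch with the template inlined instead of imported:

    from lark import Lark

    g = r'''
    start: "[" sep{NUMBER, ","} "]"
    sep{item, delim}: item (delim item)*
    NUMBER: /\d+/
    %ignore " "
    '''
    Lark(g).parse('[1, 2, 3]')  # the template is expanded at each call site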
--------------------------------------------------------------------------------
/tests/test_tools.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, print_function
2 |
3 | from unittest import TestCase, main
4 |
5 | from lark import Lark
6 | from lark.tree import Tree
7 | from lark.tools import standalone
8 |
9 | from io import StringIO
10 |
11 |
12 | class TestStandalone(TestCase):
13 | def setUp(self):
14 | pass
15 |
16 | def _create_standalone(self, grammar, compress=False):
17 | code_buf = StringIO()
18 | standalone.gen_standalone(Lark(grammar, parser='lalr'), out=code_buf, compress=compress)
19 | code = code_buf.getvalue()
20 |
21 | context = {'__doc__': None, '__name__': 'test_standalone'}
22 | exec(code, context)
23 | return context
24 |
25 | def test_simple(self):
26 | grammar = """
27 | start: NUMBER WORD
28 |
29 | %import common.NUMBER
30 | %import common.WORD
31 | %import common.WS
32 | %ignore WS
33 |
34 | """
35 |
36 | context = self._create_standalone(grammar)
37 |
38 | _Lark = context['Lark_StandAlone']
39 | l = _Lark()
40 | x = l.parse('12 elephants')
41 | self.assertEqual(x.children, ['12', 'elephants'])
42 | x = l.parse('16 candles')
43 | self.assertEqual(x.children, ['16', 'candles'])
44 |
45 | self.assertRaises(context['UnexpectedToken'], l.parse, 'twelve monkeys')
46 | self.assertRaises(context['UnexpectedToken'], l.parse, 'twelve')
47 | self.assertRaises(context['UnexpectedCharacters'], l.parse, '$ talks')
48 |
49 | context = self._create_standalone(grammar, compress=True)
50 | _Lark = context['Lark_StandAlone']
51 | l = _Lark()
52 | x = l.parse('12 elephants')
53 |
54 | def test_interactive(self):
55 | grammar = """
56 | start: A+ B*
57 | A: "a"
58 | B: "b"
59 | """
60 | context = self._create_standalone(grammar)
61 | parser: Lark = context['Lark_StandAlone']()
62 |
63 | ip = parser.parse_interactive()
64 |
65 | UnexpectedToken = context['UnexpectedToken']
66 | Token = context['Token']
67 |
68 | self.assertRaises(UnexpectedToken, ip.feed_eof)
69 | self.assertRaises(TypeError, ip.exhaust_lexer)
70 | ip.feed_token(Token('A', 'a'))
71 | res = ip.feed_eof()
72 | self.assertEqual(res, Tree('start', ['a']))
73 |
74 | ip = parser.parse_interactive("ab")
75 |
76 | ip.exhaust_lexer()
77 |
78 | ip_copy = ip.copy()
79 | self.assertEqual(ip_copy.parser_state, ip.parser_state)
80 | self.assertEqual(ip_copy.lexer_thread.state, ip.lexer_thread.state)
81 | self.assertIsNot(ip_copy.parser_state, ip.parser_state)
82 | self.assertIsNot(ip_copy.lexer_thread.state, ip.lexer_thread.state)
83 | self.assertIsNot(ip_copy.lexer_thread.state.line_ctr, ip.lexer_thread.state.line_ctr)
84 |
85 | res = ip.feed_eof(ip.lexer_thread.state.last_token)
86 | self.assertEqual(res, Tree('start', ['a', 'b']))
87 | self.assertRaises(UnexpectedToken, ip.feed_eof)
88 |
89 | self.assertRaises(UnexpectedToken, ip_copy.feed_token, Token('A', 'a'))
90 | ip_copy.feed_token(Token('B', 'b'))
91 | res = ip_copy.feed_eof()
92 | self.assertEqual(res, Tree('start', ['a', 'b', 'b']))
93 |
94 | def test_contextual(self):
95 | grammar = """
96 | start: a b
97 | a: "A" "B"
98 | b: "AB"
99 | """
100 |
101 | context = self._create_standalone(grammar)
102 |
103 | _Lark = context['Lark_StandAlone']
104 | l = _Lark()
105 | x = l.parse('ABAB')
106 |
107 | _v_args = context['v_args']
108 | @_v_args(inline=True)
109 | class T(context['Transformer']):
110 | def a(self):
111 | return 'a'
112 | def b(self):
113 | return 'b'
114 |
115 | start = _v_args(inline=False)(list)
116 |
117 | x = T().transform(x)
118 | self.assertEqual(x, ['a', 'b'])
119 |
120 | l2 = _Lark(transformer=T())
121 | x = l2.parse('ABAB')
122 | self.assertEqual(x, ['a', 'b'])
123 |
124 | def test_postlex(self):
125 | from lark.indenter import Indenter
126 | class MyIndenter(Indenter):
127 | NL_type = '_NEWLINE'
128 | OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
129 | CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
130 | INDENT_type = '_INDENT'
131 | DEDENT_type = '_DEDENT'
132 | tab_len = 8
133 |
134 | grammar = r"""
135 | start: "(" ")" _NEWLINE
136 | _NEWLINE: /\n/
137 | """
138 |
139 | context = self._create_standalone(grammar)
140 | _Lark = context['Lark_StandAlone']
141 |
142 | l = _Lark(postlex=MyIndenter())
143 | x = l.parse('()\n')
144 | self.assertEqual(x, Tree('start', []))
145 | l = _Lark(postlex=MyIndenter())
146 | x = l.parse('(\n)\n')
147 | self.assertEqual(x, Tree('start', []))
148 |
149 | def test_transformer(self):
150 | grammar = r"""
151 | start: some_rule "(" SOME_TERMINAL ")"
152 | some_rule: SOME_TERMINAL
153 | SOME_TERMINAL: /[A-Za-z_][A-Za-z0-9_]*/
154 | """
155 | context = self._create_standalone(grammar)
156 | _Lark = context["Lark_StandAlone"]
157 |
158 | _Token = context["Token"]
159 | _Tree = context["Tree"]
160 |
161 | class MyTransformer(context["Transformer"]):
162 | def SOME_TERMINAL(self, token):
163 | return _Token("SOME_TERMINAL", "token is transformed")
164 |
165 | def some_rule(self, children):
166 | return _Tree("rule_is_transformed", [])
167 |
168 | parser = _Lark(transformer=MyTransformer())
169 | self.assertEqual(
170 | parser.parse("FOO(BAR)"),
171 | _Tree("start", [
172 | _Tree("rule_is_transformed", []),
173 | _Token("SOME_TERMINAL", "token is transformed")
174 | ])
175 | )
176 |
177 |
178 | if __name__ == '__main__':
179 | main()
180 |
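
Outside the test harness, the generator is usually driven from the shell, e.g. `python -m lark.tools.standalone my_grammar.lark -o my_parser.py` (the `-o`/`--out` flag comes from `lalr_argparser`). The generated module is then used like the exec'd `context` above; a sketch, where `my_parser` is the hypothetical module generated by that command:

    from my_parser import Lark_StandAlone, UnexpectedToken  # hypothetical generated module

    parser = Lark_StandAlone()
    tree = parser.parse('12 elephants')  # a Tree, as in test_simple above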
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = lint, type, py38, py39, py310, py311, py312, py313, pypy3
3 | skip_missing_interpreters = true
4 |
5 | [testenv]
6 |
7 | deps =
8 | -rtest-requirements.txt
9 | passenv =
10 | TERM
11 |
12 | # Always force recreation, to avoid unexpected side effects
13 | recreate = True
14 |
15 | # Required because the commands below use `git`
16 | allowlist_externals = git
17 |
18 | commands =
19 | git submodule sync -q
20 | git submodule update --init
21 | python -m tests {posargs}
22 |
23 | [testenv:type]
24 | description = run type check on code base
25 | skip_install = true
26 | recreate = false
27 | deps =
28 | mypy==1.10
29 | interegular>=0.3.1,<0.4.0
30 | types-atomicwrites
31 | types-regex
32 | rich<=13.4.1
33 | commands =
34 | mypy
35 |
36 |
37 | [testenv:lint]
38 | description = run linters on code base
39 | skip_install = true
40 | recreate = false
41 | deps =
42 | pre-commit
43 | commands =
44 | pre-commit run --all-files --show-diff-on-failure
45 |
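
Individual environments can be run with e.g. `tox -e lint` or `tox -e py312`; anything after `--` is forwarded to `python -m tests` through `{posargs}`.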
--------------------------------------------------------------------------------