├── .coveragerc ├── .flake8 ├── .github └── workflows │ ├── publish.yml │ └── test.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── conftest.py ├── data ├── Tokens ├── expr.gram ├── fullpy.gram ├── gather.gram ├── gram.gram ├── large.txt ├── medium.txt ├── python.gram ├── recursive.gram ├── small.txt ├── tiny.txt ├── top-pypi-packages-365-days.json ├── x.gram ├── x.txt ├── xl.txt └── xxl.txt ├── docs ├── Makefile ├── _static │ ├── logo.svg │ └── logo_simple.png ├── conf.py ├── grammar.rst ├── index.rst ├── make.bat └── peg_parsers.rst ├── media └── logo.svg ├── pegen └── py.typed ├── pyproject.toml ├── releasenotes.rst ├── scripts ├── __init__.py ├── ast_timings.py ├── download_pypi_packages.py ├── find_max_nesting.py ├── grammar_grapher.py ├── joinstats.py ├── show_parse.py ├── test_parse_directory.py └── test_pypi_packages.py ├── setup.py ├── src └── pegen │ ├── __init__.py │ ├── __main__.py │ ├── build.py │ ├── first_sets.py │ ├── grammar.py │ ├── grammar_parser.py │ ├── grammar_visualizer.py │ ├── metagrammar.gram │ ├── parser.py │ ├── parser_generator.py │ ├── py.typed │ ├── python_generator.py │ ├── sccutils.py │ ├── templates │ └── index.html │ ├── tokenizer.py │ ├── utils.py │ ├── validator.py │ └── web.py ├── stories ├── story1 │ ├── __init__.py │ ├── node.py │ ├── parser.py │ ├── test_parser.py │ ├── test_tokenizer.py │ ├── tokenizer.py │ └── toy.py ├── story2 │ ├── __init__.py │ ├── generator.py │ ├── generator2.py │ ├── generator3.py │ ├── grammar.py │ ├── main.py │ ├── memo.py │ ├── node.py │ ├── parser.py │ ├── test_grammar.py │ ├── test_parser.py │ ├── test_tokenizer.py │ ├── tokenizer.py │ ├── toy.gram │ └── toy.py ├── story3 │ ├── __init__.py │ ├── driver.py │ ├── generator.py │ ├── generator2.py │ ├── generator3.py │ ├── grammar.py │ ├── ifs.txt │ ├── in.txt │ ├── main.py │ ├── memo.py │ ├── node.py │ ├── parser.py │ ├── test_grammar.py │ ├── test_parser.py │ ├── test_tokenizer.py │ ├── tokenizer.py │ ├── toy.gram │ ├── toy.py │ ├── tty.gif │ └── visualizer.py ├── story4 │ ├── __init__.py │ ├── driver.py │ ├── generator3.py │ ├── grammar.py │ ├── in.txt │ ├── main.py │ ├── memo.py │ ├── node.py │ ├── parser.py │ ├── test_grammar.py │ ├── test_parser.py │ ├── test_tokenizer.py │ ├── tokenizer.py │ ├── toy.gram │ ├── toy.py │ └── visualizer.py ├── story5 │ ├── __init__.py │ ├── calc.gram │ ├── calc.py │ ├── calc.txt │ ├── driver.py │ ├── generator3.py │ ├── grammar.py │ ├── in.txt │ ├── main.py │ ├── memo.py │ ├── node.py │ ├── parser.py │ ├── test_grammar.py │ ├── test_parser.py │ ├── test_tokenizer.py │ ├── tokenizer.py │ ├── toy.gram │ ├── toy.py │ └── visualizer.py ├── story6 │ ├── __init__.py │ ├── calc.gram │ ├── calc.py │ ├── calc.txt │ ├── driver.py │ ├── generator3.py │ ├── grammar.gram │ ├── grammar.py │ ├── grammarparser.py │ ├── in.txt │ ├── main.py │ ├── memo.py │ ├── memo2.py │ ├── node.py │ ├── parser.py │ ├── test_grammar.py │ ├── test_parser.py │ ├── test_tokenizer.py │ ├── tokenizer.py │ ├── toy.gram │ ├── toy.py │ └── visualizer.py └── story7 │ ├── __init__.py │ ├── calc.gram │ ├── calc.py │ ├── calc.txt │ ├── driver.py │ ├── generator3.py │ ├── grammar.gram │ ├── grammar.py │ ├── grammarparser.py │ ├── in.txt │ ├── main.py │ ├── memo.py │ ├── memo2.py │ ├── node.py │ ├── parser.py │ ├── test_grammar.py │ ├── test_parser.py │ ├── test_tokenizer.py │ ├── tokenizer.py │ ├── toy.gram │ ├── toy.py │ └── visualizer.py ├── tests ├── __init__.py ├── demo.py ├── python_parser │ ├── __init__.py │ ├── conftest.py │ ├── data │ │ ├── 
advanced_decorators.py │ │ ├── assignment.py │ │ ├── async.py │ │ ├── call.py │ │ ├── classes.py │ │ ├── comprehensions.py │ │ ├── expressions.py │ │ ├── fstrings.py │ │ ├── function_def.py │ │ ├── imports.py │ │ ├── lambdas.py │ │ ├── multi_statement_per_line.py │ │ ├── no_newline_at_end_of_file.py │ │ ├── no_newline_at_end_of_file_with_comment.py │ │ ├── pattern_matching.py │ │ ├── simple_decorators.py │ │ ├── statements.py │ │ ├── try_except_group.py │ │ ├── type_comment.py │ │ ├── type_params.py │ │ └── with_statement_multi_items.py │ ├── parser_cache │ │ └── README │ ├── test_ast_parsing.py │ ├── test_syntax_error_handling.py │ └── test_unsupported_syntax.py ├── test_first_sets.py ├── test_grammar_validator.py ├── test_grammar_visitor.py ├── test_grammar_visualizer.py ├── test_pegen.py └── test_tokenizer.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | include = 4 | */pegen/* 5 | */tests/python_parser/parser_cache/* 6 | 7 | [report] 8 | # Regexes for lines to exclude from consideration 9 | exclude_lines = 10 | # Have to re-enable the standard pragma 11 | pragma: no cover 12 | 13 | # Don't complain if tests don't hit defensive assertion code: 14 | raise NotImplementedError() 15 | pass 16 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = 3 | .git, 4 | __pycache__, 5 | docs/conf.py, 6 | build, 7 | dist, 8 | src/pegen/grammar_parser.py, 9 | tests/python_parser/data, 10 | tests/python_parser/parser_cache, 11 | ignore = E203, E266, E501, W503, E731 12 | # line length is intentionally set to 80 here because pegen uses Bugbear 13 | # See https://github.com/psf/black/blob/master/README.md#line-length for more details 14 | max-line-length = 80 15 | max-complexity = 19 16 | select = B,C,E,F,W,T4,B9 -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Build and publish Python 🐍 distribution 📦 to PyPI and GH 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - 'v[0-9].[0-9]+.[0-9]+' 9 | 10 | jobs: 11 | 12 | build: 13 | name: Build Python 🐍 distribution 📦 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Set up Python 3.12 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: 3.12 22 | - name: Install pypa/build 23 | run: python -m pip install --user build 24 | - name: Build a binary wheel and a source tarball 25 | run: python -m build 26 | - name: Store the distribution packages 27 | uses: actions/upload-artifact@v3 28 | with: 29 | name: python-package-distributions 30 | path: dist/ 31 | 32 | publish: 33 | name: Publish Python 🐍 distributions 📦 to PyPI 34 | runs-on: ubuntu-latest 35 | needs: 36 | - build 37 | if: startsWith(github.ref, 'refs/tags') 38 | 39 | environment: 40 | name: pypi 41 | url: https://pypi.org/p/pegen 42 | permissions: 43 | id-token: write 44 | 45 | steps: 46 | - name: Download all the dists 47 | uses: actions/download-artifact@v3 48 | with: 49 | name: python-package-distributions 50 | path: dist/ 51 | - name: Publish distribution 📦 to PyPI 52 | uses: pypa/gh-action-pypi-publish@release/v1 53 | 54 | github-release: 55 | name: >- 56 | Sign the Python 🐍 distribution 📦 with Sigstore 57 | and create a GitHub Release 58 | runs-on: 
ubuntu-latest 59 | needs: 60 | - publish 61 | 62 | permissions: 63 | contents: write 64 | id-token: write 65 | 66 | steps: 67 | - name: Download all the dists 68 | uses: actions/download-artifact@v3 69 | with: 70 | name: python-package-distributions 71 | path: dist/ 72 | - name: Sign the dists with Sigstore 73 | uses: sigstore/gh-action-sigstore-python@v1.2.3 74 | with: 75 | inputs: >- 76 | ./dist/*.tar.gz 77 | ./dist/*.whl 78 | - name: Create GitHub Release 79 | env: 80 | GITHUB_TOKEN: ${{ github.token }} 81 | run: >- 82 | gh release create 83 | '${{ github.ref_name }}' 84 | --repo '${{ github.repository }}' 85 | --generate-notes 86 | - name: Upload artifact signatures to GitHub Release 87 | env: 88 | GITHUB_TOKEN: ${{ github.token }} 89 | run: >- 90 | gh release upload 91 | '${{ github.ref_name }}' dist/** 92 | --repo '${{ github.repository }}' 93 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Run test suite 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | run_tox: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ['3.8','3.9','3.10', '3.11', '3.12', '3.13'] 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Get history and tags for SCM versioning to work 20 | run: | 21 | git fetch --prune --unshallow 22 | git fetch --depth=1 origin +refs/tags/*:refs/tags/* 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v4 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Cache pip dependencies 28 | uses: actions/cache@v3 29 | with: 30 | path: ~/.cache/pip 31 | key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} 32 | restore-keys: | 33 | ${{ runner.os }}-pip- 34 | - name: Install Python dependencies 35 | run: | 36 | python3 -m pip install --upgrade pip 37 | python3 -m pip install tox tox-gh-actions 38 | - name: Tox 39 | run: tox 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Pycharm stuff 132 | .idea 133 | 134 | # Downloaded test data 135 | data/pypi 136 | 137 | # Temporary file for coverage measurement 138 | tests/python_parser/parser_cache/py_parser.py 139 | 140 | # Vscode stuff 141 | .vscode 142 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Pegen 2 | 3 | This project welcomes contributions in the form of Pull Requests. 4 | For clear bug-fixes / typos etc. just submit a PR. 5 | For new features or if there is any doubt in how to fix a bug, you might want 6 | to open an issue prior to starting work to discuss it first. 7 | 8 | ### Tests 9 | 10 | `pegen` uses [tox](https://pypi.org/project/tox/) to run the test suite. Make sure 11 | you have `tox` installed and then you can run the tests with the following command: 12 | 13 | ``` 14 | python -m tox 15 | ``` 16 | 17 | This will check that all the tests pass but also will make several checks on the code style 18 | and type annotations of the package. 19 | 20 | Additionally, if you want to just run the tests and you have `pytest` installed, you can run 21 | the tests directly by running: 22 | 23 | ``` 24 | python -m pytest tests 25 | ``` 26 | 27 | Or if you have `make`, run the following: 28 | 29 | ``` 30 | make check 31 | ``` 32 | 33 | New code should ideally have tests and not break existing tests. 34 | 35 | ### Type Checking 36 | 37 | `pegen` uses type annotations throughout, and `mypy` to do the checking. 
38 | Run the following to type check `pegen`: 39 | 40 | ``` 41 | python -m tox -e lint 42 | ``` 43 | 44 | Or if you have `make` and `mypy` is installed in your current Python environment: 45 | 46 | ``` 47 | make lint 48 | ``` 49 | 50 | Please add type annotations for all new code. 51 | 52 | ### Code Formatting 53 | 54 | `pegen` uses [`black`](https://github.com/psf/black) for code formatting. 55 | I recommend setting up black in your editor to format on save. 56 | 57 | To run black from the command line, use `make format` to format and write to the files. 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 we-like-parsers 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYTHON ?= python 2 | PIP_INSTALL=$(PYTHON) -m pip install 3 | DOCSBUILDDIR := docs/_build 4 | HTMLDIR := $(DOCSBUILDDIR)/html 5 | 6 | # Use this to inject arbitrary commands before the make targets (e.g. docker) 7 | ENV := 8 | 9 | .PHONY: dist 10 | dist: ## Generate Python distribution files 11 | $(PYTHON) -m pep517.build . 12 | 13 | .PHONY: install-sdist 14 | install-sdist: dist ## Install from source distribution 15 | $(ENV) $(PIP_INSTALL) $(wildcard dist/*.tar.gz) 16 | 17 | .PHONY: install 18 | test-install: ## Install with test dependencies 19 | $(ENV) $(PIP_INSTALL) -e .[test] 20 | 21 | .PHONY: test 22 | check: ## Run the test suite 23 | $(PYTHON) -m pytest -vvv --log-cli-level=info -s --color=yes $(PYTEST_ARGS) tests 24 | 25 | .PHONY: pycoverage 26 | pycoverage: ## Run the test suite, with Python code coverage 27 | $(PYTHON) -m pytest \ 28 | -vvv \ 29 | --log-cli-level=info \ 30 | -s \ 31 | --color=yes \ 32 | --cov=pegen \ 33 | --cov-config=tox.ini \ 34 | --cov-report=term \ 35 | --cov-append $(PYTEST_ARGS) \ 36 | tests 37 | 38 | .PHONY: format 39 | format: ## Format all files 40 | $(PYTHON) -m black src tests 41 | 42 | .PHONY: lint 43 | lint: ## Lint all files 44 | $(PYTHON) -m black --check src tests 45 | $(PYTHON) -m flake8 src tests 46 | $(PYTHON) -m mypy src/pegen 47 | 48 | .PHONY: clean 49 | clean: ## Clean any built/generated artifacts 50 | find . | grep -E '(\.o|\.so|\.gcda|\.gcno|\.gcov\.json\.gz)' | xargs rm -rf 51 | find . 
| grep -E '(__pycache__|\.pyc|\.pyo)' | xargs rm -rf 52 | 53 | .PHONY: regen-metaparser 54 | regen-metaparser: src/pegen/metagrammar.gram src/pegen/*.py # Regenerate the metaparser 55 | $(PYTHON) -m pegen -q src/pegen/metagrammar.gram -o src/pegen/grammar_parser.py 56 | $(PYTHON) -m black src/pegen/grammar_parser.py 57 | 58 | .PHONY: docs 59 | docs: ## Generate documentation 60 | $(MAKE) -C docs clean 61 | $(MAKE) -C docs html 62 | 63 | .PHONY: gh-pages 64 | gh-pages: ## Publish documentation on BBGitHub Pages 65 | $(eval GIT_REMOTE := $(shell git remote get-url $(UPSTREAM_GIT_REMOTE))) 66 | $(eval COMMIT_HASH := $(shell git rev-parse HEAD)) 67 | touch $(HTMLDIR)/.nojekyll 68 | @echo -n "Documentation ready, push to $(GIT_REMOTE)? [Y/n] " && read ans && [ $${ans:-Y} == Y ] 69 | git init $(HTMLDIR) 70 | GIT_DIR=$(HTMLDIR)/.git GIT_WORK_TREE=$(HTMLDIR) git add -A 71 | GIT_DIR=$(HTMLDIR)/.git git commit -m "Documentation for commit $(COMMIT_HASH)" 72 | GIT_DIR=$(HTMLDIR)/.git git push $(GIT_REMOTE) HEAD:gh-pages --force 73 | rm -rf $(HTMLDIR)/.git 74 | 75 | .PHONY: help 76 | help: ## Print this message 77 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 78 | 79 | demo: 80 | PYTHONPATH=$(shell pwd)/src:$(PYTHONPATH) $(PYTHON) -m pegen data/python.gram -o data/python_parser.py 81 | PYTHONPATH=$(shell pwd)/src:$(PYTHONPATH) $(PYTHON) data/python_parser.py -r tests/demo.py 82 | 83 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def pytest_configure(config): 5 | source_root = os.path.dirname(os.path.abspath(__file__)) 6 | if os.getcwd() != source_root: 7 | os.chdir(source_root) 8 | -------------------------------------------------------------------------------- /data/Tokens: -------------------------------------------------------------------------------- 1 | ENDMARKER 2 | NAME 3 | NUMBER 4 | STRING 5 | NEWLINE 6 | INDENT 7 | DEDENT 8 | 9 | LPAR '(' 10 | RPAR ')' 11 | LSQB '[' 12 | RSQB ']' 13 | COLON ':' 14 | COMMA ',' 15 | SEMI ';' 16 | PLUS '+' 17 | MINUS '-' 18 | STAR '*' 19 | SLASH '/' 20 | VBAR '|' 21 | AMPER '&' 22 | LESS '<' 23 | GREATER '>' 24 | EQUAL '=' 25 | DOT '.' 26 | PERCENT '%' 27 | LBRACE '{' 28 | RBRACE '}' 29 | EQEQUAL '==' 30 | NOTEQUAL '!=' 31 | LESSEQUAL '<=' 32 | GREATEREQUAL '>=' 33 | TILDE '~' 34 | CIRCUMFLEX '^' 35 | LEFTSHIFT '<<' 36 | RIGHTSHIFT '>>' 37 | DOUBLESTAR '**' 38 | PLUSEQUAL '+=' 39 | MINEQUAL '-=' 40 | STAREQUAL '*=' 41 | SLASHEQUAL '/=' 42 | PERCENTEQUAL '%=' 43 | AMPEREQUAL '&=' 44 | VBAREQUAL '|=' 45 | CIRCUMFLEXEQUAL '^=' 46 | LEFTSHIFTEQUAL '<<=' 47 | RIGHTSHIFTEQUAL '>>=' 48 | DOUBLESTAREQUAL '**=' 49 | DOUBLESLASH '//' 50 | DOUBLESLASHEQUAL '//=' 51 | AT '@' 52 | ATEQUAL '@=' 53 | RARROW '->' 54 | ELLIPSIS '...' 55 | COLONEQUAL ':=' 56 | 57 | OP 58 | AWAIT 59 | ASYNC 60 | TYPE_IGNORE 61 | TYPE_COMMENT 62 | FSTRING_START 63 | FSTRING_MIDDLE 64 | FSTRING_END 65 | ERRORTOKEN 66 | 67 | # These aren't used by the C tokenizer but are needed for tokenize.py 68 | COMMENT 69 | NL 70 | ENCODING 71 | -------------------------------------------------------------------------------- /data/expr.gram: -------------------------------------------------------------------------------- 1 | start: expr NEWLINE? 
ENDMARKER { ast.Expression(expr, lineno=1, col_offset=0) } 2 | expr: ( expr '+' term { ast.BinOp(expr, ast.Add(), term) } 3 | | expr '-' term { ast.BinOp(expr, ast.Sub(), term) } 4 | | term { term } 5 | ) 6 | term: ( l=term '*' r=factor { ast.BinOp(l, ast.Mult(), r) } 7 | | term '/' factor { ast.BinOp(term, ast.Div(), factor) } 8 | | factor { factor } 9 | ) 10 | factor: ('(' expr ')' { expr } 11 | | atom { atom } 12 | ) 13 | atom: ( NAME { ast.Name(id=name.string, ctx=ast.Load()) } 14 | | NUMBER { ast.Constant(value=ast.literal_eval(number.string)) } 15 | ) 16 | -------------------------------------------------------------------------------- /data/gather.gram: -------------------------------------------------------------------------------- 1 | start: 'import' names 2 | names: ','.NAME+ 3 | -------------------------------------------------------------------------------- /data/gram.gram: -------------------------------------------------------------------------------- 1 | start: rs=rules ENDMARKER { rs } 2 | 3 | rules: rs=rule+ { rs } 4 | 5 | rule: ( n=NAME ':' alts=alternatives NEWLINE { Tree('Rule', n, alts) } 6 | | n=NAME ':' NEWLINE INDENT alts=('|' alternatives NEWLINE)+ DEDENT 7 | { Tree('Rule', n, flatten(alts)) } 8 | ) 9 | 10 | alternatives: l=alt rs=('|' alt)* { Tree('Alts', l, *rs) } 11 | 12 | alt: it=(named_item | item)+ { Tree('Alt', it) } 13 | 14 | named_item: NAME '=' item { Tree('Named', name, item) } 15 | 16 | item: ('[' alternatives ']' { Tree('Opt', alternatives) } 17 | | atom ' '* '?' { Tree('Opt', atom) } 18 | | atom '*' { Tree('ZeroOrMore', atom) } 19 | | atom '1' { Tree('OneOrMore', atom) } 20 | | atom { atom } 21 | ) 22 | 23 | atom: ( '(' alternatives ')' { alternatives } 24 | | NAME { name } 25 | | STRING { string } 26 | ) 27 | -------------------------------------------------------------------------------- /data/recursive.gram: -------------------------------------------------------------------------------- 1 | start: rules ENDMARKER 2 | rules: rules rule | rule 3 | rule: NAME ':' alternatives NEWLINE 4 | alternatives: alternatives '|' alt_and_action | alt_and_action 5 | alt_and_action: alt action | alt 6 | alt: alt item | item 7 | item: optional | atom '*' | atom '+' | atom '?' 
| atom 8 | optional: '[' alternatives ']' 9 | atom: group | NAME | STRING 10 | group: '(' alternatives ')' 11 | 12 | action: "{" ~ target_atoms "}" { target_atoms } 13 | target_atoms: 14 | | target_atom target_atoms { target_atom + " " + target_atoms } 15 | | target_atom { target_atom } 16 | target_atom: 17 | | "{" ~ target_atoms "}" { "{" + target_atoms + "}" } 18 | | NAME { name.string } 19 | | NUMBER { number.string } 20 | | STRING { string.string } 21 | | !"}" OP { op.string } 22 | -------------------------------------------------------------------------------- /data/small.txt: -------------------------------------------------------------------------------- 1 | 1 + 2 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + ((((((11 * 12 * 13 * 14 * 15 + 16 * 17 + 18 * 19 * 20)))))) 2 | 2*3 + 4*5*6 3 | 12 + (2 * 3 * 4 * 5 + 6 + 7 * 8) 4 | 1 + 2 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + ((((((11 * 12 * 13 * 14 * 15 + 16 * 17 + 18 * 19 * 20)))))) 5 | 2*3 + 4*5*6 6 | 12 + (2 * 3 * 4 * 5 + 6 + 7 * 8) 7 | 1 + 2 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + ((((((11 * 12 * 13 * 14 * 15 + 16 * 17 + 18 * 19 * 20)))))) 8 | 2*3 + 4*5*6 9 | 12 + (2 * 3 * 4 * 5 + 6 + 7 * 8) 10 | 1 + 2 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + ((((((11 * 12 * 13 * 14 * 15 + 16 * 17 + 18 * 19 * 20)))))) 11 | -------------------------------------------------------------------------------- /data/tiny.txt: -------------------------------------------------------------------------------- 1 | 1 + 2 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + ((((((11 * 12 * 13 * 14 * 15 + 16 * 17 + 18 * 19 * 20)))))) 2 | -------------------------------------------------------------------------------- /data/x.gram: -------------------------------------------------------------------------------- 1 | start[expr_ty]: expr NEWLINE? ENDMARKER 2 | expr[expr_ty]: ( term '+' expr 3 | | (sign sign)+ term '-' expr 4 | | term 5 | ) 6 | sign: '+' | '-' 7 | term[expr_ty]: ( factor '*' term 8 | | factor '/' term 9 | | factor 10 | ) 11 | factor[expr_ty]: ('(' expr ')' 12 | | atom 13 | ) 14 | atom[expr_ty]: ( NAME 15 | | NUMBER 16 | ) 17 | -------------------------------------------------------------------------------- /data/x.txt: -------------------------------------------------------------------------------- 1 | --a-b 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/logo_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/we-like-parsers/pegen/3b9f936a30d6c929d2538437cdc0465fa521b8f3/docs/_static/logo_simple.png -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'pegen' 21 | copyright = '2021, Pablo Galindo Salgado, Guido van Rossum, Lysandros Nikolaou' 22 | author = 'Pablo Galindo Salgado, Guido van Rossum, Lysandros Nikolaou' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | ] 32 | 33 | # Add any paths that contain templates here, relative to this directory. 34 | templates_path = ['_templates'] 35 | 36 | # List of patterns, relative to source directory, that match files and 37 | # directories to ignore when looking for source files. 38 | # This pattern also affects html_static_path and html_extra_path. 39 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 40 | 41 | 42 | # -- Options for HTML output ------------------------------------------------- 43 | 44 | # The theme to use for HTML and HTML Help pages. See the documentation for 45 | # a list of builtin themes. 46 | # 47 | html_theme = 'furo' 48 | 49 | # Add any paths that contain custom static files (such as style sheets) here, 50 | # relative to this directory. They are copied after the builtin static files, 51 | # so a file named "default.css" will overwrite the builtin "default.css". 52 | html_static_path = ['_static'] 53 | html_theme_options = { 54 | "light_logo": "logo_simple.png", 55 | "dark_logo": "logo_simple.png", 56 | } 57 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. pegen documentation master file, created by 2 | sphinx-quickstart on Tue Sep 28 13:01:24 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. image:: _static/logo.svg 7 | :width: 600 8 | :align: center 9 | 10 | What is this? 11 | ============= 12 | 13 | Pegen is the parser generator used in CPython to produce the parser used 14 | by the interpreter. 
It allows to produce PEG parsers from a description 15 | of a formal Grammar. 16 | 17 | Installing 18 | ========== 19 | 20 | Install with ``pip`` or your favorite PyPi package manager. 21 | 22 | .. code-block:: 23 | 24 | pip install pegen 25 | 26 | How to generate a parser 27 | ======================== 28 | 29 | Given a grammar file compatible with ``pegen`` (you can write your own 30 | or start with one in the `data `__ directory), you can easily 31 | generate a parser by running: 32 | 33 | .. code-block:: 34 | 35 | python -m pegen -o parser.py 36 | 37 | This will generate a file called ``parser.py`` in the current directory. 38 | This can be used to parse code using the grammar that we just used: 39 | 40 | .. code-block:: 41 | 42 | python parser.py 43 | 44 | Differences with CPython’s Pegen 45 | ================================ 46 | 47 | This repository exists to distribute a version of the Python PEG parser 48 | generator used by CPython that can be installed via PyPI, with some 49 | improvements. Although the official PEG generator included in CPython 50 | can generate both Python and C code, this distribution of the generator 51 | only allows to generate Python code. This is due to the fact that the C 52 | code generated by the generator included in CPython includes a lot of 53 | implementation details and private headers that are not available for 54 | general usage. 55 | 56 | The official PEG generator for Python 3.9 and later is now included in 57 | the CPython repo under 58 | `Tools/peg_generator/ `__. 59 | 60 | See also `PEP 617 `__. 61 | 62 | Indices and tables 63 | ================== 64 | 65 | .. toctree:: 66 | 67 | peg_parsers 68 | grammar 69 | 70 | * :ref:`genindex` 71 | * :ref:`modindex` 72 | * :ref:`search` 73 | 74 | Developing 75 | ---------- 76 | 77 | We welcome contributions to ``pegen``. Check 78 | `CONTRIBUTING.md `__ 79 | to get an idea of how to contribute to the project. 80 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /pegen/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/we-like-parsers/pegen/3b9f936a30d6c929d2538437cdc0465fa521b8f3/pegen/py.typed -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | 2 | [project] 3 | name = "pegen" 4 | description = "CPython's PEG parser generator" 5 | readme = "README.md" 6 | requires-python = ">=3.8, <4" 7 | license = {file = "LICENSE"} 8 | authors = [ 9 | {name = "Guido van Rossum"}, 10 | {name = "Pablo Galindo", email = "pablogsal@gmail.com"}, 11 | {name = "Lysandros Nikolaou", email = "lisandrosnik@gmail.com"} 12 | ] 13 | maintainers = [ 14 | {name = "Matthieu C. Dartiailh", email = "m.dartiailh@gmail.com"} 15 | ] 16 | classifiers = [ 17 | "Development Status :: 3 - Alpha", 18 | "Intended Audience :: Developers", 19 | "Topic :: Software Development :: Compilers", 20 | "License :: OSI Approved :: MIT License", 21 | "Programming Language :: Python :: 3.8", 22 | "Programming Language :: Python :: 3.9", 23 | "Programming Language :: Python :: 3.10", 24 | "Programming Language :: Python :: 3.11", 25 | "Programming Language :: Python :: 3.12", 26 | "Programming Language :: Python :: 3.13", 27 | "Programming Language :: Python :: 3 :: Only", 28 | ] 29 | keywords = ["parser", "CPython", "PEG", "pegen"] 30 | dynamic=["version"] 31 | 32 | 33 | [project.optional-dependencies] 34 | docs = ["sphinx", "sphinx-copybutton", "furo"] 35 | lint = ["black", "flake8", "mypy"] 36 | test = ["pytest", "pytest-cov"] 37 | memory = ["psutil"] 38 | web = ["flask", "flask-wtf"] 39 | 40 | 41 | [project.urls] 42 | homepage = "https://github.com/we-like-parsers/pegen" 43 | documentation = "https://we-like-parsers.github.io/pegen/" 44 | source = "https://github.com/we-like-parsers/pegen" 45 | changelog = "https://github.com/we-like-parsers/pegen/releasenotes.rst" 46 | bug_reports = "https://github.com/we-like-parsers/pegen/issues" 47 | 48 | 49 | [build-system] 50 | requires = ["setuptools>=61.2", "wheel", "setuptools_scm[toml]>=3.4.3"] 51 | build-backend = "setuptools.build_meta" 52 | 53 | [tool.setuptools_scm] 54 | 55 | [tool.black] 56 | line-length = 99 57 | target_version = ['py38'] 58 | exclude = ''' 59 | ( 60 | /pegen/grammar_parser.py # generated file 61 | | /tests/python_parser/data/ # test files 62 | | /tests/python_parser/parser_cache/ # generated parser 63 | ) 64 | ''' 65 | 66 | 67 | [tool.pytest.ini_options] 68 | norecursedirs = [ 69 | "data/failset", 70 | "cpython" 71 | ] 72 | 73 | [tool.mypy] 74 | files = ["pegen", "scripts", "tests"] 75 | 76 | follow_imports = "error" 77 | no_implicit_optional = true 78 | strict_optional = true 79 | 80 | disallow_untyped_calls = true 81 | disallow_untyped_defs = true 82 | 83 | disallow_any_generics = true 84 | disallow_any_unimported = true 85 | disallow_incomplete_defs = true 86 | disallow_subclassing_any = true 87 | 88 | warn_unused_configs = true 89 | warn_unused_ignores = true 90 | warn_redundant_casts = true 91 | warn_no_return = true 92 | 93 | 
show_traceback = true 94 | show_error_codes = true 95 | 96 | [[tool.mypy.overrides]] 97 | module = [ 98 | "pegen.grammar_parser" 99 | ] 100 | strict_optional = false 101 | 102 | [[tool.mypy.overrides]] 103 | module = [ 104 | "psutil" 105 | ] 106 | ignore_missing_imports = true -------------------------------------------------------------------------------- /releasenotes.rst: -------------------------------------------------------------------------------- 1 | Release notes 2 | ============== 3 | 4 | 2023-11-14: Version 0.3.0 5 | ------------------------- 6 | 7 | - Replace nullable_visit with NullableVisitor (#86) 8 | - Implement nullable detection via NullableVisitor (#91) 9 | - Support Python 3.11 and 3.12 (#95) 10 | - Support Python 3.12 f-strings in grammar actions (#94, #96) 11 | - Fix typing in parser class and minor f-string fix (#97) 12 | - Improve CI/CD workflows (#98) 13 | 14 | 2023-01-18: Version 0.2.0 15 | ------------------------- 16 | 17 | - add delayed error inspection, invalid rules pass and recursive detection of 18 | invalid rules PR #60 19 | - remove generated file data/python_parser.py, and add demo target in Makefile PR #62 20 | - refactor dependencies to avoid extraneous dependencies by default PR #59 21 | - add documentation PR #43 #52 22 | - sort KEYWORDS to make output deterministic PR #44 23 | - update grammar_grapher with the new forced (&&) directive PR #57 24 | - fixed bug where tokenizer reported the last line of source as empty #77 25 | 26 | 2021-09-06: Version 0.1.0 27 | ------------------------- 28 | 29 | First numbered release -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # This exists to let mypy find modules here 2 | -------------------------------------------------------------------------------- /scripts/ast_timings.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import sys 3 | import time 4 | 5 | from pegen.utils import print_memstats 6 | 7 | 8 | def main() -> None: 9 | t0 = time.time() 10 | for filename in sys.argv[1:]: 11 | print(filename, end="\r") 12 | try: 13 | with open(filename) as file: 14 | source = file.read() 15 | tree = ast.parse(source, filename) 16 | except Exception as err: 17 | print(f"{filename}: {err.__class__.__name__}: {err}", file=sys.stderr) 18 | tok = None 19 | t1 = time.time() 20 | dt = t1 - t0 21 | print(f"Parsed in {dt:.3f} secs", file=sys.stderr) 22 | print_memstats() 23 | 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /scripts/download_pypi_packages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.8 2 | 3 | import argparse 4 | import json 5 | import os 6 | from typing import Any, Dict 7 | from urllib.request import urlretrieve 8 | 9 | argparser = argparse.ArgumentParser( 10 | prog="download_pypi_packages", 11 | description="Helper program to download PyPI packages", 12 | ) 13 | argparser.add_argument( 14 | "-n", "--number", type=int, default=100, help="Number of packages to download" 15 | ) 16 | argparser.add_argument( 17 | "-a", "--all", action="store_true", help="Download all packages listed in the json file" 18 | ) 19 | 20 | 21 | def load_json(filename: str) -> Dict[Any, Any]: 22 | with open(os.path.join("data", f"{filename}.json"), "r") as f: 23 | j = json.loads(f.read()) 24 | 
return j 25 | 26 | 27 | def remove_json(filename: str) -> None: 28 | path = os.path.join("data", f"{filename}.json") 29 | os.remove(path) 30 | 31 | 32 | def download_package_json(package_name: str) -> None: 33 | url = f"https://pypi.org/pypi/{package_name}/json" 34 | urlretrieve(url, os.path.join("data", f"{package_name}.json")) 35 | 36 | 37 | def download_package_code(name: str, package_json: Dict[Any, Any]) -> None: 38 | source_index = -1 39 | for idx, url_info in enumerate(package_json["urls"]): 40 | if url_info["python_version"] == "source": 41 | source_index = idx 42 | break 43 | filename = package_json["urls"][source_index]["filename"] 44 | url = package_json["urls"][source_index]["url"] 45 | urlretrieve(url, os.path.join("data", "pypi", filename)) 46 | 47 | 48 | def main() -> None: 49 | args = argparser.parse_args() 50 | number_packages = args.number 51 | all_packages = args.all 52 | 53 | top_pypi_packages = load_json("top-pypi-packages-365-days") 54 | if all_packages: 55 | top_pypi_packages = top_pypi_packages["rows"] 56 | elif number_packages >= 0 and number_packages <= 4000: 57 | top_pypi_packages = top_pypi_packages["rows"][:number_packages] 58 | else: 59 | raise AssertionError("Unknown value for NUMBER_OF_PACKAGES") 60 | 61 | try: 62 | os.mkdir(os.path.join("data", "pypi")) 63 | except FileExistsError: 64 | pass 65 | 66 | for package in top_pypi_packages: 67 | package_name = package["project"] 68 | 69 | print(f"Downloading JSON Data for {package_name}... ", end="") 70 | download_package_json(package_name) 71 | print("Done") 72 | 73 | package_json = load_json(package_name) 74 | try: 75 | print(f"Dowloading and compressing package {package_name} ... ", end="") 76 | download_package_code(package_name, package_json) 77 | print("Done") 78 | except (IndexError, KeyError): 79 | print(f"Could not locate source for {package_name}") 80 | continue 81 | finally: 82 | remove_json(package_name) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /scripts/find_max_nesting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.8 2 | """Find the maximum amount of nesting for an expression that can be parsed 3 | without causing a parse error. 4 | 5 | Starting at the INITIAL_NESTING_DEPTH, an expression containing n parenthesis 6 | around a 0 is generated then tested with both the C and Python parsers. We 7 | continue incrementing the number of parenthesis by 10 until both parsers have 8 | failed. As soon as a single parser fails, we stop testing that parser. 9 | 10 | The grammar file, initial nesting size, and amount by which the nested size is 11 | incremented on each success can be controlled by changing the GRAMMAR_FILE, 12 | INITIAL_NESTING_DEPTH, or NESTED_INCR_AMT variables. 
13 | 14 | Usage: python -m scripts.find_max_nesting 15 | """ 16 | import sys 17 | from pathlib import Path 18 | from tempfile import TemporaryDirectory 19 | from typing import Any 20 | 21 | sys.path.insert(0, ".") 22 | from pegen.build import build_parser 23 | from pegen.utils import generate_parser, generate_parser_c_extension, parse_string 24 | 25 | GRAMMAR_FILE = "data/python.gram" 26 | INITIAL_NESTING_DEPTH = 10 27 | NESTED_INCR_AMT = 10 28 | 29 | 30 | FAIL = "\033[91m" 31 | ENDC = "\033[0m" 32 | 33 | 34 | def check_nested_expr(nesting_depth: int, parser: Any, language: str) -> bool: 35 | expr = f"{'(' * nesting_depth}0{')' * nesting_depth}" 36 | 37 | try: 38 | if language == "Python": 39 | parse_string(expr, parser) 40 | else: 41 | parser.parse_string(expr) 42 | 43 | print(f"({language}) Nesting depth of {nesting_depth} is successful") 44 | 45 | return True 46 | except Exception as err: 47 | print(f"{FAIL}({language}) Failed with nesting depth of {nesting_depth}{ENDC}") 48 | print(f"{FAIL}\t{err}{ENDC}") 49 | return False 50 | 51 | 52 | def main() -> None: 53 | print(f"Testing {GRAMMAR_FILE} starting at nesting depth of {INITIAL_NESTING_DEPTH}...") 54 | 55 | with TemporaryDirectory() as tmp_dir: 56 | nesting_depth = INITIAL_NESTING_DEPTH 57 | rules, parser, tokenizer = build_parser(GRAMMAR_FILE) 58 | python_parser = generate_parser(rules) 59 | c_parser = generate_parser_c_extension(rules, Path(tmp_dir)) 60 | 61 | c_succeeded = True 62 | python_succeeded = True 63 | 64 | while c_succeeded or python_succeeded: 65 | expr = f"{'(' * nesting_depth}0{')' * nesting_depth}" 66 | 67 | if c_succeeded: 68 | c_succeeded = check_nested_expr(nesting_depth, c_parser, "C") 69 | if python_succeeded: 70 | python_succeeded = check_nested_expr(nesting_depth, python_parser, "Python") 71 | 72 | nesting_depth += NESTED_INCR_AMT 73 | 74 | sys.exit(1) 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /scripts/grammar_grapher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.8 2 | 3 | """ Convert a grammar into a dot-file suitable for use with GraphViz 4 | 5 | For example: 6 | Generate the GraphViz file: 7 | # scripts/grammar_grapher.py data/python.gram > python.gv 8 | 9 | Then generate the graph... 10 | 11 | # twopi python.gv -Tpng > python_twopi.png 12 | 13 | or 14 | 15 | # dot python.gv -Tpng > python_dot.png 16 | 17 | NOTE: The _dot_ and _twopi_ tools seem to produce the most useful results. 18 | The _circo_ tool is the worst of the bunch. Don't even bother. 
19 | """ 20 | 21 | import argparse 22 | import sys 23 | from typing import Any, List 24 | 25 | sys.path.insert(0, ".") 26 | 27 | from pegen.build import build_parser 28 | from pegen.grammar import ( 29 | Alt, 30 | Cut, 31 | Forced, 32 | Grammar, 33 | Group, 34 | Leaf, 35 | Lookahead, 36 | NamedItem, 37 | NameLeaf, 38 | Opt, 39 | Repeat, 40 | Rhs, 41 | Rule, 42 | ) 43 | 44 | argparser = argparse.ArgumentParser( 45 | prog="graph_grammar", 46 | description="Graph a grammar tree", 47 | ) 48 | argparser.add_argument("grammar_file", help="The grammar file to graph") 49 | 50 | 51 | def references_for_item(item: Any) -> List[Any]: 52 | if isinstance(item, Alt): 53 | return [_ref for _item in item.items for _ref in references_for_item(_item)] 54 | elif isinstance(item, Cut): 55 | return [] 56 | elif isinstance(item, Forced): 57 | return references_for_item(item.node) 58 | elif isinstance(item, Group): 59 | return references_for_item(item.rhs) 60 | elif isinstance(item, Lookahead): 61 | return references_for_item(item.node) 62 | elif isinstance(item, NamedItem): 63 | return references_for_item(item.item) 64 | 65 | # NOTE NameLeaf must be before Leaf 66 | elif isinstance(item, NameLeaf): 67 | if item.value == "ENDMARKER": 68 | return [] 69 | return [item.value] 70 | elif isinstance(item, Leaf): 71 | return [] 72 | 73 | elif isinstance(item, Opt): 74 | return references_for_item(item.node) 75 | elif isinstance(item, Repeat): 76 | return references_for_item(item.node) 77 | elif isinstance(item, Rhs): 78 | return [_ref for alt in item.alts for _ref in references_for_item(alt)] 79 | elif isinstance(item, Rule): 80 | return references_for_item(item.rhs) 81 | else: 82 | raise RuntimeError(f"Unknown item: {type(item)}") 83 | 84 | 85 | def main() -> None: 86 | args = argparser.parse_args() 87 | 88 | try: 89 | grammar, parser, tokenizer = build_parser(args.grammar_file) 90 | except Exception as err: 91 | print("ERROR: Failed to parse grammar file", file=sys.stderr) 92 | sys.exit(1) 93 | 94 | references = {} 95 | for name, rule in grammar.rules.items(): 96 | references[name] = set(references_for_item(rule)) 97 | 98 | # Flatten the start node if has only a single reference 99 | root_node = "start" 100 | if start := references.get("start"): 101 | if len(start) == 1: 102 | root_node = list(start)[0] 103 | del references["start"] 104 | 105 | print("digraph g1 {") 106 | print('\toverlap="scale";') # Force twopi to scale the graph to avoid overlaps 107 | print(f'\troot="{root_node}";') 108 | if start: 109 | print(f"\t{root_node} [color=green, shape=circle]") 110 | for name, refs in references.items(): 111 | if refs: # Ignore empty sets 112 | print(f"\t{name} -> {','.join(refs)};") 113 | print("}") 114 | 115 | 116 | if __name__ == "__main__": 117 | main() 118 | -------------------------------------------------------------------------------- /scripts/joinstats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.8 2 | 3 | """Produce a report about the most-memoable types. 4 | 5 | Reads a list of statistics from stdin. Each line must be two numbers, 6 | being a type and a count. We then read some other files and produce a 7 | list sorted by most frequent type. 8 | 9 | There should also be something to recognize left-recursive rules. 
10 | """ 11 | 12 | import os 13 | import re 14 | import sys 15 | from typing import Dict 16 | 17 | reporoot = os.path.dirname(os.path.dirname(__file__)) 18 | parse_c = os.path.join(reporoot, "peg_extension", "parse.c") 19 | 20 | 21 | class TypeMapper: 22 | """State used to map types to names.""" 23 | 24 | def __init__(self, filename: str) -> None: 25 | self.table: Dict[int, str] = {} 26 | with open(filename) as f: 27 | for line in f: 28 | match = re.match(r"#define (\w+)_type (\d+)", line) 29 | if match: 30 | name, type = match.groups() 31 | if "left" in line.lower(): 32 | name += " // Left-recursive" 33 | self.table[int(type)] = name 34 | 35 | def lookup(self, type: int) -> str: 36 | return self.table.get(type, str(type)) 37 | 38 | 39 | def main() -> None: 40 | mapper = TypeMapper(parse_c) 41 | table = [] 42 | filename = sys.argv[1] 43 | with open(filename) as f: 44 | for lineno, line in enumerate(f, 1): 45 | line = line.strip() 46 | if not line or line.startswith("#"): 47 | continue 48 | parts = line.split() 49 | # Extra fields ignored 50 | if len(parts) < 2: 51 | print(f"{lineno}: bad input ({line!r})") 52 | continue 53 | try: 54 | type, count = map(int, parts[:2]) 55 | except ValueError as err: 56 | print(f"{lineno}: non-integer input ({line!r})") 57 | continue 58 | table.append((type, count)) 59 | table.sort(key=lambda values: -values[1]) 60 | for type, count in table: 61 | print(f"{type:4d} {count:9d} {mapper.lookup(type)}") 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /scripts/test_pypi_packages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.8 2 | 3 | import argparse 4 | import os 5 | import glob 6 | import tarfile 7 | import zipfile 8 | import shutil 9 | import sys 10 | 11 | from typing import Generator, Any 12 | 13 | sys.path.insert(0, ".") 14 | from pegen import build 15 | from scripts import test_parse_directory 16 | 17 | argparser = argparse.ArgumentParser( 18 | prog="test_pypi_packages", 19 | description="Helper program to test parsing PyPI packages", 20 | ) 21 | argparser.add_argument( 22 | "-t", "--tree", action="count", help="Compare parse tree to official AST", default=0 23 | ) 24 | 25 | 26 | def get_packages() -> Generator[str, None, None]: 27 | all_packages = ( 28 | glob.glob("./data/pypi/*.tar.gz") 29 | + glob.glob("./data/pypi/*.zip") 30 | + glob.glob("./data/pypi/*.tgz") 31 | ) 32 | for package in all_packages: 33 | yield package 34 | 35 | 36 | def extract_files(filename: str) -> None: 37 | savedir = os.path.join("data", "pypi") 38 | if tarfile.is_tarfile(filename): 39 | tarfile.open(filename).extractall(savedir) 40 | elif zipfile.is_zipfile(filename): 41 | zipfile.ZipFile(filename).extractall(savedir) 42 | else: 43 | raise ValueError(f"Could not identify type of compressed file {filename}") 44 | 45 | 46 | def find_dirname(package_name: str) -> str: 47 | for name in os.listdir(os.path.join("data", "pypi")): 48 | full_path = os.path.join("data", "pypi", name) 49 | if os.path.isdir(full_path) and name in package_name: 50 | return full_path 51 | assert False # This is to fix mypy, should never be reached 52 | 53 | 54 | def run_tests(dirname: str, tree: int) -> int: 55 | return test_parse_directory.parse_directory( 56 | dirname, 57 | "data/python.gram", 58 | verbose=False, 59 | excluded_files=[ 60 | "*/failset/*", 61 | "*/failset/**", 62 | "*/failset/**/*", 63 | "*/test2to3/*", 64 | "*/test2to3/**/*", 65 | 
"*/bad*", 66 | "*/lib2to3/tests/data/*", 67 | ], 68 | skip_actions=False, 69 | tree_arg=tree, 70 | short=True, 71 | parser=None, 72 | ) 73 | 74 | 75 | def main() -> None: 76 | args = argparser.parse_args() 77 | tree = args.tree 78 | 79 | for package in get_packages(): 80 | print(f"Extracting files from {package}... ", end="") 81 | try: 82 | extract_files(package) 83 | print("Done") 84 | except ValueError as e: 85 | print(e) 86 | continue 87 | 88 | print(f"Trying to parse all python files ... ") 89 | dirname = find_dirname(package) 90 | status = run_tests(dirname, tree) 91 | if status == 0: 92 | print("Done") 93 | shutil.rmtree(dirname) 94 | else: 95 | print(f"Failed to parse {dirname}") 96 | 97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | # Minimal script allowing editable install 4 | setup() 5 | -------------------------------------------------------------------------------- /src/pegen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/we-like-parsers/pegen/3b9f936a30d6c929d2538437cdc0465fa521b8f3/src/pegen/__init__.py -------------------------------------------------------------------------------- /src/pegen/build.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import tokenize 3 | from typing import Dict, Set, Tuple 4 | 5 | from pegen.grammar import Grammar 6 | from pegen.grammar_parser import GeneratedParser as GrammarParser 7 | from pegen.parser import Parser 8 | from pegen.parser_generator import ParserGenerator 9 | from pegen.python_generator import PythonParserGenerator 10 | from pegen.tokenizer import Tokenizer 11 | 12 | MOD_DIR = pathlib.Path(__file__).resolve().parent 13 | 14 | TokenDefinitions = Tuple[Dict[int, str], Dict[str, int], Set[str]] 15 | 16 | 17 | def build_parser( 18 | grammar_file: str, verbose_tokenizer: bool = False, verbose_parser: bool = False 19 | ) -> Tuple[Grammar, Parser, Tokenizer]: 20 | with open(grammar_file) as file: 21 | tokenizer = Tokenizer(tokenize.generate_tokens(file.readline), verbose=verbose_tokenizer) 22 | parser = GrammarParser(tokenizer, verbose=verbose_parser) 23 | grammar = parser.start() 24 | 25 | if not grammar: 26 | raise parser.make_syntax_error(grammar_file) 27 | 28 | return grammar, parser, tokenizer 29 | 30 | 31 | def build_python_generator( 32 | grammar: Grammar, 33 | grammar_file: str, 34 | output_file: str, 35 | skip_actions: bool = False, 36 | ) -> ParserGenerator: 37 | with open(output_file, "w") as file: 38 | gen: ParserGenerator = PythonParserGenerator(grammar, file) # TODO: skip_actions 39 | gen.generate(grammar_file) 40 | return gen 41 | 42 | 43 | def build_python_parser_and_generator( 44 | grammar_file: str, 45 | output_file: str, 46 | verbose_tokenizer: bool = False, 47 | verbose_parser: bool = False, 48 | skip_actions: bool = False, 49 | ) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]: 50 | """Generate rules, python parser, tokenizer, parser generator for a given grammar 51 | 52 | Args: 53 | grammar_file (string): Path for the grammar file 54 | output_file (string): Path for the output file 55 | verbose_tokenizer (bool, optional): Whether to display additional output 56 | when generating the tokenizer. Defaults to False. 
57 | verbose_parser (bool, optional): Whether to display additional output 58 | when generating the parser. Defaults to False. 59 | skip_actions (bool, optional): Whether to pretend no rule has any actions. 60 | """ 61 | grammar, parser, tokenizer = build_parser(grammar_file, verbose_tokenizer, verbose_parser) 62 | gen = build_python_generator( 63 | grammar, 64 | grammar_file, 65 | output_file, 66 | skip_actions=skip_actions, 67 | ) 68 | return grammar, parser, tokenizer, gen 69 | -------------------------------------------------------------------------------- /src/pegen/grammar_visualizer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from typing import Any, Callable, Iterator 4 | 5 | from pegen.build import build_parser 6 | from pegen.grammar import Grammar, Rule 7 | 8 | argparser = argparse.ArgumentParser( 9 | prog="pegen", description="Pretty print the AST for a given PEG grammar" 10 | ) 11 | argparser.add_argument("filename", help="Grammar description") 12 | 13 | 14 | class ASTGrammarPrinter: 15 | def children(self, node: Rule) -> Iterator[Any]: 16 | for value in node: 17 | if isinstance(value, list): 18 | yield from value 19 | else: 20 | yield value 21 | 22 | def name(self, node: Rule) -> str: 23 | if not list(self.children(node)): 24 | return repr(node) 25 | return node.__class__.__name__ 26 | 27 | def print_grammar_ast(self, grammar: Grammar, printer: Callable[..., None] = print) -> None: 28 | for rule in grammar.rules.values(): 29 | printer(self.print_nodes_recursively(rule)) 30 | 31 | def print_nodes_recursively(self, node: Rule, prefix: str = "", istail: bool = True) -> str: 32 | children = list(self.children(node)) 33 | value = self.name(node) 34 | 35 | line = prefix + ("└──" if istail else "├──") + value + "\n" 36 | sufix = " " if istail else "│ " 37 | 38 | if not children: 39 | return line 40 | 41 | *children, last = children 42 | for child in children: 43 | line += self.print_nodes_recursively(child, prefix + sufix, False) 44 | line += self.print_nodes_recursively(last, prefix + sufix, True) 45 | 46 | return line 47 | 48 | 49 | def main() -> None: 50 | args = argparser.parse_args() 51 | 52 | try: 53 | grammar, parser, tokenizer = build_parser(args.filename) 54 | except Exception: 55 | print("ERROR: Failed to parse grammar file", file=sys.stderr) 56 | sys.exit(1) 57 | 58 | visitor = ASTGrammarPrinter() 59 | visitor.print_grammar_ast(grammar) 60 | 61 | 62 | if __name__ == "__main__": 63 | main() 64 | -------------------------------------------------------------------------------- /src/pegen/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/we-like-parsers/pegen/3b9f936a30d6c929d2538437cdc0465fa521b8f3/src/pegen/py.typed -------------------------------------------------------------------------------- /src/pegen/utils.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | import io 3 | import sys 4 | import textwrap 5 | import tokenize 6 | from typing import IO, Any, Dict, Final, Optional, Type, cast 7 | 8 | from pegen.grammar import Grammar 9 | from pegen.grammar_parser import GeneratedParser as GrammarParser 10 | from pegen.parser import Parser 11 | from pegen.python_generator import PythonParserGenerator 12 | from pegen.tokenizer import Tokenizer 13 | 14 | 15 | def import_file(full_name: str, path: str) -> Any: 16 | """Import a python module from a path""" 17 | 18 | 
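# Build a module spec from the file path, create a module object from it, and
# execute it so that its top-level definitions (e.g. a generated parser class)
# become available as attributes of the returned module.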
spec = importlib.util.spec_from_file_location(full_name, path) 19 | assert spec 20 | mod = importlib.util.module_from_spec(spec) 21 | 22 | # We assume this is not None and has an exec_module() method. 23 | # See https://docs.python.org/3/reference/import.html?highlight=exec_module#loading 24 | loader = cast(Any, spec.loader) 25 | loader.exec_module(mod) 26 | return mod 27 | 28 | 29 | def generate_parser( 30 | grammar: Grammar, parser_path: Optional[str] = None, parser_name: str = "GeneratedParser" 31 | ) -> Type[Parser]: 32 | # Generate a parser. 33 | out = io.StringIO() 34 | genr = PythonParserGenerator(grammar, out) 35 | genr.generate("") 36 | 37 | # Load the generated parser class. 38 | ns: Dict[str, Any] = {} 39 | if parser_path: 40 | with open(parser_path, "w") as f: 41 | f.write(out.getvalue()) 42 | mod = import_file("py_parser", parser_path) 43 | return getattr(mod, parser_name) 44 | else: 45 | exec(out.getvalue(), ns) 46 | return ns[parser_name] 47 | 48 | 49 | def run_parser(file: IO[bytes], parser_class: Type[Parser], *, verbose: bool = False) -> Any: 50 | # Run a parser on a file (stream). 51 | tokenizer = Tokenizer(tokenize.generate_tokens(file.readline)) # type: ignore # typeshed issue #3515 52 | parser = parser_class(tokenizer, verbose=verbose) 53 | result = parser.start() 54 | if result is None: 55 | raise parser.make_syntax_error("invalid syntax") 56 | return result 57 | 58 | 59 | def parse_string( 60 | source: str, parser_class: Type[Parser], *, dedent: bool = True, verbose: bool = False 61 | ) -> Any: 62 | # Run the parser on a string. 63 | if dedent: 64 | source = textwrap.dedent(source) 65 | file = io.StringIO(source) 66 | return run_parser(file, parser_class, verbose=verbose) # type: ignore # typeshed issue #3515 67 | 68 | 69 | def make_parser(source: str) -> Type[Parser]: 70 | # Combine parse_string() and generate_parser(). 71 | grammar = parse_string(source, GrammarParser) 72 | return generate_parser(grammar) 73 | 74 | 75 | def print_memstats() -> bool: 76 | MiB: Final = 2**20 77 | try: 78 | import psutil 79 | except ImportError: 80 | return False 81 | print("Memory stats:") 82 | process = psutil.Process() 83 | meminfo = process.memory_info() 84 | res = {} 85 | res["rss"] = meminfo.rss / MiB 86 | res["vms"] = meminfo.vms / MiB 87 | if sys.platform == "win32": 88 | res["maxrss"] = meminfo.peak_wset / MiB 89 | else: 90 | # See https://stackoverflow.com/questions/938733/total-memory-used-by-python-process 91 | import resource # Since it doesn't exist on Windows. 
92 | 93 | rusage = resource.getrusage(resource.RUSAGE_SELF) 94 | if sys.platform == "darwin": 95 | factor = 1 96 | else: 97 | factor = 1024 # Linux 98 | res["maxrss"] = rusage.ru_maxrss * factor / MiB 99 | for key, value in res.items(): 100 | print(f" {key:12.12s}: {value:10.0f} MiB") 101 | return True 102 | -------------------------------------------------------------------------------- /src/pegen/validator.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pegen import grammar 4 | from pegen.grammar import Alt, GrammarVisitor, Rhs, Rule 5 | 6 | 7 | class ValidationError(Exception): 8 | pass 9 | 10 | 11 | class GrammarValidator(GrammarVisitor): 12 | def __init__(self, grammar: grammar.Grammar) -> None: 13 | self.grammar = grammar 14 | self.rulename: Optional[str] = None 15 | 16 | def validate_rule(self, rulename: str, node: Rule) -> None: 17 | self.rulename = rulename 18 | self.visit(node) 19 | self.rulename = None 20 | 21 | 22 | class SubRuleValidator(GrammarValidator): 23 | def visit_Rhs(self, node: Rhs) -> None: 24 | for index, alt in enumerate(node.alts): 25 | alts_to_consider = node.alts[index + 1 :] 26 | for other_alt in alts_to_consider: 27 | self.check_intersection(alt, other_alt) 28 | 29 | def check_intersection(self, first_alt: Alt, second_alt: Alt) -> None: 30 | if str(second_alt).startswith(str(first_alt)): 31 | raise ValidationError( 32 | f"In {self.rulename} there is an alternative that will " 33 | f"never be visited:\n{second_alt}" 34 | ) 35 | 36 | 37 | def validate_grammar(the_grammar: grammar.Grammar) -> None: 38 | for validator_cls in GrammarValidator.__subclasses__(): 39 | validator = validator_cls(the_grammar) 40 | for rule_name, rule in the_grammar.rules.items(): 41 | validator.validate_rule(rule_name, rule) 42 | -------------------------------------------------------------------------------- /src/pegen/web.py: -------------------------------------------------------------------------------- 1 | import io 2 | import traceback 3 | 4 | from flask import Flask, cli, render_template # type: ignore 5 | from flask_wtf import FlaskForm # type: ignore 6 | from wtforms import SubmitField, TextAreaField # type: ignore 7 | from wtforms.validators import DataRequired # type: ignore 8 | 9 | from pegen.utils import make_parser, parse_string 10 | 11 | DEFAULT_GRAMMAR = """\ 12 | start: expr NEWLINE? 
ENDMARKER { expr } 13 | expr: 14 | | expr '+' term { expr + term } 15 | | expr '-' term { expr - term} 16 | | term 17 | term: 18 | | term '*' factor { term * factor } 19 | | term '/' factor { term / factor } 20 | | factor 21 | 22 | factor: 23 | | '(' expr ')' { expr } 24 | | atom { int(atom.string) } 25 | atom: NUMBER 26 | """ 27 | 28 | DEFAULT_SOURCE = "(1 + 2) * (3 - 6)" 29 | 30 | 31 | app = Flask(__name__) 32 | 33 | # Flask-WTF requires an encryption key - the string can be anything 34 | app.config["SECRET_KEY"] = "does_not_matter" 35 | 36 | 37 | class GrammarForm(FlaskForm): # type: ignore 38 | grammar = TextAreaField("PEG GRAMMAR", validators=[DataRequired()], default=DEFAULT_GRAMMAR) 39 | source = TextAreaField("PROGRAM", validators=[DataRequired()], default=DEFAULT_SOURCE) 40 | submit = SubmitField("Parse!") 41 | 42 | 43 | @app.route("/", methods=["GET", "POST"]) 44 | def index() -> None: 45 | # you must tell the variable 'form' what you named the class, above 46 | # 'form' is the variable name used in this template: index.html 47 | form = GrammarForm() 48 | form.grammar(class_="form-control") 49 | output_text = "\n" 50 | if form.validate_on_submit(): 51 | grammar_source = form.grammar.data 52 | program_source = form.source.data 53 | output = io.StringIO() 54 | try: 55 | parser_class = make_parser(grammar_source) 56 | result = parse_string(program_source, parser_class, verbose=False) 57 | print(result, file=output) 58 | except Exception: 59 | traceback.print_exc(file=output) 60 | output_text += output.getvalue() 61 | return render_template("index.html", form=form, output=output_text) 62 | 63 | 64 | if __name__ == "__main__": 65 | cli.show_server_banner = lambda *_: None 66 | app.run(debug=False) 67 | -------------------------------------------------------------------------------- /stories/story1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/we-like-parsers/pegen/3b9f936a30d6c929d2538437cdc0465fa521b8f3/stories/story1/__init__.py -------------------------------------------------------------------------------- /stories/story1/node.py: -------------------------------------------------------------------------------- 1 | class Node: 2 | 3 | def __init__(self, type, children): 4 | self.type = type 5 | self.children = children 6 | 7 | def __repr__(self): 8 | return f"Node({self.type}, {self.children})" 9 | 10 | def __eq__(self, other): 11 | if not isinstance(other, Node): 12 | return NotImplemented 13 | return self.type == other.type and self.children == other.children 14 | -------------------------------------------------------------------------------- /stories/story1/parser.py: -------------------------------------------------------------------------------- 1 | class Parser: 2 | 3 | def __init__(self, tokenizer): 4 | self.tokenizer = tokenizer 5 | 6 | def mark(self): 7 | return self.tokenizer.mark() 8 | 9 | def reset(self, pos): 10 | self.tokenizer.reset(pos) 11 | 12 | def expect(self, arg): 13 | token = self.tokenizer.peek_token() 14 | if token.type == arg or token.string == arg: 15 | return self.tokenizer.get_token() 16 | return None 17 | -------------------------------------------------------------------------------- /stories/story1/test_parser.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story1.tokenizer import Tokenizer 6 | from 
story1.parser import Parser 7 | from story1.toy import ToyParser 8 | 9 | def test_basic(): 10 | program = "f(42)" 11 | file = StringIO(program) 12 | tokengen = generate_tokens(file.readline) 13 | tok = Tokenizer(tokengen) 14 | p = Parser(tok) 15 | t = p.expect(NAME) 16 | assert t and t.string == "f" 17 | pos = p.mark() 18 | assert p.expect("(") 19 | t = p.expect(NUMBER) 20 | assert t and t.string == "42" 21 | assert p.expect(")") 22 | pos2 = p.mark() 23 | p.reset(pos) 24 | assert p.expect("(") 25 | assert p.expect(NUMBER) 26 | assert p.expect(")") 27 | p.reset(pos) 28 | 29 | assert p.expect("(") 30 | p.reset(pos2) 31 | assert p.expect(NEWLINE) 32 | assert p.expect(ENDMARKER) 33 | 34 | def test_toy(): 35 | program = "x - (y + z)" 36 | file = StringIO(program) 37 | tokengen = generate_tokens(file.readline) 38 | tok = Tokenizer(tokengen) 39 | p = ToyParser(tok) 40 | tree = p.statement() 41 | assert tree and tree.type == "sub" 42 | assert tree.children[0].type == NAME 43 | assert tree.children[1].type == "add" 44 | -------------------------------------------------------------------------------- /stories/story1/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, OP, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story1.tokenizer import Tokenizer 6 | 7 | def test_basic(): 8 | program = "f(42)" 9 | file = StringIO(program) 10 | tokengen = generate_tokens(file.readline) 11 | tok = Tokenizer(tokengen) 12 | def get(): 13 | return tok.get_token()[:2] 14 | assert get() == (NAME, "f") 15 | assert get() == (OP, "(") 16 | assert get() == (NUMBER, "42") 17 | assert get() == (OP, ")") 18 | assert get() == (NEWLINE, "") 19 | assert get() == (ENDMARKER, "") 20 | 21 | def test_mark_reset(): 22 | program = "f(42) + abc" 23 | file = StringIO(program) 24 | tokengen = generate_tokens(file.readline) 25 | tok = Tokenizer(tokengen) 26 | def get(): 27 | return tok.get_token()[:2] 28 | assert get() == (NAME, "f") 29 | pos = tok.mark() 30 | assert get() == (OP, "(") 31 | assert get() == (NUMBER, "42") 32 | assert get() == (OP, ")") 33 | pos2 = tok.mark() 34 | tok.reset(pos) 35 | assert get() == (OP, "(") 36 | assert get() == (NUMBER, "42") 37 | assert get() == (OP, ")") 38 | tok.reset(pos) 39 | assert get() == (OP, "(") 40 | tok.reset(pos2) # Forward 41 | assert get() == (OP, "+") 42 | assert get() == (NAME, "abc") 43 | tok.reset(pos) 44 | assert get() == (OP, "(") 45 | assert get() == (NUMBER, "42") 46 | assert get() == (OP, ")") 47 | assert get() == (OP, "+") 48 | assert get() == (NAME, "abc") 49 | -------------------------------------------------------------------------------- /stories/story1/tokenizer.py: -------------------------------------------------------------------------------- 1 | class Tokenizer: 2 | 3 | def __init__(self, tokengen): 4 | """Call with tokenize.generate_tokens(...).""" 5 | self.tokengen = tokengen 6 | self.tokens = [] 7 | self.pos = 0 8 | 9 | def mark(self): 10 | return self.pos 11 | 12 | def reset(self, pos): 13 | self.pos = pos 14 | 15 | def get_token(self): 16 | token = self.peek_token() 17 | self.pos += 1 18 | return token 19 | 20 | def peek_token(self): 21 | if self.pos == len(self.tokens): 22 | self.tokens.append(next(self.tokengen)) 23 | return self.tokens[self.pos] 24 | -------------------------------------------------------------------------------- /stories/story1/toy.py: 
-------------------------------------------------------------------------------- 1 | from token import NAME, NUMBER 2 | 3 | from story1.parser import Parser 4 | from story1.node import Node 5 | 6 | class ToyParser(Parser): 7 | 8 | def statement(self): 9 | if a := self.assignment(): 10 | return a 11 | if e := self.expr(): 12 | return e 13 | if i := self.if_statement(): 14 | return i 15 | return None 16 | 17 | def expr(self): 18 | if t := self.term(): 19 | pos = self.mark() 20 | if op := self.expect("+"): 21 | if e := self.expr(): 22 | return Node("add", [t, e]) 23 | self.reset(pos) 24 | if op := self.expect("-"): 25 | if e := self.expr(): 26 | return Node("sub", [t, e]) 27 | self.reset(pos) 28 | return t 29 | return None 30 | 31 | def term(self): 32 | if t := self.atom(): 33 | pos = self.mark() 34 | if op := self.expect("*"): 35 | if e := self.term(): 36 | return Node("mul", [t, e]) 37 | self.reset(pos) 38 | if op := self.expect("/"): 39 | if e := self.term(): 40 | return Node("div", [t, e]) 41 | self.reset(pos) 42 | return t 43 | return None 44 | 45 | def atom(self): 46 | if token := self.expect(NAME): 47 | return token 48 | if token := self.expect(NUMBER): 49 | return token 50 | pos = self.mark() 51 | if self.expect("("): 52 | if e := self.expr(): 53 | if self.expect(")"): 54 | return e 55 | self.reset(pos) 56 | return None 57 | 58 | def assignment(self): 59 | pos = self.mark() 60 | if ((t := self.target()) and 61 | self.expect("=") and 62 | (e := self.expr())): 63 | return Node("assign", [t, e]) 64 | self.reset(pos) 65 | return None 66 | 67 | def target(self): 68 | return self.expect(NAME) 69 | 70 | def if_statement(self): 71 | pos = self.mark() 72 | if (self.expect("if") and 73 | (e := self.expr()) and 74 | self.expect(":") and 75 | (s := self.statement())): 76 | return Node("if", [e, s]) 77 | self.reset(pos) 78 | return None 79 | -------------------------------------------------------------------------------- /stories/story2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/we-like-parsers/pegen/3b9f936a30d6c929d2538437cdc0465fa521b8f3/stories/story2/__init__.py -------------------------------------------------------------------------------- /stories/story2/generator.py: -------------------------------------------------------------------------------- 1 | """Quick and dirty code generator.""" 2 | 3 | from story2.grammar import Rule 4 | 5 | HEADER = """\ 6 | # This is @generated code; do not edit! 
7 | 8 | from token import NAME, NUMBER, STRING, NEWLINE, ENDMARKER 9 | 10 | from story2.memo import memoize 11 | from story2.node import Node 12 | from story2.parser import Parser 13 | """ 14 | 15 | import sys 16 | 17 | 18 | def generate(rules, stream=None): 19 | if stream: 20 | sys.stdout = stream 21 | print(HEADER) 22 | generate_parser_class(rules) 23 | 24 | 25 | def generate_parser_class(rules): 26 | print(f"class ToyParser(Parser):") 27 | for rule in rules: 28 | print() 29 | print(f" @memoize") 30 | print(f" def {rule.name}(self):") 31 | print(f" pos = self.mark()") 32 | for alt in rule.alts: 33 | items = [] 34 | print(f" if (True") 35 | for item in alt: 36 | if item[0] in ('"', "'"): 37 | print(f" and self.expect({item})") 38 | else: 39 | var = item.lower() 40 | if var in items: 41 | var += str(len(items)) 42 | items.append(var) 43 | if item.isupper(): 44 | print(" " + 45 | f"and ({var} := self.expect({item}))") 46 | else: 47 | print(f" " + 48 | f"and ({var} := self.{item}())") 49 | print(f" ):") 50 | print(f" " + 51 | f"return Node({rule.name!r}, [{', '.join(items)}])") 52 | print(f" self.reset(pos)") 53 | print(f" return None") 54 | -------------------------------------------------------------------------------- /stories/story2/generator2.py: -------------------------------------------------------------------------------- 1 | """Simple code generator.""" 2 | 3 | from contextlib import contextmanager 4 | 5 | from story2.grammar import Rule 6 | 7 | HEADER = """\ 8 | # This is @generated code; do not edit! 9 | 10 | from token import NAME, NUMBER, STRING, NEWLINE, ENDMARKER 11 | 12 | from story2.memo import memoize 13 | from story2.node import Node 14 | from story2.parser import Parser 15 | """ 16 | 17 | 18 | class Generator: 19 | 20 | def __init__(self, stream=None): 21 | self.stream = stream # If None, write to sys.stdout. 22 | self.indentation = "" 23 | 24 | def __call__(self, *args): 25 | # Note: print(..., file=None) prints to sys.stdout. 26 | print(end=self.indentation, file=self.stream) 27 | print(*args, file=self.stream) 28 | 29 | @contextmanager 30 | def indent(self): 31 | save = self.indentation 32 | try: 33 | self.indentation += " " 34 | yield 35 | finally: 36 | self.indentation = save 37 | 38 | 39 | def generate(rules, stream=None): 40 | gen = Generator(stream) 41 | gen(HEADER) 42 | gen(f"class ToyParser(Parser):") 43 | for rule in rules: 44 | gen() 45 | with gen.indent(): 46 | gen(f"@memoize") 47 | gen(f"def {rule.name}(self):") 48 | with gen.indent(): 49 | gen(f"pos = self.mark()") 50 | for alt in rule.alts: 51 | items = [] 52 | gen(f"if (True") 53 | with gen.indent(): 54 | for item in alt: 55 | if item[0] in ('"', "'"): 56 | gen(f"and self.expect({item})") 57 | else: 58 | var = item.lower() 59 | if var in items: 60 | var += str(len(items)) 61 | items.append(var) 62 | if item.isupper(): 63 | gen(f"and ({var} := self.expect({item}))") 64 | else: 65 | gen(f"and ({var} := self.{item}())") 66 | gen(f"):") 67 | with gen.indent(): 68 | gen(f"return Node({rule.name!r}, [{', '.join(items)}])") 69 | gen(f"self.reset(pos)") 70 | gen(f"return None") 71 | -------------------------------------------------------------------------------- /stories/story2/generator3.py: -------------------------------------------------------------------------------- 1 | """Simple code generator.""" 2 | 3 | from contextlib import contextmanager 4 | 5 | from story2.grammar import Rule 6 | 7 | HEADER = """\ 8 | # This is @generated code; do not edit! 
9 | 10 | from token import NAME, NUMBER, STRING, NEWLINE, ENDMARKER 11 | 12 | from story2.memo import memoize 13 | from story2.node import Node 14 | from story2.parser import Parser 15 | """ 16 | 17 | 18 | class Generator: 19 | 20 | def __init__(self, stream=None): 21 | self.stream = stream # If None, write to sys.stdout. 22 | self.indentation = "" 23 | 24 | def put(self, *args): 25 | # Note: print(..., file=None) prints to sys.stdout. 26 | print(end=self.indentation, file=self.stream) 27 | print(*args, file=self.stream) 28 | 29 | @contextmanager 30 | def indent(self): 31 | save = self.indentation 32 | try: 33 | self.indentation += " " 34 | yield 35 | finally: 36 | self.indentation = save 37 | 38 | def gen_rule(self, rule): 39 | self.put(f"@memoize") 40 | self.put(f"def {rule.name}(self):") 41 | with self.indent(): 42 | self.put(f"pos = self.mark()") 43 | for alt in rule.alts: 44 | self.gen_alt(alt, rule) 45 | self.put(f"return None") 46 | 47 | def gen_alt(self, alt, rule): 48 | items = [] 49 | self.put(f"if (True") 50 | with self.indent(): 51 | for item in alt: 52 | self.gen_item(item, items) 53 | self.put(f"):") 54 | with self.indent(): 55 | self.put(f"return Node({rule.name!r}, [{', '.join(items)}])") 56 | self.put(f"self.reset(pos)") 57 | 58 | def gen_item(self, item, items): 59 | if item[0] in ('"', "'"): 60 | self.put(f"and self.expect({item})") 61 | else: 62 | var = item.lower() 63 | if var in items: 64 | var += str(len(items)) 65 | items.append(var) 66 | if item.isupper(): 67 | self.put(f"and ({var} := self.expect({item}))") 68 | else: 69 | self.put(f"and ({var} := self.{item}())") 70 | 71 | 72 | def generate(rules, stream=None): 73 | gen = Generator(stream) 74 | gen.put(HEADER) 75 | gen.put(f"class ToyParser(Parser):") 76 | for rule in rules: 77 | gen.put() 78 | with gen.indent(): 79 | gen.gen_rule(rule) 80 | -------------------------------------------------------------------------------- /stories/story2/grammar.py: -------------------------------------------------------------------------------- 1 | """Parser for the grammar file.""" 2 | 3 | from token import NAME, NEWLINE, STRING, ENDMARKER 4 | 5 | from story2.parser import Parser 6 | 7 | class Rule: 8 | 9 | def __init__(self, name, alts): 10 | self.name = name 11 | self.alts = alts 12 | 13 | def __repr__(self): 14 | return f"Rule({self.name!r}, {self.alts})" 15 | 16 | def __eq__(self, other): 17 | if not isinstance(other, Rule): 18 | return NotImplemented 19 | return self.name == other.name and self.alts == other.alts 20 | 21 | 22 | class GrammarParser(Parser): 23 | 24 | def grammar(self): 25 | pos = self.mark() 26 | if rule := self.rule(): 27 | rules = [rule] 28 | while rule := self.rule(): 29 | rules.append(rule) 30 | if self.expect(ENDMARKER): 31 | return rules 32 | self.reset(pos) 33 | return None 34 | 35 | def rule(self): 36 | pos = self.mark() 37 | if name := self.expect(NAME): 38 | if self.expect(":"): 39 | if alt := self.alternative(): 40 | alts = [alt] 41 | apos = self.mark() 42 | while (self.expect("|") 43 | and (alt := self.alternative())): 44 | alts.append(alt) 45 | apos = self.mark() 46 | self.reset(apos) 47 | if self.expect(NEWLINE): 48 | return Rule(name.string, alts) 49 | self.reset(pos) 50 | return None 51 | 52 | def alternative(self): 53 | items = [] 54 | while item := self.item(): 55 | items.append(item) 56 | return items 57 | 58 | def item(self): 59 | if name := self.expect(NAME): 60 | return name.string 61 | if string := self.expect(STRING): 62 | return string.string 63 | return None 64 | 
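The GrammarParser above is driven the same way as the toy parser: wrap a token stream from tokenize.generate_tokens() in the story2 Tokenizer, construct a GrammarParser, and call grammar(), which returns a list of Rule objects or None if the grammar text does not parse. A minimal sketch of that round trip (the sample grammar and the expected Rule list mirror story2/test_grammar.py further down; this snippet is an illustration, not part of the repository):

    from io import StringIO
    from tokenize import generate_tokens

    from story2.grammar import GrammarParser
    from story2.tokenizer import Tokenizer

    # Three toy rules, one per line; tokenize supplies the NEWLINE/ENDMARKER tokens.
    source = (
        "stmt: asmt | expr\n"
        "asmt: NAME '=' expr\n"
        "expr: NAME\n"
    )
    tok = Tokenizer(generate_tokens(StringIO(source).readline))
    rules = GrammarParser(tok).grammar()
    assert rules is not None  # None would mean the grammar text failed to parse
    for rule in rules:
        print(rule)
    # Rule('stmt', [['asmt'], ['expr']])
    # Rule('asmt', [['NAME', "'='", 'expr']])
    # Rule('expr', [['NAME']])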
--------------------------------------------------------------------------------
/stories/story2/main.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3.8
 2 | 
 3 | import sys
 4 | from tokenize import generate_tokens
 5 | 
 6 | from story2.grammar import GrammarParser
 7 | from story2.tokenizer import Tokenizer
 8 | from story2.generator import generate
 9 | 
10 | def main():
11 |     file = "story2/toy.gram"
12 |     print("Reading", file)
13 |     with open(file) as f:
14 |         tokengen = generate_tokens(f.readline)
15 |         tok = Tokenizer(tokengen)
16 |         p = GrammarParser(tok)
17 |         rules = p.grammar()
18 |     if not rules:
19 |         sys.exit("Fail")
20 |     print("[")
21 |     for rule in rules:
22 |         print(f"    {rule},")
23 |     print("]")
24 |     for rule in rules:
25 |         print(rule.name, end=": ", file=sys.stderr)
26 |         print(*(" ".join(alt) for alt in rule.alts), sep=" | ", file=sys.stderr)
27 |     outfile = "story2/toy.py"
28 |     print("Updating", outfile, file=sys.stderr)
29 |     with open(outfile, "w") as stream:
30 |         generate(rules, stream)
31 | 
32 | if __name__ == '__main__':
33 |     main()
34 | 
--------------------------------------------------------------------------------
/stories/story2/memo.py:
--------------------------------------------------------------------------------
 1 | def memoize(func):
 2 |     """Memoize a parsing method.
 3 | 
 4 |     The function must be a method on a class deriving from Parser.
 5 | 
 6 |     The method must have either no arguments or a single argument that
 7 |     is an int or str (the latter being the case for expect()).
 8 | 
 9 |     It must return either None or an object that is not modified (at
10 |     least not while we're parsing).
11 | 
12 |     We memoize positive and negative outcomes per input position.
13 | 
14 |     The function is expected to move the input position iff it returns
15 |     a not-None value.
16 | 
17 |     The memo is structured as a dict of dicts, the outer dict indexed
18 |     by input position, the inner by function and arguments.
19 | """ 20 | 21 | def memoize_wrapper(self, *args): 22 | pos = self.mark() 23 | memo = self.memos.get(pos) 24 | if memo is None: 25 | memo = self.memos[pos] = {} 26 | key = (func, args) 27 | if key in memo: 28 | res, endpos = memo[key] 29 | self.reset(endpos) 30 | else: 31 | res = func(self, *args) 32 | endpos = self.mark() 33 | if res is None: 34 | assert endpos == pos 35 | else: 36 | assert endpos > pos 37 | memo[key] = res, endpos 38 | return res 39 | 40 | return memoize_wrapper 41 | -------------------------------------------------------------------------------- /stories/story2/node.py: -------------------------------------------------------------------------------- 1 | class Node: 2 | 3 | def __init__(self, type, children): 4 | self.type = type 5 | self.children = children 6 | 7 | def __repr__(self): 8 | return f"Node({self.type}, {self.children})" 9 | 10 | def __eq__(self, other): 11 | if not isinstance(other, Node): 12 | return NotImplemented 13 | return self.type == other.type and self.children == other.children 14 | -------------------------------------------------------------------------------- /stories/story2/parser.py: -------------------------------------------------------------------------------- 1 | from story2.memo import memoize 2 | 3 | class Parser: 4 | 5 | def __init__(self, tokenizer): 6 | self.tokenizer = tokenizer 7 | self.memos = {} 8 | 9 | def mark(self): 10 | return self.tokenizer.mark() 11 | 12 | def reset(self, pos): 13 | self.tokenizer.reset(pos) 14 | 15 | def expect(self, arg): 16 | token = self.tokenizer.peek_token() 17 | if token.type == arg or token.string == arg: 18 | return self.tokenizer.get_token() 19 | return None 20 | -------------------------------------------------------------------------------- /stories/story2/test_grammar.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story2.tokenizer import Tokenizer 6 | from story2.parser import Parser 7 | from story2.grammar import GrammarParser, Rule 8 | 9 | def test_grammar(): 10 | program = ("stmt: asmt | expr\n" 11 | "asmt: NAME '=' expr\n" 12 | "expr: NAME\n") 13 | file = StringIO(program) 14 | tokengen = generate_tokens(file.readline) 15 | tok = Tokenizer(tokengen) 16 | p = GrammarParser(tok) 17 | rules = p.grammar() 18 | assert rules == [Rule('stmt', [['asmt'], ['expr']]), Rule('asmt', [['NAME', "'='", 'expr']]), Rule('expr', [['NAME']])] 19 | 20 | def test_failure(): 21 | program = ("stmt: asmt | expr\n" 22 | "asmt: NAME '=' expr 42\n" 23 | "expr: NAME\n") 24 | file = StringIO(program) 25 | tokengen = generate_tokens(file.readline) 26 | tok = Tokenizer(tokengen) 27 | p = GrammarParser(tok) 28 | rules = p.grammar() 29 | assert rules is None 30 | -------------------------------------------------------------------------------- /stories/story2/test_parser.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story2.tokenizer import Tokenizer 6 | from story2.parser import Parser 7 | from story2.toy import ToyParser 8 | 9 | def test_basic(): 10 | program = "f(42)" 11 | file = StringIO(program) 12 | tokengen = generate_tokens(file.readline) 13 | tok = Tokenizer(tokengen) 14 | p = Parser(tok) 15 | t = p.expect(NAME) 16 | assert t and t.string == "f" 17 | pos = p.mark() 18 | 
assert p.expect("(") 19 | t = p.expect(NUMBER) 20 | assert t and t.string == "42" 21 | assert p.expect(")") 22 | pos2 = p.mark() 23 | p.reset(pos) 24 | assert p.expect("(") 25 | assert p.expect(NUMBER) 26 | assert p.expect(")") 27 | p.reset(pos) 28 | 29 | assert p.expect("(") 30 | p.reset(pos2) 31 | assert p.expect(NEWLINE) 32 | assert p.expect(ENDMARKER) 33 | 34 | def test_toy(): 35 | program = "x - (y + z)" 36 | file = StringIO(program) 37 | tokengen = generate_tokens(file.readline) 38 | tok = Tokenizer(tokengen) 39 | p = ToyParser(tok) 40 | tree = p.statement() 41 | print(tree) 42 | assert tree and tree.type == "statement" 43 | assert tree.children[0].type == "expr" 44 | assert tree.children[0].children[0].type == "term" 45 | -------------------------------------------------------------------------------- /stories/story2/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, OP, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story2.tokenizer import Tokenizer 6 | 7 | def test_basic(): 8 | program = "f(42)" 9 | file = StringIO(program) 10 | tokengen = generate_tokens(file.readline) 11 | tok = Tokenizer(tokengen) 12 | def get(): 13 | return tok.get_token()[:2] 14 | assert get() == (NAME, "f") 15 | assert get() == (OP, "(") 16 | assert get() == (NUMBER, "42") 17 | assert get() == (OP, ")") 18 | assert get() == (NEWLINE, "") 19 | assert get() == (ENDMARKER, "") 20 | 21 | def test_mark_reset(): 22 | program = "f(42) + abc" 23 | file = StringIO(program) 24 | tokengen = generate_tokens(file.readline) 25 | tok = Tokenizer(tokengen) 26 | def get(): 27 | return tok.get_token()[:2] 28 | assert get() == (NAME, "f") 29 | pos = tok.mark() 30 | assert get() == (OP, "(") 31 | assert get() == (NUMBER, "42") 32 | assert get() == (OP, ")") 33 | pos2 = tok.mark() 34 | tok.reset(pos) 35 | assert get() == (OP, "(") 36 | assert get() == (NUMBER, "42") 37 | assert get() == (OP, ")") 38 | tok.reset(pos) 39 | assert get() == (OP, "(") 40 | tok.reset(pos2) # Forward 41 | assert get() == (OP, "+") 42 | assert get() == (NAME, "abc") 43 | tok.reset(pos) 44 | assert get() == (OP, "(") 45 | assert get() == (NUMBER, "42") 46 | assert get() == (OP, ")") 47 | assert get() == (OP, "+") 48 | assert get() == (NAME, "abc") 49 | -------------------------------------------------------------------------------- /stories/story2/tokenizer.py: -------------------------------------------------------------------------------- 1 | class Tokenizer: 2 | 3 | def __init__(self, tokengen): 4 | """Call with tokenize.generate_tokens(...).""" 5 | self.tokengen = tokengen 6 | self.tokens = [] 7 | self.pos = 0 8 | 9 | def mark(self): 10 | return self.pos 11 | 12 | def reset(self, pos): 13 | self.pos = pos 14 | 15 | def get_token(self): 16 | token = self.peek_token() 17 | self.pos += 1 18 | return token 19 | 20 | def peek_token(self): 21 | if self.pos == len(self.tokens): 22 | self.tokens.append(next(self.tokengen)) 23 | return self.tokens[self.pos] 24 | -------------------------------------------------------------------------------- /stories/story2/toy.gram: -------------------------------------------------------------------------------- 1 | start: statements ENDMARKER 2 | statements: statement NEWLINE statements | statement NEWLINE 3 | statement: if_statement | assignment | expr 4 | expr: term '+' expr | term '-' term | term 5 | term: atom '*' term | atom '/' atom | atom 6 | atom: NAME | NUMBER | '(' expr ')' 7 | 
assignment: target '=' expr 8 | target: NAME 9 | if_statement: 'if' expr ':' statement 10 | -------------------------------------------------------------------------------- /stories/story3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/we-like-parsers/pegen/3b9f936a30d6c929d2538437cdc0465fa521b8f3/stories/story3/__init__.py -------------------------------------------------------------------------------- /stories/story3/driver.py: -------------------------------------------------------------------------------- 1 | import curses 2 | import sys 3 | from tokenize import generate_tokens 4 | 5 | from story3.toy import ToyParser 6 | from story3.tokenizer import Tokenizer 7 | from story3.visualizer import Visualizer 8 | 9 | 10 | def main(): 11 | filename = "story3/in.txt" 12 | if sys.argv[1:]: 13 | filename = sys.argv[1] 14 | with open(filename) as f: 15 | tokengen = generate_tokens(f.readline) 16 | vis = Visualizer() 17 | tok = Tokenizer(tokengen, vis) 18 | p = ToyParser(tok) 19 | try: 20 | tree = p.statement() 21 | while True: 22 | curses.flash() 23 | vis.wait() 24 | finally: 25 | vis.close() 26 | 27 | 28 | main() 29 | -------------------------------------------------------------------------------- /stories/story3/generator.py: -------------------------------------------------------------------------------- 1 | """Quick and dirty code generator.""" 2 | 3 | from story3.grammar import Rule 4 | 5 | HEADER = """\ 6 | # This is @generated code; do not edit! 7 | 8 | from token import NAME, NUMBER, STRING, NEWLINE, ENDMARKER 9 | 10 | from story3.memo import memoize 11 | from story3.node import Node 12 | from story3.parser import Parser 13 | """ 14 | 15 | import sys 16 | 17 | 18 | def generate(rules, stream=None): 19 | if stream: 20 | sys.stdout = stream 21 | print(HEADER) 22 | generate_parser_class(rules) 23 | 24 | 25 | def generate_parser_class(rules): 26 | print(f"class ToyParser(Parser):") 27 | for rule in rules: 28 | print() 29 | print(f" @memoize") 30 | print(f" def {rule.name}(self):") 31 | print(f" pos = self.mark()") 32 | for alt in rule.alts: 33 | items = [] 34 | print(f" if (True") 35 | for item in alt: 36 | if item[0] in ('"', "'"): 37 | print(f" and self.expect({item})") 38 | else: 39 | var = item.lower() 40 | if var in items: 41 | var += str(len(items)) 42 | items.append(var) 43 | if item.isupper(): 44 | print(" " + 45 | f"and ({var} := self.expect({item}))") 46 | else: 47 | print(f" " + 48 | f"and ({var} := self.{item}())") 49 | print(f" ):") 50 | print(f" " + 51 | f"return Node({rule.name!r}, [{', '.join(items)}])") 52 | print(f" self.reset(pos)") 53 | print(f" return None") 54 | -------------------------------------------------------------------------------- /stories/story3/generator2.py: -------------------------------------------------------------------------------- 1 | """Simple code generator.""" 2 | 3 | from contextlib import contextmanager 4 | 5 | from story3.grammar import Rule 6 | 7 | HEADER = """\ 8 | # This is @generated code; do not edit! 9 | 10 | from token import NAME, NUMBER, STRING, NEWLINE, ENDMARKER 11 | 12 | from story3.memo import memoize 13 | from story3.node import Node 14 | from story3.parser import Parser 15 | """ 16 | 17 | 18 | class Generator: 19 | 20 | def __init__(self, stream=None): 21 | self.stream = stream # If None, write to sys.stdout. 22 | self.indentation = "" 23 | 24 | def __call__(self, *args): 25 | # Note: print(..., file=None) prints to sys.stdout. 
26 | print(end=self.indentation, file=self.stream) 27 | print(*args, file=self.stream) 28 | 29 | @contextmanager 30 | def indent(self): 31 | save = self.indentation 32 | try: 33 | self.indentation += " " 34 | yield 35 | finally: 36 | self.indentation = save 37 | 38 | 39 | def generate(rules, stream=None): 40 | gen = Generator(stream) 41 | gen(HEADER) 42 | gen(f"class ToyParser(Parser):") 43 | for rule in rules: 44 | gen() 45 | with gen.indent(): 46 | gen(f"@memoize") 47 | gen(f"def {rule.name}(self):") 48 | with gen.indent(): 49 | gen(f"pos = self.mark()") 50 | for alt in rule.alts: 51 | items = [] 52 | gen(f"if (True") 53 | with gen.indent(): 54 | for item in alt: 55 | if item[0] in ('"', "'"): 56 | gen(f"and self.expect({item})") 57 | else: 58 | var = item.lower() 59 | if var in items: 60 | var += str(len(items)) 61 | items.append(var) 62 | if item.isupper(): 63 | gen(f"and ({var} := self.expect({item}))") 64 | else: 65 | gen(f"and ({var} := self.{item}())") 66 | gen(f"):") 67 | with gen.indent(): 68 | gen(f"return Node({rule.name!r}, [{', '.join(items)}])") 69 | gen(f"self.reset(pos)") 70 | gen(f"return None") 71 | -------------------------------------------------------------------------------- /stories/story3/generator3.py: -------------------------------------------------------------------------------- 1 | """Simple code generator.""" 2 | 3 | from contextlib import contextmanager 4 | 5 | from story3.grammar import Rule 6 | 7 | HEADER = """\ 8 | # This is @generated code; do not edit! 9 | 10 | from token import NAME, NUMBER, STRING, NEWLINE, ENDMARKER 11 | 12 | from story3.memo import memoize 13 | from story3.node import Node 14 | from story3.parser import Parser 15 | """ 16 | 17 | 18 | class Generator: 19 | 20 | def __init__(self, stream=None): 21 | self.stream = stream # If None, write to sys.stdout. 22 | self.indentation = "" 23 | 24 | def put(self, *args): 25 | # Note: print(..., file=None) prints to sys.stdout. 
26 | print(end=self.indentation, file=self.stream) 27 | print(*args, file=self.stream) 28 | 29 | @contextmanager 30 | def indent(self): 31 | save = self.indentation 32 | try: 33 | self.indentation += " " 34 | yield 35 | finally: 36 | self.indentation = save 37 | 38 | def gen_rule(self, rule): 39 | self.put(f"@memoize") 40 | self.put(f"def {rule.name}(self):") 41 | with self.indent(): 42 | self.put(f"self.show_rule({rule.name!r}, {rule.alts!r})") 43 | self.put(f"pos = self.mark()") 44 | for i, alt in enumerate(rule.alts): 45 | self.gen_alt(alt, rule, i) 46 | self.put(f"self.show_index(0, 0, 0)") 47 | self.put(f"return None") 48 | 49 | def gen_alt(self, alt, rule, alt_index): 50 | items = [] 51 | self.put(f"if (True") 52 | with self.indent(): 53 | for i, item in enumerate(alt): 54 | self.gen_item(item, items, alt_index, i) 55 | self.put(f"):") 56 | with self.indent(): 57 | self.put(f"self.show_index({alt_index}, 0, {len(alt)})") 58 | self.put(f"return Node({rule.name!r}, [{', '.join(items)}])") 59 | self.put(f"self.reset(pos)") 60 | 61 | def gen_item(self, item, items, alt_index, item_index): 62 | self.put(f"and self.show_index({alt_index}, {item_index})") 63 | if item[0] in ('"', "'"): 64 | self.put(f"and self.expect({item})") 65 | else: 66 | var = item.lower() 67 | if var in items: 68 | var += str(len(items)) 69 | items.append(var) 70 | if item.isupper(): 71 | self.put(f"and ({var} := self.expect({item}))") 72 | else: 73 | self.put(f"and ({var} := self.{item}())") 74 | 75 | 76 | def generate(rules, stream=None): 77 | gen = Generator(stream) 78 | gen.put(HEADER) 79 | gen.put(f"class ToyParser(Parser):") 80 | for rule in rules: 81 | gen.put() 82 | with gen.indent(): 83 | gen.gen_rule(rule) 84 | -------------------------------------------------------------------------------- /stories/story3/grammar.py: -------------------------------------------------------------------------------- 1 | """Parser for the grammar file.""" 2 | 3 | from token import NAME, NEWLINE, STRING, ENDMARKER 4 | 5 | from story3.parser import Parser 6 | 7 | class Rule: 8 | 9 | def __init__(self, name, alts): 10 | self.name = name 11 | self.alts = alts 12 | 13 | def __repr__(self): 14 | return f"Rule({self.name!r}, {self.alts})" 15 | 16 | def __eq__(self, other): 17 | if not isinstance(other, Rule): 18 | return NotImplemented 19 | return self.name == other.name and self.alts == other.alts 20 | 21 | 22 | class GrammarParser(Parser): 23 | 24 | def grammar(self): 25 | pos = self.mark() 26 | if rule := self.rule(): 27 | rules = [rule] 28 | while rule := self.rule(): 29 | rules.append(rule) 30 | if self.expect(ENDMARKER): 31 | return rules 32 | self.reset(pos) 33 | return None 34 | 35 | def rule(self): 36 | pos = self.mark() 37 | if name := self.expect(NAME): 38 | if self.expect(":"): 39 | if alt := self.alternative(): 40 | alts = [alt] 41 | apos = self.mark() 42 | while (self.expect("|") 43 | and (alt := self.alternative())): 44 | alts.append(alt) 45 | apos = self.mark() 46 | self.reset(apos) 47 | if self.expect(NEWLINE): 48 | return Rule(name.string, alts) 49 | self.reset(pos) 50 | return None 51 | 52 | def alternative(self): 53 | items = [] 54 | while item := self.item(): 55 | items.append(item) 56 | return items 57 | 58 | def item(self): 59 | if name := self.expect(NAME): 60 | return name.string 61 | if string := self.expect(STRING): 62 | return string.string 63 | return None 64 | -------------------------------------------------------------------------------- /stories/story3/ifs.txt: 
--------------------------------------------------------------------------------
 1 | if foo + bar: baz = one * two
 2 | 
--------------------------------------------------------------------------------
/stories/story3/in.txt:
--------------------------------------------------------------------------------
 1 | aap = cat + dog
 2 | 
--------------------------------------------------------------------------------
/stories/story3/main.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3.8
 2 | 
 3 | import sys
 4 | from tokenize import generate_tokens
 5 | 
 6 | from story3.grammar import GrammarParser
 7 | from story3.tokenizer import Tokenizer
 8 | from story3.generator3 import generate
 9 | from story3.visualizer import Visualizer
10 | 
11 | def main():
12 |     file = "story3/toy.gram"
13 |     print("Reading", file)
14 |     with open(file) as f:
15 |         tokengen = generate_tokens(f.readline)
16 |         vis = None
17 |         if "-v" in sys.argv:
18 |             vis = Visualizer()
19 |         tok = Tokenizer(tokengen, vis)
20 |         p = GrammarParser(tok)
21 |         try:
22 |             rules = p.grammar()
23 |             if vis:
24 |                 vis.wait()
25 |         finally:
26 |             if vis:
27 |                 vis.close()
28 |     if not rules:
29 |         sys.exit("Fail")
30 |     print("[")
31 |     for rule in rules:
32 |         print(f"    {rule},")
33 |     print("]")
34 |     for rule in rules:
35 |         print(rule.name, end=": ", file=sys.stderr)
36 |         print(*(" ".join(alt) for alt in rule.alts), sep=" | ", file=sys.stderr)
37 |     outfile = "story3/toy.py"
38 |     print("Updating", outfile, file=sys.stderr)
39 |     with open(outfile, "w") as stream:
40 |         generate(rules, stream)
41 | 
42 | if __name__ == '__main__':
43 |     main()
44 | 
--------------------------------------------------------------------------------
/stories/story3/memo.py:
--------------------------------------------------------------------------------
 1 | def memoize(func):
 2 |     """Memoize a parsing method.
 3 | 
 4 |     The function must be a method on a class deriving from Parser.
 5 | 
 6 |     The method must have either no arguments or a single argument that
 7 |     is an int or str (the latter being the case for expect()).
 8 | 
 9 |     It must return either None or an object that is not modified (at
10 |     least not while we're parsing).
11 | 
12 |     We memoize positive and negative outcomes per input position.
13 | 
14 |     The function is expected to move the input position iff it returns
15 |     a not-None value.
16 | 
17 |     The memo is structured as a dict of dicts, the outer dict indexed
18 |     by input position, the inner by function and arguments.
19 | """ 20 | 21 | def memoize_wrapper(self, *args): 22 | vis = self.tokenizer.vis 23 | pos = self.mark() 24 | if vis is not None: 25 | vis.show_call(pos, func.__name__, args) 26 | memo = self.memos.get(pos) 27 | if memo is None: 28 | memo = self.memos[pos] = {} 29 | key = (func, args) 30 | if key in memo: 31 | res, endpos = memo[key] 32 | self.reset(endpos) 33 | else: 34 | res = func(self, *args) 35 | endpos = self.mark() 36 | if res is None: 37 | assert endpos == pos 38 | else: 39 | assert endpos > pos 40 | memo[key] = res, endpos 41 | if vis is not None: 42 | vis.show_return(pos, res, endpos) 43 | return res 44 | 45 | return memoize_wrapper 46 | -------------------------------------------------------------------------------- /stories/story3/node.py: -------------------------------------------------------------------------------- 1 | from token import tok_name 2 | from tokenize import TokenInfo 3 | 4 | 5 | def short_token(tok: TokenInfo) -> str: 6 | s = tok.string 7 | if s == '' or s.isspace(): 8 | return tok_name[tok.type] 9 | else: 10 | return repr(s) 11 | 12 | 13 | def alt_repr(x) -> str: 14 | if isinstance(x, TokenInfo): 15 | return short_token(x) 16 | else: 17 | return repr(x) 18 | 19 | 20 | class Node: 21 | 22 | def __init__(self, type, children): 23 | self.type = type 24 | self.children = children 25 | 26 | def __repr__(self): 27 | return f"Node({self.type}, [{', '.join(map(alt_repr, self.children))}])" 28 | 29 | def __eq__(self, other): 30 | if not isinstance(other, Node): 31 | return NotImplemented 32 | return self.type == other.type and self.children == other.children 33 | -------------------------------------------------------------------------------- /stories/story3/parser.py: -------------------------------------------------------------------------------- 1 | from story3.memo import memoize 2 | 3 | class Parser: 4 | 5 | def __init__(self, tokenizer): 6 | self.tokenizer = tokenizer 7 | self.memos = {} 8 | 9 | def mark(self): 10 | return self.tokenizer.mark() 11 | 12 | def reset(self, pos): 13 | self.tokenizer.reset(pos) 14 | 15 | def show_rule(self, name, alts): 16 | # alts is a list of lists of strings 17 | vis = self.tokenizer.vis 18 | if vis: 19 | vis.show_rule(name, alts) 20 | 21 | def show_index(self, alt_index, item_index, num_items=1): 22 | vis = self.tokenizer.vis 23 | if vis: 24 | vis.show_index(alt_index, item_index, num_items) 25 | return True 26 | 27 | @memoize 28 | def expect(self, arg): 29 | token = self.tokenizer.peek_token() 30 | if token.type == arg or token.string == arg: 31 | return self.tokenizer.get_token() 32 | return None 33 | -------------------------------------------------------------------------------- /stories/story3/test_grammar.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story3.tokenizer import Tokenizer 6 | from story3.parser import Parser 7 | from story3.grammar import GrammarParser, Rule 8 | 9 | def test_grammar(): 10 | program = ("stmt: asmt | expr\n" 11 | "asmt: NAME '=' expr\n" 12 | "expr: NAME\n") 13 | file = StringIO(program) 14 | tokengen = generate_tokens(file.readline) 15 | tok = Tokenizer(tokengen) 16 | p = GrammarParser(tok) 17 | rules = p.grammar() 18 | assert rules == [Rule('stmt', [['asmt'], ['expr']]), Rule('asmt', [['NAME', "'='", 'expr']]), Rule('expr', [['NAME']])] 19 | 20 | def test_failure(): 21 | program = ("stmt: asmt | expr\n" 22 | "asmt: NAME '=' 
expr 42\n" 23 | "expr: NAME\n") 24 | file = StringIO(program) 25 | tokengen = generate_tokens(file.readline) 26 | tok = Tokenizer(tokengen) 27 | p = GrammarParser(tok) 28 | rules = p.grammar() 29 | assert rules is None 30 | -------------------------------------------------------------------------------- /stories/story3/test_parser.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story3.tokenizer import Tokenizer 6 | from story3.parser import Parser 7 | from story3.toy import ToyParser 8 | 9 | def test_basic(): 10 | program = "f(42)" 11 | file = StringIO(program) 12 | tokengen = generate_tokens(file.readline) 13 | tok = Tokenizer(tokengen) 14 | p = Parser(tok) 15 | t = p.expect(NAME) 16 | assert t and t.string == "f" 17 | pos = p.mark() 18 | assert p.expect("(") 19 | t = p.expect(NUMBER) 20 | assert t and t.string == "42" 21 | assert p.expect(")") 22 | pos2 = p.mark() 23 | p.reset(pos) 24 | assert p.expect("(") 25 | assert p.expect(NUMBER) 26 | assert p.expect(")") 27 | p.reset(pos) 28 | 29 | assert p.expect("(") 30 | p.reset(pos2) 31 | assert p.expect(NEWLINE) 32 | assert p.expect(ENDMARKER) 33 | 34 | def test_toy(): 35 | program = "x - (y + z)" 36 | file = StringIO(program) 37 | tokengen = generate_tokens(file.readline) 38 | tok = Tokenizer(tokengen) 39 | p = ToyParser(tok) 40 | tree = p.statement() 41 | print(tree) 42 | assert tree and tree.type == "statement" 43 | assert tree.children[0].type == "expr" 44 | assert tree.children[0].children[0].type == "term" 45 | -------------------------------------------------------------------------------- /stories/story3/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, OP, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story3.tokenizer import Tokenizer 6 | 7 | def test_basic(): 8 | program = "f(42)" 9 | file = StringIO(program) 10 | tokengen = generate_tokens(file.readline) 11 | tok = Tokenizer(tokengen) 12 | def get(): 13 | return tok.get_token()[:2] 14 | assert get() == (NAME, "f") 15 | assert get() == (OP, "(") 16 | assert get() == (NUMBER, "42") 17 | assert get() == (OP, ")") 18 | assert get() == (NEWLINE, "") 19 | assert get() == (ENDMARKER, "") 20 | 21 | def test_mark_reset(): 22 | program = "f(42) + abc" 23 | file = StringIO(program) 24 | tokengen = generate_tokens(file.readline) 25 | tok = Tokenizer(tokengen) 26 | def get(): 27 | return tok.get_token()[:2] 28 | assert get() == (NAME, "f") 29 | pos = tok.mark() 30 | assert get() == (OP, "(") 31 | assert get() == (NUMBER, "42") 32 | assert get() == (OP, ")") 33 | pos2 = tok.mark() 34 | tok.reset(pos) 35 | assert get() == (OP, "(") 36 | assert get() == (NUMBER, "42") 37 | assert get() == (OP, ")") 38 | tok.reset(pos) 39 | assert get() == (OP, "(") 40 | tok.reset(pos2) # Forward 41 | assert get() == (OP, "+") 42 | assert get() == (NAME, "abc") 43 | tok.reset(pos) 44 | assert get() == (OP, "(") 45 | assert get() == (NUMBER, "42") 46 | assert get() == (OP, ")") 47 | assert get() == (OP, "+") 48 | assert get() == (NAME, "abc") 49 | -------------------------------------------------------------------------------- /stories/story3/tokenizer.py: -------------------------------------------------------------------------------- 1 | class Tokenizer: 2 | 3 | def __init__(self, tokengen, vis=None): 4 | 
"""Call with tokenize.generate_tokens(...).""" 5 | self.tokengen = tokengen 6 | self.vis = vis 7 | self.tokens = [] 8 | self.pos = 0 9 | 10 | def mark(self): 11 | return self.pos 12 | 13 | def reset(self, pos): 14 | if pos == self.pos: 15 | return 16 | self.pos = pos 17 | self.report() 18 | 19 | def get_token(self): 20 | token = self.peek_token() 21 | self.pos += 1 22 | self.report() 23 | return token 24 | 25 | def peek_token(self): 26 | if self.pos == len(self.tokens): 27 | self.tokens.append(next(self.tokengen)) 28 | self.report() 29 | return self.tokens[self.pos] 30 | 31 | def report(self): 32 | if self.vis is not None: 33 | self.vis.vis_tokens(self.tokens, self.pos) 34 | -------------------------------------------------------------------------------- /stories/story3/toy.gram: -------------------------------------------------------------------------------- 1 | start: statements ENDMARKER 2 | statements: statement NEWLINE statements | statement NEWLINE 3 | statement: if_statement | assignment | expr 4 | expr: term '+' expr | term '-' term | term 5 | term: atom '*' term | atom '/' atom | atom 6 | atom: NAME | NUMBER | '(' expr ')' 7 | assignment: target '=' expr 8 | target: NAME 9 | if_statement: 'if' expr ':' statement 10 | -------------------------------------------------------------------------------- /stories/story3/tty.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/we-like-parsers/pegen/3b9f936a30d6c929d2538437cdc0465fa521b8f3/stories/story3/tty.gif -------------------------------------------------------------------------------- /stories/story4/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/we-like-parsers/pegen/3b9f936a30d6c929d2538437cdc0465fa521b8f3/stories/story4/__init__.py -------------------------------------------------------------------------------- /stories/story4/driver.py: -------------------------------------------------------------------------------- 1 | import curses 2 | import sys 3 | from tokenize import generate_tokens 4 | 5 | from story4.toy import ToyParser 6 | from story4.tokenizer import Tokenizer 7 | from story4.visualizer import Visualizer 8 | 9 | 10 | def main(): 11 | filename = "story4/in.txt" 12 | startname = "start" 13 | if sys.argv[1:]: 14 | filename = sys.argv[1] 15 | if sys.argv[2:]: 16 | startname = sys.argv[2] 17 | with open(filename) as f: 18 | tokengen = generate_tokens(f.readline) 19 | vis = Visualizer() 20 | tok = Tokenizer(tokengen, vis) 21 | p = ToyParser(tok) 22 | start = getattr(p, startname) 23 | try: 24 | tree = start() 25 | vis.done() 26 | finally: 27 | vis.close() 28 | 29 | 30 | main() 31 | -------------------------------------------------------------------------------- /stories/story4/generator3.py: -------------------------------------------------------------------------------- 1 | """Simple code generator.""" 2 | 3 | from contextlib import contextmanager 4 | 5 | from story4.grammar import Rule 6 | 7 | HEADER = """\ 8 | # This is @generated code; do not edit! 9 | 10 | from token import NAME, NUMBER, STRING, NEWLINE, ENDMARKER 11 | 12 | from story4.memo import memoize, memoize_left_rec 13 | from story4.node import Node 14 | from story4.parser import Parser 15 | """ 16 | 17 | 18 | class Generator: 19 | 20 | def __init__(self, stream=None): 21 | self.stream = stream # If None, write to sys.stdout. 
22 | self.indentation = "" 23 | 24 | def put(self, *args): 25 | # Note: print(..., file=None) prints to sys.stdout. 26 | print(end=self.indentation, file=self.stream) 27 | print(*args, file=self.stream) 28 | 29 | @contextmanager 30 | def indent(self): 31 | save = self.indentation 32 | try: 33 | self.indentation += " " 34 | yield 35 | finally: 36 | self.indentation = save 37 | 38 | def is_left_rec(self, rule): 39 | # TODO: Indirect left recursion (hidden behind possibly-empty 40 | # items) and mutual left recursion (recursion involving 41 | # multiple rules). Indirect recursion only becomes important 42 | # once we support PEG features like optional or repeated 43 | # items. Mutual left recursion is currently an undetected 44 | # grammar bug -- don't do this! (A full implementation is in 45 | # the ../pegen/parser_generator.py module.) 46 | for alt in rule.alts: 47 | if alt[0] == rule.name: 48 | return True 49 | return False 50 | 51 | def gen_rule(self, rule): 52 | if self.is_left_rec(rule): 53 | self.put(f"@memoize_left_rec") 54 | leftrec = "'*' + " 55 | else: 56 | self.put(f"@memoize") 57 | leftrec = "" 58 | self.put(f"def {rule.name}(self):") 59 | with self.indent(): 60 | self.put(f"self.show_rule({leftrec}{rule.name!r}, {rule.alts!r})") 61 | self.put(f"pos = self.mark()") 62 | for i, alt in enumerate(rule.alts): 63 | self.gen_alt(alt, rule, i) 64 | self.put(f"self.show_index(0, 0, 0)") 65 | self.put(f"return None") 66 | 67 | def gen_alt(self, alt, rule, alt_index): 68 | items = [] 69 | self.put(f"if (True") 70 | with self.indent(): 71 | for i, item in enumerate(alt): 72 | self.gen_item(item, items, alt_index, i) 73 | self.put(f"):") 74 | with self.indent(): 75 | self.put(f"self.show_index({alt_index}, 0, {len(alt)})") 76 | self.put(f"return Node({rule.name!r}, [{', '.join(items)}])") 77 | self.put(f"self.reset(pos)") 78 | 79 | def gen_item(self, item, items, alt_index, item_index): 80 | self.put(f"and self.show_index({alt_index}, {item_index})") 81 | if item[0] in ('"', "'"): 82 | self.put(f"and self.expect({item})") 83 | else: 84 | var = item.lower() 85 | if var in items: 86 | var += str(len(items)) 87 | items.append(var) 88 | if item.isupper(): 89 | self.put(f"and ({var} := self.expect({item}))") 90 | else: 91 | self.put(f"and ({var} := self.{item}())") 92 | 93 | 94 | def generate(rules, stream=None): 95 | gen = Generator(stream) 96 | gen.put(HEADER) 97 | gen.put(f"class ToyParser(Parser):") 98 | for rule in rules: 99 | gen.put() 100 | with gen.indent(): 101 | gen.gen_rule(rule) 102 | -------------------------------------------------------------------------------- /stories/story4/grammar.py: -------------------------------------------------------------------------------- 1 | """Parser for the grammar file.""" 2 | 3 | from token import NAME, NEWLINE, STRING, ENDMARKER 4 | 5 | from story4.parser import Parser 6 | 7 | class Rule: 8 | 9 | def __init__(self, name, alts): 10 | self.name = name 11 | self.alts = alts 12 | 13 | def __repr__(self): 14 | return f"Rule({self.name!r}, {self.alts})" 15 | 16 | def __eq__(self, other): 17 | if not isinstance(other, Rule): 18 | return NotImplemented 19 | return self.name == other.name and self.alts == other.alts 20 | 21 | 22 | class GrammarParser(Parser): 23 | 24 | def grammar(self): 25 | pos = self.mark() 26 | if rule := self.rule(): 27 | rules = [rule] 28 | while rule := self.rule(): 29 | rules.append(rule) 30 | if self.expect(ENDMARKER): 31 | return rules 32 | self.reset(pos) 33 | return None 34 | 35 | def rule(self): 36 | pos = self.mark() 37 | if 
name := self.expect(NAME): 38 | if self.expect(":"): 39 | if alt := self.alternative(): 40 | alts = [alt] 41 | apos = self.mark() 42 | while (self.expect("|") 43 | and (alt := self.alternative())): 44 | alts.append(alt) 45 | apos = self.mark() 46 | self.reset(apos) 47 | if self.expect(NEWLINE): 48 | return Rule(name.string, alts) 49 | self.reset(pos) 50 | return None 51 | 52 | def alternative(self): 53 | items = [] 54 | while item := self.item(): 55 | items.append(item) 56 | return items 57 | 58 | def item(self): 59 | if name := self.expect(NAME): 60 | return name.string 61 | if string := self.expect(STRING): 62 | return string.string 63 | return None 64 | -------------------------------------------------------------------------------- /stories/story4/in.txt: -------------------------------------------------------------------------------- 1 | aap = cat + dog 2 | -------------------------------------------------------------------------------- /stories/story4/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.8 2 | 3 | import sys 4 | from tokenize import generate_tokens 5 | 6 | from story4.grammar import GrammarParser 7 | from story4.tokenizer import Tokenizer 8 | from story4.generator3 import generate 9 | from story4.visualizer import Visualizer 10 | 11 | def main(): 12 | file = "story4/toy.gram" 13 | print("Reading", file) 14 | with open(file) as f: 15 | tokengen = generate_tokens(f.readline) 16 | vis = None 17 | if "-v" in sys.argv: 18 | vis = Visualizer() 19 | tok = Tokenizer(tokengen, vis) 20 | p = GrammarParser(tok) 21 | try: 22 | rules = p.grammar() 23 | if vis: 24 | vis.done() 25 | finally: 26 | if vis: 27 | vis.close() 28 | if not rules: 29 | sys.exit("Fail") 30 | print("[") 31 | for rule in rules: 32 | print(f" {rule},") 33 | print("]") 34 | for rule in rules: 35 | print(rule.name, end=": ", file=sys.stderr) 36 | print(*(" ".join(alt) for alt in rule.alts), sep=" | ", file=sys.stderr) 37 | outfile = "story4/toy.py" 38 | print("Updating", outfile, file=sys.stderr) 39 | with open(outfile, "w") as stream: 40 | generate(rules, stream) 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /stories/story4/memo.py: -------------------------------------------------------------------------------- 1 | def memoize(func): 2 | """Memoize a parsing method. 3 | 4 | The functon must be a method on a class deriving from Parser. 5 | 6 | The method must have either no arguments or a single argument that 7 | is an int or str (the latter being the case for expect()). 8 | 9 | It must return either None or an object that is not modified (at 10 | least not while we're parsing). 11 | 12 | We memoize positive and negative outcomes per input position. 13 | 14 | The function is expected to move the input position iff it returns 15 | a not-None value. 16 | 17 | The memo is structured as a dict of dict, the outer dict indexed 18 | by input position, the inner by function and arguments. 
19 | """ 20 | 21 | def memoize_wrapper(self, *args): 22 | vis = self.tokenizer.vis 23 | pos = self.mark() 24 | if vis is not None: 25 | vis.show_call(pos, func.__name__, args) 26 | memo = self.memos.get(pos) 27 | if memo is None: 28 | memo = self.memos[pos] = {} 29 | key = (func, args) 30 | if key in memo: 31 | res, endpos = memo[key] 32 | self.reset(endpos) 33 | else: 34 | res = func(self, *args) 35 | endpos = self.mark() 36 | if res is None: 37 | assert endpos == pos 38 | else: 39 | assert endpos > pos 40 | memo[key] = res, endpos 41 | if vis is not None: 42 | vis.show_return(pos, res, endpos) 43 | return res 44 | 45 | return memoize_wrapper 46 | 47 | 48 | def memoize_left_rec(func): 49 | """Memoize a left-recursive parsing method. 50 | 51 | This is similar to @memoize but loops until no longer parse is obtained. 52 | 53 | Inspired by https://github.com/PhilippeSigaud/Pegged/wiki/Left-Recursion 54 | """ 55 | 56 | def memoize_left_rec_wrapper(self, *args): 57 | vis = self.tokenizer.vis 58 | pos = self.mark() 59 | if vis is not None: 60 | vis.show_call(pos, "*" + func.__name__, args) 61 | memo = self.memos.get(pos) 62 | if memo is None: 63 | memo = self.memos[pos] = {} 64 | key = (func, args) 65 | if key in memo: 66 | res, endpos = memo[key] 67 | self.reset(endpos) 68 | else: 69 | # This is where we deviate from @memoize. 70 | 71 | # Prime the cache with a failure. 72 | memo[key] = lastres, lastpos = None, pos 73 | if vis is not None: 74 | vis.stuff_cache(pos, "*" + func.__name__, args, None) 75 | 76 | # Loop until no longer parse is obtained. 77 | while True: 78 | self.reset(pos) 79 | res = func(self, *args) 80 | endpos = self.mark() 81 | if endpos <= lastpos: 82 | break 83 | memo[key] = lastres, lastpos = res, endpos 84 | if vis is not None: 85 | vis.stuff_cache(pos, "*" + func.__name__, args, res) 86 | 87 | res = lastres 88 | self.reset(lastpos) 89 | 90 | if vis is not None: 91 | vis.show_return(pos, res, endpos) 92 | return res 93 | 94 | return memoize_left_rec_wrapper 95 | -------------------------------------------------------------------------------- /stories/story4/node.py: -------------------------------------------------------------------------------- 1 | from token import tok_name 2 | from tokenize import TokenInfo 3 | 4 | 5 | def short_token(tok: TokenInfo) -> str: 6 | s = tok.string 7 | if s == '' or s.isspace(): 8 | return tok_name[tok.type] 9 | else: 10 | return repr(s) 11 | 12 | 13 | def alt_repr(x) -> str: 14 | if isinstance(x, TokenInfo): 15 | return short_token(x) 16 | else: 17 | return repr(x) 18 | 19 | 20 | class Node: 21 | 22 | def __init__(self, type, children): 23 | self.type = type 24 | self.children = children 25 | 26 | def __repr__(self): 27 | return f"Node({self.type}, [{', '.join(map(alt_repr, self.children))}])" 28 | 29 | def __eq__(self, other): 30 | if not isinstance(other, Node): 31 | return NotImplemented 32 | return self.type == other.type and self.children == other.children 33 | -------------------------------------------------------------------------------- /stories/story4/parser.py: -------------------------------------------------------------------------------- 1 | from story4.memo import memoize 2 | 3 | class Parser: 4 | 5 | def __init__(self, tokenizer): 6 | self.tokenizer = tokenizer 7 | self.memos = {} 8 | 9 | def mark(self): 10 | return self.tokenizer.mark() 11 | 12 | def reset(self, pos): 13 | self.tokenizer.reset(pos) 14 | 15 | def show_rule(self, name, alts): 16 | # alts is a list of lists of strings 17 | vis = self.tokenizer.vis 18 | if vis: 
19 | vis.show_rule(name, alts) 20 | 21 | def show_index(self, alt_index, item_index, num_items=1): 22 | vis = self.tokenizer.vis 23 | if vis: 24 | vis.show_index(alt_index, item_index, num_items) 25 | return True 26 | 27 | @memoize 28 | def expect(self, arg): 29 | token = self.tokenizer.peek_token() 30 | if token.type == arg or token.string == arg: 31 | return self.tokenizer.get_token() 32 | return None 33 | -------------------------------------------------------------------------------- /stories/story4/test_grammar.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story4.tokenizer import Tokenizer 6 | from story4.parser import Parser 7 | from story4.grammar import GrammarParser, Rule 8 | 9 | def test_grammar(): 10 | program = ("stmt: asmt | expr\n" 11 | "asmt: NAME '=' expr\n" 12 | "expr: NAME\n") 13 | file = StringIO(program) 14 | tokengen = generate_tokens(file.readline) 15 | tok = Tokenizer(tokengen) 16 | p = GrammarParser(tok) 17 | rules = p.grammar() 18 | assert rules == [Rule('stmt', [['asmt'], ['expr']]), Rule('asmt', [['NAME', "'='", 'expr']]), Rule('expr', [['NAME']])] 19 | 20 | def test_failure(): 21 | program = ("stmt: asmt | expr\n" 22 | "asmt: NAME '=' expr 42\n" 23 | "expr: NAME\n") 24 | file = StringIO(program) 25 | tokengen = generate_tokens(file.readline) 26 | tok = Tokenizer(tokengen) 27 | p = GrammarParser(tok) 28 | rules = p.grammar() 29 | assert rules is None 30 | -------------------------------------------------------------------------------- /stories/story4/test_parser.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story4.tokenizer import Tokenizer 6 | from story4.parser import Parser 7 | from story4.toy import ToyParser 8 | 9 | def test_basic(): 10 | program = "f(42)" 11 | file = StringIO(program) 12 | tokengen = generate_tokens(file.readline) 13 | tok = Tokenizer(tokengen) 14 | p = Parser(tok) 15 | t = p.expect(NAME) 16 | assert t and t.string == "f" 17 | pos = p.mark() 18 | assert p.expect("(") 19 | t = p.expect(NUMBER) 20 | assert t and t.string == "42" 21 | assert p.expect(")") 22 | pos2 = p.mark() 23 | p.reset(pos) 24 | assert p.expect("(") 25 | assert p.expect(NUMBER) 26 | assert p.expect(")") 27 | p.reset(pos) 28 | 29 | assert p.expect("(") 30 | p.reset(pos2) 31 | assert p.expect(NEWLINE) 32 | assert p.expect(ENDMARKER) 33 | 34 | def test_toy(): 35 | program = "x - (y + z)" 36 | file = StringIO(program) 37 | tokengen = generate_tokens(file.readline) 38 | tok = Tokenizer(tokengen) 39 | p = ToyParser(tok) 40 | tree = p.statement() 41 | print(tree) 42 | assert tree and tree.type == "statement" 43 | assert tree.children[0].type == "expr" 44 | assert tree.children[0].children[0].type == "expr" 45 | -------------------------------------------------------------------------------- /stories/story4/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, OP, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story4.tokenizer import Tokenizer 6 | 7 | def test_basic(): 8 | program = "f(42)" 9 | file = StringIO(program) 10 | tokengen = generate_tokens(file.readline) 11 | tok = Tokenizer(tokengen) 12 | 
def get(): 13 | return tok.get_token()[:2] 14 | assert get() == (NAME, "f") 15 | assert get() == (OP, "(") 16 | assert get() == (NUMBER, "42") 17 | assert get() == (OP, ")") 18 | assert get() == (NEWLINE, "") 19 | assert get() == (ENDMARKER, "") 20 | 21 | def test_mark_reset(): 22 | program = "f(42) + abc" 23 | file = StringIO(program) 24 | tokengen = generate_tokens(file.readline) 25 | tok = Tokenizer(tokengen) 26 | def get(): 27 | return tok.get_token()[:2] 28 | assert get() == (NAME, "f") 29 | pos = tok.mark() 30 | assert get() == (OP, "(") 31 | assert get() == (NUMBER, "42") 32 | assert get() == (OP, ")") 33 | pos2 = tok.mark() 34 | tok.reset(pos) 35 | assert get() == (OP, "(") 36 | assert get() == (NUMBER, "42") 37 | assert get() == (OP, ")") 38 | tok.reset(pos) 39 | assert get() == (OP, "(") 40 | tok.reset(pos2) # Forward 41 | assert get() == (OP, "+") 42 | assert get() == (NAME, "abc") 43 | tok.reset(pos) 44 | assert get() == (OP, "(") 45 | assert get() == (NUMBER, "42") 46 | assert get() == (OP, ")") 47 | assert get() == (OP, "+") 48 | assert get() == (NAME, "abc") 49 | -------------------------------------------------------------------------------- /stories/story4/tokenizer.py: -------------------------------------------------------------------------------- 1 | class Tokenizer: 2 | 3 | def __init__(self, tokengen, vis=None): 4 | """Call with tokenize.generate_tokens(...).""" 5 | self.tokengen = tokengen 6 | self.vis = vis 7 | self.tokens = [] 8 | self.pos = 0 9 | 10 | def mark(self): 11 | return self.pos 12 | 13 | def reset(self, pos): 14 | if pos == self.pos: 15 | return 16 | self.pos = pos 17 | self.report() 18 | 19 | def get_token(self): 20 | token = self.peek_token() 21 | self.pos += 1 22 | self.report() 23 | return token 24 | 25 | def peek_token(self): 26 | if self.pos == len(self.tokens): 27 | self.tokens.append(next(self.tokengen)) 28 | self.report() 29 | return self.tokens[self.pos] 30 | 31 | def report(self): 32 | if self.vis is not None: 33 | self.vis.vis_tokens(self.tokens, self.pos) 34 | -------------------------------------------------------------------------------- /stories/story4/toy.gram: -------------------------------------------------------------------------------- 1 | start: statements ENDMARKER 2 | statements: statement NEWLINE statements | statement NEWLINE 3 | statement: if_statement | assignment | expr 4 | expr: expr '+' term | expr '-' term | term 5 | term: term '*' atom | term '/' atom | atom 6 | atom: NAME | NUMBER | '(' expr ')' 7 | assignment: target '=' expr 8 | target: NAME 9 | if_statement: 'if' expr ':' statement 10 | -------------------------------------------------------------------------------- /stories/story5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/we-like-parsers/pegen/3b9f936a30d6c929d2538437cdc0465fa521b8f3/stories/story5/__init__.py -------------------------------------------------------------------------------- /stories/story5/calc.gram: -------------------------------------------------------------------------------- 1 | start: expr NEWLINE { expr } 2 | expr: expr '+' term { expr + term } | expr '-' term { expr - term } | term { term } 3 | term: NUMBER { float(number.string) } 4 | -------------------------------------------------------------------------------- /stories/story5/calc.py: -------------------------------------------------------------------------------- 1 | # This is @generated code; do not edit! 
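# Reading aid: this file is the parser that story5's generator (generator3.py)
# produces from story5/calc.gram, shown just above.  Each alternative of a
# grammar rule becomes one "if (True and ...)" block below, the action in curly
# braces (e.g. { expr + term }) becomes that block's return expression, and the
# left-recursive rule expr is wrapped with @memoize_left_rec instead of
# @memoize.  story5/main.py (further below) shows the regeneration flow; the
# exact command line used to produce this file is not recorded here.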
2 | 3 | from token import NAME, NUMBER, STRING, NEWLINE, ENDMARKER 4 | 5 | from story5.memo import memoize, memoize_left_rec 6 | from story5.node import Node 7 | from story5.parser import Parser 8 | 9 | class CalcParser(Parser): 10 | 11 | @memoize 12 | def start(self): 13 | self.show_rule('start', [['expr', 'NEWLINE']]) 14 | pos = self.mark() 15 | if (True 16 | and self.show_index(0, 0) 17 | and (expr := self.expr()) 18 | and self.show_index(0, 1) 19 | and (newline := self.expect(NEWLINE)) 20 | ): 21 | self.show_index(0, 0, 2) 22 | return expr 23 | self.reset(pos) 24 | self.show_index(0, 0, 0) 25 | return None 26 | 27 | @memoize_left_rec 28 | def expr(self): 29 | self.show_rule('*' + 'expr', [['expr', "'+'", 'term'], ['expr', "'-'", 'term'], ['term']]) 30 | pos = self.mark() 31 | if (True 32 | and self.show_index(0, 0) 33 | and (expr := self.expr()) 34 | and self.show_index(0, 1) 35 | and self.expect('+') 36 | and self.show_index(0, 2) 37 | and (term := self.term()) 38 | ): 39 | self.show_index(0, 0, 3) 40 | return expr + term 41 | self.reset(pos) 42 | if (True 43 | and self.show_index(1, 0) 44 | and (expr := self.expr()) 45 | and self.show_index(1, 1) 46 | and self.expect('-') 47 | and self.show_index(1, 2) 48 | and (term := self.term()) 49 | ): 50 | self.show_index(1, 0, 3) 51 | return expr - term 52 | self.reset(pos) 53 | if (True 54 | and self.show_index(2, 0) 55 | and (term := self.term()) 56 | ): 57 | self.show_index(2, 0, 1) 58 | return term 59 | self.reset(pos) 60 | self.show_index(0, 0, 0) 61 | return None 62 | 63 | @memoize 64 | def term(self): 65 | self.show_rule('term', [['NUMBER']]) 66 | pos = self.mark() 67 | if (True 68 | and self.show_index(0, 0) 69 | and (number := self.expect(NUMBER)) 70 | ): 71 | self.show_index(0, 0, 1) 72 | return float ( number . 
string ) 73 | self.reset(pos) 74 | self.show_index(0, 0, 0) 75 | return None 76 | -------------------------------------------------------------------------------- /stories/story5/calc.txt: -------------------------------------------------------------------------------- 1 | 100 + 50 - 38 - 70 2 | -------------------------------------------------------------------------------- /stories/story5/driver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import curses 3 | import importlib 4 | import sys 5 | from tokenize import generate_tokens 6 | 7 | from story5.parser import Parser 8 | from story5.tokenizer import Tokenizer 9 | from story5.visualizer import Visualizer 10 | 11 | 12 | argparser = argparse.ArgumentParser() 13 | argparser.add_argument("program", nargs="?", default="story5/in.txt", help="Sample program (in.txt)") 14 | argparser.add_argument("-g", "--grammar", default="story5.toy.ToyParser", help="Grammar class (ToyParser)") 15 | argparser.add_argument("-s", "--start", default="start", help="Start symbol (start)") 16 | 17 | 18 | def main(): 19 | args = argparser.parse_args() 20 | filename = args.program 21 | startname = args.start 22 | modname, classname = args.grammar.rsplit(".", 1) 23 | try: 24 | mod = importlib.import_module(modname) 25 | except ImportError: 26 | sys.exit(f"Cannot import {modname}") 27 | try: 28 | cls = getattr(mod, classname) 29 | except AttributeError: 30 | sys.exit(f"Module {modname} has no attribute {classname}") 31 | if not isinstance(cls, type): 32 | sys.exit(f"Object {modname}.{classname} is not a class ({cls!r})") 33 | if not issubclass(cls, Parser): 34 | sys.exit(f"Object {modname}.{classname} is not a subclass of Parser") 35 | 36 | with open(filename) as f: 37 | tokengen = generate_tokens(f.readline) 38 | vis = Visualizer() 39 | tok = Tokenizer(tokengen, vis) 40 | p = cls(tok) 41 | start = getattr(p, startname) 42 | try: 43 | tree = start() 44 | vis.done() 45 | finally: 46 | vis.close() 47 | 48 | 49 | main() 50 | -------------------------------------------------------------------------------- /stories/story5/in.txt: -------------------------------------------------------------------------------- 1 | aap = cat + dog 2 | -------------------------------------------------------------------------------- /stories/story5/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.8 2 | 3 | import argparse 4 | import os 5 | import sys 6 | from tokenize import generate_tokens 7 | 8 | from story5.grammar import GrammarParser 9 | from story5.tokenizer import Tokenizer 10 | from story5.generator3 import generate 11 | from story5.visualizer import Visualizer 12 | 13 | argparser = argparse.ArgumentParser() 14 | argparser.add_argument("grammar", nargs="?", default="story5/toy.gram", help="Grammar file (toy.gram)") 15 | argparser.add_argument("-o", "--output", help="Output file (toy.py)") 16 | argparser.add_argument("-c", "--classname", help="Output class name (ToyParser)") 17 | argparser.add_argument("-v", "--visualize", action="store_true", help="Use visualizer") 18 | 19 | 20 | def main(): 21 | args = argparser.parse_args() 22 | file = args.grammar 23 | outfile = args.output 24 | if not outfile: 25 | head, tail = os.path.split(file) 26 | base, ext = os.path.splitext(tail) 27 | outfile = os.path.join(head, base + ".py") 28 | classname = args.classname 29 | if not classname: 30 | tail = os.path.basename(file) 31 | base, ext = os.path.splitext(tail) 32 | 
classname = base.title() + "Parser" 33 | 34 | print("Reading", file) 35 | with open(file) as f: 36 | tokengen = generate_tokens(f.readline) 37 | vis = None 38 | if args.visualize: 39 | vis = Visualizer() 40 | tok = Tokenizer(tokengen, vis) 41 | p = GrammarParser(tok) 42 | try: 43 | rules = p.grammar() 44 | if vis: 45 | vis.done() 46 | finally: 47 | if vis: 48 | vis.close() 49 | if not rules: 50 | sys.exit("Fail") 51 | print("[") 52 | for rule in rules: 53 | print(f" {rule},") 54 | print("]") 55 | for rule in rules: 56 | print(rule.name, end=": ", file=sys.stderr) 57 | print(*rule.alts, sep=" | ", file=sys.stderr) 58 | 59 | print("writing class", classname, "to", outfile, file=sys.stderr) 60 | with open(outfile, "w") as stream: 61 | generate(rules, classname, stream) 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /stories/story5/memo.py: -------------------------------------------------------------------------------- 1 | def memoize(func): 2 | """Memoize a parsing method. 3 | 4 | The functon must be a method on a class deriving from Parser. 5 | 6 | The method must have either no arguments or a single argument that 7 | is an int or str (the latter being the case for expect()). 8 | 9 | It must return either None or an object that is not modified (at 10 | least not while we're parsing). 11 | 12 | We memoize positive and negative outcomes per input position. 13 | 14 | The function is expected to move the input position iff it returns 15 | a not-None value. 16 | 17 | The memo is structured as a dict of dict, the outer dict indexed 18 | by input position, the inner by function and arguments. 19 | """ 20 | 21 | def memoize_wrapper(self, *args): 22 | vis = self.tokenizer.vis 23 | pos = self.mark() 24 | if vis is not None: 25 | vis.show_call(pos, func.__name__, args) 26 | memo = self.memos.get(pos) 27 | if memo is None: 28 | memo = self.memos[pos] = {} 29 | key = (func, args) 30 | if key in memo: 31 | res, endpos = memo[key] 32 | self.reset(endpos) 33 | else: 34 | res = func(self, *args) 35 | endpos = self.mark() 36 | if res is None: 37 | assert endpos == pos 38 | else: 39 | assert endpos > pos 40 | memo[key] = res, endpos 41 | if vis is not None: 42 | vis.show_return(pos, res, endpos) 43 | return res 44 | 45 | return memoize_wrapper 46 | 47 | 48 | def memoize_left_rec(func): 49 | """Memoize a left-recursive parsing method. 50 | 51 | This is similar to @memoize but loops until no longer parse is obtained. 52 | 53 | Inspired by https://github.com/PhilippeSigaud/Pegged/wiki/Left-Recursion 54 | """ 55 | 56 | def memoize_left_rec_wrapper(self, *args): 57 | vis = self.tokenizer.vis 58 | pos = self.mark() 59 | if vis is not None: 60 | vis.show_call(pos, "*" + func.__name__, args) 61 | memo = self.memos.get(pos) 62 | if memo is None: 63 | memo = self.memos[pos] = {} 64 | key = (func, args) 65 | if key in memo: 66 | res, endpos = memo[key] 67 | self.reset(endpos) 68 | else: 69 | # This is where we deviate from @memoize. 70 | 71 | # Prime the cache with a failure. 72 | memo[key] = lastres, lastpos = None, pos 73 | if vis is not None: 74 | vis.stuff_cache(pos, "*" + func.__name__, args, None) 75 | 76 | # Loop until no longer parse is obtained. 
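            # Illustrative trace (hypothetical rule and input, not from the
            # sources): for expr: expr '+' term | term on the tokens 1 + 2 + 3,
            # pass 1 sees the seeded failure for the recursive expr call, so the
            # left-recursive alternative fails and expr falls back to term,
            # parsing just "1"; pass 2 finds "1" in the cache and parses
            # "1 + 2"; pass 3 parses "1 + 2 + 3"; pass 4 cannot produce a longer
            # parse, endpos stops growing, and the loop below exits keeping the
            # pass-3 result.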
77 | while True: 78 | self.reset(pos) 79 | res = func(self, *args) 80 | endpos = self.mark() 81 | if endpos <= lastpos: 82 | break 83 | memo[key] = lastres, lastpos = res, endpos 84 | if vis is not None: 85 | vis.stuff_cache(pos, "*" + func.__name__, args, res) 86 | 87 | res = lastres 88 | self.reset(lastpos) 89 | 90 | if vis is not None: 91 | vis.show_return(pos, res, endpos) 92 | return res 93 | 94 | return memoize_left_rec_wrapper 95 | -------------------------------------------------------------------------------- /stories/story5/node.py: -------------------------------------------------------------------------------- 1 | from token import tok_name 2 | from tokenize import TokenInfo 3 | 4 | 5 | def short_token(tok: TokenInfo) -> str: 6 | s = tok.string 7 | if s == '' or s.isspace(): 8 | return tok_name[tok.type] 9 | else: 10 | return repr(s) 11 | 12 | 13 | def alt_repr(x) -> str: 14 | if isinstance(x, TokenInfo): 15 | return short_token(x) 16 | else: 17 | return repr(x) 18 | 19 | 20 | class Node: 21 | 22 | def __init__(self, type, children): 23 | self.type = type 24 | self.children = children 25 | 26 | def __repr__(self): 27 | return f"Node({self.type}, [{', '.join(map(alt_repr, self.children))}])" 28 | 29 | def __eq__(self, other): 30 | if not isinstance(other, Node): 31 | return NotImplemented 32 | return self.type == other.type and self.children == other.children 33 | -------------------------------------------------------------------------------- /stories/story5/parser.py: -------------------------------------------------------------------------------- 1 | from story5.memo import memoize 2 | 3 | class Parser: 4 | 5 | def __init__(self, tokenizer): 6 | self.tokenizer = tokenizer 7 | self.memos = {} 8 | 9 | def mark(self): 10 | return self.tokenizer.mark() 11 | 12 | def reset(self, pos): 13 | self.tokenizer.reset(pos) 14 | 15 | def show_rule(self, name, alts): 16 | # alts is a list of lists of strings 17 | vis = self.tokenizer.vis 18 | if vis: 19 | vis.show_rule(name, alts) 20 | 21 | def show_index(self, alt_index, item_index, num_items=1): 22 | vis = self.tokenizer.vis 23 | if vis: 24 | vis.show_index(alt_index, item_index, num_items) 25 | return True 26 | 27 | @memoize 28 | def expect(self, arg): 29 | token = self.tokenizer.peek_token() 30 | if token.type == arg or token.string == arg: 31 | return self.tokenizer.get_token() 32 | return None 33 | -------------------------------------------------------------------------------- /stories/story5/test_grammar.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story5.tokenizer import Tokenizer 6 | from story5.parser import Parser 7 | from story5.grammar import Alt, GrammarParser, Rule 8 | 9 | def test_grammar(): 10 | program = ("stmt: asmt | expr\n" 11 | "asmt: NAME '=' expr\n" 12 | "expr: NAME\n") 13 | file = StringIO(program) 14 | tokengen = generate_tokens(file.readline) 15 | tok = Tokenizer(tokengen) 16 | p = GrammarParser(tok) 17 | rules = p.grammar() 18 | assert rules == [Rule('stmt', [Alt(['asmt']), Alt(['expr'])]), 19 | Rule('asmt', [Alt(['NAME', "'='", 'expr'])]), 20 | Rule('expr', [Alt(['NAME'])])] 21 | 22 | def test_failure(): 23 | program = ("stmt: asmt | expr\n" 24 | "asmt: NAME '=' expr 42\n" 25 | "expr: NAME\n") 26 | file = StringIO(program) 27 | tokengen = generate_tokens(file.readline) 28 | tok = Tokenizer(tokengen) 29 | p = GrammarParser(tok) 30 | rules 
= p.grammar() 31 | assert rules is None 32 | 33 | def test_action(): 34 | program = "start: NAME { foo + bar } | NUMBER { -baz }\n" 35 | file = StringIO(program) 36 | tokengen = generate_tokens(file.readline) 37 | tok = Tokenizer(tokengen) 38 | p = GrammarParser(tok) 39 | rules = p.grammar() 40 | assert rules == [Rule("start", [Alt(["NAME"], "foo + bar"), 41 | Alt(["NUMBER"], "- baz")])] 42 | assert rules != [Rule("start", [Alt(["NAME"], "foo + bar"), 43 | Alt(["NUMBER"], "baz")])] 44 | 45 | def test_action_repr_str(): 46 | alt = Alt(["one", "two"]) 47 | assert repr(alt) == "Alt(['one', 'two'])" 48 | assert str(alt) == "one two" 49 | 50 | alt = Alt(["one", "two"], "foo + bar") 51 | assert repr(alt) == "Alt(['one', 'two'], 'foo + bar')" 52 | assert str(alt) == "one two { foo + bar }" 53 | 54 | def test_indents(): 55 | program = ("stmt: foo | bar\n" 56 | " | baz\n" 57 | " | booh | bah\n") 58 | file = StringIO(program) 59 | tokengen = generate_tokens(file.readline) 60 | tok = Tokenizer(tokengen) 61 | p = GrammarParser(tok) 62 | rules = p.grammar() 63 | assert rules == [Rule('stmt', 64 | [Alt(['foo']), Alt(['bar']), 65 | Alt(['baz']), 66 | Alt(['booh']), Alt(['bah'])])] 67 | 68 | def test_indents2(): 69 | program = ("stmt:\n" 70 | " | foo | bar\n" 71 | " | baz\n" 72 | " | booh | bah\n" 73 | "foo: bar\n") 74 | file = StringIO(program) 75 | tokengen = generate_tokens(file.readline) 76 | tok = Tokenizer(tokengen) 77 | p = GrammarParser(tok) 78 | rules = p.grammar() 79 | assert rules == [Rule('stmt', 80 | [Alt(['foo']), Alt(['bar']), 81 | Alt(['baz']), 82 | Alt(['booh']), Alt(['bah'])]), 83 | Rule('foo', [Alt(['bar'])])] 84 | -------------------------------------------------------------------------------- /stories/story5/test_parser.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story5.tokenizer import Tokenizer 6 | from story5.parser import Parser 7 | from story5.toy import ToyParser 8 | 9 | def test_basic(): 10 | program = "f(42)" 11 | file = StringIO(program) 12 | tokengen = generate_tokens(file.readline) 13 | tok = Tokenizer(tokengen) 14 | p = Parser(tok) 15 | t = p.expect(NAME) 16 | assert t and t.string == "f" 17 | pos = p.mark() 18 | assert p.expect("(") 19 | t = p.expect(NUMBER) 20 | assert t and t.string == "42" 21 | assert p.expect(")") 22 | pos2 = p.mark() 23 | p.reset(pos) 24 | assert p.expect("(") 25 | assert p.expect(NUMBER) 26 | assert p.expect(")") 27 | p.reset(pos) 28 | 29 | assert p.expect("(") 30 | p.reset(pos2) 31 | assert p.expect(NEWLINE) 32 | assert p.expect(ENDMARKER) 33 | 34 | def test_toy(): 35 | program = "x - (y + z)" 36 | file = StringIO(program) 37 | tokengen = generate_tokens(file.readline) 38 | tok = Tokenizer(tokengen) 39 | p = ToyParser(tok) 40 | tree = p.statement() 41 | print(tree) 42 | assert tree and tree.type == "statement" 43 | assert tree.children[0].type == "expr" 44 | assert tree.children[0].children[0].type == "expr" 45 | -------------------------------------------------------------------------------- /stories/story5/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, OP, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story5.tokenizer import Tokenizer 6 | 7 | def test_basic(): 8 | program = "f(42)" 9 | file = StringIO(program) 10 | tokengen = 
generate_tokens(file.readline) 11 | tok = Tokenizer(tokengen) 12 | def get(): 13 | return tok.get_token()[:2] 14 | assert get() == (NAME, "f") 15 | assert get() == (OP, "(") 16 | assert get() == (NUMBER, "42") 17 | assert get() == (OP, ")") 18 | assert get() == (NEWLINE, "") 19 | assert get() == (ENDMARKER, "") 20 | 21 | def test_mark_reset(): 22 | program = "f(42) + abc" 23 | file = StringIO(program) 24 | tokengen = generate_tokens(file.readline) 25 | tok = Tokenizer(tokengen) 26 | def get(): 27 | return tok.get_token()[:2] 28 | assert get() == (NAME, "f") 29 | pos = tok.mark() 30 | assert get() == (OP, "(") 31 | assert get() == (NUMBER, "42") 32 | assert get() == (OP, ")") 33 | pos2 = tok.mark() 34 | tok.reset(pos) 35 | assert get() == (OP, "(") 36 | assert get() == (NUMBER, "42") 37 | assert get() == (OP, ")") 38 | tok.reset(pos) 39 | assert get() == (OP, "(") 40 | tok.reset(pos2) # Forward 41 | assert get() == (OP, "+") 42 | assert get() == (NAME, "abc") 43 | tok.reset(pos) 44 | assert get() == (OP, "(") 45 | assert get() == (NUMBER, "42") 46 | assert get() == (OP, ")") 47 | assert get() == (OP, "+") 48 | assert get() == (NAME, "abc") 49 | -------------------------------------------------------------------------------- /stories/story5/tokenizer.py: -------------------------------------------------------------------------------- 1 | class Tokenizer: 2 | 3 | def __init__(self, tokengen, vis=None): 4 | """Call with tokenize.generate_tokens(...).""" 5 | self.tokengen = tokengen 6 | self.vis = vis 7 | self.tokens = [] 8 | self.pos = 0 9 | 10 | def mark(self): 11 | return self.pos 12 | 13 | def reset(self, pos): 14 | if pos == self.pos: 15 | return 16 | self.pos = pos 17 | self.report() 18 | 19 | def get_token(self): 20 | token = self.peek_token() 21 | self.pos += 1 22 | self.report() 23 | return token 24 | 25 | def peek_token(self): 26 | if self.pos == len(self.tokens): 27 | self.tokens.append(next(self.tokengen)) 28 | self.report() 29 | return self.tokens[self.pos] 30 | 31 | def report(self): 32 | if self.vis is not None: 33 | self.vis.vis_tokens(self.tokens, self.pos) 34 | -------------------------------------------------------------------------------- /stories/story5/toy.gram: -------------------------------------------------------------------------------- 1 | start: statements ENDMARKER 2 | statements: statement NEWLINE statements 3 | | statement NEWLINE 4 | statement: if_statement 5 | | assignment 6 | | expr 7 | expr: expr '+' term 8 | | expr '-' term 9 | | term 10 | term: term '*' atom 11 | | term '/' atom 12 | | atom 13 | atom: NAME 14 | | NUMBER 15 | | '(' expr ')' 16 | assignment: target '=' expr 17 | target: NAME 18 | if_statement: 'if' expr ':' statement 19 | -------------------------------------------------------------------------------- /stories/story6/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/we-like-parsers/pegen/3b9f936a30d6c929d2538437cdc0465fa521b8f3/stories/story6/__init__.py -------------------------------------------------------------------------------- /stories/story6/calc.gram: -------------------------------------------------------------------------------- 1 | @subheader "from ast import literal_eval\n" 2 | 3 | start: expr_stmt* ENDMARKER 4 | expr_stmt: expr NEWLINE { print(expr) or True } 5 | 6 | expr: 7 | | expr '+' ~ term { expr + term } 8 | | expr '-' ~ term { expr - term } 9 | | term { term } 10 | 11 | term: 12 | | '-' ~ term { - term } 13 | | '+' ~ term { + term } 14 | | term 
'*' ~ factor { term * factor } 15 | | term '/' ~ factor { term / factor } 16 | | term '//' ~ factor { term // factor } 17 | | factor { factor } 18 | 19 | factor: 20 | | atom '**' ~ factor { atom ** factor } 21 | | atom { atom } 22 | 23 | atom: 24 | | STRING { literal_eval(string.string) } 25 | | NUMBER { literal_eval(number.string) } 26 | | '(' ~ expr ')' { expr } 27 | -------------------------------------------------------------------------------- /stories/story6/calc.txt: -------------------------------------------------------------------------------- 1 | + 100 + 50 + - 38 - 70 2 | -------------------------------------------------------------------------------- /stories/story6/driver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import curses 3 | import importlib 4 | import sys 5 | from tokenize import generate_tokens 6 | 7 | from story6.parser import Parser 8 | from story6.tokenizer import Tokenizer 9 | from story6.visualizer import Visualizer 10 | 11 | 12 | argparser = argparse.ArgumentParser() 13 | argparser.add_argument("program", nargs="?", default="story6/in.txt", help="Sample program (in.txt)") 14 | argparser.add_argument("-g", "--grammar", default="story6.toy.ToyParser", help="Grammar class (ToyParser)") 15 | argparser.add_argument("-s", "--start", default="start", help="Start symbol (start)") 16 | argparser.add_argument("-q", "--quiet", action="store_true", help="Don't use visualizer") 17 | 18 | 19 | def main(): 20 | args = argparser.parse_args() 21 | filename = args.program 22 | startname = args.start 23 | modname, classname = args.grammar.rsplit(".", 1) 24 | try: 25 | mod = importlib.import_module(modname) 26 | except ImportError: 27 | sys.exit(f"Cannot import {modname}") 28 | try: 29 | cls = getattr(mod, classname) 30 | except AttributeError: 31 | sys.exit(f"Module {modname} has no attribute {classname}") 32 | if not isinstance(cls, type): 33 | sys.exit(f"Object {modname}.{classname} is not a class ({cls!r})") 34 | if not issubclass(cls, Parser): 35 | sys.exit(f"Object {modname}.{classname} is not a subclass of Parser") 36 | 37 | tree = None 38 | with open(filename) as f: 39 | tokengen = generate_tokens(f.readline) 40 | if args.quiet: 41 | vis = None 42 | else: 43 | vis = Visualizer() 44 | try: 45 | tok = Tokenizer(tokengen, vis) 46 | p = cls(tok) 47 | start = getattr(p, startname) 48 | tree = start() 49 | if vis: 50 | vis.done() 51 | finally: 52 | if vis: 53 | vis.close() 54 | 55 | if tree: 56 | print(tree) 57 | else: 58 | if tok.tokens: 59 | last = tok.tokens[-1] 60 | print(f"Line {last.start[0]}:") 61 | print(last.line) 62 | print(" "*last.start[1] + "^") 63 | sys.exit("SyntaxError") 64 | 65 | 66 | main() 67 | -------------------------------------------------------------------------------- /stories/story6/grammar.gram: -------------------------------------------------------------------------------- 1 | @class GrammarParser 2 | 3 | @subheader """ 4 | from ast import literal_eval 5 | from token import DEDENT, INDENT, OP 6 | 7 | from story6.grammar import Grammar, Rule, Alt, NamedItem, Lookahead, Maybe, Loop, Cut 8 | 9 | BaseParser = Parser 10 | 11 | class Parser(BaseParser): 12 | 13 | def __init__(self, tokenizer): 14 | super().__init__(tokenizer) 15 | self.extra_rules = [] 16 | 17 | def synthetic_rule(self, alts): 18 | if len(alts) == 1 and len(alts[0].items) == 1: 19 | return alts[0].items[0] 20 | name = f"_synthetic_rule_{len(self.extra_rules)}" 21 | rule = Rule(name, alts) 22 | self.extra_rules.append(rule) 
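        # Note: a group that is a single one-item alternative is inlined by the
        # early return above; anything larger gets a fresh _synthetic_rule_N
        # name, and because the grammar actions build
        # Grammar(rules + self.extra_rules, ...), the generator later emits code
        # for these synthetic rules exactly as it does for rules written out by
        # hand.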
23 | return rule.name 24 | """ 25 | 26 | start: grammar ENDMARKER { grammar } 27 | 28 | grammar: 29 | | metas rules { Grammar(rules + self.extra_rules, metas) } 30 | | rules { Grammar(rules + self.extra_rules, []) } 31 | 32 | metas: 33 | | meta metas { [meta] + metas } 34 | | meta { [meta] } 35 | 36 | meta: 37 | | "@" NAME NEWLINE { (name.string, None) } 38 | | "@" NAME NAME NEWLINE { (name.string, name1.string) } 39 | | "@" NAME STRING NEWLINE { (name.string, literal_eval(string.string)) } 40 | 41 | rules: 42 | | rule rules { [rule] + rules } 43 | | rule { [rule] } 44 | 45 | rule: 46 | | NAME ":" alts NEWLINE INDENT more_alts DEDENT { Rule(name.string, alts + more_alts) } 47 | | NAME ":" NEWLINE INDENT more_alts DEDENT { Rule(name.string, more_alts) } 48 | | NAME ":" alts NEWLINE { Rule(name.string, alts) } 49 | 50 | more_alts: 51 | | "|" alts NEWLINE more_alts { alts + more_alts } 52 | | "|" alts NEWLINE { alts } 53 | 54 | alts: 55 | | alt "|" alts { [alt] + alts } 56 | | alt { [alt] } 57 | 58 | alt: 59 | | items action { Alt(items, action) } 60 | | items { Alt(items, None) } 61 | 62 | items: 63 | | item items { [item] + items } 64 | | item { [item] } 65 | 66 | item: 67 | | NAME '=' atom { NamedItem(name.string, atom) } 68 | | atom { atom } 69 | 70 | atom: 71 | | NAME { name.string } 72 | | STRING {string.string } 73 | 74 | action: "{" stuffs "}" { stuffs } 75 | 76 | stuffs: 77 | | stuff stuffs { stuff + " " + stuffs } 78 | | stuff { stuff } 79 | 80 | stuff: 81 | | "{" stuffs "}" { "{" + stuffs + "}" }
 82 | | NAME { name.string } 83 | | NUMBER { number.string } 84 | | STRING { string.string } 85 | | OP { None if op.string == "}" else op.string } 86 | -------------------------------------------------------------------------------- /stories/story6/in.txt: -------------------------------------------------------------------------------- 1 | aap = cat + dog 2 | -------------------------------------------------------------------------------- /stories/story6/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.8 2 | 3 | import argparse 4 | import os 5 | import sys 6 | from tokenize import generate_tokens 7 | 8 | from story6.tokenizer import Tokenizer 9 | from story6.generator3 import check, generate 10 | from story6.visualizer import Visualizer 11 | 12 | argparser = argparse.ArgumentParser() 13 | argparser.add_argument("grammar", nargs="?", help="Grammar file (toy.gram)") 14 | argparser.add_argument("-r", "--regen", action="store_true", help="Regenerate grammar") 15 | argparser.add_argument("-o", "--output", help="Output file (toy.py)") 16 | argparser.add_argument("-c", "--classname", help="Output class name (ToyParser)") 17 | argparser.add_argument("-v", "--visualize", action="store_true", help="Use visualizer") 18 | argparser.add_argument("-b", "--backup", action="store_true", help="Use old grammar parser") 19 | 20 | 21 | def main(): 22 | args = argparser.parse_args() 23 | file = args.grammar 24 | if not file: 25 | if args.regen: 26 | file = "story6/grammar.gram" 27 | else: 28 | file = "story6/toy.gram" 29 | outfile = args.output 30 | if not outfile: 31 | head, tail = os.path.split(file) 32 | base, ext = os.path.splitext(tail) 33 | if base == "grammar": 34 | base += "parser" 35 | outfile = os.path.join(head, base + ".py") 36 | classname = args.classname 37 | 38 | if args.backup: 39 | from story6.grammar import GrammarParser 40 | else: 41 | from story6.grammarparser import GrammarParser 42 | 43 | print("Reading", file, file=sys.stderr) 44 | with open(file) as f: 45 | tokengen = generate_tokens(f.readline) 46 | vis = None 47 | if args.visualize: 48 | vis = Visualizer() 49 | try: 50 | tok = Tokenizer(tokengen, vis) 51 | p = GrammarParser(tok) 52 | grammar = p.start() 53 | if vis: 54 | vis.done() 55 | finally: 56 | if vis: 57 | vis.close() 58 | 59 | if not grammar: 60 | if tok.tokens: 61 | last = tok.tokens[-1] 62 | print(f"Line {last.start[0]}:") 63 | print(last.line) 64 | print(" "*last.start[1] + "^") 65 | sys.exit("SyntaxError") 66 | 67 | print(repr(grammar)) 68 | print(str(grammar)) 69 | 70 | if not classname: 71 | classname = grammar.metas_dict.get("class") 72 | if not classname: 73 | tail = os.path.basename(file) 74 | base, ext = os.path.splitext(tail) 75 | classname = base.title() + "Parser" 76 | 77 | errors = check(grammar) 78 | if errors: 79 | sys.exit(f"Detected {errors} errors") 80 | 81 | print("Writing class", classname, "to", outfile, file=sys.stderr) 82 | with open(outfile, "w") as stream: 83 | generate(grammar, classname, stream) 84 | 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /stories/story6/memo.py: -------------------------------------------------------------------------------- 1 | def memoize(func): 2 | """Memoize a parsing method. 3 | 4 | The functon must be a method on a class deriving from Parser. 
5 | 6 | The method must have either no arguments or a single argument that 7 | is an int or str (the latter being the case for expect()). 8 | 9 | It must return either None or an object that is not modified (at 10 | least not while we're parsing). 11 | 12 | We memoize positive and negative outcomes per input position. 13 | 14 | The function is expected to move the input position iff it returns 15 | a not-None value. 16 | 17 | The memo is structured as a dict of dict, the outer dict indexed 18 | by input position, the inner by function and arguments. 19 | """ 20 | 21 | def memoize_wrapper(self, *args): 22 | vis = self.tokenizer.vis 23 | pos = self.mark() 24 | if vis is not None: 25 | vis.show_call(pos, func.__name__, args) 26 | memo = self.memos.get(pos) 27 | if memo is None: 28 | memo = self.memos[pos] = {} 29 | key = (func, args) 30 | if key in memo: 31 | res, endpos = memo[key] 32 | self.reset(endpos) 33 | else: 34 | res = func(self, *args) 35 | endpos = self.mark() 36 | if res is None: 37 | assert endpos == pos 38 | else: 39 | assert endpos > pos 40 | memo[key] = res, endpos 41 | if vis is not None: 42 | vis.show_return(pos, res, endpos) 43 | return res 44 | 45 | return memoize_wrapper 46 | 47 | 48 | def memoize_left_rec(func): 49 | """Memoize a left-recursive parsing method. 50 | 51 | This is similar to @memoize but loops until no longer parse is obtained. 52 | 53 | Inspired by https://github.com/PhilippeSigaud/Pegged/wiki/Left-Recursion 54 | """ 55 | 56 | def memoize_left_rec_wrapper(self, *args): 57 | vis = self.tokenizer.vis 58 | pos = self.mark() 59 | if vis is not None: 60 | vis.show_call(pos, "*" + func.__name__, args) 61 | memo = self.memos.get(pos) 62 | if memo is None: 63 | memo = self.memos[pos] = {} 64 | key = (func, args) 65 | if key in memo: 66 | res, endpos = memo[key] 67 | self.reset(endpos) 68 | else: 69 | # This is where we deviate from @memoize. 70 | 71 | # Prime the cache with a failure. 72 | memo[key] = lastres, lastpos = None, pos 73 | if vis is not None: 74 | vis.stuff_cache(pos, "*" + func.__name__, args, None) 75 | 76 | # Loop until no longer parse is obtained. 77 | while True: 78 | self.reset(pos) 79 | res = func(self, *args) 80 | endpos = self.mark() 81 | if endpos <= lastpos: 82 | break 83 | memo[key] = lastres, lastpos = res, endpos 84 | if vis is not None: 85 | vis.stuff_cache(pos, "*" + func.__name__, args, res) 86 | 87 | res = lastres 88 | self.reset(lastpos) 89 | 90 | if vis is not None: 91 | vis.show_return(pos, res, endpos) 92 | return res 93 | 94 | return memoize_left_rec_wrapper 95 | -------------------------------------------------------------------------------- /stories/story6/memo2.py: -------------------------------------------------------------------------------- 1 | def memoize_left_rec(func): 2 | 3 | def memoize_left_rec_wrapper(self, *args): 4 | pos = self.mark() 5 | memo = self.memos.get(pos) 6 | if memo is None: 7 | memo = self.memos[pos] = {} 8 | key = (func, args) 9 | if key in memo: 10 | res, endpos = memo[key] 11 | self.reset(endpos) 12 | else: 13 | # This is where we deviate from @memoize. 14 | 15 | # Prime the cache with a failure. 16 | memo[key] = lastres, lastpos = None, pos 17 | 18 | # Loop until no longer parse is obtained. 
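            # Why this terminates (informal argument): every pass re-runs func
            # from the same start position, but the cache now answers the
            # recursive call with the previous, longer result, so endpos can
            # only grow from pass to pass; as soon as it fails to grow
            # (endpos <= lastpos) we break, and it cannot grow past the end of
            # the input, so the number of passes is bounded by the remaining
            # tokens.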
19 | while True: 20 | self.reset(pos) 21 | res = func(self, *args) 22 | endpos = self.mark() 23 | if endpos <= lastpos: 24 | break 25 | memo[key] = lastres, lastpos = res, endpos 26 | 27 | res = lastres 28 | self.reset(lastpos) 29 | 30 | return res 31 | 32 | return memoize_left_rec_wrapper 33 | -------------------------------------------------------------------------------- /stories/story6/node.py: -------------------------------------------------------------------------------- 1 | from token import tok_name 2 | from tokenize import TokenInfo 3 | 4 | 5 | def short_token(tok: TokenInfo) -> str: 6 | s = tok.string 7 | if s == '' or s.isspace(): 8 | return tok_name[tok.type] 9 | else: 10 | return repr(s) 11 | 12 | 13 | def alt_repr(x) -> str: 14 | if isinstance(x, TokenInfo): 15 | return short_token(x) 16 | else: 17 | return repr(x) 18 | 19 | 20 | class Node: 21 | 22 | def __init__(self, type, children): 23 | self.type = type 24 | self.children = children 25 | 26 | def __repr__(self): 27 | return f"Node({self.type}, [{', '.join(map(alt_repr, self.children))}])" 28 | 29 | def __eq__(self, other): 30 | if not isinstance(other, Node): 31 | return NotImplemented 32 | return self.type == other.type and self.children == other.children 33 | -------------------------------------------------------------------------------- /stories/story6/parser.py: -------------------------------------------------------------------------------- 1 | from story6.memo import memoize 2 | 3 | class Parser: 4 | 5 | def __init__(self, tokenizer): 6 | self.tokenizer = tokenizer 7 | self.memos = {} 8 | 9 | def mark(self): 10 | return self.tokenizer.mark() 11 | 12 | def reset(self, pos): 13 | self.tokenizer.reset(pos) 14 | 15 | def show_rule(self, name, alts): 16 | # alts is a list of lists of strings 17 | vis = self.tokenizer.vis 18 | if vis: 19 | vis.show_rule(name, alts) 20 | 21 | def show_index(self, alt_index, item_index, num_items=1): 22 | vis = self.tokenizer.vis 23 | if vis: 24 | vis.show_index(alt_index, item_index, num_items) 25 | return True 26 | 27 | @memoize 28 | def expect(self, arg): 29 | token = self.tokenizer.peek_token() 30 | if token.type == arg or token.string == arg: 31 | return self.tokenizer.get_token() 32 | return None 33 | 34 | def loop(self, nonempty, func, *args): 35 | mark = self.mark() 36 | nodes = [] 37 | while node := func(*args) is not None: 38 | nodes.append(node) 39 | if len(nodes) >= nonempty: 40 | return nodes 41 | self.reset(mark) 42 | return None 43 | 44 | def lookahead(self, positive, func, *args): 45 | mark = self.mark() 46 | ok = func(*args) is not None 47 | self.reset(mark) 48 | return ok == positive 49 | -------------------------------------------------------------------------------- /stories/story6/test_grammar.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story6.tokenizer import Tokenizer 6 | from story6.parser import Parser 7 | from story6.grammar import Rule, Alt, NamedItem, Maybe, Loop, Lookahead, Cut 8 | from story6.grammarparser import GrammarParser 9 | 10 | def start(program): 11 | file = StringIO(program) 12 | tokengen = generate_tokens(file.readline) 13 | tok = Tokenizer(tokengen) 14 | p = GrammarParser(tok) 15 | return p.start() 16 | 17 | def test_grammar(): 18 | program = ("stmt: asmt | expr\n" 19 | "asmt: NAME '=' expr\n" 20 | "expr: NAME\n") 21 | rules = start(program).rules 22 | assert rules == 
[Rule('stmt', [Alt(['asmt']), Alt(['expr'])]), 23 | Rule('asmt', [Alt(['NAME', "'='", 'expr'])]), 24 | Rule('expr', [Alt(['NAME'])])] 25 | 26 | def test_failure(): 27 | program = ("stmt: asmt | expr\n" 28 | "asmt: NAME '=' expr 42\n" 29 | "expr: NAME\n") 30 | grammar = start(program) 31 | assert grammar is None 32 | 33 | def test_action(): 34 | program = "start: NAME { foo + bar } | NUMBER { -baz }\n" 35 | rules = start(program).rules 36 | assert rules == [Rule("start", [Alt(["NAME"], "foo + bar"), 37 | Alt(["NUMBER"], "- baz")])] 38 | assert rules != [Rule("start", [Alt(["NAME"], "foo + bar"), 39 | Alt(["NUMBER"], "baz")])] 40 | 41 | def test_action_repr_str(): 42 | alt = Alt(["one", "two"]) 43 | assert repr(alt) == "Alt(['one', 'two'])" 44 | assert str(alt) == "one two" 45 | 46 | alt = Alt(["one", "two"], "foo + bar") 47 | assert repr(alt) == "Alt(['one', 'two'], 'foo + bar')" 48 | assert str(alt) == "one two { foo + bar }" 49 | 50 | def test_indents(): 51 | program = ("stmt: foo | bar\n" 52 | " | baz\n" 53 | " | booh | bah\n") 54 | rules = start(program).rules 55 | assert rules == [Rule('stmt', 56 | [Alt(['foo']), Alt(['bar']), 57 | Alt(['baz']), 58 | Alt(['booh']), Alt(['bah'])])] 59 | 60 | def test_indents2(): 61 | program = ("stmt:\n" 62 | " | foo | bar\n" 63 | " | baz\n" 64 | " | booh | bah\n" 65 | "foo: bar\n") 66 | rules = start(program).rules 67 | assert rules == [Rule('stmt', 68 | [Alt(['foo']), Alt(['bar']), 69 | Alt(['baz']), 70 | Alt(['booh']), Alt(['bah'])]), 71 | Rule('foo', [Alt(['bar'])])] 72 | 73 | def test_meta(): 74 | program = ("@start 'start'\n" 75 | "@foo bar\n" 76 | "@bar\n" 77 | "stmt: foo\n") 78 | grammar = start(program) 79 | assert grammar 80 | assert grammar.rules == [Rule('stmt', [Alt(["foo"])])] 81 | assert grammar.metas == [('start', 'start'), 82 | ('foo', 'bar'), 83 | ('bar', None)] 84 | 85 | def test_named_item(): 86 | program = ("start: f=foo\n" 87 | "foo: n=NAME\n") 88 | file = StringIO(program) 89 | tokengen = generate_tokens(file.readline) 90 | tok = Tokenizer(tokengen) 91 | p = GrammarParser(tok) 92 | rules = p.start().rules 93 | assert rules == [Rule('start', [Alt([NamedItem('f', 'foo')])]), 94 | Rule('foo', [Alt([NamedItem('n', 'NAME')])])] 95 | -------------------------------------------------------------------------------- /stories/story6/test_parser.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story6.tokenizer import Tokenizer 6 | from story6.parser import Parser 7 | from story6.toy import ToyParser 8 | 9 | def test_basic(): 10 | program = "f(42)" 11 | file = StringIO(program) 12 | tokengen = generate_tokens(file.readline) 13 | tok = Tokenizer(tokengen) 14 | p = Parser(tok) 15 | t = p.expect(NAME) 16 | assert t and t.string == "f" 17 | pos = p.mark() 18 | assert p.expect("(") 19 | t = p.expect(NUMBER) 20 | assert t and t.string == "42" 21 | assert p.expect(")") 22 | pos2 = p.mark() 23 | p.reset(pos) 24 | assert p.expect("(") 25 | assert p.expect(NUMBER) 26 | assert p.expect(")") 27 | p.reset(pos) 28 | 29 | assert p.expect("(") 30 | p.reset(pos2) 31 | assert p.expect(NEWLINE) 32 | assert p.expect(ENDMARKER) 33 | 34 | def test_toy(): 35 | program = "x - (y + z)" 36 | file = StringIO(program) 37 | tokengen = generate_tokens(file.readline) 38 | tok = Tokenizer(tokengen) 39 | p = ToyParser(tok) 40 | tree = p.statement() 41 | print(tree) 42 | assert tree and tree.type == 
"statement" 43 | assert tree.children[0].type == "expr" 44 | assert tree.children[0].children[0].type == "expr" 45 | -------------------------------------------------------------------------------- /stories/story6/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, OP, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story6.tokenizer import Tokenizer 6 | 7 | def test_basic(): 8 | program = "f(42)" 9 | file = StringIO(program) 10 | tokengen = generate_tokens(file.readline) 11 | tok = Tokenizer(tokengen) 12 | def get(): 13 | return tok.get_token()[:2] 14 | assert get() == (NAME, "f") 15 | assert get() == (OP, "(") 16 | assert get() == (NUMBER, "42") 17 | assert get() == (OP, ")") 18 | assert get() == (NEWLINE, "") 19 | assert get() == (ENDMARKER, "") 20 | 21 | def test_mark_reset(): 22 | program = "f(42) + abc" 23 | file = StringIO(program) 24 | tokengen = generate_tokens(file.readline) 25 | tok = Tokenizer(tokengen) 26 | def get(): 27 | return tok.get_token()[:2] 28 | assert get() == (NAME, "f") 29 | pos = tok.mark() 30 | assert get() == (OP, "(") 31 | assert get() == (NUMBER, "42") 32 | assert get() == (OP, ")") 33 | pos2 = tok.mark() 34 | tok.reset(pos) 35 | assert get() == (OP, "(") 36 | assert get() == (NUMBER, "42") 37 | assert get() == (OP, ")") 38 | tok.reset(pos) 39 | assert get() == (OP, "(") 40 | tok.reset(pos2) # Forward 41 | assert get() == (OP, "+") 42 | assert get() == (NAME, "abc") 43 | tok.reset(pos) 44 | assert get() == (OP, "(") 45 | assert get() == (NUMBER, "42") 46 | assert get() == (OP, ")") 47 | assert get() == (OP, "+") 48 | assert get() == (NAME, "abc") 49 | -------------------------------------------------------------------------------- /stories/story6/tokenizer.py: -------------------------------------------------------------------------------- 1 | from tokenize import ERRORTOKEN, NL, COMMENT 2 | 3 | 4 | class Tokenizer: 5 | 6 | def __init__(self, tokengen, vis=None): 7 | """Call with tokenize.generate_tokens(...).""" 8 | self.tokengen = tokengen 9 | self.vis = vis 10 | self.tokens = [] 11 | self.pos = 0 12 | 13 | def mark(self): 14 | return self.pos 15 | 16 | def reset(self, pos): 17 | if pos == self.pos: 18 | return 19 | self.pos = pos 20 | self.report() 21 | 22 | def get_token(self): 23 | token = self.peek_token() 24 | self.pos += 1 25 | self.report() 26 | return token 27 | 28 | def peek_token(self): 29 | if self.pos == len(self.tokens): 30 | while True: 31 | token = next(self.tokengen) 32 | if token.type == ERRORTOKEN and token.string.isspace(): 33 | continue 34 | if token.type in (NL, COMMENT): 35 | continue 36 | break 37 | self.tokens.append(token) 38 | self.report() 39 | return self.tokens[self.pos] 40 | 41 | def report(self): 42 | if self.vis is not None: 43 | self.vis.vis_tokens(self.tokens, self.pos) 44 | -------------------------------------------------------------------------------- /stories/story6/toy.gram: -------------------------------------------------------------------------------- 1 | # Toy grammar. 2 | 3 | @class ToyParser 4 | 5 | @subheader """# This is the toy grammar used in the blog series. 
6 | """ 7 | 8 | @trailer """ 9 | # The end.""" 10 | 11 | start: statements ENDMARKER 12 | statements: statement NEWLINE statements 13 | | statement NEWLINE 14 | statement: if_statement 15 | | assignment 16 | | expr 17 | expr: expr '+' term 18 | | expr '-' term 19 | | term 20 | term: term '*' atom 21 | | term '/' atom 22 | | atom 23 | atom: NAME 24 | | NUMBER 25 | | '(' expr ')' 26 | assignment: target '=' expr 27 | target: NAME 28 | if_statement: 'if' expr ':' statement 29 | -------------------------------------------------------------------------------- /stories/story7/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/we-like-parsers/pegen/3b9f936a30d6c929d2538437cdc0465fa521b8f3/stories/story7/__init__.py -------------------------------------------------------------------------------- /stories/story7/calc.gram: -------------------------------------------------------------------------------- 1 | @subheader "from ast import literal_eval\n" 2 | 3 | start: expr_stmt* ENDMARKER 4 | expr_stmt: expr NEWLINE { print(expr) or True } 5 | 6 | expr: 7 | | expr '+' ~ term { expr + term } 8 | | expr '-' ~ term { expr - term } 9 | | term { term } 10 | 11 | term: 12 | | '-' ~ term { - term } 13 | | '+' ~ term { + term } 14 | | term '*' ~ factor { term * factor } 15 | | term '/' ~ factor { term / factor } 16 | | term '//' ~ factor { term // factor } 17 | | factor { factor } 18 | 19 | factor: 20 | | atom '**' ~ factor { atom ** factor } 21 | | atom { atom } 22 | 23 | atom: 24 | | STRING { literal_eval(string.string) } 25 | | NUMBER { literal_eval(number.string) } 26 | | '(' ~ expr ')' { expr } 27 | -------------------------------------------------------------------------------- /stories/story7/calc.txt: -------------------------------------------------------------------------------- 1 | + 100 + 50 + - 38 - 70 2 | -------------------------------------------------------------------------------- /stories/story7/driver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import curses 3 | import importlib 4 | import sys 5 | from tokenize import generate_tokens 6 | 7 | from story7.parser import Parser 8 | from story7.tokenizer import Tokenizer 9 | from story7.visualizer import Visualizer 10 | 11 | 12 | argparser = argparse.ArgumentParser() 13 | argparser.add_argument("program", nargs="?", default="story7/in.txt", help="Sample program (in.txt)") 14 | argparser.add_argument("-g", "--grammar", default="story7.toy.ToyParser", help="Grammar class (ToyParser)") 15 | argparser.add_argument("-s", "--start", default="start", help="Start symbol (start)") 16 | argparser.add_argument("-q", "--quiet", action="store_true", help="Don't use visualizer") 17 | 18 | 19 | def main(): 20 | args = argparser.parse_args() 21 | filename = args.program 22 | startname = args.start 23 | modname, classname = args.grammar.rsplit(".", 1) 24 | try: 25 | mod = importlib.import_module(modname) 26 | except ImportError: 27 | sys.exit(f"Cannot import {modname}") 28 | try: 29 | cls = getattr(mod, classname) 30 | except AttributeError: 31 | sys.exit(f"Module {modname} has no attribute {classname}") 32 | if not isinstance(cls, type): 33 | sys.exit(f"Object {modname}.{classname} is not a class ({cls!r})") 34 | if not issubclass(cls, Parser): 35 | sys.exit(f"Object {modname}.{classname} is not a subclass of Parser") 36 | 37 | tree = None 38 | with open(filename) as f: 39 | tokengen = generate_tokens(f.readline) 40 | if 
args.quiet: 41 | vis = None 42 | else: 43 | vis = Visualizer() 44 | try: 45 | tok = Tokenizer(tokengen, vis) 46 | p = cls(tok) 47 | start = getattr(p, startname) 48 | tree = start() 49 | if vis: 50 | vis.done() 51 | finally: 52 | if vis: 53 | vis.close() 54 | 55 | if tree: 56 | print(tree) 57 | else: 58 | if tok.tokens: 59 | last = tok.tokens[-1] 60 | print(f"Line {last.start[0]}:") 61 | print(last.line) 62 | print(" "*last.start[1] + "^") 63 | sys.exit("SyntaxError") 64 | 65 | 66 | main() 67 | -------------------------------------------------------------------------------- /stories/story7/grammar.gram: -------------------------------------------------------------------------------- 1 | @class GrammarParser 2 | 3 | @subheader """ 4 | from ast import literal_eval 5 | from token import DEDENT, INDENT, OP 6 | 7 | from story7.grammar import Grammar, Rule, Alt, NamedItem, Lookahead, Maybe, Loop, Cut 8 | 9 | BaseParser = Parser 10 | 11 | class Parser(BaseParser): 12 | 13 | def __init__(self, tokenizer): 14 | super().__init__(tokenizer) 15 | self.extra_rules = [] 16 | 17 | def synthetic_rule(self, alts): 18 | if len(alts) == 1 and len(alts[0].items) == 1: 19 | return alts[0].items[0] 20 | name = f"_synthetic_rule_{len(self.extra_rules)}" 21 | rule = Rule(name, alts) 22 | self.extra_rules.append(rule) 23 | return rule.name 24 | """ 25 | 26 | start: grammar ENDMARKER { grammar } 27 | 28 | grammar: 29 | | metas rules { Grammar(rules + self.extra_rules, metas) } 30 | | rules { Grammar(rules + self.extra_rules, []) } 31 | 32 | metas: 33 | | meta metas { [meta] + metas } 34 | | meta { [meta] } 35 | 36 | meta: 37 | | "@" NAME NEWLINE { (name.string, None) } 38 | | "@" NAME NAME NEWLINE { (name.string, name1.string) } 39 | | "@" NAME STRING NEWLINE { (name.string, literal_eval(string.string)) } 40 | 41 | rules: 42 | | rule rules { [rule] + rules } 43 | | rule { [rule] } 44 | 45 | rule: 46 | | NAME ":" alts NEWLINE INDENT more_alts DEDENT { Rule(name.string, alts + more_alts) } 47 | | NAME ":" NEWLINE INDENT more_alts DEDENT { Rule(name.string, more_alts) } 48 | | NAME ":" alts NEWLINE { Rule(name.string, alts) } 49 | 50 | more_alts: 51 | | "|" alts NEWLINE more_alts { alts + more_alts } 52 | | "|" alts NEWLINE { alts } 53 | 54 | alts: 55 | | alt "|" alts { [alt] + alts } 56 | | alt { [alt] } 57 | 58 | alt: 59 | | items action { Alt(items, action) } 60 | | items { Alt(items, None) } 61 | 62 | items: 63 | | item items { [item] + items } 64 | | item { [item] } 65 | 66 | item: 67 | | NAME '=' molecule { NamedItem(name.string, molecule) } 68 | | "&" atom { Lookahead(atom) } 69 | | "!" atom { Lookahead(atom, False) } 70 | | "~" { Cut() } 71 | | molecule { molecule } 72 | 73 | molecule: 74 | | atom "?" { Maybe(atom) } 75 | | atom "*" { Loop(atom) } 76 | | atom "+" { Loop(atom, True) } 77 | | atom { atom } 78 | | "[" alts "]" { Maybe(self.synthetic_rule(alts)) } 79 | 80 | atom: 81 | | NAME { name.string } 82 | | STRING {string.string } 83 | | "(" alts ")" { self.synthetic_rule(alts) } 84 | 85 | action: "{" stuffs "}" { stuffs } 86 | 87 | stuffs: 88 | | stuff stuffs { stuff + " " + stuffs } 89 | | stuff { stuff } 90 | 91 | stuff: 92 | | "{" stuffs "}" { "{" + stuffs + "}" }
 93 | | NAME { name.string } 94 | | NUMBER { number.string } 95 | | STRING { string.string } 96 | | !"}" OP { op.string } 97 | -------------------------------------------------------------------------------- /stories/story7/in.txt: -------------------------------------------------------------------------------- 1 | aap = cat + dog 2 | -------------------------------------------------------------------------------- /stories/story7/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.8 2 | 3 | import argparse 4 | import os 5 | import sys 6 | from tokenize import generate_tokens 7 | 8 | from story7.tokenizer import Tokenizer 9 | from story7.generator3 import check, generate 10 | from story7.visualizer import Visualizer 11 | 12 | argparser = argparse.ArgumentParser() 13 | argparser.add_argument("grammar", nargs="?", help="Grammar file (toy.gram)") 14 | argparser.add_argument("-r", "--regen", action="store_true", help="Regenerate grammar") 15 | argparser.add_argument("-o", "--output", help="Output file (toy.py)") 16 | argparser.add_argument("-c", "--classname", help="Output class name (ToyParser)") 17 | argparser.add_argument("-v", "--visualize", action="store_true", help="Use visualizer") 18 | argparser.add_argument("-b", "--backup", action="store_true", help="Use old grammar parser") 19 | 20 | 21 | def main(): 22 | args = argparser.parse_args() 23 | file = args.grammar 24 | if not file: 25 | if args.regen: 26 | file = "story7/grammar.gram" 27 | else: 28 | file = "story7/toy.gram" 29 | outfile = args.output 30 | if not outfile: 31 | head, tail = os.path.split(file) 32 | base, ext = os.path.splitext(tail) 33 | if base == "grammar": 34 | base += "parser" 35 | outfile = os.path.join(head, base + ".py") 36 | classname = args.classname 37 | 38 | if args.backup: 39 | from story7.grammar import GrammarParser 40 | else: 41 | from story7.grammarparser import GrammarParser 42 | 43 | print("Reading", file, file=sys.stderr) 44 | with open(file) as f: 45 | tokengen = generate_tokens(f.readline) 46 | vis = None 47 | if args.visualize: 48 | vis = Visualizer() 49 | try: 50 | tok = Tokenizer(tokengen, vis) 51 | p = GrammarParser(tok) 52 | grammar = p.start() 53 | if vis: 54 | vis.done() 55 | finally: 56 | if vis: 57 | vis.close() 58 | 59 | if not grammar: 60 | if tok.tokens: 61 | last = tok.tokens[-1] 62 | print(f"Line {last.start[0]}:") 63 | print(last.line) 64 | print(" "*last.start[1] + "^") 65 | sys.exit("SyntaxError") 66 | 67 | print(repr(grammar)) 68 | print(str(grammar)) 69 | 70 | if not classname: 71 | classname = grammar.metas_dict.get("class") 72 | if not classname: 73 | tail = os.path.basename(file) 74 | base, ext = os.path.splitext(tail) 75 | classname = base.title() + "Parser" 76 | 77 | errors = check(grammar) 78 | if errors: 79 | sys.exit(f"Detected {errors} errors") 80 | 81 | print("Writing class", classname, "to", outfile, file=sys.stderr) 82 | with open(outfile, "w") as stream: 83 | generate(grammar, classname, stream) 84 | 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /stories/story7/memo2.py: -------------------------------------------------------------------------------- 1 | def memoize_left_rec(func): 2 | 3 | def memoize_left_rec_wrapper(self, *args): 4 | pos = self.mark() 5 | memo = self.memos.get(pos) 6 | if memo is None: 7 | memo = self.memos[pos] = {} 8 | key = (func, args) 9 | if key in memo: 10 | res, endpos = memo[key] 11 | 
self.reset(endpos) 12 | else: 13 | # This is where we deviate from @memoize. 14 | 15 | # Prime the cache with a failure. 16 | memo[key] = lastres, lastpos = None, pos 17 | 18 | # Loop until the parse no longer grows. 19 | while True: 20 | self.reset(pos) 21 | res = func(self, *args) 22 | endpos = self.mark() 23 | if endpos <= lastpos: 24 | break 25 | memo[key] = lastres, lastpos = res, endpos 26 | 27 | res = lastres 28 | self.reset(lastpos) 29 | 30 | return res 31 | 32 | return memoize_left_rec_wrapper 33 | -------------------------------------------------------------------------------- /stories/story7/node.py: -------------------------------------------------------------------------------- 1 | from token import tok_name 2 | from tokenize import TokenInfo 3 | 4 | 5 | def short_token(tok: TokenInfo) -> str: 6 | s = tok.string 7 | if s == '' or s.isspace(): 8 | return tok_name[tok.type] 9 | else: 10 | return repr(s) 11 | 12 | 13 | def alt_repr(x) -> str: 14 | if isinstance(x, TokenInfo): 15 | return short_token(x) 16 | else: 17 | return repr(x) 18 | 19 | 20 | class Node: 21 | 22 | def __init__(self, type, children): 23 | self.type = type 24 | self.children = children 25 | 26 | def __repr__(self): 27 | return f"Node({self.type}, [{', '.join(map(alt_repr, self.children))}])" 28 | 29 | def __eq__(self, other): 30 | if not isinstance(other, Node): 31 | return NotImplemented 32 | return self.type == other.type and self.children == other.children 33 | -------------------------------------------------------------------------------- /stories/story7/parser.py: -------------------------------------------------------------------------------- 1 | from story7.memo import memoize 2 | 3 | class Parser: 4 | 5 | def __init__(self, tokenizer): 6 | self.tokenizer = tokenizer 7 | self.memos = {} 8 | 9 | def mark(self): 10 | return self.tokenizer.mark() 11 | 12 | def reset(self, pos): 13 | self.tokenizer.reset(pos) 14 | 15 | def show_rule(self, name, alts): 16 | # alts is a list of lists of strings 17 | vis = self.tokenizer.vis 18 | if vis: 19 | vis.show_rule(name, alts) 20 | 21 | def show_index(self, alt_index, item_index, num_items=1): 22 | vis = self.tokenizer.vis 23 | if vis: 24 | vis.show_index(alt_index, item_index, num_items) 25 | return True 26 | 27 | @memoize 28 | def expect(self, arg): 29 | token = self.tokenizer.peek_token() 30 | if token.type == arg or token.string == arg: 31 | return self.tokenizer.get_token() 32 | return None 33 | 34 | def loop(self, nonempty, func, *args): 35 | mark = self.mark() 36 | nodes = [] 37 | while (node := func(*args)) is not None: 38 | nodes.append(node) 39 | if len(nodes) >= nonempty: 40 | return nodes 41 | self.reset(mark) 42 | return None 43 | 44 | def lookahead(self, positive, func, *args): 45 | mark = self.mark() 46 | ok = func(*args) is not None 47 | self.reset(mark) 48 | return ok == positive 49 | -------------------------------------------------------------------------------- /stories/story7/test_parser.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story7.tokenizer import Tokenizer 6 | from story7.parser import Parser 7 | from story7.toy import ToyParser 8 | 9 | def test_basic(): 10 | program = "f(42)" 11 | file = StringIO(program) 12 | tokengen = generate_tokens(file.readline) 13 | tok = Tokenizer(tokengen) 14 | p = Parser(tok) 15 | t = p.expect(NAME) 16 | assert t and t.string == "f" 17 |
pos = p.mark() 18 | assert p.expect("(") 19 | t = p.expect(NUMBER) 20 | assert t and t.string == "42" 21 | assert p.expect(")") 22 | pos2 = p.mark() 23 | p.reset(pos) 24 | assert p.expect("(") 25 | assert p.expect(NUMBER) 26 | assert p.expect(")") 27 | p.reset(pos) 28 | 29 | assert p.expect("(") 30 | p.reset(pos2) 31 | assert p.expect(NEWLINE) 32 | assert p.expect(ENDMARKER) 33 | 34 | def test_toy(): 35 | program = "x - (y + z)" 36 | file = StringIO(program) 37 | tokengen = generate_tokens(file.readline) 38 | tok = Tokenizer(tokengen) 39 | p = ToyParser(tok) 40 | tree = p.statement() 41 | print(tree) 42 | assert tree and tree.type == "statement" 43 | assert tree.children[0].type == "expr" 44 | assert tree.children[0].children[0].type == "expr" 45 | -------------------------------------------------------------------------------- /stories/story7/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from token import NAME, NUMBER, OP, NEWLINE, ENDMARKER 3 | from tokenize import generate_tokens 4 | 5 | from story7.tokenizer import Tokenizer 6 | 7 | def test_basic(): 8 | program = "f(42)" 9 | file = StringIO(program) 10 | tokengen = generate_tokens(file.readline) 11 | tok = Tokenizer(tokengen) 12 | def get(): 13 | return tok.get_token()[:2] 14 | assert get() == (NAME, "f") 15 | assert get() == (OP, "(") 16 | assert get() == (NUMBER, "42") 17 | assert get() == (OP, ")") 18 | assert get() == (NEWLINE, "") 19 | assert get() == (ENDMARKER, "") 20 | 21 | def test_mark_reset(): 22 | program = "f(42) + abc" 23 | file = StringIO(program) 24 | tokengen = generate_tokens(file.readline) 25 | tok = Tokenizer(tokengen) 26 | def get(): 27 | return tok.get_token()[:2] 28 | assert get() == (NAME, "f") 29 | pos = tok.mark() 30 | assert get() == (OP, "(") 31 | assert get() == (NUMBER, "42") 32 | assert get() == (OP, ")") 33 | pos2 = tok.mark() 34 | tok.reset(pos) 35 | assert get() == (OP, "(") 36 | assert get() == (NUMBER, "42") 37 | assert get() == (OP, ")") 38 | tok.reset(pos) 39 | assert get() == (OP, "(") 40 | tok.reset(pos2) # Forward 41 | assert get() == (OP, "+") 42 | assert get() == (NAME, "abc") 43 | tok.reset(pos) 44 | assert get() == (OP, "(") 45 | assert get() == (NUMBER, "42") 46 | assert get() == (OP, ")") 47 | assert get() == (OP, "+") 48 | assert get() == (NAME, "abc") 49 | -------------------------------------------------------------------------------- /stories/story7/tokenizer.py: -------------------------------------------------------------------------------- 1 | from tokenize import ERRORTOKEN, NL, COMMENT 2 | 3 | 4 | class Tokenizer: 5 | 6 | def __init__(self, tokengen, vis=None): 7 | """Call with tokenize.generate_tokens(...).""" 8 | self.tokengen = tokengen 9 | self.vis = vis 10 | self.tokens = [] 11 | self.pos = 0 12 | 13 | def mark(self): 14 | return self.pos 15 | 16 | def reset(self, pos): 17 | if pos == self.pos: 18 | return 19 | self.pos = pos 20 | self.report() 21 | 22 | def get_token(self): 23 | token = self.peek_token() 24 | self.pos += 1 25 | self.report() 26 | return token 27 | 28 | def peek_token(self): 29 | if self.pos == len(self.tokens): 30 | while True: 31 | token = next(self.tokengen) 32 | if token.type == ERRORTOKEN and token.string.isspace(): 33 | continue 34 | if token.type in (NL, COMMENT): 35 | continue 36 | break 37 | self.tokens.append(token) 38 | self.report() 39 | return self.tokens[self.pos] 40 | 41 | def report(self): 42 | if self.vis is not None: 43 | self.vis.vis_tokens(self.tokens, 
self.pos) 44 | -------------------------------------------------------------------------------- /stories/story7/toy.gram: -------------------------------------------------------------------------------- 1 | # Toy grammar. 2 | 3 | @class ToyParser 4 | 5 | @subheader """# This is the toy grammar used in the blog series. 6 | """ 7 | 8 | @trailer """ 9 | # The end.""" 10 | 11 | start: statements ENDMARKER 12 | statements: statement NEWLINE statements 13 | | statement NEWLINE 14 | statement: if_statement 15 | | assignment 16 | | expr 17 | expr: expr '+' term 18 | | expr '-' term 19 | | term 20 | term: term '*' atom 21 | | term '/' atom 22 | | atom 23 | atom: NAME 24 | | NUMBER 25 | | '(' expr ')' 26 | assignment: target '=' expr 27 | target: NAME 28 | if_statement: 'if' expr ':' statement 29 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from test.support import load_package_tests 3 | 4 | 5 | # Load all tests in package 6 | def load_tests(*args): 7 | return load_package_tests(os.path.dirname(__file__), *args) 8 | -------------------------------------------------------------------------------- /tests/demo.py: -------------------------------------------------------------------------------- 1 | def foo(): 2 | print(__file__ + ": parsed and executed.") 3 | 4 | 5 | foo() 6 | -------------------------------------------------------------------------------- /tests/python_parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/we-like-parsers/pegen/3b9f936a30d6c929d2538437cdc0465fa521b8f3/tests/python_parser/__init__.py -------------------------------------------------------------------------------- /tests/python_parser/conftest.py: -------------------------------------------------------------------------------- 1 | """Conftest for pure python parser.""" 2 | 3 | from pathlib import Path 4 | 5 | import pytest 6 | from pegen.build import build_parser 7 | from pegen.utils import generate_parser 8 | 9 | 10 | @pytest.fixture(scope="session") 11 | def python_parser_cls(): 12 | grammar_path = Path(__file__).parent.parent.parent / "data/python.gram" 13 | grammar = build_parser(grammar_path)[0] 14 | source_path = str(Path(__file__).parent / "parser_cache" / "py_parser.py") 15 | parser_cls = generate_parser(grammar, source_path, "PythonParser") 16 | 17 | return parser_cls 18 | 19 | 20 | @pytest.fixture(scope="session") 21 | def python_parse_file(): 22 | grammar_path = Path(__file__).parent.parent.parent / "data/python.gram" 23 | grammar = build_parser(grammar_path)[0] 24 | source_path = str(Path(__file__).parent / "parser_cache" / "py_parser.py") 25 | parser_cls = generate_parser(grammar, source_path, "parse_file") 26 | 27 | return parser_cls 28 | 29 | 30 | @pytest.fixture(scope="session") 31 | def python_parse_str(): 32 | grammar_path = Path(__file__).parent.parent.parent / "data/python.gram" 33 | grammar = build_parser(grammar_path)[0] 34 | source_path = str(Path(__file__).parent / "parser_cache" / "py_parser.py") 35 | parser_cls = generate_parser(grammar, source_path, "parse_string") 36 | 37 | return parser_cls 38 | -------------------------------------------------------------------------------- /tests/python_parser/data/advanced_decorators.py: -------------------------------------------------------------------------------- 1 | @d[a] 2 | def f(): 3 | pass 4 | 5 | 6 | @d 7 | @d()
8 | @d(a) 9 | @d[a] 10 | def f(): 11 | pass 12 | -------------------------------------------------------------------------------- /tests/python_parser/data/assignment.py: -------------------------------------------------------------------------------- 1 | a = b 2 | a += b 3 | a -= b 4 | a *= b 5 | a /= b 6 | a //= b 7 | a %= b 8 | a |= b 9 | a ^= b 10 | a **= b 11 | a &= b 12 | a @= b 13 | a <<= b 14 | a >>= b 15 | a += yield 16 | 17 | 18 | 19 | (a) += 1 20 | a[1] += 1 21 | a.b += 1 22 | a.b.c += 1 23 | f(i for i in range(2)).a += 1 24 | f().a += 1 25 | 26 | 27 | 28 | (a) = 1 29 | a.b = 1 30 | a.b.c = 1 31 | a.b.c.d = 1 32 | a[b] = c 33 | a[b][c] = 1 34 | a.b[c] = 1 35 | a[1:] = b 36 | a[:1] = b 37 | a[1:10:2] = b 38 | 39 | 40 | 41 | a: int = b 42 | a: int = yield 43 | a.b: int 44 | a.b: int = 1 45 | a[b]: int = 1 46 | a[b]: int = 1 47 | a = 1 48 | a = 1.0 49 | 50 | 51 | 52 | a = "" 53 | a = u"" 54 | a = r"\c" 55 | a = b"a" 56 | a = f"{a}" 57 | a = f"{d}" "rr" 58 | a = "rr" f"{d}" "rr" 59 | 60 | 61 | 62 | a = () 63 | a = (1,) 64 | a = (1, 2) 65 | 66 | 67 | 68 | b = [] 69 | b = [ 70 | 1, 71 | ] 72 | b = [1, 2] 73 | 74 | 75 | 76 | c = { 77 | 1, 78 | } 79 | c = {1, 2} 80 | d = {} 81 | d = {1: 2} 82 | d = { 83 | 1: 2, 84 | } 85 | d = {1: 2, 3: 4} 86 | 87 | 88 | 89 | a = True 90 | b = False 91 | c = None 92 | 93 | 94 | 95 | d = *a, (*b, c) 96 | d = *a, (*b, *c) 97 | 98 | 99 | 100 | f = (a := 1) 101 | 102 | 103 | 104 | a, b = c 105 | a, *b = c 106 | a, *b, d = c 107 | a, *b, d = yield d 108 | -------------------------------------------------------------------------------- /tests/python_parser/data/async.py: -------------------------------------------------------------------------------- 1 | async def f(): 2 | pass 3 | 4 | 5 | async def f(): 6 | await b 7 | 8 | 9 | async def f(): 10 | async for i in range(10): 11 | pass 12 | 13 | 14 | async def f(): 15 | async with open(f) as p: 16 | pass 17 | 18 | 19 | async def f(): 20 | a = [i async for i in range(10)] 21 | return a 22 | -------------------------------------------------------------------------------- /tests/python_parser/data/call.py: -------------------------------------------------------------------------------- 1 | a = () 2 | b = {} 3 | f() 4 | f(b) 5 | f(b=c) 6 | f(*a) 7 | f(c, *a) 8 | f(c=1, *b) 9 | f(*a, c=1) 10 | f(**b) 11 | f(c, *a, **b) 12 | f(c, *a, x, **b) 13 | f(c, a=1, **b) 14 | f(a := 1) 15 | f(**b, a=1) 16 | f(i for i in range(10)) 17 | -------------------------------------------------------------------------------- /tests/python_parser/data/classes.py: -------------------------------------------------------------------------------- 1 | class A: 2 | pass 3 | 4 | 5 | class A(B): 6 | pass 7 | 8 | 9 | class A( 10 | B, 11 | C, 12 | ): 13 | pass 14 | 15 | 16 | class A(metaclass=M): 17 | pass 18 | 19 | 20 | class A(B, metaclass=M): 21 | pass 22 | 23 | 24 | class A(*t): 25 | pass 26 | 27 | 28 | class A(B, *t): 29 | pass 30 | 31 | 32 | class A(**kw): 33 | pass 34 | 35 | 36 | class A(B, **kw): 37 | pass 38 | -------------------------------------------------------------------------------- /tests/python_parser/data/comprehensions.py: -------------------------------------------------------------------------------- 1 | a = (k for k in g) 2 | b = (k for k in g if k == 1) 3 | (k for k in g).send(None) 4 | 5 | 6 | a = [k for k in g] 7 | b = [k for k in g if k == 1] 8 | 9 | 10 | a = {k for k in g} 11 | b = {k for k in g if k == 1} 12 | a = {k: 1 for k in g} 13 | b = {k: 2 for k in g if k == 1} 14 | 15 | 16 | [k for v in a for k in v] 17 | 
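The data files in this directory (assignment.py and comprehensions.py above, expressions.py and the rest below) are the inputs that the session-scoped fixtures in tests/python_parser/conftest.py feed to the generated pure-Python parser. A minimal standalone sketch of that flow, assuming it is run from the repository root and that pegen.build.build_parser and pegen.utils.generate_parser behave exactly as they are used in conftest.py (the output path and the sample snippet here are made up), could look like this:

    from pathlib import Path

    from pegen.build import build_parser
    from pegen.utils import generate_parser

    # Build the Grammar object from the full Python grammar, as conftest.py does.
    grammar = build_parser(Path("data/python.gram"))[0]

    # Generate a parser module and pull its parse_string entry point out of it.
    # "py_parser.py" is a hypothetical scratch location, not the test cache.
    parse_string = generate_parser(grammar, "py_parser.py", "parse_string")

    # Parse a small snippet the same way test_ast_parsing.py parses each chunk.
    tree = parse_string("a = [k for k in g]\n", "exec")
    print(type(tree))

The third argument to generate_parser appears to name the object returned from the generated module, which is how conftest.py obtains both the parser class and the parse_file/parse_string helpers.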
-------------------------------------------------------------------------------- /tests/python_parser/data/expressions.py: -------------------------------------------------------------------------------- 1 | a + b 2 | a - b 3 | a * b 4 | a / b 5 | a // b 6 | a % b 7 | a @ b 8 | a << b 9 | a >> b 10 | a | b 11 | a ^ b 12 | a ** b 13 | a == b 14 | a < b 15 | a <= b 16 | a > b 17 | a >= b 18 | a != b 19 | a & b 20 | ~a 21 | (1, 2, 3) 22 | ["a", "b"] 23 | {1, 2} 24 | {a: a.b} 25 | {**d, a: b} 26 | 27 | not b 28 | a if b else c 29 | a or b 30 | a and b 31 | a in b 32 | a not in b 33 | a is b 34 | a is not b 35 | 36 | a * (+1) 37 | a * (-1) 38 | a * (~1) 39 | 40 | (a) 41 | (yield a) -------------------------------------------------------------------------------- /tests/python_parser/data/fstrings.py: -------------------------------------------------------------------------------- 1 | a = 10 2 | f'{a * x()}' 3 | 4 | 5 | 6 | f'no formatted values' 7 | f'eggs {a * x()} spam {b + y()}' 8 | 9 | 10 | 11 | a = 10 12 | f'{a * x()} {a * x()} {a * x()}' 13 | 14 | 15 | 16 | a = 10 17 | f''' 18 | {a 19 | * 20 | x()} 21 | non-important content 22 | ''' 23 | 24 | 25 | 26 | a = f''' 27 | {blech} 28 | ''' 29 | 30 | 31 | 32 | x = ( 33 | f" {test(t)}" 34 | ) 35 | 36 | 37 | 38 | x = ( 39 | u'wat', 40 | u"wat", 41 | b'wat', 42 | b"wat", 43 | f'wat', 44 | f"wat", 45 | ) 46 | y = ( 47 | u'''wat''', 48 | u"""wat""", 49 | b'''wat''', 50 | b"""wat""", 51 | f'''wat''', 52 | f"""wat""", 53 | ) 54 | 55 | 56 | 57 | x = ( 58 | 'PERL_MM_OPT', ( 59 | f'wat' 60 | f'some_string={f(x)} ' 61 | f'wat' 62 | ), 63 | ) 64 | 65 | 66 | 67 | f'{expr:}' 68 | f'{expr:d}' 69 | foo = 3.14159 70 | verbosePrint(f'Foo {foo:.3} bar.') 71 | -------------------------------------------------------------------------------- /tests/python_parser/data/function_def.py: -------------------------------------------------------------------------------- 1 | def f(): 2 | pass 3 | 4 | 5 | def f() -> None: 6 | pass 7 | 8 | 9 | def f(a): 10 | pass 11 | 12 | 13 | def f(a: int) -> Tuple[int, ...]: 14 | pass 15 | 16 | 17 | def f(a: int = 1) -> Tuple[int, ...]: 18 | pass 19 | 20 | 21 | def f(a, b: int): 22 | pass 23 | 24 | 25 | def f(a: bool, b: int = 1): 26 | pass 27 | 28 | 29 | def f(a, /): 30 | pass 31 | 32 | 33 | def f(a=1, /): 34 | pass 35 | 36 | 37 | def f(a, b=1, /): 38 | pass 39 | 40 | 41 | def f(a, /, b): 42 | pass 43 | 44 | 45 | def f(a, c=2, /, b=5): 46 | pass 47 | 48 | 49 | def f(a, /, b=1): 50 | pass 51 | 52 | 53 | def f(a, *, b): 54 | pass 55 | 56 | 57 | def f(a, *, b, c=1): 58 | pass 59 | 60 | 61 | def f(a, *, b=1): 62 | pass 63 | 64 | 65 | def f(*, b): 66 | pass 67 | 68 | 69 | def f(*, b, c=1): 70 | pass 71 | 72 | 73 | def f(*, b=1): 74 | pass 75 | 76 | 77 | def f(b=1, *c): 78 | pass 79 | 80 | 81 | def f(*args): 82 | pass 83 | 84 | 85 | def f(**kwargs): 86 | pass 87 | 88 | 89 | def f(a, **kwargs): 90 | pass 91 | 92 | 93 | def f(a=1, **kwargs): 94 | pass 95 | 96 | 97 | def f(*, a=1, **kwargs): 98 | pass 99 | 100 | 101 | def f(*a, **b): 102 | pass 103 | 104 | 105 | def f(a, /, b, *, v=1, **d): 106 | pass 107 | 108 | 109 | async def f(): 110 | pass 111 | 112 | 113 | async def f() -> None: 114 | pass 115 | 116 | 117 | async def f(a): 118 | pass 119 | 120 | 121 | async def f(a: int) -> Tuple[int, ...]: 122 | pass 123 | 124 | 125 | async def f(a: int = 1) -> Tuple[int, ...]: 126 | pass 127 | 128 | 129 | async def f(a, b: int): 130 | pass 131 | 132 | 133 | async def f(a: bool, b: int = 1): 134 | pass 135 | 136 | 137 | async def f(a, /): 138 | pass 139 | 140 
| 141 | async def f(a=1, /): 142 | pass 143 | 144 | 145 | async def f(a, b=1, /): 146 | pass 147 | 148 | 149 | async def f(a, /, b): 150 | pass 151 | 152 | 153 | async def f(a, c=2, /, b=5): 154 | pass 155 | 156 | 157 | async def f(a, /, b=1): 158 | pass 159 | 160 | 161 | async def f(a, *, b): 162 | pass 163 | 164 | 165 | async def f(a, *, b=1): 166 | pass 167 | 168 | 169 | async def f(*, b): 170 | pass 171 | 172 | 173 | async def f(*, b=1): 174 | pass 175 | 176 | 177 | async def f(b=1, *c): 178 | pass 179 | 180 | 181 | async def f(*args): 182 | pass 183 | 184 | 185 | async def f(**kwargs): 186 | pass 187 | 188 | 189 | async def f(a, **kwargs): 190 | pass 191 | 192 | 193 | async def f(a=1, **kwargs): 194 | pass 195 | 196 | 197 | async def f(*, a=1, **kwargs): 198 | pass 199 | 200 | 201 | async def f(*a, **b): 202 | pass 203 | 204 | 205 | async def f(a, /, b, *, v=1, **d): 206 | pass 207 | -------------------------------------------------------------------------------- /tests/python_parser/data/imports.py: -------------------------------------------------------------------------------- 1 | import test 2 | import a, b 3 | import test as t 4 | import test as t, y 5 | import test.a 6 | import test.b as b 7 | 8 | 9 | from test import a 10 | from test import a, b 11 | from test import ( 12 | a, 13 | b, 14 | ) 15 | from test import a as b 16 | from test import a as b, c 17 | from test import a as b, c as d 18 | from test import * 19 | from test.a import b 20 | from test.a import b as c 21 | from test.a import b, c 22 | from test.a import b as c, d 23 | 24 | 25 | from . import a 26 | from ... import b 27 | from .... import c 28 | from ..a import b 29 | from ...a import c 30 | from ....a import c 31 | from . import a, b 32 | from ..a import b, c 33 | from ...a import c, d 34 | from ....a import c, d 35 | -------------------------------------------------------------------------------- /tests/python_parser/data/lambdas.py: -------------------------------------------------------------------------------- 1 | lambda: 1 2 | 3 | lambda x: x 4 | 5 | lambda x,: x 6 | 7 | lambda x=1: x 8 | 9 | lambda x, y: x + y 10 | 11 | lambda x, /: x 12 | 13 | lambda x, y=1, /: x + y 14 | 15 | lambda x, /, y: x + y 16 | 17 | lambda x, /, y=1, z=2: x + y + z 18 | 19 | lambda x, y=1, /, z=5: x + y + z 20 | 21 | lambda x=1, /, *y: x + y 22 | 23 | lambda x, *, y: x + y 24 | 25 | lambda x, *, y, z: x + y + z 26 | 27 | lambda *, x: x 28 | 29 | lambda *x: x 30 | 31 | lambda **x: x 32 | 33 | lambda x, **y: y 34 | -------------------------------------------------------------------------------- /tests/python_parser/data/multi_statement_per_line.py: -------------------------------------------------------------------------------- 1 | if a: b=1; 2 | a = 1; b=2 3 | -------------------------------------------------------------------------------- /tests/python_parser/data/no_newline_at_end_of_file.py: -------------------------------------------------------------------------------- 1 | if a: 2 | b = 1 -------------------------------------------------------------------------------- /tests/python_parser/data/no_newline_at_end_of_file_with_comment.py: -------------------------------------------------------------------------------- 1 | if a: 2 | b = 1 3 | 4 | # test -------------------------------------------------------------------------------- /tests/python_parser/data/simple_decorators.py: -------------------------------------------------------------------------------- 1 | @d 2 | def f(): 3 | pass 4 | 5 | 6 | @d.a 7 | def f(): 8 | pass 9 | 
10 | 11 | @d() 12 | def f(): 13 | pass 14 | 15 | 16 | @d.f() 17 | def f(): 18 | pass 19 | 20 | 21 | @d(a) 22 | def f(): 23 | pass 24 | 25 | 26 | @d 27 | class A: 28 | pass 29 | -------------------------------------------------------------------------------- /tests/python_parser/data/statements.py: -------------------------------------------------------------------------------- 1 | pass 2 | pass; 3 | 4 | assert a 5 | assert a; assert b 6 | assert a, "eee" 7 | 8 | raise RuntimeError 9 | raise RuntimeError from e 10 | 11 | return 12 | return 1 13 | return 1, 14 | return *a 15 | 16 | del a 17 | del (a) 18 | del a, b, 19 | del a[:] 20 | del a.b 21 | del (a,) 22 | del (a, b) 23 | del [a, b] 24 | del a; 25 | 26 | global a 27 | global a, b 28 | nonlocal a 29 | nonlocal a, b 30 | 31 | yield a 32 | yield from a 33 | 34 | 35 | for i in a: 36 | pass 37 | 38 | for i, in a: 39 | pass 40 | 41 | for (i,) in a: 42 | pass 43 | 44 | for (i,), in a: 45 | pass 46 | 47 | for i, *j in a: 48 | pass 49 | 50 | for i, (a, *b) in a: 51 | pass 52 | 53 | async for i in a: 54 | pass 55 | 56 | async for i, in a: 57 | pass 58 | 59 | async for (i,) in a: 60 | pass 61 | 62 | async for (i,), in a: 63 | pass 64 | 65 | async for i, *j in a: 66 | pass 67 | 68 | async for i, (a, *b) in a: 69 | pass 70 | 71 | for i in b: 72 | pass 73 | else: 74 | pass 75 | 76 | 77 | if a: 78 | b=1 79 | 80 | if a: 81 | pass 82 | else: 83 | pass 84 | 85 | if a: 86 | pass 87 | elif b: 88 | pass 89 | else: 90 | pass 91 | 92 | if a: 93 | pass 94 | elif b: 95 | pass 96 | elif c: 97 | pass 98 | 99 | 100 | while s: 101 | pass 102 | 103 | while False: 104 | pass 105 | else: 106 | pass 107 | 108 | 109 | for i in a: 110 | continue 111 | 112 | for i in a: 113 | break 114 | 115 | 116 | with a: 117 | pass 118 | 119 | with a, b: 120 | pass 121 | 122 | with a as b: 123 | pass 124 | 125 | with a as b, c: 126 | pass 127 | 128 | async with a: 129 | pass 130 | 131 | async with a, b: 132 | pass 133 | 134 | async with a as b: 135 | pass 136 | 137 | async with a as b, c: 138 | pass 139 | 140 | 141 | try: 142 | pass 143 | finally: 144 | pass 145 | 146 | 147 | try: 148 | pass 149 | except: 150 | raise 151 | finally: 152 | pass 153 | 154 | try: 155 | pass 156 | except ValueError: 157 | pass 158 | except (IndexError, RuntimeError,): 159 | pass 160 | except Exception as e: 161 | pass 162 | else: 163 | pass 164 | finally: 165 | pass 166 | -------------------------------------------------------------------------------- /tests/python_parser/data/type_comment.py: -------------------------------------------------------------------------------- 1 | a = 1 # type: int 2 | 3 | for i in range(10): # type: int 4 | pass 5 | 6 | 7 | with a: # type: int 8 | pass 9 | 10 | 11 | def f(a): # type: (int) -> None 12 | pass 13 | 14 | 15 | def f(a): 16 | # type: (int) -> None 17 | pass 18 | -------------------------------------------------------------------------------- /tests/python_parser/data/type_params.py: -------------------------------------------------------------------------------- 1 | type TA1 = int 2 | type TA2 = TA1 | str 3 | 4 | 5 | 6 | type NonGeneric = int 7 | type Generic[A] = dict[A, A] 8 | type VeryGeneric[T, *Ts, **P] = Callable[P, tuple[T, *Ts]] 9 | 10 | 11 | 12 | 13 | def outer[A](): 14 | type TA1[B] = dict[A, B] 15 | return TA1 16 | 17 | 18 | 19 | class Parent[A]: 20 | type TA1[B] = dict[A, B] 21 | 22 | 23 | 24 | 25 | class Outer[A]: 26 | def inner[B](self): 27 | type TA1[C] = TA1[A, B] | int 28 | return TA1 29 | 30 | 31 | 32 | def more_generic[T, *Ts, **P](): 33 | type 
TA[T2, *Ts2, **P2] = tuple[Callable[P, tuple[T, *Ts]], Callable[P2, tuple[T2, *Ts2]]] 34 | return TA 35 | -------------------------------------------------------------------------------- /tests/python_parser/data/with_statement_multi_items.py: -------------------------------------------------------------------------------- 1 | with (a, c,): 2 | pass 3 | 4 | with (a as b, c): 5 | pass 6 | 7 | async with (a, c,): 8 | pass 9 | 10 | async with (a as b, c): 11 | pass 12 | -------------------------------------------------------------------------------- /tests/python_parser/parser_cache/README: -------------------------------------------------------------------------------- 1 | We store the Python parser generated for the tests here to be able to perform 2 | code coverage measurements. 3 | -------------------------------------------------------------------------------- /tests/python_parser/test_ast_parsing.py: -------------------------------------------------------------------------------- 1 | """Test pure Python parser against cpython parser.""" 2 | 3 | import ast 4 | import difflib 5 | import io 6 | import sys 7 | import textwrap 8 | import tokenize 9 | from pathlib import Path 10 | 11 | import pytest 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "filename", 16 | [ 17 | pytest.param( 18 | "advanced_decorators.py", 19 | marks=pytest.mark.skipif( 20 | sys.version_info < (3, 9), reason="Valid only in Python 3.9+" 21 | ), 22 | ), 23 | "assignment.py", 24 | "async.py", 25 | "call.py", 26 | "comprehensions.py", 27 | "expressions.py", 28 | "fstrings.py", 29 | "function_def.py", 30 | "imports.py", 31 | "lambdas.py", 32 | pytest.param( 33 | "multi_statement_per_line.py", 34 | marks=pytest.mark.skipif( 35 | sys.version_info < (3, 9), reason="Col offset match only on Python 3.9+" 36 | ), 37 | ), 38 | "no_newline_at_end_of_file.py", 39 | "no_newline_at_end_of_file_with_comment.py", 40 | pytest.param( 41 | "pattern_matching.py", 42 | marks=pytest.mark.skipif( 43 | sys.version_info < (3, 10), reason="Valid only in Python 3.10+" 44 | ), 45 | ), 46 | "simple_decorators.py", 47 | "statements.py", 48 | pytest.param( 49 | "try_except_group.py", 50 | marks=pytest.mark.skipif( 51 | sys.version_info <= (3, 11), reason="except* allowed only in Python 3.11+" 52 | ), 53 | ), 54 | pytest.param( 55 | "type_params.py", 56 | marks=pytest.mark.skipif( 57 | sys.version_info <= (3, 12), 58 | reason="type declarations allowed only in Python 3.12+", 59 | ), 60 | ), 61 | pytest.param( 62 | "with_statement_multi_items.py", 63 | marks=pytest.mark.skipif( 64 | sys.version_info < (3, 9), 65 | reason="Parenthesized with items allowed only in Python 3.9+", 66 | ), 67 | ), 68 | ], 69 | ) 70 | def test_parser(python_parse_file, python_parse_str, filename): 71 | path = Path(__file__).parent / "data" / filename 72 | with open(path) as f: 73 | source = f.read() 74 | 75 | for part in source.split("\n\n\n"): 76 | original = ast.parse(part) 77 | 78 | kwargs = dict(include_attributes=True) 79 | if sys.version_info >= (3, 9): 80 | kwargs["indent"] = " " 81 | 82 | try: 83 | pp_ast = python_parse_str(part, "exec") 84 | except Exception: 85 | temp = io.StringIO(part) 86 | print("Parsing failed:") 87 | print("Source is:") 88 | print(textwrap.indent(part, " ")) 89 | temp = io.StringIO(part) 90 | print("Token stream is:") 91 | for t in tokenize.generate_tokens(temp.readline): 92 | print(t) 93 | print() 94 | print("CPython ast is:") 95 | print(ast.dump(original, **kwargs)) 96 | raise 97 | 98 | o = ast.dump(original, **kwargs) 99 | p = ast.dump(pp_ast, 
**kwargs) 100 | diff = "\n".join( 101 | difflib.unified_diff(o.split("\n"), p.split("\n"), "cpython", "python-pegen") 102 | ) 103 | if diff: 104 | print(part) 105 | print(diff) 106 | assert not diff 107 | 108 | o = ast.dump(ast.parse(source), **kwargs) 109 | p = ast.dump(python_parse_file(path), **kwargs) 110 | diff = "\n".join(difflib.unified_diff(o.split("\n"), p.split("\n"), "cpython", "python-pegen")) 111 | assert not diff 112 | -------------------------------------------------------------------------------- /tests/test_grammar_validator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pegen.grammar import Grammar 4 | from pegen.grammar_parser import GeneratedParser as GrammarParser 5 | from pegen.utils import parse_string 6 | from pegen.validator import SubRuleValidator, ValidationError 7 | 8 | 9 | class TestPegen(unittest.TestCase): 10 | def test_rule_with_no_collision(self) -> None: 11 | grammar_source = """ 12 | start: bad_rule 13 | sum: 14 | | NAME '-' NAME 15 | | NAME '+' NAME 16 | """ 17 | grammar: Grammar = parse_string(grammar_source, GrammarParser) 18 | validator = SubRuleValidator(grammar) 19 | for rule_name, rule in grammar.rules.items(): 20 | validator.validate_rule(rule_name, rule) 21 | 22 | def test_rule_with_simple_collision(self) -> None: 23 | grammar_source = """ 24 | start: bad_rule 25 | sum: 26 | | NAME '+' NAME 27 | | NAME '+' NAME ';' 28 | """ 29 | grammar: Grammar = parse_string(grammar_source, GrammarParser) 30 | validator = SubRuleValidator(grammar) 31 | with self.assertRaises(ValidationError): 32 | for rule_name, rule in grammar.rules.items(): 33 | validator.validate_rule(rule_name, rule) 34 | 35 | def test_rule_with_collision_after_some_other_rules(self) -> None: 36 | grammar_source = """ 37 | start: bad_rule 38 | sum: 39 | | NAME '+' NAME 40 | | NAME '*' NAME ';' 41 | | NAME '-' NAME 42 | | NAME '+' NAME ';' 43 | """ 44 | grammar: Grammar = parse_string(grammar_source, GrammarParser) 45 | validator = SubRuleValidator(grammar) 46 | with self.assertRaises(ValidationError): 47 | for rule_name, rule in grammar.rules.items(): 48 | validator.validate_rule(rule_name, rule) 49 | -------------------------------------------------------------------------------- /tests/test_grammar_visitor.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from pegen.grammar import GrammarVisitor 4 | from pegen.grammar_parser import GeneratedParser as GrammarParser 5 | from pegen.utils import parse_string 6 | 7 | 8 | class Visitor(GrammarVisitor): 9 | def __init__(self) -> None: 10 | self.n_nodes = 0 11 | 12 | def visit(self, node: Any, *args: Any, **kwargs: Any) -> None: 13 | self.n_nodes += 1 14 | super().visit(node, *args, **kwargs) 15 | 16 | 17 | def test_parse_trivial_grammar() -> None: 18 | grammar = """ 19 | start: 'a' 20 | """ 21 | rules = parse_string(grammar, GrammarParser) 22 | visitor = Visitor() 23 | 24 | visitor.visit(rules) 25 | 26 | assert visitor.n_nodes == 6 27 | 28 | 29 | def test_parse_or_grammar() -> None: 30 | grammar = """ 31 | start: rule 32 | rule: 'a' | 'b' 33 | """ 34 | rules = parse_string(grammar, GrammarParser) 35 | visitor = Visitor() 36 | 37 | visitor.visit(rules) 38 | 39 | # Grammar/Rule/Rhs/Alt/NamedItem/NameLeaf -> 6 40 | # Rule/Rhs/ -> 2 41 | # Alt/NamedItem/StringLeaf -> 3 42 | # Alt/NamedItem/StringLeaf -> 3 43 | 44 | assert visitor.n_nodes == 14 45 | 46 | 47 | def test_parse_repeat1_grammar() -> None: 48 | 
grammar = """ 49 | start: 'a'+ 50 | """ 51 | rules = parse_string(grammar, GrammarParser) 52 | visitor = Visitor() 53 | 54 | visitor.visit(rules) 55 | 56 | # Grammar/Rule/Rhs/Alt/NamedItem/Repeat1/StringLeaf -> 6 57 | assert visitor.n_nodes == 7 58 | 59 | 60 | def test_parse_repeat0_grammar() -> None: 61 | grammar = """ 62 | start: 'a'* 63 | """ 64 | rules = parse_string(grammar, GrammarParser) 65 | visitor = Visitor() 66 | 67 | visitor.visit(rules) 68 | 69 | # Grammar/Rule/Rhs/Alt/NamedItem/Repeat0/StringLeaf -> 6 70 | 71 | assert visitor.n_nodes == 7 72 | 73 | 74 | def test_parse_optional_grammar() -> None: 75 | grammar = """ 76 | start: 'a' ['b'] 77 | """ 78 | rules = parse_string(grammar, GrammarParser) 79 | visitor = Visitor() 80 | 81 | visitor.visit(rules) 82 | 83 | # Grammar/Rule/Rhs/Alt/NamedItem/StringLeaf -> 6 84 | # NamedItem/Opt/Rhs/Alt/NamedItem/Stringleaf -> 6 85 | 86 | assert visitor.n_nodes == 12 87 | -------------------------------------------------------------------------------- /tests/test_grammar_visualizer.py: -------------------------------------------------------------------------------- 1 | import textwrap 2 | from typing import List 3 | 4 | from pegen.grammar_parser import GeneratedParser as GrammarParser 5 | from pegen.grammar_visualizer import ASTGrammarPrinter 6 | from pegen.utils import parse_string 7 | 8 | 9 | def test_simple_rule() -> None: 10 | grammar = """ 11 | start: 'a' 'b' 12 | """ 13 | rules = parse_string(grammar, GrammarParser) 14 | 15 | printer = ASTGrammarPrinter() 16 | lines: List[str] = [] 17 | printer.print_grammar_ast(rules, printer=lines.append) 18 | 19 | output = "\n".join(lines) 20 | expected_output = textwrap.dedent( 21 | """\ 22 | └──Rule 23 | └──Rhs 24 | └──Alt 25 | ├──NamedItem 26 | │ └──StringLeaf("'a'") 27 | └──NamedItem 28 | └──StringLeaf("'b'") 29 | """ 30 | ) 31 | 32 | assert output == expected_output 33 | 34 | 35 | def test_multiple_rules() -> None: 36 | grammar = """ 37 | start: a b 38 | a: 'a' 39 | b: 'b' 40 | """ 41 | rules = parse_string(grammar, GrammarParser) 42 | 43 | printer = ASTGrammarPrinter() 44 | lines: List[str] = [] 45 | printer.print_grammar_ast(rules, printer=lines.append) 46 | 47 | output = "\n".join(lines) 48 | expected_output = textwrap.dedent( 49 | """\ 50 | └──Rule 51 | └──Rhs 52 | └──Alt 53 | ├──NamedItem 54 | │ └──NameLeaf('a') 55 | └──NamedItem 56 | └──NameLeaf('b') 57 | 58 | └──Rule 59 | └──Rhs 60 | └──Alt 61 | └──NamedItem 62 | └──StringLeaf("'a'") 63 | 64 | └──Rule 65 | └──Rhs 66 | └──Alt 67 | └──NamedItem 68 | └──StringLeaf("'b'") 69 | """ 70 | ) 71 | 72 | assert output == expected_output 73 | 74 | 75 | def test_deep_nested_rule() -> None: 76 | grammar = """ 77 | start: 'a' ['b'['c'['d']]] 78 | """ 79 | rules = parse_string(grammar, GrammarParser) 80 | 81 | printer = ASTGrammarPrinter() 82 | lines: List[str] = [] 83 | printer.print_grammar_ast(rules, printer=lines.append) 84 | 85 | output = "\n".join(lines) 86 | print() 87 | print(output) 88 | expected_output = textwrap.dedent( 89 | """\ 90 | └──Rule 91 | └──Rhs 92 | └──Alt 93 | ├──NamedItem 94 | │ └──StringLeaf("'a'") 95 | └──NamedItem 96 | └──Opt 97 | └──Rhs 98 | └──Alt 99 | ├──NamedItem 100 | │ └──StringLeaf("'b'") 101 | └──NamedItem 102 | └──Opt 103 | └──Rhs 104 | └──Alt 105 | ├──NamedItem 106 | │ └──StringLeaf("'c'") 107 | └──NamedItem 108 | └──Opt 109 | └──Rhs 110 | └──Alt 111 | └──NamedItem 112 | └──StringLeaf("'d'") 113 | """ 114 | ) 115 | 116 | assert output == expected_output 117 | 
-------------------------------------------------------------------------------- /tests/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import io 2 | import sys 3 | from tokenize import NEWLINE, NUMBER, ENDMARKER, TokenInfo, generate_tokens 4 | 5 | from pegen.tokenizer import Tokenizer 6 | 7 | 8 | def test_peek_getnext(): 9 | source = io.StringIO("# test\n1") 10 | t = Tokenizer(generate_tokens(source.readline)) 11 | assert t.peek() == TokenInfo(NUMBER, "1", (2, 0), (2, 1), "1") 12 | assert t.getnext() == TokenInfo(NUMBER, "1", (2, 0), (2, 1), "1") 13 | assert t.peek() == TokenInfo( 14 | NEWLINE, "", (2, 1), (2, 2), "1" if sys.version_info >= (3, 12) else "" 15 | ) 16 | assert t.getnext() == TokenInfo( 17 | NEWLINE, "", (2, 1), (2, 2), "1" if sys.version_info >= (3, 12) else "" 18 | ) 19 | 20 | 21 | def test_mark_reset(): 22 | source = io.StringIO("\n1 2") 23 | t = Tokenizer(generate_tokens(source.readline)) 24 | index = t.mark() 25 | assert t.peek() == TokenInfo(NUMBER, "1", (2, 0), (2, 1), "1 2") 26 | assert t.getnext() == TokenInfo(NUMBER, "1", (2, 0), (2, 1), "1 2") 27 | t.reset(index) 28 | assert t.peek() == TokenInfo(NUMBER, "1", (2, 0), (2, 1), "1 2") 29 | assert t.getnext() == TokenInfo(NUMBER, "1", (2, 0), (2, 1), "1 2") 30 | 31 | 32 | def test_last_non_whitespace(): 33 | source = io.StringIO("\n1\n2") 34 | t = Tokenizer(generate_tokens(source.readline)) 35 | assert t.peek() == TokenInfo(NUMBER, "1", (2, 0), (2, 1), "1\n") 36 | assert t.getnext() == TokenInfo(NUMBER, "1", (2, 0), (2, 1), "1\n") 37 | assert t.getnext() == TokenInfo(NEWLINE, "\n", (2, 1), (2, 2), "1\n") 38 | assert t.get_last_non_whitespace_token() == TokenInfo(NUMBER, "1", (2, 0), (2, 1), "1\n") 39 | 40 | 41 | def test_get_lines(): 42 | source = io.StringIO("1\n2\n3") 43 | t = Tokenizer(generate_tokens(source.readline)) 44 | while True: 45 | if t.getnext().type == ENDMARKER: 46 | break 47 | assert t.get_lines([1, 2, 3]) == ["1\n", "2\n", "3"] 48 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py38, py39, py310, py311, py312, py38-cov, py39-cov, py310-cov, py311-cov, py312-cov, lint, regen, docs 3 | isolated_build = true 4 | 5 | [gh-actions] 6 | python = 7 | 3.8: py38-cov, lint, regen, docs 8 | 3.9: py39-cov 9 | 3.10: py310-cov 10 | 3.11: py311-cov 11 | 3.12: py312-cov 12 | 13 | [testenv] 14 | setenv = COVERAGE_FILE={toxworkdir}/.coverage.{envname} 15 | 16 | description = 17 | Run tests under {basepython} 18 | cov: with coverage 19 | 20 | commands = make PYTEST_ARGS='{posargs} --junitxml={toxworkdir}/{envname}_integration.xml' check 21 | commands_cov= make PYTEST_ARGS='{posargs} --junitxml={toxworkdir}/{envname}_integration.xml \ 22 | --cov-report=xml:{toxworkdir}/coverage.xml' pycoverage 23 | extras = test 24 | allowlist_externals = make 25 | 26 | [testenv:lint] 27 | description = lint code in {basepython} 28 | extras = lint 29 | commands = make lint 30 | 31 | [testenv:regen] 32 | description = regenerate metaparser {basepython} 33 | deps = black 34 | commands = make regen-metaparser 35 | make regen-metaparser 36 | 37 | [testenv:py38-cov] 38 | usedevelop = True 39 | commands = {[testenv]commands_cov} 40 | 41 | [coverage:run] 42 | source = src/pegen 43 | branch = True 44 | parallel = True 45 | omit = 46 | tests/* 47 | *__init__.py 48 | 49 | [coverage:report] 50 | skip_covered = True 51 | show_missing = True 52 | 53 
| [testenv:docs] 54 | description = build the documentation 55 | extras = docs 56 | commands = make docs 57 | --------------------------------------------------------------------------------
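The tox matrix above drives the test suite shown throughout this section. One building block those tests keep returning to is the mark()/reset() backtracking protocol of the tokenizer, exercised in tests/test_tokenizer.py and mirrored by stories/story7/tokenizer.py. A minimal sketch of that protocol, using only calls that appear in those tests and a made-up input string:

    import io
    from tokenize import generate_tokens

    from pegen.tokenizer import Tokenizer

    source = io.StringIO("1 + 2")
    t = Tokenizer(generate_tokens(source.readline))

    index = t.mark()             # remember the current position
    first = t.getnext()          # consume NUMBER "1"
    t.getnext()                  # consume OP "+"
    t.reset(index)               # backtrack to the remembered position
    assert t.peek() == first     # peek() does not advance
    assert t.getnext() == first  # getnext() re-consumes the same token
    print(first.string)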