├── .github └── workflows │ └── main.yml ├── .gitignore ├── .pylintrc ├── LICENSE.rst ├── MANIFEST.in ├── README.rst ├── RELNOTES.rst ├── pyproject.toml ├── schemas ├── format.json └── replacements.json ├── setup.cfg ├── src └── picireny │ ├── __init__.py │ ├── __main__.py │ ├── antlr4 │ ├── __init__.py │ ├── antlr_tree.py │ ├── grammar_analyzer.py │ ├── hdd_tree_builder.py │ ├── parser │ │ ├── LexerAdaptor.py │ │ └── __init__.py │ ├── parser_builder.py │ └── resources │ │ ├── ANTLRv4Lexer.g4 │ │ ├── ANTLRv4Parser.g4 │ │ ├── ExtendedTargetParser.java │ │ └── LexBasic.g4 │ ├── cli.py │ ├── filter.py │ ├── hdd.py │ ├── hdd_tree.py │ ├── hddr.py │ ├── hoist.py │ ├── info.py │ ├── prune.py │ ├── srcml │ ├── __init__.py │ └── hdd_tree_builder.py │ └── transform.py ├── tests ├── resources │ ├── INILexer.g4 │ ├── INIParser.g4 │ ├── JSON.g4 │ ├── exp-obj-arr-87.json │ ├── exp-obj-arr-bar.json │ ├── exp-obj-arr-baz.json │ ├── exp-obj-arr-foo.json │ ├── exp-str-arr-87.ini │ ├── inijson-crlf.json │ ├── inijson.json │ ├── inp-obj-arr.json │ ├── inp-str-arr.ini │ ├── sut-inijson-load.py │ ├── sut-json-load.py │ ├── test-inijson-str-arr-87.bat │ ├── test-inijson-str-arr-87.sh │ ├── test-json-obj-arr-87.bat │ ├── test-json-obj-arr-87.sh │ ├── test-json-obj-arr-bar.bat │ ├── test-json-obj-arr-bar.sh │ ├── test-json-obj-arr-baz.bat │ ├── test-json-obj-arr-baz.sh │ ├── test-json-obj-arr-foo.bat │ └── test-json-obj-arr-foo.sh └── test_cli.py └── tox.ini /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: main 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | test: 6 | strategy: 7 | matrix: 8 | os: [ubuntu-latest, macos-latest, windows-latest] 9 | python-version: [3.8, 3.9, '3.10', '3.11', '3.12', '3.13', 'pypy-3.10'] 10 | runs-on: ${{ matrix.os }} 11 | steps: 12 | - uses: actions/checkout@v4 13 | with: 14 | fetch-depth: 0 15 | - uses: actions/setup-python@v5 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | - uses: actions/setup-java@v4 19 | with: 20 | java-version: 17 21 | distribution: temurin 22 | if: matrix.os == 'windows-latest' 23 | - run: pip install --upgrade tox 24 | - run: tox -v -e py 25 | 26 | lint: 27 | runs-on: ubuntu-latest 28 | steps: 29 | - uses: actions/checkout@v4 30 | with: 31 | fetch-depth: 0 32 | - uses: actions/setup-python@v5 33 | with: 34 | python-version: '3.x' 35 | - run: pip install --upgrade tox 36 | - run: tox -v -e lint 37 | 38 | schema: 39 | runs-on: ubuntu-latest 40 | steps: 41 | - uses: actions/checkout@v4 42 | with: 43 | fetch-depth: 0 44 | - uses: actions/setup-python@v5 45 | with: 46 | python-version: '3.x' 47 | - run: pip install --upgrade tox 48 | - run: tox -v -e schema 49 | 50 | cov: 51 | runs-on: ubuntu-latest 52 | steps: 53 | - uses: actions/checkout@v4 54 | with: 55 | fetch-depth: 0 56 | - uses: actions/setup-python@v5 57 | with: 58 | python-version: '3.x' 59 | - run: pip install --upgrade tox coveralls 60 | - run: tox -v -e cov 61 | - run: coveralls --service=github 62 | env: 63 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 64 | 65 | publish: 66 | needs: [test, lint, schema] 67 | runs-on: ubuntu-latest 68 | steps: 69 | - uses: actions/checkout@v4 70 | with: 71 | fetch-depth: 0 72 | - uses: actions/setup-python@v5 73 | with: 74 | python-version: '3.x' 75 | - run: pip install --upgrade tox 76 | - run: tox -v -e build 77 | - uses: pypa/gh-action-pypi-publish@release/v1 78 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') && github.repository == 
'renatahodovan/picireny' 79 | with: 80 | password: ${{ secrets.pypi_token }} 81 | packages_dir: .tox/build/tmp/ 82 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.py[co] 3 | build 4 | dist 5 | .eggs 6 | *.egg-info 7 | .DS_Store 8 | .idea 9 | .cache 10 | .pytest_cache 11 | .tox 12 | .coverage* 13 | *.interp 14 | *.tokens 15 | src/picireny/antlr4/parser/ANTLRv4*.py 16 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Files or directories to be skipped. They should be base names, not paths. 4 | ignore=parser 5 | 6 | [MESSAGES CONTROL] 7 | 8 | # Disable the message, report, category or checker with the given id(s). You 9 | # can either give multiple identifiers separated by comma (,) or put this 10 | # option multiple times (only on the command line, not in the configuration 11 | # file where it should appear only once).You can also use "--disable=all" to 12 | # disable everything first and then reenable specific checks. For example, if 13 | # you want to run only the similarities checker, you can use "--disable=all 14 | # --enable=similarities". If you want to run only the classes checker, but have 15 | # no Warning level messages displayed, use"--disable=all --enable=classes 16 | # --disable=W" 17 | disable= 18 | abstract-method, 19 | attribute-defined-outside-init, 20 | import-outside-toplevel, 21 | invalid-name, 22 | line-too-long, 23 | missing-docstring, 24 | no-self-use, # disables warning in older pylint 25 | protected-access, 26 | redefined-builtin, 27 | too-few-public-methods, 28 | too-many-arguments, 29 | too-many-branches, 30 | too-many-locals, 31 | too-many-positional-arguments, 32 | too-many-return-statements, 33 | too-many-statements, 34 | unspecified-encoding, 35 | unused-argument, 36 | useless-option-value, # disables warning in recent pylint that does not check for no-self-use anymore 37 | 38 | [REPORTS] 39 | 40 | # Set the output format. Available formats are text, parseable, colorized, json 41 | # and msvs (visual studio).You can also give a reporter class, eg 42 | # mypackage.mymodule.MyReporterClass. 43 | output-format=parseable 44 | 45 | # Activate the evaluation score. 46 | score=no 47 | -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016-2024 Renata Hodovan, Akos Kiss. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | ----- 30 | 31 | This software is based on work licensed under identical terms with relevant 32 | files carrying the following copyright notice: 33 | 34 | Copyright (c) 2007 Ghassan Misherghi. 35 | 36 | ----- 37 | 38 | This software includes components from the "Grammars written for ANTLR v4" 39 | project under src/picireny/antlr4/resources and src/picireny/antlr4/parser, 40 | which files carry a compatible "BSD license" and their own copyright notices. 41 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | exclude src/picireny/antlr4/parser/ANTLRv4*.py 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Picireny 3 | ======== 4 | *Hierarchical Delta Debugging Framework* 5 | 6 | .. image:: https://img.shields.io/pypi/v/picireny?logo=python&logoColor=white 7 | :target: https://pypi.org/project/picireny/ 8 | .. image:: https://img.shields.io/pypi/l/picireny?logo=open-source-initiative&logoColor=white 9 | :target: https://pypi.org/project/picireny/ 10 | .. image:: https://img.shields.io/github/actions/workflow/status/renatahodovan/picireny/main.yml?branch=master&logo=github&logoColor=white 11 | :target: https://github.com/renatahodovan/picireny/actions 12 | .. image:: https://img.shields.io/coveralls/github/renatahodovan/picireny/master?logo=coveralls&logoColor=white 13 | :target: https://coveralls.io/github/renatahodovan/picireny 14 | 15 | *Picireny* is a Python implementation of the Hierarchical Delta Debugging 16 | (HDD in short) algorithm adapted to use ANTLR_ v4 for parsing both the input 17 | and the grammar(s) describing the format of the input. It relies on Picire_ 18 | to provide the implementation of the core Delta Debugging algorithm along 19 | with various tweaks like parallelization. Just like the *Picire* framework, 20 | *Picireny* can also be used either as a command line tool or as a library. 21 | 22 | Both Hierarchical Delta Debugging and Delta Debugging automatically reduce 23 | "interesting" tests while keeping their "interesting" behaviour. (E.g., 24 | "interestingness" may mean failure-inducing input to a system-under-test.) 25 | However, HDD is an improvement that tries to investigate less test cases during 26 | the reduction process by making use of knowledge on the structure of the input. 27 | 28 | The tool (and the algorithm) works iteratively in several ways. As a first 29 | step, it splits up the input into tokens and organizes them in a tree structure 30 | as defined by a grammar. 
Then, iteratively, it invokes Delta Debugging on each 31 | level of the tree from top to bottom, and DD is an iterative process itself, 32 | too. Finally, the nodes kept in the tree are "unparsed" to yield a reduced but 33 | still "interesting" output. 34 | 35 | .. _ANTLR: http://www.antlr.org 36 | .. _Picire: https://github.com/renatahodovan/picire 37 | 38 | 39 | Requirements 40 | ============ 41 | 42 | * Python_ >= 3.8 43 | * Java_ SE >= 11 JRE or JDK (the latter is optional, only needed if Java is used 44 | as the parser language) 45 | 46 | .. _Python: https://www.python.org 47 | .. _Java: https://www.oracle.com/java/ 48 | 49 | 50 | Install 51 | ======= 52 | 53 | To use *Picireny* in another project, it can be added to ``setup.cfg`` as an 54 | install requirement (if using setuptools_ with declarative config): 55 | 56 | .. code-block:: ini 57 | 58 | [options] 59 | install_requires = 60 | picireny 61 | 62 | To install *Picireny* manually, e.g., into a virtual environment, use pip_:: 63 | 64 | pip install picireny 65 | 66 | The above approaches install the latest release of *Picireny* from PyPI_. 67 | Alternatively, for the development version, clone the project and perform a 68 | local install:: 69 | 70 | pip install . 71 | 72 | .. _setuptools: https://github.com/pypa/setuptools 73 | .. _pip: https://pip.pypa.io 74 | .. _PyPI: https://pypi.org/ 75 | 76 | 77 | Usage 78 | ===== 79 | 80 | *Picireny* uses the same CLI as *Picire* and hence accepts the same 81 | options_. 82 | On top of the inherited ones, *Picireny* accepts several further arguments: 83 | 84 | * ``--grammar`` (optional): List of grammars describing the input format. (You 85 | can write them by hand or simply download them from the 86 | `ANTLR v4 grammars repository`_.) 87 | * ``--start`` (optional): Name of the start rule (optionally prefixed with a 88 | grammar name) as ``[grammarname:]rulename``. 89 | * ``--replacements`` (optional): Json file containing rule names and minimal 90 | replacement strings (otherwise these are calculated automatically) (see 91 | schema__). 92 | * ``--format`` (optional): Json file describing the input format (see schema__ 93 | and example_). This descriptor can incorporate all the above (``--grammar``, 94 | ``--start`` and ``--replacements``) properties, along with the possibility of 95 | island grammar definitions. If both ``--format`` and the aforementioned 96 | arguments are present, then the latter will override the appropriate values of 97 | the format file. 98 | * ``--antlr`` (optional): Path to the ANTLR tool jar. 99 | * ``--parser`` (optional): Language of the generated parser. Currently 'python' 100 | (default) and 'java' targets (faster, but needs JDK) are supported. 101 | 102 | Note: although, all the arguments are optional, the grammar files and the start 103 | rule of the top-level parser must be defined with an arbitrary combination of the 104 | ``--format``, ``--grammars``, and ``--start`` arguments. 105 | 106 | .. _options: https://github.com/renatahodovan/picire/tree/master/README.rst#usage 107 | .. _`ANTLR v4 grammars repository`: https://github.com/antlr/grammars-v4 108 | .. __: schemas/replacements.json 109 | .. __: schemas/format.json 110 | .. 
_example: tests/resources/inijson.json
111 |
112 | Example usage to reduce an HTML file::
113 |
114 | picireny --input=<path/to/the/input.html> --test=<path/to/the/tester> \
115 | --grammar HTMLLexer.g4 HTMLParser.g4 --start htmlDocument \
116 | --parallel --subset-iterator=skip --complement-iterator=backward
117 |
118 |
119 | Compatibility
120 | =============
121 |
122 | *Picireny* was tested on:
123 |
124 | * Linux (Ubuntu 14.04 / 16.04 / 18.04 / 20.04)
125 | * OS X / macOS (10.11 / 10.12 / 10.13 / 10.14 / 10.15 / 11)
126 | * Windows (Server 2012 R2 / Server version 1809 / Windows 10)
127 |
128 |
129 | Acknowledgement and Citations
130 | =============================
131 |
132 | *Picireny* is motivated by the idea of Hierarchical Delta Debugging:
133 |
134 | * Ghassan Misherghi and Zhendong Su. HDD: Hierarchical Delta Debugging.
135 | In Proceedings of the 28th International Conference on Software Engineering
136 | (ICSE '06), pages 142-151, Shanghai, China, May 2006. ACM.
137 | https://doi.org/10.1145/1134285.1134307
138 |
139 | The details of the modernized re-implementation and further improvements are
140 | published in:
141 |
142 | * Renata Hodovan and Akos Kiss. Modernizing Hierarchical Delta Debugging.
143 | In Proceedings of the 7th International Workshop on Automating Test Case
144 | Design, Selection, and Evaluation (A-TEST 2016), pages 31-37, Seattle,
145 | Washington, USA, November 2016. ACM.
146 | https://doi.org/10.1145/2994291.2994296
147 | * Renata Hodovan, Akos Kiss, and Tibor Gyimothy. Tree Preprocessing and Test
148 | Outcome Caching for Efficient Hierarchical Delta Debugging.
149 | In Proceedings of the 12th IEEE/ACM International Workshop on Automation of
150 | Software Testing (AST 2017), pages 23-29, Buenos Aires, Argentina, May 2017.
151 | IEEE.
152 | https://doi.org/10.1109/AST.2017.4
153 | * Renata Hodovan, Akos Kiss, and Tibor Gyimothy. Coarse Hierarchical Delta
154 | Debugging.
155 | In Proceedings of the 33rd IEEE International Conference on Software
156 | Maintenance and Evolution (ICSME 2017), pages 194-203, Shanghai, China,
157 | September 2017. IEEE.
158 | https://doi.org/10.1109/ICSME.2017.26
159 | * Akos Kiss, Renata Hodovan, and Tibor Gyimothy. HDDr: A Recursive Variant of
160 | the Hierarchical Delta Debugging Algorithm.
161 | In Proceedings of the 9th ACM SIGSOFT International Workshop on Automating
162 | Test Case Design, Selection, and Evaluation (A-TEST 2018), pages 16-22, Lake
163 | Buena Vista, Florida, USA, November 2018. ACM.
164 | https://doi.org/10.1145/3278186.3278189
165 | * Daniel Vince, Renata Hodovan, Daniella Barsony, and Akos Kiss. Extending
166 | Hierarchical Delta Debugging with Hoisting.
167 | In Proceedings of the 2nd ACM/IEEE International Conference on Automation of
168 | Software Test (AST 2021), pages 60-69, Madrid, Spain (Virtual), May 2021.
169 | IEEE.
170 | https://doi.org/10.1109/AST52587.2021.00015
171 | * Daniel Vince, Renata Hodovan, Daniella Barsony, and Akos Kiss. The effect of
172 | hoisting on variants of Hierarchical Delta Debugging.
173 | Journal of Software: Evolution and Process, 34(11):e2483,1-26, November 2022.
174 | Wiley.
175 | https://doi.org/10.1002/smr.2483
176 |
177 |
178 | Copyright and Licensing
179 | =======================
180 |
181 | Licensed under the BSD 3-Clause License_.
182 |
183 | ..
_License: LICENSE.rst 184 | -------------------------------------------------------------------------------- /RELNOTES.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | *Picireny* Release Notes 3 | ======================== 4 | 5 | 21.8 6 | ==== 7 | 8 | Summary of changes: 9 | 10 | * Dropped support for Python 2. 11 | * Upgraded dependency to *Picire* 21.8 to use new/improved argument logging, CLI 12 | argument processing; and adapted code to the updated API. 13 | * Heavily simplified the signatures of picireny.build_with_antlr4 and 14 | picireny.reduce. 15 | * Changed the API of several functions and methods, made numerous arguments 16 | keyword-only. 17 | * Added a new phase that applies the coarse filter to tree nodes and runs both 18 | pruning and hoisting on them. 19 | * Fixed HDDr to correctly traverse the tree in case of filtered nodes. 20 | * Fixed line-column calculations for tree nodes. 21 | * Fixed "skip unremovable" transformation to correctly determine the unparsed 22 | representation of nodes for all parametrizations. 23 | * Upgraded dependency *ANTLeRinator* to Epoch 1 (breaking away from ANTLR 24 | version numbering) and made use of its new feature to generate the lexer and 25 | parser from the ANTLRv4 grammar at build-time. 26 | * Added direct dependency on ANTLR and upgraded it to v4.9.2. 27 | * Made use of the *inators* package to unify CLI argument handling and logging. 28 | * Dropped runtime dependency on setuptools. 29 | * Moved to pyproject.toml & setup.cfg-based packaging. 30 | * Improved log output. 31 | * Improved documentation. 32 | * Improved the testing infrastructure (stabilized tests, improved resource 33 | handling, better output on failure, testing Windows & PyPy). 34 | * Various internal refactorings. 35 | 36 | 37 | 21.3 38 | ==== 39 | 40 | Summary of changes: 41 | 42 | * Introduced phases of reduction to allow executing the same HDD algorithm 43 | variant multiple times with different parametrizations (e.g., run Coarse HDDr 44 | and HDDr after each other). 45 | * Added a new transformation-based reduction technique called hoisting, as a new 46 | optional phase, to complement the existing pruning-based approaches. 47 | * Added support for "tokens" section (i.e., token names without an associated 48 | lexer rule) in grammars. 49 | * Added support for grammars with resource files that contain utility code or 50 | base classes of lexers and parsers. 51 | * Upgraded dependency to *Picire* 20.12 to utilize its new generalized split 52 | factor concept and updated API. 53 | * Upgraded dependency to ANTLR v4.9 (via *ANTLeRinator*). 54 | * Bumped minimum Python 3 requirement to 3.5. 55 | * Improved log output. 56 | * Adapted versioning to use setuptools_scm (included distance from latest 57 | release into non-released version strings). 58 | * Added classification metadata to project. 59 | * Improved documentation. 60 | * Improved the testing infrastructure (linting, faster test suite, testing 61 | Python 3.8 and 3.9, testing macOS, migrated testing from Travis CI to GitHub 62 | Actions). 63 | * Various internal refactorings and performance improvements. 64 | * Minor bug fixes. 65 | 66 | 67 | 19.3 68 | ==== 69 | 70 | Summary of changes: 71 | 72 | * Made code Python 2 compatible (with the help of upgraded dependencies 73 | *Picire* 19.3 and *ANTLeRinator* 4.7.1-1). 
74 | * Improved the testing infrastructure (testing Python 2.7 and 3.7 on Travis CI; 75 | maintenance changes to various CI configurations). 76 | 77 | 78 | 18.10 79 | ===== 80 | 81 | Summary of changes: 82 | 83 | * Added implementation for the recursive variant of the HDD algorithm (a.k.a. 84 | HDDr). 85 | * Upgraded dependency to *Picire* 18.10 to utilize its new config ID and prefix 86 | concepts. 87 | * Minor improvements. 88 | 89 | 90 | 18.2 91 | ==== 92 | 93 | Summary of changes: 94 | 95 | * Added support for multiple tree builders, and added srcML as an experimental 96 | builder in addition to the existing ANTLRv4-based solution. 97 | * Generalized HDD implementation to be parametric to express classic HDD and 98 | Coarse HDD as well. 99 | * Upgraded dependency to *Picire* 18.1 to utilize custom initial granularity. 100 | * Upgraded dependency to ANTLR v4.7.1 (via *ANTLeRinator*). 101 | * Added support for building tokens from hidden ANTLR channels (whitespace, 102 | comments, etc.) into the tree but also hiding them from the reducer (for 103 | inputs where whitespace or other hidden tokens may matter during tree 104 | unparsing). 105 | * Added new module for gathering statistics on trees and improved the logging of 106 | the results of tree transformation algorithms. 107 | * Improved various algorithms (minimal replacement calculation from ANTLRv4 108 | grammars, tree flattening for non-syntax-conforming inputs, unremovable node 109 | detection for rules in addition to tokens). 110 | * Improved Python-Java interworking (for Java-based ANTLRv4 parsers). 111 | * Improved API usability (for use-cases when *Picireny* is not called via its 112 | CLI). 113 | * Improved the testing infrastructure (by using the Coveralls online service). 114 | * Minor bug fixes and internal refactorings. 115 | 116 | 117 | 17.10 118 | ===== 119 | 120 | Summary of changes: 121 | 122 | * Improved the way how input format can be defined by enabling the use of a more 123 | consistent and well-defined config file. 124 | * Upgraded dependency to *Picire* 17.10 to utilize its Windows support. 125 | * Minor bug fixes. 126 | 127 | 128 | 17.7 129 | ==== 130 | 131 | Summary of changes: 132 | 133 | * Added implementation for the coarse variant of the HDD algorithm. 134 | * Implemented heuristical optimization to flatten left and right-recursive tree 135 | structures. 136 | * Improvements to the internal tree representation. 137 | * Simplified usage and ANTLR dependency installation via *ANTLeRinator*, and 138 | upgraded dependency to *Picire* 17.6. 139 | * Improved the testing infrastructure (support for Python 3.6 and code coverage 140 | measurement). 141 | 142 | 143 | 17.1 144 | ==== 145 | 146 | Summary of changes: 147 | 148 | * Updated dependency to *Picire* 17.1 and adopted its support for content-based 149 | result caching. 150 | * Added "squeeze tree" and "hide/skip unremovable tokens" HDD tree 151 | optimizations. 152 | * Improved handling of erroneous input. 153 | * Extended the HDD algorithm with testing of single-node tree levels to ensure 154 | 1-tree-minimality of output. 155 | * Minor bug fixes and improvements. 156 | 157 | 158 | 16.12 159 | ===== 160 | 161 | Summary of changes: 162 | 163 | * Added support for Java-based input parsing to improve performance. 164 | * Implemented HDD* (fixed-point iteration of hddmin). 165 | * Minor bug fixes and improvements. 166 | * Upgraded dependency to ANTLR v4.6. 167 | * Added *Picireny* to PyPI. 
168 | 169 | 170 | 16.7 171 | ==== 172 | 173 | First public release of the *Picireny* Hierarchical Delta Debugging Framework. 174 | 175 | Summary of main features: 176 | 177 | * ANTLRv4-based input parsing and *Picire*-based ddmin. 178 | * Automatic "smallest allowable syntactic fragment" computation for both parser 179 | and lexer rules. 180 | * Support for island grammars. 181 | * Python 3 API and out-of-the-box useful CLI. 182 | * py.test-based testing and tox support. 183 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "antlerinator>=1!3.0.0", 4 | "setuptools", 5 | "setuptools_scm[toml]", 6 | "wheel", 7 | ] 8 | build-backend = "setuptools.build_meta" 9 | 10 | [tool.setuptools_scm] 11 | version_scheme = "post-release" 12 | local_scheme = "node-and-date" 13 | -------------------------------------------------------------------------------- /schemas/format.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://json-schema.org/draft/2020-12/schema", 3 | 4 | "description": "Picireny input format definition.", 5 | "type": "object", 6 | "properties": { 7 | "start": { 8 | "description": "Name of start rule optionally prefixed with a grammar name (specified as [grammarname:]rulename).", 9 | "type": "string" 10 | }, 11 | "grammars": { 12 | "description": "Grammar descriptions mapped to (freely chosen) grammar names.", 13 | "type": "object", 14 | "patternProperties": { 15 | ".*": { 16 | "description": "Grammar description.", 17 | "type": "object", 18 | "properties": { 19 | "files": { 20 | "description": "List of ANTLR grammar files.", 21 | "type": "array", 22 | "items": { 23 | "description": "Grammar file (resolved relative to the location of the input format definition).", 24 | "type": "string" 25 | }, 26 | "minItems": 1 27 | }, 28 | "islands": { 29 | "description": "Regex patterns mapped to names of tokens of the described grammar.", 30 | "type": "object", 31 | "patternProperties": { 32 | ".*": { 33 | "description": "Regex pattern matched on token instances (named capture groups define those parts of the token, which should be parsed with a rule of an island grammar, specified as [grammarname:]rulename).", 34 | "type": "string" 35 | } 36 | } 37 | }, 38 | "replacements": { 39 | "$ref": "replacements.json" 40 | } 41 | }, 42 | "required": [ "files" ] 43 | } 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /schemas/replacements.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://json-schema.org/draft/2020-12/schema", 3 | 4 | "description": "Replacement strings mapped to grammar token names.", 5 | "type": "object", 6 | "patternProperties": { 7 | ".*": { 8 | "description": "Replacement string for token instances.", 9 | "type": "string" 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = picireny 3 | description = Picireny Hierarchical Delta Debugging Framework 4 | long_description = file: README.rst 5 | long_description_content_type = text/x-rst 6 | author = Renata Hodovan, Akos Kiss 7 | author_email = hodovan@inf.u-szeged.hu, akiss@inf.u-szeged.hu 8 | url = 
https://github.com/renatahodovan/picireny 9 | license = BSD 10 | license_files = LICENSE.rst 11 | classifiers = 12 | Intended Audience :: Developers 13 | License :: OSI Approved :: BSD License 14 | Operating System :: OS Independent 15 | Programming Language :: Python 16 | Programming Language :: Python :: 3 17 | Programming Language :: Python :: 3.8 18 | Programming Language :: Python :: 3.9 19 | Programming Language :: Python :: 3.10 20 | Programming Language :: Python :: 3.11 21 | Programming Language :: Python :: 3.12 22 | Programming Language :: Python :: 3.13 23 | Topic :: Software Development :: Testing 24 | platform = any 25 | 26 | [options] 27 | package_dir = 28 | = src 29 | packages = find_namespace: 30 | include_package_data = True 31 | python_requires = >=3.8 32 | install_requires = 33 | antlerinator>=1!3.0.0 34 | antlr4-python3-runtime==4.13.2 35 | inators 36 | picire==21.8 37 | xson 38 | 39 | [options.packages.find] 40 | where = src 41 | 42 | [options.entry_points] 43 | console_scripts = 44 | picireny = picireny.cli:execute 45 | 46 | [build_antlr] 47 | commands = 48 | antlerinator:4.13.2 src/picireny/antlr4/resources/ANTLRv4Lexer.g4 src/picireny/antlr4/resources/ANTLRv4Parser.g4 -Dlanguage=Python3 -o src/picireny/antlr4/parser -Xexact-output-dir -no-listener 49 | output = 50 | src/picireny/antlr4/parser/ANTLRv4*.py 51 | -------------------------------------------------------------------------------- /src/picireny/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-2021 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 7 | 8 | from . import cli 9 | from . import info 10 | from . import transform 11 | from .cli import __version__, build_with_antlr4, build_with_srcml, reduce 12 | from .hdd import hddmin 13 | from .hddr import hddrmin 14 | from .hdd_tree import HDDRule, HDDToken, HDDTree 15 | -------------------------------------------------------------------------------- /src/picireny/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 7 | 8 | from picireny.cli import execute 9 | 10 | 11 | if __name__ == '__main__': 12 | execute() 13 | -------------------------------------------------------------------------------- /src/picireny/antlr4/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-2020 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 7 | 8 | from .hdd_tree_builder import create_hdd_tree 9 | -------------------------------------------------------------------------------- /src/picireny/antlr4/antlr_tree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-2023 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 
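# Editorial note (added comment): this module models ANTLR grammar constructs as a lightweight tree
# that is used to compute the minimal replacement strings of parser and lexer rules; those
# replacements are later used when reducing the HDD tree.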
7 | 8 | import logging 9 | import re 10 | 11 | from sys import maxunicode 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | # Parser Elements 17 | 18 | class ANTLRElement: 19 | def __init__(self, *, optional=False, repl=None, sep=''): 20 | """ 21 | Constructor of the base tree node type. 22 | 23 | :param optional: Boolean indicating whether the current node is optional 24 | or not. 25 | :param repl: Known replacement if any. 26 | """ 27 | self.children = [] 28 | self.replacement = repl if not optional else '' 29 | self.sep = sep 30 | 31 | def all_replacements_defined(self): 32 | """ 33 | Replacements are defined if the node has at least one child and all of 34 | the children have a replacement set. 35 | """ 36 | return self.children and all(x.replacement is not None for x in self.children) 37 | 38 | def has_defined_replacement(self): 39 | """ 40 | Checks if any of the children has a defined replacement. Needed by 41 | alternations since the replacement of a recursive rule wouldn't be 42 | possible to determine if waiting for all the children set. 43 | """ 44 | return self.children and any(x.replacement is not None for x in self.children) 45 | 46 | def calc_replacement(self): 47 | """ 48 | The minimal replacement of a parser rule is the concatenation of its 49 | children's minimal replacement. 50 | 51 | :return: Boolean denoting if a new replacement was found or not. 52 | """ 53 | if self.all_replacements_defined(): 54 | new_repl = self.sep.join(x.replacement for x in self.children if x.replacement) 55 | if self.replacement is None or len(new_repl) < len(self.replacement) or (len(new_repl) == len(self.replacement) and new_repl < self.replacement): 56 | self.replacement = new_repl 57 | return True 58 | return False 59 | 60 | 61 | class ANTLRRule(ANTLRElement): 62 | """ 63 | Representation of a parser rule. The replacement string determined here will 64 | be used in the reduce phase. This replacement can be set by the user or 65 | generated automatically. If it's set by the user then it won't be changed 66 | ever (even if it isn't minimal). 67 | """ 68 | def __init__(self, name, *, repl=None): 69 | super().__init__(repl=repl) 70 | self.name = name 71 | self.const_replacement = repl is not None 72 | 73 | def calc_replacement(self): 74 | if self.const_replacement: 75 | return False 76 | return super().calc_replacement() 77 | 78 | 79 | class ANTLRRef(ANTLRElement): 80 | def __init__(self, ref, *, optional=False): 81 | super().__init__(optional=optional) 82 | self.ref = ref 83 | 84 | 85 | class ANTLRAlternative(ANTLRElement): 86 | def __init__(self, *, repl=None): 87 | super().__init__(repl=repl, sep=' ') 88 | 89 | 90 | class ANTLRAlternation(ANTLRElement): 91 | def calc_replacement(self): 92 | """ 93 | The minimal replacement of an alternation is it's shortest child. 94 | 95 | :return: Boolean denoting if a new replacement was found or not. 
96 | """ 97 | if self.has_defined_replacement(): 98 | new_repl = min((c.replacement for c in self.children if c.replacement is not None), key=len) 99 | if self.replacement is None or len(new_repl) < len(self.replacement) or (len(new_repl) == len(self.replacement) and new_repl < self.replacement): 100 | self.replacement = new_repl 101 | return True 102 | return False 103 | 104 | 105 | # Lexer Elements 106 | 107 | class ANTLRLexerElement(ANTLRElement): 108 | def __init__(self, *, optional=False, repl=None): 109 | super().__init__(optional=optional, repl=repl) 110 | self.start_intervals = None 111 | 112 | def starters_defined(self): 113 | return self.children and all(x.start_intervals is not None for x in self.children) 114 | 115 | def calc_starters(self): 116 | if self.start_intervals is None and self.starters_defined(): 117 | self.start_intervals = sum((x.start_intervals for x in self.children), []) 118 | return True 119 | return False 120 | 121 | @staticmethod 122 | def resolve_escapes(src): 123 | """ 124 | Remove escaping from escape sequences in src. E.g., lexer rules may 125 | contain such expressions like: [\t] where \t is evaluated as '\' + 't' 126 | instead of a tabulator. This function executes the reversed 127 | transformation. 128 | 129 | :param src: The string that may have escaped escape sequences. 130 | """ 131 | return src.encode('utf-8').decode('unicode_escape') 132 | 133 | 134 | class ANTLRLexerRule(ANTLRLexerElement): 135 | """ 136 | Representation of a lexer rule. The replacement string determined here will 137 | be used in the reduce phase. This replacement can be set by the user or 138 | generated automatically. If it's set by the user then it won't be changed 139 | ever (even if it's not minimal). 140 | """ 141 | def __init__(self, name, *, repl=None): 142 | super().__init__(repl=repl) 143 | self.name = name 144 | self.const_replacement = repl is not None 145 | 146 | def calc_replacement(self): 147 | if self.const_replacement: 148 | return False 149 | return super().calc_replacement() 150 | 151 | 152 | class ANTLRLexerElements(ANTLRLexerElement): 153 | def calc_starters(self): 154 | if self.children and self.children[0].start_intervals and self.start_intervals is None: 155 | self.start_intervals = self.children[0].start_intervals 156 | return True 157 | return False 158 | 159 | 160 | class ANTLRLexerAlternation(ANTLRLexerElement): 161 | def calc_replacement(self): 162 | # The replacement is the known shortest replacement of the children. 163 | if self.has_defined_replacement(): 164 | new_repl = min((c.replacement for c in self.children if c.replacement is not None), key=len) 165 | if self.replacement is None or len(new_repl) < len(self.replacement) or (len(new_repl) == len(self.replacement) and new_repl < self.replacement): 166 | self.replacement = new_repl 167 | return True 168 | return False 169 | 170 | 171 | class ANTLRTokenRef(ANTLRLexerElement): 172 | def __init__(self, ref): 173 | super().__init__() 174 | self.ref = ref 175 | 176 | 177 | class ANTLRCharacterRange(ANTLRLexerElement): 178 | def __init__(self, start, end): 179 | super().__init__() 180 | # Converting unicode code points to integers. 
181 | start = int(start.split('\\u')[1], 16) if start.startswith('\\u') else ord(start) 182 | end = int(end.split('\\u')[1], 16) if end.startswith('\\u') else ord(end) 183 | self.start_intervals = [(start, end)] 184 | self.replacement = chr(start) 185 | 186 | 187 | class ANTLRDotElement(ANTLRLexerElement): 188 | def __init__(self, *, optional=False): 189 | super().__init__(optional=optional) 190 | # Hard-wiring ASCII character range here does not have any limitation (neither effect). 191 | # Basically it should not be used anyway, since the replacement is 192 | # constantly set to 'a' and negating 'any character' would not make sense. 193 | self.start_intervals = [(0, 255)] 194 | if self.replacement is None: 195 | self.replacement = 'a' 196 | 197 | 198 | class ANTLRString(ANTLRLexerElement): 199 | def __init__(self, src): 200 | super().__init__() 201 | src = self.resolve_escapes(src) 202 | self.start_intervals = [(ord(src[0]), ord(src[0]))] 203 | self.replacement = src 204 | 205 | 206 | class ANTLRSetElement(ANTLRLexerElement): 207 | def __init__(self, content=None, *, optional=False): 208 | super().__init__(optional=optional) 209 | if content and self.replacement is None: 210 | if content.startswith(('"', '\'')): 211 | self.start_intervals = [(ord(content[1]), ord(content[1]))] if len(content) > 2 else [] 212 | self.replacement = chr(self.start_intervals[0][0]) 213 | elif content.startswith('['): 214 | self.start_intervals = self.process_charset(content[1:-1]) 215 | self.replacement = chr(self.start_intervals[0][0]) 216 | 217 | @classmethod 218 | def process_charset(cls, src): 219 | """ 220 | Extract represented character intervals from character sets. 221 | 222 | :param src: The string representation of the character set (w/o 223 | brackets). 224 | """ 225 | intervals = [(ord(m.group(1)), ord(m.group(2))) for m in re.finditer(r'(\w)\-(\w)', src)] 226 | positions = [(m.start(1), m.end(2)) for m in re.finditer(r'(\w)\-(\w)', src)] 227 | 228 | # Character sets can contain multiple sets and single characters (e.g., [-ab-defg-ijkl]). 229 | # Select the single characters based on the position of sets. 230 | if not positions: 231 | intervals.extend((ord(x), ord(x)) for x in cls.resolve_escapes(src)) 232 | else: 233 | characters = [] 234 | for i, pos in enumerate(positions): 235 | # Characters before the first range. 236 | if i == 0 and pos[0] > 0: 237 | characters.extend(cls.resolve_escapes(src[0: pos[0]])) 238 | # Characters between ranges. 239 | if i < len(positions) - 1: 240 | if positions[i][1] + 1 < positions[i + 1][0]: 241 | characters.extend(cls.resolve_escapes(src[positions[i][1] + 1: positions[i + 1][0]])) 242 | # Characters after ranges. 243 | else: 244 | if pos[1] < len(src) - 1: 245 | characters.extend(cls.resolve_escapes(src[pos[1] + 1:])) 246 | intervals.extend((ord(x), ord(x)) for x in characters) 247 | 248 | return intervals 249 | 250 | def calc_starters(self): 251 | if self.start_intervals is None and self.children and self.children[0].start_intervals: 252 | self.start_intervals = self.children[0].start_intervals 253 | return True 254 | return False 255 | 256 | 257 | class ANTLRNotSet(ANTLRLexerElement): 258 | def calc_starters(self): 259 | # Known limitation (TODO?): it does not handle multiple negation. 260 | if self.starters_defined() and self.start_intervals is None: 261 | intervals = [y for x in self.children for y in x.start_intervals] 262 | # Sort list of tuples by the first element. 
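# (Added example:) [(97, 122), (48, 57)] becomes [(48, 57), (97, 122)], so intervals[0][0]
# is the lowest starting code point.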
263 | intervals.sort(key=lambda x: x[0]) 264 | # The number (char) before the first interval's lower limit or after 265 | # the last interval's upper limit is suitable for negation. 266 | if intervals[0][0] > 0: 267 | neighbour_char = intervals[0][0] - 1 268 | elif intervals[-1][-1] < maxunicode: 269 | neighbour_char = intervals[-1][-1] + 1 270 | else: 271 | assert False, 'Cannot negate the whole unicode range.' 272 | self.start_intervals = [(neighbour_char, neighbour_char)] 273 | return True 274 | return False 275 | 276 | def calc_replacement(self): 277 | if self.start_intervals and self.replacement is None: 278 | self.replacement = chr(self.start_intervals[0][0]) 279 | return True 280 | return False 281 | -------------------------------------------------------------------------------- /src/picireny/antlr4/grammar_analyzer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-2023 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 7 | 8 | from antlr4 import CommonTokenStream, FileStream 9 | from antlr4.tree import Tree 10 | 11 | from .antlr_tree import ( 12 | ANTLRAlternation, ANTLRAlternative, ANTLRCharacterRange, ANTLRDotElement, ANTLRElement, 13 | ANTLRLexerAlternation, ANTLRLexerElement, ANTLRLexerElements, ANTLRLexerRule, 14 | ANTLRNotSet, ANTLRRef, ANTLRRule, ANTLRSetElement, ANTLRString, ANTLRTokenRef 15 | ) 16 | from .parser import ANTLRv4Lexer, ANTLRv4Parser 17 | 18 | 19 | def analyze_grammars(grammars, replacements): 20 | """ 21 | Determine the minimal parser rule replacements of the input grammar. 22 | 23 | :param antlr_lexer: Reference to the ANTLR4 lexer class. 24 | :param antlr_parser: Reference to the ANTLR4 parser class. 25 | :param grammars: List of the grammars describing the input format. 26 | :param replacements: Dictionary that contains the predefined minimal 27 | replacement of any lexer or parser rules. These won't be overridden 28 | later. 29 | :return: Pair of the replacement dictionary and the positions of quantified 30 | elements in the grammars. 31 | """ 32 | 33 | def set_replacements(tree): 34 | """ 35 | Set the minimal replacements of the various subtrees. 36 | 37 | :param tree: AST-like tree representation built by create_grammar_tree. 38 | """ 39 | iterate = True 40 | # Iterate until any updates were performed. 41 | while iterate: 42 | iterate = False 43 | for e in tree: 44 | # If all of the children have a min set: 45 | s = isinstance(e, ANTLRLexerElement) and e.calc_starters() 46 | r = e.calc_replacement() 47 | if s or r: 48 | iterate = True 49 | 50 | # Only those ParseTrees are present in our tree representation that 51 | # have real effect on the minimal replacements of the rules. 52 | # e.g. actions, channels, return values, syntax elements (like: |;:), etc 53 | # are avoided, but e.g. rule definitions, alternations, references, 54 | # token definitions or such nodes that can have quantifier are kept. 55 | def create_node(ctx, optional): 56 | """ 57 | Create tree node of the lexer and parser subtrees. 58 | 59 | :param ctx: The ANTLRRuleContext object under processing. 60 | :param optional: Boolean indicating whether the current context/node is 61 | optional or not. 62 | :return: Node representation of the current context if needed, otherwise 63 | None. 64 | """ 65 | 66 | # Parser rules. 
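# (Added note:) each branch below maps one ANTLR parse-tree context type to the corresponding
# replacement-tree node, or falls through to return None for contexts that do not affect replacements.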
67 | 68 | if isinstance(ctx, parser.ParserRuleSpecContext): 69 | # The parserRuleSpec rule contains 3 or 4 terminal tokens and one of them is the ID of the rule. 70 | # Since we cannot make a distinction between terminals at this point, they have to be referred 71 | # by indices. Since only the first terminal is optional indexing them from the back is safe 72 | # (the 3th from back is the rule ID). 73 | name = [x for x in ctx.children if isinstance(x, Tree.TerminalNodeImpl)][-3].symbol.text 74 | return ANTLRRule(name, repl=replacements.get(name, None)) 75 | 76 | # Alternations need special handling since their minimal replacements are their shortest 77 | # child (in every other cases the children would be concatenated). 78 | if isinstance(ctx, (parser.AltListContext, parser.RuleAltListContext)): 79 | return ANTLRAlternation() 80 | 81 | # Node is created from Alternative to group its element+ children (it's a sequence). 82 | if isinstance(ctx, parser.AlternativeContext): 83 | return ANTLRAlternative(repl=('' if not ctx.children else None)) 84 | 85 | # LabeledElement and Block are created since they can have quantifier. 86 | if isinstance(ctx, (parser.LabeledElementContext, parser.BlockContext)): 87 | return ANTLRElement(optional=optional) 88 | 89 | # Atom can also have quantifier. Furthermore it may have a terminal child 90 | # (DOT = matching any character) that has to be handled here. 91 | if isinstance(ctx, parser.AtomContext): 92 | if isinstance(ctx.children[0], Tree.TerminalNodeImpl): 93 | assert ctx.children[0] == '.' 94 | return ANTLRDotElement(optional=optional) 95 | # Create a base ANTLRElement anyway to make possible applying the quantifier 96 | # to the subtree. 97 | return ANTLRElement(optional=optional) 98 | 99 | # Only the reference is set here but in the next step the whole referenced 100 | # subtree will be plugged as its child. 101 | if isinstance(ctx, parser.RulerefContext): 102 | assert ctx.getChildCount() == 1, 'RuleRef must have exactly one child.' 103 | return ANTLRRef(ctx.children[0].symbol.text, optional=optional) 104 | 105 | # Lexer rules. 106 | 107 | # The main difference between parser and lexer rules in this representation is that 108 | # lexer rules have an additional field (start_intervals) that aims to track all the 109 | # possible character ranges that a given token can start with. The purpose of this 110 | # is being able to generate minimal replacement for a negated lexer rule: having 111 | # all the possible character intervals that a lexer rule can start with we can easily 112 | # invert these ranges. 113 | if isinstance(ctx, parser.LexerRuleSpecContext): 114 | # Just like at ANTLRRule, the 3rd terminal from the back contains the name of the lexer rule. 115 | name = [x for x in ctx.children if isinstance(x, Tree.TerminalNodeImpl)][-3].symbol.text 116 | return ANTLRLexerRule(name, repl=replacements.get(name, None)) 117 | 118 | # The same logic as with parser alternations. 119 | if isinstance(ctx, parser.LexerAltListContext): 120 | return ANTLRLexerAlternation() 121 | 122 | # The special about LexerAlt is that it can have an empty child which makes 123 | # possible such alternations in lexer like: ('a'| ). Capturing an empty LexerAlt 124 | # construction is only possible here, in which case its minimal replacement is 125 | # the empty string. 126 | if isinstance(ctx, parser.LexerAltContext): 127 | # If the alternative has no children means that it's left explicitly empty. 
128 | return ANTLRLexerElement(repl=('' if not ctx.children else None)) 129 | 130 | # The special about LexerElements is that by determining its start character range 131 | # is enough to get the first character of its first child (since it's a token sequence). 132 | if isinstance(ctx, parser.LexerElementsContext): 133 | return ANTLRLexerElements() 134 | 135 | # LexerBlock is created since it can have quantifier. 136 | if isinstance(ctx, parser.LexerBlockContext): 137 | return ANTLRLexerElement(optional=optional) 138 | 139 | # LexerAtom can also have quantifier. Furthermore it may have terminal children 140 | # (DOT or character set) that has to be handled here. 141 | if isinstance(ctx, parser.LexerAtomContext): 142 | if isinstance(ctx.children[0], Tree.TerminalNodeImpl): 143 | content = ctx.children[0].symbol.text 144 | if content == '.': 145 | return ANTLRDotElement(optional=optional) 146 | if content.startswith('['): 147 | return ANTLRSetElement(content, optional=optional) 148 | assert False 149 | # Create a base ANTLRLexerElement anyway to make possible applying the 150 | # quantifier to the subtree. 151 | return ANTLRLexerElement(optional=optional) 152 | 153 | if isinstance(ctx, parser.CharacterRangeContext): 154 | # The 1st and 3rd token of a character range defines its boundaries. 155 | return ANTLRCharacterRange(ctx.children[0].symbol.text[1:-1], ctx.children[2].symbol.text[1:-1]) 156 | 157 | if isinstance(ctx, parser.TerminalContext): 158 | # Terminal node is either a string literal or a token reference. 159 | content = ctx.children[0].symbol.text 160 | if content.startswith(('"', '\'')): 161 | return ANTLRString(content[1:-1]) 162 | return ANTLRTokenRef(content) 163 | 164 | if isinstance(ctx, parser.NotSetContext): 165 | return ANTLRNotSet() 166 | 167 | # SetElement is the lexer rule that will be negated. 168 | if isinstance(ctx, parser.SetElementContext): 169 | # If the first child is a terminal node then it must be one of the followings: 170 | # token_ref, string_literal or char set. 171 | if isinstance(ctx.children[0], Tree.TerminalNodeImpl): 172 | if ctx.children[0].symbol.text.isupper(): 173 | return ANTLRTokenRef(ctx.children[0].symbol.text) 174 | return ANTLRSetElement(ctx.children[0].symbol.text) 175 | # In this case we have a character range. 176 | return ANTLRSetElement() 177 | 178 | # Tokens without lexer rules. 179 | 180 | # Identifiers in a TokensSpec are definitions of token names without an 181 | # associated lexer rule. We don't know anything about them, but they are 182 | # added with a dummy representation to the tree to avoid dead links (as 183 | # they may be referenced from other (parser) rules). 184 | if isinstance(ctx, parser.IdentifierContext) and isinstance(ctx.parentCtx, parser.IdListContext) and isinstance(ctx.parentCtx.parentCtx, parser.TokensSpecContext): 185 | return ANTLRLexerRule(str(ctx.TOKEN_REF()), repl='') 186 | 187 | return None 188 | 189 | def get_quantifier(children, idx): 190 | """ 191 | Check whether a quantifier is defined on the idx-th children. 192 | 193 | :param children: All the siblings of the current node. 194 | :param idx: The index of the current node among the siblings. 195 | :return: Quantifier string of the idx-th context if one is defined, None 196 | otherwise. 
197 | """ 198 | if len(children) <= idx + 1: 199 | return None 200 | suffix = None 201 | if isinstance(children[idx + 1], parser.EbnfSuffixContext): 202 | suffix = children[idx + 1].start.text 203 | elif isinstance(children[idx + 1], parser.BlockSuffixContext): 204 | suffix = children[idx + 1].children[0].start.text 205 | return suffix 206 | 207 | def is_optional(quantifier): 208 | """ 209 | Check whether a quantifier string makes its quantified expression 210 | optional, i.e., if it allows the expression to occur 0 times. 211 | 212 | :param quantifier: Quantifier string. 213 | :return: Boolean indicating whether the quantifier is optional or not. 214 | """ 215 | return quantifier.startswith(('*', '?')) 216 | 217 | def create_grammar_tree(node, positions, parent_idx, optional, parser_rule): 218 | """ 219 | Creates a tree representation of the target parser grammar to facilitate 220 | the generation of minimal replacement strings. 221 | 222 | :param node: The ANTLR parser tree whose representation will be inserted 223 | now. 224 | :param positions: Dictionary describing positions in grammars where 225 | optional actions should be injected. 226 | :param parent_idx: The index of the parent node in the elements list or 227 | None if without parent. 228 | :param optional: Boolean deciding if the current node is optional or 229 | not. 230 | :param parser_rule: Boolean value indicating if a parser rule being 231 | processed. 232 | """ 233 | element = create_node(node, optional) 234 | if element: 235 | elements.append(element) 236 | idx = len(elements) - 1 237 | if parent_idx is not None: 238 | elements[parent_idx].children.append(element) 239 | else: 240 | idx = parent_idx 241 | 242 | if node.getChildCount() > 0: 243 | # TerminalNodeImpl nodes already have been added by create_node 244 | # when processing their parent since at this point we don't know their type. 245 | for i, c in enumerate(x for x in node.children if not isinstance(x, Tree.TerminalNodeImpl)): 246 | quantifier = get_quantifier(node.children, i) 247 | 248 | # Mark positions in parser rules that have any quantifier applied on them. 249 | if quantifier and parser_rule: 250 | start_token = parser.getInputStream().get(c.getSourceInterval()[0]) 251 | end_token = parser.getInputStream().get(c.getSourceInterval()[1]) 252 | 253 | start_ln = start_token.line 254 | start = start_token.column 255 | 256 | line_breaks = end_token.text.count('\n') 257 | end_ln = end_token.line + line_breaks 258 | end = end_token.column + len(end_token.text) if not line_breaks else \ 259 | len(end_token.text) - end_token.text.rfind('\n') + 1 260 | 261 | if start_ln not in positions: 262 | positions[start_ln] = [] 263 | if end_ln not in positions: 264 | positions[end_ln] = [] 265 | 266 | positions[start_ln].append(('s', start)) 267 | positions[end_ln].append(('e', end)) 268 | 269 | create_grammar_tree(c, positions, idx, quantifier and is_optional(quantifier), 270 | parser_rule and not isinstance(element, ANTLRLexerRule)) 271 | 272 | # EOF is a special token provided by the ANTLR framework. It's added preliminarily to 273 | # our tree to avoid dead links to it. 274 | elements = [ANTLRLexerRule('EOF', repl='')] 275 | action_positions = {} 276 | replacements = replacements if replacements else {} 277 | # Fill elements with node representations. 
278 | for grammar in grammars: 279 | action_positions[grammar] = {} 280 | parser = ANTLRv4Parser(CommonTokenStream(ANTLRv4Lexer(FileStream(grammar, 'utf-8')))) 281 | create_grammar_tree(parser.grammarSpec(), action_positions[grammar], None, False, True) 282 | 283 | # Create mapping between references and indices of antlr_tree to be able to plug the 284 | # appropriate subtrees into reference nodes. 285 | rules = dict((x.name, i) for i, x in enumerate(elements) if isinstance(x, (ANTLRRule, ANTLRLexerRule))) 286 | 287 | # Plug the referred node under the referrers. 288 | for i, x in enumerate(elements): 289 | if isinstance(x, (ANTLRRef, ANTLRTokenRef)): 290 | assert not elements[i].children, 'Referrer nodes must not contain children.' 291 | elements[i].children = [elements[rules[x.ref]]] 292 | 293 | # Associate tree nodes with minimal string replacements. 294 | set_replacements(elements) 295 | return dict((x.name, x.replacement) for x in elements if isinstance(x, (ANTLRRule, ANTLRLexerRule))), action_positions 296 | -------------------------------------------------------------------------------- /src/picireny/antlr4/hdd_tree_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-2023 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 7 | 8 | import logging 9 | import re 10 | import shutil 11 | import sys 12 | 13 | from glob import glob 14 | from os import makedirs, pathsep 15 | from os.path import basename, join 16 | from pkgutil import get_data 17 | from string import Template 18 | from subprocess import CalledProcessError, PIPE, run, STDOUT 19 | 20 | import xson 21 | 22 | from antlr4 import CommonTokenStream, error, InputStream, Token 23 | from antlr4.Token import CommonToken 24 | 25 | from .grammar_analyzer import analyze_grammars 26 | from .parser_builder import build_grammars 27 | from ..hdd_tree import HDDRule, HDDToken, Position 28 | from ..transform import remove_empty_nodes 29 | 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | 34 | class HDDQuantifier(HDDRule): 35 | """ 36 | Special rule type in the HDD tree to support optional quantifiers. 37 | """ 38 | def __init__(self, *, start=None, end=None): 39 | super().__init__('', start=start, end=end) 40 | 41 | 42 | class HDDHiddenToken(HDDToken): 43 | """ 44 | Special token type that represents tokens from hidden channels. 45 | """ 46 | 47 | 48 | class HDDErrorToken(HDDToken): 49 | """ 50 | Special token type that represents unmatched tokens. The minimal replacement 51 | of such nodes is an empty string. 52 | """ 53 | def __init__(self, text, *, start=None, end=None): 54 | super().__init__('', text, start=start, end=end) 55 | 56 | 57 | # Override ConsoleErrorListener to suppress parse issues in non-verbose mode. 58 | class ConsoleListener(error.ErrorListener.ConsoleErrorListener): 59 | def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e): 60 | logger.debug('line %d:%d %s', line, column, msg) 61 | 62 | 63 | error.ErrorListener.ConsoleErrorListener.INSTANCE = ConsoleListener() 64 | 65 | 66 | def create_hdd_tree(src, *, 67 | input_format, start, 68 | antlr, lang='python', 69 | hidden_tokens=False, 70 | work_dir): 71 | """ 72 | Build a tree that the HDD algorithm can work with. 73 | 74 | :param src: Input source. 75 | :param input_format: Dictionary describing the input format. 
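(It maps grammar names to dictionaries with 'files', 'replacements' and optional 'islands' entries; cf. schemas/format.json.)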
76 | :param start: Name of the start rule in [grammarname:]rulename format. 77 | :param antlr: Path to the ANTLR4 tool (Java jar binary). 78 | :param lang: The target language of the parser. 79 | :param hidden_tokens: Build hidden tokens of the input format into the HDD 80 | tree. 81 | :param work_dir: Working directory. 82 | :return: The root of the created HDD tree. 83 | """ 84 | 85 | def inject_optional_actions(grammar, positions, target_file): 86 | """ 87 | Update the original parser grammar by injecting actions to the start and 88 | end of every quantified part. 89 | 90 | :param grammar: Path to the grammar to be updated. 91 | :param positions: Start and end locations of quantified elements. 92 | :param target_file: Path to the updated grammar. 93 | """ 94 | with open(grammar, 'rb') as f: 95 | lines = f.read().splitlines(True) 96 | 97 | languages = { 98 | 'python': { 99 | 'prefix': b'({self.enter_optional()} ', 100 | 'postfix': b' {self.exit_optional()})' 101 | }, 102 | 'java': { 103 | 'prefix': b'({ try { getClass().getMethod("enter_optional").invoke(this); } catch (Exception e) { assert false; }} ', 104 | 'postfix': b' { try { getClass().getMethod("exit_optional").invoke(this); } catch (Exception e) { assert false; }})' 105 | } 106 | } 107 | 108 | for ln in positions: 109 | offset = 0 110 | for position in sorted(positions[ln], key=lambda x: x[1]): 111 | if position[0] == 's': 112 | lines[ln - 1] = lines[ln - 1][0:position[1] + offset] + languages[lang]['prefix'] + lines[ln - 1][position[1] + offset:] 113 | offset += len(languages[lang]['prefix']) 114 | elif position[0] == 'e': 115 | lines[ln - 1] = lines[ln - 1][0:position[1] + offset] + languages[lang]['postfix'] + lines[ln - 1][position[1] + offset:] 116 | offset += len(languages[lang]['postfix']) 117 | 118 | with open(target_file, 'wb') as f: 119 | f.write(b''.join(lines)) 120 | 121 | def java_classpath(current_workdir): 122 | return pathsep.join([antlr, current_workdir]) 123 | 124 | def compile_java_sources(lexer, parser, listener, current_workdir): 125 | executor = Template(get_data(__package__, 'resources/ExtendedTargetParser.java').decode('utf-8')) 126 | with open(join(current_workdir, f'Extended{parser}.java'), 'w') as f: 127 | f.write(executor.substitute({'lexer_class': lexer, 128 | 'parser_class': parser, 129 | 'listener_class': listener})) 130 | try: 131 | run(('javac', '-classpath', java_classpath(current_workdir)) + tuple(basename(j) for j in glob(join(current_workdir, '*.java'))), 132 | stdout=PIPE, stderr=STDOUT, cwd=current_workdir, check=True) 133 | except CalledProcessError as e: 134 | logger.error('Java compile failed!\n%s\n', e.output) 135 | raise 136 | 137 | def prepare_parsing(grammar_name): 138 | """ 139 | Performs initiative steps needed to parse the input test case (like 140 | create directory structures, builds grammars, sets PATH, etc...) 141 | 142 | :param grammar_name: Name of the grammar to use for parsing. 
143 | """ 144 | grammar = input_format[grammar_name] 145 | resources = [fn for fn in grammar['files'] if not fn.endswith('.g4')] 146 | grammar['files'] = [fn for fn in grammar['files'] if fn.endswith('.g4')] 147 | 148 | replacements, action_positions = analyze_grammars(grammar['files'], grammar['replacements']) 149 | logger.debug('Replacements are calculated...') 150 | 151 | current_workdir = join(work_dir, grammar_name) if grammar_name else work_dir 152 | makedirs(current_workdir, exist_ok=True) 153 | if current_workdir not in sys.path: 154 | sys.path.append(current_workdir) 155 | 156 | # Inject actions into the target grammars to help localizing part of the test case that are optional. 157 | for i, g in enumerate(grammar['files']): 158 | grammar['files'][i] = join(current_workdir, basename(g)) 159 | inject_optional_actions(g, action_positions[g], grammar['files'][i]) 160 | 161 | for r in resources: 162 | shutil.copy(r, current_workdir) 163 | 164 | target_lexer_class, target_parser_class, target_listener_class = build_grammars(tuple(grammar['files']), current_workdir, antlr, lang) 165 | logger.debug('Target grammars are processed...') 166 | 167 | if lang == 'java': 168 | compile_java_sources(target_lexer_class, target_parser_class, target_listener_class, current_workdir) 169 | input_format[grammar_name].update(lexer=target_lexer_class, parser=target_parser_class, listener=target_listener_class, replacements=replacements) 170 | return 171 | 172 | class ExtendedTargetLexer(target_lexer_class): 173 | """ 174 | ExtendedTargetLexer is a subclass of the original lexer 175 | implementation. It can recognize skipped tokens and instead of 176 | eliminating them from the parser they can be redirected to the 177 | dedicated PICIRENY_CHANNEL for later use. 178 | """ 179 | 180 | PICIRENY_CHANNEL = -3 181 | 182 | # Skipped tokens cannot be accessed from the parser but we still need them to 183 | # unparse test cases correctly. Sending these tokens to a dedicated channel won't 184 | # alter the parse but makes these tokens available. 185 | def skip(self): 186 | self._channel = self.PICIRENY_CHANNEL 187 | 188 | class ExtendedTargetParser(target_parser_class): 189 | """ 190 | ExtendedTargetParser is a subclass of the original parser 191 | implementation. It can trigger state changes that are needed to 192 | identify parts of the input that are not needed to keep it 193 | syntactically correct. 194 | """ 195 | def enter_optional(self): 196 | self.trigger_listener('enter_optional') 197 | 198 | def exit_optional(self): 199 | self.trigger_listener('exit_optional') 200 | 201 | def enterRecursionRule(self, localctx, state, ruleIndex, precedence): 202 | super().enterRecursionRule(localctx, state, ruleIndex, precedence) 203 | self.trigger_listener('recursion_enter') 204 | 205 | def pushNewRecursionContext(self, localctx, state, ruleIndex): 206 | super().pushNewRecursionContext(localctx, state, ruleIndex) 207 | self.trigger_listener('recursion_push') 208 | 209 | def unrollRecursionContexts(self, parentCtx): 210 | super().unrollRecursionContexts(parentCtx) 211 | self.trigger_listener('recursion_unroll') 212 | 213 | def trigger_listener(self, event): 214 | for listener in self.getParseListeners(): 215 | if hasattr(listener, event): 216 | getattr(listener, event)() 217 | 218 | def syntax_error_warning(self): 219 | if self.getNumberOfSyntaxErrors() > 0: 220 | logger.warning('%s finished with %d syntax errors. 
This may decrease reduce quality.', 221 | target_parser_class.__name__, self.getNumberOfSyntaxErrors()) 222 | 223 | class ExtendedTargetListener(target_listener_class): 224 | """ 225 | ExtendedTargetListener is a subclass of the original listener 226 | implementation. It can trigger state changes that are needed to 227 | identify parts of the input that are not needed to keep it 228 | syntactically correct. 229 | """ 230 | def __init__(self, parser): 231 | self.parser = parser 232 | self.current_node = None 233 | self.root = None 234 | self.seen_terminal = False 235 | self.island_nodes = [] 236 | 237 | def recursion_enter(self): 238 | assert isinstance(self.current_node, HDDRule) 239 | node = HDDRule(self.current_node.name) 240 | self.current_node.add_child(node) 241 | self.current_node.recursive_rule = True 242 | self.current_node = node 243 | 244 | def recursion_push(self): 245 | assert self.current_node.parent.children 246 | 247 | first_child = self.current_node.parent.children[0] 248 | self.current_node.parent.remove_child(first_child) 249 | self.current_node.add_child(first_child) 250 | 251 | def recursion_unroll(self): 252 | assert self.current_node.recursive_rule 253 | assert len(self.current_node.children) == 1 and self.current_node.name == self.current_node.children[0].name 254 | children_to_lift = self.current_node.children[0].children 255 | parent = self.current_node.parent 256 | if children_to_lift: 257 | self.current_node.children = [] 258 | self.current_node.add_children(children_to_lift) 259 | else: 260 | parent.remove_child(self.current_node) 261 | self.current_node = parent 262 | 263 | def enterEveryRule(self, ctx): 264 | name = self.parser.ruleNames[ctx.getRuleIndex()] 265 | node = HDDRule(name) 266 | if not self.root: 267 | self.root = node 268 | else: 269 | assert self.current_node 270 | self.current_node.add_child(node) 271 | self.current_node = node 272 | 273 | def exitEveryRule(self, ctx): 274 | # If the input contains syntax error, then the last optional block was may not closed. 
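The trigger_listener() method defined above dispatches parser state changes by name: every attached parse listener that implements a handler matching the event name gets called, and listeners without such a handler are skipped. The snippet below is a standalone sketch of that hasattr/getattr dispatch pattern, detached from ANTLR; the DemoListener class and the event names are illustrative only. The exitEveryRule() handler continues right below, first closing any quantifier nodes left open because of a syntax error.

    class DemoListener:
        def enter_optional(self):
            print('optional part entered')
        # no exit_optional handler: such events are simply ignored

    def trigger_listener(listeners, event):
        for listener in listeners:
            if hasattr(listener, event):
                getattr(listener, event)()

    demo = [DemoListener()]
    trigger_listener(demo, 'enter_optional')   # handler exists, gets called
    trigger_listener(demo, 'recursion_enter')  # no handler, silently skipped
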
275 | while isinstance(self.current_node, HDDQuantifier): 276 | self.exit_optional() 277 | 278 | assert self.current_node.name == self.parser.ruleNames[ctx.getRuleIndex()], \ 279 | f'{self.current_node.name} ({self.current_node!r}) != {self.parser.ruleNames[ctx.getRuleIndex()]}' 280 | 281 | if self.current_node.parent: 282 | self.current_node = self.current_node.parent 283 | 284 | def tokenBoundaries(self, token): 285 | start = Position(token.line, token.column) 286 | return start, start.after(token.text) 287 | 288 | def addToken(self, node, child): 289 | if not self.seen_terminal: 290 | hidden_tokens = self.parser.getTokenStream().getHiddenTokensToLeft(node.symbol.tokenIndex, -1) or [] 291 | for token in hidden_tokens: 292 | start, end = self.tokenBoundaries(token) 293 | self.current_node.add_child(HDDHiddenToken(self.parser.symbolicNames[token.type], token.text, 294 | start=start, end=end)) 295 | self.seen_terminal = True 296 | 297 | self.current_node.add_child(child) 298 | 299 | hidden_tokens = self.parser.getTokenStream().getHiddenTokensToRight(node.symbol.tokenIndex, -1) or [] 300 | for token in hidden_tokens: 301 | start, end = self.tokenBoundaries(token) 302 | self.current_node.add_child(HDDHiddenToken(self.parser.symbolicNames[token.type], token.text, 303 | start=start, end=end)) 304 | 305 | def visitTerminal(self, node): 306 | token = node.symbol 307 | name, text = (self.parser.symbolicNames[token.type], token.text) if token.type != Token.EOF else ('EOF', '') 308 | start, end = self.tokenBoundaries(token) 309 | 310 | child = HDDToken(name, text, start=start, end=end) 311 | self.addToken(node, child) 312 | if name in grammar['islands']: 313 | self.island_nodes.append(child) 314 | 315 | def visitErrorNode(self, node): 316 | if hasattr(node, 'symbol'): 317 | token = node.symbol 318 | start, end = self.tokenBoundaries(token) 319 | self.addToken(node, HDDErrorToken(token.text, start=start, end=end)) 320 | 321 | def enter_optional(self): 322 | quant_node = HDDQuantifier() 323 | self.current_node.add_child(quant_node) 324 | self.current_node = quant_node 325 | 326 | def exit_optional(self): 327 | assert self.current_node.parent, 'Quantifier node has no parent.' 328 | assert self.current_node.children, 'Quantifier node has no children.' 329 | 330 | self.current_node = self.current_node.parent 331 | 332 | input_format[grammar_name].update(lexer=ExtendedTargetLexer, parser=ExtendedTargetParser, listener=ExtendedTargetListener, replacements=replacements) 333 | 334 | class ExtendedErrorListener(error.ErrorListener.ErrorListener): 335 | 336 | def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e): 337 | t = CommonToken(source=(recognizer, recognizer._input), 338 | type=Token.INVALID_TYPE, 339 | channel=Token.DEFAULT_CHANNEL, 340 | start=recognizer._tokenStartCharIndex, 341 | stop=recognizer._tokenStartCharIndex) 342 | t.line = recognizer._tokenStartLine 343 | t.column = recognizer._tokenStartColumn 344 | recognizer._type = Token.MIN_USER_TOKEN_TYPE 345 | recognizer.emitToken(t) 346 | 347 | def build_hdd_tree(src, grammar_name, start_rule): 348 | """ 349 | Parse the input with the provided ANTLR classes. 350 | 351 | :param src: Input source. 352 | :param grammar_name: Name of the grammar to use for parsing. 353 | :param start_rule: The name of the start rule of the parser. 354 | :return: The root of the created HDD tree. 
355 | """ 356 | 357 | grammar = input_format[grammar_name] 358 | island_nodes = [] 359 | 360 | def set_replacement(node): 361 | if isinstance(node, (HDDQuantifier, HDDErrorToken)): 362 | node.replace = '' 363 | elif isinstance(node, HDDRule): 364 | node.replace = grammar['replacements'][node.name] 365 | else: 366 | node.replace = grammar['replacements'].get(node.name, node.text) 367 | 368 | if isinstance(node, HDDRule): 369 | for child in node.children: 370 | set_replacement(child) 371 | 372 | logger.debug('Parse input with %s rule', start_rule) 373 | if lang != 'python': 374 | 375 | def hdd_tree_from_dict(node_dict): 376 | # Convert interval dictionaries to Position objects. 377 | if 'start' in node_dict: 378 | node_dict['start'] = Position(**node_dict['start']) 379 | if 'end' in node_dict: 380 | node_dict['end'] = Position(**node_dict['end']) 381 | 382 | name = node_dict.get('name', None) 383 | children = node_dict.pop('children', None) 384 | cls = globals()[node_dict.pop('type')] 385 | node = cls(**node_dict) 386 | 387 | if children: 388 | for child in children: 389 | node.add_child(hdd_tree_from_dict(child)) 390 | elif name: 391 | if name in grammar['islands']: 392 | island_nodes.append(node) 393 | return node 394 | 395 | try: 396 | current_workdir = join(work_dir, grammar_name) if grammar_name else work_dir 397 | proc = run(('java', '-classpath', java_classpath(current_workdir), f'Extended{grammar["parser"]}', start_rule), 398 | input=src, stdout=PIPE, stderr=PIPE, universal_newlines=True, cwd=current_workdir, check=True) 399 | if proc.stderr: 400 | logger.debug(proc.stderr) 401 | result = xson.loads(proc.stdout) 402 | tree_root = hdd_tree_from_dict(result) 403 | except CalledProcessError as e: 404 | logger.error('Java parser failed!\n%s\n%s', e.stdout, e.stderr) 405 | raise 406 | else: 407 | lexer = grammar['lexer'](InputStream(src)) 408 | lexer.addErrorListener(ExtendedErrorListener()) 409 | target_parser = grammar['parser'](CommonTokenStream(lexer)) 410 | parser_listener = grammar['listener'](target_parser) 411 | target_parser.addParseListener(parser_listener) 412 | 413 | getattr(target_parser, start_rule)() 414 | target_parser.syntax_error_warning() 415 | island_nodes = parser_listener.island_nodes 416 | assert parser_listener.root == parser_listener.current_node 417 | tree_root = parser_listener.root 418 | 419 | # Traverse the HDD tree and set minimal replacements for nodes. 420 | set_replacement(tree_root) 421 | process_island_nodes(island_nodes, grammar['islands']) 422 | logger.debug('Parse done.') 423 | return tree_root 424 | 425 | def process_island_nodes(island_nodes, island_format): 426 | for node in island_nodes: 427 | if not isinstance(island_format[node.name], tuple): 428 | rewritten, mapping = rename_regex_groups(island_format[node.name]) 429 | for new_name, old_name in mapping.items(): 430 | grammar_name, rule_name = split_grammar_rule_name(old_name) 431 | mapping[new_name] = (grammar_name, rule_name) 432 | if 'lexer' not in input_format[grammar_name]: 433 | prepare_parsing(grammar_name) 434 | island_format[node.name] = (re.compile(rewritten, re.S), mapping) 435 | 436 | new_node = HDDRule(node.name, replace=node.replace) 437 | new_node.add_children(build_island_subtree(node, *island_format[node.name])) 438 | node.replace_with(new_node) 439 | 440 | def build_island_subtree(node, pattern, mapping): 441 | """ 442 | Process terminal with an island grammar. 443 | 444 | :param node: HDDToken object containing island language. 
445 | :return: List of HDDTree nodes representing the `children` of node. 446 | """ 447 | last_processed = 0 448 | content = node.text 449 | children = [] 450 | 451 | # Intervals describes a non-overlapping splitting of the content according to the pattern. 452 | intervals = [] 453 | for m in re.finditer(pattern, content): 454 | intervals.extend((g, m.start(g), m.end(g)) for g in list(pattern.groupindex.keys()) if m.start(g) != m.end(g)) 455 | intervals.sort(key=lambda x: (x[1], x[2])) 456 | 457 | def shift_positions(node, start): 458 | if node.start: 459 | node.start.shift(start) 460 | if node.end: 461 | node.end.shift(start) 462 | 463 | if isinstance(node, HDDRule): 464 | for child in node.children: 465 | shift_positions(child, start) 466 | 467 | for interval in intervals: 468 | # Create simple HDDToken of the substring proceeding a subgroup. 469 | if last_processed < interval[1]: 470 | token_start = node.start.after(content[0:last_processed]) 471 | token_text = content[last_processed:interval[1]] 472 | children.append(HDDToken('', token_text, 473 | start=token_start, 474 | end=token_start.after(token_text), 475 | replace=token_text)) 476 | 477 | # Process an island and save its subtree. 478 | island_start = node.start.after(content[0:interval[1]]) 479 | island_root = build_hdd_tree(src=content[interval[1]:interval[2]], 480 | grammar_name=mapping[interval[0]][0], 481 | start_rule=mapping[interval[0]][1]) 482 | shift_positions(island_root, island_start) 483 | children.append(island_root) 484 | 485 | last_processed = interval[2] 486 | 487 | # Create simple HDDToken of the substring following the last subgroup if any. 488 | if last_processed < len(content): 489 | token_start = node.start.after(content[0:last_processed]) 490 | token_text = content[last_processed:] 491 | children.append(HDDToken('', token_text, 492 | start=token_start, 493 | end=token_start.after(token_text), 494 | replace=token_text)) 495 | return children 496 | 497 | def calculate_rule_boundaries(node): 498 | if isinstance(node, HDDRule): 499 | for child in node.children: 500 | calculate_rule_boundaries(child) 501 | 502 | node.start = node.children[0].start 503 | node.end = node.children[-1].end 504 | 505 | return node 506 | 507 | def remove_hidden_tokens(node): 508 | if isinstance(node, HDDRule): 509 | non_hidden_children = [] 510 | 511 | for child in node.children: 512 | if not isinstance(child, HDDHiddenToken): 513 | remove_hidden_tokens(child) 514 | non_hidden_children.append(child) 515 | 516 | node.children[:] = non_hidden_children 517 | 518 | return node 519 | 520 | _NAMED_GRP_PATTERN = re.compile(r'(?]*>)') # "(?P" not prefixed by a "\" 521 | _NAMED_GRP_PREFIX = '(?P<' 522 | _NAMED_GRP_SUFFIX = '>' 523 | _NAMED_REF_PATTERN = re.compile(r'(? 
0: 71 | self._type = self.ARGUMENT_CONTENT 72 | 73 | def handleEndAction(self): 74 | oldMode = self._mode 75 | newMode = self.popMode() 76 | isActionWithinAction = len(self._modeStack) > 0 and newMode == self.TargetLanguageAction and oldMode == newMode 77 | if isActionWithinAction: 78 | self._type = self.ACTION_CONTENT 79 | 80 | def emit(self): 81 | if (self._type == self.OPTIONS or self._type == self.TOKENS or self._type == self.CHANNELS) and self._currentRuleType == Token.INVALID_TYPE: 82 | self._currentRuleType = self.PREQUEL_CONSTRUCT 83 | elif self._type == self.OPTIONS and self._currentRuleType == self.TOKEN_REF: 84 | self._currentRuleType = self.OPTIONS_CONSTRUCT 85 | elif self._type == self.RBRACE and self._currentRuleType == self.PREQUEL_CONSTRUCT: 86 | self._currentRuleType = Token.INVALID_TYPE 87 | elif self._type == self.RBRACE and self._currentRuleType == self.OPTIONS_CONSTRUCT: 88 | self._currentRuleType = self.TOKEN_REF 89 | elif self._type == self.AT and self._currentRuleType == Token.INVALID_TYPE: 90 | self._currentRuleType = self.AT 91 | elif self._type == self.SEMI and self._currentRuleType == self.OPTIONS_CONSTRUCT: 92 | self._currentRuleType = self._currentRuleType 93 | elif self._type == self.END_ACTION and self._currentRuleType == self.AT: 94 | self._currentRuleType = Token.INVALID_TYPE 95 | elif self._type == self.ID: 96 | firstChar = self._input.getText(self._tokenStartCharIndex, self._tokenStartCharIndex) 97 | if firstChar[0].isupper(): 98 | self._type = self.TOKEN_REF 99 | else: 100 | self._type = self.RULE_REF 101 | 102 | if self._currentRuleType == Token.INVALID_TYPE: # if outside of rule def 103 | self._currentRuleType = self._type # set to inside lexer or parser rule 104 | 105 | elif self._type == self.SEMI: # exit rule def 106 | self._currentRuleType = Token.INVALID_TYPE 107 | return Lexer.emit(self) 108 | 109 | def inLexerRule(self): 110 | return self._currentRuleType == self.TOKEN_REF 111 | 112 | def inParserRule(self): # not used, but added for clarity 113 | return self._currentRuleType == self.RULE_REF 114 | -------------------------------------------------------------------------------- /src/picireny/antlr4/parser/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 7 | 8 | from .ANTLRv4Lexer import ANTLRv4Lexer 9 | from .ANTLRv4Parser import ANTLRv4Parser 10 | -------------------------------------------------------------------------------- /src/picireny/antlr4/parser_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-2022 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 7 | 8 | import logging 9 | 10 | from os import listdir 11 | from os.path import basename, commonprefix, split, splitext 12 | from subprocess import CalledProcessError, PIPE, run, STDOUT 13 | 14 | logger = logging.getLogger(__name__) 15 | grammar_cache = {} 16 | 17 | 18 | def build_grammars(grammars, out, antlr, lang='python'): 19 | """ 20 | Build lexer and grammar from ANTLRv4 grammar files in Python target. 21 | 22 | :param grammars: Tuple of grammar files. 23 | :param out: Output directory. 
24 | :param antlr: Path to the ANTLR4 tool (Java jar binary). 25 | :param lang: The target language of the parser. 26 | :return: List of references/names of the lexer, parser and listener classes 27 | of the target. 28 | """ 29 | 30 | # Generate parser and lexer in the target language and return either with 31 | # python class ref or the name of java classes. 32 | if lang not in grammar_cache: 33 | grammar_cache[lang] = {} 34 | if grammars in grammar_cache[lang]: 35 | logger.debug('%r is already built with %s target.', grammars, lang) 36 | return grammar_cache[lang][grammars] 37 | 38 | try: 39 | languages = { 40 | 'python': {'antlr_arg': '-Dlanguage=Python3', 'ext': 'py', 'listener_format': 'Listener'}, 41 | 'java': {'antlr_arg': '-Dlanguage=Java', 'ext': 'java', 'listener_format': 'BaseListener'}, 42 | } 43 | 44 | try: 45 | run(('java', '-jar', antlr, languages[lang]['antlr_arg'], '-o', out) + grammars, 46 | stdout=PIPE, stderr=STDOUT, cwd=out, check=True) 47 | except CalledProcessError as e: 48 | logger.error('Building grammars %r failed!\n%s\n', grammars, e.output) 49 | raise 50 | 51 | files = listdir(out) 52 | filename = basename(grammars[0]) 53 | 54 | def file_endswith(end_pattern): 55 | f = next(f for f in files if len(commonprefix([filename, f])) > 0 and f.endswith(end_pattern)) 56 | _, f = split(f) 57 | f, _ = splitext(f) 58 | return f 59 | 60 | # Extract the name of lexer and parser from their path. 61 | lexer = file_endswith(f'Lexer.{languages[lang]["ext"]}') 62 | parser = file_endswith(f'Parser.{languages[lang]["ext"]}') 63 | # The name of the generated listeners differs if Python or other language target is used. 64 | listener = file_endswith(f'{languages[lang]["listener_format"]}.{languages[lang]["ext"]}') 65 | 66 | if lang == 'python': 67 | grammar_cache[lang][grammars] = [getattr(__import__(x, globals(), locals(), [x], 0), x) for x in [lexer, parser, listener]] 68 | else: 69 | grammar_cache[lang][grammars] = [lexer, parser, listener] 70 | 71 | return grammar_cache[lang][grammars] 72 | except Exception as e: 73 | logger.error('Exception while loading parser modules', exc_info=e) 74 | raise 75 | -------------------------------------------------------------------------------- /src/picireny/antlr4/resources/ANTLRv4Lexer.g4: -------------------------------------------------------------------------------- 1 | /* 2 | * [The "BSD license"] 3 | * Copyright (c) 2012-2015 Terence Parr 4 | * Copyright (c) 2012-2015 Sam Harwell 5 | * Copyright (c) 2015 Gerald Rosenberg 6 | * All rights reserved. 7 | * 8 | * Redistribution and use in source and binary forms, with or without 9 | * modification, are permitted provided that the following conditions 10 | * are met: 11 | * 12 | * 1. Redistributions of source code must retain the above copyright 13 | * notice, this list of conditions and the following disclaimer. 14 | * 2. Redistributions in binary form must reproduce the above copyright 15 | * notice, this list of conditions and the following disclaimer in the 16 | * documentation and/or other materials provided with the distribution. 17 | * 3. The name of the author may not be used to endorse or promote products 18 | * derived from this software without specific prior written permission. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 21 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
23 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 24 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 25 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 29 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | /** 32 | * A grammar for ANTLR v4 implemented using v4 syntax 33 | * 34 | * Modified 2015.06.16 gbr 35 | * -- update for compatibility with Antlr v4.5 36 | */ 37 | 38 | // ====================================================== 39 | // Lexer specification 40 | // ====================================================== 41 | 42 | lexer grammar ANTLRv4Lexer; 43 | 44 | options { superClass = LexerAdaptor; } 45 | import LexBasic; 46 | 47 | // Standard set of fragments 48 | tokens { TOKEN_REF , RULE_REF , LEXER_CHAR_SET } 49 | channels { OFF_CHANNEL , COMMENT } 50 | 51 | // ------------------------- 52 | // Comments 53 | DOC_COMMENT 54 | : DocComment -> channel (COMMENT) 55 | ; 56 | 57 | BLOCK_COMMENT 58 | : BlockComment -> channel (COMMENT) 59 | ; 60 | 61 | LINE_COMMENT 62 | : LineComment -> channel (COMMENT) 63 | ; 64 | 65 | // ------------------------- 66 | // Integer 67 | 68 | INT 69 | : DecimalNumeral 70 | ; 71 | 72 | // ------------------------- 73 | // Literal string 74 | // 75 | // ANTLR makes no distinction between a single character literal and a 76 | // multi-character string. All literals are single quote delimited and 77 | // may contain unicode escape sequences of the form \uxxxx, where x 78 | // is a valid hexadecimal number (per Unicode standard). 79 | STRING_LITERAL 80 | : SQuoteLiteral 81 | ; 82 | 83 | UNTERMINATED_STRING_LITERAL 84 | : USQuoteLiteral 85 | ; 86 | 87 | // ------------------------- 88 | // Arguments 89 | // 90 | // Certain argument lists, such as those specifying call parameters 91 | // to a rule invocation, or input parameters to a rule specification 92 | // are contained within square brackets. 93 | BEGIN_ARGUMENT 94 | : LBrack 95 | { self.handleBeginArgument() } 96 | ; 97 | 98 | // ------------------------- 99 | // Target Language Actions 100 | BEGIN_ACTION 101 | : LBrace -> pushMode (TargetLanguageAction) 102 | ; 103 | 104 | // ------------------------- 105 | // Keywords 106 | // 107 | // 'options', 'tokens', and 'channels' are considered keywords 108 | // but only when followed by '{', and considered as a single token. 109 | // Otherwise, the symbols are tokenized as RULE_REF and allowed as 110 | // an identifier in a labeledElement. 
111 | OPTIONS : 'options' WSNLCHARS* '{' ; 112 | TOKENS : 'tokens' WSNLCHARS* '{' ; 113 | CHANNELS : 'channels' WSNLCHARS* '{' ; 114 | 115 | fragment WSNLCHARS : ' ' | '\t' | '\f' | '\n' | '\r' ; 116 | 117 | IMPORT 118 | : 'import' 119 | ; 120 | 121 | FRAGMENT 122 | : 'fragment' 123 | ; 124 | 125 | LEXER 126 | : 'lexer' 127 | ; 128 | 129 | PARSER 130 | : 'parser' 131 | ; 132 | 133 | GRAMMAR 134 | : 'grammar' 135 | ; 136 | 137 | PROTECTED 138 | : 'protected' 139 | ; 140 | 141 | PUBLIC 142 | : 'public' 143 | ; 144 | 145 | PRIVATE 146 | : 'private' 147 | ; 148 | 149 | RETURNS 150 | : 'returns' 151 | ; 152 | 153 | LOCALS 154 | : 'locals' 155 | ; 156 | 157 | THROWS 158 | : 'throws' 159 | ; 160 | 161 | CATCH 162 | : 'catch' 163 | ; 164 | 165 | FINALLY 166 | : 'finally' 167 | ; 168 | 169 | MODE 170 | : 'mode' 171 | ; 172 | // ------------------------- 173 | // Punctuation 174 | 175 | COLON 176 | : Colon 177 | ; 178 | 179 | COLONCOLON 180 | : DColon 181 | ; 182 | 183 | COMMA 184 | : Comma 185 | ; 186 | 187 | SEMI 188 | : Semi 189 | ; 190 | 191 | LPAREN 192 | : LParen 193 | ; 194 | 195 | RPAREN 196 | : RParen 197 | ; 198 | 199 | LBRACE 200 | : LBrace 201 | ; 202 | 203 | RBRACE 204 | : RBrace 205 | ; 206 | 207 | RARROW 208 | : RArrow 209 | ; 210 | 211 | LT 212 | : Lt 213 | ; 214 | 215 | GT 216 | : Gt 217 | ; 218 | 219 | ASSIGN 220 | : Equal 221 | ; 222 | 223 | QUESTION 224 | : Question 225 | ; 226 | 227 | STAR 228 | : Star 229 | ; 230 | 231 | PLUS_ASSIGN 232 | : PlusAssign 233 | ; 234 | 235 | PLUS 236 | : Plus 237 | ; 238 | 239 | OR 240 | : Pipe 241 | ; 242 | 243 | DOLLAR 244 | : Dollar 245 | ; 246 | 247 | RANGE 248 | : Range 249 | ; 250 | 251 | DOT 252 | : Dot 253 | ; 254 | 255 | AT 256 | : At 257 | ; 258 | 259 | POUND 260 | : Pound 261 | ; 262 | 263 | NOT 264 | : Tilde 265 | ; 266 | // ------------------------- 267 | // Identifiers - allows unicode rule/token names 268 | 269 | ID 270 | : Id 271 | ; 272 | // ------------------------- 273 | // Whitespace 274 | 275 | WS 276 | : Ws+ -> channel (OFF_CHANNEL) 277 | ; 278 | 279 | // ------------------------- 280 | // Illegal Characters 281 | // 282 | // This is an illegal character trap which is always the last rule in the 283 | // lexer specification. It matches a single character of any value and being 284 | // the last rule in the file will match when no other rule knows what to do 285 | // about the character. It is reported as an error but is not passed on to the 286 | // parser. This means that the parser to deal with the gramamr file anyway 287 | // but we will not try to analyse or code generate from a file with lexical 288 | // errors. 289 | 290 | // Comment this rule out to allow the error to be propagated to the parser 291 | ERRCHAR 292 | : . -> channel (HIDDEN) 293 | ; 294 | 295 | // ====================================================== 296 | // Lexer modes 297 | // ------------------------- 298 | // Arguments 299 | mode Argument; 300 | // E.g., [int x, List a[]] 301 | NESTED_ARGUMENT 302 | : LBrack -> type (ARGUMENT_CONTENT) , pushMode (Argument) 303 | ; 304 | 305 | ARGUMENT_ESCAPE 306 | : EscAny -> type (ARGUMENT_CONTENT) 307 | ; 308 | 309 | ARGUMENT_STRING_LITERAL 310 | : DQuoteLiteral -> type (ARGUMENT_CONTENT) 311 | ; 312 | 313 | ARGUMENT_CHAR_LITERAL 314 | : SQuoteLiteral -> type (ARGUMENT_CONTENT) 315 | ; 316 | 317 | END_ARGUMENT 318 | : RBrack 319 | { self.handleEndArgument() } 320 | ; 321 | 322 | // added this to return non-EOF token type here. 
EOF does something weird 323 | UNTERMINATED_ARGUMENT 324 | : EOF -> popMode 325 | ; 326 | 327 | ARGUMENT_CONTENT 328 | : . 329 | ; 330 | 331 | // ------------------------- 332 | // Target Language Actions 333 | // 334 | // Many language targets use {} as block delimiters and so we 335 | // must recursively match {} delimited blocks to balance the 336 | // braces. Additionally, we must make some assumptions about 337 | // literal string representation in the target language. We assume 338 | // that they are delimited by ' or " and so consume these 339 | // in their own alts so as not to inadvertantly match {}. 340 | mode TargetLanguageAction; 341 | NESTED_ACTION 342 | : LBrace -> type (ACTION_CONTENT) , pushMode (TargetLanguageAction) 343 | ; 344 | 345 | ACTION_ESCAPE 346 | : EscAny -> type (ACTION_CONTENT) 347 | ; 348 | 349 | ACTION_STRING_LITERAL 350 | : DQuoteLiteral -> type (ACTION_CONTENT) 351 | ; 352 | 353 | ACTION_CHAR_LITERAL 354 | : SQuoteLiteral -> type (ACTION_CONTENT) 355 | ; 356 | 357 | ACTION_DOC_COMMENT 358 | : DocComment -> type (ACTION_CONTENT) 359 | ; 360 | 361 | ACTION_BLOCK_COMMENT 362 | : BlockComment -> type (ACTION_CONTENT) 363 | ; 364 | 365 | ACTION_LINE_COMMENT 366 | : LineComment -> type (ACTION_CONTENT) 367 | ; 368 | 369 | END_ACTION 370 | : RBrace 371 | { self.handleEndAction() } 372 | ; 373 | 374 | UNTERMINATED_ACTION 375 | : EOF -> popMode 376 | ; 377 | 378 | ACTION_CONTENT 379 | : . 380 | ; 381 | 382 | // ------------------------- 383 | mode LexerCharSet; 384 | LEXER_CHAR_SET_BODY 385 | : (~ [\]\\] | EscAny)+ -> more 386 | ; 387 | 388 | LEXER_CHAR_SET 389 | : RBrack -> popMode 390 | ; 391 | 392 | UNTERMINATED_CHAR_SET 393 | : EOF -> popMode 394 | ; 395 | 396 | // ------------------------------------------------------------------------------ 397 | // Grammar specific Keywords, Punctuation, etc. 398 | fragment Id 399 | : NameStartChar NameChar* 400 | ; 401 | 402 | -------------------------------------------------------------------------------- /src/picireny/antlr4/resources/ANTLRv4Parser.g4: -------------------------------------------------------------------------------- 1 | /* 2 | * [The "BSD license"] 3 | * Copyright (c) 2012-2014 Terence Parr 4 | * Copyright (c) 2012-2014 Sam Harwell 5 | * Copyright (c) 2015 Gerald Rosenberg 6 | * All rights reserved. 7 | * 8 | * Redistribution and use in source and binary forms, with or without 9 | * modification, are permitted provided that the following conditions 10 | * are met: 11 | * 12 | * 1. Redistributions of source code must retain the above copyright 13 | * notice, this list of conditions and the following disclaimer. 14 | * 2. Redistributions in binary form must reproduce the above copyright 15 | * notice, this list of conditions and the following disclaimer in the 16 | * documentation and/or other materials provided with the distribution. 17 | * 3. The name of the author may not be used to endorse or promote products 18 | * derived from this software without specific prior written permission. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 21 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
23 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 24 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 25 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 29 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | /* A grammar for ANTLR v4 written in ANTLR v4. 33 | * 34 | * Modified 2015.06.16 gbr 35 | * -- update for compatibility with Antlr v4.5 36 | * -- add mode for channels 37 | * -- moved members to LexerAdaptor 38 | * -- move fragments to imports 39 | */ 40 | parser grammar ANTLRv4Parser; 41 | 42 | 43 | options { tokenVocab = ANTLRv4Lexer; } 44 | // The main entry point for parsing a v4 grammar. 45 | grammarSpec 46 | : grammarDecl prequelConstruct* rules modeSpec* EOF 47 | ; 48 | 49 | grammarDecl 50 | : grammarType identifier SEMI 51 | ; 52 | 53 | grammarType 54 | : (LEXER GRAMMAR | PARSER GRAMMAR | GRAMMAR) 55 | ; 56 | // This is the list of all constructs that can be declared before 57 | // the set of rules that compose the grammar, and is invoked 0..n 58 | // times by the grammarPrequel rule. 59 | 60 | prequelConstruct 61 | : optionsSpec 62 | | delegateGrammars 63 | | tokensSpec 64 | | channelsSpec 65 | | action_ 66 | ; 67 | // ------------ 68 | // Options - things that affect analysis and/or code generation 69 | 70 | optionsSpec 71 | : OPTIONS (option SEMI)* RBRACE 72 | ; 73 | 74 | option 75 | : identifier ASSIGN optionValue 76 | ; 77 | 78 | optionValue 79 | : identifier (DOT identifier)* 80 | | STRING_LITERAL 81 | | actionBlock 82 | | INT 83 | ; 84 | // ------------ 85 | // Delegates 86 | 87 | delegateGrammars 88 | : IMPORT delegateGrammar (COMMA delegateGrammar)* SEMI 89 | ; 90 | 91 | delegateGrammar 92 | : identifier ASSIGN identifier 93 | | identifier 94 | ; 95 | // ------------ 96 | // Tokens & Channels 97 | 98 | tokensSpec 99 | : TOKENS idList? RBRACE 100 | ; 101 | 102 | channelsSpec 103 | : CHANNELS idList? RBRACE 104 | ; 105 | 106 | idList 107 | : identifier (COMMA identifier)* COMMA? 108 | ; 109 | // Match stuff like @parser::members {int i;} 110 | 111 | action_ 112 | : AT (actionScopeName COLONCOLON)? identifier actionBlock 113 | ; 114 | // Scope names could collide with keywords; allow them as ids for action scopes 115 | 116 | actionScopeName 117 | : identifier 118 | | LEXER 119 | | PARSER 120 | ; 121 | 122 | actionBlock 123 | : BEGIN_ACTION ACTION_CONTENT* END_ACTION 124 | ; 125 | 126 | argActionBlock 127 | : BEGIN_ARGUMENT ARGUMENT_CONTENT* END_ARGUMENT 128 | ; 129 | 130 | modeSpec 131 | : MODE identifier SEMI lexerRuleSpec* 132 | ; 133 | 134 | rules 135 | : ruleSpec* 136 | ; 137 | 138 | ruleSpec 139 | : parserRuleSpec 140 | | lexerRuleSpec 141 | ; 142 | 143 | parserRuleSpec 144 | : ruleModifiers? RULE_REF argActionBlock? ruleReturns? throwsSpec? localsSpec? rulePrequel* COLON ruleBlock SEMI exceptionGroup 145 | ; 146 | 147 | exceptionGroup 148 | : exceptionHandler* finallyClause? 
149 | ; 150 | 151 | exceptionHandler 152 | : CATCH argActionBlock actionBlock 153 | ; 154 | 155 | finallyClause 156 | : FINALLY actionBlock 157 | ; 158 | 159 | rulePrequel 160 | : optionsSpec 161 | | ruleAction 162 | ; 163 | 164 | ruleReturns 165 | : RETURNS argActionBlock 166 | ; 167 | 168 | // -------------- 169 | // Exception spec 170 | throwsSpec 171 | : THROWS identifier (COMMA identifier)* 172 | ; 173 | 174 | localsSpec 175 | : LOCALS argActionBlock 176 | ; 177 | 178 | /** Match stuff like @init {int i;} */ 179 | ruleAction 180 | : AT identifier actionBlock 181 | ; 182 | 183 | ruleModifiers 184 | : ruleModifier+ 185 | ; 186 | // An individual access modifier for a rule. The 'fragment' modifier 187 | // is an internal indication for lexer rules that they do not match 188 | // from the input but are like subroutines for other lexer rules to 189 | // reuse for certain lexical patterns. The other modifiers are passed 190 | // to the code generation templates and may be ignored by the template 191 | // if they are of no use in that language. 192 | 193 | ruleModifier 194 | : PUBLIC 195 | | PRIVATE 196 | | PROTECTED 197 | | FRAGMENT 198 | ; 199 | 200 | ruleBlock 201 | : ruleAltList 202 | ; 203 | 204 | ruleAltList 205 | : labeledAlt (OR labeledAlt)* 206 | ; 207 | 208 | labeledAlt 209 | : alternative (POUND identifier)? 210 | ; 211 | // -------------------- 212 | // Lexer rules 213 | 214 | lexerRuleSpec 215 | : FRAGMENT? TOKEN_REF optionsSpec? COLON lexerRuleBlock SEMI 216 | ; 217 | 218 | lexerRuleBlock 219 | : lexerAltList 220 | ; 221 | 222 | lexerAltList 223 | : lexerAlt (OR lexerAlt)* 224 | ; 225 | 226 | lexerAlt 227 | : lexerElements lexerCommands? 228 | | 229 | // explicitly allow empty alts 230 | ; 231 | 232 | lexerElements 233 | : lexerElement+ 234 | | 235 | ; 236 | 237 | lexerElement 238 | : lexerAtom ebnfSuffix? 239 | | lexerBlock ebnfSuffix? 240 | | actionBlock QUESTION? 241 | ; 242 | // but preds can be anywhere 243 | 244 | lexerBlock 245 | : LPAREN lexerAltList RPAREN 246 | ; 247 | // E.g., channel(HIDDEN), skip, more, mode(INSIDE), push(INSIDE), pop 248 | 249 | lexerCommands 250 | : RARROW lexerCommand (COMMA lexerCommand)* 251 | ; 252 | 253 | lexerCommand 254 | : lexerCommandName LPAREN lexerCommandExpr RPAREN 255 | | lexerCommandName 256 | ; 257 | 258 | lexerCommandName 259 | : identifier 260 | | MODE 261 | ; 262 | 263 | lexerCommandExpr 264 | : identifier 265 | | INT 266 | ; 267 | // -------------------- 268 | // Rule Alts 269 | 270 | altList 271 | : alternative (OR alternative)* 272 | ; 273 | 274 | alternative 275 | : elementOptions? element+ 276 | | 277 | // explicitly allow empty alts 278 | ; 279 | 280 | element 281 | : labeledElement (ebnfSuffix |) 282 | | atom (ebnfSuffix |) 283 | | ebnf 284 | | actionBlock QUESTION? 285 | ; 286 | 287 | labeledElement 288 | : identifier (ASSIGN | PLUS_ASSIGN) (atom | block) 289 | ; 290 | // -------------------- 291 | // EBNF and blocks 292 | 293 | ebnf 294 | : block blockSuffix? 295 | ; 296 | 297 | blockSuffix 298 | : ebnfSuffix 299 | ; 300 | 301 | ebnfSuffix 302 | : QUESTION QUESTION? 303 | | STAR QUESTION? 304 | | PLUS QUESTION? 305 | ; 306 | 307 | lexerAtom 308 | : characterRange 309 | | terminal 310 | | notSet 311 | | LEXER_CHAR_SET 312 | | DOT elementOptions? 313 | ; 314 | 315 | atom 316 | : terminal 317 | | ruleref 318 | | notSet 319 | | DOT elementOptions? 
320 | ; 321 | 322 | // -------------------- 323 | // Inverted element set 324 | notSet 325 | : NOT setElement 326 | | NOT blockSet 327 | ; 328 | 329 | blockSet 330 | : LPAREN setElement (OR setElement)* RPAREN 331 | ; 332 | 333 | setElement 334 | : TOKEN_REF elementOptions? 335 | | STRING_LITERAL elementOptions? 336 | | characterRange 337 | | LEXER_CHAR_SET 338 | ; 339 | 340 | // ------------- 341 | // Grammar Block 342 | block 343 | : LPAREN (optionsSpec? ruleAction* COLON)? altList RPAREN 344 | ; 345 | 346 | // ---------------- 347 | // Parser rule ref 348 | ruleref 349 | : RULE_REF argActionBlock? elementOptions? 350 | ; 351 | 352 | // --------------- 353 | // Character Range 354 | characterRange 355 | : STRING_LITERAL RANGE STRING_LITERAL 356 | ; 357 | 358 | terminal 359 | : TOKEN_REF elementOptions? 360 | | STRING_LITERAL elementOptions? 361 | ; 362 | 363 | // Terminals may be adorned with certain options when 364 | // reference in the grammar: TOK<,,,> 365 | elementOptions 366 | : LT elementOption (COMMA elementOption)* GT 367 | ; 368 | 369 | elementOption 370 | : identifier 371 | | identifier ASSIGN (identifier | STRING_LITERAL) 372 | ; 373 | 374 | identifier 375 | : RULE_REF 376 | | TOKEN_REF 377 | ; 378 | 379 | -------------------------------------------------------------------------------- /src/picireny/antlr4/resources/ExtendedTargetParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2023 Renata Hodovan, Akos Kiss. 3 | * 4 | * Licensed under the BSD 3-Clause License 5 | * . 6 | * This file may not be copied, modified, or distributed except 7 | * according to those terms. 8 | */ 9 | 10 | import java.io.*; 11 | import java.util.*; 12 | import javax.xml.stream.*; 13 | 14 | import org.antlr.v4.runtime.*; 15 | import org.antlr.v4.runtime.tree.*; 16 | import org.antlr.v4.runtime.misc.Pair; 17 | 18 | 19 | /** 20 | * Extended$parser_class is a subclass of the original parser implementation. 21 | * It can trigger state changes that are needed to identify parts of the input 22 | * that are not needed to keep it syntactically correct. 
23 | */ 24 | public class Extended$parser_class extends $parser_class { 25 | 26 | private static class ExtendedErrorListener extends BaseErrorListener { 27 | 28 | @Override 29 | public void syntaxError(Recognizer recognizer, 30 | Object offendingSymbol, 31 | int line, 32 | int charPositionInLine, 33 | String msg, 34 | RecognitionException e) { 35 | 36 | CommonToken t = new CommonToken(new Pair(((Lexer)recognizer), ((Lexer)recognizer)._input), 37 | Token.INVALID_TYPE, 38 | Token.DEFAULT_CHANNEL, 39 | ((Lexer)recognizer)._tokenStartCharIndex, 40 | ((Lexer)recognizer)._tokenStartCharIndex); 41 | t.setLine(((Lexer)recognizer)._tokenStartLine); 42 | t.setCharPositionInLine(((Lexer)recognizer)._tokenStartCharPositionInLine); 43 | ((Lexer)recognizer).setType(Token.MIN_USER_TOKEN_TYPE); 44 | ((Lexer)recognizer).emit(t); 45 | } 46 | } 47 | 48 | public static void main(String[] args) { 49 | try { 50 | ExtendedTargetLexer lexer = new ExtendedTargetLexer(CharStreams.fromStream(System.in)); 51 | lexer.addErrorListener(new ExtendedErrorListener()); 52 | CommonTokenStream tokens = new CommonTokenStream(lexer); 53 | Extended$parser_class parser = new Extended$parser_class(tokens); 54 | ExtendedTargetListener listener = new ExtendedTargetListener(parser); 55 | 56 | parser.addParseListener(listener); 57 | Extended$parser_class.class.getMethod(args[0]).invoke(parser); 58 | parser.syntaxErrorWarning(); 59 | 60 | try (XsonStreamWriter w = new XsonStreamWriter(System.out)) { 61 | w.write(null, listener.root); 62 | } 63 | } catch(Exception e) { 64 | e.printStackTrace(System.err); 65 | System.exit(1); 66 | } 67 | } 68 | 69 | private static interface XsonObject { 70 | public void writeXsonMembers(XsonStreamWriter w) throws XMLStreamException; 71 | } 72 | 73 | /** 74 | * XsonStreamWriter is a partial implementation for writing JSONx documents. 75 | * It only implements the minimum required to dump HDDNode objects. 
76 | */ 77 | private static class XsonStreamWriter implements AutoCloseable { 78 | public static final String JSONX_PREFIX = "json"; 79 | public static final String JSONX_NS_URI = "http://www.ibm.com/xmlns/prod/2009/jsonx"; 80 | 81 | private XMLStreamWriter w; 82 | 83 | public XsonStreamWriter(OutputStream o) throws XMLStreamException { 84 | XMLOutputFactory factory = XMLOutputFactory.newInstance(); 85 | factory.setProperty(XMLOutputFactory.IS_REPAIRING_NAMESPACES, true); 86 | w = factory.createXMLStreamWriter(o); 87 | w.setPrefix(JSONX_PREFIX, JSONX_NS_URI); 88 | } 89 | 90 | public void write(String name, XsonObject value) throws XMLStreamException { 91 | w.writeStartElement(JSONX_PREFIX, "object", JSONX_NS_URI); 92 | if (name != null) 93 | w.writeAttribute("name", name); 94 | value.writeXsonMembers(this); 95 | w.writeEndElement(); 96 | } 97 | 98 | public void write(String name, Iterable value) throws XMLStreamException { 99 | w.writeStartElement(JSONX_PREFIX, "array", JSONX_NS_URI); 100 | if (name != null) 101 | w.writeAttribute("name", name); 102 | for (XsonObject o : value) 103 | write(null, o); 104 | w.writeEndElement(); 105 | } 106 | 107 | public void write(String name, int value) throws XMLStreamException { 108 | w.writeStartElement(JSONX_PREFIX, "number", JSONX_NS_URI); 109 | if (name != null) 110 | w.writeAttribute("name", name); 111 | w.writeCharacters(Integer.toString(value)); 112 | w.writeEndElement(); 113 | } 114 | 115 | public void write(String name, String value) throws XMLStreamException { 116 | w.writeStartElement(JSONX_PREFIX, "string", JSONX_NS_URI); 117 | if (name != null) 118 | w.writeAttribute("name", name); 119 | w.writeCharacters(value); 120 | w.writeEndElement(); 121 | } 122 | 123 | public void close() throws XMLStreamException { 124 | w.close(); 125 | } 126 | } 127 | 128 | /** 129 | * ExtendedTargetLexer is a subclass of the original lexer implementation. 130 | * It can recognize skipped tokens and instead of eliminating them from the parser 131 | * they can be redirected to the dedicated PICIRENY_CHANNEL for later use. 132 | */ 133 | private static class ExtendedTargetLexer extends $lexer_class { 134 | 135 | public static final int PICIRENY_CHANNEL = -3; 136 | 137 | public ExtendedTargetLexer(CharStream input) { 138 | super(input); 139 | } 140 | 141 | // Skipped tokens cannot be accessed from the parser but we still need them to 142 | // unparse test cases correctly. Sending these tokens to a dedicated channel won't 143 | // alter the parse but makes these tokens available. 144 | @Override 145 | public void skip() { 146 | _channel = PICIRENY_CHANNEL; 147 | } 148 | } 149 | 150 | /** 151 | * ExtendedTargetListener is a subclass of the original listener implementation. 152 | * It can trigger state changes that are needed to identify parts of the input 153 | * that are not needed to keep it syntactically correct. 154 | */ 155 | private static class ExtendedTargetListener extends $listener_class { 156 | 157 | private HDDRule current_node; 158 | private Parser parser; 159 | private HDDRule root; 160 | private boolean seen_terminal; 161 | 162 | private static class Position implements XsonObject { 163 | public int line; 164 | public int column; 165 | 166 | public Position(int _line, int _column) { 167 | line = _line; 168 | column = _column; 169 | } 170 | 171 | public Position after(String text) { 172 | int line_breaks = countLineBreaks(text); 173 | return new Position(line + line_breaks, 174 | line_breaks == 0 ? 
column + text.length() : text.length() - text.lastIndexOf('\n') - 1); 175 | } 176 | 177 | private static int countLineBreaks(String text) { 178 | int count = 0; 179 | int fromIndex = 0; 180 | while (true) { 181 | int index = text.indexOf('\n', fromIndex); 182 | if (index < 0) 183 | return count; 184 | count++; 185 | fromIndex = index + 1; 186 | } 187 | } 188 | 189 | public void writeXsonMembers(XsonStreamWriter w) throws XMLStreamException { 190 | w.write("line", line); 191 | w.write("column", column); 192 | } 193 | } 194 | 195 | private static abstract class HDDNode implements XsonObject { 196 | public String name; 197 | public HDDRule parent; 198 | public Position start; 199 | public Position end; 200 | 201 | public HDDNode(String _name) { 202 | name = _name; 203 | parent = null; 204 | start = null; 205 | end = null; 206 | } 207 | 208 | public void writeXsonMembers(XsonStreamWriter w) throws XMLStreamException { 209 | w.write("type", getClass().getSimpleName()); 210 | if (name != null) 211 | w.write("name", name); 212 | if (start != null) 213 | w.write("start", start); 214 | if (end != null) 215 | w.write("end", end); 216 | } 217 | } 218 | 219 | private static class HDDRule extends HDDNode { 220 | public ArrayList children; 221 | public boolean recursive_rule; 222 | 223 | public HDDRule(String _name) { 224 | super(_name); 225 | children = new ArrayList(); 226 | recursive_rule = false; 227 | } 228 | 229 | public void addChild(HDDNode node) { 230 | children.add(node); 231 | node.parent = this; 232 | } 233 | 234 | public void writeXsonMembers(XsonStreamWriter w) throws XMLStreamException { 235 | super.writeXsonMembers(w); 236 | w.write("children", children); 237 | } 238 | } 239 | 240 | private static class HDDToken extends HDDNode { 241 | public String text; 242 | 243 | public HDDToken(String _name, String _text, Position _start, Position _end) { 244 | super(_name); 245 | text = _text; 246 | start = _start; 247 | end = _end; 248 | } 249 | 250 | public void writeXsonMembers(XsonStreamWriter w) throws XMLStreamException { 251 | super.writeXsonMembers(w); 252 | w.write("text", text); 253 | } 254 | } 255 | 256 | private static class HDDQuantifier extends HDDRule { 257 | public HDDQuantifier() { 258 | super(null); 259 | } 260 | } 261 | 262 | private static class HDDHiddenToken extends HDDToken { 263 | public HDDHiddenToken(String _name, String _text, Position _start, Position _end) { 264 | super(_name, _text, _start, _end); 265 | } 266 | } 267 | 268 | private static class HDDErrorToken extends HDDToken { 269 | public HDDErrorToken(String _text, Position _start, Position _end) { 270 | super(null, _text, _start, _end); 271 | } 272 | } 273 | 274 | public ExtendedTargetListener(Parser _parser) { 275 | parser = _parser; 276 | current_node = null; 277 | root = null; 278 | seen_terminal = false; 279 | } 280 | 281 | public void recursion_enter() { 282 | assert current_node instanceof HDDRule; 283 | HDDRule node = new HDDRule(current_node.name); 284 | 285 | current_node.addChild(node); 286 | current_node.recursive_rule = true; 287 | current_node = node; 288 | } 289 | 290 | public void recursion_push() { 291 | assert current_node.parent.children.size() > 0; 292 | HDDNode first_child = current_node.parent.children.get(0); 293 | current_node.parent.children.remove(first_child); 294 | current_node.addChild(first_child); 295 | } 296 | 297 | public void recursion_unroll() { 298 | assert current_node.recursive_rule; 299 | assert current_node.children.size() == 1 && 
current_node.name.equals(current_node.children.get(0).name); 300 | ArrayList children_to_lift = ((HDDRule)current_node.children.get(0)).children; 301 | HDDRule parent = current_node.parent; 302 | if (children_to_lift.size() > 0) { 303 | current_node.children = children_to_lift; 304 | } else { 305 | parent.children.remove(current_node); 306 | } 307 | current_node = parent; 308 | } 309 | 310 | public void enterEveryRule(ParserRuleContext ctx) { 311 | HDDRule node = new HDDRule(parser.getRuleNames()[ctx.getRuleIndex()]); 312 | 313 | if (root == null) { 314 | root = node; 315 | } else { 316 | assert current_node != null; 317 | current_node.addChild(node); 318 | } 319 | current_node = node; 320 | } 321 | 322 | public void exitEveryRule(ParserRuleContext ctx) { 323 | // If the input contains syntax error, then the last optional block might not have been closed. 324 | while (current_node instanceof HDDQuantifier) 325 | exit_optional(); 326 | 327 | assert current_node.name.equals(parser.getRuleNames()[ctx.getRuleIndex()]) : current_node.name + " (" + current_node.toString() + ") != " + parser.getRuleNames()[ctx.getRuleIndex()]; 328 | 329 | if (current_node.parent != null) 330 | current_node = current_node.parent; 331 | } 332 | 333 | private Position[] tokenBoundaries(Token token) { 334 | Position start = new Position(token.getLine(), token.getCharPositionInLine()); 335 | return new Position[] {start, start.after(token.getText())}; 336 | } 337 | 338 | private void addToken(TerminalNode node, HDDToken child) { 339 | if (!seen_terminal) { 340 | List hiddenTokens = ((BufferedTokenStream)parser.getTokenStream()).getHiddenTokensToLeft(node.getSymbol().getTokenIndex(), -1); 341 | if (hiddenTokens != null) { 342 | for (Token token : hiddenTokens) { 343 | Position[] boundaries = tokenBoundaries(token); 344 | current_node.addChild(new HDDHiddenToken(parser.getTokenNames()[token.getType()], token.getText(), boundaries[0], boundaries[1])); 345 | } 346 | } 347 | } 348 | seen_terminal = true; 349 | 350 | current_node.addChild(child); 351 | 352 | List hiddenTokens = ((BufferedTokenStream)parser.getTokenStream()).getHiddenTokensToRight(node.getSymbol().getTokenIndex(), -1); 353 | if (hiddenTokens != null) { 354 | for (Token token : hiddenTokens) { 355 | Position[] boundaries = tokenBoundaries(token); 356 | current_node.addChild(new HDDHiddenToken(parser.getTokenNames()[token.getType()], token.getText(), boundaries[0], boundaries[1])); 357 | } 358 | } 359 | } 360 | 361 | public void visitTerminal(TerminalNode node) { 362 | Token token = node.getSymbol(); 363 | Position[] boundaries = tokenBoundaries(token); 364 | addToken(node, token.getType() != Token.EOF 365 | ? 
new HDDToken(parser.getTokenNames()[token.getType()], token.getText(), boundaries[0], boundaries[1]) 366 | : new HDDToken("EOF", "", boundaries[0], boundaries[1])); 367 | } 368 | 369 | public void visitErrorNode(ErrorNode node) { 370 | Token token = node.getSymbol(); 371 | if (token != null) { 372 | Position[] boundaries = tokenBoundaries(token); 373 | addToken(node, new HDDErrorToken(node.getText(), boundaries[0], boundaries[1])); 374 | } 375 | } 376 | 377 | public void enter_optional() { 378 | HDDQuantifier quant_node = new HDDQuantifier(); 379 | current_node.addChild(quant_node); 380 | current_node = quant_node; 381 | } 382 | 383 | public void exit_optional() { 384 | assert current_node.parent != null : "Quantifier node has no parent."; 385 | assert current_node.children.size() > 0 : "Quantifier node has no children."; 386 | 387 | current_node = current_node.parent; 388 | } 389 | } 390 | 391 | public Extended$parser_class(TokenStream input) { 392 | super(input); 393 | } 394 | 395 | public void enter_optional() { 396 | trigger_listener("enter_optional"); 397 | } 398 | 399 | public void exit_optional() { 400 | trigger_listener("exit_optional"); 401 | } 402 | 403 | public void enterRecursionRule(ParserRuleContext localctx, int state, int ruleIndex, int precedence) { 404 | super.enterRecursionRule(localctx, state, ruleIndex, precedence); 405 | trigger_listener("recursion_enter"); 406 | } 407 | 408 | public void enterRecursionRule(ParserRuleContext localctx, int ruleIndex) { 409 | super.enterRecursionRule(localctx, ruleIndex); 410 | trigger_listener("recursion_enter"); 411 | } 412 | 413 | public void pushNewRecursionContext(ParserRuleContext localctx, int state, int ruleIndex) { 414 | super.pushNewRecursionContext(localctx, state, ruleIndex); 415 | trigger_listener("recursion_push"); 416 | } 417 | 418 | public void unrollRecursionContexts(ParserRuleContext _parentctx) { 419 | super.unrollRecursionContexts(_parentctx); 420 | trigger_listener("recursion_unroll"); 421 | } 422 | 423 | private void trigger_listener(String event) { 424 | for (ParseTreeListener listener : getParseListeners()) { 425 | try { 426 | ExtendedTargetListener.class.getMethod(event).invoke(listener); 427 | } catch (Exception e) { 428 | System.err.println(e); 429 | } 430 | } 431 | } 432 | 433 | private void syntaxErrorWarning() { 434 | if (getNumberOfSyntaxErrors() > 0) 435 | System.err.println("$parser_class finished with " + getNumberOfSyntaxErrors() + " syntax errors. This may decrease quality."); 436 | } 437 | } 438 | -------------------------------------------------------------------------------- /src/picireny/antlr4/resources/LexBasic.g4: -------------------------------------------------------------------------------- 1 | /* 2 | * [The "BSD license"] 3 | * Copyright (c) 2014-2015 Gerald Rosenberg 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions 8 | * are met: 9 | * 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 3. The name of the author may not be used to endorse or promote products 16 | * derived from this software without specific prior written permission. 
17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 19 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 20 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 21 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 22 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 23 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 27 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | /** 30 | * A generally reusable set of fragments for import in to Lexer grammars. 31 | * 32 | * Modified 2015.06.16 gbr - 33 | * -- generalized for inclusion into the ANTLRv4 grammar distribution 34 | * 35 | */ 36 | lexer grammar LexBasic; 37 | // ====================================================== 38 | // Lexer fragments 39 | // 40 | // ----------------------------------- 41 | // Whitespace & Comments 42 | 43 | fragment Ws 44 | : Hws 45 | | Vws 46 | ; 47 | 48 | fragment Hws 49 | : [ \t] 50 | ; 51 | 52 | fragment Vws 53 | : [\r\n\f] 54 | ; 55 | 56 | fragment BlockComment 57 | : '/*' .*? ('*/' | EOF) 58 | ; 59 | 60 | fragment DocComment 61 | : '/**' .*? ('*/' | EOF) 62 | ; 63 | 64 | fragment LineComment 65 | : '//' ~ [\r\n]* 66 | ; 67 | // ----------------------------------- 68 | // Escapes 69 | // Any kind of escaped character that we can embed within ANTLR literal strings. 70 | 71 | fragment EscSeq 72 | : Esc ([btnfr"'\\] | UnicodeEsc | . | EOF) 73 | ; 74 | 75 | fragment EscAny 76 | : Esc . 77 | ; 78 | 79 | fragment UnicodeEsc 80 | : 'u' (HexDigit (HexDigit (HexDigit HexDigit?)?)?)? 81 | ; 82 | // ----------------------------------- 83 | // Numerals 84 | 85 | fragment DecimalNumeral 86 | : '0' 87 | | [1-9] DecDigit* 88 | ; 89 | // ----------------------------------- 90 | // Digits 91 | 92 | fragment HexDigit 93 | : [0-9a-fA-F] 94 | ; 95 | 96 | fragment DecDigit 97 | : [0-9] 98 | ; 99 | // ----------------------------------- 100 | // Literals 101 | 102 | fragment BoolLiteral 103 | : 'true' 104 | | 'false' 105 | ; 106 | 107 | fragment CharLiteral 108 | : SQuote (EscSeq | ~ ['\r\n\\]) SQuote 109 | ; 110 | 111 | fragment SQuoteLiteral 112 | : SQuote (EscSeq | ~ ['\r\n\\])* SQuote 113 | ; 114 | 115 | fragment DQuoteLiteral 116 | : DQuote (EscSeq | ~ ["\r\n\\])* DQuote 117 | ; 118 | 119 | fragment USQuoteLiteral 120 | : SQuote (EscSeq | ~ ['\r\n\\])* 121 | ; 122 | // ----------------------------------- 123 | // Character ranges 124 | 125 | fragment NameChar 126 | : NameStartChar 127 | | '0' .. '9' 128 | | Underscore 129 | | '\u00B7' 130 | | '\u0300' .. '\u036F' 131 | | '\u203F' .. '\u2040' 132 | ; 133 | 134 | fragment NameStartChar 135 | : 'A' .. 'Z' 136 | | 'a' .. 'z' 137 | | '\u00C0' .. '\u00D6' 138 | | '\u00D8' .. '\u00F6' 139 | | '\u00F8' .. '\u02FF' 140 | | '\u0370' .. '\u037D' 141 | | '\u037F' .. '\u1FFF' 142 | | '\u200C' .. '\u200D' 143 | | '\u2070' .. '\u218F' 144 | | '\u2C00' .. '\u2FEF' 145 | | '\u3001' .. '\uD7FF' 146 | | '\uF900' .. '\uFDCF' 147 | | '\uFDF0' .. 
'\uFFFD' 148 | ; 149 | // ignores | ['\u10000-'\uEFFFF] ; 150 | // ----------------------------------- 151 | // Types 152 | 153 | fragment Int 154 | : 'int' 155 | ; 156 | // ----------------------------------- 157 | // Symbols 158 | 159 | fragment Esc 160 | : '\\' 161 | ; 162 | 163 | fragment Colon 164 | : ':' 165 | ; 166 | 167 | fragment DColon 168 | : '::' 169 | ; 170 | 171 | fragment SQuote 172 | : '\'' 173 | ; 174 | 175 | fragment DQuote 176 | : '"' 177 | ; 178 | 179 | fragment LParen 180 | : '(' 181 | ; 182 | 183 | fragment RParen 184 | : ')' 185 | ; 186 | 187 | fragment LBrace 188 | : '{' 189 | ; 190 | 191 | fragment RBrace 192 | : '}' 193 | ; 194 | 195 | fragment LBrack 196 | : '[' 197 | ; 198 | 199 | fragment RBrack 200 | : ']' 201 | ; 202 | 203 | fragment RArrow 204 | : '->' 205 | ; 206 | 207 | fragment Lt 208 | : '<' 209 | ; 210 | 211 | fragment Gt 212 | : '>' 213 | ; 214 | 215 | fragment Equal 216 | : '=' 217 | ; 218 | 219 | fragment Question 220 | : '?' 221 | ; 222 | 223 | fragment Star 224 | : '*' 225 | ; 226 | 227 | fragment Plus 228 | : '+' 229 | ; 230 | 231 | fragment PlusAssign 232 | : '+=' 233 | ; 234 | 235 | fragment Underscore 236 | : '_' 237 | ; 238 | 239 | fragment Pipe 240 | : '|' 241 | ; 242 | 243 | fragment Dollar 244 | : '$' 245 | ; 246 | 247 | fragment Comma 248 | : ',' 249 | ; 250 | 251 | fragment Semi 252 | : ';' 253 | ; 254 | 255 | fragment Dot 256 | : '.' 257 | ; 258 | 259 | fragment Range 260 | : '..' 261 | ; 262 | 263 | fragment At 264 | : '@' 265 | ; 266 | 267 | fragment Pound 268 | : '#' 269 | ; 270 | 271 | fragment Tilde 272 | : '~' 273 | ; 274 | 275 | -------------------------------------------------------------------------------- /src/picireny/cli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-2024 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 7 | 8 | import json 9 | 10 | from argparse import ArgumentParser 11 | from importlib import metadata 12 | from os.path import abspath, dirname, exists, join, realpath 13 | from shutil import rmtree 14 | 15 | import antlerinator 16 | import inators 17 | import picire 18 | 19 | from inators import log as logging 20 | 21 | from . import filter, hdd, hddr, hoist, info, prune, transform 22 | 23 | logger = logging.getLogger('picireny') 24 | __version__ = metadata.version(__package__) 25 | 26 | 27 | args_hdd_choices = { 28 | 'hdd': hdd.hddmin, 29 | 'hddr': hddr.hddrmin, 30 | } 31 | 32 | 33 | args_phase_choices = { 34 | 'prune': {'transformations': [prune.prune]}, 35 | 'coarse-prune': {'transformations': [prune.prune], 'config_filter': filter.coarse_filter}, 36 | 'hoist': {'transformations': [hoist.hoist]}, 37 | 'prune+hoist': {'transformations': [prune.prune, hoist.hoist]}, 38 | 'coarse-prune+hoist': {'transformations': [prune.prune, hoist.hoist], 'config_filter': filter.coarse_filter} 39 | } 40 | 41 | 42 | def process_antlr4_args(args): 43 | antlerinator.process_antlr_argument(args) 44 | args.antlr = realpath(args.antlr) 45 | 46 | def load_format_config(data): 47 | # Interpret relative grammar paths compared to the directory of the config file. 
48 | if 'files' in data: 49 | for i, fn in enumerate(data['files']): 50 | path = join(abspath(dirname(args.format)), fn) 51 | if not exists(path): 52 | raise ValueError(f'Invalid input format definition: {path}, defined in the format config, does not exist.') 53 | data['files'][i] = path 54 | data['islands'] = data.get('islands', {}) 55 | data['replacements'] = data.get('replacements', {}) 56 | return data 57 | 58 | args.input_format = {} 59 | 60 | if args.format: 61 | if not exists(args.format): 62 | raise ValueError(f'Invalid input format definition: {args.format} does not exist.') 63 | 64 | with open(args.format, 'r') as f: 65 | try: 66 | input_description = json.load(f, object_hook=load_format_config) 67 | args.input_format = input_description['grammars'] 68 | if not args.start: 69 | args.start = input_description.get('start', None) 70 | except ValueError as e: 71 | raise ValueError(f'Invalid input format definition: The content of {args.format} is not a valid JSON object.') from e 72 | 73 | if not args.start: 74 | raise ValueError('Invalid input format definition: No start has been defined.') 75 | 76 | if args.grammar or args.replacements: 77 | # Initialize the default grammar that doesn't need to be named. 78 | if '' not in args.input_format: 79 | args.input_format[''] = {'files': [], 'replacements': {}, 'islands': {}} 80 | 81 | if args.grammar: 82 | for i, g in enumerate(args.grammar): 83 | args.input_format['']['files'].append(realpath(g)) 84 | if not exists(args.input_format['']['files'][i]): 85 | raise ValueError(f'Invalid input format definition: {args.input_format[""]["files"][i]} does not exist.') 86 | 87 | if args.replacements: 88 | if not exists(args.replacements): 89 | raise ValueError(f'Invalid input format definition: {args.replacements} does not exist.') 90 | 91 | try: 92 | with open(args.replacements, 'r') as f: 93 | args.input_format['']['replacements'] = json.load(f) 94 | except ValueError as e: 95 | raise ValueError(f'Invalid input format definition: The content of {args.replacements} is not a valid JSON object.') from e 96 | 97 | 98 | def process_srcml_args(args): 99 | if not args.srcml_language: 100 | raise ValueError('The following argument is required for srcML: --srcml:language') 101 | 102 | 103 | def process_args(args): 104 | inators.arg.process_log_level_argument(args, logger) 105 | inators.arg.process_sys_recursion_limit_argument(args) 106 | 107 | args.hddmin = args_hdd_choices[args.hdd] 108 | args.hdd_phase_configs = [args_phase_choices[phase] for phase in (args.phase or ['prune'])] 109 | 110 | if args.builder == 'antlr4': 111 | process_antlr4_args(args) 112 | elif args.builder == 'srcml': 113 | process_srcml_args(args) 114 | 115 | picire.cli.process_args(args) 116 | 117 | 118 | def log_tree(title, hdd_tree): 119 | if logger.isEnabledFor(logging.DEBUG): 120 | logger.debug('%s\n\theight: %s\n\tshape: %s\n\tnodes: %s\n', 121 | title, 122 | info.height(hdd_tree), 123 | ', '.join(str(cnt) for cnt in info.shape(hdd_tree)), 124 | ', '.join(f'{cnt} {ty}' for ty, cnt in sorted(info.count(hdd_tree).items()))) 125 | logger.trace('%r', hdd_tree) 126 | 127 | 128 | def build_with_antlr4(src, *, 129 | input_format, start, 130 | antlr, lang='python', 131 | build_hidden_tokens=False, 132 | work_dir): 133 | """ 134 | Execute ANTLRv4-based tree building part of picireny as if invoked from 135 | command line, however, control its behaviour not via command line arguments 136 | but function parameters. 137 | 138 | :param src: Contents of the test case to reduce. 
139 | :param input_format: Dictionary describing the input format. 140 | :param start: Name of the start rule in [grammarname:]rulename format. 141 | :param antlr: Path to the ANTLR4 tool (Java jar binary). 142 | :param lang: The target language of the parser. 143 | :param build_hidden_tokens: Build hidden tokens of the input format into the 144 | HDD tree. 145 | :param work_dir: Path to a working directory. 146 | :return: The built HDD tree. 147 | """ 148 | # Get the parameters in a dictionary so that they can be pretty-printed 149 | args = locals().copy() 150 | del args['src'] 151 | picire.cli.log_args('Building tree with ANTLRv4', args) 152 | 153 | from .antlr4 import create_hdd_tree 154 | return create_hdd_tree(src, 155 | input_format=input_format, start=start, 156 | antlr=antlr, lang=lang, 157 | hidden_tokens=build_hidden_tokens, 158 | work_dir=work_dir) 159 | 160 | 161 | def build_with_srcml(src, *, language): 162 | """ 163 | Execute srcML-based tree building part of picireny as if invoked from 164 | command line, however, control its behaviour not via command line arguments 165 | but function parameters. 166 | 167 | :param src: Contents of the test case to reduce. 168 | :param language: Language of the input source (C, C++, C#, or Java). 169 | :return: The built HDD tree. 170 | """ 171 | # Get the parameters in a dictionary so that they can be pretty-printed 172 | args = locals().copy() 173 | del args['src'] 174 | picire.cli.log_args('Building tree with srcML', args) 175 | 176 | from .srcml import create_hdd_tree 177 | return create_hdd_tree(src, language=language) 178 | 179 | 180 | def reduce(hdd_tree, *, 181 | hddmin, reduce_class, reduce_config, tester_class, tester_config, 182 | cache_class=None, unparse_with_whitespace=True, 183 | hdd_phase_configs=({},), hdd_star=True, 184 | flatten_recursion=False, squeeze_tree=True, skip_unremovable=True, skip_whitespace=False): 185 | """ 186 | Execute tree reduction part of picireny as if invoked from command line, 187 | however, control its behaviour not via command line arguments but function 188 | parameters. 189 | 190 | :param hdd_tree: HDD tree to reduce. 191 | :param hddmin: Function implementing a HDD minimization algorithm. 192 | :param reduce_class: Reference to the reducer class. 193 | :param reduce_config: Dictionary containing information to initialize the 194 | reduce_class. 195 | :param tester_class: Reference to a runnable class that can decide about the 196 | interestingness of a test case. 197 | :param tester_config: Dictionary containing information to initialize the 198 | tester_class. 199 | :param cache_class: Reference to the cache class to use. 200 | :param unparse_with_whitespace: Unparse by adding whitespace between 201 | nonadjacent nodes. 202 | :param hdd_phase_configs: Sequence of dictionaries containing information to 203 | parametrize the hddmin function. 204 | :param hdd_star: Boolean to enable the HDD star algorithm. 205 | :param flatten_recursion: Boolean to enable flattening left/right-recursive 206 | trees. 207 | :param squeeze_tree: Boolean to enable the tree squeezing optimization. 208 | :param skip_unremovable: Boolean to enable hiding unremovable nodes from 209 | ddmin. 210 | :param skip_whitespace: Boolean to enable hiding whitespace-only tokens from 211 | ddmin. 212 | :return: The reduced HDD tree. 
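Example (an illustrative sketch only, not part of the original documentation): a typical programmatic call of this function, assuming an already built `hdd_tree`; `MyTester` is a made-up name standing for any picire-compatible tester class, `picire.DD` is the basic reducer from the picire module, and the empty config dictionaries assume the defaults of the chosen classes are sufficient:

    import picire
    import picireny.hdd

    reduced_tree = reduce(hdd_tree,
                          hddmin=picireny.hdd.hddmin,
                          reduce_class=picire.DD, reduce_config={},
                          tester_class=MyTester, tester_config={})
    print(reduced_tree.unparse())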
213 | """ 214 | # Get the parameters in a dictionary so that they can be pretty-printed 215 | args = locals().copy() 216 | del args['hdd_tree'] 217 | picire.cli.log_args('Reduce session starts', args) 218 | 219 | log_tree('Initial tree', hdd_tree) 220 | 221 | # Perform tree transformations. 222 | if flatten_recursion: 223 | hdd_tree = transform.flatten_recursion(hdd_tree) 224 | log_tree('Tree after recursion flattening', hdd_tree) 225 | 226 | if squeeze_tree: 227 | hdd_tree = transform.squeeze_tree(hdd_tree) 228 | log_tree('Tree after squeezing', hdd_tree) 229 | 230 | if skip_unremovable: 231 | hdd_tree = transform.skip_unremovable(hdd_tree, unparse_with_whitespace=unparse_with_whitespace) 232 | log_tree('Tree after skipping unremovable nodes', hdd_tree) 233 | 234 | if skip_whitespace: 235 | hdd_tree = transform.skip_whitespace(hdd_tree) 236 | log_tree('Tree after skipping whitespace tokens', hdd_tree) 237 | 238 | # Perform reduction. 239 | for phase_cnt, phase_config in enumerate(hdd_phase_configs): 240 | logger.info('Phase #%d', phase_cnt) 241 | hdd_tree = hddmin(hdd_tree, 242 | reduce_class=reduce_class, reduce_config=reduce_config, 243 | tester_class=tester_class, tester_config=tester_config, 244 | id_prefix=(f'p{phase_cnt}',), 245 | cache=cache_class() if cache_class else None, 246 | unparse_with_whitespace=unparse_with_whitespace, 247 | hdd_star=hdd_star, 248 | **phase_config) 249 | log_tree(f'Tree after reduction phase #{phase_cnt}', hdd_tree) 250 | 251 | return hdd_tree 252 | 253 | 254 | def execute(): 255 | """ 256 | The main entry point of picireny. 257 | """ 258 | logging.basicConfig(format='%(message)s') 259 | 260 | arg_parser = ArgumentParser(description='CLI for the Picireny Hierarchical Delta Debugging Framework', 261 | parents=[picire.cli.create_parser()], add_help=False) 262 | 263 | # General HDD settings. 264 | arg_parser.add_argument('--builder', metavar='NAME', choices=['antlr4', 'srcml'], default='antlr4', 265 | help='tool to build tree representation from input (%(choices)s; default: %(default)s)') 266 | arg_parser.add_argument('--hdd', metavar='NAME', choices=args_hdd_choices.keys(), default='hdd', 267 | help='HDD variant to run (%(choices)s; default: %(default)s)') 268 | arg_parser.add_argument('--phase', metavar='NAME', choices=args_phase_choices.keys(), action='append', 269 | help='parametrization of the HDD variant to run (%(choices)s; default: prune) ' 270 | '(may be specified multiple times to run different parametrizations in sequence)') 271 | arg_parser.add_argument('--no-hdd-star', dest='hdd_star', default=True, action='store_false', 272 | help='run the hddmin algorithm only once') 273 | arg_parser.add_argument('--flatten-recursion', default=False, action='store_true', 274 | help='flatten recurring blocks of left/right-recursive rules') 275 | arg_parser.add_argument('--no-squeeze-tree', dest='squeeze_tree', default=True, action='store_false', 276 | help='don\'t squeeze rule chains in tree representation') 277 | arg_parser.add_argument('--no-skip-unremovable', dest='skip_unremovable', default=True, action='store_false', 278 | help='don\'t hide unremovable nodes from the ddmin algorithm') 279 | arg_parser.add_argument('--skip-whitespace', dest='skip_whitespace', default=False, action='store_true', 280 | help='hide whitespace tokens from the ddmin algorithm') 281 | inators.arg.add_sys_recursion_limit_argument(arg_parser) 282 | inators.arg.add_version_argument(arg_parser, version=__version__) 283 | 284 | # ANTLRv4-specific settings. 
285 | antlr4_grp = arg_parser.add_argument_group('ANTLRv4-specific arguments') 286 | antlr4_grp.add_argument('-s', '--start', '--antlr4:start', metavar='NAME', 287 | help='name of the start rule in [grammarname:]rulename format (default for ' 288 | 'the optional grammarname is the empty string)') 289 | antlr4_grp.add_argument('-g', '--grammar', '--antlr4:grammar', metavar='FILE', nargs='+', 290 | help='grammar file(s) describing the input format (these grammars will be ' 291 | 'associated with the empty grammar name, see `--start`)') 292 | antlr4_grp.add_argument('-r', '--replacements', '--antlr4:replacements', metavar='FILE', 293 | help='JSON file defining the default replacements for lexer and parser ' 294 | 'rules of the grammar with the empty name (usually defined via `--grammar`)') 295 | antlr4_grp.add_argument('--format', '--antlr4:format', metavar='FILE', 296 | help='JSON file describing a (possibly complex) input format') 297 | antlr4_grp.add_argument('--build-hidden-tokens', '--antlr4:build-hidden-tokens', default=False, action='store_true', 298 | help='build hidden tokens of the grammar(s) into the HDD tree') 299 | antlerinator.add_antlr_argument(antlr4_grp, long_alias='--antlr4:antlr') 300 | antlr4_grp.add_argument('--parser', '--antlr4:parser', metavar='LANG', default='python', choices=['python', 'java'], 301 | help='language of the generated parsers (%(choices)s; default: %(default)s) ' 302 | '(using Java might gain performance, but needs JDK)') 303 | 304 | # srcML-specific settings. 305 | srcml_grp = arg_parser.add_argument_group('srcML-specific arguments') 306 | srcml_grp.add_argument('--srcml:language', dest='srcml_language', metavar='LANG', choices=['C', 'C++', 'C#', 'Java'], 307 | help='language of the input (%(choices)s; default: %(default)s)') 308 | 309 | args = arg_parser.parse_args() 310 | 311 | try: 312 | process_args(args) 313 | except ValueError as e: 314 | arg_parser.error(e) 315 | 316 | if args.builder == 'antlr4': 317 | work_dir = join(args.out, 'grammar') 318 | hdd_tree = build_with_antlr4(args.src, 319 | input_format=args.input_format, start=args.start, 320 | antlr=args.antlr, lang=args.parser, 321 | build_hidden_tokens=args.build_hidden_tokens, 322 | work_dir=work_dir) 323 | unparse_with_whitespace = not args.build_hidden_tokens 324 | if args.cleanup: 325 | rmtree(work_dir) 326 | elif args.builder == 'srcml': 327 | hdd_tree = build_with_srcml(args.src, language=args.srcml_language) 328 | unparse_with_whitespace = False 329 | else: 330 | assert False, f'Unknown builder: {args.builder}' 331 | 332 | hdd_tree = reduce(hdd_tree, 333 | hddmin=args.hddmin, 334 | reduce_class=args.reduce_class, reduce_config=args.reduce_config, 335 | tester_class=args.tester_class, tester_config=args.tester_config, 336 | cache_class=args.cache, unparse_with_whitespace=unparse_with_whitespace, 337 | hdd_phase_configs=args.hdd_phase_configs, hdd_star=args.hdd_star, 338 | flatten_recursion=args.flatten_recursion, 339 | squeeze_tree=args.squeeze_tree, 340 | skip_unremovable=args.skip_unremovable, 341 | skip_whitespace=args.skip_whitespace) 342 | out_src = hdd_tree.unparse(with_whitespace=unparse_with_whitespace) 343 | 344 | picire.cli.postprocess(args, out_src) 345 | -------------------------------------------------------------------------------- /src/picireny/filter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 
5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 7 | 8 | 9 | def coarse_filter(node): 10 | """ 11 | Config filter to keep nodes with empty replacements only, which is the core 12 | of the coarse hierarchical delta debugging reduce algorithm. 13 | """ 14 | return node.replace == '' 15 | -------------------------------------------------------------------------------- /src/picireny/hdd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2007 Ghassan Misherghi. 2 | # Copyright (c) 2016-2023 Renata Hodovan, Akos Kiss. 3 | # Copyright (c) 2021 Daniel Vince 4 | # 5 | # Licensed under the BSD 3-Clause License 6 | # . 7 | # This file may not be copied, modified, or distributed except 8 | # according to those terms. 9 | 10 | import itertools 11 | import logging 12 | 13 | from .info import height 14 | from .prune import prune 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def hddmin(hdd_tree, *, 20 | reduce_class, reduce_config, tester_class, tester_config, 21 | id_prefix=(), cache=None, unparse_with_whitespace=True, 22 | config_filter=None, transformations=(prune,), hdd_star=True): 23 | """ 24 | Run the hierarchical delta debugging reduce algorithm. 25 | 26 | :param hdd_tree: The root of the tree that the reduce will work with (it's 27 | the output of create_hdd_tree). 28 | :param reduce_class: Reference to the reducer class (DD, ParallelDD or 29 | CombinedParallelDD from the picire module). 30 | :param reduce_config: Dictionary containing the parameters of the 31 | reduce_class init function. 32 | :param tester_class: Reference to a callable class that can decide about the 33 | interestingness of a test case. 34 | :param tester_config: Dictionary containing the parameters of the tester 35 | class init function (except test_builder). 36 | :param id_prefix: Tuple to prepend to config IDs during tests. 37 | :param cache: Cache to use. 38 | :param unparse_with_whitespace: Build test case by adding whitespace between 39 | nonadjacent tree nodes during unparsing. 40 | :param config_filter: Filter function from node to boolean, to allow running 41 | hddmin selectively. 42 | :param transformations: Iterable of transformations that reduce a 43 | configuration of nodes. 44 | :param hdd_star: Boolean to enable the HDD star algorithm. 45 | :return: The reduced test case (1-tree-minimal if hdd_star is True and 46 | config_filter is None). 47 | """ 48 | 49 | def collect_level_nodes(level): 50 | def _collect_level_nodes(node, current_level): 51 | if node.state != node.KEEP: 52 | return 53 | if current_level == level: 54 | level_nodes.append(node) 55 | elif hasattr(node, 'children'): 56 | for child in node.children: 57 | _collect_level_nodes(child, current_level + 1) 58 | level_nodes = [] # Using `list` (not `set`) for the sake of stability. 
59 | _collect_level_nodes(hdd_tree, 0) 60 | return level_nodes 61 | 62 | for iter_cnt in itertools.count(): 63 | logger.info('Iteration #%d', iter_cnt) 64 | 65 | changed = False 66 | for level in itertools.count(): 67 | level_nodes = collect_level_nodes(level) 68 | if not level_nodes: 69 | break 70 | 71 | if config_filter: 72 | level_nodes = list(filter(config_filter, level_nodes)) 73 | if not level_nodes: 74 | continue 75 | 76 | if logger.isEnabledFor(logging.INFO): 77 | logger.info('Checking level %d / %d ...', level, height(hdd_tree)) 78 | 79 | for trans_cnt, transformation in enumerate(transformations): 80 | hdd_tree, transformed = transformation(hdd_tree, level_nodes, 81 | reduce_class=reduce_class, reduce_config=reduce_config, 82 | tester_class=tester_class, tester_config=tester_config, 83 | id_prefix=id_prefix + (f'i{iter_cnt}', f'l{level}', f't{trans_cnt}'), 84 | cache=cache, 85 | unparse_with_whitespace=unparse_with_whitespace) 86 | 87 | changed = changed or transformed 88 | 89 | if not hdd_star or not changed: 90 | break 91 | 92 | return hdd_tree 93 | -------------------------------------------------------------------------------- /src/picireny/hdd_tree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2007 Ghassan Misherghi. 2 | # Copyright (c) 2016-2023 Renata Hodovan, Akos Kiss. 3 | # 4 | # Licensed under the BSD 3-Clause License 5 | # . 6 | # This file may not be copied, modified, or distributed except 7 | # according to those terms. 8 | 9 | from itertools import count 10 | from os import linesep 11 | from textwrap import indent 12 | 13 | 14 | class Position: 15 | """ 16 | Class defining a position in the input file. Used to recognise line breaks 17 | between tokens. 18 | """ 19 | def __init__(self, line=1, column=0): 20 | """ 21 | Initialize position object. 22 | 23 | :param line: Line number in the input (starts with 1). 24 | :param column: Character index relative to the beginning of the line 25 | (starts with 0). 26 | 27 | Note: The numbering of lines (1-based) and columns (0-based) follows 28 | ANTLR v4. 29 | """ 30 | self.line = line 31 | self.column = column 32 | 33 | def after(self, text): 34 | """ 35 | Calculate the end position of a text starting at the current position. 36 | """ 37 | line_breaks = text.count('\n') 38 | return Position(self.line + line_breaks, 39 | self.column + len(text) if not line_breaks else len(text) - text.rfind('\n') - 1) 40 | 41 | def shift(self, start): 42 | """ 43 | Shift the position by prepending a starting position. 44 | """ 45 | if self.line > 1: 46 | self.line += start.line - 1 47 | else: 48 | self.line = start.line 49 | self.column += start.column 50 | 51 | def __repr__(self): 52 | return f'{self.__class__.__name__}({self.line!r}, {self.column!r})' 53 | 54 | 55 | class HDDTree: 56 | # Node states for unparsing. 57 | REMOVED = 0 58 | KEEP = 1 59 | 60 | # ID generator 61 | __id = count() 62 | 63 | def __init__(self, name, *, start=None, end=None, replace=None): 64 | """ 65 | Initialize a HDD tree/node. 66 | 67 | :param name: The name of the node. 68 | :param start: Position object describing the start of the HDDTree node. 69 | :param end: Position object describing the end of the HDDTree node. 70 | :param replace: The minimal replacement string of the current node. 
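Example (an illustrative sketch; the names, texts and positions are made up): nodes are normally created by the tree builders, but a minimal tree can be assembled and unparsed by hand like this:

    root = HDDRule('value', replace='0')
    root.add_child(HDDToken('NUMBER', '42',
                            start=Position(1, 0), end=Position(1, 2),
                            replace='0'))
    assert root.unparse() == '42'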
71 | """ 72 | self.name = name 73 | self.replace = replace 74 | self.start = start 75 | self.end = end 76 | self.parent = None 77 | self.state = self.KEEP 78 | self.id = next(self.__id) 79 | 80 | def unparse(self, *, with_whitespace=True, transform=None): 81 | """ 82 | Build test case from a HDD tree. 83 | 84 | :param with_whitespace: Add whitespace (space, new line) to separate 85 | nonadjacent nodes. 86 | :param transform: A function applied to each node before unparsing, or 87 | None. 88 | :return: The unparsed test case. 89 | """ 90 | def _unparse(node): 91 | if transform: 92 | node = transform(node) 93 | 94 | if node.state != node.KEEP: 95 | return node.replace 96 | 97 | # Keep the text of the token. 98 | if isinstance(node, HDDToken): 99 | return node.text 100 | 101 | if not node.children: 102 | return '' 103 | 104 | # Concat the text of children. 105 | child_strs = [_unparse(child) for child in node.children] 106 | node_str = child_strs[0] 107 | for i in range(1, len(node.children)): 108 | # Do not add extra spaces if the next chunk is empty. 109 | if not child_strs[i]: 110 | continue 111 | if with_whitespace: 112 | if node.children[i].start.line > node.children[i - 1].end.line: 113 | node_str += linesep 114 | elif node.children[i].start.column > node.children[i - 1].end.column: 115 | node_str += ' ' 116 | node_str += child_strs[i] 117 | 118 | return node_str 119 | 120 | return _unparse(self) 121 | 122 | def replace_with(self, other): 123 | """ 124 | Replace the current node with `other` in the HDD tree. 125 | 126 | :param other: Node to replace the current with. 127 | """ 128 | self.parent.children[self.parent.children.index(self)] = other 129 | other.parent = self.parent 130 | 131 | 132 | class HDDToken(HDDTree): 133 | def __init__(self, name, text, *, start=None, end=None, replace=None): 134 | super().__init__(name, start=start, end=end, replace=replace) 135 | self.text = text 136 | 137 | def __repr__(self): 138 | parts = [ 139 | f'name={self.name!r}', 140 | f'text={self.text!r}', 141 | ] 142 | if self.replace is not None: 143 | parts.append(f'replace={self.replace!r}') 144 | if self.start is not None: 145 | parts.append(f'start={self.start!r}') 146 | if self.end is not None: 147 | parts.append(f'end={self.end!r}') 148 | parts.append(f'id={self.id!r}') 149 | if self.state != self.KEEP: 150 | parts.append(f'state={self.state!r}') 151 | 152 | return f'{self.__class__.__name__}({", ".join(parts)})' 153 | 154 | 155 | class HDDRule(HDDTree): 156 | def __init__(self, name, *, start=None, end=None, replace=None): 157 | super().__init__(name, start=start, end=end, replace=replace) 158 | self.children = [] 159 | 160 | def add_child(self, child): 161 | self.children.append(child) 162 | child.parent = self 163 | 164 | def add_children(self, children): 165 | for child in children: 166 | self.add_child(child) 167 | 168 | def remove_child(self, child): 169 | self.children.remove(child) 170 | 171 | def __repr__(self): 172 | parts = [ 173 | f'name={self.name!r}', 174 | ] 175 | if self.replace is not None: 176 | parts.append(f'replace={self.replace!r}') 177 | if self.start is not None: 178 | parts.append(f'start={self.start!r}') 179 | if self.end is not None: 180 | parts.append(f'end={self.end!r}') 181 | parts.append(f'id={self.id!r}') 182 | if self.state != self.KEEP: 183 | parts.append(f'state={self.state!r}') 184 | if self.state == self.KEEP and self.children: 185 | parts.append('children=[\n%s\n]' % indent(',\n'.join(repr(child) for child in self.children), ' ')) 186 | 187 | return 
f'{self.__class__.__name__}({", ".join(parts)})' 188 | -------------------------------------------------------------------------------- /src/picireny/hddr.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-2022 Renata Hodovan, Akos Kiss. 2 | # Copyright (c) 2021 Daniel Vince 3 | # 4 | # Licensed under the BSD 3-Clause License 5 | # . 6 | # This file may not be copied, modified, or distributed except 7 | # according to those terms. 8 | 9 | import itertools 10 | import logging 11 | 12 | from .prune import prune 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def hddrmin(hdd_tree, *, 18 | reduce_class, reduce_config, tester_class, tester_config, 19 | id_prefix=(), cache=None, unparse_with_whitespace=True, 20 | config_filter=None, transformations=(prune,), hdd_star=True, 21 | pop_first=False, append_reversed=False): 22 | """ 23 | Run the recursive variant of the hierarchical delta debugging reduce 24 | algorithm (a.k.a. HDDr). 25 | 26 | The tree traversal implementation is actually not recursive but an iterative 27 | queue-based reformulation of HDDr. How tree nodes are popped from the queue 28 | during the iteration (whether from the beginning or from the end of the 29 | queue) and how the children of a visited node are appended to the queue 30 | (whether they are added in forward or reverse order) give rise to different 31 | variants of HDDr: 32 | 33 | - 'pop first' with 'forward append' gives the classic breadth-first 34 | traversal, 35 | - 'pop first' with 'reverse append' gives syntactically reversed 36 | breadth-first traversal, 37 | - 'pop last' with 'reverse append' gives the classic depth-first 38 | traversal, 39 | - 'pop last' with 'forward append' gives syntactically reversed 40 | depth-first traversal. 41 | 42 | :param hdd_tree: The root of the tree that the reduce will work with (it's 43 | the output of create_hdd_tree). 44 | :param reduce_class: Reference to the reducer class (DD, ParallelDD or 45 | CombinedParallelDD from the picire module). 46 | :param reduce_config: Dictionary containing the parameters of the 47 | reduce_class init function. 48 | :param tester_class: Reference to a callable class that can decide about the 49 | interestingness of a test case. 50 | :param tester_config: Dictionary containing the parameters of the tester 51 | class init function (except test_builder). 52 | :param id_prefix: Tuple to prepend to config IDs during tests. 53 | :param cache: Cache to use. 54 | :param unparse_with_whitespace: Build test case by adding whitespace between 55 | nonadjacent tree nodes during unparsing. 56 | :param config_filter: Filter function from node to boolean, to allow running 57 | hddmin selectively. 58 | :param transformations: Iterable of transformations that reduce a 59 | configuration of nodes. 60 | :param hdd_star: Boolean to enable the HDD star algorithm. 61 | :param pop_first: Boolean to control tree traversal (see above for details). 62 | :param append_reversed: Boolean to control tree traversal (see above for 63 | details). 64 | :return: The reduced test case (1-tree-minimal if hdd_star is True and 65 | config_filter is None).
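A tiny sketch of the queue handling described above (illustrative only, using plain strings instead of HDD nodes):

    pop_first, append_reversed = True, False   # classic breadth-first setup
    queue, children = ['root'], ['a', 'b', 'c']
    node = queue.pop(0) if pop_first else queue.pop()
    queue.extend(reversed(children) if append_reversed else children)
    # queue is now ['a', 'b', 'c']; with append_reversed=True it would be
    # ['c', 'b', 'a'], and with pop_first=False nodes would be taken from
    # the end of the queue instead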
66 | """ 67 | 68 | for iter_cnt in itertools.count(): 69 | logger.info('Iteration #%d', iter_cnt) 70 | 71 | changed = False 72 | queue = [hdd_tree] 73 | for node_cnt in itertools.count(): 74 | if not queue: 75 | break 76 | if pop_first: 77 | queue, node = queue[1:], queue[0] 78 | else: 79 | queue, node = queue[:-1], queue[-1] 80 | if not hasattr(node, 'children') or node.state != node.KEEP: 81 | continue 82 | 83 | children = [child for child in node.children if child.state == child.KEEP] 84 | if config_filter: 85 | children = list(filter(config_filter, children)) 86 | 87 | if children: 88 | logger.info('Checking node #%d ...', node_cnt) 89 | 90 | for trans_cnt, transformation in enumerate(transformations): 91 | hdd_tree, transformed = transformation(hdd_tree, children, 92 | reduce_class=reduce_class, reduce_config=reduce_config, 93 | tester_class=tester_class, tester_config=tester_config, 94 | id_prefix=id_prefix + (f'i{iter_cnt}', f'n{node_cnt}', f't{trans_cnt}'), 95 | cache=cache, 96 | unparse_with_whitespace=unparse_with_whitespace) 97 | 98 | changed = changed or transformed 99 | 100 | for child in node.children if not append_reversed else reversed(node.children): 101 | if child.state == child.KEEP: 102 | queue.append(child) 103 | 104 | if not hdd_star or not changed: 105 | break 106 | 107 | return hdd_tree 108 | -------------------------------------------------------------------------------- /src/picireny/hoist.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2023 Renata Hodovan, Akos Kiss. 2 | # Copyright (c) 2021 Daniel Vince. 3 | # 4 | # Licensed under the BSD 3-Clause License 5 | # . 6 | # This file may not be copied, modified, or distributed except 7 | # according to those terms. 8 | 9 | import itertools 10 | import logging 11 | 12 | from picire import AbstractDD, Outcome 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class HoistingTestBuilder: 18 | 19 | def __init__(self, tree, *, with_whitespace=True): 20 | """ 21 | Initialize the test builder. 22 | 23 | :param tree: Tree representing the current test case. 24 | :param with_whitespace: Unparse by adding whitespace between nonadjacent 25 | nodes. 26 | """ 27 | self.tree = tree 28 | self.with_whitespace = with_whitespace 29 | 30 | def __call__(self, mapping_config): 31 | """ 32 | :param mapping_config: A list of mappings of initial configuration 33 | elements to new ones. 34 | :return: The unparsed test case with the mappings applied. 35 | """ 36 | def map(node): 37 | return mapping.get(node, node) 38 | 39 | mapping = dict(mapping_config) 40 | return self.tree.unparse(with_whitespace=self.with_whitespace, transform=map) 41 | 42 | 43 | class MappingMin(AbstractDD): 44 | 45 | def __init__(self, test, *, cache=None, id_prefix=None): 46 | """ 47 | :param test: A callable tester object. 48 | :param cache: Cache object to use. 49 | :param id_prefix: Tuple to prepend to config IDs during tests. 50 | """ 51 | 52 | super().__init__(test=test, split=None, cache=cache, id_prefix=id_prefix) 53 | 54 | def __call__(self, config): 55 | """ 56 | Compute a mapping of the initial configuration to another (usually 57 | smaller) but still failing configuration. 58 | 59 | :param config: The initial configuration that will be reduced. 60 | :return: A mapping of initial configuration elements to new ones. 
61 | """ 62 | 63 | def collect_hoistables(node): 64 | def _collect_hoistables(desc): 65 | if desc.name == node.name: 66 | hoistables.append(desc) 67 | return 68 | if hasattr(desc, 'children') and desc.state == desc.KEEP: 69 | for child in desc.children: 70 | _collect_hoistables(child) 71 | 72 | hoistables = [] 73 | if hasattr(node, 'children') and node.state == node.KEEP and node.name: 74 | for child in node.children: 75 | _collect_hoistables(child) 76 | return hoistables 77 | 78 | mapping = {} 79 | 80 | for run in itertools.count(): 81 | logger.info('Run #%d', run) 82 | logger.info('\tMapping size: %d', len(mapping)) 83 | if logger.isEnabledFor(logging.DEBUG): 84 | logger.debug('\tMapping: %r', {c.id: m.id for c, m in mapping.items()}) 85 | 86 | for i, (c, m) in enumerate((c, m) for c in config for m in collect_hoistables(mapping.get(c, c))): 87 | new_mapping = mapping.copy() 88 | new_mapping[c] = m 89 | mapping_config = list(new_mapping.items()) 90 | config_id = (f'r{run}', f'm{i}') 91 | 92 | outcome = self._lookup_cache(mapping_config, config_id) or self._test_config(mapping_config, config_id) 93 | 94 | if outcome is Outcome.FAIL: 95 | mapping = new_mapping 96 | logger.info('\tHoisted') 97 | break 98 | else: 99 | break 100 | 101 | logger.info('\tDone') 102 | return mapping 103 | 104 | 105 | def hoist(hdd_tree, config_nodes, *, 106 | reduce_class=None, reduce_config=None, tester_class, tester_config, 107 | id_prefix, cache, unparse_with_whitespace): 108 | """ 109 | Try hoisting subtrees. 110 | 111 | :param hdd_tree: The root of the tree that the reduce will work with. 112 | :param config_nodes: Nodes from one level collected by the HDD algorithm. 113 | :param reduce_class: Unused, present for being compatible with 'prune' 114 | transformation. 115 | :param reduce_config: Unused, present for being compatible with 'prune' 116 | transformation. 117 | :param tester_class: Reference to a callable class that can decide about the 118 | interestingness of a test case. 119 | :param tester_config: Dictionary containing the parameters of the tester 120 | class init function (except test_builder). 121 | :param id_prefix: Tuple to prepend to config IDs during tests. 122 | :param cache: Cache to use. 123 | :param unparse_with_whitespace: Build test case by adding whitespace between 124 | nonadjacent tree nodes during unparsing. 125 | :return: The reduced tree and a boolean value that shows whether the tree 126 | has changed during hoisting. 127 | """ 128 | 129 | if not config_nodes: 130 | return hdd_tree, False 131 | 132 | test_builder = HoistingTestBuilder(hdd_tree, with_whitespace=unparse_with_whitespace) 133 | if cache: 134 | cache.clear() 135 | cache.set_test_builder(test_builder) 136 | 137 | test = tester_class(test_builder=test_builder, **tester_config) 138 | mapping_min = MappingMin(test, cache=cache, id_prefix=id_prefix) 139 | mapping = mapping_min(config_nodes) 140 | 141 | def _apply_mapping(node): 142 | node = mapping.get(node, node) 143 | if hasattr(node, 'children'): 144 | for i, child in enumerate(node.children): 145 | node.children[i].replace_with(_apply_mapping(child)) 146 | return node 147 | hdd_tree = _apply_mapping(hdd_tree) 148 | 149 | return hdd_tree, bool(mapping) 150 | -------------------------------------------------------------------------------- /src/picireny/info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-2021 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 
5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 7 | 8 | from .hdd_tree import HDDRule 9 | 10 | 11 | def count(node, *, removed=False): 12 | """ 13 | Count nodes in the tree by type. 14 | 15 | Note: If `removed` is `True`, removed tokens and rules are also counted (but 16 | sub-trees of removed rules are not). 17 | 18 | :param node: The root of the tree to do the counting for. 19 | :return: A dictionary of counts indexed by node type name. 20 | """ 21 | def _count(node): 22 | if node.state != node.KEEP and not removed: 23 | return 24 | 25 | ty = node.__class__.__name__ 26 | if ty not in stats: 27 | stats[ty] = 0 28 | stats[ty] += 1 29 | 30 | if isinstance(node, HDDRule) and node.state == node.KEEP: 31 | for child in node.children: 32 | _count(child) 33 | 34 | stats = {} 35 | _count(node) 36 | return stats 37 | 38 | 39 | def height(node, *, removed=False): 40 | """ 41 | Calculate the height of the tree. 42 | 43 | Note: If `removed` is `True`, removed tokens and rules are also counted (but 44 | sub-trees of removed rules are not). 45 | 46 | :param node: The root of the tree to do the calculation for. 47 | :return: The height of the tree. 48 | """ 49 | if node.state != node.KEEP and not removed: 50 | return 0 51 | 52 | return 1 + (max((height(child) for child in node.children), default=0) 53 | if isinstance(node, HDDRule) and node.state == node.KEEP else 0) 54 | 55 | 56 | def shape(node, *, removed=False): 57 | """ 58 | Calculate the shape of the tree, i.e., the number of nodes on each tree 59 | level. 60 | 61 | Note: If `removed` is `True`, removed tokens and rules are also counted (but 62 | sub-trees of removed rules are not). 63 | 64 | :param node: The root of the tree to do the calculation for. 65 | :return: A list of level sizes. 66 | """ 67 | def _shape(node, level): 68 | if node.state != node.KEEP and not removed: 69 | return 70 | 71 | if len(sizes) <= level: 72 | sizes.extend([0] * (level - len(sizes) + 1)) 73 | sizes[level] += 1 74 | 75 | if isinstance(node, HDDRule) and node.state == node.KEEP: 76 | for child in node.children: 77 | _shape(child, level + 1) 78 | 79 | sizes = [] 80 | _shape(node, 0) 81 | return sizes 82 | -------------------------------------------------------------------------------- /src/picireny/prune.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2023 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 7 | 8 | import logging 9 | 10 | from copy import copy 11 | 12 | from picire import AbstractDD, Outcome 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class PruningTestBuilder: 18 | 19 | def __init__(self, tree, ids, *, with_whitespace=True): 20 | """ 21 | Initialize the test builder. 22 | 23 | :param tree: Tree representing the current test case. 24 | :param ids: The IDs of nodes that can change status. 25 | :param with_whitespace: Unparse by adding whitespace between nonadjacent 26 | nodes. 27 | """ 28 | self.tree = tree 29 | self.ids = ids 30 | self.with_whitespace = with_whitespace 31 | 32 | def __call__(self, config): 33 | """ 34 | :param config: List of IDs of nodes that will be kept in the next test 35 | case. 36 | :return: The unparsed test case containing only the units defined in 37 | config. 
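For instance (an illustrative sketch; the tree and the node IDs are made up):

    builder = PruningTestBuilder(tree, {4, 7})
    smaller_test = builder([4])
    # node 7 is unparsed via its minimal replacement, while node 4 and all
    # nodes outside the given IDs are kept as they are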
38 | """ 39 | def removed(node): 40 | if node.id in self.ids and node.id not in config: 41 | removed_node = copy(node) 42 | removed_node.state = removed_node.REMOVED 43 | return removed_node 44 | return node 45 | 46 | config = set(config) 47 | return self.tree.unparse(with_whitespace=self.with_whitespace, transform=removed) 48 | 49 | 50 | class EmptyDD(AbstractDD): 51 | """ 52 | Special DD variant that *does* test the empty configuration (and nothing 53 | else). 54 | """ 55 | 56 | def __init__(self, test, *, cache=None, id_prefix=None): 57 | """ 58 | Initialize an EmptyDD object. 59 | 60 | :param test: A callable tester object. 61 | :param cache: Cache object to use. 62 | :param id_prefix: Tuple to prepend to config IDs during tests. 63 | """ 64 | super().__init__(test=test, split=None, cache=cache, id_prefix=id_prefix) 65 | 66 | def __call__(self, config): 67 | """ 68 | Return a 1-minimal failing subset of the initial configuration, and also 69 | test the empty configuration while doing so. 70 | 71 | Note: The initial configuration is expected to be of size 1, thus the 72 | 1-minimal failing subset is always its trivial subset: either itself or 73 | the empty configuration. 74 | 75 | :param config: The initial configuration that will be reduced. 76 | :return: 1-minimal failing configuration. 77 | """ 78 | assert len(config) == 1 79 | # assert self._test_config(config, ('assert',)) == self.FAIL 80 | 81 | empty = [] 82 | config_id = ('empty',) 83 | 84 | logger.info('Run #empty') 85 | logger.info('\tConfig size: %d', len(config)) 86 | logger.debug('\tConfig: %r', config) 87 | 88 | outcome = self._lookup_cache(empty, config_id) or self._test_config(empty, config_id) 89 | if outcome is Outcome.FAIL: 90 | config = empty 91 | logger.info('\tReduced') 92 | 93 | logger.info('\tDone') 94 | return config 95 | 96 | 97 | def prune(hdd_tree, config_nodes, *, 98 | reduce_class, reduce_config, tester_class, tester_config, 99 | id_prefix, cache, unparse_with_whitespace): 100 | """ 101 | Pruning-based reduction of a set of nodes (i.e., sub-trees), as used by 102 | various hierarchical delta debugging algorithm variants. 103 | 104 | :param hdd_tree: The root of the tree. 105 | :param config_nodes: The list of nodes to reduce. 106 | :param reduce_class: Reference to the reducer class (DD, ParallelDD or 107 | CombinedParallelDD from the picire module). 108 | :param reduce_config: Dictionary containing the parameters of the 109 | reduce_class init function. 110 | :param tester_class: Reference to a callable class that can decide about the 111 | interestingness of a test case. 112 | :param tester_config: Dictionary containing the parameters of the tester 113 | class init function (except test_builder). 114 | :param id_prefix: Tuple to prepend to config IDs during tests. 115 | :param cache: Cache to use. 116 | :param unparse_with_whitespace: Build test case by adding whitespace between 117 | nonadjacent tree nodes during unparsing. 
118 | :return: Tuple: (root of the tree, bool whether the tree changed) 119 | """ 120 | 121 | config_ids = [node.id for node in config_nodes] 122 | config_ids_set = set(config_ids) 123 | 124 | test_builder = PruningTestBuilder(hdd_tree, config_ids_set, with_whitespace=unparse_with_whitespace) 125 | if cache: 126 | cache.clear() 127 | cache.set_test_builder(test_builder) 128 | 129 | test = tester_class(test_builder=test_builder, **tester_config) 130 | dd = reduce_class(test, cache=cache, id_prefix=id_prefix, **reduce_config) 131 | c = dd(config_ids) 132 | if len(c) == 1: 133 | dd = EmptyDD(test, cache=cache, id_prefix=id_prefix) 134 | c = dd(c) 135 | c = set(c) 136 | 137 | def _set_state(node): 138 | if node.id in config_ids_set: 139 | node.state = node.KEEP if node.id in c else node.REMOVED 140 | elif hasattr(node, 'children') and node.state == node.KEEP: 141 | for child in node.children: 142 | _set_state(child) 143 | _set_state(hdd_tree) 144 | 145 | return hdd_tree, len(c) < len(config_ids_set) 146 | -------------------------------------------------------------------------------- /src/picireny/srcml/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-2020 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 7 | 8 | from .hdd_tree_builder import create_hdd_tree 9 | -------------------------------------------------------------------------------- /src/picireny/srcml/hdd_tree_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-2022 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 7 | 8 | import logging 9 | import xml.etree.ElementTree as ET 10 | 11 | from subprocess import CalledProcessError, PIPE, run 12 | 13 | from ..hdd_tree import HDDRule, HDDToken, Position 14 | from ..transform import remove_empty_nodes 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | def build_hdd_tree(element, start): 21 | name = element.tag 22 | name = name.replace('{http://www.srcML.org/srcML/src}', 'src:') 23 | name = name.replace('{http://www.srcML.org/srcML/cpp}', 'cpp:') 24 | name = name.replace('{http://www.srcML.org/srcML/position}', 'pos:') 25 | 26 | rule = HDDRule(name, start=start, end=start, replace='') 27 | result = [rule] 28 | 29 | if element.text: 30 | end = start.after(element.text) 31 | rule.add_child(HDDToken(f'{name}@text', element.text, start=start, end=end, replace=element.text)) 32 | rule.end = end 33 | 34 | for child in list(element): 35 | if child.tag.startswith('{http://www.srcML.org/srcML/position}'): 36 | continue 37 | for node in build_hdd_tree(child, rule.end): 38 | rule.add_child(node) 39 | rule.end = rule.children[-1].end 40 | 41 | if element.tail: 42 | result += [HDDToken(f'{name}@tail', element.tail, start=rule.end, end=rule.end.after(element.tail), replace=element.tail)] 43 | 44 | return result 45 | 46 | 47 | def create_hdd_tree(src, *, language): 48 | """ 49 | Build a tree that the HDD algorithm can work with. 50 | 51 | :param src: Input source to srcML. 52 | :param language: Language of the input source (C, C++, C#, or Java). 53 | :return: The root of the created HDD tree. 
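A minimal usage sketch (assuming the srcml command-line tool is installed and on the PATH; the source is passed as bytes because the subprocess below is invoked without text mode):

    tree = create_hdd_tree(b'int main() { return 0; }', language='C')
    print(tree.unparse(with_whitespace=False))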
54 | """ 55 | 56 | try: 57 | stdout = run(('srcml', f'--language={language}'), 58 | input=src, stdout=PIPE, stderr=PIPE, check=True).stdout 59 | except CalledProcessError as e: 60 | logger.error('Parsing with srcml failed!\n%s\n%s\n', e.stdout, e.stderr) 61 | raise 62 | 63 | root = ET.fromstring(stdout) 64 | 65 | tree_result = build_hdd_tree(root, Position()) 66 | assert len(tree_result) == 1 67 | tree = tree_result[0] 68 | 69 | tree = remove_empty_nodes(tree) 70 | return tree 71 | -------------------------------------------------------------------------------- /src/picireny/transform.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2021 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 7 | 8 | from .hdd_tree import HDDRule, HDDToken 9 | 10 | 11 | def remove_empty_nodes(node): 12 | """ 13 | Delete those nodes from the HDD tree that don't contribute to the output at 14 | all (tokens with empty text, e.g., the EOF token; and rules with no 15 | children, e.g., lambda rules). 16 | 17 | :param node: The root of the tree to be transformed. 18 | :return: The root of the transformed tree. 19 | """ 20 | if isinstance(node, HDDRule): 21 | non_empty_children = [] 22 | 23 | for child in node.children: 24 | if isinstance(child, HDDToken): 25 | # empty token is usually the EOF only (but interestingly, it may 26 | # appear multiple times in the tree) 27 | if child.text != '': 28 | non_empty_children.append(child) 29 | else: 30 | assert isinstance(child, HDDRule) 31 | remove_empty_nodes(child) 32 | 33 | # a grammar may contain lambda rules (with nothing on the 34 | # right-hand side, or with an empty alternative), or rules that 35 | # produce EOF only (which is removed in the branch above) 36 | if child.children: 37 | non_empty_children.append(child) 38 | 39 | node.children[:] = non_empty_children 40 | 41 | return node 42 | 43 | 44 | def flatten_recursion(node): 45 | """ 46 | Heuristics to flatten left or right-recursion. E.g., given a rule 47 | rule : a | rule b 48 | and a HDD tree built with it from an input, rewrite the resulting HDD tree 49 | as if it was built using 50 | rule : a b* 51 | This allows HDD to potentially completely remove the recurring blocks 52 | (instead of replacing them with their minimal replacement, which is usually 53 | not ""). 54 | 55 | :param node: The root of the tree to be transformed. 56 | :return: The root of the transformed tree. 57 | """ 58 | if isinstance(node, HDDRule) and node.state == node.KEEP: 59 | for child in node.children: 60 | flatten_recursion(child) 61 | 62 | if len(node.children) > 1 and node.name: 63 | if node.children[0].name == node.name: 64 | left = node.children[0] 65 | 66 | right = HDDRule('', replace='', start=node.children[1].start, end=node.children[-1].end) 67 | right.add_children(node.children[1:]) 68 | del node.children[:] 69 | 70 | node.add_children(left.children) 71 | node.add_child(right) 72 | 73 | elif node.children[-1].name == node.name: 74 | right = node.children[-1] 75 | 76 | left = HDDRule('', replace='', start=node.children[0].start, end=node.children[-2].end) 77 | left.add_children(node.children[0:-1]) 78 | del node.children[:] 79 | 80 | node.add_child(left) 81 | node.add_children(right.children) 82 | 83 | # This only seems to happen if there was some error during parsing. 
84 | # In this case a weird 1-step chain gets inserted into the left/right- 85 | # recursive tree, which prevents flattening. But we cannot postpone the 86 | # merging of this 1-step chain to squeeze_tree because flatten_recursion 87 | # is usually not called again afterwards. So, do a degenerate "rotation" 88 | # (i.e., simple lifting) here. 89 | if len(node.children) == 1 and node.name: 90 | if node.children[0].name == node.name: 91 | child = node.children[0] 92 | del node.children[:] 93 | node.add_children(child.children) 94 | 95 | return node 96 | 97 | 98 | def squeeze_tree(node): 99 | """ 100 | Compress single line chains in the HDD tree whose minimal replacements are 101 | the same and hence they would result in redundant checks during the 102 | minimization. 103 | 104 | :param node: The root of the tree to be transformed. 105 | :return: The root of the transformed tree. 106 | """ 107 | if isinstance(node, HDDRule): 108 | for i, child in enumerate(node.children): 109 | squeezed_child = squeeze_tree(child) 110 | if child != squeezed_child: 111 | node.children[i].replace_with(squeezed_child) 112 | 113 | if len(node.children) == 1 and node.children[0].replace == node.replace: 114 | return node.children[0] 115 | 116 | return node 117 | 118 | 119 | def skip_unremovable(node, *, unparse_with_whitespace=True): 120 | """ 121 | Mark those nodes as removed whose unparsing (e.g., for tokens, their text) 122 | is the same as their minimal replacement, thus hiding them from 123 | hddmin, because they just cause extra test runs but cannot reduce the input. 124 | 125 | :param node: The root of the tree to be transformed. 126 | :return: The root of the transformed tree. 127 | """ 128 | if isinstance(node, HDDRule): 129 | for child in node.children: 130 | skip_unremovable(child, unparse_with_whitespace=unparse_with_whitespace) 131 | 132 | if node.unparse(with_whitespace=unparse_with_whitespace) == node.replace: 133 | node.state = node.REMOVED 134 | 135 | return node 136 | 137 | 138 | def skip_whitespace(node): 139 | """ 140 | Mark tokens with whitespace-only text as removed. Useful when hidden-channel 141 | tokens are built into the tree to let hddmin deal with 142 | hidden-but-non-whitespace tokens only. 143 | 144 | :param node: The root of the tree to be transformed. 145 | :return: The root of the transformed tree. 146 | """ 147 | if isinstance(node, HDDRule): 148 | for child in node.children: 149 | skip_whitespace(child) 150 | else: 151 | assert isinstance(node, HDDToken) 152 | if node.text.isspace(): 153 | node.state = node.REMOVED 154 | 155 | return node 156 | -------------------------------------------------------------------------------- /tests/resources/INILexer.g4: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2021 Renata Hodovan, Akos Kiss. 3 | * 4 | * Licensed under the BSD 3-Clause License 5 | * . 6 | * This file may not be copied, modified, or distributed except 7 | * according to those terms. 8 | */ 9 | 10 | lexer grammar INILexer; 11 | 12 | 13 | HEADER_OPEN 14 | : '[' -> pushMode(HEADER_MODE) 15 | ; 16 | 17 | KEY 18 | : KEY_START_CHAR ( KEY_CHAR_WS* KEY_CHAR )?
19 | ; 20 | 21 | fragment 22 | KEY_START_CHAR 23 | : ~[[:=\r\n;# \t] 24 | ; 25 | 26 | fragment 27 | KEY_CHAR 28 | : KEY_START_CHAR 29 | | '[' 30 | ; 31 | 32 | fragment 33 | KEY_CHAR_WS 34 | : KEY_CHAR 35 | | WS 36 | ; 37 | 38 | EQUALS 39 | : [:=] -> pushMode(VALUE_MODE) 40 | ; 41 | 42 | WS 43 | : [ \t]+ 44 | ; 45 | 46 | EOL 47 | : '\r\n' 48 | | '\r' 49 | | '\n' 50 | ; 51 | 52 | COMMENT 53 | : COMMENT_START_CHAR ~[\r\n]* 54 | ; 55 | 56 | fragment 57 | COMMENT_START_CHAR 58 | : [;#] 59 | ; 60 | 61 | 62 | mode HEADER_MODE; 63 | 64 | HEADER 65 | : HEADER_CHAR ( HEADER_CHAR_WS* HEADER_CHAR )? 66 | ; 67 | 68 | fragment 69 | HEADER_CHAR 70 | : ~[[\]\r\n;# \t] 71 | ; 72 | 73 | fragment 74 | HEADER_CHAR_WS 75 | : HEADER_CHAR 76 | | HEADER_WS 77 | ; 78 | 79 | HEADER_CLOSE 80 | : ']' -> popMode 81 | ; 82 | 83 | HEADER_WS 84 | : [ \t]+ 85 | ; 86 | 87 | 88 | mode VALUE_MODE; 89 | 90 | VALUE 91 | : VALUE_CHAR ( VALUE_CHAR_WS* VALUE_CHAR )? -> popMode 92 | ; 93 | 94 | fragment 95 | VALUE_CHAR 96 | : ~[\r\n\t;# ] 97 | ; 98 | 99 | fragment 100 | VALUE_CHAR_WS 101 | : VALUE_CHAR 102 | | VALUE_WS 103 | ; 104 | 105 | VALUE_WS 106 | : [ \t]+ 107 | ; 108 | -------------------------------------------------------------------------------- /tests/resources/INIParser.g4: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Renata Hodovan, Akos Kiss. 3 | * 4 | * Licensed under the BSD 3-Clause License 5 | * . 6 | * This file may not be copied, modified, or distributed except 7 | * according to those terms. 8 | */ 9 | 10 | parser grammar INIParser; 11 | 12 | options { tokenVocab=INILexer; } 13 | 14 | 15 | ini 16 | : comment* section* EOF 17 | ; 18 | 19 | comment 20 | : WS? COMMENT EOL 21 | ; 22 | 23 | section 24 | : header ( comment | line )* 25 | ; 26 | 27 | header 28 | : WS? HEADER_OPEN HEADER_WS? HEADER HEADER_WS? HEADER_CLOSE WS? EOL 29 | ; 30 | 31 | // Multiline values are not handled properly by this approach, the continuation 32 | // lines will be recognized as keys, probably with no value. 33 | line 34 | : WS? ( KEY WS? ( EQUALS VALUE_WS? ( VALUE WS? )? )? )? EOL 35 | ; 36 | -------------------------------------------------------------------------------- /tests/resources/JSON.g4: -------------------------------------------------------------------------------- 1 | 2 | /** Taken from "The Definitive ANTLR 4 Reference" by Terence Parr */ 3 | 4 | // Derived from http://json.org 5 | grammar JSON; 6 | 7 | json 8 | : value 9 | ; 10 | 11 | obj 12 | : '{' pair (',' pair)* '}' 13 | | '{' '}' 14 | ; 15 | 16 | pair 17 | : STRING ':' value 18 | ; 19 | 20 | array 21 | : '[' value (',' value)* ']' 22 | | '[' ']' 23 | ; 24 | 25 | value 26 | : STRING 27 | | NUMBER 28 | | obj 29 | | array 30 | | 'true' 31 | | 'false' 32 | | 'null' 33 | ; 34 | 35 | 36 | STRING 37 | : '"' (ESC | ~ ["\\])* '"' 38 | ; 39 | 40 | 41 | fragment ESC 42 | : '\\' (["\\/bfnrt] | UNICODE) 43 | ; 44 | 45 | 46 | fragment UNICODE 47 | : 'u' HEX HEX HEX HEX 48 | ; 49 | 50 | 51 | fragment HEX 52 | : [0-9a-fA-F] 53 | ; 54 | 55 | 56 | NUMBER 57 | : '-'? INT '.' [0-9] + EXP? | '-'? INT EXP | '-'? INT 58 | ; 59 | 60 | 61 | fragment INT 62 | : '0' | [1-9] [0-9]* 63 | ; 64 | 65 | // no leading zeros 66 | 67 | fragment EXP 68 | : [Ee] [+\-]? INT 69 | ; 70 | 71 | // \- since - means "range" inside [...] 
72 | 73 | WS 74 | : [ \t\n\r] + -> skip 75 | ; 76 | -------------------------------------------------------------------------------- /tests/resources/exp-obj-arr-87.json: -------------------------------------------------------------------------------- 1 | { 2 | "" : 0, 3 | "": [ 0, 87 ] 4 | } -------------------------------------------------------------------------------- /tests/resources/exp-obj-arr-bar.json: -------------------------------------------------------------------------------- 1 | { 2 | "": "bar" 3 | } -------------------------------------------------------------------------------- /tests/resources/exp-obj-arr-baz.json: -------------------------------------------------------------------------------- 1 | { 2 | "" : 0, 3 | "baz": 0 4 | } -------------------------------------------------------------------------------- /tests/resources/exp-obj-arr-foo.json: -------------------------------------------------------------------------------- 1 | { 2 | "foo": 0 3 | } -------------------------------------------------------------------------------- /tests/resources/exp-str-arr-87.ini: -------------------------------------------------------------------------------- 1 | [ a ] 2 | a:[ 0, 87 ] 3 | -------------------------------------------------------------------------------- /tests/resources/inijson-crlf.json: -------------------------------------------------------------------------------- 1 | { 2 | "start": "ini:ini", 3 | "grammars": { 4 | "ini": { 5 | "files": [ 6 | "INILexer.g4", "INIParser.g4" 7 | ], 8 | "islands": { 9 | "VALUE": "(?P.*)" 10 | }, 11 | "replacements": { 12 | "EOL": "\r\n", 13 | "HEADER": "a", 14 | "KEY": "a", 15 | "VALUE": "a" 16 | } 17 | }, 18 | "json": { 19 | "files": [ 20 | "JSON.g4" 21 | ] 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/resources/inijson.json: -------------------------------------------------------------------------------- 1 | { 2 | "start": "ini:ini", 3 | "grammars": { 4 | "ini": { 5 | "files": [ 6 | "INILexer.g4", "INIParser.g4" 7 | ], 8 | "islands": { 9 | "VALUE": "(?P.*)" 10 | }, 11 | "replacements": { 12 | "EOL": "\n", 13 | "HEADER": "a", 14 | "KEY": "a", 15 | "VALUE": "a" 16 | } 17 | }, 18 | "json": { 19 | "files": [ 20 | "JSON.g4" 21 | ] 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/resources/inp-obj-arr.json: -------------------------------------------------------------------------------- 1 | { 2 | "foo": "bar", 3 | "baz": [ 6, 7, 12, 31, 77, 87 ] 4 | } 5 | -------------------------------------------------------------------------------- /tests/resources/inp-str-arr.ini: -------------------------------------------------------------------------------- 1 | [test] 2 | foo: "bar" 3 | baz: [ 6, 7, 12, 31, 77, 87 ] 4 | -------------------------------------------------------------------------------- /tests/resources/sut-inijson-load.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import configparser 4 | import json 5 | import sys 6 | 7 | 8 | c = configparser.ConfigParser(allow_no_value=True) 9 | with open(sys.argv[1], 'r') as f: 10 | c.read_file(f) 11 | 12 | for s in c.sections(): 13 | for o in c.options(s): 14 | j = json.loads(c.get(s, o)) 15 | 16 | c.write(sys.stdout) 17 | -------------------------------------------------------------------------------- /tests/resources/sut-json-load.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import json 4 | import sys 5 | 6 | 7 | with open(sys.argv[1], 'r', encoding='utf-8') as f: 8 | j = json.load(f) 9 | 10 | print(f'{j!r}') 11 | -------------------------------------------------------------------------------- /tests/resources/test-inijson-str-arr-87.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | python %~f0\..\sut-inijson-load.py %1 | find "87" >NUL 2>&1 3 | -------------------------------------------------------------------------------- /tests/resources/test-inijson-str-arr-87.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | python $(dirname $0)/sut-inijson-load.py $1 | grep -q "87" 3 | -------------------------------------------------------------------------------- /tests/resources/test-json-obj-arr-87.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | python %~f0\..\sut-json-load.py %1 | find "87" >NUL 2>&1 3 | -------------------------------------------------------------------------------- /tests/resources/test-json-obj-arr-87.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | python $(dirname $0)/sut-json-load.py $1 | grep -q "87" 3 | -------------------------------------------------------------------------------- /tests/resources/test-json-obj-arr-bar.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | python %~f0\..\sut-json-load.py %1 | find "bar" >NUL 2>&1 3 | -------------------------------------------------------------------------------- /tests/resources/test-json-obj-arr-bar.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | python $(dirname $0)/sut-json-load.py $1 | grep -q "bar" 3 | -------------------------------------------------------------------------------- /tests/resources/test-json-obj-arr-baz.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | python %~f0\..\sut-json-load.py %1 | find "baz" >NUL 2>&1 3 | -------------------------------------------------------------------------------- /tests/resources/test-json-obj-arr-baz.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | python $(dirname $0)/sut-json-load.py $1 | grep -q "baz" 3 | -------------------------------------------------------------------------------- /tests/resources/test-json-obj-arr-foo.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | python %~f0\..\sut-json-load.py %1 | find "foo" >NUL 2>&1 3 | -------------------------------------------------------------------------------- /tests/resources/test-json-obj-arr-foo.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | python $(dirname $0)/sut-json-load.py $1 | grep -q "foo" 3 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-2023 Renata Hodovan, Akos Kiss. 2 | # 3 | # Licensed under the BSD 3-Clause License 4 | # . 5 | # This file may not be copied, modified, or distributed except 6 | # according to those terms. 
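# End-to-end tests for the picireny command line interface: each case below
# reduces an input from tests/resources with `python -m picireny`, drives the
# reduction with a platform-specific oracle script (.sh on POSIX, .bat on
# Windows), and compares the reduced output byte-for-byte against a stored
# expected file.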
7 | 8 | import os 9 | import subprocess 10 | import sys 11 | 12 | import pytest 13 | 14 | 15 | is_windows = sys.platform.startswith('win32') 16 | script_ext = '.bat' if is_windows else '.sh' 17 | 18 | tests_dir = os.path.dirname(os.path.abspath(__file__)) 19 | resources_dir = os.path.join(tests_dir, 'resources') 20 | antlr = os.getenv('ANTLR') 21 | 22 | 23 | @pytest.mark.parametrize('test, inp, exp, grammar, rule, input_format', [ 24 | ('test-json-obj-arr-foo', 'inp-obj-arr.json', 'exp-obj-arr-foo.json', 'JSON.g4', 'json', None), 25 | ('test-json-obj-arr-bar', 'inp-obj-arr.json', 'exp-obj-arr-bar.json', 'JSON.g4', 'json', None), 26 | ('test-json-obj-arr-baz', 'inp-obj-arr.json', 'exp-obj-arr-baz.json', 'JSON.g4', 'json', None), 27 | ('test-json-obj-arr-87', 'inp-obj-arr.json', 'exp-obj-arr-87.json', 'JSON.g4', 'json', None), 28 | ('test-inijson-str-arr-87', 'inp-str-arr.ini', 'exp-str-arr-87.ini', None, None, 'inijson-crlf.json' if is_windows else 'inijson.json'), 29 | ]) 30 | @pytest.mark.parametrize('args', [ 31 | ('--cache=config', ), 32 | ('--no-skip-unremovable', '--parser=java', '--cache=content', ), 33 | ('--no-squeeze-tree', '--parser=java', '--cache=none', ), 34 | ('--no-squeeze-tree', '--no-skip-unremovable', '--cache=config', ), 35 | ('--no-hdd-star', '--parser=java', '--cache=content', ), 36 | ('--no-hdd-star', '--no-skip-unremovable', '--cache=none', ), 37 | ('--no-hdd-star', '--no-squeeze-tree', '--cache=config', ), 38 | ('--no-hdd-star', '--no-squeeze-tree', '--no-skip-unremovable', '--parser=java', '--cache=content', ), 39 | ('--parallel', ), 40 | ]) 41 | def test_cli(test, inp, exp, grammar, rule, input_format, args, tmpdir): 42 | out_dir = str(tmpdir) 43 | cmd = (sys.executable, '-m', 'picireny') \ 44 | + (f'--test={test}{script_ext}', f'--input={inp}', f'--out={out_dir}') \ 45 | + ('--log-level=TRACE', ) 46 | if grammar: 47 | cmd += (f'--grammar={grammar}', ) 48 | if rule: 49 | cmd += (f'--start={rule}', ) 50 | if input_format: 51 | cmd += (f'--format={input_format}', ) 52 | if antlr: 53 | cmd += (f'--antlr={antlr}', ) 54 | cmd += args 55 | subprocess.run(cmd, cwd=resources_dir, check=True) 56 | 57 | with open(os.path.join(out_dir, inp), 'rb') as outf: 58 | outb = outf.read() 59 | with open(os.path.join(resources_dir, exp), 'rb') as expf: 60 | expb = expf.read() 61 | assert outb == expb 62 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py, lint, schema, build 3 | isolated_build = true 4 | 5 | [testenv] 6 | passenv = ANTLR 7 | deps = pytest 8 | commands = py.test {posargs} 9 | download = true 10 | 11 | [testenv:cov] 12 | deps = pytest-cov 13 | commands = py.test --cov=picireny --cov-config=tox.ini {posargs} 14 | usedevelop = true 15 | 16 | [coverage:run] 17 | omit = **/parser/* 18 | 19 | [testenv:lint] 20 | deps = 21 | pycodestyle 22 | pylint 23 | pytest 24 | commands = 25 | pylint src/picireny tests 26 | pycodestyle src/picireny tests --ignore=E501 --exclude=src/picireny/antlr4/parser/ANTLRv4*.py 27 | 28 | [testenv:schema] 29 | deps = 30 | check-jsonschema 31 | skip_install = true 32 | commands = 33 | check-jsonschema -v --check-metaschema schemas/format.json schemas/replacements.json 34 | check-jsonschema -v --schemafile schemas/format.json tests/resources/inijson.json tests/resources/inijson-crlf.json 35 | 36 | [testenv:build] 37 | deps = 38 | build 39 | twine 40 | virtualenv 41 | skip_install = true 42 | commands = 43 | 
pyproject-build -o {envtmpdir} 44 | twine check {envtmpdir}/* 45 | --------------------------------------------------------------------------------
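
The transformation helpers in src/picireny/transform.py are applied to the HDD tree before reduction starts. The sketch below shows one plausible way to chain them; it is illustrative only: the prepare_tree helper and its keyword flags are hypothetical (the actual orchestration lives in picireny's reducer and CLI code, whose --no-squeeze-tree and --no-skip-unremovable options are exercised in tests/test_cli.py), and tree is assumed to be an HDD tree already built by one of the hdd_tree_builder modules.

from picireny.transform import (
    flatten_recursion,
    remove_empty_nodes,
    skip_unremovable,
    skip_whitespace,
    squeeze_tree,
)


def prepare_tree(tree, *, flatten=True, squeeze=True, skip=True, hidden_whitespace=False):
    # Drop empty tokens (e.g., EOF) and rules without children (lambda rules).
    tree = remove_empty_nodes(tree)
    if flatten:
        # Rewrite left/right-recursive chains so whole recurring blocks become removable.
        tree = flatten_recursion(tree)
    if squeeze:
        # Collapse single-child chains whose minimal replacements are identical.
        tree = squeeze_tree(tree)
    if skip:
        # Hide nodes whose unparsing already equals their minimal replacement.
        tree = skip_unremovable(tree)
    if hidden_whitespace:
        # When hidden-channel tokens were built into the tree, ignore whitespace-only ones.
        tree = skip_whitespace(tree)
    return tree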