├── .github
    └── workflows
    │   └── main.yml
├── .gitignore
├── .pylintrc
├── LICENSE.rst
├── MANIFEST.in
├── README.rst
├── RELNOTES.rst
├── pyproject.toml
├── schemas
    ├── format.json
    └── replacements.json
├── setup.cfg
├── src
    └── picireny
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── antlr4
    │       ├── __init__.py
    │       ├── antlr_tree.py
    │       ├── grammar_analyzer.py
    │       ├── hdd_tree_builder.py
    │       ├── parser
    │       │   ├── LexerAdaptor.py
    │       │   └── __init__.py
    │       ├── parser_builder.py
    │       └── resources
    │       │   ├── ANTLRv4Lexer.g4
    │       │   ├── ANTLRv4Parser.g4
    │       │   ├── ExtendedTargetParser.java
    │       │   └── LexBasic.g4
    │   ├── cli.py
    │   ├── filter.py
    │   ├── hdd.py
    │   ├── hdd_tree.py
    │   ├── hddr.py
    │   ├── hoist.py
    │   ├── info.py
    │   ├── prune.py
    │   ├── srcml
    │       ├── __init__.py
    │       └── hdd_tree_builder.py
    │   └── transform.py
├── tests
    ├── resources
    │   ├── INILexer.g4
    │   ├── INIParser.g4
    │   ├── JSON.g4
    │   ├── exp-obj-arr-87.json
    │   ├── exp-obj-arr-bar.json
    │   ├── exp-obj-arr-baz.json
    │   ├── exp-obj-arr-foo.json
    │   ├── exp-str-arr-87.ini
    │   ├── inijson-crlf.json
    │   ├── inijson.json
    │   ├── inp-obj-arr.json
    │   ├── inp-str-arr.ini
    │   ├── sut-inijson-load.py
    │   ├── sut-json-load.py
    │   ├── test-inijson-str-arr-87.bat
    │   ├── test-inijson-str-arr-87.sh
    │   ├── test-json-obj-arr-87.bat
    │   ├── test-json-obj-arr-87.sh
    │   ├── test-json-obj-arr-bar.bat
    │   ├── test-json-obj-arr-bar.sh
    │   ├── test-json-obj-arr-baz.bat
    │   ├── test-json-obj-arr-baz.sh
    │   ├── test-json-obj-arr-foo.bat
    │   └── test-json-obj-arr-foo.sh
    └── test_cli.py
└── tox.ini


/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | name: main
 2 | on: [push, pull_request]
 3 | 
 4 | jobs:
 5 |   test:
 6 |     strategy:
 7 |       matrix:
 8 |         os: [ubuntu-latest, macos-latest, windows-latest]
 9 |         python-version: [3.8, 3.9, '3.10', '3.11', '3.12', '3.13', 'pypy-3.10']
10 |     runs-on: ${{ matrix.os }}
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |         with:
14 |           fetch-depth: 0
15 |       - uses: actions/setup-python@v5
16 |         with:
17 |           python-version: ${{ matrix.python-version }}
18 |       - uses: actions/setup-java@v4
19 |         with:
20 |           java-version: 17
21 |           distribution: temurin
22 |         if: matrix.os == 'windows-latest'
23 |       - run: pip install --upgrade tox
24 |       - run: tox -v -e py
25 | 
26 |   lint:
27 |     runs-on: ubuntu-latest
28 |     steps:
29 |       - uses: actions/checkout@v4
30 |         with:
31 |           fetch-depth: 0
32 |       - uses: actions/setup-python@v5
33 |         with:
34 |           python-version: '3.x'
35 |       - run: pip install --upgrade tox
36 |       - run: tox -v -e lint
37 | 
38 |   schema:
39 |     runs-on: ubuntu-latest
40 |     steps:
41 |       - uses: actions/checkout@v4
42 |         with:
43 |           fetch-depth: 0
44 |       - uses: actions/setup-python@v5
45 |         with:
46 |           python-version: '3.x'
47 |       - run: pip install --upgrade tox
48 |       - run: tox -v -e schema
49 | 
50 |   cov:
51 |     runs-on: ubuntu-latest
52 |     steps:
53 |       - uses: actions/checkout@v4
54 |         with:
55 |           fetch-depth: 0
56 |       - uses: actions/setup-python@v5
57 |         with:
58 |           python-version: '3.x'
59 |       - run: pip install --upgrade tox coveralls
60 |       - run: tox -v -e cov
61 |       - run: coveralls --service=github
62 |         env:
63 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
64 | 
65 |   publish:
66 |     needs: [test, lint, schema]
67 |     runs-on: ubuntu-latest
68 |     steps:
69 |       - uses: actions/checkout@v4
70 |         with:
71 |           fetch-depth: 0
72 |       - uses: actions/setup-python@v5
73 |         with:
74 |           python-version: '3.x'
75 |       - run: pip install --upgrade tox
76 |       - run: tox -v -e build
77 |       - uses: pypa/gh-action-pypi-publish@release/v1
78 |         if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') && github.repository == 'renatahodovan/picireny'
79 |         with:
80 |           password: ${{ secrets.pypi_token }}
81 |           packages_dir: .tox/build/tmp/
82 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | *.py[co]
 3 | build
 4 | dist
 5 | .eggs
 6 | *.egg-info
 7 | .DS_Store
 8 | .idea
 9 | .cache
10 | .pytest_cache
11 | .tox
12 | .coverage*
13 | *.interp
14 | *.tokens
15 | src/picireny/antlr4/parser/ANTLRv4*.py
16 | 


--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
 1 | [MASTER]
 2 | 
 3 | # Files or directories to be skipped. They should be base names, not paths.
 4 | ignore=parser
 5 | 
 6 | [MESSAGES CONTROL]
 7 | 
 8 | # Disable the message, report, category or checker with the given id(s). You
 9 | # can either give multiple identifiers separated by comma (,) or put this
10 | # option multiple times (only on the command line, not in the configuration
11 | # file where it should appear only once).You can also use "--disable=all" to
12 | # disable everything first and then reenable specific checks. For example, if
13 | # you want to run only the similarities checker, you can use "--disable=all
14 | # --enable=similarities". If you want to run only the classes checker, but have
15 | # no Warning level messages displayed, use"--disable=all --enable=classes
16 | # --disable=W"
17 | disable=
18 |     abstract-method,
19 |     attribute-defined-outside-init,
20 |     import-outside-toplevel,
21 |     invalid-name,
22 |     line-too-long,
23 |     missing-docstring,
24 |     no-self-use,  # disables warning in older pylint
25 |     protected-access,
26 |     redefined-builtin,
27 |     too-few-public-methods,
28 |     too-many-arguments,
29 |     too-many-branches,
30 |     too-many-locals,
31 |     too-many-positional-arguments,
32 |     too-many-return-statements,
33 |     too-many-statements,
34 |     unspecified-encoding,
35 |     unused-argument,
36 |     useless-option-value,  # disables warning in recent pylint that does not check for no-self-use anymore
37 | 
38 | [REPORTS]
39 | 
40 | # Set the output format. Available formats are text, parseable, colorized, json
41 | # and msvs (visual studio).You can also give a reporter class, eg
42 | # mypackage.mymodule.MyReporterClass.
43 | output-format=parseable
44 | 
45 | # Activate the evaluation score.
46 | score=no
47 | 


--------------------------------------------------------------------------------
/LICENSE.rst:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2016-2024 Renata Hodovan, Akos Kiss.
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | 1. Redistributions of source code must retain the above copyright notice, this
 8 |    list of conditions and the following disclaimer.
 9 | 
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 |    this list of conditions and the following disclaimer in the documentation
12 |    and/or other materials provided with the distribution.
13 | 
14 | 3. Neither the name of the copyright holder nor the names of its contributors
15 |    may be used to endorse or promote products derived from this software
16 |    without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 
29 | -----
30 | 
31 | This software is based on work licensed under identical terms with relevant
32 | files carrying the following copyright notice:
33 | 
34 | Copyright (c) 2007 Ghassan Misherghi.
35 | 
36 | -----
37 | 
38 | This software includes components from the "Grammars written for ANTLR v4"
39 | project under src/picireny/antlr4/resources and src/picireny/antlr4/parser,
40 | which files carry a compatible "BSD license" and their own copyright notices.
41 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | exclude src/picireny/antlr4/parser/ANTLRv4*.py
2 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | ========
  2 | Picireny
  3 | ========
  4 | *Hierarchical Delta Debugging Framework*
  5 | 
  6 | .. image:: https://img.shields.io/pypi/v/picireny?logo=python&logoColor=white
  7 |    :target: https://pypi.org/project/picireny/
  8 | .. image:: https://img.shields.io/pypi/l/picireny?logo=open-source-initiative&logoColor=white
  9 |    :target: https://pypi.org/project/picireny/
 10 | .. image:: https://img.shields.io/github/actions/workflow/status/renatahodovan/picireny/main.yml?branch=master&logo=github&logoColor=white
 11 |    :target: https://github.com/renatahodovan/picireny/actions
 12 | .. image:: https://img.shields.io/coveralls/github/renatahodovan/picireny/master?logo=coveralls&logoColor=white
 13 |    :target: https://coveralls.io/github/renatahodovan/picireny
 14 | 
 15 | *Picireny* is a Python implementation of the Hierarchical Delta Debugging
 16 | (HDD in short) algorithm adapted to use ANTLR_ v4 for parsing both the input
 17 | and the grammar(s) describing the format of the input. It relies on Picire_
 18 | to provide the implementation of the core Delta Debugging algorithm along
 19 | with various tweaks like parallelization. Just like the *Picire* framework,
 20 | *Picireny* can also be used either as a command line tool or as a library.
 21 | 
 22 | Both Hierarchical Delta Debugging and Delta Debugging automatically reduce
 23 | "interesting" tests while keeping their "interesting" behaviour. (E.g.,
 24 | "interestingness" may mean failure-inducing input to a system-under-test.)
 25 | However, HDD is an improvement that tries to investigate less test cases during
 26 | the reduction process by making use of knowledge on the structure of the input.
 27 | 
 28 | The tool (and the algorithm) works iteratively in several ways. As a first
 29 | step, it splits up the input into tokens and organizes them in a tree structure
 30 | as defined by a grammar. Then, iteratively, it invokes Delta Debugging on each
 31 | level of the tree from top to bottom, and DD is an iterative process itself,
 32 | too. Finally, the nodes kept in the tree are "unparsed" to yield a reduced but
 33 | still "interesting" output.
 34 | 
 35 | .. _ANTLR: http://www.antlr.org
 36 | .. _Picire: https://github.com/renatahodovan/picire
 37 | 
 38 | 
 39 | Requirements
 40 | ============
 41 | 
 42 | * Python_ >= 3.8
 43 | * Java_ SE >= 11 JRE or JDK (the latter is optional, only needed if Java is used
 44 |   as the parser language)
 45 | 
 46 | .. _Python: https://www.python.org
 47 | .. _Java: https://www.oracle.com/java/
 48 | 
 49 | 
 50 | Install
 51 | =======
 52 | 
 53 | To use *Picireny* in another project, it can be added to ``setup.cfg`` as an
 54 | install requirement (if using setuptools_ with declarative config):
 55 | 
 56 | .. code-block:: ini
 57 | 
 58 |     [options]
 59 |     install_requires =
 60 |         picireny
 61 | 
 62 | To install *Picireny* manually, e.g., into a virtual environment, use pip_::
 63 | 
 64 |     pip install picireny
 65 | 
 66 | The above approaches install the latest release of *Picireny* from PyPI_.
 67 | Alternatively, for the development version, clone the project and perform a
 68 | local install::
 69 | 
 70 |     pip install .
 71 | 
 72 | .. _setuptools: https://github.com/pypa/setuptools
 73 | .. _pip: https://pip.pypa.io
 74 | .. _PyPI: https://pypi.org/
 75 | 
 76 | 
 77 | Usage
 78 | =====
 79 | 
 80 | *Picireny* uses the same CLI as *Picire* and hence accepts the same
 81 | options_.
 82 | On top of the inherited ones, *Picireny* accepts several further arguments:
 83 | 
 84 | * ``--grammar`` (optional): List of grammars describing the input format. (You
 85 |   can write them by hand or simply download them from the
 86 |   `ANTLR v4 grammars repository`_.)
 87 | * ``--start`` (optional): Name of the start rule (optionally prefixed with a
 88 |   grammar name) as ``[grammarname:]rulename``.
 89 | * ``--replacements`` (optional): Json file containing rule names and minimal
 90 |   replacement strings (otherwise these are calculated automatically) (see
 91 |   schema__).
 92 | * ``--format`` (optional): Json file describing the input format (see schema__
 93 |   and example_). This descriptor can incorporate all the above (``--grammar``,
 94 |   ``--start`` and ``--replacements``) properties, along with the possibility of
 95 |   island grammar definitions. If both ``--format`` and the aforementioned
 96 |   arguments are present, then the latter will override the appropriate values of
 97 |   the format file.
 98 | * ``--antlr`` (optional): Path to the ANTLR tool jar.
 99 | * ``--parser`` (optional): Language of the generated parser. Currently 'python'
100 |   (default) and 'java' targets (faster, but needs JDK) are supported.
101 | 
102 | Note: although, all the arguments are optional, the grammar files and the start
103 | rule of the top-level parser must be defined with an arbitrary combination of the
104 | ``--format``, ``--grammars``, and ``--start`` arguments.
105 | 
106 | .. _options: https://github.com/renatahodovan/picire/tree/master/README.rst#usage
107 | .. _`ANTLR v4 grammars repository`: https://github.com/antlr/grammars-v4
108 | .. __: schemas/replacements.json
109 | .. __: schemas/format.json
110 | .. _example: tests/resources/inijson.json
111 | 
112 | Example usage to reduce an HTML file::
113 | 
114 |     picireny --input=<path/to/the/input.html> --test=<path/to/the/tester> \
115 |              --grammar HTMLLexer.g4 HTMLParser.g4 --start htmlDocument \
116 |              --parallel --subset-iterator=skip --complement-iterator=backward
117 | 
118 | 
119 | Compatibility
120 | =============
121 | 
122 | *Picireny* was tested on:
123 | 
124 | * Linux (Ubuntu 14.04 / 16.04 / 18.04 / 20.04)
125 | * OS X / macOS (10.11 / 10.12 / 10.13 / 10.14 / 10.15 / 11)
126 | * Windows (Server 2012 R2 / Server version 1809 / Windows 10)
127 | 
128 | 
129 | Acknowledgement and Citations
130 | =============================
131 | 
132 | *Picireny* is motivated by the idea of Hierarchical Delta Debugging:
133 | 
134 | * Ghassan Misherghi and Zhendong Su. HDD: Hierarchical Delta Debugging.
135 |   In Proceedings of the 28th International Conference on Software Engineering
136 |   (ICSE '06), pages 142-151, Shanghai, China, May 2006. ACM.
137 |   https://doi.org/10.1145/1134285.1134307
138 | 
139 | The details of the modernized re-implementation and further improvements are
140 | published in:
141 | 
142 | * Renata Hodovan and Akos Kiss. Modernizing Hierarchical Delta Debugging.
143 |   In Proceedings of the 7th International Workshop on Automating Test Case
144 |   Design, Selection, and Evaluation (A-TEST 2016), pages 31-37, Seattle,
145 |   Washington, USA, November 2016. ACM.
146 |   https://doi.org/10.1145/2994291.2994296
147 | * Renata Hodovan, Akos Kiss, and Tibor Gyimothy. Tree Preprocessing and Test
148 |   Outcome Caching for Efficient Hierarchical Delta Debugging.
149 |   In Proceedings of the 12th IEEE/ACM International Workshop on Automation of
150 |   Software Testing (AST 2017), pages 23-29, Buenos Aires, Argentina, May 2017.
151 |   IEEE.
152 |   https://doi.org/10.1109/AST.2017.4
153 | * Renata Hodovan, Akos Kiss, and Tibor Gyimothy. Coarse Hierarchical Delta
154 |   Debugging.
155 |   In Proceedings of the 33rd IEEE International Conference on Software
156 |   Maintenance and Evolution (ICSME 2017), pages 194-203, Shanghai, China,
157 |   September 2017. IEEE.
158 |   https://doi.org/10.1109/ICSME.2017.26
159 | * Akos Kiss, Renata Hodovan, and Tibor Gyimothy. HDDr: A Recursive Variant of
160 |   the Hierarchical Delta Debugging Algorithm.
161 |   In Proceedings of the 9th ACM SIGSOFT International Workshop on Automating
162 |   Test Case Design, Selection, and Evaluation (A-TEST 2018), pages 16-22, Lake
163 |   Buena Vista, Florida, USA, November 2018. ACM.
164 |   https://doi.org/10.1145/3278186.3278189
165 | * Daniel Vince, Renata Hodovan, Daniella Barsony, and Akos Kiss. Extending
166 |   Hierarchical Delta Debugging with Hoisting.
167 |   In Proceedings of the 2nd ACM/IEEE International Conference on Automation of
168 |   Software Test (AST 2021), pages 60-69, Madrid, Spain (Virtual), May 2021.
169 |   IEEE.
170 |   https://doi.org/10.1109/AST52587.2021.00015
171 | * Daniel Vince, Renata Hodovan, Daniella Barsony, and Akos Kiss. The effect of
172 |   hoisting on variants of Hierarchical Delta Debugging.
173 |   Journal of Software: Evolution and Process, 34(11):e2483,1-26, November 2022.
174 |   Wiley.
175 |   https://doi.org/10.1002/smr.2483
176 | 
177 | 
178 | Copyright and Licensing
179 | =======================
180 | 
181 | Licensed under the BSD 3-Clause License_.
182 | 
183 | .. _License: LICENSE.rst
184 | 


--------------------------------------------------------------------------------
/RELNOTES.rst:
--------------------------------------------------------------------------------
  1 | ========================
  2 | *Picireny* Release Notes
  3 | ========================
  4 | 
  5 | 21.8
  6 | ====
  7 | 
  8 | Summary of changes:
  9 | 
 10 | * Dropped support for Python 2.
 11 | * Upgraded dependency to *Picire* 21.8 to use new/improved argument logging, CLI
 12 |   argument processing; and adapted code to the updated API.
 13 | * Heavily simplified the signatures of picireny.build_with_antlr4 and
 14 |   picireny.reduce.
 15 | * Changed the API of several functions and methods, made numerous arguments
 16 |   keyword-only.
 17 | * Added a new phase that applies the coarse filter to tree nodes and runs both
 18 |   pruning and hoisting on them.
 19 | * Fixed HDDr to correctly traverse the tree in case of filtered nodes.
 20 | * Fixed line-column calculations for tree nodes.
 21 | * Fixed "skip unremovable" transformation to correctly determine the unparsed
 22 |   representation of nodes for all parametrizations.
 23 | * Upgraded dependency *ANTLeRinator* to Epoch 1 (breaking away from ANTLR
 24 |   version numbering) and made use of its new feature to generate the lexer and
 25 |   parser from the ANTLRv4 grammar at build-time.
 26 | * Added direct dependency on ANTLR and upgraded it to v4.9.2.
 27 | * Made use of the *inators* package to unify CLI argument handling and logging.
 28 | * Dropped runtime dependency on setuptools.
 29 | * Moved to pyproject.toml & setup.cfg-based packaging.
 30 | * Improved log output.
 31 | * Improved documentation.
 32 | * Improved the testing infrastructure (stabilized tests, improved resource
 33 |   handling, better output on failure, testing Windows & PyPy).
 34 | * Various internal refactorings.
 35 | 
 36 | 
 37 | 21.3
 38 | ====
 39 | 
 40 | Summary of changes:
 41 | 
 42 | * Introduced phases of reduction to allow executing the same HDD algorithm
 43 |   variant multiple times with different parametrizations (e.g., run Coarse HDDr
 44 |   and HDDr after each other).
 45 | * Added a new transformation-based reduction technique called hoisting, as a new
 46 |   optional phase, to complement the existing pruning-based approaches.
 47 | * Added support for "tokens" section (i.e., token names without an associated
 48 |   lexer rule) in grammars.
 49 | * Added support for grammars with resource files that contain utility code or
 50 |   base classes of lexers and parsers.
 51 | * Upgraded dependency to *Picire* 20.12 to utilize its new generalized split
 52 |   factor concept and updated API.
 53 | * Upgraded dependency to ANTLR v4.9 (via *ANTLeRinator*).
 54 | * Bumped minimum Python 3 requirement to 3.5.
 55 | * Improved log output.
 56 | * Adapted versioning to use setuptools_scm (included distance from latest
 57 |   release into non-released version strings).
 58 | * Added classification metadata to project.
 59 | * Improved documentation.
 60 | * Improved the testing infrastructure (linting, faster test suite, testing
 61 |   Python 3.8 and 3.9, testing macOS, migrated testing from Travis CI to GitHub
 62 |   Actions).
 63 | * Various internal refactorings and performance improvements.
 64 | * Minor bug fixes.
 65 | 
 66 | 
 67 | 19.3
 68 | ====
 69 | 
 70 | Summary of changes:
 71 | 
 72 | * Made code Python 2 compatible (with the help of upgraded dependencies
 73 |   *Picire* 19.3 and *ANTLeRinator* 4.7.1-1).
 74 | * Improved the testing infrastructure (testing Python 2.7 and 3.7 on Travis CI;
 75 |   maintenance changes to various CI configurations).
 76 | 
 77 | 
 78 | 18.10
 79 | =====
 80 | 
 81 | Summary of changes:
 82 | 
 83 | * Added implementation for the recursive variant of the HDD algorithm (a.k.a.
 84 |   HDDr).
 85 | * Upgraded dependency to *Picire* 18.10 to utilize its new config ID and prefix
 86 |   concepts.
 87 | * Minor improvements.
 88 | 
 89 | 
 90 | 18.2
 91 | ====
 92 | 
 93 | Summary of changes:
 94 | 
 95 | * Added support for multiple tree builders, and added srcML as an experimental
 96 |   builder in addition to the existing ANTLRv4-based solution.
 97 | * Generalized HDD implementation to be parametric to express classic HDD and
 98 |   Coarse HDD as well.
 99 | * Upgraded dependency to *Picire* 18.1 to utilize custom initial granularity.
100 | * Upgraded dependency to ANTLR v4.7.1 (via *ANTLeRinator*).
101 | * Added support for building tokens from hidden ANTLR channels (whitespace,
102 |   comments, etc.) into the tree but also hiding them from the reducer (for
103 |   inputs where whitespace or other hidden tokens may matter during tree
104 |   unparsing).
105 | * Added new module for gathering statistics on trees and improved the logging of
106 |   the results of tree transformation algorithms.
107 | * Improved various algorithms (minimal replacement calculation from ANTLRv4
108 |   grammars, tree flattening for non-syntax-conforming inputs, unremovable node
109 |   detection for rules in addition to tokens).
110 | * Improved Python-Java interworking (for Java-based ANTLRv4 parsers).
111 | * Improved API usability (for use-cases when *Picireny* is not called via its
112 |   CLI).
113 | * Improved the testing infrastructure (by using the Coveralls online service).
114 | * Minor bug fixes and internal refactorings.
115 | 
116 | 
117 | 17.10
118 | =====
119 | 
120 | Summary of changes:
121 | 
122 | * Improved the way how input format can be defined by enabling the use of a more
123 |   consistent and well-defined config file.
124 | * Upgraded dependency to *Picire* 17.10 to utilize its Windows support.
125 | * Minor bug fixes.
126 | 
127 | 
128 | 17.7
129 | ====
130 | 
131 | Summary of changes:
132 | 
133 | * Added implementation for the coarse variant of the HDD algorithm.
134 | * Implemented heuristical optimization to flatten left and right-recursive tree
135 |   structures.
136 | * Improvements to the internal tree representation.
137 | * Simplified usage and ANTLR dependency installation via *ANTLeRinator*, and
138 |   upgraded dependency to *Picire* 17.6.
139 | * Improved the testing infrastructure (support for Python 3.6 and code coverage
140 |   measurement).
141 | 
142 | 
143 | 17.1
144 | ====
145 | 
146 | Summary of changes:
147 | 
148 | * Updated dependency to *Picire* 17.1 and adopted its support for content-based
149 |   result caching.
150 | * Added "squeeze tree" and "hide/skip unremovable tokens" HDD tree
151 |   optimizations.
152 | * Improved handling of erroneous input.
153 | * Extended the HDD algorithm with testing of single-node tree levels to ensure
154 |   1-tree-minimality of output.
155 | * Minor bug fixes and improvements.
156 | 
157 | 
158 | 16.12
159 | =====
160 | 
161 | Summary of changes:
162 | 
163 | * Added support for Java-based input parsing to improve performance.
164 | * Implemented HDD* (fixed-point iteration of hddmin).
165 | * Minor bug fixes and improvements.
166 | * Upgraded dependency to ANTLR v4.6.
167 | * Added *Picireny* to PyPI.
168 | 
169 | 
170 | 16.7
171 | ====
172 | 
173 | First public release of the *Picireny* Hierarchical Delta Debugging Framework.
174 | 
175 | Summary of main features:
176 | 
177 | * ANTLRv4-based input parsing and *Picire*-based ddmin.
178 | * Automatic "smallest allowable syntactic fragment" computation for both parser
179 |   and lexer rules.
180 | * Support for island grammars.
181 | * Python 3 API and out-of-the-box useful CLI.
182 | * py.test-based testing and tox support.
183 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = [
 3 |     "antlerinator>=1!3.0.0",
 4 |     "setuptools",
 5 |     "setuptools_scm[toml]",
 6 |     "wheel",
 7 | ]
 8 | build-backend = "setuptools.build_meta"
 9 | 
10 | [tool.setuptools_scm]
11 | version_scheme = "post-release"
12 | local_scheme = "node-and-date"
13 | 


--------------------------------------------------------------------------------
/schemas/format.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://json-schema.org/draft/2020-12/schema",
 3 | 
 4 |     "description": "Picireny input format definition.",
 5 |     "type": "object",
 6 |     "properties": {
 7 |         "start": {
 8 |             "description": "Name of start rule optionally prefixed with a grammar name (specified as [grammarname:]rulename).",
 9 |             "type": "string"
10 |         },
11 |         "grammars": {
12 |             "description": "Grammar descriptions mapped to (freely chosen) grammar names.",
13 |             "type": "object",
14 |             "patternProperties": {
15 |                 ".*": {
16 |                     "description": "Grammar description.",
17 |                     "type": "object",
18 |                     "properties": {
19 |                         "files": {
20 |                             "description": "List of ANTLR grammar files.",
21 |                             "type": "array",
22 |                             "items": {
23 |                                 "description": "Grammar file (resolved relative to the location of the input format definition).",
24 |                                 "type": "string"
25 |                             },
26 |                             "minItems": 1
27 |                         },
28 |                         "islands": {
29 |                             "description": "Regex patterns mapped to names of tokens of the described grammar.",
30 |                             "type": "object",
31 |                             "patternProperties": {
32 |                                 ".*": {
33 |                                     "description": "Regex pattern matched on token instances (named capture groups define those parts of the token, which should be parsed with a rule of an island grammar, specified as [grammarname:]rulename).",
34 |                                     "type": "string"
35 |                                 }
36 |                             }
37 |                         },
38 |                         "replacements": {
39 |                             "$ref": "replacements.json"
40 |                         }
41 |                     },
42 |                     "required": [ "files" ]
43 |                 }
44 |             }
45 |         }
46 |     }
47 | }
48 | 


--------------------------------------------------------------------------------
/schemas/replacements.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://json-schema.org/draft/2020-12/schema",
 3 | 
 4 |     "description": "Replacement strings mapped to grammar token names.",
 5 |     "type": "object",
 6 |     "patternProperties": {
 7 |         ".*": {
 8 |             "description": "Replacement string for token instances.",
 9 |             "type": "string"
10 |         }
11 |     }
12 | }
13 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [metadata]
 2 | name = picireny
 3 | description = Picireny Hierarchical Delta Debugging Framework
 4 | long_description = file: README.rst
 5 | long_description_content_type = text/x-rst
 6 | author = Renata Hodovan, Akos Kiss
 7 | author_email = hodovan@inf.u-szeged.hu, akiss@inf.u-szeged.hu
 8 | url = https://github.com/renatahodovan/picireny
 9 | license = BSD
10 | license_files = LICENSE.rst
11 | classifiers =
12 |     Intended Audience :: Developers
13 |     License :: OSI Approved :: BSD License
14 |     Operating System :: OS Independent
15 |     Programming Language :: Python
16 |     Programming Language :: Python :: 3
17 |     Programming Language :: Python :: 3.8
18 |     Programming Language :: Python :: 3.9
19 |     Programming Language :: Python :: 3.10
20 |     Programming Language :: Python :: 3.11
21 |     Programming Language :: Python :: 3.12
22 |     Programming Language :: Python :: 3.13
23 |     Topic :: Software Development :: Testing
24 | platform = any
25 | 
26 | [options]
27 | package_dir =
28 |     = src
29 | packages = find_namespace:
30 | include_package_data = True
31 | python_requires = >=3.8
32 | install_requires =
33 |     antlerinator>=1!3.0.0
34 |     antlr4-python3-runtime==4.13.2
35 |     inators
36 |     picire==21.8
37 |     xson
38 | 
39 | [options.packages.find]
40 | where = src
41 | 
42 | [options.entry_points]
43 | console_scripts =
44 |     picireny = picireny.cli:execute
45 | 
46 | [build_antlr]
47 | commands =
48 |     antlerinator:4.13.2 src/picireny/antlr4/resources/ANTLRv4Lexer.g4 src/picireny/antlr4/resources/ANTLRv4Parser.g4 -Dlanguage=Python3 -o src/picireny/antlr4/parser -Xexact-output-dir -no-listener
49 | output =
50 |     src/picireny/antlr4/parser/ANTLRv4*.py
51 | 


--------------------------------------------------------------------------------
/src/picireny/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2016-2021 Renata Hodovan, Akos Kiss.
 2 | #
 3 | # Licensed under the BSD 3-Clause License
 4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
 5 | # This file may not be copied, modified, or distributed except
 6 | # according to those terms.
 7 | 
 8 | from . import cli
 9 | from . import info
10 | from . import transform
11 | from .cli import __version__, build_with_antlr4, build_with_srcml, reduce
12 | from .hdd import hddmin
13 | from .hddr import hddrmin
14 | from .hdd_tree import HDDRule, HDDToken, HDDTree
15 | 


--------------------------------------------------------------------------------
/src/picireny/__main__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2016 Renata Hodovan, Akos Kiss.
 2 | #
 3 | # Licensed under the BSD 3-Clause License
 4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
 5 | # This file may not be copied, modified, or distributed except
 6 | # according to those terms.
 7 | 
 8 | from picireny.cli import execute
 9 | 
10 | 
11 | if __name__ == '__main__':
12 |     execute()
13 | 


--------------------------------------------------------------------------------
/src/picireny/antlr4/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016-2020 Renata Hodovan, Akos Kiss.
2 | #
3 | # Licensed under the BSD 3-Clause License
4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
5 | # This file may not be copied, modified, or distributed except
6 | # according to those terms.
7 | 
8 | from .hdd_tree_builder import create_hdd_tree
9 | 


--------------------------------------------------------------------------------
/src/picireny/antlr4/antlr_tree.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2016-2023 Renata Hodovan, Akos Kiss.
  2 | #
  3 | # Licensed under the BSD 3-Clause License
  4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
  5 | # This file may not be copied, modified, or distributed except
  6 | # according to those terms.
  7 | 
  8 | import logging
  9 | import re
 10 | 
 11 | from sys import maxunicode
 12 | 
 13 | logger = logging.getLogger(__name__)
 14 | 
 15 | 
 16 | # Parser Elements
 17 | 
 18 | class ANTLRElement:
 19 |     def __init__(self, *, optional=False, repl=None, sep=''):
 20 |         """
 21 |         Constructor of the base tree node type.
 22 | 
 23 |         :param optional: Boolean indicating whether the current node is optional
 24 |             or not.
 25 |         :param repl: Known replacement if any.
 26 |         """
 27 |         self.children = []
 28 |         self.replacement = repl if not optional else ''
 29 |         self.sep = sep
 30 | 
 31 |     def all_replacements_defined(self):
 32 |         """
 33 |         Replacements are defined if the node has at least one child and all of
 34 |         the children have a replacement set.
 35 |         """
 36 |         return self.children and all(x.replacement is not None for x in self.children)
 37 | 
 38 |     def has_defined_replacement(self):
 39 |         """
 40 |         Checks if any of the children has a defined replacement. Needed by
 41 |         alternations since the replacement of a recursive rule wouldn't be
 42 |         possible to determine if waiting for all the children set.
 43 |         """
 44 |         return self.children and any(x.replacement is not None for x in self.children)
 45 | 
 46 |     def calc_replacement(self):
 47 |         """
 48 |         The minimal replacement of a parser rule is the concatenation of its
 49 |         children's minimal replacement.
 50 | 
 51 |         :return: Boolean denoting if a new replacement was found or not.
 52 |         """
 53 |         if self.all_replacements_defined():
 54 |             new_repl = self.sep.join(x.replacement for x in self.children if x.replacement)
 55 |             if self.replacement is None or len(new_repl) < len(self.replacement) or (len(new_repl) == len(self.replacement) and new_repl < self.replacement):
 56 |                 self.replacement = new_repl
 57 |                 return True
 58 |         return False
 59 | 
 60 | 
 61 | class ANTLRRule(ANTLRElement):
 62 |     """
 63 |     Representation of a parser rule. The replacement string determined here will
 64 |     be used in the reduce phase. This replacement can be set by the user or
 65 |     generated automatically. If it's set by the user then it won't be changed
 66 |     ever (even if it isn't minimal).
 67 |     """
 68 |     def __init__(self, name, *, repl=None):
 69 |         super().__init__(repl=repl)
 70 |         self.name = name
 71 |         self.const_replacement = repl is not None
 72 | 
 73 |     def calc_replacement(self):
 74 |         if self.const_replacement:
 75 |             return False
 76 |         return super().calc_replacement()
 77 | 
 78 | 
 79 | class ANTLRRef(ANTLRElement):
 80 |     def __init__(self, ref, *, optional=False):
 81 |         super().__init__(optional=optional)
 82 |         self.ref = ref
 83 | 
 84 | 
 85 | class ANTLRAlternative(ANTLRElement):
 86 |     def __init__(self, *, repl=None):
 87 |         super().__init__(repl=repl, sep=' ')
 88 | 
 89 | 
 90 | class ANTLRAlternation(ANTLRElement):
 91 |     def calc_replacement(self):
 92 |         """
 93 |         The minimal replacement of an alternation is it's shortest child.
 94 | 
 95 |         :return: Boolean denoting if a new replacement was found or not.
 96 |         """
 97 |         if self.has_defined_replacement():
 98 |             new_repl = min((c.replacement for c in self.children if c.replacement is not None), key=len)
 99 |             if self.replacement is None or len(new_repl) < len(self.replacement) or (len(new_repl) == len(self.replacement) and new_repl < self.replacement):
100 |                 self.replacement = new_repl
101 |                 return True
102 |         return False
103 | 
104 | 
105 | # Lexer Elements
106 | 
107 | class ANTLRLexerElement(ANTLRElement):
108 |     def __init__(self, *, optional=False, repl=None):
109 |         super().__init__(optional=optional, repl=repl)
110 |         self.start_intervals = None
111 | 
112 |     def starters_defined(self):
113 |         return self.children and all(x.start_intervals is not None for x in self.children)
114 | 
115 |     def calc_starters(self):
116 |         if self.start_intervals is None and self.starters_defined():
117 |             self.start_intervals = sum((x.start_intervals for x in self.children), [])
118 |             return True
119 |         return False
120 | 
121 |     @staticmethod
122 |     def resolve_escapes(src):
123 |         """
124 |         Remove escaping from escape sequences in src. E.g., lexer rules may
125 |         contain such expressions like: [\t] where \t is evaluated as '\' + 't'
126 |         instead of a tabulator. This function executes the reversed
127 |         transformation.
128 | 
129 |         :param src: The string that may have escaped escape sequences.
130 |         """
131 |         return src.encode('utf-8').decode('unicode_escape')
132 | 
133 | 
134 | class ANTLRLexerRule(ANTLRLexerElement):
135 |     """
136 |     Representation of a lexer rule. The replacement string determined here will
137 |     be used in the reduce phase. This replacement can be set by the user or
138 |     generated automatically. If it's set by the user then it won't be changed
139 |     ever (even if it's not minimal).
140 |     """
141 |     def __init__(self, name, *, repl=None):
142 |         super().__init__(repl=repl)
143 |         self.name = name
144 |         self.const_replacement = repl is not None
145 | 
146 |     def calc_replacement(self):
147 |         if self.const_replacement:
148 |             return False
149 |         return super().calc_replacement()
150 | 
151 | 
152 | class ANTLRLexerElements(ANTLRLexerElement):
153 |     def calc_starters(self):
154 |         if self.children and self.children[0].start_intervals and self.start_intervals is None:
155 |             self.start_intervals = self.children[0].start_intervals
156 |             return True
157 |         return False
158 | 
159 | 
160 | class ANTLRLexerAlternation(ANTLRLexerElement):
161 |     def calc_replacement(self):
162 |         # The replacement is the known shortest replacement of the children.
163 |         if self.has_defined_replacement():
164 |             new_repl = min((c.replacement for c in self.children if c.replacement is not None), key=len)
165 |             if self.replacement is None or len(new_repl) < len(self.replacement) or (len(new_repl) == len(self.replacement) and new_repl < self.replacement):
166 |                 self.replacement = new_repl
167 |                 return True
168 |         return False
169 | 
170 | 
171 | class ANTLRTokenRef(ANTLRLexerElement):
172 |     def __init__(self, ref):
173 |         super().__init__()
174 |         self.ref = ref
175 | 
176 | 
177 | class ANTLRCharacterRange(ANTLRLexerElement):
178 |     def __init__(self, start, end):
179 |         super().__init__()
180 |         # Converting unicode code points to integers.
181 |         start = int(start.split('\\u')[1], 16) if start.startswith('\\u') else ord(start)
182 |         end = int(end.split('\\u')[1], 16) if end.startswith('\\u') else ord(end)
183 |         self.start_intervals = [(start, end)]
184 |         self.replacement = chr(start)
185 | 
186 | 
187 | class ANTLRDotElement(ANTLRLexerElement):
188 |     def __init__(self, *, optional=False):
189 |         super().__init__(optional=optional)
190 |         # Hard-wiring ASCII character range here does not have any limitation (neither effect).
191 |         # Basically it should not be used anyway, since the replacement is
192 |         # constantly set to 'a' and negating 'any character' would not make sense.
193 |         self.start_intervals = [(0, 255)]
194 |         if self.replacement is None:
195 |             self.replacement = 'a'
196 | 
197 | 
198 | class ANTLRString(ANTLRLexerElement):
199 |     def __init__(self, src):
200 |         super().__init__()
201 |         src = self.resolve_escapes(src)
202 |         self.start_intervals = [(ord(src[0]), ord(src[0]))]
203 |         self.replacement = src
204 | 
205 | 
206 | class ANTLRSetElement(ANTLRLexerElement):
207 |     def __init__(self, content=None, *, optional=False):
208 |         super().__init__(optional=optional)
209 |         if content and self.replacement is None:
210 |             if content.startswith(('"', '\'')):
211 |                 self.start_intervals = [(ord(content[1]), ord(content[1]))] if len(content) > 2 else []
212 |                 self.replacement = chr(self.start_intervals[0][0])
213 |             elif content.startswith('['):
214 |                 self.start_intervals = self.process_charset(content[1:-1])
215 |                 self.replacement = chr(self.start_intervals[0][0])
216 | 
217 |     @classmethod
218 |     def process_charset(cls, src):
219 |         """
220 |         Extract represented character intervals from character sets.
221 | 
222 |         :param src: The string representation of the character set (w/o
223 |             brackets).
224 |         """
225 |         intervals = [(ord(m.group(1)), ord(m.group(2))) for m in re.finditer(r'(\w)\-(\w)', src)]
226 |         positions = [(m.start(1), m.end(2)) for m in re.finditer(r'(\w)\-(\w)', src)]
227 | 
228 |         # Character sets can contain multiple sets and single characters (e.g., [-ab-defg-ijkl]).
229 |         # Select the single characters based on the position of sets.
230 |         if not positions:
231 |             intervals.extend((ord(x), ord(x)) for x in cls.resolve_escapes(src))
232 |         else:
233 |             characters = []
234 |             for i, pos in enumerate(positions):
235 |                 # Characters before the first range.
236 |                 if i == 0 and pos[0] > 0:
237 |                     characters.extend(cls.resolve_escapes(src[0: pos[0]]))
238 |                 # Characters between ranges.
239 |                 if i < len(positions) - 1:
240 |                     if positions[i][1] + 1 < positions[i + 1][0]:
241 |                         characters.extend(cls.resolve_escapes(src[positions[i][1] + 1: positions[i + 1][0]]))
242 |                 # Characters after ranges.
243 |                 else:
244 |                     if pos[1] < len(src) - 1:
245 |                         characters.extend(cls.resolve_escapes(src[pos[1] + 1:]))
246 |             intervals.extend((ord(x), ord(x)) for x in characters)
247 | 
248 |         return intervals
249 | 
250 |     def calc_starters(self):
251 |         if self.start_intervals is None and self.children and self.children[0].start_intervals:
252 |             self.start_intervals = self.children[0].start_intervals
253 |             return True
254 |         return False
255 | 
256 | 
257 | class ANTLRNotSet(ANTLRLexerElement):
258 |     def calc_starters(self):
259 |         # Known limitation (TODO?): it does not handle multiple negation.
260 |         if self.starters_defined() and self.start_intervals is None:
261 |             intervals = [y for x in self.children for y in x.start_intervals]
262 |             # Sort list of tuples by the first element.
263 |             intervals.sort(key=lambda x: x[0])
264 |             # The number (char) before the first interval's lower limit or after
265 |             # the last interval's upper limit is suitable for negation.
266 |             if intervals[0][0] > 0:
267 |                 neighbour_char = intervals[0][0] - 1
268 |             elif intervals[-1][-1] < maxunicode:
269 |                 neighbour_char = intervals[-1][-1] + 1
270 |             else:
271 |                 assert False, 'Cannot negate the whole unicode range.'
272 |             self.start_intervals = [(neighbour_char, neighbour_char)]
273 |             return True
274 |         return False
275 | 
276 |     def calc_replacement(self):
277 |         if self.start_intervals and self.replacement is None:
278 |             self.replacement = chr(self.start_intervals[0][0])
279 |             return True
280 |         return False
281 | 


--------------------------------------------------------------------------------
/src/picireny/antlr4/grammar_analyzer.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2016-2023 Renata Hodovan, Akos Kiss.
  2 | #
  3 | # Licensed under the BSD 3-Clause License
  4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
  5 | # This file may not be copied, modified, or distributed except
  6 | # according to those terms.
  7 | 
  8 | from antlr4 import CommonTokenStream, FileStream
  9 | from antlr4.tree import Tree
 10 | 
 11 | from .antlr_tree import (
 12 |     ANTLRAlternation, ANTLRAlternative, ANTLRCharacterRange, ANTLRDotElement, ANTLRElement,
 13 |     ANTLRLexerAlternation, ANTLRLexerElement, ANTLRLexerElements, ANTLRLexerRule,
 14 |     ANTLRNotSet, ANTLRRef, ANTLRRule, ANTLRSetElement, ANTLRString, ANTLRTokenRef
 15 | )
 16 | from .parser import ANTLRv4Lexer, ANTLRv4Parser
 17 | 
 18 | 
 19 | def analyze_grammars(grammars, replacements):
 20 |     """
 21 |     Determine the minimal parser rule replacements of the input grammar.
 22 | 
 23 |     :param antlr_lexer: Reference to the ANTLR4 lexer class.
 24 |     :param antlr_parser: Reference to the ANTLR4 parser class.
 25 |     :param grammars: List of the grammars describing the input format.
 26 |     :param replacements: Dictionary that contains the predefined minimal
 27 |         replacement of any lexer or parser rules. These won't be overridden
 28 |         later.
 29 |     :return: Pair of the replacement dictionary and the positions of quantified
 30 |         elements in the grammars.
 31 |     """
 32 | 
 33 |     def set_replacements(tree):
 34 |         """
 35 |         Set the minimal replacements of the various subtrees.
 36 | 
 37 |         :param tree: AST-like tree representation built by create_grammar_tree.
 38 |         """
 39 |         iterate = True
 40 |         # Iterate until any updates were performed.
 41 |         while iterate:
 42 |             iterate = False
 43 |             for e in tree:
 44 |                 # If all of the children have a min set:
 45 |                 s = isinstance(e, ANTLRLexerElement) and e.calc_starters()
 46 |                 r = e.calc_replacement()
 47 |                 if s or r:
 48 |                     iterate = True
 49 | 
 50 |     # Only those ParseTrees are present in our tree representation that
 51 |     # have real effect on the minimal replacements of the rules.
 52 |     # e.g. actions, channels, return values, syntax elements (like: |;:), etc
 53 |     # are avoided, but e.g. rule definitions, alternations, references,
 54 |     # token definitions or such nodes that can have quantifier are kept.
 55 |     def create_node(ctx, optional):
 56 |         """
 57 |         Create tree node of the lexer and parser subtrees.
 58 | 
 59 |         :param ctx: The ANTLRRuleContext object under processing.
 60 |         :param optional: Boolean indicating whether the current context/node is
 61 |             optional or not.
 62 |         :return: Node representation of the current context if needed, otherwise
 63 |             None.
 64 |         """
 65 | 
 66 |         # Parser rules.
 67 | 
 68 |         if isinstance(ctx, parser.ParserRuleSpecContext):
 69 |             # The parserRuleSpec rule contains 3 or 4 terminal tokens and one of them is the ID of the rule.
 70 |             # Since we cannot make a distinction between terminals at this point, they have to be referred
 71 |             # by indices. Since only the first terminal is optional indexing them from the back is safe
 72 |             # (the 3th from back is the rule ID).
 73 |             name = [x for x in ctx.children if isinstance(x, Tree.TerminalNodeImpl)][-3].symbol.text
 74 |             return ANTLRRule(name, repl=replacements.get(name, None))
 75 | 
 76 |         # Alternations need special handling since their minimal replacements are their shortest
 77 |         # child (in every other cases the children would be concatenated).
 78 |         if isinstance(ctx, (parser.AltListContext, parser.RuleAltListContext)):
 79 |             return ANTLRAlternation()
 80 | 
 81 |         # Node is created from Alternative to group its element+ children (it's a sequence).
 82 |         if isinstance(ctx, parser.AlternativeContext):
 83 |             return ANTLRAlternative(repl=('' if not ctx.children else None))
 84 | 
 85 |         # LabeledElement and Block are created since they can have quantifier.
 86 |         if isinstance(ctx, (parser.LabeledElementContext, parser.BlockContext)):
 87 |             return ANTLRElement(optional=optional)
 88 | 
 89 |         # Atom can also have quantifier. Furthermore it may have a terminal child
 90 |         # (DOT = matching any character) that has to be handled here.
 91 |         if isinstance(ctx, parser.AtomContext):
 92 |             if isinstance(ctx.children[0], Tree.TerminalNodeImpl):
 93 |                 assert ctx.children[0] == '.'
 94 |                 return ANTLRDotElement(optional=optional)
 95 |             # Create a base ANTLRElement anyway to make possible applying the quantifier
 96 |             # to the subtree.
 97 |             return ANTLRElement(optional=optional)
 98 | 
 99 |         # Only the reference is set here but in the next step the whole referenced
100 |         # subtree will be plugged as its child.
101 |         if isinstance(ctx, parser.RulerefContext):
102 |             assert ctx.getChildCount() == 1, 'RuleRef must have exactly one child.'
103 |             return ANTLRRef(ctx.children[0].symbol.text, optional=optional)
104 | 
105 |         # Lexer rules.
106 | 
107 |         # The main difference between parser and lexer rules in this representation is that
108 |         # lexer rules have an additional field (start_intervals) that aims to track all the
109 |         # possible character ranges that a given token can start with. The purpose of this
110 |         # is being able to generate minimal replacement for a negated lexer rule: having
111 |         # all the possible character intervals that a lexer rule can start with we can easily
112 |         # invert these ranges.
113 |         if isinstance(ctx, parser.LexerRuleSpecContext):
114 |             # Just like at ANTLRRule, the 3rd terminal from the back contains the name of the lexer rule.
115 |             name = [x for x in ctx.children if isinstance(x, Tree.TerminalNodeImpl)][-3].symbol.text
116 |             return ANTLRLexerRule(name, repl=replacements.get(name, None))
117 | 
118 |         # The same logic as with parser alternations.
119 |         if isinstance(ctx, parser.LexerAltListContext):
120 |             return ANTLRLexerAlternation()
121 | 
122 |         # The special about LexerAlt is that it can have an empty child which makes
123 |         # possible such alternations in lexer like: ('a'| ). Capturing an empty LexerAlt
124 |         # construction is only possible here, in which case its minimal replacement is
125 |         # the empty string.
126 |         if isinstance(ctx, parser.LexerAltContext):
127 |             # If the alternative has no children means that it's left explicitly empty.
128 |             return ANTLRLexerElement(repl=('' if not ctx.children else None))
129 | 
130 |         # The special about LexerElements is that by determining its start character range
131 |         # is enough to get the first character of its first child (since it's a token sequence).
132 |         if isinstance(ctx, parser.LexerElementsContext):
133 |             return ANTLRLexerElements()
134 | 
135 |         # LexerBlock is created since it can have quantifier.
136 |         if isinstance(ctx, parser.LexerBlockContext):
137 |             return ANTLRLexerElement(optional=optional)
138 | 
139 |         # LexerAtom can also have quantifier. Furthermore it may have terminal children
140 |         # (DOT or character set) that has to be handled here.
141 |         if isinstance(ctx, parser.LexerAtomContext):
142 |             if isinstance(ctx.children[0], Tree.TerminalNodeImpl):
143 |                 content = ctx.children[0].symbol.text
144 |                 if content == '.':
145 |                     return ANTLRDotElement(optional=optional)
146 |                 if content.startswith('['):
147 |                     return ANTLRSetElement(content, optional=optional)
148 |                 assert False
149 |             # Create a base ANTLRLexerElement anyway to make possible applying the
150 |             # quantifier to the subtree.
151 |             return ANTLRLexerElement(optional=optional)
152 | 
153 |         if isinstance(ctx, parser.CharacterRangeContext):
154 |             # The 1st and 3rd token of a character range defines its boundaries.
155 |             return ANTLRCharacterRange(ctx.children[0].symbol.text[1:-1], ctx.children[2].symbol.text[1:-1])
156 | 
157 |         if isinstance(ctx, parser.TerminalContext):
158 |             # Terminal node is either a string literal or a token reference.
159 |             content = ctx.children[0].symbol.text
160 |             if content.startswith(('"', '\'')):
161 |                 return ANTLRString(content[1:-1])
162 |             return ANTLRTokenRef(content)
163 | 
164 |         if isinstance(ctx, parser.NotSetContext):
165 |             return ANTLRNotSet()
166 | 
167 |         # SetElement is the lexer rule that will be negated.
168 |         if isinstance(ctx, parser.SetElementContext):
169 |             # If the first child is a terminal node then it must be one of the followings:
170 |             # token_ref, string_literal or char set.
171 |             if isinstance(ctx.children[0], Tree.TerminalNodeImpl):
172 |                 if ctx.children[0].symbol.text.isupper():
173 |                     return ANTLRTokenRef(ctx.children[0].symbol.text)
174 |                 return ANTLRSetElement(ctx.children[0].symbol.text)
175 |             # In this case we have a character range.
176 |             return ANTLRSetElement()
177 | 
178 |         # Tokens without lexer rules.
179 | 
180 |         # Identifiers in a TokensSpec are definitions of token names without an
181 |         # associated lexer rule. We don't know anything about them, but they are
182 |         # added with a dummy representation to the tree to avoid dead links (as
183 |         # they may be referenced from other (parser) rules).
184 |         if isinstance(ctx, parser.IdentifierContext) and isinstance(ctx.parentCtx, parser.IdListContext) and isinstance(ctx.parentCtx.parentCtx, parser.TokensSpecContext):
185 |             return ANTLRLexerRule(str(ctx.TOKEN_REF()), repl='')
186 | 
187 |         return None
188 | 
189 |     def get_quantifier(children, idx):
190 |         """
191 |         Check whether a quantifier is defined on the idx-th children.
192 | 
193 |         :param children: All the siblings of the current node.
194 |         :param idx: The index of the current node among the siblings.
195 |         :return: Quantifier string of the idx-th context if one is defined, None
196 |             otherwise.
197 |         """
198 |         if len(children) <= idx + 1:
199 |             return None
200 |         suffix = None
201 |         if isinstance(children[idx + 1], parser.EbnfSuffixContext):
202 |             suffix = children[idx + 1].start.text
203 |         elif isinstance(children[idx + 1], parser.BlockSuffixContext):
204 |             suffix = children[idx + 1].children[0].start.text
205 |         return suffix
206 | 
207 |     def is_optional(quantifier):
208 |         """
209 |         Check whether a quantifier string makes its quantified expression
210 |         optional, i.e., if it allows the expression to occur 0 times.
211 | 
212 |         :param quantifier: Quantifier string.
213 |         :return: Boolean indicating whether the quantifier is optional or not.
214 |         """
215 |         return quantifier.startswith(('*', '?'))
216 | 
217 |     def create_grammar_tree(node, positions, parent_idx, optional, parser_rule):
218 |         """
219 |         Creates a tree representation of the target parser grammar to facilitate
220 |         the generation of minimal replacement strings.
221 | 
222 |         :param node: The ANTLR parser tree whose representation will be inserted
223 |             now.
224 |         :param positions: Dictionary describing positions in grammars where
225 |             optional actions should be injected.
226 |         :param parent_idx: The index of the parent node in the elements list or
227 |             None if without parent.
228 |         :param optional: Boolean deciding if the current node is optional or
229 |             not.
230 |         :param parser_rule: Boolean value indicating if a parser rule being
231 |             processed.
232 |         """
233 |         element = create_node(node, optional)
234 |         if element:
235 |             elements.append(element)
236 |             idx = len(elements) - 1
237 |             if parent_idx is not None:
238 |                 elements[parent_idx].children.append(element)
239 |         else:
240 |             idx = parent_idx
241 | 
242 |         if node.getChildCount() > 0:
243 |             # TerminalNodeImpl nodes already have been added by create_node
244 |             # when processing their parent since at this point we don't know their type.
245 |             for i, c in enumerate(x for x in node.children if not isinstance(x, Tree.TerminalNodeImpl)):
246 |                 quantifier = get_quantifier(node.children, i)
247 | 
248 |                 # Mark positions in parser rules that have any quantifier applied on them.
249 |                 if quantifier and parser_rule:
250 |                     start_token = parser.getInputStream().get(c.getSourceInterval()[0])
251 |                     end_token = parser.getInputStream().get(c.getSourceInterval()[1])
252 | 
253 |                     start_ln = start_token.line
254 |                     start = start_token.column
255 | 
256 |                     line_breaks = end_token.text.count('\n')
257 |                     end_ln = end_token.line + line_breaks
258 |                     end = end_token.column + len(end_token.text) if not line_breaks else \
259 |                         len(end_token.text) - end_token.text.rfind('\n') + 1
260 | 
261 |                     if start_ln not in positions:
262 |                         positions[start_ln] = []
263 |                     if end_ln not in positions:
264 |                         positions[end_ln] = []
265 | 
266 |                     positions[start_ln].append(('s', start))
267 |                     positions[end_ln].append(('e', end))
268 | 
269 |                 create_grammar_tree(c, positions, idx, quantifier and is_optional(quantifier),
270 |                                     parser_rule and not isinstance(element, ANTLRLexerRule))
271 | 
272 |     # EOF is a special token provided by the ANTLR framework. It's added preliminarily to
273 |     # our tree to avoid dead links to it.
274 |     elements = [ANTLRLexerRule('EOF', repl='')]
275 |     action_positions = {}
276 |     replacements = replacements if replacements else {}
277 |     # Fill elements with node representations.
278 |     for grammar in grammars:
279 |         action_positions[grammar] = {}
280 |         parser = ANTLRv4Parser(CommonTokenStream(ANTLRv4Lexer(FileStream(grammar, 'utf-8'))))
281 |         create_grammar_tree(parser.grammarSpec(), action_positions[grammar], None, False, True)
282 | 
283 |     # Create mapping between references and indices of antlr_tree to be able to plug the
284 |     # appropriate subtrees into reference nodes.
285 |     rules = dict((x.name, i) for i, x in enumerate(elements) if isinstance(x, (ANTLRRule, ANTLRLexerRule)))
286 | 
287 |     # Plug the referred node under the referrers.
288 |     for i, x in enumerate(elements):
289 |         if isinstance(x, (ANTLRRef, ANTLRTokenRef)):
290 |             assert not elements[i].children, 'Referrer nodes must not contain children.'
291 |             elements[i].children = [elements[rules[x.ref]]]
292 | 
293 |     # Associate tree nodes with minimal string replacements.
294 |     set_replacements(elements)
295 |     return dict((x.name, x.replacement) for x in elements if isinstance(x, (ANTLRRule, ANTLRLexerRule))), action_positions
296 | 


--------------------------------------------------------------------------------
/src/picireny/antlr4/hdd_tree_builder.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2016-2023 Renata Hodovan, Akos Kiss.
  2 | #
  3 | # Licensed under the BSD 3-Clause License
  4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
  5 | # This file may not be copied, modified, or distributed except
  6 | # according to those terms.
  7 | 
  8 | import logging
  9 | import re
 10 | import shutil
 11 | import sys
 12 | 
 13 | from glob import glob
 14 | from os import makedirs, pathsep
 15 | from os.path import basename, join
 16 | from pkgutil import get_data
 17 | from string import Template
 18 | from subprocess import CalledProcessError, PIPE, run, STDOUT
 19 | 
 20 | import xson
 21 | 
 22 | from antlr4 import CommonTokenStream, error, InputStream, Token
 23 | from antlr4.Token import CommonToken
 24 | 
 25 | from .grammar_analyzer import analyze_grammars
 26 | from .parser_builder import build_grammars
 27 | from ..hdd_tree import HDDRule, HDDToken, Position
 28 | from ..transform import remove_empty_nodes
 29 | 
 30 | 
 31 | logger = logging.getLogger(__name__)
 32 | 
 33 | 
 34 | class HDDQuantifier(HDDRule):
 35 |     """
 36 |     Special rule type in the HDD tree to support optional quantifiers.
 37 |     """
 38 |     def __init__(self, *, start=None, end=None):
 39 |         super().__init__('', start=start, end=end)
 40 | 
 41 | 
 42 | class HDDHiddenToken(HDDToken):
 43 |     """
 44 |     Special token type that represents tokens from hidden channels.
 45 |     """
 46 | 
 47 | 
 48 | class HDDErrorToken(HDDToken):
 49 |     """
 50 |     Special token type that represents unmatched tokens. The minimal replacement
 51 |     of such nodes is an empty string.
 52 |     """
 53 |     def __init__(self, text, *, start=None, end=None):
 54 |         super().__init__('', text, start=start, end=end)
 55 | 
 56 | 
 57 | # Override ConsoleErrorListener to suppress parse issues in non-verbose mode.
 58 | class ConsoleListener(error.ErrorListener.ConsoleErrorListener):
 59 |     def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
 60 |         logger.debug('line %d:%d %s', line, column, msg)
 61 | 
 62 | 
 63 | error.ErrorListener.ConsoleErrorListener.INSTANCE = ConsoleListener()
 64 | 
 65 | 
 66 | def create_hdd_tree(src, *,
 67 |                     input_format, start,
 68 |                     antlr, lang='python',
 69 |                     hidden_tokens=False,
 70 |                     work_dir):
 71 |     """
 72 |     Build a tree that the HDD algorithm can work with.
 73 | 
 74 |     :param src: Input source.
 75 |     :param input_format: Dictionary describing the input format.
 76 |     :param start: Name of the start rule in [grammarname:]rulename format.
 77 |     :param antlr: Path to the ANTLR4 tool (Java jar binary).
 78 |     :param lang: The target language of the parser.
 79 |     :param hidden_tokens: Build hidden tokens of the input format into the HDD
 80 |         tree.
 81 |     :param work_dir: Working directory.
 82 |     :return: The root of the created HDD tree.
 83 |     """
 84 | 
 85 |     def inject_optional_actions(grammar, positions, target_file):
 86 |         """
 87 |         Update the original parser grammar by injecting actions to the start and
 88 |         end of every quantified part.
 89 | 
 90 |         :param grammar: Path to the grammar to be updated.
 91 |         :param positions: Start and end locations of quantified elements.
 92 |         :param target_file: Path to the updated grammar.
 93 |         """
 94 |         with open(grammar, 'rb') as f:
 95 |             lines = f.read().splitlines(True)
 96 | 
 97 |         languages = {
 98 |             'python': {
 99 |                 'prefix': b'({self.enter_optional()} ',
100 |                 'postfix': b' {self.exit_optional()})'
101 |             },
102 |             'java': {
103 |                 'prefix': b'({ try { getClass().getMethod("enter_optional").invoke(this); } catch (Exception e) { assert false; }} ',
104 |                 'postfix': b' { try { getClass().getMethod("exit_optional").invoke(this); } catch (Exception e) { assert false; }})'
105 |             }
106 |         }
107 | 
108 |         for ln in positions:
109 |             offset = 0
110 |             for position in sorted(positions[ln], key=lambda x: x[1]):
111 |                 if position[0] == 's':
112 |                     lines[ln - 1] = lines[ln - 1][0:position[1] + offset] + languages[lang]['prefix'] + lines[ln - 1][position[1] + offset:]
113 |                     offset += len(languages[lang]['prefix'])
114 |                 elif position[0] == 'e':
115 |                     lines[ln - 1] = lines[ln - 1][0:position[1] + offset] + languages[lang]['postfix'] + lines[ln - 1][position[1] + offset:]
116 |                     offset += len(languages[lang]['postfix'])
117 | 
118 |         with open(target_file, 'wb') as f:
119 |             f.write(b''.join(lines))
120 | 
121 |     def java_classpath(current_workdir):
122 |         return pathsep.join([antlr, current_workdir])
123 | 
124 |     def compile_java_sources(lexer, parser, listener, current_workdir):
125 |         executor = Template(get_data(__package__, 'resources/ExtendedTargetParser.java').decode('utf-8'))
126 |         with open(join(current_workdir, f'Extended{parser}.java'), 'w') as f:
127 |             f.write(executor.substitute({'lexer_class': lexer,
128 |                                          'parser_class': parser,
129 |                                          'listener_class': listener}))
130 |         try:
131 |             run(('javac', '-classpath', java_classpath(current_workdir)) + tuple(basename(j) for j in glob(join(current_workdir, '*.java'))),
132 |                 stdout=PIPE, stderr=STDOUT, cwd=current_workdir, check=True)
133 |         except CalledProcessError as e:
134 |             logger.error('Java compile failed!\n%s\n', e.output)
135 |             raise
136 | 
137 |     def prepare_parsing(grammar_name):
138 |         """
139 |         Performs initiative steps needed to parse the input test case (like
140 |         create directory structures, builds grammars, sets PATH, etc...)
141 | 
142 |         :param grammar_name: Name of the grammar to use for parsing.
143 |         """
144 |         grammar = input_format[grammar_name]
145 |         resources = [fn for fn in grammar['files'] if not fn.endswith('.g4')]
146 |         grammar['files'] = [fn for fn in grammar['files'] if fn.endswith('.g4')]
147 | 
148 |         replacements, action_positions = analyze_grammars(grammar['files'], grammar['replacements'])
149 |         logger.debug('Replacements are calculated...')
150 | 
151 |         current_workdir = join(work_dir, grammar_name) if grammar_name else work_dir
152 |         makedirs(current_workdir, exist_ok=True)
153 |         if current_workdir not in sys.path:
154 |             sys.path.append(current_workdir)
155 | 
156 |         # Inject actions into the target grammars to help localizing part of the test case that are optional.
157 |         for i, g in enumerate(grammar['files']):
158 |             grammar['files'][i] = join(current_workdir, basename(g))
159 |             inject_optional_actions(g, action_positions[g], grammar['files'][i])
160 | 
161 |         for r in resources:
162 |             shutil.copy(r, current_workdir)
163 | 
164 |         target_lexer_class, target_parser_class, target_listener_class = build_grammars(tuple(grammar['files']), current_workdir, antlr, lang)
165 |         logger.debug('Target grammars are processed...')
166 | 
167 |         if lang == 'java':
168 |             compile_java_sources(target_lexer_class, target_parser_class, target_listener_class, current_workdir)
169 |             input_format[grammar_name].update(lexer=target_lexer_class, parser=target_parser_class, listener=target_listener_class, replacements=replacements)
170 |             return
171 | 
172 |         class ExtendedTargetLexer(target_lexer_class):
173 |             """
174 |             ExtendedTargetLexer is a subclass of the original lexer
175 |             implementation. It can recognize skipped tokens and instead of
176 |             eliminating them from the parser they can be redirected to the
177 |             dedicated PICIRENY_CHANNEL for later use.
178 |             """
179 | 
180 |             PICIRENY_CHANNEL = -3
181 | 
182 |             # Skipped tokens cannot be accessed from the parser but we still need them to
183 |             # unparse test cases correctly. Sending these tokens to a dedicated channel won't
184 |             # alter the parse but makes these tokens available.
185 |             def skip(self):
186 |                 self._channel = self.PICIRENY_CHANNEL
187 | 
188 |         class ExtendedTargetParser(target_parser_class):
189 |             """
190 |             ExtendedTargetParser is a subclass of the original parser
191 |             implementation. It can trigger state changes that are needed to
192 |             identify parts of the input that are not needed to keep it
193 |             syntactically correct.
194 |             """
195 |             def enter_optional(self):
196 |                 self.trigger_listener('enter_optional')
197 | 
198 |             def exit_optional(self):
199 |                 self.trigger_listener('exit_optional')
200 | 
201 |             def enterRecursionRule(self, localctx, state, ruleIndex, precedence):
202 |                 super().enterRecursionRule(localctx, state, ruleIndex, precedence)
203 |                 self.trigger_listener('recursion_enter')
204 | 
205 |             def pushNewRecursionContext(self, localctx, state, ruleIndex):
206 |                 super().pushNewRecursionContext(localctx, state, ruleIndex)
207 |                 self.trigger_listener('recursion_push')
208 | 
209 |             def unrollRecursionContexts(self, parentCtx):
210 |                 super().unrollRecursionContexts(parentCtx)
211 |                 self.trigger_listener('recursion_unroll')
212 | 
213 |             def trigger_listener(self, event):
214 |                 for listener in self.getParseListeners():
215 |                     if hasattr(listener, event):
216 |                         getattr(listener, event)()
217 | 
218 |             def syntax_error_warning(self):
219 |                 if self.getNumberOfSyntaxErrors() > 0:
220 |                     logger.warning('%s finished with %d syntax errors. This may decrease reduce quality.',
221 |                                    target_parser_class.__name__, self.getNumberOfSyntaxErrors())
222 | 
223 |         class ExtendedTargetListener(target_listener_class):
224 |             """
225 |             ExtendedTargetListener is a subclass of the original listener
226 |             implementation. It can trigger state changes that are needed to
227 |             identify parts of the input that are not needed to keep it
228 |             syntactically correct.
229 |             """
230 |             def __init__(self, parser):
231 |                 self.parser = parser
232 |                 self.current_node = None
233 |                 self.root = None
234 |                 self.seen_terminal = False
235 |                 self.island_nodes = []
236 | 
237 |             def recursion_enter(self):
238 |                 assert isinstance(self.current_node, HDDRule)
239 |                 node = HDDRule(self.current_node.name)
240 |                 self.current_node.add_child(node)
241 |                 self.current_node.recursive_rule = True
242 |                 self.current_node = node
243 | 
244 |             def recursion_push(self):
245 |                 assert self.current_node.parent.children
246 | 
247 |                 first_child = self.current_node.parent.children[0]
248 |                 self.current_node.parent.remove_child(first_child)
249 |                 self.current_node.add_child(first_child)
250 | 
251 |             def recursion_unroll(self):
252 |                 assert self.current_node.recursive_rule
253 |                 assert len(self.current_node.children) == 1 and self.current_node.name == self.current_node.children[0].name
254 |                 children_to_lift = self.current_node.children[0].children
255 |                 parent = self.current_node.parent
256 |                 if children_to_lift:
257 |                     self.current_node.children = []
258 |                     self.current_node.add_children(children_to_lift)
259 |                 else:
260 |                     parent.remove_child(self.current_node)
261 |                 self.current_node = parent
262 | 
263 |             def enterEveryRule(self, ctx):
264 |                 name = self.parser.ruleNames[ctx.getRuleIndex()]
265 |                 node = HDDRule(name)
266 |                 if not self.root:
267 |                     self.root = node
268 |                 else:
269 |                     assert self.current_node
270 |                     self.current_node.add_child(node)
271 |                 self.current_node = node
272 | 
273 |             def exitEveryRule(self, ctx):
274 |                 # If the input contains syntax error, then the last optional block was may not closed.
275 |                 while isinstance(self.current_node, HDDQuantifier):
276 |                     self.exit_optional()
277 | 
278 |                 assert self.current_node.name == self.parser.ruleNames[ctx.getRuleIndex()], \
279 |                     f'{self.current_node.name} ({self.current_node!r}) != {self.parser.ruleNames[ctx.getRuleIndex()]}'
280 | 
281 |                 if self.current_node.parent:
282 |                     self.current_node = self.current_node.parent
283 | 
284 |             def tokenBoundaries(self, token):
285 |                 start = Position(token.line, token.column)
286 |                 return start, start.after(token.text)
287 | 
288 |             def addToken(self, node, child):
289 |                 if not self.seen_terminal:
290 |                     hidden_tokens = self.parser.getTokenStream().getHiddenTokensToLeft(node.symbol.tokenIndex, -1) or []
291 |                     for token in hidden_tokens:
292 |                         start, end = self.tokenBoundaries(token)
293 |                         self.current_node.add_child(HDDHiddenToken(self.parser.symbolicNames[token.type], token.text,
294 |                                                                    start=start, end=end))
295 |                 self.seen_terminal = True
296 | 
297 |                 self.current_node.add_child(child)
298 | 
299 |                 hidden_tokens = self.parser.getTokenStream().getHiddenTokensToRight(node.symbol.tokenIndex, -1) or []
300 |                 for token in hidden_tokens:
301 |                     start, end = self.tokenBoundaries(token)
302 |                     self.current_node.add_child(HDDHiddenToken(self.parser.symbolicNames[token.type], token.text,
303 |                                                                start=start, end=end))
304 | 
305 |             def visitTerminal(self, node):
306 |                 token = node.symbol
307 |                 name, text = (self.parser.symbolicNames[token.type], token.text) if token.type != Token.EOF else ('EOF', '')
308 |                 start, end = self.tokenBoundaries(token)
309 | 
310 |                 child = HDDToken(name, text, start=start, end=end)
311 |                 self.addToken(node, child)
312 |                 if name in grammar['islands']:
313 |                     self.island_nodes.append(child)
314 | 
315 |             def visitErrorNode(self, node):
316 |                 if hasattr(node, 'symbol'):
317 |                     token = node.symbol
318 |                     start, end = self.tokenBoundaries(token)
319 |                     self.addToken(node, HDDErrorToken(token.text, start=start, end=end))
320 | 
321 |             def enter_optional(self):
322 |                 quant_node = HDDQuantifier()
323 |                 self.current_node.add_child(quant_node)
324 |                 self.current_node = quant_node
325 | 
326 |             def exit_optional(self):
327 |                 assert self.current_node.parent, 'Quantifier node has no parent.'
328 |                 assert self.current_node.children, 'Quantifier node has no children.'
329 | 
330 |                 self.current_node = self.current_node.parent
331 | 
332 |         input_format[grammar_name].update(lexer=ExtendedTargetLexer, parser=ExtendedTargetParser, listener=ExtendedTargetListener, replacements=replacements)
333 | 
334 |     class ExtendedErrorListener(error.ErrorListener.ErrorListener):
335 | 
336 |         def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
337 |             t = CommonToken(source=(recognizer, recognizer._input),
338 |                             type=Token.INVALID_TYPE,
339 |                             channel=Token.DEFAULT_CHANNEL,
340 |                             start=recognizer._tokenStartCharIndex,
341 |                             stop=recognizer._tokenStartCharIndex)
342 |             t.line = recognizer._tokenStartLine
343 |             t.column = recognizer._tokenStartColumn
344 |             recognizer._type = Token.MIN_USER_TOKEN_TYPE
345 |             recognizer.emitToken(t)
346 | 
347 |     def build_hdd_tree(src, grammar_name, start_rule):
348 |         """
349 |         Parse the input with the provided ANTLR classes.
350 | 
351 |         :param src: Input source.
352 |         :param grammar_name: Name of the grammar to use for parsing.
353 |         :param start_rule: The name of the start rule of the parser.
354 |         :return: The root of the created HDD tree.
355 |         """
356 | 
357 |         grammar = input_format[grammar_name]
358 |         island_nodes = []
359 | 
360 |         def set_replacement(node):
361 |             if isinstance(node, (HDDQuantifier, HDDErrorToken)):
362 |                 node.replace = ''
363 |             elif isinstance(node, HDDRule):
364 |                 node.replace = grammar['replacements'][node.name]
365 |             else:
366 |                 node.replace = grammar['replacements'].get(node.name, node.text)
367 | 
368 |             if isinstance(node, HDDRule):
369 |                 for child in node.children:
370 |                     set_replacement(child)
371 | 
372 |         logger.debug('Parse input with %s rule', start_rule)
373 |         if lang != 'python':
374 | 
375 |             def hdd_tree_from_dict(node_dict):
376 |                 # Convert interval dictionaries to Position objects.
377 |                 if 'start' in node_dict:
378 |                     node_dict['start'] = Position(**node_dict['start'])
379 |                 if 'end' in node_dict:
380 |                     node_dict['end'] = Position(**node_dict['end'])
381 | 
382 |                 name = node_dict.get('name', None)
383 |                 children = node_dict.pop('children', None)
384 |                 cls = globals()[node_dict.pop('type')]
385 |                 node = cls(**node_dict)
386 | 
387 |                 if children:
388 |                     for child in children:
389 |                         node.add_child(hdd_tree_from_dict(child))
390 |                 elif name:
391 |                     if name in grammar['islands']:
392 |                         island_nodes.append(node)
393 |                 return node
394 | 
395 |             try:
396 |                 current_workdir = join(work_dir, grammar_name) if grammar_name else work_dir
397 |                 proc = run(('java', '-classpath', java_classpath(current_workdir), f'Extended{grammar["parser"]}', start_rule),
398 |                            input=src, stdout=PIPE, stderr=PIPE, universal_newlines=True, cwd=current_workdir, check=True)
399 |                 if proc.stderr:
400 |                     logger.debug(proc.stderr)
401 |                 result = xson.loads(proc.stdout)
402 |                 tree_root = hdd_tree_from_dict(result)
403 |             except CalledProcessError as e:
404 |                 logger.error('Java parser failed!\n%s\n%s', e.stdout, e.stderr)
405 |                 raise
406 |         else:
407 |             lexer = grammar['lexer'](InputStream(src))
408 |             lexer.addErrorListener(ExtendedErrorListener())
409 |             target_parser = grammar['parser'](CommonTokenStream(lexer))
410 |             parser_listener = grammar['listener'](target_parser)
411 |             target_parser.addParseListener(parser_listener)
412 | 
413 |             getattr(target_parser, start_rule)()
414 |             target_parser.syntax_error_warning()
415 |             island_nodes = parser_listener.island_nodes
416 |             assert parser_listener.root == parser_listener.current_node
417 |             tree_root = parser_listener.root
418 | 
419 |         # Traverse the HDD tree and set minimal replacements for nodes.
420 |         set_replacement(tree_root)
421 |         process_island_nodes(island_nodes, grammar['islands'])
422 |         logger.debug('Parse done.')
423 |         return tree_root
424 | 
425 |     def process_island_nodes(island_nodes, island_format):
426 |         for node in island_nodes:
427 |             if not isinstance(island_format[node.name], tuple):
428 |                 rewritten, mapping = rename_regex_groups(island_format[node.name])
429 |                 for new_name, old_name in mapping.items():
430 |                     grammar_name, rule_name = split_grammar_rule_name(old_name)
431 |                     mapping[new_name] = (grammar_name, rule_name)
432 |                     if 'lexer' not in input_format[grammar_name]:
433 |                         prepare_parsing(grammar_name)
434 |                 island_format[node.name] = (re.compile(rewritten, re.S), mapping)
435 | 
436 |             new_node = HDDRule(node.name, replace=node.replace)
437 |             new_node.add_children(build_island_subtree(node, *island_format[node.name]))
438 |             node.replace_with(new_node)
439 | 
440 |     def build_island_subtree(node, pattern, mapping):
441 |         """
442 |         Process terminal with an island grammar.
443 | 
444 |         :param node: HDDToken object containing island language.
445 |         :return: List of HDDTree nodes representing the `children` of node.
446 |         """
447 |         last_processed = 0
448 |         content = node.text
449 |         children = []
450 | 
451 |         # Intervals describes a non-overlapping splitting of the content according to the pattern.
452 |         intervals = []
453 |         for m in re.finditer(pattern, content):
454 |             intervals.extend((g, m.start(g), m.end(g)) for g in list(pattern.groupindex.keys()) if m.start(g) != m.end(g))
455 |         intervals.sort(key=lambda x: (x[1], x[2]))
456 | 
457 |         def shift_positions(node, start):
458 |             if node.start:
459 |                 node.start.shift(start)
460 |             if node.end:
461 |                 node.end.shift(start)
462 | 
463 |             if isinstance(node, HDDRule):
464 |                 for child in node.children:
465 |                     shift_positions(child, start)
466 | 
467 |         for interval in intervals:
468 |             # Create simple HDDToken of the substring proceeding a subgroup.
469 |             if last_processed < interval[1]:
470 |                 token_start = node.start.after(content[0:last_processed])
471 |                 token_text = content[last_processed:interval[1]]
472 |                 children.append(HDDToken('', token_text,
473 |                                          start=token_start,
474 |                                          end=token_start.after(token_text),
475 |                                          replace=token_text))
476 | 
477 |             # Process an island and save its subtree.
478 |             island_start = node.start.after(content[0:interval[1]])
479 |             island_root = build_hdd_tree(src=content[interval[1]:interval[2]],
480 |                                          grammar_name=mapping[interval[0]][0],
481 |                                          start_rule=mapping[interval[0]][1])
482 |             shift_positions(island_root, island_start)
483 |             children.append(island_root)
484 | 
485 |             last_processed = interval[2]
486 | 
487 |         # Create simple HDDToken of the substring following the last subgroup if any.
488 |         if last_processed < len(content):
489 |             token_start = node.start.after(content[0:last_processed])
490 |             token_text = content[last_processed:]
491 |             children.append(HDDToken('', token_text,
492 |                                      start=token_start,
493 |                                      end=token_start.after(token_text),
494 |                                      replace=token_text))
495 |         return children
496 | 
497 |     def calculate_rule_boundaries(node):
498 |         if isinstance(node, HDDRule):
499 |             for child in node.children:
500 |                 calculate_rule_boundaries(child)
501 | 
502 |             node.start = node.children[0].start
503 |             node.end = node.children[-1].end
504 | 
505 |         return node
506 | 
507 |     def remove_hidden_tokens(node):
508 |         if isinstance(node, HDDRule):
509 |             non_hidden_children = []
510 | 
511 |             for child in node.children:
512 |                 if not isinstance(child, HDDHiddenToken):
513 |                     remove_hidden_tokens(child)
514 |                     non_hidden_children.append(child)
515 | 
516 |             node.children[:] = non_hidden_children
517 | 
518 |         return node
519 | 
520 |     _NAMED_GRP_PATTERN = re.compile(r'(?<!\\)(\(\?P<[^>]*>)')   # "(?P<NAME>" not prefixed by a "\"
521 |     _NAMED_GRP_PREFIX = '(?P<'
522 |     _NAMED_GRP_SUFFIX = '>'
523 |     _NAMED_REF_PATTERN = re.compile(r'(?<!\\)(\(\?P=[^)]*\))')  # "(?P=NAME)" not prefixed by a "\"
524 |     _NAMED_REF_PREFIX = '(?P='
525 |     _NAMED_REF_SUFFIX = ')'
526 | 
527 |     def rename_regex_groups(pattern):
528 |         """
529 |         Rewrite capture group names in a regex pattern to ensure that the names
530 |         are valid Python identifiers (as expected by the ``re`` module). This
531 |         enables more sophisticated capture group names than allowed by default.
532 | 
533 |         :param str pattern: the original regex pattern with potentially extended
534 |             syntax for capture group names.
535 |         :return: the rewritten regex pattern and a mapping from the newly
536 |             introduced capture group names (which are guaranteed to by valid
537 |             Python identifiers) to the names used in the original pattern.
538 |         :rtype: tuple(str, dict(str, str))
539 | 
540 |         .. note::
541 | 
542 |            The function expects ``pattern`` to be syntactically valid. Its
543 |            behavior is undefined for erroneous input.
544 |         """
545 | 
546 |         grp_rewritten = ''
547 |         mapping = {}
548 |         rmapping = {}
549 |         cnt = 1
550 |         for item in _NAMED_GRP_PATTERN.split(pattern):
551 |             if _NAMED_GRP_PATTERN.match(item):
552 |                 old_name = item[len(_NAMED_GRP_PREFIX):-len(_NAMED_GRP_SUFFIX)]
553 |                 new_name = 'G' + str(cnt)
554 |                 cnt += 1
555 | 
556 |                 mapping[new_name] = old_name
557 |                 rmapping[old_name] = new_name
558 | 
559 |                 item = _NAMED_GRP_PREFIX + new_name + _NAMED_GRP_SUFFIX
560 | 
561 |             grp_rewritten += item
562 | 
563 |         ref_rewritten = ''
564 |         for item in _NAMED_REF_PATTERN.split(grp_rewritten):
565 |             if _NAMED_REF_PATTERN.match(item):
566 |                 old_name = item[len(_NAMED_REF_PREFIX):-len(_NAMED_REF_SUFFIX)]
567 |                 new_name = rmapping.get(old_name, old_name)
568 | 
569 |                 item = _NAMED_REF_PREFIX + new_name + _NAMED_REF_SUFFIX
570 | 
571 |             ref_rewritten += item
572 | 
573 |         return ref_rewritten, mapping
574 | 
575 |     def split_grammar_rule_name(name):
576 |         """
577 |         Determine the grammar and the rule parts in a potentially
578 |         grammar-prefixed rule name. The syntax for the prefixed format is
579 |         "[grammar:]rule", where "[]" denote optionality and the default for a
580 |         missing grammar part is the empty string.
581 | 
582 |         :param str name: a potentially grammar-prefixed rule name.
583 |         :return: a 2-tuple of the grammar and the rule name parts.
584 |         :rtype: tuple(str, str)
585 |         """
586 | 
587 |         names = name.split(':', 1)
588 |         if len(names) < 2:
589 |             names.insert(0, '')
590 |         return names[0], names[1]
591 | 
592 |     start_grammar, start_rule = split_grammar_rule_name(start)
593 |     prepare_parsing(start_grammar)
594 |     tree = build_hdd_tree(src=src,
595 |                           grammar_name=start_grammar,
596 |                           start_rule=start_rule)
597 |     if not hidden_tokens:
598 |         tree = remove_hidden_tokens(tree)
599 |     tree = remove_empty_nodes(tree)
600 |     tree = calculate_rule_boundaries(tree)
601 |     return tree
602 | 


--------------------------------------------------------------------------------
/src/picireny/antlr4/parser/LexerAdaptor.py:
--------------------------------------------------------------------------------
  1 | """
  2 |  [The "BSD licence"]
  3 |  Copyright (c) 2005-2007 Terence Parr
  4 |  All rights reserved.
  5 | 
  6 |  Redistribution and use in source and binary forms, with or without
  7 |  modification, are permitted provided that the following conditions
  8 |  are met:
  9 |  1. Redistributions of source code must retain the above copyright
 10 |     notice, this list of conditions and the following disclaimer.
 11 |  2. Redistributions in binary form must reproduce the above copyright
 12 |     notice, this list of conditions and the following disclaimer in the
 13 |     documentation and/or other materials provided with the distribution.
 14 |  3. The name of the author may not be used to endorse or promote products
 15 |     derived from this software without specific prior written permission.
 16 | 
 17 |  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 18 |  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 19 |  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 20 |  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 21 |  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 22 |  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 23 |  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 24 |  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 25 |  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 26 |  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 27 | """
 28 | from antlr4 import *
 29 | 
 30 | 
 31 | class LexerAdaptor(Lexer):
 32 | 
 33 |     """
 34 |       Track whether we are inside of a rule and whether it is lexical parser. _currentRuleType==Token.INVALID_TYPE
 35 |       means that we are outside of a rule. At the first sign of a rule name reference and _currentRuleType==invalid, we
 36 |       can assume that we are starting a parser rule. Similarly, seeing a token reference when not already in rule means
 37 |       starting a token rule. The terminating ';' of a rule, flips this back to invalid type.
 38 | 
 39 |       This is not perfect logic but works. For example, "grammar T;" means that we start and stop a lexical rule for
 40 |       the "T;". Dangerous but works.
 41 | 
 42 |       The whole point of this state information is to distinguish between [..arg actions..] and [charsets]. Char sets
 43 |       can only occur in lexical rules and arg actions cannot occur.
 44 |     """
 45 | 
 46 |     PREQUEL_CONSTRUCT = -10
 47 |     OPTIONS_CONSTRUCT = -11
 48 | 
 49 |     _currentRuleType = Token.INVALID_TYPE
 50 |     insideOptionsBlock = False
 51 | 
 52 |     def __init__(self, inp, output):
 53 |         Lexer.__init__(self, inp, output)
 54 | 
 55 |     def getCurrentRuleType(self):
 56 |         return self._currentRuleType
 57 | 
 58 |     def setCurrentRuleType(self, ruleType):
 59 |         self._currentRuleType = ruleType
 60 | 
 61 |     def handleBeginArgument(self):
 62 |         if self.inLexerRule():
 63 |             self.pushMode(self.LexerCharSet)
 64 |             self.more()
 65 |         else:
 66 |             self.pushMode(self.Argument)
 67 | 
 68 |     def handleEndArgument(self):
 69 |         self.popMode()
 70 |         if len(self._modeStack) > 0:
 71 |             self._type = self.ARGUMENT_CONTENT
 72 | 
 73 |     def handleEndAction(self):
 74 |         oldMode = self._mode
 75 |         newMode = self.popMode()
 76 |         isActionWithinAction = len(self._modeStack) > 0 and newMode == self.TargetLanguageAction and oldMode == newMode
 77 |         if isActionWithinAction:
 78 |             self._type = self.ACTION_CONTENT
 79 | 
 80 |     def emit(self):
 81 |         if (self._type == self.OPTIONS or self._type == self.TOKENS or self._type == self.CHANNELS) and self._currentRuleType == Token.INVALID_TYPE:
 82 |             self._currentRuleType = self.PREQUEL_CONSTRUCT
 83 |         elif self._type == self.OPTIONS and self._currentRuleType == self.TOKEN_REF:
 84 |             self._currentRuleType = self.OPTIONS_CONSTRUCT
 85 |         elif self._type == self.RBRACE and self._currentRuleType == self.PREQUEL_CONSTRUCT:
 86 |             self._currentRuleType = Token.INVALID_TYPE
 87 |         elif self._type == self.RBRACE and self._currentRuleType == self.OPTIONS_CONSTRUCT:
 88 |             self._currentRuleType = self.TOKEN_REF
 89 |         elif self._type == self.AT and self._currentRuleType == Token.INVALID_TYPE:
 90 |             self._currentRuleType = self.AT
 91 |         elif self._type == self.SEMI and self._currentRuleType == self.OPTIONS_CONSTRUCT:
 92 |             self._currentRuleType = self._currentRuleType
 93 |         elif self._type == self.END_ACTION and self._currentRuleType == self.AT:
 94 |             self._currentRuleType = Token.INVALID_TYPE
 95 |         elif self._type == self.ID:
 96 |             firstChar = self._input.getText(self._tokenStartCharIndex, self._tokenStartCharIndex)
 97 |             if firstChar[0].isupper():
 98 |                 self._type = self.TOKEN_REF
 99 |             else:
100 |                 self._type = self.RULE_REF
101 | 
102 |             if self._currentRuleType == Token.INVALID_TYPE:  # if outside of rule def
103 |                 self._currentRuleType = self._type  # set to inside lexer or parser rule
104 | 
105 |         elif self._type == self.SEMI:  # exit rule def
106 |             self._currentRuleType = Token.INVALID_TYPE
107 |         return Lexer.emit(self)
108 | 
109 |     def inLexerRule(self):
110 |         return self._currentRuleType == self.TOKEN_REF
111 | 
112 |     def inParserRule(self):  # not used, but added for clarity
113 |         return self._currentRuleType == self.RULE_REF
114 | 


--------------------------------------------------------------------------------
/src/picireny/antlr4/parser/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021 Renata Hodovan, Akos Kiss.
 2 | #
 3 | # Licensed under the BSD 3-Clause License
 4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
 5 | # This file may not be copied, modified, or distributed except
 6 | # according to those terms.
 7 | 
 8 | from .ANTLRv4Lexer import ANTLRv4Lexer
 9 | from .ANTLRv4Parser import ANTLRv4Parser
10 | 


--------------------------------------------------------------------------------
/src/picireny/antlr4/parser_builder.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2016-2022 Renata Hodovan, Akos Kiss.
 2 | #
 3 | # Licensed under the BSD 3-Clause License
 4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
 5 | # This file may not be copied, modified, or distributed except
 6 | # according to those terms.
 7 | 
 8 | import logging
 9 | 
10 | from os import listdir
11 | from os.path import basename, commonprefix, split, splitext
12 | from subprocess import CalledProcessError, PIPE, run, STDOUT
13 | 
14 | logger = logging.getLogger(__name__)
15 | grammar_cache = {}
16 | 
17 | 
18 | def build_grammars(grammars, out, antlr, lang='python'):
19 |     """
20 |     Build lexer and grammar from ANTLRv4 grammar files in Python target.
21 | 
22 |     :param grammars: Tuple of grammar files.
23 |     :param out: Output directory.
24 |     :param antlr: Path to the ANTLR4 tool (Java jar binary).
25 |     :param lang: The target language of the parser.
26 |     :return: List of references/names of the lexer, parser and listener classes
27 |         of the target.
28 |     """
29 | 
30 |     # Generate parser and lexer in the target language and return either with
31 |     # python class ref or the name of java classes.
32 |     if lang not in grammar_cache:
33 |         grammar_cache[lang] = {}
34 |     if grammars in grammar_cache[lang]:
35 |         logger.debug('%r is already built with %s target.', grammars, lang)
36 |         return grammar_cache[lang][grammars]
37 | 
38 |     try:
39 |         languages = {
40 |             'python': {'antlr_arg': '-Dlanguage=Python3', 'ext': 'py', 'listener_format': 'Listener'},
41 |             'java': {'antlr_arg': '-Dlanguage=Java', 'ext': 'java', 'listener_format': 'BaseListener'},
42 |         }
43 | 
44 |         try:
45 |             run(('java', '-jar', antlr, languages[lang]['antlr_arg'], '-o', out) + grammars,
46 |                 stdout=PIPE, stderr=STDOUT, cwd=out, check=True)
47 |         except CalledProcessError as e:
48 |             logger.error('Building grammars %r failed!\n%s\n', grammars, e.output)
49 |             raise
50 | 
51 |         files = listdir(out)
52 |         filename = basename(grammars[0])
53 | 
54 |         def file_endswith(end_pattern):
55 |             f = next(f for f in files if len(commonprefix([filename, f])) > 0 and f.endswith(end_pattern))
56 |             _, f = split(f)
57 |             f, _ = splitext(f)
58 |             return f
59 | 
60 |         # Extract the name of lexer and parser from their path.
61 |         lexer = file_endswith(f'Lexer.{languages[lang]["ext"]}')
62 |         parser = file_endswith(f'Parser.{languages[lang]["ext"]}')
63 |         # The name of the generated listeners differs if Python or other language target is used.
64 |         listener = file_endswith(f'{languages[lang]["listener_format"]}.{languages[lang]["ext"]}')
65 | 
66 |         if lang == 'python':
67 |             grammar_cache[lang][grammars] = [getattr(__import__(x, globals(), locals(), [x], 0), x) for x in [lexer, parser, listener]]
68 |         else:
69 |             grammar_cache[lang][grammars] = [lexer, parser, listener]
70 | 
71 |         return grammar_cache[lang][grammars]
72 |     except Exception as e:
73 |         logger.error('Exception while loading parser modules', exc_info=e)
74 |         raise
75 | 


--------------------------------------------------------------------------------
/src/picireny/antlr4/resources/ANTLRv4Lexer.g4:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * [The "BSD license"]
  3 |  *  Copyright (c) 2012-2015 Terence Parr
  4 |  *  Copyright (c) 2012-2015 Sam Harwell
  5 |  *  Copyright (c) 2015 Gerald Rosenberg
  6 |  *  All rights reserved.
  7 |  *
  8 |  *  Redistribution and use in source and binary forms, with or without
  9 |  *  modification, are permitted provided that the following conditions
 10 |  *  are met:
 11 |  *
 12 |  *  1. Redistributions of source code must retain the above copyright
 13 |  *     notice, this list of conditions and the following disclaimer.
 14 |  *  2. Redistributions in binary form must reproduce the above copyright
 15 |  *     notice, this list of conditions and the following disclaimer in the
 16 |  *     documentation and/or other materials provided with the distribution.
 17 |  *  3. The name of the author may not be used to endorse or promote products
 18 |  *     derived from this software without specific prior written permission.
 19 |  *
 20 |  *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 21 |  *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 22 |  *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 23 |  *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 24 |  *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 25 |  *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 26 |  *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 27 |  *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 28 |  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 29 |  *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30 |  */
 31 | /**
 32 |  *	A grammar for ANTLR v4 implemented using v4 syntax
 33 |  *
 34 |  *	Modified 2015.06.16 gbr
 35 |  *	-- update for compatibility with Antlr v4.5
 36 |  */
 37 | 
 38 | // ======================================================
 39 | // Lexer specification
 40 | // ======================================================
 41 | 
 42 | lexer grammar ANTLRv4Lexer;
 43 | 
 44 | options { superClass = LexerAdaptor; }
 45 | import LexBasic;
 46 | 
 47 | // Standard set of fragments
 48 | tokens { TOKEN_REF , RULE_REF , LEXER_CHAR_SET }
 49 | channels { OFF_CHANNEL , COMMENT }
 50 | 
 51 | // -------------------------
 52 | // Comments
 53 | DOC_COMMENT
 54 |    : DocComment -> channel (COMMENT)
 55 |    ;
 56 | 
 57 | BLOCK_COMMENT
 58 |    : BlockComment -> channel (COMMENT)
 59 |    ;
 60 | 
 61 | LINE_COMMENT
 62 |    : LineComment -> channel (COMMENT)
 63 |    ;
 64 | 
 65 | // -------------------------
 66 | // Integer
 67 | 
 68 | INT
 69 |    : DecimalNumeral
 70 |    ;
 71 | 
 72 | // -------------------------
 73 | // Literal string
 74 | //
 75 | // ANTLR makes no distinction between a single character literal and a
 76 | // multi-character string. All literals are single quote delimited and
 77 | // may contain unicode escape sequences of the form \uxxxx, where x
 78 | // is a valid hexadecimal number (per Unicode standard).
 79 | STRING_LITERAL
 80 |    : SQuoteLiteral
 81 |    ;
 82 | 
 83 | UNTERMINATED_STRING_LITERAL
 84 |    : USQuoteLiteral
 85 |    ;
 86 | 
 87 | // -------------------------
 88 | // Arguments
 89 | //
 90 | // Certain argument lists, such as those specifying call parameters
 91 | // to a rule invocation, or input parameters to a rule specification
 92 | // are contained within square brackets.
 93 | BEGIN_ARGUMENT
 94 |    : LBrack
 95 |    { self.handleBeginArgument() }
 96 |    ;
 97 | 
 98 | // -------------------------
 99 | // Target Language Actions
100 | BEGIN_ACTION
101 |    : LBrace -> pushMode (TargetLanguageAction)
102 |    ;
103 | 
104 | // -------------------------
105 | // Keywords
106 | //
107 | // 'options', 'tokens', and 'channels' are considered keywords
108 | // but only when followed by '{', and considered as a single token.
109 | // Otherwise, the symbols are tokenized as RULE_REF and allowed as
110 | // an identifier in a labeledElement.
111 | OPTIONS      : 'options'  WSNLCHARS* '{'  ;
112 | TOKENS       : 'tokens'   WSNLCHARS* '{'  ;
113 | CHANNELS     : 'channels' WSNLCHARS* '{'  ;
114 | 
115 | fragment WSNLCHARS : ' ' | '\t' | '\f' | '\n' | '\r' ;
116 | 
117 | IMPORT
118 |    : 'import'
119 |    ;
120 | 
121 | FRAGMENT
122 |    : 'fragment'
123 |    ;
124 | 
125 | LEXER
126 |    : 'lexer'
127 |    ;
128 | 
129 | PARSER
130 |    : 'parser'
131 |    ;
132 | 
133 | GRAMMAR
134 |    : 'grammar'
135 |    ;
136 | 
137 | PROTECTED
138 |    : 'protected'
139 |    ;
140 | 
141 | PUBLIC
142 |    : 'public'
143 |    ;
144 | 
145 | PRIVATE
146 |    : 'private'
147 |    ;
148 | 
149 | RETURNS
150 |    : 'returns'
151 |    ;
152 | 
153 | LOCALS
154 |    : 'locals'
155 |    ;
156 | 
157 | THROWS
158 |    : 'throws'
159 |    ;
160 | 
161 | CATCH
162 |    : 'catch'
163 |    ;
164 | 
165 | FINALLY
166 |    : 'finally'
167 |    ;
168 | 
169 | MODE
170 |    : 'mode'
171 |    ;
172 |    // -------------------------
173 |    // Punctuation
174 | 
175 | COLON
176 |    : Colon
177 |    ;
178 | 
179 | COLONCOLON
180 |    : DColon
181 |    ;
182 | 
183 | COMMA
184 |    : Comma
185 |    ;
186 | 
187 | SEMI
188 |    : Semi
189 |    ;
190 | 
191 | LPAREN
192 |    : LParen
193 |    ;
194 | 
195 | RPAREN
196 |    : RParen
197 |    ;
198 | 
199 | LBRACE
200 |    : LBrace
201 |    ;
202 | 
203 | RBRACE
204 |    : RBrace
205 |    ;
206 | 
207 | RARROW
208 |    : RArrow
209 |    ;
210 | 
211 | LT
212 |    : Lt
213 |    ;
214 | 
215 | GT
216 |    : Gt
217 |    ;
218 | 
219 | ASSIGN
220 |    : Equal
221 |    ;
222 | 
223 | QUESTION
224 |    : Question
225 |    ;
226 | 
227 | STAR
228 |    : Star
229 |    ;
230 | 
231 | PLUS_ASSIGN
232 |    : PlusAssign
233 |    ;
234 | 
235 | PLUS
236 |    : Plus
237 |    ;
238 | 
239 | OR
240 |    : Pipe
241 |    ;
242 | 
243 | DOLLAR
244 |    : Dollar
245 |    ;
246 | 
247 | RANGE
248 |    : Range
249 |    ;
250 | 
251 | DOT
252 |    : Dot
253 |    ;
254 | 
255 | AT
256 |    : At
257 |    ;
258 | 
259 | POUND
260 |    : Pound
261 |    ;
262 | 
263 | NOT
264 |    : Tilde
265 |    ;
266 |    // -------------------------
267 |    // Identifiers - allows unicode rule/token names
268 | 
269 | ID
270 |    : Id
271 |    ;
272 |    // -------------------------
273 |    // Whitespace
274 | 
275 | WS
276 |    : Ws+ -> channel (OFF_CHANNEL)
277 |    ;
278 | 
279 | // -------------------------
280 | // Illegal Characters
281 | //
282 | // This is an illegal character trap which is always the last rule in the
283 | // lexer specification. It matches a single character of any value and being
284 | // the last rule in the file will match when no other rule knows what to do
285 | // about the character. It is reported as an error but is not passed on to the
286 | // parser. This means that the parser to deal with the gramamr file anyway
287 | // but we will not try to analyse or code generate from a file with lexical
288 | // errors.
289 | 
290 | // Comment this rule out to allow the error to be propagated to the parser
291 | ERRCHAR
292 |    : . -> channel (HIDDEN)
293 |    ;
294 | 
295 | // ======================================================
296 | // Lexer modes
297 | // -------------------------
298 | // Arguments
299 | mode Argument;
300 | // E.g., [int x, List<String> a[]]
301 | NESTED_ARGUMENT
302 |    : LBrack -> type (ARGUMENT_CONTENT) , pushMode (Argument)
303 |    ;
304 | 
305 | ARGUMENT_ESCAPE
306 |    : EscAny -> type (ARGUMENT_CONTENT)
307 |    ;
308 | 
309 | ARGUMENT_STRING_LITERAL
310 |    : DQuoteLiteral -> type (ARGUMENT_CONTENT)
311 |    ;
312 | 
313 | ARGUMENT_CHAR_LITERAL
314 |    : SQuoteLiteral -> type (ARGUMENT_CONTENT)
315 |    ;
316 | 
317 | END_ARGUMENT
318 |    : RBrack
319 |    { self.handleEndArgument() }
320 |    ;
321 | 
322 | // added this to return non-EOF token type here. EOF does something weird
323 | UNTERMINATED_ARGUMENT
324 |    : EOF -> popMode
325 |    ;
326 | 
327 | ARGUMENT_CONTENT
328 |    : .
329 |    ;
330 | 
331 | // -------------------------
332 | // Target Language Actions
333 | //
334 | // Many language targets use {} as block delimiters and so we
335 | // must recursively match {} delimited blocks to balance the
336 | // braces. Additionally, we must make some assumptions about
337 | // literal string representation in the target language. We assume
338 | // that they are delimited by ' or " and so consume these
339 | // in their own alts so as not to inadvertantly match {}.
340 | mode TargetLanguageAction;
341 | NESTED_ACTION
342 |    : LBrace -> type (ACTION_CONTENT) , pushMode (TargetLanguageAction)
343 |    ;
344 | 
345 | ACTION_ESCAPE
346 |    : EscAny -> type (ACTION_CONTENT)
347 |    ;
348 | 
349 | ACTION_STRING_LITERAL
350 |    : DQuoteLiteral -> type (ACTION_CONTENT)
351 |    ;
352 | 
353 | ACTION_CHAR_LITERAL
354 |    : SQuoteLiteral -> type (ACTION_CONTENT)
355 |    ;
356 | 
357 | ACTION_DOC_COMMENT
358 |    : DocComment -> type (ACTION_CONTENT)
359 |    ;
360 | 
361 | ACTION_BLOCK_COMMENT
362 |    : BlockComment -> type (ACTION_CONTENT)
363 |    ;
364 | 
365 | ACTION_LINE_COMMENT
366 |    : LineComment -> type (ACTION_CONTENT)
367 |    ;
368 | 
369 | END_ACTION
370 |    : RBrace
371 |    { self.handleEndAction() }
372 |    ;
373 | 
374 | UNTERMINATED_ACTION
375 |    : EOF -> popMode
376 |    ;
377 | 
378 | ACTION_CONTENT
379 |    : .
380 |    ;
381 | 
382 | // -------------------------
383 | mode LexerCharSet;
384 | LEXER_CHAR_SET_BODY
385 |    : (~ [\]\\] | EscAny)+ -> more
386 |    ;
387 | 
388 | LEXER_CHAR_SET
389 |    : RBrack -> popMode
390 |    ;
391 | 
392 | UNTERMINATED_CHAR_SET
393 |    : EOF -> popMode
394 |    ;
395 | 
396 | // ------------------------------------------------------------------------------
397 | // Grammar specific Keywords, Punctuation, etc.
398 | fragment Id
399 |    : NameStartChar NameChar*
400 |    ;
401 |    
402 | 


--------------------------------------------------------------------------------
/src/picireny/antlr4/resources/ANTLRv4Parser.g4:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * [The "BSD license"]
  3 |  *  Copyright (c) 2012-2014 Terence Parr
  4 |  *  Copyright (c) 2012-2014 Sam Harwell
  5 |  *  Copyright (c) 2015 Gerald Rosenberg
  6 |  *  All rights reserved.
  7 |  *
  8 |  *  Redistribution and use in source and binary forms, with or without
  9 |  *  modification, are permitted provided that the following conditions
 10 |  *  are met:
 11 |  *
 12 |  *  1. Redistributions of source code must retain the above copyright
 13 |  *     notice, this list of conditions and the following disclaimer.
 14 |  *  2. Redistributions in binary form must reproduce the above copyright
 15 |  *     notice, this list of conditions and the following disclaimer in the
 16 |  *     documentation and/or other materials provided with the distribution.
 17 |  *  3. The name of the author may not be used to endorse or promote products
 18 |  *     derived from this software without specific prior written permission.
 19 |  *
 20 |  *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 21 |  *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 22 |  *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 23 |  *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 24 |  *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 25 |  *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 26 |  *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 27 |  *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 28 |  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 29 |  *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30 |  */
 31 | 
 32 | /*	A grammar for ANTLR v4 written in ANTLR v4.
 33 |  *
 34 |  *	Modified 2015.06.16 gbr
 35 |  *	-- update for compatibility with Antlr v4.5
 36 |  *	-- add mode for channels
 37 |  *	-- moved members to LexerAdaptor
 38 |  * 	-- move fragments to imports
 39 |  */
 40 | parser grammar ANTLRv4Parser;
 41 | 
 42 | 
 43 | options { tokenVocab = ANTLRv4Lexer; }
 44 | // The main entry point for parsing a v4 grammar.
 45 | grammarSpec
 46 |    : grammarDecl prequelConstruct* rules modeSpec* EOF
 47 |    ;
 48 | 
 49 | grammarDecl
 50 |    : grammarType identifier SEMI
 51 |    ;
 52 | 
 53 | grammarType
 54 |    : (LEXER GRAMMAR | PARSER GRAMMAR | GRAMMAR)
 55 |    ;
 56 |    // This is the list of all constructs that can be declared before
 57 |    // the set of rules that compose the grammar, and is invoked 0..n
 58 |    // times by the grammarPrequel rule.
 59 | 
 60 | prequelConstruct
 61 |    : optionsSpec
 62 |    | delegateGrammars
 63 |    | tokensSpec
 64 |    | channelsSpec
 65 |    | action_
 66 |    ;
 67 |    // ------------
 68 |    // Options - things that affect analysis and/or code generation
 69 | 
 70 | optionsSpec
 71 |    : OPTIONS (option SEMI)* RBRACE
 72 |    ;
 73 | 
 74 | option
 75 |    : identifier ASSIGN optionValue
 76 |    ;
 77 | 
 78 | optionValue
 79 |    : identifier (DOT identifier)*
 80 |    | STRING_LITERAL
 81 |    | actionBlock
 82 |    | INT
 83 |    ;
 84 |    // ------------
 85 |    // Delegates
 86 | 
 87 | delegateGrammars
 88 |    : IMPORT delegateGrammar (COMMA delegateGrammar)* SEMI
 89 |    ;
 90 | 
 91 | delegateGrammar
 92 |    : identifier ASSIGN identifier
 93 |    | identifier
 94 |    ;
 95 |    // ------------
 96 |    // Tokens & Channels
 97 | 
 98 | tokensSpec
 99 |    : TOKENS idList? RBRACE
100 |    ;
101 | 
102 | channelsSpec
103 |    : CHANNELS idList? RBRACE
104 |    ;
105 | 
106 | idList
107 |    : identifier (COMMA identifier)* COMMA?
108 |    ;
109 |    // Match stuff like @parser::members {int i;}
110 | 
111 | action_
112 |    : AT (actionScopeName COLONCOLON)? identifier actionBlock
113 |    ;
114 |    // Scope names could collide with keywords; allow them as ids for action scopes
115 | 
116 | actionScopeName
117 |    : identifier
118 |    | LEXER
119 |    | PARSER
120 |    ;
121 | 
122 | actionBlock
123 |    : BEGIN_ACTION ACTION_CONTENT* END_ACTION
124 |    ;
125 | 
126 | argActionBlock
127 |    : BEGIN_ARGUMENT ARGUMENT_CONTENT* END_ARGUMENT
128 |    ;
129 | 
130 | modeSpec
131 |    : MODE identifier SEMI lexerRuleSpec*
132 |    ;
133 | 
134 | rules
135 |    : ruleSpec*
136 |    ;
137 | 
138 | ruleSpec
139 |    : parserRuleSpec
140 |    | lexerRuleSpec
141 |    ;
142 | 
143 | parserRuleSpec
144 |    : ruleModifiers? RULE_REF argActionBlock? ruleReturns? throwsSpec? localsSpec? rulePrequel* COLON ruleBlock SEMI exceptionGroup
145 |    ;
146 | 
147 | exceptionGroup
148 |    : exceptionHandler* finallyClause?
149 |    ;
150 | 
151 | exceptionHandler
152 |    : CATCH argActionBlock actionBlock
153 |    ;
154 | 
155 | finallyClause
156 |    : FINALLY actionBlock
157 |    ;
158 | 
159 | rulePrequel
160 |    : optionsSpec
161 |    | ruleAction
162 |    ;
163 | 
164 | ruleReturns
165 |    : RETURNS argActionBlock
166 |    ;
167 | 
168 | // --------------
169 | // Exception spec
170 | throwsSpec
171 |    : THROWS identifier (COMMA identifier)*
172 |    ;
173 | 
174 | localsSpec
175 |    : LOCALS argActionBlock
176 |    ;
177 | 
178 | /** Match stuff like @init {int i;} */
179 | ruleAction
180 |    : AT identifier actionBlock
181 |    ;
182 | 
183 | ruleModifiers
184 |    : ruleModifier+
185 |    ;
186 |    // An individual access modifier for a rule. The 'fragment' modifier
187 |    // is an internal indication for lexer rules that they do not match
188 |    // from the input but are like subroutines for other lexer rules to
189 |    // reuse for certain lexical patterns. The other modifiers are passed
190 |    // to the code generation templates and may be ignored by the template
191 |    // if they are of no use in that language.
192 | 
193 | ruleModifier
194 |    : PUBLIC
195 |    | PRIVATE
196 |    | PROTECTED
197 |    | FRAGMENT
198 |    ;
199 | 
200 | ruleBlock
201 |    : ruleAltList
202 |    ;
203 | 
204 | ruleAltList
205 |    : labeledAlt (OR labeledAlt)*
206 |    ;
207 | 
208 | labeledAlt
209 |    : alternative (POUND identifier)?
210 |    ;
211 |    // --------------------
212 |    // Lexer rules
213 | 
214 | lexerRuleSpec
215 |    : FRAGMENT? TOKEN_REF optionsSpec? COLON lexerRuleBlock SEMI
216 |    ;
217 | 
218 | lexerRuleBlock
219 |    : lexerAltList
220 |    ;
221 | 
222 | lexerAltList
223 |    : lexerAlt (OR lexerAlt)*
224 |    ;
225 | 
226 | lexerAlt
227 |    : lexerElements lexerCommands?
228 |    |
229 |    // explicitly allow empty alts
230 |    ;
231 | 
232 | lexerElements
233 |    : lexerElement+
234 |    |
235 |    ;
236 | 
237 | lexerElement
238 |    : lexerAtom ebnfSuffix?
239 |    | lexerBlock ebnfSuffix?
240 |    | actionBlock QUESTION?
241 |    ;
242 |    // but preds can be anywhere
243 | 
244 | lexerBlock
245 |    : LPAREN lexerAltList RPAREN
246 |    ;
247 |    // E.g., channel(HIDDEN), skip, more, mode(INSIDE), push(INSIDE), pop
248 | 
249 | lexerCommands
250 |    : RARROW lexerCommand (COMMA lexerCommand)*
251 |    ;
252 | 
253 | lexerCommand
254 |    : lexerCommandName LPAREN lexerCommandExpr RPAREN
255 |    | lexerCommandName
256 |    ;
257 | 
258 | lexerCommandName
259 |    : identifier
260 |    | MODE
261 |    ;
262 | 
263 | lexerCommandExpr
264 |    : identifier
265 |    | INT
266 |    ;
267 |    // --------------------
268 |    // Rule Alts
269 | 
270 | altList
271 |    : alternative (OR alternative)*
272 |    ;
273 | 
274 | alternative
275 |    : elementOptions? element+
276 |    |
277 |    // explicitly allow empty alts
278 |    ;
279 | 
280 | element
281 |    : labeledElement (ebnfSuffix |)
282 |    | atom (ebnfSuffix |)
283 |    | ebnf
284 |    | actionBlock QUESTION?
285 |    ;
286 | 
287 | labeledElement
288 |    : identifier (ASSIGN | PLUS_ASSIGN) (atom | block)
289 |    ;
290 |    // --------------------
291 |    // EBNF and blocks
292 | 
293 | ebnf
294 |    : block blockSuffix?
295 |    ;
296 | 
297 | blockSuffix
298 |    : ebnfSuffix
299 |    ;
300 | 
301 | ebnfSuffix
302 |    : QUESTION QUESTION?
303 |    | STAR QUESTION?
304 |    | PLUS QUESTION?
305 |    ;
306 | 
307 | lexerAtom
308 |    : characterRange
309 |    | terminal
310 |    | notSet
311 |    | LEXER_CHAR_SET
312 |    | DOT elementOptions?
313 |    ;
314 | 
315 | atom
316 |    : terminal
317 |    | ruleref
318 |    | notSet
319 |    | DOT elementOptions?
320 |    ;
321 | 
322 | // --------------------
323 | // Inverted element set
324 | notSet
325 |    : NOT setElement
326 |    | NOT blockSet
327 |    ;
328 | 
329 | blockSet
330 |    : LPAREN setElement (OR setElement)* RPAREN
331 |    ;
332 | 
333 | setElement
334 |    : TOKEN_REF elementOptions?
335 |    | STRING_LITERAL elementOptions?
336 |    | characterRange
337 |    | LEXER_CHAR_SET
338 |    ;
339 | 
340 | // -------------
341 | // Grammar Block
342 | block
343 |    : LPAREN (optionsSpec? ruleAction* COLON)? altList RPAREN
344 |    ;
345 | 
346 | // ----------------
347 | // Parser rule ref
348 | ruleref
349 |    : RULE_REF argActionBlock? elementOptions?
350 |    ;
351 | 
352 | // ---------------
353 | // Character Range
354 | characterRange
355 |    : STRING_LITERAL RANGE STRING_LITERAL
356 |    ;
357 | 
358 | terminal
359 |    : TOKEN_REF elementOptions?
360 |    | STRING_LITERAL elementOptions?
361 |    ;
362 | 
363 | // Terminals may be adorned with certain options when
364 | // reference in the grammar: TOK<,,,>
365 | elementOptions
366 |    : LT elementOption (COMMA elementOption)* GT
367 |    ;
368 | 
369 | elementOption
370 |    : identifier
371 |    | identifier ASSIGN (identifier | STRING_LITERAL)
372 |    ;
373 | 
374 | identifier
375 |    : RULE_REF
376 |    | TOKEN_REF
377 |    ;
378 |    
379 | 


--------------------------------------------------------------------------------
/src/picireny/antlr4/resources/ExtendedTargetParser.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2016-2023 Renata Hodovan, Akos Kiss.
  3 |  *
  4 |  * Licensed under the BSD 3-Clause License
  5 |  * <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
  6 |  * This file may not be copied, modified, or distributed except
  7 |  * according to those terms.
  8 |  */
  9 | 
 10 | import java.io.*;
 11 | import java.util.*;
 12 | import javax.xml.stream.*;
 13 | 
 14 | import org.antlr.v4.runtime.*;
 15 | import org.antlr.v4.runtime.tree.*;
 16 | import org.antlr.v4.runtime.misc.Pair;
 17 | 
 18 | 
 19 | /**
 20 |  * Extended$parser_class is a subclass of the original parser implementation.
 21 |  * It can trigger state changes that are needed to identify parts of the input
 22 |  * that are not needed to keep it syntactically correct.
 23 |  */
 24 | public class Extended$parser_class extends $parser_class {
 25 | 
 26 |     private static class ExtendedErrorListener extends BaseErrorListener {
 27 | 
 28 |         @Override
 29 |         public void syntaxError(Recognizer<?, ?> recognizer,
 30 |                                 Object offendingSymbol,
 31 |                                 int line,
 32 |                                 int charPositionInLine,
 33 |                                 String msg,
 34 |                                 RecognitionException e) {
 35 | 
 36 |             CommonToken t = new CommonToken(new Pair<TokenSource, CharStream>(((Lexer)recognizer), ((Lexer)recognizer)._input),
 37 |                                             Token.INVALID_TYPE,
 38 |                                             Token.DEFAULT_CHANNEL,
 39 |                                             ((Lexer)recognizer)._tokenStartCharIndex,
 40 |                                             ((Lexer)recognizer)._tokenStartCharIndex);
 41 |             t.setLine(((Lexer)recognizer)._tokenStartLine);
 42 |             t.setCharPositionInLine(((Lexer)recognizer)._tokenStartCharPositionInLine);
 43 |             ((Lexer)recognizer).setType(Token.MIN_USER_TOKEN_TYPE);
 44 |             ((Lexer)recognizer).emit(t);
 45 |         }
 46 |     }
 47 | 
 48 |     public static void main(String[] args) {
 49 |         try {
 50 |             ExtendedTargetLexer lexer = new ExtendedTargetLexer(CharStreams.fromStream(System.in));
 51 |             lexer.addErrorListener(new ExtendedErrorListener());
 52 |             CommonTokenStream tokens = new CommonTokenStream(lexer);
 53 |             Extended$parser_class parser = new Extended$parser_class(tokens);
 54 |             ExtendedTargetListener listener = new ExtendedTargetListener(parser);
 55 | 
 56 |             parser.addParseListener(listener);
 57 |             Extended$parser_class.class.getMethod(args[0]).invoke(parser);
 58 |             parser.syntaxErrorWarning();
 59 | 
 60 |             try (XsonStreamWriter w = new XsonStreamWriter(System.out)) {
 61 |                 w.write(null, listener.root);
 62 |             }
 63 |         } catch(Exception e) {
 64 |             e.printStackTrace(System.err);
 65 |             System.exit(1);
 66 |         }
 67 |     }
 68 | 
 69 |     private static interface XsonObject {
 70 |         public void writeXsonMembers(XsonStreamWriter w) throws XMLStreamException;
 71 |     }
 72 | 
 73 |     /**
 74 |      * XsonStreamWriter is a partial implementation for writing JSONx documents.
 75 |      * It only implements the minimum required to dump HDDNode objects.
 76 |      */
 77 |     private static class XsonStreamWriter implements AutoCloseable {
 78 |         public static final String JSONX_PREFIX = "json";
 79 |         public static final String JSONX_NS_URI = "http://www.ibm.com/xmlns/prod/2009/jsonx";
 80 | 
 81 |         private XMLStreamWriter w;
 82 | 
 83 |         public XsonStreamWriter(OutputStream o) throws XMLStreamException {
 84 |             XMLOutputFactory factory = XMLOutputFactory.newInstance();
 85 |             factory.setProperty(XMLOutputFactory.IS_REPAIRING_NAMESPACES, true);
 86 |             w = factory.createXMLStreamWriter(o);
 87 |             w.setPrefix(JSONX_PREFIX, JSONX_NS_URI);
 88 |         }
 89 | 
 90 |         public void write(String name, XsonObject value) throws XMLStreamException {
 91 |             w.writeStartElement(JSONX_PREFIX, "object", JSONX_NS_URI);
 92 |             if (name != null)
 93 |                 w.writeAttribute("name", name);
 94 |             value.writeXsonMembers(this);
 95 |             w.writeEndElement();
 96 |         }
 97 | 
 98 |         public void write(String name, Iterable<? extends XsonObject> value) throws XMLStreamException {
 99 |             w.writeStartElement(JSONX_PREFIX, "array", JSONX_NS_URI);
100 |             if (name != null)
101 |                 w.writeAttribute("name", name);
102 |             for (XsonObject o : value)
103 |                 write(null, o);
104 |             w.writeEndElement();
105 |         }
106 | 
107 |         public void write(String name, int value) throws XMLStreamException {
108 |             w.writeStartElement(JSONX_PREFIX, "number", JSONX_NS_URI);
109 |             if (name != null)
110 |                 w.writeAttribute("name", name);
111 |             w.writeCharacters(Integer.toString(value));
112 |             w.writeEndElement();
113 |         }
114 | 
115 |         public void write(String name, String value) throws XMLStreamException {
116 |             w.writeStartElement(JSONX_PREFIX, "string", JSONX_NS_URI);
117 |             if (name != null)
118 |                 w.writeAttribute("name", name);
119 |             w.writeCharacters(value);
120 |             w.writeEndElement();
121 |         }
122 | 
123 |         public void close() throws XMLStreamException {
124 |             w.close();
125 |         }
126 |     }
127 | 
128 |     /**
129 |      * ExtendedTargetLexer is a subclass of the original lexer implementation.
130 |      * It can recognize skipped tokens and instead of eliminating them from the parser
131 |      * they can be redirected to the dedicated PICIRENY_CHANNEL for later use.
132 |      */
133 |     private static class ExtendedTargetLexer extends $lexer_class {
134 | 
135 |         public static final int PICIRENY_CHANNEL = -3;
136 | 
137 |         public ExtendedTargetLexer(CharStream input) {
138 |             super(input);
139 |         }
140 | 
141 |         // Skipped tokens cannot be accessed from the parser but we still need them to
142 |         // unparse test cases correctly. Sending these tokens to a dedicated channel won't
143 |         // alter the parse but makes these tokens available.
144 |         @Override
145 |         public void skip() {
146 |             _channel = PICIRENY_CHANNEL;
147 |         }
148 |     }
149 | 
150 |     /**
151 |      * ExtendedTargetListener is a subclass of the original listener implementation.
152 |      * It can trigger state changes that are needed to identify parts of the input
153 |      * that are not needed to keep it syntactically correct.
154 |      */
155 |     private static class ExtendedTargetListener extends $listener_class {
156 | 
157 |         private HDDRule current_node;
158 |         private Parser parser;
159 |         private HDDRule root;
160 |         private boolean seen_terminal;
161 | 
162 |         private static class Position implements XsonObject {
163 |             public int line;
164 |             public int column;
165 | 
166 |             public Position(int _line, int _column) {
167 |                 line = _line;
168 |                 column = _column;
169 |             }
170 | 
171 |             public Position after(String text) {
172 |                 int line_breaks = countLineBreaks(text);
173 |                 return new Position(line + line_breaks,
174 |                                     line_breaks == 0 ? column + text.length() : text.length() - text.lastIndexOf('\n') - 1);
175 |             }
176 | 
177 |             private static int countLineBreaks(String text) {
178 |                 int count = 0;
179 |                 int fromIndex = 0;
180 |                 while (true) {
181 |                     int index = text.indexOf('\n', fromIndex);
182 |                     if (index < 0)
183 |                         return count;
184 |                     count++;
185 |                     fromIndex = index + 1;
186 |                 }
187 |             }
188 | 
189 |             public void writeXsonMembers(XsonStreamWriter w) throws XMLStreamException {
190 |                 w.write("line", line);
191 |                 w.write("column", column);
192 |             }
193 |         }
194 | 
195 |         private static abstract class HDDNode implements XsonObject {
196 |             public String name;
197 |             public HDDRule parent;
198 |             public Position start;
199 |             public Position end;
200 | 
201 |             public HDDNode(String _name) {
202 |                 name = _name;
203 |                 parent = null;
204 |                 start = null;
205 |                 end = null;
206 |             }
207 | 
208 |             public void writeXsonMembers(XsonStreamWriter w) throws XMLStreamException {
209 |                 w.write("type", getClass().getSimpleName());
210 |                 if (name != null)
211 |                     w.write("name", name);
212 |                 if (start != null)
213 |                     w.write("start", start);
214 |                 if (end != null)
215 |                     w.write("end", end);
216 |             }
217 |         }
218 | 
219 |         private static class HDDRule extends HDDNode {
220 |             public ArrayList<HDDNode> children;
221 |             public boolean recursive_rule;
222 | 
223 |             public HDDRule(String _name) {
224 |                 super(_name);
225 |                 children = new ArrayList<HDDNode>();
226 |                 recursive_rule = false;
227 |             }
228 | 
229 |             public void addChild(HDDNode node) {
230 |                 children.add(node);
231 |                 node.parent = this;
232 |             }
233 | 
234 |             public void writeXsonMembers(XsonStreamWriter w) throws XMLStreamException {
235 |                 super.writeXsonMembers(w);
236 |                 w.write("children", children);
237 |             }
238 |         }
239 | 
240 |         private static class HDDToken extends HDDNode {
241 |             public String text;
242 | 
243 |             public HDDToken(String _name, String _text, Position _start, Position _end) {
244 |                 super(_name);
245 |                 text = _text;
246 |                 start = _start;
247 |                 end = _end;
248 |             }
249 | 
250 |             public void writeXsonMembers(XsonStreamWriter w) throws XMLStreamException {
251 |                 super.writeXsonMembers(w);
252 |                 w.write("text", text);
253 |             }
254 |         }
255 | 
256 |         private static class HDDQuantifier extends HDDRule {
257 |             public HDDQuantifier() {
258 |                 super(null);
259 |             }
260 |         }
261 | 
262 |         private static class HDDHiddenToken extends HDDToken {
263 |             public HDDHiddenToken(String _name, String _text, Position _start, Position _end) {
264 |                 super(_name, _text, _start, _end);
265 |             }
266 |         }
267 | 
268 |         private static class HDDErrorToken extends HDDToken {
269 |             public HDDErrorToken(String _text, Position _start, Position _end) {
270 |                 super(null, _text, _start, _end);
271 |             }
272 |         }
273 | 
274 |         public ExtendedTargetListener(Parser _parser) {
275 |             parser = _parser;
276 |             current_node = null;
277 |             root = null;
278 |             seen_terminal = false;
279 |         }
280 | 
281 |         public void recursion_enter() {
282 |             assert current_node instanceof HDDRule;
283 |             HDDRule node = new HDDRule(current_node.name);
284 | 
285 |             current_node.addChild(node);
286 |             current_node.recursive_rule = true;
287 |             current_node = node;
288 |         }
289 | 
290 |         public void recursion_push() {
291 |             assert current_node.parent.children.size() > 0;
292 |             HDDNode first_child = current_node.parent.children.get(0);
293 |             current_node.parent.children.remove(first_child);
294 |             current_node.addChild(first_child);
295 |         }
296 | 
297 |         public void recursion_unroll() {
298 |             assert current_node.recursive_rule;
299 |             assert current_node.children.size() == 1 && current_node.name.equals(current_node.children.get(0).name);
300 |             ArrayList<HDDNode> children_to_lift = ((HDDRule)current_node.children.get(0)).children;
301 |             HDDRule parent = current_node.parent;
302 |             if (children_to_lift.size() > 0) {
303 |                 current_node.children = children_to_lift;
304 |             } else {
305 |                 parent.children.remove(current_node);
306 |             }
307 |             current_node = parent;
308 |         }
309 | 
310 |         public void enterEveryRule(ParserRuleContext ctx) {
311 |             HDDRule node = new HDDRule(parser.getRuleNames()[ctx.getRuleIndex()]);
312 | 
313 |             if (root == null) {
314 |                 root = node;
315 |             } else {
316 |                 assert current_node != null;
317 |                 current_node.addChild(node);
318 |             }
319 |             current_node = node;
320 |         }
321 | 
322 |         public void exitEveryRule(ParserRuleContext ctx) {
323 |             // If the input contains syntax error, then the last optional block might not have been closed.
324 |             while (current_node instanceof HDDQuantifier)
325 |                 exit_optional();
326 | 
327 |             assert current_node.name.equals(parser.getRuleNames()[ctx.getRuleIndex()]) : current_node.name + " (" + current_node.toString() + ") != " + parser.getRuleNames()[ctx.getRuleIndex()];
328 | 
329 |             if (current_node.parent != null)
330 |                 current_node = current_node.parent;
331 |         }
332 | 
333 |         private Position[] tokenBoundaries(Token token) {
334 |             Position start = new Position(token.getLine(), token.getCharPositionInLine());
335 |             return new Position[] {start, start.after(token.getText())};
336 |         }
337 | 
338 |         private void addToken(TerminalNode node, HDDToken child) {
339 |             if (!seen_terminal) {
340 |                 List<Token> hiddenTokens = ((BufferedTokenStream)parser.getTokenStream()).getHiddenTokensToLeft(node.getSymbol().getTokenIndex(), -1);
341 |                 if (hiddenTokens != null) {
342 |                     for (Token token : hiddenTokens) {
343 |                         Position[] boundaries = tokenBoundaries(token);
344 |                         current_node.addChild(new HDDHiddenToken(parser.getTokenNames()[token.getType()], token.getText(), boundaries[0], boundaries[1]));
345 |                     }
346 |                 }
347 |             }
348 |             seen_terminal = true;
349 | 
350 |             current_node.addChild(child);
351 | 
352 |             List<Token> hiddenTokens = ((BufferedTokenStream)parser.getTokenStream()).getHiddenTokensToRight(node.getSymbol().getTokenIndex(), -1);
353 |             if (hiddenTokens != null) {
354 |                 for (Token token : hiddenTokens) {
355 |                     Position[] boundaries = tokenBoundaries(token);
356 |                     current_node.addChild(new HDDHiddenToken(parser.getTokenNames()[token.getType()], token.getText(), boundaries[0], boundaries[1]));
357 |                 }
358 |             }
359 |         }
360 | 
361 |         public void visitTerminal(TerminalNode node) {
362 |             Token token = node.getSymbol();
363 |             Position[] boundaries = tokenBoundaries(token);
364 |             addToken(node, token.getType() != Token.EOF
365 |                            ? new HDDToken(parser.getTokenNames()[token.getType()], token.getText(), boundaries[0], boundaries[1])
366 |                            : new HDDToken("EOF", "", boundaries[0], boundaries[1]));
367 |         }
368 | 
369 |         public void visitErrorNode(ErrorNode node) {
370 |             Token token = node.getSymbol();
371 |             if (token != null) {
372 |                 Position[] boundaries = tokenBoundaries(token);
373 |                 addToken(node, new HDDErrorToken(node.getText(), boundaries[0], boundaries[1]));
374 |             }
375 |         }
376 | 
377 |         public void enter_optional() {
378 |             HDDQuantifier quant_node = new HDDQuantifier();
379 |             current_node.addChild(quant_node);
380 |             current_node = quant_node;
381 |         }
382 | 
383 |         public void exit_optional() {
384 |             assert current_node.parent != null : "Quantifier node has no parent.";
385 |             assert current_node.children.size() > 0 : "Quantifier node has no children.";
386 | 
387 |             current_node = current_node.parent;
388 |         }
389 |     }
390 | 
391 |     public Extended$parser_class(TokenStream input) {
392 |         super(input);
393 |     }
394 | 
395 |     public void enter_optional() {
396 |         trigger_listener("enter_optional");
397 |     }
398 | 
399 |     public void exit_optional() {
400 |         trigger_listener("exit_optional");
401 |     }
402 | 
403 |     public void enterRecursionRule(ParserRuleContext localctx, int state, int ruleIndex, int precedence) {
404 |         super.enterRecursionRule(localctx, state, ruleIndex, precedence);
405 |         trigger_listener("recursion_enter");
406 |     }
407 | 
408 |     public void enterRecursionRule(ParserRuleContext localctx, int ruleIndex) {
409 |         super.enterRecursionRule(localctx, ruleIndex);
410 |         trigger_listener("recursion_enter");
411 |     }
412 | 
413 |     public void pushNewRecursionContext(ParserRuleContext localctx, int state, int ruleIndex) {
414 |         super.pushNewRecursionContext(localctx, state, ruleIndex);
415 |         trigger_listener("recursion_push");
416 |     }
417 | 
418 |     public void unrollRecursionContexts(ParserRuleContext _parentctx) {
419 |         super.unrollRecursionContexts(_parentctx);
420 |         trigger_listener("recursion_unroll");
421 |     }
422 | 
423 |     private void trigger_listener(String event) {
424 |         for (ParseTreeListener listener : getParseListeners()) {
425 |             try {
426 |                 ExtendedTargetListener.class.getMethod(event).invoke(listener);
427 |             } catch (Exception e) {
428 |                 System.err.println(e);
429 |             }
430 |         }
431 |     }
432 | 
433 |     private void syntaxErrorWarning() {
434 |         if (getNumberOfSyntaxErrors() > 0)
435 |             System.err.println("$parser_class finished with " + getNumberOfSyntaxErrors() + " syntax errors. This may decrease quality.");
436 |     }
437 | }
438 | 


--------------------------------------------------------------------------------
/src/picireny/antlr4/resources/LexBasic.g4:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * [The "BSD license"]
  3 |  *  Copyright (c) 2014-2015 Gerald Rosenberg
  4 |  *  All rights reserved.
  5 |  *
  6 |  *  Redistribution and use in source and binary forms, with or without
  7 |  *  modification, are permitted provided that the following conditions
  8 |  *  are met:
  9 |  *
 10 |  *  1. Redistributions of source code must retain the above copyright
 11 |  *     notice, this list of conditions and the following disclaimer.
 12 |  *  2. Redistributions in binary form must reproduce the above copyright
 13 |  *     notice, this list of conditions and the following disclaimer in the
 14 |  *     documentation and/or other materials provided with the distribution.
 15 |  *  3. The name of the author may not be used to endorse or promote products
 16 |  *     derived from this software without specific prior written permission.
 17 |  *
 18 |  *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 19 |  *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 20 |  *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 21 |  *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 22 |  *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 23 |  *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 |  *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 |  *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 |  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 27 |  *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 |  */
 29 | /** 
 30 |  * A generally reusable set of fragments for import in to Lexer grammars.
 31 |  *
 32 |  *	Modified 2015.06.16 gbr - 
 33 |  *	-- generalized for inclusion into the ANTLRv4 grammar distribution
 34 |  * 
 35 |  */
 36 | lexer grammar LexBasic;
 37 | // ======================================================
 38 | // Lexer fragments
 39 | //
 40 | // -----------------------------------
 41 | // Whitespace & Comments
 42 | 
 43 | fragment Ws
 44 |    : Hws
 45 |    | Vws
 46 |    ;
 47 | 
 48 | fragment Hws
 49 |    : [ \t]
 50 |    ;
 51 | 
 52 | fragment Vws
 53 |    : [\r\n\f]
 54 |    ;
 55 | 
 56 | fragment BlockComment
 57 |    : '/*' .*? ('*/' | EOF)
 58 |    ;
 59 | 
 60 | fragment DocComment
 61 |    : '/**' .*? ('*/' | EOF)
 62 |    ;
 63 | 
 64 | fragment LineComment
 65 |    : '//' ~ [\r\n]*
 66 |    ;
 67 |    // -----------------------------------
 68 |    // Escapes
 69 |    // Any kind of escaped character that we can embed within ANTLR literal strings.
 70 | 
 71 | fragment EscSeq
 72 |    : Esc ([btnfr"'\\] | UnicodeEsc | . | EOF)
 73 |    ;
 74 | 
 75 | fragment EscAny
 76 |    : Esc .
 77 |    ;
 78 | 
 79 | fragment UnicodeEsc
 80 |    : 'u' (HexDigit (HexDigit (HexDigit HexDigit?)?)?)?
 81 |    ;
 82 |    // -----------------------------------
 83 |    // Numerals
 84 | 
 85 | fragment DecimalNumeral
 86 |    : '0'
 87 |    | [1-9] DecDigit*
 88 |    ;
 89 |    // -----------------------------------
 90 |    // Digits
 91 | 
 92 | fragment HexDigit
 93 |    : [0-9a-fA-F]
 94 |    ;
 95 | 
 96 | fragment DecDigit
 97 |    : [0-9]
 98 |    ;
 99 |    // -----------------------------------
100 |    // Literals
101 | 
102 | fragment BoolLiteral
103 |    : 'true'
104 |    | 'false'
105 |    ;
106 | 
107 | fragment CharLiteral
108 |    : SQuote (EscSeq | ~ ['\r\n\\]) SQuote
109 |    ;
110 | 
111 | fragment SQuoteLiteral
112 |    : SQuote (EscSeq | ~ ['\r\n\\])* SQuote
113 |    ;
114 | 
115 | fragment DQuoteLiteral
116 |    : DQuote (EscSeq | ~ ["\r\n\\])* DQuote
117 |    ;
118 | 
119 | fragment USQuoteLiteral
120 |    : SQuote (EscSeq | ~ ['\r\n\\])*
121 |    ;
122 |    // -----------------------------------
123 |    // Character ranges
124 | 
125 | fragment NameChar
126 |    : NameStartChar
127 |    | '0' .. '9'
128 |    | Underscore
129 |    | '\u00B7'
130 |    | '\u0300' .. '\u036F'
131 |    | '\u203F' .. '\u2040'
132 |    ;
133 | 
134 | fragment NameStartChar
135 |    : 'A' .. 'Z'
136 |    | 'a' .. 'z'
137 |    | '\u00C0' .. '\u00D6'
138 |    | '\u00D8' .. '\u00F6'
139 |    | '\u00F8' .. '\u02FF'
140 |    | '\u0370' .. '\u037D'
141 |    | '\u037F' .. '\u1FFF'
142 |    | '\u200C' .. '\u200D'
143 |    | '\u2070' .. '\u218F'
144 |    | '\u2C00' .. '\u2FEF'
145 |    | '\u3001' .. '\uD7FF'
146 |    | '\uF900' .. '\uFDCF'
147 |    | '\uFDF0' .. '\uFFFD'
148 |    ;
149 |    // ignores | ['\u10000-'\uEFFFF] ;
150 |    // -----------------------------------
151 |    // Types
152 | 
153 | fragment Int
154 |    : 'int'
155 |    ;
156 |    // -----------------------------------
157 |    // Symbols
158 | 
159 | fragment Esc
160 |    : '\\'
161 |    ;
162 | 
163 | fragment Colon
164 |    : ':'
165 |    ;
166 | 
167 | fragment DColon
168 |    : '::'
169 |    ;
170 | 
171 | fragment SQuote
172 |    : '\''
173 |    ;
174 | 
175 | fragment DQuote
176 |    : '"'
177 |    ;
178 | 
179 | fragment LParen
180 |    : '('
181 |    ;
182 | 
183 | fragment RParen
184 |    : ')'
185 |    ;
186 | 
187 | fragment LBrace
188 |    : '{'
189 |    ;
190 | 
191 | fragment RBrace
192 |    : '}'
193 |    ;
194 | 
195 | fragment LBrack
196 |    : '['
197 |    ;
198 | 
199 | fragment RBrack
200 |    : ']'
201 |    ;
202 | 
203 | fragment RArrow
204 |    : '->'
205 |    ;
206 | 
207 | fragment Lt
208 |    : '<'
209 |    ;
210 | 
211 | fragment Gt
212 |    : '>'
213 |    ;
214 | 
215 | fragment Equal
216 |    : '='
217 |    ;
218 | 
219 | fragment Question
220 |    : '?'
221 |    ;
222 | 
223 | fragment Star
224 |    : '*'
225 |    ;
226 | 
227 | fragment Plus
228 |    : '+'
229 |    ;
230 | 
231 | fragment PlusAssign
232 |    : '+='
233 |    ;
234 | 
235 | fragment Underscore
236 |    : '_'
237 |    ;
238 | 
239 | fragment Pipe
240 |    : '|'
241 |    ;
242 | 
243 | fragment Dollar
244 |    : '$'
245 |    ;
246 | 
247 | fragment Comma
248 |    : ','
249 |    ;
250 | 
251 | fragment Semi
252 |    : ';'
253 |    ;
254 | 
255 | fragment Dot
256 |    : '.'
257 |    ;
258 | 
259 | fragment Range
260 |    : '..'
261 |    ;
262 | 
263 | fragment At
264 |    : '@'
265 |    ;
266 | 
267 | fragment Pound
268 |    : '#'
269 |    ;
270 | 
271 | fragment Tilde
272 |    : '~'
273 |    ;
274 |    
275 | 


--------------------------------------------------------------------------------
/src/picireny/cli.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2016-2024 Renata Hodovan, Akos Kiss.
  2 | #
  3 | # Licensed under the BSD 3-Clause License
  4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
  5 | # This file may not be copied, modified, or distributed except
  6 | # according to those terms.
  7 | 
  8 | import json
  9 | 
 10 | from argparse import ArgumentParser
 11 | from importlib import metadata
 12 | from os.path import abspath, dirname, exists, join, realpath
 13 | from shutil import rmtree
 14 | 
 15 | import antlerinator
 16 | import inators
 17 | import picire
 18 | 
 19 | from inators import log as logging
 20 | 
 21 | from . import filter, hdd, hddr, hoist, info, prune, transform
 22 | 
 23 | logger = logging.getLogger('picireny')
 24 | __version__ = metadata.version(__package__)
 25 | 
 26 | 
 27 | args_hdd_choices = {
 28 |     'hdd': hdd.hddmin,
 29 |     'hddr': hddr.hddrmin,
 30 | }
 31 | 
 32 | 
 33 | args_phase_choices = {
 34 |     'prune': {'transformations': [prune.prune]},
 35 |     'coarse-prune': {'transformations': [prune.prune], 'config_filter': filter.coarse_filter},
 36 |     'hoist': {'transformations': [hoist.hoist]},
 37 |     'prune+hoist': {'transformations': [prune.prune, hoist.hoist]},
 38 |     'coarse-prune+hoist': {'transformations': [prune.prune, hoist.hoist], 'config_filter': filter.coarse_filter}
 39 | }
 40 | 
 41 | 
 42 | def process_antlr4_args(args):
 43 |     antlerinator.process_antlr_argument(args)
 44 |     args.antlr = realpath(args.antlr)
 45 | 
 46 |     def load_format_config(data):
 47 |         # Interpret relative grammar paths compared to the directory of the config file.
 48 |         if 'files' in data:
 49 |             for i, fn in enumerate(data['files']):
 50 |                 path = join(abspath(dirname(args.format)), fn)
 51 |                 if not exists(path):
 52 |                     raise ValueError(f'Invalid input format definition: {path}, defined in the format config, does not exist.')
 53 |                 data['files'][i] = path
 54 |             data['islands'] = data.get('islands', {})
 55 |             data['replacements'] = data.get('replacements', {})
 56 |         return data
 57 | 
 58 |     args.input_format = {}
 59 | 
 60 |     if args.format:
 61 |         if not exists(args.format):
 62 |             raise ValueError(f'Invalid input format definition: {args.format} does not exist.')
 63 | 
 64 |         with open(args.format, 'r') as f:
 65 |             try:
 66 |                 input_description = json.load(f, object_hook=load_format_config)
 67 |                 args.input_format = input_description['grammars']
 68 |                 if not args.start:
 69 |                     args.start = input_description.get('start', None)
 70 |             except ValueError as e:
 71 |                 raise ValueError(f'Invalid input format definition: The content of {args.format} is not a valid JSON object.') from e
 72 | 
 73 |     if not args.start:
 74 |         raise ValueError('Invalid input format definition: No start has been defined.')
 75 | 
 76 |     if args.grammar or args.replacements:
 77 |         # Initialize the default grammar that doesn't need to be named.
 78 |         if '' not in args.input_format:
 79 |             args.input_format[''] = {'files': [], 'replacements': {}, 'islands': {}}
 80 | 
 81 |         if args.grammar:
 82 |             for i, g in enumerate(args.grammar):
 83 |                 args.input_format['']['files'].append(realpath(g))
 84 |                 if not exists(args.input_format['']['files'][i]):
 85 |                     raise ValueError(f'Invalid input format definition: {args.input_format[""]["files"][i]} does not exist.')
 86 | 
 87 |         if args.replacements:
 88 |             if not exists(args.replacements):
 89 |                 raise ValueError(f'Invalid input format definition: {args.replacements} does not exist.')
 90 | 
 91 |             try:
 92 |                 with open(args.replacements, 'r') as f:
 93 |                     args.input_format['']['replacements'] = json.load(f)
 94 |             except ValueError as e:
 95 |                 raise ValueError(f'Invalid input format definition: The content of {args.replacements} is not a valid JSON object.') from e
 96 | 
 97 | 
 98 | def process_srcml_args(args):
 99 |     if not args.srcml_language:
100 |         raise ValueError('The following argument is required for srcML: --srcml:language')
101 | 
102 | 
103 | def process_args(args):
104 |     inators.arg.process_log_level_argument(args, logger)
105 |     inators.arg.process_sys_recursion_limit_argument(args)
106 | 
107 |     args.hddmin = args_hdd_choices[args.hdd]
108 |     args.hdd_phase_configs = [args_phase_choices[phase] for phase in (args.phase or ['prune'])]
109 | 
110 |     if args.builder == 'antlr4':
111 |         process_antlr4_args(args)
112 |     elif args.builder == 'srcml':
113 |         process_srcml_args(args)
114 | 
115 |     picire.cli.process_args(args)
116 | 
117 | 
118 | def log_tree(title, hdd_tree):
119 |     if logger.isEnabledFor(logging.DEBUG):
120 |         logger.debug('%s\n\theight: %s\n\tshape: %s\n\tnodes: %s\n',
121 |                      title,
122 |                      info.height(hdd_tree),
123 |                      ', '.join(str(cnt) for cnt in info.shape(hdd_tree)),
124 |                      ', '.join(f'{cnt} {ty}' for ty, cnt in sorted(info.count(hdd_tree).items())))
125 |     logger.trace('%r', hdd_tree)
126 | 
127 | 
128 | def build_with_antlr4(src, *,
129 |                       input_format, start,
130 |                       antlr, lang='python',
131 |                       build_hidden_tokens=False,
132 |                       work_dir):
133 |     """
134 |     Execute ANTLRv4-based tree building part of picireny as if invoked from
135 |     command line, however, control its behaviour not via command line arguments
136 |     but function parameters.
137 | 
138 |     :param src: Contents of the test case to reduce.
139 |     :param input_format: Dictionary describing the input format.
140 |     :param start: Name of the start rule in [grammarname:]rulename format.
141 |     :param antlr: Path to the ANTLR4 tool (Java jar binary).
142 |     :param lang: The target language of the parser.
143 |     :param build_hidden_tokens: Build hidden tokens of the input format into the
144 |         HDD tree.
145 |     :param work_dir: Path to a working directory.
146 |     :return: The built HDD tree.
147 |     """
148 |     # Get the parameters in a dictionary so that they can be pretty-printed
149 |     args = locals().copy()
150 |     del args['src']
151 |     picire.cli.log_args('Building tree with ANTLRv4', args)
152 | 
153 |     from .antlr4 import create_hdd_tree
154 |     return create_hdd_tree(src,
155 |                            input_format=input_format, start=start,
156 |                            antlr=antlr, lang=lang,
157 |                            hidden_tokens=build_hidden_tokens,
158 |                            work_dir=work_dir)
159 | 
160 | 
161 | def build_with_srcml(src, *, language):
162 |     """
163 |     Execute srcML-based tree building part of picireny as if invoked from
164 |     command line, however, control its behaviour not via command line arguments
165 |     but function parameters.
166 | 
167 |     :param src: Contents of the test case to reduce.
168 |     :param language: Language of the input source (C, C++, C#, or Java).
169 |     :return: The built HDD tree.
170 |     """
171 |     # Get the parameters in a dictionary so that they can be pretty-printed
172 |     args = locals().copy()
173 |     del args['src']
174 |     picire.cli.log_args('Building tree with srcML', args)
175 | 
176 |     from .srcml import create_hdd_tree
177 |     return create_hdd_tree(src, language=language)
178 | 
179 | 
180 | def reduce(hdd_tree, *,
181 |            hddmin, reduce_class, reduce_config, tester_class, tester_config,
182 |            cache_class=None, unparse_with_whitespace=True,
183 |            hdd_phase_configs=({},), hdd_star=True,
184 |            flatten_recursion=False, squeeze_tree=True, skip_unremovable=True, skip_whitespace=False):
185 |     """
186 |     Execute tree reduction part of picireny as if invoked from command line,
187 |     however, control its behaviour not via command line arguments but function
188 |     parameters.
189 | 
190 |     :param hdd_tree: HDD tree to reduce.
191 |     :param hddmin: Function implementing a HDD minimization algorithm.
192 |     :param reduce_class: Reference to the reducer class.
193 |     :param reduce_config: Dictionary containing information to initialize the
194 |         reduce_class.
195 |     :param tester_class: Reference to a runnable class that can decide about the
196 |         interestingness of a test case.
197 |     :param tester_config: Dictionary containing information to initialize the
198 |         tester_class.
199 |     :param cache_class: Reference to the cache class to use.
200 |     :param unparse_with_whitespace: Unparse by adding whitespace between
201 |         nonadjacent nodes.
202 |     :param hdd_phase_configs: Sequence of dictionaries containing information to
203 |         parametrize the hddmin function.
204 |     :param hdd_star: Boolean to enable the HDD star algorithm.
205 |     :param flatten_recursion: Boolean to enable flattening left/right-recursive
206 |         trees.
207 |     :param squeeze_tree: Boolean to enable the tree squeezing optimization.
208 |     :param skip_unremovable: Boolean to enable hiding unremovable nodes from
209 |         ddmin.
210 |     :param skip_whitespace: Boolean to enable hiding whitespace-only tokens from
211 |         ddmin.
212 |     :return: The reduced HDD tree.
213 |     """
214 |     # Get the parameters in a dictionary so that they can be pretty-printed
215 |     args = locals().copy()
216 |     del args['hdd_tree']
217 |     picire.cli.log_args('Reduce session starts', args)
218 | 
219 |     log_tree('Initial tree', hdd_tree)
220 | 
221 |     # Perform tree transformations.
222 |     if flatten_recursion:
223 |         hdd_tree = transform.flatten_recursion(hdd_tree)
224 |         log_tree('Tree after recursion flattening', hdd_tree)
225 | 
226 |     if squeeze_tree:
227 |         hdd_tree = transform.squeeze_tree(hdd_tree)
228 |         log_tree('Tree after squeezing', hdd_tree)
229 | 
230 |     if skip_unremovable:
231 |         hdd_tree = transform.skip_unremovable(hdd_tree, unparse_with_whitespace=unparse_with_whitespace)
232 |         log_tree('Tree after skipping unremovable nodes', hdd_tree)
233 | 
234 |     if skip_whitespace:
235 |         hdd_tree = transform.skip_whitespace(hdd_tree)
236 |         log_tree('Tree after skipping whitespace tokens', hdd_tree)
237 | 
238 |     # Perform reduction.
239 |     for phase_cnt, phase_config in enumerate(hdd_phase_configs):
240 |         logger.info('Phase #%d', phase_cnt)
241 |         hdd_tree = hddmin(hdd_tree,
242 |                           reduce_class=reduce_class, reduce_config=reduce_config,
243 |                           tester_class=tester_class, tester_config=tester_config,
244 |                           id_prefix=(f'p{phase_cnt}',),
245 |                           cache=cache_class() if cache_class else None,
246 |                           unparse_with_whitespace=unparse_with_whitespace,
247 |                           hdd_star=hdd_star,
248 |                           **phase_config)
249 |         log_tree(f'Tree after reduction phase #{phase_cnt}', hdd_tree)
250 | 
251 |     return hdd_tree
252 | 
253 | 
254 | def execute():
255 |     """
256 |     The main entry point of picireny.
257 |     """
258 |     logging.basicConfig(format='%(message)s')
259 | 
260 |     arg_parser = ArgumentParser(description='CLI for the Picireny Hierarchical Delta Debugging Framework',
261 |                                 parents=[picire.cli.create_parser()], add_help=False)
262 | 
263 |     # General HDD settings.
264 |     arg_parser.add_argument('--builder', metavar='NAME', choices=['antlr4', 'srcml'], default='antlr4',
265 |                             help='tool to build tree representation from input (%(choices)s; default: %(default)s)')
266 |     arg_parser.add_argument('--hdd', metavar='NAME', choices=args_hdd_choices.keys(), default='hdd',
267 |                             help='HDD variant to run (%(choices)s; default: %(default)s)')
268 |     arg_parser.add_argument('--phase', metavar='NAME', choices=args_phase_choices.keys(), action='append',
269 |                             help='parametrization of the HDD variant to run (%(choices)s; default: prune) '
270 |                                  '(may be specified multiple times to run different parametrizations in sequence)')
271 |     arg_parser.add_argument('--no-hdd-star', dest='hdd_star', default=True, action='store_false',
272 |                             help='run the hddmin algorithm only once')
273 |     arg_parser.add_argument('--flatten-recursion', default=False, action='store_true',
274 |                             help='flatten recurring blocks of left/right-recursive rules')
275 |     arg_parser.add_argument('--no-squeeze-tree', dest='squeeze_tree', default=True, action='store_false',
276 |                             help='don\'t squeeze rule chains in tree representation')
277 |     arg_parser.add_argument('--no-skip-unremovable', dest='skip_unremovable', default=True, action='store_false',
278 |                             help='don\'t hide unremovable nodes from the ddmin algorithm')
279 |     arg_parser.add_argument('--skip-whitespace', dest='skip_whitespace', default=False, action='store_true',
280 |                             help='hide whitespace tokens from the ddmin algorithm')
281 |     inators.arg.add_sys_recursion_limit_argument(arg_parser)
282 |     inators.arg.add_version_argument(arg_parser, version=__version__)
283 | 
284 |     # ANTLRv4-specific settings.
285 |     antlr4_grp = arg_parser.add_argument_group('ANTLRv4-specific arguments')
286 |     antlr4_grp.add_argument('-s', '--start', '--antlr4:start', metavar='NAME',
287 |                             help='name of the start rule in [grammarname:]rulename format (default for '
288 |                                  'the optional grammarname is the empty string)')
289 |     antlr4_grp.add_argument('-g', '--grammar', '--antlr4:grammar', metavar='FILE', nargs='+',
290 |                             help='grammar file(s) describing the input format (these grammars will be '
291 |                                  'associated with the empty grammar name, see `--start`)')
292 |     antlr4_grp.add_argument('-r', '--replacements', '--antlr4:replacements', metavar='FILE',
293 |                             help='JSON file defining the default replacements for lexer and parser '
294 |                                  'rules of the grammar with the empty name (usually defined via `--grammar`)')
295 |     antlr4_grp.add_argument('--format', '--antlr4:format', metavar='FILE',
296 |                             help='JSON file describing a (possibly complex) input format')
297 |     antlr4_grp.add_argument('--build-hidden-tokens', '--antlr4:build-hidden-tokens', default=False, action='store_true',
298 |                             help='build hidden tokens of the grammar(s) into the HDD tree')
299 |     antlerinator.add_antlr_argument(antlr4_grp, long_alias='--antlr4:antlr')
300 |     antlr4_grp.add_argument('--parser', '--antlr4:parser', metavar='LANG', default='python', choices=['python', 'java'],
301 |                             help='language of the generated parsers (%(choices)s; default: %(default)s) '
302 |                                  '(using Java might gain performance, but needs JDK)')
303 | 
304 |     # srcML-specific settings.
305 |     srcml_grp = arg_parser.add_argument_group('srcML-specific arguments')
306 |     srcml_grp.add_argument('--srcml:language', dest='srcml_language', metavar='LANG', choices=['C', 'C++', 'C#', 'Java'],
307 |                            help='language of the input (%(choices)s; default: %(default)s)')
308 | 
309 |     args = arg_parser.parse_args()
310 | 
311 |     try:
312 |         process_args(args)
313 |     except ValueError as e:
314 |         arg_parser.error(e)
315 | 
316 |     if args.builder == 'antlr4':
317 |         work_dir = join(args.out, 'grammar')
318 |         hdd_tree = build_with_antlr4(args.src,
319 |                                      input_format=args.input_format, start=args.start,
320 |                                      antlr=args.antlr, lang=args.parser,
321 |                                      build_hidden_tokens=args.build_hidden_tokens,
322 |                                      work_dir=work_dir)
323 |         unparse_with_whitespace = not args.build_hidden_tokens
324 |         if args.cleanup:
325 |             rmtree(work_dir)
326 |     elif args.builder == 'srcml':
327 |         hdd_tree = build_with_srcml(args.src, language=args.srcml_language)
328 |         unparse_with_whitespace = False
329 |     else:
330 |         assert False, f'Unknown builder: {args.builder}'
331 | 
332 |     hdd_tree = reduce(hdd_tree,
333 |                       hddmin=args.hddmin,
334 |                       reduce_class=args.reduce_class, reduce_config=args.reduce_config,
335 |                       tester_class=args.tester_class, tester_config=args.tester_config,
336 |                       cache_class=args.cache, unparse_with_whitespace=unparse_with_whitespace,
337 |                       hdd_phase_configs=args.hdd_phase_configs, hdd_star=args.hdd_star,
338 |                       flatten_recursion=args.flatten_recursion,
339 |                       squeeze_tree=args.squeeze_tree,
340 |                       skip_unremovable=args.skip_unremovable,
341 |                       skip_whitespace=args.skip_whitespace)
342 |     out_src = hdd_tree.unparse(with_whitespace=unparse_with_whitespace)
343 | 
344 |     picire.cli.postprocess(args, out_src)
345 | 


--------------------------------------------------------------------------------
/src/picireny/filter.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021 Renata Hodovan, Akos Kiss.
 2 | #
 3 | # Licensed under the BSD 3-Clause License
 4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
 5 | # This file may not be copied, modified, or distributed except
 6 | # according to those terms.
 7 | 
 8 | 
 9 | def coarse_filter(node):
10 |     """
11 |     Config filter to keep nodes with empty replacements only, which is the core
12 |     of the coarse hierarchical delta debugging reduce algorithm.
13 |     """
14 |     return node.replace == ''
15 | 


--------------------------------------------------------------------------------
/src/picireny/hdd.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2007 Ghassan Misherghi.
 2 | # Copyright (c) 2016-2023 Renata Hodovan, Akos Kiss.
 3 | # Copyright (c) 2021 Daniel Vince
 4 | #
 5 | # Licensed under the BSD 3-Clause License
 6 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
 7 | # This file may not be copied, modified, or distributed except
 8 | # according to those terms.
 9 | 
10 | import itertools
11 | import logging
12 | 
13 | from .info import height
14 | from .prune import prune
15 | 
16 | logger = logging.getLogger(__name__)
17 | 
18 | 
19 | def hddmin(hdd_tree, *,
20 |            reduce_class, reduce_config, tester_class, tester_config,
21 |            id_prefix=(), cache=None, unparse_with_whitespace=True,
22 |            config_filter=None, transformations=(prune,), hdd_star=True):
23 |     """
24 |     Run the hierarchical delta debugging reduce algorithm.
25 | 
26 |     :param hdd_tree: The root of the tree that the reduce will work with (it's
27 |         the output of create_hdd_tree).
28 |     :param reduce_class: Reference to the reducer class (DD, ParallelDD or
29 |         CombinedParallelDD from the picire module).
30 |     :param reduce_config: Dictionary containing the parameters of the
31 |         reduce_class init function.
32 |     :param tester_class: Reference to a callable class that can decide about the
33 |         interestingness of a test case.
34 |     :param tester_config: Dictionary containing the parameters of the tester
35 |         class init function (except test_builder).
36 |     :param id_prefix: Tuple to prepend to config IDs during tests.
37 |     :param cache: Cache to use.
38 |     :param unparse_with_whitespace: Build test case by adding whitespace between
39 |         nonadjacent tree nodes during unparsing.
40 |     :param config_filter: Filter function from node to boolean, to allow running
41 |         hddmin selectively.
42 |     :param transformations: Iterable of transformations that reduce a
43 |         configuration of nodes.
44 |     :param hdd_star: Boolean to enable the HDD star algorithm.
45 |     :return: The reduced test case (1-tree-minimal if hdd_star is True and
46 |         config_filter is None).
47 |     """
48 | 
49 |     def collect_level_nodes(level):
50 |         def _collect_level_nodes(node, current_level):
51 |             if node.state != node.KEEP:
52 |                 return
53 |             if current_level == level:
54 |                 level_nodes.append(node)
55 |             elif hasattr(node, 'children'):
56 |                 for child in node.children:
57 |                     _collect_level_nodes(child, current_level + 1)
58 |         level_nodes = []  # Using `list` (not `set`) for the sake of stability.
59 |         _collect_level_nodes(hdd_tree, 0)
60 |         return level_nodes
61 | 
62 |     for iter_cnt in itertools.count():
63 |         logger.info('Iteration #%d', iter_cnt)
64 | 
65 |         changed = False
66 |         for level in itertools.count():
67 |             level_nodes = collect_level_nodes(level)
68 |             if not level_nodes:
69 |                 break
70 | 
71 |             if config_filter:
72 |                 level_nodes = list(filter(config_filter, level_nodes))
73 |                 if not level_nodes:
74 |                     continue
75 | 
76 |             if logger.isEnabledFor(logging.INFO):
77 |                 logger.info('Checking level %d / %d ...', level, height(hdd_tree))
78 | 
79 |             for trans_cnt, transformation in enumerate(transformations):
80 |                 hdd_tree, transformed = transformation(hdd_tree, level_nodes,
81 |                                                        reduce_class=reduce_class, reduce_config=reduce_config,
82 |                                                        tester_class=tester_class, tester_config=tester_config,
83 |                                                        id_prefix=id_prefix + (f'i{iter_cnt}', f'l{level}', f't{trans_cnt}'),
84 |                                                        cache=cache,
85 |                                                        unparse_with_whitespace=unparse_with_whitespace)
86 | 
87 |                 changed = changed or transformed
88 | 
89 |         if not hdd_star or not changed:
90 |             break
91 | 
92 |     return hdd_tree
93 | 


--------------------------------------------------------------------------------
/src/picireny/hdd_tree.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2007 Ghassan Misherghi.
  2 | # Copyright (c) 2016-2023 Renata Hodovan, Akos Kiss.
  3 | #
  4 | # Licensed under the BSD 3-Clause License
  5 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
  6 | # This file may not be copied, modified, or distributed except
  7 | # according to those terms.
  8 | 
  9 | from itertools import count
 10 | from os import linesep
 11 | from textwrap import indent
 12 | 
 13 | 
 14 | class Position:
 15 |     """
 16 |     Class defining a position in the input file. Used to recognise line breaks
 17 |     between tokens.
 18 |     """
 19 |     def __init__(self, line=1, column=0):
 20 |         """
 21 |         Initialize position object.
 22 | 
 23 |         :param line: Line number in the input (starts with 1).
 24 |         :param column: Character index relative to the beginning of the line
 25 |             (starts with 0).
 26 | 
 27 |         Note: The numbering of lines (1-based) and columns (0-based) follows
 28 |         ANTLR v4.
 29 |         """
 30 |         self.line = line
 31 |         self.column = column
 32 | 
 33 |     def after(self, text):
 34 |         """
 35 |         Calculate the end position of a text starting at the current position.
 36 |         """
 37 |         line_breaks = text.count('\n')
 38 |         return Position(self.line + line_breaks,
 39 |                         self.column + len(text) if not line_breaks else len(text) - text.rfind('\n') - 1)
 40 | 
 41 |     def shift(self, start):
 42 |         """
 43 |         Shift the position by prepending a starting position.
 44 |         """
 45 |         if self.line > 1:
 46 |             self.line += start.line - 1
 47 |         else:
 48 |             self.line = start.line
 49 |             self.column += start.column
 50 | 
 51 |     def __repr__(self):
 52 |         return f'{self.__class__.__name__}({self.line!r}, {self.column!r})'
 53 | 
 54 | 
 55 | class HDDTree:
 56 |     # Node states for unparsing.
 57 |     REMOVED = 0
 58 |     KEEP = 1
 59 | 
 60 |     # ID generator
 61 |     __id = count()
 62 | 
 63 |     def __init__(self, name, *, start=None, end=None, replace=None):
 64 |         """
 65 |         Initialize a HDD tree/node.
 66 | 
 67 |         :param name: The name of the node.
 68 |         :param start: Position object describing the start of the HDDTree node.
 69 |         :param end: Position object describing the end of the HDDTree node.
 70 |         :param replace: The minimal replacement string of the current node.
 71 |         """
 72 |         self.name = name
 73 |         self.replace = replace
 74 |         self.start = start
 75 |         self.end = end
 76 |         self.parent = None
 77 |         self.state = self.KEEP
 78 |         self.id = next(self.__id)
 79 | 
 80 |     def unparse(self, *, with_whitespace=True, transform=None):
 81 |         """
 82 |         Build test case from a HDD tree.
 83 | 
 84 |         :param with_whitespace: Add whitespace (space, new line) to separate
 85 |             nonadjacent nodes.
 86 |         :param transform: A function applied to each node before unparsing, or
 87 |             None.
 88 |         :return: The unparsed test case.
 89 |         """
 90 |         def _unparse(node):
 91 |             if transform:
 92 |                 node = transform(node)
 93 | 
 94 |             if node.state != node.KEEP:
 95 |                 return node.replace
 96 | 
 97 |             # Keep the text of the token.
 98 |             if isinstance(node, HDDToken):
 99 |                 return node.text
100 | 
101 |             if not node.children:
102 |                 return ''
103 | 
104 |             # Concat the text of children.
105 |             child_strs = [_unparse(child) for child in node.children]
106 |             node_str = child_strs[0]
107 |             for i in range(1, len(node.children)):
108 |                 # Do not add extra spaces if the next chunk is empty.
109 |                 if not child_strs[i]:
110 |                     continue
111 |                 if with_whitespace:
112 |                     if node.children[i].start.line > node.children[i - 1].end.line:
113 |                         node_str += linesep
114 |                     elif node.children[i].start.column > node.children[i - 1].end.column:
115 |                         node_str += ' '
116 |                 node_str += child_strs[i]
117 | 
118 |             return node_str
119 | 
120 |         return _unparse(self)
121 | 
122 |     def replace_with(self, other):
123 |         """
124 |         Replace the current node with `other` in the HDD tree.
125 | 
126 |         :param other: Node to replace the current with.
127 |         """
128 |         self.parent.children[self.parent.children.index(self)] = other
129 |         other.parent = self.parent
130 | 
131 | 
132 | class HDDToken(HDDTree):
133 |     def __init__(self, name, text, *, start=None, end=None, replace=None):
134 |         super().__init__(name, start=start, end=end, replace=replace)
135 |         self.text = text
136 | 
137 |     def __repr__(self):
138 |         parts = [
139 |             f'name={self.name!r}',
140 |             f'text={self.text!r}',
141 |         ]
142 |         if self.replace is not None:
143 |             parts.append(f'replace={self.replace!r}')
144 |         if self.start is not None:
145 |             parts.append(f'start={self.start!r}')
146 |         if self.end is not None:
147 |             parts.append(f'end={self.end!r}')
148 |         parts.append(f'id={self.id!r}')
149 |         if self.state != self.KEEP:
150 |             parts.append(f'state={self.state!r}')
151 | 
152 |         return f'{self.__class__.__name__}({", ".join(parts)})'
153 | 
154 | 
155 | class HDDRule(HDDTree):
156 |     def __init__(self, name, *, start=None, end=None, replace=None):
157 |         super().__init__(name, start=start, end=end, replace=replace)
158 |         self.children = []
159 | 
160 |     def add_child(self, child):
161 |         self.children.append(child)
162 |         child.parent = self
163 | 
164 |     def add_children(self, children):
165 |         for child in children:
166 |             self.add_child(child)
167 | 
168 |     def remove_child(self, child):
169 |         self.children.remove(child)
170 | 
171 |     def __repr__(self):
172 |         parts = [
173 |             f'name={self.name!r}',
174 |         ]
175 |         if self.replace is not None:
176 |             parts.append(f'replace={self.replace!r}')
177 |         if self.start is not None:
178 |             parts.append(f'start={self.start!r}')
179 |         if self.end is not None:
180 |             parts.append(f'end={self.end!r}')
181 |         parts.append(f'id={self.id!r}')
182 |         if self.state != self.KEEP:
183 |             parts.append(f'state={self.state!r}')
184 |         if self.state == self.KEEP and self.children:
185 |             parts.append('children=[\n%s\n]' % indent(',\n'.join(repr(child) for child in self.children), '  '))
186 | 
187 |         return f'{self.__class__.__name__}({", ".join(parts)})'
188 | 


--------------------------------------------------------------------------------
/src/picireny/hddr.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2018-2022 Renata Hodovan, Akos Kiss.
  2 | # Copyright (c) 2021 Daniel Vince
  3 | #
  4 | # Licensed under the BSD 3-Clause License
  5 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
  6 | # This file may not be copied, modified, or distributed except
  7 | # according to those terms.
  8 | 
  9 | import itertools
 10 | import logging
 11 | 
 12 | from .prune import prune
 13 | 
 14 | logger = logging.getLogger(__name__)
 15 | 
 16 | 
 17 | def hddrmin(hdd_tree, *,
 18 |             reduce_class, reduce_config, tester_class, tester_config,
 19 |             id_prefix=(), cache=None, unparse_with_whitespace=True,
 20 |             config_filter=None, transformations=(prune,), hdd_star=True,
 21 |             pop_first=False, append_reversed=False):
 22 |     """
 23 |     Run the recursive variant of the hierarchical delta debugging reduce
 24 |     algorithm (a.k.a. HDDr).
 25 | 
 26 |     The tree traversal implementation is actually not recursive but an iterative
 27 |     queue-based reformulation of HDDr. How tree nodes are popped from the queue
 28 |     during the iteration (whether from the beginning or from the end of the
 29 |     queue) and how the children of a visited node are appended to the queue
 30 |     (whether they are added in forward or reverse order) give rise to different
 31 |     variants of HDDr:
 32 | 
 33 |         - 'pop first' with 'forward append' gives the classic breadth-first
 34 |           traversal,
 35 |         - 'pop first' with 'reverse append' gives syntactically reversed
 36 |           breadth-first traversal,
 37 |         - 'pop last' with 'reverse append' gives the classic depth-first
 38 |           traversal,
 39 |         - 'pop last' with 'forward append' gives syntactically reversed
 40 |           depth-first traversal.
 41 | 
 42 |     :param hdd_tree: The root of the tree that the reduce will work with (it's
 43 |         the output of create_hdd_tree).
 44 |     :param reduce_class: Reference to the reducer class (DD, ParallelDD or
 45 |         CombinedParallelDD from the picire module).
 46 |     :param reduce_config: Dictionary containing the parameters of the
 47 |         reduce_class init function.
 48 |     :param tester_class: Reference to a callable class that can decide about the
 49 |         interestingness of a test case.
 50 |     :param tester_config: Dictionary containing the parameters of the tester
 51 |         class init function (except test_builder).
 52 |     :param id_prefix: Tuple to prepend to config IDs during tests.
 53 |     :param cache: Cache to use.
 54 |     :param unparse_with_whitespace: Build test case by adding whitespace between
 55 |         nonadjacent tree nodes during unparsing.
 56 |     :param config_filter: Filter function from node to boolean, to allow running
 57 |         hddmin selectively.
 58 |     :param transformations: Iterable of transformations that reduce a
 59 |         configuration of nodes.
 60 |     :param hdd_star: Boolean to enable the HDD star algorithm.
 61 |     :param pop_first: Boolean to control tree traversal (see above for details).
 62 |     :param append_reverse: Boolean to control tree traversal (see above for
 63 |         details).
 64 |     :return: The reduced test case (1-tree-minimal if hdd_star is True and
 65 |         config_filter is None).
 66 |     """
 67 | 
 68 |     for iter_cnt in itertools.count():
 69 |         logger.info('Iteration #%d', iter_cnt)
 70 | 
 71 |         changed = False
 72 |         queue = [hdd_tree]
 73 |         for node_cnt in itertools.count():
 74 |             if not queue:
 75 |                 break
 76 |             if pop_first:
 77 |                 queue, node = queue[1:], queue[0]
 78 |             else:
 79 |                 queue, node = queue[:-1], queue[-1]
 80 |             if not hasattr(node, 'children') or node.state != node.KEEP:
 81 |                 continue
 82 | 
 83 |             children = [child for child in node.children if child.state == child.KEEP]
 84 |             if config_filter:
 85 |                 children = list(filter(config_filter, children))
 86 | 
 87 |             if children:
 88 |                 logger.info('Checking node #%d ...', node_cnt)
 89 | 
 90 |                 for trans_cnt, transformation in enumerate(transformations):
 91 |                     hdd_tree, transformed = transformation(hdd_tree, children,
 92 |                                                            reduce_class=reduce_class, reduce_config=reduce_config,
 93 |                                                            tester_class=tester_class, tester_config=tester_config,
 94 |                                                            id_prefix=id_prefix + (f'i{iter_cnt}', f'n{node_cnt}', f't{trans_cnt}'),
 95 |                                                            cache=cache,
 96 |                                                            unparse_with_whitespace=unparse_with_whitespace)
 97 | 
 98 |                     changed = changed or transformed
 99 | 
100 |             for child in node.children if not append_reversed else reversed(node.children):
101 |                 if child.state == child.KEEP:
102 |                     queue.append(child)
103 | 
104 |         if not hdd_star or not changed:
105 |             break
106 | 
107 |     return hdd_tree
108 | 


--------------------------------------------------------------------------------
/src/picireny/hoist.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021-2023 Renata Hodovan, Akos Kiss.
  2 | # Copyright (c) 2021 Daniel Vince.
  3 | #
  4 | # Licensed under the BSD 3-Clause License
  5 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
  6 | # This file may not be copied, modified, or distributed except
  7 | # according to those terms.
  8 | 
  9 | import itertools
 10 | import logging
 11 | 
 12 | from picire import AbstractDD, Outcome
 13 | 
 14 | logger = logging.getLogger(__name__)
 15 | 
 16 | 
 17 | class HoistingTestBuilder:
 18 | 
 19 |     def __init__(self, tree, *, with_whitespace=True):
 20 |         """
 21 |         Initialize the test builder.
 22 | 
 23 |         :param tree: Tree representing the current test case.
 24 |         :param with_whitespace: Unparse by adding whitespace between nonadjacent
 25 |             nodes.
 26 |         """
 27 |         self.tree = tree
 28 |         self.with_whitespace = with_whitespace
 29 | 
 30 |     def __call__(self, mapping_config):
 31 |         """
 32 |         :param mapping_config: A list of mappings of initial configuration
 33 |             elements to new ones.
 34 |         :return: The unparsed test case with the mappings applied.
 35 |         """
 36 |         def map(node):
 37 |             return mapping.get(node, node)
 38 | 
 39 |         mapping = dict(mapping_config)
 40 |         return self.tree.unparse(with_whitespace=self.with_whitespace, transform=map)
 41 | 
 42 | 
 43 | class MappingMin(AbstractDD):
 44 | 
 45 |     def __init__(self, test, *, cache=None, id_prefix=None):
 46 |         """
 47 |         :param test: A callable tester object.
 48 |         :param cache: Cache object to use.
 49 |         :param id_prefix: Tuple to prepend to config IDs during tests.
 50 |         """
 51 | 
 52 |         super().__init__(test=test, split=None, cache=cache, id_prefix=id_prefix)
 53 | 
 54 |     def __call__(self, config):
 55 |         """
 56 |         Compute a mapping of the initial configuration to another (usually
 57 |         smaller) but still failing configuration.
 58 | 
 59 |         :param config: The initial configuration that will be reduced.
 60 |         :return: A mapping of initial configuration elements to new ones.
 61 |         """
 62 | 
 63 |         def collect_hoistables(node):
 64 |             def _collect_hoistables(desc):
 65 |                 if desc.name == node.name:
 66 |                     hoistables.append(desc)
 67 |                     return
 68 |                 if hasattr(desc, 'children') and desc.state == desc.KEEP:
 69 |                     for child in desc.children:
 70 |                         _collect_hoistables(child)
 71 | 
 72 |             hoistables = []
 73 |             if hasattr(node, 'children') and node.state == node.KEEP and node.name:
 74 |                 for child in node.children:
 75 |                     _collect_hoistables(child)
 76 |             return hoistables
 77 | 
 78 |         mapping = {}
 79 | 
 80 |         for run in itertools.count():
 81 |             logger.info('Run #%d', run)
 82 |             logger.info('\tMapping size: %d', len(mapping))
 83 |             if logger.isEnabledFor(logging.DEBUG):
 84 |                 logger.debug('\tMapping: %r', {c.id: m.id for c, m in mapping.items()})
 85 | 
 86 |             for i, (c, m) in enumerate((c, m) for c in config for m in collect_hoistables(mapping.get(c, c))):
 87 |                 new_mapping = mapping.copy()
 88 |                 new_mapping[c] = m
 89 |                 mapping_config = list(new_mapping.items())
 90 |                 config_id = (f'r{run}', f'm{i}')
 91 | 
 92 |                 outcome = self._lookup_cache(mapping_config, config_id) or self._test_config(mapping_config, config_id)
 93 | 
 94 |                 if outcome is Outcome.FAIL:
 95 |                     mapping = new_mapping
 96 |                     logger.info('\tHoisted')
 97 |                     break
 98 |             else:
 99 |                 break
100 | 
101 |         logger.info('\tDone')
102 |         return mapping
103 | 
104 | 
105 | def hoist(hdd_tree, config_nodes, *,
106 |           reduce_class=None, reduce_config=None, tester_class, tester_config,
107 |           id_prefix, cache, unparse_with_whitespace):
108 |     """
109 |     Try hoisting subtrees.
110 | 
111 |     :param hdd_tree: The root of the tree that the reduce will work with.
112 |     :param config_nodes: Nodes from one level collected by the HDD algorithm.
113 |     :param reduce_class: Unused, present for being compatible with 'prune'
114 |         transformation.
115 |     :param reduce_config: Unused, present for being compatible with 'prune'
116 |         transformation.
117 |     :param tester_class: Reference to a callable class that can decide about the
118 |         interestingness of a test case.
119 |     :param tester_config: Dictionary containing the parameters of the tester
120 |         class init function (except test_builder).
121 |     :param id_prefix: Tuple to prepend to config IDs during tests.
122 |     :param cache: Cache to use.
123 |     :param unparse_with_whitespace: Build test case by adding whitespace between
124 |         nonadjacent tree nodes during unparsing.
125 |     :return: The reduced tree and a boolean value that shows whether the tree
126 |         has changed during hoisting.
127 |     """
128 | 
129 |     if not config_nodes:
130 |         return hdd_tree, False
131 | 
132 |     test_builder = HoistingTestBuilder(hdd_tree, with_whitespace=unparse_with_whitespace)
133 |     if cache:
134 |         cache.clear()
135 |         cache.set_test_builder(test_builder)
136 | 
137 |     test = tester_class(test_builder=test_builder, **tester_config)
138 |     mapping_min = MappingMin(test, cache=cache, id_prefix=id_prefix)
139 |     mapping = mapping_min(config_nodes)
140 | 
141 |     def _apply_mapping(node):
142 |         node = mapping.get(node, node)
143 |         if hasattr(node, 'children'):
144 |             for i, child in enumerate(node.children):
145 |                 node.children[i].replace_with(_apply_mapping(child))
146 |         return node
147 |     hdd_tree = _apply_mapping(hdd_tree)
148 | 
149 |     return hdd_tree, bool(mapping)
150 | 


--------------------------------------------------------------------------------
/src/picireny/info.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2018-2021 Renata Hodovan, Akos Kiss.
 2 | #
 3 | # Licensed under the BSD 3-Clause License
 4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
 5 | # This file may not be copied, modified, or distributed except
 6 | # according to those terms.
 7 | 
 8 | from .hdd_tree import HDDRule
 9 | 
10 | 
11 | def count(node, *, removed=False):
12 |     """
13 |     Count nodes in the tree by type.
14 | 
15 |     Note: If `removed` is `True`, removed tokens and rules are also counted (but
16 |     sub-trees of removed rules are not).
17 | 
18 |     :param node: The root of the tree to do the counting for.
19 |     :return: A dictionary of counts indexed by node type name.
20 |     """
21 |     def _count(node):
22 |         if node.state != node.KEEP and not removed:
23 |             return
24 | 
25 |         ty = node.__class__.__name__
26 |         if ty not in stats:
27 |             stats[ty] = 0
28 |         stats[ty] += 1
29 | 
30 |         if isinstance(node, HDDRule) and node.state == node.KEEP:
31 |             for child in node.children:
32 |                 _count(child)
33 | 
34 |     stats = {}
35 |     _count(node)
36 |     return stats
37 | 
38 | 
39 | def height(node, *, removed=False):
40 |     """
41 |     Calculate the height of the tree.
42 | 
43 |     Note: If `removed` is `True`, removed tokens and rules are also counted (but
44 |     sub-trees of removed rules are not).
45 | 
46 |     :param node: The root of the tree to do the calculation for.
47 |     :return: The height of the tree.
48 |     """
49 |     if node.state != node.KEEP and not removed:
50 |         return 0
51 | 
52 |     return 1 + (max((height(child) for child in node.children), default=0)
53 |                 if isinstance(node, HDDRule) and node.state == node.KEEP else 0)
54 | 
55 | 
56 | def shape(node, *, removed=False):
57 |     """
58 |     Calculate the shape of the tree, i.e., the number of nodes on each tree
59 |     level.
60 | 
61 |     Note: If `removed` is `True`, removed tokens and rules are also counted (but
62 |     sub-trees of removed rules are not).
63 | 
64 |     :param node: The root of the tree to do the calculation for.
65 |     :return: A list of level sizes.
66 |     """
67 |     def _shape(node, level):
68 |         if node.state != node.KEEP and not removed:
69 |             return
70 | 
71 |         if len(sizes) <= level:
72 |             sizes.extend([0] * (level - len(sizes) + 1))
73 |         sizes[level] += 1
74 | 
75 |         if isinstance(node, HDDRule) and node.state == node.KEEP:
76 |             for child in node.children:
77 |                 _shape(child, level + 1)
78 | 
79 |     sizes = []
80 |     _shape(node, 0)
81 |     return sizes
82 | 


--------------------------------------------------------------------------------
/src/picireny/prune.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021-2023 Renata Hodovan, Akos Kiss.
  2 | #
  3 | # Licensed under the BSD 3-Clause License
  4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
  5 | # This file may not be copied, modified, or distributed except
  6 | # according to those terms.
  7 | 
  8 | import logging
  9 | 
 10 | from copy import copy
 11 | 
 12 | from picire import AbstractDD, Outcome
 13 | 
 14 | logger = logging.getLogger(__name__)
 15 | 
 16 | 
 17 | class PruningTestBuilder:
 18 | 
 19 |     def __init__(self, tree, ids, *, with_whitespace=True):
 20 |         """
 21 |         Initialize the test builder.
 22 | 
 23 |         :param tree: Tree representing the current test case.
 24 |         :param ids: The IDs of nodes that can change status.
 25 |         :param with_whitespace: Unparse by adding whitespace between nonadjacent
 26 |             nodes.
 27 |         """
 28 |         self.tree = tree
 29 |         self.ids = ids
 30 |         self.with_whitespace = with_whitespace
 31 | 
 32 |     def __call__(self, config):
 33 |         """
 34 |         :param config: List of IDs of nodes that will be kept in the next test
 35 |             case.
 36 |         :return: The unparsed test case containing only the units defined in
 37 |             config.
 38 |         """
 39 |         def removed(node):
 40 |             if node.id in self.ids and node.id not in config:
 41 |                 removed_node = copy(node)
 42 |                 removed_node.state = removed_node.REMOVED
 43 |                 return removed_node
 44 |             return node
 45 | 
 46 |         config = set(config)
 47 |         return self.tree.unparse(with_whitespace=self.with_whitespace, transform=removed)
 48 | 
 49 | 
 50 | class EmptyDD(AbstractDD):
 51 |     """
 52 |     Special DD variant that *does* test the empty configuration (and nothing
 53 |     else).
 54 |     """
 55 | 
 56 |     def __init__(self, test, *, cache=None, id_prefix=None):
 57 |         """
 58 |         Initialize an EmptyDD object.
 59 | 
 60 |         :param test: A callable tester object.
 61 |         :param cache: Cache object to use.
 62 |         :param id_prefix: Tuple to prepend to config IDs during tests.
 63 |         """
 64 |         super().__init__(test=test, split=None, cache=cache, id_prefix=id_prefix)
 65 | 
 66 |     def __call__(self, config):
 67 |         """
 68 |         Return a 1-minimal failing subset of the initial configuration, and also
 69 |         test the empty configuration while doing so.
 70 | 
 71 |         Note: The initial configuration is expected to be of size 1, thus the
 72 |         1-minimal failing subset is always its trivial subset: either itself or
 73 |         the empty configuration.
 74 | 
 75 |         :param config: The initial configuration that will be reduced.
 76 |         :return: 1-minimal failing configuration.
 77 |         """
 78 |         assert len(config) == 1
 79 |         # assert self._test_config(config, ('assert',)) == self.FAIL
 80 | 
 81 |         empty = []
 82 |         config_id = ('empty',)
 83 | 
 84 |         logger.info('Run #empty')
 85 |         logger.info('\tConfig size: %d', len(config))
 86 |         logger.debug('\tConfig: %r', config)
 87 | 
 88 |         outcome = self._lookup_cache(empty, config_id) or self._test_config(empty, config_id)
 89 |         if outcome is Outcome.FAIL:
 90 |             config = empty
 91 |             logger.info('\tReduced')
 92 | 
 93 |         logger.info('\tDone')
 94 |         return config
 95 | 
 96 | 
 97 | def prune(hdd_tree, config_nodes, *,
 98 |           reduce_class, reduce_config, tester_class, tester_config,
 99 |           id_prefix, cache, unparse_with_whitespace):
100 |     """
101 |     Pruning-based reduction of a set of nodes (i.e., sub-trees), as used by
102 |     various hierarchical delta debugging algorithm variants.
103 | 
104 |     :param hdd_tree: The root of the tree.
105 |     :param config_nodes: The list of nodes to reduce.
106 |     :param reduce_class: Reference to the reducer class (DD, ParallelDD or
107 |         CombinedParallelDD from the picire module).
108 |     :param reduce_config: Dictionary containing the parameters of the
109 |         reduce_class init function.
110 |     :param tester_class: Reference to a callable class that can decide about the
111 |         interestingness of a test case.
112 |     :param tester_config: Dictionary containing the parameters of the tester
113 |         class init function (except test_builder).
114 |     :param id_prefix: Tuple to prepend to config IDs during tests.
115 |     :param cache: Cache to use.
116 |     :param unparse_with_whitespace: Build test case by adding whitespace between
117 |         nonadjacent tree nodes during unparsing.
118 |     :return: Tuple: (root of the tree, bool whether the tree changed)
119 |     """
120 | 
121 |     config_ids = [node.id for node in config_nodes]
122 |     config_ids_set = set(config_ids)
123 | 
124 |     test_builder = PruningTestBuilder(hdd_tree, config_ids_set, with_whitespace=unparse_with_whitespace)
125 |     if cache:
126 |         cache.clear()
127 |         cache.set_test_builder(test_builder)
128 | 
129 |     test = tester_class(test_builder=test_builder, **tester_config)
130 |     dd = reduce_class(test, cache=cache, id_prefix=id_prefix, **reduce_config)
131 |     c = dd(config_ids)
132 |     if len(c) == 1:
133 |         dd = EmptyDD(test, cache=cache, id_prefix=id_prefix)
134 |         c = dd(c)
135 |     c = set(c)
136 | 
137 |     def _set_state(node):
138 |         if node.id in config_ids_set:
139 |             node.state = node.KEEP if node.id in c else node.REMOVED
140 |         elif hasattr(node, 'children') and node.state == node.KEEP:
141 |             for child in node.children:
142 |                 _set_state(child)
143 |     _set_state(hdd_tree)
144 | 
145 |     return hdd_tree, len(c) < len(config_ids_set)
146 | 


--------------------------------------------------------------------------------
/src/picireny/srcml/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2018-2020 Renata Hodovan, Akos Kiss.
2 | #
3 | # Licensed under the BSD 3-Clause License
4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
5 | # This file may not be copied, modified, or distributed except
6 | # according to those terms.
7 | 
8 | from .hdd_tree_builder import create_hdd_tree
9 | 


--------------------------------------------------------------------------------
/src/picireny/srcml/hdd_tree_builder.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2018-2022 Renata Hodovan, Akos Kiss.
 2 | #
 3 | # Licensed under the BSD 3-Clause License
 4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
 5 | # This file may not be copied, modified, or distributed except
 6 | # according to those terms.
 7 | 
 8 | import logging
 9 | import xml.etree.ElementTree as ET
10 | 
11 | from subprocess import CalledProcessError, PIPE, run
12 | 
13 | from ..hdd_tree import HDDRule, HDDToken, Position
14 | from ..transform import remove_empty_nodes
15 | 
16 | 
17 | logger = logging.getLogger(__name__)
18 | 
19 | 
20 | def build_hdd_tree(element, start):
21 |     name = element.tag
22 |     name = name.replace('{http://www.srcML.org/srcML/src}', 'src:')
23 |     name = name.replace('{http://www.srcML.org/srcML/cpp}', 'cpp:')
24 |     name = name.replace('{http://www.srcML.org/srcML/position}', 'pos:')
25 | 
26 |     rule = HDDRule(name, start=start, end=start, replace='')
27 |     result = [rule]
28 | 
29 |     if element.text:
30 |         end = start.after(element.text)
31 |         rule.add_child(HDDToken(f'{name}@text', element.text, start=start, end=end, replace=element.text))
32 |         rule.end = end
33 | 
34 |     for child in list(element):
35 |         if child.tag.startswith('{http://www.srcML.org/srcML/position}'):
36 |             continue
37 |         for node in build_hdd_tree(child, rule.end):
38 |             rule.add_child(node)
39 |             rule.end = rule.children[-1].end
40 | 
41 |     if element.tail:
42 |         result += [HDDToken(f'{name}@tail', element.tail, start=rule.end, end=rule.end.after(element.tail), replace=element.tail)]
43 | 
44 |     return result
45 | 
46 | 
47 | def create_hdd_tree(src, *, language):
48 |     """
49 |     Build a tree that the HDD algorithm can work with.
50 | 
51 |     :param src: Input source to srcML.
52 |     :param language: Language of the input source (C, C++, C#, or Java).
53 |     :return: The root of the created HDD tree.
54 |     """
55 | 
56 |     try:
57 |         stdout = run(('srcml', f'--language={language}'),
58 |                      input=src, stdout=PIPE, stderr=PIPE, check=True).stdout
59 |     except CalledProcessError as e:
60 |         logger.error('Parsing with srcml failed!\n%s\n%s\n', e.stdout, e.stderr)
61 |         raise
62 | 
63 |     root = ET.fromstring(stdout)
64 | 
65 |     tree_result = build_hdd_tree(root, Position())
66 |     assert len(tree_result) == 1
67 |     tree = tree_result[0]
68 | 
69 |     tree = remove_empty_nodes(tree)
70 |     return tree
71 | 


--------------------------------------------------------------------------------
/src/picireny/transform.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2017-2021 Renata Hodovan, Akos Kiss.
  2 | #
  3 | # Licensed under the BSD 3-Clause License
  4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
  5 | # This file may not be copied, modified, or distributed except
  6 | # according to those terms.
  7 | 
  8 | from .hdd_tree import HDDRule, HDDToken
  9 | 
 10 | 
 11 | def remove_empty_nodes(node):
 12 |     """
 13 |     Delete those nodes from the HDD tree that don't contribute to the output at
 14 |     all (tokens with empty text, e.g., the EOF token; and rules with no
 15 |     children, e.g., lambda rules).
 16 | 
 17 |     :param node: The root of the tree to be transformed.
 18 |     :return: The root of the transformed tree.
 19 |     """
 20 |     if isinstance(node, HDDRule):
 21 |         non_empty_children = []
 22 | 
 23 |         for child in node.children:
 24 |             if isinstance(child, HDDToken):
 25 |                 # empty token is usually the EOF only (but interestingly, it may
 26 |                 # appear multiple times in the tree)
 27 |                 if child.text != '':
 28 |                     non_empty_children.append(child)
 29 |             else:
 30 |                 assert isinstance(child, HDDRule)
 31 |                 remove_empty_nodes(child)
 32 | 
 33 |                 # a grammar may contain lambda rules (with nothing on the
 34 |                 # right-hand side, or with an empty alternative), or rules that
 35 |                 # produce EOF only (which is removed in the branch above)
 36 |                 if child.children:
 37 |                     non_empty_children.append(child)
 38 | 
 39 |         node.children[:] = non_empty_children
 40 | 
 41 |     return node
 42 | 
 43 | 
 44 | def flatten_recursion(node):
 45 |     """
 46 |     Heuristics to flatten left or right-recursion. E.g., given a rule
 47 |         rule : a | rule b
 48 |     and a HDD tree built with it from an input, rewrite the resulting HDD tree
 49 |     as if it was built using
 50 |         rule : a b*
 51 |     This allows HDD to potentially completely remove the recurring blocks
 52 |     (instead of replacing them with their minimal replacement, which is usually
 53 |     not "").
 54 | 
 55 |     :param node: The root of the tree to be transformed.
 56 |     :return: The root of the transformed tree.
 57 |     """
 58 |     if isinstance(node, HDDRule) and node.state == node.KEEP:
 59 |         for child in node.children:
 60 |             flatten_recursion(child)
 61 | 
 62 |         if len(node.children) > 1 and node.name:
 63 |             if node.children[0].name == node.name:
 64 |                 left = node.children[0]
 65 | 
 66 |                 right = HDDRule('', replace='', start=node.children[1].start, end=node.children[-1].end)
 67 |                 right.add_children(node.children[1:])
 68 |                 del node.children[:]
 69 | 
 70 |                 node.add_children(left.children)
 71 |                 node.add_child(right)
 72 | 
 73 |             elif node.children[-1].name == node.name:
 74 |                 right = node.children[-1]
 75 | 
 76 |                 left = HDDRule('', replace='', start=node.children[0].start, end=node.children[-2].end)
 77 |                 left.add_children(node.children[0:-1])
 78 |                 del node.children[:]
 79 | 
 80 |                 node.add_child(left)
 81 |                 node.add_children(right.children)
 82 | 
 83 |         # This only seems to happen if there was some error during parsing.
 84 |         # In this case a weird 1-step chain gets inserted into the left/right-
 85 |         # recursive tree, which prevents flattening. But we cannot postpone the
 86 |         # merging of this 1-step chain to squeeze_tree because flatten_recursion
 87 |         # is usually not called again afterwards. So, do a degenerate "rotation"
 88 |         # (i.e., simple lifting) here.
 89 |         if len(node.children) == 1 and node.name:
 90 |             if node.children[0].name == node.name:
 91 |                 child = node.children[0]
 92 |                 del node.children[:]
 93 |                 node.add_children(child.children)
 94 | 
 95 |     return node
 96 | 
 97 | 
 98 | def squeeze_tree(node):
 99 |     """
100 |     Compress single line chains in the HDD tree whose minimal replacements are
101 |     the same and hence they would result in redundant checks during the
102 |     minimization.
103 | 
104 |     :param node: The root of the tree to be transformed.
105 |     :return: The root of the transformed tree.
106 |     """
107 |     if isinstance(node, HDDRule):
108 |         for i, child in enumerate(node.children):
109 |             squeezed_child = squeeze_tree(child)
110 |             if child != squeezed_child:
111 |                 node.children[i].replace_with(squeezed_child)
112 | 
113 |         if len(node.children) == 1 and node.children[0].replace == node.replace:
114 |             return node.children[0]
115 | 
116 |     return node
117 | 
118 | 
119 | def skip_unremovable(node, *, unparse_with_whitespace=True):
120 |     """
121 |     Mark those nodes as removed whose unparsing (e.g., for tokens, their text)
122 |     is the same tokens as their minimal replacement, thus hiding them from
123 |     hddmin, because they just cause extra test runs but cannot reduce the input.
124 | 
125 |     :param node: The root of the tree to be transformed.
126 |     :return: The root of the transformed tree.
127 |     """
128 |     if isinstance(node, HDDRule):
129 |         for child in node.children:
130 |             skip_unremovable(child, unparse_with_whitespace=unparse_with_whitespace)
131 | 
132 |     if node.unparse(with_whitespace=unparse_with_whitespace) == node.replace:
133 |         node.state = node.REMOVED
134 | 
135 |     return node
136 | 
137 | 
138 | def skip_whitespace(node):
139 |     """
140 |     Mark tokens with whitespace-only text as removed. Useful when hidden-channel
141 |     tokens are built into the tree to let hddmin deal with
142 |     hidden-but-non-whitespace tokens only.
143 | 
144 |     :param node: The root of the tree to be transformed.
145 |     :return: The root of the transformed tree.
146 |     """
147 |     if isinstance(node, HDDRule):
148 |         for child in node.children:
149 |             skip_whitespace(child)
150 |     else:
151 |         assert isinstance(node, HDDToken)
152 |         if node.text.isspace():
153 |             node.state = node.REMOVED
154 | 
155 |     return node
156 | 


--------------------------------------------------------------------------------
/tests/resources/INILexer.g4:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2017-2021 Renata Hodovan, Akos Kiss.
  3 |  *
  4 |  * Licensed under the BSD 3-Clause License
  5 |  * <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
  6 |  * This file may not be copied, modified, or distributed except
  7 |  * according to those terms.
  8 |  */
  9 | 
 10 | lexer grammar INILexer;
 11 | 
 12 | 
 13 | HEADER_OPEN
 14 |    : '[' -> pushMode(HEADER_MODE)
 15 |    ;
 16 | 
 17 | KEY
 18 |    : KEY_START_CHAR ( KEY_CHAR_WS* KEY_CHAR )?
 19 |    ;
 20 | 
 21 | fragment
 22 | KEY_START_CHAR
 23 |    : ~[[:=\r\n;# \t]
 24 |    ;
 25 | 
 26 | fragment
 27 | KEY_CHAR
 28 |    : KEY_START_CHAR
 29 |    | '['
 30 |    ;
 31 | 
 32 | fragment
 33 | KEY_CHAR_WS
 34 |    : KEY_CHAR
 35 |    | WS
 36 |    ;
 37 | 
 38 | EQUALS
 39 |    : [:=] -> pushMode(VALUE_MODE)
 40 |    ;
 41 | 
 42 | WS
 43 |    : [ \t]+
 44 |    ;
 45 | 
 46 | EOL
 47 |    : '\r\n'
 48 |    | '\r'
 49 |    | '\n'
 50 |    ;
 51 | 
 52 | COMMENT
 53 |    : COMMENT_START_CHAR ~[\r\n]*
 54 |    ;
 55 | 
 56 | fragment
 57 | COMMENT_START_CHAR
 58 |    : [;#]
 59 |    ;
 60 | 
 61 | 
 62 | mode HEADER_MODE;
 63 | 
 64 | HEADER
 65 |    : HEADER_CHAR ( HEADER_CHAR_WS* HEADER_CHAR )?
 66 |    ;
 67 | 
 68 | fragment
 69 | HEADER_CHAR
 70 |    : ~[[\]\r\n;# \t]
 71 |    ;
 72 | 
 73 | fragment
 74 | HEADER_CHAR_WS
 75 |    : HEADER_CHAR
 76 |    | HEADER_WS
 77 |    ;
 78 | 
 79 | HEADER_CLOSE
 80 |    : ']' -> popMode
 81 |    ;
 82 | 
 83 | HEADER_WS
 84 |    : [ \t]+
 85 |    ;
 86 | 
 87 | 
 88 | mode VALUE_MODE;
 89 | 
 90 | VALUE
 91 |    : VALUE_CHAR ( VALUE_CHAR_WS* VALUE_CHAR )? -> popMode
 92 |    ;
 93 | 
 94 | fragment
 95 | VALUE_CHAR
 96 |    : ~[\r\n\t;# ]
 97 |    ;
 98 | 
 99 | fragment
100 | VALUE_CHAR_WS
101 |    : VALUE_CHAR
102 |    | VALUE_WS
103 |    ;
104 | 
105 | VALUE_WS
106 |    : [ \t]+
107 |    ;
108 | 


--------------------------------------------------------------------------------
/tests/resources/INIParser.g4:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017 Renata Hodovan, Akos Kiss.
 3 |  *
 4 |  * Licensed under the BSD 3-Clause License
 5 |  * <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
 6 |  * This file may not be copied, modified, or distributed except
 7 |  * according to those terms.
 8 |  */
 9 | 
10 | parser grammar INIParser;
11 | 
12 | options { tokenVocab=INILexer; }
13 | 
14 | 
15 | ini
16 |    : comment* section* EOF
17 |    ;
18 | 
19 | comment
20 |    : WS? COMMENT EOL
21 |    ;
22 | 
23 | section
24 |    : header ( comment | line )*
25 |    ;
26 | 
27 | header
28 |    : WS? HEADER_OPEN HEADER_WS? HEADER HEADER_WS? HEADER_CLOSE WS? EOL
29 |    ;
30 | 
31 | // Multiline values are not handled properly by this approach, the continuation
32 | // lines will be recognized as keys, probably with no value.
33 | line
34 |    : WS? ( KEY WS? ( EQUALS VALUE_WS? ( VALUE WS? )? )? )? EOL
35 |    ;
36 | 


--------------------------------------------------------------------------------
/tests/resources/JSON.g4:
--------------------------------------------------------------------------------
 1 | 
 2 | /** Taken from "The Definitive ANTLR 4 Reference" by Terence Parr */
 3 | 
 4 | // Derived from http://json.org
 5 | grammar JSON;
 6 | 
 7 | json
 8 |    : value
 9 |    ;
10 | 
11 | obj
12 |    : '{' pair (',' pair)* '}'
13 |    | '{' '}'
14 |    ;
15 | 
16 | pair
17 |    : STRING ':' value
18 |    ;
19 | 
20 | array
21 |    : '[' value (',' value)* ']'
22 |    | '[' ']'
23 |    ;
24 | 
25 | value
26 |    : STRING
27 |    | NUMBER
28 |    | obj
29 |    | array
30 |    | 'true'
31 |    | 'false'
32 |    | 'null'
33 |    ;
34 | 
35 | 
36 | STRING
37 |    : '"' (ESC | ~ ["\\])* '"'
38 |    ;
39 | 
40 | 
41 | fragment ESC
42 |    : '\\' (["\\/bfnrt] | UNICODE)
43 |    ;
44 | 
45 | 
46 | fragment UNICODE
47 |    : 'u' HEX HEX HEX HEX
48 |    ;
49 | 
50 | 
51 | fragment HEX
52 |    : [0-9a-fA-F]
53 |    ;
54 | 
55 | 
56 | NUMBER
57 |    : '-'? INT '.' [0-9] + EXP? | '-'? INT EXP | '-'? INT
58 |    ;
59 | 
60 | 
61 | fragment INT
62 |    : '0' | [1-9] [0-9]*
63 |    ;
64 | 
65 | // no leading zeros
66 | 
67 | fragment EXP
68 |    : [Ee] [+\-]? INT
69 |    ;
70 | 
71 | // \- since - means "range" inside [...]
72 | 
73 | WS
74 |    : [ \t\n\r] + -> skip
75 |    ;
76 | 


--------------------------------------------------------------------------------
/tests/resources/exp-obj-arr-87.json:
--------------------------------------------------------------------------------
1 | {
2 | "" : 0,
3 | "": [ 0, 87 ]
4 | }


--------------------------------------------------------------------------------
/tests/resources/exp-obj-arr-bar.json:
--------------------------------------------------------------------------------
1 | {
2 | "": "bar"
3 | }


--------------------------------------------------------------------------------
/tests/resources/exp-obj-arr-baz.json:
--------------------------------------------------------------------------------
1 | {
2 | "" : 0,
3 | "baz": 0
4 | }


--------------------------------------------------------------------------------
/tests/resources/exp-obj-arr-foo.json:
--------------------------------------------------------------------------------
1 | {
2 | "foo": 0
3 | }


--------------------------------------------------------------------------------
/tests/resources/exp-str-arr-87.ini:
--------------------------------------------------------------------------------
1 | [ a ] 
2 | a:[ 0, 87 ]
3 | 


--------------------------------------------------------------------------------
/tests/resources/inijson-crlf.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "start": "ini:ini",
 3 |   "grammars": {
 4 |     "ini": {
 5 |       "files": [
 6 |         "INILexer.g4", "INIParser.g4"
 7 |       ],
 8 |       "islands": {
 9 |         "VALUE": "(?P<json:json>.*)"
10 |       },
11 |       "replacements": {
12 |         "EOL": "\r\n",
13 |         "HEADER": "a",
14 |         "KEY": "a",
15 |         "VALUE": "a"
16 |       }
17 |     },
18 |     "json": {
19 |       "files": [
20 |         "JSON.g4"
21 |       ]
22 |     }
23 |   }
24 | }
25 | 


--------------------------------------------------------------------------------
/tests/resources/inijson.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "start": "ini:ini",
 3 |   "grammars": {
 4 |     "ini": {
 5 |       "files": [
 6 |         "INILexer.g4", "INIParser.g4"
 7 |       ],
 8 |       "islands": {
 9 |         "VALUE": "(?P<json:json>.*)"
10 |       },
11 |       "replacements": {
12 |         "EOL": "\n",
13 |         "HEADER": "a",
14 |         "KEY": "a",
15 |         "VALUE": "a"
16 |       }
17 |     },
18 |     "json": {
19 |       "files": [
20 |         "JSON.g4"
21 |       ]
22 |     }
23 |   }
24 | }
25 | 


--------------------------------------------------------------------------------
/tests/resources/inp-obj-arr.json:
--------------------------------------------------------------------------------
1 | {
2 |     "foo": "bar",
3 |     "baz": [ 6, 7, 12, 31, 77, 87 ]
4 | }
5 | 


--------------------------------------------------------------------------------
/tests/resources/inp-str-arr.ini:
--------------------------------------------------------------------------------
1 | [test]
2 | foo: "bar"
3 | baz: [ 6, 7, 12, 31, 77, 87 ]
4 | 


--------------------------------------------------------------------------------
/tests/resources/sut-inijson-load.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import configparser
 4 | import json
 5 | import sys
 6 | 
 7 | 
 8 | c = configparser.ConfigParser(allow_no_value=True)
 9 | with open(sys.argv[1], 'r') as f:
10 |     c.read_file(f)
11 | 
12 | for s in c.sections():
13 |     for o in c.options(s):
14 |         j = json.loads(c.get(s, o))
15 | 
16 | c.write(sys.stdout)
17 | 


--------------------------------------------------------------------------------
/tests/resources/sut-json-load.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import json
 4 | import sys
 5 | 
 6 | 
 7 | with open(sys.argv[1], 'r', encoding='utf-8') as f:
 8 |     j = json.load(f)
 9 | 
10 | print(f'{j!r}')
11 | 


--------------------------------------------------------------------------------
/tests/resources/test-inijson-str-arr-87.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | python %~f0\..\sut-inijson-load.py %1 | find "87" >NUL 2>&1
3 | 


--------------------------------------------------------------------------------
/tests/resources/test-inijson-str-arr-87.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | python $(dirname $0)/sut-inijson-load.py $1 | grep -q "87"
3 | 


--------------------------------------------------------------------------------
/tests/resources/test-json-obj-arr-87.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | python %~f0\..\sut-json-load.py %1 | find "87" >NUL 2>&1
3 | 


--------------------------------------------------------------------------------
/tests/resources/test-json-obj-arr-87.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | python $(dirname $0)/sut-json-load.py $1 | grep -q "87"
3 | 


--------------------------------------------------------------------------------
/tests/resources/test-json-obj-arr-bar.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | python %~f0\..\sut-json-load.py %1 | find "bar" >NUL 2>&1
3 | 


--------------------------------------------------------------------------------
/tests/resources/test-json-obj-arr-bar.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | python $(dirname $0)/sut-json-load.py $1 | grep -q "bar"
3 | 


--------------------------------------------------------------------------------
/tests/resources/test-json-obj-arr-baz.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | python %~f0\..\sut-json-load.py %1 | find "baz" >NUL 2>&1
3 | 


--------------------------------------------------------------------------------
/tests/resources/test-json-obj-arr-baz.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | python $(dirname $0)/sut-json-load.py $1 | grep -q "baz"
3 | 


--------------------------------------------------------------------------------
/tests/resources/test-json-obj-arr-foo.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | python %~f0\..\sut-json-load.py %1 | find "foo" >NUL 2>&1
3 | 


--------------------------------------------------------------------------------
/tests/resources/test-json-obj-arr-foo.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | python $(dirname $0)/sut-json-load.py $1 | grep -q "foo"
3 | 


--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2016-2023 Renata Hodovan, Akos Kiss.
 2 | #
 3 | # Licensed under the BSD 3-Clause License
 4 | # <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
 5 | # This file may not be copied, modified, or distributed except
 6 | # according to those terms.
 7 | 
 8 | import os
 9 | import subprocess
10 | import sys
11 | 
12 | import pytest
13 | 
14 | 
15 | is_windows = sys.platform.startswith('win32')
16 | script_ext = '.bat' if is_windows else '.sh'
17 | 
18 | tests_dir = os.path.dirname(os.path.abspath(__file__))
19 | resources_dir = os.path.join(tests_dir, 'resources')
20 | antlr = os.getenv('ANTLR')
21 | 
22 | 
23 | @pytest.mark.parametrize('test, inp, exp, grammar, rule, input_format', [
24 |     ('test-json-obj-arr-foo', 'inp-obj-arr.json', 'exp-obj-arr-foo.json', 'JSON.g4', 'json', None),
25 |     ('test-json-obj-arr-bar', 'inp-obj-arr.json', 'exp-obj-arr-bar.json', 'JSON.g4', 'json', None),
26 |     ('test-json-obj-arr-baz', 'inp-obj-arr.json', 'exp-obj-arr-baz.json', 'JSON.g4', 'json', None),
27 |     ('test-json-obj-arr-87', 'inp-obj-arr.json', 'exp-obj-arr-87.json', 'JSON.g4', 'json', None),
28 |     ('test-inijson-str-arr-87', 'inp-str-arr.ini', 'exp-str-arr-87.ini', None, None, 'inijson-crlf.json' if is_windows else 'inijson.json'),
29 | ])
30 | @pytest.mark.parametrize('args', [
31 |     ('--cache=config', ),
32 |     ('--no-skip-unremovable', '--parser=java', '--cache=content', ),
33 |     ('--no-squeeze-tree', '--parser=java', '--cache=none', ),
34 |     ('--no-squeeze-tree', '--no-skip-unremovable', '--cache=config', ),
35 |     ('--no-hdd-star', '--parser=java', '--cache=content', ),
36 |     ('--no-hdd-star', '--no-skip-unremovable', '--cache=none', ),
37 |     ('--no-hdd-star', '--no-squeeze-tree', '--cache=config', ),
38 |     ('--no-hdd-star', '--no-squeeze-tree', '--no-skip-unremovable', '--parser=java', '--cache=content', ),
39 |     ('--parallel', ),
40 | ])
41 | def test_cli(test, inp, exp, grammar, rule, input_format, args, tmpdir):
42 |     out_dir = str(tmpdir)
43 |     cmd = (sys.executable, '-m', 'picireny') \
44 |         + (f'--test={test}{script_ext}', f'--input={inp}', f'--out={out_dir}') \
45 |         + ('--log-level=TRACE', )
46 |     if grammar:
47 |         cmd += (f'--grammar={grammar}', )
48 |     if rule:
49 |         cmd += (f'--start={rule}', )
50 |     if input_format:
51 |         cmd += (f'--format={input_format}', )
52 |     if antlr:
53 |         cmd += (f'--antlr={antlr}', )
54 |     cmd += args
55 |     subprocess.run(cmd, cwd=resources_dir, check=True)
56 | 
57 |     with open(os.path.join(out_dir, inp), 'rb') as outf:
58 |         outb = outf.read()
59 |     with open(os.path.join(resources_dir, exp), 'rb') as expf:
60 |         expb = expf.read()
61 |     assert outb == expb
62 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py, lint, schema, build
 3 | isolated_build = true
 4 | 
 5 | [testenv]
 6 | passenv = ANTLR
 7 | deps = pytest
 8 | commands = py.test {posargs}
 9 | download = true
10 | 
11 | [testenv:cov]
12 | deps = pytest-cov
13 | commands = py.test --cov=picireny --cov-config=tox.ini {posargs}
14 | usedevelop = true
15 | 
16 | [coverage:run]
17 | omit = **/parser/*
18 | 
19 | [testenv:lint]
20 | deps =
21 |     pycodestyle
22 |     pylint
23 |     pytest
24 | commands =
25 |     pylint src/picireny tests
26 |     pycodestyle src/picireny tests --ignore=E501 --exclude=src/picireny/antlr4/parser/ANTLRv4*.py
27 | 
28 | [testenv:schema]
29 | deps =
30 |     check-jsonschema
31 | skip_install = true
32 | commands =
33 |     check-jsonschema -v --check-metaschema schemas/format.json schemas/replacements.json
34 |     check-jsonschema -v --schemafile schemas/format.json tests/resources/inijson.json tests/resources/inijson-crlf.json
35 | 
36 | [testenv:build]
37 | deps =
38 |     build
39 |     twine
40 |     virtualenv
41 | skip_install = true
42 | commands =
43 |     pyproject-build -o {envtmpdir}
44 |     twine check {envtmpdir}/*
45 | 


--------------------------------------------------------------------------------