├── .github ├── FUNDING.yml └── workflows │ └── pythonpackage.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── conf.py ├── index.rst └── make.bat ├── examples ├── benchmarks │ └── json │ │ ├── data.json │ │ ├── errors.py │ │ ├── parse_tree.py │ │ ├── parsers │ │ ├── __init__.py │ │ ├── funcparserlib_json.py │ │ ├── lark_json.py │ │ ├── parsimonious_json.py │ │ ├── parsita_json.py │ │ ├── parsy_json.py │ │ ├── pyleri_json.py │ │ ├── pyparsing_json.py │ │ ├── textparser_json.py │ │ └── textx_json.py │ │ └── speed.py ├── hello_world.py ├── json.py └── proto3.py ├── requirements.txt ├── setup.py ├── tests ├── __init__.py └── test_textparser.py └── textparser.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: eerimoq 2 | -------------------------------------------------------------------------------- /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 4 11 | matrix: 12 | python-version: [3.6, 3.9] 13 | 14 | steps: 15 | - uses: actions/checkout@v1 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Test 21 | run: | 22 | python -m unittest 23 | 24 | release: 25 | needs: [test] 26 | runs-on: ubuntu-latest 27 | if: startsWith(github.ref, 'refs/tags') 28 | 29 | steps: 30 | - name: Checkout 31 | uses: actions/checkout@v1 32 | - name: Set up Python 3.9 33 | uses: actions/setup-python@v1 34 | with: 35 | python-version: 3.9 36 | - name: Install pypa/build 37 | run: | 38 | python -m pip install build --user 39 | - name: Build a binary wheel and a source tarball 40 | run: | 41 | git clean -dfx 42 | python -m build --sdist --wheel --outdir dist/ . 43 | - name: Publish distribution 📦 to PyPI 44 | uses: pypa/gh-action-pypi-publish@master 45 | with: 46 | skip_existing: true 47 | password: ${{ secrets.pypi_password }} 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
58 |
-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2018-2019 Erik Moqvist
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
-------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
1 | include LICENSE
2 | include Makefile
3 | recursive-include tests *.py
-------------------------------------------------------------------------------- /README.rst: --------------------------------------------------------------------------------
1 | About
2 | =====
3 |
4 | A text parser written in the Python language.
5 |
6 | The project has one goal: speed! See the benchmark below for more details.
7 |
8 | Project homepage: https://github.com/eerimoq/textparser
9 |
10 | Documentation: http://textparser.readthedocs.org/en/latest
11 |
12 | Credits
13 | =======
14 |
15 | - Thanks to `PyParsing`_ for a user-friendly interface. Many of
16 |   ``textparser``'s class names are taken from that project.
17 |
18 | Installation
19 | ============
20 |
21 | .. code-block:: text
22 |
23 |     pip install textparser
24 |
25 | Example usage
26 | =============
27 |
28 | The `Hello World`_ example parses the string ``Hello, World!`` and
29 | outputs its parse tree ``['Hello', ',', 'World', '!']``.
30 |
31 | The script:
32 |
33 | .. code-block:: python
34 |
35 |     import textparser
36 |     from textparser import Sequence
37 |
38 |
39 |     class Parser(textparser.Parser):
40 |
41 |         def token_specs(self):
42 |             return [
43 |                 ('SKIP',          r'[ \r\n\t]+'),
44 |                 ('WORD',          r'\w+'),
45 |                 ('EMARK',    '!', r'!'),
46 |                 ('COMMA',    ',', r','),
47 |                 ('MISMATCH',      r'.')
48 |             ]
49 |
50 |         def grammar(self):
51 |             return Sequence('WORD', ',', 'WORD', '!')
52 |
53 |
54 |     tree = Parser().parse('Hello, World!')
55 |
56 |     print('Tree:', tree)
57 |
58 | Script execution:
59 |
60 | .. code-block:: text
61 |
62 |     $ env PYTHONPATH=. python3 examples/hello_world.py
63 |     Tree: ['Hello', ',', 'World', '!']
64 |
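Feeding the parser input that does not match the grammar raises an
exception whose message points at the offending character. A minimal
sketch, assuming ``textparser.Error`` is the common base class of the
tokenize and parse exceptions listed in the documentation:

.. code-block:: python

    try:
        Parser().parse('Hello, World?')
    except textparser.Error as error:
        # Prints a message like: Invalid syntax at line 1, column 13: "..."
        print('Parse failed:', error)
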
65 | Benchmark
66 | =========
67 |
68 | Below is a `benchmark`_ comparing the speed of 10 JSON parsers, each
69 | parsing a `276 kb file`_.
70 |
71 | .. code-block:: text
72 |
73 |    $ env PYTHONPATH=. python3 examples/benchmarks/json/speed.py
74 |
75 |    Parsed 'examples/benchmarks/json/data.json' 1 time(s) in:
76 |
77 |    PACKAGE        SECONDS   RATIO  VERSION
78 |    textparser        0.10    100%  0.21.1
79 |    parsimonious      0.17    169%  unknown
80 |    lark (LALR)       0.27    267%  0.7.0
81 |    funcparserlib     0.34    340%  unknown
82 |    textx             0.54    546%  1.8.0
83 |    pyparsing         0.68    684%  2.4.0
84 |    pyleri            0.88    886%  1.2.2
85 |    parsy             0.92    925%  1.2.0
86 |    parsita           2.28   2286%  unknown
87 |    lark (Earley)     2.34   2348%  0.7.0
88 |
89 | *NOTE 1: The parsers are not necessarily optimized for
90 | speed. Optimizing them will likely affect the measurements.*
91 |
92 | *NOTE 2: The structure of the resulting parse trees varies and
93 | additional processing may be required to make them fit the user
94 | application.*
95 |
96 | *NOTE 3: Only JSON parsers are compared. Parsing other languages may
97 | give vastly different results.*
98 |
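Each number in the table is produced by a ``parse_time()`` function in
the corresponding module under ``examples/benchmarks/json/parsers/``.
The ``textparser`` variant is essentially the sketch below; the other
modules follow the same ``timeit`` pattern:

.. code-block:: python

    import timeit

    def parse_time(json_string, iterations):
        parser = Parser()  # the benchmark's JSON Parser subclass

        def _parse():
            parser.parse(json_string)

        # Total wall-clock time for `iterations` parses of the same input.
        return timeit.timeit(_parse, number=iterations)
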
99 | Contributing
100 | ============
101 |
102 | #. Fork the repository.
103 |
104 | #. Implement the new feature or bug fix.
105 |
106 | #. Implement test case(s) to ensure that future changes do not break
107 |    existing functionality.
108 |
109 | #. Run the tests.
110 |
111 |    .. code-block:: text
112 |
113 |       python3 -m unittest
114 |
115 | #. Create a pull request.
116 |
117 | .. _PyParsing: https://github.com/pyparsing/pyparsing
118 | .. _Hello World: https://github.com/eerimoq/textparser/blob/master/examples/hello_world.py
119 | .. _benchmark: https://github.com/eerimoq/textparser/blob/master/examples/benchmarks/json/speed.py
120 | .. _276 kb file: https://github.com/eerimoq/textparser/blob/master/examples/benchmarks/json/data.json
-------------------------------------------------------------------------------- /docs/Makefile: --------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS    =
6 | SPHINXBUILD   = sphinx-build
7 | PAPER         =
8 | BUILDDIR      = _build
9 |
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
13 | endif
14 |
15 | # Internal variables.
16 | PAPEROPT_a4     = -D latex_paper_size=a4
17 | PAPEROPT_letter = -D latex_paper_size=letter
18 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
19 | # the i18n builder cannot share the environment and doctrees with the others
20 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
21 |
22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
23 |
24 | help:
25 | 	@echo "Please use \`make <target>' where <target> is one of"
26 | 	@echo "  html       to make standalone HTML files"
27 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
28 | 	@echo "  singlehtml to make a single large HTML file"
29 | 	@echo "  pickle     to make pickle files"
30 | 	@echo "  json       to make JSON files"
31 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
32 | 	@echo "  qthelp     to make HTML files and a qthelp project"
33 | 	@echo "  applehelp  to make an Apple Help Book"
34 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
35 | 	@echo "  epub       to make an epub"
36 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
37 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
38 | 	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
39 | 	@echo "  text       to make text files"
40 | 	@echo "  man        to make manual pages"
41 | 	@echo "  texinfo    to make Texinfo files"
42 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
43 | 	@echo "  gettext    to make PO message catalogs"
44 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
45 | 	@echo "  xml        to make Docutils-native XML files"
46 | 	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
47 | 	@echo "  linkcheck  to check all external links for integrity"
48 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
49 | 	@echo "  coverage   to run coverage check of the documentation (if enabled)"
50 |
51 | clean:
52 | 	rm -rf $(BUILDDIR)/*
53 |
54 | html:
55 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
56 | 	@echo
57 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
58 |
59 | dirhtml:
60 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
61 | 	@echo
62 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
63 |
64 | singlehtml:
65 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
66 | 	@echo
67 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
68 |
69 | pickle:
70 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
71 | 	@echo
72 | 	@echo "Build finished; now you can process the pickle files."
73 |
74 | json:
75 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
76 | 	@echo
77 | 	@echo "Build finished; now you can process the JSON files."
78 |
79 | htmlhelp:
80 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
81 | 	@echo
82 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
83 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
84 |
85 | qthelp:
86 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
87 | 	@echo
88 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
89 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
90 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/textparser.qhcp"
91 | 	@echo "To view the help file:"
92 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/textparser.qhc"
93 |
94 | applehelp:
95 | 	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
96 | 	@echo
97 | 	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
98 | 	@echo "N.B. You won't be able to view it unless you put it in" \
99 | 	      "~/Library/Documentation/Help or install it in your application" \
100 | 	      "bundle."
101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/textparser" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/textparser" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. 
The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # textparser documentation build configuration file, created by 4 | # sphinx-quickstart on Sat Apr 25 11:54:09 2015. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | import shlex 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | sys.path.insert(0, os.path.abspath('..')) 23 | 24 | import textparser 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | #needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'sphinx.ext.autodoc', 36 | 'sphinx.ext.viewcode', 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # The suffix(es) of source filenames. 43 | # You can specify multiple suffix as a list of string: 44 | # source_suffix = ['.rst', '.md'] 45 | source_suffix = '.rst' 46 | 47 | # The encoding of source files. 48 | #source_encoding = 'utf-8-sig' 49 | 50 | # The master toctree document. 51 | master_doc = 'index' 52 | 53 | # General information about the project. 54 | project = u'textparser' 55 | copyright = u'2018-2019, Erik Moqvist' 56 | author = u'Erik Moqvist' 57 | 58 | # The version info for the project you're documenting, acts as replacement for 59 | # |version| and |release|, also used in various other places throughout the 60 | # built documents. 61 | # 62 | # The short X.Y version. 63 | version = textparser.__version__ 64 | # The full version, including alpha/beta/rc tags. 65 | release = textparser.__version__ 66 | 67 | # The language for content autogenerated by Sphinx. Refer to documentation 68 | # for a list of supported languages. 69 | # 70 | # This is also used if you do content translation via gettext catalogs. 71 | # Usually you set "language" from the command line for these cases. 72 | language = None 73 | 74 | # There are two options for replacing |today|: either, you set today to some 75 | # non-false value, then it is used: 76 | #today = '' 77 | # Else, today_fmt is used as the format for a strftime call. 78 | #today_fmt = '%B %d, %Y' 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | exclude_patterns = ['_build'] 83 | 84 | # The reST default role (used for this markup: `text`) to use for all 85 | # documents. 86 | #default_role = None 87 | 88 | # If true, '()' will be appended to :func: etc. cross-reference text. 
89 | #add_function_parentheses = True 90 | 91 | # If true, the current module name will be prepended to all description 92 | # unit titles (such as .. function::). 93 | #add_module_names = True 94 | 95 | # If true, sectionauthor and moduleauthor directives will be shown in the 96 | # output. They are ignored by default. 97 | #show_authors = False 98 | 99 | # The name of the Pygments (syntax highlighting) style to use. 100 | pygments_style = 'sphinx' 101 | 102 | # A list of ignored prefixes for module index sorting. 103 | #modindex_common_prefix = [] 104 | 105 | # If true, keep warnings as "system message" paragraphs in the built documents. 106 | #keep_warnings = False 107 | 108 | # If true, `todo` and `todoList` produce output, else they produce nothing. 109 | todo_include_todos = False 110 | 111 | 112 | # -- Options for HTML output ---------------------------------------------- 113 | 114 | # The theme to use for HTML and HTML Help pages. See the documentation for 115 | # a list of builtin themes. 116 | html_theme = 'sphinx_rtd_theme' 117 | 118 | # Theme options are theme-specific and customize the look and feel of a theme 119 | # further. For a list of options available for each theme, see the 120 | # documentation. 121 | #html_theme_options = {} 122 | 123 | # Add any paths that contain custom themes here, relative to this directory. 124 | #html_theme_path = [] 125 | 126 | # The name for this set of Sphinx documents. If None, it defaults to 127 | # " v documentation". 128 | #html_title = None 129 | 130 | # A shorter title for the navigation bar. Default is the same as html_title. 131 | #html_short_title = None 132 | 133 | # The name of an image file (relative to this directory) to place at the top 134 | # of the sidebar. 135 | #html_logo = None 136 | 137 | # The name of an image file (within the static path) to use as favicon of the 138 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 139 | # pixels large. 140 | #html_favicon = None 141 | 142 | # Add any paths that contain custom static files (such as style sheets) here, 143 | # relative to this directory. They are copied after the builtin static files, 144 | # so a file named "default.css" will overwrite the builtin "default.css". 145 | html_static_path = ['_static'] 146 | 147 | # Add any extra paths that contain custom files (such as robots.txt or 148 | # .htaccess) here, relative to this directory. These files are copied 149 | # directly to the root of the documentation. 150 | #html_extra_path = [] 151 | 152 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 153 | # using the given strftime format. 154 | #html_last_updated_fmt = '%b %d, %Y' 155 | 156 | # If true, SmartyPants will be used to convert quotes and dashes to 157 | # typographically correct entities. 158 | #html_use_smartypants = True 159 | 160 | # Custom sidebar templates, maps document names to template names. 161 | #html_sidebars = {} 162 | 163 | # Additional templates that should be rendered to pages, maps page names to 164 | # template names. 165 | #html_additional_pages = {} 166 | 167 | # If false, no module index is generated. 168 | #html_domain_indices = True 169 | 170 | # If false, no index is generated. 171 | #html_use_index = True 172 | 173 | # If true, the index is split into individual pages for each letter. 174 | #html_split_index = False 175 | 176 | # If true, links to the reST sources are added to the pages. 
177 | #html_show_sourcelink = True 178 | 179 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 180 | #html_show_sphinx = True 181 | 182 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 183 | #html_show_copyright = True 184 | 185 | # If true, an OpenSearch description file will be output, and all pages will 186 | # contain a tag referring to it. The value of this option must be the 187 | # base URL from which the finished HTML is served. 188 | #html_use_opensearch = '' 189 | 190 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 191 | #html_file_suffix = None 192 | 193 | # Language to be used for generating the HTML full-text search index. 194 | # Sphinx supports the following languages: 195 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 196 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 197 | #html_search_language = 'en' 198 | 199 | # A dictionary with options for the search language support, empty by default. 200 | # Now only 'ja' uses this config value 201 | #html_search_options = {'type': 'default'} 202 | 203 | # The name of a javascript file (relative to the configuration directory) that 204 | # implements a search results scorer. If empty, the default will be used. 205 | #html_search_scorer = 'scorer.js' 206 | 207 | # Output file base name for HTML help builder. 208 | htmlhelp_basename = 'textparserdoc' 209 | 210 | # -- Options for LaTeX output --------------------------------------------- 211 | 212 | latex_elements = { 213 | # The paper size ('letterpaper' or 'a4paper'). 214 | #'papersize': 'letterpaper', 215 | 216 | # The font size ('10pt', '11pt' or '12pt'). 217 | #'pointsize': '10pt', 218 | 219 | # Additional stuff for the LaTeX preamble. 220 | #'preamble': '', 221 | 222 | # Latex figure (float) alignment 223 | #'figure_align': 'htbp', 224 | } 225 | 226 | # Grouping the document tree into LaTeX files. List of tuples 227 | # (source start file, target name, title, 228 | # author, documentclass [howto, manual, or own class]). 229 | latex_documents = [ 230 | (master_doc, 'textparser.tex', u'textparser Documentation', 231 | u'Erik Moqvist', 'manual'), 232 | ] 233 | 234 | # The name of an image file (relative to this directory) to place at the top of 235 | # the title page. 236 | #latex_logo = None 237 | 238 | # For "manual" documents, if this is true, then toplevel headings are parts, 239 | # not chapters. 240 | #latex_use_parts = False 241 | 242 | # If true, show page references after internal links. 243 | #latex_show_pagerefs = False 244 | 245 | # If true, show URL addresses after external links. 246 | #latex_show_urls = False 247 | 248 | # Documents to append as an appendix to all manuals. 249 | #latex_appendices = [] 250 | 251 | # If false, no module index is generated. 252 | #latex_domain_indices = True 253 | 254 | 255 | # -- Options for manual page output --------------------------------------- 256 | 257 | # One entry per manual page. List of tuples 258 | # (source start file, name, description, authors, manual section). 259 | man_pages = [ 260 | (master_doc, 'textparser', u'Textparser Documentation', 261 | [author], 1) 262 | ] 263 | 264 | # If true, show URL addresses after external links. 265 | #man_show_urls = False 266 | 267 | 268 | # -- Options for Texinfo output ------------------------------------------- 269 | 270 | # Grouping the document tree into Texinfo files. 
List of tuples
271 | # (source start file, target name, title, author,
272 | #  dir menu entry, description, category)
273 | texinfo_documents = [
274 |     (master_doc, 'textparser', u'Textparser Documentation',
275 |      author, 'textparser', 'One line description of project.',
276 |      'Miscellaneous'),
277 | ]
278 |
279 | # Documents to append as an appendix to all manuals.
280 | #texinfo_appendices = []
281 |
282 | # If false, no module index is generated.
283 | #texinfo_domain_indices = True
284 |
285 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
286 | #texinfo_show_urls = 'footnote'
287 |
288 | # If true, do not generate a @detailmenu in the "Top" node's menu.
289 | #texinfo_no_detailmenu = False
290 |
291 | autodoc_member_order = 'bysource'
-------------------------------------------------------------------------------- /docs/index.rst: --------------------------------------------------------------------------------
1 | .. textparser documentation master file, created by
2 |    sphinx-quickstart on Sat Apr 25 11:54:09 2015.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 |
6 | .. toctree::
7 |    :maxdepth: 2
8 |
9 | Text parser
10 | ===========
11 |
12 | .. include:: ../README.rst
13 |
14 | The parser class
15 | ================
16 |
17 | .. autoclass:: textparser.Parser
18 |    :members:
19 |
20 | Building the grammar
21 | ====================
22 |
23 | The grammar is built by combining strings and the classes below.
24 |
25 | Here is a fictitious example grammar:
26 |
27 | .. code-block:: python
28 |
29 |    grammar = Sequence(
30 |        'BEGIN',
31 |        Optional(choice('IF', Sequence(ZeroOrMore('NUMBER')))),
32 |        OneOrMore(Sequence('WORD', Not('NUMBER'))),
33 |        Any(),
34 |        DelimitedList('WORD', delim=':'),
35 |        'END')
36 |
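A runnable counterpart is sketched below; the token names, the input
string and the exact shape of the returned tree are assumptions made
for illustration, not part of the library:

.. code-block:: python

   import textparser
   from textparser import Sequence, DelimitedList

   class AssignmentParser(textparser.Parser):

       def token_specs(self):
           return [
               ('SKIP',          r'[ \r\n\t]+'),
               ('NUMBER',        r'-?\d+'),
               ('WORD',          r'[A-Za-z]\w*'),
               ('EQ',       '=', r'='),
               ('COMMA',    ',', r','),
               ('MISMATCH',      r'.')
           ]

       def grammar(self):
           # A 'WORD', then '=', then one or more comma separated 'NUMBER'.
           return Sequence('WORD', '=', DelimitedList('NUMBER'))

   tree = AssignmentParser().parse('x = 1, 2, 3')
   # tree is a nested list along the lines of ['x', '=', ['1', '2', '3']]
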
37 | .. autoclass:: textparser.Sequence
38 |
39 | .. autoclass:: textparser.Choice
40 |
41 | .. autoclass:: textparser.ChoiceDict
42 |
43 | .. autofunction:: textparser.choice
44 |
45 | .. autoclass:: textparser.ZeroOrMore
46 |
47 | .. autoclass:: textparser.ZeroOrMoreDict
48 |
49 | .. autoclass:: textparser.OneOrMore
50 |
51 | .. autoclass:: textparser.OneOrMoreDict
52 |
53 | .. autoclass:: textparser.DelimitedList
54 |
55 | .. autoclass:: textparser.Optional
56 |
57 | .. autoclass:: textparser.Any
58 |
59 | .. autoclass:: textparser.AnyUntil
60 |
61 | .. autoclass:: textparser.And
62 |
63 | .. autoclass:: textparser.Not
64 |
65 | .. autoclass:: textparser.NoMatch
66 |
67 | .. autoclass:: textparser.Tag
68 |
69 | .. autoclass:: textparser.Forward
70 |
71 | .. autoclass:: textparser.Repeated
72 |
73 | .. autoclass:: textparser.RepeatedDict
74 |
75 | .. autoclass:: textparser.Pattern
76 |    :members:
77 |
78 | .. autodata:: textparser.MISMATCH
79 |
80 | Exceptions
81 | ==========
82 |
83 | .. autoclass:: textparser.Error
84 |    :members:
85 |
86 | .. autoclass:: textparser.ParseError
87 |    :members:
88 |
89 | .. autoclass:: textparser.TokenizeError
90 |    :members:
91 |
92 | .. autoclass:: textparser.GrammarError
93 |    :members:
94 |
95 | Utility functions
96 | =================
97 |
98 | .. autofunction:: textparser.markup_line
99 |
100 | .. autofunction:: textparser.tokenize_init
-------------------------------------------------------------------------------- /docs/make.bat: --------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | REM Command file for Sphinx documentation
4 |
5 | if "%SPHINXBUILD%" == "" (
6 | 	set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=_build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" (
12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14 | )
15 |
16 | if "%1" == "" goto help
17 |
18 | if "%1" == "help" (
19 | 	:help
20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
21 | 	echo.  html       to make standalone HTML files
22 | 	echo.  dirhtml    to make HTML files named index.html in directories
23 | 	echo.  singlehtml to make a single large HTML file
24 | 	echo.  pickle     to make pickle files
25 | 	echo.  json       to make JSON files
26 | 	echo.  htmlhelp   to make HTML files and a HTML help project
27 | 	echo.  qthelp     to make HTML files and a qthelp project
28 | 	echo.  devhelp    to make HTML files and a Devhelp project
29 | 	echo.  epub       to make an epub
30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
31 | 	echo.  text       to make text files
32 | 	echo.  man        to make manual pages
33 | 	echo.  texinfo    to make Texinfo files
34 | 	echo.  gettext    to make PO message catalogs
35 | 	echo.  changes    to make an overview over all changed/added/deprecated items
36 | 	echo.  xml        to make Docutils-native XML files
37 | 	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
38 | 	echo.  linkcheck  to check all external links for integrity
39 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
40 | 	echo.  coverage   to run coverage check of the documentation if enabled
41 | 	goto end
42 | )
43 |
44 | if "%1" == "clean" (
45 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
46 | 	del /q /s %BUILDDIR%\*
47 | 	goto end
48 | )
49 |
50 |
51 | REM Check if sphinx-build is available and fallback to Python version if any
52 | %SPHINXBUILD% 2> nul
53 | if errorlevel 9009 goto sphinx_python
54 | goto sphinx_ok
55 |
56 | :sphinx_python
57 |
58 | set SPHINXBUILD=python -m sphinx.__init__
59 | %SPHINXBUILD% 2> nul
60 | if errorlevel 9009 (
61 | 	echo.
62 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
63 | 	echo.installed, then set the SPHINXBUILD environment variable to point
64 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
65 | 	echo.may add the Sphinx directory to PATH.
66 | 	echo.
67 | 	echo.If you don't have Sphinx installed, grab it from
68 | 	echo.http://sphinx-doc.org/
69 | 	exit /b 1
70 | )
71 |
72 | :sphinx_ok
73 |
74 |
75 | if "%1" == "html" (
76 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
77 | 	if errorlevel 1 exit /b 1
78 | 	echo.
79 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
80 | 	goto end
81 | )
82 |
83 | if "%1" == "dirhtml" (
84 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
85 | 	if errorlevel 1 exit /b 1
86 | 	echo.
87 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
88 | 	goto end
89 | )
90 |
91 | if "%1" == "singlehtml" (
92 | 	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
93 | 	if errorlevel 1 exit /b 1
94 | 	echo.
95 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\textparser.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\textparser.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 
217 | 	goto end
218 | )
219 |
220 | if "%1" == "linkcheck" (
221 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
222 | 	if errorlevel 1 exit /b 1
223 | 	echo.
224 | 	echo.Link check complete; look for any errors in the above output ^
225 | or in %BUILDDIR%/linkcheck/output.txt.
226 | 	goto end
227 | )
228 |
229 | if "%1" == "doctest" (
230 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
231 | 	if errorlevel 1 exit /b 1
232 | 	echo.
233 | 	echo.Testing of doctests in the sources finished, look at the ^
234 | results in %BUILDDIR%/doctest/output.txt.
235 | 	goto end
236 | )
237 |
238 | if "%1" == "coverage" (
239 | 	%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
240 | 	if errorlevel 1 exit /b 1
241 | 	echo.
242 | 	echo.Testing of coverage in the sources finished, look at the ^
243 | results in %BUILDDIR%/coverage/python.txt.
244 | 	goto end
245 | )
246 |
247 | if "%1" == "xml" (
248 | 	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
249 | 	if errorlevel 1 exit /b 1
250 | 	echo.
251 | 	echo.Build finished. The XML files are in %BUILDDIR%/xml.
252 | 	goto end
253 | )
254 |
255 | if "%1" == "pseudoxml" (
256 | 	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
257 | 	if errorlevel 1 exit /b 1
258 | 	echo.
259 | 	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
260 | 	goto end
261 | )
262 |
263 | :end
-------------------------------------------------------------------------------- /examples/benchmarks/json/errors.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Parse error comparison for a few JSON parsers.
4 |
5 | Example execution:
6 |
7 | $ env PYTHONPATH=. python3 examples/benchmarks/json/errors.py
8 | -----------------------------------------------------------------
9 |
10 | Input string between BEGIN and END:
11 |
12 | BEGIN
13 | END
14 |
15 | textparser: "Invalid syntax at line 1, column 1: ">>!<<""
16 |
17 | lark_lalr: "'NoneType' object has no attribute 'pos_in_stream'"
18 |
19 | lark_earley: "Incomplete parse: Could not find a solution to input"
20 |
21 | pyparsing: "Expected {string enclosed in double quotes | real number with scientific notation | real number | signed integer | Group:(Forward: ...) | Group:({Suppress:("[") [Forward: ... [, Forward: ...]...] Suppress:("]")}) | "true" | "false" | "null"} (at char 0), (line:1, col:1)"
22 |
23 | parsita: "No exception raised!"
24 |
25 | funcparserlib: "no tokens left in the stream: "
26 |
27 | parsy: "expected one of '"', '-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?', '[', 'false', 'null', 'true', '{' at 0:0"
28 |
29 | parsimonious: "Rule 'json_file' didn't match at '' (line 1, column 1)."
30 |
31 | pyleri: "No exception raised!"
32 |
33 | textx: "None:1:1: error: Expected '[' or '{' at position (1, 1) => '*'."
34 |
35 | -----------------------------------------------------------------
36 |
37 | Input string between BEGIN and END:
38 |
39 | BEGIN
40 | [
41 |     1,
42 |     {"a": {]}
43 | ]
44 | END
45 |
46 | textparser: "Invalid syntax at line 3, column 10: "    {"a": {>>!<<]}""
47 |
48 | lark_lalr: "Unexpected token Token(RSQB, ']') at line 3, column 10.
49 | Expected: ESCAPED_STRING, RBRACE, string, pair
50 | "
51 |
52 | lark_earley: "Unexpected token Token(RSQB, ']') at line 3, column 10.
53 | Expected: ESCAPED_STRING, RBRACE
54 | "
55 |
56 | pyparsing: "Expected {string enclosed in double quotes | real number with scientific notation | real number | signed integer | Group:(Forward: ...)
| Group:({Suppress:("[") [Forward: ... [, Forward: ...]...] Suppress:("]")}) | "true" | "false" | "null"} (at char 5), (line:2, col:4)"
57 |
58 | parsita: "No exception raised!"
59 |
60 | funcparserlib: "got unexpected token: 3,10-3,10: Op ']'"
61 |
62 | parsy: "expected one of '"', '}' at 2:9"
63 |
64 | parsimonious: "Rule 'members' didn't match at ']}
65 | ]
66 | ' (line 3, column 10)."
67 |
68 | pyleri: "No exception raised!"
69 |
70 | textx: "None:3:10: error: Expected STRING or '}' at position (3, 10) => ' {"a": {*]} ] '."
71 |
72 | -----------------------------------------------------------------
73 |
74 | Input string between BEGIN and END:
75 |
76 | BEGIN
77 | [
78 |     1,
79 |     {3: null}
80 | ]
81 | END
82 |
83 | textparser: "Invalid syntax at line 3, column 4: "    {>>!<<3: null}""
84 |
85 | lark_lalr: "Unexpected token Token(SIGNED_NUMBER, '3') at line 3, column 4.
86 | Expected: RBRACE, pair, string, ESCAPED_STRING
87 | "
88 |
89 | lark_earley: "Unexpected token Token(SIGNED_NUMBER, '3') at line 3, column 4.
90 | Expected: ESCAPED_STRING, RBRACE
91 | "
92 |
93 | pyparsing: "Expected {string enclosed in double quotes | real number with scientific notation | real number | signed integer | Group:(Forward: ...) | Group:({Suppress:("[") [Forward: ... [, Forward: ...]...] Suppress:("]")}) | "true" | "false" | "null"} (at char 5), (line:2, col:4)"
94 |
95 | parsita: "No exception raised!"
96 |
97 | funcparserlib: "got unexpected token: 3,4-3,4: Number '3'"
98 |
99 | parsy: "expected one of '"', '}' at 2:3"
100 |
101 | parsimonious: "Rule 'members' didn't match at '3: null}
102 | ]
103 | ' (line 3, column 4)."
104 |
105 | pyleri: "No exception raised!"
106 |
107 | textx: "None:3:4: error: Expected STRING or '}' at position (3, 4) => '[ 1, {*3: null} ]'."
108 |
109 | -----------------------------------------------------------------
110 |
111 | Input string between BEGIN and END:
112 |
113 | BEGIN
114 | nul
115 | END
116 |
117 | textparser: "Invalid syntax at line 1, column 1: ">>!<<nul""
[outputs from the remaining parsers lost in capture]
144 | textx: "None:1:1: error: Expected '[' or '{' at position (1, 1) => '*nul '."
145 | $
146 |
147 | """
148 |
149 | from __future__ import print_function
150 |
151 | from parsers import textparser_json
152 | from parsers import lark_json
153 | from parsers import pyparsing_json
154 | from parsers import funcparserlib_json
155 | from parsers import parsimonious_json
156 | from parsers import textx_json
157 |
158 | try:
159 |     from parsers import parsita_json
160 | except:
161 |
162 |     class parsita_json(object):
163 |
164 |         @staticmethod
165 |         def parse(_json_string):
166 |             raise Exception('Import failed!')
167 |
168 | try:
169 |     from parsers import parsy_json
170 | except:
171 |     class parsy_json(object):
172 |
173 |         @staticmethod
174 |         def parse(_json_string):
175 |             raise Exception('Import failed!')
176 |
177 | try:
178 |     from parsers import pyleri_json
179 | except:
180 |     class pyleri_json(object):
181 |
182 |         @staticmethod
183 |         def parse(_json_string):
184 |             raise Exception('Import failed!')
185 |
186 |
187 | def parse(string):
188 |     def _parse(function):
189 |         try:
190 |             function(string)
191 |         except Exception as e:
192 |             return str(e)
193 |
194 |         return 'No exception raised!'
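    # Pair each parser name with the stringified exception it raised, or
    # with the marker string if parsing unexpectedly succeeded.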
195 |
196 |     results = [
197 |         ('textparser', _parse(textparser_json.parse)),
198 |         ('lark_lalr', _parse(lark_json.parse_lalr)),
199 |         ('lark_earley', _parse(lark_json.parse_earley)),
200 |         ('pyparsing', _parse(pyparsing_json.parse)),
201 |         ('parsita', _parse(parsita_json.parse)),
202 |         ('funcparserlib', _parse(funcparserlib_json.parse)),
203 |         ('parsy', _parse(parsy_json.parse)),
204 |         ('parsimonious', _parse(parsimonious_json.parse)),
205 |         ('pyleri', _parse(pyleri_json.parse)),
206 |         ('textx', _parse(textx_json.parse))
207 |     ]
208 |
209 |     print('-----------------------------------------------------------------')
210 |     print()
211 |     print('Input string between BEGIN and END:')
212 |     print()
213 |     print('BEGIN')
214 |     print(string, end='')
215 |     print('END')
216 |     print()
217 |
218 |     for parser, error in results:
219 |         print('{}: "{}"'.format(parser, error))
220 |         print()
221 |
222 |
223 | EMPTY_STRING = '''\
224 | '''
225 |
226 | BAD_DICT_END_STRING = '''\
227 | [
228 |     1,
229 |     {"a": {]}
230 | ]
231 | '''
232 |
233 | BAD_DICT_KEY_STRING = '''\
234 | [
235 |     1,
236 |     {3: null}
237 | ]
238 | '''
239 |
240 | BAD_NULL_STRING = '''\
241 | nul
242 | '''
243 |
244 |
245 | parse(EMPTY_STRING)
246 | parse(BAD_DICT_END_STRING)
247 | parse(BAD_DICT_KEY_STRING)
248 | parse(BAD_NULL_STRING)
-------------------------------------------------------------------------------- /examples/benchmarks/json/parse_tree.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Compare the parse tree of a few JSON parsers.
4 |
5 | Example execution:
6 |
7 | $ env PYTHONPATH=. python3 examples/benchmarks/json/parse_tree.py
8 | -----------------------------------------------------------------
9 |
10 | Input string between BEGIN and END:
11 |
12 | BEGIN
13 | [
14 |     "foo",
15 |     {
16 |         "bar": [
17 |             1,
18 |             2,
19 |             3
20 |         ]
21 |     }
22 | ]
23 | END
24 |
25 | textparser:
26 | ['[', [['"foo"', ['{', [[['"bar"', ':', ['[', [['1', '2', '3']], ']']]]], '}']]], ']']
27 |
28 | lark_lalr:
29 | Tree(list, [Tree(string, [Token(ESCAPED_STRING, '"foo"')]), Tree(dict, [Tree(pair, [Tree(string, [Token(ESCAPED_STRING, '"bar"')]), Tree(list, [Token(SIGNED_NUMBER, '1'), Token(SIGNED_NUMBER, '2'), Token(SIGNED_NUMBER, '3')])])])])
30 |
31 | lark_earley:
32 | Tree(list, [Tree(string, [Token(ESCAPED_STRING, '"foo"')]), Tree(dict, [Tree(pair, [Tree(string, [Token(ESCAPED_STRING, '"bar"')]), Tree(list, [Token(SIGNED_NUMBER, '1'), Token(SIGNED_NUMBER, '2'), Token(SIGNED_NUMBER, '3')])])])])
33 |
34 | pyparsing:
35 | [['"foo"', [['"bar"', [1, 2, 3]]]]]
36 |
37 | parsita:
38 | Success(['foo', [['bar', ['1', '2', '3']]]])
39 |
40 | funcparserlib:
41 | ('"foo"', [('"bar"', ('1', ['2', '3']), [])])
42 |
43 | parsy:
44 | ['foo', [('bar', ['1', '2', '3'])]]
45 |
46 | parsimonious:
[parsimonious Node tree representation lost in capture]
269 | pyleri:
[pyleri result object representation lost in capture]
272 | textx:
[textx model object representation lost in capture]
274 | $
275 |
276 | """
277 |
278 | from __future__ import print_function
279 |
280 | from parsers import textparser_json
281 | from parsers import lark_json
282 | from parsers import
pyparsing_json 283 | from parsers import funcparserlib_json 284 | from parsers import parsimonious_json 285 | from parsers import textx_json 286 | 287 | try: 288 | from parsers import parsita_json 289 | except: 290 | 291 | class parsita_json(object): 292 | 293 | @staticmethod 294 | def parse(_json_string): 295 | return 'Import failed!' 296 | 297 | try: 298 | from parsers import parsy_json 299 | except: 300 | class parsy_json(object): 301 | 302 | @staticmethod 303 | def parse(_json_string): 304 | return 'Import failed!' 305 | 306 | try: 307 | from parsers import pyleri_json 308 | except: 309 | class pyleri_json(object): 310 | 311 | @staticmethod 312 | def parse(_json_string): 313 | return 'Import failed!' 314 | 315 | 316 | def parse(string): 317 | results = [ 318 | ('textparser', textparser_json.parse(string)), 319 | ('lark_lalr', lark_json.parse_lalr(string)), 320 | ('lark_earley', lark_json.parse_earley(string)), 321 | ('pyparsing', pyparsing_json.parse(string)), 322 | ('parsita', parsita_json.parse(string)), 323 | ('funcparserlib', funcparserlib_json.parse(string)), 324 | ('parsy', parsy_json.parse(string)), 325 | ('parsimonious', parsimonious_json.parse(string)), 326 | ('pyleri', pyleri_json.parse(string)), 327 | ('textx', textx_json.parse(string)) 328 | ] 329 | 330 | print('-----------------------------------------------------------------') 331 | print() 332 | print('Input string between BEGIN and END:') 333 | print() 334 | print('BEGIN') 335 | print(string, end='') 336 | print('END') 337 | print() 338 | 339 | for parser, parse_tree in results: 340 | print('{}:'.format(parser)) 341 | print(parse_tree) 342 | print() 343 | 344 | 345 | JSON_STRING = '''\ 346 | [ 347 | "foo", 348 | { 349 | "bar": [ 350 | 1, 351 | 2, 352 | 3 353 | ] 354 | } 355 | ] 356 | ''' 357 | 358 | 359 | parse(JSON_STRING) 360 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eerimoq/textparser/1ef809eb283da3c3ec7b8bc682f11eeada3a81d6/examples/benchmarks/json/parsers/__init__.py -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/funcparserlib_json.py: -------------------------------------------------------------------------------- 1 | """Based on 2 | https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py. 3 | 4 | """ 5 | 6 | import timeit 7 | import re 8 | 9 | from funcparserlib.lexer import make_tokenizer 10 | from funcparserlib.lexer import Token 11 | from funcparserlib.parser import some 12 | from funcparserlib.parser import a 13 | from funcparserlib.parser import maybe 14 | from funcparserlib.parser import many 15 | from funcparserlib.parser import skip 16 | from funcparserlib.parser import forward_decl 17 | from funcparserlib.parser import finished 18 | 19 | 20 | REGEXPS = { 21 | 'escaped': r''' 22 | \\ # Escape 23 | ((?P["\\/bfnrt]) # Standard escapes 24 | | (u(?P[0-9A-Fa-f]{4}))) # uXXXX 25 | ''', 26 | 'unescaped': r''' 27 | [^"\\] # Unescaped: avoid ["\\] 28 | ''' 29 | } 30 | 31 | 32 | def create_tokenizer(): 33 | specs = [ 34 | ('Space', (r'[ \t\r\n]+',)), 35 | ('String', (r'"(%(unescaped)s | %(escaped)s)*"' % REGEXPS, re.VERBOSE)), 36 | ('Number', (r''' 37 | -? # Minus 38 | (0|([1-9][0-9]*)) # Int 39 | (\.[0-9]+)? # Frac 40 | ([Ee][+-][0-9]+)? 
# Exp 41 | ''', re.VERBOSE)), 42 | ('Op', (r'[{}\[\]\-,:]',)), 43 | ('Name', (r'[A-Za-z_][A-Za-z_0-9]*',)), 44 | ] 45 | 46 | return make_tokenizer(specs) 47 | 48 | 49 | def tokenize(tokenizer, string): 50 | useless = ['Space'] 51 | 52 | return [x for x in tokenizer(string) if x.type not in useless] 53 | 54 | 55 | def create_grammar(): 56 | tokval = lambda x: x.value 57 | toktype = lambda t: some(lambda x: x.type == t) >> tokval 58 | op = lambda s: a(Token('Op', s)) >> tokval 59 | op_ = lambda s: skip(op(s)) 60 | n = lambda s: a(Token('Name', s)) >> tokval 61 | 62 | null = n('null') 63 | true = n('true') 64 | false = n('false') 65 | number = toktype('Number') 66 | string = toktype('String') 67 | value = forward_decl() 68 | member = string + op_(':') + value 69 | object_ = (op_('{') + 70 | maybe(member + many(op_(',') + member)) + 71 | op_('}')) 72 | array = (op_('[') + 73 | maybe(value + many(op_(',') + value)) + 74 | op_(']')) 75 | value.define(null 76 | | true 77 | | false 78 | | object_ 79 | | array 80 | | number 81 | | string) 82 | json_text = object_ | array 83 | json_file = json_text + skip(finished) 84 | 85 | return json_file 86 | 87 | 88 | def parse_time(json_string, iterations): 89 | grammar = create_grammar() 90 | tokenizer = create_tokenizer() 91 | 92 | def _parse(): 93 | grammar.parse(tokenize(tokenizer, json_string)) 94 | 95 | return timeit.timeit(_parse, number=iterations) 96 | 97 | 98 | def parse(json_string): 99 | grammar = create_grammar() 100 | tokenizer = create_tokenizer() 101 | 102 | return grammar.parse(tokenize(tokenizer, json_string)) 103 | 104 | 105 | def version(): 106 | return 'unknown' 107 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/lark_json.py: -------------------------------------------------------------------------------- 1 | """Based on 2 | https://github.com/lark-parser/lark/blob/master/docs/json_tutorial.md. 
3 | 4 | """ 5 | 6 | import timeit 7 | 8 | import lark 9 | from lark import Lark 10 | 11 | 12 | LARK_JSON_GRAMMAR = r""" 13 | ?value: dict 14 | | list 15 | | string 16 | | SIGNED_NUMBER 17 | | "true" 18 | | "false" 19 | | "null" 20 | 21 | list : "[" [value ("," value)*] "]" 22 | 23 | dict : "{" [pair ("," pair)*] "}" 24 | pair : string ":" value 25 | 26 | string : ESCAPED_STRING 27 | 28 | %import common.ESCAPED_STRING 29 | %import common.SIGNED_NUMBER 30 | %import common.WS 31 | %ignore WS 32 | """ 33 | 34 | 35 | def parse_time_lalr(json_string, iterations): 36 | parser = Lark(LARK_JSON_GRAMMAR, 37 | start='value', 38 | lexer='standard', 39 | parser='lalr') 40 | 41 | def _parse(): 42 | parser.parse(json_string) 43 | 44 | return timeit.timeit(_parse, number=iterations) 45 | 46 | 47 | def parse_time_earley(json_string, iterations): 48 | parser = Lark(LARK_JSON_GRAMMAR, 49 | start='value', 50 | lexer='standard', 51 | parser='earley') 52 | 53 | def _parse(): 54 | parser.parse(json_string) 55 | 56 | return timeit.timeit(_parse, number=iterations) 57 | 58 | 59 | def parse_lalr(json_string): 60 | parser = Lark(LARK_JSON_GRAMMAR, 61 | start='value', 62 | lexer='standard', 63 | parser='lalr') 64 | 65 | return parser.parse(json_string) 66 | 67 | 68 | def parse_earley(json_string): 69 | parser = Lark(LARK_JSON_GRAMMAR, 70 | start='value', 71 | lexer='standard', 72 | parser='earley') 73 | 74 | return parser.parse(json_string) 75 | 76 | 77 | def version(): 78 | return lark.__version__ 79 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/parsimonious_json.py: -------------------------------------------------------------------------------- 1 | """Based on 2 | https://gist.github.com/goodmami/686385b4b39a3bac00fbbe78a5cda6c8, by 3 | Michael Wayne Goodman. 4 | 5 | """ 6 | 7 | import timeit 8 | 9 | from parsimonious.grammar import Grammar 10 | 11 | 12 | grammar = Grammar( 13 | r""" 14 | Start = ~"\s*" ( Object / Array ) ~"\s*" 15 | Object = ~"{\s*" Members? ~"\s*}" 16 | Members = MappingComma* Mapping 17 | MappingComma = Mapping ~"\s*,\s*" 18 | Mapping = DQString ~"\s*:\s*" Value 19 | Array = ~"\[\s*" Items? ~"\s*\]" 20 | Items = ValueComma* Value 21 | ValueComma = Value ~"\s*,\s*" 22 | Value = Object / Array / DQString 23 | / TrueVal / FalseVal / NullVal / Float / Integer 24 | TrueVal = "true" 25 | FalseVal = "false" 26 | NullVal = "null" 27 | DQString = ~"\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"" 28 | Float = ~"[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?" 29 | Integer = ~"[-+]?\d+" 30 | """) 31 | 32 | 33 | def parse_time(json_string, iterations): 34 | def _parse(): 35 | grammar.parse(json_string) 36 | 37 | return timeit.timeit(_parse, number=iterations) 38 | 39 | 40 | def parse(json_string): 41 | return grammar.parse(json_string) 42 | 43 | 44 | def version(): 45 | return 'unknown' 46 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/parsita_json.py: -------------------------------------------------------------------------------- 1 | """Based on 2 | https://github.com/drhagen/parsita/blob/master/examples/json.py. 
3 | 4 | """ 5 | 6 | import timeit 7 | 8 | from parsita import TextParsers 9 | from parsita import lit 10 | from parsita import reg 11 | from parsita import rep 12 | from parsita import repsep 13 | 14 | 15 | class JsonStringParsers(TextParsers, whitespace=None): 16 | quote = lit(r'\"') 17 | reverse_solidus = lit(r'\\') 18 | solidus = lit(r'\/') 19 | backspace = lit(r'\b') 20 | form_feed = lit(r'\f') 21 | line_feed = lit(r'\n') 22 | carriage_return = lit(r'\r') 23 | tab = lit(r'\t') 24 | uni = reg(r'\\u([0-9a-fA-F]{4})') 25 | 26 | escaped = (quote | reverse_solidus | solidus | backspace | form_feed | 27 | line_feed | carriage_return | tab | uni) 28 | unescaped = reg(r'[\u0020-\u0021\u0023-\u005B\u005D-\U0010FFFF]+') 29 | 30 | string = '"' >> rep(escaped | unescaped) << '"' > ''.join 31 | 32 | 33 | class JsonParsers(TextParsers, whitespace=r'[ \t\n\r]*'): 34 | number = reg(r'-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?') 35 | 36 | false = lit('false') 37 | true = lit('true') 38 | null = lit('null') 39 | 40 | string = JsonStringParsers.string 41 | 42 | array = '[' >> repsep(value, ',') << ']' 43 | 44 | entry = string << ':' & value 45 | obj = '{' >> repsep(entry, ',') << '}' 46 | 47 | value = (number 48 | | false 49 | | true 50 | | null 51 | | string 52 | | array 53 | | obj) 54 | 55 | 56 | def parse_time(json_string, iterations): 57 | def _parse(): 58 | JsonParsers.value.parse(json_string) 59 | 60 | return timeit.timeit(_parse, number=iterations) 61 | 62 | 63 | def parse(json_string): 64 | return JsonParsers.value.parse(json_string) 65 | 66 | 67 | def version(): 68 | return 'unknown' 69 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/parsy_json.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | 3 | import parsy 4 | from parsy import generate 5 | from parsy import regex 6 | from parsy import string 7 | 8 | 9 | whitespace = regex(r'\s*') 10 | lexeme = lambda p: p << whitespace 11 | lbrace = lexeme(string('{')) 12 | rbrace = lexeme(string('}')) 13 | lbrack = lexeme(string('[')) 14 | rbrack = lexeme(string(']')) 15 | colon = lexeme(string(':')) 16 | comma = lexeme(string(',')) 17 | true = lexeme(string('true')) 18 | false = lexeme(string('false')) 19 | null = lexeme(string('null')) 20 | number = lexeme( 21 | regex(r'-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?') 22 | ) 23 | string_part = regex(r'[^"\\]+') 24 | string_esc = string('\\') >> ( 25 | string('\\') 26 | | string('/') 27 | | string('"') 28 | | string('b') 29 | | string('f') 30 | | string('n') 31 | | string('r') 32 | | string('t') 33 | | regex(r'u[0-9a-fA-F]{4}').map(lambda s: chr(int(s[1:], 16))) 34 | ) 35 | quoted = lexeme(string('"') >> (string_part | string_esc).many().concat() << string('"')) 36 | 37 | 38 | # Circular dependency between array and value means we use `generate` 39 | # form here. 
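# (The decorated generator below runs only when a parse is executed, so its
# reference to `value`, which is assigned further down at module level, is
# bound by the time it is needed.)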
40 | @generate 41 | def array(): 42 | yield lbrack 43 | elements = yield value.sep_by(comma) 44 | yield rbrack 45 | 46 | return elements 47 | 48 | 49 | @generate 50 | def object_pair(): 51 | key = yield quoted 52 | yield colon 53 | val = yield value 54 | 55 | return (key, val) 56 | 57 | 58 | json_object = lbrace >> object_pair.sep_by(comma) << rbrace 59 | value = quoted | number | json_object | array | true | false | null 60 | json = whitespace >> value 61 | 62 | 63 | def parse_time(json_string, iterations): 64 | def _parse(): 65 | json.parse(json_string) 66 | 67 | return timeit.timeit(_parse, number=iterations) 68 | 69 | 70 | def parse(json_string): 71 | return json.parse(json_string) 72 | 73 | 74 | def version(): 75 | return parsy.__version__ 76 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/pyleri_json.py: -------------------------------------------------------------------------------- 1 | """Based on 2 | https://github.com/transceptor-technology/pyleri/blob/master/examples/json_grammar.py. 3 | 4 | """ 5 | 6 | import timeit 7 | 8 | import pyleri 9 | from pyleri import Ref 10 | from pyleri import Choice 11 | from pyleri import Grammar 12 | from pyleri import Regex 13 | from pyleri import Keyword 14 | from pyleri import Sequence 15 | from pyleri import List 16 | 17 | 18 | class JsonGrammar(Grammar): 19 | START = Ref() 20 | 21 | # JSON strings should be enclosed in double quotes. 22 | # A backslash can be used as escape character. 23 | r_string = Regex('(")(?:(?=(\\\?))\\2.)*?\\1') 24 | 25 | # JSON does not support floats or integers prefixed with a + sign 26 | # and floats must start with a number, for example .5 is not allowed 27 | # but should be written like 0.5 28 | r_float = Regex('-?[0-9]+\.?[0-9]+') 29 | r_integer = Regex('-?[0-9]+') 30 | 31 | k_true = Keyword('true') 32 | k_false = Keyword('false') 33 | k_null = Keyword('null') 34 | 35 | json_map_item = Sequence(r_string, ':', START) 36 | 37 | json_map = Sequence('{', List(json_map_item), '}') 38 | json_array = Sequence('[', List(START), ']') 39 | 40 | START = Choice(r_string, 41 | r_float, 42 | r_integer, 43 | k_true, 44 | k_false, 45 | k_null, 46 | json_map, 47 | json_array) 48 | 49 | 50 | def parse_time(json_string, iterations): 51 | grammar = JsonGrammar() 52 | 53 | def _parse(): 54 | grammar.parse(json_string) 55 | 56 | return timeit.timeit(_parse, number=iterations) 57 | 58 | 59 | def parse(json_string): 60 | return JsonGrammar().parse(json_string) 61 | 62 | 63 | def version(): 64 | return pyleri.__version__ 65 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/pyparsing_json.py: -------------------------------------------------------------------------------- 1 | """Based on http://pyparsing.wikispaces.com/file/view/jsonParser.py. 
2 | 3 | """ 4 | 5 | import timeit 6 | 7 | import pyparsing 8 | from pyparsing import Keyword 9 | from pyparsing import Suppress 10 | from pyparsing import dblQuotedString 11 | from pyparsing import Forward 12 | from pyparsing import Group 13 | from pyparsing import delimitedList 14 | from pyparsing import Optional 15 | from pyparsing import pyparsing_common 16 | from pyparsing import Dict 17 | 18 | 19 | def create_grammar(): 20 | TRUE = Keyword('true') 21 | FALSE = Keyword('false') 22 | NULL = Keyword('null') 23 | 24 | LBRACK, RBRACK, LBRACE, RBRACE, COLON = map(Suppress, '[]{}:') 25 | 26 | string = dblQuotedString() 27 | number = pyparsing_common.number() 28 | 29 | object_ = Forward() 30 | value = Forward() 31 | elements = delimitedList(value) 32 | array = Group(LBRACK + Optional(elements, []) + RBRACK) 33 | value <<= (string 34 | | number 35 | | Group(object_) 36 | | array 37 | | TRUE 38 | | FALSE 39 | | NULL) 40 | member = Group(string + COLON + value) 41 | members = delimitedList(member) 42 | object_ <<= Dict(LBRACE + Optional(members) + RBRACE) 43 | 44 | return value 45 | 46 | 47 | def parse_time(json_string, iterations=1): 48 | grammar = create_grammar() 49 | 50 | def _parse(): 51 | grammar.parseString(json_string) 52 | 53 | return timeit.timeit(_parse, number=iterations) 54 | 55 | 56 | def parse(json_string): 57 | grammar = create_grammar() 58 | 59 | return grammar.parseString(json_string).asList() 60 | 61 | 62 | def version(): 63 | return pyparsing.__version__ 64 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/textparser_json.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | 3 | import textparser 4 | from textparser import Forward 5 | from textparser import Sequence 6 | from textparser import DelimitedList 7 | from textparser import choice 8 | from textparser import Optional 9 | 10 | 11 | class Parser(textparser.Parser): 12 | 13 | def token_specs(self): 14 | return [ 15 | ('SKIP', r'[ \r\n\t]+'), 16 | ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'), 17 | ('TRUE', r'true'), 18 | ('FALSE', r'false'), 19 | ('NULL', r'null'), 20 | ('ESCAPED_STRING', r'"(\\"|[^"])*?"'), 21 | ('LPAREN', '(', r'\('), 22 | ('RPAREN', ')', r'\)'), 23 | ('LBRACKET', '[', r'\['), 24 | ('RBRACKET', ']', r'\]'), 25 | ('LBRACE', '{', r'\{'), 26 | ('RBRACE', '}', r'\}'), 27 | ('COMMA', ',', r','), 28 | ('COLON', ':', r':'), 29 | ('MISMATCH', r'.') 30 | ] 31 | 32 | def grammar(self): 33 | value = Forward() 34 | list_ = Sequence('[', Optional(DelimitedList(value)), ']') 35 | pair = Sequence('ESCAPED_STRING', ':', value) 36 | dict_ = Sequence('{', Optional(DelimitedList(pair)), '}') 37 | value <<= choice(list_, 38 | dict_, 39 | 'ESCAPED_STRING', 40 | 'NUMBER', 41 | 'TRUE', 42 | 'FALSE', 43 | 'NULL') 44 | 45 | return value 46 | 47 | 48 | def parse_time(json_string, iterations): 49 | parser = Parser() 50 | 51 | def _parse(): 52 | parser.parse(json_string) 53 | 54 | return timeit.timeit(_parse, number=iterations) 55 | 56 | 57 | def parse(json_string): 58 | return Parser().parse(json_string) 59 | 60 | 61 | def version(): 62 | return textparser.__version__ 63 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/textx_json.py: -------------------------------------------------------------------------------- 1 | """Based on 2 | https://github.com/igordejanovic/textX/tree/master/examples/json. 
3 |
4 | """
5 |
6 | import timeit
7 |
8 | import textx
9 | from textx import metamodel_from_str
10 |
11 |
12 | GRAMMAR = '''\
13 | /*
14 | A grammar for JSON data-interchange format.
15 | See: http://www.json.org/
16 | */
17 | File:
18 |     Array | Object
19 | ;
20 |
21 | Array:
22 |     "[" values*=Value[','] "]"
23 | ;
24 |
25 | Value:
26 |     STRING | FLOAT | BOOL | Object | Array | "null"
27 | ;
28 |
29 | Object:
30 |     "{" members*=Member[','] "}"
31 | ;
32 |
33 | Member:
34 |     key=STRING ':' value=Value
35 | ;
36 | '''
37 |
38 |
39 | def parse_time(json_string, iterations):
40 |     parser = metamodel_from_str(GRAMMAR)
41 |
42 |     def _parse():
43 |         parser.model_from_str(json_string)
44 |
45 |     return timeit.timeit(_parse, number=iterations)
46 |
47 |
48 | def parse(json_string):
49 |     parser = metamodel_from_str(GRAMMAR)
50 |
51 |     return parser.model_from_str(json_string)
52 |
53 |
54 | def version():
55 |     return textx.__version__
56 |
--------------------------------------------------------------------------------
/examples/benchmarks/json/speed.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """A benchmark comparing the speed of 10 parsers.
4 |
5 | Test data generated with https://www.json-generator.com.
6 |
7 | Example execution:
8 |
9 | $ env PYTHONPATH=. python3 examples/benchmarks/json/speed.py
10 | Parsed 'examples/benchmarks/json/data.json' 1 time(s) in:
11 |
12 | PACKAGE        SECONDS  RATIO VERSION
13 | textparser        0.10   100% 0.17.0
14 | parsimonious      0.17   174% unknown
15 | lark (LALR)       0.25   253% 0.6.4
16 | funcparserlib     0.33   335% unknown
17 | textx             0.51   520% 1.7.1
18 | pyparsing         0.65   654% 2.2.0
19 | pyleri            0.78   786% 1.2.2
20 | parsy             0.92   931% 1.2.0
21 | lark (Earley)     1.80  1816% 0.6.4
22 | parsita           2.22  2251% unknown
23 | $
24 |
25 | """
26 |
27 | from __future__ import print_function
28 |
29 | import os
30 |
31 | from parsers import textparser_json
32 | from parsers import lark_json
33 | from parsers import pyparsing_json
34 | from parsers import funcparserlib_json
35 | from parsers import parsimonious_json
36 | from parsers import textx_json
37 |
38 | try:
39 |     from parsers import parsita_json
40 | except ImportError:
41 |     class parsita_json(object):
42 |
43 |         @staticmethod
44 |         def parse_time(_json_string, _iterations):
45 |             return float('inf')
46 |
47 |         @staticmethod
48 |         def version():
49 |             return 'unknown'
50 |
51 | try:
52 |     from parsers import parsy_json
53 | except ImportError:
54 |     class parsy_json(object):
55 |
56 |         @staticmethod
57 |         def parse_time(_json_string, _iterations):
58 |             return float('inf')
59 |
60 |         @staticmethod
61 |         def version():
62 |             return 'unknown'
63 |
64 | try:
65 |     from parsers import pyleri_json
66 | except ImportError:
67 |     class pyleri_json(object):
68 |
69 |         @staticmethod
70 |         def parse_time(_json_string, _iterations):
71 |             return float('inf')
72 |
73 |         @staticmethod
74 |         def version():
75 |             return 'unknown'
76 |
77 |
78 | SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
79 | DATA_JSON = os.path.relpath(os.path.join(SCRIPT_DIR, 'data.json'))
80 |
81 | ITERATIONS = 1
82 |
83 |
84 | with open(DATA_JSON, 'r') as fin:
85 |     JSON_STRING = fin.read()
86 |
87 | textparser_time = textparser_json.parse_time(JSON_STRING, ITERATIONS)
88 | lark_lalr_time = lark_json.parse_time_lalr(JSON_STRING, ITERATIONS)
89 | lark_earley_time = lark_json.parse_time_earley(JSON_STRING, ITERATIONS)
90 | pyparsing_time = pyparsing_json.parse_time(JSON_STRING, ITERATIONS)
91 | parsita_time = parsita_json.parse_time(JSON_STRING, ITERATIONS)
92 | funcparserlib_time = funcparserlib_json.parse_time(JSON_STRING, ITERATIONS)
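# Parsers that failed to import above report float('inf') from
# parse_time(), so they sort last and print an 'inf' ratio instead of
# aborting the whole benchmark.  A hypothetical, more compact variant
# of the same import-fallback pattern (load_parser and the stub are
# made-up names, not part of this script):
#
#     import importlib
#
#     def load_parser(name):
#         try:
#             return importlib.import_module('parsers.' + name)
#         except ImportError:
#             return MISSING_PARSER_STUB  # parse_time() returns inf.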
93 | parsy_time = parsy_json.parse_time(JSON_STRING, ITERATIONS)
94 | parsimonious_time = parsimonious_json.parse_time(JSON_STRING, ITERATIONS)
95 | pyleri_time = pyleri_json.parse_time(JSON_STRING, ITERATIONS)
96 | textx_time = textx_json.parse_time(JSON_STRING, ITERATIONS)
97 |
98 | # Parse-comparison output.
99 | measurements = [
100 |     ('textparser', textparser_time, textparser_json.version()),
101 |     ('lark (LALR)', lark_lalr_time, lark_json.version()),
102 |     ('lark (Earley)', lark_earley_time, lark_json.version()),
103 |     ('pyparsing', pyparsing_time, pyparsing_json.version()),
104 |     ('parsita', parsita_time, parsita_json.version()),
105 |     ('funcparserlib', funcparserlib_time, funcparserlib_json.version()),
106 |     ('parsy', parsy_time, parsy_json.version()),
107 |     ('parsimonious', parsimonious_time, parsimonious_json.version()),
108 |     ('pyleri', pyleri_time, pyleri_json.version()),
109 |     ('textx', textx_time, textx_json.version())
110 | ]
111 |
112 | measurements = sorted(measurements, key=lambda m: m[1])
113 |
114 | print()
115 | print("Parsed '{}' {} time(s) in:".format(DATA_JSON, ITERATIONS))
116 | print()
117 | print('PACKAGE        SECONDS  RATIO VERSION')
118 |
119 | for package, seconds, version in measurements:
120 |     try:
121 |         ratio = int(round(100 * (seconds / textparser_time), 0))
122 |         ratio = '{:5}'.format(ratio)
123 |     except OverflowError:
124 |         ratio = '  inf'
125 |
126 |     print('{:14s} {:7.02f} {}% {}'.format(package,
127 |                                           seconds,
128 |                                           ratio,
129 |                                           version))
130 |
--------------------------------------------------------------------------------
/examples/hello_world.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | #
3 | # $ env PYTHONPATH=.. python3 hello_world.py
4 | # Tree: ['Hello', ',', 'World', '!']
5 | #
6 |
7 | import textparser
8 | from textparser import Sequence
9 |
10 |
11 | class Parser(textparser.Parser):
12 |
13 |     def token_specs(self):
14 |         return [
15 |             ('SKIP', r'[ \r\n\t]+'),
16 |             ('WORD', r'\w+'),
17 |             ('EMARK', '!', r'!'),
18 |             ('COMMA', ',', r','),
19 |             ('MISMATCH', r'.')
20 |         ]
21 |
22 |     def grammar(self):
23 |         return Sequence('WORD', ',', 'WORD', '!')
24 |
25 |
26 | tree = Parser().parse('Hello, World!')
27 |
28 | print('Tree:', tree)
29 |
--------------------------------------------------------------------------------
/examples/json.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """A JSON example of how to transform a parse tree of tokens into
4 | lists, dicts, floats, booleans and None.
5 |
6 | NOTE: The parse tree transformation is implemented as a separate step
7 | after parsing. Making the transformation part of the parser is
8 | probably desirable, but there are currently no plans to do so because
9 | there is no use case at the moment (for me, probably the only user).
10 |
11 | $ env PYTHONPATH=.
python3 examples/json.py 12 | {'number': 0.11, 'false': False, 'true': True, 'null': None, 'list': [None, 'string']} 13 | 14 | """ 15 | 16 | import textparser 17 | from textparser import Forward 18 | from textparser import Sequence 19 | from textparser import DelimitedList 20 | from textparser import choice 21 | from textparser import Optional 22 | 23 | 24 | JSON_TEXT = '''\ 25 | { 26 | "number": 0.11, 27 | "false": false, 28 | "true": true, 29 | "null": null, 30 | "list": [null, "string"] 31 | } 32 | ''' 33 | 34 | 35 | class Parser(textparser.Parser): 36 | 37 | def token_specs(self): 38 | return [ 39 | ('SKIP', r'[ \r\n\t]+'), 40 | ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'), 41 | ('TRUE', r'true'), 42 | ('FALSE', r'false'), 43 | ('NULL', r'null'), 44 | ('ESCAPED_STRING', r'"(\\"|[^"])*?"'), 45 | ('LPAREN', '(', r'\('), 46 | ('RPAREN', ')', r'\)'), 47 | ('LBRACKET', '[', r'\['), 48 | ('RBRACKET', ']', r'\]'), 49 | ('LBRACE', '{', r'\{'), 50 | ('RBRACE', '}', r'\}'), 51 | ('COMMA', ',', r','), 52 | ('COLON', ':', r':'), 53 | ('MISMATCH', r'.') 54 | ] 55 | 56 | def grammar(self): 57 | value = Forward() 58 | list_ = Sequence('[', Optional(DelimitedList(value)), ']') 59 | pair = Sequence('ESCAPED_STRING', ':', value) 60 | dict_ = Sequence('{', Optional(DelimitedList(pair)), '}') 61 | value <<= choice(list_, 62 | dict_, 63 | 'ESCAPED_STRING', 64 | 'NUMBER', 65 | 'TRUE', 66 | 'FALSE', 67 | 'NULL') 68 | 69 | return value 70 | 71 | 72 | def transform(token): 73 | if isinstance(token, list): 74 | if token[0].kind == '{': 75 | if len(token[1]) > 0: 76 | return { 77 | key.value[1:-1]: transform(v) 78 | for key, _, v in token[1][0] 79 | } 80 | else: 81 | return {} 82 | else: 83 | if len(token[1]) > 0: 84 | return [transform(elem) for elem in token[1][0]] 85 | else: 86 | return [] 87 | elif token.kind == 'ESCAPED_STRING': 88 | return token.value[1:-1] 89 | elif token.kind == 'NUMBER': 90 | return float(token.value) 91 | elif token.kind == 'TRUE': 92 | return True 93 | elif token.kind == 'FALSE': 94 | return False 95 | else: 96 | return None 97 | 98 | 99 | print(transform(Parser().parse(JSON_TEXT, token_tree=True))) 100 | -------------------------------------------------------------------------------- /examples/proto3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # $ env PYTHONPATH=.. 
python3 proto3.py 4 | # Tree: [['syntax', '=', '"proto3"', ';'], 5 | # [['import', ['public'], '"foo.bar"', ';'], 6 | # ['option', ['java_package', []], '=', '"com.example.foo"', ';'], 7 | # ['option', ['java_multiple_files', []], '=', ['true'], ';'], 8 | # ['enum', 9 | # 'EnumAllowingAlias', 10 | # ['{', 11 | # [['option', ['allow_alias', []], '=', ['true'], ';'], 12 | # ['UNKNOWN', '=', '0', [], ';'], 13 | # ['STARTED', '=', '1', [], ';'], 14 | # ['RUNNING', 15 | # '=', 16 | # '2', 17 | # [['[', 18 | # [[[['(', ['custom_option'], ')'], []], '=', '"hello world"']], 19 | # ']']], 20 | # ';']], 21 | # '}']], 22 | # ['message', 23 | # 'outer', 24 | # ['{', 25 | # [['option', [['(', ['my_option'], ')'], [['.', 'a']]], '=', ['true'], ';'], 26 | # [[], 27 | # [[], ['int32']], 28 | # 'old_field', 29 | # '=', 30 | # '1', 31 | # [['[', [[['deprecated', []], '=', ['true']]], ']']], 32 | # ';'], 33 | # ['message', 34 | # 'inner', 35 | # ['{', [[[], [[], ['int64']], 'ival', '=', '2', [], ';']], '}']], 36 | # [['repeated'], [[], ['inner']], 'inner_message', '=', '3', [], ';'], 37 | # [[], [[], ['EnumAllowingAlias']], 'enum_field', '=', '4', [], ';'], 38 | # ['map', 39 | # '<', 40 | # 'int32', 41 | # ',', 42 | # [[], ['string']], 43 | # '>', 44 | # 'my_map', 45 | # '=', 46 | # '5', 47 | # [], 48 | # ';'], 49 | # [[], [[], ['foo', 'bar', 'Open']], 'open', '=', '6', [], ';'], 50 | # [[], [['.'], ['foo', 'bar', 'Close']], 'close', '=', '7', [], ';'], 51 | # ['oneof', 52 | # 'test_oneof', 53 | # '{', 54 | # [[[[], ['string']], 'name', '=', '8', [], ';'], 55 | # [[[], ['SubMessage']], 'sub_message', '=', '9', [], ';']], 56 | # '}'], 57 | # ['reserved', [['2', '15', '9'], [['to', '11']]], ';'], 58 | # ['reserved', [['7'], []], ';'], 59 | # ['reserved', [['15'], [['to', 'max']]], ';']], 60 | # '}']], 61 | # ['service', 62 | # 'SearchService', 63 | # '{', 64 | # [['rpc', 65 | # 'Search', 66 | # '(', 67 | # [], 68 | # 'SearchRequest', 69 | # ')', 70 | # 'returns', 71 | # '(', 72 | # [], 73 | # 'SearchResponse', 74 | # ')', 75 | # ';']], 76 | # '}']]] 77 | # 78 | 79 | from pprint import pformat 80 | 81 | import textparser 82 | from textparser import Sequence 83 | from textparser import ZeroOrMore 84 | from textparser import choice 85 | from textparser import Optional 86 | from textparser import DelimitedList 87 | from textparser import Forward 88 | 89 | 90 | class Parser(textparser.Parser): 91 | 92 | def keywords(self): 93 | return set([ 94 | 'syntax', 95 | 'import', 96 | 'public', 97 | 'option', 98 | 'enum', 99 | 'bool', 100 | 'string', 101 | 'message', 102 | 'rpc', 103 | 'service', 104 | 'returns', 105 | 'repeated', 106 | 'map', 107 | 'package', 108 | 'stream', 109 | 'weak', 110 | 'oneof', 111 | 'reserved', 112 | 'to', 113 | 'int32', 114 | 'int64', 115 | 'uint32', 116 | 'uint64', 117 | 'sint32', 118 | 'sint64', 119 | 'fixed32', 120 | 'fixed64', 121 | 'sfixed32', 122 | 'sfixed64', 123 | 'true', 124 | 'false', 125 | 'min', 126 | 'max' 127 | ]) 128 | 129 | def token_specs(self): 130 | decimals = r'[0-9]+' 131 | exponent = r'[eE][+-]?[0-9]+' 132 | re_float = r'{d}\.[0-9]?({e})?|{d}({e})?|\.{d}({e})?|inf|nan'.format( 133 | d=decimals, 134 | e=exponent) 135 | 136 | return [ 137 | ('SKIP', r'[ \r\n\t]+|//[\s\S]*?\n'), 138 | ('ESCAPED_STRING', r'"(\\"|[^"])*?"'), 139 | ('INT', r'[1-9][0-9]*|0[0-7]*|0[xX][0-9a-fA-F]+'), 140 | ('FLOAT', re_float), 141 | ('IDENT', r'[a-zA-Z][a-zA-Z0-9_]*'), 142 | ('DOT', '.', r'\.'), 143 | ('COMMA', ',', r','), 144 | ('SCOLON', ';', r';'), 145 | ('EQ', '=', r'='), 146 | ('LT', '<', 
r'<'), 147 | ('GT', '>', r'>'), 148 | ('LBRACE', '{', r'\{'), 149 | ('RBRACE', '}', r'\}'), 150 | ('LBRACK', '[', r'\['), 151 | ('RBRACK', ']', r'\]'), 152 | ('LPAREN', '(', r'\('), 153 | ('RPAREN', ')', r'\)'), 154 | ('MISMATCH', r'.') 155 | ] 156 | 157 | def grammar(self): 158 | message = Forward() 159 | rpc = Forward() 160 | 161 | ident = choice(*(list(self.keywords()) + ['IDENT'])) 162 | full_ident = DelimitedList(ident, delim='.') 163 | 164 | # Constant. 165 | constant = choice(full_ident, 166 | Sequence(Optional(choice('-', '+')), 'INT'), 167 | Sequence(Optional(choice('-', '+')), 'FLOAT'), 168 | 'ESCAPED_STRING', 169 | 'true', 170 | 'false') 171 | 172 | # Syntax. 173 | syntax = Sequence('syntax', '=', 'ESCAPED_STRING', ';') 174 | 175 | # Import statement. 176 | import_ = Sequence('import', 177 | Optional(choice('weak', 'public')), 178 | 'ESCAPED_STRING', ';') 179 | 180 | # Package. 181 | package = Sequence('package', full_ident, ';') 182 | 183 | # Option. 184 | option_name = Sequence(choice(ident, Sequence('(', full_ident, ')')), 185 | ZeroOrMore(Sequence('.', ident))) 186 | option = Sequence('option', option_name, '=', constant, ';') 187 | 188 | # Fields. 189 | type_ = choice(Sequence(Optional('.'), DelimitedList(ident, '.')), 190 | ident) 191 | field_number = 'INT' 192 | 193 | # Normal field. 194 | field_option = Sequence(option_name, '=', constant) 195 | field_options = DelimitedList(field_option) 196 | field = Sequence(Optional('repeated'), 197 | type_, ident, '=', field_number, 198 | Optional(Sequence('[', field_options, ']')), 199 | ';') 200 | 201 | # Oneof and oneof field. 202 | oneof_field = Sequence(type_, ident, '=', field_number, 203 | Optional(Sequence('[', field_options, ']')), 204 | ';') 205 | oneof = Sequence('oneof', ident, 206 | '{', 207 | ZeroOrMore(choice(oneof_field, ';')), 208 | '}') 209 | 210 | # Map field. 211 | key_type = choice('int32', 212 | 'int64', 213 | 'uint32', 214 | 'uint64', 215 | 'sint32', 216 | 'sint64', 217 | 'fixed32', 218 | 'fixed64', 219 | 'sfixed32', 220 | 'sfixed64', 221 | 'bool', 222 | 'string') 223 | map_field = Sequence('map', '<', key_type, ',', type_, '>', 224 | ident, '=', field_number, 225 | Optional(Sequence('[', field_options, ']')), 226 | ';') 227 | 228 | # Reserved. 229 | field_names = DelimitedList(ident) 230 | ranges = Sequence(DelimitedList('INT'), 231 | Optional(Sequence('to', choice('INT', 'max')))) 232 | reserved = Sequence('reserved', choice(ranges, field_names), ';') 233 | 234 | # Enum definition. 235 | enum_value_option = Sequence(option_name, '=', constant) 236 | enum_field = Sequence( 237 | ident, '=', 'INT', 238 | Optional(Sequence('[', DelimitedList(enum_value_option), ']')), 239 | ';') 240 | enum_body = Sequence('{', 241 | ZeroOrMore(choice(option, enum_field, ';')), 242 | '}') 243 | enum = Sequence('enum', ident, enum_body) 244 | 245 | # Message definition. 246 | message_body = Sequence('{', 247 | ZeroOrMore(choice(field, 248 | enum, 249 | message, 250 | option, 251 | oneof, 252 | map_field, 253 | reserved, 254 | ';')), 255 | '}') 256 | message <<= Sequence('message', ident, message_body) 257 | 258 | # Service definition. 259 | service = Sequence('service', ident, 260 | '{', 261 | ZeroOrMore(choice(option, rpc, ';')), 262 | '}') 263 | rpc <<= Sequence('rpc', ident, 264 | '(', 265 | Optional('stream'), ident, 266 | ')', 267 | 'returns', 268 | '(', 269 | Optional('stream'), ident, 270 | ')', 271 | choice(Sequence('{', 272 | ZeroOrMore(choice(option, ';')), 273 | '}'), 274 | ';')) 275 | 276 | # Proto file. 
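# (`message` and `rpc` were declared as Forward() at the top of this
# method and bound with `<<=` above; that indirection is what lets
# messages nest recursively and services reference rpc definitions.)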
277 |     proto = Sequence(syntax,
278 |                      ZeroOrMore(choice(import_,
279 |                                        package,
280 |                                        option,
281 |                                        message,
282 |                                        enum,
283 |                                        service,
284 |                                        ';')))
285 |
286 |     return proto
287 |
288 |
289 | proto_string = '''
290 | syntax = "proto3";
291 |
292 | import public "foo.bar";
293 |
294 | option java_package = "com.example.foo";
295 | option java_multiple_files = true;
296 |
297 | enum EnumAllowingAlias {
298 |     option allow_alias = true;
299 |     UNKNOWN = 0;
300 |     STARTED = 1;
301 |     RUNNING = 2 [(custom_option) = "hello world"];
302 | }
303 |
304 | message outer {
305 |     option (my_option).a = true;
306 |     int32 old_field = 1 [deprecated=true];
307 |     message inner { // Level 2
308 |         int64 ival = 2;
309 |     }
310 |     repeated inner inner_message = 3;
311 |     EnumAllowingAlias enum_field =4;
312 |     map<int32, string> my_map = 5;
313 |     foo.bar.Open open = 6;
314 |     .foo.bar.Close close = 7;
315 |     oneof test_oneof {
316 |         string name = 8;
317 |         SubMessage sub_message = 9;
318 |     }
319 |     reserved 2, 15, 9 to 11;
320 |     reserved 7;
321 |     reserved 15 to max;
322 | }
323 |
324 | service SearchService {
325 |     rpc Search (SearchRequest) returns (SearchResponse);
326 | }
327 | '''
328 |
329 | tree = Parser().parse(proto_string)
330 |
331 | print('Tree:', pformat(tree))
332 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | lark-parser
2 | pyparsing
3 | parsita
4 | funcparserlib
5 | parsy
6 | parsimonious
7 | textx
8 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from setuptools import setup
4 | from setuptools import find_packages
5 | import re
6 |
7 |
8 | def find_version():
9 |     return re.search(r"^__version__ = '(.*)'$",
10 |                      open('textparser.py', 'r').read(),
11 |                      re.MULTILINE).group(1)
12 |
13 |
14 | setup(name='textparser',
15 |       version=find_version(),
16 |       description='Text parser.',
17 |       long_description=open('README.rst', 'r').read(),
18 |       author='Erik Moqvist',
19 |       author_email='erik.moqvist@gmail.com',
20 |       license='MIT',
21 |       classifiers=[
22 |           'License :: OSI Approved :: MIT License',
23 |           'Programming Language :: Python :: 2',
24 |           'Programming Language :: Python :: 3',
25 |       ],
26 |       keywords=['parser', 'parsing'],
27 |       url='https://github.com/eerimoq/textparser',
28 |       py_modules=['textparser'],
29 |       test_suite="tests")
30 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eerimoq/textparser/1ef809eb283da3c3ec7b8bc682f11eeada3a81d6/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_textparser.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from collections import namedtuple
3 |
4 | import textparser
5 | from textparser import Grammar
6 | from textparser import Sequence
7 | from textparser import Choice
8 | from textparser import choice
9 | from textparser import ChoiceDict
10 | from textparser import ZeroOrMore
11 | from textparser import ZeroOrMoreDict
12 | from textparser import OneOrMore
13 | from textparser import OneOrMoreDict
14 | from textparser import DelimitedList
15 | from textparser import Token
16 | from textparser import TokenizeError
17 | from textparser import tokenize_init 18 | from textparser import Any 19 | from textparser import AnyUntil 20 | from textparser import Optional 21 | from textparser import Tag 22 | from textparser import Forward 23 | from textparser import NoMatch 24 | from textparser import Not 25 | from textparser import And 26 | from textparser import markup_line 27 | from textparser import replace_blocks 28 | 29 | 30 | def tokenize(items, add_eof_token=True): 31 | tokens = [] 32 | 33 | for item in items: 34 | if len(item) == 2: 35 | token = Token(*item, offset=1) 36 | else: 37 | token = Token(*item) 38 | 39 | tokens.append(token) 40 | 41 | if add_eof_token: 42 | tokens.append(Token('__EOF__', None, -1)) 43 | 44 | return tokens 45 | 46 | 47 | class TextParserTest(unittest.TestCase): 48 | 49 | def parse_and_assert_tree(self, grammar, datas): 50 | for tokens, expected_tree in datas: 51 | tree = grammar.parse(tokenize(tokens)) 52 | self.assertEqual(tree, expected_tree) 53 | 54 | def parse_and_assert_mismatch(self, grammar, datas): 55 | for tokens, line in datas: 56 | tokens = tokenize(tokens) 57 | 58 | with self.assertRaises(textparser.GrammarError) as cm: 59 | grammar.parse(tokens) 60 | 61 | self.assertEqual(cm.exception.offset, line) 62 | 63 | def test_grammar_sequence(self): 64 | grammar = Grammar(Sequence('NUMBER', 'WORD')) 65 | tokens = tokenize([ 66 | ('NUMBER', '1.45'), 67 | ('WORD', 'm') 68 | ]) 69 | tree = grammar.parse(tokens) 70 | self.assertEqual(tree, ['1.45', 'm']) 71 | 72 | def test_grammar_sequence_mismatch(self): 73 | grammar = Grammar(Sequence('NUMBER', 'WORD')) 74 | tokens = tokenize([('NUMBER', '1.45')]) 75 | 76 | with self.assertRaises(textparser.GrammarError) as cm: 77 | grammar.parse(tokens) 78 | 79 | self.assertEqual(cm.exception.offset, -1) 80 | 81 | def test_grammar_choice(self): 82 | grammar = Grammar(Choice('NUMBER', 'WORD')) 83 | 84 | datas = [ 85 | ( 86 | [('WORD', 'm')], 87 | 'm' 88 | ), 89 | ( 90 | [('NUMBER', '5')], 91 | '5' 92 | ) 93 | ] 94 | 95 | self.parse_and_assert_tree(grammar, datas) 96 | 97 | def test_grammar_choice_mismatch(self): 98 | grammar = Grammar(Choice(Sequence('NUMBER', 'WORD'), 99 | 'WORD')) 100 | 101 | datas = [ 102 | ([('NUMBER', '1', 5)], -1), 103 | ([('NUMBER', '1', 5), ('NUMBER', '2', 7)], 7) 104 | ] 105 | 106 | self.parse_and_assert_mismatch(grammar, datas) 107 | 108 | def test_grammar_choice_dict(self): 109 | number = Forward() 110 | number <<= Sequence('NUMBER') 111 | grammar = Grammar(ChoiceDict(number, 112 | Tag('foo', Sequence('WORD')), 113 | ChoiceDict('BAR'), 114 | 'FIE')) 115 | 116 | datas = [ 117 | ( 118 | [('WORD', 'm')], 119 | ('foo', ['m']) 120 | ), 121 | ( 122 | [('NUMBER', '5')], 123 | ['5'] 124 | ), 125 | ( 126 | [('BAR', 'foo')], 127 | 'foo' 128 | ), 129 | ( 130 | [('FIE', 'fum')], 131 | 'fum' 132 | ) 133 | ] 134 | 135 | self.parse_and_assert_tree(grammar, datas) 136 | 137 | def test_grammar_choice_dict_mismatch(self): 138 | grammar = Grammar(ChoiceDict(Sequence('NUMBER'), 139 | Sequence('WORD'))) 140 | tokens = tokenize([(',', ',', 3)]) 141 | 142 | with self.assertRaises(textparser.Error) as cm: 143 | grammar.parse(tokens) 144 | 145 | self.assertEqual(cm.exception.offset, 3) 146 | 147 | def test_grammar_choice_dict_init(self): 148 | datas = [ 149 | ( 150 | ('WORD', 'WORD'), 151 | "First token kind must be unique, but WORD isn't." 152 | ), 153 | ( 154 | ('WORD', Sequence('WORD')), 155 | "First token kind must be unique, but WORD isn't." 
156 |             ),
157 |             (
158 |                 (Sequence(Sequence(Optional('WORD'))), ),
159 |                 "Unsupported pattern type <class 'textparser.Optional'>."
160 |             )
161 |         ]
162 |
163 |         for grammar, message in datas:
164 |             with self.assertRaises(textparser.Error) as cm:
165 |                 ChoiceDict(*grammar)
166 |
167 |             self.assertEqual(str(cm.exception), message)
168 |
169 |     def test_grammar_delimited_list(self):
170 |         grammar = Grammar(Sequence(DelimitedList('WORD'), Optional('.')))
171 |
172 |         datas = [
173 |             (
174 |                 [('WORD', 'foo')],
175 |                 [['foo'], []]
176 |             ),
177 |             (
178 |                 [('WORD', 'foo'), (',', ','), ('WORD', 'bar')],
179 |                 [['foo', 'bar'], []]
180 |             ),
181 |             (
182 |                 [('WORD', 'foo'), (',', ','), ('WORD', 'bar'), ('.', '.')],
183 |                 [['foo', 'bar'], ['.']]
184 |             )
185 |         ]
186 |
187 |         self.parse_and_assert_tree(grammar, datas)
188 |
189 |     def test_grammar_delimited_list_mismatch(self):
190 |         grammar = Grammar(Sequence(DelimitedList('WORD'), Optional('.')))
191 |
192 |         datas = [
193 |             (
194 |                 [
195 |                     ('WORD', 'foo', 1),
196 |                     (',', ',', 2)
197 |                 ],
198 |                 2
199 |             ),
200 |             (
201 |                 [
202 |                     ('WORD', 'foo', 1),
203 |                     (',', ',', 2),
204 |                     ('WORD', 'foo', 3),
205 |                     (',', ',', 4),
206 |                     ('.', '.', 5)
207 |                 ],
208 |                 4
209 |             )
210 |         ]
211 |
212 |         self.parse_and_assert_mismatch(grammar, datas)
213 |
214 |     def test_grammar_zero_or_more(self):
215 |         grammar = Grammar(ZeroOrMore('WORD'))
216 |
217 |         datas = [
218 |             (
219 |                 [],
220 |                 []
221 |             ),
222 |             (
223 |                 [('WORD', 'foo')],
224 |                 ['foo']
225 |             ),
226 |             (
227 |                 [('WORD', 'foo'), ('WORD', 'bar')],
228 |                 ['foo', 'bar']
229 |             )
230 |         ]
231 |
232 |         self.parse_and_assert_tree(grammar, datas)
233 |
234 |     def test_grammar_zero_or_more_partial_element_match(self):
235 |         grammar = Grammar(Sequence(
236 |             ZeroOrMore(Sequence('WORD', 'NUMBER')), 'WORD'))
237 |
238 |         datas = [
239 |             (
240 |                 [
241 |                     ('WORD', 'foo'),
242 |                     ('NUMBER', '1'),
243 |                     ('WORD', 'bar'),
244 |                     ('NUMBER', '2'),
245 |                     ('WORD', 'fie')],
246 |                 [[['foo', '1'], ['bar', '2']], 'fie']
247 |             )
248 |         ]
249 |
250 |         self.parse_and_assert_tree(grammar, datas)
251 |
252 |     def test_grammar_zero_or_more_dict(self):
253 |         grammar = Grammar(ZeroOrMoreDict(Sequence('WORD', 'NUMBER')))
254 |
255 |         datas = [
256 |             (
257 |                 [],
258 |                 {}
259 |             ),
260 |             (
261 |                 [('WORD', 'foo'), ('NUMBER', '1'),
262 |                  ('WORD', 'bar'), ('NUMBER', '2'),
263 |                  ('WORD', 'foo'), ('NUMBER', '3')],
264 |                 {
265 |                     'foo': [['foo', '1'], ['foo', '3']],
266 |                     'bar': [['bar', '2']]
267 |                 }
268 |             )
269 |         ]
270 |
271 |         self.parse_and_assert_tree(grammar, datas)
272 |
273 |     def test_grammar_one_or_more(self):
274 |         grammar = Grammar(OneOrMore('WORD'))
275 |
276 |         datas = [
277 |             (
278 |                 [('WORD', 'foo')],
279 |                 ['foo']
280 |             ),
281 |             (
282 |                 [('WORD', 'foo'), ('WORD', 'bar')],
283 |                 ['foo', 'bar']
284 |             )
285 |         ]
286 |
287 |         self.parse_and_assert_tree(grammar, datas)
288 |
289 |     def test_grammar_one_or_more_mismatch(self):
290 |         grammar = Grammar(OneOrMore('WORD'))
291 |
292 |         datas = [
293 |             (
294 |                 []
295 |                 , -1
296 |             ),
297 |             (
298 |                 [('NUMBER', 'foo', 2)],
299 |                 2
300 |             )
301 |         ]
302 |
303 |         self.parse_and_assert_mismatch(grammar, datas)
304 |
305 |     def test_grammar_one_or_more_dict(self):
306 |         grammar = Grammar(OneOrMoreDict(Sequence('WORD', 'NUMBER')))
307 |
308 |         datas = [
309 |             (
310 |                 [('WORD', 'foo'), ('NUMBER', '1')],
311 |                 {
312 |                     'foo': [['foo', '1']]
313 |                 }
314 |             ),
315 |             (
316 |                 [('WORD', 'foo'), ('NUMBER', '1'),
317 |                  ('WORD', 'bar'), ('NUMBER', '2'),
318 |                  ('WORD', 'foo'), ('NUMBER', '3')],
319 |                 {
320 |                     'foo': [['foo', '1'], ['foo', '3']],
321 |                     'bar': [['bar', '2']]
322 |                 }
323 |             )
324 |
] 325 | 326 | self.parse_and_assert_tree(grammar, datas) 327 | 328 | def test_grammar_one_or_more_dict_mismatch(self): 329 | grammar = Grammar(OneOrMoreDict(Sequence('WORD', 'NUMBER'))) 330 | 331 | datas = [ 332 | ( 333 | [('WORD', 'foo', 5)], 334 | -1 335 | ), 336 | ( 337 | [ 338 | ('WORD', 'foo', 5), 339 | ('WORD', 'bar', 6) 340 | ], 341 | 6 342 | ), 343 | ( 344 | [ 345 | ('WORD', 'foo', 5), 346 | ('NUMBER', '4', 6), 347 | ('WORD', 'bar', 7), 348 | ('WORD', 'fie', 8) 349 | ], 350 | 8 351 | ) 352 | ] 353 | 354 | self.parse_and_assert_mismatch(grammar, datas) 355 | 356 | def test_grammar_any(self): 357 | grammar = Grammar(Any()) 358 | 359 | datas = [ 360 | ( 361 | [('A', r'a')], 362 | 'a' 363 | ), 364 | ( 365 | [('B', r'b')], 366 | 'b' 367 | ) 368 | ] 369 | 370 | self.parse_and_assert_tree(grammar, datas) 371 | 372 | def test_grammar_any_until(self): 373 | grammar = Grammar(Sequence(AnyUntil('STRING'), 'STRING')) 374 | 375 | datas = [ 376 | ( 377 | [('NUMBER', '1'), 378 | ('WORD', 'a'), 379 | ('STRING', '"b"')], 380 | [['1', 'a'], '"b"'] 381 | ) 382 | ] 383 | 384 | self.parse_and_assert_tree(grammar, datas) 385 | 386 | def test_grammar_any_until_sequence(self): 387 | grammar = Grammar(Sequence(AnyUntil(Sequence('WORD', 'STRING')), 388 | 'WORD', 389 | 'STRING')) 390 | 391 | datas = [ 392 | ( 393 | [('NUMBER', '1'), 394 | ('WORD', 'a'), 395 | ('WORD', 'b'), 396 | ('STRING', '"b"')], 397 | [['1', 'a'], 'b', '"b"'] 398 | ) 399 | ] 400 | 401 | self.parse_and_assert_tree(grammar, datas) 402 | 403 | def test_grammar_1(self): 404 | grammar = Grammar(Sequence( 405 | 'IF', 406 | choice(Sequence(choice('A', 'B'), 'STRING'), 407 | 'STRING'), 408 | 'WORD', 409 | choice( 410 | Sequence( 411 | choice(DelimitedList('STRING'), ZeroOrMore('NUMBER')), '.'), 412 | '.'))) 413 | 414 | datas = [ 415 | ( 416 | [ 417 | ('IF', 'IF'), 418 | ('STRING', 'foo'), 419 | ('WORD', 'bar'), 420 | ('.', '.') 421 | ], 422 | ['IF', 'foo', 'bar', [[], '.']] 423 | ), 424 | ( 425 | [ 426 | ('IF', 'IF'), 427 | ('STRING', 'foo'), 428 | ('WORD', 'bar'), 429 | ('NUMBER', '0'), 430 | ('NUMBER', '100'), 431 | ('.', '.') 432 | ], 433 | ['IF', 'foo', 'bar', [['0', '100'], '.']] 434 | ) 435 | ] 436 | 437 | self.parse_and_assert_tree(grammar, datas) 438 | 439 | def test_grammar_1_mismatch(self): 440 | grammar = Grammar(Sequence( 441 | 'IF', 442 | choice(Sequence(choice('A', 'B'), 'STRING'), 443 | 'STRING'), 444 | 'WORD', 445 | choice( 446 | Sequence( 447 | choice(DelimitedList('STRING'), ZeroOrMore('NUMBER')), '.'), 448 | '.'))) 449 | 450 | datas = [ 451 | ( 452 | [ 453 | ('IF', 'IF', 1), 454 | ('STRING', 'foo', 2), 455 | ('WORD', 'bar', 3), 456 | (',', ',', 4) 457 | ], 458 | 4 459 | ), 460 | ( 461 | [ 462 | ('IF', 'IF', 1), 463 | ('STRING', 'foo', 2), 464 | ('.', '.', 3) 465 | ], 466 | 3 467 | ), 468 | ( 469 | [ 470 | ('IF', 'IF', 1), 471 | ('NUMBER', '1', 2) 472 | ], 473 | 2 474 | ), 475 | ( 476 | [ 477 | ('IF', 'IF', 1), 478 | ('STRING', 'foo', 2), 479 | ('WORD', 'bar', 3), 480 | ('.', '.', 4), 481 | ('.', '.', 5) 482 | ], 483 | 5 484 | ) 485 | ] 486 | 487 | self.parse_and_assert_mismatch(grammar, datas) 488 | 489 | def test_grammar_forward(self): 490 | foo = Forward() 491 | foo <<= Sequence('FOO') 492 | grammar = Grammar(foo) 493 | 494 | datas = [ 495 | ( 496 | [('FOO', 'foo')], 497 | ['foo'] 498 | ) 499 | ] 500 | 501 | self.parse_and_assert_tree(grammar, datas) 502 | 503 | def test_grammar_forward_text(self): 504 | foo = Forward() 505 | foo <<= 'FOO' 506 | grammar = Grammar(foo) 507 | 508 | datas = [ 509 | ( 510 | [('FOO', 'foo')], 511 
| 'foo' 512 | ) 513 | ] 514 | 515 | self.parse_and_assert_tree(grammar, datas) 516 | 517 | def test_grammar_optional(self): 518 | grammar = Grammar(Sequence(Optional('WORD'), 519 | Optional('WORD'), 520 | Optional('NUMBER'))) 521 | 522 | datas = [ 523 | ( 524 | [], 525 | [[], [], []] 526 | ), 527 | ( 528 | [('WORD', 'a')], 529 | [['a'], [], []] 530 | ), 531 | ( 532 | [('NUMBER', 'c')], 533 | [[], [], ['c']] 534 | ), 535 | ( 536 | [('WORD', 'a'), ('NUMBER', 'c')], 537 | [['a'], [], ['c']] 538 | ), 539 | ( 540 | [('WORD', 'a'), ('WORD', 'b'), ('NUMBER', 'c')], 541 | [['a'], ['b'], ['c']] 542 | ) 543 | ] 544 | 545 | self.parse_and_assert_tree(grammar, datas) 546 | 547 | def test_grammar_tag(self): 548 | grammar = Grammar(Tag('a', 549 | Tag('b', 550 | choice(Tag('c', 'WORD'), 551 | Tag('d', Optional('NUMBER')))))) 552 | 553 | datas = [ 554 | ( 555 | [('WORD', 'bar')], 556 | ('a', ('b', ('c', 'bar'))) 557 | ), 558 | ( 559 | [('NUMBER', '1')], 560 | ('a', ('b', ('d', ['1']))) 561 | ), 562 | ( 563 | [], 564 | ('a', ('b', ('d', []))) 565 | ) 566 | ] 567 | 568 | self.parse_and_assert_tree(grammar, datas) 569 | 570 | def test_grammar_tag_mismatch(self): 571 | grammar = Grammar(Tag('a', 'WORD')) 572 | 573 | datas = [ 574 | ( 575 | [('NUMBER', 'bar')], 576 | 1 577 | ) 578 | ] 579 | 580 | self.parse_and_assert_mismatch(grammar, datas) 581 | 582 | def test_grammar_and(self): 583 | grammar = Grammar(Sequence(And('NUMBER'), 'NUMBER')) 584 | 585 | datas = [ 586 | ( 587 | [('NUMBER', '1')], 588 | [[], '1'] 589 | ) 590 | ] 591 | 592 | self.parse_and_assert_tree(grammar, datas) 593 | 594 | def test_grammar_and_mismatch(self): 595 | grammar = Grammar(Sequence(And('NUMBER'), 'NUMBER')) 596 | 597 | datas = [ 598 | ( 599 | [('WORD', 'foo', 3), ('NUMBER', '1', 4)], 600 | 3 601 | ) 602 | ] 603 | 604 | self.parse_and_assert_mismatch(grammar, datas) 605 | 606 | def test_grammar_not(self): 607 | grammar = Grammar(Sequence(Not('WORD'), 'NUMBER')) 608 | 609 | datas = [ 610 | ( 611 | [('NUMBER', '1')], 612 | [[], '1'] 613 | ) 614 | ] 615 | 616 | self.parse_and_assert_tree(grammar, datas) 617 | 618 | def test_grammar_not_mismatch(self): 619 | grammar = Grammar(Sequence(Not('WORD'), 'NUMBER')) 620 | 621 | datas = [ 622 | ( 623 | [('WORD', 'foo', 3), ('NUMBER', '1', 4)], 624 | 3 625 | ) 626 | ] 627 | 628 | self.parse_and_assert_mismatch(grammar, datas) 629 | 630 | def test_grammar_no_match(self): 631 | grammar = Grammar(NoMatch()) 632 | 633 | datas = [ 634 | ( 635 | [('NUMBER', '1', 3)], 636 | 3 637 | ), 638 | ( 639 | [('WORD', 'foo', 3)], 640 | 3 641 | ) 642 | ] 643 | 644 | self.parse_and_assert_mismatch(grammar, datas) 645 | 646 | def test_parse_start_and_end_of_file(self): 647 | class Parser(textparser.Parser): 648 | 649 | def grammar(self): 650 | return Sequence('__SOF__', '__EOF__') 651 | 652 | self.assertEqual(Parser().parse('', match_sof=True), 653 | ['__SOF__', '__EOF__']) 654 | 655 | def test_parse_start_of_file_mismatch(self): 656 | class Parser(textparser.Parser): 657 | 658 | def grammar(self): 659 | return Sequence('__EOF__') 660 | 661 | with self.assertRaises(textparser.ParseError) as cm: 662 | Parser().parse('123', match_sof=True) 663 | 664 | self.assertEqual(str(cm.exception), 665 | 'Invalid syntax at line 1, column 1: ">>!<<123"') 666 | 667 | def test_parse_end_of_file(self): 668 | class Parser(textparser.Parser): 669 | 670 | def grammar(self): 671 | return '__EOF__' 672 | 673 | self.assertEqual(Parser().parse('', match_sof=False), '__EOF__') 674 | 675 | def test_grammar_none(self): 676 | class 
AnyAsNone(textparser.Pattern):
677 |
678 |             def match(self, tokens):
679 |                 tokens.get_value()
680 |
681 |                 return None
682 |
683 |         grammar = Grammar(AnyAsNone())
684 |
685 |         datas = [
686 |             (
687 |                 [('NUMBER', '1')],
688 |                 None
689 |             )
690 |         ]
691 |
692 |         self.parse_and_assert_tree(grammar, datas)
693 |
694 |     def test_grammar_error(self):
695 |         grammar = Grammar(NoMatch())
696 |
697 |         datas = [
698 |             [('NUMBER', '1', 3)],
699 |             [('WORD', 'foo', 3)]
700 |         ]
701 |
702 |         for tokens in datas:
703 |             tokens = tokenize(tokens)
704 |
705 |             with self.assertRaises(textparser.GrammarError) as cm:
706 |                 grammar.parse(tokens)
707 |
708 |             self.assertEqual(cm.exception.offset, 3)
709 |             self.assertEqual(str(cm.exception),
710 |                              'Invalid syntax at offset 3.')
711 |
712 |     def test_tokenize_error(self):
713 |         datas = [
714 |             (2, 'hej', 'Invalid syntax at line 1, column 3: "he>>!<<j"'),
717 |             (2, 'a\nb\n', 'Invalid syntax at line 2, column 1: ">>!<<b"')
718 |         ]
719 |
720 |         for offset, text, message in datas:
721 |             with self.assertRaises(TokenizeError) as cm:
722 |                 raise TokenizeError(text, offset)
723 |
724 |             self.assertEqual(str(cm.exception), message)
725 |
728 |     def test_tokenize_init(self):
729 |         datas = [
730 |             (
731 |                 [('A', r'a')],
732 |                 '(?P<A>a)'
733 |             ),
734 |             (
735 |                 [('A', r'b'), ('C', r'd')],
736 |                 '(?P<A>b)|(?P<C>d)'
737 |             )
738 |         ]
739 |
740 |         for spec, expected_re_token in datas:
741 |             tokens, re_token = tokenize_init(spec)
742 |             self.assertEqual(tokens,
743 |                              [Token(kind='__SOF__', value='__SOF__', offset=0)])
744 |             self.assertEqual(re_token, expected_re_token)
745 |
746 |     def test_parser(self):
747 |         class Parser(textparser.Parser):
748 |
749 |             def keywords(self):
750 |                 return set([
751 |                     'IF',
752 |                     'A',
753 |                     'B'
754 |                 ])
755 |
756 |             def token_specs(self):
757 |                 return [
758 |                     ('SKIP', r'[ \r\n\t]+'),
759 |                     ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'),
760 |                     ('DOT', '.', r'\.'),
761 |                     ('WORD', r'[A-Za-z0-9_]+'),
762 |                     ('ESCAPED_STRING', r'"(\\"|[^"])*?"'),
763 |                     ('MISMATCH', r'.')
764 |                 ]
765 |
766 |             def grammar(self):
767 |                 return Sequence(
768 |                     'IF',
769 |                     Optional(choice('A', 'B')),
770 |                     'ESCAPED_STRING',
771 |                     'WORD',
772 |                     Optional(choice(DelimitedList('ESCAPED_STRING'),
773 |                                     ZeroOrMore('NUMBER'))),
774 |                     '.')
775 |
776 |         datas = [
777 |             (
778 |                 'IF "foo" bar .',
779 |                 ['IF', [], '"foo"', 'bar', [[]], '.'],
780 |                 [
781 |                     Token(kind='IF', value='IF', offset=0),
782 |                     [],
783 |                     Token(kind='ESCAPED_STRING', value='"foo"', offset=3),
784 |                     Token(kind='WORD', value='bar', offset=9),
785 |                     [[]],
786 |                     Token(kind='.', value='.', offset=13)
787 |                 ]
788 |             ),
789 |             (
790 |                 'IF B "" b 1 2 .',
791 |                 ['IF', ['B'], '""', 'b', [['1', '2']], '.'],
792 |                 [
793 |                     Token(kind='IF', value='IF', offset=0),
794 |                     [
795 |                         Token(kind='B', value='B', offset=3)
796 |                     ],
797 |                     Token(kind='ESCAPED_STRING', value='""', offset=5),
798 |                     Token(kind='WORD', value='b', offset=8),
799 |                     [
800 |                         [
801 |                             Token(kind='NUMBER', value='1', offset=10),
802 |                             Token(kind='NUMBER', value='2', offset=12)
803 |                         ]
804 |                     ],
805 |                     Token(kind='.', value='.', offset=14)
806 |                 ]
807 |             )
808 |         ]
809 |
810 |         for text, expected_tree, expected_token_tree in datas:
811 |             tree = Parser().parse(text)
812 |             self.assertEqual(tree, expected_tree)
813 |             tree = Parser().parse(text, token_tree=True)
814 |             self.assertEqual(tree, expected_token_tree)
815 |
816 |     def test_parser_default_keywords(self):
817 |         class Parser(textparser.Parser):
818 |
819 |             def token_specs(self):
820 |                 return [
821 |                     ('SKIP', r'[ \r\n\t]+'),
822 |                     ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'),
823 |                     ('DOT', '.', r'\.'),
824 |                     ('WORD', r'[A-Za-z0-9_]+'),
825 |                     ('ESCAPED_STRING', r'"(\\"|[^"])*?"'),
826 |                     ('MISMATCH', r'.')
827 |                 ]
828 |
829 |             def grammar(self):
830 |                 return Sequence(
831 |                     'WORD',
832 |                     Optional('WORD'),
833 |                     'ESCAPED_STRING',
834 |                     'WORD',
835 |
Optional(choice(DelimitedList('ESCAPED_STRING'),
836 |                                     ZeroOrMore('NUMBER'))),
837 |                     '.')
838 |
839 |         datas = [
840 |             (
841 |                 'IF "foo" bar .',
842 |                 ['IF', [], '"foo"', 'bar', [[]], '.'],
843 |                 [
844 |                     Token(kind='WORD', value='IF', offset=0),
845 |                     [],
846 |                     Token(kind='ESCAPED_STRING', value='"foo"', offset=3),
847 |                     Token(kind='WORD', value='bar', offset=9),
848 |                     [[]],
849 |                     Token(kind='.', value='.', offset=13)
850 |                 ]
851 |             ),
852 |             (
853 |                 'IF B "" b 1 2 .',
854 |                 ['IF', ['B'], '""', 'b', [['1', '2']], '.'],
855 |                 [
856 |                     Token(kind='WORD', value='IF', offset=0),
857 |                     [
858 |                         Token(kind='WORD', value='B', offset=3)
859 |                     ],
860 |                     Token(kind='ESCAPED_STRING', value='""', offset=5),
861 |                     Token(kind='WORD', value='b', offset=8),
862 |                     [
863 |                         [
864 |                             Token(kind='NUMBER', value='1', offset=10),
865 |                             Token(kind='NUMBER', value='2', offset=12)
866 |                         ]
867 |                     ],
868 |                     Token(kind='.', value='.', offset=14)
869 |                 ]
870 |             )
871 |         ]
872 |
873 |         for text, expected_tree, expected_token_tree in datas:
874 |             tree = Parser().parse(text)
875 |             self.assertEqual(tree, expected_tree)
876 |             tree = Parser().parse(text, token_tree=True)
877 |             self.assertEqual(tree, expected_token_tree)
878 |
879 |     def test_parser_bare(self):
880 |         class Parser(textparser.Parser):
881 |
882 |             pass
883 |
884 |         with self.assertRaises(NotImplementedError) as cm:
885 |             Parser().parse('foo')
886 |
887 |         self.assertEqual(str(cm.exception), 'No grammar defined.')
888 |
889 |     def test_parser_default_token_specs(self):
890 |         class Parser(textparser.Parser):
891 |
892 |             def grammar(self):
893 |                 return 'WORD'
894 |
895 |         tree = Parser().parse('foo')
896 |         self.assertEqual(tree, 'foo')
897 |
898 |     def test_parser_tokenize_mismatch(self):
899 |         class Parser(textparser.Parser):
900 |
901 |             def token_specs(self):
902 |                 return [
903 |                     ('SKIP', r'[ \r\n\t]+'),
904 |                     ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'),
905 |                     ('MISMATCH', r'.')
906 |                 ]
907 |
908 |             def grammar(self):
909 |                 return Grammar('NUMBER')
910 |
911 |         with self.assertRaises(textparser.ParseError) as cm:
912 |             Parser().parse('12\n34foo\n789')
913 |
914 |         self.assertEqual(cm.exception.offset, 5)
915 |         self.assertEqual(cm.exception.line, 2)
916 |         self.assertEqual(cm.exception.column, 3)
917 |         self.assertEqual(str(cm.exception),
918 |                          'Invalid syntax at line 2, column 3: "34>>!<<foo"')
940 |
941 |     def test_parser_grammar_mismatch_choice_max(self):
942 |         class Parser(textparser.Parser):
943 |
944 |             def __init__(self, tokens):
945 |                 self._tokens = tokens
946 |
947 |             def tokenize(self, _text):
948 |                 return tokenize(self._tokens, add_eof_token=False)
949 |
950 |             def grammar(self):
951 |                 return Choice(Sequence('NUMBER', 'WORD'),
952 |                               'WORD')
953 |
954 |         Data = namedtuple('Data',
955 |                           [
956 |                               'text',
957 |                               'tokens',
958 |                               'offset',
959 |                               'line',
960 |                               'column',
961 |                               'message',
962 |                           ])
963 |
964 |         datas = [
965 |             Data(
966 |                 text='1.45',
967 |                 tokens=[
968 |                     ('NUMBER', '1.45', 0)
969 |                 ],
970 |                 offset=4,
971 |                 line=1,
972 |                 column=5,
973 |                 message='Invalid syntax at line 1, column 5: "1.45>>!<<"'
974 |             ),
975 |             Data(
976 |                 text='1.45 2',
977 |                 tokens=[
978 |                     ('NUMBER', '1.45', 0),
979 |                     ('NUMBER', '2', 5)
980 |                 ],
981 |                 offset=5,
982 |                 line=1,
983 |                 column=6,
984 |                 message='Invalid syntax at line 1, column 6: "1.45 >>!<<2"'
985 |             )
986 |         ]
987 |
988 |         for text, tokens, offset, line, column, message in datas:
989 |             with self.assertRaises(textparser.ParseError) as cm:
990 |                 Parser(tokens).parse(text)
991 |
992 |             self.assertEqual(cm.exception.offset, offset)
993 |
self.assertEqual(cm.exception.line, line) 994 | self.assertEqual(cm.exception.column, column) 995 | self.assertEqual(str(cm.exception), message) 996 | 997 | def test_parse_error(self): 998 | class Parser(textparser.Parser): 999 | 1000 | def tokenize(self, text): 1001 | raise TokenizeError(text, 5) 1002 | 1003 | def grammar(self): 1004 | return Grammar(Sequence('NUMBER', 'WORD')) 1005 | 1006 | with self.assertRaises(textparser.ParseError) as cm: 1007 | Parser().parse('12\n3456\n789') 1008 | 1009 | self.assertEqual(cm.exception.text, '12\n3456\n789') 1010 | self.assertEqual(cm.exception.offset, 5) 1011 | self.assertEqual(cm.exception.line, 2) 1012 | self.assertEqual(cm.exception.column, 3) 1013 | self.assertEqual(str(cm.exception), 1014 | 'Invalid syntax at line 2, column 3: "34>>!<<56"') 1015 | 1016 | def test_markup_line(self): 1017 | datas = [ 1018 | (0, '>>!<<0', None), 1019 | (1, '0>>!<<', None), 1020 | (2, '>>!<<1234', None), 1021 | (4, '12>>!<<34', None), 1022 | (6, '1234>>!<<', None), 1023 | (7, '>>!<<56', None), 1024 | (8, '5>>!<<6', None), 1025 | (9, '56>>!<<', None), 1026 | (3, '1x234', 'x') 1027 | ] 1028 | 1029 | for offset, line, marker in datas: 1030 | if marker is None: 1031 | text = markup_line('0\n1234\n56', offset) 1032 | else: 1033 | text = markup_line('0\n1234\n56', 1034 | offset, 1035 | marker=marker) 1036 | 1037 | self.assertEqual(text, line) 1038 | 1039 | def test_replace_blocks(self): 1040 | datas = [ 1041 | ('{}', '{}'), 1042 | ('{{}}', '{ }'), 1043 | ('{{\n} xxx {}}', '{ \n }'), 1044 | ('1{a\n}2{b}3', '1{ \n}2{ }3') 1045 | ] 1046 | 1047 | for old, expected in datas: 1048 | new = replace_blocks(old) 1049 | self.assertEqual(new, expected) 1050 | 1051 | def test_replace_blocks_start_end(self): 1052 | datas = [ 1053 | ('1[a]2[b]3', '1[ ]2[ ]3', '[', ']'), 1054 | ('1{a}2{b}3', '1{ }2{ }3', '{', '}'), 1055 | ('1(a)2(b)3', '1( )2( )3', '(', ')'), 1056 | ('1((a))2((b))3', '1(( ))2(( ))3', '((', '))') 1057 | ] 1058 | 1059 | for old, expected, start, end in datas: 1060 | new = replace_blocks(old, start, end) 1061 | self.assertEqual(new, expected) 1062 | 1063 | def test_any_zero_or_more(self): 1064 | class Parser(textparser.Parser): 1065 | 1066 | def keywords(self): 1067 | return ['interesting_group'] 1068 | 1069 | def token_specs(self): 1070 | return [ 1071 | ('SKIP', r'[ \r\n\t]+'), 1072 | ('WORD', r'[A-Za-z0-9_]+'), 1073 | ('SEMICOLON', ';', r';'), 1074 | ('BRACE_OPEN', '{', r'\{'), 1075 | ('BRACE_CLOSE', '}', r'\}'), 1076 | ('EQUAL', '=', r'='), 1077 | ] 1078 | 1079 | def grammar(self): 1080 | interesting_group = textparser.Sequence( 1081 | 'interesting_group', '{', 1082 | ZeroOrMore(Sequence('WORD', '=', 'WORD', ';')), 1083 | '}', 1084 | ';') 1085 | 1086 | return Sequence(AnyUntil('interesting_group'), 1087 | interesting_group, 1088 | ZeroOrMore(Any())) 1089 | 1090 | 1091 | text = ''' 1092 | some_group { 1093 | foo bar; foo bar; 1094 | }; 1095 | 1096 | interesting_group { 1097 | a = 1; 1098 | b = 2; 1099 | }; 1100 | 1101 | another_group { 1102 | foo bar 1103 | }; 1104 | ''' 1105 | 1106 | tree = Parser().parse(text) 1107 | self.assertEqual(tree[1], 1108 | [ 1109 | 'interesting_group', 1110 | '{', 1111 | [ 1112 | ['a', '=', '1', ';'], 1113 | ['b', '=', '2', ';'] 1114 | ], 1115 | '}', 1116 | ';']) 1117 | 1118 | 1119 | if __name__ == '__main__': 1120 | unittest.main() 1121 | -------------------------------------------------------------------------------- /textparser.py: -------------------------------------------------------------------------------- 1 | # A text parser. 
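#
# Minimal usage sketch, condensed from examples/hello_world.py in this
# repository (see that file for the runnable version):
#
#     import textparser
#     from textparser import Sequence
#
#     class Parser(textparser.Parser):
#
#         def token_specs(self):
#             return [
#                 ('SKIP', r'[ \r\n\t]+'),
#                 ('WORD', r'\w+'),
#                 ('EMARK', '!', r'!'),
#                 ('COMMA', ',', r','),
#                 ('MISMATCH', r'.')
#             ]
#
#         def grammar(self):
#             return Sequence('WORD', ',', 'WORD', '!')
#
#     Parser().parse('Hello, World!')  # -> ['Hello', ',', 'World', '!']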
2 | 3 | import re 4 | from collections import namedtuple 5 | from operator import itemgetter 6 | 7 | 8 | __author__ = 'Erik Moqvist' 9 | __version__ = '0.24.0' 10 | 11 | 12 | class _Mismatch(object): 13 | pass 14 | 15 | 16 | MISMATCH = _Mismatch() 17 | """Returned by :func:`~textparser.Pattern.match()` on mismatch. 18 | 19 | """ 20 | 21 | 22 | class _String(object): 23 | """Matches a specific token kind. 24 | 25 | """ 26 | 27 | def __init__(self, kind): 28 | self.kind = kind 29 | 30 | def match(self, tokens): 31 | if self.kind == tokens.peek().kind: 32 | return tokens.get_value() 33 | else: 34 | return MISMATCH 35 | 36 | 37 | class _Tokens(object): 38 | 39 | def __init__(self, tokens): 40 | self._tokens = tokens 41 | self._pos = 0 42 | self._max_pos = -1 43 | self._stack = [] 44 | 45 | def get_value(self): 46 | pos = self._pos 47 | self._pos += 1 48 | 49 | return self._tokens[pos] 50 | 51 | def peek(self): 52 | return self._tokens[self._pos] 53 | 54 | def peek_max(self): 55 | pos = self._pos 56 | 57 | if self._max_pos > pos: 58 | pos = self._max_pos 59 | 60 | if pos >= len(self._tokens): 61 | return self._tokens[-1] 62 | else: 63 | return self._tokens[pos] 64 | 65 | def save(self): 66 | self._stack.append(self._pos) 67 | 68 | def restore(self): 69 | self._pos = self._stack.pop() 70 | 71 | def update(self): 72 | self._stack[-1] = self._pos 73 | 74 | def mark_max_restore(self): 75 | if self._pos > self._max_pos: 76 | self._max_pos = self._pos 77 | 78 | self._pos = self._stack.pop() 79 | 80 | def mark_max_load(self): 81 | if self._pos > self._max_pos: 82 | self._max_pos = self._pos 83 | 84 | self._pos = self._stack[-1] 85 | 86 | def drop(self): 87 | self._stack.pop() 88 | 89 | def __repr__(self): 90 | return str(self._tokens[self._pos:self._pos + 2]) 91 | 92 | 93 | class _StringTokens(_Tokens): 94 | 95 | def get_value(self): 96 | pos = self._pos 97 | self._pos += 1 98 | 99 | return self._tokens[pos].value 100 | 101 | 102 | def _wrap_string(item): 103 | if isinstance(item, str): 104 | item = _String(item) 105 | 106 | return item 107 | 108 | 109 | def _wrap_strings(items): 110 | return [_wrap_string(item) for item in items] 111 | 112 | 113 | def _format_invalid_syntax(text, offset): 114 | return 'Invalid syntax at line {}, column {}: "{}"'.format( 115 | line(text, offset), 116 | column(text, offset), 117 | markup_line(text, offset)) 118 | 119 | 120 | class Error(Exception): 121 | """General textparser exception. 122 | 123 | """ 124 | 125 | pass 126 | 127 | 128 | class TokenizeError(Error): 129 | """This exception is raised when the text cannot be converted into 130 | tokens. 131 | 132 | """ 133 | 134 | def __init__(self, text, offset): 135 | self._text = text 136 | self._offset = offset 137 | message = _format_invalid_syntax(text, offset) 138 | super(TokenizeError, self).__init__(message) 139 | 140 | @property 141 | def text(self): 142 | """The input text to the tokenizer. 143 | 144 | """ 145 | 146 | return self._text 147 | 148 | @property 149 | def offset(self): 150 | """Offset into the text where the tokenizer failed. 151 | 152 | """ 153 | 154 | return self._offset 155 | 156 | 157 | class GrammarError(Error): 158 | """This exception is raised when the tokens cannot be converted into a 159 | parse tree. 
160 |
161 |     """
162 |
163 |     def __init__(self, offset):
164 |         self._offset = offset
165 |         message = 'Invalid syntax at offset {}.'.format(offset)
166 |         super(GrammarError, self).__init__(message)
167 |
168 |     @property
169 |     def offset(self):
170 |         """Offset into the text where the parser failed.
171 |
172 |         """
173 |
174 |         return self._offset
175 |
176 |
177 | class ParseError(Error):
178 |     """This exception is raised when the parser fails to parse the text.
179 |
180 |     """
181 |
182 |     def __init__(self, text, offset):
183 |         self._text = text
184 |         self._offset = offset
185 |         self._line = line(text, offset)
186 |         self._column = column(text, offset)
187 |         message = _format_invalid_syntax(text, offset)
188 |         super(ParseError, self).__init__(message)
189 |
190 |     @property
191 |     def text(self):
192 |         """The input text to the parser.
193 |
194 |         """
195 |
196 |         return self._text
197 |
198 |     @property
199 |     def offset(self):
200 |         """Offset into the text where the parser failed.
201 |
202 |         """
203 |
204 |         return self._offset
205 |
206 |     @property
207 |     def line(self):
208 |         """Line where the parser failed.
209 |
210 |         """
211 |
212 |         return self._line
213 |
214 |     @property
215 |     def column(self):
216 |         """Column where the parser failed.
217 |
218 |         """
219 |
220 |         return self._column
221 |
222 |
223 | Token = namedtuple('Token', ['kind', 'value', 'offset'])
224 |
225 |
226 | class Pattern(object):
227 |     """Base class of all patterns.
228 |
229 |     """
230 |
231 |     def match(self, tokens):
232 |         """Returns :data:`~textparser.MISMATCH` on mismatch, and anything else
233 |         on match.
234 |
235 |         """
236 |
237 |         raise NotImplementedError('To be implemented by subclasses.')
238 |
239 |
240 | class Sequence(Pattern):
241 |     """Matches a sequence of patterns. Becomes a list in the parse tree.
242 |
243 |     """
244 |
245 |     def __init__(self, *patterns):
246 |         self.patterns = _wrap_strings(patterns)
247 |
248 |     def match(self, tokens):
249 |         matched = []
250 |
251 |         for pattern in self.patterns:
252 |             mo = pattern.match(tokens)
253 |
254 |             if mo is MISMATCH:
255 |                 return MISMATCH
256 |
257 |             matched.append(mo)
258 |
259 |         return matched
260 |
261 |
262 | class Choice(Pattern):
263 |     """Matches any of the given ordered patterns `patterns`. The first
264 |     pattern in the list has the highest priority, and the last the lowest.
265 |
266 |     """
267 |
268 |     def __init__(self, *patterns):
269 |         self._patterns = _wrap_strings(patterns)
270 |
271 |     def match(self, tokens):
272 |         tokens.save()
273 |
274 |         for pattern in self._patterns:
275 |             tokens.mark_max_load()
276 |             mo = pattern.match(tokens)
277 |
278 |             if mo is not MISMATCH:
279 |                 tokens.drop()
280 |
281 |                 return mo
282 |
283 |         tokens.restore()
284 |
285 |         return MISMATCH
286 |
287 |
288 | class ChoiceDict(Pattern):
289 |     """Matches any of the given patterns. The first token kind of all
290 |     patterns must be unique, otherwise an :class:`~textparser.Error`
291 |     exception is raised.
292 |
293 |     This class is faster than :class:`~textparser.Choice`, and should
294 |     be used if the grammar allows it.
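
    For example, ChoiceDict(Sequence('IF', body), Sequence('WHILE',
    body)) dispatches directly on the kind of the next token ('IF' or
    'WHILE') instead of trying each alternative in turn ('IF', 'WHILE'
    and body are made-up names for illustration).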

    """

    def __init__(self, *patterns):
        self._patterns_map = {}
        patterns = _wrap_strings(patterns)

        for pattern in patterns:
            self._check_pattern(pattern, pattern)

    @property
    def patterns_map(self):
        return self._patterns_map

    def _check_pattern(self, inner, outer):
        if isinstance(inner, _String):
            self._add_pattern(inner.kind, outer)
        elif isinstance(inner, Sequence):
            self._check_pattern(inner.patterns[0], outer)
        elif isinstance(inner, (Tag, Forward)):
            self._check_pattern(inner.pattern, outer)
        elif isinstance(inner, ChoiceDict):
            for pattern in inner.patterns_map.values():
                self._check_pattern(pattern, outer)
        else:
            raise Error(
                'Unsupported pattern type {}.'.format(type(inner)))

    def _add_pattern(self, kind, pattern):
        if kind in self._patterns_map:
            raise Error(
                "First token kind must be unique, but {} isn't.".format(
                    kind))

        self._patterns_map[kind] = pattern

    def match(self, tokens):
        kind = tokens.peek().kind

        if kind in self._patterns_map:
            return self._patterns_map[kind].match(tokens)
        else:
            return MISMATCH


class Repeated(Pattern):
    """Matches `pattern` at least `minimum` times. Any match becomes a
    list in the parse tree.

    """

    def __init__(self, pattern, minimum=0):
        self._pattern = _wrap_string(pattern)
        self._minimum = minimum

    def match(self, tokens):
        matched = []
        tokens.save()

        while True:
            mo = self._pattern.match(tokens)

            if mo is MISMATCH:
                tokens.mark_max_restore()
                break

            matched.append(mo)
            tokens.update()

        if len(matched) >= self._minimum:
            return matched
        else:
            return MISMATCH


class RepeatedDict(Repeated):
    """Same as :class:`~textparser.Repeated`, but becomes a dictionary
    instead of a list in the parse tree.

    `key` is a function taking the match as input and returning the
    dictionary key. By default the first element in the match is used
    as the key.

    """

    def __init__(self, pattern, minimum=0, key=None):
        super(RepeatedDict, self).__init__(pattern, minimum)

        if key is None:
            key = itemgetter(0)

        self._key = key

    def match(self, tokens):
        matched = {}
        tokens.save()

        while True:
            mo = self._pattern.match(tokens)

            if mo is MISMATCH:
                tokens.mark_max_restore()
                break

            key = self._key(mo)

            try:
                matched[key].append(mo)
            except KeyError:
                matched[key] = [mo]

            tokens.update()

        if len(matched) >= self._minimum:
            return matched
        else:
            return MISMATCH


class ZeroOrMore(Repeated):
    """Matches `pattern` zero or more times.

    See :class:`~textparser.Repeated` for more details.

    """

    def __init__(self, pattern):
        super(ZeroOrMore, self).__init__(pattern, 0)


class ZeroOrMoreDict(RepeatedDict):
    """Matches `pattern` zero or more times.

    See :class:`~textparser.RepeatedDict` for more details.
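
    As a minimal sketch, assuming the default token specifications
    from :func:`~textparser.Parser.token_specs` (which provide the
    ``WORD`` and ``NUMBER`` kinds), all matches sharing a key are
    collected in a list under that key:

    .. code-block:: python

       >>> from textparser import Parser, Sequence, ZeroOrMoreDict
       >>> class KeyValueParser(Parser):
       ...     def grammar(self):
       ...         return ZeroOrMoreDict(Sequence('WORD', 'NUMBER'))
       >>> KeyValueParser().parse('x 1 y 2 x 3')
       {'x': [['x', '1'], ['x', '3']], 'y': [['y', '2']]}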

    """

    def __init__(self, pattern, key=None):
        super(ZeroOrMoreDict, self).__init__(pattern, 0, key)


class OneOrMore(Repeated):
    """Matches `pattern` one or more times.

    See :class:`~textparser.Repeated` for more details.

    """

    def __init__(self, pattern):
        super(OneOrMore, self).__init__(pattern, 1)


class OneOrMoreDict(RepeatedDict):
    """Matches `pattern` one or more times.

    See :class:`~textparser.RepeatedDict` for more details.

    """

    def __init__(self, pattern, key=None):
        super(OneOrMoreDict, self).__init__(pattern, 1, key)


class DelimitedList(Pattern):
    """Matches a delimited list of `pattern` separated by `delim`.
    `pattern` must be matched at least once. Any match becomes a list
    in the parse tree, excluding the delimiters.

    """

    def __init__(self, pattern, delim=','):
        self._pattern = _wrap_string(pattern)
        self._delim = _wrap_string(delim)

    def match(self, tokens):
        # First pattern.
        mo = self._pattern.match(tokens)

        if mo is MISMATCH:
            return MISMATCH

        matched = [mo]
        tokens.save()

        while True:
            # Discard the delimiter.
            mo = self._delim.match(tokens)

            if mo is MISMATCH:
                break

            # Pattern.
            mo = self._pattern.match(tokens)

            if mo is MISMATCH:
                break

            matched.append(mo)
            tokens.update()

        tokens.restore()

        return matched


class Optional(Pattern):
    """Matches `pattern` zero or one time. Becomes a list in the parse
    tree, empty on mismatch.

    """

    def __init__(self, pattern):
        self._pattern = _wrap_string(pattern)

    def match(self, tokens):
        tokens.save()
        mo = self._pattern.match(tokens)

        if mo is MISMATCH:
            tokens.mark_max_restore()

            return []
        else:
            tokens.drop()

            return [mo]


class Any(Pattern):
    """Matches any token.

    """

    def match(self, tokens):
        if tokens.peek().kind == '__EOF__':
            return MISMATCH
        else:
            return tokens.get_value()


class AnyUntil(Pattern):
    """Matches any token until the given pattern is found. Becomes a list
    in the parse tree, not including the match of the given pattern.

    """

    def __init__(self, pattern):
        self._pattern = _wrap_string(pattern)

    def match(self, tokens):
        matched = []

        while True:
            tokens.save()
            mo = self._pattern.match(tokens)

            if mo is not MISMATCH:
                break

            tokens.restore()
            matched.append(tokens.get_value())

        tokens.restore()

        return matched


class And(Pattern):
    """Matches `pattern`, without consuming any tokens. Any match becomes
    an empty list in the parse tree.

    """

    def __init__(self, pattern):
        self._pattern = _wrap_string(pattern)

    def match(self, tokens):
        tokens.save()
        mo = self._pattern.match(tokens)
        tokens.restore()

        if mo is MISMATCH:
            return MISMATCH
        else:
            return []


class Not(Pattern):
    """Matches if `pattern` does not match. Any match becomes an empty
    list in the parse tree.
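
    For example, the following pattern matches any single token except
    a comma, assuming a hypothetical ``COMMA`` token kind:

    .. code-block:: python

       >>> anything_but_comma = Sequence(Not('COMMA'), Any())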

    Just like :class:`~textparser.And`, no tokens are consumed.

    """

    def __init__(self, pattern):
        self._pattern = _wrap_string(pattern)

    def match(self, tokens):
        tokens.save()
        mo = self._pattern.match(tokens)
        tokens.restore()

        if mo is MISMATCH:
            return []
        else:
            return MISMATCH


class NoMatch(Pattern):
    """Never matches anything.

    """

    def match(self, tokens):
        return MISMATCH


class Tag(Pattern):
    """Tags any matched `pattern` with name `name`. Becomes a two-tuple of
    `name` and the match in the parse tree.

    """

    def __init__(self, name, pattern):
        self._name = name
        self._pattern = _wrap_string(pattern)

    @property
    def pattern(self):
        return self._pattern

    def match(self, tokens):
        mo = self._pattern.match(tokens)

        if mo is not MISMATCH:
            return (self._name, mo)
        else:
            return MISMATCH


class Forward(Pattern):
    """Forward declaration of a pattern, for example to express recursive
    grammars. Assign the declared pattern with ``<<=``.

    .. code-block:: python

       >>> foo = Forward()
       >>> foo <<= Sequence('NUMBER')

    """

    def __init__(self):
        self._pattern = None

    @property
    def pattern(self):
        return self._pattern

    def __ilshift__(self, other):
        self._pattern = _wrap_string(other)

        return self

    def match(self, tokens):
        return self._pattern.match(tokens)


class Grammar(object):
    """Creates a parse tree of given tokens using the grammar `grammar`.

    """

    def __init__(self, grammar):
        if isinstance(grammar, str):
            grammar = _wrap_string(grammar)

        self._root = grammar

    def parse(self, tokens, token_tree=False):
        if token_tree:
            tokens = _Tokens(tokens)
        else:
            tokens = _StringTokens(tokens)

        parsed = self._root.match(tokens)

        if parsed is not MISMATCH and tokens.peek_max().kind == '__EOF__':
            return parsed
        else:
            raise GrammarError(tokens.peek_max().offset)


def choice(*patterns):
    """Returns an instance of the fastest choice class for the given
    patterns `patterns`. It is recommended to use this function
    instead of instantiating :class:`~textparser.Choice` or
    :class:`~textparser.ChoiceDict` directly.

    """

    try:
        return ChoiceDict(*patterns)
    except Error:
        return Choice(*patterns)


def markup_line(text, offset, marker='>>!<<'):
    """Insert `marker` at `offset` into `text`, and return the marked
    line.

    .. code-block:: python

       >>> markup_line('0\\n1234\\n56', 3)
       '1>>!<<234'

    """

    begin = text.rfind('\n', 0, offset)
    begin += 1

    end = text.find('\n', offset)

    if end == -1:
        end = len(text)

    return text[begin:offset] + marker + text[offset:end]


def line(text, offset):
    """Returns the one based line number at `offset` into `text`."""

    return text[:offset].count('\n') + 1


def column(text, offset):
    """Returns the one based column number at `offset` into `text`."""

    line_start = text.rfind('\n', 0, offset)

    return offset - line_start


def tokenize_init(spec):
    """Initialize a tokenizer. Should only be called by the
    :func:`~textparser.Parser.tokenize` method in the parser.
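
    Returns a token list seeded with a start of file (``__SOF__``)
    token, and a regular expression string with one named group per
    token specification, suitable for :func:`re.finditer`:

    .. code-block:: python

       >>> tokens, re_token = tokenize_init([('NUMBER', r'\\d+')])
       >>> tokens
       [Token(kind='__SOF__', value='__SOF__', offset=0)]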

    """

    tokens = [Token('__SOF__', '__SOF__', 0)]
    re_token = '|'.join([
        '(?P<{}>{})'.format(name, regex) for name, regex in spec
    ])

    return tokens, re_token


class Parser(object):
    """The abstract base class of all text parsers.

    .. code-block:: python

       >>> from textparser import Parser, Sequence
       >>> class MyParser(Parser):
       ...     def token_specs(self):
       ...         return [
       ...             ('SKIP', r'[ \\r\\n\\t]+'),
       ...             ('WORD', r'\\w+'),
       ...             ('EMARK', '!', r'!'),
       ...             ('COMMA', ',', r','),
       ...             ('MISMATCH', r'.')
       ...         ]
       ...     def grammar(self):
       ...         return Sequence('WORD', ',', 'WORD', '!')

    """

    def _unpack_token_specs(self):
        names = {}
        specs = []

        for spec in self.token_specs():
            if len(spec) == 2:
                specs.append(spec)
            else:
                specs.append((spec[0], spec[2]))
                names[spec[0]] = spec[1]

        return names, specs

    def keywords(self):
        """A set of keywords in the text. Any token whose value is in this
        set gets its value as its kind.

        .. code-block:: python

           def keywords(self):
               return set(['if', 'else'])

        """

        return set()

    def token_specs(self):
        """The token specifications with token name, regular expression, and
        optionally a user friendly name.

        Two token specification forms are available: ``(kind, re)`` or
        ``(kind, name, re)``. If the second form is used, the grammar
        should use `name` instead of `kind`.

        See :class:`~textparser.Parser` for an example usage.

        """

        return [
            ('SKIP', r'[ \r\n\t]+'),
            ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'),
            ('WORD', r'[A-Za-z0-9_]+'),
            ('ESCAPED_STRING', r'"(\\"|[^"])*?"'),
            ('MISMATCH', r'.')
        ]

    def tokenize(self, text):
        """Tokenize the given string `text`, and return a list of tokens.
        Raises :class:`~textparser.TokenizeError` on failure.

        This method should only be called by
        :func:`~textparser.Parser.parse()`, but may very well be
        overridden if the default implementation does not match the
        parser's needs.

        """

        names, specs = self._unpack_token_specs()
        keywords = self.keywords()
        tokens, re_token = tokenize_init(specs)

        for mo in re.finditer(re_token, text, re.DOTALL):
            kind = mo.lastgroup

            if kind == 'SKIP':
                pass
            elif kind != 'MISMATCH':
                value = mo.group(kind)

                if value in keywords:
                    kind = value

                if kind in names:
                    kind = names[kind]

                tokens.append(Token(kind, value, mo.start()))
            else:
                raise TokenizeError(text, mo.start())

        return tokens

    def grammar(self):
        """The text grammar is used to create a parse tree out of a list of
        tokens.

        See :class:`~textparser.Parser` for an example usage.

        """

        raise NotImplementedError('No grammar defined.')

    def parse(self, text, token_tree=False, match_sof=False):
        """Parse the given string `text` and return the parse tree. Raises
        :class:`~textparser.ParseError` on failure.

        Returns a parse tree of tokens if `token_tree` is
        ``True``. If `match_sof` is ``True``, the start of file token
        (``__SOF__``) is kept, and the grammar must match it
        explicitly.

        .. code-block:: python

           >>> MyParser().parse('Hello, World!')
           ['Hello', ',', 'World', '!']
           >>> tree = MyParser().parse('Hello, World!', token_tree=True)
           >>> from pprint import pprint
           >>> pprint(tree)
           [Token(kind='WORD', value='Hello', offset=0),
            Token(kind=',', value=',', offset=5),
            Token(kind='WORD', value='World', offset=7),
            Token(kind='!', value='!', offset=12)]

        """

        try:
            tokens = self.tokenize(text)

            if len(tokens) == 0 or tokens[-1].kind != '__EOF__':
                tokens.append(Token('__EOF__', '__EOF__', len(text)))

            if not match_sof:
                if len(tokens) > 0 and tokens[0].kind == '__SOF__':
                    del tokens[0]

            return Grammar(self.grammar()).parse(tokens, token_tree)
        except (TokenizeError, GrammarError) as e:
            raise ParseError(text, e.offset)


def replace_blocks(string, start='{', end='}'):
    """Replace the contents of all blocks starting with `start` and ending
    with `end` with spaces (the outermost `start` and `end` themselves
    are kept). Newlines within a block are kept, so line and column
    numbers in the rest of `string` are unaffected.

    """

    chunks = []
    begin = 0
    depth = 0
    start_length = len(start)
    pattern = r'({}|{})'.format(re.escape(start), re.escape(end))

    for mo in re.finditer(pattern, string):
        pos = mo.start()

        if mo.group() == start:
            if depth == 0:
                chunks.append(string[begin:pos + start_length])
                begin = (pos + start_length)

            depth += 1
        elif depth > 0:
            depth -= 1

            if depth == 0:
                for chunk in string[begin:pos].split('\n'):
                    chunks.append(' ' * len(chunk))
                    chunks.append('\n')

                chunks.pop()
                begin = pos

    chunks.append(string[begin:])

    return ''.join(chunks)

--------------------------------------------------------------------------------