├── .github ├── FUNDING.yml └── workflows │ └── pythonpackage.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── conf.py ├── index.rst └── make.bat ├── examples ├── benchmarks │ └── json │ │ ├── data.json │ │ ├── errors.py │ │ ├── parse_tree.py │ │ ├── parsers │ │ ├── __init__.py │ │ ├── funcparserlib_json.py │ │ ├── lark_json.py │ │ ├── parsimonious_json.py │ │ ├── parsita_json.py │ │ ├── parsy_json.py │ │ ├── pyleri_json.py │ │ ├── pyparsing_json.py │ │ ├── textparser_json.py │ │ └── textx_json.py │ │ └── speed.py ├── hello_world.py ├── json.py └── proto3.py ├── requirements.txt ├── setup.py ├── tests ├── __init__.py └── test_textparser.py └── textparser.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: eerimoq 2 | -------------------------------------------------------------------------------- /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 4 11 | matrix: 12 | python-version: [3.6, 3.9] 13 | 14 | steps: 15 | - uses: actions/checkout@v1 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Test 21 | run: | 22 | python -m unittest 23 | 24 | release: 25 | needs: [test] 26 | runs-on: ubuntu-latest 27 | if: startsWith(github.ref, 'refs/tags') 28 | 29 | steps: 30 | - name: Checkout 31 | uses: actions/checkout@v1 32 | - name: Set up Python 3.9 33 | uses: actions/setup-python@v1 34 | with: 35 | python-version: 3.9 36 | - name: Install pypa/build 37 | run: | 38 | python -m pip install build --user 39 | - name: Build a binary wheel and a source tarball 40 | run: | 41 | git clean -dfx 42 | python -m build --sdist --wheel --outdir dist/ . 43 | - name: Publish distribution 📦 to PyPI 44 | uses: pypa/gh-action-pypi-publish@master 45 | with: 46 | skip_existing: true 47 | password: ${{ secrets.pypi_password }} 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
58 |
-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2018-2019 Erik Moqvist
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
-------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
1 | include LICENSE
2 | include Makefile
3 | recursive-include tests *.py
-------------------------------------------------------------------------------- /README.rst: --------------------------------------------------------------------------------
1 | About
2 | =====
3 |
4 | A text parser written in the Python language.
5 |
6 | The project has one goal: speed! See the benchmark below for more details.
7 |
8 | Project homepage: https://github.com/eerimoq/textparser
9 |
10 | Documentation: http://textparser.readthedocs.org/en/latest
11 |
12 | Credits
13 | =======
14 |
15 | - Thanks to `PyParsing`_ for a user-friendly interface. Many of
16 |   ``textparser``'s class names are taken from that project.
17 |
18 | Installation
19 | ============
20 |
21 | .. code-block:: text
22 |
23 |     pip install textparser
24 |
25 | Example usage
26 | =============
27 |
28 | The `Hello World`_ example parses the string ``Hello, World!`` and
29 | outputs its parse tree ``['Hello', ',', 'World', '!']``.
30 |
31 | The script:
32 |
33 | .. code-block:: python
34 |
35 |     import textparser
36 |     from textparser import Sequence
37 |
38 |
39 |     class Parser(textparser.Parser):
40 |
41 |         def token_specs(self):
42 |             return [
43 |                 ('SKIP',          r'[ \r\n\t]+'),
44 |                 ('WORD',          r'\w+'),
45 |                 ('EMARK',    '!', r'!'),
46 |                 ('COMMA',    ',', r','),
47 |                 ('MISMATCH',      r'.')
48 |             ]
49 |
50 |         def grammar(self):
51 |             return Sequence('WORD', ',', 'WORD', '!')
52 |
53 |
54 |     tree = Parser().parse('Hello, World!')
55 |
56 |     print('Tree:', tree)
57 |
58 | Script execution:
59 |
60 | .. code-block:: text
61 |
62 |     $ env PYTHONPATH=. python3 examples/hello_world.py
63 |     Tree: ['Hello', ',', 'World', '!']
64 |
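Feeding the parser input that does not match the grammar raises an
exception whose message points at the offending character. A minimal
sketch, assuming ``textparser.Error`` is the common base class of the
tokenize and parse exceptions listed in the documentation:

.. code-block:: python

    try:
        Parser().parse('Hello, World?')
    except textparser.Error as error:
        # Prints a message like: Invalid syntax at line 1, column 13: "..."
        print('Parse failed:', error)
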
65 | Benchmark
66 | =========
67 |
68 | Below is a `benchmark`_ comparing the speed of 10 JSON parsers, each
69 | parsing a `276 kb file`_.
70 |
71 | .. code-block:: text
72 |
73 |    $ env PYTHONPATH=. python3 examples/benchmarks/json/speed.py
74 |
75 |    Parsed 'examples/benchmarks/json/data.json' 1 time(s) in:
76 |
77 |    PACKAGE        SECONDS   RATIO  VERSION
78 |    textparser        0.10    100%  0.21.1
79 |    parsimonious      0.17    169%  unknown
80 |    lark (LALR)       0.27    267%  0.7.0
81 |    funcparserlib     0.34    340%  unknown
82 |    textx             0.54    546%  1.8.0
83 |    pyparsing         0.68    684%  2.4.0
84 |    pyleri            0.88    886%  1.2.2
85 |    parsy             0.92    925%  1.2.0
86 |    parsita           2.28   2286%  unknown
87 |    lark (Earley)     2.34   2348%  0.7.0
88 |
89 | *NOTE 1: The parsers are not necessarily optimized for
90 | speed. Optimizing them will likely affect the measurements.*
91 |
92 | *NOTE 2: The structure of the resulting parse trees varies and
93 | additional processing may be required to make them fit the user
94 | application.*
95 |
96 | *NOTE 3: Only JSON parsers are compared. Parsing other languages may
97 | give vastly different results.*
98 |
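Each number in the table is produced by a ``parse_time()`` function in
the corresponding module under ``examples/benchmarks/json/parsers/``.
The ``textparser`` variant is essentially the sketch below; the other
modules follow the same ``timeit`` pattern:

.. code-block:: python

    import timeit

    def parse_time(json_string, iterations):
        parser = Parser()  # the benchmark's JSON Parser subclass

        def _parse():
            parser.parse(json_string)

        # Total wall-clock time for `iterations` parses of the same input.
        return timeit.timeit(_parse, number=iterations)
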
99 | Contributing
100 | ============
101 |
102 | #. Fork the repository.
103 |
104 | #. Implement the new feature or bug fix.
105 |
106 | #. Implement test case(s) to ensure that future changes do not break
107 |    existing functionality.
108 |
109 | #. Run the tests.
110 |
111 |    .. code-block:: text
112 |
113 |       python3 -m unittest
114 |
115 | #. Create a pull request.
116 |
117 | .. _PyParsing: https://github.com/pyparsing/pyparsing
118 | .. _Hello World: https://github.com/eerimoq/textparser/blob/master/examples/hello_world.py
119 | .. _benchmark: https://github.com/eerimoq/textparser/blob/master/examples/benchmarks/json/speed.py
120 | .. _276 kb file: https://github.com/eerimoq/textparser/blob/master/examples/benchmarks/json/data.json
-------------------------------------------------------------------------------- /docs/Makefile: --------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS    =
6 | SPHINXBUILD   = sphinx-build
7 | PAPER         =
8 | BUILDDIR      = _build
9 |
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
13 | endif
14 |
15 | # Internal variables.
16 | PAPEROPT_a4     = -D latex_paper_size=a4
17 | PAPEROPT_letter = -D latex_paper_size=letter
18 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
19 | # the i18n builder cannot share the environment and doctrees with the others
20 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
21 |
22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
23 |
24 | help:
25 | 	@echo "Please use \`make <target>' where <target> is one of"
26 | 	@echo "  html       to make standalone HTML files"
27 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
28 | 	@echo "  singlehtml to make a single large HTML file"
29 | 	@echo "  pickle     to make pickle files"
30 | 	@echo "  json       to make JSON files"
31 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
32 | 	@echo "  qthelp     to make HTML files and a qthelp project"
33 | 	@echo "  applehelp  to make an Apple Help Book"
34 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
35 | 	@echo "  epub       to make an epub"
36 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
37 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
38 | 	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
39 | 	@echo "  text       to make text files"
40 | 	@echo "  man        to make manual pages"
41 | 	@echo "  texinfo    to make Texinfo files"
42 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
43 | 	@echo "  gettext    to make PO message catalogs"
44 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
45 | 	@echo "  xml        to make Docutils-native XML files"
46 | 	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
47 | 	@echo "  linkcheck  to check all external links for integrity"
48 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
49 | 	@echo "  coverage   to run coverage check of the documentation (if enabled)"
50 |
51 | clean:
52 | 	rm -rf $(BUILDDIR)/*
53 |
54 | html:
55 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
56 | 	@echo
57 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
58 |
59 | dirhtml:
60 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
61 | 	@echo
62 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
63 |
64 | singlehtml:
65 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
66 | 	@echo
67 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
68 |
69 | pickle:
70 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
71 | 	@echo
72 | 	@echo "Build finished; now you can process the pickle files."
73 |
74 | json:
75 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
76 | 	@echo
77 | 	@echo "Build finished; now you can process the JSON files."
78 |
79 | htmlhelp:
80 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
81 | 	@echo
82 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
83 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
84 |
85 | qthelp:
86 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
87 | 	@echo
88 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
89 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
90 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/textparser.qhcp"
91 | 	@echo "To view the help file:"
92 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/textparser.qhc"
93 |
94 | applehelp:
95 | 	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
96 | 	@echo
97 | 	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
98 | 	@echo "N.B. You won't be able to view it unless you put it in" \
99 | 	      "~/Library/Documentation/Help or install it in your application" \
100 | 	      "bundle."
101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/textparser" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/textparser" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. 
The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # textparser documentation build configuration file, created by 4 | # sphinx-quickstart on Sat Apr 25 11:54:09 2015. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | import shlex 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | sys.path.insert(0, os.path.abspath('..')) 23 | 24 | import textparser 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | #needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'sphinx.ext.autodoc', 36 | 'sphinx.ext.viewcode', 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # The suffix(es) of source filenames. 43 | # You can specify multiple suffix as a list of string: 44 | # source_suffix = ['.rst', '.md'] 45 | source_suffix = '.rst' 46 | 47 | # The encoding of source files. 48 | #source_encoding = 'utf-8-sig' 49 | 50 | # The master toctree document. 51 | master_doc = 'index' 52 | 53 | # General information about the project. 54 | project = u'textparser' 55 | copyright = u'2018-2019, Erik Moqvist' 56 | author = u'Erik Moqvist' 57 | 58 | # The version info for the project you're documenting, acts as replacement for 59 | # |version| and |release|, also used in various other places throughout the 60 | # built documents. 61 | # 62 | # The short X.Y version. 63 | version = textparser.__version__ 64 | # The full version, including alpha/beta/rc tags. 65 | release = textparser.__version__ 66 | 67 | # The language for content autogenerated by Sphinx. Refer to documentation 68 | # for a list of supported languages. 69 | # 70 | # This is also used if you do content translation via gettext catalogs. 71 | # Usually you set "language" from the command line for these cases. 72 | language = None 73 | 74 | # There are two options for replacing |today|: either, you set today to some 75 | # non-false value, then it is used: 76 | #today = '' 77 | # Else, today_fmt is used as the format for a strftime call. 78 | #today_fmt = '%B %d, %Y' 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | exclude_patterns = ['_build'] 83 | 84 | # The reST default role (used for this markup: `text`) to use for all 85 | # documents. 86 | #default_role = None 87 | 88 | # If true, '()' will be appended to :func: etc. cross-reference text. 
89 | #add_function_parentheses = True 90 | 91 | # If true, the current module name will be prepended to all description 92 | # unit titles (such as .. function::). 93 | #add_module_names = True 94 | 95 | # If true, sectionauthor and moduleauthor directives will be shown in the 96 | # output. They are ignored by default. 97 | #show_authors = False 98 | 99 | # The name of the Pygments (syntax highlighting) style to use. 100 | pygments_style = 'sphinx' 101 | 102 | # A list of ignored prefixes for module index sorting. 103 | #modindex_common_prefix = [] 104 | 105 | # If true, keep warnings as "system message" paragraphs in the built documents. 106 | #keep_warnings = False 107 | 108 | # If true, `todo` and `todoList` produce output, else they produce nothing. 109 | todo_include_todos = False 110 | 111 | 112 | # -- Options for HTML output ---------------------------------------------- 113 | 114 | # The theme to use for HTML and HTML Help pages. See the documentation for 115 | # a list of builtin themes. 116 | html_theme = 'sphinx_rtd_theme' 117 | 118 | # Theme options are theme-specific and customize the look and feel of a theme 119 | # further. For a list of options available for each theme, see the 120 | # documentation. 121 | #html_theme_options = {} 122 | 123 | # Add any paths that contain custom themes here, relative to this directory. 124 | #html_theme_path = [] 125 | 126 | # The name for this set of Sphinx documents. If None, it defaults to 127 | # " v documentation". 128 | #html_title = None 129 | 130 | # A shorter title for the navigation bar. Default is the same as html_title. 131 | #html_short_title = None 132 | 133 | # The name of an image file (relative to this directory) to place at the top 134 | # of the sidebar. 135 | #html_logo = None 136 | 137 | # The name of an image file (within the static path) to use as favicon of the 138 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 139 | # pixels large. 140 | #html_favicon = None 141 | 142 | # Add any paths that contain custom static files (such as style sheets) here, 143 | # relative to this directory. They are copied after the builtin static files, 144 | # so a file named "default.css" will overwrite the builtin "default.css". 145 | html_static_path = ['_static'] 146 | 147 | # Add any extra paths that contain custom files (such as robots.txt or 148 | # .htaccess) here, relative to this directory. These files are copied 149 | # directly to the root of the documentation. 150 | #html_extra_path = [] 151 | 152 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 153 | # using the given strftime format. 154 | #html_last_updated_fmt = '%b %d, %Y' 155 | 156 | # If true, SmartyPants will be used to convert quotes and dashes to 157 | # typographically correct entities. 158 | #html_use_smartypants = True 159 | 160 | # Custom sidebar templates, maps document names to template names. 161 | #html_sidebars = {} 162 | 163 | # Additional templates that should be rendered to pages, maps page names to 164 | # template names. 165 | #html_additional_pages = {} 166 | 167 | # If false, no module index is generated. 168 | #html_domain_indices = True 169 | 170 | # If false, no index is generated. 171 | #html_use_index = True 172 | 173 | # If true, the index is split into individual pages for each letter. 174 | #html_split_index = False 175 | 176 | # If true, links to the reST sources are added to the pages. 
177 | #html_show_sourcelink = True 178 | 179 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 180 | #html_show_sphinx = True 181 | 182 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 183 | #html_show_copyright = True 184 | 185 | # If true, an OpenSearch description file will be output, and all pages will 186 | # contain a tag referring to it. The value of this option must be the 187 | # base URL from which the finished HTML is served. 188 | #html_use_opensearch = '' 189 | 190 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 191 | #html_file_suffix = None 192 | 193 | # Language to be used for generating the HTML full-text search index. 194 | # Sphinx supports the following languages: 195 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 196 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 197 | #html_search_language = 'en' 198 | 199 | # A dictionary with options for the search language support, empty by default. 200 | # Now only 'ja' uses this config value 201 | #html_search_options = {'type': 'default'} 202 | 203 | # The name of a javascript file (relative to the configuration directory) that 204 | # implements a search results scorer. If empty, the default will be used. 205 | #html_search_scorer = 'scorer.js' 206 | 207 | # Output file base name for HTML help builder. 208 | htmlhelp_basename = 'textparserdoc' 209 | 210 | # -- Options for LaTeX output --------------------------------------------- 211 | 212 | latex_elements = { 213 | # The paper size ('letterpaper' or 'a4paper'). 214 | #'papersize': 'letterpaper', 215 | 216 | # The font size ('10pt', '11pt' or '12pt'). 217 | #'pointsize': '10pt', 218 | 219 | # Additional stuff for the LaTeX preamble. 220 | #'preamble': '', 221 | 222 | # Latex figure (float) alignment 223 | #'figure_align': 'htbp', 224 | } 225 | 226 | # Grouping the document tree into LaTeX files. List of tuples 227 | # (source start file, target name, title, 228 | # author, documentclass [howto, manual, or own class]). 229 | latex_documents = [ 230 | (master_doc, 'textparser.tex', u'textparser Documentation', 231 | u'Erik Moqvist', 'manual'), 232 | ] 233 | 234 | # The name of an image file (relative to this directory) to place at the top of 235 | # the title page. 236 | #latex_logo = None 237 | 238 | # For "manual" documents, if this is true, then toplevel headings are parts, 239 | # not chapters. 240 | #latex_use_parts = False 241 | 242 | # If true, show page references after internal links. 243 | #latex_show_pagerefs = False 244 | 245 | # If true, show URL addresses after external links. 246 | #latex_show_urls = False 247 | 248 | # Documents to append as an appendix to all manuals. 249 | #latex_appendices = [] 250 | 251 | # If false, no module index is generated. 252 | #latex_domain_indices = True 253 | 254 | 255 | # -- Options for manual page output --------------------------------------- 256 | 257 | # One entry per manual page. List of tuples 258 | # (source start file, name, description, authors, manual section). 259 | man_pages = [ 260 | (master_doc, 'textparser', u'Textparser Documentation', 261 | [author], 1) 262 | ] 263 | 264 | # If true, show URL addresses after external links. 265 | #man_show_urls = False 266 | 267 | 268 | # -- Options for Texinfo output ------------------------------------------- 269 | 270 | # Grouping the document tree into Texinfo files. 
List of tuples
271 | # (source start file, target name, title, author,
272 | #  dir menu entry, description, category)
273 | texinfo_documents = [
274 |     (master_doc, 'textparser', u'Textparser Documentation',
275 |      author, 'textparser', 'One line description of project.',
276 |      'Miscellaneous'),
277 | ]
278 |
279 | # Documents to append as an appendix to all manuals.
280 | #texinfo_appendices = []
281 |
282 | # If false, no module index is generated.
283 | #texinfo_domain_indices = True
284 |
285 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
286 | #texinfo_show_urls = 'footnote'
287 |
288 | # If true, do not generate a @detailmenu in the "Top" node's menu.
289 | #texinfo_no_detailmenu = False
290 |
291 | autodoc_member_order = 'bysource'
-------------------------------------------------------------------------------- /docs/index.rst: --------------------------------------------------------------------------------
1 | .. textparser documentation master file, created by
2 |    sphinx-quickstart on Sat Apr 25 11:54:09 2015.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 |
6 | .. toctree::
7 |    :maxdepth: 2
8 |
9 | Text parser
10 | ===========
11 |
12 | .. include:: ../README.rst
13 |
14 | The parser class
15 | ================
16 |
17 | .. autoclass:: textparser.Parser
18 |    :members:
19 |
20 | Building the grammar
21 | ====================
22 |
23 | The grammar is built by combining strings and the classes below.
24 |
25 | Here is a fictitious example grammar:
26 |
27 | .. code-block:: python
28 |
29 |    grammar = Sequence(
30 |        'BEGIN',
31 |        Optional(choice('IF', Sequence(ZeroOrMore('NUMBER')))),
32 |        OneOrMore(Sequence('WORD', Not('NUMBER'))),
33 |        Any(),
34 |        DelimitedList('WORD', delim=':'),
35 |        'END')
36 |
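A runnable counterpart is sketched below; the token names, the input
string and the exact shape of the returned tree are assumptions made
for illustration, not part of the library:

.. code-block:: python

   import textparser
   from textparser import Sequence, DelimitedList

   class AssignmentParser(textparser.Parser):

       def token_specs(self):
           return [
               ('SKIP',          r'[ \r\n\t]+'),
               ('NUMBER',        r'-?\d+'),
               ('WORD',          r'[A-Za-z]\w*'),
               ('EQ',       '=', r'='),
               ('COMMA',    ',', r','),
               ('MISMATCH',      r'.')
           ]

       def grammar(self):
           # A 'WORD', then '=', then one or more comma separated 'NUMBER'.
           return Sequence('WORD', '=', DelimitedList('NUMBER'))

   tree = AssignmentParser().parse('x = 1, 2, 3')
   # tree is a nested list along the lines of ['x', '=', ['1', '2', '3']]
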
37 | .. autoclass:: textparser.Sequence
38 |
39 | .. autoclass:: textparser.Choice
40 |
41 | .. autoclass:: textparser.ChoiceDict
42 |
43 | .. autofunction:: textparser.choice
44 |
45 | .. autoclass:: textparser.ZeroOrMore
46 |
47 | .. autoclass:: textparser.ZeroOrMoreDict
48 |
49 | .. autoclass:: textparser.OneOrMore
50 |
51 | .. autoclass:: textparser.OneOrMoreDict
52 |
53 | .. autoclass:: textparser.DelimitedList
54 |
55 | .. autoclass:: textparser.Optional
56 |
57 | .. autoclass:: textparser.Any
58 |
59 | .. autoclass:: textparser.AnyUntil
60 |
61 | .. autoclass:: textparser.And
62 |
63 | .. autoclass:: textparser.Not
64 |
65 | .. autoclass:: textparser.NoMatch
66 |
67 | .. autoclass:: textparser.Tag
68 |
69 | .. autoclass:: textparser.Forward
70 |
71 | .. autoclass:: textparser.Repeated
72 |
73 | .. autoclass:: textparser.RepeatedDict
74 |
75 | .. autoclass:: textparser.Pattern
76 |    :members:
77 |
78 | .. autodata:: textparser.MISMATCH
79 |
80 | Exceptions
81 | ==========
82 |
83 | .. autoclass:: textparser.Error
84 |    :members:
85 |
86 | .. autoclass:: textparser.ParseError
87 |    :members:
88 |
89 | .. autoclass:: textparser.TokenizeError
90 |    :members:
91 |
92 | .. autoclass:: textparser.GrammarError
93 |    :members:
94 |
95 | Utility functions
96 | =================
97 |
98 | .. autofunction:: textparser.markup_line
99 |
100 | .. autofunction:: textparser.tokenize_init
-------------------------------------------------------------------------------- /docs/make.bat: --------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | REM Command file for Sphinx documentation
4 |
5 | if "%SPHINXBUILD%" == "" (
6 | 	set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=_build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" (
12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14 | )
15 |
16 | if "%1" == "" goto help
17 |
18 | if "%1" == "help" (
19 | 	:help
20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
21 | 	echo.  html       to make standalone HTML files
22 | 	echo.  dirhtml    to make HTML files named index.html in directories
23 | 	echo.  singlehtml to make a single large HTML file
24 | 	echo.  pickle     to make pickle files
25 | 	echo.  json       to make JSON files
26 | 	echo.  htmlhelp   to make HTML files and a HTML help project
27 | 	echo.  qthelp     to make HTML files and a qthelp project
28 | 	echo.  devhelp    to make HTML files and a Devhelp project
29 | 	echo.  epub       to make an epub
30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
31 | 	echo.  text       to make text files
32 | 	echo.  man        to make manual pages
33 | 	echo.  texinfo    to make Texinfo files
34 | 	echo.  gettext    to make PO message catalogs
35 | 	echo.  changes    to make an overview over all changed/added/deprecated items
36 | 	echo.  xml        to make Docutils-native XML files
37 | 	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
38 | 	echo.  linkcheck  to check all external links for integrity
39 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
40 | 	echo.  coverage   to run coverage check of the documentation if enabled
41 | 	goto end
42 | )
43 |
44 | if "%1" == "clean" (
45 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
46 | 	del /q /s %BUILDDIR%\*
47 | 	goto end
48 | )
49 |
50 |
51 | REM Check if sphinx-build is available and fallback to Python version if any
52 | %SPHINXBUILD% 2> nul
53 | if errorlevel 9009 goto sphinx_python
54 | goto sphinx_ok
55 |
56 | :sphinx_python
57 |
58 | set SPHINXBUILD=python -m sphinx.__init__
59 | %SPHINXBUILD% 2> nul
60 | if errorlevel 9009 (
61 | 	echo.
62 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
63 | 	echo.installed, then set the SPHINXBUILD environment variable to point
64 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
65 | 	echo.may add the Sphinx directory to PATH.
66 | 	echo.
67 | 	echo.If you don't have Sphinx installed, grab it from
68 | 	echo.http://sphinx-doc.org/
69 | 	exit /b 1
70 | )
71 |
72 | :sphinx_ok
73 |
74 |
75 | if "%1" == "html" (
76 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
77 | 	if errorlevel 1 exit /b 1
78 | 	echo.
79 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
80 | 	goto end
81 | )
82 |
83 | if "%1" == "dirhtml" (
84 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
85 | 	if errorlevel 1 exit /b 1
86 | 	echo.
87 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
88 | 	goto end
89 | )
90 |
91 | if "%1" == "singlehtml" (
92 | 	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
93 | 	if errorlevel 1 exit /b 1
94 | 	echo.
95 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\textparser.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\textparser.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 
217 | 	goto end
218 | )
219 |
220 | if "%1" == "linkcheck" (
221 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
222 | 	if errorlevel 1 exit /b 1
223 | 	echo.
224 | 	echo.Link check complete; look for any errors in the above output ^
225 | or in %BUILDDIR%/linkcheck/output.txt.
226 | 	goto end
227 | )
228 |
229 | if "%1" == "doctest" (
230 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
231 | 	if errorlevel 1 exit /b 1
232 | 	echo.
233 | 	echo.Testing of doctests in the sources finished, look at the ^
234 | results in %BUILDDIR%/doctest/output.txt.
235 | 	goto end
236 | )
237 |
238 | if "%1" == "coverage" (
239 | 	%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
240 | 	if errorlevel 1 exit /b 1
241 | 	echo.
242 | 	echo.Testing of coverage in the sources finished, look at the ^
243 | results in %BUILDDIR%/coverage/python.txt.
244 | 	goto end
245 | )
246 |
247 | if "%1" == "xml" (
248 | 	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
249 | 	if errorlevel 1 exit /b 1
250 | 	echo.
251 | 	echo.Build finished. The XML files are in %BUILDDIR%/xml.
252 | 	goto end
253 | )
254 |
255 | if "%1" == "pseudoxml" (
256 | 	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
257 | 	if errorlevel 1 exit /b 1
258 | 	echo.
259 | 	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
260 | 	goto end
261 | )
262 |
263 | :end
-------------------------------------------------------------------------------- /examples/benchmarks/json/errors.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Parse error comparison for a few JSON parsers.
4 |
5 | Example execution:
6 |
7 | $ env PYTHONPATH=. python3 examples/benchmarks/json/errors.py
8 | -----------------------------------------------------------------
9 |
10 | Input string between BEGIN and END:
11 |
12 | BEGIN
13 | END
14 |
15 | textparser: "Invalid syntax at line 1, column 1: ">>!<<""
16 |
17 | lark_lalr: "'NoneType' object has no attribute 'pos_in_stream'"
18 |
19 | lark_earley: "Incomplete parse: Could not find a solution to input"
20 |
21 | pyparsing: "Expected {string enclosed in double quotes | real number with scientific notation | real number | signed integer | Group:(Forward: ...) | Group:({Suppress:("[") [Forward: ... [, Forward: ...]...] Suppress:("]")}) | "true" | "false" | "null"} (at char 0), (line:1, col:1)"
22 |
23 | parsita: "No exception raised!"
24 |
25 | funcparserlib: "no tokens left in the stream: "
26 |
27 | parsy: "expected one of '"', '-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?', '[', 'false', 'null', 'true', '{' at 0:0"
28 |
29 | parsimonious: "Rule 'json_file' didn't match at '' (line 1, column 1)."
30 |
31 | pyleri: "No exception raised!"
32 |
33 | textx: "None:1:1: error: Expected '[' or '{' at position (1, 1) => '*'."
34 |
35 | -----------------------------------------------------------------
36 |
37 | Input string between BEGIN and END:
38 |
39 | BEGIN
40 | [
41 |     1,
42 |     {"a": {]}
43 | ]
44 | END
45 |
46 | textparser: "Invalid syntax at line 3, column 10: "    {"a": {>>!<<]}""
47 |
48 | lark_lalr: "Unexpected token Token(RSQB, ']') at line 3, column 10.
49 | Expected: ESCAPED_STRING, RBRACE, string, pair
50 | "
51 |
52 | lark_earley: "Unexpected token Token(RSQB, ']') at line 3, column 10.
53 | Expected: ESCAPED_STRING, RBRACE
54 | "
55 |
56 | pyparsing: "Expected {string enclosed in double quotes | real number with scientific notation | real number | signed integer | Group:(Forward: ...)
| Group:({Suppress:("[") [Forward: ... [, Forward: ...]...] Suppress:("]")}) | "true" | "false" | "null"} (at char 5), (line:2, col:4)"
57 |
58 | parsita: "No exception raised!"
59 |
60 | funcparserlib: "got unexpected token: 3,10-3,10: Op ']'"
61 |
62 | parsy: "expected one of '"', '}' at 2:9"
63 |
64 | parsimonious: "Rule 'members' didn't match at ']}
65 | ]
66 | ' (line 3, column 10)."
67 |
68 | pyleri: "No exception raised!"
69 |
70 | textx: "None:3:10: error: Expected STRING or '}' at position (3, 10) => ' {"a": {*]} ] '."
71 |
72 | -----------------------------------------------------------------
73 |
74 | Input string between BEGIN and END:
75 |
76 | BEGIN
77 | [
78 |     1,
79 |     {3: null}
80 | ]
81 | END
82 |
83 | textparser: "Invalid syntax at line 3, column 4: "    {>>!<<3: null}""
84 |
85 | lark_lalr: "Unexpected token Token(SIGNED_NUMBER, '3') at line 3, column 4.
86 | Expected: RBRACE, pair, string, ESCAPED_STRING
87 | "
88 |
89 | lark_earley: "Unexpected token Token(SIGNED_NUMBER, '3') at line 3, column 4.
90 | Expected: ESCAPED_STRING, RBRACE
91 | "
92 |
93 | pyparsing: "Expected {string enclosed in double quotes | real number with scientific notation | real number | signed integer | Group:(Forward: ...) | Group:({Suppress:("[") [Forward: ... [, Forward: ...]...] Suppress:("]")}) | "true" | "false" | "null"} (at char 5), (line:2, col:4)"
94 |
95 | parsita: "No exception raised!"
96 |
97 | funcparserlib: "got unexpected token: 3,4-3,4: Number '3'"
98 |
99 | parsy: "expected one of '"', '}' at 2:3"
100 |
101 | parsimonious: "Rule 'members' didn't match at '3: null}
102 | ]
103 | ' (line 3, column 4)."
104 |
105 | pyleri: "No exception raised!"
106 |
107 | textx: "None:3:4: error: Expected STRING or '}' at position (3, 4) => '[ 1, {*3: null} ]'."
108 |
109 | -----------------------------------------------------------------
110 |
111 | Input string between BEGIN and END:
112 |
113 | BEGIN
114 | nul
115 | END
116 |
117 | textparser: "Invalid syntax at line 1, column 1: ">>!<<nul""
[outputs from the remaining parsers lost in capture]
144 | textx: "None:1:1: error: Expected '[' or '{' at position (1, 1) => '*nul '."
145 | $
146 |
147 | """
148 |
149 | from __future__ import print_function
150 |
151 | from parsers import textparser_json
152 | from parsers import lark_json
153 | from parsers import pyparsing_json
154 | from parsers import funcparserlib_json
155 | from parsers import parsimonious_json
156 | from parsers import textx_json
157 |
158 | try:
159 |     from parsers import parsita_json
160 | except:
161 |
162 |     class parsita_json(object):
163 |
164 |         @staticmethod
165 |         def parse(_json_string):
166 |             raise Exception('Import failed!')
167 |
168 | try:
169 |     from parsers import parsy_json
170 | except:
171 |     class parsy_json(object):
172 |
173 |         @staticmethod
174 |         def parse(_json_string):
175 |             raise Exception('Import failed!')
176 |
177 | try:
178 |     from parsers import pyleri_json
179 | except:
180 |     class pyleri_json(object):
181 |
182 |         @staticmethod
183 |         def parse(_json_string):
184 |             raise Exception('Import failed!')
185 |
186 |
187 | def parse(string):
188 |     def _parse(function):
189 |         try:
190 |             function(string)
191 |         except Exception as e:
192 |             return str(e)
193 |
194 |         return 'No exception raised!'
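    # Pair each parser name with the stringified exception it raised, or
    # with the marker string if parsing unexpectedly succeeded.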
195 |
196 |     results = [
197 |         ('textparser', _parse(textparser_json.parse)),
198 |         ('lark_lalr', _parse(lark_json.parse_lalr)),
199 |         ('lark_earley', _parse(lark_json.parse_earley)),
200 |         ('pyparsing', _parse(pyparsing_json.parse)),
201 |         ('parsita', _parse(parsita_json.parse)),
202 |         ('funcparserlib', _parse(funcparserlib_json.parse)),
203 |         ('parsy', _parse(parsy_json.parse)),
204 |         ('parsimonious', _parse(parsimonious_json.parse)),
205 |         ('pyleri', _parse(pyleri_json.parse)),
206 |         ('textx', _parse(textx_json.parse))
207 |     ]
208 |
209 |     print('-----------------------------------------------------------------')
210 |     print()
211 |     print('Input string between BEGIN and END:')
212 |     print()
213 |     print('BEGIN')
214 |     print(string, end='')
215 |     print('END')
216 |     print()
217 |
218 |     for parser, error in results:
219 |         print('{}: "{}"'.format(parser, error))
220 |         print()
221 |
222 |
223 | EMPTY_STRING = '''\
224 | '''
225 |
226 | BAD_DICT_END_STRING = '''\
227 | [
228 |     1,
229 |     {"a": {]}
230 | ]
231 | '''
232 |
233 | BAD_DICT_KEY_STRING = '''\
234 | [
235 |     1,
236 |     {3: null}
237 | ]
238 | '''
239 |
240 | BAD_NULL_STRING = '''\
241 | nul
242 | '''
243 |
244 |
245 | parse(EMPTY_STRING)
246 | parse(BAD_DICT_END_STRING)
247 | parse(BAD_DICT_KEY_STRING)
248 | parse(BAD_NULL_STRING)
-------------------------------------------------------------------------------- /examples/benchmarks/json/parse_tree.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Compare the parse tree of a few JSON parsers.
4 |
5 | Example execution:
6 |
7 | $ env PYTHONPATH=. python3 examples/benchmarks/json/parse_tree.py
8 | -----------------------------------------------------------------
9 |
10 | Input string between BEGIN and END:
11 |
12 | BEGIN
13 | [
14 |     "foo",
15 |     {
16 |         "bar": [
17 |             1,
18 |             2,
19 |             3
20 |         ]
21 |     }
22 | ]
23 | END
24 |
25 | textparser:
26 | ['[', [['"foo"', ['{', [[['"bar"', ':', ['[', [['1', '2', '3']], ']']]]], '}']]], ']']
27 |
28 | lark_lalr:
29 | Tree(list, [Tree(string, [Token(ESCAPED_STRING, '"foo"')]), Tree(dict, [Tree(pair, [Tree(string, [Token(ESCAPED_STRING, '"bar"')]), Tree(list, [Token(SIGNED_NUMBER, '1'), Token(SIGNED_NUMBER, '2'), Token(SIGNED_NUMBER, '3')])])])])
30 |
31 | lark_earley:
32 | Tree(list, [Tree(string, [Token(ESCAPED_STRING, '"foo"')]), Tree(dict, [Tree(pair, [Tree(string, [Token(ESCAPED_STRING, '"bar"')]), Tree(list, [Token(SIGNED_NUMBER, '1'), Token(SIGNED_NUMBER, '2'), Token(SIGNED_NUMBER, '3')])])])])
33 |
34 | pyparsing:
35 | [['"foo"', [['"bar"', [1, 2, 3]]]]]
36 |
37 | parsita:
38 | Success(['foo', [['bar', ['1', '2', '3']]]])
39 |
40 | funcparserlib:
41 | ('"foo"', [('"bar"', ('1', ['2', '3']), [])])
42 |
43 | parsy:
44 | ['foo', [('bar', ['1', '2', '3'])]]
45 |
46 | parsimonious:
[parsimonious Node tree representation lost in capture]
269 | pyleri:
[pyleri result object representation lost in capture]
272 | textx:
[textx model object representation lost in capture]
274 | $
275 |
276 | """
277 |
278 | from __future__ import print_function
279 |
280 | from parsers import textparser_json
281 | from parsers import lark_json
282 | from parsers import
pyparsing_json 283 | from parsers import funcparserlib_json 284 | from parsers import parsimonious_json 285 | from parsers import textx_json 286 | 287 | try: 288 | from parsers import parsita_json 289 | except: 290 | 291 | class parsita_json(object): 292 | 293 | @staticmethod 294 | def parse(_json_string): 295 | return 'Import failed!' 296 | 297 | try: 298 | from parsers import parsy_json 299 | except: 300 | class parsy_json(object): 301 | 302 | @staticmethod 303 | def parse(_json_string): 304 | return 'Import failed!' 305 | 306 | try: 307 | from parsers import pyleri_json 308 | except: 309 | class pyleri_json(object): 310 | 311 | @staticmethod 312 | def parse(_json_string): 313 | return 'Import failed!' 314 | 315 | 316 | def parse(string): 317 | results = [ 318 | ('textparser', textparser_json.parse(string)), 319 | ('lark_lalr', lark_json.parse_lalr(string)), 320 | ('lark_earley', lark_json.parse_earley(string)), 321 | ('pyparsing', pyparsing_json.parse(string)), 322 | ('parsita', parsita_json.parse(string)), 323 | ('funcparserlib', funcparserlib_json.parse(string)), 324 | ('parsy', parsy_json.parse(string)), 325 | ('parsimonious', parsimonious_json.parse(string)), 326 | ('pyleri', pyleri_json.parse(string)), 327 | ('textx', textx_json.parse(string)) 328 | ] 329 | 330 | print('-----------------------------------------------------------------') 331 | print() 332 | print('Input string between BEGIN and END:') 333 | print() 334 | print('BEGIN') 335 | print(string, end='') 336 | print('END') 337 | print() 338 | 339 | for parser, parse_tree in results: 340 | print('{}:'.format(parser)) 341 | print(parse_tree) 342 | print() 343 | 344 | 345 | JSON_STRING = '''\ 346 | [ 347 | "foo", 348 | { 349 | "bar": [ 350 | 1, 351 | 2, 352 | 3 353 | ] 354 | } 355 | ] 356 | ''' 357 | 358 | 359 | parse(JSON_STRING) 360 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eerimoq/textparser/1ef809eb283da3c3ec7b8bc682f11eeada3a81d6/examples/benchmarks/json/parsers/__init__.py -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/funcparserlib_json.py: -------------------------------------------------------------------------------- 1 | """Based on 2 | https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py. 3 | 4 | """ 5 | 6 | import timeit 7 | import re 8 | 9 | from funcparserlib.lexer import make_tokenizer 10 | from funcparserlib.lexer import Token 11 | from funcparserlib.parser import some 12 | from funcparserlib.parser import a 13 | from funcparserlib.parser import maybe 14 | from funcparserlib.parser import many 15 | from funcparserlib.parser import skip 16 | from funcparserlib.parser import forward_decl 17 | from funcparserlib.parser import finished 18 | 19 | 20 | REGEXPS = { 21 | 'escaped': r''' 22 | \\ # Escape 23 | ((?P["\\/bfnrt]) # Standard escapes 24 | | (u(?P[0-9A-Fa-f]{4}))) # uXXXX 25 | ''', 26 | 'unescaped': r''' 27 | [^"\\] # Unescaped: avoid ["\\] 28 | ''' 29 | } 30 | 31 | 32 | def create_tokenizer(): 33 | specs = [ 34 | ('Space', (r'[ \t\r\n]+',)), 35 | ('String', (r'"(%(unescaped)s | %(escaped)s)*"' % REGEXPS, re.VERBOSE)), 36 | ('Number', (r''' 37 | -? # Minus 38 | (0|([1-9][0-9]*)) # Int 39 | (\.[0-9]+)? # Frac 40 | ([Ee][+-][0-9]+)? 
# Exp 41 | ''', re.VERBOSE)), 42 | ('Op', (r'[{}\[\]\-,:]',)), 43 | ('Name', (r'[A-Za-z_][A-Za-z_0-9]*',)), 44 | ] 45 | 46 | return make_tokenizer(specs) 47 | 48 | 49 | def tokenize(tokenizer, string): 50 | useless = ['Space'] 51 | 52 | return [x for x in tokenizer(string) if x.type not in useless] 53 | 54 | 55 | def create_grammar(): 56 | tokval = lambda x: x.value 57 | toktype = lambda t: some(lambda x: x.type == t) >> tokval 58 | op = lambda s: a(Token('Op', s)) >> tokval 59 | op_ = lambda s: skip(op(s)) 60 | n = lambda s: a(Token('Name', s)) >> tokval 61 | 62 | null = n('null') 63 | true = n('true') 64 | false = n('false') 65 | number = toktype('Number') 66 | string = toktype('String') 67 | value = forward_decl() 68 | member = string + op_(':') + value 69 | object_ = (op_('{') + 70 | maybe(member + many(op_(',') + member)) + 71 | op_('}')) 72 | array = (op_('[') + 73 | maybe(value + many(op_(',') + value)) + 74 | op_(']')) 75 | value.define(null 76 | | true 77 | | false 78 | | object_ 79 | | array 80 | | number 81 | | string) 82 | json_text = object_ | array 83 | json_file = json_text + skip(finished) 84 | 85 | return json_file 86 | 87 | 88 | def parse_time(json_string, iterations): 89 | grammar = create_grammar() 90 | tokenizer = create_tokenizer() 91 | 92 | def _parse(): 93 | grammar.parse(tokenize(tokenizer, json_string)) 94 | 95 | return timeit.timeit(_parse, number=iterations) 96 | 97 | 98 | def parse(json_string): 99 | grammar = create_grammar() 100 | tokenizer = create_tokenizer() 101 | 102 | return grammar.parse(tokenize(tokenizer, json_string)) 103 | 104 | 105 | def version(): 106 | return 'unknown' 107 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/lark_json.py: -------------------------------------------------------------------------------- 1 | """Based on 2 | https://github.com/lark-parser/lark/blob/master/docs/json_tutorial.md. 
3 | 4 | """ 5 | 6 | import timeit 7 | 8 | import lark 9 | from lark import Lark 10 | 11 | 12 | LARK_JSON_GRAMMAR = r""" 13 | ?value: dict 14 | | list 15 | | string 16 | | SIGNED_NUMBER 17 | | "true" 18 | | "false" 19 | | "null" 20 | 21 | list : "[" [value ("," value)*] "]" 22 | 23 | dict : "{" [pair ("," pair)*] "}" 24 | pair : string ":" value 25 | 26 | string : ESCAPED_STRING 27 | 28 | %import common.ESCAPED_STRING 29 | %import common.SIGNED_NUMBER 30 | %import common.WS 31 | %ignore WS 32 | """ 33 | 34 | 35 | def parse_time_lalr(json_string, iterations): 36 | parser = Lark(LARK_JSON_GRAMMAR, 37 | start='value', 38 | lexer='standard', 39 | parser='lalr') 40 | 41 | def _parse(): 42 | parser.parse(json_string) 43 | 44 | return timeit.timeit(_parse, number=iterations) 45 | 46 | 47 | def parse_time_earley(json_string, iterations): 48 | parser = Lark(LARK_JSON_GRAMMAR, 49 | start='value', 50 | lexer='standard', 51 | parser='earley') 52 | 53 | def _parse(): 54 | parser.parse(json_string) 55 | 56 | return timeit.timeit(_parse, number=iterations) 57 | 58 | 59 | def parse_lalr(json_string): 60 | parser = Lark(LARK_JSON_GRAMMAR, 61 | start='value', 62 | lexer='standard', 63 | parser='lalr') 64 | 65 | return parser.parse(json_string) 66 | 67 | 68 | def parse_earley(json_string): 69 | parser = Lark(LARK_JSON_GRAMMAR, 70 | start='value', 71 | lexer='standard', 72 | parser='earley') 73 | 74 | return parser.parse(json_string) 75 | 76 | 77 | def version(): 78 | return lark.__version__ 79 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/parsimonious_json.py: -------------------------------------------------------------------------------- 1 | """Based on 2 | https://gist.github.com/goodmami/686385b4b39a3bac00fbbe78a5cda6c8, by 3 | Michael Wayne Goodman. 4 | 5 | """ 6 | 7 | import timeit 8 | 9 | from parsimonious.grammar import Grammar 10 | 11 | 12 | grammar = Grammar( 13 | r""" 14 | Start = ~"\s*" ( Object / Array ) ~"\s*" 15 | Object = ~"{\s*" Members? ~"\s*}" 16 | Members = MappingComma* Mapping 17 | MappingComma = Mapping ~"\s*,\s*" 18 | Mapping = DQString ~"\s*:\s*" Value 19 | Array = ~"\[\s*" Items? ~"\s*\]" 20 | Items = ValueComma* Value 21 | ValueComma = Value ~"\s*,\s*" 22 | Value = Object / Array / DQString 23 | / TrueVal / FalseVal / NullVal / Float / Integer 24 | TrueVal = "true" 25 | FalseVal = "false" 26 | NullVal = "null" 27 | DQString = ~"\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"" 28 | Float = ~"[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?" 29 | Integer = ~"[-+]?\d+" 30 | """) 31 | 32 | 33 | def parse_time(json_string, iterations): 34 | def _parse(): 35 | grammar.parse(json_string) 36 | 37 | return timeit.timeit(_parse, number=iterations) 38 | 39 | 40 | def parse(json_string): 41 | return grammar.parse(json_string) 42 | 43 | 44 | def version(): 45 | return 'unknown' 46 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/parsita_json.py: -------------------------------------------------------------------------------- 1 | """Based on 2 | https://github.com/drhagen/parsita/blob/master/examples/json.py. 
3 | 4 | """ 5 | 6 | import timeit 7 | 8 | from parsita import TextParsers 9 | from parsita import lit 10 | from parsita import reg 11 | from parsita import rep 12 | from parsita import repsep 13 | 14 | 15 | class JsonStringParsers(TextParsers, whitespace=None): 16 | quote = lit(r'\"') 17 | reverse_solidus = lit(r'\\') 18 | solidus = lit(r'\/') 19 | backspace = lit(r'\b') 20 | form_feed = lit(r'\f') 21 | line_feed = lit(r'\n') 22 | carriage_return = lit(r'\r') 23 | tab = lit(r'\t') 24 | uni = reg(r'\\u([0-9a-fA-F]{4})') 25 | 26 | escaped = (quote | reverse_solidus | solidus | backspace | form_feed | 27 | line_feed | carriage_return | tab | uni) 28 | unescaped = reg(r'[\u0020-\u0021\u0023-\u005B\u005D-\U0010FFFF]+') 29 | 30 | string = '"' >> rep(escaped | unescaped) << '"' > ''.join 31 | 32 | 33 | class JsonParsers(TextParsers, whitespace=r'[ \t\n\r]*'): 34 | number = reg(r'-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?') 35 | 36 | false = lit('false') 37 | true = lit('true') 38 | null = lit('null') 39 | 40 | string = JsonStringParsers.string 41 | 42 | array = '[' >> repsep(value, ',') << ']' 43 | 44 | entry = string << ':' & value 45 | obj = '{' >> repsep(entry, ',') << '}' 46 | 47 | value = (number 48 | | false 49 | | true 50 | | null 51 | | string 52 | | array 53 | | obj) 54 | 55 | 56 | def parse_time(json_string, iterations): 57 | def _parse(): 58 | JsonParsers.value.parse(json_string) 59 | 60 | return timeit.timeit(_parse, number=iterations) 61 | 62 | 63 | def parse(json_string): 64 | return JsonParsers.value.parse(json_string) 65 | 66 | 67 | def version(): 68 | return 'unknown' 69 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/parsy_json.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | 3 | import parsy 4 | from parsy import generate 5 | from parsy import regex 6 | from parsy import string 7 | 8 | 9 | whitespace = regex(r'\s*') 10 | lexeme = lambda p: p << whitespace 11 | lbrace = lexeme(string('{')) 12 | rbrace = lexeme(string('}')) 13 | lbrack = lexeme(string('[')) 14 | rbrack = lexeme(string(']')) 15 | colon = lexeme(string(':')) 16 | comma = lexeme(string(',')) 17 | true = lexeme(string('true')) 18 | false = lexeme(string('false')) 19 | null = lexeme(string('null')) 20 | number = lexeme( 21 | regex(r'-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?') 22 | ) 23 | string_part = regex(r'[^"\\]+') 24 | string_esc = string('\\') >> ( 25 | string('\\') 26 | | string('/') 27 | | string('"') 28 | | string('b') 29 | | string('f') 30 | | string('n') 31 | | string('r') 32 | | string('t') 33 | | regex(r'u[0-9a-fA-F]{4}').map(lambda s: chr(int(s[1:], 16))) 34 | ) 35 | quoted = lexeme(string('"') >> (string_part | string_esc).many().concat() << string('"')) 36 | 37 | 38 | # Circular dependency between array and value means we use `generate` 39 | # form here. 
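# (The decorated generator below runs only when a parse is executed, so its
# reference to `value`, which is assigned further down at module level, is
# bound by the time it is needed.)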
40 | @generate 41 | def array(): 42 | yield lbrack 43 | elements = yield value.sep_by(comma) 44 | yield rbrack 45 | 46 | return elements 47 | 48 | 49 | @generate 50 | def object_pair(): 51 | key = yield quoted 52 | yield colon 53 | val = yield value 54 | 55 | return (key, val) 56 | 57 | 58 | json_object = lbrace >> object_pair.sep_by(comma) << rbrace 59 | value = quoted | number | json_object | array | true | false | null 60 | json = whitespace >> value 61 | 62 | 63 | def parse_time(json_string, iterations): 64 | def _parse(): 65 | json.parse(json_string) 66 | 67 | return timeit.timeit(_parse, number=iterations) 68 | 69 | 70 | def parse(json_string): 71 | return json.parse(json_string) 72 | 73 | 74 | def version(): 75 | return parsy.__version__ 76 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/pyleri_json.py: -------------------------------------------------------------------------------- 1 | """Based on 2 | https://github.com/transceptor-technology/pyleri/blob/master/examples/json_grammar.py. 3 | 4 | """ 5 | 6 | import timeit 7 | 8 | import pyleri 9 | from pyleri import Ref 10 | from pyleri import Choice 11 | from pyleri import Grammar 12 | from pyleri import Regex 13 | from pyleri import Keyword 14 | from pyleri import Sequence 15 | from pyleri import List 16 | 17 | 18 | class JsonGrammar(Grammar): 19 | START = Ref() 20 | 21 | # JSON strings should be enclosed in double quotes. 22 | # A backslash can be used as escape character. 23 | r_string = Regex('(")(?:(?=(\\\?))\\2.)*?\\1') 24 | 25 | # JSON does not support floats or integers prefixed with a + sign 26 | # and floats must start with a number, for example .5 is not allowed 27 | # but should be written like 0.5 28 | r_float = Regex('-?[0-9]+\.?[0-9]+') 29 | r_integer = Regex('-?[0-9]+') 30 | 31 | k_true = Keyword('true') 32 | k_false = Keyword('false') 33 | k_null = Keyword('null') 34 | 35 | json_map_item = Sequence(r_string, ':', START) 36 | 37 | json_map = Sequence('{', List(json_map_item), '}') 38 | json_array = Sequence('[', List(START), ']') 39 | 40 | START = Choice(r_string, 41 | r_float, 42 | r_integer, 43 | k_true, 44 | k_false, 45 | k_null, 46 | json_map, 47 | json_array) 48 | 49 | 50 | def parse_time(json_string, iterations): 51 | grammar = JsonGrammar() 52 | 53 | def _parse(): 54 | grammar.parse(json_string) 55 | 56 | return timeit.timeit(_parse, number=iterations) 57 | 58 | 59 | def parse(json_string): 60 | return JsonGrammar().parse(json_string) 61 | 62 | 63 | def version(): 64 | return pyleri.__version__ 65 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/pyparsing_json.py: -------------------------------------------------------------------------------- 1 | """Based on http://pyparsing.wikispaces.com/file/view/jsonParser.py. 
2 | 3 | """ 4 | 5 | import timeit 6 | 7 | import pyparsing 8 | from pyparsing import Keyword 9 | from pyparsing import Suppress 10 | from pyparsing import dblQuotedString 11 | from pyparsing import Forward 12 | from pyparsing import Group 13 | from pyparsing import delimitedList 14 | from pyparsing import Optional 15 | from pyparsing import pyparsing_common 16 | from pyparsing import Dict 17 | 18 | 19 | def create_grammar(): 20 | TRUE = Keyword('true') 21 | FALSE = Keyword('false') 22 | NULL = Keyword('null') 23 | 24 | LBRACK, RBRACK, LBRACE, RBRACE, COLON = map(Suppress, '[]{}:') 25 | 26 | string = dblQuotedString() 27 | number = pyparsing_common.number() 28 | 29 | object_ = Forward() 30 | value = Forward() 31 | elements = delimitedList(value) 32 | array = Group(LBRACK + Optional(elements, []) + RBRACK) 33 | value <<= (string 34 | | number 35 | | Group(object_) 36 | | array 37 | | TRUE 38 | | FALSE 39 | | NULL) 40 | member = Group(string + COLON + value) 41 | members = delimitedList(member) 42 | object_ <<= Dict(LBRACE + Optional(members) + RBRACE) 43 | 44 | return value 45 | 46 | 47 | def parse_time(json_string, iterations=1): 48 | grammar = create_grammar() 49 | 50 | def _parse(): 51 | grammar.parseString(json_string) 52 | 53 | return timeit.timeit(_parse, number=iterations) 54 | 55 | 56 | def parse(json_string): 57 | grammar = create_grammar() 58 | 59 | return grammar.parseString(json_string).asList() 60 | 61 | 62 | def version(): 63 | return pyparsing.__version__ 64 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/textparser_json.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | 3 | import textparser 4 | from textparser import Forward 5 | from textparser import Sequence 6 | from textparser import DelimitedList 7 | from textparser import choice 8 | from textparser import Optional 9 | 10 | 11 | class Parser(textparser.Parser): 12 | 13 | def token_specs(self): 14 | return [ 15 | ('SKIP', r'[ \r\n\t]+'), 16 | ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'), 17 | ('TRUE', r'true'), 18 | ('FALSE', r'false'), 19 | ('NULL', r'null'), 20 | ('ESCAPED_STRING', r'"(\\"|[^"])*?"'), 21 | ('LPAREN', '(', r'\('), 22 | ('RPAREN', ')', r'\)'), 23 | ('LBRACKET', '[', r'\['), 24 | ('RBRACKET', ']', r'\]'), 25 | ('LBRACE', '{', r'\{'), 26 | ('RBRACE', '}', r'\}'), 27 | ('COMMA', ',', r','), 28 | ('COLON', ':', r':'), 29 | ('MISMATCH', r'.') 30 | ] 31 | 32 | def grammar(self): 33 | value = Forward() 34 | list_ = Sequence('[', Optional(DelimitedList(value)), ']') 35 | pair = Sequence('ESCAPED_STRING', ':', value) 36 | dict_ = Sequence('{', Optional(DelimitedList(pair)), '}') 37 | value <<= choice(list_, 38 | dict_, 39 | 'ESCAPED_STRING', 40 | 'NUMBER', 41 | 'TRUE', 42 | 'FALSE', 43 | 'NULL') 44 | 45 | return value 46 | 47 | 48 | def parse_time(json_string, iterations): 49 | parser = Parser() 50 | 51 | def _parse(): 52 | parser.parse(json_string) 53 | 54 | return timeit.timeit(_parse, number=iterations) 55 | 56 | 57 | def parse(json_string): 58 | return Parser().parse(json_string) 59 | 60 | 61 | def version(): 62 | return textparser.__version__ 63 | -------------------------------------------------------------------------------- /examples/benchmarks/json/parsers/textx_json.py: -------------------------------------------------------------------------------- 1 | """Based on 2 | https://github.com/igordejanovic/textX/tree/master/examples/json. 
3 |
4 | """
5 |
6 | import timeit
7 |
8 | import textx
9 | from textx import metamodel_from_str
10 |
11 |
12 | GRAMMAR = '''\
13 | /*
14 | A grammar for JSON data-interchange format.
15 | See: http://www.json.org/
16 | */
17 | File:
18 |     Array | Object
19 | ;
20 |
21 | Array:
22 |     "[" values*=Value[','] "]"
23 | ;
24 |
25 | Value:
26 |     STRING | FLOAT | BOOL | Object | Array | "null"
27 | ;
28 |
29 | Object:
30 |     "{" members*=Member[','] "}"
31 | ;
32 |
33 | Member:
34 |     key=STRING ':' value=Value
35 | ;
36 | '''
37 |
38 |
39 | def parse_time(json_string, iterations):
40 |     parser = metamodel_from_str(GRAMMAR)
41 |
42 |     def _parse():
43 |         parser.model_from_str(json_string)
44 |
45 |     return timeit.timeit(_parse, number=iterations)
46 |
47 |
48 | def parse(json_string):
49 |     parser = metamodel_from_str(GRAMMAR)
50 |
51 |     return parser.model_from_str(json_string)
52 |
53 |
54 | def version():
55 |     return textx.__version__
56 |
--------------------------------------------------------------------------------
/examples/benchmarks/json/speed.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """A benchmark comparing the speed of 10 parsers.
4 |
5 | Test data generated with https://www.json-generator.com.
6 |
7 | Example execution:
8 |
9 | $ env PYTHONPATH=. python3 examples/benchmarks/json/speed.py
10 | Parsed 'examples/benchmarks/json/data.json' 1 time(s) in:
11 |
12 | PACKAGE        SECONDS  RATIO VERSION
13 | textparser        0.10   100% 0.17.0
14 | parsimonious      0.17   174% unknown
15 | lark (LALR)       0.25   253% 0.6.4
16 | funcparserlib     0.33   335% unknown
17 | textx             0.51   520% 1.7.1
18 | pyparsing         0.65   654% 2.2.0
19 | pyleri            0.78   786% 1.2.2
20 | parsy             0.92   931% 1.2.0
21 | lark (Earley)     1.80  1816% 0.6.4
22 | parsita           2.22  2251% unknown
23 | $
24 |
25 | """
26 |
27 | from __future__ import print_function
28 |
29 | import os
30 |
31 | from parsers import textparser_json
32 | from parsers import lark_json
33 | from parsers import pyparsing_json
34 | from parsers import funcparserlib_json
35 | from parsers import parsimonious_json
36 | from parsers import textx_json
37 |
38 | try:
39 |     from parsers import parsita_json
40 | except ImportError:
41 |     class parsita_json(object):
42 |
43 |         @staticmethod
44 |         def parse_time(_json_string, _iterations):
45 |             return float('inf')
46 |
47 |         @staticmethod
48 |         def version():
49 |             return 'unknown'
50 |
51 | try:
52 |     from parsers import parsy_json
53 | except ImportError:
54 |     class parsy_json(object):
55 |
56 |         @staticmethod
57 |         def parse_time(_json_string, _iterations):
58 |             return float('inf')
59 |
60 |         @staticmethod
61 |         def version():
62 |             return 'unknown'
63 |
64 | try:
65 |     from parsers import pyleri_json
66 | except ImportError:
67 |     class pyleri_json(object):
68 |
69 |         @staticmethod
70 |         def parse_time(_json_string, _iterations):
71 |             return float('inf')
72 |
73 |         @staticmethod
74 |         def version():
75 |             return 'unknown'
76 |
77 |
78 | SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
79 | DATA_JSON = os.path.relpath(os.path.join(SCRIPT_DIR, 'data.json'))
80 |
81 | ITERATIONS = 1
82 |
83 |
84 | with open(DATA_JSON, 'r') as fin:
85 |     JSON_STRING = fin.read()
86 |
87 | textparser_time = textparser_json.parse_time(JSON_STRING, ITERATIONS)
88 | lark_lalr_time = lark_json.parse_time_lalr(JSON_STRING, ITERATIONS)
89 | lark_earley_time = lark_json.parse_time_earley(JSON_STRING, ITERATIONS)
90 | pyparsing_time = pyparsing_json.parse_time(JSON_STRING, ITERATIONS)
91 | parsita_time = parsita_json.parse_time(JSON_STRING, ITERATIONS)
92 | funcparserlib_time = funcparserlib_json.parse_time(JSON_STRING, ITERATIONS)
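# Parsers that failed to import above report float('inf') from
# parse_time(), so they sort last and print an 'inf' ratio instead of
# aborting the whole benchmark.  A hypothetical, more compact variant
# of the same import-fallback pattern (load_parser and the stub are
# made-up names, not part of this script):
#
#     import importlib
#
#     def load_parser(name):
#         try:
#             return importlib.import_module('parsers.' + name)
#         except ImportError:
#             return MISSING_PARSER_STUB  # parse_time() returns inf.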
93 | parsy_time = parsy_json.parse_time(JSON_STRING, ITERATIONS)
94 | parsimonious_time = parsimonious_json.parse_time(JSON_STRING, ITERATIONS)
95 | pyleri_time = pyleri_json.parse_time(JSON_STRING, ITERATIONS)
96 | textx_time = textx_json.parse_time(JSON_STRING, ITERATIONS)
97 |
98 | # Parse-comparison output.
99 | measurements = [
100 |     ('textparser', textparser_time, textparser_json.version()),
101 |     ('lark (LALR)', lark_lalr_time, lark_json.version()),
102 |     ('lark (Earley)', lark_earley_time, lark_json.version()),
103 |     ('pyparsing', pyparsing_time, pyparsing_json.version()),
104 |     ('parsita', parsita_time, parsita_json.version()),
105 |     ('funcparserlib', funcparserlib_time, funcparserlib_json.version()),
106 |     ('parsy', parsy_time, parsy_json.version()),
107 |     ('parsimonious', parsimonious_time, parsimonious_json.version()),
108 |     ('pyleri', pyleri_time, pyleri_json.version()),
109 |     ('textx', textx_time, textx_json.version())
110 | ]
111 |
112 | measurements = sorted(measurements, key=lambda m: m[1])
113 |
114 | print()
115 | print("Parsed '{}' {} time(s) in:".format(DATA_JSON, ITERATIONS))
116 | print()
117 | print('PACKAGE        SECONDS  RATIO VERSION')
118 |
119 | for package, seconds, version in measurements:
120 |     try:
121 |         ratio = int(round(100 * (seconds / textparser_time), 0))
122 |         ratio = '{:5}'.format(ratio)
123 |     except OverflowError:
124 |         ratio = '  inf'
125 |
126 |     print('{:14s} {:7.02f} {}% {}'.format(package,
127 |                                           seconds,
128 |                                           ratio,
129 |                                           version))
130 |
--------------------------------------------------------------------------------
/examples/hello_world.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | #
3 | # $ env PYTHONPATH=.. python3 hello_world.py
4 | # Tree: ['Hello', ',', 'World', '!']
5 | #
6 |
7 | import textparser
8 | from textparser import Sequence
9 |
10 |
11 | class Parser(textparser.Parser):
12 |
13 |     def token_specs(self):
14 |         return [
15 |             ('SKIP', r'[ \r\n\t]+'),
16 |             ('WORD', r'\w+'),
17 |             ('EMARK', '!', r'!'),
18 |             ('COMMA', ',', r','),
19 |             ('MISMATCH', r'.')
20 |         ]
21 |
22 |     def grammar(self):
23 |         return Sequence('WORD', ',', 'WORD', '!')
24 |
25 |
26 | tree = Parser().parse('Hello, World!')
27 |
28 | print('Tree:', tree)
29 |
--------------------------------------------------------------------------------
/examples/json.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """A JSON example of how to transform a parse tree of tokens into
4 | lists, dicts, floats, booleans and None.
5 |
6 | NOTE: The parse tree transformation is implemented as a separate step
7 | after parsing. Making the transformation part of the parser is
8 | probably desirable, but there are currently no plans to do so because
9 | there is no use case at the moment (for me, probably the only user).
10 |
11 | $ env PYTHONPATH=.
python3 examples/json.py 12 | {'number': 0.11, 'false': False, 'true': True, 'null': None, 'list': [None, 'string']} 13 | 14 | """ 15 | 16 | import textparser 17 | from textparser import Forward 18 | from textparser import Sequence 19 | from textparser import DelimitedList 20 | from textparser import choice 21 | from textparser import Optional 22 | 23 | 24 | JSON_TEXT = '''\ 25 | { 26 | "number": 0.11, 27 | "false": false, 28 | "true": true, 29 | "null": null, 30 | "list": [null, "string"] 31 | } 32 | ''' 33 | 34 | 35 | class Parser(textparser.Parser): 36 | 37 | def token_specs(self): 38 | return [ 39 | ('SKIP', r'[ \r\n\t]+'), 40 | ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'), 41 | ('TRUE', r'true'), 42 | ('FALSE', r'false'), 43 | ('NULL', r'null'), 44 | ('ESCAPED_STRING', r'"(\\"|[^"])*?"'), 45 | ('LPAREN', '(', r'\('), 46 | ('RPAREN', ')', r'\)'), 47 | ('LBRACKET', '[', r'\['), 48 | ('RBRACKET', ']', r'\]'), 49 | ('LBRACE', '{', r'\{'), 50 | ('RBRACE', '}', r'\}'), 51 | ('COMMA', ',', r','), 52 | ('COLON', ':', r':'), 53 | ('MISMATCH', r'.') 54 | ] 55 | 56 | def grammar(self): 57 | value = Forward() 58 | list_ = Sequence('[', Optional(DelimitedList(value)), ']') 59 | pair = Sequence('ESCAPED_STRING', ':', value) 60 | dict_ = Sequence('{', Optional(DelimitedList(pair)), '}') 61 | value <<= choice(list_, 62 | dict_, 63 | 'ESCAPED_STRING', 64 | 'NUMBER', 65 | 'TRUE', 66 | 'FALSE', 67 | 'NULL') 68 | 69 | return value 70 | 71 | 72 | def transform(token): 73 | if isinstance(token, list): 74 | if token[0].kind == '{': 75 | if len(token[1]) > 0: 76 | return { 77 | key.value[1:-1]: transform(v) 78 | for key, _, v in token[1][0] 79 | } 80 | else: 81 | return {} 82 | else: 83 | if len(token[1]) > 0: 84 | return [transform(elem) for elem in token[1][0]] 85 | else: 86 | return [] 87 | elif token.kind == 'ESCAPED_STRING': 88 | return token.value[1:-1] 89 | elif token.kind == 'NUMBER': 90 | return float(token.value) 91 | elif token.kind == 'TRUE': 92 | return True 93 | elif token.kind == 'FALSE': 94 | return False 95 | else: 96 | return None 97 | 98 | 99 | print(transform(Parser().parse(JSON_TEXT, token_tree=True))) 100 | -------------------------------------------------------------------------------- /examples/proto3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # $ env PYTHONPATH=.. 
python3 proto3.py 4 | # Tree: [['syntax', '=', '"proto3"', ';'], 5 | # [['import', ['public'], '"foo.bar"', ';'], 6 | # ['option', ['java_package', []], '=', '"com.example.foo"', ';'], 7 | # ['option', ['java_multiple_files', []], '=', ['true'], ';'], 8 | # ['enum', 9 | # 'EnumAllowingAlias', 10 | # ['{', 11 | # [['option', ['allow_alias', []], '=', ['true'], ';'], 12 | # ['UNKNOWN', '=', '0', [], ';'], 13 | # ['STARTED', '=', '1', [], ';'], 14 | # ['RUNNING', 15 | # '=', 16 | # '2', 17 | # [['[', 18 | # [[[['(', ['custom_option'], ')'], []], '=', '"hello world"']], 19 | # ']']], 20 | # ';']], 21 | # '}']], 22 | # ['message', 23 | # 'outer', 24 | # ['{', 25 | # [['option', [['(', ['my_option'], ')'], [['.', 'a']]], '=', ['true'], ';'], 26 | # [[], 27 | # [[], ['int32']], 28 | # 'old_field', 29 | # '=', 30 | # '1', 31 | # [['[', [[['deprecated', []], '=', ['true']]], ']']], 32 | # ';'], 33 | # ['message', 34 | # 'inner', 35 | # ['{', [[[], [[], ['int64']], 'ival', '=', '2', [], ';']], '}']], 36 | # [['repeated'], [[], ['inner']], 'inner_message', '=', '3', [], ';'], 37 | # [[], [[], ['EnumAllowingAlias']], 'enum_field', '=', '4', [], ';'], 38 | # ['map', 39 | # '<', 40 | # 'int32', 41 | # ',', 42 | # [[], ['string']], 43 | # '>', 44 | # 'my_map', 45 | # '=', 46 | # '5', 47 | # [], 48 | # ';'], 49 | # [[], [[], ['foo', 'bar', 'Open']], 'open', '=', '6', [], ';'], 50 | # [[], [['.'], ['foo', 'bar', 'Close']], 'close', '=', '7', [], ';'], 51 | # ['oneof', 52 | # 'test_oneof', 53 | # '{', 54 | # [[[[], ['string']], 'name', '=', '8', [], ';'], 55 | # [[[], ['SubMessage']], 'sub_message', '=', '9', [], ';']], 56 | # '}'], 57 | # ['reserved', [['2', '15', '9'], [['to', '11']]], ';'], 58 | # ['reserved', [['7'], []], ';'], 59 | # ['reserved', [['15'], [['to', 'max']]], ';']], 60 | # '}']], 61 | # ['service', 62 | # 'SearchService', 63 | # '{', 64 | # [['rpc', 65 | # 'Search', 66 | # '(', 67 | # [], 68 | # 'SearchRequest', 69 | # ')', 70 | # 'returns', 71 | # '(', 72 | # [], 73 | # 'SearchResponse', 74 | # ')', 75 | # ';']], 76 | # '}']]] 77 | # 78 | 79 | from pprint import pformat 80 | 81 | import textparser 82 | from textparser import Sequence 83 | from textparser import ZeroOrMore 84 | from textparser import choice 85 | from textparser import Optional 86 | from textparser import DelimitedList 87 | from textparser import Forward 88 | 89 | 90 | class Parser(textparser.Parser): 91 | 92 | def keywords(self): 93 | return set([ 94 | 'syntax', 95 | 'import', 96 | 'public', 97 | 'option', 98 | 'enum', 99 | 'bool', 100 | 'string', 101 | 'message', 102 | 'rpc', 103 | 'service', 104 | 'returns', 105 | 'repeated', 106 | 'map', 107 | 'package', 108 | 'stream', 109 | 'weak', 110 | 'oneof', 111 | 'reserved', 112 | 'to', 113 | 'int32', 114 | 'int64', 115 | 'uint32', 116 | 'uint64', 117 | 'sint32', 118 | 'sint64', 119 | 'fixed32', 120 | 'fixed64', 121 | 'sfixed32', 122 | 'sfixed64', 123 | 'true', 124 | 'false', 125 | 'min', 126 | 'max' 127 | ]) 128 | 129 | def token_specs(self): 130 | decimals = r'[0-9]+' 131 | exponent = r'[eE][+-]?[0-9]+' 132 | re_float = r'{d}\.[0-9]?({e})?|{d}({e})?|\.{d}({e})?|inf|nan'.format( 133 | d=decimals, 134 | e=exponent) 135 | 136 | return [ 137 | ('SKIP', r'[ \r\n\t]+|//[\s\S]*?\n'), 138 | ('ESCAPED_STRING', r'"(\\"|[^"])*?"'), 139 | ('INT', r'[1-9][0-9]*|0[0-7]*|0[xX][0-9a-fA-F]+'), 140 | ('FLOAT', re_float), 141 | ('IDENT', r'[a-zA-Z][a-zA-Z0-9_]*'), 142 | ('DOT', '.', r'\.'), 143 | ('COMMA', ',', r','), 144 | ('SCOLON', ';', r';'), 145 | ('EQ', '=', r'='), 146 | ('LT', '<', 
r'<'), 147 | ('GT', '>', r'>'), 148 | ('LBRACE', '{', r'\{'), 149 | ('RBRACE', '}', r'\}'), 150 | ('LBRACK', '[', r'\['), 151 | ('RBRACK', ']', r'\]'), 152 | ('LPAREN', '(', r'\('), 153 | ('RPAREN', ')', r'\)'), 154 | ('MISMATCH', r'.') 155 | ] 156 | 157 | def grammar(self): 158 | message = Forward() 159 | rpc = Forward() 160 | 161 | ident = choice(*(list(self.keywords()) + ['IDENT'])) 162 | full_ident = DelimitedList(ident, delim='.') 163 | 164 | # Constant. 165 | constant = choice(full_ident, 166 | Sequence(Optional(choice('-', '+')), 'INT'), 167 | Sequence(Optional(choice('-', '+')), 'FLOAT'), 168 | 'ESCAPED_STRING', 169 | 'true', 170 | 'false') 171 | 172 | # Syntax. 173 | syntax = Sequence('syntax', '=', 'ESCAPED_STRING', ';') 174 | 175 | # Import statement. 176 | import_ = Sequence('import', 177 | Optional(choice('weak', 'public')), 178 | 'ESCAPED_STRING', ';') 179 | 180 | # Package. 181 | package = Sequence('package', full_ident, ';') 182 | 183 | # Option. 184 | option_name = Sequence(choice(ident, Sequence('(', full_ident, ')')), 185 | ZeroOrMore(Sequence('.', ident))) 186 | option = Sequence('option', option_name, '=', constant, ';') 187 | 188 | # Fields. 189 | type_ = choice(Sequence(Optional('.'), DelimitedList(ident, '.')), 190 | ident) 191 | field_number = 'INT' 192 | 193 | # Normal field. 194 | field_option = Sequence(option_name, '=', constant) 195 | field_options = DelimitedList(field_option) 196 | field = Sequence(Optional('repeated'), 197 | type_, ident, '=', field_number, 198 | Optional(Sequence('[', field_options, ']')), 199 | ';') 200 | 201 | # Oneof and oneof field. 202 | oneof_field = Sequence(type_, ident, '=', field_number, 203 | Optional(Sequence('[', field_options, ']')), 204 | ';') 205 | oneof = Sequence('oneof', ident, 206 | '{', 207 | ZeroOrMore(choice(oneof_field, ';')), 208 | '}') 209 | 210 | # Map field. 211 | key_type = choice('int32', 212 | 'int64', 213 | 'uint32', 214 | 'uint64', 215 | 'sint32', 216 | 'sint64', 217 | 'fixed32', 218 | 'fixed64', 219 | 'sfixed32', 220 | 'sfixed64', 221 | 'bool', 222 | 'string') 223 | map_field = Sequence('map', '<', key_type, ',', type_, '>', 224 | ident, '=', field_number, 225 | Optional(Sequence('[', field_options, ']')), 226 | ';') 227 | 228 | # Reserved. 229 | field_names = DelimitedList(ident) 230 | ranges = Sequence(DelimitedList('INT'), 231 | Optional(Sequence('to', choice('INT', 'max')))) 232 | reserved = Sequence('reserved', choice(ranges, field_names), ';') 233 | 234 | # Enum definition. 235 | enum_value_option = Sequence(option_name, '=', constant) 236 | enum_field = Sequence( 237 | ident, '=', 'INT', 238 | Optional(Sequence('[', DelimitedList(enum_value_option), ']')), 239 | ';') 240 | enum_body = Sequence('{', 241 | ZeroOrMore(choice(option, enum_field, ';')), 242 | '}') 243 | enum = Sequence('enum', ident, enum_body) 244 | 245 | # Message definition. 246 | message_body = Sequence('{', 247 | ZeroOrMore(choice(field, 248 | enum, 249 | message, 250 | option, 251 | oneof, 252 | map_field, 253 | reserved, 254 | ';')), 255 | '}') 256 | message <<= Sequence('message', ident, message_body) 257 | 258 | # Service definition. 259 | service = Sequence('service', ident, 260 | '{', 261 | ZeroOrMore(choice(option, rpc, ';')), 262 | '}') 263 | rpc <<= Sequence('rpc', ident, 264 | '(', 265 | Optional('stream'), ident, 266 | ')', 267 | 'returns', 268 | '(', 269 | Optional('stream'), ident, 270 | ')', 271 | choice(Sequence('{', 272 | ZeroOrMore(choice(option, ';')), 273 | '}'), 274 | ';')) 275 | 276 | # Proto file. 
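# (`message` and `rpc` were declared as Forward() at the top of this
# method and bound with `<<=` above; that indirection is what lets
# messages nest recursively and services reference rpc definitions.)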
277 |     proto = Sequence(syntax,
278 |                      ZeroOrMore(choice(import_,
279 |                                        package,
280 |                                        option,
281 |                                        message,
282 |                                        enum,
283 |                                        service,
284 |                                        ';')))
285 |
286 |     return proto
287 |
288 |
289 | proto_string = '''
290 | syntax = "proto3";
291 |
292 | import public "foo.bar";
293 |
294 | option java_package = "com.example.foo";
295 | option java_multiple_files = true;
296 |
297 | enum EnumAllowingAlias {
298 |     option allow_alias = true;
299 |     UNKNOWN = 0;
300 |     STARTED = 1;
301 |     RUNNING = 2 [(custom_option) = "hello world"];
302 | }
303 |
304 | message outer {
305 |     option (my_option).a = true;
306 |     int32 old_field = 1 [deprecated=true];
307 |     message inner { // Level 2
308 |         int64 ival = 2;
309 |     }
310 |     repeated inner inner_message = 3;
311 |     EnumAllowingAlias enum_field =4;
312 |     map<int32, string> my_map = 5;
313 |     foo.bar.Open open = 6;
314 |     .foo.bar.Close close = 7;
315 |     oneof test_oneof {
316 |         string name = 8;
317 |         SubMessage sub_message = 9;
318 |     }
319 |     reserved 2, 15, 9 to 11;
320 |     reserved 7;
321 |     reserved 15 to max;
322 | }
323 |
324 | service SearchService {
325 |     rpc Search (SearchRequest) returns (SearchResponse);
326 | }
327 | '''
328 |
329 | tree = Parser().parse(proto_string)
330 |
331 | print('Tree:', pformat(tree))
332 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | lark-parser
2 | pyparsing
3 | parsita
4 | funcparserlib
5 | parsy
6 | parsimonious
7 | textx
8 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from setuptools import setup
4 | from setuptools import find_packages
5 | import re
6 |
7 |
8 | def find_version():
9 |     return re.search(r"^__version__ = '(.*)'$",
10 |                      open('textparser.py', 'r').read(),
11 |                      re.MULTILINE).group(1)
12 |
13 |
14 | setup(name='textparser',
15 |       version=find_version(),
16 |       description='Text parser.',
17 |       long_description=open('README.rst', 'r').read(),
18 |       author='Erik Moqvist',
19 |       author_email='erik.moqvist@gmail.com',
20 |       license='MIT',
21 |       classifiers=[
22 |           'License :: OSI Approved :: MIT License',
23 |           'Programming Language :: Python :: 2',
24 |           'Programming Language :: Python :: 3',
25 |       ],
26 |       keywords=['parser', 'parsing'],
27 |       url='https://github.com/eerimoq/textparser',
28 |       py_modules=['textparser'],
29 |       test_suite="tests")
30 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eerimoq/textparser/1ef809eb283da3c3ec7b8bc682f11eeada3a81d6/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_textparser.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from collections import namedtuple
3 |
4 | import textparser
5 | from textparser import Grammar
6 | from textparser import Sequence
7 | from textparser import Choice
8 | from textparser import choice
9 | from textparser import ChoiceDict
10 | from textparser import ZeroOrMore
11 | from textparser import ZeroOrMoreDict
12 | from textparser import OneOrMore
13 | from textparser import OneOrMoreDict
14 | from textparser import DelimitedList
15 | from textparser import Token
16 | from textparser import TokenizeError
17 | from textparser import tokenize_init 18 | from textparser import Any 19 | from textparser import AnyUntil 20 | from textparser import Optional 21 | from textparser import Tag 22 | from textparser import Forward 23 | from textparser import NoMatch 24 | from textparser import Not 25 | from textparser import And 26 | from textparser import markup_line 27 | from textparser import replace_blocks 28 | 29 | 30 | def tokenize(items, add_eof_token=True): 31 | tokens = [] 32 | 33 | for item in items: 34 | if len(item) == 2: 35 | token = Token(*item, offset=1) 36 | else: 37 | token = Token(*item) 38 | 39 | tokens.append(token) 40 | 41 | if add_eof_token: 42 | tokens.append(Token('__EOF__', None, -1)) 43 | 44 | return tokens 45 | 46 | 47 | class TextParserTest(unittest.TestCase): 48 | 49 | def parse_and_assert_tree(self, grammar, datas): 50 | for tokens, expected_tree in datas: 51 | tree = grammar.parse(tokenize(tokens)) 52 | self.assertEqual(tree, expected_tree) 53 | 54 | def parse_and_assert_mismatch(self, grammar, datas): 55 | for tokens, line in datas: 56 | tokens = tokenize(tokens) 57 | 58 | with self.assertRaises(textparser.GrammarError) as cm: 59 | grammar.parse(tokens) 60 | 61 | self.assertEqual(cm.exception.offset, line) 62 | 63 | def test_grammar_sequence(self): 64 | grammar = Grammar(Sequence('NUMBER', 'WORD')) 65 | tokens = tokenize([ 66 | ('NUMBER', '1.45'), 67 | ('WORD', 'm') 68 | ]) 69 | tree = grammar.parse(tokens) 70 | self.assertEqual(tree, ['1.45', 'm']) 71 | 72 | def test_grammar_sequence_mismatch(self): 73 | grammar = Grammar(Sequence('NUMBER', 'WORD')) 74 | tokens = tokenize([('NUMBER', '1.45')]) 75 | 76 | with self.assertRaises(textparser.GrammarError) as cm: 77 | grammar.parse(tokens) 78 | 79 | self.assertEqual(cm.exception.offset, -1) 80 | 81 | def test_grammar_choice(self): 82 | grammar = Grammar(Choice('NUMBER', 'WORD')) 83 | 84 | datas = [ 85 | ( 86 | [('WORD', 'm')], 87 | 'm' 88 | ), 89 | ( 90 | [('NUMBER', '5')], 91 | '5' 92 | ) 93 | ] 94 | 95 | self.parse_and_assert_tree(grammar, datas) 96 | 97 | def test_grammar_choice_mismatch(self): 98 | grammar = Grammar(Choice(Sequence('NUMBER', 'WORD'), 99 | 'WORD')) 100 | 101 | datas = [ 102 | ([('NUMBER', '1', 5)], -1), 103 | ([('NUMBER', '1', 5), ('NUMBER', '2', 7)], 7) 104 | ] 105 | 106 | self.parse_and_assert_mismatch(grammar, datas) 107 | 108 | def test_grammar_choice_dict(self): 109 | number = Forward() 110 | number <<= Sequence('NUMBER') 111 | grammar = Grammar(ChoiceDict(number, 112 | Tag('foo', Sequence('WORD')), 113 | ChoiceDict('BAR'), 114 | 'FIE')) 115 | 116 | datas = [ 117 | ( 118 | [('WORD', 'm')], 119 | ('foo', ['m']) 120 | ), 121 | ( 122 | [('NUMBER', '5')], 123 | ['5'] 124 | ), 125 | ( 126 | [('BAR', 'foo')], 127 | 'foo' 128 | ), 129 | ( 130 | [('FIE', 'fum')], 131 | 'fum' 132 | ) 133 | ] 134 | 135 | self.parse_and_assert_tree(grammar, datas) 136 | 137 | def test_grammar_choice_dict_mismatch(self): 138 | grammar = Grammar(ChoiceDict(Sequence('NUMBER'), 139 | Sequence('WORD'))) 140 | tokens = tokenize([(',', ',', 3)]) 141 | 142 | with self.assertRaises(textparser.Error) as cm: 143 | grammar.parse(tokens) 144 | 145 | self.assertEqual(cm.exception.offset, 3) 146 | 147 | def test_grammar_choice_dict_init(self): 148 | datas = [ 149 | ( 150 | ('WORD', 'WORD'), 151 | "First token kind must be unique, but WORD isn't." 152 | ), 153 | ( 154 | ('WORD', Sequence('WORD')), 155 | "First token kind must be unique, but WORD isn't." 
156 |             ),
157 |             (
158 |                 (Sequence(Sequence(Optional('WORD'))), ),
159 |                 "Unsupported pattern type <class 'textparser.Optional'>."
160 |             )
161 |         ]
162 |
163 |         for grammar, message in datas:
164 |             with self.assertRaises(textparser.Error) as cm:
165 |                 ChoiceDict(*grammar)
166 |
167 |             self.assertEqual(str(cm.exception), message)
168 |
169 |     def test_grammar_delimited_list(self):
170 |         grammar = Grammar(Sequence(DelimitedList('WORD'), Optional('.')))
171 |
172 |         datas = [
173 |             (
174 |                 [('WORD', 'foo')],
175 |                 [['foo'], []]
176 |             ),
177 |             (
178 |                 [('WORD', 'foo'), (',', ','), ('WORD', 'bar')],
179 |                 [['foo', 'bar'], []]
180 |             ),
181 |             (
182 |                 [('WORD', 'foo'), (',', ','), ('WORD', 'bar'), ('.', '.')],
183 |                 [['foo', 'bar'], ['.']]
184 |             )
185 |         ]
186 |
187 |         self.parse_and_assert_tree(grammar, datas)
188 |
189 |     def test_grammar_delimited_list_mismatch(self):
190 |         grammar = Grammar(Sequence(DelimitedList('WORD'), Optional('.')))
191 |
192 |         datas = [
193 |             (
194 |                 [
195 |                     ('WORD', 'foo', 1),
196 |                     (',', ',', 2)
197 |                 ],
198 |                 2
199 |             ),
200 |             (
201 |                 [
202 |                     ('WORD', 'foo', 1),
203 |                     (',', ',', 2),
204 |                     ('WORD', 'foo', 3),
205 |                     (',', ',', 4),
206 |                     ('.', '.', 5)
207 |                 ],
208 |                 4
209 |             )
210 |         ]
211 |
212 |         self.parse_and_assert_mismatch(grammar, datas)
213 |
214 |     def test_grammar_zero_or_more(self):
215 |         grammar = Grammar(ZeroOrMore('WORD'))
216 |
217 |         datas = [
218 |             (
219 |                 [],
220 |                 []
221 |             ),
222 |             (
223 |                 [('WORD', 'foo')],
224 |                 ['foo']
225 |             ),
226 |             (
227 |                 [('WORD', 'foo'), ('WORD', 'bar')],
228 |                 ['foo', 'bar']
229 |             )
230 |         ]
231 |
232 |         self.parse_and_assert_tree(grammar, datas)
233 |
234 |     def test_grammar_zero_or_more_partial_element_match(self):
235 |         grammar = Grammar(Sequence(
236 |             ZeroOrMore(Sequence('WORD', 'NUMBER')), 'WORD'))
237 |
238 |         datas = [
239 |             (
240 |                 [
241 |                     ('WORD', 'foo'),
242 |                     ('NUMBER', '1'),
243 |                     ('WORD', 'bar'),
244 |                     ('NUMBER', '2'),
245 |                     ('WORD', 'fie')],
246 |                 [[['foo', '1'], ['bar', '2']], 'fie']
247 |             )
248 |         ]
249 |
250 |         self.parse_and_assert_tree(grammar, datas)
251 |
252 |     def test_grammar_zero_or_more_dict(self):
253 |         grammar = Grammar(ZeroOrMoreDict(Sequence('WORD', 'NUMBER')))
254 |
255 |         datas = [
256 |             (
257 |                 [],
258 |                 {}
259 |             ),
260 |             (
261 |                 [('WORD', 'foo'), ('NUMBER', '1'),
262 |                  ('WORD', 'bar'), ('NUMBER', '2'),
263 |                  ('WORD', 'foo'), ('NUMBER', '3')],
264 |                 {
265 |                     'foo': [['foo', '1'], ['foo', '3']],
266 |                     'bar': [['bar', '2']]
267 |                 }
268 |             )
269 |         ]
270 |
271 |         self.parse_and_assert_tree(grammar, datas)
272 |
273 |     def test_grammar_one_or_more(self):
274 |         grammar = Grammar(OneOrMore('WORD'))
275 |
276 |         datas = [
277 |             (
278 |                 [('WORD', 'foo')],
279 |                 ['foo']
280 |             ),
281 |             (
282 |                 [('WORD', 'foo'), ('WORD', 'bar')],
283 |                 ['foo', 'bar']
284 |             )
285 |         ]
286 |
287 |         self.parse_and_assert_tree(grammar, datas)
288 |
289 |     def test_grammar_one_or_more_mismatch(self):
290 |         grammar = Grammar(OneOrMore('WORD'))
291 |
292 |         datas = [
293 |             (
294 |                 []
295 |                 , -1
296 |             ),
297 |             (
298 |                 [('NUMBER', 'foo', 2)],
299 |                 2
300 |             )
301 |         ]
302 |
303 |         self.parse_and_assert_mismatch(grammar, datas)
304 |
305 |     def test_grammar_one_or_more_dict(self):
306 |         grammar = Grammar(OneOrMoreDict(Sequence('WORD', 'NUMBER')))
307 |
308 |         datas = [
309 |             (
310 |                 [('WORD', 'foo'), ('NUMBER', '1')],
311 |                 {
312 |                     'foo': [['foo', '1']]
313 |                 }
314 |             ),
315 |             (
316 |                 [('WORD', 'foo'), ('NUMBER', '1'),
317 |                  ('WORD', 'bar'), ('NUMBER', '2'),
318 |                  ('WORD', 'foo'), ('NUMBER', '3')],
319 |                 {
320 |                     'foo': [['foo', '1'], ['foo', '3']],
321 |                     'bar': [['bar', '2']]
322 |                 }
323 |             )
324 |
] 325 | 326 | self.parse_and_assert_tree(grammar, datas) 327 | 328 | def test_grammar_one_or_more_dict_mismatch(self): 329 | grammar = Grammar(OneOrMoreDict(Sequence('WORD', 'NUMBER'))) 330 | 331 | datas = [ 332 | ( 333 | [('WORD', 'foo', 5)], 334 | -1 335 | ), 336 | ( 337 | [ 338 | ('WORD', 'foo', 5), 339 | ('WORD', 'bar', 6) 340 | ], 341 | 6 342 | ), 343 | ( 344 | [ 345 | ('WORD', 'foo', 5), 346 | ('NUMBER', '4', 6), 347 | ('WORD', 'bar', 7), 348 | ('WORD', 'fie', 8) 349 | ], 350 | 8 351 | ) 352 | ] 353 | 354 | self.parse_and_assert_mismatch(grammar, datas) 355 | 356 | def test_grammar_any(self): 357 | grammar = Grammar(Any()) 358 | 359 | datas = [ 360 | ( 361 | [('A', r'a')], 362 | 'a' 363 | ), 364 | ( 365 | [('B', r'b')], 366 | 'b' 367 | ) 368 | ] 369 | 370 | self.parse_and_assert_tree(grammar, datas) 371 | 372 | def test_grammar_any_until(self): 373 | grammar = Grammar(Sequence(AnyUntil('STRING'), 'STRING')) 374 | 375 | datas = [ 376 | ( 377 | [('NUMBER', '1'), 378 | ('WORD', 'a'), 379 | ('STRING', '"b"')], 380 | [['1', 'a'], '"b"'] 381 | ) 382 | ] 383 | 384 | self.parse_and_assert_tree(grammar, datas) 385 | 386 | def test_grammar_any_until_sequence(self): 387 | grammar = Grammar(Sequence(AnyUntil(Sequence('WORD', 'STRING')), 388 | 'WORD', 389 | 'STRING')) 390 | 391 | datas = [ 392 | ( 393 | [('NUMBER', '1'), 394 | ('WORD', 'a'), 395 | ('WORD', 'b'), 396 | ('STRING', '"b"')], 397 | [['1', 'a'], 'b', '"b"'] 398 | ) 399 | ] 400 | 401 | self.parse_and_assert_tree(grammar, datas) 402 | 403 | def test_grammar_1(self): 404 | grammar = Grammar(Sequence( 405 | 'IF', 406 | choice(Sequence(choice('A', 'B'), 'STRING'), 407 | 'STRING'), 408 | 'WORD', 409 | choice( 410 | Sequence( 411 | choice(DelimitedList('STRING'), ZeroOrMore('NUMBER')), '.'), 412 | '.'))) 413 | 414 | datas = [ 415 | ( 416 | [ 417 | ('IF', 'IF'), 418 | ('STRING', 'foo'), 419 | ('WORD', 'bar'), 420 | ('.', '.') 421 | ], 422 | ['IF', 'foo', 'bar', [[], '.']] 423 | ), 424 | ( 425 | [ 426 | ('IF', 'IF'), 427 | ('STRING', 'foo'), 428 | ('WORD', 'bar'), 429 | ('NUMBER', '0'), 430 | ('NUMBER', '100'), 431 | ('.', '.') 432 | ], 433 | ['IF', 'foo', 'bar', [['0', '100'], '.']] 434 | ) 435 | ] 436 | 437 | self.parse_and_assert_tree(grammar, datas) 438 | 439 | def test_grammar_1_mismatch(self): 440 | grammar = Grammar(Sequence( 441 | 'IF', 442 | choice(Sequence(choice('A', 'B'), 'STRING'), 443 | 'STRING'), 444 | 'WORD', 445 | choice( 446 | Sequence( 447 | choice(DelimitedList('STRING'), ZeroOrMore('NUMBER')), '.'), 448 | '.'))) 449 | 450 | datas = [ 451 | ( 452 | [ 453 | ('IF', 'IF', 1), 454 | ('STRING', 'foo', 2), 455 | ('WORD', 'bar', 3), 456 | (',', ',', 4) 457 | ], 458 | 4 459 | ), 460 | ( 461 | [ 462 | ('IF', 'IF', 1), 463 | ('STRING', 'foo', 2), 464 | ('.', '.', 3) 465 | ], 466 | 3 467 | ), 468 | ( 469 | [ 470 | ('IF', 'IF', 1), 471 | ('NUMBER', '1', 2) 472 | ], 473 | 2 474 | ), 475 | ( 476 | [ 477 | ('IF', 'IF', 1), 478 | ('STRING', 'foo', 2), 479 | ('WORD', 'bar', 3), 480 | ('.', '.', 4), 481 | ('.', '.', 5) 482 | ], 483 | 5 484 | ) 485 | ] 486 | 487 | self.parse_and_assert_mismatch(grammar, datas) 488 | 489 | def test_grammar_forward(self): 490 | foo = Forward() 491 | foo <<= Sequence('FOO') 492 | grammar = Grammar(foo) 493 | 494 | datas = [ 495 | ( 496 | [('FOO', 'foo')], 497 | ['foo'] 498 | ) 499 | ] 500 | 501 | self.parse_and_assert_tree(grammar, datas) 502 | 503 | def test_grammar_forward_text(self): 504 | foo = Forward() 505 | foo <<= 'FOO' 506 | grammar = Grammar(foo) 507 | 508 | datas = [ 509 | ( 510 | [('FOO', 'foo')], 511 
| 'foo' 512 | ) 513 | ] 514 | 515 | self.parse_and_assert_tree(grammar, datas) 516 | 517 | def test_grammar_optional(self): 518 | grammar = Grammar(Sequence(Optional('WORD'), 519 | Optional('WORD'), 520 | Optional('NUMBER'))) 521 | 522 | datas = [ 523 | ( 524 | [], 525 | [[], [], []] 526 | ), 527 | ( 528 | [('WORD', 'a')], 529 | [['a'], [], []] 530 | ), 531 | ( 532 | [('NUMBER', 'c')], 533 | [[], [], ['c']] 534 | ), 535 | ( 536 | [('WORD', 'a'), ('NUMBER', 'c')], 537 | [['a'], [], ['c']] 538 | ), 539 | ( 540 | [('WORD', 'a'), ('WORD', 'b'), ('NUMBER', 'c')], 541 | [['a'], ['b'], ['c']] 542 | ) 543 | ] 544 | 545 | self.parse_and_assert_tree(grammar, datas) 546 | 547 | def test_grammar_tag(self): 548 | grammar = Grammar(Tag('a', 549 | Tag('b', 550 | choice(Tag('c', 'WORD'), 551 | Tag('d', Optional('NUMBER')))))) 552 | 553 | datas = [ 554 | ( 555 | [('WORD', 'bar')], 556 | ('a', ('b', ('c', 'bar'))) 557 | ), 558 | ( 559 | [('NUMBER', '1')], 560 | ('a', ('b', ('d', ['1']))) 561 | ), 562 | ( 563 | [], 564 | ('a', ('b', ('d', []))) 565 | ) 566 | ] 567 | 568 | self.parse_and_assert_tree(grammar, datas) 569 | 570 | def test_grammar_tag_mismatch(self): 571 | grammar = Grammar(Tag('a', 'WORD')) 572 | 573 | datas = [ 574 | ( 575 | [('NUMBER', 'bar')], 576 | 1 577 | ) 578 | ] 579 | 580 | self.parse_and_assert_mismatch(grammar, datas) 581 | 582 | def test_grammar_and(self): 583 | grammar = Grammar(Sequence(And('NUMBER'), 'NUMBER')) 584 | 585 | datas = [ 586 | ( 587 | [('NUMBER', '1')], 588 | [[], '1'] 589 | ) 590 | ] 591 | 592 | self.parse_and_assert_tree(grammar, datas) 593 | 594 | def test_grammar_and_mismatch(self): 595 | grammar = Grammar(Sequence(And('NUMBER'), 'NUMBER')) 596 | 597 | datas = [ 598 | ( 599 | [('WORD', 'foo', 3), ('NUMBER', '1', 4)], 600 | 3 601 | ) 602 | ] 603 | 604 | self.parse_and_assert_mismatch(grammar, datas) 605 | 606 | def test_grammar_not(self): 607 | grammar = Grammar(Sequence(Not('WORD'), 'NUMBER')) 608 | 609 | datas = [ 610 | ( 611 | [('NUMBER', '1')], 612 | [[], '1'] 613 | ) 614 | ] 615 | 616 | self.parse_and_assert_tree(grammar, datas) 617 | 618 | def test_grammar_not_mismatch(self): 619 | grammar = Grammar(Sequence(Not('WORD'), 'NUMBER')) 620 | 621 | datas = [ 622 | ( 623 | [('WORD', 'foo', 3), ('NUMBER', '1', 4)], 624 | 3 625 | ) 626 | ] 627 | 628 | self.parse_and_assert_mismatch(grammar, datas) 629 | 630 | def test_grammar_no_match(self): 631 | grammar = Grammar(NoMatch()) 632 | 633 | datas = [ 634 | ( 635 | [('NUMBER', '1', 3)], 636 | 3 637 | ), 638 | ( 639 | [('WORD', 'foo', 3)], 640 | 3 641 | ) 642 | ] 643 | 644 | self.parse_and_assert_mismatch(grammar, datas) 645 | 646 | def test_parse_start_and_end_of_file(self): 647 | class Parser(textparser.Parser): 648 | 649 | def grammar(self): 650 | return Sequence('__SOF__', '__EOF__') 651 | 652 | self.assertEqual(Parser().parse('', match_sof=True), 653 | ['__SOF__', '__EOF__']) 654 | 655 | def test_parse_start_of_file_mismatch(self): 656 | class Parser(textparser.Parser): 657 | 658 | def grammar(self): 659 | return Sequence('__EOF__') 660 | 661 | with self.assertRaises(textparser.ParseError) as cm: 662 | Parser().parse('123', match_sof=True) 663 | 664 | self.assertEqual(str(cm.exception), 665 | 'Invalid syntax at line 1, column 1: ">>!<<123"') 666 | 667 | def test_parse_end_of_file(self): 668 | class Parser(textparser.Parser): 669 | 670 | def grammar(self): 671 | return '__EOF__' 672 | 673 | self.assertEqual(Parser().parse('', match_sof=False), '__EOF__') 674 | 675 | def test_grammar_none(self): 676 | class 
AnyAsNone(textparser.Pattern):
677 |
678 |             def match(self, tokens):
679 |                 tokens.get_value()
680 |
681 |                 return None
682 |
683 |         grammar = Grammar(AnyAsNone())
684 |
685 |         datas = [
686 |             (
687 |                 [('NUMBER', '1')],
688 |                 None
689 |             )
690 |         ]
691 |
692 |         self.parse_and_assert_tree(grammar, datas)
693 |
694 |     def test_grammar_error(self):
695 |         grammar = Grammar(NoMatch())
696 |
697 |         datas = [
698 |             [('NUMBER', '1', 3)],
699 |             [('WORD', 'foo', 3)]
700 |         ]
701 |
702 |         for tokens in datas:
703 |             tokens = tokenize(tokens)
704 |
705 |             with self.assertRaises(textparser.GrammarError) as cm:
706 |                 grammar.parse(tokens)
707 |
708 |             self.assertEqual(cm.exception.offset, 3)
709 |             self.assertEqual(str(cm.exception),
710 |                              'Invalid syntax at offset 3.')
711 |
712 |     def test_tokenize_error(self):
713 |         datas = [
714 |             (2, 'hej', 'Invalid syntax at line 1, column 3: "he>>!<<j"'),
717 |             (2, 'a\nb\n', 'Invalid syntax at line 2, column 1: ">>!<<b"')
718 |         ]
719 |
720 |         for offset, text, message in datas:
721 |             with self.assertRaises(TokenizeError) as cm:
722 |                 raise TokenizeError(text, offset)
723 |
724 |             self.assertEqual(str(cm.exception), message)
725 |
728 |     def test_tokenize_init(self):
729 |         datas = [
730 |             (
731 |                 [('A', r'a')],
732 |                 '(?P<A>a)'
733 |             ),
734 |             (
735 |                 [('A', r'b'), ('C', r'd')],
736 |                 '(?P<A>b)|(?P<C>d)'
737 |             )
738 |         ]
739 |
740 |         for spec, expected_re_token in datas:
741 |             tokens, re_token = tokenize_init(spec)
742 |             self.assertEqual(tokens,
743 |                              [Token(kind='__SOF__', value='__SOF__', offset=0)])
744 |             self.assertEqual(re_token, expected_re_token)
745 |
746 |     def test_parser(self):
747 |         class Parser(textparser.Parser):
748 |
749 |             def keywords(self):
750 |                 return set([
751 |                     'IF',
752 |                     'A',
753 |                     'B'
754 |                 ])
755 |
756 |             def token_specs(self):
757 |                 return [
758 |                     ('SKIP', r'[ \r\n\t]+'),
759 |                     ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'),
760 |                     ('DOT', '.', r'\.'),
761 |                     ('WORD', r'[A-Za-z0-9_]+'),
762 |                     ('ESCAPED_STRING', r'"(\\"|[^"])*?"'),
763 |                     ('MISMATCH', r'.')
764 |                 ]
765 |
766 |             def grammar(self):
767 |                 return Sequence(
768 |                     'IF',
769 |                     Optional(choice('A', 'B')),
770 |                     'ESCAPED_STRING',
771 |                     'WORD',
772 |                     Optional(choice(DelimitedList('ESCAPED_STRING'),
773 |                                     ZeroOrMore('NUMBER'))),
774 |                     '.')
775 |
776 |         datas = [
777 |             (
778 |                 'IF "foo" bar .',
779 |                 ['IF', [], '"foo"', 'bar', [[]], '.'],
780 |                 [
781 |                     Token(kind='IF', value='IF', offset=0),
782 |                     [],
783 |                     Token(kind='ESCAPED_STRING', value='"foo"', offset=3),
784 |                     Token(kind='WORD', value='bar', offset=9),
785 |                     [[]],
786 |                     Token(kind='.', value='.', offset=13)
787 |                 ]
788 |             ),
789 |             (
790 |                 'IF B "" b 1 2 .',
791 |                 ['IF', ['B'], '""', 'b', [['1', '2']], '.'],
792 |                 [
793 |                     Token(kind='IF', value='IF', offset=0),
794 |                     [
795 |                         Token(kind='B', value='B', offset=3)
796 |                     ],
797 |                     Token(kind='ESCAPED_STRING', value='""', offset=5),
798 |                     Token(kind='WORD', value='b', offset=8),
799 |                     [
800 |                         [
801 |                             Token(kind='NUMBER', value='1', offset=10),
802 |                             Token(kind='NUMBER', value='2', offset=12)
803 |                         ]
804 |                     ],
805 |                     Token(kind='.', value='.', offset=14)
806 |                 ]
807 |             )
808 |         ]
809 |
810 |         for text, expected_tree, expected_token_tree in datas:
811 |             tree = Parser().parse(text)
812 |             self.assertEqual(tree, expected_tree)
813 |             tree = Parser().parse(text, token_tree=True)
814 |             self.assertEqual(tree, expected_token_tree)
815 |
816 |     def test_parser_default_keywords(self):
817 |         class Parser(textparser.Parser):
818 |
819 |             def token_specs(self):
820 |                 return [
821 |                     ('SKIP', r'[ \r\n\t]+'),
822 |                     ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'),
823 |                     ('DOT', '.', r'\.'),
824 |                     ('WORD', r'[A-Za-z0-9_]+'),
825 |                     ('ESCAPED_STRING', r'"(\\"|[^"])*?"'),
826 |                     ('MISMATCH', r'.')
827 |                 ]
828 |
829 |             def grammar(self):
830 |                 return Sequence(
831 |                     'WORD',
832 |                     Optional('WORD'),
833 |                     'ESCAPED_STRING',
834 |                     'WORD',
835 |
Optional(choice(DelimitedList('ESCAPED_STRING'),
836 |                                     ZeroOrMore('NUMBER'))),
837 |                     '.')
838 |
839 |         datas = [
840 |             (
841 |                 'IF "foo" bar .',
842 |                 ['IF', [], '"foo"', 'bar', [[]], '.'],
843 |                 [
844 |                     Token(kind='WORD', value='IF', offset=0),
845 |                     [],
846 |                     Token(kind='ESCAPED_STRING', value='"foo"', offset=3),
847 |                     Token(kind='WORD', value='bar', offset=9),
848 |                     [[]],
849 |                     Token(kind='.', value='.', offset=13)
850 |                 ]
851 |             ),
852 |             (
853 |                 'IF B "" b 1 2 .',
854 |                 ['IF', ['B'], '""', 'b', [['1', '2']], '.'],
855 |                 [
856 |                     Token(kind='WORD', value='IF', offset=0),
857 |                     [
858 |                         Token(kind='WORD', value='B', offset=3)
859 |                     ],
860 |                     Token(kind='ESCAPED_STRING', value='""', offset=5),
861 |                     Token(kind='WORD', value='b', offset=8),
862 |                     [
863 |                         [
864 |                             Token(kind='NUMBER', value='1', offset=10),
865 |                             Token(kind='NUMBER', value='2', offset=12)
866 |                         ]
867 |                     ],
868 |                     Token(kind='.', value='.', offset=14)
869 |                 ]
870 |             )
871 |         ]
872 |
873 |         for text, expected_tree, expected_token_tree in datas:
874 |             tree = Parser().parse(text)
875 |             self.assertEqual(tree, expected_tree)
876 |             tree = Parser().parse(text, token_tree=True)
877 |             self.assertEqual(tree, expected_token_tree)
878 |
879 |     def test_parser_bare(self):
880 |         class Parser(textparser.Parser):
881 |
882 |             pass
883 |
884 |         with self.assertRaises(NotImplementedError) as cm:
885 |             Parser().parse('foo')
886 |
887 |         self.assertEqual(str(cm.exception), 'No grammar defined.')
888 |
889 |     def test_parser_default_token_specs(self):
890 |         class Parser(textparser.Parser):
891 |
892 |             def grammar(self):
893 |                 return 'WORD'
894 |
895 |         tree = Parser().parse('foo')
896 |         self.assertEqual(tree, 'foo')
897 |
898 |     def test_parser_tokenize_mismatch(self):
899 |         class Parser(textparser.Parser):
900 |
901 |             def token_specs(self):
902 |                 return [
903 |                     ('SKIP', r'[ \r\n\t]+'),
904 |                     ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'),
905 |                     ('MISMATCH', r'.')
906 |                 ]
907 |
908 |             def grammar(self):
909 |                 return Grammar('NUMBER')
910 |
911 |         with self.assertRaises(textparser.ParseError) as cm:
912 |             Parser().parse('12\n34foo\n789')
913 |
914 |         self.assertEqual(cm.exception.offset, 5)
915 |         self.assertEqual(cm.exception.line, 2)
916 |         self.assertEqual(cm.exception.column, 3)
917 |         self.assertEqual(str(cm.exception),
918 |                          'Invalid syntax at line 2, column 3: "34>>!<<foo"')
940 |
941 |     def test_parser_grammar_mismatch_choice_max(self):
942 |         class Parser(textparser.Parser):
943 |
944 |             def __init__(self, tokens):
945 |                 self._tokens = tokens
946 |
947 |             def tokenize(self, _text):
948 |                 return tokenize(self._tokens, add_eof_token=False)
949 |
950 |             def grammar(self):
951 |                 return Choice(Sequence('NUMBER', 'WORD'),
952 |                               'WORD')
953 |
954 |         Data = namedtuple('Data',
955 |                           [
956 |                               'text',
957 |                               'tokens',
958 |                               'offset',
959 |                               'line',
960 |                               'column',
961 |                               'message',
962 |                           ])
963 |
964 |         datas = [
965 |             Data(
966 |                 text='1.45',
967 |                 tokens=[
968 |                     ('NUMBER', '1.45', 0)
969 |                 ],
970 |                 offset=4,
971 |                 line=1,
972 |                 column=5,
973 |                 message='Invalid syntax at line 1, column 5: "1.45>>!<<"'
974 |             ),
975 |             Data(
976 |                 text='1.45 2',
977 |                 tokens=[
978 |                     ('NUMBER', '1.45', 0),
979 |                     ('NUMBER', '2', 5)
980 |                 ],
981 |                 offset=5,
982 |                 line=1,
983 |                 column=6,
984 |                 message='Invalid syntax at line 1, column 6: "1.45 >>!<<2"'
985 |             )
986 |         ]
987 |
988 |         for text, tokens, offset, line, column, message in datas:
989 |             with self.assertRaises(textparser.ParseError) as cm:
990 |                 Parser(tokens).parse(text)
991 |
992 |             self.assertEqual(cm.exception.offset, offset)
993 |
self.assertEqual(cm.exception.line, line) 994 | self.assertEqual(cm.exception.column, column) 995 | self.assertEqual(str(cm.exception), message) 996 | 997 | def test_parse_error(self): 998 | class Parser(textparser.Parser): 999 | 1000 | def tokenize(self, text): 1001 | raise TokenizeError(text, 5) 1002 | 1003 | def grammar(self): 1004 | return Grammar(Sequence('NUMBER', 'WORD')) 1005 | 1006 | with self.assertRaises(textparser.ParseError) as cm: 1007 | Parser().parse('12\n3456\n789') 1008 | 1009 | self.assertEqual(cm.exception.text, '12\n3456\n789') 1010 | self.assertEqual(cm.exception.offset, 5) 1011 | self.assertEqual(cm.exception.line, 2) 1012 | self.assertEqual(cm.exception.column, 3) 1013 | self.assertEqual(str(cm.exception), 1014 | 'Invalid syntax at line 2, column 3: "34>>!<<56"') 1015 | 1016 | def test_markup_line(self): 1017 | datas = [ 1018 | (0, '>>!<<0', None), 1019 | (1, '0>>!<<', None), 1020 | (2, '>>!<<1234', None), 1021 | (4, '12>>!<<34', None), 1022 | (6, '1234>>!<<', None), 1023 | (7, '>>!<<56', None), 1024 | (8, '5>>!<<6', None), 1025 | (9, '56>>!<<', None), 1026 | (3, '1x234', 'x') 1027 | ] 1028 | 1029 | for offset, line, marker in datas: 1030 | if marker is None: 1031 | text = markup_line('0\n1234\n56', offset) 1032 | else: 1033 | text = markup_line('0\n1234\n56', 1034 | offset, 1035 | marker=marker) 1036 | 1037 | self.assertEqual(text, line) 1038 | 1039 | def test_replace_blocks(self): 1040 | datas = [ 1041 | ('{}', '{}'), 1042 | ('{{}}', '{ }'), 1043 | ('{{\n} xxx {}}', '{ \n }'), 1044 | ('1{a\n}2{b}3', '1{ \n}2{ }3') 1045 | ] 1046 | 1047 | for old, expected in datas: 1048 | new = replace_blocks(old) 1049 | self.assertEqual(new, expected) 1050 | 1051 | def test_replace_blocks_start_end(self): 1052 | datas = [ 1053 | ('1[a]2[b]3', '1[ ]2[ ]3', '[', ']'), 1054 | ('1{a}2{b}3', '1{ }2{ }3', '{', '}'), 1055 | ('1(a)2(b)3', '1( )2( )3', '(', ')'), 1056 | ('1((a))2((b))3', '1(( ))2(( ))3', '((', '))') 1057 | ] 1058 | 1059 | for old, expected, start, end in datas: 1060 | new = replace_blocks(old, start, end) 1061 | self.assertEqual(new, expected) 1062 | 1063 | def test_any_zero_or_more(self): 1064 | class Parser(textparser.Parser): 1065 | 1066 | def keywords(self): 1067 | return ['interesting_group'] 1068 | 1069 | def token_specs(self): 1070 | return [ 1071 | ('SKIP', r'[ \r\n\t]+'), 1072 | ('WORD', r'[A-Za-z0-9_]+'), 1073 | ('SEMICOLON', ';', r';'), 1074 | ('BRACE_OPEN', '{', r'\{'), 1075 | ('BRACE_CLOSE', '}', r'\}'), 1076 | ('EQUAL', '=', r'='), 1077 | ] 1078 | 1079 | def grammar(self): 1080 | interesting_group = textparser.Sequence( 1081 | 'interesting_group', '{', 1082 | ZeroOrMore(Sequence('WORD', '=', 'WORD', ';')), 1083 | '}', 1084 | ';') 1085 | 1086 | return Sequence(AnyUntil('interesting_group'), 1087 | interesting_group, 1088 | ZeroOrMore(Any())) 1089 | 1090 | 1091 | text = ''' 1092 | some_group { 1093 | foo bar; foo bar; 1094 | }; 1095 | 1096 | interesting_group { 1097 | a = 1; 1098 | b = 2; 1099 | }; 1100 | 1101 | another_group { 1102 | foo bar 1103 | }; 1104 | ''' 1105 | 1106 | tree = Parser().parse(text) 1107 | self.assertEqual(tree[1], 1108 | [ 1109 | 'interesting_group', 1110 | '{', 1111 | [ 1112 | ['a', '=', '1', ';'], 1113 | ['b', '=', '2', ';'] 1114 | ], 1115 | '}', 1116 | ';']) 1117 | 1118 | 1119 | if __name__ == '__main__': 1120 | unittest.main() 1121 | -------------------------------------------------------------------------------- /textparser.py: -------------------------------------------------------------------------------- 1 | # A text parser. 
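#
# Minimal usage sketch, condensed from examples/hello_world.py in this
# repository (see that file for the runnable version):
#
#     import textparser
#     from textparser import Sequence
#
#     class Parser(textparser.Parser):
#
#         def token_specs(self):
#             return [
#                 ('SKIP', r'[ \r\n\t]+'),
#                 ('WORD', r'\w+'),
#                 ('EMARK', '!', r'!'),
#                 ('COMMA', ',', r','),
#                 ('MISMATCH', r'.')
#             ]
#
#         def grammar(self):
#             return Sequence('WORD', ',', 'WORD', '!')
#
#     Parser().parse('Hello, World!')  # -> ['Hello', ',', 'World', '!']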
2 | 3 | import re 4 | from collections import namedtuple 5 | from operator import itemgetter 6 | 7 | 8 | __author__ = 'Erik Moqvist' 9 | __version__ = '0.24.0' 10 | 11 | 12 | class _Mismatch(object): 13 | pass 14 | 15 | 16 | MISMATCH = _Mismatch() 17 | """Returned by :func:`~textparser.Pattern.match()` on mismatch. 18 | 19 | """ 20 | 21 | 22 | class _String(object): 23 | """Matches a specific token kind. 24 | 25 | """ 26 | 27 | def __init__(self, kind): 28 | self.kind = kind 29 | 30 | def match(self, tokens): 31 | if self.kind == tokens.peek().kind: 32 | return tokens.get_value() 33 | else: 34 | return MISMATCH 35 | 36 | 37 | class _Tokens(object): 38 | 39 | def __init__(self, tokens): 40 | self._tokens = tokens 41 | self._pos = 0 42 | self._max_pos = -1 43 | self._stack = [] 44 | 45 | def get_value(self): 46 | pos = self._pos 47 | self._pos += 1 48 | 49 | return self._tokens[pos] 50 | 51 | def peek(self): 52 | return self._tokens[self._pos] 53 | 54 | def peek_max(self): 55 | pos = self._pos 56 | 57 | if self._max_pos > pos: 58 | pos = self._max_pos 59 | 60 | if pos >= len(self._tokens): 61 | return self._tokens[-1] 62 | else: 63 | return self._tokens[pos] 64 | 65 | def save(self): 66 | self._stack.append(self._pos) 67 | 68 | def restore(self): 69 | self._pos = self._stack.pop() 70 | 71 | def update(self): 72 | self._stack[-1] = self._pos 73 | 74 | def mark_max_restore(self): 75 | if self._pos > self._max_pos: 76 | self._max_pos = self._pos 77 | 78 | self._pos = self._stack.pop() 79 | 80 | def mark_max_load(self): 81 | if self._pos > self._max_pos: 82 | self._max_pos = self._pos 83 | 84 | self._pos = self._stack[-1] 85 | 86 | def drop(self): 87 | self._stack.pop() 88 | 89 | def __repr__(self): 90 | return str(self._tokens[self._pos:self._pos + 2]) 91 | 92 | 93 | class _StringTokens(_Tokens): 94 | 95 | def get_value(self): 96 | pos = self._pos 97 | self._pos += 1 98 | 99 | return self._tokens[pos].value 100 | 101 | 102 | def _wrap_string(item): 103 | if isinstance(item, str): 104 | item = _String(item) 105 | 106 | return item 107 | 108 | 109 | def _wrap_strings(items): 110 | return [_wrap_string(item) for item in items] 111 | 112 | 113 | def _format_invalid_syntax(text, offset): 114 | return 'Invalid syntax at line {}, column {}: "{}"'.format( 115 | line(text, offset), 116 | column(text, offset), 117 | markup_line(text, offset)) 118 | 119 | 120 | class Error(Exception): 121 | """General textparser exception. 122 | 123 | """ 124 | 125 | pass 126 | 127 | 128 | class TokenizeError(Error): 129 | """This exception is raised when the text cannot be converted into 130 | tokens. 131 | 132 | """ 133 | 134 | def __init__(self, text, offset): 135 | self._text = text 136 | self._offset = offset 137 | message = _format_invalid_syntax(text, offset) 138 | super(TokenizeError, self).__init__(message) 139 | 140 | @property 141 | def text(self): 142 | """The input text to the tokenizer. 143 | 144 | """ 145 | 146 | return self._text 147 | 148 | @property 149 | def offset(self): 150 | """Offset into the text where the tokenizer failed. 151 | 152 | """ 153 | 154 | return self._offset 155 | 156 | 157 | class GrammarError(Error): 158 | """This exception is raised when the tokens cannot be converted into a 159 | parse tree. 
160 |
161 |     """
162 |
163 |     def __init__(self, offset):
164 |         self._offset = offset
165 |         message = 'Invalid syntax at offset {}.'.format(offset)
166 |         super(GrammarError, self).__init__(message)
167 |
168 |     @property
169 |     def offset(self):
170 |         """Offset into the text where the parser failed.
171 |
172 |         """
173 |
174 |         return self._offset
175 |
176 |
177 | class ParseError(Error):
178 |     """This exception is raised when the parser fails to parse the text.
179 |
180 |     """
181 |
182 |     def __init__(self, text, offset):
183 |         self._text = text
184 |         self._offset = offset
185 |         self._line = line(text, offset)
186 |         self._column = column(text, offset)
187 |         message = _format_invalid_syntax(text, offset)
188 |         super(ParseError, self).__init__(message)
189 |
190 |     @property
191 |     def text(self):
192 |         """The input text to the parser.
193 |
194 |         """
195 |
196 |         return self._text
197 |
198 |     @property
199 |     def offset(self):
200 |         """Offset into the text where the parser failed.
201 |
202 |         """
203 |
204 |         return self._offset
205 |
206 |     @property
207 |     def line(self):
208 |         """Line where the parser failed.
209 |
210 |         """
211 |
212 |         return self._line
213 |
214 |     @property
215 |     def column(self):
216 |         """Column where the parser failed.
217 |
218 |         """
219 |
220 |         return self._column
221 |
222 |
223 | Token = namedtuple('Token', ['kind', 'value', 'offset'])
224 |
225 |
226 | class Pattern(object):
227 |     """Base class of all patterns.
228 |
229 |     """
230 |
231 |     def match(self, tokens):
232 |         """Returns :data:`~textparser.MISMATCH` on mismatch, and anything else
233 |         on match.
234 |
235 |         """
236 |
237 |         raise NotImplementedError('To be implemented by subclasses.')
238 |
239 |
240 | class Sequence(Pattern):
241 |     """Matches a sequence of patterns. Becomes a list in the parse tree.
242 |
243 |     """
244 |
245 |     def __init__(self, *patterns):
246 |         self.patterns = _wrap_strings(patterns)
247 |
248 |     def match(self, tokens):
249 |         matched = []
250 |
251 |         for pattern in self.patterns:
252 |             mo = pattern.match(tokens)
253 |
254 |             if mo is MISMATCH:
255 |                 return MISMATCH
256 |
257 |             matched.append(mo)
258 |
259 |         return matched
260 |
261 |
262 | class Choice(Pattern):
263 |     """Matches any of the given ordered patterns `patterns`. The first
264 |     pattern in the list has the highest priority, and the last the lowest.
265 |
266 |     """
267 |
268 |     def __init__(self, *patterns):
269 |         self._patterns = _wrap_strings(patterns)
270 |
271 |     def match(self, tokens):
272 |         tokens.save()
273 |
274 |         for pattern in self._patterns:
275 |             tokens.mark_max_load()
276 |             mo = pattern.match(tokens)
277 |
278 |             if mo is not MISMATCH:
279 |                 tokens.drop()
280 |
281 |                 return mo
282 |
283 |         tokens.restore()
284 |
285 |         return MISMATCH
286 |
287 |
288 | class ChoiceDict(Pattern):
289 |     """Matches any of the given patterns. The first token kind of all
290 |     patterns must be unique, otherwise an :class:`~textparser.Error`
291 |     exception is raised.
292 |
293 |     This class is faster than :class:`~textparser.Choice`, and should
294 |     be used if the grammar allows it.
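
    For example, ChoiceDict(Sequence('IF', body), Sequence('WHILE',
    body)) dispatches directly on the kind of the next token ('IF' or
    'WHILE') instead of trying each alternative in turn ('IF', 'WHILE'
    and body are made-up names for illustration).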

    """

    def __init__(self, *patterns):
        self._patterns_map = {}
        patterns = _wrap_strings(patterns)

        for pattern in patterns:
            self._check_pattern(pattern, pattern)

    @property
    def patterns_map(self):
        return self._patterns_map

    def _check_pattern(self, inner, outer):
        if isinstance(inner, _String):
            self._add_pattern(inner.kind, outer)
        elif isinstance(inner, Sequence):
            self._check_pattern(inner.patterns[0], outer)
        elif isinstance(inner, (Tag, Forward)):
            self._check_pattern(inner.pattern, outer)
        elif isinstance(inner, ChoiceDict):
            for pattern in inner.patterns_map.values():
                self._check_pattern(pattern, outer)
        else:
            raise Error(
                'Unsupported pattern type {}.'.format(type(inner)))

    def _add_pattern(self, kind, pattern):
        if kind in self._patterns_map:
            raise Error(
                "First token kind must be unique, but {} isn't.".format(
                    kind))

        self._patterns_map[kind] = pattern

    def match(self, tokens):
        kind = tokens.peek().kind

        if kind in self._patterns_map:
            return self._patterns_map[kind].match(tokens)
        else:
            return MISMATCH


class Repeated(Pattern):
    """Matches `pattern` at least `minimum` times. Any match becomes a
    list in the parse tree.

    """

    def __init__(self, pattern, minimum=0):
        self._pattern = _wrap_string(pattern)
        self._minimum = minimum

    def match(self, tokens):
        matched = []
        tokens.save()

        while True:
            mo = self._pattern.match(tokens)

            if mo is MISMATCH:
                tokens.mark_max_restore()
                break

            matched.append(mo)
            tokens.update()

        if len(matched) >= self._minimum:
            return matched
        else:
            return MISMATCH


class RepeatedDict(Repeated):
    """Same as :class:`~textparser.Repeated`, but becomes a dictionary
    instead of a list in the parse tree.

    `key` is a function taking the match as input and returning the
    dictionary key. By default the first element in the match is used
    as the key.

    """

    def __init__(self, pattern, minimum=0, key=None):
        super(RepeatedDict, self).__init__(pattern, minimum)

        if key is None:
            key = itemgetter(0)

        self._key = key

    def match(self, tokens):
        matched = {}
        tokens.save()

        while True:
            mo = self._pattern.match(tokens)

            if mo is MISMATCH:
                tokens.mark_max_restore()
                break

            key = self._key(mo)

            try:
                matched[key].append(mo)
            except KeyError:
                matched[key] = [mo]

            tokens.update()

        if len(matched) >= self._minimum:
            return matched
        else:
            return MISMATCH


class ZeroOrMore(Repeated):
    """Matches `pattern` zero or more times.

    See :class:`~textparser.Repeated` for more details.

    """

    def __init__(self, pattern):
        super(ZeroOrMore, self).__init__(pattern, 0)


class ZeroOrMoreDict(RepeatedDict):
    """Matches `pattern` zero or more times.

    See :class:`~textparser.RepeatedDict` for more details.
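
    As a minimal sketch, assuming the default token specifications
    from :func:`~textparser.Parser.token_specs` (which provide the
    ``WORD`` and ``NUMBER`` kinds), all matches sharing a key are
    collected in a list under that key:

    .. code-block:: python

       >>> from textparser import Parser, Sequence, ZeroOrMoreDict
       >>> class KeyValueParser(Parser):
       ...     def grammar(self):
       ...         return ZeroOrMoreDict(Sequence('WORD', 'NUMBER'))
       >>> KeyValueParser().parse('x 1 y 2 x 3')
       {'x': [['x', '1'], ['x', '3']], 'y': [['y', '2']]}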

    """

    def __init__(self, pattern, key=None):
        super(ZeroOrMoreDict, self).__init__(pattern, 0, key)


class OneOrMore(Repeated):
    """Matches `pattern` one or more times.

    See :class:`~textparser.Repeated` for more details.

    """

    def __init__(self, pattern):
        super(OneOrMore, self).__init__(pattern, 1)


class OneOrMoreDict(RepeatedDict):
    """Matches `pattern` one or more times.

    See :class:`~textparser.RepeatedDict` for more details.

    """

    def __init__(self, pattern, key=None):
        super(OneOrMoreDict, self).__init__(pattern, 1, key)


class DelimitedList(Pattern):
    """Matches a delimited list of `pattern` separated by `delim`.
    `pattern` must be matched at least once. Any match becomes a list
    in the parse tree, excluding the delimiters.

    """

    def __init__(self, pattern, delim=','):
        self._pattern = _wrap_string(pattern)
        self._delim = _wrap_string(delim)

    def match(self, tokens):
        # First pattern.
        mo = self._pattern.match(tokens)

        if mo is MISMATCH:
            return MISMATCH

        matched = [mo]
        tokens.save()

        while True:
            # Discard the delimiter.
            mo = self._delim.match(tokens)

            if mo is MISMATCH:
                break

            # Pattern.
            mo = self._pattern.match(tokens)

            if mo is MISMATCH:
                break

            matched.append(mo)
            tokens.update()

        tokens.restore()

        return matched


class Optional(Pattern):
    """Matches `pattern` zero or one time. Becomes a list in the parse
    tree, empty on mismatch.

    """

    def __init__(self, pattern):
        self._pattern = _wrap_string(pattern)

    def match(self, tokens):
        tokens.save()
        mo = self._pattern.match(tokens)

        if mo is MISMATCH:
            tokens.mark_max_restore()

            return []
        else:
            tokens.drop()

            return [mo]


class Any(Pattern):
    """Matches any token.

    """

    def match(self, tokens):
        if tokens.peek().kind == '__EOF__':
            return MISMATCH
        else:
            return tokens.get_value()


class AnyUntil(Pattern):
    """Matches any token until the given pattern is found. Becomes a list
    in the parse tree, not including the match of the given pattern.

    """

    def __init__(self, pattern):
        self._pattern = _wrap_string(pattern)

    def match(self, tokens):
        matched = []

        while True:
            tokens.save()
            mo = self._pattern.match(tokens)

            if mo is not MISMATCH:
                break

            tokens.restore()
            matched.append(tokens.get_value())

        tokens.restore()

        return matched


class And(Pattern):
    """Matches `pattern`, without consuming any tokens. Any match becomes
    an empty list in the parse tree.

    """

    def __init__(self, pattern):
        self._pattern = _wrap_string(pattern)

    def match(self, tokens):
        tokens.save()
        mo = self._pattern.match(tokens)
        tokens.restore()

        if mo is MISMATCH:
            return MISMATCH
        else:
            return []


class Not(Pattern):
    """Matches if `pattern` does not match. Any match becomes an empty
    list in the parse tree.
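
    For example, the following pattern matches any single token except
    a comma, assuming a hypothetical ``COMMA`` token kind:

    .. code-block:: python

       >>> anything_but_comma = Sequence(Not('COMMA'), Any())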

    Just like :class:`~textparser.And`, no tokens are consumed.

    """

    def __init__(self, pattern):
        self._pattern = _wrap_string(pattern)

    def match(self, tokens):
        tokens.save()
        mo = self._pattern.match(tokens)
        tokens.restore()

        if mo is MISMATCH:
            return []
        else:
            return MISMATCH


class NoMatch(Pattern):
    """Never matches anything.

    """

    def match(self, tokens):
        return MISMATCH


class Tag(Pattern):
    """Tags any matched `pattern` with name `name`. Becomes a two-tuple of
    `name` and the match in the parse tree.

    """

    def __init__(self, name, pattern):
        self._name = name
        self._pattern = _wrap_string(pattern)

    @property
    def pattern(self):
        return self._pattern

    def match(self, tokens):
        mo = self._pattern.match(tokens)

        if mo is not MISMATCH:
            return (self._name, mo)
        else:
            return MISMATCH


class Forward(Pattern):
    """Forward declaration of a pattern, for example to express recursive
    grammars. Assign the declared pattern with ``<<=``.

    .. code-block:: python

       >>> foo = Forward()
       >>> foo <<= Sequence('NUMBER')

    """

    def __init__(self):
        self._pattern = None

    @property
    def pattern(self):
        return self._pattern

    def __ilshift__(self, other):
        self._pattern = _wrap_string(other)

        return self

    def match(self, tokens):
        return self._pattern.match(tokens)


class Grammar(object):
    """Creates a parse tree of given tokens using the grammar `grammar`.

    """

    def __init__(self, grammar):
        if isinstance(grammar, str):
            grammar = _wrap_string(grammar)

        self._root = grammar

    def parse(self, tokens, token_tree=False):
        if token_tree:
            tokens = _Tokens(tokens)
        else:
            tokens = _StringTokens(tokens)

        parsed = self._root.match(tokens)

        if parsed is not MISMATCH and tokens.peek_max().kind == '__EOF__':
            return parsed
        else:
            raise GrammarError(tokens.peek_max().offset)


def choice(*patterns):
    """Returns an instance of the fastest choice class for the given
    patterns `patterns`. It is recommended to use this function
    instead of instantiating :class:`~textparser.Choice` or
    :class:`~textparser.ChoiceDict` directly.

    """

    try:
        return ChoiceDict(*patterns)
    except Error:
        return Choice(*patterns)


def markup_line(text, offset, marker='>>!<<'):
    """Insert `marker` at `offset` into `text`, and return the marked
    line.

    .. code-block:: python

       >>> markup_line('0\\n1234\\n56', 3)
       '1>>!<<234'

    """

    begin = text.rfind('\n', 0, offset)
    begin += 1

    end = text.find('\n', offset)

    if end == -1:
        end = len(text)

    return text[begin:offset] + marker + text[offset:end]


def line(text, offset):
    """Returns the one based line number at `offset` into `text`."""

    return text[:offset].count('\n') + 1


def column(text, offset):
    """Returns the one based column number at `offset` into `text`."""

    line_start = text.rfind('\n', 0, offset)

    return offset - line_start


def tokenize_init(spec):
    """Initialize a tokenizer. Should only be called by the
    :func:`~textparser.Parser.tokenize` method in the parser.
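
    Returns a token list seeded with a start of file (``__SOF__``)
    token, and a regular expression string with one named group per
    token specification, suitable for :func:`re.finditer`:

    .. code-block:: python

       >>> tokens, re_token = tokenize_init([('NUMBER', r'\\d+')])
       >>> tokens
       [Token(kind='__SOF__', value='__SOF__', offset=0)]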

    """

    tokens = [Token('__SOF__', '__SOF__', 0)]
    re_token = '|'.join([
        '(?P<{}>{})'.format(name, regex) for name, regex in spec
    ])

    return tokens, re_token


class Parser(object):
    """The abstract base class of all text parsers.

    .. code-block:: python

       >>> from textparser import Parser, Sequence
       >>> class MyParser(Parser):
       ...     def token_specs(self):
       ...         return [
       ...             ('SKIP', r'[ \\r\\n\\t]+'),
       ...             ('WORD', r'\\w+'),
       ...             ('EMARK', '!', r'!'),
       ...             ('COMMA', ',', r','),
       ...             ('MISMATCH', r'.')
       ...         ]
       ...     def grammar(self):
       ...         return Sequence('WORD', ',', 'WORD', '!')

    """

    def _unpack_token_specs(self):
        names = {}
        specs = []

        for spec in self.token_specs():
            if len(spec) == 2:
                specs.append(spec)
            else:
                specs.append((spec[0], spec[2]))
                names[spec[0]] = spec[1]

        return names, specs

    def keywords(self):
        """A set of keywords in the text. Any token whose value is in this
        set gets its value as its kind.

        .. code-block:: python

           def keywords(self):
               return set(['if', 'else'])

        """

        return set()

    def token_specs(self):
        """The token specifications with token name, regular expression, and
        optionally a user friendly name.

        Two token specification forms are available: ``(kind, re)`` or
        ``(kind, name, re)``. If the second form is used, the grammar
        should use `name` instead of `kind`.

        See :class:`~textparser.Parser` for an example usage.

        """

        return [
            ('SKIP', r'[ \r\n\t]+'),
            ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'),
            ('WORD', r'[A-Za-z0-9_]+'),
            ('ESCAPED_STRING', r'"(\\"|[^"])*?"'),
            ('MISMATCH', r'.')
        ]

    def tokenize(self, text):
        """Tokenize the given string `text`, and return a list of tokens.
        Raises :class:`~textparser.TokenizeError` on failure.

        This method should only be called by
        :func:`~textparser.Parser.parse()`, but may very well be
        overridden if the default implementation does not match the
        parser's needs.

        """

        names, specs = self._unpack_token_specs()
        keywords = self.keywords()
        tokens, re_token = tokenize_init(specs)

        for mo in re.finditer(re_token, text, re.DOTALL):
            kind = mo.lastgroup

            if kind == 'SKIP':
                pass
            elif kind != 'MISMATCH':
                value = mo.group(kind)

                if value in keywords:
                    kind = value

                if kind in names:
                    kind = names[kind]

                tokens.append(Token(kind, value, mo.start()))
            else:
                raise TokenizeError(text, mo.start())

        return tokens

    def grammar(self):
        """The text grammar is used to create a parse tree out of a list of
        tokens.

        See :class:`~textparser.Parser` for an example usage.

        """

        raise NotImplementedError('No grammar defined.')

    def parse(self, text, token_tree=False, match_sof=False):
        """Parse the given string `text` and return the parse tree. Raises
        :class:`~textparser.ParseError` on failure.

        Returns a parse tree of tokens if `token_tree` is
        ``True``. If `match_sof` is ``True``, the start of file token
        (``__SOF__``) is kept, and the grammar must match it
        explicitly.

        .. code-block:: python

           >>> MyParser().parse('Hello, World!')
           ['Hello', ',', 'World', '!']
           >>> tree = MyParser().parse('Hello, World!', token_tree=True)
           >>> from pprint import pprint
           >>> pprint(tree)
           [Token(kind='WORD', value='Hello', offset=0),
            Token(kind=',', value=',', offset=5),
            Token(kind='WORD', value='World', offset=7),
            Token(kind='!', value='!', offset=12)]

        """

        try:
            tokens = self.tokenize(text)

            if len(tokens) == 0 or tokens[-1].kind != '__EOF__':
                tokens.append(Token('__EOF__', '__EOF__', len(text)))

            if not match_sof:
                if len(tokens) > 0 and tokens[0].kind == '__SOF__':
                    del tokens[0]

            return Grammar(self.grammar()).parse(tokens, token_tree)
        except (TokenizeError, GrammarError) as e:
            raise ParseError(text, e.offset)


def replace_blocks(string, start='{', end='}'):
    """Replace the contents of all blocks starting with `start` and ending
    with `end` with spaces (the outermost `start` and `end` themselves
    are kept). Newlines within a block are kept, so line and column
    numbers in the rest of `string` are unaffected.

    """

    chunks = []
    begin = 0
    depth = 0
    start_length = len(start)
    pattern = r'({}|{})'.format(re.escape(start), re.escape(end))

    for mo in re.finditer(pattern, string):
        pos = mo.start()

        if mo.group() == start:
            if depth == 0:
                chunks.append(string[begin:pos + start_length])
                begin = (pos + start_length)

            depth += 1
        elif depth > 0:
            depth -= 1

            if depth == 0:
                for chunk in string[begin:pos].split('\n'):
                    chunks.append(' ' * len(chunk))
                    chunks.append('\n')

                chunks.pop()
                begin = pos

    chunks.append(string[begin:])

    return ''.join(chunks)

--------------------------------------------------------------------------------