├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   └── feature_request.md
    └── workflows
    │   ├── publish-to-pypi.yml
    │   ├── pytest.yml
    │   └── sphinx.yml
├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── create_uml.sh
├── docs
    ├── Makefile
    ├── conf.py
    ├── index.rst
    ├── make.bat
    ├── modules.rst
    ├── pyregexp.rst
    └── uml
    │   ├── classes.engine.png
    │   ├── classes.lexer.png
    │   ├── classes.match.png
    │   ├── classes.pyregexp.png
    │   ├── classes.pyrser.png
    │   ├── classes.re_ast.png
    │   ├── classes.tokens.png
    │   └── packages.pyregexp.png
├── grammar.txt
├── print_coverage.sh
├── pyregexp
    ├── __init__.py
    ├── engine.py
    ├── lexer.py
    ├── match.py
    ├── pyrser.py
    ├── re_ast.py
    └── tokens.py
├── pytest.ini
├── regex.py
├── regex.sh
├── requirements.txt
├── setup.cfg
├── setup.py
└── test
    ├── __init__.py
    ├── test_engine.py
    ├── test_engine2.py
    ├── test_lexer.py
    ├── test_parser.py
    ├── test_re_ast.py
    └── test_tokens.py


/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **Additional context**
27 | Add any other context about the problem here.
28 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: enhancement
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/.github/workflows/publish-to-pypi.yml:
--------------------------------------------------------------------------------
 1 | name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI
 2 | 
 3 | on: push
 4 | 
 5 | jobs:
 6 |   build-n-publish:
 7 |     name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |       - uses: actions/checkout@master
11 |       - name: Set up Python 3.9
12 |         uses: actions/setup-python@v1
13 |         with:
14 |           python-version: 3.9
15 |       - name: Install pypa/build
16 |         run: >-
17 |           python -m
18 |           pip install
19 |           build
20 |           --user
21 |       - name: Build a binary wheel and a source tarball
22 |         run: >-
23 |           python -m
24 |           build
25 |           --sdist
26 |           --wheel
27 |           --outdir dist/
28 |           .
29 |       - name: Publish distribution 📦 to Test PyPI
30 |         uses: pypa/gh-action-pypi-publish@master
31 |         with:
32 |           password: ${{ secrets.TEST_PYPI_API_TOKEN }}
33 |           repository_url: https://test.pypi.org/legacy/
34 |           skip_existing: true
35 |       - name: Publish distribution 📦 to PyPI
36 |         if: startsWith(github.ref, 'refs/tags')
37 |         uses: pypa/gh-action-pypi-publish@master
38 |         with:
39 |           password: ${{ secrets.PYPI_API_TOKEN }}
40 | 


--------------------------------------------------------------------------------
/.github/workflows/pytest.yml:
--------------------------------------------------------------------------------
 1 | name: Run Pytest
 2 | 
 3 | # Controls when the workflow will run
 4 | on:
 5 |   # Triggers the workflow on push or pull request events but only for the master branch
 6 |   push:
 7 |     branches: [master]
 8 |   pull_request:
 9 |     branches: [master]
10 | 
11 |   # Allows you to run this workflow manually from the Actions tab
12 |   workflow_dispatch:
13 | 
14 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel
15 | jobs:
16 |   # This workflow contains a single job called "build"
17 |   build:
18 |     # The type of runner that the job will run on
19 |     runs-on: ubuntu-latest
20 | 
21 |     # Steps represent a sequence of tasks that will be executed as part of the job
22 |     steps:
23 |       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
24 |       - uses: actions/checkout@v2
25 |       - name: Set up Python 3.9
26 |         uses: actions/setup-python@v2
27 |         with:
28 |           python-version: 3.9
29 |       - name: Install dependencies
30 |         run: |
31 |           python -m pip install --upgrade pip
32 |           pip install flake8 pytest
33 |           if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
34 | 
35 |       - name: Test with pytest
36 |         run: |
37 |           if [ -d tests ] || [ -d test ]; then python -m pytest; fi
38 | 


--------------------------------------------------------------------------------
/.github/workflows/sphinx.yml:
--------------------------------------------------------------------------------
 1 | name: Pages
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - master
 6 | jobs:
 7 |   build:
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |       - uses: actions/setup-python@v2
11 |       - uses: actions/checkout@master
12 |         with:
13 |           fetch-depth: 0 # otherwise, you will failed to push refs to dest repo
14 |       - name: Install dependencies
15 |         run: |
16 |           python -m pip install --upgrade pip
17 |           pip install furo
18 |           if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
19 |       - name: Build and Commit
20 |         uses: sphinx-notes/pages@v2
21 |       - name: Push changes
22 |         uses: ad-m/github-push-action@master
23 |         with:
24 |           github_token: ${{ secrets.GITHUB_TOKEN }}
25 |           branch: gh-pages
26 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | venv/
 2 | 
 3 | __pycache__/
 4 | *.py[cod]
 5 | 
 6 | # Distribution / packaging
 7 | bin/
 8 | build/
 9 | develop-eggs/
10 | dist/
11 | eggs/
12 | lib/
13 | lib64/
14 | parts/
15 | sdist/
16 | var/
17 | *.egg-info/
18 | .installed.cfg
19 | *.egg
20 | 
21 | # Installer logs
22 | pip-log.txt
23 | pip-delete-this-directory.txt
24 | 
25 | # Unit test / coverage reports
26 | .tox/
27 | .coverage
28 | .cache
29 | nosetests.xml
30 | coverage.xml
31 | 
32 | .vscode
33 | .pytest_cache/
34 | .coverage
35 | 
36 | # Sphinx documentation
37 | docs/_build/*
38 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021-2022 Lorenzo Felletti
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # pyregex(p)
 2 | 
 3 | ## What is it
 4 | 
 5 | Pyregex(p) is a backtracking Regex Engine complete with all major regular-expressions' features.
 6 | 
 7 | It is composed of a Lexer, a Parser (a TDRD parser) and finally the Engine.
 8 | 
 9 | Features implemented includes:
10 | | Feature | Syntax |
11 | |-|-|
12 | | match start | ^... |
13 | | match end | ...$ |
14 | | escaping | \\ |
15 | | grouping | (...) |
16 | | named group | (?\<name\>...) | 
17 | | non-capturing group | (?:...) |
18 | | alternative | a\|b |
19 | | wildcard | . |
20 | | space | \s |
21 | | quantifiers | ? \* + |
22 | | curly brace quantification | {exact} {min,max} {,max} {min,} |
23 | | range element | [^a-zA-Z059] |
24 | 
25 | 
26 | ## Play with the engine:
27 | 
28 | ```Python
29 | from pyregexp.engine import RegexEngine
30 | 
31 | reng = RegexEngine()
32 | 
33 | reng.match('^my_(beautiful_)+regex', '^my_beautiful_beautiful_beautiful_regex')
34 | ```
35 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/__init__.py


--------------------------------------------------------------------------------
/create_uml.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | pyreverse -o re_ast.png -A -S -mn -f ALL ./pyregexp/re_ast.py
 3 | mv classes.re_ast.png docs/uml
 4 | 
 5 | pyreverse -o engine.png -A -S -mn -f ALL ./pyregexp/engine.py
 6 | mv classes.engine.png docs/uml
 7 | 
 8 | pyreverse -o lexer.png -A -S -mn -f ALL ./pyregexp/lexer.py
 9 | mv classes.lexer.png docs/uml
10 | 
11 | pyreverse -o match.png -A -S -mn -f ALL ./pyregexp/match.py
12 | mv classes.match.png docs/uml
13 | 
14 | pyreverse -o pyrser.png -A -S -mn -f ALL ./pyregexp/pyrser.py
15 | mv classes.pyrser.png docs/uml
16 | 
17 | pyreverse -o tokens.png -A -S -mn -f ALL ./pyregexp/tokens.py
18 | mv classes.tokens.png docs/uml
19 | 
20 | pyreverse -o pyregexp.png -A -S -mn ./pyregexp/*
21 | mv classes.pyregexp.png docs/uml
22 | mv packages.pyregexp.png docs/uml
23 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | sys.path.insert(0, os.path.abspath('..'))
16 | 
17 | import pyregexp
18 | # -- Project information -----------------------------------------------------
19 | 
20 | project = 'pyregexp'
21 | copyright = '2022, Lorenzo Felletti'
22 | author = 'Lorenzo Felletti'
23 | 
24 | 
25 | # -- General configuration ---------------------------------------------------
26 | 
27 | # Add any Sphinx extension module names here, as strings. They can be
28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
29 | # ones.
30 | extensions = [
31 |     'sphinx.ext.autodoc',
32 |     'sphinx.ext.napoleon',
33 |     'sphinx.ext.viewcode',
34 |     'sphinx.ext.githubpages',
35 | ]
36 | 
37 | # Add any paths that contain templates here, relative to this directory.
38 | templates_path = ['_templates']
39 | 
40 | # List of patterns, relative to source directory, that match files and
41 | # directories to ignore when looking for source files.
42 | # This pattern also affects html_static_path and html_extra_path.
43 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
44 | 
45 | 
46 | # -- Options for HTML output -------------------------------------------------
47 | 
48 | # The theme to use for HTML and HTML Help pages.  See the documentation for
49 | # a list of builtin themes.
50 | #
51 | html_theme = 'furo'
52 | 
53 | # Add any paths that contain custom static files (such as style sheets) here,
54 | # relative to this directory. They are copied after the builtin static files,
55 | # so a file named "default.css" will overwrite the builtin "default.css".
56 | html_static_path = ['_static']
57 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. pyregexp documentation master file, created by
 2 |    sphinx-quickstart on Wed Mar 23 17:24:22 2022.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | Welcome to pyregexp's documentation!
 7 | ====================================
 8 | 
 9 | .. toctree::
10 |    :maxdepth: 2
11 |    :caption: Contents:
12 | 
13 | 
14 | 
15 | Indices and tables
16 | ==================
17 | 
18 | * :ref:`genindex`
19 | * :ref:`modindex`
20 | * :ref:`search`
21 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.https://www.sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/modules.rst:
--------------------------------------------------------------------------------
1 | pyregexp
2 | ========
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    pyregexp
8 | 


--------------------------------------------------------------------------------
/docs/pyregexp.rst:
--------------------------------------------------------------------------------
  1 | pyregexp package
  2 | ================
  3 | 
  4 | Submodules
  5 | ----------
  6 | 
  7 | pyregexp.engine module
  8 | ----------------------
  9 | 
 10 | .. figure:: /uml/classes.engine.png
 11 |    :alt: engine.py uml diagram 
 12 |    :width: 100%
 13 | 
 14 |    *UML of all pyregexp.engine classes.*
 15 | 
 16 | .. automodule:: pyregexp.engine
 17 |    :members:
 18 |    :undoc-members:
 19 |    :show-inheritance:
 20 | 
 21 | pyregexp.lexer module
 22 | ---------------------
 23 | 
 24 | .. figure:: /uml/classes.lexer.png
 25 |    :alt: lexer.py uml diagram
 26 |    :width: 25%
 27 | 
 28 |    *UML of all pyregexp.lexer classes.*
 29 | 
 30 | .. automodule:: pyregexp.lexer
 31 |    :members:
 32 |    :undoc-members:
 33 |    :show-inheritance:
 34 | 
 35 | pyregexp.match module
 36 | ---------------------
 37 | 
 38 | .. figure:: /uml/classes.match.png
 39 |    :alt: match.py uml diagram
 40 |    :width: 70%
 41 | 
 42 |    *UML of all pyregexp.match classes.*
 43 | 
 44 | .. automodule:: pyregexp.match
 45 |    :members:
 46 |    :undoc-members:
 47 |    :show-inheritance:
 48 | 
 49 | pyregexp.pyrser module
 50 | ----------------------
 51 | 
 52 | .. figure:: /uml/classes.pyrser.png
 53 |    :alt: pyrser.py uml diagram
 54 |    :width: 20%
 55 | 
 56 |    *UML of all pyregexp.pyrser classes.*
 57 | 
 58 | .. automodule:: pyregexp.pyrser
 59 |    :members:
 60 |    :undoc-members:
 61 |    :show-inheritance:
 62 | 
 63 | pyregexp.re\_ast module
 64 | -----------------------
 65 | 
 66 | .. figure:: /uml/classes.re_ast.png
 67 |    :alt: re_ast.py uml diagram
 68 |    :width: 100%
 69 | 
 70 |    *UML of all pyregexp.re_ast classes.*
 71 | 
 72 | .. automodule:: pyregexp.re_ast
 73 |    :members:
 74 |    :undoc-members:
 75 |    :show-inheritance:
 76 | 
 77 | pyregexp.tokens module
 78 | ----------------------
 79 | 
 80 | .. figure:: /uml/classes.tokens.png
 81 |    :alt: tokens.py uml diagram
 82 |    :width: 100%
 83 | 
 84 |    *UML of all pyregexp.tokens classes.*
 85 | 
 86 | .. automodule:: pyregexp.tokens
 87 |    :members:
 88 |    :undoc-members:
 89 |    :show-inheritance:
 90 | 
 91 | Module contents
 92 | ---------------
 93 | 
 94 | .. figure:: /uml/classes.pyregexp.png
 95 |    :alt: pyregexp uml diagram
 96 |    :width: 100%
 97 |    
 98 |    *UML of all pyregexp classes.*
 99 | 
100 | .. figure:: /uml/packages.pyregexp.png
101 |    :alt: packages uml diagram
102 |    :scale: 70%
103 |    
104 |    *UML of pyregexp packages.*
105 | 
106 | .. automodule:: pyregexp
107 |    :members:
108 |    :undoc-members:
109 |    :show-inheritance:
110 | 


--------------------------------------------------------------------------------
/docs/uml/classes.engine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/classes.engine.png


--------------------------------------------------------------------------------
/docs/uml/classes.lexer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/classes.lexer.png


--------------------------------------------------------------------------------
/docs/uml/classes.match.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/classes.match.png


--------------------------------------------------------------------------------
/docs/uml/classes.pyregexp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/classes.pyregexp.png


--------------------------------------------------------------------------------
/docs/uml/classes.pyrser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/classes.pyrser.png


--------------------------------------------------------------------------------
/docs/uml/classes.re_ast.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/classes.re_ast.png


--------------------------------------------------------------------------------
/docs/uml/classes.tokens.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/classes.tokens.png


--------------------------------------------------------------------------------
/docs/uml/packages.pyregexp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/packages.pyregexp.png


--------------------------------------------------------------------------------
/grammar.txt:
--------------------------------------------------------------------------------
 1 | REGEX GRAMMAR
 2 | 
 3 | REGEX GRAMMAR recognized:
 4 | RE ::= RE_SEQ
 5 | RE_SEQ ::= '^'? GROUP '$'? ('|' RE_SEQ)?
 6 | GROUP ::= (RANGE_EL QTIFIER?)+
 7 | RANGE_EL ::= EL | '[' '^'? INNER_EL ']'
 8 | EL ::= '\\'? (ch | SPECIAL) | '(' ('?:')? RE_SEQ ')'
 9 | 
10 | QTIFIER ::= '*' | '+' | '?' | '{' (num)? ',' num '}' | '{' num '}'
11 | INNER_EL ::= ch+ | ch '-' ch INNER_EL
12 | SPECIAL ::= '(' | ')' | '+' | '{' | '[' | '|' | '.' | '^' | '$' | ...
13 | 


--------------------------------------------------------------------------------
/print_coverage.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | (source venv/bin/activate;coverage run --omit 'venv/*,test/*' -m pytest;coverage report "$@";deactivate)
4 | 


--------------------------------------------------------------------------------
/pyregexp/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/pyregexp/__init__.py


--------------------------------------------------------------------------------
/pyregexp/engine.py:
--------------------------------------------------------------------------------
  1 | """Module containing the RegexEngine class.
  2 | 
  3 | The RegexEngine class implements a regular expressions engine.
  4 | 
  5 | Example:
  6 |     Matching a regex with some test string::
  7 | 
  8 |         reng = RegexEngine()
  9 |         result, consumed = reng.match(r"a+bx", "aabx")
 10 | """
 11 | 
 12 | 
 13 | from collections import deque
 14 | from typing import Callable, Deque, Union, Tuple, List
 15 | import unicodedata
 16 | from .pyrser import Pyrser
 17 | from .match import Match
 18 | from .re_ast import RE, GroupNode, LeafNode, OrNode, EndElement, StartElement
 19 | 
 20 | 
 21 | class RegexEngine:
 22 |     """ Regular Expressions Engine.
 23 | 
 24 |     This class contains all the necessary to recognize regular expressions in a test string.
 25 |     """
 26 | 
 27 |     def __init__(self):
 28 |         self.parser: Pyrser = Pyrser()
 29 |         self.prev_re: str = None
 30 |         self.prev_ast: RE = None
 31 | 
 32 |     def match(self, re: str, string: str, return_matches: bool = False, continue_after_match: bool = False, ignore_case: int = 0) -> Union[Tuple[bool, int, List[Deque[Match]]], Tuple[bool, int]]:
 33 |         """ Searches a regex in a test string.
 34 | 
 35 |         Searches the passed regular expression in the passed test string and
 36 |         returns the result.
 37 | 
 38 |         It is possible to customize both the returned value and the search
 39 |         method.
 40 | 
 41 |         The ignore_case flag may cause unexpected results in the returned
 42 |         number of matched characters, and also in the returned matches, e.g.
 43 |         when the character ẞ is present in either the regex or the test string.
 44 | 
 45 |         Args:
 46 |             re (str): the regular expression to search
 47 |             string (str): the test string
 48 |             return_matches (bool): if True a data structure containing the
 49 |                 matches - the whole match and the subgroups matched
 50 |                 (default is False)
 51 |             continue_after_match (bool): if True the engine continues
 52 |                 matching until the whole input is consumed
 53 |                 (default is False)
 54 |             ignore_case (int): when 0 the case is not ignored, when 1 a "soft"
 55 |                 case ignoring is performed, when 2 casefolding is performed.
 56 |                 (default is 0)
 57 | 
 58 |         Returns:
 59 |             A tuple containing whether a match was found or not, the last
 60 |             matched character index, and, if return_matches is True, a
 61 |             list of deques of Match, where each list of matches represents
 62 |             in the first position the whole match, and in the subsequent
 63 |             positions all the group and subgroups matched. 
 64 |         """
 65 | 
 66 |         def return_fnc(res: bool, consumed: int, all_matches: List[Deque[Match]], return_matches: bool) -> Union[Tuple[bool, int, List[Deque[Match]]], Tuple[bool, int]]:
 67 |             """ Create the Tuple to return."""
 68 |             if return_matches:
 69 |                 return res, consumed, all_matches
 70 |             else:
 71 |                 return res, consumed
 72 | 
 73 |         if ignore_case == 1:
 74 |             re = unicodedata.normalize("NFKD", re).lower()
 75 |             string = unicodedata.normalize("NFKD", string).casefold()
 76 |         elif ignore_case == 2:
 77 |             re = unicodedata.normalize("NFKD", re).casefold()
 78 |             string = unicodedata.normalize("NFKD", string).casefold()
 79 | 
 80 |         ast = self.parser.parse(re=re) if self.prev_re != re else self.prev_ast
 81 |         self.prev_re = re
 82 |         self.prev_ast = ast
 83 | 
 84 |         # variables holding the matched groups list for each matched substring in the test string
 85 |         all_matches: List[Deque[Match]] = []
 86 |         highest_matched_idx: int = 0  # holds the highest matched string's index
 87 | 
 88 |         res, consumed, matches = self.__match__(ast, string, 0)
 89 |         if res:
 90 |             highest_matched_idx = consumed
 91 |             all_matches.append(matches)
 92 |         else:
 93 |             return return_fnc(res, highest_matched_idx, all_matches, return_matches)
 94 | 
 95 |         if not continue_after_match or not consumed > 0:
 96 |             return return_fnc(res, highest_matched_idx, all_matches, return_matches)
 97 | 
 98 |         while True:
 99 |             res, consumed, matches = self.__match__(ast, string, consumed)
100 | 
101 |             # if consumed is not grater than highest_matched_idx this means the new match
102 |             # consumed 0 characters, so there is really nothing more to match
103 |             if res and consumed > highest_matched_idx:
104 |                 highest_matched_idx = consumed
105 |                 all_matches.append(matches)
106 |             else:
107 |                 return return_fnc(True, highest_matched_idx, all_matches, return_matches)
108 | 
109 |     def __match__(self, ast: RE, string: str, start_str_i: int) -> Tuple[bool, int, Deque[Match]]:
110 |         """ Same as match, but always returns after the first match."""
111 |         matches: Deque[Match] = deque()
112 | 
113 |         # used to restore the left match of a ornode if necessary
114 |         last_match: Match = None
115 | 
116 |         # str_i represents the matched characters so far. It is inizialized to
117 |         # the value of the input parameter start_str_i because the match could
118 |         # be to be searched starting at an index different from 0, e.g. in the
119 |         # case this function is called to search a second match in the test
120 |         # string.
121 |         str_i = start_str_i
122 | 
123 |         # max_matched_idx represents the "upper limit" of the match.
124 |         # It is necessary when backtracking in the presence of nested
125 |         # quantifiers, because we need a way to "tell" the group that
126 |         # is causing the fail by being too greedy to stop earlier if
127 |         # possible.
128 |         max_matched_idx = -1
129 | 
130 |         def return_fnc(res: bool, str_i: int) -> Tuple[bool, int, Deque[Match]]:
131 |             """ Returns the Tuple to be returned by __match__."""
132 |             nonlocal matches
133 |             return res, str_i, matches
134 | 
135 |         def save_matches(match_group: Callable, ast: Union[RE, GroupNode], string: str, start_idx: int, max_matched_idx=-1) -> Tuple[bool, int]:
136 |             """ Save the matches of capturing groups.
137 | 
138 |             Args:
139 |                 match_group (Callable): the function to use to match the group
140 |                 ast (Union[RE, GroupNode]): the group to match
141 |                 string (str): the string to match
142 |                 start_idx (int): the starting index
143 | 
144 |             Returns:
145 |                 A tuple of the boolean result of the match, and the last matched
146 |                 index.
147 |             """
148 |             nonlocal matches
149 |             nonlocal last_match
150 | 
151 |             res, end_idx = match_group(ast, string, max_matched_idx)
152 | 
153 |             if ast.is_capturing() and res == True:
154 |                 for i in range(0, len(matches)):
155 |                     if matches[i].group_id == ast.group_id:
156 |                         last_match = matches[i]
157 |                         matches.remove(matches[i])
158 |                         break
159 |                 matches.appendleft(
160 |                     Match(ast.group_id, start_idx, end_idx, string, ast.group_name))
161 | 
162 |             return res, end_idx
163 |         
164 |         def remove_leftmost_match():
165 |             """ Used when matching an OrNode.
166 |             
167 |             When matching an OrNode the right children is always saved instead
168 |             of saving the left one when the chosen path goes left. By calling
169 |             this function you remove the leftmost match (the one created by the
170 |             right child).
171 |             """
172 |             nonlocal matches
173 |             matches.popleft()
174 |         
175 |         def appendleft_last_match():
176 |             """ Used when matching an OrNode.
177 |             
178 |             When matching an OrNode the right children is always saved instead
179 |             of saving the left one when the chosen path goes left. By calling
180 |             this function you restore the left match.
181 |             """
182 |             nonlocal matches
183 |             matches.appendleft(last_match)
184 | 
185 | 
186 |         def match_group(ast: Union[RE, GroupNode, OrNode], string: str, max_matched_idx: int = -1) -> Tuple[bool, int]:
187 |             """
188 |             Match a group, which is always the case.s
189 | 
190 |             Returns the match state (True or False) and the new string i, that is the
191 |             number of matched characters in the string so far.
192 |             """
193 |             nonlocal start_str_i
194 |             nonlocal str_i
195 |             backtrack_stack: List[Tuple[int, int, int, List[int]]] = []
196 | 
197 |             def backtrack(str_i: int, curr_child_i: int, recursive: bool = False) -> Tuple[bool, int, int]:
198 |                 """ Returns whether it is possible to backtrack and the state to backtrack to.
199 | 
200 |                 Takes as input the current state of the engine and returns whether
201 |                 or not it is possible to backtrack.
202 | 
203 |                 Args:
204 |                     str_i (int): the current considered index of the test string
205 |                     curr_child_i (int): the index of the GroupNode children considered
206 | 
207 |                 Returns:
208 |                     A Tuple containing a bool, True if it is possible to backtrack,
209 |                     the new string index, and the new node children index to which
210 |                     backtrack to. Note that the last two parameters only have a
211 |                     meaning in the case it is possible to backtrack (the bool is
212 |                     True).
213 |                 """
214 |                 nonlocal backtrack_stack
215 |                 nonlocal max_matched_idx
216 |                 nonlocal ast
217 | 
218 |                 if len(backtrack_stack) == 0:
219 |                     return False, str_i, curr_child_i
220 | 
221 |                 # the fist step is to pop the last tuple from the backtrack_stack
222 |                 popped_child_i, min_, matched_times, consumed_list = backtrack_stack.pop()
223 | 
224 |                 if matched_times == min_:
225 |                     # if a node is already matched the minimum number of times, the
226 |                     # chance you have to potentially be able to backtrack is to is
227 |                     # to delete the entry from the stack and then search for a new
228 |                     # possibility (recursively calling this function).
229 |                     # But, before the recursion, you have to calculate  what the
230 |                     # string index (str_i) value was before the node was matched
231 |                     # even once. Thus, you have to decrease the string index
232 |                     # of each consumption in the consumed_list.
233 | 
234 |                     # calculate_the new str_i
235 |                     before_str_i = str_i
236 |                     for consumption in consumed_list:
237 |                         str_i -= consumption
238 |                     if max_matched_idx == -1 or isinstance(ast.children[popped_child_i], LeafNode) or before_str_i == str_i:
239 |                         # recursive call
240 |                         return backtrack(str_i, popped_child_i, True)
241 |                     else:
242 |                         # case of backtracking from nested quantifier
243 |                         # returns "not recursive" because if it is the case
244 |                         # of a recursive call, this is outside of the case of
245 |                         # simply nested quantifiers, and in I cannot backtrack
246 |                         # anymore
247 |                         return not recursive, str_i, popped_child_i
248 |                 else:
249 |                     # the node was matched more times than its min, so you just
250 |                     # need to remove the last consumption from the list,
251 |                     # decrease the str_i by that amount, decrease the times the node
252 |                     # was matched - matched_times - by 1, and then append the stack
253 |                     # the tuple with the new matched_times and consumed_list.
254 |                     last_consumed = consumed_list.pop()
255 |                     new_str_i = str_i - last_consumed
256 |                     if max_matched_idx == -1 or isinstance(ast.children[popped_child_i], LeafNode):
257 |                         backtrack_stack.append(
258 |                             (popped_child_i, min_, matched_times - 1, consumed_list))
259 |                         # lastly, you return that the backtracking is possible, and
260 |                         # the state to which backtrack to.
261 |                         return True, new_str_i, curr_child_i
262 |                     else:
263 |                         # case of backtracking from nested quantifier
264 |                         return not recursive, new_str_i, popped_child_i
265 | 
266 |             def remove_this_node_from_stack(curr_child_i: int, str_i: int) -> int:
267 |                 """ Removes node from stack and returns the new str_i.
268 |                 """
269 |                 nonlocal backtrack_stack
270 |                 popped_child_i, min_, matched_times, consumed_list = backtrack_stack.pop()
271 |                 if popped_child_i == curr_child_i:
272 |                     for consumption in consumed_list:
273 |                         str_i -= consumption
274 |                 else:
275 |                     backtrack_stack.append((popped_child_i, min_, matched_times, consumed_list))
276 |                 return str_i
277 | 
278 |             curr_node = ast.children[0] if len(ast.children) > 0 else None
279 |             i = 0  # the children i'm iterating, not to confuse with str_i
280 | 
281 |             if isinstance(ast, OrNode):
282 |                 # matcha il primo, se matcha return true
283 |                 # se no matcha il secondo
284 |                 # se matcha return true, altrimenti false
285 |                 tmp_str_i = str_i
286 |                 res, new_str_i = save_matches(
287 |                             match_group, curr_node, string, str_i, max_matched_idx) if not isinstance(curr_node, OrNode) else match_group(curr_node, string, max_matched_idx)
288 |                 if not res:
289 |                     str_i = tmp_str_i
290 |                     curr_node = ast.right
291 |                     res, new_str_i = save_matches(
292 |                             match_group, curr_node, string, str_i, max_matched_idx) if not isinstance(curr_node, OrNode) else match_group(curr_node, string, max_matched_idx)
293 |                 str_i = new_str_i
294 |                 return res, str_i
295 | 
296 |             # the passed ast can't be a Leaf
297 |             while i < len(ast.children):
298 |                 curr_node = ast.children[i]
299 | 
300 |                 # if is OrNode I evaluate the sub-groups with a recursive call
301 |                 if isinstance(curr_node, OrNode):
302 |                     before_str_i = str_i
303 |                     min_, max_ = curr_node.min, curr_node.max
304 |                     j = 0
305 |                     consumed_list = []
306 | 
307 |                     backtracking = False
308 |                     while j < max_:
309 |                         tmp_str_i = str_i
310 | 
311 |                         save_match_left = isinstance(curr_node.left, GroupNode)
312 |                         res_left, str_i_left = save_matches(match_group, curr_node.left, string, str_i, max_matched_idx) if save_match_left else match_group(curr_node.left, string, max_matched_idx)
313 | 
314 |                         str_i = tmp_str_i
315 | 
316 |                         save_match_right = isinstance(curr_node.right, GroupNode)
317 |                         res_right, str_i_right = save_matches(match_group, curr_node.right, string, str_i, max_matched_idx) if save_match_right else match_group(curr_node.right, string, max_matched_idx)
318 | 
319 |                         if res_left and res_right:
320 |                             # choose the one that consumed the most character
321 |                             # unless it exceeds the max_matched_idx
322 |                             chose_left = (str_i_left >= str_i_right)
323 |                             str_i = str_i_left if chose_left else str_i_right
324 |                             if max_matched_idx != -1 and str_i > max_matched_idx:
325 |                                 # tries to stay below the max_matched_idx threshold
326 |                                 str_i = str_i_right if chose_left else str_i_left
327 |                             if chose_left:
328 |                                 if save_match_right:
329 |                                     remove_leftmost_match()
330 |                                 if save_match_left:
331 |                                     appendleft_last_match()
332 |                             else:
333 |                                 # chose right
334 |                                 if save_match_left and not save_match_right:
335 |                                     # there is a spurious match originated from
336 |                                     # the left child
337 |                                     remove_leftmost_match()
338 | 
339 |                         elif res_left and not res_right:
340 |                             str_i = str_i_left
341 |                         elif not res_left and res_right:
342 |                             str_i = str_i_right
343 | 
344 |                         res = (res_left or res_right)
345 | 
346 |                         if res == True and (max_matched_idx == -1 or str_i <= max_matched_idx):
347 |                             if (str_i - tmp_str_i == 0) and j >= min_:
348 |                                 max_matched_idx = -1
349 |                                 break
350 |                             consumed_list.append(str_i - tmp_str_i)
351 |                         else:
352 |                             if min_ <= j:
353 |                                 max_matched_idx = -1
354 |                                 break
355 |                             if i > 0 and not isinstance(ast.children[i-1], LeafNode):
356 |                                 str_i = remove_this_node_from_stack(i, str_i)
357 |                             if str_i == start_str_i:
358 |                                 return False, str_i
359 |                             max_matched_idx = str_i - 1 if max_matched_idx == -1 else max_matched_idx - 1
360 |                             can_bt, bt_str_i, bt_i = backtrack(str_i, i)
361 |                             if can_bt:
362 |                                 i = bt_i
363 |                                 str_i = bt_str_i
364 |                                 backtracking = True
365 |                                 break  # retry to match the current node
366 |                             else:
367 |                                 return False, str_i
368 |                         j += 1
369 |                     if not backtracking:
370 |                         backtrack_stack.append(
371 |                             (i, min_, j, consumed_list))
372 |                         max_matched_idx = -1
373 |                         i += 1
374 |                     continue
375 | 
376 |                 elif isinstance(curr_node, GroupNode):
377 |                     min_, max_ = curr_node.min, curr_node.max
378 |                     j = 0
379 |                     consumed_list = []
380 |                     before_str_i = str_i
381 | 
382 |                     backtracking = False
383 |                     while j < max_:
384 |                         tmp_str_i = str_i
385 | 
386 |                         res, new_str_i = save_matches(
387 |                             match_group, curr_node, string, str_i, max_matched_idx)
388 |                         if res == True and (max_matched_idx == -1 or new_str_i <= max_matched_idx):
389 |                             # i must use tmp_str_i because str_i is changed by the match_group
390 |                             # call, so (new_str_i - str_i) would be always 0
391 |                             if (new_str_i - tmp_str_i == 0) and j >= min_:
392 |                                 max_matched_idx = -1
393 |                                 break
394 |                             consumed_list.append(new_str_i - tmp_str_i)
395 |                             #str_i = new_str_i
396 |                         else:
397 |                             if min_ <= j:
398 |                                 # i did the bare minimum or more
399 |                                 max_matched_idx = -1
400 |                                 break
401 |                             if i > 0 and not isinstance(ast.children[i-1], LeafNode):
402 |                                 str_i = remove_this_node_from_stack(i, str_i)
403 |                                 if str_i == start_str_i:
404 |                                     return False, str_i
405 |                                 max_matched_idx = str_i - 1 if max_matched_idx == -1 else max_matched_idx - 1
406 |                             can_bt, bt_str_i, bt_i = backtrack(str_i, i)
407 |                             if can_bt:
408 |                                 i = bt_i
409 |                                 str_i = bt_str_i
410 |                                 backtracking = True
411 |                                 break  # retry to match the current node
412 |                             else:
413 |                                 return False, str_i
414 |                         j += 1
415 | 
416 |                     # if NOT backtracking iterate the next element, and put the
417 |                     # current on the backtrack_stack, otherwise don't increment i, don't put on the
418 |                     # stack so to retry the current one (just continue)
419 |                     if not backtracking:
420 |                         backtrack_stack.append(
421 |                             (i, min_, j, consumed_list))
422 |                         max_matched_idx = -1
423 |                         i += 1
424 | 
425 |                     continue
426 | 
427 |                 elif isinstance(curr_node, LeafNode):
428 |                     # it is a LeafNode obviously now
429 |                     min_, max_ = curr_node.min, curr_node.max
430 |                     j = 0
431 | 
432 |                     consumed_list = []
433 | 
434 |                     before_str_i = str_i  # to discard changes made in case i need to bt
435 | 
436 |                     backtracking = False
437 |                     while j < max_:
438 |                         if str_i < len(string):  # i still have input to match
439 |                             if curr_node.is_match(ch=string[str_i], str_i=str_i, str_len=len(string)) and (max_matched_idx == -1 or str_i < max_matched_idx):
440 |                                 if not (isinstance(curr_node, StartElement) or isinstance(curr_node, EndElement)):
441 |                                     consumed_list.append(1)
442 |                                     str_i += 1
443 |                             else:
444 |                                 if min_ <= j:  # I already met the minimum requirement for match
445 |                                     break
446 |                                 if i > 0 and not isinstance(ast.children[i-1], LeafNode):
447 |                                     str_i = remove_this_node_from_stack(i, str_i)
448 |                                     if str_i == start_str_i:
449 |                                         return False, str_i
450 |                                     max_matched_idx = str_i - 1
451 |                                 can_bt, bt_str_i, bt_i = backtrack(
452 |                                     before_str_i, i)
453 |                                 if can_bt:
454 |                                     i = bt_i
455 |                                     str_i = bt_str_i
456 |                                     backtracking = True
457 |                                     break
458 |                                 else:
459 |                                     return False, str_i
460 |                         else:  # finished input
461 |                             if isinstance(curr_node, StartElement) or isinstance(curr_node, EndElement) and curr_node.is_match(str_i=str_i, str_len=len(string)):
462 |                                 pass
463 |                             # finished input w/o finishing the regex tree
464 |                             elif min_ <= j:
465 |                                 break
466 |                             else:
467 |                                 # i have more states, but the input is finished
468 |                                 can_bt, bt_str_i, bt_i = backtrack(
469 |                                     before_str_i, i)
470 |                                 if can_bt:
471 |                                     i = bt_i
472 |                                     str_i = bt_str_i
473 |                                     backtracking = True
474 |                                     break
475 |                                 else:
476 |                                     return False, str_i
477 |                         j += 1
478 |                     if not backtracking:
479 |                         backtrack_stack.append(
480 |                             (i, min_, j, consumed_list))
481 |                         i += 1
482 |                     continue
483 |                 else:
484 |                     return False, str_i
485 | 
486 |             return True, str_i
487 | 
488 |         i = str_i
489 | 
490 |         if len(string) == 0:
491 |             res, consumed = save_matches(
492 |                 match_group=match_group, ast=ast, string=string, start_idx=str_i)
493 |             return return_fnc(res, consumed)
494 | 
495 |         while str_i < len(string):
496 |             res, _ = save_matches(match_group=match_group,
497 |                                   ast=ast, string=string, start_idx=str_i)
498 |             i += 1
499 |             if res:
500 |                 return return_fnc(True, str_i)
501 |             else:
502 |                 matches = deque()
503 |                 str_i = i
504 |         return return_fnc(False, str_i)
505 | 


--------------------------------------------------------------------------------
/pyregexp/lexer.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | from .tokens import *
  3 | 
  4 | 
  5 | class Lexer:
  6 |     """ Lexer for the pyregexp library.
  7 | 
  8 |     This class contains the method to scan a regular expression string producing the corresponding tokens.
  9 |     """
 10 | 
 11 |     def __init__(self) -> None:
 12 |         self.__digits__ = '0123456789'
 13 | 
 14 |     def __is_digit__(self, ch: str) -> bool:
 15 |         return self.__digits__.find(ch) > -1
 16 | 
 17 |     def scan(self, re: str) -> List[Token]:
 18 |         """ Regular expressions scanner.
 19 | 
 20 |         Scans the regular expression in input and produces the list of recognized Tokens in output.
 21 |         It raises an Exception if there are errors in the regular expression.
 22 | 
 23 |         Args:
 24 |             re (str): the regular expression to scan
 25 | 
 26 |         Returns:
 27 |             List[Token]: the list of tokens recognized in the passed regex
 28 |         """
 29 |         tokens = []
 30 | 
 31 |         def append(elem: Token) -> None:
 32 |             nonlocal tokens
 33 |             tokens.append(elem)
 34 | 
 35 |         i = 0
 36 |         escape_found = False
 37 |         while i < len(re):
 38 |             ch = re[i]
 39 |             if escape_found:
 40 |                 if ch == 't':
 41 |                     append(ElementToken(char='\t'))
 42 |                 if ch == 's':
 43 |                     # \s matches a space character
 44 |                     append(SpaceToken(char=ch))
 45 |                 else:
 46 |                     append(ElementToken(char=ch))
 47 |             elif ch == '\\':
 48 |                 escape_found = True
 49 |                 i += 1  # otherwise i won't be incremented bc of continue
 50 |                 continue
 51 |             elif ch == '.':
 52 |                 append(Wildcard())
 53 |             elif ch == '(':
 54 |                 append(LeftParenthesis())
 55 |             elif ch == ')':
 56 |                 append(RightParenthesis())
 57 |             elif ch == '[':
 58 |                 append(LeftBracket())
 59 |             elif ch == '-':
 60 |                 append(Dash())
 61 |             elif ch == ']':
 62 |                 append(RightBracket())
 63 |             elif ch == '{':
 64 |                 append(LeftCurlyBrace())
 65 |                 i += 1
 66 |                 while i < len(re):
 67 |                     ch = re[i]
 68 |                     if ch == ',':
 69 |                         append(Comma())
 70 |                     elif self.__is_digit__(ch):
 71 |                         append(ElementToken(char=ch))
 72 |                     elif ch == '}':
 73 |                         append(RightCurlyBrace())
 74 |                         break
 75 |                     else:
 76 |                         raise Exception("Bad token at index ${}.".format(i))
 77 |                     i += 1
 78 |             elif ch == '^':
 79 |                 if i == 0:
 80 |                     append(Start())
 81 |                 else:
 82 |                     append(Circumflex())
 83 |             elif ch == '$':
 84 |                 append(End())
 85 |             elif ch == '?':
 86 |                 append(QuestionMark())
 87 |             elif ch == '*':
 88 |                 append(Asterisk())
 89 |             elif ch == '+':
 90 |                 append(Plus())
 91 |             elif ch == '|':
 92 |                 append(VerticalBar())
 93 |             elif ch == '}':
 94 |                 append(RightCurlyBrace())
 95 |             else:
 96 |                 append(ElementToken(char=ch))
 97 | 
 98 |             escape_found = False
 99 |             i += 1
100 | 
101 |         return tokens
102 | 


--------------------------------------------------------------------------------
/pyregexp/match.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | class Match:
 4 |     """ Contains the information of a match in a regular expression."""
 5 | 
 6 |     def __init__(self, group_id: int, start_idx: int, end_idx: int, string: str, name: str) -> None:
 7 |         self.group_id: int = group_id
 8 |         self.name: str = name
 9 |         self.start_idx: int = start_idx
10 |         self.end_idx: int = end_idx
11 |         self.match: str = string[start_idx:end_idx]
12 | 


--------------------------------------------------------------------------------
/pyregexp/pyrser.py:
--------------------------------------------------------------------------------
  1 | from typing import Union, Callable
  2 | import itertools
  3 | import math
  4 | from .lexer import Lexer
  5 | from .tokens import *
  6 | from .re_ast import *
  7 | 
  8 | 
  9 | class Pyrser:
 10 |     """ Regular Expression Parser.
 11 | 
 12 |     Pyrser instances can parse regular expressions and return the corresponding AST.    
 13 |     """
 14 | 
 15 |     def __init__(self) -> None:
 16 |         self.lxr: Lexer = Lexer()
 17 | 
 18 |     def parse(self, re: str) -> RE:
 19 |         """ Parses a regular expression.
 20 | 
 21 |         Parses a regex and returns the corresponding AST.
 22 |         If the regex contains errors raises an Exception.
 23 | 
 24 |         Args:
 25 |             re (str): a regular expression
 26 | 
 27 |         Returns:
 28 |             RE: the root node of the regular expression's AST
 29 |         """
 30 | 
 31 |         def get_range_str(start: str, end: str) -> str:
 32 |             result = ''
 33 |             i = ord(start)
 34 |             while i <= ord(end):
 35 |                 result += chr(i)
 36 |                 i += 1
 37 |             return result
 38 | 
 39 |         def next_tkn_initializer(re: str) -> Callable[[bool], Union[Token, None]]:
 40 |             tokens = self.lxr.scan(re=re)
 41 | 
 42 |             i = -1
 43 | 
 44 |             def next_tkn(without_consuming: bool = False) -> Union[Token, None]:
 45 |                 nonlocal i
 46 |                 nonlocal tokens
 47 |                 nonlocal curr_tkn
 48 | 
 49 |                 if without_consuming:
 50 |                     return tokens[i+1] if len(tokens) > i+1 else None
 51 | 
 52 |                 i += 1
 53 |                 if i < len(tokens):
 54 |                     curr_tkn = tokens[i]
 55 |                 else:
 56 |                     curr_tkn = None
 57 | 
 58 |             return next_tkn
 59 | 
 60 |         def parse_re() -> RE:
 61 |             return RE(parse_re_seq())
 62 | 
 63 |         def parse_re_seq(capturing: bool = True, group_name: str = None, group_id: int = None) -> Union[OrNode, GroupNode]:
 64 |             match_start, match_end = False, False
 65 |             if type(curr_tkn) is Start or type(curr_tkn) is Circumflex:
 66 |                 next_tkn()
 67 |                 match_start = True
 68 | 
 69 |             node = parse_group(capturing=capturing,
 70 |                                group_name=group_name, group_id=group_id)
 71 | 
 72 |             if isinstance(curr_tkn, EndToken):
 73 |                 next_tkn()
 74 |                 match_end = True
 75 |             else:
 76 |                 match_end = False
 77 | 
 78 |             if match_start:
 79 |                 node.children.appendleft(StartElement())
 80 |             if match_end:
 81 |                 node.children.append(EndElement())
 82 | 
 83 |             if isinstance(curr_tkn, OrToken):
 84 |                 next_tkn()
 85 |                 node = OrNode(left=node, right=parse_re_seq(
 86 |                     group_name=node.group_name, group_id=node.group_id))
 87 | 
 88 |             return node
 89 | 
 90 |         def parse_group(capturing: bool = True, group_name: str = None, group_id: int = None) -> GroupNode:
 91 |             nonlocal groups_counter
 92 |             if group_id is None:
 93 |                 group_id = next(groups_counter)
 94 | 
 95 |             elements = deque()  # holds the children of the GroupNode
 96 | 
 97 |             while curr_tkn is not None and not isinstance(curr_tkn, OrToken) and \
 98 |                     not isinstance(curr_tkn, RightParenthesis) and \
 99 |                     not isinstance(curr_tkn, EndToken):
100 |                 new_el = parse_range_el()
101 | 
102 |                 next_tkn()
103 | 
104 |                 if isinstance(curr_tkn, EndToken):
105 |                     elements.append(new_el)
106 |                     break
107 | 
108 |                 if isinstance(curr_tkn, Quantifier):
109 |                     if isinstance(curr_tkn, ZeroOrOne):
110 |                         new_el.min, new_el.max = 0, 1
111 |                     elif isinstance(curr_tkn, ZeroOrMore):
112 |                         new_el.min, new_el.max = 0, math.inf
113 |                     else:
114 |                         # suppose it's 1+
115 |                         new_el.min, new_el.max = 1, math.inf
116 |                     next_tkn()
117 |                 elif isinstance(curr_tkn, LeftCurlyBrace):
118 |                     parse_curly(new_el)
119 | 
120 |                 elements.append(new_el)
121 | 
122 |             return GroupNode(children=elements, capturing=capturing, group_name=group_name, group_id=group_id)
123 | 
124 |         def parse_curly(new_el: ASTNode) -> None:
125 |             # move past the left brace
126 |             next_tkn()
127 | 
128 |             # find val_1, val_2
129 |             val_1, val_2 = '', ''
130 |             try:
131 |                 while isinstance(curr_tkn, ElementToken):
132 |                     val_1 += curr_tkn.char
133 |                     next_tkn()
134 |                 if val_1 == '':
135 |                     val_1 == 0
136 |                 else:
137 |                     val_1 = int(val_1)
138 | 
139 |                 if isinstance(curr_tkn, RightCurlyBrace):
140 |                     # case {exact}
141 |                     if type(val_1) is int:
142 |                         new_el.min, new_el.max = val_1, val_1
143 |                         next_tkn()  # skip the closing brace
144 |                         return
145 |                     else:
146 |                         raise Exception("Invalid curly brace syntax.")
147 | 
148 |                 next_tkn()
149 |                 while isinstance(curr_tkn, ElementToken):
150 |                     val_2 += curr_tkn.char
151 |                     next_tkn()
152 |                 if val_2 == '':
153 |                     val_2 == math.inf
154 |                 else:
155 |                     val_2 = int(val_2)
156 | 
157 |                 next_tkn()  # skip the closing brace
158 | 
159 |                 new_el.min = val_1 if type(val_1) is int else 0
160 |                 new_el.max = val_2 if type(val_2) is int else math.inf
161 | 
162 |             except Exception as e:
163 |                 raise Exception("Invalid curly brace syntax.")
164 | 
165 |         def parse_range_el() -> ASTNode:
166 |             if isinstance(curr_tkn, LeftBracket):
167 |                 next_tkn()
168 |                 element = parse_inner_el()
169 |                 if isinstance(curr_tkn, RightBracket):
170 |                     return element
171 |                 else:
172 |                     raise Exception(
173 |                         "Missing closing ']'.")
174 |             else:
175 |                 return parse_el()
176 | 
177 |         def parse_inner_el() -> RangeElement:
178 |             # parse_inner_el creates a single RangeElement with all the matches
179 |             nonlocal curr_tkn
180 |             match_str = ''
181 |             if curr_tkn is None:
182 |                 raise Exception(
183 |                     "Missing closing ']'.")
184 | 
185 |             positive_logic = True
186 |             if isinstance(curr_tkn, NotToken):
187 |                 positive_logic = False
188 |                 next_tkn()
189 | 
190 |             prev_char = None
191 |             while curr_tkn is not None:
192 |                 if isinstance(curr_tkn, RightBracket):
193 |                     break
194 | 
195 |                 if isinstance(curr_tkn, SpaceToken):
196 |                     match_str += curr_tkn.char
197 |                     next_tkn()
198 |                     continue
199 | 
200 |                 # every character inside it must be treated as an element
201 |                 if not isinstance(curr_tkn, ElementToken):
202 |                     curr_tkn = ElementToken(char=curr_tkn.char)
203 | 
204 |                 if next_tkn(without_consuming=True) is None:
205 |                     raise Exception("Missing closing ']'.")
206 |                 elif isinstance(next_tkn(without_consuming=True), Dash):
207 |                     # it may be a range (like a-z, A-M, 0-9, ...)
208 |                     prev_char = curr_tkn.char
209 |                     next_tkn()  # current token is now the Dash
210 |                     if isinstance(next_tkn(without_consuming=True), RightBracket) or isinstance(next_tkn(without_consuming=True), SpaceToken):
211 |                         # we're in one of these scenarios: "<char>-]" "<char>-\s"
212 |                         # the dash and previous character must be interpreted as single elements
213 |                         match_str += prev_char + curr_tkn.char
214 |                     else:
215 |                         # we're in the case of an actual range (or next_tkn is none)
216 |                         next_tkn()  # curr_tkn is now the one after the dash
217 |                         if next_tkn is None:
218 |                             raise Exception("Missing closing ']'.")
219 |                         elif ord(prev_char) > ord(curr_tkn.char):
220 |                             raise Exception(
221 |                                 f"Range values reversed. Start '{prev_char}' char code is greater than end '{curr_tkn.char}' char code.")
222 |                         else:
223 |                             match_str += get_range_str(prev_char,
224 |                                                        curr_tkn.char)
225 |                 else:
226 |                     # no range, no missing ']', just a char to add to match_str
227 |                     match_str += curr_tkn.char
228 |                 next_tkn()
229 | 
230 |             return RangeElement(match_str="".join(sorted(set(match_str))), is_positive_logic=positive_logic)
231 | 
232 |         def parse_el() -> Union[Element, OrNode, GroupNode]:
233 |             group_name: Union[str, None]
234 |             group_name = None
235 |             if isinstance(curr_tkn, ElementToken):
236 |                 return Element(match_ch=curr_tkn.char)
237 |             elif isinstance(curr_tkn, Wildcard):
238 |                 return WildcardElement()
239 |             elif isinstance(curr_tkn, SpaceToken):
240 |                 return SpaceElement()
241 |             elif isinstance(curr_tkn, LeftParenthesis):
242 |                 next_tkn()
243 |                 # (?: for non-capturing group
244 |                 capturing = True
245 |                 if type(curr_tkn) is QuestionMark:
246 |                     next_tkn()
247 |                     if curr_tkn.char == ':':
248 |                         capturing = False
249 |                         next_tkn()
250 |                     elif curr_tkn.char == '<':
251 |                         next_tkn()
252 |                         group_name = parse_group_name()
253 |                     else:
254 |                         if curr_tkn is None:
255 |                             raise Exception("Unterminated group.")
256 |                         else:
257 |                             raise Exception(
258 |                                 f"Invalid group: '{{?{curr_tkn.char}'.")
259 |                 res = parse_re_seq(capturing=capturing, group_name=group_name)
260 |                 if isinstance(curr_tkn, RightParenthesis):
261 |                     # next_tkn() not needed (parse_group's while loop will eat the parenthesis)
262 |                     return res
263 |                 else:
264 |                     raise Exception("Missing closing group parenthesis ')'.")
265 |             else:
266 |                 raise Exception(
267 |                     "Unescaped special character {}.".format(curr_tkn.char))
268 | 
269 |         def parse_group_name() -> str:
270 |             if curr_tkn is None:
271 |                 raise Exception("Unterminated named group name.")
272 |             group_name = ''
273 |             while curr_tkn.char != '>':
274 |                 group_name += curr_tkn.char
275 |                 next_tkn()
276 |                 if curr_tkn is None:
277 |                     raise Exception("Unterminated named group name.")
278 |             if len(group_name) == 0:
279 |                 raise Exception("Unexpected empty named group name.")
280 |             next_tkn()  # consumes '>'
281 |             return group_name
282 | 
283 |         groups_counter = itertools.count(start=0)
284 | 
285 |         curr_tkn = None
286 |         next_tkn = next_tkn_initializer(re)
287 |         next_tkn()
288 | 
289 |         ast = parse_re()
290 |         if curr_tkn is not None:
291 |             raise Exception(
292 |                 "Unable to parse the regex.")
293 |         return ast
294 | 


--------------------------------------------------------------------------------
/pyregexp/re_ast.py:
--------------------------------------------------------------------------------
  1 | from collections import deque
  2 | from typing import Deque, List, Union
  3 | 
  4 | 
  5 | class ASTNode:
  6 |     """ AST nodes base class.
  7 | 
  8 |     Abstract Syntax Tree classes hierarchy base class.
  9 |     """
 10 | 
 11 |     def __init__(self) -> None:
 12 |         pass
 13 | 
 14 | 
 15 | class RE(ASTNode):
 16 |     """ Entry point of the AST.
 17 | 
 18 |     This class acts as the entry point for a regular expression's AST.
 19 |     """
 20 | 
 21 |     def __init__(self, child: ASTNode, capturing: bool = False, group_name: str = "RegEx") -> None:
 22 |         super().__init__()
 23 |         self.__capturing__: bool = capturing
 24 |         self.group_name: str = group_name
 25 |         self.group_id: int = -1
 26 |         self.child: Union[GroupNode, OrNode] = child
 27 |         self.children: List[Union[GroupNode, OrNode]] = deque([child])
 28 | 
 29 |     def is_capturing(self) -> bool:
 30 |         return self.__capturing__
 31 | 
 32 | 
 33 | class LeafNode(ASTNode):
 34 |     """ AST class defining the leaf nodes.
 35 | 
 36 |     Every leaf node inherits from this class. 
 37 |     """
 38 | 
 39 |     def __init__(self) -> None:
 40 |         super().__init__()
 41 | 
 42 |     def is_match(self, ch: str = None, str_i: int = None, str_len: int = None) -> bool:
 43 |         """
 44 |         Returns whether the passed inputs matches with the node.
 45 | 
 46 |         For example, if the node matches the character "a" and the passed ch is
 47 |         "b" the method will return False, but if the passed ch was "a" then the
 48 |         result would have been True.
 49 | 
 50 |         Args:
 51 |             ch (str): the char you want to match
 52 |             str_i (int): the string index you are considering
 53 |             str_len (int): the test string length
 54 | 
 55 |         Returns:
 56 |             bool: represents whether there is a match between the node and the
 57 |             passed parameters or not.
 58 |         """
 59 |         return False
 60 | 
 61 | 
 62 | class Element(LeafNode):
 63 |     """ AST Element.
 64 | 
 65 |     Specialization of the LeafNode class. This class models the elements of a regex.
 66 |     """
 67 | 
 68 |     def __init__(self, match_ch: str = None) -> None:
 69 |         super().__init__()
 70 |         self.match: str = match_ch
 71 |         self.min: Union[int, float] = 1
 72 |         self.max: Union[int, float] = 1
 73 | 
 74 |     def is_match(self, ch: str = None, str_i: int = 0, str_len: int = 0) -> bool:
 75 |         return self.match == ch
 76 | 
 77 | 
 78 | class WildcardElement(Element):
 79 |     """ AST WildcardElement.
 80 | 
 81 |     Specialization of the Element class to model the wildcard behavior.
 82 |     """
 83 | 
 84 |     def __init__(self) -> None:
 85 |         super().__init__(match_ch='anything')
 86 |         self.match = None
 87 | 
 88 |     def is_match(self, ch: str = None, str_i: int = 0, str_len: int = 0) -> bool:
 89 |         return ch != '\n'
 90 | 
 91 | 
 92 | class SpaceElement(Element):
 93 |     """ AST SpaceElement.
 94 | 
 95 |     Specialization of the element class to model the match-space behavior.
 96 |     """
 97 | 
 98 |     def __init__(self) -> None:
 99 |         super().__init__()
100 |         self.match = None
101 | 
102 |     def is_match(self, ch: str = None, str_i: int = 0, str_len: int = 0) -> bool:
103 |         return ch.isspace() and len(ch) == 1
104 | 
105 | 
106 | class RangeElement(LeafNode):
107 |     """ AST RangeElement.
108 | 
109 |     Specialization of the LeafNode class modeling the range-element behavior,
110 |     that is that it matches with more than one character.
111 |     """
112 | 
113 |     def __init__(self, match_str: str, is_positive_logic: bool = True) -> None:
114 |         super().__init__()
115 |         self.match: str = match_str
116 |         self.min: Union[int, float] = 1
117 |         self.max: Union[int, float] = 1
118 |         self.is_positive_logic: bool = is_positive_logic
119 | 
120 |     def is_match(self, ch: str = None, str_i: int = 0, str_len: int = 0) -> bool:
121 |         # XNOR of whether the ch is found and the logic (positive/negative)
122 |         return not((ch in self.match) ^ self.is_positive_logic)
123 | 
124 | 
125 | class StartElement(LeafNode):
126 |     """ AST StartElement.
127 | 
128 |     Inherits from LeafNode and models the match-start-element behavior.
129 |     """
130 | 
131 |     def __init__(self) -> None:
132 |         super().__init__()
133 |         self.match = None
134 |         self.min: Union[int, float] = 1
135 |         self.max: Union[int, float] = 1
136 | 
137 |     def is_match(self, ch: str = None, str_i: int = 0, str_len: int = 0) -> bool:
138 |         return str_i == 0
139 | 
140 | 
141 | class EndElement(LeafNode):
142 |     """ AST EndElement.
143 | 
144 |     Inherits from LeafNode and models the match-end-element behavior.
145 |     """
146 | 
147 |     def __init__(self) -> None:
148 |         super().__init__()
149 |         self.match = ''
150 |         self.min: Union[int, float] = 1
151 |         self.max: Union[int, float] = 1
152 | 
153 |     def is_match(self, ch: str = None, str_i: int = 0, str_len: int = 0) -> bool:
154 |         return str_i == str_len
155 | 
156 | 
157 | class OrNode(ASTNode):
158 |     """ AST OrNode.
159 | 
160 |     Inherits from ASTNode and models the or-nodes, that is the nodes that
161 |     divides the regex into two possible matching paths.
162 |     """
163 | 
164 |     def __init__(self, left: ASTNode, right: ASTNode) -> None:
165 |         super().__init__()
166 |         self.left: ASTNode = left
167 |         self.right: ASTNode = right
168 |         self.children: List[ASTNode] = [left, right]
169 |         self.min: Union[int, float] = 1
170 |         self.max: Union[int, float] = 1
171 | 
172 | 
173 | # unused
174 | class NotNode(ASTNode):
175 |     """ AST NotNode.
176 | 
177 |     Inherits from ASTNode and models the not-node behavior.
178 |     """
179 | 
180 |     def __init__(self, child: ASTNode) -> None:
181 |         super().__init__()
182 |         self.child: ASTNode = child
183 |         self.children: Deque[ASTNode] = deque([child])
184 | 
185 | 
186 | class GroupNode(ASTNode):
187 |     """ AST GroupNode.
188 | 
189 |     Inherits from ASTNode and models the group in a regex.
190 |     """
191 | 
192 |     def __init__(self, children: Deque[ASTNode], capturing: bool = False, group_name: str = None, group_id: int = -1) -> None:
193 |         super().__init__()
194 |         self.__capturing__: bool = capturing
195 |         self.group_id: int = group_id
196 |         self.group_name: str = group_name if group_name is not None else "Group " + \
197 |             str(self.group_id)
198 |         self.children: Deque[ASTNode] = children
199 |         self.min: Union[int, float] = 1
200 |         self.max: Union[int, float] = 1
201 | 
202 |     def is_capturing(self) -> bool:
203 |         """ Returns whether the GroupNode is capturing.
204 | 
205 |             Returns:
206 |                 bool: True if the group is capturing, False otherwise
207 |             """
208 |         return self.__capturing__
209 | 


--------------------------------------------------------------------------------
/pyregexp/tokens.py:
--------------------------------------------------------------------------------
  1 | import string
  2 | from typing import Literal
  3 | 
  4 | 
  5 | class Token:
  6 |     """ Token base class."""
  7 | 
  8 |     def __init__(self) -> None:
  9 |         self.char: str = ''
 10 |     pass
 11 | 
 12 | 
 13 | class ElementToken(Token):
 14 |     """ Token that are not associated to special meaning."""
 15 | 
 16 |     def __init__(self, char: str):
 17 |         super().__init__()
 18 |         self.char: str = char
 19 | 
 20 | 
 21 | class WildcardToken(Token):
 22 |     """ Token of a wildcard."""
 23 | 
 24 |     def __init__(self, char: str):
 25 |         super().__init__()
 26 |         self.char: str = char
 27 | 
 28 | 
 29 | class SpaceToken(Token):
 30 |     """ Token of a space."""
 31 | 
 32 |     def __init__(self, char: str) -> None:
 33 |         super().__init__()
 34 |         self.char: str = string.whitespace
 35 | 
 36 | 
 37 | class Wildcard(WildcardToken):
 38 |     """ Token using '.' as wildcard."""
 39 | 
 40 |     def __init__(self):
 41 |         super().__init__(char='.')
 42 | 
 43 | 
 44 | class StartToken(Token):
 45 |     """ Token of match start."""
 46 | 
 47 |     def __init__(self, char: str):
 48 |         super().__init__()
 49 |         self.char: str = char
 50 | 
 51 | 
 52 | class Start(StartToken):
 53 |     """ Token using '^' to match start."""
 54 | 
 55 |     def __init__(self):
 56 |         super().__init__(char='^')
 57 | 
 58 | 
 59 | class EndToken(Token):
 60 |     """ Token of match end."""
 61 | 
 62 |     def __init__(self, char: str):
 63 |         super().__init__()
 64 |         self.char: str = char
 65 | 
 66 | 
 67 | class End(EndToken):
 68 |     """ Token using '$' to match end."""
 69 | 
 70 |     def __init__(self):
 71 |         super().__init__(char='$')
 72 | 
 73 | 
 74 | class Escape(Token):
 75 |     """ Token of the escape character."""
 76 | 
 77 |     def __init__(self):
 78 |         super().__init__()
 79 |         self.char = '\\'
 80 | 
 81 | 
 82 | class Comma(Token):
 83 |     """ Token of a comma."""
 84 | 
 85 |     def __init__(self):
 86 |         super().__init__()
 87 |         self.char = ','
 88 | 
 89 | 
 90 | class Parenthesis(Token):
 91 |     """ Token of a parenthesis."""
 92 | 
 93 |     def __init__(self):
 94 |         super().__init__()
 95 | 
 96 | 
 97 | class LeftParenthesis(Parenthesis):
 98 |     """ Left parenthesis token."""
 99 | 
100 |     def __init__(self):
101 |         super().__init__()
102 |         self.char = '('
103 | 
104 | 
105 | class RightParenthesis(Parenthesis):
106 |     """ Right parenthesis token."""
107 | 
108 |     def __init__(self):
109 |         super().__init__()
110 |         self.char = ')'
111 | 
112 | 
113 | class CurlyBrace(Token):
114 |     """ Curly brace token."""
115 | 
116 |     def __init__(self):
117 |         super().__init__()
118 | 
119 | 
120 | class LeftCurlyBrace(CurlyBrace):
121 |     """ Left curly brace token."""
122 | 
123 |     def __init__(self):
124 |         super().__init__()
125 |         self.char = '{'
126 | 
127 | 
128 | class RightCurlyBrace(CurlyBrace):
129 |     """ Right curly brace token."""
130 | 
131 |     def __init__(self):
132 |         super().__init__()
133 |         self.char = '}'
134 | 
135 | 
136 | class Bracket(Token):
137 |     """ Brackets token."""
138 | 
139 |     def __init__(self):
140 |         super().__init__()
141 | 
142 | 
143 | class LeftBracket(Bracket):
144 |     """ Left bracke token."""
145 | 
146 |     def __init__(self):
147 |         super().__init__()
148 |         self.char = '['
149 | 
150 | 
151 | class RightBracket(Bracket):
152 |     """ Right bracket token."""
153 | 
154 |     def __init__(self):
155 |         super().__init__()
156 |         self.char = ']'
157 | 
158 | 
159 | class Quantifier(Token):
160 |     """ Quantifier token."""
161 | 
162 |     def __init__(self, char: str):
163 |         super().__init__()
164 |         self.char: str = char
165 | 
166 | 
167 | class ZeroOrMore(Quantifier):
168 |     """ Quantifier 'zero or more' token."""
169 | 
170 |     def __init__(self, char: str):
171 |         super().__init__(char=char)
172 | 
173 | 
174 | class OneOrMore(Quantifier):
175 |     """ Quantifier 'one or more' token."""
176 | 
177 |     def __init__(self, char: str):
178 |         super().__init__(char=char)
179 | 
180 | 
181 | class ZeroOrOne(Quantifier):
182 |     """ Quantifier 'zero or one' token."""
183 | 
184 |     def __init__(self, char: str):
185 |         super().__init__(char=char)
186 | 
187 | 
188 | class Asterisk(ZeroOrMore):
189 |     """ Quantifier 'zero or more' token using character '*'."""
190 | 
191 |     def __init__(self):
192 |         super().__init__(char='*')
193 | 
194 | 
195 | class Plus(OneOrMore):
196 |     """ Quantifier 'one or more' token using character '+'."""
197 | 
198 |     def __init__(self):
199 |         super().__init__(char='+')
200 | 
201 | 
202 | class QuestionMark(ZeroOrOne):
203 |     """ Quantifier 'zero or one' token using character '?'."""
204 | 
205 |     def __init__(self):
206 |         super().__init__(char='?')
207 | 
208 | 
209 | class OrToken(Token):
210 |     """ Token of the or."""
211 | 
212 |     def __init__(self, char: str):
213 |         super().__init__()
214 |         self.char: str = char
215 | 
216 | 
217 | class VerticalBar(OrToken):
218 |     """ Token of the or using '|'."""
219 | 
220 |     def __init__(self):
221 |         super().__init__(char='|')
222 | 
223 | 
224 | class NotToken(Token):
225 |     """ Token of the negation."""
226 | 
227 |     def __init__(self, char: str):
228 |         super().__init__()
229 |         self.char: str = char
230 | 
231 | 
232 | class Circumflex(NotToken):
233 |     """ Token of the negation using '^'."""
234 | 
235 |     def __init__(self):
236 |         super().__init__(char='^')
237 | 
238 | 
239 | class Dash(Token):
240 |     """ Token of the dash '-'."""
241 | 
242 |     def __init__(self):
243 |         super().__init__()
244 |         self.char = '-'
245 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | # pytest.ini
2 | [pytest]
3 | minversion = 6.0
4 | testpaths =
5 |     test
6 | 


--------------------------------------------------------------------------------
/regex.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from time import perf_counter_ns
 3 | from pyregexp.engine import RegexEngine
 4 | 
 5 | 
 6 | def usage():
 7 |     print("usage: {} regex test_string1 [test_string2 ...]".format(
 8 |         sys.argv[0]))
 9 |     pass
10 | 
11 | 
12 | reng = RegexEngine()
13 | 
14 | if __name__ == "__main__":
15 |     if len(sys.argv) == 2 and sys.argv[1] == '--usage' or sys.argv[1] == '--help' or sys.argv[1] == '-u' or sys.argv[1] == '-h':
16 |         usage()
17 |         exit(0)
18 |     else:
19 |         if len(sys.argv) < 3:
20 |             print("Missing arguments.")
21 |             usage()
22 |             exit(-1)
23 | 
24 |         regex = sys.argv[1]
25 |         print("Regular expression: '{}'".format(regex))
26 | 
27 |         i = 2
28 |         while i < len(sys.argv):
29 |             test_str = sys.argv[i]
30 |             start_time = perf_counter_ns()
31 |             res, _ = reng.match(regex, test_str)
32 |             stop_time = perf_counter_ns()
33 |             print(f'Execution time: {stop_time - start_time} ns.')
34 |             print_string = f"'{test_str}' match with the regex" if res == True else f"'{test_str}' doesn't match the given regex"
35 |             print(print_string)
36 |             i += 1
37 | 


--------------------------------------------------------------------------------
/regex.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source venv/bin/activate
3 | 
4 | python3 regex.py "$@"
5 | 
6 | deactivate
7 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | alabaster==0.7.12
 2 | astroid==2.9.3
 3 | attrs==20.3.0
 4 | autopep8==1.6.0
 5 | Babel==2.9.1
 6 | beautifulsoup4==4.10.0
 7 | certifi==2021.10.8
 8 | charset-normalizer==2.0.12
 9 | coverage==5.5
10 | docutils==0.17.1
11 | furo==2022.3.4
12 | idna==3.3
13 | imagesize==1.3.0
14 | importlib-metadata==4.11.3
15 | iniconfig==1.1.1
16 | isort==5.10.1
17 | Jinja2==3.0.3
18 | lazy-object-proxy==1.7.1
19 | MarkupSafe==2.1.1
20 | mccabe==0.6.1
21 | packaging==20.9
22 | platformdirs==2.5.1
23 | pluggy==0.13.1
24 | py==1.10.0
25 | pycodestyle==2.8.0
26 | Pygments==2.11.2
27 | pylint==2.12.2
28 | pyparsing==2.4.7
29 | pytest==6.2.3
30 | pytz==2022.1
31 | requests==2.27.1
32 | rope==0.19.0
33 | snowballstemmer==2.2.0
34 | soupsieve==2.3.1
35 | Sphinx==4.4.0
36 | sphinxcontrib-applehelp==1.0.2
37 | sphinxcontrib-devhelp==1.0.2
38 | sphinxcontrib-htmlhelp==2.0.0
39 | sphinxcontrib-jsmath==1.0.1
40 | sphinxcontrib-qthelp==1.0.3
41 | sphinxcontrib-serializinghtml==1.1.5
42 | toml==0.10.2
43 | typing-extensions==4.1.1
44 | urllib3==1.26.9
45 | wrapt==1.13.3
46 | zipp==3.7.0
47 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | license_files = LICENSE
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | from pathlib import Path
 4 | this_directory = Path(__file__).parent
 5 | long_description = (this_directory / "README.md").read_text()
 6 | 
 7 | setup(
 8 |     name='pyregexp',
 9 |     packages=['pyregexp'],
10 |     version='0.3.1',
11 |     license='MIT',
12 |     description='Simple regex library',
13 |     long_description=long_description,
14 |     long_description_content_type='text/markdown',
15 |     author='Lorenzo Felletti',
16 |     url='https://github.com/lorenzofelletti/pyregex',
17 |     download_url='https://github.com/lorenzofelletti/pyregex/archive/v0.3.1.tar.gz',
18 |     keywords=['regex', 'regexp', 'engine'],
19 |     install_requires=[],
20 |     classifiers=[
21 |         'Development Status :: 3 - Alpha',
22 |         'Intended Audience :: Developers',
23 |         'Operating System :: OS Independent',
24 |         'Topic :: Scientific/Engineering :: Information Analysis',
25 |         'Topic :: Software Development :: Libraries :: Python Modules',
26 |         'Topic :: Text Processing',
27 |         'Topic :: Text Processing :: General',
28 |         'License :: OSI Approved :: MIT License',
29 |         'Programming Language :: Python :: 3',
30 |         'Programming Language :: Python :: 3.8',
31 |         'Programming Language :: Python :: 3.9',
32 |         'Programming Language :: Python :: 3.10',
33 |     ],
34 | )
35 | 


--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/test/__init__.py


--------------------------------------------------------------------------------
/test/test_engine.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from ..pyregexp.engine import RegexEngine
  3 | 
  4 | 
  5 | @pytest.fixture
  6 | def reng() -> RegexEngine:
  7 |     return RegexEngine()
  8 | 
  9 | 
 10 | def test_simplest(reng: RegexEngine):
 11 |     assert (True, 1) == reng.match('a', 'a')
 12 | 
 13 | 
 14 | def test_simplest_with_wildcard(reng: RegexEngine):
 15 |     assert (True, 1) == reng.match('.', 'a')
 16 | 
 17 | 
 18 | def test_simplest_but_longer(reng: RegexEngine):
 19 |     assert (True, 3) == reng.match('a.c', 'abc')
 20 | 
 21 | 
 22 | def test_wildcard(reng: RegexEngine):
 23 |     assert (True, 2) == reng.match('.*a', 'aa')
 24 | 
 25 | 
 26 | def test_backtracking(reng: RegexEngine):
 27 |     assert (True, 4) == reng.match('a*a', 'aaaa')
 28 | 
 29 | 
 30 | def test_or(reng: RegexEngine):
 31 |     assert (True, 1) == reng.match('a.*|b', 'b')
 32 | 
 33 | 
 34 | def test_or_no_match(reng: RegexEngine):
 35 |     res, _ = reng.match('^a|b$', 'c')
 36 |     assert res == False
 37 | 
 38 | 
 39 | def test_or_no_match_with_bt(reng: RegexEngine):
 40 |     res, _ = reng.match('a|b', 'c')
 41 |     assert res == False
 42 | 
 43 | 
 44 | def test_bt_no_match(reng: RegexEngine):
 45 |     res, _ = reng.match('a{5}a', 'aaaaa')
 46 |     assert res == False
 47 | 
 48 | 
 49 | def test_match_group_zero_or_more(reng: RegexEngine):
 50 |     res, consumed = reng.match('(a)*', 'aa')
 51 |     assert (True, 2) == (res, consumed)
 52 | 
 53 | 
 54 | def test_fail_group_one_or_more(reng: RegexEngine):
 55 |     res, _ = reng.match('^(a)+', 'b')
 56 |     assert res == False
 57 | 
 58 | 
 59 | def test_complex_match(reng: RegexEngine):
 60 |     res, _ = reng.match('^(a|b+c)?[n-z]{2}', 'axx')
 61 |     assert res == True
 62 | 
 63 | 
 64 | def test_complex_match_2(reng: RegexEngine):
 65 |     res, _ = reng.match('^(a|b+c)?[n-z]{2}', 'xx')
 66 |     assert res == True
 67 | 
 68 | 
 69 | def test_match_mail_simple(reng: RegexEngine):
 70 |     res, _ = reng.match(r'.*@.*\.(com|it)', 'vr@gmail.com')
 71 |     assert res == True
 72 | 
 73 | 
 74 | def test_bt_index_leaf(reng: RegexEngine):
 75 |     res, _ = reng.match(r'^aaaa.*a$', 'aaaaa')
 76 |     assert res == True
 77 | 
 78 | 
 79 | def test_bt_index_or(reng: RegexEngine):
 80 |     res, _ = reng.match(r'^x(a|b)?bc$', 'xbc')
 81 |     assert res == True
 82 | 
 83 | 
 84 | def test_bt_index_group(reng: RegexEngine):
 85 |     res, _ = reng.match(r'^x(a)?ac$', 'xac')
 86 |     assert res == True
 87 | 
 88 | 
 89 | def test_match_or_left(reng: RegexEngine):
 90 |     res, _ = reng.match('na|nb', 'na')
 91 |     assert res == True
 92 | 
 93 | 
 94 | def test_match_or_right(reng: RegexEngine):
 95 |     res, _ = reng.match('na|nb', 'nb')
 96 |     assert res == True
 97 | 
 98 | 
 99 | def test_match_or_right_at_start_end(reng: RegexEngine):
100 |     res, _ = reng.match('^na|nb$', 'nb')
101 |     assert res == True
102 | 
103 | 
104 | def test_no_match_after_end(reng: RegexEngine):
105 |     res, _ = reng.match('^na|nb$', 'nb ')
106 |     assert res == False
107 | 
108 | 
109 | def test_match_sequence_with_start_end_correctly(reng: RegexEngine):
110 |     res, _ = reng.match('^a|b$', 'a  ')
111 |     assert res == True
112 | 
113 |     res, _ = reng.match('^a|b$', ' a  ')
114 |     assert res == False
115 | 
116 |     res, _ = reng.match('^a|b$', '  b')
117 |     assert res == True
118 | 
119 |     res, _ = reng.match('^a|b$', '  b ')
120 |     assert res == False
121 | 
122 | 
123 | def test_complex_match_3(reng: RegexEngine):
124 |     res, _ = reng.match('a(b|[c-n])+b{3}.{2}', 'ahhbbbbbb')
125 |     assert res == True
126 | 
127 | 
128 | def test_bit_less_complex_match_3(reng: RegexEngine):
129 |     res, _ = reng.match('a(b|[c-n])+b{3}', 'ahhbbbbbb')
130 |     assert res == True
131 | 
132 | 
133 | def test_unescaped_special_ch(reng: RegexEngine):
134 |     with pytest.raises(Exception):
135 |         reng.match('$a^', 'aa')
136 | 
137 | 
138 | def test_various_emails(reng: RegexEngine):
139 |     res, _ = reng.match(r'.*@(gmail|hotmail)\.(com|it)', 'baa.aa@hotmail.it')
140 |     assert res == True
141 |     res, _ = reng.match(r'.*@(gmail|hotmail)\.(com|it)', 'baa.aa@gmail.com')
142 |     assert res == True
143 |     res, _ = reng.match(r'.*@(gmail|hotmail)\.(com|it)', 'baa.aa@hotmaila.com')
144 |     assert res == False
145 | 
146 | 
147 | def test_match_empty(reng: RegexEngine):
148 |     res, _ = reng.match('^$', '')
149 |     assert res == True
150 |     res, _ = reng.match('$', '')
151 |     assert res == True
152 |     res, _ = reng.match('^', '')
153 |     assert res == True
154 | 
155 | 
156 | def test_match_space(reng: RegexEngine):
157 |     res, _ = reng.match(r'\s', r' ')
158 |     assert res == True
159 |     res, _ = reng.match(r'\s', '\t')
160 |     assert res == True
161 |     res, _ = reng.match(r'\s', '\r')
162 |     assert res == True
163 |     res, _ = reng.match(r'\s', '\f')
164 |     assert res == True
165 |     res, _ = reng.match(r'\s', '\n')
166 |     assert res == True
167 |     res, _ = reng.match(r'\s', '\v')
168 |     assert res == True
169 | 
170 | 
171 | def test_match_space_2(reng: RegexEngine):
172 |     res, _ = reng.match(r'\s+', '\r\t\n \f \v')
173 |     assert res == True
174 |     res, _ = reng.match(r'^\s$', '\r\t')
175 |     assert res == False
176 | 
177 | 
178 | def test_return_matches_simple(reng: RegexEngine):
179 |     res, _, matches = reng.match(r'a\s', r'a ', return_matches=True)
180 |     assert res == True
181 |     assert len(matches[0]) == 1
182 | 
183 | 
184 | def test_return_matches_two(reng: RegexEngine):
185 |     res, _m, matches = reng.match(r'a(b)+a', r'abba', return_matches=True)
186 |     assert res == True
187 |     assert len(matches[0]) == 2
188 | 
189 | 
190 | def test_non_capturing_group(reng: RegexEngine):
191 |     res, _, matches = reng.match(r'a(?:b)+a', r'abba', return_matches=True)
192 |     assert res == True
193 |     assert len(matches[0]) == 1
194 | 
195 | 
196 | def test_continue_after_match_and_return_matches_simple(reng: RegexEngine):
197 |     string = 'abba'
198 |     res, consumed, matches = reng.match(
199 |         r'a', string, continue_after_match=True, return_matches=True)
200 |     assert consumed == len(string)
201 |     assert len(matches) == 2
202 |     assert len(matches[0]) == 1
203 |     x = matches[0]
204 |     assert matches[0][0].match == 'a'
205 |     assert len(matches[1]) == 1
206 |     assert matches[1][0].match == 'a'
207 | 
208 | 
209 | def test_continue_after_match_and_return_matches_2(reng: RegexEngine):
210 |     string = 'abbai'
211 |     res, consumed, matches = reng.match(
212 |         r'a', string, continue_after_match=True, return_matches=True)
213 |     assert consumed == len(string)-1
214 |     assert len(matches) == 2
215 |     assert len(matches[0]) == 1
216 |     x = matches[0]
217 |     assert matches[0][0].match == 'a'
218 |     assert len(matches[1]) == 1
219 |     assert matches[1][0].match == 'a'
220 | 
221 | 
222 | def test_question_mark(reng: RegexEngine):
223 |     res, _ = reng.match(r'https?://', r'http://')
224 |     assert res == True
225 |     res, _ = reng.match(r'https?://', r'https://')
226 |     assert res == True
227 | 
228 | 
229 | def test_engine_1(reng: RegexEngine):
230 |     with pytest.raises(Exception):
231 |         res, _ = reng.match("$^", '')
232 | 
233 | 
234 | def test_engine_2(reng: RegexEngine):
235 |     regex = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
236 | 
237 |     mail = "lorenzo.felletti@mail.com"
238 |     res, consumed = reng.match(regex, mail)
239 |     assert res == True
240 |     assert consumed == len(mail)
241 | 
242 |     mail = "lorenzo.felletti@mail.c"
243 |     res, _ = reng.match(regex, mail)
244 |     assert res == False
245 | 
246 |     mail = "lorenzo.fellettimail.com"
247 |     res, _ = reng.match(regex, mail)
248 |     assert res == False
249 | 
250 |     mail = "lorenz^^o.felletti@mymail.com"
251 |     res, _ = reng.match(regex, mail)
252 |     assert res == False
253 | 
254 |     mail = "lorenz0.%+-@mymail.com"
255 |     res, _ = reng.match(regex, mail)
256 |     assert res == True
257 | 
258 | 
259 | def test_engine_3(reng: RegexEngine):
260 |     string = "lorem ipsum"
261 |     res, consumed = reng.match(r"m", string, continue_after_match=True)
262 |     assert res == True
263 |     assert consumed == len(string)
264 | 
265 | 
266 | def test_engine_4(reng: RegexEngine):
267 |     string = "lorem ipsum"
268 |     res, consumed, matches = reng.match(
269 |         r"m", string, continue_after_match=True, return_matches=True)
270 |     assert res == True
271 |     assert consumed == len(string)
272 | 
273 |     assert len(matches) == 2
274 |     assert matches[0][0].match == 'm'
275 |     assert matches[1][0].match == 'm'
276 | 
277 | 
278 | def test_engine_5(reng: RegexEngine):
279 |     match_1 = "lor.fel@ah.ha"
280 |     match_2 = "fel.log@ha.ah"
281 |     string = match_1 + " " + match_2
282 |     res, consumed, matches = reng.match(
283 |         r"[a-z.]+@[a-z]+\.[a-z]{2}", string, continue_after_match=True, return_matches=True)
284 |     assert res == True
285 |     assert consumed == len(string)
286 | 
287 |     assert len(matches) == 2
288 |     assert matches[0][0].match == match_1
289 |     assert matches[1][0].match == match_2
290 | 
291 | 
292 | def test_engine_6(reng: RegexEngine):
293 |     res, consumed = reng.match(r'[\abc]', r'\\')
294 |     assert res == False
295 |     assert consumed == 0
296 | 
297 |     res, _ = reng.match(r'[\\abc]', r'\\')
298 |     assert res == True
299 | 
300 | 
301 | def test_engine_7(reng: RegexEngine):
302 |     res, _ = reng.match(r'(a)+(a)?(a{2}|b)+', 'aaabbaa')
303 |     assert res == True
304 | 
305 | 
306 | def test_engine_8(reng: RegexEngine):
307 |     res, _ = reng.match(r'(a){2}', r'a')
308 |     assert res == False
309 | 
310 |     res, _ = reng.match(r'(aa){1,2}', r'aa')
311 |     assert res == True
312 | 
313 | 
314 | def test_named_group(reng: RegexEngine):
315 |     res, _, matches = reng.match(
316 |         r'(?<fancy>clancy)', r'clancy', return_matches=True)
317 |     assert res == True
318 |     assert matches[0][1].name == 'fancy'
319 | 
320 | 
321 | def test_named_group_fail_1(reng: RegexEngine):
322 |     with pytest.raises(Exception):
323 |         res, _ = reng.match(r"(?<)", '')
324 | 
325 | 
326 | def test_named_group_fail_2(reng: RegexEngine):
327 |     with pytest.raises(Exception):
328 |         res, _ = reng.match(r"(?<abb)", '')
329 | 
330 | 
331 | def test_named_group_fail_empty_name(reng: RegexEngine):
332 |     with pytest.raises(Exception):
333 |         res, _ = reng.match(r"(?<>asf)", '')
334 | 
335 | 
336 | def test_matches_indexes(reng: RegexEngine):
337 |     test_str = "abbabbab"
338 |     res, consumed, matches = reng.match(
339 |         r"a", test_str, continue_after_match=True, return_matches=True)
340 |     assert res == True
341 |     assert consumed == len(test_str) - 1
342 |     assert len(matches) == 3
343 |     assert matches[0][0].start_idx == 0 and matches[0][0].end_idx == 1
344 |     assert matches[1][0].start_idx == 3 and matches[1][0].end_idx == 4
345 |     assert matches[2][0].start_idx == 6 and matches[2][0].end_idx == 7
346 | 
347 | 
348 | def test_returned_matches_indexes(reng: RegexEngine):
349 |     regex = r"(a)(a)(a)(a)(a)(a)"
350 |     test_str = "aaaaaaaaaacccaaaaaac"
351 |     res, consumed, matches = reng.match(regex, test_str, True, True)
352 | 
353 |     assert res == True
354 |     assert consumed == len(test_str)-1
355 |     assert matches is not None and len(matches) == 2
356 |     assert len(matches[0]) == 7
357 |     assert len(matches[1]) == 7
358 |     assert matches[0][0].start_idx == 0 and matches[0][0].end_idx == 6
359 |     assert matches[0][1].start_idx == 5 and matches[0][1].end_idx == 6
360 |     assert matches[0][2].start_idx == 4 and matches[0][2].end_idx == 5
361 |     assert matches[0][3].start_idx == 3 and matches[0][3].end_idx == 4
362 |     assert matches[0][4].start_idx == 2 and matches[0][4].end_idx == 3
363 |     assert matches[0][5].start_idx == 1 and matches[0][5].end_idx == 2
364 |     assert matches[0][6].start_idx == 0 and matches[0][6].end_idx == 1
365 | 
366 |     assert matches[1][0].start_idx == 13 and matches[1][0].end_idx == 19
367 |     assert matches[1][1].start_idx == 18 and matches[1][1].end_idx == 19
368 |     assert matches[1][2].start_idx == 17 and matches[1][2].end_idx == 18
369 |     assert matches[1][3].start_idx == 16 and matches[1][3].end_idx == 17
370 |     assert matches[1][4].start_idx == 15 and matches[1][4].end_idx == 16
371 |     assert matches[1][5].start_idx == 14 and matches[1][5].end_idx == 15
372 |     assert matches[1][6].start_idx == 13 and matches[1][6].end_idx == 14
373 | 
374 | 
375 | # this one loops
376 | def test_returned_groups(reng: RegexEngine):
377 |     # group e will not be matched due to the greediness of the engine,
378 |     # .* "eats" the "e" in test_str
379 |     regex = r"a(b).*(e)?c(c)(c)c"
380 |     test_str = "abxxecccc"
381 |     res, consumed, matches = reng.match(regex, test_str, True, True)
382 | 
383 |     assert res == True
384 |     assert consumed == len(test_str)
385 |     assert len(matches) == 1
386 |     assert len(matches[0]) == 4
387 |     assert matches[0][0].match == test_str
388 |     assert matches[0][1].match == "c" and matches[0][1].start_idx == len(
389 |         test_str) - 2
390 |     assert matches[0][2].match == "c" and matches[0][2].start_idx == len(
391 |         test_str) - 3
392 |     assert matches[0][3].match == "b" and matches[0][3].start_idx == 1
393 | 
394 | 
395 | def test_on_long_string(reng: RegexEngine):
396 |     regex = r"a(b)?.{0,10}c(d)"
397 |     test_str = "abcd dcvrsbshpeuiògjAAwdew ac abc vcsweacscweflllacd"
398 |     res, _, matches = reng.match(regex, test_str, True, True)
399 | 
400 |     assert res == True
401 |     assert len(matches) == 2
402 | 
403 |     assert len(matches[0]) == 3
404 |     assert matches[0][0].start_idx == 0 and \
405 |         matches[0][0].end_idx == 4
406 |     assert matches[0][1].start_idx == 3 and \
407 |         matches[0][1].end_idx == 4
408 |     assert matches[0][2].start_idx == 1 and \
409 |         matches[0][2].end_idx == 2
410 | 
411 |     len(matches[1]) == 2
412 |     assert matches[1][0].start_idx == 39 and \
413 |         matches[1][0].end_idx == len(test_str)
414 |     assert matches[1][1].start_idx == len(test_str)-1 and \
415 |         matches[1][1].end_idx == len(test_str)
416 | 
417 | 
418 | def test_ignore_case_no_casefolding(reng: RegexEngine):
419 |     regex = r"ss"
420 |     test_str = "SS"
421 |     res, _ = reng.match(regex, test_str, ignore_case=1)
422 |     assert res == True
423 | 
424 |     regex = r"ÄCHER"
425 |     test_str = "ächer"
426 |     res, _ = reng.match(regex, test_str, ignore_case=1)
427 |     assert res == True
428 | 
429 |     regex = r"ÄCHER"
430 |     test_str = "acher"
431 |     res, _ = reng.match(regex, test_str, ignore_case=1)
432 |     assert res == False
433 | 
434 | 
435 | def test_ignore_case_casefolding(reng: RegexEngine):
436 |     regex = r"ẞ"
437 |     test_str = "SS"
438 |     res, _ = reng.match(regex, test_str, ignore_case=2)
439 |     assert res == True
440 | 
441 |     regex = r"ÄCHER"
442 |     test_str = "ächer"
443 |     res, _ = reng.match(regex, test_str, ignore_case=2)
444 |     assert res == True
445 | 
446 |     regex = r"ÄCHER"
447 |     test_str = "acher"
448 |     res, _ = reng.match(regex, test_str, ignore_case=2)
449 |     assert res == False
450 | 
451 | 
452 | def test_empty_regex(reng: RegexEngine):
453 |     regex = r""
454 |     test_str = "aaaa"
455 | 
456 |     # repeate the test with different optional parameters configurations
457 |     res, _ = reng.match(regex, test_str)
458 |     assert res == True
459 | 
460 |     res, _ = reng.match(regex, test_str, ignore_case=1)
461 |     assert res == True
462 | 
463 |     res, _ = reng.match(regex, test_str, ignore_case=2)
464 |     assert res == True
465 | 
466 |     res, _ = reng.match(regex, test_str, continue_after_match=True)
467 |     assert res == True
468 | 
469 |     res, _, matches = reng.match(regex, test_str, return_matches=True)
470 |     assert res == True
471 |     assert len(matches) == 1 and len(matches[0]) == 1
472 |     assert matches[0][0].match == "" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 0
473 | 
474 |     res, _, matches = reng.match(regex, test_str, True, True, 0)
475 |     assert res == True
476 |     assert len(matches) == 1 and len(matches[0]) == 1
477 |     assert matches[0][0].match == "" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 0
478 | 
479 |     res, _, matches = reng.match(regex, test_str, True, True, 1)
480 |     assert res == True
481 |     assert len(matches) == 1 and len(matches[0]) == 1
482 |     assert matches[0][0].match == "" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 0
483 | 
484 |     res, _, matches = reng.match(regex, test_str, True, True, 2)
485 |     assert res == True
486 |     assert len(matches) == 1 and len(matches[0]) == 1
487 |     assert matches[0][0].match == "" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 0
488 | 
489 | 
490 | def test_empty_test_str(reng: RegexEngine):
491 |     regex = r"a"
492 |     test_str = ""
493 |     res, _ = reng.match(regex, test_str)
494 |     assert res == False
495 | 
496 | 
497 | def test_empty_regex_and_test_str(reng: RegexEngine):
498 |     regex = r""
499 |     test_str = ""
500 |     res, _ = reng.match(regex, test_str)
501 |     assert res == True
502 | 
503 | 
504 | def test_regex_with_rigth_empty_group(reng: RegexEngine):
505 |     regex = r"a|"
506 |     test_str = "ab"
507 | 
508 |     # repeate the test with different optional parameters configurations
509 |     res, _ = reng.match(regex, test_str)
510 |     assert res == True
511 | 
512 |     res, _ = reng.match(regex, test_str, ignore_case=1)
513 |     assert res == True
514 | 
515 |     res, _ = reng.match(regex, test_str, ignore_case=2)
516 |     assert res == True
517 | 
518 |     res, _ = reng.match(regex, test_str, continue_after_match=True)
519 |     assert res == True
520 | 
521 |     res, _, matches = reng.match(regex, test_str, return_matches=True)
522 |     assert res == True
523 |     assert len(matches) == 1 and len(matches[0]) == 1
524 |     assert matches[0][0].match == "a" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 1
525 | 
526 |     res, _, matches = reng.match(regex, test_str, True, True, 0)
527 |     assert res == True
528 |     assert len(matches) == 1 and len(matches[0]) == 1
529 |     assert matches[0][0].match == "a" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 1
530 | 
531 |     res, _, matches = reng.match(regex, test_str, True, True, 1)
532 |     assert res == True
533 |     assert len(matches) == 1 and len(matches[0]) == 1
534 |     assert matches[0][0].match == "a" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 1
535 | 
536 |     res, _, matches = reng.match(regex, test_str, True, True, 2)
537 |     assert res == True
538 |     assert len(matches) == 1 and len(matches[0]) == 1
539 |     assert matches[0][0].match == "a" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 1
540 | 
541 | 
542 | def test_empty_group_quantified(reng: RegexEngine):
543 |     regex = r'()+'
544 |     test_str = 'ab'
545 |     res, _ = reng.match(regex, test_str)
546 |     assert res == True
547 | 
548 | 
549 | def test_nested_quantifiers(reng: RegexEngine):
550 |     regex = r'(a*)+ab'
551 |     test_str = 'aab'
552 |     res, _ = reng.match(regex, test_str)
553 |     assert res == True
554 | 
555 |     regex = r'(a+)+ab'
556 |     test_str = 'ab'
557 |     res, _ = reng.match(regex, test_str)
558 |     assert res == False
559 | 
560 | 
561 | def test_nested_quantifiers_with_or_node(reng: RegexEngine):
562 |     regex = r'(a*|b*)*ab'
563 |     test_str = 'ab'
564 |     res, _ = reng.match(regex, test_str)
565 |     assert res == True
566 | 
567 |     regex = r'(a*|b*)+ab'
568 |     test_str = 'ab'
569 |     res, _ = reng.match(regex, test_str)
570 |     assert res == True
571 | 
572 |     regex = r'(a+|b+)+ab'
573 |     test_str = 'ab'
574 |     res, _ = reng.match(regex, test_str)
575 |     assert res == False
576 | 
577 | 
578 | def test_multiple_named_groups(reng: RegexEngine):
579 |     regex = r"(?<first>[a-z]+)(?<second>i)(?<third>l)"
580 |     test_str = "nostril"
581 |     res, _, _ = reng.match(regex, test_str, True, True, 0)
582 |     assert res == True
583 | 
584 | 
585 | def test_one_named_group(reng: RegexEngine):
586 |     regex = r"[a-z]+(?<last>l)"
587 |     test_str = "nostril"
588 |     res, _, matches = reng.match(regex, test_str, True, True, 0)
589 |     assert res == True
590 | 
591 | 
592 | def test_two_separated_named_group(reng: RegexEngine):
593 |     regex = r"(?<first>n)[a-z]+(?<last>l)"
594 |     test_str = "nostril"
595 |     res, _, matches = reng.match(regex, test_str, True, True, 0)
596 |     assert res == True
597 |     assert len(matches) == 1
598 |     assert len(matches[0]) == 3
599 |     assert matches[0][0].match == "nostril"
600 |     assert matches[0][1].match == "l"
601 |     assert matches[0][2].match == "n"
602 | 
603 | 
604 | def test_match_contiguous_named_groups(reng: RegexEngine):
605 |     regex = r"(?<first>n)(?<last>l)"
606 |     test_str = "nl"
607 |     res, _, matches = reng.match(regex, test_str, True, True, 0)
608 |     assert res == True
609 |     assert len(matches) == 1
610 |     assert len(matches[0]) == 3
611 |     assert matches[0][0].match == "nl"
612 |     assert matches[0][1].match == "l"
613 |     assert matches[0][2].match == "n"
614 | 
615 | 
616 | def test_named_group_with_range_element(reng: RegexEngine):
617 |     regex = r"(?<first>[a-z])(?<last>l)"
618 |     test_str = "nl"
619 |     res, _, matches = reng.match(regex, test_str, True, True, 0)
620 |     assert res == True
621 |     assert len(matches) == 1
622 |     assert len(matches[0]) == 3
623 |     assert matches[0][0].match == "nl"
624 |     assert matches[0][1].match == "l"
625 |     assert matches[0][2].match == "n"
626 | 
627 | 
628 | def test_named_group_with_range_element_and_quantifier(reng: RegexEngine):
629 |     regex = r"(?<first>[a-z]+)(?<last>l)"
630 |     test_str = "nl"
631 |     res, _, matches = reng.match(regex, test_str, True, True, 0)
632 |     assert res == True
633 |     assert len(matches) == 1
634 |     assert len(matches[0]) == 3
635 |     assert matches[0][0].match == "nl"
636 |     assert matches[0][1].match == "l"
637 |     assert matches[0][2].match == "n"
638 | 
639 | 
640 | def test_backtracking_or_node_inside_group_node(reng: RegexEngine):
641 |     regex = r"(?<first>b{1,2}|[a-z]+)(?<last>l)"
642 |     test_str = "bnl"
643 | 
644 |     res, _, matches = reng.match(regex, test_str, True, True, 0)
645 |     assert res == True
646 |     assert len(matches) == 1
647 |     assert matches[0][0].start_idx == 0 and matches[0][0].end_idx == len(test_str)
648 |     assert matches[0][1].start_idx == 2 and matches[0][1].end_idx == len(test_str)
649 |     assert matches[0][2].start_idx == 0 and matches[0][2].end_idx == 2
650 | 
651 |     regex = r"(?<first>[a-z]+|b{1,2})(?<last>l)"
652 |     res, _, matches = reng.match(regex, test_str, True, True, 0)
653 |     assert res == True
654 |     assert len(matches) == 1
655 |     assert matches[0][0].start_idx == 0 and matches[0][0].end_idx == len(test_str)
656 |     assert matches[0][1].start_idx == 2 and matches[0][1].end_idx == len(test_str)
657 |     assert matches[0][2].start_idx == 0 and matches[0][2].end_idx == 2
658 | 
659 | 
660 | def test_double_or_nodes_with_wildcard_in_between(reng: RegexEngine):
661 |     res, _ = reng.match(r'@(gm|ho).(com|it)', '@hoa.com')
662 |     assert res == False
663 | 


--------------------------------------------------------------------------------
/test/test_engine2.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from ..pyregexp.engine import RegexEngine
 3 | 
 4 | 
 5 | @pytest.fixture
 6 | def reng() -> RegexEngine:
 7 |     return RegexEngine()
 8 | 
 9 | 
10 | def test_1(reng: RegexEngine):
11 |     regex = r"(ad+a)*a"
12 |     test_str = "adaa"
13 | 
14 |     res, consumed, matches = reng.match(regex, test_str, True, True)
15 | 
16 |     assert res == True
17 |     consumed == len(test_str)
18 |     assert len(matches) == 1
19 | 
20 | 
21 | def test_2(reng: RegexEngine):
22 |     regex = r"0|1|2|3"
23 |     test_str = "3210"
24 | 
25 |     res, consumed, matches = reng.match(regex, test_str, True, True)
26 | 
27 |     assert res == True
28 |     consumed == len(test_str)
29 |     assert len(matches) == 4
30 | 


--------------------------------------------------------------------------------
/test/test_lexer.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from ..pyregexp.tokens import *
 3 | from ..pyregexp.lexer import Lexer
 4 | 
 5 | 
 6 | @pytest.fixture
 7 | def lexer():
 8 |     return Lexer()
 9 | 
10 | 
11 | def test_simple_re_lexing(lexer: Lexer):
12 |     tokens = lexer.scan('a')
13 |     assert tokens[0].char == 'a'
14 | 
15 | 
16 | def test_escaping_char(lexer: Lexer):
17 |     tokens = lexer.scan(r'a\\a\\t\.')
18 |     assert type(tokens[1]) is ElementToken and tokens[1].char == '\\'
19 | 
20 | 
21 | def test_escaping_get_tab(lexer: Lexer):
22 |     tokens = lexer.scan(r'a\h\t')
23 |     assert type(tokens[2]) is ElementToken and tokens[2].char == '\t'
24 | 
25 | 
26 | def test_escaping_wildcard(lexer: Lexer):
27 |     tokens = lexer.scan(r'\.')
28 |     assert type(tokens[0]) is ElementToken and tokens[0].char == '.'
29 | 
30 | 
31 | def test_get_comma(lexer: Lexer):
32 |     tokens = lexer.scan('a{3,5}')
33 |     assert type(tokens[3]) is Comma
34 | 
35 | 
36 | def test_comma_is_element(lexer: Lexer):
37 |     tokens = lexer.scan('a,')
38 |     assert type(tokens[1]) is ElementToken
39 | 
40 | 
41 | def test_match_start(lexer: Lexer):
42 |     tokens = lexer.scan('^a')
43 |     assert type(tokens[0]) is Start
44 | 
45 | 
46 | def test_match_end(lexer: Lexer):
47 |     tokens = lexer.scan(r'fdsad\$cs$')
48 |     assert type(tokens[len(tokens) - 1]) is End
49 | 
50 | 
51 | def test_fail_curly(lexer: Lexer):
52 |     with pytest.raises(Exception):
53 |         lexer.scan('advfe{a}')
54 | 
55 | 
56 | def test_lexer_1(lexer: Lexer):
57 |     tokens = lexer.scan(r'-\\\/\s~')
58 |     assert len(tokens) == 5
59 |     assert type(tokens[0]) is Dash
60 |     assert type(tokens[1]) is ElementToken
61 |     assert type(tokens[2]) is ElementToken
62 |     assert type(tokens[3]) is SpaceToken
63 |     assert type(tokens[4]) is ElementToken
64 | 


--------------------------------------------------------------------------------
/test/test_parser.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import pytest
  3 | from ..pyregexp.re_ast import RE, EndElement, GroupNode, Element, OrNode, RangeElement, SpaceElement, StartElement
  4 | from ..pyregexp.pyrser import Pyrser
  5 | 
  6 | 
  7 | @pytest.fixture
  8 | def parser():
  9 |     return Pyrser()
 10 | 
 11 | 
 12 | def test_simple_regex(parser: Pyrser):
 13 |     ast = parser.parse('a')
 14 |     print(ast)
 15 |     assert type(ast) is RE
 16 |     assert type(ast.child) is GroupNode
 17 |     assert type(ast.child.children[0]) is Element
 18 | 
 19 | 
 20 | def test_grouping(parser: Pyrser):
 21 |     ast = parser.parse('a(b)c')
 22 | 
 23 |     # top level group
 24 |     assert len(ast.child.children) == 3
 25 |     assert type(ast.child.children[0]) is Element
 26 |     assert type(ast.child.children[1]) is GroupNode
 27 |     assert type(ast.child.children[2]) is Element
 28 | 
 29 |     # ast.child.children[1] group '(a)'
 30 |     assert len(ast.child.children[1].children) == 1
 31 |     assert type(ast.child.children[1].children[0]) is Element
 32 | 
 33 | 
 34 | def test_curly_braces_1(parser: Pyrser):
 35 |     ast = parser.parse(r'a{5}b')
 36 |     assert len(ast.child.children) == 2
 37 | 
 38 | 
 39 | def test_fail_curly(parser: Pyrser):
 40 |     with pytest.raises(Exception):
 41 |         parser.parse('a{3,3}}')
 42 | 
 43 | 
 44 | def test_fail_no_closing_par(parser: Pyrser):
 45 |     with pytest.raises(Exception):
 46 |         parser.parse('a[d]((vfw)')
 47 | 
 48 | 
 49 | def test_parse_match_start_end(parser: Pyrser):
 50 |     ast = parser.parse('^aaaa.*a$')
 51 |     assert len(ast.child.children) == 8
 52 | 
 53 | 
 54 | def test_complex_regex(parser: Pyrser):
 55 |     ast = parser.parse(r'^[a-zA-Z]{1,20}@[a-zA-Z]\.[a-z]{1,3}$')
 56 |     assert len(ast.child.children) == 7
 57 | 
 58 |     assert type(ast.child.children[0]) is StartElement
 59 | 
 60 |     assert type(ast.child.children[1]) is RangeElement
 61 |     assert ast.child.children[1].min == 1
 62 |     assert ast.child.children[1].max == 20
 63 | 
 64 |     assert type(ast.child.children[2]) is Element
 65 | 
 66 |     assert type(ast.child.children[3]) is RangeElement
 67 | 
 68 |     assert type(ast.child.children[4]) is Element
 69 | 
 70 |     assert type(ast.child.children[5]) is RangeElement
 71 |     assert ast.child.children[5].min == 1
 72 |     assert ast.child.children[5].max == 3
 73 | 
 74 |     assert type(ast.child.children[6]) is EndElement
 75 | 
 76 | 
 77 | def test_space_element(parser: Pyrser):
 78 |     ast = parser.parse(r'\s')
 79 |     assert len(ast.child.children) == 1
 80 |     assert type(ast.child.children[0]) is SpaceElement
 81 | 
 82 | 
 83 | def test_range_1(parser: Pyrser):
 84 |     ast = parser.parse('[^a-z]')
 85 |     assert len(ast.child.children) == 1
 86 |     assert type(ast.child.children[0]) is RangeElement
 87 |     assert ast.child.children[0].is_match('a') == False
 88 | 
 89 | 
 90 | def test_range_2(parser: Pyrser):
 91 |     ast = parser.parse(r'[^a-z-\s-]')
 92 |     assert len(ast.child.children) == 1
 93 |     assert type(ast.child.children[0]) is RangeElement
 94 |     assert ast.child.children[0].is_match('a') == False
 95 |     assert ast.child.children[0].is_match('-') == False
 96 |     ast.child.children[0].is_match(' ') == False
 97 | 
 98 | 
 99 | def test_range_3(parser: Pyrser):
100 |     ast = parser.parse(r'[a-z-\s-]')
101 |     assert len(ast.child.children) == 1
102 |     assert type(ast.child.children[0]) is RangeElement
103 |     assert ast.child.children[0].is_match('a') == True
104 |     assert ast.child.children[0].is_match('-') == True
105 |     ast.child.children[0].is_match(' ') == True
106 | 
107 | 
108 | def test_range_2(parser: Pyrser):
109 |     ast = parser.parse(r'[\]]')
110 |     assert len(ast.child.children) == 1
111 |     assert type(ast.child.children[0]) is RangeElement
112 |     assert ast.child.children[0].is_match(']') == True
113 | 
114 | 
115 | def test_parse_curly_1(parser: Pyrser):
116 |     ast = parser.parse(r'a{2}')
117 |     assert len(ast.child.children) == 1
118 |     assert type(ast.child.children[0]) is Element
119 |     assert ast.child.children[0].is_match('a') == True
120 |     assert ast.child.children[0].min == 2
121 |     ast.child.children[0].max == 2
122 | 
123 | 
124 | def test_parse_curly_2(parser: Pyrser):
125 |     ast = parser.parse(r'a{,2}')
126 |     assert len(ast.child.children) == 1
127 |     assert type(ast.child.children[0]) is Element
128 |     assert ast.child.children[0].is_match('a') == True
129 |     assert ast.child.children[0].min == 0
130 |     ast.child.children[0].max == 2
131 | 
132 | 
133 | def test_parse_curly_3(parser: Pyrser):
134 |     ast = parser.parse(r'a{2,}')
135 |     assert len(ast.child.children) == 1
136 |     assert type(ast.child.children[0]) is Element
137 |     assert ast.child.children[0].is_match('a') == True
138 |     assert ast.child.children[0].min == 2
139 |     ast.child.children[0].max == math.inf
140 | 
141 | 
142 | def test_parse_curly_4(parser: Pyrser):
143 |     ast = parser.parse(r'a{,}')
144 |     assert len(ast.child.children) == 1
145 |     assert type(ast.child.children[0]) is Element
146 |     assert ast.child.children[0].is_match('a') == True
147 |     assert ast.child.children[0].min == 0
148 |     ast.child.children[0].max == math.inf
149 | 
150 | 
151 | def test_parse_fail_empty_curly(parser: Pyrser):
152 |     with pytest.raises(Exception):
153 |         ast = parser.parse(r'a{}')
154 | 
155 | 
156 | def test_fail_quatifier_unescaped(parser: Pyrser):
157 |     with pytest.raises(Exception):
158 |         ast = parser.parse(r'?')
159 | 
160 | 
161 | def test_parse_fail_missing_clising_bracket(parser: Pyrser):
162 |     with pytest.raises(Exception):
163 |         ast = parser.parse(r'a[abc')
164 | 
165 | 
166 | def test_parse_fail_unescaped_closing_bracket(parser: Pyrser):
167 |     with pytest.raises(Exception):
168 |         ast = parser.parse(r'abc]')
169 | 
170 | 
171 | def test_parse_fail_unescaped_closing_parenthesis(parser: Pyrser):
172 |     with pytest.raises(Exception):
173 |         ast = parser.parse(r'a)')
174 | 
175 | 
176 | def test_parse_fail_unescaped_start(parser: Pyrser):
177 |     with pytest.raises(Exception):
178 |         ast = parser.parse(r'^^')
179 | 
180 | 
181 | def test_parse_fail_unescaped_end(parser: Pyrser):
182 |     with pytest.raises(Exception):
183 |         ast = parser.parse(r'$$')
184 | 
185 | 
186 | def test_parse_fail_swapped_range(parser: Pyrser):
187 |     with pytest.raises(Exception):
188 |         ast = parser.parse(r'[z-a]')
189 | 
190 | 
191 | def test_parse_fail_non_capturing_group(parser: Pyrser):
192 |     with pytest.raises(Exception):
193 |         parser.parse(r'(?')
194 | 
195 |     with pytest.raises(Exception):
196 |         parser.parse(r'(?aa')
197 | 
198 | 
199 | def test_parse_fail_non_closed_range(parser: Pyrser):
200 |     with pytest.raises(Exception):
201 |         parser.parse(r'[a')
202 | 
203 |     with pytest.raises(Exception):
204 |         parser.parse(r'[')
205 | 
206 | 
207 | def test_parse_onrnode_groups_names(parser: Pyrser):
208 |     regex = r'a|b'
209 |     ast = parser.parse(regex)
210 |     assert len(ast.children) == 1
211 |     assert isinstance(ast.child, OrNode)
212 |     assert isinstance(ast.child.left, GroupNode)
213 |     assert isinstance(ast.child.right, GroupNode)
214 |     assert ast.child.left.group_name == ast.child.right.group_name
215 |     assert ast.child.left.group_id == ast.child.right.group_id
216 | 
217 | 
218 | def test_groups_names_double_ornode(parser: Pyrser):
219 |     regex = r'a|b|c'
220 |     ast = parser.parse(regex)
221 |     assert len(ast.children) == 1
222 |     assert isinstance(ast.child, OrNode)
223 |     assert isinstance(ast.child.left, GroupNode)
224 |     leftmost_gid = ast.child.left.group_id
225 |     leftmost_gname = ast.child.left.group_name
226 | 
227 |     assert isinstance(ast.child.right, OrNode)
228 |     assert isinstance(ast.child.right.left, GroupNode)
229 |     central_gid = ast.child.right.left.group_id
230 |     central_gname = ast.child.right.left.group_name
231 | 
232 |     assert isinstance(ast.child.right.right, GroupNode)
233 |     rightmost_gid = ast.child.right.right.group_id
234 |     rightmost_gname = ast.child.right.right.group_name
235 | 
236 |     assert leftmost_gid == central_gid
237 |     assert central_gid == rightmost_gid
238 |     assert leftmost_gname == central_gname
239 |     assert central_gname == rightmost_gname
240 | 


--------------------------------------------------------------------------------
/test/test_re_ast.py:
--------------------------------------------------------------------------------
 1 | from ..pyregexp.re_ast import ASTNode, RE, LeafNode, Element, WildcardElement, SpaceElement, RangeElement, StartElement, EndElement, OrNode, NotNode, GroupNode
 2 | 
 3 | 
 4 | def test_ASTNode():
 5 |     ast_node = ASTNode()
 6 |     assert ast_node is not None
 7 | 
 8 | 
 9 | def test_RE():
10 |     re = RE(child=Element(match_ch='e'))
11 |     assert re is not None
12 | 
13 |     assert hasattr(re, 'child')
14 |     assert hasattr(re, 'children')
15 | 
16 |     assert re.child is re.children[0]
17 | 
18 | 
19 | def test_NotNode():
20 |     not_node = NotNode(child=Element(match_ch='e'))
21 |     assert not_node is not None
22 | 
23 |     assert hasattr(not_node, 'child')
24 |     assert hasattr(not_node, 'children')
25 | 
26 |     assert not_node.child is not_node.children[0]
27 | 
28 | 
29 | def test_LeafNode():
30 |     ln = LeafNode()
31 |     assert ln is not None
32 |     assert hasattr(ln, 'is_match')
33 | 
34 |     assert ln.is_match() == False
35 | 
36 | 
37 | def test_WildcardElement():
38 |     we = WildcardElement()
39 |     assert we is not None
40 | 
41 | 
42 | def test_SpaceElement():
43 |     se = SpaceElement()
44 |     assert se is not None
45 |     assert hasattr(se, 'is_match')
46 | 
47 |     assert se.is_match(" ")
48 |     assert se.is_match("\t")
49 |     assert se.is_match("\n")
50 |     assert se.is_match("\f")
51 |     assert se.is_match("\r")
52 |     assert se.is_match("t") == False
53 | 
54 | 
55 | def test_RangeElement_positive_logic():
56 |     re = RangeElement("abc", True)
57 |     assert re is not None
58 |     assert re.is_positive_logic == True
59 | 
60 |     assert re.is_match("a") == True
61 |     assert re.is_match("x") == False
62 | 
63 | 
64 | def test_RangeElement_negative_logic():
65 |     nre = RangeElement("abc", False)
66 |     assert nre is not None
67 |     assert nre.is_positive_logic == False
68 | 
69 |     assert nre.is_match("a") == False
70 |     assert nre.is_match("x") == True
71 | 


--------------------------------------------------------------------------------
/test/test_tokens.py:
--------------------------------------------------------------------------------
 1 | from ..pyregexp.tokens import Asterisk, Bracket, Circumflex, Comma, CurlyBrace, Dash, ElementToken, End, EndToken, Escape, LeftBracket, LeftCurlyBrace, LeftParenthesis, NotToken, OneOrMore, OrToken, Parenthesis, Plus, Quantifier, QuestionMark, RightBracket, RightCurlyBrace, RightParenthesis, SpaceToken, Start, StartToken, Token, VerticalBar, Wildcard, WildcardToken, ZeroOrMore, ZeroOrOne
 2 | 
 3 | 
 4 | def test_Asterisk():
 5 |     assert issubclass(Asterisk, ZeroOrMore)
 6 | 
 7 |     a = Asterisk()
 8 |     assert a is not None
 9 | 
10 |     assert type(a) == Asterisk
11 | 
12 | 
13 | def test_NotToken():
14 |     assert issubclass(NotToken, Token) == True
15 | 
16 |     nt = NotToken(char='^')
17 |     assert nt is not None
18 |     assert nt.char == '^'
19 | 
20 | 
21 | def test_Bracket():
22 |     br = Bracket()
23 |     assert br is not None
24 |     br = LeftBracket()
25 |     assert br is not None
26 |     br = RightBracket()
27 |     assert br is not None
28 | 
29 | 
30 | def test_Escape():
31 |     escape = Escape()
32 |     assert escape is not None
33 | 


--------------------------------------------------------------------------------